1/*-
2 * Copyright (c) 2013 The NetBSD Foundation, Inc.
3 * All rights reserved.
4 *
5 * This code is derived from software contributed to The NetBSD Foundation
6 * by Matt Thomas of 3am Software Foundry.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
18 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
21 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30#include <machine/asm.h>
31
32RCSID("$NetBSD: memcpy_neon.S,v 1.3 2025/02/27 08:39:53 andvar Exp $")
33
34          .text
/*
 * void *memcpy(void *dst, const void *src, size_t len)
 *
 * NEON implementation for 32-bit ARM.  The computed branch in
 * .Lincongruent_dword assumes fixed 4-byte (A32) instruction encodings.
 *
 * In:      r0 = dst, r1 = src, r2 = len
 * Out:     r0 = dst (r0 is never written)
 * Scratch: r3 (dst cursor), ip, d0-d7, d16-d23, flags
 * Saved:   r4-r5 (pushed in .Lsrc_aligner, popped in .Ldone)
 *
 * Only caller-saved NEON registers are used: d8-d15 are callee-saved
 * under the AAPCS and are deliberately left untouched.
 *
 * Outline:
 *   1. Copy bytes until dst is 8-byte aligned.
 *   2. If src is then 8-byte aligned as well ("congruent"), copy dwords,
 *      switching to 64-byte bursts once dst reaches a 64-byte boundary.
 *   3. Otherwise ("incongruent") read aligned dwords from src and use
 *      vtbl.8 to extract 8 consecutive source bytes from each pair of
 *      neighbouring dwords.
 *   4. .Lfinish stores the final 1-7 bytes from d0.
 *
 * Invariant in the incongruent path: d1 holds the most recently loaded
 * source dword, of which the last r5 bytes have not been written yet;
 * r5 is constant (8 - (src & 7)) and d0 holds the vtbl index vector.
 */
ENTRY(memcpy)
        teq     r2, #0                  /* 0 length? */
        cmpne   r0, r1                  /*   if not, does src == dst? */
        RETc(eq)                        /*   yes (to either), return */

        mov     r3, r0                  /* r3 = dst cursor, keep r0 unchanged */
#if 0
        cmp     r2, #16                 /* copy less than 16 bytes? */
        bhs     .Ldst_aligner           /*   nope, do it the long way */

1:      ldrb    ip, [r1], #1            /* load a byte from src */
        subs    r2, r2, #1              /* any more to transfer? */
        strb    ip, [r3], #1            /* save it to dst */
        bne     1b                      /*   yes, do next byte */
        RET                             /* return */
#endif

.Ldst_aligner:
        tst     r3, #7                  /* is dst pointer dword aligned? */
        beq     .Lsrc_aligner           /*   yes, check src pointer */
        /*
         * Until the dst pointer is dword aligned, copy byte by byte
         * until it is aligned or we've copied everything.
         */
        ldrb    ip, [r1], #1            /* load a byte from src */
        strb    ip, [r3], #1            /* save the byte to dst */
        subs    r2, r2, #1              /* end of transfer? */
        bne     .Ldst_aligner           /*   no, try next byte */
        RET                             /* yes, we're done! */

.Lsrc_aligner:
        push    {r4-r5}                 /* r4/r5 are needed by .Lfinish */
        ands    r5, r1, #7              /* get misalignment of src pointer */
        beq     .Lcongruent_main        /*   aligned, do it the fast way */

        vdup.8  d1, r5                  /* misalignment in every byte lane */
        rsb     r5, r5, #8              /* r5 = useful bytes in first dword */
        bic     r1, r1, #7              /* dword align src pointer */

        vldr    d0, .Ltbl_value         /* load identity byte indices */
        vadd.u8 d0, d0, d1              /* d0 = vtbl indices shifted by offset */

        vld1.64 {d1}, [r1:64]!          /* load first dword from src */

        cmp     r2, r5                  /* do we already have enough? */
        bhi     .Lincongruent           /*   no, so read more */

.Lincongruent_finish:
        /*
         * Fewer than 8 bytes remain on every path that reaches here.
         * d2 may be stale when the bytes in d1 suffice; the lanes taken
         * from it are then never stored.
         */
        vtbl.8  d0, {d1-d2}, d0         /* merge last dwords */
        cmp     r2, #8                  /* room for a full dword? */
#ifdef __ARMEB__
        vrev64.32 d0, d0                /* word swap to LE */
#endif
        blo     .Lfinish                /*   no, write final partial dword */
        vst1.32 {d0}, [r3:64]           /*   yes, write final full dword */
        b       .Ldone                  /* and we're done! */

.Lincongruent:
        vld1.64 {d2}, [r1:64]!          /* load a dword */
        cmp     r2, #8                  /* can we write a full dword? */
        blo     .Lincongruent_finish    /*   no, finish it. */
        vtbl.8  d1, {d1-d2}, d0         /* reorder */
        vst1.64 {d1}, [r3:64]!          /* store a dword */
        subs    r2, r2, #8              /* have we written everything? */
        beq     .Ldone                  /*   yes, we're done! */
        vmov    d1, d2                  /* prepare for next dword */
        /*
         * If the leftover bytes in d1 cover the rest of the copy, the
         * next aligned dword lies wholly past the end of src and must
         * not be read.
         */
        cmp     r2, r5                  /* are the bytes we have enough? */
        bls     .Lincongruent_finish    /*   yes, finish without loading */
        tst     r3, #63                 /* are we 64-byte aligned? */
        bne     .Lincongruent           /*   no, load next dword */

        /*
         * We are now 64-byte aligned so all writes should fill one or more
         * cachelines.  Even if d1 has 7 bytes cached, to write 32 bytes we
         * still need to read 4 dwords (3 full dwords and 1 dword for that
         * last byte).
         */
        cmp     r2, #32                 /* can we write 4 more dwords? */
        blo     .Lincongruent_dword     /*   no, handle dword by dword */
        vld1.64 {d2-d5}, [r1:64]!       /* read 4 dwords */
        cmp     r2, #64                 /* can we write 8 dwords? */
        blo     .Lincongruent_4dword    /*   no, handle it */

        /*
         * 64 bytes per iteration.  The second group of four dwords lives
         * in d16-d20 so that the callee-saved d8-d15 are not clobbered.
         */
1:      vld1.64 {d17-d20}, [r1:64]!     /* read 4 dwords */
        vtbl.8  d1, {d1-d2}, d0         /* reorder */
        vtbl.8  d2, {d2-d3}, d0         /* reorder */
        vtbl.8  d3, {d3-d4}, d0         /* reorder */
        vtbl.8  d4, {d4-d5}, d0         /* reorder */
        vst1.64 {d1-d4}, [r3:64]!       /* write 4 dwords */
        vmov    d16, d5                 /* move leftovers clear of the load */
        cmp     r2, #96                 /* have 8+4 dwords to write? */
        blo     2f                      /*   no more data, skip the load */
        vld1.64 {d2-d5}, [r1:64]!       /* more data, load 4 dwords */
2:      vtbl.8  d16, {d16-d17}, d0      /* reorder */
        vtbl.8  d17, {d17-d18}, d0      /* reorder */
        vtbl.8  d18, {d18-d19}, d0      /* reorder */
        vtbl.8  d19, {d19-d20}, d0      /* reorder */
        vst1.64 {d16-d19}, [r3:64]!     /* write 4 dwords */
        subs    r2, r2, #64             /* we just wrote 8 dwords */
        beq     .Ldone                  /*   if 0, we're done! */
        vmov    d1, d20                 /* leftovers for the next round */
        cmp     r2, #64                 /* can we write 8 more dwords? */
        bhs     1b                      /*   yes, do it again */

        /*
         * We have leftovers in d1 and, if at least 32 bytes remain, new
         * untranslated data in d2-d5.
         */
.Lincongruent_4dword:
        cmp     r2, #32                 /* are d2-d5 loaded and writable? */
        blo     .Lincongruent_dword     /*   no, go dword by dword */

        vtbl.8  d1, {d1-d2}, d0         /* reorder */
        vtbl.8  d2, {d2-d3}, d0         /* reorder */
        vtbl.8  d3, {d3-d4}, d0         /* reorder */
        vtbl.8  d4, {d4-d5}, d0         /* reorder */
        vst1.64 {d1-d4}, [r3:64]!       /* write 4 dwords */
        vmov    d1, d5                  /* move leftovers */
        subs    r2, r2, #32             /* we just wrote 4 dwords */
        beq     .Ldone                  /*   if 0, we're done! */

.Lincongruent_dword:
        /* Fewer than 32 bytes remain here. */
#if 0
        cmp     r2, r5                  /* enough in leftovers? */
        bls     .Lincongruent_finish    /*   yes, finish it. */
        vld1.64 {d2}, [r1:64]!          /* load a dword */
        cmp     r2, #8                  /* can we write a full dword? */
        blo     .Lincongruent_finish    /*   no, finish it. */
        vtbl.8  d1, {d1-d2}, d0         /* reorder */
        vst1.64 {d1}, [r3:64]!          /* store a dword */
        subs    r2, r2, #8              /* have we written everything? */
        beq     .Ldone                  /*   yes, we're done! */
        b       .Lincongruent_dword     /* and go get it */
#else
        cmp     r2, r5                  /* are the bytes we have enough? */
        bls     .Lincongruent_finish    /*   yes, finish it. */
        mov     ip, r2                  /* get remaining count */
        bic     ip, ip, #7              /* truncate to a dword */
        rsb     ip, ip, #32             /* subtract from 32 */
        ands    r2, r2, #7              /* count mod 8 (flags used below) */
        /*
         * Computed branch into the unrolled copy below: each group is
         * four 4-byte instructions (16 bytes) and moves 8 bytes, hence
         * the "lsl #1".  pc reads as this instruction + 8, which is the
         * first group, so the nop only pads that slot.  None of the
         * instructions up to the beq alters the flags set by the ands.
         */
        add     pc, pc, ip, lsl #1      /* and jump! */
        nop
        vld1.64 {d2}, [r1:64]!          /* load a dword */
        vtbl.8  d1, {d1-d2}, d0         /* reorder */
        vst1.64 {d1}, [r3:64]!          /* store a dword */
        vmov    d1, d2                  /* prepare for next dword */
        vld1.64 {d2}, [r1:64]!          /* load a dword */
        vtbl.8  d1, {d1-d2}, d0         /* reorder */
        vst1.64 {d1}, [r3:64]!          /* store a dword */
        vmov    d1, d2                  /* prepare for next dword */
        vld1.64 {d2}, [r1:64]!          /* load a dword */
        vtbl.8  d1, {d1-d2}, d0         /* reorder */
        vst1.64 {d1}, [r3:64]!          /* store a dword */
        vmov    d1, d2                  /* prepare for next dword */
        vld1.64 {d2}, [r1:64]!          /* load a dword */
        vtbl.8  d1, {d1-d2}, d0         /* reorder */
        vst1.64 {d1}, [r3:64]!          /* store a dword */
        vmov    d1, d2                  /* prepare for next dword */
        beq     .Ldone                  /* no partial dword left, done */
        cmp     r2, r5                  /* are the bytes we have enough? */
        bls     .Lincongruent_finish    /*   yes, don't read past src */
        vld1.64 {d2}, [r1:64]!          /* load a dword */
        b       .Lincongruent_finish    /* write last partial dword */
#endif

.Lcongruent_main:
        vld1.32 {d0}, [r1:64]!          /* load next dword */
        cmp     r2, #8                  /* can we write a full dword? */
        blo     .Lfinish                /*   no, write final partial dword */
        vst1.32 {d0}, [r3:64]!          /* store dword */
        subs    r2, r2, #8              /* subtract it from length */
        beq     .Ldone                  /*   if 0, we're done! */
        tst     r3, #63                 /* have we hit a 64-byte boundary? */
        bne     .Lcongruent_main        /*   no, copy next dword */

        cmp     r2, #64                 /* can we write 8 dwords? */
        blo     .Lcongruent_loop        /*   no, go dword by dword */
        vldm    r1!, {d0-d7}            /* load next 8 dwords */
        cmp     r2, #128                /* can we write 16 dwords? */
        blo     3f                      /*   no, then deal with 8 dwords */

        /*
         * The following interleaves two 64-byte loads and stores.  The
         * second bank is d16-d23 so that the callee-saved d8-d15 are not
         * clobbered.
         */
1:      vldm    r1!, {d16-d23}          /* load next 8 dwords */
        vstm    r3!, {d0-d7}            /* store 8 more dwords */
        cmp     r2, #192                /* can we write 16+8 dwords? */
        blo     2f                      /*   no, don't load the next 8 dwords */
        vldm    r1!, {d0-d7}            /*   yes, load next 8 dwords */
2:      vstm    r3!, {d16-d23}          /* store 8 more dwords */
        subs    r2, r2, #128            /* we just stored 16 (8+8) dwords */
        beq     .Ldone                  /*   if 0, we're done! */
        cmp     r2, #128                /* can we write 16 dwords? */
        bhs     1b                      /*   yes, do it again */
        cmp     r2, #64                 /* have we loaded 8 dwords? */
        blo     .Lcongruent_loop        /*   no, proceed to do it by dword */

        /*
         * We now have 8 dwords we can write in d0-d7.
         */
3:      vstm    r3!, {d0-d7}            /* store 8 more dwords */
        subs    r2, r2, #64             /* we wrote 8 dwords */
        beq     .Ldone                  /*   if 0, we're done! */

.Lcongruent_loop:
        vld1.32 {d0}, [r1]!             /* load dword from src */
        cmp     r2, #8                  /* can we write a full dword? */
        blo     .Lfinish                /*   no, write last partial dword */
.Lcongruent_loop_start:
        vst1.32 {d0}, [r3]!             /* store dword into dst */
        subs    r2, r2, #8              /* subtract it from length */
        beq     .Ldone                  /*   if 0, we're done! */
        vld1.32 {d0}, [r1]!             /* load dword from src */
        cmp     r2, #8                  /* can we write a full dword? */
        bhs     .Lcongruent_loop_start  /*   yes, so do it */

.Lfinish:
        /*
         * Store the final bytes from d0.  Only bits 2..0 of r2 are
         * examined, as a 4-, 2- and 1-byte store respectively.
         */
        vmov    r4, r5, d0              /* get last dword from NEON */
        tst     r2, #4                  /* do we have at least 4 bytes left? */
        strne   r4, [r3], #4            /*   yes, store the 1st word */
        movne   r4, r5                  /*   and move 2nd word into place */
        tst     r2, #2                  /* do we have at least 2 bytes left? */
#ifdef __ARMEB__
        movne   r4, r4, ror #16         /*   yes, swap halfwords */
#endif
        strneh  r4, [r3], #2            /*   yes, store the halfword */
#ifdef __ARMEL__
        movne   r4, r4, lsr #16         /*   yes, discard just written bytes */
#endif
        tst     r2, #1                  /* do we have a final byte? */
#ifdef __ARMEB__
        movne   r4, r4, lsr #24         /*   yes, move MSB to LSB */
#endif
        strneb  r4, [r3], #1            /*   yes, store it */

.Ldone:
        pop     {r4-r5}                 /* restore registers */
        RET

        /* Identity byte indices for vtbl.8, biased by the src misalignment. */
        .p2align 3
.Ltbl_value:
#ifdef __ARMEL__
        .quad   0x0706050403020100
#else
        .quad   0x0001020304050607
#endif
END(memcpy)
278