1/*        $NetBSD: memcpy_xscale.S,v 1.6 2023/01/19 18:03:03 mlelstv Exp $      */
2
3/*
4 * Copyright 2003 Wasabi Systems, Inc.
5 * All rights reserved.
6 *
7 * Written by Steve C. Woodford for Wasabi Systems, Inc.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *      This product includes software developed for the NetBSD Project by
20 *      Wasabi Systems, Inc.
21 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
22 *    or promote products derived from this software without specific prior
23 *    written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 */
37
38#include <machine/asm.h>
39
40/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
/*
 * Copy len bytes from src to dst.
 *
 * Register roles, as used by the code below:
 *   r0 = dst; not written by any code visible here, so it is still the
 *        destination pointer at every RET
 *   r1 = source cursor        r2 = bytes left to copy
 *   r3 = destination cursor   ip = scratch
 *
 * Copies of 12 bytes or fewer are handed to .Lmemcpy_short.  Longer copies
 * first copy 1-3 single bytes until the destination cursor is word aligned.
 */
41ENTRY(memcpy)
42          pld       [r1]
43          cmp       r2, #0x0c
44          bls       .Lmemcpy_short                /* <= 12 bytes */
45          mov       r3, r0                        /* We must not clobber r0 */
46
47          /* Word-align the destination buffer */
48          ands      ip, r3, #0x03                 /* Already word aligned? */
49          beq       .Lmemcpy_wordaligned          /* Yup */
          /*
           * ip = dst & 3 (1..3), so 4 - ip bytes must be copied.  The first
           * byte is copied unconditionally; the flags from the cmp below add
           * a second byte when ip <= 2 (ls) and a third when ip < 2 (lo).
           * None of the ldrb/sub/strb instructions update the flags.
           */
50          cmp       ip, #0x02
51          ldrb      ip, [r1], #0x01
52          sub       r2, r2, #0x01
53          strb      ip, [r3], #0x01
54          ldrbls    ip, [r1], #0x01
55          subls     r2, r2, #0x01
56          strbls    ip, [r3], #0x01
57          ldrblo    ip, [r1], #0x01
58          sublo     r2, r2, #0x01
59          strblo    ip, [r3], #0x01
60
61          /* Destination buffer is now word aligned */
62.Lmemcpy_wordaligned:
          /*
           * ip = src & 3.  If it is non-zero the two buffers cannot both be
           * word aligned; .Lmemcpy_bad_align uses the value left in ip to
           * pick the matching shift-and-merge loop.
           */
63          ands      ip, r1, #0x03                 /* Is src also word-aligned? */
64          bne       .Lmemcpy_bad_align  /* Nope. Things just got bad */
65
66          /* Quad-align the destination buffer */
          /*
           * "Quad" here means 8 bytes (the mask is 0x07), which is what the
           * strd stores below are given.  If dst is only word aligned, one
           * word is copied.  The push sits between the conditional
           * instructions; it does not alter the flags, so subne/strne still
           * see the result of the tst.
           */
67          tst       r3, #0x07           /* Already quad aligned? */
68          ldrne     ip, [r1], #0x04
69          push      {r4-r9}             /* Free up some registers */
70          subne     r2, r2, #0x04
71          strne     ip, [r3], #0x04
72
73          /* Destination buffer quad aligned, source is at least word aligned */
          /* From here r2 is biased by -0x80 while the 128-byte loop is in use. */
74          subs      r2, r2, #0x80
75          blo       .Lmemcpy_w_lessthan128
76
77          /* Copy 128 bytes at a time */
          /*
           * Each pass moves 0x80 bytes using r4-r9 as three rotating register
           * pairs: the source is read one word at a time (it is only known to
           * be word aligned) and the destination is written with strd.
           * The LD:/ST: comments give the byte offsets, within the current
           * 128-byte block, that each instruction covers.
           * Each pld uses r1 + 0x18 after r1 has already advanced, which is
           * the block offset named in its "Prefetch" comment.
           * r2 stays biased by -0x80: the subs near the bottom leaves the
           * carry set (hs) only if another full block remains, and the final
           * strd does not disturb the flags before the bhs.
           */
78.Lmemcpy_w_loop128:
79          ldr       r4, [r1], #0x04               /* LD:00-03 */
80          ldr       r5, [r1], #0x04               /* LD:04-07 */
81          pld       [r1, #0x18]                   /* Prefetch 0x20 */
82          ldr       r6, [r1], #0x04               /* LD:08-0b */
83          ldr       r7, [r1], #0x04               /* LD:0c-0f */
84          ldr       r8, [r1], #0x04               /* LD:10-13 */
85          ldr       r9, [r1], #0x04               /* LD:14-17 */
86          strd      r4, r5, [r3], #0x08 /* ST:00-07 */
87          ldr       r4, [r1], #0x04               /* LD:18-1b */
88          ldr       r5, [r1], #0x04               /* LD:1c-1f */
89          strd      r6, r7, [r3], #0x08 /* ST:08-0f */
90          ldr       r6, [r1], #0x04               /* LD:20-23 */
91          ldr       r7, [r1], #0x04               /* LD:24-27 */
92          pld       [r1, #0x18]                   /* Prefetch 0x40 */
93          strd      r8, r9, [r3], #0x08 /* ST:10-17 */
94          ldr       r8, [r1], #0x04               /* LD:28-2b */
95          ldr       r9, [r1], #0x04               /* LD:2c-2f */
96          strd      r4, r5, [r3], #0x08 /* ST:18-1f */
97          ldr       r4, [r1], #0x04               /* LD:30-33 */
98          ldr       r5, [r1], #0x04               /* LD:34-37 */
99          strd      r6, r7, [r3], #0x08 /* ST:20-27 */
100          ldr       r6, [r1], #0x04               /* LD:38-3b */
101          ldr       r7, [r1], #0x04               /* LD:3c-3f */
102          strd      r8, r9, [r3], #0x08 /* ST:28-2f */
103          ldr       r8, [r1], #0x04               /* LD:40-43 */
104          ldr       r9, [r1], #0x04               /* LD:44-47 */
105          pld       [r1, #0x18]                   /* Prefetch 0x60 */
106          strd      r4, r5, [r3], #0x08 /* ST:30-37 */
107          ldr       r4, [r1], #0x04               /* LD:48-4b */
108          ldr       r5, [r1], #0x04               /* LD:4c-4f */
109          strd      r6, r7, [r3], #0x08 /* ST:38-3f */
110          ldr       r6, [r1], #0x04               /* LD:50-53 */
111          ldr       r7, [r1], #0x04               /* LD:54-57 */
112          strd      r8, r9, [r3], #0x08 /* ST:40-47 */
113          ldr       r8, [r1], #0x04               /* LD:58-5b */
114          ldr       r9, [r1], #0x04               /* LD:5c-5f */
115          strd      r4, r5, [r3], #0x08 /* ST:48-4f */
116          ldr       r4, [r1], #0x04               /* LD:60-63 */
117          ldr       r5, [r1], #0x04               /* LD:64-67 */
118          pld       [r1, #0x18]                   /* Prefetch 0x80 */
119          strd      r6, r7, [r3], #0x08 /* ST:50-57 */
120          ldr       r6, [r1], #0x04               /* LD:68-6b */
121          ldr       r7, [r1], #0x04               /* LD:6c-6f */
122          strd      r8, r9, [r3], #0x08 /* ST:58-5f */
123          ldr       r8, [r1], #0x04               /* LD:70-73 */
124          ldr       r9, [r1], #0x04               /* LD:74-77 */
125          strd      r4, r5, [r3], #0x08 /* ST:60-67 */
126          ldr       r4, [r1], #0x04               /* LD:78-7b */
127          ldr       r5, [r1], #0x04               /* LD:7c-7f */
128          strd      r6, r7, [r3], #0x08 /* ST:68-6f */
129          strd      r8, r9, [r3], #0x08 /* ST:70-77 */
130          subs      r2, r2, #0x80
131          strd      r4, r5, [r3], #0x08 /* ST:78-7f */
132          bhs       .Lmemcpy_w_loop128
133
134.Lmemcpy_w_lessthan128:
          /*
           * Fewer than 128 bytes remain.  Remove the -0x80 bias; if nothing is
           * left, restore r4-r9 and return.  Otherwise re-bias r2 by -0x20
           * for the 32-byte loop.
           */
135          adds      r2, r2, #0x80                 /* Adjust for extra sub */
136          popeq     {r4-r9}
137          RETc(eq)                      /* Return now if done */
138          subs      r2, r2, #0x20
139          blo       .Lmemcpy_w_lessthan32
140
141          /* Copy 32 bytes at a time */
          /*
           * Same scheme as the 128-byte loop, one 32-byte block per pass:
           * word loads, strd stores, r2 biased by -0x20.  The subs is placed
           * before the last strd; the strd leaves the flags alone, so bhs
           * still tests whether another full 32-byte block remains.
           */
142.Lmemcpy_w_loop32:
143          ldr       r4, [r1], #0x04
144          ldr       r5, [r1], #0x04
145          pld       [r1, #0x18]
146          ldr       r6, [r1], #0x04
147          ldr       r7, [r1], #0x04
148          ldr       r8, [r1], #0x04
149          ldr       r9, [r1], #0x04
150          strd      r4, r5, [r3], #0x08
151          ldr       r4, [r1], #0x04
152          ldr       r5, [r1], #0x04
153          strd      r6, r7, [r3], #0x08
154          strd      r8, r9, [r3], #0x08
155          subs      r2, r2, #0x20
156          strd      r4, r5, [r3], #0x08
157          bhs       .Lmemcpy_w_loop32
158
159.Lmemcpy_w_lessthan32:
160          adds      r2, r2, #0x20                 /* Adjust for extra sub */
161          popeq     {r4-r9}
162          RETc(eq)                      /* Return now if done */
163
          /*
           * 1..31 bytes remain.  Jump into the chain of 8-byte copy stanzas
           * below so that exactly (r2 & 0x18) bytes are copied by them:
           *   r4 = 0x18 - (r2 & 0x18) = 0, 8, 16 or 24
           * Every stanza is four instructions (16 bytes), hence the lsl #1.
           * pc reads as the address of the addne plus 8, i.e. the first
           * stanza; the nop only fills the slot in between.  With r4 == 0 the
           * addne is not executed and control falls through the nop.  With
           * r4 == 24 the jump lands on the pop after the last stanza, with
           * the flags still "ne" from the rsbs.
           */
164          and       r4, r2, #0x18
165          rsbs      r4, r4, #0x18
166          addne     pc, pc, r4, lsl #1
167          nop
168
169          /* At least 24 bytes remaining */
170          ldr       r4, [r1], #0x04
171          ldr       r5, [r1], #0x04
172          sub       r2, r2, #0x08
173          strd      r4, r5, [r3], #0x08
174
175          /* At least 16 bytes remaining */
176          ldr       r4, [r1], #0x04
177          ldr       r5, [r1], #0x04
178          sub       r2, r2, #0x08
179          strd      r4, r5, [r3], #0x08
180
181          /* At least 8 bytes remaining */
182          ldr       r4, [r1], #0x04
183          ldr       r5, [r1], #0x04
184          subs      r2, r2, #0x08                 /* Sets eq if nothing is left */
185          strd      r4, r5, [r3], #0x08
186
187          /* Less than 8 bytes remaining */
188          pop       {r4-r9}
189          RETc(eq)                      /* Return now if done */
          /*
           * 1..7 bytes left; only ip and r2 are used as scratch from here.
           * hs: at least one whole word remains, copy it.
           * eq: it was exactly one word, so the copy is complete.
           * lo: fewer than 4 bytes, undo the subtraction.
           */
190          subs      r2, r2, #0x04
191          ldrhs     ip, [r1], #0x04
192          strhs     ip, [r3], #0x04
193          RETc(eq)                      /* Return now if done */
194          addlo     r2, r2, #0x04
          /*
           * 1..3 trailing bytes.  The first is always copied.  After the cmp
           * the count is no longer needed, so r2 is reused to hold the second
           * byte (hs: count >= 2); ip is reloaded with the third (hi: == 3).
           */
195          ldrb      ip, [r1], #0x01
196          cmp       r2, #0x02
197          ldrbhs    r2, [r1], #0x01
198          strb      ip, [r3], #0x01
199          ldrbhi    ip, [r1]
200          strbhs    r2, [r3], #0x01
201          strbhi    ip, [r3]
202          RET
203
204
205/*
206 * At this point, it has not been possible to word align both buffers.
207 * The destination buffer is word aligned, but the source buffer is not.
 *
 * On entry ip = src & 3 (1, 2 or 3), set by the ands in
 * .Lmemcpy_wordaligned.  The source is read as aligned words and each
 * output word is assembled by shifting and merging two neighbouring
 * source words; there is one set of loops per misalignment.
208 */
209.Lmemcpy_bad_align:
210          push      {r4-r7}
211          bic       r1, r1, #0x03                 /* Round src down to a word */
212          cmp       ip, #2
          /*
           * ip is overwritten with the first aligned source word, part of
           * which precedes the data to copy.  The ldr does not change the
           * flags, so the branches below still act on the cmp.
           */
213          ldr       ip, [r1], #0x04
214          bhi       .Lmemcpy_bad3                 /* src & 3 == 3 */
215          beq       .Lmemcpy_bad2                 /* src & 3 == 2 */
216          b         .Lmemcpy_bad1                 /* src & 3 == 1 */
217
218.Lmemcpy_bad1_loop16:
          /*
           * Source misaligned by one byte (entered at .Lmemcpy_bad1 below).
           * ip always holds the most recently loaded aligned source word, of
           * which three bytes have not been stored yet.  Every output word
           * combines those three bytes with one byte of the next source word:
           * shift by 8 one way and by 24 the other, with the directions
           * swapped for big-endian (__ARMEB__).
           * This copy of the 16-byte body includes a pld and is only entered
           * while at least 0x20 bytes remain (see the cmp at .Lmemcpy_bad1).
           */
219#ifdef __ARMEB__
220          mov       r4, ip, lsl #8
221#else
222          mov       r4, ip, lsr #8
223#endif
224          ldr       r5, [r1], #0x04
225          pld       [r1, #0x018]
226          ldr       r6, [r1], #0x04
227          ldr       r7, [r1], #0x04
228          ldr       ip, [r1], #0x04
229#ifdef __ARMEB__
230          orr       r4, r4, r5, lsr #24
231          mov       r5, r5, lsl #8
232          orr       r5, r5, r6, lsr #24
233          mov       r6, r6, lsl #8
234          orr       r6, r6, r7, lsr #24
235          mov       r7, r7, lsl #8
236          orr       r7, r7, ip, lsr #24
237#else
238          orr       r4, r4, r5, lsl #24
239          mov       r5, r5, lsr #8
240          orr       r5, r5, r6, lsl #24
241          mov       r6, r6, lsr #8
242          orr       r6, r6, r7, lsl #24
243          mov       r7, r7, lsr #8
244          orr       r7, r7, ip, lsl #24
245#endif
246          str       r4, [r3], #0x04
247          str       r5, [r3], #0x04
248          str       r6, [r3], #0x04
249          str       r7, [r3], #0x04
250          sub       r2, r2, #0x10
251
252.Lmemcpy_bad1:
          /* Loop entry: choose by the number of bytes still to copy (r2). */
253          cmp       r2, #0x20
254          bhs       .Lmemcpy_bad1_loop16          /* >= 32: block with pld */
255          cmp       r2, #0x10
256          blo       .Lmemcpy_bad1_loop16_short    /* < 16: word/byte tail */
257
258          /* copy last 16 bytes (without preload) */
259#ifdef __ARMEB__
260          mov       r4, ip, lsl #8
261#else
262          mov       r4, ip, lsr #8
263#endif
264          ldr       r5, [r1], #0x04
265          ldr       r6, [r1], #0x04
266          ldr       r7, [r1], #0x04
267          ldr       ip, [r1], #0x04
268#ifdef __ARMEB__
269          orr       r4, r4, r5, lsr #24
270          mov       r5, r5, lsl #8
271          orr       r5, r5, r6, lsr #24
272          mov       r6, r6, lsl #8
273          orr       r6, r6, r7, lsr #24
274          mov       r7, r7, lsl #8
275          orr       r7, r7, ip, lsr #24
276#else
277          orr       r4, r4, r5, lsl #24
278          mov       r5, r5, lsr #8
279          orr       r5, r5, r6, lsl #24
280          mov       r6, r6, lsr #8
281          orr       r6, r6, r7, lsl #24
282          mov       r7, r7, lsr #8
283          orr       r7, r7, ip, lsl #24
284#endif
285          str       r4, [r3], #0x04
286          str       r5, [r3], #0x04
287          str       r6, [r3], #0x04
288          str       r7, [r3], #0x04
289          subs      r2, r2, #0x10
290          popeq     {r4-r7}
291          RETc(eq)                      /* Return now if done */
292
293.Lmemcpy_bad1_loop16_short:
          /*
           * Fewer than 16 bytes remain.  r2 is biased by -4 for the word
           * loop.  If less than a word is left, r1 (which has advanced past
           * the word held in ip) is moved back by 3 so that it addresses the
           * first byte not yet copied, for the byte tail in .Lmemcpy_bad_done.
           */
294          subs      r2, r2, #0x04
295          sublo     r1, r1, #0x03
296          blo       .Lmemcpy_bad_done
297
298.Lmemcpy_bad1_loop4:
          /* One merged word per pass while a whole word remains. */
299#ifdef __ARMEB__
300          mov       r4, ip, lsl #8
301#else
302          mov       r4, ip, lsr #8
303#endif
304          ldr       ip, [r1], #0x04
305          subs      r2, r2, #0x04
306#ifdef __ARMEB__
307          orr       r4, r4, ip, lsr #24
308#else
309          orr       r4, r4, ip, lsl #24
310#endif
311          str       r4, [r3], #0x04
312          bhs       .Lmemcpy_bad1_loop4
313          sub       r1, r1, #0x03                 /* Back to first uncopied byte */
314          b         .Lmemcpy_bad_done
315
316.Lmemcpy_bad2_loop16:
          /*
           * Source misaligned by two bytes (entered at .Lmemcpy_bad2 below).
           * ip holds the most recently loaded aligned source word, of which
           * two bytes have not been stored yet.  Every output word combines
           * those two bytes with two bytes of the next source word, so both
           * shifts are 16; only the directions differ for big-endian
           * (__ARMEB__).
           * This copy of the 16-byte body includes a pld and is only entered
           * while at least 0x20 bytes remain (see the cmp at .Lmemcpy_bad2).
           */
317#ifdef __ARMEB__
318          mov       r4, ip, lsl #16
319#else
320          mov       r4, ip, lsr #16
321#endif
322          ldr       r5, [r1], #0x04
323          pld       [r1, #0x018]
324          ldr       r6, [r1], #0x04
325          ldr       r7, [r1], #0x04
326          ldr       ip, [r1], #0x04
327#ifdef __ARMEB__
328          orr       r4, r4, r5, lsr #16
329          mov       r5, r5, lsl #16
330          orr       r5, r5, r6, lsr #16
331          mov       r6, r6, lsl #16
332          orr       r6, r6, r7, lsr #16
333          mov       r7, r7, lsl #16
334          orr       r7, r7, ip, lsr #16
335#else
336          orr       r4, r4, r5, lsl #16
337          mov       r5, r5, lsr #16
338          orr       r5, r5, r6, lsl #16
339          mov       r6, r6, lsr #16
340          orr       r6, r6, r7, lsl #16
341          mov       r7, r7, lsr #16
342          orr       r7, r7, ip, lsl #16
343#endif
344          str       r4, [r3], #0x04
345          str       r5, [r3], #0x04
346          str       r6, [r3], #0x04
347          str       r7, [r3], #0x04
348          sub       r2, r2, #0x10
349
350.Lmemcpy_bad2:
          /* Loop entry: choose by the number of bytes still to copy (r2). */
351          cmp       r2, #0x20
352          bhs       .Lmemcpy_bad2_loop16          /* >= 32: block with pld */
353          cmp       r2, #0x10
354          blo       .Lmemcpy_bad2_loop16_short    /* < 16: word/byte tail */
355
356          /* copy last 16 bytes (without preload) */
357#ifdef __ARMEB__
358          mov       r4, ip, lsl #16
359#else
360          mov       r4, ip, lsr #16
361#endif
362          ldr       r5, [r1], #0x04
363          ldr       r6, [r1], #0x04
364          ldr       r7, [r1], #0x04
365          ldr       ip, [r1], #0x04
366#ifdef __ARMEB__
367          orr       r4, r4, r5, lsr #16
368          mov       r5, r5, lsl #16
369          orr       r5, r5, r6, lsr #16
370          mov       r6, r6, lsl #16
371          orr       r6, r6, r7, lsr #16
372          mov       r7, r7, lsl #16
373          orr       r7, r7, ip, lsr #16
374#else
375          orr       r4, r4, r5, lsl #16
376          mov       r5, r5, lsr #16
377          orr       r5, r5, r6, lsl #16
378          mov       r6, r6, lsr #16
379          orr       r6, r6, r7, lsl #16
380          mov       r7, r7, lsr #16
381          orr       r7, r7, ip, lsl #16
382#endif
383          str       r4, [r3], #0x04
384          str       r5, [r3], #0x04
385          str       r6, [r3], #0x04
386          str       r7, [r3], #0x04
387          subs      r2, r2, #0x10
388          popeq     {r4-r7}
389          RETc(eq)                      /* Return now if done */
390
391.Lmemcpy_bad2_loop16_short:
          /*
           * Fewer than 16 bytes remain.  r2 is biased by -4 for the word
           * loop.  If less than a word is left, r1 (which has advanced past
           * the word held in ip) is moved back by 2 so that it addresses the
           * first byte not yet copied, for the byte tail in .Lmemcpy_bad_done.
           */
392          subs      r2, r2, #0x04
393          sublo     r1, r1, #0x02
394          blo       .Lmemcpy_bad_done
395
396.Lmemcpy_bad2_loop4:
          /* One merged word per pass while a whole word remains. */
397#ifdef __ARMEB__
398          mov       r4, ip, lsl #16
399#else
400          mov       r4, ip, lsr #16
401#endif
402          ldr       ip, [r1], #0x04
403          subs      r2, r2, #0x04
404#ifdef __ARMEB__
405          orr       r4, r4, ip, lsr #16
406#else
407          orr       r4, r4, ip, lsl #16
408#endif
409          str       r4, [r3], #0x04
410          bhs       .Lmemcpy_bad2_loop4
411          sub       r1, r1, #0x02                 /* Back to first uncopied byte */
412          b         .Lmemcpy_bad_done
413
414.Lmemcpy_bad3_loop16:
          /*
           * Source misaligned by three bytes (entered at .Lmemcpy_bad3 below).
           * ip holds the most recently loaded aligned source word, of which
           * one byte has not been stored yet.  Every output word combines
           * that byte with three bytes of the next source word: shift by 24
           * one way and by 8 the other, with the directions swapped for
           * big-endian (__ARMEB__).
           * This copy of the 16-byte body includes a pld and is only entered
           * while at least 0x20 bytes remain (see the cmp at .Lmemcpy_bad3).
           */
415#ifdef __ARMEB__
416          mov       r4, ip, lsl #24
417#else
418          mov       r4, ip, lsr #24
419#endif
420          ldr       r5, [r1], #0x04
421          pld       [r1, #0x018]
422          ldr       r6, [r1], #0x04
423          ldr       r7, [r1], #0x04
424          ldr       ip, [r1], #0x04
425#ifdef __ARMEB__
426          orr       r4, r4, r5, lsr #8
427          mov       r5, r5, lsl #24
428          orr       r5, r5, r6, lsr #8
429          mov       r6, r6, lsl #24
430          orr       r6, r6, r7, lsr #8
431          mov       r7, r7, lsl #24
432          orr       r7, r7, ip, lsr #8
433#else
434          orr       r4, r4, r5, lsl #8
435          mov       r5, r5, lsr #24
436          orr       r5, r5, r6, lsl #8
437          mov       r6, r6, lsr #24
438          orr       r6, r6, r7, lsl #8
439          mov       r7, r7, lsr #24
440          orr       r7, r7, ip, lsl #8
441#endif
442          str       r4, [r3], #0x04
443          str       r5, [r3], #0x04
444          str       r6, [r3], #0x04
445          str       r7, [r3], #0x04
446          sub       r2, r2, #0x10
447
448.Lmemcpy_bad3:
          /* Loop entry: choose by the number of bytes still to copy (r2). */
449          cmp       r2, #0x20
450          bhs       .Lmemcpy_bad3_loop16          /* >= 32: block with pld */
451          cmp       r2, #0x10
452          blo       .Lmemcpy_bad3_loop16_short    /* < 16: word/byte tail */
453
454          /* copy last 16 bytes (without preload) */
455#ifdef __ARMEB__
456          mov       r4, ip, lsl #24
457#else
458          mov       r4, ip, lsr #24
459#endif
460          ldr       r5, [r1], #0x04
461          ldr       r6, [r1], #0x04
462          ldr       r7, [r1], #0x04
463          ldr       ip, [r1], #0x04
464#ifdef __ARMEB__
465          orr       r4, r4, r5, lsr #8
466          mov       r5, r5, lsl #24
467          orr       r5, r5, r6, lsr #8
468          mov       r6, r6, lsl #24
469          orr       r6, r6, r7, lsr #8
470          mov       r7, r7, lsl #24
471          orr       r7, r7, ip, lsr #8
472#else
473          orr       r4, r4, r5, lsl #8
474          mov       r5, r5, lsr #24
475          orr       r5, r5, r6, lsl #8
476          mov       r6, r6, lsr #24
477          orr       r6, r6, r7, lsl #8
478          mov       r7, r7, lsr #24
479          orr       r7, r7, ip, lsl #8
480#endif
481          str       r4, [r3], #0x04
482          str       r5, [r3], #0x04
483          str       r6, [r3], #0x04
484          str       r7, [r3], #0x04
485          subs      r2, r2, #0x10
486          popeq     {r4-r7}
487          RETc(eq)                      /* Return now if done */
488
489.Lmemcpy_bad3_loop16_short:
          /*
           * Fewer than 16 bytes remain.  r2 is biased by -4 for the word
           * loop.  If less than a word is left, r1 (which has advanced past
           * the word held in ip) is moved back by 1 so that it addresses the
           * first byte not yet copied, for the byte tail in .Lmemcpy_bad_done.
           */
490          subs      r2, r2, #0x04
491          sublo     r1, r1, #0x01
492          blo       .Lmemcpy_bad_done
493
494.Lmemcpy_bad3_loop4:
          /* One merged word per pass while a whole word remains. */
495#ifdef __ARMEB__
496          mov       r4, ip, lsl #24
497#else
498          mov       r4, ip, lsr #24
499#endif
500          ldr       ip, [r1], #0x04
501          subs      r2, r2, #0x04
502#ifdef __ARMEB__
503          orr       r4, r4, ip, lsr #8
504#else
505          orr       r4, r4, ip, lsl #8
506#endif
507          str       r4, [r3], #0x04
508          bhs       .Lmemcpy_bad3_loop4
          /* Falls through into .Lmemcpy_bad_done after the adjustment. */
509          sub       r1, r1, #0x01                 /* Back to first uncopied byte */
510
511.Lmemcpy_bad_done:
          /*
           * Common exit for the three misaligned-source paths.  On entry r1
           * addresses the first uncopied source byte and r2 is the remaining
           * count biased by -4.  r4-r7 are restored first; only ip and r2 are
           * used as scratch afterwards.
           */
512          pop       {r4-r7}
513          adds      r2, r2, #0x04                 /* Remove bias: 0..3 left */
514          RETc(eq)
          /*
           * 1..3 trailing bytes.  The first is always copied.  After the cmp
           * the count is no longer needed, so r2 is reused to hold the second
           * byte (hs: count >= 2); ip is reloaded with the third (hi: == 3).
           */
515          ldrb      ip, [r1], #0x01
516          cmp       r2, #0x02
517          ldrbhs    r2, [r1], #0x01
518          strb      ip, [r3], #0x01
519          ldrbhi    ip, [r1]
520          strbhs    r2, [r3], #0x01
521          strbhi    ip, [r3]
522          RET
523
524
525/*
526 * Handle short copies (12 bytes or fewer), possibly misaligned.
527 * Some of these are *very* common, thanks to the network stack,
528 * and so are handled specially.
 *
 * In:  r0 = dst, r1 = src, r2 = len (0..12; see the cmp/bls at ENTRY).
 * Out: r0 is left unchanged.  r1, r2, r3 and ip are used as scratch.
 *
 * Unless _STANDALONE is defined, len indexes a table of one-instruction
 * entries: pc reads as the address of the add plus 8, which is the slot
 * for len == 0, and each following slot is 4 bytes (hence lsl #2).  This
 * relies on RET expanding to exactly one instruction; RET comes from
 * <machine/asm.h> and is not visible here.
 *
 * With _STANDALONE there is no table and every non-zero length uses the
 * bytewise loop.
529 */
530.Lmemcpy_short:
531#ifndef _STANDALONE
532          add       pc, pc, r2, lsl #2
533          nop
534          RET                                     /* 0x00 */
535          b         .Lmemcpy_bytewise   /* 0x01 */
536          b         .Lmemcpy_bytewise   /* 0x02 */
537          b         .Lmemcpy_bytewise   /* 0x03 */
538          b         .Lmemcpy_4                    /* 0x04 */
539          b         .Lmemcpy_bytewise   /* 0x05 */
540          b         .Lmemcpy_6                    /* 0x06 */
541          b         .Lmemcpy_bytewise   /* 0x07 */
542          b         .Lmemcpy_8                    /* 0x08 */
543          b         .Lmemcpy_bytewise   /* 0x09 */
544          b         .Lmemcpy_bytewise   /* 0x0a */
545          b         .Lmemcpy_bytewise   /* 0x0b */
546          b         .Lmemcpy_c                    /* 0x0c */
#else
          /*
           * No dispatch table filters out len == 0 here.  The bytewise loop
           * decrements before it tests, so a zero count would wrap and the
           * copy would run far past both buffers.  Return early instead.
           */
          cmp       r2, #0x00
          RETc(eq)                      /* Nothing to copy */
547#endif
548.Lmemcpy_bytewise:
          /*
           * Byte-at-a-time copy; requires r2 >= 1 on entry.  The load of the
           * next byte is issued before the loop branch and is skipped (ne
           * false) once the count reaches zero, so no byte beyond the source
           * is read.
           */
549          mov       r3, r0                        /* We must not clobber r0 */
550          ldrb      ip, [r1], #0x01
5511:        subs      r2, r2, #0x01
552          strb      ip, [r3], #0x01
553          ldrbne    ip, [r1], #0x01
554          bne       1b
555          RET
556
557#ifndef _STANDALONE
558/******************************************************************************
559 * Special case for 4 byte copies
 *
 * In: r0 = dst, r1 = src.  r1, r2 and r3 are used as scratch; r0 is only
 * used as a base address and is never written.
 *
 * The code is a 16-way jump table keyed on the low two address bits of both
 * pointers: index = ((dst & 3) << 2) | (src & 3).  Every case occupies one
 * 64-byte slot (LMEMCPY_4_PAD aligns to 1 << LMEMCPY_4_LOG2), so the target
 * is .Lmemcpy_4 + index * 64.  The four-digit binary number heading each
 * case is that index.  Case 0000 shares the first slot with the dispatch
 * code and is reached by falling through.
 *
 * In the register-content comments, the digits 0-3 name the source bytes in
 * memory order, 'x' is a byte outside the copy, '.' is a zero byte, and the
 * register is written most-significant byte first.
560 */
561#define   LMEMCPY_4_LOG2      6         /* 64 bytes */
562#define   LMEMCPY_4_PAD       .align LMEMCPY_4_LOG2
563          LMEMCPY_4_PAD
564.Lmemcpy_4:
565          and       r2, r1, #0x03
566          orr       r2, r2, r0, lsl #2
567          ands      r2, r2, #0x0f                 /* Table index; Z set for case 0000 */
          /*
           * pc reads as the address of this sub plus 8, which is
           * .Lmemcpy_4 + 0x14, so r3 becomes the address of .Lmemcpy_4.
           */
568          sub       r3, pc, #0x14
569          addne     pc, r3, r2, lsl #LMEMCPY_4_LOG2
570
571/*
572 * 0000: dst is 32-bit aligned, src is 32-bit aligned
573 */
574          ldr       r2, [r1]
575          str       r2, [r0]
576          RET
577          LMEMCPY_4_PAD
578
579/*
580 * 0001: dst is 32-bit aligned, src is 8-bit aligned
581 */
582          ldr       r3, [r1, #-1]                 /* BE:r3 = x012  LE:r3 = 210x */
583          ldr       r2, [r1, #3]                  /* BE:r2 = 3xxx  LE:r2 = xxx3 */
584#ifdef __ARMEB__
585          mov       r3, r3, lsl #8                /* r3 = 012. */
586          orr       r3, r3, r2, lsr #24 /* r3 = 0123 */
587#else
588          mov       r3, r3, lsr #8                /* r3 = .210 */
589          orr       r3, r3, r2, lsl #24 /* r3 = 3210 */
590#endif
591          str       r3, [r0]
592          RET
593          LMEMCPY_4_PAD
594
595/*
596 * 0010: dst is 32-bit aligned, src is 16-bit aligned
597 */
          /* r3 receives the half that goes in the upper 16 bits, r2 the lower. */
598#ifdef __ARMEB__
599          ldrh      r3, [r1]
600          ldrh      r2, [r1, #0x02]
601#else
602          ldrh      r3, [r1, #0x02]
603          ldrh      r2, [r1]
604#endif
605          orr       r3, r2, r3, lsl #16
606          str       r3, [r0]
607          RET
608          LMEMCPY_4_PAD
609
610/*
611 * 0011: dst is 32-bit aligned, src is 8-bit aligned
612 */
613          ldr       r3, [r1, #-3]                 /* BE:r3 = xxx0  LE:r3 = 0xxx */
614          ldr       r2, [r1, #1]                  /* BE:r2 = 123x  LE:r2 = x321 */
615#ifdef __ARMEB__
616          mov       r3, r3, lsl #24               /* r3 = 0... */
617          orr       r3, r3, r2, lsr #8  /* r3 = 0123 */
618#else
619          mov       r3, r3, lsr #24               /* r3 = ...0 */
620          orr       r3, r3, r2, lsl #8  /* r3 = 3210 */
621#endif
622          str       r3, [r0]
623          RET
624          LMEMCPY_4_PAD
625
626/*
627 * 0100: dst is 8-bit aligned, src is 32-bit aligned
628 */
          /* Stored as byte, halfword, byte; dst + 1 is halfword aligned here. */
629          ldr       r2, [r1]
630#ifdef __ARMEB__
631          strb      r2, [r0, #0x03]
632          mov       r3, r2, lsr #8
633          mov       r1, r2, lsr #24
634          strb      r1, [r0]
635#else
636          strb      r2, [r0]
637          mov       r3, r2, lsr #8
638          mov       r1, r2, lsr #24
639          strb      r1, [r0, #0x03]
640#endif
641          strh      r3, [r0, #0x01]
642          RET
643          LMEMCPY_4_PAD
644
645/*
646 * 0101: dst is 8-bit aligned, src is 8-bit aligned
647 */
          /* Same misalignment on both sides: byte, halfword, byte. */
648          ldrb      r2, [r1]
649          ldrh      r3, [r1, #0x01]
650          ldrb      r1, [r1, #0x03]
651          strb      r2, [r0]
652          strh      r3, [r0, #0x01]
653          strb      r1, [r0, #0x03]
654          RET
655          LMEMCPY_4_PAD
656
657/*
658 * 0110: dst is 8-bit aligned, src is 16-bit aligned
659 */
660          ldrh      r2, [r1]            /* BE:r2 = ..01  LE:r2 = ..10 */
661          ldrh      r3, [r1, #0x02]               /* BE:r3 = ..23  LE:r3 = ..32 */
662#ifdef __ARMEB__
663          mov       r1, r2, lsr #8                /* r1 = ...0 */
664          strb      r1, [r0]
665          mov       r2, r2, lsl #8                /* r2 = .01. */
666          orr       r2, r2, r3, lsr #8  /* r2 = .012 */
667#else
668          strb      r2, [r0]
669          mov       r2, r2, lsr #8                /* r2 = ...1 */
670          orr       r2, r2, r3, lsl #8  /* r2 = .321 */
671          mov       r3, r3, lsr #8                /* r3 = ...3 */
672#endif
673          strh      r2, [r0, #0x01]
674          strb      r3, [r0, #0x03]
675          RET
676          LMEMCPY_4_PAD
677
678/*
679 * 0111: dst is 8-bit aligned, src is 8-bit aligned
680 */
681          ldrb      r2, [r1]
682          ldrh      r3, [r1, #0x01]
683          ldrb      r1, [r1, #0x03]
684          strb      r2, [r0]
685          strh      r3, [r0, #0x01]
686          strb      r1, [r0, #0x03]
687          RET
688          LMEMCPY_4_PAD
689
690/*
691 * 1000: dst is 16-bit aligned, src is 32-bit aligned
692 */
693          ldr       r2, [r1]
694#ifdef __ARMEB__
695          strh      r2, [r0, #0x02]
696          mov       r3, r2, lsr #16
697          strh      r3, [r0]
698#else
699          strh      r2, [r0]
700          mov       r3, r2, lsr #16
701          strh      r3, [r0, #0x02]
702#endif
703          RET
704          LMEMCPY_4_PAD
705
706/*
707 * 1001: dst is 16-bit aligned, src is 8-bit aligned
708 */
709          ldr       r2, [r1, #-1]                 /* BE:r2 = x012  LE:r2 = 210x */
710          ldr       r3, [r1, #3]                  /* BE:r3 = 3xxx  LE:r3 = xxx3 */
711          mov       r1, r2, lsr #8                /* BE:r1 = .x01  LE:r1 = .210 */
712          strh      r1, [r0]
713#ifdef __ARMEB__
714          mov       r2, r2, lsl #8                /* r2 = 012. */
715          orr       r2, r2, r3, lsr #24 /* r2 = 0123 */
716#else
717          mov       r2, r2, lsr #24               /* r2 = ...2 */
718          orr       r2, r2, r3, lsl #8  /* r2 = xx32 */
719#endif
720          strh      r2, [r0, #0x02]
721          RET
722          LMEMCPY_4_PAD
723
724/*
725 * 1010: dst is 16-bit aligned, src is 16-bit aligned
726 */
727          ldrh      r2, [r1]
728          ldrh      r3, [r1, #0x02]
729          strh      r2, [r0]
730          strh      r3, [r0, #0x02]
731          RET
732          LMEMCPY_4_PAD
733
734/*
735 * 1011: dst is 16-bit aligned, src is 8-bit aligned
736 */
737          ldr       r3, [r1, #1]                  /* BE:r3 = 123x  LE:r3 = x321 */
738          ldr       r2, [r1, #-3]                 /* BE:r2 = xxx0  LE:r2 = 0xxx */
739          mov       r1, r3, lsr #8                /* BE:r1 = .123  LE:r1 = .x32 */
740          strh      r1, [r0, #0x02]
741#ifdef __ARMEB__
742          mov       r3, r3, lsr #24               /* r3 = ...1 */
743          orr       r3, r3, r2, lsl #8  /* r3 = xx01 */
744#else
745          mov       r3, r3, lsl #8                /* r3 = 321. */
746          orr       r3, r3, r2, lsr #24 /* r3 = 3210 */
747#endif
748          strh      r3, [r0]
749          RET
750          LMEMCPY_4_PAD
751
752/*
753 * 1100: dst is 8-bit aligned, src is 32-bit aligned
754 */
755          ldr       r2, [r1]            /* BE:r2 = 0123  LE:r2 = 3210 */
756#ifdef __ARMEB__
757          strb      r2, [r0, #0x03]
758          mov       r3, r2, lsr #8
759          mov       r1, r2, lsr #24
760          strh      r3, [r0, #0x01]
761          strb      r1, [r0]
762#else
763          strb      r2, [r0]
764          mov       r3, r2, lsr #8
765          mov       r1, r2, lsr #24
766          strh      r3, [r0, #0x01]
767          strb      r1, [r0, #0x03]
768#endif
769          RET
770          LMEMCPY_4_PAD
771
772/*
773 * 1101: dst is 8-bit aligned, src is 8-bit aligned
774 */
775          ldrb      r2, [r1]
776          ldrh      r3, [r1, #0x01]
777          ldrb      r1, [r1, #0x03]
778          strb      r2, [r0]
779          strh      r3, [r0, #0x01]
780          strb      r1, [r0, #0x03]
781          RET
782          LMEMCPY_4_PAD
783
784/*
785 * 1110: dst is 8-bit aligned, src is 16-bit aligned
786 */
787#ifdef __ARMEB__
788          ldrh      r3, [r1, #0x02]               /* BE:r3 = ..23  LE:r3 = ..32 */
789          ldrh      r2, [r1]            /* BE:r2 = ..01  LE:r2 = ..10 */
790          strb      r3, [r0, #0x03]
791          mov       r3, r3, lsr #8                /* r3 = ...2 */
792          orr       r3, r3, r2, lsl #8  /* r3 = ..12 */
793          strh      r3, [r0, #0x01]
794          mov       r2, r2, lsr #8                /* r2 = ...0 */
795          strb      r2, [r0]
796#else
797          ldrh      r2, [r1]            /* BE:r2 = ..01  LE:r2 = ..10 */
798          ldrh      r3, [r1, #0x02]               /* BE:r3 = ..23  LE:r3 = ..32 */
799          strb      r2, [r0]
800          mov       r2, r2, lsr #8                /* r2 = ...1 */
801          orr       r2, r2, r3, lsl #8  /* r2 = .321 */
802          strh      r2, [r0, #0x01]
803          mov       r3, r3, lsr #8                /* r3 = ...3 */
804          strb      r3, [r0, #0x03]
805#endif
806          RET
807          LMEMCPY_4_PAD
808
809/*
810 * 1111: dst is 8-bit aligned, src is 8-bit aligned
811 */
812          ldrb      r2, [r1]
813          ldrh      r3, [r1, #0x01]
814          ldrb      r1, [r1, #0x03]
815          strb      r2, [r0]
816          strh      r3, [r0, #0x01]
817          strb      r1, [r0, #0x03]
818          RET
819          LMEMCPY_4_PAD
820
821
822/******************************************************************************
823 * Special case for 6 byte copies
824 */
825#define   LMEMCPY_6_LOG2      6         /* 64 bytes */
826#define   LMEMCPY_6_PAD       .align LMEMCPY_6_LOG2
827          LMEMCPY_6_PAD
828.Lmemcpy_6:
829          and       r2, r1, #0x03
830          orr       r2, r2, r0, lsl #2
831          ands      r2, r2, #0x0f
832          sub       r3, pc, #0x14
833          addne     pc, r3, r2, lsl #LMEMCPY_6_LOG2
834
835/*
836 * 0000: dst is 32-bit aligned, src is 32-bit aligned
837 */
838          ldr       r2, [r1]
839          ldrh      r3, [r1, #0x04]
840          str       r2, [r0]
841          strh      r3, [r0, #0x04]
842          RET
843          LMEMCPY_6_PAD
844
845/*
846 * 0001: dst is 32-bit aligned, src is 8-bit aligned
847 */
848          ldr       r2, [r1, #-1]                 /* BE:r2 = x012  LE:r2 = 210x */
849          ldr       r3, [r1, #0x03]               /* BE:r3 = 345x  LE:r3 = x543 */
850#ifdef __ARMEB__
851          mov       r2, r2, lsl #8                /* r2 = 012. */
852          orr       r2, r2, r3, lsr #24 /* r2 = 0123 */
853#else
854          mov       r2, r2, lsr #8                /* r2 = .210 */
855          orr       r2, r2, r3, lsl #24 /* r2 = 3210 */
856#endif
857          mov       r3, r3, lsr #8                /* BE:r3 = .345  LE:r3 = .x54 */
858          str       r2, [r0]
859          strh      r3, [r0, #0x04]
860          RET
861          LMEMCPY_6_PAD
862
863/*
864 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 *
 * src & 3 == 2.  Bytes 0-1 come from a halfword load and bytes 2-5 from
 * the aligned word at src + 2.  They are regrouped into a word (bytes 0-3)
 * and a halfword (bytes 4-5) for the word aligned dst.
865 */
866          ldr       r3, [r1, #0x02]               /* BE:r3 = 2345  LE:r3 = 5432 */
867          ldrh      r2, [r1]            /* BE:r2 = ..01  LE:r2 = ..10 */
868#ifdef __ARMEB__
869          mov       r1, r3, lsr #16               /* r1 = ..23 */
870          orr       r1, r1, r2, lsl #16 /* r1 = 0123 */
871          str       r1, [r0]
872          strh      r3, [r0, #0x04]               /* low halfword of r3 = 45 */
873#else
874          mov       r1, r3, lsr #16               /* r1 = ..54 */
875          orr       r2, r2, r3, lsl #16 /* r2 = 3210 */
876          str       r2, [r0]
877          strh      r1, [r0, #0x04]
878#endif
879          RET
880          LMEMCPY_6_PAD
881
882/*
883 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 *
 * src & 3 == 3.  Three aligned word loads at src - 3, src + 1 and src + 5.
 * Only byte 0 of the first word and byte 5 of the last word are used, the
 * other bytes of those two words lie outside the copy.
884 */
885          ldr       r2, [r1, #-3]                 /* BE:r2 = xxx0  LE:r2 = 0xxx */
886          ldr       r3, [r1, #1]                  /* BE:r3 = 1234  LE:r3 = 4321 */
887          ldr       r1, [r1, #5]                  /* BE:r1 = 5xxx  LE:r1 = xxx5 */
888#ifdef __ARMEB__
889          mov       r2, r2, lsl #24               /* r2 = 0... */
890          orr       r2, r2, r3, lsr #8  /* r2 = 0123 */
891          mov       r3, r3, lsl #8                /* r3 = 234. */
892          orr       r1, r3, r1, lsr #24 /* r1 = 2345 */
893#else
894          mov       r2, r2, lsr #24               /* r2 = ...0 */
895          orr       r2, r2, r3, lsl #8  /* r2 = 3210 */
896          mov       r1, r1, lsl #8                /* r1 = xx5. */
897          orr       r1, r1, r3, lsr #24 /* r1 = xx54 */
898#endif
899          str       r2, [r0]                      /* bytes 0-3 */
900          strh      r1, [r0, #0x04]               /* bytes 4-5 = low halfword of r1 */
901          RET
902          LMEMCPY_6_PAD
903
904/*
905 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 *
 * dst & 3 == 1.  The source is read as word + halfword and written as
 * byte, halfword, halfword, byte (offsets 0, 1, 3, 5) so that every store
 * is naturally aligned.
906 */
907          ldr       r3, [r1]            /* BE:r3 = 0123  LE:r3 = 3210 */
908          ldrh      r2, [r1, #0x04]               /* BE:r2 = ..45  LE:r2 = ..54 */
909          mov       r1, r3, lsr #8                /* BE:r1 = .012  LE:r1 = .321 */
910          strh      r1, [r0, #0x01]               /* bytes 1-2 = low halfword of r1 */
911#ifdef __ARMEB__
912          mov       r1, r3, lsr #24               /* r1 = ...0 */
913          strb      r1, [r0]
914          mov       r3, r3, lsl #8                /* r3 = 123. */
915          orr       r3, r3, r2, lsr #8  /* r3 = 1234 */
916#else
917          strb      r3, [r0]                      /* byte 0 = low byte of r3 */
918          mov       r3, r3, lsr #24               /* r3 = ...3 */
919          orr       r3, r3, r2, lsl #8  /* r3 = .543 */
920          mov       r2, r2, lsr #8                /* r2 = ...5 */
921#endif
922          strh      r3, [r0, #0x03]               /* bytes 3-4 = low halfword of r3 */
923          strb      r2, [r0, #0x05]               /* byte 5 (BE: r2 is still ..45, low byte is 5) */
924          RET
925          LMEMCPY_6_PAD
926
927/*
928 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 *
 * src & 3 == 1 and dst & 3 == 1.  Both pointers share the same
 * misalignment, so byte, halfword, halfword, byte transfers at offsets
 * 0, 1, 3, 5 are naturally aligned on both sides and nothing is shifted.
929 */
930          ldrb      r2, [r1]                      /* byte 0 */
931          ldrh      r3, [r1, #0x01]               /* bytes 1-2 */
932          ldrh      ip, [r1, #0x03]               /* bytes 3-4 */
933          ldrb      r1, [r1, #0x05]               /* byte 5, src pointer no longer needed */
934          strb      r2, [r0]
935          strh      r3, [r0, #0x01]
936          strh      ip, [r0, #0x03]
937          strb      r1, [r0, #0x05]
938          RET
939          LMEMCPY_6_PAD
940
941/*
942 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 *
 * dst & 3 == 1, src & 3 == 2.  The source is read as halfword + word and
 * written as byte, halfword, halfword, byte at offsets 0, 1, 3, 5.  The
 * stores are issued as 0, 5, 3-4, 1-2.
943 */
944          ldrh      r2, [r1]            /* BE:r2 = ..01  LE:r2 = ..10 */
945          ldr       r1, [r1, #0x02]               /* BE:r1 = 2345  LE:r1 = 5432 */
946#ifdef __ARMEB__
947          mov       r3, r2, lsr #8                /* r3 = ...0 */
948          strb      r3, [r0]
949          strb      r1, [r0, #0x05]               /* byte 5 = low byte of r1 */
950          mov       r3, r1, lsr #8                /* r3 = .234 */
951          strh      r3, [r0, #0x03]
952          mov       r3, r2, lsl #8                /* r3 = .01. */
953          orr       r3, r3, r1, lsr #24 /* r3 = .012 */
954          strh      r3, [r0, #0x01]
955#else
956          strb      r2, [r0]                      /* byte 0 = low byte of r2 */
957          mov       r3, r1, lsr #24               /* r3 = ...5 */
958          strb      r3, [r0, #0x05]
959          mov       r3, r1, lsr #8                /* r3 = .543 */
960          strh      r3, [r0, #0x03]
961          mov       r3, r2, lsr #8                /* r3 = ...1 */
962          orr       r3, r3, r1, lsl #8  /* r3 = 4321 */
963          strh      r3, [r0, #0x01]
964#endif
965          RET
966          LMEMCPY_6_PAD
967
968/*
969 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 *
 * dst & 3 == 1, src & 3 == 3.  Both pointers are odd, so offsets 1 and 3
 * are halfword aligned on both sides.  Byte, halfword, halfword, byte
 * with no shifting.
970 */
971          ldrb      r2, [r1]                      /* byte 0 */
972          ldrh      r3, [r1, #0x01]               /* bytes 1-2 */
973          ldrh      ip, [r1, #0x03]               /* bytes 3-4 */
974          ldrb      r1, [r1, #0x05]               /* byte 5, src pointer no longer needed */
975          strb      r2, [r0]
976          strh      r3, [r0, #0x01]
977          strh      ip, [r0, #0x03]
978          strb      r1, [r0, #0x05]
979          RET
980          LMEMCPY_6_PAD
981
982/*
983 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 *
 * dst & 3 == 2.  The source is read as word + halfword and written as a
 * halfword (bytes 0-1) followed by the word at dst + 2 (bytes 2-5), which
 * is word aligned.
984 */
985#ifdef __ARMEB__
986          ldr       r2, [r1]            /* r2 = 0123 */
987          ldrh      r3, [r1, #0x04]               /* r3 = ..45 */
988          mov       r1, r2, lsr #16               /* r1 = ..01 */
989          orr       r3, r3, r2, lsl#16  /* r3 = 2345 */
990          strh      r1, [r0]
991          str       r3, [r0, #0x02]
992#else
993          ldrh      r2, [r1, #0x04]               /* r2 = ..54 */
994          ldr       r3, [r1]            /* r3 = 3210 */
995          mov       r2, r2, lsl #16               /* r2 = 54.. */
996          orr       r2, r2, r3, lsr #16 /* r2 = 5432 */
997          strh      r3, [r0]                      /* bytes 0-1 = low halfword of r3 */
998          str       r2, [r0, #0x02]
999#endif
1000          RET
1001          LMEMCPY_6_PAD
1002
1003/*
1004 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 *
 * dst & 3 == 2, src & 3 == 1.  Two aligned word loads at src - 1 and
 * src + 3 (each also fetches one byte outside the copy).  Written as a
 * halfword (bytes 0-1) plus the aligned word at dst + 2 (bytes 2-5).
1005 */
1006          ldr       r3, [r1, #-1]                 /* BE:r3 = x012  LE:r3 = 210x */
1007          ldr       r2, [r1, #3]                  /* BE:r2 = 345x  LE:r2 = x543 */
1008          mov       r1, r3, lsr #8                /* BE:r1 = .x01  LE:r1 = .210 */
1009#ifdef __ARMEB__
1010          mov       r2, r2, lsr #8                /* r2 = .345 */
1011          orr       r2, r2, r3, lsl #24 /* r2 = 2345 */
1012#else
1013          mov       r2, r2, lsl #8                /* r2 = 543. */
1014          orr       r2, r2, r3, lsr #24 /* r2 = 5432 */
1015#endif
1016          strh      r1, [r0]                      /* bytes 0-1 = low halfword of r1 */
1017          str       r2, [r0, #0x02]               /* bytes 2-5 */
1018          RET
1019          LMEMCPY_6_PAD
1020
1021/*
1022 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 *
 * Both pointers are 2 mod 4, so a halfword at offset 0 followed by a word
 * at offset 2 is naturally aligned on both sides.  No shifting needed.
1023 */
1024          ldrh      r2, [r1]                      /* bytes 0-1 */
1025          ldr       r3, [r1, #0x02]               /* bytes 2-5 */
1026          strh      r2, [r0]
1027          str       r3, [r0, #0x02]
1028          RET
1029          LMEMCPY_6_PAD
1030
1031/*
1032 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 *
 * dst & 3 == 2, src & 3 == 3.  The source is read as byte, aligned word
 * at src + 1, byte, and written as a halfword (bytes 0-1) plus the aligned
 * word at dst + 2 (bytes 2-5).
1033 */
1034          ldrb      r3, [r1]            /* r3 = ...0 */
1035          ldr       r2, [r1, #0x01]               /* BE:r2 = 1234  LE:r2 = 4321 */
1036          ldrb      r1, [r1, #0x05]               /* r1 = ...5 */
1037#ifdef __ARMEB__
1038          mov       r3, r3, lsl #8                /* r3 = ..0. */
1039          orr       r3, r3, r2, lsr #24 /* r3 = ..01 */
1040          orr       r1, r1, r2, lsl #8  /* r1 = 2345 */
1041#else
1042          orr       r3, r3, r2, lsl #8  /* r3 = 3210 */
1043          mov       r1, r1, lsl #24               /* r1 = 5... */
1044          orr       r1, r1, r2, lsr #8  /* r1 = 5432 */
1045#endif
1046          strh      r3, [r0]                      /* bytes 0-1 = low halfword of r3 */
1047          str       r1, [r0, #0x02]               /* bytes 2-5 */
1048          RET
1049          LMEMCPY_6_PAD
1050
1051/*
1052 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 *
 * dst & 3 == 3.  The source is read as word + halfword and written as
 * byte (0), aligned word at dst + 1 (bytes 1-4), byte (5).
1053 */
1054          ldr       r2, [r1]            /* BE:r2 = 0123  LE:r2 = 3210 */
1055          ldrh      r1, [r1, #0x04]               /* BE:r1 = ..45  LE:r1 = ..54 */
1056#ifdef __ARMEB__
1057          mov       r3, r2, lsr #24               /* r3 = ...0 */
1058          strb      r3, [r0]
1059          mov       r2, r2, lsl #8                /* r2 = 123. */
1060          orr       r2, r2, r1, lsr #8  /* r2 = 1234 */
1061#else
1062          strb      r2, [r0]                      /* byte 0 = low byte of r2 */
1063          mov       r2, r2, lsr #8                /* r2 = .321 */
1064          orr       r2, r2, r1, lsl #24 /* r2 = 4321 */
1065          mov       r1, r1, lsr #8                /* r1 = ...5 */
1066#endif
1067          str       r2, [r0, #0x01]               /* bytes 1-4 */
1068          strb      r1, [r0, #0x05]               /* byte 5 (BE: r1 is still ..45, low byte is 5) */
1069          RET
1070          LMEMCPY_6_PAD
1071
1072/*
1073 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 *
 * dst & 3 == 3, src & 3 == 1.  Both pointers are odd, so offsets 1 and 3
 * are halfword aligned on both sides.  Byte, halfword, halfword, byte
 * with no shifting.
1074 */
1075          ldrb      r2, [r1]                      /* byte 0 */
1076          ldrh      r3, [r1, #0x01]               /* bytes 1-2 */
1077          ldrh      ip, [r1, #0x03]               /* bytes 3-4 */
1078          ldrb      r1, [r1, #0x05]               /* byte 5, src pointer no longer needed */
1079          strb      r2, [r0]
1080          strh      r3, [r0, #0x01]
1081          strh      ip, [r0, #0x03]
1082          strb      r1, [r0, #0x05]
1083          RET
1084          LMEMCPY_6_PAD
1085
1086/*
1087 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 *
 * dst & 3 == 3, src & 3 == 2.  The source is read as halfword + aligned
 * word at src + 2 and written as byte (0), aligned word at dst + 1
 * (bytes 1-4), byte (5).
1088 */
1089          ldrh      r2, [r1]            /* BE:r2 = ..01  LE:r2 = ..10 */
1090          ldr       r1, [r1, #0x02]               /* BE:r1 = 2345  LE:r1 = 5432 */
1091#ifdef __ARMEB__
1092          mov       r3, r2, lsr #8                /* r3 = ...0 */
1093          strb      r3, [r0]
1094          mov       r2, r2, lsl #24               /* r2 = 1... */
1095          orr       r2, r2, r1, lsr #8  /* r2 = 1234 */
1096#else
1097          strb      r2, [r0]                      /* byte 0 = low byte of r2 */
1098          mov       r2, r2, lsr #8                /* r2 = ...1 */
1099          orr       r2, r2, r1, lsl #8  /* r2 = 4321 */
1100          mov       r1, r1, lsr #24               /* r1 = ...5 */
1101#endif
1102          str       r2, [r0, #0x01]               /* bytes 1-4 */
1103          strb      r1, [r0, #0x05]               /* byte 5 (BE: low byte of 2345) */
1104          RET
1105          LMEMCPY_6_PAD
1106
1107/*
1108 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 *
 * Both pointers are 3 mod 4, so offset 1 is word aligned on both sides.
 * Byte (0), word (1-4), byte (5) with no shifting.
1109 */
1110          ldrb      r2, [r1]                      /* byte 0 */
1111          ldr       r3, [r1, #0x01]               /* bytes 1-4 */
1112          ldrb      r1, [r1, #0x05]               /* byte 5, src pointer no longer needed */
1113          strb      r2, [r0]
1114          str       r3, [r0, #0x01]
1115          strb      r1, [r0, #0x05]
1116          RET
1117          LMEMCPY_6_PAD
1118
1119
1120/******************************************************************************
1121 * Special case for 8 byte copies
 *
 * On entry r0 = dst and r1 = src.  The low two bits of dst and of src are
 * packed into a 4-bit index (dst in bits 3:2, src in bits 1:0) and control
 * jumps to .Lmemcpy_8 + (index << LMEMCPY_8_LOG2).  Each alignment handler
 * owns one slot of that size.  LMEMCPY_8_PAD after a handler only pads up
 * to the next slot boundary, so a handler must never outgrow its slot.
 *
 * Notation used in the register comments: digits are source byte numbers,
 * written most significant byte first, "." is a zero byte and "x" is a
 * byte that does not belong to the copy.  BE/LE = big/little endian.
1122 */
1123#define   LMEMCPY_8_LOG2      6         /* 64 bytes */
1124#define   LMEMCPY_8_PAD       .align LMEMCPY_8_LOG2
1125          LMEMCPY_8_PAD
1126.Lmemcpy_8:
1127          and       r2, r1, #0x03                 /* r2 = src & 3 */
1128          orr       r2, r2, r0, lsl #2            /* dst bits land above the src bits */
1129          ands      r2, r2, #0x0f                 /* keep 4 bits, Z set when index is 0 */
1130          sub       r3, pc, #0x14                 /* r3 = .Lmemcpy_8: pc reads as this insn + 8, and this insn is 0x0c past the label */
1131          addne     pc, r3, r2, lsl #LMEMCPY_8_LOG2         /* index 0 falls through to the 0000 handler */
1132
1133/*
1134 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 *
 * Reached by fall-through from the dispatch above.  Two aligned words.
1135 */
1136          ldr       r2, [r1]                      /* bytes 0-3 */
1137          ldr       r3, [r1, #0x04]               /* bytes 4-7 */
1138          str       r2, [r0]
1139          str       r3, [r0, #0x04]
1140          RET
1141          LMEMCPY_8_PAD
1142
1143/*
1144 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 *
 * src & 3 == 1.  Aligned words at src - 1 and src + 3 plus the final byte
 * at src + 7.  The first word also fetches the byte just below src.  The
 * pieces are shifted together into two words for the aligned dst.
1145 */
1146          ldr       r3, [r1, #-1]                 /* BE:r3 = x012  LE:r3 = 210x */
1147          ldr       r2, [r1, #0x03]               /* BE:r2 = 3456  LE:r2 = 6543 */
1148          ldrb      r1, [r1, #0x07]               /* r1 = ...7 */
1149#ifdef __ARMEB__
1150          mov       r3, r3, lsl #8                /* r3 = 012. */
1151          orr       r3, r3, r2, lsr #24 /* r3 = 0123 */
1152          orr       r2, r1, r2, lsl #8  /* r2 = 4567 */
1153#else
1154          mov       r3, r3, lsr #8                /* r3 = .210 */
1155          orr       r3, r3, r2, lsl #24 /* r3 = 3210 */
1156          mov       r1, r1, lsl #24               /* r1 = 7... */
1157          orr       r2, r1, r2, lsr #8  /* r2 = 7654 */
1158#endif
1159          str       r3, [r0]                      /* bytes 0-3 */
1160          str       r2, [r0, #0x04]               /* bytes 4-7 */
1161          RET
1162          LMEMCPY_8_PAD
1163
1164/*
1165 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 *
 * src & 3 == 2.  The source is read as halfword, aligned word at src + 2,
 * halfword, and regrouped into two words for the aligned dst.
1166 */
1167          ldrh      r2, [r1]            /* BE:r2 = ..01  LE:r2 = ..10 */
1168          ldr       r3, [r1, #0x02]               /* BE:r3 = 2345  LE:r3 = 5432 */
1169          ldrh      r1, [r1, #0x06]               /* BE:r1 = ..67  LE:r1 = ..76 */
1170#ifdef __ARMEB__
1171          mov       r2, r2, lsl #16               /* r2 = 01.. */
1172          orr       r2, r2, r3, lsr #16 /* r2 = 0123 */
1173          orr       r3, r1, r3, lsl #16 /* r3 = 4567 */
1174#else
1175          orr       r2, r2, r3, lsl #16 /* r2 = 3210 */
1176          mov       r3, r3, lsr #16               /* r3 = ..54 */
1177          orr       r3, r3, r1, lsl #16 /* r3 = 7654 */
1178#endif
1179          str       r2, [r0]                      /* bytes 0-3 */
1180          str       r3, [r0, #0x04]               /* bytes 4-7 */
1181          RET
1182          LMEMCPY_8_PAD
1183
1184/*
1185 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 *
 * src & 3 == 3.  Byte 0, then aligned words at src + 1 and src + 5.  The
 * last word also fetches the byte just past the end of the copy.  The
 * pieces are shifted together into two words for the aligned dst.
1186 */
1187          ldrb      r3, [r1]            /* r3 = ...0 */
1188          ldr       r2, [r1, #0x01]               /* BE:r2 = 1234  LE:r2 = 4321 */
1189          ldr       r1, [r1, #0x05]               /* BE:r1 = 567x  LE:r1 = x765 */
1190#ifdef __ARMEB__
1191          mov       r3, r3, lsl #24               /* r3 = 0... */
1192          orr       r3, r3, r2, lsr #8  /* r3 = 0123 */
1193          mov       r2, r2, lsl #24               /* r2 = 4... */
1194          orr       r2, r2, r1, lsr #8  /* r2 = 4567 */
1195#else
1196          orr       r3, r3, r2, lsl #8  /* r3 = 3210 */
1197          mov       r2, r2, lsr #24               /* r2 = ...4 */
1198          orr       r2, r2, r1, lsl #8  /* r2 = 7654 */
1199#endif
1200          str       r3, [r0]                      /* bytes 0-3 */
1201          str       r2, [r0, #0x04]               /* bytes 4-7 */
1202          RET
1203          LMEMCPY_8_PAD
1204
1205/*
1206 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 *
 * dst & 3 == 1.  The source is read as two words and written as byte (0),
 * halfword (1-2), aligned word at dst + 3 (bytes 3-6), byte (7).
1207 */
1208          ldr       r3, [r1]            /* BE:r3 = 0123  LE:r3 = 3210 */
1209          ldr       r2, [r1, #0x04]               /* BE:r2 = 4567  LE:r2 = 7654 */
1210#ifdef __ARMEB__
1211          mov       r1, r3, lsr #24               /* r1 = ...0 */
1212          strb      r1, [r0]
1213          mov       r1, r3, lsr #8                /* r1 = .012 */
1214          strb      r2, [r0, #0x07]               /* byte 7 = low byte of r2 */
1215          mov       r3, r3, lsl #24               /* r3 = 3... */
1216          orr       r3, r3, r2, lsr #8  /* r3 = 3456 */
1217#else
1218          strb      r3, [r0]                      /* byte 0 = low byte of r3 */
1219          mov       r1, r2, lsr #24               /* r1 = ...7 */
1220          strb      r1, [r0, #0x07]
1221          mov       r1, r3, lsr #8                /* r1 = .321 */
1222          mov       r3, r3, lsr #24               /* r3 = ...3 */
1223          orr       r3, r3, r2, lsl #8  /* r3 = 6543 */
1224#endif
1225          strh      r1, [r0, #0x01]               /* bytes 1-2 = low halfword of r1 */
1226          str       r3, [r0, #0x03]               /* bytes 3-6 */
1227          RET
1228          LMEMCPY_8_PAD
1229
1230/*
1231 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 *
 * src & 3 == 1 and dst & 3 == 1.  Same misalignment on both sides, so
 * byte (0), halfword (1-2), word (3-6), byte (7) is naturally aligned for
 * both the loads and the stores and nothing is shifted.
1232 */
1233          ldrb      r2, [r1]                      /* byte 0 */
1234          ldrh      r3, [r1, #0x01]               /* bytes 1-2 */
1235          ldr       ip, [r1, #0x03]               /* bytes 3-6 */
1236          ldrb      r1, [r1, #0x07]               /* byte 7, src pointer no longer needed */
1237          strb      r2, [r0]
1238          strh      r3, [r0, #0x01]
1239          str       ip, [r0, #0x03]
1240          strb      r1, [r0, #0x07]
1241          RET
1242          LMEMCPY_8_PAD
1243
1244/*
1245 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 *
 * dst & 3 == 1, src & 3 == 2.  The source is read as halfword, aligned
 * word at src + 2, halfword, and written as byte (0), halfword (1-2),
 * aligned word at dst + 3 (bytes 3-6), byte (7).
1246 */
1247          ldrh      r2, [r1]            /* BE:r2 = ..01  LE:r2 = ..10 */
1248          ldr       r3, [r1, #0x02]               /* BE:r3 = 2345  LE:r3 = 5432 */
1249          ldrh      r1, [r1, #0x06]               /* BE:r1 = ..67  LE:r1 = ..76 */
1250#ifdef __ARMEB__
1251          mov       ip, r2, lsr #8                /* ip = ...0 */
1252          strb      ip, [r0]
1253          mov       ip, r2, lsl #8                /* ip = .01. */
1254          orr       ip, ip, r3, lsr #24 /* ip = .012 */
1255          strb      r1, [r0, #0x07]               /* byte 7 = low byte of r1 */
1256          mov       r3, r3, lsl #8                /* r3 = 345. */
1257          orr       r3, r3, r1, lsr #8  /* r3 = 3456 */
1258#else
1259          strb      r2, [r0]            /* 0 */
1260          mov       ip, r1, lsr #8                /* ip = ...7 */
1261          strb      ip, [r0, #0x07]               /* 7 */
1262          mov       ip, r2, lsr #8                /* ip = ...1 */
1263          orr       ip, ip, r3, lsl #8  /* ip = 4321 */
1264          mov       r3, r3, lsr #8                /* r3 = .543 */
1265          orr       r3, r3, r1, lsl #24 /* r3 = 6543 */
1266#endif
1267          strh      ip, [r0, #0x01]               /* bytes 1-2 = low halfword of ip */
1268          str       r3, [r0, #0x03]               /* bytes 3-6 */
1269          RET
1270          LMEMCPY_8_PAD
1271
1272/*
1273 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 *
 * dst & 3 == 1, src & 3 == 3.  The source is read as byte, aligned word
 * at src + 1, halfword, byte, and written as byte (0), halfword (1-2),
 * aligned word at dst + 3 (bytes 3-6), byte (7).
1274 */
1275          ldrb      r3, [r1]            /* r3 = ...0 */
1276          ldr       ip, [r1, #0x01]               /* BE:ip = 1234  LE:ip = 4321 */
1277          ldrh      r2, [r1, #0x05]               /* BE:r2 = ..56  LE:r2 = ..65 */
1278          ldrb      r1, [r1, #0x07]               /* r1 = ...7 */
1279          strb      r3, [r0]
1280          mov       r3, ip, lsr #16               /* BE:r3 = ..12  LE:r3 = ..43 */
1281#ifdef __ARMEB__
1282          strh      r3, [r0, #0x01]
1283          orr       r2, r2, ip, lsl #16 /* r2 = 3456 */
1284#else
1285          strh      ip, [r0, #0x01]               /* bytes 1-2 = low halfword of ip */
1286          orr       r2, r3, r2, lsl #16 /* r2 = 6543 */
1287#endif
1288          str       r2, [r0, #0x03]               /* bytes 3-6 */
1289          strb      r1, [r0, #0x07]
1290          RET
1291          LMEMCPY_8_PAD
1292
1293/*
1294 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 *
 * dst & 3 == 2.  The source is read as two words and written as halfword
 * (0-1), aligned word at dst + 2 (bytes 2-5), halfword (6-7).
1295 */
1296          ldr       r2, [r1]            /* BE:r2 = 0123  LE:r2 = 3210 */
1297          ldr       r3, [r1, #0x04]               /* BE:r3 = 4567  LE:r3 = 7654 */
1298          mov       r1, r2, lsr #16               /* BE:r1 = ..01  LE:r1 = ..32 */
1299#ifdef __ARMEB__
1300          strh      r1, [r0]
1301          mov       r1, r3, lsr #16               /* r1 = ..45 */
1302          orr       r2, r1 ,r2, lsl #16 /* r2 = 2345 */
1303#else
1304          strh      r2, [r0]                      /* bytes 0-1 = low halfword of r2 */
1305          orr       r2, r1, r3, lsl #16 /* r2 = 5432 */
1306          mov       r3, r3, lsr #16               /* r3 = ..76 */
1307#endif
1308          str       r2, [r0, #0x02]               /* bytes 2-5 */
1309          strh      r3, [r0, #0x06]               /* bytes 6-7 (BE: low halfword of 4567) */
1310          RET
1311          LMEMCPY_8_PAD
1312
1313/*
1314 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 *
 * dst & 3 == 2, src & 3 == 1.  Aligned words at src - 1 and src + 3 plus
 * the final byte.  The first word also fetches the byte just below src.
 * Written as halfword (0-1), aligned word at dst + 2 (2-5), halfword (6-7).
1315 */
1316          ldr       r2, [r1, #-1]                 /* BE:r2 = x012  LE:r2 = 210x */
1317          ldr       r3, [r1, #0x03]               /* BE:r3 = 3456  LE:r3 = 6543 */
1318          ldrb      ip, [r1, #0x07]               /* ip = ...7 */
1319          mov       r1, r2, lsr #8                /* BE:r1 = .x01  LE:r1 = .210 */
1320          strh      r1, [r0]                      /* bytes 0-1 = low halfword of r1 */
1321#ifdef __ARMEB__
1322          mov       r1, r2, lsl #24               /* r1 = 2... */
1323          orr       r1, r1, r3, lsr #8  /* r1 = 2345 */
1324          orr       r3, ip, r3, lsl #8  /* r3 = 4567 */
1325#else
1326          mov       r1, r2, lsr #24               /* r1 = ...2 */
1327          orr       r1, r1, r3, lsl #8  /* r1 = 5432 */
1328          mov       r3, r3, lsr #24               /* r3 = ...6 */
1329          orr       r3, r3, ip, lsl #8  /* r3 = ..76 */
1330#endif
1331          str       r1, [r0, #0x02]               /* bytes 2-5 */
1332          strh      r3, [r0, #0x06]               /* bytes 6-7 = low halfword of r3 */
1333          RET
1334          LMEMCPY_8_PAD
1335
1336/*
1337 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 *
 * Both pointers are 2 mod 4, so halfword (0-1), word (2-5), halfword (6-7)
 * is naturally aligned on both sides.  No shifting needed.
1338 */
1339          ldrh      r2, [r1]                      /* bytes 0-1 */
1340          ldr       ip, [r1, #0x02]               /* bytes 2-5 */
1341          ldrh      r3, [r1, #0x06]               /* bytes 6-7 */
1342          strh      r2, [r0]
1343          str       ip, [r0, #0x02]
1344          strh      r3, [r0, #0x06]
1345          RET
1346          LMEMCPY_8_PAD
1347
1348/*
1349 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 *
 * dst & 3 == 2, src & 3 == 3.  Byte 0 plus aligned words at src + 1 and
 * src + 5.  The last word also fetches the byte just past the end.
 * Written as halfword (6-7), aligned word at dst + 2 (2-5), halfword (0-1),
 * in that order.
1350 */
1351          ldr       r3, [r1, #0x05]               /* BE:r3 = 567x  LE:r3 = x765 */
1352          ldr       r2, [r1, #0x01]               /* BE:r2 = 1234  LE:r2 = 4321 */
1353          ldrb      ip, [r1]            /* ip = ...0 */
1354          mov       r1, r3, lsr #8                /* BE:r1 = .567  LE:r1 = .x76 */
1355          strh      r1, [r0, #0x06]               /* bytes 6-7 = low halfword of r1 */
1356#ifdef __ARMEB__
1357          mov       r3, r3, lsr #24               /* r3 = ...5 */
1358          orr       r3, r3, r2, lsl #8  /* r3 = 2345 */
1359          mov       r2, r2, lsr #24               /* r2 = ...1 */
1360          orr       r2, r2, ip, lsl #8  /* r2 = ..01 */
1361#else
1362          mov       r3, r3, lsl #24               /* r3 = 5... */
1363          orr       r3, r3, r2, lsr #8  /* r3 = 5432 */
1364          orr       r2, ip, r2, lsl #8  /* r2 = 3210 */
1365#endif
1366          str       r3, [r0, #0x02]               /* bytes 2-5 */
1367          strh      r2, [r0]                      /* bytes 0-1 = low halfword of r2 */
1368          RET
1369          LMEMCPY_8_PAD
1370
1371/*
1372 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 *
 * dst & 3 == 3.  The source is read as two words and written as byte (0),
 * aligned word at dst + 1 (bytes 1-4), halfword (5-6), byte (7).
1373 */
1374          ldr       r3, [r1, #0x04]               /* BE:r3 = 4567  LE:r3 = 7654 */
1375          ldr       r2, [r1]            /* BE:r2 = 0123  LE:r2 = 3210 */
1376          mov       r1, r3, lsr #8                /* BE:r1 = .456  LE:r1 = .765 */
1377          strh      r1, [r0, #0x05]               /* bytes 5-6 = low halfword of r1 */
1378#ifdef __ARMEB__
1379          strb      r3, [r0, #0x07]               /* byte 7 = low byte of r3 */
1380          mov       r1, r2, lsr #24               /* r1 = ...0 */
1381          strb      r1, [r0]
1382          mov       r2, r2, lsl #8                /* r2 = 123. */
1383          orr       r2, r2, r3, lsr #24 /* r2 = 1234 */
1384          str       r2, [r0, #0x01]
1385#else
1386          strb      r2, [r0]                      /* byte 0 = low byte of r2 */
1387          mov       r1, r3, lsr #24               /* r1 = ...7 */
1388          strb      r1, [r0, #0x07]
1389          mov       r2, r2, lsr #8                /* r2 = .321 */
1390          orr       r2, r2, r3, lsl #24 /* r2 = 4321 */
1391          str       r2, [r0, #0x01]
1392#endif
1393          RET
1394          LMEMCPY_8_PAD
1395
1396/*
1397 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 *
 * dst & 3 == 3, src & 3 == 1.  The source is read as byte, halfword,
 * aligned word at src + 3, byte, and written as byte (0), aligned word at
 * dst + 1 (bytes 1-4), halfword (5-6), byte (7).
1398 */
1399          ldrb      r3, [r1]            /* r3 = ...0 */
1400          ldrh      r2, [r1, #0x01]               /* BE:r2 = ..12  LE:r2 = ..21 */
1401          ldr       ip, [r1, #0x03]               /* BE:ip = 3456  LE:ip = 6543 */
1402          ldrb      r1, [r1, #0x07]               /* r1 = ...7 */
1403          strb      r3, [r0]
1404          mov       r3, ip, lsr #16               /* BE:r3 = ..34  LE:r3 = ..65 */
1405#ifdef __ARMEB__
1406          strh      ip, [r0, #0x05]               /* bytes 5-6 = low halfword of ip */
1407          orr       r2, r3, r2, lsl #16 /* r2 = 1234 */
1408#else
1409          strh      r3, [r0, #0x05]
1410          orr       r2, r2, ip, lsl #16 /* r2 = 4321 */
1411#endif
1412          str       r2, [r0, #0x01]               /* bytes 1-4 */
1413          strb      r1, [r0, #0x07]
1414          RET
1415          LMEMCPY_8_PAD
1416
1417/*
1418 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 *
 * dst & 3 == 3, src & 3 == 2.  The source is read as halfword, aligned
 * word at src + 2, halfword, and written as byte (0), aligned word at
 * dst + 1 (bytes 1-4), halfword (5-6), byte (7).
1419 */
1420          ldrh      r2, [r1]            /* BE:r2 = ..01  LE:r2 = ..10 */
1421          ldr       r3, [r1, #0x02]               /* BE:r3 = 2345  LE:r3 = 5432 */
1422          ldrh      r1, [r1, #0x06]               /* BE:r1 = ..67  LE:r1 = ..76 */
1423#ifdef __ARMEB__
1424          mov       ip, r2, lsr #8                /* ip = ...0 */
1425          strb      ip, [r0]
1426          mov       ip, r2, lsl #24               /* ip = 1... */
1427          orr       ip, ip, r3, lsr #8  /* ip = 1234 */
1428          strb      r1, [r0, #0x07]               /* byte 7 = low byte of r1 */
1429          mov       r1, r1, lsr #8                /* r1 = ...6 */
1430          orr       r1, r1, r3, lsl #8  /* r1 = 3456 */
1431#else
1432          strb      r2, [r0]                      /* byte 0 = low byte of r2 */
1433          mov       ip, r2, lsr #8                /* ip = ...1 */
1434          orr       ip, ip, r3, lsl #8  /* ip = 4321 */
1435          mov       r2, r1, lsr #8                /* r2 = ...7 */
1436          strb      r2, [r0, #0x07]
1437          mov       r1, r1, lsl #8                /* r1 = .76. */
1438          orr       r1, r1, r3, lsr #24 /* r1 = .765 */
1439#endif
1440          str       ip, [r0, #0x01]               /* bytes 1-4 */
1441          strh      r1, [r0, #0x05]               /* bytes 5-6 = low halfword of r1 */
1442          RET
1443          LMEMCPY_8_PAD
1444
1445/*
1446 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 *
 * Both pointers are 3 mod 4, so byte (0), word (1-4), halfword (5-6),
 * byte (7) is naturally aligned on both sides.  No shifting needed.
1447 */
1448          ldrb      r2, [r1]                      /* byte 0 */
1449          ldr       ip, [r1, #0x01]               /* bytes 1-4 */
1450          ldrh      r3, [r1, #0x05]               /* bytes 5-6 */
1451          ldrb      r1, [r1, #0x07]               /* byte 7, src pointer no longer needed */
1452          strb      r2, [r0]
1453          str       ip, [r0, #0x01]
1454          strh      r3, [r0, #0x05]
1455          strb      r1, [r0, #0x07]
1456          RET
1457          LMEMCPY_8_PAD
1458
1459/******************************************************************************
1460 * Special case for 12 byte copies
 *
 * On entry r0 = dst and r1 = src.  The low two bits of dst and of src are
 * packed into a 4-bit index (dst in bits 3:2, src in bits 1:0) and control
 * jumps to .Lmemcpy_c + (index << LMEMCPY_C_LOG2).  Each alignment handler
 * owns one slot of that size.  LMEMCPY_C_PAD after a handler only pads up
 * to the next slot boundary, so a handler must never outgrow its slot.
 *
 * Notation used in the register comments: digits 0-9 and letters A, B are
 * source byte numbers, written most significant byte first, "." is a zero
 * byte and "x" is a byte that does not belong to the copy.  BE/LE =
 * big/little endian.
1461 */
1462#define   LMEMCPY_C_LOG2      7         /* 128 bytes */
1463#define   LMEMCPY_C_PAD       .align LMEMCPY_C_LOG2
1464          LMEMCPY_C_PAD
1465.Lmemcpy_c:
1466          and       r2, r1, #0x03                 /* r2 = src & 3 */
1467          orr       r2, r2, r0, lsl #2            /* dst bits land above the src bits */
1468          ands      r2, r2, #0x0f                 /* keep 4 bits, Z set when index is 0 */
1469          sub       r3, pc, #0x14                 /* r3 = .Lmemcpy_c: pc reads as this insn + 8, and this insn is 0x0c past the label */
1470          addne     pc, r3, r2, lsl #LMEMCPY_C_LOG2         /* index 0 falls through to the 0000 handler */
1471
1472/*
1473 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 *
 * Reached by fall-through from the dispatch above.  Three aligned words.
1474 */
1475          ldr       r2, [r1]                      /* bytes 0-3 */
1476          ldr       r3, [r1, #0x04]               /* bytes 4-7 */
1477          ldr       r1, [r1, #0x08]               /* bytes 8-B, src pointer no longer needed */
1478          str       r2, [r0]
1479          str       r3, [r0, #0x04]
1480          str       r1, [r0, #0x08]
1481          RET
1482          LMEMCPY_C_PAD
1483
1484/*
1485 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 *
 * src & 3 == 1.  Aligned words at src - 1, src + 3 and src + 7 plus the
 * final byte at src + 0xb.  The first word also fetches the byte just
 * below src.  The pieces are merged into three words which are stored
 * highest address first.
1486 */
1487          ldrb      r2, [r1, #0xb]                /* r2 = ...B */
1488          ldr       ip, [r1, #0x07]               /* BE:ip = 789A  LE:ip = A987 */
1489          ldr       r3, [r1, #0x03]               /* BE:r3 = 3456  LE:r3 = 6543 */
1490          ldr       r1, [r1, #-1]                 /* BE:r1 = x012  LE:r1 = 210x */
1491#ifdef __ARMEB__
1492          orr       r2, r2, ip, lsl #8  /* r2 = 89AB */
1493          str       r2, [r0, #0x08]
1494          mov       r2, ip, lsr #24               /* r2 = ...7 */
1495          orr       r2, r2, r3, lsl #8  /* r2 = 4567 */
1496          mov       r1, r1, lsl #8                /* r1 = 012. */
1497          orr       r1, r1, r3, lsr #24 /* r1 = 0123 */
1498#else
1499          mov       r2, r2, lsl #24               /* r2 = B... */
1500          orr       r2, r2, ip, lsr #8  /* r2 = BA98 */
1501          str       r2, [r0, #0x08]
1502          mov       r2, ip, lsl #24               /* r2 = 7... */
1503          orr       r2, r2, r3, lsr #8  /* r2 = 7654 */
1504          mov       r1, r1, lsr #8                /* r1 = .210 */
1505          orr       r1, r1, r3, lsl #24 /* r1 = 3210 */
1506#endif
1507          str       r2, [r0, #0x04]               /* bytes 4-7 */
1508          str       r1, [r0]                      /* bytes 0-3 */
1509          RET
1510          LMEMCPY_C_PAD
1511
1512/*
1513 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 *
 * src & 3 == 2.  The source is read as halfword, two aligned words at
 * src + 2 and src + 6, halfword, and regrouped into three words for the
 * aligned dst.
1514 */
1515          ldrh      r2, [r1]            /* BE:r2 = ..01  LE:r2 = ..10 */
1516          ldr       r3, [r1, #0x02]               /* BE:r3 = 2345  LE:r3 = 5432 */
1517          ldr       ip, [r1, #0x06]               /* BE:ip = 6789  LE:ip = 9876 */
1518          ldrh      r1, [r1, #0x0a]               /* BE:r1 = ..AB  LE:r1 = ..BA */
1519#ifdef __ARMEB__
1520          mov       r2, r2, lsl #16               /* r2 = 01.. */
1521          orr       r2, r2, r3, lsr #16 /* r2 = 0123 */
1522          str       r2, [r0]
1523          mov       r3, r3, lsl #16               /* r3 = 45.. */
1524          orr       r3, r3, ip, lsr #16 /* r3 = 4567 */
1525          orr       r1, r1, ip, lsl #16 /* r1 = 89AB */
1526#else
1527          orr       r2, r2, r3, lsl #16 /* r2 = 3210 */
1528          str       r2, [r0]
1529          mov       r3, r3, lsr #16               /* r3 = ..54 */
1530          orr       r3, r3, ip, lsl #16 /* r3 = 7654 */
1531          mov       r1, r1, lsl #16               /* r1 = BA.. */
1532          orr       r1, r1, ip, lsr #16 /* r1 = BA98 */
1533#endif
1534          str       r3, [r0, #0x04]               /* bytes 4-7 */
1535          str       r1, [r0, #0x08]               /* bytes 8-B */
1536          RET
1537          LMEMCPY_C_PAD
1538
1539/*
1540 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 *
 * src & 3 == 3.  Byte 0, then aligned words at src + 1, src + 5 and
 * src + 9.  The last word also fetches the byte just past the end of the
 * copy.  The pieces are merged into three words for the aligned dst.
1541 */
1542          ldrb      r2, [r1]            /* r2 = ...0 */
1543          ldr       r3, [r1, #0x01]               /* BE:r3 = 1234  LE:r3 = 4321 */
1544          ldr       ip, [r1, #0x05]               /* BE:ip = 5678  LE:ip = 8765 */
1545          ldr       r1, [r1, #0x09]               /* BE:r1 = 9ABx  LE:r1 = xBA9 */
1546#ifdef __ARMEB__
1547          mov       r2, r2, lsl #24               /* r2 = 0... */
1548          orr       r2, r2, r3, lsr #8  /* r2 = 0123 */
1549          str       r2, [r0]
1550          mov       r3, r3, lsl #24               /* r3 = 4... */
1551          orr       r3, r3, ip, lsr #8  /* r3 = 4567 */
1552          mov       r1, r1, lsr #8                /* r1 = .9AB */
1553          orr       r1, r1, ip, lsl #24 /* r1 = 89AB */
1554#else
1555          orr       r2, r2, r3, lsl #8  /* r2 = 3210 */
1556          str       r2, [r0]
1557          mov       r3, r3, lsr #24               /* r3 = ...4 */
1558          orr       r3, r3, ip, lsl #8  /* r3 = 7654 */
1559          mov       r1, r1, lsl #8                /* r1 = BA9. */
1560          orr       r1, r1, ip, lsr #24 /* r1 = BA98 */
1561#endif
1562          str       r3, [r0, #0x04]               /* bytes 4-7 */
1563          str       r1, [r0, #0x08]               /* bytes 8-B */
1564          RET
1565          LMEMCPY_C_PAD
1566
1567/*
1568 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
 *
 * dst & 3 == 1.  The source is read as three words and written as byte
 * (0), halfword (1-2), aligned words at dst + 3 and dst + 7 (bytes 3-6
 * and 7-A), byte (B).
1569 */
1570          ldr       r2, [r1]            /* BE:r2 = 0123  LE:r2 = 3210 */
1571          ldr       r3, [r1, #0x04]               /* BE:r3 = 4567  LE:r3 = 7654 */
1572          ldr       ip, [r1, #0x08]               /* BE:ip = 89AB  LE:ip = BA98 */
1573          mov       r1, r2, lsr #8                /* BE:r1 = .012  LE:r1 = .321 */
1574          strh      r1, [r0, #0x01]               /* bytes 1-2 = low halfword of r1 */
1575#ifdef __ARMEB__
1576          mov       r1, r2, lsr #24               /* r1 = ...0 */
1577          strb      r1, [r0]
1578          mov       r1, r2, lsl #24               /* r1 = 3... */
1579          orr       r2, r1, r3, lsr #8  /* r2 = 3456 */
1580          mov       r1, r3, lsl #24               /* r1 = 7... */
1581          orr       r1, r1, ip, lsr #8  /* r1 = 789A */
1582#else
1583          strb      r2, [r0]                      /* byte 0 = low byte of r2 */
1584          mov       r1, r2, lsr #24               /* r1 = ...3 */
1585          orr       r2, r1, r3, lsl #8  /* r2 = 6543 */
1586          mov       r1, r3, lsr #24               /* r1 = ...7 */
1587          orr       r1, r1, ip, lsl #8  /* r1 = A987 */
1588          mov       ip, ip, lsr #24               /* ip = ...B */
1589#endif
1590          str       r2, [r0, #0x03]               /* bytes 3-6 */
1591          str       r1, [r0, #0x07]               /* bytes 7-A */
1592          strb      ip, [r0, #0x0b]               /* byte B (BE: ip is still 89AB, low byte is B) */
1593          RET
1594          LMEMCPY_C_PAD
1595
1596/*
1597 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
 *
 * Same misalignment on both sides, so byte (0), halfword (1-2), word
 * (3-6), word (7-A), byte (B) is naturally aligned for both the loads and
 * the stores and nothing is shifted.  Byte 0 is stored early so that r2
 * can be reused for the second word.
1598 */
1599          ldrb      r2, [r1]                      /* byte 0 */
1600          ldrh      r3, [r1, #0x01]               /* bytes 1-2 */
1601          ldr       ip, [r1, #0x03]               /* bytes 3-6 */
1602          strb      r2, [r0]
1603          ldr       r2, [r1, #0x07]               /* bytes 7-A */
1604          ldrb      r1, [r1, #0x0b]               /* byte B, src pointer no longer needed */
1605          strh      r3, [r0, #0x01]
1606          str       ip, [r0, #0x03]
1607          str       r2, [r0, #0x07]
1608          strb      r1, [r0, #0x0b]
1609          RET
1610          LMEMCPY_C_PAD
1611
1612/*
1613 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
 *
 * dst & 3 == 1, src & 3 == 2.  The source is read as halfword, aligned
 * words at src + 2 and src + 6, halfword, and written as byte (0),
 * halfword (1-2), aligned words at dst + 3 and dst + 7, byte (B).
1614 */
1615          ldrh      r2, [r1]            /* BE:r2 = ..01  LE:r2 = ..10 */
1616          ldr       r3, [r1, #0x02]               /* BE:r3 = 2345  LE:r3 = 5432 */
1617          ldr       ip, [r1, #0x06]               /* BE:ip = 6789  LE:ip = 9876 */
1618          ldrh      r1, [r1, #0x0a]               /* BE:r1 = ..AB  LE:r1 = ..BA */
1619#ifdef __ARMEB__
1620          mov       r2, r2, ror #8                /* r2 = 1..0 */
1621          strb      r2, [r0]                      /* byte 0 = low byte of r2 */
1622          mov       r2, r2, lsr #16               /* r2 = ..1. */
1623          orr       r2, r2, r3, lsr #24 /* r2 = ..12 */
1624          strh      r2, [r0, #0x01]
1625          mov       r2, r3, lsl #8                /* r2 = 345. */
1626          orr       r3, r2, ip, lsr #24 /* r3 = 3456 */
1627          mov       r2, ip, lsl #8                /* r2 = 789. */
1628          orr       r2, r2, r1, lsr #8  /* r2 = 789A */
1629#else
1630          strb      r2, [r0]                      /* byte 0 = low byte of r2 */
1631          mov       r2, r2, lsr #8                /* r2 = ...1 */
1632          orr       r2, r2, r3, lsl #8  /* r2 = 4321 */
1633          strh      r2, [r0, #0x01]               /* bytes 1-2 = low halfword of r2 */
1634          mov       r2, r3, lsr #8                /* r2 = .543 */
1635          orr       r3, r2, ip, lsl #24 /* r3 = 6543 */
1636          mov       r2, ip, lsr #8                /* r2 = .987 */
1637          orr       r2, r2, r1, lsl #24 /* r2 = A987 */
1638          mov       r1, r1, lsr #8                /* r1 = ...B */
1639#endif
1640          str       r3, [r0, #0x03]               /* bytes 3-6 */
1641          str       r2, [r0, #0x07]               /* bytes 7-A */
1642          strb      r1, [r0, #0x0b]               /* byte B (BE: r1 is still ..AB, low byte is B) */
1643          RET
1644          LMEMCPY_C_PAD
1645
1646/*
1647 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
 *
 * Copies the 12 bytes at r1+0x00..0x0b to r0+0x00..0x0b.  The source is
 * read as one byte followed by three words at offsets 0x01/0x05/0x09
 * (word aligned given the src alignment stated above).  The last word
 * covers bytes 9-B plus one byte beyond the 12 being copied; that byte
 * is loaded but never stored.  The destination is written as byte,
 * halfword, word, word, byte, so the words are re-split on 16-bit
 * boundaries.
 *
 * In the register diagrams each character is one byte, most significant
 * on the left: 0-B are source byte indices, '.' is a zero byte and 'x'
 * is the extra byte past the end of the copy.
 * r0 is left unmodified; r1, r2, r3 and ip are overwritten.
1648 */
1649          ldrb      r2, [r1]            /* r2 = ...0 */
1650          ldr       r3, [r1, #0x01]               /* BE:r3 = 1234  LE:r3 = 4321 */
1651          ldr       ip, [r1, #0x05]               /* BE:ip = 5678  LE:ip = 8765 */
1652          ldr       r1, [r1, #0x09]               /* BE:r1 = 9ABx  LE:r1 = xBA9 */
1653          strb      r2, [r0]            /* dst byte 0 */
1654#ifdef __ARMEB__
1655          mov       r2, r3, lsr #16               /* r2 = ..12 */
1656          strh      r2, [r0, #0x01]     /* dst bytes 1-2 */
1657          mov       r3, r3, lsl #16               /* r3 = 34.. */
1658          orr       r3, r3, ip, lsr #16 /* r3 = 3456 */
1659          mov       ip, ip, lsl #16               /* ip = 78.. */
1660          orr       ip, ip, r1, lsr #16 /* ip = 789A */
1661          mov       r1, r1, lsr #8                /* r1 = .9AB */
1662#else
1663          strh      r3, [r0, #0x01]     /* low halfword of r3 = bytes 1-2 */
1664          mov       r3, r3, lsr #16               /* r3 = ..43 */
1665          orr       r3, r3, ip, lsl #16 /* r3 = 6543 */
1666          mov       ip, ip, lsr #16               /* ip = ..87 */
1667          orr       ip, ip, r1, lsl #16 /* ip = A987 */
1668          mov       r1, r1, lsr #16               /* r1 = ..xB */
1669#endif
1670          str       r3, [r0, #0x03]     /* dst bytes 3-6 */
1671          str       ip, [r0, #0x07]     /* dst bytes 7-A */
1672          strb      r1, [r0, #0x0b]     /* low byte of r1 = byte B */
1673          RET
1674          LMEMCPY_C_PAD
1675
1676/*
1677 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 *
 * Copies the 12 bytes at r1+0x00..0x0b to r0+0x00..0x0b.  The source is
 * read as three words; the destination is written as halfword, word,
 * word, halfword, so each stored word is assembled from the halves of
 * two adjacent source words.
 *
 * In the register diagrams each character is one byte, most significant
 * on the left: 0-B are source byte indices and '.' is a zero byte.
 * r0 is left unmodified; r1, r2, r3 and ip are overwritten.
1678 */
1679          ldr       ip, [r1]            /* BE:ip = 0123  LE:ip = 3210 */
1680          ldr       r3, [r1, #0x04]               /* BE:r3 = 4567  LE:r3 = 7654 */
1681          ldr       r2, [r1, #0x08]               /* BE:r2 = 89AB  LE:r2 = BA98 */
1682          mov       r1, ip, lsr #16               /* BE:r1 = ..01  LE:r1 = ..32 */
1683#ifdef __ARMEB__
1684          strh      r1, [r0]            /* dst bytes 0-1 */
1685          mov       r1, ip, lsl #16               /* r1 = 23.. */
1686          orr       r1, r1, r3, lsr #16 /* r1 = 2345 */
1687          mov       r3, r3, lsl #16               /* r3 = 67.. */
1688          orr       r3, r3, r2, lsr #16 /* r3 = 6789 */
          /* r2 is still 89AB; the final strh stores its low halfword, AB */
1689#else
1690          strh      ip, [r0]            /* low halfword of ip = bytes 0-1 */
1691          orr       r1, r1, r3, lsl #16 /* r1 = 5432 */
1692          mov       r3, r3, lsr #16               /* r3 = ..76 */
1693          orr       r3, r3, r2, lsl #16 /* r3 = 9876 */
1694          mov       r2, r2, lsr #16               /* r2 = ..BA */
1695#endif
1696          str       r1, [r0, #0x02]     /* dst bytes 2-5 */
1697          str       r3, [r0, #0x06]     /* dst bytes 6-9 */
1698          strh      r2, [r0, #0x0a]     /* dst bytes A-B */
1699          RET
1700          LMEMCPY_C_PAD
1701
1702/*
1703 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
 *
 * Copies the 12 bytes at r1+0x00..0x0b to r0+0x00..0x0b.  The source is
 * read as three words at offsets -1/0x03/0x07 (word aligned given the
 * src alignment stated above) followed by one byte.  The first word
 * starts one byte before src; that byte is loaded but never stored.
 * The destination is written as halfword, word, word, halfword.
 *
 * In the register diagrams each character is one byte, most significant
 * on the left: 0-B are source byte indices, '.' is a zero byte and 'x'
 * is the extra byte in front of the copy.
 * r0 is left unmodified; r1, r2, r3 and ip are overwritten.
1704 */
1705          ldr       r2, [r1, #-1]                 /* BE:r2 = x012  LE:r2 = 210x */
1706          ldr       r3, [r1, #0x03]               /* BE:r3 = 3456  LE:r3 = 6543 */
1707          mov       ip, r2, lsr #8                /* BE:ip = .x01  LE:ip = .210 */
1708          strh      ip, [r0]            /* low halfword of ip = bytes 0-1 */
1709          ldr       ip, [r1, #0x07]               /* BE:ip = 789A  LE:ip = A987 */
1710          ldrb      r1, [r1, #0x0b]               /* r1 = ...B */
1711#ifdef __ARMEB__
1712          mov       r2, r2, lsl #24               /* r2 = 2... */
1713          orr       r2, r2, r3, lsr #8  /* r2 = 2345 */
1714          mov       r3, r3, lsl #24               /* r3 = 6... */
1715          orr       r3, r3, ip, lsr #8  /* r3 = 6789 */
1716          orr       r1, r1, ip, lsl #8  /* r1 = 89AB */
1717#else
1718          mov       r2, r2, lsr #24               /* r2 = ...2 */
1719          orr       r2, r2, r3, lsl #8  /* r2 = 5432 */
1720          mov       r3, r3, lsr #24               /* r3 = ...6 */
1721          orr       r3, r3, ip, lsl #8  /* r3 = 9876 */
1722          mov       r1, r1, lsl #8                /* r1 = ..B. */
1723          orr       r1, r1, ip, lsr #24 /* r1 = ..BA */
1724#endif
1725          str       r2, [r0, #0x02]     /* dst bytes 2-5 */
1726          str       r3, [r0, #0x06]     /* dst bytes 6-9 */
1727          strh      r1, [r0, #0x0a]     /* low halfword of r1 = bytes A-B */
1728          RET
1729          LMEMCPY_C_PAD
1730
1731/*
1732 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 *
 * Copies the 12 bytes at r1+0x00..0x0b to r0+0x00..0x0b.  dst and src
 * sit at the same offset within a word, so no shifting is needed: the
 * data moves as halfword, word, word, halfword.
 * r0 is left unmodified; r1, r2, r3 and ip are overwritten.
1733 */
1734          ldrh      r2, [r1]            /* r2 = src bytes 0-1 */
1735          ldr       r3, [r1, #0x02]     /* r3 = src bytes 2-5 */
1736          ldr       ip, [r1, #0x06]     /* ip = src bytes 6-9 */
1737          ldrh      r1, [r1, #0x0a]     /* r1 = src bytes A-B (last use of src pointer) */
1738          strh      r2, [r0]            /* dst bytes 0-1 */
1739          str       r3, [r0, #0x02]     /* dst bytes 2-5 */
1740          str       ip, [r0, #0x06]     /* dst bytes 6-9 */
1741          strh      r1, [r0, #0x0a]     /* dst bytes A-B */
1742          RET
1743          LMEMCPY_C_PAD
1744
1745/*
1746 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
 *
 * Copies the 12 bytes at r1+0x00..0x0b to r0+0x00..0x0b, working from
 * the highest offset down to the lowest.  The source is read as three
 * words at offsets 0x09/0x05/0x01 (word aligned given the src alignment
 * stated above) followed by byte 0.  The word at 0x09 covers bytes 9-B
 * plus one byte beyond the 12 being copied; that byte is loaded but
 * never stored.  The destination is written as halfword, word, word,
 * halfword.
 *
 * In the register diagrams each character is one byte, most significant
 * on the left: 0-B are source byte indices, '.' is a zero byte and 'x'
 * is the extra byte past the end of the copy.
 * r0 is left unmodified; r1, r2, r3 and ip are overwritten.
1747 */
1748          ldr       r2, [r1, #0x09]               /* BE:r2 = 9ABx  LE:r2 = xBA9 */
1749          ldr       r3, [r1, #0x05]               /* BE:r3 = 5678  LE:r3 = 8765 */
1750          mov       ip, r2, lsr #8                /* BE:ip = .9AB  LE:ip = .xBA */
1751          strh      ip, [r0, #0x0a]     /* low halfword of ip = bytes A-B */
1752          ldr       ip, [r1, #0x01]               /* BE:ip = 1234  LE:ip = 4321 */
1753          ldrb      r1, [r1]            /* r1 = ...0 */
1754#ifdef __ARMEB__
1755          mov       r2, r2, lsr #24               /* r2 = ...9 */
1756          orr       r2, r2, r3, lsl #8  /* r2 = 6789 */
1757          mov       r3, r3, lsr #24               /* r3 = ...5 */
1758          orr       r3, r3, ip, lsl #8  /* r3 = 2345 */
1759          mov       r1, r1, lsl #8                /* r1 = ..0. */
1760          orr       r1, r1, ip, lsr #24 /* r1 = ..01 */
1761#else
1762          mov       r2, r2, lsl #24               /* r2 = 9... */
1763          orr       r2, r2, r3, lsr #8  /* r2 = 9876 */
1764          mov       r3, r3, lsl #24               /* r3 = 5... */
1765          orr       r3, r3, ip, lsr #8  /* r3 = 5432 */
1766          orr       r1, r1, ip, lsl #8  /* r1 = 3210 */
1767#endif
1768          str       r2, [r0, #0x06]     /* dst bytes 6-9 */
1769          str       r3, [r0, #0x02]     /* dst bytes 2-5 */
1770          strh      r1, [r0]            /* low halfword of r1 = bytes 0-1 */
1771          RET
1772          LMEMCPY_C_PAD
1773
1774/*
1775 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
 *
 * Copies the 12 bytes at r1+0x00..0x0b to r0+0x00..0x0b.  The source is
 * read as three words; the destination is written as byte, word, word,
 * halfword, byte, with each store issued as soon as its value has been
 * assembled.
 *
 * In the register diagrams each character is one byte, most significant
 * on the left: 0-B are source byte indices and '.' is a zero byte.
 * r0 is left unmodified; r1, r2, r3 and ip are overwritten.
1776 */
1777          ldr       r2, [r1]            /* BE:r2 = 0123  LE:r2 = 3210 */
1778          ldr       ip, [r1, #0x04]               /* BE:ip = 4567  LE:ip = 7654 */
1779          ldr       r1, [r1, #0x08]               /* BE:r1 = 89AB  LE:r1 = BA98 */
1780#ifdef __ARMEB__
1781          mov       r3, r2, lsr #24               /* r3 = ...0 */
1782          strb      r3, [r0]            /* dst byte 0 */
1783          mov       r2, r2, lsl #8                /* r2 = 123. */
1784          orr       r2, r2, ip, lsr #24 /* r2 = 1234 */
1785          str       r2, [r0, #0x01]     /* dst bytes 1-4 */
1786          mov       r2, ip, lsl #8                /* r2 = 567. */
1787          orr       r2, r2, r1, lsr #24 /* r2 = 5678 */
1788          str       r2, [r0, #0x05]     /* dst bytes 5-8 */
1789          mov       r2, r1, lsr #8                /* r2 = .89A */
1790          strh      r2, [r0, #0x09]     /* low halfword of r2 = bytes 9-A */
1791          strb      r1, [r0, #0x0b]     /* low byte of r1 (89AB) = byte B */
1792#else
1793          strb      r2, [r0]            /* low byte of r2 = byte 0 */
1794          mov       r3, r2, lsr #8                /* r3 = .321 */
1795          orr       r3, r3, ip, lsl #24 /* r3 = 4321 */
1796          str       r3, [r0, #0x01]     /* dst bytes 1-4 */
1797          mov       r3, ip, lsr #8                /* r3 = .765 */
1798          orr       r3, r3, r1, lsl #24 /* r3 = 8765 */
1799          str       r3, [r0, #0x05]     /* dst bytes 5-8 */
1800          mov       r1, r1, lsr #8                /* r1 = .BA9 */
1801          strh      r1, [r0, #0x09]     /* low halfword of r1 = bytes 9-A */
1802          mov       r1, r1, lsr #16               /* r1 = ...B */
1803          strb      r1, [r0, #0x0b]     /* dst byte B */
1804#endif
1805          RET
1806          LMEMCPY_C_PAD
1807
1808/*
1809 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
 *
 * Copies the 12 bytes at r1+0x00..0x0b to r0+0x00..0x0b, working from
 * the highest offset down to the lowest.  The source is read as byte B
 * followed by three words at offsets 0x07/0x03/-1 (word aligned given
 * the src alignment stated above).  The word at -1 starts one byte
 * before src; that byte is loaded but never stored.  The destination is
 * written as byte, halfword, word, word, byte (offsets 0x0b, 0x09,
 * 0x05, 0x01, 0x00).
 *
 * In the register diagrams each character is one byte, most significant
 * on the left: 0-B are source byte indices, '.' is a zero byte and 'x'
 * is the extra byte in front of the copy.
 * r0 is left unmodified; r1, r2, r3 and ip are overwritten.
1810 */
1811          ldrb      r2, [r1, #0x0b]               /* r2 = ...B */
1812          ldr       r3, [r1, #0x07]               /* BE:r3 = 789A  LE:r3 = A987 */
1813          ldr       ip, [r1, #0x03]               /* BE:ip = 3456  LE:ip = 6543 */
1814          ldr       r1, [r1, #-1]                 /* BE:r1 = x012  LE:r1 = 210x */
1815          strb      r2, [r0, #0x0b]     /* dst byte B */
1816#ifdef __ARMEB__
1817          strh      r3, [r0, #0x09]     /* low halfword of r3 = bytes 9-A */
1818          mov       r3, r3, lsr #16               /* r3 = ..78 */
1819          orr       r3, r3, ip, lsl #16 /* r3 = 5678 */
1820          mov       ip, ip, lsr #16               /* ip = ..34 */
1821          orr       ip, ip, r1, lsl #16 /* ip = 1234 */
1822          mov       r1, r1, lsr #16               /* r1 = ..x0 */
1823#else
1824          mov       r2, r3, lsr #16               /* r2 = ..A9 */
1825          strh      r2, [r0, #0x09]     /* dst bytes 9-A */
1826          mov       r3, r3, lsl #16               /* r3 = 87.. */
1827          orr       r3, r3, ip, lsr #16 /* r3 = 8765 */
1828          mov       ip, ip, lsl #16               /* ip = 43.. */
1829          orr       ip, ip, r1, lsr #16 /* ip = 4321 */
1830          mov       r1, r1, lsr #8                /* r1 = .210 */
1831#endif
1832          str       r3, [r0, #0x05]     /* dst bytes 5-8 */
1833          str       ip, [r0, #0x01]     /* dst bytes 1-4 */
1834          strb      r1, [r0]            /* low byte of r1 = byte 0 */
1835          RET
1836          LMEMCPY_C_PAD
1837
1838/*
1839 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
 *
 * Copies the 12 bytes at r1+0x00..0x0b to r0+0x00..0x0b.  The source is
 * read as halfword, word, word, halfword; the destination is written as
 * byte, word, word, halfword, byte.  The big-endian variant loads and
 * merges from the highest offset downwards, the little-endian variant
 * from the lowest offset upwards; both end up storing the same 12 bytes.
 *
 * In the register diagrams each character is one byte, most significant
 * on the left: 0-B are source byte indices and '.' is a zero byte.
 * r0 is left unmodified; r1, r2, r3 and ip are overwritten.
1840 */
1841#ifdef __ARMEB__
1842          ldrh      r2, [r1, #0x0a]               /* r2 = ..AB */
1843          ldr       ip, [r1, #0x06]               /* ip = 6789 */
1844          ldr       r3, [r1, #0x02]               /* r3 = 2345 */
1845          ldrh      r1, [r1]            /* r1 = ..01 */
1846          strb      r2, [r0, #0x0b]     /* low byte of r2 = byte B */
1847          mov       r2, r2, lsr #8                /* r2 = ...A */
1848          orr       r2, r2, ip, lsl #8  /* r2 = 789A */
1849          mov       ip, ip, lsr #8                /* ip = .678 */
1850          orr       ip, ip, r3, lsl #24 /* ip = 5678 */
1851          mov       r3, r3, lsr #8                /* r3 = .234 */
1852          orr       r3, r3, r1, lsl #24 /* r3 = 1234 */
1853          mov       r1, r1, lsr #8                /* r1 = ...0 */
1854          strb      r1, [r0]            /* dst byte 0 */
1855          str       r3, [r0, #0x01]     /* dst bytes 1-4 */
1856          str       ip, [r0, #0x05]     /* dst bytes 5-8 */
1857          strh      r2, [r0, #0x09]     /* low halfword of r2 = bytes 9-A */
1858#else
1859          ldrh      r2, [r1]            /* r2 = ..10 */
1860          ldr       r3, [r1, #0x02]               /* r3 = 5432 */
1861          ldr       ip, [r1, #0x06]               /* ip = 9876 */
1862          ldrh      r1, [r1, #0x0a]               /* r1 = ..BA */
1863          strb      r2, [r0]            /* low byte of r2 = byte 0 */
1864          mov       r2, r2, lsr #8                /* r2 = ...1 */
1865          orr       r2, r2, r3, lsl #8  /* r2 = 4321 */
1866          mov       r3, r3, lsr #24               /* r3 = ...5 */
1867          orr       r3, r3, ip, lsl #8  /* r3 = 8765 */
1868          mov       ip, ip, lsr #24               /* ip = ...9 */
1869          orr       ip, ip, r1, lsl #8  /* ip = .BA9 */
1870          mov       r1, r1, lsr #8                /* r1 = ...B */
1871          str       r2, [r0, #0x01]     /* dst bytes 1-4 */
1872          str       r3, [r0, #0x05]     /* dst bytes 5-8 */
1873          strh      ip, [r0, #0x09]     /* low halfword of ip = bytes 9-A */
1874          strb      r1, [r0, #0x0b]     /* dst byte B */
1875#endif
1876          RET
1877          LMEMCPY_C_PAD
1878
1879/*
1880 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
 *
 * Copies the 12 bytes at r1+0x00..0x0b to r0+0x00..0x0b.  dst and src
 * sit at the same offset within a word, so no shifting is needed: the
 * data moves as byte, word, word, halfword, byte.  Given the alignment
 * stated above, offsets 0x01/0x05 are word aligned and offset 0x09 is
 * halfword aligned for both pointers.
 * r0 is left unmodified; r1, r2, r3 and ip are overwritten.
1881 */
1882          ldrb      r2, [r1]            /* r2 = src byte 0 */
1883          ldr       r3, [r1, #0x01]     /* r3 = src bytes 1-4 */
1884          ldr       ip, [r1, #0x05]     /* ip = src bytes 5-8 */
1885          strb      r2, [r0]            /* byte 0 out; r2 is reused below */
1886          ldrh      r2, [r1, #0x09]     /* r2 = src bytes 9-A */
1887          ldrb      r1, [r1, #0x0b]     /* r1 = src byte B (last use of src pointer) */
1888          str       r3, [r0, #0x01]     /* dst bytes 1-4 */
1889          str       ip, [r0, #0x05]     /* dst bytes 5-8 */
1890          strh      r2, [r0, #0x09]     /* dst bytes 9-A */
1891          strb      r1, [r0, #0x0b]     /* dst byte B */
1892          RET
1893END(memcpy)
1894#endif    /* !_STANDALONE */
1895