1/*        $NetBSD: memset_neon.S,v 1.1 2012/12/13 01:41:59 matt Exp $ */
2
3/*-
4 * Copyright (c) 2012 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Matt Thomas of 3am Software Foundry.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31#include <machine/asm.h>
32#include "assym.h"
33
/*
 * memset: Sets a block of memory to the specified value
 * Using NEON instructions
 *
 * C prototype: void *memset(void *dest, int c, size_t len)
 *
 * On entry:
 *   r0 - dest address
 *   r1 - byte to write
 *   r2 - number of bytes to write
 *
 * On exit:
 *   r0 - dest address
 *
 * Register usage inside the routine:
 *   r1    - number of bytes still to be written
 *   r2    - scratch (byte counts / jump table offsets)
 *   r3    - scratch (fill byte, then addresses / code offsets)
 *   ip    - current (doubleword aligned) store address
 *   d0-d7 - fill pattern; d0, d2, d3 and d7 are temporarily reused while
 *           building a partial first or last doubleword
 *
 * Clobbers: r1-r3, ip, d0-d7 (q0-q3), condition flags.
 *
 * Notes:
 *   - Partial doublewords at either end are written with a doubleword
 *     read-modify-write, so neighbouring bytes inside the same aligned
 *     doubleword are rewritten with the value that was just read.
 *   - The computed jumps (add/sub to pc) rely on ARM state: every
 *     instruction is 4 bytes and pc reads as the current instruction + 8.
 */
/* LINTSTUB: Func: void *memset(void *, int, size_t) */
ENTRY(memset)
	cmp		r2, #0		/* nothing to do for a 0 length */
	RETc(eq)

	and		r3, r1, #0xff	/* We deal with bytes */
	mov		r1, r2		/* r1 = bytes remaining */
	mov		ip, r0		/* r0 needs to stay the same */

	vdup.8		q0, r3		/* move fill to SIMD (d0-d1) */
	/* we no longer need to keep the fill value in an ARM register */

	/* Ok first we will dword align the address */
	ands		r2, ip, #7	/* grab the bottom three bits */
	beq		.Lmemset_dwordaligned	/* The addr is dword aligned */

	/*
	 * Unaligned start: r2 = number of bytes in front of dest that share
	 * its doubleword and must be preserved.  Build the first doubleword
	 * in d0 from the fill value and the existing memory contents.
	 */
	bic		ip, ip, #7	/* clear bottom three bits of addr */
	vldr		d7, [ip]	/* load current memory contents */
	add		r1, r1, r2	/* add "pre-fill" to length */
	lsl		r2, r2, #3	/* pre-fill byte count to bit count */
#ifdef __ARMEB__
	neg		r2, r2		/* start from the MSB (shift right) */
#endif
	vmov		s4, r2		/* shift count to d2 (low byte used) */
	vmov.i8		d3, #0xff	/* set all ones */
	vshl.u64	d3, d3, d2	/* d3 = ones in the bytes to fill */
	vbif		d0, d7, d3	/* keep memory where the mask is 0 */

	cmp		r1, #8		/* does the fill end in this dword? */
	movlo		r2, #0		/* yes, the partial dword is at ip */
	blo		.Lmemset_lessthaneight	/* d0 = fill merged with head */

	vstmia		ip!, {d0}	/* write back to memory */
	subs		r1, r1, #8	/* and remove 8 bytes from the length */
	RETc(eq)
	vmov		d0, d1		/* d0 is pure fill again */

	/* We are now doubleword aligned and r1 != 0 */
.Lmemset_dwordaligned:
	vmov		q1, q0		/* put fill in q1 (d2-d3) */
	vmov		q2, q0		/* put fill in q2 (d4-d5) */
	vmov		q3, q0		/* put fill in q3 (d6-d7) */

	ands		r2, ip, #63	/* check for 64-byte alignment */
	beq		.Lmemset_8dwordaligned
	/*
	 * Let's align to a 64-byte boundary so that the bulk stores don't
	 * cross cacheline boundaries.  If the length is too short to reach
	 * the boundary, store as many whole doublewords as the length allows.
	 * Each table entry below is exactly two instructions (8 bytes) and
	 * entry N stores N doublewords, so the byte count in r2 is also the
	 * offset of the entry.
	 */
	rsb		r2, r2, #64	/* how many bytes until 64 bytes */
	cmp		r1, r2		/* compare against length */
	andlo		r2, r1, #0x38	/* if len is less, use trunc(len, 8) */
	subs		r1, r1, r2	/* subtract from len */
	add		pc, pc, r2	/* and jump to it */
	nop
	RETc(eq);			b	.Lmemset_lessthaneight
	vstmia		ip!, {d0};	b	.Lmemset_8dwordaligned
	vstmia		ip!, {d0-d1};	b	.Lmemset_8dwordaligned
	vstmia		ip!, {d0-d2};	b	.Lmemset_8dwordaligned
	vstmia		ip!, {d0-d3};	b	.Lmemset_8dwordaligned
	vstmia		ip!, {d0-d4};	b	.Lmemset_8dwordaligned
	vstmia		ip!, {d0-d5};	b	.Lmemset_8dwordaligned
	vstmia		ip!, {d0-d6}
.Lmemset_8dwordaligned:
	cmp		r1, #8		/* do we have less than 8 bytes */
	movlo		r2, #0		/* the partial dword is at ip */
	blo		.Lmemset_lessthaneight

	cmp		r1, #512
	blo		.Lmemset_sub512

	/* Do 512 bytes at a time */
	mov		r2, #512
.Lmemset_512:
	vstmia		ip!, {d0-d7}
	vstmia		ip!, {d0-d7}
	vstmia		ip!, {d0-d7}
	vstmia		ip!, {d0-d7}
	vstmia		ip!, {d0-d7}
	vstmia		ip!, {d0-d7}
	vstmia		ip!, {d0-d7}
	vstmia		ip!, {d0-d7}
.Lmemset_0:
	subs		r1, r1, r2	/* r2 = number of bytes just stored */
	RETc(eq)			/* return if done */
	cmp		r1, #512
	bhs		.Lmemset_512	/* only taken while r2 == 512 */

	/*
	 * We have less than 512 bytes left, but since the sequence above
	 * stores 64 bytes per instruction, we determine the number of
	 * instructions we need to store the remainder (if >= 64 bytes) and
	 * jump backwards so that exactly that many vstmia are executed
	 * before reaching .Lmemset_0, which then subtracts r2 from r1.
	 */
.Lmemset_sub512:
	bics		r2, r1, #63	/* bytes in whole 64-byte chunks */
	lsrne		r3, r2, #4	/* 4 bytes of code per 64 bytes */
	addne		r3, r3, #1f + 8 - .Lmemset_0
					/* add the # of bytes between */
1:	subne		pc, pc, r3	/* and go */

	/*
	 * We have less than 64 bytes (but at least 1) left to store at ip.
	 * The table at .Lmemset_finalstore is indexed by
	 * (number of doublewords to store - 1) * 8 and always ends with d7.
	 */
	and		r2, r1, #56	/* get # of bytes in full dwords */
	ands		r1, r1, #7	/* get # of extra bytes */
	subeq		r2, r2, #8	/* no partial dword: one store less */
	beq		.Lmemset_finalstore
	/*
	 * The last word is a partial fill so load its value and update it
	 * to include the fill value.
	 *   r1 = number of bytes to fill in the partial dword (may be 0 when
	 *        reached with r2 == 0, in which case we are done)
	 *   r2 = byte offset of the partial dword from ip
	 *   d0 = value for the filled bytes
	 */
.Lmemset_lessthaneight:
	cmp		r1, #0		/* nothing left at all? */
	RETc(eq)
	add		r3, ip, r2	/* vldr has no register offset form */
	vldr		d7, [r3]	/* load the last partial dword */
	lsl		r1, r1, #3	/* byte to fill bit count */
#ifdef __ARMEB__
	neg		r1, r1		/* start from the MSB (shift right) */
#endif
	vmov		s4, r1		/* shift count to d2 (low byte used) */
	vmov.i8		d3, #0xff	/* set all ones */
	vshl.u64	d3, d3, d2	/* d3 = ones in the bytes to keep */
	vbif		d7, d0, d3	/* insert d0 where the mask is 0 */
	vmov		q1, q0		/* restore d2 & d3 for the stores */
.Lmemset_finalstore:
	add		pc, pc, r2	/* and jump to it */
	nop
	vstr		d7, [ip];	RET
	vstmia		ip, {d6-d7};	RET
	vstmia		ip, {d5-d7};	RET
	vstmia		ip, {d4-d7};	RET
	vstmia		ip, {d3-d7};	RET
	vstmia		ip, {d2-d7};	RET
	vstmia		ip, {d1-d7};	RET
	vstmia		ip, {d0-d7};	RET
END(memset)
187