1/* $NetBSD: memset.S,v 1.3 2020/04/11 05:12:52 ryo Exp $ */
2
3/*-
4 * Copyright (c) 2014 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Matt Thomas of 3am Software Foundry.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include <machine/asm.h>
33
/*
 * void *memset(void *b, int c, size_t len)
 *
 * Store (unsigned char)c into each of the len bytes starting at b and
 * return b.
 *
 * ABI:      AAPCS64; leaf routine, does not touch the stack.
 * In:       x0 = b, w1 = c, x2 = len
 * Out:      x0 = b (x0 is never written)
 * Clobbers: x1, x2, x5, x6, x7, x9-x13, x15, condition flags
 *
 * Register roles:
 *   x15      working store pointer
 *   x1, x6   fill byte replicated over 64 bits (both zero when zeroing)
 *   x2       bytes remaining; goes negative on the final loop pass, at
 *            which point only its low bits are consulted
 *   x9       log2 of the DC ZVA block size in bytes
 *   x10      DC ZVA block size in bytes
 *   x11      x10 - 1, i.e. the offset-within-block mask
 *   x7       bytes, then 16-byte chunks, needed to reach a block boundary
 */
ENTRY(memset)
          cbz       x2, .Lret                     /* len == 0: nothing to store */
          mov       x15, x0                       /* working data pointer */
          /*
           * Only the low byte of c is significant and, c being an int, the
           * upper half of x1 is not defined by the caller.  Reduce to the
           * byte before testing for zero so every zero fill takes the fast
           * path.  Writing w1 also clears x1[63:32].
           */
          ands      w1, w1, #0xff
          b.eq      .Lzerofill
          /*
           * Non zero fill, replicate the byte to all 64 bits of x1.
           */
          orr       x1, x1, x1, lsl #8
          orr       x1, x1, x1, lsl #16
          orr       x1, x1, x1, lsl #32
.Lfilled:
          cmp       x2, #15                       /* if it's small, ignore alignment */
          b.ls      .Llast_subqword

          mov       x6, x1                        /* 2nd register of each stp pair */
          tst       x15, #15
          b.eq      .Lqword_loop                  /* already 16-byte aligned */

/*
 * We have at least 16 bytes to set and alignment needs at most 15 of them,
 * so we can get qword (16-byte) alignment without checking the amount left.
 */
          tbz       x15, #0, .Lhword_aligned
          strb      w1, [x15], #1
.Lhword_aligned:
          tbz       x15, #1, .Lword_aligned
          strh      w1, [x15], #2
.Lword_aligned:
          tbz       x15, #2, .Ldword_aligned
          str       w1, [x15], #4
.Ldword_aligned:
          tbz       x15, #3, .Lqword_aligned
          str       x1, [x15], #8
/*
 * Now we are qword aligned.  Work out how many bytes it took to get here
 * (1..15) and subtract that from the length.  At least one byte is left,
 * so there is no need to test for completion here.
 */
.Lqword_aligned:
          sub       x5, x15, x0
          sub       x2, x2, x5

/*
 * Write 16 bytes at a time.  If fewer than 16 bytes remain, bail out to the
 * tail code; x2 is then (remaining - 16), whose low four bits equal those
 * of the remaining count.  Return straight away when the count hits zero.
 */
.Lqword_loop:
          subs      x2, x2, #16
          b.mi      .Llast_subqword
          stp       x1, x6, [x15], #16            /* stp leaves the flags alone */
          b.ne      .Lqword_loop
          ret

/*
 * We have less than a qword to write; bits 3..0 of x2 say how much.  We hope
 * we are aligned but since unaligned access works, we don't have to be.
 */
.Llast_subqword:
          tbz       x2, #3, .Llast_subdword
          str       x1, [x15], #8
.Llast_subdword:
          tbz       x2, #2, .Llast_subword
          str       w1, [x15], #4
.Llast_subword:
          tbz       x2, #1, .Llast_subhword
          strh      w1, [x15], #2
.Llast_subhword:
          tbz       x2, #0, .Lret
          strb      w1, [x15]
.Lret:    ret

/*
 * If we are filling with zeros then let's see if we can use the
 *        dc zva, <Xt>
 * instruction to speed things up.  x1 is zero on entry here, so every
 * fallback to .Lfilled below still stores zeros.
 */
.Lzerofill:
          mrs       x9, dczid_el0
          /*
           * Make sure the instruction isn't prohibited.
           */
          tbnz      x9, #4, .Lfilled
          /*
           * Now find out the block size.
           */
          ubfx      x9, x9, #0, #4                /* extract low 4 bits: log2(words) */
          add       x9, x9, #2                    /* add log2(word size) -> log2(bytes) */
          mov       x10, #1
          lsl       x10, x10, x9                  /* shift to get the block size */
          cmp       x2, x10                       /* are we even setting a block? */
          b.lo      .Lfilled                      /*   no (unsigned), 16 bytes at a time */
          /*
           * Now we figure out how many aligned blocks we have.
           */
          sub       x11, x10, #1                  /* make block size a mask */
          add       x12, x15, x11                 /* round start up to a block boundary */
          asr       x12, x12, x9                  /* "starting" block number */
          add       x13, x15, x2                  /* get ending address */
          asr       x13, x13, x9                  /* "ending" block number */
          cmp       x13, x12                      /* how many blocks? */
          b.ls      .Lfilled                      /*   none, do it 16 bytes at a time */

          /*
           * Now we have one or more whole blocks to deal with.  First we
           * need to get block aligned.
           */
          and       x7, x15, x11                  /* offset within the current block */
          cbz       x7, .Lblock_aligned           /* already on a block boundary */

          sub       x7, x10, x7                   /* bytes up to the next boundary */
          sub       x2, x2, x7                    /* subtract that from length */
          asr       x7, x7, #4                    /* bytes -> number of 16-byte chunks */

          /* Zero 1, 2, 4 and/or 8 bytes to reach 16-byte alignment. */
          tbz       x15, #0, .Lzero_hword_aligned
          strb      wzr, [x15], #1
.Lzero_hword_aligned:
          tbz       x15, #1, .Lzero_word_aligned
          strh      wzr, [x15], #2
.Lzero_word_aligned:
          tbz       x15, #2, .Lzero_dword_aligned
          str       wzr, [x15], #4
.Lzero_dword_aligned:
          tbz       x15, #3, .Lzero_qword_aligned
          str       xzr, [x15], #8
.Lzero_qword_aligned:
          cbz       x7, .Lblock_aligned           /* no 16-byte chunks left: aligned */

          /*
           * Align to the DCZID_EL0:BS boundary: x7 chunks of 16 bytes,
           * issued as one optional 16, one optional 32, then 64 at a time.
           */
          tbz       x7, #0, .Lzero_32bytes        /* odd chunk count? fill 16 bytes */
          stp       xzr, xzr, [x15], #16
.Lzero_32bytes:
          tbz       x7, #1, .Lzero_64bytes        /* fill 32 bytes? */
          stp       xzr, xzr, [x15], #16
          stp       xzr, xzr, [x15], #16
.Lzero_64bytes:
          lsr       x7, x7, #2                    /* remaining groups of 64 bytes */
          cbz       x7, .Lblock_aligned
.L64bytes_fill:
          sub       x7, x7, #1
          stp       xzr, xzr, [x15], #16
          stp       xzr, xzr, [x15], #16
          stp       xzr, xzr, [x15], #16
          stp       xzr, xzr, [x15], #16
          cbnz      x7, .L64bytes_fill
          /* FALLTHROUGH */

/*
 * Now we are block aligned.  Zero one whole block per pass; return directly
 * when the length runs out exactly on a block boundary.
 */
.Lblock_aligned:
          subs      x2, x2, x10
          b.mi      .Lblock_done                  /* less than a block left */
          dc        zva, x15
          add       x15, x15, x10                 /* add (not adds): flags survive */
          b.ne      .Lblock_aligned
          ret

/*
 * x2 is (remaining - block size) with 0 < remaining < block size; masking
 * with (block size - 1) recovers the remaining byte count.
 */
.Lblock_done:
          and       x2, x2, x11                   /* make positive again */
          mov       x6, xzr                       /* fill 2nd xword */
          b         .Lqword_loop                  /* and finish filling */

END(memset)
199