1/* $OpenBSD: sha1_amd64_generic.S,v 1.2 2025/01/18 02:56:07 jsing Exp $ */
2/*
3 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#ifdef __CET__
19#include <cet.h>
20#else
21#define _CET_ENDBR
22#endif
23
/* Function arguments, in the registers the x86-64 ABI passes them in. */
#define	ctx		%rdi
#define	in		%rsi
#define	num		%rdx

/* Address one past the last input byte (in + num * 64); callee saved. */
#define	end		%rbp

/*
 * Working copy of the five 32-bit hash state words. The round macros are
 * invoked with these in a rotating order rather than moving values between
 * registers. hs4 lives in a callee saved register.
 */
#define	hs0		%r8d
#define	hs1		%r9d
#define	hs2		%r10d
#define	hs3		%r11d
#define	hs4		%r12d

/*
 * Scratch registers. tmp1 (%ebx) is callee saved. tmp3 (%edx) is the low
 * half of num (%rdx), so num must be consumed before tmp3 is first written.
 */
#define	tmp0		%eax
#define	tmp1		%ebx
#define	tmp2		%ecx
#define	tmp3		%edx
40
/*
 * Load message word idx into wt, byte swap it and store a copy in the
 * message schedule:
 *
 *  Wt = Mt
 *
 *  idx - word index; reduced modulo 16 to select the 4 byte slot.
 *  m   - register holding the address of the current input block.
 *  w   - register holding the address of the 16 word message schedule.
 *  wt  - 32-bit register that receives Wt.
 *
 * idx is parenthesised so that an expression argument cannot rebind
 * against the & operator.
 */
#define sha1_message_schedule_load(idx, m, w, wt) \
	movl	(((idx)&0xf)*4)(m), wt;				\
	bswapl	wt;						\
	movl	wt, (((idx)&0xf)*4)(w);
50
/*
 * Update message schedule and return current value in wt:
 *
 *  W0 = rol(W13 ^ W8 ^ W2 ^ W0, 1)
 *
 * The schedule is a 16 word circular buffer, so the words at idx-3, idx-8,
 * idx-14 and idx (all taken modulo 16) are combined and the result
 * overwrites the slot for idx.
 *
 *  idx - round index.
 *  w   - register holding the address of the 16 word message schedule.
 *  wt  - 32-bit register that receives the new schedule word.
 *
 * idx is parenthesised so that an expression argument cannot rebind
 * against the - and & operators.
 */
#define sha1_message_schedule_update(idx, w, wt) \
	movl	((((idx)-3)&0xf)*4)(w), wt;	/* W13 */	\
	xorl	((((idx)-8)&0xf)*4)(w), wt;	/* W8 */	\
	xorl	((((idx)-14)&0xf)*4)(w), wt;	/* W2 */	\
	xorl	(((idx)&0xf)*4)(w), wt;		/* W0 */	\
	roll	$1, wt;						\
	\
	movl	wt, (((idx)&0xf)*4)(w);
64
/*
 * Shared tail of every SHA-1 round, covering everything except the logic
 * function:
 *
 *  T = rol(a, 5) + e + Kt + Wt
 *
 * The round specific macros add Ch, Maj or Parity of (b, c, d) to ve before
 * expanding this macro.
 *
 * On exit ve holds T and vb holds rol(b, 30). The working variables are
 * not moved between registers; callers rotate the argument order for the
 * next round instead. vc and vd are accepted only to keep the argument
 * list uniform. Clobbers tmp1.
 */
#define sha1_round(va, vb, vc, vd, ve, k, w) \
	leal	k(w, ve, 1), ve;		/* e += Kt + Wt */	\
	\
	movl	va, tmp1;			/* copy of a */		\
	roll	$5, tmp1;			/* rol(a, 5) */		\
	addl	tmp1, ve;			/* e += rol(a, 5) */	\
	\
	roll	$30, vb;			/* b = rol(b, 30) */
83
/*
 * SHA-1 round using the Ch logic function:
 *
 *  T = rol(a, 5) + Ch(b, c, d) + e + Kt + Wt
 *
 *  Ch(x, y, z) = (x & y) ^ (~x & z) = ((y ^ z) & x) ^ z
 *
 * The second form is the one evaluated, as it needs a single scratch
 * register (tmp2) and no NOT.
 *
 * On exit vb holds rol(b, 30) and ve holds T; callers rotate the argument
 * order for the next round.
 */
#define sha1_round_ch(va, vb, vc, vd, ve, k, w) \
	movl	vc, tmp2;			/* c */			\
	xorl	vd, tmp2;			/* c ^ d */		\
	andl	vb, tmp2;			/* (c ^ d) & b */	\
	xorl	vd, tmp2;			/* Ch(b, c, d) */	\
	addl	tmp2, ve;			/* e += Ch */		\
	\
	sha1_round(va, vb, vc, vd, ve, k, w);
101
/*
 * SHA-1 round using the Parity logic function:
 *
 *  T = rol(a, 5) + Parity(b, c, d) + e + Kt + Wt
 *
 *  Parity(x, y, z) = x ^ y ^ z
 *
 * Parity is accumulated in tmp2 and added to ve before the common round
 * tail runs.
 *
 * On exit vb holds rol(b, 30) and ve holds T; callers rotate the argument
 * order for the next round.
 */
#define sha1_round_parity(va, vb, vc, vd, ve, k, w) \
	movl	vb, tmp2;			/* b */			\
	xorl	vc, tmp2;			/* b ^ c */		\
	xorl	vd, tmp2;			/* Parity(b, c, d) */	\
	addl	tmp2, ve;			/* e += Parity */	\
	\
	sha1_round(va, vb, vc, vd, ve, k, w);
118
/*
 * SHA-1 round using the Maj logic function:
 *
 *  T = rol(a, 5) + Maj(b, c, d) + e + Kt + Wt
 *
 *  Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) = ((y ^ z) & x) ^ (y & z)
 *
 * The second form is the one evaluated: tmp2 receives (c ^ d) & b, tmp3
 * receives c & d, and their exclusive or is added to ve.
 *
 * On exit vb holds rol(b, 30) and ve holds T; callers rotate the argument
 * order for the next round.
 */
#define sha1_round_maj(va, vb, vc, vd, ve, k, w) \
	movl	vc, tmp2;			/* c */			\
	xorl	vd, tmp2;			/* c ^ d */		\
	andl	vb, tmp2;			/* (c ^ d) & b */	\
	movl	vc, tmp3;			/* c */			\
	andl	vd, tmp3;			/* c & d */		\
	xorl	tmp2, tmp3;			/* Maj(b, c, d) */	\
	addl	tmp3, ve;			/* e += Maj */		\
	\
	sha1_round(va, vb, vc, vd, ve, k, w);
138
/*
 * Round t with Wt taken directly from the input block: load and byte swap
 * word t from in into tmp0, record it in the message schedule at %rsp, then
 * run a Ch round with constant 0x5a827999.
 */
#define sha1_round1_load(t, va, vb, vc, vd, ve) \
	sha1_message_schedule_load(t, in, %rsp, tmp0) \
	sha1_round_ch(va, vb, vc, vd, ve, 0x5a827999, tmp0)
142
/*
 * Round t with Wt derived from the message schedule at %rsp: compute the
 * next schedule word into tmp0, then run a Ch round with constant
 * 0x5a827999.
 */
#define sha1_round1_update(t, va, vb, vc, vd, ve) \
	sha1_message_schedule_update(t, %rsp, tmp0) \
	sha1_round_ch(va, vb, vc, vd, ve, 0x5a827999, tmp0)
146
/*
 * Round t with Wt derived from the message schedule at %rsp: compute the
 * next schedule word into tmp0, then run a Parity round with constant
 * 0x6ed9eba1.
 */
#define sha1_round2_update(t, va, vb, vc, vd, ve) \
	sha1_message_schedule_update(t, %rsp, tmp0) \
	sha1_round_parity(va, vb, vc, vd, ve, 0x6ed9eba1, tmp0)
150
/*
 * Round t with Wt derived from the message schedule at %rsp: compute the
 * next schedule word into tmp0, then run a Maj round with constant
 * 0x8f1bbcdc.
 */
#define sha1_round3_update(t, va, vb, vc, vd, ve) \
	sha1_message_schedule_update(t, %rsp, tmp0) \
	sha1_round_maj(va, vb, vc, vd, ve, 0x8f1bbcdc, tmp0)
154
/*
 * Round t with Wt derived from the message schedule at %rsp: compute the
 * next schedule word into tmp0, then run a Parity round with constant
 * 0xca62c1d6.
 */
#define sha1_round4_update(t, va, vb, vc, vd, ve) \
	sha1_message_schedule_update(t, %rsp, tmp0) \
	sha1_round_parity(va, vb, vc, vd, ve, 0xca62c1d6, tmp0)
158
.text

/*
 * void sha1_block_generic(SHA1_CTX *ctx, const void *in, size_t num);
 *
 * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num
 *
 * Processes num consecutive 64 byte blocks starting at in. The five 32-bit
 * words at the start of ctx are read as the current hash state and are
 * rewritten after every block. A num of zero returns immediately without
 * reading in or touching ctx.
 *
 * Clobbers: rax, rcx, rdx, rsi, r8-r11 and flags. rbx, rbp and r12 are
 * used but saved and restored.
 *
 * Stack: three saved registers plus a 64 byte aligned area holding the
 * 16 word message schedule, with the caller's stack pointer stored in the
 * 8 byte slot directly above it. No calls are made.
 */
.align 16
.globl	sha1_block_generic
.type	sha1_block_generic,@function
sha1_block_generic:
	_CET_ENDBR

	/*
	 * The block loop below tests its condition at the bottom, so it would
	 * process one block even for num == 0. Bail out before anything has
	 * been saved or modified.
	 */
	testq	num, num
	jz	.Lblock_done

	/* Save callee save registers. */
	pushq	%rbx
	pushq	%rbp
	pushq	%r12

	/*
	 * Allocate space for message schedule: 64 bytes, aligned to 64 bytes,
	 * with the original stack pointer kept in the slot above it.
	 */
	movq	%rsp, %rax
	subq	$(64+1*8), %rsp
	andq	$~63, %rsp
	movq	%rax, (64+0*8)(%rsp)

	/*
	 * Compute end of message: end = in + num * 64. num is not needed
	 * after this, which frees %edx for use as tmp3.
	 */
	shlq	$6, num
	leaq	(in, num, 1), end

	/* Load current hash state from context. */
	movl	(0*4)(ctx), hs0
	movl	(1*4)(ctx), hs1
	movl	(2*4)(ctx), hs2
	movl	(3*4)(ctx), hs3
	movl	(4*4)(ctx), hs4

	jmp	.Lblock_loop

.align 16
.Lblock_loop:

	/*
	 * Each round leaves its result in the register passed as e, so the
	 * register arguments rotate by one position per round instead of the
	 * values being moved. After 80 rounds the mapping is back where it
	 * started.
	 */

	/* Round 0 through 15. */
	sha1_round1_load(0, hs0, hs1, hs2, hs3, hs4)
	sha1_round1_load(1, hs4, hs0, hs1, hs2, hs3)
	sha1_round1_load(2, hs3, hs4, hs0, hs1, hs2)
	sha1_round1_load(3, hs2, hs3, hs4, hs0, hs1)
	sha1_round1_load(4, hs1, hs2, hs3, hs4, hs0)
	sha1_round1_load(5, hs0, hs1, hs2, hs3, hs4)
	sha1_round1_load(6, hs4, hs0, hs1, hs2, hs3)
	sha1_round1_load(7, hs3, hs4, hs0, hs1, hs2)
	sha1_round1_load(8, hs2, hs3, hs4, hs0, hs1)
	sha1_round1_load(9, hs1, hs2, hs3, hs4, hs0)
	sha1_round1_load(10, hs0, hs1, hs2, hs3, hs4)
	sha1_round1_load(11, hs4, hs0, hs1, hs2, hs3)
	sha1_round1_load(12, hs3, hs4, hs0, hs1, hs2)
	sha1_round1_load(13, hs2, hs3, hs4, hs0, hs1)
	sha1_round1_load(14, hs1, hs2, hs3, hs4, hs0)
	sha1_round1_load(15, hs0, hs1, hs2, hs3, hs4)

	/* Round 16 through 31. */
	sha1_round1_update(16, hs4, hs0, hs1, hs2, hs3);
	sha1_round1_update(17, hs3, hs4, hs0, hs1, hs2);
	sha1_round1_update(18, hs2, hs3, hs4, hs0, hs1);
	sha1_round1_update(19, hs1, hs2, hs3, hs4, hs0);
	sha1_round2_update(20, hs0, hs1, hs2, hs3, hs4);
	sha1_round2_update(21, hs4, hs0, hs1, hs2, hs3);
	sha1_round2_update(22, hs3, hs4, hs0, hs1, hs2);
	sha1_round2_update(23, hs2, hs3, hs4, hs0, hs1);
	sha1_round2_update(24, hs1, hs2, hs3, hs4, hs0);
	sha1_round2_update(25, hs0, hs1, hs2, hs3, hs4);
	sha1_round2_update(26, hs4, hs0, hs1, hs2, hs3);
	sha1_round2_update(27, hs3, hs4, hs0, hs1, hs2);
	sha1_round2_update(28, hs2, hs3, hs4, hs0, hs1);
	sha1_round2_update(29, hs1, hs2, hs3, hs4, hs0);
	sha1_round2_update(30, hs0, hs1, hs2, hs3, hs4);
	sha1_round2_update(31, hs4, hs0, hs1, hs2, hs3);

	/* Round 32 through 47. */
	sha1_round2_update(32, hs3, hs4, hs0, hs1, hs2);
	sha1_round2_update(33, hs2, hs3, hs4, hs0, hs1);
	sha1_round2_update(34, hs1, hs2, hs3, hs4, hs0);
	sha1_round2_update(35, hs0, hs1, hs2, hs3, hs4);
	sha1_round2_update(36, hs4, hs0, hs1, hs2, hs3);
	sha1_round2_update(37, hs3, hs4, hs0, hs1, hs2);
	sha1_round2_update(38, hs2, hs3, hs4, hs0, hs1);
	sha1_round2_update(39, hs1, hs2, hs3, hs4, hs0);
	sha1_round3_update(40, hs0, hs1, hs2, hs3, hs4);
	sha1_round3_update(41, hs4, hs0, hs1, hs2, hs3);
	sha1_round3_update(42, hs3, hs4, hs0, hs1, hs2);
	sha1_round3_update(43, hs2, hs3, hs4, hs0, hs1);
	sha1_round3_update(44, hs1, hs2, hs3, hs4, hs0);
	sha1_round3_update(45, hs0, hs1, hs2, hs3, hs4);
	sha1_round3_update(46, hs4, hs0, hs1, hs2, hs3);
	sha1_round3_update(47, hs3, hs4, hs0, hs1, hs2);

	/* Round 48 through 63. */
	sha1_round3_update(48, hs2, hs3, hs4, hs0, hs1);
	sha1_round3_update(49, hs1, hs2, hs3, hs4, hs0);
	sha1_round3_update(50, hs0, hs1, hs2, hs3, hs4);
	sha1_round3_update(51, hs4, hs0, hs1, hs2, hs3);
	sha1_round3_update(52, hs3, hs4, hs0, hs1, hs2);
	sha1_round3_update(53, hs2, hs3, hs4, hs0, hs1);
	sha1_round3_update(54, hs1, hs2, hs3, hs4, hs0);
	sha1_round3_update(55, hs0, hs1, hs2, hs3, hs4);
	sha1_round3_update(56, hs4, hs0, hs1, hs2, hs3);
	sha1_round3_update(57, hs3, hs4, hs0, hs1, hs2);
	sha1_round3_update(58, hs2, hs3, hs4, hs0, hs1);
	sha1_round3_update(59, hs1, hs2, hs3, hs4, hs0);
	sha1_round4_update(60, hs0, hs1, hs2, hs3, hs4);
	sha1_round4_update(61, hs4, hs0, hs1, hs2, hs3);
	sha1_round4_update(62, hs3, hs4, hs0, hs1, hs2);
	sha1_round4_update(63, hs2, hs3, hs4, hs0, hs1);

	/* Round 64 through 79. */
	sha1_round4_update(64, hs1, hs2, hs3, hs4, hs0);
	sha1_round4_update(65, hs0, hs1, hs2, hs3, hs4);
	sha1_round4_update(66, hs4, hs0, hs1, hs2, hs3);
	sha1_round4_update(67, hs3, hs4, hs0, hs1, hs2);
	sha1_round4_update(68, hs2, hs3, hs4, hs0, hs1);
	sha1_round4_update(69, hs1, hs2, hs3, hs4, hs0);
	sha1_round4_update(70, hs0, hs1, hs2, hs3, hs4);
	sha1_round4_update(71, hs4, hs0, hs1, hs2, hs3);
	sha1_round4_update(72, hs3, hs4, hs0, hs1, hs2);
	sha1_round4_update(73, hs2, hs3, hs4, hs0, hs1);
	sha1_round4_update(74, hs1, hs2, hs3, hs4, hs0);
	sha1_round4_update(75, hs0, hs1, hs2, hs3, hs4);
	sha1_round4_update(76, hs4, hs0, hs1, hs2, hs3);
	sha1_round4_update(77, hs3, hs4, hs0, hs1, hs2);
	sha1_round4_update(78, hs2, hs3, hs4, hs0, hs1);
	sha1_round4_update(79, hs1, hs2, hs3, hs4, hs0);

	/* Add intermediate state to hash state. */
	addl	(0*4)(ctx), hs0
	addl	(1*4)(ctx), hs1
	addl	(2*4)(ctx), hs2
	addl	(3*4)(ctx), hs3
	addl	(4*4)(ctx), hs4

	/* Store new hash state to context. */
	movl	hs0, (0*4)(ctx)
	movl	hs1, (1*4)(ctx)
	movl	hs2, (2*4)(ctx)
	movl	hs3, (3*4)(ctx)
	movl	hs4, (4*4)(ctx)

	/* Advance to the next block; loop while input remains (unsigned). */
	addq	$64, in
	cmpq	end, in
	jb	.Lblock_loop

	/* Release the message schedule by restoring the saved stack pointer. */
	movq	(64+0*8)(%rsp), %rsp

	/* Restore callee save registers. */
	popq	%r12
	popq	%rbp
	popq	%rbx

.Lblock_done:
	ret
315