/* $OpenBSD: sha1_amd64_generic.S,v 1.2 2025/01/18 02:56:07 jsing Exp $ */
/*
 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * Syntax: GNU as, AT&T operand order (source, destination), passed through
 * the C preprocessor. ABI: System V AMD64.
 */

#ifdef __CET__
#include <cet.h>
#else
#define _CET_ENDBR
#endif

/* Function arguments. */
#define	ctx	%rdi
#define	in	%rsi
#define	num	%rdx

/* One past the last input byte to process. */
#define	end	%rbp

/* Working copies of the five 32-bit hash state words. */
#define	hs0	%r8d
#define	hs1	%r9d
#define	hs2	%r10d
#define	hs3	%r11d
#define	hs4	%r12d

/*
 * Scratch registers. Note that tmp3 (%edx) shares a register with num
 * (%rdx); num is only read while computing end, before the first round
 * that uses tmp3.
 */
#define	tmp0	%eax
#define	tmp1	%ebx
#define	tmp2	%ecx
#define	tmp3	%edx

/*
 * Load message into wt, storing a copy in the message schedule:
 *
 *  Wt = Mt
 *
 * The word is loaded from m and byte swapped before being stored to slot
 * (idx & 0xf) of the 16 entry message schedule at w.
 */
#define sha1_message_schedule_load(idx, m, w, wt) \
	movl	(((idx)&0xf)*4)(m), wt; \
	bswapl	wt; \
	movl	wt, (((idx)&0xf)*4)(w);

/*
 * Update message schedule and return current value in wt:
 *
 *  W0 = rol(W13 ^ W8 ^ W2 ^ W0, 1)
 *
 * The message schedule at w is used as a 16 entry circular buffer, with
 * each slot index being reduced modulo 16. The slot for idx is overwritten
 * with the new value.
 */
#define sha1_message_schedule_update(idx, w, wt) \
	movl	((((idx)-3)&0xf)*4)(w), wt;	/* W13 */ \
	xorl	((((idx)-8)&0xf)*4)(w), wt;	/* W8 */ \
	xorl	((((idx)-14)&0xf)*4)(w), wt;	/* W2 */ \
	xorl	(((idx)&0xf)*4)(w), wt;		/* W0 */ \
	roll	$1, wt; \
	\
	movl	wt, (((idx)&0xf)*4)(w);

/*
 * Compute a SHA-1 round without logic function:
 *
 *  T = rol(a, 5) + e + Kt + Wt
 *
 * The caller is required to compute the appropriate logic function
 * (Ch, Maj, Parity) and add it to e.
 *
 * Upon completion b = rol(b, 30), e = T, pending rotation.
 *
 * Clobbers tmp1. The leal is used to add Kt and Wt to e in a single
 * instruction.
 */
#define sha1_round(a, b, c, d, e, kt, wt) \
	leal	kt(wt, e, 1), e;		/* Kt + Wt */ \
	\
	movl	a, tmp1;			/* rol(a, 5) */ \
	roll	$5, tmp1; \
	addl	tmp1, e; \
	\
	roll	$30, b;				/* rol(b, 30) */

/*
 * Compute a SHA-1 round with Ch:
 *
 *  T = rol(a, 5) + Ch(b, c, d) + e + Kt + Wt
 *
 *  Ch(x, y, z) = (x & y) ^ (~x & z) = ((y ^ z) & x) ^ z
 *
 * Upon completion b = rol(b, 30), e = T, pending rotation.
 *
 * Clobbers tmp1 and tmp2.
 */
#define sha1_round_ch(a, b, c, d, e, kt, wt) \
	movl	c, tmp2;			/* Ch */ \
	xorl	d, tmp2;			/* Ch */ \
	andl	b, tmp2;			/* Ch */ \
	xorl	d, tmp2;			/* Ch */ \
	addl	tmp2, e;			/* Ch */ \
	\
	sha1_round(a, b, c, d, e, kt, wt);

/*
 * Compute a SHA-1 round with Parity:
 *
 *  T = rol(a, 5) + Parity(b, c, d) + e + Kt + Wt
 *
 *  Parity(x, y, z) = x ^ y ^ z
 *
 * Upon completion b = rol(b, 30), e = T, pending rotation.
 *
 * Clobbers tmp1 and tmp2.
 */
#define sha1_round_parity(a, b, c, d, e, kt, wt) \
	movl	b, tmp2;			/* Parity */ \
	xorl	c, tmp2;			/* Parity */ \
	xorl	d, tmp2;			/* Parity */ \
	addl	tmp2, e;			/* Parity */ \
	\
	sha1_round(a, b, c, d, e, kt, wt);

/*
 * Compute a SHA-1 round with Maj:
 *
 *  T = rol(a, 5) + Maj(b, c, d) + e + Kt + Wt
 *
 *  Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) = ((y ^ z) & x) ^ (y & z)
 *
 * Upon completion b = rol(b, 30), e = T, pending rotation.
 *
 * Clobbers tmp1, tmp2 and tmp3.
 */
#define sha1_round_maj(a, b, c, d, e, kt, wt) \
	movl	c, tmp2;			/* Maj */ \
	xorl	d, tmp2;			/* Maj */ \
	andl	b, tmp2;			/* Maj */ \
	movl	c, tmp3;			/* Maj */ \
	andl	d, tmp3;			/* Maj */ \
	xorl	tmp2, tmp3;			/* Maj */ \
	addl	tmp3, e;			/* Maj */ \
	\
	sha1_round(a, b, c, d, e, kt, wt);

/*
 * Per round wrappers, pairing a message schedule step (held in tmp0) with
 * the logic function and round constant for that group of rounds. The
 * message schedule lives at %rsp.
 */
#define sha1_round1_load(idx, a, b, c, d, e) \
	sha1_message_schedule_load(idx, in, %rsp, tmp0) \
	sha1_round_ch(a, b, c, d, e, 0x5a827999, tmp0)

#define sha1_round1_update(idx, a, b, c, d, e) \
	sha1_message_schedule_update(idx, %rsp, tmp0) \
	sha1_round_ch(a, b, c, d, e, 0x5a827999, tmp0)

#define sha1_round2_update(idx, a, b, c, d, e) \
	sha1_message_schedule_update(idx, %rsp, tmp0) \
	sha1_round_parity(a, b, c, d, e, 0x6ed9eba1, tmp0)

#define sha1_round3_update(idx, a, b, c, d, e) \
	sha1_message_schedule_update(idx, %rsp, tmp0) \
	sha1_round_maj(a, b, c, d, e, 0x8f1bbcdc, tmp0)

#define sha1_round4_update(idx, a, b, c, d, e) \
	sha1_message_schedule_update(idx, %rsp, tmp0) \
	sha1_round_parity(a, b, c, d, e, 0xca62c1d6, tmp0)

.text

/*
 * void sha1_block_generic(SHA1_CTX *ctx, const void *in, size_t num);
 *
 * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num
 *
 * Processes num 64 byte blocks from in, updating the five 32-bit hash
 * state words stored at the start of ctx after each block. A num of zero
 * returns immediately without reading in or writing to ctx.
 *
 * Saved and restored: rbx, rbp, r12.
 * Modified:           rax, rcx, rdx, rsi, r8-r11, flags.
 * Stack:              64 byte message schedule at a 64 byte aligned %rsp,
 *                     followed by an 8 byte slot holding the original %rsp.
 */
.align 16
.globl	sha1_block_generic
.type	sha1_block_generic,@function
sha1_block_generic:
	_CET_ENDBR

	/*
	 * The block loop tests its condition at the bottom, so without this
	 * check a zero block count would still consume one block of input.
	 */
	testq	num, num
	jz	.Lreturn

	/* Save callee save registers. */
	pushq	%rbx
	pushq	%rbp
	pushq	%r12

	/*
	 * Allocate space for message schedule. The original stack pointer
	 * is kept in the slot above the schedule, since aligning %rsp
	 * discards a variable number of bytes.
	 */
	movq	%rsp, %rax
	subq	$(64+1*8), %rsp
	andq	$~63, %rsp
	movq	%rax, (64+0*8)(%rsp)

	/* Compute end of message (in + num * 64). */
	shlq	$6, num
	leaq	(in, num, 1), end

	/* Load current hash state from context. */
	movl	(0*4)(ctx), hs0
	movl	(1*4)(ctx), hs1
	movl	(2*4)(ctx), hs2
	movl	(3*4)(ctx), hs3
	movl	(4*4)(ctx), hs4

	jmp	.Lblock_loop

.align 16
.Lblock_loop:

	/*
	 * Rather than moving values between registers after each round, the
	 * register arguments are rotated by one position per round. The
	 * rotation has a period of five, so after 80 rounds each hsN once
	 * again holds the state word that it started with.
	 */

	/* Round 0 through 15. */
	sha1_round1_load(0, hs0, hs1, hs2, hs3, hs4)
	sha1_round1_load(1, hs4, hs0, hs1, hs2, hs3)
	sha1_round1_load(2, hs3, hs4, hs0, hs1, hs2)
	sha1_round1_load(3, hs2, hs3, hs4, hs0, hs1)
	sha1_round1_load(4, hs1, hs2, hs3, hs4, hs0)
	sha1_round1_load(5, hs0, hs1, hs2, hs3, hs4)
	sha1_round1_load(6, hs4, hs0, hs1, hs2, hs3)
	sha1_round1_load(7, hs3, hs4, hs0, hs1, hs2)
	sha1_round1_load(8, hs2, hs3, hs4, hs0, hs1)
	sha1_round1_load(9, hs1, hs2, hs3, hs4, hs0)
	sha1_round1_load(10, hs0, hs1, hs2, hs3, hs4)
	sha1_round1_load(11, hs4, hs0, hs1, hs2, hs3)
	sha1_round1_load(12, hs3, hs4, hs0, hs1, hs2)
	sha1_round1_load(13, hs2, hs3, hs4, hs0, hs1)
	sha1_round1_load(14, hs1, hs2, hs3, hs4, hs0)
	sha1_round1_load(15, hs0, hs1, hs2, hs3, hs4)

	/* Round 16 through 31. */
	sha1_round1_update(16, hs4, hs0, hs1, hs2, hs3)
	sha1_round1_update(17, hs3, hs4, hs0, hs1, hs2)
	sha1_round1_update(18, hs2, hs3, hs4, hs0, hs1)
	sha1_round1_update(19, hs1, hs2, hs3, hs4, hs0)
	sha1_round2_update(20, hs0, hs1, hs2, hs3, hs4)
	sha1_round2_update(21, hs4, hs0, hs1, hs2, hs3)
	sha1_round2_update(22, hs3, hs4, hs0, hs1, hs2)
	sha1_round2_update(23, hs2, hs3, hs4, hs0, hs1)
	sha1_round2_update(24, hs1, hs2, hs3, hs4, hs0)
	sha1_round2_update(25, hs0, hs1, hs2, hs3, hs4)
	sha1_round2_update(26, hs4, hs0, hs1, hs2, hs3)
	sha1_round2_update(27, hs3, hs4, hs0, hs1, hs2)
	sha1_round2_update(28, hs2, hs3, hs4, hs0, hs1)
	sha1_round2_update(29, hs1, hs2, hs3, hs4, hs0)
	sha1_round2_update(30, hs0, hs1, hs2, hs3, hs4)
	sha1_round2_update(31, hs4, hs0, hs1, hs2, hs3)

	/* Round 32 through 47. */
	sha1_round2_update(32, hs3, hs4, hs0, hs1, hs2)
	sha1_round2_update(33, hs2, hs3, hs4, hs0, hs1)
	sha1_round2_update(34, hs1, hs2, hs3, hs4, hs0)
	sha1_round2_update(35, hs0, hs1, hs2, hs3, hs4)
	sha1_round2_update(36, hs4, hs0, hs1, hs2, hs3)
	sha1_round2_update(37, hs3, hs4, hs0, hs1, hs2)
	sha1_round2_update(38, hs2, hs3, hs4, hs0, hs1)
	sha1_round2_update(39, hs1, hs2, hs3, hs4, hs0)
	sha1_round3_update(40, hs0, hs1, hs2, hs3, hs4)
	sha1_round3_update(41, hs4, hs0, hs1, hs2, hs3)
	sha1_round3_update(42, hs3, hs4, hs0, hs1, hs2)
	sha1_round3_update(43, hs2, hs3, hs4, hs0, hs1)
	sha1_round3_update(44, hs1, hs2, hs3, hs4, hs0)
	sha1_round3_update(45, hs0, hs1, hs2, hs3, hs4)
	sha1_round3_update(46, hs4, hs0, hs1, hs2, hs3)
	sha1_round3_update(47, hs3, hs4, hs0, hs1, hs2)

	/* Round 48 through 63. */
	sha1_round3_update(48, hs2, hs3, hs4, hs0, hs1)
	sha1_round3_update(49, hs1, hs2, hs3, hs4, hs0)
	sha1_round3_update(50, hs0, hs1, hs2, hs3, hs4)
	sha1_round3_update(51, hs4, hs0, hs1, hs2, hs3)
	sha1_round3_update(52, hs3, hs4, hs0, hs1, hs2)
	sha1_round3_update(53, hs2, hs3, hs4, hs0, hs1)
	sha1_round3_update(54, hs1, hs2, hs3, hs4, hs0)
	sha1_round3_update(55, hs0, hs1, hs2, hs3, hs4)
	sha1_round3_update(56, hs4, hs0, hs1, hs2, hs3)
	sha1_round3_update(57, hs3, hs4, hs0, hs1, hs2)
	sha1_round3_update(58, hs2, hs3, hs4, hs0, hs1)
	sha1_round3_update(59, hs1, hs2, hs3, hs4, hs0)
	sha1_round4_update(60, hs0, hs1, hs2, hs3, hs4)
	sha1_round4_update(61, hs4, hs0, hs1, hs2, hs3)
	sha1_round4_update(62, hs3, hs4, hs0, hs1, hs2)
	sha1_round4_update(63, hs2, hs3, hs4, hs0, hs1)

	/* Round 64 through 79. */
	sha1_round4_update(64, hs1, hs2, hs3, hs4, hs0)
	sha1_round4_update(65, hs0, hs1, hs2, hs3, hs4)
	sha1_round4_update(66, hs4, hs0, hs1, hs2, hs3)
	sha1_round4_update(67, hs3, hs4, hs0, hs1, hs2)
	sha1_round4_update(68, hs2, hs3, hs4, hs0, hs1)
	sha1_round4_update(69, hs1, hs2, hs3, hs4, hs0)
	sha1_round4_update(70, hs0, hs1, hs2, hs3, hs4)
	sha1_round4_update(71, hs4, hs0, hs1, hs2, hs3)
	sha1_round4_update(72, hs3, hs4, hs0, hs1, hs2)
	sha1_round4_update(73, hs2, hs3, hs4, hs0, hs1)
	sha1_round4_update(74, hs1, hs2, hs3, hs4, hs0)
	sha1_round4_update(75, hs0, hs1, hs2, hs3, hs4)
	sha1_round4_update(76, hs4, hs0, hs1, hs2, hs3)
	sha1_round4_update(77, hs3, hs4, hs0, hs1, hs2)
	sha1_round4_update(78, hs2, hs3, hs4, hs0, hs1)
	sha1_round4_update(79, hs1, hs2, hs3, hs4, hs0)

	/* Add intermediate state to hash state. */
	addl	(0*4)(ctx), hs0
	addl	(1*4)(ctx), hs1
	addl	(2*4)(ctx), hs2
	addl	(3*4)(ctx), hs3
	addl	(4*4)(ctx), hs4

	/* Store new hash state to context. */
	movl	hs0, (0*4)(ctx)
	movl	hs1, (1*4)(ctx)
	movl	hs2, (2*4)(ctx)
	movl	hs3, (3*4)(ctx)
	movl	hs4, (4*4)(ctx)

	/* Advance to the next block; unsigned compare since these are pointers. */
	addq	$64, in
	cmpq	end, in
	jb	.Lblock_loop

	/* Release the message schedule by restoring the original %rsp. */
	movq	(64+0*8)(%rsp), %rsp

	/* Restore callee save registers. */
	popq	%r12
	popq	%rbp
	popq	%rbx

.Lreturn:
	ret
.size	sha1_block_generic,.-sha1_block_generic