root/src/trunk/crypto/openssl/crypto/aes/asm/aesv8-armx.pl
Revision: 12147
Committed: Sun Jan 20 05:34:05 2019 UTC by laffer1
Content type: text/plain
File size: 21533 byte(s)
Log Message:
openssl 1.0.2p

File Contents

1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # This module implements support for ARMv8 AES instructions. The
11 # module is endian-agnostic in the sense that it supports both big-
12 # and little-endian cases. It also supports both 32- and 64-bit modes
13 # of operation. The latter is achieved by limiting the number of
14 # utilized registers to 16, which implies additional NEON load and
15 # integer instructions. This has no effect on the mighty Apple A7,
16 # where results are literally equal to the theoretical estimates
17 # based on AES instruction latencies and issue rates. On Cortex-A53,
18 # an in-order execution core, this costs up to 10-15%, which is
19 # partially compensated by a dedicated code path for the 128-bit CBC
20 # encrypt case. On Cortex-A57, parallelizable-mode performance seems
21 # to be limited by the sheer amount of NEON instructions...
22 #
23 # Performance in cycles per byte processed with 128-bit key:
24 #
25 #                 CBC enc   CBC dec   CTR
26 # Apple A7          2.39      1.20    1.20
27 # Cortex-A53        1.32      1.29    1.46
28 # Cortex-A57(*)     1.95      0.85    0.93
29 # Denver            1.96      0.86    0.80
30 #
31 # (*) original 3.64/1.34/1.32 results were for the r0p0 revision
32 # and remain the same even for the updated module;
33
34 $flavour = shift;
35 open STDOUT,">".shift;
36
37 $prefix="aes_v8";
38
39 $code=<<___;
40 #include "arm_arch.h"
41
42 #if __ARM_MAX_ARCH__>=7
43 .text
44 ___
45 $code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
46 $code.=".arch armv7-a\n.fpu neon\n.code 32\n" if ($flavour !~ /64/);
47 #^^^^^^ this is done to simplify adoption by not depending
48 # on the latest binutils.
49
50 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
51 # NEON uses mostly 32-bit mnemonics, integer code mostly 64-bit. The
52 # goal is to maintain both 32- and 64-bit code within a single module
53 # and transliterate common code to either flavour with regex voodoo.
54 #
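# For illustration, a minimal sketch (not part of the module) of what
# that voodoo does to one 32-bit-style line in the 64-bit pass at the
# bottom of this file:
#
#	my $line = "	vld1.32	{q8},[x7],#16";
#	$line =~ s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/ge;	# q8 -> v16.16b
#	$line =~ s/^(\s+)v/$1/;				# strip v prefix
#	$line =~ s/\.[ui]?32// and $line =~ s/\.16b/\.4s/g;	# suffix -> lane size
#	# $line is now "	ld1	{v16.4s},[x7],#16"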
55 {{{
56 my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
57 my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
58 $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
59
60
61 $code.=<<___;
62 .align 5
63 rcon:
64 .long 0x01,0x01,0x01,0x01
65 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
66 .long 0x1b,0x1b,0x1b,0x1b
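// note: the middle row above is a byte-index mask for vtbl that
// rotates the last word of the previous round key and splats it to
// all four lanes; aese with the all-zero key then degenerates to
// SubBytes (ShiftRows merely permutes bytes among the four identical
// lanes), and together they implement RotWord+SubWord of the key
// schedule.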
67
68 .globl ${prefix}_set_encrypt_key
69 .type ${prefix}_set_encrypt_key,%function
70 .align 5
71 ${prefix}_set_encrypt_key:
72 .Lenc_key:
73 ___
74 $code.=<<___ if ($flavour =~ /64/);
75 stp x29,x30,[sp,#-16]!
76 add x29,sp,#0
77 ___
78 $code.=<<___;
79 mov $ptr,#-1
80 cmp $inp,#0
81 b.eq .Lenc_key_abort
82 cmp $out,#0
83 b.eq .Lenc_key_abort
84 mov $ptr,#-2
85 cmp $bits,#128
86 b.lt .Lenc_key_abort
87 cmp $bits,#256
88 b.gt .Lenc_key_abort
89 tst $bits,#0x3f
90 b.ne .Lenc_key_abort
91
92 adr $ptr,rcon
93 cmp $bits,#192
94
95 veor $zero,$zero,$zero
96 vld1.8 {$in0},[$inp],#16
97 mov $bits,#8 // reuse $bits
98 vld1.32 {$rcon,$mask},[$ptr],#32
99
100 b.lt .Loop128
101 b.eq .L192
102 b .L256
103
104 .align 4
105 .Loop128:
106 vtbl.8 $key,{$in0},$mask
107 vext.8 $tmp,$zero,$in0,#12
108 vst1.32 {$in0},[$out],#16
109 aese $key,$zero
110 subs $bits,$bits,#1
111
112 veor $in0,$in0,$tmp
113 vext.8 $tmp,$zero,$tmp,#12
114 veor $in0,$in0,$tmp
115 vext.8 $tmp,$zero,$tmp,#12
116 veor $key,$key,$rcon
117 veor $in0,$in0,$tmp
118 vshl.u8 $rcon,$rcon,#1
119 veor $in0,$in0,$key
120 b.ne .Loop128
121
122 vld1.32 {$rcon},[$ptr]
123
124 vtbl.8 $key,{$in0},$mask
125 vext.8 $tmp,$zero,$in0,#12
126 vst1.32 {$in0},[$out],#16
127 aese $key,$zero
128
129 veor $in0,$in0,$tmp
130 vext.8 $tmp,$zero,$tmp,#12
131 veor $in0,$in0,$tmp
132 vext.8 $tmp,$zero,$tmp,#12
133 veor $key,$key,$rcon
134 veor $in0,$in0,$tmp
135 vshl.u8 $rcon,$rcon,#1
136 veor $in0,$in0,$key
137
138 vtbl.8 $key,{$in0},$mask
139 vext.8 $tmp,$zero,$in0,#12
140 vst1.32 {$in0},[$out],#16
141 aese $key,$zero
142
143 veor $in0,$in0,$tmp
144 vext.8 $tmp,$zero,$tmp,#12
145 veor $in0,$in0,$tmp
146 vext.8 $tmp,$zero,$tmp,#12
147 veor $key,$key,$rcon
148 veor $in0,$in0,$tmp
149 veor $in0,$in0,$key
150 vst1.32 {$in0},[$out]
151 add $out,$out,#0x50
152
153 mov $rounds,#10
154 b .Ldone
155
156 .align 4
157 .L192:
158 vld1.8 {$in1},[$inp],#8
159 vmov.i8 $key,#8 // borrow $key
160 vst1.32 {$in0},[$out],#16
161 vsub.i8 $mask,$mask,$key // adjust the mask
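// (the 192-bit key tail arrives as only 8 bytes in in1, so the
// rotate-n-splat indices must point 8 bytes lower to pick its
// last word)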
162
163 .Loop192:
164 vtbl.8 $key,{$in1},$mask
165 vext.8 $tmp,$zero,$in0,#12
166 vst1.32 {$in1},[$out],#8
167 aese $key,$zero
168 subs $bits,$bits,#1
169
170 veor $in0,$in0,$tmp
171 vext.8 $tmp,$zero,$tmp,#12
172 veor $in0,$in0,$tmp
173 vext.8 $tmp,$zero,$tmp,#12
174 veor $in0,$in0,$tmp
175
176 vdup.32 $tmp,${in0}[3]
177 veor $tmp,$tmp,$in1
178 veor $key,$key,$rcon
179 vext.8 $in1,$zero,$in1,#12
180 vshl.u8 $rcon,$rcon,#1
181 veor $in1,$in1,$tmp
182 veor $in0,$in0,$key
183 veor $in1,$in1,$key
184 vst1.32 {$in0},[$out],#16
185 b.ne .Loop192
186
187 mov $rounds,#12
188 add $out,$out,#0x20
189 b .Ldone
190
191 .align 4
192 .L256:
193 vld1.8 {$in1},[$inp]
194 mov $bits,#7
195 mov $rounds,#14
196 vst1.32 {$in0},[$out],#16
197
198 .Loop256:
199 vtbl.8 $key,{$in1},$mask
200 vext.8 $tmp,$zero,$in0,#12
201 vst1.32 {$in1},[$out],#16
202 aese $key,$zero
203 subs $bits,$bits,#1
204
205 veor $in0,$in0,$tmp
206 vext.8 $tmp,$zero,$tmp,#12
207 veor $in0,$in0,$tmp
208 vext.8 $tmp,$zero,$tmp,#12
209 veor $key,$key,$rcon
210 veor $in0,$in0,$tmp
211 vshl.u8 $rcon,$rcon,#1
212 veor $in0,$in0,$key
213 vst1.32 {$in0},[$out],#16
214 b.eq .Ldone
215
216 vdup.32 $key,${in0}[3] // just splat
217 vext.8 $tmp,$zero,$in1,#12
218 aese $key,$zero
219
220 veor $in1,$in1,$tmp
221 vext.8 $tmp,$zero,$tmp,#12
222 veor $in1,$in1,$tmp
223 vext.8 $tmp,$zero,$tmp,#12
224 veor $in1,$in1,$tmp
225
226 veor $in1,$in1,$key
227 b .Loop256
228
229 .Ldone:
230 str $rounds,[$out]
231 mov $ptr,#0
232
233 .Lenc_key_abort:
234 mov x0,$ptr // return value
235 `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
236 ret
237 .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
238
239 .globl ${prefix}_set_decrypt_key
240 .type ${prefix}_set_decrypt_key,%function
241 .align 5
242 ${prefix}_set_decrypt_key:
243 ___
244 $code.=<<___ if ($flavour =~ /64/);
245 stp x29,x30,[sp,#-16]!
246 add x29,sp,#0
247 ___
248 $code.=<<___ if ($flavour !~ /64/);
249 stmdb sp!,{r4,lr}
250 ___
251 $code.=<<___;
252 bl .Lenc_key
253
254 cmp x0,#0
255 b.ne .Ldec_key_abort
256
257 sub $out,$out,#240 // restore original $out
258 mov x4,#-16
259 add $inp,$out,x12,lsl#4 // end of key schedule
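// walk the schedule from both ends, swapping round keys end-for-end
// and applying aesimc (InvMixColumns) to all but the outermost two,
// to produce the equivalent inverse-cipher schedule that the
// aesd/aesimc path expects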
260
261 vld1.32 {v0.16b},[$out]
262 vld1.32 {v1.16b},[$inp]
263 vst1.32 {v0.16b},[$inp],x4
264 vst1.32 {v1.16b},[$out],#16
265
266 .Loop_imc:
267 vld1.32 {v0.16b},[$out]
268 vld1.32 {v1.16b},[$inp]
269 aesimc v0.16b,v0.16b
270 aesimc v1.16b,v1.16b
271 vst1.32 {v0.16b},[$inp],x4
272 vst1.32 {v1.16b},[$out],#16
273 cmp $inp,$out
274 b.hi .Loop_imc
275
276 vld1.32 {v0.16b},[$out]
277 aesimc v0.16b,v0.16b
278 vst1.32 {v0.16b},[$inp]
279
280 eor x0,x0,x0 // return value
281 .Ldec_key_abort:
282 ___
283 $code.=<<___ if ($flavour !~ /64/);
284 ldmia sp!,{r4,pc}
285 ___
286 $code.=<<___ if ($flavour =~ /64/);
287 ldp x29,x30,[sp],#16
288 ret
289 ___
290 $code.=<<___;
291 .size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
292 ___
293 }}}
294 {{{
295 sub gen_block () {
296 my $dir = shift;
297 my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
298 my ($inp,$out,$key)=map("x$_",(0..2));
299 my $rounds="w3";
300 my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
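	# Each generated routine below runs ($rounds-2)/2 iterations of
	# the paired aes$e/aes$mc loop, then finishes the last two rounds
	# outside it: the final AES round has no MixColumns, and the last
	# round key is applied with a plain veor.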
301
302 $code.=<<___;
303 .globl ${prefix}_${dir}crypt
304 .type ${prefix}_${dir}crypt,%function
305 .align 5
306 ${prefix}_${dir}crypt:
307 ldr $rounds,[$key,#240]
308 vld1.32 {$rndkey0},[$key],#16
309 vld1.8 {$inout},[$inp]
310 sub $rounds,$rounds,#2
311 vld1.32 {$rndkey1},[$key],#16
312
313 .Loop_${dir}c:
314 aes$e $inout,$rndkey0
315 aes$mc $inout,$inout
316 vld1.32 {$rndkey0},[$key],#16
317 subs $rounds,$rounds,#2
318 aes$e $inout,$rndkey1
319 aes$mc $inout,$inout
320 vld1.32 {$rndkey1},[$key],#16
321 b.gt .Loop_${dir}c
322
323 aes$e $inout,$rndkey0
324 aes$mc $inout,$inout
325 vld1.32 {$rndkey0},[$key]
326 aes$e $inout,$rndkey1
327 veor $inout,$inout,$rndkey0
328
329 vst1.8 {$inout},[$out]
330 ret
331 .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
332 ___
333 }
334 &gen_block("en");
335 &gen_block("de");
336 }}}
337 {{{
338 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
339 my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
340 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
341
342 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
343 my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
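# In the encrypt path, round keys 4-7 are re-fetched through these
# pointers inside the loop instead of being kept live, which holds the
# working set within the 16 registers the 32-bit flavour is limited to.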
344
345 ### q8-q15 preloaded key schedule
346
347 $code.=<<___;
348 .globl ${prefix}_cbc_encrypt
349 .type ${prefix}_cbc_encrypt,%function
350 .align 5
351 ${prefix}_cbc_encrypt:
352 ___
353 $code.=<<___ if ($flavour =~ /64/);
354 stp x29,x30,[sp,#-16]!
355 add x29,sp,#0
356 ___
357 $code.=<<___ if ($flavour !~ /64/);
358 mov ip,sp
359 stmdb sp!,{r4-r8,lr}
360 vstmdb sp!,{d8-d15} @ ABI specification says so
361 ldmia ip,{r4-r5} @ load remaining args
362 ___
363 $code.=<<___;
364 subs $len,$len,#16
365 mov $step,#16
366 b.lo .Lcbc_abort
367 cclr $step,eq
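	// cclr is a private mnemonic; the post-processing at the bottom
	// of this file rewrites it as a csel against the zero register
	// on 64-bit and as a conditional mov #0 on 32-bit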
368
369 cmp $enc,#0 // en- or decrypting?
370 ldr $rounds,[$key,#240]
371 and $len,$len,#-16
372 vld1.8 {$ivec},[$ivp]
373 vld1.8 {$dat},[$inp],$step
374
375 vld1.32 {q8-q9},[$key] // load key schedule...
376 sub $rounds,$rounds,#6
377 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
378 sub $rounds,$rounds,#2
379 vld1.32 {q10-q11},[$key_],#32
380 vld1.32 {q12-q13},[$key_],#32
381 vld1.32 {q14-q15},[$key_],#32
382 vld1.32 {$rndlast},[$key_]
383
384 add $key_,$key,#32
385 mov $cnt,$rounds
386 b.eq .Lcbc_dec
387
388 cmp $rounds,#2
389 veor $dat,$dat,$ivec
390 veor $rndzero_n_last,q8,$rndlast
391 b.eq .Lcbc_enc128
392
393 vld1.32 {$in0-$in1},[$key_]
394 add $key_,$key,#16
395 add $key4,$key,#16*4
396 add $key5,$key,#16*5
397 aese $dat,q8
398 aesmc $dat,$dat
399 add $key6,$key,#16*6
400 add $key7,$key,#16*7
401 b .Lenter_cbc_enc
402
403 .align 4
404 .Loop_cbc_enc:
405 aese $dat,q8
406 aesmc $dat,$dat
407 vst1.8 {$ivec},[$out],#16
408 .Lenter_cbc_enc:
409 aese $dat,q9
410 aesmc $dat,$dat
411 aese $dat,$in0
412 aesmc $dat,$dat
413 vld1.32 {q8},[$key4]
414 cmp $rounds,#4
415 aese $dat,$in1
416 aesmc $dat,$dat
417 vld1.32 {q9},[$key5]
418 b.eq .Lcbc_enc192
419
420 aese $dat,q8
421 aesmc $dat,$dat
422 vld1.32 {q8},[$key6]
423 aese $dat,q9
424 aesmc $dat,$dat
425 vld1.32 {q9},[$key7]
426 nop
427
428 .Lcbc_enc192:
429 aese $dat,q8
430 aesmc $dat,$dat
431 subs $len,$len,#16
432 aese $dat,q9
433 aesmc $dat,$dat
434 cclr $step,eq
435 aese $dat,q10
436 aesmc $dat,$dat
437 aese $dat,q11
438 aesmc $dat,$dat
439 vld1.8 {q8},[$inp],$step
440 aese $dat,q12
441 aesmc $dat,$dat
442 veor q8,q8,$rndzero_n_last
443 aese $dat,q13
444 aesmc $dat,$dat
445 vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
446 aese $dat,q14
447 aesmc $dat,$dat
448 aese $dat,q15
449 veor $ivec,$dat,$rndlast
450 b.hs .Loop_cbc_enc
451
452 vst1.8 {$ivec},[$out],#16
453 b .Lcbc_done
454
455 .align 5
456 .Lcbc_enc128:
457 vld1.32 {$in0-$in1},[$key_]
458 aese $dat,q8
459 aesmc $dat,$dat
460 b .Lenter_cbc_enc128
461 .Loop_cbc_enc128:
462 aese $dat,q8
463 aesmc $dat,$dat
464 vst1.8 {$ivec},[$out],#16
465 .Lenter_cbc_enc128:
466 aese $dat,q9
467 aesmc $dat,$dat
468 subs $len,$len,#16
469 aese $dat,$in0
470 aesmc $dat,$dat
471 cclr $step,eq
472 aese $dat,$in1
473 aesmc $dat,$dat
474 aese $dat,q10
475 aesmc $dat,$dat
476 aese $dat,q11
477 aesmc $dat,$dat
478 vld1.8 {q8},[$inp],$step
479 aese $dat,q12
480 aesmc $dat,$dat
481 aese $dat,q13
482 aesmc $dat,$dat
483 aese $dat,q14
484 aesmc $dat,$dat
485 veor q8,q8,$rndzero_n_last
486 aese $dat,q15
487 veor $ivec,$dat,$rndlast
488 b.hs .Loop_cbc_enc128
489
490 vst1.8 {$ivec},[$out],#16
491 b .Lcbc_done
492 ___
493 {
494 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
495 $code.=<<___;
496 .align 5
497 .Lcbc_dec:
498 vld1.8 {$dat2},[$inp],#16
499 subs $len,$len,#32 // bias
500 add $cnt,$rounds,#2
501 vorr $in1,$dat,$dat
502 vorr $dat1,$dat,$dat
503 vorr $in2,$dat2,$dat2
504 b.lo .Lcbc_dec_tail
505
506 vorr $dat1,$dat2,$dat2
507 vld1.8 {$dat2},[$inp],#16
508 vorr $in0,$dat,$dat
509 vorr $in1,$dat1,$dat1
510 vorr $in2,$dat2,$dat2
511
512 .Loop3x_cbc_dec:
513 aesd $dat0,q8
514 aesimc $dat0,$dat0
515 aesd $dat1,q8
516 aesimc $dat1,$dat1
517 aesd $dat2,q8
518 aesimc $dat2,$dat2
519 vld1.32 {q8},[$key_],#16
520 subs $cnt,$cnt,#2
521 aesd $dat0,q9
522 aesimc $dat0,$dat0
523 aesd $dat1,q9
524 aesimc $dat1,$dat1
525 aesd $dat2,q9
526 aesimc $dat2,$dat2
527 vld1.32 {q9},[$key_],#16
528 b.gt .Loop3x_cbc_dec
529
530 aesd $dat0,q8
531 aesimc $dat0,$dat0
532 aesd $dat1,q8
533 aesimc $dat1,$dat1
534 aesd $dat2,q8
535 aesimc $dat2,$dat2
536 veor $tmp0,$ivec,$rndlast
537 subs $len,$len,#0x30
538 veor $tmp1,$in0,$rndlast
539 mov.lo x6,$len // x6, $cnt, is zero at this point
540 aesd $dat0,q9
541 aesimc $dat0,$dat0
542 aesd $dat1,q9
543 aesimc $dat1,$dat1
544 aesd $dat2,q9
545 aesimc $dat2,$dat2
546 veor $tmp2,$in1,$rndlast
547 add $inp,$inp,x6 // $inp is adjusted in such a way that
548 // at exit from the loop $dat1-$dat2
549 // are loaded with the last "words"
550 vorr $ivec,$in2,$in2
551 mov $key_,$key
552 aesd $dat0,q12
553 aesimc $dat0,$dat0
554 aesd $dat1,q12
555 aesimc $dat1,$dat1
556 aesd $dat2,q12
557 aesimc $dat2,$dat2
558 vld1.8 {$in0},[$inp],#16
559 aesd $dat0,q13
560 aesimc $dat0,$dat0
561 aesd $dat1,q13
562 aesimc $dat1,$dat1
563 aesd $dat2,q13
564 aesimc $dat2,$dat2
565 vld1.8 {$in1},[$inp],#16
566 aesd $dat0,q14
567 aesimc $dat0,$dat0
568 aesd $dat1,q14
569 aesimc $dat1,$dat1
570 aesd $dat2,q14
571 aesimc $dat2,$dat2
572 vld1.8 {$in2},[$inp],#16
573 aesd $dat0,q15
574 aesd $dat1,q15
575 aesd $dat2,q15
576 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
577 add $cnt,$rounds,#2
578 veor $tmp0,$tmp0,$dat0
579 veor $tmp1,$tmp1,$dat1
580 veor $dat2,$dat2,$tmp2
581 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
582 vst1.8 {$tmp0},[$out],#16
583 vorr $dat0,$in0,$in0
584 vst1.8 {$tmp1},[$out],#16
585 vorr $dat1,$in1,$in1
586 vst1.8 {$dat2},[$out],#16
587 vorr $dat2,$in2,$in2
588 b.hs .Loop3x_cbc_dec
589
590 cmn $len,#0x30
591 b.eq .Lcbc_done
592 nop
593
594 .Lcbc_dec_tail:
595 aesd $dat1,q8
596 aesimc $dat1,$dat1
597 aesd $dat2,q8
598 aesimc $dat2,$dat2
599 vld1.32 {q8},[$key_],#16
600 subs $cnt,$cnt,#2
601 aesd $dat1,q9
602 aesimc $dat1,$dat1
603 aesd $dat2,q9
604 aesimc $dat2,$dat2
605 vld1.32 {q9},[$key_],#16
606 b.gt .Lcbc_dec_tail
607
608 aesd $dat1,q8
609 aesimc $dat1,$dat1
610 aesd $dat2,q8
611 aesimc $dat2,$dat2
612 aesd $dat1,q9
613 aesimc $dat1,$dat1
614 aesd $dat2,q9
615 aesimc $dat2,$dat2
616 aesd $dat1,q12
617 aesimc $dat1,$dat1
618 aesd $dat2,q12
619 aesimc $dat2,$dat2
620 cmn $len,#0x20
621 aesd $dat1,q13
622 aesimc $dat1,$dat1
623 aesd $dat2,q13
624 aesimc $dat2,$dat2
625 veor $tmp1,$ivec,$rndlast
626 aesd $dat1,q14
627 aesimc $dat1,$dat1
628 aesd $dat2,q14
629 aesimc $dat2,$dat2
630 veor $tmp2,$in1,$rndlast
631 aesd $dat1,q15
632 aesd $dat2,q15
633 b.eq .Lcbc_dec_one
634 veor $tmp1,$tmp1,$dat1
635 veor $tmp2,$tmp2,$dat2
636 vorr $ivec,$in2,$in2
637 vst1.8 {$tmp1},[$out],#16
638 vst1.8 {$tmp2},[$out],#16
639 b .Lcbc_done
640
641 .Lcbc_dec_one:
642 veor $tmp1,$tmp1,$dat2
643 vorr $ivec,$in2,$in2
644 vst1.8 {$tmp1},[$out],#16
645
646 .Lcbc_done:
647 vst1.8 {$ivec},[$ivp]
648 .Lcbc_abort:
649 ___
650 }
651 $code.=<<___ if ($flavour !~ /64/);
652 vldmia sp!,{d8-d15}
653 ldmia sp!,{r4-r8,pc}
654 ___
655 $code.=<<___ if ($flavour =~ /64/);
656 ldr x29,[sp],#16
657 ret
658 ___
659 $code.=<<___;
660 .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
661 ___
662 }}}
663 {{{
664 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
665 my ($rounds,$cnt,$key_)=("w5","w6","x7");
666 my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
667 my $step="x12"; # aliases with $tctr2
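# The counter is the big-endian 32-bit word in the last four bytes of
# the IV; it is kept in a scalar register, incremented and
# byte-reversed (on little-endian) per block, and inserted into lane 3
# of each data register, three blocks per loop iteration.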
668
669 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
670 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
671
672 my ($dat,$tmp)=($dat0,$tmp0);
673
674 ### q8-q15 preloaded key schedule
675
676 $code.=<<___;
677 .globl ${prefix}_ctr32_encrypt_blocks
678 .type ${prefix}_ctr32_encrypt_blocks,%function
679 .align 5
680 ${prefix}_ctr32_encrypt_blocks:
681 ___
682 $code.=<<___ if ($flavour =~ /64/);
683 stp x29,x30,[sp,#-16]!
684 add x29,sp,#0
685 ___
686 $code.=<<___ if ($flavour !~ /64/);
687 mov ip,sp
688 stmdb sp!,{r4-r10,lr}
689 vstmdb sp!,{d8-d15} @ ABI specification says so
690 ldr r4, [ip] @ load remaining arg
691 ___
692 $code.=<<___;
693 ldr $rounds,[$key,#240]
694
695 ldr $ctr, [$ivp, #12]
696 vld1.32 {$dat0},[$ivp]
697
698 vld1.32 {q8-q9},[$key] // load key schedule...
699 sub $rounds,$rounds,#4
700 mov $step,#16
701 cmp $len,#2
702 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
703 sub $rounds,$rounds,#2
704 vld1.32 {q12-q13},[$key_],#32
705 vld1.32 {q14-q15},[$key_],#32
706 vld1.32 {$rndlast},[$key_]
707 add $key_,$key,#32
708 mov $cnt,$rounds
709 cclr $step,lo
710 #ifndef __ARMEB__
711 rev $ctr, $ctr
712 #endif
713 vorr $dat1,$dat0,$dat0
714 add $tctr1, $ctr, #1
715 vorr $dat2,$dat0,$dat0
716 add $ctr, $ctr, #2
717 vorr $ivec,$dat0,$dat0
718 rev $tctr1, $tctr1
719 vmov.32 ${dat1}[3],$tctr1
720 b.ls .Lctr32_tail
721 rev $tctr2, $ctr
722 sub $len,$len,#3 // bias
723 vmov.32 ${dat2}[3],$tctr2
724 b .Loop3x_ctr32
725
726 .align 4
727 .Loop3x_ctr32:
728 aese $dat0,q8
729 aesmc $dat0,$dat0
730 aese $dat1,q8
731 aesmc $dat1,$dat1
732 aese $dat2,q8
733 aesmc $dat2,$dat2
734 vld1.32 {q8},[$key_],#16
735 subs $cnt,$cnt,#2
736 aese $dat0,q9
737 aesmc $dat0,$dat0
738 aese $dat1,q9
739 aesmc $dat1,$dat1
740 aese $dat2,q9
741 aesmc $dat2,$dat2
742 vld1.32 {q9},[$key_],#16
743 b.gt .Loop3x_ctr32
744
745 aese $dat0,q8
746 aesmc $tmp0,$dat0
747 aese $dat1,q8
748 aesmc $tmp1,$dat1
749 vld1.8 {$in0},[$inp],#16
750 vorr $dat0,$ivec,$ivec
751 aese $dat2,q8
752 aesmc $dat2,$dat2
753 vld1.8 {$in1},[$inp],#16
754 vorr $dat1,$ivec,$ivec
755 aese $tmp0,q9
756 aesmc $tmp0,$tmp0
757 aese $tmp1,q9
758 aesmc $tmp1,$tmp1
759 vld1.8 {$in2},[$inp],#16
760 mov $key_,$key
761 aese $dat2,q9
762 aesmc $tmp2,$dat2
763 vorr $dat2,$ivec,$ivec
764 add $tctr0,$ctr,#1
765 aese $tmp0,q12
766 aesmc $tmp0,$tmp0
767 aese $tmp1,q12
768 aesmc $tmp1,$tmp1
769 veor $in0,$in0,$rndlast
770 add $tctr1,$ctr,#2
771 aese $tmp2,q12
772 aesmc $tmp2,$tmp2
773 veor $in1,$in1,$rndlast
774 add $ctr,$ctr,#3
775 aese $tmp0,q13
776 aesmc $tmp0,$tmp0
777 aese $tmp1,q13
778 aesmc $tmp1,$tmp1
779 veor $in2,$in2,$rndlast
780 rev $tctr0,$tctr0
781 aese $tmp2,q13
782 aesmc $tmp2,$tmp2
783 vmov.32 ${dat0}[3], $tctr0
784 rev $tctr1,$tctr1
785 aese $tmp0,q14
786 aesmc $tmp0,$tmp0
787 aese $tmp1,q14
788 aesmc $tmp1,$tmp1
789 vmov.32 ${dat1}[3], $tctr1
790 rev $tctr2,$ctr
791 aese $tmp2,q14
792 aesmc $tmp2,$tmp2
793 vmov.32 ${dat2}[3], $tctr2
794 subs $len,$len,#3
795 aese $tmp0,q15
796 aese $tmp1,q15
797 aese $tmp2,q15
798
799 veor $in0,$in0,$tmp0
800 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
801 vst1.8 {$in0},[$out],#16
802 veor $in1,$in1,$tmp1
803 mov $cnt,$rounds
804 vst1.8 {$in1},[$out],#16
805 veor $in2,$in2,$tmp2
806 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
807 vst1.8 {$in2},[$out],#16
808 b.hs .Loop3x_ctr32
809
810 adds $len,$len,#3
811 b.eq .Lctr32_done
812 cmp $len,#1
813 mov $step,#16
814 cclr $step,eq
815
816 .Lctr32_tail:
817 aese $dat0,q8
818 aesmc $dat0,$dat0
819 aese $dat1,q8
820 aesmc $dat1,$dat1
821 vld1.32 {q8},[$key_],#16
822 subs $cnt,$cnt,#2
823 aese $dat0,q9
824 aesmc $dat0,$dat0
825 aese $dat1,q9
826 aesmc $dat1,$dat1
827 vld1.32 {q9},[$key_],#16
828 b.gt .Lctr32_tail
829
830 aese $dat0,q8
831 aesmc $dat0,$dat0
832 aese $dat1,q8
833 aesmc $dat1,$dat1
834 aese $dat0,q9
835 aesmc $dat0,$dat0
836 aese $dat1,q9
837 aesmc $dat1,$dat1
838 vld1.8 {$in0},[$inp],$step
839 aese $dat0,q12
840 aesmc $dat0,$dat0
841 aese $dat1,q12
842 aesmc $dat1,$dat1
843 vld1.8 {$in1},[$inp]
844 aese $dat0,q13
845 aesmc $dat0,$dat0
846 aese $dat1,q13
847 aesmc $dat1,$dat1
848 veor $in0,$in0,$rndlast
849 aese $dat0,q14
850 aesmc $dat0,$dat0
851 aese $dat1,q14
852 aesmc $dat1,$dat1
853 veor $in1,$in1,$rndlast
854 aese $dat0,q15
855 aese $dat1,q15
856
857 cmp $len,#1
858 veor $in0,$in0,$dat0
859 veor $in1,$in1,$dat1
860 vst1.8 {$in0},[$out],#16
861 b.eq .Lctr32_done
862 vst1.8 {$in1},[$out]
863
864 .Lctr32_done:
865 ___
866 $code.=<<___ if ($flavour !~ /64/);
867 vldmia sp!,{d8-d15}
868 ldmia sp!,{r4-r10,pc}
869 ___
870 $code.=<<___ if ($flavour =~ /64/);
871 ldr x29,[sp],#16
872 ret
873 ___
874 $code.=<<___;
875 .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
876 ___
877 }}}
878 $code.=<<___;
879 #endif
880 ___
881 ########################################
882 if ($flavour =~ /64/) { ######## 64-bit code
883 my %opcode = (
884 "aesd" => 0x4e285800, "aese" => 0x4e284800,
885 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
886
887 local *unaes = sub {
888 my ($mnemonic,$arg)=@_;
889
890 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
891 sprintf ".inst\t0x%08x\t//%s %s",
892 $opcode{$mnemonic}|$1|($2<<5),
893 $mnemonic,$arg;
894 };
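	# e.g. unaes("aese","v0.16b,v1.16b") yields
	# ".inst 0x4e284820" (0x4e284800|0|(1<<5)); it is unused below
	# because .arch armv8-a+crypto lets the assembler do this itself.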
895
896 foreach(split("\n",$code)) {
897 s/\`([^\`]*)\`/eval($1)/geo;
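		# expands the `...` fragments embedded in $code, e.g. the
		# conditional "ldr x29,[sp],#16" in set_encrypt_key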
898
899 s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
900 s/@\s/\/\//o; # old->new style commentary
901
902 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
903 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
904 s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
905 s/vmov\.i8/movi/o or # fix up legacy mnemonics
906 s/vext\.8/ext/o or
907 s/vrev32\.8/rev32/o or
908 s/vtst\.8/cmtst/o or
909 s/vshr/ushr/o or
910 s/^(\s+)v/$1/o or # strip off v prefix
911 s/\bbx\s+lr\b/ret/o;
912
913 # fix up remaining legacy suffixes
914 s/\.[ui]?8//o;
915 m/\],#8/o and s/\.16b/\.8b/go;
916 s/\.[ui]?32//o and s/\.16b/\.4s/go;
917 s/\.[ui]?64//o and s/\.16b/\.2d/go;
918 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
919
920 print $_,"\n";
921 }
922 } else { ######## 32-bit code
923 my %opcode = (
924 "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
925 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
926
927 local *unaes = sub {
928 my ($mnemonic,$arg)=@_;
929
930 if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
931 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
932 |(($2&7)<<1) |(($2&8)<<2);
933 # ARMv7 instructions are always encoded little-endian, hence the
934 # byte-at-a-time emission below; the correct solution is the .inst
935 # directive, but older assemblers don't implement it:-(
936 sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
937 $word&0xff,($word>>8)&0xff,
938 ($word>>16)&0xff,($word>>24)&0xff,
939 $mnemonic,$arg;
940 }
941 };
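	# e.g. unaes("aese","q0,q1") yields 0xf3b00302, emitted LSB first
	# as ".byte 0x02,0x03,0xb0,0xf3".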
942
943 sub unvtbl {
944 my $arg=shift;
945
946 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
947 sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
948 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
949 }
950
951 sub unvdup32 {
952 my $arg=shift;
953
954 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
955 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
956 }
957
958 sub unvmov32 {
959 my $arg=shift;
960
961 $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
962 sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
963 }
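# These three split 128-bit q-register forms into the d-register forms
# the 32-bit ISA actually has: "vtbl.8 q6,{q0},q2" becomes the pair
# "vtbl.8 d12,{q0},d4" and "vtbl.8 d13,{q0},d5", while
# "vdup.32 q1,q0[3]" becomes "vdup.32 q1,d1[1]".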
964
965 foreach(split("\n",$code)) {
966 s/\`([^\`]*)\`/eval($1)/geo;
967
968 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
969 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
970 s/\/\/\s?/@ /o; # new->old style commentary
971
972 # fix up remaining new-style suffixes
973 s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
974 s/\],#[0-9]+/]!/o;
975
976 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
977 s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
978 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
979 s/vdup\.32\s+(.*)/unvdup32($1)/geo or
980 s/vmov\.32\s+(.*)/unvmov32($1)/geo or
981 s/^(\s+)b\./$1b/o or
982 s/^(\s+)mov\./$1mov/o or
983 s/^(\s+)ret/$1bx\tlr/o;
984
985 print $_,"\n";
986 }
987 }
988
989 close STDOUT;