#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2011
#
# The module implements GCM GHASH function and underlying single
# multiplication operation in GF(2^128). Even though subroutines
# have _4bit suffix, they are not using any tables, but rely on
# hardware Galois Field Multiply support. Streamed GHASH processes
# a byte in ~7 cycles, which is >6x faster than "4-bit" table-driven
# code compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are
# comparing apples vs. oranges, but compiler surely could have done
# better, because theoretical [though not necessarily achievable]
# estimate for "4-bit" table-driven implementation is ~12 cycles.

# The last command-line argument, if any, names the output file and STDOUT
# is redirected to it.  The three-argument form of open keeps mode
# characters in the file name from being interpreted, and a failed open is
# fatal instead of silently leaving the generated code on the caller's
# stdout (consistent with the checked close at the end of the script).
$output = pop;
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}

# Everything below binds symbolic names to register names; the names are
# interpolated into the assembly templates that follow.

($Xip,$Htable,$inp,$len)=("A4","B4","A6","B6");   # arguments

# Working registers: A16..A27 on the A side, B16..B27 on the B side.
($Z0,$Z1,$Z2,$Z3,   $H0, $H1, $H2, $H3,
                              $H0x,$H1x,$H2x,$H3x)=map("A$_",(16..27));
($H01u,$H01y,$H2u,$H3u,       $H0y,$H1y,$H2y,$H3y,
                              $H0z,$H1z,$H2z,$H3z)=map("B$_",(16..27));
($FF000000,$E10000)=("B30","B31");      # named after the constants they hold
($xip,$x0,$x1,$xib)=map("B$_",(6..9));  # $xip zaps $len (both are B6)
 $xia="A9";
($rem,$res)=("B4","B5");                # $rem zaps $Htable (both are B4)

# First assembly template: assembler directives, both entry points and the
# header of the shared software-pipelined loop.
#
#  * _gcm_gmult_4bit loads H from ${Htable}[-1]/[-2], fetches Xi[15] and
#    Xi[14], clears the Z accumulator, sets B0 to 1 ("take a single spin")
#    and branches to ghash_loop?.
#  * _gcm_ghash_4bit sets B0 to len>>4, pre-loads 16 input bytes, XORs them
#    into Xi (all input handling is predicated on B0), stores Xi back and
#    falls through into ghash_loop?.
#  * ghash_loop? starts an SPLOOPD with ILC taken from B1 (15).
#
# The text between the heredoc markers is emitted verbatim apart from
# variable interpolation, so no annotations are added inside it.
$code.=<<___;
          .text

          .if       .ASSEMBLER_VERSION<7000000
          .asg      0,__TI_EABI__
          .endif
          .if       __TI_EABI__
          .asg      gcm_gmult_1bit,_gcm_gmult_1bit
          .asg      gcm_gmult_4bit,_gcm_gmult_4bit
          .asg      gcm_ghash_4bit,_gcm_ghash_4bit
          .endif

          .asg      B3,RA

          .if       0
          .global   _gcm_gmult_1bit
_gcm_gmult_1bit:
          ADDAD     $Htable,2,$Htable
          .endif
          .global   _gcm_gmult_4bit
_gcm_gmult_4bit:
          .asmfunc
          LDDW      *${Htable}[-1],$H1:$H0        ; H.lo
          LDDW      *${Htable}[-2],$H3:$H2        ; H.hi
||        MV        $Xip,${xip}                   ; reassign Xi
||        MVK       15,B1                         ; SPLOOPD constant

          MVK       0xE1,$E10000
||        LDBU      *++${xip}[15],$x1   ; Xi[15]
          MVK       0xFF,$FF000000
||        LDBU      *--${xip},$x0                 ; Xi[14]
          SHL       $E10000,16,$E10000  ; [pre-shifted] reduction polynomial
          SHL       $FF000000,24,$FF000000        ; upper byte mask
||        BNOP      ghash_loop?
||        MVK       1,B0                          ; take a single spin

          PACKH2    $H0,$H1,$xia                  ; pack H0' and H1's upper bytes
          AND       $H2,$FF000000,$H2u  ; H2's upper byte
          AND       $H3,$FF000000,$H3u  ; H3's upper byte
||        SHRU      $H2u,8,$H2u
          SHRU      $H3u,8,$H3u
||        ZERO      $Z1:$Z0
          SHRU2     $xia,8,$H01u
||        ZERO      $Z3:$Z2
          .endasmfunc

          .global   _gcm_ghash_4bit
_gcm_ghash_4bit:
          .asmfunc
          LDDW      *${Htable}[-1],$H1:$H0        ; H.lo
||        SHRU      $len,4,B0           ; reassign len
          LDDW      *${Htable}[-2],$H3:$H2        ; H.hi
||        MV        $Xip,${xip}                   ; reassign Xi
||        MVK       15,B1                         ; SPLOOPD constant

          MVK       0xE1,$E10000
|| [B0]   LDNDW     *${inp}[1],$H1x:$H0x
          MVK       0xFF,$FF000000
|| [B0]   LDNDW     *${inp}++[2],$H3x:$H2x
          SHL       $E10000,16,$E10000  ; [pre-shifted] reduction polynomial
||        LDDW      *${xip}[1],$Z1:$Z0
          SHL       $FF000000,24,$FF000000        ; upper byte mask
||        LDDW      *${xip}[0],$Z3:$Z2

          PACKH2    $H0,$H1,$xia                  ; pack H0' and H1's upper bytes
          AND       $H2,$FF000000,$H2u  ; H2's upper byte
          AND       $H3,$FF000000,$H3u  ; H3's upper byte
||        SHRU      $H2u,8,$H2u
          SHRU      $H3u,8,$H3u
          SHRU2     $xia,8,$H01u

|| [B0]   XOR       $H0x,$Z0,$Z0                  ; Xi^=inp
|| [B0]   XOR       $H1x,$Z1,$Z1
          .if       .LITTLE_ENDIAN
   [B0]   XOR       $H2x,$Z2,$Z2
|| [B0]   XOR       $H3x,$Z3,$Z3
|| [B0]   SHRU      $Z1,24,$xia                   ; Xi[15], avoid cross-path stall
          STDW      $Z1:$Z0,*${xip}[1]
|| [B0]   SHRU      $Z1,16,$x0                    ; Xi[14]
|| [B0]   ZERO      $Z1:$Z0
          .else
   [B0]   XOR       $H2x,$Z2,$Z2
|| [B0]   XOR       $H3x,$Z3,$Z3
|| [B0]   MV        $Z0,$xia            ; Xi[15], avoid cross-path stall
          STDW      $Z1:$Z0,*${xip}[1]
|| [B0] SHRU        $Z0,8,$x0           ; Xi[14]
|| [B0]   ZERO      $Z1:$Z0
          .endif
          STDW      $Z3:$Z2,*${xip}[0]
|| [B0]   ZERO      $Z3:$Z2
|| [B0]   MV        $xia,$x1
   [B0]   ADDK      14,${xip}

ghash_loop?:
          SPLOOPD   6                             ; 6*16+7
||        MVC       B1,ILC
|| [B0]   SUB       B0,1,B0
||        ZERO      A0
||        ADD       $x1,$x1,$xib                  ; SHL     $x1,1,$xib
||        SHL       $x1,1,$xia
___

# Schedule of the software-pipelined loop that follows.  Row labels are the
# cycle tags used in the instruction comments of the loop body; a label such
# as 6/0 is one cycle as seen by two overlapping iterations.  Each column
# group lists the functional units one iteration occupies in that cycle.
########____________________________
#  0    D2.     M1          M2      |
#  1            M1                  |
#  2            M1          M2      |
#  3        D1. M1          M2      |
#  4        S1. L1                  |
#  5    S2  S1x L1          D2  L2  |____________________________
#  6/0          L1  S1      L2  S2x |D2.     M1          M2      |
#  7/1          L1  S1  D1x S2  M2  |        M1                  |
#  8/2              S1  L1x S2      |        M1          M2      |
#  9/3              S1  L1x         |    D1. M1          M2      |
# 10/4                  D1x         |    S1. L1                  |
# 11/5                              |S2  S1x L1          D2  L2  |____________
# 12/6/0                D1x       __|        L1  S1      L2  S2x |D2.     ....
#    7/1                                     L1  S1  D1x S2  M2  |        ....
#    8/2                                         S1  L1x S2      |        ....
#####...                                         ................|............
# Second assembly template: the SPLOOPD loop body, the per-block epilogue
# and the identification string.
#
#  * Loop body: XORMPY products of H with the current Xi byte are XORed
#    into Z, Z is shifted right by a byte (SHRMB/SHRU), and the bits shifted
#    out are reduced by an XORMPY with the pre-shifted constant in $E10000.
#  * Epilogue: while B0 is non-zero the next 16 input bytes are pre-fetched
#    and XORed into Z and control branches back to ghash_loop?; once B0 is
#    zero it returns through RA.  Little-endian builds byte-swap Z first.
#    Z is stored back through ${xip} in both cases.
#  * A .cstring banner is placed in the .const section.
#
# The text between the heredoc markers is emitted verbatim apart from
# variable interpolation, so no annotations are added inside it.
$code.=<<___;
          XORMPY    $H0,$xia,$H0x                 ; 0       ; H·(Xi[i]<<1)
||        XORMPY    $H01u,$xib,$H01y
|| [A0]   LDBU      *--${xip},$x0
          XORMPY    $H1,$xia,$H1x                 ; 1
          XORMPY    $H2,$xia,$H2x                 ; 2
||        XORMPY    $H2u,$xib,$H2y
          XORMPY    $H3,$xia,$H3x                 ; 3
||        XORMPY    $H3u,$xib,$H3y
||[!A0]   MVK.D     15,A0                                   ; *--${xip} counter
          XOR.L     $H0x,$Z0,$Z0                  ; 4       ; Z^=H·(Xi[i]<<1)
|| [A0]   SUB.S     A0,1,A0
          XOR.L     $H1x,$Z1,$Z1                  ; 5
||        AND.D     $H01y,$FF000000,$H0z
||        SWAP2.L   $H01y,$H1y                    ;         ; SHL     $H01y,16,$H1y
||        SHL       $x0,1,$xib
||        SHL       $x0,1,$xia

          XOR.L     $H2x,$Z2,$Z2                  ; 6/0     ; [0,0] in epilogue
||        SHL       $Z0,1,$rem                    ;         ; rem=Z<<1
||        SHRMB.S   $Z1,$Z0,$Z0                   ;         ; Z>>=8
||        AND.L     $H1y,$FF000000,$H1z
          XOR.L     $H3x,$Z3,$Z3                  ; 7/1
||        SHRMB.S   $Z2,$Z1,$Z1
||        XOR.D     $H0z,$Z0,$Z0                            ; merge upper byte products
||        AND.S     $H2y,$FF000000,$H2z
||        XORMPY    $E10000,$rem,$res   ;         ; implicit rem&0x1FE
          XOR.L     $H1z,$Z1,$Z1                  ; 8/2
||        SHRMB.S   $Z3,$Z2,$Z2
||        AND.S     $H3y,$FF000000,$H3z
          XOR.L     $H2z,$Z2,$Z2                  ; 9/3
||        SHRU      $Z3,8,$Z3
          XOR.D     $H3z,$Z3,$Z3                  ; 10/4
          NOP                                     ; 11/5

          SPKERNEL 0,2
||        XOR.D     $res,$Z3,$Z3                  ; 12/6/0; Z^=res

          ; input pre-fetch is possible where D1 slot is available...
   [B0]   LDNDW     *${inp}[1],$H1x:$H0x          ; 8/-
   [B0]   LDNDW     *${inp}++[2],$H3x:$H2x        ; 9/-
          NOP                                     ; 10/-
          .if       .LITTLE_ENDIAN
          SWAP2     $Z0,$Z1                       ; 11/-
||        SWAP4     $Z1,$Z0
          SWAP4     $Z1,$Z1                       ; 12/-
||        SWAP2     $Z0,$Z0
          SWAP2     $Z2,$Z3
||        SWAP4     $Z3,$Z2
||[!B0]   BNOP      RA
          SWAP4     $Z3,$Z3
||        SWAP2     $Z2,$Z2
|| [B0]   BNOP      ghash_loop?
   [B0]   XOR       $H0x,$Z0,$Z0                  ; Xi^=inp
|| [B0]   XOR       $H1x,$Z1,$Z1
   [B0]   XOR       $H2x,$Z2,$Z2
|| [B0]   XOR       $H3x,$Z3,$Z3
|| [B0]   SHRU      $Z1,24,$xia                   ; Xi[15], avoid cross-path stall
          STDW      $Z1:$Z0,*${xip}[1]
|| [B0]   SHRU      $Z1,16,$x0                    ; Xi[14]
|| [B0]   ZERO      $Z1:$Z0
          .else
  [!B0]   BNOP      RA                            ; 11/-
   [B0]   BNOP      ghash_loop?                   ; 12/-
   [B0]   XOR       $H0x,$Z0,$Z0                  ; Xi^=inp
|| [B0]   XOR       $H1x,$Z1,$Z1
   [B0]   XOR       $H2x,$Z2,$Z2
|| [B0]   XOR       $H3x,$Z3,$Z3
|| [B0]   MV        $Z0,$xia            ; Xi[15], avoid cross-path stall
          STDW      $Z1:$Z0,*${xip}[1]
|| [B0] SHRU        $Z0,8,$x0           ; Xi[14]
|| [B0]   ZERO      $Z1:$Z0
          .endif
          STDW      $Z3:$Z2,*${xip}[0]
|| [B0]   ZERO      $Z3:$Z2
|| [B0]   MV        $xia,$x1
   [B0]   ADDK      14,${xip}
          .endasmfunc

          .sect     .const
          .cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
          .align    4
___

# Write out the accumulated assembly text.  Output is buffered, so a failed
# write may only become visible when the handle is closed; the result of
# close is therefore checked and treated as fatal.
print $code;
close STDOUT or die "error closing STDOUT: $!";