[Midnightbsd-cvs] src [12153] trunk/secure/lib/libcrypto/amd64: update

laffer1 at midnightbsd.org
Sun Jan 20 00:38:15 EST 2019


Revision: 12153
          http://svnweb.midnightbsd.org/src/?rev=12153
Author:   laffer1
Date:     2019-01-20 00:38:15 -0500 (Sun, 20 Jan 2019)
Log Message:
-----------
update: sync the auto-generated OpenSSL amd64 assembly files with FreeBSD stable/11

Modified Paths:
--------------
    trunk/secure/lib/libcrypto/amd64/aes-x86_64.S
    trunk/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S
    trunk/secure/lib/libcrypto/amd64/aesni-x86_64.S
    trunk/secure/lib/libcrypto/amd64/bsaes-x86_64.S
    trunk/secure/lib/libcrypto/amd64/cmll-x86_64.S
    trunk/secure/lib/libcrypto/amd64/ghash-x86_64.S
    trunk/secure/lib/libcrypto/amd64/md5-x86_64.S
    trunk/secure/lib/libcrypto/amd64/rc4-md5-x86_64.S
    trunk/secure/lib/libcrypto/amd64/rc4-x86_64.S
    trunk/secure/lib/libcrypto/amd64/sha1-x86_64.S
    trunk/secure/lib/libcrypto/amd64/sha256-x86_64.S
    trunk/secure/lib/libcrypto/amd64/sha512-x86_64.S
    trunk/secure/lib/libcrypto/amd64/vpaes-x86_64.S
    trunk/secure/lib/libcrypto/amd64/wp-x86_64.S
    trunk/secure/lib/libcrypto/amd64/x86_64-gf2m.S
    trunk/secure/lib/libcrypto/amd64/x86_64-mont.S
    trunk/secure/lib/libcrypto/amd64/x86_64-mont5.S
    trunk/secure/lib/libcrypto/amd64/x86_64cpuid.S

Modified: trunk/secure/lib/libcrypto/amd64/aes-x86_64.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/aes-x86_64.S	2019-01-20 05:38:02 UTC (rev 12152)
+++ trunk/secure/lib/libcrypto/amd64/aes-x86_64.S	2019-01-20 05:38:15 UTC (rev 12153)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/amd64/aes-x86_64.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from aes-x86_64.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/aes-x86_64.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from aes-x86_64.pl. */
 .text	
 .type	_x86_64_AES_encrypt,@function
 .align	16
@@ -84,8 +84,8 @@
 	movl	0(%r14,%rdi,8),%edi
 	movl	0(%r14,%rbp,8),%ebp
 
-	andl	$65280,%edi
-	andl	$65280,%ebp
+	andl	$0x0000ff00,%edi
+	andl	$0x0000ff00,%ebp
 
 	xorl	%edi,%r10d
 	xorl	%ebp,%r11d
@@ -97,8 +97,8 @@
 	movl	0(%r14,%rsi,8),%esi
 	movl	0(%r14,%rdi,8),%edi
 
-	andl	$65280,%esi
-	andl	$65280,%edi
+	andl	$0x0000ff00,%esi
+	andl	$0x0000ff00,%edi
 	shrl	$16,%ebx
 	xorl	%esi,%r12d
 	xorl	%edi,%r8d
@@ -111,9 +111,9 @@
 	movl	0(%r14,%rdi,8),%edi
 	movl	0(%r14,%rbp,8),%ebp
 
-	andl	$16711680,%esi
-	andl	$16711680,%edi
-	andl	$16711680,%ebp
+	andl	$0x00ff0000,%esi
+	andl	$0x00ff0000,%edi
+	andl	$0x00ff0000,%ebp
 
 	xorl	%esi,%r10d
 	xorl	%edi,%r11d
@@ -126,9 +126,9 @@
 	movl	2(%r14,%rdi,8),%edi
 	movl	2(%r14,%rbp,8),%ebp
 
-	andl	$16711680,%esi
-	andl	$4278190080,%edi
-	andl	$4278190080,%ebp
+	andl	$0x00ff0000,%esi
+	andl	$0xff000000,%edi
+	andl	$0xff000000,%ebp
 
 	xorl	%esi,%r8d
 	xorl	%edi,%r10d
@@ -141,8 +141,8 @@
 	movl	2(%r14,%rdi,8),%edi
 	movl	16+0(%r15),%eax
 
-	andl	$4278190080,%esi
-	andl	$4278190080,%edi
+	andl	$0xff000000,%esi
+	andl	$0xff000000,%edi
 
 	xorl	%esi,%r12d
 	xorl	%edi,%r8d
@@ -153,7 +153,7 @@
 	xorl	%r11d,%ebx
 	xorl	%r12d,%ecx
 	xorl	%r8d,%edx
-.byte	0xf3,0xc3			
+.byte	0xf3,0xc3
 .size	_x86_64_AES_encrypt,.-_x86_64_AES_encrypt
 .type	_x86_64_AES_encrypt_compact,@function
 .align	16
@@ -178,68 +178,66 @@
 	movzbl	%al,%r10d
 	movzbl	%bl,%r11d
 	movzbl	%cl,%r12d
+	movzbl	%dl,%r8d
+	movzbl	%bh,%esi
+	movzbl	%ch,%edi
+	shrl	$16,%ecx
+	movzbl	%dh,%ebp
 	movzbl	(%r14,%r10,1),%r10d
 	movzbl	(%r14,%r11,1),%r11d
 	movzbl	(%r14,%r12,1),%r12d
+	movzbl	(%r14,%r8,1),%r8d
 
-	movzbl	%dl,%r8d
-	movzbl	%bh,%esi
-	movzbl	%ch,%edi
-	movzbl	(%r14,%r8,1),%r8d
 	movzbl	(%r14,%rsi,1),%r9d
+	movzbl	%ah,%esi
 	movzbl	(%r14,%rdi,1),%r13d
-
-	movzbl	%dh,%ebp
-	movzbl	%ah,%esi
-	shrl	$16,%ecx
+	movzbl	%cl,%edi
 	movzbl	(%r14,%rbp,1),%ebp
 	movzbl	(%r14,%rsi,1),%esi
-	shrl	$16,%edx
 
-	movzbl	%cl,%edi
 	shll	$8,%r9d
+	shrl	$16,%edx
 	shll	$8,%r13d
-	movzbl	(%r14,%rdi,1),%edi
 	xorl	%r9d,%r10d
-	xorl	%r13d,%r11d
-
+	shrl	$16,%eax
 	movzbl	%dl,%r9d
-	shrl	$16,%eax
 	shrl	$16,%ebx
+	xorl	%r13d,%r11d
+	shll	$8,%ebp
 	movzbl	%al,%r13d
-	shll	$8,%ebp
-	shll	$8,%esi
-	movzbl	(%r14,%r9,1),%r9d
-	movzbl	(%r14,%r13,1),%r13d
+	movzbl	(%r14,%rdi,1),%edi
 	xorl	%ebp,%r12d
-	xorl	%esi,%r8d
 
+	shll	$8,%esi
 	movzbl	%bl,%ebp
+	shll	$16,%edi
+	xorl	%esi,%r8d
+	movzbl	(%r14,%r9,1),%r9d
 	movzbl	%dh,%esi
-	shll	$16,%edi
-	movzbl	(%r14,%rbp,1),%ebp
-	movzbl	(%r14,%rsi,1),%esi
+	movzbl	(%r14,%r13,1),%r13d
 	xorl	%edi,%r10d
 
+	shrl	$8,%ecx
 	movzbl	%ah,%edi
-	shrl	$8,%ecx
+	shll	$16,%r9d
 	shrl	$8,%ebx
+	shll	$16,%r13d
+	xorl	%r9d,%r11d
+	movzbl	(%r14,%rbp,1),%ebp
+	movzbl	(%r14,%rsi,1),%esi
 	movzbl	(%r14,%rdi,1),%edi
 	movzbl	(%r14,%rcx,1),%edx
 	movzbl	(%r14,%rbx,1),%ecx
-	shll	$16,%r9d
-	shll	$16,%r13d
+
 	shll	$16,%ebp
-	xorl	%r9d,%r11d
 	xorl	%r13d,%r12d
+	shll	$24,%esi
 	xorl	%ebp,%r8d
-
-	shll	$24,%esi
 	shll	$24,%edi
+	xorl	%esi,%r10d
 	shll	$24,%edx
-	xorl	%esi,%r10d
+	xorl	%edi,%r11d
 	shll	$24,%ecx
-	xorl	%edi,%r11d
 	movl	%r10d,%eax
 	movl	%r11d,%ebx
 	xorl	%r12d,%ecx
@@ -246,12 +244,12 @@
 	xorl	%r8d,%edx
 	cmpq	16(%rsp),%r15
 	je	.Lenc_compact_done
-	movl	%eax,%esi
-	movl	%ebx,%edi
-	andl	$2155905152,%esi
-	andl	$2155905152,%edi
-	movl	%esi,%r10d
-	movl	%edi,%r11d
+	movl	$0x80808080,%r10d
+	movl	$0x80808080,%r11d
+	andl	%eax,%r10d
+	andl	%ebx,%r11d
+	movl	%r10d,%esi
+	movl	%r11d,%edi
 	shrl	$7,%r10d
 	leal	(%rax,%rax,1),%r8d
 	shrl	$7,%r11d
@@ -258,10 +256,10 @@
 	leal	(%rbx,%rbx,1),%r9d
 	subl	%r10d,%esi
 	subl	%r11d,%edi
-	andl	$4278124286,%r8d
-	andl	$4278124286,%r9d
-	andl	$454761243,%esi
-	andl	$454761243,%edi
+	andl	$0xfefefefe,%r8d
+	andl	$0xfefefefe,%r9d
+	andl	$0x1b1b1b1b,%esi
+	andl	$0x1b1b1b1b,%edi
 	movl	%eax,%r10d
 	movl	%ebx,%r11d
 	xorl	%esi,%r8d
@@ -269,25 +267,25 @@
 
 	xorl	%r8d,%eax
 	xorl	%r9d,%ebx
-	movl	%ecx,%esi
-	movl	%edx,%edi
+	movl	$0x80808080,%r12d
 	roll	$24,%eax
+	movl	$0x80808080,%ebp
 	roll	$24,%ebx
-	andl	$2155905152,%esi
-	andl	$2155905152,%edi
+	andl	%ecx,%r12d
+	andl	%edx,%ebp
 	xorl	%r8d,%eax
 	xorl	%r9d,%ebx
-	movl	%esi,%r12d
-	movl	%edi,%ebp
+	movl	%r12d,%esi
 	rorl	$16,%r10d
+	movl	%ebp,%edi
 	rorl	$16,%r11d
+	leal	(%rcx,%rcx,1),%r8d
 	shrl	$7,%r12d
-	leal	(%rcx,%rcx,1),%r8d
 	xorl	%r10d,%eax
+	shrl	$7,%ebp
 	xorl	%r11d,%ebx
-	shrl	$7,%ebp
+	rorl	$8,%r10d
 	leal	(%rdx,%rdx,1),%r9d
-	rorl	$8,%r10d
 	rorl	$8,%r11d
 	subl	%r12d,%esi
 	subl	%ebp,%edi
@@ -294,32 +292,32 @@
 	xorl	%r10d,%eax
 	xorl	%r11d,%ebx
 
-	andl	$4278124286,%r8d
-	andl	$4278124286,%r9d
-	andl	$454761243,%esi
-	andl	$454761243,%edi
+	andl	$0xfefefefe,%r8d
+	andl	$0xfefefefe,%r9d
+	andl	$0x1b1b1b1b,%esi
+	andl	$0x1b1b1b1b,%edi
 	movl	%ecx,%r12d
 	movl	%edx,%ebp
 	xorl	%esi,%r8d
 	xorl	%edi,%r9d
 
+	rorl	$16,%r12d
 	xorl	%r8d,%ecx
+	rorl	$16,%ebp
 	xorl	%r9d,%edx
 	roll	$24,%ecx
+	movl	0(%r14),%esi
 	roll	$24,%edx
 	xorl	%r8d,%ecx
+	movl	64(%r14),%edi
 	xorl	%r9d,%edx
-	movl	0(%r14),%esi
-	rorl	$16,%r12d
-	rorl	$16,%ebp
-	movl	64(%r14),%edi
+	movl	128(%r14),%r8d
 	xorl	%r12d,%ecx
+	rorl	$8,%r12d
 	xorl	%ebp,%edx
-	movl	128(%r14),%r8d
-	rorl	$8,%r12d
 	rorl	$8,%ebp
+	xorl	%r12d,%ecx
 	movl	192(%r14),%r9d
-	xorl	%r12d,%ecx
 	xorl	%ebp,%edx
 	jmp	.Lenc_loop_compact
 .align	16
@@ -328,7 +326,7 @@
 	xorl	4(%r15),%ebx
 	xorl	8(%r15),%ecx
 	xorl	12(%r15),%edx
-.byte	0xf3,0xc3			
+.byte	0xf3,0xc3
 .size	_x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
 .globl	AES_encrypt
 .type	AES_encrypt,@function
@@ -350,7 +348,7 @@
 	andq	$-64,%rsp
 	subq	%rsp,%rcx
 	negq	%rcx
-	andq	$960,%rcx
+	andq	$0x3c0,%rcx
 	subq	%rcx,%rsp
 	subq	$32,%rsp
 
@@ -375,7 +373,7 @@
 	leaq	.LAES_Te+2048(%rip),%r14
 	leaq	768(%rsp),%rbp
 	subq	%r14,%rbp
-	andq	$768,%rbp
+	andq	$0x300,%rbp
 	leaq	(%r14,%rbp,1),%r14
 
 	call	_x86_64_AES_encrypt_compact
@@ -550,7 +548,7 @@
 	xorl	%r11d,%ebx
 	xorl	%r12d,%ecx
 	xorl	%r8d,%edx
-.byte	0xf3,0xc3			
+.byte	0xf3,0xc3
 .size	_x86_64_AES_decrypt,.-_x86_64_AES_decrypt
 .type	_x86_64_AES_decrypt_compact,@function
 .align	16
@@ -576,70 +574,69 @@
 	movzbl	%al,%r10d
 	movzbl	%bl,%r11d
 	movzbl	%cl,%r12d
+	movzbl	%dl,%r8d
+	movzbl	%dh,%esi
+	movzbl	%ah,%edi
+	shrl	$16,%edx
+	movzbl	%bh,%ebp
 	movzbl	(%r14,%r10,1),%r10d
 	movzbl	(%r14,%r11,1),%r11d
 	movzbl	(%r14,%r12,1),%r12d
+	movzbl	(%r14,%r8,1),%r8d
 
-	movzbl	%dl,%r8d
-	movzbl	%dh,%esi
-	movzbl	%ah,%edi
-	movzbl	(%r14,%r8,1),%r8d
 	movzbl	(%r14,%rsi,1),%r9d
+	movzbl	%ch,%esi
 	movzbl	(%r14,%rdi,1),%r13d
-
-	movzbl	%bh,%ebp
-	movzbl	%ch,%esi
-	shrl	$16,%ecx
 	movzbl	(%r14,%rbp,1),%ebp
 	movzbl	(%r14,%rsi,1),%esi
-	shrl	$16,%edx
 
+	shrl	$16,%ecx
+	shll	$8,%r13d
+	shll	$8,%r9d
 	movzbl	%cl,%edi
-	shll	$8,%r9d
-	shll	$8,%r13d
-	movzbl	(%r14,%rdi,1),%edi
+	shrl	$16,%eax
 	xorl	%r9d,%r10d
-	xorl	%r13d,%r11d
+	shrl	$16,%ebx
+	movzbl	%dl,%r9d
 
-	movzbl	%dl,%r9d
-	shrl	$16,%eax
-	shrl	$16,%ebx
-	movzbl	%al,%r13d
 	shll	$8,%ebp
+	xorl	%r13d,%r11d
 	shll	$8,%esi
-	movzbl	(%r14,%r9,1),%r9d
-	movzbl	(%r14,%r13,1),%r13d
+	movzbl	%al,%r13d
+	movzbl	(%r14,%rdi,1),%edi
 	xorl	%ebp,%r12d
+	movzbl	%bl,%ebp
+
+	shll	$16,%edi
 	xorl	%esi,%r8d
-
-	movzbl	%bl,%ebp
+	movzbl	(%r14,%r9,1),%r9d
 	movzbl	%bh,%esi
-	shll	$16,%edi
 	movzbl	(%r14,%rbp,1),%ebp
-	movzbl	(%r14,%rsi,1),%esi
 	xorl	%edi,%r10d
+	movzbl	(%r14,%r13,1),%r13d
+	movzbl	%ch,%edi
 
-	movzbl	%ch,%edi
+	shll	$16,%ebp
 	shll	$16,%r9d
 	shll	$16,%r13d
-	movzbl	(%r14,%rdi,1),%ebx
+	xorl	%ebp,%r8d
+	movzbl	%dh,%ebp
 	xorl	%r9d,%r11d
+	shrl	$8,%eax
 	xorl	%r13d,%r12d
 
-	movzbl	%dh,%edi
-	shrl	$8,%eax
-	shll	$16,%ebp
-	movzbl	(%r14,%rdi,1),%ecx
+	movzbl	(%r14,%rsi,1),%esi
+	movzbl	(%r14,%rdi,1),%ebx
+	movzbl	(%r14,%rbp,1),%ecx
 	movzbl	(%r14,%rax,1),%edx
-	xorl	%ebp,%r8d
 
+	movl	%r10d,%eax
 	shll	$24,%esi
 	shll	$24,%ebx
 	shll	$24,%ecx
-	xorl	%esi,%r10d
+	xorl	%esi,%eax
 	shll	$24,%edx
 	xorl	%r11d,%ebx
-	movl	%r10d,%eax
 	xorl	%r12d,%ecx
 	xorl	%r8d,%edx
 	cmpq	16(%rsp),%r15
@@ -652,12 +649,12 @@
 	orq	%rbx,%rax
 	orq	%rdx,%rcx
 	movq	256+16(%r14),%rbp
-	movq	%rax,%rbx
-	movq	%rcx,%rdx
-	andq	%rsi,%rbx
-	andq	%rsi,%rdx
-	movq	%rbx,%r9
-	movq	%rdx,%r12
+	movq	%rsi,%r9
+	movq	%rsi,%r12
+	andq	%rax,%r9
+	andq	%rcx,%r12
+	movq	%r9,%rbx
+	movq	%r12,%rdx
 	shrq	$7,%r9
 	leaq	(%rax,%rax,1),%r8
 	shrq	$7,%r12
@@ -668,15 +665,15 @@
 	andq	%rdi,%r11
 	andq	%rbp,%rbx
 	andq	%rbp,%rdx
-	xorq	%r8,%rbx
-	xorq	%r11,%rdx
-	movq	%rbx,%r8
-	movq	%rdx,%r11
+	xorq	%rbx,%r8
+	xorq	%rdx,%r11
+	movq	%rsi,%r10
+	movq	%rsi,%r13
 
-	andq	%rsi,%rbx
-	andq	%rsi,%rdx
-	movq	%rbx,%r10
-	movq	%rdx,%r13
+	andq	%r8,%r10
+	andq	%r11,%r13
+	movq	%r10,%rbx
+	movq	%r13,%rdx
 	shrq	$7,%r10
 	leaq	(%r8,%r8,1),%r9
 	shrq	$7,%r13
@@ -687,15 +684,15 @@
 	andq	%rdi,%r12
 	andq	%rbp,%rbx
 	andq	%rbp,%rdx
-	xorq	%r9,%rbx
-	xorq	%r12,%rdx
-	movq	%rbx,%r9
-	movq	%rdx,%r12
+	xorq	%rbx,%r9
+	xorq	%rdx,%r12
+	movq	%rsi,%r10
+	movq	%rsi,%r13
 
-	andq	%rsi,%rbx
-	andq	%rsi,%rdx
-	movq	%rbx,%r10
-	movq	%rdx,%r13
+	andq	%r9,%r10
+	andq	%r12,%r13
+	movq	%r10,%rbx
+	movq	%r13,%rdx
 	shrq	$7,%r10
 	xorq	%rax,%r8
 	shrq	$7,%r13
@@ -720,51 +717,51 @@
 	movq	%rax,%rbx
 	movq	%rcx,%rdx
 	xorq	%r10,%r9
+	shrq	$32,%rbx
 	xorq	%r13,%r12
-	shrq	$32,%rbx
 	shrq	$32,%rdx
 	xorq	%r8,%r10
+	roll	$8,%eax
 	xorq	%r11,%r13
-	roll	$8,%eax
 	roll	$8,%ecx
 	xorq	%r9,%r10
+	roll	$8,%ebx
 	xorq	%r12,%r13
 
-	roll	$8,%ebx
 	roll	$8,%edx
 	xorl	%r10d,%eax
+	shrq	$32,%r10
 	xorl	%r13d,%ecx
-	shrq	$32,%r10
 	shrq	$32,%r13
 	xorl	%r10d,%ebx
 	xorl	%r13d,%edx
 
 	movq	%r8,%r10
+	roll	$24,%r8d
 	movq	%r11,%r13
+	roll	$24,%r11d
 	shrq	$32,%r10
+	xorl	%r8d,%eax
 	shrq	$32,%r13
-	roll	$24,%r8d
-	roll	$24,%r11d
+	xorl	%r11d,%ecx
 	roll	$24,%r10d
+	movq	%r9,%r8
 	roll	$24,%r13d
-	xorl	%r8d,%eax
-	xorl	%r11d,%ecx
-	movq	%r9,%r8
 	movq	%r12,%r11
+	shrq	$32,%r8
 	xorl	%r10d,%ebx
+	shrq	$32,%r11
 	xorl	%r13d,%edx
 
 	movq	0(%r14),%rsi
-	shrq	$32,%r8
-	shrq	$32,%r11
+	roll	$16,%r9d
 	movq	64(%r14),%rdi
-	roll	$16,%r9d
 	roll	$16,%r12d
 	movq	128(%r14),%rbp
 	roll	$16,%r8d
-	roll	$16,%r11d
 	movq	192(%r14),%r10
 	xorl	%r9d,%eax
+	roll	$16,%r11d
 	xorl	%r12d,%ecx
 	movq	256(%r14),%r13
 	xorl	%r8d,%ebx
@@ -776,7 +773,7 @@
 	xorl	4(%r15),%ebx
 	xorl	8(%r15),%ecx
 	xorl	12(%r15),%edx
-.byte	0xf3,0xc3			
+.byte	0xf3,0xc3
 .size	_x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
 .globl	AES_decrypt
 .type	AES_decrypt,@function
@@ -798,7 +795,7 @@
 	andq	$-64,%rsp
 	subq	%rsp,%rcx
 	negq	%rcx
-	andq	$960,%rcx
+	andq	$0x3c0,%rcx
 	subq	%rcx,%rsp
 	subq	$32,%rsp
 
@@ -823,7 +820,7 @@
 	leaq	.LAES_Td+2048(%rip),%r14
 	leaq	768(%rsp),%rbp
 	subq	%r14,%rbp
-	andq	$768,%rbp
+	andq	$0x300,%rbp
 	leaq	(%r14,%rbp,1),%r14
 	shrq	$3,%rbp
 	addq	%rbp,%r14
@@ -862,10 +859,6 @@
 
 	call	_x86_64_AES_set_encrypt_key
 
-	movq	8(%rsp),%r15
-	movq	16(%rsp),%r14
-	movq	24(%rsp),%r13
-	movq	32(%rsp),%r12
 	movq	40(%rsp),%rbp
 	movq	48(%rsp),%rbx
 	addq	$56,%rsp
@@ -1110,7 +1103,7 @@
 .Lbadpointer:
 	movq	$-1,%rax
 .Lexit:
-.byte	0xf3,0xc3			
+.byte	0xf3,0xc3
 .size	_x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
 .globl	private_AES_set_decrypt_key
 .type	private_AES_set_decrypt_key,@function
@@ -1163,12 +1156,12 @@
 	leaq	16(%r15),%r15
 	movq	0(%r15),%rax
 	movq	8(%r15),%rcx
-	movq	%rax,%rbx
-	movq	%rcx,%rdx
-	andq	%rsi,%rbx
-	andq	%rsi,%rdx
-	movq	%rbx,%r9
-	movq	%rdx,%r12
+	movq	%rsi,%r9
+	movq	%rsi,%r12
+	andq	%rax,%r9
+	andq	%rcx,%r12
+	movq	%r9,%rbx
+	movq	%r12,%rdx
 	shrq	$7,%r9
 	leaq	(%rax,%rax,1),%r8
 	shrq	$7,%r12
@@ -1179,15 +1172,15 @@
 	andq	%rdi,%r11
 	andq	%rbp,%rbx
 	andq	%rbp,%rdx
-	xorq	%r8,%rbx
-	xorq	%r11,%rdx
-	movq	%rbx,%r8
-	movq	%rdx,%r11
+	xorq	%rbx,%r8
+	xorq	%rdx,%r11
+	movq	%rsi,%r10
+	movq	%rsi,%r13
 
-	andq	%rsi,%rbx
-	andq	%rsi,%rdx
-	movq	%rbx,%r10
-	movq	%rdx,%r13
+	andq	%r8,%r10
+	andq	%r11,%r13
+	movq	%r10,%rbx
+	movq	%r13,%rdx
 	shrq	$7,%r10
 	leaq	(%r8,%r8,1),%r9
 	shrq	$7,%r13
@@ -1198,15 +1191,15 @@
 	andq	%rdi,%r12
 	andq	%rbp,%rbx
 	andq	%rbp,%rdx
-	xorq	%r9,%rbx
-	xorq	%r12,%rdx
-	movq	%rbx,%r9
-	movq	%rdx,%r12
+	xorq	%rbx,%r9
+	xorq	%rdx,%r12
+	movq	%rsi,%r10
+	movq	%rsi,%r13
 
-	andq	%rsi,%rbx
-	andq	%rsi,%rdx
-	movq	%rbx,%r10
-	movq	%rdx,%r13
+	andq	%r9,%r10
+	andq	%r12,%r13
+	movq	%r10,%rbx
+	movq	%r13,%rdx
 	shrq	$7,%r10
 	xorq	%rax,%r8
 	shrq	$7,%r13
@@ -1231,51 +1224,51 @@
 	movq	%rax,%rbx
 	movq	%rcx,%rdx
 	xorq	%r10,%r9
+	shrq	$32,%rbx
 	xorq	%r13,%r12
-	shrq	$32,%rbx
 	shrq	$32,%rdx
 	xorq	%r8,%r10
+	roll	$8,%eax
 	xorq	%r11,%r13
-	roll	$8,%eax
 	roll	$8,%ecx
 	xorq	%r9,%r10
+	roll	$8,%ebx
 	xorq	%r12,%r13
 
-	roll	$8,%ebx
 	roll	$8,%edx
 	xorl	%r10d,%eax
+	shrq	$32,%r10
 	xorl	%r13d,%ecx
-	shrq	$32,%r10
 	shrq	$32,%r13
 	xorl	%r10d,%ebx
 	xorl	%r13d,%edx
 
 	movq	%r8,%r10
+	roll	$24,%r8d
 	movq	%r11,%r13
+	roll	$24,%r11d
 	shrq	$32,%r10
+	xorl	%r8d,%eax
 	shrq	$32,%r13
-	roll	$24,%r8d
-	roll	$24,%r11d
+	xorl	%r11d,%ecx
 	roll	$24,%r10d
+	movq	%r9,%r8
 	roll	$24,%r13d
-	xorl	%r8d,%eax
-	xorl	%r11d,%ecx
-	movq	%r9,%r8
 	movq	%r12,%r11
+	shrq	$32,%r8
 	xorl	%r10d,%ebx
+	shrq	$32,%r11
 	xorl	%r13d,%edx
 
 
-	shrq	$32,%r8
-	shrq	$32,%r11
+	roll	$16,%r9d
 
-	roll	$16,%r9d
 	roll	$16,%r12d
 
 	roll	$16,%r8d
-	roll	$16,%r11d
 
 	xorl	%r9d,%eax
+	roll	$16,%r11d
 	xorl	%r12d,%ecx
 
 	xorl	%r8d,%ebx
@@ -1343,9 +1336,9 @@
 	movq	%r14,%r10
 	leaq	2304(%r14),%r11
 	movq	%r15,%r12
-	andq	$4095,%r10
-	andq	$4095,%r11
-	andq	$4095,%r12
+	andq	$0xFFF,%r10
+	andq	$0xFFF,%r11
+	andq	$0xFFF,%r12
 
 	cmpq	%r11,%r12
 	jb	.Lcbc_te_break_out
@@ -1354,7 +1347,7 @@
 	jmp	.Lcbc_te_ok
 .Lcbc_te_break_out:
 	subq	%r10,%r12
-	andq	$4095,%r12
+	andq	$0xFFF,%r12
 	addq	$320,%r12
 	subq	%r12,%r15
 .align	4
@@ -1380,7 +1373,7 @@
 
 	movq	%r15,%r10
 	subq	%r14,%r10
-	andq	$4095,%r10
+	andq	$0xfff,%r10
 	cmpq	$2304,%r10
 	jb	.Lcbc_do_ecopy
 	cmpq	$4096-248,%r10
@@ -1391,7 +1384,7 @@
 	leaq	80(%rsp),%rdi
 	leaq	80(%rsp),%r15
 	movl	$30,%ecx
-.long	0x90A548F3	
+.long	0x90A548F3
 	movl	%eax,(%rdi)
 .Lcbc_skip_ecopy:
 	movq	%r15,0(%rsp)
@@ -1553,7 +1546,7 @@
 	je	.Lcbc_exit
 	movl	$30,%ecx
 	xorq	%rax,%rax
-.long	0x90AB48F3	
+.long	0x90AB48F3
 
 	jmp	.Lcbc_exit
 
@@ -1567,7 +1560,7 @@
 	leaq	-88-63(%rcx),%r10
 	subq	%rbp,%r10
 	negq	%r10
-	andq	$960,%r10
+	andq	$0x3c0,%r10
 	subq	%r10,%rbp
 
 	xchgq	%rsp,%rbp
@@ -1596,7 +1589,7 @@
 	leaq	2048(%r14),%r14
 	leaq	768-8(%rsp),%rax
 	subq	%r14,%rax
-	andq	$768,%rax
+	andq	$0x300,%rax
 	leaq	(%r14,%rax,1),%r14
 
 	cmpq	$0,%rbx
@@ -1608,7 +1601,7 @@
 	movl	4(%rbp),%ebx
 	movl	8(%rbp),%ecx
 	movl	12(%rbp),%edx
-	jz	.Lcbc_slow_enc_tail	
+	jz	.Lcbc_slow_enc_tail
 
 .align	4
 .Lcbc_slow_enc_loop:
@@ -1653,16 +1646,16 @@
 	movq	%r10,%rcx
 	movq	%r8,%rsi
 	movq	%r9,%rdi
-.long	0x9066A4F3		
+.long	0x9066A4F3
 	movq	$16,%rcx
 	subq	%r10,%rcx
 	xorq	%rax,%rax
-.long	0x9066AAF3		
+.long	0x9066AAF3
 	movq	%r9,%r8
 	movq	$16,%r10
 	movq	%r11,%rax
 	movq	%r12,%rcx
-	jmp	.Lcbc_slow_enc_loop	
+	jmp	.Lcbc_slow_enc_loop
 
 .align	16
 .LSLOW_DECRYPT:
@@ -1738,7 +1731,7 @@
 	movq	%r9,%rdi
 	leaq	64(%rsp),%rsi
 	leaq	16(%r10),%rcx
-.long	0x9066A4F3	
+.long	0x9066A4F3
 	jmp	.Lcbc_exit
 
 .align	16

Modified: trunk/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S	2019-01-20 05:38:02 UTC (rev 12152)
+++ trunk/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S	2019-01-20 05:38:15 UTC (rev 12153)
@@ -1,16 +1,18 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from aesni-sha1-x86_64.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S 325335 2017-11-02 18:22:53Z jkim $ */
+/* Do not modify. This file is auto-generated from aesni-sha1-x86_64.pl. */
 .text	
 
 
 .globl	aesni_cbc_sha1_enc
 .type	aesni_cbc_sha1_enc,@function
-.align	16
+.align	32
 aesni_cbc_sha1_enc:
 
 	movl	OPENSSL_ia32cap_P+0(%rip),%r10d
-	movl	OPENSSL_ia32cap_P+4(%rip),%r11d
+	movq	OPENSSL_ia32cap_P+4(%rip),%r11
+	btq	$61,%r11
+	jc	aesni_cbc_sha1_enc_shaext
 	andl	$268435456,%r11d
 	andl	$1073741824,%r10d
 	orl	%r11d,%r10d
@@ -20,7 +22,7 @@
 	.byte	0xf3,0xc3
 .size	aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
 .type	aesni_cbc_sha1_enc_ssse3,@function
-.align	16
+.align	32
 aesni_cbc_sha1_enc_ssse3:
 	movq	8(%rsp),%r10
 
@@ -37,12 +39,12 @@
 	movq	%rdi,%r12
 	movq	%rsi,%r13
 	movq	%rdx,%r14
-	movq	%rcx,%r15
-	movdqu	(%r8),%xmm11
+	leaq	112(%rcx),%r15
+	movdqu	(%r8),%xmm2
 	movq	%r8,88(%rsp)
 	shlq	$6,%r14
 	subq	%r12,%r13
-	movl	240(%r15),%r8d
+	movl	240-112(%r15),%r8d
 	addq	%r10,%r14
 
 	leaq	K_XX_XX(%rip),%r11
@@ -52,1188 +54,1168 @@
 	movl	12(%r9),%edx
 	movl	%ebx,%esi
 	movl	16(%r9),%ebp
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	andl	%edi,%esi
 
-	movdqa	64(%r11),%xmm6
-	movdqa	0(%r11),%xmm9
-	movdqu	0(%r10),%xmm0
-	movdqu	16(%r10),%xmm1
-	movdqu	32(%r10),%xmm2
-	movdqu	48(%r10),%xmm3
-.byte	102,15,56,0,198
+	movdqa	64(%r11),%xmm3
+	movdqa	0(%r11),%xmm13
+	movdqu	0(%r10),%xmm4
+	movdqu	16(%r10),%xmm5
+	movdqu	32(%r10),%xmm6
+	movdqu	48(%r10),%xmm7
+.byte	102,15,56,0,227
+.byte	102,15,56,0,235
+.byte	102,15,56,0,243
 	addq	$64,%r10
-.byte	102,15,56,0,206
-.byte	102,15,56,0,214
-.byte	102,15,56,0,222
-	paddd	%xmm9,%xmm0
-	paddd	%xmm9,%xmm1
-	paddd	%xmm9,%xmm2
-	movdqa	%xmm0,0(%rsp)
-	psubd	%xmm9,%xmm0
-	movdqa	%xmm1,16(%rsp)
-	psubd	%xmm9,%xmm1
-	movdqa	%xmm2,32(%rsp)
-	psubd	%xmm9,%xmm2
-	movups	(%r15),%xmm13
-	movups	16(%r15),%xmm14
+	paddd	%xmm13,%xmm4
+.byte	102,15,56,0,251
+	paddd	%xmm13,%xmm5
+	paddd	%xmm13,%xmm6
+	movdqa	%xmm4,0(%rsp)
+	psubd	%xmm13,%xmm4
+	movdqa	%xmm5,16(%rsp)
+	psubd	%xmm13,%xmm5
+	movdqa	%xmm6,32(%rsp)
+	psubd	%xmm13,%xmm6
+	movups	-112(%r15),%xmm15
+	movups	16-112(%r15),%xmm0
 	jmp	.Loop_ssse3
-.align	16
+.align	32
 .Loop_ssse3:
-	movdqa	%xmm1,%xmm4
+	rorl	$2,%ebx
+	movups	0(%r12),%xmm14
+	xorps	%xmm15,%xmm14
+	xorps	%xmm14,%xmm2
+	movups	-80(%r15),%xmm1
+.byte	102,15,56,220,208
+	pshufd	$238,%xmm4,%xmm8
+	xorl	%edx,%esi
+	movdqa	%xmm7,%xmm12
+	paddd	%xmm7,%xmm13
+	movl	%eax,%edi
 	addl	0(%rsp),%ebp
-	movups	0(%r12),%xmm12
-	xorps	%xmm13,%xmm12
-	xorps	%xmm12,%xmm11
-.byte	102,69,15,56,220,222
-	movups	32(%r15),%xmm15
-	xorl	%edx,%ecx
-	movdqa	%xmm3,%xmm8
-.byte	102,15,58,15,224,8
-	movl	%eax,%edi
+	punpcklqdq	%xmm5,%xmm8
+	xorl	%ecx,%ebx
 	roll	$5,%eax
-	paddd	%xmm3,%xmm9
-	andl	%ecx,%esi
-	xorl	%edx,%ecx
-	psrldq	$4,%xmm8
-	xorl	%edx,%esi
-	addl	%eax,%ebp
-	pxor	%xmm0,%xmm4
-	rorl	$2,%ebx
 	addl	%esi,%ebp
-	pxor	%xmm2,%xmm8
-	addl	4(%rsp),%edx
+	psrldq	$4,%xmm12
+	andl	%ebx,%edi
 	xorl	%ecx,%ebx
+	pxor	%xmm4,%xmm8
+	addl	%eax,%ebp
+	rorl	$7,%eax
+	pxor	%xmm6,%xmm12
+	xorl	%ecx,%edi
 	movl	%ebp,%esi
+	addl	4(%rsp),%edx
+	pxor	%xmm12,%xmm8
+	xorl	%ebx,%eax
 	roll	$5,%ebp
-	pxor	%xmm8,%xmm4
-	andl	%ebx,%edi
-	xorl	%ecx,%ebx
-	movdqa	%xmm9,48(%rsp)
-	xorl	%ecx,%edi
-.byte	102,69,15,56,220,223
-	movups	48(%r15),%xmm14
-	addl	%ebp,%edx
-	movdqa	%xmm4,%xmm10
-	movdqa	%xmm4,%xmm8
-	rorl	$7,%eax
+	movdqa	%xmm13,48(%rsp)
 	addl	%edi,%edx
-	addl	8(%rsp),%ecx
+	movups	-64(%r15),%xmm0
+.byte	102,15,56,220,209
+	andl	%eax,%esi
+	movdqa	%xmm8,%xmm3
 	xorl	%ebx,%eax
-	pslldq	$12,%xmm10
-	paddd	%xmm4,%xmm4
+	addl	%ebp,%edx
+	rorl	$7,%ebp
+	movdqa	%xmm8,%xmm12
+	xorl	%ebx,%esi
+	pslldq	$12,%xmm3
+	paddd	%xmm8,%xmm8
 	movl	%edx,%edi
+	addl	8(%rsp),%ecx
+	psrld	$31,%xmm12
+	xorl	%eax,%ebp
 	roll	$5,%edx
-	andl	%eax,%esi
-	xorl	%ebx,%eax
-	psrld	$31,%xmm8
-	xorl	%ebx,%esi
-	addl	%edx,%ecx
-	movdqa	%xmm10,%xmm9
-	rorl	$7,%ebp
 	addl	%esi,%ecx
-	psrld	$30,%xmm10
-	por	%xmm8,%xmm4
-	addl	12(%rsp),%ebx
+	movdqa	%xmm3,%xmm13
+	andl	%ebp,%edi
 	xorl	%eax,%ebp
+	psrld	$30,%xmm3
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	por	%xmm12,%xmm8
+	xorl	%eax,%edi
 	movl	%ecx,%esi
+	addl	12(%rsp),%ebx
+	movups	-48(%r15),%xmm1
+.byte	102,15,56,220,208
+	pslld	$2,%xmm13
+	pxor	%xmm3,%xmm8
+	xorl	%ebp,%edx
+	movdqa	0(%r11),%xmm3
 	roll	$5,%ecx
-.byte	102,69,15,56,220,222
-	movups	64(%r15),%xmm15
-	pslld	$2,%xmm9
-	pxor	%xmm10,%xmm4
-	andl	%ebp,%edi
-	xorl	%eax,%ebp
-	movdqa	0(%r11),%xmm10
-	xorl	%eax,%edi
-	addl	%ecx,%ebx
-	pxor	%xmm9,%xmm4
-	rorl	$7,%edx
 	addl	%edi,%ebx
-	movdqa	%xmm2,%xmm5
-	addl	16(%rsp),%eax
+	andl	%edx,%esi
+	pxor	%xmm13,%xmm8
 	xorl	%ebp,%edx
-	movdqa	%xmm4,%xmm9
-.byte	102,15,58,15,233,8
+	addl	%ecx,%ebx
+	rorl	$7,%ecx
+	pshufd	$238,%xmm5,%xmm9
+	xorl	%ebp,%esi
+	movdqa	%xmm8,%xmm13
+	paddd	%xmm8,%xmm3
 	movl	%ebx,%edi
+	addl	16(%rsp),%eax
+	punpcklqdq	%xmm6,%xmm9
+	xorl	%edx,%ecx
 	roll	$5,%ebx
-	paddd	%xmm4,%xmm10
-	andl	%edx,%esi
-	xorl	%ebp,%edx
-	psrldq	$4,%xmm9
-	xorl	%ebp,%esi
-	addl	%ebx,%eax
-	pxor	%xmm1,%xmm5
-	rorl	$7,%ecx
 	addl	%esi,%eax
-	pxor	%xmm3,%xmm9
-	addl	20(%rsp),%ebp
-.byte	102,69,15,56,220,223
-	movups	80(%r15),%xmm14
+	psrldq	$4,%xmm13
+	andl	%ecx,%edi
 	xorl	%edx,%ecx
+	pxor	%xmm5,%xmm9
+	addl	%ebx,%eax
+	rorl	$7,%ebx
+	movups	-32(%r15),%xmm0
+.byte	102,15,56,220,209
+	pxor	%xmm7,%xmm13
+	xorl	%edx,%edi
 	movl	%eax,%esi
+	addl	20(%rsp),%ebp
+	pxor	%xmm13,%xmm9
+	xorl	%ecx,%ebx
 	roll	$5,%eax
-	pxor	%xmm9,%xmm5
-	andl	%ecx,%edi
-	xorl	%edx,%ecx
-	movdqa	%xmm10,0(%rsp)
-	xorl	%edx,%edi
-	addl	%eax,%ebp
-	movdqa	%xmm5,%xmm8
-	movdqa	%xmm5,%xmm9
-	rorl	$7,%ebx
+	movdqa	%xmm3,0(%rsp)
 	addl	%edi,%ebp
-	addl	24(%rsp),%edx
+	andl	%ebx,%esi
+	movdqa	%xmm9,%xmm12
 	xorl	%ecx,%ebx
-	pslldq	$12,%xmm8
-	paddd	%xmm5,%xmm5
+	addl	%eax,%ebp
+	rorl	$7,%eax
+	movdqa	%xmm9,%xmm13
+	xorl	%ecx,%esi
+	pslldq	$12,%xmm12
+	paddd	%xmm9,%xmm9
 	movl	%ebp,%edi
+	addl	24(%rsp),%edx
+	psrld	$31,%xmm13
+	xorl	%ebx,%eax
 	roll	$5,%ebp
-	andl	%ebx,%esi
-	xorl	%ecx,%ebx
-	psrld	$31,%xmm9
-	xorl	%ecx,%esi
-.byte	102,69,15,56,220,222
-	movups	96(%r15),%xmm15
-	addl	%ebp,%edx
-	movdqa	%xmm8,%xmm10
-	rorl	$7,%eax
 	addl	%esi,%edx
-	psrld	$30,%xmm8
-	por	%xmm9,%xmm5
-	addl	28(%rsp),%ecx
+	movups	-16(%r15),%xmm1
+.byte	102,15,56,220,208
+	movdqa	%xmm12,%xmm3
+	andl	%eax,%edi
 	xorl	%ebx,%eax
+	psrld	$30,%xmm12
+	addl	%ebp,%edx
+	rorl	$7,%ebp
+	por	%xmm13,%xmm9
+	xorl	%ebx,%edi
 	movl	%edx,%esi
+	addl	28(%rsp),%ecx
+	pslld	$2,%xmm3
+	pxor	%xmm12,%xmm9
+	xorl	%eax,%ebp
+	movdqa	16(%r11),%xmm12
 	roll	$5,%edx
-	pslld	$2,%xmm10
-	pxor	%xmm8,%xmm5
-	andl	%eax,%edi
-	xorl	%ebx,%eax
-	movdqa	16(%r11),%xmm8
-	xorl	%ebx,%edi
-	addl	%edx,%ecx
-	pxor	%xmm10,%xmm5
-	rorl	$7,%ebp
 	addl	%edi,%ecx
-	movdqa	%xmm3,%xmm6
-	addl	32(%rsp),%ebx
+	andl	%ebp,%esi
+	pxor	%xmm3,%xmm9
 	xorl	%eax,%ebp
-	movdqa	%xmm5,%xmm10
-.byte	102,15,58,15,242,8
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	pshufd	$238,%xmm6,%xmm10
+	xorl	%eax,%esi
+	movdqa	%xmm9,%xmm3
+	paddd	%xmm9,%xmm12
 	movl	%ecx,%edi
+	addl	32(%rsp),%ebx
+	movups	0(%r15),%xmm0
+.byte	102,15,56,220,209
+	punpcklqdq	%xmm7,%xmm10
+	xorl	%ebp,%edx
 	roll	$5,%ecx
-.byte	102,69,15,56,220,223
-	movups	112(%r15),%xmm14
-	paddd	%xmm5,%xmm8
-	andl	%ebp,%esi
-	xorl	%eax,%ebp
-	psrldq	$4,%xmm10
-	xorl	%eax,%esi
-	addl	%ecx,%ebx
-	pxor	%xmm2,%xmm6
-	rorl	$7,%edx
 	addl	%esi,%ebx
-	pxor	%xmm4,%xmm10
-	addl	36(%rsp),%eax
+	psrldq	$4,%xmm3
+	andl	%edx,%edi
 	xorl	%ebp,%edx
+	pxor	%xmm6,%xmm10
+	addl	%ecx,%ebx
+	rorl	$7,%ecx
+	pxor	%xmm8,%xmm3
+	xorl	%ebp,%edi
 	movl	%ebx,%esi
+	addl	36(%rsp),%eax
+	pxor	%xmm3,%xmm10
+	xorl	%edx,%ecx
 	roll	$5,%ebx
-	pxor	%xmm10,%xmm6
-	andl	%edx,%edi
-	xorl	%ebp,%edx
-	movdqa	%xmm8,16(%rsp)
-	xorl	%ebp,%edi
-	addl	%ebx,%eax
-	movdqa	%xmm6,%xmm9
-	movdqa	%xmm6,%xmm10
-	rorl	$7,%ecx
+	movdqa	%xmm12,16(%rsp)
 	addl	%edi,%eax
-	addl	40(%rsp),%ebp
-.byte	102,69,15,56,220,222
-	movups	128(%r15),%xmm15
+	andl	%ecx,%esi
+	movdqa	%xmm10,%xmm13
 	xorl	%edx,%ecx
-	pslldq	$12,%xmm9
-	paddd	%xmm6,%xmm6
+	addl	%ebx,%eax
+	rorl	$7,%ebx
+	movups	16(%r15),%xmm1
+.byte	102,15,56,220,208
+	movdqa	%xmm10,%xmm3
+	xorl	%edx,%esi
+	pslldq	$12,%xmm13
+	paddd	%xmm10,%xmm10
 	movl	%eax,%edi
+	addl	40(%rsp),%ebp
+	psrld	$31,%xmm3
+	xorl	%ecx,%ebx
 	roll	$5,%eax
-	andl	%ecx,%esi
-	xorl	%edx,%ecx
-	psrld	$31,%xmm10
-	xorl	%edx,%esi
-	addl	%eax,%ebp
-	movdqa	%xmm9,%xmm8
-	rorl	$7,%ebx
 	addl	%esi,%ebp
-	psrld	$30,%xmm9
-	por	%xmm10,%xmm6
-	addl	44(%rsp),%edx
+	movdqa	%xmm13,%xmm12
+	andl	%ebx,%edi
 	xorl	%ecx,%ebx
+	psrld	$30,%xmm13
+	addl	%eax,%ebp
+	rorl	$7,%eax
+	por	%xmm3,%xmm10
+	xorl	%ecx,%edi
 	movl	%ebp,%esi
+	addl	44(%rsp),%edx
+	pslld	$2,%xmm12
+	pxor	%xmm13,%xmm10
+	xorl	%ebx,%eax
+	movdqa	16(%r11),%xmm13
 	roll	$5,%ebp
-	pslld	$2,%xmm8
-	pxor	%xmm9,%xmm6
-	andl	%ebx,%edi
-	xorl	%ecx,%ebx
-	movdqa	16(%r11),%xmm9
-	xorl	%ecx,%edi
-.byte	102,69,15,56,220,223
-	movups	144(%r15),%xmm14
-	addl	%ebp,%edx
-	pxor	%xmm8,%xmm6
-	rorl	$7,%eax
 	addl	%edi,%edx
-	movdqa	%xmm4,%xmm7
-	addl	48(%rsp),%ecx
+	movups	32(%r15),%xmm0
+.byte	102,15,56,220,209
+	andl	%eax,%esi
+	pxor	%xmm12,%xmm10
 	xorl	%ebx,%eax
-	movdqa	%xmm6,%xmm8
-.byte	102,15,58,15,251,8
+	addl	%ebp,%edx
+	rorl	$7,%ebp
+	pshufd	$238,%xmm7,%xmm11
+	xorl	%ebx,%esi
+	movdqa	%xmm10,%xmm12
+	paddd	%xmm10,%xmm13
 	movl	%edx,%edi
+	addl	48(%rsp),%ecx
+	punpcklqdq	%xmm8,%xmm11
+	xorl	%eax,%ebp
 	roll	$5,%edx
-	paddd	%xmm6,%xmm9
-	andl	%eax,%esi
-	xorl	%ebx,%eax
-	psrldq	$4,%xmm8
-	xorl	%ebx,%esi
-	addl	%edx,%ecx
-	pxor	%xmm3,%xmm7
-	rorl	$7,%ebp
 	addl	%esi,%ecx
-	pxor	%xmm5,%xmm8
-	addl	52(%rsp),%ebx
+	psrldq	$4,%xmm12
+	andl	%ebp,%edi
 	xorl	%eax,%ebp
+	pxor	%xmm7,%xmm11
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	pxor	%xmm9,%xmm12
+	xorl	%eax,%edi
 	movl	%ecx,%esi
+	addl	52(%rsp),%ebx
+	movups	48(%r15),%xmm1
+.byte	102,15,56,220,208
+	pxor	%xmm12,%xmm11
+	xorl	%ebp,%edx
 	roll	$5,%ecx
-.byte	102,69,15,56,220,222
-	movups	160(%r15),%xmm15
-	pxor	%xmm8,%xmm7
-	andl	%ebp,%edi
-	xorl	%eax,%ebp
-	movdqa	%xmm9,32(%rsp)
-	xorl	%eax,%edi
-	addl	%ecx,%ebx
-	movdqa	%xmm7,%xmm10
-	movdqa	%xmm7,%xmm8
-	rorl	$7,%edx
+	movdqa	%xmm13,32(%rsp)
 	addl	%edi,%ebx
-	addl	56(%rsp),%eax
+	andl	%edx,%esi
+	movdqa	%xmm11,%xmm3
 	xorl	%ebp,%edx
-	pslldq	$12,%xmm10
-	paddd	%xmm7,%xmm7
+	addl	%ecx,%ebx
+	rorl	$7,%ecx
+	movdqa	%xmm11,%xmm12
+	xorl	%ebp,%esi
+	pslldq	$12,%xmm3
+	paddd	%xmm11,%xmm11
 	movl	%ebx,%edi
+	addl	56(%rsp),%eax
+	psrld	$31,%xmm12
+	xorl	%edx,%ecx
 	roll	$5,%ebx
-	andl	%edx,%esi
-	xorl	%ebp,%edx
-	psrld	$31,%xmm8
-	xorl	%ebp,%esi
+	addl	%esi,%eax
+	movdqa	%xmm3,%xmm13
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	psrld	$30,%xmm3
 	addl	%ebx,%eax
-	movdqa	%xmm10,%xmm9
-	rorl	$7,%ecx
-	addl	%esi,%eax
-	psrld	$30,%xmm10
-	por	%xmm8,%xmm7
-	addl	60(%rsp),%ebp
+	rorl	$7,%ebx
 	cmpl	$11,%r8d
 	jb	.Laesenclast1
-	movups	176(%r15),%xmm14
-.byte	102,69,15,56,220,223
-	movups	192(%r15),%xmm15
-.byte	102,69,15,56,220,222
+	movups	64(%r15),%xmm0
+.byte	102,15,56,220,209
+	movups	80(%r15),%xmm1
+.byte	102,15,56,220,208
 	je	.Laesenclast1
-	movups	208(%r15),%xmm14
-.byte	102,69,15,56,220,223
-	movups	224(%r15),%xmm15
-.byte	102,69,15,56,220,222
+	movups	96(%r15),%xmm0
+.byte	102,15,56,220,209
+	movups	112(%r15),%xmm1
+.byte	102,15,56,220,208
 .Laesenclast1:
-.byte	102,69,15,56,221,223
-	movups	16(%r15),%xmm14
-	xorl	%edx,%ecx
+.byte	102,15,56,221,209
+	movups	16-112(%r15),%xmm0
+	por	%xmm12,%xmm11
+	xorl	%edx,%edi
 	movl	%eax,%esi
+	addl	60(%rsp),%ebp
+	pslld	$2,%xmm13
+	pxor	%xmm3,%xmm11
+	xorl	%ecx,%ebx
+	movdqa	16(%r11),%xmm3
 	roll	$5,%eax
-	pslld	$2,%xmm9
-	pxor	%xmm10,%xmm7
-	andl	%ecx,%edi
-	xorl	%edx,%ecx
-	movdqa	16(%r11),%xmm10
-	xorl	%edx,%edi
-	addl	%eax,%ebp
-	pxor	%xmm9,%xmm7
-	rorl	$7,%ebx
 	addl	%edi,%ebp
-	movdqa	%xmm7,%xmm9
-	addl	0(%rsp),%edx
-	pxor	%xmm4,%xmm0
-.byte	102,68,15,58,15,206,8
+	andl	%ebx,%esi
+	pxor	%xmm13,%xmm11
+	pshufd	$238,%xmm10,%xmm13
 	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	rorl	$7,%eax
+	pxor	%xmm8,%xmm4
+	xorl	%ecx,%esi
 	movl	%ebp,%edi
+	addl	0(%rsp),%edx
+	punpcklqdq	%xmm11,%xmm13
+	xorl	%ebx,%eax
 	roll	$5,%ebp
-	pxor	%xmm1,%xmm0
-	andl	%ebx,%esi
-	xorl	%ecx,%ebx
-	movdqa	%xmm10,%xmm8
-	paddd	%xmm7,%xmm10
-	xorl	%ecx,%esi
-	movups	16(%r12),%xmm12
-	xorps	%xmm13,%xmm12
-	movups	%xmm11,0(%r13,%r12,1)
-	xorps	%xmm12,%xmm11
-.byte	102,69,15,56,220,222
-	movups	32(%r15),%xmm15
-	addl	%ebp,%edx
-	pxor	%xmm9,%xmm0
-	rorl	$7,%eax
+	pxor	%xmm5,%xmm4
 	addl	%esi,%edx
-	addl	4(%rsp),%ecx
+	movups	16(%r12),%xmm14
+	xorps	%xmm15,%xmm14
+	movups	%xmm2,0(%r12,%r13,1)
+	xorps	%xmm14,%xmm2
+	movups	-80(%r15),%xmm1
+.byte	102,15,56,220,208
+	andl	%eax,%edi
+	movdqa	%xmm3,%xmm12
 	xorl	%ebx,%eax
-	movdqa	%xmm0,%xmm9
-	movdqa	%xmm10,48(%rsp)
+	paddd	%xmm11,%xmm3
+	addl	%ebp,%edx
+	pxor	%xmm13,%xmm4
+	rorl	$7,%ebp
+	xorl	%ebx,%edi
 	movl	%edx,%esi
+	addl	4(%rsp),%ecx
+	movdqa	%xmm4,%xmm13
+	xorl	%eax,%ebp
 	roll	$5,%edx
-	andl	%eax,%edi
-	xorl	%ebx,%eax
-	pslld	$2,%xmm0
-	xorl	%ebx,%edi
-	addl	%edx,%ecx
-	psrld	$30,%xmm9
-	rorl	$7,%ebp
+	movdqa	%xmm3,48(%rsp)
 	addl	%edi,%ecx
-	addl	8(%rsp),%ebx
+	andl	%ebp,%esi
 	xorl	%eax,%ebp
+	pslld	$2,%xmm4
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	psrld	$30,%xmm13
+	xorl	%eax,%esi
 	movl	%ecx,%edi
+	addl	8(%rsp),%ebx
+	movups	-64(%r15),%xmm0
+.byte	102,15,56,220,209
+	por	%xmm13,%xmm4
+	xorl	%ebp,%edx
 	roll	$5,%ecx
-.byte	102,69,15,56,220,223
-	movups	48(%r15),%xmm14
-	por	%xmm9,%xmm0
-	andl	%ebp,%esi
-	xorl	%eax,%ebp
-	movdqa	%xmm0,%xmm10
-	xorl	%eax,%esi
+	pshufd	$238,%xmm11,%xmm3
+	addl	%esi,%ebx
+	andl	%edx,%edi
+	xorl	%ebp,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%esi,%ebx
 	addl	12(%rsp),%eax
-	xorl	%ebp,%edx
+	xorl	%ebp,%edi
 	movl	%ebx,%esi
 	roll	$5,%ebx
-	andl	%edx,%edi
-	xorl	%ebp,%edx
-	xorl	%ebp,%edi
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%edi,%eax
+	pxor	%xmm9,%xmm5
 	addl	16(%rsp),%ebp
-.byte	102,69,15,56,220,222
-	movups	64(%r15),%xmm15
-	pxor	%xmm5,%xmm1
-.byte	102,68,15,58,15,215,8
-	xorl	%edx,%esi
+	movups	-48(%r15),%xmm1
+.byte	102,15,56,220,208
+	xorl	%ecx,%esi
+	punpcklqdq	%xmm4,%xmm3
 	movl	%eax,%edi
 	roll	$5,%eax
-	pxor	%xmm2,%xmm1
-	xorl	%ecx,%esi
+	pxor	%xmm6,%xmm5
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	movdqa	%xmm12,%xmm13
+	rorl	$7,%ebx
+	paddd	%xmm4,%xmm12
 	addl	%eax,%ebp
-	movdqa	%xmm8,%xmm9
-	paddd	%xmm0,%xmm8
-	rorl	$7,%ebx
-	addl	%esi,%ebp
-	pxor	%xmm10,%xmm1
+	pxor	%xmm3,%xmm5
 	addl	20(%rsp),%edx
-	xorl	%ecx,%edi
+	xorl	%ebx,%edi
 	movl	%ebp,%esi
 	roll	$5,%ebp
-	movdqa	%xmm1,%xmm10
-	movdqa	%xmm8,0(%rsp)
-	xorl	%ebx,%edi
+	movdqa	%xmm5,%xmm3
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	movdqa	%xmm12,0(%rsp)
+	rorl	$7,%eax
 	addl	%ebp,%edx
-	rorl	$7,%eax
-	addl	%edi,%edx
-	pslld	$2,%xmm1
 	addl	24(%rsp),%ecx
-	xorl	%ebx,%esi
-	psrld	$30,%xmm10
+	pslld	$2,%xmm5
+	xorl	%eax,%esi
 	movl	%edx,%edi
+	psrld	$30,%xmm3
 	roll	$5,%edx
-	xorl	%eax,%esi
-.byte	102,69,15,56,220,223
-	movups	80(%r15),%xmm14
+	addl	%esi,%ecx
+	movups	-32(%r15),%xmm0
+.byte	102,15,56,220,209
+	xorl	%eax,%edi
+	rorl	$7,%ebp
+	por	%xmm3,%xmm5
 	addl	%edx,%ecx
-	rorl	$7,%ebp
-	addl	%esi,%ecx
-	por	%xmm10,%xmm1
 	addl	28(%rsp),%ebx
-	xorl	%eax,%edi
-	movdqa	%xmm1,%xmm8
+	pshufd	$238,%xmm4,%xmm12
+	xorl	%ebp,%edi
 	movl	%ecx,%esi
 	roll	$5,%ecx
-	xorl	%ebp,%edi
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%edi,%ebx
+	pxor	%xmm10,%xmm6
 	addl	32(%rsp),%eax
-	pxor	%xmm6,%xmm2
-.byte	102,68,15,58,15,192,8
-	xorl	%ebp,%esi
+	xorl	%edx,%esi
+	punpcklqdq	%xmm5,%xmm12
 	movl	%ebx,%edi
 	roll	$5,%ebx
-	pxor	%xmm3,%xmm2
-	xorl	%edx,%esi
+	pxor	%xmm7,%xmm6
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	movdqa	32(%r11),%xmm3
+	rorl	$7,%ecx
+	paddd	%xmm5,%xmm13
 	addl	%ebx,%eax
-	movdqa	32(%r11),%xmm10
-	paddd	%xmm1,%xmm9
-	rorl	$7,%ecx
-	addl	%esi,%eax
-	pxor	%xmm8,%xmm2
+	pxor	%xmm12,%xmm6
 	addl	36(%rsp),%ebp
-.byte	102,69,15,56,220,222
-	movups	96(%r15),%xmm15
-	xorl	%edx,%edi
+	movups	-16(%r15),%xmm1
+.byte	102,15,56,220,208
+	xorl	%ecx,%edi
 	movl	%eax,%esi
 	roll	$5,%eax
-	movdqa	%xmm2,%xmm8
-	movdqa	%xmm9,16(%rsp)
-	xorl	%ecx,%edi
+	movdqa	%xmm6,%xmm12
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	movdqa	%xmm13,16(%rsp)
+	rorl	$7,%ebx
 	addl	%eax,%ebp
-	rorl	$7,%ebx
-	addl	%edi,%ebp
-	pslld	$2,%xmm2
 	addl	40(%rsp),%edx
-	xorl	%ecx,%esi
-	psrld	$30,%xmm8
+	pslld	$2,%xmm6
+	xorl	%ebx,%esi
 	movl	%ebp,%edi
+	psrld	$30,%xmm12
 	roll	$5,%ebp
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	rorl	$7,%eax
+	por	%xmm12,%xmm6
 	addl	%ebp,%edx
-	rorl	$7,%eax
-	addl	%esi,%edx
-	por	%xmm8,%xmm2
 	addl	44(%rsp),%ecx
-	xorl	%ebx,%edi
-	movdqa	%xmm2,%xmm9
+	pshufd	$238,%xmm5,%xmm13
+	xorl	%eax,%edi
 	movl	%edx,%esi
 	roll	$5,%edx
-	xorl	%eax,%edi
-.byte	102,69,15,56,220,223
-	movups	112(%r15),%xmm14
+	addl	%edi,%ecx
+	movups	0(%r15),%xmm0
+.byte	102,15,56,220,209
+	xorl	%eax,%esi
+	rorl	$7,%ebp
 	addl	%edx,%ecx
-	rorl	$7,%ebp
-	addl	%edi,%ecx
+	pxor	%xmm11,%xmm7
 	addl	48(%rsp),%ebx
-	pxor	%xmm7,%xmm3
-.byte	102,68,15,58,15,201,8
-	xorl	%eax,%esi
+	xorl	%ebp,%esi
+	punpcklqdq	%xmm6,%xmm13
 	movl	%ecx,%edi
 	roll	$5,%ecx
-	pxor	%xmm4,%xmm3
-	xorl	%ebp,%esi
+	pxor	%xmm8,%xmm7
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	movdqa	%xmm3,%xmm12
+	rorl	$7,%edx
+	paddd	%xmm6,%xmm3
 	addl	%ecx,%ebx
-	movdqa	%xmm10,%xmm8
-	paddd	%xmm2,%xmm10
-	rorl	$7,%edx
-	addl	%esi,%ebx
-	pxor	%xmm9,%xmm3
+	pxor	%xmm13,%xmm7
 	addl	52(%rsp),%eax
-	xorl	%ebp,%edi
+	xorl	%edx,%edi
 	movl	%ebx,%esi
 	roll	$5,%ebx
-	movdqa	%xmm3,%xmm9
-	movdqa	%xmm10,32(%rsp)
-	xorl	%edx,%edi
+	movdqa	%xmm7,%xmm13
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	movdqa	%xmm3,32(%rsp)
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%edi,%eax
-	pslld	$2,%xmm3
 	addl	56(%rsp),%ebp
-.byte	102,69,15,56,220,222
-	movups	128(%r15),%xmm15
-	xorl	%edx,%esi
-	psrld	$30,%xmm9
+	movups	16(%r15),%xmm1
+.byte	102,15,56,220,208
+	pslld	$2,%xmm7
+	xorl	%ecx,%esi
 	movl	%eax,%edi
+	psrld	$30,%xmm13
 	roll	$5,%eax
-	xorl	%ecx,%esi
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	rorl	$7,%ebx
+	por	%xmm13,%xmm7
 	addl	%eax,%ebp
-	rorl	$7,%ebx
-	addl	%esi,%ebp
-	por	%xmm9,%xmm3
 	addl	60(%rsp),%edx
-	xorl	%ecx,%edi
-	movdqa	%xmm3,%xmm10
+	pshufd	$238,%xmm6,%xmm3
+	xorl	%ebx,%edi
 	movl	%ebp,%esi
 	roll	$5,%ebp
-	xorl	%ebx,%edi
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
 	addl	%ebp,%edx
-	rorl	$7,%eax
-	addl	%edi,%edx
+	pxor	%xmm4,%xmm8
 	addl	0(%rsp),%ecx
-	pxor	%xmm0,%xmm4
-.byte	102,68,15,58,15,210,8
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
+	punpcklqdq	%xmm7,%xmm3
 	movl	%edx,%edi
 	roll	$5,%edx
-	pxor	%xmm5,%xmm4
-	xorl	%eax,%esi
-.byte	102,69,15,56,220,223
-	movups	144(%r15),%xmm14
+	pxor	%xmm9,%xmm8
+	addl	%esi,%ecx
+	movups	32(%r15),%xmm0
+.byte	102,15,56,220,209
+	xorl	%eax,%edi
+	movdqa	%xmm12,%xmm13
+	rorl	$7,%ebp
+	paddd	%xmm7,%xmm12
 	addl	%edx,%ecx
-	movdqa	%xmm8,%xmm9
-	paddd	%xmm3,%xmm8
-	rorl	$7,%ebp
-	addl	%esi,%ecx
-	pxor	%xmm10,%xmm4
+	pxor	%xmm3,%xmm8
 	addl	4(%rsp),%ebx
-	xorl	%eax,%edi
+	xorl	%ebp,%edi
 	movl	%ecx,%esi
 	roll	$5,%ecx
-	movdqa	%xmm4,%xmm10
-	movdqa	%xmm8,48(%rsp)
-	xorl	%ebp,%edi
+	movdqa	%xmm8,%xmm3
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	movdqa	%xmm12,48(%rsp)
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%edi,%ebx
-	pslld	$2,%xmm4
 	addl	8(%rsp),%eax
-	xorl	%ebp,%esi
-	psrld	$30,%xmm10
+	pslld	$2,%xmm8
+	xorl	%edx,%esi
 	movl	%ebx,%edi
+	psrld	$30,%xmm3
 	roll	$5,%ebx
-	xorl	%edx,%esi
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	rorl	$7,%ecx
+	por	%xmm3,%xmm8
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%esi,%eax
-	por	%xmm10,%xmm4
 	addl	12(%rsp),%ebp
-.byte	102,69,15,56,220,222
-	movups	160(%r15),%xmm15
-	xorl	%edx,%edi
-	movdqa	%xmm4,%xmm8
+	movups	48(%r15),%xmm1
+.byte	102,15,56,220,208
+	pshufd	$238,%xmm7,%xmm12
+	xorl	%ecx,%edi
 	movl	%eax,%esi
 	roll	$5,%eax
-	xorl	%ecx,%edi
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
 	addl	%eax,%ebp
-	rorl	$7,%ebx
-	addl	%edi,%ebp
+	pxor	%xmm5,%xmm9
 	addl	16(%rsp),%edx
-	pxor	%xmm1,%xmm5
-.byte	102,68,15,58,15,195,8
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
+	punpcklqdq	%xmm8,%xmm12
 	movl	%ebp,%edi
 	roll	$5,%ebp
-	pxor	%xmm6,%xmm5
-	xorl	%ebx,%esi
+	pxor	%xmm10,%xmm9
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	movdqa	%xmm13,%xmm3
+	rorl	$7,%eax
+	paddd	%xmm8,%xmm13
 	addl	%ebp,%edx
-	movdqa	%xmm9,%xmm10
-	paddd	%xmm4,%xmm9
-	rorl	$7,%eax
-	addl	%esi,%edx
-	pxor	%xmm8,%xmm5
+	pxor	%xmm12,%xmm9
 	addl	20(%rsp),%ecx
-	xorl	%ebx,%edi
+	xorl	%eax,%edi
 	movl	%edx,%esi
 	roll	$5,%edx
-	movdqa	%xmm5,%xmm8
-	movdqa	%xmm9,0(%rsp)
-	xorl	%eax,%edi
+	movdqa	%xmm9,%xmm12
+	addl	%edi,%ecx
 	cmpl	$11,%r8d
 	jb	.Laesenclast2
-	movups	176(%r15),%xmm14
-.byte	102,69,15,56,220,223
-	movups	192(%r15),%xmm15
-.byte	102,69,15,56,220,222
+	movups	64(%r15),%xmm0
+.byte	102,15,56,220,209
+	movups	80(%r15),%xmm1
+.byte	102,15,56,220,208
 	je	.Laesenclast2
-	movups	208(%r15),%xmm14
-.byte	102,69,15,56,220,223
-	movups	224(%r15),%xmm15
-.byte	102,69,15,56,220,222
+	movups	96(%r15),%xmm0
+.byte	102,15,56,220,209
+	movups	112(%r15),%xmm1
+.byte	102,15,56,220,208
 .Laesenclast2:
-.byte	102,69,15,56,221,223
-	movups	16(%r15),%xmm14
+.byte	102,15,56,221,209
+	movups	16-112(%r15),%xmm0
+	xorl	%eax,%esi
+	movdqa	%xmm13,0(%rsp)
+	rorl	$7,%ebp
 	addl	%edx,%ecx
-	rorl	$7,%ebp
-	addl	%edi,%ecx
-	pslld	$2,%xmm5
 	addl	24(%rsp),%ebx
-	xorl	%eax,%esi
-	psrld	$30,%xmm8
+	pslld	$2,%xmm9
+	xorl	%ebp,%esi
 	movl	%ecx,%edi
+	psrld	$30,%xmm12
 	roll	$5,%ecx
-	xorl	%ebp,%esi
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	rorl	$7,%edx
+	por	%xmm12,%xmm9
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%esi,%ebx
-	por	%xmm8,%xmm5
 	addl	28(%rsp),%eax
-	xorl	%ebp,%edi
-	movdqa	%xmm5,%xmm9
+	pshufd	$238,%xmm8,%xmm13
+	rorl	$7,%ecx
 	movl	%ebx,%esi
+	xorl	%edx,%edi
 	roll	$5,%ebx
-	xorl	%edx,%edi
-	addl	%ebx,%eax
-	rorl	$7,%ecx
 	addl	%edi,%eax
-	movl	%ecx,%edi
-	movups	32(%r12),%xmm12
-	xorps	%xmm13,%xmm12
-	movups	%xmm11,16(%r13,%r12,1)
-	xorps	%xmm12,%xmm11
-.byte	102,69,15,56,220,222
-	movups	32(%r15),%xmm15
-	pxor	%xmm2,%xmm6
-.byte	102,68,15,58,15,204,8
+	xorl	%ecx,%esi
 	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	pxor	%xmm6,%xmm10
 	addl	32(%rsp),%ebp
-	andl	%edx,%edi
-	pxor	%xmm7,%xmm6
+	movups	32(%r12),%xmm14
+	xorps	%xmm15,%xmm14
+	movups	%xmm2,16(%r13,%r12,1)
+	xorps	%xmm14,%xmm2
+	movups	-80(%r15),%xmm1
+.byte	102,15,56,220,208
 	andl	%ecx,%esi
+	xorl	%edx,%ecx
 	rorl	$7,%ebx
-	movdqa	%xmm10,%xmm8
-	paddd	%xmm5,%xmm10
-	addl	%edi,%ebp
+	punpcklqdq	%xmm9,%xmm13
 	movl	%eax,%edi
-	pxor	%xmm9,%xmm6
+	xorl	%ecx,%esi
+	pxor	%xmm11,%xmm10
 	roll	$5,%eax
 	addl	%esi,%ebp
-	xorl	%edx,%ecx
+	movdqa	%xmm3,%xmm12
+	xorl	%ebx,%edi
+	paddd	%xmm9,%xmm3
+	xorl	%ecx,%ebx
+	pxor	%xmm13,%xmm10
 	addl	%eax,%ebp
-	movdqa	%xmm6,%xmm9
-	movdqa	%xmm10,16(%rsp)
-	movl	%ebx,%esi
-	xorl	%ecx,%ebx
 	addl	36(%rsp),%edx
-	andl	%ecx,%esi
-	pslld	$2,%xmm6
 	andl	%ebx,%edi
+	xorl	%ecx,%ebx
 	rorl	$7,%eax
-	psrld	$30,%xmm9
-	addl	%esi,%edx
+	movdqa	%xmm10,%xmm13
 	movl	%ebp,%esi
+	xorl	%ebx,%edi
+	movdqa	%xmm3,16(%rsp)
 	roll	$5,%ebp
-.byte	102,69,15,56,220,223
-	movups	48(%r15),%xmm14
 	addl	%edi,%edx
-	xorl	%ecx,%ebx
+	movups	-64(%r15),%xmm0
+.byte	102,15,56,220,209
+	xorl	%eax,%esi
+	pslld	$2,%xmm10
+	xorl	%ebx,%eax
 	addl	%ebp,%edx
-	por	%xmm9,%xmm6
-	movl	%eax,%edi
-	xorl	%ebx,%eax
-	movdqa	%xmm6,%xmm10
+	psrld	$30,%xmm13
 	addl	40(%rsp),%ecx
-	andl	%ebx,%edi
 	andl	%eax,%esi
+	xorl	%ebx,%eax
+	por	%xmm13,%xmm10
 	rorl	$7,%ebp
-	addl	%edi,%ecx
 	movl	%edx,%edi
+	xorl	%eax,%esi
 	roll	$5,%edx
+	pshufd	$238,%xmm9,%xmm3
 	addl	%esi,%ecx
-	xorl	%ebx,%eax
+	xorl	%ebp,%edi
+	xorl	%eax,%ebp
 	addl	%edx,%ecx
-	movl	%ebp,%esi
-	xorl	%eax,%ebp
 	addl	44(%rsp),%ebx
-	andl	%eax,%esi
 	andl	%ebp,%edi
-.byte	102,69,15,56,220,222
-	movups	64(%r15),%xmm15
+	xorl	%eax,%ebp
 	rorl	$7,%edx
-	addl	%esi,%ebx
+	movups	-48(%r15),%xmm1
+.byte	102,15,56,220,208
 	movl	%ecx,%esi
+	xorl	%ebp,%edi
 	roll	$5,%ecx
 	addl	%edi,%ebx
-	xorl	%eax,%ebp
+	xorl	%edx,%esi
+	xorl	%ebp,%edx
 	addl	%ecx,%ebx
-	movl	%edx,%edi
-	pxor	%xmm3,%xmm7
-.byte	102,68,15,58,15,213,8
-	xorl	%ebp,%edx
+	pxor	%xmm7,%xmm11
 	addl	48(%rsp),%eax
-	andl	%ebp,%edi
-	pxor	%xmm0,%xmm7
 	andl	%edx,%esi
+	xorl	%ebp,%edx
 	rorl	$7,%ecx
-	movdqa	48(%r11),%xmm9
-	paddd	%xmm6,%xmm8
-	addl	%edi,%eax
+	punpcklqdq	%xmm10,%xmm3
 	movl	%ebx,%edi
-	pxor	%xmm10,%xmm7
+	xorl	%edx,%esi
+	pxor	%xmm4,%xmm11
 	roll	$5,%ebx
 	addl	%esi,%eax
-	xorl	%ebp,%edx
+	movdqa	48(%r11),%xmm13
+	xorl	%ecx,%edi
+	paddd	%xmm10,%xmm12
+	xorl	%edx,%ecx
+	pxor	%xmm3,%xmm11
 	addl	%ebx,%eax
-	movdqa	%xmm7,%xmm10
-	movdqa	%xmm8,32(%rsp)
-	movl	%ecx,%esi
-.byte	102,69,15,56,220,223
-	movups	80(%r15),%xmm14
-	xorl	%edx,%ecx
 	addl	52(%rsp),%ebp
-	andl	%edx,%esi
-	pslld	$2,%xmm7
+	movups	-32(%r15),%xmm0
+.byte	102,15,56,220,209
 	andl	%ecx,%edi
+	xorl	%edx,%ecx
 	rorl	$7,%ebx
-	psrld	$30,%xmm10
-	addl	%esi,%ebp
+	movdqa	%xmm11,%xmm3
 	movl	%eax,%esi
+	xorl	%ecx,%edi
+	movdqa	%xmm12,32(%rsp)
 	roll	$5,%eax
 	addl	%edi,%ebp
-	xorl	%edx,%ecx
+	xorl	%ebx,%esi
+	pslld	$2,%xmm11
+	xorl	%ecx,%ebx
 	addl	%eax,%ebp
-	por	%xmm10,%xmm7
-	movl	%ebx,%edi
-	xorl	%ecx,%ebx
-	movdqa	%xmm7,%xmm8
+	psrld	$30,%xmm3
 	addl	56(%rsp),%edx
-	andl	%ecx,%edi
 	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	por	%xmm3,%xmm11
 	rorl	$7,%eax
-	addl	%edi,%edx
 	movl	%ebp,%edi
+	xorl	%ebx,%esi
 	roll	$5,%ebp
-.byte	102,69,15,56,220,222
-	movups	96(%r15),%xmm15
+	pshufd	$238,%xmm10,%xmm12
 	addl	%esi,%edx
-	xorl	%ecx,%ebx
+	movups	-16(%r15),%xmm1
+.byte	102,15,56,220,208
+	xorl	%eax,%edi
+	xorl	%ebx,%eax
 	addl	%ebp,%edx
-	movl	%eax,%esi
-	xorl	%ebx,%eax
 	addl	60(%rsp),%ecx
-	andl	%ebx,%esi
 	andl	%eax,%edi
+	xorl	%ebx,%eax
 	rorl	$7,%ebp
-	addl	%esi,%ecx
 	movl	%edx,%esi
+	xorl	%eax,%edi
 	roll	$5,%edx
 	addl	%edi,%ecx
-	xorl	%ebx,%eax
+	xorl	%ebp,%esi
+	xorl	%eax,%ebp
 	addl	%edx,%ecx
-	movl	%ebp,%edi
-	pxor	%xmm4,%xmm0
-.byte	102,68,15,58,15,198,8
-	xorl	%eax,%ebp
+	pxor	%xmm8,%xmm4
 	addl	0(%rsp),%ebx
-	andl	%eax,%edi
-	pxor	%xmm1,%xmm0
 	andl	%ebp,%esi
-.byte	102,69,15,56,220,223
-	movups	112(%r15),%xmm14
+	xorl	%eax,%ebp
 	rorl	$7,%edx
-	movdqa	%xmm9,%xmm10
-	paddd	%xmm7,%xmm9
-	addl	%edi,%ebx
+	movups	0(%r15),%xmm0
+.byte	102,15,56,220,209
+	punpcklqdq	%xmm11,%xmm12
 	movl	%ecx,%edi
-	pxor	%xmm8,%xmm0
+	xorl	%ebp,%esi
+	pxor	%xmm5,%xmm4
 	roll	$5,%ecx
 	addl	%esi,%ebx
-	xorl	%eax,%ebp
+	movdqa	%xmm13,%xmm3
+	xorl	%edx,%edi
+	paddd	%xmm11,%xmm13
+	xorl	%ebp,%edx
+	pxor	%xmm12,%xmm4
 	addl	%ecx,%ebx
-	movdqa	%xmm0,%xmm8
-	movdqa	%xmm9,48(%rsp)
-	movl	%edx,%esi
-	xorl	%ebp,%edx
 	addl	4(%rsp),%eax
-	andl	%ebp,%esi
-	pslld	$2,%xmm0
 	andl	%edx,%edi
+	xorl	%ebp,%edx
 	rorl	$7,%ecx
-	psrld	$30,%xmm8
-	addl	%esi,%eax
+	movdqa	%xmm4,%xmm12
 	movl	%ebx,%esi
+	xorl	%edx,%edi
+	movdqa	%xmm13,48(%rsp)
 	roll	$5,%ebx
 	addl	%edi,%eax
-	xorl	%ebp,%edx
+	xorl	%ecx,%esi
+	pslld	$2,%xmm4
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	por	%xmm8,%xmm0
-	movl	%ecx,%edi
-.byte	102,69,15,56,220,222
-	movups	128(%r15),%xmm15
-	xorl	%edx,%ecx
-	movdqa	%xmm0,%xmm9
+	psrld	$30,%xmm12
 	addl	8(%rsp),%ebp
-	andl	%edx,%edi
+	movups	16(%r15),%xmm1
+.byte	102,15,56,220,208
 	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	por	%xmm12,%xmm4
 	rorl	$7,%ebx
-	addl	%edi,%ebp
 	movl	%eax,%edi
+	xorl	%ecx,%esi
 	roll	$5,%eax
+	pshufd	$238,%xmm11,%xmm13
 	addl	%esi,%ebp
-	xorl	%edx,%ecx
+	xorl	%ebx,%edi
+	xorl	%ecx,%ebx
 	addl	%eax,%ebp
-	movl	%ebx,%esi
-	xorl	%ecx,%ebx
 	addl	12(%rsp),%edx
-	andl	%ecx,%esi
 	andl	%ebx,%edi
+	xorl	%ecx,%ebx
 	rorl	$7,%eax
-	addl	%esi,%edx
 	movl	%ebp,%esi
+	xorl	%ebx,%edi
 	roll	$5,%ebp
-.byte	102,69,15,56,220,223
-	movups	144(%r15),%xmm14
 	addl	%edi,%edx
-	xorl	%ecx,%ebx
+	movups	32(%r15),%xmm0
+.byte	102,15,56,220,209
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
 	addl	%ebp,%edx
-	movl	%eax,%edi
-	pxor	%xmm5,%xmm1
-.byte	102,68,15,58,15,207,8
-	xorl	%ebx,%eax
+	pxor	%xmm9,%xmm5
 	addl	16(%rsp),%ecx
-	andl	%ebx,%edi
-	pxor	%xmm2,%xmm1
 	andl	%eax,%esi
+	xorl	%ebx,%eax
 	rorl	$7,%ebp
-	movdqa	%xmm10,%xmm8
-	paddd	%xmm0,%xmm10
-	addl	%edi,%ecx
+	punpcklqdq	%xmm4,%xmm13
 	movl	%edx,%edi
-	pxor	%xmm9,%xmm1
+	xorl	%eax,%esi
+	pxor	%xmm6,%xmm5
 	roll	$5,%edx
 	addl	%esi,%ecx
-	xorl	%ebx,%eax
+	movdqa	%xmm3,%xmm12
+	xorl	%ebp,%edi
+	paddd	%xmm4,%xmm3
+	xorl	%eax,%ebp
+	pxor	%xmm13,%xmm5
 	addl	%edx,%ecx
-	movdqa	%xmm1,%xmm9
-	movdqa	%xmm10,0(%rsp)
-	movl	%ebp,%esi
-	xorl	%eax,%ebp
 	addl	20(%rsp),%ebx
-	andl	%eax,%esi
-	pslld	$2,%xmm1
 	andl	%ebp,%edi
-.byte	102,69,15,56,220,222
-	movups	160(%r15),%xmm15
+	xorl	%eax,%ebp
 	rorl	$7,%edx
-	psrld	$30,%xmm9
-	addl	%esi,%ebx
+	movups	48(%r15),%xmm1
+.byte	102,15,56,220,208
+	movdqa	%xmm5,%xmm13
 	movl	%ecx,%esi
+	xorl	%ebp,%edi
+	movdqa	%xmm3,0(%rsp)
 	roll	$5,%ecx
 	addl	%edi,%ebx
-	xorl	%eax,%ebp
+	xorl	%edx,%esi
+	pslld	$2,%xmm5
+	xorl	%ebp,%edx
 	addl	%ecx,%ebx
-	por	%xmm9,%xmm1
-	movl	%edx,%edi
-	xorl	%ebp,%edx
-	movdqa	%xmm1,%xmm10
+	psrld	$30,%xmm13
 	addl	24(%rsp),%eax
-	andl	%ebp,%edi
 	andl	%edx,%esi
+	xorl	%ebp,%edx
+	por	%xmm13,%xmm5
 	rorl	$7,%ecx
-	addl	%edi,%eax
 	movl	%ebx,%edi
+	xorl	%edx,%esi
 	roll	$5,%ebx
+	pshufd	$238,%xmm4,%xmm3
 	addl	%esi,%eax
-	xorl	%ebp,%edx
+	xorl	%ecx,%edi
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	movl	%ecx,%esi
+	addl	28(%rsp),%ebp
 	cmpl	$11,%r8d
 	jb	.Laesenclast3
-	movups	176(%r15),%xmm14
-.byte	102,69,15,56,220,223
-	movups	192(%r15),%xmm15
-.byte	102,69,15,56,220,222
+	movups	64(%r15),%xmm0
+.byte	102,15,56,220,209
+	movups	80(%r15),%xmm1
+.byte	102,15,56,220,208
 	je	.Laesenclast3
-	movups	208(%r15),%xmm14
-.byte	102,69,15,56,220,223
-	movups	224(%r15),%xmm15
-.byte	102,69,15,56,220,222
+	movups	96(%r15),%xmm0
+.byte	102,15,56,220,209
+	movups	112(%r15),%xmm1
+.byte	102,15,56,220,208
 .Laesenclast3:
-.byte	102,69,15,56,221,223
-	movups	16(%r15),%xmm14
+.byte	102,15,56,221,209
+	movups	16-112(%r15),%xmm0
+	andl	%ecx,%edi
 	xorl	%edx,%ecx
-	addl	28(%rsp),%ebp
-	andl	%edx,%esi
-	andl	%ecx,%edi
 	rorl	$7,%ebx
-	addl	%esi,%ebp
 	movl	%eax,%esi
+	xorl	%ecx,%edi
 	roll	$5,%eax
 	addl	%edi,%ebp
-	xorl	%edx,%ecx
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
 	addl	%eax,%ebp
-	movl	%ebx,%edi
-	pxor	%xmm6,%xmm2
-.byte	102,68,15,58,15,208,8
-	xorl	%ecx,%ebx
+	pxor	%xmm10,%xmm6
 	addl	32(%rsp),%edx
-	andl	%ecx,%edi
-	pxor	%xmm3,%xmm2
 	andl	%ebx,%esi
+	xorl	%ecx,%ebx
 	rorl	$7,%eax
-	movdqa	%xmm8,%xmm9
-	paddd	%xmm1,%xmm8
-	addl	%edi,%edx
+	punpcklqdq	%xmm5,%xmm3
 	movl	%ebp,%edi
-	pxor	%xmm10,%xmm2
+	xorl	%ebx,%esi
+	pxor	%xmm7,%xmm6
 	roll	$5,%ebp
-	movups	48(%r12),%xmm12
-	xorps	%xmm13,%xmm12
-	movups	%xmm11,32(%r13,%r12,1)
-	xorps	%xmm12,%xmm11
-.byte	102,69,15,56,220,222
-	movups	32(%r15),%xmm15
 	addl	%esi,%edx
-	xorl	%ecx,%ebx
+	movups	48(%r12),%xmm14
+	xorps	%xmm15,%xmm14
+	movups	%xmm2,32(%r13,%r12,1)
+	xorps	%xmm14,%xmm2
+	movups	-80(%r15),%xmm1
+.byte	102,15,56,220,208
+	movdqa	%xmm12,%xmm13
+	xorl	%eax,%edi
+	paddd	%xmm5,%xmm12
+	xorl	%ebx,%eax
+	pxor	%xmm3,%xmm6
 	addl	%ebp,%edx
-	movdqa	%xmm2,%xmm10
-	movdqa	%xmm8,16(%rsp)
-	movl	%eax,%esi
-	xorl	%ebx,%eax
 	addl	36(%rsp),%ecx
-	andl	%ebx,%esi
-	pslld	$2,%xmm2
 	andl	%eax,%edi
+	xorl	%ebx,%eax
 	rorl	$7,%ebp
-	psrld	$30,%xmm10
-	addl	%esi,%ecx
+	movdqa	%xmm6,%xmm3
 	movl	%edx,%esi
+	xorl	%eax,%edi
+	movdqa	%xmm12,16(%rsp)
 	roll	$5,%edx
 	addl	%edi,%ecx
-	xorl	%ebx,%eax
+	xorl	%ebp,%esi
+	pslld	$2,%xmm6
+	xorl	%eax,%ebp
 	addl	%edx,%ecx
-	por	%xmm10,%xmm2
-	movl	%ebp,%edi
-	xorl	%eax,%ebp
-	movdqa	%xmm2,%xmm8
+	psrld	$30,%xmm3
 	addl	40(%rsp),%ebx
-	andl	%eax,%edi
 	andl	%ebp,%esi
-.byte	102,69,15,56,220,223
-	movups	48(%r15),%xmm14
+	xorl	%eax,%ebp
+	por	%xmm3,%xmm6
 	rorl	$7,%edx
-	addl	%edi,%ebx
+	movups	-64(%r15),%xmm0
+.byte	102,15,56,220,209
 	movl	%ecx,%edi
+	xorl	%ebp,%esi
 	roll	$5,%ecx
+	pshufd	$238,%xmm5,%xmm12
 	addl	%esi,%ebx
-	xorl	%eax,%ebp
+	xorl	%edx,%edi
+	xorl	%ebp,%edx
 	addl	%ecx,%ebx
-	movl	%edx,%esi
-	xorl	%ebp,%edx
 	addl	44(%rsp),%eax
-	andl	%ebp,%esi
 	andl	%edx,%edi
+	xorl	%ebp,%edx
 	rorl	$7,%ecx
-	addl	%esi,%eax
 	movl	%ebx,%esi
+	xorl	%edx,%edi
 	roll	$5,%ebx
 	addl	%edi,%eax
-	xorl	%ebp,%edx
+	xorl	%edx,%esi
 	addl	%ebx,%eax
+	pxor	%xmm11,%xmm7
 	addl	48(%rsp),%ebp
-.byte	102,69,15,56,220,222
-	movups	64(%r15),%xmm15
-	pxor	%xmm7,%xmm3
-.byte	102,68,15,58,15,193,8
-	xorl	%edx,%esi
+	movups	-48(%r15),%xmm1
+.byte	102,15,56,220,208
+	xorl	%ecx,%esi
+	punpcklqdq	%xmm6,%xmm12
 	movl	%eax,%edi
 	roll	$5,%eax
-	pxor	%xmm4,%xmm3
-	xorl	%ecx,%esi
+	pxor	%xmm8,%xmm7
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	movdqa	%xmm13,%xmm3
+	rorl	$7,%ebx
+	paddd	%xmm6,%xmm13
 	addl	%eax,%ebp
-	movdqa	%xmm9,%xmm10
-	paddd	%xmm2,%xmm9
-	rorl	$7,%ebx
-	addl	%esi,%ebp
-	pxor	%xmm8,%xmm3
+	pxor	%xmm12,%xmm7
 	addl	52(%rsp),%edx
-	xorl	%ecx,%edi
+	xorl	%ebx,%edi
 	movl	%ebp,%esi
 	roll	$5,%ebp
-	movdqa	%xmm3,%xmm8
-	movdqa	%xmm9,32(%rsp)
-	xorl	%ebx,%edi
+	movdqa	%xmm7,%xmm12
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	movdqa	%xmm13,32(%rsp)
+	rorl	$7,%eax
 	addl	%ebp,%edx
-	rorl	$7,%eax
-	addl	%edi,%edx
-	pslld	$2,%xmm3
 	addl	56(%rsp),%ecx
-	xorl	%ebx,%esi
-	psrld	$30,%xmm8
+	pslld	$2,%xmm7
+	xorl	%eax,%esi
 	movl	%edx,%edi
+	psrld	$30,%xmm12
 	roll	$5,%edx
-	xorl	%eax,%esi
-.byte	102,69,15,56,220,223
-	movups	80(%r15),%xmm14
+	addl	%esi,%ecx
+	movups	-32(%r15),%xmm0
+.byte	102,15,56,220,209
+	xorl	%eax,%edi
+	rorl	$7,%ebp
+	por	%xmm12,%xmm7
 	addl	%edx,%ecx
-	rorl	$7,%ebp
-	addl	%esi,%ecx
-	por	%xmm8,%xmm3
 	addl	60(%rsp),%ebx
-	xorl	%eax,%edi
+	xorl	%ebp,%edi
 	movl	%ecx,%esi
 	roll	$5,%ecx
-	xorl	%ebp,%edi
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%edi,%ebx
 	addl	0(%rsp),%eax
-	paddd	%xmm3,%xmm10
-	xorl	%ebp,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%edi
 	roll	$5,%ebx
-	xorl	%edx,%esi
-	movdqa	%xmm10,48(%rsp)
+	paddd	%xmm7,%xmm3
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	movdqa	%xmm3,48(%rsp)
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%esi,%eax
 	addl	4(%rsp),%ebp
-.byte	102,69,15,56,220,222
-	movups	96(%r15),%xmm15
-	xorl	%edx,%edi
+	movups	-16(%r15),%xmm1
+.byte	102,15,56,220,208
+	xorl	%ecx,%edi
 	movl	%eax,%esi
 	roll	$5,%eax
-	xorl	%ecx,%edi
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
 	addl	%eax,%ebp
-	rorl	$7,%ebx
-	addl	%edi,%ebp
 	addl	8(%rsp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	movl	%ebp,%edi
 	roll	$5,%ebp
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	rorl	$7,%eax
 	addl	%ebp,%edx
-	rorl	$7,%eax
-	addl	%esi,%edx
 	addl	12(%rsp),%ecx
-	xorl	%ebx,%edi
+	xorl	%eax,%edi
 	movl	%edx,%esi
 	roll	$5,%edx
-	xorl	%eax,%edi
-.byte	102,69,15,56,220,223
-	movups	112(%r15),%xmm14
+	addl	%edi,%ecx
+	movups	0(%r15),%xmm0
+.byte	102,15,56,220,209
+	xorl	%eax,%esi
+	rorl	$7,%ebp
 	addl	%edx,%ecx
-	rorl	$7,%ebp
-	addl	%edi,%ecx
 	cmpq	%r14,%r10
 	je	.Ldone_ssse3
-	movdqa	64(%r11),%xmm6
-	movdqa	0(%r11),%xmm9
-	movdqu	0(%r10),%xmm0
-	movdqu	16(%r10),%xmm1
-	movdqu	32(%r10),%xmm2
-	movdqu	48(%r10),%xmm3
-.byte	102,15,56,0,198
+	movdqa	64(%r11),%xmm3
+	movdqa	0(%r11),%xmm13
+	movdqu	0(%r10),%xmm4
+	movdqu	16(%r10),%xmm5
+	movdqu	32(%r10),%xmm6
+	movdqu	48(%r10),%xmm7
+.byte	102,15,56,0,227
 	addq	$64,%r10
 	addl	16(%rsp),%ebx
-	xorl	%eax,%esi
-.byte	102,15,56,0,206
+	xorl	%ebp,%esi
 	movl	%ecx,%edi
+.byte	102,15,56,0,235
 	roll	$5,%ecx
-	paddd	%xmm9,%xmm0
-	xorl	%ebp,%esi
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	rorl	$7,%edx
+	paddd	%xmm13,%xmm4
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%esi,%ebx
-	movdqa	%xmm0,0(%rsp)
 	addl	20(%rsp),%eax
-	xorl	%ebp,%edi
-	psubd	%xmm9,%xmm0
+	xorl	%edx,%edi
 	movl	%ebx,%esi
+	movdqa	%xmm4,0(%rsp)
 	roll	$5,%ebx
-	xorl	%edx,%edi
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
+	psubd	%xmm13,%xmm4
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%edi,%eax
 	addl	24(%rsp),%ebp
-.byte	102,69,15,56,220,222
-	movups	128(%r15),%xmm15
-	xorl	%edx,%esi
+	movups	16(%r15),%xmm1
+.byte	102,15,56,220,208
+	xorl	%ecx,%esi
 	movl	%eax,%edi
 	roll	$5,%eax
-	xorl	%ecx,%esi
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	rorl	$7,%ebx
 	addl	%eax,%ebp
-	rorl	$7,%ebx
-	addl	%esi,%ebp
 	addl	28(%rsp),%edx
-	xorl	%ecx,%edi
+	xorl	%ebx,%edi
 	movl	%ebp,%esi
 	roll	$5,%ebp
-	xorl	%ebx,%edi
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
 	addl	%ebp,%edx
-	rorl	$7,%eax
-	addl	%edi,%edx
 	addl	32(%rsp),%ecx
-	xorl	%ebx,%esi
-.byte	102,15,56,0,214
+	xorl	%eax,%esi
 	movl	%edx,%edi
+.byte	102,15,56,0,243
 	roll	$5,%edx
-	paddd	%xmm9,%xmm1
-	xorl	%eax,%esi
-.byte	102,69,15,56,220,223
-	movups	144(%r15),%xmm14
+	addl	%esi,%ecx
+	movups	32(%r15),%xmm0
+.byte	102,15,56,220,209
+	xorl	%eax,%edi
+	rorl	$7,%ebp
+	paddd	%xmm13,%xmm5
 	addl	%edx,%ecx
-	rorl	$7,%ebp
-	addl	%esi,%ecx
-	movdqa	%xmm1,16(%rsp)
 	addl	36(%rsp),%ebx
-	xorl	%eax,%edi
-	psubd	%xmm9,%xmm1
+	xorl	%ebp,%edi
 	movl	%ecx,%esi
+	movdqa	%xmm5,16(%rsp)
 	roll	$5,%ecx
-	xorl	%ebp,%edi
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	rorl	$7,%edx
+	psubd	%xmm13,%xmm5
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%edi,%ebx
 	addl	40(%rsp),%eax
-	xorl	%ebp,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%edi
 	roll	$5,%ebx
-	xorl	%edx,%esi
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%esi,%eax
 	addl	44(%rsp),%ebp
-.byte	102,69,15,56,220,222
-	movups	160(%r15),%xmm15
-	xorl	%edx,%edi
+	movups	48(%r15),%xmm1
+.byte	102,15,56,220,208
+	xorl	%ecx,%edi
 	movl	%eax,%esi
 	roll	$5,%eax
-	xorl	%ecx,%edi
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
 	addl	%eax,%ebp
-	rorl	$7,%ebx
-	addl	%edi,%ebp
 	addl	48(%rsp),%edx
-	xorl	%ecx,%esi
-.byte	102,15,56,0,222
+	xorl	%ebx,%esi
 	movl	%ebp,%edi
+.byte	102,15,56,0,251
 	roll	$5,%ebp
-	paddd	%xmm9,%xmm2
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	rorl	$7,%eax
+	paddd	%xmm13,%xmm6
 	addl	%ebp,%edx
-	rorl	$7,%eax
-	addl	%esi,%edx
-	movdqa	%xmm2,32(%rsp)
 	addl	52(%rsp),%ecx
-	xorl	%ebx,%edi
-	psubd	%xmm9,%xmm2
+	xorl	%eax,%edi
 	movl	%edx,%esi
+	movdqa	%xmm6,32(%rsp)
 	roll	$5,%edx
-	xorl	%eax,%edi
+	addl	%edi,%ecx
 	cmpl	$11,%r8d
 	jb	.Laesenclast4
-	movups	176(%r15),%xmm14
-.byte	102,69,15,56,220,223
-	movups	192(%r15),%xmm15
-.byte	102,69,15,56,220,222
+	movups	64(%r15),%xmm0
+.byte	102,15,56,220,209
+	movups	80(%r15),%xmm1
+.byte	102,15,56,220,208
 	je	.Laesenclast4
-	movups	208(%r15),%xmm14
-.byte	102,69,15,56,220,223
-	movups	224(%r15),%xmm15
-.byte	102,69,15,56,220,222
+	movups	96(%r15),%xmm0
+.byte	102,15,56,220,209
+	movups	112(%r15),%xmm1
+.byte	102,15,56,220,208
 .Laesenclast4:
-.byte	102,69,15,56,221,223
-	movups	16(%r15),%xmm14
+.byte	102,15,56,221,209
+	movups	16-112(%r15),%xmm0
+	xorl	%eax,%esi
+	rorl	$7,%ebp
+	psubd	%xmm13,%xmm6
 	addl	%edx,%ecx
-	rorl	$7,%ebp
-	addl	%edi,%ecx
 	addl	56(%rsp),%ebx
-	xorl	%eax,%esi
+	xorl	%ebp,%esi
 	movl	%ecx,%edi
 	roll	$5,%ecx
-	xorl	%ebp,%esi
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%esi,%ebx
 	addl	60(%rsp),%eax
-	xorl	%ebp,%edi
+	xorl	%edx,%edi
 	movl	%ebx,%esi
 	roll	$5,%ebx
-	xorl	%edx,%edi
+	addl	%edi,%eax
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%edi,%eax
-	movups	%xmm11,48(%r13,%r12,1)
+	movups	%xmm2,48(%r13,%r12,1)
 	leaq	64(%r12),%r12
 
 	addl	0(%r9),%eax
@@ -1245,129 +1227,130 @@
 	movl	%esi,4(%r9)
 	movl	%esi,%ebx
 	movl	%ecx,8(%r9)
+	movl	%ecx,%edi
 	movl	%edx,12(%r9)
+	xorl	%edx,%edi
 	movl	%ebp,16(%r9)
+	andl	%edi,%esi
 	jmp	.Loop_ssse3
 
-.align	16
 .Ldone_ssse3:
 	addl	16(%rsp),%ebx
-	xorl	%eax,%esi
+	xorl	%ebp,%esi
 	movl	%ecx,%edi
 	roll	$5,%ecx
-	xorl	%ebp,%esi
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%esi,%ebx
 	addl	20(%rsp),%eax
-	xorl	%ebp,%edi
+	xorl	%edx,%edi
 	movl	%ebx,%esi
 	roll	$5,%ebx
-	xorl	%edx,%edi
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%edi,%eax
 	addl	24(%rsp),%ebp
-.byte	102,69,15,56,220,222
-	movups	128(%r15),%xmm15
-	xorl	%edx,%esi
+	movups	16(%r15),%xmm1
+.byte	102,15,56,220,208
+	xorl	%ecx,%esi
 	movl	%eax,%edi
 	roll	$5,%eax
-	xorl	%ecx,%esi
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	rorl	$7,%ebx
 	addl	%eax,%ebp
-	rorl	$7,%ebx
-	addl	%esi,%ebp
 	addl	28(%rsp),%edx
-	xorl	%ecx,%edi
+	xorl	%ebx,%edi
 	movl	%ebp,%esi
 	roll	$5,%ebp
-	xorl	%ebx,%edi
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
 	addl	%ebp,%edx
-	rorl	$7,%eax
-	addl	%edi,%edx
 	addl	32(%rsp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	movl	%edx,%edi
 	roll	$5,%edx
-	xorl	%eax,%esi
-.byte	102,69,15,56,220,223
-	movups	144(%r15),%xmm14
+	addl	%esi,%ecx
+	movups	32(%r15),%xmm0
+.byte	102,15,56,220,209
+	xorl	%eax,%edi
+	rorl	$7,%ebp
 	addl	%edx,%ecx
-	rorl	$7,%ebp
-	addl	%esi,%ecx
 	addl	36(%rsp),%ebx
-	xorl	%eax,%edi
+	xorl	%ebp,%edi
 	movl	%ecx,%esi
 	roll	$5,%ecx
-	xorl	%ebp,%edi
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%edi,%ebx
 	addl	40(%rsp),%eax
-	xorl	%ebp,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%edi
 	roll	$5,%ebx
-	xorl	%edx,%esi
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%esi,%eax
 	addl	44(%rsp),%ebp
-.byte	102,69,15,56,220,222
-	movups	160(%r15),%xmm15
-	xorl	%edx,%edi
+	movups	48(%r15),%xmm1
+.byte	102,15,56,220,208
+	xorl	%ecx,%edi
 	movl	%eax,%esi
 	roll	$5,%eax
-	xorl	%ecx,%edi
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
 	addl	%eax,%ebp
-	rorl	$7,%ebx
-	addl	%edi,%ebp
 	addl	48(%rsp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	movl	%ebp,%edi
 	roll	$5,%ebp
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	rorl	$7,%eax
 	addl	%ebp,%edx
-	rorl	$7,%eax
-	addl	%esi,%edx
 	addl	52(%rsp),%ecx
-	xorl	%ebx,%edi
+	xorl	%eax,%edi
 	movl	%edx,%esi
 	roll	$5,%edx
-	xorl	%eax,%edi
+	addl	%edi,%ecx
 	cmpl	$11,%r8d
 	jb	.Laesenclast5
-	movups	176(%r15),%xmm14
-.byte	102,69,15,56,220,223
-	movups	192(%r15),%xmm15
-.byte	102,69,15,56,220,222
+	movups	64(%r15),%xmm0
+.byte	102,15,56,220,209
+	movups	80(%r15),%xmm1
+.byte	102,15,56,220,208
 	je	.Laesenclast5
-	movups	208(%r15),%xmm14
-.byte	102,69,15,56,220,223
-	movups	224(%r15),%xmm15
-.byte	102,69,15,56,220,222
+	movups	96(%r15),%xmm0
+.byte	102,15,56,220,209
+	movups	112(%r15),%xmm1
+.byte	102,15,56,220,208
 .Laesenclast5:
-.byte	102,69,15,56,221,223
-	movups	16(%r15),%xmm14
+.byte	102,15,56,221,209
+	movups	16-112(%r15),%xmm0
+	xorl	%eax,%esi
+	rorl	$7,%ebp
 	addl	%edx,%ecx
-	rorl	$7,%ebp
-	addl	%edi,%ecx
 	addl	56(%rsp),%ebx
-	xorl	%eax,%esi
+	xorl	%ebp,%esi
 	movl	%ecx,%edi
 	roll	$5,%ecx
-	xorl	%ebp,%esi
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%esi,%ebx
 	addl	60(%rsp),%eax
-	xorl	%ebp,%edi
+	xorl	%edx,%edi
 	movl	%ebx,%esi
 	roll	$5,%ebx
-	xorl	%edx,%edi
+	addl	%edi,%eax
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%edi,%eax
-	movups	%xmm11,48(%r13,%r12,1)
+	movups	%xmm2,48(%r13,%r12,1)
 	movq	88(%rsp),%r8
 
 	addl	0(%r9),%eax
@@ -1380,7 +1363,7 @@
 	movl	%ecx,8(%r9)
 	movl	%edx,12(%r9)
 	movl	%ebp,16(%r9)
-	movups	%xmm11,(%r8)
+	movups	%xmm2,(%r8)
 	leaq	104(%rsp),%rsi
 	movq	0(%rsi),%r15
 	movq	8(%rsi),%r14
@@ -1393,7 +1376,7 @@
 	.byte	0xf3,0xc3
 .size	aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
 .type	aesni_cbc_sha1_enc_avx, at function
-.align	16
+.align	32
 aesni_cbc_sha1_enc_avx:
 	movq	8(%rsp),%r10
 
@@ -1411,13 +1394,12 @@
 	movq	%rdi,%r12
 	movq	%rsi,%r13
 	movq	%rdx,%r14
-	movq	%rcx,%r15
-	vmovdqu	(%r8),%xmm11
+	leaq	112(%rcx),%r15
+	vmovdqu	(%r8),%xmm12
 	movq	%r8,88(%rsp)
 	shlq	$6,%r14
 	subq	%r12,%r13
-	movl	240(%r15),%r8d
-	addq	$112,%r15
+	movl	240-112(%r15),%r8d
 	addq	%r10,%r14
 
 	leaq	K_XX_XX(%rip),%r11
@@ -1427,9 +1409,12 @@
 	movl	12(%r9),%edx
 	movl	%ebx,%esi
 	movl	16(%r9),%ebp
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	andl	%edi,%esi
 
 	vmovdqa	64(%r11),%xmm6
-	vmovdqa	0(%r11),%xmm9
+	vmovdqa	0(%r11),%xmm10
 	vmovdqu	0(%r10),%xmm0
 	vmovdqu	16(%r10),%xmm1
 	vmovdqu	32(%r10),%xmm2
@@ -1439,1137 +1424,1094 @@
 	vpshufb	%xmm6,%xmm1,%xmm1
 	vpshufb	%xmm6,%xmm2,%xmm2
 	vpshufb	%xmm6,%xmm3,%xmm3
-	vpaddd	%xmm9,%xmm0,%xmm4
-	vpaddd	%xmm9,%xmm1,%xmm5
-	vpaddd	%xmm9,%xmm2,%xmm6
+	vpaddd	%xmm10,%xmm0,%xmm4
+	vpaddd	%xmm10,%xmm1,%xmm5
+	vpaddd	%xmm10,%xmm2,%xmm6
 	vmovdqa	%xmm4,0(%rsp)
 	vmovdqa	%xmm5,16(%rsp)
 	vmovdqa	%xmm6,32(%rsp)
-	vmovups	-112(%r15),%xmm13
+	vmovups	-112(%r15),%xmm15
 	vmovups	16-112(%r15),%xmm14
 	jmp	.Loop_avx
-.align	16
+.align	32
 .Loop_avx:
-	addl	0(%rsp),%ebp
-	vmovups	0(%r12),%xmm12
-	vxorps	%xmm13,%xmm12,%xmm12
-	vxorps	%xmm12,%xmm11,%xmm11
-	vaesenc	%xmm14,%xmm11,%xmm11
+	shrdl	$2,%ebx,%ebx
+	vmovdqu	0(%r12),%xmm13
+	vpxor	%xmm15,%xmm13,%xmm13
+	vpxor	%xmm13,%xmm12,%xmm12
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	-80(%r15),%xmm15
-	xorl	%edx,%ecx
+	xorl	%edx,%esi
 	vpalignr	$8,%xmm0,%xmm1,%xmm4
 	movl	%eax,%edi
+	addl	0(%rsp),%ebp
+	vpaddd	%xmm3,%xmm10,%xmm9
+	xorl	%ecx,%ebx
 	shldl	$5,%eax,%eax
-	vpaddd	%xmm3,%xmm9,%xmm9
-	andl	%ecx,%esi
-	xorl	%edx,%ecx
 	vpsrldq	$4,%xmm3,%xmm8
-	xorl	%edx,%esi
+	addl	%esi,%ebp
+	andl	%ebx,%edi
+	vpxor	%xmm0,%xmm4,%xmm4
+	xorl	%ecx,%ebx
 	addl	%eax,%ebp
-	vpxor	%xmm0,%xmm4,%xmm4
-	shrdl	$2,%ebx,%ebx
-	addl	%esi,%ebp
 	vpxor	%xmm2,%xmm8,%xmm8
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
 	addl	4(%rsp),%edx
-	xorl	%ecx,%ebx
-	movl	%ebp,%esi
+	vpxor	%xmm8,%xmm4,%xmm4
+	xorl	%ebx,%eax
 	shldl	$5,%ebp,%ebp
-	vpxor	%xmm8,%xmm4,%xmm4
-	andl	%ebx,%edi
-	xorl	%ecx,%ebx
 	vmovdqa	%xmm9,48(%rsp)
-	xorl	%ecx,%edi
-	vaesenc	%xmm15,%xmm11,%xmm11
+	addl	%edi,%edx
+	vaesenc	%xmm15,%xmm12,%xmm12
 	vmovups	-64(%r15),%xmm14
-	addl	%ebp,%edx
+	andl	%eax,%esi
 	vpsrld	$31,%xmm4,%xmm8
-	shrdl	$7,%eax,%eax
-	addl	%edi,%edx
-	addl	8(%rsp),%ecx
 	xorl	%ebx,%eax
-	vpslldq	$12,%xmm4,%xmm10
+	addl	%ebp,%edx
+	shrdl	$7,%ebp,%ebp
+	xorl	%ebx,%esi
+	vpslldq	$12,%xmm4,%xmm9
 	vpaddd	%xmm4,%xmm4,%xmm4
 	movl	%edx,%edi
+	addl	8(%rsp),%ecx
+	xorl	%eax,%ebp
 	shldl	$5,%edx,%edx
-	andl	%eax,%esi
-	xorl	%ebx,%eax
-	vpsrld	$30,%xmm10,%xmm9
 	vpor	%xmm8,%xmm4,%xmm4
-	xorl	%ebx,%esi
-	addl	%edx,%ecx
-	shrdl	$7,%ebp,%ebp
+	vpsrld	$30,%xmm9,%xmm8
 	addl	%esi,%ecx
-	vpslld	$2,%xmm10,%xmm10
-	vpxor	%xmm9,%xmm4,%xmm4
-	addl	12(%rsp),%ebx
+	andl	%ebp,%edi
 	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	vpslld	$2,%xmm9,%xmm9
+	vpxor	%xmm8,%xmm4,%xmm4
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%edi
 	movl	%ecx,%esi
+	addl	12(%rsp),%ebx
+	vaesenc	%xmm14,%xmm12,%xmm12
+	vmovups	-48(%r15),%xmm15
+	vpxor	%xmm9,%xmm4,%xmm4
+	xorl	%ebp,%edx
 	shldl	$5,%ecx,%ecx
-	vaesenc	%xmm14,%xmm11,%xmm11
-	vmovups	-48(%r15),%xmm15
-	vpxor	%xmm10,%xmm4,%xmm4
-	andl	%ebp,%edi
-	xorl	%eax,%ebp
-	vmovdqa	0(%r11),%xmm10
-	xorl	%eax,%edi
-	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
 	addl	%edi,%ebx
-	addl	16(%rsp),%eax
+	andl	%edx,%esi
 	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	shrdl	$7,%ecx,%ecx
+	xorl	%ebp,%esi
 	vpalignr	$8,%xmm1,%xmm2,%xmm5
 	movl	%ebx,%edi
+	addl	16(%rsp),%eax
+	vpaddd	%xmm4,%xmm10,%xmm9
+	xorl	%edx,%ecx
 	shldl	$5,%ebx,%ebx
-	vpaddd	%xmm4,%xmm10,%xmm10
-	andl	%edx,%esi
-	xorl	%ebp,%edx
-	vpsrldq	$4,%xmm4,%xmm9
-	xorl	%ebp,%esi
+	vpsrldq	$4,%xmm4,%xmm8
+	addl	%esi,%eax
+	andl	%ecx,%edi
+	vpxor	%xmm1,%xmm5,%xmm5
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	vpxor	%xmm1,%xmm5,%xmm5
-	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
-	vpxor	%xmm3,%xmm9,%xmm9
-	addl	20(%rsp),%ebp
-	vaesenc	%xmm15,%xmm11,%xmm11
+	vpxor	%xmm3,%xmm8,%xmm8
+	shrdl	$7,%ebx,%ebx
+	vaesenc	%xmm15,%xmm12,%xmm12
 	vmovups	-32(%r15),%xmm14
-	xorl	%edx,%ecx
+	xorl	%edx,%edi
 	movl	%eax,%esi
+	addl	20(%rsp),%ebp
+	vpxor	%xmm8,%xmm5,%xmm5
+	xorl	%ecx,%ebx
 	shldl	$5,%eax,%eax
-	vpxor	%xmm9,%xmm5,%xmm5
-	andl	%ecx,%edi
-	xorl	%edx,%ecx
-	vmovdqa	%xmm10,0(%rsp)
-	xorl	%edx,%edi
-	addl	%eax,%ebp
-	vpsrld	$31,%xmm5,%xmm9
-	shrdl	$7,%ebx,%ebx
+	vmovdqa	%xmm9,0(%rsp)
 	addl	%edi,%ebp
-	addl	24(%rsp),%edx
+	andl	%ebx,%esi
+	vpsrld	$31,%xmm5,%xmm8
 	xorl	%ecx,%ebx
-	vpslldq	$12,%xmm5,%xmm8
+	addl	%eax,%ebp
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%esi
+	vpslldq	$12,%xmm5,%xmm9
 	vpaddd	%xmm5,%xmm5,%xmm5
 	movl	%ebp,%edi
+	addl	24(%rsp),%edx
+	xorl	%ebx,%eax
 	shldl	$5,%ebp,%ebp
-	andl	%ebx,%esi
-	xorl	%ecx,%ebx
-	vpsrld	$30,%xmm8,%xmm10
-	vpor	%xmm9,%xmm5,%xmm5
-	xorl	%ecx,%esi
-	vaesenc	%xmm14,%xmm11,%xmm11
+	vpor	%xmm8,%xmm5,%xmm5
+	vpsrld	$30,%xmm9,%xmm8
+	addl	%esi,%edx
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	-16(%r15),%xmm15
+	andl	%eax,%edi
+	xorl	%ebx,%eax
 	addl	%ebp,%edx
-	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
-	vpslld	$2,%xmm8,%xmm8
-	vpxor	%xmm10,%xmm5,%xmm5
+	vpslld	$2,%xmm9,%xmm9
+	vpxor	%xmm8,%xmm5,%xmm5
+	shrdl	$7,%ebp,%ebp
+	xorl	%ebx,%edi
+	movl	%edx,%esi
 	addl	28(%rsp),%ecx
-	xorl	%ebx,%eax
-	movl	%edx,%esi
+	vpxor	%xmm9,%xmm5,%xmm5
+	xorl	%eax,%ebp
 	shldl	$5,%edx,%edx
-	vpxor	%xmm8,%xmm5,%xmm5
-	andl	%eax,%edi
-	xorl	%ebx,%eax
-	vmovdqa	16(%r11),%xmm8
-	xorl	%ebx,%edi
-	addl	%edx,%ecx
-	shrdl	$7,%ebp,%ebp
+	vmovdqa	16(%r11),%xmm10
 	addl	%edi,%ecx
-	addl	32(%rsp),%ebx
+	andl	%ebp,%esi
 	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%esi
 	vpalignr	$8,%xmm2,%xmm3,%xmm6
 	movl	%ecx,%edi
+	addl	32(%rsp),%ebx
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vmovups	0(%r15),%xmm14
+	vpaddd	%xmm5,%xmm10,%xmm9
+	xorl	%ebp,%edx
 	shldl	$5,%ecx,%ecx
-	vaesenc	%xmm15,%xmm11,%xmm11
-	vmovups	0(%r15),%xmm14
-	vpaddd	%xmm5,%xmm8,%xmm8
-	andl	%ebp,%esi
-	xorl	%eax,%ebp
-	vpsrldq	$4,%xmm5,%xmm10
-	xorl	%eax,%esi
-	addl	%ecx,%ebx
+	vpsrldq	$4,%xmm5,%xmm8
+	addl	%esi,%ebx
+	andl	%edx,%edi
 	vpxor	%xmm2,%xmm6,%xmm6
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
-	vpxor	%xmm4,%xmm10,%xmm10
-	addl	36(%rsp),%eax
 	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	vpxor	%xmm4,%xmm8,%xmm8
+	shrdl	$7,%ecx,%ecx
+	xorl	%ebp,%edi
 	movl	%ebx,%esi
+	addl	36(%rsp),%eax
+	vpxor	%xmm8,%xmm6,%xmm6
+	xorl	%edx,%ecx
 	shldl	$5,%ebx,%ebx
-	vpxor	%xmm10,%xmm6,%xmm6
-	andl	%edx,%edi
-	xorl	%ebp,%edx
-	vmovdqa	%xmm8,16(%rsp)
-	xorl	%ebp,%edi
+	vmovdqa	%xmm9,16(%rsp)
+	addl	%edi,%eax
+	andl	%ecx,%esi
+	vpsrld	$31,%xmm6,%xmm8
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	vpsrld	$31,%xmm6,%xmm10
-	shrdl	$7,%ecx,%ecx
-	addl	%edi,%eax
-	addl	40(%rsp),%ebp
-	vaesenc	%xmm14,%xmm11,%xmm11
+	shrdl	$7,%ebx,%ebx
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	16(%r15),%xmm15
-	xorl	%edx,%ecx
+	xorl	%edx,%esi
 	vpslldq	$12,%xmm6,%xmm9
 	vpaddd	%xmm6,%xmm6,%xmm6
 	movl	%eax,%edi
+	addl	40(%rsp),%ebp
+	xorl	%ecx,%ebx
 	shldl	$5,%eax,%eax
-	andl	%ecx,%esi
-	xorl	%edx,%ecx
+	vpor	%xmm8,%xmm6,%xmm6
 	vpsrld	$30,%xmm9,%xmm8
-	vpor	%xmm10,%xmm6,%xmm6
-	xorl	%edx,%esi
+	addl	%esi,%ebp
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
 	addl	%eax,%ebp
-	shrdl	$7,%ebx,%ebx
-	addl	%esi,%ebp
 	vpslld	$2,%xmm9,%xmm9
 	vpxor	%xmm8,%xmm6,%xmm6
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
 	addl	44(%rsp),%edx
-	xorl	%ecx,%ebx
-	movl	%ebp,%esi
+	vpxor	%xmm9,%xmm6,%xmm6
+	xorl	%ebx,%eax
 	shldl	$5,%ebp,%ebp
-	vpxor	%xmm9,%xmm6,%xmm6
-	andl	%ebx,%edi
-	xorl	%ecx,%ebx
-	vmovdqa	16(%r11),%xmm9
-	xorl	%ecx,%edi
-	vaesenc	%xmm15,%xmm11,%xmm11
+	addl	%edi,%edx
+	vaesenc	%xmm15,%xmm12,%xmm12
 	vmovups	32(%r15),%xmm14
+	andl	%eax,%esi
+	xorl	%ebx,%eax
 	addl	%ebp,%edx
-	shrdl	$7,%eax,%eax
-	addl	%edi,%edx
-	addl	48(%rsp),%ecx
-	xorl	%ebx,%eax
+	shrdl	$7,%ebp,%ebp
+	xorl	%ebx,%esi
 	vpalignr	$8,%xmm3,%xmm4,%xmm7
 	movl	%edx,%edi
+	addl	48(%rsp),%ecx
+	vpaddd	%xmm6,%xmm10,%xmm9
+	xorl	%eax,%ebp
 	shldl	$5,%edx,%edx
-	vpaddd	%xmm6,%xmm9,%xmm9
-	andl	%eax,%esi
-	xorl	%ebx,%eax
 	vpsrldq	$4,%xmm6,%xmm8
-	xorl	%ebx,%esi
+	addl	%esi,%ecx
+	andl	%ebp,%edi
+	vpxor	%xmm3,%xmm7,%xmm7
+	xorl	%eax,%ebp
 	addl	%edx,%ecx
-	vpxor	%xmm3,%xmm7,%xmm7
-	shrdl	$7,%ebp,%ebp
-	addl	%esi,%ecx
 	vpxor	%xmm5,%xmm8,%xmm8
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%edi
+	movl	%ecx,%esi
 	addl	52(%rsp),%ebx
-	xorl	%eax,%ebp
-	movl	%ecx,%esi
-	shldl	$5,%ecx,%ecx
-	vaesenc	%xmm14,%xmm11,%xmm11
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	48(%r15),%xmm15
 	vpxor	%xmm8,%xmm7,%xmm7
-	andl	%ebp,%edi
-	xorl	%eax,%ebp
+	xorl	%ebp,%edx
+	shldl	$5,%ecx,%ecx
 	vmovdqa	%xmm9,32(%rsp)
-	xorl	%eax,%edi
-	addl	%ecx,%ebx
+	addl	%edi,%ebx
+	andl	%edx,%esi
 	vpsrld	$31,%xmm7,%xmm8
-	shrdl	$7,%edx,%edx
-	addl	%edi,%ebx
-	addl	56(%rsp),%eax
 	xorl	%ebp,%edx
-	vpslldq	$12,%xmm7,%xmm10
+	addl	%ecx,%ebx
+	shrdl	$7,%ecx,%ecx
+	xorl	%ebp,%esi
+	vpslldq	$12,%xmm7,%xmm9
 	vpaddd	%xmm7,%xmm7,%xmm7
 	movl	%ebx,%edi
+	addl	56(%rsp),%eax
+	xorl	%edx,%ecx
 	shldl	$5,%ebx,%ebx
-	andl	%edx,%esi
-	xorl	%ebp,%edx
-	vpsrld	$30,%xmm10,%xmm9
 	vpor	%xmm8,%xmm7,%xmm7
-	xorl	%ebp,%esi
+	vpsrld	$30,%xmm9,%xmm8
+	addl	%esi,%eax
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
-	vpslld	$2,%xmm10,%xmm10
-	vpxor	%xmm9,%xmm7,%xmm7
-	addl	60(%rsp),%ebp
+	vpslld	$2,%xmm9,%xmm9
+	vpxor	%xmm8,%xmm7,%xmm7
+	shrdl	$7,%ebx,%ebx
 	cmpl	$11,%r8d
-	jb	.Lvaesenclast1
-	vaesenc	%xmm15,%xmm11,%xmm11
+	jb	.Lvaesenclast6
+	vaesenc	%xmm15,%xmm12,%xmm12
 	vmovups	64(%r15),%xmm14
-	vaesenc	%xmm14,%xmm11,%xmm11
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	80(%r15),%xmm15
-	je	.Lvaesenclast1
-	vaesenc	%xmm15,%xmm11,%xmm11
+	je	.Lvaesenclast6
+	vaesenc	%xmm15,%xmm12,%xmm12
 	vmovups	96(%r15),%xmm14
-	vaesenc	%xmm14,%xmm11,%xmm11
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	112(%r15),%xmm15
-.Lvaesenclast1:
-	vaesenclast	%xmm15,%xmm11,%xmm11
+.Lvaesenclast6:
+	vaesenclast	%xmm15,%xmm12,%xmm12
+	vmovups	-112(%r15),%xmm15
 	vmovups	16-112(%r15),%xmm14
-	xorl	%edx,%ecx
+	xorl	%edx,%edi
 	movl	%eax,%esi
+	addl	60(%rsp),%ebp
+	vpxor	%xmm9,%xmm7,%xmm7
+	xorl	%ecx,%ebx
 	shldl	$5,%eax,%eax
-	vpxor	%xmm10,%xmm7,%xmm7
-	andl	%ecx,%edi
-	xorl	%edx,%ecx
-	vmovdqa	16(%r11),%xmm10
-	xorl	%edx,%edi
+	addl	%edi,%ebp
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
 	addl	%eax,%ebp
-	shrdl	$7,%ebx,%ebx
-	addl	%edi,%ebp
-	vpalignr	$8,%xmm6,%xmm7,%xmm9
+	vpalignr	$8,%xmm6,%xmm7,%xmm8
 	vpxor	%xmm4,%xmm0,%xmm0
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%esi
+	movl	%ebp,%edi
 	addl	0(%rsp),%edx
-	xorl	%ecx,%ebx
-	movl	%ebp,%edi
+	vpxor	%xmm1,%xmm0,%xmm0
+	xorl	%ebx,%eax
 	shldl	$5,%ebp,%ebp
-	vpxor	%xmm1,%xmm0,%xmm0
-	andl	%ebx,%esi
-	xorl	%ecx,%ebx
-	vmovdqa	%xmm10,%xmm8
-	vpaddd	%xmm7,%xmm10,%xmm10
-	xorl	%ecx,%esi
-	vmovups	16(%r12),%xmm12
-	vxorps	%xmm13,%xmm12,%xmm12
-	vmovups	%xmm11,0(%r13,%r12,1)
-	vxorps	%xmm12,%xmm11,%xmm11
-	vaesenc	%xmm14,%xmm11,%xmm11
+	vpaddd	%xmm7,%xmm10,%xmm9
+	addl	%esi,%edx
+	vmovdqu	16(%r12),%xmm13
+	vpxor	%xmm15,%xmm13,%xmm13
+	vmovups	%xmm12,0(%r12,%r13,1)
+	vpxor	%xmm13,%xmm12,%xmm12
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	-80(%r15),%xmm15
+	andl	%eax,%edi
+	vpxor	%xmm8,%xmm0,%xmm0
+	xorl	%ebx,%eax
 	addl	%ebp,%edx
-	vpxor	%xmm9,%xmm0,%xmm0
-	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
+	shrdl	$7,%ebp,%ebp
+	xorl	%ebx,%edi
+	vpsrld	$30,%xmm0,%xmm8
+	vmovdqa	%xmm9,48(%rsp)
+	movl	%edx,%esi
 	addl	4(%rsp),%ecx
-	xorl	%ebx,%eax
-	vpsrld	$30,%xmm0,%xmm9
-	vmovdqa	%xmm10,48(%rsp)
-	movl	%edx,%esi
+	xorl	%eax,%ebp
 	shldl	$5,%edx,%edx
-	andl	%eax,%edi
-	xorl	%ebx,%eax
 	vpslld	$2,%xmm0,%xmm0
-	xorl	%ebx,%edi
-	addl	%edx,%ecx
-	shrdl	$7,%ebp,%ebp
 	addl	%edi,%ecx
-	addl	8(%rsp),%ebx
+	andl	%ebp,%esi
 	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%esi
 	movl	%ecx,%edi
+	addl	8(%rsp),%ebx
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vmovups	-64(%r15),%xmm14
+	vpor	%xmm8,%xmm0,%xmm0
+	xorl	%ebp,%edx
 	shldl	$5,%ecx,%ecx
-	vaesenc	%xmm15,%xmm11,%xmm11
-	vmovups	-64(%r15),%xmm14
-	vpor	%xmm9,%xmm0,%xmm0
-	andl	%ebp,%esi
-	xorl	%eax,%ebp
-	vmovdqa	%xmm0,%xmm10
-	xorl	%eax,%esi
+	addl	%esi,%ebx
+	andl	%edx,%edi
+	xorl	%ebp,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	addl	12(%rsp),%eax
-	xorl	%ebp,%edx
+	xorl	%ebp,%edi
 	movl	%ebx,%esi
 	shldl	$5,%ebx,%ebx
-	andl	%edx,%edi
-	xorl	%ebp,%edx
-	xorl	%ebp,%edi
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%edi,%eax
-	vpalignr	$8,%xmm7,%xmm0,%xmm10
+	vpalignr	$8,%xmm7,%xmm0,%xmm8
 	vpxor	%xmm5,%xmm1,%xmm1
 	addl	16(%rsp),%ebp
-	vaesenc	%xmm14,%xmm11,%xmm11
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	-48(%r15),%xmm15
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%edi
 	shldl	$5,%eax,%eax
 	vpxor	%xmm2,%xmm1,%xmm1
-	xorl	%ecx,%esi
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	vpaddd	%xmm0,%xmm10,%xmm9
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%ebp
-	vmovdqa	%xmm8,%xmm9
-	vpaddd	%xmm0,%xmm8,%xmm8
-	shrdl	$7,%ebx,%ebx
-	addl	%esi,%ebp
-	vpxor	%xmm10,%xmm1,%xmm1
+	vpxor	%xmm8,%xmm1,%xmm1
 	addl	20(%rsp),%edx
-	xorl	%ecx,%edi
+	xorl	%ebx,%edi
 	movl	%ebp,%esi
 	shldl	$5,%ebp,%ebp
-	vpsrld	$30,%xmm1,%xmm10
-	vmovdqa	%xmm8,0(%rsp)
-	xorl	%ebx,%edi
+	vpsrld	$30,%xmm1,%xmm8
+	vmovdqa	%xmm9,0(%rsp)
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
 	addl	%ebp,%edx
-	shrdl	$7,%eax,%eax
-	addl	%edi,%edx
 	vpslld	$2,%xmm1,%xmm1
 	addl	24(%rsp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	movl	%edx,%edi
 	shldl	$5,%edx,%edx
-	xorl	%eax,%esi
-	vaesenc	%xmm15,%xmm11,%xmm11
+	addl	%esi,%ecx
+	vaesenc	%xmm15,%xmm12,%xmm12
 	vmovups	-32(%r15),%xmm14
+	xorl	%eax,%edi
+	shrdl	$7,%ebp,%ebp
 	addl	%edx,%ecx
-	shrdl	$7,%ebp,%ebp
-	addl	%esi,%ecx
-	vpor	%xmm10,%xmm1,%xmm1
+	vpor	%xmm8,%xmm1,%xmm1
 	addl	28(%rsp),%ebx
-	xorl	%eax,%edi
-	vmovdqa	%xmm1,%xmm8
+	xorl	%ebp,%edi
 	movl	%ecx,%esi
 	shldl	$5,%ecx,%ecx
-	xorl	%ebp,%edi
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%edi,%ebx
 	vpalignr	$8,%xmm0,%xmm1,%xmm8
 	vpxor	%xmm6,%xmm2,%xmm2
 	addl	32(%rsp),%eax
-	xorl	%ebp,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%edi
 	shldl	$5,%ebx,%ebx
 	vpxor	%xmm3,%xmm2,%xmm2
-	xorl	%edx,%esi
-	addl	%ebx,%eax
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	vpaddd	%xmm1,%xmm10,%xmm9
 	vmovdqa	32(%r11),%xmm10
-	vpaddd	%xmm1,%xmm9,%xmm9
 	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
+	addl	%ebx,%eax
 	vpxor	%xmm8,%xmm2,%xmm2
 	addl	36(%rsp),%ebp
-	vaesenc	%xmm14,%xmm11,%xmm11
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	-16(%r15),%xmm15
-	xorl	%edx,%edi
+	xorl	%ecx,%edi
 	movl	%eax,%esi
 	shldl	$5,%eax,%eax
 	vpsrld	$30,%xmm2,%xmm8
 	vmovdqa	%xmm9,16(%rsp)
-	xorl	%ecx,%edi
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%ebp
-	shrdl	$7,%ebx,%ebx
-	addl	%edi,%ebp
 	vpslld	$2,%xmm2,%xmm2
 	addl	40(%rsp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	movl	%ebp,%edi
 	shldl	$5,%ebp,%ebp
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	shrdl	$7,%eax,%eax
 	addl	%ebp,%edx
-	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	vpor	%xmm8,%xmm2,%xmm2
 	addl	44(%rsp),%ecx
-	xorl	%ebx,%edi
-	vmovdqa	%xmm2,%xmm9
+	xorl	%eax,%edi
 	movl	%edx,%esi
 	shldl	$5,%edx,%edx
-	xorl	%eax,%edi
-	vaesenc	%xmm15,%xmm11,%xmm11
+	addl	%edi,%ecx
+	vaesenc	%xmm15,%xmm12,%xmm12
 	vmovups	0(%r15),%xmm14
+	xorl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
 	addl	%edx,%ecx
-	shrdl	$7,%ebp,%ebp
-	addl	%edi,%ecx
-	vpalignr	$8,%xmm1,%xmm2,%xmm9
+	vpalignr	$8,%xmm1,%xmm2,%xmm8
 	vpxor	%xmm7,%xmm3,%xmm3
 	addl	48(%rsp),%ebx
-	xorl	%eax,%esi
+	xorl	%ebp,%esi
 	movl	%ecx,%edi
 	shldl	$5,%ecx,%ecx
 	vpxor	%xmm4,%xmm3,%xmm3
-	xorl	%ebp,%esi
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	vpaddd	%xmm2,%xmm10,%xmm9
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	vmovdqa	%xmm10,%xmm8
-	vpaddd	%xmm2,%xmm10,%xmm10
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
-	vpxor	%xmm9,%xmm3,%xmm3
+	vpxor	%xmm8,%xmm3,%xmm3
 	addl	52(%rsp),%eax
-	xorl	%ebp,%edi
+	xorl	%edx,%edi
 	movl	%ebx,%esi
 	shldl	$5,%ebx,%ebx
-	vpsrld	$30,%xmm3,%xmm9
-	vmovdqa	%xmm10,32(%rsp)
-	xorl	%edx,%edi
+	vpsrld	$30,%xmm3,%xmm8
+	vmovdqa	%xmm9,32(%rsp)
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%edi,%eax
 	vpslld	$2,%xmm3,%xmm3
 	addl	56(%rsp),%ebp
-	vaesenc	%xmm14,%xmm11,%xmm11
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	16(%r15),%xmm15
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%edi
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%esi
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%ebp
-	shrdl	$7,%ebx,%ebx
-	addl	%esi,%ebp
-	vpor	%xmm9,%xmm3,%xmm3
+	vpor	%xmm8,%xmm3,%xmm3
 	addl	60(%rsp),%edx
-	xorl	%ecx,%edi
-	vmovdqa	%xmm3,%xmm10
+	xorl	%ebx,%edi
 	movl	%ebp,%esi
 	shldl	$5,%ebp,%ebp
-	xorl	%ebx,%edi
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
 	addl	%ebp,%edx
-	shrdl	$7,%eax,%eax
-	addl	%edi,%edx
-	vpalignr	$8,%xmm2,%xmm3,%xmm10
+	vpalignr	$8,%xmm2,%xmm3,%xmm8
 	vpxor	%xmm0,%xmm4,%xmm4
 	addl	0(%rsp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	movl	%edx,%edi
 	shldl	$5,%edx,%edx
 	vpxor	%xmm5,%xmm4,%xmm4
-	xorl	%eax,%esi
-	vaesenc	%xmm15,%xmm11,%xmm11
+	addl	%esi,%ecx
+	vaesenc	%xmm15,%xmm12,%xmm12
 	vmovups	32(%r15),%xmm14
+	xorl	%eax,%edi
+	vpaddd	%xmm3,%xmm10,%xmm9
+	shrdl	$7,%ebp,%ebp
 	addl	%edx,%ecx
-	vmovdqa	%xmm8,%xmm9
-	vpaddd	%xmm3,%xmm8,%xmm8
-	shrdl	$7,%ebp,%ebp
-	addl	%esi,%ecx
-	vpxor	%xmm10,%xmm4,%xmm4
+	vpxor	%xmm8,%xmm4,%xmm4
 	addl	4(%rsp),%ebx
-	xorl	%eax,%edi
+	xorl	%ebp,%edi
 	movl	%ecx,%esi
 	shldl	$5,%ecx,%ecx
-	vpsrld	$30,%xmm4,%xmm10
-	vmovdqa	%xmm8,48(%rsp)
-	xorl	%ebp,%edi
+	vpsrld	$30,%xmm4,%xmm8
+	vmovdqa	%xmm9,48(%rsp)
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%edi,%ebx
 	vpslld	$2,%xmm4,%xmm4
 	addl	8(%rsp),%eax
-	xorl	%ebp,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%edi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%esi
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
-	vpor	%xmm10,%xmm4,%xmm4
+	vpor	%xmm8,%xmm4,%xmm4
 	addl	12(%rsp),%ebp
-	vaesenc	%xmm14,%xmm11,%xmm11
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	48(%r15),%xmm15
-	xorl	%edx,%edi
-	vmovdqa	%xmm4,%xmm8
+	xorl	%ecx,%edi
 	movl	%eax,%esi
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%edi
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%ebp
-	shrdl	$7,%ebx,%ebx
-	addl	%edi,%ebp
 	vpalignr	$8,%xmm3,%xmm4,%xmm8
 	vpxor	%xmm1,%xmm5,%xmm5
 	addl	16(%rsp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	movl	%ebp,%edi
 	shldl	$5,%ebp,%ebp
 	vpxor	%xmm6,%xmm5,%xmm5
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	vpaddd	%xmm4,%xmm10,%xmm9
+	shrdl	$7,%eax,%eax
 	addl	%ebp,%edx
-	vmovdqa	%xmm9,%xmm10
-	vpaddd	%xmm4,%xmm9,%xmm9
-	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	vpxor	%xmm8,%xmm5,%xmm5
 	addl	20(%rsp),%ecx
-	xorl	%ebx,%edi
+	xorl	%eax,%edi
 	movl	%edx,%esi
 	shldl	$5,%edx,%edx
 	vpsrld	$30,%xmm5,%xmm8
 	vmovdqa	%xmm9,0(%rsp)
-	xorl	%eax,%edi
+	addl	%edi,%ecx
 	cmpl	$11,%r8d
-	jb	.Lvaesenclast2
-	vaesenc	%xmm15,%xmm11,%xmm11
+	jb	.Lvaesenclast7
+	vaesenc	%xmm15,%xmm12,%xmm12
 	vmovups	64(%r15),%xmm14
-	vaesenc	%xmm14,%xmm11,%xmm11
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	80(%r15),%xmm15
-	je	.Lvaesenclast2
-	vaesenc	%xmm15,%xmm11,%xmm11
+	je	.Lvaesenclast7
+	vaesenc	%xmm15,%xmm12,%xmm12
 	vmovups	96(%r15),%xmm14
-	vaesenc	%xmm14,%xmm11,%xmm11
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	112(%r15),%xmm15
-.Lvaesenclast2:
-	vaesenclast	%xmm15,%xmm11,%xmm11
+.Lvaesenclast7:
+	vaesenclast	%xmm15,%xmm12,%xmm12
+	vmovups	-112(%r15),%xmm15
 	vmovups	16-112(%r15),%xmm14
+	xorl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
 	addl	%edx,%ecx
-	shrdl	$7,%ebp,%ebp
-	addl	%edi,%ecx
 	vpslld	$2,%xmm5,%xmm5
 	addl	24(%rsp),%ebx
-	xorl	%eax,%esi
+	xorl	%ebp,%esi
 	movl	%ecx,%edi
 	shldl	$5,%ecx,%ecx
-	xorl	%ebp,%esi
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	vpor	%xmm8,%xmm5,%xmm5
 	addl	28(%rsp),%eax
-	xorl	%ebp,%edi
-	vmovdqa	%xmm5,%xmm9
+	shrdl	$7,%ecx,%ecx
 	movl	%ebx,%esi
+	xorl	%edx,%edi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%edi
+	addl	%edi,%eax
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%edi,%eax
-	vpalignr	$8,%xmm4,%xmm5,%xmm9
+	vpalignr	$8,%xmm4,%xmm5,%xmm8
 	vpxor	%xmm2,%xmm6,%xmm6
-	movl	%ecx,%edi
-	vmovups	32(%r12),%xmm12
-	vxorps	%xmm13,%xmm12,%xmm12
-	vmovups	%xmm11,16(%r13,%r12,1)
-	vxorps	%xmm12,%xmm11,%xmm11
-	vaesenc	%xmm14,%xmm11,%xmm11
+	addl	32(%rsp),%ebp
+	vmovdqu	32(%r12),%xmm13
+	vpxor	%xmm15,%xmm13,%xmm13
+	vmovups	%xmm12,16(%r13,%r12,1)
+	vpxor	%xmm13,%xmm12,%xmm12
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	-80(%r15),%xmm15
+	andl	%ecx,%esi
 	xorl	%edx,%ecx
-	addl	32(%rsp),%ebp
-	andl	%edx,%edi
+	shrdl	$7,%ebx,%ebx
 	vpxor	%xmm7,%xmm6,%xmm6
-	andl	%ecx,%esi
-	shrdl	$7,%ebx,%ebx
-	vmovdqa	%xmm10,%xmm8
-	vpaddd	%xmm5,%xmm10,%xmm10
-	addl	%edi,%ebp
 	movl	%eax,%edi
-	vpxor	%xmm9,%xmm6,%xmm6
+	xorl	%ecx,%esi
+	vpaddd	%xmm5,%xmm10,%xmm9
 	shldl	$5,%eax,%eax
 	addl	%esi,%ebp
-	xorl	%edx,%ecx
+	vpxor	%xmm8,%xmm6,%xmm6
+	xorl	%ebx,%edi
+	xorl	%ecx,%ebx
 	addl	%eax,%ebp
-	vpsrld	$30,%xmm6,%xmm9
-	vmovdqa	%xmm10,16(%rsp)
-	movl	%ebx,%esi
-	xorl	%ecx,%ebx
 	addl	36(%rsp),%edx
-	andl	%ecx,%esi
-	vpslld	$2,%xmm6,%xmm6
+	vpsrld	$30,%xmm6,%xmm8
+	vmovdqa	%xmm9,16(%rsp)
 	andl	%ebx,%edi
+	xorl	%ecx,%ebx
 	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	movl	%ebp,%esi
+	vpslld	$2,%xmm6,%xmm6
+	xorl	%ebx,%edi
 	shldl	$5,%ebp,%ebp
-	vaesenc	%xmm15,%xmm11,%xmm11
+	addl	%edi,%edx
+	vaesenc	%xmm15,%xmm12,%xmm12
 	vmovups	-64(%r15),%xmm14
-	addl	%edi,%edx
-	xorl	%ecx,%ebx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
 	addl	%ebp,%edx
-	vpor	%xmm9,%xmm6,%xmm6
-	movl	%eax,%edi
-	xorl	%ebx,%eax
-	vmovdqa	%xmm6,%xmm10
 	addl	40(%rsp),%ecx
-	andl	%ebx,%edi
 	andl	%eax,%esi
+	vpor	%xmm8,%xmm6,%xmm6
+	xorl	%ebx,%eax
 	shrdl	$7,%ebp,%ebp
-	addl	%edi,%ecx
 	movl	%edx,%edi
+	xorl	%eax,%esi
 	shldl	$5,%edx,%edx
 	addl	%esi,%ecx
-	xorl	%ebx,%eax
+	xorl	%ebp,%edi
+	xorl	%eax,%ebp
 	addl	%edx,%ecx
-	movl	%ebp,%esi
-	xorl	%eax,%ebp
 	addl	44(%rsp),%ebx
-	andl	%eax,%esi
 	andl	%ebp,%edi
-	vaesenc	%xmm14,%xmm11,%xmm11
+	xorl	%eax,%ebp
+	shrdl	$7,%edx,%edx
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	-48(%r15),%xmm15
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	movl	%ecx,%esi
+	xorl	%ebp,%edi
 	shldl	$5,%ecx,%ecx
 	addl	%edi,%ebx
-	xorl	%eax,%ebp
+	xorl	%edx,%esi
+	xorl	%ebp,%edx
 	addl	%ecx,%ebx
-	vpalignr	$8,%xmm5,%xmm6,%xmm10
+	vpalignr	$8,%xmm5,%xmm6,%xmm8
 	vpxor	%xmm3,%xmm7,%xmm7
-	movl	%edx,%edi
-	xorl	%ebp,%edx
 	addl	48(%rsp),%eax
-	andl	%ebp,%edi
-	vpxor	%xmm0,%xmm7,%xmm7
 	andl	%edx,%esi
+	xorl	%ebp,%edx
 	shrdl	$7,%ecx,%ecx
-	vmovdqa	48(%r11),%xmm9
-	vpaddd	%xmm6,%xmm8,%xmm8
-	addl	%edi,%eax
+	vpxor	%xmm0,%xmm7,%xmm7
 	movl	%ebx,%edi
-	vpxor	%xmm10,%xmm7,%xmm7
+	xorl	%edx,%esi
+	vpaddd	%xmm6,%xmm10,%xmm9
+	vmovdqa	48(%r11),%xmm10
 	shldl	$5,%ebx,%ebx
 	addl	%esi,%eax
-	xorl	%ebp,%edx
+	vpxor	%xmm8,%xmm7,%xmm7
+	xorl	%ecx,%edi
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	vpsrld	$30,%xmm7,%xmm10
-	vmovdqa	%xmm8,32(%rsp)
-	movl	%ecx,%esi
-	vaesenc	%xmm15,%xmm11,%xmm11
+	addl	52(%rsp),%ebp
+	vaesenc	%xmm15,%xmm12,%xmm12
 	vmovups	-32(%r15),%xmm14
+	vpsrld	$30,%xmm7,%xmm8
+	vmovdqa	%xmm9,32(%rsp)
+	andl	%ecx,%edi
 	xorl	%edx,%ecx
-	addl	52(%rsp),%ebp
-	andl	%edx,%esi
-	vpslld	$2,%xmm7,%xmm7
-	andl	%ecx,%edi
 	shrdl	$7,%ebx,%ebx
-	addl	%esi,%ebp
 	movl	%eax,%esi
+	vpslld	$2,%xmm7,%xmm7
+	xorl	%ecx,%edi
 	shldl	$5,%eax,%eax
 	addl	%edi,%ebp
-	xorl	%edx,%ecx
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
 	addl	%eax,%ebp
-	vpor	%xmm10,%xmm7,%xmm7
-	movl	%ebx,%edi
-	xorl	%ecx,%ebx
-	vmovdqa	%xmm7,%xmm8
 	addl	56(%rsp),%edx
-	andl	%ecx,%edi
 	andl	%ebx,%esi
+	vpor	%xmm8,%xmm7,%xmm7
+	xorl	%ecx,%ebx
 	shrdl	$7,%eax,%eax
-	addl	%edi,%edx
 	movl	%ebp,%edi
+	xorl	%ebx,%esi
 	shldl	$5,%ebp,%ebp
-	vaesenc	%xmm14,%xmm11,%xmm11
+	addl	%esi,%edx
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	-16(%r15),%xmm15
-	addl	%esi,%edx
-	xorl	%ecx,%ebx
+	xorl	%eax,%edi
+	xorl	%ebx,%eax
 	addl	%ebp,%edx
-	movl	%eax,%esi
-	xorl	%ebx,%eax
 	addl	60(%rsp),%ecx
-	andl	%ebx,%esi
 	andl	%eax,%edi
+	xorl	%ebx,%eax
 	shrdl	$7,%ebp,%ebp
-	addl	%esi,%ecx
 	movl	%edx,%esi
+	xorl	%eax,%edi
 	shldl	$5,%edx,%edx
 	addl	%edi,%ecx
-	xorl	%ebx,%eax
+	xorl	%ebp,%esi
+	xorl	%eax,%ebp
 	addl	%edx,%ecx
 	vpalignr	$8,%xmm6,%xmm7,%xmm8
 	vpxor	%xmm4,%xmm0,%xmm0
-	movl	%ebp,%edi
-	xorl	%eax,%ebp
 	addl	0(%rsp),%ebx
-	andl	%eax,%edi
-	vpxor	%xmm1,%xmm0,%xmm0
 	andl	%ebp,%esi
-	vaesenc	%xmm15,%xmm11,%xmm11
+	xorl	%eax,%ebp
+	shrdl	$7,%edx,%edx
+	vaesenc	%xmm15,%xmm12,%xmm12
 	vmovups	0(%r15),%xmm14
-	shrdl	$7,%edx,%edx
-	vmovdqa	%xmm9,%xmm10
-	vpaddd	%xmm7,%xmm9,%xmm9
-	addl	%edi,%ebx
+	vpxor	%xmm1,%xmm0,%xmm0
 	movl	%ecx,%edi
-	vpxor	%xmm8,%xmm0,%xmm0
+	xorl	%ebp,%esi
+	vpaddd	%xmm7,%xmm10,%xmm9
 	shldl	$5,%ecx,%ecx
 	addl	%esi,%ebx
-	xorl	%eax,%ebp
+	vpxor	%xmm8,%xmm0,%xmm0
+	xorl	%edx,%edi
+	xorl	%ebp,%edx
 	addl	%ecx,%ebx
+	addl	4(%rsp),%eax
 	vpsrld	$30,%xmm0,%xmm8
 	vmovdqa	%xmm9,48(%rsp)
-	movl	%edx,%esi
+	andl	%edx,%edi
 	xorl	%ebp,%edx
-	addl	4(%rsp),%eax
-	andl	%ebp,%esi
-	vpslld	$2,%xmm0,%xmm0
-	andl	%edx,%edi
 	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	movl	%ebx,%esi
+	vpslld	$2,%xmm0,%xmm0
+	xorl	%edx,%edi
 	shldl	$5,%ebx,%ebx
 	addl	%edi,%eax
-	xorl	%ebp,%edx
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
+	addl	8(%rsp),%ebp
+	vaesenc	%xmm14,%xmm12,%xmm12
+	vmovups	16(%r15),%xmm15
+	andl	%ecx,%esi
 	vpor	%xmm8,%xmm0,%xmm0
-	movl	%ecx,%edi
-	vaesenc	%xmm14,%xmm11,%xmm11
-	vmovups	16(%r15),%xmm15
 	xorl	%edx,%ecx
-	vmovdqa	%xmm0,%xmm9
-	addl	8(%rsp),%ebp
-	andl	%edx,%edi
-	andl	%ecx,%esi
 	shrdl	$7,%ebx,%ebx
-	addl	%edi,%ebp
 	movl	%eax,%edi
+	xorl	%ecx,%esi
 	shldl	$5,%eax,%eax
 	addl	%esi,%ebp
-	xorl	%edx,%ecx
+	xorl	%ebx,%edi
+	xorl	%ecx,%ebx
 	addl	%eax,%ebp
-	movl	%ebx,%esi
-	xorl	%ecx,%ebx
 	addl	12(%rsp),%edx
-	andl	%ecx,%esi
 	andl	%ebx,%edi
+	xorl	%ecx,%ebx
 	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	movl	%ebp,%esi
+	xorl	%ebx,%edi
 	shldl	$5,%ebp,%ebp
-	vaesenc	%xmm15,%xmm11,%xmm11
+	addl	%edi,%edx
+	vaesenc	%xmm15,%xmm12,%xmm12
 	vmovups	32(%r15),%xmm14
-	addl	%edi,%edx
-	xorl	%ecx,%ebx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
 	addl	%ebp,%edx
-	vpalignr	$8,%xmm7,%xmm0,%xmm9
+	vpalignr	$8,%xmm7,%xmm0,%xmm8
 	vpxor	%xmm5,%xmm1,%xmm1
-	movl	%eax,%edi
-	xorl	%ebx,%eax
 	addl	16(%rsp),%ecx
-	andl	%ebx,%edi
-	vpxor	%xmm2,%xmm1,%xmm1
 	andl	%eax,%esi
+	xorl	%ebx,%eax
 	shrdl	$7,%ebp,%ebp
-	vmovdqa	%xmm10,%xmm8
-	vpaddd	%xmm0,%xmm10,%xmm10
-	addl	%edi,%ecx
+	vpxor	%xmm2,%xmm1,%xmm1
 	movl	%edx,%edi
-	vpxor	%xmm9,%xmm1,%xmm1
+	xorl	%eax,%esi
+	vpaddd	%xmm0,%xmm10,%xmm9
 	shldl	$5,%edx,%edx
 	addl	%esi,%ecx
-	xorl	%ebx,%eax
+	vpxor	%xmm8,%xmm1,%xmm1
+	xorl	%ebp,%edi
+	xorl	%eax,%ebp
 	addl	%edx,%ecx
-	vpsrld	$30,%xmm1,%xmm9
-	vmovdqa	%xmm10,0(%rsp)
-	movl	%ebp,%esi
-	xorl	%eax,%ebp
 	addl	20(%rsp),%ebx
-	andl	%eax,%esi
-	vpslld	$2,%xmm1,%xmm1
+	vpsrld	$30,%xmm1,%xmm8
+	vmovdqa	%xmm9,0(%rsp)
 	andl	%ebp,%edi
-	vaesenc	%xmm14,%xmm11,%xmm11
+	xorl	%eax,%ebp
+	shrdl	$7,%edx,%edx
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	48(%r15),%xmm15
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	movl	%ecx,%esi
+	vpslld	$2,%xmm1,%xmm1
+	xorl	%ebp,%edi
 	shldl	$5,%ecx,%ecx
 	addl	%edi,%ebx
-	xorl	%eax,%ebp
+	xorl	%edx,%esi
+	xorl	%ebp,%edx
 	addl	%ecx,%ebx
-	vpor	%xmm9,%xmm1,%xmm1
-	movl	%edx,%edi
-	xorl	%ebp,%edx
-	vmovdqa	%xmm1,%xmm10
 	addl	24(%rsp),%eax
-	andl	%ebp,%edi
 	andl	%edx,%esi
+	vpor	%xmm8,%xmm1,%xmm1
+	xorl	%ebp,%edx
 	shrdl	$7,%ecx,%ecx
-	addl	%edi,%eax
 	movl	%ebx,%edi
+	xorl	%edx,%esi
 	shldl	$5,%ebx,%ebx
 	addl	%esi,%eax
-	xorl	%ebp,%edx
+	xorl	%ecx,%edi
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	movl	%ecx,%esi
+	addl	28(%rsp),%ebp
 	cmpl	$11,%r8d
-	jb	.Lvaesenclast3
-	vaesenc	%xmm15,%xmm11,%xmm11
+	jb	.Lvaesenclast8
+	vaesenc	%xmm15,%xmm12,%xmm12
 	vmovups	64(%r15),%xmm14
-	vaesenc	%xmm14,%xmm11,%xmm11
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	80(%r15),%xmm15
-	je	.Lvaesenclast3
-	vaesenc	%xmm15,%xmm11,%xmm11
+	je	.Lvaesenclast8
+	vaesenc	%xmm15,%xmm12,%xmm12
 	vmovups	96(%r15),%xmm14
-	vaesenc	%xmm14,%xmm11,%xmm11
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	112(%r15),%xmm15
-.Lvaesenclast3:
-	vaesenclast	%xmm15,%xmm11,%xmm11
+.Lvaesenclast8:
+	vaesenclast	%xmm15,%xmm12,%xmm12
+	vmovups	-112(%r15),%xmm15
 	vmovups	16-112(%r15),%xmm14
+	andl	%ecx,%edi
 	xorl	%edx,%ecx
-	addl	28(%rsp),%ebp
-	andl	%edx,%esi
-	andl	%ecx,%edi
 	shrdl	$7,%ebx,%ebx
-	addl	%esi,%ebp
 	movl	%eax,%esi
+	xorl	%ecx,%edi
 	shldl	$5,%eax,%eax
 	addl	%edi,%ebp
-	xorl	%edx,%ecx
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
 	addl	%eax,%ebp
-	vpalignr	$8,%xmm0,%xmm1,%xmm10
+	vpalignr	$8,%xmm0,%xmm1,%xmm8
 	vpxor	%xmm6,%xmm2,%xmm2
-	movl	%ebx,%edi
-	xorl	%ecx,%ebx
 	addl	32(%rsp),%edx
-	andl	%ecx,%edi
-	vpxor	%xmm3,%xmm2,%xmm2
 	andl	%ebx,%esi
+	xorl	%ecx,%ebx
 	shrdl	$7,%eax,%eax
-	vmovdqa	%xmm8,%xmm9
-	vpaddd	%xmm1,%xmm8,%xmm8
-	addl	%edi,%edx
+	vpxor	%xmm3,%xmm2,%xmm2
 	movl	%ebp,%edi
-	vpxor	%xmm10,%xmm2,%xmm2
+	xorl	%ebx,%esi
+	vpaddd	%xmm1,%xmm10,%xmm9
 	shldl	$5,%ebp,%ebp
-	vmovups	48(%r12),%xmm12
-	vxorps	%xmm13,%xmm12,%xmm12
-	vmovups	%xmm11,32(%r13,%r12,1)
-	vxorps	%xmm12,%xmm11,%xmm11
-	vaesenc	%xmm14,%xmm11,%xmm11
+	addl	%esi,%edx
+	vmovdqu	48(%r12),%xmm13
+	vpxor	%xmm15,%xmm13,%xmm13
+	vmovups	%xmm12,32(%r13,%r12,1)
+	vpxor	%xmm13,%xmm12,%xmm12
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	-80(%r15),%xmm15
-	addl	%esi,%edx
-	xorl	%ecx,%ebx
+	vpxor	%xmm8,%xmm2,%xmm2
+	xorl	%eax,%edi
+	xorl	%ebx,%eax
 	addl	%ebp,%edx
-	vpsrld	$30,%xmm2,%xmm10
-	vmovdqa	%xmm8,16(%rsp)
-	movl	%eax,%esi
-	xorl	%ebx,%eax
 	addl	36(%rsp),%ecx
-	andl	%ebx,%esi
-	vpslld	$2,%xmm2,%xmm2
+	vpsrld	$30,%xmm2,%xmm8
+	vmovdqa	%xmm9,16(%rsp)
 	andl	%eax,%edi
+	xorl	%ebx,%eax
 	shrdl	$7,%ebp,%ebp
-	addl	%esi,%ecx
 	movl	%edx,%esi
+	vpslld	$2,%xmm2,%xmm2
+	xorl	%eax,%edi
 	shldl	$5,%edx,%edx
 	addl	%edi,%ecx
-	xorl	%ebx,%eax
+	xorl	%ebp,%esi
+	xorl	%eax,%ebp
 	addl	%edx,%ecx
-	vpor	%xmm10,%xmm2,%xmm2
-	movl	%ebp,%edi
-	xorl	%eax,%ebp
-	vmovdqa	%xmm2,%xmm8
 	addl	40(%rsp),%ebx
-	andl	%eax,%edi
 	andl	%ebp,%esi
-	vaesenc	%xmm15,%xmm11,%xmm11
+	vpor	%xmm8,%xmm2,%xmm2
+	xorl	%eax,%ebp
+	shrdl	$7,%edx,%edx
+	vaesenc	%xmm15,%xmm12,%xmm12
 	vmovups	-64(%r15),%xmm14
-	shrdl	$7,%edx,%edx
-	addl	%edi,%ebx
 	movl	%ecx,%edi
+	xorl	%ebp,%esi
 	shldl	$5,%ecx,%ecx
 	addl	%esi,%ebx
-	xorl	%eax,%ebp
+	xorl	%edx,%edi
+	xorl	%ebp,%edx
 	addl	%ecx,%ebx
-	movl	%edx,%esi
-	xorl	%ebp,%edx
 	addl	44(%rsp),%eax
-	andl	%ebp,%esi
 	andl	%edx,%edi
+	xorl	%ebp,%edx
 	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	movl	%ebx,%esi
+	xorl	%edx,%edi
 	shldl	$5,%ebx,%ebx
 	addl	%edi,%eax
-	xorl	%ebp,%edx
+	xorl	%edx,%esi
 	addl	%ebx,%eax
 	vpalignr	$8,%xmm1,%xmm2,%xmm8
 	vpxor	%xmm7,%xmm3,%xmm3
 	addl	48(%rsp),%ebp
-	vaesenc	%xmm14,%xmm11,%xmm11
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	-48(%r15),%xmm15
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%edi
 	shldl	$5,%eax,%eax
 	vpxor	%xmm4,%xmm3,%xmm3
-	xorl	%ecx,%esi
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	vpaddd	%xmm2,%xmm10,%xmm9
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%ebp
-	vmovdqa	%xmm9,%xmm10
-	vpaddd	%xmm2,%xmm9,%xmm9
-	shrdl	$7,%ebx,%ebx
-	addl	%esi,%ebp
 	vpxor	%xmm8,%xmm3,%xmm3
 	addl	52(%rsp),%edx
-	xorl	%ecx,%edi
+	xorl	%ebx,%edi
 	movl	%ebp,%esi
 	shldl	$5,%ebp,%ebp
 	vpsrld	$30,%xmm3,%xmm8
 	vmovdqa	%xmm9,32(%rsp)
-	xorl	%ebx,%edi
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
 	addl	%ebp,%edx
-	shrdl	$7,%eax,%eax
-	addl	%edi,%edx
 	vpslld	$2,%xmm3,%xmm3
 	addl	56(%rsp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	movl	%edx,%edi
 	shldl	$5,%edx,%edx
-	xorl	%eax,%esi
-	vaesenc	%xmm15,%xmm11,%xmm11
+	addl	%esi,%ecx
+	vaesenc	%xmm15,%xmm12,%xmm12
 	vmovups	-32(%r15),%xmm14
+	xorl	%eax,%edi
+	shrdl	$7,%ebp,%ebp
 	addl	%edx,%ecx
-	shrdl	$7,%ebp,%ebp
-	addl	%esi,%ecx
 	vpor	%xmm8,%xmm3,%xmm3
 	addl	60(%rsp),%ebx
-	xorl	%eax,%edi
+	xorl	%ebp,%edi
 	movl	%ecx,%esi
 	shldl	$5,%ecx,%ecx
-	xorl	%ebp,%edi
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%edi,%ebx
 	addl	0(%rsp),%eax
-	vpaddd	%xmm3,%xmm10,%xmm10
-	xorl	%ebp,%esi
+	vpaddd	%xmm3,%xmm10,%xmm9
+	xorl	%edx,%esi
 	movl	%ebx,%edi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%esi
-	movdqa	%xmm10,48(%rsp)
+	addl	%esi,%eax
+	vmovdqa	%xmm9,48(%rsp)
+	xorl	%edx,%edi
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	addl	4(%rsp),%ebp
-	vaesenc	%xmm14,%xmm11,%xmm11
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	-16(%r15),%xmm15
-	xorl	%edx,%edi
+	xorl	%ecx,%edi
 	movl	%eax,%esi
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%edi
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%ebp
-	shrdl	$7,%ebx,%ebx
-	addl	%edi,%ebp
 	addl	8(%rsp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	movl	%ebp,%edi
 	shldl	$5,%ebp,%ebp
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	shrdl	$7,%eax,%eax
 	addl	%ebp,%edx
-	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	addl	12(%rsp),%ecx
-	xorl	%ebx,%edi
+	xorl	%eax,%edi
 	movl	%edx,%esi
 	shldl	$5,%edx,%edx
-	xorl	%eax,%edi
-	vaesenc	%xmm15,%xmm11,%xmm11
+	addl	%edi,%ecx
+	vaesenc	%xmm15,%xmm12,%xmm12
 	vmovups	0(%r15),%xmm14
+	xorl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
 	addl	%edx,%ecx
-	shrdl	$7,%ebp,%ebp
-	addl	%edi,%ecx
 	cmpq	%r14,%r10
 	je	.Ldone_avx
-	vmovdqa	64(%r11),%xmm6
-	vmovdqa	0(%r11),%xmm9
+	vmovdqa	64(%r11),%xmm9
+	vmovdqa	0(%r11),%xmm10
 	vmovdqu	0(%r10),%xmm0
 	vmovdqu	16(%r10),%xmm1
 	vmovdqu	32(%r10),%xmm2
 	vmovdqu	48(%r10),%xmm3
-	vpshufb	%xmm6,%xmm0,%xmm0
+	vpshufb	%xmm9,%xmm0,%xmm0
 	addq	$64,%r10
 	addl	16(%rsp),%ebx
-	xorl	%eax,%esi
-	vpshufb	%xmm6,%xmm1,%xmm1
+	xorl	%ebp,%esi
+	vpshufb	%xmm9,%xmm1,%xmm1
 	movl	%ecx,%edi
 	shldl	$5,%ecx,%ecx
-	vpaddd	%xmm9,%xmm0,%xmm4
-	xorl	%ebp,%esi
+	vpaddd	%xmm10,%xmm0,%xmm8
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
-	vmovdqa	%xmm4,0(%rsp)
+	vmovdqa	%xmm8,0(%rsp)
 	addl	20(%rsp),%eax
-	xorl	%ebp,%edi
+	xorl	%edx,%edi
 	movl	%ebx,%esi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%edi
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%edi,%eax
 	addl	24(%rsp),%ebp
-	vaesenc	%xmm14,%xmm11,%xmm11
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	16(%r15),%xmm15
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%edi
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%esi
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%ebp
-	shrdl	$7,%ebx,%ebx
-	addl	%esi,%ebp
 	addl	28(%rsp),%edx
-	xorl	%ecx,%edi
+	xorl	%ebx,%edi
 	movl	%ebp,%esi
 	shldl	$5,%ebp,%ebp
-	xorl	%ebx,%edi
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
 	addl	%ebp,%edx
-	shrdl	$7,%eax,%eax
-	addl	%edi,%edx
 	addl	32(%rsp),%ecx
-	xorl	%ebx,%esi
-	vpshufb	%xmm6,%xmm2,%xmm2
+	xorl	%eax,%esi
+	vpshufb	%xmm9,%xmm2,%xmm2
 	movl	%edx,%edi
 	shldl	$5,%edx,%edx
-	vpaddd	%xmm9,%xmm1,%xmm5
-	xorl	%eax,%esi
-	vaesenc	%xmm15,%xmm11,%xmm11
+	vpaddd	%xmm10,%xmm1,%xmm8
+	addl	%esi,%ecx
+	vaesenc	%xmm15,%xmm12,%xmm12
 	vmovups	32(%r15),%xmm14
+	xorl	%eax,%edi
+	shrdl	$7,%ebp,%ebp
 	addl	%edx,%ecx
-	shrdl	$7,%ebp,%ebp
-	addl	%esi,%ecx
-	vmovdqa	%xmm5,16(%rsp)
+	vmovdqa	%xmm8,16(%rsp)
 	addl	36(%rsp),%ebx
-	xorl	%eax,%edi
+	xorl	%ebp,%edi
 	movl	%ecx,%esi
 	shldl	$5,%ecx,%ecx
-	xorl	%ebp,%edi
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%edi,%ebx
 	addl	40(%rsp),%eax
-	xorl	%ebp,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%edi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%esi
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	addl	44(%rsp),%ebp
-	vaesenc	%xmm14,%xmm11,%xmm11
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	48(%r15),%xmm15
-	xorl	%edx,%edi
+	xorl	%ecx,%edi
 	movl	%eax,%esi
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%edi
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%ebp
-	shrdl	$7,%ebx,%ebx
-	addl	%edi,%ebp
 	addl	48(%rsp),%edx
-	xorl	%ecx,%esi
-	vpshufb	%xmm6,%xmm3,%xmm3
+	xorl	%ebx,%esi
+	vpshufb	%xmm9,%xmm3,%xmm3
 	movl	%ebp,%edi
 	shldl	$5,%ebp,%ebp
-	vpaddd	%xmm9,%xmm2,%xmm6
-	xorl	%ebx,%esi
+	vpaddd	%xmm10,%xmm2,%xmm8
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	shrdl	$7,%eax,%eax
 	addl	%ebp,%edx
-	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
-	vmovdqa	%xmm6,32(%rsp)
+	vmovdqa	%xmm8,32(%rsp)
 	addl	52(%rsp),%ecx
-	xorl	%ebx,%edi
+	xorl	%eax,%edi
 	movl	%edx,%esi
 	shldl	$5,%edx,%edx
-	xorl	%eax,%edi
+	addl	%edi,%ecx
 	cmpl	$11,%r8d
-	jb	.Lvaesenclast4
-	vaesenc	%xmm15,%xmm11,%xmm11
+	jb	.Lvaesenclast9
+	vaesenc	%xmm15,%xmm12,%xmm12
 	vmovups	64(%r15),%xmm14
-	vaesenc	%xmm14,%xmm11,%xmm11
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	80(%r15),%xmm15
-	je	.Lvaesenclast4
-	vaesenc	%xmm15,%xmm11,%xmm11
+	je	.Lvaesenclast9
+	vaesenc	%xmm15,%xmm12,%xmm12
 	vmovups	96(%r15),%xmm14
-	vaesenc	%xmm14,%xmm11,%xmm11
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	112(%r15),%xmm15
-.Lvaesenclast4:
-	vaesenclast	%xmm15,%xmm11,%xmm11
+.Lvaesenclast9:
+	vaesenclast	%xmm15,%xmm12,%xmm12
+	vmovups	-112(%r15),%xmm15
 	vmovups	16-112(%r15),%xmm14
+	xorl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
 	addl	%edx,%ecx
-	shrdl	$7,%ebp,%ebp
-	addl	%edi,%ecx
 	addl	56(%rsp),%ebx
-	xorl	%eax,%esi
+	xorl	%ebp,%esi
 	movl	%ecx,%edi
 	shldl	$5,%ecx,%ecx
-	xorl	%ebp,%esi
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	addl	60(%rsp),%eax
-	xorl	%ebp,%edi
+	xorl	%edx,%edi
 	movl	%ebx,%esi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%edi
+	addl	%edi,%eax
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%edi,%eax
-	vmovups	%xmm11,48(%r13,%r12,1)
+	vmovups	%xmm12,48(%r13,%r12,1)
 	leaq	64(%r12),%r12
 
 	addl	0(%r9),%eax
@@ -2581,129 +2523,131 @@
 	movl	%esi,4(%r9)
 	movl	%esi,%ebx
 	movl	%ecx,8(%r9)
+	movl	%ecx,%edi
 	movl	%edx,12(%r9)
+	xorl	%edx,%edi
 	movl	%ebp,16(%r9)
+	andl	%edi,%esi
 	jmp	.Loop_avx
 
-.align	16
 .Ldone_avx:
 	addl	16(%rsp),%ebx
-	xorl	%eax,%esi
+	xorl	%ebp,%esi
 	movl	%ecx,%edi
 	shldl	$5,%ecx,%ecx
-	xorl	%ebp,%esi
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	addl	20(%rsp),%eax
-	xorl	%ebp,%edi
+	xorl	%edx,%edi
 	movl	%ebx,%esi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%edi
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%edi,%eax
 	addl	24(%rsp),%ebp
-	vaesenc	%xmm14,%xmm11,%xmm11
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	16(%r15),%xmm15
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%edi
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%esi
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%ebp
-	shrdl	$7,%ebx,%ebx
-	addl	%esi,%ebp
 	addl	28(%rsp),%edx
-	xorl	%ecx,%edi
+	xorl	%ebx,%edi
 	movl	%ebp,%esi
 	shldl	$5,%ebp,%ebp
-	xorl	%ebx,%edi
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
 	addl	%ebp,%edx
-	shrdl	$7,%eax,%eax
-	addl	%edi,%edx
 	addl	32(%rsp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	movl	%edx,%edi
 	shldl	$5,%edx,%edx
-	xorl	%eax,%esi
-	vaesenc	%xmm15,%xmm11,%xmm11
+	addl	%esi,%ecx
+	vaesenc	%xmm15,%xmm12,%xmm12
 	vmovups	32(%r15),%xmm14
+	xorl	%eax,%edi
+	shrdl	$7,%ebp,%ebp
 	addl	%edx,%ecx
-	shrdl	$7,%ebp,%ebp
-	addl	%esi,%ecx
 	addl	36(%rsp),%ebx
-	xorl	%eax,%edi
+	xorl	%ebp,%edi
 	movl	%ecx,%esi
 	shldl	$5,%ecx,%ecx
-	xorl	%ebp,%edi
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%edi,%ebx
 	addl	40(%rsp),%eax
-	xorl	%ebp,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%edi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%esi
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	addl	44(%rsp),%ebp
-	vaesenc	%xmm14,%xmm11,%xmm11
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	48(%r15),%xmm15
-	xorl	%edx,%edi
+	xorl	%ecx,%edi
 	movl	%eax,%esi
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%edi
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%ebp
-	shrdl	$7,%ebx,%ebx
-	addl	%edi,%ebp
 	addl	48(%rsp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	movl	%ebp,%edi
 	shldl	$5,%ebp,%ebp
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	shrdl	$7,%eax,%eax
 	addl	%ebp,%edx
-	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	addl	52(%rsp),%ecx
-	xorl	%ebx,%edi
+	xorl	%eax,%edi
 	movl	%edx,%esi
 	shldl	$5,%edx,%edx
-	xorl	%eax,%edi
+	addl	%edi,%ecx
 	cmpl	$11,%r8d
-	jb	.Lvaesenclast5
-	vaesenc	%xmm15,%xmm11,%xmm11
+	jb	.Lvaesenclast10
+	vaesenc	%xmm15,%xmm12,%xmm12
 	vmovups	64(%r15),%xmm14
-	vaesenc	%xmm14,%xmm11,%xmm11
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	80(%r15),%xmm15
-	je	.Lvaesenclast5
-	vaesenc	%xmm15,%xmm11,%xmm11
+	je	.Lvaesenclast10
+	vaesenc	%xmm15,%xmm12,%xmm12
 	vmovups	96(%r15),%xmm14
-	vaesenc	%xmm14,%xmm11,%xmm11
+	vaesenc	%xmm14,%xmm12,%xmm12
 	vmovups	112(%r15),%xmm15
-.Lvaesenclast5:
-	vaesenclast	%xmm15,%xmm11,%xmm11
+.Lvaesenclast10:
+	vaesenclast	%xmm15,%xmm12,%xmm12
+	vmovups	-112(%r15),%xmm15
 	vmovups	16-112(%r15),%xmm14
+	xorl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
 	addl	%edx,%ecx
-	shrdl	$7,%ebp,%ebp
-	addl	%edi,%ecx
 	addl	56(%rsp),%ebx
-	xorl	%eax,%esi
+	xorl	%ebp,%esi
 	movl	%ecx,%edi
 	shldl	$5,%ecx,%ecx
-	xorl	%ebp,%esi
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	addl	60(%rsp),%eax
-	xorl	%ebp,%edi
+	xorl	%edx,%edi
 	movl	%ebx,%esi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%edi
+	addl	%edi,%eax
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%edi,%eax
-	vmovups	%xmm11,48(%r13,%r12,1)
+	vmovups	%xmm12,48(%r13,%r12,1)
 	movq	88(%rsp),%r8
 
 	addl	0(%r9),%eax
@@ -2716,7 +2660,7 @@
 	movl	%ecx,8(%r9)
 	movl	%edx,12(%r9)
 	movl	%ebp,16(%r9)
-	vmovups	%xmm11,(%r8)
+	vmovups	%xmm12,(%r8)
 	vzeroall
 	leaq	104(%rsp),%rsi
 	movq	0(%rsi),%r15
@@ -2731,11 +2675,314 @@
 .size	aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
 .align	64
 K_XX_XX:
-.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	
-.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	
-.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	
-.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	
-.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
 
 .byte	65,69,83,78,73,45,67,66,67,43,83,72,65,49,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align	64
+.type	aesni_cbc_sha1_enc_shaext, at function
+.align	32
+aesni_cbc_sha1_enc_shaext:
+	movq	8(%rsp),%r10
+	movdqu	(%r9),%xmm8
+	movd	16(%r9),%xmm9
+	movdqa	K_XX_XX+80(%rip),%xmm7
+
+	movl	240(%rcx),%r11d
+	subq	%rdi,%rsi
+	movups	(%rcx),%xmm15
+	movups	(%r8),%xmm2
+	movups	16(%rcx),%xmm0
+	leaq	112(%rcx),%rcx
+
+	pshufd	$27,%xmm8,%xmm8
+	pshufd	$27,%xmm9,%xmm9
+	jmp	.Loop_shaext
+
+.align	16
+.Loop_shaext:
+	movups	0(%rdi),%xmm14
+	xorps	%xmm15,%xmm14
+	xorps	%xmm14,%xmm2
+	movups	-80(%rcx),%xmm1
+.byte	102,15,56,220,208
+	movdqu	(%r10),%xmm3
+	movdqa	%xmm9,%xmm12
+.byte	102,15,56,0,223
+	movdqu	16(%r10),%xmm4
+	movdqa	%xmm8,%xmm11
+	movups	-64(%rcx),%xmm0
+.byte	102,15,56,220,209
+.byte	102,15,56,0,231
+
+	paddd	%xmm3,%xmm9
+	movdqu	32(%r10),%xmm5
+	leaq	64(%r10),%r10
+	pxor	%xmm12,%xmm3
+	movups	-48(%rcx),%xmm1
+.byte	102,15,56,220,208
+	pxor	%xmm12,%xmm3
+	movdqa	%xmm8,%xmm10
+.byte	102,15,56,0,239
+.byte	69,15,58,204,193,0
+.byte	68,15,56,200,212
+	movups	-32(%rcx),%xmm0
+.byte	102,15,56,220,209
+.byte	15,56,201,220
+	movdqu	-16(%r10),%xmm6
+	movdqa	%xmm8,%xmm9
+.byte	102,15,56,0,247
+	movups	-16(%rcx),%xmm1
+.byte	102,15,56,220,208
+.byte	69,15,58,204,194,0
+.byte	68,15,56,200,205
+	pxor	%xmm5,%xmm3
+.byte	15,56,201,229
+	movups	0(%rcx),%xmm0
+.byte	102,15,56,220,209
+	movdqa	%xmm8,%xmm10
+.byte	69,15,58,204,193,0
+.byte	68,15,56,200,214
+	movups	16(%rcx),%xmm1
+.byte	102,15,56,220,208
+.byte	15,56,202,222
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+	movups	32(%rcx),%xmm0
+.byte	102,15,56,220,209
+	movdqa	%xmm8,%xmm9
+.byte	69,15,58,204,194,0
+.byte	68,15,56,200,203
+	movups	48(%rcx),%xmm1
+.byte	102,15,56,220,208
+.byte	15,56,202,227
+	pxor	%xmm3,%xmm5
+.byte	15,56,201,243
+	cmpl	$11,%r11d
+	jb	.Laesenclast11
+	movups	64(%rcx),%xmm0
+.byte	102,15,56,220,209
+	movups	80(%rcx),%xmm1
+.byte	102,15,56,220,208
+	je	.Laesenclast11
+	movups	96(%rcx),%xmm0
+.byte	102,15,56,220,209
+	movups	112(%rcx),%xmm1
+.byte	102,15,56,220,208
+.Laesenclast11:
+.byte	102,15,56,221,209
+	movups	16-112(%rcx),%xmm0
+	movdqa	%xmm8,%xmm10
+.byte	69,15,58,204,193,0
+.byte	68,15,56,200,212
+	movups	16(%rdi),%xmm14
+	xorps	%xmm15,%xmm14
+	movups	%xmm2,0(%rsi,%rdi,1)
+	xorps	%xmm14,%xmm2
+	movups	-80(%rcx),%xmm1
+.byte	102,15,56,220,208
+.byte	15,56,202,236
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,220
+	movups	-64(%rcx),%xmm0
+.byte	102,15,56,220,209
+	movdqa	%xmm8,%xmm9
+.byte	69,15,58,204,194,1
+.byte	68,15,56,200,205
+	movups	-48(%rcx),%xmm1
+.byte	102,15,56,220,208
+.byte	15,56,202,245
+	pxor	%xmm5,%xmm3
+.byte	15,56,201,229
+	movups	-32(%rcx),%xmm0
+.byte	102,15,56,220,209
+	movdqa	%xmm8,%xmm10
+.byte	69,15,58,204,193,1
+.byte	68,15,56,200,214
+	movups	-16(%rcx),%xmm1
+.byte	102,15,56,220,208
+.byte	15,56,202,222
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+	movups	0(%rcx),%xmm0
+.byte	102,15,56,220,209
+	movdqa	%xmm8,%xmm9
+.byte	69,15,58,204,194,1
+.byte	68,15,56,200,203
+	movups	16(%rcx),%xmm1
+.byte	102,15,56,220,208
+.byte	15,56,202,227
+	pxor	%xmm3,%xmm5
+.byte	15,56,201,243
+	movups	32(%rcx),%xmm0
+.byte	102,15,56,220,209
+	movdqa	%xmm8,%xmm10
+.byte	69,15,58,204,193,1
+.byte	68,15,56,200,212
+	movups	48(%rcx),%xmm1
+.byte	102,15,56,220,208
+.byte	15,56,202,236
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,220
+	cmpl	$11,%r11d
+	jb	.Laesenclast12
+	movups	64(%rcx),%xmm0
+.byte	102,15,56,220,209
+	movups	80(%rcx),%xmm1
+.byte	102,15,56,220,208
+	je	.Laesenclast12
+	movups	96(%rcx),%xmm0
+.byte	102,15,56,220,209
+	movups	112(%rcx),%xmm1
+.byte	102,15,56,220,208
+.Laesenclast12:
+.byte	102,15,56,221,209
+	movups	16-112(%rcx),%xmm0
+	movdqa	%xmm8,%xmm9
+.byte	69,15,58,204,194,1
+.byte	68,15,56,200,205
+	movups	32(%rdi),%xmm14
+	xorps	%xmm15,%xmm14
+	movups	%xmm2,16(%rsi,%rdi,1)
+	xorps	%xmm14,%xmm2
+	movups	-80(%rcx),%xmm1
+.byte	102,15,56,220,208
+.byte	15,56,202,245
+	pxor	%xmm5,%xmm3
+.byte	15,56,201,229
+	movups	-64(%rcx),%xmm0
+.byte	102,15,56,220,209
+	movdqa	%xmm8,%xmm10
+.byte	69,15,58,204,193,2
+.byte	68,15,56,200,214
+	movups	-48(%rcx),%xmm1
+.byte	102,15,56,220,208
+.byte	15,56,202,222
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+	movups	-32(%rcx),%xmm0
+.byte	102,15,56,220,209
+	movdqa	%xmm8,%xmm9
+.byte	69,15,58,204,194,2
+.byte	68,15,56,200,203
+	movups	-16(%rcx),%xmm1
+.byte	102,15,56,220,208
+.byte	15,56,202,227
+	pxor	%xmm3,%xmm5
+.byte	15,56,201,243
+	movups	0(%rcx),%xmm0
+.byte	102,15,56,220,209
+	movdqa	%xmm8,%xmm10
+.byte	69,15,58,204,193,2
+.byte	68,15,56,200,212
+	movups	16(%rcx),%xmm1
+.byte	102,15,56,220,208
+.byte	15,56,202,236
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,220
+	movups	32(%rcx),%xmm0
+.byte	102,15,56,220,209
+	movdqa	%xmm8,%xmm9
+.byte	69,15,58,204,194,2
+.byte	68,15,56,200,205
+	movups	48(%rcx),%xmm1
+.byte	102,15,56,220,208
+.byte	15,56,202,245
+	pxor	%xmm5,%xmm3
+.byte	15,56,201,229
+	cmpl	$11,%r11d
+	jb	.Laesenclast13
+	movups	64(%rcx),%xmm0
+.byte	102,15,56,220,209
+	movups	80(%rcx),%xmm1
+.byte	102,15,56,220,208
+	je	.Laesenclast13
+	movups	96(%rcx),%xmm0
+.byte	102,15,56,220,209
+	movups	112(%rcx),%xmm1
+.byte	102,15,56,220,208
+.Laesenclast13:
+.byte	102,15,56,221,209
+	movups	16-112(%rcx),%xmm0
+	movdqa	%xmm8,%xmm10
+.byte	69,15,58,204,193,2
+.byte	68,15,56,200,214
+	movups	48(%rdi),%xmm14
+	xorps	%xmm15,%xmm14
+	movups	%xmm2,32(%rsi,%rdi,1)
+	xorps	%xmm14,%xmm2
+	movups	-80(%rcx),%xmm1
+.byte	102,15,56,220,208
+.byte	15,56,202,222
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+	movups	-64(%rcx),%xmm0
+.byte	102,15,56,220,209
+	movdqa	%xmm8,%xmm9
+.byte	69,15,58,204,194,3
+.byte	68,15,56,200,203
+	movups	-48(%rcx),%xmm1
+.byte	102,15,56,220,208
+.byte	15,56,202,227
+	pxor	%xmm3,%xmm5
+.byte	15,56,201,243
+	movups	-32(%rcx),%xmm0
+.byte	102,15,56,220,209
+	movdqa	%xmm8,%xmm10
+.byte	69,15,58,204,193,3
+.byte	68,15,56,200,212
+.byte	15,56,202,236
+	pxor	%xmm4,%xmm6
+	movups	-16(%rcx),%xmm1
+.byte	102,15,56,220,208
+	movdqa	%xmm8,%xmm9
+.byte	69,15,58,204,194,3
+.byte	68,15,56,200,205
+.byte	15,56,202,245
+	movups	0(%rcx),%xmm0
+.byte	102,15,56,220,209
+	movdqa	%xmm12,%xmm5
+	movdqa	%xmm8,%xmm10
+.byte	69,15,58,204,193,3
+.byte	68,15,56,200,214
+	movups	16(%rcx),%xmm1
+.byte	102,15,56,220,208
+	movdqa	%xmm8,%xmm9
+.byte	69,15,58,204,194,3
+.byte	68,15,56,200,205
+	movups	32(%rcx),%xmm0
+.byte	102,15,56,220,209
+	movups	48(%rcx),%xmm1
+.byte	102,15,56,220,208
+	cmpl	$11,%r11d
+	jb	.Laesenclast14
+	movups	64(%rcx),%xmm0
+.byte	102,15,56,220,209
+	movups	80(%rcx),%xmm1
+.byte	102,15,56,220,208
+	je	.Laesenclast14
+	movups	96(%rcx),%xmm0
+.byte	102,15,56,220,209
+	movups	112(%rcx),%xmm1
+.byte	102,15,56,220,208
+.Laesenclast14:
+.byte	102,15,56,221,209
+	movups	16-112(%rcx),%xmm0
+	decq	%rdx
+
+	paddd	%xmm11,%xmm8
+	movups	%xmm2,48(%rsi,%rdi,1)
+	leaq	64(%rdi),%rdi
+	jnz	.Loop_shaext
+
+	pshufd	$27,%xmm8,%xmm8
+	pshufd	$27,%xmm9,%xmm9
+	movups	%xmm2,(%r8)
+	movdqu	%xmm8,(%r9)
+	movd	%xmm9,16(%r9)
+	.byte	0xf3,0xc3
+.size	aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext

Modified: trunk/secure/lib/libcrypto/amd64/aesni-x86_64.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/aesni-x86_64.S	2019-01-20 05:38:02 UTC (rev 12152)
+++ trunk/secure/lib/libcrypto/amd64/aesni-x86_64.S	2019-01-20 05:38:15 UTC (rev 12153)
@@ -1,7 +1,8 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/amd64/aesni-x86_64.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from aesni-x86_64.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/aesni-x86_64.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from aesni-x86_64.pl. */
 .text	
+
 .globl	aesni_encrypt
 .type	aesni_encrypt, at function
 .align	16
@@ -17,9 +18,12 @@
 	decl	%eax
 	movups	(%rdx),%xmm1
 	leaq	16(%rdx),%rdx
-	jnz	.Loop_enc1_1	
+	jnz	.Loop_enc1_1
 .byte	102,15,56,221,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
 	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
 	.byte	0xf3,0xc3
 .size	aesni_encrypt,.-aesni_encrypt
 
@@ -38,34 +42,96 @@
 	decl	%eax
 	movups	(%rdx),%xmm1
 	leaq	16(%rdx),%rdx
-	jnz	.Loop_dec1_2	
+	jnz	.Loop_dec1_2
 .byte	102,15,56,223,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
 	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
 	.byte	0xf3,0xc3
 .size	aesni_decrypt, .-aesni_decrypt
+.type	_aesni_encrypt2, at function
+.align	16
+_aesni_encrypt2:
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	movups	32(%rcx),%xmm0
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+	addq	$16,%rax
+
+.Lenc_loop2:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Lenc_loop2
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+	.byte	0xf3,0xc3
+.size	_aesni_encrypt2,.-_aesni_encrypt2
+.type	_aesni_decrypt2, at function
+.align	16
+_aesni_decrypt2:
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	movups	32(%rcx),%xmm0
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+	addq	$16,%rax
+
+.Ldec_loop2:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Ldec_loop2
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+	.byte	0xf3,0xc3
+.size	_aesni_decrypt2,.-_aesni_decrypt2
 .type	_aesni_encrypt3, at function
 .align	16
 _aesni_encrypt3:
 	movups	(%rcx),%xmm0
-	shrl	$1,%eax
+	shll	$4,%eax
 	movups	16(%rcx),%xmm1
-	leaq	32(%rcx),%rcx
 	xorps	%xmm0,%xmm2
 	xorps	%xmm0,%xmm3
 	xorps	%xmm0,%xmm4
-	movups	(%rcx),%xmm0
+	movups	32(%rcx),%xmm0
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+	addq	$16,%rax
 
 .Lenc_loop3:
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
-	decl	%eax
 .byte	102,15,56,220,225
-	movups	16(%rcx),%xmm1
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
 .byte	102,15,56,220,208
 .byte	102,15,56,220,216
-	leaq	32(%rcx),%rcx
 .byte	102,15,56,220,224
-	movups	(%rcx),%xmm0
+	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	.Lenc_loop3
 
 .byte	102,15,56,220,209
@@ -80,25 +146,26 @@
 .align	16
 _aesni_decrypt3:
 	movups	(%rcx),%xmm0
-	shrl	$1,%eax
+	shll	$4,%eax
 	movups	16(%rcx),%xmm1
-	leaq	32(%rcx),%rcx
 	xorps	%xmm0,%xmm2
 	xorps	%xmm0,%xmm3
 	xorps	%xmm0,%xmm4
-	movups	(%rcx),%xmm0
+	movups	32(%rcx),%xmm0
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+	addq	$16,%rax
 
 .Ldec_loop3:
 .byte	102,15,56,222,209
 .byte	102,15,56,222,217
-	decl	%eax
 .byte	102,15,56,222,225
-	movups	16(%rcx),%xmm1
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
 .byte	102,15,56,222,208
 .byte	102,15,56,222,216
-	leaq	32(%rcx),%rcx
 .byte	102,15,56,222,224
-	movups	(%rcx),%xmm0
+	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	.Ldec_loop3
 
 .byte	102,15,56,222,209
@@ -113,28 +180,30 @@
 .align	16
 _aesni_encrypt4:
 	movups	(%rcx),%xmm0
-	shrl	$1,%eax
+	shll	$4,%eax
 	movups	16(%rcx),%xmm1
-	leaq	32(%rcx),%rcx
 	xorps	%xmm0,%xmm2
 	xorps	%xmm0,%xmm3
 	xorps	%xmm0,%xmm4
 	xorps	%xmm0,%xmm5
-	movups	(%rcx),%xmm0
+	movups	32(%rcx),%xmm0
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	0x0f,0x1f,0x00
+	addq	$16,%rax
 
 .Lenc_loop4:
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
-	decl	%eax
 .byte	102,15,56,220,225
 .byte	102,15,56,220,233
-	movups	16(%rcx),%xmm1
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
 .byte	102,15,56,220,208
 .byte	102,15,56,220,216
-	leaq	32(%rcx),%rcx
 .byte	102,15,56,220,224
 .byte	102,15,56,220,232
-	movups	(%rcx),%xmm0
+	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	.Lenc_loop4
 
 .byte	102,15,56,220,209
@@ -151,28 +220,30 @@
 .align	16
 _aesni_decrypt4:
 	movups	(%rcx),%xmm0
-	shrl	$1,%eax
+	shll	$4,%eax
 	movups	16(%rcx),%xmm1
-	leaq	32(%rcx),%rcx
 	xorps	%xmm0,%xmm2
 	xorps	%xmm0,%xmm3
 	xorps	%xmm0,%xmm4
 	xorps	%xmm0,%xmm5
-	movups	(%rcx),%xmm0
+	movups	32(%rcx),%xmm0
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	0x0f,0x1f,0x00
+	addq	$16,%rax
 
 .Ldec_loop4:
 .byte	102,15,56,222,209
 .byte	102,15,56,222,217
-	decl	%eax
 .byte	102,15,56,222,225
 .byte	102,15,56,222,233
-	movups	16(%rcx),%xmm1
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
 .byte	102,15,56,222,208
 .byte	102,15,56,222,216
-	leaq	32(%rcx),%rcx
 .byte	102,15,56,222,224
 .byte	102,15,56,222,232
-	movups	(%rcx),%xmm0
+	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	.Ldec_loop4
 
 .byte	102,15,56,222,209
@@ -189,43 +260,40 @@
 .align	16
 _aesni_encrypt6:
 	movups	(%rcx),%xmm0
-	shrl	$1,%eax
+	shll	$4,%eax
 	movups	16(%rcx),%xmm1
-	leaq	32(%rcx),%rcx
 	xorps	%xmm0,%xmm2
 	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
 .byte	102,15,56,220,209
-	pxor	%xmm0,%xmm4
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
 .byte	102,15,56,220,217
 	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
 .byte	102,15,56,220,225
-	pxor	%xmm0,%xmm6
-.byte	102,15,56,220,233
 	pxor	%xmm0,%xmm7
-	decl	%eax
-.byte	102,15,56,220,241
-	movups	(%rcx),%xmm0
-.byte	102,15,56,220,249
+	movups	(%rcx,%rax,1),%xmm0
+	addq	$16,%rax
 	jmp	.Lenc_loop6_enter
 .align	16
 .Lenc_loop6:
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
-	decl	%eax
 .byte	102,15,56,220,225
+.Lenc_loop6_enter:
 .byte	102,15,56,220,233
 .byte	102,15,56,220,241
 .byte	102,15,56,220,249
-.Lenc_loop6_enter:
-	movups	16(%rcx),%xmm1
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
 .byte	102,15,56,220,208
 .byte	102,15,56,220,216
-	leaq	32(%rcx),%rcx
 .byte	102,15,56,220,224
 .byte	102,15,56,220,232
 .byte	102,15,56,220,240
 .byte	102,15,56,220,248
-	movups	(%rcx),%xmm0
+	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	.Lenc_loop6
 
 .byte	102,15,56,220,209
@@ -246,43 +314,40 @@
 .align	16
 _aesni_decrypt6:
 	movups	(%rcx),%xmm0
-	shrl	$1,%eax
+	shll	$4,%eax
 	movups	16(%rcx),%xmm1
-	leaq	32(%rcx),%rcx
 	xorps	%xmm0,%xmm2
 	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
 .byte	102,15,56,222,209
-	pxor	%xmm0,%xmm4
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
 .byte	102,15,56,222,217
 	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
 .byte	102,15,56,222,225
-	pxor	%xmm0,%xmm6
-.byte	102,15,56,222,233
 	pxor	%xmm0,%xmm7
-	decl	%eax
-.byte	102,15,56,222,241
-	movups	(%rcx),%xmm0
-.byte	102,15,56,222,249
+	movups	(%rcx,%rax,1),%xmm0
+	addq	$16,%rax
 	jmp	.Ldec_loop6_enter
 .align	16
 .Ldec_loop6:
 .byte	102,15,56,222,209
 .byte	102,15,56,222,217
-	decl	%eax
 .byte	102,15,56,222,225
+.Ldec_loop6_enter:
 .byte	102,15,56,222,233
 .byte	102,15,56,222,241
 .byte	102,15,56,222,249
-.Ldec_loop6_enter:
-	movups	16(%rcx),%xmm1
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
 .byte	102,15,56,222,208
 .byte	102,15,56,222,216
-	leaq	32(%rcx),%rcx
 .byte	102,15,56,222,224
 .byte	102,15,56,222,232
 .byte	102,15,56,222,240
 .byte	102,15,56,222,248
-	movups	(%rcx),%xmm0
+	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	.Ldec_loop6
 
 .byte	102,15,56,222,209
@@ -303,34 +368,28 @@
 .align	16
 _aesni_encrypt8:
 	movups	(%rcx),%xmm0
-	shrl	$1,%eax
+	shll	$4,%eax
 	movups	16(%rcx),%xmm1
-	leaq	32(%rcx),%rcx
 	xorps	%xmm0,%xmm2
 	xorps	%xmm0,%xmm3
-.byte	102,15,56,220,209
 	pxor	%xmm0,%xmm4
-.byte	102,15,56,220,217
 	pxor	%xmm0,%xmm5
-.byte	102,15,56,220,225
 	pxor	%xmm0,%xmm6
-.byte	102,15,56,220,233
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	102,15,56,220,209
 	pxor	%xmm0,%xmm7
-	decl	%eax
-.byte	102,15,56,220,241
 	pxor	%xmm0,%xmm8
-.byte	102,15,56,220,249
+.byte	102,15,56,220,217
 	pxor	%xmm0,%xmm9
-	movups	(%rcx),%xmm0
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
-	movups	16(%rcx),%xmm1
-	jmp	.Lenc_loop8_enter
+	movups	(%rcx,%rax,1),%xmm0
+	addq	$16,%rax
+	jmp	.Lenc_loop8_inner
 .align	16
 .Lenc_loop8:
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
-	decl	%eax
+.Lenc_loop8_inner:
 .byte	102,15,56,220,225
 .byte	102,15,56,220,233
 .byte	102,15,56,220,241
@@ -337,11 +396,11 @@
 .byte	102,15,56,220,249
 .byte	102,68,15,56,220,193
 .byte	102,68,15,56,220,201
-	movups	16(%rcx),%xmm1
 .Lenc_loop8_enter:
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
 .byte	102,15,56,220,208
 .byte	102,15,56,220,216
-	leaq	32(%rcx),%rcx
 .byte	102,15,56,220,224
 .byte	102,15,56,220,232
 .byte	102,15,56,220,240
@@ -348,7 +407,7 @@
 .byte	102,15,56,220,248
 .byte	102,68,15,56,220,192
 .byte	102,68,15,56,220,200
-	movups	(%rcx),%xmm0
+	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	.Lenc_loop8
 
 .byte	102,15,56,220,209
@@ -373,34 +432,28 @@
 .align	16
 _aesni_decrypt8:
 	movups	(%rcx),%xmm0
-	shrl	$1,%eax
+	shll	$4,%eax
 	movups	16(%rcx),%xmm1
-	leaq	32(%rcx),%rcx
 	xorps	%xmm0,%xmm2
 	xorps	%xmm0,%xmm3
-.byte	102,15,56,222,209
 	pxor	%xmm0,%xmm4
-.byte	102,15,56,222,217
 	pxor	%xmm0,%xmm5
-.byte	102,15,56,222,225
 	pxor	%xmm0,%xmm6
-.byte	102,15,56,222,233
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	102,15,56,222,209
 	pxor	%xmm0,%xmm7
-	decl	%eax
-.byte	102,15,56,222,241
 	pxor	%xmm0,%xmm8
-.byte	102,15,56,222,249
+.byte	102,15,56,222,217
 	pxor	%xmm0,%xmm9
-	movups	(%rcx),%xmm0
-.byte	102,68,15,56,222,193
-.byte	102,68,15,56,222,201
-	movups	16(%rcx),%xmm1
-	jmp	.Ldec_loop8_enter
+	movups	(%rcx,%rax,1),%xmm0
+	addq	$16,%rax
+	jmp	.Ldec_loop8_inner
 .align	16
 .Ldec_loop8:
 .byte	102,15,56,222,209
 .byte	102,15,56,222,217
-	decl	%eax
+.Ldec_loop8_inner:
 .byte	102,15,56,222,225
 .byte	102,15,56,222,233
 .byte	102,15,56,222,241
@@ -407,11 +460,11 @@
 .byte	102,15,56,222,249
 .byte	102,68,15,56,222,193
 .byte	102,68,15,56,222,201
-	movups	16(%rcx),%xmm1
 .Ldec_loop8_enter:
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
 .byte	102,15,56,222,208
 .byte	102,15,56,222,216
-	leaq	32(%rcx),%rcx
 .byte	102,15,56,222,224
 .byte	102,15,56,222,232
 .byte	102,15,56,222,240
@@ -418,7 +471,7 @@
 .byte	102,15,56,222,248
 .byte	102,68,15,56,222,192
 .byte	102,68,15,56,222,200
-	movups	(%rcx),%xmm0
+	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	.Ldec_loop8
 
 .byte	102,15,56,222,209
@@ -453,7 +506,7 @@
 	testl	%r8d,%r8d
 	jz	.Lecb_decrypt
 
-	cmpq	$128,%rdx
+	cmpq	$0x80,%rdx
 	jb	.Lecb_enc_tail
 
 	movdqu	(%rdi),%xmm2
@@ -465,7 +518,7 @@
 	movdqu	96(%rdi),%xmm8
 	movdqu	112(%rdi),%xmm9
 	leaq	128(%rdi),%rdi
-	subq	$128,%rdx
+	subq	$0x80,%rdx
 	jmp	.Lecb_enc_loop8_enter
 .align	16
 .Lecb_enc_loop8:
@@ -493,7 +546,7 @@
 
 	call	_aesni_encrypt8
 
-	subq	$128,%rdx
+	subq	$0x80,%rdx
 	jnc	.Lecb_enc_loop8
 
 	movups	%xmm2,(%rsi)
@@ -507,26 +560,27 @@
 	movups	%xmm8,96(%rsi)
 	movups	%xmm9,112(%rsi)
 	leaq	128(%rsi),%rsi
-	addq	$128,%rdx
+	addq	$0x80,%rdx
 	jz	.Lecb_ret
 
 .Lecb_enc_tail:
 	movups	(%rdi),%xmm2
-	cmpq	$32,%rdx
+	cmpq	$0x20,%rdx
 	jb	.Lecb_enc_one
 	movups	16(%rdi),%xmm3
 	je	.Lecb_enc_two
 	movups	32(%rdi),%xmm4
-	cmpq	$64,%rdx
+	cmpq	$0x40,%rdx
 	jb	.Lecb_enc_three
 	movups	48(%rdi),%xmm5
 	je	.Lecb_enc_four
 	movups	64(%rdi),%xmm6
-	cmpq	$96,%rdx
+	cmpq	$0x60,%rdx
 	jb	.Lecb_enc_five
 	movups	80(%rdi),%xmm7
 	je	.Lecb_enc_six
 	movdqu	96(%rdi),%xmm8
+	xorps	%xmm9,%xmm9
 	call	_aesni_encrypt8
 	movups	%xmm2,(%rsi)
 	movups	%xmm3,16(%rsi)
@@ -547,14 +601,13 @@
 	decl	%eax
 	movups	(%rcx),%xmm1
 	leaq	16(%rcx),%rcx
-	jnz	.Loop_enc1_3	
+	jnz	.Loop_enc1_3
 .byte	102,15,56,221,209
 	movups	%xmm2,(%rsi)
 	jmp	.Lecb_ret
 .align	16
 .Lecb_enc_two:
-	xorps	%xmm4,%xmm4
-	call	_aesni_encrypt3
+	call	_aesni_encrypt2
 	movups	%xmm2,(%rsi)
 	movups	%xmm3,16(%rsi)
 	jmp	.Lecb_ret
@@ -596,7 +649,7 @@
 
 .align	16
 .Lecb_decrypt:
-	cmpq	$128,%rdx
+	cmpq	$0x80,%rdx
 	jb	.Lecb_dec_tail
 
 	movdqu	(%rdi),%xmm2
@@ -608,7 +661,7 @@
 	movdqu	96(%rdi),%xmm8
 	movdqu	112(%rdi),%xmm9
 	leaq	128(%rdi),%rdi
-	subq	$128,%rdx
+	subq	$0x80,%rdx
 	jmp	.Lecb_dec_loop8_enter
 .align	16
 .Lecb_dec_loop8:
@@ -637,49 +690,66 @@
 	call	_aesni_decrypt8
 
 	movups	(%r11),%xmm0
-	subq	$128,%rdx
+	subq	$0x80,%rdx
 	jnc	.Lecb_dec_loop8
 
 	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
 	movq	%r11,%rcx
 	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
 	movl	%r10d,%eax
 	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
 	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
 	movups	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm6
 	movups	%xmm7,80(%rsi)
+	pxor	%xmm7,%xmm7
 	movups	%xmm8,96(%rsi)
+	pxor	%xmm8,%xmm8
 	movups	%xmm9,112(%rsi)
+	pxor	%xmm9,%xmm9
 	leaq	128(%rsi),%rsi
-	addq	$128,%rdx
+	addq	$0x80,%rdx
 	jz	.Lecb_ret
 
 .Lecb_dec_tail:
 	movups	(%rdi),%xmm2
-	cmpq	$32,%rdx
+	cmpq	$0x20,%rdx
 	jb	.Lecb_dec_one
 	movups	16(%rdi),%xmm3
 	je	.Lecb_dec_two
 	movups	32(%rdi),%xmm4
-	cmpq	$64,%rdx
+	cmpq	$0x40,%rdx
 	jb	.Lecb_dec_three
 	movups	48(%rdi),%xmm5
 	je	.Lecb_dec_four
 	movups	64(%rdi),%xmm6
-	cmpq	$96,%rdx
+	cmpq	$0x60,%rdx
 	jb	.Lecb_dec_five
 	movups	80(%rdi),%xmm7
 	je	.Lecb_dec_six
 	movups	96(%rdi),%xmm8
 	movups	(%rcx),%xmm0
+	xorps	%xmm9,%xmm9
 	call	_aesni_decrypt8
 	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
 	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
 	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
 	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
 	movups	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm6
 	movups	%xmm7,80(%rsi)
+	pxor	%xmm7,%xmm7
 	movups	%xmm8,96(%rsi)
+	pxor	%xmm8,%xmm8
+	pxor	%xmm9,%xmm9
 	jmp	.Lecb_ret
 .align	16
 .Lecb_dec_one:
@@ -692,31 +762,40 @@
 	decl	%eax
 	movups	(%rcx),%xmm1
 	leaq	16(%rcx),%rcx
-	jnz	.Loop_dec1_4	
+	jnz	.Loop_dec1_4
 .byte	102,15,56,223,209
 	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
 	jmp	.Lecb_ret
 .align	16
 .Lecb_dec_two:
-	xorps	%xmm4,%xmm4
-	call	_aesni_decrypt3
+	call	_aesni_decrypt2
 	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
 	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
 	jmp	.Lecb_ret
 .align	16
 .Lecb_dec_three:
 	call	_aesni_decrypt3
 	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
 	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
 	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
 	jmp	.Lecb_ret
 .align	16
 .Lecb_dec_four:
 	call	_aesni_decrypt4
 	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
 	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
 	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
 	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
 	jmp	.Lecb_ret
 .align	16
 .Lecb_dec_five:
@@ -723,22 +802,36 @@
 	xorps	%xmm7,%xmm7
 	call	_aesni_decrypt6
 	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
 	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
 	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
 	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
 	movups	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
 	jmp	.Lecb_ret
 .align	16
 .Lecb_dec_six:
 	call	_aesni_decrypt6
 	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
 	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
 	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
 	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
 	movups	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm6
 	movups	%xmm7,80(%rsi)
+	pxor	%xmm7,%xmm7
 
 .Lecb_ret:
+	xorps	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
 	.byte	0xf3,0xc3
 .size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
 .globl	aesni_ccm64_encrypt_blocks
@@ -746,56 +839,62 @@
 .align	16
 aesni_ccm64_encrypt_blocks:
 	movl	240(%rcx),%eax
-	movdqu	(%r8),%xmm9
-	movdqa	.Lincrement64(%rip),%xmm6
+	movdqu	(%r8),%xmm6
+	movdqa	.Lincrement64(%rip),%xmm9
 	movdqa	.Lbswap_mask(%rip),%xmm7
 
-	shrl	$1,%eax
+	shll	$4,%eax
+	movl	$16,%r10d
 	leaq	0(%rcx),%r11
 	movdqu	(%r9),%xmm3
-	movdqa	%xmm9,%xmm2
-	movl	%eax,%r10d
-.byte	102,68,15,56,0,207
+	movdqa	%xmm6,%xmm2
+	leaq	32(%rcx,%rax,1),%rcx
+.byte	102,15,56,0,247
+	subq	%rax,%r10
 	jmp	.Lccm64_enc_outer
 .align	16
 .Lccm64_enc_outer:
 	movups	(%r11),%xmm0
-	movl	%r10d,%eax
+	movq	%r10,%rax
 	movups	(%rdi),%xmm8
 
 	xorps	%xmm0,%xmm2
 	movups	16(%r11),%xmm1
 	xorps	%xmm8,%xmm0
-	leaq	32(%r11),%rcx
 	xorps	%xmm0,%xmm3
-	movups	(%rcx),%xmm0
+	movups	32(%r11),%xmm0
 
 .Lccm64_enc2_loop:
 .byte	102,15,56,220,209
-	decl	%eax
 .byte	102,15,56,220,217
-	movups	16(%rcx),%xmm1
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
 .byte	102,15,56,220,208
-	leaq	32(%rcx),%rcx
 .byte	102,15,56,220,216
-	movups	0(%rcx),%xmm0
+	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	.Lccm64_enc2_loop
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
-	paddq	%xmm6,%xmm9
+	paddq	%xmm9,%xmm6
+	decq	%rdx
 .byte	102,15,56,221,208
 .byte	102,15,56,221,216
 
-	decq	%rdx
 	leaq	16(%rdi),%rdi
 	xorps	%xmm2,%xmm8
-	movdqa	%xmm9,%xmm2
+	movdqa	%xmm6,%xmm2
 	movups	%xmm8,(%rsi)
+.byte	102,15,56,0,215
 	leaq	16(%rsi),%rsi
-.byte	102,15,56,0,215
 	jnz	.Lccm64_enc_outer
 
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
 	movups	%xmm3,(%r9)
+	pxor	%xmm3,%xmm3
+	pxor	%xmm8,%xmm8
+	pxor	%xmm6,%xmm6
 	.byte	0xf3,0xc3
 .size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
 .globl	aesni_ccm64_decrypt_blocks
@@ -803,15 +902,15 @@
 .align	16
 aesni_ccm64_decrypt_blocks:
 	movl	240(%rcx),%eax
-	movups	(%r8),%xmm9
+	movups	(%r8),%xmm6
 	movdqu	(%r9),%xmm3
-	movdqa	.Lincrement64(%rip),%xmm6
+	movdqa	.Lincrement64(%rip),%xmm9
 	movdqa	.Lbswap_mask(%rip),%xmm7
 
-	movaps	%xmm9,%xmm2
+	movaps	%xmm6,%xmm2
 	movl	%eax,%r10d
 	movq	%rcx,%r11
-.byte	102,68,15,56,0,207
+.byte	102,15,56,0,247
 	movups	(%rcx),%xmm0
 	movups	16(%rcx),%xmm1
 	leaq	32(%rcx),%rcx
@@ -821,17 +920,21 @@
 	decl	%eax
 	movups	(%rcx),%xmm1
 	leaq	16(%rcx),%rcx
-	jnz	.Loop_enc1_5	
+	jnz	.Loop_enc1_5
 .byte	102,15,56,221,209
+	shll	$4,%r10d
+	movl	$16,%eax
 	movups	(%rdi),%xmm8
-	paddq	%xmm6,%xmm9
+	paddq	%xmm9,%xmm6
 	leaq	16(%rdi),%rdi
+	subq	%r10,%rax
+	leaq	32(%r11,%r10,1),%rcx
+	movq	%rax,%r10
 	jmp	.Lccm64_dec_outer
 .align	16
 .Lccm64_dec_outer:
 	xorps	%xmm2,%xmm8
-	movdqa	%xmm9,%xmm2
-	movl	%r10d,%eax
+	movdqa	%xmm6,%xmm2
 	movups	%xmm8,(%rsi)
 	leaq	16(%rsi),%rsi
 .byte	102,15,56,0,215
@@ -840,36 +943,36 @@
 	jz	.Lccm64_dec_break
 
 	movups	(%r11),%xmm0
-	shrl	$1,%eax
+	movq	%r10,%rax
 	movups	16(%r11),%xmm1
 	xorps	%xmm0,%xmm8
-	leaq	32(%r11),%rcx
 	xorps	%xmm0,%xmm2
 	xorps	%xmm8,%xmm3
-	movups	(%rcx),%xmm0
-
+	movups	32(%r11),%xmm0
+	jmp	.Lccm64_dec2_loop
+.align	16
 .Lccm64_dec2_loop:
 .byte	102,15,56,220,209
-	decl	%eax
 .byte	102,15,56,220,217
-	movups	16(%rcx),%xmm1
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
 .byte	102,15,56,220,208
-	leaq	32(%rcx),%rcx
 .byte	102,15,56,220,216
-	movups	0(%rcx),%xmm0
+	movups	-16(%rcx,%rax,1),%xmm0
 	jnz	.Lccm64_dec2_loop
 	movups	(%rdi),%xmm8
-	paddq	%xmm6,%xmm9
+	paddq	%xmm9,%xmm6
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
-	leaq	16(%rdi),%rdi
 .byte	102,15,56,221,208
 .byte	102,15,56,221,216
+	leaq	16(%rdi),%rdi
 	jmp	.Lccm64_dec_outer
 
 .align	16
 .Lccm64_dec_break:
 
+	movl	240(%r11),%eax
 	movups	(%r11),%xmm0
 	movups	16(%r11),%xmm1
 	xorps	%xmm0,%xmm8
@@ -880,9 +983,15 @@
 	decl	%eax
 	movups	(%r11),%xmm1
 	leaq	16(%r11),%r11
-	jnz	.Loop_enc1_6	
+	jnz	.Loop_enc1_6
 .byte	102,15,56,221,217
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
 	movups	%xmm3,(%r9)
+	pxor	%xmm3,%xmm3
+	pxor	%xmm8,%xmm8
+	pxor	%xmm6,%xmm6
 	.byte	0xf3,0xc3
 .size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
 .globl	aesni_ctr32_encrypt_blocks
@@ -890,247 +999,572 @@
 .align	16
 aesni_ctr32_encrypt_blocks:
 	cmpq	$1,%rdx
-	je	.Lctr32_one_shortcut
+	jne	.Lctr32_bulk
 
-	movdqu	(%r8),%xmm14
-	movdqa	.Lbswap_mask(%rip),%xmm15
-	xorl	%eax,%eax
-.byte	102,69,15,58,22,242,3
-.byte	102,68,15,58,34,240,3
 
+
+	movups	(%r8),%xmm2
+	movups	(%rdi),%xmm3
+	movl	240(%rcx),%edx
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+.Loop_enc1_7:
+.byte	102,15,56,220,209
+	decl	%edx
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	.Loop_enc1_7
+.byte	102,15,56,221,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	xorps	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movups	%xmm2,(%rsi)
+	xorps	%xmm2,%xmm2
+	jmp	.Lctr32_epilogue
+
+.align	16
+.Lctr32_bulk:
+	leaq	(%rsp),%rax
+	pushq	%rbp
+	subq	$128,%rsp
+	andq	$-16,%rsp
+	leaq	-8(%rax),%rbp
+
+
+
+
+	movdqu	(%r8),%xmm2
+	movdqu	(%rcx),%xmm0
+	movl	12(%r8),%r8d
+	pxor	%xmm0,%xmm2
+	movl	12(%rcx),%r11d
+	movdqa	%xmm2,0(%rsp)
+	bswapl	%r8d
+	movdqa	%xmm2,%xmm3
+	movdqa	%xmm2,%xmm4
+	movdqa	%xmm2,%xmm5
+	movdqa	%xmm2,64(%rsp)
+	movdqa	%xmm2,80(%rsp)
+	movdqa	%xmm2,96(%rsp)
+	movq	%rdx,%r10
+	movdqa	%xmm2,112(%rsp)
+
+	leaq	1(%r8),%rax
+	leaq	2(%r8),%rdx
+	bswapl	%eax
+	bswapl	%edx
+	xorl	%r11d,%eax
+	xorl	%r11d,%edx
+.byte	102,15,58,34,216,3
+	leaq	3(%r8),%rax
+	movdqa	%xmm3,16(%rsp)
+.byte	102,15,58,34,226,3
+	bswapl	%eax
+	movq	%r10,%rdx
+	leaq	4(%r8),%r10
+	movdqa	%xmm4,32(%rsp)
+	xorl	%r11d,%eax
+	bswapl	%r10d
+.byte	102,15,58,34,232,3
+	xorl	%r11d,%r10d
+	movdqa	%xmm5,48(%rsp)
+	leaq	5(%r8),%r9
+	movl	%r10d,64+12(%rsp)
+	bswapl	%r9d
+	leaq	6(%r8),%r10
 	movl	240(%rcx),%eax
+	xorl	%r11d,%r9d
 	bswapl	%r10d
-	pxor	%xmm12,%xmm12
-	pxor	%xmm13,%xmm13
-.byte	102,69,15,58,34,226,0
-	leaq	3(%r10),%r11
-.byte	102,69,15,58,34,235,0
-	incl	%r10d
-.byte	102,69,15,58,34,226,1
-	incq	%r11
-.byte	102,69,15,58,34,235,1
-	incl	%r10d
-.byte	102,69,15,58,34,226,2
-	incq	%r11
-.byte	102,69,15,58,34,235,2
-	movdqa	%xmm12,-40(%rsp)
-.byte	102,69,15,56,0,231
-	movdqa	%xmm13,-24(%rsp)
-.byte	102,69,15,56,0,239
+	movl	%r9d,80+12(%rsp)
+	xorl	%r11d,%r10d
+	leaq	7(%r8),%r9
+	movl	%r10d,96+12(%rsp)
+	bswapl	%r9d
+	movl	OPENSSL_ia32cap_P+4(%rip),%r10d
+	xorl	%r11d,%r9d
+	andl	$71303168,%r10d
+	movl	%r9d,112+12(%rsp)
 
-	pshufd	$192,%xmm12,%xmm2
-	pshufd	$128,%xmm12,%xmm3
-	pshufd	$64,%xmm12,%xmm4
-	cmpq	$6,%rdx
+	movups	16(%rcx),%xmm1
+
+	movdqa	64(%rsp),%xmm6
+	movdqa	80(%rsp),%xmm7
+
+	cmpq	$8,%rdx
 	jb	.Lctr32_tail
-	shrl	$1,%eax
-	movq	%rcx,%r11
-	movl	%eax,%r10d
+
 	subq	$6,%rdx
+	cmpl	$4194304,%r10d
+	je	.Lctr32_6x
+
+	leaq	128(%rcx),%rcx
+	subq	$2,%rdx
+	jmp	.Lctr32_loop8
+
+.align	16
+.Lctr32_6x:
+	shll	$4,%eax
+	movl	$48,%r10d
+	bswapl	%r11d
+	leaq	32(%rcx,%rax,1),%rcx
+	subq	%rax,%r10
 	jmp	.Lctr32_loop6
 
 .align	16
 .Lctr32_loop6:
-	pshufd	$192,%xmm13,%xmm5
-	por	%xmm14,%xmm2
-	movups	(%r11),%xmm0
-	pshufd	$128,%xmm13,%xmm6
-	por	%xmm14,%xmm3
-	movups	16(%r11),%xmm1
-	pshufd	$64,%xmm13,%xmm7
-	por	%xmm14,%xmm4
-	por	%xmm14,%xmm5
-	xorps	%xmm0,%xmm2
-	por	%xmm14,%xmm6
-	por	%xmm14,%xmm7
+	addl	$6,%r8d
+	movups	-48(%rcx,%r10,1),%xmm0
+.byte	102,15,56,220,209
+	movl	%r8d,%eax
+	xorl	%r11d,%eax
+.byte	102,15,56,220,217
+.byte	0x0f,0x38,0xf1,0x44,0x24,12
+	leal	1(%r8),%eax
+.byte	102,15,56,220,225
+	xorl	%r11d,%eax
+.byte	0x0f,0x38,0xf1,0x44,0x24,28
+.byte	102,15,56,220,233
+	leal	2(%r8),%eax
+	xorl	%r11d,%eax
+.byte	102,15,56,220,241
+.byte	0x0f,0x38,0xf1,0x44,0x24,44
+	leal	3(%r8),%eax
+.byte	102,15,56,220,249
+	movups	-32(%rcx,%r10,1),%xmm1
+	xorl	%r11d,%eax
 
+.byte	102,15,56,220,208
+.byte	0x0f,0x38,0xf1,0x44,0x24,60
+	leal	4(%r8),%eax
+.byte	102,15,56,220,216
+	xorl	%r11d,%eax
+.byte	0x0f,0x38,0xf1,0x44,0x24,76
+.byte	102,15,56,220,224
+	leal	5(%r8),%eax
+	xorl	%r11d,%eax
+.byte	102,15,56,220,232
+.byte	0x0f,0x38,0xf1,0x44,0x24,92
+	movq	%r10,%rax
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+	movups	-16(%rcx,%r10,1),%xmm0
 
+	call	.Lenc_loop6
 
+	movdqu	(%rdi),%xmm8
+	movdqu	16(%rdi),%xmm9
+	movdqu	32(%rdi),%xmm10
+	movdqu	48(%rdi),%xmm11
+	movdqu	64(%rdi),%xmm12
+	movdqu	80(%rdi),%xmm13
+	leaq	96(%rdi),%rdi
+	movups	-64(%rcx,%r10,1),%xmm1
+	pxor	%xmm2,%xmm8
+	movaps	0(%rsp),%xmm2
+	pxor	%xmm3,%xmm9
+	movaps	16(%rsp),%xmm3
+	pxor	%xmm4,%xmm10
+	movaps	32(%rsp),%xmm4
+	pxor	%xmm5,%xmm11
+	movaps	48(%rsp),%xmm5
+	pxor	%xmm6,%xmm12
+	movaps	64(%rsp),%xmm6
+	pxor	%xmm7,%xmm13
+	movaps	80(%rsp),%xmm7
+	movdqu	%xmm8,(%rsi)
+	movdqu	%xmm9,16(%rsi)
+	movdqu	%xmm10,32(%rsi)
+	movdqu	%xmm11,48(%rsi)
+	movdqu	%xmm12,64(%rsi)
+	movdqu	%xmm13,80(%rsi)
+	leaq	96(%rsi),%rsi
 
-	pxor	%xmm0,%xmm3
+	subq	$6,%rdx
+	jnc	.Lctr32_loop6
+
+	addq	$6,%rdx
+	jz	.Lctr32_done
+
+	leal	-48(%r10),%eax
+	leaq	-80(%rcx,%r10,1),%rcx
+	negl	%eax
+	shrl	$4,%eax
+	jmp	.Lctr32_tail
+
+.align	32
+.Lctr32_loop8:
+	addl	$8,%r8d
+	movdqa	96(%rsp),%xmm8
 .byte	102,15,56,220,209
-	leaq	32(%r11),%rcx
-	pxor	%xmm0,%xmm4
+	movl	%r8d,%r9d
+	movdqa	112(%rsp),%xmm9
 .byte	102,15,56,220,217
-	movdqa	.Lincrement32(%rip),%xmm13
-	pxor	%xmm0,%xmm5
+	bswapl	%r9d
+	movups	32-128(%rcx),%xmm0
 .byte	102,15,56,220,225
-	movdqa	-40(%rsp),%xmm12
-	pxor	%xmm0,%xmm6
+	xorl	%r11d,%r9d
+	nop
 .byte	102,15,56,220,233
-	pxor	%xmm0,%xmm7
-	movups	(%rcx),%xmm0
-	decl	%eax
+	movl	%r9d,0+12(%rsp)
+	leaq	1(%r8),%r9
 .byte	102,15,56,220,241
 .byte	102,15,56,220,249
-	jmp	.Lctr32_enc_loop6_enter
-.align	16
-.Lctr32_enc_loop6:
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	48-128(%rcx),%xmm1
+	bswapl	%r9d
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	xorl	%r11d,%r9d
+.byte	0x66,0x90
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movl	%r9d,16+12(%rsp)
+	leaq	2(%r8),%r9
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	64-128(%rcx),%xmm0
+	bswapl	%r9d
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
-	decl	%eax
+	xorl	%r11d,%r9d
+.byte	0x66,0x90
 .byte	102,15,56,220,225
 .byte	102,15,56,220,233
+	movl	%r9d,32+12(%rsp)
+	leaq	3(%r8),%r9
 .byte	102,15,56,220,241
 .byte	102,15,56,220,249
-.Lctr32_enc_loop6_enter:
-	movups	16(%rcx),%xmm1
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	80-128(%rcx),%xmm1
+	bswapl	%r9d
 .byte	102,15,56,220,208
 .byte	102,15,56,220,216
-	leaq	32(%rcx),%rcx
+	xorl	%r11d,%r9d
+.byte	0x66,0x90
 .byte	102,15,56,220,224
 .byte	102,15,56,220,232
+	movl	%r9d,48+12(%rsp)
+	leaq	4(%r8),%r9
 .byte	102,15,56,220,240
 .byte	102,15,56,220,248
-	movups	(%rcx),%xmm0
-	jnz	.Lctr32_enc_loop6
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	96-128(%rcx),%xmm0
+	bswapl	%r9d
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	xorl	%r11d,%r9d
+.byte	0x66,0x90
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movl	%r9d,64+12(%rsp)
+	leaq	5(%r8),%r9
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	112-128(%rcx),%xmm1
+	bswapl	%r9d
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	xorl	%r11d,%r9d
+.byte	0x66,0x90
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movl	%r9d,80+12(%rsp)
+	leaq	6(%r8),%r9
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	128-128(%rcx),%xmm0
+	bswapl	%r9d
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	xorl	%r11d,%r9d
+.byte	0x66,0x90
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movl	%r9d,96+12(%rsp)
+	leaq	7(%r8),%r9
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	144-128(%rcx),%xmm1
+	bswapl	%r9d
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+	xorl	%r11d,%r9d
+	movdqu	0(%rdi),%xmm10
+.byte	102,15,56,220,232
+	movl	%r9d,112+12(%rsp)
+	cmpl	$11,%eax
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	160-128(%rcx),%xmm0
 
+	jb	.Lctr32_enc_done
+
 .byte	102,15,56,220,209
-	paddd	%xmm13,%xmm12
 .byte	102,15,56,220,217
-	paddd	-24(%rsp),%xmm13
 .byte	102,15,56,220,225
-	movdqa	%xmm12,-40(%rsp)
 .byte	102,15,56,220,233
-	movdqa	%xmm13,-24(%rsp)
 .byte	102,15,56,220,241
-.byte	102,69,15,56,0,231
 .byte	102,15,56,220,249
-.byte	102,69,15,56,0,239
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	176-128(%rcx),%xmm1
 
-.byte	102,15,56,221,208
-	movups	(%rdi),%xmm8
-.byte	102,15,56,221,216
-	movups	16(%rdi),%xmm9
-.byte	102,15,56,221,224
-	movups	32(%rdi),%xmm10
-.byte	102,15,56,221,232
-	movups	48(%rdi),%xmm11
-.byte	102,15,56,221,240
-	movups	64(%rdi),%xmm1
-.byte	102,15,56,221,248
-	movups	80(%rdi),%xmm0
-	leaq	96(%rdi),%rdi
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	192-128(%rcx),%xmm0
+	je	.Lctr32_enc_done
 
-	xorps	%xmm2,%xmm8
-	pshufd	$192,%xmm12,%xmm2
-	xorps	%xmm3,%xmm9
-	pshufd	$128,%xmm12,%xmm3
-	movups	%xmm8,(%rsi)
-	xorps	%xmm4,%xmm10
-	pshufd	$64,%xmm12,%xmm4
-	movups	%xmm9,16(%rsi)
-	xorps	%xmm5,%xmm11
-	movups	%xmm10,32(%rsi)
-	xorps	%xmm6,%xmm1
-	movups	%xmm11,48(%rsi)
-	xorps	%xmm7,%xmm0
-	movups	%xmm1,64(%rsi)
-	movups	%xmm0,80(%rsi)
-	leaq	96(%rsi),%rsi
-	movl	%r10d,%eax
-	subq	$6,%rdx
-	jnc	.Lctr32_loop6
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	208-128(%rcx),%xmm1
 
-	addq	$6,%rdx
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	224-128(%rcx),%xmm0
+	jmp	.Lctr32_enc_done
+
+.align	16
+.Lctr32_enc_done:
+	movdqu	16(%rdi),%xmm11
+	pxor	%xmm0,%xmm10
+	movdqu	32(%rdi),%xmm12
+	pxor	%xmm0,%xmm11
+	movdqu	48(%rdi),%xmm13
+	pxor	%xmm0,%xmm12
+	movdqu	64(%rdi),%xmm14
+	pxor	%xmm0,%xmm13
+	movdqu	80(%rdi),%xmm15
+	pxor	%xmm0,%xmm14
+	pxor	%xmm0,%xmm15
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movdqu	96(%rdi),%xmm1
+	leaq	128(%rdi),%rdi
+
+.byte	102,65,15,56,221,210
+	pxor	%xmm0,%xmm1
+	movdqu	112-128(%rdi),%xmm10
+.byte	102,65,15,56,221,219
+	pxor	%xmm0,%xmm10
+	movdqa	0(%rsp),%xmm11
+.byte	102,65,15,56,221,228
+.byte	102,65,15,56,221,237
+	movdqa	16(%rsp),%xmm12
+	movdqa	32(%rsp),%xmm13
+.byte	102,65,15,56,221,246
+.byte	102,65,15,56,221,255
+	movdqa	48(%rsp),%xmm14
+	movdqa	64(%rsp),%xmm15
+.byte	102,68,15,56,221,193
+	movdqa	80(%rsp),%xmm0
+	movups	16-128(%rcx),%xmm1
+.byte	102,69,15,56,221,202
+
+	movups	%xmm2,(%rsi)
+	movdqa	%xmm11,%xmm2
+	movups	%xmm3,16(%rsi)
+	movdqa	%xmm12,%xmm3
+	movups	%xmm4,32(%rsi)
+	movdqa	%xmm13,%xmm4
+	movups	%xmm5,48(%rsi)
+	movdqa	%xmm14,%xmm5
+	movups	%xmm6,64(%rsi)
+	movdqa	%xmm15,%xmm6
+	movups	%xmm7,80(%rsi)
+	movdqa	%xmm0,%xmm7
+	movups	%xmm8,96(%rsi)
+	movups	%xmm9,112(%rsi)
+	leaq	128(%rsi),%rsi
+
+	subq	$8,%rdx
+	jnc	.Lctr32_loop8
+
+	addq	$8,%rdx
 	jz	.Lctr32_done
-	movq	%r11,%rcx
-	leal	1(%rax,%rax,1),%eax
+	leaq	-128(%rcx),%rcx
 
 .Lctr32_tail:
-	por	%xmm14,%xmm2
-	movups	(%rdi),%xmm8
-	cmpq	$2,%rdx
-	jb	.Lctr32_one
 
-	por	%xmm14,%xmm3
-	movups	16(%rdi),%xmm9
-	je	.Lctr32_two
 
-	pshufd	$192,%xmm13,%xmm5
-	por	%xmm14,%xmm4
-	movups	32(%rdi),%xmm10
+	leaq	16(%rcx),%rcx
 	cmpq	$4,%rdx
-	jb	.Lctr32_three
+	jb	.Lctr32_loop3
+	je	.Lctr32_loop4
 
-	pshufd	$128,%xmm13,%xmm6
-	por	%xmm14,%xmm5
-	movups	48(%rdi),%xmm11
-	je	.Lctr32_four
 
-	por	%xmm14,%xmm6
-	xorps	%xmm7,%xmm7
+	shll	$4,%eax
+	movdqa	96(%rsp),%xmm8
+	pxor	%xmm9,%xmm9
 
-	call	_aesni_encrypt6
+	movups	16(%rcx),%xmm0
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	leaq	32-16(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	102,15,56,220,225
+	addq	$16,%rax
+	movups	(%rdi),%xmm10
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+	movups	16(%rdi),%xmm11
+	movups	32(%rdi),%xmm12
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
 
-	movups	64(%rdi),%xmm1
-	xorps	%xmm2,%xmm8
-	xorps	%xmm3,%xmm9
-	movups	%xmm8,(%rsi)
-	xorps	%xmm4,%xmm10
-	movups	%xmm9,16(%rsi)
-	xorps	%xmm5,%xmm11
-	movups	%xmm10,32(%rsi)
-	xorps	%xmm6,%xmm1
-	movups	%xmm11,48(%rsi)
-	movups	%xmm1,64(%rsi)
+	call	.Lenc_loop8_enter
+
+	movdqu	48(%rdi),%xmm13
+	pxor	%xmm10,%xmm2
+	movdqu	64(%rdi),%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm10,%xmm6
+	movdqu	%xmm5,48(%rsi)
+	movdqu	%xmm6,64(%rsi)
+	cmpq	$6,%rdx
+	jb	.Lctr32_done
+
+	movups	80(%rdi),%xmm11
+	xorps	%xmm11,%xmm7
+	movups	%xmm7,80(%rsi)
+	je	.Lctr32_done
+
+	movups	96(%rdi),%xmm12
+	xorps	%xmm12,%xmm8
+	movups	%xmm8,96(%rsi)
 	jmp	.Lctr32_done
 
-.align	16
-.Lctr32_one_shortcut:
-	movups	(%r8),%xmm2
-	movups	(%rdi),%xmm8
-	movl	240(%rcx),%eax
-.Lctr32_one:
-	movups	(%rcx),%xmm0
-	movups	16(%rcx),%xmm1
-	leaq	32(%rcx),%rcx
-	xorps	%xmm0,%xmm2
-.Loop_enc1_7:
+.align	32
+.Lctr32_loop4:
 .byte	102,15,56,220,209
+	leaq	16(%rcx),%rcx
 	decl	%eax
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
 	movups	(%rcx),%xmm1
-	leaq	16(%rcx),%rcx
-	jnz	.Loop_enc1_7	
+	jnz	.Lctr32_loop4
 .byte	102,15,56,221,209
-	xorps	%xmm2,%xmm8
-	movups	%xmm8,(%rsi)
-	jmp	.Lctr32_done
+.byte	102,15,56,221,217
+	movups	(%rdi),%xmm10
+	movups	16(%rdi),%xmm11
+.byte	102,15,56,221,225
+.byte	102,15,56,221,233
+	movups	32(%rdi),%xmm12
+	movups	48(%rdi),%xmm13
 
-.align	16
-.Lctr32_two:
-	xorps	%xmm4,%xmm4
-	call	_aesni_encrypt3
-	xorps	%xmm2,%xmm8
-	xorps	%xmm3,%xmm9
-	movups	%xmm8,(%rsi)
-	movups	%xmm9,16(%rsi)
+	xorps	%xmm10,%xmm2
+	movups	%xmm2,(%rsi)
+	xorps	%xmm11,%xmm3
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm5,48(%rsi)
 	jmp	.Lctr32_done
 
-.align	16
-.Lctr32_three:
-	call	_aesni_encrypt3
-	xorps	%xmm2,%xmm8
-	xorps	%xmm3,%xmm9
-	movups	%xmm8,(%rsi)
-	xorps	%xmm4,%xmm10
-	movups	%xmm9,16(%rsi)
-	movups	%xmm10,32(%rsi)
-	jmp	.Lctr32_done
+.align	32
+.Lctr32_loop3:
+.byte	102,15,56,220,209
+	leaq	16(%rcx),%rcx
+	decl	%eax
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+	movups	(%rcx),%xmm1
+	jnz	.Lctr32_loop3
+.byte	102,15,56,221,209
+.byte	102,15,56,221,217
+.byte	102,15,56,221,225
 
-.align	16
-.Lctr32_four:
-	call	_aesni_encrypt4
-	xorps	%xmm2,%xmm8
-	xorps	%xmm3,%xmm9
-	movups	%xmm8,(%rsi)
-	xorps	%xmm4,%xmm10
-	movups	%xmm9,16(%rsi)
-	xorps	%xmm5,%xmm11
-	movups	%xmm10,32(%rsi)
-	movups	%xmm11,48(%rsi)
+	movups	(%rdi),%xmm10
+	xorps	%xmm10,%xmm2
+	movups	%xmm2,(%rsi)
+	cmpq	$2,%rdx
+	jb	.Lctr32_done
 
+	movups	16(%rdi),%xmm11
+	xorps	%xmm11,%xmm3
+	movups	%xmm3,16(%rsi)
+	je	.Lctr32_done
+
+	movups	32(%rdi),%xmm12
+	xorps	%xmm12,%xmm4
+	movups	%xmm4,32(%rsi)
+
 .Lctr32_done:
+	xorps	%xmm0,%xmm0
+	xorl	%r11d,%r11d
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	movaps	%xmm0,0(%rsp)
+	pxor	%xmm8,%xmm8
+	movaps	%xmm0,16(%rsp)
+	pxor	%xmm9,%xmm9
+	movaps	%xmm0,32(%rsp)
+	pxor	%xmm10,%xmm10
+	movaps	%xmm0,48(%rsp)
+	pxor	%xmm11,%xmm11
+	movaps	%xmm0,64(%rsp)
+	pxor	%xmm12,%xmm12
+	movaps	%xmm0,80(%rsp)
+	pxor	%xmm13,%xmm13
+	movaps	%xmm0,96(%rsp)
+	pxor	%xmm14,%xmm14
+	movaps	%xmm0,112(%rsp)
+	pxor	%xmm15,%xmm15
+	leaq	(%rbp),%rsp
+	popq	%rbp
+.Lctr32_epilogue:
 	.byte	0xf3,0xc3
 .size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
 .globl	aesni_xts_encrypt
@@ -1137,243 +1571,287 @@
 .type	aesni_xts_encrypt,@function
 .align	16
 aesni_xts_encrypt:
-	leaq	-104(%rsp),%rsp
-	movups	(%r9),%xmm15
+	leaq	(%rsp),%rax
+	pushq	%rbp
+	subq	$112,%rsp
+	andq	$-16,%rsp
+	leaq	-8(%rax),%rbp
+	movups	(%r9),%xmm2
 	movl	240(%r8),%eax
 	movl	240(%rcx),%r10d
 	movups	(%r8),%xmm0
 	movups	16(%r8),%xmm1
 	leaq	32(%r8),%r8
-	xorps	%xmm0,%xmm15
+	xorps	%xmm0,%xmm2
 .Loop_enc1_8:
-.byte	102,68,15,56,220,249
+.byte	102,15,56,220,209
 	decl	%eax
 	movups	(%r8),%xmm1
 	leaq	16(%r8),%r8
-	jnz	.Loop_enc1_8	
-.byte	102,68,15,56,221,249
+	jnz	.Loop_enc1_8
+.byte	102,15,56,221,209
+	movups	(%rcx),%xmm0
 	movq	%rcx,%r11
 	movl	%r10d,%eax
+	shll	$4,%r10d
 	movq	%rdx,%r9
 	andq	$-16,%rdx
 
+	movups	16(%rcx,%r10,1),%xmm1
+
 	movdqa	.Lxts_magic(%rip),%xmm8
-	pxor	%xmm14,%xmm14
-	pcmpgtd	%xmm15,%xmm14
-	pshufd	$19,%xmm14,%xmm9
-	pxor	%xmm14,%xmm14
+	movdqa	%xmm2,%xmm15
+	pshufd	$0x5f,%xmm2,%xmm9
+	pxor	%xmm0,%xmm1
+	movdqa	%xmm9,%xmm14
+	paddd	%xmm9,%xmm9
 	movdqa	%xmm15,%xmm10
+	psrad	$31,%xmm14
 	paddq	%xmm15,%xmm15
-	pand	%xmm8,%xmm9
-	pcmpgtd	%xmm15,%xmm14
-	pxor	%xmm9,%xmm15
-	pshufd	$19,%xmm14,%xmm9
-	pxor	%xmm14,%xmm14
+	pand	%xmm8,%xmm14
+	pxor	%xmm0,%xmm10
+	pxor	%xmm14,%xmm15
+	movdqa	%xmm9,%xmm14
+	paddd	%xmm9,%xmm9
 	movdqa	%xmm15,%xmm11
+	psrad	$31,%xmm14
 	paddq	%xmm15,%xmm15
-	pand	%xmm8,%xmm9
-	pcmpgtd	%xmm15,%xmm14
-	pxor	%xmm9,%xmm15
-	pshufd	$19,%xmm14,%xmm9
-	pxor	%xmm14,%xmm14
+	pand	%xmm8,%xmm14
+	pxor	%xmm0,%xmm11
+	pxor	%xmm14,%xmm15
+	movdqa	%xmm9,%xmm14
+	paddd	%xmm9,%xmm9
 	movdqa	%xmm15,%xmm12
+	psrad	$31,%xmm14
 	paddq	%xmm15,%xmm15
-	pand	%xmm8,%xmm9
-	pcmpgtd	%xmm15,%xmm14
-	pxor	%xmm9,%xmm15
-	pshufd	$19,%xmm14,%xmm9
-	pxor	%xmm14,%xmm14
+	pand	%xmm8,%xmm14
+	pxor	%xmm0,%xmm12
+	pxor	%xmm14,%xmm15
+	movdqa	%xmm9,%xmm14
+	paddd	%xmm9,%xmm9
 	movdqa	%xmm15,%xmm13
+	psrad	$31,%xmm14
 	paddq	%xmm15,%xmm15
+	pand	%xmm8,%xmm14
+	pxor	%xmm0,%xmm13
+	pxor	%xmm14,%xmm15
+	movdqa	%xmm15,%xmm14
+	psrad	$31,%xmm9
+	paddq	%xmm15,%xmm15
 	pand	%xmm8,%xmm9
-	pcmpgtd	%xmm15,%xmm14
+	pxor	%xmm0,%xmm14
 	pxor	%xmm9,%xmm15
+	movaps	%xmm1,96(%rsp)
+
 	subq	$96,%rdx
 	jc	.Lxts_enc_short
 
-	shrl	$1,%eax
-	subl	$1,%eax
-	movl	%eax,%r10d
+	movl	$16+96,%eax
+	leaq	32(%r11,%r10,1),%rcx
+	subq	%r10,%rax
+	movups	16(%r11),%xmm1
+	movq	%rax,%r10
+	leaq	.Lxts_magic(%rip),%r8
 	jmp	.Lxts_enc_grandloop
 
-.align	16
+.align	32
 .Lxts_enc_grandloop:
-	pshufd	$19,%xmm14,%xmm9
-	movdqa	%xmm15,%xmm14
-	paddq	%xmm15,%xmm15
 	movdqu	0(%rdi),%xmm2
-	pand	%xmm8,%xmm9
+	movdqa	%xmm0,%xmm8
 	movdqu	16(%rdi),%xmm3
-	pxor	%xmm9,%xmm15
-
+	pxor	%xmm10,%xmm2
 	movdqu	32(%rdi),%xmm4
-	pxor	%xmm10,%xmm2
+	pxor	%xmm11,%xmm3
+.byte	102,15,56,220,209
 	movdqu	48(%rdi),%xmm5
-	pxor	%xmm11,%xmm3
+	pxor	%xmm12,%xmm4
+.byte	102,15,56,220,217
 	movdqu	64(%rdi),%xmm6
-	pxor	%xmm12,%xmm4
+	pxor	%xmm13,%xmm5
+.byte	102,15,56,220,225
 	movdqu	80(%rdi),%xmm7
+	pxor	%xmm15,%xmm8
+	movdqa	96(%rsp),%xmm9
+	pxor	%xmm14,%xmm6
+.byte	102,15,56,220,233
+	movups	32(%r11),%xmm0
 	leaq	96(%rdi),%rdi
-	pxor	%xmm13,%xmm5
-	movups	(%r11),%xmm0
-	pxor	%xmm14,%xmm6
-	pxor	%xmm15,%xmm7
+	pxor	%xmm8,%xmm7
 
+	pxor	%xmm9,%xmm10
+.byte	102,15,56,220,241
+	pxor	%xmm9,%xmm11
+	movdqa	%xmm10,0(%rsp)
+.byte	102,15,56,220,249
+	movups	48(%r11),%xmm1
+	pxor	%xmm9,%xmm12
 
-
-	movups	16(%r11),%xmm1
-	pxor	%xmm0,%xmm2
-	pxor	%xmm0,%xmm3
-	movdqa	%xmm10,0(%rsp)
+.byte	102,15,56,220,208
+	pxor	%xmm9,%xmm13
+	movdqa	%xmm11,16(%rsp)
+.byte	102,15,56,220,216
+	pxor	%xmm9,%xmm14
+	movdqa	%xmm12,32(%rsp)
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	pxor	%xmm9,%xmm8
+	movdqa	%xmm14,64(%rsp)
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+	movups	64(%r11),%xmm0
+	movdqa	%xmm8,80(%rsp)
+	pshufd	$0x5f,%xmm15,%xmm9
+	jmp	.Lxts_enc_loop6
+.align	32
+.Lxts_enc_loop6:
 .byte	102,15,56,220,209
-	leaq	32(%r11),%rcx
-	pxor	%xmm0,%xmm4
-	movdqa	%xmm11,16(%rsp)
 .byte	102,15,56,220,217
-	pxor	%xmm0,%xmm5
-	movdqa	%xmm12,32(%rsp)
 .byte	102,15,56,220,225
-	pxor	%xmm0,%xmm6
-	movdqa	%xmm13,48(%rsp)
 .byte	102,15,56,220,233
-	pxor	%xmm0,%xmm7
-	movups	(%rcx),%xmm0
-	decl	%eax
-	movdqa	%xmm14,64(%rsp)
 .byte	102,15,56,220,241
-	movdqa	%xmm15,80(%rsp)
 .byte	102,15,56,220,249
-	pxor	%xmm14,%xmm14
-	pcmpgtd	%xmm15,%xmm14
-	jmp	.Lxts_enc_loop6_enter
+	movups	-64(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
 
-.align	16
-.Lxts_enc_loop6:
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+	movups	-80(%rcx,%rax,1),%xmm0
+	jnz	.Lxts_enc_loop6
+
+	movdqa	(%r8),%xmm8
+	movdqa	%xmm9,%xmm14
+	paddd	%xmm9,%xmm9
 .byte	102,15,56,220,209
+	paddq	%xmm15,%xmm15
+	psrad	$31,%xmm14
 .byte	102,15,56,220,217
-	decl	%eax
+	pand	%xmm8,%xmm14
+	movups	(%r11),%xmm10
 .byte	102,15,56,220,225
 .byte	102,15,56,220,233
 .byte	102,15,56,220,241
+	pxor	%xmm14,%xmm15
+	movaps	%xmm10,%xmm11
 .byte	102,15,56,220,249
-.Lxts_enc_loop6_enter:
-	movups	16(%rcx),%xmm1
+	movups	-64(%rcx),%xmm1
+
+	movdqa	%xmm9,%xmm14
 .byte	102,15,56,220,208
+	paddd	%xmm9,%xmm9
+	pxor	%xmm15,%xmm10
 .byte	102,15,56,220,216
-	leaq	32(%rcx),%rcx
+	psrad	$31,%xmm14
+	paddq	%xmm15,%xmm15
 .byte	102,15,56,220,224
 .byte	102,15,56,220,232
+	pand	%xmm8,%xmm14
+	movaps	%xmm11,%xmm12
 .byte	102,15,56,220,240
+	pxor	%xmm14,%xmm15
+	movdqa	%xmm9,%xmm14
 .byte	102,15,56,220,248
-	movups	(%rcx),%xmm0
-	jnz	.Lxts_enc_loop6
+	movups	-48(%rcx),%xmm0
 
-	pshufd	$19,%xmm14,%xmm9
-	pxor	%xmm14,%xmm14
-	paddq	%xmm15,%xmm15
+	paddd	%xmm9,%xmm9
 .byte	102,15,56,220,209
-	pand	%xmm8,%xmm9
+	pxor	%xmm15,%xmm11
+	psrad	$31,%xmm14
 .byte	102,15,56,220,217
-	pcmpgtd	%xmm15,%xmm14
+	paddq	%xmm15,%xmm15
+	pand	%xmm8,%xmm14
 .byte	102,15,56,220,225
-	pxor	%xmm9,%xmm15
 .byte	102,15,56,220,233
+	movdqa	%xmm13,48(%rsp)
+	pxor	%xmm14,%xmm15
 .byte	102,15,56,220,241
+	movaps	%xmm12,%xmm13
+	movdqa	%xmm9,%xmm14
 .byte	102,15,56,220,249
-	movups	16(%rcx),%xmm1
+	movups	-32(%rcx),%xmm1
 
-	pshufd	$19,%xmm14,%xmm9
-	pxor	%xmm14,%xmm14
-	movdqa	%xmm15,%xmm10
-	paddq	%xmm15,%xmm15
+	paddd	%xmm9,%xmm9
 .byte	102,15,56,220,208
-	pand	%xmm8,%xmm9
+	pxor	%xmm15,%xmm12
+	psrad	$31,%xmm14
 .byte	102,15,56,220,216
-	pcmpgtd	%xmm15,%xmm14
+	paddq	%xmm15,%xmm15
+	pand	%xmm8,%xmm14
 .byte	102,15,56,220,224
-	pxor	%xmm9,%xmm15
 .byte	102,15,56,220,232
 .byte	102,15,56,220,240
+	pxor	%xmm14,%xmm15
+	movaps	%xmm13,%xmm14
 .byte	102,15,56,220,248
-	movups	32(%rcx),%xmm0
 
-	pshufd	$19,%xmm14,%xmm9
-	pxor	%xmm14,%xmm14
-	movdqa	%xmm15,%xmm11
-	paddq	%xmm15,%xmm15
+	movdqa	%xmm9,%xmm0
+	paddd	%xmm9,%xmm9
 .byte	102,15,56,220,209
-	pand	%xmm8,%xmm9
+	pxor	%xmm15,%xmm13
+	psrad	$31,%xmm0
 .byte	102,15,56,220,217
-	pcmpgtd	%xmm15,%xmm14
+	paddq	%xmm15,%xmm15
+	pand	%xmm8,%xmm0
 .byte	102,15,56,220,225
-	pxor	%xmm9,%xmm15
 .byte	102,15,56,220,233
+	pxor	%xmm0,%xmm15
+	movups	(%r11),%xmm0
 .byte	102,15,56,220,241
 .byte	102,15,56,220,249
+	movups	16(%r11),%xmm1
 
-	pshufd	$19,%xmm14,%xmm9
-	pxor	%xmm14,%xmm14
-	movdqa	%xmm15,%xmm12
+	pxor	%xmm15,%xmm14
+.byte	102,15,56,221,84,36,0
+	psrad	$31,%xmm9
 	paddq	%xmm15,%xmm15
-.byte	102,15,56,221,208
+.byte	102,15,56,221,92,36,16
+.byte	102,15,56,221,100,36,32
 	pand	%xmm8,%xmm9
-.byte	102,15,56,221,216
-	pcmpgtd	%xmm15,%xmm14
-.byte	102,15,56,221,224
+	movq	%r10,%rax
+.byte	102,15,56,221,108,36,48
+.byte	102,15,56,221,116,36,64
+.byte	102,15,56,221,124,36,80
 	pxor	%xmm9,%xmm15
-.byte	102,15,56,221,232
-.byte	102,15,56,221,240
-.byte	102,15,56,221,248
 
-	pshufd	$19,%xmm14,%xmm9
-	pxor	%xmm14,%xmm14
-	movdqa	%xmm15,%xmm13
-	paddq	%xmm15,%xmm15
-	xorps	0(%rsp),%xmm2
-	pand	%xmm8,%xmm9
-	xorps	16(%rsp),%xmm3
-	pcmpgtd	%xmm15,%xmm14
-	pxor	%xmm9,%xmm15
-
-	xorps	32(%rsp),%xmm4
-	movups	%xmm2,0(%rsi)
-	xorps	48(%rsp),%xmm5
-	movups	%xmm3,16(%rsi)
-	xorps	64(%rsp),%xmm6
-	movups	%xmm4,32(%rsi)
-	xorps	80(%rsp),%xmm7
-	movups	%xmm5,48(%rsi)
-	movl	%r10d,%eax
-	movups	%xmm6,64(%rsi)
-	movups	%xmm7,80(%rsi)
 	leaq	96(%rsi),%rsi
+	movups	%xmm2,-96(%rsi)
+	movups	%xmm3,-80(%rsi)
+	movups	%xmm4,-64(%rsi)
+	movups	%xmm5,-48(%rsi)
+	movups	%xmm6,-32(%rsi)
+	movups	%xmm7,-16(%rsi)
 	subq	$96,%rdx
 	jnc	.Lxts_enc_grandloop
 
-	leal	3(%rax,%rax,1),%eax
+	movl	$16+96,%eax
+	subl	%r10d,%eax
 	movq	%r11,%rcx
-	movl	%eax,%r10d
+	shrl	$4,%eax
 
 .Lxts_enc_short:
+
+	movl	%eax,%r10d
+	pxor	%xmm0,%xmm10
 	addq	$96,%rdx
 	jz	.Lxts_enc_done
 
-	cmpq	$32,%rdx
+	pxor	%xmm0,%xmm11
+	cmpq	$0x20,%rdx
 	jb	.Lxts_enc_one
+	pxor	%xmm0,%xmm12
 	je	.Lxts_enc_two
 
-	cmpq	$64,%rdx
+	pxor	%xmm0,%xmm13
+	cmpq	$0x40,%rdx
 	jb	.Lxts_enc_three
+	pxor	%xmm0,%xmm14
 	je	.Lxts_enc_four
 
-	pshufd	$19,%xmm14,%xmm9
-	movdqa	%xmm15,%xmm14
-	paddq	%xmm15,%xmm15
 	movdqu	(%rdi),%xmm2
-	pand	%xmm8,%xmm9
 	movdqu	16(%rdi),%xmm3
-	pxor	%xmm9,%xmm15
-
 	movdqu	32(%rdi),%xmm4
 	pxor	%xmm10,%xmm2
 	movdqu	48(%rdi),%xmm5
@@ -1383,6 +1861,7 @@
 	pxor	%xmm12,%xmm4
 	pxor	%xmm13,%xmm5
 	pxor	%xmm14,%xmm6
+	pxor	%xmm7,%xmm7
 
 	call	_aesni_encrypt6
 
@@ -1414,7 +1893,7 @@
 	decl	%eax
 	movups	(%rcx),%xmm1
 	leaq	16(%rcx),%rcx
-	jnz	.Loop_enc1_9	
+	jnz	.Loop_enc1_9
 .byte	102,15,56,221,209
 	xorps	%xmm10,%xmm2
 	movdqa	%xmm11,%xmm10
@@ -1430,7 +1909,7 @@
 	xorps	%xmm10,%xmm2
 	xorps	%xmm11,%xmm3
 
-	call	_aesni_encrypt3
+	call	_aesni_encrypt2
 
 	xorps	%xmm10,%xmm2
 	movdqa	%xmm12,%xmm10
@@ -1476,15 +1955,15 @@
 
 	call	_aesni_encrypt4
 
-	xorps	%xmm10,%xmm2
-	movdqa	%xmm15,%xmm10
-	xorps	%xmm11,%xmm3
-	xorps	%xmm12,%xmm4
-	movups	%xmm2,(%rsi)
-	xorps	%xmm13,%xmm5
-	movups	%xmm3,16(%rsi)
-	movups	%xmm4,32(%rsi)
-	movups	%xmm5,48(%rsi)
+	pxor	%xmm10,%xmm2
+	movdqa	%xmm14,%xmm10
+	pxor	%xmm11,%xmm3
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm3,16(%rsi)
+	movdqu	%xmm4,32(%rsi)
+	movdqu	%xmm5,48(%rsi)
 	leaq	64(%rsi),%rsi
 	jmp	.Lxts_enc_done
 
@@ -1519,13 +1998,37 @@
 	decl	%eax
 	movups	(%rcx),%xmm1
 	leaq	16(%rcx),%rcx
-	jnz	.Loop_enc1_10	
+	jnz	.Loop_enc1_10
 .byte	102,15,56,221,209
 	xorps	%xmm10,%xmm2
 	movups	%xmm2,-16(%rsi)
 
 .Lxts_enc_ret:
-	leaq	104(%rsp),%rsp
+	xorps	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	movaps	%xmm0,0(%rsp)
+	pxor	%xmm8,%xmm8
+	movaps	%xmm0,16(%rsp)
+	pxor	%xmm9,%xmm9
+	movaps	%xmm0,32(%rsp)
+	pxor	%xmm10,%xmm10
+	movaps	%xmm0,48(%rsp)
+	pxor	%xmm11,%xmm11
+	movaps	%xmm0,64(%rsp)
+	pxor	%xmm12,%xmm12
+	movaps	%xmm0,80(%rsp)
+	pxor	%xmm13,%xmm13
+	movaps	%xmm0,96(%rsp)
+	pxor	%xmm14,%xmm14
+	pxor	%xmm15,%xmm15
+	leaq	(%rbp),%rsp
+	popq	%rbp
 .Lxts_enc_epilogue:
 	.byte	0xf3,0xc3
 .size	aesni_xts_encrypt,.-aesni_xts_encrypt
@@ -1533,21 +2036,25 @@
 .type	aesni_xts_decrypt,@function
 .align	16
 aesni_xts_decrypt:
-	leaq	-104(%rsp),%rsp
-	movups	(%r9),%xmm15
+	leaq	(%rsp),%rax
+	pushq	%rbp
+	subq	$112,%rsp
+	andq	$-16,%rsp
+	leaq	-8(%rax),%rbp
+	movups	(%r9),%xmm2
 	movl	240(%r8),%eax
 	movl	240(%rcx),%r10d
 	movups	(%r8),%xmm0
 	movups	16(%r8),%xmm1
 	leaq	32(%r8),%r8
-	xorps	%xmm0,%xmm15
+	xorps	%xmm0,%xmm2
 .Loop_enc1_11:
-.byte	102,68,15,56,220,249
+.byte	102,15,56,220,209
 	decl	%eax
 	movups	(%r8),%xmm1
 	leaq	16(%r8),%r8
-	jnz	.Loop_enc1_11	
-.byte	102,68,15,56,221,249
+	jnz	.Loop_enc1_11
+.byte	102,15,56,221,209
 	xorl	%eax,%eax
 	testq	$15,%rdx
 	setnz	%al
@@ -1554,228 +2061,268 @@
 	shlq	$4,%rax
 	subq	%rax,%rdx
 
+	movups	(%rcx),%xmm0
 	movq	%rcx,%r11
 	movl	%r10d,%eax
+	shll	$4,%r10d
 	movq	%rdx,%r9
 	andq	$-16,%rdx
 
+	movups	16(%rcx,%r10,1),%xmm1
+
 	movdqa	.Lxts_magic(%rip),%xmm8
-	pxor	%xmm14,%xmm14
-	pcmpgtd	%xmm15,%xmm14
-	pshufd	$19,%xmm14,%xmm9
-	pxor	%xmm14,%xmm14
+	movdqa	%xmm2,%xmm15
+	pshufd	$0x5f,%xmm2,%xmm9
+	pxor	%xmm0,%xmm1
+	movdqa	%xmm9,%xmm14
+	paddd	%xmm9,%xmm9
 	movdqa	%xmm15,%xmm10
+	psrad	$31,%xmm14
 	paddq	%xmm15,%xmm15
-	pand	%xmm8,%xmm9
-	pcmpgtd	%xmm15,%xmm14
-	pxor	%xmm9,%xmm15
-	pshufd	$19,%xmm14,%xmm9
-	pxor	%xmm14,%xmm14
+	pand	%xmm8,%xmm14
+	pxor	%xmm0,%xmm10
+	pxor	%xmm14,%xmm15
+	movdqa	%xmm9,%xmm14
+	paddd	%xmm9,%xmm9
 	movdqa	%xmm15,%xmm11
+	psrad	$31,%xmm14
 	paddq	%xmm15,%xmm15
-	pand	%xmm8,%xmm9
-	pcmpgtd	%xmm15,%xmm14
-	pxor	%xmm9,%xmm15
-	pshufd	$19,%xmm14,%xmm9
-	pxor	%xmm14,%xmm14
+	pand	%xmm8,%xmm14
+	pxor	%xmm0,%xmm11
+	pxor	%xmm14,%xmm15
+	movdqa	%xmm9,%xmm14
+	paddd	%xmm9,%xmm9
 	movdqa	%xmm15,%xmm12
+	psrad	$31,%xmm14
 	paddq	%xmm15,%xmm15
-	pand	%xmm8,%xmm9
-	pcmpgtd	%xmm15,%xmm14
-	pxor	%xmm9,%xmm15
-	pshufd	$19,%xmm14,%xmm9
-	pxor	%xmm14,%xmm14
+	pand	%xmm8,%xmm14
+	pxor	%xmm0,%xmm12
+	pxor	%xmm14,%xmm15
+	movdqa	%xmm9,%xmm14
+	paddd	%xmm9,%xmm9
 	movdqa	%xmm15,%xmm13
+	psrad	$31,%xmm14
 	paddq	%xmm15,%xmm15
+	pand	%xmm8,%xmm14
+	pxor	%xmm0,%xmm13
+	pxor	%xmm14,%xmm15
+	movdqa	%xmm15,%xmm14
+	psrad	$31,%xmm9
+	paddq	%xmm15,%xmm15
 	pand	%xmm8,%xmm9
-	pcmpgtd	%xmm15,%xmm14
+	pxor	%xmm0,%xmm14
 	pxor	%xmm9,%xmm15
+	movaps	%xmm1,96(%rsp)
+
 	subq	$96,%rdx
 	jc	.Lxts_dec_short
 
-	shrl	$1,%eax
-	subl	$1,%eax
-	movl	%eax,%r10d
+	movl	$16+96,%eax
+	leaq	32(%r11,%r10,1),%rcx
+	subq	%r10,%rax
+	movups	16(%r11),%xmm1
+	movq	%rax,%r10
+	leaq	.Lxts_magic(%rip),%r8
 	jmp	.Lxts_dec_grandloop
 
-.align	16
+.align	32
 .Lxts_dec_grandloop:
-	pshufd	$19,%xmm14,%xmm9
-	movdqa	%xmm15,%xmm14
-	paddq	%xmm15,%xmm15
 	movdqu	0(%rdi),%xmm2
-	pand	%xmm8,%xmm9
+	movdqa	%xmm0,%xmm8
 	movdqu	16(%rdi),%xmm3
-	pxor	%xmm9,%xmm15
-
+	pxor	%xmm10,%xmm2
 	movdqu	32(%rdi),%xmm4
-	pxor	%xmm10,%xmm2
+	pxor	%xmm11,%xmm3
+.byte	102,15,56,222,209
 	movdqu	48(%rdi),%xmm5
-	pxor	%xmm11,%xmm3
+	pxor	%xmm12,%xmm4
+.byte	102,15,56,222,217
 	movdqu	64(%rdi),%xmm6
-	pxor	%xmm12,%xmm4
+	pxor	%xmm13,%xmm5
+.byte	102,15,56,222,225
 	movdqu	80(%rdi),%xmm7
+	pxor	%xmm15,%xmm8
+	movdqa	96(%rsp),%xmm9
+	pxor	%xmm14,%xmm6
+.byte	102,15,56,222,233
+	movups	32(%r11),%xmm0
 	leaq	96(%rdi),%rdi
-	pxor	%xmm13,%xmm5
-	movups	(%r11),%xmm0
-	pxor	%xmm14,%xmm6
-	pxor	%xmm15,%xmm7
+	pxor	%xmm8,%xmm7
 
+	pxor	%xmm9,%xmm10
+.byte	102,15,56,222,241
+	pxor	%xmm9,%xmm11
+	movdqa	%xmm10,0(%rsp)
+.byte	102,15,56,222,249
+	movups	48(%r11),%xmm1
+	pxor	%xmm9,%xmm12
 
-
-	movups	16(%r11),%xmm1
-	pxor	%xmm0,%xmm2
-	pxor	%xmm0,%xmm3
-	movdqa	%xmm10,0(%rsp)
+.byte	102,15,56,222,208
+	pxor	%xmm9,%xmm13
+	movdqa	%xmm11,16(%rsp)
+.byte	102,15,56,222,216
+	pxor	%xmm9,%xmm14
+	movdqa	%xmm12,32(%rsp)
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+	pxor	%xmm9,%xmm8
+	movdqa	%xmm14,64(%rsp)
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+	movups	64(%r11),%xmm0
+	movdqa	%xmm8,80(%rsp)
+	pshufd	$0x5f,%xmm15,%xmm9
+	jmp	.Lxts_dec_loop6
+.align	32
+.Lxts_dec_loop6:
 .byte	102,15,56,222,209
-	leaq	32(%r11),%rcx
-	pxor	%xmm0,%xmm4
-	movdqa	%xmm11,16(%rsp)
 .byte	102,15,56,222,217
-	pxor	%xmm0,%xmm5
-	movdqa	%xmm12,32(%rsp)
 .byte	102,15,56,222,225
-	pxor	%xmm0,%xmm6
-	movdqa	%xmm13,48(%rsp)
 .byte	102,15,56,222,233
-	pxor	%xmm0,%xmm7
-	movups	(%rcx),%xmm0
-	decl	%eax
-	movdqa	%xmm14,64(%rsp)
 .byte	102,15,56,222,241
-	movdqa	%xmm15,80(%rsp)
 .byte	102,15,56,222,249
-	pxor	%xmm14,%xmm14
-	pcmpgtd	%xmm15,%xmm14
-	jmp	.Lxts_dec_loop6_enter
+	movups	-64(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
 
-.align	16
-.Lxts_dec_loop6:
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+	movups	-80(%rcx,%rax,1),%xmm0
+	jnz	.Lxts_dec_loop6
+
+	movdqa	(%r8),%xmm8
+	movdqa	%xmm9,%xmm14
+	paddd	%xmm9,%xmm9
 .byte	102,15,56,222,209
+	paddq	%xmm15,%xmm15
+	psrad	$31,%xmm14
 .byte	102,15,56,222,217
-	decl	%eax
+	pand	%xmm8,%xmm14
+	movups	(%r11),%xmm10
 .byte	102,15,56,222,225
 .byte	102,15,56,222,233
 .byte	102,15,56,222,241
+	pxor	%xmm14,%xmm15
+	movaps	%xmm10,%xmm11
 .byte	102,15,56,222,249
-.Lxts_dec_loop6_enter:
-	movups	16(%rcx),%xmm1
+	movups	-64(%rcx),%xmm1
+
+	movdqa	%xmm9,%xmm14
 .byte	102,15,56,222,208
+	paddd	%xmm9,%xmm9
+	pxor	%xmm15,%xmm10
 .byte	102,15,56,222,216
-	leaq	32(%rcx),%rcx
+	psrad	$31,%xmm14
+	paddq	%xmm15,%xmm15
 .byte	102,15,56,222,224
 .byte	102,15,56,222,232
+	pand	%xmm8,%xmm14
+	movaps	%xmm11,%xmm12
 .byte	102,15,56,222,240
+	pxor	%xmm14,%xmm15
+	movdqa	%xmm9,%xmm14
 .byte	102,15,56,222,248
-	movups	(%rcx),%xmm0
-	jnz	.Lxts_dec_loop6
+	movups	-48(%rcx),%xmm0
 
-	pshufd	$19,%xmm14,%xmm9
-	pxor	%xmm14,%xmm14
-	paddq	%xmm15,%xmm15
+	paddd	%xmm9,%xmm9
 .byte	102,15,56,222,209
-	pand	%xmm8,%xmm9
+	pxor	%xmm15,%xmm11
+	psrad	$31,%xmm14
 .byte	102,15,56,222,217
-	pcmpgtd	%xmm15,%xmm14
+	paddq	%xmm15,%xmm15
+	pand	%xmm8,%xmm14
 .byte	102,15,56,222,225
-	pxor	%xmm9,%xmm15
 .byte	102,15,56,222,233
+	movdqa	%xmm13,48(%rsp)
+	pxor	%xmm14,%xmm15
 .byte	102,15,56,222,241
+	movaps	%xmm12,%xmm13
+	movdqa	%xmm9,%xmm14
 .byte	102,15,56,222,249
-	movups	16(%rcx),%xmm1
+	movups	-32(%rcx),%xmm1
 
-	pshufd	$19,%xmm14,%xmm9
-	pxor	%xmm14,%xmm14
-	movdqa	%xmm15,%xmm10
-	paddq	%xmm15,%xmm15
+	paddd	%xmm9,%xmm9
 .byte	102,15,56,222,208
-	pand	%xmm8,%xmm9
+	pxor	%xmm15,%xmm12
+	psrad	$31,%xmm14
 .byte	102,15,56,222,216
-	pcmpgtd	%xmm15,%xmm14
+	paddq	%xmm15,%xmm15
+	pand	%xmm8,%xmm14
 .byte	102,15,56,222,224
-	pxor	%xmm9,%xmm15
 .byte	102,15,56,222,232
 .byte	102,15,56,222,240
+	pxor	%xmm14,%xmm15
+	movaps	%xmm13,%xmm14
 .byte	102,15,56,222,248
-	movups	32(%rcx),%xmm0
 
-	pshufd	$19,%xmm14,%xmm9
-	pxor	%xmm14,%xmm14
-	movdqa	%xmm15,%xmm11
-	paddq	%xmm15,%xmm15
+	movdqa	%xmm9,%xmm0
+	paddd	%xmm9,%xmm9
 .byte	102,15,56,222,209
-	pand	%xmm8,%xmm9
+	pxor	%xmm15,%xmm13
+	psrad	$31,%xmm0
 .byte	102,15,56,222,217
-	pcmpgtd	%xmm15,%xmm14
+	paddq	%xmm15,%xmm15
+	pand	%xmm8,%xmm0
 .byte	102,15,56,222,225
-	pxor	%xmm9,%xmm15
 .byte	102,15,56,222,233
+	pxor	%xmm0,%xmm15
+	movups	(%r11),%xmm0
 .byte	102,15,56,222,241
 .byte	102,15,56,222,249
+	movups	16(%r11),%xmm1
 
-	pshufd	$19,%xmm14,%xmm9
-	pxor	%xmm14,%xmm14
-	movdqa	%xmm15,%xmm12
+	pxor	%xmm15,%xmm14
+.byte	102,15,56,223,84,36,0
+	psrad	$31,%xmm9
 	paddq	%xmm15,%xmm15
-.byte	102,15,56,223,208
+.byte	102,15,56,223,92,36,16
+.byte	102,15,56,223,100,36,32
 	pand	%xmm8,%xmm9
-.byte	102,15,56,223,216
-	pcmpgtd	%xmm15,%xmm14
-.byte	102,15,56,223,224
+	movq	%r10,%rax
+.byte	102,15,56,223,108,36,48
+.byte	102,15,56,223,116,36,64
+.byte	102,15,56,223,124,36,80
 	pxor	%xmm9,%xmm15
-.byte	102,15,56,223,232
-.byte	102,15,56,223,240
-.byte	102,15,56,223,248
 
-	pshufd	$19,%xmm14,%xmm9
-	pxor	%xmm14,%xmm14
-	movdqa	%xmm15,%xmm13
-	paddq	%xmm15,%xmm15
-	xorps	0(%rsp),%xmm2
-	pand	%xmm8,%xmm9
-	xorps	16(%rsp),%xmm3
-	pcmpgtd	%xmm15,%xmm14
-	pxor	%xmm9,%xmm15
-
-	xorps	32(%rsp),%xmm4
-	movups	%xmm2,0(%rsi)
-	xorps	48(%rsp),%xmm5
-	movups	%xmm3,16(%rsi)
-	xorps	64(%rsp),%xmm6
-	movups	%xmm4,32(%rsi)
-	xorps	80(%rsp),%xmm7
-	movups	%xmm5,48(%rsi)
-	movl	%r10d,%eax
-	movups	%xmm6,64(%rsi)
-	movups	%xmm7,80(%rsi)
 	leaq	96(%rsi),%rsi
+	movups	%xmm2,-96(%rsi)
+	movups	%xmm3,-80(%rsi)
+	movups	%xmm4,-64(%rsi)
+	movups	%xmm5,-48(%rsi)
+	movups	%xmm6,-32(%rsi)
+	movups	%xmm7,-16(%rsi)
 	subq	$96,%rdx
 	jnc	.Lxts_dec_grandloop
 
-	leal	3(%rax,%rax,1),%eax
+	movl	$16+96,%eax
+	subl	%r10d,%eax
 	movq	%r11,%rcx
-	movl	%eax,%r10d
+	shrl	$4,%eax
 
 .Lxts_dec_short:
+
+	movl	%eax,%r10d
+	pxor	%xmm0,%xmm10
+	pxor	%xmm0,%xmm11
 	addq	$96,%rdx
 	jz	.Lxts_dec_done
 
-	cmpq	$32,%rdx
+	pxor	%xmm0,%xmm12
+	cmpq	$0x20,%rdx
 	jb	.Lxts_dec_one
+	pxor	%xmm0,%xmm13
 	je	.Lxts_dec_two
 
-	cmpq	$64,%rdx
+	pxor	%xmm0,%xmm14
+	cmpq	$0x40,%rdx
 	jb	.Lxts_dec_three
 	je	.Lxts_dec_four
 
-	pshufd	$19,%xmm14,%xmm9
-	movdqa	%xmm15,%xmm14
-	paddq	%xmm15,%xmm15
 	movdqu	(%rdi),%xmm2
-	pand	%xmm8,%xmm9
 	movdqu	16(%rdi),%xmm3
-	pxor	%xmm9,%xmm15
-
 	movdqu	32(%rdi),%xmm4
 	pxor	%xmm10,%xmm2
 	movdqu	48(%rdi),%xmm5
@@ -1801,7 +2348,7 @@
 	pcmpgtd	%xmm15,%xmm14
 	movdqu	%xmm6,64(%rsi)
 	leaq	80(%rsi),%rsi
-	pshufd	$19,%xmm14,%xmm11
+	pshufd	$0x13,%xmm14,%xmm11
 	andq	$15,%r9
 	jz	.Lxts_dec_ret
 
@@ -1825,7 +2372,7 @@
 	decl	%eax
 	movups	(%rcx),%xmm1
 	leaq	16(%rcx),%rcx
-	jnz	.Loop_dec1_12	
+	jnz	.Loop_dec1_12
 .byte	102,15,56,223,209
 	xorps	%xmm10,%xmm2
 	movdqa	%xmm11,%xmm10
@@ -1842,7 +2389,7 @@
 	xorps	%xmm10,%xmm2
 	xorps	%xmm11,%xmm3
 
-	call	_aesni_decrypt3
+	call	_aesni_decrypt2
 
 	xorps	%xmm10,%xmm2
 	movdqa	%xmm12,%xmm10
@@ -1868,7 +2415,7 @@
 	xorps	%xmm10,%xmm2
 	movdqa	%xmm13,%xmm10
 	xorps	%xmm11,%xmm3
-	movdqa	%xmm15,%xmm11
+	movdqa	%xmm14,%xmm11
 	xorps	%xmm12,%xmm4
 	movups	%xmm2,(%rsi)
 	movups	%xmm3,16(%rsi)
@@ -1878,14 +2425,8 @@
 
 .align	16
 .Lxts_dec_four:
-	pshufd	$19,%xmm14,%xmm9
-	movdqa	%xmm15,%xmm14
-	paddq	%xmm15,%xmm15
 	movups	(%rdi),%xmm2
-	pand	%xmm8,%xmm9
 	movups	16(%rdi),%xmm3
-	pxor	%xmm9,%xmm15
-
 	movups	32(%rdi),%xmm4
 	xorps	%xmm10,%xmm2
 	movups	48(%rdi),%xmm5
@@ -1896,16 +2437,16 @@
 
 	call	_aesni_decrypt4
 
-	xorps	%xmm10,%xmm2
+	pxor	%xmm10,%xmm2
 	movdqa	%xmm14,%xmm10
-	xorps	%xmm11,%xmm3
+	pxor	%xmm11,%xmm3
 	movdqa	%xmm15,%xmm11
-	xorps	%xmm12,%xmm4
-	movups	%xmm2,(%rsi)
-	xorps	%xmm13,%xmm5
-	movups	%xmm3,16(%rsi)
-	movups	%xmm4,32(%rsi)
-	movups	%xmm5,48(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm3,16(%rsi)
+	movdqu	%xmm4,32(%rsi)
+	movdqu	%xmm5,48(%rsi)
 	leaq	64(%rsi),%rsi
 	jmp	.Lxts_dec_done
 
@@ -1929,7 +2470,7 @@
 	decl	%eax
 	movups	(%rcx),%xmm1
 	leaq	16(%rcx),%rcx
-	jnz	.Loop_dec1_13	
+	jnz	.Loop_dec1_13
 .byte	102,15,56,223,209
 	xorps	%xmm11,%xmm2
 	movups	%xmm2,(%rsi)
@@ -1959,13 +2500,37 @@
 	decl	%eax
 	movups	(%rcx),%xmm1
 	leaq	16(%rcx),%rcx
-	jnz	.Loop_dec1_14	
+	jnz	.Loop_dec1_14
 .byte	102,15,56,223,209
 	xorps	%xmm10,%xmm2
 	movups	%xmm2,(%rsi)
 
 .Lxts_dec_ret:
-	leaq	104(%rsp),%rsp
+	xorps	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	movaps	%xmm0,0(%rsp)
+	pxor	%xmm8,%xmm8
+	movaps	%xmm0,16(%rsp)
+	pxor	%xmm9,%xmm9
+	movaps	%xmm0,32(%rsp)
+	pxor	%xmm10,%xmm10
+	movaps	%xmm0,48(%rsp)
+	pxor	%xmm11,%xmm11
+	movaps	%xmm0,64(%rsp)
+	pxor	%xmm12,%xmm12
+	movaps	%xmm0,80(%rsp)
+	pxor	%xmm13,%xmm13
+	movaps	%xmm0,96(%rsp)
+	pxor	%xmm14,%xmm14
+	pxor	%xmm15,%xmm15
+	leaq	(%rbp),%rsp
+	popq	%rbp
 .Lxts_dec_epilogue:
 	.byte	0xf3,0xc3
 .size	aesni_xts_decrypt,.-aesni_xts_decrypt
@@ -2002,7 +2567,7 @@
 	decl	%eax
 	movups	(%rcx),%xmm1
 	leaq	16(%rcx),%rcx
-	jnz	.Loop_enc1_15	
+	jnz	.Loop_enc1_15
 .byte	102,15,56,221,209
 	movl	%r10d,%eax
 	movq	%r11,%rcx
@@ -2012,285 +2577,545 @@
 	jnc	.Lcbc_enc_loop
 	addq	$16,%rdx
 	jnz	.Lcbc_enc_tail
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
 	movups	%xmm2,(%r8)
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
 	jmp	.Lcbc_ret
 
 .Lcbc_enc_tail:
 	movq	%rdx,%rcx
 	xchgq	%rdi,%rsi
-.long	0x9066A4F3	
+.long	0x9066A4F3
 	movl	$16,%ecx
 	subq	%rdx,%rcx
 	xorl	%eax,%eax
-.long	0x9066AAF3	
+.long	0x9066AAF3
 	leaq	-16(%rdi),%rdi
 	movl	%r10d,%eax
 	movq	%rdi,%rsi
 	movq	%r11,%rcx
 	xorq	%rdx,%rdx
-	jmp	.Lcbc_enc_loop	
+	jmp	.Lcbc_enc_loop
 
 .align	16
 .Lcbc_decrypt:
-	movups	(%r8),%xmm9
+	cmpq	$16,%rdx
+	jne	.Lcbc_decrypt_bulk
+
+
+
+	movdqu	(%rdi),%xmm2
+	movdqu	(%r8),%xmm3
+	movdqa	%xmm2,%xmm4
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+.Loop_dec1_16:
+.byte	102,15,56,222,209
+	decl	%r10d
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	.Loop_dec1_16
+.byte	102,15,56,223,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	movdqu	%xmm4,(%r8)
+	xorps	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	jmp	.Lcbc_ret
+.align	16
+.Lcbc_decrypt_bulk:
+	leaq	(%rsp),%rax
+	pushq	%rbp
+	subq	$16,%rsp
+	andq	$-16,%rsp
+	leaq	-8(%rax),%rbp
+	movups	(%r8),%xmm10
 	movl	%r10d,%eax
-	cmpq	$112,%rdx
+	cmpq	$0x50,%rdx
 	jbe	.Lcbc_dec_tail
-	shrl	$1,%r10d
-	subq	$112,%rdx
-	movl	%r10d,%eax
-	movaps	%xmm9,-24(%rsp)
+
+	movups	(%rcx),%xmm0
+	movdqu	0(%rdi),%xmm2
+	movdqu	16(%rdi),%xmm3
+	movdqa	%xmm2,%xmm11
+	movdqu	32(%rdi),%xmm4
+	movdqa	%xmm3,%xmm12
+	movdqu	48(%rdi),%xmm5
+	movdqa	%xmm4,%xmm13
+	movdqu	64(%rdi),%xmm6
+	movdqa	%xmm5,%xmm14
+	movdqu	80(%rdi),%xmm7
+	movdqa	%xmm6,%xmm15
+	movl	OPENSSL_ia32cap_P+4(%rip),%r9d
+	cmpq	$0x70,%rdx
+	jbe	.Lcbc_dec_six_or_seven
+
+	andl	$71303168,%r9d
+	subq	$0x50,%rdx
+	cmpl	$4194304,%r9d
+	je	.Lcbc_dec_loop6_enter
+	subq	$0x20,%rdx
+	leaq	112(%rcx),%rcx
 	jmp	.Lcbc_dec_loop8_enter
 .align	16
 .Lcbc_dec_loop8:
-	movaps	%xmm0,-24(%rsp)
 	movups	%xmm9,(%rsi)
 	leaq	16(%rsi),%rsi
 .Lcbc_dec_loop8_enter:
-	movups	(%rcx),%xmm0
-	movups	(%rdi),%xmm2
-	movups	16(%rdi),%xmm3
-	movups	16(%rcx),%xmm1
+	movdqu	96(%rdi),%xmm8
+	pxor	%xmm0,%xmm2
+	movdqu	112(%rdi),%xmm9
+	pxor	%xmm0,%xmm3
+	movups	16-112(%rcx),%xmm1
+	pxor	%xmm0,%xmm4
+	xorq	%r11,%r11
+	cmpq	$0x70,%rdx
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
+	pxor	%xmm0,%xmm7
+	pxor	%xmm0,%xmm8
 
-	leaq	32(%rcx),%rcx
-	movdqu	32(%rdi),%xmm4
-	xorps	%xmm0,%xmm2
-	movdqu	48(%rdi),%xmm5
-	xorps	%xmm0,%xmm3
-	movdqu	64(%rdi),%xmm6
 .byte	102,15,56,222,209
-	pxor	%xmm0,%xmm4
-	movdqu	80(%rdi),%xmm7
+	pxor	%xmm0,%xmm9
+	movups	32-112(%rcx),%xmm0
 .byte	102,15,56,222,217
-	pxor	%xmm0,%xmm5
-	movdqu	96(%rdi),%xmm8
 .byte	102,15,56,222,225
-	pxor	%xmm0,%xmm6
-	movdqu	112(%rdi),%xmm9
 .byte	102,15,56,222,233
-	pxor	%xmm0,%xmm7
-	decl	%eax
 .byte	102,15,56,222,241
-	pxor	%xmm0,%xmm8
 .byte	102,15,56,222,249
-	pxor	%xmm0,%xmm9
-	movups	(%rcx),%xmm0
 .byte	102,68,15,56,222,193
+	setnc	%r11b
+	shlq	$7,%r11
 .byte	102,68,15,56,222,201
-	movups	16(%rcx),%xmm1
+	addq	%rdi,%r11
+	movups	48-112(%rcx),%xmm1
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	64-112(%rcx),%xmm0
+	nop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	80-112(%rcx),%xmm1
+	nop
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	96-112(%rcx),%xmm0
+	nop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	112-112(%rcx),%xmm1
+	nop
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	128-112(%rcx),%xmm0
+	nop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	144-112(%rcx),%xmm1
+	cmpl	$11,%eax
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	160-112(%rcx),%xmm0
+	jb	.Lcbc_dec_done
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	176-112(%rcx),%xmm1
+	nop
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	192-112(%rcx),%xmm0
+	je	.Lcbc_dec_done
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	208-112(%rcx),%xmm1
+	nop
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	224-112(%rcx),%xmm0
+	jmp	.Lcbc_dec_done
+.align	16
+.Lcbc_dec_done:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+	pxor	%xmm0,%xmm10
+	pxor	%xmm0,%xmm11
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+	pxor	%xmm0,%xmm12
+	pxor	%xmm0,%xmm13
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+	pxor	%xmm0,%xmm14
+	pxor	%xmm0,%xmm15
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movdqu	80(%rdi),%xmm1
 
-	call	.Ldec_loop8_enter
+.byte	102,65,15,56,223,210
+	movdqu	96(%rdi),%xmm10
+	pxor	%xmm0,%xmm1
+.byte	102,65,15,56,223,219
+	pxor	%xmm0,%xmm10
+	movdqu	112(%rdi),%xmm0
+.byte	102,65,15,56,223,228
+	leaq	128(%rdi),%rdi
+	movdqu	0(%r11),%xmm11
+.byte	102,65,15,56,223,237
+.byte	102,65,15,56,223,246
+	movdqu	16(%r11),%xmm12
+	movdqu	32(%r11),%xmm13
+.byte	102,65,15,56,223,255
+.byte	102,68,15,56,223,193
+	movdqu	48(%r11),%xmm14
+	movdqu	64(%r11),%xmm15
+.byte	102,69,15,56,223,202
+	movdqa	%xmm0,%xmm10
+	movdqu	80(%r11),%xmm1
+	movups	-112(%rcx),%xmm0
 
-	movups	(%rdi),%xmm1
-	movups	16(%rdi),%xmm0
-	xorps	-24(%rsp),%xmm2
-	xorps	%xmm1,%xmm3
-	movups	32(%rdi),%xmm1
-	xorps	%xmm0,%xmm4
-	movups	48(%rdi),%xmm0
-	xorps	%xmm1,%xmm5
-	movups	64(%rdi),%xmm1
-	xorps	%xmm0,%xmm6
-	movups	80(%rdi),%xmm0
-	xorps	%xmm1,%xmm7
-	movups	96(%rdi),%xmm1
-	xorps	%xmm0,%xmm8
-	movups	112(%rdi),%xmm0
-	xorps	%xmm1,%xmm9
 	movups	%xmm2,(%rsi)
+	movdqa	%xmm11,%xmm2
 	movups	%xmm3,16(%rsi)
+	movdqa	%xmm12,%xmm3
 	movups	%xmm4,32(%rsi)
+	movdqa	%xmm13,%xmm4
 	movups	%xmm5,48(%rsi)
-	movl	%r10d,%eax
+	movdqa	%xmm14,%xmm5
 	movups	%xmm6,64(%rsi)
-	movq	%r11,%rcx
+	movdqa	%xmm15,%xmm6
 	movups	%xmm7,80(%rsi)
-	leaq	128(%rdi),%rdi
+	movdqa	%xmm1,%xmm7
 	movups	%xmm8,96(%rsi)
 	leaq	112(%rsi),%rsi
-	subq	$128,%rdx
+
+	subq	$0x80,%rdx
 	ja	.Lcbc_dec_loop8
 
 	movaps	%xmm9,%xmm2
-	movaps	%xmm0,%xmm9
-	addq	$112,%rdx
-	jle	.Lcbc_dec_tail_collected
-	movups	%xmm2,(%rsi)
-	leal	1(%r10,%r10,1),%eax
+	leaq	-112(%rcx),%rcx
+	addq	$0x70,%rdx
+	jle	.Lcbc_dec_clear_tail_collected
+	movups	%xmm9,(%rsi)
 	leaq	16(%rsi),%rsi
+	cmpq	$0x50,%rdx
+	jbe	.Lcbc_dec_tail
+
+	movaps	%xmm11,%xmm2
+.Lcbc_dec_six_or_seven:
+	cmpq	$0x60,%rdx
+	ja	.Lcbc_dec_seven
+
+	movaps	%xmm7,%xmm8
+	call	_aesni_decrypt6
+	pxor	%xmm10,%xmm2
+	movaps	%xmm8,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	pxor	%xmm14,%xmm6
+	movdqu	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	pxor	%xmm15,%xmm7
+	movdqu	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm6
+	leaq	80(%rsi),%rsi
+	movdqa	%xmm7,%xmm2
+	pxor	%xmm7,%xmm7
+	jmp	.Lcbc_dec_tail_collected
+
+.align	16
+.Lcbc_dec_seven:
+	movups	96(%rdi),%xmm8
+	xorps	%xmm9,%xmm9
+	call	_aesni_decrypt8
+	movups	80(%rdi),%xmm9
+	pxor	%xmm10,%xmm2
+	movups	96(%rdi),%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	pxor	%xmm14,%xmm6
+	movdqu	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	pxor	%xmm15,%xmm7
+	movdqu	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm6
+	pxor	%xmm9,%xmm8
+	movdqu	%xmm7,80(%rsi)
+	pxor	%xmm7,%xmm7
+	leaq	96(%rsi),%rsi
+	movdqa	%xmm8,%xmm2
+	pxor	%xmm8,%xmm8
+	pxor	%xmm9,%xmm9
+	jmp	.Lcbc_dec_tail_collected
+
+.align	16
+.Lcbc_dec_loop6:
+	movups	%xmm7,(%rsi)
+	leaq	16(%rsi),%rsi
+	movdqu	0(%rdi),%xmm2
+	movdqu	16(%rdi),%xmm3
+	movdqa	%xmm2,%xmm11
+	movdqu	32(%rdi),%xmm4
+	movdqa	%xmm3,%xmm12
+	movdqu	48(%rdi),%xmm5
+	movdqa	%xmm4,%xmm13
+	movdqu	64(%rdi),%xmm6
+	movdqa	%xmm5,%xmm14
+	movdqu	80(%rdi),%xmm7
+	movdqa	%xmm6,%xmm15
+.Lcbc_dec_loop6_enter:
+	leaq	96(%rdi),%rdi
+	movdqa	%xmm7,%xmm8
+
+	call	_aesni_decrypt6
+
+	pxor	%xmm10,%xmm2
+	movdqa	%xmm8,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm14,%xmm6
+	movq	%r11,%rcx
+	movdqu	%xmm5,48(%rsi)
+	pxor	%xmm15,%xmm7
+	movl	%r10d,%eax
+	movdqu	%xmm6,64(%rsi)
+	leaq	80(%rsi),%rsi
+	subq	$0x60,%rdx
+	ja	.Lcbc_dec_loop6
+
+	movdqa	%xmm7,%xmm2
+	addq	$0x50,%rdx
+	jle	.Lcbc_dec_clear_tail_collected
+	movups	%xmm7,(%rsi)
+	leaq	16(%rsi),%rsi
+
 .Lcbc_dec_tail:
 	movups	(%rdi),%xmm2
-	movaps	%xmm2,%xmm8
-	cmpq	$16,%rdx
+	subq	$0x10,%rdx
 	jbe	.Lcbc_dec_one
 
 	movups	16(%rdi),%xmm3
-	movaps	%xmm3,%xmm7
-	cmpq	$32,%rdx
+	movaps	%xmm2,%xmm11
+	subq	$0x10,%rdx
 	jbe	.Lcbc_dec_two
 
 	movups	32(%rdi),%xmm4
-	movaps	%xmm4,%xmm6
-	cmpq	$48,%rdx
+	movaps	%xmm3,%xmm12
+	subq	$0x10,%rdx
 	jbe	.Lcbc_dec_three
 
 	movups	48(%rdi),%xmm5
-	cmpq	$64,%rdx
+	movaps	%xmm4,%xmm13
+	subq	$0x10,%rdx
 	jbe	.Lcbc_dec_four
 
 	movups	64(%rdi),%xmm6
-	cmpq	$80,%rdx
-	jbe	.Lcbc_dec_five
+	movaps	%xmm5,%xmm14
+	movaps	%xmm6,%xmm15
+	xorps	%xmm7,%xmm7
+	call	_aesni_decrypt6
+	pxor	%xmm10,%xmm2
+	movaps	%xmm15,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	pxor	%xmm14,%xmm6
+	movdqu	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	leaq	64(%rsi),%rsi
+	movdqa	%xmm6,%xmm2
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	subq	$0x10,%rdx
+	jmp	.Lcbc_dec_tail_collected
 
-	movups	80(%rdi),%xmm7
-	cmpq	$96,%rdx
-	jbe	.Lcbc_dec_six
-
-	movups	96(%rdi),%xmm8
-	movaps	%xmm9,-24(%rsp)
-	call	_aesni_decrypt8
-	movups	(%rdi),%xmm1
-	movups	16(%rdi),%xmm0
-	xorps	-24(%rsp),%xmm2
-	xorps	%xmm1,%xmm3
-	movups	32(%rdi),%xmm1
-	xorps	%xmm0,%xmm4
-	movups	48(%rdi),%xmm0
-	xorps	%xmm1,%xmm5
-	movups	64(%rdi),%xmm1
-	xorps	%xmm0,%xmm6
-	movups	80(%rdi),%xmm0
-	xorps	%xmm1,%xmm7
-	movups	96(%rdi),%xmm9
-	xorps	%xmm0,%xmm8
-	movups	%xmm2,(%rsi)
-	movups	%xmm3,16(%rsi)
-	movups	%xmm4,32(%rsi)
-	movups	%xmm5,48(%rsi)
-	movups	%xmm6,64(%rsi)
-	movups	%xmm7,80(%rsi)
-	leaq	96(%rsi),%rsi
-	movaps	%xmm8,%xmm2
-	subq	$112,%rdx
-	jmp	.Lcbc_dec_tail_collected
 .align	16
 .Lcbc_dec_one:
+	movaps	%xmm2,%xmm11
 	movups	(%rcx),%xmm0
 	movups	16(%rcx),%xmm1
 	leaq	32(%rcx),%rcx
 	xorps	%xmm0,%xmm2
-.Loop_dec1_16:
+.Loop_dec1_17:
 .byte	102,15,56,222,209
 	decl	%eax
 	movups	(%rcx),%xmm1
 	leaq	16(%rcx),%rcx
-	jnz	.Loop_dec1_16	
+	jnz	.Loop_dec1_17
 .byte	102,15,56,223,209
-	xorps	%xmm9,%xmm2
-	movaps	%xmm8,%xmm9
-	subq	$16,%rdx
+	xorps	%xmm10,%xmm2
+	movaps	%xmm11,%xmm10
 	jmp	.Lcbc_dec_tail_collected
 .align	16
 .Lcbc_dec_two:
-	xorps	%xmm4,%xmm4
-	call	_aesni_decrypt3
-	xorps	%xmm9,%xmm2
-	xorps	%xmm8,%xmm3
-	movups	%xmm2,(%rsi)
-	movaps	%xmm7,%xmm9
-	movaps	%xmm3,%xmm2
+	movaps	%xmm3,%xmm12
+	call	_aesni_decrypt2
+	pxor	%xmm10,%xmm2
+	movaps	%xmm12,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	movdqa	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
 	leaq	16(%rsi),%rsi
-	subq	$32,%rdx
 	jmp	.Lcbc_dec_tail_collected
 .align	16
 .Lcbc_dec_three:
+	movaps	%xmm4,%xmm13
 	call	_aesni_decrypt3
-	xorps	%xmm9,%xmm2
-	xorps	%xmm8,%xmm3
-	movups	%xmm2,(%rsi)
-	xorps	%xmm7,%xmm4
-	movups	%xmm3,16(%rsi)
-	movaps	%xmm6,%xmm9
-	movaps	%xmm4,%xmm2
+	pxor	%xmm10,%xmm2
+	movaps	%xmm13,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	movdqa	%xmm4,%xmm2
+	pxor	%xmm4,%xmm4
 	leaq	32(%rsi),%rsi
-	subq	$48,%rdx
 	jmp	.Lcbc_dec_tail_collected
 .align	16
 .Lcbc_dec_four:
+	movaps	%xmm5,%xmm14
 	call	_aesni_decrypt4
-	xorps	%xmm9,%xmm2
-	movups	48(%rdi),%xmm9
-	xorps	%xmm8,%xmm3
-	movups	%xmm2,(%rsi)
-	xorps	%xmm7,%xmm4
-	movups	%xmm3,16(%rsi)
-	xorps	%xmm6,%xmm5
-	movups	%xmm4,32(%rsi)
-	movaps	%xmm5,%xmm2
+	pxor	%xmm10,%xmm2
+	movaps	%xmm14,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	movdqa	%xmm5,%xmm2
+	pxor	%xmm5,%xmm5
 	leaq	48(%rsi),%rsi
-	subq	$64,%rdx
 	jmp	.Lcbc_dec_tail_collected
+
 .align	16
-.Lcbc_dec_five:
-	xorps	%xmm7,%xmm7
-	call	_aesni_decrypt6
-	movups	16(%rdi),%xmm1
-	movups	32(%rdi),%xmm0
-	xorps	%xmm9,%xmm2
-	xorps	%xmm8,%xmm3
-	xorps	%xmm1,%xmm4
-	movups	48(%rdi),%xmm1
-	xorps	%xmm0,%xmm5
-	movups	64(%rdi),%xmm9
-	xorps	%xmm1,%xmm6
-	movups	%xmm2,(%rsi)
-	movups	%xmm3,16(%rsi)
-	movups	%xmm4,32(%rsi)
-	movups	%xmm5,48(%rsi)
-	leaq	64(%rsi),%rsi
-	movaps	%xmm6,%xmm2
-	subq	$80,%rdx
-	jmp	.Lcbc_dec_tail_collected
-.align	16
-.Lcbc_dec_six:
-	call	_aesni_decrypt6
-	movups	16(%rdi),%xmm1
-	movups	32(%rdi),%xmm0
-	xorps	%xmm9,%xmm2
-	xorps	%xmm8,%xmm3
-	xorps	%xmm1,%xmm4
-	movups	48(%rdi),%xmm1
-	xorps	%xmm0,%xmm5
-	movups	64(%rdi),%xmm0
-	xorps	%xmm1,%xmm6
-	movups	80(%rdi),%xmm9
-	xorps	%xmm0,%xmm7
-	movups	%xmm2,(%rsi)
-	movups	%xmm3,16(%rsi)
-	movups	%xmm4,32(%rsi)
-	movups	%xmm5,48(%rsi)
-	movups	%xmm6,64(%rsi)
-	leaq	80(%rsi),%rsi
-	movaps	%xmm7,%xmm2
-	subq	$96,%rdx
-	jmp	.Lcbc_dec_tail_collected
-.align	16
+.Lcbc_dec_clear_tail_collected:
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	pxor	%xmm8,%xmm8
+	pxor	%xmm9,%xmm9
 .Lcbc_dec_tail_collected:
+	movups	%xmm10,(%r8)
 	andq	$15,%rdx
-	movups	%xmm9,(%r8)
 	jnz	.Lcbc_dec_tail_partial
 	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
 	jmp	.Lcbc_dec_ret
 .align	16
 .Lcbc_dec_tail_partial:
-	movaps	%xmm2,-24(%rsp)
+	movaps	%xmm2,(%rsp)
+	pxor	%xmm2,%xmm2
 	movq	$16,%rcx
 	movq	%rsi,%rdi
 	subq	%rdx,%rcx
-	leaq	-24(%rsp),%rsi
-.long	0x9066A4F3	
+	leaq	(%rsp),%rsi
+.long	0x9066A4F3
+	movdqa	%xmm2,(%rsp)
 
 .Lcbc_dec_ret:
+	xorps	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	leaq	(%rbp),%rsp
+	popq	%rbp
 .Lcbc_ret:
 	.byte	0xf3,0xc3
 .size	aesni_cbc_encrypt,.-aesni_cbc_encrypt
@@ -2298,7 +3123,7 @@
 .type	aesni_set_decrypt_key, at function
 .align	16
 aesni_set_decrypt_key:
-.byte	0x48,0x83,0xEC,0x08	
+.byte	0x48,0x83,0xEC,0x08
 	call	__aesni_set_encrypt_key
 	shll	$4,%esi
 	testl	%eax,%eax
@@ -2326,7 +3151,9 @@
 
 	movups	(%rdx),%xmm0
 .byte	102,15,56,219,192
+	pxor	%xmm1,%xmm1
 	movups	%xmm0,(%rdi)
+	pxor	%xmm0,%xmm0
 .Ldec_key_ret:
 	addq	$8,%rsp
 	.byte	0xf3,0xc3
@@ -2337,7 +3164,7 @@
 .align	16
 aesni_set_encrypt_key:
 __aesni_set_encrypt_key:
-.byte	0x48,0x83,0xEC,0x08	
+.byte	0x48,0x83,0xEC,0x08
 	movq	$-1,%rax
 	testq	%rdi,%rdi
 	jz	.Lenc_key_ret
@@ -2344,8 +3171,10 @@
 	testq	%rdx,%rdx
 	jz	.Lenc_key_ret
 
+	movl	$268437504,%r10d
 	movups	(%rdi),%xmm0
 	xorps	%xmm4,%xmm4
+	andl	OPENSSL_ia32cap_P+4(%rip),%r10d
 	leaq	16(%rdx),%rax
 	cmpl	$256,%esi
 	je	.L14rounds
@@ -2356,6 +3185,9 @@
 
 .L10rounds:
 	movl	$9,%esi
+	cmpl	$268435456,%r10d
+	je	.L10rounds_alt
+
 	movups	%xmm0,(%rdx)
 .byte	102,15,58,223,200,1
 	call	.Lkey_expansion_128_cold
@@ -2383,9 +3215,79 @@
 	jmp	.Lenc_key_ret
 
 .align	16
+.L10rounds_alt:
+	movdqa	.Lkey_rotate(%rip),%xmm5
+	movl	$8,%r10d
+	movdqa	.Lkey_rcon1(%rip),%xmm4
+	movdqa	%xmm0,%xmm2
+	movdqu	%xmm0,(%rdx)
+	jmp	.Loop_key128
+
+.align	16
+.Loop_key128:
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	pslld	$1,%xmm4
+	leaq	16(%rax),%rax
+
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,-16(%rax)
+	movdqa	%xmm0,%xmm2
+
+	decl	%r10d
+	jnz	.Loop_key128
+
+	movdqa	.Lkey_rcon1b(%rip),%xmm4
+
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	pslld	$1,%xmm4
+
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,(%rax)
+
+	movdqa	%xmm0,%xmm2
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,16(%rax)
+
+	movl	%esi,96(%rax)
+	xorl	%eax,%eax
+	jmp	.Lenc_key_ret
+
+.align	16
 .L12rounds:
 	movq	16(%rdi),%xmm2
 	movl	$11,%esi
+	cmpl	$268435456,%r10d
+	je	.L12rounds_alt
+
 	movups	%xmm0,(%rdx)
 .byte	102,15,58,223,202,1
 	call	.Lkey_expansion_192a_cold
@@ -2409,10 +3311,54 @@
 	jmp	.Lenc_key_ret
 
 .align	16
+.L12rounds_alt:
+	movdqa	.Lkey_rotate192(%rip),%xmm5
+	movdqa	.Lkey_rcon1(%rip),%xmm4
+	movl	$8,%r10d
+	movdqu	%xmm0,(%rdx)
+	jmp	.Loop_key192
+
+.align	16
+.Loop_key192:
+	movq	%xmm2,0(%rax)
+	movdqa	%xmm2,%xmm1
+.byte	102,15,56,0,213
+.byte	102,15,56,221,212
+	pslld	$1,%xmm4
+	leaq	24(%rax),%rax
+
+	movdqa	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm3,%xmm0
+
+	pshufd	$0xff,%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+
+	pxor	%xmm2,%xmm0
+	pxor	%xmm3,%xmm2
+	movdqu	%xmm0,-16(%rax)
+
+	decl	%r10d
+	jnz	.Loop_key192
+
+	movl	%esi,32(%rax)
+	xorl	%eax,%eax
+	jmp	.Lenc_key_ret
+
+.align	16
 .L14rounds:
 	movups	16(%rdi),%xmm2
 	movl	$13,%esi
 	leaq	16(%rax),%rax
+	cmpl	$268435456,%r10d
+	je	.L14rounds_alt
+
 	movups	%xmm0,(%rdx)
 	movups	%xmm2,16(%rdx)
 .byte	102,15,58,223,202,1
@@ -2447,9 +3393,69 @@
 	jmp	.Lenc_key_ret
 
 .align	16
+.L14rounds_alt:
+	movdqa	.Lkey_rotate(%rip),%xmm5
+	movdqa	.Lkey_rcon1(%rip),%xmm4
+	movl	$7,%r10d
+	movdqu	%xmm0,0(%rdx)
+	movdqa	%xmm2,%xmm1
+	movdqu	%xmm2,16(%rdx)
+	jmp	.Loop_key256
+
+.align	16
+.Loop_key256:
+.byte	102,15,56,0,213
+.byte	102,15,56,221,212
+
+	movdqa	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm3,%xmm0
+	pslld	$1,%xmm4
+
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,(%rax)
+
+	decl	%r10d
+	jz	.Ldone_key256
+
+	pshufd	$0xff,%xmm0,%xmm2
+	pxor	%xmm3,%xmm3
+.byte	102,15,56,221,211
+
+	movdqa	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm3,%xmm1
+
+	pxor	%xmm1,%xmm2
+	movdqu	%xmm2,16(%rax)
+	leaq	32(%rax),%rax
+	movdqa	%xmm2,%xmm1
+
+	jmp	.Loop_key256
+
+.Ldone_key256:
+	movl	%esi,16(%rax)
+	xorl	%eax,%eax
+	jmp	.Lenc_key_ret
+
+.align	16
 .Lbad_keybits:
 	movq	$-2,%rax
 .Lenc_key_ret:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
 	addq	$8,%rsp
 	.byte	0xf3,0xc3
 .LSEH_end_set_encrypt_key:
@@ -2533,6 +3539,16 @@
 .long	1,0,0,0
 .Lxts_magic:
 .long	0x87,0,1,0
+.Lincrement1:
+.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Lkey_rotate:
+.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+.Lkey_rotate192:
+.long	0x04070605,0x04070605,0x04070605,0x04070605
+.Lkey_rcon1:
+.long	1,1,1,1
+.Lkey_rcon1b:
+.long	0x1b,0x1b,0x1b,0x1b
 
 .byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align	64

Modified: trunk/secure/lib/libcrypto/amd64/bsaes-x86_64.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/bsaes-x86_64.S	2019-01-20 05:38:02 UTC (rev 12152)
+++ trunk/secure/lib/libcrypto/amd64/bsaes-x86_64.S	2019-01-20 05:38:15 UTC (rev 12153)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/amd64/bsaes-x86_64.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from bsaes-x86_64.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/bsaes-x86_64.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from bsaes-x86_64.pl. */
 .text	
 
 
@@ -16,18 +16,18 @@
 	movdqa	80(%r11),%xmm7
 	pxor	%xmm8,%xmm15
 	pxor	%xmm8,%xmm0
+	pxor	%xmm8,%xmm1
+	pxor	%xmm8,%xmm2
 .byte	102,68,15,56,0,255
-	pxor	%xmm8,%xmm1
 .byte	102,15,56,0,199
-	pxor	%xmm8,%xmm2
+	pxor	%xmm8,%xmm3
+	pxor	%xmm8,%xmm4
 .byte	102,15,56,0,207
-	pxor	%xmm8,%xmm3
 .byte	102,15,56,0,215
-	pxor	%xmm8,%xmm4
+	pxor	%xmm8,%xmm5
+	pxor	%xmm8,%xmm6
 .byte	102,15,56,0,223
-	pxor	%xmm8,%xmm5
 .byte	102,15,56,0,231
-	pxor	%xmm8,%xmm6
 .byte	102,15,56,0,239
 .byte	102,15,56,0,247
 _bsaes_encrypt8_bitslice:
@@ -124,21 +124,21 @@
 .Lenc_loop:
 	pxor	0(%rax),%xmm15
 	pxor	16(%rax),%xmm0
+	pxor	32(%rax),%xmm1
+	pxor	48(%rax),%xmm2
 .byte	102,68,15,56,0,255
-	pxor	32(%rax),%xmm1
 .byte	102,15,56,0,199
-	pxor	48(%rax),%xmm2
+	pxor	64(%rax),%xmm3
+	pxor	80(%rax),%xmm4
 .byte	102,15,56,0,207
-	pxor	64(%rax),%xmm3
 .byte	102,15,56,0,215
-	pxor	80(%rax),%xmm4
+	pxor	96(%rax),%xmm5
+	pxor	112(%rax),%xmm6
 .byte	102,15,56,0,223
-	pxor	96(%rax),%xmm5
 .byte	102,15,56,0,231
-	pxor	112(%rax),%xmm6
 .byte	102,15,56,0,239
+.byte	102,15,56,0,247
 	leaq	128(%rax),%rax
-.byte	102,15,56,0,247
 .Lenc_sbox:
 	pxor	%xmm5,%xmm4
 	pxor	%xmm0,%xmm1
@@ -327,20 +327,20 @@
 	pxor	%xmm2,%xmm5
 	decl	%r10d
 	jl	.Lenc_done
-	pshufd	$147,%xmm15,%xmm7
-	pshufd	$147,%xmm0,%xmm8
+	pshufd	$0x93,%xmm15,%xmm7
+	pshufd	$0x93,%xmm0,%xmm8
 	pxor	%xmm7,%xmm15
-	pshufd	$147,%xmm3,%xmm9
+	pshufd	$0x93,%xmm3,%xmm9
 	pxor	%xmm8,%xmm0
-	pshufd	$147,%xmm5,%xmm10
+	pshufd	$0x93,%xmm5,%xmm10
 	pxor	%xmm9,%xmm3
-	pshufd	$147,%xmm2,%xmm11
+	pshufd	$0x93,%xmm2,%xmm11
 	pxor	%xmm10,%xmm5
-	pshufd	$147,%xmm6,%xmm12
+	pshufd	$0x93,%xmm6,%xmm12
 	pxor	%xmm11,%xmm2
-	pshufd	$147,%xmm1,%xmm13
+	pshufd	$0x93,%xmm1,%xmm13
 	pxor	%xmm12,%xmm6
-	pshufd	$147,%xmm4,%xmm14
+	pshufd	$0x93,%xmm4,%xmm14
 	pxor	%xmm13,%xmm1
 	pxor	%xmm14,%xmm4
 
@@ -347,25 +347,25 @@
 	pxor	%xmm15,%xmm8
 	pxor	%xmm4,%xmm7
 	pxor	%xmm4,%xmm8
-	pshufd	$78,%xmm15,%xmm15
+	pshufd	$0x4E,%xmm15,%xmm15
 	pxor	%xmm0,%xmm9
-	pshufd	$78,%xmm0,%xmm0
+	pshufd	$0x4E,%xmm0,%xmm0
 	pxor	%xmm2,%xmm12
 	pxor	%xmm7,%xmm15
 	pxor	%xmm6,%xmm13
 	pxor	%xmm8,%xmm0
 	pxor	%xmm5,%xmm11
-	pshufd	$78,%xmm2,%xmm7
+	pshufd	$0x4E,%xmm2,%xmm7
 	pxor	%xmm1,%xmm14
-	pshufd	$78,%xmm6,%xmm8
+	pshufd	$0x4E,%xmm6,%xmm8
 	pxor	%xmm3,%xmm10
-	pshufd	$78,%xmm5,%xmm2
+	pshufd	$0x4E,%xmm5,%xmm2
 	pxor	%xmm4,%xmm10
-	pshufd	$78,%xmm4,%xmm6
+	pshufd	$0x4E,%xmm4,%xmm6
 	pxor	%xmm4,%xmm11
-	pshufd	$78,%xmm1,%xmm5
+	pshufd	$0x4E,%xmm1,%xmm5
 	pxor	%xmm11,%xmm7
-	pshufd	$78,%xmm3,%xmm1
+	pshufd	$0x4E,%xmm3,%xmm1
 	pxor	%xmm12,%xmm8
 	pxor	%xmm10,%xmm2
 	pxor	%xmm14,%xmm6
@@ -488,18 +488,18 @@
 	movdqa	-48(%r11),%xmm7
 	pxor	%xmm8,%xmm15
 	pxor	%xmm8,%xmm0
+	pxor	%xmm8,%xmm1
+	pxor	%xmm8,%xmm2
 .byte	102,68,15,56,0,255
-	pxor	%xmm8,%xmm1
 .byte	102,15,56,0,199
-	pxor	%xmm8,%xmm2
+	pxor	%xmm8,%xmm3
+	pxor	%xmm8,%xmm4
 .byte	102,15,56,0,207
-	pxor	%xmm8,%xmm3
 .byte	102,15,56,0,215
-	pxor	%xmm8,%xmm4
+	pxor	%xmm8,%xmm5
+	pxor	%xmm8,%xmm6
 .byte	102,15,56,0,223
-	pxor	%xmm8,%xmm5
 .byte	102,15,56,0,231
-	pxor	%xmm8,%xmm6
 .byte	102,15,56,0,239
 .byte	102,15,56,0,247
 	movdqa	0(%r11),%xmm7
@@ -595,21 +595,21 @@
 .Ldec_loop:
 	pxor	0(%rax),%xmm15
 	pxor	16(%rax),%xmm0
+	pxor	32(%rax),%xmm1
+	pxor	48(%rax),%xmm2
 .byte	102,68,15,56,0,255
-	pxor	32(%rax),%xmm1
 .byte	102,15,56,0,199
-	pxor	48(%rax),%xmm2
+	pxor	64(%rax),%xmm3
+	pxor	80(%rax),%xmm4
 .byte	102,15,56,0,207
-	pxor	64(%rax),%xmm3
 .byte	102,15,56,0,215
-	pxor	80(%rax),%xmm4
+	pxor	96(%rax),%xmm5
+	pxor	112(%rax),%xmm6
 .byte	102,15,56,0,223
-	pxor	96(%rax),%xmm5
 .byte	102,15,56,0,231
-	pxor	112(%rax),%xmm6
 .byte	102,15,56,0,239
+.byte	102,15,56,0,247
 	leaq	128(%rax),%rax
-.byte	102,15,56,0,247
 .Ldec_sbox:
 	pxor	%xmm3,%xmm2
 
@@ -799,24 +799,24 @@
 	decl	%r10d
 	jl	.Ldec_done
 
-	pshufd	$78,%xmm15,%xmm7
-	pshufd	$78,%xmm2,%xmm13
+	pshufd	$0x4E,%xmm15,%xmm7
+	pshufd	$0x4E,%xmm2,%xmm13
 	pxor	%xmm15,%xmm7
-	pshufd	$78,%xmm4,%xmm14
+	pshufd	$0x4E,%xmm4,%xmm14
 	pxor	%xmm2,%xmm13
-	pshufd	$78,%xmm0,%xmm8
+	pshufd	$0x4E,%xmm0,%xmm8
 	pxor	%xmm4,%xmm14
-	pshufd	$78,%xmm5,%xmm9
+	pshufd	$0x4E,%xmm5,%xmm9
 	pxor	%xmm0,%xmm8
-	pshufd	$78,%xmm3,%xmm10
+	pshufd	$0x4E,%xmm3,%xmm10
 	pxor	%xmm5,%xmm9
 	pxor	%xmm13,%xmm15
 	pxor	%xmm13,%xmm0
-	pshufd	$78,%xmm1,%xmm11
+	pshufd	$0x4E,%xmm1,%xmm11
 	pxor	%xmm3,%xmm10
 	pxor	%xmm7,%xmm5
 	pxor	%xmm8,%xmm3
-	pshufd	$78,%xmm6,%xmm12
+	pshufd	$0x4E,%xmm6,%xmm12
 	pxor	%xmm1,%xmm11
 	pxor	%xmm14,%xmm0
 	pxor	%xmm9,%xmm1
@@ -830,20 +830,20 @@
 	pxor	%xmm14,%xmm1
 	pxor	%xmm14,%xmm6
 	pxor	%xmm12,%xmm4
-	pshufd	$147,%xmm15,%xmm7
-	pshufd	$147,%xmm0,%xmm8
+	pshufd	$0x93,%xmm15,%xmm7
+	pshufd	$0x93,%xmm0,%xmm8
 	pxor	%xmm7,%xmm15
-	pshufd	$147,%xmm5,%xmm9
+	pshufd	$0x93,%xmm5,%xmm9
 	pxor	%xmm8,%xmm0
-	pshufd	$147,%xmm3,%xmm10
+	pshufd	$0x93,%xmm3,%xmm10
 	pxor	%xmm9,%xmm5
-	pshufd	$147,%xmm1,%xmm11
+	pshufd	$0x93,%xmm1,%xmm11
 	pxor	%xmm10,%xmm3
-	pshufd	$147,%xmm6,%xmm12
+	pshufd	$0x93,%xmm6,%xmm12
 	pxor	%xmm11,%xmm1
-	pshufd	$147,%xmm2,%xmm13
+	pshufd	$0x93,%xmm2,%xmm13
 	pxor	%xmm12,%xmm6
-	pshufd	$147,%xmm4,%xmm14
+	pshufd	$0x93,%xmm4,%xmm14
 	pxor	%xmm13,%xmm2
 	pxor	%xmm14,%xmm4
 
@@ -850,25 +850,25 @@
 	pxor	%xmm15,%xmm8
 	pxor	%xmm4,%xmm7
 	pxor	%xmm4,%xmm8
-	pshufd	$78,%xmm15,%xmm15
+	pshufd	$0x4E,%xmm15,%xmm15
 	pxor	%xmm0,%xmm9
-	pshufd	$78,%xmm0,%xmm0
+	pshufd	$0x4E,%xmm0,%xmm0
 	pxor	%xmm1,%xmm12
 	pxor	%xmm7,%xmm15
 	pxor	%xmm6,%xmm13
 	pxor	%xmm8,%xmm0
 	pxor	%xmm3,%xmm11
-	pshufd	$78,%xmm1,%xmm7
+	pshufd	$0x4E,%xmm1,%xmm7
 	pxor	%xmm2,%xmm14
-	pshufd	$78,%xmm6,%xmm8
+	pshufd	$0x4E,%xmm6,%xmm8
 	pxor	%xmm5,%xmm10
-	pshufd	$78,%xmm3,%xmm1
+	pshufd	$0x4E,%xmm3,%xmm1
 	pxor	%xmm4,%xmm10
-	pshufd	$78,%xmm4,%xmm6
+	pshufd	$0x4E,%xmm4,%xmm6
 	pxor	%xmm4,%xmm11
-	pshufd	$78,%xmm2,%xmm3
+	pshufd	$0x4E,%xmm2,%xmm3
 	pxor	%xmm11,%xmm7
-	pshufd	$78,%xmm5,%xmm2
+	pshufd	$0x4E,%xmm5,%xmm2
 	pxor	%xmm12,%xmm8
 	pxor	%xmm1,%xmm10
 	pxor	%xmm14,%xmm6
@@ -1287,7 +1287,7 @@
 	leaq	(%r12),%rdi
 	leaq	32(%rbp),%rsi
 	leaq	(%r15),%rdx
-	call	asm_AES_decrypt		
+	call	asm_AES_decrypt
 	pxor	32(%rbp),%xmm14
 	movdqu	%xmm14,(%r13)
 	movdqa	%xmm15,%xmm14
@@ -1385,21 +1385,21 @@
 	movdqa	-16(%r11),%xmm7
 	pxor	%xmm8,%xmm15
 	pxor	%xmm8,%xmm0
+	pxor	%xmm8,%xmm1
+	pxor	%xmm8,%xmm2
 .byte	102,68,15,56,0,255
-	pxor	%xmm8,%xmm1
 .byte	102,15,56,0,199
-	pxor	%xmm8,%xmm2
+	pxor	%xmm8,%xmm3
+	pxor	%xmm8,%xmm4
 .byte	102,15,56,0,207
-	pxor	%xmm8,%xmm3
 .byte	102,15,56,0,215
-	pxor	%xmm8,%xmm4
+	pxor	%xmm8,%xmm5
+	pxor	%xmm8,%xmm6
 .byte	102,15,56,0,223
-	pxor	%xmm8,%xmm5
 .byte	102,15,56,0,231
-	pxor	%xmm8,%xmm6
 .byte	102,15,56,0,239
+.byte	102,15,56,0,247
 	leaq	.LBS0(%rip),%r11
-.byte	102,15,56,0,247
 	movl	%ebx,%r10d
 
 	call	_bsaes_encrypt8_bitslice
@@ -1537,7 +1537,7 @@
 	leaq	(%r9),%rdi
 	leaq	32(%rbp),%rsi
 	leaq	(%r8),%rdx
-	call	asm_AES_encrypt		
+	call	asm_AES_encrypt
 
 	movl	240(%r15),%eax
 	movq	%r14,%rbx
@@ -1555,7 +1555,7 @@
 	movdqa	%xmm7,(%rax)
 
 	andq	$-16,%r14
-	subq	$128,%rsp
+	subq	$0x80,%rsp
 	movdqa	32(%rbp),%xmm6
 
 	pxor	%xmm14,%xmm14
@@ -1562,13 +1562,13 @@
 	movdqa	.Lxts_magic(%rip),%xmm12
 	pcmpgtd	%xmm6,%xmm14
 
-	subq	$128,%r14
+	subq	$0x80,%r14
 	jc	.Lxts_enc_short
 	jmp	.Lxts_enc_loop
 
 .align	16
 .Lxts_enc_loop:
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm15
 	movdqa	%xmm6,0(%rsp)
@@ -1576,7 +1576,7 @@
 	pand	%xmm12,%xmm13
 	pcmpgtd	%xmm6,%xmm14
 	pxor	%xmm13,%xmm6
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm0
 	movdqa	%xmm6,16(%rsp)
@@ -1585,7 +1585,7 @@
 	pcmpgtd	%xmm6,%xmm14
 	pxor	%xmm13,%xmm6
 	movdqu	0(%r12),%xmm7
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm1
 	movdqa	%xmm6,32(%rsp)
@@ -1595,7 +1595,7 @@
 	pxor	%xmm13,%xmm6
 	movdqu	16(%r12),%xmm8
 	pxor	%xmm7,%xmm15
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm2
 	movdqa	%xmm6,48(%rsp)
@@ -1605,7 +1605,7 @@
 	pxor	%xmm13,%xmm6
 	movdqu	32(%r12),%xmm9
 	pxor	%xmm8,%xmm0
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm3
 	movdqa	%xmm6,64(%rsp)
@@ -1615,7 +1615,7 @@
 	pxor	%xmm13,%xmm6
 	movdqu	48(%r12),%xmm10
 	pxor	%xmm9,%xmm1
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm4
 	movdqa	%xmm6,80(%rsp)
@@ -1625,7 +1625,7 @@
 	pxor	%xmm13,%xmm6
 	movdqu	64(%r12),%xmm11
 	pxor	%xmm10,%xmm2
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm5
 	movdqa	%xmm6,96(%rsp)
@@ -1669,7 +1669,7 @@
 	pxor	%xmm14,%xmm14
 	movdqa	.Lxts_magic(%rip),%xmm12
 	pcmpgtd	%xmm6,%xmm14
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	paddq	%xmm6,%xmm6
 	pand	%xmm12,%xmm13
@@ -1676,13 +1676,13 @@
 	pcmpgtd	%xmm6,%xmm14
 	pxor	%xmm13,%xmm6
 
-	subq	$128,%r14
+	subq	$0x80,%r14
 	jnc	.Lxts_enc_loop
 
 .Lxts_enc_short:
-	addq	$128,%r14
+	addq	$0x80,%r14
 	jz	.Lxts_enc_done
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm15
 	movdqa	%xmm6,0(%rsp)
@@ -1690,7 +1690,7 @@
 	pand	%xmm12,%xmm13
 	pcmpgtd	%xmm6,%xmm14
 	pxor	%xmm13,%xmm6
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm0
 	movdqa	%xmm6,16(%rsp)
@@ -1701,7 +1701,7 @@
 	movdqu	0(%r12),%xmm7
 	cmpq	$16,%r14
 	je	.Lxts_enc_1
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm1
 	movdqa	%xmm6,32(%rsp)
@@ -1713,7 +1713,7 @@
 	cmpq	$32,%r14
 	je	.Lxts_enc_2
 	pxor	%xmm7,%xmm15
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm2
 	movdqa	%xmm6,48(%rsp)
@@ -1725,7 +1725,7 @@
 	cmpq	$48,%r14
 	je	.Lxts_enc_3
 	pxor	%xmm8,%xmm0
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm3
 	movdqa	%xmm6,64(%rsp)
@@ -1737,7 +1737,7 @@
 	cmpq	$64,%r14
 	je	.Lxts_enc_4
 	pxor	%xmm9,%xmm1
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm4
 	movdqa	%xmm6,80(%rsp)
@@ -1749,7 +1749,7 @@
 	cmpq	$80,%r14
 	je	.Lxts_enc_5
 	pxor	%xmm10,%xmm2
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm5
 	movdqa	%xmm6,96(%rsp)
@@ -1907,7 +1907,7 @@
 	leaq	32(%rbp),%rdi
 	leaq	32(%rbp),%rsi
 	leaq	(%r15),%rdx
-	call	asm_AES_encrypt		
+	call	asm_AES_encrypt
 	pxor	32(%rbp),%xmm15
 
 
@@ -1940,7 +1940,7 @@
 	leaq	32(%rbp),%rsi
 	movdqa	%xmm15,32(%rbp)
 	leaq	(%r15),%rdx
-	call	asm_AES_encrypt		
+	call	asm_AES_encrypt
 	pxor	32(%rbp),%xmm6
 	movdqu	%xmm6,-16(%r13)
 
@@ -1989,7 +1989,7 @@
 	leaq	(%r9),%rdi
 	leaq	32(%rbp),%rsi
 	leaq	(%r8),%rdx
-	call	asm_AES_encrypt		
+	call	asm_AES_encrypt
 
 	movl	240(%r15),%eax
 	movq	%r14,%rbx
@@ -2014,7 +2014,7 @@
 	shlq	$4,%rax
 	subq	%rax,%r14
 
-	subq	$128,%rsp
+	subq	$0x80,%rsp
 	movdqa	32(%rbp),%xmm6
 
 	pxor	%xmm14,%xmm14
@@ -2021,13 +2021,13 @@
 	movdqa	.Lxts_magic(%rip),%xmm12
 	pcmpgtd	%xmm6,%xmm14
 
-	subq	$128,%r14
+	subq	$0x80,%r14
 	jc	.Lxts_dec_short
 	jmp	.Lxts_dec_loop
 
 .align	16
 .Lxts_dec_loop:
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm15
 	movdqa	%xmm6,0(%rsp)
@@ -2035,7 +2035,7 @@
 	pand	%xmm12,%xmm13
 	pcmpgtd	%xmm6,%xmm14
 	pxor	%xmm13,%xmm6
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm0
 	movdqa	%xmm6,16(%rsp)
@@ -2044,7 +2044,7 @@
 	pcmpgtd	%xmm6,%xmm14
 	pxor	%xmm13,%xmm6
 	movdqu	0(%r12),%xmm7
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm1
 	movdqa	%xmm6,32(%rsp)
@@ -2054,7 +2054,7 @@
 	pxor	%xmm13,%xmm6
 	movdqu	16(%r12),%xmm8
 	pxor	%xmm7,%xmm15
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm2
 	movdqa	%xmm6,48(%rsp)
@@ -2064,7 +2064,7 @@
 	pxor	%xmm13,%xmm6
 	movdqu	32(%r12),%xmm9
 	pxor	%xmm8,%xmm0
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm3
 	movdqa	%xmm6,64(%rsp)
@@ -2074,7 +2074,7 @@
 	pxor	%xmm13,%xmm6
 	movdqu	48(%r12),%xmm10
 	pxor	%xmm9,%xmm1
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm4
 	movdqa	%xmm6,80(%rsp)
@@ -2084,7 +2084,7 @@
 	pxor	%xmm13,%xmm6
 	movdqu	64(%r12),%xmm11
 	pxor	%xmm10,%xmm2
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm5
 	movdqa	%xmm6,96(%rsp)
@@ -2128,7 +2128,7 @@
 	pxor	%xmm14,%xmm14
 	movdqa	.Lxts_magic(%rip),%xmm12
 	pcmpgtd	%xmm6,%xmm14
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	paddq	%xmm6,%xmm6
 	pand	%xmm12,%xmm13
@@ -2135,13 +2135,13 @@
 	pcmpgtd	%xmm6,%xmm14
 	pxor	%xmm13,%xmm6
 
-	subq	$128,%r14
+	subq	$0x80,%r14
 	jnc	.Lxts_dec_loop
 
 .Lxts_dec_short:
-	addq	$128,%r14
+	addq	$0x80,%r14
 	jz	.Lxts_dec_done
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm15
 	movdqa	%xmm6,0(%rsp)
@@ -2149,7 +2149,7 @@
 	pand	%xmm12,%xmm13
 	pcmpgtd	%xmm6,%xmm14
 	pxor	%xmm13,%xmm6
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm0
 	movdqa	%xmm6,16(%rsp)
@@ -2160,7 +2160,7 @@
 	movdqu	0(%r12),%xmm7
 	cmpq	$16,%r14
 	je	.Lxts_dec_1
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm1
 	movdqa	%xmm6,32(%rsp)
@@ -2172,7 +2172,7 @@
 	cmpq	$32,%r14
 	je	.Lxts_dec_2
 	pxor	%xmm7,%xmm15
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm2
 	movdqa	%xmm6,48(%rsp)
@@ -2184,7 +2184,7 @@
 	cmpq	$48,%r14
 	je	.Lxts_dec_3
 	pxor	%xmm8,%xmm0
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm3
 	movdqa	%xmm6,64(%rsp)
@@ -2196,7 +2196,7 @@
 	cmpq	$64,%r14
 	je	.Lxts_dec_4
 	pxor	%xmm9,%xmm1
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm4
 	movdqa	%xmm6,80(%rsp)
@@ -2208,7 +2208,7 @@
 	cmpq	$80,%r14
 	je	.Lxts_dec_5
 	pxor	%xmm10,%xmm2
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	pxor	%xmm14,%xmm14
 	movdqa	%xmm6,%xmm5
 	movdqa	%xmm6,96(%rsp)
@@ -2366,7 +2366,7 @@
 	leaq	32(%rbp),%rdi
 	leaq	32(%rbp),%rsi
 	leaq	(%r15),%rdx
-	call	asm_AES_decrypt		
+	call	asm_AES_decrypt
 	pxor	32(%rbp),%xmm15
 
 
@@ -2385,7 +2385,7 @@
 	pxor	%xmm14,%xmm14
 	movdqa	.Lxts_magic(%rip),%xmm12
 	pcmpgtd	%xmm6,%xmm14
-	pshufd	$19,%xmm14,%xmm13
+	pshufd	$0x13,%xmm14,%xmm13
 	movdqa	%xmm6,%xmm5
 	paddq	%xmm6,%xmm6
 	pand	%xmm12,%xmm13
@@ -2397,7 +2397,7 @@
 	leaq	32(%rbp),%rsi
 	movdqa	%xmm15,32(%rbp)
 	leaq	(%r15),%rdx
-	call	asm_AES_decrypt		
+	call	asm_AES_decrypt
 	pxor	32(%rbp),%xmm6
 	movq	%r13,%rdx
 	movdqu	%xmm6,(%r13)
@@ -2418,7 +2418,7 @@
 	leaq	32(%rbp),%rsi
 	movdqa	%xmm15,32(%rbp)
 	leaq	(%r15),%rdx
-	call	asm_AES_decrypt		
+	call	asm_AES_decrypt
 	pxor	32(%rbp),%xmm5
 	movdqu	%xmm5,(%r13)
 

Modified: trunk/secure/lib/libcrypto/amd64/cmll-x86_64.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/cmll-x86_64.S	2019-01-20 05:38:02 UTC (rev 12152)
+++ trunk/secure/lib/libcrypto/amd64/cmll-x86_64.S	2019-01-20 05:38:15 UTC (rev 12153)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/amd64/cmll-x86_64.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from cmll-x86_64.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/cmll-x86_64.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from cmll-x86_64.pl. */
 .text	
 
 
@@ -269,7 +269,7 @@
 	movl	%ecx,%r10d
 	movl	%edx,%r11d
 
-.byte	0xf3,0xc3		
+.byte	0xf3,0xc3
 .size	_x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt
 
 
@@ -539,7 +539,7 @@
 	movl	%eax,%r10d
 	movl	%ebx,%r11d
 
-.byte	0xf3,0xc3		
+.byte	0xf3,0xc3
 .size	_x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
 .globl	Camellia_Ekeygen
 .type	Camellia_Ekeygen, at function
@@ -552,7 +552,7 @@
 	pushq	%r15
 .Lkey_prologue:
 
-	movq	%rdi,%r15
+	movl	%edi,%r15d
 	movq	%rdx,%r13
 
 	movl	0(%rsi),%r8d
@@ -1627,7 +1627,7 @@
 	leaq	-64-63(%rcx),%r10
 	subq	%rsp,%r10
 	negq	%r10
-	andq	$960,%r10
+	andq	$0x3C0,%r10
 	subq	%r10,%rsp
 
 
@@ -1726,7 +1726,7 @@
 	cld
 	movq	%r12,%rsi
 	leaq	8+24(%rsp),%rdi
-.long	0x9066A4F3		
+.long	0x9066A4F3
 	popfq
 .Lcbc_enc_popf:
 
@@ -1733,7 +1733,7 @@
 	leaq	24(%rsp),%r12
 	leaq	16+24(%rsp),%rax
 	movq	%rax,8(%rsp)
-	jmp	.Lcbc_eloop		
+	jmp	.Lcbc_eloop
 
 .align	16
 .LCBC_DECRYPT:
@@ -1816,7 +1816,7 @@
 	cld
 	leaq	8+24(%rsp),%rsi
 	leaq	(%r13),%rdi
-.long	0x9066A4F3		
+.long	0x9066A4F3
 	popfq
 .Lcbc_dec_popf:
 

Modified: trunk/secure/lib/libcrypto/amd64/ghash-x86_64.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/ghash-x86_64.S	2019-01-20 05:38:02 UTC (rev 12152)
+++ trunk/secure/lib/libcrypto/amd64/ghash-x86_64.S	2019-01-20 05:38:15 UTC (rev 12153)
@@ -1,8 +1,9 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/amd64/ghash-x86_64.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from ghash-x86_64.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/ghash-x86_64.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from ghash-x86_64.pl. */
 .text	
 
+
 .globl	gcm_gmult_4bit
 .type	gcm_gmult_4bit, at function
 .align	16
@@ -22,7 +23,7 @@
 	movq	$14,%rcx
 	movq	8(%rsi,%rax,1),%r8
 	movq	(%rsi,%rax,1),%r9
-	andb	$240,%bl
+	andb	$0xf0,%bl
 	movq	%r8,%rdx
 	jmp	.Loop1
 
@@ -29,7 +30,7 @@
 .align	16
 .Loop1:
 	shrq	$4,%r8
-	andq	$15,%rdx
+	andq	$0xf,%rdx
 	movq	%r9,%r10
 	movb	(%rdi,%rcx,1),%al
 	shrq	$4,%r9
@@ -45,13 +46,13 @@
 	js	.Lbreak1
 
 	shrq	$4,%r8
-	andq	$15,%rdx
+	andq	$0xf,%rdx
 	movq	%r9,%r10
 	shrq	$4,%r9
 	xorq	8(%rsi,%rax,1),%r8
 	shlq	$60,%r10
 	xorq	(%rsi,%rax,1),%r9
-	andb	$240,%bl
+	andb	$0xf0,%bl
 	xorq	(%r11,%rdx,8),%r9
 	movq	%r8,%rdx
 	xorq	%r10,%r8
@@ -60,19 +61,19 @@
 .align	16
 .Lbreak1:
 	shrq	$4,%r8
-	andq	$15,%rdx
+	andq	$0xf,%rdx
 	movq	%r9,%r10
 	shrq	$4,%r9
 	xorq	8(%rsi,%rax,1),%r8
 	shlq	$60,%r10
 	xorq	(%rsi,%rax,1),%r9
-	andb	$240,%bl
+	andb	$0xf0,%bl
 	xorq	(%r11,%rdx,8),%r9
 	movq	%r8,%rdx
 	xorq	%r10,%r8
 
 	shrq	$4,%r8
-	andq	$15,%rdx
+	andq	$0xf,%rdx
 	movq	%r9,%r10
 	shrq	$4,%r9
 	xorq	8(%rsi,%rbx,1),%r8
@@ -661,6 +662,7 @@
 .type	gcm_init_clmul, at function
 .align	16
 gcm_init_clmul:
+.L_init_clmul:
 	movdqu	(%rsi),%xmm2
 	pshufd	$78,%xmm2,%xmm2
 
@@ -679,15 +681,15 @@
 	pxor	%xmm5,%xmm2
 
 
+	pshufd	$78,%xmm2,%xmm6
 	movdqa	%xmm2,%xmm0
+	pxor	%xmm2,%xmm6
 	movdqa	%xmm0,%xmm1
 	pshufd	$78,%xmm0,%xmm3
-	pshufd	$78,%xmm2,%xmm4
 	pxor	%xmm0,%xmm3
-	pxor	%xmm2,%xmm4
 .byte	102,15,58,68,194,0
 .byte	102,15,58,68,202,17
-.byte	102,15,58,68,220,0
+.byte	102,15,58,68,222,0
 	pxor	%xmm0,%xmm3
 	pxor	%xmm1,%xmm3
 
@@ -697,29 +699,119 @@
 	pxor	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
 
+	movdqa	%xmm0,%xmm4
 	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
 	psllq	$1,%xmm0
 	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	pshufd	$78,%xmm2,%xmm3
+	pshufd	$78,%xmm0,%xmm4
+	pxor	%xmm2,%xmm3
+	movdqu	%xmm2,0(%rdi)
+	pxor	%xmm0,%xmm4
+	movdqu	%xmm0,16(%rdi)
+.byte	102,15,58,15,227,8
+	movdqu	%xmm4,32(%rdi)
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pxor	%xmm0,%xmm3
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,222,0
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
 	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
 	pxor	%xmm3,%xmm0
 	psllq	$57,%xmm0
-	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
 	pslldq	$8,%xmm0
-	psrldq	$8,%xmm4
-	pxor	%xmm3,%xmm0
-	pxor	%xmm4,%xmm1
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
 
 
 	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
 	psrlq	$5,%xmm0
 	pxor	%xmm4,%xmm0
 	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm0,%xmm5
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pxor	%xmm0,%xmm3
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,222,0
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
-	pxor	%xmm1,%xmm4
+
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
 	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
 	pxor	%xmm4,%xmm0
-	movdqu	%xmm2,(%rdi)
-	movdqu	%xmm0,16(%rdi)
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	pshufd	$78,%xmm5,%xmm3
+	pshufd	$78,%xmm0,%xmm4
+	pxor	%xmm5,%xmm3
+	movdqu	%xmm5,48(%rdi)
+	pxor	%xmm0,%xmm4
+	movdqu	%xmm0,64(%rdi)
+.byte	102,15,58,15,227,8
+	movdqu	%xmm4,80(%rdi)
 	.byte	0xf3,0xc3
 .size	gcm_init_clmul,.-gcm_init_clmul
 .globl	gcm_gmult_clmul
@@ -726,15 +818,15 @@
 .type	gcm_gmult_clmul, at function
 .align	16
 gcm_gmult_clmul:
+.L_gmult_clmul:
 	movdqu	(%rdi),%xmm0
 	movdqa	.Lbswap_mask(%rip),%xmm5
 	movdqu	(%rsi),%xmm2
+	movdqu	32(%rsi),%xmm4
 .byte	102,15,56,0,197
 	movdqa	%xmm0,%xmm1
 	pshufd	$78,%xmm0,%xmm3
-	pshufd	$78,%xmm2,%xmm4
 	pxor	%xmm0,%xmm3
-	pxor	%xmm2,%xmm4
 .byte	102,15,58,68,194,0
 .byte	102,15,58,68,202,17
 .byte	102,15,58,68,220,0
@@ -747,27 +839,28 @@
 	pxor	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
 
+	movdqa	%xmm0,%xmm4
 	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
 	psllq	$1,%xmm0
 	pxor	%xmm3,%xmm0
-	psllq	$5,%xmm0
-	pxor	%xmm3,%xmm0
 	psllq	$57,%xmm0
-	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
 	pslldq	$8,%xmm0
-	psrldq	$8,%xmm4
-	pxor	%xmm3,%xmm0
-	pxor	%xmm4,%xmm1
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
 
 
 	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
 	psrlq	$5,%xmm0
 	pxor	%xmm4,%xmm0
 	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
-	pxor	%xmm1,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm0
 .byte	102,15,56,0,197
 	movdqu	%xmm0,(%rdi)
 	.byte	0xf3,0xc3
@@ -774,174 +867,351 @@
 .size	gcm_gmult_clmul,.-gcm_gmult_clmul
 .globl	gcm_ghash_clmul
 .type	gcm_ghash_clmul, at function
-.align	16
+.align	32
 gcm_ghash_clmul:
-	movdqa	.Lbswap_mask(%rip),%xmm5
+.L_ghash_clmul:
+	movdqa	.Lbswap_mask(%rip),%xmm10
 
 	movdqu	(%rdi),%xmm0
 	movdqu	(%rsi),%xmm2
-.byte	102,15,56,0,197
+	movdqu	32(%rsi),%xmm7
+.byte	102,65,15,56,0,194
 
-	subq	$16,%rcx
+	subq	$0x10,%rcx
 	jz	.Lodd_tail
 
-	movdqu	16(%rsi),%xmm8
+	movdqu	16(%rsi),%xmm6
+	movl	OPENSSL_ia32cap_P+4(%rip),%eax
+	cmpq	$0x30,%rcx
+	jb	.Lskip4x
 
+	andl	$71303168,%eax
+	cmpl	$4194304,%eax
+	je	.Lskip4x
 
+	subq	$0x30,%rcx
+	movq	$0xA040608020C0E000,%rax
+	movdqu	48(%rsi),%xmm14
+	movdqu	64(%rsi),%xmm15
 
 
 
-	movdqu	(%rdx),%xmm3
-	movdqu	16(%rdx),%xmm6
-.byte	102,15,56,0,221
-.byte	102,15,56,0,245
-	pxor	%xmm3,%xmm0
-	movdqa	%xmm6,%xmm7
-	pshufd	$78,%xmm6,%xmm3
-	pshufd	$78,%xmm2,%xmm4
-	pxor	%xmm6,%xmm3
-	pxor	%xmm2,%xmm4
-.byte	102,15,58,68,242,0
-.byte	102,15,58,68,250,17
-.byte	102,15,58,68,220,0
-	pxor	%xmm6,%xmm3
-	pxor	%xmm7,%xmm3
 
-	movdqa	%xmm3,%xmm4
-	psrldq	$8,%xmm3
-	pslldq	$8,%xmm4
-	pxor	%xmm3,%xmm7
-	pxor	%xmm4,%xmm6
+	movdqu	48(%rdx),%xmm3
+	movdqu	32(%rdx),%xmm11
+.byte	102,65,15,56,0,218
+.byte	102,69,15,56,0,218
+	movdqa	%xmm3,%xmm5
+	pshufd	$78,%xmm3,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,68,218,0
+.byte	102,15,58,68,234,17
+.byte	102,15,58,68,231,0
+
+	movdqa	%xmm11,%xmm13
+	pshufd	$78,%xmm11,%xmm12
+	pxor	%xmm11,%xmm12
+.byte	102,68,15,58,68,222,0
+.byte	102,68,15,58,68,238,17
+.byte	102,68,15,58,68,231,16
+	xorps	%xmm11,%xmm3
+	xorps	%xmm13,%xmm5
+	movups	80(%rsi),%xmm7
+	xorps	%xmm12,%xmm4
+
+	movdqu	16(%rdx),%xmm11
+	movdqu	0(%rdx),%xmm8
+.byte	102,69,15,56,0,218
+.byte	102,69,15,56,0,194
+	movdqa	%xmm11,%xmm13
+	pshufd	$78,%xmm11,%xmm12
+	pxor	%xmm8,%xmm0
+	pxor	%xmm11,%xmm12
+.byte	102,69,15,58,68,222,0
 	movdqa	%xmm0,%xmm1
-	pshufd	$78,%xmm0,%xmm3
-	pshufd	$78,%xmm8,%xmm4
-	pxor	%xmm0,%xmm3
-	pxor	%xmm8,%xmm4
+	pshufd	$78,%xmm0,%xmm8
+	pxor	%xmm0,%xmm8
+.byte	102,69,15,58,68,238,17
+.byte	102,68,15,58,68,231,0
+	xorps	%xmm11,%xmm3
+	xorps	%xmm13,%xmm5
 
-	leaq	32(%rdx),%rdx
-	subq	$32,%rcx
-	jbe	.Leven_tail
+	leaq	64(%rdx),%rdx
+	subq	$0x40,%rcx
+	jc	.Ltail4x
 
-.Lmod_loop:
-.byte	102,65,15,58,68,192,0
-.byte	102,65,15,58,68,200,17
-.byte	102,15,58,68,220,0
-	pxor	%xmm0,%xmm3
-	pxor	%xmm1,%xmm3
+	jmp	.Lmod4_loop
+.align	32
+.Lmod4_loop:
+.byte	102,65,15,58,68,199,0
+	xorps	%xmm12,%xmm4
+	movdqu	48(%rdx),%xmm11
+.byte	102,69,15,56,0,218
+.byte	102,65,15,58,68,207,17
+	xorps	%xmm3,%xmm0
+	movdqu	32(%rdx),%xmm3
+	movdqa	%xmm11,%xmm13
+.byte	102,68,15,58,68,199,16
+	pshufd	$78,%xmm11,%xmm12
+	xorps	%xmm5,%xmm1
+	pxor	%xmm11,%xmm12
+.byte	102,65,15,56,0,218
+	movups	32(%rsi),%xmm7
+	xorps	%xmm4,%xmm8
+.byte	102,68,15,58,68,218,0
+	pshufd	$78,%xmm3,%xmm4
 
-	movdqa	%xmm3,%xmm4
-	psrldq	$8,%xmm3
-	pslldq	$8,%xmm4
-	pxor	%xmm3,%xmm1
-	pxor	%xmm4,%xmm0
-	movdqu	(%rdx),%xmm3
-	pxor	%xmm6,%xmm0
-	pxor	%xmm7,%xmm1
+	pxor	%xmm0,%xmm8
+	movdqa	%xmm3,%xmm5
+	pxor	%xmm1,%xmm8
+	pxor	%xmm3,%xmm4
+	movdqa	%xmm8,%xmm9
+.byte	102,68,15,58,68,234,17
+	pslldq	$8,%xmm8
+	psrldq	$8,%xmm9
+	pxor	%xmm8,%xmm0
+	movdqa	.L7_mask(%rip),%xmm8
+	pxor	%xmm9,%xmm1
+.byte	102,76,15,110,200
 
-	movdqu	16(%rdx),%xmm6
-.byte	102,15,56,0,221
-.byte	102,15,56,0,245
+	pand	%xmm0,%xmm8
+.byte	102,69,15,56,0,200
+	pxor	%xmm0,%xmm9
+.byte	102,68,15,58,68,231,0
+	psllq	$57,%xmm9
+	movdqa	%xmm9,%xmm8
+	pslldq	$8,%xmm9
+.byte	102,15,58,68,222,0
+	psrldq	$8,%xmm8
+	pxor	%xmm9,%xmm0
+	pxor	%xmm8,%xmm1
+	movdqu	0(%rdx),%xmm8
 
-	movdqa	%xmm6,%xmm7
-	pshufd	$78,%xmm6,%xmm9
-	pshufd	$78,%xmm2,%xmm10
-	pxor	%xmm6,%xmm9
-	pxor	%xmm2,%xmm10
-	pxor	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm9
+	psrlq	$1,%xmm0
+.byte	102,15,58,68,238,17
+	xorps	%xmm11,%xmm3
+	movdqu	16(%rdx),%xmm11
+.byte	102,69,15,56,0,218
+.byte	102,15,58,68,231,16
+	xorps	%xmm13,%xmm5
+	movups	80(%rsi),%xmm7
+.byte	102,69,15,56,0,194
+	pxor	%xmm9,%xmm1
+	pxor	%xmm0,%xmm9
+	psrlq	$5,%xmm0
 
+	movdqa	%xmm11,%xmm13
+	pxor	%xmm12,%xmm4
+	pshufd	$78,%xmm11,%xmm12
+	pxor	%xmm9,%xmm0
+	pxor	%xmm8,%xmm1
+	pxor	%xmm11,%xmm12
+.byte	102,69,15,58,68,222,0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm0,%xmm1
+.byte	102,69,15,58,68,238,17
+	xorps	%xmm11,%xmm3
+	pshufd	$78,%xmm0,%xmm8
+	pxor	%xmm0,%xmm8
+
+.byte	102,68,15,58,68,231,0
+	xorps	%xmm13,%xmm5
+
+	leaq	64(%rdx),%rdx
+	subq	$0x40,%rcx
+	jnc	.Lmod4_loop
+
+.Ltail4x:
+.byte	102,65,15,58,68,199,0
+.byte	102,65,15,58,68,207,17
+.byte	102,68,15,58,68,199,16
+	xorps	%xmm12,%xmm4
+	xorps	%xmm3,%xmm0
+	xorps	%xmm5,%xmm1
+	pxor	%xmm0,%xmm1
+	pxor	%xmm4,%xmm8
+
+	pxor	%xmm1,%xmm8
+	pxor	%xmm0,%xmm1
+
+	movdqa	%xmm8,%xmm9
+	psrldq	$8,%xmm8
+	pslldq	$8,%xmm9
+	pxor	%xmm8,%xmm1
+	pxor	%xmm9,%xmm0
+
+	movdqa	%xmm0,%xmm4
 	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
 	psllq	$1,%xmm0
 	pxor	%xmm3,%xmm0
-	psllq	$5,%xmm0
-	pxor	%xmm3,%xmm0
-.byte	102,15,58,68,242,0
 	psllq	$57,%xmm0
-	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
 	pslldq	$8,%xmm0
-	psrldq	$8,%xmm4
-	pxor	%xmm3,%xmm0
-	pxor	%xmm4,%xmm1
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
 
-.byte	102,15,58,68,250,17
+
 	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
 	psrlq	$5,%xmm0
 	pxor	%xmm4,%xmm0
 	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
-	pxor	%xmm1,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm0
+	addq	$0x40,%rcx
+	jz	.Ldone
+	movdqu	32(%rsi),%xmm7
+	subq	$0x10,%rcx
+	jz	.Lodd_tail
+.Lskip4x:
 
-.byte	102,69,15,58,68,202,0
+
+
+
+
+	movdqu	(%rdx),%xmm8
+	movdqu	16(%rdx),%xmm3
+.byte	102,69,15,56,0,194
+.byte	102,65,15,56,0,218
+	pxor	%xmm8,%xmm0
+
+	movdqa	%xmm3,%xmm5
+	pshufd	$78,%xmm3,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,68,218,0
+.byte	102,15,58,68,234,17
+.byte	102,15,58,68,231,0
+
+	leaq	32(%rdx),%rdx
+	nop
+	subq	$0x20,%rcx
+	jbe	.Leven_tail
+	nop
+	jmp	.Lmod_loop
+
+.align	32
+.Lmod_loop:
 	movdqa	%xmm0,%xmm1
-	pshufd	$78,%xmm0,%xmm3
-	pshufd	$78,%xmm8,%xmm4
-	pxor	%xmm0,%xmm3
+	movdqa	%xmm4,%xmm8
+	pshufd	$78,%xmm0,%xmm4
+	pxor	%xmm0,%xmm4
+
+.byte	102,15,58,68,198,0
+.byte	102,15,58,68,206,17
+.byte	102,15,58,68,231,16
+
+	pxor	%xmm3,%xmm0
+	pxor	%xmm5,%xmm1
+	movdqu	(%rdx),%xmm9
+	pxor	%xmm0,%xmm8
+.byte	102,69,15,56,0,202
+	movdqu	16(%rdx),%xmm3
+
+	pxor	%xmm1,%xmm8
+	pxor	%xmm9,%xmm1
 	pxor	%xmm8,%xmm4
+.byte	102,65,15,56,0,218
+	movdqa	%xmm4,%xmm8
+	psrldq	$8,%xmm8
+	pslldq	$8,%xmm4
+	pxor	%xmm8,%xmm1
+	pxor	%xmm4,%xmm0
 
-	pxor	%xmm6,%xmm9
-	pxor	%xmm7,%xmm9
-	movdqa	%xmm9,%xmm10
-	psrldq	$8,%xmm9
-	pslldq	$8,%xmm10
-	pxor	%xmm9,%xmm7
-	pxor	%xmm10,%xmm6
+	movdqa	%xmm3,%xmm5
 
+	movdqa	%xmm0,%xmm9
+	movdqa	%xmm0,%xmm8
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm8
+.byte	102,15,58,68,218,0
+	psllq	$1,%xmm0
+	pxor	%xmm8,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm8
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm8
+	pxor	%xmm9,%xmm0
+	pshufd	$78,%xmm5,%xmm4
+	pxor	%xmm8,%xmm1
+	pxor	%xmm5,%xmm4
+
+	movdqa	%xmm0,%xmm9
+	psrlq	$1,%xmm0
+.byte	102,15,58,68,234,17
+	pxor	%xmm9,%xmm1
+	pxor	%xmm0,%xmm9
+	psrlq	$5,%xmm0
+	pxor	%xmm9,%xmm0
 	leaq	32(%rdx),%rdx
-	subq	$32,%rcx
+	psrlq	$1,%xmm0
+.byte	102,15,58,68,231,0
+	pxor	%xmm1,%xmm0
+
+	subq	$0x20,%rcx
 	ja	.Lmod_loop
 
 .Leven_tail:
-.byte	102,65,15,58,68,192,0
-.byte	102,65,15,58,68,200,17
-.byte	102,15,58,68,220,0
-	pxor	%xmm0,%xmm3
-	pxor	%xmm1,%xmm3
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm8
+	pshufd	$78,%xmm0,%xmm4
+	pxor	%xmm0,%xmm4
 
-	movdqa	%xmm3,%xmm4
-	psrldq	$8,%xmm3
+.byte	102,15,58,68,198,0
+.byte	102,15,58,68,206,17
+.byte	102,15,58,68,231,16
+
+	pxor	%xmm3,%xmm0
+	pxor	%xmm5,%xmm1
+	pxor	%xmm0,%xmm8
+	pxor	%xmm1,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm8
+	psrldq	$8,%xmm8
 	pslldq	$8,%xmm4
-	pxor	%xmm3,%xmm1
+	pxor	%xmm8,%xmm1
 	pxor	%xmm4,%xmm0
-	pxor	%xmm6,%xmm0
-	pxor	%xmm7,%xmm1
 
+	movdqa	%xmm0,%xmm4
 	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
 	psllq	$1,%xmm0
 	pxor	%xmm3,%xmm0
-	psllq	$5,%xmm0
-	pxor	%xmm3,%xmm0
 	psllq	$57,%xmm0
-	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
 	pslldq	$8,%xmm0
-	psrldq	$8,%xmm4
-	pxor	%xmm3,%xmm0
-	pxor	%xmm4,%xmm1
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
 
 
 	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
 	psrlq	$5,%xmm0
 	pxor	%xmm4,%xmm0
 	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
-	pxor	%xmm1,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm0
 	testq	%rcx,%rcx
 	jnz	.Ldone
 
 .Lodd_tail:
-	movdqu	(%rdx),%xmm3
-.byte	102,15,56,0,221
-	pxor	%xmm3,%xmm0
+	movdqu	(%rdx),%xmm8
+.byte	102,69,15,56,0,194
+	pxor	%xmm8,%xmm0
 	movdqa	%xmm0,%xmm1
 	pshufd	$78,%xmm0,%xmm3
-	pshufd	$78,%xmm2,%xmm4
 	pxor	%xmm0,%xmm3
-	pxor	%xmm2,%xmm4
 .byte	102,15,58,68,194,0
 .byte	102,15,58,68,202,17
-.byte	102,15,58,68,220,0
+.byte	102,15,58,68,223,0
 	pxor	%xmm0,%xmm3
 	pxor	%xmm1,%xmm3
 
@@ -951,38 +1221,531 @@
 	pxor	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
 
+	movdqa	%xmm0,%xmm4
 	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
 	psllq	$1,%xmm0
 	pxor	%xmm3,%xmm0
-	psllq	$5,%xmm0
-	pxor	%xmm3,%xmm0
 	psllq	$57,%xmm0
-	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
 	pslldq	$8,%xmm0
-	psrldq	$8,%xmm4
-	pxor	%xmm3,%xmm0
-	pxor	%xmm4,%xmm1
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
 
 
 	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
 	psrlq	$5,%xmm0
 	pxor	%xmm4,%xmm0
 	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
-	pxor	%xmm1,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm0
 .Ldone:
-.byte	102,15,56,0,197
+.byte	102,65,15,56,0,194
 	movdqu	%xmm0,(%rdi)
 	.byte	0xf3,0xc3
-.LSEH_end_gcm_ghash_clmul:
 .size	gcm_ghash_clmul,.-gcm_ghash_clmul
+.globl	gcm_init_avx
+.type	gcm_init_avx, at function
+.align	32
+gcm_init_avx:
+	vzeroupper
+
+	vmovdqu	(%rsi),%xmm2
+	vpshufd	$78,%xmm2,%xmm2
+
+
+	vpshufd	$255,%xmm2,%xmm4
+	vpsrlq	$63,%xmm2,%xmm3
+	vpsllq	$1,%xmm2,%xmm2
+	vpxor	%xmm5,%xmm5,%xmm5
+	vpcmpgtd	%xmm4,%xmm5,%xmm5
+	vpslldq	$8,%xmm3,%xmm3
+	vpor	%xmm3,%xmm2,%xmm2
+
+
+	vpand	.L0x1c2_polynomial(%rip),%xmm5,%xmm5
+	vpxor	%xmm5,%xmm2,%xmm2
+
+	vpunpckhqdq	%xmm2,%xmm2,%xmm6
+	vmovdqa	%xmm2,%xmm0
+	vpxor	%xmm2,%xmm6,%xmm6
+	movq	$4,%r10
+	jmp	.Linit_start_avx
+.align	32
+.Linit_loop_avx:
+	vpalignr	$8,%xmm3,%xmm4,%xmm5
+	vmovdqu	%xmm5,-16(%rdi)
+	vpunpckhqdq	%xmm0,%xmm0,%xmm3
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
+	vpxor	%xmm0,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+
+	vpslldq	$8,%xmm3,%xmm4
+	vpsrldq	$8,%xmm3,%xmm3
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpsllq	$57,%xmm0,%xmm3
+	vpsllq	$62,%xmm0,%xmm4
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpsllq	$63,%xmm0,%xmm3
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpslldq	$8,%xmm4,%xmm3
+	vpsrldq	$8,%xmm4,%xmm4
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vpsrlq	$1,%xmm0,%xmm4
+	vpxor	%xmm0,%xmm1,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$5,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$1,%xmm0,%xmm0
+	vpxor	%xmm1,%xmm0,%xmm0
+.Linit_start_avx:
+	vmovdqa	%xmm0,%xmm5
+	vpunpckhqdq	%xmm0,%xmm0,%xmm3
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
+	vpxor	%xmm0,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+
+	vpslldq	$8,%xmm3,%xmm4
+	vpsrldq	$8,%xmm3,%xmm3
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpsllq	$57,%xmm0,%xmm3
+	vpsllq	$62,%xmm0,%xmm4
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpsllq	$63,%xmm0,%xmm3
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpslldq	$8,%xmm4,%xmm3
+	vpsrldq	$8,%xmm4,%xmm4
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vpsrlq	$1,%xmm0,%xmm4
+	vpxor	%xmm0,%xmm1,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$5,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$1,%xmm0,%xmm0
+	vpxor	%xmm1,%xmm0,%xmm0
+	vpshufd	$78,%xmm5,%xmm3
+	vpshufd	$78,%xmm0,%xmm4
+	vpxor	%xmm5,%xmm3,%xmm3
+	vmovdqu	%xmm5,0(%rdi)
+	vpxor	%xmm0,%xmm4,%xmm4
+	vmovdqu	%xmm0,16(%rdi)
+	leaq	48(%rdi),%rdi
+	subq	$1,%r10
+	jnz	.Linit_loop_avx
+
+	vpalignr	$8,%xmm4,%xmm3,%xmm5
+	vmovdqu	%xmm5,-16(%rdi)
+
+	vzeroupper
+	.byte	0xf3,0xc3
+.size	gcm_init_avx,.-gcm_init_avx
+.globl	gcm_gmult_avx
+.type	gcm_gmult_avx, at function
+.align	32
+gcm_gmult_avx:
+	jmp	.L_gmult_clmul
+.size	gcm_gmult_avx,.-gcm_gmult_avx
+.globl	gcm_ghash_avx
+.type	gcm_ghash_avx, at function
+.align	32
+gcm_ghash_avx:
+	vzeroupper
+
+	vmovdqu	(%rdi),%xmm10
+	leaq	.L0x1c2_polynomial(%rip),%r10
+	leaq	64(%rsi),%rsi
+	vmovdqu	.Lbswap_mask(%rip),%xmm13
+	vpshufb	%xmm13,%xmm10,%xmm10
+	cmpq	$0x80,%rcx
+	jb	.Lshort_avx
+	subq	$0x80,%rcx
+
+	vmovdqu	112(%rdx),%xmm14
+	vmovdqu	0-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vmovdqu	32-64(%rsi),%xmm7
+
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vmovdqu	96(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	16-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vmovdqu	80(%rdx),%xmm14
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	48-64(%rsi),%xmm6
+	vpxor	%xmm14,%xmm9,%xmm9
+	vmovdqu	64(%rdx),%xmm15
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	80-64(%rsi),%xmm7
+
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	64-64(%rsi),%xmm6
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	48(%rdx),%xmm14
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	96-64(%rsi),%xmm6
+	vpxor	%xmm5,%xmm2,%xmm2
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	128-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+
+	vmovdqu	32(%rdx),%xmm15
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	112-64(%rsi),%xmm6
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	16(%rdx),%xmm14
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	144-64(%rsi),%xmm6
+	vpxor	%xmm5,%xmm2,%xmm2
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	176-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+
+	vmovdqu	(%rdx),%xmm15
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	160-64(%rsi),%xmm6
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
+
+	leaq	128(%rdx),%rdx
+	cmpq	$0x80,%rcx
+	jb	.Ltail_avx
+
+	vpxor	%xmm10,%xmm15,%xmm15
+	subq	$0x80,%rcx
+	jmp	.Loop8x_avx
+
+.align	32
+.Loop8x_avx:
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vmovdqu	112(%rdx),%xmm14
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpxor	%xmm15,%xmm8,%xmm8
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm10
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm11
+	vmovdqu	0-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm12
+	vmovdqu	32-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+
+	vmovdqu	96(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm3,%xmm10,%xmm10
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vxorps	%xmm4,%xmm11,%xmm11
+	vmovdqu	16-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm5,%xmm12,%xmm12
+	vxorps	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	80(%rdx),%xmm14
+	vpxor	%xmm10,%xmm12,%xmm12
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpxor	%xmm11,%xmm12,%xmm12
+	vpslldq	$8,%xmm12,%xmm9
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vpsrldq	$8,%xmm12,%xmm12
+	vpxor	%xmm9,%xmm10,%xmm10
+	vmovdqu	48-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vxorps	%xmm12,%xmm11,%xmm11
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	80-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	64(%rdx),%xmm15
+	vpalignr	$8,%xmm10,%xmm10,%xmm12
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	64-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vxorps	%xmm15,%xmm8,%xmm8
+	vpxor	%xmm5,%xmm2,%xmm2
+
+	vmovdqu	48(%rdx),%xmm14
+	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	96-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	128-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	32(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	112-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+	vpxor	%xmm5,%xmm2,%xmm2
+	vxorps	%xmm12,%xmm10,%xmm10
+
+	vmovdqu	16(%rdx),%xmm14
+	vpalignr	$8,%xmm10,%xmm10,%xmm12
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	144-64(%rsi),%xmm6
+	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
+	vxorps	%xmm11,%xmm12,%xmm12
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	176-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	160-64(%rsi),%xmm6
+	vpxor	%xmm12,%xmm15,%xmm15
+	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm10,%xmm15,%xmm15
+
+	leaq	128(%rdx),%rdx
+	subq	$0x80,%rcx
+	jnc	.Loop8x_avx
+
+	addq	$0x80,%rcx
+	jmp	.Ltail_no_xor_avx
+
+.align	32
+.Lshort_avx:
+	vmovdqu	-16(%rdx,%rcx,1),%xmm14
+	leaq	(%rdx,%rcx,1),%rdx
+	vmovdqu	0-64(%rsi),%xmm6
+	vmovdqu	32-64(%rsi),%xmm7
+	vpshufb	%xmm13,%xmm14,%xmm15
+
+	vmovdqa	%xmm0,%xmm3
+	vmovdqa	%xmm1,%xmm4
+	vmovdqa	%xmm2,%xmm5
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-32(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	16-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vpsrldq	$8,%xmm7,%xmm7
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-48(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	48-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vmovdqu	80-64(%rsi),%xmm7
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-64(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	64-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vpsrldq	$8,%xmm7,%xmm7
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-80(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	96-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vmovdqu	128-64(%rsi),%xmm7
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-96(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	112-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vpsrldq	$8,%xmm7,%xmm7
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-112(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	144-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vmovq	184-64(%rsi),%xmm7
+	subq	$0x10,%rcx
+	jmp	.Ltail_avx
+
+.align	32
+.Ltail_avx:
+	vpxor	%xmm10,%xmm15,%xmm15
+.Ltail_no_xor_avx:
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+
+	vmovdqu	(%r10),%xmm12
+
+	vpxor	%xmm0,%xmm3,%xmm10
+	vpxor	%xmm1,%xmm4,%xmm11
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vpxor	%xmm10,%xmm5,%xmm5
+	vpxor	%xmm11,%xmm5,%xmm5
+	vpslldq	$8,%xmm5,%xmm9
+	vpsrldq	$8,%xmm5,%xmm5
+	vpxor	%xmm9,%xmm10,%xmm10
+	vpxor	%xmm5,%xmm11,%xmm11
+
+	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
+	vpalignr	$8,%xmm10,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm10,%xmm10
+
+	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
+	vpalignr	$8,%xmm10,%xmm10,%xmm10
+	vpxor	%xmm11,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm10,%xmm10
+
+	cmpq	$0,%rcx
+	jne	.Lshort_avx
+
+	vpshufb	%xmm13,%xmm10,%xmm10
+	vmovdqu	%xmm10,(%rdi)
+	vzeroupper
+	.byte	0xf3,0xc3
+.size	gcm_ghash_avx,.-gcm_ghash_avx
 .align	64
 .Lbswap_mask:
 .byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
 .L0x1c2_polynomial:
 .byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask:
+.long	7,0,7,0
+.L7_mask_poly:
+.long	7,0,450,0
 .align	64
 .type	.Lrem_4bit, at object
 .Lrem_4bit:
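
The gcm_ghash_avx tail above is built around vpclmulqdq, which multiplies selected 64-bit halves of its operands as polynomials over GF(2): immediate $0x00 takes both low qwords, $0x11 both high qwords, and the $0x10 forms take one half from each operand for the cross terms, with the (%r10) operand presumably pointing at the .L0x1c2_polynomial reduction constant set up earlier in the function. A plain-C sketch of the single 64x64 carry-less multiply each of those lanes performs (illustrative only; the function name is ours, not anything in this file):

#include <stdint.h>

/* Carry-less (polynomial) multiply of two 64-bit values, the scalar
 * equivalent of one vpclmulqdq lane: for every set bit i of b, XOR
 * (a << i) into a 128-bit accumulator returned as hi:lo. */
static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
    uint64_t h = 0, l = 0;
    for (int i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
            l ^= a << i;                /* low 64 bits of a * x^i       */
            if (i)
                h ^= a >> (64 - i);     /* bits that spill above bit 63 */
        }
    }
    *hi = h;
    *lo = l;
}

In the loop above, the $0x00, $0x11 and $0x10 products are collected in the %xmm0/%xmm1/%xmm2 accumulators before the final folds reduce the 256-bit result modulo the GHASH polynomial.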

Modified: trunk/secure/lib/libcrypto/amd64/md5-x86_64.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/md5-x86_64.S	2019-01-20 05:38:02 UTC (rev 12152)
+++ trunk/secure/lib/libcrypto/amd64/md5-x86_64.S	2019-01-20 05:38:15 UTC (rev 12153)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/amd64/md5-x86_64.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from md5-x86_64.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/md5-x86_64.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from md5-x86_64.pl. */
 .text	
 .align	16
 
@@ -32,7 +32,7 @@
 
 
 	cmpq	%rdi,%rsi
-	je	.Lend				
+	je	.Lend
 
 
 .Lloop:
@@ -496,7 +496,7 @@
 	movl	%ecx,%r11d
 	addl	%ecx,%ebx
 	movl	0(%rsi),%r10d
-	movl	$4294967295,%r11d
+	movl	$0xffffffff,%r11d
 	xorl	%edx,%r11d
 	leal	-198630844(%rax,%r10,1),%eax
 	orl	%ebx,%r11d
@@ -503,7 +503,7 @@
 	xorl	%ecx,%r11d
 	addl	%r11d,%eax
 	movl	28(%rsi),%r10d
-	movl	$4294967295,%r11d
+	movl	$0xffffffff,%r11d
 	roll	$6,%eax
 	xorl	%ecx,%r11d
 	addl	%ebx,%eax
@@ -512,7 +512,7 @@
 	xorl	%ebx,%r11d
 	addl	%r11d,%edx
 	movl	56(%rsi),%r10d
-	movl	$4294967295,%r11d
+	movl	$0xffffffff,%r11d
 	roll	$10,%edx
 	xorl	%ebx,%r11d
 	addl	%eax,%edx
@@ -521,7 +521,7 @@
 	xorl	%eax,%r11d
 	addl	%r11d,%ecx
 	movl	20(%rsi),%r10d
-	movl	$4294967295,%r11d
+	movl	$0xffffffff,%r11d
 	roll	$15,%ecx
 	xorl	%eax,%r11d
 	addl	%edx,%ecx
@@ -530,7 +530,7 @@
 	xorl	%edx,%r11d
 	addl	%r11d,%ebx
 	movl	48(%rsi),%r10d
-	movl	$4294967295,%r11d
+	movl	$0xffffffff,%r11d
 	roll	$21,%ebx
 	xorl	%edx,%r11d
 	addl	%ecx,%ebx
@@ -539,7 +539,7 @@
 	xorl	%ecx,%r11d
 	addl	%r11d,%eax
 	movl	12(%rsi),%r10d
-	movl	$4294967295,%r11d
+	movl	$0xffffffff,%r11d
 	roll	$6,%eax
 	xorl	%ecx,%r11d
 	addl	%ebx,%eax
@@ -548,7 +548,7 @@
 	xorl	%ebx,%r11d
 	addl	%r11d,%edx
 	movl	40(%rsi),%r10d
-	movl	$4294967295,%r11d
+	movl	$0xffffffff,%r11d
 	roll	$10,%edx
 	xorl	%ebx,%r11d
 	addl	%eax,%edx
@@ -557,7 +557,7 @@
 	xorl	%eax,%r11d
 	addl	%r11d,%ecx
 	movl	4(%rsi),%r10d
-	movl	$4294967295,%r11d
+	movl	$0xffffffff,%r11d
 	roll	$15,%ecx
 	xorl	%eax,%r11d
 	addl	%edx,%ecx
@@ -566,7 +566,7 @@
 	xorl	%edx,%r11d
 	addl	%r11d,%ebx
 	movl	32(%rsi),%r10d
-	movl	$4294967295,%r11d
+	movl	$0xffffffff,%r11d
 	roll	$21,%ebx
 	xorl	%edx,%r11d
 	addl	%ecx,%ebx
@@ -575,7 +575,7 @@
 	xorl	%ecx,%r11d
 	addl	%r11d,%eax
 	movl	60(%rsi),%r10d
-	movl	$4294967295,%r11d
+	movl	$0xffffffff,%r11d
 	roll	$6,%eax
 	xorl	%ecx,%r11d
 	addl	%ebx,%eax
@@ -584,7 +584,7 @@
 	xorl	%ebx,%r11d
 	addl	%r11d,%edx
 	movl	24(%rsi),%r10d
-	movl	$4294967295,%r11d
+	movl	$0xffffffff,%r11d
 	roll	$10,%edx
 	xorl	%ebx,%r11d
 	addl	%eax,%edx
@@ -593,7 +593,7 @@
 	xorl	%eax,%r11d
 	addl	%r11d,%ecx
 	movl	52(%rsi),%r10d
-	movl	$4294967295,%r11d
+	movl	$0xffffffff,%r11d
 	roll	$15,%ecx
 	xorl	%eax,%r11d
 	addl	%edx,%ecx
@@ -602,7 +602,7 @@
 	xorl	%edx,%r11d
 	addl	%r11d,%ebx
 	movl	16(%rsi),%r10d
-	movl	$4294967295,%r11d
+	movl	$0xffffffff,%r11d
 	roll	$21,%ebx
 	xorl	%edx,%r11d
 	addl	%ecx,%ebx
@@ -611,7 +611,7 @@
 	xorl	%ecx,%r11d
 	addl	%r11d,%eax
 	movl	44(%rsi),%r10d
-	movl	$4294967295,%r11d
+	movl	$0xffffffff,%r11d
 	roll	$6,%eax
 	xorl	%ecx,%r11d
 	addl	%ebx,%eax
@@ -620,7 +620,7 @@
 	xorl	%ebx,%r11d
 	addl	%r11d,%edx
 	movl	8(%rsi),%r10d
-	movl	$4294967295,%r11d
+	movl	$0xffffffff,%r11d
 	roll	$10,%edx
 	xorl	%ebx,%r11d
 	addl	%eax,%edx
@@ -629,7 +629,7 @@
 	xorl	%eax,%r11d
 	addl	%r11d,%ecx
 	movl	36(%rsi),%r10d
-	movl	$4294967295,%r11d
+	movl	$0xffffffff,%r11d
 	roll	$15,%ecx
 	xorl	%eax,%r11d
 	addl	%edx,%ecx
@@ -638,7 +638,7 @@
 	xorl	%edx,%r11d
 	addl	%r11d,%ebx
 	movl	0(%rsi),%r10d
-	movl	$4294967295,%r11d
+	movl	$0xffffffff,%r11d
 	roll	$21,%ebx
 	xorl	%edx,%r11d
 	addl	%ecx,%ebx
@@ -651,7 +651,7 @@
 
 	addq	$64,%rsi
 	cmpq	%rdi,%rsi
-	jb	.Lloop				
+	jb	.Lloop
 
 
 .Lend:
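
The md5-x86_64.S changes are cosmetic: the $4294967295 immediates are now written as $0xffffffff, and trailing whitespace after the two branches is gone. That all-ones constant is how each round-4 step forms the complement needed by MD5's I function, I(x,y,z) = y ^ (x | ~z). A minimal C sketch of one such step, with comments mapping to the first rewritten step above (the helper names are ours):

#include <stdint.h>

static uint32_t rotl32(uint32_t v, unsigned s)
{
    return (v << s) | (v >> (32 - s));
}

/* One MD5 round-4 step: a = b + rotl(a + I(b,c,d) + x + t, s),
 * with I(x,y,z) = y ^ (x | ~z). */
static uint32_t md5_step4(uint32_t a, uint32_t b, uint32_t c, uint32_t d,
                          uint32_t x, uint32_t t, unsigned s)
{
    uint32_t r = 0xffffffff;   /* movl $0xffffffff,%r11d           */
    r ^= d;                    /* xorl %edx,%r11d      -> ~d       */
    a += x + t;                /* leal t(%rax,%r10,1),%eax         */
    r |= b;                    /* orl  %ebx,%r11d      -> b | ~d   */
    r ^= c;                    /* xorl %ecx,%r11d      -> I(b,c,d) */
    a += r;                    /* addl %r11d,%eax                  */
    a = rotl32(a, s);          /* roll $6,%eax (s = 6/10/15/21)    */
    return a + b;              /* addl %ebx,%eax                   */
}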

Modified: trunk/secure/lib/libcrypto/amd64/rc4-md5-x86_64.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/rc4-md5-x86_64.S	2019-01-20 05:38:02 UTC (rev 12152)
+++ trunk/secure/lib/libcrypto/amd64/rc4-md5-x86_64.S	2019-01-20 05:38:15 UTC (rev 12153)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/amd64/rc4-md5-x86_64.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from rc4-md5-x86_64.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/rc4-md5-x86_64.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from rc4-md5-x86_64.pl. */
 .text	
 .align	16
 

Modified: trunk/secure/lib/libcrypto/amd64/rc4-x86_64.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/rc4-x86_64.S	2019-01-20 05:38:02 UTC (rev 12152)
+++ trunk/secure/lib/libcrypto/amd64/rc4-x86_64.S	2019-01-20 05:38:15 UTC (rev 12153)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/amd64/rc4-x86_64.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from rc4-x86_64.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/rc4-x86_64.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from rc4-x86_64.pl. */
 .text	
 
 
@@ -50,7 +50,7 @@
 	movl	(%rdi,%rax,4),%edx
 	movl	(%rdi,%r10,4),%eax
 	xorb	(%r12),%dl
-	movb	%dl,(%r13,%r12,1)
+	movb	%dl,(%r12,%r13,1)
 	leaq	1(%r12),%r12
 	decq	%rbx
 	jnz	.Loop8_warmup
@@ -129,7 +129,7 @@
 	subq	$8,%r11
 
 	xorq	(%r12),%r8
-	movq	%r8,(%r13,%r12,1)
+	movq	%r8,(%r12,%r13,1)
 	leaq	8(%r12),%r12
 
 	testq	$-8,%r11
@@ -155,7 +155,7 @@
 	movl	(%rdi,%rax,4),%edx
 	movl	(%rdi,%r10,4),%eax
 	xorb	(%r12),%dl
-	movb	%dl,(%r13,%r12,1)
+	movb	%dl,(%r12,%r13,1)
 	leaq	1(%r12),%r12
 	decq	%rbx
 	jnz	.Loop16_warmup
@@ -192,7 +192,7 @@
 	pxor	%xmm1,%xmm2
 	addb	%bl,%cl
 	pinsrw	$0,(%rdi,%rax,4),%xmm0
-	movdqu	%xmm2,(%r13,%r12,1)
+	movdqu	%xmm2,(%r12,%r13,1)
 	leaq	16(%r12),%r12
 .Loop16_enter:
 	movl	(%rdi,%rcx,4),%edx
@@ -328,7 +328,7 @@
 	psllq	$8,%xmm1
 	pxor	%xmm0,%xmm2
 	pxor	%xmm1,%xmm2
-	movdqu	%xmm2,(%r13,%r12,1)
+	movdqu	%xmm2,(%r12,%r13,1)
 	leaq	16(%r12),%r12
 
 	cmpq	$0,%r11
@@ -346,7 +346,7 @@
 	movl	(%rdi,%rax,4),%edx
 	movl	(%rdi,%r10,4),%eax
 	xorb	(%r12),%dl
-	movb	%dl,(%r13,%r12,1)
+	movb	%dl,(%r12,%r13,1)
 	leaq	1(%r12),%r12
 	decq	%r11
 	jnz	.Lloop1
@@ -371,7 +371,7 @@
 	movb	%al,(%rdi,%rcx,1)
 	cmpq	%rsi,%rcx
 	movb	%dl,(%rdi,%r10,1)
-	jne	.Lcmov0			
+	jne	.Lcmov0
 	movq	%rax,%rbx
 .Lcmov0:
 	addb	%al,%dl
@@ -385,7 +385,7 @@
 	movb	%bl,(%rdi,%rcx,1)
 	cmpq	%r10,%rcx
 	movb	%dl,(%rdi,%rsi,1)
-	jne	.Lcmov1			
+	jne	.Lcmov1
 	movq	%rbx,%rax
 .Lcmov1:
 	addb	%bl,%dl
@@ -399,7 +399,7 @@
 	movb	%al,(%rdi,%rcx,1)
 	cmpq	%rsi,%rcx
 	movb	%dl,(%rdi,%r10,1)
-	jne	.Lcmov2			
+	jne	.Lcmov2
 	movq	%rax,%rbx
 .Lcmov2:
 	addb	%al,%dl
@@ -413,7 +413,7 @@
 	movb	%bl,(%rdi,%rcx,1)
 	cmpq	%r10,%rcx
 	movb	%dl,(%rdi,%rsi,1)
-	jne	.Lcmov3			
+	jne	.Lcmov3
 	movq	%rbx,%rax
 .Lcmov3:
 	addb	%bl,%dl
@@ -427,7 +427,7 @@
 	movb	%al,(%rdi,%rcx,1)
 	cmpq	%rsi,%rcx
 	movb	%dl,(%rdi,%r10,1)
-	jne	.Lcmov4			
+	jne	.Lcmov4
 	movq	%rax,%rbx
 .Lcmov4:
 	addb	%al,%dl
@@ -441,7 +441,7 @@
 	movb	%bl,(%rdi,%rcx,1)
 	cmpq	%r10,%rcx
 	movb	%dl,(%rdi,%rsi,1)
-	jne	.Lcmov5			
+	jne	.Lcmov5
 	movq	%rbx,%rax
 .Lcmov5:
 	addb	%bl,%dl
@@ -455,7 +455,7 @@
 	movb	%al,(%rdi,%rcx,1)
 	cmpq	%rsi,%rcx
 	movb	%dl,(%rdi,%r10,1)
-	jne	.Lcmov6			
+	jne	.Lcmov6
 	movq	%rax,%rbx
 .Lcmov6:
 	addb	%al,%dl
@@ -469,7 +469,7 @@
 	movb	%bl,(%rdi,%rcx,1)
 	cmpq	%r10,%rcx
 	movb	%dl,(%rdi,%rsi,1)
-	jne	.Lcmov7			
+	jne	.Lcmov7
 	movq	%rbx,%rax
 .Lcmov7:
 	addb	%bl,%dl
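
In rc4-x86_64.S the regenerated output swaps base and index in the output stores, (%r13,%r12,1) versus (%r12,%r13,1); both address the byte at %r12+%r13, so the stores are unchanged, and the .Lcmov* branches merely lose trailing whitespace. The surrounding byte loops are the standard RC4 PRGA, with the S-box held in 32-bit slots, hence the movl (%rdi,%rcx,4) loads. A self-contained C sketch of one keystream byte (the names are ours):

#include <stdint.h>

/* One RC4 PRGA step: advance the two indices, swap the selected
 * S-box entries, and XOR the derived keystream byte onto the input,
 * as the byte-at-a-time warmup and .Lloop1 paths above do. */
static void rc4_xor_byte(uint8_t S[256], uint8_t *x, uint8_t *y,
                         const uint8_t *in, uint8_t *out)
{
    *x = (uint8_t)(*x + 1);
    uint8_t a = S[*x];
    *y = (uint8_t)(*y + a);
    uint8_t b = S[*y];
    S[*x] = b;
    S[*y] = a;
    *out = *in ^ S[(uint8_t)(a + b)];
}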

Modified: trunk/secure/lib/libcrypto/amd64/sha1-x86_64.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/sha1-x86_64.S	2019-01-20 05:38:02 UTC (rev 12152)
+++ trunk/secure/lib/libcrypto/amd64/sha1-x86_64.S	2019-01-20 05:38:15 UTC (rev 12153)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/amd64/sha1-x86_64.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from sha1-x86_64.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/sha1-x86_64.S 306195 2016-09-22 14:57:48Z jkim $ */
+/* Do not modify. This file is auto-generated from sha1-x86_64.pl. */
 .text	
 
 
@@ -10,8 +10,14 @@
 sha1_block_data_order:
 	movl	OPENSSL_ia32cap_P+0(%rip),%r9d
 	movl	OPENSSL_ia32cap_P+4(%rip),%r8d
+	movl	OPENSSL_ia32cap_P+8(%rip),%r10d
 	testl	$512,%r8d
 	jz	.Lialu
+	testl	$536870912,%r10d
+	jnz	_shaext_shortcut
+	andl	$296,%r10d
+	cmpl	$296,%r10d
+	je	_avx2_shortcut
 	andl	$268435456,%r8d
 	andl	$1073741824,%r9d
 	orl	%r9d,%r8d
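
The hunk above adds two entries to sha1_block_data_order's dispatcher: bit 29 of the third OPENSSL_ia32cap_P dword (CPUID leaf 7 EBX, the SHA extensions flag) selects _shaext_shortcut, and 296 = 0x128 (BMI1|AVX2|BMI2 in the same dword) selects _avx2_shortcut, both behind the existing SSSE3 gate. A hedged C rendering of just that added selection logic; the function and type names are placeholders, and the pre-existing SSSE3/AVX checks are omitted:

#include <stddef.h>
#include <stdint.h>

/* cap[2] mirrors the third OPENSSL_ia32cap_P dword (CPUID leaf 7, EBX). */
#define CAP2_SHA       (1u << 29)                          /* testl $536870912 */
#define CAP2_AVX2_BMI  ((1u << 3) | (1u << 5) | (1u << 8)) /* $296: BMI1|AVX2|BMI2 */

typedef void (*sha1_block_fn)(uint32_t *state, const uint8_t *data, size_t num);

static sha1_block_fn pick_sha1(const uint32_t cap[4],
                               sha1_block_fn shaext,
                               sha1_block_fn avx2,
                               sha1_block_fn fallback)
{
    if (cap[2] & CAP2_SHA)
        return shaext;                           /* jnz _shaext_shortcut */
    if ((cap[2] & CAP2_AVX2_BMI) == CAP2_AVX2_BMI)
        return avx2;                             /* je  _avx2_shortcut   */
    return fallback;                             /* ssse3/avx/ialu paths */
}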
@@ -21,17 +27,18 @@
 
 .align	16
 .Lialu:
+	movq	%rsp,%rax
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
-	movq	%rsp,%r11
+	pushq	%r14
 	movq	%rdi,%r8
 	subq	$72,%rsp
 	movq	%rsi,%r9
 	andq	$-64,%rsp
 	movq	%rdx,%r10
-	movq	%r11,64(%rsp)
+	movq	%rax,64(%rsp)
 .Lprologue:
 
 	movl	0(%r8),%esi
@@ -45,1230 +52,1168 @@
 .Lloop:
 	movl	0(%r9),%edx
 	bswapl	%edx
+	movl	4(%r9),%ebp
+	movl	%r12d,%eax
 	movl	%edx,0(%rsp)
-	movl	%r11d,%eax
-	movl	4(%r9),%ebp
 	movl	%esi,%ecx
-	xorl	%r12d,%eax
 	bswapl	%ebp
+	xorl	%r11d,%eax
 	roll	$5,%ecx
+	andl	%edi,%eax
 	leal	1518500249(%rdx,%r13,1),%r13d
-	andl	%edi,%eax
-	movl	%ebp,4(%rsp)
 	addl	%ecx,%r13d
 	xorl	%r12d,%eax
 	roll	$30,%edi
 	addl	%eax,%r13d
-	movl	%edi,%eax
-	movl	8(%r9),%edx
+	movl	8(%r9),%r14d
+	movl	%r11d,%eax
+	movl	%ebp,4(%rsp)
 	movl	%r13d,%ecx
-	xorl	%r11d,%eax
-	bswapl	%edx
+	bswapl	%r14d
+	xorl	%edi,%eax
 	roll	$5,%ecx
+	andl	%esi,%eax
 	leal	1518500249(%rbp,%r12,1),%r12d
-	andl	%esi,%eax
-	movl	%edx,8(%rsp)
 	addl	%ecx,%r12d
 	xorl	%r11d,%eax
 	roll	$30,%esi
 	addl	%eax,%r12d
-	movl	%esi,%eax
-	movl	12(%r9),%ebp
+	movl	12(%r9),%edx
+	movl	%edi,%eax
+	movl	%r14d,8(%rsp)
 	movl	%r12d,%ecx
-	xorl	%edi,%eax
-	bswapl	%ebp
+	bswapl	%edx
+	xorl	%esi,%eax
 	roll	$5,%ecx
-	leal	1518500249(%rdx,%r11,1),%r11d
 	andl	%r13d,%eax
-	movl	%ebp,12(%rsp)
+	leal	1518500249(%r14,%r11,1),%r11d
 	addl	%ecx,%r11d
 	xorl	%edi,%eax
 	roll	$30,%r13d
 	addl	%eax,%r11d
-	movl	%r13d,%eax
-	movl	16(%r9),%edx
+	movl	16(%r9),%ebp
+	movl	%esi,%eax
+	movl	%edx,12(%rsp)
 	movl	%r11d,%ecx
-	xorl	%esi,%eax
-	bswapl	%edx
+	bswapl	%ebp
+	xorl	%r13d,%eax
 	roll	$5,%ecx
-	leal	1518500249(%rbp,%rdi,1),%edi
 	andl	%r12d,%eax
-	movl	%edx,16(%rsp)
+	leal	1518500249(%rdx,%rdi,1),%edi
 	addl	%ecx,%edi
 	xorl	%esi,%eax
 	roll	$30,%r12d
 	addl	%eax,%edi
-	movl	%r12d,%eax
-	movl	20(%r9),%ebp
+	movl	20(%r9),%r14d
+	movl	%r13d,%eax
+	movl	%ebp,16(%rsp)
 	movl	%edi,%ecx
-	xorl	%r13d,%eax
-	bswapl	%ebp
+	bswapl	%r14d
+	xorl	%r12d,%eax
 	roll	$5,%ecx
-	leal	1518500249(%rdx,%rsi,1),%esi
 	andl	%r11d,%eax
-	movl	%ebp,20(%rsp)
+	leal	1518500249(%rbp,%rsi,1),%esi
 	addl	%ecx,%esi
 	xorl	%r13d,%eax
 	roll	$30,%r11d
 	addl	%eax,%esi
-	movl	%r11d,%eax
 	movl	24(%r9),%edx
+	movl	%r12d,%eax
+	movl	%r14d,20(%rsp)
 	movl	%esi,%ecx
-	xorl	%r12d,%eax
 	bswapl	%edx
+	xorl	%r11d,%eax
 	roll	$5,%ecx
-	leal	1518500249(%rbp,%r13,1),%r13d
 	andl	%edi,%eax
-	movl	%edx,24(%rsp)
+	leal	1518500249(%r14,%r13,1),%r13d
 	addl	%ecx,%r13d
 	xorl	%r12d,%eax
 	roll	$30,%edi
 	addl	%eax,%r13d
-	movl	%edi,%eax
 	movl	28(%r9),%ebp
+	movl	%r11d,%eax
+	movl	%edx,24(%rsp)
 	movl	%r13d,%ecx
-	xorl	%r11d,%eax
 	bswapl	%ebp
+	xorl	%edi,%eax
 	roll	$5,%ecx
+	andl	%esi,%eax
 	leal	1518500249(%rdx,%r12,1),%r12d
-	andl	%esi,%eax
-	movl	%ebp,28(%rsp)
 	addl	%ecx,%r12d
 	xorl	%r11d,%eax
 	roll	$30,%esi
 	addl	%eax,%r12d
-	movl	%esi,%eax
-	movl	32(%r9),%edx
+	movl	32(%r9),%r14d
+	movl	%edi,%eax
+	movl	%ebp,28(%rsp)
 	movl	%r12d,%ecx
-	xorl	%edi,%eax
-	bswapl	%edx
+	bswapl	%r14d
+	xorl	%esi,%eax
 	roll	$5,%ecx
+	andl	%r13d,%eax
 	leal	1518500249(%rbp,%r11,1),%r11d
-	andl	%r13d,%eax
-	movl	%edx,32(%rsp)
 	addl	%ecx,%r11d
 	xorl	%edi,%eax
 	roll	$30,%r13d
 	addl	%eax,%r11d
-	movl	%r13d,%eax
-	movl	36(%r9),%ebp
+	movl	36(%r9),%edx
+	movl	%esi,%eax
+	movl	%r14d,32(%rsp)
 	movl	%r11d,%ecx
-	xorl	%esi,%eax
-	bswapl	%ebp
+	bswapl	%edx
+	xorl	%r13d,%eax
 	roll	$5,%ecx
-	leal	1518500249(%rdx,%rdi,1),%edi
 	andl	%r12d,%eax
-	movl	%ebp,36(%rsp)
+	leal	1518500249(%r14,%rdi,1),%edi
 	addl	%ecx,%edi
 	xorl	%esi,%eax
 	roll	$30,%r12d
 	addl	%eax,%edi
-	movl	%r12d,%eax
-	movl	40(%r9),%edx
+	movl	40(%r9),%ebp
+	movl	%r13d,%eax
+	movl	%edx,36(%rsp)
 	movl	%edi,%ecx
-	xorl	%r13d,%eax
-	bswapl	%edx
+	bswapl	%ebp
+	xorl	%r12d,%eax
 	roll	$5,%ecx
-	leal	1518500249(%rbp,%rsi,1),%esi
 	andl	%r11d,%eax
-	movl	%edx,40(%rsp)
+	leal	1518500249(%rdx,%rsi,1),%esi
 	addl	%ecx,%esi
 	xorl	%r13d,%eax
 	roll	$30,%r11d
 	addl	%eax,%esi
-	movl	%r11d,%eax
-	movl	44(%r9),%ebp
+	movl	44(%r9),%r14d
+	movl	%r12d,%eax
+	movl	%ebp,40(%rsp)
 	movl	%esi,%ecx
-	xorl	%r12d,%eax
-	bswapl	%ebp
+	bswapl	%r14d
+	xorl	%r11d,%eax
 	roll	$5,%ecx
-	leal	1518500249(%rdx,%r13,1),%r13d
 	andl	%edi,%eax
-	movl	%ebp,44(%rsp)
+	leal	1518500249(%rbp,%r13,1),%r13d
 	addl	%ecx,%r13d
 	xorl	%r12d,%eax
 	roll	$30,%edi
 	addl	%eax,%r13d
-	movl	%edi,%eax
 	movl	48(%r9),%edx
+	movl	%r11d,%eax
+	movl	%r14d,44(%rsp)
 	movl	%r13d,%ecx
-	xorl	%r11d,%eax
 	bswapl	%edx
+	xorl	%edi,%eax
 	roll	$5,%ecx
-	leal	1518500249(%rbp,%r12,1),%r12d
 	andl	%esi,%eax
-	movl	%edx,48(%rsp)
+	leal	1518500249(%r14,%r12,1),%r12d
 	addl	%ecx,%r12d
 	xorl	%r11d,%eax
 	roll	$30,%esi
 	addl	%eax,%r12d
-	movl	%esi,%eax
 	movl	52(%r9),%ebp
+	movl	%edi,%eax
+	movl	%edx,48(%rsp)
 	movl	%r12d,%ecx
-	xorl	%edi,%eax
 	bswapl	%ebp
+	xorl	%esi,%eax
 	roll	$5,%ecx
+	andl	%r13d,%eax
 	leal	1518500249(%rdx,%r11,1),%r11d
-	andl	%r13d,%eax
-	movl	%ebp,52(%rsp)
 	addl	%ecx,%r11d
 	xorl	%edi,%eax
 	roll	$30,%r13d
 	addl	%eax,%r11d
-	movl	%r13d,%eax
-	movl	56(%r9),%edx
+	movl	56(%r9),%r14d
+	movl	%esi,%eax
+	movl	%ebp,52(%rsp)
 	movl	%r11d,%ecx
-	xorl	%esi,%eax
-	bswapl	%edx
+	bswapl	%r14d
+	xorl	%r13d,%eax
 	roll	$5,%ecx
+	andl	%r12d,%eax
 	leal	1518500249(%rbp,%rdi,1),%edi
-	andl	%r12d,%eax
-	movl	%edx,56(%rsp)
 	addl	%ecx,%edi
 	xorl	%esi,%eax
 	roll	$30,%r12d
 	addl	%eax,%edi
-	movl	%r12d,%eax
-	movl	60(%r9),%ebp
+	movl	60(%r9),%edx
+	movl	%r13d,%eax
+	movl	%r14d,56(%rsp)
 	movl	%edi,%ecx
-	xorl	%r13d,%eax
-	bswapl	%ebp
+	bswapl	%edx
+	xorl	%r12d,%eax
 	roll	$5,%ecx
-	leal	1518500249(%rdx,%rsi,1),%esi
 	andl	%r11d,%eax
-	movl	%ebp,60(%rsp)
+	leal	1518500249(%r14,%rsi,1),%esi
 	addl	%ecx,%esi
 	xorl	%r13d,%eax
 	roll	$30,%r11d
 	addl	%eax,%esi
-	movl	0(%rsp),%edx
-	movl	%r11d,%eax
+	xorl	0(%rsp),%ebp
+	movl	%r12d,%eax
+	movl	%edx,60(%rsp)
 	movl	%esi,%ecx
-	xorl	8(%rsp),%edx
-	xorl	%r12d,%eax
+	xorl	8(%rsp),%ebp
+	xorl	%r11d,%eax
 	roll	$5,%ecx
-	xorl	32(%rsp),%edx
+	xorl	32(%rsp),%ebp
 	andl	%edi,%eax
-	leal	1518500249(%rbp,%r13,1),%r13d
-	xorl	52(%rsp),%edx
+	leal	1518500249(%rdx,%r13,1),%r13d
+	roll	$30,%edi
 	xorl	%r12d,%eax
-	roll	$1,%edx
 	addl	%ecx,%r13d
-	roll	$30,%edi
-	movl	%edx,0(%rsp)
+	roll	$1,%ebp
 	addl	%eax,%r13d
-	movl	4(%rsp),%ebp
-	movl	%edi,%eax
+	xorl	4(%rsp),%r14d
+	movl	%r11d,%eax
+	movl	%ebp,0(%rsp)
 	movl	%r13d,%ecx
-	xorl	12(%rsp),%ebp
-	xorl	%r11d,%eax
+	xorl	12(%rsp),%r14d
+	xorl	%edi,%eax
 	roll	$5,%ecx
-	xorl	36(%rsp),%ebp
+	xorl	36(%rsp),%r14d
 	andl	%esi,%eax
-	leal	1518500249(%rdx,%r12,1),%r12d
-	xorl	56(%rsp),%ebp
+	leal	1518500249(%rbp,%r12,1),%r12d
+	roll	$30,%esi
 	xorl	%r11d,%eax
-	roll	$1,%ebp
 	addl	%ecx,%r12d
-	roll	$30,%esi
-	movl	%ebp,4(%rsp)
+	roll	$1,%r14d
 	addl	%eax,%r12d
-	movl	8(%rsp),%edx
-	movl	%esi,%eax
+	xorl	8(%rsp),%edx
+	movl	%edi,%eax
+	movl	%r14d,4(%rsp)
 	movl	%r12d,%ecx
 	xorl	16(%rsp),%edx
-	xorl	%edi,%eax
+	xorl	%esi,%eax
 	roll	$5,%ecx
 	xorl	40(%rsp),%edx
 	andl	%r13d,%eax
-	leal	1518500249(%rbp,%r11,1),%r11d
-	xorl	60(%rsp),%edx
+	leal	1518500249(%r14,%r11,1),%r11d
+	roll	$30,%r13d
 	xorl	%edi,%eax
+	addl	%ecx,%r11d
 	roll	$1,%edx
-	addl	%ecx,%r11d
-	roll	$30,%r13d
+	addl	%eax,%r11d
+	xorl	12(%rsp),%ebp
+	movl	%esi,%eax
 	movl	%edx,8(%rsp)
-	addl	%eax,%r11d
-	movl	12(%rsp),%ebp
-	movl	%r13d,%eax
 	movl	%r11d,%ecx
 	xorl	20(%rsp),%ebp
-	xorl	%esi,%eax
+	xorl	%r13d,%eax
 	roll	$5,%ecx
 	xorl	44(%rsp),%ebp
 	andl	%r12d,%eax
 	leal	1518500249(%rdx,%rdi,1),%edi
-	xorl	0(%rsp),%ebp
+	roll	$30,%r12d
 	xorl	%esi,%eax
+	addl	%ecx,%edi
 	roll	$1,%ebp
-	addl	%ecx,%edi
-	roll	$30,%r12d
+	addl	%eax,%edi
+	xorl	16(%rsp),%r14d
+	movl	%r13d,%eax
 	movl	%ebp,12(%rsp)
-	addl	%eax,%edi
-	movl	16(%rsp),%edx
-	movl	%r12d,%eax
 	movl	%edi,%ecx
-	xorl	24(%rsp),%edx
-	xorl	%r13d,%eax
+	xorl	24(%rsp),%r14d
+	xorl	%r12d,%eax
 	roll	$5,%ecx
-	xorl	48(%rsp),%edx
+	xorl	48(%rsp),%r14d
 	andl	%r11d,%eax
 	leal	1518500249(%rbp,%rsi,1),%esi
-	xorl	4(%rsp),%edx
+	roll	$30,%r11d
 	xorl	%r13d,%eax
-	roll	$1,%edx
 	addl	%ecx,%esi
-	roll	$30,%r11d
-	movl	%edx,16(%rsp)
+	roll	$1,%r14d
 	addl	%eax,%esi
-	movl	20(%rsp),%ebp
-	movl	%r11d,%eax
+	xorl	20(%rsp),%edx
+	movl	%edi,%eax
+	movl	%r14d,16(%rsp)
 	movl	%esi,%ecx
-	xorl	28(%rsp),%ebp
-	xorl	%edi,%eax
+	xorl	28(%rsp),%edx
+	xorl	%r12d,%eax
 	roll	$5,%ecx
-	leal	1859775393(%rdx,%r13,1),%r13d
-	xorl	52(%rsp),%ebp
-	xorl	%r12d,%eax
+	xorl	52(%rsp),%edx
+	leal	1859775393(%r14,%r13,1),%r13d
+	xorl	%r11d,%eax
 	addl	%ecx,%r13d
-	xorl	8(%rsp),%ebp
 	roll	$30,%edi
 	addl	%eax,%r13d
-	roll	$1,%ebp
-	movl	%ebp,20(%rsp)
-	movl	24(%rsp),%edx
-	movl	%edi,%eax
+	roll	$1,%edx
+	xorl	24(%rsp),%ebp
+	movl	%esi,%eax
+	movl	%edx,20(%rsp)
 	movl	%r13d,%ecx
-	xorl	32(%rsp),%edx
-	xorl	%esi,%eax
+	xorl	32(%rsp),%ebp
+	xorl	%r11d,%eax
 	roll	$5,%ecx
-	leal	1859775393(%rbp,%r12,1),%r12d
-	xorl	56(%rsp),%edx
-	xorl	%r11d,%eax
+	xorl	56(%rsp),%ebp
+	leal	1859775393(%rdx,%r12,1),%r12d
+	xorl	%edi,%eax
 	addl	%ecx,%r12d
-	xorl	12(%rsp),%edx
 	roll	$30,%esi
 	addl	%eax,%r12d
-	roll	$1,%edx
-	movl	%edx,24(%rsp)
-	movl	28(%rsp),%ebp
-	movl	%esi,%eax
+	roll	$1,%ebp
+	xorl	28(%rsp),%r14d
+	movl	%r13d,%eax
+	movl	%ebp,24(%rsp)
 	movl	%r12d,%ecx
-	xorl	36(%rsp),%ebp
-	xorl	%r13d,%eax
+	xorl	36(%rsp),%r14d
+	xorl	%edi,%eax
 	roll	$5,%ecx
-	leal	1859775393(%rdx,%r11,1),%r11d
-	xorl	60(%rsp),%ebp
-	xorl	%edi,%eax
+	xorl	60(%rsp),%r14d
+	leal	1859775393(%rbp,%r11,1),%r11d
+	xorl	%esi,%eax
 	addl	%ecx,%r11d
-	xorl	16(%rsp),%ebp
 	roll	$30,%r13d
 	addl	%eax,%r11d
-	roll	$1,%ebp
-	movl	%ebp,28(%rsp)
-	movl	32(%rsp),%edx
-	movl	%r13d,%eax
+	roll	$1,%r14d
+	xorl	32(%rsp),%edx
+	movl	%r12d,%eax
+	movl	%r14d,28(%rsp)
 	movl	%r11d,%ecx
 	xorl	40(%rsp),%edx
-	xorl	%r12d,%eax
+	xorl	%esi,%eax
 	roll	$5,%ecx
-	leal	1859775393(%rbp,%rdi,1),%edi
 	xorl	0(%rsp),%edx
-	xorl	%esi,%eax
+	leal	1859775393(%r14,%rdi,1),%edi
+	xorl	%r13d,%eax
 	addl	%ecx,%edi
-	xorl	20(%rsp),%edx
 	roll	$30,%r12d
 	addl	%eax,%edi
 	roll	$1,%edx
+	xorl	36(%rsp),%ebp
+	movl	%r11d,%eax
 	movl	%edx,32(%rsp)
-	movl	36(%rsp),%ebp
-	movl	%r12d,%eax
 	movl	%edi,%ecx
 	xorl	44(%rsp),%ebp
-	xorl	%r11d,%eax
+	xorl	%r13d,%eax
 	roll	$5,%ecx
+	xorl	4(%rsp),%ebp
 	leal	1859775393(%rdx,%rsi,1),%esi
-	xorl	4(%rsp),%ebp
-	xorl	%r13d,%eax
+	xorl	%r12d,%eax
 	addl	%ecx,%esi
-	xorl	24(%rsp),%ebp
 	roll	$30,%r11d
 	addl	%eax,%esi
 	roll	$1,%ebp
+	xorl	40(%rsp),%r14d
+	movl	%edi,%eax
 	movl	%ebp,36(%rsp)
-	movl	40(%rsp),%edx
-	movl	%r11d,%eax
 	movl	%esi,%ecx
-	xorl	48(%rsp),%edx
-	xorl	%edi,%eax
+	xorl	48(%rsp),%r14d
+	xorl	%r12d,%eax
 	roll	$5,%ecx
+	xorl	8(%rsp),%r14d
 	leal	1859775393(%rbp,%r13,1),%r13d
-	xorl	8(%rsp),%edx
-	xorl	%r12d,%eax
+	xorl	%r11d,%eax
 	addl	%ecx,%r13d
-	xorl	28(%rsp),%edx
 	roll	$30,%edi
 	addl	%eax,%r13d
-	roll	$1,%edx
-	movl	%edx,40(%rsp)
-	movl	44(%rsp),%ebp
-	movl	%edi,%eax
+	roll	$1,%r14d
+	xorl	44(%rsp),%edx
+	movl	%esi,%eax
+	movl	%r14d,40(%rsp)
 	movl	%r13d,%ecx
-	xorl	52(%rsp),%ebp
-	xorl	%esi,%eax
+	xorl	52(%rsp),%edx
+	xorl	%r11d,%eax
 	roll	$5,%ecx
-	leal	1859775393(%rdx,%r12,1),%r12d
-	xorl	12(%rsp),%ebp
-	xorl	%r11d,%eax
+	xorl	12(%rsp),%edx
+	leal	1859775393(%r14,%r12,1),%r12d
+	xorl	%edi,%eax
 	addl	%ecx,%r12d
-	xorl	32(%rsp),%ebp
 	roll	$30,%esi
 	addl	%eax,%r12d
-	roll	$1,%ebp
-	movl	%ebp,44(%rsp)
-	movl	48(%rsp),%edx
-	movl	%esi,%eax
+	roll	$1,%edx
+	xorl	48(%rsp),%ebp
+	movl	%r13d,%eax
+	movl	%edx,44(%rsp)
 	movl	%r12d,%ecx
-	xorl	56(%rsp),%edx
-	xorl	%r13d,%eax
+	xorl	56(%rsp),%ebp
+	xorl	%edi,%eax
 	roll	$5,%ecx
-	leal	1859775393(%rbp,%r11,1),%r11d
-	xorl	16(%rsp),%edx
-	xorl	%edi,%eax
+	xorl	16(%rsp),%ebp
+	leal	1859775393(%rdx,%r11,1),%r11d
+	xorl	%esi,%eax
 	addl	%ecx,%r11d
-	xorl	36(%rsp),%edx
 	roll	$30,%r13d
 	addl	%eax,%r11d
-	roll	$1,%edx
-	movl	%edx,48(%rsp)
-	movl	52(%rsp),%ebp
-	movl	%r13d,%eax
+	roll	$1,%ebp
+	xorl	52(%rsp),%r14d
+	movl	%r12d,%eax
+	movl	%ebp,48(%rsp)
 	movl	%r11d,%ecx
-	xorl	60(%rsp),%ebp
-	xorl	%r12d,%eax
+	xorl	60(%rsp),%r14d
+	xorl	%esi,%eax
 	roll	$5,%ecx
-	leal	1859775393(%rdx,%rdi,1),%edi
-	xorl	20(%rsp),%ebp
-	xorl	%esi,%eax
+	xorl	20(%rsp),%r14d
+	leal	1859775393(%rbp,%rdi,1),%edi
+	xorl	%r13d,%eax
 	addl	%ecx,%edi
-	xorl	40(%rsp),%ebp
 	roll	$30,%r12d
 	addl	%eax,%edi
-	roll	$1,%ebp
-	movl	%ebp,52(%rsp)
-	movl	56(%rsp),%edx
-	movl	%r12d,%eax
+	roll	$1,%r14d
+	xorl	56(%rsp),%edx
+	movl	%r11d,%eax
+	movl	%r14d,52(%rsp)
 	movl	%edi,%ecx
 	xorl	0(%rsp),%edx
-	xorl	%r11d,%eax
+	xorl	%r13d,%eax
 	roll	$5,%ecx
-	leal	1859775393(%rbp,%rsi,1),%esi
 	xorl	24(%rsp),%edx
-	xorl	%r13d,%eax
+	leal	1859775393(%r14,%rsi,1),%esi
+	xorl	%r12d,%eax
 	addl	%ecx,%esi
-	xorl	44(%rsp),%edx
 	roll	$30,%r11d
 	addl	%eax,%esi
 	roll	$1,%edx
+	xorl	60(%rsp),%ebp
+	movl	%edi,%eax
 	movl	%edx,56(%rsp)
-	movl	60(%rsp),%ebp
-	movl	%r11d,%eax
 	movl	%esi,%ecx
 	xorl	4(%rsp),%ebp
-	xorl	%edi,%eax
+	xorl	%r12d,%eax
 	roll	$5,%ecx
+	xorl	28(%rsp),%ebp
 	leal	1859775393(%rdx,%r13,1),%r13d
-	xorl	28(%rsp),%ebp
-	xorl	%r12d,%eax
+	xorl	%r11d,%eax
 	addl	%ecx,%r13d
-	xorl	48(%rsp),%ebp
 	roll	$30,%edi
 	addl	%eax,%r13d
 	roll	$1,%ebp
+	xorl	0(%rsp),%r14d
+	movl	%esi,%eax
 	movl	%ebp,60(%rsp)
-	movl	0(%rsp),%edx
-	movl	%edi,%eax
 	movl	%r13d,%ecx
-	xorl	8(%rsp),%edx
-	xorl	%esi,%eax
+	xorl	8(%rsp),%r14d
+	xorl	%r11d,%eax
 	roll	$5,%ecx
+	xorl	32(%rsp),%r14d
 	leal	1859775393(%rbp,%r12,1),%r12d
-	xorl	32(%rsp),%edx
-	xorl	%r11d,%eax
+	xorl	%edi,%eax
 	addl	%ecx,%r12d
-	xorl	52(%rsp),%edx
 	roll	$30,%esi
 	addl	%eax,%r12d
-	roll	$1,%edx
-	movl	%edx,0(%rsp)
-	movl	4(%rsp),%ebp
-	movl	%esi,%eax
+	roll	$1,%r14d
+	xorl	4(%rsp),%edx
+	movl	%r13d,%eax
+	movl	%r14d,0(%rsp)
 	movl	%r12d,%ecx
-	xorl	12(%rsp),%ebp
-	xorl	%r13d,%eax
+	xorl	12(%rsp),%edx
+	xorl	%edi,%eax
 	roll	$5,%ecx
-	leal	1859775393(%rdx,%r11,1),%r11d
-	xorl	36(%rsp),%ebp
-	xorl	%edi,%eax
+	xorl	36(%rsp),%edx
+	leal	1859775393(%r14,%r11,1),%r11d
+	xorl	%esi,%eax
 	addl	%ecx,%r11d
-	xorl	56(%rsp),%ebp
 	roll	$30,%r13d
 	addl	%eax,%r11d
-	roll	$1,%ebp
-	movl	%ebp,4(%rsp)
-	movl	8(%rsp),%edx
-	movl	%r13d,%eax
+	roll	$1,%edx
+	xorl	8(%rsp),%ebp
+	movl	%r12d,%eax
+	movl	%edx,4(%rsp)
 	movl	%r11d,%ecx
-	xorl	16(%rsp),%edx
-	xorl	%r12d,%eax
+	xorl	16(%rsp),%ebp
+	xorl	%esi,%eax
 	roll	$5,%ecx
-	leal	1859775393(%rbp,%rdi,1),%edi
-	xorl	40(%rsp),%edx
-	xorl	%esi,%eax
+	xorl	40(%rsp),%ebp
+	leal	1859775393(%rdx,%rdi,1),%edi
+	xorl	%r13d,%eax
 	addl	%ecx,%edi
-	xorl	60(%rsp),%edx
 	roll	$30,%r12d
 	addl	%eax,%edi
-	roll	$1,%edx
-	movl	%edx,8(%rsp)
-	movl	12(%rsp),%ebp
-	movl	%r12d,%eax
+	roll	$1,%ebp
+	xorl	12(%rsp),%r14d
+	movl	%r11d,%eax
+	movl	%ebp,8(%rsp)
 	movl	%edi,%ecx
-	xorl	20(%rsp),%ebp
-	xorl	%r11d,%eax
+	xorl	20(%rsp),%r14d
+	xorl	%r13d,%eax
 	roll	$5,%ecx
-	leal	1859775393(%rdx,%rsi,1),%esi
-	xorl	44(%rsp),%ebp
-	xorl	%r13d,%eax
+	xorl	44(%rsp),%r14d
+	leal	1859775393(%rbp,%rsi,1),%esi
+	xorl	%r12d,%eax
 	addl	%ecx,%esi
-	xorl	0(%rsp),%ebp
 	roll	$30,%r11d
 	addl	%eax,%esi
-	roll	$1,%ebp
-	movl	%ebp,12(%rsp)
-	movl	16(%rsp),%edx
-	movl	%r11d,%eax
+	roll	$1,%r14d
+	xorl	16(%rsp),%edx
+	movl	%edi,%eax
+	movl	%r14d,12(%rsp)
 	movl	%esi,%ecx
 	xorl	24(%rsp),%edx
-	xorl	%edi,%eax
+	xorl	%r12d,%eax
 	roll	$5,%ecx
-	leal	1859775393(%rbp,%r13,1),%r13d
 	xorl	48(%rsp),%edx
-	xorl	%r12d,%eax
+	leal	1859775393(%r14,%r13,1),%r13d
+	xorl	%r11d,%eax
 	addl	%ecx,%r13d
-	xorl	4(%rsp),%edx
 	roll	$30,%edi
 	addl	%eax,%r13d
 	roll	$1,%edx
+	xorl	20(%rsp),%ebp
+	movl	%esi,%eax
 	movl	%edx,16(%rsp)
-	movl	20(%rsp),%ebp
-	movl	%edi,%eax
 	movl	%r13d,%ecx
 	xorl	28(%rsp),%ebp
-	xorl	%esi,%eax
+	xorl	%r11d,%eax
 	roll	$5,%ecx
+	xorl	52(%rsp),%ebp
 	leal	1859775393(%rdx,%r12,1),%r12d
-	xorl	52(%rsp),%ebp
-	xorl	%r11d,%eax
+	xorl	%edi,%eax
 	addl	%ecx,%r12d
-	xorl	8(%rsp),%ebp
 	roll	$30,%esi
 	addl	%eax,%r12d
 	roll	$1,%ebp
+	xorl	24(%rsp),%r14d
+	movl	%r13d,%eax
 	movl	%ebp,20(%rsp)
-	movl	24(%rsp),%edx
-	movl	%esi,%eax
 	movl	%r12d,%ecx
-	xorl	32(%rsp),%edx
-	xorl	%r13d,%eax
+	xorl	32(%rsp),%r14d
+	xorl	%edi,%eax
 	roll	$5,%ecx
+	xorl	56(%rsp),%r14d
 	leal	1859775393(%rbp,%r11,1),%r11d
-	xorl	56(%rsp),%edx
-	xorl	%edi,%eax
+	xorl	%esi,%eax
 	addl	%ecx,%r11d
-	xorl	12(%rsp),%edx
 	roll	$30,%r13d
 	addl	%eax,%r11d
-	roll	$1,%edx
-	movl	%edx,24(%rsp)
-	movl	28(%rsp),%ebp
-	movl	%r13d,%eax
+	roll	$1,%r14d
+	xorl	28(%rsp),%edx
+	movl	%r12d,%eax
+	movl	%r14d,24(%rsp)
 	movl	%r11d,%ecx
-	xorl	36(%rsp),%ebp
-	xorl	%r12d,%eax
+	xorl	36(%rsp),%edx
+	xorl	%esi,%eax
 	roll	$5,%ecx
-	leal	1859775393(%rdx,%rdi,1),%edi
-	xorl	60(%rsp),%ebp
-	xorl	%esi,%eax
+	xorl	60(%rsp),%edx
+	leal	1859775393(%r14,%rdi,1),%edi
+	xorl	%r13d,%eax
 	addl	%ecx,%edi
-	xorl	16(%rsp),%ebp
 	roll	$30,%r12d
 	addl	%eax,%edi
-	roll	$1,%ebp
-	movl	%ebp,28(%rsp)
-	movl	32(%rsp),%edx
-	movl	%r12d,%eax
+	roll	$1,%edx
+	xorl	32(%rsp),%ebp
+	movl	%r11d,%eax
+	movl	%edx,28(%rsp)
 	movl	%edi,%ecx
-	xorl	40(%rsp),%edx
-	xorl	%r11d,%eax
+	xorl	40(%rsp),%ebp
+	xorl	%r13d,%eax
 	roll	$5,%ecx
-	leal	1859775393(%rbp,%rsi,1),%esi
-	xorl	0(%rsp),%edx
-	xorl	%r13d,%eax
+	xorl	0(%rsp),%ebp
+	leal	1859775393(%rdx,%rsi,1),%esi
+	xorl	%r12d,%eax
 	addl	%ecx,%esi
-	xorl	20(%rsp),%edx
 	roll	$30,%r11d
 	addl	%eax,%esi
-	roll	$1,%edx
-	movl	%edx,32(%rsp)
-	movl	36(%rsp),%ebp
-	movl	%r11d,%eax
-	movl	%r11d,%ebx
-	xorl	44(%rsp),%ebp
-	andl	%r12d,%eax
+	roll	$1,%ebp
+	xorl	36(%rsp),%r14d
+	movl	%r12d,%eax
+	movl	%ebp,32(%rsp)
+	movl	%r12d,%ebx
+	xorl	44(%rsp),%r14d
+	andl	%r11d,%eax
 	movl	%esi,%ecx
-	xorl	4(%rsp),%ebp
-	xorl	%r12d,%ebx
-	leal	-1894007588(%rdx,%r13,1),%r13d
+	xorl	4(%rsp),%r14d
+	leal	-1894007588(%rbp,%r13,1),%r13d
+	xorl	%r11d,%ebx
 	roll	$5,%ecx
-	xorl	24(%rsp),%ebp
 	addl	%eax,%r13d
+	roll	$1,%r14d
 	andl	%edi,%ebx
-	roll	$1,%ebp
+	addl	%ecx,%r13d
+	roll	$30,%edi
 	addl	%ebx,%r13d
-	roll	$30,%edi
-	movl	%ebp,36(%rsp)
-	addl	%ecx,%r13d
-	movl	40(%rsp),%edx
-	movl	%edi,%eax
-	movl	%edi,%ebx
+	xorl	40(%rsp),%edx
+	movl	%r11d,%eax
+	movl	%r14d,36(%rsp)
+	movl	%r11d,%ebx
 	xorl	48(%rsp),%edx
-	andl	%r11d,%eax
+	andl	%edi,%eax
 	movl	%r13d,%ecx
 	xorl	8(%rsp),%edx
-	xorl	%r11d,%ebx
-	leal	-1894007588(%rbp,%r12,1),%r12d
+	leal	-1894007588(%r14,%r12,1),%r12d
+	xorl	%edi,%ebx
 	roll	$5,%ecx
-	xorl	28(%rsp),%edx
 	addl	%eax,%r12d
+	roll	$1,%edx
 	andl	%esi,%ebx
-	roll	$1,%edx
+	addl	%ecx,%r12d
+	roll	$30,%esi
 	addl	%ebx,%r12d
-	roll	$30,%esi
+	xorl	44(%rsp),%ebp
+	movl	%edi,%eax
 	movl	%edx,40(%rsp)
-	addl	%ecx,%r12d
-	movl	44(%rsp),%ebp
-	movl	%esi,%eax
-	movl	%esi,%ebx
+	movl	%edi,%ebx
 	xorl	52(%rsp),%ebp
-	andl	%edi,%eax
+	andl	%esi,%eax
 	movl	%r12d,%ecx
 	xorl	12(%rsp),%ebp
-	xorl	%edi,%ebx
 	leal	-1894007588(%rdx,%r11,1),%r11d
+	xorl	%esi,%ebx
 	roll	$5,%ecx
-	xorl	32(%rsp),%ebp
 	addl	%eax,%r11d
+	roll	$1,%ebp
 	andl	%r13d,%ebx
-	roll	$1,%ebp
+	addl	%ecx,%r11d
+	roll	$30,%r13d
 	addl	%ebx,%r11d
-	roll	$30,%r13d
+	xorl	48(%rsp),%r14d
+	movl	%esi,%eax
 	movl	%ebp,44(%rsp)
-	addl	%ecx,%r11d
-	movl	48(%rsp),%edx
-	movl	%r13d,%eax
-	movl	%r13d,%ebx
-	xorl	56(%rsp),%edx
-	andl	%esi,%eax
+	movl	%esi,%ebx
+	xorl	56(%rsp),%r14d
+	andl	%r13d,%eax
 	movl	%r11d,%ecx
-	xorl	16(%rsp),%edx
-	xorl	%esi,%ebx
+	xorl	16(%rsp),%r14d
 	leal	-1894007588(%rbp,%rdi,1),%edi
+	xorl	%r13d,%ebx
 	roll	$5,%ecx
-	xorl	36(%rsp),%edx
 	addl	%eax,%edi
+	roll	$1,%r14d
 	andl	%r12d,%ebx
-	roll	$1,%edx
+	addl	%ecx,%edi
+	roll	$30,%r12d
 	addl	%ebx,%edi
-	roll	$30,%r12d
-	movl	%edx,48(%rsp)
-	addl	%ecx,%edi
-	movl	52(%rsp),%ebp
-	movl	%r12d,%eax
-	movl	%r12d,%ebx
-	xorl	60(%rsp),%ebp
-	andl	%r13d,%eax
+	xorl	52(%rsp),%edx
+	movl	%r13d,%eax
+	movl	%r14d,48(%rsp)
+	movl	%r13d,%ebx
+	xorl	60(%rsp),%edx
+	andl	%r12d,%eax
 	movl	%edi,%ecx
-	xorl	20(%rsp),%ebp
-	xorl	%r13d,%ebx
-	leal	-1894007588(%rdx,%rsi,1),%esi
+	xorl	20(%rsp),%edx
+	leal	-1894007588(%r14,%rsi,1),%esi
+	xorl	%r12d,%ebx
 	roll	$5,%ecx
-	xorl	40(%rsp),%ebp
 	addl	%eax,%esi
+	roll	$1,%edx
 	andl	%r11d,%ebx
-	roll	$1,%ebp
+	addl	%ecx,%esi
+	roll	$30,%r11d
 	addl	%ebx,%esi
-	roll	$30,%r11d
-	movl	%ebp,52(%rsp)
-	addl	%ecx,%esi
-	movl	56(%rsp),%edx
-	movl	%r11d,%eax
-	movl	%r11d,%ebx
-	xorl	0(%rsp),%edx
-	andl	%r12d,%eax
+	xorl	56(%rsp),%ebp
+	movl	%r12d,%eax
+	movl	%edx,52(%rsp)
+	movl	%r12d,%ebx
+	xorl	0(%rsp),%ebp
+	andl	%r11d,%eax
 	movl	%esi,%ecx
-	xorl	24(%rsp),%edx
-	xorl	%r12d,%ebx
-	leal	-1894007588(%rbp,%r13,1),%r13d
+	xorl	24(%rsp),%ebp
+	leal	-1894007588(%rdx,%r13,1),%r13d
+	xorl	%r11d,%ebx
 	roll	$5,%ecx
-	xorl	44(%rsp),%edx
 	addl	%eax,%r13d
+	roll	$1,%ebp
 	andl	%edi,%ebx
-	roll	$1,%edx
+	addl	%ecx,%r13d
+	roll	$30,%edi
 	addl	%ebx,%r13d
-	roll	$30,%edi
-	movl	%edx,56(%rsp)
-	addl	%ecx,%r13d
-	movl	60(%rsp),%ebp
-	movl	%edi,%eax
-	movl	%edi,%ebx
-	xorl	4(%rsp),%ebp
-	andl	%r11d,%eax
+	xorl	60(%rsp),%r14d
+	movl	%r11d,%eax
+	movl	%ebp,56(%rsp)
+	movl	%r11d,%ebx
+	xorl	4(%rsp),%r14d
+	andl	%edi,%eax
 	movl	%r13d,%ecx
-	xorl	28(%rsp),%ebp
-	xorl	%r11d,%ebx
-	leal	-1894007588(%rdx,%r12,1),%r12d
+	xorl	28(%rsp),%r14d
+	leal	-1894007588(%rbp,%r12,1),%r12d
+	xorl	%edi,%ebx
 	roll	$5,%ecx
-	xorl	48(%rsp),%ebp
 	addl	%eax,%r12d
+	roll	$1,%r14d
 	andl	%esi,%ebx
-	roll	$1,%ebp
+	addl	%ecx,%r12d
+	roll	$30,%esi
 	addl	%ebx,%r12d
-	roll	$30,%esi
-	movl	%ebp,60(%rsp)
-	addl	%ecx,%r12d
-	movl	0(%rsp),%edx
-	movl	%esi,%eax
-	movl	%esi,%ebx
+	xorl	0(%rsp),%edx
+	movl	%edi,%eax
+	movl	%r14d,60(%rsp)
+	movl	%edi,%ebx
 	xorl	8(%rsp),%edx
-	andl	%edi,%eax
+	andl	%esi,%eax
 	movl	%r12d,%ecx
 	xorl	32(%rsp),%edx
-	xorl	%edi,%ebx
-	leal	-1894007588(%rbp,%r11,1),%r11d
+	leal	-1894007588(%r14,%r11,1),%r11d
+	xorl	%esi,%ebx
 	roll	$5,%ecx
-	xorl	52(%rsp),%edx
 	addl	%eax,%r11d
+	roll	$1,%edx
 	andl	%r13d,%ebx
-	roll	$1,%edx
+	addl	%ecx,%r11d
+	roll	$30,%r13d
 	addl	%ebx,%r11d
-	roll	$30,%r13d
+	xorl	4(%rsp),%ebp
+	movl	%esi,%eax
 	movl	%edx,0(%rsp)
-	addl	%ecx,%r11d
-	movl	4(%rsp),%ebp
-	movl	%r13d,%eax
-	movl	%r13d,%ebx
+	movl	%esi,%ebx
 	xorl	12(%rsp),%ebp
-	andl	%esi,%eax
+	andl	%r13d,%eax
 	movl	%r11d,%ecx
 	xorl	36(%rsp),%ebp
-	xorl	%esi,%ebx
 	leal	-1894007588(%rdx,%rdi,1),%edi
+	xorl	%r13d,%ebx
 	roll	$5,%ecx
-	xorl	56(%rsp),%ebp
 	addl	%eax,%edi
+	roll	$1,%ebp
 	andl	%r12d,%ebx
-	roll	$1,%ebp
+	addl	%ecx,%edi
+	roll	$30,%r12d
 	addl	%ebx,%edi
-	roll	$30,%r12d
+	xorl	8(%rsp),%r14d
+	movl	%r13d,%eax
 	movl	%ebp,4(%rsp)
-	addl	%ecx,%edi
-	movl	8(%rsp),%edx
-	movl	%r12d,%eax
-	movl	%r12d,%ebx
-	xorl	16(%rsp),%edx
-	andl	%r13d,%eax
+	movl	%r13d,%ebx
+	xorl	16(%rsp),%r14d
+	andl	%r12d,%eax
 	movl	%edi,%ecx
-	xorl	40(%rsp),%edx
-	xorl	%r13d,%ebx
+	xorl	40(%rsp),%r14d
 	leal	-1894007588(%rbp,%rsi,1),%esi
+	xorl	%r12d,%ebx
 	roll	$5,%ecx
-	xorl	60(%rsp),%edx
 	addl	%eax,%esi
+	roll	$1,%r14d
 	andl	%r11d,%ebx
-	roll	$1,%edx
+	addl	%ecx,%esi
+	roll	$30,%r11d
 	addl	%ebx,%esi
-	roll	$30,%r11d
-	movl	%edx,8(%rsp)
-	addl	%ecx,%esi
-	movl	12(%rsp),%ebp
-	movl	%r11d,%eax
-	movl	%r11d,%ebx
-	xorl	20(%rsp),%ebp
-	andl	%r12d,%eax
+	xorl	12(%rsp),%edx
+	movl	%r12d,%eax
+	movl	%r14d,8(%rsp)
+	movl	%r12d,%ebx
+	xorl	20(%rsp),%edx
+	andl	%r11d,%eax
 	movl	%esi,%ecx
-	xorl	44(%rsp),%ebp
-	xorl	%r12d,%ebx
-	leal	-1894007588(%rdx,%r13,1),%r13d
+	xorl	44(%rsp),%edx
+	leal	-1894007588(%r14,%r13,1),%r13d
+	xorl	%r11d,%ebx
 	roll	$5,%ecx
-	xorl	0(%rsp),%ebp
 	addl	%eax,%r13d
+	roll	$1,%edx
 	andl	%edi,%ebx
-	roll	$1,%ebp
+	addl	%ecx,%r13d
+	roll	$30,%edi
 	addl	%ebx,%r13d
-	roll	$30,%edi
-	movl	%ebp,12(%rsp)
-	addl	%ecx,%r13d
-	movl	16(%rsp),%edx
-	movl	%edi,%eax
-	movl	%edi,%ebx
-	xorl	24(%rsp),%edx
-	andl	%r11d,%eax
+	xorl	16(%rsp),%ebp
+	movl	%r11d,%eax
+	movl	%edx,12(%rsp)
+	movl	%r11d,%ebx
+	xorl	24(%rsp),%ebp
+	andl	%edi,%eax
 	movl	%r13d,%ecx
-	xorl	48(%rsp),%edx
-	xorl	%r11d,%ebx
-	leal	-1894007588(%rbp,%r12,1),%r12d
+	xorl	48(%rsp),%ebp
+	leal	-1894007588(%rdx,%r12,1),%r12d
+	xorl	%edi,%ebx
 	roll	$5,%ecx
-	xorl	4(%rsp),%edx
 	addl	%eax,%r12d
+	roll	$1,%ebp
 	andl	%esi,%ebx
-	roll	$1,%edx
+	addl	%ecx,%r12d
+	roll	$30,%esi
 	addl	%ebx,%r12d
-	roll	$30,%esi
-	movl	%edx,16(%rsp)
-	addl	%ecx,%r12d
-	movl	20(%rsp),%ebp
-	movl	%esi,%eax
-	movl	%esi,%ebx
-	xorl	28(%rsp),%ebp
-	andl	%edi,%eax
+	xorl	20(%rsp),%r14d
+	movl	%edi,%eax
+	movl	%ebp,16(%rsp)
+	movl	%edi,%ebx
+	xorl	28(%rsp),%r14d
+	andl	%esi,%eax
 	movl	%r12d,%ecx
-	xorl	52(%rsp),%ebp
-	xorl	%edi,%ebx
-	leal	-1894007588(%rdx,%r11,1),%r11d
+	xorl	52(%rsp),%r14d
+	leal	-1894007588(%rbp,%r11,1),%r11d
+	xorl	%esi,%ebx
 	roll	$5,%ecx
-	xorl	8(%rsp),%ebp
 	addl	%eax,%r11d
+	roll	$1,%r14d
 	andl	%r13d,%ebx
-	roll	$1,%ebp
+	addl	%ecx,%r11d
+	roll	$30,%r13d
 	addl	%ebx,%r11d
-	roll	$30,%r13d
-	movl	%ebp,20(%rsp)
-	addl	%ecx,%r11d
-	movl	24(%rsp),%edx
-	movl	%r13d,%eax
-	movl	%r13d,%ebx
+	xorl	24(%rsp),%edx
+	movl	%esi,%eax
+	movl	%r14d,20(%rsp)
+	movl	%esi,%ebx
 	xorl	32(%rsp),%edx
-	andl	%esi,%eax
+	andl	%r13d,%eax
 	movl	%r11d,%ecx
 	xorl	56(%rsp),%edx
-	xorl	%esi,%ebx
-	leal	-1894007588(%rbp,%rdi,1),%edi
+	leal	-1894007588(%r14,%rdi,1),%edi
+	xorl	%r13d,%ebx
 	roll	$5,%ecx
-	xorl	12(%rsp),%edx
 	addl	%eax,%edi
+	roll	$1,%edx
 	andl	%r12d,%ebx
-	roll	$1,%edx
+	addl	%ecx,%edi
+	roll	$30,%r12d
 	addl	%ebx,%edi
-	roll	$30,%r12d
+	xorl	28(%rsp),%ebp
+	movl	%r13d,%eax
 	movl	%edx,24(%rsp)
-	addl	%ecx,%edi
-	movl	28(%rsp),%ebp
-	movl	%r12d,%eax
-	movl	%r12d,%ebx
+	movl	%r13d,%ebx
 	xorl	36(%rsp),%ebp
-	andl	%r13d,%eax
+	andl	%r12d,%eax
 	movl	%edi,%ecx
 	xorl	60(%rsp),%ebp
-	xorl	%r13d,%ebx
 	leal	-1894007588(%rdx,%rsi,1),%esi
+	xorl	%r12d,%ebx
 	roll	$5,%ecx
-	xorl	16(%rsp),%ebp
 	addl	%eax,%esi
+	roll	$1,%ebp
 	andl	%r11d,%ebx
-	roll	$1,%ebp
+	addl	%ecx,%esi
+	roll	$30,%r11d
 	addl	%ebx,%esi
-	roll	$30,%r11d
+	xorl	32(%rsp),%r14d
+	movl	%r12d,%eax
 	movl	%ebp,28(%rsp)
-	addl	%ecx,%esi
-	movl	32(%rsp),%edx
-	movl	%r11d,%eax
-	movl	%r11d,%ebx
-	xorl	40(%rsp),%edx
-	andl	%r12d,%eax
+	movl	%r12d,%ebx
+	xorl	40(%rsp),%r14d
+	andl	%r11d,%eax
 	movl	%esi,%ecx
-	xorl	0(%rsp),%edx
-	xorl	%r12d,%ebx
+	xorl	0(%rsp),%r14d
 	leal	-1894007588(%rbp,%r13,1),%r13d
+	xorl	%r11d,%ebx
 	roll	$5,%ecx
-	xorl	20(%rsp),%edx
 	addl	%eax,%r13d
+	roll	$1,%r14d
 	andl	%edi,%ebx
-	roll	$1,%edx
+	addl	%ecx,%r13d
+	roll	$30,%edi
 	addl	%ebx,%r13d
-	roll	$30,%edi
-	movl	%edx,32(%rsp)
-	addl	%ecx,%r13d
-	movl	36(%rsp),%ebp
-	movl	%edi,%eax
-	movl	%edi,%ebx
-	xorl	44(%rsp),%ebp
-	andl	%r11d,%eax
+	xorl	36(%rsp),%edx
+	movl	%r11d,%eax
+	movl	%r14d,32(%rsp)
+	movl	%r11d,%ebx
+	xorl	44(%rsp),%edx
+	andl	%edi,%eax
 	movl	%r13d,%ecx
-	xorl	4(%rsp),%ebp
-	xorl	%r11d,%ebx
-	leal	-1894007588(%rdx,%r12,1),%r12d
+	xorl	4(%rsp),%edx
+	leal	-1894007588(%r14,%r12,1),%r12d
+	xorl	%edi,%ebx
 	roll	$5,%ecx
-	xorl	24(%rsp),%ebp
 	addl	%eax,%r12d
+	roll	$1,%edx
 	andl	%esi,%ebx
-	roll	$1,%ebp
+	addl	%ecx,%r12d
+	roll	$30,%esi
 	addl	%ebx,%r12d
-	roll	$30,%esi
-	movl	%ebp,36(%rsp)
-	addl	%ecx,%r12d
-	movl	40(%rsp),%edx
-	movl	%esi,%eax
-	movl	%esi,%ebx
-	xorl	48(%rsp),%edx
-	andl	%edi,%eax
+	xorl	40(%rsp),%ebp
+	movl	%edi,%eax
+	movl	%edx,36(%rsp)
+	movl	%edi,%ebx
+	xorl	48(%rsp),%ebp
+	andl	%esi,%eax
 	movl	%r12d,%ecx
-	xorl	8(%rsp),%edx
-	xorl	%edi,%ebx
-	leal	-1894007588(%rbp,%r11,1),%r11d
+	xorl	8(%rsp),%ebp
+	leal	-1894007588(%rdx,%r11,1),%r11d
+	xorl	%esi,%ebx
 	roll	$5,%ecx
-	xorl	28(%rsp),%edx
 	addl	%eax,%r11d
+	roll	$1,%ebp
 	andl	%r13d,%ebx
-	roll	$1,%edx
+	addl	%ecx,%r11d
+	roll	$30,%r13d
 	addl	%ebx,%r11d
-	roll	$30,%r13d
-	movl	%edx,40(%rsp)
-	addl	%ecx,%r11d
-	movl	44(%rsp),%ebp
-	movl	%r13d,%eax
-	movl	%r13d,%ebx
-	xorl	52(%rsp),%ebp
-	andl	%esi,%eax
+	xorl	44(%rsp),%r14d
+	movl	%esi,%eax
+	movl	%ebp,40(%rsp)
+	movl	%esi,%ebx
+	xorl	52(%rsp),%r14d
+	andl	%r13d,%eax
 	movl	%r11d,%ecx
-	xorl	12(%rsp),%ebp
-	xorl	%esi,%ebx
-	leal	-1894007588(%rdx,%rdi,1),%edi
+	xorl	12(%rsp),%r14d
+	leal	-1894007588(%rbp,%rdi,1),%edi
+	xorl	%r13d,%ebx
 	roll	$5,%ecx
-	xorl	32(%rsp),%ebp
 	addl	%eax,%edi
+	roll	$1,%r14d
 	andl	%r12d,%ebx
-	roll	$1,%ebp
+	addl	%ecx,%edi
+	roll	$30,%r12d
 	addl	%ebx,%edi
-	roll	$30,%r12d
-	movl	%ebp,44(%rsp)
-	addl	%ecx,%edi
-	movl	48(%rsp),%edx
-	movl	%r12d,%eax
-	movl	%r12d,%ebx
+	xorl	48(%rsp),%edx
+	movl	%r13d,%eax
+	movl	%r14d,44(%rsp)
+	movl	%r13d,%ebx
 	xorl	56(%rsp),%edx
-	andl	%r13d,%eax
+	andl	%r12d,%eax
 	movl	%edi,%ecx
 	xorl	16(%rsp),%edx
-	xorl	%r13d,%ebx
-	leal	-1894007588(%rbp,%rsi,1),%esi
+	leal	-1894007588(%r14,%rsi,1),%esi
+	xorl	%r12d,%ebx
 	roll	$5,%ecx
-	xorl	36(%rsp),%edx
 	addl	%eax,%esi
+	roll	$1,%edx
 	andl	%r11d,%ebx
-	roll	$1,%edx
+	addl	%ecx,%esi
+	roll	$30,%r11d
 	addl	%ebx,%esi
-	roll	$30,%r11d
+	xorl	52(%rsp),%ebp
+	movl	%edi,%eax
 	movl	%edx,48(%rsp)
-	addl	%ecx,%esi
-	movl	52(%rsp),%ebp
-	movl	%r11d,%eax
 	movl	%esi,%ecx
 	xorl	60(%rsp),%ebp
-	xorl	%edi,%eax
+	xorl	%r12d,%eax
 	roll	$5,%ecx
+	xorl	20(%rsp),%ebp
 	leal	-899497514(%rdx,%r13,1),%r13d
-	xorl	20(%rsp),%ebp
-	xorl	%r12d,%eax
+	xorl	%r11d,%eax
 	addl	%ecx,%r13d
-	xorl	40(%rsp),%ebp
 	roll	$30,%edi
 	addl	%eax,%r13d
 	roll	$1,%ebp
+	xorl	56(%rsp),%r14d
+	movl	%esi,%eax
 	movl	%ebp,52(%rsp)
-	movl	56(%rsp),%edx
-	movl	%edi,%eax
 	movl	%r13d,%ecx
-	xorl	0(%rsp),%edx
-	xorl	%esi,%eax
+	xorl	0(%rsp),%r14d
+	xorl	%r11d,%eax
 	roll	$5,%ecx
+	xorl	24(%rsp),%r14d
 	leal	-899497514(%rbp,%r12,1),%r12d
-	xorl	24(%rsp),%edx
-	xorl	%r11d,%eax
+	xorl	%edi,%eax
 	addl	%ecx,%r12d
-	xorl	44(%rsp),%edx
 	roll	$30,%esi
 	addl	%eax,%r12d
-	roll	$1,%edx
-	movl	%edx,56(%rsp)
-	movl	60(%rsp),%ebp
-	movl	%esi,%eax
+	roll	$1,%r14d
+	xorl	60(%rsp),%edx
+	movl	%r13d,%eax
+	movl	%r14d,56(%rsp)
 	movl	%r12d,%ecx
-	xorl	4(%rsp),%ebp
-	xorl	%r13d,%eax
+	xorl	4(%rsp),%edx
+	xorl	%edi,%eax
 	roll	$5,%ecx
-	leal	-899497514(%rdx,%r11,1),%r11d
-	xorl	28(%rsp),%ebp
-	xorl	%edi,%eax
+	xorl	28(%rsp),%edx
+	leal	-899497514(%r14,%r11,1),%r11d
+	xorl	%esi,%eax
 	addl	%ecx,%r11d
-	xorl	48(%rsp),%ebp
 	roll	$30,%r13d
 	addl	%eax,%r11d
-	roll	$1,%ebp
-	movl	%ebp,60(%rsp)
-	movl	0(%rsp),%edx
-	movl	%r13d,%eax
+	roll	$1,%edx
+	xorl	0(%rsp),%ebp
+	movl	%r12d,%eax
+	movl	%edx,60(%rsp)
 	movl	%r11d,%ecx
-	xorl	8(%rsp),%edx
-	xorl	%r12d,%eax
+	xorl	8(%rsp),%ebp
+	xorl	%esi,%eax
 	roll	$5,%ecx
-	leal	-899497514(%rbp,%rdi,1),%edi
-	xorl	32(%rsp),%edx
-	xorl	%esi,%eax
+	xorl	32(%rsp),%ebp
+	leal	-899497514(%rdx,%rdi,1),%edi
+	xorl	%r13d,%eax
 	addl	%ecx,%edi
-	xorl	52(%rsp),%edx
 	roll	$30,%r12d
 	addl	%eax,%edi
-	roll	$1,%edx
-	movl	%edx,0(%rsp)
-	movl	4(%rsp),%ebp
-	movl	%r12d,%eax
+	roll	$1,%ebp
+	xorl	4(%rsp),%r14d
+	movl	%r11d,%eax
+	movl	%ebp,0(%rsp)
 	movl	%edi,%ecx
-	xorl	12(%rsp),%ebp
-	xorl	%r11d,%eax
+	xorl	12(%rsp),%r14d
+	xorl	%r13d,%eax
 	roll	$5,%ecx
-	leal	-899497514(%rdx,%rsi,1),%esi
-	xorl	36(%rsp),%ebp
-	xorl	%r13d,%eax
+	xorl	36(%rsp),%r14d
+	leal	-899497514(%rbp,%rsi,1),%esi
+	xorl	%r12d,%eax
 	addl	%ecx,%esi
-	xorl	56(%rsp),%ebp
 	roll	$30,%r11d
 	addl	%eax,%esi
-	roll	$1,%ebp
-	movl	%ebp,4(%rsp)
-	movl	8(%rsp),%edx
-	movl	%r11d,%eax
+	roll	$1,%r14d
+	xorl	8(%rsp),%edx
+	movl	%edi,%eax
+	movl	%r14d,4(%rsp)
 	movl	%esi,%ecx
 	xorl	16(%rsp),%edx
-	xorl	%edi,%eax
+	xorl	%r12d,%eax
 	roll	$5,%ecx
-	leal	-899497514(%rbp,%r13,1),%r13d
 	xorl	40(%rsp),%edx
-	xorl	%r12d,%eax
+	leal	-899497514(%r14,%r13,1),%r13d
+	xorl	%r11d,%eax
 	addl	%ecx,%r13d
-	xorl	60(%rsp),%edx
 	roll	$30,%edi
 	addl	%eax,%r13d
 	roll	$1,%edx
+	xorl	12(%rsp),%ebp
+	movl	%esi,%eax
 	movl	%edx,8(%rsp)
-	movl	12(%rsp),%ebp
-	movl	%edi,%eax
 	movl	%r13d,%ecx
 	xorl	20(%rsp),%ebp
-	xorl	%esi,%eax
+	xorl	%r11d,%eax
 	roll	$5,%ecx
+	xorl	44(%rsp),%ebp
 	leal	-899497514(%rdx,%r12,1),%r12d
-	xorl	44(%rsp),%ebp
-	xorl	%r11d,%eax
+	xorl	%edi,%eax
 	addl	%ecx,%r12d
-	xorl	0(%rsp),%ebp
 	roll	$30,%esi
 	addl	%eax,%r12d
 	roll	$1,%ebp
+	xorl	16(%rsp),%r14d
+	movl	%r13d,%eax
 	movl	%ebp,12(%rsp)
-	movl	16(%rsp),%edx
-	movl	%esi,%eax
 	movl	%r12d,%ecx
-	xorl	24(%rsp),%edx
-	xorl	%r13d,%eax
+	xorl	24(%rsp),%r14d
+	xorl	%edi,%eax
 	roll	$5,%ecx
+	xorl	48(%rsp),%r14d
 	leal	-899497514(%rbp,%r11,1),%r11d
-	xorl	48(%rsp),%edx
-	xorl	%edi,%eax
+	xorl	%esi,%eax
 	addl	%ecx,%r11d
-	xorl	4(%rsp),%edx
 	roll	$30,%r13d
 	addl	%eax,%r11d
-	roll	$1,%edx
-	movl	%edx,16(%rsp)
-	movl	20(%rsp),%ebp
-	movl	%r13d,%eax
+	roll	$1,%r14d
+	xorl	20(%rsp),%edx
+	movl	%r12d,%eax
+	movl	%r14d,16(%rsp)
 	movl	%r11d,%ecx
-	xorl	28(%rsp),%ebp
-	xorl	%r12d,%eax
+	xorl	28(%rsp),%edx
+	xorl	%esi,%eax
 	roll	$5,%ecx
-	leal	-899497514(%rdx,%rdi,1),%edi
-	xorl	52(%rsp),%ebp
-	xorl	%esi,%eax
+	xorl	52(%rsp),%edx
+	leal	-899497514(%r14,%rdi,1),%edi
+	xorl	%r13d,%eax
 	addl	%ecx,%edi
-	xorl	8(%rsp),%ebp
 	roll	$30,%r12d
 	addl	%eax,%edi
-	roll	$1,%ebp
-	movl	%ebp,20(%rsp)
-	movl	24(%rsp),%edx
-	movl	%r12d,%eax
+	roll	$1,%edx
+	xorl	24(%rsp),%ebp
+	movl	%r11d,%eax
+	movl	%edx,20(%rsp)
 	movl	%edi,%ecx
-	xorl	32(%rsp),%edx
-	xorl	%r11d,%eax
+	xorl	32(%rsp),%ebp
+	xorl	%r13d,%eax
 	roll	$5,%ecx
-	leal	-899497514(%rbp,%rsi,1),%esi
-	xorl	56(%rsp),%edx
-	xorl	%r13d,%eax
+	xorl	56(%rsp),%ebp
+	leal	-899497514(%rdx,%rsi,1),%esi
+	xorl	%r12d,%eax
 	addl	%ecx,%esi
-	xorl	12(%rsp),%edx
 	roll	$30,%r11d
 	addl	%eax,%esi
-	roll	$1,%edx
-	movl	%edx,24(%rsp)
-	movl	28(%rsp),%ebp
-	movl	%r11d,%eax
+	roll	$1,%ebp
+	xorl	28(%rsp),%r14d
+	movl	%edi,%eax
+	movl	%ebp,24(%rsp)
 	movl	%esi,%ecx
-	xorl	36(%rsp),%ebp
-	xorl	%edi,%eax
+	xorl	36(%rsp),%r14d
+	xorl	%r12d,%eax
 	roll	$5,%ecx
-	leal	-899497514(%rdx,%r13,1),%r13d
-	xorl	60(%rsp),%ebp
-	xorl	%r12d,%eax
+	xorl	60(%rsp),%r14d
+	leal	-899497514(%rbp,%r13,1),%r13d
+	xorl	%r11d,%eax
 	addl	%ecx,%r13d
-	xorl	16(%rsp),%ebp
 	roll	$30,%edi
 	addl	%eax,%r13d
-	roll	$1,%ebp
-	movl	%ebp,28(%rsp)
-	movl	32(%rsp),%edx
-	movl	%edi,%eax
+	roll	$1,%r14d
+	xorl	32(%rsp),%edx
+	movl	%esi,%eax
+	movl	%r14d,28(%rsp)
 	movl	%r13d,%ecx
 	xorl	40(%rsp),%edx
-	xorl	%esi,%eax
+	xorl	%r11d,%eax
 	roll	$5,%ecx
-	leal	-899497514(%rbp,%r12,1),%r12d
 	xorl	0(%rsp),%edx
-	xorl	%r11d,%eax
+	leal	-899497514(%r14,%r12,1),%r12d
+	xorl	%edi,%eax
 	addl	%ecx,%r12d
-	xorl	20(%rsp),%edx
 	roll	$30,%esi
 	addl	%eax,%r12d
 	roll	$1,%edx
-	movl	%edx,32(%rsp)
-	movl	36(%rsp),%ebp
-	movl	%esi,%eax
+	xorl	36(%rsp),%ebp
+	movl	%r13d,%eax
+
 	movl	%r12d,%ecx
 	xorl	44(%rsp),%ebp
-	xorl	%r13d,%eax
+	xorl	%edi,%eax
 	roll	$5,%ecx
+	xorl	4(%rsp),%ebp
 	leal	-899497514(%rdx,%r11,1),%r11d
-	xorl	4(%rsp),%ebp
-	xorl	%edi,%eax
+	xorl	%esi,%eax
 	addl	%ecx,%r11d
-	xorl	24(%rsp),%ebp
 	roll	$30,%r13d
 	addl	%eax,%r11d
 	roll	$1,%ebp
-	movl	%ebp,36(%rsp)
-	movl	40(%rsp),%edx
-	movl	%r13d,%eax
+	xorl	40(%rsp),%r14d
+	movl	%r12d,%eax
+
 	movl	%r11d,%ecx
-	xorl	48(%rsp),%edx
-	xorl	%r12d,%eax
+	xorl	48(%rsp),%r14d
+	xorl	%esi,%eax
 	roll	$5,%ecx
+	xorl	8(%rsp),%r14d
 	leal	-899497514(%rbp,%rdi,1),%edi
-	xorl	8(%rsp),%edx
-	xorl	%esi,%eax
+	xorl	%r13d,%eax
 	addl	%ecx,%edi
-	xorl	28(%rsp),%edx
 	roll	$30,%r12d
 	addl	%eax,%edi
-	roll	$1,%edx
-	movl	%edx,40(%rsp)
-	movl	44(%rsp),%ebp
-	movl	%r12d,%eax
+	roll	$1,%r14d
+	xorl	44(%rsp),%edx
+	movl	%r11d,%eax
+
 	movl	%edi,%ecx
-	xorl	52(%rsp),%ebp
-	xorl	%r11d,%eax
+	xorl	52(%rsp),%edx
+	xorl	%r13d,%eax
 	roll	$5,%ecx
-	leal	-899497514(%rdx,%rsi,1),%esi
-	xorl	12(%rsp),%ebp
-	xorl	%r13d,%eax
+	xorl	12(%rsp),%edx
+	leal	-899497514(%r14,%rsi,1),%esi
+	xorl	%r12d,%eax
 	addl	%ecx,%esi
-	xorl	32(%rsp),%ebp
 	roll	$30,%r11d
 	addl	%eax,%esi
-	roll	$1,%ebp
-	movl	%ebp,44(%rsp)
-	movl	48(%rsp),%edx
-	movl	%r11d,%eax
+	roll	$1,%edx
+	xorl	48(%rsp),%ebp
+	movl	%edi,%eax
+
 	movl	%esi,%ecx
-	xorl	56(%rsp),%edx
-	xorl	%edi,%eax
+	xorl	56(%rsp),%ebp
+	xorl	%r12d,%eax
 	roll	$5,%ecx
-	leal	-899497514(%rbp,%r13,1),%r13d
-	xorl	16(%rsp),%edx
-	xorl	%r12d,%eax
+	xorl	16(%rsp),%ebp
+	leal	-899497514(%rdx,%r13,1),%r13d
+	xorl	%r11d,%eax
 	addl	%ecx,%r13d
-	xorl	36(%rsp),%edx
 	roll	$30,%edi
 	addl	%eax,%r13d
-	roll	$1,%edx
-	movl	%edx,48(%rsp)
-	movl	52(%rsp),%ebp
-	movl	%edi,%eax
+	roll	$1,%ebp
+	xorl	52(%rsp),%r14d
+	movl	%esi,%eax
+
 	movl	%r13d,%ecx
-	xorl	60(%rsp),%ebp
-	xorl	%esi,%eax
+	xorl	60(%rsp),%r14d
+	xorl	%r11d,%eax
 	roll	$5,%ecx
-	leal	-899497514(%rdx,%r12,1),%r12d
-	xorl	20(%rsp),%ebp
-	xorl	%r11d,%eax
+	xorl	20(%rsp),%r14d
+	leal	-899497514(%rbp,%r12,1),%r12d
+	xorl	%edi,%eax
 	addl	%ecx,%r12d
-	xorl	40(%rsp),%ebp
 	roll	$30,%esi
 	addl	%eax,%r12d
-	roll	$1,%ebp
-	movl	56(%rsp),%edx
-	movl	%esi,%eax
+	roll	$1,%r14d
+	xorl	56(%rsp),%edx
+	movl	%r13d,%eax
+
 	movl	%r12d,%ecx
 	xorl	0(%rsp),%edx
-	xorl	%r13d,%eax
+	xorl	%edi,%eax
 	roll	$5,%ecx
-	leal	-899497514(%rbp,%r11,1),%r11d
 	xorl	24(%rsp),%edx
-	xorl	%edi,%eax
+	leal	-899497514(%r14,%r11,1),%r11d
+	xorl	%esi,%eax
 	addl	%ecx,%r11d
-	xorl	44(%rsp),%edx
 	roll	$30,%r13d
 	addl	%eax,%r11d
 	roll	$1,%edx
-	movl	60(%rsp),%ebp
-	movl	%r13d,%eax
+	xorl	60(%rsp),%ebp
+	movl	%r12d,%eax
+
 	movl	%r11d,%ecx
 	xorl	4(%rsp),%ebp
-	xorl	%r12d,%eax
+	xorl	%esi,%eax
 	roll	$5,%ecx
+	xorl	28(%rsp),%ebp
 	leal	-899497514(%rdx,%rdi,1),%edi
-	xorl	28(%rsp),%ebp
-	xorl	%esi,%eax
+	xorl	%r13d,%eax
 	addl	%ecx,%edi
-	xorl	48(%rsp),%ebp
 	roll	$30,%r12d
 	addl	%eax,%edi
 	roll	$1,%ebp
-	movl	%r12d,%eax
+	movl	%r11d,%eax
 	movl	%edi,%ecx
-	xorl	%r11d,%eax
+	xorl	%r13d,%eax
 	leal	-899497514(%rbp,%rsi,1),%esi
 	roll	$5,%ecx
-	xorl	%r13d,%eax
+	xorl	%r12d,%eax
 	addl	%ecx,%esi
 	roll	$30,%r11d
 	addl	%eax,%esi
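
The large hunk above only reschedules the scalar rounds (an extra saved register, %r14, and the message-schedule XORs hoisted ahead of the stores); the algorithm is unchanged, as the four round constants 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc and 0xca62c1d6 (the last two written as signed immediates in the leal forms) and the Ch/Parity/Maj and/xor patterns show. For reference, one generic SHA-1 round in C as defined by FIPS 180-4, not code taken from this file:

#include <stdint.h>

static uint32_t rol32(uint32_t v, unsigned s)
{
    return (v << s) | (v >> (32 - s));
}

/* One SHA-1 round, t = 0..79: the hunk above is eighty of these,
 * unrolled and register-renamed, with w[] kept in the 64-byte stack
 * frame at (%rsp). */
static void sha1_round(uint32_t v[5], unsigned t, uint32_t wt)
{
    uint32_t a = v[0], b = v[1], c = v[2], d = v[3], e = v[4];
    uint32_t f, k;

    if (t < 20)      { f = (b & c) | (~b & d);          k = 0x5a827999; }
    else if (t < 40) { f = b ^ c ^ d;                   k = 0x6ed9eba1; }
    else if (t < 60) { f = (b & c) | (b & d) | (c & d); k = 0x8f1bbcdc; }
    else             { f = b ^ c ^ d;                   k = 0xca62c1d6; }

    uint32_t tmp = rol32(a, 5) + f + e + k + wt;
    v[4] = d;
    v[3] = c;
    v[2] = rol32(b, 30);
    v[1] = a;
    v[0] = tmp;
}

The interleaved roll $1 steps are the message-schedule update w[t] = rol(w[t-3] ^ w[t-8] ^ w[t-14] ^ w[t-16], 1), which the rewritten code now computes into a register one round ahead of the store.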
@@ -1288,22 +1233,195 @@
 	jnz	.Lloop
 
 	movq	64(%rsp),%rsi
-	movq	(%rsi),%r13
-	movq	8(%rsi),%r12
-	movq	16(%rsi),%rbp
-	movq	24(%rsi),%rbx
-	leaq	32(%rsi),%rsp
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 .Lepilogue:
 	.byte	0xf3,0xc3
 .size	sha1_block_data_order,.-sha1_block_data_order
+.type	sha1_block_data_order_shaext, at function
+.align	32
+sha1_block_data_order_shaext:
+_shaext_shortcut:
+	movdqu	(%rdi),%xmm0
+	movd	16(%rdi),%xmm1
+	movdqa	K_XX_XX+160(%rip),%xmm3
+
+	movdqu	(%rsi),%xmm4
+	pshufd	$27,%xmm0,%xmm0
+	movdqu	16(%rsi),%xmm5
+	pshufd	$27,%xmm1,%xmm1
+	movdqu	32(%rsi),%xmm6
+.byte	102,15,56,0,227
+	movdqu	48(%rsi),%xmm7
+.byte	102,15,56,0,235
+.byte	102,15,56,0,243
+	movdqa	%xmm1,%xmm9
+.byte	102,15,56,0,251
+	jmp	.Loop_shaext
+
+.align	16
+.Loop_shaext:
+	decq	%rdx
+	leaq	64(%rsi),%r8
+	paddd	%xmm4,%xmm1
+	cmovneq	%r8,%rsi
+	movdqa	%xmm0,%xmm8
+.byte	15,56,201,229
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,0
+.byte	15,56,200,213
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+.byte	15,56,202,231
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,0
+.byte	15,56,200,206
+	pxor	%xmm7,%xmm5
+.byte	15,56,202,236
+.byte	15,56,201,247
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,0
+.byte	15,56,200,215
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+.byte	15,56,202,245
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,0
+.byte	15,56,200,204
+	pxor	%xmm5,%xmm7
+.byte	15,56,202,254
+.byte	15,56,201,229
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,0
+.byte	15,56,200,213
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+.byte	15,56,202,231
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,1
+.byte	15,56,200,206
+	pxor	%xmm7,%xmm5
+.byte	15,56,202,236
+.byte	15,56,201,247
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,1
+.byte	15,56,200,215
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+.byte	15,56,202,245
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,1
+.byte	15,56,200,204
+	pxor	%xmm5,%xmm7
+.byte	15,56,202,254
+.byte	15,56,201,229
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,1
+.byte	15,56,200,213
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+.byte	15,56,202,231
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,1
+.byte	15,56,200,206
+	pxor	%xmm7,%xmm5
+.byte	15,56,202,236
+.byte	15,56,201,247
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,2
+.byte	15,56,200,215
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+.byte	15,56,202,245
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,2
+.byte	15,56,200,204
+	pxor	%xmm5,%xmm7
+.byte	15,56,202,254
+.byte	15,56,201,229
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,2
+.byte	15,56,200,213
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+.byte	15,56,202,231
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,2
+.byte	15,56,200,206
+	pxor	%xmm7,%xmm5
+.byte	15,56,202,236
+.byte	15,56,201,247
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,2
+.byte	15,56,200,215
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+.byte	15,56,202,245
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,3
+.byte	15,56,200,204
+	pxor	%xmm5,%xmm7
+.byte	15,56,202,254
+	movdqu	(%rsi),%xmm4
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,3
+.byte	15,56,200,213
+	movdqu	16(%rsi),%xmm5
+.byte	102,15,56,0,227
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,3
+.byte	15,56,200,206
+	movdqu	32(%rsi),%xmm6
+.byte	102,15,56,0,235
+
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,3
+.byte	15,56,200,215
+	movdqu	48(%rsi),%xmm7
+.byte	102,15,56,0,243
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,3
+.byte	65,15,56,200,201
+.byte	102,15,56,0,251
+
+	paddd	%xmm8,%xmm0
+	movdqa	%xmm1,%xmm9
+
+	jnz	.Loop_shaext
+
+	pshufd	$27,%xmm0,%xmm0
+	pshufd	$27,%xmm1,%xmm1
+	movdqu	%xmm0,(%rdi)
+	movd	%xmm1,16(%rdi)
+	.byte	0xf3,0xc3
+.size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
 .type	sha1_block_data_order_ssse3, at function
 .align	16
 sha1_block_data_order_ssse3:
 _ssse3_shortcut:
+	movq	%rsp,%rax
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
+	pushq	%r13
+	pushq	%r14
 	leaq	-64(%rsp),%rsp
+	movq	%rax,%r14
+	andq	$-64,%rsp
 	movq	%rdi,%r8
 	movq	%rsi,%r9
 	movq	%rdx,%r10
@@ -1310,7 +1428,7 @@
 
 	shlq	$6,%r10
 	addq	%r9,%r10
-	leaq	K_XX_XX(%rip),%r11
+	leaq	K_XX_XX+64(%rip),%r11
 
 	movl	0(%r8),%eax
 	movl	4(%r8),%ebx
@@ -1318,19 +1436,22 @@
 	movl	12(%r8),%edx
 	movl	%ebx,%esi
 	movl	16(%r8),%ebp
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	andl	%edi,%esi
 
 	movdqa	64(%r11),%xmm6
-	movdqa	0(%r11),%xmm9
+	movdqa	-64(%r11),%xmm9
 	movdqu	0(%r9),%xmm0
 	movdqu	16(%r9),%xmm1
 	movdqu	32(%r9),%xmm2
 	movdqu	48(%r9),%xmm3
 .byte	102,15,56,0,198
-	addq	$64,%r9
 .byte	102,15,56,0,206
 .byte	102,15,56,0,214
+	addq	$64,%r9
+	paddd	%xmm9,%xmm0
 .byte	102,15,56,0,222
-	paddd	%xmm9,%xmm0
 	paddd	%xmm9,%xmm1
 	paddd	%xmm9,%xmm2
 	movdqa	%xmm0,0(%rsp)
@@ -1342,904 +1463,882 @@
 	jmp	.Loop_ssse3
 .align	16
 .Loop_ssse3:
-	movdqa	%xmm1,%xmm4
-	addl	0(%rsp),%ebp
-	xorl	%edx,%ecx
+	rorl	$2,%ebx
+	pshufd	$238,%xmm0,%xmm4
+	xorl	%edx,%esi
 	movdqa	%xmm3,%xmm8
-.byte	102,15,58,15,224,8
+	paddd	%xmm3,%xmm9
 	movl	%eax,%edi
+	addl	0(%rsp),%ebp
+	punpcklqdq	%xmm1,%xmm4
+	xorl	%ecx,%ebx
 	roll	$5,%eax
-	paddd	%xmm3,%xmm9
-	andl	%ecx,%esi
-	xorl	%edx,%ecx
+	addl	%esi,%ebp
 	psrldq	$4,%xmm8
-	xorl	%edx,%esi
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	pxor	%xmm0,%xmm4
 	addl	%eax,%ebp
-	pxor	%xmm0,%xmm4
-	rorl	$2,%ebx
-	addl	%esi,%ebp
+	rorl	$7,%eax
 	pxor	%xmm2,%xmm8
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
 	addl	4(%rsp),%edx
-	xorl	%ecx,%ebx
-	movl	%ebp,%esi
+	pxor	%xmm8,%xmm4
+	xorl	%ebx,%eax
 	roll	$5,%ebp
-	pxor	%xmm8,%xmm4
-	andl	%ebx,%edi
-	xorl	%ecx,%ebx
 	movdqa	%xmm9,48(%rsp)
-	xorl	%ecx,%edi
+	addl	%edi,%edx
+	andl	%eax,%esi
+	movdqa	%xmm4,%xmm10
+	xorl	%ebx,%eax
 	addl	%ebp,%edx
-	movdqa	%xmm4,%xmm10
+	rorl	$7,%ebp
 	movdqa	%xmm4,%xmm8
-	rorl	$7,%eax
-	addl	%edi,%edx
-	addl	8(%rsp),%ecx
-	xorl	%ebx,%eax
+	xorl	%ebx,%esi
 	pslldq	$12,%xmm10
 	paddd	%xmm4,%xmm4
 	movl	%edx,%edi
+	addl	8(%rsp),%ecx
+	psrld	$31,%xmm8
+	xorl	%eax,%ebp
 	roll	$5,%edx
-	andl	%eax,%esi
-	xorl	%ebx,%eax
-	psrld	$31,%xmm8
-	xorl	%ebx,%esi
-	addl	%edx,%ecx
+	addl	%esi,%ecx
 	movdqa	%xmm10,%xmm9
-	rorl	$7,%ebp
-	addl	%esi,%ecx
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
 	psrld	$30,%xmm10
+	addl	%edx,%ecx
+	rorl	$7,%edx
 	por	%xmm8,%xmm4
+	xorl	%eax,%edi
+	movl	%ecx,%esi
 	addl	12(%rsp),%ebx
-	xorl	%eax,%ebp
-	movl	%ecx,%esi
-	roll	$5,%ecx
 	pslld	$2,%xmm9
 	pxor	%xmm10,%xmm4
-	andl	%ebp,%edi
-	xorl	%eax,%ebp
-	movdqa	0(%r11),%xmm10
-	xorl	%eax,%edi
-	addl	%ecx,%ebx
+	xorl	%ebp,%edx
+	movdqa	-64(%r11),%xmm10
+	roll	$5,%ecx
+	addl	%edi,%ebx
+	andl	%edx,%esi
 	pxor	%xmm9,%xmm4
-	rorl	$7,%edx
-	addl	%edi,%ebx
-	movdqa	%xmm2,%xmm5
-	addl	16(%rsp),%eax
 	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	rorl	$7,%ecx
+	pshufd	$238,%xmm1,%xmm5
+	xorl	%ebp,%esi
 	movdqa	%xmm4,%xmm9
-.byte	102,15,58,15,233,8
+	paddd	%xmm4,%xmm10
 	movl	%ebx,%edi
+	addl	16(%rsp),%eax
+	punpcklqdq	%xmm2,%xmm5
+	xorl	%edx,%ecx
 	roll	$5,%ebx
-	paddd	%xmm4,%xmm10
-	andl	%edx,%esi
-	xorl	%ebp,%edx
+	addl	%esi,%eax
 	psrldq	$4,%xmm9
-	xorl	%ebp,%esi
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	pxor	%xmm1,%xmm5
 	addl	%ebx,%eax
-	pxor	%xmm1,%xmm5
-	rorl	$7,%ecx
-	addl	%esi,%eax
+	rorl	$7,%ebx
 	pxor	%xmm3,%xmm9
+	xorl	%edx,%edi
+	movl	%eax,%esi
 	addl	20(%rsp),%ebp
-	xorl	%edx,%ecx
-	movl	%eax,%esi
+	pxor	%xmm9,%xmm5
+	xorl	%ecx,%ebx
 	roll	$5,%eax
-	pxor	%xmm9,%xmm5
-	andl	%ecx,%edi
-	xorl	%edx,%ecx
 	movdqa	%xmm10,0(%rsp)
-	xorl	%edx,%edi
+	addl	%edi,%ebp
+	andl	%ebx,%esi
+	movdqa	%xmm5,%xmm8
+	xorl	%ecx,%ebx
 	addl	%eax,%ebp
-	movdqa	%xmm5,%xmm8
+	rorl	$7,%eax
 	movdqa	%xmm5,%xmm9
-	rorl	$7,%ebx
-	addl	%edi,%ebp
-	addl	24(%rsp),%edx
-	xorl	%ecx,%ebx
+	xorl	%ecx,%esi
 	pslldq	$12,%xmm8
 	paddd	%xmm5,%xmm5
 	movl	%ebp,%edi
+	addl	24(%rsp),%edx
+	psrld	$31,%xmm9
+	xorl	%ebx,%eax
 	roll	$5,%ebp
-	andl	%ebx,%esi
-	xorl	%ecx,%ebx
-	psrld	$31,%xmm9
-	xorl	%ecx,%esi
-	addl	%ebp,%edx
+	addl	%esi,%edx
 	movdqa	%xmm8,%xmm10
-	rorl	$7,%eax
-	addl	%esi,%edx
+	andl	%eax,%edi
+	xorl	%ebx,%eax
 	psrld	$30,%xmm8
+	addl	%ebp,%edx
+	rorl	$7,%ebp
 	por	%xmm9,%xmm5
+	xorl	%ebx,%edi
+	movl	%edx,%esi
 	addl	28(%rsp),%ecx
-	xorl	%ebx,%eax
-	movl	%edx,%esi
-	roll	$5,%edx
 	pslld	$2,%xmm10
 	pxor	%xmm8,%xmm5
-	andl	%eax,%edi
-	xorl	%ebx,%eax
-	movdqa	16(%r11),%xmm8
-	xorl	%ebx,%edi
-	addl	%edx,%ecx
+	xorl	%eax,%ebp
+	movdqa	-32(%r11),%xmm8
+	roll	$5,%edx
+	addl	%edi,%ecx
+	andl	%ebp,%esi
 	pxor	%xmm10,%xmm5
-	rorl	$7,%ebp
-	addl	%edi,%ecx
-	movdqa	%xmm3,%xmm6
-	addl	32(%rsp),%ebx
 	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	pshufd	$238,%xmm2,%xmm6
+	xorl	%eax,%esi
 	movdqa	%xmm5,%xmm10
-.byte	102,15,58,15,242,8
+	paddd	%xmm5,%xmm8
 	movl	%ecx,%edi
+	addl	32(%rsp),%ebx
+	punpcklqdq	%xmm3,%xmm6
+	xorl	%ebp,%edx
 	roll	$5,%ecx
-	paddd	%xmm5,%xmm8
-	andl	%ebp,%esi
-	xorl	%eax,%ebp
+	addl	%esi,%ebx
 	psrldq	$4,%xmm10
-	xorl	%eax,%esi
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	pxor	%xmm2,%xmm6
 	addl	%ecx,%ebx
-	pxor	%xmm2,%xmm6
-	rorl	$7,%edx
-	addl	%esi,%ebx
+	rorl	$7,%ecx
 	pxor	%xmm4,%xmm10
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
 	addl	36(%rsp),%eax
-	xorl	%ebp,%edx
-	movl	%ebx,%esi
+	pxor	%xmm10,%xmm6
+	xorl	%edx,%ecx
 	roll	$5,%ebx
-	pxor	%xmm10,%xmm6
-	andl	%edx,%edi
-	xorl	%ebp,%edx
 	movdqa	%xmm8,16(%rsp)
-	xorl	%ebp,%edi
+	addl	%edi,%eax
+	andl	%ecx,%esi
+	movdqa	%xmm6,%xmm9
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	movdqa	%xmm6,%xmm9
+	rorl	$7,%ebx
 	movdqa	%xmm6,%xmm10
-	rorl	$7,%ecx
-	addl	%edi,%eax
-	addl	40(%rsp),%ebp
-	xorl	%edx,%ecx
+	xorl	%edx,%esi
 	pslldq	$12,%xmm9
 	paddd	%xmm6,%xmm6
 	movl	%eax,%edi
+	addl	40(%rsp),%ebp
+	psrld	$31,%xmm10
+	xorl	%ecx,%ebx
 	roll	$5,%eax
-	andl	%ecx,%esi
-	xorl	%edx,%ecx
-	psrld	$31,%xmm10
-	xorl	%edx,%esi
-	addl	%eax,%ebp
+	addl	%esi,%ebp
 	movdqa	%xmm9,%xmm8
-	rorl	$7,%ebx
-	addl	%esi,%ebp
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
 	psrld	$30,%xmm9
+	addl	%eax,%ebp
+	rorl	$7,%eax
 	por	%xmm10,%xmm6
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
 	addl	44(%rsp),%edx
-	xorl	%ecx,%ebx
-	movl	%ebp,%esi
-	roll	$5,%ebp
 	pslld	$2,%xmm8
 	pxor	%xmm9,%xmm6
-	andl	%ebx,%edi
-	xorl	%ecx,%ebx
-	movdqa	16(%r11),%xmm9
-	xorl	%ecx,%edi
-	addl	%ebp,%edx
+	xorl	%ebx,%eax
+	movdqa	-32(%r11),%xmm9
+	roll	$5,%ebp
+	addl	%edi,%edx
+	andl	%eax,%esi
 	pxor	%xmm8,%xmm6
-	rorl	$7,%eax
-	addl	%edi,%edx
-	movdqa	%xmm4,%xmm7
-	addl	48(%rsp),%ecx
 	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	rorl	$7,%ebp
+	pshufd	$238,%xmm3,%xmm7
+	xorl	%ebx,%esi
 	movdqa	%xmm6,%xmm8
-.byte	102,15,58,15,251,8
+	paddd	%xmm6,%xmm9
 	movl	%edx,%edi
+	addl	48(%rsp),%ecx
+	punpcklqdq	%xmm4,%xmm7
+	xorl	%eax,%ebp
 	roll	$5,%edx
-	paddd	%xmm6,%xmm9
-	andl	%eax,%esi
-	xorl	%ebx,%eax
+	addl	%esi,%ecx
 	psrldq	$4,%xmm8
-	xorl	%ebx,%esi
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	pxor	%xmm3,%xmm7
 	addl	%edx,%ecx
-	pxor	%xmm3,%xmm7
-	rorl	$7,%ebp
-	addl	%esi,%ecx
+	rorl	$7,%edx
 	pxor	%xmm5,%xmm8
+	xorl	%eax,%edi
+	movl	%ecx,%esi
 	addl	52(%rsp),%ebx
-	xorl	%eax,%ebp
-	movl	%ecx,%esi
+	pxor	%xmm8,%xmm7
+	xorl	%ebp,%edx
 	roll	$5,%ecx
-	pxor	%xmm8,%xmm7
-	andl	%ebp,%edi
-	xorl	%eax,%ebp
 	movdqa	%xmm9,32(%rsp)
-	xorl	%eax,%edi
+	addl	%edi,%ebx
+	andl	%edx,%esi
+	movdqa	%xmm7,%xmm10
+	xorl	%ebp,%edx
 	addl	%ecx,%ebx
-	movdqa	%xmm7,%xmm10
+	rorl	$7,%ecx
 	movdqa	%xmm7,%xmm8
-	rorl	$7,%edx
-	addl	%edi,%ebx
-	addl	56(%rsp),%eax
-	xorl	%ebp,%edx
+	xorl	%ebp,%esi
 	pslldq	$12,%xmm10
 	paddd	%xmm7,%xmm7
 	movl	%ebx,%edi
+	addl	56(%rsp),%eax
+	psrld	$31,%xmm8
+	xorl	%edx,%ecx
 	roll	$5,%ebx
-	andl	%edx,%esi
-	xorl	%ebp,%edx
-	psrld	$31,%xmm8
-	xorl	%ebp,%esi
-	addl	%ebx,%eax
+	addl	%esi,%eax
 	movdqa	%xmm10,%xmm9
-	rorl	$7,%ecx
-	addl	%esi,%eax
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
 	psrld	$30,%xmm10
+	addl	%ebx,%eax
+	rorl	$7,%ebx
 	por	%xmm8,%xmm7
+	xorl	%edx,%edi
+	movl	%eax,%esi
 	addl	60(%rsp),%ebp
-	xorl	%edx,%ecx
-	movl	%eax,%esi
-	roll	$5,%eax
 	pslld	$2,%xmm9
 	pxor	%xmm10,%xmm7
-	andl	%ecx,%edi
-	xorl	%edx,%ecx
-	movdqa	16(%r11),%xmm10
-	xorl	%edx,%edi
+	xorl	%ecx,%ebx
+	movdqa	-32(%r11),%xmm10
+	roll	$5,%eax
+	addl	%edi,%ebp
+	andl	%ebx,%esi
+	pxor	%xmm9,%xmm7
+	pshufd	$238,%xmm6,%xmm9
+	xorl	%ecx,%ebx
 	addl	%eax,%ebp
-	pxor	%xmm9,%xmm7
-	rorl	$7,%ebx
-	addl	%edi,%ebp
-	movdqa	%xmm7,%xmm9
-	addl	0(%rsp),%edx
+	rorl	$7,%eax
 	pxor	%xmm4,%xmm0
-.byte	102,68,15,58,15,206,8
-	xorl	%ecx,%ebx
+	xorl	%ecx,%esi
 	movl	%ebp,%edi
+	addl	0(%rsp),%edx
+	punpcklqdq	%xmm7,%xmm9
+	xorl	%ebx,%eax
 	roll	$5,%ebp
 	pxor	%xmm1,%xmm0
-	andl	%ebx,%esi
-	xorl	%ecx,%ebx
+	addl	%esi,%edx
+	andl	%eax,%edi
 	movdqa	%xmm10,%xmm8
+	xorl	%ebx,%eax
 	paddd	%xmm7,%xmm10
-	xorl	%ecx,%esi
 	addl	%ebp,%edx
 	pxor	%xmm9,%xmm0
-	rorl	$7,%eax
-	addl	%esi,%edx
+	rorl	$7,%ebp
+	xorl	%ebx,%edi
+	movl	%edx,%esi
 	addl	4(%rsp),%ecx
-	xorl	%ebx,%eax
 	movdqa	%xmm0,%xmm9
+	xorl	%eax,%ebp
+	roll	$5,%edx
 	movdqa	%xmm10,48(%rsp)
-	movl	%edx,%esi
-	roll	$5,%edx
-	andl	%eax,%edi
-	xorl	%ebx,%eax
+	addl	%edi,%ecx
+	andl	%ebp,%esi
+	xorl	%eax,%ebp
 	pslld	$2,%xmm0
-	xorl	%ebx,%edi
 	addl	%edx,%ecx
+	rorl	$7,%edx
 	psrld	$30,%xmm9
-	rorl	$7,%ebp
-	addl	%edi,%ecx
+	xorl	%eax,%esi
+	movl	%ecx,%edi
 	addl	8(%rsp),%ebx
-	xorl	%eax,%ebp
-	movl	%ecx,%edi
+	por	%xmm9,%xmm0
+	xorl	%ebp,%edx
 	roll	$5,%ecx
-	por	%xmm9,%xmm0
-	andl	%ebp,%esi
-	xorl	%eax,%ebp
-	movdqa	%xmm0,%xmm10
-	xorl	%eax,%esi
+	pshufd	$238,%xmm7,%xmm10
+	addl	%esi,%ebx
+	andl	%edx,%edi
+	xorl	%ebp,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%esi,%ebx
 	addl	12(%rsp),%eax
-	xorl	%ebp,%edx
+	xorl	%ebp,%edi
 	movl	%ebx,%esi
 	roll	$5,%ebx
-	andl	%edx,%edi
-	xorl	%ebp,%edx
-	xorl	%ebp,%edi
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%edi,%eax
+	pxor	%xmm5,%xmm1
 	addl	16(%rsp),%ebp
-	pxor	%xmm5,%xmm1
-.byte	102,68,15,58,15,215,8
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
+	punpcklqdq	%xmm0,%xmm10
 	movl	%eax,%edi
 	roll	$5,%eax
 	pxor	%xmm2,%xmm1
-	xorl	%ecx,%esi
-	addl	%eax,%ebp
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
 	movdqa	%xmm8,%xmm9
+	rorl	$7,%ebx
 	paddd	%xmm0,%xmm8
-	rorl	$7,%ebx
-	addl	%esi,%ebp
+	addl	%eax,%ebp
 	pxor	%xmm10,%xmm1
 	addl	20(%rsp),%edx
-	xorl	%ecx,%edi
+	xorl	%ebx,%edi
 	movl	%ebp,%esi
 	roll	$5,%ebp
 	movdqa	%xmm1,%xmm10
+	addl	%edi,%edx
+	xorl	%ebx,%esi
 	movdqa	%xmm8,0(%rsp)
-	xorl	%ebx,%edi
+	rorl	$7,%eax
 	addl	%ebp,%edx
-	rorl	$7,%eax
-	addl	%edi,%edx
+	addl	24(%rsp),%ecx
 	pslld	$2,%xmm1
-	addl	24(%rsp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
+	movl	%edx,%edi
 	psrld	$30,%xmm10
-	movl	%edx,%edi
 	roll	$5,%edx
-	xorl	%eax,%esi
-	addl	%edx,%ecx
+	addl	%esi,%ecx
+	xorl	%eax,%edi
 	rorl	$7,%ebp
-	addl	%esi,%ecx
 	por	%xmm10,%xmm1
+	addl	%edx,%ecx
 	addl	28(%rsp),%ebx
-	xorl	%eax,%edi
-	movdqa	%xmm1,%xmm8
+	pshufd	$238,%xmm0,%xmm8
+	xorl	%ebp,%edi
 	movl	%ecx,%esi
 	roll	$5,%ecx
-	xorl	%ebp,%edi
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%edi,%ebx
+	pxor	%xmm6,%xmm2
 	addl	32(%rsp),%eax
-	pxor	%xmm6,%xmm2
-.byte	102,68,15,58,15,192,8
-	xorl	%ebp,%esi
+	xorl	%edx,%esi
+	punpcklqdq	%xmm1,%xmm8
 	movl	%ebx,%edi
 	roll	$5,%ebx
 	pxor	%xmm3,%xmm2
-	xorl	%edx,%esi
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	movdqa	0(%r11),%xmm10
+	rorl	$7,%ecx
+	paddd	%xmm1,%xmm9
 	addl	%ebx,%eax
-	movdqa	32(%r11),%xmm10
-	paddd	%xmm1,%xmm9
-	rorl	$7,%ecx
-	addl	%esi,%eax
 	pxor	%xmm8,%xmm2
 	addl	36(%rsp),%ebp
-	xorl	%edx,%edi
+	xorl	%ecx,%edi
 	movl	%eax,%esi
 	roll	$5,%eax
 	movdqa	%xmm2,%xmm8
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
 	movdqa	%xmm9,16(%rsp)
-	xorl	%ecx,%edi
+	rorl	$7,%ebx
 	addl	%eax,%ebp
-	rorl	$7,%ebx
-	addl	%edi,%ebp
+	addl	40(%rsp),%edx
 	pslld	$2,%xmm2
-	addl	40(%rsp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
 	psrld	$30,%xmm8
-	movl	%ebp,%edi
 	roll	$5,%ebp
-	xorl	%ebx,%esi
-	addl	%ebp,%edx
+	addl	%esi,%edx
+	xorl	%ebx,%edi
 	rorl	$7,%eax
-	addl	%esi,%edx
 	por	%xmm8,%xmm2
+	addl	%ebp,%edx
 	addl	44(%rsp),%ecx
-	xorl	%ebx,%edi
-	movdqa	%xmm2,%xmm9
+	pshufd	$238,%xmm1,%xmm9
+	xorl	%eax,%edi
 	movl	%edx,%esi
 	roll	$5,%edx
-	xorl	%eax,%edi
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%ebp
 	addl	%edx,%ecx
-	rorl	$7,%ebp
-	addl	%edi,%ecx
+	pxor	%xmm7,%xmm3
 	addl	48(%rsp),%ebx
-	pxor	%xmm7,%xmm3
-.byte	102,68,15,58,15,201,8
-	xorl	%eax,%esi
+	xorl	%ebp,%esi
+	punpcklqdq	%xmm2,%xmm9
 	movl	%ecx,%edi
 	roll	$5,%ecx
 	pxor	%xmm4,%xmm3
-	xorl	%ebp,%esi
-	addl	%ecx,%ebx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
 	movdqa	%xmm10,%xmm8
+	rorl	$7,%edx
 	paddd	%xmm2,%xmm10
-	rorl	$7,%edx
-	addl	%esi,%ebx
+	addl	%ecx,%ebx
 	pxor	%xmm9,%xmm3
 	addl	52(%rsp),%eax
-	xorl	%ebp,%edi
+	xorl	%edx,%edi
 	movl	%ebx,%esi
 	roll	$5,%ebx
 	movdqa	%xmm3,%xmm9
+	addl	%edi,%eax
+	xorl	%edx,%esi
 	movdqa	%xmm10,32(%rsp)
-	xorl	%edx,%edi
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%edi,%eax
+	addl	56(%rsp),%ebp
 	pslld	$2,%xmm3
-	addl	56(%rsp),%ebp
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
+	movl	%eax,%edi
 	psrld	$30,%xmm9
-	movl	%eax,%edi
 	roll	$5,%eax
-	xorl	%ecx,%esi
-	addl	%eax,%ebp
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
 	rorl	$7,%ebx
-	addl	%esi,%ebp
 	por	%xmm9,%xmm3
+	addl	%eax,%ebp
 	addl	60(%rsp),%edx
-	xorl	%ecx,%edi
-	movdqa	%xmm3,%xmm10
+	pshufd	$238,%xmm2,%xmm10
+	xorl	%ebx,%edi
 	movl	%ebp,%esi
 	roll	$5,%ebp
-	xorl	%ebx,%edi
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
 	addl	%ebp,%edx
-	rorl	$7,%eax
-	addl	%edi,%edx
+	pxor	%xmm0,%xmm4
 	addl	0(%rsp),%ecx
-	pxor	%xmm0,%xmm4
-.byte	102,68,15,58,15,210,8
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
+	punpcklqdq	%xmm3,%xmm10
 	movl	%edx,%edi
 	roll	$5,%edx
 	pxor	%xmm5,%xmm4
-	xorl	%eax,%esi
-	addl	%edx,%ecx
+	addl	%esi,%ecx
+	xorl	%eax,%edi
 	movdqa	%xmm8,%xmm9
+	rorl	$7,%ebp
 	paddd	%xmm3,%xmm8
-	rorl	$7,%ebp
-	addl	%esi,%ecx
+	addl	%edx,%ecx
 	pxor	%xmm10,%xmm4
 	addl	4(%rsp),%ebx
-	xorl	%eax,%edi
+	xorl	%ebp,%edi
 	movl	%ecx,%esi
 	roll	$5,%ecx
 	movdqa	%xmm4,%xmm10
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
 	movdqa	%xmm8,48(%rsp)
-	xorl	%ebp,%edi
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%edi,%ebx
+	addl	8(%rsp),%eax
 	pslld	$2,%xmm4
-	addl	8(%rsp),%eax
-	xorl	%ebp,%esi
+	xorl	%edx,%esi
+	movl	%ebx,%edi
 	psrld	$30,%xmm10
-	movl	%ebx,%edi
 	roll	$5,%ebx
-	xorl	%edx,%esi
-	addl	%ebx,%eax
+	addl	%esi,%eax
+	xorl	%edx,%edi
 	rorl	$7,%ecx
-	addl	%esi,%eax
 	por	%xmm10,%xmm4
+	addl	%ebx,%eax
 	addl	12(%rsp),%ebp
-	xorl	%edx,%edi
-	movdqa	%xmm4,%xmm8
+	pshufd	$238,%xmm3,%xmm8
+	xorl	%ecx,%edi
 	movl	%eax,%esi
 	roll	$5,%eax
-	xorl	%ecx,%edi
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
 	addl	%eax,%ebp
-	rorl	$7,%ebx
-	addl	%edi,%ebp
+	pxor	%xmm1,%xmm5
 	addl	16(%rsp),%edx
-	pxor	%xmm1,%xmm5
-.byte	102,68,15,58,15,195,8
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
+	punpcklqdq	%xmm4,%xmm8
 	movl	%ebp,%edi
 	roll	$5,%ebp
 	pxor	%xmm6,%xmm5
-	xorl	%ebx,%esi
-	addl	%ebp,%edx
+	addl	%esi,%edx
+	xorl	%ebx,%edi
 	movdqa	%xmm9,%xmm10
+	rorl	$7,%eax
 	paddd	%xmm4,%xmm9
-	rorl	$7,%eax
-	addl	%esi,%edx
+	addl	%ebp,%edx
 	pxor	%xmm8,%xmm5
 	addl	20(%rsp),%ecx
-	xorl	%ebx,%edi
+	xorl	%eax,%edi
 	movl	%edx,%esi
 	roll	$5,%edx
 	movdqa	%xmm5,%xmm8
+	addl	%edi,%ecx
+	xorl	%eax,%esi
 	movdqa	%xmm9,0(%rsp)
-	xorl	%eax,%edi
+	rorl	$7,%ebp
 	addl	%edx,%ecx
-	rorl	$7,%ebp
-	addl	%edi,%ecx
+	addl	24(%rsp),%ebx
 	pslld	$2,%xmm5
-	addl	24(%rsp),%ebx
-	xorl	%eax,%esi
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
 	psrld	$30,%xmm8
-	movl	%ecx,%edi
 	roll	$5,%ecx
-	xorl	%ebp,%esi
-	addl	%ecx,%ebx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
 	rorl	$7,%edx
-	addl	%esi,%ebx
 	por	%xmm8,%xmm5
+	addl	%ecx,%ebx
 	addl	28(%rsp),%eax
-	xorl	%ebp,%edi
-	movdqa	%xmm5,%xmm9
+	pshufd	$238,%xmm4,%xmm9
+	rorl	$7,%ecx
 	movl	%ebx,%esi
+	xorl	%edx,%edi
 	roll	$5,%ebx
-	xorl	%edx,%edi
+	addl	%edi,%eax
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%edi,%eax
-	movl	%ecx,%edi
 	pxor	%xmm2,%xmm6
-.byte	102,68,15,58,15,204,8
-	xorl	%edx,%ecx
 	addl	32(%rsp),%ebp
-	andl	%edx,%edi
-	pxor	%xmm7,%xmm6
 	andl	%ecx,%esi
+	xorl	%edx,%ecx
 	rorl	$7,%ebx
+	punpcklqdq	%xmm5,%xmm9
+	movl	%eax,%edi
+	xorl	%ecx,%esi
+	pxor	%xmm7,%xmm6
+	roll	$5,%eax
+	addl	%esi,%ebp
 	movdqa	%xmm10,%xmm8
+	xorl	%ebx,%edi
 	paddd	%xmm5,%xmm10
-	addl	%edi,%ebp
-	movl	%eax,%edi
+	xorl	%ecx,%ebx
 	pxor	%xmm9,%xmm6
-	roll	$5,%eax
-	addl	%esi,%ebp
-	xorl	%edx,%ecx
 	addl	%eax,%ebp
-	movdqa	%xmm6,%xmm9
-	movdqa	%xmm10,16(%rsp)
-	movl	%ebx,%esi
-	xorl	%ecx,%ebx
 	addl	36(%rsp),%edx
-	andl	%ecx,%esi
-	pslld	$2,%xmm6
 	andl	%ebx,%edi
+	xorl	%ecx,%ebx
 	rorl	$7,%eax
-	psrld	$30,%xmm9
-	addl	%esi,%edx
+	movdqa	%xmm6,%xmm9
 	movl	%ebp,%esi
+	xorl	%ebx,%edi
+	movdqa	%xmm10,16(%rsp)
 	roll	$5,%ebp
 	addl	%edi,%edx
-	xorl	%ecx,%ebx
+	xorl	%eax,%esi
+	pslld	$2,%xmm6
+	xorl	%ebx,%eax
 	addl	%ebp,%edx
-	por	%xmm9,%xmm6
-	movl	%eax,%edi
-	xorl	%ebx,%eax
-	movdqa	%xmm6,%xmm10
+	psrld	$30,%xmm9
 	addl	40(%rsp),%ecx
-	andl	%ebx,%edi
 	andl	%eax,%esi
+	xorl	%ebx,%eax
+	por	%xmm9,%xmm6
 	rorl	$7,%ebp
-	addl	%edi,%ecx
 	movl	%edx,%edi
+	xorl	%eax,%esi
 	roll	$5,%edx
+	pshufd	$238,%xmm5,%xmm10
 	addl	%esi,%ecx
-	xorl	%ebx,%eax
+	xorl	%ebp,%edi
+	xorl	%eax,%ebp
 	addl	%edx,%ecx
-	movl	%ebp,%esi
-	xorl	%eax,%ebp
 	addl	44(%rsp),%ebx
-	andl	%eax,%esi
 	andl	%ebp,%edi
+	xorl	%eax,%ebp
 	rorl	$7,%edx
-	addl	%esi,%ebx
 	movl	%ecx,%esi
+	xorl	%ebp,%edi
 	roll	$5,%ecx
 	addl	%edi,%ebx
-	xorl	%eax,%ebp
+	xorl	%edx,%esi
+	xorl	%ebp,%edx
 	addl	%ecx,%ebx
-	movl	%edx,%edi
 	pxor	%xmm3,%xmm7
-.byte	102,68,15,58,15,213,8
-	xorl	%ebp,%edx
 	addl	48(%rsp),%eax
-	andl	%ebp,%edi
-	pxor	%xmm0,%xmm7
 	andl	%edx,%esi
+	xorl	%ebp,%edx
 	rorl	$7,%ecx
-	movdqa	48(%r11),%xmm9
-	paddd	%xmm6,%xmm8
-	addl	%edi,%eax
+	punpcklqdq	%xmm6,%xmm10
 	movl	%ebx,%edi
-	pxor	%xmm10,%xmm7
+	xorl	%edx,%esi
+	pxor	%xmm0,%xmm7
 	roll	$5,%ebx
 	addl	%esi,%eax
-	xorl	%ebp,%edx
+	movdqa	32(%r11),%xmm9
+	xorl	%ecx,%edi
+	paddd	%xmm6,%xmm8
+	xorl	%edx,%ecx
+	pxor	%xmm10,%xmm7
 	addl	%ebx,%eax
-	movdqa	%xmm7,%xmm10
-	movdqa	%xmm8,32(%rsp)
-	movl	%ecx,%esi
-	xorl	%edx,%ecx
 	addl	52(%rsp),%ebp
-	andl	%edx,%esi
-	pslld	$2,%xmm7
 	andl	%ecx,%edi
+	xorl	%edx,%ecx
 	rorl	$7,%ebx
-	psrld	$30,%xmm10
-	addl	%esi,%ebp
+	movdqa	%xmm7,%xmm10
 	movl	%eax,%esi
+	xorl	%ecx,%edi
+	movdqa	%xmm8,32(%rsp)
 	roll	$5,%eax
 	addl	%edi,%ebp
-	xorl	%edx,%ecx
+	xorl	%ebx,%esi
+	pslld	$2,%xmm7
+	xorl	%ecx,%ebx
 	addl	%eax,%ebp
-	por	%xmm10,%xmm7
-	movl	%ebx,%edi
-	xorl	%ecx,%ebx
-	movdqa	%xmm7,%xmm8
+	psrld	$30,%xmm10
 	addl	56(%rsp),%edx
-	andl	%ecx,%edi
 	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	por	%xmm10,%xmm7
 	rorl	$7,%eax
-	addl	%edi,%edx
 	movl	%ebp,%edi
+	xorl	%ebx,%esi
 	roll	$5,%ebp
+	pshufd	$238,%xmm6,%xmm8
 	addl	%esi,%edx
-	xorl	%ecx,%ebx
+	xorl	%eax,%edi
+	xorl	%ebx,%eax
 	addl	%ebp,%edx
-	movl	%eax,%esi
-	xorl	%ebx,%eax
 	addl	60(%rsp),%ecx
-	andl	%ebx,%esi
 	andl	%eax,%edi
+	xorl	%ebx,%eax
 	rorl	$7,%ebp
-	addl	%esi,%ecx
 	movl	%edx,%esi
+	xorl	%eax,%edi
 	roll	$5,%edx
 	addl	%edi,%ecx
-	xorl	%ebx,%eax
+	xorl	%ebp,%esi
+	xorl	%eax,%ebp
 	addl	%edx,%ecx
-	movl	%ebp,%edi
 	pxor	%xmm4,%xmm0
-.byte	102,68,15,58,15,198,8
-	xorl	%eax,%ebp
 	addl	0(%rsp),%ebx
-	andl	%eax,%edi
-	pxor	%xmm1,%xmm0
 	andl	%ebp,%esi
+	xorl	%eax,%ebp
 	rorl	$7,%edx
+	punpcklqdq	%xmm7,%xmm8
+	movl	%ecx,%edi
+	xorl	%ebp,%esi
+	pxor	%xmm1,%xmm0
+	roll	$5,%ecx
+	addl	%esi,%ebx
 	movdqa	%xmm9,%xmm10
+	xorl	%edx,%edi
 	paddd	%xmm7,%xmm9
-	addl	%edi,%ebx
-	movl	%ecx,%edi
+	xorl	%ebp,%edx
 	pxor	%xmm8,%xmm0
-	roll	$5,%ecx
-	addl	%esi,%ebx
-	xorl	%eax,%ebp
 	addl	%ecx,%ebx
-	movdqa	%xmm0,%xmm8
-	movdqa	%xmm9,48(%rsp)
-	movl	%edx,%esi
-	xorl	%ebp,%edx
 	addl	4(%rsp),%eax
-	andl	%ebp,%esi
-	pslld	$2,%xmm0
 	andl	%edx,%edi
+	xorl	%ebp,%edx
 	rorl	$7,%ecx
-	psrld	$30,%xmm8
-	addl	%esi,%eax
+	movdqa	%xmm0,%xmm8
 	movl	%ebx,%esi
+	xorl	%edx,%edi
+	movdqa	%xmm9,48(%rsp)
 	roll	$5,%ebx
 	addl	%edi,%eax
-	xorl	%ebp,%edx
+	xorl	%ecx,%esi
+	pslld	$2,%xmm0
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	por	%xmm8,%xmm0
-	movl	%ecx,%edi
-	xorl	%edx,%ecx
-	movdqa	%xmm0,%xmm9
+	psrld	$30,%xmm8
 	addl	8(%rsp),%ebp
-	andl	%edx,%edi
 	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	por	%xmm8,%xmm0
 	rorl	$7,%ebx
-	addl	%edi,%ebp
 	movl	%eax,%edi
+	xorl	%ecx,%esi
 	roll	$5,%eax
+	pshufd	$238,%xmm7,%xmm9
 	addl	%esi,%ebp
-	xorl	%edx,%ecx
+	xorl	%ebx,%edi
+	xorl	%ecx,%ebx
 	addl	%eax,%ebp
-	movl	%ebx,%esi
-	xorl	%ecx,%ebx
 	addl	12(%rsp),%edx
-	andl	%ecx,%esi
 	andl	%ebx,%edi
+	xorl	%ecx,%ebx
 	rorl	$7,%eax
-	addl	%esi,%edx
 	movl	%ebp,%esi
+	xorl	%ebx,%edi
 	roll	$5,%ebp
 	addl	%edi,%edx
-	xorl	%ecx,%ebx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
 	addl	%ebp,%edx
-	movl	%eax,%edi
 	pxor	%xmm5,%xmm1
-.byte	102,68,15,58,15,207,8
-	xorl	%ebx,%eax
 	addl	16(%rsp),%ecx
-	andl	%ebx,%edi
-	pxor	%xmm2,%xmm1
 	andl	%eax,%esi
+	xorl	%ebx,%eax
 	rorl	$7,%ebp
+	punpcklqdq	%xmm0,%xmm9
+	movl	%edx,%edi
+	xorl	%eax,%esi
+	pxor	%xmm2,%xmm1
+	roll	$5,%edx
+	addl	%esi,%ecx
 	movdqa	%xmm10,%xmm8
+	xorl	%ebp,%edi
 	paddd	%xmm0,%xmm10
-	addl	%edi,%ecx
-	movl	%edx,%edi
+	xorl	%eax,%ebp
 	pxor	%xmm9,%xmm1
-	roll	$5,%edx
-	addl	%esi,%ecx
-	xorl	%ebx,%eax
 	addl	%edx,%ecx
-	movdqa	%xmm1,%xmm9
-	movdqa	%xmm10,0(%rsp)
-	movl	%ebp,%esi
-	xorl	%eax,%ebp
 	addl	20(%rsp),%ebx
-	andl	%eax,%esi
-	pslld	$2,%xmm1
 	andl	%ebp,%edi
+	xorl	%eax,%ebp
 	rorl	$7,%edx
-	psrld	$30,%xmm9
-	addl	%esi,%ebx
+	movdqa	%xmm1,%xmm9
 	movl	%ecx,%esi
+	xorl	%ebp,%edi
+	movdqa	%xmm10,0(%rsp)
 	roll	$5,%ecx
 	addl	%edi,%ebx
-	xorl	%eax,%ebp
+	xorl	%edx,%esi
+	pslld	$2,%xmm1
+	xorl	%ebp,%edx
 	addl	%ecx,%ebx
-	por	%xmm9,%xmm1
-	movl	%edx,%edi
-	xorl	%ebp,%edx
-	movdqa	%xmm1,%xmm10
+	psrld	$30,%xmm9
 	addl	24(%rsp),%eax
-	andl	%ebp,%edi
 	andl	%edx,%esi
+	xorl	%ebp,%edx
+	por	%xmm9,%xmm1
 	rorl	$7,%ecx
-	addl	%edi,%eax
 	movl	%ebx,%edi
+	xorl	%edx,%esi
 	roll	$5,%ebx
+	pshufd	$238,%xmm0,%xmm10
 	addl	%esi,%eax
-	xorl	%ebp,%edx
+	xorl	%ecx,%edi
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	movl	%ecx,%esi
-	xorl	%edx,%ecx
 	addl	28(%rsp),%ebp
-	andl	%edx,%esi
 	andl	%ecx,%edi
+	xorl	%edx,%ecx
 	rorl	$7,%ebx
-	addl	%esi,%ebp
 	movl	%eax,%esi
+	xorl	%ecx,%edi
 	roll	$5,%eax
 	addl	%edi,%ebp
-	xorl	%edx,%ecx
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
 	addl	%eax,%ebp
-	movl	%ebx,%edi
 	pxor	%xmm6,%xmm2
-.byte	102,68,15,58,15,208,8
-	xorl	%ecx,%ebx
 	addl	32(%rsp),%edx
-	andl	%ecx,%edi
-	pxor	%xmm3,%xmm2
 	andl	%ebx,%esi
+	xorl	%ecx,%ebx
 	rorl	$7,%eax
+	punpcklqdq	%xmm1,%xmm10
+	movl	%ebp,%edi
+	xorl	%ebx,%esi
+	pxor	%xmm3,%xmm2
+	roll	$5,%ebp
+	addl	%esi,%edx
 	movdqa	%xmm8,%xmm9
+	xorl	%eax,%edi
 	paddd	%xmm1,%xmm8
-	addl	%edi,%edx
-	movl	%ebp,%edi
+	xorl	%ebx,%eax
 	pxor	%xmm10,%xmm2
-	roll	$5,%ebp
-	addl	%esi,%edx
-	xorl	%ecx,%ebx
 	addl	%ebp,%edx
-	movdqa	%xmm2,%xmm10
-	movdqa	%xmm8,16(%rsp)
-	movl	%eax,%esi
-	xorl	%ebx,%eax
 	addl	36(%rsp),%ecx
-	andl	%ebx,%esi
-	pslld	$2,%xmm2
 	andl	%eax,%edi
+	xorl	%ebx,%eax
 	rorl	$7,%ebp
-	psrld	$30,%xmm10
-	addl	%esi,%ecx
+	movdqa	%xmm2,%xmm10
 	movl	%edx,%esi
+	xorl	%eax,%edi
+	movdqa	%xmm8,16(%rsp)
 	roll	$5,%edx
 	addl	%edi,%ecx
-	xorl	%ebx,%eax
+	xorl	%ebp,%esi
+	pslld	$2,%xmm2
+	xorl	%eax,%ebp
 	addl	%edx,%ecx
-	por	%xmm10,%xmm2
-	movl	%ebp,%edi
-	xorl	%eax,%ebp
-	movdqa	%xmm2,%xmm8
+	psrld	$30,%xmm10
 	addl	40(%rsp),%ebx
-	andl	%eax,%edi
 	andl	%ebp,%esi
+	xorl	%eax,%ebp
+	por	%xmm10,%xmm2
 	rorl	$7,%edx
-	addl	%edi,%ebx
 	movl	%ecx,%edi
+	xorl	%ebp,%esi
 	roll	$5,%ecx
+	pshufd	$238,%xmm1,%xmm8
 	addl	%esi,%ebx
-	xorl	%eax,%ebp
+	xorl	%edx,%edi
+	xorl	%ebp,%edx
 	addl	%ecx,%ebx
-	movl	%edx,%esi
-	xorl	%ebp,%edx
 	addl	44(%rsp),%eax
-	andl	%ebp,%esi
 	andl	%edx,%edi
+	xorl	%ebp,%edx
 	rorl	$7,%ecx
-	addl	%esi,%eax
 	movl	%ebx,%esi
+	xorl	%edx,%edi
 	roll	$5,%ebx
 	addl	%edi,%eax
-	xorl	%ebp,%edx
+	xorl	%edx,%esi
 	addl	%ebx,%eax
+	pxor	%xmm7,%xmm3
 	addl	48(%rsp),%ebp
-	pxor	%xmm7,%xmm3
-.byte	102,68,15,58,15,193,8
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
+	punpcklqdq	%xmm2,%xmm8
 	movl	%eax,%edi
 	roll	$5,%eax
 	pxor	%xmm4,%xmm3
-	xorl	%ecx,%esi
-	addl	%eax,%ebp
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
 	movdqa	%xmm9,%xmm10
+	rorl	$7,%ebx
 	paddd	%xmm2,%xmm9
-	rorl	$7,%ebx
-	addl	%esi,%ebp
+	addl	%eax,%ebp
 	pxor	%xmm8,%xmm3
 	addl	52(%rsp),%edx
-	xorl	%ecx,%edi
+	xorl	%ebx,%edi
 	movl	%ebp,%esi
 	roll	$5,%ebp
 	movdqa	%xmm3,%xmm8
+	addl	%edi,%edx
+	xorl	%ebx,%esi
 	movdqa	%xmm9,32(%rsp)
-	xorl	%ebx,%edi
+	rorl	$7,%eax
 	addl	%ebp,%edx
-	rorl	$7,%eax
-	addl	%edi,%edx
+	addl	56(%rsp),%ecx
 	pslld	$2,%xmm3
-	addl	56(%rsp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
+	movl	%edx,%edi
 	psrld	$30,%xmm8
-	movl	%edx,%edi
 	roll	$5,%edx
-	xorl	%eax,%esi
-	addl	%edx,%ecx
+	addl	%esi,%ecx
+	xorl	%eax,%edi
 	rorl	$7,%ebp
-	addl	%esi,%ecx
 	por	%xmm8,%xmm3
+	addl	%edx,%ecx
 	addl	60(%rsp),%ebx
-	xorl	%eax,%edi
+	xorl	%ebp,%edi
 	movl	%ecx,%esi
 	roll	$5,%ecx
-	xorl	%ebp,%edi
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%edi,%ebx
 	addl	0(%rsp),%eax
-	paddd	%xmm3,%xmm10
-	xorl	%ebp,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%edi
 	roll	$5,%ebx
-	xorl	%edx,%esi
+	paddd	%xmm3,%xmm10
+	addl	%esi,%eax
+	xorl	%edx,%edi
 	movdqa	%xmm10,48(%rsp)
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%esi,%eax
 	addl	4(%rsp),%ebp
-	xorl	%edx,%edi
+	xorl	%ecx,%edi
 	movl	%eax,%esi
 	roll	$5,%eax
-	xorl	%ecx,%edi
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
 	addl	%eax,%ebp
-	rorl	$7,%ebx
-	addl	%edi,%ebp
 	addl	8(%rsp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	movl	%ebp,%edi
 	roll	$5,%ebp
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	rorl	$7,%eax
 	addl	%ebp,%edx
-	rorl	$7,%eax
-	addl	%esi,%edx
 	addl	12(%rsp),%ecx
-	xorl	%ebx,%edi
+	xorl	%eax,%edi
 	movl	%edx,%esi
 	roll	$5,%edx
-	xorl	%eax,%edi
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%ebp
 	addl	%edx,%ecx
-	rorl	$7,%ebp
-	addl	%edi,%ecx
 	cmpq	%r10,%r9
 	je	.Ldone_ssse3
 	movdqa	64(%r11),%xmm6
-	movdqa	0(%r11),%xmm9
+	movdqa	-64(%r11),%xmm9
 	movdqu	0(%r9),%xmm0
 	movdqu	16(%r9),%xmm1
 	movdqu	32(%r9),%xmm2
@@ -2247,113 +2346,112 @@
 .byte	102,15,56,0,198
 	addq	$64,%r9
 	addl	16(%rsp),%ebx
-	xorl	%eax,%esi
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
 .byte	102,15,56,0,206
-	movl	%ecx,%edi
 	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	rorl	$7,%edx
 	paddd	%xmm9,%xmm0
-	xorl	%ebp,%esi
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%esi,%ebx
-	movdqa	%xmm0,0(%rsp)
 	addl	20(%rsp),%eax
-	xorl	%ebp,%edi
-	psubd	%xmm9,%xmm0
+	xorl	%edx,%edi
 	movl	%ebx,%esi
+	movdqa	%xmm0,0(%rsp)
 	roll	$5,%ebx
-	xorl	%edx,%edi
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
+	psubd	%xmm9,%xmm0
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%edi,%eax
 	addl	24(%rsp),%ebp
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%edi
 	roll	$5,%eax
-	xorl	%ecx,%esi
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	rorl	$7,%ebx
 	addl	%eax,%ebp
-	rorl	$7,%ebx
-	addl	%esi,%ebp
 	addl	28(%rsp),%edx
-	xorl	%ecx,%edi
+	xorl	%ebx,%edi
 	movl	%ebp,%esi
 	roll	$5,%ebp
-	xorl	%ebx,%edi
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
 	addl	%ebp,%edx
-	rorl	$7,%eax
-	addl	%edi,%edx
 	addl	32(%rsp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
+	movl	%edx,%edi
 .byte	102,15,56,0,214
-	movl	%edx,%edi
 	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	rorl	$7,%ebp
 	paddd	%xmm9,%xmm1
-	xorl	%eax,%esi
 	addl	%edx,%ecx
-	rorl	$7,%ebp
-	addl	%esi,%ecx
-	movdqa	%xmm1,16(%rsp)
 	addl	36(%rsp),%ebx
-	xorl	%eax,%edi
-	psubd	%xmm9,%xmm1
+	xorl	%ebp,%edi
 	movl	%ecx,%esi
+	movdqa	%xmm1,16(%rsp)
 	roll	$5,%ecx
-	xorl	%ebp,%edi
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	rorl	$7,%edx
+	psubd	%xmm9,%xmm1
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%edi,%ebx
 	addl	40(%rsp),%eax
-	xorl	%ebp,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%edi
 	roll	$5,%ebx
-	xorl	%edx,%esi
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%esi,%eax
 	addl	44(%rsp),%ebp
-	xorl	%edx,%edi
+	xorl	%ecx,%edi
 	movl	%eax,%esi
 	roll	$5,%eax
-	xorl	%ecx,%edi
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
 	addl	%eax,%ebp
-	rorl	$7,%ebx
-	addl	%edi,%ebp
 	addl	48(%rsp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
 .byte	102,15,56,0,222
-	movl	%ebp,%edi
 	roll	$5,%ebp
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	rorl	$7,%eax
 	paddd	%xmm9,%xmm2
-	xorl	%ebx,%esi
 	addl	%ebp,%edx
-	rorl	$7,%eax
-	addl	%esi,%edx
-	movdqa	%xmm2,32(%rsp)
 	addl	52(%rsp),%ecx
-	xorl	%ebx,%edi
-	psubd	%xmm9,%xmm2
+	xorl	%eax,%edi
 	movl	%edx,%esi
+	movdqa	%xmm2,32(%rsp)
 	roll	$5,%edx
-	xorl	%eax,%edi
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%ebp
+	psubd	%xmm9,%xmm2
 	addl	%edx,%ecx
-	rorl	$7,%ebp
-	addl	%edi,%ecx
 	addl	56(%rsp),%ebx
-	xorl	%eax,%esi
+	xorl	%ebp,%esi
 	movl	%ecx,%edi
 	roll	$5,%ecx
-	xorl	%ebp,%esi
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%esi,%ebx
 	addl	60(%rsp),%eax
-	xorl	%ebp,%edi
+	xorl	%edx,%edi
 	movl	%ebx,%esi
 	roll	$5,%ebx
-	xorl	%edx,%edi
+	addl	%edi,%eax
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%edi,%eax
 	addl	0(%r8),%eax
 	addl	4(%r8),%esi
 	addl	8(%r8),%ecx
@@ -2363,108 +2461,110 @@
 	movl	%esi,4(%r8)
 	movl	%esi,%ebx
 	movl	%ecx,8(%r8)
+	movl	%ecx,%edi
 	movl	%edx,12(%r8)
+	xorl	%edx,%edi
 	movl	%ebp,16(%r8)
+	andl	%edi,%esi
 	jmp	.Loop_ssse3
 
 .align	16
 .Ldone_ssse3:
 	addl	16(%rsp),%ebx
-	xorl	%eax,%esi
+	xorl	%ebp,%esi
 	movl	%ecx,%edi
 	roll	$5,%ecx
-	xorl	%ebp,%esi
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%esi,%ebx
 	addl	20(%rsp),%eax
-	xorl	%ebp,%edi
+	xorl	%edx,%edi
 	movl	%ebx,%esi
 	roll	$5,%ebx
-	xorl	%edx,%edi
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%edi,%eax
 	addl	24(%rsp),%ebp
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%edi
 	roll	$5,%eax
-	xorl	%ecx,%esi
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	rorl	$7,%ebx
 	addl	%eax,%ebp
-	rorl	$7,%ebx
-	addl	%esi,%ebp
 	addl	28(%rsp),%edx
-	xorl	%ecx,%edi
+	xorl	%ebx,%edi
 	movl	%ebp,%esi
 	roll	$5,%ebp
-	xorl	%ebx,%edi
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
 	addl	%ebp,%edx
-	rorl	$7,%eax
-	addl	%edi,%edx
 	addl	32(%rsp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	movl	%edx,%edi
 	roll	$5,%edx
-	xorl	%eax,%esi
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	rorl	$7,%ebp
 	addl	%edx,%ecx
-	rorl	$7,%ebp
-	addl	%esi,%ecx
 	addl	36(%rsp),%ebx
-	xorl	%eax,%edi
+	xorl	%ebp,%edi
 	movl	%ecx,%esi
 	roll	$5,%ecx
-	xorl	%ebp,%edi
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%edi,%ebx
 	addl	40(%rsp),%eax
-	xorl	%ebp,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%edi
 	roll	$5,%ebx
-	xorl	%edx,%esi
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%esi,%eax
 	addl	44(%rsp),%ebp
-	xorl	%edx,%edi
+	xorl	%ecx,%edi
 	movl	%eax,%esi
 	roll	$5,%eax
-	xorl	%ecx,%edi
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
 	addl	%eax,%ebp
-	rorl	$7,%ebx
-	addl	%edi,%ebp
 	addl	48(%rsp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	movl	%ebp,%edi
 	roll	$5,%ebp
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	rorl	$7,%eax
 	addl	%ebp,%edx
-	rorl	$7,%eax
-	addl	%esi,%edx
 	addl	52(%rsp),%ecx
-	xorl	%ebx,%edi
+	xorl	%eax,%edi
 	movl	%edx,%esi
 	roll	$5,%edx
-	xorl	%eax,%edi
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%ebp
 	addl	%edx,%ecx
-	rorl	$7,%ebp
-	addl	%edi,%ecx
 	addl	56(%rsp),%ebx
-	xorl	%eax,%esi
+	xorl	%ebp,%esi
 	movl	%ecx,%edi
 	roll	$5,%ecx
-	xorl	%ebp,%esi
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%esi,%ebx
 	addl	60(%rsp),%eax
-	xorl	%ebp,%edi
+	xorl	%edx,%edi
 	movl	%ebx,%esi
 	roll	$5,%ebx
-	xorl	%edx,%edi
+	addl	%edi,%eax
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%edi,%eax
 	addl	0(%r8),%eax
 	addl	4(%r8),%esi
 	addl	8(%r8),%ecx
@@ -2475,11 +2575,13 @@
 	movl	%ecx,8(%r8)
 	movl	%edx,12(%r8)
 	movl	%ebp,16(%r8)
-	leaq	64(%rsp),%rsi
-	movq	0(%rsi),%r12
-	movq	8(%rsi),%rbp
-	movq	16(%rsi),%rbx
-	leaq	24(%rsi),%rsp
+	leaq	(%r14),%rsi
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 .Lepilogue_ssse3:
 	.byte	0xf3,0xc3
 .size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
@@ -2487,18 +2589,23 @@
 .align	16
 sha1_block_data_order_avx:
 _avx_shortcut:
+	movq	%rsp,%rax
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
+	pushq	%r13
+	pushq	%r14
 	leaq	-64(%rsp),%rsp
+	vzeroupper
+	movq	%rax,%r14
+	andq	$-64,%rsp
 	movq	%rdi,%r8
 	movq	%rsi,%r9
 	movq	%rdx,%r10
-	vzeroupper
 
 	shlq	$6,%r10
 	addq	%r9,%r10
-	leaq	K_XX_XX(%rip),%r11
+	leaq	K_XX_XX+64(%rip),%r11
 
 	movl	0(%r8),%eax
 	movl	4(%r8),%ebx
@@ -2506,9 +2613,12 @@
 	movl	12(%r8),%edx
 	movl	%ebx,%esi
 	movl	16(%r8),%ebp
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	andl	%edi,%esi
 
 	vmovdqa	64(%r11),%xmm6
-	vmovdqa	0(%r11),%xmm9
+	vmovdqa	-64(%r11),%xmm11
 	vmovdqu	0(%r9),%xmm0
 	vmovdqu	16(%r9),%xmm1
 	vmovdqu	32(%r9),%xmm2
@@ -2518,9 +2628,9 @@
 	vpshufb	%xmm6,%xmm1,%xmm1
 	vpshufb	%xmm6,%xmm2,%xmm2
 	vpshufb	%xmm6,%xmm3,%xmm3
-	vpaddd	%xmm9,%xmm0,%xmm4
-	vpaddd	%xmm9,%xmm1,%xmm5
-	vpaddd	%xmm9,%xmm2,%xmm6
+	vpaddd	%xmm11,%xmm0,%xmm4
+	vpaddd	%xmm11,%xmm1,%xmm5
+	vpaddd	%xmm11,%xmm2,%xmm6
 	vmovdqa	%xmm4,0(%rsp)
 	vmovdqa	%xmm5,16(%rsp)
 	vmovdqa	%xmm6,32(%rsp)
@@ -2527,871 +2637,825 @@
 	jmp	.Loop_avx
 .align	16
 .Loop_avx:
-	addl	0(%rsp),%ebp
-	xorl	%edx,%ecx
+	shrdl	$2,%ebx,%ebx
+	xorl	%edx,%esi
 	vpalignr	$8,%xmm0,%xmm1,%xmm4
 	movl	%eax,%edi
+	addl	0(%rsp),%ebp
+	vpaddd	%xmm3,%xmm11,%xmm9
+	xorl	%ecx,%ebx
 	shldl	$5,%eax,%eax
-	vpaddd	%xmm3,%xmm9,%xmm9
-	andl	%ecx,%esi
-	xorl	%edx,%ecx
 	vpsrldq	$4,%xmm3,%xmm8
-	xorl	%edx,%esi
+	addl	%esi,%ebp
+	andl	%ebx,%edi
+	vpxor	%xmm0,%xmm4,%xmm4
+	xorl	%ecx,%ebx
 	addl	%eax,%ebp
-	vpxor	%xmm0,%xmm4,%xmm4
-	shrdl	$2,%ebx,%ebx
-	addl	%esi,%ebp
 	vpxor	%xmm2,%xmm8,%xmm8
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
 	addl	4(%rsp),%edx
-	xorl	%ecx,%ebx
-	movl	%ebp,%esi
+	vpxor	%xmm8,%xmm4,%xmm4
+	xorl	%ebx,%eax
 	shldl	$5,%ebp,%ebp
-	vpxor	%xmm8,%xmm4,%xmm4
-	andl	%ebx,%edi
-	xorl	%ecx,%ebx
 	vmovdqa	%xmm9,48(%rsp)
-	xorl	%ecx,%edi
-	addl	%ebp,%edx
+	addl	%edi,%edx
+	andl	%eax,%esi
 	vpsrld	$31,%xmm4,%xmm8
-	shrdl	$7,%eax,%eax
-	addl	%edi,%edx
-	addl	8(%rsp),%ecx
 	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	shrdl	$7,%ebp,%ebp
+	xorl	%ebx,%esi
 	vpslldq	$12,%xmm4,%xmm10
 	vpaddd	%xmm4,%xmm4,%xmm4
 	movl	%edx,%edi
+	addl	8(%rsp),%ecx
+	xorl	%eax,%ebp
 	shldl	$5,%edx,%edx
-	andl	%eax,%esi
-	xorl	%ebx,%eax
 	vpsrld	$30,%xmm10,%xmm9
 	vpor	%xmm8,%xmm4,%xmm4
-	xorl	%ebx,%esi
+	addl	%esi,%ecx
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
 	addl	%edx,%ecx
-	shrdl	$7,%ebp,%ebp
-	addl	%esi,%ecx
 	vpslld	$2,%xmm10,%xmm10
 	vpxor	%xmm9,%xmm4,%xmm4
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%edi
+	movl	%ecx,%esi
 	addl	12(%rsp),%ebx
-	xorl	%eax,%ebp
-	movl	%ecx,%esi
+	vpxor	%xmm10,%xmm4,%xmm4
+	xorl	%ebp,%edx
 	shldl	$5,%ecx,%ecx
-	vpxor	%xmm10,%xmm4,%xmm4
-	andl	%ebp,%edi
-	xorl	%eax,%ebp
-	vmovdqa	0(%r11),%xmm10
-	xorl	%eax,%edi
-	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
 	addl	%edi,%ebx
-	addl	16(%rsp),%eax
+	andl	%edx,%esi
 	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	shrdl	$7,%ecx,%ecx
+	xorl	%ebp,%esi
 	vpalignr	$8,%xmm1,%xmm2,%xmm5
 	movl	%ebx,%edi
+	addl	16(%rsp),%eax
+	vpaddd	%xmm4,%xmm11,%xmm9
+	xorl	%edx,%ecx
 	shldl	$5,%ebx,%ebx
-	vpaddd	%xmm4,%xmm10,%xmm10
-	andl	%edx,%esi
-	xorl	%ebp,%edx
-	vpsrldq	$4,%xmm4,%xmm9
-	xorl	%ebp,%esi
-	addl	%ebx,%eax
+	vpsrldq	$4,%xmm4,%xmm8
+	addl	%esi,%eax
+	andl	%ecx,%edi
 	vpxor	%xmm1,%xmm5,%xmm5
-	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
-	vpxor	%xmm3,%xmm9,%xmm9
-	addl	20(%rsp),%ebp
 	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	vpxor	%xmm3,%xmm8,%xmm8
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%edi
 	movl	%eax,%esi
+	addl	20(%rsp),%ebp
+	vpxor	%xmm8,%xmm5,%xmm5
+	xorl	%ecx,%ebx
 	shldl	$5,%eax,%eax
-	vpxor	%xmm9,%xmm5,%xmm5
-	andl	%ecx,%edi
-	xorl	%edx,%ecx
-	vmovdqa	%xmm10,0(%rsp)
-	xorl	%edx,%edi
-	addl	%eax,%ebp
-	vpsrld	$31,%xmm5,%xmm9
-	shrdl	$7,%ebx,%ebx
+	vmovdqa	%xmm9,0(%rsp)
 	addl	%edi,%ebp
-	addl	24(%rsp),%edx
+	andl	%ebx,%esi
+	vpsrld	$31,%xmm5,%xmm8
 	xorl	%ecx,%ebx
-	vpslldq	$12,%xmm5,%xmm8
+	addl	%eax,%ebp
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%esi
+	vpslldq	$12,%xmm5,%xmm10
 	vpaddd	%xmm5,%xmm5,%xmm5
 	movl	%ebp,%edi
+	addl	24(%rsp),%edx
+	xorl	%ebx,%eax
 	shldl	$5,%ebp,%ebp
-	andl	%ebx,%esi
-	xorl	%ecx,%ebx
-	vpsrld	$30,%xmm8,%xmm10
-	vpor	%xmm9,%xmm5,%xmm5
-	xorl	%ecx,%esi
-	addl	%ebp,%edx
-	shrdl	$7,%eax,%eax
+	vpsrld	$30,%xmm10,%xmm9
+	vpor	%xmm8,%xmm5,%xmm5
 	addl	%esi,%edx
-	vpslld	$2,%xmm8,%xmm8
-	vpxor	%xmm10,%xmm5,%xmm5
-	addl	28(%rsp),%ecx
+	andl	%eax,%edi
 	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	vpslld	$2,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm5,%xmm5
+	shrdl	$7,%ebp,%ebp
+	xorl	%ebx,%edi
 	movl	%edx,%esi
+	addl	28(%rsp),%ecx
+	vpxor	%xmm10,%xmm5,%xmm5
+	xorl	%eax,%ebp
 	shldl	$5,%edx,%edx
-	vpxor	%xmm8,%xmm5,%xmm5
-	andl	%eax,%edi
-	xorl	%ebx,%eax
-	vmovdqa	16(%r11),%xmm8
-	xorl	%ebx,%edi
-	addl	%edx,%ecx
-	shrdl	$7,%ebp,%ebp
+	vmovdqa	-32(%r11),%xmm11
 	addl	%edi,%ecx
-	addl	32(%rsp),%ebx
+	andl	%ebp,%esi
 	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%esi
 	vpalignr	$8,%xmm2,%xmm3,%xmm6
 	movl	%ecx,%edi
+	addl	32(%rsp),%ebx
+	vpaddd	%xmm5,%xmm11,%xmm9
+	xorl	%ebp,%edx
 	shldl	$5,%ecx,%ecx
-	vpaddd	%xmm5,%xmm8,%xmm8
-	andl	%ebp,%esi
-	xorl	%eax,%ebp
-	vpsrldq	$4,%xmm5,%xmm10
-	xorl	%eax,%esi
-	addl	%ecx,%ebx
+	vpsrldq	$4,%xmm5,%xmm8
+	addl	%esi,%ebx
+	andl	%edx,%edi
 	vpxor	%xmm2,%xmm6,%xmm6
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
-	vpxor	%xmm4,%xmm10,%xmm10
-	addl	36(%rsp),%eax
 	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	vpxor	%xmm4,%xmm8,%xmm8
+	shrdl	$7,%ecx,%ecx
+	xorl	%ebp,%edi
 	movl	%ebx,%esi
+	addl	36(%rsp),%eax
+	vpxor	%xmm8,%xmm6,%xmm6
+	xorl	%edx,%ecx
 	shldl	$5,%ebx,%ebx
-	vpxor	%xmm10,%xmm6,%xmm6
-	andl	%edx,%edi
-	xorl	%ebp,%edx
-	vmovdqa	%xmm8,16(%rsp)
-	xorl	%ebp,%edi
-	addl	%ebx,%eax
-	vpsrld	$31,%xmm6,%xmm10
-	shrdl	$7,%ecx,%ecx
+	vmovdqa	%xmm9,16(%rsp)
 	addl	%edi,%eax
-	addl	40(%rsp),%ebp
+	andl	%ecx,%esi
+	vpsrld	$31,%xmm6,%xmm8
 	xorl	%edx,%ecx
-	vpslldq	$12,%xmm6,%xmm9
+	addl	%ebx,%eax
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%esi
+	vpslldq	$12,%xmm6,%xmm10
 	vpaddd	%xmm6,%xmm6,%xmm6
 	movl	%eax,%edi
+	addl	40(%rsp),%ebp
+	xorl	%ecx,%ebx
 	shldl	$5,%eax,%eax
-	andl	%ecx,%esi
-	xorl	%edx,%ecx
-	vpsrld	$30,%xmm9,%xmm8
-	vpor	%xmm10,%xmm6,%xmm6
-	xorl	%edx,%esi
-	addl	%eax,%ebp
-	shrdl	$7,%ebx,%ebx
+	vpsrld	$30,%xmm10,%xmm9
+	vpor	%xmm8,%xmm6,%xmm6
 	addl	%esi,%ebp
-	vpslld	$2,%xmm9,%xmm9
-	vpxor	%xmm8,%xmm6,%xmm6
-	addl	44(%rsp),%edx
+	andl	%ebx,%edi
 	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	vpslld	$2,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm6,%xmm6
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%edi
 	movl	%ebp,%esi
+	addl	44(%rsp),%edx
+	vpxor	%xmm10,%xmm6,%xmm6
+	xorl	%ebx,%eax
 	shldl	$5,%ebp,%ebp
-	vpxor	%xmm9,%xmm6,%xmm6
-	andl	%ebx,%edi
-	xorl	%ecx,%ebx
-	vmovdqa	16(%r11),%xmm9
-	xorl	%ecx,%edi
-	addl	%ebp,%edx
-	shrdl	$7,%eax,%eax
 	addl	%edi,%edx
-	addl	48(%rsp),%ecx
+	andl	%eax,%esi
 	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	shrdl	$7,%ebp,%ebp
+	xorl	%ebx,%esi
 	vpalignr	$8,%xmm3,%xmm4,%xmm7
 	movl	%edx,%edi
+	addl	48(%rsp),%ecx
+	vpaddd	%xmm6,%xmm11,%xmm9
+	xorl	%eax,%ebp
 	shldl	$5,%edx,%edx
-	vpaddd	%xmm6,%xmm9,%xmm9
-	andl	%eax,%esi
-	xorl	%ebx,%eax
 	vpsrldq	$4,%xmm6,%xmm8
-	xorl	%ebx,%esi
+	addl	%esi,%ecx
+	andl	%ebp,%edi
+	vpxor	%xmm3,%xmm7,%xmm7
+	xorl	%eax,%ebp
 	addl	%edx,%ecx
-	vpxor	%xmm3,%xmm7,%xmm7
-	shrdl	$7,%ebp,%ebp
-	addl	%esi,%ecx
 	vpxor	%xmm5,%xmm8,%xmm8
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%edi
+	movl	%ecx,%esi
 	addl	52(%rsp),%ebx
-	xorl	%eax,%ebp
-	movl	%ecx,%esi
+	vpxor	%xmm8,%xmm7,%xmm7
+	xorl	%ebp,%edx
 	shldl	$5,%ecx,%ecx
-	vpxor	%xmm8,%xmm7,%xmm7
-	andl	%ebp,%edi
-	xorl	%eax,%ebp
 	vmovdqa	%xmm9,32(%rsp)
-	xorl	%eax,%edi
-	addl	%ecx,%ebx
+	addl	%edi,%ebx
+	andl	%edx,%esi
 	vpsrld	$31,%xmm7,%xmm8
-	shrdl	$7,%edx,%edx
-	addl	%edi,%ebx
-	addl	56(%rsp),%eax
 	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	shrdl	$7,%ecx,%ecx
+	xorl	%ebp,%esi
 	vpslldq	$12,%xmm7,%xmm10
 	vpaddd	%xmm7,%xmm7,%xmm7
 	movl	%ebx,%edi
+	addl	56(%rsp),%eax
+	xorl	%edx,%ecx
 	shldl	$5,%ebx,%ebx
-	andl	%edx,%esi
-	xorl	%ebp,%edx
 	vpsrld	$30,%xmm10,%xmm9
 	vpor	%xmm8,%xmm7,%xmm7
-	xorl	%ebp,%esi
+	addl	%esi,%eax
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	vpslld	$2,%xmm10,%xmm10
 	vpxor	%xmm9,%xmm7,%xmm7
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%edi
+	movl	%eax,%esi
 	addl	60(%rsp),%ebp
-	xorl	%edx,%ecx
-	movl	%eax,%esi
+	vpxor	%xmm10,%xmm7,%xmm7
+	xorl	%ecx,%ebx
 	shldl	$5,%eax,%eax
-	vpxor	%xmm10,%xmm7,%xmm7
-	andl	%ecx,%edi
-	xorl	%edx,%ecx
-	vmovdqa	16(%r11),%xmm10
-	xorl	%edx,%edi
+	addl	%edi,%ebp
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
 	addl	%eax,%ebp
-	shrdl	$7,%ebx,%ebx
-	addl	%edi,%ebp
-	vpalignr	$8,%xmm6,%xmm7,%xmm9
+	vpalignr	$8,%xmm6,%xmm7,%xmm8
 	vpxor	%xmm4,%xmm0,%xmm0
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%esi
+	movl	%ebp,%edi
 	addl	0(%rsp),%edx
-	xorl	%ecx,%ebx
-	movl	%ebp,%edi
+	vpxor	%xmm1,%xmm0,%xmm0
+	xorl	%ebx,%eax
 	shldl	$5,%ebp,%ebp
-	vpxor	%xmm1,%xmm0,%xmm0
-	andl	%ebx,%esi
-	xorl	%ecx,%ebx
-	vmovdqa	%xmm10,%xmm8
-	vpaddd	%xmm7,%xmm10,%xmm10
-	xorl	%ecx,%esi
-	addl	%ebp,%edx
-	vpxor	%xmm9,%xmm0,%xmm0
-	shrdl	$7,%eax,%eax
+	vpaddd	%xmm7,%xmm11,%xmm9
 	addl	%esi,%edx
-	addl	4(%rsp),%ecx
+	andl	%eax,%edi
+	vpxor	%xmm8,%xmm0,%xmm0
 	xorl	%ebx,%eax
-	vpsrld	$30,%xmm0,%xmm9
-	vmovdqa	%xmm10,48(%rsp)
+	addl	%ebp,%edx
+	shrdl	$7,%ebp,%ebp
+	xorl	%ebx,%edi
+	vpsrld	$30,%xmm0,%xmm8
+	vmovdqa	%xmm9,48(%rsp)
 	movl	%edx,%esi
+	addl	4(%rsp),%ecx
+	xorl	%eax,%ebp
 	shldl	$5,%edx,%edx
-	andl	%eax,%edi
-	xorl	%ebx,%eax
 	vpslld	$2,%xmm0,%xmm0
-	xorl	%ebx,%edi
-	addl	%edx,%ecx
-	shrdl	$7,%ebp,%ebp
 	addl	%edi,%ecx
-	addl	8(%rsp),%ebx
+	andl	%ebp,%esi
 	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%esi
 	movl	%ecx,%edi
+	addl	8(%rsp),%ebx
+	vpor	%xmm8,%xmm0,%xmm0
+	xorl	%ebp,%edx
 	shldl	$5,%ecx,%ecx
-	vpor	%xmm9,%xmm0,%xmm0
-	andl	%ebp,%esi
-	xorl	%eax,%ebp
-	vmovdqa	%xmm0,%xmm10
-	xorl	%eax,%esi
+	addl	%esi,%ebx
+	andl	%edx,%edi
+	xorl	%ebp,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	addl	12(%rsp),%eax
-	xorl	%ebp,%edx
+	xorl	%ebp,%edi
 	movl	%ebx,%esi
 	shldl	$5,%ebx,%ebx
-	andl	%edx,%edi
-	xorl	%ebp,%edx
-	xorl	%ebp,%edi
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%edi,%eax
-	vpalignr	$8,%xmm7,%xmm0,%xmm10
+	vpalignr	$8,%xmm7,%xmm0,%xmm8
 	vpxor	%xmm5,%xmm1,%xmm1
 	addl	16(%rsp),%ebp
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%edi
 	shldl	$5,%eax,%eax
 	vpxor	%xmm2,%xmm1,%xmm1
-	xorl	%ecx,%esi
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	vpaddd	%xmm0,%xmm11,%xmm9
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%ebp
-	vmovdqa	%xmm8,%xmm9
-	vpaddd	%xmm0,%xmm8,%xmm8
-	shrdl	$7,%ebx,%ebx
-	addl	%esi,%ebp
-	vpxor	%xmm10,%xmm1,%xmm1
+	vpxor	%xmm8,%xmm1,%xmm1
 	addl	20(%rsp),%edx
-	xorl	%ecx,%edi
+	xorl	%ebx,%edi
 	movl	%ebp,%esi
 	shldl	$5,%ebp,%ebp
-	vpsrld	$30,%xmm1,%xmm10
-	vmovdqa	%xmm8,0(%rsp)
-	xorl	%ebx,%edi
+	vpsrld	$30,%xmm1,%xmm8
+	vmovdqa	%xmm9,0(%rsp)
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
 	addl	%ebp,%edx
-	shrdl	$7,%eax,%eax
-	addl	%edi,%edx
 	vpslld	$2,%xmm1,%xmm1
 	addl	24(%rsp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	movl	%edx,%edi
 	shldl	$5,%edx,%edx
-	xorl	%eax,%esi
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	shrdl	$7,%ebp,%ebp
 	addl	%edx,%ecx
-	shrdl	$7,%ebp,%ebp
-	addl	%esi,%ecx
-	vpor	%xmm10,%xmm1,%xmm1
+	vpor	%xmm8,%xmm1,%xmm1
 	addl	28(%rsp),%ebx
-	xorl	%eax,%edi
-	vmovdqa	%xmm1,%xmm8
+	xorl	%ebp,%edi
 	movl	%ecx,%esi
 	shldl	$5,%ecx,%ecx
-	xorl	%ebp,%edi
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%edi,%ebx
 	vpalignr	$8,%xmm0,%xmm1,%xmm8
 	vpxor	%xmm6,%xmm2,%xmm2
 	addl	32(%rsp),%eax
-	xorl	%ebp,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%edi
 	shldl	$5,%ebx,%ebx
 	vpxor	%xmm3,%xmm2,%xmm2
-	xorl	%edx,%esi
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	vpaddd	%xmm1,%xmm11,%xmm9
+	vmovdqa	0(%r11),%xmm11
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	vmovdqa	32(%r11),%xmm10
-	vpaddd	%xmm1,%xmm9,%xmm9
-	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	vpxor	%xmm8,%xmm2,%xmm2
 	addl	36(%rsp),%ebp
-	xorl	%edx,%edi
+	xorl	%ecx,%edi
 	movl	%eax,%esi
 	shldl	$5,%eax,%eax
 	vpsrld	$30,%xmm2,%xmm8
 	vmovdqa	%xmm9,16(%rsp)
-	xorl	%ecx,%edi
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%ebp
-	shrdl	$7,%ebx,%ebx
-	addl	%edi,%ebp
 	vpslld	$2,%xmm2,%xmm2
 	addl	40(%rsp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	movl	%ebp,%edi
 	shldl	$5,%ebp,%ebp
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	shrdl	$7,%eax,%eax
 	addl	%ebp,%edx
-	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	vpor	%xmm8,%xmm2,%xmm2
 	addl	44(%rsp),%ecx
-	xorl	%ebx,%edi
-	vmovdqa	%xmm2,%xmm9
+	xorl	%eax,%edi
 	movl	%edx,%esi
 	shldl	$5,%edx,%edx
-	xorl	%eax,%edi
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
 	addl	%edx,%ecx
-	shrdl	$7,%ebp,%ebp
-	addl	%edi,%ecx
-	vpalignr	$8,%xmm1,%xmm2,%xmm9
+	vpalignr	$8,%xmm1,%xmm2,%xmm8
 	vpxor	%xmm7,%xmm3,%xmm3
 	addl	48(%rsp),%ebx
-	xorl	%eax,%esi
+	xorl	%ebp,%esi
 	movl	%ecx,%edi
 	shldl	$5,%ecx,%ecx
 	vpxor	%xmm4,%xmm3,%xmm3
-	xorl	%ebp,%esi
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	vpaddd	%xmm2,%xmm11,%xmm9
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	vmovdqa	%xmm10,%xmm8
-	vpaddd	%xmm2,%xmm10,%xmm10
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
-	vpxor	%xmm9,%xmm3,%xmm3
+	vpxor	%xmm8,%xmm3,%xmm3
 	addl	52(%rsp),%eax
-	xorl	%ebp,%edi
+	xorl	%edx,%edi
 	movl	%ebx,%esi
 	shldl	$5,%ebx,%ebx
-	vpsrld	$30,%xmm3,%xmm9
-	vmovdqa	%xmm10,32(%rsp)
-	xorl	%edx,%edi
+	vpsrld	$30,%xmm3,%xmm8
+	vmovdqa	%xmm9,32(%rsp)
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%edi,%eax
 	vpslld	$2,%xmm3,%xmm3
 	addl	56(%rsp),%ebp
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%edi
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%esi
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%ebp
-	shrdl	$7,%ebx,%ebx
-	addl	%esi,%ebp
-	vpor	%xmm9,%xmm3,%xmm3
+	vpor	%xmm8,%xmm3,%xmm3
 	addl	60(%rsp),%edx
-	xorl	%ecx,%edi
-	vmovdqa	%xmm3,%xmm10
+	xorl	%ebx,%edi
 	movl	%ebp,%esi
 	shldl	$5,%ebp,%ebp
-	xorl	%ebx,%edi
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
 	addl	%ebp,%edx
-	shrdl	$7,%eax,%eax
-	addl	%edi,%edx
-	vpalignr	$8,%xmm2,%xmm3,%xmm10
+	vpalignr	$8,%xmm2,%xmm3,%xmm8
 	vpxor	%xmm0,%xmm4,%xmm4
 	addl	0(%rsp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	movl	%edx,%edi
 	shldl	$5,%edx,%edx
 	vpxor	%xmm5,%xmm4,%xmm4
-	xorl	%eax,%esi
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	vpaddd	%xmm3,%xmm11,%xmm9
+	shrdl	$7,%ebp,%ebp
 	addl	%edx,%ecx
-	vmovdqa	%xmm8,%xmm9
-	vpaddd	%xmm3,%xmm8,%xmm8
-	shrdl	$7,%ebp,%ebp
-	addl	%esi,%ecx
-	vpxor	%xmm10,%xmm4,%xmm4
+	vpxor	%xmm8,%xmm4,%xmm4
 	addl	4(%rsp),%ebx
-	xorl	%eax,%edi
+	xorl	%ebp,%edi
 	movl	%ecx,%esi
 	shldl	$5,%ecx,%ecx
-	vpsrld	$30,%xmm4,%xmm10
-	vmovdqa	%xmm8,48(%rsp)
-	xorl	%ebp,%edi
+	vpsrld	$30,%xmm4,%xmm8
+	vmovdqa	%xmm9,48(%rsp)
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%edi,%ebx
 	vpslld	$2,%xmm4,%xmm4
 	addl	8(%rsp),%eax
-	xorl	%ebp,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%edi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%esi
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
-	vpor	%xmm10,%xmm4,%xmm4
+	vpor	%xmm8,%xmm4,%xmm4
 	addl	12(%rsp),%ebp
-	xorl	%edx,%edi
-	vmovdqa	%xmm4,%xmm8
+	xorl	%ecx,%edi
 	movl	%eax,%esi
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%edi
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%ebp
-	shrdl	$7,%ebx,%ebx
-	addl	%edi,%ebp
 	vpalignr	$8,%xmm3,%xmm4,%xmm8
 	vpxor	%xmm1,%xmm5,%xmm5
 	addl	16(%rsp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	movl	%ebp,%edi
 	shldl	$5,%ebp,%ebp
 	vpxor	%xmm6,%xmm5,%xmm5
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	vpaddd	%xmm4,%xmm11,%xmm9
+	shrdl	$7,%eax,%eax
 	addl	%ebp,%edx
-	vmovdqa	%xmm9,%xmm10
-	vpaddd	%xmm4,%xmm9,%xmm9
-	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	vpxor	%xmm8,%xmm5,%xmm5
 	addl	20(%rsp),%ecx
-	xorl	%ebx,%edi
+	xorl	%eax,%edi
 	movl	%edx,%esi
 	shldl	$5,%edx,%edx
 	vpsrld	$30,%xmm5,%xmm8
 	vmovdqa	%xmm9,0(%rsp)
-	xorl	%eax,%edi
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
 	addl	%edx,%ecx
-	shrdl	$7,%ebp,%ebp
-	addl	%edi,%ecx
 	vpslld	$2,%xmm5,%xmm5
 	addl	24(%rsp),%ebx
-	xorl	%eax,%esi
+	xorl	%ebp,%esi
 	movl	%ecx,%edi
 	shldl	$5,%ecx,%ecx
-	xorl	%ebp,%esi
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	vpor	%xmm8,%xmm5,%xmm5
 	addl	28(%rsp),%eax
-	xorl	%ebp,%edi
-	vmovdqa	%xmm5,%xmm9
+	shrdl	$7,%ecx,%ecx
 	movl	%ebx,%esi
+	xorl	%edx,%edi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%edi
+	addl	%edi,%eax
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%edi,%eax
-	vpalignr	$8,%xmm4,%xmm5,%xmm9
+	vpalignr	$8,%xmm4,%xmm5,%xmm8
 	vpxor	%xmm2,%xmm6,%xmm6
-	movl	%ecx,%edi
-	xorl	%edx,%ecx
 	addl	32(%rsp),%ebp
-	andl	%edx,%edi
-	vpxor	%xmm7,%xmm6,%xmm6
 	andl	%ecx,%esi
+	xorl	%edx,%ecx
 	shrdl	$7,%ebx,%ebx
-	vmovdqa	%xmm10,%xmm8
-	vpaddd	%xmm5,%xmm10,%xmm10
-	addl	%edi,%ebp
+	vpxor	%xmm7,%xmm6,%xmm6
 	movl	%eax,%edi
-	vpxor	%xmm9,%xmm6,%xmm6
+	xorl	%ecx,%esi
+	vpaddd	%xmm5,%xmm11,%xmm9
 	shldl	$5,%eax,%eax
 	addl	%esi,%ebp
-	xorl	%edx,%ecx
+	vpxor	%xmm8,%xmm6,%xmm6
+	xorl	%ebx,%edi
+	xorl	%ecx,%ebx
 	addl	%eax,%ebp
-	vpsrld	$30,%xmm6,%xmm9
-	vmovdqa	%xmm10,16(%rsp)
-	movl	%ebx,%esi
-	xorl	%ecx,%ebx
 	addl	36(%rsp),%edx
-	andl	%ecx,%esi
-	vpslld	$2,%xmm6,%xmm6
+	vpsrld	$30,%xmm6,%xmm8
+	vmovdqa	%xmm9,16(%rsp)
 	andl	%ebx,%edi
+	xorl	%ecx,%ebx
 	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	movl	%ebp,%esi
+	vpslld	$2,%xmm6,%xmm6
+	xorl	%ebx,%edi
 	shldl	$5,%ebp,%ebp
 	addl	%edi,%edx
-	xorl	%ecx,%ebx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
 	addl	%ebp,%edx
-	vpor	%xmm9,%xmm6,%xmm6
-	movl	%eax,%edi
-	xorl	%ebx,%eax
-	vmovdqa	%xmm6,%xmm10
 	addl	40(%rsp),%ecx
-	andl	%ebx,%edi
 	andl	%eax,%esi
+	vpor	%xmm8,%xmm6,%xmm6
+	xorl	%ebx,%eax
 	shrdl	$7,%ebp,%ebp
-	addl	%edi,%ecx
 	movl	%edx,%edi
+	xorl	%eax,%esi
 	shldl	$5,%edx,%edx
 	addl	%esi,%ecx
-	xorl	%ebx,%eax
+	xorl	%ebp,%edi
+	xorl	%eax,%ebp
 	addl	%edx,%ecx
-	movl	%ebp,%esi
-	xorl	%eax,%ebp
 	addl	44(%rsp),%ebx
-	andl	%eax,%esi
 	andl	%ebp,%edi
+	xorl	%eax,%ebp
 	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	movl	%ecx,%esi
+	xorl	%ebp,%edi
 	shldl	$5,%ecx,%ecx
 	addl	%edi,%ebx
-	xorl	%eax,%ebp
+	xorl	%edx,%esi
+	xorl	%ebp,%edx
 	addl	%ecx,%ebx
-	vpalignr	$8,%xmm5,%xmm6,%xmm10
+	vpalignr	$8,%xmm5,%xmm6,%xmm8
 	vpxor	%xmm3,%xmm7,%xmm7
-	movl	%edx,%edi
-	xorl	%ebp,%edx
 	addl	48(%rsp),%eax
-	andl	%ebp,%edi
-	vpxor	%xmm0,%xmm7,%xmm7
 	andl	%edx,%esi
+	xorl	%ebp,%edx
 	shrdl	$7,%ecx,%ecx
-	vmovdqa	48(%r11),%xmm9
-	vpaddd	%xmm6,%xmm8,%xmm8
-	addl	%edi,%eax
+	vpxor	%xmm0,%xmm7,%xmm7
 	movl	%ebx,%edi
-	vpxor	%xmm10,%xmm7,%xmm7
+	xorl	%edx,%esi
+	vpaddd	%xmm6,%xmm11,%xmm9
+	vmovdqa	32(%r11),%xmm11
 	shldl	$5,%ebx,%ebx
 	addl	%esi,%eax
-	xorl	%ebp,%edx
+	vpxor	%xmm8,%xmm7,%xmm7
+	xorl	%ecx,%edi
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	vpsrld	$30,%xmm7,%xmm10
-	vmovdqa	%xmm8,32(%rsp)
-	movl	%ecx,%esi
-	xorl	%edx,%ecx
 	addl	52(%rsp),%ebp
-	andl	%edx,%esi
-	vpslld	$2,%xmm7,%xmm7
+	vpsrld	$30,%xmm7,%xmm8
+	vmovdqa	%xmm9,32(%rsp)
 	andl	%ecx,%edi
+	xorl	%edx,%ecx
 	shrdl	$7,%ebx,%ebx
-	addl	%esi,%ebp
 	movl	%eax,%esi
+	vpslld	$2,%xmm7,%xmm7
+	xorl	%ecx,%edi
 	shldl	$5,%eax,%eax
 	addl	%edi,%ebp
-	xorl	%edx,%ecx
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
 	addl	%eax,%ebp
-	vpor	%xmm10,%xmm7,%xmm7
-	movl	%ebx,%edi
-	xorl	%ecx,%ebx
-	vmovdqa	%xmm7,%xmm8
 	addl	56(%rsp),%edx
-	andl	%ecx,%edi
 	andl	%ebx,%esi
+	vpor	%xmm8,%xmm7,%xmm7
+	xorl	%ecx,%ebx
 	shrdl	$7,%eax,%eax
-	addl	%edi,%edx
 	movl	%ebp,%edi
+	xorl	%ebx,%esi
 	shldl	$5,%ebp,%ebp
 	addl	%esi,%edx
-	xorl	%ecx,%ebx
+	xorl	%eax,%edi
+	xorl	%ebx,%eax
 	addl	%ebp,%edx
-	movl	%eax,%esi
-	xorl	%ebx,%eax
 	addl	60(%rsp),%ecx
-	andl	%ebx,%esi
 	andl	%eax,%edi
+	xorl	%ebx,%eax
 	shrdl	$7,%ebp,%ebp
-	addl	%esi,%ecx
 	movl	%edx,%esi
+	xorl	%eax,%edi
 	shldl	$5,%edx,%edx
 	addl	%edi,%ecx
-	xorl	%ebx,%eax
+	xorl	%ebp,%esi
+	xorl	%eax,%ebp
 	addl	%edx,%ecx
 	vpalignr	$8,%xmm6,%xmm7,%xmm8
 	vpxor	%xmm4,%xmm0,%xmm0
-	movl	%ebp,%edi
-	xorl	%eax,%ebp
 	addl	0(%rsp),%ebx
-	andl	%eax,%edi
-	vpxor	%xmm1,%xmm0,%xmm0
 	andl	%ebp,%esi
+	xorl	%eax,%ebp
 	shrdl	$7,%edx,%edx
-	vmovdqa	%xmm9,%xmm10
-	vpaddd	%xmm7,%xmm9,%xmm9
-	addl	%edi,%ebx
+	vpxor	%xmm1,%xmm0,%xmm0
 	movl	%ecx,%edi
-	vpxor	%xmm8,%xmm0,%xmm0
+	xorl	%ebp,%esi
+	vpaddd	%xmm7,%xmm11,%xmm9
 	shldl	$5,%ecx,%ecx
 	addl	%esi,%ebx
-	xorl	%eax,%ebp
+	vpxor	%xmm8,%xmm0,%xmm0
+	xorl	%edx,%edi
+	xorl	%ebp,%edx
 	addl	%ecx,%ebx
+	addl	4(%rsp),%eax
 	vpsrld	$30,%xmm0,%xmm8
 	vmovdqa	%xmm9,48(%rsp)
-	movl	%edx,%esi
+	andl	%edx,%edi
 	xorl	%ebp,%edx
-	addl	4(%rsp),%eax
-	andl	%ebp,%esi
-	vpslld	$2,%xmm0,%xmm0
-	andl	%edx,%edi
 	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	movl	%ebx,%esi
+	vpslld	$2,%xmm0,%xmm0
+	xorl	%edx,%edi
 	shldl	$5,%ebx,%ebx
 	addl	%edi,%eax
-	xorl	%ebp,%edx
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
+	addl	8(%rsp),%ebp
+	andl	%ecx,%esi
 	vpor	%xmm8,%xmm0,%xmm0
-	movl	%ecx,%edi
 	xorl	%edx,%ecx
-	vmovdqa	%xmm0,%xmm9
-	addl	8(%rsp),%ebp
-	andl	%edx,%edi
-	andl	%ecx,%esi
 	shrdl	$7,%ebx,%ebx
-	addl	%edi,%ebp
 	movl	%eax,%edi
+	xorl	%ecx,%esi
 	shldl	$5,%eax,%eax
 	addl	%esi,%ebp
-	xorl	%edx,%ecx
+	xorl	%ebx,%edi
+	xorl	%ecx,%ebx
 	addl	%eax,%ebp
-	movl	%ebx,%esi
-	xorl	%ecx,%ebx
 	addl	12(%rsp),%edx
-	andl	%ecx,%esi
 	andl	%ebx,%edi
+	xorl	%ecx,%ebx
 	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	movl	%ebp,%esi
+	xorl	%ebx,%edi
 	shldl	$5,%ebp,%ebp
 	addl	%edi,%edx
-	xorl	%ecx,%ebx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
 	addl	%ebp,%edx
-	vpalignr	$8,%xmm7,%xmm0,%xmm9
+	vpalignr	$8,%xmm7,%xmm0,%xmm8
 	vpxor	%xmm5,%xmm1,%xmm1
-	movl	%eax,%edi
-	xorl	%ebx,%eax
 	addl	16(%rsp),%ecx
-	andl	%ebx,%edi
-	vpxor	%xmm2,%xmm1,%xmm1
 	andl	%eax,%esi
+	xorl	%ebx,%eax
 	shrdl	$7,%ebp,%ebp
-	vmovdqa	%xmm10,%xmm8
-	vpaddd	%xmm0,%xmm10,%xmm10
-	addl	%edi,%ecx
+	vpxor	%xmm2,%xmm1,%xmm1
 	movl	%edx,%edi
-	vpxor	%xmm9,%xmm1,%xmm1
+	xorl	%eax,%esi
+	vpaddd	%xmm0,%xmm11,%xmm9
 	shldl	$5,%edx,%edx
 	addl	%esi,%ecx
-	xorl	%ebx,%eax
+	vpxor	%xmm8,%xmm1,%xmm1
+	xorl	%ebp,%edi
+	xorl	%eax,%ebp
 	addl	%edx,%ecx
-	vpsrld	$30,%xmm1,%xmm9
-	vmovdqa	%xmm10,0(%rsp)
-	movl	%ebp,%esi
-	xorl	%eax,%ebp
 	addl	20(%rsp),%ebx
-	andl	%eax,%esi
-	vpslld	$2,%xmm1,%xmm1
+	vpsrld	$30,%xmm1,%xmm8
+	vmovdqa	%xmm9,0(%rsp)
 	andl	%ebp,%edi
+	xorl	%eax,%ebp
 	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	movl	%ecx,%esi
+	vpslld	$2,%xmm1,%xmm1
+	xorl	%ebp,%edi
 	shldl	$5,%ecx,%ecx
 	addl	%edi,%ebx
-	xorl	%eax,%ebp
+	xorl	%edx,%esi
+	xorl	%ebp,%edx
 	addl	%ecx,%ebx
-	vpor	%xmm9,%xmm1,%xmm1
-	movl	%edx,%edi
-	xorl	%ebp,%edx
-	vmovdqa	%xmm1,%xmm10
 	addl	24(%rsp),%eax
-	andl	%ebp,%edi
 	andl	%edx,%esi
+	vpor	%xmm8,%xmm1,%xmm1
+	xorl	%ebp,%edx
 	shrdl	$7,%ecx,%ecx
-	addl	%edi,%eax
 	movl	%ebx,%edi
+	xorl	%edx,%esi
 	shldl	$5,%ebx,%ebx
 	addl	%esi,%eax
-	xorl	%ebp,%edx
+	xorl	%ecx,%edi
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	movl	%ecx,%esi
-	xorl	%edx,%ecx
 	addl	28(%rsp),%ebp
-	andl	%edx,%esi
 	andl	%ecx,%edi
+	xorl	%edx,%ecx
 	shrdl	$7,%ebx,%ebx
-	addl	%esi,%ebp
 	movl	%eax,%esi
+	xorl	%ecx,%edi
 	shldl	$5,%eax,%eax
 	addl	%edi,%ebp
-	xorl	%edx,%ecx
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
 	addl	%eax,%ebp
-	vpalignr	$8,%xmm0,%xmm1,%xmm10
+	vpalignr	$8,%xmm0,%xmm1,%xmm8
 	vpxor	%xmm6,%xmm2,%xmm2
-	movl	%ebx,%edi
-	xorl	%ecx,%ebx
 	addl	32(%rsp),%edx
-	andl	%ecx,%edi
-	vpxor	%xmm3,%xmm2,%xmm2
 	andl	%ebx,%esi
+	xorl	%ecx,%ebx
 	shrdl	$7,%eax,%eax
-	vmovdqa	%xmm8,%xmm9
-	vpaddd	%xmm1,%xmm8,%xmm8
-	addl	%edi,%edx
+	vpxor	%xmm3,%xmm2,%xmm2
 	movl	%ebp,%edi
-	vpxor	%xmm10,%xmm2,%xmm2
+	xorl	%ebx,%esi
+	vpaddd	%xmm1,%xmm11,%xmm9
 	shldl	$5,%ebp,%ebp
 	addl	%esi,%edx
-	xorl	%ecx,%ebx
+	vpxor	%xmm8,%xmm2,%xmm2
+	xorl	%eax,%edi
+	xorl	%ebx,%eax
 	addl	%ebp,%edx
-	vpsrld	$30,%xmm2,%xmm10
-	vmovdqa	%xmm8,16(%rsp)
-	movl	%eax,%esi
-	xorl	%ebx,%eax
 	addl	36(%rsp),%ecx
-	andl	%ebx,%esi
-	vpslld	$2,%xmm2,%xmm2
+	vpsrld	$30,%xmm2,%xmm8
+	vmovdqa	%xmm9,16(%rsp)
 	andl	%eax,%edi
+	xorl	%ebx,%eax
 	shrdl	$7,%ebp,%ebp
-	addl	%esi,%ecx
 	movl	%edx,%esi
+	vpslld	$2,%xmm2,%xmm2
+	xorl	%eax,%edi
 	shldl	$5,%edx,%edx
 	addl	%edi,%ecx
-	xorl	%ebx,%eax
+	xorl	%ebp,%esi
+	xorl	%eax,%ebp
 	addl	%edx,%ecx
-	vpor	%xmm10,%xmm2,%xmm2
-	movl	%ebp,%edi
-	xorl	%eax,%ebp
-	vmovdqa	%xmm2,%xmm8
 	addl	40(%rsp),%ebx
-	andl	%eax,%edi
 	andl	%ebp,%esi
+	vpor	%xmm8,%xmm2,%xmm2
+	xorl	%eax,%ebp
 	shrdl	$7,%edx,%edx
-	addl	%edi,%ebx
 	movl	%ecx,%edi
+	xorl	%ebp,%esi
 	shldl	$5,%ecx,%ecx
 	addl	%esi,%ebx
-	xorl	%eax,%ebp
+	xorl	%edx,%edi
+	xorl	%ebp,%edx
 	addl	%ecx,%ebx
-	movl	%edx,%esi
-	xorl	%ebp,%edx
 	addl	44(%rsp),%eax
-	andl	%ebp,%esi
 	andl	%edx,%edi
+	xorl	%ebp,%edx
 	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	movl	%ebx,%esi
+	xorl	%edx,%edi
 	shldl	$5,%ebx,%ebx
 	addl	%edi,%eax
-	xorl	%ebp,%edx
+	xorl	%edx,%esi
 	addl	%ebx,%eax
 	vpalignr	$8,%xmm1,%xmm2,%xmm8
 	vpxor	%xmm7,%xmm3,%xmm3
 	addl	48(%rsp),%ebp
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%edi
 	shldl	$5,%eax,%eax
 	vpxor	%xmm4,%xmm3,%xmm3
-	xorl	%ecx,%esi
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	vpaddd	%xmm2,%xmm11,%xmm9
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%ebp
-	vmovdqa	%xmm9,%xmm10
-	vpaddd	%xmm2,%xmm9,%xmm9
-	shrdl	$7,%ebx,%ebx
-	addl	%esi,%ebp
 	vpxor	%xmm8,%xmm3,%xmm3
 	addl	52(%rsp),%edx
-	xorl	%ecx,%edi
+	xorl	%ebx,%edi
 	movl	%ebp,%esi
 	shldl	$5,%ebp,%ebp
 	vpsrld	$30,%xmm3,%xmm8
 	vmovdqa	%xmm9,32(%rsp)
-	xorl	%ebx,%edi
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
 	addl	%ebp,%edx
-	shrdl	$7,%eax,%eax
-	addl	%edi,%edx
 	vpslld	$2,%xmm3,%xmm3
 	addl	56(%rsp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	movl	%edx,%edi
 	shldl	$5,%edx,%edx
-	xorl	%eax,%esi
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	shrdl	$7,%ebp,%ebp
 	addl	%edx,%ecx
-	shrdl	$7,%ebp,%ebp
-	addl	%esi,%ecx
 	vpor	%xmm8,%xmm3,%xmm3
 	addl	60(%rsp),%ebx
-	xorl	%eax,%edi
+	xorl	%ebp,%edi
 	movl	%ecx,%esi
 	shldl	$5,%ecx,%ecx
-	xorl	%ebp,%edi
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%edi,%ebx
 	addl	0(%rsp),%eax
-	vpaddd	%xmm3,%xmm10,%xmm10
-	xorl	%ebp,%esi
+	vpaddd	%xmm3,%xmm11,%xmm9
+	xorl	%edx,%esi
 	movl	%ebx,%edi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%esi
-	movdqa	%xmm10,48(%rsp)
+	addl	%esi,%eax
+	vmovdqa	%xmm9,48(%rsp)
+	xorl	%edx,%edi
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	addl	4(%rsp),%ebp
-	xorl	%edx,%edi
+	xorl	%ecx,%edi
 	movl	%eax,%esi
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%edi
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%ebp
-	shrdl	$7,%ebx,%ebx
-	addl	%edi,%ebp
 	addl	8(%rsp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	movl	%ebp,%edi
 	shldl	$5,%ebp,%ebp
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	shrdl	$7,%eax,%eax
 	addl	%ebp,%edx
-	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	addl	12(%rsp),%ecx
-	xorl	%ebx,%edi
+	xorl	%eax,%edi
 	movl	%edx,%esi
 	shldl	$5,%edx,%edx
-	xorl	%eax,%edi
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
 	addl	%edx,%ecx
-	shrdl	$7,%ebp,%ebp
-	addl	%edi,%ecx
 	cmpq	%r10,%r9
 	je	.Ldone_avx
 	vmovdqa	64(%r11),%xmm6
-	vmovdqa	0(%r11),%xmm9
+	vmovdqa	-64(%r11),%xmm11
 	vmovdqu	0(%r9),%xmm0
 	vmovdqu	16(%r9),%xmm1
 	vmovdqu	32(%r9),%xmm2
@@ -3399,110 +3463,109 @@
 	vpshufb	%xmm6,%xmm0,%xmm0
 	addq	$64,%r9
 	addl	16(%rsp),%ebx
-	xorl	%eax,%esi
+	xorl	%ebp,%esi
 	vpshufb	%xmm6,%xmm1,%xmm1
 	movl	%ecx,%edi
 	shldl	$5,%ecx,%ecx
-	vpaddd	%xmm9,%xmm0,%xmm4
-	xorl	%ebp,%esi
+	vpaddd	%xmm11,%xmm0,%xmm4
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	vmovdqa	%xmm4,0(%rsp)
 	addl	20(%rsp),%eax
-	xorl	%ebp,%edi
+	xorl	%edx,%edi
 	movl	%ebx,%esi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%edi
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%edi,%eax
 	addl	24(%rsp),%ebp
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%edi
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%esi
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%ebp
-	shrdl	$7,%ebx,%ebx
-	addl	%esi,%ebp
 	addl	28(%rsp),%edx
-	xorl	%ecx,%edi
+	xorl	%ebx,%edi
 	movl	%ebp,%esi
 	shldl	$5,%ebp,%ebp
-	xorl	%ebx,%edi
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
 	addl	%ebp,%edx
-	shrdl	$7,%eax,%eax
-	addl	%edi,%edx
 	addl	32(%rsp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	vpshufb	%xmm6,%xmm2,%xmm2
 	movl	%edx,%edi
 	shldl	$5,%edx,%edx
-	vpaddd	%xmm9,%xmm1,%xmm5
-	xorl	%eax,%esi
+	vpaddd	%xmm11,%xmm1,%xmm5
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	shrdl	$7,%ebp,%ebp
 	addl	%edx,%ecx
-	shrdl	$7,%ebp,%ebp
-	addl	%esi,%ecx
 	vmovdqa	%xmm5,16(%rsp)
 	addl	36(%rsp),%ebx
-	xorl	%eax,%edi
+	xorl	%ebp,%edi
 	movl	%ecx,%esi
 	shldl	$5,%ecx,%ecx
-	xorl	%ebp,%edi
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%edi,%ebx
 	addl	40(%rsp),%eax
-	xorl	%ebp,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%edi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%esi
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	addl	44(%rsp),%ebp
-	xorl	%edx,%edi
+	xorl	%ecx,%edi
 	movl	%eax,%esi
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%edi
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%ebp
-	shrdl	$7,%ebx,%ebx
-	addl	%edi,%ebp
 	addl	48(%rsp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	vpshufb	%xmm6,%xmm3,%xmm3
 	movl	%ebp,%edi
 	shldl	$5,%ebp,%ebp
-	vpaddd	%xmm9,%xmm2,%xmm6
-	xorl	%ebx,%esi
+	vpaddd	%xmm11,%xmm2,%xmm6
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	shrdl	$7,%eax,%eax
 	addl	%ebp,%edx
-	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	vmovdqa	%xmm6,32(%rsp)
 	addl	52(%rsp),%ecx
-	xorl	%ebx,%edi
+	xorl	%eax,%edi
 	movl	%edx,%esi
 	shldl	$5,%edx,%edx
-	xorl	%eax,%edi
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
 	addl	%edx,%ecx
-	shrdl	$7,%ebp,%ebp
-	addl	%edi,%ecx
 	addl	56(%rsp),%ebx
-	xorl	%eax,%esi
+	xorl	%ebp,%esi
 	movl	%ecx,%edi
 	shldl	$5,%ecx,%ecx
-	xorl	%ebp,%esi
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	addl	60(%rsp),%eax
-	xorl	%ebp,%edi
+	xorl	%edx,%edi
 	movl	%ebx,%esi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%edi
+	addl	%edi,%eax
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%edi,%eax
 	addl	0(%r8),%eax
 	addl	4(%r8),%esi
 	addl	8(%r8),%ecx
@@ -3512,108 +3575,110 @@
 	movl	%esi,4(%r8)
 	movl	%esi,%ebx
 	movl	%ecx,8(%r8)
+	movl	%ecx,%edi
 	movl	%edx,12(%r8)
+	xorl	%edx,%edi
 	movl	%ebp,16(%r8)
+	andl	%edi,%esi
 	jmp	.Loop_avx
 
 .align	16
 .Ldone_avx:
 	addl	16(%rsp),%ebx
-	xorl	%eax,%esi
+	xorl	%ebp,%esi
 	movl	%ecx,%edi
 	shldl	$5,%ecx,%ecx
-	xorl	%ebp,%esi
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	addl	20(%rsp),%eax
-	xorl	%ebp,%edi
+	xorl	%edx,%edi
 	movl	%ebx,%esi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%edi
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%edi,%eax
 	addl	24(%rsp),%ebp
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%edi
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%esi
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%ebp
-	shrdl	$7,%ebx,%ebx
-	addl	%esi,%ebp
 	addl	28(%rsp),%edx
-	xorl	%ecx,%edi
+	xorl	%ebx,%edi
 	movl	%ebp,%esi
 	shldl	$5,%ebp,%ebp
-	xorl	%ebx,%edi
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
 	addl	%ebp,%edx
-	shrdl	$7,%eax,%eax
-	addl	%edi,%edx
 	addl	32(%rsp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	movl	%edx,%edi
 	shldl	$5,%edx,%edx
-	xorl	%eax,%esi
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	shrdl	$7,%ebp,%ebp
 	addl	%edx,%ecx
-	shrdl	$7,%ebp,%ebp
-	addl	%esi,%ecx
 	addl	36(%rsp),%ebx
-	xorl	%eax,%edi
+	xorl	%ebp,%edi
 	movl	%ecx,%esi
 	shldl	$5,%ecx,%ecx
-	xorl	%ebp,%edi
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%edi,%ebx
 	addl	40(%rsp),%eax
-	xorl	%ebp,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%edi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%esi
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	addl	44(%rsp),%ebp
-	xorl	%edx,%edi
+	xorl	%ecx,%edi
 	movl	%eax,%esi
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%edi
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%ebp
-	shrdl	$7,%ebx,%ebx
-	addl	%edi,%ebp
 	addl	48(%rsp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	movl	%ebp,%edi
 	shldl	$5,%ebp,%ebp
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	shrdl	$7,%eax,%eax
 	addl	%ebp,%edx
-	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	addl	52(%rsp),%ecx
-	xorl	%ebx,%edi
+	xorl	%eax,%edi
 	movl	%edx,%esi
 	shldl	$5,%edx,%edx
-	xorl	%eax,%edi
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
 	addl	%edx,%ecx
-	shrdl	$7,%ebp,%ebp
-	addl	%edi,%ecx
 	addl	56(%rsp),%ebx
-	xorl	%eax,%esi
+	xorl	%ebp,%esi
 	movl	%ecx,%edi
 	shldl	$5,%ecx,%ecx
-	xorl	%ebp,%esi
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	addl	60(%rsp),%eax
-	xorl	%ebp,%edi
+	xorl	%edx,%edi
 	movl	%ebx,%esi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%edi
+	addl	%edi,%eax
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%edi,%eax
 	vzeroupper
 
 	addl	0(%r8),%eax
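
Every shldl $5 / shrdl $7 cluster above is one SHA-1 round over the five working registers, while the interleaved vpalignr/vpxor/vpsrld/vpslld/vpor instructions expand the message schedule four words at a time and park the K-biased words in the stack slots the rounds read (0(%rsp), 4(%rsp), ...). As a reference for what one round computes, here is a minimal C sketch of the standard SHA-1 step; helper and variable names are illustrative, not taken from the generated code.

    #include <stdint.h>

    static uint32_t rol32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

    /* One SHA-1 round; w is the schedule word the vector code left on the stack.
     * The round index i selects the boolean function and the round constant. */
    static void sha1_round(uint32_t v[5], uint32_t w, int i)
    {
        static const uint32_t K[4] = { 0x5a827999, 0x6ed9eba1,
                                       0x8f1bbcdc, 0xca62c1d6 };
        uint32_t a = v[0], b = v[1], c = v[2], d = v[3], e = v[4], f;

        if (i < 20)      f = (b & c) | (~b & d);            /* Ch     */
        else if (i < 40) f = b ^ c ^ d;                     /* Parity */
        else if (i < 60) f = (b & c) | (b & d) | (c & d);   /* Maj    */
        else             f = b ^ c ^ d;                     /* Parity */

        uint32_t t = rol32(a, 5) + f + e + K[i / 20] + w;
        v[4] = d; v[3] = c; v[2] = rol32(b, 30); v[1] = a; v[0] = t;
    }

The two rotations are what the shldl $5 / shrdl $7 pairs above appear to split between adjacent rounds, and the K constant is already folded into the stack slot by the vpaddd, so the assembly has one addition fewer per round than the sketch.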
@@ -3626,20 +3691,1709 @@
 	movl	%ecx,8(%r8)
 	movl	%edx,12(%r8)
 	movl	%ebp,16(%r8)
-	leaq	64(%rsp),%rsi
-	movq	0(%rsi),%r12
-	movq	8(%rsi),%rbp
-	movq	16(%rsi),%rbx
-	leaq	24(%rsi),%rsp
+	leaq	(%r14),%rsi
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 .Lepilogue_avx:
 	.byte	0xf3,0xc3
 .size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
+.type	sha1_block_data_order_avx2,@function
+.align	16
+sha1_block_data_order_avx2:
+_avx2_shortcut:
+	movq	%rsp,%rax
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	vzeroupper
+	movq	%rax,%r14
+	movq	%rdi,%r8
+	movq	%rsi,%r9
+	movq	%rdx,%r10
+
+	leaq	-640(%rsp),%rsp
+	shlq	$6,%r10
+	leaq	64(%r9),%r13
+	andq	$-128,%rsp
+	addq	%r9,%r10
+	leaq	K_XX_XX+64(%rip),%r11
+
+	movl	0(%r8),%eax
+	cmpq	%r10,%r13
+	cmovaeq	%r9,%r13
+	movl	4(%r8),%ebp
+	movl	8(%r8),%ecx
+	movl	12(%r8),%edx
+	movl	16(%r8),%esi
+	vmovdqu	64(%r11),%ymm6
+
+	vmovdqu	(%r9),%xmm0
+	vmovdqu	16(%r9),%xmm1
+	vmovdqu	32(%r9),%xmm2
+	vmovdqu	48(%r9),%xmm3
+	leaq	64(%r9),%r9
+	vinserti128	$1,(%r13),%ymm0,%ymm0
+	vinserti128	$1,16(%r13),%ymm1,%ymm1
+	vpshufb	%ymm6,%ymm0,%ymm0
+	vinserti128	$1,32(%r13),%ymm2,%ymm2
+	vpshufb	%ymm6,%ymm1,%ymm1
+	vinserti128	$1,48(%r13),%ymm3,%ymm3
+	vpshufb	%ymm6,%ymm2,%ymm2
+	vmovdqu	-64(%r11),%ymm11
+	vpshufb	%ymm6,%ymm3,%ymm3
+
+	vpaddd	%ymm11,%ymm0,%ymm4
+	vpaddd	%ymm11,%ymm1,%ymm5
+	vmovdqu	%ymm4,0(%rsp)
+	vpaddd	%ymm11,%ymm2,%ymm6
+	vmovdqu	%ymm5,32(%rsp)
+	vpaddd	%ymm11,%ymm3,%ymm7
+	vmovdqu	%ymm6,64(%rsp)
+	vmovdqu	%ymm7,96(%rsp)
+	vpalignr	$8,%ymm0,%ymm1,%ymm4
+	vpsrldq	$4,%ymm3,%ymm8
+	vpxor	%ymm0,%ymm4,%ymm4
+	vpxor	%ymm2,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$31,%ymm4,%ymm8
+	vpslldq	$12,%ymm4,%ymm10
+	vpaddd	%ymm4,%ymm4,%ymm4
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm4,%ymm4
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm4,%ymm4
+	vpxor	%ymm10,%ymm4,%ymm4
+	vpaddd	%ymm11,%ymm4,%ymm9
+	vmovdqu	%ymm9,128(%rsp)
+	vpalignr	$8,%ymm1,%ymm2,%ymm5
+	vpsrldq	$4,%ymm4,%ymm8
+	vpxor	%ymm1,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$31,%ymm5,%ymm8
+	vmovdqu	-32(%r11),%ymm11
+	vpslldq	$12,%ymm5,%ymm10
+	vpaddd	%ymm5,%ymm5,%ymm5
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm5,%ymm5
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm10,%ymm5,%ymm5
+	vpaddd	%ymm11,%ymm5,%ymm9
+	vmovdqu	%ymm9,160(%rsp)
+	vpalignr	$8,%ymm2,%ymm3,%ymm6
+	vpsrldq	$4,%ymm5,%ymm8
+	vpxor	%ymm2,%ymm6,%ymm6
+	vpxor	%ymm4,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$31,%ymm6,%ymm8
+	vpslldq	$12,%ymm6,%ymm10
+	vpaddd	%ymm6,%ymm6,%ymm6
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm6,%ymm6
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm6,%ymm6
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpaddd	%ymm11,%ymm6,%ymm9
+	vmovdqu	%ymm9,192(%rsp)
+	vpalignr	$8,%ymm3,%ymm4,%ymm7
+	vpsrldq	$4,%ymm6,%ymm8
+	vpxor	%ymm3,%ymm7,%ymm7
+	vpxor	%ymm5,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$31,%ymm7,%ymm8
+	vpslldq	$12,%ymm7,%ymm10
+	vpaddd	%ymm7,%ymm7,%ymm7
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm7,%ymm7
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm7,%ymm7
+	vpaddd	%ymm11,%ymm7,%ymm9
+	vmovdqu	%ymm9,224(%rsp)
+	leaq	128(%rsp),%r13
+	jmp	.Loop_avx2
+.align	32
+.Loop_avx2:
+	rorxl	$2,%ebp,%ebx
+	andnl	%edx,%ebp,%edi
+	andl	%ecx,%ebp
+	xorl	%edi,%ebp
+	jmp	.Lalign32_1
+.align	32
+.Lalign32_1:
+	vpalignr	$8,%ymm6,%ymm7,%ymm8
+	vpxor	%ymm4,%ymm0,%ymm0
+	addl	-128(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	vpxor	%ymm1,%ymm0,%ymm0
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	vpxor	%ymm8,%ymm0,%ymm0
+	andl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	vpsrld	$30,%ymm0,%ymm8
+	vpslld	$2,%ymm0,%ymm0
+	addl	-124(%r13),%edx
+	andnl	%ebx,%esi,%edi
+	addl	%eax,%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	andl	%ebp,%esi
+	vpor	%ymm8,%ymm0,%ymm0
+	addl	%r12d,%edx
+	xorl	%edi,%esi
+	addl	-120(%r13),%ecx
+	andnl	%ebp,%edx,%edi
+	vpaddd	%ymm11,%ymm0,%ymm9
+	addl	%esi,%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	andl	%eax,%edx
+	vmovdqu	%ymm9,256(%rsp)
+	addl	%r12d,%ecx
+	xorl	%edi,%edx
+	addl	-116(%r13),%ebx
+	andnl	%eax,%ecx,%edi
+	addl	%edx,%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	andl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%edi,%ecx
+	addl	-96(%r13),%ebp
+	andnl	%esi,%ebx,%edi
+	addl	%ecx,%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	andl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%edi,%ebx
+	vpalignr	$8,%ymm7,%ymm0,%ymm8
+	vpxor	%ymm5,%ymm1,%ymm1
+	addl	-92(%r13),%eax
+	andnl	%edx,%ebp,%edi
+	vpxor	%ymm2,%ymm1,%ymm1
+	addl	%ebx,%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	vpxor	%ymm8,%ymm1,%ymm1
+	andl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edi,%ebp
+	vpsrld	$30,%ymm1,%ymm8
+	vpslld	$2,%ymm1,%ymm1
+	addl	-88(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	vpor	%ymm8,%ymm1,%ymm1
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	-84(%r13),%edx
+	andnl	%ebx,%esi,%edi
+	vpaddd	%ymm11,%ymm1,%ymm9
+	addl	%eax,%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	andl	%ebp,%esi
+	vmovdqu	%ymm9,288(%rsp)
+	addl	%r12d,%edx
+	xorl	%edi,%esi
+	addl	-64(%r13),%ecx
+	andnl	%ebp,%edx,%edi
+	addl	%esi,%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	andl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%edi,%edx
+	addl	-60(%r13),%ebx
+	andnl	%eax,%ecx,%edi
+	addl	%edx,%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	andl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%edi,%ecx
+	vpalignr	$8,%ymm0,%ymm1,%ymm8
+	vpxor	%ymm6,%ymm2,%ymm2
+	addl	-56(%r13),%ebp
+	andnl	%esi,%ebx,%edi
+	vpxor	%ymm3,%ymm2,%ymm2
+	vmovdqu	0(%r11),%ymm11
+	addl	%ecx,%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	vpxor	%ymm8,%ymm2,%ymm2
+	andl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%edi,%ebx
+	vpsrld	$30,%ymm2,%ymm8
+	vpslld	$2,%ymm2,%ymm2
+	addl	-52(%r13),%eax
+	andnl	%edx,%ebp,%edi
+	addl	%ebx,%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	andl	%ecx,%ebp
+	vpor	%ymm8,%ymm2,%ymm2
+	addl	%r12d,%eax
+	xorl	%edi,%ebp
+	addl	-32(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	vpaddd	%ymm11,%ymm2,%ymm9
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	vmovdqu	%ymm9,320(%rsp)
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	-28(%r13),%edx
+	andnl	%ebx,%esi,%edi
+	addl	%eax,%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	andl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%edi,%esi
+	addl	-24(%r13),%ecx
+	andnl	%ebp,%edx,%edi
+	addl	%esi,%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	andl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%edi,%edx
+	vpalignr	$8,%ymm1,%ymm2,%ymm8
+	vpxor	%ymm7,%ymm3,%ymm3
+	addl	-20(%r13),%ebx
+	andnl	%eax,%ecx,%edi
+	vpxor	%ymm4,%ymm3,%ymm3
+	addl	%edx,%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	vpxor	%ymm8,%ymm3,%ymm3
+	andl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%edi,%ecx
+	vpsrld	$30,%ymm3,%ymm8
+	vpslld	$2,%ymm3,%ymm3
+	addl	0(%r13),%ebp
+	andnl	%esi,%ebx,%edi
+	addl	%ecx,%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	andl	%edx,%ebx
+	vpor	%ymm8,%ymm3,%ymm3
+	addl	%r12d,%ebp
+	xorl	%edi,%ebx
+	addl	4(%r13),%eax
+	andnl	%edx,%ebp,%edi
+	vpaddd	%ymm11,%ymm3,%ymm9
+	addl	%ebx,%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	andl	%ecx,%ebp
+	vmovdqu	%ymm9,352(%rsp)
+	addl	%r12d,%eax
+	xorl	%edi,%ebp
+	addl	8(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	12(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	vpalignr	$8,%ymm2,%ymm3,%ymm8
+	vpxor	%ymm0,%ymm4,%ymm4
+	addl	32(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	vpxor	%ymm5,%ymm4,%ymm4
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	vpxor	%ymm8,%ymm4,%ymm4
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	36(%r13),%ebx
+	vpsrld	$30,%ymm4,%ymm8
+	vpslld	$2,%ymm4,%ymm4
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	vpor	%ymm8,%ymm4,%ymm4
+	addl	40(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	vpaddd	%ymm11,%ymm4,%ymm9
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	44(%r13),%eax
+	vmovdqu	%ymm9,384(%rsp)
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	64(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	vpalignr	$8,%ymm3,%ymm4,%ymm8
+	vpxor	%ymm1,%ymm5,%ymm5
+	addl	68(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	vpxor	%ymm6,%ymm5,%ymm5
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	vpxor	%ymm8,%ymm5,%ymm5
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	72(%r13),%ecx
+	vpsrld	$30,%ymm5,%ymm8
+	vpslld	$2,%ymm5,%ymm5
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	vpor	%ymm8,%ymm5,%ymm5
+	addl	76(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	vpaddd	%ymm11,%ymm5,%ymm9
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	96(%r13),%ebp
+	vmovdqu	%ymm9,416(%rsp)
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	100(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	vpalignr	$8,%ymm4,%ymm5,%ymm8
+	vpxor	%ymm2,%ymm6,%ymm6
+	addl	104(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	vpxor	%ymm7,%ymm6,%ymm6
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	vpxor	%ymm8,%ymm6,%ymm6
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	108(%r13),%edx
+	leaq	256(%r13),%r13
+	vpsrld	$30,%ymm6,%ymm8
+	vpslld	$2,%ymm6,%ymm6
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	vpor	%ymm8,%ymm6,%ymm6
+	addl	-128(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	vpaddd	%ymm11,%ymm6,%ymm9
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-124(%r13),%ebx
+	vmovdqu	%ymm9,448(%rsp)
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	-120(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	vpalignr	$8,%ymm5,%ymm6,%ymm8
+	vpxor	%ymm3,%ymm7,%ymm7
+	addl	-116(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	vpxor	%ymm0,%ymm7,%ymm7
+	vmovdqu	32(%r11),%ymm11
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	vpxor	%ymm8,%ymm7,%ymm7
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	-96(%r13),%esi
+	vpsrld	$30,%ymm7,%ymm8
+	vpslld	$2,%ymm7,%ymm7
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	vpor	%ymm8,%ymm7,%ymm7
+	addl	-92(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	vpaddd	%ymm11,%ymm7,%ymm9
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	-88(%r13),%ecx
+	vmovdqu	%ymm9,480(%rsp)
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-84(%r13),%ebx
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	jmp	.Lalign32_2
+.align	32
+.Lalign32_2:
+	vpalignr	$8,%ymm6,%ymm7,%ymm8
+	vpxor	%ymm4,%ymm0,%ymm0
+	addl	-64(%r13),%ebp
+	xorl	%esi,%ecx
+	vpxor	%ymm1,%ymm0,%ymm0
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	vpxor	%ymm8,%ymm0,%ymm0
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	vpsrld	$30,%ymm0,%ymm8
+	vpslld	$2,%ymm0,%ymm0
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	-60(%r13),%eax
+	xorl	%edx,%ebx
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	vpor	%ymm8,%ymm0,%ymm0
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	vpaddd	%ymm11,%ymm0,%ymm9
+	addl	%r12d,%eax
+	andl	%edi,%ebp
+	addl	-56(%r13),%esi
+	xorl	%ecx,%ebp
+	vmovdqu	%ymm9,512(%rsp)
+	movl	%ebx,%edi
+	xorl	%ecx,%edi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	andl	%edi,%eax
+	addl	-52(%r13),%edx
+	xorl	%ebx,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%edi
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	andl	%edi,%esi
+	addl	-32(%r13),%ecx
+	xorl	%ebp,%esi
+	movl	%eax,%edi
+	xorl	%ebp,%edi
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	andl	%edi,%edx
+	vpalignr	$8,%ymm7,%ymm0,%ymm8
+	vpxor	%ymm5,%ymm1,%ymm1
+	addl	-28(%r13),%ebx
+	xorl	%eax,%edx
+	vpxor	%ymm2,%ymm1,%ymm1
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	vpxor	%ymm8,%ymm1,%ymm1
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	vpsrld	$30,%ymm1,%ymm8
+	vpslld	$2,%ymm1,%ymm1
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	addl	-24(%r13),%ebp
+	xorl	%esi,%ecx
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	vpor	%ymm8,%ymm1,%ymm1
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	vpaddd	%ymm11,%ymm1,%ymm9
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	-20(%r13),%eax
+	xorl	%edx,%ebx
+	vmovdqu	%ymm9,544(%rsp)
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	andl	%edi,%ebp
+	addl	0(%r13),%esi
+	xorl	%ecx,%ebp
+	movl	%ebx,%edi
+	xorl	%ecx,%edi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	andl	%edi,%eax
+	addl	4(%r13),%edx
+	xorl	%ebx,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%edi
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	andl	%edi,%esi
+	vpalignr	$8,%ymm0,%ymm1,%ymm8
+	vpxor	%ymm6,%ymm2,%ymm2
+	addl	8(%r13),%ecx
+	xorl	%ebp,%esi
+	vpxor	%ymm3,%ymm2,%ymm2
+	movl	%eax,%edi
+	xorl	%ebp,%edi
+	leal	(%rcx,%rsi,1),%ecx
+	vpxor	%ymm8,%ymm2,%ymm2
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	vpsrld	$30,%ymm2,%ymm8
+	vpslld	$2,%ymm2,%ymm2
+	addl	%r12d,%ecx
+	andl	%edi,%edx
+	addl	12(%r13),%ebx
+	xorl	%eax,%edx
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	vpor	%ymm8,%ymm2,%ymm2
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	vpaddd	%ymm11,%ymm2,%ymm9
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	addl	32(%r13),%ebp
+	xorl	%esi,%ecx
+	vmovdqu	%ymm9,576(%rsp)
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	36(%r13),%eax
+	xorl	%edx,%ebx
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	andl	%edi,%ebp
+	addl	40(%r13),%esi
+	xorl	%ecx,%ebp
+	movl	%ebx,%edi
+	xorl	%ecx,%edi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	andl	%edi,%eax
+	vpalignr	$8,%ymm1,%ymm2,%ymm8
+	vpxor	%ymm7,%ymm3,%ymm3
+	addl	44(%r13),%edx
+	xorl	%ebx,%eax
+	vpxor	%ymm4,%ymm3,%ymm3
+	movl	%ebp,%edi
+	xorl	%ebx,%edi
+	leal	(%rdx,%rax,1),%edx
+	vpxor	%ymm8,%ymm3,%ymm3
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	vpsrld	$30,%ymm3,%ymm8
+	vpslld	$2,%ymm3,%ymm3
+	addl	%r12d,%edx
+	andl	%edi,%esi
+	addl	64(%r13),%ecx
+	xorl	%ebp,%esi
+	movl	%eax,%edi
+	xorl	%ebp,%edi
+	vpor	%ymm8,%ymm3,%ymm3
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	vpaddd	%ymm11,%ymm3,%ymm9
+	addl	%r12d,%ecx
+	andl	%edi,%edx
+	addl	68(%r13),%ebx
+	xorl	%eax,%edx
+	vmovdqu	%ymm9,608(%rsp)
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	addl	72(%r13),%ebp
+	xorl	%esi,%ecx
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	76(%r13),%eax
+	xorl	%edx,%ebx
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	96(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	100(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	104(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	108(%r13),%ebx
+	leaq	256(%r13),%r13
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	-128(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	-124(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	-120(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	-116(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	-96(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-92(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	-88(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	-84(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	-64(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	-60(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	-56(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-52(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	-32(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	-28(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	-24(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	-20(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	addl	%r12d,%edx
+	leaq	128(%r9),%r13
+	leaq	128(%r9),%rdi
+	cmpq	%r10,%r13
+	cmovaeq	%r9,%r13
+
+
+	addl	0(%r8),%edx
+	addl	4(%r8),%esi
+	addl	8(%r8),%ebp
+	movl	%edx,0(%r8)
+	addl	12(%r8),%ebx
+	movl	%esi,4(%r8)
+	movl	%edx,%eax
+	addl	16(%r8),%ecx
+	movl	%ebp,%r12d
+	movl	%ebp,8(%r8)
+	movl	%ebx,%edx
+
+	movl	%ebx,12(%r8)
+	movl	%esi,%ebp
+	movl	%ecx,16(%r8)
+
+	movl	%ecx,%esi
+	movl	%r12d,%ecx
+
+
+	cmpq	%r10,%r9
+	je	.Ldone_avx2
+	vmovdqu	64(%r11),%ymm6
+	cmpq	%r10,%rdi
+	ja	.Last_avx2
+
+	vmovdqu	-64(%rdi),%xmm0
+	vmovdqu	-48(%rdi),%xmm1
+	vmovdqu	-32(%rdi),%xmm2
+	vmovdqu	-16(%rdi),%xmm3
+	vinserti128	$1,0(%r13),%ymm0,%ymm0
+	vinserti128	$1,16(%r13),%ymm1,%ymm1
+	vinserti128	$1,32(%r13),%ymm2,%ymm2
+	vinserti128	$1,48(%r13),%ymm3,%ymm3
+	jmp	.Last_avx2
+
+.align	32
+.Last_avx2:
+	leaq	128+16(%rsp),%r13
+	rorxl	$2,%ebp,%ebx
+	andnl	%edx,%ebp,%edi
+	andl	%ecx,%ebp
+	xorl	%edi,%ebp
+	subq	$-128,%r9
+	addl	-128(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	-124(%r13),%edx
+	andnl	%ebx,%esi,%edi
+	addl	%eax,%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	andl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%edi,%esi
+	addl	-120(%r13),%ecx
+	andnl	%ebp,%edx,%edi
+	addl	%esi,%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	andl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%edi,%edx
+	addl	-116(%r13),%ebx
+	andnl	%eax,%ecx,%edi
+	addl	%edx,%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	andl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%edi,%ecx
+	addl	-96(%r13),%ebp
+	andnl	%esi,%ebx,%edi
+	addl	%ecx,%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	andl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%edi,%ebx
+	addl	-92(%r13),%eax
+	andnl	%edx,%ebp,%edi
+	addl	%ebx,%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	andl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edi,%ebp
+	addl	-88(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	-84(%r13),%edx
+	andnl	%ebx,%esi,%edi
+	addl	%eax,%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	andl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%edi,%esi
+	addl	-64(%r13),%ecx
+	andnl	%ebp,%edx,%edi
+	addl	%esi,%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	andl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%edi,%edx
+	addl	-60(%r13),%ebx
+	andnl	%eax,%ecx,%edi
+	addl	%edx,%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	andl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%edi,%ecx
+	addl	-56(%r13),%ebp
+	andnl	%esi,%ebx,%edi
+	addl	%ecx,%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	andl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%edi,%ebx
+	addl	-52(%r13),%eax
+	andnl	%edx,%ebp,%edi
+	addl	%ebx,%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	andl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edi,%ebp
+	addl	-32(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	-28(%r13),%edx
+	andnl	%ebx,%esi,%edi
+	addl	%eax,%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	andl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%edi,%esi
+	addl	-24(%r13),%ecx
+	andnl	%ebp,%edx,%edi
+	addl	%esi,%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	andl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%edi,%edx
+	addl	-20(%r13),%ebx
+	andnl	%eax,%ecx,%edi
+	addl	%edx,%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	andl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%edi,%ecx
+	addl	0(%r13),%ebp
+	andnl	%esi,%ebx,%edi
+	addl	%ecx,%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	andl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%edi,%ebx
+	addl	4(%r13),%eax
+	andnl	%edx,%ebp,%edi
+	addl	%ebx,%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	andl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edi,%ebp
+	addl	8(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	12(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	32(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	36(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	40(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	44(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	64(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	vmovdqu	-64(%r11),%ymm11
+	vpshufb	%ymm6,%ymm0,%ymm0
+	addl	68(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	72(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	76(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	96(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	100(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	vpshufb	%ymm6,%ymm1,%ymm1
+	vpaddd	%ymm11,%ymm0,%ymm8
+	addl	104(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	108(%r13),%edx
+	leaq	256(%r13),%r13
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	-128(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-124(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	-120(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	vmovdqu	%ymm8,0(%rsp)
+	vpshufb	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm11,%ymm1,%ymm9
+	addl	-116(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	-96(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	-92(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	-88(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-84(%r13),%ebx
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	vmovdqu	%ymm9,32(%rsp)
+	vpshufb	%ymm6,%ymm3,%ymm3
+	vpaddd	%ymm11,%ymm2,%ymm6
+	addl	-64(%r13),%ebp
+	xorl	%esi,%ecx
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	-60(%r13),%eax
+	xorl	%edx,%ebx
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	andl	%edi,%ebp
+	addl	-56(%r13),%esi
+	xorl	%ecx,%ebp
+	movl	%ebx,%edi
+	xorl	%ecx,%edi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	andl	%edi,%eax
+	addl	-52(%r13),%edx
+	xorl	%ebx,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%edi
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	andl	%edi,%esi
+	addl	-32(%r13),%ecx
+	xorl	%ebp,%esi
+	movl	%eax,%edi
+	xorl	%ebp,%edi
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	andl	%edi,%edx
+	jmp	.Lalign32_3
+.align	32
+.Lalign32_3:
+	vmovdqu	%ymm6,64(%rsp)
+	vpaddd	%ymm11,%ymm3,%ymm7
+	addl	-28(%r13),%ebx
+	xorl	%eax,%edx
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	addl	-24(%r13),%ebp
+	xorl	%esi,%ecx
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	-20(%r13),%eax
+	xorl	%edx,%ebx
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	andl	%edi,%ebp
+	addl	0(%r13),%esi
+	xorl	%ecx,%ebp
+	movl	%ebx,%edi
+	xorl	%ecx,%edi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	andl	%edi,%eax
+	addl	4(%r13),%edx
+	xorl	%ebx,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%edi
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	andl	%edi,%esi
+	vmovdqu	%ymm7,96(%rsp)
+	addl	8(%r13),%ecx
+	xorl	%ebp,%esi
+	movl	%eax,%edi
+	xorl	%ebp,%edi
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	andl	%edi,%edx
+	addl	12(%r13),%ebx
+	xorl	%eax,%edx
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	addl	32(%r13),%ebp
+	xorl	%esi,%ecx
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	36(%r13),%eax
+	xorl	%edx,%ebx
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	andl	%edi,%ebp
+	addl	40(%r13),%esi
+	xorl	%ecx,%ebp
+	movl	%ebx,%edi
+	xorl	%ecx,%edi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	andl	%edi,%eax
+	vpalignr	$8,%ymm0,%ymm1,%ymm4
+	addl	44(%r13),%edx
+	xorl	%ebx,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%edi
+	vpsrldq	$4,%ymm3,%ymm8
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	vpxor	%ymm0,%ymm4,%ymm4
+	vpxor	%ymm2,%ymm8,%ymm8
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	vpxor	%ymm8,%ymm4,%ymm4
+	andl	%edi,%esi
+	addl	64(%r13),%ecx
+	xorl	%ebp,%esi
+	movl	%eax,%edi
+	vpsrld	$31,%ymm4,%ymm8
+	xorl	%ebp,%edi
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	vpslldq	$12,%ymm4,%ymm10
+	vpaddd	%ymm4,%ymm4,%ymm4
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm4,%ymm4
+	addl	%r12d,%ecx
+	andl	%edi,%edx
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm4,%ymm4
+	addl	68(%r13),%ebx
+	xorl	%eax,%edx
+	vpxor	%ymm10,%ymm4,%ymm4
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	vpaddd	%ymm11,%ymm4,%ymm9
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	vmovdqu	%ymm9,128(%rsp)
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	addl	72(%r13),%ebp
+	xorl	%esi,%ecx
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	76(%r13),%eax
+	xorl	%edx,%ebx
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	vpalignr	$8,%ymm1,%ymm2,%ymm5
+	addl	96(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	vpsrldq	$4,%ymm4,%ymm8
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	vpxor	%ymm1,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm8,%ymm8
+	addl	100(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	vpxor	%ymm8,%ymm5,%ymm5
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	vpsrld	$31,%ymm5,%ymm8
+	vmovdqu	-32(%r11),%ymm11
+	xorl	%ebx,%esi
+	addl	104(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	vpslldq	$12,%ymm5,%ymm10
+	vpaddd	%ymm5,%ymm5,%ymm5
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm5,%ymm5
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm5,%ymm5
+	xorl	%ebp,%edx
+	addl	108(%r13),%ebx
+	leaq	256(%r13),%r13
+	vpxor	%ymm10,%ymm5,%ymm5
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	vpaddd	%ymm11,%ymm5,%ymm9
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	vmovdqu	%ymm9,160(%rsp)
+	addl	-128(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	vpalignr	$8,%ymm2,%ymm3,%ymm6
+	addl	-124(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	vpsrldq	$4,%ymm5,%ymm8
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	vpxor	%ymm2,%ymm6,%ymm6
+	vpxor	%ymm4,%ymm8,%ymm8
+	addl	-120(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	vpxor	%ymm8,%ymm6,%ymm6
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	vpsrld	$31,%ymm6,%ymm8
+	xorl	%ecx,%eax
+	addl	-116(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	vpslldq	$12,%ymm6,%ymm10
+	vpaddd	%ymm6,%ymm6,%ymm6
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm6,%ymm6
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm6,%ymm6
+	xorl	%ebx,%esi
+	addl	-96(%r13),%ecx
+	vpxor	%ymm10,%ymm6,%ymm6
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	vpaddd	%ymm11,%ymm6,%ymm9
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	vmovdqu	%ymm9,192(%rsp)
+	addl	-92(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	vpalignr	$8,%ymm3,%ymm4,%ymm7
+	addl	-88(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	vpsrldq	$4,%ymm6,%ymm8
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	vpxor	%ymm3,%ymm7,%ymm7
+	vpxor	%ymm5,%ymm8,%ymm8
+	addl	-84(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	vpxor	%ymm8,%ymm7,%ymm7
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	vpsrld	$31,%ymm7,%ymm8
+	xorl	%edx,%ebp
+	addl	-64(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	vpslldq	$12,%ymm7,%ymm10
+	vpaddd	%ymm7,%ymm7,%ymm7
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm7,%ymm7
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm7,%ymm7
+	xorl	%ecx,%eax
+	addl	-60(%r13),%edx
+	vpxor	%ymm10,%ymm7,%ymm7
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	vpaddd	%ymm11,%ymm7,%ymm9
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	vmovdqu	%ymm9,224(%rsp)
+	addl	-56(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-52(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	-32(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	-28(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	-24(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	-20(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	addl	%r12d,%edx
+	leaq	128(%rsp),%r13
+
+
+	addl	0(%r8),%edx
+	addl	4(%r8),%esi
+	addl	8(%r8),%ebp
+	movl	%edx,0(%r8)
+	addl	12(%r8),%ebx
+	movl	%esi,4(%r8)
+	movl	%edx,%eax
+	addl	16(%r8),%ecx
+	movl	%ebp,%r12d
+	movl	%ebp,8(%r8)
+	movl	%ebx,%edx
+
+	movl	%ebx,12(%r8)
+	movl	%esi,%ebp
+	movl	%ecx,16(%r8)
+
+	movl	%ecx,%esi
+	movl	%r12d,%ecx
+
+
+	cmpq	%r10,%r9
+	jbe	.Loop_avx2
+
+.Ldone_avx2:
+	vzeroupper
+	leaq	(%r14),%rsi
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
+.Lepilogue_avx2:
+	.byte	0xf3,0xc3
+.size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
 .align	64
 K_XX_XX:
-.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	
-.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	
-.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	
-.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	
-.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
 .byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align	64
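
The K_XX_XX table above now carries each SHA-1 constant twice, so a single vpaddd can bias both 128-bit lanes of a ymm register, and it ends with the 0x00010203... byte-shuffle pattern used by vpshufb; the leaq 64(%r9),%r13 / vinserti128 sequence in the AVX2 routine appears to place two consecutive 64-byte blocks in the low and high lanes. A small C sketch of what those two pieces amount to per word; the function names are illustrative only.

    #include <stdint.h>
    #include <stddef.h>

    /* Per 32-bit word, the 0x00010203... vpshufb mask acts as a big-endian
     * load: message bytes are swapped into host order before scheduling. */
    static uint32_t be32_load(const uint8_t *p)
    {
        return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
               ((uint32_t)p[2] <<  8) |  (uint32_t)p[3];
    }

    /* With the constant present in both lanes, one vector add seeds the
     * schedule words of two blocks at once; a scalar equivalent of that vpaddd: */
    static void add_round_constant(uint32_t w_block0[4], uint32_t w_block1[4],
                                   uint32_t k)
    {
        for (size_t i = 0; i < 4; i++) {
            w_block0[i] += k;   /* low lane: block loaded from (%r9)         */
            w_block1[i] += k;   /* high lane: block 64 bytes later, via %r13 */
        }
    }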

Modified: trunk/secure/lib/libcrypto/amd64/sha256-x86_64.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/sha256-x86_64.S	2019-01-20 05:38:02 UTC (rev 12152)
+++ trunk/secure/lib/libcrypto/amd64/sha256-x86_64.S	2019-01-20 05:38:15 UTC (rev 12153)
@@ -1,12 +1,29 @@
 /* $MidnightBSD$ */
-	# $FreeBSD: stable/10/secure/lib/libcrypto/amd64/sha256-x86_64.S 299966 2016-05-16 19:30:27Z jkim $
-	# Do not modify. This file is auto-generated from sha512-x86_64.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/sha256-x86_64.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from sha512-x86_64.pl. */
 .text	
 
+
 .globl	sha256_block_data_order
 .type	sha256_block_data_order,@function
 .align	16
 sha256_block_data_order:
+	leaq	OPENSSL_ia32cap_P(%rip),%r11
+	movl	0(%r11),%r9d
+	movl	4(%r11),%r10d
+	movl	8(%r11),%r11d
+	testl	$536870912,%r11d
+	jnz	_shaext_shortcut
+	andl	$296,%r11d
+	cmpl	$296,%r11d
+	je	.Lavx2_shortcut
+	andl	$1073741824,%r9d
+	andl	$268435968,%r10d
+	orl	%r9d,%r10d
+	cmpl	$1342177792,%r10d
+	je	.Lavx_shortcut
+	testl	$512,%r10d
+	jnz	.Lssse3_shortcut
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
@@ -24,8 +41,6 @@
 	movq	%r11,64+24(%rsp)
 .Lprologue:
 
-	leaq	K256(%rip),%rbp
-
 	movl	0(%rdi),%eax
 	movl	4(%rdi),%ebx
 	movl	8(%rdi),%ecx
@@ -38,7 +53,9 @@
 
 .align	16
 .Lloop:
-	xorq	%rdi,%rdi
+	movl	%ebx,%edi
+	leaq	K256(%rip),%rbp
+	xorl	%ecx,%edi
 	movl	0(%rsi),%r12d
 	movl	%r8d,%r13d
 	movl	%eax,%r14d
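
The hunks that follow rewrite every unrolled round in two ways: the K256 constant is fetched through a moving pointer, addl (%rbp),%r12d followed by leaq 4(%rbp),%rbp (20 at every fourth round), instead of the old (%rbp,%rdi,4) indexing, and Maj(a,b,c) is computed as b ^ ((a ^ b) & (b ^ c)) so the a ^ b value kept in %r15d or %edi can be reused by the next round. The round itself is the standard SHA-256 step; a compact C sketch follows, with illustrative names.

    #include <stdint.h>

    static uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

    /* One SHA-256 round over the state a..h held in s[0..7]; k is the round
     * constant the code reads through %rbp, w the byte-swapped message word. */
    static void sha256_round(uint32_t s[8], uint32_t w, uint32_t k)
    {
        uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
        uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

        uint32_t S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
        uint32_t ch  = (e & f) ^ (~e & g);
        uint32_t t1  = h + S1 + ch + k + w;
        uint32_t S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
        uint32_t maj = (a & b) ^ (a & c) ^ (b & c);  /* == b ^ ((a^b) & (b^c)) */
        uint32_t t2  = S0 + maj;

        s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
        s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
    }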
@@ -45,84 +62,82 @@
 	bswapl	%r12d
 	rorl	$14,%r13d
 	movl	%r9d,%r15d
-	movl	%r12d,0(%rsp)
 
+	xorl	%r8d,%r13d
 	rorl	$9,%r14d
-	xorl	%r8d,%r13d
 	xorl	%r10d,%r15d
 
+	movl	%r12d,0(%rsp)
+	xorl	%eax,%r14d
+	andl	%r8d,%r15d
+
 	rorl	$5,%r13d
 	addl	%r11d,%r12d
-	xorl	%eax,%r14d
+	xorl	%r10d,%r15d
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%r8d,%r15d
-	movl	%ebx,%r11d
-
 	rorl	$11,%r14d
 	xorl	%r8d,%r13d
-	xorl	%r10d,%r15d
+	addl	%r15d,%r12d
 
-	xorl	%ecx,%r11d
+	movl	%eax,%r15d
+	addl	(%rbp),%r12d
 	xorl	%eax,%r14d
-	addl	%r15d,%r12d
-	movl	%ebx,%r15d
 
+	xorl	%ebx,%r15d
 	rorl	$6,%r13d
-	andl	%eax,%r11d
-	andl	%ecx,%r15d
+	movl	%ebx,%r11d
 
+	andl	%r15d,%edi
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%r11d
 
+	xorl	%edi,%r11d
 	addl	%r12d,%edx
 	addl	%r12d,%r11d
-	leaq	1(%rdi),%rdi
+
+	leaq	4(%rbp),%rbp
 	addl	%r14d,%r11d
-
 	movl	4(%rsi),%r12d
 	movl	%edx,%r13d
 	movl	%r11d,%r14d
 	bswapl	%r12d
 	rorl	$14,%r13d
-	movl	%r8d,%r15d
-	movl	%r12d,4(%rsp)
+	movl	%r8d,%edi
 
+	xorl	%edx,%r13d
 	rorl	$9,%r14d
-	xorl	%edx,%r13d
-	xorl	%r9d,%r15d
+	xorl	%r9d,%edi
 
+	movl	%r12d,4(%rsp)
+	xorl	%r11d,%r14d
+	andl	%edx,%edi
+
 	rorl	$5,%r13d
 	addl	%r10d,%r12d
-	xorl	%r11d,%r14d
+	xorl	%r9d,%edi
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%edx,%r15d
-	movl	%eax,%r10d
-
 	rorl	$11,%r14d
 	xorl	%edx,%r13d
-	xorl	%r9d,%r15d
+	addl	%edi,%r12d
 
-	xorl	%ebx,%r10d
+	movl	%r11d,%edi
+	addl	(%rbp),%r12d
 	xorl	%r11d,%r14d
-	addl	%r15d,%r12d
-	movl	%eax,%r15d
 
+	xorl	%eax,%edi
 	rorl	$6,%r13d
-	andl	%r11d,%r10d
-	andl	%ebx,%r15d
+	movl	%eax,%r10d
 
+	andl	%edi,%r15d
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%r10d
 
+	xorl	%r15d,%r10d
 	addl	%r12d,%ecx
 	addl	%r12d,%r10d
-	leaq	1(%rdi),%rdi
+
+	leaq	4(%rbp),%rbp
 	addl	%r14d,%r10d
-
 	movl	8(%rsi),%r12d
 	movl	%ecx,%r13d
 	movl	%r10d,%r14d
@@ -129,84 +144,82 @@
 	bswapl	%r12d
 	rorl	$14,%r13d
 	movl	%edx,%r15d
-	movl	%r12d,8(%rsp)
 
+	xorl	%ecx,%r13d
 	rorl	$9,%r14d
-	xorl	%ecx,%r13d
 	xorl	%r8d,%r15d
 
+	movl	%r12d,8(%rsp)
+	xorl	%r10d,%r14d
+	andl	%ecx,%r15d
+
 	rorl	$5,%r13d
 	addl	%r9d,%r12d
-	xorl	%r10d,%r14d
+	xorl	%r8d,%r15d
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%ecx,%r15d
-	movl	%r11d,%r9d
-
 	rorl	$11,%r14d
 	xorl	%ecx,%r13d
-	xorl	%r8d,%r15d
+	addl	%r15d,%r12d
 
-	xorl	%eax,%r9d
+	movl	%r10d,%r15d
+	addl	(%rbp),%r12d
 	xorl	%r10d,%r14d
-	addl	%r15d,%r12d
-	movl	%r11d,%r15d
 
+	xorl	%r11d,%r15d
 	rorl	$6,%r13d
-	andl	%r10d,%r9d
-	andl	%eax,%r15d
+	movl	%r11d,%r9d
 
+	andl	%r15d,%edi
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%r9d
 
+	xorl	%edi,%r9d
 	addl	%r12d,%ebx
 	addl	%r12d,%r9d
-	leaq	1(%rdi),%rdi
+
+	leaq	4(%rbp),%rbp
 	addl	%r14d,%r9d
-
 	movl	12(%rsi),%r12d
 	movl	%ebx,%r13d
 	movl	%r9d,%r14d
 	bswapl	%r12d
 	rorl	$14,%r13d
-	movl	%ecx,%r15d
-	movl	%r12d,12(%rsp)
+	movl	%ecx,%edi
 
+	xorl	%ebx,%r13d
 	rorl	$9,%r14d
-	xorl	%ebx,%r13d
-	xorl	%edx,%r15d
+	xorl	%edx,%edi
 
+	movl	%r12d,12(%rsp)
+	xorl	%r9d,%r14d
+	andl	%ebx,%edi
+
 	rorl	$5,%r13d
 	addl	%r8d,%r12d
-	xorl	%r9d,%r14d
+	xorl	%edx,%edi
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%ebx,%r15d
-	movl	%r10d,%r8d
-
 	rorl	$11,%r14d
 	xorl	%ebx,%r13d
-	xorl	%edx,%r15d
+	addl	%edi,%r12d
 
-	xorl	%r11d,%r8d
+	movl	%r9d,%edi
+	addl	(%rbp),%r12d
 	xorl	%r9d,%r14d
-	addl	%r15d,%r12d
-	movl	%r10d,%r15d
 
+	xorl	%r10d,%edi
 	rorl	$6,%r13d
-	andl	%r9d,%r8d
-	andl	%r11d,%r15d
+	movl	%r10d,%r8d
 
+	andl	%edi,%r15d
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%r8d
 
+	xorl	%r15d,%r8d
 	addl	%r12d,%eax
 	addl	%r12d,%r8d
-	leaq	1(%rdi),%rdi
+
+	leaq	20(%rbp),%rbp
 	addl	%r14d,%r8d
-
 	movl	16(%rsi),%r12d
 	movl	%eax,%r13d
 	movl	%r8d,%r14d
@@ -213,84 +226,82 @@
 	bswapl	%r12d
 	rorl	$14,%r13d
 	movl	%ebx,%r15d
-	movl	%r12d,16(%rsp)
 
+	xorl	%eax,%r13d
 	rorl	$9,%r14d
-	xorl	%eax,%r13d
 	xorl	%ecx,%r15d
 
+	movl	%r12d,16(%rsp)
+	xorl	%r8d,%r14d
+	andl	%eax,%r15d
+
 	rorl	$5,%r13d
 	addl	%edx,%r12d
-	xorl	%r8d,%r14d
+	xorl	%ecx,%r15d
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%eax,%r15d
-	movl	%r9d,%edx
-
 	rorl	$11,%r14d
 	xorl	%eax,%r13d
-	xorl	%ecx,%r15d
+	addl	%r15d,%r12d
 
-	xorl	%r10d,%edx
+	movl	%r8d,%r15d
+	addl	(%rbp),%r12d
 	xorl	%r8d,%r14d
-	addl	%r15d,%r12d
-	movl	%r9d,%r15d
 
+	xorl	%r9d,%r15d
 	rorl	$6,%r13d
-	andl	%r8d,%edx
-	andl	%r10d,%r15d
+	movl	%r9d,%edx
 
+	andl	%r15d,%edi
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%edx
 
+	xorl	%edi,%edx
 	addl	%r12d,%r11d
 	addl	%r12d,%edx
-	leaq	1(%rdi),%rdi
+
+	leaq	4(%rbp),%rbp
 	addl	%r14d,%edx
-
 	movl	20(%rsi),%r12d
 	movl	%r11d,%r13d
 	movl	%edx,%r14d
 	bswapl	%r12d
 	rorl	$14,%r13d
-	movl	%eax,%r15d
-	movl	%r12d,20(%rsp)
+	movl	%eax,%edi
 
+	xorl	%r11d,%r13d
 	rorl	$9,%r14d
-	xorl	%r11d,%r13d
-	xorl	%ebx,%r15d
+	xorl	%ebx,%edi
 
+	movl	%r12d,20(%rsp)
+	xorl	%edx,%r14d
+	andl	%r11d,%edi
+
 	rorl	$5,%r13d
 	addl	%ecx,%r12d
-	xorl	%edx,%r14d
+	xorl	%ebx,%edi
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%r11d,%r15d
-	movl	%r8d,%ecx
-
 	rorl	$11,%r14d
 	xorl	%r11d,%r13d
-	xorl	%ebx,%r15d
+	addl	%edi,%r12d
 
-	xorl	%r9d,%ecx
+	movl	%edx,%edi
+	addl	(%rbp),%r12d
 	xorl	%edx,%r14d
-	addl	%r15d,%r12d
-	movl	%r8d,%r15d
 
+	xorl	%r8d,%edi
 	rorl	$6,%r13d
-	andl	%edx,%ecx
-	andl	%r9d,%r15d
+	movl	%r8d,%ecx
 
+	andl	%edi,%r15d
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%ecx
 
+	xorl	%r15d,%ecx
 	addl	%r12d,%r10d
 	addl	%r12d,%ecx
-	leaq	1(%rdi),%rdi
+
+	leaq	4(%rbp),%rbp
 	addl	%r14d,%ecx
-
 	movl	24(%rsi),%r12d
 	movl	%r10d,%r13d
 	movl	%ecx,%r14d
@@ -297,84 +308,82 @@
 	bswapl	%r12d
 	rorl	$14,%r13d
 	movl	%r11d,%r15d
-	movl	%r12d,24(%rsp)
 
+	xorl	%r10d,%r13d
 	rorl	$9,%r14d
-	xorl	%r10d,%r13d
 	xorl	%eax,%r15d
 
+	movl	%r12d,24(%rsp)
+	xorl	%ecx,%r14d
+	andl	%r10d,%r15d
+
 	rorl	$5,%r13d
 	addl	%ebx,%r12d
-	xorl	%ecx,%r14d
+	xorl	%eax,%r15d
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%r10d,%r15d
-	movl	%edx,%ebx
-
 	rorl	$11,%r14d
 	xorl	%r10d,%r13d
-	xorl	%eax,%r15d
+	addl	%r15d,%r12d
 
-	xorl	%r8d,%ebx
+	movl	%ecx,%r15d
+	addl	(%rbp),%r12d
 	xorl	%ecx,%r14d
-	addl	%r15d,%r12d
-	movl	%edx,%r15d
 
+	xorl	%edx,%r15d
 	rorl	$6,%r13d
-	andl	%ecx,%ebx
-	andl	%r8d,%r15d
+	movl	%edx,%ebx
 
+	andl	%r15d,%edi
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%ebx
 
+	xorl	%edi,%ebx
 	addl	%r12d,%r9d
 	addl	%r12d,%ebx
-	leaq	1(%rdi),%rdi
+
+	leaq	4(%rbp),%rbp
 	addl	%r14d,%ebx
-
 	movl	28(%rsi),%r12d
 	movl	%r9d,%r13d
 	movl	%ebx,%r14d
 	bswapl	%r12d
 	rorl	$14,%r13d
-	movl	%r10d,%r15d
-	movl	%r12d,28(%rsp)
+	movl	%r10d,%edi
 
+	xorl	%r9d,%r13d
 	rorl	$9,%r14d
-	xorl	%r9d,%r13d
-	xorl	%r11d,%r15d
+	xorl	%r11d,%edi
 
+	movl	%r12d,28(%rsp)
+	xorl	%ebx,%r14d
+	andl	%r9d,%edi
+
 	rorl	$5,%r13d
 	addl	%eax,%r12d
-	xorl	%ebx,%r14d
+	xorl	%r11d,%edi
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%r9d,%r15d
-	movl	%ecx,%eax
-
 	rorl	$11,%r14d
 	xorl	%r9d,%r13d
-	xorl	%r11d,%r15d
+	addl	%edi,%r12d
 
-	xorl	%edx,%eax
+	movl	%ebx,%edi
+	addl	(%rbp),%r12d
 	xorl	%ebx,%r14d
-	addl	%r15d,%r12d
-	movl	%ecx,%r15d
 
+	xorl	%ecx,%edi
 	rorl	$6,%r13d
-	andl	%ebx,%eax
-	andl	%edx,%r15d
+	movl	%ecx,%eax
 
+	andl	%edi,%r15d
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%eax
 
+	xorl	%r15d,%eax
 	addl	%r12d,%r8d
 	addl	%r12d,%eax
-	leaq	1(%rdi),%rdi
+
+	leaq	20(%rbp),%rbp
 	addl	%r14d,%eax
-
 	movl	32(%rsi),%r12d
 	movl	%r8d,%r13d
 	movl	%eax,%r14d
@@ -381,84 +390,82 @@
 	bswapl	%r12d
 	rorl	$14,%r13d
 	movl	%r9d,%r15d
-	movl	%r12d,32(%rsp)
 
+	xorl	%r8d,%r13d
 	rorl	$9,%r14d
-	xorl	%r8d,%r13d
 	xorl	%r10d,%r15d
 
+	movl	%r12d,32(%rsp)
+	xorl	%eax,%r14d
+	andl	%r8d,%r15d
+
 	rorl	$5,%r13d
 	addl	%r11d,%r12d
-	xorl	%eax,%r14d
+	xorl	%r10d,%r15d
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%r8d,%r15d
-	movl	%ebx,%r11d
-
 	rorl	$11,%r14d
 	xorl	%r8d,%r13d
-	xorl	%r10d,%r15d
+	addl	%r15d,%r12d
 
-	xorl	%ecx,%r11d
+	movl	%eax,%r15d
+	addl	(%rbp),%r12d
 	xorl	%eax,%r14d
-	addl	%r15d,%r12d
-	movl	%ebx,%r15d
 
+	xorl	%ebx,%r15d
 	rorl	$6,%r13d
-	andl	%eax,%r11d
-	andl	%ecx,%r15d
+	movl	%ebx,%r11d
 
+	andl	%r15d,%edi
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%r11d
 
+	xorl	%edi,%r11d
 	addl	%r12d,%edx
 	addl	%r12d,%r11d
-	leaq	1(%rdi),%rdi
+
+	leaq	4(%rbp),%rbp
 	addl	%r14d,%r11d
-
 	movl	36(%rsi),%r12d
 	movl	%edx,%r13d
 	movl	%r11d,%r14d
 	bswapl	%r12d
 	rorl	$14,%r13d
-	movl	%r8d,%r15d
-	movl	%r12d,36(%rsp)
+	movl	%r8d,%edi
 
+	xorl	%edx,%r13d
 	rorl	$9,%r14d
-	xorl	%edx,%r13d
-	xorl	%r9d,%r15d
+	xorl	%r9d,%edi
 
+	movl	%r12d,36(%rsp)
+	xorl	%r11d,%r14d
+	andl	%edx,%edi
+
 	rorl	$5,%r13d
 	addl	%r10d,%r12d
-	xorl	%r11d,%r14d
+	xorl	%r9d,%edi
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%edx,%r15d
-	movl	%eax,%r10d
-
 	rorl	$11,%r14d
 	xorl	%edx,%r13d
-	xorl	%r9d,%r15d
+	addl	%edi,%r12d
 
-	xorl	%ebx,%r10d
+	movl	%r11d,%edi
+	addl	(%rbp),%r12d
 	xorl	%r11d,%r14d
-	addl	%r15d,%r12d
-	movl	%eax,%r15d
 
+	xorl	%eax,%edi
 	rorl	$6,%r13d
-	andl	%r11d,%r10d
-	andl	%ebx,%r15d
+	movl	%eax,%r10d
 
+	andl	%edi,%r15d
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%r10d
 
+	xorl	%r15d,%r10d
 	addl	%r12d,%ecx
 	addl	%r12d,%r10d
-	leaq	1(%rdi),%rdi
+
+	leaq	4(%rbp),%rbp
 	addl	%r14d,%r10d
-
 	movl	40(%rsi),%r12d
 	movl	%ecx,%r13d
 	movl	%r10d,%r14d
@@ -465,84 +472,82 @@
 	bswapl	%r12d
 	rorl	$14,%r13d
 	movl	%edx,%r15d
-	movl	%r12d,40(%rsp)
 
+	xorl	%ecx,%r13d
 	rorl	$9,%r14d
-	xorl	%ecx,%r13d
 	xorl	%r8d,%r15d
 
+	movl	%r12d,40(%rsp)
+	xorl	%r10d,%r14d
+	andl	%ecx,%r15d
+
 	rorl	$5,%r13d
 	addl	%r9d,%r12d
-	xorl	%r10d,%r14d
+	xorl	%r8d,%r15d
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%ecx,%r15d
-	movl	%r11d,%r9d
-
 	rorl	$11,%r14d
 	xorl	%ecx,%r13d
-	xorl	%r8d,%r15d
+	addl	%r15d,%r12d
 
-	xorl	%eax,%r9d
+	movl	%r10d,%r15d
+	addl	(%rbp),%r12d
 	xorl	%r10d,%r14d
-	addl	%r15d,%r12d
-	movl	%r11d,%r15d
 
+	xorl	%r11d,%r15d
 	rorl	$6,%r13d
-	andl	%r10d,%r9d
-	andl	%eax,%r15d
+	movl	%r11d,%r9d
 
+	andl	%r15d,%edi
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%r9d
 
+	xorl	%edi,%r9d
 	addl	%r12d,%ebx
 	addl	%r12d,%r9d
-	leaq	1(%rdi),%rdi
+
+	leaq	4(%rbp),%rbp
 	addl	%r14d,%r9d
-
 	movl	44(%rsi),%r12d
 	movl	%ebx,%r13d
 	movl	%r9d,%r14d
 	bswapl	%r12d
 	rorl	$14,%r13d
-	movl	%ecx,%r15d
-	movl	%r12d,44(%rsp)
+	movl	%ecx,%edi
 
+	xorl	%ebx,%r13d
 	rorl	$9,%r14d
-	xorl	%ebx,%r13d
-	xorl	%edx,%r15d
+	xorl	%edx,%edi
 
+	movl	%r12d,44(%rsp)
+	xorl	%r9d,%r14d
+	andl	%ebx,%edi
+
 	rorl	$5,%r13d
 	addl	%r8d,%r12d
-	xorl	%r9d,%r14d
+	xorl	%edx,%edi
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%ebx,%r15d
-	movl	%r10d,%r8d
-
 	rorl	$11,%r14d
 	xorl	%ebx,%r13d
-	xorl	%edx,%r15d
+	addl	%edi,%r12d
 
-	xorl	%r11d,%r8d
+	movl	%r9d,%edi
+	addl	(%rbp),%r12d
 	xorl	%r9d,%r14d
-	addl	%r15d,%r12d
-	movl	%r10d,%r15d
 
+	xorl	%r10d,%edi
 	rorl	$6,%r13d
-	andl	%r9d,%r8d
-	andl	%r11d,%r15d
+	movl	%r10d,%r8d
 
+	andl	%edi,%r15d
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%r8d
 
+	xorl	%r15d,%r8d
 	addl	%r12d,%eax
 	addl	%r12d,%r8d
-	leaq	1(%rdi),%rdi
+
+	leaq	20(%rbp),%rbp
 	addl	%r14d,%r8d
-
 	movl	48(%rsi),%r12d
 	movl	%eax,%r13d
 	movl	%r8d,%r14d
@@ -549,84 +554,82 @@
 	bswapl	%r12d
 	rorl	$14,%r13d
 	movl	%ebx,%r15d
-	movl	%r12d,48(%rsp)
 
+	xorl	%eax,%r13d
 	rorl	$9,%r14d
-	xorl	%eax,%r13d
 	xorl	%ecx,%r15d
 
+	movl	%r12d,48(%rsp)
+	xorl	%r8d,%r14d
+	andl	%eax,%r15d
+
 	rorl	$5,%r13d
 	addl	%edx,%r12d
-	xorl	%r8d,%r14d
+	xorl	%ecx,%r15d
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%eax,%r15d
-	movl	%r9d,%edx
-
 	rorl	$11,%r14d
 	xorl	%eax,%r13d
-	xorl	%ecx,%r15d
+	addl	%r15d,%r12d
 
-	xorl	%r10d,%edx
+	movl	%r8d,%r15d
+	addl	(%rbp),%r12d
 	xorl	%r8d,%r14d
-	addl	%r15d,%r12d
-	movl	%r9d,%r15d
 
+	xorl	%r9d,%r15d
 	rorl	$6,%r13d
-	andl	%r8d,%edx
-	andl	%r10d,%r15d
+	movl	%r9d,%edx
 
+	andl	%r15d,%edi
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%edx
 
+	xorl	%edi,%edx
 	addl	%r12d,%r11d
 	addl	%r12d,%edx
-	leaq	1(%rdi),%rdi
+
+	leaq	4(%rbp),%rbp
 	addl	%r14d,%edx
-
 	movl	52(%rsi),%r12d
 	movl	%r11d,%r13d
 	movl	%edx,%r14d
 	bswapl	%r12d
 	rorl	$14,%r13d
-	movl	%eax,%r15d
-	movl	%r12d,52(%rsp)
+	movl	%eax,%edi
 
+	xorl	%r11d,%r13d
 	rorl	$9,%r14d
-	xorl	%r11d,%r13d
-	xorl	%ebx,%r15d
+	xorl	%ebx,%edi
 
+	movl	%r12d,52(%rsp)
+	xorl	%edx,%r14d
+	andl	%r11d,%edi
+
 	rorl	$5,%r13d
 	addl	%ecx,%r12d
-	xorl	%edx,%r14d
+	xorl	%ebx,%edi
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%r11d,%r15d
-	movl	%r8d,%ecx
-
 	rorl	$11,%r14d
 	xorl	%r11d,%r13d
-	xorl	%ebx,%r15d
+	addl	%edi,%r12d
 
-	xorl	%r9d,%ecx
+	movl	%edx,%edi
+	addl	(%rbp),%r12d
 	xorl	%edx,%r14d
-	addl	%r15d,%r12d
-	movl	%r8d,%r15d
 
+	xorl	%r8d,%edi
 	rorl	$6,%r13d
-	andl	%edx,%ecx
-	andl	%r9d,%r15d
+	movl	%r8d,%ecx
 
+	andl	%edi,%r15d
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%ecx
 
+	xorl	%r15d,%ecx
 	addl	%r12d,%r10d
 	addl	%r12d,%ecx
-	leaq	1(%rdi),%rdi
+
+	leaq	4(%rbp),%rbp
 	addl	%r14d,%ecx
-
 	movl	56(%rsi),%r12d
 	movl	%r10d,%r13d
 	movl	%ecx,%r14d
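
The .Lrounds_16_xx hunk below applies the same restructuring to the remaining rounds, where each round first extends the message schedule with the chained rorl/shrl/xorl sequences before running the step sketched above. That expansion is the standard sigma recurrence; in C, as a sketch with illustrative names:

    #include <stdint.h>

    static uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

    /* Schedule word W[i] for i >= 16, built from the 16-word window the code
     * keeps in the stack slots 0(%rsp)..60(%rsp). */
    static uint32_t sha256_schedule(const uint32_t W[], int i)
    {
        uint32_t s0 = ror32(W[i - 15], 7) ^ ror32(W[i - 15], 18) ^ (W[i - 15] >> 3);
        uint32_t s1 = ror32(W[i - 2], 17) ^ ror32(W[i - 2], 19) ^ (W[i - 2] >> 10);
        return W[i - 16] + s0 + W[i - 7] + s1;
    }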
@@ -633,1099 +636,1049 @@
 	bswapl	%r12d
 	rorl	$14,%r13d
 	movl	%r11d,%r15d
-	movl	%r12d,56(%rsp)
 
+	xorl	%r10d,%r13d
 	rorl	$9,%r14d
-	xorl	%r10d,%r13d
 	xorl	%eax,%r15d
 
+	movl	%r12d,56(%rsp)
+	xorl	%ecx,%r14d
+	andl	%r10d,%r15d
+
 	rorl	$5,%r13d
 	addl	%ebx,%r12d
-	xorl	%ecx,%r14d
+	xorl	%eax,%r15d
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%r10d,%r15d
-	movl	%edx,%ebx
-
 	rorl	$11,%r14d
 	xorl	%r10d,%r13d
-	xorl	%eax,%r15d
+	addl	%r15d,%r12d
 
-	xorl	%r8d,%ebx
+	movl	%ecx,%r15d
+	addl	(%rbp),%r12d
 	xorl	%ecx,%r14d
-	addl	%r15d,%r12d
-	movl	%edx,%r15d
 
+	xorl	%edx,%r15d
 	rorl	$6,%r13d
-	andl	%ecx,%ebx
-	andl	%r8d,%r15d
+	movl	%edx,%ebx
 
+	andl	%r15d,%edi
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%ebx
 
+	xorl	%edi,%ebx
 	addl	%r12d,%r9d
 	addl	%r12d,%ebx
-	leaq	1(%rdi),%rdi
+
+	leaq	4(%rbp),%rbp
 	addl	%r14d,%ebx
-
 	movl	60(%rsi),%r12d
 	movl	%r9d,%r13d
 	movl	%ebx,%r14d
 	bswapl	%r12d
 	rorl	$14,%r13d
-	movl	%r10d,%r15d
-	movl	%r12d,60(%rsp)
+	movl	%r10d,%edi
 
+	xorl	%r9d,%r13d
 	rorl	$9,%r14d
-	xorl	%r9d,%r13d
-	xorl	%r11d,%r15d
+	xorl	%r11d,%edi
 
+	movl	%r12d,60(%rsp)
+	xorl	%ebx,%r14d
+	andl	%r9d,%edi
+
 	rorl	$5,%r13d
 	addl	%eax,%r12d
-	xorl	%ebx,%r14d
+	xorl	%r11d,%edi
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%r9d,%r15d
-	movl	%ecx,%eax
-
 	rorl	$11,%r14d
 	xorl	%r9d,%r13d
-	xorl	%r11d,%r15d
+	addl	%edi,%r12d
 
-	xorl	%edx,%eax
+	movl	%ebx,%edi
+	addl	(%rbp),%r12d
 	xorl	%ebx,%r14d
-	addl	%r15d,%r12d
-	movl	%ecx,%r15d
 
+	xorl	%ecx,%edi
 	rorl	$6,%r13d
-	andl	%ebx,%eax
-	andl	%edx,%r15d
+	movl	%ecx,%eax
 
+	andl	%edi,%r15d
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%eax
 
+	xorl	%r15d,%eax
 	addl	%r12d,%r8d
 	addl	%r12d,%eax
-	leaq	1(%rdi),%rdi
-	addl	%r14d,%eax
 
+	leaq	20(%rbp),%rbp
 	jmp	.Lrounds_16_xx
 .align	16
 .Lrounds_16_xx:
 	movl	4(%rsp),%r13d
-	movl	56(%rsp),%r14d
+	movl	56(%rsp),%r15d
+
 	movl	%r13d,%r12d
-	movl	%r14d,%r15d
+	rorl	$11,%r13d
+	addl	%r14d,%eax
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
 
-	rorl	$11,%r12d
-	xorl	%r13d,%r12d
-	shrl	$3,%r13d
-
-	rorl	$7,%r12d
 	xorl	%r12d,%r13d
-	movl	36(%rsp),%r12d
-
-	rorl	$2,%r15d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
 	xorl	%r14d,%r15d
 	shrl	$10,%r14d
 
 	rorl	$17,%r15d
-	addl	%r13d,%r12d
-	xorl	%r15d,%r14d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	36(%rsp),%r12d
 
 	addl	0(%rsp),%r12d
 	movl	%r8d,%r13d
-	addl	%r14d,%r12d
+	addl	%r15d,%r12d
 	movl	%eax,%r14d
 	rorl	$14,%r13d
 	movl	%r9d,%r15d
-	movl	%r12d,0(%rsp)
 
+	xorl	%r8d,%r13d
 	rorl	$9,%r14d
-	xorl	%r8d,%r13d
 	xorl	%r10d,%r15d
 
+	movl	%r12d,0(%rsp)
+	xorl	%eax,%r14d
+	andl	%r8d,%r15d
+
 	rorl	$5,%r13d
 	addl	%r11d,%r12d
-	xorl	%eax,%r14d
+	xorl	%r10d,%r15d
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%r8d,%r15d
-	movl	%ebx,%r11d
-
 	rorl	$11,%r14d
 	xorl	%r8d,%r13d
-	xorl	%r10d,%r15d
+	addl	%r15d,%r12d
 
-	xorl	%ecx,%r11d
+	movl	%eax,%r15d
+	addl	(%rbp),%r12d
 	xorl	%eax,%r14d
-	addl	%r15d,%r12d
-	movl	%ebx,%r15d
 
+	xorl	%ebx,%r15d
 	rorl	$6,%r13d
-	andl	%eax,%r11d
-	andl	%ecx,%r15d
+	movl	%ebx,%r11d
 
+	andl	%r15d,%edi
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%r11d
 
+	xorl	%edi,%r11d
 	addl	%r12d,%edx
 	addl	%r12d,%r11d
-	leaq	1(%rdi),%rdi
-	addl	%r14d,%r11d
 
+	leaq	4(%rbp),%rbp
 	movl	8(%rsp),%r13d
-	movl	60(%rsp),%r14d
+	movl	60(%rsp),%edi
+
 	movl	%r13d,%r12d
-	movl	%r14d,%r15d
+	rorl	$11,%r13d
+	addl	%r14d,%r11d
+	movl	%edi,%r14d
+	rorl	$2,%edi
 
-	rorl	$11,%r12d
-	xorl	%r13d,%r12d
-	shrl	$3,%r13d
-
-	rorl	$7,%r12d
 	xorl	%r12d,%r13d
-	movl	40(%rsp),%r12d
-
-	rorl	$2,%r15d
-	xorl	%r14d,%r15d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
 	shrl	$10,%r14d
 
-	rorl	$17,%r15d
-	addl	%r13d,%r12d
-	xorl	%r15d,%r14d
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	40(%rsp),%r12d
 
 	addl	4(%rsp),%r12d
 	movl	%edx,%r13d
-	addl	%r14d,%r12d
+	addl	%edi,%r12d
 	movl	%r11d,%r14d
 	rorl	$14,%r13d
-	movl	%r8d,%r15d
-	movl	%r12d,4(%rsp)
+	movl	%r8d,%edi
 
+	xorl	%edx,%r13d
 	rorl	$9,%r14d
-	xorl	%edx,%r13d
-	xorl	%r9d,%r15d
+	xorl	%r9d,%edi
 
+	movl	%r12d,4(%rsp)
+	xorl	%r11d,%r14d
+	andl	%edx,%edi
+
 	rorl	$5,%r13d
 	addl	%r10d,%r12d
-	xorl	%r11d,%r14d
+	xorl	%r9d,%edi
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%edx,%r15d
-	movl	%eax,%r10d
-
 	rorl	$11,%r14d
 	xorl	%edx,%r13d
-	xorl	%r9d,%r15d
+	addl	%edi,%r12d
 
-	xorl	%ebx,%r10d
+	movl	%r11d,%edi
+	addl	(%rbp),%r12d
 	xorl	%r11d,%r14d
-	addl	%r15d,%r12d
-	movl	%eax,%r15d
 
+	xorl	%eax,%edi
 	rorl	$6,%r13d
-	andl	%r11d,%r10d
-	andl	%ebx,%r15d
+	movl	%eax,%r10d
 
+	andl	%edi,%r15d
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%r10d
 
+	xorl	%r15d,%r10d
 	addl	%r12d,%ecx
 	addl	%r12d,%r10d
-	leaq	1(%rdi),%rdi
-	addl	%r14d,%r10d
 
+	leaq	4(%rbp),%rbp
 	movl	12(%rsp),%r13d
-	movl	0(%rsp),%r14d
+	movl	0(%rsp),%r15d
+
 	movl	%r13d,%r12d
-	movl	%r14d,%r15d
+	rorl	$11,%r13d
+	addl	%r14d,%r10d
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
 
-	rorl	$11,%r12d
-	xorl	%r13d,%r12d
-	shrl	$3,%r13d
-
-	rorl	$7,%r12d
 	xorl	%r12d,%r13d
-	movl	44(%rsp),%r12d
-
-	rorl	$2,%r15d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
 	xorl	%r14d,%r15d
 	shrl	$10,%r14d
 
 	rorl	$17,%r15d
-	addl	%r13d,%r12d
-	xorl	%r15d,%r14d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	44(%rsp),%r12d
 
 	addl	8(%rsp),%r12d
 	movl	%ecx,%r13d
-	addl	%r14d,%r12d
+	addl	%r15d,%r12d
 	movl	%r10d,%r14d
 	rorl	$14,%r13d
 	movl	%edx,%r15d
-	movl	%r12d,8(%rsp)
 
+	xorl	%ecx,%r13d
 	rorl	$9,%r14d
-	xorl	%ecx,%r13d
 	xorl	%r8d,%r15d
 
+	movl	%r12d,8(%rsp)
+	xorl	%r10d,%r14d
+	andl	%ecx,%r15d
+
 	rorl	$5,%r13d
 	addl	%r9d,%r12d
-	xorl	%r10d,%r14d
+	xorl	%r8d,%r15d
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%ecx,%r15d
-	movl	%r11d,%r9d
-
 	rorl	$11,%r14d
 	xorl	%ecx,%r13d
-	xorl	%r8d,%r15d
+	addl	%r15d,%r12d
 
-	xorl	%eax,%r9d
+	movl	%r10d,%r15d
+	addl	(%rbp),%r12d
 	xorl	%r10d,%r14d
-	addl	%r15d,%r12d
-	movl	%r11d,%r15d
 
+	xorl	%r11d,%r15d
 	rorl	$6,%r13d
-	andl	%r10d,%r9d
-	andl	%eax,%r15d
+	movl	%r11d,%r9d
 
+	andl	%r15d,%edi
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%r9d
 
+	xorl	%edi,%r9d
 	addl	%r12d,%ebx
 	addl	%r12d,%r9d
-	leaq	1(%rdi),%rdi
-	addl	%r14d,%r9d
 
+	leaq	4(%rbp),%rbp
 	movl	16(%rsp),%r13d
-	movl	4(%rsp),%r14d
+	movl	4(%rsp),%edi
+
 	movl	%r13d,%r12d
-	movl	%r14d,%r15d
+	rorl	$11,%r13d
+	addl	%r14d,%r9d
+	movl	%edi,%r14d
+	rorl	$2,%edi
 
-	rorl	$11,%r12d
-	xorl	%r13d,%r12d
-	shrl	$3,%r13d
-
-	rorl	$7,%r12d
 	xorl	%r12d,%r13d
-	movl	48(%rsp),%r12d
-
-	rorl	$2,%r15d
-	xorl	%r14d,%r15d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
 	shrl	$10,%r14d
 
-	rorl	$17,%r15d
-	addl	%r13d,%r12d
-	xorl	%r15d,%r14d
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	48(%rsp),%r12d
 
 	addl	12(%rsp),%r12d
 	movl	%ebx,%r13d
-	addl	%r14d,%r12d
+	addl	%edi,%r12d
 	movl	%r9d,%r14d
 	rorl	$14,%r13d
-	movl	%ecx,%r15d
-	movl	%r12d,12(%rsp)
+	movl	%ecx,%edi
 
+	xorl	%ebx,%r13d
 	rorl	$9,%r14d
-	xorl	%ebx,%r13d
-	xorl	%edx,%r15d
+	xorl	%edx,%edi
 
+	movl	%r12d,12(%rsp)
+	xorl	%r9d,%r14d
+	andl	%ebx,%edi
+
 	rorl	$5,%r13d
 	addl	%r8d,%r12d
-	xorl	%r9d,%r14d
+	xorl	%edx,%edi
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%ebx,%r15d
-	movl	%r10d,%r8d
-
 	rorl	$11,%r14d
 	xorl	%ebx,%r13d
-	xorl	%edx,%r15d
+	addl	%edi,%r12d
 
-	xorl	%r11d,%r8d
+	movl	%r9d,%edi
+	addl	(%rbp),%r12d
 	xorl	%r9d,%r14d
-	addl	%r15d,%r12d
-	movl	%r10d,%r15d
 
+	xorl	%r10d,%edi
 	rorl	$6,%r13d
-	andl	%r9d,%r8d
-	andl	%r11d,%r15d
+	movl	%r10d,%r8d
 
+	andl	%edi,%r15d
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%r8d
 
+	xorl	%r15d,%r8d
 	addl	%r12d,%eax
 	addl	%r12d,%r8d
-	leaq	1(%rdi),%rdi
-	addl	%r14d,%r8d
 
+	leaq	20(%rbp),%rbp
 	movl	20(%rsp),%r13d
-	movl	8(%rsp),%r14d
+	movl	8(%rsp),%r15d
+
 	movl	%r13d,%r12d
-	movl	%r14d,%r15d
+	rorl	$11,%r13d
+	addl	%r14d,%r8d
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
 
-	rorl	$11,%r12d
-	xorl	%r13d,%r12d
-	shrl	$3,%r13d
-
-	rorl	$7,%r12d
 	xorl	%r12d,%r13d
-	movl	52(%rsp),%r12d
-
-	rorl	$2,%r15d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
 	xorl	%r14d,%r15d
 	shrl	$10,%r14d
 
 	rorl	$17,%r15d
-	addl	%r13d,%r12d
-	xorl	%r15d,%r14d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	52(%rsp),%r12d
 
 	addl	16(%rsp),%r12d
 	movl	%eax,%r13d
-	addl	%r14d,%r12d
+	addl	%r15d,%r12d
 	movl	%r8d,%r14d
 	rorl	$14,%r13d
 	movl	%ebx,%r15d
-	movl	%r12d,16(%rsp)
 
+	xorl	%eax,%r13d
 	rorl	$9,%r14d
-	xorl	%eax,%r13d
 	xorl	%ecx,%r15d
 
+	movl	%r12d,16(%rsp)
+	xorl	%r8d,%r14d
+	andl	%eax,%r15d
+
 	rorl	$5,%r13d
 	addl	%edx,%r12d
-	xorl	%r8d,%r14d
+	xorl	%ecx,%r15d
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%eax,%r15d
-	movl	%r9d,%edx
-
 	rorl	$11,%r14d
 	xorl	%eax,%r13d
-	xorl	%ecx,%r15d
+	addl	%r15d,%r12d
 
-	xorl	%r10d,%edx
+	movl	%r8d,%r15d
+	addl	(%rbp),%r12d
 	xorl	%r8d,%r14d
-	addl	%r15d,%r12d
-	movl	%r9d,%r15d
 
+	xorl	%r9d,%r15d
 	rorl	$6,%r13d
-	andl	%r8d,%edx
-	andl	%r10d,%r15d
+	movl	%r9d,%edx
 
+	andl	%r15d,%edi
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%edx
 
+	xorl	%edi,%edx
 	addl	%r12d,%r11d
 	addl	%r12d,%edx
-	leaq	1(%rdi),%rdi
-	addl	%r14d,%edx
 
+	leaq	4(%rbp),%rbp
 	movl	24(%rsp),%r13d
-	movl	12(%rsp),%r14d
+	movl	12(%rsp),%edi
+
 	movl	%r13d,%r12d
-	movl	%r14d,%r15d
+	rorl	$11,%r13d
+	addl	%r14d,%edx
+	movl	%edi,%r14d
+	rorl	$2,%edi
 
-	rorl	$11,%r12d
-	xorl	%r13d,%r12d
-	shrl	$3,%r13d
-
-	rorl	$7,%r12d
 	xorl	%r12d,%r13d
-	movl	56(%rsp),%r12d
-
-	rorl	$2,%r15d
-	xorl	%r14d,%r15d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
 	shrl	$10,%r14d
 
-	rorl	$17,%r15d
-	addl	%r13d,%r12d
-	xorl	%r15d,%r14d
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	56(%rsp),%r12d
 
 	addl	20(%rsp),%r12d
 	movl	%r11d,%r13d
-	addl	%r14d,%r12d
+	addl	%edi,%r12d
 	movl	%edx,%r14d
 	rorl	$14,%r13d
-	movl	%eax,%r15d
-	movl	%r12d,20(%rsp)
+	movl	%eax,%edi
 
+	xorl	%r11d,%r13d
 	rorl	$9,%r14d
-	xorl	%r11d,%r13d
-	xorl	%ebx,%r15d
+	xorl	%ebx,%edi
 
+	movl	%r12d,20(%rsp)
+	xorl	%edx,%r14d
+	andl	%r11d,%edi
+
 	rorl	$5,%r13d
 	addl	%ecx,%r12d
-	xorl	%edx,%r14d
+	xorl	%ebx,%edi
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%r11d,%r15d
-	movl	%r8d,%ecx
-
 	rorl	$11,%r14d
 	xorl	%r11d,%r13d
-	xorl	%ebx,%r15d
+	addl	%edi,%r12d
 
-	xorl	%r9d,%ecx
+	movl	%edx,%edi
+	addl	(%rbp),%r12d
 	xorl	%edx,%r14d
-	addl	%r15d,%r12d
-	movl	%r8d,%r15d
 
+	xorl	%r8d,%edi
 	rorl	$6,%r13d
-	andl	%edx,%ecx
-	andl	%r9d,%r15d
+	movl	%r8d,%ecx
 
+	andl	%edi,%r15d
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%ecx
 
+	xorl	%r15d,%ecx
 	addl	%r12d,%r10d
 	addl	%r12d,%ecx
-	leaq	1(%rdi),%rdi
-	addl	%r14d,%ecx
 
+	leaq	4(%rbp),%rbp
 	movl	28(%rsp),%r13d
-	movl	16(%rsp),%r14d
+	movl	16(%rsp),%r15d
+
 	movl	%r13d,%r12d
-	movl	%r14d,%r15d
+	rorl	$11,%r13d
+	addl	%r14d,%ecx
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
 
-	rorl	$11,%r12d
-	xorl	%r13d,%r12d
-	shrl	$3,%r13d
-
-	rorl	$7,%r12d
 	xorl	%r12d,%r13d
-	movl	60(%rsp),%r12d
-
-	rorl	$2,%r15d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
 	xorl	%r14d,%r15d
 	shrl	$10,%r14d
 
 	rorl	$17,%r15d
-	addl	%r13d,%r12d
-	xorl	%r15d,%r14d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	60(%rsp),%r12d
 
 	addl	24(%rsp),%r12d
 	movl	%r10d,%r13d
-	addl	%r14d,%r12d
+	addl	%r15d,%r12d
 	movl	%ecx,%r14d
 	rorl	$14,%r13d
 	movl	%r11d,%r15d
-	movl	%r12d,24(%rsp)
 
+	xorl	%r10d,%r13d
 	rorl	$9,%r14d
-	xorl	%r10d,%r13d
 	xorl	%eax,%r15d
 
+	movl	%r12d,24(%rsp)
+	xorl	%ecx,%r14d
+	andl	%r10d,%r15d
+
 	rorl	$5,%r13d
 	addl	%ebx,%r12d
-	xorl	%ecx,%r14d
+	xorl	%eax,%r15d
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%r10d,%r15d
-	movl	%edx,%ebx
-
 	rorl	$11,%r14d
 	xorl	%r10d,%r13d
-	xorl	%eax,%r15d
+	addl	%r15d,%r12d
 
-	xorl	%r8d,%ebx
+	movl	%ecx,%r15d
+	addl	(%rbp),%r12d
 	xorl	%ecx,%r14d
-	addl	%r15d,%r12d
-	movl	%edx,%r15d
 
+	xorl	%edx,%r15d
 	rorl	$6,%r13d
-	andl	%ecx,%ebx
-	andl	%r8d,%r15d
+	movl	%edx,%ebx
 
+	andl	%r15d,%edi
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%ebx
 
+	xorl	%edi,%ebx
 	addl	%r12d,%r9d
 	addl	%r12d,%ebx
-	leaq	1(%rdi),%rdi
-	addl	%r14d,%ebx
 
+	leaq	4(%rbp),%rbp
 	movl	32(%rsp),%r13d
-	movl	20(%rsp),%r14d
+	movl	20(%rsp),%edi
+
 	movl	%r13d,%r12d
-	movl	%r14d,%r15d
+	rorl	$11,%r13d
+	addl	%r14d,%ebx
+	movl	%edi,%r14d
+	rorl	$2,%edi
 
-	rorl	$11,%r12d
-	xorl	%r13d,%r12d
-	shrl	$3,%r13d
-
-	rorl	$7,%r12d
 	xorl	%r12d,%r13d
-	movl	0(%rsp),%r12d
-
-	rorl	$2,%r15d
-	xorl	%r14d,%r15d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
 	shrl	$10,%r14d
 
-	rorl	$17,%r15d
-	addl	%r13d,%r12d
-	xorl	%r15d,%r14d
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	0(%rsp),%r12d
 
 	addl	28(%rsp),%r12d
 	movl	%r9d,%r13d
-	addl	%r14d,%r12d
+	addl	%edi,%r12d
 	movl	%ebx,%r14d
 	rorl	$14,%r13d
-	movl	%r10d,%r15d
-	movl	%r12d,28(%rsp)
+	movl	%r10d,%edi
 
+	xorl	%r9d,%r13d
 	rorl	$9,%r14d
-	xorl	%r9d,%r13d
-	xorl	%r11d,%r15d
+	xorl	%r11d,%edi
 
+	movl	%r12d,28(%rsp)
+	xorl	%ebx,%r14d
+	andl	%r9d,%edi
+
 	rorl	$5,%r13d
 	addl	%eax,%r12d
-	xorl	%ebx,%r14d
+	xorl	%r11d,%edi
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%r9d,%r15d
-	movl	%ecx,%eax
-
 	rorl	$11,%r14d
 	xorl	%r9d,%r13d
-	xorl	%r11d,%r15d
+	addl	%edi,%r12d
 
-	xorl	%edx,%eax
+	movl	%ebx,%edi
+	addl	(%rbp),%r12d
 	xorl	%ebx,%r14d
-	addl	%r15d,%r12d
-	movl	%ecx,%r15d
 
+	xorl	%ecx,%edi
 	rorl	$6,%r13d
-	andl	%ebx,%eax
-	andl	%edx,%r15d
+	movl	%ecx,%eax
 
+	andl	%edi,%r15d
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%eax
 
+	xorl	%r15d,%eax
 	addl	%r12d,%r8d
 	addl	%r12d,%eax
-	leaq	1(%rdi),%rdi
-	addl	%r14d,%eax
 
+	leaq	20(%rbp),%rbp
 	movl	36(%rsp),%r13d
-	movl	24(%rsp),%r14d
+	movl	24(%rsp),%r15d
+
 	movl	%r13d,%r12d
-	movl	%r14d,%r15d
+	rorl	$11,%r13d
+	addl	%r14d,%eax
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
 
-	rorl	$11,%r12d
-	xorl	%r13d,%r12d
-	shrl	$3,%r13d
-
-	rorl	$7,%r12d
 	xorl	%r12d,%r13d
-	movl	4(%rsp),%r12d
-
-	rorl	$2,%r15d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
 	xorl	%r14d,%r15d
 	shrl	$10,%r14d
 
 	rorl	$17,%r15d
-	addl	%r13d,%r12d
-	xorl	%r15d,%r14d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	4(%rsp),%r12d
 
 	addl	32(%rsp),%r12d
 	movl	%r8d,%r13d
-	addl	%r14d,%r12d
+	addl	%r15d,%r12d
 	movl	%eax,%r14d
 	rorl	$14,%r13d
 	movl	%r9d,%r15d
-	movl	%r12d,32(%rsp)
 
+	xorl	%r8d,%r13d
 	rorl	$9,%r14d
-	xorl	%r8d,%r13d
 	xorl	%r10d,%r15d
 
+	movl	%r12d,32(%rsp)
+	xorl	%eax,%r14d
+	andl	%r8d,%r15d
+
 	rorl	$5,%r13d
 	addl	%r11d,%r12d
-	xorl	%eax,%r14d
+	xorl	%r10d,%r15d
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%r8d,%r15d
-	movl	%ebx,%r11d
-
 	rorl	$11,%r14d
 	xorl	%r8d,%r13d
-	xorl	%r10d,%r15d
+	addl	%r15d,%r12d
 
-	xorl	%ecx,%r11d
+	movl	%eax,%r15d
+	addl	(%rbp),%r12d
 	xorl	%eax,%r14d
-	addl	%r15d,%r12d
-	movl	%ebx,%r15d
 
+	xorl	%ebx,%r15d
 	rorl	$6,%r13d
-	andl	%eax,%r11d
-	andl	%ecx,%r15d
+	movl	%ebx,%r11d
 
+	andl	%r15d,%edi
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%r11d
 
+	xorl	%edi,%r11d
 	addl	%r12d,%edx
 	addl	%r12d,%r11d
-	leaq	1(%rdi),%rdi
-	addl	%r14d,%r11d
 
+	leaq	4(%rbp),%rbp
 	movl	40(%rsp),%r13d
-	movl	28(%rsp),%r14d
+	movl	28(%rsp),%edi
+
 	movl	%r13d,%r12d
-	movl	%r14d,%r15d
+	rorl	$11,%r13d
+	addl	%r14d,%r11d
+	movl	%edi,%r14d
+	rorl	$2,%edi
 
-	rorl	$11,%r12d
-	xorl	%r13d,%r12d
-	shrl	$3,%r13d
-
-	rorl	$7,%r12d
 	xorl	%r12d,%r13d
-	movl	8(%rsp),%r12d
-
-	rorl	$2,%r15d
-	xorl	%r14d,%r15d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
 	shrl	$10,%r14d
 
-	rorl	$17,%r15d
-	addl	%r13d,%r12d
-	xorl	%r15d,%r14d
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	8(%rsp),%r12d
 
 	addl	36(%rsp),%r12d
 	movl	%edx,%r13d
-	addl	%r14d,%r12d
+	addl	%edi,%r12d
 	movl	%r11d,%r14d
 	rorl	$14,%r13d
-	movl	%r8d,%r15d
-	movl	%r12d,36(%rsp)
+	movl	%r8d,%edi
 
+	xorl	%edx,%r13d
 	rorl	$9,%r14d
-	xorl	%edx,%r13d
-	xorl	%r9d,%r15d
+	xorl	%r9d,%edi
 
+	movl	%r12d,36(%rsp)
+	xorl	%r11d,%r14d
+	andl	%edx,%edi
+
 	rorl	$5,%r13d
 	addl	%r10d,%r12d
-	xorl	%r11d,%r14d
+	xorl	%r9d,%edi
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%edx,%r15d
-	movl	%eax,%r10d
-
 	rorl	$11,%r14d
 	xorl	%edx,%r13d
-	xorl	%r9d,%r15d
+	addl	%edi,%r12d
 
-	xorl	%ebx,%r10d
+	movl	%r11d,%edi
+	addl	(%rbp),%r12d
 	xorl	%r11d,%r14d
-	addl	%r15d,%r12d
-	movl	%eax,%r15d
 
+	xorl	%eax,%edi
 	rorl	$6,%r13d
-	andl	%r11d,%r10d
-	andl	%ebx,%r15d
+	movl	%eax,%r10d
 
+	andl	%edi,%r15d
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%r10d
 
+	xorl	%r15d,%r10d
 	addl	%r12d,%ecx
 	addl	%r12d,%r10d
-	leaq	1(%rdi),%rdi
-	addl	%r14d,%r10d
 
+	leaq	4(%rbp),%rbp
 	movl	44(%rsp),%r13d
-	movl	32(%rsp),%r14d
+	movl	32(%rsp),%r15d
+
 	movl	%r13d,%r12d
-	movl	%r14d,%r15d
+	rorl	$11,%r13d
+	addl	%r14d,%r10d
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
 
-	rorl	$11,%r12d
-	xorl	%r13d,%r12d
-	shrl	$3,%r13d
-
-	rorl	$7,%r12d
 	xorl	%r12d,%r13d
-	movl	12(%rsp),%r12d
-
-	rorl	$2,%r15d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
 	xorl	%r14d,%r15d
 	shrl	$10,%r14d
 
 	rorl	$17,%r15d
-	addl	%r13d,%r12d
-	xorl	%r15d,%r14d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	12(%rsp),%r12d
 
 	addl	40(%rsp),%r12d
 	movl	%ecx,%r13d
-	addl	%r14d,%r12d
+	addl	%r15d,%r12d
 	movl	%r10d,%r14d
 	rorl	$14,%r13d
 	movl	%edx,%r15d
-	movl	%r12d,40(%rsp)
 
+	xorl	%ecx,%r13d
 	rorl	$9,%r14d
-	xorl	%ecx,%r13d
 	xorl	%r8d,%r15d
 
+	movl	%r12d,40(%rsp)
+	xorl	%r10d,%r14d
+	andl	%ecx,%r15d
+
 	rorl	$5,%r13d
 	addl	%r9d,%r12d
-	xorl	%r10d,%r14d
+	xorl	%r8d,%r15d
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%ecx,%r15d
-	movl	%r11d,%r9d
-
 	rorl	$11,%r14d
 	xorl	%ecx,%r13d
-	xorl	%r8d,%r15d
+	addl	%r15d,%r12d
 
-	xorl	%eax,%r9d
+	movl	%r10d,%r15d
+	addl	(%rbp),%r12d
 	xorl	%r10d,%r14d
-	addl	%r15d,%r12d
-	movl	%r11d,%r15d
 
+	xorl	%r11d,%r15d
 	rorl	$6,%r13d
-	andl	%r10d,%r9d
-	andl	%eax,%r15d
+	movl	%r11d,%r9d
 
+	andl	%r15d,%edi
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%r9d
 
+	xorl	%edi,%r9d
 	addl	%r12d,%ebx
 	addl	%r12d,%r9d
-	leaq	1(%rdi),%rdi
-	addl	%r14d,%r9d
 
+	leaq	4(%rbp),%rbp
 	movl	48(%rsp),%r13d
-	movl	36(%rsp),%r14d
+	movl	36(%rsp),%edi
+
 	movl	%r13d,%r12d
-	movl	%r14d,%r15d
+	rorl	$11,%r13d
+	addl	%r14d,%r9d
+	movl	%edi,%r14d
+	rorl	$2,%edi
 
-	rorl	$11,%r12d
-	xorl	%r13d,%r12d
-	shrl	$3,%r13d
-
-	rorl	$7,%r12d
 	xorl	%r12d,%r13d
-	movl	16(%rsp),%r12d
-
-	rorl	$2,%r15d
-	xorl	%r14d,%r15d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
 	shrl	$10,%r14d
 
-	rorl	$17,%r15d
-	addl	%r13d,%r12d
-	xorl	%r15d,%r14d
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	16(%rsp),%r12d
 
 	addl	44(%rsp),%r12d
 	movl	%ebx,%r13d
-	addl	%r14d,%r12d
+	addl	%edi,%r12d
 	movl	%r9d,%r14d
 	rorl	$14,%r13d
-	movl	%ecx,%r15d
-	movl	%r12d,44(%rsp)
+	movl	%ecx,%edi
 
+	xorl	%ebx,%r13d
 	rorl	$9,%r14d
-	xorl	%ebx,%r13d
-	xorl	%edx,%r15d
+	xorl	%edx,%edi
 
+	movl	%r12d,44(%rsp)
+	xorl	%r9d,%r14d
+	andl	%ebx,%edi
+
 	rorl	$5,%r13d
 	addl	%r8d,%r12d
-	xorl	%r9d,%r14d
+	xorl	%edx,%edi
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%ebx,%r15d
-	movl	%r10d,%r8d
-
 	rorl	$11,%r14d
 	xorl	%ebx,%r13d
-	xorl	%edx,%r15d
+	addl	%edi,%r12d
 
-	xorl	%r11d,%r8d
+	movl	%r9d,%edi
+	addl	(%rbp),%r12d
 	xorl	%r9d,%r14d
-	addl	%r15d,%r12d
-	movl	%r10d,%r15d
 
+	xorl	%r10d,%edi
 	rorl	$6,%r13d
-	andl	%r9d,%r8d
-	andl	%r11d,%r15d
+	movl	%r10d,%r8d
 
+	andl	%edi,%r15d
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%r8d
 
+	xorl	%r15d,%r8d
 	addl	%r12d,%eax
 	addl	%r12d,%r8d
-	leaq	1(%rdi),%rdi
-	addl	%r14d,%r8d
 
+	leaq	20(%rbp),%rbp
 	movl	52(%rsp),%r13d
-	movl	40(%rsp),%r14d
+	movl	40(%rsp),%r15d
+
 	movl	%r13d,%r12d
-	movl	%r14d,%r15d
+	rorl	$11,%r13d
+	addl	%r14d,%r8d
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
 
-	rorl	$11,%r12d
-	xorl	%r13d,%r12d
-	shrl	$3,%r13d
-
-	rorl	$7,%r12d
 	xorl	%r12d,%r13d
-	movl	20(%rsp),%r12d
-
-	rorl	$2,%r15d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
 	xorl	%r14d,%r15d
 	shrl	$10,%r14d
 
 	rorl	$17,%r15d
-	addl	%r13d,%r12d
-	xorl	%r15d,%r14d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	20(%rsp),%r12d
 
 	addl	48(%rsp),%r12d
 	movl	%eax,%r13d
-	addl	%r14d,%r12d
+	addl	%r15d,%r12d
 	movl	%r8d,%r14d
 	rorl	$14,%r13d
 	movl	%ebx,%r15d
-	movl	%r12d,48(%rsp)
 
+	xorl	%eax,%r13d
 	rorl	$9,%r14d
-	xorl	%eax,%r13d
 	xorl	%ecx,%r15d
 
+	movl	%r12d,48(%rsp)
+	xorl	%r8d,%r14d
+	andl	%eax,%r15d
+
 	rorl	$5,%r13d
 	addl	%edx,%r12d
-	xorl	%r8d,%r14d
+	xorl	%ecx,%r15d
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%eax,%r15d
-	movl	%r9d,%edx
-
 	rorl	$11,%r14d
 	xorl	%eax,%r13d
-	xorl	%ecx,%r15d
+	addl	%r15d,%r12d
 
-	xorl	%r10d,%edx
+	movl	%r8d,%r15d
+	addl	(%rbp),%r12d
 	xorl	%r8d,%r14d
-	addl	%r15d,%r12d
-	movl	%r9d,%r15d
 
+	xorl	%r9d,%r15d
 	rorl	$6,%r13d
-	andl	%r8d,%edx
-	andl	%r10d,%r15d
+	movl	%r9d,%edx
 
+	andl	%r15d,%edi
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%edx
 
+	xorl	%edi,%edx
 	addl	%r12d,%r11d
 	addl	%r12d,%edx
-	leaq	1(%rdi),%rdi
-	addl	%r14d,%edx
 
+	leaq	4(%rbp),%rbp
 	movl	56(%rsp),%r13d
-	movl	44(%rsp),%r14d
+	movl	44(%rsp),%edi
+
 	movl	%r13d,%r12d
-	movl	%r14d,%r15d
+	rorl	$11,%r13d
+	addl	%r14d,%edx
+	movl	%edi,%r14d
+	rorl	$2,%edi
 
-	rorl	$11,%r12d
-	xorl	%r13d,%r12d
-	shrl	$3,%r13d
-
-	rorl	$7,%r12d
 	xorl	%r12d,%r13d
-	movl	24(%rsp),%r12d
-
-	rorl	$2,%r15d
-	xorl	%r14d,%r15d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
 	shrl	$10,%r14d
 
-	rorl	$17,%r15d
-	addl	%r13d,%r12d
-	xorl	%r15d,%r14d
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	24(%rsp),%r12d
 
 	addl	52(%rsp),%r12d
 	movl	%r11d,%r13d
-	addl	%r14d,%r12d
+	addl	%edi,%r12d
 	movl	%edx,%r14d
 	rorl	$14,%r13d
-	movl	%eax,%r15d
-	movl	%r12d,52(%rsp)
+	movl	%eax,%edi
 
+	xorl	%r11d,%r13d
 	rorl	$9,%r14d
-	xorl	%r11d,%r13d
-	xorl	%ebx,%r15d
+	xorl	%ebx,%edi
 
+	movl	%r12d,52(%rsp)
+	xorl	%edx,%r14d
+	andl	%r11d,%edi
+
 	rorl	$5,%r13d
 	addl	%ecx,%r12d
-	xorl	%edx,%r14d
+	xorl	%ebx,%edi
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%r11d,%r15d
-	movl	%r8d,%ecx
-
 	rorl	$11,%r14d
 	xorl	%r11d,%r13d
-	xorl	%ebx,%r15d
+	addl	%edi,%r12d
 
-	xorl	%r9d,%ecx
+	movl	%edx,%edi
+	addl	(%rbp),%r12d
 	xorl	%edx,%r14d
-	addl	%r15d,%r12d
-	movl	%r8d,%r15d
 
+	xorl	%r8d,%edi
 	rorl	$6,%r13d
-	andl	%edx,%ecx
-	andl	%r9d,%r15d
+	movl	%r8d,%ecx
 
+	andl	%edi,%r15d
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%ecx
 
+	xorl	%r15d,%ecx
 	addl	%r12d,%r10d
 	addl	%r12d,%ecx
-	leaq	1(%rdi),%rdi
-	addl	%r14d,%ecx
 
+	leaq	4(%rbp),%rbp
 	movl	60(%rsp),%r13d
-	movl	48(%rsp),%r14d
+	movl	48(%rsp),%r15d
+
 	movl	%r13d,%r12d
-	movl	%r14d,%r15d
+	rorl	$11,%r13d
+	addl	%r14d,%ecx
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
 
-	rorl	$11,%r12d
-	xorl	%r13d,%r12d
-	shrl	$3,%r13d
-
-	rorl	$7,%r12d
 	xorl	%r12d,%r13d
-	movl	28(%rsp),%r12d
-
-	rorl	$2,%r15d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
 	xorl	%r14d,%r15d
 	shrl	$10,%r14d
 
 	rorl	$17,%r15d
-	addl	%r13d,%r12d
-	xorl	%r15d,%r14d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	28(%rsp),%r12d
 
 	addl	56(%rsp),%r12d
 	movl	%r10d,%r13d
-	addl	%r14d,%r12d
+	addl	%r15d,%r12d
 	movl	%ecx,%r14d
 	rorl	$14,%r13d
 	movl	%r11d,%r15d
-	movl	%r12d,56(%rsp)
 
+	xorl	%r10d,%r13d
 	rorl	$9,%r14d
-	xorl	%r10d,%r13d
 	xorl	%eax,%r15d
 
+	movl	%r12d,56(%rsp)
+	xorl	%ecx,%r14d
+	andl	%r10d,%r15d
+
 	rorl	$5,%r13d
 	addl	%ebx,%r12d
-	xorl	%ecx,%r14d
+	xorl	%eax,%r15d
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%r10d,%r15d
-	movl	%edx,%ebx
-
 	rorl	$11,%r14d
 	xorl	%r10d,%r13d
-	xorl	%eax,%r15d
+	addl	%r15d,%r12d
 
-	xorl	%r8d,%ebx
+	movl	%ecx,%r15d
+	addl	(%rbp),%r12d
 	xorl	%ecx,%r14d
-	addl	%r15d,%r12d
-	movl	%edx,%r15d
 
+	xorl	%edx,%r15d
 	rorl	$6,%r13d
-	andl	%ecx,%ebx
-	andl	%r8d,%r15d
+	movl	%edx,%ebx
 
+	andl	%r15d,%edi
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%ebx
 
+	xorl	%edi,%ebx
 	addl	%r12d,%r9d
 	addl	%r12d,%ebx
-	leaq	1(%rdi),%rdi
-	addl	%r14d,%ebx
 
+	leaq	4(%rbp),%rbp
 	movl	0(%rsp),%r13d
-	movl	52(%rsp),%r14d
+	movl	52(%rsp),%edi
+
 	movl	%r13d,%r12d
-	movl	%r14d,%r15d
+	rorl	$11,%r13d
+	addl	%r14d,%ebx
+	movl	%edi,%r14d
+	rorl	$2,%edi
 
-	rorl	$11,%r12d
-	xorl	%r13d,%r12d
-	shrl	$3,%r13d
-
-	rorl	$7,%r12d
 	xorl	%r12d,%r13d
-	movl	32(%rsp),%r12d
-
-	rorl	$2,%r15d
-	xorl	%r14d,%r15d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
 	shrl	$10,%r14d
 
-	rorl	$17,%r15d
-	addl	%r13d,%r12d
-	xorl	%r15d,%r14d
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	32(%rsp),%r12d
 
 	addl	60(%rsp),%r12d
 	movl	%r9d,%r13d
-	addl	%r14d,%r12d
+	addl	%edi,%r12d
 	movl	%ebx,%r14d
 	rorl	$14,%r13d
-	movl	%r10d,%r15d
-	movl	%r12d,60(%rsp)
+	movl	%r10d,%edi
 
+	xorl	%r9d,%r13d
 	rorl	$9,%r14d
-	xorl	%r9d,%r13d
-	xorl	%r11d,%r15d
+	xorl	%r11d,%edi
 
+	movl	%r12d,60(%rsp)
+	xorl	%ebx,%r14d
+	andl	%r9d,%edi
+
 	rorl	$5,%r13d
 	addl	%eax,%r12d
-	xorl	%ebx,%r14d
+	xorl	%r11d,%edi
 
-	addl	(%rbp,%rdi,4),%r12d
-	andl	%r9d,%r15d
-	movl	%ecx,%eax
-
 	rorl	$11,%r14d
 	xorl	%r9d,%r13d
-	xorl	%r11d,%r15d
+	addl	%edi,%r12d
 
-	xorl	%edx,%eax
+	movl	%ebx,%edi
+	addl	(%rbp),%r12d
 	xorl	%ebx,%r14d
-	addl	%r15d,%r12d
-	movl	%ecx,%r15d
 
+	xorl	%ecx,%edi
 	rorl	$6,%r13d
-	andl	%ebx,%eax
-	andl	%edx,%r15d
+	movl	%ecx,%eax
 
+	andl	%edi,%r15d
 	rorl	$2,%r14d
 	addl	%r13d,%r12d
-	addl	%r15d,%eax
 
+	xorl	%r15d,%eax
 	addl	%r12d,%r8d
 	addl	%r12d,%eax
-	leaq	1(%rdi),%rdi
-	addl	%r14d,%eax
 
-	cmpq	$64,%rdi
-	jb	.Lrounds_16_xx
+	leaq	20(%rbp),%rbp
+	cmpb	$0,3(%rbp)
+	jnz	.Lrounds_16_xx
 
 	movq	64+0(%rsp),%rdi
+	addl	%r14d,%eax
 	leaq	64(%rsi),%rsi
 
 	addl	0(%rdi),%eax
@@ -1764,18 +1717,3645 @@
 .type	K256, at object
 K256:
 .long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 .long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
 .long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
 .long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
 .long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
 .long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
 .long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
 .long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
 .long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
 .long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
 .long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
 .long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
 .long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
 .long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
 .long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 .long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.type	sha256_block_data_order_shaext, at function
+.align	64
+sha256_block_data_order_shaext:
+_shaext_shortcut:
+	leaq	K256+128(%rip),%rcx
+	movdqu	(%rdi),%xmm1
+	movdqu	16(%rdi),%xmm2
+	movdqa	512-128(%rcx),%xmm7
+
+	pshufd	$0x1b,%xmm1,%xmm0
+	pshufd	$0xb1,%xmm1,%xmm1
+	pshufd	$0x1b,%xmm2,%xmm2
+	movdqa	%xmm7,%xmm8
+.byte	102,15,58,15,202,8
+	punpcklqdq	%xmm0,%xmm2
+	jmp	.Loop_shaext
+
+.align	16
+.Loop_shaext:
+	movdqu	(%rsi),%xmm3
+	movdqu	16(%rsi),%xmm4
+	movdqu	32(%rsi),%xmm5
+.byte	102,15,56,0,223
+	movdqu	48(%rsi),%xmm6
+
+	movdqa	0-128(%rcx),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	102,15,56,0,231
+	movdqa	%xmm2,%xmm10
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	nop
+	movdqa	%xmm1,%xmm9
+.byte	15,56,203,202
+
+	movdqa	32-128(%rcx),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	102,15,56,0,239
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	leaq	64(%rsi),%rsi
+.byte	15,56,204,220
+.byte	15,56,203,202
+
+	movdqa	64-128(%rcx),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	102,15,56,0,247
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm6,%xmm7
+.byte	102,15,58,15,253,4
+	nop
+	paddd	%xmm7,%xmm3
+.byte	15,56,204,229
+.byte	15,56,203,202
+
+	movdqa	96-128(%rcx),%xmm0
+	paddd	%xmm6,%xmm0
+.byte	15,56,205,222
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm3,%xmm7
+.byte	102,15,58,15,254,4
+	nop
+	paddd	%xmm7,%xmm4
+.byte	15,56,204,238
+.byte	15,56,203,202
+	movdqa	128-128(%rcx),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	15,56,205,227
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm4,%xmm7
+.byte	102,15,58,15,251,4
+	nop
+	paddd	%xmm7,%xmm5
+.byte	15,56,204,243
+.byte	15,56,203,202
+	movdqa	160-128(%rcx),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	15,56,205,236
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm5,%xmm7
+.byte	102,15,58,15,252,4
+	nop
+	paddd	%xmm7,%xmm6
+.byte	15,56,204,220
+.byte	15,56,203,202
+	movdqa	192-128(%rcx),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	15,56,205,245
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm6,%xmm7
+.byte	102,15,58,15,253,4
+	nop
+	paddd	%xmm7,%xmm3
+.byte	15,56,204,229
+.byte	15,56,203,202
+	movdqa	224-128(%rcx),%xmm0
+	paddd	%xmm6,%xmm0
+.byte	15,56,205,222
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm3,%xmm7
+.byte	102,15,58,15,254,4
+	nop
+	paddd	%xmm7,%xmm4
+.byte	15,56,204,238
+.byte	15,56,203,202
+	movdqa	256-128(%rcx),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	15,56,205,227
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm4,%xmm7
+.byte	102,15,58,15,251,4
+	nop
+	paddd	%xmm7,%xmm5
+.byte	15,56,204,243
+.byte	15,56,203,202
+	movdqa	288-128(%rcx),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	15,56,205,236
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm5,%xmm7
+.byte	102,15,58,15,252,4
+	nop
+	paddd	%xmm7,%xmm6
+.byte	15,56,204,220
+.byte	15,56,203,202
+	movdqa	320-128(%rcx),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	15,56,205,245
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm6,%xmm7
+.byte	102,15,58,15,253,4
+	nop
+	paddd	%xmm7,%xmm3
+.byte	15,56,204,229
+.byte	15,56,203,202
+	movdqa	352-128(%rcx),%xmm0
+	paddd	%xmm6,%xmm0
+.byte	15,56,205,222
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm3,%xmm7
+.byte	102,15,58,15,254,4
+	nop
+	paddd	%xmm7,%xmm4
+.byte	15,56,204,238
+.byte	15,56,203,202
+	movdqa	384-128(%rcx),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	15,56,205,227
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm4,%xmm7
+.byte	102,15,58,15,251,4
+	nop
+	paddd	%xmm7,%xmm5
+.byte	15,56,204,243
+.byte	15,56,203,202
+	movdqa	416-128(%rcx),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	15,56,205,236
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm5,%xmm7
+.byte	102,15,58,15,252,4
+.byte	15,56,203,202
+	paddd	%xmm7,%xmm6
+
+	movdqa	448-128(%rcx),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+.byte	15,56,205,245
+	movdqa	%xmm8,%xmm7
+.byte	15,56,203,202
+
+	movdqa	480-128(%rcx),%xmm0
+	paddd	%xmm6,%xmm0
+	nop
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	decq	%rdx
+	nop
+.byte	15,56,203,202
+
+	paddd	%xmm10,%xmm2
+	paddd	%xmm9,%xmm1
+	jnz	.Loop_shaext
+
+	pshufd	$0xb1,%xmm2,%xmm2
+	pshufd	$0x1b,%xmm1,%xmm7
+	pshufd	$0xb1,%xmm1,%xmm1
+	punpckhqdq	%xmm2,%xmm1
+.byte	102,15,58,15,215,8
+
+	movdqu	%xmm1,(%rdi)
+	movdqu	%xmm2,16(%rdi)
+	.byte	0xf3,0xc3
+.size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
+.type	sha256_block_data_order_ssse3, at function
+.align	64
+sha256_block_data_order_ssse3:
+.Lssse3_shortcut:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	movq	%rsp,%r11
+	shlq	$4,%rdx
+	subq	$96,%rsp
+	leaq	(%rsi,%rdx,4),%rdx
+	andq	$-64,%rsp
+	movq	%rdi,64+0(%rsp)
+	movq	%rsi,64+8(%rsp)
+	movq	%rdx,64+16(%rsp)
+	movq	%r11,64+24(%rsp)
+.Lprologue_ssse3:
+
+	movl	0(%rdi),%eax
+	movl	4(%rdi),%ebx
+	movl	8(%rdi),%ecx
+	movl	12(%rdi),%edx
+	movl	16(%rdi),%r8d
+	movl	20(%rdi),%r9d
+	movl	24(%rdi),%r10d
+	movl	28(%rdi),%r11d
+
+
+	jmp	.Lloop_ssse3
+.align	16
+.Lloop_ssse3:
+	movdqa	K256+512(%rip),%xmm7
+	movdqu	0(%rsi),%xmm0
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+.byte	102,15,56,0,199
+	movdqu	48(%rsi),%xmm3
+	leaq	K256(%rip),%rbp
+.byte	102,15,56,0,207
+	movdqa	0(%rbp),%xmm4
+	movdqa	32(%rbp),%xmm5
+.byte	102,15,56,0,215
+	paddd	%xmm0,%xmm4
+	movdqa	64(%rbp),%xmm6
+.byte	102,15,56,0,223
+	movdqa	96(%rbp),%xmm7
+	paddd	%xmm1,%xmm5
+	paddd	%xmm2,%xmm6
+	paddd	%xmm3,%xmm7
+	movdqa	%xmm4,0(%rsp)
+	movl	%eax,%r14d
+	movdqa	%xmm5,16(%rsp)
+	movl	%ebx,%edi
+	movdqa	%xmm6,32(%rsp)
+	xorl	%ecx,%edi
+	movdqa	%xmm7,48(%rsp)
+	movl	%r8d,%r13d
+	jmp	.Lssse3_00_47
+
+.align	16
+.Lssse3_00_47:
+	subq	$-128,%rbp
+	rorl	$14,%r13d
+	movdqa	%xmm1,%xmm4
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	movdqa	%xmm3,%xmm7
+	rorl	$9,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	rorl	$5,%r13d
+	xorl	%eax,%r14d
+.byte	102,15,58,15,224,4
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+.byte	102,15,58,15,250,4
+	addl	0(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm4,%xmm5
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	movdqa	%xmm4,%xmm6
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	psrld	$3,%xmm4
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	paddd	%xmm7,%xmm0
+	rorl	$2,%r14d
+	addl	%r11d,%edx
+	psrld	$7,%xmm6
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	pshufd	$250,%xmm3,%xmm7
+	addl	%r11d,%r14d
+	rorl	$14,%r13d
+	pslld	$14,%xmm5
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	pxor	%xmm6,%xmm4
+	rorl	$9,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	rorl	$5,%r13d
+	psrld	$11,%xmm6
+	xorl	%r11d,%r14d
+	pxor	%xmm5,%xmm4
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	pslld	$11,%xmm5
+	addl	4(%rsp),%r10d
+	movl	%r11d,%edi
+	pxor	%xmm6,%xmm4
+	xorl	%r9d,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm7,%xmm6
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	pxor	%xmm5,%xmm4
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	psrld	$10,%xmm7
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	paddd	%xmm4,%xmm0
+	rorl	$2,%r14d
+	addl	%r10d,%ecx
+	psrlq	$17,%xmm6
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	pxor	%xmm6,%xmm7
+	rorl	$14,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	rorl	$9,%r14d
+	psrlq	$2,%xmm6
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$5,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	pshufd	$128,%xmm7,%xmm7
+	xorl	%ecx,%r13d
+	addl	8(%rsp),%r9d
+	movl	%r10d,%r15d
+	psrldq	$8,%xmm7
+	xorl	%r8d,%r12d
+	rorl	$11,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	rorl	$6,%r13d
+	paddd	%xmm7,%xmm0
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	pshufd	$80,%xmm0,%xmm7
+	xorl	%r11d,%edi
+	rorl	$2,%r14d
+	addl	%r9d,%ebx
+	movdqa	%xmm7,%xmm6
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	psrld	$10,%xmm7
+	addl	%r9d,%r14d
+	rorl	$14,%r13d
+	psrlq	$17,%xmm6
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$9,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	rorl	$5,%r13d
+	xorl	%r9d,%r14d
+	psrlq	$2,%xmm6
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	12(%rsp),%r8d
+	pxor	%xmm6,%xmm7
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	rorl	$11,%r14d
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	movdqa	0(%rbp),%xmm6
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	pslldq	$8,%xmm7
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	paddd	%xmm7,%xmm0
+	rorl	$2,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	paddd	%xmm0,%xmm6
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	movdqa	%xmm6,0(%rsp)
+	rorl	$14,%r13d
+	movdqa	%xmm2,%xmm4
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	movdqa	%xmm0,%xmm7
+	rorl	$9,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	rorl	$5,%r13d
+	xorl	%r8d,%r14d
+.byte	102,15,58,15,225,4
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+.byte	102,15,58,15,251,4
+	addl	16(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm4,%xmm5
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	movdqa	%xmm4,%xmm6
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	psrld	$3,%xmm4
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	paddd	%xmm7,%xmm1
+	rorl	$2,%r14d
+	addl	%edx,%r11d
+	psrld	$7,%xmm6
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	pshufd	$250,%xmm0,%xmm7
+	addl	%edx,%r14d
+	rorl	$14,%r13d
+	pslld	$14,%xmm5
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	pxor	%xmm6,%xmm4
+	rorl	$9,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	rorl	$5,%r13d
+	psrld	$11,%xmm6
+	xorl	%edx,%r14d
+	pxor	%xmm5,%xmm4
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	pslld	$11,%xmm5
+	addl	20(%rsp),%ecx
+	movl	%edx,%edi
+	pxor	%xmm6,%xmm4
+	xorl	%ebx,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm7,%xmm6
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	pxor	%xmm5,%xmm4
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	psrld	$10,%xmm7
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	paddd	%xmm4,%xmm1
+	rorl	$2,%r14d
+	addl	%ecx,%r10d
+	psrlq	$17,%xmm6
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	pxor	%xmm6,%xmm7
+	rorl	$14,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	rorl	$9,%r14d
+	psrlq	$2,%xmm6
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$5,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	pshufd	$128,%xmm7,%xmm7
+	xorl	%r10d,%r13d
+	addl	24(%rsp),%ebx
+	movl	%ecx,%r15d
+	psrldq	$8,%xmm7
+	xorl	%eax,%r12d
+	rorl	$11,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	rorl	$6,%r13d
+	paddd	%xmm7,%xmm1
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	pshufd	$80,%xmm1,%xmm7
+	xorl	%edx,%edi
+	rorl	$2,%r14d
+	addl	%ebx,%r9d
+	movdqa	%xmm7,%xmm6
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	psrld	$10,%xmm7
+	addl	%ebx,%r14d
+	rorl	$14,%r13d
+	psrlq	$17,%xmm6
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$9,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	rorl	$5,%r13d
+	xorl	%ebx,%r14d
+	psrlq	$2,%xmm6
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	28(%rsp),%eax
+	pxor	%xmm6,%xmm7
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	rorl	$11,%r14d
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	movdqa	32(%rbp),%xmm6
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	pslldq	$8,%xmm7
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	paddd	%xmm7,%xmm1
+	rorl	$2,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	paddd	%xmm1,%xmm6
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	movdqa	%xmm6,16(%rsp)
+	rorl	$14,%r13d
+	movdqa	%xmm3,%xmm4
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	movdqa	%xmm1,%xmm7
+	rorl	$9,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	rorl	$5,%r13d
+	xorl	%eax,%r14d
+.byte	102,15,58,15,226,4
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+.byte	102,15,58,15,248,4
+	addl	32(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm4,%xmm5
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	movdqa	%xmm4,%xmm6
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	psrld	$3,%xmm4
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	paddd	%xmm7,%xmm2
+	rorl	$2,%r14d
+	addl	%r11d,%edx
+	psrld	$7,%xmm6
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	pshufd	$250,%xmm1,%xmm7
+	addl	%r11d,%r14d
+	rorl	$14,%r13d
+	pslld	$14,%xmm5
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	pxor	%xmm6,%xmm4
+	rorl	$9,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	rorl	$5,%r13d
+	psrld	$11,%xmm6
+	xorl	%r11d,%r14d
+	pxor	%xmm5,%xmm4
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	pslld	$11,%xmm5
+	addl	36(%rsp),%r10d
+	movl	%r11d,%edi
+	pxor	%xmm6,%xmm4
+	xorl	%r9d,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm7,%xmm6
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	pxor	%xmm5,%xmm4
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	psrld	$10,%xmm7
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	paddd	%xmm4,%xmm2
+	rorl	$2,%r14d
+	addl	%r10d,%ecx
+	psrlq	$17,%xmm6
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	pxor	%xmm6,%xmm7
+	rorl	$14,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	rorl	$9,%r14d
+	psrlq	$2,%xmm6
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$5,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	pshufd	$128,%xmm7,%xmm7
+	xorl	%ecx,%r13d
+	addl	40(%rsp),%r9d
+	movl	%r10d,%r15d
+	psrldq	$8,%xmm7
+	xorl	%r8d,%r12d
+	rorl	$11,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	rorl	$6,%r13d
+	paddd	%xmm7,%xmm2
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	pshufd	$80,%xmm2,%xmm7
+	xorl	%r11d,%edi
+	rorl	$2,%r14d
+	addl	%r9d,%ebx
+	movdqa	%xmm7,%xmm6
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	psrld	$10,%xmm7
+	addl	%r9d,%r14d
+	rorl	$14,%r13d
+	psrlq	$17,%xmm6
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$9,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	rorl	$5,%r13d
+	xorl	%r9d,%r14d
+	psrlq	$2,%xmm6
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	44(%rsp),%r8d
+	pxor	%xmm6,%xmm7
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	rorl	$11,%r14d
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	movdqa	64(%rbp),%xmm6
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	pslldq	$8,%xmm7
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	paddd	%xmm7,%xmm2
+	rorl	$2,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	paddd	%xmm2,%xmm6
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	movdqa	%xmm6,32(%rsp)
+	rorl	$14,%r13d
+	movdqa	%xmm0,%xmm4
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	movdqa	%xmm2,%xmm7
+	rorl	$9,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	rorl	$5,%r13d
+	xorl	%r8d,%r14d
+.byte	102,15,58,15,227,4
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+.byte	102,15,58,15,249,4
+	addl	48(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm4,%xmm5
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	movdqa	%xmm4,%xmm6
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	psrld	$3,%xmm4
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	paddd	%xmm7,%xmm3
+	rorl	$2,%r14d
+	addl	%edx,%r11d
+	psrld	$7,%xmm6
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	pshufd	$250,%xmm2,%xmm7
+	addl	%edx,%r14d
+	rorl	$14,%r13d
+	pslld	$14,%xmm5
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	pxor	%xmm6,%xmm4
+	rorl	$9,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	rorl	$5,%r13d
+	psrld	$11,%xmm6
+	xorl	%edx,%r14d
+	pxor	%xmm5,%xmm4
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	pslld	$11,%xmm5
+	addl	52(%rsp),%ecx
+	movl	%edx,%edi
+	pxor	%xmm6,%xmm4
+	xorl	%ebx,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm7,%xmm6
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	pxor	%xmm5,%xmm4
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	psrld	$10,%xmm7
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	paddd	%xmm4,%xmm3
+	rorl	$2,%r14d
+	addl	%ecx,%r10d
+	psrlq	$17,%xmm6
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	pxor	%xmm6,%xmm7
+	rorl	$14,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	rorl	$9,%r14d
+	psrlq	$2,%xmm6
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$5,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	pshufd	$128,%xmm7,%xmm7
+	xorl	%r10d,%r13d
+	addl	56(%rsp),%ebx
+	movl	%ecx,%r15d
+	psrldq	$8,%xmm7
+	xorl	%eax,%r12d
+	rorl	$11,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	rorl	$6,%r13d
+	paddd	%xmm7,%xmm3
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	pshufd	$80,%xmm3,%xmm7
+	xorl	%edx,%edi
+	rorl	$2,%r14d
+	addl	%ebx,%r9d
+	movdqa	%xmm7,%xmm6
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	psrld	$10,%xmm7
+	addl	%ebx,%r14d
+	rorl	$14,%r13d
+	psrlq	$17,%xmm6
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$9,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	rorl	$5,%r13d
+	xorl	%ebx,%r14d
+	psrlq	$2,%xmm6
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	60(%rsp),%eax
+	pxor	%xmm6,%xmm7
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	rorl	$11,%r14d
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	movdqa	96(%rbp),%xmm6
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	pslldq	$8,%xmm7
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	paddd	%xmm7,%xmm3
+	rorl	$2,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	paddd	%xmm3,%xmm6
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	movdqa	%xmm6,48(%rsp)
+	cmpb	$0,131(%rbp)
+	jne	.Lssse3_00_47
+	rorl	$14,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	rorl	$9,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	rorl	$5,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+	addl	0(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	rorl	$11,%r14d
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	rorl	$2,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	rorl	$9,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	rorl	$5,%r13d
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	addl	4(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	rorl	$11,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	rorl	$2,%r14d
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	rorl	$9,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	rorl	$5,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	8(%rsp),%r9d
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	rorl	$11,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	xorl	%r11d,%edi
+	rorl	$2,%r14d
+	addl	%r9d,%ebx
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	rorl	$9,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	rorl	$5,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	12(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	rorl	$11,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	rorl	$2,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	rorl	$9,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	rorl	$5,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+	addl	16(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	rorl	$11,%r14d
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	rorl	$2,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	rorl	$9,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	rorl	$5,%r13d
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	addl	20(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	rorl	$11,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	rorl	$2,%r14d
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	rorl	$9,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	rorl	$5,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	24(%rsp),%ebx
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	rorl	$11,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	xorl	%edx,%edi
+	rorl	$2,%r14d
+	addl	%ebx,%r9d
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	rorl	$9,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	rorl	$5,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	28(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	rorl	$11,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	rorl	$2,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	rorl	$9,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	rorl	$5,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+	addl	32(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	rorl	$11,%r14d
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	rorl	$2,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	rorl	$9,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	rorl	$5,%r13d
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	addl	36(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	rorl	$11,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	rorl	$2,%r14d
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	rorl	$9,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	rorl	$5,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	40(%rsp),%r9d
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	rorl	$11,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	xorl	%r11d,%edi
+	rorl	$2,%r14d
+	addl	%r9d,%ebx
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	rorl	$9,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	rorl	$5,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	44(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	rorl	$11,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	rorl	$2,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	rorl	$9,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	rorl	$5,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+	addl	48(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	rorl	$11,%r14d
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	rorl	$2,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	rorl	$9,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	rorl	$5,%r13d
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	addl	52(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	rorl	$11,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	rorl	$2,%r14d
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	rorl	$9,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	rorl	$5,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	56(%rsp),%ebx
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	rorl	$11,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	xorl	%edx,%edi
+	rorl	$2,%r14d
+	addl	%ebx,%r9d
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	rorl	$9,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	rorl	$5,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	60(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	rorl	$11,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	rorl	$2,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	movq	64+0(%rsp),%rdi
+	movl	%r14d,%eax
+
+	addl	0(%rdi),%eax
+	leaq	64(%rsi),%rsi
+	addl	4(%rdi),%ebx
+	addl	8(%rdi),%ecx
+	addl	12(%rdi),%edx
+	addl	16(%rdi),%r8d
+	addl	20(%rdi),%r9d
+	addl	24(%rdi),%r10d
+	addl	28(%rdi),%r11d
+
+	cmpq	64+16(%rsp),%rsi
+
+	movl	%eax,0(%rdi)
+	movl	%ebx,4(%rdi)
+	movl	%ecx,8(%rdi)
+	movl	%edx,12(%rdi)
+	movl	%r8d,16(%rdi)
+	movl	%r9d,20(%rdi)
+	movl	%r10d,24(%rdi)
+	movl	%r11d,28(%rdi)
+	jb	.Lloop_ssse3
+
+	movq	64+24(%rsp),%rsi
+	movq	(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lepilogue_ssse3:
+	.byte	0xf3,0xc3
+.size	sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3
+.type	sha256_block_data_order_avx, at function
+.align	64
+sha256_block_data_order_avx:
+.Lavx_shortcut:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	movq	%rsp,%r11
+	shlq	$4,%rdx
+	subq	$96,%rsp
+	leaq	(%rsi,%rdx,4),%rdx
+	andq	$-64,%rsp
+	movq	%rdi,64+0(%rsp)
+	movq	%rsi,64+8(%rsp)
+	movq	%rdx,64+16(%rsp)
+	movq	%r11,64+24(%rsp)
+.Lprologue_avx:
+
+	vzeroupper
+	movl	0(%rdi),%eax
+	movl	4(%rdi),%ebx
+	movl	8(%rdi),%ecx
+	movl	12(%rdi),%edx
+	movl	16(%rdi),%r8d
+	movl	20(%rdi),%r9d
+	movl	24(%rdi),%r10d
+	movl	28(%rdi),%r11d
+	vmovdqa	K256+512+32(%rip),%xmm8
+	vmovdqa	K256+512+64(%rip),%xmm9
+	jmp	.Lloop_avx
+.align	16
+.Lloop_avx:
+	vmovdqa	K256+512(%rip),%xmm7
+	vmovdqu	0(%rsi),%xmm0
+	vmovdqu	16(%rsi),%xmm1
+	vmovdqu	32(%rsi),%xmm2
+	vmovdqu	48(%rsi),%xmm3
+	vpshufb	%xmm7,%xmm0,%xmm0
+	leaq	K256(%rip),%rbp
+	vpshufb	%xmm7,%xmm1,%xmm1
+	vpshufb	%xmm7,%xmm2,%xmm2
+	vpaddd	0(%rbp),%xmm0,%xmm4
+	vpshufb	%xmm7,%xmm3,%xmm3
+	vpaddd	32(%rbp),%xmm1,%xmm5
+	vpaddd	64(%rbp),%xmm2,%xmm6
+	vpaddd	96(%rbp),%xmm3,%xmm7
+	vmovdqa	%xmm4,0(%rsp)
+	movl	%eax,%r14d
+	vmovdqa	%xmm5,16(%rsp)
+	movl	%ebx,%edi
+	vmovdqa	%xmm6,32(%rsp)
+	xorl	%ecx,%edi
+	vmovdqa	%xmm7,48(%rsp)
+	movl	%r8d,%r13d
+	jmp	.Lavx_00_47
+
+.align	16
+.Lavx_00_47:
+	subq	$-128,%rbp
+	vpalignr	$4,%xmm0,%xmm1,%xmm4
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	vpalignr	$4,%xmm2,%xmm3,%xmm7
+	shrdl	$9,%r14d,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%r13d,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	vpaddd	%xmm7,%xmm0,%xmm0
+	xorl	%r8d,%r13d
+	addl	0(%rsp),%r11d
+	movl	%eax,%r15d
+	vpsrld	$3,%xmm4,%xmm7
+	xorl	%r10d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ebx,%r15d
+	vpslld	$14,%xmm4,%xmm5
+	addl	%r12d,%r11d
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	vpshufd	$250,%xmm3,%xmm7
+	shrdl	$2,%r14d,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	vpsrld	$11,%xmm6,%xmm6
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	shrdl	$14,%r13d,%r13d
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	shrdl	$9,%r14d,%r14d
+	vpslld	$11,%xmm5,%xmm5
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	shrdl	$5,%r13d,%r13d
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	vpsrld	$10,%xmm7,%xmm6
+	addl	4(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	vpxor	%xmm5,%xmm4,%xmm4
+	shrdl	$11,%r14d,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	vpsrlq	$17,%xmm7,%xmm7
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	vpaddd	%xmm4,%xmm0,%xmm0
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	shrdl	$2,%r14d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%r10d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r10d
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%edx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ecx,%r13d
+	vpshufb	%xmm8,%xmm6,%xmm6
+	xorl	%r8d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r10d,%r14d
+	vpaddd	%xmm6,%xmm0,%xmm0
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	8(%rsp),%r9d
+	vpshufd	$80,%xmm0,%xmm7
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	shrdl	$11,%r14d,%r14d
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	shrdl	$6,%r13d,%r13d
+	vpsrlq	$17,%xmm7,%xmm7
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	vpxor	%xmm7,%xmm6,%xmm6
+	xorl	%r11d,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%r9d,%ebx
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	vpshufb	%xmm9,%xmm6,%xmm6
+	shrdl	$9,%r14d,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	vpaddd	%xmm6,%xmm0,%xmm0
+	shrdl	$5,%r13d,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	vpaddd	0(%rbp),%xmm0,%xmm6
+	xorl	%ebx,%r13d
+	addl	12(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	vmovdqa	%xmm6,0(%rsp)
+	vpalignr	$4,%xmm1,%xmm2,%xmm4
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	vpalignr	$4,%xmm3,%xmm0,%xmm7
+	shrdl	$9,%r14d,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%r13d,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	vpaddd	%xmm7,%xmm1,%xmm1
+	xorl	%eax,%r13d
+	addl	16(%rsp),%edx
+	movl	%r8d,%r15d
+	vpsrld	$3,%xmm4,%xmm7
+	xorl	%ecx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r9d,%r15d
+	vpslld	$14,%xmm4,%xmm5
+	addl	%r12d,%edx
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	vpshufd	$250,%xmm0,%xmm7
+	shrdl	$2,%r14d,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	vpsrld	$11,%xmm6,%xmm6
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	shrdl	$14,%r13d,%r13d
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	shrdl	$9,%r14d,%r14d
+	vpslld	$11,%xmm5,%xmm5
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	shrdl	$5,%r13d,%r13d
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	vpsrld	$10,%xmm7,%xmm6
+	addl	20(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	vpxor	%xmm5,%xmm4,%xmm4
+	shrdl	$11,%r14d,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	vpsrlq	$17,%xmm7,%xmm7
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	vpaddd	%xmm4,%xmm1,%xmm1
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	shrdl	$2,%r14d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%ecx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%r11d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r10d,%r13d
+	vpshufb	%xmm8,%xmm6,%xmm6
+	xorl	%eax,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ecx,%r14d
+	vpaddd	%xmm6,%xmm1,%xmm1
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	24(%rsp),%ebx
+	vpshufd	$80,%xmm1,%xmm7
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	shrdl	$11,%r14d,%r14d
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	shrdl	$6,%r13d,%r13d
+	vpsrlq	$17,%xmm7,%xmm7
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	vpxor	%xmm7,%xmm6,%xmm6
+	xorl	%edx,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%ebx,%r9d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	vpshufb	%xmm9,%xmm6,%xmm6
+	shrdl	$9,%r14d,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	vpaddd	%xmm6,%xmm1,%xmm1
+	shrdl	$5,%r13d,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	vpaddd	32(%rbp),%xmm1,%xmm6
+	xorl	%r9d,%r13d
+	addl	28(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	vmovdqa	%xmm6,16(%rsp)
+	vpalignr	$4,%xmm2,%xmm3,%xmm4
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	vpalignr	$4,%xmm0,%xmm1,%xmm7
+	shrdl	$9,%r14d,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%r13d,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	vpaddd	%xmm7,%xmm2,%xmm2
+	xorl	%r8d,%r13d
+	addl	32(%rsp),%r11d
+	movl	%eax,%r15d
+	vpsrld	$3,%xmm4,%xmm7
+	xorl	%r10d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ebx,%r15d
+	vpslld	$14,%xmm4,%xmm5
+	addl	%r12d,%r11d
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	vpshufd	$250,%xmm1,%xmm7
+	shrdl	$2,%r14d,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	vpsrld	$11,%xmm6,%xmm6
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	shrdl	$14,%r13d,%r13d
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	shrdl	$9,%r14d,%r14d
+	vpslld	$11,%xmm5,%xmm5
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	shrdl	$5,%r13d,%r13d
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	vpsrld	$10,%xmm7,%xmm6
+	addl	36(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	vpxor	%xmm5,%xmm4,%xmm4
+	shrdl	$11,%r14d,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	vpsrlq	$17,%xmm7,%xmm7
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	vpaddd	%xmm4,%xmm2,%xmm2
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	shrdl	$2,%r14d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%r10d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r10d
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%edx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ecx,%r13d
+	vpshufb	%xmm8,%xmm6,%xmm6
+	xorl	%r8d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r10d,%r14d
+	vpaddd	%xmm6,%xmm2,%xmm2
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	40(%rsp),%r9d
+	vpshufd	$80,%xmm2,%xmm7
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	shrdl	$11,%r14d,%r14d
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	shrdl	$6,%r13d,%r13d
+	vpsrlq	$17,%xmm7,%xmm7
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	vpxor	%xmm7,%xmm6,%xmm6
+	xorl	%r11d,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%r9d,%ebx
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	vpshufb	%xmm9,%xmm6,%xmm6
+	shrdl	$9,%r14d,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	vpaddd	%xmm6,%xmm2,%xmm2
+	shrdl	$5,%r13d,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	vpaddd	64(%rbp),%xmm2,%xmm6
+	xorl	%ebx,%r13d
+	addl	44(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	vmovdqa	%xmm6,32(%rsp)
+	vpalignr	$4,%xmm3,%xmm0,%xmm4
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	vpalignr	$4,%xmm1,%xmm2,%xmm7
+	shrdl	$9,%r14d,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%r13d,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	vpaddd	%xmm7,%xmm3,%xmm3
+	xorl	%eax,%r13d
+	addl	48(%rsp),%edx
+	movl	%r8d,%r15d
+	vpsrld	$3,%xmm4,%xmm7
+	xorl	%ecx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r9d,%r15d
+	vpslld	$14,%xmm4,%xmm5
+	addl	%r12d,%edx
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	vpshufd	$250,%xmm2,%xmm7
+	shrdl	$2,%r14d,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	vpsrld	$11,%xmm6,%xmm6
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	shrdl	$14,%r13d,%r13d
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	shrdl	$9,%r14d,%r14d
+	vpslld	$11,%xmm5,%xmm5
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	shrdl	$5,%r13d,%r13d
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	vpsrld	$10,%xmm7,%xmm6
+	addl	52(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	vpxor	%xmm5,%xmm4,%xmm4
+	shrdl	$11,%r14d,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	vpsrlq	$17,%xmm7,%xmm7
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	vpaddd	%xmm4,%xmm3,%xmm3
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	shrdl	$2,%r14d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%ecx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%r11d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r10d,%r13d
+	vpshufb	%xmm8,%xmm6,%xmm6
+	xorl	%eax,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ecx,%r14d
+	vpaddd	%xmm6,%xmm3,%xmm3
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	56(%rsp),%ebx
+	vpshufd	$80,%xmm3,%xmm7
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	shrdl	$11,%r14d,%r14d
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	shrdl	$6,%r13d,%r13d
+	vpsrlq	$17,%xmm7,%xmm7
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	vpxor	%xmm7,%xmm6,%xmm6
+	xorl	%edx,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%ebx,%r9d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	vpshufb	%xmm9,%xmm6,%xmm6
+	shrdl	$9,%r14d,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	vpaddd	%xmm6,%xmm3,%xmm3
+	shrdl	$5,%r13d,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	vpaddd	96(%rbp),%xmm3,%xmm6
+	xorl	%r9d,%r13d
+	addl	60(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	vmovdqa	%xmm6,48(%rsp)
+	cmpb	$0,131(%rbp)
+	jne	.Lavx_00_47
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+	addl	0(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	addl	4(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	8(%rsp),%r9d
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	xorl	%r11d,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%r9d,%ebx
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	12(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+	addl	16(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	addl	20(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	24(%rsp),%ebx
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	xorl	%edx,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%ebx,%r9d
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	28(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+	addl	32(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	addl	36(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	40(%rsp),%r9d
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	xorl	%r11d,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%r9d,%ebx
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	44(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+	addl	48(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	addl	52(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	56(%rsp),%ebx
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	xorl	%edx,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%ebx,%r9d
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	60(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	movq	64+0(%rsp),%rdi
+	movl	%r14d,%eax
+
+	addl	0(%rdi),%eax
+	leaq	64(%rsi),%rsi
+	addl	4(%rdi),%ebx
+	addl	8(%rdi),%ecx
+	addl	12(%rdi),%edx
+	addl	16(%rdi),%r8d
+	addl	20(%rdi),%r9d
+	addl	24(%rdi),%r10d
+	addl	28(%rdi),%r11d
+
+	cmpq	64+16(%rsp),%rsi
+
+	movl	%eax,0(%rdi)
+	movl	%ebx,4(%rdi)
+	movl	%ecx,8(%rdi)
+	movl	%edx,12(%rdi)
+	movl	%r8d,16(%rdi)
+	movl	%r9d,20(%rdi)
+	movl	%r10d,24(%rdi)
+	movl	%r11d,28(%rdi)
+	jb	.Lloop_avx
+
+	movq	64+24(%rsp),%rsi
+	vzeroupper
+	movq	(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lepilogue_avx:
+	.byte	0xf3,0xc3
+.size	sha256_block_data_order_avx,.-sha256_block_data_order_avx
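The .Lavx_00_47 loop above interleaves the rounds with the SHA-256 message-schedule expansion: the vpsrld/vpslld/vpxor sequences implement the sigma0 (rotr 7 ^ rotr 18 ^ shr 3) and sigma1 (rotr 17 ^ rotr 19 ^ shr 10) functions of FIPS 180-4 on four schedule words at a time. For reference only, a minimal plain-C sketch of the same expansion (the names are illustrative; this is not code from the commit):

	#include <stdint.h>

	/* Right-rotate a 32-bit word (n is always 1..31 here). */
	uint32_t rotr32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }

	/* sigma0/sigma1 from FIPS 180-4; the rotate/shift amounts correspond to
	   the vector shift constants used in the .Lavx_00_47 loop above. */
	uint32_t sigma0(uint32_t x) { return rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3); }
	uint32_t sigma1(uint32_t x) { return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10); }

	/* Expand the 16 words of one 64-byte block into the 64-entry schedule. */
	void sha256_schedule(uint32_t W[64])
	{
	    for (int t = 16; t < 64; t++)
	        W[t] = sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) + W[t - 16];
	}
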
+.type	sha256_block_data_order_avx2,@function
+.align	64
+sha256_block_data_order_avx2:
+.Lavx2_shortcut:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	movq	%rsp,%r11
+	subq	$544,%rsp
+	shlq	$4,%rdx
+	andq	$-1024,%rsp
+	leaq	(%rsi,%rdx,4),%rdx
+	addq	$448,%rsp
+	movq	%rdi,64+0(%rsp)
+	movq	%rsi,64+8(%rsp)
+	movq	%rdx,64+16(%rsp)
+	movq	%r11,64+24(%rsp)
+.Lprologue_avx2:
+
+	vzeroupper
+	subq	$-64,%rsi
+	movl	0(%rdi),%eax
+	movq	%rsi,%r12
+	movl	4(%rdi),%ebx
+	cmpq	%rdx,%rsi
+	movl	8(%rdi),%ecx
+	cmoveq	%rsp,%r12
+	movl	12(%rdi),%edx
+	movl	16(%rdi),%r8d
+	movl	20(%rdi),%r9d
+	movl	24(%rdi),%r10d
+	movl	28(%rdi),%r11d
+	vmovdqa	K256+512+32(%rip),%ymm8
+	vmovdqa	K256+512+64(%rip),%ymm9
+	jmp	.Loop_avx2
+.align	16
+.Loop_avx2:
+	vmovdqa	K256+512(%rip),%ymm7
+	vmovdqu	-64+0(%rsi),%xmm0
+	vmovdqu	-64+16(%rsi),%xmm1
+	vmovdqu	-64+32(%rsi),%xmm2
+	vmovdqu	-64+48(%rsi),%xmm3
+
+	vinserti128	$1,(%r12),%ymm0,%ymm0
+	vinserti128	$1,16(%r12),%ymm1,%ymm1
+	vpshufb	%ymm7,%ymm0,%ymm0
+	vinserti128	$1,32(%r12),%ymm2,%ymm2
+	vpshufb	%ymm7,%ymm1,%ymm1
+	vinserti128	$1,48(%r12),%ymm3,%ymm3
+
+	leaq	K256(%rip),%rbp
+	vpshufb	%ymm7,%ymm2,%ymm2
+	vpaddd	0(%rbp),%ymm0,%ymm4
+	vpshufb	%ymm7,%ymm3,%ymm3
+	vpaddd	32(%rbp),%ymm1,%ymm5
+	vpaddd	64(%rbp),%ymm2,%ymm6
+	vpaddd	96(%rbp),%ymm3,%ymm7
+	vmovdqa	%ymm4,0(%rsp)
+	xorl	%r14d,%r14d
+	vmovdqa	%ymm5,32(%rsp)
+	leaq	-64(%rsp),%rsp
+	movl	%ebx,%edi
+	vmovdqa	%ymm6,0(%rsp)
+	xorl	%ecx,%edi
+	vmovdqa	%ymm7,32(%rsp)
+	movl	%r9d,%r12d
+	subq	$-32*4,%rbp
+	jmp	.Lavx2_00_47
+
+.align	16
+.Lavx2_00_47:
+	leaq	-64(%rsp),%rsp
+	vpalignr	$4,%ymm0,%ymm1,%ymm4
+	addl	0+128(%rsp),%r11d
+	andl	%r8d,%r12d
+	rorxl	$25,%r8d,%r13d
+	vpalignr	$4,%ymm2,%ymm3,%ymm7
+	rorxl	$11,%r8d,%r15d
+	leal	(%rax,%r14,1),%eax
+	leal	(%r11,%r12,1),%r11d
+	vpsrld	$7,%ymm4,%ymm6
+	andnl	%r10d,%r8d,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%r8d,%r14d
+	vpaddd	%ymm7,%ymm0,%ymm0
+	leal	(%r11,%r12,1),%r11d
+	xorl	%r14d,%r13d
+	movl	%eax,%r15d
+	vpsrld	$3,%ymm4,%ymm7
+	rorxl	$22,%eax,%r12d
+	leal	(%r11,%r13,1),%r11d
+	xorl	%ebx,%r15d
+	vpslld	$14,%ymm4,%ymm5
+	rorxl	$13,%eax,%r14d
+	rorxl	$2,%eax,%r13d
+	leal	(%rdx,%r11,1),%edx
+	vpxor	%ymm6,%ymm7,%ymm4
+	andl	%r15d,%edi
+	xorl	%r12d,%r14d
+	xorl	%ebx,%edi
+	vpshufd	$250,%ymm3,%ymm7
+	xorl	%r13d,%r14d
+	leal	(%r11,%rdi,1),%r11d
+	movl	%r8d,%r12d
+	vpsrld	$11,%ymm6,%ymm6
+	addl	4+128(%rsp),%r10d
+	andl	%edx,%r12d
+	rorxl	$25,%edx,%r13d
+	vpxor	%ymm5,%ymm4,%ymm4
+	rorxl	$11,%edx,%edi
+	leal	(%r11,%r14,1),%r11d
+	leal	(%r10,%r12,1),%r10d
+	vpslld	$11,%ymm5,%ymm5
+	andnl	%r9d,%edx,%r12d
+	xorl	%edi,%r13d
+	rorxl	$6,%edx,%r14d
+	vpxor	%ymm6,%ymm4,%ymm4
+	leal	(%r10,%r12,1),%r10d
+	xorl	%r14d,%r13d
+	movl	%r11d,%edi
+	vpsrld	$10,%ymm7,%ymm6
+	rorxl	$22,%r11d,%r12d
+	leal	(%r10,%r13,1),%r10d
+	xorl	%eax,%edi
+	vpxor	%ymm5,%ymm4,%ymm4
+	rorxl	$13,%r11d,%r14d
+	rorxl	$2,%r11d,%r13d
+	leal	(%rcx,%r10,1),%ecx
+	vpsrlq	$17,%ymm7,%ymm7
+	andl	%edi,%r15d
+	xorl	%r12d,%r14d
+	xorl	%eax,%r15d
+	vpaddd	%ymm4,%ymm0,%ymm0
+	xorl	%r13d,%r14d
+	leal	(%r10,%r15,1),%r10d
+	movl	%edx,%r12d
+	vpxor	%ymm7,%ymm6,%ymm6
+	addl	8+128(%rsp),%r9d
+	andl	%ecx,%r12d
+	rorxl	$25,%ecx,%r13d
+	vpsrlq	$2,%ymm7,%ymm7
+	rorxl	$11,%ecx,%r15d
+	leal	(%r10,%r14,1),%r10d
+	leal	(%r9,%r12,1),%r9d
+	vpxor	%ymm7,%ymm6,%ymm6
+	andnl	%r8d,%ecx,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%ecx,%r14d
+	vpshufb	%ymm8,%ymm6,%ymm6
+	leal	(%r9,%r12,1),%r9d
+	xorl	%r14d,%r13d
+	movl	%r10d,%r15d
+	vpaddd	%ymm6,%ymm0,%ymm0
+	rorxl	$22,%r10d,%r12d
+	leal	(%r9,%r13,1),%r9d
+	xorl	%r11d,%r15d
+	vpshufd	$80,%ymm0,%ymm7
+	rorxl	$13,%r10d,%r14d
+	rorxl	$2,%r10d,%r13d
+	leal	(%rbx,%r9,1),%ebx
+	vpsrld	$10,%ymm7,%ymm6
+	andl	%r15d,%edi
+	xorl	%r12d,%r14d
+	xorl	%r11d,%edi
+	vpsrlq	$17,%ymm7,%ymm7
+	xorl	%r13d,%r14d
+	leal	(%r9,%rdi,1),%r9d
+	movl	%ecx,%r12d
+	vpxor	%ymm7,%ymm6,%ymm6
+	addl	12+128(%rsp),%r8d
+	andl	%ebx,%r12d
+	rorxl	$25,%ebx,%r13d
+	vpsrlq	$2,%ymm7,%ymm7
+	rorxl	$11,%ebx,%edi
+	leal	(%r9,%r14,1),%r9d
+	leal	(%r8,%r12,1),%r8d
+	vpxor	%ymm7,%ymm6,%ymm6
+	andnl	%edx,%ebx,%r12d
+	xorl	%edi,%r13d
+	rorxl	$6,%ebx,%r14d
+	vpshufb	%ymm9,%ymm6,%ymm6
+	leal	(%r8,%r12,1),%r8d
+	xorl	%r14d,%r13d
+	movl	%r9d,%edi
+	vpaddd	%ymm6,%ymm0,%ymm0
+	rorxl	$22,%r9d,%r12d
+	leal	(%r8,%r13,1),%r8d
+	xorl	%r10d,%edi
+	vpaddd	0(%rbp),%ymm0,%ymm6
+	rorxl	$13,%r9d,%r14d
+	rorxl	$2,%r9d,%r13d
+	leal	(%rax,%r8,1),%eax
+	andl	%edi,%r15d
+	xorl	%r12d,%r14d
+	xorl	%r10d,%r15d
+	xorl	%r13d,%r14d
+	leal	(%r8,%r15,1),%r8d
+	movl	%ebx,%r12d
+	vmovdqa	%ymm6,0(%rsp)
+	vpalignr	$4,%ymm1,%ymm2,%ymm4
+	addl	32+128(%rsp),%edx
+	andl	%eax,%r12d
+	rorxl	$25,%eax,%r13d
+	vpalignr	$4,%ymm3,%ymm0,%ymm7
+	rorxl	$11,%eax,%r15d
+	leal	(%r8,%r14,1),%r8d
+	leal	(%rdx,%r12,1),%edx
+	vpsrld	$7,%ymm4,%ymm6
+	andnl	%ecx,%eax,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%eax,%r14d
+	vpaddd	%ymm7,%ymm1,%ymm1
+	leal	(%rdx,%r12,1),%edx
+	xorl	%r14d,%r13d
+	movl	%r8d,%r15d
+	vpsrld	$3,%ymm4,%ymm7
+	rorxl	$22,%r8d,%r12d
+	leal	(%rdx,%r13,1),%edx
+	xorl	%r9d,%r15d
+	vpslld	$14,%ymm4,%ymm5
+	rorxl	$13,%r8d,%r14d
+	rorxl	$2,%r8d,%r13d
+	leal	(%r11,%rdx,1),%r11d
+	vpxor	%ymm6,%ymm7,%ymm4
+	andl	%r15d,%edi
+	xorl	%r12d,%r14d
+	xorl	%r9d,%edi
+	vpshufd	$250,%ymm0,%ymm7
+	xorl	%r13d,%r14d
+	leal	(%rdx,%rdi,1),%edx
+	movl	%eax,%r12d
+	vpsrld	$11,%ymm6,%ymm6
+	addl	36+128(%rsp),%ecx
+	andl	%r11d,%r12d
+	rorxl	$25,%r11d,%r13d
+	vpxor	%ymm5,%ymm4,%ymm4
+	rorxl	$11,%r11d,%edi
+	leal	(%rdx,%r14,1),%edx
+	leal	(%rcx,%r12,1),%ecx
+	vpslld	$11,%ymm5,%ymm5
+	andnl	%ebx,%r11d,%r12d
+	xorl	%edi,%r13d
+	rorxl	$6,%r11d,%r14d
+	vpxor	%ymm6,%ymm4,%ymm4
+	leal	(%rcx,%r12,1),%ecx
+	xorl	%r14d,%r13d
+	movl	%edx,%edi
+	vpsrld	$10,%ymm7,%ymm6
+	rorxl	$22,%edx,%r12d
+	leal	(%rcx,%r13,1),%ecx
+	xorl	%r8d,%edi
+	vpxor	%ymm5,%ymm4,%ymm4
+	rorxl	$13,%edx,%r14d
+	rorxl	$2,%edx,%r13d
+	leal	(%r10,%rcx,1),%r10d
+	vpsrlq	$17,%ymm7,%ymm7
+	andl	%edi,%r15d
+	xorl	%r12d,%r14d
+	xorl	%r8d,%r15d
+	vpaddd	%ymm4,%ymm1,%ymm1
+	xorl	%r13d,%r14d
+	leal	(%rcx,%r15,1),%ecx
+	movl	%r11d,%r12d
+	vpxor	%ymm7,%ymm6,%ymm6
+	addl	40+128(%rsp),%ebx
+	andl	%r10d,%r12d
+	rorxl	$25,%r10d,%r13d
+	vpsrlq	$2,%ymm7,%ymm7
+	rorxl	$11,%r10d,%r15d
+	leal	(%rcx,%r14,1),%ecx
+	leal	(%rbx,%r12,1),%ebx
+	vpxor	%ymm7,%ymm6,%ymm6
+	andnl	%eax,%r10d,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%r10d,%r14d
+	vpshufb	%ymm8,%ymm6,%ymm6
+	leal	(%rbx,%r12,1),%ebx
+	xorl	%r14d,%r13d
+	movl	%ecx,%r15d
+	vpaddd	%ymm6,%ymm1,%ymm1
+	rorxl	$22,%ecx,%r12d
+	leal	(%rbx,%r13,1),%ebx
+	xorl	%edx,%r15d
+	vpshufd	$80,%ymm1,%ymm7
+	rorxl	$13,%ecx,%r14d
+	rorxl	$2,%ecx,%r13d
+	leal	(%r9,%rbx,1),%r9d
+	vpsrld	$10,%ymm7,%ymm6
+	andl	%r15d,%edi
+	xorl	%r12d,%r14d
+	xorl	%edx,%edi
+	vpsrlq	$17,%ymm7,%ymm7
+	xorl	%r13d,%r14d
+	leal	(%rbx,%rdi,1),%ebx
+	movl	%r10d,%r12d
+	vpxor	%ymm7,%ymm6,%ymm6
+	addl	44+128(%rsp),%eax
+	andl	%r9d,%r12d
+	rorxl	$25,%r9d,%r13d
+	vpsrlq	$2,%ymm7,%ymm7
+	rorxl	$11,%r9d,%edi
+	leal	(%rbx,%r14,1),%ebx
+	leal	(%rax,%r12,1),%eax
+	vpxor	%ymm7,%ymm6,%ymm6
+	andnl	%r11d,%r9d,%r12d
+	xorl	%edi,%r13d
+	rorxl	$6,%r9d,%r14d
+	vpshufb	%ymm9,%ymm6,%ymm6
+	leal	(%rax,%r12,1),%eax
+	xorl	%r14d,%r13d
+	movl	%ebx,%edi
+	vpaddd	%ymm6,%ymm1,%ymm1
+	rorxl	$22,%ebx,%r12d
+	leal	(%rax,%r13,1),%eax
+	xorl	%ecx,%edi
+	vpaddd	32(%rbp),%ymm1,%ymm6
+	rorxl	$13,%ebx,%r14d
+	rorxl	$2,%ebx,%r13d
+	leal	(%r8,%rax,1),%r8d
+	andl	%edi,%r15d
+	xorl	%r12d,%r14d
+	xorl	%ecx,%r15d
+	xorl	%r13d,%r14d
+	leal	(%rax,%r15,1),%eax
+	movl	%r9d,%r12d
+	vmovdqa	%ymm6,32(%rsp)
+	leaq	-64(%rsp),%rsp
+	vpalignr	$4,%ymm2,%ymm3,%ymm4
+	addl	0+128(%rsp),%r11d
+	andl	%r8d,%r12d
+	rorxl	$25,%r8d,%r13d
+	vpalignr	$4,%ymm0,%ymm1,%ymm7
+	rorxl	$11,%r8d,%r15d
+	leal	(%rax,%r14,1),%eax
+	leal	(%r11,%r12,1),%r11d
+	vpsrld	$7,%ymm4,%ymm6
+	andnl	%r10d,%r8d,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%r8d,%r14d
+	vpaddd	%ymm7,%ymm2,%ymm2
+	leal	(%r11,%r12,1),%r11d
+	xorl	%r14d,%r13d
+	movl	%eax,%r15d
+	vpsrld	$3,%ymm4,%ymm7
+	rorxl	$22,%eax,%r12d
+	leal	(%r11,%r13,1),%r11d
+	xorl	%ebx,%r15d
+	vpslld	$14,%ymm4,%ymm5
+	rorxl	$13,%eax,%r14d
+	rorxl	$2,%eax,%r13d
+	leal	(%rdx,%r11,1),%edx
+	vpxor	%ymm6,%ymm7,%ymm4
+	andl	%r15d,%edi
+	xorl	%r12d,%r14d
+	xorl	%ebx,%edi
+	vpshufd	$250,%ymm1,%ymm7
+	xorl	%r13d,%r14d
+	leal	(%r11,%rdi,1),%r11d
+	movl	%r8d,%r12d
+	vpsrld	$11,%ymm6,%ymm6
+	addl	4+128(%rsp),%r10d
+	andl	%edx,%r12d
+	rorxl	$25,%edx,%r13d
+	vpxor	%ymm5,%ymm4,%ymm4
+	rorxl	$11,%edx,%edi
+	leal	(%r11,%r14,1),%r11d
+	leal	(%r10,%r12,1),%r10d
+	vpslld	$11,%ymm5,%ymm5
+	andnl	%r9d,%edx,%r12d
+	xorl	%edi,%r13d
+	rorxl	$6,%edx,%r14d
+	vpxor	%ymm6,%ymm4,%ymm4
+	leal	(%r10,%r12,1),%r10d
+	xorl	%r14d,%r13d
+	movl	%r11d,%edi
+	vpsrld	$10,%ymm7,%ymm6
+	rorxl	$22,%r11d,%r12d
+	leal	(%r10,%r13,1),%r10d
+	xorl	%eax,%edi
+	vpxor	%ymm5,%ymm4,%ymm4
+	rorxl	$13,%r11d,%r14d
+	rorxl	$2,%r11d,%r13d
+	leal	(%rcx,%r10,1),%ecx
+	vpsrlq	$17,%ymm7,%ymm7
+	andl	%edi,%r15d
+	xorl	%r12d,%r14d
+	xorl	%eax,%r15d
+	vpaddd	%ymm4,%ymm2,%ymm2
+	xorl	%r13d,%r14d
+	leal	(%r10,%r15,1),%r10d
+	movl	%edx,%r12d
+	vpxor	%ymm7,%ymm6,%ymm6
+	addl	8+128(%rsp),%r9d
+	andl	%ecx,%r12d
+	rorxl	$25,%ecx,%r13d
+	vpsrlq	$2,%ymm7,%ymm7
+	rorxl	$11,%ecx,%r15d
+	leal	(%r10,%r14,1),%r10d
+	leal	(%r9,%r12,1),%r9d
+	vpxor	%ymm7,%ymm6,%ymm6
+	andnl	%r8d,%ecx,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%ecx,%r14d
+	vpshufb	%ymm8,%ymm6,%ymm6
+	leal	(%r9,%r12,1),%r9d
+	xorl	%r14d,%r13d
+	movl	%r10d,%r15d
+	vpaddd	%ymm6,%ymm2,%ymm2
+	rorxl	$22,%r10d,%r12d
+	leal	(%r9,%r13,1),%r9d
+	xorl	%r11d,%r15d
+	vpshufd	$80,%ymm2,%ymm7
+	rorxl	$13,%r10d,%r14d
+	rorxl	$2,%r10d,%r13d
+	leal	(%rbx,%r9,1),%ebx
+	vpsrld	$10,%ymm7,%ymm6
+	andl	%r15d,%edi
+	xorl	%r12d,%r14d
+	xorl	%r11d,%edi
+	vpsrlq	$17,%ymm7,%ymm7
+	xorl	%r13d,%r14d
+	leal	(%r9,%rdi,1),%r9d
+	movl	%ecx,%r12d
+	vpxor	%ymm7,%ymm6,%ymm6
+	addl	12+128(%rsp),%r8d
+	andl	%ebx,%r12d
+	rorxl	$25,%ebx,%r13d
+	vpsrlq	$2,%ymm7,%ymm7
+	rorxl	$11,%ebx,%edi
+	leal	(%r9,%r14,1),%r9d
+	leal	(%r8,%r12,1),%r8d
+	vpxor	%ymm7,%ymm6,%ymm6
+	andnl	%edx,%ebx,%r12d
+	xorl	%edi,%r13d
+	rorxl	$6,%ebx,%r14d
+	vpshufb	%ymm9,%ymm6,%ymm6
+	leal	(%r8,%r12,1),%r8d
+	xorl	%r14d,%r13d
+	movl	%r9d,%edi
+	vpaddd	%ymm6,%ymm2,%ymm2
+	rorxl	$22,%r9d,%r12d
+	leal	(%r8,%r13,1),%r8d
+	xorl	%r10d,%edi
+	vpaddd	64(%rbp),%ymm2,%ymm6
+	rorxl	$13,%r9d,%r14d
+	rorxl	$2,%r9d,%r13d
+	leal	(%rax,%r8,1),%eax
+	andl	%edi,%r15d
+	xorl	%r12d,%r14d
+	xorl	%r10d,%r15d
+	xorl	%r13d,%r14d
+	leal	(%r8,%r15,1),%r8d
+	movl	%ebx,%r12d
+	vmovdqa	%ymm6,0(%rsp)
+	vpalignr	$4,%ymm3,%ymm0,%ymm4
+	addl	32+128(%rsp),%edx
+	andl	%eax,%r12d
+	rorxl	$25,%eax,%r13d
+	vpalignr	$4,%ymm1,%ymm2,%ymm7
+	rorxl	$11,%eax,%r15d
+	leal	(%r8,%r14,1),%r8d
+	leal	(%rdx,%r12,1),%edx
+	vpsrld	$7,%ymm4,%ymm6
+	andnl	%ecx,%eax,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%eax,%r14d
+	vpaddd	%ymm7,%ymm3,%ymm3
+	leal	(%rdx,%r12,1),%edx
+	xorl	%r14d,%r13d
+	movl	%r8d,%r15d
+	vpsrld	$3,%ymm4,%ymm7
+	rorxl	$22,%r8d,%r12d
+	leal	(%rdx,%r13,1),%edx
+	xorl	%r9d,%r15d
+	vpslld	$14,%ymm4,%ymm5
+	rorxl	$13,%r8d,%r14d
+	rorxl	$2,%r8d,%r13d
+	leal	(%r11,%rdx,1),%r11d
+	vpxor	%ymm6,%ymm7,%ymm4
+	andl	%r15d,%edi
+	xorl	%r12d,%r14d
+	xorl	%r9d,%edi
+	vpshufd	$250,%ymm2,%ymm7
+	xorl	%r13d,%r14d
+	leal	(%rdx,%rdi,1),%edx
+	movl	%eax,%r12d
+	vpsrld	$11,%ymm6,%ymm6
+	addl	36+128(%rsp),%ecx
+	andl	%r11d,%r12d
+	rorxl	$25,%r11d,%r13d
+	vpxor	%ymm5,%ymm4,%ymm4
+	rorxl	$11,%r11d,%edi
+	leal	(%rdx,%r14,1),%edx
+	leal	(%rcx,%r12,1),%ecx
+	vpslld	$11,%ymm5,%ymm5
+	andnl	%ebx,%r11d,%r12d
+	xorl	%edi,%r13d
+	rorxl	$6,%r11d,%r14d
+	vpxor	%ymm6,%ymm4,%ymm4
+	leal	(%rcx,%r12,1),%ecx
+	xorl	%r14d,%r13d
+	movl	%edx,%edi
+	vpsrld	$10,%ymm7,%ymm6
+	rorxl	$22,%edx,%r12d
+	leal	(%rcx,%r13,1),%ecx
+	xorl	%r8d,%edi
+	vpxor	%ymm5,%ymm4,%ymm4
+	rorxl	$13,%edx,%r14d
+	rorxl	$2,%edx,%r13d
+	leal	(%r10,%rcx,1),%r10d
+	vpsrlq	$17,%ymm7,%ymm7
+	andl	%edi,%r15d
+	xorl	%r12d,%r14d
+	xorl	%r8d,%r15d
+	vpaddd	%ymm4,%ymm3,%ymm3
+	xorl	%r13d,%r14d
+	leal	(%rcx,%r15,1),%ecx
+	movl	%r11d,%r12d
+	vpxor	%ymm7,%ymm6,%ymm6
+	addl	40+128(%rsp),%ebx
+	andl	%r10d,%r12d
+	rorxl	$25,%r10d,%r13d
+	vpsrlq	$2,%ymm7,%ymm7
+	rorxl	$11,%r10d,%r15d
+	leal	(%rcx,%r14,1),%ecx
+	leal	(%rbx,%r12,1),%ebx
+	vpxor	%ymm7,%ymm6,%ymm6
+	andnl	%eax,%r10d,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%r10d,%r14d
+	vpshufb	%ymm8,%ymm6,%ymm6
+	leal	(%rbx,%r12,1),%ebx
+	xorl	%r14d,%r13d
+	movl	%ecx,%r15d
+	vpaddd	%ymm6,%ymm3,%ymm3
+	rorxl	$22,%ecx,%r12d
+	leal	(%rbx,%r13,1),%ebx
+	xorl	%edx,%r15d
+	vpshufd	$80,%ymm3,%ymm7
+	rorxl	$13,%ecx,%r14d
+	rorxl	$2,%ecx,%r13d
+	leal	(%r9,%rbx,1),%r9d
+	vpsrld	$10,%ymm7,%ymm6
+	andl	%r15d,%edi
+	xorl	%r12d,%r14d
+	xorl	%edx,%edi
+	vpsrlq	$17,%ymm7,%ymm7
+	xorl	%r13d,%r14d
+	leal	(%rbx,%rdi,1),%ebx
+	movl	%r10d,%r12d
+	vpxor	%ymm7,%ymm6,%ymm6
+	addl	44+128(%rsp),%eax
+	andl	%r9d,%r12d
+	rorxl	$25,%r9d,%r13d
+	vpsrlq	$2,%ymm7,%ymm7
+	rorxl	$11,%r9d,%edi
+	leal	(%rbx,%r14,1),%ebx
+	leal	(%rax,%r12,1),%eax
+	vpxor	%ymm7,%ymm6,%ymm6
+	andnl	%r11d,%r9d,%r12d
+	xorl	%edi,%r13d
+	rorxl	$6,%r9d,%r14d
+	vpshufb	%ymm9,%ymm6,%ymm6
+	leal	(%rax,%r12,1),%eax
+	xorl	%r14d,%r13d
+	movl	%ebx,%edi
+	vpaddd	%ymm6,%ymm3,%ymm3
+	rorxl	$22,%ebx,%r12d
+	leal	(%rax,%r13,1),%eax
+	xorl	%ecx,%edi
+	vpaddd	96(%rbp),%ymm3,%ymm6
+	rorxl	$13,%ebx,%r14d
+	rorxl	$2,%ebx,%r13d
+	leal	(%r8,%rax,1),%r8d
+	andl	%edi,%r15d
+	xorl	%r12d,%r14d
+	xorl	%ecx,%r15d
+	xorl	%r13d,%r14d
+	leal	(%rax,%r15,1),%eax
+	movl	%r9d,%r12d
+	vmovdqa	%ymm6,32(%rsp)
+	leaq	128(%rbp),%rbp
+	cmpb	$0,3(%rbp)
+	jne	.Lavx2_00_47
+	addl	0+64(%rsp),%r11d
+	andl	%r8d,%r12d
+	rorxl	$25,%r8d,%r13d
+	rorxl	$11,%r8d,%r15d
+	leal	(%rax,%r14,1),%eax
+	leal	(%r11,%r12,1),%r11d
+	andnl	%r10d,%r8d,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%r8d,%r14d
+	leal	(%r11,%r12,1),%r11d
+	xorl	%r14d,%r13d
+	movl	%eax,%r15d
+	rorxl	$22,%eax,%r12d
+	leal	(%r11,%r13,1),%r11d
+	xorl	%ebx,%r15d
+	rorxl	$13,%eax,%r14d
+	rorxl	$2,%eax,%r13d
+	leal	(%rdx,%r11,1),%edx
+	andl	%r15d,%edi
+	xorl	%r12d,%r14d
+	xorl	%ebx,%edi
+	xorl	%r13d,%r14d
+	leal	(%r11,%rdi,1),%r11d
+	movl	%r8d,%r12d
+	addl	4+64(%rsp),%r10d
+	andl	%edx,%r12d
+	rorxl	$25,%edx,%r13d
+	rorxl	$11,%edx,%edi
+	leal	(%r11,%r14,1),%r11d
+	leal	(%r10,%r12,1),%r10d
+	andnl	%r9d,%edx,%r12d
+	xorl	%edi,%r13d
+	rorxl	$6,%edx,%r14d
+	leal	(%r10,%r12,1),%r10d
+	xorl	%r14d,%r13d
+	movl	%r11d,%edi
+	rorxl	$22,%r11d,%r12d
+	leal	(%r10,%r13,1),%r10d
+	xorl	%eax,%edi
+	rorxl	$13,%r11d,%r14d
+	rorxl	$2,%r11d,%r13d
+	leal	(%rcx,%r10,1),%ecx
+	andl	%edi,%r15d
+	xorl	%r12d,%r14d
+	xorl	%eax,%r15d
+	xorl	%r13d,%r14d
+	leal	(%r10,%r15,1),%r10d
+	movl	%edx,%r12d
+	addl	8+64(%rsp),%r9d
+	andl	%ecx,%r12d
+	rorxl	$25,%ecx,%r13d
+	rorxl	$11,%ecx,%r15d
+	leal	(%r10,%r14,1),%r10d
+	leal	(%r9,%r12,1),%r9d
+	andnl	%r8d,%ecx,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%ecx,%r14d
+	leal	(%r9,%r12,1),%r9d
+	xorl	%r14d,%r13d
+	movl	%r10d,%r15d
+	rorxl	$22,%r10d,%r12d
+	leal	(%r9,%r13,1),%r9d
+	xorl	%r11d,%r15d
+	rorxl	$13,%r10d,%r14d
+	rorxl	$2,%r10d,%r13d
+	leal	(%rbx,%r9,1),%ebx
+	andl	%r15d,%edi
+	xorl	%r12d,%r14d
+	xorl	%r11d,%edi
+	xorl	%r13d,%r14d
+	leal	(%r9,%rdi,1),%r9d
+	movl	%ecx,%r12d
+	addl	12+64(%rsp),%r8d
+	andl	%ebx,%r12d
+	rorxl	$25,%ebx,%r13d
+	rorxl	$11,%ebx,%edi
+	leal	(%r9,%r14,1),%r9d
+	leal	(%r8,%r12,1),%r8d
+	andnl	%edx,%ebx,%r12d
+	xorl	%edi,%r13d
+	rorxl	$6,%ebx,%r14d
+	leal	(%r8,%r12,1),%r8d
+	xorl	%r14d,%r13d
+	movl	%r9d,%edi
+	rorxl	$22,%r9d,%r12d
+	leal	(%r8,%r13,1),%r8d
+	xorl	%r10d,%edi
+	rorxl	$13,%r9d,%r14d
+	rorxl	$2,%r9d,%r13d
+	leal	(%rax,%r8,1),%eax
+	andl	%edi,%r15d
+	xorl	%r12d,%r14d
+	xorl	%r10d,%r15d
+	xorl	%r13d,%r14d
+	leal	(%r8,%r15,1),%r8d
+	movl	%ebx,%r12d
+	addl	32+64(%rsp),%edx
+	andl	%eax,%r12d
+	rorxl	$25,%eax,%r13d
+	rorxl	$11,%eax,%r15d
+	leal	(%r8,%r14,1),%r8d
+	leal	(%rdx,%r12,1),%edx
+	andnl	%ecx,%eax,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%eax,%r14d
+	leal	(%rdx,%r12,1),%edx
+	xorl	%r14d,%r13d
+	movl	%r8d,%r15d
+	rorxl	$22,%r8d,%r12d
+	leal	(%rdx,%r13,1),%edx
+	xorl	%r9d,%r15d
+	rorxl	$13,%r8d,%r14d
+	rorxl	$2,%r8d,%r13d
+	leal	(%r11,%rdx,1),%r11d
+	andl	%r15d,%edi
+	xorl	%r12d,%r14d
+	xorl	%r9d,%edi
+	xorl	%r13d,%r14d
+	leal	(%rdx,%rdi,1),%edx
+	movl	%eax,%r12d
+	addl	36+64(%rsp),%ecx
+	andl	%r11d,%r12d
+	rorxl	$25,%r11d,%r13d
+	rorxl	$11,%r11d,%edi
+	leal	(%rdx,%r14,1),%edx
+	leal	(%rcx,%r12,1),%ecx
+	andnl	%ebx,%r11d,%r12d
+	xorl	%edi,%r13d
+	rorxl	$6,%r11d,%r14d
+	leal	(%rcx,%r12,1),%ecx
+	xorl	%r14d,%r13d
+	movl	%edx,%edi
+	rorxl	$22,%edx,%r12d
+	leal	(%rcx,%r13,1),%ecx
+	xorl	%r8d,%edi
+	rorxl	$13,%edx,%r14d
+	rorxl	$2,%edx,%r13d
+	leal	(%r10,%rcx,1),%r10d
+	andl	%edi,%r15d
+	xorl	%r12d,%r14d
+	xorl	%r8d,%r15d
+	xorl	%r13d,%r14d
+	leal	(%rcx,%r15,1),%ecx
+	movl	%r11d,%r12d
+	addl	40+64(%rsp),%ebx
+	andl	%r10d,%r12d
+	rorxl	$25,%r10d,%r13d
+	rorxl	$11,%r10d,%r15d
+	leal	(%rcx,%r14,1),%ecx
+	leal	(%rbx,%r12,1),%ebx
+	andnl	%eax,%r10d,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%r10d,%r14d
+	leal	(%rbx,%r12,1),%ebx
+	xorl	%r14d,%r13d
+	movl	%ecx,%r15d
+	rorxl	$22,%ecx,%r12d
+	leal	(%rbx,%r13,1),%ebx
+	xorl	%edx,%r15d
+	rorxl	$13,%ecx,%r14d
+	rorxl	$2,%ecx,%r13d
+	leal	(%r9,%rbx,1),%r9d
+	andl	%r15d,%edi
+	xorl	%r12d,%r14d
+	xorl	%edx,%edi
+	xorl	%r13d,%r14d
+	leal	(%rbx,%rdi,1),%ebx
+	movl	%r10d,%r12d
+	addl	44+64(%rsp),%eax
+	andl	%r9d,%r12d
+	rorxl	$25,%r9d,%r13d
+	rorxl	$11,%r9d,%edi
+	leal	(%rbx,%r14,1),%ebx
+	leal	(%rax,%r12,1),%eax
+	andnl	%r11d,%r9d,%r12d
+	xorl	%edi,%r13d
+	rorxl	$6,%r9d,%r14d
+	leal	(%rax,%r12,1),%eax
+	xorl	%r14d,%r13d
+	movl	%ebx,%edi
+	rorxl	$22,%ebx,%r12d
+	leal	(%rax,%r13,1),%eax
+	xorl	%ecx,%edi
+	rorxl	$13,%ebx,%r14d
+	rorxl	$2,%ebx,%r13d
+	leal	(%r8,%rax,1),%r8d
+	andl	%edi,%r15d
+	xorl	%r12d,%r14d
+	xorl	%ecx,%r15d
+	xorl	%r13d,%r14d
+	leal	(%rax,%r15,1),%eax
+	movl	%r9d,%r12d
+	addl	0(%rsp),%r11d
+	andl	%r8d,%r12d
+	rorxl	$25,%r8d,%r13d
+	rorxl	$11,%r8d,%r15d
+	leal	(%rax,%r14,1),%eax
+	leal	(%r11,%r12,1),%r11d
+	andnl	%r10d,%r8d,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%r8d,%r14d
+	leal	(%r11,%r12,1),%r11d
+	xorl	%r14d,%r13d
+	movl	%eax,%r15d
+	rorxl	$22,%eax,%r12d
+	leal	(%r11,%r13,1),%r11d
+	xorl	%ebx,%r15d
+	rorxl	$13,%eax,%r14d
+	rorxl	$2,%eax,%r13d
+	leal	(%rdx,%r11,1),%edx
+	andl	%r15d,%edi
+	xorl	%r12d,%r14d
+	xorl	%ebx,%edi
+	xorl	%r13d,%r14d
+	leal	(%r11,%rdi,1),%r11d
+	movl	%r8d,%r12d
+	addl	4(%rsp),%r10d
+	andl	%edx,%r12d
+	rorxl	$25,%edx,%r13d
+	rorxl	$11,%edx,%edi
+	leal	(%r11,%r14,1),%r11d
+	leal	(%r10,%r12,1),%r10d
+	andnl	%r9d,%edx,%r12d
+	xorl	%edi,%r13d
+	rorxl	$6,%edx,%r14d
+	leal	(%r10,%r12,1),%r10d
+	xorl	%r14d,%r13d
+	movl	%r11d,%edi
+	rorxl	$22,%r11d,%r12d
+	leal	(%r10,%r13,1),%r10d
+	xorl	%eax,%edi
+	rorxl	$13,%r11d,%r14d
+	rorxl	$2,%r11d,%r13d
+	leal	(%rcx,%r10,1),%ecx
+	andl	%edi,%r15d
+	xorl	%r12d,%r14d
+	xorl	%eax,%r15d
+	xorl	%r13d,%r14d
+	leal	(%r10,%r15,1),%r10d
+	movl	%edx,%r12d
+	addl	8(%rsp),%r9d
+	andl	%ecx,%r12d
+	rorxl	$25,%ecx,%r13d
+	rorxl	$11,%ecx,%r15d
+	leal	(%r10,%r14,1),%r10d
+	leal	(%r9,%r12,1),%r9d
+	andnl	%r8d,%ecx,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%ecx,%r14d
+	leal	(%r9,%r12,1),%r9d
+	xorl	%r14d,%r13d
+	movl	%r10d,%r15d
+	rorxl	$22,%r10d,%r12d
+	leal	(%r9,%r13,1),%r9d
+	xorl	%r11d,%r15d
+	rorxl	$13,%r10d,%r14d
+	rorxl	$2,%r10d,%r13d
+	leal	(%rbx,%r9,1),%ebx
+	andl	%r15d,%edi
+	xorl	%r12d,%r14d
+	xorl	%r11d,%edi
+	xorl	%r13d,%r14d
+	leal	(%r9,%rdi,1),%r9d
+	movl	%ecx,%r12d
+	addl	12(%rsp),%r8d
+	andl	%ebx,%r12d
+	rorxl	$25,%ebx,%r13d
+	rorxl	$11,%ebx,%edi
+	leal	(%r9,%r14,1),%r9d
+	leal	(%r8,%r12,1),%r8d
+	andnl	%edx,%ebx,%r12d
+	xorl	%edi,%r13d
+	rorxl	$6,%ebx,%r14d
+	leal	(%r8,%r12,1),%r8d
+	xorl	%r14d,%r13d
+	movl	%r9d,%edi
+	rorxl	$22,%r9d,%r12d
+	leal	(%r8,%r13,1),%r8d
+	xorl	%r10d,%edi
+	rorxl	$13,%r9d,%r14d
+	rorxl	$2,%r9d,%r13d
+	leal	(%rax,%r8,1),%eax
+	andl	%edi,%r15d
+	xorl	%r12d,%r14d
+	xorl	%r10d,%r15d
+	xorl	%r13d,%r14d
+	leal	(%r8,%r15,1),%r8d
+	movl	%ebx,%r12d
+	addl	32(%rsp),%edx
+	andl	%eax,%r12d
+	rorxl	$25,%eax,%r13d
+	rorxl	$11,%eax,%r15d
+	leal	(%r8,%r14,1),%r8d
+	leal	(%rdx,%r12,1),%edx
+	andnl	%ecx,%eax,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%eax,%r14d
+	leal	(%rdx,%r12,1),%edx
+	xorl	%r14d,%r13d
+	movl	%r8d,%r15d
+	rorxl	$22,%r8d,%r12d
+	leal	(%rdx,%r13,1),%edx
+	xorl	%r9d,%r15d
+	rorxl	$13,%r8d,%r14d
+	rorxl	$2,%r8d,%r13d
+	leal	(%r11,%rdx,1),%r11d
+	andl	%r15d,%edi
+	xorl	%r12d,%r14d
+	xorl	%r9d,%edi
+	xorl	%r13d,%r14d
+	leal	(%rdx,%rdi,1),%edx
+	movl	%eax,%r12d
+	addl	36(%rsp),%ecx
+	andl	%r11d,%r12d
+	rorxl	$25,%r11d,%r13d
+	rorxl	$11,%r11d,%edi
+	leal	(%rdx,%r14,1),%edx
+	leal	(%rcx,%r12,1),%ecx
+	andnl	%ebx,%r11d,%r12d
+	xorl	%edi,%r13d
+	rorxl	$6,%r11d,%r14d
+	leal	(%rcx,%r12,1),%ecx
+	xorl	%r14d,%r13d
+	movl	%edx,%edi
+	rorxl	$22,%edx,%r12d
+	leal	(%rcx,%r13,1),%ecx
+	xorl	%r8d,%edi
+	rorxl	$13,%edx,%r14d
+	rorxl	$2,%edx,%r13d
+	leal	(%r10,%rcx,1),%r10d
+	andl	%edi,%r15d
+	xorl	%r12d,%r14d
+	xorl	%r8d,%r15d
+	xorl	%r13d,%r14d
+	leal	(%rcx,%r15,1),%ecx
+	movl	%r11d,%r12d
+	addl	40(%rsp),%ebx
+	andl	%r10d,%r12d
+	rorxl	$25,%r10d,%r13d
+	rorxl	$11,%r10d,%r15d
+	leal	(%rcx,%r14,1),%ecx
+	leal	(%rbx,%r12,1),%ebx
+	andnl	%eax,%r10d,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%r10d,%r14d
+	leal	(%rbx,%r12,1),%ebx
+	xorl	%r14d,%r13d
+	movl	%ecx,%r15d
+	rorxl	$22,%ecx,%r12d
+	leal	(%rbx,%r13,1),%ebx
+	xorl	%edx,%r15d
+	rorxl	$13,%ecx,%r14d
+	rorxl	$2,%ecx,%r13d
+	leal	(%r9,%rbx,1),%r9d
+	andl	%r15d,%edi
+	xorl	%r12d,%r14d
+	xorl	%edx,%edi
+	xorl	%r13d,%r14d
+	leal	(%rbx,%rdi,1),%ebx
+	movl	%r10d,%r12d
+	addl	44(%rsp),%eax
+	andl	%r9d,%r12d
+	rorxl	$25,%r9d,%r13d
+	rorxl	$11,%r9d,%edi
+	leal	(%rbx,%r14,1),%ebx
+	leal	(%rax,%r12,1),%eax
+	andnl	%r11d,%r9d,%r12d
+	xorl	%edi,%r13d
+	rorxl	$6,%r9d,%r14d
+	leal	(%rax,%r12,1),%eax
+	xorl	%r14d,%r13d
+	movl	%ebx,%edi
+	rorxl	$22,%ebx,%r12d
+	leal	(%rax,%r13,1),%eax
+	xorl	%ecx,%edi
+	rorxl	$13,%ebx,%r14d
+	rorxl	$2,%ebx,%r13d
+	leal	(%r8,%rax,1),%r8d
+	andl	%edi,%r15d
+	xorl	%r12d,%r14d
+	xorl	%ecx,%r15d
+	xorl	%r13d,%r14d
+	leal	(%rax,%r15,1),%eax
+	movl	%r9d,%r12d
+	movq	512(%rsp),%rdi
+	addl	%r14d,%eax
+
+	leaq	448(%rsp),%rbp
+
+	addl	0(%rdi),%eax
+	addl	4(%rdi),%ebx
+	addl	8(%rdi),%ecx
+	addl	12(%rdi),%edx
+	addl	16(%rdi),%r8d
+	addl	20(%rdi),%r9d
+	addl	24(%rdi),%r10d
+	addl	28(%rdi),%r11d
+
+	movl	%eax,0(%rdi)
+	movl	%ebx,4(%rdi)
+	movl	%ecx,8(%rdi)
+	movl	%edx,12(%rdi)
+	movl	%r8d,16(%rdi)
+	movl	%r9d,20(%rdi)
+	movl	%r10d,24(%rdi)
+	movl	%r11d,28(%rdi)
+
+	cmpq	80(%rbp),%rsi
+	je	.Ldone_avx2
+
+	xorl	%r14d,%r14d
+	movl	%ebx,%edi
+	xorl	%ecx,%edi
+	movl	%r9d,%r12d
+	jmp	.Lower_avx2
+.align	16
+.Lower_avx2:
+	addl	0+16(%rbp),%r11d
+	andl	%r8d,%r12d
+	rorxl	$25,%r8d,%r13d
+	rorxl	$11,%r8d,%r15d
+	leal	(%rax,%r14,1),%eax
+	leal	(%r11,%r12,1),%r11d
+	andnl	%r10d,%r8d,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%r8d,%r14d
+	leal	(%r11,%r12,1),%r11d
+	xorl	%r14d,%r13d
+	movl	%eax,%r15d
+	rorxl	$22,%eax,%r12d
+	leal	(%r11,%r13,1),%r11d
+	xorl	%ebx,%r15d
+	rorxl	$13,%eax,%r14d
+	rorxl	$2,%eax,%r13d
+	leal	(%rdx,%r11,1),%edx
+	andl	%r15d,%edi
+	xorl	%r12d,%r14d
+	xorl	%ebx,%edi
+	xorl	%r13d,%r14d
+	leal	(%r11,%rdi,1),%r11d
+	movl	%r8d,%r12d
+	addl	4+16(%rbp),%r10d
+	andl	%edx,%r12d
+	rorxl	$25,%edx,%r13d
+	rorxl	$11,%edx,%edi
+	leal	(%r11,%r14,1),%r11d
+	leal	(%r10,%r12,1),%r10d
+	andnl	%r9d,%edx,%r12d
+	xorl	%edi,%r13d
+	rorxl	$6,%edx,%r14d
+	leal	(%r10,%r12,1),%r10d
+	xorl	%r14d,%r13d
+	movl	%r11d,%edi
+	rorxl	$22,%r11d,%r12d
+	leal	(%r10,%r13,1),%r10d
+	xorl	%eax,%edi
+	rorxl	$13,%r11d,%r14d
+	rorxl	$2,%r11d,%r13d
+	leal	(%rcx,%r10,1),%ecx
+	andl	%edi,%r15d
+	xorl	%r12d,%r14d
+	xorl	%eax,%r15d
+	xorl	%r13d,%r14d
+	leal	(%r10,%r15,1),%r10d
+	movl	%edx,%r12d
+	addl	8+16(%rbp),%r9d
+	andl	%ecx,%r12d
+	rorxl	$25,%ecx,%r13d
+	rorxl	$11,%ecx,%r15d
+	leal	(%r10,%r14,1),%r10d
+	leal	(%r9,%r12,1),%r9d
+	andnl	%r8d,%ecx,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%ecx,%r14d
+	leal	(%r9,%r12,1),%r9d
+	xorl	%r14d,%r13d
+	movl	%r10d,%r15d
+	rorxl	$22,%r10d,%r12d
+	leal	(%r9,%r13,1),%r9d
+	xorl	%r11d,%r15d
+	rorxl	$13,%r10d,%r14d
+	rorxl	$2,%r10d,%r13d
+	leal	(%rbx,%r9,1),%ebx
+	andl	%r15d,%edi
+	xorl	%r12d,%r14d
+	xorl	%r11d,%edi
+	xorl	%r13d,%r14d
+	leal	(%r9,%rdi,1),%r9d
+	movl	%ecx,%r12d
+	addl	12+16(%rbp),%r8d
+	andl	%ebx,%r12d
+	rorxl	$25,%ebx,%r13d
+	rorxl	$11,%ebx,%edi
+	leal	(%r9,%r14,1),%r9d
+	leal	(%r8,%r12,1),%r8d
+	andnl	%edx,%ebx,%r12d
+	xorl	%edi,%r13d
+	rorxl	$6,%ebx,%r14d
+	leal	(%r8,%r12,1),%r8d
+	xorl	%r14d,%r13d
+	movl	%r9d,%edi
+	rorxl	$22,%r9d,%r12d
+	leal	(%r8,%r13,1),%r8d
+	xorl	%r10d,%edi
+	rorxl	$13,%r9d,%r14d
+	rorxl	$2,%r9d,%r13d
+	leal	(%rax,%r8,1),%eax
+	andl	%edi,%r15d
+	xorl	%r12d,%r14d
+	xorl	%r10d,%r15d
+	xorl	%r13d,%r14d
+	leal	(%r8,%r15,1),%r8d
+	movl	%ebx,%r12d
+	addl	32+16(%rbp),%edx
+	andl	%eax,%r12d
+	rorxl	$25,%eax,%r13d
+	rorxl	$11,%eax,%r15d
+	leal	(%r8,%r14,1),%r8d
+	leal	(%rdx,%r12,1),%edx
+	andnl	%ecx,%eax,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%eax,%r14d
+	leal	(%rdx,%r12,1),%edx
+	xorl	%r14d,%r13d
+	movl	%r8d,%r15d
+	rorxl	$22,%r8d,%r12d
+	leal	(%rdx,%r13,1),%edx
+	xorl	%r9d,%r15d
+	rorxl	$13,%r8d,%r14d
+	rorxl	$2,%r8d,%r13d
+	leal	(%r11,%rdx,1),%r11d
+	andl	%r15d,%edi
+	xorl	%r12d,%r14d
+	xorl	%r9d,%edi
+	xorl	%r13d,%r14d
+	leal	(%rdx,%rdi,1),%edx
+	movl	%eax,%r12d
+	addl	36+16(%rbp),%ecx
+	andl	%r11d,%r12d
+	rorxl	$25,%r11d,%r13d
+	rorxl	$11,%r11d,%edi
+	leal	(%rdx,%r14,1),%edx
+	leal	(%rcx,%r12,1),%ecx
+	andnl	%ebx,%r11d,%r12d
+	xorl	%edi,%r13d
+	rorxl	$6,%r11d,%r14d
+	leal	(%rcx,%r12,1),%ecx
+	xorl	%r14d,%r13d
+	movl	%edx,%edi
+	rorxl	$22,%edx,%r12d
+	leal	(%rcx,%r13,1),%ecx
+	xorl	%r8d,%edi
+	rorxl	$13,%edx,%r14d
+	rorxl	$2,%edx,%r13d
+	leal	(%r10,%rcx,1),%r10d
+	andl	%edi,%r15d
+	xorl	%r12d,%r14d
+	xorl	%r8d,%r15d
+	xorl	%r13d,%r14d
+	leal	(%rcx,%r15,1),%ecx
+	movl	%r11d,%r12d
+	addl	40+16(%rbp),%ebx
+	andl	%r10d,%r12d
+	rorxl	$25,%r10d,%r13d
+	rorxl	$11,%r10d,%r15d
+	leal	(%rcx,%r14,1),%ecx
+	leal	(%rbx,%r12,1),%ebx
+	andnl	%eax,%r10d,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%r10d,%r14d
+	leal	(%rbx,%r12,1),%ebx
+	xorl	%r14d,%r13d
+	movl	%ecx,%r15d
+	rorxl	$22,%ecx,%r12d
+	leal	(%rbx,%r13,1),%ebx
+	xorl	%edx,%r15d
+	rorxl	$13,%ecx,%r14d
+	rorxl	$2,%ecx,%r13d
+	leal	(%r9,%rbx,1),%r9d
+	andl	%r15d,%edi
+	xorl	%r12d,%r14d
+	xorl	%edx,%edi
+	xorl	%r13d,%r14d
+	leal	(%rbx,%rdi,1),%ebx
+	movl	%r10d,%r12d
+	addl	44+16(%rbp),%eax
+	andl	%r9d,%r12d
+	rorxl	$25,%r9d,%r13d
+	rorxl	$11,%r9d,%edi
+	leal	(%rbx,%r14,1),%ebx
+	leal	(%rax,%r12,1),%eax
+	andnl	%r11d,%r9d,%r12d
+	xorl	%edi,%r13d
+	rorxl	$6,%r9d,%r14d
+	leal	(%rax,%r12,1),%eax
+	xorl	%r14d,%r13d
+	movl	%ebx,%edi
+	rorxl	$22,%ebx,%r12d
+	leal	(%rax,%r13,1),%eax
+	xorl	%ecx,%edi
+	rorxl	$13,%ebx,%r14d
+	rorxl	$2,%ebx,%r13d
+	leal	(%r8,%rax,1),%r8d
+	andl	%edi,%r15d
+	xorl	%r12d,%r14d
+	xorl	%ecx,%r15d
+	xorl	%r13d,%r14d
+	leal	(%rax,%r15,1),%eax
+	movl	%r9d,%r12d
+	leaq	-64(%rbp),%rbp
+	cmpq	%rsp,%rbp
+	jae	.Lower_avx2
+
+	movq	512(%rsp),%rdi
+	addl	%r14d,%eax
+
+	leaq	448(%rsp),%rsp
+
+	addl	0(%rdi),%eax
+	addl	4(%rdi),%ebx
+	addl	8(%rdi),%ecx
+	addl	12(%rdi),%edx
+	addl	16(%rdi),%r8d
+	addl	20(%rdi),%r9d
+	leaq	128(%rsi),%rsi
+	addl	24(%rdi),%r10d
+	movq	%rsi,%r12
+	addl	28(%rdi),%r11d
+	cmpq	64+16(%rsp),%rsi
+
+	movl	%eax,0(%rdi)
+	cmoveq	%rsp,%r12
+	movl	%ebx,4(%rdi)
+	movl	%ecx,8(%rdi)
+	movl	%edx,12(%rdi)
+	movl	%r8d,16(%rdi)
+	movl	%r9d,20(%rdi)
+	movl	%r10d,24(%rdi)
+	movl	%r11d,28(%rdi)
+
+	jbe	.Loop_avx2
+	leaq	(%rsp),%rbp
+
+.Ldone_avx2:
+	leaq	(%rbp),%rsp
+	movq	64+24(%rsp),%rsi
+	vzeroupper
+	movq	(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lepilogue_avx2:
+	.byte	0xf3,0xc3
+.size	sha256_block_data_order_avx2,.-sha256_block_data_order_avx2
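Both vector paths above compute the standard SHA-256 compression round: the rorxl amounts in the AVX2 loop (25, 11, 6 for e and 22, 13, 2 for a), and the equivalent composed shrdl rotations in the AVX loop, are Sigma1(e) and Sigma0(a), while the andl/andnl pairs evaluate the Ch selection. For reference, a minimal self-contained C sketch of one round in FIPS 180-4 notation (illustrative only, not code from this commit):

	#include <stdint.h>

	uint32_t rotr32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }

	/* One SHA-256 round.  s[0..7] is the working state a..h, k the round
	   constant (the K256 table in the assembly), w the scheduled message word. */
	void sha256_round(uint32_t s[8], uint32_t k, uint32_t w)
	{
	    uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
	    uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

	    uint32_t S1  = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
	    uint32_t ch  = (e & f) ^ (~e & g);              /* the andl/andnl pair */
	    uint32_t t1  = h + S1 + ch + k + w;
	    uint32_t S0  = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
	    uint32_t maj = (a & b) ^ (a & c) ^ (b & c);
	    uint32_t t2  = S0 + maj;

	    s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
	    s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
	}

The assembly performs the same update without the final state rotation by renaming registers from round to round instead of moving data.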

Modified: trunk/secure/lib/libcrypto/amd64/sha512-x86_64.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/sha512-x86_64.S	2019-01-20 05:38:02 UTC (rev 12152)
+++ trunk/secure/lib/libcrypto/amd64/sha512-x86_64.S	2019-01-20 05:38:15 UTC (rev 12153)
@@ -1,12 +1,27 @@
 /* $MidnightBSD$ */
-	# $FreeBSD: stable/10/secure/lib/libcrypto/amd64/sha512-x86_64.S 299966 2016-05-16 19:30:27Z jkim $
-	# Do not modify. This file is auto-generated from sha512-x86_64.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/sha512-x86_64.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from sha512-x86_64.pl. */
 .text	
 
+
 .globl	sha512_block_data_order
 .type	sha512_block_data_order,@function
 .align	16
 sha512_block_data_order:
+	leaq	OPENSSL_ia32cap_P(%rip),%r11
+	movl	0(%r11),%r9d
+	movl	4(%r11),%r10d
+	movl	8(%r11),%r11d
+	testl	$2048,%r10d
+	jnz	.Lxop_shortcut
+	andl	$296,%r11d
+	cmpl	$296,%r11d
+	je	.Lavx2_shortcut
+	andl	$1073741824,%r9d
+	andl	$268435968,%r10d
+	orl	%r9d,%r10d
+	cmpl	$1342177792,%r10d
+	je	.Lavx_shortcut
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
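The hunk above adds runtime dispatch to sha512_block_data_order: the new prologue loads the OPENSSL_ia32cap_P capability words and branches to .Lxop_shortcut, .Lavx2_shortcut or .Lavx_shortcut when the corresponding CPU features are present, falling through to the original integer code otherwise. A hedged C sketch of that dispatch pattern (the capability layout, bit masks and names below are illustrative, not OpenSSL's actual interface):

	#include <stdio.h>

	/* Capability words, analogous in spirit to OPENSSL_ia32cap_P; the bit
	   positions used here are hypothetical and only serve the illustration. */
	static unsigned int cpu_caps[3];

	#define CAP_XOP  (1u << 11)
	#define CAP_AVX  (1u << 28)
	#define CAP_AVX2 (1u << 5)
	#define CAP_BMI2 (1u << 8)

	/* Choose an implementation the way the patched prologue does: test the
	   feature bits once, then continue in the matching code path. */
	static const char *pick_sha512_path(void)
	{
	    if (cpu_caps[1] & CAP_XOP)
	        return "xop";
	    if ((cpu_caps[2] & (CAP_AVX2 | CAP_BMI2)) == (CAP_AVX2 | CAP_BMI2))
	        return "avx2";
	    if (cpu_caps[1] & CAP_AVX)
	        return "avx";
	    return "generic integer";
	}

	int main(void)
	{
	    cpu_caps[1] = CAP_AVX;   /* pretend the CPU only advertises AVX */
	    printf("selected sha512 path: %s\n", pick_sha512_path());
	    return 0;
	}

Because the test sits at the top of the one public entry point, callers keep using the single sha512_block_data_order symbol and automatically get the fastest available code path.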
@@ -24,8 +39,6 @@
 	movq	%r11,128+24(%rsp)
 .Lprologue:
 
-	leaq	K512(%rip),%rbp
-
 	movq	0(%rdi),%rax
 	movq	8(%rdi),%rbx
 	movq	16(%rdi),%rcx
@@ -38,7 +51,9 @@
 
 .align	16
 .Lloop:
-	xorq	%rdi,%rdi
+	movq	%rbx,%rdi
+	leaq	K512(%rip),%rbp
+	xorq	%rcx,%rdi
 	movq	0(%rsi),%r12
 	movq	%r8,%r13
 	movq	%rax,%r14
@@ -45,84 +60,82 @@
 	bswapq	%r12
 	rorq	$23,%r13
 	movq	%r9,%r15
-	movq	%r12,0(%rsp)
 
+	xorq	%r8,%r13
 	rorq	$5,%r14
-	xorq	%r8,%r13
 	xorq	%r10,%r15
 
+	movq	%r12,0(%rsp)
+	xorq	%rax,%r14
+	andq	%r8,%r15
+
 	rorq	$4,%r13
 	addq	%r11,%r12
-	xorq	%rax,%r14
+	xorq	%r10,%r15
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%r8,%r15
-	movq	%rbx,%r11
-
 	rorq	$6,%r14
 	xorq	%r8,%r13
-	xorq	%r10,%r15
+	addq	%r15,%r12
 
-	xorq	%rcx,%r11
+	movq	%rax,%r15
+	addq	(%rbp),%r12
 	xorq	%rax,%r14
-	addq	%r15,%r12
-	movq	%rbx,%r15
 
+	xorq	%rbx,%r15
 	rorq	$14,%r13
-	andq	%rax,%r11
-	andq	%rcx,%r15
+	movq	%rbx,%r11
 
+	andq	%r15,%rdi
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%r11
 
+	xorq	%rdi,%r11
 	addq	%r12,%rdx
 	addq	%r12,%r11
-	leaq	1(%rdi),%rdi
+
+	leaq	8(%rbp),%rbp
 	addq	%r14,%r11
-
 	movq	8(%rsi),%r12
 	movq	%rdx,%r13
 	movq	%r11,%r14
 	bswapq	%r12
 	rorq	$23,%r13
-	movq	%r8,%r15
-	movq	%r12,8(%rsp)
+	movq	%r8,%rdi
 
+	xorq	%rdx,%r13
 	rorq	$5,%r14
-	xorq	%rdx,%r13
-	xorq	%r9,%r15
+	xorq	%r9,%rdi
 
+	movq	%r12,8(%rsp)
+	xorq	%r11,%r14
+	andq	%rdx,%rdi
+
 	rorq	$4,%r13
 	addq	%r10,%r12
-	xorq	%r11,%r14
+	xorq	%r9,%rdi
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%rdx,%r15
-	movq	%rax,%r10
-
 	rorq	$6,%r14
 	xorq	%rdx,%r13
-	xorq	%r9,%r15
+	addq	%rdi,%r12
 
-	xorq	%rbx,%r10
+	movq	%r11,%rdi
+	addq	(%rbp),%r12
 	xorq	%r11,%r14
-	addq	%r15,%r12
-	movq	%rax,%r15
 
+	xorq	%rax,%rdi
 	rorq	$14,%r13
-	andq	%r11,%r10
-	andq	%rbx,%r15
+	movq	%rax,%r10
 
+	andq	%rdi,%r15
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%r10
 
+	xorq	%r15,%r10
 	addq	%r12,%rcx
 	addq	%r12,%r10
-	leaq	1(%rdi),%rdi
+
+	leaq	24(%rbp),%rbp
 	addq	%r14,%r10
-
 	movq	16(%rsi),%r12
 	movq	%rcx,%r13
 	movq	%r10,%r14
@@ -129,84 +142,82 @@
 	bswapq	%r12
 	rorq	$23,%r13
 	movq	%rdx,%r15
-	movq	%r12,16(%rsp)
 
+	xorq	%rcx,%r13
 	rorq	$5,%r14
-	xorq	%rcx,%r13
 	xorq	%r8,%r15
 
+	movq	%r12,16(%rsp)
+	xorq	%r10,%r14
+	andq	%rcx,%r15
+
 	rorq	$4,%r13
 	addq	%r9,%r12
-	xorq	%r10,%r14
+	xorq	%r8,%r15
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%rcx,%r15
-	movq	%r11,%r9
-
 	rorq	$6,%r14
 	xorq	%rcx,%r13
-	xorq	%r8,%r15
+	addq	%r15,%r12
 
-	xorq	%rax,%r9
+	movq	%r10,%r15
+	addq	(%rbp),%r12
 	xorq	%r10,%r14
-	addq	%r15,%r12
-	movq	%r11,%r15
 
+	xorq	%r11,%r15
 	rorq	$14,%r13
-	andq	%r10,%r9
-	andq	%rax,%r15
+	movq	%r11,%r9
 
+	andq	%r15,%rdi
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%r9
 
+	xorq	%rdi,%r9
 	addq	%r12,%rbx
 	addq	%r12,%r9
-	leaq	1(%rdi),%rdi
+
+	leaq	8(%rbp),%rbp
 	addq	%r14,%r9
-
 	movq	24(%rsi),%r12
 	movq	%rbx,%r13
 	movq	%r9,%r14
 	bswapq	%r12
 	rorq	$23,%r13
-	movq	%rcx,%r15
-	movq	%r12,24(%rsp)
+	movq	%rcx,%rdi
 
+	xorq	%rbx,%r13
 	rorq	$5,%r14
-	xorq	%rbx,%r13
-	xorq	%rdx,%r15
+	xorq	%rdx,%rdi
 
+	movq	%r12,24(%rsp)
+	xorq	%r9,%r14
+	andq	%rbx,%rdi
+
 	rorq	$4,%r13
 	addq	%r8,%r12
-	xorq	%r9,%r14
+	xorq	%rdx,%rdi
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%rbx,%r15
-	movq	%r10,%r8
-
 	rorq	$6,%r14
 	xorq	%rbx,%r13
-	xorq	%rdx,%r15
+	addq	%rdi,%r12
 
-	xorq	%r11,%r8
+	movq	%r9,%rdi
+	addq	(%rbp),%r12
 	xorq	%r9,%r14
-	addq	%r15,%r12
-	movq	%r10,%r15
 
+	xorq	%r10,%rdi
 	rorq	$14,%r13
-	andq	%r9,%r8
-	andq	%r11,%r15
+	movq	%r10,%r8
 
+	andq	%rdi,%r15
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%r8
 
+	xorq	%r15,%r8
 	addq	%r12,%rax
 	addq	%r12,%r8
-	leaq	1(%rdi),%rdi
+
+	leaq	24(%rbp),%rbp
 	addq	%r14,%r8
-
 	movq	32(%rsi),%r12
 	movq	%rax,%r13
 	movq	%r8,%r14
@@ -213,84 +224,82 @@
 	bswapq	%r12
 	rorq	$23,%r13
 	movq	%rbx,%r15
-	movq	%r12,32(%rsp)
 
+	xorq	%rax,%r13
 	rorq	$5,%r14
-	xorq	%rax,%r13
 	xorq	%rcx,%r15
 
+	movq	%r12,32(%rsp)
+	xorq	%r8,%r14
+	andq	%rax,%r15
+
 	rorq	$4,%r13
 	addq	%rdx,%r12
-	xorq	%r8,%r14
+	xorq	%rcx,%r15
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%rax,%r15
-	movq	%r9,%rdx
-
 	rorq	$6,%r14
 	xorq	%rax,%r13
-	xorq	%rcx,%r15
+	addq	%r15,%r12
 
-	xorq	%r10,%rdx
+	movq	%r8,%r15
+	addq	(%rbp),%r12
 	xorq	%r8,%r14
-	addq	%r15,%r12
-	movq	%r9,%r15
 
+	xorq	%r9,%r15
 	rorq	$14,%r13
-	andq	%r8,%rdx
-	andq	%r10,%r15
+	movq	%r9,%rdx
 
+	andq	%r15,%rdi
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%rdx
 
+	xorq	%rdi,%rdx
 	addq	%r12,%r11
 	addq	%r12,%rdx
-	leaq	1(%rdi),%rdi
+
+	leaq	8(%rbp),%rbp
 	addq	%r14,%rdx
-
 	movq	40(%rsi),%r12
 	movq	%r11,%r13
 	movq	%rdx,%r14
 	bswapq	%r12
 	rorq	$23,%r13
-	movq	%rax,%r15
-	movq	%r12,40(%rsp)
+	movq	%rax,%rdi
 
+	xorq	%r11,%r13
 	rorq	$5,%r14
-	xorq	%r11,%r13
-	xorq	%rbx,%r15
+	xorq	%rbx,%rdi
 
+	movq	%r12,40(%rsp)
+	xorq	%rdx,%r14
+	andq	%r11,%rdi
+
 	rorq	$4,%r13
 	addq	%rcx,%r12
-	xorq	%rdx,%r14
+	xorq	%rbx,%rdi
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%r11,%r15
-	movq	%r8,%rcx
-
 	rorq	$6,%r14
 	xorq	%r11,%r13
-	xorq	%rbx,%r15
+	addq	%rdi,%r12
 
-	xorq	%r9,%rcx
+	movq	%rdx,%rdi
+	addq	(%rbp),%r12
 	xorq	%rdx,%r14
-	addq	%r15,%r12
-	movq	%r8,%r15
 
+	xorq	%r8,%rdi
 	rorq	$14,%r13
-	andq	%rdx,%rcx
-	andq	%r9,%r15
+	movq	%r8,%rcx
 
+	andq	%rdi,%r15
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%rcx
 
+	xorq	%r15,%rcx
 	addq	%r12,%r10
 	addq	%r12,%rcx
-	leaq	1(%rdi),%rdi
+
+	leaq	24(%rbp),%rbp
 	addq	%r14,%rcx
-
 	movq	48(%rsi),%r12
 	movq	%r10,%r13
 	movq	%rcx,%r14
@@ -297,84 +306,82 @@
 	bswapq	%r12
 	rorq	$23,%r13
 	movq	%r11,%r15
-	movq	%r12,48(%rsp)
 
+	xorq	%r10,%r13
 	rorq	$5,%r14
-	xorq	%r10,%r13
 	xorq	%rax,%r15
 
+	movq	%r12,48(%rsp)
+	xorq	%rcx,%r14
+	andq	%r10,%r15
+
 	rorq	$4,%r13
 	addq	%rbx,%r12
-	xorq	%rcx,%r14
+	xorq	%rax,%r15
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%r10,%r15
-	movq	%rdx,%rbx
-
 	rorq	$6,%r14
 	xorq	%r10,%r13
-	xorq	%rax,%r15
+	addq	%r15,%r12
 
-	xorq	%r8,%rbx
+	movq	%rcx,%r15
+	addq	(%rbp),%r12
 	xorq	%rcx,%r14
-	addq	%r15,%r12
-	movq	%rdx,%r15
 
+	xorq	%rdx,%r15
 	rorq	$14,%r13
-	andq	%rcx,%rbx
-	andq	%r8,%r15
+	movq	%rdx,%rbx
 
+	andq	%r15,%rdi
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%rbx
 
+	xorq	%rdi,%rbx
 	addq	%r12,%r9
 	addq	%r12,%rbx
-	leaq	1(%rdi),%rdi
+
+	leaq	8(%rbp),%rbp
 	addq	%r14,%rbx
-
 	movq	56(%rsi),%r12
 	movq	%r9,%r13
 	movq	%rbx,%r14
 	bswapq	%r12
 	rorq	$23,%r13
-	movq	%r10,%r15
-	movq	%r12,56(%rsp)
+	movq	%r10,%rdi
 
+	xorq	%r9,%r13
 	rorq	$5,%r14
-	xorq	%r9,%r13
-	xorq	%r11,%r15
+	xorq	%r11,%rdi
 
+	movq	%r12,56(%rsp)
+	xorq	%rbx,%r14
+	andq	%r9,%rdi
+
 	rorq	$4,%r13
 	addq	%rax,%r12
-	xorq	%rbx,%r14
+	xorq	%r11,%rdi
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%r9,%r15
-	movq	%rcx,%rax
-
 	rorq	$6,%r14
 	xorq	%r9,%r13
-	xorq	%r11,%r15
+	addq	%rdi,%r12
 
-	xorq	%rdx,%rax
+	movq	%rbx,%rdi
+	addq	(%rbp),%r12
 	xorq	%rbx,%r14
-	addq	%r15,%r12
-	movq	%rcx,%r15
 
+	xorq	%rcx,%rdi
 	rorq	$14,%r13
-	andq	%rbx,%rax
-	andq	%rdx,%r15
+	movq	%rcx,%rax
 
+	andq	%rdi,%r15
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%rax
 
+	xorq	%r15,%rax
 	addq	%r12,%r8
 	addq	%r12,%rax
-	leaq	1(%rdi),%rdi
+
+	leaq	24(%rbp),%rbp
 	addq	%r14,%rax
-
 	movq	64(%rsi),%r12
 	movq	%r8,%r13
 	movq	%rax,%r14
@@ -381,84 +388,82 @@
 	bswapq	%r12
 	rorq	$23,%r13
 	movq	%r9,%r15
-	movq	%r12,64(%rsp)
 
+	xorq	%r8,%r13
 	rorq	$5,%r14
-	xorq	%r8,%r13
 	xorq	%r10,%r15
 
+	movq	%r12,64(%rsp)
+	xorq	%rax,%r14
+	andq	%r8,%r15
+
 	rorq	$4,%r13
 	addq	%r11,%r12
-	xorq	%rax,%r14
+	xorq	%r10,%r15
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%r8,%r15
-	movq	%rbx,%r11
-
 	rorq	$6,%r14
 	xorq	%r8,%r13
-	xorq	%r10,%r15
+	addq	%r15,%r12
 
-	xorq	%rcx,%r11
+	movq	%rax,%r15
+	addq	(%rbp),%r12
 	xorq	%rax,%r14
-	addq	%r15,%r12
-	movq	%rbx,%r15
 
+	xorq	%rbx,%r15
 	rorq	$14,%r13
-	andq	%rax,%r11
-	andq	%rcx,%r15
+	movq	%rbx,%r11
 
+	andq	%r15,%rdi
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%r11
 
+	xorq	%rdi,%r11
 	addq	%r12,%rdx
 	addq	%r12,%r11
-	leaq	1(%rdi),%rdi
+
+	leaq	8(%rbp),%rbp
 	addq	%r14,%r11
-
 	movq	72(%rsi),%r12
 	movq	%rdx,%r13
 	movq	%r11,%r14
 	bswapq	%r12
 	rorq	$23,%r13
-	movq	%r8,%r15
-	movq	%r12,72(%rsp)
+	movq	%r8,%rdi
 
+	xorq	%rdx,%r13
 	rorq	$5,%r14
-	xorq	%rdx,%r13
-	xorq	%r9,%r15
+	xorq	%r9,%rdi
 
+	movq	%r12,72(%rsp)
+	xorq	%r11,%r14
+	andq	%rdx,%rdi
+
 	rorq	$4,%r13
 	addq	%r10,%r12
-	xorq	%r11,%r14
+	xorq	%r9,%rdi
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%rdx,%r15
-	movq	%rax,%r10
-
 	rorq	$6,%r14
 	xorq	%rdx,%r13
-	xorq	%r9,%r15
+	addq	%rdi,%r12
 
-	xorq	%rbx,%r10
+	movq	%r11,%rdi
+	addq	(%rbp),%r12
 	xorq	%r11,%r14
-	addq	%r15,%r12
-	movq	%rax,%r15
 
+	xorq	%rax,%rdi
 	rorq	$14,%r13
-	andq	%r11,%r10
-	andq	%rbx,%r15
+	movq	%rax,%r10
 
+	andq	%rdi,%r15
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%r10
 
+	xorq	%r15,%r10
 	addq	%r12,%rcx
 	addq	%r12,%r10
-	leaq	1(%rdi),%rdi
+
+	leaq	24(%rbp),%rbp
 	addq	%r14,%r10
-
 	movq	80(%rsi),%r12
 	movq	%rcx,%r13
 	movq	%r10,%r14
@@ -465,84 +470,82 @@
 	bswapq	%r12
 	rorq	$23,%r13
 	movq	%rdx,%r15
-	movq	%r12,80(%rsp)
 
+	xorq	%rcx,%r13
 	rorq	$5,%r14
-	xorq	%rcx,%r13
 	xorq	%r8,%r15
 
+	movq	%r12,80(%rsp)
+	xorq	%r10,%r14
+	andq	%rcx,%r15
+
 	rorq	$4,%r13
 	addq	%r9,%r12
-	xorq	%r10,%r14
+	xorq	%r8,%r15
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%rcx,%r15
-	movq	%r11,%r9
-
 	rorq	$6,%r14
 	xorq	%rcx,%r13
-	xorq	%r8,%r15
+	addq	%r15,%r12
 
-	xorq	%rax,%r9
+	movq	%r10,%r15
+	addq	(%rbp),%r12
 	xorq	%r10,%r14
-	addq	%r15,%r12
-	movq	%r11,%r15
 
+	xorq	%r11,%r15
 	rorq	$14,%r13
-	andq	%r10,%r9
-	andq	%rax,%r15
+	movq	%r11,%r9
 
+	andq	%r15,%rdi
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%r9
 
+	xorq	%rdi,%r9
 	addq	%r12,%rbx
 	addq	%r12,%r9
-	leaq	1(%rdi),%rdi
+
+	leaq	8(%rbp),%rbp
 	addq	%r14,%r9
-
 	movq	88(%rsi),%r12
 	movq	%rbx,%r13
 	movq	%r9,%r14
 	bswapq	%r12
 	rorq	$23,%r13
-	movq	%rcx,%r15
-	movq	%r12,88(%rsp)
+	movq	%rcx,%rdi
 
+	xorq	%rbx,%r13
 	rorq	$5,%r14
-	xorq	%rbx,%r13
-	xorq	%rdx,%r15
+	xorq	%rdx,%rdi
 
+	movq	%r12,88(%rsp)
+	xorq	%r9,%r14
+	andq	%rbx,%rdi
+
 	rorq	$4,%r13
 	addq	%r8,%r12
-	xorq	%r9,%r14
+	xorq	%rdx,%rdi
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%rbx,%r15
-	movq	%r10,%r8
-
 	rorq	$6,%r14
 	xorq	%rbx,%r13
-	xorq	%rdx,%r15
+	addq	%rdi,%r12
 
-	xorq	%r11,%r8
+	movq	%r9,%rdi
+	addq	(%rbp),%r12
 	xorq	%r9,%r14
-	addq	%r15,%r12
-	movq	%r10,%r15
 
+	xorq	%r10,%rdi
 	rorq	$14,%r13
-	andq	%r9,%r8
-	andq	%r11,%r15
+	movq	%r10,%r8
 
+	andq	%rdi,%r15
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%r8
 
+	xorq	%r15,%r8
 	addq	%r12,%rax
 	addq	%r12,%r8
-	leaq	1(%rdi),%rdi
+
+	leaq	24(%rbp),%rbp
 	addq	%r14,%r8
-
 	movq	96(%rsi),%r12
 	movq	%rax,%r13
 	movq	%r8,%r14
@@ -549,84 +552,82 @@
 	bswapq	%r12
 	rorq	$23,%r13
 	movq	%rbx,%r15
-	movq	%r12,96(%rsp)
 
+	xorq	%rax,%r13
 	rorq	$5,%r14
-	xorq	%rax,%r13
 	xorq	%rcx,%r15
 
+	movq	%r12,96(%rsp)
+	xorq	%r8,%r14
+	andq	%rax,%r15
+
 	rorq	$4,%r13
 	addq	%rdx,%r12
-	xorq	%r8,%r14
+	xorq	%rcx,%r15
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%rax,%r15
-	movq	%r9,%rdx
-
 	rorq	$6,%r14
 	xorq	%rax,%r13
-	xorq	%rcx,%r15
+	addq	%r15,%r12
 
-	xorq	%r10,%rdx
+	movq	%r8,%r15
+	addq	(%rbp),%r12
 	xorq	%r8,%r14
-	addq	%r15,%r12
-	movq	%r9,%r15
 
+	xorq	%r9,%r15
 	rorq	$14,%r13
-	andq	%r8,%rdx
-	andq	%r10,%r15
+	movq	%r9,%rdx
 
+	andq	%r15,%rdi
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%rdx
 
+	xorq	%rdi,%rdx
 	addq	%r12,%r11
 	addq	%r12,%rdx
-	leaq	1(%rdi),%rdi
+
+	leaq	8(%rbp),%rbp
 	addq	%r14,%rdx
-
 	movq	104(%rsi),%r12
 	movq	%r11,%r13
 	movq	%rdx,%r14
 	bswapq	%r12
 	rorq	$23,%r13
-	movq	%rax,%r15
-	movq	%r12,104(%rsp)
+	movq	%rax,%rdi
 
+	xorq	%r11,%r13
 	rorq	$5,%r14
-	xorq	%r11,%r13
-	xorq	%rbx,%r15
+	xorq	%rbx,%rdi
 
+	movq	%r12,104(%rsp)
+	xorq	%rdx,%r14
+	andq	%r11,%rdi
+
 	rorq	$4,%r13
 	addq	%rcx,%r12
-	xorq	%rdx,%r14
+	xorq	%rbx,%rdi
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%r11,%r15
-	movq	%r8,%rcx
-
 	rorq	$6,%r14
 	xorq	%r11,%r13
-	xorq	%rbx,%r15
+	addq	%rdi,%r12
 
-	xorq	%r9,%rcx
+	movq	%rdx,%rdi
+	addq	(%rbp),%r12
 	xorq	%rdx,%r14
-	addq	%r15,%r12
-	movq	%r8,%r15
 
+	xorq	%r8,%rdi
 	rorq	$14,%r13
-	andq	%rdx,%rcx
-	andq	%r9,%r15
+	movq	%r8,%rcx
 
+	andq	%rdi,%r15
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%rcx
 
+	xorq	%r15,%rcx
 	addq	%r12,%r10
 	addq	%r12,%rcx
-	leaq	1(%rdi),%rdi
+
+	leaq	24(%rbp),%rbp
 	addq	%r14,%rcx
-
 	movq	112(%rsi),%r12
 	movq	%r10,%r13
 	movq	%rcx,%r14
@@ -633,1099 +634,1049 @@
 	bswapq	%r12
 	rorq	$23,%r13
 	movq	%r11,%r15
-	movq	%r12,112(%rsp)
 
+	xorq	%r10,%r13
 	rorq	$5,%r14
-	xorq	%r10,%r13
 	xorq	%rax,%r15
 
+	movq	%r12,112(%rsp)
+	xorq	%rcx,%r14
+	andq	%r10,%r15
+
 	rorq	$4,%r13
 	addq	%rbx,%r12
-	xorq	%rcx,%r14
+	xorq	%rax,%r15
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%r10,%r15
-	movq	%rdx,%rbx
-
 	rorq	$6,%r14
 	xorq	%r10,%r13
-	xorq	%rax,%r15
+	addq	%r15,%r12
 
-	xorq	%r8,%rbx
+	movq	%rcx,%r15
+	addq	(%rbp),%r12
 	xorq	%rcx,%r14
-	addq	%r15,%r12
-	movq	%rdx,%r15
 
+	xorq	%rdx,%r15
 	rorq	$14,%r13
-	andq	%rcx,%rbx
-	andq	%r8,%r15
+	movq	%rdx,%rbx
 
+	andq	%r15,%rdi
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%rbx
 
+	xorq	%rdi,%rbx
 	addq	%r12,%r9
 	addq	%r12,%rbx
-	leaq	1(%rdi),%rdi
+
+	leaq	8(%rbp),%rbp
 	addq	%r14,%rbx
-
 	movq	120(%rsi),%r12
 	movq	%r9,%r13
 	movq	%rbx,%r14
 	bswapq	%r12
 	rorq	$23,%r13
-	movq	%r10,%r15
-	movq	%r12,120(%rsp)
+	movq	%r10,%rdi
 
+	xorq	%r9,%r13
 	rorq	$5,%r14
-	xorq	%r9,%r13
-	xorq	%r11,%r15
+	xorq	%r11,%rdi
 
+	movq	%r12,120(%rsp)
+	xorq	%rbx,%r14
+	andq	%r9,%rdi
+
 	rorq	$4,%r13
 	addq	%rax,%r12
-	xorq	%rbx,%r14
+	xorq	%r11,%rdi
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%r9,%r15
-	movq	%rcx,%rax
-
 	rorq	$6,%r14
 	xorq	%r9,%r13
-	xorq	%r11,%r15
+	addq	%rdi,%r12
 
-	xorq	%rdx,%rax
+	movq	%rbx,%rdi
+	addq	(%rbp),%r12
 	xorq	%rbx,%r14
-	addq	%r15,%r12
-	movq	%rcx,%r15
 
+	xorq	%rcx,%rdi
 	rorq	$14,%r13
-	andq	%rbx,%rax
-	andq	%rdx,%r15
+	movq	%rcx,%rax
 
+	andq	%rdi,%r15
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%rax
 
+	xorq	%r15,%rax
 	addq	%r12,%r8
 	addq	%r12,%rax
-	leaq	1(%rdi),%rdi
-	addq	%r14,%rax
 
+	leaq	24(%rbp),%rbp
 	jmp	.Lrounds_16_xx
 .align	16
 .Lrounds_16_xx:
 	movq	8(%rsp),%r13
-	movq	112(%rsp),%r14
+	movq	112(%rsp),%r15
+
 	movq	%r13,%r12
-	movq	%r14,%r15
+	rorq	$7,%r13
+	addq	%r14,%rax
+	movq	%r15,%r14
+	rorq	$42,%r15
 
-	rorq	$7,%r12
-	xorq	%r13,%r12
-	shrq	$7,%r13
-
-	rorq	$1,%r12
 	xorq	%r12,%r13
-	movq	72(%rsp),%r12
-
-	rorq	$42,%r15
+	shrq	$7,%r12
+	rorq	$1,%r13
 	xorq	%r14,%r15
 	shrq	$6,%r14
 
 	rorq	$19,%r15
-	addq	%r13,%r12
-	xorq	%r15,%r14
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	72(%rsp),%r12
 
 	addq	0(%rsp),%r12
 	movq	%r8,%r13
-	addq	%r14,%r12
+	addq	%r15,%r12
 	movq	%rax,%r14
 	rorq	$23,%r13
 	movq	%r9,%r15
-	movq	%r12,0(%rsp)
 
+	xorq	%r8,%r13
 	rorq	$5,%r14
-	xorq	%r8,%r13
 	xorq	%r10,%r15
 
+	movq	%r12,0(%rsp)
+	xorq	%rax,%r14
+	andq	%r8,%r15
+
 	rorq	$4,%r13
 	addq	%r11,%r12
-	xorq	%rax,%r14
+	xorq	%r10,%r15
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%r8,%r15
-	movq	%rbx,%r11
-
 	rorq	$6,%r14
 	xorq	%r8,%r13
-	xorq	%r10,%r15
+	addq	%r15,%r12
 
-	xorq	%rcx,%r11
+	movq	%rax,%r15
+	addq	(%rbp),%r12
 	xorq	%rax,%r14
-	addq	%r15,%r12
-	movq	%rbx,%r15
 
+	xorq	%rbx,%r15
 	rorq	$14,%r13
-	andq	%rax,%r11
-	andq	%rcx,%r15
+	movq	%rbx,%r11
 
+	andq	%r15,%rdi
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%r11
 
+	xorq	%rdi,%r11
 	addq	%r12,%rdx
 	addq	%r12,%r11
-	leaq	1(%rdi),%rdi
-	addq	%r14,%r11
 
+	leaq	8(%rbp),%rbp
 	movq	16(%rsp),%r13
-	movq	120(%rsp),%r14
+	movq	120(%rsp),%rdi
+
 	movq	%r13,%r12
-	movq	%r14,%r15
+	rorq	$7,%r13
+	addq	%r14,%r11
+	movq	%rdi,%r14
+	rorq	$42,%rdi
 
-	rorq	$7,%r12
-	xorq	%r13,%r12
-	shrq	$7,%r13
-
-	rorq	$1,%r12
 	xorq	%r12,%r13
-	movq	80(%rsp),%r12
-
-	rorq	$42,%r15
-	xorq	%r14,%r15
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
 	shrq	$6,%r14
 
-	rorq	$19,%r15
-	addq	%r13,%r12
-	xorq	%r15,%r14
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	80(%rsp),%r12
 
 	addq	8(%rsp),%r12
 	movq	%rdx,%r13
-	addq	%r14,%r12
+	addq	%rdi,%r12
 	movq	%r11,%r14
 	rorq	$23,%r13
-	movq	%r8,%r15
-	movq	%r12,8(%rsp)
+	movq	%r8,%rdi
 
+	xorq	%rdx,%r13
 	rorq	$5,%r14
-	xorq	%rdx,%r13
-	xorq	%r9,%r15
+	xorq	%r9,%rdi
 
+	movq	%r12,8(%rsp)
+	xorq	%r11,%r14
+	andq	%rdx,%rdi
+
 	rorq	$4,%r13
 	addq	%r10,%r12
-	xorq	%r11,%r14
+	xorq	%r9,%rdi
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%rdx,%r15
-	movq	%rax,%r10
-
 	rorq	$6,%r14
 	xorq	%rdx,%r13
-	xorq	%r9,%r15
+	addq	%rdi,%r12
 
-	xorq	%rbx,%r10
+	movq	%r11,%rdi
+	addq	(%rbp),%r12
 	xorq	%r11,%r14
-	addq	%r15,%r12
-	movq	%rax,%r15
 
+	xorq	%rax,%rdi
 	rorq	$14,%r13
-	andq	%r11,%r10
-	andq	%rbx,%r15
+	movq	%rax,%r10
 
+	andq	%rdi,%r15
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%r10
 
+	xorq	%r15,%r10
 	addq	%r12,%rcx
 	addq	%r12,%r10
-	leaq	1(%rdi),%rdi
-	addq	%r14,%r10
 
+	leaq	24(%rbp),%rbp
 	movq	24(%rsp),%r13
-	movq	0(%rsp),%r14
+	movq	0(%rsp),%r15
+
 	movq	%r13,%r12
-	movq	%r14,%r15
+	rorq	$7,%r13
+	addq	%r14,%r10
+	movq	%r15,%r14
+	rorq	$42,%r15
 
-	rorq	$7,%r12
-	xorq	%r13,%r12
-	shrq	$7,%r13
-
-	rorq	$1,%r12
 	xorq	%r12,%r13
-	movq	88(%rsp),%r12
-
-	rorq	$42,%r15
+	shrq	$7,%r12
+	rorq	$1,%r13
 	xorq	%r14,%r15
 	shrq	$6,%r14
 
 	rorq	$19,%r15
-	addq	%r13,%r12
-	xorq	%r15,%r14
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	88(%rsp),%r12
 
 	addq	16(%rsp),%r12
 	movq	%rcx,%r13
-	addq	%r14,%r12
+	addq	%r15,%r12
 	movq	%r10,%r14
 	rorq	$23,%r13
 	movq	%rdx,%r15
-	movq	%r12,16(%rsp)
 
+	xorq	%rcx,%r13
 	rorq	$5,%r14
-	xorq	%rcx,%r13
 	xorq	%r8,%r15
 
+	movq	%r12,16(%rsp)
+	xorq	%r10,%r14
+	andq	%rcx,%r15
+
 	rorq	$4,%r13
 	addq	%r9,%r12
-	xorq	%r10,%r14
+	xorq	%r8,%r15
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%rcx,%r15
-	movq	%r11,%r9
-
 	rorq	$6,%r14
 	xorq	%rcx,%r13
-	xorq	%r8,%r15
+	addq	%r15,%r12
 
-	xorq	%rax,%r9
+	movq	%r10,%r15
+	addq	(%rbp),%r12
 	xorq	%r10,%r14
-	addq	%r15,%r12
-	movq	%r11,%r15
 
+	xorq	%r11,%r15
 	rorq	$14,%r13
-	andq	%r10,%r9
-	andq	%rax,%r15
+	movq	%r11,%r9
 
+	andq	%r15,%rdi
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%r9
 
+	xorq	%rdi,%r9
 	addq	%r12,%rbx
 	addq	%r12,%r9
-	leaq	1(%rdi),%rdi
-	addq	%r14,%r9
 
+	leaq	8(%rbp),%rbp
 	movq	32(%rsp),%r13
-	movq	8(%rsp),%r14
+	movq	8(%rsp),%rdi
+
 	movq	%r13,%r12
-	movq	%r14,%r15
+	rorq	$7,%r13
+	addq	%r14,%r9
+	movq	%rdi,%r14
+	rorq	$42,%rdi
 
-	rorq	$7,%r12
-	xorq	%r13,%r12
-	shrq	$7,%r13
-
-	rorq	$1,%r12
 	xorq	%r12,%r13
-	movq	96(%rsp),%r12
-
-	rorq	$42,%r15
-	xorq	%r14,%r15
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
 	shrq	$6,%r14
 
-	rorq	$19,%r15
-	addq	%r13,%r12
-	xorq	%r15,%r14
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	96(%rsp),%r12
 
 	addq	24(%rsp),%r12
 	movq	%rbx,%r13
-	addq	%r14,%r12
+	addq	%rdi,%r12
 	movq	%r9,%r14
 	rorq	$23,%r13
-	movq	%rcx,%r15
-	movq	%r12,24(%rsp)
+	movq	%rcx,%rdi
 
+	xorq	%rbx,%r13
 	rorq	$5,%r14
-	xorq	%rbx,%r13
-	xorq	%rdx,%r15
+	xorq	%rdx,%rdi
 
+	movq	%r12,24(%rsp)
+	xorq	%r9,%r14
+	andq	%rbx,%rdi
+
 	rorq	$4,%r13
 	addq	%r8,%r12
-	xorq	%r9,%r14
+	xorq	%rdx,%rdi
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%rbx,%r15
-	movq	%r10,%r8
-
 	rorq	$6,%r14
 	xorq	%rbx,%r13
-	xorq	%rdx,%r15
+	addq	%rdi,%r12
 
-	xorq	%r11,%r8
+	movq	%r9,%rdi
+	addq	(%rbp),%r12
 	xorq	%r9,%r14
-	addq	%r15,%r12
-	movq	%r10,%r15
 
+	xorq	%r10,%rdi
 	rorq	$14,%r13
-	andq	%r9,%r8
-	andq	%r11,%r15
+	movq	%r10,%r8
 
+	andq	%rdi,%r15
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%r8
 
+	xorq	%r15,%r8
 	addq	%r12,%rax
 	addq	%r12,%r8
-	leaq	1(%rdi),%rdi
-	addq	%r14,%r8
 
+	leaq	24(%rbp),%rbp
 	movq	40(%rsp),%r13
-	movq	16(%rsp),%r14
+	movq	16(%rsp),%r15
+
 	movq	%r13,%r12
-	movq	%r14,%r15
+	rorq	$7,%r13
+	addq	%r14,%r8
+	movq	%r15,%r14
+	rorq	$42,%r15
 
-	rorq	$7,%r12
-	xorq	%r13,%r12
-	shrq	$7,%r13
-
-	rorq	$1,%r12
 	xorq	%r12,%r13
-	movq	104(%rsp),%r12
-
-	rorq	$42,%r15
+	shrq	$7,%r12
+	rorq	$1,%r13
 	xorq	%r14,%r15
 	shrq	$6,%r14
 
 	rorq	$19,%r15
-	addq	%r13,%r12
-	xorq	%r15,%r14
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	104(%rsp),%r12
 
 	addq	32(%rsp),%r12
 	movq	%rax,%r13
-	addq	%r14,%r12
+	addq	%r15,%r12
 	movq	%r8,%r14
 	rorq	$23,%r13
 	movq	%rbx,%r15
-	movq	%r12,32(%rsp)
 
+	xorq	%rax,%r13
 	rorq	$5,%r14
-	xorq	%rax,%r13
 	xorq	%rcx,%r15
 
+	movq	%r12,32(%rsp)
+	xorq	%r8,%r14
+	andq	%rax,%r15
+
 	rorq	$4,%r13
 	addq	%rdx,%r12
-	xorq	%r8,%r14
+	xorq	%rcx,%r15
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%rax,%r15
-	movq	%r9,%rdx
-
 	rorq	$6,%r14
 	xorq	%rax,%r13
-	xorq	%rcx,%r15
+	addq	%r15,%r12
 
-	xorq	%r10,%rdx
+	movq	%r8,%r15
+	addq	(%rbp),%r12
 	xorq	%r8,%r14
-	addq	%r15,%r12
-	movq	%r9,%r15
 
+	xorq	%r9,%r15
 	rorq	$14,%r13
-	andq	%r8,%rdx
-	andq	%r10,%r15
+	movq	%r9,%rdx
 
+	andq	%r15,%rdi
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%rdx
 
+	xorq	%rdi,%rdx
 	addq	%r12,%r11
 	addq	%r12,%rdx
-	leaq	1(%rdi),%rdi
-	addq	%r14,%rdx
 
+	leaq	8(%rbp),%rbp
 	movq	48(%rsp),%r13
-	movq	24(%rsp),%r14
+	movq	24(%rsp),%rdi
+
 	movq	%r13,%r12
-	movq	%r14,%r15
+	rorq	$7,%r13
+	addq	%r14,%rdx
+	movq	%rdi,%r14
+	rorq	$42,%rdi
 
-	rorq	$7,%r12
-	xorq	%r13,%r12
-	shrq	$7,%r13
-
-	rorq	$1,%r12
 	xorq	%r12,%r13
-	movq	112(%rsp),%r12
-
-	rorq	$42,%r15
-	xorq	%r14,%r15
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
 	shrq	$6,%r14
 
-	rorq	$19,%r15
-	addq	%r13,%r12
-	xorq	%r15,%r14
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	112(%rsp),%r12
 
 	addq	40(%rsp),%r12
 	movq	%r11,%r13
-	addq	%r14,%r12
+	addq	%rdi,%r12
 	movq	%rdx,%r14
 	rorq	$23,%r13
-	movq	%rax,%r15
-	movq	%r12,40(%rsp)
+	movq	%rax,%rdi
 
+	xorq	%r11,%r13
 	rorq	$5,%r14
-	xorq	%r11,%r13
-	xorq	%rbx,%r15
+	xorq	%rbx,%rdi
 
+	movq	%r12,40(%rsp)
+	xorq	%rdx,%r14
+	andq	%r11,%rdi
+
 	rorq	$4,%r13
 	addq	%rcx,%r12
-	xorq	%rdx,%r14
+	xorq	%rbx,%rdi
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%r11,%r15
-	movq	%r8,%rcx
-
 	rorq	$6,%r14
 	xorq	%r11,%r13
-	xorq	%rbx,%r15
+	addq	%rdi,%r12
 
-	xorq	%r9,%rcx
+	movq	%rdx,%rdi
+	addq	(%rbp),%r12
 	xorq	%rdx,%r14
-	addq	%r15,%r12
-	movq	%r8,%r15
 
+	xorq	%r8,%rdi
 	rorq	$14,%r13
-	andq	%rdx,%rcx
-	andq	%r9,%r15
+	movq	%r8,%rcx
 
+	andq	%rdi,%r15
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%rcx
 
+	xorq	%r15,%rcx
 	addq	%r12,%r10
 	addq	%r12,%rcx
-	leaq	1(%rdi),%rdi
-	addq	%r14,%rcx
 
+	leaq	24(%rbp),%rbp
 	movq	56(%rsp),%r13
-	movq	32(%rsp),%r14
+	movq	32(%rsp),%r15
+
 	movq	%r13,%r12
-	movq	%r14,%r15
+	rorq	$7,%r13
+	addq	%r14,%rcx
+	movq	%r15,%r14
+	rorq	$42,%r15
 
-	rorq	$7,%r12
-	xorq	%r13,%r12
-	shrq	$7,%r13
-
-	rorq	$1,%r12
 	xorq	%r12,%r13
-	movq	120(%rsp),%r12
-
-	rorq	$42,%r15
+	shrq	$7,%r12
+	rorq	$1,%r13
 	xorq	%r14,%r15
 	shrq	$6,%r14
 
 	rorq	$19,%r15
-	addq	%r13,%r12
-	xorq	%r15,%r14
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	120(%rsp),%r12
 
 	addq	48(%rsp),%r12
 	movq	%r10,%r13
-	addq	%r14,%r12
+	addq	%r15,%r12
 	movq	%rcx,%r14
 	rorq	$23,%r13
 	movq	%r11,%r15
-	movq	%r12,48(%rsp)
 
+	xorq	%r10,%r13
 	rorq	$5,%r14
-	xorq	%r10,%r13
 	xorq	%rax,%r15
 
+	movq	%r12,48(%rsp)
+	xorq	%rcx,%r14
+	andq	%r10,%r15
+
 	rorq	$4,%r13
 	addq	%rbx,%r12
-	xorq	%rcx,%r14
+	xorq	%rax,%r15
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%r10,%r15
-	movq	%rdx,%rbx
-
 	rorq	$6,%r14
 	xorq	%r10,%r13
-	xorq	%rax,%r15
+	addq	%r15,%r12
 
-	xorq	%r8,%rbx
+	movq	%rcx,%r15
+	addq	(%rbp),%r12
 	xorq	%rcx,%r14
-	addq	%r15,%r12
-	movq	%rdx,%r15
 
+	xorq	%rdx,%r15
 	rorq	$14,%r13
-	andq	%rcx,%rbx
-	andq	%r8,%r15
+	movq	%rdx,%rbx
 
+	andq	%r15,%rdi
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%rbx
 
+	xorq	%rdi,%rbx
 	addq	%r12,%r9
 	addq	%r12,%rbx
-	leaq	1(%rdi),%rdi
-	addq	%r14,%rbx
 
+	leaq	8(%rbp),%rbp
 	movq	64(%rsp),%r13
-	movq	40(%rsp),%r14
+	movq	40(%rsp),%rdi
+
 	movq	%r13,%r12
-	movq	%r14,%r15
+	rorq	$7,%r13
+	addq	%r14,%rbx
+	movq	%rdi,%r14
+	rorq	$42,%rdi
 
-	rorq	$7,%r12
-	xorq	%r13,%r12
-	shrq	$7,%r13
-
-	rorq	$1,%r12
 	xorq	%r12,%r13
-	movq	0(%rsp),%r12
-
-	rorq	$42,%r15
-	xorq	%r14,%r15
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
 	shrq	$6,%r14
 
-	rorq	$19,%r15
-	addq	%r13,%r12
-	xorq	%r15,%r14
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	0(%rsp),%r12
 
 	addq	56(%rsp),%r12
 	movq	%r9,%r13
-	addq	%r14,%r12
+	addq	%rdi,%r12
 	movq	%rbx,%r14
 	rorq	$23,%r13
-	movq	%r10,%r15
-	movq	%r12,56(%rsp)
+	movq	%r10,%rdi
 
+	xorq	%r9,%r13
 	rorq	$5,%r14
-	xorq	%r9,%r13
-	xorq	%r11,%r15
+	xorq	%r11,%rdi
 
+	movq	%r12,56(%rsp)
+	xorq	%rbx,%r14
+	andq	%r9,%rdi
+
 	rorq	$4,%r13
 	addq	%rax,%r12
-	xorq	%rbx,%r14
+	xorq	%r11,%rdi
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%r9,%r15
-	movq	%rcx,%rax
-
 	rorq	$6,%r14
 	xorq	%r9,%r13
-	xorq	%r11,%r15
+	addq	%rdi,%r12
 
-	xorq	%rdx,%rax
+	movq	%rbx,%rdi
+	addq	(%rbp),%r12
 	xorq	%rbx,%r14
-	addq	%r15,%r12
-	movq	%rcx,%r15
 
+	xorq	%rcx,%rdi
 	rorq	$14,%r13
-	andq	%rbx,%rax
-	andq	%rdx,%r15
+	movq	%rcx,%rax
 
+	andq	%rdi,%r15
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%rax
 
+	xorq	%r15,%rax
 	addq	%r12,%r8
 	addq	%r12,%rax
-	leaq	1(%rdi),%rdi
-	addq	%r14,%rax
 
+	leaq	24(%rbp),%rbp
 	movq	72(%rsp),%r13
-	movq	48(%rsp),%r14
+	movq	48(%rsp),%r15
+
 	movq	%r13,%r12
-	movq	%r14,%r15
+	rorq	$7,%r13
+	addq	%r14,%rax
+	movq	%r15,%r14
+	rorq	$42,%r15
 
-	rorq	$7,%r12
-	xorq	%r13,%r12
-	shrq	$7,%r13
-
-	rorq	$1,%r12
 	xorq	%r12,%r13
-	movq	8(%rsp),%r12
-
-	rorq	$42,%r15
+	shrq	$7,%r12
+	rorq	$1,%r13
 	xorq	%r14,%r15
 	shrq	$6,%r14
 
 	rorq	$19,%r15
-	addq	%r13,%r12
-	xorq	%r15,%r14
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	8(%rsp),%r12
 
 	addq	64(%rsp),%r12
 	movq	%r8,%r13
-	addq	%r14,%r12
+	addq	%r15,%r12
 	movq	%rax,%r14
 	rorq	$23,%r13
 	movq	%r9,%r15
-	movq	%r12,64(%rsp)
 
+	xorq	%r8,%r13
 	rorq	$5,%r14
-	xorq	%r8,%r13
 	xorq	%r10,%r15
 
+	movq	%r12,64(%rsp)
+	xorq	%rax,%r14
+	andq	%r8,%r15
+
 	rorq	$4,%r13
 	addq	%r11,%r12
-	xorq	%rax,%r14
+	xorq	%r10,%r15
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%r8,%r15
-	movq	%rbx,%r11
-
 	rorq	$6,%r14
 	xorq	%r8,%r13
-	xorq	%r10,%r15
+	addq	%r15,%r12
 
-	xorq	%rcx,%r11
+	movq	%rax,%r15
+	addq	(%rbp),%r12
 	xorq	%rax,%r14
-	addq	%r15,%r12
-	movq	%rbx,%r15
 
+	xorq	%rbx,%r15
 	rorq	$14,%r13
-	andq	%rax,%r11
-	andq	%rcx,%r15
+	movq	%rbx,%r11
 
+	andq	%r15,%rdi
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%r11
 
+	xorq	%rdi,%r11
 	addq	%r12,%rdx
 	addq	%r12,%r11
-	leaq	1(%rdi),%rdi
-	addq	%r14,%r11
 
+	leaq	8(%rbp),%rbp
 	movq	80(%rsp),%r13
-	movq	56(%rsp),%r14
+	movq	56(%rsp),%rdi
+
 	movq	%r13,%r12
-	movq	%r14,%r15
+	rorq	$7,%r13
+	addq	%r14,%r11
+	movq	%rdi,%r14
+	rorq	$42,%rdi
 
-	rorq	$7,%r12
-	xorq	%r13,%r12
-	shrq	$7,%r13
-
-	rorq	$1,%r12
 	xorq	%r12,%r13
-	movq	16(%rsp),%r12
-
-	rorq	$42,%r15
-	xorq	%r14,%r15
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
 	shrq	$6,%r14
 
-	rorq	$19,%r15
-	addq	%r13,%r12
-	xorq	%r15,%r14
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	16(%rsp),%r12
 
 	addq	72(%rsp),%r12
 	movq	%rdx,%r13
-	addq	%r14,%r12
+	addq	%rdi,%r12
 	movq	%r11,%r14
 	rorq	$23,%r13
-	movq	%r8,%r15
-	movq	%r12,72(%rsp)
+	movq	%r8,%rdi
 
+	xorq	%rdx,%r13
 	rorq	$5,%r14
-	xorq	%rdx,%r13
-	xorq	%r9,%r15
+	xorq	%r9,%rdi
 
+	movq	%r12,72(%rsp)
+	xorq	%r11,%r14
+	andq	%rdx,%rdi
+
 	rorq	$4,%r13
 	addq	%r10,%r12
-	xorq	%r11,%r14
+	xorq	%r9,%rdi
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%rdx,%r15
-	movq	%rax,%r10
-
 	rorq	$6,%r14
 	xorq	%rdx,%r13
-	xorq	%r9,%r15
+	addq	%rdi,%r12
 
-	xorq	%rbx,%r10
+	movq	%r11,%rdi
+	addq	(%rbp),%r12
 	xorq	%r11,%r14
-	addq	%r15,%r12
-	movq	%rax,%r15
 
+	xorq	%rax,%rdi
 	rorq	$14,%r13
-	andq	%r11,%r10
-	andq	%rbx,%r15
+	movq	%rax,%r10
 
+	andq	%rdi,%r15
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%r10
 
+	xorq	%r15,%r10
 	addq	%r12,%rcx
 	addq	%r12,%r10
-	leaq	1(%rdi),%rdi
-	addq	%r14,%r10
 
+	leaq	24(%rbp),%rbp
 	movq	88(%rsp),%r13
-	movq	64(%rsp),%r14
+	movq	64(%rsp),%r15
+
 	movq	%r13,%r12
-	movq	%r14,%r15
+	rorq	$7,%r13
+	addq	%r14,%r10
+	movq	%r15,%r14
+	rorq	$42,%r15
 
-	rorq	$7,%r12
-	xorq	%r13,%r12
-	shrq	$7,%r13
-
-	rorq	$1,%r12
 	xorq	%r12,%r13
-	movq	24(%rsp),%r12
-
-	rorq	$42,%r15
+	shrq	$7,%r12
+	rorq	$1,%r13
 	xorq	%r14,%r15
 	shrq	$6,%r14
 
 	rorq	$19,%r15
-	addq	%r13,%r12
-	xorq	%r15,%r14
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	24(%rsp),%r12
 
 	addq	80(%rsp),%r12
 	movq	%rcx,%r13
-	addq	%r14,%r12
+	addq	%r15,%r12
 	movq	%r10,%r14
 	rorq	$23,%r13
 	movq	%rdx,%r15
-	movq	%r12,80(%rsp)
 
+	xorq	%rcx,%r13
 	rorq	$5,%r14
-	xorq	%rcx,%r13
 	xorq	%r8,%r15
 
+	movq	%r12,80(%rsp)
+	xorq	%r10,%r14
+	andq	%rcx,%r15
+
 	rorq	$4,%r13
 	addq	%r9,%r12
-	xorq	%r10,%r14
+	xorq	%r8,%r15
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%rcx,%r15
-	movq	%r11,%r9
-
 	rorq	$6,%r14
 	xorq	%rcx,%r13
-	xorq	%r8,%r15
+	addq	%r15,%r12
 
-	xorq	%rax,%r9
+	movq	%r10,%r15
+	addq	(%rbp),%r12
 	xorq	%r10,%r14
-	addq	%r15,%r12
-	movq	%r11,%r15
 
+	xorq	%r11,%r15
 	rorq	$14,%r13
-	andq	%r10,%r9
-	andq	%rax,%r15
+	movq	%r11,%r9
 
+	andq	%r15,%rdi
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%r9
 
+	xorq	%rdi,%r9
 	addq	%r12,%rbx
 	addq	%r12,%r9
-	leaq	1(%rdi),%rdi
-	addq	%r14,%r9
 
+	leaq	8(%rbp),%rbp
 	movq	96(%rsp),%r13
-	movq	72(%rsp),%r14
+	movq	72(%rsp),%rdi
+
 	movq	%r13,%r12
-	movq	%r14,%r15
+	rorq	$7,%r13
+	addq	%r14,%r9
+	movq	%rdi,%r14
+	rorq	$42,%rdi
 
-	rorq	$7,%r12
-	xorq	%r13,%r12
-	shrq	$7,%r13
-
-	rorq	$1,%r12
 	xorq	%r12,%r13
-	movq	32(%rsp),%r12
-
-	rorq	$42,%r15
-	xorq	%r14,%r15
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
 	shrq	$6,%r14
 
-	rorq	$19,%r15
-	addq	%r13,%r12
-	xorq	%r15,%r14
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	32(%rsp),%r12
 
 	addq	88(%rsp),%r12
 	movq	%rbx,%r13
-	addq	%r14,%r12
+	addq	%rdi,%r12
 	movq	%r9,%r14
 	rorq	$23,%r13
-	movq	%rcx,%r15
-	movq	%r12,88(%rsp)
+	movq	%rcx,%rdi
 
+	xorq	%rbx,%r13
 	rorq	$5,%r14
-	xorq	%rbx,%r13
-	xorq	%rdx,%r15
+	xorq	%rdx,%rdi
 
+	movq	%r12,88(%rsp)
+	xorq	%r9,%r14
+	andq	%rbx,%rdi
+
 	rorq	$4,%r13
 	addq	%r8,%r12
-	xorq	%r9,%r14
+	xorq	%rdx,%rdi
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%rbx,%r15
-	movq	%r10,%r8
-
 	rorq	$6,%r14
 	xorq	%rbx,%r13
-	xorq	%rdx,%r15
+	addq	%rdi,%r12
 
-	xorq	%r11,%r8
+	movq	%r9,%rdi
+	addq	(%rbp),%r12
 	xorq	%r9,%r14
-	addq	%r15,%r12
-	movq	%r10,%r15
 
+	xorq	%r10,%rdi
 	rorq	$14,%r13
-	andq	%r9,%r8
-	andq	%r11,%r15
+	movq	%r10,%r8
 
+	andq	%rdi,%r15
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%r8
 
+	xorq	%r15,%r8
 	addq	%r12,%rax
 	addq	%r12,%r8
-	leaq	1(%rdi),%rdi
-	addq	%r14,%r8
 
+	leaq	24(%rbp),%rbp
 	movq	104(%rsp),%r13
-	movq	80(%rsp),%r14
+	movq	80(%rsp),%r15
+
 	movq	%r13,%r12
-	movq	%r14,%r15
+	rorq	$7,%r13
+	addq	%r14,%r8
+	movq	%r15,%r14
+	rorq	$42,%r15
 
-	rorq	$7,%r12
-	xorq	%r13,%r12
-	shrq	$7,%r13
-
-	rorq	$1,%r12
 	xorq	%r12,%r13
-	movq	40(%rsp),%r12
-
-	rorq	$42,%r15
+	shrq	$7,%r12
+	rorq	$1,%r13
 	xorq	%r14,%r15
 	shrq	$6,%r14
 
 	rorq	$19,%r15
-	addq	%r13,%r12
-	xorq	%r15,%r14
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	40(%rsp),%r12
 
 	addq	96(%rsp),%r12
 	movq	%rax,%r13
-	addq	%r14,%r12
+	addq	%r15,%r12
 	movq	%r8,%r14
 	rorq	$23,%r13
 	movq	%rbx,%r15
-	movq	%r12,96(%rsp)
 
+	xorq	%rax,%r13
 	rorq	$5,%r14
-	xorq	%rax,%r13
 	xorq	%rcx,%r15
 
+	movq	%r12,96(%rsp)
+	xorq	%r8,%r14
+	andq	%rax,%r15
+
 	rorq	$4,%r13
 	addq	%rdx,%r12
-	xorq	%r8,%r14
+	xorq	%rcx,%r15
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%rax,%r15
-	movq	%r9,%rdx
-
 	rorq	$6,%r14
 	xorq	%rax,%r13
-	xorq	%rcx,%r15
+	addq	%r15,%r12
 
-	xorq	%r10,%rdx
+	movq	%r8,%r15
+	addq	(%rbp),%r12
 	xorq	%r8,%r14
-	addq	%r15,%r12
-	movq	%r9,%r15
 
+	xorq	%r9,%r15
 	rorq	$14,%r13
-	andq	%r8,%rdx
-	andq	%r10,%r15
+	movq	%r9,%rdx
 
+	andq	%r15,%rdi
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%rdx
 
+	xorq	%rdi,%rdx
 	addq	%r12,%r11
 	addq	%r12,%rdx
-	leaq	1(%rdi),%rdi
-	addq	%r14,%rdx
 
+	leaq	8(%rbp),%rbp
 	movq	112(%rsp),%r13
-	movq	88(%rsp),%r14
+	movq	88(%rsp),%rdi
+
 	movq	%r13,%r12
-	movq	%r14,%r15
+	rorq	$7,%r13
+	addq	%r14,%rdx
+	movq	%rdi,%r14
+	rorq	$42,%rdi
 
-	rorq	$7,%r12
-	xorq	%r13,%r12
-	shrq	$7,%r13
-
-	rorq	$1,%r12
 	xorq	%r12,%r13
-	movq	48(%rsp),%r12
-
-	rorq	$42,%r15
-	xorq	%r14,%r15
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
 	shrq	$6,%r14
 
-	rorq	$19,%r15
-	addq	%r13,%r12
-	xorq	%r15,%r14
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	48(%rsp),%r12
 
 	addq	104(%rsp),%r12
 	movq	%r11,%r13
-	addq	%r14,%r12
+	addq	%rdi,%r12
 	movq	%rdx,%r14
 	rorq	$23,%r13
-	movq	%rax,%r15
-	movq	%r12,104(%rsp)
+	movq	%rax,%rdi
 
+	xorq	%r11,%r13
 	rorq	$5,%r14
-	xorq	%r11,%r13
-	xorq	%rbx,%r15
+	xorq	%rbx,%rdi
 
+	movq	%r12,104(%rsp)
+	xorq	%rdx,%r14
+	andq	%r11,%rdi
+
 	rorq	$4,%r13
 	addq	%rcx,%r12
-	xorq	%rdx,%r14
+	xorq	%rbx,%rdi
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%r11,%r15
-	movq	%r8,%rcx
-
 	rorq	$6,%r14
 	xorq	%r11,%r13
-	xorq	%rbx,%r15
+	addq	%rdi,%r12
 
-	xorq	%r9,%rcx
+	movq	%rdx,%rdi
+	addq	(%rbp),%r12
 	xorq	%rdx,%r14
-	addq	%r15,%r12
-	movq	%r8,%r15
 
+	xorq	%r8,%rdi
 	rorq	$14,%r13
-	andq	%rdx,%rcx
-	andq	%r9,%r15
+	movq	%r8,%rcx
 
+	andq	%rdi,%r15
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%rcx
 
+	xorq	%r15,%rcx
 	addq	%r12,%r10
 	addq	%r12,%rcx
-	leaq	1(%rdi),%rdi
-	addq	%r14,%rcx
 
+	leaq	24(%rbp),%rbp
 	movq	120(%rsp),%r13
-	movq	96(%rsp),%r14
+	movq	96(%rsp),%r15
+
 	movq	%r13,%r12
-	movq	%r14,%r15
+	rorq	$7,%r13
+	addq	%r14,%rcx
+	movq	%r15,%r14
+	rorq	$42,%r15
 
-	rorq	$7,%r12
-	xorq	%r13,%r12
-	shrq	$7,%r13
-
-	rorq	$1,%r12
 	xorq	%r12,%r13
-	movq	56(%rsp),%r12
-
-	rorq	$42,%r15
+	shrq	$7,%r12
+	rorq	$1,%r13
 	xorq	%r14,%r15
 	shrq	$6,%r14
 
 	rorq	$19,%r15
-	addq	%r13,%r12
-	xorq	%r15,%r14
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	56(%rsp),%r12
 
 	addq	112(%rsp),%r12
 	movq	%r10,%r13
-	addq	%r14,%r12
+	addq	%r15,%r12
 	movq	%rcx,%r14
 	rorq	$23,%r13
 	movq	%r11,%r15
-	movq	%r12,112(%rsp)
 
+	xorq	%r10,%r13
 	rorq	$5,%r14
-	xorq	%r10,%r13
 	xorq	%rax,%r15
 
+	movq	%r12,112(%rsp)
+	xorq	%rcx,%r14
+	andq	%r10,%r15
+
 	rorq	$4,%r13
 	addq	%rbx,%r12
-	xorq	%rcx,%r14
+	xorq	%rax,%r15
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%r10,%r15
-	movq	%rdx,%rbx
-
 	rorq	$6,%r14
 	xorq	%r10,%r13
-	xorq	%rax,%r15
+	addq	%r15,%r12
 
-	xorq	%r8,%rbx
+	movq	%rcx,%r15
+	addq	(%rbp),%r12
 	xorq	%rcx,%r14
-	addq	%r15,%r12
-	movq	%rdx,%r15
 
+	xorq	%rdx,%r15
 	rorq	$14,%r13
-	andq	%rcx,%rbx
-	andq	%r8,%r15
+	movq	%rdx,%rbx
 
+	andq	%r15,%rdi
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%rbx
 
+	xorq	%rdi,%rbx
 	addq	%r12,%r9
 	addq	%r12,%rbx
-	leaq	1(%rdi),%rdi
-	addq	%r14,%rbx
 
+	leaq	8(%rbp),%rbp
 	movq	0(%rsp),%r13
-	movq	104(%rsp),%r14
+	movq	104(%rsp),%rdi
+
 	movq	%r13,%r12
-	movq	%r14,%r15
+	rorq	$7,%r13
+	addq	%r14,%rbx
+	movq	%rdi,%r14
+	rorq	$42,%rdi
 
-	rorq	$7,%r12
-	xorq	%r13,%r12
-	shrq	$7,%r13
-
-	rorq	$1,%r12
 	xorq	%r12,%r13
-	movq	64(%rsp),%r12
-
-	rorq	$42,%r15
-	xorq	%r14,%r15
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
 	shrq	$6,%r14
 
-	rorq	$19,%r15
-	addq	%r13,%r12
-	xorq	%r15,%r14
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	64(%rsp),%r12
 
 	addq	120(%rsp),%r12
 	movq	%r9,%r13
-	addq	%r14,%r12
+	addq	%rdi,%r12
 	movq	%rbx,%r14
 	rorq	$23,%r13
-	movq	%r10,%r15
-	movq	%r12,120(%rsp)
+	movq	%r10,%rdi
 
+	xorq	%r9,%r13
 	rorq	$5,%r14
-	xorq	%r9,%r13
-	xorq	%r11,%r15
+	xorq	%r11,%rdi
 
+	movq	%r12,120(%rsp)
+	xorq	%rbx,%r14
+	andq	%r9,%rdi
+
 	rorq	$4,%r13
 	addq	%rax,%r12
-	xorq	%rbx,%r14
+	xorq	%r11,%rdi
 
-	addq	(%rbp,%rdi,8),%r12
-	andq	%r9,%r15
-	movq	%rcx,%rax
-
 	rorq	$6,%r14
 	xorq	%r9,%r13
-	xorq	%r11,%r15
+	addq	%rdi,%r12
 
-	xorq	%rdx,%rax
+	movq	%rbx,%rdi
+	addq	(%rbp),%r12
 	xorq	%rbx,%r14
-	addq	%r15,%r12
-	movq	%rcx,%r15
 
+	xorq	%rcx,%rdi
 	rorq	$14,%r13
-	andq	%rbx,%rax
-	andq	%rdx,%r15
+	movq	%rcx,%rax
 
+	andq	%rdi,%r15
 	rorq	$28,%r14
 	addq	%r13,%r12
-	addq	%r15,%rax
 
+	xorq	%r15,%rax
 	addq	%r12,%r8
 	addq	%r12,%rax
-	leaq	1(%rdi),%rdi
-	addq	%r14,%rax
 
-	cmpq	$80,%rdi
-	jb	.Lrounds_16_xx
+	leaq	24(%rbp),%rbp
+	cmpb	$0,7(%rbp)
+	jnz	.Lrounds_16_xx
 
 	movq	128+0(%rsp),%rdi
+	addq	%r14,%rax
 	leaq	128(%rsi),%rsi
 
 	addq	0(%rdi),%rax
@@ -1764,42 +1715,3654 @@
 .type	K512, at object
 K512:
 .quad	0x428a2f98d728ae22,0x7137449123ef65cd
+.quad	0x428a2f98d728ae22,0x7137449123ef65cd
 .quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
 .quad	0x3956c25bf348b538,0x59f111f1b605d019
+.quad	0x3956c25bf348b538,0x59f111f1b605d019
 .quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
 .quad	0xd807aa98a3030242,0x12835b0145706fbe
+.quad	0xd807aa98a3030242,0x12835b0145706fbe
 .quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
 .quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
 .quad	0x9bdc06a725c71235,0xc19bf174cf692694
+.quad	0x9bdc06a725c71235,0xc19bf174cf692694
 .quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
 .quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
 .quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
 .quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
 .quad	0x983e5152ee66dfab,0xa831c66d2db43210
+.quad	0x983e5152ee66dfab,0xa831c66d2db43210
 .quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
 .quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
 .quad	0x06ca6351e003826f,0x142929670a0e6e70
+.quad	0x06ca6351e003826f,0x142929670a0e6e70
 .quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
 .quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
 .quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
 .quad	0x81c2c92e47edaee6,0x92722c851482353b
+.quad	0x81c2c92e47edaee6,0x92722c851482353b
 .quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
 .quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
 .quad	0xd192e819d6ef5218,0xd69906245565a910
+.quad	0xd192e819d6ef5218,0xd69906245565a910
 .quad	0xf40e35855771202a,0x106aa07032bbd1b8
+.quad	0xf40e35855771202a,0x106aa07032bbd1b8
 .quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
 .quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
 .quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
 .quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
 .quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
 .quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
 .quad	0x90befffa23631e28,0xa4506cebde82bde9
+.quad	0x90befffa23631e28,0xa4506cebde82bde9
 .quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
 .quad	0xca273eceea26619c,0xd186b8c721c0c207
+.quad	0xca273eceea26619c,0xd186b8c721c0c207
 .quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
 .quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
 .quad	0x113f9804bef90dae,0x1b710b35131c471b
+.quad	0x113f9804bef90dae,0x1b710b35131c471b
 .quad	0x28db77f523047d84,0x32caab7b40c72493
+.quad	0x28db77f523047d84,0x32caab7b40c72493
 .quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
 .quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
 .quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+.quad	0x0001020304050607,0x08090a0b0c0d0e0f
+.quad	0x0001020304050607,0x08090a0b0c0d0e0f
+.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.type	sha512_block_data_order_xop, at function
+.align	64
+sha512_block_data_order_xop:
+.Lxop_shortcut:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	movq	%rsp,%r11
+	shlq	$4,%rdx
+	subq	$160,%rsp
+	leaq	(%rsi,%rdx,8),%rdx
+	andq	$-64,%rsp
+	movq	%rdi,128+0(%rsp)
+	movq	%rsi,128+8(%rsp)
+	movq	%rdx,128+16(%rsp)
+	movq	%r11,128+24(%rsp)
+.Lprologue_xop:
+
+	vzeroupper
+	movq	0(%rdi),%rax
+	movq	8(%rdi),%rbx
+	movq	16(%rdi),%rcx
+	movq	24(%rdi),%rdx
+	movq	32(%rdi),%r8
+	movq	40(%rdi),%r9
+	movq	48(%rdi),%r10
+	movq	56(%rdi),%r11
+	jmp	.Lloop_xop
+.align	16
+.Lloop_xop:
+	vmovdqa	K512+1280(%rip),%xmm11
+	vmovdqu	0(%rsi),%xmm0
+	leaq	K512+128(%rip),%rbp
+	vmovdqu	16(%rsi),%xmm1
+	vmovdqu	32(%rsi),%xmm2
+	vpshufb	%xmm11,%xmm0,%xmm0
+	vmovdqu	48(%rsi),%xmm3
+	vpshufb	%xmm11,%xmm1,%xmm1
+	vmovdqu	64(%rsi),%xmm4
+	vpshufb	%xmm11,%xmm2,%xmm2
+	vmovdqu	80(%rsi),%xmm5
+	vpshufb	%xmm11,%xmm3,%xmm3
+	vmovdqu	96(%rsi),%xmm6
+	vpshufb	%xmm11,%xmm4,%xmm4
+	vmovdqu	112(%rsi),%xmm7
+	vpshufb	%xmm11,%xmm5,%xmm5
+	vpaddq	-128(%rbp),%xmm0,%xmm8
+	vpshufb	%xmm11,%xmm6,%xmm6
+	vpaddq	-96(%rbp),%xmm1,%xmm9
+	vpshufb	%xmm11,%xmm7,%xmm7
+	vpaddq	-64(%rbp),%xmm2,%xmm10
+	vpaddq	-32(%rbp),%xmm3,%xmm11
+	vmovdqa	%xmm8,0(%rsp)
+	vpaddq	0(%rbp),%xmm4,%xmm8
+	vmovdqa	%xmm9,16(%rsp)
+	vpaddq	32(%rbp),%xmm5,%xmm9
+	vmovdqa	%xmm10,32(%rsp)
+	vpaddq	64(%rbp),%xmm6,%xmm10
+	vmovdqa	%xmm11,48(%rsp)
+	vpaddq	96(%rbp),%xmm7,%xmm11
+	vmovdqa	%xmm8,64(%rsp)
+	movq	%rax,%r14
+	vmovdqa	%xmm9,80(%rsp)
+	movq	%rbx,%rdi
+	vmovdqa	%xmm10,96(%rsp)
+	xorq	%rcx,%rdi
+	vmovdqa	%xmm11,112(%rsp)
+	movq	%r8,%r13
+	jmp	.Lxop_00_47
+
+.align	16
+.Lxop_00_47:
+	addq	$256,%rbp
+	vpalignr	$8,%xmm0,%xmm1,%xmm8
+	rorq	$23,%r13
+	movq	%r14,%rax
+	vpalignr	$8,%xmm4,%xmm5,%xmm11
+	movq	%r9,%r12
+	rorq	$5,%r14
+.byte	143,72,120,195,200,56
+	xorq	%r8,%r13
+	xorq	%r10,%r12
+	vpsrlq	$7,%xmm8,%xmm8
+	rorq	$4,%r13
+	xorq	%rax,%r14
+	vpaddq	%xmm11,%xmm0,%xmm0
+	andq	%r8,%r12
+	xorq	%r8,%r13
+	addq	0(%rsp),%r11
+	movq	%rax,%r15
+.byte	143,72,120,195,209,7
+	xorq	%r10,%r12
+	rorq	$6,%r14
+	vpxor	%xmm9,%xmm8,%xmm8
+	xorq	%rbx,%r15
+	addq	%r12,%r11
+	rorq	$14,%r13
+	andq	%r15,%rdi
+.byte	143,104,120,195,223,3
+	xorq	%rax,%r14
+	addq	%r13,%r11
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%rbx,%rdi
+	rorq	$28,%r14
+	vpsrlq	$6,%xmm7,%xmm10
+	addq	%r11,%rdx
+	addq	%rdi,%r11
+	vpaddq	%xmm8,%xmm0,%xmm0
+	movq	%rdx,%r13
+	addq	%r11,%r14
+.byte	143,72,120,195,203,42
+	rorq	$23,%r13
+	movq	%r14,%r11
+	vpxor	%xmm10,%xmm11,%xmm11
+	movq	%r8,%r12
+	rorq	$5,%r14
+	xorq	%rdx,%r13
+	xorq	%r9,%r12
+	vpxor	%xmm9,%xmm11,%xmm11
+	rorq	$4,%r13
+	xorq	%r11,%r14
+	andq	%rdx,%r12
+	xorq	%rdx,%r13
+	vpaddq	%xmm11,%xmm0,%xmm0
+	addq	8(%rsp),%r10
+	movq	%r11,%rdi
+	xorq	%r9,%r12
+	rorq	$6,%r14
+	vpaddq	-128(%rbp),%xmm0,%xmm10
+	xorq	%rax,%rdi
+	addq	%r12,%r10
+	rorq	$14,%r13
+	andq	%rdi,%r15
+	xorq	%r11,%r14
+	addq	%r13,%r10
+	xorq	%rax,%r15
+	rorq	$28,%r14
+	addq	%r10,%rcx
+	addq	%r15,%r10
+	movq	%rcx,%r13
+	addq	%r10,%r14
+	vmovdqa	%xmm10,0(%rsp)
+	vpalignr	$8,%xmm1,%xmm2,%xmm8
+	rorq	$23,%r13
+	movq	%r14,%r10
+	vpalignr	$8,%xmm5,%xmm6,%xmm11
+	movq	%rdx,%r12
+	rorq	$5,%r14
+.byte	143,72,120,195,200,56
+	xorq	%rcx,%r13
+	xorq	%r8,%r12
+	vpsrlq	$7,%xmm8,%xmm8
+	rorq	$4,%r13
+	xorq	%r10,%r14
+	vpaddq	%xmm11,%xmm1,%xmm1
+	andq	%rcx,%r12
+	xorq	%rcx,%r13
+	addq	16(%rsp),%r9
+	movq	%r10,%r15
+.byte	143,72,120,195,209,7
+	xorq	%r8,%r12
+	rorq	$6,%r14
+	vpxor	%xmm9,%xmm8,%xmm8
+	xorq	%r11,%r15
+	addq	%r12,%r9
+	rorq	$14,%r13
+	andq	%r15,%rdi
+.byte	143,104,120,195,216,3
+	xorq	%r10,%r14
+	addq	%r13,%r9
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%r11,%rdi
+	rorq	$28,%r14
+	vpsrlq	$6,%xmm0,%xmm10
+	addq	%r9,%rbx
+	addq	%rdi,%r9
+	vpaddq	%xmm8,%xmm1,%xmm1
+	movq	%rbx,%r13
+	addq	%r9,%r14
+.byte	143,72,120,195,203,42
+	rorq	$23,%r13
+	movq	%r14,%r9
+	vpxor	%xmm10,%xmm11,%xmm11
+	movq	%rcx,%r12
+	rorq	$5,%r14
+	xorq	%rbx,%r13
+	xorq	%rdx,%r12
+	vpxor	%xmm9,%xmm11,%xmm11
+	rorq	$4,%r13
+	xorq	%r9,%r14
+	andq	%rbx,%r12
+	xorq	%rbx,%r13
+	vpaddq	%xmm11,%xmm1,%xmm1
+	addq	24(%rsp),%r8
+	movq	%r9,%rdi
+	xorq	%rdx,%r12
+	rorq	$6,%r14
+	vpaddq	-96(%rbp),%xmm1,%xmm10
+	xorq	%r10,%rdi
+	addq	%r12,%r8
+	rorq	$14,%r13
+	andq	%rdi,%r15
+	xorq	%r9,%r14
+	addq	%r13,%r8
+	xorq	%r10,%r15
+	rorq	$28,%r14
+	addq	%r8,%rax
+	addq	%r15,%r8
+	movq	%rax,%r13
+	addq	%r8,%r14
+	vmovdqa	%xmm10,16(%rsp)
+	vpalignr	$8,%xmm2,%xmm3,%xmm8
+	rorq	$23,%r13
+	movq	%r14,%r8
+	vpalignr	$8,%xmm6,%xmm7,%xmm11
+	movq	%rbx,%r12
+	rorq	$5,%r14
+.byte	143,72,120,195,200,56
+	xorq	%rax,%r13
+	xorq	%rcx,%r12
+	vpsrlq	$7,%xmm8,%xmm8
+	rorq	$4,%r13
+	xorq	%r8,%r14
+	vpaddq	%xmm11,%xmm2,%xmm2
+	andq	%rax,%r12
+	xorq	%rax,%r13
+	addq	32(%rsp),%rdx
+	movq	%r8,%r15
+.byte	143,72,120,195,209,7
+	xorq	%rcx,%r12
+	rorq	$6,%r14
+	vpxor	%xmm9,%xmm8,%xmm8
+	xorq	%r9,%r15
+	addq	%r12,%rdx
+	rorq	$14,%r13
+	andq	%r15,%rdi
+.byte	143,104,120,195,217,3
+	xorq	%r8,%r14
+	addq	%r13,%rdx
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%r9,%rdi
+	rorq	$28,%r14
+	vpsrlq	$6,%xmm1,%xmm10
+	addq	%rdx,%r11
+	addq	%rdi,%rdx
+	vpaddq	%xmm8,%xmm2,%xmm2
+	movq	%r11,%r13
+	addq	%rdx,%r14
+.byte	143,72,120,195,203,42
+	rorq	$23,%r13
+	movq	%r14,%rdx
+	vpxor	%xmm10,%xmm11,%xmm11
+	movq	%rax,%r12
+	rorq	$5,%r14
+	xorq	%r11,%r13
+	xorq	%rbx,%r12
+	vpxor	%xmm9,%xmm11,%xmm11
+	rorq	$4,%r13
+	xorq	%rdx,%r14
+	andq	%r11,%r12
+	xorq	%r11,%r13
+	vpaddq	%xmm11,%xmm2,%xmm2
+	addq	40(%rsp),%rcx
+	movq	%rdx,%rdi
+	xorq	%rbx,%r12
+	rorq	$6,%r14
+	vpaddq	-64(%rbp),%xmm2,%xmm10
+	xorq	%r8,%rdi
+	addq	%r12,%rcx
+	rorq	$14,%r13
+	andq	%rdi,%r15
+	xorq	%rdx,%r14
+	addq	%r13,%rcx
+	xorq	%r8,%r15
+	rorq	$28,%r14
+	addq	%rcx,%r10
+	addq	%r15,%rcx
+	movq	%r10,%r13
+	addq	%rcx,%r14
+	vmovdqa	%xmm10,32(%rsp)
+	vpalignr	$8,%xmm3,%xmm4,%xmm8
+	rorq	$23,%r13
+	movq	%r14,%rcx
+	vpalignr	$8,%xmm7,%xmm0,%xmm11
+	movq	%r11,%r12
+	rorq	$5,%r14
+.byte	143,72,120,195,200,56
+	xorq	%r10,%r13
+	xorq	%rax,%r12
+	vpsrlq	$7,%xmm8,%xmm8
+	rorq	$4,%r13
+	xorq	%rcx,%r14
+	vpaddq	%xmm11,%xmm3,%xmm3
+	andq	%r10,%r12
+	xorq	%r10,%r13
+	addq	48(%rsp),%rbx
+	movq	%rcx,%r15
+.byte	143,72,120,195,209,7
+	xorq	%rax,%r12
+	rorq	$6,%r14
+	vpxor	%xmm9,%xmm8,%xmm8
+	xorq	%rdx,%r15
+	addq	%r12,%rbx
+	rorq	$14,%r13
+	andq	%r15,%rdi
+.byte	143,104,120,195,218,3
+	xorq	%rcx,%r14
+	addq	%r13,%rbx
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%rdx,%rdi
+	rorq	$28,%r14
+	vpsrlq	$6,%xmm2,%xmm10
+	addq	%rbx,%r9
+	addq	%rdi,%rbx
+	vpaddq	%xmm8,%xmm3,%xmm3
+	movq	%r9,%r13
+	addq	%rbx,%r14
+.byte	143,72,120,195,203,42
+	rorq	$23,%r13
+	movq	%r14,%rbx
+	vpxor	%xmm10,%xmm11,%xmm11
+	movq	%r10,%r12
+	rorq	$5,%r14
+	xorq	%r9,%r13
+	xorq	%r11,%r12
+	vpxor	%xmm9,%xmm11,%xmm11
+	rorq	$4,%r13
+	xorq	%rbx,%r14
+	andq	%r9,%r12
+	xorq	%r9,%r13
+	vpaddq	%xmm11,%xmm3,%xmm3
+	addq	56(%rsp),%rax
+	movq	%rbx,%rdi
+	xorq	%r11,%r12
+	rorq	$6,%r14
+	vpaddq	-32(%rbp),%xmm3,%xmm10
+	xorq	%rcx,%rdi
+	addq	%r12,%rax
+	rorq	$14,%r13
+	andq	%rdi,%r15
+	xorq	%rbx,%r14
+	addq	%r13,%rax
+	xorq	%rcx,%r15
+	rorq	$28,%r14
+	addq	%rax,%r8
+	addq	%r15,%rax
+	movq	%r8,%r13
+	addq	%rax,%r14
+	vmovdqa	%xmm10,48(%rsp)
+	vpalignr	$8,%xmm4,%xmm5,%xmm8
+	rorq	$23,%r13
+	movq	%r14,%rax
+	vpalignr	$8,%xmm0,%xmm1,%xmm11
+	movq	%r9,%r12
+	rorq	$5,%r14
+.byte	143,72,120,195,200,56
+	xorq	%r8,%r13
+	xorq	%r10,%r12
+	vpsrlq	$7,%xmm8,%xmm8
+	rorq	$4,%r13
+	xorq	%rax,%r14
+	vpaddq	%xmm11,%xmm4,%xmm4
+	andq	%r8,%r12
+	xorq	%r8,%r13
+	addq	64(%rsp),%r11
+	movq	%rax,%r15
+.byte	143,72,120,195,209,7
+	xorq	%r10,%r12
+	rorq	$6,%r14
+	vpxor	%xmm9,%xmm8,%xmm8
+	xorq	%rbx,%r15
+	addq	%r12,%r11
+	rorq	$14,%r13
+	andq	%r15,%rdi
+.byte	143,104,120,195,219,3
+	xorq	%rax,%r14
+	addq	%r13,%r11
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%rbx,%rdi
+	rorq	$28,%r14
+	vpsrlq	$6,%xmm3,%xmm10
+	addq	%r11,%rdx
+	addq	%rdi,%r11
+	vpaddq	%xmm8,%xmm4,%xmm4
+	movq	%rdx,%r13
+	addq	%r11,%r14
+.byte	143,72,120,195,203,42
+	rorq	$23,%r13
+	movq	%r14,%r11
+	vpxor	%xmm10,%xmm11,%xmm11
+	movq	%r8,%r12
+	rorq	$5,%r14
+	xorq	%rdx,%r13
+	xorq	%r9,%r12
+	vpxor	%xmm9,%xmm11,%xmm11
+	rorq	$4,%r13
+	xorq	%r11,%r14
+	andq	%rdx,%r12
+	xorq	%rdx,%r13
+	vpaddq	%xmm11,%xmm4,%xmm4
+	addq	72(%rsp),%r10
+	movq	%r11,%rdi
+	xorq	%r9,%r12
+	rorq	$6,%r14
+	vpaddq	0(%rbp),%xmm4,%xmm10
+	xorq	%rax,%rdi
+	addq	%r12,%r10
+	rorq	$14,%r13
+	andq	%rdi,%r15
+	xorq	%r11,%r14
+	addq	%r13,%r10
+	xorq	%rax,%r15
+	rorq	$28,%r14
+	addq	%r10,%rcx
+	addq	%r15,%r10
+	movq	%rcx,%r13
+	addq	%r10,%r14
+	vmovdqa	%xmm10,64(%rsp)
+	vpalignr	$8,%xmm5,%xmm6,%xmm8
+	rorq	$23,%r13
+	movq	%r14,%r10
+	vpalignr	$8,%xmm1,%xmm2,%xmm11
+	movq	%rdx,%r12
+	rorq	$5,%r14
+.byte	143,72,120,195,200,56
+	xorq	%rcx,%r13
+	xorq	%r8,%r12
+	vpsrlq	$7,%xmm8,%xmm8
+	rorq	$4,%r13
+	xorq	%r10,%r14
+	vpaddq	%xmm11,%xmm5,%xmm5
+	andq	%rcx,%r12
+	xorq	%rcx,%r13
+	addq	80(%rsp),%r9
+	movq	%r10,%r15
+.byte	143,72,120,195,209,7
+	xorq	%r8,%r12
+	rorq	$6,%r14
+	vpxor	%xmm9,%xmm8,%xmm8
+	xorq	%r11,%r15
+	addq	%r12,%r9
+	rorq	$14,%r13
+	andq	%r15,%rdi
+.byte	143,104,120,195,220,3
+	xorq	%r10,%r14
+	addq	%r13,%r9
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%r11,%rdi
+	rorq	$28,%r14
+	vpsrlq	$6,%xmm4,%xmm10
+	addq	%r9,%rbx
+	addq	%rdi,%r9
+	vpaddq	%xmm8,%xmm5,%xmm5
+	movq	%rbx,%r13
+	addq	%r9,%r14
+.byte	143,72,120,195,203,42
+	rorq	$23,%r13
+	movq	%r14,%r9
+	vpxor	%xmm10,%xmm11,%xmm11
+	movq	%rcx,%r12
+	rorq	$5,%r14
+	xorq	%rbx,%r13
+	xorq	%rdx,%r12
+	vpxor	%xmm9,%xmm11,%xmm11
+	rorq	$4,%r13
+	xorq	%r9,%r14
+	andq	%rbx,%r12
+	xorq	%rbx,%r13
+	vpaddq	%xmm11,%xmm5,%xmm5
+	addq	88(%rsp),%r8
+	movq	%r9,%rdi
+	xorq	%rdx,%r12
+	rorq	$6,%r14
+	vpaddq	32(%rbp),%xmm5,%xmm10
+	xorq	%r10,%rdi
+	addq	%r12,%r8
+	rorq	$14,%r13
+	andq	%rdi,%r15
+	xorq	%r9,%r14
+	addq	%r13,%r8
+	xorq	%r10,%r15
+	rorq	$28,%r14
+	addq	%r8,%rax
+	addq	%r15,%r8
+	movq	%rax,%r13
+	addq	%r8,%r14
+	vmovdqa	%xmm10,80(%rsp)
+	vpalignr	$8,%xmm6,%xmm7,%xmm8
+	rorq	$23,%r13
+	movq	%r14,%r8
+	vpalignr	$8,%xmm2,%xmm3,%xmm11
+	movq	%rbx,%r12
+	rorq	$5,%r14
+.byte	143,72,120,195,200,56
+	xorq	%rax,%r13
+	xorq	%rcx,%r12
+	vpsrlq	$7,%xmm8,%xmm8
+	rorq	$4,%r13
+	xorq	%r8,%r14
+	vpaddq	%xmm11,%xmm6,%xmm6
+	andq	%rax,%r12
+	xorq	%rax,%r13
+	addq	96(%rsp),%rdx
+	movq	%r8,%r15
+.byte	143,72,120,195,209,7
+	xorq	%rcx,%r12
+	rorq	$6,%r14
+	vpxor	%xmm9,%xmm8,%xmm8
+	xorq	%r9,%r15
+	addq	%r12,%rdx
+	rorq	$14,%r13
+	andq	%r15,%rdi
+.byte	143,104,120,195,221,3
+	xorq	%r8,%r14
+	addq	%r13,%rdx
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%r9,%rdi
+	rorq	$28,%r14
+	vpsrlq	$6,%xmm5,%xmm10
+	addq	%rdx,%r11
+	addq	%rdi,%rdx
+	vpaddq	%xmm8,%xmm6,%xmm6
+	movq	%r11,%r13
+	addq	%rdx,%r14
+.byte	143,72,120,195,203,42
+	rorq	$23,%r13
+	movq	%r14,%rdx
+	vpxor	%xmm10,%xmm11,%xmm11
+	movq	%rax,%r12
+	rorq	$5,%r14
+	xorq	%r11,%r13
+	xorq	%rbx,%r12
+	vpxor	%xmm9,%xmm11,%xmm11
+	rorq	$4,%r13
+	xorq	%rdx,%r14
+	andq	%r11,%r12
+	xorq	%r11,%r13
+	vpaddq	%xmm11,%xmm6,%xmm6
+	addq	104(%rsp),%rcx
+	movq	%rdx,%rdi
+	xorq	%rbx,%r12
+	rorq	$6,%r14
+	vpaddq	64(%rbp),%xmm6,%xmm10
+	xorq	%r8,%rdi
+	addq	%r12,%rcx
+	rorq	$14,%r13
+	andq	%rdi,%r15
+	xorq	%rdx,%r14
+	addq	%r13,%rcx
+	xorq	%r8,%r15
+	rorq	$28,%r14
+	addq	%rcx,%r10
+	addq	%r15,%rcx
+	movq	%r10,%r13
+	addq	%rcx,%r14
+	vmovdqa	%xmm10,96(%rsp)
+	vpalignr	$8,%xmm7,%xmm0,%xmm8
+	rorq	$23,%r13
+	movq	%r14,%rcx
+	vpalignr	$8,%xmm3,%xmm4,%xmm11
+	movq	%r11,%r12
+	rorq	$5,%r14
+.byte	143,72,120,195,200,56
+	xorq	%r10,%r13
+	xorq	%rax,%r12
+	vpsrlq	$7,%xmm8,%xmm8
+	rorq	$4,%r13
+	xorq	%rcx,%r14
+	vpaddq	%xmm11,%xmm7,%xmm7
+	andq	%r10,%r12
+	xorq	%r10,%r13
+	addq	112(%rsp),%rbx
+	movq	%rcx,%r15
+.byte	143,72,120,195,209,7
+	xorq	%rax,%r12
+	rorq	$6,%r14
+	vpxor	%xmm9,%xmm8,%xmm8
+	xorq	%rdx,%r15
+	addq	%r12,%rbx
+	rorq	$14,%r13
+	andq	%r15,%rdi
+.byte	143,104,120,195,222,3
+	xorq	%rcx,%r14
+	addq	%r13,%rbx
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%rdx,%rdi
+	rorq	$28,%r14
+	vpsrlq	$6,%xmm6,%xmm10
+	addq	%rbx,%r9
+	addq	%rdi,%rbx
+	vpaddq	%xmm8,%xmm7,%xmm7
+	movq	%r9,%r13
+	addq	%rbx,%r14
+.byte	143,72,120,195,203,42
+	rorq	$23,%r13
+	movq	%r14,%rbx
+	vpxor	%xmm10,%xmm11,%xmm11
+	movq	%r10,%r12
+	rorq	$5,%r14
+	xorq	%r9,%r13
+	xorq	%r11,%r12
+	vpxor	%xmm9,%xmm11,%xmm11
+	rorq	$4,%r13
+	xorq	%rbx,%r14
+	andq	%r9,%r12
+	xorq	%r9,%r13
+	vpaddq	%xmm11,%xmm7,%xmm7
+	addq	120(%rsp),%rax
+	movq	%rbx,%rdi
+	xorq	%r11,%r12
+	rorq	$6,%r14
+	vpaddq	96(%rbp),%xmm7,%xmm10
+	xorq	%rcx,%rdi
+	addq	%r12,%rax
+	rorq	$14,%r13
+	andq	%rdi,%r15
+	xorq	%rbx,%r14
+	addq	%r13,%rax
+	xorq	%rcx,%r15
+	rorq	$28,%r14
+	addq	%rax,%r8
+	addq	%r15,%rax
+	movq	%r8,%r13
+	addq	%rax,%r14
+	vmovdqa	%xmm10,112(%rsp)
+	cmpb	$0,135(%rbp)
+	jne	.Lxop_00_47
+	rorq	$23,%r13
+	movq	%r14,%rax
+	movq	%r9,%r12
+	rorq	$5,%r14
+	xorq	%r8,%r13
+	xorq	%r10,%r12
+	rorq	$4,%r13
+	xorq	%rax,%r14
+	andq	%r8,%r12
+	xorq	%r8,%r13
+	addq	0(%rsp),%r11
+	movq	%rax,%r15
+	xorq	%r10,%r12
+	rorq	$6,%r14
+	xorq	%rbx,%r15
+	addq	%r12,%r11
+	rorq	$14,%r13
+	andq	%r15,%rdi
+	xorq	%rax,%r14
+	addq	%r13,%r11
+	xorq	%rbx,%rdi
+	rorq	$28,%r14
+	addq	%r11,%rdx
+	addq	%rdi,%r11
+	movq	%rdx,%r13
+	addq	%r11,%r14
+	rorq	$23,%r13
+	movq	%r14,%r11
+	movq	%r8,%r12
+	rorq	$5,%r14
+	xorq	%rdx,%r13
+	xorq	%r9,%r12
+	rorq	$4,%r13
+	xorq	%r11,%r14
+	andq	%rdx,%r12
+	xorq	%rdx,%r13
+	addq	8(%rsp),%r10
+	movq	%r11,%rdi
+	xorq	%r9,%r12
+	rorq	$6,%r14
+	xorq	%rax,%rdi
+	addq	%r12,%r10
+	rorq	$14,%r13
+	andq	%rdi,%r15
+	xorq	%r11,%r14
+	addq	%r13,%r10
+	xorq	%rax,%r15
+	rorq	$28,%r14
+	addq	%r10,%rcx
+	addq	%r15,%r10
+	movq	%rcx,%r13
+	addq	%r10,%r14
+	rorq	$23,%r13
+	movq	%r14,%r10
+	movq	%rdx,%r12
+	rorq	$5,%r14
+	xorq	%rcx,%r13
+	xorq	%r8,%r12
+	rorq	$4,%r13
+	xorq	%r10,%r14
+	andq	%rcx,%r12
+	xorq	%rcx,%r13
+	addq	16(%rsp),%r9
+	movq	%r10,%r15
+	xorq	%r8,%r12
+	rorq	$6,%r14
+	xorq	%r11,%r15
+	addq	%r12,%r9
+	rorq	$14,%r13
+	andq	%r15,%rdi
+	xorq	%r10,%r14
+	addq	%r13,%r9
+	xorq	%r11,%rdi
+	rorq	$28,%r14
+	addq	%r9,%rbx
+	addq	%rdi,%r9
+	movq	%rbx,%r13
+	addq	%r9,%r14
+	rorq	$23,%r13
+	movq	%r14,%r9
+	movq	%rcx,%r12
+	rorq	$5,%r14
+	xorq	%rbx,%r13
+	xorq	%rdx,%r12
+	rorq	$4,%r13
+	xorq	%r9,%r14
+	andq	%rbx,%r12
+	xorq	%rbx,%r13
+	addq	24(%rsp),%r8
+	movq	%r9,%rdi
+	xorq	%rdx,%r12
+	rorq	$6,%r14
+	xorq	%r10,%rdi
+	addq	%r12,%r8
+	rorq	$14,%r13
+	andq	%rdi,%r15
+	xorq	%r9,%r14
+	addq	%r13,%r8
+	xorq	%r10,%r15
+	rorq	$28,%r14
+	addq	%r8,%rax
+	addq	%r15,%r8
+	movq	%rax,%r13
+	addq	%r8,%r14
+	rorq	$23,%r13
+	movq	%r14,%r8
+	movq	%rbx,%r12
+	rorq	$5,%r14
+	xorq	%rax,%r13
+	xorq	%rcx,%r12
+	rorq	$4,%r13
+	xorq	%r8,%r14
+	andq	%rax,%r12
+	xorq	%rax,%r13
+	addq	32(%rsp),%rdx
+	movq	%r8,%r15
+	xorq	%rcx,%r12
+	rorq	$6,%r14
+	xorq	%r9,%r15
+	addq	%r12,%rdx
+	rorq	$14,%r13
+	andq	%r15,%rdi
+	xorq	%r8,%r14
+	addq	%r13,%rdx
+	xorq	%r9,%rdi
+	rorq	$28,%r14
+	addq	%rdx,%r11
+	addq	%rdi,%rdx
+	movq	%r11,%r13
+	addq	%rdx,%r14
+	rorq	$23,%r13
+	movq	%r14,%rdx
+	movq	%rax,%r12
+	rorq	$5,%r14
+	xorq	%r11,%r13
+	xorq	%rbx,%r12
+	rorq	$4,%r13
+	xorq	%rdx,%r14
+	andq	%r11,%r12
+	xorq	%r11,%r13
+	addq	40(%rsp),%rcx
+	movq	%rdx,%rdi
+	xorq	%rbx,%r12
+	rorq	$6,%r14
+	xorq	%r8,%rdi
+	addq	%r12,%rcx
+	rorq	$14,%r13
+	andq	%rdi,%r15
+	xorq	%rdx,%r14
+	addq	%r13,%rcx
+	xorq	%r8,%r15
+	rorq	$28,%r14
+	addq	%rcx,%r10
+	addq	%r15,%rcx
+	movq	%r10,%r13
+	addq	%rcx,%r14
+	rorq	$23,%r13
+	movq	%r14,%rcx
+	movq	%r11,%r12
+	rorq	$5,%r14
+	xorq	%r10,%r13
+	xorq	%rax,%r12
+	rorq	$4,%r13
+	xorq	%rcx,%r14
+	andq	%r10,%r12
+	xorq	%r10,%r13
+	addq	48(%rsp),%rbx
+	movq	%rcx,%r15
+	xorq	%rax,%r12
+	rorq	$6,%r14
+	xorq	%rdx,%r15
+	addq	%r12,%rbx
+	rorq	$14,%r13
+	andq	%r15,%rdi
+	xorq	%rcx,%r14
+	addq	%r13,%rbx
+	xorq	%rdx,%rdi
+	rorq	$28,%r14
+	addq	%rbx,%r9
+	addq	%rdi,%rbx
+	movq	%r9,%r13
+	addq	%rbx,%r14
+	rorq	$23,%r13
+	movq	%r14,%rbx
+	movq	%r10,%r12
+	rorq	$5,%r14
+	xorq	%r9,%r13
+	xorq	%r11,%r12
+	rorq	$4,%r13
+	xorq	%rbx,%r14
+	andq	%r9,%r12
+	xorq	%r9,%r13
+	addq	56(%rsp),%rax
+	movq	%rbx,%rdi
+	xorq	%r11,%r12
+	rorq	$6,%r14
+	xorq	%rcx,%rdi
+	addq	%r12,%rax
+	rorq	$14,%r13
+	andq	%rdi,%r15
+	xorq	%rbx,%r14
+	addq	%r13,%rax
+	xorq	%rcx,%r15
+	rorq	$28,%r14
+	addq	%rax,%r8
+	addq	%r15,%rax
+	movq	%r8,%r13
+	addq	%rax,%r14
+	rorq	$23,%r13
+	movq	%r14,%rax
+	movq	%r9,%r12
+	rorq	$5,%r14
+	xorq	%r8,%r13
+	xorq	%r10,%r12
+	rorq	$4,%r13
+	xorq	%rax,%r14
+	andq	%r8,%r12
+	xorq	%r8,%r13
+	addq	64(%rsp),%r11
+	movq	%rax,%r15
+	xorq	%r10,%r12
+	rorq	$6,%r14
+	xorq	%rbx,%r15
+	addq	%r12,%r11
+	rorq	$14,%r13
+	andq	%r15,%rdi
+	xorq	%rax,%r14
+	addq	%r13,%r11
+	xorq	%rbx,%rdi
+	rorq	$28,%r14
+	addq	%r11,%rdx
+	addq	%rdi,%r11
+	movq	%rdx,%r13
+	addq	%r11,%r14
+	rorq	$23,%r13
+	movq	%r14,%r11
+	movq	%r8,%r12
+	rorq	$5,%r14
+	xorq	%rdx,%r13
+	xorq	%r9,%r12
+	rorq	$4,%r13
+	xorq	%r11,%r14
+	andq	%rdx,%r12
+	xorq	%rdx,%r13
+	addq	72(%rsp),%r10
+	movq	%r11,%rdi
+	xorq	%r9,%r12
+	rorq	$6,%r14
+	xorq	%rax,%rdi
+	addq	%r12,%r10
+	rorq	$14,%r13
+	andq	%rdi,%r15
+	xorq	%r11,%r14
+	addq	%r13,%r10
+	xorq	%rax,%r15
+	rorq	$28,%r14
+	addq	%r10,%rcx
+	addq	%r15,%r10
+	movq	%rcx,%r13
+	addq	%r10,%r14
+	rorq	$23,%r13
+	movq	%r14,%r10
+	movq	%rdx,%r12
+	rorq	$5,%r14
+	xorq	%rcx,%r13
+	xorq	%r8,%r12
+	rorq	$4,%r13
+	xorq	%r10,%r14
+	andq	%rcx,%r12
+	xorq	%rcx,%r13
+	addq	80(%rsp),%r9
+	movq	%r10,%r15
+	xorq	%r8,%r12
+	rorq	$6,%r14
+	xorq	%r11,%r15
+	addq	%r12,%r9
+	rorq	$14,%r13
+	andq	%r15,%rdi
+	xorq	%r10,%r14
+	addq	%r13,%r9
+	xorq	%r11,%rdi
+	rorq	$28,%r14
+	addq	%r9,%rbx
+	addq	%rdi,%r9
+	movq	%rbx,%r13
+	addq	%r9,%r14
+	rorq	$23,%r13
+	movq	%r14,%r9
+	movq	%rcx,%r12
+	rorq	$5,%r14
+	xorq	%rbx,%r13
+	xorq	%rdx,%r12
+	rorq	$4,%r13
+	xorq	%r9,%r14
+	andq	%rbx,%r12
+	xorq	%rbx,%r13
+	addq	88(%rsp),%r8
+	movq	%r9,%rdi
+	xorq	%rdx,%r12
+	rorq	$6,%r14
+	xorq	%r10,%rdi
+	addq	%r12,%r8
+	rorq	$14,%r13
+	andq	%rdi,%r15
+	xorq	%r9,%r14
+	addq	%r13,%r8
+	xorq	%r10,%r15
+	rorq	$28,%r14
+	addq	%r8,%rax
+	addq	%r15,%r8
+	movq	%rax,%r13
+	addq	%r8,%r14
+	rorq	$23,%r13
+	movq	%r14,%r8
+	movq	%rbx,%r12
+	rorq	$5,%r14
+	xorq	%rax,%r13
+	xorq	%rcx,%r12
+	rorq	$4,%r13
+	xorq	%r8,%r14
+	andq	%rax,%r12
+	xorq	%rax,%r13
+	addq	96(%rsp),%rdx
+	movq	%r8,%r15
+	xorq	%rcx,%r12
+	rorq	$6,%r14
+	xorq	%r9,%r15
+	addq	%r12,%rdx
+	rorq	$14,%r13
+	andq	%r15,%rdi
+	xorq	%r8,%r14
+	addq	%r13,%rdx
+	xorq	%r9,%rdi
+	rorq	$28,%r14
+	addq	%rdx,%r11
+	addq	%rdi,%rdx
+	movq	%r11,%r13
+	addq	%rdx,%r14
+	rorq	$23,%r13
+	movq	%r14,%rdx
+	movq	%rax,%r12
+	rorq	$5,%r14
+	xorq	%r11,%r13
+	xorq	%rbx,%r12
+	rorq	$4,%r13
+	xorq	%rdx,%r14
+	andq	%r11,%r12
+	xorq	%r11,%r13
+	addq	104(%rsp),%rcx
+	movq	%rdx,%rdi
+	xorq	%rbx,%r12
+	rorq	$6,%r14
+	xorq	%r8,%rdi
+	addq	%r12,%rcx
+	rorq	$14,%r13
+	andq	%rdi,%r15
+	xorq	%rdx,%r14
+	addq	%r13,%rcx
+	xorq	%r8,%r15
+	rorq	$28,%r14
+	addq	%rcx,%r10
+	addq	%r15,%rcx
+	movq	%r10,%r13
+	addq	%rcx,%r14
+	rorq	$23,%r13
+	movq	%r14,%rcx
+	movq	%r11,%r12
+	rorq	$5,%r14
+	xorq	%r10,%r13
+	xorq	%rax,%r12
+	rorq	$4,%r13
+	xorq	%rcx,%r14
+	andq	%r10,%r12
+	xorq	%r10,%r13
+	addq	112(%rsp),%rbx
+	movq	%rcx,%r15
+	xorq	%rax,%r12
+	rorq	$6,%r14
+	xorq	%rdx,%r15
+	addq	%r12,%rbx
+	rorq	$14,%r13
+	andq	%r15,%rdi
+	xorq	%rcx,%r14
+	addq	%r13,%rbx
+	xorq	%rdx,%rdi
+	rorq	$28,%r14
+	addq	%rbx,%r9
+	addq	%rdi,%rbx
+	movq	%r9,%r13
+	addq	%rbx,%r14
+	rorq	$23,%r13
+	movq	%r14,%rbx
+	movq	%r10,%r12
+	rorq	$5,%r14
+	xorq	%r9,%r13
+	xorq	%r11,%r12
+	rorq	$4,%r13
+	xorq	%rbx,%r14
+	andq	%r9,%r12
+	xorq	%r9,%r13
+	addq	120(%rsp),%rax
+	movq	%rbx,%rdi
+	xorq	%r11,%r12
+	rorq	$6,%r14
+	xorq	%rcx,%rdi
+	addq	%r12,%rax
+	rorq	$14,%r13
+	andq	%rdi,%r15
+	xorq	%rbx,%r14
+	addq	%r13,%rax
+	xorq	%rcx,%r15
+	rorq	$28,%r14
+	addq	%rax,%r8
+	addq	%r15,%rax
+	movq	%r8,%r13
+	addq	%rax,%r14
+	movq	128+0(%rsp),%rdi
+	movq	%r14,%rax
+
+	addq	0(%rdi),%rax
+	leaq	128(%rsi),%rsi
+	addq	8(%rdi),%rbx
+	addq	16(%rdi),%rcx
+	addq	24(%rdi),%rdx
+	addq	32(%rdi),%r8
+	addq	40(%rdi),%r9
+	addq	48(%rdi),%r10
+	addq	56(%rdi),%r11
+
+	cmpq	128+16(%rsp),%rsi
+
+	movq	%rax,0(%rdi)
+	movq	%rbx,8(%rdi)
+	movq	%rcx,16(%rdi)
+	movq	%rdx,24(%rdi)
+	movq	%r8,32(%rdi)
+	movq	%r9,40(%rdi)
+	movq	%r10,48(%rdi)
+	movq	%r11,56(%rdi)
+	jb	.Lloop_xop
+
+	movq	128+24(%rsp),%rsi
+	vzeroupper
+	movq	(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lepilogue_xop:
+	.byte	0xf3,0xc3
+.size	sha512_block_data_order_xop,.-sha512_block_data_order_xop
+.type	sha512_block_data_order_avx, at function
+.align	64
+sha512_block_data_order_avx:
+.Lavx_shortcut:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	movq	%rsp,%r11
+	shlq	$4,%rdx
+	subq	$160,%rsp
+	leaq	(%rsi,%rdx,8),%rdx
+	andq	$-64,%rsp
+	movq	%rdi,128+0(%rsp)
+	movq	%rsi,128+8(%rsp)
+	movq	%rdx,128+16(%rsp)
+	movq	%r11,128+24(%rsp)
+.Lprologue_avx:
+
+	vzeroupper
+	movq	0(%rdi),%rax
+	movq	8(%rdi),%rbx
+	movq	16(%rdi),%rcx
+	movq	24(%rdi),%rdx
+	movq	32(%rdi),%r8
+	movq	40(%rdi),%r9
+	movq	48(%rdi),%r10
+	movq	56(%rdi),%r11
+	jmp	.Lloop_avx
+.align	16
+.Lloop_avx:
+	vmovdqa	K512+1280(%rip),%xmm11
+	vmovdqu	0(%rsi),%xmm0
+	leaq	K512+128(%rip),%rbp
+	vmovdqu	16(%rsi),%xmm1
+	vmovdqu	32(%rsi),%xmm2
+	vpshufb	%xmm11,%xmm0,%xmm0
+	vmovdqu	48(%rsi),%xmm3
+	vpshufb	%xmm11,%xmm1,%xmm1
+	vmovdqu	64(%rsi),%xmm4
+	vpshufb	%xmm11,%xmm2,%xmm2
+	vmovdqu	80(%rsi),%xmm5
+	vpshufb	%xmm11,%xmm3,%xmm3
+	vmovdqu	96(%rsi),%xmm6
+	vpshufb	%xmm11,%xmm4,%xmm4
+	vmovdqu	112(%rsi),%xmm7
+	vpshufb	%xmm11,%xmm5,%xmm5
+	vpaddq	-128(%rbp),%xmm0,%xmm8
+	vpshufb	%xmm11,%xmm6,%xmm6
+	vpaddq	-96(%rbp),%xmm1,%xmm9
+	vpshufb	%xmm11,%xmm7,%xmm7
+	vpaddq	-64(%rbp),%xmm2,%xmm10
+	vpaddq	-32(%rbp),%xmm3,%xmm11
+	vmovdqa	%xmm8,0(%rsp)
+	vpaddq	0(%rbp),%xmm4,%xmm8
+	vmovdqa	%xmm9,16(%rsp)
+	vpaddq	32(%rbp),%xmm5,%xmm9
+	vmovdqa	%xmm10,32(%rsp)
+	vpaddq	64(%rbp),%xmm6,%xmm10
+	vmovdqa	%xmm11,48(%rsp)
+	vpaddq	96(%rbp),%xmm7,%xmm11
+	vmovdqa	%xmm8,64(%rsp)
+	movq	%rax,%r14
+	vmovdqa	%xmm9,80(%rsp)
+	movq	%rbx,%rdi
+	vmovdqa	%xmm10,96(%rsp)
+	xorq	%rcx,%rdi
+	vmovdqa	%xmm11,112(%rsp)
+	movq	%r8,%r13
+	jmp	.Lavx_00_47
+
+.align	16
+.Lavx_00_47:
+	addq	$256,%rbp
+	vpalignr	$8,%xmm0,%xmm1,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rax
+	vpalignr	$8,%xmm4,%xmm5,%xmm11
+	movq	%r9,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%r8,%r13
+	xorq	%r10,%r12
+	vpaddq	%xmm11,%xmm0,%xmm0
+	shrdq	$4,%r13,%r13
+	xorq	%rax,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%r8,%r12
+	xorq	%r8,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	0(%rsp),%r11
+	movq	%rax,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%r10,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%rbx,%r15
+	addq	%r12,%r11
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%rax,%r14
+	addq	%r13,%r11
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%rbx,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm7,%xmm11
+	addq	%r11,%rdx
+	addq	%rdi,%r11
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%rdx,%r13
+	addq	%r11,%r14
+	vpsllq	$3,%xmm7,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r11
+	vpaddq	%xmm8,%xmm0,%xmm0
+	movq	%r8,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm7,%xmm9
+	xorq	%rdx,%r13
+	xorq	%r9,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%r11,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%rdx,%r12
+	xorq	%rdx,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	8(%rsp),%r10
+	movq	%r11,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%r9,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%rax,%rdi
+	addq	%r12,%r10
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm0,%xmm0
+	xorq	%r11,%r14
+	addq	%r13,%r10
+	vpaddq	-128(%rbp),%xmm0,%xmm10
+	xorq	%rax,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r10,%rcx
+	addq	%r15,%r10
+	movq	%rcx,%r13
+	addq	%r10,%r14
+	vmovdqa	%xmm10,0(%rsp)
+	vpalignr	$8,%xmm1,%xmm2,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r10
+	vpalignr	$8,%xmm5,%xmm6,%xmm11
+	movq	%rdx,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%rcx,%r13
+	xorq	%r8,%r12
+	vpaddq	%xmm11,%xmm1,%xmm1
+	shrdq	$4,%r13,%r13
+	xorq	%r10,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%rcx,%r12
+	xorq	%rcx,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	16(%rsp),%r9
+	movq	%r10,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%r8,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%r11,%r15
+	addq	%r12,%r9
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%r10,%r14
+	addq	%r13,%r9
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%r11,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm0,%xmm11
+	addq	%r9,%rbx
+	addq	%rdi,%r9
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%rbx,%r13
+	addq	%r9,%r14
+	vpsllq	$3,%xmm0,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r9
+	vpaddq	%xmm8,%xmm1,%xmm1
+	movq	%rcx,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm0,%xmm9
+	xorq	%rbx,%r13
+	xorq	%rdx,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%r9,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%rbx,%r12
+	xorq	%rbx,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	24(%rsp),%r8
+	movq	%r9,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%rdx,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%r10,%rdi
+	addq	%r12,%r8
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm1,%xmm1
+	xorq	%r9,%r14
+	addq	%r13,%r8
+	vpaddq	-96(%rbp),%xmm1,%xmm10
+	xorq	%r10,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r8,%rax
+	addq	%r15,%r8
+	movq	%rax,%r13
+	addq	%r8,%r14
+	vmovdqa	%xmm10,16(%rsp)
+	vpalignr	$8,%xmm2,%xmm3,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r8
+	vpalignr	$8,%xmm6,%xmm7,%xmm11
+	movq	%rbx,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%rax,%r13
+	xorq	%rcx,%r12
+	vpaddq	%xmm11,%xmm2,%xmm2
+	shrdq	$4,%r13,%r13
+	xorq	%r8,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%rax,%r12
+	xorq	%rax,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	32(%rsp),%rdx
+	movq	%r8,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%rcx,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%r9,%r15
+	addq	%r12,%rdx
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%r8,%r14
+	addq	%r13,%rdx
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%r9,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm1,%xmm11
+	addq	%rdx,%r11
+	addq	%rdi,%rdx
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%r11,%r13
+	addq	%rdx,%r14
+	vpsllq	$3,%xmm1,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rdx
+	vpaddq	%xmm8,%xmm2,%xmm2
+	movq	%rax,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm1,%xmm9
+	xorq	%r11,%r13
+	xorq	%rbx,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%rdx,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%r11,%r12
+	xorq	%r11,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	40(%rsp),%rcx
+	movq	%rdx,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%rbx,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%r8,%rdi
+	addq	%r12,%rcx
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm2,%xmm2
+	xorq	%rdx,%r14
+	addq	%r13,%rcx
+	vpaddq	-64(%rbp),%xmm2,%xmm10
+	xorq	%r8,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rcx,%r10
+	addq	%r15,%rcx
+	movq	%r10,%r13
+	addq	%rcx,%r14
+	vmovdqa	%xmm10,32(%rsp)
+	vpalignr	$8,%xmm3,%xmm4,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rcx
+	vpalignr	$8,%xmm7,%xmm0,%xmm11
+	movq	%r11,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%r10,%r13
+	xorq	%rax,%r12
+	vpaddq	%xmm11,%xmm3,%xmm3
+	shrdq	$4,%r13,%r13
+	xorq	%rcx,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%r10,%r12
+	xorq	%r10,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	48(%rsp),%rbx
+	movq	%rcx,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%rax,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%rdx,%r15
+	addq	%r12,%rbx
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%rcx,%r14
+	addq	%r13,%rbx
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%rdx,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm2,%xmm11
+	addq	%rbx,%r9
+	addq	%rdi,%rbx
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%r9,%r13
+	addq	%rbx,%r14
+	vpsllq	$3,%xmm2,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rbx
+	vpaddq	%xmm8,%xmm3,%xmm3
+	movq	%r10,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm2,%xmm9
+	xorq	%r9,%r13
+	xorq	%r11,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%rbx,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%r9,%r12
+	xorq	%r9,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	56(%rsp),%rax
+	movq	%rbx,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%r11,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%rcx,%rdi
+	addq	%r12,%rax
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm3,%xmm3
+	xorq	%rbx,%r14
+	addq	%r13,%rax
+	vpaddq	-32(%rbp),%xmm3,%xmm10
+	xorq	%rcx,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rax,%r8
+	addq	%r15,%rax
+	movq	%r8,%r13
+	addq	%rax,%r14
+	vmovdqa	%xmm10,48(%rsp)
+	vpalignr	$8,%xmm4,%xmm5,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rax
+	vpalignr	$8,%xmm0,%xmm1,%xmm11
+	movq	%r9,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%r8,%r13
+	xorq	%r10,%r12
+	vpaddq	%xmm11,%xmm4,%xmm4
+	shrdq	$4,%r13,%r13
+	xorq	%rax,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%r8,%r12
+	xorq	%r8,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	64(%rsp),%r11
+	movq	%rax,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%r10,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%rbx,%r15
+	addq	%r12,%r11
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%rax,%r14
+	addq	%r13,%r11
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%rbx,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm3,%xmm11
+	addq	%r11,%rdx
+	addq	%rdi,%r11
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%rdx,%r13
+	addq	%r11,%r14
+	vpsllq	$3,%xmm3,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r11
+	vpaddq	%xmm8,%xmm4,%xmm4
+	movq	%r8,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm3,%xmm9
+	xorq	%rdx,%r13
+	xorq	%r9,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%r11,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%rdx,%r12
+	xorq	%rdx,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	72(%rsp),%r10
+	movq	%r11,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%r9,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%rax,%rdi
+	addq	%r12,%r10
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm4,%xmm4
+	xorq	%r11,%r14
+	addq	%r13,%r10
+	vpaddq	0(%rbp),%xmm4,%xmm10
+	xorq	%rax,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r10,%rcx
+	addq	%r15,%r10
+	movq	%rcx,%r13
+	addq	%r10,%r14
+	vmovdqa	%xmm10,64(%rsp)
+	vpalignr	$8,%xmm5,%xmm6,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r10
+	vpalignr	$8,%xmm1,%xmm2,%xmm11
+	movq	%rdx,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%rcx,%r13
+	xorq	%r8,%r12
+	vpaddq	%xmm11,%xmm5,%xmm5
+	shrdq	$4,%r13,%r13
+	xorq	%r10,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%rcx,%r12
+	xorq	%rcx,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	80(%rsp),%r9
+	movq	%r10,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%r8,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%r11,%r15
+	addq	%r12,%r9
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%r10,%r14
+	addq	%r13,%r9
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%r11,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm4,%xmm11
+	addq	%r9,%rbx
+	addq	%rdi,%r9
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%rbx,%r13
+	addq	%r9,%r14
+	vpsllq	$3,%xmm4,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r9
+	vpaddq	%xmm8,%xmm5,%xmm5
+	movq	%rcx,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm4,%xmm9
+	xorq	%rbx,%r13
+	xorq	%rdx,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%r9,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%rbx,%r12
+	xorq	%rbx,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	88(%rsp),%r8
+	movq	%r9,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%rdx,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%r10,%rdi
+	addq	%r12,%r8
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm5,%xmm5
+	xorq	%r9,%r14
+	addq	%r13,%r8
+	vpaddq	32(%rbp),%xmm5,%xmm10
+	xorq	%r10,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r8,%rax
+	addq	%r15,%r8
+	movq	%rax,%r13
+	addq	%r8,%r14
+	vmovdqa	%xmm10,80(%rsp)
+	vpalignr	$8,%xmm6,%xmm7,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r8
+	vpalignr	$8,%xmm2,%xmm3,%xmm11
+	movq	%rbx,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%rax,%r13
+	xorq	%rcx,%r12
+	vpaddq	%xmm11,%xmm6,%xmm6
+	shrdq	$4,%r13,%r13
+	xorq	%r8,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%rax,%r12
+	xorq	%rax,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	96(%rsp),%rdx
+	movq	%r8,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%rcx,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%r9,%r15
+	addq	%r12,%rdx
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%r8,%r14
+	addq	%r13,%rdx
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%r9,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm5,%xmm11
+	addq	%rdx,%r11
+	addq	%rdi,%rdx
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%r11,%r13
+	addq	%rdx,%r14
+	vpsllq	$3,%xmm5,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rdx
+	vpaddq	%xmm8,%xmm6,%xmm6
+	movq	%rax,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm5,%xmm9
+	xorq	%r11,%r13
+	xorq	%rbx,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%rdx,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%r11,%r12
+	xorq	%r11,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	104(%rsp),%rcx
+	movq	%rdx,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%rbx,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%r8,%rdi
+	addq	%r12,%rcx
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm6,%xmm6
+	xorq	%rdx,%r14
+	addq	%r13,%rcx
+	vpaddq	64(%rbp),%xmm6,%xmm10
+	xorq	%r8,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rcx,%r10
+	addq	%r15,%rcx
+	movq	%r10,%r13
+	addq	%rcx,%r14
+	vmovdqa	%xmm10,96(%rsp)
+	vpalignr	$8,%xmm7,%xmm0,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rcx
+	vpalignr	$8,%xmm3,%xmm4,%xmm11
+	movq	%r11,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%r10,%r13
+	xorq	%rax,%r12
+	vpaddq	%xmm11,%xmm7,%xmm7
+	shrdq	$4,%r13,%r13
+	xorq	%rcx,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%r10,%r12
+	xorq	%r10,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	112(%rsp),%rbx
+	movq	%rcx,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%rax,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%rdx,%r15
+	addq	%r12,%rbx
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%rcx,%r14
+	addq	%r13,%rbx
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%rdx,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm6,%xmm11
+	addq	%rbx,%r9
+	addq	%rdi,%rbx
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%r9,%r13
+	addq	%rbx,%r14
+	vpsllq	$3,%xmm6,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rbx
+	vpaddq	%xmm8,%xmm7,%xmm7
+	movq	%r10,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm6,%xmm9
+	xorq	%r9,%r13
+	xorq	%r11,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%rbx,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%r9,%r12
+	xorq	%r9,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	120(%rsp),%rax
+	movq	%rbx,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%r11,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%rcx,%rdi
+	addq	%r12,%rax
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm7,%xmm7
+	xorq	%rbx,%r14
+	addq	%r13,%rax
+	vpaddq	96(%rbp),%xmm7,%xmm10
+	xorq	%rcx,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rax,%r8
+	addq	%r15,%rax
+	movq	%r8,%r13
+	addq	%rax,%r14
+	vmovdqa	%xmm10,112(%rsp)
+	cmpb	$0,135(%rbp)
+	jne	.Lavx_00_47
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rax
+	movq	%r9,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r8,%r13
+	xorq	%r10,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rax,%r14
+	andq	%r8,%r12
+	xorq	%r8,%r13
+	addq	0(%rsp),%r11
+	movq	%rax,%r15
+	xorq	%r10,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rbx,%r15
+	addq	%r12,%r11
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%rax,%r14
+	addq	%r13,%r11
+	xorq	%rbx,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%r11,%rdx
+	addq	%rdi,%r11
+	movq	%rdx,%r13
+	addq	%r11,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r11
+	movq	%r8,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rdx,%r13
+	xorq	%r9,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r11,%r14
+	andq	%rdx,%r12
+	xorq	%rdx,%r13
+	addq	8(%rsp),%r10
+	movq	%r11,%rdi
+	xorq	%r9,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rax,%rdi
+	addq	%r12,%r10
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%r11,%r14
+	addq	%r13,%r10
+	xorq	%rax,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r10,%rcx
+	addq	%r15,%r10
+	movq	%rcx,%r13
+	addq	%r10,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r10
+	movq	%rdx,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rcx,%r13
+	xorq	%r8,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r10,%r14
+	andq	%rcx,%r12
+	xorq	%rcx,%r13
+	addq	16(%rsp),%r9
+	movq	%r10,%r15
+	xorq	%r8,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r11,%r15
+	addq	%r12,%r9
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%r10,%r14
+	addq	%r13,%r9
+	xorq	%r11,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%r9,%rbx
+	addq	%rdi,%r9
+	movq	%rbx,%r13
+	addq	%r9,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r9
+	movq	%rcx,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rbx,%r13
+	xorq	%rdx,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r9,%r14
+	andq	%rbx,%r12
+	xorq	%rbx,%r13
+	addq	24(%rsp),%r8
+	movq	%r9,%rdi
+	xorq	%rdx,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r10,%rdi
+	addq	%r12,%r8
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%r9,%r14
+	addq	%r13,%r8
+	xorq	%r10,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r8,%rax
+	addq	%r15,%r8
+	movq	%rax,%r13
+	addq	%r8,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r8
+	movq	%rbx,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rax,%r13
+	xorq	%rcx,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r8,%r14
+	andq	%rax,%r12
+	xorq	%rax,%r13
+	addq	32(%rsp),%rdx
+	movq	%r8,%r15
+	xorq	%rcx,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r9,%r15
+	addq	%r12,%rdx
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%r8,%r14
+	addq	%r13,%rdx
+	xorq	%r9,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%rdx,%r11
+	addq	%rdi,%rdx
+	movq	%r11,%r13
+	addq	%rdx,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rdx
+	movq	%rax,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r11,%r13
+	xorq	%rbx,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rdx,%r14
+	andq	%r11,%r12
+	xorq	%r11,%r13
+	addq	40(%rsp),%rcx
+	movq	%rdx,%rdi
+	xorq	%rbx,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r8,%rdi
+	addq	%r12,%rcx
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%rdx,%r14
+	addq	%r13,%rcx
+	xorq	%r8,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rcx,%r10
+	addq	%r15,%rcx
+	movq	%r10,%r13
+	addq	%rcx,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rcx
+	movq	%r11,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r10,%r13
+	xorq	%rax,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rcx,%r14
+	andq	%r10,%r12
+	xorq	%r10,%r13
+	addq	48(%rsp),%rbx
+	movq	%rcx,%r15
+	xorq	%rax,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rdx,%r15
+	addq	%r12,%rbx
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%rcx,%r14
+	addq	%r13,%rbx
+	xorq	%rdx,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%rbx,%r9
+	addq	%rdi,%rbx
+	movq	%r9,%r13
+	addq	%rbx,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rbx
+	movq	%r10,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r9,%r13
+	xorq	%r11,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rbx,%r14
+	andq	%r9,%r12
+	xorq	%r9,%r13
+	addq	56(%rsp),%rax
+	movq	%rbx,%rdi
+	xorq	%r11,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rcx,%rdi
+	addq	%r12,%rax
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%rbx,%r14
+	addq	%r13,%rax
+	xorq	%rcx,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rax,%r8
+	addq	%r15,%rax
+	movq	%r8,%r13
+	addq	%rax,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rax
+	movq	%r9,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r8,%r13
+	xorq	%r10,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rax,%r14
+	andq	%r8,%r12
+	xorq	%r8,%r13
+	addq	64(%rsp),%r11
+	movq	%rax,%r15
+	xorq	%r10,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rbx,%r15
+	addq	%r12,%r11
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%rax,%r14
+	addq	%r13,%r11
+	xorq	%rbx,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%r11,%rdx
+	addq	%rdi,%r11
+	movq	%rdx,%r13
+	addq	%r11,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r11
+	movq	%r8,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rdx,%r13
+	xorq	%r9,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r11,%r14
+	andq	%rdx,%r12
+	xorq	%rdx,%r13
+	addq	72(%rsp),%r10
+	movq	%r11,%rdi
+	xorq	%r9,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rax,%rdi
+	addq	%r12,%r10
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%r11,%r14
+	addq	%r13,%r10
+	xorq	%rax,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r10,%rcx
+	addq	%r15,%r10
+	movq	%rcx,%r13
+	addq	%r10,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r10
+	movq	%rdx,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rcx,%r13
+	xorq	%r8,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r10,%r14
+	andq	%rcx,%r12
+	xorq	%rcx,%r13
+	addq	80(%rsp),%r9
+	movq	%r10,%r15
+	xorq	%r8,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r11,%r15
+	addq	%r12,%r9
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%r10,%r14
+	addq	%r13,%r9
+	xorq	%r11,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%r9,%rbx
+	addq	%rdi,%r9
+	movq	%rbx,%r13
+	addq	%r9,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r9
+	movq	%rcx,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rbx,%r13
+	xorq	%rdx,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r9,%r14
+	andq	%rbx,%r12
+	xorq	%rbx,%r13
+	addq	88(%rsp),%r8
+	movq	%r9,%rdi
+	xorq	%rdx,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r10,%rdi
+	addq	%r12,%r8
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%r9,%r14
+	addq	%r13,%r8
+	xorq	%r10,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r8,%rax
+	addq	%r15,%r8
+	movq	%rax,%r13
+	addq	%r8,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r8
+	movq	%rbx,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rax,%r13
+	xorq	%rcx,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r8,%r14
+	andq	%rax,%r12
+	xorq	%rax,%r13
+	addq	96(%rsp),%rdx
+	movq	%r8,%r15
+	xorq	%rcx,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r9,%r15
+	addq	%r12,%rdx
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%r8,%r14
+	addq	%r13,%rdx
+	xorq	%r9,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%rdx,%r11
+	addq	%rdi,%rdx
+	movq	%r11,%r13
+	addq	%rdx,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rdx
+	movq	%rax,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r11,%r13
+	xorq	%rbx,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rdx,%r14
+	andq	%r11,%r12
+	xorq	%r11,%r13
+	addq	104(%rsp),%rcx
+	movq	%rdx,%rdi
+	xorq	%rbx,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r8,%rdi
+	addq	%r12,%rcx
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%rdx,%r14
+	addq	%r13,%rcx
+	xorq	%r8,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rcx,%r10
+	addq	%r15,%rcx
+	movq	%r10,%r13
+	addq	%rcx,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rcx
+	movq	%r11,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r10,%r13
+	xorq	%rax,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rcx,%r14
+	andq	%r10,%r12
+	xorq	%r10,%r13
+	addq	112(%rsp),%rbx
+	movq	%rcx,%r15
+	xorq	%rax,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rdx,%r15
+	addq	%r12,%rbx
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%rcx,%r14
+	addq	%r13,%rbx
+	xorq	%rdx,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%rbx,%r9
+	addq	%rdi,%rbx
+	movq	%r9,%r13
+	addq	%rbx,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rbx
+	movq	%r10,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r9,%r13
+	xorq	%r11,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rbx,%r14
+	andq	%r9,%r12
+	xorq	%r9,%r13
+	addq	120(%rsp),%rax
+	movq	%rbx,%rdi
+	xorq	%r11,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rcx,%rdi
+	addq	%r12,%rax
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%rbx,%r14
+	addq	%r13,%rax
+	xorq	%rcx,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rax,%r8
+	addq	%r15,%rax
+	movq	%r8,%r13
+	addq	%rax,%r14
+	movq	128+0(%rsp),%rdi
+	movq	%r14,%rax
+
+	addq	0(%rdi),%rax
+	leaq	128(%rsi),%rsi
+	addq	8(%rdi),%rbx
+	addq	16(%rdi),%rcx
+	addq	24(%rdi),%rdx
+	addq	32(%rdi),%r8
+	addq	40(%rdi),%r9
+	addq	48(%rdi),%r10
+	addq	56(%rdi),%r11
+
+	cmpq	128+16(%rsp),%rsi
+
+	movq	%rax,0(%rdi)
+	movq	%rbx,8(%rdi)
+	movq	%rcx,16(%rdi)
+	movq	%rdx,24(%rdi)
+	movq	%r8,32(%rdi)
+	movq	%r9,40(%rdi)
+	movq	%r10,48(%rdi)
+	movq	%r11,56(%rdi)
+	jb	.Lloop_avx
+
+	movq	128+24(%rsp),%rsi
+	vzeroupper
+	movq	(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lepilogue_avx:
+	.byte	0xf3,0xc3
+.size	sha512_block_data_order_avx,.-sha512_block_data_order_avx
+.type	sha512_block_data_order_avx2, at function
+.align	64
+sha512_block_data_order_avx2:
+.Lavx2_shortcut:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	movq	%rsp,%r11
+	subq	$1312,%rsp
+	shlq	$4,%rdx
+	andq	$-2048,%rsp
+	leaq	(%rsi,%rdx,8),%rdx
+	addq	$1152,%rsp
+	movq	%rdi,128+0(%rsp)
+	movq	%rsi,128+8(%rsp)
+	movq	%rdx,128+16(%rsp)
+	movq	%r11,128+24(%rsp)
+.Lprologue_avx2:
+
+	vzeroupper
+	subq	$-128,%rsi
+	movq	0(%rdi),%rax
+	movq	%rsi,%r12
+	movq	8(%rdi),%rbx
+	cmpq	%rdx,%rsi
+	movq	16(%rdi),%rcx
+	cmoveq	%rsp,%r12
+	movq	24(%rdi),%rdx
+	movq	32(%rdi),%r8
+	movq	40(%rdi),%r9
+	movq	48(%rdi),%r10
+	movq	56(%rdi),%r11
+	jmp	.Loop_avx2
+.align	16
+.Loop_avx2:
+	vmovdqu	-128(%rsi),%xmm0
+	vmovdqu	-128+16(%rsi),%xmm1
+	vmovdqu	-128+32(%rsi),%xmm2
+	leaq	K512+128(%rip),%rbp
+	vmovdqu	-128+48(%rsi),%xmm3
+	vmovdqu	-128+64(%rsi),%xmm4
+	vmovdqu	-128+80(%rsi),%xmm5
+	vmovdqu	-128+96(%rsi),%xmm6
+	vmovdqu	-128+112(%rsi),%xmm7
+
+	vmovdqa	1152(%rbp),%ymm10
+	vinserti128	$1,(%r12),%ymm0,%ymm0
+	vinserti128	$1,16(%r12),%ymm1,%ymm1
+	vpshufb	%ymm10,%ymm0,%ymm0
+	vinserti128	$1,32(%r12),%ymm2,%ymm2
+	vpshufb	%ymm10,%ymm1,%ymm1
+	vinserti128	$1,48(%r12),%ymm3,%ymm3
+	vpshufb	%ymm10,%ymm2,%ymm2
+	vinserti128	$1,64(%r12),%ymm4,%ymm4
+	vpshufb	%ymm10,%ymm3,%ymm3
+	vinserti128	$1,80(%r12),%ymm5,%ymm5
+	vpshufb	%ymm10,%ymm4,%ymm4
+	vinserti128	$1,96(%r12),%ymm6,%ymm6
+	vpshufb	%ymm10,%ymm5,%ymm5
+	vinserti128	$1,112(%r12),%ymm7,%ymm7
+
+	vpaddq	-128(%rbp),%ymm0,%ymm8
+	vpshufb	%ymm10,%ymm6,%ymm6
+	vpaddq	-96(%rbp),%ymm1,%ymm9
+	vpshufb	%ymm10,%ymm7,%ymm7
+	vpaddq	-64(%rbp),%ymm2,%ymm10
+	vpaddq	-32(%rbp),%ymm3,%ymm11
+	vmovdqa	%ymm8,0(%rsp)
+	vpaddq	0(%rbp),%ymm4,%ymm8
+	vmovdqa	%ymm9,32(%rsp)
+	vpaddq	32(%rbp),%ymm5,%ymm9
+	vmovdqa	%ymm10,64(%rsp)
+	vpaddq	64(%rbp),%ymm6,%ymm10
+	vmovdqa	%ymm11,96(%rsp)
+	leaq	-128(%rsp),%rsp
+	vpaddq	96(%rbp),%ymm7,%ymm11
+	vmovdqa	%ymm8,0(%rsp)
+	xorq	%r14,%r14
+	vmovdqa	%ymm9,32(%rsp)
+	movq	%rbx,%rdi
+	vmovdqa	%ymm10,64(%rsp)
+	xorq	%rcx,%rdi
+	vmovdqa	%ymm11,96(%rsp)
+	movq	%r9,%r12
+	addq	$32*8,%rbp
+	jmp	.Lavx2_00_47
+
+.align	16
+.Lavx2_00_47:
+	leaq	-128(%rsp),%rsp
+	vpalignr	$8,%ymm0,%ymm1,%ymm8
+	addq	0+256(%rsp),%r11
+	andq	%r8,%r12
+	rorxq	$41,%r8,%r13
+	vpalignr	$8,%ymm4,%ymm5,%ymm11
+	rorxq	$18,%r8,%r15
+	leaq	(%rax,%r14,1),%rax
+	leaq	(%r11,%r12,1),%r11
+	vpsrlq	$1,%ymm8,%ymm10
+	andnq	%r10,%r8,%r12
+	xorq	%r15,%r13
+	rorxq	$14,%r8,%r14
+	vpaddq	%ymm11,%ymm0,%ymm0
+	vpsrlq	$7,%ymm8,%ymm11
+	leaq	(%r11,%r12,1),%r11
+	xorq	%r14,%r13
+	movq	%rax,%r15
+	vpsllq	$56,%ymm8,%ymm9
+	vpxor	%ymm10,%ymm11,%ymm8
+	rorxq	$39,%rax,%r12
+	leaq	(%r11,%r13,1),%r11
+	xorq	%rbx,%r15
+	vpsrlq	$7,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm8,%ymm8
+	rorxq	$34,%rax,%r14
+	rorxq	$28,%rax,%r13
+	leaq	(%rdx,%r11,1),%rdx
+	vpsllq	$7,%ymm9,%ymm9
+	vpxor	%ymm10,%ymm8,%ymm8
+	andq	%r15,%rdi
+	xorq	%r12,%r14
+	xorq	%rbx,%rdi
+	vpsrlq	$6,%ymm7,%ymm11
+	vpxor	%ymm9,%ymm8,%ymm8
+	xorq	%r13,%r14
+	leaq	(%r11,%rdi,1),%r11
+	movq	%r8,%r12
+	vpsllq	$3,%ymm7,%ymm10
+	vpaddq	%ymm8,%ymm0,%ymm0
+	addq	8+256(%rsp),%r10
+	andq	%rdx,%r12
+	rorxq	$41,%rdx,%r13
+	vpsrlq	$19,%ymm7,%ymm9
+	vpxor	%ymm10,%ymm11,%ymm11
+	rorxq	$18,%rdx,%rdi
+	leaq	(%r11,%r14,1),%r11
+	leaq	(%r10,%r12,1),%r10
+	vpsllq	$42,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm11,%ymm11
+	andnq	%r9,%rdx,%r12
+	xorq	%rdi,%r13
+	rorxq	$14,%rdx,%r14
+	vpsrlq	$42,%ymm9,%ymm9
+	vpxor	%ymm10,%ymm11,%ymm11
+	leaq	(%r10,%r12,1),%r10
+	xorq	%r14,%r13
+	movq	%r11,%rdi
+	vpxor	%ymm9,%ymm11,%ymm11
+	rorxq	$39,%r11,%r12
+	leaq	(%r10,%r13,1),%r10
+	xorq	%rax,%rdi
+	vpaddq	%ymm11,%ymm0,%ymm0
+	rorxq	$34,%r11,%r14
+	rorxq	$28,%r11,%r13
+	leaq	(%rcx,%r10,1),%rcx
+	vpaddq	-128(%rbp),%ymm0,%ymm10
+	andq	%rdi,%r15
+	xorq	%r12,%r14
+	xorq	%rax,%r15
+	xorq	%r13,%r14
+	leaq	(%r10,%r15,1),%r10
+	movq	%rdx,%r12
+	vmovdqa	%ymm10,0(%rsp)
+	vpalignr	$8,%ymm1,%ymm2,%ymm8
+	addq	32+256(%rsp),%r9
+	andq	%rcx,%r12
+	rorxq	$41,%rcx,%r13
+	vpalignr	$8,%ymm5,%ymm6,%ymm11
+	rorxq	$18,%rcx,%r15
+	leaq	(%r10,%r14,1),%r10
+	leaq	(%r9,%r12,1),%r9
+	vpsrlq	$1,%ymm8,%ymm10
+	andnq	%r8,%rcx,%r12
+	xorq	%r15,%r13
+	rorxq	$14,%rcx,%r14
+	vpaddq	%ymm11,%ymm1,%ymm1
+	vpsrlq	$7,%ymm8,%ymm11
+	leaq	(%r9,%r12,1),%r9
+	xorq	%r14,%r13
+	movq	%r10,%r15
+	vpsllq	$56,%ymm8,%ymm9
+	vpxor	%ymm10,%ymm11,%ymm8
+	rorxq	$39,%r10,%r12
+	leaq	(%r9,%r13,1),%r9
+	xorq	%r11,%r15
+	vpsrlq	$7,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm8,%ymm8
+	rorxq	$34,%r10,%r14
+	rorxq	$28,%r10,%r13
+	leaq	(%rbx,%r9,1),%rbx
+	vpsllq	$7,%ymm9,%ymm9
+	vpxor	%ymm10,%ymm8,%ymm8
+	andq	%r15,%rdi
+	xorq	%r12,%r14
+	xorq	%r11,%rdi
+	vpsrlq	$6,%ymm0,%ymm11
+	vpxor	%ymm9,%ymm8,%ymm8
+	xorq	%r13,%r14
+	leaq	(%r9,%rdi,1),%r9
+	movq	%rcx,%r12
+	vpsllq	$3,%ymm0,%ymm10
+	vpaddq	%ymm8,%ymm1,%ymm1
+	addq	40+256(%rsp),%r8
+	andq	%rbx,%r12
+	rorxq	$41,%rbx,%r13
+	vpsrlq	$19,%ymm0,%ymm9
+	vpxor	%ymm10,%ymm11,%ymm11
+	rorxq	$18,%rbx,%rdi
+	leaq	(%r9,%r14,1),%r9
+	leaq	(%r8,%r12,1),%r8
+	vpsllq	$42,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm11,%ymm11
+	andnq	%rdx,%rbx,%r12
+	xorq	%rdi,%r13
+	rorxq	$14,%rbx,%r14
+	vpsrlq	$42,%ymm9,%ymm9
+	vpxor	%ymm10,%ymm11,%ymm11
+	leaq	(%r8,%r12,1),%r8
+	xorq	%r14,%r13
+	movq	%r9,%rdi
+	vpxor	%ymm9,%ymm11,%ymm11
+	rorxq	$39,%r9,%r12
+	leaq	(%r8,%r13,1),%r8
+	xorq	%r10,%rdi
+	vpaddq	%ymm11,%ymm1,%ymm1
+	rorxq	$34,%r9,%r14
+	rorxq	$28,%r9,%r13
+	leaq	(%rax,%r8,1),%rax
+	vpaddq	-96(%rbp),%ymm1,%ymm10
+	andq	%rdi,%r15
+	xorq	%r12,%r14
+	xorq	%r10,%r15
+	xorq	%r13,%r14
+	leaq	(%r8,%r15,1),%r8
+	movq	%rbx,%r12
+	vmovdqa	%ymm10,32(%rsp)
+	vpalignr	$8,%ymm2,%ymm3,%ymm8
+	addq	64+256(%rsp),%rdx
+	andq	%rax,%r12
+	rorxq	$41,%rax,%r13
+	vpalignr	$8,%ymm6,%ymm7,%ymm11
+	rorxq	$18,%rax,%r15
+	leaq	(%r8,%r14,1),%r8
+	leaq	(%rdx,%r12,1),%rdx
+	vpsrlq	$1,%ymm8,%ymm10
+	andnq	%rcx,%rax,%r12
+	xorq	%r15,%r13
+	rorxq	$14,%rax,%r14
+	vpaddq	%ymm11,%ymm2,%ymm2
+	vpsrlq	$7,%ymm8,%ymm11
+	leaq	(%rdx,%r12,1),%rdx
+	xorq	%r14,%r13
+	movq	%r8,%r15
+	vpsllq	$56,%ymm8,%ymm9
+	vpxor	%ymm10,%ymm11,%ymm8
+	rorxq	$39,%r8,%r12
+	leaq	(%rdx,%r13,1),%rdx
+	xorq	%r9,%r15
+	vpsrlq	$7,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm8,%ymm8
+	rorxq	$34,%r8,%r14
+	rorxq	$28,%r8,%r13
+	leaq	(%r11,%rdx,1),%r11
+	vpsllq	$7,%ymm9,%ymm9
+	vpxor	%ymm10,%ymm8,%ymm8
+	andq	%r15,%rdi
+	xorq	%r12,%r14
+	xorq	%r9,%rdi
+	vpsrlq	$6,%ymm1,%ymm11
+	vpxor	%ymm9,%ymm8,%ymm8
+	xorq	%r13,%r14
+	leaq	(%rdx,%rdi,1),%rdx
+	movq	%rax,%r12
+	vpsllq	$3,%ymm1,%ymm10
+	vpaddq	%ymm8,%ymm2,%ymm2
+	addq	72+256(%rsp),%rcx
+	andq	%r11,%r12
+	rorxq	$41,%r11,%r13
+	vpsrlq	$19,%ymm1,%ymm9
+	vpxor	%ymm10,%ymm11,%ymm11
+	rorxq	$18,%r11,%rdi
+	leaq	(%rdx,%r14,1),%rdx
+	leaq	(%rcx,%r12,1),%rcx
+	vpsllq	$42,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm11,%ymm11
+	andnq	%rbx,%r11,%r12
+	xorq	%rdi,%r13
+	rorxq	$14,%r11,%r14
+	vpsrlq	$42,%ymm9,%ymm9
+	vpxor	%ymm10,%ymm11,%ymm11
+	leaq	(%rcx,%r12,1),%rcx
+	xorq	%r14,%r13
+	movq	%rdx,%rdi
+	vpxor	%ymm9,%ymm11,%ymm11
+	rorxq	$39,%rdx,%r12
+	leaq	(%rcx,%r13,1),%rcx
+	xorq	%r8,%rdi
+	vpaddq	%ymm11,%ymm2,%ymm2
+	rorxq	$34,%rdx,%r14
+	rorxq	$28,%rdx,%r13
+	leaq	(%r10,%rcx,1),%r10
+	vpaddq	-64(%rbp),%ymm2,%ymm10
+	andq	%rdi,%r15
+	xorq	%r12,%r14
+	xorq	%r8,%r15
+	xorq	%r13,%r14
+	leaq	(%rcx,%r15,1),%rcx
+	movq	%r11,%r12
+	vmovdqa	%ymm10,64(%rsp)
+	vpalignr	$8,%ymm3,%ymm4,%ymm8
+	addq	96+256(%rsp),%rbx
+	andq	%r10,%r12
+	rorxq	$41,%r10,%r13
+	vpalignr	$8,%ymm7,%ymm0,%ymm11
+	rorxq	$18,%r10,%r15
+	leaq	(%rcx,%r14,1),%rcx
+	leaq	(%rbx,%r12,1),%rbx
+	vpsrlq	$1,%ymm8,%ymm10
+	andnq	%rax,%r10,%r12
+	xorq	%r15,%r13
+	rorxq	$14,%r10,%r14
+	vpaddq	%ymm11,%ymm3,%ymm3
+	vpsrlq	$7,%ymm8,%ymm11
+	leaq	(%rbx,%r12,1),%rbx
+	xorq	%r14,%r13
+	movq	%rcx,%r15
+	vpsllq	$56,%ymm8,%ymm9
+	vpxor	%ymm10,%ymm11,%ymm8
+	rorxq	$39,%rcx,%r12
+	leaq	(%rbx,%r13,1),%rbx
+	xorq	%rdx,%r15
+	vpsrlq	$7,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm8,%ymm8
+	rorxq	$34,%rcx,%r14
+	rorxq	$28,%rcx,%r13
+	leaq	(%r9,%rbx,1),%r9
+	vpsllq	$7,%ymm9,%ymm9
+	vpxor	%ymm10,%ymm8,%ymm8
+	andq	%r15,%rdi
+	xorq	%r12,%r14
+	xorq	%rdx,%rdi
+	vpsrlq	$6,%ymm2,%ymm11
+	vpxor	%ymm9,%ymm8,%ymm8
+	xorq	%r13,%r14
+	leaq	(%rbx,%rdi,1),%rbx
+	movq	%r10,%r12
+	vpsllq	$3,%ymm2,%ymm10
+	vpaddq	%ymm8,%ymm3,%ymm3
+	addq	104+256(%rsp),%rax
+	andq	%r9,%r12
+	rorxq	$41,%r9,%r13
+	vpsrlq	$19,%ymm2,%ymm9
+	vpxor	%ymm10,%ymm11,%ymm11
+	rorxq	$18,%r9,%rdi
+	leaq	(%rbx,%r14,1),%rbx
+	leaq	(%rax,%r12,1),%rax
+	vpsllq	$42,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm11,%ymm11
+	andnq	%r11,%r9,%r12
+	xorq	%rdi,%r13
+	rorxq	$14,%r9,%r14
+	vpsrlq	$42,%ymm9,%ymm9
+	vpxor	%ymm10,%ymm11,%ymm11
+	leaq	(%rax,%r12,1),%rax
+	xorq	%r14,%r13
+	movq	%rbx,%rdi
+	vpxor	%ymm9,%ymm11,%ymm11
+	rorxq	$39,%rbx,%r12
+	leaq	(%rax,%r13,1),%rax
+	xorq	%rcx,%rdi
+	vpaddq	%ymm11,%ymm3,%ymm3
+	rorxq	$34,%rbx,%r14
+	rorxq	$28,%rbx,%r13
+	leaq	(%r8,%rax,1),%r8
+	vpaddq	-32(%rbp),%ymm3,%ymm10
+	andq	%rdi,%r15
+	xorq	%r12,%r14
+	xorq	%rcx,%r15
+	xorq	%r13,%r14
+	leaq	(%rax,%r15,1),%rax
+	movq	%r9,%r12
+	vmovdqa	%ymm10,96(%rsp)
+	leaq	-128(%rsp),%rsp
+	vpalignr	$8,%ymm4,%ymm5,%ymm8
+	addq	0+256(%rsp),%r11
+	andq	%r8,%r12
+	rorxq	$41,%r8,%r13
+	vpalignr	$8,%ymm0,%ymm1,%ymm11
+	rorxq	$18,%r8,%r15
+	leaq	(%rax,%r14,1),%rax
+	leaq	(%r11,%r12,1),%r11
+	vpsrlq	$1,%ymm8,%ymm10
+	andnq	%r10,%r8,%r12
+	xorq	%r15,%r13
+	rorxq	$14,%r8,%r14
+	vpaddq	%ymm11,%ymm4,%ymm4
+	vpsrlq	$7,%ymm8,%ymm11
+	leaq	(%r11,%r12,1),%r11
+	xorq	%r14,%r13
+	movq	%rax,%r15
+	vpsllq	$56,%ymm8,%ymm9
+	vpxor	%ymm10,%ymm11,%ymm8
+	rorxq	$39,%rax,%r12
+	leaq	(%r11,%r13,1),%r11
+	xorq	%rbx,%r15
+	vpsrlq	$7,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm8,%ymm8
+	rorxq	$34,%rax,%r14
+	rorxq	$28,%rax,%r13
+	leaq	(%rdx,%r11,1),%rdx
+	vpsllq	$7,%ymm9,%ymm9
+	vpxor	%ymm10,%ymm8,%ymm8
+	andq	%r15,%rdi
+	xorq	%r12,%r14
+	xorq	%rbx,%rdi
+	vpsrlq	$6,%ymm3,%ymm11
+	vpxor	%ymm9,%ymm8,%ymm8
+	xorq	%r13,%r14
+	leaq	(%r11,%rdi,1),%r11
+	movq	%r8,%r12
+	vpsllq	$3,%ymm3,%ymm10
+	vpaddq	%ymm8,%ymm4,%ymm4
+	addq	8+256(%rsp),%r10
+	andq	%rdx,%r12
+	rorxq	$41,%rdx,%r13
+	vpsrlq	$19,%ymm3,%ymm9
+	vpxor	%ymm10,%ymm11,%ymm11
+	rorxq	$18,%rdx,%rdi
+	leaq	(%r11,%r14,1),%r11
+	leaq	(%r10,%r12,1),%r10
+	vpsllq	$42,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm11,%ymm11
+	andnq	%r9,%rdx,%r12
+	xorq	%rdi,%r13
+	rorxq	$14,%rdx,%r14
+	vpsrlq	$42,%ymm9,%ymm9
+	vpxor	%ymm10,%ymm11,%ymm11
+	leaq	(%r10,%r12,1),%r10
+	xorq	%r14,%r13
+	movq	%r11,%rdi
+	vpxor	%ymm9,%ymm11,%ymm11
+	rorxq	$39,%r11,%r12
+	leaq	(%r10,%r13,1),%r10
+	xorq	%rax,%rdi
+	vpaddq	%ymm11,%ymm4,%ymm4
+	rorxq	$34,%r11,%r14
+	rorxq	$28,%r11,%r13
+	leaq	(%rcx,%r10,1),%rcx
+	vpaddq	0(%rbp),%ymm4,%ymm10
+	andq	%rdi,%r15
+	xorq	%r12,%r14
+	xorq	%rax,%r15
+	xorq	%r13,%r14
+	leaq	(%r10,%r15,1),%r10
+	movq	%rdx,%r12
+	vmovdqa	%ymm10,0(%rsp)
+	vpalignr	$8,%ymm5,%ymm6,%ymm8
+	addq	32+256(%rsp),%r9
+	andq	%rcx,%r12
+	rorxq	$41,%rcx,%r13
+	vpalignr	$8,%ymm1,%ymm2,%ymm11
+	rorxq	$18,%rcx,%r15
+	leaq	(%r10,%r14,1),%r10
+	leaq	(%r9,%r12,1),%r9
+	vpsrlq	$1,%ymm8,%ymm10
+	andnq	%r8,%rcx,%r12
+	xorq	%r15,%r13
+	rorxq	$14,%rcx,%r14
+	vpaddq	%ymm11,%ymm5,%ymm5
+	vpsrlq	$7,%ymm8,%ymm11
+	leaq	(%r9,%r12,1),%r9
+	xorq	%r14,%r13
+	movq	%r10,%r15
+	vpsllq	$56,%ymm8,%ymm9
+	vpxor	%ymm10,%ymm11,%ymm8
+	rorxq	$39,%r10,%r12
+	leaq	(%r9,%r13,1),%r9
+	xorq	%r11,%r15
+	vpsrlq	$7,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm8,%ymm8
+	rorxq	$34,%r10,%r14
+	rorxq	$28,%r10,%r13
+	leaq	(%rbx,%r9,1),%rbx
+	vpsllq	$7,%ymm9,%ymm9
+	vpxor	%ymm10,%ymm8,%ymm8
+	andq	%r15,%rdi
+	xorq	%r12,%r14
+	xorq	%r11,%rdi
+	vpsrlq	$6,%ymm4,%ymm11
+	vpxor	%ymm9,%ymm8,%ymm8
+	xorq	%r13,%r14
+	leaq	(%r9,%rdi,1),%r9
+	movq	%rcx,%r12
+	vpsllq	$3,%ymm4,%ymm10
+	vpaddq	%ymm8,%ymm5,%ymm5
+	addq	40+256(%rsp),%r8
+	andq	%rbx,%r12
+	rorxq	$41,%rbx,%r13
+	vpsrlq	$19,%ymm4,%ymm9
+	vpxor	%ymm10,%ymm11,%ymm11
+	rorxq	$18,%rbx,%rdi
+	leaq	(%r9,%r14,1),%r9
+	leaq	(%r8,%r12,1),%r8
+	vpsllq	$42,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm11,%ymm11
+	andnq	%rdx,%rbx,%r12
+	xorq	%rdi,%r13
+	rorxq	$14,%rbx,%r14
+	vpsrlq	$42,%ymm9,%ymm9
+	vpxor	%ymm10,%ymm11,%ymm11
+	leaq	(%r8,%r12,1),%r8
+	xorq	%r14,%r13
+	movq	%r9,%rdi
+	vpxor	%ymm9,%ymm11,%ymm11
+	rorxq	$39,%r9,%r12
+	leaq	(%r8,%r13,1),%r8
+	xorq	%r10,%rdi
+	vpaddq	%ymm11,%ymm5,%ymm5
+	rorxq	$34,%r9,%r14
+	rorxq	$28,%r9,%r13
+	leaq	(%rax,%r8,1),%rax
+	vpaddq	32(%rbp),%ymm5,%ymm10
+	andq	%rdi,%r15
+	xorq	%r12,%r14
+	xorq	%r10,%r15
+	xorq	%r13,%r14
+	leaq	(%r8,%r15,1),%r8
+	movq	%rbx,%r12
+	vmovdqa	%ymm10,32(%rsp)
+	vpalignr	$8,%ymm6,%ymm7,%ymm8
+	addq	64+256(%rsp),%rdx
+	andq	%rax,%r12
+	rorxq	$41,%rax,%r13
+	vpalignr	$8,%ymm2,%ymm3,%ymm11
+	rorxq	$18,%rax,%r15
+	leaq	(%r8,%r14,1),%r8
+	leaq	(%rdx,%r12,1),%rdx
+	vpsrlq	$1,%ymm8,%ymm10
+	andnq	%rcx,%rax,%r12
+	xorq	%r15,%r13
+	rorxq	$14,%rax,%r14
+	vpaddq	%ymm11,%ymm6,%ymm6
+	vpsrlq	$7,%ymm8,%ymm11
+	leaq	(%rdx,%r12,1),%rdx
+	xorq	%r14,%r13
+	movq	%r8,%r15
+	vpsllq	$56,%ymm8,%ymm9
+	vpxor	%ymm10,%ymm11,%ymm8
+	rorxq	$39,%r8,%r12
+	leaq	(%rdx,%r13,1),%rdx
+	xorq	%r9,%r15
+	vpsrlq	$7,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm8,%ymm8
+	rorxq	$34,%r8,%r14
+	rorxq	$28,%r8,%r13
+	leaq	(%r11,%rdx,1),%r11
+	vpsllq	$7,%ymm9,%ymm9
+	vpxor	%ymm10,%ymm8,%ymm8
+	andq	%r15,%rdi
+	xorq	%r12,%r14
+	xorq	%r9,%rdi
+	vpsrlq	$6,%ymm5,%ymm11
+	vpxor	%ymm9,%ymm8,%ymm8
+	xorq	%r13,%r14
+	leaq	(%rdx,%rdi,1),%rdx
+	movq	%rax,%r12
+	vpsllq	$3,%ymm5,%ymm10
+	vpaddq	%ymm8,%ymm6,%ymm6
+	addq	72+256(%rsp),%rcx
+	andq	%r11,%r12
+	rorxq	$41,%r11,%r13
+	vpsrlq	$19,%ymm5,%ymm9
+	vpxor	%ymm10,%ymm11,%ymm11
+	rorxq	$18,%r11,%rdi
+	leaq	(%rdx,%r14,1),%rdx
+	leaq	(%rcx,%r12,1),%rcx
+	vpsllq	$42,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm11,%ymm11
+	andnq	%rbx,%r11,%r12
+	xorq	%rdi,%r13
+	rorxq	$14,%r11,%r14
+	vpsrlq	$42,%ymm9,%ymm9
+	vpxor	%ymm10,%ymm11,%ymm11
+	leaq	(%rcx,%r12,1),%rcx
+	xorq	%r14,%r13
+	movq	%rdx,%rdi
+	vpxor	%ymm9,%ymm11,%ymm11
+	rorxq	$39,%rdx,%r12
+	leaq	(%rcx,%r13,1),%rcx
+	xorq	%r8,%rdi
+	vpaddq	%ymm11,%ymm6,%ymm6
+	rorxq	$34,%rdx,%r14
+	rorxq	$28,%rdx,%r13
+	leaq	(%r10,%rcx,1),%r10
+	vpaddq	64(%rbp),%ymm6,%ymm10
+	andq	%rdi,%r15
+	xorq	%r12,%r14
+	xorq	%r8,%r15
+	xorq	%r13,%r14
+	leaq	(%rcx,%r15,1),%rcx
+	movq	%r11,%r12
+	vmovdqa	%ymm10,64(%rsp)
+	vpalignr	$8,%ymm7,%ymm0,%ymm8
+	addq	96+256(%rsp),%rbx
+	andq	%r10,%r12
+	rorxq	$41,%r10,%r13
+	vpalignr	$8,%ymm3,%ymm4,%ymm11
+	rorxq	$18,%r10,%r15
+	leaq	(%rcx,%r14,1),%rcx
+	leaq	(%rbx,%r12,1),%rbx
+	vpsrlq	$1,%ymm8,%ymm10
+	andnq	%rax,%r10,%r12
+	xorq	%r15,%r13
+	rorxq	$14,%r10,%r14
+	vpaddq	%ymm11,%ymm7,%ymm7
+	vpsrlq	$7,%ymm8,%ymm11
+	leaq	(%rbx,%r12,1),%rbx
+	xorq	%r14,%r13
+	movq	%rcx,%r15
+	vpsllq	$56,%ymm8,%ymm9
+	vpxor	%ymm10,%ymm11,%ymm8
+	rorxq	$39,%rcx,%r12
+	leaq	(%rbx,%r13,1),%rbx
+	xorq	%rdx,%r15
+	vpsrlq	$7,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm8,%ymm8
+	rorxq	$34,%rcx,%r14
+	rorxq	$28,%rcx,%r13
+	leaq	(%r9,%rbx,1),%r9
+	vpsllq	$7,%ymm9,%ymm9
+	vpxor	%ymm10,%ymm8,%ymm8
+	andq	%r15,%rdi
+	xorq	%r12,%r14
+	xorq	%rdx,%rdi
+	vpsrlq	$6,%ymm6,%ymm11
+	vpxor	%ymm9,%ymm8,%ymm8
+	xorq	%r13,%r14
+	leaq	(%rbx,%rdi,1),%rbx
+	movq	%r10,%r12
+	vpsllq	$3,%ymm6,%ymm10
+	vpaddq	%ymm8,%ymm7,%ymm7
+	addq	104+256(%rsp),%rax
+	andq	%r9,%r12
+	rorxq	$41,%r9,%r13
+	vpsrlq	$19,%ymm6,%ymm9
+	vpxor	%ymm10,%ymm11,%ymm11
+	rorxq	$18,%r9,%rdi
+	leaq	(%rbx,%r14,1),%rbx
+	leaq	(%rax,%r12,1),%rax
+	vpsllq	$42,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm11,%ymm11
+	andnq	%r11,%r9,%r12
+	xorq	%rdi,%r13
+	rorxq	$14,%r9,%r14
+	vpsrlq	$42,%ymm9,%ymm9
+	vpxor	%ymm10,%ymm11,%ymm11
+	leaq	(%rax,%r12,1),%rax
+	xorq	%r14,%r13
+	movq	%rbx,%rdi
+	vpxor	%ymm9,%ymm11,%ymm11
+	rorxq	$39,%rbx,%r12
+	leaq	(%rax,%r13,1),%rax
+	xorq	%rcx,%rdi
+	vpaddq	%ymm11,%ymm7,%ymm7
+	rorxq	$34,%rbx,%r14
+	rorxq	$28,%rbx,%r13
+	leaq	(%r8,%rax,1),%r8
+	vpaddq	96(%rbp),%ymm7,%ymm10
+	andq	%rdi,%r15
+	xorq	%r12,%r14
+	xorq	%rcx,%r15
+	xorq	%r13,%r14
+	leaq	(%rax,%r15,1),%rax
+	movq	%r9,%r12
+	vmovdqa	%ymm10,96(%rsp)
+	leaq	256(%rbp),%rbp
+	cmpb	$0,-121(%rbp)
+	jne	.Lavx2_00_47
+	addq	0+128(%rsp),%r11
+	andq	%r8,%r12
+	rorxq	$41,%r8,%r13
+	rorxq	$18,%r8,%r15
+	leaq	(%rax,%r14,1),%rax
+	leaq	(%r11,%r12,1),%r11
+	andnq	%r10,%r8,%r12
+	xorq	%r15,%r13
+	rorxq	$14,%r8,%r14
+	leaq	(%r11,%r12,1),%r11
+	xorq	%r14,%r13
+	movq	%rax,%r15
+	rorxq	$39,%rax,%r12
+	leaq	(%r11,%r13,1),%r11
+	xorq	%rbx,%r15
+	rorxq	$34,%rax,%r14
+	rorxq	$28,%rax,%r13
+	leaq	(%rdx,%r11,1),%rdx
+	andq	%r15,%rdi
+	xorq	%r12,%r14
+	xorq	%rbx,%rdi
+	xorq	%r13,%r14
+	leaq	(%r11,%rdi,1),%r11
+	movq	%r8,%r12
+	addq	8+128(%rsp),%r10
+	andq	%rdx,%r12
+	rorxq	$41,%rdx,%r13
+	rorxq	$18,%rdx,%rdi
+	leaq	(%r11,%r14,1),%r11
+	leaq	(%r10,%r12,1),%r10
+	andnq	%r9,%rdx,%r12
+	xorq	%rdi,%r13
+	rorxq	$14,%rdx,%r14
+	leaq	(%r10,%r12,1),%r10
+	xorq	%r14,%r13
+	movq	%r11,%rdi
+	rorxq	$39,%r11,%r12
+	leaq	(%r10,%r13,1),%r10
+	xorq	%rax,%rdi
+	rorxq	$34,%r11,%r14
+	rorxq	$28,%r11,%r13
+	leaq	(%rcx,%r10,1),%rcx
+	andq	%rdi,%r15
+	xorq	%r12,%r14
+	xorq	%rax,%r15
+	xorq	%r13,%r14
+	leaq	(%r10,%r15,1),%r10
+	movq	%rdx,%r12
+	addq	32+128(%rsp),%r9
+	andq	%rcx,%r12
+	rorxq	$41,%rcx,%r13
+	rorxq	$18,%rcx,%r15
+	leaq	(%r10,%r14,1),%r10
+	leaq	(%r9,%r12,1),%r9
+	andnq	%r8,%rcx,%r12
+	xorq	%r15,%r13
+	rorxq	$14,%rcx,%r14
+	leaq	(%r9,%r12,1),%r9
+	xorq	%r14,%r13
+	movq	%r10,%r15
+	rorxq	$39,%r10,%r12
+	leaq	(%r9,%r13,1),%r9
+	xorq	%r11,%r15
+	rorxq	$34,%r10,%r14
+	rorxq	$28,%r10,%r13
+	leaq	(%rbx,%r9,1),%rbx
+	andq	%r15,%rdi
+	xorq	%r12,%r14
+	xorq	%r11,%rdi
+	xorq	%r13,%r14
+	leaq	(%r9,%rdi,1),%r9
+	movq	%rcx,%r12
+	addq	40+128(%rsp),%r8
+	andq	%rbx,%r12
+	rorxq	$41,%rbx,%r13
+	rorxq	$18,%rbx,%rdi
+	leaq	(%r9,%r14,1),%r9
+	leaq	(%r8,%r12,1),%r8
+	andnq	%rdx,%rbx,%r12
+	xorq	%rdi,%r13
+	rorxq	$14,%rbx,%r14
+	leaq	(%r8,%r12,1),%r8
+	xorq	%r14,%r13
+	movq	%r9,%rdi
+	rorxq	$39,%r9,%r12
+	leaq	(%r8,%r13,1),%r8
+	xorq	%r10,%rdi
+	rorxq	$34,%r9,%r14
+	rorxq	$28,%r9,%r13
+	leaq	(%rax,%r8,1),%rax
+	andq	%rdi,%r15
+	xorq	%r12,%r14
+	xorq	%r10,%r15
+	xorq	%r13,%r14
+	leaq	(%r8,%r15,1),%r8
+	movq	%rbx,%r12
+	addq	64+128(%rsp),%rdx
+	andq	%rax,%r12
+	rorxq	$41,%rax,%r13
+	rorxq	$18,%rax,%r15
+	leaq	(%r8,%r14,1),%r8
+	leaq	(%rdx,%r12,1),%rdx
+	andnq	%rcx,%rax,%r12
+	xorq	%r15,%r13
+	rorxq	$14,%rax,%r14
+	leaq	(%rdx,%r12,1),%rdx
+	xorq	%r14,%r13
+	movq	%r8,%r15
+	rorxq	$39,%r8,%r12
+	leaq	(%rdx,%r13,1),%rdx
+	xorq	%r9,%r15
+	rorxq	$34,%r8,%r14
+	rorxq	$28,%r8,%r13
+	leaq	(%r11,%rdx,1),%r11
+	andq	%r15,%rdi
+	xorq	%r12,%r14
+	xorq	%r9,%rdi
+	xorq	%r13,%r14
+	leaq	(%rdx,%rdi,1),%rdx
+	movq	%rax,%r12
+	addq	72+128(%rsp),%rcx
+	andq	%r11,%r12
+	rorxq	$41,%r11,%r13
+	rorxq	$18,%r11,%rdi
+	leaq	(%rdx,%r14,1),%rdx
+	leaq	(%rcx,%r12,1),%rcx
+	andnq	%rbx,%r11,%r12
+	xorq	%rdi,%r13
+	rorxq	$14,%r11,%r14
+	leaq	(%rcx,%r12,1),%rcx
+	xorq	%r14,%r13
+	movq	%rdx,%rdi
+	rorxq	$39,%rdx,%r12
+	leaq	(%rcx,%r13,1),%rcx
+	xorq	%r8,%rdi
+	rorxq	$34,%rdx,%r14
+	rorxq	$28,%rdx,%r13
+	leaq	(%r10,%rcx,1),%r10
+	andq	%rdi,%r15
+	xorq	%r12,%r14
+	xorq	%r8,%r15
+	xorq	%r13,%r14
+	leaq	(%rcx,%r15,1),%rcx
+	movq	%r11,%r12
+	addq	96+128(%rsp),%rbx
+	andq	%r10,%r12
+	rorxq	$41,%r10,%r13
+	rorxq	$18,%r10,%r15
+	leaq	(%rcx,%r14,1),%rcx
+	leaq	(%rbx,%r12,1),%rbx
+	andnq	%rax,%r10,%r12
+	xorq	%r15,%r13
+	rorxq	$14,%r10,%r14
+	leaq	(%rbx,%r12,1),%rbx
+	xorq	%r14,%r13
+	movq	%rcx,%r15
+	rorxq	$39,%rcx,%r12
+	leaq	(%rbx,%r13,1),%rbx
+	xorq	%rdx,%r15
+	rorxq	$34,%rcx,%r14
+	rorxq	$28,%rcx,%r13
+	leaq	(%r9,%rbx,1),%r9
+	andq	%r15,%rdi
+	xorq	%r12,%r14
+	xorq	%rdx,%rdi
+	xorq	%r13,%r14
+	leaq	(%rbx,%rdi,1),%rbx
+	movq	%r10,%r12
+	addq	104+128(%rsp),%rax
+	andq	%r9,%r12
+	rorxq	$41,%r9,%r13
+	rorxq	$18,%r9,%rdi
+	leaq	(%rbx,%r14,1),%rbx
+	leaq	(%rax,%r12,1),%rax
+	andnq	%r11,%r9,%r12
+	xorq	%rdi,%r13
+	rorxq	$14,%r9,%r14
+	leaq	(%rax,%r12,1),%rax
+	xorq	%r14,%r13
+	movq	%rbx,%rdi
+	rorxq	$39,%rbx,%r12
+	leaq	(%rax,%r13,1),%rax
+	xorq	%rcx,%rdi
+	rorxq	$34,%rbx,%r14
+	rorxq	$28,%rbx,%r13
+	leaq	(%r8,%rax,1),%r8
+	andq	%rdi,%r15
+	xorq	%r12,%r14
+	xorq	%rcx,%r15
+	xorq	%r13,%r14
+	leaq	(%rax,%r15,1),%rax
+	movq	%r9,%r12
+	addq	0(%rsp),%r11
+	andq	%r8,%r12
+	rorxq	$41,%r8,%r13
+	rorxq	$18,%r8,%r15
+	leaq	(%rax,%r14,1),%rax
+	leaq	(%r11,%r12,1),%r11
+	andnq	%r10,%r8,%r12
+	xorq	%r15,%r13
+	rorxq	$14,%r8,%r14
+	leaq	(%r11,%r12,1),%r11
+	xorq	%r14,%r13
+	movq	%rax,%r15
+	rorxq	$39,%rax,%r12
+	leaq	(%r11,%r13,1),%r11
+	xorq	%rbx,%r15
+	rorxq	$34,%rax,%r14
+	rorxq	$28,%rax,%r13
+	leaq	(%rdx,%r11,1),%rdx
+	andq	%r15,%rdi
+	xorq	%r12,%r14
+	xorq	%rbx,%rdi
+	xorq	%r13,%r14
+	leaq	(%r11,%rdi,1),%r11
+	movq	%r8,%r12
+	addq	8(%rsp),%r10
+	andq	%rdx,%r12
+	rorxq	$41,%rdx,%r13
+	rorxq	$18,%rdx,%rdi
+	leaq	(%r11,%r14,1),%r11
+	leaq	(%r10,%r12,1),%r10
+	andnq	%r9,%rdx,%r12
+	xorq	%rdi,%r13
+	rorxq	$14,%rdx,%r14
+	leaq	(%r10,%r12,1),%r10
+	xorq	%r14,%r13
+	movq	%r11,%rdi
+	rorxq	$39,%r11,%r12
+	leaq	(%r10,%r13,1),%r10
+	xorq	%rax,%rdi
+	rorxq	$34,%r11,%r14
+	rorxq	$28,%r11,%r13
+	leaq	(%rcx,%r10,1),%rcx
+	andq	%rdi,%r15
+	xorq	%r12,%r14
+	xorq	%rax,%r15
+	xorq	%r13,%r14
+	leaq	(%r10,%r15,1),%r10
+	movq	%rdx,%r12
+	addq	32(%rsp),%r9
+	andq	%rcx,%r12
+	rorxq	$41,%rcx,%r13
+	rorxq	$18,%rcx,%r15
+	leaq	(%r10,%r14,1),%r10
+	leaq	(%r9,%r12,1),%r9
+	andnq	%r8,%rcx,%r12
+	xorq	%r15,%r13
+	rorxq	$14,%rcx,%r14
+	leaq	(%r9,%r12,1),%r9
+	xorq	%r14,%r13
+	movq	%r10,%r15
+	rorxq	$39,%r10,%r12
+	leaq	(%r9,%r13,1),%r9
+	xorq	%r11,%r15
+	rorxq	$34,%r10,%r14
+	rorxq	$28,%r10,%r13
+	leaq	(%rbx,%r9,1),%rbx
+	andq	%r15,%rdi
+	xorq	%r12,%r14
+	xorq	%r11,%rdi
+	xorq	%r13,%r14
+	leaq	(%r9,%rdi,1),%r9
+	movq	%rcx,%r12
+	addq	40(%rsp),%r8
+	andq	%rbx,%r12
+	rorxq	$41,%rbx,%r13
+	rorxq	$18,%rbx,%rdi
+	leaq	(%r9,%r14,1),%r9
+	leaq	(%r8,%r12,1),%r8
+	andnq	%rdx,%rbx,%r12
+	xorq	%rdi,%r13
+	rorxq	$14,%rbx,%r14
+	leaq	(%r8,%r12,1),%r8
+	xorq	%r14,%r13
+	movq	%r9,%rdi
+	rorxq	$39,%r9,%r12
+	leaq	(%r8,%r13,1),%r8
+	xorq	%r10,%rdi
+	rorxq	$34,%r9,%r14
+	rorxq	$28,%r9,%r13
+	leaq	(%rax,%r8,1),%rax
+	andq	%rdi,%r15
+	xorq	%r12,%r14
+	xorq	%r10,%r15
+	xorq	%r13,%r14
+	leaq	(%r8,%r15,1),%r8
+	movq	%rbx,%r12
+	addq	64(%rsp),%rdx
+	andq	%rax,%r12
+	rorxq	$41,%rax,%r13
+	rorxq	$18,%rax,%r15
+	leaq	(%r8,%r14,1),%r8
+	leaq	(%rdx,%r12,1),%rdx
+	andnq	%rcx,%rax,%r12
+	xorq	%r15,%r13
+	rorxq	$14,%rax,%r14
+	leaq	(%rdx,%r12,1),%rdx
+	xorq	%r14,%r13
+	movq	%r8,%r15
+	rorxq	$39,%r8,%r12
+	leaq	(%rdx,%r13,1),%rdx
+	xorq	%r9,%r15
+	rorxq	$34,%r8,%r14
+	rorxq	$28,%r8,%r13
+	leaq	(%r11,%rdx,1),%r11
+	andq	%r15,%rdi
+	xorq	%r12,%r14
+	xorq	%r9,%rdi
+	xorq	%r13,%r14
+	leaq	(%rdx,%rdi,1),%rdx
+	movq	%rax,%r12
+	addq	72(%rsp),%rcx
+	andq	%r11,%r12
+	rorxq	$41,%r11,%r13
+	rorxq	$18,%r11,%rdi
+	leaq	(%rdx,%r14,1),%rdx
+	leaq	(%rcx,%r12,1),%rcx
+	andnq	%rbx,%r11,%r12
+	xorq	%rdi,%r13
+	rorxq	$14,%r11,%r14
+	leaq	(%rcx,%r12,1),%rcx
+	xorq	%r14,%r13
+	movq	%rdx,%rdi
+	rorxq	$39,%rdx,%r12
+	leaq	(%rcx,%r13,1),%rcx
+	xorq	%r8,%rdi
+	rorxq	$34,%rdx,%r14
+	rorxq	$28,%rdx,%r13
+	leaq	(%r10,%rcx,1),%r10
+	andq	%rdi,%r15
+	xorq	%r12,%r14
+	xorq	%r8,%r15
+	xorq	%r13,%r14
+	leaq	(%rcx,%r15,1),%rcx
+	movq	%r11,%r12
+	addq	96(%rsp),%rbx
+	andq	%r10,%r12
+	rorxq	$41,%r10,%r13
+	rorxq	$18,%r10,%r15
+	leaq	(%rcx,%r14,1),%rcx
+	leaq	(%rbx,%r12,1),%rbx
+	andnq	%rax,%r10,%r12
+	xorq	%r15,%r13
+	rorxq	$14,%r10,%r14
+	leaq	(%rbx,%r12,1),%rbx
+	xorq	%r14,%r13
+	movq	%rcx,%r15
+	rorxq	$39,%rcx,%r12
+	leaq	(%rbx,%r13,1),%rbx
+	xorq	%rdx,%r15
+	rorxq	$34,%rcx,%r14
+	rorxq	$28,%rcx,%r13
+	leaq	(%r9,%rbx,1),%r9
+	andq	%r15,%rdi
+	xorq	%r12,%r14
+	xorq	%rdx,%rdi
+	xorq	%r13,%r14
+	leaq	(%rbx,%rdi,1),%rbx
+	movq	%r10,%r12
+	addq	104(%rsp),%rax
+	andq	%r9,%r12
+	rorxq	$41,%r9,%r13
+	rorxq	$18,%r9,%rdi
+	leaq	(%rbx,%r14,1),%rbx
+	leaq	(%rax,%r12,1),%rax
+	andnq	%r11,%r9,%r12
+	xorq	%rdi,%r13
+	rorxq	$14,%r9,%r14
+	leaq	(%rax,%r12,1),%rax
+	xorq	%r14,%r13
+	movq	%rbx,%rdi
+	rorxq	$39,%rbx,%r12
+	leaq	(%rax,%r13,1),%rax
+	xorq	%rcx,%rdi
+	rorxq	$34,%rbx,%r14
+	rorxq	$28,%rbx,%r13
+	leaq	(%r8,%rax,1),%r8
+	andq	%rdi,%r15
+	xorq	%r12,%r14
+	xorq	%rcx,%r15
+	xorq	%r13,%r14
+	leaq	(%rax,%r15,1),%rax
+	movq	%r9,%r12
+	movq	1280(%rsp),%rdi
+	addq	%r14,%rax
+
+	leaq	1152(%rsp),%rbp
+
+	addq	0(%rdi),%rax
+	addq	8(%rdi),%rbx
+	addq	16(%rdi),%rcx
+	addq	24(%rdi),%rdx
+	addq	32(%rdi),%r8
+	addq	40(%rdi),%r9
+	addq	48(%rdi),%r10
+	addq	56(%rdi),%r11
+
+	movq	%rax,0(%rdi)
+	movq	%rbx,8(%rdi)
+	movq	%rcx,16(%rdi)
+	movq	%rdx,24(%rdi)
+	movq	%r8,32(%rdi)
+	movq	%r9,40(%rdi)
+	movq	%r10,48(%rdi)
+	movq	%r11,56(%rdi)
+
+	cmpq	144(%rbp),%rsi
+	je	.Ldone_avx2
+
+	xorq	%r14,%r14
+	movq	%rbx,%rdi
+	xorq	%rcx,%rdi
+	movq	%r9,%r12
+	jmp	.Lower_avx2
+.align	16
+.Lower_avx2:
+	addq	0+16(%rbp),%r11
+	andq	%r8,%r12
+	rorxq	$41,%r8,%r13
+	rorxq	$18,%r8,%r15
+	leaq	(%rax,%r14,1),%rax
+	leaq	(%r11,%r12,1),%r11
+	andnq	%r10,%r8,%r12
+	xorq	%r15,%r13
+	rorxq	$14,%r8,%r14
+	leaq	(%r11,%r12,1),%r11
+	xorq	%r14,%r13
+	movq	%rax,%r15
+	rorxq	$39,%rax,%r12
+	leaq	(%r11,%r13,1),%r11
+	xorq	%rbx,%r15
+	rorxq	$34,%rax,%r14
+	rorxq	$28,%rax,%r13
+	leaq	(%rdx,%r11,1),%rdx
+	andq	%r15,%rdi
+	xorq	%r12,%r14
+	xorq	%rbx,%rdi
+	xorq	%r13,%r14
+	leaq	(%r11,%rdi,1),%r11
+	movq	%r8,%r12
+	addq	8+16(%rbp),%r10
+	andq	%rdx,%r12
+	rorxq	$41,%rdx,%r13
+	rorxq	$18,%rdx,%rdi
+	leaq	(%r11,%r14,1),%r11
+	leaq	(%r10,%r12,1),%r10
+	andnq	%r9,%rdx,%r12
+	xorq	%rdi,%r13
+	rorxq	$14,%rdx,%r14
+	leaq	(%r10,%r12,1),%r10
+	xorq	%r14,%r13
+	movq	%r11,%rdi
+	rorxq	$39,%r11,%r12
+	leaq	(%r10,%r13,1),%r10
+	xorq	%rax,%rdi
+	rorxq	$34,%r11,%r14
+	rorxq	$28,%r11,%r13
+	leaq	(%rcx,%r10,1),%rcx
+	andq	%rdi,%r15
+	xorq	%r12,%r14
+	xorq	%rax,%r15
+	xorq	%r13,%r14
+	leaq	(%r10,%r15,1),%r10
+	movq	%rdx,%r12
+	addq	32+16(%rbp),%r9
+	andq	%rcx,%r12
+	rorxq	$41,%rcx,%r13
+	rorxq	$18,%rcx,%r15
+	leaq	(%r10,%r14,1),%r10
+	leaq	(%r9,%r12,1),%r9
+	andnq	%r8,%rcx,%r12
+	xorq	%r15,%r13
+	rorxq	$14,%rcx,%r14
+	leaq	(%r9,%r12,1),%r9
+	xorq	%r14,%r13
+	movq	%r10,%r15
+	rorxq	$39,%r10,%r12
+	leaq	(%r9,%r13,1),%r9
+	xorq	%r11,%r15
+	rorxq	$34,%r10,%r14
+	rorxq	$28,%r10,%r13
+	leaq	(%rbx,%r9,1),%rbx
+	andq	%r15,%rdi
+	xorq	%r12,%r14
+	xorq	%r11,%rdi
+	xorq	%r13,%r14
+	leaq	(%r9,%rdi,1),%r9
+	movq	%rcx,%r12
+	addq	40+16(%rbp),%r8
+	andq	%rbx,%r12
+	rorxq	$41,%rbx,%r13
+	rorxq	$18,%rbx,%rdi
+	leaq	(%r9,%r14,1),%r9
+	leaq	(%r8,%r12,1),%r8
+	andnq	%rdx,%rbx,%r12
+	xorq	%rdi,%r13
+	rorxq	$14,%rbx,%r14
+	leaq	(%r8,%r12,1),%r8
+	xorq	%r14,%r13
+	movq	%r9,%rdi
+	rorxq	$39,%r9,%r12
+	leaq	(%r8,%r13,1),%r8
+	xorq	%r10,%rdi
+	rorxq	$34,%r9,%r14
+	rorxq	$28,%r9,%r13
+	leaq	(%rax,%r8,1),%rax
+	andq	%rdi,%r15
+	xorq	%r12,%r14
+	xorq	%r10,%r15
+	xorq	%r13,%r14
+	leaq	(%r8,%r15,1),%r8
+	movq	%rbx,%r12
+	addq	64+16(%rbp),%rdx
+	andq	%rax,%r12
+	rorxq	$41,%rax,%r13
+	rorxq	$18,%rax,%r15
+	leaq	(%r8,%r14,1),%r8
+	leaq	(%rdx,%r12,1),%rdx
+	andnq	%rcx,%rax,%r12
+	xorq	%r15,%r13
+	rorxq	$14,%rax,%r14
+	leaq	(%rdx,%r12,1),%rdx
+	xorq	%r14,%r13
+	movq	%r8,%r15
+	rorxq	$39,%r8,%r12
+	leaq	(%rdx,%r13,1),%rdx
+	xorq	%r9,%r15
+	rorxq	$34,%r8,%r14
+	rorxq	$28,%r8,%r13
+	leaq	(%r11,%rdx,1),%r11
+	andq	%r15,%rdi
+	xorq	%r12,%r14
+	xorq	%r9,%rdi
+	xorq	%r13,%r14
+	leaq	(%rdx,%rdi,1),%rdx
+	movq	%rax,%r12
+	addq	72+16(%rbp),%rcx
+	andq	%r11,%r12
+	rorxq	$41,%r11,%r13
+	rorxq	$18,%r11,%rdi
+	leaq	(%rdx,%r14,1),%rdx
+	leaq	(%rcx,%r12,1),%rcx
+	andnq	%rbx,%r11,%r12
+	xorq	%rdi,%r13
+	rorxq	$14,%r11,%r14
+	leaq	(%rcx,%r12,1),%rcx
+	xorq	%r14,%r13
+	movq	%rdx,%rdi
+	rorxq	$39,%rdx,%r12
+	leaq	(%rcx,%r13,1),%rcx
+	xorq	%r8,%rdi
+	rorxq	$34,%rdx,%r14
+	rorxq	$28,%rdx,%r13
+	leaq	(%r10,%rcx,1),%r10
+	andq	%rdi,%r15
+	xorq	%r12,%r14
+	xorq	%r8,%r15
+	xorq	%r13,%r14
+	leaq	(%rcx,%r15,1),%rcx
+	movq	%r11,%r12
+	addq	96+16(%rbp),%rbx
+	andq	%r10,%r12
+	rorxq	$41,%r10,%r13
+	rorxq	$18,%r10,%r15
+	leaq	(%rcx,%r14,1),%rcx
+	leaq	(%rbx,%r12,1),%rbx
+	andnq	%rax,%r10,%r12
+	xorq	%r15,%r13
+	rorxq	$14,%r10,%r14
+	leaq	(%rbx,%r12,1),%rbx
+	xorq	%r14,%r13
+	movq	%rcx,%r15
+	rorxq	$39,%rcx,%r12
+	leaq	(%rbx,%r13,1),%rbx
+	xorq	%rdx,%r15
+	rorxq	$34,%rcx,%r14
+	rorxq	$28,%rcx,%r13
+	leaq	(%r9,%rbx,1),%r9
+	andq	%r15,%rdi
+	xorq	%r12,%r14
+	xorq	%rdx,%rdi
+	xorq	%r13,%r14
+	leaq	(%rbx,%rdi,1),%rbx
+	movq	%r10,%r12
+	addq	104+16(%rbp),%rax
+	andq	%r9,%r12
+	rorxq	$41,%r9,%r13
+	rorxq	$18,%r9,%rdi
+	leaq	(%rbx,%r14,1),%rbx
+	leaq	(%rax,%r12,1),%rax
+	andnq	%r11,%r9,%r12
+	xorq	%rdi,%r13
+	rorxq	$14,%r9,%r14
+	leaq	(%rax,%r12,1),%rax
+	xorq	%r14,%r13
+	movq	%rbx,%rdi
+	rorxq	$39,%rbx,%r12
+	leaq	(%rax,%r13,1),%rax
+	xorq	%rcx,%rdi
+	rorxq	$34,%rbx,%r14
+	rorxq	$28,%rbx,%r13
+	leaq	(%r8,%rax,1),%r8
+	andq	%rdi,%r15
+	xorq	%r12,%r14
+	xorq	%rcx,%r15
+	xorq	%r13,%r14
+	leaq	(%rax,%r15,1),%rax
+	movq	%r9,%r12
+	leaq	-128(%rbp),%rbp
+	cmpq	%rsp,%rbp
+	jae	.Lower_avx2
+
+	movq	1280(%rsp),%rdi
+	addq	%r14,%rax
+
+	leaq	1152(%rsp),%rsp
+
+	addq	0(%rdi),%rax
+	addq	8(%rdi),%rbx
+	addq	16(%rdi),%rcx
+	addq	24(%rdi),%rdx
+	addq	32(%rdi),%r8
+	addq	40(%rdi),%r9
+	leaq	256(%rsi),%rsi
+	addq	48(%rdi),%r10
+	movq	%rsi,%r12
+	addq	56(%rdi),%r11
+	cmpq	128+16(%rsp),%rsi
+
+	movq	%rax,0(%rdi)
+	cmoveq	%rsp,%r12
+	movq	%rbx,8(%rdi)
+	movq	%rcx,16(%rdi)
+	movq	%rdx,24(%rdi)
+	movq	%r8,32(%rdi)
+	movq	%r9,40(%rdi)
+	movq	%r10,48(%rdi)
+	movq	%r11,56(%rdi)
+
+	jbe	.Loop_avx2
+	leaq	(%rsp),%rbp
+
+.Ldone_avx2:
+	leaq	(%rbp),%rsp
+	movq	128+24(%rsp),%rsi
+	vzeroupper
+	movq	(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lepilogue_avx2:
+	.byte	0xf3,0xc3
+.size	sha512_block_data_order_avx2,.-sha512_block_data_order_avx2
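For reference, the AVX and AVX2 bodies above interleave the scalar SHA-512 rounds (the shrdq/rorxq, andq/andnq sequences) with a vectorized message schedule (the vpalignr/vpsrlq/vpsllq/vpxor sequences); the rotate counts they use, 14/18/41, 28/34/39, 1/8 with shift 7, and 19/61 with shift 6, are the standard FIPS 180-4 Sigma and sigma functions. A minimal scalar C sketch of one round follows. It is a reference model only, not code from this file, and the names are illustrative; Kt_plus_Wt stands for the pre-added K[t]+W[t] word that the assembly stores on the stack with vpaddq/vmovdqa and later consumes with addq N(%rsp).

#include <stdint.h>

/*
 * Scalar reference for the SHA-512 round implemented above (FIPS 180-4).
 * Illustrative only: the assembly keeps the eight working variables in
 * %rax-%rdx and %r8-%r11 and computes Maj() with the usual a^b carry
 * trick in %rdi/%r15, but the arithmetic is the same.
 */
#define ROTR64(x, n)	(((x) >> (n)) | ((x) << (64 - (n))))
#define Sigma0(a)	(ROTR64(a, 28) ^ ROTR64(a, 34) ^ ROTR64(a, 39))
#define Sigma1(e)	(ROTR64(e, 14) ^ ROTR64(e, 18) ^ ROTR64(e, 41))
#define Ch(e, f, g)	(((e) & (f)) ^ (~(e) & (g)))
#define Maj(a, b, c)	(((a) & (b)) ^ ((a) & (c)) ^ ((b) & (c)))

static void
sha512_round(uint64_t s[8], uint64_t Kt_plus_Wt)
{
	uint64_t T1 = s[7] + Sigma1(s[4]) + Ch(s[4], s[5], s[6]) + Kt_plus_Wt;
	uint64_t T2 = Sigma0(s[0]) + Maj(s[0], s[1], s[2]);

	s[7] = s[6]; s[6] = s[5]; s[5] = s[4]; s[4] = s[3] + T1;
	s[3] = s[2]; s[2] = s[1]; s[1] = s[0]; s[0] = T1 + T2;
}

/*
 * The vector halves of the rounds compute the message schedule
 * W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16], with
 * sigma0(x) = ROTR64(x, 1) ^ ROTR64(x, 8) ^ (x >> 7) and
 * sigma1(x) = ROTR64(x, 19) ^ ROTR64(x, 61) ^ (x >> 6).
 */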

Modified: trunk/secure/lib/libcrypto/amd64/vpaes-x86_64.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/vpaes-x86_64.S	2019-01-20 05:38:02 UTC (rev 12152)
+++ trunk/secure/lib/libcrypto/amd64/vpaes-x86_64.S	2019-01-20 05:38:15 UTC (rev 12153)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/amd64/vpaes-x86_64.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from vpaes-x86_64.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/vpaes-x86_64.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from vpaes-x86_64.pl. */
 .text	
 
 
@@ -34,8 +34,8 @@
 	movdqa	.Lk_ipt+16(%rip),%xmm0
 .byte	102,15,56,0,193
 	pxor	%xmm5,%xmm2
+	addq	$16,%r9
 	pxor	%xmm2,%xmm0
-	addq	$16,%r9
 	leaq	.Lk_mc_backward(%rip),%r10
 	jmp	.Lenc_entry
 
@@ -43,19 +43,19 @@
 .Lenc_loop:
 
 	movdqa	%xmm13,%xmm4
+	movdqa	%xmm12,%xmm0
 .byte	102,15,56,0,226
+.byte	102,15,56,0,195
 	pxor	%xmm5,%xmm4
-	movdqa	%xmm12,%xmm0
-.byte	102,15,56,0,195
+	movdqa	%xmm15,%xmm5
 	pxor	%xmm4,%xmm0
-	movdqa	%xmm15,%xmm5
+	movdqa	-64(%r11,%r10,1),%xmm1
 .byte	102,15,56,0,234
-	movdqa	-64(%r11,%r10,1),%xmm1
+	movdqa	(%r11,%r10,1),%xmm4
 	movdqa	%xmm14,%xmm2
 .byte	102,15,56,0,211
+	movdqa	%xmm0,%xmm3
 	pxor	%xmm5,%xmm2
-	movdqa	(%r11,%r10,1),%xmm4
-	movdqa	%xmm0,%xmm3
 .byte	102,15,56,0,193
 	addq	$16,%r9
 	pxor	%xmm2,%xmm0
@@ -63,31 +63,31 @@
 	addq	$16,%r11
 	pxor	%xmm0,%xmm3
 .byte	102,15,56,0,193
-	andq	$48,%r11
+	andq	$0x30,%r11
+	subq	$1,%rax
 	pxor	%xmm3,%xmm0
-	subq	$1,%rax
 
 .Lenc_entry:
 
 	movdqa	%xmm9,%xmm1
+	movdqa	%xmm11,%xmm5
 	pandn	%xmm0,%xmm1
 	psrld	$4,%xmm1
 	pand	%xmm9,%xmm0
-	movdqa	%xmm11,%xmm5
 .byte	102,15,56,0,232
+	movdqa	%xmm10,%xmm3
 	pxor	%xmm1,%xmm0
-	movdqa	%xmm10,%xmm3
 .byte	102,15,56,0,217
+	movdqa	%xmm10,%xmm4
 	pxor	%xmm5,%xmm3
-	movdqa	%xmm10,%xmm4
 .byte	102,15,56,0,224
+	movdqa	%xmm10,%xmm2
 	pxor	%xmm5,%xmm4
-	movdqa	%xmm10,%xmm2
 .byte	102,15,56,0,211
+	movdqa	%xmm10,%xmm3
 	pxor	%xmm0,%xmm2
-	movdqa	%xmm10,%xmm3
+.byte	102,15,56,0,220
 	movdqu	(%r9),%xmm5
-.byte	102,15,56,0,220
 	pxor	%xmm1,%xmm3
 	jnz	.Lenc_loop
 
@@ -123,10 +123,10 @@
 	pand	%xmm9,%xmm0
 .byte	102,15,56,0,208
 	movdqa	.Lk_dipt+16(%rip),%xmm0
-	xorq	$48,%r11
+	xorq	$0x30,%r11
 	leaq	.Lk_dsbd(%rip),%r10
 .byte	102,15,56,0,193
-	andq	$48,%r11
+	andq	$0x30,%r11
 	pxor	%xmm5,%xmm2
 	movdqa	.Lk_mc_forward+48(%rip),%xmm5
 	pxor	%xmm2,%xmm0
@@ -140,62 +140,61 @@
 
 
 	movdqa	-32(%r10),%xmm4
+	movdqa	-16(%r10),%xmm1
 .byte	102,15,56,0,226
-	pxor	%xmm0,%xmm4
-	movdqa	-16(%r10),%xmm0
-.byte	102,15,56,0,195
+.byte	102,15,56,0,203
 	pxor	%xmm4,%xmm0
-	addq	$16,%r9
+	movdqa	0(%r10),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	16(%r10),%xmm1
 
+.byte	102,15,56,0,226
 .byte	102,15,56,0,197
-	movdqa	0(%r10),%xmm4
-.byte	102,15,56,0,226
-	pxor	%xmm0,%xmm4
-	movdqa	16(%r10),%xmm0
-.byte	102,15,56,0,195
+.byte	102,15,56,0,203
 	pxor	%xmm4,%xmm0
-	subq	$1,%rax
+	movdqa	32(%r10),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	48(%r10),%xmm1
 
+.byte	102,15,56,0,226
 .byte	102,15,56,0,197
-	movdqa	32(%r10),%xmm4
-.byte	102,15,56,0,226
-	pxor	%xmm0,%xmm4
-	movdqa	48(%r10),%xmm0
-.byte	102,15,56,0,195
+.byte	102,15,56,0,203
 	pxor	%xmm4,%xmm0
+	movdqa	64(%r10),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	80(%r10),%xmm1
 
+.byte	102,15,56,0,226
 .byte	102,15,56,0,197
-	movdqa	64(%r10),%xmm4
-.byte	102,15,56,0,226
-	pxor	%xmm0,%xmm4
-	movdqa	80(%r10),%xmm0
-.byte	102,15,56,0,195
+.byte	102,15,56,0,203
 	pxor	%xmm4,%xmm0
-
+	addq	$16,%r9
 .byte	102,15,58,15,237,12
+	pxor	%xmm1,%xmm0
+	subq	$1,%rax
 
 .Ldec_entry:
 
 	movdqa	%xmm9,%xmm1
 	pandn	%xmm0,%xmm1
+	movdqa	%xmm11,%xmm2
 	psrld	$4,%xmm1
 	pand	%xmm9,%xmm0
-	movdqa	%xmm11,%xmm2
 .byte	102,15,56,0,208
+	movdqa	%xmm10,%xmm3
 	pxor	%xmm1,%xmm0
-	movdqa	%xmm10,%xmm3
 .byte	102,15,56,0,217
+	movdqa	%xmm10,%xmm4
 	pxor	%xmm2,%xmm3
-	movdqa	%xmm10,%xmm4
 .byte	102,15,56,0,224
 	pxor	%xmm2,%xmm4
 	movdqa	%xmm10,%xmm2
 .byte	102,15,56,0,211
+	movdqa	%xmm10,%xmm3
 	pxor	%xmm0,%xmm2
-	movdqa	%xmm10,%xmm3
 .byte	102,15,56,0,220
+	movdqu	(%r9),%xmm0
 	pxor	%xmm1,%xmm3
-	movdqu	(%r9),%xmm0
 	jnz	.Ldec_loop
 
 
@@ -223,7 +222,7 @@
 
 
 
-	call	_vpaes_preheat		
+	call	_vpaes_preheat
 	movdqa	.Lk_rcon(%rip),%xmm8
 	movdqu	(%rdi),%xmm0
 
@@ -246,7 +245,7 @@
 	movdqa	(%r8,%r10,1),%xmm1
 .byte	102,15,56,0,217
 	movdqu	%xmm3,(%rdx)
-	xorq	$48,%r8
+	xorq	$0x30,%r8
 
 .Lschedule_go:
 	cmpl	$192,%esi
@@ -269,7 +268,7 @@
 	call	_vpaes_schedule_round
 	decq	%rsi
 	jz	.Lschedule_mangle_last
-	call	_vpaes_schedule_mangle	
+	call	_vpaes_schedule_mangle
 	jmp	.Loop_schedule_128
 
 
@@ -290,7 +289,7 @@
 .align	16
 .Lschedule_192:
 	movdqu	8(%rdi),%xmm0
-	call	_vpaes_schedule_transform	
+	call	_vpaes_schedule_transform
 	movdqa	%xmm0,%xmm6
 	pxor	%xmm4,%xmm4
 	movhlps	%xmm4,%xmm6
@@ -299,13 +298,13 @@
 .Loop_schedule_192:
 	call	_vpaes_schedule_round
 .byte	102,15,58,15,198,8
-	call	_vpaes_schedule_mangle	
+	call	_vpaes_schedule_mangle
 	call	_vpaes_schedule_192_smear
-	call	_vpaes_schedule_mangle	
+	call	_vpaes_schedule_mangle
 	call	_vpaes_schedule_round
 	decq	%rsi
 	jz	.Lschedule_mangle_last
-	call	_vpaes_schedule_mangle	
+	call	_vpaes_schedule_mangle
 	call	_vpaes_schedule_192_smear
 	jmp	.Loop_schedule_192
 
@@ -322,11 +321,11 @@
 .align	16
 .Lschedule_256:
 	movdqu	16(%rdi),%xmm0
-	call	_vpaes_schedule_transform	
+	call	_vpaes_schedule_transform
 	movl	$7,%esi
 
 .Loop_schedule_256:
-	call	_vpaes_schedule_mangle	
+	call	_vpaes_schedule_mangle
 	movdqa	%xmm0,%xmm6
 
 
@@ -333,10 +332,10 @@
 	call	_vpaes_schedule_round
 	decq	%rsi
 	jz	.Lschedule_mangle_last
-	call	_vpaes_schedule_mangle	
+	call	_vpaes_schedule_mangle
 
 
-	pshufd	$255,%xmm0,%xmm0
+	pshufd	$0xFF,%xmm0,%xmm0
 	movdqa	%xmm7,%xmm5
 	movdqa	%xmm6,%xmm7
 	call	_vpaes_schedule_low_round
@@ -371,7 +370,7 @@
 .Lschedule_mangle_last_dec:
 	addq	$-16,%rdx
 	pxor	.Lk_s63(%rip),%xmm0
-	call	_vpaes_schedule_transform 
+	call	_vpaes_schedule_transform
 	movdqu	%xmm0,(%rdx)
 
 
@@ -403,12 +402,12 @@
 .type	_vpaes_schedule_192_smear, at function
 .align	16
 _vpaes_schedule_192_smear:
-	pshufd	$128,%xmm6,%xmm0
+	pshufd	$0x80,%xmm6,%xmm1
+	pshufd	$0xFE,%xmm7,%xmm0
+	pxor	%xmm1,%xmm6
+	pxor	%xmm1,%xmm1
 	pxor	%xmm0,%xmm6
-	pshufd	$254,%xmm7,%xmm0
-	pxor	%xmm0,%xmm6
 	movdqa	%xmm6,%xmm0
-	pxor	%xmm1,%xmm1
 	movhlps	%xmm1,%xmm6
 	.byte	0xf3,0xc3
 .size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
@@ -441,7 +440,7 @@
 	pxor	%xmm1,%xmm7
 
 
-	pshufd	$255,%xmm0,%xmm0
+	pshufd	$0xFF,%xmm0,%xmm0
 .byte	102,15,58,15,192,1
 
 
@@ -600,7 +599,7 @@
 	movdqa	(%r8,%r10,1),%xmm1
 .byte	102,15,56,0,217
 	addq	$-16,%r8
-	andq	$48,%r8
+	andq	$0x30,%r8
 	movdqu	%xmm3,(%rdx)
 	.byte	0xf3,0xc3
 .size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
@@ -618,7 +617,7 @@
 	movl	%eax,240(%rdx)
 
 	movl	$0,%ecx
-	movl	$48,%r8d
+	movl	$0x30,%r8d
 	call	_vpaes_schedule_core
 	xorl	%eax,%eax
 	.byte	0xf3,0xc3
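The vpaes hunks above mostly rewrite immediates in hexadecimal (48 becomes 0x30, 255 becomes 0xFF, and so on) and reschedule neighbouring, independent movdqa loads, while _vpaes_schedule_192_smear is restructured around %xmm1 as its scratch register. The 0x30 mask drives a rotating, 16-byte-aligned offset for the key-schedule "mangle" step; a small hedged C model of that arithmetic is below. The helper names are made up for illustration, the constants are the ones visible in the hunks.

#include <assert.h>
#include <stdint.h>

/*
 * Illustrative model only.  The schedule keeps a byte offset that is
 * stepped by 16 and masked with 0x30, so it cycles through the four
 * 16-byte-aligned slots 0x00, 0x10, 0x20, 0x30 that index
 * (%r8,%r10,1) / (%r11,%r10,1) in the code above.
 */
static inline uint64_t
vpaes_mangle_step_fwd(uint64_t off)	/* addq $16,%rX; andq $0x30,%rX */
{
	return (off + 16) & 0x30;
}

static inline uint64_t
vpaes_mangle_step_back(uint64_t off)	/* addq $-16,%rX; andq $0x30,%rX */
{
	return (off - 16) & 0x30;
}

int
main(void)
{
	assert(48 == 0x30 && 255 == 0xFF);	/* the immediate rewrite changes no values */
	assert(vpaes_mangle_step_fwd(0x30) == 0x00);
	assert(vpaes_mangle_step_back(0x00) == 0x30);
	return 0;
}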

Modified: trunk/secure/lib/libcrypto/amd64/wp-x86_64.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/wp-x86_64.S	2019-01-20 05:38:02 UTC (rev 12152)
+++ trunk/secure/lib/libcrypto/amd64/wp-x86_64.S	2019-01-20 05:38:15 UTC (rev 12153)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/amd64/wp-x86_64.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from wp-x86_64.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/wp-x86_64.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from wp-x86_64.pl. */
 .text	
 
 .globl	whirlpool_block
@@ -65,233 +65,236 @@
 	movq	%r15,64+56(%rsp)
 	xorq	%rsi,%rsi
 	movq	%rsi,24(%rbx)
+	jmp	.Lround
 .align	16
 .Lround:
 	movq	4096(%rbp,%rsi,8),%r8
 	movl	0(%rsp),%eax
 	movl	4(%rsp),%ebx
-	movb	%al,%cl
-	movb	%ah,%dl
+	movzbl	%al,%ecx
+	movzbl	%ah,%edx
+	shrl	$16,%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	xorq	0(%rbp,%rsi,8),%r8
 	movq	7(%rbp,%rdi,8),%r9
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	0+8(%rsp),%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%bh,%edx
 	movq	6(%rbp,%rsi,8),%r10
 	movq	5(%rbp,%rdi,8),%r11
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	movq	4(%rbp,%rsi,8),%r12
 	movq	3(%rbp,%rdi,8),%r13
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	0+8+4(%rsp),%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%ah,%edx
 	movq	2(%rbp,%rsi,8),%r14
 	movq	1(%rbp,%rdi,8),%r15
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	xorq	0(%rbp,%rsi,8),%r9
 	xorq	7(%rbp,%rdi,8),%r10
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	8+8(%rsp),%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%bh,%edx
 	xorq	6(%rbp,%rsi,8),%r11
 	xorq	5(%rbp,%rdi,8),%r12
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	xorq	4(%rbp,%rsi,8),%r13
 	xorq	3(%rbp,%rdi,8),%r14
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	8+8+4(%rsp),%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%ah,%edx
 	xorq	2(%rbp,%rsi,8),%r15
 	xorq	1(%rbp,%rdi,8),%r8
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	xorq	0(%rbp,%rsi,8),%r10
 	xorq	7(%rbp,%rdi,8),%r11
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	16+8(%rsp),%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%bh,%edx
 	xorq	6(%rbp,%rsi,8),%r12
 	xorq	5(%rbp,%rdi,8),%r13
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	xorq	4(%rbp,%rsi,8),%r14
 	xorq	3(%rbp,%rdi,8),%r15
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	16+8+4(%rsp),%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%ah,%edx
 	xorq	2(%rbp,%rsi,8),%r8
 	xorq	1(%rbp,%rdi,8),%r9
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	xorq	0(%rbp,%rsi,8),%r11
 	xorq	7(%rbp,%rdi,8),%r12
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	24+8(%rsp),%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%bh,%edx
 	xorq	6(%rbp,%rsi,8),%r13
 	xorq	5(%rbp,%rdi,8),%r14
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	xorq	4(%rbp,%rsi,8),%r15
 	xorq	3(%rbp,%rdi,8),%r8
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	24+8+4(%rsp),%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%ah,%edx
 	xorq	2(%rbp,%rsi,8),%r9
 	xorq	1(%rbp,%rdi,8),%r10
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	xorq	0(%rbp,%rsi,8),%r12
 	xorq	7(%rbp,%rdi,8),%r13
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	32+8(%rsp),%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%bh,%edx
 	xorq	6(%rbp,%rsi,8),%r14
 	xorq	5(%rbp,%rdi,8),%r15
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	xorq	4(%rbp,%rsi,8),%r8
 	xorq	3(%rbp,%rdi,8),%r9
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	32+8+4(%rsp),%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%ah,%edx
 	xorq	2(%rbp,%rsi,8),%r10
 	xorq	1(%rbp,%rdi,8),%r11
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	xorq	0(%rbp,%rsi,8),%r13
 	xorq	7(%rbp,%rdi,8),%r14
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	40+8(%rsp),%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%bh,%edx
 	xorq	6(%rbp,%rsi,8),%r15
 	xorq	5(%rbp,%rdi,8),%r8
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	xorq	4(%rbp,%rsi,8),%r9
 	xorq	3(%rbp,%rdi,8),%r10
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	40+8+4(%rsp),%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%ah,%edx
 	xorq	2(%rbp,%rsi,8),%r11
 	xorq	1(%rbp,%rdi,8),%r12
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	xorq	0(%rbp,%rsi,8),%r14
 	xorq	7(%rbp,%rdi,8),%r15
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	48+8(%rsp),%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%bh,%edx
 	xorq	6(%rbp,%rsi,8),%r8
 	xorq	5(%rbp,%rdi,8),%r9
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	xorq	4(%rbp,%rsi,8),%r10
 	xorq	3(%rbp,%rdi,8),%r11
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	48+8+4(%rsp),%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%ah,%edx
 	xorq	2(%rbp,%rsi,8),%r12
 	xorq	1(%rbp,%rdi,8),%r13
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	xorq	0(%rbp,%rsi,8),%r15
 	xorq	7(%rbp,%rdi,8),%r8
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	56+8(%rsp),%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%bh,%edx
 	xorq	6(%rbp,%rsi,8),%r9
 	xorq	5(%rbp,%rdi,8),%r10
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	xorq	4(%rbp,%rsi,8),%r11
 	xorq	3(%rbp,%rdi,8),%r12
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	56+8+4(%rsp),%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%ah,%edx
 	xorq	2(%rbp,%rsi,8),%r13
 	xorq	1(%rbp,%rdi,8),%r14
 	movq	%r8,0(%rsp)
@@ -302,228 +305,228 @@
 	movq	%r13,40(%rsp)
 	movq	%r14,48(%rsp)
 	movq	%r15,56(%rsp)
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	xorq	0(%rbp,%rsi,8),%r8
 	xorq	7(%rbp,%rdi,8),%r9
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	64+0+8(%rsp),%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%bh,%edx
 	xorq	6(%rbp,%rsi,8),%r10
 	xorq	5(%rbp,%rdi,8),%r11
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	xorq	4(%rbp,%rsi,8),%r12
 	xorq	3(%rbp,%rdi,8),%r13
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	64+0+8+4(%rsp),%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%ah,%edx
 	xorq	2(%rbp,%rsi,8),%r14
 	xorq	1(%rbp,%rdi,8),%r15
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	xorq	0(%rbp,%rsi,8),%r9
 	xorq	7(%rbp,%rdi,8),%r10
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	64+8+8(%rsp),%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%bh,%edx
 	xorq	6(%rbp,%rsi,8),%r11
 	xorq	5(%rbp,%rdi,8),%r12
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	xorq	4(%rbp,%rsi,8),%r13
 	xorq	3(%rbp,%rdi,8),%r14
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	64+8+8+4(%rsp),%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%ah,%edx
 	xorq	2(%rbp,%rsi,8),%r15
 	xorq	1(%rbp,%rdi,8),%r8
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	xorq	0(%rbp,%rsi,8),%r10
 	xorq	7(%rbp,%rdi,8),%r11
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	64+16+8(%rsp),%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%bh,%edx
 	xorq	6(%rbp,%rsi,8),%r12
 	xorq	5(%rbp,%rdi,8),%r13
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	xorq	4(%rbp,%rsi,8),%r14
 	xorq	3(%rbp,%rdi,8),%r15
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	64+16+8+4(%rsp),%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%ah,%edx
 	xorq	2(%rbp,%rsi,8),%r8
 	xorq	1(%rbp,%rdi,8),%r9
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	xorq	0(%rbp,%rsi,8),%r11
 	xorq	7(%rbp,%rdi,8),%r12
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	64+24+8(%rsp),%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%bh,%edx
 	xorq	6(%rbp,%rsi,8),%r13
 	xorq	5(%rbp,%rdi,8),%r14
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	xorq	4(%rbp,%rsi,8),%r15
 	xorq	3(%rbp,%rdi,8),%r8
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	64+24+8+4(%rsp),%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%ah,%edx
 	xorq	2(%rbp,%rsi,8),%r9
 	xorq	1(%rbp,%rdi,8),%r10
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	xorq	0(%rbp,%rsi,8),%r12
 	xorq	7(%rbp,%rdi,8),%r13
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	64+32+8(%rsp),%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%bh,%edx
 	xorq	6(%rbp,%rsi,8),%r14
 	xorq	5(%rbp,%rdi,8),%r15
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	xorq	4(%rbp,%rsi,8),%r8
 	xorq	3(%rbp,%rdi,8),%r9
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	64+32+8+4(%rsp),%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%ah,%edx
 	xorq	2(%rbp,%rsi,8),%r10
 	xorq	1(%rbp,%rdi,8),%r11
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	xorq	0(%rbp,%rsi,8),%r13
 	xorq	7(%rbp,%rdi,8),%r14
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	64+40+8(%rsp),%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%bh,%edx
 	xorq	6(%rbp,%rsi,8),%r15
 	xorq	5(%rbp,%rdi,8),%r8
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	xorq	4(%rbp,%rsi,8),%r9
 	xorq	3(%rbp,%rdi,8),%r10
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	64+40+8+4(%rsp),%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%ah,%edx
 	xorq	2(%rbp,%rsi,8),%r11
 	xorq	1(%rbp,%rdi,8),%r12
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	xorq	0(%rbp,%rsi,8),%r14
 	xorq	7(%rbp,%rdi,8),%r15
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	64+48+8(%rsp),%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%bh,%edx
 	xorq	6(%rbp,%rsi,8),%r8
 	xorq	5(%rbp,%rdi,8),%r9
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	xorq	4(%rbp,%rsi,8),%r10
 	xorq	3(%rbp,%rdi,8),%r11
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	64+48+8+4(%rsp),%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%ah,%edx
 	xorq	2(%rbp,%rsi,8),%r12
 	xorq	1(%rbp,%rdi,8),%r13
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	xorq	0(%rbp,%rsi,8),%r15
 	xorq	7(%rbp,%rdi,8),%r8
-	movb	%al,%cl
-	movb	%ah,%dl
 
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%bh,%edx
 	xorq	6(%rbp,%rsi,8),%r9
 	xorq	5(%rbp,%rdi,8),%r10
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%bl,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	xorq	4(%rbp,%rsi,8),%r11
 	xorq	3(%rbp,%rdi,8),%r12
-	movb	%bl,%cl
-	movb	%bh,%dl
 
 	leaq	(%rcx,%rcx,1),%rsi
+	movzbl	%al,%ecx
 	leaq	(%rdx,%rdx,1),%rdi
+	movzbl	%ah,%edx
 	xorq	2(%rbp,%rsi,8),%r13
 	xorq	1(%rbp,%rdi,8),%r14
 	leaq	128(%rsp),%rbx

Modified: trunk/secure/lib/libcrypto/amd64/x86_64-gf2m.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/x86_64-gf2m.S	2019-01-20 05:38:02 UTC (rev 12152)
+++ trunk/secure/lib/libcrypto/amd64/x86_64-gf2m.S	2019-01-20 05:38:15 UTC (rev 12153)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/amd64/x86_64-gf2m.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from x86_64-gf2m.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/x86_64-gf2m.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from x86_64-gf2m.pl. */
 .text	
 
 .type	_mul_1x1, at function
@@ -245,16 +245,16 @@
 	movq	%rcx,56(%rsp)
 	movq	%r8,64(%rsp)
 
-	movq	$15,%r8
+	movq	$0xf,%r8
 	movq	%rsi,%rax
 	movq	%rcx,%rbp
-	call	_mul_1x1		
+	call	_mul_1x1
 	movq	%rax,16(%rsp)
 	movq	%rdx,24(%rsp)
 
 	movq	48(%rsp),%rax
 	movq	64(%rsp),%rbp
-	call	_mul_1x1		
+	call	_mul_1x1
 	movq	%rax,0(%rsp)
 	movq	%rdx,8(%rsp)
 
@@ -262,7 +262,7 @@
 	movq	56(%rsp),%rbp
 	xorq	48(%rsp),%rax
 	xorq	64(%rsp),%rbp
-	call	_mul_1x1		
+	call	_mul_1x1
 	movq	0(%rsp),%rbx
 	movq	8(%rsp),%rcx
 	movq	16(%rsp),%rdi

Modified: trunk/secure/lib/libcrypto/amd64/x86_64-mont.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/x86_64-mont.S	2019-01-20 05:38:02 UTC (rev 12152)
+++ trunk/secure/lib/libcrypto/amd64/x86_64-mont.S	2019-01-20 05:38:15 UTC (rev 12153)
@@ -1,19 +1,26 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/amd64/x86_64-mont.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from x86_64-mont.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/x86_64-mont.S 337982 2018-08-17 18:32:53Z jkim $ */
+/* Do not modify. This file is auto-generated from x86_64-mont.pl. */
 .text	
 
+
+
 .globl	bn_mul_mont
 .type	bn_mul_mont, at function
 .align	16
 bn_mul_mont:
+	movl	%r9d,%r9d
+	movq	%rsp,%rax
 	testl	$3,%r9d
 	jnz	.Lmul_enter
 	cmpl	$8,%r9d
 	jb	.Lmul_enter
+	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
 	cmpq	%rsi,%rdx
 	jne	.Lmul4x_enter
-	jmp	.Lsqr4x_enter
+	testl	$7,%r9d
+	jz	.Lsqr8x_enter
+	jmp	.Lmul4x_enter
 
 .align	16
 .Lmul_enter:
@@ -24,15 +31,12 @@
 	pushq	%r14
 	pushq	%r15
 
-	movl	%r9d,%r9d
-	leaq	2(%r9),%r10
+	negq	%r9
 	movq	%rsp,%r11
-	negq	%r10
-	leaq	(%rsp,%r10,8),%rsp
-	andq	$-1024,%rsp
+	leaq	-16(%rsp,%r9,8),%r10
+	negq	%r9
+	andq	$-1024,%r10
 
-	movq	%r11,8(%rsp,%r9,8)
-.Lmul_body:
 
 
 
@@ -39,14 +43,24 @@
 
 
 
-	subq	%rsp,%r11
+	subq	%r10,%r11
 	andq	$-4096,%r11
+	leaq	(%r10,%r11,1),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul_page_walk
+	jmp	.Lmul_page_walk_done
+
+.align	16
 .Lmul_page_walk:
-	movq	(%rsp,%r11,1),%r10
-	subq	$4096,%r11
-.byte	0x66,0x2e		
-	jnc	.Lmul_page_walk
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul_page_walk
+.Lmul_page_walk_done:
 
+	movq	%rax,8(%rsp,%r9,8)
+.Lmul_body:
 	movq	%rdx,%r12
 	movq	(%r8),%r8
 	movq	(%r12),%rbx
@@ -180,34 +194,34 @@
 
 	leaq	1(%r14),%r14
 	cmpq	%r9,%r14
-	jl	.Louter
+	jb	.Louter
 
 	xorq	%r14,%r14
 	movq	(%rsp),%rax
-	leaq	(%rsp),%rsi
 	movq	%r9,%r15
-	jmp	.Lsub
+
 .align	16
 .Lsub:	sbbq	(%rcx,%r14,8),%rax
 	movq	%rax,(%rdi,%r14,8)
-	movq	8(%rsi,%r14,8),%rax
+	movq	8(%rsp,%r14,8),%rax
 	leaq	1(%r14),%r14
 	decq	%r15
 	jnz	.Lsub
 
 	sbbq	$0,%rax
+	movq	$-1,%rbx
+	xorq	%rax,%rbx
 	xorq	%r14,%r14
-	andq	%rax,%rsi
-	notq	%rax
-	movq	%rdi,%rcx
-	andq	%rax,%rcx
 	movq	%r9,%r15
-	orq	%rcx,%rsi
-.align	16
+
 .Lcopy:
-	movq	(%rsi,%r14,8),%rax
-	movq	%r14,(%rsp,%r14,8)
-	movq	%rax,(%rdi,%r14,8)
+	movq	(%rdi,%r14,8),%rcx
+	movq	(%rsp,%r14,8),%rdx
+	andq	%rbx,%rcx
+	andq	%rax,%rdx
+	movq	%r9,(%rsp,%r14,8)
+	orq	%rcx,%rdx
+	movq	%rdx,(%rdi,%r14,8)
 	leaq	1(%r14),%r14
 	subq	$1,%r15
 	jnz	.Lcopy
@@ -214,13 +228,13 @@
 
 	movq	8(%rsp,%r9,8),%rsi
 	movq	$1,%rax
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 .Lmul_epilogue:
 	.byte	0xf3,0xc3
 .size	bn_mul_mont,.-bn_mul_mont
@@ -227,7 +241,12 @@
 .type	bn_mul4x_mont, at function
 .align	16
 bn_mul4x_mont:
+	movl	%r9d,%r9d
+	movq	%rsp,%rax
 .Lmul4x_enter:
+	andl	$0x80100,%r11d
+	cmpl	$0x80100,%r11d
+	je	.Lmulx4x_enter
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
@@ -235,23 +254,29 @@
 	pushq	%r14
 	pushq	%r15
 
-	movl	%r9d,%r9d
-	leaq	4(%r9),%r10
+	negq	%r9
 	movq	%rsp,%r11
-	negq	%r10
-	leaq	(%rsp,%r10,8),%rsp
-	andq	$-1024,%rsp
+	leaq	-32(%rsp,%r9,8),%r10
+	negq	%r9
+	andq	$-1024,%r10
 
-	movq	%r11,8(%rsp,%r9,8)
-.Lmul4x_body:
-	subq	%rsp,%r11
+	subq	%r10,%r11
 	andq	$-4096,%r11
+	leaq	(%r10,%r11,1),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul4x_page_walk
+	jmp	.Lmul4x_page_walk_done
+
 .Lmul4x_page_walk:
-	movq	(%rsp,%r11,1),%r10
-	subq	$4096,%r11
-.byte	0x2e			
-	jnc	.Lmul4x_page_walk
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul4x_page_walk
+.Lmul4x_page_walk_done:
 
+	movq	%rax,8(%rsp,%r9,8)
+.Lmul4x_body:
 	movq	%rdi,16(%rsp,%r9,8)
 	movq	%rdx,%r12
 	movq	(%r8),%r8
@@ -354,7 +379,7 @@
 	movq	%rdi,-32(%rsp,%r15,8)
 	movq	%rdx,%r13
 	cmpq	%r9,%r15
-	jl	.L1st4x
+	jb	.L1st4x
 
 	mulq	%rbx
 	addq	%rax,%r10
@@ -502,7 +527,7 @@
 	movq	%rdi,-32(%rsp,%r15,8)
 	movq	%rdx,%r13
 	cmpq	%r9,%r15
-	jl	.Linner4x
+	jb	.Linner4x
 
 	mulq	%rbx
 	addq	%rax,%r10
@@ -548,12 +573,12 @@
 	movq	%rdi,(%rsp,%r15,8)
 
 	cmpq	%r9,%r14
-	jl	.Louter4x
+	jb	.Louter4x
 	movq	16(%rsp,%r9,8),%rdi
+	leaq	-4(%r9),%r15
 	movq	0(%rsp),%rax
-	pxor	%xmm0,%xmm0
 	movq	8(%rsp),%rdx
-	shrq	$2,%r9
+	shrq	$2,%r15
 	leaq	(%rsp),%rsi
 	xorq	%r14,%r14
 
@@ -561,9 +586,7 @@
 	movq	16(%rsi),%rbx
 	movq	24(%rsi),%rbp
 	sbbq	8(%rcx),%rdx
-	leaq	-1(%r9),%r15
-	jmp	.Lsub4x
-.align	16
+
 .Lsub4x:
 	movq	%rax,0(%rdi,%r14,8)
 	movq	%rdx,8(%rdi,%r14,8)
@@ -590,51 +613,55 @@
 
 	sbbq	$0,%rax
 	movq	%rbp,24(%rdi,%r14,8)
-	xorq	%r14,%r14
-	andq	%rax,%rsi
-	notq	%rax
-	movq	%rdi,%rcx
-	andq	%rax,%rcx
-	leaq	-1(%r9),%r15
-	orq	%rcx,%rsi
+	pxor	%xmm0,%xmm0
+.byte	102,72,15,110,224
+	pcmpeqd	%xmm5,%xmm5
+	pshufd	$0,%xmm4,%xmm4
+	movq	%r9,%r15
+	pxor	%xmm4,%xmm5
+	shrq	$2,%r15
+	xorl	%eax,%eax
 
-	movdqu	(%rsi),%xmm1
-	movdqa	%xmm0,(%rsp)
-	movdqu	%xmm1,(%rdi)
 	jmp	.Lcopy4x
 .align	16
 .Lcopy4x:
-	movdqu	16(%rsi,%r14,1),%xmm2
-	movdqu	32(%rsi,%r14,1),%xmm1
-	movdqa	%xmm0,16(%rsp,%r14,1)
-	movdqu	%xmm2,16(%rdi,%r14,1)
-	movdqa	%xmm0,32(%rsp,%r14,1)
-	movdqu	%xmm1,32(%rdi,%r14,1)
-	leaq	32(%r14),%r14
+	movdqa	(%rsp,%rax,1),%xmm1
+	movdqu	(%rdi,%rax,1),%xmm2
+	pand	%xmm4,%xmm1
+	pand	%xmm5,%xmm2
+	movdqa	16(%rsp,%rax,1),%xmm3
+	movdqa	%xmm0,(%rsp,%rax,1)
+	por	%xmm2,%xmm1
+	movdqu	16(%rdi,%rax,1),%xmm2
+	movdqu	%xmm1,(%rdi,%rax,1)
+	pand	%xmm4,%xmm3
+	pand	%xmm5,%xmm2
+	movdqa	%xmm0,16(%rsp,%rax,1)
+	por	%xmm2,%xmm3
+	movdqu	%xmm3,16(%rdi,%rax,1)
+	leaq	32(%rax),%rax
 	decq	%r15
 	jnz	.Lcopy4x
-
-	shlq	$2,%r9
-	movdqu	16(%rsi,%r14,1),%xmm2
-	movdqa	%xmm0,16(%rsp,%r14,1)
-	movdqu	%xmm2,16(%rdi,%r14,1)
 	movq	8(%rsp,%r9,8),%rsi
 	movq	$1,%rax
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 .Lmul4x_epilogue:
 	.byte	0xf3,0xc3
 .size	bn_mul4x_mont,.-bn_mul4x_mont
-.type	bn_sqr4x_mont, at function
-.align	16
-bn_sqr4x_mont:
-.Lsqr4x_enter:
+
+
+
+.type	bn_sqr8x_mont, at function
+.align	32
+bn_sqr8x_mont:
 	movq	%rsp,%rax
+.Lsqr8x_enter:
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
@@ -641,771 +668,501 @@
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
+.Lsqr8x_prologue:
 
+	movl	%r9d,%r10d
 	shll	$3,%r9d
-	movq	%rsp,%r11
+	shlq	$3+2,%r10
 	negq	%r9
-	movq	(%r8),%r8
-	leaq	-72(%rsp,%r9,2),%rsp
-	andq	$-1024,%rsp
 
-	subq	%rsp,%r11
-	andq	$-4096,%r11
-.Lsqr4x_page_walk:
-	movq	(%rsp,%r11,1),%r10
-	subq	$4096,%r11
-.byte	0x2e			
-	jnc	.Lsqr4x_page_walk
 
-	movq	%r9,%r10
-	negq	%r9
-	leaq	-48(%rax),%r11
 
 
 
 
+	leaq	-64(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
+	movq	(%r8),%r8
+	subq	%rsi,%r11
+	andq	$4095,%r11
+	cmpq	%r11,%r10
+	jb	.Lsqr8x_sp_alt
+	subq	%r11,%rbp
+	leaq	-64(%rbp,%r9,2),%rbp
+	jmp	.Lsqr8x_sp_done
 
+.align	32
+.Lsqr8x_sp_alt:
+	leaq	4096-64(,%r9,2),%r10
+	leaq	-64(%rbp,%r9,2),%rbp
+	subq	%r10,%r11
+	movq	$0,%r10
+	cmovcq	%r10,%r11
+	subq	%r11,%rbp
+.Lsqr8x_sp_done:
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lsqr8x_page_walk
+	jmp	.Lsqr8x_page_walk_done
 
+.align	16
+.Lsqr8x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lsqr8x_page_walk
+.Lsqr8x_page_walk_done:
 
+	movq	%r9,%r10
+	negq	%r9
 
+	movq	%r8,32(%rsp)
+	movq	%rax,40(%rsp)
+.Lsqr8x_body:
 
+.byte	102,72,15,110,209
+	pxor	%xmm0,%xmm0
+.byte	102,72,15,110,207
+.byte	102,73,15,110,218
+	movl	OPENSSL_ia32cap_P+8(%rip),%eax
+	andl	$0x80100,%eax
+	cmpl	$0x80100,%eax
+	jne	.Lsqr8x_nox
 
+	call	bn_sqrx8x_internal
 
-	movq	%rdi,32(%rsp)
-	movq	%rcx,40(%rsp)
-	movq	%r8,48(%rsp)
-	movq	%r11,56(%rsp)
-.Lsqr4x_body:
 
 
 
+	leaq	(%r8,%rcx,1),%rbx
+	movq	%rcx,%r9
+	movq	%rcx,%rdx
+.byte	102,72,15,126,207
+	sarq	$3+2,%rcx
+	jmp	.Lsqr8x_sub
 
+.align	32
+.Lsqr8x_nox:
+	call	bn_sqr8x_internal
 
 
 
-	leaq	32(%r10),%rbp
-	leaq	(%rsi,%r9,1),%rsi
 
+	leaq	(%rdi,%r9,1),%rbx
 	movq	%r9,%rcx
+	movq	%r9,%rdx
+.byte	102,72,15,126,207
+	sarq	$3+2,%rcx
+	jmp	.Lsqr8x_sub
 
+.align	32
+.Lsqr8x_sub:
+	movq	0(%rbx),%r12
+	movq	8(%rbx),%r13
+	movq	16(%rbx),%r14
+	movq	24(%rbx),%r15
+	leaq	32(%rbx),%rbx
+	sbbq	0(%rbp),%r12
+	sbbq	8(%rbp),%r13
+	sbbq	16(%rbp),%r14
+	sbbq	24(%rbp),%r15
+	leaq	32(%rbp),%rbp
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+	leaq	32(%rdi),%rdi
+	incq	%rcx
+	jnz	.Lsqr8x_sub
 
-	movq	-32(%rsi,%rbp,1),%r14
-	leaq	64(%rsp,%r9,2),%rdi
-	movq	-24(%rsi,%rbp,1),%rax
-	leaq	-32(%rdi,%rbp,1),%rdi
-	movq	-16(%rsi,%rbp,1),%rbx
-	movq	%rax,%r15
+	sbbq	$0,%rax
+	leaq	(%rbx,%r9,1),%rbx
+	leaq	(%rdi,%r9,1),%rdi
 
-	mulq	%r14
-	movq	%rax,%r10
-	movq	%rbx,%rax
-	movq	%rdx,%r11
-	movq	%r10,-24(%rdi,%rbp,1)
+.byte	102,72,15,110,200
+	pxor	%xmm0,%xmm0
+	pshufd	$0,%xmm1,%xmm1
+	movq	40(%rsp),%rsi
+	jmp	.Lsqr8x_cond_copy
 
-	xorq	%r10,%r10
-	mulq	%r14
-	addq	%rax,%r11
-	movq	%rbx,%rax
-	adcq	%rdx,%r10
-	movq	%r11,-16(%rdi,%rbp,1)
+.align	32
+.Lsqr8x_cond_copy:
+	movdqa	0(%rbx),%xmm2
+	movdqa	16(%rbx),%xmm3
+	leaq	32(%rbx),%rbx
+	movdqu	0(%rdi),%xmm4
+	movdqu	16(%rdi),%xmm5
+	leaq	32(%rdi),%rdi
+	movdqa	%xmm0,-32(%rbx)
+	movdqa	%xmm0,-16(%rbx)
+	movdqa	%xmm0,-32(%rbx,%rdx,1)
+	movdqa	%xmm0,-16(%rbx,%rdx,1)
+	pcmpeqd	%xmm1,%xmm0
+	pand	%xmm1,%xmm2
+	pand	%xmm1,%xmm3
+	pand	%xmm0,%xmm4
+	pand	%xmm0,%xmm5
+	pxor	%xmm0,%xmm0
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqu	%xmm4,-32(%rdi)
+	movdqu	%xmm5,-16(%rdi)
+	addq	$32,%r9
+	jnz	.Lsqr8x_cond_copy
 
-	leaq	-16(%rbp),%rcx
+	movq	$1,%rax
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
+.Lsqr8x_epilogue:
+	.byte	0xf3,0xc3
+.size	bn_sqr8x_mont,.-bn_sqr8x_mont
+.type	bn_mulx4x_mont, at function
+.align	32
+bn_mulx4x_mont:
+	movq	%rsp,%rax
+.Lmulx4x_enter:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+.Lmulx4x_prologue:
 
+	shll	$3,%r9d
+	xorq	%r10,%r10
+	subq	%r9,%r10
+	movq	(%r8),%r8
+	leaq	-72(%rsp,%r10,1),%rbp
+	andq	$-128,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmulx4x_page_walk
+	jmp	.Lmulx4x_page_walk_done
 
-	movq	8(%rsi,%rcx,1),%rbx
-	mulq	%r15
-	movq	%rax,%r12
-	movq	%rbx,%rax
-	movq	%rdx,%r13
-
-	xorq	%r11,%r11
-	addq	%r12,%r10
-	leaq	16(%rcx),%rcx
-	adcq	$0,%r11
-	mulq	%r14
-	addq	%rax,%r10
-	movq	%rbx,%rax
-	adcq	%rdx,%r11
-	movq	%r10,-8(%rdi,%rcx,1)
-	jmp	.Lsqr4x_1st
-
 .align	16
-.Lsqr4x_1st:
-	movq	(%rsi,%rcx,1),%rbx
-	xorq	%r12,%r12
-	mulq	%r15
-	addq	%rax,%r13
-	movq	%rbx,%rax
-	adcq	%rdx,%r12
+.Lmulx4x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
 
-	xorq	%r10,%r10
-	addq	%r13,%r11
-	adcq	$0,%r10
-	mulq	%r14
-	addq	%rax,%r11
-	movq	%rbx,%rax
-	adcq	%rdx,%r10
-	movq	%r11,(%rdi,%rcx,1)
+	leaq	(%rdx,%r9,1),%r10
 
 
-	movq	8(%rsi,%rcx,1),%rbx
-	xorq	%r13,%r13
-	mulq	%r15
-	addq	%rax,%r12
-	movq	%rbx,%rax
-	adcq	%rdx,%r13
 
-	xorq	%r11,%r11
-	addq	%r12,%r10
-	adcq	$0,%r11
-	mulq	%r14
-	addq	%rax,%r10
-	movq	%rbx,%rax
-	adcq	%rdx,%r11
-	movq	%r10,8(%rdi,%rcx,1)
 
-	movq	16(%rsi,%rcx,1),%rbx
-	xorq	%r12,%r12
-	mulq	%r15
-	addq	%rax,%r13
-	movq	%rbx,%rax
-	adcq	%rdx,%r12
 
-	xorq	%r10,%r10
-	addq	%r13,%r11
-	adcq	$0,%r10
-	mulq	%r14
-	addq	%rax,%r11
-	movq	%rbx,%rax
-	adcq	%rdx,%r10
-	movq	%r11,16(%rdi,%rcx,1)
 
 
-	movq	24(%rsi,%rcx,1),%rbx
-	xorq	%r13,%r13
-	mulq	%r15
-	addq	%rax,%r12
-	movq	%rbx,%rax
-	adcq	%rdx,%r13
 
-	xorq	%r11,%r11
-	addq	%r12,%r10
-	leaq	32(%rcx),%rcx
-	adcq	$0,%r11
-	mulq	%r14
-	addq	%rax,%r10
-	movq	%rbx,%rax
-	adcq	%rdx,%r11
-	movq	%r10,-8(%rdi,%rcx,1)
 
-	cmpq	$0,%rcx
-	jne	.Lsqr4x_1st
 
-	xorq	%r12,%r12
-	addq	%r11,%r13
-	adcq	$0,%r12
-	mulq	%r15
-	addq	%rax,%r13
-	adcq	%rdx,%r12
 
-	movq	%r13,(%rdi)
-	leaq	16(%rbp),%rbp
-	movq	%r12,8(%rdi)
-	jmp	.Lsqr4x_outer
 
-.align	16
-.Lsqr4x_outer:
-	movq	-32(%rsi,%rbp,1),%r14
-	leaq	64(%rsp,%r9,2),%rdi
-	movq	-24(%rsi,%rbp,1),%rax
-	leaq	-32(%rdi,%rbp,1),%rdi
-	movq	-16(%rsi,%rbp,1),%rbx
-	movq	%rax,%r15
+	movq	%r9,0(%rsp)
+	shrq	$5,%r9
+	movq	%r10,16(%rsp)
+	subq	$1,%r9
+	movq	%r8,24(%rsp)
+	movq	%rdi,32(%rsp)
+	movq	%rax,40(%rsp)
+	movq	%r9,48(%rsp)
+	jmp	.Lmulx4x_body
 
-	movq	-24(%rdi,%rbp,1),%r10
-	xorq	%r11,%r11
-	mulq	%r14
-	addq	%rax,%r10
-	movq	%rbx,%rax
-	adcq	%rdx,%r11
-	movq	%r10,-24(%rdi,%rbp,1)
+.align	32
+.Lmulx4x_body:
+	leaq	8(%rdx),%rdi
+	movq	(%rdx),%rdx
+	leaq	64+32(%rsp),%rbx
+	movq	%rdx,%r9
 
-	xorq	%r10,%r10
-	addq	-16(%rdi,%rbp,1),%r11
-	adcq	$0,%r10
-	mulq	%r14
+	mulxq	0(%rsi),%r8,%rax
+	mulxq	8(%rsi),%r11,%r14
 	addq	%rax,%r11
-	movq	%rbx,%rax
-	adcq	%rdx,%r10
-	movq	%r11,-16(%rdi,%rbp,1)
-
-	leaq	-16(%rbp),%rcx
-	xorq	%r12,%r12
-
-
-	movq	8(%rsi,%rcx,1),%rbx
-	xorq	%r13,%r13
-	addq	8(%rdi,%rcx,1),%r12
+	movq	%rdi,8(%rsp)
+	mulxq	16(%rsi),%r12,%r13
+	adcq	%r14,%r12
 	adcq	$0,%r13
-	mulq	%r15
-	addq	%rax,%r12
-	movq	%rbx,%rax
-	adcq	%rdx,%r13
 
-	xorq	%r11,%r11
-	addq	%r12,%r10
-	adcq	$0,%r11
-	mulq	%r14
-	addq	%rax,%r10
-	movq	%rbx,%rax
-	adcq	%rdx,%r11
-	movq	%r10,8(%rdi,%rcx,1)
+	movq	%r8,%rdi
+	imulq	24(%rsp),%r8
+	xorq	%rbp,%rbp
 
-	leaq	16(%rcx),%rcx
-	jmp	.Lsqr4x_inner
+	mulxq	24(%rsi),%rax,%r14
+	movq	%r8,%rdx
+	leaq	32(%rsi),%rsi
+	adcxq	%rax,%r13
+	adcxq	%rbp,%r14
 
-.align	16
-.Lsqr4x_inner:
-	movq	(%rsi,%rcx,1),%rbx
-	xorq	%r12,%r12
-	addq	(%rdi,%rcx,1),%r13
-	adcq	$0,%r12
-	mulq	%r15
-	addq	%rax,%r13
-	movq	%rbx,%rax
-	adcq	%rdx,%r12
+	mulxq	0(%rcx),%rax,%r10
+	adcxq	%rax,%rdi
+	adoxq	%r11,%r10
+	mulxq	8(%rcx),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
+	movq	48(%rsp),%rdi
+	movq	%r10,-32(%rbx)
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	%r11,-24(%rbx)
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r15
+	leaq	32(%rcx),%rcx
+	movq	%r12,-16(%rbx)
 
-	xorq	%r10,%r10
-	addq	%r13,%r11
-	adcq	$0,%r10
-	mulq	%r14
-	addq	%rax,%r11
-	movq	%rbx,%rax
-	adcq	%rdx,%r10
-	movq	%r11,(%rdi,%rcx,1)
+	jmp	.Lmulx4x_1st
 
-	movq	8(%rsi,%rcx,1),%rbx
-	xorq	%r13,%r13
-	addq	8(%rdi,%rcx,1),%r12
-	adcq	$0,%r13
-	mulq	%r15
-	addq	%rax,%r12
-	movq	%rbx,%rax
-	adcq	%rdx,%r13
+.align	32
+.Lmulx4x_1st:
+	adcxq	%rbp,%r15
+	mulxq	0(%rsi),%r10,%rax
+	adcxq	%r14,%r10
+	mulxq	8(%rsi),%r11,%r14
+	adcxq	%rax,%r11
+	mulxq	16(%rsi),%r12,%rax
+	adcxq	%r14,%r12
+	mulxq	24(%rsi),%r13,%r14
+.byte	0x67,0x67
+	movq	%r8,%rdx
+	adcxq	%rax,%r13
+	adcxq	%rbp,%r14
+	leaq	32(%rsi),%rsi
+	leaq	32(%rbx),%rbx
 
-	xorq	%r11,%r11
-	addq	%r12,%r10
-	leaq	16(%rcx),%rcx
-	adcq	$0,%r11
-	mulq	%r14
-	addq	%rax,%r10
-	movq	%rbx,%rax
-	adcq	%rdx,%r11
-	movq	%r10,-8(%rdi,%rcx,1)
+	adoxq	%r15,%r10
+	mulxq	0(%rcx),%rax,%r15
+	adcxq	%rax,%r10
+	adoxq	%r15,%r11
+	mulxq	8(%rcx),%rax,%r15
+	adcxq	%rax,%r11
+	adoxq	%r15,%r12
+	mulxq	16(%rcx),%rax,%r15
+	movq	%r10,-40(%rbx)
+	adcxq	%rax,%r12
+	movq	%r11,-32(%rbx)
+	adoxq	%r15,%r13
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	%r12,-24(%rbx)
+	adcxq	%rax,%r13
+	adoxq	%rbp,%r15
+	leaq	32(%rcx),%rcx
+	movq	%r13,-16(%rbx)
 
-	cmpq	$0,%rcx
-	jne	.Lsqr4x_inner
+	decq	%rdi
+	jnz	.Lmulx4x_1st
 
-	xorq	%r12,%r12
-	addq	%r11,%r13
-	adcq	$0,%r12
-	mulq	%r15
-	addq	%rax,%r13
-	adcq	%rdx,%r12
-
-	movq	%r13,(%rdi)
-	movq	%r12,8(%rdi)
-
-	addq	$16,%rbp
-	jnz	.Lsqr4x_outer
-
-
-	movq	-32(%rsi),%r14
-	leaq	64(%rsp,%r9,2),%rdi
-	movq	-24(%rsi),%rax
-	leaq	-32(%rdi,%rbp,1),%rdi
-	movq	-16(%rsi),%rbx
-	movq	%rax,%r15
-
-	xorq	%r11,%r11
-	mulq	%r14
-	addq	%rax,%r10
-	movq	%rbx,%rax
-	adcq	%rdx,%r11
-	movq	%r10,-24(%rdi)
-
-	xorq	%r10,%r10
-	addq	%r13,%r11
-	adcq	$0,%r10
-	mulq	%r14
-	addq	%rax,%r11
-	movq	%rbx,%rax
-	adcq	%rdx,%r10
-	movq	%r11,-16(%rdi)
-
-	movq	-8(%rsi),%rbx
-	mulq	%r15
-	addq	%rax,%r12
-	movq	%rbx,%rax
-	adcq	$0,%rdx
-
-	xorq	%r11,%r11
-	addq	%r12,%r10
-	movq	%rdx,%r13
-	adcq	$0,%r11
-	mulq	%r14
-	addq	%rax,%r10
-	movq	%rbx,%rax
-	adcq	%rdx,%r11
-	movq	%r10,-8(%rdi)
-
-	xorq	%r12,%r12
-	addq	%r11,%r13
-	adcq	$0,%r12
-	mulq	%r15
-	addq	%rax,%r13
-	movq	-16(%rsi),%rax
-	adcq	%rdx,%r12
-
-	movq	%r13,(%rdi)
-	movq	%r12,8(%rdi)
-
-	mulq	%rbx
-	addq	$16,%rbp
-	xorq	%r14,%r14
-	subq	%r9,%rbp
-	xorq	%r15,%r15
-
-	addq	%r12,%rax
-	adcq	$0,%rdx
-	movq	%rax,8(%rdi)
-	movq	%rdx,16(%rdi)
-	movq	%r15,24(%rdi)
-
-	movq	-16(%rsi,%rbp,1),%rax
-	leaq	64(%rsp,%r9,2),%rdi
-	xorq	%r10,%r10
-	movq	-24(%rdi,%rbp,2),%r11
-
-	leaq	(%r14,%r10,2),%r12
-	shrq	$63,%r10
-	leaq	(%rcx,%r11,2),%r13
-	shrq	$63,%r11
-	orq	%r10,%r13
-	movq	-16(%rdi,%rbp,2),%r10
-	movq	%r11,%r14
-	mulq	%rax
-	negq	%r15
-	movq	-8(%rdi,%rbp,2),%r11
-	adcq	%rax,%r12
-	movq	-8(%rsi,%rbp,1),%rax
-	movq	%r12,-32(%rdi,%rbp,2)
-	adcq	%rdx,%r13
-
-	leaq	(%r14,%r10,2),%rbx
-	movq	%r13,-24(%rdi,%rbp,2)
+	movq	0(%rsp),%rax
+	movq	8(%rsp),%rdi
+	adcq	%rbp,%r15
+	addq	%r15,%r14
 	sbbq	%r15,%r15
-	shrq	$63,%r10
-	leaq	(%rcx,%r11,2),%r8
-	shrq	$63,%r11
-	orq	%r10,%r8
-	movq	0(%rdi,%rbp,2),%r10
-	movq	%r11,%r14
-	mulq	%rax
-	negq	%r15
-	movq	8(%rdi,%rbp,2),%r11
-	adcq	%rax,%rbx
-	movq	0(%rsi,%rbp,1),%rax
-	movq	%rbx,-16(%rdi,%rbp,2)
-	adcq	%rdx,%r8
-	leaq	16(%rbp),%rbp
-	movq	%r8,-40(%rdi,%rbp,2)
-	sbbq	%r15,%r15
-	jmp	.Lsqr4x_shift_n_add
+	movq	%r14,-8(%rbx)
+	jmp	.Lmulx4x_outer
 
-.align	16
-.Lsqr4x_shift_n_add:
-	leaq	(%r14,%r10,2),%r12
-	shrq	$63,%r10
-	leaq	(%rcx,%r11,2),%r13
-	shrq	$63,%r11
-	orq	%r10,%r13
-	movq	-16(%rdi,%rbp,2),%r10
-	movq	%r11,%r14
-	mulq	%rax
-	negq	%r15
-	movq	-8(%rdi,%rbp,2),%r11
-	adcq	%rax,%r12
-	movq	-8(%rsi,%rbp,1),%rax
-	movq	%r12,-32(%rdi,%rbp,2)
-	adcq	%rdx,%r13
+.align	32
+.Lmulx4x_outer:
+	movq	(%rdi),%rdx
+	leaq	8(%rdi),%rdi
+	subq	%rax,%rsi
+	movq	%r15,(%rbx)
+	leaq	64+32(%rsp),%rbx
+	subq	%rax,%rcx
 
-	leaq	(%r14,%r10,2),%rbx
-	movq	%r13,-24(%rdi,%rbp,2)
-	sbbq	%r15,%r15
-	shrq	$63,%r10
-	leaq	(%rcx,%r11,2),%r8
-	shrq	$63,%r11
-	orq	%r10,%r8
-	movq	0(%rdi,%rbp,2),%r10
-	movq	%r11,%r14
-	mulq	%rax
-	negq	%r15
-	movq	8(%rdi,%rbp,2),%r11
-	adcq	%rax,%rbx
-	movq	0(%rsi,%rbp,1),%rax
-	movq	%rbx,-16(%rdi,%rbp,2)
-	adcq	%rdx,%r8
+	mulxq	0(%rsi),%r8,%r11
+	xorl	%ebp,%ebp
+	movq	%rdx,%r9
+	mulxq	8(%rsi),%r14,%r12
+	adoxq	-32(%rbx),%r8
+	adcxq	%r14,%r11
+	mulxq	16(%rsi),%r15,%r13
+	adoxq	-24(%rbx),%r11
+	adcxq	%r15,%r12
+	adoxq	-16(%rbx),%r12
+	adcxq	%rbp,%r13
+	adoxq	%rbp,%r13
 
-	leaq	(%r14,%r10,2),%r12
-	movq	%r8,-8(%rdi,%rbp,2)
-	sbbq	%r15,%r15
-	shrq	$63,%r10
-	leaq	(%rcx,%r11,2),%r13
-	shrq	$63,%r11
-	orq	%r10,%r13
-	movq	16(%rdi,%rbp,2),%r10
-	movq	%r11,%r14
-	mulq	%rax
-	negq	%r15
-	movq	24(%rdi,%rbp,2),%r11
-	adcq	%rax,%r12
-	movq	8(%rsi,%rbp,1),%rax
-	movq	%r12,0(%rdi,%rbp,2)
-	adcq	%rdx,%r13
-
-	leaq	(%r14,%r10,2),%rbx
-	movq	%r13,8(%rdi,%rbp,2)
-	sbbq	%r15,%r15
-	shrq	$63,%r10
-	leaq	(%rcx,%r11,2),%r8
-	shrq	$63,%r11
-	orq	%r10,%r8
-	movq	32(%rdi,%rbp,2),%r10
-	movq	%r11,%r14
-	mulq	%rax
-	negq	%r15
-	movq	40(%rdi,%rbp,2),%r11
-	adcq	%rax,%rbx
-	movq	16(%rsi,%rbp,1),%rax
-	movq	%rbx,16(%rdi,%rbp,2)
-	adcq	%rdx,%r8
-	movq	%r8,24(%rdi,%rbp,2)
-	sbbq	%r15,%r15
-	addq	$32,%rbp
-	jnz	.Lsqr4x_shift_n_add
-
-	leaq	(%r14,%r10,2),%r12
-	shrq	$63,%r10
-	leaq	(%rcx,%r11,2),%r13
-	shrq	$63,%r11
-	orq	%r10,%r13
-	movq	-16(%rdi),%r10
-	movq	%r11,%r14
-	mulq	%rax
-	negq	%r15
-	movq	-8(%rdi),%r11
-	adcq	%rax,%r12
-	movq	-8(%rsi),%rax
-	movq	%r12,-32(%rdi)
-	adcq	%rdx,%r13
-
-	leaq	(%r14,%r10,2),%rbx
-	movq	%r13,-24(%rdi)
-	sbbq	%r15,%r15
-	shrq	$63,%r10
-	leaq	(%rcx,%r11,2),%r8
-	shrq	$63,%r11
-	orq	%r10,%r8
-	mulq	%rax
-	negq	%r15
-	adcq	%rax,%rbx
-	adcq	%rdx,%r8
-	movq	%rbx,-16(%rdi)
-	movq	%r8,-8(%rdi)
-	movq	40(%rsp),%rsi
-	movq	48(%rsp),%r8
-	xorq	%rcx,%rcx
-	movq	%r9,0(%rsp)
-	subq	%r9,%rcx
-	movq	64(%rsp),%r10
-	movq	%r8,%r14
-	leaq	64(%rsp,%r9,2),%rax
-	leaq	64(%rsp,%r9,1),%rdi
-	movq	%rax,8(%rsp)
-	leaq	(%rsi,%r9,1),%rsi
-	xorq	%rbp,%rbp
-
-	movq	0(%rsi,%rcx,1),%rax
-	movq	8(%rsi,%rcx,1),%r9
-	imulq	%r10,%r14
-	movq	%rax,%rbx
-	jmp	.Lsqr4x_mont_outer
-
-.align	16
-.Lsqr4x_mont_outer:
-	xorq	%r11,%r11
-	mulq	%r14
-	addq	%rax,%r10
-	movq	%r9,%rax
-	adcq	%rdx,%r11
+	movq	%rdi,8(%rsp)
 	movq	%r8,%r15
+	imulq	24(%rsp),%r8
+	xorl	%ebp,%ebp
 
-	xorq	%r10,%r10
-	addq	8(%rdi,%rcx,1),%r11
-	adcq	$0,%r10
-	mulq	%r14
-	addq	%rax,%r11
-	movq	%rbx,%rax
-	adcq	%rdx,%r10
+	mulxq	24(%rsi),%rax,%r14
+	movq	%r8,%rdx
+	adcxq	%rax,%r13
+	adoxq	-8(%rbx),%r13
+	adcxq	%rbp,%r14
+	leaq	32(%rsi),%rsi
+	adoxq	%rbp,%r14
 
-	imulq	%r11,%r15
+	mulxq	0(%rcx),%rax,%r10
+	adcxq	%rax,%r15
+	adoxq	%r11,%r10
+	mulxq	8(%rcx),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+	mulxq	16(%rcx),%rax,%r12
+	movq	%r10,-32(%rbx)
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	%r11,-24(%rbx)
+	leaq	32(%rcx),%rcx
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r15
+	movq	48(%rsp),%rdi
+	movq	%r12,-16(%rbx)
 
-	movq	16(%rsi,%rcx,1),%rbx
-	xorq	%r13,%r13
-	addq	%r11,%r12
-	adcq	$0,%r13
-	mulq	%r15
-	addq	%rax,%r12
-	movq	%rbx,%rax
-	adcq	%rdx,%r13
-	movq	%r12,8(%rdi,%rcx,1)
+	jmp	.Lmulx4x_inner
 
-	xorq	%r11,%r11
-	addq	16(%rdi,%rcx,1),%r10
-	adcq	$0,%r11
-	mulq	%r14
-	addq	%rax,%r10
-	movq	%r9,%rax
-	adcq	%rdx,%r11
+.align	32
+.Lmulx4x_inner:
+	mulxq	0(%rsi),%r10,%rax
+	adcxq	%rbp,%r15
+	adoxq	%r14,%r10
+	mulxq	8(%rsi),%r11,%r14
+	adcxq	0(%rbx),%r10
+	adoxq	%rax,%r11
+	mulxq	16(%rsi),%r12,%rax
+	adcxq	8(%rbx),%r11
+	adoxq	%r14,%r12
+	mulxq	24(%rsi),%r13,%r14
+	movq	%r8,%rdx
+	adcxq	16(%rbx),%r12
+	adoxq	%rax,%r13
+	adcxq	24(%rbx),%r13
+	adoxq	%rbp,%r14
+	leaq	32(%rsi),%rsi
+	leaq	32(%rbx),%rbx
+	adcxq	%rbp,%r14
 
-	movq	24(%rsi,%rcx,1),%r9
-	xorq	%r12,%r12
-	addq	%r10,%r13
-	adcq	$0,%r12
-	mulq	%r15
-	addq	%rax,%r13
-	movq	%r9,%rax
-	adcq	%rdx,%r12
-	movq	%r13,16(%rdi,%rcx,1)
-
-	xorq	%r10,%r10
-	addq	24(%rdi,%rcx,1),%r11
+	adoxq	%r15,%r10
+	mulxq	0(%rcx),%rax,%r15
+	adcxq	%rax,%r10
+	adoxq	%r15,%r11
+	mulxq	8(%rcx),%rax,%r15
+	adcxq	%rax,%r11
+	adoxq	%r15,%r12
+	mulxq	16(%rcx),%rax,%r15
+	movq	%r10,-40(%rbx)
+	adcxq	%rax,%r12
+	adoxq	%r15,%r13
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	%r11,-32(%rbx)
+	movq	%r12,-24(%rbx)
+	adcxq	%rax,%r13
+	adoxq	%rbp,%r15
 	leaq	32(%rcx),%rcx
-	adcq	$0,%r10
-	mulq	%r14
-	addq	%rax,%r11
-	movq	%rbx,%rax
-	adcq	%rdx,%r10
-	jmp	.Lsqr4x_mont_inner
+	movq	%r13,-16(%rbx)
 
-.align	16
-.Lsqr4x_mont_inner:
-	movq	(%rsi,%rcx,1),%rbx
-	xorq	%r13,%r13
-	addq	%r11,%r12
-	adcq	$0,%r13
-	mulq	%r15
-	addq	%rax,%r12
-	movq	%rbx,%rax
-	adcq	%rdx,%r13
-	movq	%r12,-8(%rdi,%rcx,1)
+	decq	%rdi
+	jnz	.Lmulx4x_inner
 
-	xorq	%r11,%r11
-	addq	(%rdi,%rcx,1),%r10
-	adcq	$0,%r11
-	mulq	%r14
-	addq	%rax,%r10
-	movq	%r9,%rax
-	adcq	%rdx,%r11
+	movq	0(%rsp),%rax
+	movq	8(%rsp),%rdi
+	adcq	%rbp,%r15
+	subq	0(%rbx),%rbp
+	adcq	%r15,%r14
+	sbbq	%r15,%r15
+	movq	%r14,-8(%rbx)
 
-	movq	8(%rsi,%rcx,1),%r9
-	xorq	%r12,%r12
-	addq	%r10,%r13
-	adcq	$0,%r12
-	mulq	%r15
-	addq	%rax,%r13
-	movq	%r9,%rax
-	adcq	%rdx,%r12
-	movq	%r13,(%rdi,%rcx,1)
+	cmpq	16(%rsp),%rdi
+	jne	.Lmulx4x_outer
 
-	xorq	%r10,%r10
-	addq	8(%rdi,%rcx,1),%r11
-	adcq	$0,%r10
-	mulq	%r14
-	addq	%rax,%r11
-	movq	%rbx,%rax
-	adcq	%rdx,%r10
+	leaq	64(%rsp),%rbx
+	subq	%rax,%rcx
+	negq	%r15
+	movq	%rax,%rdx
+	shrq	$3+2,%rax
+	movq	32(%rsp),%rdi
+	jmp	.Lmulx4x_sub
 
-
-	movq	16(%rsi,%rcx,1),%rbx
-	xorq	%r13,%r13
-	addq	%r11,%r12
-	adcq	$0,%r13
-	mulq	%r15
-	addq	%rax,%r12
-	movq	%rbx,%rax
-	adcq	%rdx,%r13
-	movq	%r12,8(%rdi,%rcx,1)
-
-	xorq	%r11,%r11
-	addq	16(%rdi,%rcx,1),%r10
-	adcq	$0,%r11
-	mulq	%r14
-	addq	%rax,%r10
-	movq	%r9,%rax
-	adcq	%rdx,%r11
-
-	movq	24(%rsi,%rcx,1),%r9
-	xorq	%r12,%r12
-	addq	%r10,%r13
-	adcq	$0,%r12
-	mulq	%r15
-	addq	%rax,%r13
-	movq	%r9,%rax
-	adcq	%rdx,%r12
-	movq	%r13,16(%rdi,%rcx,1)
-
-	xorq	%r10,%r10
-	addq	24(%rdi,%rcx,1),%r11
+.align	32
+.Lmulx4x_sub:
+	movq	0(%rbx),%r11
+	movq	8(%rbx),%r12
+	movq	16(%rbx),%r13
+	movq	24(%rbx),%r14
+	leaq	32(%rbx),%rbx
+	sbbq	0(%rcx),%r11
+	sbbq	8(%rcx),%r12
+	sbbq	16(%rcx),%r13
+	sbbq	24(%rcx),%r14
 	leaq	32(%rcx),%rcx
-	adcq	$0,%r10
-	mulq	%r14
-	addq	%rax,%r11
-	movq	%rbx,%rax
-	adcq	%rdx,%r10
-	cmpq	$0,%rcx
-	jne	.Lsqr4x_mont_inner
+	movq	%r11,0(%rdi)
+	movq	%r12,8(%rdi)
+	movq	%r13,16(%rdi)
+	movq	%r14,24(%rdi)
+	leaq	32(%rdi),%rdi
+	decq	%rax
+	jnz	.Lmulx4x_sub
 
-	subq	0(%rsp),%rcx
-	movq	%r8,%r14
+	sbbq	$0,%r15
+	leaq	64(%rsp),%rbx
+	subq	%rdx,%rdi
 
-	xorq	%r13,%r13
-	addq	%r11,%r12
-	adcq	$0,%r13
-	mulq	%r15
-	addq	%rax,%r12
-	movq	%r9,%rax
-	adcq	%rdx,%r13
-	movq	%r12,-8(%rdi)
-
-	xorq	%r11,%r11
-	addq	(%rdi),%r10
-	adcq	$0,%r11
-	movq	0(%rsi,%rcx,1),%rbx
-	addq	%rbp,%r10
-	adcq	$0,%r11
-
-	imulq	16(%rdi,%rcx,1),%r14
-	xorq	%r12,%r12
-	movq	8(%rsi,%rcx,1),%r9
-	addq	%r10,%r13
-	movq	16(%rdi,%rcx,1),%r10
-	adcq	$0,%r12
-	mulq	%r15
-	addq	%rax,%r13
-	movq	%rbx,%rax
-	adcq	%rdx,%r12
-	movq	%r13,(%rdi)
-
-	xorq	%rbp,%rbp
-	addq	8(%rdi),%r12
-	adcq	%rbp,%rbp
-	addq	%r11,%r12
-	leaq	16(%rdi),%rdi
-	adcq	$0,%rbp
-	movq	%r12,-8(%rdi)
-	cmpq	8(%rsp),%rdi
-	jb	.Lsqr4x_mont_outer
-
-	movq	0(%rsp),%r9
-	movq	%rbp,(%rdi)
-	movq	64(%rsp,%r9,1),%rax
-	leaq	64(%rsp,%r9,1),%rbx
+.byte	102,73,15,110,207
+	pxor	%xmm0,%xmm0
+	pshufd	$0,%xmm1,%xmm1
 	movq	40(%rsp),%rsi
-	shrq	$5,%r9
-	movq	8(%rbx),%rdx
-	xorq	%rbp,%rbp
+	jmp	.Lmulx4x_cond_copy
 
-	movq	32(%rsp),%rdi
-	subq	0(%rsi),%rax
-	movq	16(%rbx),%r10
-	movq	24(%rbx),%r11
-	sbbq	8(%rsi),%rdx
-	leaq	-1(%r9),%rcx
-	jmp	.Lsqr4x_sub
-.align	16
-.Lsqr4x_sub:
-	movq	%rax,0(%rdi,%rbp,8)
-	movq	%rdx,8(%rdi,%rbp,8)
-	sbbq	16(%rsi,%rbp,8),%r10
-	movq	32(%rbx,%rbp,8),%rax
-	movq	40(%rbx,%rbp,8),%rdx
-	sbbq	24(%rsi,%rbp,8),%r11
-	movq	%r10,16(%rdi,%rbp,8)
-	movq	%r11,24(%rdi,%rbp,8)
-	sbbq	32(%rsi,%rbp,8),%rax
-	movq	48(%rbx,%rbp,8),%r10
-	movq	56(%rbx,%rbp,8),%r11
-	sbbq	40(%rsi,%rbp,8),%rdx
-	leaq	4(%rbp),%rbp
-	decq	%rcx
-	jnz	.Lsqr4x_sub
+.align	32
+.Lmulx4x_cond_copy:
+	movdqa	0(%rbx),%xmm2
+	movdqa	16(%rbx),%xmm3
+	leaq	32(%rbx),%rbx
+	movdqu	0(%rdi),%xmm4
+	movdqu	16(%rdi),%xmm5
+	leaq	32(%rdi),%rdi
+	movdqa	%xmm0,-32(%rbx)
+	movdqa	%xmm0,-16(%rbx)
+	pcmpeqd	%xmm1,%xmm0
+	pand	%xmm1,%xmm2
+	pand	%xmm1,%xmm3
+	pand	%xmm0,%xmm4
+	pand	%xmm0,%xmm5
+	pxor	%xmm0,%xmm0
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqu	%xmm4,-32(%rdi)
+	movdqu	%xmm5,-16(%rdi)
+	subq	$32,%rdx
+	jnz	.Lmulx4x_cond_copy
 
-	movq	%rax,0(%rdi,%rbp,8)
-	movq	32(%rbx,%rbp,8),%rax
-	sbbq	16(%rsi,%rbp,8),%r10
-	movq	%rdx,8(%rdi,%rbp,8)
-	sbbq	24(%rsi,%rbp,8),%r11
-	movq	%r10,16(%rdi,%rbp,8)
+	movq	%rdx,(%rbx)
 
-	sbbq	$0,%rax
-	movq	%r11,24(%rdi,%rbp,8)
-	xorq	%rbp,%rbp
-	andq	%rax,%rbx
-	notq	%rax
-	movq	%rdi,%rsi
-	andq	%rax,%rsi
-	leaq	-1(%r9),%rcx
-	orq	%rsi,%rbx
-
-	pxor	%xmm0,%xmm0
-	leaq	64(%rsp,%r9,8),%rsi
-	movdqu	(%rbx),%xmm1
-	leaq	(%rsi,%r9,8),%rsi
-	movdqa	%xmm0,64(%rsp)
-	movdqa	%xmm0,(%rsi)
-	movdqu	%xmm1,(%rdi)
-	jmp	.Lsqr4x_copy
-.align	16
-.Lsqr4x_copy:
-	movdqu	16(%rbx,%rbp,1),%xmm2
-	movdqu	32(%rbx,%rbp,1),%xmm1
-	movdqa	%xmm0,80(%rsp,%rbp,1)
-	movdqa	%xmm0,96(%rsp,%rbp,1)
-	movdqa	%xmm0,16(%rsi,%rbp,1)
-	movdqa	%xmm0,32(%rsi,%rbp,1)
-	movdqu	%xmm2,16(%rdi,%rbp,1)
-	movdqu	%xmm1,32(%rdi,%rbp,1)
-	leaq	32(%rbp),%rbp
-	decq	%rcx
-	jnz	.Lsqr4x_copy
-
-	movdqu	16(%rbx,%rbp,1),%xmm2
-	movdqa	%xmm0,80(%rsp,%rbp,1)
-	movdqa	%xmm0,16(%rsi,%rbp,1)
-	movdqu	%xmm2,16(%rdi,%rbp,1)
-	movq	56(%rsp),%rsi
 	movq	$1,%rax
-	movq	0(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
-.Lsqr4x_epilogue:
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
+.Lmulx4x_epilogue:
 	.byte	0xf3,0xc3
-.size	bn_sqr4x_mont,.-bn_sqr4x_mont
+.size	bn_mulx4x_mont,.-bn_mulx4x_mont
 .byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align	16

Modified: trunk/secure/lib/libcrypto/amd64/x86_64-mont5.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/x86_64-mont5.S	2019-01-20 05:38:02 UTC (rev 12152)
+++ trunk/secure/lib/libcrypto/amd64/x86_64-mont5.S	2019-01-20 05:38:15 UTC (rev 12153)
@@ -1,23 +1,24 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/amd64/x86_64-mont5.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from x86_64-mont5.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/x86_64-mont5.S 337982 2018-08-17 18:32:53Z jkim $ */
+/* Do not modify. This file is auto-generated from x86_64-mont5.pl. */
 .text	
 
+
+
 .globl	bn_mul_mont_gather5
 .type	bn_mul_mont_gather5, at function
 .align	64
 bn_mul_mont_gather5:
-	testl	$3,%r9d
+	movl	%r9d,%r9d
+	movq	%rsp,%rax
+	testl	$7,%r9d
 	jnz	.Lmul_enter
-	cmpl	$8,%r9d
-	jb	.Lmul_enter
+	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
 	jmp	.Lmul4x_enter
 
 .align	16
 .Lmul_enter:
-	movl	%r9d,%r9d
 	movd	8(%rsp),%xmm5
-	leaq	.Linc(%rip),%r10
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
@@ -25,15 +26,12 @@
 	pushq	%r14
 	pushq	%r15
 
-.Lmul_alloca:
-	movq	%rsp,%rax
-	leaq	2(%r9),%r11
-	negq	%r11
-	leaq	-264(%rsp,%r11,8),%rsp
-	andq	$-1024,%rsp
+	negq	%r9
+	movq	%rsp,%r11
+	leaq	-280(%rsp,%r9,8),%r10
+	negq	%r9
+	andq	$-1024,%r10
 
-	movq	%rax,8(%rsp,%r9,8)
-.Lmul_body:
 
 
 
@@ -40,14 +38,25 @@
 
 
 
-	subq	%rsp,%rax
-	andq	$-4096,%rax
+	subq	%r10,%r11
+	andq	$-4096,%r11
+	leaq	(%r10,%r11,1),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul_page_walk
+	jmp	.Lmul_page_walk_done
+
 .Lmul_page_walk:
-	movq	(%rsp,%rax,1),%r11
-	subq	$4096,%rax
-.byte	0x2e			
-	jnc	.Lmul_page_walk
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul_page_walk
+.Lmul_page_walk_done:
 
+	leaq	.Linc(%rip),%r10
+	movq	%rax,8(%rsp,%r9,8)
+.Lmul_body:
+
 	leaq	128(%rdx),%r12
 	movdqa	0(%r10),%xmm0
 	movdqa	16(%r10),%xmm1
@@ -174,7 +183,7 @@
 	por	%xmm2,%xmm0
 	por	%xmm3,%xmm1
 	por	%xmm1,%xmm0
-	pshufd	$78,%xmm0,%xmm1
+	pshufd	$0x4e,%xmm0,%xmm1
 	por	%xmm1,%xmm0
 	leaq	256(%r12),%r12
 .byte	102,72,15,126,195
@@ -225,12 +234,12 @@
 	cmpq	%r9,%r15
 	jne	.L1st
 
+
 	addq	%rax,%r13
-	movq	(%rsi),%rax
 	adcq	$0,%rdx
 	addq	%r11,%r13
 	adcq	$0,%rdx
-	movq	%r13,-16(%rsp,%r15,8)
+	movq	%r13,-16(%rsp,%r9,8)
 	movq	%rdx,%r13
 	movq	%r10,%r11
 
@@ -297,9 +306,11 @@
 	por	%xmm2,%xmm4
 	por	%xmm3,%xmm5
 	por	%xmm5,%xmm4
-	pshufd	$78,%xmm4,%xmm0
+	pshufd	$0x4e,%xmm4,%xmm0
 	por	%xmm4,%xmm0
 	leaq	256(%r12),%r12
+
+	movq	(%rsi),%rax
 .byte	102,72,15,126,195
 
 	xorq	%r15,%r15
@@ -350,12 +361,11 @@
 	jne	.Linner
 
 	addq	%rax,%r13
-	movq	(%rsi),%rax
 	adcq	$0,%rdx
 	addq	%r10,%r13
-	movq	(%rsp,%r15,8),%r10
+	movq	(%rsp,%r9,8),%r10
 	adcq	$0,%rdx
-	movq	%r13,-16(%rsp,%r15,8)
+	movq	%r13,-16(%rsp,%r9,8)
 	movq	%rdx,%r13
 
 	xorq	%rdx,%rdx
@@ -368,7 +378,7 @@
 
 	leaq	1(%r14),%r14
 	cmpq	%r9,%r14
-	jl	.Louter
+	jb	.Louter
 
 	xorq	%r14,%r14
 	movq	(%rsp),%rax
@@ -384,18 +394,19 @@
 	jnz	.Lsub
 
 	sbbq	$0,%rax
+	movq	$-1,%rbx
+	xorq	%rax,%rbx
 	xorq	%r14,%r14
-	andq	%rax,%rsi
-	notq	%rax
-	movq	%rdi,%rcx
-	andq	%rax,%rcx
 	movq	%r9,%r15
-	orq	%rcx,%rsi
-.align	16
+
 .Lcopy:
-	movq	(%rsi,%r14,8),%rax
+	movq	(%rdi,%r14,8),%rcx
+	movq	(%rsp,%r14,8),%rdx
+	andq	%rbx,%rcx
+	andq	%rax,%rdx
 	movq	%r14,(%rsp,%r14,8)
-	movq	%rax,(%rdi,%r14,8)
+	orq	%rcx,%rdx
+	movq	%rdx,(%rdi,%r14,8)
 	leaq	1(%r14),%r14
 	subq	$1,%r15
 	jnz	.Lcopy
@@ -403,23 +414,25 @@
 	movq	8(%rsp,%r9,8),%rsi
 	movq	$1,%rax
 
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 .Lmul_epilogue:
 	.byte	0xf3,0xc3
 .size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
 .type	bn_mul4x_mont_gather5, at function
-.align	16
+.align	32
 bn_mul4x_mont_gather5:
+.byte	0x67
+	movq	%rsp,%rax
 .Lmul4x_enter:
-	movl	%r9d,%r9d
-	movd	8(%rsp),%xmm5
-	leaq	.Linc(%rip),%r10
+	andl	$0x80108,%r11d
+	cmpl	$0x80108,%r11d
+	je	.Lmulx4x_enter
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
@@ -426,29 +439,91 @@
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
+.Lmul4x_prologue:
 
-.Lmul4x_alloca:
-	movq	%rsp,%rax
-	leaq	4(%r9),%r11
-	negq	%r11
-	leaq	-256(%rsp,%r11,8),%rsp
-	andq	$-1024,%rsp
+.byte	0x67
+	shll	$3,%r9d
+	leaq	(%r9,%r9,2),%r10
+	negq	%r9
 
-	movq	%rax,8(%rsp,%r9,8)
-.Lmul4x_body:
-	subq	%rsp,%rax
-	andq	$-4096,%rax
+
+
+
+
+
+
+
+
+
+	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
+	subq	%rdi,%r11
+	andq	$4095,%r11
+	cmpq	%r11,%r10
+	jb	.Lmul4xsp_alt
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
+	jmp	.Lmul4xsp_done
+
+.align	32
+.Lmul4xsp_alt:
+	leaq	4096-320(,%r9,2),%r10
+	leaq	-320(%rbp,%r9,2),%rbp
+	subq	%r10,%r11
+	movq	$0,%r10
+	cmovcq	%r10,%r11
+	subq	%r11,%rbp
+.Lmul4xsp_done:
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmul4x_page_walk
+	jmp	.Lmul4x_page_walk_done
+
 .Lmul4x_page_walk:
-	movq	(%rsp,%rax,1),%r11
-	subq	$4096,%rax
-.byte	0x2e			
-	jnc	.Lmul4x_page_walk
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmul4x_page_walk
+.Lmul4x_page_walk_done:
 
-	movq	%rdi,16(%rsp,%r9,8)
+	negq	%r9
+
+	movq	%rax,40(%rsp)
+.Lmul4x_body:
+
+	call	mul4x_internal
+
+	movq	40(%rsp),%rsi
+	movq	$1,%rax
+
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
+.Lmul4x_epilogue:
+	.byte	0xf3,0xc3
+.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+
+.type	mul4x_internal, at function
+.align	32
+mul4x_internal:
+	shlq	$5,%r9
+	movd	8(%rax),%xmm5
+	leaq	.Linc(%rip),%rax
+	leaq	128(%rdx,%r9,1),%r13
+	shrq	$5,%r9
+	movdqa	0(%rax),%xmm0
+	movdqa	16(%rax),%xmm1
+	leaq	88-112(%rsp,%r9,1),%r10
 	leaq	128(%rdx),%r12
-	movdqa	0(%r10),%xmm0
-	movdqa	16(%r10),%xmm1
-	leaq	32-112(%rsp,%r9,8),%r10
 
 	pshufd	$0,%xmm5,%xmm5
 	movdqa	%xmm1,%xmm4
@@ -571,17 +646,19 @@
 	por	%xmm2,%xmm0
 	por	%xmm3,%xmm1
 	por	%xmm1,%xmm0
-	pshufd	$78,%xmm0,%xmm1
+	pshufd	$0x4e,%xmm0,%xmm1
 	por	%xmm1,%xmm0
 	leaq	256(%r12),%r12
 .byte	102,72,15,126,195
 
+	movq	%r13,16+8(%rsp)
+	movq	%rdi,56+8(%rsp)
+
 	movq	(%r8),%r8
 	movq	(%rsi),%rax
+	leaq	(%rsi,%r9,1),%rsi
+	negq	%r9
 
-	xorq	%r14,%r14
-	xorq	%r15,%r15
-
 	movq	%r8,%rbp
 	mulq	%rbx
 	movq	%rax,%r10
@@ -588,11 +665,12 @@
 	movq	(%rcx),%rax
 
 	imulq	%r10,%rbp
+	leaq	64+8(%rsp),%r14
 	movq	%rdx,%r11
 
 	mulq	%rbp
 	addq	%rax,%r10
-	movq	8(%rsi),%rax
+	movq	8(%rsi,%r9,1),%rax
 	adcq	$0,%rdx
 	movq	%rdx,%rdi
 
@@ -604,119 +682,126 @@
 
 	mulq	%rbp
 	addq	%rax,%rdi
-	movq	16(%rsi),%rax
+	movq	16(%rsi,%r9,1),%rax
 	adcq	$0,%rdx
 	addq	%r11,%rdi
-	leaq	4(%r15),%r15
+	leaq	32(%r9),%r15
+	leaq	32(%rcx),%rcx
 	adcq	$0,%rdx
-	movq	%rdi,(%rsp)
+	movq	%rdi,(%r14)
 	movq	%rdx,%r13
 	jmp	.L1st4x
-.align	16
+
+.align	32
 .L1st4x:
 	mulq	%rbx
 	addq	%rax,%r10
-	movq	-16(%rcx,%r15,8),%rax
+	movq	-16(%rcx),%rax
+	leaq	32(%r14),%r14
 	adcq	$0,%rdx
 	movq	%rdx,%r11
 
 	mulq	%rbp
 	addq	%rax,%r13
-	movq	-8(%rsi,%r15,8),%rax
+	movq	-8(%rsi,%r15,1),%rax
 	adcq	$0,%rdx
 	addq	%r10,%r13
 	adcq	$0,%rdx
-	movq	%r13,-24(%rsp,%r15,8)
+	movq	%r13,-24(%r14)
 	movq	%rdx,%rdi
 
 	mulq	%rbx
 	addq	%rax,%r11
-	movq	-8(%rcx,%r15,8),%rax
+	movq	-8(%rcx),%rax
 	adcq	$0,%rdx
 	movq	%rdx,%r10
 
 	mulq	%rbp
 	addq	%rax,%rdi
-	movq	(%rsi,%r15,8),%rax
+	movq	(%rsi,%r15,1),%rax
 	adcq	$0,%rdx
 	addq	%r11,%rdi
 	adcq	$0,%rdx
-	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdi,-16(%r14)
 	movq	%rdx,%r13
 
 	mulq	%rbx
 	addq	%rax,%r10
-	movq	(%rcx,%r15,8),%rax
+	movq	0(%rcx),%rax
 	adcq	$0,%rdx
 	movq	%rdx,%r11
 
 	mulq	%rbp
 	addq	%rax,%r13
-	movq	8(%rsi,%r15,8),%rax
+	movq	8(%rsi,%r15,1),%rax
 	adcq	$0,%rdx
 	addq	%r10,%r13
 	adcq	$0,%rdx
-	movq	%r13,-8(%rsp,%r15,8)
+	movq	%r13,-8(%r14)
 	movq	%rdx,%rdi
 
 	mulq	%rbx
 	addq	%rax,%r11
-	movq	8(%rcx,%r15,8),%rax
+	movq	8(%rcx),%rax
 	adcq	$0,%rdx
-	leaq	4(%r15),%r15
 	movq	%rdx,%r10
 
 	mulq	%rbp
 	addq	%rax,%rdi
-	movq	-16(%rsi,%r15,8),%rax
+	movq	16(%rsi,%r15,1),%rax
 	adcq	$0,%rdx
 	addq	%r11,%rdi
+	leaq	32(%rcx),%rcx
 	adcq	$0,%rdx
-	movq	%rdi,-32(%rsp,%r15,8)
+	movq	%rdi,(%r14)
 	movq	%rdx,%r13
-	cmpq	%r9,%r15
-	jl	.L1st4x
 
+	addq	$32,%r15
+	jnz	.L1st4x
+
 	mulq	%rbx
 	addq	%rax,%r10
-	movq	-16(%rcx,%r15,8),%rax
+	movq	-16(%rcx),%rax
+	leaq	32(%r14),%r14
 	adcq	$0,%rdx
 	movq	%rdx,%r11
 
 	mulq	%rbp
 	addq	%rax,%r13
-	movq	-8(%rsi,%r15,8),%rax
+	movq	-8(%rsi),%rax
 	adcq	$0,%rdx
 	addq	%r10,%r13
 	adcq	$0,%rdx
-	movq	%r13,-24(%rsp,%r15,8)
+	movq	%r13,-24(%r14)
 	movq	%rdx,%rdi
 
 	mulq	%rbx
 	addq	%rax,%r11
-	movq	-8(%rcx,%r15,8),%rax
+	movq	-8(%rcx),%rax
 	adcq	$0,%rdx
 	movq	%rdx,%r10
 
 	mulq	%rbp
 	addq	%rax,%rdi
-	movq	(%rsi),%rax
+	movq	(%rsi,%r9,1),%rax
 	adcq	$0,%rdx
 	addq	%r11,%rdi
 	adcq	$0,%rdx
-	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdi,-16(%r14)
 	movq	%rdx,%r13
 
+	leaq	(%rcx,%r9,1),%rcx
+
 	xorq	%rdi,%rdi
 	addq	%r10,%r13
 	adcq	$0,%rdi
-	movq	%r13,-8(%rsp,%r15,8)
-	movq	%rdi,(%rsp,%r15,8)
+	movq	%r13,-8(%r14)
 
-	leaq	1(%r14),%r14
-.align	4
+	jmp	.Louter4x
+
+.align	32
 .Louter4x:
-	leaq	32+128(%rsp,%r9,8),%rdx
+	leaq	16+128(%r14),%rdx
 	pxor	%xmm4,%xmm4
 	pxor	%xmm5,%xmm5
 	movdqa	-128(%r12),%xmm0
@@ -768,14 +853,12 @@
 	por	%xmm2,%xmm4
 	por	%xmm3,%xmm5
 	por	%xmm5,%xmm4
-	pshufd	$78,%xmm4,%xmm0
+	pshufd	$0x4e,%xmm4,%xmm0
 	por	%xmm4,%xmm0
 	leaq	256(%r12),%r12
 .byte	102,72,15,126,195
 
-	xorq	%r15,%r15
-
-	movq	(%rsp),%r10
+	movq	(%r14,%r9,1),%r10
 	movq	%r8,%rbp
 	mulq	%rbx
 	addq	%rax,%r10
@@ -784,10 +867,13 @@
 
 	imulq	%r10,%rbp
 	movq	%rdx,%r11
+	movq	%rdi,(%r14)
 
+	leaq	(%r14,%r9,1),%r14
+
 	mulq	%rbp
 	addq	%rax,%r10
-	movq	8(%rsi),%rax
+	movq	8(%rsi,%r9,1),%rax
 	adcq	$0,%rdx
 	movq	%rdx,%rdi
 
@@ -795,226 +881,2598 @@
 	addq	%rax,%r11
 	movq	8(%rcx),%rax
 	adcq	$0,%rdx
-	addq	8(%rsp),%r11
+	addq	8(%r14),%r11
 	adcq	$0,%rdx
 	movq	%rdx,%r10
 
 	mulq	%rbp
 	addq	%rax,%rdi
-	movq	16(%rsi),%rax
+	movq	16(%rsi,%r9,1),%rax
 	adcq	$0,%rdx
 	addq	%r11,%rdi
-	leaq	4(%r15),%r15
+	leaq	32(%r9),%r15
+	leaq	32(%rcx),%rcx
 	adcq	$0,%rdx
 	movq	%rdx,%r13
 	jmp	.Linner4x
-.align	16
+
+.align	32
 .Linner4x:
 	mulq	%rbx
 	addq	%rax,%r10
-	movq	-16(%rcx,%r15,8),%rax
+	movq	-16(%rcx),%rax
 	adcq	$0,%rdx
-	addq	-16(%rsp,%r15,8),%r10
+	addq	16(%r14),%r10
+	leaq	32(%r14),%r14
 	adcq	$0,%rdx
 	movq	%rdx,%r11
 
 	mulq	%rbp
 	addq	%rax,%r13
-	movq	-8(%rsi,%r15,8),%rax
+	movq	-8(%rsi,%r15,1),%rax
 	adcq	$0,%rdx
 	addq	%r10,%r13
 	adcq	$0,%rdx
-	movq	%rdi,-32(%rsp,%r15,8)
+	movq	%rdi,-32(%r14)
 	movq	%rdx,%rdi
 
 	mulq	%rbx
 	addq	%rax,%r11
-	movq	-8(%rcx,%r15,8),%rax
+	movq	-8(%rcx),%rax
 	adcq	$0,%rdx
-	addq	-8(%rsp,%r15,8),%r11
+	addq	-8(%r14),%r11
 	adcq	$0,%rdx
 	movq	%rdx,%r10
 
 	mulq	%rbp
 	addq	%rax,%rdi
-	movq	(%rsi,%r15,8),%rax
+	movq	(%rsi,%r15,1),%rax
 	adcq	$0,%rdx
 	addq	%r11,%rdi
 	adcq	$0,%rdx
-	movq	%r13,-24(%rsp,%r15,8)
+	movq	%r13,-24(%r14)
 	movq	%rdx,%r13
 
 	mulq	%rbx
 	addq	%rax,%r10
-	movq	(%rcx,%r15,8),%rax
+	movq	0(%rcx),%rax
 	adcq	$0,%rdx
-	addq	(%rsp,%r15,8),%r10
+	addq	(%r14),%r10
 	adcq	$0,%rdx
 	movq	%rdx,%r11
 
 	mulq	%rbp
 	addq	%rax,%r13
-	movq	8(%rsi,%r15,8),%rax
+	movq	8(%rsi,%r15,1),%rax
 	adcq	$0,%rdx
 	addq	%r10,%r13
 	adcq	$0,%rdx
-	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdi,-16(%r14)
 	movq	%rdx,%rdi
 
 	mulq	%rbx
 	addq	%rax,%r11
-	movq	8(%rcx,%r15,8),%rax
+	movq	8(%rcx),%rax
 	adcq	$0,%rdx
-	addq	8(%rsp,%r15,8),%r11
+	addq	8(%r14),%r11
 	adcq	$0,%rdx
-	leaq	4(%r15),%r15
 	movq	%rdx,%r10
 
 	mulq	%rbp
 	addq	%rax,%rdi
-	movq	-16(%rsi,%r15,8),%rax
+	movq	16(%rsi,%r15,1),%rax
 	adcq	$0,%rdx
 	addq	%r11,%rdi
+	leaq	32(%rcx),%rcx
 	adcq	$0,%rdx
-	movq	%r13,-40(%rsp,%r15,8)
+	movq	%r13,-8(%r14)
 	movq	%rdx,%r13
-	cmpq	%r9,%r15
-	jl	.Linner4x
 
+	addq	$32,%r15
+	jnz	.Linner4x
+
 	mulq	%rbx
 	addq	%rax,%r10
-	movq	-16(%rcx,%r15,8),%rax
+	movq	-16(%rcx),%rax
 	adcq	$0,%rdx
-	addq	-16(%rsp,%r15,8),%r10
+	addq	16(%r14),%r10
+	leaq	32(%r14),%r14
 	adcq	$0,%rdx
 	movq	%rdx,%r11
 
 	mulq	%rbp
 	addq	%rax,%r13
-	movq	-8(%rsi,%r15,8),%rax
+	movq	-8(%rsi),%rax
 	adcq	$0,%rdx
 	addq	%r10,%r13
 	adcq	$0,%rdx
-	movq	%rdi,-32(%rsp,%r15,8)
+	movq	%rdi,-32(%r14)
 	movq	%rdx,%rdi
 
 	mulq	%rbx
 	addq	%rax,%r11
-	movq	-8(%rcx,%r15,8),%rax
+	movq	%rbp,%rax
+	movq	-8(%rcx),%rbp
 	adcq	$0,%rdx
-	addq	-8(%rsp,%r15,8),%r11
+	addq	-8(%r14),%r11
 	adcq	$0,%rdx
-	leaq	1(%r14),%r14
 	movq	%rdx,%r10
 
 	mulq	%rbp
 	addq	%rax,%rdi
-	movq	(%rsi),%rax
+	movq	(%rsi,%r9,1),%rax
 	adcq	$0,%rdx
 	addq	%r11,%rdi
 	adcq	$0,%rdx
-	movq	%r13,-24(%rsp,%r15,8)
+	movq	%r13,-24(%r14)
 	movq	%rdx,%r13
 
-	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdi,-16(%r14)
+	leaq	(%rcx,%r9,1),%rcx
 
 	xorq	%rdi,%rdi
 	addq	%r10,%r13
 	adcq	$0,%rdi
-	addq	(%rsp,%r9,8),%r13
+	addq	(%r14),%r13
 	adcq	$0,%rdi
-	movq	%r13,-8(%rsp,%r15,8)
-	movq	%rdi,(%rsp,%r15,8)
+	movq	%r13,-8(%r14)
 
-	cmpq	%r9,%r14
-	jl	.Louter4x
-	movq	16(%rsp,%r9,8),%rdi
-	movq	0(%rsp),%rax
-	pxor	%xmm0,%xmm0
-	movq	8(%rsp),%rdx
-	shrq	$2,%r9
-	leaq	(%rsp),%rsi
+	cmpq	16+8(%rsp),%r12
+	jb	.Louter4x
+	xorq	%rax,%rax
+	subq	%r13,%rbp
+	adcq	%r15,%r15
+	orq	%r15,%rdi
+	subq	%rdi,%rax
+	leaq	(%r14,%r9,1),%rbx
+	movq	(%rcx),%r12
+	leaq	(%rcx),%rbp
+	movq	%r9,%rcx
+	sarq	$3+2,%rcx
+	movq	56+8(%rsp),%rdi
+	decq	%r12
+	xorq	%r10,%r10
+	movq	8(%rbp),%r13
+	movq	16(%rbp),%r14
+	movq	24(%rbp),%r15
+	jmp	.Lsqr4x_sub_entry
+.size	mul4x_internal,.-mul4x_internal
+.globl	bn_power5
+.type	bn_power5, at function
+.align	32
+bn_power5:
+	movq	%rsp,%rax
+	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
+	andl	$0x80108,%r11d
+	cmpl	$0x80108,%r11d
+	je	.Lpowerx5_enter
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+.Lpower5_prologue:
+
+	shll	$3,%r9d
+	leal	(%r9,%r9,2),%r10d
+	negq	%r9
+	movq	(%r8),%r8
+
+
+
+
+
+
+
+
+	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
+	subq	%rdi,%r11
+	andq	$4095,%r11
+	cmpq	%r11,%r10
+	jb	.Lpwr_sp_alt
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
+	jmp	.Lpwr_sp_done
+
+.align	32
+.Lpwr_sp_alt:
+	leaq	4096-320(,%r9,2),%r10
+	leaq	-320(%rbp,%r9,2),%rbp
+	subq	%r10,%r11
+	movq	$0,%r10
+	cmovcq	%r10,%r11
+	subq	%r11,%rbp
+.Lpwr_sp_done:
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lpwr_page_walk
+	jmp	.Lpwr_page_walk_done
+
+.Lpwr_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lpwr_page_walk
+.Lpwr_page_walk_done:
+
+	movq	%r9,%r10
+	negq	%r9
+
+
+
+
+
+
+
+
+
+
+	movq	%r8,32(%rsp)
+	movq	%rax,40(%rsp)
+.Lpower5_body:
+.byte	102,72,15,110,207
+.byte	102,72,15,110,209
+.byte	102,73,15,110,218
+.byte	102,72,15,110,226
+
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+
+.byte	102,72,15,126,209
+.byte	102,72,15,126,226
+	movq	%rsi,%rdi
+	movq	40(%rsp),%rax
+	leaq	32(%rsp),%r8
+
+	call	mul4x_internal
+
+	movq	40(%rsp),%rsi
+	movq	$1,%rax
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
+.Lpower5_epilogue:
+	.byte	0xf3,0xc3
+.size	bn_power5,.-bn_power5
+
+.globl	bn_sqr8x_internal
+.hidden	bn_sqr8x_internal
+.type	bn_sqr8x_internal, at function
+.align	32
+bn_sqr8x_internal:
+__bn_sqr8x_internal:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	leaq	32(%r10),%rbp
+	leaq	(%rsi,%r9,1),%rsi
+
+	movq	%r9,%rcx
+
+
+	movq	-32(%rsi,%rbp,1),%r14
+	leaq	48+8(%rsp,%r9,2),%rdi
+	movq	-24(%rsi,%rbp,1),%rax
+	leaq	-32(%rdi,%rbp,1),%rdi
+	movq	-16(%rsi,%rbp,1),%rbx
+	movq	%rax,%r15
+
+	mulq	%r14
+	movq	%rax,%r10
+	movq	%rbx,%rax
+	movq	%rdx,%r11
+	movq	%r10,-24(%rdi,%rbp,1)
+
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	movq	%r11,-16(%rdi,%rbp,1)
+	movq	%rdx,%r10
+
+
+	movq	-8(%rsi,%rbp,1),%rbx
+	mulq	%r15
+	movq	%rax,%r12
+	movq	%rbx,%rax
+	movq	%rdx,%r13
+
+	leaq	(%rbp),%rcx
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	addq	%r12,%r10
+	adcq	$0,%r11
+	movq	%r10,-8(%rdi,%rcx,1)
+	jmp	.Lsqr4x_1st
+
+.align	32
+.Lsqr4x_1st:
+	movq	(%rsi,%rcx,1),%rbx
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%rbx,%rax
+	movq	%rdx,%r12
+	adcq	$0,%r12
+
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	movq	8(%rsi,%rcx,1),%rbx
+	movq	%rdx,%r10
+	adcq	$0,%r10
+	addq	%r13,%r11
+	adcq	$0,%r10
+
+
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	movq	%r11,(%rdi,%rcx,1)
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	movq	16(%rsi,%rcx,1),%rbx
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	addq	%r12,%r10
+	adcq	$0,%r11
+
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%rbx,%rax
+	movq	%r10,8(%rdi,%rcx,1)
+	movq	%rdx,%r12
+	adcq	$0,%r12
+
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	movq	24(%rsi,%rcx,1),%rbx
+	movq	%rdx,%r10
+	adcq	$0,%r10
+	addq	%r13,%r11
+	adcq	$0,%r10
+
+
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	movq	%r11,16(%rdi,%rcx,1)
+	movq	%rdx,%r13
+	adcq	$0,%r13
+	leaq	32(%rcx),%rcx
+
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	addq	%r12,%r10
+	adcq	$0,%r11
+	movq	%r10,-8(%rdi,%rcx,1)
+
+	cmpq	$0,%rcx
+	jne	.Lsqr4x_1st
+
+	mulq	%r15
+	addq	%rax,%r13
+	leaq	16(%rbp),%rbp
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+
+	movq	%r13,(%rdi)
+	movq	%rdx,%r12
+	movq	%rdx,8(%rdi)
+	jmp	.Lsqr4x_outer
+
+.align	32
+.Lsqr4x_outer:
+	movq	-32(%rsi,%rbp,1),%r14
+	leaq	48+8(%rsp,%r9,2),%rdi
+	movq	-24(%rsi,%rbp,1),%rax
+	leaq	-32(%rdi,%rbp,1),%rdi
+	movq	-16(%rsi,%rbp,1),%rbx
+	movq	%rax,%r15
+
+	mulq	%r14
+	movq	-24(%rdi,%rbp,1),%r10
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	movq	%r10,-24(%rdi,%rbp,1)
+	movq	%rdx,%r11
+
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	addq	-16(%rdi,%rbp,1),%r11
+	movq	%rdx,%r10
+	adcq	$0,%r10
+	movq	%r11,-16(%rdi,%rbp,1)
+
+	xorq	%r12,%r12
+
+	movq	-8(%rsi,%rbp,1),%rbx
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	addq	-8(%rdi,%rbp,1),%r12
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	addq	%r12,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	movq	%r10,-8(%rdi,%rbp,1)
+
+	leaq	(%rbp),%rcx
+	jmp	.Lsqr4x_inner
+
+.align	32
+.Lsqr4x_inner:
+	movq	(%rsi,%rcx,1),%rbx
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%rbx,%rax
+	movq	%rdx,%r12
+	adcq	$0,%r12
+	addq	(%rdi,%rcx,1),%r13
+	adcq	$0,%r12
+
+.byte	0x67
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	movq	8(%rsi,%rcx,1),%rbx
+	movq	%rdx,%r10
+	adcq	$0,%r10
+	addq	%r13,%r11
+	adcq	$0,%r10
+
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%r11,(%rdi,%rcx,1)
+	movq	%rbx,%rax
+	movq	%rdx,%r13
+	adcq	$0,%r13
+	addq	8(%rdi,%rcx,1),%r12
+	leaq	16(%rcx),%rcx
+	adcq	$0,%r13
+
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	addq	%r12,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	movq	%r10,-8(%rdi,%rcx,1)
+
+	cmpq	$0,%rcx
+	jne	.Lsqr4x_inner
+
+.byte	0x67
+	mulq	%r15
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+
+	movq	%r13,(%rdi)
+	movq	%rdx,%r12
+	movq	%rdx,8(%rdi)
+
+	addq	$16,%rbp
+	jnz	.Lsqr4x_outer
+
+
+	movq	-32(%rsi),%r14
+	leaq	48+8(%rsp,%r9,2),%rdi
+	movq	-24(%rsi),%rax
+	leaq	-32(%rdi,%rbp,1),%rdi
+	movq	-16(%rsi),%rbx
+	movq	%rax,%r15
+
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	movq	%rdx,%r11
+	adcq	$0,%r11
+
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	movq	%r10,-24(%rdi)
+	movq	%rdx,%r10
+	adcq	$0,%r10
+	addq	%r13,%r11
+	movq	-8(%rsi),%rbx
+	adcq	$0,%r10
+
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	movq	%r11,-16(%rdi)
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	addq	%r12,%r10
+	adcq	$0,%r11
+	movq	%r10,-8(%rdi)
+
+	mulq	%r15
+	addq	%rax,%r13
+	movq	-16(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+
+	movq	%r13,(%rdi)
+	movq	%rdx,%r12
+	movq	%rdx,8(%rdi)
+
+	mulq	%rbx
+	addq	$16,%rbp
 	xorq	%r14,%r14
+	subq	%r9,%rbp
+	xorq	%r15,%r15
 
-	subq	0(%rcx),%rax
-	movq	16(%rsi),%rbx
-	movq	24(%rsi),%rbp
-	sbbq	8(%rcx),%rdx
-	leaq	-1(%r9),%r15
-	jmp	.Lsub4x
+	addq	%r12,%rax
+	adcq	$0,%rdx
+	movq	%rax,8(%rdi)
+	movq	%rdx,16(%rdi)
+	movq	%r15,24(%rdi)
+
+	movq	-16(%rsi,%rbp,1),%rax
+	leaq	48+8(%rsp),%rdi
+	xorq	%r10,%r10
+	movq	8(%rdi),%r11
+
+	leaq	(%r14,%r10,2),%r12
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r13
+	shrq	$63,%r11
+	orq	%r10,%r13
+	movq	16(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	24(%rdi),%r11
+	adcq	%rax,%r12
+	movq	-8(%rsi,%rbp,1),%rax
+	movq	%r12,(%rdi)
+	adcq	%rdx,%r13
+
+	leaq	(%r14,%r10,2),%rbx
+	movq	%r13,8(%rdi)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r8
+	shrq	$63,%r11
+	orq	%r10,%r8
+	movq	32(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	40(%rdi),%r11
+	adcq	%rax,%rbx
+	movq	0(%rsi,%rbp,1),%rax
+	movq	%rbx,16(%rdi)
+	adcq	%rdx,%r8
+	leaq	16(%rbp),%rbp
+	movq	%r8,24(%rdi)
+	sbbq	%r15,%r15
+	leaq	64(%rdi),%rdi
+	jmp	.Lsqr4x_shift_n_add
+
+.align	32
+.Lsqr4x_shift_n_add:
+	leaq	(%r14,%r10,2),%r12
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r13
+	shrq	$63,%r11
+	orq	%r10,%r13
+	movq	-16(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	-8(%rdi),%r11
+	adcq	%rax,%r12
+	movq	-8(%rsi,%rbp,1),%rax
+	movq	%r12,-32(%rdi)
+	adcq	%rdx,%r13
+
+	leaq	(%r14,%r10,2),%rbx
+	movq	%r13,-24(%rdi)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r8
+	shrq	$63,%r11
+	orq	%r10,%r8
+	movq	0(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	8(%rdi),%r11
+	adcq	%rax,%rbx
+	movq	0(%rsi,%rbp,1),%rax
+	movq	%rbx,-16(%rdi)
+	adcq	%rdx,%r8
+
+	leaq	(%r14,%r10,2),%r12
+	movq	%r8,-8(%rdi)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r13
+	shrq	$63,%r11
+	orq	%r10,%r13
+	movq	16(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	24(%rdi),%r11
+	adcq	%rax,%r12
+	movq	8(%rsi,%rbp,1),%rax
+	movq	%r12,0(%rdi)
+	adcq	%rdx,%r13
+
+	leaq	(%r14,%r10,2),%rbx
+	movq	%r13,8(%rdi)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r8
+	shrq	$63,%r11
+	orq	%r10,%r8
+	movq	32(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	40(%rdi),%r11
+	adcq	%rax,%rbx
+	movq	16(%rsi,%rbp,1),%rax
+	movq	%rbx,16(%rdi)
+	adcq	%rdx,%r8
+	movq	%r8,24(%rdi)
+	sbbq	%r15,%r15
+	leaq	64(%rdi),%rdi
+	addq	$32,%rbp
+	jnz	.Lsqr4x_shift_n_add
+
+	leaq	(%r14,%r10,2),%r12
+.byte	0x67
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r13
+	shrq	$63,%r11
+	orq	%r10,%r13
+	movq	-16(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	-8(%rdi),%r11
+	adcq	%rax,%r12
+	movq	-8(%rsi),%rax
+	movq	%r12,-32(%rdi)
+	adcq	%rdx,%r13
+
+	leaq	(%r14,%r10,2),%rbx
+	movq	%r13,-24(%rdi)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r8
+	shrq	$63,%r11
+	orq	%r10,%r8
+	mulq	%rax
+	negq	%r15
+	adcq	%rax,%rbx
+	adcq	%rdx,%r8
+	movq	%rbx,-16(%rdi)
+	movq	%r8,-8(%rdi)
+.byte	102,72,15,126,213
+__bn_sqr8x_reduction:
+	xorq	%rax,%rax
+	leaq	(%r9,%rbp,1),%rcx
+	leaq	48+8(%rsp,%r9,2),%rdx
+	movq	%rcx,0+8(%rsp)
+	leaq	48+8(%rsp,%r9,1),%rdi
+	movq	%rdx,8+8(%rsp)
+	negq	%r9
+	jmp	.L8x_reduction_loop
+
+.align	32
+.L8x_reduction_loop:
+	leaq	(%rdi,%r9,1),%rdi
+.byte	0x66
+	movq	0(%rdi),%rbx
+	movq	8(%rdi),%r9
+	movq	16(%rdi),%r10
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%r12
+	movq	40(%rdi),%r13
+	movq	48(%rdi),%r14
+	movq	56(%rdi),%r15
+	movq	%rax,(%rdx)
+	leaq	64(%rdi),%rdi
+
+.byte	0x67
+	movq	%rbx,%r8
+	imulq	32+8(%rsp),%rbx
+	movq	0(%rbp),%rax
+	movl	$8,%ecx
+	jmp	.L8x_reduce
+
+.align	32
+.L8x_reduce:
+	mulq	%rbx
+	movq	8(%rbp),%rax
+	negq	%r8
+	movq	%rdx,%r8
+	adcq	$0,%r8
+
+	mulq	%rbx
+	addq	%rax,%r9
+	movq	16(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r9,%r8
+	movq	%rbx,48-8+8(%rsp,%rcx,8)
+	movq	%rdx,%r9
+	adcq	$0,%r9
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	24(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r9
+	movq	32+8(%rsp),%rsi
+	movq	%rdx,%r10
+	adcq	$0,%r10
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	32(%rbp),%rax
+	adcq	$0,%rdx
+	imulq	%r8,%rsi
+	addq	%r11,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+
+	mulq	%rbx
+	addq	%rax,%r12
+	movq	40(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r12,%r11
+	movq	%rdx,%r12
+	adcq	$0,%r12
+
+	mulq	%rbx
+	addq	%rax,%r13
+	movq	48(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r13,%r12
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%rbx
+	addq	%rax,%r14
+	movq	56(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r14,%r13
+	movq	%rdx,%r14
+	adcq	$0,%r14
+
+	mulq	%rbx
+	movq	%rsi,%rbx
+	addq	%rax,%r15
+	movq	0(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r15,%r14
+	movq	%rdx,%r15
+	adcq	$0,%r15
+
+	decl	%ecx
+	jnz	.L8x_reduce
+
+	leaq	64(%rbp),%rbp
+	xorq	%rax,%rax
+	movq	8+8(%rsp),%rdx
+	cmpq	0+8(%rsp),%rbp
+	jae	.L8x_no_tail
+
+.byte	0x66
+	addq	0(%rdi),%r8
+	adcq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	sbbq	%rsi,%rsi
+
+	movq	48+56+8(%rsp),%rbx
+	movl	$8,%ecx
+	movq	0(%rbp),%rax
+	jmp	.L8x_tail
+
+.align	32
+.L8x_tail:
+	mulq	%rbx
+	addq	%rax,%r8
+	movq	8(%rbp),%rax
+	movq	%r8,(%rdi)
+	movq	%rdx,%r8
+	adcq	$0,%r8
+
+	mulq	%rbx
+	addq	%rax,%r9
+	movq	16(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r9,%r8
+	leaq	8(%rdi),%rdi
+	movq	%rdx,%r9
+	adcq	$0,%r9
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	24(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r9
+	movq	%rdx,%r10
+	adcq	$0,%r10
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	32(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+
+	mulq	%rbx
+	addq	%rax,%r12
+	movq	40(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r12,%r11
+	movq	%rdx,%r12
+	adcq	$0,%r12
+
+	mulq	%rbx
+	addq	%rax,%r13
+	movq	48(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r13,%r12
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%rbx
+	addq	%rax,%r14
+	movq	56(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r14,%r13
+	movq	%rdx,%r14
+	adcq	$0,%r14
+
+	mulq	%rbx
+	movq	48-16+8(%rsp,%rcx,8),%rbx
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	addq	%r15,%r14
+	movq	0(%rbp),%rax
+	movq	%rdx,%r15
+	adcq	$0,%r15
+
+	decl	%ecx
+	jnz	.L8x_tail
+
+	leaq	64(%rbp),%rbp
+	movq	8+8(%rsp),%rdx
+	cmpq	0+8(%rsp),%rbp
+	jae	.L8x_tail_done
+
+	movq	48+56+8(%rsp),%rbx
+	negq	%rsi
+	movq	0(%rbp),%rax
+	adcq	0(%rdi),%r8
+	adcq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	sbbq	%rsi,%rsi
+
+	movl	$8,%ecx
+	jmp	.L8x_tail
+
+.align	32
+.L8x_tail_done:
+	xorq	%rax,%rax
+	addq	(%rdx),%r8
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+	adcq	$0,%r14
+	adcq	$0,%r15
+	adcq	$0,%rax
+
+	negq	%rsi
+.L8x_no_tail:
+	adcq	0(%rdi),%r8
+	adcq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	adcq	$0,%rax
+	movq	-8(%rbp),%rcx
+	xorq	%rsi,%rsi
+
+.byte	102,72,15,126,213
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+.byte	102,73,15,126,217
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+	movq	%r14,48(%rdi)
+	movq	%r15,56(%rdi)
+	leaq	64(%rdi),%rdi
+
+	cmpq	%rdx,%rdi
+	jb	.L8x_reduction_loop
+	.byte	0xf3,0xc3
+.size	bn_sqr8x_internal,.-bn_sqr8x_internal
+.type	__bn_post4x_internal, at function
+.align	32
+__bn_post4x_internal:
+	movq	0(%rbp),%r12
+	leaq	(%rdi,%r9,1),%rbx
+	movq	%r9,%rcx
+.byte	102,72,15,126,207
+	negq	%rax
+.byte	102,72,15,126,206
+	sarq	$3+2,%rcx
+	decq	%r12
+	xorq	%r10,%r10
+	movq	8(%rbp),%r13
+	movq	16(%rbp),%r14
+	movq	24(%rbp),%r15
+	jmp	.Lsqr4x_sub_entry
+
 .align	16
-.Lsub4x:
-	movq	%rax,0(%rdi,%r14,8)
-	movq	%rdx,8(%rdi,%r14,8)
-	sbbq	16(%rcx,%r14,8),%rbx
-	movq	32(%rsi,%r14,8),%rax
-	movq	40(%rsi,%r14,8),%rdx
-	sbbq	24(%rcx,%r14,8),%rbp
-	movq	%rbx,16(%rdi,%r14,8)
-	movq	%rbp,24(%rdi,%r14,8)
-	sbbq	32(%rcx,%r14,8),%rax
-	movq	48(%rsi,%r14,8),%rbx
-	movq	56(%rsi,%r14,8),%rbp
-	sbbq	40(%rcx,%r14,8),%rdx
-	leaq	4(%r14),%r14
-	decq	%r15
-	jnz	.Lsub4x
+.Lsqr4x_sub:
+	movq	0(%rbp),%r12
+	movq	8(%rbp),%r13
+	movq	16(%rbp),%r14
+	movq	24(%rbp),%r15
+.Lsqr4x_sub_entry:
+	leaq	32(%rbp),%rbp
+	notq	%r12
+	notq	%r13
+	notq	%r14
+	notq	%r15
+	andq	%rax,%r12
+	andq	%rax,%r13
+	andq	%rax,%r14
+	andq	%rax,%r15
 
-	movq	%rax,0(%rdi,%r14,8)
-	movq	32(%rsi,%r14,8),%rax
-	sbbq	16(%rcx,%r14,8),%rbx
-	movq	%rdx,8(%rdi,%r14,8)
-	sbbq	24(%rcx,%r14,8),%rbp
-	movq	%rbx,16(%rdi,%r14,8)
+	negq	%r10
+	adcq	0(%rbx),%r12
+	adcq	8(%rbx),%r13
+	adcq	16(%rbx),%r14
+	adcq	24(%rbx),%r15
+	movq	%r12,0(%rdi)
+	leaq	32(%rbx),%rbx
+	movq	%r13,8(%rdi)
+	sbbq	%r10,%r10
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+	leaq	32(%rdi),%rdi
 
-	sbbq	$0,%rax
-	movq	%rbp,24(%rdi,%r14,8)
-	xorq	%r14,%r14
-	andq	%rax,%rsi
-	notq	%rax
-	movq	%rdi,%rcx
-	andq	%rax,%rcx
-	leaq	-1(%r9),%r15
-	orq	%rcx,%rsi
+	incq	%rcx
+	jnz	.Lsqr4x_sub
 
+	movq	%r9,%r10
+	negq	%r9
+	.byte	0xf3,0xc3
+.size	__bn_post4x_internal,.-__bn_post4x_internal
+.globl	bn_from_montgomery
+.type	bn_from_montgomery, at function
+.align	32
+bn_from_montgomery:
+	testl	$7,%r9d
+	jz	bn_from_mont8x
+	xorl	%eax,%eax
+	.byte	0xf3,0xc3
+.size	bn_from_montgomery,.-bn_from_montgomery
+
+.type	bn_from_mont8x, at function
+.align	32
+bn_from_mont8x:
+.byte	0x67
+	movq	%rsp,%rax
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+.Lfrom_prologue:
+
+	shll	$3,%r9d
+	leaq	(%r9,%r9,2),%r10
+	negq	%r9
+	movq	(%r8),%r8
+
+
+
+
+
+
+
+
+	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
+	subq	%rdi,%r11
+	andq	$4095,%r11
+	cmpq	%r11,%r10
+	jb	.Lfrom_sp_alt
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
+	jmp	.Lfrom_sp_done
+
+.align	32
+.Lfrom_sp_alt:
+	leaq	4096-320(,%r9,2),%r10
+	leaq	-320(%rbp,%r9,2),%rbp
+	subq	%r10,%r11
+	movq	$0,%r10
+	cmovcq	%r10,%r11
+	subq	%r11,%rbp
+.Lfrom_sp_done:
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lfrom_page_walk
+	jmp	.Lfrom_page_walk_done
+
+.Lfrom_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lfrom_page_walk
+.Lfrom_page_walk_done:
+
+	movq	%r9,%r10
+	negq	%r9
+
+
+
+
+
+
+
+
+
+
+	movq	%r8,32(%rsp)
+	movq	%rax,40(%rsp)
+.Lfrom_body:
+	movq	%r9,%r11
+	leaq	48(%rsp),%rax
+	pxor	%xmm0,%xmm0
+	jmp	.Lmul_by_1
+
+.align	32
+.Lmul_by_1:
 	movdqu	(%rsi),%xmm1
-	movdqa	%xmm0,(%rsp)
-	movdqu	%xmm1,(%rdi)
-	jmp	.Lcopy4x
-.align	16
-.Lcopy4x:
-	movdqu	16(%rsi,%r14,1),%xmm2
-	movdqu	32(%rsi,%r14,1),%xmm1
-	movdqa	%xmm0,16(%rsp,%r14,1)
-	movdqu	%xmm2,16(%rdi,%r14,1)
-	movdqa	%xmm0,32(%rsp,%r14,1)
-	movdqu	%xmm1,32(%rdi,%r14,1)
-	leaq	32(%r14),%r14
-	decq	%r15
-	jnz	.Lcopy4x
+	movdqu	16(%rsi),%xmm2
+	movdqu	32(%rsi),%xmm3
+	movdqa	%xmm0,(%rax,%r9,1)
+	movdqu	48(%rsi),%xmm4
+	movdqa	%xmm0,16(%rax,%r9,1)
+.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
+	movdqa	%xmm1,(%rax)
+	movdqa	%xmm0,32(%rax,%r9,1)
+	movdqa	%xmm2,16(%rax)
+	movdqa	%xmm0,48(%rax,%r9,1)
+	movdqa	%xmm3,32(%rax)
+	movdqa	%xmm4,48(%rax)
+	leaq	64(%rax),%rax
+	subq	$64,%r11
+	jnz	.Lmul_by_1
 
-	shlq	$2,%r9
-	movdqu	16(%rsi,%r14,1),%xmm2
-	movdqa	%xmm0,16(%rsp,%r14,1)
-	movdqu	%xmm2,16(%rdi,%r14,1)
-	movq	8(%rsp,%r9,8),%rsi
+.byte	102,72,15,110,207
+.byte	102,72,15,110,209
+.byte	0x67
+	movq	%rcx,%rbp
+.byte	102,73,15,110,218
+	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
+	andl	$0x80108,%r11d
+	cmpl	$0x80108,%r11d
+	jne	.Lfrom_mont_nox
+
+	leaq	(%rax,%r9,1),%rdi
+	call	__bn_sqrx8x_reduction
+	call	__bn_postx4x_internal
+
+	pxor	%xmm0,%xmm0
+	leaq	48(%rsp),%rax
+	movq	40(%rsp),%rsi
+	jmp	.Lfrom_mont_zero
+
+.align	32
+.Lfrom_mont_nox:
+	call	__bn_sqr8x_reduction
+	call	__bn_post4x_internal
+
+	pxor	%xmm0,%xmm0
+	leaq	48(%rsp),%rax
+	movq	40(%rsp),%rsi
+	jmp	.Lfrom_mont_zero
+
+.align	32
+.Lfrom_mont_zero:
+	movdqa	%xmm0,0(%rax)
+	movdqa	%xmm0,16(%rax)
+	movdqa	%xmm0,32(%rax)
+	movdqa	%xmm0,48(%rax)
+	leaq	64(%rax),%rax
+	subq	$32,%r9
+	jnz	.Lfrom_mont_zero
+
 	movq	$1,%rax
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
+.Lfrom_epilogue:
+	.byte	0xf3,0xc3
+.size	bn_from_mont8x,.-bn_from_mont8x
+.type	bn_mulx4x_mont_gather5, at function
+.align	32
+bn_mulx4x_mont_gather5:
+	movq	%rsp,%rax
+.Lmulx4x_enter:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+.Lmulx4x_prologue:
 
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
-.Lmul4x_epilogue:
+	shll	$3,%r9d
+	leaq	(%r9,%r9,2),%r10
+	negq	%r9
+	movq	(%r8),%r8
+
+
+
+
+
+
+
+
+
+
+	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
+	subq	%rdi,%r11
+	andq	$4095,%r11
+	cmpq	%r11,%r10
+	jb	.Lmulx4xsp_alt
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
+	jmp	.Lmulx4xsp_done
+
+.Lmulx4xsp_alt:
+	leaq	4096-320(,%r9,2),%r10
+	leaq	-320(%rbp,%r9,2),%rbp
+	subq	%r10,%r11
+	movq	$0,%r10
+	cmovcq	%r10,%r11
+	subq	%r11,%rbp
+.Lmulx4xsp_done:
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmulx4x_page_walk
+	jmp	.Lmulx4x_page_walk_done
+
+.Lmulx4x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
+
+
+
+
+
+
+
+
+
+
+
+
+
+	movq	%r8,32(%rsp)
+	movq	%rax,40(%rsp)
+.Lmulx4x_body:
+	call	mulx4x_internal
+
+	movq	40(%rsp),%rsi
+	movq	$1,%rax
+
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
+.Lmulx4x_epilogue:
 	.byte	0xf3,0xc3
-.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
+
+.type	mulx4x_internal, at function
+.align	32
+mulx4x_internal:
+	movq	%r9,8(%rsp)
+	movq	%r9,%r10
+	negq	%r9
+	shlq	$5,%r9
+	negq	%r10
+	leaq	128(%rdx,%r9,1),%r13
+	shrq	$5+5,%r9
+	movd	8(%rax),%xmm5
+	subq	$1,%r9
+	leaq	.Linc(%rip),%rax
+	movq	%r13,16+8(%rsp)
+	movq	%r9,24+8(%rsp)
+	movq	%rdi,56+8(%rsp)
+	movdqa	0(%rax),%xmm0
+	movdqa	16(%rax),%xmm1
+	leaq	88-112(%rsp,%r10,1),%r10
+	leaq	128(%rdx),%rdi
+
+	pshufd	$0,%xmm5,%xmm5
+	movdqa	%xmm1,%xmm4
+.byte	0x67
+	movdqa	%xmm1,%xmm2
+.byte	0x67
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,112(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,128(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,144(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,160(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,176(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,192(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,208(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,224(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,240(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,256(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,272(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,288(%r10)
+	movdqa	%xmm4,%xmm3
+.byte	0x67
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,304(%r10)
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,320(%r10)
+
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,336(%r10)
+
+	pand	64(%rdi),%xmm0
+	pand	80(%rdi),%xmm1
+	pand	96(%rdi),%xmm2
+	movdqa	%xmm3,352(%r10)
+	pand	112(%rdi),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	-128(%rdi),%xmm4
+	movdqa	-112(%rdi),%xmm5
+	movdqa	-96(%rdi),%xmm2
+	pand	112(%r10),%xmm4
+	movdqa	-80(%rdi),%xmm3
+	pand	128(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	144(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	160(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	-64(%rdi),%xmm4
+	movdqa	-48(%rdi),%xmm5
+	movdqa	-32(%rdi),%xmm2
+	pand	176(%r10),%xmm4
+	movdqa	-16(%rdi),%xmm3
+	pand	192(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	208(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	224(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	0(%rdi),%xmm4
+	movdqa	16(%rdi),%xmm5
+	movdqa	32(%rdi),%xmm2
+	pand	240(%r10),%xmm4
+	movdqa	48(%rdi),%xmm3
+	pand	256(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	272(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	288(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	pxor	%xmm1,%xmm0
+	pshufd	$0x4e,%xmm0,%xmm1
+	por	%xmm1,%xmm0
+	leaq	256(%rdi),%rdi
+.byte	102,72,15,126,194
+	leaq	64+32+8(%rsp),%rbx
+
+	movq	%rdx,%r9
+	mulxq	0(%rsi),%r8,%rax
+	mulxq	8(%rsi),%r11,%r12
+	addq	%rax,%r11
+	mulxq	16(%rsi),%rax,%r13
+	adcq	%rax,%r12
+	adcq	$0,%r13
+	mulxq	24(%rsi),%rax,%r14
+
+	movq	%r8,%r15
+	imulq	32+8(%rsp),%r8
+	xorq	%rbp,%rbp
+	movq	%r8,%rdx
+
+	movq	%rdi,8+8(%rsp)
+
+	leaq	32(%rsi),%rsi
+	adcxq	%rax,%r13
+	adcxq	%rbp,%r14
+
+	mulxq	0(%rcx),%rax,%r10
+	adcxq	%rax,%r15
+	adoxq	%r11,%r10
+	mulxq	8(%rcx),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+	mulxq	16(%rcx),%rax,%r12
+	movq	24+8(%rsp),%rdi
+	movq	%r10,-32(%rbx)
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	%r11,-24(%rbx)
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r15
+	leaq	32(%rcx),%rcx
+	movq	%r12,-16(%rbx)
+	jmp	.Lmulx4x_1st
+
+.align	32
+.Lmulx4x_1st:
+	adcxq	%rbp,%r15
+	mulxq	0(%rsi),%r10,%rax
+	adcxq	%r14,%r10
+	mulxq	8(%rsi),%r11,%r14
+	adcxq	%rax,%r11
+	mulxq	16(%rsi),%r12,%rax
+	adcxq	%r14,%r12
+	mulxq	24(%rsi),%r13,%r14
+.byte	0x67,0x67
+	movq	%r8,%rdx
+	adcxq	%rax,%r13
+	adcxq	%rbp,%r14
+	leaq	32(%rsi),%rsi
+	leaq	32(%rbx),%rbx
+
+	adoxq	%r15,%r10
+	mulxq	0(%rcx),%rax,%r15
+	adcxq	%rax,%r10
+	adoxq	%r15,%r11
+	mulxq	8(%rcx),%rax,%r15
+	adcxq	%rax,%r11
+	adoxq	%r15,%r12
+	mulxq	16(%rcx),%rax,%r15
+	movq	%r10,-40(%rbx)
+	adcxq	%rax,%r12
+	movq	%r11,-32(%rbx)
+	adoxq	%r15,%r13
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	%r12,-24(%rbx)
+	adcxq	%rax,%r13
+	adoxq	%rbp,%r15
+	leaq	32(%rcx),%rcx
+	movq	%r13,-16(%rbx)
+
+	decq	%rdi
+	jnz	.Lmulx4x_1st
+
+	movq	8(%rsp),%rax
+	adcq	%rbp,%r15
+	leaq	(%rsi,%rax,1),%rsi
+	addq	%r15,%r14
+	movq	8+8(%rsp),%rdi
+	adcq	%rbp,%rbp
+	movq	%r14,-8(%rbx)
+	jmp	.Lmulx4x_outer
+
+.align	32
+.Lmulx4x_outer:
+	leaq	16-256(%rbx),%r10
+	pxor	%xmm4,%xmm4
+.byte	0x67,0x67
+	pxor	%xmm5,%xmm5
+	movdqa	-128(%rdi),%xmm0
+	movdqa	-112(%rdi),%xmm1
+	movdqa	-96(%rdi),%xmm2
+	pand	256(%r10),%xmm0
+	movdqa	-80(%rdi),%xmm3
+	pand	272(%r10),%xmm1
+	por	%xmm0,%xmm4
+	pand	288(%r10),%xmm2
+	por	%xmm1,%xmm5
+	pand	304(%r10),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	-64(%rdi),%xmm0
+	movdqa	-48(%rdi),%xmm1
+	movdqa	-32(%rdi),%xmm2
+	pand	320(%r10),%xmm0
+	movdqa	-16(%rdi),%xmm3
+	pand	336(%r10),%xmm1
+	por	%xmm0,%xmm4
+	pand	352(%r10),%xmm2
+	por	%xmm1,%xmm5
+	pand	368(%r10),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	0(%rdi),%xmm0
+	movdqa	16(%rdi),%xmm1
+	movdqa	32(%rdi),%xmm2
+	pand	384(%r10),%xmm0
+	movdqa	48(%rdi),%xmm3
+	pand	400(%r10),%xmm1
+	por	%xmm0,%xmm4
+	pand	416(%r10),%xmm2
+	por	%xmm1,%xmm5
+	pand	432(%r10),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	64(%rdi),%xmm0
+	movdqa	80(%rdi),%xmm1
+	movdqa	96(%rdi),%xmm2
+	pand	448(%r10),%xmm0
+	movdqa	112(%rdi),%xmm3
+	pand	464(%r10),%xmm1
+	por	%xmm0,%xmm4
+	pand	480(%r10),%xmm2
+	por	%xmm1,%xmm5
+	pand	496(%r10),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	por	%xmm5,%xmm4
+	pshufd	$0x4e,%xmm4,%xmm0
+	por	%xmm4,%xmm0
+	leaq	256(%rdi),%rdi
+.byte	102,72,15,126,194
+
+	movq	%rbp,(%rbx)
+	leaq	32(%rbx,%rax,1),%rbx
+	mulxq	0(%rsi),%r8,%r11
+	xorq	%rbp,%rbp
+	movq	%rdx,%r9
+	mulxq	8(%rsi),%r14,%r12
+	adoxq	-32(%rbx),%r8
+	adcxq	%r14,%r11
+	mulxq	16(%rsi),%r15,%r13
+	adoxq	-24(%rbx),%r11
+	adcxq	%r15,%r12
+	mulxq	24(%rsi),%rdx,%r14
+	adoxq	-16(%rbx),%r12
+	adcxq	%rdx,%r13
+	leaq	(%rcx,%rax,1),%rcx
+	leaq	32(%rsi),%rsi
+	adoxq	-8(%rbx),%r13
+	adcxq	%rbp,%r14
+	adoxq	%rbp,%r14
+
+	movq	%r8,%r15
+	imulq	32+8(%rsp),%r8
+
+	movq	%r8,%rdx
+	xorq	%rbp,%rbp
+	movq	%rdi,8+8(%rsp)
+
+	mulxq	0(%rcx),%rax,%r10
+	adcxq	%rax,%r15
+	adoxq	%r11,%r10
+	mulxq	8(%rcx),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+	mulxq	16(%rcx),%rax,%r12
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	24+8(%rsp),%rdi
+	movq	%r10,-32(%rbx)
+	adcxq	%rax,%r12
+	movq	%r11,-24(%rbx)
+	adoxq	%rbp,%r15
+	movq	%r12,-16(%rbx)
+	leaq	32(%rcx),%rcx
+	jmp	.Lmulx4x_inner
+
+.align	32
+.Lmulx4x_inner:
+	mulxq	0(%rsi),%r10,%rax
+	adcxq	%rbp,%r15
+	adoxq	%r14,%r10
+	mulxq	8(%rsi),%r11,%r14
+	adcxq	0(%rbx),%r10
+	adoxq	%rax,%r11
+	mulxq	16(%rsi),%r12,%rax
+	adcxq	8(%rbx),%r11
+	adoxq	%r14,%r12
+	mulxq	24(%rsi),%r13,%r14
+	movq	%r8,%rdx
+	adcxq	16(%rbx),%r12
+	adoxq	%rax,%r13
+	adcxq	24(%rbx),%r13
+	adoxq	%rbp,%r14
+	leaq	32(%rsi),%rsi
+	leaq	32(%rbx),%rbx
+	adcxq	%rbp,%r14
+
+	adoxq	%r15,%r10
+	mulxq	0(%rcx),%rax,%r15
+	adcxq	%rax,%r10
+	adoxq	%r15,%r11
+	mulxq	8(%rcx),%rax,%r15
+	adcxq	%rax,%r11
+	adoxq	%r15,%r12
+	mulxq	16(%rcx),%rax,%r15
+	movq	%r10,-40(%rbx)
+	adcxq	%rax,%r12
+	adoxq	%r15,%r13
+	movq	%r11,-32(%rbx)
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	leaq	32(%rcx),%rcx
+	movq	%r12,-24(%rbx)
+	adcxq	%rax,%r13
+	adoxq	%rbp,%r15
+	movq	%r13,-16(%rbx)
+
+	decq	%rdi
+	jnz	.Lmulx4x_inner
+
+	movq	0+8(%rsp),%rax
+	adcq	%rbp,%r15
+	subq	0(%rbx),%rdi
+	movq	8+8(%rsp),%rdi
+	movq	16+8(%rsp),%r10
+	adcq	%r15,%r14
+	leaq	(%rsi,%rax,1),%rsi
+	adcq	%rbp,%rbp
+	movq	%r14,-8(%rbx)
+
+	cmpq	%r10,%rdi
+	jb	.Lmulx4x_outer
+
+	movq	-8(%rcx),%r10
+	movq	%rbp,%r8
+	movq	(%rcx,%rax,1),%r12
+	leaq	(%rcx,%rax,1),%rbp
+	movq	%rax,%rcx
+	leaq	(%rbx,%rax,1),%rdi
+	xorl	%eax,%eax
+	xorq	%r15,%r15
+	subq	%r14,%r10
+	adcq	%r15,%r15
+	orq	%r15,%r8
+	sarq	$3+2,%rcx
+	subq	%r8,%rax
+	movq	56+8(%rsp),%rdx
+	decq	%r12
+	movq	8(%rbp),%r13
+	xorq	%r8,%r8
+	movq	16(%rbp),%r14
+	movq	24(%rbp),%r15
+	jmp	.Lsqrx4x_sub_entry
+.size	mulx4x_internal,.-mulx4x_internal
+.type	bn_powerx5, at function
+.align	32
+bn_powerx5:
+	movq	%rsp,%rax
+.Lpowerx5_enter:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+.Lpowerx5_prologue:
+
+	shll	$3,%r9d
+	leaq	(%r9,%r9,2),%r10
+	negq	%r9
+	movq	(%r8),%r8
+
+
+
+
+
+
+
+
+	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
+	subq	%rdi,%r11
+	andq	$4095,%r11
+	cmpq	%r11,%r10
+	jb	.Lpwrx_sp_alt
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
+	jmp	.Lpwrx_sp_done
+
+.align	32
+.Lpwrx_sp_alt:
+	leaq	4096-320(,%r9,2),%r10
+	leaq	-320(%rbp,%r9,2),%rbp
+	subq	%r10,%r11
+	movq	$0,%r10
+	cmovcq	%r10,%r11
+	subq	%r11,%rbp
+.Lpwrx_sp_done:
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lpwrx_page_walk
+	jmp	.Lpwrx_page_walk_done
+
+.Lpwrx_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lpwrx_page_walk
+.Lpwrx_page_walk_done:
+
+	movq	%r9,%r10
+	negq	%r9
+
+
+
+
+
+
+
+
+
+
+
+
+	pxor	%xmm0,%xmm0
+.byte	102,72,15,110,207
+.byte	102,72,15,110,209
+.byte	102,73,15,110,218
+.byte	102,72,15,110,226
+	movq	%r8,32(%rsp)
+	movq	%rax,40(%rsp)
+.Lpowerx5_body:
+
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+
+	movq	%r10,%r9
+	movq	%rsi,%rdi
+.byte	102,72,15,126,209
+.byte	102,72,15,126,226
+	movq	40(%rsp),%rax
+
+	call	mulx4x_internal
+
+	movq	40(%rsp),%rsi
+	movq	$1,%rax
+
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
+.Lpowerx5_epilogue:
+	.byte	0xf3,0xc3
+.size	bn_powerx5,.-bn_powerx5
+
+.globl	bn_sqrx8x_internal
+.hidden	bn_sqrx8x_internal
+.type	bn_sqrx8x_internal, at function
+.align	32
+bn_sqrx8x_internal:
+__bn_sqrx8x_internal:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	leaq	48+8(%rsp),%rdi
+	leaq	(%rsi,%r9,1),%rbp
+	movq	%r9,0+8(%rsp)
+	movq	%rbp,8+8(%rsp)
+	jmp	.Lsqr8x_zero_start
+
+.align	32
+.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
+.Lsqrx8x_zero:
+.byte	0x3e
+	movdqa	%xmm0,0(%rdi)
+	movdqa	%xmm0,16(%rdi)
+	movdqa	%xmm0,32(%rdi)
+	movdqa	%xmm0,48(%rdi)
+.Lsqr8x_zero_start:
+	movdqa	%xmm0,64(%rdi)
+	movdqa	%xmm0,80(%rdi)
+	movdqa	%xmm0,96(%rdi)
+	movdqa	%xmm0,112(%rdi)
+	leaq	128(%rdi),%rdi
+	subq	$64,%r9
+	jnz	.Lsqrx8x_zero
+
+	movq	0(%rsi),%rdx
+
+	xorq	%r10,%r10
+	xorq	%r11,%r11
+	xorq	%r12,%r12
+	xorq	%r13,%r13
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+	leaq	48+8(%rsp),%rdi
+	xorq	%rbp,%rbp
+	jmp	.Lsqrx8x_outer_loop
+
+.align	32
+.Lsqrx8x_outer_loop:
+	mulxq	8(%rsi),%r8,%rax
+	adcxq	%r9,%r8
+	adoxq	%rax,%r10
+	mulxq	16(%rsi),%r9,%rax
+	adcxq	%r10,%r9
+	adoxq	%rax,%r11
+.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
+	adcxq	%r11,%r10
+	adoxq	%rax,%r12
+.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
+	adcxq	%r12,%r11
+	adoxq	%rax,%r13
+	mulxq	40(%rsi),%r12,%rax
+	adcxq	%r13,%r12
+	adoxq	%rax,%r14
+	mulxq	48(%rsi),%r13,%rax
+	adcxq	%r14,%r13
+	adoxq	%r15,%rax
+	mulxq	56(%rsi),%r14,%r15
+	movq	8(%rsi),%rdx
+	adcxq	%rax,%r14
+	adoxq	%rbp,%r15
+	adcq	64(%rdi),%r15
+	movq	%r8,8(%rdi)
+	movq	%r9,16(%rdi)
+	sbbq	%rcx,%rcx
+	xorq	%rbp,%rbp
+
+
+	mulxq	16(%rsi),%r8,%rbx
+	mulxq	24(%rsi),%r9,%rax
+	adcxq	%r10,%r8
+	adoxq	%rbx,%r9
+	mulxq	32(%rsi),%r10,%rbx
+	adcxq	%r11,%r9
+	adoxq	%rax,%r10
+.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
+	adcxq	%r12,%r10
+	adoxq	%rbx,%r11
+.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
+	adcxq	%r13,%r11
+	adoxq	%r14,%r12
+.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
+	movq	16(%rsi),%rdx
+	adcxq	%rax,%r12
+	adoxq	%rbx,%r13
+	adcxq	%r15,%r13
+	adoxq	%rbp,%r14
+	adcxq	%rbp,%r14
+
+	movq	%r8,24(%rdi)
+	movq	%r9,32(%rdi)
+
+	mulxq	24(%rsi),%r8,%rbx
+	mulxq	32(%rsi),%r9,%rax
+	adcxq	%r10,%r8
+	adoxq	%rbx,%r9
+	mulxq	40(%rsi),%r10,%rbx
+	adcxq	%r11,%r9
+	adoxq	%rax,%r10
+.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
+	adcxq	%r12,%r10
+	adoxq	%r13,%r11
+.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
+.byte	0x3e
+	movq	24(%rsi),%rdx
+	adcxq	%rbx,%r11
+	adoxq	%rax,%r12
+	adcxq	%r14,%r12
+	movq	%r8,40(%rdi)
+	movq	%r9,48(%rdi)
+	mulxq	32(%rsi),%r8,%rax
+	adoxq	%rbp,%r13
+	adcxq	%rbp,%r13
+
+	mulxq	40(%rsi),%r9,%rbx
+	adcxq	%r10,%r8
+	adoxq	%rax,%r9
+	mulxq	48(%rsi),%r10,%rax
+	adcxq	%r11,%r9
+	adoxq	%r12,%r10
+	mulxq	56(%rsi),%r11,%r12
+	movq	32(%rsi),%rdx
+	movq	40(%rsi),%r14
+	adcxq	%rbx,%r10
+	adoxq	%rax,%r11
+	movq	48(%rsi),%r15
+	adcxq	%r13,%r11
+	adoxq	%rbp,%r12
+	adcxq	%rbp,%r12
+
+	movq	%r8,56(%rdi)
+	movq	%r9,64(%rdi)
+
+	mulxq	%r14,%r9,%rax
+	movq	56(%rsi),%r8
+	adcxq	%r10,%r9
+	mulxq	%r15,%r10,%rbx
+	adoxq	%rax,%r10
+	adcxq	%r11,%r10
+	mulxq	%r8,%r11,%rax
+	movq	%r14,%rdx
+	adoxq	%rbx,%r11
+	adcxq	%r12,%r11
+
+	adcxq	%rbp,%rax
+
+	mulxq	%r15,%r14,%rbx
+	mulxq	%r8,%r12,%r13
+	movq	%r15,%rdx
+	leaq	64(%rsi),%rsi
+	adcxq	%r14,%r11
+	adoxq	%rbx,%r12
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r13
+
+.byte	0x67,0x67
+	mulxq	%r8,%r8,%r14
+	adcxq	%r8,%r13
+	adcxq	%rbp,%r14
+
+	cmpq	8+8(%rsp),%rsi
+	je	.Lsqrx8x_outer_break
+
+	negq	%rcx
+	movq	$-8,%rcx
+	movq	%rbp,%r15
+	movq	64(%rdi),%r8
+	adcxq	72(%rdi),%r9
+	adcxq	80(%rdi),%r10
+	adcxq	88(%rdi),%r11
+	adcq	96(%rdi),%r12
+	adcq	104(%rdi),%r13
+	adcq	112(%rdi),%r14
+	adcq	120(%rdi),%r15
+	leaq	(%rsi),%rbp
+	leaq	128(%rdi),%rdi
+	sbbq	%rax,%rax
+
+	movq	-64(%rsi),%rdx
+	movq	%rax,16+8(%rsp)
+	movq	%rdi,24+8(%rsp)
+
+
+	xorl	%eax,%eax
+	jmp	.Lsqrx8x_loop
+
+.align	32
+.Lsqrx8x_loop:
+	movq	%r8,%rbx
+	mulxq	0(%rbp),%rax,%r8
+	adcxq	%rax,%rbx
+	adoxq	%r9,%r8
+
+	mulxq	8(%rbp),%rax,%r9
+	adcxq	%rax,%r8
+	adoxq	%r10,%r9
+
+	mulxq	16(%rbp),%rax,%r10
+	adcxq	%rax,%r9
+	adoxq	%r11,%r10
+
+	mulxq	24(%rbp),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+
+.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+
+	mulxq	40(%rbp),%rax,%r13
+	adcxq	%rax,%r12
+	adoxq	%r14,%r13
+
+	mulxq	48(%rbp),%rax,%r14
+	movq	%rbx,(%rdi,%rcx,8)
+	movl	$0,%ebx
+	adcxq	%rax,%r13
+	adoxq	%r15,%r14
+
+.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
+	movq	8(%rsi,%rcx,8),%rdx
+	adcxq	%rax,%r14
+	adoxq	%rbx,%r15
+	adcxq	%rbx,%r15
+
+.byte	0x67
+	incq	%rcx
+	jnz	.Lsqrx8x_loop
+
+	leaq	64(%rbp),%rbp
+	movq	$-8,%rcx
+	cmpq	8+8(%rsp),%rbp
+	je	.Lsqrx8x_break
+
+	subq	16+8(%rsp),%rbx
+.byte	0x66
+	movq	-64(%rsi),%rdx
+	adcxq	0(%rdi),%r8
+	adcxq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	leaq	64(%rdi),%rdi
+.byte	0x67
+	sbbq	%rax,%rax
+	xorl	%ebx,%ebx
+	movq	%rax,16+8(%rsp)
+	jmp	.Lsqrx8x_loop
+
+.align	32
+.Lsqrx8x_break:
+	xorq	%rbp,%rbp
+	subq	16+8(%rsp),%rbx
+	adcxq	%rbp,%r8
+	movq	24+8(%rsp),%rcx
+	adcxq	%rbp,%r9
+	movq	0(%rsi),%rdx
+	adcq	$0,%r10
+	movq	%r8,0(%rdi)
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+	adcq	$0,%r14
+	adcq	$0,%r15
+	cmpq	%rcx,%rdi
+	je	.Lsqrx8x_outer_loop
+
+	movq	%r9,8(%rdi)
+	movq	8(%rcx),%r9
+	movq	%r10,16(%rdi)
+	movq	16(%rcx),%r10
+	movq	%r11,24(%rdi)
+	movq	24(%rcx),%r11
+	movq	%r12,32(%rdi)
+	movq	32(%rcx),%r12
+	movq	%r13,40(%rdi)
+	movq	40(%rcx),%r13
+	movq	%r14,48(%rdi)
+	movq	48(%rcx),%r14
+	movq	%r15,56(%rdi)
+	movq	56(%rcx),%r15
+	movq	%rcx,%rdi
+	jmp	.Lsqrx8x_outer_loop
+
+.align	32
+.Lsqrx8x_outer_break:
+	movq	%r9,72(%rdi)
+.byte	102,72,15,126,217
+	movq	%r10,80(%rdi)
+	movq	%r11,88(%rdi)
+	movq	%r12,96(%rdi)
+	movq	%r13,104(%rdi)
+	movq	%r14,112(%rdi)
+	leaq	48+8(%rsp),%rdi
+	movq	(%rsi,%rcx,1),%rdx
+
+	movq	8(%rdi),%r11
+	xorq	%r10,%r10
+	movq	0+8(%rsp),%r9
+	adoxq	%r11,%r11
+	movq	16(%rdi),%r12
+	movq	24(%rdi),%r13
+
+
+.align	32
+.Lsqrx4x_shift_n_add:
+	mulxq	%rdx,%rax,%rbx
+	adoxq	%r12,%r12
+	adcxq	%r10,%rax
+.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
+.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
+	adoxq	%r13,%r13
+	adcxq	%r11,%rbx
+	movq	40(%rdi),%r11
+	movq	%rax,0(%rdi)
+	movq	%rbx,8(%rdi)
+
+	mulxq	%rdx,%rax,%rbx
+	adoxq	%r10,%r10
+	adcxq	%r12,%rax
+	movq	16(%rsi,%rcx,1),%rdx
+	movq	48(%rdi),%r12
+	adoxq	%r11,%r11
+	adcxq	%r13,%rbx
+	movq	56(%rdi),%r13
+	movq	%rax,16(%rdi)
+	movq	%rbx,24(%rdi)
+
+	mulxq	%rdx,%rax,%rbx
+	adoxq	%r12,%r12
+	adcxq	%r10,%rax
+	movq	24(%rsi,%rcx,1),%rdx
+	leaq	32(%rcx),%rcx
+	movq	64(%rdi),%r10
+	adoxq	%r13,%r13
+	adcxq	%r11,%rbx
+	movq	72(%rdi),%r11
+	movq	%rax,32(%rdi)
+	movq	%rbx,40(%rdi)
+
+	mulxq	%rdx,%rax,%rbx
+	adoxq	%r10,%r10
+	adcxq	%r12,%rax
+	jrcxz	.Lsqrx4x_shift_n_add_break
+.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
+	adoxq	%r11,%r11
+	adcxq	%r13,%rbx
+	movq	80(%rdi),%r12
+	movq	88(%rdi),%r13
+	movq	%rax,48(%rdi)
+	movq	%rbx,56(%rdi)
+	leaq	64(%rdi),%rdi
+	nop
+	jmp	.Lsqrx4x_shift_n_add
+
+.align	32
+.Lsqrx4x_shift_n_add_break:
+	adcxq	%r13,%rbx
+	movq	%rax,48(%rdi)
+	movq	%rbx,56(%rdi)
+	leaq	64(%rdi),%rdi
+.byte	102,72,15,126,213
+__bn_sqrx8x_reduction:
+	xorl	%eax,%eax
+	movq	32+8(%rsp),%rbx
+	movq	48+8(%rsp),%rdx
+	leaq	-64(%rbp,%r9,1),%rcx
+
+	movq	%rcx,0+8(%rsp)
+	movq	%rdi,8+8(%rsp)
+
+	leaq	48+8(%rsp),%rdi
+	jmp	.Lsqrx8x_reduction_loop
+
+.align	32
+.Lsqrx8x_reduction_loop:
+	movq	8(%rdi),%r9
+	movq	16(%rdi),%r10
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%r12
+	movq	%rdx,%r8
+	imulq	%rbx,%rdx
+	movq	40(%rdi),%r13
+	movq	48(%rdi),%r14
+	movq	56(%rdi),%r15
+	movq	%rax,24+8(%rsp)
+
+	leaq	64(%rdi),%rdi
+	xorq	%rsi,%rsi
+	movq	$-8,%rcx
+	jmp	.Lsqrx8x_reduce
+
+.align	32
+.Lsqrx8x_reduce:
+	movq	%r8,%rbx
+	mulxq	0(%rbp),%rax,%r8
+	adcxq	%rbx,%rax
+	adoxq	%r9,%r8
+
+	mulxq	8(%rbp),%rbx,%r9
+	adcxq	%rbx,%r8
+	adoxq	%r10,%r9
+
+	mulxq	16(%rbp),%rbx,%r10
+	adcxq	%rbx,%r9
+	adoxq	%r11,%r10
+
+	mulxq	24(%rbp),%rbx,%r11
+	adcxq	%rbx,%r10
+	adoxq	%r12,%r11
+
+.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
+	movq	%rdx,%rax
+	movq	%r8,%rdx
+	adcxq	%rbx,%r11
+	adoxq	%r13,%r12
+
+	mulxq	32+8(%rsp),%rbx,%rdx
+	movq	%rax,%rdx
+	movq	%rax,64+48+8(%rsp,%rcx,8)
+
+	mulxq	40(%rbp),%rax,%r13
+	adcxq	%rax,%r12
+	adoxq	%r14,%r13
+
+	mulxq	48(%rbp),%rax,%r14
+	adcxq	%rax,%r13
+	adoxq	%r15,%r14
+
+	mulxq	56(%rbp),%rax,%r15
+	movq	%rbx,%rdx
+	adcxq	%rax,%r14
+	adoxq	%rsi,%r15
+	adcxq	%rsi,%r15
+
+.byte	0x67,0x67,0x67
+	incq	%rcx
+	jnz	.Lsqrx8x_reduce
+
+	movq	%rsi,%rax
+	cmpq	0+8(%rsp),%rbp
+	jae	.Lsqrx8x_no_tail
+
+	movq	48+8(%rsp),%rdx
+	addq	0(%rdi),%r8
+	leaq	64(%rbp),%rbp
+	movq	$-8,%rcx
+	adcxq	8(%rdi),%r9
+	adcxq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	leaq	64(%rdi),%rdi
+	sbbq	%rax,%rax
+
+	xorq	%rsi,%rsi
+	movq	%rax,16+8(%rsp)
+	jmp	.Lsqrx8x_tail
+
+.align	32
+.Lsqrx8x_tail:
+	movq	%r8,%rbx
+	mulxq	0(%rbp),%rax,%r8
+	adcxq	%rax,%rbx
+	adoxq	%r9,%r8
+
+	mulxq	8(%rbp),%rax,%r9
+	adcxq	%rax,%r8
+	adoxq	%r10,%r9
+
+	mulxq	16(%rbp),%rax,%r10
+	adcxq	%rax,%r9
+	adoxq	%r11,%r10
+
+	mulxq	24(%rbp),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+
+.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+
+	mulxq	40(%rbp),%rax,%r13
+	adcxq	%rax,%r12
+	adoxq	%r14,%r13
+
+	mulxq	48(%rbp),%rax,%r14
+	adcxq	%rax,%r13
+	adoxq	%r15,%r14
+
+	mulxq	56(%rbp),%rax,%r15
+	movq	72+48+8(%rsp,%rcx,8),%rdx
+	adcxq	%rax,%r14
+	adoxq	%rsi,%r15
+	movq	%rbx,(%rdi,%rcx,8)
+	movq	%r8,%rbx
+	adcxq	%rsi,%r15
+
+	incq	%rcx
+	jnz	.Lsqrx8x_tail
+
+	cmpq	0+8(%rsp),%rbp
+	jae	.Lsqrx8x_tail_done
+
+	subq	16+8(%rsp),%rsi
+	movq	48+8(%rsp),%rdx
+	leaq	64(%rbp),%rbp
+	adcq	0(%rdi),%r8
+	adcq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	leaq	64(%rdi),%rdi
+	sbbq	%rax,%rax
+	subq	$8,%rcx
+
+	xorq	%rsi,%rsi
+	movq	%rax,16+8(%rsp)
+	jmp	.Lsqrx8x_tail
+
+.align	32
+.Lsqrx8x_tail_done:
+	xorq	%rax,%rax
+	addq	24+8(%rsp),%r8
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+	adcq	$0,%r14
+	adcq	$0,%r15
+	adcq	$0,%rax
+
+	subq	16+8(%rsp),%rsi
+.Lsqrx8x_no_tail:
+	adcq	0(%rdi),%r8
+.byte	102,72,15,126,217
+	adcq	8(%rdi),%r9
+	movq	56(%rbp),%rsi
+.byte	102,72,15,126,213
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	adcq	$0,%rax
+
+	movq	32+8(%rsp),%rbx
+	movq	64(%rdi,%rcx,1),%rdx
+
+	movq	%r8,0(%rdi)
+	leaq	64(%rdi),%r8
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+	movq	%r14,48(%rdi)
+	movq	%r15,56(%rdi)
+
+	leaq	64(%rdi,%rcx,1),%rdi
+	cmpq	8+8(%rsp),%r8
+	jb	.Lsqrx8x_reduction_loop
+	.byte	0xf3,0xc3
+.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
+.align	32
+__bn_postx4x_internal:
+	movq	0(%rbp),%r12
+	movq	%rcx,%r10
+	movq	%rcx,%r9
+	negq	%rax
+	sarq	$3+2,%rcx
+
+.byte	102,72,15,126,202
+.byte	102,72,15,126,206
+	decq	%r12
+	movq	8(%rbp),%r13
+	xorq	%r8,%r8
+	movq	16(%rbp),%r14
+	movq	24(%rbp),%r15
+	jmp	.Lsqrx4x_sub_entry
+
+.align	16
+.Lsqrx4x_sub:
+	movq	0(%rbp),%r12
+	movq	8(%rbp),%r13
+	movq	16(%rbp),%r14
+	movq	24(%rbp),%r15
+.Lsqrx4x_sub_entry:
+	andnq	%rax,%r12,%r12
+	leaq	32(%rbp),%rbp
+	andnq	%rax,%r13,%r13
+	andnq	%rax,%r14,%r14
+	andnq	%rax,%r15,%r15
+
+	negq	%r8
+	adcq	0(%rdi),%r12
+	adcq	8(%rdi),%r13
+	adcq	16(%rdi),%r14
+	adcq	24(%rdi),%r15
+	movq	%r12,0(%rdx)
+	leaq	32(%rdi),%rdi
+	movq	%r13,8(%rdx)
+	sbbq	%r8,%r8
+	movq	%r14,16(%rdx)
+	movq	%r15,24(%rdx)
+	leaq	32(%rdx),%rdx
+
+	incq	%rcx
+	jnz	.Lsqrx4x_sub
+
+	negq	%r9
+
+	.byte	0xf3,0xc3
+.size	__bn_postx4x_internal,.-__bn_postx4x_internal
+.globl	bn_get_bits5
+.type	bn_get_bits5, at function
+.align	16
+bn_get_bits5:
+	leaq	0(%rdi),%r10
+	leaq	1(%rdi),%r11
+	movl	%esi,%ecx
+	shrl	$4,%esi
+	andl	$15,%ecx
+	leal	-8(%rcx),%eax
+	cmpl	$11,%ecx
+	cmovaq	%r11,%r10
+	cmoval	%eax,%ecx
+	movzwl	(%r10,%rsi,2),%eax
+	shrl	%cl,%eax
+	andl	$31,%eax
+	.byte	0xf3,0xc3
+.size	bn_get_bits5,.-bn_get_bits5
+
 .globl	bn_scatter5
 .type	bn_scatter5, at function
 .align	16
 bn_scatter5:
-	cmpq	$0,%rsi
+	cmpl	$0,%esi
 	jz	.Lscatter_epilogue
 	leaq	(%rdx,%rcx,8),%rdx
 .Lscatter:
@@ -1022,7 +3480,7 @@
 	leaq	8(%rdi),%rdi
 	movq	%rax,(%rdx)
 	leaq	256(%rdx),%rdx
-	subq	$1,%rsi
+	subl	$1,%esi
 	jnz	.Lscatter
 .Lscatter_epilogue:
 	.byte	0xf3,0xc3
@@ -1030,12 +3488,12 @@
 
 .globl	bn_gather5
 .type	bn_gather5, at function
-.align	16
+.align	32
 bn_gather5:
 .LSEH_begin_bn_gather5:
 
-.byte	0x4c,0x8d,0x14,0x24			
-.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	
+.byte	0x4c,0x8d,0x14,0x24
+.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00
 	leaq	.Linc(%rip),%rax
 	andq	$-16,%rsp
 
@@ -1180,11 +3638,11 @@
 	por	%xmm3,%xmm5
 	por	%xmm5,%xmm4
 	leaq	256(%r11),%r11
-	pshufd	$78,%xmm4,%xmm0
+	pshufd	$0x4e,%xmm4,%xmm0
 	por	%xmm4,%xmm0
 	movq	%xmm0,(%rdi)
 	leaq	8(%rdi),%rdi
-	subq	$1,%rsi
+	subl	$1,%esi
 	jnz	.Lgather
 
 	leaq	(%r10),%rsp

Modified: trunk/secure/lib/libcrypto/amd64/x86_64cpuid.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/x86_64cpuid.S	2019-01-20 05:38:02 UTC (rev 12152)
+++ trunk/secure/lib/libcrypto/amd64/x86_64cpuid.S	2019-01-20 05:38:15 UTC (rev 12153)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/amd64/x86_64cpuid.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from x86_64cpuid.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/x86_64cpuid.S 325337 2017-11-02 18:30:41Z jkim $ */
+/* Do not modify. This file is auto-generated from x86_64cpuid.pl. */
 
 .hidden	OPENSSL_cpuid_setup
 .section	.init
@@ -7,7 +7,7 @@
 	call	OPENSSL_cpuid_setup
 
 .hidden	OPENSSL_ia32cap_P
-.comm	OPENSSL_ia32cap_P,8,4
+.comm	OPENSSL_ia32cap_P,16,4
 
 .text	
 
@@ -17,11 +17,11 @@
 OPENSSL_atomic_add:
 	movl	(%rdi),%eax
 .Lspin:	leaq	(%rsi,%rax,1),%r8
-.byte	0xf0		
+.byte	0xf0
 	cmpxchgl	%r8d,(%rdi)
 	jne	.Lspin
 	movl	%r8d,%eax
-.byte	0x48,0x98	
+.byte	0x48,0x98
 	.byte	0xf3,0xc3
 .size	OPENSSL_atomic_add,.-OPENSSL_atomic_add
 
@@ -42,47 +42,48 @@
 	movq	%rbx,%r8
 
 	xorl	%eax,%eax
+	movl	%eax,8(%rdi)
 	cpuid
 	movl	%eax,%r11d
 
 	xorl	%eax,%eax
-	cmpl	$1970169159,%ebx
+	cmpl	$0x756e6547,%ebx
 	setne	%al
 	movl	%eax,%r9d
-	cmpl	$1231384169,%edx
+	cmpl	$0x49656e69,%edx
 	setne	%al
 	orl	%eax,%r9d
-	cmpl	$1818588270,%ecx
+	cmpl	$0x6c65746e,%ecx
 	setne	%al
 	orl	%eax,%r9d
 	jz	.Lintel
 
-	cmpl	$1752462657,%ebx
+	cmpl	$0x68747541,%ebx
 	setne	%al
 	movl	%eax,%r10d
-	cmpl	$1769238117,%edx
+	cmpl	$0x69746E65,%edx
 	setne	%al
 	orl	%eax,%r10d
-	cmpl	$1145913699,%ecx
+	cmpl	$0x444D4163,%ecx
 	setne	%al
 	orl	%eax,%r10d
 	jnz	.Lintel
 
 
-	movl	$2147483648,%eax
+	movl	$0x80000000,%eax
 	cpuid
-	cmpl	$2147483649,%eax
+	cmpl	$0x80000001,%eax
 	jb	.Lintel
 	movl	%eax,%r10d
-	movl	$2147483649,%eax
+	movl	$0x80000001,%eax
 	cpuid
 	orl	%ecx,%r9d
-	andl	$2049,%r9d
+	andl	$0x00000801,%r9d
 
-	cmpl	$2147483656,%r10d
+	cmpl	$0x80000008,%r10d
 	jb	.Lintel
 
-	movl	$2147483656,%eax
+	movl	$0x80000008,%eax
 	cpuid
 	movzbq	%cl,%r10
 	incq	%r10
@@ -94,7 +95,7 @@
 	shrl	$16,%ebx
 	cmpb	%r10b,%bl
 	ja	.Lgeneric
-	andl	$4026531839,%edx
+	andl	$0xefffffff,%edx
 	jmp	.Lgeneric
 
 .Lintel:
@@ -107,47 +108,72 @@
 	cpuid
 	movl	%eax,%r10d
 	shrl	$14,%r10d
-	andl	$4095,%r10d
+	andl	$0xfff,%r10d
 
 .Lnocacheinfo:
 	movl	$1,%eax
 	cpuid
-	andl	$3220176895,%edx
+	andl	$0xbfefffff,%edx
 	cmpl	$0,%r9d
 	jne	.Lnotintel
-	orl	$1073741824,%edx
+	orl	$0x40000000,%edx
 	andb	$15,%ah
 	cmpb	$15,%ah
+	jne	.LnotP4
+	orl	$0x00100000,%edx
+.LnotP4:
+	cmpb	$6,%ah
 	jne	.Lnotintel
-	orl	$1048576,%edx
+	andl	$0x0fff0ff0,%eax
+	cmpl	$0x00050670,%eax
+	je	.Lknights
+	cmpl	$0x00080650,%eax
+	jne	.Lnotintel
+.Lknights:
+	andl	$0xfbffffff,%ecx
+
 .Lnotintel:
 	btl	$28,%edx
 	jnc	.Lgeneric
-	andl	$4026531839,%edx
+	andl	$0xefffffff,%edx
 	cmpl	$0,%r10d
 	je	.Lgeneric
 
-	orl	$268435456,%edx
+	orl	$0x10000000,%edx
 	shrl	$16,%ebx
 	cmpb	$1,%bl
 	ja	.Lgeneric
-	andl	$4026531839,%edx
+	andl	$0xefffffff,%edx
 .Lgeneric:
-	andl	$2048,%r9d
-	andl	$4294965247,%ecx
+	andl	$0x00000800,%r9d
+	andl	$0xfffff7ff,%ecx
 	orl	%ecx,%r9d
 
 	movl	%edx,%r10d
+
+	cmpl	$7,%r11d
+	jb	.Lno_extended_info
+	movl	$7,%eax
+	xorl	%ecx,%ecx
+	cpuid
+	btl	$26,%r9d
+	jc	.Lnotknights
+	andl	$0xfff7ffff,%ebx
+.Lnotknights:
+	movl	%ebx,8(%rdi)
+.Lno_extended_info:
+
 	btl	$27,%r9d
 	jnc	.Lclear_avx
 	xorl	%ecx,%ecx
-.byte	0x0f,0x01,0xd0		
+.byte	0x0f,0x01,0xd0
 	andl	$6,%eax
 	cmpl	$6,%eax
 	je	.Ldone
 .Lclear_avx:
-	movl	$4026525695,%eax
+	movl	$0xefffe7ff,%eax
 	andl	%eax,%r9d
+	andl	$0xffffffdf,8(%rdi)
 .Ldone:
 	shlq	$32,%r9
 	movl	%r10d,%eax
@@ -235,3 +261,18 @@
 	cmoveq	%rcx,%rax
 	.byte	0xf3,0xc3
 .size	OPENSSL_ia32_rdrand,.-OPENSSL_ia32_rdrand
+
+.globl	OPENSSL_ia32_rdseed
+.type	OPENSSL_ia32_rdseed, at function
+.align	16
+OPENSSL_ia32_rdseed:
+	movl	$8,%ecx
+.Loop_rdseed:
+.byte	72,15,199,248
+	jc	.Lbreak_rdseed
+	loop	.Loop_rdseed
+.Lbreak_rdseed:
+	cmpq	$0,%rax
+	cmoveq	%rcx,%rax
+	.byte	0xf3,0xc3
+.size	OPENSSL_ia32_rdseed,.-OPENSSL_ia32_rdseed


