[Midnightbsd-cvs] src [12154] trunk/secure/lib/libcrypto/i386: update

laffer1@midnightbsd.org
Sun Jan 20 00:38:27 EST 2019


Revision: 12154
          http://svnweb.midnightbsd.org/src/?rev=12154
Author:   laffer1
Date:     2019-01-20 00:38:27 -0500 (Sun, 20 Jan 2019)
Log Message:
-----------
update: sync the i386 libcrypto assembler files with FreeBSD stable/11 (r305153)

Modified Paths:
--------------
    trunk/secure/lib/libcrypto/i386/aes-586.S
    trunk/secure/lib/libcrypto/i386/aesni-x86.S
    trunk/secure/lib/libcrypto/i386/bf-586.S
    trunk/secure/lib/libcrypto/i386/bf-686.S
    trunk/secure/lib/libcrypto/i386/bn-586.S
    trunk/secure/lib/libcrypto/i386/cmll-x86.S
    trunk/secure/lib/libcrypto/i386/co-586.S
    trunk/secure/lib/libcrypto/i386/crypt586.S
    trunk/secure/lib/libcrypto/i386/des-586.S
    trunk/secure/lib/libcrypto/i386/ghash-x86.S
    trunk/secure/lib/libcrypto/i386/md5-586.S
    trunk/secure/lib/libcrypto/i386/rc4-586.S
    trunk/secure/lib/libcrypto/i386/rc5-586.S
    trunk/secure/lib/libcrypto/i386/rmd-586.S
    trunk/secure/lib/libcrypto/i386/sha1-586.S
    trunk/secure/lib/libcrypto/i386/sha256-586.S
    trunk/secure/lib/libcrypto/i386/sha512-586.S
    trunk/secure/lib/libcrypto/i386/vpaes-x86.S
    trunk/secure/lib/libcrypto/i386/wp-mmx.S
    trunk/secure/lib/libcrypto/i386/x86-gf2m.S
    trunk/secure/lib/libcrypto/i386/x86-mont.S
    trunk/secure/lib/libcrypto/i386/x86cpuid.S

Modified: trunk/secure/lib/libcrypto/i386/aes-586.S
===================================================================
--- trunk/secure/lib/libcrypto/i386/aes-586.S	2019-01-20 05:38:15 UTC (rev 12153)
+++ trunk/secure/lib/libcrypto/i386/aes-586.S	2019-01-20 05:38:27 UTC (rev 12154)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/i386/aes-586.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from aes-586.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/i386/aes-586.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from aes-586.pl. */
 #ifdef PIC
 .file	"aes-586.S"
 .text
@@ -104,74 +104,78 @@
 	xorl	%ecx,%edx
 	movl	%esi,%ecx
 
-	movl	%ecx,%esi
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	movl	$2155905152,%ebp
+	andl	%ecx,%ebp
+	leal	(%ecx,%ecx,1),%edi
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%ecx,%ecx,1),%edi
+	andl	$4278124286,%edi
 	subl	%ebp,%esi
-	andl	$4278124286,%edi
+	movl	%ecx,%ebp
 	andl	$454761243,%esi
-	movl	%ecx,%ebp
+	rorl	$16,%ebp
 	xorl	%edi,%esi
+	movl	%ecx,%edi
 	xorl	%esi,%ecx
+	rorl	$24,%edi
+	xorl	%ebp,%esi
 	roll	$24,%ecx
+	xorl	%edi,%esi
+	movl	$2155905152,%ebp
 	xorl	%esi,%ecx
-	rorl	$16,%ebp
-	xorl	%ebp,%ecx
-	rorl	$8,%ebp
-	xorl	%ebp,%ecx
-	movl	%edx,%esi
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	andl	%edx,%ebp
+	leal	(%edx,%edx,1),%edi
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%edx,%edx,1),%edi
+	andl	$4278124286,%edi
 	subl	%ebp,%esi
-	andl	$4278124286,%edi
+	movl	%edx,%ebp
 	andl	$454761243,%esi
-	movl	%edx,%ebp
+	rorl	$16,%ebp
 	xorl	%edi,%esi
+	movl	%edx,%edi
 	xorl	%esi,%edx
+	rorl	$24,%edi
+	xorl	%ebp,%esi
 	roll	$24,%edx
+	xorl	%edi,%esi
+	movl	$2155905152,%ebp
 	xorl	%esi,%edx
-	rorl	$16,%ebp
-	xorl	%ebp,%edx
-	rorl	$8,%ebp
-	xorl	%ebp,%edx
-	movl	%eax,%esi
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	andl	%eax,%ebp
+	leal	(%eax,%eax,1),%edi
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%eax,%eax,1),%edi
+	andl	$4278124286,%edi
 	subl	%ebp,%esi
-	andl	$4278124286,%edi
+	movl	%eax,%ebp
 	andl	$454761243,%esi
-	movl	%eax,%ebp
+	rorl	$16,%ebp
 	xorl	%edi,%esi
+	movl	%eax,%edi
 	xorl	%esi,%eax
+	rorl	$24,%edi
+	xorl	%ebp,%esi
 	roll	$24,%eax
+	xorl	%edi,%esi
+	movl	$2155905152,%ebp
 	xorl	%esi,%eax
-	rorl	$16,%ebp
-	xorl	%ebp,%eax
-	rorl	$8,%ebp
-	xorl	%ebp,%eax
-	movl	%ebx,%esi
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	andl	%ebx,%ebp
+	leal	(%ebx,%ebx,1),%edi
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%ebx,%ebx,1),%edi
+	andl	$4278124286,%edi
 	subl	%ebp,%esi
-	andl	$4278124286,%edi
+	movl	%ebx,%ebp
 	andl	$454761243,%esi
-	movl	%ebx,%ebp
+	rorl	$16,%ebp
 	xorl	%edi,%esi
+	movl	%ebx,%edi
 	xorl	%esi,%ebx
+	rorl	$24,%edi
+	xorl	%ebp,%esi
 	roll	$24,%ebx
+	xorl	%edi,%esi
 	xorl	%esi,%ebx
-	rorl	$16,%ebp
-	xorl	%ebp,%ebx
-	rorl	$8,%ebp
-	xorl	%ebp,%ebx
 	movl	20(%esp),%edi
 	movl	28(%esp),%ebp
 	addl	$16,%edi
@@ -293,74 +297,76 @@
 	pshufw	$13,%mm4,%mm5
 	movd	%mm1,%eax
 	movd	%mm5,%ebx
+	movl	%edi,20(%esp)
 	movzbl	%al,%esi
+	movzbl	%ah,%edx
+	pshufw	$13,%mm0,%mm2
 	movzbl	-128(%ebp,%esi,1),%ecx
-	pshufw	$13,%mm0,%mm2
-	movzbl	%ah,%edx
+	movzbl	%bl,%edi
 	movzbl	-128(%ebp,%edx,1),%edx
+	shrl	$16,%eax
 	shll	$8,%edx
-	shrl	$16,%eax
-	movzbl	%bl,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%bh,%edi
 	shll	$16,%esi
+	pshufw	$8,%mm4,%mm6
 	orl	%esi,%ecx
-	pshufw	$8,%mm4,%mm6
-	movzbl	%bh,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%ah,%edi
 	shll	$24,%esi
+	shrl	$16,%ebx
 	orl	%esi,%edx
-	shrl	$16,%ebx
-	movzbl	%ah,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%bh,%edi
 	shll	$8,%esi
 	orl	%esi,%ecx
-	movzbl	%bh,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%al,%edi
 	shll	$24,%esi
 	orl	%esi,%ecx
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%bl,%edi
+	movd	%mm2,%eax
 	movd	%ecx,%mm0
-	movzbl	%al,%esi
-	movzbl	-128(%ebp,%esi,1),%ecx
-	movd	%mm2,%eax
-	movzbl	%bl,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
-	shll	$16,%esi
+	movzbl	-128(%ebp,%edi,1),%ecx
+	movzbl	%ah,%edi
+	shll	$16,%ecx
+	movd	%mm6,%ebx
 	orl	%esi,%ecx
-	movd	%mm6,%ebx
-	movzbl	%ah,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%bh,%edi
 	shll	$24,%esi
 	orl	%esi,%ecx
-	movzbl	%bh,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%bl,%edi
 	shll	$8,%esi
+	shrl	$16,%ebx
 	orl	%esi,%ecx
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%al,%edi
+	shrl	$16,%eax
 	movd	%ecx,%mm1
-	movzbl	%bl,%esi
-	movzbl	-128(%ebp,%esi,1),%ecx
-	shrl	$16,%ebx
-	movzbl	%al,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
-	shll	$16,%esi
+	movzbl	-128(%ebp,%edi,1),%ecx
+	movzbl	%ah,%edi
+	shll	$16,%ecx
+	andl	$255,%eax
 	orl	%esi,%ecx
-	shrl	$16,%eax
 	punpckldq	%mm1,%mm0
-	movzbl	%ah,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%bh,%edi
 	shll	$24,%esi
+	andl	$255,%ebx
+	movzbl	-128(%ebp,%eax,1),%eax
 	orl	%esi,%ecx
-	andl	$255,%eax
-	movzbl	-128(%ebp,%eax,1),%eax
 	shll	$16,%eax
+	movzbl	-128(%ebp,%edi,1),%esi
 	orl	%eax,%edx
-	movzbl	%bh,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
 	shll	$8,%esi
+	movzbl	-128(%ebp,%ebx,1),%ebx
 	orl	%esi,%ecx
+	orl	%ebx,%edx
+	movl	20(%esp),%edi
 	movd	%ecx,%mm4
-	andl	$255,%ebx
-	movzbl	-128(%ebp,%ebx,1),%ebx
-	orl	%ebx,%edx
 	movd	%edx,%mm5
 	punpckldq	%mm5,%mm4
 	addl	$16,%edi
@@ -998,8 +1004,7 @@
 	call	.L004pic_point
 .L004pic_point:
 	popl	%ebp
-	leal	_GLOBAL_OFFSET_TABLE_+[.-.L004pic_point](%ebp),%eax
-	movl	OPENSSL_ia32cap_P@GOT(%eax),%eax
+	leal	OPENSSL_ia32cap_P-.L004pic_point(%ebp),%eax
 	leal	.LAES_Te-.L004pic_point(%ebp),%ebp
 	leal	764(%esp),%ebx
 	subl	%ebp,%ebx
@@ -1134,18 +1139,18 @@
 	movzbl	-128(%ebp,%eax,1),%eax
 	shll	$24,%eax
 	xorl	%eax,%edx
-	movl	%ecx,%esi
-	andl	$2155905152,%esi
-	movl	%esi,%edi
+	movl	$2155905152,%edi
+	andl	%ecx,%edi
+	movl	%edi,%esi
 	shrl	$7,%edi
 	leal	(%ecx,%ecx,1),%eax
 	subl	%edi,%esi
 	andl	$4278124286,%eax
 	andl	$454761243,%esi
-	xorl	%eax,%esi
-	movl	%esi,%eax
-	andl	$2155905152,%esi
-	movl	%esi,%edi
+	xorl	%esi,%eax
+	movl	$2155905152,%edi
+	andl	%eax,%edi
+	movl	%edi,%esi
 	shrl	$7,%edi
 	leal	(%eax,%eax,1),%ebx
 	subl	%edi,%esi
@@ -1152,10 +1157,10 @@
 	andl	$4278124286,%ebx
 	andl	$454761243,%esi
 	xorl	%ecx,%eax
-	xorl	%ebx,%esi
-	movl	%esi,%ebx
-	andl	$2155905152,%esi
-	movl	%esi,%edi
+	xorl	%esi,%ebx
+	movl	$2155905152,%edi
+	andl	%ebx,%edi
+	movl	%edi,%esi
 	shrl	$7,%edi
 	leal	(%ebx,%ebx,1),%ebp
 	subl	%edi,%esi
@@ -1166,29 +1171,29 @@
 	xorl	%esi,%ebp
 	xorl	%eax,%ecx
 	xorl	%ebp,%eax
-	roll	$24,%eax
 	xorl	%ebx,%ecx
 	xorl	%ebp,%ebx
+	roll	$24,%eax
+	xorl	%ebp,%ecx
 	roll	$16,%ebx
-	xorl	%ebp,%ecx
+	xorl	%eax,%ecx
 	roll	$8,%ebp
-	xorl	%eax,%ecx
 	xorl	%ebx,%ecx
 	movl	4(%esp),%eax
 	xorl	%ebp,%ecx
 	movl	%ecx,12(%esp)
-	movl	%edx,%esi
-	andl	$2155905152,%esi
-	movl	%esi,%edi
+	movl	$2155905152,%edi
+	andl	%edx,%edi
+	movl	%edi,%esi
 	shrl	$7,%edi
 	leal	(%edx,%edx,1),%ebx
 	subl	%edi,%esi
 	andl	$4278124286,%ebx
 	andl	$454761243,%esi
-	xorl	%ebx,%esi
-	movl	%esi,%ebx
-	andl	$2155905152,%esi
-	movl	%esi,%edi
+	xorl	%esi,%ebx
+	movl	$2155905152,%edi
+	andl	%ebx,%edi
+	movl	%edi,%esi
 	shrl	$7,%edi
 	leal	(%ebx,%ebx,1),%ecx
 	subl	%edi,%esi
@@ -1195,10 +1200,10 @@
 	andl	$4278124286,%ecx
 	andl	$454761243,%esi
 	xorl	%edx,%ebx
-	xorl	%ecx,%esi
-	movl	%esi,%ecx
-	andl	$2155905152,%esi
-	movl	%esi,%edi
+	xorl	%esi,%ecx
+	movl	$2155905152,%edi
+	andl	%ecx,%edi
+	movl	%edi,%esi
 	shrl	$7,%edi
 	leal	(%ecx,%ecx,1),%ebp
 	subl	%edi,%esi
@@ -1209,29 +1214,29 @@
 	xorl	%esi,%ebp
 	xorl	%ebx,%edx
 	xorl	%ebp,%ebx
-	roll	$24,%ebx
 	xorl	%ecx,%edx
 	xorl	%ebp,%ecx
+	roll	$24,%ebx
+	xorl	%ebp,%edx
 	roll	$16,%ecx
-	xorl	%ebp,%edx
+	xorl	%ebx,%edx
 	roll	$8,%ebp
-	xorl	%ebx,%edx
 	xorl	%ecx,%edx
 	movl	8(%esp),%ebx
 	xorl	%ebp,%edx
 	movl	%edx,16(%esp)
-	movl	%eax,%esi
-	andl	$2155905152,%esi
-	movl	%esi,%edi
+	movl	$2155905152,%edi
+	andl	%eax,%edi
+	movl	%edi,%esi
 	shrl	$7,%edi
 	leal	(%eax,%eax,1),%ecx
 	subl	%edi,%esi
 	andl	$4278124286,%ecx
 	andl	$454761243,%esi
-	xorl	%ecx,%esi
-	movl	%esi,%ecx
-	andl	$2155905152,%esi
-	movl	%esi,%edi
+	xorl	%esi,%ecx
+	movl	$2155905152,%edi
+	andl	%ecx,%edi
+	movl	%edi,%esi
 	shrl	$7,%edi
 	leal	(%ecx,%ecx,1),%edx
 	subl	%edi,%esi
@@ -1238,10 +1243,10 @@
 	andl	$4278124286,%edx
 	andl	$454761243,%esi
 	xorl	%eax,%ecx
-	xorl	%edx,%esi
-	movl	%esi,%edx
-	andl	$2155905152,%esi
-	movl	%esi,%edi
+	xorl	%esi,%edx
+	movl	$2155905152,%edi
+	andl	%edx,%edi
+	movl	%edi,%esi
 	shrl	$7,%edi
 	leal	(%edx,%edx,1),%ebp
 	subl	%edi,%esi
@@ -1252,27 +1257,27 @@
 	xorl	%esi,%ebp
 	xorl	%ecx,%eax
 	xorl	%ebp,%ecx
-	roll	$24,%ecx
 	xorl	%edx,%eax
 	xorl	%ebp,%edx
+	roll	$24,%ecx
+	xorl	%ebp,%eax
 	roll	$16,%edx
-	xorl	%ebp,%eax
+	xorl	%ecx,%eax
 	roll	$8,%ebp
-	xorl	%ecx,%eax
 	xorl	%edx,%eax
 	xorl	%ebp,%eax
-	movl	%ebx,%esi
-	andl	$2155905152,%esi
-	movl	%esi,%edi
+	movl	$2155905152,%edi
+	andl	%ebx,%edi
+	movl	%edi,%esi
 	shrl	$7,%edi
 	leal	(%ebx,%ebx,1),%ecx
 	subl	%edi,%esi
 	andl	$4278124286,%ecx
 	andl	$454761243,%esi
-	xorl	%ecx,%esi
-	movl	%esi,%ecx
-	andl	$2155905152,%esi
-	movl	%esi,%edi
+	xorl	%esi,%ecx
+	movl	$2155905152,%edi
+	andl	%ecx,%edi
+	movl	%edi,%esi
 	shrl	$7,%edi
 	leal	(%ecx,%ecx,1),%edx
 	subl	%edi,%esi
@@ -1279,10 +1284,10 @@
 	andl	$4278124286,%edx
 	andl	$454761243,%esi
 	xorl	%ebx,%ecx
-	xorl	%edx,%esi
-	movl	%esi,%edx
-	andl	$2155905152,%esi
-	movl	%esi,%edi
+	xorl	%esi,%edx
+	movl	$2155905152,%edi
+	andl	%edx,%edi
+	movl	%edi,%esi
 	shrl	$7,%edi
 	leal	(%edx,%edx,1),%ebp
 	subl	%edi,%esi
@@ -1293,13 +1298,13 @@
 	xorl	%esi,%ebp
 	xorl	%ecx,%ebx
 	xorl	%ebp,%ecx
-	roll	$24,%ecx
 	xorl	%edx,%ebx
 	xorl	%ebp,%edx
+	roll	$24,%ecx
+	xorl	%ebp,%ebx
 	roll	$16,%edx
-	xorl	%ebp,%ebx
+	xorl	%ecx,%ebx
 	roll	$8,%ebp
-	xorl	%ecx,%ebx
 	xorl	%edx,%ebx
 	movl	12(%esp),%ecx
 	xorl	%ebp,%ebx
@@ -1418,77 +1423,79 @@
 .align	16
 .L007loop:
 	pshufw	$12,%mm0,%mm1
+	pshufw	$9,%mm4,%mm5
 	movd	%mm1,%eax
-	pshufw	$9,%mm4,%mm5
+	movd	%mm5,%ebx
+	movl	%edi,20(%esp)
 	movzbl	%al,%esi
+	movzbl	%ah,%edx
+	pshufw	$6,%mm0,%mm2
 	movzbl	-128(%ebp,%esi,1),%ecx
-	movd	%mm5,%ebx
-	movzbl	%ah,%edx
+	movzbl	%bl,%edi
 	movzbl	-128(%ebp,%edx,1),%edx
+	shrl	$16,%eax
 	shll	$8,%edx
-	pshufw	$6,%mm0,%mm2
-	movzbl	%bl,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%bh,%edi
 	shll	$16,%esi
+	pshufw	$3,%mm4,%mm6
 	orl	%esi,%ecx
-	shrl	$16,%eax
-	movzbl	%bh,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%ah,%edi
 	shll	$24,%esi
+	shrl	$16,%ebx
 	orl	%esi,%edx
-	shrl	$16,%ebx
-	pshufw	$3,%mm4,%mm6
-	movzbl	%ah,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%bh,%edi
 	shll	$24,%esi
 	orl	%esi,%ecx
-	movzbl	%bh,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%al,%edi
 	shll	$8,%esi
+	movd	%mm2,%eax
 	orl	%esi,%ecx
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%bl,%edi
+	shll	$16,%esi
+	movd	%mm6,%ebx
 	movd	%ecx,%mm0
-	movzbl	%al,%esi
-	movd	%mm2,%eax
-	movzbl	-128(%ebp,%esi,1),%ecx
-	shll	$16,%ecx
-	movzbl	%bl,%esi
-	movd	%mm6,%ebx
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%ecx
+	movzbl	%al,%edi
 	orl	%esi,%ecx
-	movzbl	%al,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%bl,%edi
 	orl	%esi,%edx
-	movzbl	%bl,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%ah,%edi
 	shll	$16,%esi
+	shrl	$16,%eax
 	orl	%esi,%edx
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%bh,%edi
+	shrl	$16,%ebx
+	shll	$8,%esi
 	movd	%edx,%mm1
-	movzbl	%ah,%esi
-	movzbl	-128(%ebp,%esi,1),%edx
-	shll	$8,%edx
-	movzbl	%bh,%esi
-	shrl	$16,%eax
-	movzbl	-128(%ebp,%esi,1),%esi
-	shll	$24,%esi
+	movzbl	-128(%ebp,%edi,1),%edx
+	movzbl	%bh,%edi
+	shll	$24,%edx
+	andl	$255,%ebx
 	orl	%esi,%edx
-	shrl	$16,%ebx
 	punpckldq	%mm1,%mm0
-	movzbl	%bh,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%al,%edi
 	shll	$8,%esi
+	movzbl	%ah,%eax
+	movzbl	-128(%ebp,%ebx,1),%ebx
 	orl	%esi,%ecx
-	andl	$255,%ebx
-	movzbl	-128(%ebp,%ebx,1),%ebx
+	movzbl	-128(%ebp,%edi,1),%esi
 	orl	%ebx,%edx
-	movzbl	%al,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
 	shll	$16,%esi
+	movzbl	-128(%ebp,%eax,1),%eax
 	orl	%esi,%edx
-	movd	%edx,%mm4
-	movzbl	%ah,%eax
-	movzbl	-128(%ebp,%eax,1),%eax
 	shll	$24,%eax
 	orl	%eax,%ecx
+	movl	20(%esp),%edi
+	movd	%edx,%mm4
 	movd	%ecx,%mm5
 	punpckldq	%mm5,%mm4
 	addl	$16,%edi
@@ -2189,8 +2196,7 @@
 	call	.L010pic_point
 .L010pic_point:
 	popl	%ebp
-	leal	_GLOBAL_OFFSET_TABLE_+[.-.L010pic_point](%ebp),%eax
-	movl	OPENSSL_ia32cap_P@GOT(%eax),%eax
+	leal	OPENSSL_ia32cap_P-.L010pic_point(%ebp),%eax
 	leal	.LAES_Td-.L010pic_point(%ebp),%ebp
 	leal	764(%esp),%ebx
 	subl	%ebp,%ebx
@@ -2246,8 +2252,7 @@
 	call	.L013pic_point
 .L013pic_point:
 	popl	%ebp
-	leal	_GLOBAL_OFFSET_TABLE_+[.-.L013pic_point](%ebp),%eax
-	movl	OPENSSL_ia32cap_P@GOT(%eax),%eax
+	leal	OPENSSL_ia32cap_P-.L013pic_point(%ebp),%eax
 	cmpl	$0,40(%esp)
 	leal	.LAES_Te-.L013pic_point(%ebp),%ebp
 	jne	.L014picked_te
@@ -3052,30 +3057,30 @@
 .align	4
 .L056permute:
 	addl	$16,%edi
-	movl	%eax,%esi
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	movl	$2155905152,%ebp
+	andl	%eax,%ebp
+	leal	(%eax,%eax,1),%ebx
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%eax,%eax,1),%ebx
 	subl	%ebp,%esi
 	andl	$4278124286,%ebx
 	andl	$454761243,%esi
-	xorl	%ebx,%esi
-	movl	%esi,%ebx
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	xorl	%esi,%ebx
+	movl	$2155905152,%ebp
+	andl	%ebx,%ebp
+	leal	(%ebx,%ebx,1),%ecx
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%ebx,%ebx,1),%ecx
 	subl	%ebp,%esi
 	andl	$4278124286,%ecx
 	andl	$454761243,%esi
 	xorl	%eax,%ebx
-	xorl	%ecx,%esi
-	movl	%esi,%ecx
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	xorl	%esi,%ecx
+	movl	$2155905152,%ebp
+	andl	%ecx,%ebp
+	leal	(%ecx,%ecx,1),%edx
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%ecx,%ecx,1),%edx
 	xorl	%eax,%ecx
 	subl	%ebp,%esi
 	andl	$4278124286,%edx
@@ -3096,30 +3101,30 @@
 	movl	%ebp,%ebx
 	xorl	%edx,%eax
 	movl	%eax,(%edi)
-	movl	%ebx,%esi
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	movl	$2155905152,%ebp
+	andl	%ebx,%ebp
+	leal	(%ebx,%ebx,1),%ecx
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%ebx,%ebx,1),%ecx
 	subl	%ebp,%esi
 	andl	$4278124286,%ecx
 	andl	$454761243,%esi
-	xorl	%ecx,%esi
-	movl	%esi,%ecx
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	xorl	%esi,%ecx
+	movl	$2155905152,%ebp
+	andl	%ecx,%ebp
+	leal	(%ecx,%ecx,1),%edx
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%ecx,%ecx,1),%edx
 	subl	%ebp,%esi
 	andl	$4278124286,%edx
 	andl	$454761243,%esi
 	xorl	%ebx,%ecx
-	xorl	%edx,%esi
-	movl	%esi,%edx
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	xorl	%esi,%edx
+	movl	$2155905152,%ebp
+	andl	%edx,%ebp
+	leal	(%edx,%edx,1),%eax
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%edx,%edx,1),%eax
 	xorl	%ebx,%edx
 	subl	%ebp,%esi
 	andl	$4278124286,%eax
@@ -3140,30 +3145,30 @@
 	movl	%ebp,%ecx
 	xorl	%eax,%ebx
 	movl	%ebx,4(%edi)
-	movl	%ecx,%esi
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	movl	$2155905152,%ebp
+	andl	%ecx,%ebp
+	leal	(%ecx,%ecx,1),%edx
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%ecx,%ecx,1),%edx
 	subl	%ebp,%esi
 	andl	$4278124286,%edx
 	andl	$454761243,%esi
-	xorl	%edx,%esi
-	movl	%esi,%edx
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	xorl	%esi,%edx
+	movl	$2155905152,%ebp
+	andl	%edx,%ebp
+	leal	(%edx,%edx,1),%eax
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%edx,%edx,1),%eax
 	subl	%ebp,%esi
 	andl	$4278124286,%eax
 	andl	$454761243,%esi
 	xorl	%ecx,%edx
-	xorl	%eax,%esi
-	movl	%esi,%eax
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	xorl	%esi,%eax
+	movl	$2155905152,%ebp
+	andl	%eax,%ebp
+	leal	(%eax,%eax,1),%ebx
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%eax,%eax,1),%ebx
 	xorl	%ecx,%eax
 	subl	%ebp,%esi
 	andl	$4278124286,%ebx
@@ -3184,30 +3189,30 @@
 	movl	%ebp,%edx
 	xorl	%ebx,%ecx
 	movl	%ecx,8(%edi)
-	movl	%edx,%esi
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	movl	$2155905152,%ebp
+	andl	%edx,%ebp
+	leal	(%edx,%edx,1),%eax
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%edx,%edx,1),%eax
 	subl	%ebp,%esi
 	andl	$4278124286,%eax
 	andl	$454761243,%esi
-	xorl	%eax,%esi
-	movl	%esi,%eax
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	xorl	%esi,%eax
+	movl	$2155905152,%ebp
+	andl	%eax,%ebp
+	leal	(%eax,%eax,1),%ebx
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%eax,%eax,1),%ebx
 	subl	%ebp,%esi
 	andl	$4278124286,%ebx
 	andl	$454761243,%esi
 	xorl	%edx,%eax
-	xorl	%ebx,%esi
-	movl	%esi,%ebx
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	xorl	%esi,%ebx
+	movl	$2155905152,%ebp
+	andl	%ebx,%ebp
+	leal	(%ebx,%ebx,1),%ecx
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%ebx,%ebx,1),%ecx
 	xorl	%edx,%ebx
 	subl	%ebp,%esi
 	andl	$4278124286,%ecx
@@ -3240,7 +3245,7 @@
 .byte	65,69,83,32,102,111,114,32,120,56,54,44,32,67,82,89
 .byte	80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114
 .byte	111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.comm	OPENSSL_ia32cap_P,8,4
+.comm	OPENSSL_ia32cap_P,16,4
 #else
 .file	"aes-586.S"
 .text
@@ -3344,74 +3349,78 @@
 	xorl	%ecx,%edx
 	movl	%esi,%ecx
 
-	movl	%ecx,%esi
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	movl	$2155905152,%ebp
+	andl	%ecx,%ebp
+	leal	(%ecx,%ecx,1),%edi
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%ecx,%ecx,1),%edi
+	andl	$4278124286,%edi
 	subl	%ebp,%esi
-	andl	$4278124286,%edi
+	movl	%ecx,%ebp
 	andl	$454761243,%esi
-	movl	%ecx,%ebp
+	rorl	$16,%ebp
 	xorl	%edi,%esi
+	movl	%ecx,%edi
 	xorl	%esi,%ecx
+	rorl	$24,%edi
+	xorl	%ebp,%esi
 	roll	$24,%ecx
+	xorl	%edi,%esi
+	movl	$2155905152,%ebp
 	xorl	%esi,%ecx
-	rorl	$16,%ebp
-	xorl	%ebp,%ecx
-	rorl	$8,%ebp
-	xorl	%ebp,%ecx
-	movl	%edx,%esi
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	andl	%edx,%ebp
+	leal	(%edx,%edx,1),%edi
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%edx,%edx,1),%edi
+	andl	$4278124286,%edi
 	subl	%ebp,%esi
-	andl	$4278124286,%edi
+	movl	%edx,%ebp
 	andl	$454761243,%esi
-	movl	%edx,%ebp
+	rorl	$16,%ebp
 	xorl	%edi,%esi
+	movl	%edx,%edi
 	xorl	%esi,%edx
+	rorl	$24,%edi
+	xorl	%ebp,%esi
 	roll	$24,%edx
+	xorl	%edi,%esi
+	movl	$2155905152,%ebp
 	xorl	%esi,%edx
-	rorl	$16,%ebp
-	xorl	%ebp,%edx
-	rorl	$8,%ebp
-	xorl	%ebp,%edx
-	movl	%eax,%esi
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	andl	%eax,%ebp
+	leal	(%eax,%eax,1),%edi
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%eax,%eax,1),%edi
+	andl	$4278124286,%edi
 	subl	%ebp,%esi
-	andl	$4278124286,%edi
+	movl	%eax,%ebp
 	andl	$454761243,%esi
-	movl	%eax,%ebp
+	rorl	$16,%ebp
 	xorl	%edi,%esi
+	movl	%eax,%edi
 	xorl	%esi,%eax
+	rorl	$24,%edi
+	xorl	%ebp,%esi
 	roll	$24,%eax
+	xorl	%edi,%esi
+	movl	$2155905152,%ebp
 	xorl	%esi,%eax
-	rorl	$16,%ebp
-	xorl	%ebp,%eax
-	rorl	$8,%ebp
-	xorl	%ebp,%eax
-	movl	%ebx,%esi
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	andl	%ebx,%ebp
+	leal	(%ebx,%ebx,1),%edi
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%ebx,%ebx,1),%edi
+	andl	$4278124286,%edi
 	subl	%ebp,%esi
-	andl	$4278124286,%edi
+	movl	%ebx,%ebp
 	andl	$454761243,%esi
-	movl	%ebx,%ebp
+	rorl	$16,%ebp
 	xorl	%edi,%esi
+	movl	%ebx,%edi
 	xorl	%esi,%ebx
+	rorl	$24,%edi
+	xorl	%ebp,%esi
 	roll	$24,%ebx
+	xorl	%edi,%esi
 	xorl	%esi,%ebx
-	rorl	$16,%ebp
-	xorl	%ebp,%ebx
-	rorl	$8,%ebp
-	xorl	%ebp,%ebx
 	movl	20(%esp),%edi
 	movl	28(%esp),%ebp
 	addl	$16,%edi
@@ -3533,74 +3542,76 @@
 	pshufw	$13,%mm4,%mm5
 	movd	%mm1,%eax
 	movd	%mm5,%ebx
+	movl	%edi,20(%esp)
 	movzbl	%al,%esi
+	movzbl	%ah,%edx
+	pshufw	$13,%mm0,%mm2
 	movzbl	-128(%ebp,%esi,1),%ecx
-	pshufw	$13,%mm0,%mm2
-	movzbl	%ah,%edx
+	movzbl	%bl,%edi
 	movzbl	-128(%ebp,%edx,1),%edx
+	shrl	$16,%eax
 	shll	$8,%edx
-	shrl	$16,%eax
-	movzbl	%bl,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%bh,%edi
 	shll	$16,%esi
+	pshufw	$8,%mm4,%mm6
 	orl	%esi,%ecx
-	pshufw	$8,%mm4,%mm6
-	movzbl	%bh,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%ah,%edi
 	shll	$24,%esi
+	shrl	$16,%ebx
 	orl	%esi,%edx
-	shrl	$16,%ebx
-	movzbl	%ah,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%bh,%edi
 	shll	$8,%esi
 	orl	%esi,%ecx
-	movzbl	%bh,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%al,%edi
 	shll	$24,%esi
 	orl	%esi,%ecx
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%bl,%edi
+	movd	%mm2,%eax
 	movd	%ecx,%mm0
-	movzbl	%al,%esi
-	movzbl	-128(%ebp,%esi,1),%ecx
-	movd	%mm2,%eax
-	movzbl	%bl,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
-	shll	$16,%esi
+	movzbl	-128(%ebp,%edi,1),%ecx
+	movzbl	%ah,%edi
+	shll	$16,%ecx
+	movd	%mm6,%ebx
 	orl	%esi,%ecx
-	movd	%mm6,%ebx
-	movzbl	%ah,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%bh,%edi
 	shll	$24,%esi
 	orl	%esi,%ecx
-	movzbl	%bh,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%bl,%edi
 	shll	$8,%esi
+	shrl	$16,%ebx
 	orl	%esi,%ecx
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%al,%edi
+	shrl	$16,%eax
 	movd	%ecx,%mm1
-	movzbl	%bl,%esi
-	movzbl	-128(%ebp,%esi,1),%ecx
-	shrl	$16,%ebx
-	movzbl	%al,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
-	shll	$16,%esi
+	movzbl	-128(%ebp,%edi,1),%ecx
+	movzbl	%ah,%edi
+	shll	$16,%ecx
+	andl	$255,%eax
 	orl	%esi,%ecx
-	shrl	$16,%eax
 	punpckldq	%mm1,%mm0
-	movzbl	%ah,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%bh,%edi
 	shll	$24,%esi
+	andl	$255,%ebx
+	movzbl	-128(%ebp,%eax,1),%eax
 	orl	%esi,%ecx
-	andl	$255,%eax
-	movzbl	-128(%ebp,%eax,1),%eax
 	shll	$16,%eax
+	movzbl	-128(%ebp,%edi,1),%esi
 	orl	%eax,%edx
-	movzbl	%bh,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
 	shll	$8,%esi
+	movzbl	-128(%ebp,%ebx,1),%ebx
 	orl	%esi,%ecx
+	orl	%ebx,%edx
+	movl	20(%esp),%edi
 	movd	%ecx,%mm4
-	andl	$255,%ebx
-	movzbl	-128(%ebp,%ebx,1),%ebx
-	orl	%ebx,%edx
 	movd	%edx,%mm5
 	punpckldq	%mm5,%mm4
 	addl	$16,%edi
@@ -4373,18 +4384,18 @@
 	movzbl	-128(%ebp,%eax,1),%eax
 	shll	$24,%eax
 	xorl	%eax,%edx
-	movl	%ecx,%esi
-	andl	$2155905152,%esi
-	movl	%esi,%edi
+	movl	$2155905152,%edi
+	andl	%ecx,%edi
+	movl	%edi,%esi
 	shrl	$7,%edi
 	leal	(%ecx,%ecx,1),%eax
 	subl	%edi,%esi
 	andl	$4278124286,%eax
 	andl	$454761243,%esi
-	xorl	%eax,%esi
-	movl	%esi,%eax
-	andl	$2155905152,%esi
-	movl	%esi,%edi
+	xorl	%esi,%eax
+	movl	$2155905152,%edi
+	andl	%eax,%edi
+	movl	%edi,%esi
 	shrl	$7,%edi
 	leal	(%eax,%eax,1),%ebx
 	subl	%edi,%esi
@@ -4391,10 +4402,10 @@
 	andl	$4278124286,%ebx
 	andl	$454761243,%esi
 	xorl	%ecx,%eax
-	xorl	%ebx,%esi
-	movl	%esi,%ebx
-	andl	$2155905152,%esi
-	movl	%esi,%edi
+	xorl	%esi,%ebx
+	movl	$2155905152,%edi
+	andl	%ebx,%edi
+	movl	%edi,%esi
 	shrl	$7,%edi
 	leal	(%ebx,%ebx,1),%ebp
 	subl	%edi,%esi
@@ -4405,29 +4416,29 @@
 	xorl	%esi,%ebp
 	xorl	%eax,%ecx
 	xorl	%ebp,%eax
-	roll	$24,%eax
 	xorl	%ebx,%ecx
 	xorl	%ebp,%ebx
+	roll	$24,%eax
+	xorl	%ebp,%ecx
 	roll	$16,%ebx
-	xorl	%ebp,%ecx
+	xorl	%eax,%ecx
 	roll	$8,%ebp
-	xorl	%eax,%ecx
 	xorl	%ebx,%ecx
 	movl	4(%esp),%eax
 	xorl	%ebp,%ecx
 	movl	%ecx,12(%esp)
-	movl	%edx,%esi
-	andl	$2155905152,%esi
-	movl	%esi,%edi
+	movl	$2155905152,%edi
+	andl	%edx,%edi
+	movl	%edi,%esi
 	shrl	$7,%edi
 	leal	(%edx,%edx,1),%ebx
 	subl	%edi,%esi
 	andl	$4278124286,%ebx
 	andl	$454761243,%esi
-	xorl	%ebx,%esi
-	movl	%esi,%ebx
-	andl	$2155905152,%esi
-	movl	%esi,%edi
+	xorl	%esi,%ebx
+	movl	$2155905152,%edi
+	andl	%ebx,%edi
+	movl	%edi,%esi
 	shrl	$7,%edi
 	leal	(%ebx,%ebx,1),%ecx
 	subl	%edi,%esi
@@ -4434,10 +4445,10 @@
 	andl	$4278124286,%ecx
 	andl	$454761243,%esi
 	xorl	%edx,%ebx
-	xorl	%ecx,%esi
-	movl	%esi,%ecx
-	andl	$2155905152,%esi
-	movl	%esi,%edi
+	xorl	%esi,%ecx
+	movl	$2155905152,%edi
+	andl	%ecx,%edi
+	movl	%edi,%esi
 	shrl	$7,%edi
 	leal	(%ecx,%ecx,1),%ebp
 	subl	%edi,%esi
@@ -4448,29 +4459,29 @@
 	xorl	%esi,%ebp
 	xorl	%ebx,%edx
 	xorl	%ebp,%ebx
-	roll	$24,%ebx
 	xorl	%ecx,%edx
 	xorl	%ebp,%ecx
+	roll	$24,%ebx
+	xorl	%ebp,%edx
 	roll	$16,%ecx
-	xorl	%ebp,%edx
+	xorl	%ebx,%edx
 	roll	$8,%ebp
-	xorl	%ebx,%edx
 	xorl	%ecx,%edx
 	movl	8(%esp),%ebx
 	xorl	%ebp,%edx
 	movl	%edx,16(%esp)
-	movl	%eax,%esi
-	andl	$2155905152,%esi
-	movl	%esi,%edi
+	movl	$2155905152,%edi
+	andl	%eax,%edi
+	movl	%edi,%esi
 	shrl	$7,%edi
 	leal	(%eax,%eax,1),%ecx
 	subl	%edi,%esi
 	andl	$4278124286,%ecx
 	andl	$454761243,%esi
-	xorl	%ecx,%esi
-	movl	%esi,%ecx
-	andl	$2155905152,%esi
-	movl	%esi,%edi
+	xorl	%esi,%ecx
+	movl	$2155905152,%edi
+	andl	%ecx,%edi
+	movl	%edi,%esi
 	shrl	$7,%edi
 	leal	(%ecx,%ecx,1),%edx
 	subl	%edi,%esi
@@ -4477,10 +4488,10 @@
 	andl	$4278124286,%edx
 	andl	$454761243,%esi
 	xorl	%eax,%ecx
-	xorl	%edx,%esi
-	movl	%esi,%edx
-	andl	$2155905152,%esi
-	movl	%esi,%edi
+	xorl	%esi,%edx
+	movl	$2155905152,%edi
+	andl	%edx,%edi
+	movl	%edi,%esi
 	shrl	$7,%edi
 	leal	(%edx,%edx,1),%ebp
 	subl	%edi,%esi
@@ -4491,27 +4502,27 @@
 	xorl	%esi,%ebp
 	xorl	%ecx,%eax
 	xorl	%ebp,%ecx
-	roll	$24,%ecx
 	xorl	%edx,%eax
 	xorl	%ebp,%edx
+	roll	$24,%ecx
+	xorl	%ebp,%eax
 	roll	$16,%edx
-	xorl	%ebp,%eax
+	xorl	%ecx,%eax
 	roll	$8,%ebp
-	xorl	%ecx,%eax
 	xorl	%edx,%eax
 	xorl	%ebp,%eax
-	movl	%ebx,%esi
-	andl	$2155905152,%esi
-	movl	%esi,%edi
+	movl	$2155905152,%edi
+	andl	%ebx,%edi
+	movl	%edi,%esi
 	shrl	$7,%edi
 	leal	(%ebx,%ebx,1),%ecx
 	subl	%edi,%esi
 	andl	$4278124286,%ecx
 	andl	$454761243,%esi
-	xorl	%ecx,%esi
-	movl	%esi,%ecx
-	andl	$2155905152,%esi
-	movl	%esi,%edi
+	xorl	%esi,%ecx
+	movl	$2155905152,%edi
+	andl	%ecx,%edi
+	movl	%edi,%esi
 	shrl	$7,%edi
 	leal	(%ecx,%ecx,1),%edx
 	subl	%edi,%esi
@@ -4518,10 +4529,10 @@
 	andl	$4278124286,%edx
 	andl	$454761243,%esi
 	xorl	%ebx,%ecx
-	xorl	%edx,%esi
-	movl	%esi,%edx
-	andl	$2155905152,%esi
-	movl	%esi,%edi
+	xorl	%esi,%edx
+	movl	$2155905152,%edi
+	andl	%edx,%edi
+	movl	%edi,%esi
 	shrl	$7,%edi
 	leal	(%edx,%edx,1),%ebp
 	subl	%edi,%esi
@@ -4532,13 +4543,13 @@
 	xorl	%esi,%ebp
 	xorl	%ecx,%ebx
 	xorl	%ebp,%ecx
-	roll	$24,%ecx
 	xorl	%edx,%ebx
 	xorl	%ebp,%edx
+	roll	$24,%ecx
+	xorl	%ebp,%ebx
 	roll	$16,%edx
-	xorl	%ebp,%ebx
+	xorl	%ecx,%ebx
 	roll	$8,%ebp
-	xorl	%ecx,%ebx
 	xorl	%edx,%ebx
 	movl	12(%esp),%ecx
 	xorl	%ebp,%ebx
@@ -4657,77 +4668,79 @@
 .align	16
 .L007loop:
 	pshufw	$12,%mm0,%mm1
+	pshufw	$9,%mm4,%mm5
 	movd	%mm1,%eax
-	pshufw	$9,%mm4,%mm5
+	movd	%mm5,%ebx
+	movl	%edi,20(%esp)
 	movzbl	%al,%esi
+	movzbl	%ah,%edx
+	pshufw	$6,%mm0,%mm2
 	movzbl	-128(%ebp,%esi,1),%ecx
-	movd	%mm5,%ebx
-	movzbl	%ah,%edx
+	movzbl	%bl,%edi
 	movzbl	-128(%ebp,%edx,1),%edx
+	shrl	$16,%eax
 	shll	$8,%edx
-	pshufw	$6,%mm0,%mm2
-	movzbl	%bl,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%bh,%edi
 	shll	$16,%esi
+	pshufw	$3,%mm4,%mm6
 	orl	%esi,%ecx
-	shrl	$16,%eax
-	movzbl	%bh,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%ah,%edi
 	shll	$24,%esi
+	shrl	$16,%ebx
 	orl	%esi,%edx
-	shrl	$16,%ebx
-	pshufw	$3,%mm4,%mm6
-	movzbl	%ah,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%bh,%edi
 	shll	$24,%esi
 	orl	%esi,%ecx
-	movzbl	%bh,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%al,%edi
 	shll	$8,%esi
+	movd	%mm2,%eax
 	orl	%esi,%ecx
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%bl,%edi
+	shll	$16,%esi
+	movd	%mm6,%ebx
 	movd	%ecx,%mm0
-	movzbl	%al,%esi
-	movd	%mm2,%eax
-	movzbl	-128(%ebp,%esi,1),%ecx
-	shll	$16,%ecx
-	movzbl	%bl,%esi
-	movd	%mm6,%ebx
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%ecx
+	movzbl	%al,%edi
 	orl	%esi,%ecx
-	movzbl	%al,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%bl,%edi
 	orl	%esi,%edx
-	movzbl	%bl,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%ah,%edi
 	shll	$16,%esi
+	shrl	$16,%eax
 	orl	%esi,%edx
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%bh,%edi
+	shrl	$16,%ebx
+	shll	$8,%esi
 	movd	%edx,%mm1
-	movzbl	%ah,%esi
-	movzbl	-128(%ebp,%esi,1),%edx
-	shll	$8,%edx
-	movzbl	%bh,%esi
-	shrl	$16,%eax
-	movzbl	-128(%ebp,%esi,1),%esi
-	shll	$24,%esi
+	movzbl	-128(%ebp,%edi,1),%edx
+	movzbl	%bh,%edi
+	shll	$24,%edx
+	andl	$255,%ebx
 	orl	%esi,%edx
-	shrl	$16,%ebx
 	punpckldq	%mm1,%mm0
-	movzbl	%bh,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
+	movzbl	-128(%ebp,%edi,1),%esi
+	movzbl	%al,%edi
 	shll	$8,%esi
+	movzbl	%ah,%eax
+	movzbl	-128(%ebp,%ebx,1),%ebx
 	orl	%esi,%ecx
-	andl	$255,%ebx
-	movzbl	-128(%ebp,%ebx,1),%ebx
+	movzbl	-128(%ebp,%edi,1),%esi
 	orl	%ebx,%edx
-	movzbl	%al,%esi
-	movzbl	-128(%ebp,%esi,1),%esi
 	shll	$16,%esi
+	movzbl	-128(%ebp,%eax,1),%eax
 	orl	%esi,%edx
-	movd	%edx,%mm4
-	movzbl	%ah,%eax
-	movzbl	-128(%ebp,%eax,1),%eax
 	shll	$24,%eax
 	orl	%eax,%ecx
+	movl	20(%esp),%edi
+	movd	%edx,%mm4
 	movd	%ecx,%mm5
 	punpckldq	%mm5,%mm4
 	addl	$16,%edi
@@ -6289,30 +6302,30 @@
 .align	4
 .L056permute:
 	addl	$16,%edi
-	movl	%eax,%esi
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	movl	$2155905152,%ebp
+	andl	%eax,%ebp
+	leal	(%eax,%eax,1),%ebx
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%eax,%eax,1),%ebx
 	subl	%ebp,%esi
 	andl	$4278124286,%ebx
 	andl	$454761243,%esi
-	xorl	%ebx,%esi
-	movl	%esi,%ebx
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	xorl	%esi,%ebx
+	movl	$2155905152,%ebp
+	andl	%ebx,%ebp
+	leal	(%ebx,%ebx,1),%ecx
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%ebx,%ebx,1),%ecx
 	subl	%ebp,%esi
 	andl	$4278124286,%ecx
 	andl	$454761243,%esi
 	xorl	%eax,%ebx
-	xorl	%ecx,%esi
-	movl	%esi,%ecx
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	xorl	%esi,%ecx
+	movl	$2155905152,%ebp
+	andl	%ecx,%ebp
+	leal	(%ecx,%ecx,1),%edx
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%ecx,%ecx,1),%edx
 	xorl	%eax,%ecx
 	subl	%ebp,%esi
 	andl	$4278124286,%edx
@@ -6333,30 +6346,30 @@
 	movl	%ebp,%ebx
 	xorl	%edx,%eax
 	movl	%eax,(%edi)
-	movl	%ebx,%esi
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	movl	$2155905152,%ebp
+	andl	%ebx,%ebp
+	leal	(%ebx,%ebx,1),%ecx
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%ebx,%ebx,1),%ecx
 	subl	%ebp,%esi
 	andl	$4278124286,%ecx
 	andl	$454761243,%esi
-	xorl	%ecx,%esi
-	movl	%esi,%ecx
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	xorl	%esi,%ecx
+	movl	$2155905152,%ebp
+	andl	%ecx,%ebp
+	leal	(%ecx,%ecx,1),%edx
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%ecx,%ecx,1),%edx
 	subl	%ebp,%esi
 	andl	$4278124286,%edx
 	andl	$454761243,%esi
 	xorl	%ebx,%ecx
-	xorl	%edx,%esi
-	movl	%esi,%edx
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	xorl	%esi,%edx
+	movl	$2155905152,%ebp
+	andl	%edx,%ebp
+	leal	(%edx,%edx,1),%eax
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%edx,%edx,1),%eax
 	xorl	%ebx,%edx
 	subl	%ebp,%esi
 	andl	$4278124286,%eax
@@ -6377,30 +6390,30 @@
 	movl	%ebp,%ecx
 	xorl	%eax,%ebx
 	movl	%ebx,4(%edi)
-	movl	%ecx,%esi
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	movl	$2155905152,%ebp
+	andl	%ecx,%ebp
+	leal	(%ecx,%ecx,1),%edx
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%ecx,%ecx,1),%edx
 	subl	%ebp,%esi
 	andl	$4278124286,%edx
 	andl	$454761243,%esi
-	xorl	%edx,%esi
-	movl	%esi,%edx
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	xorl	%esi,%edx
+	movl	$2155905152,%ebp
+	andl	%edx,%ebp
+	leal	(%edx,%edx,1),%eax
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%edx,%edx,1),%eax
 	subl	%ebp,%esi
 	andl	$4278124286,%eax
 	andl	$454761243,%esi
 	xorl	%ecx,%edx
-	xorl	%eax,%esi
-	movl	%esi,%eax
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	xorl	%esi,%eax
+	movl	$2155905152,%ebp
+	andl	%eax,%ebp
+	leal	(%eax,%eax,1),%ebx
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%eax,%eax,1),%ebx
 	xorl	%ecx,%eax
 	subl	%ebp,%esi
 	andl	$4278124286,%ebx
@@ -6421,30 +6434,30 @@
 	movl	%ebp,%edx
 	xorl	%ebx,%ecx
 	movl	%ecx,8(%edi)
-	movl	%edx,%esi
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	movl	$2155905152,%ebp
+	andl	%edx,%ebp
+	leal	(%edx,%edx,1),%eax
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%edx,%edx,1),%eax
 	subl	%ebp,%esi
 	andl	$4278124286,%eax
 	andl	$454761243,%esi
-	xorl	%eax,%esi
-	movl	%esi,%eax
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	xorl	%esi,%eax
+	movl	$2155905152,%ebp
+	andl	%eax,%ebp
+	leal	(%eax,%eax,1),%ebx
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%eax,%eax,1),%ebx
 	subl	%ebp,%esi
 	andl	$4278124286,%ebx
 	andl	$454761243,%esi
 	xorl	%edx,%eax
-	xorl	%ebx,%esi
-	movl	%esi,%ebx
-	andl	$2155905152,%esi
-	movl	%esi,%ebp
+	xorl	%esi,%ebx
+	movl	$2155905152,%ebp
+	andl	%ebx,%ebp
+	leal	(%ebx,%ebx,1),%ecx
+	movl	%ebp,%esi
 	shrl	$7,%ebp
-	leal	(%ebx,%ebx,1),%ecx
 	xorl	%edx,%ebx
 	subl	%ebp,%esi
 	andl	$4278124286,%ecx
@@ -6477,5 +6490,5 @@
 .byte	65,69,83,32,102,111,114,32,120,56,54,44,32,67,82,89
 .byte	80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114
 .byte	111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.comm	OPENSSL_ia32cap_P,8,4
+.comm	OPENSSL_ia32cap_P,16,4
 #endif
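
A note for readers following the rearranged aes-586.S hunks above: the constants 2155905152 (0x80808080), 4278124286 (0xFEFEFEFE) and 454761243 (0x1B1B1B1B) implement GF(2^8) doubling ("xtime") on four AES state bytes packed into one 32-bit register; the new ordering appears to be instruction rescheduling around the rotates rather than a change in the math. A minimal C sketch of that packed doubling, assuming the standard AES reduction polynomial (the helper name xtime4 is illustrative and not part of the generated code):

#include <stdint.h>

/* Illustrative only: double four GF(2^8) field elements packed into one
 * 32-bit word, the operation the hunks above compute with the constants
 * 0x80808080, 0xFEFEFEFE and 0x1B1B1B1B. */
static uint32_t xtime4(uint32_t x)
{
	uint32_t hi  = x & 0x80808080u;                /* high bit of each byte */
	uint32_t red = (hi - (hi >> 7)) & 0x1b1b1b1bu; /* 0x1b in each byte whose high bit was set */
	return ((x << 1) & 0xfefefefeu) ^ red;         /* shift each byte left, then reduce */
}

The decrypt-side hunks chain the same doubling to build the InvMixColumns multiples; there too, only the instruction order changes in this revision.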

Modified: trunk/secure/lib/libcrypto/i386/aesni-x86.S
===================================================================
--- trunk/secure/lib/libcrypto/i386/aesni-x86.S	2019-01-20 05:38:15 UTC (rev 12153)
+++ trunk/secure/lib/libcrypto/i386/aesni-x86.S	2019-01-20 05:38:27 UTC (rev 12154)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/i386/aesni-x86.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from aesni-x86.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/i386/aesni-x86.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from aesni-x86.pl. */
 #ifdef PIC
 .file	"aesni-x86.S"
 .text
@@ -25,7 +25,10 @@
 	leal	16(%edx),%edx
 	jnz	.L000enc1_loop_1
 .byte	102,15,56,221,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
 	movups	%xmm2,(%eax)
+	pxor	%xmm2,%xmm2
 	ret
 .size	aesni_encrypt,.-.L_aesni_encrypt_begin
 .globl	aesni_decrypt
@@ -49,32 +52,90 @@
 	leal	16(%edx),%edx
 	jnz	.L001dec1_loop_2
 .byte	102,15,56,223,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
 	movups	%xmm2,(%eax)
+	pxor	%xmm2,%xmm2
 	ret
 .size	aesni_decrypt,.-.L_aesni_decrypt_begin
+.type	_aesni_encrypt2,@function
+.align	16
+_aesni_encrypt2:
+	movups	(%edx),%xmm0
+	shll	$4,%ecx
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+	addl	$16,%ecx
+.L002enc2_loop:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L002enc2_loop
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+	ret
+.size	_aesni_encrypt2,.-_aesni_encrypt2
+.type	_aesni_decrypt2,@function
+.align	16
+_aesni_decrypt2:
+	movups	(%edx),%xmm0
+	shll	$4,%ecx
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+	addl	$16,%ecx
+.L003dec2_loop:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L003dec2_loop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+	ret
+.size	_aesni_decrypt2,.-_aesni_decrypt2
 .type	_aesni_encrypt3,@function
 .align	16
 _aesni_encrypt3:
 	movups	(%edx),%xmm0
-	shrl	$1,%ecx
+	shll	$4,%ecx
 	movups	16(%edx),%xmm1
-	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
 	pxor	%xmm0,%xmm3
 	pxor	%xmm0,%xmm4
-	movups	(%edx),%xmm0
-.L002enc3_loop:
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+	addl	$16,%ecx
+.L004enc3_loop:
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
-	decl	%ecx
 .byte	102,15,56,220,225
-	movups	16(%edx),%xmm1
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
 .byte	102,15,56,220,208
 .byte	102,15,56,220,216
-	leal	32(%edx),%edx
 .byte	102,15,56,220,224
-	movups	(%edx),%xmm0
-	jnz	.L002enc3_loop
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L004enc3_loop
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
 .byte	102,15,56,220,225
@@ -87,25 +148,26 @@
 .align	16
 _aesni_decrypt3:
 	movups	(%edx),%xmm0
-	shrl	$1,%ecx
+	shll	$4,%ecx
 	movups	16(%edx),%xmm1
-	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
 	pxor	%xmm0,%xmm3
 	pxor	%xmm0,%xmm4
-	movups	(%edx),%xmm0
-.L003dec3_loop:
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+	addl	$16,%ecx
+.L005dec3_loop:
 .byte	102,15,56,222,209
 .byte	102,15,56,222,217
-	decl	%ecx
 .byte	102,15,56,222,225
-	movups	16(%edx),%xmm1
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
 .byte	102,15,56,222,208
 .byte	102,15,56,222,216
-	leal	32(%edx),%edx
 .byte	102,15,56,222,224
-	movups	(%edx),%xmm0
-	jnz	.L003dec3_loop
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L005dec3_loop
 .byte	102,15,56,222,209
 .byte	102,15,56,222,217
 .byte	102,15,56,222,225
@@ -119,27 +181,29 @@
 _aesni_encrypt4:
 	movups	(%edx),%xmm0
 	movups	16(%edx),%xmm1
-	shrl	$1,%ecx
-	leal	32(%edx),%edx
+	shll	$4,%ecx
 	xorps	%xmm0,%xmm2
 	pxor	%xmm0,%xmm3
 	pxor	%xmm0,%xmm4
 	pxor	%xmm0,%xmm5
-	movups	(%edx),%xmm0
-.L004enc4_loop:
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+.byte	15,31,64,0
+	addl	$16,%ecx
+.L006enc4_loop:
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
-	decl	%ecx
 .byte	102,15,56,220,225
 .byte	102,15,56,220,233
-	movups	16(%edx),%xmm1
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
 .byte	102,15,56,220,208
 .byte	102,15,56,220,216
-	leal	32(%edx),%edx
 .byte	102,15,56,220,224
 .byte	102,15,56,220,232
-	movups	(%edx),%xmm0
-	jnz	.L004enc4_loop
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L006enc4_loop
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
 .byte	102,15,56,220,225
@@ -155,27 +219,29 @@
 _aesni_decrypt4:
 	movups	(%edx),%xmm0
 	movups	16(%edx),%xmm1
-	shrl	$1,%ecx
-	leal	32(%edx),%edx
+	shll	$4,%ecx
 	xorps	%xmm0,%xmm2
 	pxor	%xmm0,%xmm3
 	pxor	%xmm0,%xmm4
 	pxor	%xmm0,%xmm5
-	movups	(%edx),%xmm0
-.L005dec4_loop:
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+.byte	15,31,64,0
+	addl	$16,%ecx
+.L007dec4_loop:
 .byte	102,15,56,222,209
 .byte	102,15,56,222,217
-	decl	%ecx
 .byte	102,15,56,222,225
 .byte	102,15,56,222,233
-	movups	16(%edx),%xmm1
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
 .byte	102,15,56,222,208
 .byte	102,15,56,222,216
-	leal	32(%edx),%edx
 .byte	102,15,56,222,224
 .byte	102,15,56,222,232
-	movups	(%edx),%xmm0
-	jnz	.L005dec4_loop
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L007dec4_loop
 .byte	102,15,56,222,209
 .byte	102,15,56,222,217
 .byte	102,15,56,222,225
@@ -190,45 +256,42 @@
 .align	16
 _aesni_encrypt6:
 	movups	(%edx),%xmm0
-	shrl	$1,%ecx
+	shll	$4,%ecx
 	movups	16(%edx),%xmm1
-	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
 	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
 .byte	102,15,56,220,209
-	pxor	%xmm0,%xmm4
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
 .byte	102,15,56,220,217
-	pxor	%xmm0,%xmm5
-	decl	%ecx
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
 .byte	102,15,56,220,225
-	pxor	%xmm0,%xmm6
-.byte	102,15,56,220,233
 	pxor	%xmm0,%xmm7
-.byte	102,15,56,220,241
-	movups	(%edx),%xmm0
-.byte	102,15,56,220,249
-	jmp	.L_aesni_encrypt6_enter
+	movups	(%edx,%ecx,1),%xmm0
+	addl	$16,%ecx
+	jmp	.L008_aesni_encrypt6_inner
 .align	16
-.L006enc6_loop:
+.L009enc6_loop:
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
-	decl	%ecx
 .byte	102,15,56,220,225
+.L008_aesni_encrypt6_inner:
 .byte	102,15,56,220,233
 .byte	102,15,56,220,241
 .byte	102,15,56,220,249
-.align	16
 .L_aesni_encrypt6_enter:
-	movups	16(%edx),%xmm1
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
 .byte	102,15,56,220,208
 .byte	102,15,56,220,216
-	leal	32(%edx),%edx
 .byte	102,15,56,220,224
 .byte	102,15,56,220,232
 .byte	102,15,56,220,240
 .byte	102,15,56,220,248
-	movups	(%edx),%xmm0
-	jnz	.L006enc6_loop
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L009enc6_loop
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
 .byte	102,15,56,220,225
@@ -247,45 +310,42 @@
 .align	16
 _aesni_decrypt6:
 	movups	(%edx),%xmm0
-	shrl	$1,%ecx
+	shll	$4,%ecx
 	movups	16(%edx),%xmm1
-	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
 	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
 .byte	102,15,56,222,209
-	pxor	%xmm0,%xmm4
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
 .byte	102,15,56,222,217
-	pxor	%xmm0,%xmm5
-	decl	%ecx
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
 .byte	102,15,56,222,225
-	pxor	%xmm0,%xmm6
-.byte	102,15,56,222,233
 	pxor	%xmm0,%xmm7
-.byte	102,15,56,222,241
-	movups	(%edx),%xmm0
-.byte	102,15,56,222,249
-	jmp	.L_aesni_decrypt6_enter
+	movups	(%edx,%ecx,1),%xmm0
+	addl	$16,%ecx
+	jmp	.L010_aesni_decrypt6_inner
 .align	16
-.L007dec6_loop:
+.L011dec6_loop:
 .byte	102,15,56,222,209
 .byte	102,15,56,222,217
-	decl	%ecx
 .byte	102,15,56,222,225
+.L010_aesni_decrypt6_inner:
 .byte	102,15,56,222,233
 .byte	102,15,56,222,241
 .byte	102,15,56,222,249
-.align	16
 .L_aesni_decrypt6_enter:
-	movups	16(%edx),%xmm1
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
 .byte	102,15,56,222,208
 .byte	102,15,56,222,216
-	leal	32(%edx),%edx
 .byte	102,15,56,222,224
 .byte	102,15,56,222,232
 .byte	102,15,56,222,240
 .byte	102,15,56,222,248
-	movups	(%edx),%xmm0
-	jnz	.L007dec6_loop
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L011dec6_loop
 .byte	102,15,56,222,209
 .byte	102,15,56,222,217
 .byte	102,15,56,222,225
@@ -315,14 +375,14 @@
 	movl	32(%esp),%edx
 	movl	36(%esp),%ebx
 	andl	$-16,%eax
-	jz	.L008ecb_ret
+	jz	.L012ecb_ret
 	movl	240(%edx),%ecx
 	testl	%ebx,%ebx
-	jz	.L009ecb_decrypt
+	jz	.L013ecb_decrypt
 	movl	%edx,%ebp
 	movl	%ecx,%ebx
 	cmpl	$96,%eax
-	jb	.L010ecb_enc_tail
+	jb	.L014ecb_enc_tail
 	movdqu	(%esi),%xmm2
 	movdqu	16(%esi),%xmm3
 	movdqu	32(%esi),%xmm4
@@ -331,9 +391,9 @@
 	movdqu	80(%esi),%xmm7
 	leal	96(%esi),%esi
 	subl	$96,%eax
-	jmp	.L011ecb_enc_loop6_enter
+	jmp	.L015ecb_enc_loop6_enter
 .align	16
-.L012ecb_enc_loop6:
+.L016ecb_enc_loop6:
 	movups	%xmm2,(%edi)
 	movdqu	(%esi),%xmm2
 	movups	%xmm3,16(%edi)
@@ -348,12 +408,12 @@
 	leal	96(%edi),%edi
 	movdqu	80(%esi),%xmm7
 	leal	96(%esi),%esi
-.L011ecb_enc_loop6_enter:
+.L015ecb_enc_loop6_enter:
 	call	_aesni_encrypt6
 	movl	%ebp,%edx
 	movl	%ebx,%ecx
 	subl	$96,%eax
-	jnc	.L012ecb_enc_loop6
+	jnc	.L016ecb_enc_loop6
 	movups	%xmm2,(%edi)
 	movups	%xmm3,16(%edi)
 	movups	%xmm4,32(%edi)
@@ -362,18 +422,18 @@
 	movups	%xmm7,80(%edi)
 	leal	96(%edi),%edi
 	addl	$96,%eax
-	jz	.L008ecb_ret
-.L010ecb_enc_tail:
+	jz	.L012ecb_ret
+.L014ecb_enc_tail:
 	movups	(%esi),%xmm2
 	cmpl	$32,%eax
-	jb	.L013ecb_enc_one
+	jb	.L017ecb_enc_one
 	movups	16(%esi),%xmm3
-	je	.L014ecb_enc_two
+	je	.L018ecb_enc_two
 	movups	32(%esi),%xmm4
 	cmpl	$64,%eax
-	jb	.L015ecb_enc_three
+	jb	.L019ecb_enc_three
 	movups	48(%esi),%xmm5
-	je	.L016ecb_enc_four
+	je	.L020ecb_enc_four
 	movups	64(%esi),%xmm6
 	xorps	%xmm7,%xmm7
 	call	_aesni_encrypt6
@@ -382,50 +442,49 @@
 	movups	%xmm4,32(%edi)
 	movups	%xmm5,48(%edi)
 	movups	%xmm6,64(%edi)
-	jmp	.L008ecb_ret
+	jmp	.L012ecb_ret
 .align	16
-.L013ecb_enc_one:
+.L017ecb_enc_one:
 	movups	(%edx),%xmm0
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
-.L017enc1_loop_3:
+.L021enc1_loop_3:
 .byte	102,15,56,220,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L017enc1_loop_3
+	jnz	.L021enc1_loop_3
 .byte	102,15,56,221,209
 	movups	%xmm2,(%edi)
-	jmp	.L008ecb_ret
+	jmp	.L012ecb_ret
 .align	16
-.L014ecb_enc_two:
-	xorps	%xmm4,%xmm4
-	call	_aesni_encrypt3
+.L018ecb_enc_two:
+	call	_aesni_encrypt2
 	movups	%xmm2,(%edi)
 	movups	%xmm3,16(%edi)
-	jmp	.L008ecb_ret
+	jmp	.L012ecb_ret
 .align	16
-.L015ecb_enc_three:
+.L019ecb_enc_three:
 	call	_aesni_encrypt3
 	movups	%xmm2,(%edi)
 	movups	%xmm3,16(%edi)
 	movups	%xmm4,32(%edi)
-	jmp	.L008ecb_ret
+	jmp	.L012ecb_ret
 .align	16
-.L016ecb_enc_four:
+.L020ecb_enc_four:
 	call	_aesni_encrypt4
 	movups	%xmm2,(%edi)
 	movups	%xmm3,16(%edi)
 	movups	%xmm4,32(%edi)
 	movups	%xmm5,48(%edi)
-	jmp	.L008ecb_ret
+	jmp	.L012ecb_ret
 .align	16
-.L009ecb_decrypt:
+.L013ecb_decrypt:
 	movl	%edx,%ebp
 	movl	%ecx,%ebx
 	cmpl	$96,%eax
-	jb	.L018ecb_dec_tail
+	jb	.L022ecb_dec_tail
 	movdqu	(%esi),%xmm2
 	movdqu	16(%esi),%xmm3
 	movdqu	32(%esi),%xmm4
@@ -434,9 +493,9 @@
 	movdqu	80(%esi),%xmm7
 	leal	96(%esi),%esi
 	subl	$96,%eax
-	jmp	.L019ecb_dec_loop6_enter
+	jmp	.L023ecb_dec_loop6_enter
 .align	16
-.L020ecb_dec_loop6:
+.L024ecb_dec_loop6:
 	movups	%xmm2,(%edi)
 	movdqu	(%esi),%xmm2
 	movups	%xmm3,16(%edi)
@@ -451,12 +510,12 @@
 	leal	96(%edi),%edi
 	movdqu	80(%esi),%xmm7
 	leal	96(%esi),%esi
-.L019ecb_dec_loop6_enter:
+.L023ecb_dec_loop6_enter:
 	call	_aesni_decrypt6
 	movl	%ebp,%edx
 	movl	%ebx,%ecx
 	subl	$96,%eax
-	jnc	.L020ecb_dec_loop6
+	jnc	.L024ecb_dec_loop6
 	movups	%xmm2,(%edi)
 	movups	%xmm3,16(%edi)
 	movups	%xmm4,32(%edi)
@@ -465,18 +524,18 @@
 	movups	%xmm7,80(%edi)
 	leal	96(%edi),%edi
 	addl	$96,%eax
-	jz	.L008ecb_ret
-.L018ecb_dec_tail:
+	jz	.L012ecb_ret
+.L022ecb_dec_tail:
 	movups	(%esi),%xmm2
 	cmpl	$32,%eax
-	jb	.L021ecb_dec_one
+	jb	.L025ecb_dec_one
 	movups	16(%esi),%xmm3
-	je	.L022ecb_dec_two
+	je	.L026ecb_dec_two
 	movups	32(%esi),%xmm4
 	cmpl	$64,%eax
-	jb	.L023ecb_dec_three
+	jb	.L027ecb_dec_three
 	movups	48(%esi),%xmm5
-	je	.L024ecb_dec_four
+	je	.L028ecb_dec_four
 	movups	64(%esi),%xmm6
 	xorps	%xmm7,%xmm7
 	call	_aesni_decrypt6
@@ -485,44 +544,51 @@
 	movups	%xmm4,32(%edi)
 	movups	%xmm5,48(%edi)
 	movups	%xmm6,64(%edi)
-	jmp	.L008ecb_ret
+	jmp	.L012ecb_ret
 .align	16
-.L021ecb_dec_one:
+.L025ecb_dec_one:
 	movups	(%edx),%xmm0
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
-.L025dec1_loop_4:
+.L029dec1_loop_4:
 .byte	102,15,56,222,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L025dec1_loop_4
+	jnz	.L029dec1_loop_4
 .byte	102,15,56,223,209
 	movups	%xmm2,(%edi)
-	jmp	.L008ecb_ret
+	jmp	.L012ecb_ret
 .align	16
-.L022ecb_dec_two:
-	xorps	%xmm4,%xmm4
-	call	_aesni_decrypt3
+.L026ecb_dec_two:
+	call	_aesni_decrypt2
 	movups	%xmm2,(%edi)
 	movups	%xmm3,16(%edi)
-	jmp	.L008ecb_ret
+	jmp	.L012ecb_ret
 .align	16
-.L023ecb_dec_three:
+.L027ecb_dec_three:
 	call	_aesni_decrypt3
 	movups	%xmm2,(%edi)
 	movups	%xmm3,16(%edi)
 	movups	%xmm4,32(%edi)
-	jmp	.L008ecb_ret
+	jmp	.L012ecb_ret
 .align	16
-.L024ecb_dec_four:
+.L028ecb_dec_four:
 	call	_aesni_decrypt4
 	movups	%xmm2,(%edi)
 	movups	%xmm3,16(%edi)
 	movups	%xmm4,32(%edi)
 	movups	%xmm5,48(%edi)
-.L008ecb_ret:
+.L012ecb_ret:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -561,13 +627,15 @@
 	movl	%ebp,20(%esp)
 	movl	%ebp,24(%esp)
 	movl	%ebp,28(%esp)
-	shrl	$1,%ecx
+	shll	$4,%ecx
+	movl	$16,%ebx
 	leal	(%edx),%ebp
 	movdqa	(%esp),%xmm5
 	movdqa	%xmm7,%xmm2
-	movl	%ecx,%ebx
+	leal	32(%edx,%ecx,1),%edx
+	subl	%ecx,%ebx
 .byte	102,15,56,0,253
-.L026ccm64_enc_outer:
+.L030ccm64_enc_outer:
 	movups	(%ebp),%xmm0
 	movl	%ebx,%ecx
 	movups	(%esi),%xmm6
@@ -574,35 +642,41 @@
 	xorps	%xmm0,%xmm2
 	movups	16(%ebp),%xmm1
 	xorps	%xmm6,%xmm0
-	leal	32(%ebp),%edx
 	xorps	%xmm0,%xmm3
-	movups	(%edx),%xmm0
-.L027ccm64_enc2_loop:
+	movups	32(%ebp),%xmm0
+.L031ccm64_enc2_loop:
 .byte	102,15,56,220,209
-	decl	%ecx
 .byte	102,15,56,220,217
-	movups	16(%edx),%xmm1
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
 .byte	102,15,56,220,208
-	leal	32(%edx),%edx
 .byte	102,15,56,220,216
-	movups	(%edx),%xmm0
-	jnz	.L027ccm64_enc2_loop
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L031ccm64_enc2_loop
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
 	paddq	16(%esp),%xmm7
+	decl	%eax
 .byte	102,15,56,221,208
 .byte	102,15,56,221,216
-	decl	%eax
 	leal	16(%esi),%esi
 	xorps	%xmm2,%xmm6
 	movdqa	%xmm7,%xmm2
 	movups	%xmm6,(%edi)
+.byte	102,15,56,0,213
 	leal	16(%edi),%edi
-.byte	102,15,56,0,213
-	jnz	.L026ccm64_enc_outer
+	jnz	.L030ccm64_enc_outer
 	movl	48(%esp),%esp
 	movl	40(%esp),%edi
 	movups	%xmm3,(%edi)
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -650,55 +724,58 @@
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
-.L028enc1_loop_5:
+.L032enc1_loop_5:
 .byte	102,15,56,220,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L028enc1_loop_5
+	jnz	.L032enc1_loop_5
 .byte	102,15,56,221,209
+	shll	$4,%ebx
+	movl	$16,%ecx
 	movups	(%esi),%xmm6
 	paddq	16(%esp),%xmm7
 	leal	16(%esi),%esi
-	jmp	.L029ccm64_dec_outer
+	subl	%ebx,%ecx
+	leal	32(%ebp,%ebx,1),%edx
+	movl	%ecx,%ebx
+	jmp	.L033ccm64_dec_outer
 .align	16
-.L029ccm64_dec_outer:
+.L033ccm64_dec_outer:
 	xorps	%xmm2,%xmm6
 	movdqa	%xmm7,%xmm2
-	movl	%ebx,%ecx
 	movups	%xmm6,(%edi)
 	leal	16(%edi),%edi
 .byte	102,15,56,0,213
 	subl	$1,%eax
-	jz	.L030ccm64_dec_break
+	jz	.L034ccm64_dec_break
 	movups	(%ebp),%xmm0
-	shrl	$1,%ecx
+	movl	%ebx,%ecx
 	movups	16(%ebp),%xmm1
 	xorps	%xmm0,%xmm6
-	leal	32(%ebp),%edx
 	xorps	%xmm0,%xmm2
 	xorps	%xmm6,%xmm3
-	movups	(%edx),%xmm0
-.L031ccm64_dec2_loop:
+	movups	32(%ebp),%xmm0
+.L035ccm64_dec2_loop:
 .byte	102,15,56,220,209
-	decl	%ecx
 .byte	102,15,56,220,217
-	movups	16(%edx),%xmm1
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
 .byte	102,15,56,220,208
-	leal	32(%edx),%edx
 .byte	102,15,56,220,216
-	movups	(%edx),%xmm0
-	jnz	.L031ccm64_dec2_loop
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L035ccm64_dec2_loop
 	movups	(%esi),%xmm6
 	paddq	16(%esp),%xmm7
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
-	leal	16(%esi),%esi
 .byte	102,15,56,221,208
 .byte	102,15,56,221,216
-	jmp	.L029ccm64_dec_outer
+	leal	16(%esi),%esi
+	jmp	.L033ccm64_dec_outer
 .align	16
-.L030ccm64_dec_break:
+.L034ccm64_dec_break:
+	movl	240(%ebp),%ecx
 	movl	%ebp,%edx
 	movups	(%edx),%xmm0
 	movups	16(%edx),%xmm1
@@ -705,16 +782,24 @@
 	xorps	%xmm0,%xmm6
 	leal	32(%edx),%edx
 	xorps	%xmm6,%xmm3
-.L032enc1_loop_6:
+.L036enc1_loop_6:
 .byte	102,15,56,220,217
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L032enc1_loop_6
+	jnz	.L036enc1_loop_6
 .byte	102,15,56,221,217
 	movl	48(%esp),%esp
 	movl	40(%esp),%edi
 	movups	%xmm3,(%edi)
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -740,7 +825,7 @@
 	andl	$-16,%esp
 	movl	%ebp,80(%esp)
 	cmpl	$1,%eax
-	je	.L033ctr32_one_shortcut
+	je	.L037ctr32_one_shortcut
 	movdqu	(%ebx),%xmm7
 	movl	$202182159,(%esp)
 	movl	$134810123,4(%esp)
@@ -756,63 +841,59 @@
 .byte	102,15,58,34,253,3
 	movl	240(%edx),%ecx
 	bswap	%ebx
+	pxor	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
-	pxor	%xmm0,%xmm0
 	movdqa	(%esp),%xmm2
-.byte	102,15,58,34,203,0
+.byte	102,15,58,34,195,0
 	leal	3(%ebx),%ebp
-.byte	102,15,58,34,197,0
+.byte	102,15,58,34,205,0
 	incl	%ebx
-.byte	102,15,58,34,203,1
+.byte	102,15,58,34,195,1
 	incl	%ebp
-.byte	102,15,58,34,197,1
+.byte	102,15,58,34,205,1
 	incl	%ebx
-.byte	102,15,58,34,203,2
+.byte	102,15,58,34,195,2
 	incl	%ebp
-.byte	102,15,58,34,197,2
-	movdqa	%xmm1,48(%esp)
+.byte	102,15,58,34,205,2
+	movdqa	%xmm0,48(%esp)
+.byte	102,15,56,0,194
+	movdqu	(%edx),%xmm6
+	movdqa	%xmm1,64(%esp)
 .byte	102,15,56,0,202
-	movdqa	%xmm0,64(%esp)
-.byte	102,15,56,0,194
-	pshufd	$192,%xmm1,%xmm2
-	pshufd	$128,%xmm1,%xmm3
+	pshufd	$192,%xmm0,%xmm2
+	pshufd	$128,%xmm0,%xmm3
 	cmpl	$6,%eax
-	jb	.L034ctr32_tail
+	jb	.L038ctr32_tail
+	pxor	%xmm6,%xmm7
+	shll	$4,%ecx
+	movl	$16,%ebx
 	movdqa	%xmm7,32(%esp)
-	shrl	$1,%ecx
 	movl	%edx,%ebp
-	movl	%ecx,%ebx
+	subl	%ecx,%ebx
+	leal	32(%edx,%ecx,1),%edx
 	subl	$6,%eax
-	jmp	.L035ctr32_loop6
+	jmp	.L039ctr32_loop6
 .align	16
-.L035ctr32_loop6:
-	pshufd	$64,%xmm1,%xmm4
-	movdqa	32(%esp),%xmm1
-	pshufd	$192,%xmm0,%xmm5
-	por	%xmm1,%xmm2
-	pshufd	$128,%xmm0,%xmm6
-	por	%xmm1,%xmm3
-	pshufd	$64,%xmm0,%xmm7
-	por	%xmm1,%xmm4
-	por	%xmm1,%xmm5
-	por	%xmm1,%xmm6
-	por	%xmm1,%xmm7
-	movups	(%ebp),%xmm0
-	movups	16(%ebp),%xmm1
-	leal	32(%ebp),%edx
-	decl	%ecx
+.L039ctr32_loop6:
+	pshufd	$64,%xmm0,%xmm4
+	movdqa	32(%esp),%xmm0
+	pshufd	$192,%xmm1,%xmm5
 	pxor	%xmm0,%xmm2
+	pshufd	$128,%xmm1,%xmm6
 	pxor	%xmm0,%xmm3
+	pshufd	$64,%xmm1,%xmm7
+	movups	16(%ebp),%xmm1
+	pxor	%xmm0,%xmm4
+	pxor	%xmm0,%xmm5
 .byte	102,15,56,220,209
-	pxor	%xmm0,%xmm4
+	pxor	%xmm0,%xmm6
+	pxor	%xmm0,%xmm7
 .byte	102,15,56,220,217
-	pxor	%xmm0,%xmm5
+	movups	32(%ebp),%xmm0
+	movl	%ebx,%ecx
 .byte	102,15,56,220,225
-	pxor	%xmm0,%xmm6
 .byte	102,15,56,220,233
-	pxor	%xmm0,%xmm7
 .byte	102,15,56,220,241
-	movups	(%edx),%xmm0
 .byte	102,15,56,220,249
 	call	.L_aesni_encrypt6_enter
 	movups	(%esi),%xmm1
@@ -823,11 +904,11 @@
 	movups	%xmm2,(%edi)
 	movdqa	16(%esp),%xmm0
 	xorps	%xmm1,%xmm4
-	movdqa	48(%esp),%xmm1
+	movdqa	64(%esp),%xmm1
 	movups	%xmm3,16(%edi)
 	movups	%xmm4,32(%edi)
 	paddd	%xmm0,%xmm1
-	paddd	64(%esp),%xmm0
+	paddd	48(%esp),%xmm0
 	movdqa	(%esp),%xmm2
 	movups	48(%esi),%xmm3
 	movups	64(%esi),%xmm4
@@ -834,40 +915,40 @@
 	xorps	%xmm3,%xmm5
 	movups	80(%esi),%xmm3
 	leal	96(%esi),%esi
-	movdqa	%xmm1,48(%esp)
-.byte	102,15,56,0,202
+	movdqa	%xmm0,48(%esp)
+.byte	102,15,56,0,194
 	xorps	%xmm4,%xmm6
 	movups	%xmm5,48(%edi)
 	xorps	%xmm3,%xmm7
-	movdqa	%xmm0,64(%esp)
-.byte	102,15,56,0,194
+	movdqa	%xmm1,64(%esp)
+.byte	102,15,56,0,202
 	movups	%xmm6,64(%edi)
-	pshufd	$192,%xmm1,%xmm2
+	pshufd	$192,%xmm0,%xmm2
 	movups	%xmm7,80(%edi)
 	leal	96(%edi),%edi
-	movl	%ebx,%ecx
-	pshufd	$128,%xmm1,%xmm3
+	pshufd	$128,%xmm0,%xmm3
 	subl	$6,%eax
-	jnc	.L035ctr32_loop6
+	jnc	.L039ctr32_loop6
 	addl	$6,%eax
-	jz	.L036ctr32_ret
+	jz	.L040ctr32_ret
+	movdqu	(%ebp),%xmm7
 	movl	%ebp,%edx
-	leal	1(,%ecx,2),%ecx
-	movdqa	32(%esp),%xmm7
-.L034ctr32_tail:
+	pxor	32(%esp),%xmm7
+	movl	240(%ebp),%ecx
+.L038ctr32_tail:
 	por	%xmm7,%xmm2
 	cmpl	$2,%eax
-	jb	.L037ctr32_one
-	pshufd	$64,%xmm1,%xmm4
+	jb	.L041ctr32_one
+	pshufd	$64,%xmm0,%xmm4
 	por	%xmm7,%xmm3
-	je	.L038ctr32_two
-	pshufd	$192,%xmm0,%xmm5
+	je	.L042ctr32_two
+	pshufd	$192,%xmm1,%xmm5
 	por	%xmm7,%xmm4
 	cmpl	$4,%eax
-	jb	.L039ctr32_three
-	pshufd	$128,%xmm0,%xmm6
+	jb	.L043ctr32_three
+	pshufd	$128,%xmm1,%xmm6
 	por	%xmm7,%xmm5
-	je	.L040ctr32_four
+	je	.L044ctr32_four
 	por	%xmm7,%xmm6
 	call	_aesni_encrypt6
 	movups	(%esi),%xmm1
@@ -885,30 +966,30 @@
 	movups	%xmm4,32(%edi)
 	movups	%xmm5,48(%edi)
 	movups	%xmm6,64(%edi)
-	jmp	.L036ctr32_ret
+	jmp	.L040ctr32_ret
 .align	16
-.L033ctr32_one_shortcut:
+.L037ctr32_one_shortcut:
 	movups	(%ebx),%xmm2
 	movl	240(%edx),%ecx
-.L037ctr32_one:
+.L041ctr32_one:
 	movups	(%edx),%xmm0
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
-.L041enc1_loop_7:
+.L045enc1_loop_7:
 .byte	102,15,56,220,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L041enc1_loop_7
+	jnz	.L045enc1_loop_7
 .byte	102,15,56,221,209
 	movups	(%esi),%xmm6
 	xorps	%xmm2,%xmm6
 	movups	%xmm6,(%edi)
-	jmp	.L036ctr32_ret
+	jmp	.L040ctr32_ret
 .align	16
-.L038ctr32_two:
-	call	_aesni_encrypt3
+.L042ctr32_two:
+	call	_aesni_encrypt2
 	movups	(%esi),%xmm5
 	movups	16(%esi),%xmm6
 	xorps	%xmm5,%xmm2
@@ -915,9 +996,9 @@
 	xorps	%xmm6,%xmm3
 	movups	%xmm2,(%edi)
 	movups	%xmm3,16(%edi)
-	jmp	.L036ctr32_ret
+	jmp	.L040ctr32_ret
 .align	16
-.L039ctr32_three:
+.L043ctr32_three:
 	call	_aesni_encrypt3
 	movups	(%esi),%xmm5
 	movups	16(%esi),%xmm6
@@ -928,9 +1009,9 @@
 	xorps	%xmm7,%xmm4
 	movups	%xmm3,16(%edi)
 	movups	%xmm4,32(%edi)
-	jmp	.L036ctr32_ret
+	jmp	.L040ctr32_ret
 .align	16
-.L040ctr32_four:
+.L044ctr32_four:
 	call	_aesni_encrypt4
 	movups	(%esi),%xmm6
 	movups	16(%esi),%xmm7
@@ -944,7 +1025,18 @@
 	xorps	%xmm0,%xmm5
 	movups	%xmm4,32(%edi)
 	movups	%xmm5,48(%edi)
-.L036ctr32_ret:
+.L040ctr32_ret:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	movdqa	%xmm0,32(%esp)
+	pxor	%xmm5,%xmm5
+	movdqa	%xmm0,48(%esp)
+	pxor	%xmm6,%xmm6
+	movdqa	%xmm0,64(%esp)
+	pxor	%xmm7,%xmm7
 	movl	80(%esp),%esp
 	popl	%edi
 	popl	%esi
@@ -969,12 +1061,12 @@
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
-.L042enc1_loop_8:
+.L046enc1_loop_8:
 .byte	102,15,56,220,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L042enc1_loop_8
+	jnz	.L046enc1_loop_8
 .byte	102,15,56,221,209
 	movl	20(%esp),%esi
 	movl	24(%esp),%edi
@@ -998,12 +1090,14 @@
 	movl	%edx,%ebp
 	movl	%ecx,%ebx
 	subl	$96,%eax
-	jc	.L043xts_enc_short
-	shrl	$1,%ecx
-	movl	%ecx,%ebx
-	jmp	.L044xts_enc_loop6
+	jc	.L047xts_enc_short
+	shll	$4,%ecx
+	movl	$16,%ebx
+	subl	%ecx,%ebx
+	leal	32(%edx,%ecx,1),%edx
+	jmp	.L048xts_enc_loop6
 .align	16
-.L044xts_enc_loop6:
+.L048xts_enc_loop6:
 	pshufd	$19,%xmm0,%xmm2
 	pxor	%xmm0,%xmm0
 	movdqa	%xmm1,(%esp)
@@ -1039,6 +1133,7 @@
 	pand	%xmm3,%xmm7
 	movups	(%esi),%xmm2
 	pxor	%xmm1,%xmm7
+	movl	%ebx,%ecx
 	movdqu	16(%esi),%xmm3
 	xorps	%xmm0,%xmm2
 	movdqu	32(%esi),%xmm4
@@ -1054,19 +1149,17 @@
 	movdqa	%xmm7,80(%esp)
 	pxor	%xmm1,%xmm7
 	movups	16(%ebp),%xmm1
-	leal	32(%ebp),%edx
 	pxor	16(%esp),%xmm3
+	pxor	32(%esp),%xmm4
 .byte	102,15,56,220,209
-	pxor	32(%esp),%xmm4
+	pxor	48(%esp),%xmm5
+	pxor	64(%esp),%xmm6
 .byte	102,15,56,220,217
-	pxor	48(%esp),%xmm5
-	decl	%ecx
+	pxor	%xmm0,%xmm7
+	movups	32(%ebp),%xmm0
 .byte	102,15,56,220,225
-	pxor	64(%esp),%xmm6
 .byte	102,15,56,220,233
-	pxor	%xmm0,%xmm7
 .byte	102,15,56,220,241
-	movups	(%edx),%xmm0
 .byte	102,15,56,220,249
 	call	.L_aesni_encrypt6_enter
 	movdqa	80(%esp),%xmm1
@@ -1091,19 +1184,18 @@
 	paddq	%xmm1,%xmm1
 	pand	%xmm3,%xmm2
 	pcmpgtd	%xmm1,%xmm0
-	movl	%ebx,%ecx
 	pxor	%xmm2,%xmm1
 	subl	$96,%eax
-	jnc	.L044xts_enc_loop6
-	leal	1(,%ecx,2),%ecx
+	jnc	.L048xts_enc_loop6
+	movl	240(%ebp),%ecx
 	movl	%ebp,%edx
 	movl	%ecx,%ebx
-.L043xts_enc_short:
+.L047xts_enc_short:
 	addl	$96,%eax
-	jz	.L045xts_enc_done6x
+	jz	.L049xts_enc_done6x
 	movdqa	%xmm1,%xmm5
 	cmpl	$32,%eax
-	jb	.L046xts_enc_one
+	jb	.L050xts_enc_one
 	pshufd	$19,%xmm0,%xmm2
 	pxor	%xmm0,%xmm0
 	paddq	%xmm1,%xmm1
@@ -1110,7 +1202,7 @@
 	pand	%xmm3,%xmm2
 	pcmpgtd	%xmm1,%xmm0
 	pxor	%xmm2,%xmm1
-	je	.L047xts_enc_two
+	je	.L051xts_enc_two
 	pshufd	$19,%xmm0,%xmm2
 	pxor	%xmm0,%xmm0
 	movdqa	%xmm1,%xmm6
@@ -1119,7 +1211,7 @@
 	pcmpgtd	%xmm1,%xmm0
 	pxor	%xmm2,%xmm1
 	cmpl	$64,%eax
-	jb	.L048xts_enc_three
+	jb	.L052xts_enc_three
 	pshufd	$19,%xmm0,%xmm2
 	pxor	%xmm0,%xmm0
 	movdqa	%xmm1,%xmm7
@@ -1129,7 +1221,7 @@
 	pxor	%xmm2,%xmm1
 	movdqa	%xmm5,(%esp)
 	movdqa	%xmm6,16(%esp)
-	je	.L049xts_enc_four
+	je	.L053xts_enc_four
 	movdqa	%xmm7,32(%esp)
 	pshufd	$19,%xmm0,%xmm7
 	movdqa	%xmm1,48(%esp)
@@ -1161,9 +1253,9 @@
 	movups	%xmm5,48(%edi)
 	movups	%xmm6,64(%edi)
 	leal	80(%edi),%edi
-	jmp	.L050xts_enc_done
+	jmp	.L054xts_enc_done
 .align	16
-.L046xts_enc_one:
+.L050xts_enc_one:
 	movups	(%esi),%xmm2
 	leal	16(%esi),%esi
 	xorps	%xmm5,%xmm2
@@ -1171,20 +1263,20 @@
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
-.L051enc1_loop_9:
+.L055enc1_loop_9:
 .byte	102,15,56,220,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L051enc1_loop_9
+	jnz	.L055enc1_loop_9
 .byte	102,15,56,221,209
 	xorps	%xmm5,%xmm2
 	movups	%xmm2,(%edi)
 	leal	16(%edi),%edi
 	movdqa	%xmm5,%xmm1
-	jmp	.L050xts_enc_done
+	jmp	.L054xts_enc_done
 .align	16
-.L047xts_enc_two:
+.L051xts_enc_two:
 	movaps	%xmm1,%xmm6
 	movups	(%esi),%xmm2
 	movups	16(%esi),%xmm3
@@ -1191,8 +1283,7 @@
 	leal	32(%esi),%esi
 	xorps	%xmm5,%xmm2
 	xorps	%xmm6,%xmm3
-	xorps	%xmm4,%xmm4
-	call	_aesni_encrypt3
+	call	_aesni_encrypt2
 	xorps	%xmm5,%xmm2
 	xorps	%xmm6,%xmm3
 	movups	%xmm2,(%edi)
@@ -1199,9 +1290,9 @@
 	movups	%xmm3,16(%edi)
 	leal	32(%edi),%edi
 	movdqa	%xmm6,%xmm1
-	jmp	.L050xts_enc_done
+	jmp	.L054xts_enc_done
 .align	16
-.L048xts_enc_three:
+.L052xts_enc_three:
 	movaps	%xmm1,%xmm7
 	movups	(%esi),%xmm2
 	movups	16(%esi),%xmm3
@@ -1219,9 +1310,9 @@
 	movups	%xmm4,32(%edi)
 	leal	48(%edi),%edi
 	movdqa	%xmm7,%xmm1
-	jmp	.L050xts_enc_done
+	jmp	.L054xts_enc_done
 .align	16
-.L049xts_enc_four:
+.L053xts_enc_four:
 	movaps	%xmm1,%xmm6
 	movups	(%esi),%xmm2
 	movups	16(%esi),%xmm3
@@ -1243,21 +1334,21 @@
 	movups	%xmm5,48(%edi)
 	leal	64(%edi),%edi
 	movdqa	%xmm6,%xmm1
-	jmp	.L050xts_enc_done
+	jmp	.L054xts_enc_done
 .align	16
-.L045xts_enc_done6x:
+.L049xts_enc_done6x:
 	movl	112(%esp),%eax
 	andl	$15,%eax
-	jz	.L052xts_enc_ret
+	jz	.L056xts_enc_ret
 	movdqa	%xmm1,%xmm5
 	movl	%eax,112(%esp)
-	jmp	.L053xts_enc_steal
+	jmp	.L057xts_enc_steal
 .align	16
-.L050xts_enc_done:
+.L054xts_enc_done:
 	movl	112(%esp),%eax
 	pxor	%xmm0,%xmm0
 	andl	$15,%eax
-	jz	.L052xts_enc_ret
+	jz	.L056xts_enc_ret
 	pcmpgtd	%xmm1,%xmm0
 	movl	%eax,112(%esp)
 	pshufd	$19,%xmm0,%xmm5
@@ -1264,7 +1355,7 @@
 	paddq	%xmm1,%xmm1
 	pand	96(%esp),%xmm5
 	pxor	%xmm1,%xmm5
-.L053xts_enc_steal:
+.L057xts_enc_steal:
 	movzbl	(%esi),%ecx
 	movzbl	-16(%edi),%edx
 	leal	1(%esi),%esi
@@ -1272,7 +1363,7 @@
 	movb	%dl,(%edi)
 	leal	1(%edi),%edi
 	subl	$1,%eax
-	jnz	.L053xts_enc_steal
+	jnz	.L057xts_enc_steal
 	subl	112(%esp),%edi
 	movl	%ebp,%edx
 	movl	%ebx,%ecx
@@ -1282,16 +1373,30 @@
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
-.L054enc1_loop_10:
+.L058enc1_loop_10:
 .byte	102,15,56,220,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L054enc1_loop_10
+	jnz	.L058enc1_loop_10
 .byte	102,15,56,221,209
 	xorps	%xmm5,%xmm2
 	movups	%xmm2,-16(%edi)
-.L052xts_enc_ret:
+.L056xts_enc_ret:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	movdqa	%xmm0,(%esp)
+	pxor	%xmm3,%xmm3
+	movdqa	%xmm0,16(%esp)
+	pxor	%xmm4,%xmm4
+	movdqa	%xmm0,32(%esp)
+	pxor	%xmm5,%xmm5
+	movdqa	%xmm0,48(%esp)
+	pxor	%xmm6,%xmm6
+	movdqa	%xmm0,64(%esp)
+	pxor	%xmm7,%xmm7
+	movdqa	%xmm0,80(%esp)
 	movl	116(%esp),%esp
 	popl	%edi
 	popl	%esi
@@ -1316,12 +1421,12 @@
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
-.L055enc1_loop_11:
+.L059enc1_loop_11:
 .byte	102,15,56,220,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L055enc1_loop_11
+	jnz	.L059enc1_loop_11
 .byte	102,15,56,221,209
 	movl	20(%esp),%esi
 	movl	24(%esp),%edi
@@ -1350,12 +1455,14 @@
 	pcmpgtd	%xmm1,%xmm0
 	andl	$-16,%eax
 	subl	$96,%eax
-	jc	.L056xts_dec_short
-	shrl	$1,%ecx
-	movl	%ecx,%ebx
-	jmp	.L057xts_dec_loop6
+	jc	.L060xts_dec_short
+	shll	$4,%ecx
+	movl	$16,%ebx
+	subl	%ecx,%ebx
+	leal	32(%edx,%ecx,1),%edx
+	jmp	.L061xts_dec_loop6
 .align	16
-.L057xts_dec_loop6:
+.L061xts_dec_loop6:
 	pshufd	$19,%xmm0,%xmm2
 	pxor	%xmm0,%xmm0
 	movdqa	%xmm1,(%esp)
@@ -1391,6 +1498,7 @@
 	pand	%xmm3,%xmm7
 	movups	(%esi),%xmm2
 	pxor	%xmm1,%xmm7
+	movl	%ebx,%ecx
 	movdqu	16(%esi),%xmm3
 	xorps	%xmm0,%xmm2
 	movdqu	32(%esi),%xmm4
@@ -1406,19 +1514,17 @@
 	movdqa	%xmm7,80(%esp)
 	pxor	%xmm1,%xmm7
 	movups	16(%ebp),%xmm1
-	leal	32(%ebp),%edx
 	pxor	16(%esp),%xmm3
+	pxor	32(%esp),%xmm4
 .byte	102,15,56,222,209
-	pxor	32(%esp),%xmm4
+	pxor	48(%esp),%xmm5
+	pxor	64(%esp),%xmm6
 .byte	102,15,56,222,217
-	pxor	48(%esp),%xmm5
-	decl	%ecx
+	pxor	%xmm0,%xmm7
+	movups	32(%ebp),%xmm0
 .byte	102,15,56,222,225
-	pxor	64(%esp),%xmm6
 .byte	102,15,56,222,233
-	pxor	%xmm0,%xmm7
 .byte	102,15,56,222,241
-	movups	(%edx),%xmm0
 .byte	102,15,56,222,249
 	call	.L_aesni_decrypt6_enter
 	movdqa	80(%esp),%xmm1
@@ -1443,19 +1549,18 @@
 	paddq	%xmm1,%xmm1
 	pand	%xmm3,%xmm2
 	pcmpgtd	%xmm1,%xmm0
-	movl	%ebx,%ecx
 	pxor	%xmm2,%xmm1
 	subl	$96,%eax
-	jnc	.L057xts_dec_loop6
-	leal	1(,%ecx,2),%ecx
+	jnc	.L061xts_dec_loop6
+	movl	240(%ebp),%ecx
 	movl	%ebp,%edx
 	movl	%ecx,%ebx
-.L056xts_dec_short:
+.L060xts_dec_short:
 	addl	$96,%eax
-	jz	.L058xts_dec_done6x
+	jz	.L062xts_dec_done6x
 	movdqa	%xmm1,%xmm5
 	cmpl	$32,%eax
-	jb	.L059xts_dec_one
+	jb	.L063xts_dec_one
 	pshufd	$19,%xmm0,%xmm2
 	pxor	%xmm0,%xmm0
 	paddq	%xmm1,%xmm1
@@ -1462,7 +1567,7 @@
 	pand	%xmm3,%xmm2
 	pcmpgtd	%xmm1,%xmm0
 	pxor	%xmm2,%xmm1
-	je	.L060xts_dec_two
+	je	.L064xts_dec_two
 	pshufd	$19,%xmm0,%xmm2
 	pxor	%xmm0,%xmm0
 	movdqa	%xmm1,%xmm6
@@ -1471,7 +1576,7 @@
 	pcmpgtd	%xmm1,%xmm0
 	pxor	%xmm2,%xmm1
 	cmpl	$64,%eax
-	jb	.L061xts_dec_three
+	jb	.L065xts_dec_three
 	pshufd	$19,%xmm0,%xmm2
 	pxor	%xmm0,%xmm0
 	movdqa	%xmm1,%xmm7
@@ -1481,7 +1586,7 @@
 	pxor	%xmm2,%xmm1
 	movdqa	%xmm5,(%esp)
 	movdqa	%xmm6,16(%esp)
-	je	.L062xts_dec_four
+	je	.L066xts_dec_four
 	movdqa	%xmm7,32(%esp)
 	pshufd	$19,%xmm0,%xmm7
 	movdqa	%xmm1,48(%esp)
@@ -1513,9 +1618,9 @@
 	movups	%xmm5,48(%edi)
 	movups	%xmm6,64(%edi)
 	leal	80(%edi),%edi
-	jmp	.L063xts_dec_done
+	jmp	.L067xts_dec_done
 .align	16
-.L059xts_dec_one:
+.L063xts_dec_one:
 	movups	(%esi),%xmm2
 	leal	16(%esi),%esi
 	xorps	%xmm5,%xmm2
@@ -1523,20 +1628,20 @@
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
-.L064dec1_loop_12:
+.L068dec1_loop_12:
 .byte	102,15,56,222,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L064dec1_loop_12
+	jnz	.L068dec1_loop_12
 .byte	102,15,56,223,209
 	xorps	%xmm5,%xmm2
 	movups	%xmm2,(%edi)
 	leal	16(%edi),%edi
 	movdqa	%xmm5,%xmm1
-	jmp	.L063xts_dec_done
+	jmp	.L067xts_dec_done
 .align	16
-.L060xts_dec_two:
+.L064xts_dec_two:
 	movaps	%xmm1,%xmm6
 	movups	(%esi),%xmm2
 	movups	16(%esi),%xmm3
@@ -1543,7 +1648,7 @@
 	leal	32(%esi),%esi
 	xorps	%xmm5,%xmm2
 	xorps	%xmm6,%xmm3
-	call	_aesni_decrypt3
+	call	_aesni_decrypt2
 	xorps	%xmm5,%xmm2
 	xorps	%xmm6,%xmm3
 	movups	%xmm2,(%edi)
@@ -1550,9 +1655,9 @@
 	movups	%xmm3,16(%edi)
 	leal	32(%edi),%edi
 	movdqa	%xmm6,%xmm1
-	jmp	.L063xts_dec_done
+	jmp	.L067xts_dec_done
 .align	16
-.L061xts_dec_three:
+.L065xts_dec_three:
 	movaps	%xmm1,%xmm7
 	movups	(%esi),%xmm2
 	movups	16(%esi),%xmm3
@@ -1570,9 +1675,9 @@
 	movups	%xmm4,32(%edi)
 	leal	48(%edi),%edi
 	movdqa	%xmm7,%xmm1
-	jmp	.L063xts_dec_done
+	jmp	.L067xts_dec_done
 .align	16
-.L062xts_dec_four:
+.L066xts_dec_four:
 	movaps	%xmm1,%xmm6
 	movups	(%esi),%xmm2
 	movups	16(%esi),%xmm3
@@ -1594,20 +1699,20 @@
 	movups	%xmm5,48(%edi)
 	leal	64(%edi),%edi
 	movdqa	%xmm6,%xmm1
-	jmp	.L063xts_dec_done
+	jmp	.L067xts_dec_done
 .align	16
-.L058xts_dec_done6x:
+.L062xts_dec_done6x:
 	movl	112(%esp),%eax
 	andl	$15,%eax
-	jz	.L065xts_dec_ret
+	jz	.L069xts_dec_ret
 	movl	%eax,112(%esp)
-	jmp	.L066xts_dec_only_one_more
+	jmp	.L070xts_dec_only_one_more
 .align	16
-.L063xts_dec_done:
+.L067xts_dec_done:
 	movl	112(%esp),%eax
 	pxor	%xmm0,%xmm0
 	andl	$15,%eax
-	jz	.L065xts_dec_ret
+	jz	.L069xts_dec_ret
 	pcmpgtd	%xmm1,%xmm0
 	movl	%eax,112(%esp)
 	pshufd	$19,%xmm0,%xmm2
@@ -1617,7 +1722,7 @@
 	pand	%xmm3,%xmm2
 	pcmpgtd	%xmm1,%xmm0
 	pxor	%xmm2,%xmm1
-.L066xts_dec_only_one_more:
+.L070xts_dec_only_one_more:
 	pshufd	$19,%xmm0,%xmm5
 	movdqa	%xmm1,%xmm6
 	paddq	%xmm1,%xmm1
@@ -1631,16 +1736,16 @@
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
-.L067dec1_loop_13:
+.L071dec1_loop_13:
 .byte	102,15,56,222,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L067dec1_loop_13
+	jnz	.L071dec1_loop_13
 .byte	102,15,56,223,209
 	xorps	%xmm5,%xmm2
 	movups	%xmm2,(%edi)
-.L068xts_dec_steal:
+.L072xts_dec_steal:
 	movzbl	16(%esi),%ecx
 	movzbl	(%edi),%edx
 	leal	1(%esi),%esi
@@ -1648,7 +1753,7 @@
 	movb	%dl,16(%edi)
 	leal	1(%edi),%edi
 	subl	$1,%eax
-	jnz	.L068xts_dec_steal
+	jnz	.L072xts_dec_steal
 	subl	112(%esp),%edi
 	movl	%ebp,%edx
 	movl	%ebx,%ecx
@@ -1658,16 +1763,30 @@
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
-.L069dec1_loop_14:
+.L073dec1_loop_14:
 .byte	102,15,56,222,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L069dec1_loop_14
+	jnz	.L073dec1_loop_14
 .byte	102,15,56,223,209
 	xorps	%xmm6,%xmm2
 	movups	%xmm2,(%edi)
-.L065xts_dec_ret:
+.L069xts_dec_ret:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	movdqa	%xmm0,(%esp)
+	pxor	%xmm3,%xmm3
+	movdqa	%xmm0,16(%esp)
+	pxor	%xmm4,%xmm4
+	movdqa	%xmm0,32(%esp)
+	pxor	%xmm5,%xmm5
+	movdqa	%xmm0,48(%esp)
+	pxor	%xmm6,%xmm6
+	movdqa	%xmm0,64(%esp)
+	pxor	%xmm7,%xmm7
+	movdqa	%xmm0,80(%esp)
 	movl	116(%esp),%esp
 	popl	%edi
 	popl	%esi
@@ -1693,7 +1812,7 @@
 	movl	32(%esp),%edx
 	movl	36(%esp),%ebp
 	testl	%eax,%eax
-	jz	.L070cbc_abort
+	jz	.L074cbc_abort
 	cmpl	$0,40(%esp)
 	xchgl	%esp,%ebx
 	movups	(%ebp),%xmm7
@@ -1701,14 +1820,14 @@
 	movl	%edx,%ebp
 	movl	%ebx,16(%esp)
 	movl	%ecx,%ebx
-	je	.L071cbc_decrypt
+	je	.L075cbc_decrypt
 	movaps	%xmm7,%xmm2
 	cmpl	$16,%eax
-	jb	.L072cbc_enc_tail
+	jb	.L076cbc_enc_tail
 	subl	$16,%eax
-	jmp	.L073cbc_enc_loop
+	jmp	.L077cbc_enc_loop
 .align	16
-.L073cbc_enc_loop:
+.L077cbc_enc_loop:
 	movups	(%esi),%xmm7
 	leal	16(%esi),%esi
 	movups	(%edx),%xmm0
@@ -1716,12 +1835,12 @@
 	xorps	%xmm0,%xmm7
 	leal	32(%edx),%edx
 	xorps	%xmm7,%xmm2
-.L074enc1_loop_15:
+.L078enc1_loop_15:
 .byte	102,15,56,220,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L074enc1_loop_15
+	jnz	.L078enc1_loop_15
 .byte	102,15,56,221,209
 	movl	%ebx,%ecx
 	movl	%ebp,%edx
@@ -1728,12 +1847,13 @@
 	movups	%xmm2,(%edi)
 	leal	16(%edi),%edi
 	subl	$16,%eax
-	jnc	.L073cbc_enc_loop
+	jnc	.L077cbc_enc_loop
 	addl	$16,%eax
-	jnz	.L072cbc_enc_tail
+	jnz	.L076cbc_enc_tail
 	movaps	%xmm2,%xmm7
-	jmp	.L075cbc_ret
-.L072cbc_enc_tail:
+	pxor	%xmm2,%xmm2
+	jmp	.L079cbc_ret
+.L076cbc_enc_tail:
 	movl	%eax,%ecx
 .long	2767451785
 	movl	$16,%ecx
@@ -1744,20 +1864,20 @@
 	movl	%ebx,%ecx
 	movl	%edi,%esi
 	movl	%ebp,%edx
-	jmp	.L073cbc_enc_loop
+	jmp	.L077cbc_enc_loop
 .align	16
-.L071cbc_decrypt:
+.L075cbc_decrypt:
 	cmpl	$80,%eax
-	jbe	.L076cbc_dec_tail
+	jbe	.L080cbc_dec_tail
 	movaps	%xmm7,(%esp)
 	subl	$80,%eax
-	jmp	.L077cbc_dec_loop6_enter
+	jmp	.L081cbc_dec_loop6_enter
 .align	16
-.L078cbc_dec_loop6:
+.L082cbc_dec_loop6:
 	movaps	%xmm0,(%esp)
 	movups	%xmm7,(%edi)
 	leal	16(%edi),%edi
-.L077cbc_dec_loop6_enter:
+.L081cbc_dec_loop6_enter:
 	movdqu	(%esi),%xmm2
 	movdqu	16(%esi),%xmm3
 	movdqu	32(%esi),%xmm4
@@ -1787,28 +1907,28 @@
 	movups	%xmm6,64(%edi)
 	leal	80(%edi),%edi
 	subl	$96,%eax
-	ja	.L078cbc_dec_loop6
+	ja	.L082cbc_dec_loop6
 	movaps	%xmm7,%xmm2
 	movaps	%xmm0,%xmm7
 	addl	$80,%eax
-	jle	.L079cbc_dec_tail_collected
+	jle	.L083cbc_dec_clear_tail_collected
 	movups	%xmm2,(%edi)
 	leal	16(%edi),%edi
-.L076cbc_dec_tail:
+.L080cbc_dec_tail:
 	movups	(%esi),%xmm2
 	movaps	%xmm2,%xmm6
 	cmpl	$16,%eax
-	jbe	.L080cbc_dec_one
+	jbe	.L084cbc_dec_one
 	movups	16(%esi),%xmm3
 	movaps	%xmm3,%xmm5
 	cmpl	$32,%eax
-	jbe	.L081cbc_dec_two
+	jbe	.L085cbc_dec_two
 	movups	32(%esi),%xmm4
 	cmpl	$48,%eax
-	jbe	.L082cbc_dec_three
+	jbe	.L086cbc_dec_three
 	movups	48(%esi),%xmm5
 	cmpl	$64,%eax
-	jbe	.L083cbc_dec_four
+	jbe	.L087cbc_dec_four
 	movups	64(%esi),%xmm6
 	movaps	%xmm7,(%esp)
 	movups	(%esi),%xmm2
@@ -1826,43 +1946,47 @@
 	xorps	%xmm0,%xmm6
 	movups	%xmm2,(%edi)
 	movups	%xmm3,16(%edi)
+	pxor	%xmm3,%xmm3
 	movups	%xmm4,32(%edi)
+	pxor	%xmm4,%xmm4
 	movups	%xmm5,48(%edi)
+	pxor	%xmm5,%xmm5
 	leal	64(%edi),%edi
 	movaps	%xmm6,%xmm2
+	pxor	%xmm6,%xmm6
 	subl	$80,%eax
-	jmp	.L079cbc_dec_tail_collected
+	jmp	.L088cbc_dec_tail_collected
 .align	16
-.L080cbc_dec_one:
+.L084cbc_dec_one:
 	movups	(%edx),%xmm0
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
-.L084dec1_loop_16:
+.L089dec1_loop_16:
 .byte	102,15,56,222,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L084dec1_loop_16
+	jnz	.L089dec1_loop_16
 .byte	102,15,56,223,209
 	xorps	%xmm7,%xmm2
 	movaps	%xmm6,%xmm7
 	subl	$16,%eax
-	jmp	.L079cbc_dec_tail_collected
+	jmp	.L088cbc_dec_tail_collected
 .align	16
-.L081cbc_dec_two:
-	xorps	%xmm4,%xmm4
-	call	_aesni_decrypt3
+.L085cbc_dec_two:
+	call	_aesni_decrypt2
 	xorps	%xmm7,%xmm2
 	xorps	%xmm6,%xmm3
 	movups	%xmm2,(%edi)
 	movaps	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
 	leal	16(%edi),%edi
 	movaps	%xmm5,%xmm7
 	subl	$32,%eax
-	jmp	.L079cbc_dec_tail_collected
+	jmp	.L088cbc_dec_tail_collected
 .align	16
-.L082cbc_dec_three:
+.L086cbc_dec_three:
 	call	_aesni_decrypt3
 	xorps	%xmm7,%xmm2
 	xorps	%xmm6,%xmm3
@@ -1869,13 +1993,15 @@
 	xorps	%xmm5,%xmm4
 	movups	%xmm2,(%edi)
 	movaps	%xmm4,%xmm2
+	pxor	%xmm4,%xmm4
 	movups	%xmm3,16(%edi)
+	pxor	%xmm3,%xmm3
 	leal	32(%edi),%edi
 	movups	32(%esi),%xmm7
 	subl	$48,%eax
-	jmp	.L079cbc_dec_tail_collected
+	jmp	.L088cbc_dec_tail_collected
 .align	16
-.L083cbc_dec_four:
+.L087cbc_dec_four:
 	call	_aesni_decrypt4
 	movups	16(%esi),%xmm1
 	movups	32(%esi),%xmm0
@@ -1885,28 +2011,44 @@
 	movups	%xmm2,(%edi)
 	xorps	%xmm1,%xmm4
 	movups	%xmm3,16(%edi)
+	pxor	%xmm3,%xmm3
 	xorps	%xmm0,%xmm5
 	movups	%xmm4,32(%edi)
+	pxor	%xmm4,%xmm4
 	leal	48(%edi),%edi
 	movaps	%xmm5,%xmm2
+	pxor	%xmm5,%xmm5
 	subl	$64,%eax
-.L079cbc_dec_tail_collected:
+	jmp	.L088cbc_dec_tail_collected
+.align	16
+.L083cbc_dec_clear_tail_collected:
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+.L088cbc_dec_tail_collected:
 	andl	$15,%eax
-	jnz	.L085cbc_dec_tail_partial
+	jnz	.L090cbc_dec_tail_partial
 	movups	%xmm2,(%edi)
-	jmp	.L075cbc_ret
+	pxor	%xmm0,%xmm0
+	jmp	.L079cbc_ret
 .align	16
-.L085cbc_dec_tail_partial:
+.L090cbc_dec_tail_partial:
 	movaps	%xmm2,(%esp)
+	pxor	%xmm0,%xmm0
 	movl	$16,%ecx
 	movl	%esp,%esi
 	subl	%eax,%ecx
 .long	2767451785
-.L075cbc_ret:
+	movdqa	%xmm2,(%esp)
+.L079cbc_ret:
 	movl	16(%esp),%esp
 	movl	36(%esp),%ebp
+	pxor	%xmm2,%xmm2
+	pxor	%xmm1,%xmm1
 	movups	%xmm7,(%ebp)
-.L070cbc_abort:
+	pxor	%xmm7,%xmm7
+.L074cbc_abort:
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -1916,52 +2058,62 @@
 .type	_aesni_set_encrypt_key, at function
 .align	16
 _aesni_set_encrypt_key:
+	pushl	%ebp
+	pushl	%ebx
 	testl	%eax,%eax
-	jz	.L086bad_pointer
+	jz	.L091bad_pointer
 	testl	%edx,%edx
-	jz	.L086bad_pointer
+	jz	.L091bad_pointer
+	call	.L092pic
+.L092pic:
+	popl	%ebx
+	leal	.Lkey_const-.L092pic(%ebx),%ebx
+	leal	OPENSSL_ia32cap_P-.Lkey_const(%ebx),%ebp
 	movups	(%eax),%xmm0
 	xorps	%xmm4,%xmm4
+	movl	4(%ebp),%ebp
 	leal	16(%edx),%edx
+	andl	$268437504,%ebp
 	cmpl	$256,%ecx
-	je	.L08714rounds
+	je	.L09314rounds
 	cmpl	$192,%ecx
-	je	.L08812rounds
+	je	.L09412rounds
 	cmpl	$128,%ecx
-	jne	.L089bad_keybits
+	jne	.L095bad_keybits
 .align	16
-.L09010rounds:
+.L09610rounds:
+	cmpl	$268435456,%ebp
+	je	.L09710rounds_alt
 	movl	$9,%ecx
 	movups	%xmm0,-16(%edx)
 .byte	102,15,58,223,200,1
-	call	.L091key_128_cold
+	call	.L098key_128_cold
 .byte	102,15,58,223,200,2
-	call	.L092key_128
+	call	.L099key_128
 .byte	102,15,58,223,200,4
-	call	.L092key_128
+	call	.L099key_128
 .byte	102,15,58,223,200,8
-	call	.L092key_128
+	call	.L099key_128
 .byte	102,15,58,223,200,16
-	call	.L092key_128
+	call	.L099key_128
 .byte	102,15,58,223,200,32
-	call	.L092key_128
+	call	.L099key_128
 .byte	102,15,58,223,200,64
-	call	.L092key_128
+	call	.L099key_128
 .byte	102,15,58,223,200,128
-	call	.L092key_128
+	call	.L099key_128
 .byte	102,15,58,223,200,27
-	call	.L092key_128
+	call	.L099key_128
 .byte	102,15,58,223,200,54
-	call	.L092key_128
+	call	.L099key_128
 	movups	%xmm0,(%edx)
 	movl	%ecx,80(%edx)
-	xorl	%eax,%eax
-	ret
+	jmp	.L100good_key
 .align	16
-.L092key_128:
+.L099key_128:
 	movups	%xmm0,(%edx)
 	leal	16(%edx),%edx
-.L091key_128_cold:
+.L098key_128_cold:
 	shufps	$16,%xmm0,%xmm4
 	xorps	%xmm4,%xmm0
 	shufps	$140,%xmm0,%xmm4
@@ -1970,38 +2122,91 @@
 	xorps	%xmm1,%xmm0
 	ret
 .align	16
-.L08812rounds:
+.L09710rounds_alt:
+	movdqa	(%ebx),%xmm5
+	movl	$8,%ecx
+	movdqa	32(%ebx),%xmm4
+	movdqa	%xmm0,%xmm2
+	movdqu	%xmm0,-16(%edx)
+.L101loop_key128:
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	pslld	$1,%xmm4
+	leal	16(%edx),%edx
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,-16(%edx)
+	movdqa	%xmm0,%xmm2
+	decl	%ecx
+	jnz	.L101loop_key128
+	movdqa	48(%ebx),%xmm4
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	pslld	$1,%xmm4
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,(%edx)
+	movdqa	%xmm0,%xmm2
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,16(%edx)
+	movl	$9,%ecx
+	movl	%ecx,96(%edx)
+	jmp	.L100good_key
+.align	16
+.L09412rounds:
 	movq	16(%eax),%xmm2
+	cmpl	$268435456,%ebp
+	je	.L10212rounds_alt
 	movl	$11,%ecx
 	movups	%xmm0,-16(%edx)
 .byte	102,15,58,223,202,1
-	call	.L093key_192a_cold
+	call	.L103key_192a_cold
 .byte	102,15,58,223,202,2
-	call	.L094key_192b
+	call	.L104key_192b
 .byte	102,15,58,223,202,4
-	call	.L095key_192a
+	call	.L105key_192a
 .byte	102,15,58,223,202,8
-	call	.L094key_192b
+	call	.L104key_192b
 .byte	102,15,58,223,202,16
-	call	.L095key_192a
+	call	.L105key_192a
 .byte	102,15,58,223,202,32
-	call	.L094key_192b
+	call	.L104key_192b
 .byte	102,15,58,223,202,64
-	call	.L095key_192a
+	call	.L105key_192a
 .byte	102,15,58,223,202,128
-	call	.L094key_192b
+	call	.L104key_192b
 	movups	%xmm0,(%edx)
 	movl	%ecx,48(%edx)
-	xorl	%eax,%eax
-	ret
+	jmp	.L100good_key
 .align	16
-.L095key_192a:
+.L105key_192a:
 	movups	%xmm0,(%edx)
 	leal	16(%edx),%edx
 .align	16
-.L093key_192a_cold:
+.L103key_192a_cold:
 	movaps	%xmm2,%xmm5
-.L096key_192b_warm:
+.L106key_192b_warm:
 	shufps	$16,%xmm0,%xmm4
 	movdqa	%xmm2,%xmm3
 	xorps	%xmm4,%xmm0
@@ -2015,7 +2220,7 @@
 	pxor	%xmm3,%xmm2
 	ret
 .align	16
-.L094key_192b:
+.L104key_192b:
 	movaps	%xmm0,%xmm3
 	shufps	$68,%xmm0,%xmm5
 	movups	%xmm5,(%edx)
@@ -2022,49 +2227,83 @@
 	shufps	$78,%xmm2,%xmm3
 	movups	%xmm3,16(%edx)
 	leal	32(%edx),%edx
-	jmp	.L096key_192b_warm
+	jmp	.L106key_192b_warm
 .align	16
-.L08714rounds:
+.L10212rounds_alt:
+	movdqa	16(%ebx),%xmm5
+	movdqa	32(%ebx),%xmm4
+	movl	$8,%ecx
+	movdqu	%xmm0,-16(%edx)
+.L107loop_key192:
+	movq	%xmm2,(%edx)
+	movdqa	%xmm2,%xmm1
+.byte	102,15,56,0,213
+.byte	102,15,56,221,212
+	pslld	$1,%xmm4
+	leal	24(%edx),%edx
+	movdqa	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm3,%xmm0
+	pshufd	$255,%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pxor	%xmm2,%xmm0
+	pxor	%xmm3,%xmm2
+	movdqu	%xmm0,-16(%edx)
+	decl	%ecx
+	jnz	.L107loop_key192
+	movl	$11,%ecx
+	movl	%ecx,32(%edx)
+	jmp	.L100good_key
+.align	16
+.L09314rounds:
 	movups	16(%eax),%xmm2
+	leal	16(%edx),%edx
+	cmpl	$268435456,%ebp
+	je	.L10814rounds_alt
 	movl	$13,%ecx
-	leal	16(%edx),%edx
 	movups	%xmm0,-32(%edx)
 	movups	%xmm2,-16(%edx)
 .byte	102,15,58,223,202,1
-	call	.L097key_256a_cold
+	call	.L109key_256a_cold
 .byte	102,15,58,223,200,1
-	call	.L098key_256b
+	call	.L110key_256b
 .byte	102,15,58,223,202,2
-	call	.L099key_256a
+	call	.L111key_256a
 .byte	102,15,58,223,200,2
-	call	.L098key_256b
+	call	.L110key_256b
 .byte	102,15,58,223,202,4
-	call	.L099key_256a
+	call	.L111key_256a
 .byte	102,15,58,223,200,4
-	call	.L098key_256b
+	call	.L110key_256b
 .byte	102,15,58,223,202,8
-	call	.L099key_256a
+	call	.L111key_256a
 .byte	102,15,58,223,200,8
-	call	.L098key_256b
+	call	.L110key_256b
 .byte	102,15,58,223,202,16
-	call	.L099key_256a
+	call	.L111key_256a
 .byte	102,15,58,223,200,16
-	call	.L098key_256b
+	call	.L110key_256b
 .byte	102,15,58,223,202,32
-	call	.L099key_256a
+	call	.L111key_256a
 .byte	102,15,58,223,200,32
-	call	.L098key_256b
+	call	.L110key_256b
 .byte	102,15,58,223,202,64
-	call	.L099key_256a
+	call	.L111key_256a
 	movups	%xmm0,(%edx)
 	movl	%ecx,16(%edx)
 	xorl	%eax,%eax
-	ret
+	jmp	.L100good_key
 .align	16
-.L099key_256a:
+.L111key_256a:
 	movups	%xmm2,(%edx)
 	leal	16(%edx),%edx
-.L097key_256a_cold:
+.L109key_256a_cold:
 	shufps	$16,%xmm0,%xmm4
 	xorps	%xmm4,%xmm0
 	shufps	$140,%xmm0,%xmm4
@@ -2073,7 +2312,7 @@
 	xorps	%xmm1,%xmm0
 	ret
 .align	16
-.L098key_256b:
+.L110key_256b:
 	movups	%xmm0,(%edx)
 	leal	16(%edx),%edx
 	shufps	$16,%xmm2,%xmm4
@@ -2083,13 +2322,70 @@
 	shufps	$170,%xmm1,%xmm1
 	xorps	%xmm1,%xmm2
 	ret
+.align	16
+.L10814rounds_alt:
+	movdqa	(%ebx),%xmm5
+	movdqa	32(%ebx),%xmm4
+	movl	$7,%ecx
+	movdqu	%xmm0,-32(%edx)
+	movdqa	%xmm2,%xmm1
+	movdqu	%xmm2,-16(%edx)
+.L112loop_key256:
+.byte	102,15,56,0,213
+.byte	102,15,56,221,212
+	movdqa	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm3,%xmm0
+	pslld	$1,%xmm4
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,(%edx)
+	decl	%ecx
+	jz	.L113done_key256
+	pshufd	$255,%xmm0,%xmm2
+	pxor	%xmm3,%xmm3
+.byte	102,15,56,221,211
+	movdqa	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm3,%xmm1
+	pxor	%xmm1,%xmm2
+	movdqu	%xmm2,16(%edx)
+	leal	32(%edx),%edx
+	movdqa	%xmm2,%xmm1
+	jmp	.L112loop_key256
+.L113done_key256:
+	movl	$13,%ecx
+	movl	%ecx,16(%edx)
+.L100good_key:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	xorl	%eax,%eax
+	popl	%ebx
+	popl	%ebp
+	ret
 .align	4
-.L086bad_pointer:
+.L091bad_pointer:
 	movl	$-1,%eax
+	popl	%ebx
+	popl	%ebp
 	ret
 .align	4
-.L089bad_keybits:
+.L095bad_keybits:
+	pxor	%xmm0,%xmm0
 	movl	$-2,%eax
+	popl	%ebx
+	popl	%ebp
 	ret
 .size	_aesni_set_encrypt_key,.-_aesni_set_encrypt_key
 .globl	aesni_set_encrypt_key
@@ -2115,7 +2411,7 @@
 	movl	12(%esp),%edx
 	shll	$4,%ecx
 	testl	%eax,%eax
-	jnz	.L100dec_key_ret
+	jnz	.L114dec_key_ret
 	leal	16(%edx,%ecx,1),%eax
 	movups	(%edx),%xmm0
 	movups	(%eax),%xmm1
@@ -2123,7 +2419,7 @@
 	movups	%xmm1,(%edx)
 	leal	16(%edx),%edx
 	leal	-16(%eax),%eax
-.L101dec_key_inverse:
+.L115dec_key_inverse:
 	movups	(%edx),%xmm0
 	movups	(%eax),%xmm1
 .byte	102,15,56,219,192
@@ -2133,18 +2429,27 @@
 	movups	%xmm0,16(%eax)
 	movups	%xmm1,-16(%edx)
 	cmpl	%edx,%eax
-	ja	.L101dec_key_inverse
+	ja	.L115dec_key_inverse
 	movups	(%edx),%xmm0
 .byte	102,15,56,219,192
 	movups	%xmm0,(%edx)
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
 	xorl	%eax,%eax
-.L100dec_key_ret:
+.L114dec_key_ret:
 	ret
 .size	aesni_set_decrypt_key,.-.L_aesni_set_decrypt_key_begin
+.align	64
+.Lkey_const:
+.long	202313229,202313229,202313229,202313229
+.long	67569157,67569157,67569157,67569157
+.long	1,1,1,1
+.long	27,27,27,27
 .byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
 .byte	83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
 .byte	32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
 .byte	115,108,46,111,114,103,62,0
+.comm	OPENSSL_ia32cap_P,16,4
 #else
 .file	"aesni-x86.S"
 .text
@@ -2169,7 +2474,10 @@
 	leal	16(%edx),%edx
 	jnz	.L000enc1_loop_1
 .byte	102,15,56,221,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
 	movups	%xmm2,(%eax)
+	pxor	%xmm2,%xmm2
 	ret
 .size	aesni_encrypt,.-.L_aesni_encrypt_begin
 .globl	aesni_decrypt
@@ -2193,32 +2501,90 @@
 	leal	16(%edx),%edx
 	jnz	.L001dec1_loop_2
 .byte	102,15,56,223,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
 	movups	%xmm2,(%eax)
+	pxor	%xmm2,%xmm2
 	ret
 .size	aesni_decrypt,.-.L_aesni_decrypt_begin
+.type	_aesni_encrypt2, at function
+.align	16
+_aesni_encrypt2:
+	movups	(%edx),%xmm0
+	shll	$4,%ecx
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+	addl	$16,%ecx
+.L002enc2_loop:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L002enc2_loop
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+	ret
+.size	_aesni_encrypt2,.-_aesni_encrypt2
+.type	_aesni_decrypt2, at function
+.align	16
+_aesni_decrypt2:
+	movups	(%edx),%xmm0
+	shll	$4,%ecx
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+	addl	$16,%ecx
+.L003dec2_loop:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L003dec2_loop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+	ret
+.size	_aesni_decrypt2,.-_aesni_decrypt2
 .type	_aesni_encrypt3, at function
 .align	16
 _aesni_encrypt3:
 	movups	(%edx),%xmm0
-	shrl	$1,%ecx
+	shll	$4,%ecx
 	movups	16(%edx),%xmm1
-	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
 	pxor	%xmm0,%xmm3
 	pxor	%xmm0,%xmm4
-	movups	(%edx),%xmm0
-.L002enc3_loop:
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+	addl	$16,%ecx
+.L004enc3_loop:
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
-	decl	%ecx
 .byte	102,15,56,220,225
-	movups	16(%edx),%xmm1
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
 .byte	102,15,56,220,208
 .byte	102,15,56,220,216
-	leal	32(%edx),%edx
 .byte	102,15,56,220,224
-	movups	(%edx),%xmm0
-	jnz	.L002enc3_loop
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L004enc3_loop
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
 .byte	102,15,56,220,225
@@ -2231,25 +2597,26 @@
 .align	16
 _aesni_decrypt3:
 	movups	(%edx),%xmm0
-	shrl	$1,%ecx
+	shll	$4,%ecx
 	movups	16(%edx),%xmm1
-	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
 	pxor	%xmm0,%xmm3
 	pxor	%xmm0,%xmm4
-	movups	(%edx),%xmm0
-.L003dec3_loop:
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+	addl	$16,%ecx
+.L005dec3_loop:
 .byte	102,15,56,222,209
 .byte	102,15,56,222,217
-	decl	%ecx
 .byte	102,15,56,222,225
-	movups	16(%edx),%xmm1
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
 .byte	102,15,56,222,208
 .byte	102,15,56,222,216
-	leal	32(%edx),%edx
 .byte	102,15,56,222,224
-	movups	(%edx),%xmm0
-	jnz	.L003dec3_loop
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L005dec3_loop
 .byte	102,15,56,222,209
 .byte	102,15,56,222,217
 .byte	102,15,56,222,225
@@ -2263,27 +2630,29 @@
 _aesni_encrypt4:
 	movups	(%edx),%xmm0
 	movups	16(%edx),%xmm1
-	shrl	$1,%ecx
-	leal	32(%edx),%edx
+	shll	$4,%ecx
 	xorps	%xmm0,%xmm2
 	pxor	%xmm0,%xmm3
 	pxor	%xmm0,%xmm4
 	pxor	%xmm0,%xmm5
-	movups	(%edx),%xmm0
-.L004enc4_loop:
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+.byte	15,31,64,0
+	addl	$16,%ecx
+.L006enc4_loop:
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
-	decl	%ecx
 .byte	102,15,56,220,225
 .byte	102,15,56,220,233
-	movups	16(%edx),%xmm1
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
 .byte	102,15,56,220,208
 .byte	102,15,56,220,216
-	leal	32(%edx),%edx
 .byte	102,15,56,220,224
 .byte	102,15,56,220,232
-	movups	(%edx),%xmm0
-	jnz	.L004enc4_loop
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L006enc4_loop
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
 .byte	102,15,56,220,225
@@ -2299,27 +2668,29 @@
 _aesni_decrypt4:
 	movups	(%edx),%xmm0
 	movups	16(%edx),%xmm1
-	shrl	$1,%ecx
-	leal	32(%edx),%edx
+	shll	$4,%ecx
 	xorps	%xmm0,%xmm2
 	pxor	%xmm0,%xmm3
 	pxor	%xmm0,%xmm4
 	pxor	%xmm0,%xmm5
-	movups	(%edx),%xmm0
-.L005dec4_loop:
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+.byte	15,31,64,0
+	addl	$16,%ecx
+.L007dec4_loop:
 .byte	102,15,56,222,209
 .byte	102,15,56,222,217
-	decl	%ecx
 .byte	102,15,56,222,225
 .byte	102,15,56,222,233
-	movups	16(%edx),%xmm1
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
 .byte	102,15,56,222,208
 .byte	102,15,56,222,216
-	leal	32(%edx),%edx
 .byte	102,15,56,222,224
 .byte	102,15,56,222,232
-	movups	(%edx),%xmm0
-	jnz	.L005dec4_loop
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L007dec4_loop
 .byte	102,15,56,222,209
 .byte	102,15,56,222,217
 .byte	102,15,56,222,225
@@ -2334,45 +2705,42 @@
 .align	16
 _aesni_encrypt6:
 	movups	(%edx),%xmm0
-	shrl	$1,%ecx
+	shll	$4,%ecx
 	movups	16(%edx),%xmm1
-	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
 	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
 .byte	102,15,56,220,209
-	pxor	%xmm0,%xmm4
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
 .byte	102,15,56,220,217
-	pxor	%xmm0,%xmm5
-	decl	%ecx
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
 .byte	102,15,56,220,225
-	pxor	%xmm0,%xmm6
-.byte	102,15,56,220,233
 	pxor	%xmm0,%xmm7
-.byte	102,15,56,220,241
-	movups	(%edx),%xmm0
-.byte	102,15,56,220,249
-	jmp	.L_aesni_encrypt6_enter
+	movups	(%edx,%ecx,1),%xmm0
+	addl	$16,%ecx
+	jmp	.L008_aesni_encrypt6_inner
 .align	16
-.L006enc6_loop:
+.L009enc6_loop:
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
-	decl	%ecx
 .byte	102,15,56,220,225
+.L008_aesni_encrypt6_inner:
 .byte	102,15,56,220,233
 .byte	102,15,56,220,241
 .byte	102,15,56,220,249
-.align	16
 .L_aesni_encrypt6_enter:
-	movups	16(%edx),%xmm1
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
 .byte	102,15,56,220,208
 .byte	102,15,56,220,216
-	leal	32(%edx),%edx
 .byte	102,15,56,220,224
 .byte	102,15,56,220,232
 .byte	102,15,56,220,240
 .byte	102,15,56,220,248
-	movups	(%edx),%xmm0
-	jnz	.L006enc6_loop
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L009enc6_loop
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
 .byte	102,15,56,220,225
@@ -2391,45 +2759,42 @@
 .align	16
 _aesni_decrypt6:
 	movups	(%edx),%xmm0
-	shrl	$1,%ecx
+	shll	$4,%ecx
 	movups	16(%edx),%xmm1
-	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
 	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
 .byte	102,15,56,222,209
-	pxor	%xmm0,%xmm4
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
 .byte	102,15,56,222,217
-	pxor	%xmm0,%xmm5
-	decl	%ecx
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
 .byte	102,15,56,222,225
-	pxor	%xmm0,%xmm6
-.byte	102,15,56,222,233
 	pxor	%xmm0,%xmm7
-.byte	102,15,56,222,241
-	movups	(%edx),%xmm0
-.byte	102,15,56,222,249
-	jmp	.L_aesni_decrypt6_enter
+	movups	(%edx,%ecx,1),%xmm0
+	addl	$16,%ecx
+	jmp	.L010_aesni_decrypt6_inner
 .align	16
-.L007dec6_loop:
+.L011dec6_loop:
 .byte	102,15,56,222,209
 .byte	102,15,56,222,217
-	decl	%ecx
 .byte	102,15,56,222,225
+.L010_aesni_decrypt6_inner:
 .byte	102,15,56,222,233
 .byte	102,15,56,222,241
 .byte	102,15,56,222,249
-.align	16
 .L_aesni_decrypt6_enter:
-	movups	16(%edx),%xmm1
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
 .byte	102,15,56,222,208
 .byte	102,15,56,222,216
-	leal	32(%edx),%edx
 .byte	102,15,56,222,224
 .byte	102,15,56,222,232
 .byte	102,15,56,222,240
 .byte	102,15,56,222,248
-	movups	(%edx),%xmm0
-	jnz	.L007dec6_loop
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L011dec6_loop
 .byte	102,15,56,222,209
 .byte	102,15,56,222,217
 .byte	102,15,56,222,225
@@ -2459,14 +2824,14 @@
 	movl	32(%esp),%edx
 	movl	36(%esp),%ebx
 	andl	$-16,%eax
-	jz	.L008ecb_ret
+	jz	.L012ecb_ret
 	movl	240(%edx),%ecx
 	testl	%ebx,%ebx
-	jz	.L009ecb_decrypt
+	jz	.L013ecb_decrypt
 	movl	%edx,%ebp
 	movl	%ecx,%ebx
 	cmpl	$96,%eax
-	jb	.L010ecb_enc_tail
+	jb	.L014ecb_enc_tail
 	movdqu	(%esi),%xmm2
 	movdqu	16(%esi),%xmm3
 	movdqu	32(%esi),%xmm4
@@ -2475,9 +2840,9 @@
 	movdqu	80(%esi),%xmm7
 	leal	96(%esi),%esi
 	subl	$96,%eax
-	jmp	.L011ecb_enc_loop6_enter
+	jmp	.L015ecb_enc_loop6_enter
 .align	16
-.L012ecb_enc_loop6:
+.L016ecb_enc_loop6:
 	movups	%xmm2,(%edi)
 	movdqu	(%esi),%xmm2
 	movups	%xmm3,16(%edi)
@@ -2492,12 +2857,12 @@
 	leal	96(%edi),%edi
 	movdqu	80(%esi),%xmm7
 	leal	96(%esi),%esi
-.L011ecb_enc_loop6_enter:
+.L015ecb_enc_loop6_enter:
 	call	_aesni_encrypt6
 	movl	%ebp,%edx
 	movl	%ebx,%ecx
 	subl	$96,%eax
-	jnc	.L012ecb_enc_loop6
+	jnc	.L016ecb_enc_loop6
 	movups	%xmm2,(%edi)
 	movups	%xmm3,16(%edi)
 	movups	%xmm4,32(%edi)
@@ -2506,18 +2871,18 @@
 	movups	%xmm7,80(%edi)
 	leal	96(%edi),%edi
 	addl	$96,%eax
-	jz	.L008ecb_ret
-.L010ecb_enc_tail:
+	jz	.L012ecb_ret
+.L014ecb_enc_tail:
 	movups	(%esi),%xmm2
 	cmpl	$32,%eax
-	jb	.L013ecb_enc_one
+	jb	.L017ecb_enc_one
 	movups	16(%esi),%xmm3
-	je	.L014ecb_enc_two
+	je	.L018ecb_enc_two
 	movups	32(%esi),%xmm4
 	cmpl	$64,%eax
-	jb	.L015ecb_enc_three
+	jb	.L019ecb_enc_three
 	movups	48(%esi),%xmm5
-	je	.L016ecb_enc_four
+	je	.L020ecb_enc_four
 	movups	64(%esi),%xmm6
 	xorps	%xmm7,%xmm7
 	call	_aesni_encrypt6
@@ -2526,50 +2891,49 @@
 	movups	%xmm4,32(%edi)
 	movups	%xmm5,48(%edi)
 	movups	%xmm6,64(%edi)
-	jmp	.L008ecb_ret
+	jmp	.L012ecb_ret
 .align	16
-.L013ecb_enc_one:
+.L017ecb_enc_one:
 	movups	(%edx),%xmm0
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
-.L017enc1_loop_3:
+.L021enc1_loop_3:
 .byte	102,15,56,220,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L017enc1_loop_3
+	jnz	.L021enc1_loop_3
 .byte	102,15,56,221,209
 	movups	%xmm2,(%edi)
-	jmp	.L008ecb_ret
+	jmp	.L012ecb_ret
 .align	16
-.L014ecb_enc_two:
-	xorps	%xmm4,%xmm4
-	call	_aesni_encrypt3
+.L018ecb_enc_two:
+	call	_aesni_encrypt2
 	movups	%xmm2,(%edi)
 	movups	%xmm3,16(%edi)
-	jmp	.L008ecb_ret
+	jmp	.L012ecb_ret
 .align	16
-.L015ecb_enc_three:
+.L019ecb_enc_three:
 	call	_aesni_encrypt3
 	movups	%xmm2,(%edi)
 	movups	%xmm3,16(%edi)
 	movups	%xmm4,32(%edi)
-	jmp	.L008ecb_ret
+	jmp	.L012ecb_ret
 .align	16
-.L016ecb_enc_four:
+.L020ecb_enc_four:
 	call	_aesni_encrypt4
 	movups	%xmm2,(%edi)
 	movups	%xmm3,16(%edi)
 	movups	%xmm4,32(%edi)
 	movups	%xmm5,48(%edi)
-	jmp	.L008ecb_ret
+	jmp	.L012ecb_ret
 .align	16
-.L009ecb_decrypt:
+.L013ecb_decrypt:
 	movl	%edx,%ebp
 	movl	%ecx,%ebx
 	cmpl	$96,%eax
-	jb	.L018ecb_dec_tail
+	jb	.L022ecb_dec_tail
 	movdqu	(%esi),%xmm2
 	movdqu	16(%esi),%xmm3
 	movdqu	32(%esi),%xmm4
@@ -2578,9 +2942,9 @@
 	movdqu	80(%esi),%xmm7
 	leal	96(%esi),%esi
 	subl	$96,%eax
-	jmp	.L019ecb_dec_loop6_enter
+	jmp	.L023ecb_dec_loop6_enter
 .align	16
-.L020ecb_dec_loop6:
+.L024ecb_dec_loop6:
 	movups	%xmm2,(%edi)
 	movdqu	(%esi),%xmm2
 	movups	%xmm3,16(%edi)
@@ -2595,12 +2959,12 @@
 	leal	96(%edi),%edi
 	movdqu	80(%esi),%xmm7
 	leal	96(%esi),%esi
-.L019ecb_dec_loop6_enter:
+.L023ecb_dec_loop6_enter:
 	call	_aesni_decrypt6
 	movl	%ebp,%edx
 	movl	%ebx,%ecx
 	subl	$96,%eax
-	jnc	.L020ecb_dec_loop6
+	jnc	.L024ecb_dec_loop6
 	movups	%xmm2,(%edi)
 	movups	%xmm3,16(%edi)
 	movups	%xmm4,32(%edi)
@@ -2609,18 +2973,18 @@
 	movups	%xmm7,80(%edi)
 	leal	96(%edi),%edi
 	addl	$96,%eax
-	jz	.L008ecb_ret
-.L018ecb_dec_tail:
+	jz	.L012ecb_ret
+.L022ecb_dec_tail:
 	movups	(%esi),%xmm2
 	cmpl	$32,%eax
-	jb	.L021ecb_dec_one
+	jb	.L025ecb_dec_one
 	movups	16(%esi),%xmm3
-	je	.L022ecb_dec_two
+	je	.L026ecb_dec_two
 	movups	32(%esi),%xmm4
 	cmpl	$64,%eax
-	jb	.L023ecb_dec_three
+	jb	.L027ecb_dec_three
 	movups	48(%esi),%xmm5
-	je	.L024ecb_dec_four
+	je	.L028ecb_dec_four
 	movups	64(%esi),%xmm6
 	xorps	%xmm7,%xmm7
 	call	_aesni_decrypt6
@@ -2629,44 +2993,51 @@
 	movups	%xmm4,32(%edi)
 	movups	%xmm5,48(%edi)
 	movups	%xmm6,64(%edi)
-	jmp	.L008ecb_ret
+	jmp	.L012ecb_ret
 .align	16
-.L021ecb_dec_one:
+.L025ecb_dec_one:
 	movups	(%edx),%xmm0
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
-.L025dec1_loop_4:
+.L029dec1_loop_4:
 .byte	102,15,56,222,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L025dec1_loop_4
+	jnz	.L029dec1_loop_4
 .byte	102,15,56,223,209
 	movups	%xmm2,(%edi)
-	jmp	.L008ecb_ret
+	jmp	.L012ecb_ret
 .align	16
-.L022ecb_dec_two:
-	xorps	%xmm4,%xmm4
-	call	_aesni_decrypt3
+.L026ecb_dec_two:
+	call	_aesni_decrypt2
 	movups	%xmm2,(%edi)
 	movups	%xmm3,16(%edi)
-	jmp	.L008ecb_ret
+	jmp	.L012ecb_ret
 .align	16
-.L023ecb_dec_three:
+.L027ecb_dec_three:
 	call	_aesni_decrypt3
 	movups	%xmm2,(%edi)
 	movups	%xmm3,16(%edi)
 	movups	%xmm4,32(%edi)
-	jmp	.L008ecb_ret
+	jmp	.L012ecb_ret
 .align	16
-.L024ecb_dec_four:
+.L028ecb_dec_four:
 	call	_aesni_decrypt4
 	movups	%xmm2,(%edi)
 	movups	%xmm3,16(%edi)
 	movups	%xmm4,32(%edi)
 	movups	%xmm5,48(%edi)
-.L008ecb_ret:
+.L012ecb_ret:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -2705,13 +3076,15 @@
 	movl	%ebp,20(%esp)
 	movl	%ebp,24(%esp)
 	movl	%ebp,28(%esp)
-	shrl	$1,%ecx
+	shll	$4,%ecx
+	movl	$16,%ebx
 	leal	(%edx),%ebp
 	movdqa	(%esp),%xmm5
 	movdqa	%xmm7,%xmm2
-	movl	%ecx,%ebx
+	leal	32(%edx,%ecx,1),%edx
+	subl	%ecx,%ebx
 .byte	102,15,56,0,253
-.L026ccm64_enc_outer:
+.L030ccm64_enc_outer:
 	movups	(%ebp),%xmm0
 	movl	%ebx,%ecx
 	movups	(%esi),%xmm6
@@ -2718,35 +3091,41 @@
 	xorps	%xmm0,%xmm2
 	movups	16(%ebp),%xmm1
 	xorps	%xmm6,%xmm0
-	leal	32(%ebp),%edx
 	xorps	%xmm0,%xmm3
-	movups	(%edx),%xmm0
-.L027ccm64_enc2_loop:
+	movups	32(%ebp),%xmm0
+.L031ccm64_enc2_loop:
 .byte	102,15,56,220,209
-	decl	%ecx
 .byte	102,15,56,220,217
-	movups	16(%edx),%xmm1
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
 .byte	102,15,56,220,208
-	leal	32(%edx),%edx
 .byte	102,15,56,220,216
-	movups	(%edx),%xmm0
-	jnz	.L027ccm64_enc2_loop
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L031ccm64_enc2_loop
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
 	paddq	16(%esp),%xmm7
+	decl	%eax
 .byte	102,15,56,221,208
 .byte	102,15,56,221,216
-	decl	%eax
 	leal	16(%esi),%esi
 	xorps	%xmm2,%xmm6
 	movdqa	%xmm7,%xmm2
 	movups	%xmm6,(%edi)
+.byte	102,15,56,0,213
 	leal	16(%edi),%edi
-.byte	102,15,56,0,213
-	jnz	.L026ccm64_enc_outer
+	jnz	.L030ccm64_enc_outer
 	movl	48(%esp),%esp
 	movl	40(%esp),%edi
 	movups	%xmm3,(%edi)
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -2794,55 +3173,58 @@
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
-.L028enc1_loop_5:
+.L032enc1_loop_5:
 .byte	102,15,56,220,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L028enc1_loop_5
+	jnz	.L032enc1_loop_5
 .byte	102,15,56,221,209
+	shll	$4,%ebx
+	movl	$16,%ecx
 	movups	(%esi),%xmm6
 	paddq	16(%esp),%xmm7
 	leal	16(%esi),%esi
-	jmp	.L029ccm64_dec_outer
+	subl	%ebx,%ecx
+	leal	32(%ebp,%ebx,1),%edx
+	movl	%ecx,%ebx
+	jmp	.L033ccm64_dec_outer
 .align	16
-.L029ccm64_dec_outer:
+.L033ccm64_dec_outer:
 	xorps	%xmm2,%xmm6
 	movdqa	%xmm7,%xmm2
-	movl	%ebx,%ecx
 	movups	%xmm6,(%edi)
 	leal	16(%edi),%edi
 .byte	102,15,56,0,213
 	subl	$1,%eax
-	jz	.L030ccm64_dec_break
+	jz	.L034ccm64_dec_break
 	movups	(%ebp),%xmm0
-	shrl	$1,%ecx
+	movl	%ebx,%ecx
 	movups	16(%ebp),%xmm1
 	xorps	%xmm0,%xmm6
-	leal	32(%ebp),%edx
 	xorps	%xmm0,%xmm2
 	xorps	%xmm6,%xmm3
-	movups	(%edx),%xmm0
-.L031ccm64_dec2_loop:
+	movups	32(%ebp),%xmm0
+.L035ccm64_dec2_loop:
 .byte	102,15,56,220,209
-	decl	%ecx
 .byte	102,15,56,220,217
-	movups	16(%edx),%xmm1
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
 .byte	102,15,56,220,208
-	leal	32(%edx),%edx
 .byte	102,15,56,220,216
-	movups	(%edx),%xmm0
-	jnz	.L031ccm64_dec2_loop
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L035ccm64_dec2_loop
 	movups	(%esi),%xmm6
 	paddq	16(%esp),%xmm7
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
-	leal	16(%esi),%esi
 .byte	102,15,56,221,208
 .byte	102,15,56,221,216
-	jmp	.L029ccm64_dec_outer
+	leal	16(%esi),%esi
+	jmp	.L033ccm64_dec_outer
 .align	16
-.L030ccm64_dec_break:
+.L034ccm64_dec_break:
+	movl	240(%ebp),%ecx
 	movl	%ebp,%edx
 	movups	(%edx),%xmm0
 	movups	16(%edx),%xmm1
@@ -2849,16 +3231,24 @@
 	xorps	%xmm0,%xmm6
 	leal	32(%edx),%edx
 	xorps	%xmm6,%xmm3
-.L032enc1_loop_6:
+.L036enc1_loop_6:
 .byte	102,15,56,220,217
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L032enc1_loop_6
+	jnz	.L036enc1_loop_6
 .byte	102,15,56,221,217
 	movl	48(%esp),%esp
 	movl	40(%esp),%edi
 	movups	%xmm3,(%edi)
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -2884,7 +3274,7 @@
 	andl	$-16,%esp
 	movl	%ebp,80(%esp)
 	cmpl	$1,%eax
-	je	.L033ctr32_one_shortcut
+	je	.L037ctr32_one_shortcut
 	movdqu	(%ebx),%xmm7
 	movl	$202182159,(%esp)
 	movl	$134810123,4(%esp)
@@ -2900,63 +3290,59 @@
 .byte	102,15,58,34,253,3
 	movl	240(%edx),%ecx
 	bswap	%ebx
+	pxor	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
-	pxor	%xmm0,%xmm0
 	movdqa	(%esp),%xmm2
-.byte	102,15,58,34,203,0
+.byte	102,15,58,34,195,0
 	leal	3(%ebx),%ebp
-.byte	102,15,58,34,197,0
+.byte	102,15,58,34,205,0
 	incl	%ebx
-.byte	102,15,58,34,203,1
+.byte	102,15,58,34,195,1
 	incl	%ebp
-.byte	102,15,58,34,197,1
+.byte	102,15,58,34,205,1
 	incl	%ebx
-.byte	102,15,58,34,203,2
+.byte	102,15,58,34,195,2
 	incl	%ebp
-.byte	102,15,58,34,197,2
-	movdqa	%xmm1,48(%esp)
+.byte	102,15,58,34,205,2
+	movdqa	%xmm0,48(%esp)
+.byte	102,15,56,0,194
+	movdqu	(%edx),%xmm6
+	movdqa	%xmm1,64(%esp)
 .byte	102,15,56,0,202
-	movdqa	%xmm0,64(%esp)
-.byte	102,15,56,0,194
-	pshufd	$192,%xmm1,%xmm2
-	pshufd	$128,%xmm1,%xmm3
+	pshufd	$192,%xmm0,%xmm2
+	pshufd	$128,%xmm0,%xmm3
 	cmpl	$6,%eax
-	jb	.L034ctr32_tail
+	jb	.L038ctr32_tail
+	pxor	%xmm6,%xmm7
+	shll	$4,%ecx
+	movl	$16,%ebx
 	movdqa	%xmm7,32(%esp)
-	shrl	$1,%ecx
 	movl	%edx,%ebp
-	movl	%ecx,%ebx
+	subl	%ecx,%ebx
+	leal	32(%edx,%ecx,1),%edx
 	subl	$6,%eax
-	jmp	.L035ctr32_loop6
+	jmp	.L039ctr32_loop6
 .align	16
-.L035ctr32_loop6:
-	pshufd	$64,%xmm1,%xmm4
-	movdqa	32(%esp),%xmm1
-	pshufd	$192,%xmm0,%xmm5
-	por	%xmm1,%xmm2
-	pshufd	$128,%xmm0,%xmm6
-	por	%xmm1,%xmm3
-	pshufd	$64,%xmm0,%xmm7
-	por	%xmm1,%xmm4
-	por	%xmm1,%xmm5
-	por	%xmm1,%xmm6
-	por	%xmm1,%xmm7
-	movups	(%ebp),%xmm0
-	movups	16(%ebp),%xmm1
-	leal	32(%ebp),%edx
-	decl	%ecx
+.L039ctr32_loop6:
+	pshufd	$64,%xmm0,%xmm4
+	movdqa	32(%esp),%xmm0
+	pshufd	$192,%xmm1,%xmm5
 	pxor	%xmm0,%xmm2
+	pshufd	$128,%xmm1,%xmm6
 	pxor	%xmm0,%xmm3
+	pshufd	$64,%xmm1,%xmm7
+	movups	16(%ebp),%xmm1
+	pxor	%xmm0,%xmm4
+	pxor	%xmm0,%xmm5
 .byte	102,15,56,220,209
-	pxor	%xmm0,%xmm4
+	pxor	%xmm0,%xmm6
+	pxor	%xmm0,%xmm7
 .byte	102,15,56,220,217
-	pxor	%xmm0,%xmm5
+	movups	32(%ebp),%xmm0
+	movl	%ebx,%ecx
 .byte	102,15,56,220,225
-	pxor	%xmm0,%xmm6
 .byte	102,15,56,220,233
-	pxor	%xmm0,%xmm7
 .byte	102,15,56,220,241
-	movups	(%edx),%xmm0
 .byte	102,15,56,220,249
 	call	.L_aesni_encrypt6_enter
 	movups	(%esi),%xmm1
@@ -2967,11 +3353,11 @@
 	movups	%xmm2,(%edi)
 	movdqa	16(%esp),%xmm0
 	xorps	%xmm1,%xmm4
-	movdqa	48(%esp),%xmm1
+	movdqa	64(%esp),%xmm1
 	movups	%xmm3,16(%edi)
 	movups	%xmm4,32(%edi)
 	paddd	%xmm0,%xmm1
-	paddd	64(%esp),%xmm0
+	paddd	48(%esp),%xmm0
 	movdqa	(%esp),%xmm2
 	movups	48(%esi),%xmm3
 	movups	64(%esi),%xmm4
@@ -2978,40 +3364,40 @@
 	xorps	%xmm3,%xmm5
 	movups	80(%esi),%xmm3
 	leal	96(%esi),%esi
-	movdqa	%xmm1,48(%esp)
-.byte	102,15,56,0,202
+	movdqa	%xmm0,48(%esp)
+.byte	102,15,56,0,194
 	xorps	%xmm4,%xmm6
 	movups	%xmm5,48(%edi)
 	xorps	%xmm3,%xmm7
-	movdqa	%xmm0,64(%esp)
-.byte	102,15,56,0,194
+	movdqa	%xmm1,64(%esp)
+.byte	102,15,56,0,202
 	movups	%xmm6,64(%edi)
-	pshufd	$192,%xmm1,%xmm2
+	pshufd	$192,%xmm0,%xmm2
 	movups	%xmm7,80(%edi)
 	leal	96(%edi),%edi
-	movl	%ebx,%ecx
-	pshufd	$128,%xmm1,%xmm3
+	pshufd	$128,%xmm0,%xmm3
 	subl	$6,%eax
-	jnc	.L035ctr32_loop6
+	jnc	.L039ctr32_loop6
 	addl	$6,%eax
-	jz	.L036ctr32_ret
+	jz	.L040ctr32_ret
+	movdqu	(%ebp),%xmm7
 	movl	%ebp,%edx
-	leal	1(,%ecx,2),%ecx
-	movdqa	32(%esp),%xmm7
-.L034ctr32_tail:
+	pxor	32(%esp),%xmm7
+	movl	240(%ebp),%ecx
+.L038ctr32_tail:
 	por	%xmm7,%xmm2
 	cmpl	$2,%eax
-	jb	.L037ctr32_one
-	pshufd	$64,%xmm1,%xmm4
+	jb	.L041ctr32_one
+	pshufd	$64,%xmm0,%xmm4
 	por	%xmm7,%xmm3
-	je	.L038ctr32_two
-	pshufd	$192,%xmm0,%xmm5
+	je	.L042ctr32_two
+	pshufd	$192,%xmm1,%xmm5
 	por	%xmm7,%xmm4
 	cmpl	$4,%eax
-	jb	.L039ctr32_three
-	pshufd	$128,%xmm0,%xmm6
+	jb	.L043ctr32_three
+	pshufd	$128,%xmm1,%xmm6
 	por	%xmm7,%xmm5
-	je	.L040ctr32_four
+	je	.L044ctr32_four
 	por	%xmm7,%xmm6
 	call	_aesni_encrypt6
 	movups	(%esi),%xmm1
@@ -3029,30 +3415,30 @@
 	movups	%xmm4,32(%edi)
 	movups	%xmm5,48(%edi)
 	movups	%xmm6,64(%edi)
-	jmp	.L036ctr32_ret
+	jmp	.L040ctr32_ret
 .align	16
-.L033ctr32_one_shortcut:
+.L037ctr32_one_shortcut:
 	movups	(%ebx),%xmm2
 	movl	240(%edx),%ecx
-.L037ctr32_one:
+.L041ctr32_one:
 	movups	(%edx),%xmm0
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
-.L041enc1_loop_7:
+.L045enc1_loop_7:
 .byte	102,15,56,220,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L041enc1_loop_7
+	jnz	.L045enc1_loop_7
 .byte	102,15,56,221,209
 	movups	(%esi),%xmm6
 	xorps	%xmm2,%xmm6
 	movups	%xmm6,(%edi)
-	jmp	.L036ctr32_ret
+	jmp	.L040ctr32_ret
 .align	16
-.L038ctr32_two:
-	call	_aesni_encrypt3
+.L042ctr32_two:
+	call	_aesni_encrypt2
 	movups	(%esi),%xmm5
 	movups	16(%esi),%xmm6
 	xorps	%xmm5,%xmm2
@@ -3059,9 +3445,9 @@
 	xorps	%xmm6,%xmm3
 	movups	%xmm2,(%edi)
 	movups	%xmm3,16(%edi)
-	jmp	.L036ctr32_ret
+	jmp	.L040ctr32_ret
 .align	16
-.L039ctr32_three:
+.L043ctr32_three:
 	call	_aesni_encrypt3
 	movups	(%esi),%xmm5
 	movups	16(%esi),%xmm6
@@ -3072,9 +3458,9 @@
 	xorps	%xmm7,%xmm4
 	movups	%xmm3,16(%edi)
 	movups	%xmm4,32(%edi)
-	jmp	.L036ctr32_ret
+	jmp	.L040ctr32_ret
 .align	16
-.L040ctr32_four:
+.L044ctr32_four:
 	call	_aesni_encrypt4
 	movups	(%esi),%xmm6
 	movups	16(%esi),%xmm7
@@ -3088,7 +3474,18 @@
 	xorps	%xmm0,%xmm5
 	movups	%xmm4,32(%edi)
 	movups	%xmm5,48(%edi)
-.L036ctr32_ret:
+.L040ctr32_ret:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	movdqa	%xmm0,32(%esp)
+	pxor	%xmm5,%xmm5
+	movdqa	%xmm0,48(%esp)
+	pxor	%xmm6,%xmm6
+	movdqa	%xmm0,64(%esp)
+	pxor	%xmm7,%xmm7
 	movl	80(%esp),%esp
 	popl	%edi
 	popl	%esi
@@ -3113,12 +3510,12 @@
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
-.L042enc1_loop_8:
+.L046enc1_loop_8:
 .byte	102,15,56,220,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L042enc1_loop_8
+	jnz	.L046enc1_loop_8
 .byte	102,15,56,221,209
 	movl	20(%esp),%esi
 	movl	24(%esp),%edi
@@ -3142,12 +3539,14 @@
 	movl	%edx,%ebp
 	movl	%ecx,%ebx
 	subl	$96,%eax
-	jc	.L043xts_enc_short
-	shrl	$1,%ecx
-	movl	%ecx,%ebx
-	jmp	.L044xts_enc_loop6
+	jc	.L047xts_enc_short
+	shll	$4,%ecx
+	movl	$16,%ebx
+	subl	%ecx,%ebx
+	leal	32(%edx,%ecx,1),%edx
+	jmp	.L048xts_enc_loop6
 .align	16
-.L044xts_enc_loop6:
+.L048xts_enc_loop6:
 	pshufd	$19,%xmm0,%xmm2
 	pxor	%xmm0,%xmm0
 	movdqa	%xmm1,(%esp)
@@ -3183,6 +3582,7 @@
 	pand	%xmm3,%xmm7
 	movups	(%esi),%xmm2
 	pxor	%xmm1,%xmm7
+	movl	%ebx,%ecx
 	movdqu	16(%esi),%xmm3
 	xorps	%xmm0,%xmm2
 	movdqu	32(%esi),%xmm4
@@ -3198,19 +3598,17 @@
 	movdqa	%xmm7,80(%esp)
 	pxor	%xmm1,%xmm7
 	movups	16(%ebp),%xmm1
-	leal	32(%ebp),%edx
 	pxor	16(%esp),%xmm3
+	pxor	32(%esp),%xmm4
 .byte	102,15,56,220,209
-	pxor	32(%esp),%xmm4
+	pxor	48(%esp),%xmm5
+	pxor	64(%esp),%xmm6
 .byte	102,15,56,220,217
-	pxor	48(%esp),%xmm5
-	decl	%ecx
+	pxor	%xmm0,%xmm7
+	movups	32(%ebp),%xmm0
 .byte	102,15,56,220,225
-	pxor	64(%esp),%xmm6
 .byte	102,15,56,220,233
-	pxor	%xmm0,%xmm7
 .byte	102,15,56,220,241
-	movups	(%edx),%xmm0
 .byte	102,15,56,220,249
 	call	.L_aesni_encrypt6_enter
 	movdqa	80(%esp),%xmm1
@@ -3235,19 +3633,18 @@
 	paddq	%xmm1,%xmm1
 	pand	%xmm3,%xmm2
 	pcmpgtd	%xmm1,%xmm0
-	movl	%ebx,%ecx
 	pxor	%xmm2,%xmm1
 	subl	$96,%eax
-	jnc	.L044xts_enc_loop6
-	leal	1(,%ecx,2),%ecx
+	jnc	.L048xts_enc_loop6
+	movl	240(%ebp),%ecx
 	movl	%ebp,%edx
 	movl	%ecx,%ebx
-.L043xts_enc_short:
+.L047xts_enc_short:
 	addl	$96,%eax
-	jz	.L045xts_enc_done6x
+	jz	.L049xts_enc_done6x
 	movdqa	%xmm1,%xmm5
 	cmpl	$32,%eax
-	jb	.L046xts_enc_one
+	jb	.L050xts_enc_one
 	pshufd	$19,%xmm0,%xmm2
 	pxor	%xmm0,%xmm0
 	paddq	%xmm1,%xmm1
@@ -3254,7 +3651,7 @@
 	pand	%xmm3,%xmm2
 	pcmpgtd	%xmm1,%xmm0
 	pxor	%xmm2,%xmm1
-	je	.L047xts_enc_two
+	je	.L051xts_enc_two
 	pshufd	$19,%xmm0,%xmm2
 	pxor	%xmm0,%xmm0
 	movdqa	%xmm1,%xmm6
@@ -3263,7 +3660,7 @@
 	pcmpgtd	%xmm1,%xmm0
 	pxor	%xmm2,%xmm1
 	cmpl	$64,%eax
-	jb	.L048xts_enc_three
+	jb	.L052xts_enc_three
 	pshufd	$19,%xmm0,%xmm2
 	pxor	%xmm0,%xmm0
 	movdqa	%xmm1,%xmm7
@@ -3273,7 +3670,7 @@
 	pxor	%xmm2,%xmm1
 	movdqa	%xmm5,(%esp)
 	movdqa	%xmm6,16(%esp)
-	je	.L049xts_enc_four
+	je	.L053xts_enc_four
 	movdqa	%xmm7,32(%esp)
 	pshufd	$19,%xmm0,%xmm7
 	movdqa	%xmm1,48(%esp)
@@ -3305,9 +3702,9 @@
 	movups	%xmm5,48(%edi)
 	movups	%xmm6,64(%edi)
 	leal	80(%edi),%edi
-	jmp	.L050xts_enc_done
+	jmp	.L054xts_enc_done
 .align	16
-.L046xts_enc_one:
+.L050xts_enc_one:
 	movups	(%esi),%xmm2
 	leal	16(%esi),%esi
 	xorps	%xmm5,%xmm2
@@ -3315,20 +3712,20 @@
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
-.L051enc1_loop_9:
+.L055enc1_loop_9:
 .byte	102,15,56,220,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L051enc1_loop_9
+	jnz	.L055enc1_loop_9
 .byte	102,15,56,221,209
 	xorps	%xmm5,%xmm2
 	movups	%xmm2,(%edi)
 	leal	16(%edi),%edi
 	movdqa	%xmm5,%xmm1
-	jmp	.L050xts_enc_done
+	jmp	.L054xts_enc_done
 .align	16
-.L047xts_enc_two:
+.L051xts_enc_two:
 	movaps	%xmm1,%xmm6
 	movups	(%esi),%xmm2
 	movups	16(%esi),%xmm3
@@ -3335,8 +3732,7 @@
 	leal	32(%esi),%esi
 	xorps	%xmm5,%xmm2
 	xorps	%xmm6,%xmm3
-	xorps	%xmm4,%xmm4
-	call	_aesni_encrypt3
+	call	_aesni_encrypt2
 	xorps	%xmm5,%xmm2
 	xorps	%xmm6,%xmm3
 	movups	%xmm2,(%edi)
@@ -3343,9 +3739,9 @@
 	movups	%xmm3,16(%edi)
 	leal	32(%edi),%edi
 	movdqa	%xmm6,%xmm1
-	jmp	.L050xts_enc_done
+	jmp	.L054xts_enc_done
 .align	16
-.L048xts_enc_three:
+.L052xts_enc_three:
 	movaps	%xmm1,%xmm7
 	movups	(%esi),%xmm2
 	movups	16(%esi),%xmm3
@@ -3363,9 +3759,9 @@
 	movups	%xmm4,32(%edi)
 	leal	48(%edi),%edi
 	movdqa	%xmm7,%xmm1
-	jmp	.L050xts_enc_done
+	jmp	.L054xts_enc_done
 .align	16
-.L049xts_enc_four:
+.L053xts_enc_four:
 	movaps	%xmm1,%xmm6
 	movups	(%esi),%xmm2
 	movups	16(%esi),%xmm3
@@ -3387,21 +3783,21 @@
 	movups	%xmm5,48(%edi)
 	leal	64(%edi),%edi
 	movdqa	%xmm6,%xmm1
-	jmp	.L050xts_enc_done
+	jmp	.L054xts_enc_done
 .align	16
-.L045xts_enc_done6x:
+.L049xts_enc_done6x:
 	movl	112(%esp),%eax
 	andl	$15,%eax
-	jz	.L052xts_enc_ret
+	jz	.L056xts_enc_ret
 	movdqa	%xmm1,%xmm5
 	movl	%eax,112(%esp)
-	jmp	.L053xts_enc_steal
+	jmp	.L057xts_enc_steal
 .align	16
-.L050xts_enc_done:
+.L054xts_enc_done:
 	movl	112(%esp),%eax
 	pxor	%xmm0,%xmm0
 	andl	$15,%eax
-	jz	.L052xts_enc_ret
+	jz	.L056xts_enc_ret
 	pcmpgtd	%xmm1,%xmm0
 	movl	%eax,112(%esp)
 	pshufd	$19,%xmm0,%xmm5
@@ -3408,7 +3804,7 @@
 	paddq	%xmm1,%xmm1
 	pand	96(%esp),%xmm5
 	pxor	%xmm1,%xmm5
-.L053xts_enc_steal:
+.L057xts_enc_steal:
 	movzbl	(%esi),%ecx
 	movzbl	-16(%edi),%edx
 	leal	1(%esi),%esi
@@ -3416,7 +3812,7 @@
 	movb	%dl,(%edi)
 	leal	1(%edi),%edi
 	subl	$1,%eax
-	jnz	.L053xts_enc_steal
+	jnz	.L057xts_enc_steal
 	subl	112(%esp),%edi
 	movl	%ebp,%edx
 	movl	%ebx,%ecx
@@ -3426,16 +3822,30 @@
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
-.L054enc1_loop_10:
+.L058enc1_loop_10:
 .byte	102,15,56,220,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L054enc1_loop_10
+	jnz	.L058enc1_loop_10
 .byte	102,15,56,221,209
 	xorps	%xmm5,%xmm2
 	movups	%xmm2,-16(%edi)
-.L052xts_enc_ret:
+.L056xts_enc_ret:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	movdqa	%xmm0,(%esp)
+	pxor	%xmm3,%xmm3
+	movdqa	%xmm0,16(%esp)
+	pxor	%xmm4,%xmm4
+	movdqa	%xmm0,32(%esp)
+	pxor	%xmm5,%xmm5
+	movdqa	%xmm0,48(%esp)
+	pxor	%xmm6,%xmm6
+	movdqa	%xmm0,64(%esp)
+	pxor	%xmm7,%xmm7
+	movdqa	%xmm0,80(%esp)
 	movl	116(%esp),%esp
 	popl	%edi
 	popl	%esi
@@ -3460,12 +3870,12 @@
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
-.L055enc1_loop_11:
+.L059enc1_loop_11:
 .byte	102,15,56,220,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L055enc1_loop_11
+	jnz	.L059enc1_loop_11
 .byte	102,15,56,221,209
 	movl	20(%esp),%esi
 	movl	24(%esp),%edi
@@ -3494,12 +3904,14 @@
 	pcmpgtd	%xmm1,%xmm0
 	andl	$-16,%eax
 	subl	$96,%eax
-	jc	.L056xts_dec_short
-	shrl	$1,%ecx
-	movl	%ecx,%ebx
-	jmp	.L057xts_dec_loop6
+	jc	.L060xts_dec_short
+	shll	$4,%ecx
+	movl	$16,%ebx
+	subl	%ecx,%ebx
+	leal	32(%edx,%ecx,1),%edx
+	jmp	.L061xts_dec_loop6
 .align	16
-.L057xts_dec_loop6:
+.L061xts_dec_loop6:
 	pshufd	$19,%xmm0,%xmm2
 	pxor	%xmm0,%xmm0
 	movdqa	%xmm1,(%esp)
@@ -3535,6 +3947,7 @@
 	pand	%xmm3,%xmm7
 	movups	(%esi),%xmm2
 	pxor	%xmm1,%xmm7
+	movl	%ebx,%ecx
 	movdqu	16(%esi),%xmm3
 	xorps	%xmm0,%xmm2
 	movdqu	32(%esi),%xmm4
@@ -3550,19 +3963,17 @@
 	movdqa	%xmm7,80(%esp)
 	pxor	%xmm1,%xmm7
 	movups	16(%ebp),%xmm1
-	leal	32(%ebp),%edx
 	pxor	16(%esp),%xmm3
+	pxor	32(%esp),%xmm4
 .byte	102,15,56,222,209
-	pxor	32(%esp),%xmm4
+	pxor	48(%esp),%xmm5
+	pxor	64(%esp),%xmm6
 .byte	102,15,56,222,217
-	pxor	48(%esp),%xmm5
-	decl	%ecx
+	pxor	%xmm0,%xmm7
+	movups	32(%ebp),%xmm0
 .byte	102,15,56,222,225
-	pxor	64(%esp),%xmm6
 .byte	102,15,56,222,233
-	pxor	%xmm0,%xmm7
 .byte	102,15,56,222,241
-	movups	(%edx),%xmm0
 .byte	102,15,56,222,249
 	call	.L_aesni_decrypt6_enter
 	movdqa	80(%esp),%xmm1
@@ -3587,19 +3998,18 @@
 	paddq	%xmm1,%xmm1
 	pand	%xmm3,%xmm2
 	pcmpgtd	%xmm1,%xmm0
-	movl	%ebx,%ecx
 	pxor	%xmm2,%xmm1
 	subl	$96,%eax
-	jnc	.L057xts_dec_loop6
-	leal	1(,%ecx,2),%ecx
+	jnc	.L061xts_dec_loop6
+	movl	240(%ebp),%ecx
 	movl	%ebp,%edx
 	movl	%ecx,%ebx
-.L056xts_dec_short:
+.L060xts_dec_short:
 	addl	$96,%eax
-	jz	.L058xts_dec_done6x
+	jz	.L062xts_dec_done6x
 	movdqa	%xmm1,%xmm5
 	cmpl	$32,%eax
-	jb	.L059xts_dec_one
+	jb	.L063xts_dec_one
 	pshufd	$19,%xmm0,%xmm2
 	pxor	%xmm0,%xmm0
 	paddq	%xmm1,%xmm1
@@ -3606,7 +4016,7 @@
 	pand	%xmm3,%xmm2
 	pcmpgtd	%xmm1,%xmm0
 	pxor	%xmm2,%xmm1
-	je	.L060xts_dec_two
+	je	.L064xts_dec_two
 	pshufd	$19,%xmm0,%xmm2
 	pxor	%xmm0,%xmm0
 	movdqa	%xmm1,%xmm6
@@ -3615,7 +4025,7 @@
 	pcmpgtd	%xmm1,%xmm0
 	pxor	%xmm2,%xmm1
 	cmpl	$64,%eax
-	jb	.L061xts_dec_three
+	jb	.L065xts_dec_three
 	pshufd	$19,%xmm0,%xmm2
 	pxor	%xmm0,%xmm0
 	movdqa	%xmm1,%xmm7
@@ -3625,7 +4035,7 @@
 	pxor	%xmm2,%xmm1
 	movdqa	%xmm5,(%esp)
 	movdqa	%xmm6,16(%esp)
-	je	.L062xts_dec_four
+	je	.L066xts_dec_four
 	movdqa	%xmm7,32(%esp)
 	pshufd	$19,%xmm0,%xmm7
 	movdqa	%xmm1,48(%esp)
@@ -3657,9 +4067,9 @@
 	movups	%xmm5,48(%edi)
 	movups	%xmm6,64(%edi)
 	leal	80(%edi),%edi
-	jmp	.L063xts_dec_done
+	jmp	.L067xts_dec_done
 .align	16
-.L059xts_dec_one:
+.L063xts_dec_one:
 	movups	(%esi),%xmm2
 	leal	16(%esi),%esi
 	xorps	%xmm5,%xmm2
@@ -3667,20 +4077,20 @@
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
-.L064dec1_loop_12:
+.L068dec1_loop_12:
 .byte	102,15,56,222,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L064dec1_loop_12
+	jnz	.L068dec1_loop_12
 .byte	102,15,56,223,209
 	xorps	%xmm5,%xmm2
 	movups	%xmm2,(%edi)
 	leal	16(%edi),%edi
 	movdqa	%xmm5,%xmm1
-	jmp	.L063xts_dec_done
+	jmp	.L067xts_dec_done
 .align	16
-.L060xts_dec_two:
+.L064xts_dec_two:
 	movaps	%xmm1,%xmm6
 	movups	(%esi),%xmm2
 	movups	16(%esi),%xmm3
@@ -3687,7 +4097,7 @@
 	leal	32(%esi),%esi
 	xorps	%xmm5,%xmm2
 	xorps	%xmm6,%xmm3
-	call	_aesni_decrypt3
+	call	_aesni_decrypt2
 	xorps	%xmm5,%xmm2
 	xorps	%xmm6,%xmm3
 	movups	%xmm2,(%edi)
@@ -3694,9 +4104,9 @@
 	movups	%xmm3,16(%edi)
 	leal	32(%edi),%edi
 	movdqa	%xmm6,%xmm1
-	jmp	.L063xts_dec_done
+	jmp	.L067xts_dec_done
 .align	16
-.L061xts_dec_three:
+.L065xts_dec_three:
 	movaps	%xmm1,%xmm7
 	movups	(%esi),%xmm2
 	movups	16(%esi),%xmm3
@@ -3714,9 +4124,9 @@
 	movups	%xmm4,32(%edi)
 	leal	48(%edi),%edi
 	movdqa	%xmm7,%xmm1
-	jmp	.L063xts_dec_done
+	jmp	.L067xts_dec_done
 .align	16
-.L062xts_dec_four:
+.L066xts_dec_four:
 	movaps	%xmm1,%xmm6
 	movups	(%esi),%xmm2
 	movups	16(%esi),%xmm3
@@ -3738,20 +4148,20 @@
 	movups	%xmm5,48(%edi)
 	leal	64(%edi),%edi
 	movdqa	%xmm6,%xmm1
-	jmp	.L063xts_dec_done
+	jmp	.L067xts_dec_done
 .align	16
-.L058xts_dec_done6x:
+.L062xts_dec_done6x:
 	movl	112(%esp),%eax
 	andl	$15,%eax
-	jz	.L065xts_dec_ret
+	jz	.L069xts_dec_ret
 	movl	%eax,112(%esp)
-	jmp	.L066xts_dec_only_one_more
+	jmp	.L070xts_dec_only_one_more
 .align	16
-.L063xts_dec_done:
+.L067xts_dec_done:
 	movl	112(%esp),%eax
 	pxor	%xmm0,%xmm0
 	andl	$15,%eax
-	jz	.L065xts_dec_ret
+	jz	.L069xts_dec_ret
 	pcmpgtd	%xmm1,%xmm0
 	movl	%eax,112(%esp)
 	pshufd	$19,%xmm0,%xmm2
@@ -3761,7 +4171,7 @@
 	pand	%xmm3,%xmm2
 	pcmpgtd	%xmm1,%xmm0
 	pxor	%xmm2,%xmm1
-.L066xts_dec_only_one_more:
+.L070xts_dec_only_one_more:
 	pshufd	$19,%xmm0,%xmm5
 	movdqa	%xmm1,%xmm6
 	paddq	%xmm1,%xmm1
@@ -3775,16 +4185,16 @@
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
-.L067dec1_loop_13:
+.L071dec1_loop_13:
 .byte	102,15,56,222,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L067dec1_loop_13
+	jnz	.L071dec1_loop_13
 .byte	102,15,56,223,209
 	xorps	%xmm5,%xmm2
 	movups	%xmm2,(%edi)
-.L068xts_dec_steal:
+.L072xts_dec_steal:
 	movzbl	16(%esi),%ecx
 	movzbl	(%edi),%edx
 	leal	1(%esi),%esi
@@ -3792,7 +4202,7 @@
 	movb	%dl,16(%edi)
 	leal	1(%edi),%edi
 	subl	$1,%eax
-	jnz	.L068xts_dec_steal
+	jnz	.L072xts_dec_steal
 	subl	112(%esp),%edi
 	movl	%ebp,%edx
 	movl	%ebx,%ecx
@@ -3802,16 +4212,30 @@
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
-.L069dec1_loop_14:
+.L073dec1_loop_14:
 .byte	102,15,56,222,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L069dec1_loop_14
+	jnz	.L073dec1_loop_14
 .byte	102,15,56,223,209
 	xorps	%xmm6,%xmm2
 	movups	%xmm2,(%edi)
-.L065xts_dec_ret:
+.L069xts_dec_ret:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	movdqa	%xmm0,(%esp)
+	pxor	%xmm3,%xmm3
+	movdqa	%xmm0,16(%esp)
+	pxor	%xmm4,%xmm4
+	movdqa	%xmm0,32(%esp)
+	pxor	%xmm5,%xmm5
+	movdqa	%xmm0,48(%esp)
+	pxor	%xmm6,%xmm6
+	movdqa	%xmm0,64(%esp)
+	pxor	%xmm7,%xmm7
+	movdqa	%xmm0,80(%esp)
 	movl	116(%esp),%esp
 	popl	%edi
 	popl	%esi
@@ -3837,7 +4261,7 @@
 	movl	32(%esp),%edx
 	movl	36(%esp),%ebp
 	testl	%eax,%eax
-	jz	.L070cbc_abort
+	jz	.L074cbc_abort
 	cmpl	$0,40(%esp)
 	xchgl	%esp,%ebx
 	movups	(%ebp),%xmm7
@@ -3845,14 +4269,14 @@
 	movl	%edx,%ebp
 	movl	%ebx,16(%esp)
 	movl	%ecx,%ebx
-	je	.L071cbc_decrypt
+	je	.L075cbc_decrypt
 	movaps	%xmm7,%xmm2
 	cmpl	$16,%eax
-	jb	.L072cbc_enc_tail
+	jb	.L076cbc_enc_tail
 	subl	$16,%eax
-	jmp	.L073cbc_enc_loop
+	jmp	.L077cbc_enc_loop
 .align	16
-.L073cbc_enc_loop:
+.L077cbc_enc_loop:
 	movups	(%esi),%xmm7
 	leal	16(%esi),%esi
 	movups	(%edx),%xmm0
@@ -3860,12 +4284,12 @@
 	xorps	%xmm0,%xmm7
 	leal	32(%edx),%edx
 	xorps	%xmm7,%xmm2
-.L074enc1_loop_15:
+.L078enc1_loop_15:
 .byte	102,15,56,220,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L074enc1_loop_15
+	jnz	.L078enc1_loop_15
 .byte	102,15,56,221,209
 	movl	%ebx,%ecx
 	movl	%ebp,%edx
@@ -3872,12 +4296,13 @@
 	movups	%xmm2,(%edi)
 	leal	16(%edi),%edi
 	subl	$16,%eax
-	jnc	.L073cbc_enc_loop
+	jnc	.L077cbc_enc_loop
 	addl	$16,%eax
-	jnz	.L072cbc_enc_tail
+	jnz	.L076cbc_enc_tail
 	movaps	%xmm2,%xmm7
-	jmp	.L075cbc_ret
-.L072cbc_enc_tail:
+	pxor	%xmm2,%xmm2
+	jmp	.L079cbc_ret
+.L076cbc_enc_tail:
 	movl	%eax,%ecx
 .long	2767451785
 	movl	$16,%ecx
@@ -3888,20 +4313,20 @@
 	movl	%ebx,%ecx
 	movl	%edi,%esi
 	movl	%ebp,%edx
-	jmp	.L073cbc_enc_loop
+	jmp	.L077cbc_enc_loop
 .align	16
-.L071cbc_decrypt:
+.L075cbc_decrypt:
 	cmpl	$80,%eax
-	jbe	.L076cbc_dec_tail
+	jbe	.L080cbc_dec_tail
 	movaps	%xmm7,(%esp)
 	subl	$80,%eax
-	jmp	.L077cbc_dec_loop6_enter
+	jmp	.L081cbc_dec_loop6_enter
 .align	16
-.L078cbc_dec_loop6:
+.L082cbc_dec_loop6:
 	movaps	%xmm0,(%esp)
 	movups	%xmm7,(%edi)
 	leal	16(%edi),%edi
-.L077cbc_dec_loop6_enter:
+.L081cbc_dec_loop6_enter:
 	movdqu	(%esi),%xmm2
 	movdqu	16(%esi),%xmm3
 	movdqu	32(%esi),%xmm4
@@ -3931,28 +4356,28 @@
 	movups	%xmm6,64(%edi)
 	leal	80(%edi),%edi
 	subl	$96,%eax
-	ja	.L078cbc_dec_loop6
+	ja	.L082cbc_dec_loop6
 	movaps	%xmm7,%xmm2
 	movaps	%xmm0,%xmm7
 	addl	$80,%eax
-	jle	.L079cbc_dec_tail_collected
+	jle	.L083cbc_dec_clear_tail_collected
 	movups	%xmm2,(%edi)
 	leal	16(%edi),%edi
-.L076cbc_dec_tail:
+.L080cbc_dec_tail:
 	movups	(%esi),%xmm2
 	movaps	%xmm2,%xmm6
 	cmpl	$16,%eax
-	jbe	.L080cbc_dec_one
+	jbe	.L084cbc_dec_one
 	movups	16(%esi),%xmm3
 	movaps	%xmm3,%xmm5
 	cmpl	$32,%eax
-	jbe	.L081cbc_dec_two
+	jbe	.L085cbc_dec_two
 	movups	32(%esi),%xmm4
 	cmpl	$48,%eax
-	jbe	.L082cbc_dec_three
+	jbe	.L086cbc_dec_three
 	movups	48(%esi),%xmm5
 	cmpl	$64,%eax
-	jbe	.L083cbc_dec_four
+	jbe	.L087cbc_dec_four
 	movups	64(%esi),%xmm6
 	movaps	%xmm7,(%esp)
 	movups	(%esi),%xmm2
@@ -3970,43 +4395,47 @@
 	xorps	%xmm0,%xmm6
 	movups	%xmm2,(%edi)
 	movups	%xmm3,16(%edi)
+	pxor	%xmm3,%xmm3
 	movups	%xmm4,32(%edi)
+	pxor	%xmm4,%xmm4
 	movups	%xmm5,48(%edi)
+	pxor	%xmm5,%xmm5
 	leal	64(%edi),%edi
 	movaps	%xmm6,%xmm2
+	pxor	%xmm6,%xmm6
 	subl	$80,%eax
-	jmp	.L079cbc_dec_tail_collected
+	jmp	.L088cbc_dec_tail_collected
 .align	16
-.L080cbc_dec_one:
+.L084cbc_dec_one:
 	movups	(%edx),%xmm0
 	movups	16(%edx),%xmm1
 	leal	32(%edx),%edx
 	xorps	%xmm0,%xmm2
-.L084dec1_loop_16:
+.L089dec1_loop_16:
 .byte	102,15,56,222,209
 	decl	%ecx
 	movups	(%edx),%xmm1
 	leal	16(%edx),%edx
-	jnz	.L084dec1_loop_16
+	jnz	.L089dec1_loop_16
 .byte	102,15,56,223,209
 	xorps	%xmm7,%xmm2
 	movaps	%xmm6,%xmm7
 	subl	$16,%eax
-	jmp	.L079cbc_dec_tail_collected
+	jmp	.L088cbc_dec_tail_collected
 .align	16
-.L081cbc_dec_two:
-	xorps	%xmm4,%xmm4
-	call	_aesni_decrypt3
+.L085cbc_dec_two:
+	call	_aesni_decrypt2
 	xorps	%xmm7,%xmm2
 	xorps	%xmm6,%xmm3
 	movups	%xmm2,(%edi)
 	movaps	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
 	leal	16(%edi),%edi
 	movaps	%xmm5,%xmm7
 	subl	$32,%eax
-	jmp	.L079cbc_dec_tail_collected
+	jmp	.L088cbc_dec_tail_collected
 .align	16
-.L082cbc_dec_three:
+.L086cbc_dec_three:
 	call	_aesni_decrypt3
 	xorps	%xmm7,%xmm2
 	xorps	%xmm6,%xmm3
@@ -4013,13 +4442,15 @@
 	xorps	%xmm5,%xmm4
 	movups	%xmm2,(%edi)
 	movaps	%xmm4,%xmm2
+	pxor	%xmm4,%xmm4
 	movups	%xmm3,16(%edi)
+	pxor	%xmm3,%xmm3
 	leal	32(%edi),%edi
 	movups	32(%esi),%xmm7
 	subl	$48,%eax
-	jmp	.L079cbc_dec_tail_collected
+	jmp	.L088cbc_dec_tail_collected
 .align	16
-.L083cbc_dec_four:
+.L087cbc_dec_four:
 	call	_aesni_decrypt4
 	movups	16(%esi),%xmm1
 	movups	32(%esi),%xmm0
@@ -4029,28 +4460,44 @@
 	movups	%xmm2,(%edi)
 	xorps	%xmm1,%xmm4
 	movups	%xmm3,16(%edi)
+	pxor	%xmm3,%xmm3
 	xorps	%xmm0,%xmm5
 	movups	%xmm4,32(%edi)
+	pxor	%xmm4,%xmm4
 	leal	48(%edi),%edi
 	movaps	%xmm5,%xmm2
+	pxor	%xmm5,%xmm5
 	subl	$64,%eax
-.L079cbc_dec_tail_collected:
+	jmp	.L088cbc_dec_tail_collected
+.align	16
+.L083cbc_dec_clear_tail_collected:
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+.L088cbc_dec_tail_collected:
 	andl	$15,%eax
-	jnz	.L085cbc_dec_tail_partial
+	jnz	.L090cbc_dec_tail_partial
 	movups	%xmm2,(%edi)
-	jmp	.L075cbc_ret
+	pxor	%xmm0,%xmm0
+	jmp	.L079cbc_ret
 .align	16
-.L085cbc_dec_tail_partial:
+.L090cbc_dec_tail_partial:
 	movaps	%xmm2,(%esp)
+	pxor	%xmm0,%xmm0
 	movl	$16,%ecx
 	movl	%esp,%esi
 	subl	%eax,%ecx
 .long	2767451785
-.L075cbc_ret:
+	movdqa	%xmm2,(%esp)
+.L079cbc_ret:
 	movl	16(%esp),%esp
 	movl	36(%esp),%ebp
+	pxor	%xmm2,%xmm2
+	pxor	%xmm1,%xmm1
 	movups	%xmm7,(%ebp)
-.L070cbc_abort:
+	pxor	%xmm7,%xmm7
+.L074cbc_abort:
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -4060,52 +4507,62 @@
 .type	_aesni_set_encrypt_key,@function
 .align	16
 _aesni_set_encrypt_key:
+	pushl	%ebp
+	pushl	%ebx
 	testl	%eax,%eax
-	jz	.L086bad_pointer
+	jz	.L091bad_pointer
 	testl	%edx,%edx
-	jz	.L086bad_pointer
+	jz	.L091bad_pointer
+	call	.L092pic
+.L092pic:
+	popl	%ebx
+	leal	.Lkey_const-.L092pic(%ebx),%ebx
+	leal	OPENSSL_ia32cap_P,%ebp
 	movups	(%eax),%xmm0
 	xorps	%xmm4,%xmm4
+	movl	4(%ebp),%ebp
 	leal	16(%edx),%edx
+	andl	$268437504,%ebp
 	cmpl	$256,%ecx
-	je	.L08714rounds
+	je	.L09314rounds
 	cmpl	$192,%ecx
-	je	.L08812rounds
+	je	.L09412rounds
 	cmpl	$128,%ecx
-	jne	.L089bad_keybits
+	jne	.L095bad_keybits
 .align	16
-.L09010rounds:
+.L09610rounds:
+	cmpl	$268435456,%ebp
+	je	.L09710rounds_alt
 	movl	$9,%ecx
 	movups	%xmm0,-16(%edx)
 .byte	102,15,58,223,200,1
-	call	.L091key_128_cold
+	call	.L098key_128_cold
 .byte	102,15,58,223,200,2
-	call	.L092key_128
+	call	.L099key_128
 .byte	102,15,58,223,200,4
-	call	.L092key_128
+	call	.L099key_128
 .byte	102,15,58,223,200,8
-	call	.L092key_128
+	call	.L099key_128
 .byte	102,15,58,223,200,16
-	call	.L092key_128
+	call	.L099key_128
 .byte	102,15,58,223,200,32
-	call	.L092key_128
+	call	.L099key_128
 .byte	102,15,58,223,200,64
-	call	.L092key_128
+	call	.L099key_128
 .byte	102,15,58,223,200,128
-	call	.L092key_128
+	call	.L099key_128
 .byte	102,15,58,223,200,27
-	call	.L092key_128
+	call	.L099key_128
 .byte	102,15,58,223,200,54
-	call	.L092key_128
+	call	.L099key_128
 	movups	%xmm0,(%edx)
 	movl	%ecx,80(%edx)
-	xorl	%eax,%eax
-	ret
+	jmp	.L100good_key
 .align	16
-.L092key_128:
+.L099key_128:
 	movups	%xmm0,(%edx)
 	leal	16(%edx),%edx
-.L091key_128_cold:
+.L098key_128_cold:
 	shufps	$16,%xmm0,%xmm4
 	xorps	%xmm4,%xmm0
 	shufps	$140,%xmm0,%xmm4
@@ -4114,38 +4571,91 @@
 	xorps	%xmm1,%xmm0
 	ret
 .align	16
-.L08812rounds:
+.L09710rounds_alt:
+	movdqa	(%ebx),%xmm5
+	movl	$8,%ecx
+	movdqa	32(%ebx),%xmm4
+	movdqa	%xmm0,%xmm2
+	movdqu	%xmm0,-16(%edx)
+.L101loop_key128:
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	pslld	$1,%xmm4
+	leal	16(%edx),%edx
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,-16(%edx)
+	movdqa	%xmm0,%xmm2
+	decl	%ecx
+	jnz	.L101loop_key128
+	movdqa	48(%ebx),%xmm4
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	pslld	$1,%xmm4
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,(%edx)
+	movdqa	%xmm0,%xmm2
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,16(%edx)
+	movl	$9,%ecx
+	movl	%ecx,96(%edx)
+	jmp	.L100good_key
+.align	16
+.L09412rounds:
 	movq	16(%eax),%xmm2
+	cmpl	$268435456,%ebp
+	je	.L10212rounds_alt
 	movl	$11,%ecx
 	movups	%xmm0,-16(%edx)
 .byte	102,15,58,223,202,1
-	call	.L093key_192a_cold
+	call	.L103key_192a_cold
 .byte	102,15,58,223,202,2
-	call	.L094key_192b
+	call	.L104key_192b
 .byte	102,15,58,223,202,4
-	call	.L095key_192a
+	call	.L105key_192a
 .byte	102,15,58,223,202,8
-	call	.L094key_192b
+	call	.L104key_192b
 .byte	102,15,58,223,202,16
-	call	.L095key_192a
+	call	.L105key_192a
 .byte	102,15,58,223,202,32
-	call	.L094key_192b
+	call	.L104key_192b
 .byte	102,15,58,223,202,64
-	call	.L095key_192a
+	call	.L105key_192a
 .byte	102,15,58,223,202,128
-	call	.L094key_192b
+	call	.L104key_192b
 	movups	%xmm0,(%edx)
 	movl	%ecx,48(%edx)
-	xorl	%eax,%eax
-	ret
+	jmp	.L100good_key
 .align	16
-.L095key_192a:
+.L105key_192a:
 	movups	%xmm0,(%edx)
 	leal	16(%edx),%edx
 .align	16
-.L093key_192a_cold:
+.L103key_192a_cold:
 	movaps	%xmm2,%xmm5
-.L096key_192b_warm:
+.L106key_192b_warm:
 	shufps	$16,%xmm0,%xmm4
 	movdqa	%xmm2,%xmm3
 	xorps	%xmm4,%xmm0
@@ -4159,7 +4669,7 @@
 	pxor	%xmm3,%xmm2
 	ret
 .align	16
-.L094key_192b:
+.L104key_192b:
 	movaps	%xmm0,%xmm3
 	shufps	$68,%xmm0,%xmm5
 	movups	%xmm5,(%edx)
@@ -4166,49 +4676,83 @@
 	shufps	$78,%xmm2,%xmm3
 	movups	%xmm3,16(%edx)
 	leal	32(%edx),%edx
-	jmp	.L096key_192b_warm
+	jmp	.L106key_192b_warm
 .align	16
-.L08714rounds:
+.L10212rounds_alt:
+	movdqa	16(%ebx),%xmm5
+	movdqa	32(%ebx),%xmm4
+	movl	$8,%ecx
+	movdqu	%xmm0,-16(%edx)
+.L107loop_key192:
+	movq	%xmm2,(%edx)
+	movdqa	%xmm2,%xmm1
+.byte	102,15,56,0,213
+.byte	102,15,56,221,212
+	pslld	$1,%xmm4
+	leal	24(%edx),%edx
+	movdqa	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm3,%xmm0
+	pshufd	$255,%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pxor	%xmm2,%xmm0
+	pxor	%xmm3,%xmm2
+	movdqu	%xmm0,-16(%edx)
+	decl	%ecx
+	jnz	.L107loop_key192
+	movl	$11,%ecx
+	movl	%ecx,32(%edx)
+	jmp	.L100good_key
+.align	16
+.L09314rounds:
 	movups	16(%eax),%xmm2
+	leal	16(%edx),%edx
+	cmpl	$268435456,%ebp
+	je	.L10814rounds_alt
 	movl	$13,%ecx
-	leal	16(%edx),%edx
 	movups	%xmm0,-32(%edx)
 	movups	%xmm2,-16(%edx)
 .byte	102,15,58,223,202,1
-	call	.L097key_256a_cold
+	call	.L109key_256a_cold
 .byte	102,15,58,223,200,1
-	call	.L098key_256b
+	call	.L110key_256b
 .byte	102,15,58,223,202,2
-	call	.L099key_256a
+	call	.L111key_256a
 .byte	102,15,58,223,200,2
-	call	.L098key_256b
+	call	.L110key_256b
 .byte	102,15,58,223,202,4
-	call	.L099key_256a
+	call	.L111key_256a
 .byte	102,15,58,223,200,4
-	call	.L098key_256b
+	call	.L110key_256b
 .byte	102,15,58,223,202,8
-	call	.L099key_256a
+	call	.L111key_256a
 .byte	102,15,58,223,200,8
-	call	.L098key_256b
+	call	.L110key_256b
 .byte	102,15,58,223,202,16
-	call	.L099key_256a
+	call	.L111key_256a
 .byte	102,15,58,223,200,16
-	call	.L098key_256b
+	call	.L110key_256b
 .byte	102,15,58,223,202,32
-	call	.L099key_256a
+	call	.L111key_256a
 .byte	102,15,58,223,200,32
-	call	.L098key_256b
+	call	.L110key_256b
 .byte	102,15,58,223,202,64
-	call	.L099key_256a
+	call	.L111key_256a
 	movups	%xmm0,(%edx)
 	movl	%ecx,16(%edx)
 	xorl	%eax,%eax
-	ret
+	jmp	.L100good_key
 .align	16
-.L099key_256a:
+.L111key_256a:
 	movups	%xmm2,(%edx)
 	leal	16(%edx),%edx
-.L097key_256a_cold:
+.L109key_256a_cold:
 	shufps	$16,%xmm0,%xmm4
 	xorps	%xmm4,%xmm0
 	shufps	$140,%xmm0,%xmm4
@@ -4217,7 +4761,7 @@
 	xorps	%xmm1,%xmm0
 	ret
 .align	16
-.L098key_256b:
+.L110key_256b:
 	movups	%xmm0,(%edx)
 	leal	16(%edx),%edx
 	shufps	$16,%xmm2,%xmm4
@@ -4227,13 +4771,70 @@
 	shufps	$170,%xmm1,%xmm1
 	xorps	%xmm1,%xmm2
 	ret
+.align	16
+.L10814rounds_alt:
+	movdqa	(%ebx),%xmm5
+	movdqa	32(%ebx),%xmm4
+	movl	$7,%ecx
+	movdqu	%xmm0,-32(%edx)
+	movdqa	%xmm2,%xmm1
+	movdqu	%xmm2,-16(%edx)
+.L112loop_key256:
+.byte	102,15,56,0,213
+.byte	102,15,56,221,212
+	movdqa	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm3,%xmm0
+	pslld	$1,%xmm4
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,(%edx)
+	decl	%ecx
+	jz	.L113done_key256
+	pshufd	$255,%xmm0,%xmm2
+	pxor	%xmm3,%xmm3
+.byte	102,15,56,221,211
+	movdqa	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm3,%xmm1
+	pxor	%xmm1,%xmm2
+	movdqu	%xmm2,16(%edx)
+	leal	32(%edx),%edx
+	movdqa	%xmm2,%xmm1
+	jmp	.L112loop_key256
+.L113done_key256:
+	movl	$13,%ecx
+	movl	%ecx,16(%edx)
+.L100good_key:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	xorl	%eax,%eax
+	popl	%ebx
+	popl	%ebp
+	ret
 .align	4
-.L086bad_pointer:
+.L091bad_pointer:
 	movl	$-1,%eax
+	popl	%ebx
+	popl	%ebp
 	ret
 .align	4
-.L089bad_keybits:
+.L095bad_keybits:
+	pxor	%xmm0,%xmm0
 	movl	$-2,%eax
+	popl	%ebx
+	popl	%ebp
 	ret
 .size	_aesni_set_encrypt_key,.-_aesni_set_encrypt_key
 .globl	aesni_set_encrypt_key
@@ -4259,7 +4860,7 @@
 	movl	12(%esp),%edx
 	shll	$4,%ecx
 	testl	%eax,%eax
-	jnz	.L100dec_key_ret
+	jnz	.L114dec_key_ret
 	leal	16(%edx,%ecx,1),%eax
 	movups	(%edx),%xmm0
 	movups	(%eax),%xmm1
@@ -4267,7 +4868,7 @@
 	movups	%xmm1,(%edx)
 	leal	16(%edx),%edx
 	leal	-16(%eax),%eax
-.L101dec_key_inverse:
+.L115dec_key_inverse:
 	movups	(%edx),%xmm0
 	movups	(%eax),%xmm1
 .byte	102,15,56,219,192
@@ -4277,16 +4878,25 @@
 	movups	%xmm0,16(%eax)
 	movups	%xmm1,-16(%edx)
 	cmpl	%edx,%eax
-	ja	.L101dec_key_inverse
+	ja	.L115dec_key_inverse
 	movups	(%edx),%xmm0
 .byte	102,15,56,219,192
 	movups	%xmm0,(%edx)
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
 	xorl	%eax,%eax
-.L100dec_key_ret:
+.L114dec_key_ret:
 	ret
 .size	aesni_set_decrypt_key,.-.L_aesni_set_decrypt_key_begin
+.align	64
+.Lkey_const:
+.long	202313229,202313229,202313229,202313229
+.long	67569157,67569157,67569157,67569157
+.long	1,1,1,1
+.long	27,27,27,27
 .byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
 .byte	83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
 .byte	32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
 .byte	115,108,46,111,114,103,62,0
+.comm	OPENSSL_ia32cap_P,16,4
 #endif

Modified: trunk/secure/lib/libcrypto/i386/bf-586.S
===================================================================
--- trunk/secure/lib/libcrypto/i386/bf-586.S	2019-01-20 05:38:15 UTC (rev 12153)
+++ trunk/secure/lib/libcrypto/i386/bf-586.S	2019-01-20 05:38:27 UTC (rev 12154)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/i386/bf-586.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from bf-586.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/i386/bf-586.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from bf-586.pl. */
 #ifdef PIC
 .file	"bf-586.S"
 .text

Modified: trunk/secure/lib/libcrypto/i386/bf-686.S
===================================================================
--- trunk/secure/lib/libcrypto/i386/bf-686.S	2019-01-20 05:38:15 UTC (rev 12153)
+++ trunk/secure/lib/libcrypto/i386/bf-686.S	2019-01-20 05:38:27 UTC (rev 12154)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/i386/bf-686.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from bf-686.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/i386/bf-686.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from bf-686.pl. */
 #ifdef PIC
 .file	"bf-686.S"
 .text

Modified: trunk/secure/lib/libcrypto/i386/bn-586.S
===================================================================
--- trunk/secure/lib/libcrypto/i386/bn-586.S	2019-01-20 05:38:15 UTC (rev 12153)
+++ trunk/secure/lib/libcrypto/i386/bn-586.S	2019-01-20 05:38:27 UTC (rev 12154)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/i386/bn-586.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from bn-586.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/i386/bn-586.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from bn-586.pl. */
 #ifdef PIC
 .file	"bn-586.S"
 .text
@@ -12,8 +12,7 @@
 	call	.L000PIC_me_up
 .L000PIC_me_up:
 	popl	%eax
-	leal	_GLOBAL_OFFSET_TABLE_+[.-.L000PIC_me_up](%eax),%eax
-	movl	OPENSSL_ia32cap_P@GOT(%eax),%eax
+	leal	OPENSSL_ia32cap_P-.L000PIC_me_up(%eax),%eax
 	btl	$26,(%eax)
 	jnc	.L001maw_non_sse2
 	movl	4(%esp),%eax
@@ -296,8 +295,7 @@
 	call	.L010PIC_me_up
 .L010PIC_me_up:
 	popl	%eax
-	leal	_GLOBAL_OFFSET_TABLE_+[.-.L010PIC_me_up](%eax),%eax
-	movl	OPENSSL_ia32cap_P@GOT(%eax),%eax
+	leal	OPENSSL_ia32cap_P-.L010PIC_me_up(%eax),%eax
 	btl	$26,(%eax)
 	jnc	.L011mw_non_sse2
 	movl	4(%esp),%eax
@@ -479,8 +477,7 @@
 	call	.L017PIC_me_up
 .L017PIC_me_up:
 	popl	%eax
-	leal	_GLOBAL_OFFSET_TABLE_+[.-.L017PIC_me_up](%eax),%eax
-	movl	OPENSSL_ia32cap_P@GOT(%eax),%eax
+	leal	OPENSSL_ia32cap_P-.L017PIC_me_up(%eax),%eax
 	btl	$26,(%eax)
 	jnc	.L018sqr_non_sse2
 	movl	4(%esp),%eax
@@ -1534,7 +1531,7 @@
 	popl	%ebp
 	ret
 .size	bn_sub_part_words,.-.L_bn_sub_part_words_begin
-.comm	OPENSSL_ia32cap_P,8,4
+.comm	OPENSSL_ia32cap_P,16,4
 #else
 .file	"bn-586.S"
 .text
@@ -3056,5 +3053,5 @@
 	popl	%ebp
 	ret
 .size	bn_sub_part_words,.-.L_bn_sub_part_words_begin
-.comm	OPENSSL_ia32cap_P,8,4
+.comm	OPENSSL_ia32cap_P,16,4
 #endif

Modified: trunk/secure/lib/libcrypto/i386/cmll-x86.S
===================================================================
--- trunk/secure/lib/libcrypto/i386/cmll-x86.S	2019-01-20 05:38:15 UTC (rev 12153)
+++ trunk/secure/lib/libcrypto/i386/cmll-x86.S	2019-01-20 05:38:27 UTC (rev 12154)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/i386/cmll-x86.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from cmll-x86.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/i386/cmll-x86.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from cmll-x86.pl. */
 #ifdef PIC
 .file	"cmll-x86.S"
 .text

Modified: trunk/secure/lib/libcrypto/i386/co-586.S
===================================================================
--- trunk/secure/lib/libcrypto/i386/co-586.S	2019-01-20 05:38:15 UTC (rev 12153)
+++ trunk/secure/lib/libcrypto/i386/co-586.S	2019-01-20 05:38:27 UTC (rev 12154)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/i386/co-586.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from co-586.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/i386/co-586.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from co-586.pl. */
 #ifdef PIC
 .file	"co-586.S"
 .text

Modified: trunk/secure/lib/libcrypto/i386/crypt586.S
===================================================================
--- trunk/secure/lib/libcrypto/i386/crypt586.S	2019-01-20 05:38:15 UTC (rev 12153)
+++ trunk/secure/lib/libcrypto/i386/crypt586.S	2019-01-20 05:38:27 UTC (rev 12154)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/i386/crypt586.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from crypt586.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/i386/crypt586.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from crypt586.pl. */
 #ifdef PIC
 .file	"crypt586.S"
 .text

Modified: trunk/secure/lib/libcrypto/i386/des-586.S
===================================================================
--- trunk/secure/lib/libcrypto/i386/des-586.S	2019-01-20 05:38:15 UTC (rev 12153)
+++ trunk/secure/lib/libcrypto/i386/des-586.S	2019-01-20 05:38:27 UTC (rev 12154)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/i386/des-586.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from des-586.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/i386/des-586.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from des-586.pl. */
 #ifdef PIC
 .file	"des-586.S"
 .text
@@ -1004,7 +1004,7 @@
 	call	.L000pic_point
 .L000pic_point:
 	popl	%ebp
-	leal	DES_SPtrans-.L000pic_point(%ebp),%ebp
+	leal	.Ldes_sptrans-.L000pic_point(%ebp),%ebp
 	movl	24(%esp),%ecx
 	cmpl	$0,%ebx
 	je	.L001decrypt
@@ -1081,7 +1081,7 @@
 	call	.L003pic_point
 .L003pic_point:
 	popl	%ebp
-	leal	DES_SPtrans-.L003pic_point(%ebp),%ebp
+	leal	.Ldes_sptrans-.L003pic_point(%ebp),%ebp
 	movl	24(%esp),%ecx
 	cmpl	$0,%ebx
 	je	.L004decrypt
@@ -1711,6 +1711,7 @@
 .size	DES_ede3_cbc_encrypt,.-.L_DES_ede3_cbc_encrypt_begin
 .align	64
 DES_SPtrans:
+.Ldes_sptrans:
 .long	34080768,524288,33554434,34080770
 .long	33554432,526338,524290,33554434
 .long	526338,34080768,34078720,2050
@@ -2842,7 +2843,7 @@
 	call	.L000pic_point
 .L000pic_point:
 	popl	%ebp
-	leal	DES_SPtrans-.L000pic_point(%ebp),%ebp
+	leal	.Ldes_sptrans-.L000pic_point(%ebp),%ebp
 	movl	24(%esp),%ecx
 	cmpl	$0,%ebx
 	je	.L001decrypt
@@ -2919,7 +2920,7 @@
 	call	.L003pic_point
 .L003pic_point:
 	popl	%ebp
-	leal	DES_SPtrans-.L003pic_point(%ebp),%ebp
+	leal	.Ldes_sptrans-.L003pic_point(%ebp),%ebp
 	movl	24(%esp),%ecx
 	cmpl	$0,%ebx
 	je	.L004decrypt
@@ -3549,6 +3550,7 @@
 .size	DES_ede3_cbc_encrypt,.-.L_DES_ede3_cbc_encrypt_begin
 .align	64
 DES_SPtrans:
+.Ldes_sptrans:
 .long	34080768,524288,33554434,34080770
 .long	33554432,526338,524290,33554434
 .long	526338,34080768,34078720,2050

Modified: trunk/secure/lib/libcrypto/i386/ghash-x86.S
===================================================================
--- trunk/secure/lib/libcrypto/i386/ghash-x86.S	2019-01-20 05:38:15 UTC (rev 12153)
+++ trunk/secure/lib/libcrypto/i386/ghash-x86.S	2019-01-20 05:38:27 UTC (rev 12154)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/i386/ghash-x86.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from ghash-x86.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/i386/ghash-x86.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from ghash-x86.pl. */
 #ifdef PIC
 .file	"ghash-x86.S"
 .text
@@ -949,27 +949,34 @@
 	pslldq	$8,%xmm4
 	pxor	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
+	movdqa	%xmm0,%xmm4
 	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
 	psllq	$1,%xmm0
 	pxor	%xmm3,%xmm0
-	psllq	$5,%xmm0
-	pxor	%xmm3,%xmm0
 	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
 	movdqa	%xmm0,%xmm4
-	pslldq	$8,%xmm0
-	psrldq	$8,%xmm4
-	pxor	%xmm3,%xmm0
+	psrlq	$1,%xmm0
 	pxor	%xmm4,%xmm1
-	movdqa	%xmm0,%xmm4
+	pxor	%xmm0,%xmm4
 	psrlq	$5,%xmm0
 	pxor	%xmm4,%xmm0
 	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
-	pxor	%xmm1,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm0
+	pshufd	$78,%xmm2,%xmm3
+	pshufd	$78,%xmm0,%xmm4
+	pxor	%xmm2,%xmm3
 	movdqu	%xmm2,(%edx)
+	pxor	%xmm0,%xmm4
 	movdqu	%xmm0,16(%edx)
+.byte	102,15,58,15,227,8
+	movdqu	%xmm4,32(%edx)
 	ret
 .size	gcm_init_clmul,.-.L_gcm_init_clmul_begin
 .globl	gcm_gmult_clmul
@@ -987,11 +994,10 @@
 	movdqa	(%ecx),%xmm5
 	movups	(%edx),%xmm2
 .byte	102,15,56,0,197
+	movups	32(%edx),%xmm4
 	movdqa	%xmm0,%xmm1
 	pshufd	$78,%xmm0,%xmm3
-	pshufd	$78,%xmm2,%xmm4
 	pxor	%xmm0,%xmm3
-	pxor	%xmm2,%xmm4
 .byte	102,15,58,68,194,0
 .byte	102,15,58,68,202,17
 .byte	102,15,58,68,220,0
@@ -1002,25 +1008,26 @@
 	pslldq	$8,%xmm4
 	pxor	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
+	movdqa	%xmm0,%xmm4
 	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
 	psllq	$1,%xmm0
 	pxor	%xmm3,%xmm0
-	psllq	$5,%xmm0
-	pxor	%xmm3,%xmm0
 	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
 	movdqa	%xmm0,%xmm4
-	pslldq	$8,%xmm0
-	psrldq	$8,%xmm4
-	pxor	%xmm3,%xmm0
+	psrlq	$1,%xmm0
 	pxor	%xmm4,%xmm1
-	movdqa	%xmm0,%xmm4
+	pxor	%xmm0,%xmm4
 	psrlq	$5,%xmm0
 	pxor	%xmm4,%xmm0
 	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
-	pxor	%xmm1,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm0
 .byte	102,15,56,0,197
 	movdqu	%xmm0,(%eax)
 	ret
@@ -1052,127 +1059,115 @@
 	movdqu	16(%esi),%xmm6
 .byte	102,15,56,0,221
 .byte	102,15,56,0,245
+	movdqu	32(%edx),%xmm5
 	pxor	%xmm3,%xmm0
+	pshufd	$78,%xmm6,%xmm3
 	movdqa	%xmm6,%xmm7
-	pshufd	$78,%xmm6,%xmm3
-	pshufd	$78,%xmm2,%xmm4
 	pxor	%xmm6,%xmm3
-	pxor	%xmm2,%xmm4
+	leal	32(%esi),%esi
 .byte	102,15,58,68,242,0
 .byte	102,15,58,68,250,17
-.byte	102,15,58,68,220,0
-	xorps	%xmm6,%xmm3
-	xorps	%xmm7,%xmm3
-	movdqa	%xmm3,%xmm4
-	psrldq	$8,%xmm3
-	pslldq	$8,%xmm4
-	pxor	%xmm3,%xmm7
-	pxor	%xmm4,%xmm6
+.byte	102,15,58,68,221,0
 	movups	16(%edx),%xmm2
-	leal	32(%esi),%esi
+	nop
 	subl	$32,%ebx
 	jbe	.L014even_tail
+	jmp	.L015mod_loop
+.align	32
 .L015mod_loop:
+	pshufd	$78,%xmm0,%xmm4
 	movdqa	%xmm0,%xmm1
-	pshufd	$78,%xmm0,%xmm3
-	pshufd	$78,%xmm2,%xmm4
-	pxor	%xmm0,%xmm3
-	pxor	%xmm2,%xmm4
+	pxor	%xmm0,%xmm4
+	nop
 .byte	102,15,58,68,194,0
 .byte	102,15,58,68,202,17
-.byte	102,15,58,68,220,0
-	xorps	%xmm0,%xmm3
-	xorps	%xmm1,%xmm3
-	movdqa	%xmm3,%xmm4
-	psrldq	$8,%xmm3
-	pslldq	$8,%xmm4
-	pxor	%xmm3,%xmm1
-	pxor	%xmm4,%xmm0
-	movdqu	(%esi),%xmm3
+.byte	102,15,58,68,229,16
 	movups	(%edx),%xmm2
-	pxor	%xmm6,%xmm0
-	pxor	%xmm7,%xmm1
+	xorps	%xmm6,%xmm0
+	movdqa	(%ecx),%xmm5
+	xorps	%xmm7,%xmm1
+	movdqu	(%esi),%xmm7
+	pxor	%xmm0,%xmm3
 	movdqu	16(%esi),%xmm6
-.byte	102,15,56,0,221
+	pxor	%xmm1,%xmm3
+.byte	102,15,56,0,253
+	pxor	%xmm3,%xmm4
+	movdqa	%xmm4,%xmm3
+	psrldq	$8,%xmm4
+	pslldq	$8,%xmm3
+	pxor	%xmm4,%xmm1
+	pxor	%xmm3,%xmm0
 .byte	102,15,56,0,245
-	movdqa	%xmm6,%xmm5
+	pxor	%xmm7,%xmm1
 	movdqa	%xmm6,%xmm7
-	pxor	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm4
 	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
 	psllq	$1,%xmm0
 	pxor	%xmm3,%xmm0
-	psllq	$5,%xmm0
-	pxor	%xmm3,%xmm0
 .byte	102,15,58,68,242,0
+	movups	32(%edx),%xmm5
 	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+	pshufd	$78,%xmm7,%xmm3
 	movdqa	%xmm0,%xmm4
-	pslldq	$8,%xmm0
-	psrldq	$8,%xmm4
-	pxor	%xmm3,%xmm0
-	pshufd	$78,%xmm5,%xmm3
+	psrlq	$1,%xmm0
+	pxor	%xmm7,%xmm3
 	pxor	%xmm4,%xmm1
-	pxor	%xmm5,%xmm3
-	pshufd	$78,%xmm2,%xmm5
-	pxor	%xmm2,%xmm5
 .byte	102,15,58,68,250,17
-	movdqa	%xmm0,%xmm4
+	movups	16(%edx),%xmm2
+	pxor	%xmm0,%xmm4
 	psrlq	$5,%xmm0
 	pxor	%xmm4,%xmm0
 	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
-	pxor	%xmm1,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm0
 .byte	102,15,58,68,221,0
-	movups	16(%edx),%xmm2
-	xorps	%xmm6,%xmm3
-	xorps	%xmm7,%xmm3
-	movdqa	%xmm3,%xmm5
-	psrldq	$8,%xmm3
-	pslldq	$8,%xmm5
-	pxor	%xmm3,%xmm7
-	pxor	%xmm5,%xmm6
-	movdqa	(%ecx),%xmm5
 	leal	32(%esi),%esi
 	subl	$32,%ebx
 	ja	.L015mod_loop
 .L014even_tail:
+	pshufd	$78,%xmm0,%xmm4
 	movdqa	%xmm0,%xmm1
-	pshufd	$78,%xmm0,%xmm3
-	pshufd	$78,%xmm2,%xmm4
-	pxor	%xmm0,%xmm3
-	pxor	%xmm2,%xmm4
+	pxor	%xmm0,%xmm4
 .byte	102,15,58,68,194,0
 .byte	102,15,58,68,202,17
-.byte	102,15,58,68,220,0
-	xorps	%xmm0,%xmm3
-	xorps	%xmm1,%xmm3
-	movdqa	%xmm3,%xmm4
-	psrldq	$8,%xmm3
-	pslldq	$8,%xmm4
-	pxor	%xmm3,%xmm1
-	pxor	%xmm4,%xmm0
-	pxor	%xmm6,%xmm0
-	pxor	%xmm7,%xmm1
+.byte	102,15,58,68,229,16
+	movdqa	(%ecx),%xmm5
+	xorps	%xmm6,%xmm0
+	xorps	%xmm7,%xmm1
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+	pxor	%xmm3,%xmm4
+	movdqa	%xmm4,%xmm3
+	psrldq	$8,%xmm4
+	pslldq	$8,%xmm3
+	pxor	%xmm4,%xmm1
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm0,%xmm4
 	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
 	psllq	$1,%xmm0
 	pxor	%xmm3,%xmm0
-	psllq	$5,%xmm0
-	pxor	%xmm3,%xmm0
 	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
 	movdqa	%xmm0,%xmm4
-	pslldq	$8,%xmm0
-	psrldq	$8,%xmm4
-	pxor	%xmm3,%xmm0
+	psrlq	$1,%xmm0
 	pxor	%xmm4,%xmm1
-	movdqa	%xmm0,%xmm4
+	pxor	%xmm0,%xmm4
 	psrlq	$5,%xmm0
 	pxor	%xmm4,%xmm0
 	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
-	pxor	%xmm1,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm0
 	testl	%ebx,%ebx
 	jnz	.L016done
 	movups	(%edx),%xmm2
@@ -1195,25 +1190,26 @@
 	pslldq	$8,%xmm4
 	pxor	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
+	movdqa	%xmm0,%xmm4
 	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
 	psllq	$1,%xmm0
 	pxor	%xmm3,%xmm0
-	psllq	$5,%xmm0
-	pxor	%xmm3,%xmm0
 	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
 	movdqa	%xmm0,%xmm4
-	pslldq	$8,%xmm0
-	psrldq	$8,%xmm4
-	pxor	%xmm3,%xmm0
+	psrlq	$1,%xmm0
 	pxor	%xmm4,%xmm1
-	movdqa	%xmm0,%xmm4
+	pxor	%xmm0,%xmm4
 	psrlq	$5,%xmm0
 	pxor	%xmm4,%xmm0
 	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
-	pxor	%xmm1,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm0
 .L016done:
 .byte	102,15,56,0,197
 	movdqu	%xmm0,(%eax)
@@ -1228,12 +1224,6 @@
 .byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
 .byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194
 .align	64
-.Lrem_4bit:
-.long	0,0,0,471859200,0,943718400,0,610271232
-.long	0,1887436800,0,1822425088,0,1220542464,0,1423966208
-.long	0,3774873600,0,4246732800,0,3644850176,0,3311403008
-.long	0,2441084928,0,2376073216,0,2847932416,0,3051356160
-.align	64
 .Lrem_8bit:
 .value	0,450,900,582,1800,1738,1164,1358
 .value	3600,4050,3476,3158,2328,2266,2716,2910
@@ -1267,6 +1257,12 @@
 .value	42960,42514,42068,42390,41176,41242,41820,41630
 .value	46560,46114,46692,47014,45800,45866,45420,45230
 .value	48112,47666,47220,47542,48376,48442,49020,48830
+.align	64
+.Lrem_4bit:
+.long	0,0,0,471859200,0,943718400,0,610271232
+.long	0,1887436800,0,1822425088,0,1220542464,0,1423966208
+.long	0,3774873600,0,4246732800,0,3644850176,0,3311403008
+.long	0,2441084928,0,2376073216,0,2847932416,0,3051356160
 .byte	71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
 .byte	82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
 .byte	112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62
@@ -2219,27 +2215,34 @@
 	pslldq	$8,%xmm4
 	pxor	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
+	movdqa	%xmm0,%xmm4
 	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
 	psllq	$1,%xmm0
 	pxor	%xmm3,%xmm0
-	psllq	$5,%xmm0
-	pxor	%xmm3,%xmm0
 	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
 	movdqa	%xmm0,%xmm4
-	pslldq	$8,%xmm0
-	psrldq	$8,%xmm4
-	pxor	%xmm3,%xmm0
+	psrlq	$1,%xmm0
 	pxor	%xmm4,%xmm1
-	movdqa	%xmm0,%xmm4
+	pxor	%xmm0,%xmm4
 	psrlq	$5,%xmm0
 	pxor	%xmm4,%xmm0
 	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
-	pxor	%xmm1,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm0
+	pshufd	$78,%xmm2,%xmm3
+	pshufd	$78,%xmm0,%xmm4
+	pxor	%xmm2,%xmm3
 	movdqu	%xmm2,(%edx)
+	pxor	%xmm0,%xmm4
 	movdqu	%xmm0,16(%edx)
+.byte	102,15,58,15,227,8
+	movdqu	%xmm4,32(%edx)
 	ret
 .size	gcm_init_clmul,.-.L_gcm_init_clmul_begin
 .globl	gcm_gmult_clmul
@@ -2257,11 +2260,10 @@
 	movdqa	(%ecx),%xmm5
 	movups	(%edx),%xmm2
 .byte	102,15,56,0,197
+	movups	32(%edx),%xmm4
 	movdqa	%xmm0,%xmm1
 	pshufd	$78,%xmm0,%xmm3
-	pshufd	$78,%xmm2,%xmm4
 	pxor	%xmm0,%xmm3
-	pxor	%xmm2,%xmm4
 .byte	102,15,58,68,194,0
 .byte	102,15,58,68,202,17
 .byte	102,15,58,68,220,0
@@ -2272,25 +2274,26 @@
 	pslldq	$8,%xmm4
 	pxor	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
+	movdqa	%xmm0,%xmm4
 	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
 	psllq	$1,%xmm0
 	pxor	%xmm3,%xmm0
-	psllq	$5,%xmm0
-	pxor	%xmm3,%xmm0
 	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
 	movdqa	%xmm0,%xmm4
-	pslldq	$8,%xmm0
-	psrldq	$8,%xmm4
-	pxor	%xmm3,%xmm0
+	psrlq	$1,%xmm0
 	pxor	%xmm4,%xmm1
-	movdqa	%xmm0,%xmm4
+	pxor	%xmm0,%xmm4
 	psrlq	$5,%xmm0
 	pxor	%xmm4,%xmm0
 	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
-	pxor	%xmm1,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm0
 .byte	102,15,56,0,197
 	movdqu	%xmm0,(%eax)
 	ret
@@ -2322,127 +2325,115 @@
 	movdqu	16(%esi),%xmm6
 .byte	102,15,56,0,221
 .byte	102,15,56,0,245
+	movdqu	32(%edx),%xmm5
 	pxor	%xmm3,%xmm0
+	pshufd	$78,%xmm6,%xmm3
 	movdqa	%xmm6,%xmm7
-	pshufd	$78,%xmm6,%xmm3
-	pshufd	$78,%xmm2,%xmm4
 	pxor	%xmm6,%xmm3
-	pxor	%xmm2,%xmm4
+	leal	32(%esi),%esi
 .byte	102,15,58,68,242,0
 .byte	102,15,58,68,250,17
-.byte	102,15,58,68,220,0
-	xorps	%xmm6,%xmm3
-	xorps	%xmm7,%xmm3
-	movdqa	%xmm3,%xmm4
-	psrldq	$8,%xmm3
-	pslldq	$8,%xmm4
-	pxor	%xmm3,%xmm7
-	pxor	%xmm4,%xmm6
+.byte	102,15,58,68,221,0
 	movups	16(%edx),%xmm2
-	leal	32(%esi),%esi
+	nop
 	subl	$32,%ebx
 	jbe	.L014even_tail
+	jmp	.L015mod_loop
+.align	32
 .L015mod_loop:
+	pshufd	$78,%xmm0,%xmm4
 	movdqa	%xmm0,%xmm1
-	pshufd	$78,%xmm0,%xmm3
-	pshufd	$78,%xmm2,%xmm4
-	pxor	%xmm0,%xmm3
-	pxor	%xmm2,%xmm4
+	pxor	%xmm0,%xmm4
+	nop
 .byte	102,15,58,68,194,0
 .byte	102,15,58,68,202,17
-.byte	102,15,58,68,220,0
-	xorps	%xmm0,%xmm3
-	xorps	%xmm1,%xmm3
-	movdqa	%xmm3,%xmm4
-	psrldq	$8,%xmm3
-	pslldq	$8,%xmm4
-	pxor	%xmm3,%xmm1
-	pxor	%xmm4,%xmm0
-	movdqu	(%esi),%xmm3
+.byte	102,15,58,68,229,16
 	movups	(%edx),%xmm2
-	pxor	%xmm6,%xmm0
-	pxor	%xmm7,%xmm1
+	xorps	%xmm6,%xmm0
+	movdqa	(%ecx),%xmm5
+	xorps	%xmm7,%xmm1
+	movdqu	(%esi),%xmm7
+	pxor	%xmm0,%xmm3
 	movdqu	16(%esi),%xmm6
-.byte	102,15,56,0,221
+	pxor	%xmm1,%xmm3
+.byte	102,15,56,0,253
+	pxor	%xmm3,%xmm4
+	movdqa	%xmm4,%xmm3
+	psrldq	$8,%xmm4
+	pslldq	$8,%xmm3
+	pxor	%xmm4,%xmm1
+	pxor	%xmm3,%xmm0
 .byte	102,15,56,0,245
-	movdqa	%xmm6,%xmm5
+	pxor	%xmm7,%xmm1
 	movdqa	%xmm6,%xmm7
-	pxor	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm4
 	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
 	psllq	$1,%xmm0
 	pxor	%xmm3,%xmm0
-	psllq	$5,%xmm0
-	pxor	%xmm3,%xmm0
 .byte	102,15,58,68,242,0
+	movups	32(%edx),%xmm5
 	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+	pshufd	$78,%xmm7,%xmm3
 	movdqa	%xmm0,%xmm4
-	pslldq	$8,%xmm0
-	psrldq	$8,%xmm4
-	pxor	%xmm3,%xmm0
-	pshufd	$78,%xmm5,%xmm3
+	psrlq	$1,%xmm0
+	pxor	%xmm7,%xmm3
 	pxor	%xmm4,%xmm1
-	pxor	%xmm5,%xmm3
-	pshufd	$78,%xmm2,%xmm5
-	pxor	%xmm2,%xmm5
 .byte	102,15,58,68,250,17
-	movdqa	%xmm0,%xmm4
+	movups	16(%edx),%xmm2
+	pxor	%xmm0,%xmm4
 	psrlq	$5,%xmm0
 	pxor	%xmm4,%xmm0
 	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
-	pxor	%xmm1,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm0
 .byte	102,15,58,68,221,0
-	movups	16(%edx),%xmm2
-	xorps	%xmm6,%xmm3
-	xorps	%xmm7,%xmm3
-	movdqa	%xmm3,%xmm5
-	psrldq	$8,%xmm3
-	pslldq	$8,%xmm5
-	pxor	%xmm3,%xmm7
-	pxor	%xmm5,%xmm6
-	movdqa	(%ecx),%xmm5
 	leal	32(%esi),%esi
 	subl	$32,%ebx
 	ja	.L015mod_loop
 .L014even_tail:
+	pshufd	$78,%xmm0,%xmm4
 	movdqa	%xmm0,%xmm1
-	pshufd	$78,%xmm0,%xmm3
-	pshufd	$78,%xmm2,%xmm4
-	pxor	%xmm0,%xmm3
-	pxor	%xmm2,%xmm4
+	pxor	%xmm0,%xmm4
 .byte	102,15,58,68,194,0
 .byte	102,15,58,68,202,17
-.byte	102,15,58,68,220,0
-	xorps	%xmm0,%xmm3
-	xorps	%xmm1,%xmm3
-	movdqa	%xmm3,%xmm4
-	psrldq	$8,%xmm3
-	pslldq	$8,%xmm4
-	pxor	%xmm3,%xmm1
-	pxor	%xmm4,%xmm0
-	pxor	%xmm6,%xmm0
-	pxor	%xmm7,%xmm1
+.byte	102,15,58,68,229,16
+	movdqa	(%ecx),%xmm5
+	xorps	%xmm6,%xmm0
+	xorps	%xmm7,%xmm1
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+	pxor	%xmm3,%xmm4
+	movdqa	%xmm4,%xmm3
+	psrldq	$8,%xmm4
+	pslldq	$8,%xmm3
+	pxor	%xmm4,%xmm1
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm0,%xmm4
 	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
 	psllq	$1,%xmm0
 	pxor	%xmm3,%xmm0
-	psllq	$5,%xmm0
-	pxor	%xmm3,%xmm0
 	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
 	movdqa	%xmm0,%xmm4
-	pslldq	$8,%xmm0
-	psrldq	$8,%xmm4
-	pxor	%xmm3,%xmm0
+	psrlq	$1,%xmm0
 	pxor	%xmm4,%xmm1
-	movdqa	%xmm0,%xmm4
+	pxor	%xmm0,%xmm4
 	psrlq	$5,%xmm0
 	pxor	%xmm4,%xmm0
 	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
-	pxor	%xmm1,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm0
 	testl	%ebx,%ebx
 	jnz	.L016done
 	movups	(%edx),%xmm2
@@ -2465,25 +2456,26 @@
 	pslldq	$8,%xmm4
 	pxor	%xmm3,%xmm1
 	pxor	%xmm4,%xmm0
+	movdqa	%xmm0,%xmm4
 	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
 	psllq	$1,%xmm0
 	pxor	%xmm3,%xmm0
-	psllq	$5,%xmm0
-	pxor	%xmm3,%xmm0
 	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
 	movdqa	%xmm0,%xmm4
-	pslldq	$8,%xmm0
-	psrldq	$8,%xmm4
-	pxor	%xmm3,%xmm0
+	psrlq	$1,%xmm0
 	pxor	%xmm4,%xmm1
-	movdqa	%xmm0,%xmm4
+	pxor	%xmm0,%xmm4
 	psrlq	$5,%xmm0
 	pxor	%xmm4,%xmm0
 	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
-	pxor	%xmm1,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm0
 .L016done:
 .byte	102,15,56,0,197
 	movdqu	%xmm0,(%eax)
@@ -2498,12 +2490,6 @@
 .byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
 .byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194
 .align	64
-.Lrem_4bit:
-.long	0,0,0,471859200,0,943718400,0,610271232
-.long	0,1887436800,0,1822425088,0,1220542464,0,1423966208
-.long	0,3774873600,0,4246732800,0,3644850176,0,3311403008
-.long	0,2441084928,0,2376073216,0,2847932416,0,3051356160
-.align	64
 .Lrem_8bit:
 .value	0,450,900,582,1800,1738,1164,1358
 .value	3600,4050,3476,3158,2328,2266,2716,2910
@@ -2537,6 +2523,12 @@
 .value	42960,42514,42068,42390,41176,41242,41820,41630
 .value	46560,46114,46692,47014,45800,45866,45420,45230
 .value	48112,47666,47220,47542,48376,48442,49020,48830
+.align	64
+.Lrem_4bit:
+.long	0,0,0,471859200,0,943718400,0,610271232
+.long	0,1887436800,0,1822425088,0,1220542464,0,1423966208
+.long	0,3774873600,0,4246732800,0,3644850176,0,3311403008
+.long	0,2441084928,0,2376073216,0,2847932416,0,3051356160
 .byte	71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
 .byte	82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
 .byte	112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62

Modified: trunk/secure/lib/libcrypto/i386/md5-586.S
===================================================================
--- trunk/secure/lib/libcrypto/i386/md5-586.S	2019-01-20 05:38:15 UTC (rev 12153)
+++ trunk/secure/lib/libcrypto/i386/md5-586.S	2019-01-20 05:38:27 UTC (rev 12154)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/i386/md5-586.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from md5-586.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/i386/md5-586.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from md5-586.pl. */
 #ifdef PIC
 .file	"md5-586.S"
 .text

Modified: trunk/secure/lib/libcrypto/i386/rc4-586.S
===================================================================
--- trunk/secure/lib/libcrypto/i386/rc4-586.S	2019-01-20 05:38:15 UTC (rev 12153)
+++ trunk/secure/lib/libcrypto/i386/rc4-586.S	2019-01-20 05:38:27 UTC (rev 12154)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/i386/rc4-586.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from rc4-586.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/i386/rc4-586.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from rc4-586.pl. */
 #ifdef PIC
 .file	"rc4-586.S"
 .text
@@ -33,14 +33,13 @@
 	movl	(%edi,%eax,4),%ecx
 	andl	$-4,%edx
 	jz	.L002loop1
+	movl	%ebp,32(%esp)
 	testl	$-8,%edx
-	movl	%ebp,32(%esp)
 	jz	.L003go4loop4
 	call	.L004PIC_me_up
 .L004PIC_me_up:
 	popl	%ebp
-	leal	_GLOBAL_OFFSET_TABLE_+[.-.L004PIC_me_up](%ebp),%ebp
-	movl	OPENSSL_ia32cap_P@GOT(%ebp),%ebp
+	leal	OPENSSL_ia32cap_P-.L004PIC_me_up(%ebp),%ebp
 	btl	$26,(%ebp)
 	jnc	.L003go4loop4
 	movl	32(%esp),%ebp
@@ -286,8 +285,7 @@
 	call	.L010PIC_me_up
 .L010PIC_me_up:
 	popl	%edx
-	leal	_GLOBAL_OFFSET_TABLE_+[.-.L010PIC_me_up](%edx),%edx
-	movl	OPENSSL_ia32cap_P@GOT(%edx),%edx
+	leal	OPENSSL_ia32cap_P-.L010PIC_me_up(%edx),%edx
 	leal	8(%edi),%edi
 	leal	(%esi,%ebp,1),%esi
 	negl	%ebp
@@ -362,8 +360,7 @@
 	call	.L020PIC_me_up
 .L020PIC_me_up:
 	popl	%edx
-	leal	_GLOBAL_OFFSET_TABLE_+[.-.L020PIC_me_up](%edx),%edx
-	movl	OPENSSL_ia32cap_P@GOT(%edx),%edx
+	leal	OPENSSL_ia32cap_P-.L020PIC_me_up(%edx),%edx
 	movl	(%edx),%edx
 	btl	$20,%edx
 	jc	.L0211xchar
@@ -385,7 +382,7 @@
 .byte	111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align	64
 .size	RC4_options,.-.L_RC4_options_begin
-.comm	OPENSSL_ia32cap_P,8,4
+.comm	OPENSSL_ia32cap_P,16,4
 #else
 .file	"rc4-586.S"
 .text
@@ -418,8 +415,8 @@
 	movl	(%edi,%eax,4),%ecx
 	andl	$-4,%edx
 	jz	.L002loop1
+	movl	%ebp,32(%esp)
 	testl	$-8,%edx
-	movl	%ebp,32(%esp)
 	jz	.L003go4loop4
 	leal	OPENSSL_ia32cap_P,%ebp
 	btl	$26,(%ebp)
@@ -758,5 +755,5 @@
 .byte	111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align	64
 .size	RC4_options,.-.L_RC4_options_begin
-.comm	OPENSSL_ia32cap_P,8,4
+.comm	OPENSSL_ia32cap_P,16,4
 #endif

Modified: trunk/secure/lib/libcrypto/i386/rc5-586.S
===================================================================
--- trunk/secure/lib/libcrypto/i386/rc5-586.S	2019-01-20 05:38:15 UTC (rev 12153)
+++ trunk/secure/lib/libcrypto/i386/rc5-586.S	2019-01-20 05:38:27 UTC (rev 12154)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/i386/rc5-586.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from rc5-586.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/i386/rc5-586.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from rc5-586.pl. */
 #ifdef PIC
 .file	"rc5-586.S"
 .text

Modified: trunk/secure/lib/libcrypto/i386/rmd-586.S
===================================================================
--- trunk/secure/lib/libcrypto/i386/rmd-586.S	2019-01-20 05:38:15 UTC (rev 12153)
+++ trunk/secure/lib/libcrypto/i386/rmd-586.S	2019-01-20 05:38:27 UTC (rev 12154)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/i386/rmd-586.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from rmd-586.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/i386/rmd-586.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from rmd-586.pl. */
 #ifdef PIC
 .file	"rmd-586.S"
 .text

Modified: trunk/secure/lib/libcrypto/i386/sha1-586.S
===================================================================
--- trunk/secure/lib/libcrypto/i386/sha1-586.S	2019-01-20 05:38:15 UTC (rev 12153)
+++ trunk/secure/lib/libcrypto/i386/sha1-586.S	2019-01-20 05:38:27 UTC (rev 12154)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/i386/sha1-586.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from sha1-586.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/i386/sha1-586.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from sha1-586.pl. */
 #ifdef PIC
 .file	"sha1-586.S"
 .text
@@ -16,15 +16,17 @@
 	call	.L000pic_point
 .L000pic_point:
 	popl	%ebp
-	leal	_GLOBAL_OFFSET_TABLE_+[.-.L000pic_point](%ebp),%esi
-	movl	OPENSSL_ia32cap_P@GOT(%esi),%esi
+	leal	OPENSSL_ia32cap_P-.L000pic_point(%ebp),%esi
 	leal	.LK_XX_XX-.L000pic_point(%ebp),%ebp
 	movl	(%esi),%eax
 	movl	4(%esi),%edx
 	testl	$512,%edx
 	jz	.L001x86
+	movl	8(%esi),%ecx
 	testl	$16777216,%eax
 	jz	.L001x86
+	testl	$536870912,%ecx
+	jnz	.Lshaext_shortcut
 	andl	$268435456,%edx
 	andl	$1073741824,%eax
 	orl	%edx,%eax
@@ -1398,9 +1400,9 @@
 	popl	%ebp
 	ret
 .size	sha1_block_data_order,.-.L_sha1_block_data_order_begin
-.type	_sha1_block_data_order_ssse3,@function
+.type	_sha1_block_data_order_shaext,@function
 .align	16
-_sha1_block_data_order_ssse3:
+_sha1_block_data_order_shaext:
 	pushl	%ebp
 	pushl	%ebx
 	pushl	%esi
@@ -1409,6 +1411,176 @@
 .L003pic_point:
 	popl	%ebp
 	leal	.LK_XX_XX-.L003pic_point(%ebp),%ebp
+.Lshaext_shortcut:
+	movl	20(%esp),%edi
+	movl	%esp,%ebx
+	movl	24(%esp),%esi
+	movl	28(%esp),%ecx
+	subl	$32,%esp
+	movdqu	(%edi),%xmm0
+	movd	16(%edi),%xmm1
+	andl	$-32,%esp
+	movdqa	80(%ebp),%xmm3
+	movdqu	(%esi),%xmm4
+	pshufd	$27,%xmm0,%xmm0
+	movdqu	16(%esi),%xmm5
+	pshufd	$27,%xmm1,%xmm1
+	movdqu	32(%esi),%xmm6
+.byte	102,15,56,0,227
+	movdqu	48(%esi),%xmm7
+.byte	102,15,56,0,235
+.byte	102,15,56,0,243
+.byte	102,15,56,0,251
+	jmp	.L004loop_shaext
+.align	16
+.L004loop_shaext:
+	decl	%ecx
+	leal	64(%esi),%eax
+	movdqa	%xmm1,(%esp)
+	paddd	%xmm4,%xmm1
+	cmovnel	%eax,%esi
+	movdqa	%xmm0,16(%esp)
+.byte	15,56,201,229
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,0
+.byte	15,56,200,213
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+.byte	15,56,202,231
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,0
+.byte	15,56,200,206
+	pxor	%xmm7,%xmm5
+.byte	15,56,202,236
+.byte	15,56,201,247
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,0
+.byte	15,56,200,215
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+.byte	15,56,202,245
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,0
+.byte	15,56,200,204
+	pxor	%xmm5,%xmm7
+.byte	15,56,202,254
+.byte	15,56,201,229
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,0
+.byte	15,56,200,213
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+.byte	15,56,202,231
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,1
+.byte	15,56,200,206
+	pxor	%xmm7,%xmm5
+.byte	15,56,202,236
+.byte	15,56,201,247
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,1
+.byte	15,56,200,215
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+.byte	15,56,202,245
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,1
+.byte	15,56,200,204
+	pxor	%xmm5,%xmm7
+.byte	15,56,202,254
+.byte	15,56,201,229
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,1
+.byte	15,56,200,213
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+.byte	15,56,202,231
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,1
+.byte	15,56,200,206
+	pxor	%xmm7,%xmm5
+.byte	15,56,202,236
+.byte	15,56,201,247
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,2
+.byte	15,56,200,215
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+.byte	15,56,202,245
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,2
+.byte	15,56,200,204
+	pxor	%xmm5,%xmm7
+.byte	15,56,202,254
+.byte	15,56,201,229
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,2
+.byte	15,56,200,213
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+.byte	15,56,202,231
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,2
+.byte	15,56,200,206
+	pxor	%xmm7,%xmm5
+.byte	15,56,202,236
+.byte	15,56,201,247
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,2
+.byte	15,56,200,215
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+.byte	15,56,202,245
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,3
+.byte	15,56,200,204
+	pxor	%xmm5,%xmm7
+.byte	15,56,202,254
+	movdqu	(%esi),%xmm4
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,3
+.byte	15,56,200,213
+	movdqu	16(%esi),%xmm5
+.byte	102,15,56,0,227
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,3
+.byte	15,56,200,206
+	movdqu	32(%esi),%xmm6
+.byte	102,15,56,0,235
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,3
+.byte	15,56,200,215
+	movdqu	48(%esi),%xmm7
+.byte	102,15,56,0,243
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,3
+	movdqa	(%esp),%xmm2
+.byte	102,15,56,0,251
+.byte	15,56,200,202
+	paddd	16(%esp),%xmm0
+	jnz	.L004loop_shaext
+	pshufd	$27,%xmm0,%xmm0
+	pshufd	$27,%xmm1,%xmm1
+	movdqu	%xmm0,(%edi)
+	movd	%xmm1,16(%edi)
+	movl	%ebx,%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	_sha1_block_data_order_shaext,.-_sha1_block_data_order_shaext
+.type	_sha1_block_data_order_ssse3,@function
+.align	16
+_sha1_block_data_order_ssse3:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	call	.L005pic_point
+.L005pic_point:
+	popl	%ebp
+	leal	.LK_XX_XX-.L005pic_point(%ebp),%ebp
 .Lssse3_shortcut:
 	movdqa	(%ebp),%xmm7
 	movdqa	16(%ebp),%xmm0
@@ -1456,936 +1628,917 @@
 	movdqa	%xmm1,16(%esp)
 	psubd	%xmm7,%xmm1
 	movdqa	%xmm2,32(%esp)
+	movl	%ecx,%ebp
 	psubd	%xmm7,%xmm2
-	movdqa	%xmm1,%xmm4
-	jmp	.L004loop
+	xorl	%edx,%ebp
+	pshufd	$238,%xmm0,%xmm4
+	andl	%ebp,%esi
+	jmp	.L006loop
 .align	16
-.L004loop:
+.L006loop:
+	rorl	$2,%ebx
+	xorl	%edx,%esi
+	movl	%eax,%ebp
+	punpcklqdq	%xmm1,%xmm4
+	movdqa	%xmm3,%xmm6
 	addl	(%esp),%edi
-	xorl	%edx,%ecx
-.byte	102,15,58,15,224,8
-	movdqa	%xmm3,%xmm6
-	movl	%eax,%ebp
-	roll	$5,%eax
+	xorl	%ecx,%ebx
 	paddd	%xmm3,%xmm7
 	movdqa	%xmm0,64(%esp)
-	andl	%ecx,%esi
-	xorl	%edx,%ecx
+	roll	$5,%eax
+	addl	%esi,%edi
 	psrldq	$4,%xmm6
-	xorl	%edx,%esi
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	pxor	%xmm0,%xmm4
 	addl	%eax,%edi
-	pxor	%xmm0,%xmm4
-	rorl	$2,%ebx
-	addl	%esi,%edi
+	rorl	$7,%eax
 	pxor	%xmm2,%xmm6
+	xorl	%ecx,%ebp
+	movl	%edi,%esi
 	addl	4(%esp),%edx
-	xorl	%ecx,%ebx
-	movl	%edi,%esi
+	pxor	%xmm6,%xmm4
+	xorl	%ebx,%eax
 	roll	$5,%edi
-	pxor	%xmm6,%xmm4
-	andl	%ebx,%ebp
-	xorl	%ecx,%ebx
 	movdqa	%xmm7,48(%esp)
-	xorl	%ecx,%ebp
+	addl	%ebp,%edx
+	andl	%eax,%esi
+	movdqa	%xmm4,%xmm0
+	xorl	%ebx,%eax
 	addl	%edi,%edx
-	movdqa	%xmm4,%xmm0
+	rorl	$7,%edi
 	movdqa	%xmm4,%xmm6
-	rorl	$7,%eax
-	addl	%ebp,%edx
-	addl	8(%esp),%ecx
-	xorl	%ebx,%eax
+	xorl	%ebx,%esi
 	pslldq	$12,%xmm0
 	paddd	%xmm4,%xmm4
 	movl	%edx,%ebp
+	addl	8(%esp),%ecx
+	psrld	$31,%xmm6
+	xorl	%eax,%edi
 	roll	$5,%edx
-	andl	%eax,%esi
-	xorl	%ebx,%eax
-	psrld	$31,%xmm6
-	xorl	%ebx,%esi
-	addl	%edx,%ecx
 	movdqa	%xmm0,%xmm7
-	rorl	$7,%edi
 	addl	%esi,%ecx
+	andl	%edi,%ebp
+	xorl	%eax,%edi
 	psrld	$30,%xmm0
+	addl	%edx,%ecx
+	rorl	$7,%edx
 	por	%xmm6,%xmm4
+	xorl	%eax,%ebp
+	movl	%ecx,%esi
 	addl	12(%esp),%ebx
-	xorl	%eax,%edi
-	movl	%ecx,%esi
+	pslld	$2,%xmm7
+	xorl	%edi,%edx
 	roll	$5,%ecx
-	pslld	$2,%xmm7
 	pxor	%xmm0,%xmm4
-	andl	%edi,%ebp
-	xorl	%eax,%edi
 	movdqa	96(%esp),%xmm0
-	xorl	%eax,%ebp
-	addl	%ecx,%ebx
+	addl	%ebp,%ebx
+	andl	%edx,%esi
 	pxor	%xmm7,%xmm4
-	movdqa	%xmm2,%xmm5
-	rorl	$7,%edx
-	addl	%ebp,%ebx
-	addl	16(%esp),%eax
+	pshufd	$238,%xmm1,%xmm5
 	xorl	%edi,%edx
-.byte	102,15,58,15,233,8
+	addl	%ecx,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	movl	%ebx,%ebp
+	punpcklqdq	%xmm2,%xmm5
 	movdqa	%xmm4,%xmm7
-	movl	%ebx,%ebp
-	roll	$5,%ebx
+	addl	16(%esp),%eax
+	xorl	%edx,%ecx
 	paddd	%xmm4,%xmm0
 	movdqa	%xmm1,80(%esp)
-	andl	%edx,%esi
-	xorl	%edi,%edx
+	roll	$5,%ebx
+	addl	%esi,%eax
 	psrldq	$4,%xmm7
-	xorl	%edi,%esi
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
+	pxor	%xmm1,%xmm5
 	addl	%ebx,%eax
-	pxor	%xmm1,%xmm5
-	rorl	$7,%ecx
-	addl	%esi,%eax
+	rorl	$7,%ebx
 	pxor	%xmm3,%xmm7
+	xorl	%edx,%ebp
+	movl	%eax,%esi
 	addl	20(%esp),%edi
-	xorl	%edx,%ecx
-	movl	%eax,%esi
+	pxor	%xmm7,%xmm5
+	xorl	%ecx,%ebx
 	roll	$5,%eax
-	pxor	%xmm7,%xmm5
-	andl	%ecx,%ebp
-	xorl	%edx,%ecx
 	movdqa	%xmm0,(%esp)
-	xorl	%edx,%ebp
+	addl	%ebp,%edi
+	andl	%ebx,%esi
+	movdqa	%xmm5,%xmm1
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
-	movdqa	%xmm5,%xmm1
+	rorl	$7,%eax
 	movdqa	%xmm5,%xmm7
-	rorl	$7,%ebx
-	addl	%ebp,%edi
-	addl	24(%esp),%edx
-	xorl	%ecx,%ebx
+	xorl	%ecx,%esi
 	pslldq	$12,%xmm1
 	paddd	%xmm5,%xmm5
 	movl	%edi,%ebp
+	addl	24(%esp),%edx
+	psrld	$31,%xmm7
+	xorl	%ebx,%eax
 	roll	$5,%edi
-	andl	%ebx,%esi
-	xorl	%ecx,%ebx
-	psrld	$31,%xmm7
-	xorl	%ecx,%esi
-	addl	%edi,%edx
 	movdqa	%xmm1,%xmm0
-	rorl	$7,%eax
 	addl	%esi,%edx
+	andl	%eax,%ebp
+	xorl	%ebx,%eax
 	psrld	$30,%xmm1
+	addl	%edi,%edx
+	rorl	$7,%edi
 	por	%xmm7,%xmm5
+	xorl	%ebx,%ebp
+	movl	%edx,%esi
 	addl	28(%esp),%ecx
-	xorl	%ebx,%eax
-	movl	%edx,%esi
+	pslld	$2,%xmm0
+	xorl	%eax,%edi
 	roll	$5,%edx
-	pslld	$2,%xmm0
 	pxor	%xmm1,%xmm5
-	andl	%eax,%ebp
-	xorl	%ebx,%eax
 	movdqa	112(%esp),%xmm1
-	xorl	%ebx,%ebp
-	addl	%edx,%ecx
+	addl	%ebp,%ecx
+	andl	%edi,%esi
 	pxor	%xmm0,%xmm5
-	movdqa	%xmm3,%xmm6
-	rorl	$7,%edi
-	addl	%ebp,%ecx
-	addl	32(%esp),%ebx
+	pshufd	$238,%xmm2,%xmm6
 	xorl	%eax,%edi
-.byte	102,15,58,15,242,8
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	xorl	%eax,%esi
+	movl	%ecx,%ebp
+	punpcklqdq	%xmm3,%xmm6
 	movdqa	%xmm5,%xmm0
-	movl	%ecx,%ebp
-	roll	$5,%ecx
+	addl	32(%esp),%ebx
+	xorl	%edi,%edx
 	paddd	%xmm5,%xmm1
 	movdqa	%xmm2,96(%esp)
-	andl	%edi,%esi
-	xorl	%eax,%edi
+	roll	$5,%ecx
+	addl	%esi,%ebx
 	psrldq	$4,%xmm0
-	xorl	%eax,%esi
+	andl	%edx,%ebp
+	xorl	%edi,%edx
+	pxor	%xmm2,%xmm6
 	addl	%ecx,%ebx
-	pxor	%xmm2,%xmm6
-	rorl	$7,%edx
-	addl	%esi,%ebx
+	rorl	$7,%ecx
 	pxor	%xmm4,%xmm0
+	xorl	%edi,%ebp
+	movl	%ebx,%esi
 	addl	36(%esp),%eax
-	xorl	%edi,%edx
-	movl	%ebx,%esi
+	pxor	%xmm0,%xmm6
+	xorl	%edx,%ecx
 	roll	$5,%ebx
-	pxor	%xmm0,%xmm6
-	andl	%edx,%ebp
-	xorl	%edi,%edx
 	movdqa	%xmm1,16(%esp)
-	xorl	%edi,%ebp
+	addl	%ebp,%eax
+	andl	%ecx,%esi
+	movdqa	%xmm6,%xmm2
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	movdqa	%xmm6,%xmm2
+	rorl	$7,%ebx
 	movdqa	%xmm6,%xmm0
-	rorl	$7,%ecx
-	addl	%ebp,%eax
-	addl	40(%esp),%edi
-	xorl	%edx,%ecx
+	xorl	%edx,%esi
 	pslldq	$12,%xmm2
 	paddd	%xmm6,%xmm6
 	movl	%eax,%ebp
+	addl	40(%esp),%edi
+	psrld	$31,%xmm0
+	xorl	%ecx,%ebx
 	roll	$5,%eax
-	andl	%ecx,%esi
-	xorl	%edx,%ecx
-	psrld	$31,%xmm0
-	xorl	%edx,%esi
-	addl	%eax,%edi
 	movdqa	%xmm2,%xmm1
-	rorl	$7,%ebx
 	addl	%esi,%edi
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
 	psrld	$30,%xmm2
+	addl	%eax,%edi
+	rorl	$7,%eax
 	por	%xmm0,%xmm6
-	addl	44(%esp),%edx
-	xorl	%ecx,%ebx
+	xorl	%ecx,%ebp
 	movdqa	64(%esp),%xmm0
 	movl	%edi,%esi
+	addl	44(%esp),%edx
+	pslld	$2,%xmm1
+	xorl	%ebx,%eax
 	roll	$5,%edi
-	pslld	$2,%xmm1
 	pxor	%xmm2,%xmm6
-	andl	%ebx,%ebp
-	xorl	%ecx,%ebx
 	movdqa	112(%esp),%xmm2
-	xorl	%ecx,%ebp
-	addl	%edi,%edx
+	addl	%ebp,%edx
+	andl	%eax,%esi
 	pxor	%xmm1,%xmm6
-	movdqa	%xmm4,%xmm7
-	rorl	$7,%eax
-	addl	%ebp,%edx
-	addl	48(%esp),%ecx
+	pshufd	$238,%xmm3,%xmm7
 	xorl	%ebx,%eax
-.byte	102,15,58,15,251,8
+	addl	%edi,%edx
+	rorl	$7,%edi
+	xorl	%ebx,%esi
+	movl	%edx,%ebp
+	punpcklqdq	%xmm4,%xmm7
 	movdqa	%xmm6,%xmm1
-	movl	%edx,%ebp
-	roll	$5,%edx
+	addl	48(%esp),%ecx
+	xorl	%eax,%edi
 	paddd	%xmm6,%xmm2
 	movdqa	%xmm3,64(%esp)
-	andl	%eax,%esi
-	xorl	%ebx,%eax
+	roll	$5,%edx
+	addl	%esi,%ecx
 	psrldq	$4,%xmm1
-	xorl	%ebx,%esi
+	andl	%edi,%ebp
+	xorl	%eax,%edi
+	pxor	%xmm3,%xmm7
 	addl	%edx,%ecx
-	pxor	%xmm3,%xmm7
-	rorl	$7,%edi
-	addl	%esi,%ecx
+	rorl	$7,%edx
 	pxor	%xmm5,%xmm1
+	xorl	%eax,%ebp
+	movl	%ecx,%esi
 	addl	52(%esp),%ebx
-	xorl	%eax,%edi
-	movl	%ecx,%esi
+	pxor	%xmm1,%xmm7
+	xorl	%edi,%edx
 	roll	$5,%ecx
-	pxor	%xmm1,%xmm7
-	andl	%edi,%ebp
-	xorl	%eax,%edi
 	movdqa	%xmm2,32(%esp)
-	xorl	%eax,%ebp
+	addl	%ebp,%ebx
+	andl	%edx,%esi
+	movdqa	%xmm7,%xmm3
+	xorl	%edi,%edx
 	addl	%ecx,%ebx
-	movdqa	%xmm7,%xmm3
+	rorl	$7,%ecx
 	movdqa	%xmm7,%xmm1
-	rorl	$7,%edx
-	addl	%ebp,%ebx
-	addl	56(%esp),%eax
-	xorl	%edi,%edx
+	xorl	%edi,%esi
 	pslldq	$12,%xmm3
 	paddd	%xmm7,%xmm7
 	movl	%ebx,%ebp
+	addl	56(%esp),%eax
+	psrld	$31,%xmm1
+	xorl	%edx,%ecx
 	roll	$5,%ebx
-	andl	%edx,%esi
-	xorl	%edi,%edx
-	psrld	$31,%xmm1
-	xorl	%edi,%esi
-	addl	%ebx,%eax
 	movdqa	%xmm3,%xmm2
-	rorl	$7,%ecx
 	addl	%esi,%eax
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
 	psrld	$30,%xmm3
+	addl	%ebx,%eax
+	rorl	$7,%ebx
 	por	%xmm1,%xmm7
-	addl	60(%esp),%edi
-	xorl	%edx,%ecx
+	xorl	%edx,%ebp
 	movdqa	80(%esp),%xmm1
 	movl	%eax,%esi
+	addl	60(%esp),%edi
+	pslld	$2,%xmm2
+	xorl	%ecx,%ebx
 	roll	$5,%eax
-	pslld	$2,%xmm2
 	pxor	%xmm3,%xmm7
-	andl	%ecx,%ebp
-	xorl	%edx,%ecx
 	movdqa	112(%esp),%xmm3
-	xorl	%edx,%ebp
+	addl	%ebp,%edi
+	andl	%ebx,%esi
+	pxor	%xmm2,%xmm7
+	pshufd	$238,%xmm6,%xmm2
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
-	pxor	%xmm2,%xmm7
-	rorl	$7,%ebx
-	addl	%ebp,%edi
-	movdqa	%xmm7,%xmm2
-	addl	(%esp),%edx
+	rorl	$7,%eax
 	pxor	%xmm4,%xmm0
-.byte	102,15,58,15,214,8
-	xorl	%ecx,%ebx
+	punpcklqdq	%xmm7,%xmm2
+	xorl	%ecx,%esi
 	movl	%edi,%ebp
-	roll	$5,%edi
+	addl	(%esp),%edx
 	pxor	%xmm1,%xmm0
 	movdqa	%xmm4,80(%esp)
-	andl	%ebx,%esi
-	xorl	%ecx,%ebx
+	xorl	%ebx,%eax
+	roll	$5,%edi
 	movdqa	%xmm3,%xmm4
+	addl	%esi,%edx
 	paddd	%xmm7,%xmm3
-	xorl	%ecx,%esi
-	addl	%edi,%edx
+	andl	%eax,%ebp
 	pxor	%xmm2,%xmm0
-	rorl	$7,%eax
-	addl	%esi,%edx
-	addl	4(%esp),%ecx
 	xorl	%ebx,%eax
+	addl	%edi,%edx
+	rorl	$7,%edi
+	xorl	%ebx,%ebp
 	movdqa	%xmm0,%xmm2
 	movdqa	%xmm3,48(%esp)
 	movl	%edx,%esi
+	addl	4(%esp),%ecx
+	xorl	%eax,%edi
 	roll	$5,%edx
-	andl	%eax,%ebp
-	xorl	%ebx,%eax
 	pslld	$2,%xmm0
-	xorl	%ebx,%ebp
-	addl	%edx,%ecx
+	addl	%ebp,%ecx
+	andl	%edi,%esi
 	psrld	$30,%xmm2
-	rorl	$7,%edi
-	addl	%ebp,%ecx
-	addl	8(%esp),%ebx
 	xorl	%eax,%edi
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	xorl	%eax,%esi
 	movl	%ecx,%ebp
+	addl	8(%esp),%ebx
+	xorl	%edi,%edx
 	roll	$5,%ecx
 	por	%xmm2,%xmm0
-	andl	%edi,%esi
-	xorl	%eax,%edi
+	addl	%esi,%ebx
+	andl	%edx,%ebp
 	movdqa	96(%esp),%xmm2
-	xorl	%eax,%esi
+	xorl	%edi,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%esi,%ebx
 	addl	12(%esp),%eax
-	movdqa	%xmm0,%xmm3
-	xorl	%edi,%edx
+	xorl	%edi,%ebp
 	movl	%ebx,%esi
+	pshufd	$238,%xmm7,%xmm3
 	roll	$5,%ebx
-	andl	%edx,%ebp
-	xorl	%edi,%edx
-	xorl	%edi,%ebp
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%ebp,%eax
 	addl	16(%esp),%edi
 	pxor	%xmm5,%xmm1
-.byte	102,15,58,15,223,8
-	xorl	%edx,%esi
+	punpcklqdq	%xmm0,%xmm3
+	xorl	%ecx,%esi
 	movl	%eax,%ebp
 	roll	$5,%eax
 	pxor	%xmm2,%xmm1
 	movdqa	%xmm5,96(%esp)
-	xorl	%ecx,%esi
-	addl	%eax,%edi
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
 	movdqa	%xmm4,%xmm5
+	rorl	$7,%ebx
 	paddd	%xmm0,%xmm4
-	rorl	$7,%ebx
-	addl	%esi,%edi
+	addl	%eax,%edi
 	pxor	%xmm3,%xmm1
 	addl	20(%esp),%edx
-	xorl	%ecx,%ebp
+	xorl	%ebx,%ebp
 	movl	%edi,%esi
 	roll	$5,%edi
 	movdqa	%xmm1,%xmm3
 	movdqa	%xmm4,(%esp)
-	xorl	%ebx,%ebp
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
 	addl	%edi,%edx
-	rorl	$7,%eax
-	addl	%ebp,%edx
 	pslld	$2,%xmm1
 	addl	24(%esp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	psrld	$30,%xmm3
 	movl	%edx,%ebp
 	roll	$5,%edx
-	xorl	%eax,%esi
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	rorl	$7,%edi
 	addl	%edx,%ecx
-	rorl	$7,%edi
-	addl	%esi,%ecx
 	por	%xmm3,%xmm1
 	addl	28(%esp),%ebx
-	xorl	%eax,%ebp
+	xorl	%edi,%ebp
 	movdqa	64(%esp),%xmm3
 	movl	%ecx,%esi
 	roll	$5,%ecx
-	xorl	%edi,%ebp
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	rorl	$7,%edx
+	pshufd	$238,%xmm0,%xmm4
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	movdqa	%xmm1,%xmm4
-	addl	%ebp,%ebx
 	addl	32(%esp),%eax
 	pxor	%xmm6,%xmm2
-.byte	102,15,58,15,224,8
-	xorl	%edi,%esi
+	punpcklqdq	%xmm1,%xmm4
+	xorl	%edx,%esi
 	movl	%ebx,%ebp
 	roll	$5,%ebx
 	pxor	%xmm3,%xmm2
 	movdqa	%xmm6,64(%esp)
-	xorl	%edx,%esi
-	addl	%ebx,%eax
+	addl	%esi,%eax
+	xorl	%edx,%ebp
 	movdqa	128(%esp),%xmm6
+	rorl	$7,%ecx
 	paddd	%xmm1,%xmm5
-	rorl	$7,%ecx
-	addl	%esi,%eax
+	addl	%ebx,%eax
 	pxor	%xmm4,%xmm2
 	addl	36(%esp),%edi
-	xorl	%edx,%ebp
+	xorl	%ecx,%ebp
 	movl	%eax,%esi
 	roll	$5,%eax
 	movdqa	%xmm2,%xmm4
 	movdqa	%xmm5,16(%esp)
-	xorl	%ecx,%ebp
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
 	addl	%eax,%edi
-	rorl	$7,%ebx
-	addl	%ebp,%edi
 	pslld	$2,%xmm2
 	addl	40(%esp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	psrld	$30,%xmm4
 	movl	%edi,%ebp
 	roll	$5,%edi
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	rorl	$7,%eax
 	addl	%edi,%edx
-	rorl	$7,%eax
-	addl	%esi,%edx
 	por	%xmm4,%xmm2
 	addl	44(%esp),%ecx
-	xorl	%ebx,%ebp
+	xorl	%eax,%ebp
 	movdqa	80(%esp),%xmm4
 	movl	%edx,%esi
 	roll	$5,%edx
-	xorl	%eax,%ebp
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%edi
+	pshufd	$238,%xmm1,%xmm5
 	addl	%edx,%ecx
-	rorl	$7,%edi
-	movdqa	%xmm2,%xmm5
-	addl	%ebp,%ecx
 	addl	48(%esp),%ebx
 	pxor	%xmm7,%xmm3
-.byte	102,15,58,15,233,8
-	xorl	%eax,%esi
+	punpcklqdq	%xmm2,%xmm5
+	xorl	%edi,%esi
 	movl	%ecx,%ebp
 	roll	$5,%ecx
 	pxor	%xmm4,%xmm3
 	movdqa	%xmm7,80(%esp)
-	xorl	%edi,%esi
-	addl	%ecx,%ebx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
 	movdqa	%xmm6,%xmm7
+	rorl	$7,%edx
 	paddd	%xmm2,%xmm6
-	rorl	$7,%edx
-	addl	%esi,%ebx
+	addl	%ecx,%ebx
 	pxor	%xmm5,%xmm3
 	addl	52(%esp),%eax
-	xorl	%edi,%ebp
+	xorl	%edx,%ebp
 	movl	%ebx,%esi
 	roll	$5,%ebx
 	movdqa	%xmm3,%xmm5
 	movdqa	%xmm6,32(%esp)
-	xorl	%edx,%ebp
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%ebp,%eax
 	pslld	$2,%xmm3
 	addl	56(%esp),%edi
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	psrld	$30,%xmm5
 	movl	%eax,%ebp
 	roll	$5,%eax
-	xorl	%ecx,%esi
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	rorl	$7,%ebx
 	addl	%eax,%edi
-	rorl	$7,%ebx
-	addl	%esi,%edi
 	por	%xmm5,%xmm3
 	addl	60(%esp),%edx
-	xorl	%ecx,%ebp
+	xorl	%ebx,%ebp
 	movdqa	96(%esp),%xmm5
 	movl	%edi,%esi
 	roll	$5,%edi
-	xorl	%ebx,%ebp
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	pshufd	$238,%xmm2,%xmm6
 	addl	%edi,%edx
-	rorl	$7,%eax
-	movdqa	%xmm3,%xmm6
-	addl	%ebp,%edx
 	addl	(%esp),%ecx
 	pxor	%xmm0,%xmm4
-.byte	102,15,58,15,242,8
-	xorl	%ebx,%esi
+	punpcklqdq	%xmm3,%xmm6
+	xorl	%eax,%esi
 	movl	%edx,%ebp
 	roll	$5,%edx
 	pxor	%xmm5,%xmm4
 	movdqa	%xmm0,96(%esp)
-	xorl	%eax,%esi
-	addl	%edx,%ecx
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
 	movdqa	%xmm7,%xmm0
+	rorl	$7,%edi
 	paddd	%xmm3,%xmm7
-	rorl	$7,%edi
-	addl	%esi,%ecx
+	addl	%edx,%ecx
 	pxor	%xmm6,%xmm4
 	addl	4(%esp),%ebx
-	xorl	%eax,%ebp
+	xorl	%edi,%ebp
 	movl	%ecx,%esi
 	roll	$5,%ecx
 	movdqa	%xmm4,%xmm6
 	movdqa	%xmm7,48(%esp)
-	xorl	%edi,%ebp
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%ebp,%ebx
 	pslld	$2,%xmm4
 	addl	8(%esp),%eax
-	xorl	%edi,%esi
+	xorl	%edx,%esi
 	psrld	$30,%xmm6
 	movl	%ebx,%ebp
 	roll	$5,%ebx
-	xorl	%edx,%esi
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%esi,%eax
 	por	%xmm6,%xmm4
 	addl	12(%esp),%edi
-	xorl	%edx,%ebp
+	xorl	%ecx,%ebp
 	movdqa	64(%esp),%xmm6
 	movl	%eax,%esi
 	roll	$5,%eax
-	xorl	%ecx,%ebp
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	pshufd	$238,%xmm3,%xmm7
 	addl	%eax,%edi
-	rorl	$7,%ebx
-	movdqa	%xmm4,%xmm7
-	addl	%ebp,%edi
 	addl	16(%esp),%edx
 	pxor	%xmm1,%xmm5
-.byte	102,15,58,15,251,8
-	xorl	%ecx,%esi
+	punpcklqdq	%xmm4,%xmm7
+	xorl	%ebx,%esi
 	movl	%edi,%ebp
 	roll	$5,%edi
 	pxor	%xmm6,%xmm5
 	movdqa	%xmm1,64(%esp)
-	xorl	%ebx,%esi
-	addl	%edi,%edx
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
 	movdqa	%xmm0,%xmm1
+	rorl	$7,%eax
 	paddd	%xmm4,%xmm0
-	rorl	$7,%eax
-	addl	%esi,%edx
+	addl	%edi,%edx
 	pxor	%xmm7,%xmm5
 	addl	20(%esp),%ecx
-	xorl	%ebx,%ebp
+	xorl	%eax,%ebp
 	movl	%edx,%esi
 	roll	$5,%edx
 	movdqa	%xmm5,%xmm7
 	movdqa	%xmm0,(%esp)
-	xorl	%eax,%ebp
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%edi
 	addl	%edx,%ecx
-	rorl	$7,%edi
-	addl	%ebp,%ecx
 	pslld	$2,%xmm5
 	addl	24(%esp),%ebx
-	xorl	%eax,%esi
+	xorl	%edi,%esi
 	psrld	$30,%xmm7
 	movl	%ecx,%ebp
 	roll	$5,%ecx
-	xorl	%edi,%esi
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%esi,%ebx
 	por	%xmm7,%xmm5
 	addl	28(%esp),%eax
-	xorl	%edi,%ebp
 	movdqa	80(%esp),%xmm7
+	rorl	$7,%ecx
 	movl	%ebx,%esi
+	xorl	%edx,%ebp
 	roll	$5,%ebx
-	xorl	%edx,%ebp
+	pshufd	$238,%xmm4,%xmm0
+	addl	%ebp,%eax
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	movdqa	%xmm5,%xmm0
-	addl	%ebp,%eax
-	movl	%ecx,%ebp
+	addl	32(%esp),%edi
 	pxor	%xmm2,%xmm6
-.byte	102,15,58,15,196,8
+	punpcklqdq	%xmm5,%xmm0
+	andl	%ecx,%esi
 	xorl	%edx,%ecx
-	addl	32(%esp),%edi
-	andl	%edx,%ebp
+	rorl	$7,%ebx
 	pxor	%xmm7,%xmm6
 	movdqa	%xmm2,80(%esp)
-	andl	%ecx,%esi
-	rorl	$7,%ebx
+	movl	%eax,%ebp
+	xorl	%ecx,%esi
+	roll	$5,%eax
 	movdqa	%xmm1,%xmm2
+	addl	%esi,%edi
 	paddd	%xmm5,%xmm1
-	addl	%ebp,%edi
-	movl	%eax,%ebp
+	xorl	%ebx,%ebp
 	pxor	%xmm0,%xmm6
-	roll	$5,%eax
-	addl	%esi,%edi
-	xorl	%edx,%ecx
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
+	addl	36(%esp),%edx
+	andl	%ebx,%ebp
 	movdqa	%xmm6,%xmm0
 	movdqa	%xmm1,16(%esp)
-	movl	%ebx,%esi
 	xorl	%ecx,%ebx
-	addl	36(%esp),%edx
-	andl	%ecx,%esi
-	pslld	$2,%xmm6
-	andl	%ebx,%ebp
 	rorl	$7,%eax
-	psrld	$30,%xmm0
-	addl	%esi,%edx
 	movl	%edi,%esi
+	xorl	%ebx,%ebp
 	roll	$5,%edi
+	pslld	$2,%xmm6
 	addl	%ebp,%edx
-	xorl	%ecx,%ebx
+	xorl	%eax,%esi
+	psrld	$30,%xmm0
+	xorl	%ebx,%eax
 	addl	%edi,%edx
-	por	%xmm0,%xmm6
-	movl	%eax,%ebp
-	xorl	%ebx,%eax
-	movdqa	96(%esp),%xmm0
 	addl	40(%esp),%ecx
-	andl	%ebx,%ebp
 	andl	%eax,%esi
+	xorl	%ebx,%eax
 	rorl	$7,%edi
-	addl	%ebp,%ecx
-	movdqa	%xmm6,%xmm1
+	por	%xmm0,%xmm6
 	movl	%edx,%ebp
+	xorl	%eax,%esi
+	movdqa	96(%esp),%xmm0
 	roll	$5,%edx
 	addl	%esi,%ecx
-	xorl	%ebx,%eax
+	xorl	%edi,%ebp
+	xorl	%eax,%edi
 	addl	%edx,%ecx
-	movl	%edi,%esi
-	xorl	%eax,%edi
+	pshufd	$238,%xmm5,%xmm1
 	addl	44(%esp),%ebx
-	andl	%eax,%esi
 	andl	%edi,%ebp
+	xorl	%eax,%edi
 	rorl	$7,%edx
-	addl	%esi,%ebx
 	movl	%ecx,%esi
+	xorl	%edi,%ebp
 	roll	$5,%ecx
 	addl	%ebp,%ebx
-	xorl	%eax,%edi
+	xorl	%edx,%esi
+	xorl	%edi,%edx
 	addl	%ecx,%ebx
-	movl	%edx,%ebp
+	addl	48(%esp),%eax
 	pxor	%xmm3,%xmm7
-.byte	102,15,58,15,205,8
+	punpcklqdq	%xmm6,%xmm1
+	andl	%edx,%esi
 	xorl	%edi,%edx
-	addl	48(%esp),%eax
-	andl	%edi,%ebp
+	rorl	$7,%ecx
 	pxor	%xmm0,%xmm7
 	movdqa	%xmm3,96(%esp)
-	andl	%edx,%esi
-	rorl	$7,%ecx
+	movl	%ebx,%ebp
+	xorl	%edx,%esi
+	roll	$5,%ebx
 	movdqa	144(%esp),%xmm3
+	addl	%esi,%eax
 	paddd	%xmm6,%xmm2
-	addl	%ebp,%eax
-	movl	%ebx,%ebp
+	xorl	%ecx,%ebp
 	pxor	%xmm1,%xmm7
-	roll	$5,%ebx
-	addl	%esi,%eax
-	xorl	%edi,%edx
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
+	addl	52(%esp),%edi
+	andl	%ecx,%ebp
 	movdqa	%xmm7,%xmm1
 	movdqa	%xmm2,32(%esp)
-	movl	%ecx,%esi
 	xorl	%edx,%ecx
-	addl	52(%esp),%edi
-	andl	%edx,%esi
-	pslld	$2,%xmm7
-	andl	%ecx,%ebp
 	rorl	$7,%ebx
-	psrld	$30,%xmm1
-	addl	%esi,%edi
 	movl	%eax,%esi
+	xorl	%ecx,%ebp
 	roll	$5,%eax
+	pslld	$2,%xmm7
 	addl	%ebp,%edi
-	xorl	%edx,%ecx
+	xorl	%ebx,%esi
+	psrld	$30,%xmm1
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
-	por	%xmm1,%xmm7
-	movl	%ebx,%ebp
-	xorl	%ecx,%ebx
-	movdqa	64(%esp),%xmm1
 	addl	56(%esp),%edx
-	andl	%ecx,%ebp
 	andl	%ebx,%esi
+	xorl	%ecx,%ebx
 	rorl	$7,%eax
-	addl	%ebp,%edx
-	movdqa	%xmm7,%xmm2
+	por	%xmm1,%xmm7
 	movl	%edi,%ebp
+	xorl	%ebx,%esi
+	movdqa	64(%esp),%xmm1
 	roll	$5,%edi
 	addl	%esi,%edx
-	xorl	%ecx,%ebx
+	xorl	%eax,%ebp
+	xorl	%ebx,%eax
 	addl	%edi,%edx
-	movl	%eax,%esi
-	xorl	%ebx,%eax
+	pshufd	$238,%xmm6,%xmm2
 	addl	60(%esp),%ecx
-	andl	%ebx,%esi
 	andl	%eax,%ebp
+	xorl	%ebx,%eax
 	rorl	$7,%edi
-	addl	%esi,%ecx
 	movl	%edx,%esi
+	xorl	%eax,%ebp
 	roll	$5,%edx
 	addl	%ebp,%ecx
-	xorl	%ebx,%eax
+	xorl	%edi,%esi
+	xorl	%eax,%edi
 	addl	%edx,%ecx
-	movl	%edi,%ebp
+	addl	(%esp),%ebx
 	pxor	%xmm4,%xmm0
-.byte	102,15,58,15,214,8
+	punpcklqdq	%xmm7,%xmm2
+	andl	%edi,%esi
 	xorl	%eax,%edi
-	addl	(%esp),%ebx
-	andl	%eax,%ebp
+	rorl	$7,%edx
 	pxor	%xmm1,%xmm0
 	movdqa	%xmm4,64(%esp)
-	andl	%edi,%esi
-	rorl	$7,%edx
+	movl	%ecx,%ebp
+	xorl	%edi,%esi
+	roll	$5,%ecx
 	movdqa	%xmm3,%xmm4
+	addl	%esi,%ebx
 	paddd	%xmm7,%xmm3
-	addl	%ebp,%ebx
-	movl	%ecx,%ebp
+	xorl	%edx,%ebp
 	pxor	%xmm2,%xmm0
-	roll	$5,%ecx
-	addl	%esi,%ebx
-	xorl	%eax,%edi
+	xorl	%edi,%edx
 	addl	%ecx,%ebx
+	addl	4(%esp),%eax
+	andl	%edx,%ebp
 	movdqa	%xmm0,%xmm2
 	movdqa	%xmm3,48(%esp)
-	movl	%edx,%esi
 	xorl	%edi,%edx
-	addl	4(%esp),%eax
-	andl	%edi,%esi
-	pslld	$2,%xmm0
-	andl	%edx,%ebp
 	rorl	$7,%ecx
-	psrld	$30,%xmm2
-	addl	%esi,%eax
 	movl	%ebx,%esi
+	xorl	%edx,%ebp
 	roll	$5,%ebx
+	pslld	$2,%xmm0
 	addl	%ebp,%eax
-	xorl	%edi,%edx
+	xorl	%ecx,%esi
+	psrld	$30,%xmm2
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	por	%xmm2,%xmm0
-	movl	%ecx,%ebp
-	xorl	%edx,%ecx
-	movdqa	80(%esp),%xmm2
 	addl	8(%esp),%edi
-	andl	%edx,%ebp
 	andl	%ecx,%esi
+	xorl	%edx,%ecx
 	rorl	$7,%ebx
-	addl	%ebp,%edi
-	movdqa	%xmm0,%xmm3
+	por	%xmm2,%xmm0
 	movl	%eax,%ebp
+	xorl	%ecx,%esi
+	movdqa	80(%esp),%xmm2
 	roll	$5,%eax
 	addl	%esi,%edi
-	xorl	%edx,%ecx
+	xorl	%ebx,%ebp
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
-	movl	%ebx,%esi
-	xorl	%ecx,%ebx
+	pshufd	$238,%xmm7,%xmm3
 	addl	12(%esp),%edx
-	andl	%ecx,%esi
 	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
 	rorl	$7,%eax
-	addl	%esi,%edx
 	movl	%edi,%esi
+	xorl	%ebx,%ebp
 	roll	$5,%edi
 	addl	%ebp,%edx
-	xorl	%ecx,%ebx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
 	addl	%edi,%edx
-	movl	%eax,%ebp
+	addl	16(%esp),%ecx
 	pxor	%xmm5,%xmm1
-.byte	102,15,58,15,223,8
+	punpcklqdq	%xmm0,%xmm3
+	andl	%eax,%esi
 	xorl	%ebx,%eax
-	addl	16(%esp),%ecx
-	andl	%ebx,%ebp
+	rorl	$7,%edi
 	pxor	%xmm2,%xmm1
 	movdqa	%xmm5,80(%esp)
-	andl	%eax,%esi
-	rorl	$7,%edi
+	movl	%edx,%ebp
+	xorl	%eax,%esi
+	roll	$5,%edx
 	movdqa	%xmm4,%xmm5
+	addl	%esi,%ecx
 	paddd	%xmm0,%xmm4
-	addl	%ebp,%ecx
-	movl	%edx,%ebp
+	xorl	%edi,%ebp
 	pxor	%xmm3,%xmm1
-	roll	$5,%edx
-	addl	%esi,%ecx
-	xorl	%ebx,%eax
+	xorl	%eax,%edi
 	addl	%edx,%ecx
+	addl	20(%esp),%ebx
+	andl	%edi,%ebp
 	movdqa	%xmm1,%xmm3
 	movdqa	%xmm4,(%esp)
-	movl	%edi,%esi
 	xorl	%eax,%edi
-	addl	20(%esp),%ebx
-	andl	%eax,%esi
-	pslld	$2,%xmm1
-	andl	%edi,%ebp
 	rorl	$7,%edx
-	psrld	$30,%xmm3
-	addl	%esi,%ebx
 	movl	%ecx,%esi
+	xorl	%edi,%ebp
 	roll	$5,%ecx
+	pslld	$2,%xmm1
 	addl	%ebp,%ebx
-	xorl	%eax,%edi
+	xorl	%edx,%esi
+	psrld	$30,%xmm3
+	xorl	%edi,%edx
 	addl	%ecx,%ebx
-	por	%xmm3,%xmm1
-	movl	%edx,%ebp
-	xorl	%edi,%edx
-	movdqa	96(%esp),%xmm3
 	addl	24(%esp),%eax
-	andl	%edi,%ebp
 	andl	%edx,%esi
+	xorl	%edi,%edx
 	rorl	$7,%ecx
-	addl	%ebp,%eax
-	movdqa	%xmm1,%xmm4
+	por	%xmm3,%xmm1
 	movl	%ebx,%ebp
+	xorl	%edx,%esi
+	movdqa	96(%esp),%xmm3
 	roll	$5,%ebx
 	addl	%esi,%eax
-	xorl	%edi,%edx
+	xorl	%ecx,%ebp
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	movl	%ecx,%esi
-	xorl	%edx,%ecx
+	pshufd	$238,%xmm0,%xmm4
 	addl	28(%esp),%edi
-	andl	%edx,%esi
 	andl	%ecx,%ebp
+	xorl	%edx,%ecx
 	rorl	$7,%ebx
-	addl	%esi,%edi
 	movl	%eax,%esi
+	xorl	%ecx,%ebp
 	roll	$5,%eax
 	addl	%ebp,%edi
-	xorl	%edx,%ecx
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
-	movl	%ebx,%ebp
+	addl	32(%esp),%edx
 	pxor	%xmm6,%xmm2
-.byte	102,15,58,15,224,8
+	punpcklqdq	%xmm1,%xmm4
+	andl	%ebx,%esi
 	xorl	%ecx,%ebx
-	addl	32(%esp),%edx
-	andl	%ecx,%ebp
+	rorl	$7,%eax
 	pxor	%xmm3,%xmm2
 	movdqa	%xmm6,96(%esp)
-	andl	%ebx,%esi
-	rorl	$7,%eax
+	movl	%edi,%ebp
+	xorl	%ebx,%esi
+	roll	$5,%edi
 	movdqa	%xmm5,%xmm6
+	addl	%esi,%edx
 	paddd	%xmm1,%xmm5
-	addl	%ebp,%edx
-	movl	%edi,%ebp
+	xorl	%eax,%ebp
 	pxor	%xmm4,%xmm2
-	roll	$5,%edi
-	addl	%esi,%edx
-	xorl	%ecx,%ebx
+	xorl	%ebx,%eax
 	addl	%edi,%edx
+	addl	36(%esp),%ecx
+	andl	%eax,%ebp
 	movdqa	%xmm2,%xmm4
 	movdqa	%xmm5,16(%esp)
-	movl	%eax,%esi
 	xorl	%ebx,%eax
-	addl	36(%esp),%ecx
-	andl	%ebx,%esi
-	pslld	$2,%xmm2
-	andl	%eax,%ebp
 	rorl	$7,%edi
-	psrld	$30,%xmm4
-	addl	%esi,%ecx
 	movl	%edx,%esi
+	xorl	%eax,%ebp
 	roll	$5,%edx
+	pslld	$2,%xmm2
 	addl	%ebp,%ecx
-	xorl	%ebx,%eax
+	xorl	%edi,%esi
+	psrld	$30,%xmm4
+	xorl	%eax,%edi
 	addl	%edx,%ecx
-	por	%xmm4,%xmm2
-	movl	%edi,%ebp
-	xorl	%eax,%edi
-	movdqa	64(%esp),%xmm4
 	addl	40(%esp),%ebx
-	andl	%eax,%ebp
 	andl	%edi,%esi
+	xorl	%eax,%edi
 	rorl	$7,%edx
-	addl	%ebp,%ebx
-	movdqa	%xmm2,%xmm5
+	por	%xmm4,%xmm2
 	movl	%ecx,%ebp
+	xorl	%edi,%esi
+	movdqa	64(%esp),%xmm4
 	roll	$5,%ecx
 	addl	%esi,%ebx
-	xorl	%eax,%edi
+	xorl	%edx,%ebp
+	xorl	%edi,%edx
 	addl	%ecx,%ebx
-	movl	%edx,%esi
-	xorl	%edi,%edx
+	pshufd	$238,%xmm1,%xmm5
 	addl	44(%esp),%eax
-	andl	%edi,%esi
 	andl	%edx,%ebp
+	xorl	%edi,%edx
 	rorl	$7,%ecx
-	addl	%esi,%eax
 	movl	%ebx,%esi
+	xorl	%edx,%ebp
 	roll	$5,%ebx
 	addl	%ebp,%eax
-	xorl	%edi,%edx
+	xorl	%edx,%esi
 	addl	%ebx,%eax
 	addl	48(%esp),%edi
 	pxor	%xmm7,%xmm3
-.byte	102,15,58,15,233,8
-	xorl	%edx,%esi
+	punpcklqdq	%xmm2,%xmm5
+	xorl	%ecx,%esi
 	movl	%eax,%ebp
 	roll	$5,%eax
 	pxor	%xmm4,%xmm3
 	movdqa	%xmm7,64(%esp)
-	xorl	%ecx,%esi
-	addl	%eax,%edi
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
 	movdqa	%xmm6,%xmm7
+	rorl	$7,%ebx
 	paddd	%xmm2,%xmm6
-	rorl	$7,%ebx
-	addl	%esi,%edi
+	addl	%eax,%edi
 	pxor	%xmm5,%xmm3
 	addl	52(%esp),%edx
-	xorl	%ecx,%ebp
+	xorl	%ebx,%ebp
 	movl	%edi,%esi
 	roll	$5,%edi
 	movdqa	%xmm3,%xmm5
 	movdqa	%xmm6,32(%esp)
-	xorl	%ebx,%ebp
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
 	addl	%edi,%edx
-	rorl	$7,%eax
-	addl	%ebp,%edx
 	pslld	$2,%xmm3
 	addl	56(%esp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	psrld	$30,%xmm5
 	movl	%edx,%ebp
 	roll	$5,%edx
-	xorl	%eax,%esi
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	rorl	$7,%edi
 	addl	%edx,%ecx
-	rorl	$7,%edi
-	addl	%esi,%ecx
 	por	%xmm5,%xmm3
 	addl	60(%esp),%ebx
-	xorl	%eax,%ebp
+	xorl	%edi,%ebp
 	movl	%ecx,%esi
 	roll	$5,%ecx
-	xorl	%edi,%ebp
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%ebp,%ebx
 	addl	(%esp),%eax
-	paddd	%xmm3,%xmm7
-	xorl	%edi,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%ebp
 	roll	$5,%ebx
-	xorl	%edx,%esi
-	movdqa	%xmm7,48(%esp)
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	rorl	$7,%ecx
+	paddd	%xmm3,%xmm7
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%esi,%eax
 	addl	4(%esp),%edi
-	xorl	%edx,%ebp
+	xorl	%ecx,%ebp
 	movl	%eax,%esi
+	movdqa	%xmm7,48(%esp)
 	roll	$5,%eax
-	xorl	%ecx,%ebp
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
 	addl	%eax,%edi
-	rorl	$7,%ebx
-	addl	%ebp,%edi
 	addl	8(%esp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	movl	%edi,%ebp
 	roll	$5,%edi
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	rorl	$7,%eax
 	addl	%edi,%edx
-	rorl	$7,%eax
-	addl	%esi,%edx
 	addl	12(%esp),%ecx
-	xorl	%ebx,%ebp
+	xorl	%eax,%ebp
 	movl	%edx,%esi
 	roll	$5,%edx
-	xorl	%eax,%ebp
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%edi
 	addl	%edx,%ecx
-	rorl	$7,%edi
-	addl	%ebp,%ecx
 	movl	196(%esp),%ebp
 	cmpl	200(%esp),%ebp
-	je	.L005done
+	je	.L007done
 	movdqa	160(%esp),%xmm7
 	movdqa	176(%esp),%xmm6
 	movdqu	(%ebp),%xmm0
@@ -2397,113 +2550,112 @@
 	movl	%ebp,196(%esp)
 	movdqa	%xmm7,96(%esp)
 	addl	16(%esp),%ebx
-	xorl	%eax,%esi
-.byte	102,15,56,0,206
+	xorl	%edi,%esi
 	movl	%ecx,%ebp
 	roll	$5,%ecx
-	paddd	%xmm7,%xmm0
-	xorl	%edi,%esi
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	rorl	$7,%edx
+.byte	102,15,56,0,206
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%esi,%ebx
-	movdqa	%xmm0,(%esp)
 	addl	20(%esp),%eax
-	xorl	%edi,%ebp
-	psubd	%xmm7,%xmm0
+	xorl	%edx,%ebp
 	movl	%ebx,%esi
+	paddd	%xmm7,%xmm0
 	roll	$5,%ebx
-	xorl	%edx,%ebp
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
+	movdqa	%xmm0,(%esp)
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%ebp,%eax
 	addl	24(%esp),%edi
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%ebp
+	psubd	%xmm7,%xmm0
 	roll	$5,%eax
-	xorl	%ecx,%esi
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	rorl	$7,%ebx
 	addl	%eax,%edi
-	rorl	$7,%ebx
-	addl	%esi,%edi
 	addl	28(%esp),%edx
-	xorl	%ecx,%ebp
+	xorl	%ebx,%ebp
 	movl	%edi,%esi
 	roll	$5,%edi
-	xorl	%ebx,%ebp
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
 	addl	%edi,%edx
-	rorl	$7,%eax
-	addl	%ebp,%edx
 	addl	32(%esp),%ecx
-	xorl	%ebx,%esi
-.byte	102,15,56,0,214
+	xorl	%eax,%esi
 	movl	%edx,%ebp
 	roll	$5,%edx
-	paddd	%xmm7,%xmm1
-	xorl	%eax,%esi
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	rorl	$7,%edi
+.byte	102,15,56,0,214
 	addl	%edx,%ecx
-	rorl	$7,%edi
-	addl	%esi,%ecx
-	movdqa	%xmm1,16(%esp)
 	addl	36(%esp),%ebx
-	xorl	%eax,%ebp
-	psubd	%xmm7,%xmm1
+	xorl	%edi,%ebp
 	movl	%ecx,%esi
+	paddd	%xmm7,%xmm1
 	roll	$5,%ecx
-	xorl	%edi,%ebp
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	rorl	$7,%edx
+	movdqa	%xmm1,16(%esp)
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%ebp,%ebx
 	addl	40(%esp),%eax
-	xorl	%edi,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%ebp
+	psubd	%xmm7,%xmm1
 	roll	$5,%ebx
-	xorl	%edx,%esi
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%esi,%eax
 	addl	44(%esp),%edi
-	xorl	%edx,%ebp
+	xorl	%ecx,%ebp
 	movl	%eax,%esi
 	roll	$5,%eax
-	xorl	%ecx,%ebp
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
 	addl	%eax,%edi
-	rorl	$7,%ebx
-	addl	%ebp,%edi
 	addl	48(%esp),%edx
-	xorl	%ecx,%esi
-.byte	102,15,56,0,222
+	xorl	%ebx,%esi
 	movl	%edi,%ebp
 	roll	$5,%edi
-	paddd	%xmm7,%xmm2
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	rorl	$7,%eax
+.byte	102,15,56,0,222
 	addl	%edi,%edx
-	rorl	$7,%eax
-	addl	%esi,%edx
-	movdqa	%xmm2,32(%esp)
 	addl	52(%esp),%ecx
-	xorl	%ebx,%ebp
-	psubd	%xmm7,%xmm2
+	xorl	%eax,%ebp
 	movl	%edx,%esi
+	paddd	%xmm7,%xmm2
 	roll	$5,%edx
-	xorl	%eax,%ebp
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%edi
+	movdqa	%xmm2,32(%esp)
 	addl	%edx,%ecx
-	rorl	$7,%edi
-	addl	%ebp,%ecx
 	addl	56(%esp),%ebx
-	xorl	%eax,%esi
+	xorl	%edi,%esi
 	movl	%ecx,%ebp
+	psubd	%xmm7,%xmm2
 	roll	$5,%ecx
-	xorl	%edi,%esi
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%esi,%ebx
 	addl	60(%esp),%eax
-	xorl	%edi,%ebp
+	xorl	%edx,%ebp
 	movl	%ebx,%esi
 	roll	$5,%ebx
-	xorl	%edx,%ebp
+	addl	%ebp,%eax
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%ebp,%eax
 	movl	192(%esp),%ebp
 	addl	(%ebp),%eax
 	addl	4(%ebp),%esi
@@ -2513,109 +2665,112 @@
 	movl	%esi,4(%ebp)
 	addl	16(%ebp),%edi
 	movl	%ecx,8(%ebp)
-	movl	%esi,%ebx
+	movl	%ecx,%ebx
 	movl	%edx,12(%ebp)
+	xorl	%edx,%ebx
 	movl	%edi,16(%ebp)
-	movdqa	%xmm1,%xmm4
-	jmp	.L004loop
+	movl	%esi,%ebp
+	pshufd	$238,%xmm0,%xmm4
+	andl	%ebx,%esi
+	movl	%ebp,%ebx
+	jmp	.L006loop
 .align	16
-.L005done:
+.L007done:
 	addl	16(%esp),%ebx
-	xorl	%eax,%esi
+	xorl	%edi,%esi
 	movl	%ecx,%ebp
 	roll	$5,%ecx
-	xorl	%edi,%esi
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%esi,%ebx
 	addl	20(%esp),%eax
-	xorl	%edi,%ebp
+	xorl	%edx,%ebp
 	movl	%ebx,%esi
 	roll	$5,%ebx
-	xorl	%edx,%ebp
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%ebp,%eax
 	addl	24(%esp),%edi
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%ebp
 	roll	$5,%eax
-	xorl	%ecx,%esi
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	rorl	$7,%ebx
 	addl	%eax,%edi
-	rorl	$7,%ebx
-	addl	%esi,%edi
 	addl	28(%esp),%edx
-	xorl	%ecx,%ebp
+	xorl	%ebx,%ebp
 	movl	%edi,%esi
 	roll	$5,%edi
-	xorl	%ebx,%ebp
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
 	addl	%edi,%edx
-	rorl	$7,%eax
-	addl	%ebp,%edx
 	addl	32(%esp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	movl	%edx,%ebp
 	roll	$5,%edx
-	xorl	%eax,%esi
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	rorl	$7,%edi
 	addl	%edx,%ecx
-	rorl	$7,%edi
-	addl	%esi,%ecx
 	addl	36(%esp),%ebx
-	xorl	%eax,%ebp
+	xorl	%edi,%ebp
 	movl	%ecx,%esi
 	roll	$5,%ecx
-	xorl	%edi,%ebp
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%ebp,%ebx
 	addl	40(%esp),%eax
-	xorl	%edi,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%ebp
 	roll	$5,%ebx
-	xorl	%edx,%esi
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%esi,%eax
 	addl	44(%esp),%edi
-	xorl	%edx,%ebp
+	xorl	%ecx,%ebp
 	movl	%eax,%esi
 	roll	$5,%eax
-	xorl	%ecx,%ebp
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
 	addl	%eax,%edi
-	rorl	$7,%ebx
-	addl	%ebp,%edi
 	addl	48(%esp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	movl	%edi,%ebp
 	roll	$5,%edi
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	rorl	$7,%eax
 	addl	%edi,%edx
-	rorl	$7,%eax
-	addl	%esi,%edx
 	addl	52(%esp),%ecx
-	xorl	%ebx,%ebp
+	xorl	%eax,%ebp
 	movl	%edx,%esi
 	roll	$5,%edx
-	xorl	%eax,%ebp
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%edi
 	addl	%edx,%ecx
-	rorl	$7,%edi
-	addl	%ebp,%ecx
 	addl	56(%esp),%ebx
-	xorl	%eax,%esi
+	xorl	%edi,%esi
 	movl	%ecx,%ebp
 	roll	$5,%ecx
-	xorl	%edi,%esi
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%esi,%ebx
 	addl	60(%esp),%eax
-	xorl	%edi,%ebp
+	xorl	%edx,%ebp
 	movl	%ebx,%esi
 	roll	$5,%ebx
-	xorl	%edx,%ebp
+	addl	%ebp,%eax
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%ebp,%eax
 	movl	192(%esp),%ebp
 	addl	(%ebp),%eax
 	movl	204(%esp),%esp
@@ -2641,10 +2796,10 @@
 	pushl	%ebx
 	pushl	%esi
 	pushl	%edi
-	call	.L006pic_point
-.L006pic_point:
+	call	.L008pic_point
+.L008pic_point:
 	popl	%ebp
-	leal	.LK_XX_XX-.L006pic_point(%ebp),%ebp
+	leal	.LK_XX_XX-.L008pic_point(%ebp),%ebp
 .Lavx_shortcut:
 	vzeroall
 	vmovdqa	(%ebp),%xmm7
@@ -2689,893 +2844,874 @@
 	vpaddd	%xmm7,%xmm1,%xmm5
 	vpaddd	%xmm7,%xmm2,%xmm6
 	vmovdqa	%xmm4,(%esp)
+	movl	%ecx,%ebp
 	vmovdqa	%xmm5,16(%esp)
+	xorl	%edx,%ebp
 	vmovdqa	%xmm6,32(%esp)
-	jmp	.L007loop
+	andl	%ebp,%esi
+	jmp	.L009loop
 .align	16
-.L007loop:
-	addl	(%esp),%edi
-	xorl	%edx,%ecx
+.L009loop:
+	shrdl	$2,%ebx,%ebx
+	xorl	%edx,%esi
 	vpalignr	$8,%xmm0,%xmm1,%xmm4
 	movl	%eax,%ebp
-	shldl	$5,%eax,%eax
+	addl	(%esp),%edi
 	vpaddd	%xmm3,%xmm7,%xmm7
 	vmovdqa	%xmm0,64(%esp)
-	andl	%ecx,%esi
-	xorl	%edx,%ecx
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
 	vpsrldq	$4,%xmm3,%xmm6
-	xorl	%edx,%esi
+	addl	%esi,%edi
+	andl	%ebx,%ebp
+	vpxor	%xmm0,%xmm4,%xmm4
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
-	vpxor	%xmm0,%xmm4,%xmm4
-	shrdl	$2,%ebx,%ebx
-	addl	%esi,%edi
 	vpxor	%xmm2,%xmm6,%xmm6
-	addl	4(%esp),%edx
-	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%ebp
 	vmovdqa	%xmm7,48(%esp)
 	movl	%edi,%esi
+	addl	4(%esp),%edx
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ebx,%eax
 	shldl	$5,%edi,%edi
-	vpxor	%xmm6,%xmm4,%xmm4
-	andl	%ebx,%ebp
-	xorl	%ecx,%ebx
-	xorl	%ecx,%ebp
-	addl	%edi,%edx
+	addl	%ebp,%edx
+	andl	%eax,%esi
 	vpsrld	$31,%xmm4,%xmm6
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
-	addl	8(%esp),%ecx
 	xorl	%ebx,%eax
+	addl	%edi,%edx
+	shrdl	$7,%edi,%edi
+	xorl	%ebx,%esi
 	vpslldq	$12,%xmm4,%xmm0
 	vpaddd	%xmm4,%xmm4,%xmm4
 	movl	%edx,%ebp
+	addl	8(%esp),%ecx
+	xorl	%eax,%edi
 	shldl	$5,%edx,%edx
-	andl	%eax,%esi
-	xorl	%ebx,%eax
 	vpsrld	$30,%xmm0,%xmm7
 	vpor	%xmm6,%xmm4,%xmm4
-	xorl	%ebx,%esi
+	addl	%esi,%ecx
+	andl	%edi,%ebp
+	xorl	%eax,%edi
 	addl	%edx,%ecx
-	shrdl	$7,%edi,%edi
-	addl	%esi,%ecx
 	vpslld	$2,%xmm0,%xmm0
-	addl	12(%esp),%ebx
-	xorl	%eax,%edi
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%ebp
 	vpxor	%xmm7,%xmm4,%xmm4
 	movl	%ecx,%esi
+	addl	12(%esp),%ebx
+	xorl	%edi,%edx
 	shldl	$5,%ecx,%ecx
-	andl	%edi,%ebp
-	xorl	%eax,%edi
 	vpxor	%xmm0,%xmm4,%xmm4
-	xorl	%eax,%ebp
-	addl	%ecx,%ebx
+	addl	%ebp,%ebx
+	andl	%edx,%esi
 	vmovdqa	96(%esp),%xmm0
-	shrdl	$7,%edx,%edx
-	addl	%ebp,%ebx
-	addl	16(%esp),%eax
 	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	shrdl	$7,%ecx,%ecx
+	xorl	%edi,%esi
 	vpalignr	$8,%xmm1,%xmm2,%xmm5
 	movl	%ebx,%ebp
-	shldl	$5,%ebx,%ebx
+	addl	16(%esp),%eax
 	vpaddd	%xmm4,%xmm0,%xmm0
 	vmovdqa	%xmm1,80(%esp)
-	andl	%edx,%esi
-	xorl	%edi,%edx
+	xorl	%edx,%ecx
+	shldl	$5,%ebx,%ebx
 	vpsrldq	$4,%xmm4,%xmm7
-	xorl	%edi,%esi
+	addl	%esi,%eax
+	andl	%ecx,%ebp
+	vpxor	%xmm1,%xmm5,%xmm5
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	vpxor	%xmm1,%xmm5,%xmm5
-	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	vpxor	%xmm3,%xmm7,%xmm7
-	addl	20(%esp),%edi
-	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%ebp
 	vmovdqa	%xmm0,(%esp)
 	movl	%eax,%esi
+	addl	20(%esp),%edi
+	vpxor	%xmm7,%xmm5,%xmm5
+	xorl	%ecx,%ebx
 	shldl	$5,%eax,%eax
-	vpxor	%xmm7,%xmm5,%xmm5
-	andl	%ecx,%ebp
-	xorl	%edx,%ecx
-	xorl	%edx,%ebp
-	addl	%eax,%edi
+	addl	%ebp,%edi
+	andl	%ebx,%esi
 	vpsrld	$31,%xmm5,%xmm7
-	shrdl	$7,%ebx,%ebx
-	addl	%ebp,%edi
-	addl	24(%esp),%edx
 	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%esi
 	vpslldq	$12,%xmm5,%xmm1
 	vpaddd	%xmm5,%xmm5,%xmm5
 	movl	%edi,%ebp
+	addl	24(%esp),%edx
+	xorl	%ebx,%eax
 	shldl	$5,%edi,%edi
-	andl	%ebx,%esi
-	xorl	%ecx,%ebx
 	vpsrld	$30,%xmm1,%xmm0
 	vpor	%xmm7,%xmm5,%xmm5
-	xorl	%ecx,%esi
+	addl	%esi,%edx
+	andl	%eax,%ebp
+	xorl	%ebx,%eax
 	addl	%edi,%edx
-	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	vpslld	$2,%xmm1,%xmm1
-	addl	28(%esp),%ecx
-	xorl	%ebx,%eax
+	shrdl	$7,%edi,%edi
+	xorl	%ebx,%ebp
 	vpxor	%xmm0,%xmm5,%xmm5
 	movl	%edx,%esi
+	addl	28(%esp),%ecx
+	xorl	%eax,%edi
 	shldl	$5,%edx,%edx
-	andl	%eax,%ebp
-	xorl	%ebx,%eax
 	vpxor	%xmm1,%xmm5,%xmm5
-	xorl	%ebx,%ebp
-	addl	%edx,%ecx
+	addl	%ebp,%ecx
+	andl	%edi,%esi
 	vmovdqa	112(%esp),%xmm1
-	shrdl	$7,%edi,%edi
-	addl	%ebp,%ecx
-	addl	32(%esp),%ebx
 	xorl	%eax,%edi
+	addl	%edx,%ecx
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%esi
 	vpalignr	$8,%xmm2,%xmm3,%xmm6
 	movl	%ecx,%ebp
-	shldl	$5,%ecx,%ecx
+	addl	32(%esp),%ebx
 	vpaddd	%xmm5,%xmm1,%xmm1
 	vmovdqa	%xmm2,96(%esp)
-	andl	%edi,%esi
-	xorl	%eax,%edi
+	xorl	%edi,%edx
+	shldl	$5,%ecx,%ecx
 	vpsrldq	$4,%xmm5,%xmm0
-	xorl	%eax,%esi
+	addl	%esi,%ebx
+	andl	%edx,%ebp
+	vpxor	%xmm2,%xmm6,%xmm6
+	xorl	%edi,%edx
 	addl	%ecx,%ebx
-	vpxor	%xmm2,%xmm6,%xmm6
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	vpxor	%xmm4,%xmm0,%xmm0
-	addl	36(%esp),%eax
-	xorl	%edi,%edx
+	shrdl	$7,%ecx,%ecx
+	xorl	%edi,%ebp
 	vmovdqa	%xmm1,16(%esp)
 	movl	%ebx,%esi
+	addl	36(%esp),%eax
+	vpxor	%xmm0,%xmm6,%xmm6
+	xorl	%edx,%ecx
 	shldl	$5,%ebx,%ebx
-	vpxor	%xmm0,%xmm6,%xmm6
-	andl	%edx,%ebp
-	xorl	%edi,%edx
-	xorl	%edi,%ebp
-	addl	%ebx,%eax
+	addl	%ebp,%eax
+	andl	%ecx,%esi
 	vpsrld	$31,%xmm6,%xmm0
-	shrdl	$7,%ecx,%ecx
-	addl	%ebp,%eax
-	addl	40(%esp),%edi
 	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%esi
 	vpslldq	$12,%xmm6,%xmm2
 	vpaddd	%xmm6,%xmm6,%xmm6
 	movl	%eax,%ebp
+	addl	40(%esp),%edi
+	xorl	%ecx,%ebx
 	shldl	$5,%eax,%eax
-	andl	%ecx,%esi
-	xorl	%edx,%ecx
 	vpsrld	$30,%xmm2,%xmm1
 	vpor	%xmm0,%xmm6,%xmm6
-	xorl	%edx,%esi
+	addl	%esi,%edi
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
-	shrdl	$7,%ebx,%ebx
-	addl	%esi,%edi
 	vpslld	$2,%xmm2,%xmm2
 	vmovdqa	64(%esp),%xmm0
-	addl	44(%esp),%edx
-	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%ebp
 	vpxor	%xmm1,%xmm6,%xmm6
 	movl	%edi,%esi
+	addl	44(%esp),%edx
+	xorl	%ebx,%eax
 	shldl	$5,%edi,%edi
-	andl	%ebx,%ebp
-	xorl	%ecx,%ebx
 	vpxor	%xmm2,%xmm6,%xmm6
-	xorl	%ecx,%ebp
-	addl	%edi,%edx
+	addl	%ebp,%edx
+	andl	%eax,%esi
 	vmovdqa	112(%esp),%xmm2
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
-	addl	48(%esp),%ecx
 	xorl	%ebx,%eax
+	addl	%edi,%edx
+	shrdl	$7,%edi,%edi
+	xorl	%ebx,%esi
 	vpalignr	$8,%xmm3,%xmm4,%xmm7
 	movl	%edx,%ebp
-	shldl	$5,%edx,%edx
+	addl	48(%esp),%ecx
 	vpaddd	%xmm6,%xmm2,%xmm2
 	vmovdqa	%xmm3,64(%esp)
-	andl	%eax,%esi
-	xorl	%ebx,%eax
+	xorl	%eax,%edi
+	shldl	$5,%edx,%edx
 	vpsrldq	$4,%xmm6,%xmm1
-	xorl	%ebx,%esi
+	addl	%esi,%ecx
+	andl	%edi,%ebp
+	vpxor	%xmm3,%xmm7,%xmm7
+	xorl	%eax,%edi
 	addl	%edx,%ecx
-	vpxor	%xmm3,%xmm7,%xmm7
-	shrdl	$7,%edi,%edi
-	addl	%esi,%ecx
 	vpxor	%xmm5,%xmm1,%xmm1
-	addl	52(%esp),%ebx
-	xorl	%eax,%edi
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%ebp
 	vmovdqa	%xmm2,32(%esp)
 	movl	%ecx,%esi
+	addl	52(%esp),%ebx
+	vpxor	%xmm1,%xmm7,%xmm7
+	xorl	%edi,%edx
 	shldl	$5,%ecx,%ecx
-	vpxor	%xmm1,%xmm7,%xmm7
-	andl	%edi,%ebp
-	xorl	%eax,%edi
-	xorl	%eax,%ebp
-	addl	%ecx,%ebx
+	addl	%ebp,%ebx
+	andl	%edx,%esi
 	vpsrld	$31,%xmm7,%xmm1
-	shrdl	$7,%edx,%edx
-	addl	%ebp,%ebx
-	addl	56(%esp),%eax
 	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	shrdl	$7,%ecx,%ecx
+	xorl	%edi,%esi
 	vpslldq	$12,%xmm7,%xmm3
 	vpaddd	%xmm7,%xmm7,%xmm7
 	movl	%ebx,%ebp
+	addl	56(%esp),%eax
+	xorl	%edx,%ecx
 	shldl	$5,%ebx,%ebx
-	andl	%edx,%esi
-	xorl	%edi,%edx
 	vpsrld	$30,%xmm3,%xmm2
 	vpor	%xmm1,%xmm7,%xmm7
-	xorl	%edi,%esi
+	addl	%esi,%eax
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	vpslld	$2,%xmm3,%xmm3
 	vmovdqa	80(%esp),%xmm1
-	addl	60(%esp),%edi
-	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%ebp
 	vpxor	%xmm2,%xmm7,%xmm7
 	movl	%eax,%esi
+	addl	60(%esp),%edi
+	xorl	%ecx,%ebx
 	shldl	$5,%eax,%eax
-	andl	%ecx,%ebp
-	xorl	%edx,%ecx
 	vpxor	%xmm3,%xmm7,%xmm7
-	xorl	%edx,%ebp
+	addl	%ebp,%edi
+	andl	%ebx,%esi
+	vmovdqa	112(%esp),%xmm3
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
-	vmovdqa	112(%esp),%xmm3
-	shrdl	$7,%ebx,%ebx
-	addl	%ebp,%edi
 	vpalignr	$8,%xmm6,%xmm7,%xmm2
 	vpxor	%xmm4,%xmm0,%xmm0
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%esi
+	movl	%edi,%ebp
 	addl	(%esp),%edx
-	xorl	%ecx,%ebx
-	movl	%edi,%ebp
-	shldl	$5,%edi,%edi
 	vpxor	%xmm1,%xmm0,%xmm0
 	vmovdqa	%xmm4,80(%esp)
-	andl	%ebx,%esi
-	xorl	%ecx,%ebx
+	xorl	%ebx,%eax
+	shldl	$5,%edi,%edi
 	vmovdqa	%xmm3,%xmm4
 	vpaddd	%xmm7,%xmm3,%xmm3
-	xorl	%ecx,%esi
-	addl	%edi,%edx
+	addl	%esi,%edx
+	andl	%eax,%ebp
 	vpxor	%xmm2,%xmm0,%xmm0
-	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
-	addl	4(%esp),%ecx
 	xorl	%ebx,%eax
+	addl	%edi,%edx
+	shrdl	$7,%edi,%edi
+	xorl	%ebx,%ebp
 	vpsrld	$30,%xmm0,%xmm2
 	vmovdqa	%xmm3,48(%esp)
 	movl	%edx,%esi
+	addl	4(%esp),%ecx
+	xorl	%eax,%edi
 	shldl	$5,%edx,%edx
-	andl	%eax,%ebp
-	xorl	%ebx,%eax
 	vpslld	$2,%xmm0,%xmm0
-	xorl	%ebx,%ebp
-	addl	%edx,%ecx
-	shrdl	$7,%edi,%edi
 	addl	%ebp,%ecx
-	addl	8(%esp),%ebx
+	andl	%edi,%esi
 	xorl	%eax,%edi
+	addl	%edx,%ecx
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%esi
 	movl	%ecx,%ebp
+	addl	8(%esp),%ebx
+	vpor	%xmm2,%xmm0,%xmm0
+	xorl	%edi,%edx
 	shldl	$5,%ecx,%ecx
-	vpor	%xmm2,%xmm0,%xmm0
-	andl	%edi,%esi
-	xorl	%eax,%edi
 	vmovdqa	96(%esp),%xmm2
-	xorl	%eax,%esi
+	addl	%esi,%ebx
+	andl	%edx,%ebp
+	xorl	%edi,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	addl	12(%esp),%eax
-	xorl	%edi,%edx
+	xorl	%edi,%ebp
 	movl	%ebx,%esi
 	shldl	$5,%ebx,%ebx
-	andl	%edx,%ebp
-	xorl	%edi,%edx
-	xorl	%edi,%ebp
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%ebp,%eax
 	vpalignr	$8,%xmm7,%xmm0,%xmm3
 	vpxor	%xmm5,%xmm1,%xmm1
 	addl	16(%esp),%edi
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%ebp
 	shldl	$5,%eax,%eax
 	vpxor	%xmm2,%xmm1,%xmm1
 	vmovdqa	%xmm5,96(%esp)
-	xorl	%ecx,%esi
-	addl	%eax,%edi
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
 	vmovdqa	%xmm4,%xmm5
 	vpaddd	%xmm0,%xmm4,%xmm4
 	shrdl	$7,%ebx,%ebx
-	addl	%esi,%edi
+	addl	%eax,%edi
 	vpxor	%xmm3,%xmm1,%xmm1
 	addl	20(%esp),%edx
-	xorl	%ecx,%ebp
+	xorl	%ebx,%ebp
 	movl	%edi,%esi
 	shldl	$5,%edi,%edi
 	vpsrld	$30,%xmm1,%xmm3
 	vmovdqa	%xmm4,(%esp)
-	xorl	%ebx,%ebp
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
 	addl	%edi,%edx
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
 	vpslld	$2,%xmm1,%xmm1
 	addl	24(%esp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	movl	%edx,%ebp
 	shldl	$5,%edx,%edx
-	xorl	%eax,%esi
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	shrdl	$7,%edi,%edi
 	addl	%edx,%ecx
-	shrdl	$7,%edi,%edi
-	addl	%esi,%ecx
 	vpor	%xmm3,%xmm1,%xmm1
 	addl	28(%esp),%ebx
-	xorl	%eax,%ebp
+	xorl	%edi,%ebp
 	vmovdqa	64(%esp),%xmm3
 	movl	%ecx,%esi
 	shldl	$5,%ecx,%ecx
-	xorl	%edi,%ebp
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%ebp,%ebx
 	vpalignr	$8,%xmm0,%xmm1,%xmm4
 	vpxor	%xmm6,%xmm2,%xmm2
 	addl	32(%esp),%eax
-	xorl	%edi,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%ebp
 	shldl	$5,%ebx,%ebx
 	vpxor	%xmm3,%xmm2,%xmm2
 	vmovdqa	%xmm6,64(%esp)
-	xorl	%edx,%esi
-	addl	%ebx,%eax
+	addl	%esi,%eax
+	xorl	%edx,%ebp
 	vmovdqa	128(%esp),%xmm6
 	vpaddd	%xmm1,%xmm5,%xmm5
 	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
+	addl	%ebx,%eax
 	vpxor	%xmm4,%xmm2,%xmm2
 	addl	36(%esp),%edi
-	xorl	%edx,%ebp
+	xorl	%ecx,%ebp
 	movl	%eax,%esi
 	shldl	$5,%eax,%eax
 	vpsrld	$30,%xmm2,%xmm4
 	vmovdqa	%xmm5,16(%esp)
-	xorl	%ecx,%ebp
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%edi
-	shrdl	$7,%ebx,%ebx
-	addl	%ebp,%edi
 	vpslld	$2,%xmm2,%xmm2
 	addl	40(%esp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	movl	%edi,%ebp
 	shldl	$5,%edi,%edi
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	shrdl	$7,%eax,%eax
 	addl	%edi,%edx
-	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	vpor	%xmm4,%xmm2,%xmm2
 	addl	44(%esp),%ecx
-	xorl	%ebx,%ebp
+	xorl	%eax,%ebp
 	vmovdqa	80(%esp),%xmm4
 	movl	%edx,%esi
 	shldl	$5,%edx,%edx
-	xorl	%eax,%ebp
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%edi,%edi
 	addl	%edx,%ecx
-	shrdl	$7,%edi,%edi
-	addl	%ebp,%ecx
 	vpalignr	$8,%xmm1,%xmm2,%xmm5
 	vpxor	%xmm7,%xmm3,%xmm3
 	addl	48(%esp),%ebx
-	xorl	%eax,%esi
+	xorl	%edi,%esi
 	movl	%ecx,%ebp
 	shldl	$5,%ecx,%ecx
 	vpxor	%xmm4,%xmm3,%xmm3
 	vmovdqa	%xmm7,80(%esp)
-	xorl	%edi,%esi
-	addl	%ecx,%ebx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
 	vmovdqa	%xmm6,%xmm7
 	vpaddd	%xmm2,%xmm6,%xmm6
 	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
+	addl	%ecx,%ebx
 	vpxor	%xmm5,%xmm3,%xmm3
 	addl	52(%esp),%eax
-	xorl	%edi,%ebp
+	xorl	%edx,%ebp
 	movl	%ebx,%esi
 	shldl	$5,%ebx,%ebx
 	vpsrld	$30,%xmm3,%xmm5
 	vmovdqa	%xmm6,32(%esp)
-	xorl	%edx,%ebp
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%ebp,%eax
 	vpslld	$2,%xmm3,%xmm3
 	addl	56(%esp),%edi
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%ebp
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%esi
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%edi
-	shrdl	$7,%ebx,%ebx
-	addl	%esi,%edi
 	vpor	%xmm5,%xmm3,%xmm3
 	addl	60(%esp),%edx
-	xorl	%ecx,%ebp
+	xorl	%ebx,%ebp
 	vmovdqa	96(%esp),%xmm5
 	movl	%edi,%esi
 	shldl	$5,%edi,%edi
-	xorl	%ebx,%ebp
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
 	addl	%edi,%edx
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
 	vpalignr	$8,%xmm2,%xmm3,%xmm6
 	vpxor	%xmm0,%xmm4,%xmm4
 	addl	(%esp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	movl	%edx,%ebp
 	shldl	$5,%edx,%edx
 	vpxor	%xmm5,%xmm4,%xmm4
 	vmovdqa	%xmm0,96(%esp)
-	xorl	%eax,%esi
-	addl	%edx,%ecx
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
 	vmovdqa	%xmm7,%xmm0
 	vpaddd	%xmm3,%xmm7,%xmm7
 	shrdl	$7,%edi,%edi
-	addl	%esi,%ecx
+	addl	%edx,%ecx
 	vpxor	%xmm6,%xmm4,%xmm4
 	addl	4(%esp),%ebx
-	xorl	%eax,%ebp
+	xorl	%edi,%ebp
 	movl	%ecx,%esi
 	shldl	$5,%ecx,%ecx
 	vpsrld	$30,%xmm4,%xmm6
 	vmovdqa	%xmm7,48(%esp)
-	xorl	%edi,%ebp
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%ebp,%ebx
 	vpslld	$2,%xmm4,%xmm4
 	addl	8(%esp),%eax
-	xorl	%edi,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%ebp
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%esi
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	vpor	%xmm6,%xmm4,%xmm4
 	addl	12(%esp),%edi
-	xorl	%edx,%ebp
+	xorl	%ecx,%ebp
 	vmovdqa	64(%esp),%xmm6
 	movl	%eax,%esi
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%ebp
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%edi
-	shrdl	$7,%ebx,%ebx
-	addl	%ebp,%edi
 	vpalignr	$8,%xmm3,%xmm4,%xmm7
 	vpxor	%xmm1,%xmm5,%xmm5
 	addl	16(%esp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	movl	%edi,%ebp
 	shldl	$5,%edi,%edi
 	vpxor	%xmm6,%xmm5,%xmm5
 	vmovdqa	%xmm1,64(%esp)
-	xorl	%ebx,%esi
-	addl	%edi,%edx
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
 	vmovdqa	%xmm0,%xmm1
 	vpaddd	%xmm4,%xmm0,%xmm0
 	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
+	addl	%edi,%edx
 	vpxor	%xmm7,%xmm5,%xmm5
 	addl	20(%esp),%ecx
-	xorl	%ebx,%ebp
+	xorl	%eax,%ebp
 	movl	%edx,%esi
 	shldl	$5,%edx,%edx
 	vpsrld	$30,%xmm5,%xmm7
 	vmovdqa	%xmm0,(%esp)
-	xorl	%eax,%ebp
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%edi,%edi
 	addl	%edx,%ecx
-	shrdl	$7,%edi,%edi
-	addl	%ebp,%ecx
 	vpslld	$2,%xmm5,%xmm5
 	addl	24(%esp),%ebx
-	xorl	%eax,%esi
+	xorl	%edi,%esi
 	movl	%ecx,%ebp
 	shldl	$5,%ecx,%ecx
-	xorl	%edi,%esi
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	vpor	%xmm7,%xmm5,%xmm5
 	addl	28(%esp),%eax
-	xorl	%edi,%ebp
 	vmovdqa	80(%esp),%xmm7
+	shrdl	$7,%ecx,%ecx
 	movl	%ebx,%esi
+	xorl	%edx,%ebp
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%ebp
+	addl	%ebp,%eax
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%ebp,%eax
 	vpalignr	$8,%xmm4,%xmm5,%xmm0
 	vpxor	%xmm2,%xmm6,%xmm6
-	movl	%ecx,%ebp
+	addl	32(%esp),%edi
+	andl	%ecx,%esi
 	xorl	%edx,%ecx
-	addl	32(%esp),%edi
-	andl	%edx,%ebp
+	shrdl	$7,%ebx,%ebx
 	vpxor	%xmm7,%xmm6,%xmm6
 	vmovdqa	%xmm2,80(%esp)
-	andl	%ecx,%esi
-	shrdl	$7,%ebx,%ebx
+	movl	%eax,%ebp
+	xorl	%ecx,%esi
 	vmovdqa	%xmm1,%xmm2
 	vpaddd	%xmm5,%xmm1,%xmm1
-	addl	%ebp,%edi
-	movl	%eax,%ebp
-	vpxor	%xmm0,%xmm6,%xmm6
 	shldl	$5,%eax,%eax
 	addl	%esi,%edi
-	xorl	%edx,%ecx
+	vpxor	%xmm0,%xmm6,%xmm6
+	xorl	%ebx,%ebp
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
+	addl	36(%esp),%edx
 	vpsrld	$30,%xmm6,%xmm0
 	vmovdqa	%xmm1,16(%esp)
-	movl	%ebx,%esi
+	andl	%ebx,%ebp
 	xorl	%ecx,%ebx
-	addl	36(%esp),%edx
-	andl	%ecx,%esi
-	vpslld	$2,%xmm6,%xmm6
-	andl	%ebx,%ebp
 	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	movl	%edi,%esi
+	vpslld	$2,%xmm6,%xmm6
+	xorl	%ebx,%ebp
 	shldl	$5,%edi,%edi
 	addl	%ebp,%edx
-	xorl	%ecx,%ebx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
 	addl	%edi,%edx
+	addl	40(%esp),%ecx
+	andl	%eax,%esi
 	vpor	%xmm0,%xmm6,%xmm6
-	movl	%eax,%ebp
 	xorl	%ebx,%eax
+	shrdl	$7,%edi,%edi
 	vmovdqa	96(%esp),%xmm0
-	addl	40(%esp),%ecx
-	andl	%ebx,%ebp
-	andl	%eax,%esi
-	shrdl	$7,%edi,%edi
-	addl	%ebp,%ecx
 	movl	%edx,%ebp
+	xorl	%eax,%esi
 	shldl	$5,%edx,%edx
 	addl	%esi,%ecx
-	xorl	%ebx,%eax
+	xorl	%edi,%ebp
+	xorl	%eax,%edi
 	addl	%edx,%ecx
-	movl	%edi,%esi
-	xorl	%eax,%edi
 	addl	44(%esp),%ebx
-	andl	%eax,%esi
 	andl	%edi,%ebp
+	xorl	%eax,%edi
 	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	movl	%ecx,%esi
+	xorl	%edi,%ebp
 	shldl	$5,%ecx,%ecx
 	addl	%ebp,%ebx
-	xorl	%eax,%edi
+	xorl	%edx,%esi
+	xorl	%edi,%edx
 	addl	%ecx,%ebx
 	vpalignr	$8,%xmm5,%xmm6,%xmm1
 	vpxor	%xmm3,%xmm7,%xmm7
-	movl	%edx,%ebp
+	addl	48(%esp),%eax
+	andl	%edx,%esi
 	xorl	%edi,%edx
-	addl	48(%esp),%eax
-	andl	%edi,%ebp
+	shrdl	$7,%ecx,%ecx
 	vpxor	%xmm0,%xmm7,%xmm7
 	vmovdqa	%xmm3,96(%esp)
-	andl	%edx,%esi
-	shrdl	$7,%ecx,%ecx
+	movl	%ebx,%ebp
+	xorl	%edx,%esi
 	vmovdqa	144(%esp),%xmm3
 	vpaddd	%xmm6,%xmm2,%xmm2
-	addl	%ebp,%eax
-	movl	%ebx,%ebp
-	vpxor	%xmm1,%xmm7,%xmm7
 	shldl	$5,%ebx,%ebx
 	addl	%esi,%eax
-	xorl	%edi,%edx
+	vpxor	%xmm1,%xmm7,%xmm7
+	xorl	%ecx,%ebp
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
+	addl	52(%esp),%edi
 	vpsrld	$30,%xmm7,%xmm1
 	vmovdqa	%xmm2,32(%esp)
-	movl	%ecx,%esi
+	andl	%ecx,%ebp
 	xorl	%edx,%ecx
-	addl	52(%esp),%edi
-	andl	%edx,%esi
-	vpslld	$2,%xmm7,%xmm7
-	andl	%ecx,%ebp
 	shrdl	$7,%ebx,%ebx
-	addl	%esi,%edi
 	movl	%eax,%esi
+	vpslld	$2,%xmm7,%xmm7
+	xorl	%ecx,%ebp
 	shldl	$5,%eax,%eax
 	addl	%ebp,%edi
-	xorl	%edx,%ecx
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
+	addl	56(%esp),%edx
+	andl	%ebx,%esi
 	vpor	%xmm1,%xmm7,%xmm7
-	movl	%ebx,%ebp
 	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
 	vmovdqa	64(%esp),%xmm1
-	addl	56(%esp),%edx
-	andl	%ecx,%ebp
-	andl	%ebx,%esi
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
 	movl	%edi,%ebp
+	xorl	%ebx,%esi
 	shldl	$5,%edi,%edi
 	addl	%esi,%edx
-	xorl	%ecx,%ebx
+	xorl	%eax,%ebp
+	xorl	%ebx,%eax
 	addl	%edi,%edx
-	movl	%eax,%esi
-	xorl	%ebx,%eax
 	addl	60(%esp),%ecx
-	andl	%ebx,%esi
 	andl	%eax,%ebp
+	xorl	%ebx,%eax
 	shrdl	$7,%edi,%edi
-	addl	%esi,%ecx
 	movl	%edx,%esi
+	xorl	%eax,%ebp
 	shldl	$5,%edx,%edx
 	addl	%ebp,%ecx
-	xorl	%ebx,%eax
+	xorl	%edi,%esi
+	xorl	%eax,%edi
 	addl	%edx,%ecx
 	vpalignr	$8,%xmm6,%xmm7,%xmm2
 	vpxor	%xmm4,%xmm0,%xmm0
-	movl	%edi,%ebp
+	addl	(%esp),%ebx
+	andl	%edi,%esi
 	xorl	%eax,%edi
-	addl	(%esp),%ebx
-	andl	%eax,%ebp
+	shrdl	$7,%edx,%edx
 	vpxor	%xmm1,%xmm0,%xmm0
 	vmovdqa	%xmm4,64(%esp)
-	andl	%edi,%esi
-	shrdl	$7,%edx,%edx
+	movl	%ecx,%ebp
+	xorl	%edi,%esi
 	vmovdqa	%xmm3,%xmm4
 	vpaddd	%xmm7,%xmm3,%xmm3
-	addl	%ebp,%ebx
-	movl	%ecx,%ebp
-	vpxor	%xmm2,%xmm0,%xmm0
 	shldl	$5,%ecx,%ecx
 	addl	%esi,%ebx
-	xorl	%eax,%edi
+	vpxor	%xmm2,%xmm0,%xmm0
+	xorl	%edx,%ebp
+	xorl	%edi,%edx
 	addl	%ecx,%ebx
+	addl	4(%esp),%eax
 	vpsrld	$30,%xmm0,%xmm2
 	vmovdqa	%xmm3,48(%esp)
-	movl	%edx,%esi
+	andl	%edx,%ebp
 	xorl	%edi,%edx
-	addl	4(%esp),%eax
-	andl	%edi,%esi
-	vpslld	$2,%xmm0,%xmm0
-	andl	%edx,%ebp
 	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	movl	%ebx,%esi
+	vpslld	$2,%xmm0,%xmm0
+	xorl	%edx,%ebp
 	shldl	$5,%ebx,%ebx
 	addl	%ebp,%eax
-	xorl	%edi,%edx
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
+	addl	8(%esp),%edi
+	andl	%ecx,%esi
 	vpor	%xmm2,%xmm0,%xmm0
-	movl	%ecx,%ebp
 	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
 	vmovdqa	80(%esp),%xmm2
-	addl	8(%esp),%edi
-	andl	%edx,%ebp
-	andl	%ecx,%esi
-	shrdl	$7,%ebx,%ebx
-	addl	%ebp,%edi
 	movl	%eax,%ebp
+	xorl	%ecx,%esi
 	shldl	$5,%eax,%eax
 	addl	%esi,%edi
-	xorl	%edx,%ecx
+	xorl	%ebx,%ebp
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
-	movl	%ebx,%esi
-	xorl	%ecx,%ebx
 	addl	12(%esp),%edx
-	andl	%ecx,%esi
 	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
 	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	movl	%edi,%esi
+	xorl	%ebx,%ebp
 	shldl	$5,%edi,%edi
 	addl	%ebp,%edx
-	xorl	%ecx,%ebx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
 	addl	%edi,%edx
 	vpalignr	$8,%xmm7,%xmm0,%xmm3
 	vpxor	%xmm5,%xmm1,%xmm1
-	movl	%eax,%ebp
+	addl	16(%esp),%ecx
+	andl	%eax,%esi
 	xorl	%ebx,%eax
-	addl	16(%esp),%ecx
-	andl	%ebx,%ebp
+	shrdl	$7,%edi,%edi
 	vpxor	%xmm2,%xmm1,%xmm1
 	vmovdqa	%xmm5,80(%esp)
-	andl	%eax,%esi
-	shrdl	$7,%edi,%edi
+	movl	%edx,%ebp
+	xorl	%eax,%esi
 	vmovdqa	%xmm4,%xmm5
 	vpaddd	%xmm0,%xmm4,%xmm4
-	addl	%ebp,%ecx
-	movl	%edx,%ebp
-	vpxor	%xmm3,%xmm1,%xmm1
 	shldl	$5,%edx,%edx
 	addl	%esi,%ecx
-	xorl	%ebx,%eax
+	vpxor	%xmm3,%xmm1,%xmm1
+	xorl	%edi,%ebp
+	xorl	%eax,%edi
 	addl	%edx,%ecx
+	addl	20(%esp),%ebx
 	vpsrld	$30,%xmm1,%xmm3
 	vmovdqa	%xmm4,(%esp)
-	movl	%edi,%esi
+	andl	%edi,%ebp
 	xorl	%eax,%edi
-	addl	20(%esp),%ebx
-	andl	%eax,%esi
-	vpslld	$2,%xmm1,%xmm1
-	andl	%edi,%ebp
 	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	movl	%ecx,%esi
+	vpslld	$2,%xmm1,%xmm1
+	xorl	%edi,%ebp
 	shldl	$5,%ecx,%ecx
 	addl	%ebp,%ebx
-	xorl	%eax,%edi
+	xorl	%edx,%esi
+	xorl	%edi,%edx
 	addl	%ecx,%ebx
+	addl	24(%esp),%eax
+	andl	%edx,%esi
 	vpor	%xmm3,%xmm1,%xmm1
-	movl	%edx,%ebp
 	xorl	%edi,%edx
+	shrdl	$7,%ecx,%ecx
 	vmovdqa	96(%esp),%xmm3
-	addl	24(%esp),%eax
-	andl	%edi,%ebp
-	andl	%edx,%esi
-	shrdl	$7,%ecx,%ecx
-	addl	%ebp,%eax
 	movl	%ebx,%ebp
+	xorl	%edx,%esi
 	shldl	$5,%ebx,%ebx
 	addl	%esi,%eax
-	xorl	%edi,%edx
+	xorl	%ecx,%ebp
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	movl	%ecx,%esi
-	xorl	%edx,%ecx
 	addl	28(%esp),%edi
-	andl	%edx,%esi
 	andl	%ecx,%ebp
+	xorl	%edx,%ecx
 	shrdl	$7,%ebx,%ebx
-	addl	%esi,%edi
 	movl	%eax,%esi
+	xorl	%ecx,%ebp
 	shldl	$5,%eax,%eax
 	addl	%ebp,%edi
-	xorl	%edx,%ecx
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
 	vpalignr	$8,%xmm0,%xmm1,%xmm4
 	vpxor	%xmm6,%xmm2,%xmm2
-	movl	%ebx,%ebp
+	addl	32(%esp),%edx
+	andl	%ebx,%esi
 	xorl	%ecx,%ebx
-	addl	32(%esp),%edx
-	andl	%ecx,%ebp
+	shrdl	$7,%eax,%eax
 	vpxor	%xmm3,%xmm2,%xmm2
 	vmovdqa	%xmm6,96(%esp)
-	andl	%ebx,%esi
-	shrdl	$7,%eax,%eax
+	movl	%edi,%ebp
+	xorl	%ebx,%esi
 	vmovdqa	%xmm5,%xmm6
 	vpaddd	%xmm1,%xmm5,%xmm5
-	addl	%ebp,%edx
-	movl	%edi,%ebp
-	vpxor	%xmm4,%xmm2,%xmm2
 	shldl	$5,%edi,%edi
 	addl	%esi,%edx
-	xorl	%ecx,%ebx
+	vpxor	%xmm4,%xmm2,%xmm2
+	xorl	%eax,%ebp
+	xorl	%ebx,%eax
 	addl	%edi,%edx
+	addl	36(%esp),%ecx
 	vpsrld	$30,%xmm2,%xmm4
 	vmovdqa	%xmm5,16(%esp)
-	movl	%eax,%esi
+	andl	%eax,%ebp
 	xorl	%ebx,%eax
-	addl	36(%esp),%ecx
-	andl	%ebx,%esi
-	vpslld	$2,%xmm2,%xmm2
-	andl	%eax,%ebp
 	shrdl	$7,%edi,%edi
-	addl	%esi,%ecx
 	movl	%edx,%esi
+	vpslld	$2,%xmm2,%xmm2
+	xorl	%eax,%ebp
 	shldl	$5,%edx,%edx
 	addl	%ebp,%ecx
-	xorl	%ebx,%eax
+	xorl	%edi,%esi
+	xorl	%eax,%edi
 	addl	%edx,%ecx
+	addl	40(%esp),%ebx
+	andl	%edi,%esi
 	vpor	%xmm4,%xmm2,%xmm2
-	movl	%edi,%ebp
 	xorl	%eax,%edi
+	shrdl	$7,%edx,%edx
 	vmovdqa	64(%esp),%xmm4
-	addl	40(%esp),%ebx
-	andl	%eax,%ebp
-	andl	%edi,%esi
-	shrdl	$7,%edx,%edx
-	addl	%ebp,%ebx
 	movl	%ecx,%ebp
+	xorl	%edi,%esi
 	shldl	$5,%ecx,%ecx
 	addl	%esi,%ebx
-	xorl	%eax,%edi
+	xorl	%edx,%ebp
+	xorl	%edi,%edx
 	addl	%ecx,%ebx
-	movl	%edx,%esi
-	xorl	%edi,%edx
 	addl	44(%esp),%eax
-	andl	%edi,%esi
 	andl	%edx,%ebp
+	xorl	%edi,%edx
 	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	movl	%ebx,%esi
+	xorl	%edx,%ebp
 	shldl	$5,%ebx,%ebx
 	addl	%ebp,%eax
-	xorl	%edi,%edx
+	xorl	%edx,%esi
 	addl	%ebx,%eax
 	vpalignr	$8,%xmm1,%xmm2,%xmm5
 	vpxor	%xmm7,%xmm3,%xmm3
 	addl	48(%esp),%edi
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%ebp
 	shldl	$5,%eax,%eax
 	vpxor	%xmm4,%xmm3,%xmm3
 	vmovdqa	%xmm7,64(%esp)
-	xorl	%ecx,%esi
-	addl	%eax,%edi
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
 	vmovdqa	%xmm6,%xmm7
 	vpaddd	%xmm2,%xmm6,%xmm6
 	shrdl	$7,%ebx,%ebx
-	addl	%esi,%edi
+	addl	%eax,%edi
 	vpxor	%xmm5,%xmm3,%xmm3
 	addl	52(%esp),%edx
-	xorl	%ecx,%ebp
+	xorl	%ebx,%ebp
 	movl	%edi,%esi
 	shldl	$5,%edi,%edi
 	vpsrld	$30,%xmm3,%xmm5
 	vmovdqa	%xmm6,32(%esp)
-	xorl	%ebx,%ebp
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
 	addl	%edi,%edx
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
 	vpslld	$2,%xmm3,%xmm3
 	addl	56(%esp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	movl	%edx,%ebp
 	shldl	$5,%edx,%edx
-	xorl	%eax,%esi
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	shrdl	$7,%edi,%edi
 	addl	%edx,%ecx
-	shrdl	$7,%edi,%edi
-	addl	%esi,%ecx
 	vpor	%xmm5,%xmm3,%xmm3
 	addl	60(%esp),%ebx
-	xorl	%eax,%ebp
+	xorl	%edi,%ebp
 	movl	%ecx,%esi
 	shldl	$5,%ecx,%ecx
-	xorl	%edi,%ebp
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%ebp,%ebx
 	addl	(%esp),%eax
 	vpaddd	%xmm3,%xmm7,%xmm7
-	xorl	%edi,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%ebp
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%esi
+	addl	%esi,%eax
 	vmovdqa	%xmm7,48(%esp)
+	xorl	%edx,%ebp
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	addl	4(%esp),%edi
-	xorl	%edx,%ebp
+	xorl	%ecx,%ebp
 	movl	%eax,%esi
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%ebp
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%edi
-	shrdl	$7,%ebx,%ebx
-	addl	%ebp,%edi
 	addl	8(%esp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	movl	%edi,%ebp
 	shldl	$5,%edi,%edi
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	shrdl	$7,%eax,%eax
 	addl	%edi,%edx
-	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	addl	12(%esp),%ecx
-	xorl	%ebx,%ebp
+	xorl	%eax,%ebp
 	movl	%edx,%esi
 	shldl	$5,%edx,%edx
-	xorl	%eax,%ebp
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%edi,%edi
 	addl	%edx,%ecx
-	shrdl	$7,%edi,%edi
-	addl	%ebp,%ecx
 	movl	196(%esp),%ebp
 	cmpl	200(%esp),%ebp
-	je	.L008done
+	je	.L010done
 	vmovdqa	160(%esp),%xmm7
 	vmovdqa	176(%esp),%xmm6
 	vmovdqu	(%ebp),%xmm0
@@ -3587,110 +3723,109 @@
 	movl	%ebp,196(%esp)
 	vmovdqa	%xmm7,96(%esp)
 	addl	16(%esp),%ebx
-	xorl	%eax,%esi
+	xorl	%edi,%esi
 	vpshufb	%xmm6,%xmm1,%xmm1
 	movl	%ecx,%ebp
 	shldl	$5,%ecx,%ecx
 	vpaddd	%xmm7,%xmm0,%xmm4
-	xorl	%edi,%esi
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	vmovdqa	%xmm4,(%esp)
 	addl	20(%esp),%eax
-	xorl	%edi,%ebp
+	xorl	%edx,%ebp
 	movl	%ebx,%esi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%ebp
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%ebp,%eax
 	addl	24(%esp),%edi
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%ebp
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%esi
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%edi
-	shrdl	$7,%ebx,%ebx
-	addl	%esi,%edi
 	addl	28(%esp),%edx
-	xorl	%ecx,%ebp
+	xorl	%ebx,%ebp
 	movl	%edi,%esi
 	shldl	$5,%edi,%edi
-	xorl	%ebx,%ebp
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
 	addl	%edi,%edx
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
 	addl	32(%esp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	vpshufb	%xmm6,%xmm2,%xmm2
 	movl	%edx,%ebp
 	shldl	$5,%edx,%edx
 	vpaddd	%xmm7,%xmm1,%xmm5
-	xorl	%eax,%esi
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	shrdl	$7,%edi,%edi
 	addl	%edx,%ecx
-	shrdl	$7,%edi,%edi
-	addl	%esi,%ecx
 	vmovdqa	%xmm5,16(%esp)
 	addl	36(%esp),%ebx
-	xorl	%eax,%ebp
+	xorl	%edi,%ebp
 	movl	%ecx,%esi
 	shldl	$5,%ecx,%ecx
-	xorl	%edi,%ebp
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%ebp,%ebx
 	addl	40(%esp),%eax
-	xorl	%edi,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%ebp
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%esi
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	addl	44(%esp),%edi
-	xorl	%edx,%ebp
+	xorl	%ecx,%ebp
 	movl	%eax,%esi
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%ebp
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%edi
-	shrdl	$7,%ebx,%ebx
-	addl	%ebp,%edi
 	addl	48(%esp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	vpshufb	%xmm6,%xmm3,%xmm3
 	movl	%edi,%ebp
 	shldl	$5,%edi,%edi
 	vpaddd	%xmm7,%xmm2,%xmm6
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	shrdl	$7,%eax,%eax
 	addl	%edi,%edx
-	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	vmovdqa	%xmm6,32(%esp)
 	addl	52(%esp),%ecx
-	xorl	%ebx,%ebp
+	xorl	%eax,%ebp
 	movl	%edx,%esi
 	shldl	$5,%edx,%edx
-	xorl	%eax,%ebp
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%edi,%edi
 	addl	%edx,%ecx
-	shrdl	$7,%edi,%edi
-	addl	%ebp,%ecx
 	addl	56(%esp),%ebx
-	xorl	%eax,%esi
+	xorl	%edi,%esi
 	movl	%ecx,%ebp
 	shldl	$5,%ecx,%ecx
-	xorl	%edi,%esi
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	addl	60(%esp),%eax
-	xorl	%edi,%ebp
+	xorl	%edx,%ebp
 	movl	%ebx,%esi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%ebp
+	addl	%ebp,%eax
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%ebp,%eax
 	movl	192(%esp),%ebp
 	addl	(%ebp),%eax
 	addl	4(%ebp),%esi
@@ -3699,109 +3834,112 @@
 	addl	12(%ebp),%edx
 	movl	%esi,4(%ebp)
 	addl	16(%ebp),%edi
+	movl	%ecx,%ebx
 	movl	%ecx,8(%ebp)
-	movl	%esi,%ebx
+	xorl	%edx,%ebx
 	movl	%edx,12(%ebp)
 	movl	%edi,16(%ebp)
-	jmp	.L007loop
+	movl	%esi,%ebp
+	andl	%ebx,%esi
+	movl	%ebp,%ebx
+	jmp	.L009loop
 .align	16
-.L008done:
+.L010done:
 	addl	16(%esp),%ebx
-	xorl	%eax,%esi
+	xorl	%edi,%esi
 	movl	%ecx,%ebp
 	shldl	$5,%ecx,%ecx
-	xorl	%edi,%esi
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	addl	20(%esp),%eax
-	xorl	%edi,%ebp
+	xorl	%edx,%ebp
 	movl	%ebx,%esi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%ebp
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%ebp,%eax
 	addl	24(%esp),%edi
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%ebp
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%esi
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%edi
-	shrdl	$7,%ebx,%ebx
-	addl	%esi,%edi
 	addl	28(%esp),%edx
-	xorl	%ecx,%ebp
+	xorl	%ebx,%ebp
 	movl	%edi,%esi
 	shldl	$5,%edi,%edi
-	xorl	%ebx,%ebp
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
 	addl	%edi,%edx
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
 	addl	32(%esp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	movl	%edx,%ebp
 	shldl	$5,%edx,%edx
-	xorl	%eax,%esi
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	shrdl	$7,%edi,%edi
 	addl	%edx,%ecx
-	shrdl	$7,%edi,%edi
-	addl	%esi,%ecx
 	addl	36(%esp),%ebx
-	xorl	%eax,%ebp
+	xorl	%edi,%ebp
 	movl	%ecx,%esi
 	shldl	$5,%ecx,%ecx
-	xorl	%edi,%ebp
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%ebp,%ebx
 	addl	40(%esp),%eax
-	xorl	%edi,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%ebp
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%esi
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	addl	44(%esp),%edi
-	xorl	%edx,%ebp
+	xorl	%ecx,%ebp
 	movl	%eax,%esi
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%ebp
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%edi
-	shrdl	$7,%ebx,%ebx
-	addl	%ebp,%edi
 	addl	48(%esp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	movl	%edi,%ebp
 	shldl	$5,%edi,%edi
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	shrdl	$7,%eax,%eax
 	addl	%edi,%edx
-	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	addl	52(%esp),%ecx
-	xorl	%ebx,%ebp
+	xorl	%eax,%ebp
 	movl	%edx,%esi
 	shldl	$5,%edx,%edx
-	xorl	%eax,%ebp
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%edi,%edi
 	addl	%edx,%ecx
-	shrdl	$7,%edi,%edi
-	addl	%ebp,%ecx
 	addl	56(%esp),%ebx
-	xorl	%eax,%esi
+	xorl	%edi,%esi
 	movl	%ecx,%ebp
 	shldl	$5,%ecx,%ecx
-	xorl	%edi,%esi
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	addl	60(%esp),%eax
-	xorl	%edi,%ebp
+	xorl	%edx,%ebp
 	movl	%ebx,%esi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%ebp
+	addl	%ebp,%eax
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%ebp,%eax
 	vzeroall
 	movl	192(%esp),%ebp
 	addl	(%ebp),%eax
@@ -3828,11 +3966,12 @@
 .long	2400959708,2400959708,2400959708,2400959708
 .long	3395469782,3395469782,3395469782,3395469782
 .long	66051,67438087,134810123,202182159
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
 .byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115
 .byte	102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82
 .byte	89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
 .byte	114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.comm	OPENSSL_ia32cap_P,8,4
+.comm	OPENSSL_ia32cap_P,16,4
 #else
 .file	"sha1-586.S"
 .text
@@ -3854,8 +3993,11 @@
 	movl	4(%esi),%edx
 	testl	$512,%edx
 	jz	.L001x86
+	movl	8(%esi),%ecx
 	testl	$16777216,%eax
 	jz	.L001x86
+	testl	$536870912,%ecx
+	jnz	.Lshaext_shortcut
 	andl	$268435456,%edx
 	andl	$1073741824,%eax
 	orl	%edx,%eax
@@ -5229,9 +5371,9 @@
 	popl	%ebp
 	ret
 .size	sha1_block_data_order,.-.L_sha1_block_data_order_begin
-.type	_sha1_block_data_order_ssse3,@function
+.type	_sha1_block_data_order_shaext,@function
 .align	16
-_sha1_block_data_order_ssse3:
+_sha1_block_data_order_shaext:
 	pushl	%ebp
 	pushl	%ebx
 	pushl	%esi
@@ -5240,6 +5382,176 @@
 .L003pic_point:
 	popl	%ebp
 	leal	.LK_XX_XX-.L003pic_point(%ebp),%ebp
+.Lshaext_shortcut:
+	movl	20(%esp),%edi
+	movl	%esp,%ebx
+	movl	24(%esp),%esi
+	movl	28(%esp),%ecx
+	subl	$32,%esp
+	movdqu	(%edi),%xmm0
+	movd	16(%edi),%xmm1
+	andl	$-32,%esp
+	movdqa	80(%ebp),%xmm3
+	movdqu	(%esi),%xmm4
+	pshufd	$27,%xmm0,%xmm0
+	movdqu	16(%esi),%xmm5
+	pshufd	$27,%xmm1,%xmm1
+	movdqu	32(%esi),%xmm6
+.byte	102,15,56,0,227
+	movdqu	48(%esi),%xmm7
+.byte	102,15,56,0,235
+.byte	102,15,56,0,243
+.byte	102,15,56,0,251
+	jmp	.L004loop_shaext
+.align	16
+.L004loop_shaext:
+	decl	%ecx
+	leal	64(%esi),%eax
+	movdqa	%xmm1,(%esp)
+	paddd	%xmm4,%xmm1
+	cmovnel	%eax,%esi
+	movdqa	%xmm0,16(%esp)
+.byte	15,56,201,229
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,0
+.byte	15,56,200,213
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+.byte	15,56,202,231
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,0
+.byte	15,56,200,206
+	pxor	%xmm7,%xmm5
+.byte	15,56,202,236
+.byte	15,56,201,247
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,0
+.byte	15,56,200,215
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+.byte	15,56,202,245
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,0
+.byte	15,56,200,204
+	pxor	%xmm5,%xmm7
+.byte	15,56,202,254
+.byte	15,56,201,229
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,0
+.byte	15,56,200,213
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+.byte	15,56,202,231
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,1
+.byte	15,56,200,206
+	pxor	%xmm7,%xmm5
+.byte	15,56,202,236
+.byte	15,56,201,247
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,1
+.byte	15,56,200,215
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+.byte	15,56,202,245
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,1
+.byte	15,56,200,204
+	pxor	%xmm5,%xmm7
+.byte	15,56,202,254
+.byte	15,56,201,229
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,1
+.byte	15,56,200,213
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+.byte	15,56,202,231
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,1
+.byte	15,56,200,206
+	pxor	%xmm7,%xmm5
+.byte	15,56,202,236
+.byte	15,56,201,247
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,2
+.byte	15,56,200,215
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+.byte	15,56,202,245
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,2
+.byte	15,56,200,204
+	pxor	%xmm5,%xmm7
+.byte	15,56,202,254
+.byte	15,56,201,229
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,2
+.byte	15,56,200,213
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+.byte	15,56,202,231
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,2
+.byte	15,56,200,206
+	pxor	%xmm7,%xmm5
+.byte	15,56,202,236
+.byte	15,56,201,247
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,2
+.byte	15,56,200,215
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+.byte	15,56,202,245
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,3
+.byte	15,56,200,204
+	pxor	%xmm5,%xmm7
+.byte	15,56,202,254
+	movdqu	(%esi),%xmm4
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,3
+.byte	15,56,200,213
+	movdqu	16(%esi),%xmm5
+.byte	102,15,56,0,227
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,3
+.byte	15,56,200,206
+	movdqu	32(%esi),%xmm6
+.byte	102,15,56,0,235
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,3
+.byte	15,56,200,215
+	movdqu	48(%esi),%xmm7
+.byte	102,15,56,0,243
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,3
+	movdqa	(%esp),%xmm2
+.byte	102,15,56,0,251
+.byte	15,56,200,202
+	paddd	16(%esp),%xmm0
+	jnz	.L004loop_shaext
+	pshufd	$27,%xmm0,%xmm0
+	pshufd	$27,%xmm1,%xmm1
+	movdqu	%xmm0,(%edi)
+	movd	%xmm1,16(%edi)
+	movl	%ebx,%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	_sha1_block_data_order_shaext,.-_sha1_block_data_order_shaext
+.type	_sha1_block_data_order_ssse3,@function
+.align	16
+_sha1_block_data_order_ssse3:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	call	.L005pic_point
+.L005pic_point:
+	popl	%ebp
+	leal	.LK_XX_XX-.L005pic_point(%ebp),%ebp
 .Lssse3_shortcut:
 	movdqa	(%ebp),%xmm7
 	movdqa	16(%ebp),%xmm0
@@ -5287,936 +5599,917 @@
 	movdqa	%xmm1,16(%esp)
 	psubd	%xmm7,%xmm1
 	movdqa	%xmm2,32(%esp)
+	movl	%ecx,%ebp
 	psubd	%xmm7,%xmm2
-	movdqa	%xmm1,%xmm4
-	jmp	.L004loop
+	xorl	%edx,%ebp
+	pshufd	$238,%xmm0,%xmm4
+	andl	%ebp,%esi
+	jmp	.L006loop
 .align	16
-.L004loop:
+.L006loop:
+	rorl	$2,%ebx
+	xorl	%edx,%esi
+	movl	%eax,%ebp
+	punpcklqdq	%xmm1,%xmm4
+	movdqa	%xmm3,%xmm6
 	addl	(%esp),%edi
-	xorl	%edx,%ecx
-.byte	102,15,58,15,224,8
-	movdqa	%xmm3,%xmm6
-	movl	%eax,%ebp
-	roll	$5,%eax
+	xorl	%ecx,%ebx
 	paddd	%xmm3,%xmm7
 	movdqa	%xmm0,64(%esp)
-	andl	%ecx,%esi
-	xorl	%edx,%ecx
+	roll	$5,%eax
+	addl	%esi,%edi
 	psrldq	$4,%xmm6
-	xorl	%edx,%esi
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	pxor	%xmm0,%xmm4
 	addl	%eax,%edi
-	pxor	%xmm0,%xmm4
-	rorl	$2,%ebx
-	addl	%esi,%edi
+	rorl	$7,%eax
 	pxor	%xmm2,%xmm6
+	xorl	%ecx,%ebp
+	movl	%edi,%esi
 	addl	4(%esp),%edx
-	xorl	%ecx,%ebx
-	movl	%edi,%esi
+	pxor	%xmm6,%xmm4
+	xorl	%ebx,%eax
 	roll	$5,%edi
-	pxor	%xmm6,%xmm4
-	andl	%ebx,%ebp
-	xorl	%ecx,%ebx
 	movdqa	%xmm7,48(%esp)
-	xorl	%ecx,%ebp
+	addl	%ebp,%edx
+	andl	%eax,%esi
+	movdqa	%xmm4,%xmm0
+	xorl	%ebx,%eax
 	addl	%edi,%edx
-	movdqa	%xmm4,%xmm0
+	rorl	$7,%edi
 	movdqa	%xmm4,%xmm6
-	rorl	$7,%eax
-	addl	%ebp,%edx
-	addl	8(%esp),%ecx
-	xorl	%ebx,%eax
+	xorl	%ebx,%esi
 	pslldq	$12,%xmm0
 	paddd	%xmm4,%xmm4
 	movl	%edx,%ebp
+	addl	8(%esp),%ecx
+	psrld	$31,%xmm6
+	xorl	%eax,%edi
 	roll	$5,%edx
-	andl	%eax,%esi
-	xorl	%ebx,%eax
-	psrld	$31,%xmm6
-	xorl	%ebx,%esi
-	addl	%edx,%ecx
 	movdqa	%xmm0,%xmm7
-	rorl	$7,%edi
 	addl	%esi,%ecx
+	andl	%edi,%ebp
+	xorl	%eax,%edi
 	psrld	$30,%xmm0
+	addl	%edx,%ecx
+	rorl	$7,%edx
 	por	%xmm6,%xmm4
+	xorl	%eax,%ebp
+	movl	%ecx,%esi
 	addl	12(%esp),%ebx
-	xorl	%eax,%edi
-	movl	%ecx,%esi
+	pslld	$2,%xmm7
+	xorl	%edi,%edx
 	roll	$5,%ecx
-	pslld	$2,%xmm7
 	pxor	%xmm0,%xmm4
-	andl	%edi,%ebp
-	xorl	%eax,%edi
 	movdqa	96(%esp),%xmm0
-	xorl	%eax,%ebp
-	addl	%ecx,%ebx
+	addl	%ebp,%ebx
+	andl	%edx,%esi
 	pxor	%xmm7,%xmm4
-	movdqa	%xmm2,%xmm5
-	rorl	$7,%edx
-	addl	%ebp,%ebx
-	addl	16(%esp),%eax
+	pshufd	$238,%xmm1,%xmm5
 	xorl	%edi,%edx
-.byte	102,15,58,15,233,8
+	addl	%ecx,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	movl	%ebx,%ebp
+	punpcklqdq	%xmm2,%xmm5
 	movdqa	%xmm4,%xmm7
-	movl	%ebx,%ebp
-	roll	$5,%ebx
+	addl	16(%esp),%eax
+	xorl	%edx,%ecx
 	paddd	%xmm4,%xmm0
 	movdqa	%xmm1,80(%esp)
-	andl	%edx,%esi
-	xorl	%edi,%edx
+	roll	$5,%ebx
+	addl	%esi,%eax
 	psrldq	$4,%xmm7
-	xorl	%edi,%esi
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
+	pxor	%xmm1,%xmm5
 	addl	%ebx,%eax
-	pxor	%xmm1,%xmm5
-	rorl	$7,%ecx
-	addl	%esi,%eax
+	rorl	$7,%ebx
 	pxor	%xmm3,%xmm7
+	xorl	%edx,%ebp
+	movl	%eax,%esi
 	addl	20(%esp),%edi
-	xorl	%edx,%ecx
-	movl	%eax,%esi
+	pxor	%xmm7,%xmm5
+	xorl	%ecx,%ebx
 	roll	$5,%eax
-	pxor	%xmm7,%xmm5
-	andl	%ecx,%ebp
-	xorl	%edx,%ecx
 	movdqa	%xmm0,(%esp)
-	xorl	%edx,%ebp
+	addl	%ebp,%edi
+	andl	%ebx,%esi
+	movdqa	%xmm5,%xmm1
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
-	movdqa	%xmm5,%xmm1
+	rorl	$7,%eax
 	movdqa	%xmm5,%xmm7
-	rorl	$7,%ebx
-	addl	%ebp,%edi
-	addl	24(%esp),%edx
-	xorl	%ecx,%ebx
+	xorl	%ecx,%esi
 	pslldq	$12,%xmm1
 	paddd	%xmm5,%xmm5
 	movl	%edi,%ebp
+	addl	24(%esp),%edx
+	psrld	$31,%xmm7
+	xorl	%ebx,%eax
 	roll	$5,%edi
-	andl	%ebx,%esi
-	xorl	%ecx,%ebx
-	psrld	$31,%xmm7
-	xorl	%ecx,%esi
-	addl	%edi,%edx
 	movdqa	%xmm1,%xmm0
-	rorl	$7,%eax
 	addl	%esi,%edx
+	andl	%eax,%ebp
+	xorl	%ebx,%eax
 	psrld	$30,%xmm1
+	addl	%edi,%edx
+	rorl	$7,%edi
 	por	%xmm7,%xmm5
+	xorl	%ebx,%ebp
+	movl	%edx,%esi
 	addl	28(%esp),%ecx
-	xorl	%ebx,%eax
-	movl	%edx,%esi
+	pslld	$2,%xmm0
+	xorl	%eax,%edi
 	roll	$5,%edx
-	pslld	$2,%xmm0
 	pxor	%xmm1,%xmm5
-	andl	%eax,%ebp
-	xorl	%ebx,%eax
 	movdqa	112(%esp),%xmm1
-	xorl	%ebx,%ebp
-	addl	%edx,%ecx
+	addl	%ebp,%ecx
+	andl	%edi,%esi
 	pxor	%xmm0,%xmm5
-	movdqa	%xmm3,%xmm6
-	rorl	$7,%edi
-	addl	%ebp,%ecx
-	addl	32(%esp),%ebx
+	pshufd	$238,%xmm2,%xmm6
 	xorl	%eax,%edi
-.byte	102,15,58,15,242,8
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	xorl	%eax,%esi
+	movl	%ecx,%ebp
+	punpcklqdq	%xmm3,%xmm6
 	movdqa	%xmm5,%xmm0
-	movl	%ecx,%ebp
-	roll	$5,%ecx
+	addl	32(%esp),%ebx
+	xorl	%edi,%edx
 	paddd	%xmm5,%xmm1
 	movdqa	%xmm2,96(%esp)
-	andl	%edi,%esi
-	xorl	%eax,%edi
+	roll	$5,%ecx
+	addl	%esi,%ebx
 	psrldq	$4,%xmm0
-	xorl	%eax,%esi
+	andl	%edx,%ebp
+	xorl	%edi,%edx
+	pxor	%xmm2,%xmm6
 	addl	%ecx,%ebx
-	pxor	%xmm2,%xmm6
-	rorl	$7,%edx
-	addl	%esi,%ebx
+	rorl	$7,%ecx
 	pxor	%xmm4,%xmm0
+	xorl	%edi,%ebp
+	movl	%ebx,%esi
 	addl	36(%esp),%eax
-	xorl	%edi,%edx
-	movl	%ebx,%esi
+	pxor	%xmm0,%xmm6
+	xorl	%edx,%ecx
 	roll	$5,%ebx
-	pxor	%xmm0,%xmm6
-	andl	%edx,%ebp
-	xorl	%edi,%edx
 	movdqa	%xmm1,16(%esp)
-	xorl	%edi,%ebp
+	addl	%ebp,%eax
+	andl	%ecx,%esi
+	movdqa	%xmm6,%xmm2
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	movdqa	%xmm6,%xmm2
+	rorl	$7,%ebx
 	movdqa	%xmm6,%xmm0
-	rorl	$7,%ecx
-	addl	%ebp,%eax
-	addl	40(%esp),%edi
-	xorl	%edx,%ecx
+	xorl	%edx,%esi
 	pslldq	$12,%xmm2
 	paddd	%xmm6,%xmm6
 	movl	%eax,%ebp
+	addl	40(%esp),%edi
+	psrld	$31,%xmm0
+	xorl	%ecx,%ebx
 	roll	$5,%eax
-	andl	%ecx,%esi
-	xorl	%edx,%ecx
-	psrld	$31,%xmm0
-	xorl	%edx,%esi
-	addl	%eax,%edi
 	movdqa	%xmm2,%xmm1
-	rorl	$7,%ebx
 	addl	%esi,%edi
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
 	psrld	$30,%xmm2
+	addl	%eax,%edi
+	rorl	$7,%eax
 	por	%xmm0,%xmm6
-	addl	44(%esp),%edx
-	xorl	%ecx,%ebx
+	xorl	%ecx,%ebp
 	movdqa	64(%esp),%xmm0
 	movl	%edi,%esi
+	addl	44(%esp),%edx
+	pslld	$2,%xmm1
+	xorl	%ebx,%eax
 	roll	$5,%edi
-	pslld	$2,%xmm1
 	pxor	%xmm2,%xmm6
-	andl	%ebx,%ebp
-	xorl	%ecx,%ebx
 	movdqa	112(%esp),%xmm2
-	xorl	%ecx,%ebp
-	addl	%edi,%edx
+	addl	%ebp,%edx
+	andl	%eax,%esi
 	pxor	%xmm1,%xmm6
-	movdqa	%xmm4,%xmm7
-	rorl	$7,%eax
-	addl	%ebp,%edx
-	addl	48(%esp),%ecx
+	pshufd	$238,%xmm3,%xmm7
 	xorl	%ebx,%eax
-.byte	102,15,58,15,251,8
+	addl	%edi,%edx
+	rorl	$7,%edi
+	xorl	%ebx,%esi
+	movl	%edx,%ebp
+	punpcklqdq	%xmm4,%xmm7
 	movdqa	%xmm6,%xmm1
-	movl	%edx,%ebp
-	roll	$5,%edx
+	addl	48(%esp),%ecx
+	xorl	%eax,%edi
 	paddd	%xmm6,%xmm2
 	movdqa	%xmm3,64(%esp)
-	andl	%eax,%esi
-	xorl	%ebx,%eax
+	roll	$5,%edx
+	addl	%esi,%ecx
 	psrldq	$4,%xmm1
-	xorl	%ebx,%esi
+	andl	%edi,%ebp
+	xorl	%eax,%edi
+	pxor	%xmm3,%xmm7
 	addl	%edx,%ecx
-	pxor	%xmm3,%xmm7
-	rorl	$7,%edi
-	addl	%esi,%ecx
+	rorl	$7,%edx
 	pxor	%xmm5,%xmm1
+	xorl	%eax,%ebp
+	movl	%ecx,%esi
 	addl	52(%esp),%ebx
-	xorl	%eax,%edi
-	movl	%ecx,%esi
+	pxor	%xmm1,%xmm7
+	xorl	%edi,%edx
 	roll	$5,%ecx
-	pxor	%xmm1,%xmm7
-	andl	%edi,%ebp
-	xorl	%eax,%edi
 	movdqa	%xmm2,32(%esp)
-	xorl	%eax,%ebp
+	addl	%ebp,%ebx
+	andl	%edx,%esi
+	movdqa	%xmm7,%xmm3
+	xorl	%edi,%edx
 	addl	%ecx,%ebx
-	movdqa	%xmm7,%xmm3
+	rorl	$7,%ecx
 	movdqa	%xmm7,%xmm1
-	rorl	$7,%edx
-	addl	%ebp,%ebx
-	addl	56(%esp),%eax
-	xorl	%edi,%edx
+	xorl	%edi,%esi
 	pslldq	$12,%xmm3
 	paddd	%xmm7,%xmm7
 	movl	%ebx,%ebp
+	addl	56(%esp),%eax
+	psrld	$31,%xmm1
+	xorl	%edx,%ecx
 	roll	$5,%ebx
-	andl	%edx,%esi
-	xorl	%edi,%edx
-	psrld	$31,%xmm1
-	xorl	%edi,%esi
-	addl	%ebx,%eax
 	movdqa	%xmm3,%xmm2
-	rorl	$7,%ecx
 	addl	%esi,%eax
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
 	psrld	$30,%xmm3
+	addl	%ebx,%eax
+	rorl	$7,%ebx
 	por	%xmm1,%xmm7
-	addl	60(%esp),%edi
-	xorl	%edx,%ecx
+	xorl	%edx,%ebp
 	movdqa	80(%esp),%xmm1
 	movl	%eax,%esi
+	addl	60(%esp),%edi
+	pslld	$2,%xmm2
+	xorl	%ecx,%ebx
 	roll	$5,%eax
-	pslld	$2,%xmm2
 	pxor	%xmm3,%xmm7
-	andl	%ecx,%ebp
-	xorl	%edx,%ecx
 	movdqa	112(%esp),%xmm3
-	xorl	%edx,%ebp
+	addl	%ebp,%edi
+	andl	%ebx,%esi
+	pxor	%xmm2,%xmm7
+	pshufd	$238,%xmm6,%xmm2
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
-	pxor	%xmm2,%xmm7
-	rorl	$7,%ebx
-	addl	%ebp,%edi
-	movdqa	%xmm7,%xmm2
-	addl	(%esp),%edx
+	rorl	$7,%eax
 	pxor	%xmm4,%xmm0
-.byte	102,15,58,15,214,8
-	xorl	%ecx,%ebx
+	punpcklqdq	%xmm7,%xmm2
+	xorl	%ecx,%esi
 	movl	%edi,%ebp
-	roll	$5,%edi
+	addl	(%esp),%edx
 	pxor	%xmm1,%xmm0
 	movdqa	%xmm4,80(%esp)
-	andl	%ebx,%esi
-	xorl	%ecx,%ebx
+	xorl	%ebx,%eax
+	roll	$5,%edi
 	movdqa	%xmm3,%xmm4
+	addl	%esi,%edx
 	paddd	%xmm7,%xmm3
-	xorl	%ecx,%esi
-	addl	%edi,%edx
+	andl	%eax,%ebp
 	pxor	%xmm2,%xmm0
-	rorl	$7,%eax
-	addl	%esi,%edx
-	addl	4(%esp),%ecx
 	xorl	%ebx,%eax
+	addl	%edi,%edx
+	rorl	$7,%edi
+	xorl	%ebx,%ebp
 	movdqa	%xmm0,%xmm2
 	movdqa	%xmm3,48(%esp)
 	movl	%edx,%esi
+	addl	4(%esp),%ecx
+	xorl	%eax,%edi
 	roll	$5,%edx
-	andl	%eax,%ebp
-	xorl	%ebx,%eax
 	pslld	$2,%xmm0
-	xorl	%ebx,%ebp
-	addl	%edx,%ecx
+	addl	%ebp,%ecx
+	andl	%edi,%esi
 	psrld	$30,%xmm2
-	rorl	$7,%edi
-	addl	%ebp,%ecx
-	addl	8(%esp),%ebx
 	xorl	%eax,%edi
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	xorl	%eax,%esi
 	movl	%ecx,%ebp
+	addl	8(%esp),%ebx
+	xorl	%edi,%edx
 	roll	$5,%ecx
 	por	%xmm2,%xmm0
-	andl	%edi,%esi
-	xorl	%eax,%edi
+	addl	%esi,%ebx
+	andl	%edx,%ebp
 	movdqa	96(%esp),%xmm2
-	xorl	%eax,%esi
+	xorl	%edi,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%esi,%ebx
 	addl	12(%esp),%eax
-	movdqa	%xmm0,%xmm3
-	xorl	%edi,%edx
+	xorl	%edi,%ebp
 	movl	%ebx,%esi
+	pshufd	$238,%xmm7,%xmm3
 	roll	$5,%ebx
-	andl	%edx,%ebp
-	xorl	%edi,%edx
-	xorl	%edi,%ebp
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%ebp,%eax
 	addl	16(%esp),%edi
 	pxor	%xmm5,%xmm1
-.byte	102,15,58,15,223,8
-	xorl	%edx,%esi
+	punpcklqdq	%xmm0,%xmm3
+	xorl	%ecx,%esi
 	movl	%eax,%ebp
 	roll	$5,%eax
 	pxor	%xmm2,%xmm1
 	movdqa	%xmm5,96(%esp)
-	xorl	%ecx,%esi
-	addl	%eax,%edi
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
 	movdqa	%xmm4,%xmm5
+	rorl	$7,%ebx
 	paddd	%xmm0,%xmm4
-	rorl	$7,%ebx
-	addl	%esi,%edi
+	addl	%eax,%edi
 	pxor	%xmm3,%xmm1
 	addl	20(%esp),%edx
-	xorl	%ecx,%ebp
+	xorl	%ebx,%ebp
 	movl	%edi,%esi
 	roll	$5,%edi
 	movdqa	%xmm1,%xmm3
 	movdqa	%xmm4,(%esp)
-	xorl	%ebx,%ebp
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
 	addl	%edi,%edx
-	rorl	$7,%eax
-	addl	%ebp,%edx
 	pslld	$2,%xmm1
 	addl	24(%esp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	psrld	$30,%xmm3
 	movl	%edx,%ebp
 	roll	$5,%edx
-	xorl	%eax,%esi
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	rorl	$7,%edi
 	addl	%edx,%ecx
-	rorl	$7,%edi
-	addl	%esi,%ecx
 	por	%xmm3,%xmm1
 	addl	28(%esp),%ebx
-	xorl	%eax,%ebp
+	xorl	%edi,%ebp
 	movdqa	64(%esp),%xmm3
 	movl	%ecx,%esi
 	roll	$5,%ecx
-	xorl	%edi,%ebp
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	rorl	$7,%edx
+	pshufd	$238,%xmm0,%xmm4
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	movdqa	%xmm1,%xmm4
-	addl	%ebp,%ebx
 	addl	32(%esp),%eax
 	pxor	%xmm6,%xmm2
-.byte	102,15,58,15,224,8
-	xorl	%edi,%esi
+	punpcklqdq	%xmm1,%xmm4
+	xorl	%edx,%esi
 	movl	%ebx,%ebp
 	roll	$5,%ebx
 	pxor	%xmm3,%xmm2
 	movdqa	%xmm6,64(%esp)
-	xorl	%edx,%esi
-	addl	%ebx,%eax
+	addl	%esi,%eax
+	xorl	%edx,%ebp
 	movdqa	128(%esp),%xmm6
+	rorl	$7,%ecx
 	paddd	%xmm1,%xmm5
-	rorl	$7,%ecx
-	addl	%esi,%eax
+	addl	%ebx,%eax
 	pxor	%xmm4,%xmm2
 	addl	36(%esp),%edi
-	xorl	%edx,%ebp
+	xorl	%ecx,%ebp
 	movl	%eax,%esi
 	roll	$5,%eax
 	movdqa	%xmm2,%xmm4
 	movdqa	%xmm5,16(%esp)
-	xorl	%ecx,%ebp
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
 	addl	%eax,%edi
-	rorl	$7,%ebx
-	addl	%ebp,%edi
 	pslld	$2,%xmm2
 	addl	40(%esp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	psrld	$30,%xmm4
 	movl	%edi,%ebp
 	roll	$5,%edi
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	rorl	$7,%eax
 	addl	%edi,%edx
-	rorl	$7,%eax
-	addl	%esi,%edx
 	por	%xmm4,%xmm2
 	addl	44(%esp),%ecx
-	xorl	%ebx,%ebp
+	xorl	%eax,%ebp
 	movdqa	80(%esp),%xmm4
 	movl	%edx,%esi
 	roll	$5,%edx
-	xorl	%eax,%ebp
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%edi
+	pshufd	$238,%xmm1,%xmm5
 	addl	%edx,%ecx
-	rorl	$7,%edi
-	movdqa	%xmm2,%xmm5
-	addl	%ebp,%ecx
 	addl	48(%esp),%ebx
 	pxor	%xmm7,%xmm3
-.byte	102,15,58,15,233,8
-	xorl	%eax,%esi
+	punpcklqdq	%xmm2,%xmm5
+	xorl	%edi,%esi
 	movl	%ecx,%ebp
 	roll	$5,%ecx
 	pxor	%xmm4,%xmm3
 	movdqa	%xmm7,80(%esp)
-	xorl	%edi,%esi
-	addl	%ecx,%ebx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
 	movdqa	%xmm6,%xmm7
+	rorl	$7,%edx
 	paddd	%xmm2,%xmm6
-	rorl	$7,%edx
-	addl	%esi,%ebx
+	addl	%ecx,%ebx
 	pxor	%xmm5,%xmm3
 	addl	52(%esp),%eax
-	xorl	%edi,%ebp
+	xorl	%edx,%ebp
 	movl	%ebx,%esi
 	roll	$5,%ebx
 	movdqa	%xmm3,%xmm5
 	movdqa	%xmm6,32(%esp)
-	xorl	%edx,%ebp
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%ebp,%eax
 	pslld	$2,%xmm3
 	addl	56(%esp),%edi
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	psrld	$30,%xmm5
 	movl	%eax,%ebp
 	roll	$5,%eax
-	xorl	%ecx,%esi
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	rorl	$7,%ebx
 	addl	%eax,%edi
-	rorl	$7,%ebx
-	addl	%esi,%edi
 	por	%xmm5,%xmm3
 	addl	60(%esp),%edx
-	xorl	%ecx,%ebp
+	xorl	%ebx,%ebp
 	movdqa	96(%esp),%xmm5
 	movl	%edi,%esi
 	roll	$5,%edi
-	xorl	%ebx,%ebp
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	pshufd	$238,%xmm2,%xmm6
 	addl	%edi,%edx
-	rorl	$7,%eax
-	movdqa	%xmm3,%xmm6
-	addl	%ebp,%edx
 	addl	(%esp),%ecx
 	pxor	%xmm0,%xmm4
-.byte	102,15,58,15,242,8
-	xorl	%ebx,%esi
+	punpcklqdq	%xmm3,%xmm6
+	xorl	%eax,%esi
 	movl	%edx,%ebp
 	roll	$5,%edx
 	pxor	%xmm5,%xmm4
 	movdqa	%xmm0,96(%esp)
-	xorl	%eax,%esi
-	addl	%edx,%ecx
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
 	movdqa	%xmm7,%xmm0
+	rorl	$7,%edi
 	paddd	%xmm3,%xmm7
-	rorl	$7,%edi
-	addl	%esi,%ecx
+	addl	%edx,%ecx
 	pxor	%xmm6,%xmm4
 	addl	4(%esp),%ebx
-	xorl	%eax,%ebp
+	xorl	%edi,%ebp
 	movl	%ecx,%esi
 	roll	$5,%ecx
 	movdqa	%xmm4,%xmm6
 	movdqa	%xmm7,48(%esp)
-	xorl	%edi,%ebp
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%ebp,%ebx
 	pslld	$2,%xmm4
 	addl	8(%esp),%eax
-	xorl	%edi,%esi
+	xorl	%edx,%esi
 	psrld	$30,%xmm6
 	movl	%ebx,%ebp
 	roll	$5,%ebx
-	xorl	%edx,%esi
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%esi,%eax
 	por	%xmm6,%xmm4
 	addl	12(%esp),%edi
-	xorl	%edx,%ebp
+	xorl	%ecx,%ebp
 	movdqa	64(%esp),%xmm6
 	movl	%eax,%esi
 	roll	$5,%eax
-	xorl	%ecx,%ebp
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	pshufd	$238,%xmm3,%xmm7
 	addl	%eax,%edi
-	rorl	$7,%ebx
-	movdqa	%xmm4,%xmm7
-	addl	%ebp,%edi
 	addl	16(%esp),%edx
 	pxor	%xmm1,%xmm5
-.byte	102,15,58,15,251,8
-	xorl	%ecx,%esi
+	punpcklqdq	%xmm4,%xmm7
+	xorl	%ebx,%esi
 	movl	%edi,%ebp
 	roll	$5,%edi
 	pxor	%xmm6,%xmm5
 	movdqa	%xmm1,64(%esp)
-	xorl	%ebx,%esi
-	addl	%edi,%edx
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
 	movdqa	%xmm0,%xmm1
+	rorl	$7,%eax
 	paddd	%xmm4,%xmm0
-	rorl	$7,%eax
-	addl	%esi,%edx
+	addl	%edi,%edx
 	pxor	%xmm7,%xmm5
 	addl	20(%esp),%ecx
-	xorl	%ebx,%ebp
+	xorl	%eax,%ebp
 	movl	%edx,%esi
 	roll	$5,%edx
 	movdqa	%xmm5,%xmm7
 	movdqa	%xmm0,(%esp)
-	xorl	%eax,%ebp
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%edi
 	addl	%edx,%ecx
-	rorl	$7,%edi
-	addl	%ebp,%ecx
 	pslld	$2,%xmm5
 	addl	24(%esp),%ebx
-	xorl	%eax,%esi
+	xorl	%edi,%esi
 	psrld	$30,%xmm7
 	movl	%ecx,%ebp
 	roll	$5,%ecx
-	xorl	%edi,%esi
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%esi,%ebx
 	por	%xmm7,%xmm5
 	addl	28(%esp),%eax
-	xorl	%edi,%ebp
 	movdqa	80(%esp),%xmm7
+	rorl	$7,%ecx
 	movl	%ebx,%esi
+	xorl	%edx,%ebp
 	roll	$5,%ebx
-	xorl	%edx,%ebp
+	pshufd	$238,%xmm4,%xmm0
+	addl	%ebp,%eax
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	movdqa	%xmm5,%xmm0
-	addl	%ebp,%eax
-	movl	%ecx,%ebp
+	addl	32(%esp),%edi
 	pxor	%xmm2,%xmm6
-.byte	102,15,58,15,196,8
+	punpcklqdq	%xmm5,%xmm0
+	andl	%ecx,%esi
 	xorl	%edx,%ecx
-	addl	32(%esp),%edi
-	andl	%edx,%ebp
+	rorl	$7,%ebx
 	pxor	%xmm7,%xmm6
 	movdqa	%xmm2,80(%esp)
-	andl	%ecx,%esi
-	rorl	$7,%ebx
+	movl	%eax,%ebp
+	xorl	%ecx,%esi
+	roll	$5,%eax
 	movdqa	%xmm1,%xmm2
+	addl	%esi,%edi
 	paddd	%xmm5,%xmm1
-	addl	%ebp,%edi
-	movl	%eax,%ebp
+	xorl	%ebx,%ebp
 	pxor	%xmm0,%xmm6
-	roll	$5,%eax
-	addl	%esi,%edi
-	xorl	%edx,%ecx
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
+	addl	36(%esp),%edx
+	andl	%ebx,%ebp
 	movdqa	%xmm6,%xmm0
 	movdqa	%xmm1,16(%esp)
-	movl	%ebx,%esi
 	xorl	%ecx,%ebx
-	addl	36(%esp),%edx
-	andl	%ecx,%esi
-	pslld	$2,%xmm6
-	andl	%ebx,%ebp
 	rorl	$7,%eax
-	psrld	$30,%xmm0
-	addl	%esi,%edx
 	movl	%edi,%esi
+	xorl	%ebx,%ebp
 	roll	$5,%edi
+	pslld	$2,%xmm6
 	addl	%ebp,%edx
-	xorl	%ecx,%ebx
+	xorl	%eax,%esi
+	psrld	$30,%xmm0
+	xorl	%ebx,%eax
 	addl	%edi,%edx
-	por	%xmm0,%xmm6
-	movl	%eax,%ebp
-	xorl	%ebx,%eax
-	movdqa	96(%esp),%xmm0
 	addl	40(%esp),%ecx
-	andl	%ebx,%ebp
 	andl	%eax,%esi
+	xorl	%ebx,%eax
 	rorl	$7,%edi
-	addl	%ebp,%ecx
-	movdqa	%xmm6,%xmm1
+	por	%xmm0,%xmm6
 	movl	%edx,%ebp
+	xorl	%eax,%esi
+	movdqa	96(%esp),%xmm0
 	roll	$5,%edx
 	addl	%esi,%ecx
-	xorl	%ebx,%eax
+	xorl	%edi,%ebp
+	xorl	%eax,%edi
 	addl	%edx,%ecx
-	movl	%edi,%esi
-	xorl	%eax,%edi
+	pshufd	$238,%xmm5,%xmm1
 	addl	44(%esp),%ebx
-	andl	%eax,%esi
 	andl	%edi,%ebp
+	xorl	%eax,%edi
 	rorl	$7,%edx
-	addl	%esi,%ebx
 	movl	%ecx,%esi
+	xorl	%edi,%ebp
 	roll	$5,%ecx
 	addl	%ebp,%ebx
-	xorl	%eax,%edi
+	xorl	%edx,%esi
+	xorl	%edi,%edx
 	addl	%ecx,%ebx
-	movl	%edx,%ebp
+	addl	48(%esp),%eax
 	pxor	%xmm3,%xmm7
-.byte	102,15,58,15,205,8
+	punpcklqdq	%xmm6,%xmm1
+	andl	%edx,%esi
 	xorl	%edi,%edx
-	addl	48(%esp),%eax
-	andl	%edi,%ebp
+	rorl	$7,%ecx
 	pxor	%xmm0,%xmm7
 	movdqa	%xmm3,96(%esp)
-	andl	%edx,%esi
-	rorl	$7,%ecx
+	movl	%ebx,%ebp
+	xorl	%edx,%esi
+	roll	$5,%ebx
 	movdqa	144(%esp),%xmm3
+	addl	%esi,%eax
 	paddd	%xmm6,%xmm2
-	addl	%ebp,%eax
-	movl	%ebx,%ebp
+	xorl	%ecx,%ebp
 	pxor	%xmm1,%xmm7
-	roll	$5,%ebx
-	addl	%esi,%eax
-	xorl	%edi,%edx
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
+	addl	52(%esp),%edi
+	andl	%ecx,%ebp
 	movdqa	%xmm7,%xmm1
 	movdqa	%xmm2,32(%esp)
-	movl	%ecx,%esi
 	xorl	%edx,%ecx
-	addl	52(%esp),%edi
-	andl	%edx,%esi
-	pslld	$2,%xmm7
-	andl	%ecx,%ebp
 	rorl	$7,%ebx
-	psrld	$30,%xmm1
-	addl	%esi,%edi
 	movl	%eax,%esi
+	xorl	%ecx,%ebp
 	roll	$5,%eax
+	pslld	$2,%xmm7
 	addl	%ebp,%edi
-	xorl	%edx,%ecx
+	xorl	%ebx,%esi
+	psrld	$30,%xmm1
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
-	por	%xmm1,%xmm7
-	movl	%ebx,%ebp
-	xorl	%ecx,%ebx
-	movdqa	64(%esp),%xmm1
 	addl	56(%esp),%edx
-	andl	%ecx,%ebp
 	andl	%ebx,%esi
+	xorl	%ecx,%ebx
 	rorl	$7,%eax
-	addl	%ebp,%edx
-	movdqa	%xmm7,%xmm2
+	por	%xmm1,%xmm7
 	movl	%edi,%ebp
+	xorl	%ebx,%esi
+	movdqa	64(%esp),%xmm1
 	roll	$5,%edi
 	addl	%esi,%edx
-	xorl	%ecx,%ebx
+	xorl	%eax,%ebp
+	xorl	%ebx,%eax
 	addl	%edi,%edx
-	movl	%eax,%esi
-	xorl	%ebx,%eax
+	pshufd	$238,%xmm6,%xmm2
 	addl	60(%esp),%ecx
-	andl	%ebx,%esi
 	andl	%eax,%ebp
+	xorl	%ebx,%eax
 	rorl	$7,%edi
-	addl	%esi,%ecx
 	movl	%edx,%esi
+	xorl	%eax,%ebp
 	roll	$5,%edx
 	addl	%ebp,%ecx
-	xorl	%ebx,%eax
+	xorl	%edi,%esi
+	xorl	%eax,%edi
 	addl	%edx,%ecx
-	movl	%edi,%ebp
+	addl	(%esp),%ebx
 	pxor	%xmm4,%xmm0
-.byte	102,15,58,15,214,8
+	punpcklqdq	%xmm7,%xmm2
+	andl	%edi,%esi
 	xorl	%eax,%edi
-	addl	(%esp),%ebx
-	andl	%eax,%ebp
+	rorl	$7,%edx
 	pxor	%xmm1,%xmm0
 	movdqa	%xmm4,64(%esp)
-	andl	%edi,%esi
-	rorl	$7,%edx
+	movl	%ecx,%ebp
+	xorl	%edi,%esi
+	roll	$5,%ecx
 	movdqa	%xmm3,%xmm4
+	addl	%esi,%ebx
 	paddd	%xmm7,%xmm3
-	addl	%ebp,%ebx
-	movl	%ecx,%ebp
+	xorl	%edx,%ebp
 	pxor	%xmm2,%xmm0
-	roll	$5,%ecx
-	addl	%esi,%ebx
-	xorl	%eax,%edi
+	xorl	%edi,%edx
 	addl	%ecx,%ebx
+	addl	4(%esp),%eax
+	andl	%edx,%ebp
 	movdqa	%xmm0,%xmm2
 	movdqa	%xmm3,48(%esp)
-	movl	%edx,%esi
 	xorl	%edi,%edx
-	addl	4(%esp),%eax
-	andl	%edi,%esi
-	pslld	$2,%xmm0
-	andl	%edx,%ebp
 	rorl	$7,%ecx
-	psrld	$30,%xmm2
-	addl	%esi,%eax
 	movl	%ebx,%esi
+	xorl	%edx,%ebp
 	roll	$5,%ebx
+	pslld	$2,%xmm0
 	addl	%ebp,%eax
-	xorl	%edi,%edx
+	xorl	%ecx,%esi
+	psrld	$30,%xmm2
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	por	%xmm2,%xmm0
-	movl	%ecx,%ebp
-	xorl	%edx,%ecx
-	movdqa	80(%esp),%xmm2
 	addl	8(%esp),%edi
-	andl	%edx,%ebp
 	andl	%ecx,%esi
+	xorl	%edx,%ecx
 	rorl	$7,%ebx
-	addl	%ebp,%edi
-	movdqa	%xmm0,%xmm3
+	por	%xmm2,%xmm0
 	movl	%eax,%ebp
+	xorl	%ecx,%esi
+	movdqa	80(%esp),%xmm2
 	roll	$5,%eax
 	addl	%esi,%edi
-	xorl	%edx,%ecx
+	xorl	%ebx,%ebp
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
-	movl	%ebx,%esi
-	xorl	%ecx,%ebx
+	pshufd	$238,%xmm7,%xmm3
 	addl	12(%esp),%edx
-	andl	%ecx,%esi
 	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
 	rorl	$7,%eax
-	addl	%esi,%edx
 	movl	%edi,%esi
+	xorl	%ebx,%ebp
 	roll	$5,%edi
 	addl	%ebp,%edx
-	xorl	%ecx,%ebx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
 	addl	%edi,%edx
-	movl	%eax,%ebp
+	addl	16(%esp),%ecx
 	pxor	%xmm5,%xmm1
-.byte	102,15,58,15,223,8
+	punpcklqdq	%xmm0,%xmm3
+	andl	%eax,%esi
 	xorl	%ebx,%eax
-	addl	16(%esp),%ecx
-	andl	%ebx,%ebp
+	rorl	$7,%edi
 	pxor	%xmm2,%xmm1
 	movdqa	%xmm5,80(%esp)
-	andl	%eax,%esi
-	rorl	$7,%edi
+	movl	%edx,%ebp
+	xorl	%eax,%esi
+	roll	$5,%edx
 	movdqa	%xmm4,%xmm5
+	addl	%esi,%ecx
 	paddd	%xmm0,%xmm4
-	addl	%ebp,%ecx
-	movl	%edx,%ebp
+	xorl	%edi,%ebp
 	pxor	%xmm3,%xmm1
-	roll	$5,%edx
-	addl	%esi,%ecx
-	xorl	%ebx,%eax
+	xorl	%eax,%edi
 	addl	%edx,%ecx
+	addl	20(%esp),%ebx
+	andl	%edi,%ebp
 	movdqa	%xmm1,%xmm3
 	movdqa	%xmm4,(%esp)
-	movl	%edi,%esi
 	xorl	%eax,%edi
-	addl	20(%esp),%ebx
-	andl	%eax,%esi
-	pslld	$2,%xmm1
-	andl	%edi,%ebp
 	rorl	$7,%edx
-	psrld	$30,%xmm3
-	addl	%esi,%ebx
 	movl	%ecx,%esi
+	xorl	%edi,%ebp
 	roll	$5,%ecx
+	pslld	$2,%xmm1
 	addl	%ebp,%ebx
-	xorl	%eax,%edi
+	xorl	%edx,%esi
+	psrld	$30,%xmm3
+	xorl	%edi,%edx
 	addl	%ecx,%ebx
-	por	%xmm3,%xmm1
-	movl	%edx,%ebp
-	xorl	%edi,%edx
-	movdqa	96(%esp),%xmm3
 	addl	24(%esp),%eax
-	andl	%edi,%ebp
 	andl	%edx,%esi
+	xorl	%edi,%edx
 	rorl	$7,%ecx
-	addl	%ebp,%eax
-	movdqa	%xmm1,%xmm4
+	por	%xmm3,%xmm1
 	movl	%ebx,%ebp
+	xorl	%edx,%esi
+	movdqa	96(%esp),%xmm3
 	roll	$5,%ebx
 	addl	%esi,%eax
-	xorl	%edi,%edx
+	xorl	%ecx,%ebp
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	movl	%ecx,%esi
-	xorl	%edx,%ecx
+	pshufd	$238,%xmm0,%xmm4
 	addl	28(%esp),%edi
-	andl	%edx,%esi
 	andl	%ecx,%ebp
+	xorl	%edx,%ecx
 	rorl	$7,%ebx
-	addl	%esi,%edi
 	movl	%eax,%esi
+	xorl	%ecx,%ebp
 	roll	$5,%eax
 	addl	%ebp,%edi
-	xorl	%edx,%ecx
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
-	movl	%ebx,%ebp
+	addl	32(%esp),%edx
 	pxor	%xmm6,%xmm2
-.byte	102,15,58,15,224,8
+	punpcklqdq	%xmm1,%xmm4
+	andl	%ebx,%esi
 	xorl	%ecx,%ebx
-	addl	32(%esp),%edx
-	andl	%ecx,%ebp
+	rorl	$7,%eax
 	pxor	%xmm3,%xmm2
 	movdqa	%xmm6,96(%esp)
-	andl	%ebx,%esi
-	rorl	$7,%eax
+	movl	%edi,%ebp
+	xorl	%ebx,%esi
+	roll	$5,%edi
 	movdqa	%xmm5,%xmm6
+	addl	%esi,%edx
 	paddd	%xmm1,%xmm5
-	addl	%ebp,%edx
-	movl	%edi,%ebp
+	xorl	%eax,%ebp
 	pxor	%xmm4,%xmm2
-	roll	$5,%edi
-	addl	%esi,%edx
-	xorl	%ecx,%ebx
+	xorl	%ebx,%eax
 	addl	%edi,%edx
+	addl	36(%esp),%ecx
+	andl	%eax,%ebp
 	movdqa	%xmm2,%xmm4
 	movdqa	%xmm5,16(%esp)
-	movl	%eax,%esi
 	xorl	%ebx,%eax
-	addl	36(%esp),%ecx
-	andl	%ebx,%esi
-	pslld	$2,%xmm2
-	andl	%eax,%ebp
 	rorl	$7,%edi
-	psrld	$30,%xmm4
-	addl	%esi,%ecx
 	movl	%edx,%esi
+	xorl	%eax,%ebp
 	roll	$5,%edx
+	pslld	$2,%xmm2
 	addl	%ebp,%ecx
-	xorl	%ebx,%eax
+	xorl	%edi,%esi
+	psrld	$30,%xmm4
+	xorl	%eax,%edi
 	addl	%edx,%ecx
-	por	%xmm4,%xmm2
-	movl	%edi,%ebp
-	xorl	%eax,%edi
-	movdqa	64(%esp),%xmm4
 	addl	40(%esp),%ebx
-	andl	%eax,%ebp
 	andl	%edi,%esi
+	xorl	%eax,%edi
 	rorl	$7,%edx
-	addl	%ebp,%ebx
-	movdqa	%xmm2,%xmm5
+	por	%xmm4,%xmm2
 	movl	%ecx,%ebp
+	xorl	%edi,%esi
+	movdqa	64(%esp),%xmm4
 	roll	$5,%ecx
 	addl	%esi,%ebx
-	xorl	%eax,%edi
+	xorl	%edx,%ebp
+	xorl	%edi,%edx
 	addl	%ecx,%ebx
-	movl	%edx,%esi
-	xorl	%edi,%edx
+	pshufd	$238,%xmm1,%xmm5
 	addl	44(%esp),%eax
-	andl	%edi,%esi
 	andl	%edx,%ebp
+	xorl	%edi,%edx
 	rorl	$7,%ecx
-	addl	%esi,%eax
 	movl	%ebx,%esi
+	xorl	%edx,%ebp
 	roll	$5,%ebx
 	addl	%ebp,%eax
-	xorl	%edi,%edx
+	xorl	%edx,%esi
 	addl	%ebx,%eax
 	addl	48(%esp),%edi
 	pxor	%xmm7,%xmm3
-.byte	102,15,58,15,233,8
-	xorl	%edx,%esi
+	punpcklqdq	%xmm2,%xmm5
+	xorl	%ecx,%esi
 	movl	%eax,%ebp
 	roll	$5,%eax
 	pxor	%xmm4,%xmm3
 	movdqa	%xmm7,64(%esp)
-	xorl	%ecx,%esi
-	addl	%eax,%edi
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
 	movdqa	%xmm6,%xmm7
+	rorl	$7,%ebx
 	paddd	%xmm2,%xmm6
-	rorl	$7,%ebx
-	addl	%esi,%edi
+	addl	%eax,%edi
 	pxor	%xmm5,%xmm3
 	addl	52(%esp),%edx
-	xorl	%ecx,%ebp
+	xorl	%ebx,%ebp
 	movl	%edi,%esi
 	roll	$5,%edi
 	movdqa	%xmm3,%xmm5
 	movdqa	%xmm6,32(%esp)
-	xorl	%ebx,%ebp
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
 	addl	%edi,%edx
-	rorl	$7,%eax
-	addl	%ebp,%edx
 	pslld	$2,%xmm3
 	addl	56(%esp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	psrld	$30,%xmm5
 	movl	%edx,%ebp
 	roll	$5,%edx
-	xorl	%eax,%esi
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	rorl	$7,%edi
 	addl	%edx,%ecx
-	rorl	$7,%edi
-	addl	%esi,%ecx
 	por	%xmm5,%xmm3
 	addl	60(%esp),%ebx
-	xorl	%eax,%ebp
+	xorl	%edi,%ebp
 	movl	%ecx,%esi
 	roll	$5,%ecx
-	xorl	%edi,%ebp
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%ebp,%ebx
 	addl	(%esp),%eax
-	paddd	%xmm3,%xmm7
-	xorl	%edi,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%ebp
 	roll	$5,%ebx
-	xorl	%edx,%esi
-	movdqa	%xmm7,48(%esp)
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	rorl	$7,%ecx
+	paddd	%xmm3,%xmm7
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%esi,%eax
 	addl	4(%esp),%edi
-	xorl	%edx,%ebp
+	xorl	%ecx,%ebp
 	movl	%eax,%esi
+	movdqa	%xmm7,48(%esp)
 	roll	$5,%eax
-	xorl	%ecx,%ebp
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
 	addl	%eax,%edi
-	rorl	$7,%ebx
-	addl	%ebp,%edi
 	addl	8(%esp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	movl	%edi,%ebp
 	roll	$5,%edi
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	rorl	$7,%eax
 	addl	%edi,%edx
-	rorl	$7,%eax
-	addl	%esi,%edx
 	addl	12(%esp),%ecx
-	xorl	%ebx,%ebp
+	xorl	%eax,%ebp
 	movl	%edx,%esi
 	roll	$5,%edx
-	xorl	%eax,%ebp
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%edi
 	addl	%edx,%ecx
-	rorl	$7,%edi
-	addl	%ebp,%ecx
 	movl	196(%esp),%ebp
 	cmpl	200(%esp),%ebp
-	je	.L005done
+	je	.L007done
 	movdqa	160(%esp),%xmm7
 	movdqa	176(%esp),%xmm6
 	movdqu	(%ebp),%xmm0
@@ -6228,113 +6521,112 @@
 	movl	%ebp,196(%esp)
 	movdqa	%xmm7,96(%esp)
 	addl	16(%esp),%ebx
-	xorl	%eax,%esi
-.byte	102,15,56,0,206
+	xorl	%edi,%esi
 	movl	%ecx,%ebp
 	roll	$5,%ecx
-	paddd	%xmm7,%xmm0
-	xorl	%edi,%esi
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	rorl	$7,%edx
+.byte	102,15,56,0,206
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%esi,%ebx
-	movdqa	%xmm0,(%esp)
 	addl	20(%esp),%eax
-	xorl	%edi,%ebp
-	psubd	%xmm7,%xmm0
+	xorl	%edx,%ebp
 	movl	%ebx,%esi
+	paddd	%xmm7,%xmm0
 	roll	$5,%ebx
-	xorl	%edx,%ebp
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
+	movdqa	%xmm0,(%esp)
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%ebp,%eax
 	addl	24(%esp),%edi
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%ebp
+	psubd	%xmm7,%xmm0
 	roll	$5,%eax
-	xorl	%ecx,%esi
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	rorl	$7,%ebx
 	addl	%eax,%edi
-	rorl	$7,%ebx
-	addl	%esi,%edi
 	addl	28(%esp),%edx
-	xorl	%ecx,%ebp
+	xorl	%ebx,%ebp
 	movl	%edi,%esi
 	roll	$5,%edi
-	xorl	%ebx,%ebp
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
 	addl	%edi,%edx
-	rorl	$7,%eax
-	addl	%ebp,%edx
 	addl	32(%esp),%ecx
-	xorl	%ebx,%esi
-.byte	102,15,56,0,214
+	xorl	%eax,%esi
 	movl	%edx,%ebp
 	roll	$5,%edx
-	paddd	%xmm7,%xmm1
-	xorl	%eax,%esi
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	rorl	$7,%edi
+.byte	102,15,56,0,214
 	addl	%edx,%ecx
-	rorl	$7,%edi
-	addl	%esi,%ecx
-	movdqa	%xmm1,16(%esp)
 	addl	36(%esp),%ebx
-	xorl	%eax,%ebp
-	psubd	%xmm7,%xmm1
+	xorl	%edi,%ebp
 	movl	%ecx,%esi
+	paddd	%xmm7,%xmm1
 	roll	$5,%ecx
-	xorl	%edi,%ebp
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	rorl	$7,%edx
+	movdqa	%xmm1,16(%esp)
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%ebp,%ebx
 	addl	40(%esp),%eax
-	xorl	%edi,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%ebp
+	psubd	%xmm7,%xmm1
 	roll	$5,%ebx
-	xorl	%edx,%esi
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%esi,%eax
 	addl	44(%esp),%edi
-	xorl	%edx,%ebp
+	xorl	%ecx,%ebp
 	movl	%eax,%esi
 	roll	$5,%eax
-	xorl	%ecx,%ebp
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
 	addl	%eax,%edi
-	rorl	$7,%ebx
-	addl	%ebp,%edi
 	addl	48(%esp),%edx
-	xorl	%ecx,%esi
-.byte	102,15,56,0,222
+	xorl	%ebx,%esi
 	movl	%edi,%ebp
 	roll	$5,%edi
-	paddd	%xmm7,%xmm2
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	rorl	$7,%eax
+.byte	102,15,56,0,222
 	addl	%edi,%edx
-	rorl	$7,%eax
-	addl	%esi,%edx
-	movdqa	%xmm2,32(%esp)
 	addl	52(%esp),%ecx
-	xorl	%ebx,%ebp
-	psubd	%xmm7,%xmm2
+	xorl	%eax,%ebp
 	movl	%edx,%esi
+	paddd	%xmm7,%xmm2
 	roll	$5,%edx
-	xorl	%eax,%ebp
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%edi
+	movdqa	%xmm2,32(%esp)
 	addl	%edx,%ecx
-	rorl	$7,%edi
-	addl	%ebp,%ecx
 	addl	56(%esp),%ebx
-	xorl	%eax,%esi
+	xorl	%edi,%esi
 	movl	%ecx,%ebp
+	psubd	%xmm7,%xmm2
 	roll	$5,%ecx
-	xorl	%edi,%esi
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%esi,%ebx
 	addl	60(%esp),%eax
-	xorl	%edi,%ebp
+	xorl	%edx,%ebp
 	movl	%ebx,%esi
 	roll	$5,%ebx
-	xorl	%edx,%ebp
+	addl	%ebp,%eax
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%ebp,%eax
 	movl	192(%esp),%ebp
 	addl	(%ebp),%eax
 	addl	4(%ebp),%esi
@@ -6344,109 +6636,112 @@
 	movl	%esi,4(%ebp)
 	addl	16(%ebp),%edi
 	movl	%ecx,8(%ebp)
-	movl	%esi,%ebx
+	movl	%ecx,%ebx
 	movl	%edx,12(%ebp)
+	xorl	%edx,%ebx
 	movl	%edi,16(%ebp)
-	movdqa	%xmm1,%xmm4
-	jmp	.L004loop
+	movl	%esi,%ebp
+	pshufd	$238,%xmm0,%xmm4
+	andl	%ebx,%esi
+	movl	%ebp,%ebx
+	jmp	.L006loop
 .align	16
-.L005done:
+.L007done:
 	addl	16(%esp),%ebx
-	xorl	%eax,%esi
+	xorl	%edi,%esi
 	movl	%ecx,%ebp
 	roll	$5,%ecx
-	xorl	%edi,%esi
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%esi,%ebx
 	addl	20(%esp),%eax
-	xorl	%edi,%ebp
+	xorl	%edx,%ebp
 	movl	%ebx,%esi
 	roll	$5,%ebx
-	xorl	%edx,%ebp
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%ebp,%eax
 	addl	24(%esp),%edi
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%ebp
 	roll	$5,%eax
-	xorl	%ecx,%esi
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	rorl	$7,%ebx
 	addl	%eax,%edi
-	rorl	$7,%ebx
-	addl	%esi,%edi
 	addl	28(%esp),%edx
-	xorl	%ecx,%ebp
+	xorl	%ebx,%ebp
 	movl	%edi,%esi
 	roll	$5,%edi
-	xorl	%ebx,%ebp
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
 	addl	%edi,%edx
-	rorl	$7,%eax
-	addl	%ebp,%edx
 	addl	32(%esp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	movl	%edx,%ebp
 	roll	$5,%edx
-	xorl	%eax,%esi
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	rorl	$7,%edi
 	addl	%edx,%ecx
-	rorl	$7,%edi
-	addl	%esi,%ecx
 	addl	36(%esp),%ebx
-	xorl	%eax,%ebp
+	xorl	%edi,%ebp
 	movl	%ecx,%esi
 	roll	$5,%ecx
-	xorl	%edi,%ebp
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%ebp,%ebx
 	addl	40(%esp),%eax
-	xorl	%edi,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%ebp
 	roll	$5,%ebx
-	xorl	%edx,%esi
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%esi,%eax
 	addl	44(%esp),%edi
-	xorl	%edx,%ebp
+	xorl	%ecx,%ebp
 	movl	%eax,%esi
 	roll	$5,%eax
-	xorl	%ecx,%ebp
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
 	addl	%eax,%edi
-	rorl	$7,%ebx
-	addl	%ebp,%edi
 	addl	48(%esp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	movl	%edi,%ebp
 	roll	$5,%edi
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	rorl	$7,%eax
 	addl	%edi,%edx
-	rorl	$7,%eax
-	addl	%esi,%edx
 	addl	52(%esp),%ecx
-	xorl	%ebx,%ebp
+	xorl	%eax,%ebp
 	movl	%edx,%esi
 	roll	$5,%edx
-	xorl	%eax,%ebp
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%edi
 	addl	%edx,%ecx
-	rorl	$7,%edi
-	addl	%ebp,%ecx
 	addl	56(%esp),%ebx
-	xorl	%eax,%esi
+	xorl	%edi,%esi
 	movl	%ecx,%ebp
 	roll	$5,%ecx
-	xorl	%edi,%esi
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	rorl	$7,%edx
 	addl	%ecx,%ebx
-	rorl	$7,%edx
-	addl	%esi,%ebx
 	addl	60(%esp),%eax
-	xorl	%edi,%ebp
+	xorl	%edx,%ebp
 	movl	%ebx,%esi
 	roll	$5,%ebx
-	xorl	%edx,%ebp
+	addl	%ebp,%eax
+	rorl	$7,%ecx
 	addl	%ebx,%eax
-	rorl	$7,%ecx
-	addl	%ebp,%eax
 	movl	192(%esp),%ebp
 	addl	(%ebp),%eax
 	movl	204(%esp),%esp
@@ -6472,10 +6767,10 @@
 	pushl	%ebx
 	pushl	%esi
 	pushl	%edi
-	call	.L006pic_point
-.L006pic_point:
+	call	.L008pic_point
+.L008pic_point:
 	popl	%ebp
-	leal	.LK_XX_XX-.L006pic_point(%ebp),%ebp
+	leal	.LK_XX_XX-.L008pic_point(%ebp),%ebp
 .Lavx_shortcut:
 	vzeroall
 	vmovdqa	(%ebp),%xmm7
@@ -6520,893 +6815,874 @@
 	vpaddd	%xmm7,%xmm1,%xmm5
 	vpaddd	%xmm7,%xmm2,%xmm6
 	vmovdqa	%xmm4,(%esp)
+	movl	%ecx,%ebp
 	vmovdqa	%xmm5,16(%esp)
+	xorl	%edx,%ebp
 	vmovdqa	%xmm6,32(%esp)
-	jmp	.L007loop
+	andl	%ebp,%esi
+	jmp	.L009loop
 .align	16
-.L007loop:
-	addl	(%esp),%edi
-	xorl	%edx,%ecx
+.L009loop:
+	shrdl	$2,%ebx,%ebx
+	xorl	%edx,%esi
 	vpalignr	$8,%xmm0,%xmm1,%xmm4
 	movl	%eax,%ebp
-	shldl	$5,%eax,%eax
+	addl	(%esp),%edi
 	vpaddd	%xmm3,%xmm7,%xmm7
 	vmovdqa	%xmm0,64(%esp)
-	andl	%ecx,%esi
-	xorl	%edx,%ecx
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
 	vpsrldq	$4,%xmm3,%xmm6
-	xorl	%edx,%esi
+	addl	%esi,%edi
+	andl	%ebx,%ebp
+	vpxor	%xmm0,%xmm4,%xmm4
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
-	vpxor	%xmm0,%xmm4,%xmm4
-	shrdl	$2,%ebx,%ebx
-	addl	%esi,%edi
 	vpxor	%xmm2,%xmm6,%xmm6
-	addl	4(%esp),%edx
-	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%ebp
 	vmovdqa	%xmm7,48(%esp)
 	movl	%edi,%esi
+	addl	4(%esp),%edx
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ebx,%eax
 	shldl	$5,%edi,%edi
-	vpxor	%xmm6,%xmm4,%xmm4
-	andl	%ebx,%ebp
-	xorl	%ecx,%ebx
-	xorl	%ecx,%ebp
-	addl	%edi,%edx
+	addl	%ebp,%edx
+	andl	%eax,%esi
 	vpsrld	$31,%xmm4,%xmm6
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
-	addl	8(%esp),%ecx
 	xorl	%ebx,%eax
+	addl	%edi,%edx
+	shrdl	$7,%edi,%edi
+	xorl	%ebx,%esi
 	vpslldq	$12,%xmm4,%xmm0
 	vpaddd	%xmm4,%xmm4,%xmm4
 	movl	%edx,%ebp
+	addl	8(%esp),%ecx
+	xorl	%eax,%edi
 	shldl	$5,%edx,%edx
-	andl	%eax,%esi
-	xorl	%ebx,%eax
 	vpsrld	$30,%xmm0,%xmm7
 	vpor	%xmm6,%xmm4,%xmm4
-	xorl	%ebx,%esi
+	addl	%esi,%ecx
+	andl	%edi,%ebp
+	xorl	%eax,%edi
 	addl	%edx,%ecx
-	shrdl	$7,%edi,%edi
-	addl	%esi,%ecx
 	vpslld	$2,%xmm0,%xmm0
-	addl	12(%esp),%ebx
-	xorl	%eax,%edi
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%ebp
 	vpxor	%xmm7,%xmm4,%xmm4
 	movl	%ecx,%esi
+	addl	12(%esp),%ebx
+	xorl	%edi,%edx
 	shldl	$5,%ecx,%ecx
-	andl	%edi,%ebp
-	xorl	%eax,%edi
 	vpxor	%xmm0,%xmm4,%xmm4
-	xorl	%eax,%ebp
-	addl	%ecx,%ebx
+	addl	%ebp,%ebx
+	andl	%edx,%esi
 	vmovdqa	96(%esp),%xmm0
-	shrdl	$7,%edx,%edx
-	addl	%ebp,%ebx
-	addl	16(%esp),%eax
 	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	shrdl	$7,%ecx,%ecx
+	xorl	%edi,%esi
 	vpalignr	$8,%xmm1,%xmm2,%xmm5
 	movl	%ebx,%ebp
-	shldl	$5,%ebx,%ebx
+	addl	16(%esp),%eax
 	vpaddd	%xmm4,%xmm0,%xmm0
 	vmovdqa	%xmm1,80(%esp)
-	andl	%edx,%esi
-	xorl	%edi,%edx
+	xorl	%edx,%ecx
+	shldl	$5,%ebx,%ebx
 	vpsrldq	$4,%xmm4,%xmm7
-	xorl	%edi,%esi
+	addl	%esi,%eax
+	andl	%ecx,%ebp
+	vpxor	%xmm1,%xmm5,%xmm5
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	vpxor	%xmm1,%xmm5,%xmm5
-	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	vpxor	%xmm3,%xmm7,%xmm7
-	addl	20(%esp),%edi
-	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%ebp
 	vmovdqa	%xmm0,(%esp)
 	movl	%eax,%esi
+	addl	20(%esp),%edi
+	vpxor	%xmm7,%xmm5,%xmm5
+	xorl	%ecx,%ebx
 	shldl	$5,%eax,%eax
-	vpxor	%xmm7,%xmm5,%xmm5
-	andl	%ecx,%ebp
-	xorl	%edx,%ecx
-	xorl	%edx,%ebp
-	addl	%eax,%edi
+	addl	%ebp,%edi
+	andl	%ebx,%esi
 	vpsrld	$31,%xmm5,%xmm7
-	shrdl	$7,%ebx,%ebx
-	addl	%ebp,%edi
-	addl	24(%esp),%edx
 	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%esi
 	vpslldq	$12,%xmm5,%xmm1
 	vpaddd	%xmm5,%xmm5,%xmm5
 	movl	%edi,%ebp
+	addl	24(%esp),%edx
+	xorl	%ebx,%eax
 	shldl	$5,%edi,%edi
-	andl	%ebx,%esi
-	xorl	%ecx,%ebx
 	vpsrld	$30,%xmm1,%xmm0
 	vpor	%xmm7,%xmm5,%xmm5
-	xorl	%ecx,%esi
+	addl	%esi,%edx
+	andl	%eax,%ebp
+	xorl	%ebx,%eax
 	addl	%edi,%edx
-	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	vpslld	$2,%xmm1,%xmm1
-	addl	28(%esp),%ecx
-	xorl	%ebx,%eax
+	shrdl	$7,%edi,%edi
+	xorl	%ebx,%ebp
 	vpxor	%xmm0,%xmm5,%xmm5
 	movl	%edx,%esi
+	addl	28(%esp),%ecx
+	xorl	%eax,%edi
 	shldl	$5,%edx,%edx
-	andl	%eax,%ebp
-	xorl	%ebx,%eax
 	vpxor	%xmm1,%xmm5,%xmm5
-	xorl	%ebx,%ebp
-	addl	%edx,%ecx
+	addl	%ebp,%ecx
+	andl	%edi,%esi
 	vmovdqa	112(%esp),%xmm1
-	shrdl	$7,%edi,%edi
-	addl	%ebp,%ecx
-	addl	32(%esp),%ebx
 	xorl	%eax,%edi
+	addl	%edx,%ecx
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%esi
 	vpalignr	$8,%xmm2,%xmm3,%xmm6
 	movl	%ecx,%ebp
-	shldl	$5,%ecx,%ecx
+	addl	32(%esp),%ebx
 	vpaddd	%xmm5,%xmm1,%xmm1
 	vmovdqa	%xmm2,96(%esp)
-	andl	%edi,%esi
-	xorl	%eax,%edi
+	xorl	%edi,%edx
+	shldl	$5,%ecx,%ecx
 	vpsrldq	$4,%xmm5,%xmm0
-	xorl	%eax,%esi
+	addl	%esi,%ebx
+	andl	%edx,%ebp
+	vpxor	%xmm2,%xmm6,%xmm6
+	xorl	%edi,%edx
 	addl	%ecx,%ebx
-	vpxor	%xmm2,%xmm6,%xmm6
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	vpxor	%xmm4,%xmm0,%xmm0
-	addl	36(%esp),%eax
-	xorl	%edi,%edx
+	shrdl	$7,%ecx,%ecx
+	xorl	%edi,%ebp
 	vmovdqa	%xmm1,16(%esp)
 	movl	%ebx,%esi
+	addl	36(%esp),%eax
+	vpxor	%xmm0,%xmm6,%xmm6
+	xorl	%edx,%ecx
 	shldl	$5,%ebx,%ebx
-	vpxor	%xmm0,%xmm6,%xmm6
-	andl	%edx,%ebp
-	xorl	%edi,%edx
-	xorl	%edi,%ebp
-	addl	%ebx,%eax
+	addl	%ebp,%eax
+	andl	%ecx,%esi
 	vpsrld	$31,%xmm6,%xmm0
-	shrdl	$7,%ecx,%ecx
-	addl	%ebp,%eax
-	addl	40(%esp),%edi
 	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%esi
 	vpslldq	$12,%xmm6,%xmm2
 	vpaddd	%xmm6,%xmm6,%xmm6
 	movl	%eax,%ebp
+	addl	40(%esp),%edi
+	xorl	%ecx,%ebx
 	shldl	$5,%eax,%eax
-	andl	%ecx,%esi
-	xorl	%edx,%ecx
 	vpsrld	$30,%xmm2,%xmm1
 	vpor	%xmm0,%xmm6,%xmm6
-	xorl	%edx,%esi
+	addl	%esi,%edi
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
-	shrdl	$7,%ebx,%ebx
-	addl	%esi,%edi
 	vpslld	$2,%xmm2,%xmm2
 	vmovdqa	64(%esp),%xmm0
-	addl	44(%esp),%edx
-	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%ebp
 	vpxor	%xmm1,%xmm6,%xmm6
 	movl	%edi,%esi
+	addl	44(%esp),%edx
+	xorl	%ebx,%eax
 	shldl	$5,%edi,%edi
-	andl	%ebx,%ebp
-	xorl	%ecx,%ebx
 	vpxor	%xmm2,%xmm6,%xmm6
-	xorl	%ecx,%ebp
-	addl	%edi,%edx
+	addl	%ebp,%edx
+	andl	%eax,%esi
 	vmovdqa	112(%esp),%xmm2
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
-	addl	48(%esp),%ecx
 	xorl	%ebx,%eax
+	addl	%edi,%edx
+	shrdl	$7,%edi,%edi
+	xorl	%ebx,%esi
 	vpalignr	$8,%xmm3,%xmm4,%xmm7
 	movl	%edx,%ebp
-	shldl	$5,%edx,%edx
+	addl	48(%esp),%ecx
 	vpaddd	%xmm6,%xmm2,%xmm2
 	vmovdqa	%xmm3,64(%esp)
-	andl	%eax,%esi
-	xorl	%ebx,%eax
+	xorl	%eax,%edi
+	shldl	$5,%edx,%edx
 	vpsrldq	$4,%xmm6,%xmm1
-	xorl	%ebx,%esi
+	addl	%esi,%ecx
+	andl	%edi,%ebp
+	vpxor	%xmm3,%xmm7,%xmm7
+	xorl	%eax,%edi
 	addl	%edx,%ecx
-	vpxor	%xmm3,%xmm7,%xmm7
-	shrdl	$7,%edi,%edi
-	addl	%esi,%ecx
 	vpxor	%xmm5,%xmm1,%xmm1
-	addl	52(%esp),%ebx
-	xorl	%eax,%edi
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%ebp
 	vmovdqa	%xmm2,32(%esp)
 	movl	%ecx,%esi
+	addl	52(%esp),%ebx
+	vpxor	%xmm1,%xmm7,%xmm7
+	xorl	%edi,%edx
 	shldl	$5,%ecx,%ecx
-	vpxor	%xmm1,%xmm7,%xmm7
-	andl	%edi,%ebp
-	xorl	%eax,%edi
-	xorl	%eax,%ebp
-	addl	%ecx,%ebx
+	addl	%ebp,%ebx
+	andl	%edx,%esi
 	vpsrld	$31,%xmm7,%xmm1
-	shrdl	$7,%edx,%edx
-	addl	%ebp,%ebx
-	addl	56(%esp),%eax
 	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	shrdl	$7,%ecx,%ecx
+	xorl	%edi,%esi
 	vpslldq	$12,%xmm7,%xmm3
 	vpaddd	%xmm7,%xmm7,%xmm7
 	movl	%ebx,%ebp
+	addl	56(%esp),%eax
+	xorl	%edx,%ecx
 	shldl	$5,%ebx,%ebx
-	andl	%edx,%esi
-	xorl	%edi,%edx
 	vpsrld	$30,%xmm3,%xmm2
 	vpor	%xmm1,%xmm7,%xmm7
-	xorl	%edi,%esi
+	addl	%esi,%eax
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	vpslld	$2,%xmm3,%xmm3
 	vmovdqa	80(%esp),%xmm1
-	addl	60(%esp),%edi
-	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%ebp
 	vpxor	%xmm2,%xmm7,%xmm7
 	movl	%eax,%esi
+	addl	60(%esp),%edi
+	xorl	%ecx,%ebx
 	shldl	$5,%eax,%eax
-	andl	%ecx,%ebp
-	xorl	%edx,%ecx
 	vpxor	%xmm3,%xmm7,%xmm7
-	xorl	%edx,%ebp
+	addl	%ebp,%edi
+	andl	%ebx,%esi
+	vmovdqa	112(%esp),%xmm3
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
-	vmovdqa	112(%esp),%xmm3
-	shrdl	$7,%ebx,%ebx
-	addl	%ebp,%edi
 	vpalignr	$8,%xmm6,%xmm7,%xmm2
 	vpxor	%xmm4,%xmm0,%xmm0
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%esi
+	movl	%edi,%ebp
 	addl	(%esp),%edx
-	xorl	%ecx,%ebx
-	movl	%edi,%ebp
-	shldl	$5,%edi,%edi
 	vpxor	%xmm1,%xmm0,%xmm0
 	vmovdqa	%xmm4,80(%esp)
-	andl	%ebx,%esi
-	xorl	%ecx,%ebx
+	xorl	%ebx,%eax
+	shldl	$5,%edi,%edi
 	vmovdqa	%xmm3,%xmm4
 	vpaddd	%xmm7,%xmm3,%xmm3
-	xorl	%ecx,%esi
-	addl	%edi,%edx
+	addl	%esi,%edx
+	andl	%eax,%ebp
 	vpxor	%xmm2,%xmm0,%xmm0
-	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
-	addl	4(%esp),%ecx
 	xorl	%ebx,%eax
+	addl	%edi,%edx
+	shrdl	$7,%edi,%edi
+	xorl	%ebx,%ebp
 	vpsrld	$30,%xmm0,%xmm2
 	vmovdqa	%xmm3,48(%esp)
 	movl	%edx,%esi
+	addl	4(%esp),%ecx
+	xorl	%eax,%edi
 	shldl	$5,%edx,%edx
-	andl	%eax,%ebp
-	xorl	%ebx,%eax
 	vpslld	$2,%xmm0,%xmm0
-	xorl	%ebx,%ebp
-	addl	%edx,%ecx
-	shrdl	$7,%edi,%edi
 	addl	%ebp,%ecx
-	addl	8(%esp),%ebx
+	andl	%edi,%esi
 	xorl	%eax,%edi
+	addl	%edx,%ecx
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%esi
 	movl	%ecx,%ebp
+	addl	8(%esp),%ebx
+	vpor	%xmm2,%xmm0,%xmm0
+	xorl	%edi,%edx
 	shldl	$5,%ecx,%ecx
-	vpor	%xmm2,%xmm0,%xmm0
-	andl	%edi,%esi
-	xorl	%eax,%edi
 	vmovdqa	96(%esp),%xmm2
-	xorl	%eax,%esi
+	addl	%esi,%ebx
+	andl	%edx,%ebp
+	xorl	%edi,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	addl	12(%esp),%eax
-	xorl	%edi,%edx
+	xorl	%edi,%ebp
 	movl	%ebx,%esi
 	shldl	$5,%ebx,%ebx
-	andl	%edx,%ebp
-	xorl	%edi,%edx
-	xorl	%edi,%ebp
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%ebp,%eax
 	vpalignr	$8,%xmm7,%xmm0,%xmm3
 	vpxor	%xmm5,%xmm1,%xmm1
 	addl	16(%esp),%edi
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%ebp
 	shldl	$5,%eax,%eax
 	vpxor	%xmm2,%xmm1,%xmm1
 	vmovdqa	%xmm5,96(%esp)
-	xorl	%ecx,%esi
-	addl	%eax,%edi
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
 	vmovdqa	%xmm4,%xmm5
 	vpaddd	%xmm0,%xmm4,%xmm4
 	shrdl	$7,%ebx,%ebx
-	addl	%esi,%edi
+	addl	%eax,%edi
 	vpxor	%xmm3,%xmm1,%xmm1
 	addl	20(%esp),%edx
-	xorl	%ecx,%ebp
+	xorl	%ebx,%ebp
 	movl	%edi,%esi
 	shldl	$5,%edi,%edi
 	vpsrld	$30,%xmm1,%xmm3
 	vmovdqa	%xmm4,(%esp)
-	xorl	%ebx,%ebp
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
 	addl	%edi,%edx
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
 	vpslld	$2,%xmm1,%xmm1
 	addl	24(%esp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	movl	%edx,%ebp
 	shldl	$5,%edx,%edx
-	xorl	%eax,%esi
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	shrdl	$7,%edi,%edi
 	addl	%edx,%ecx
-	shrdl	$7,%edi,%edi
-	addl	%esi,%ecx
 	vpor	%xmm3,%xmm1,%xmm1
 	addl	28(%esp),%ebx
-	xorl	%eax,%ebp
+	xorl	%edi,%ebp
 	vmovdqa	64(%esp),%xmm3
 	movl	%ecx,%esi
 	shldl	$5,%ecx,%ecx
-	xorl	%edi,%ebp
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%ebp,%ebx
 	vpalignr	$8,%xmm0,%xmm1,%xmm4
 	vpxor	%xmm6,%xmm2,%xmm2
 	addl	32(%esp),%eax
-	xorl	%edi,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%ebp
 	shldl	$5,%ebx,%ebx
 	vpxor	%xmm3,%xmm2,%xmm2
 	vmovdqa	%xmm6,64(%esp)
-	xorl	%edx,%esi
-	addl	%ebx,%eax
+	addl	%esi,%eax
+	xorl	%edx,%ebp
 	vmovdqa	128(%esp),%xmm6
 	vpaddd	%xmm1,%xmm5,%xmm5
 	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
+	addl	%ebx,%eax
 	vpxor	%xmm4,%xmm2,%xmm2
 	addl	36(%esp),%edi
-	xorl	%edx,%ebp
+	xorl	%ecx,%ebp
 	movl	%eax,%esi
 	shldl	$5,%eax,%eax
 	vpsrld	$30,%xmm2,%xmm4
 	vmovdqa	%xmm5,16(%esp)
-	xorl	%ecx,%ebp
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%edi
-	shrdl	$7,%ebx,%ebx
-	addl	%ebp,%edi
 	vpslld	$2,%xmm2,%xmm2
 	addl	40(%esp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	movl	%edi,%ebp
 	shldl	$5,%edi,%edi
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	shrdl	$7,%eax,%eax
 	addl	%edi,%edx
-	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	vpor	%xmm4,%xmm2,%xmm2
 	addl	44(%esp),%ecx
-	xorl	%ebx,%ebp
+	xorl	%eax,%ebp
 	vmovdqa	80(%esp),%xmm4
 	movl	%edx,%esi
 	shldl	$5,%edx,%edx
-	xorl	%eax,%ebp
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%edi,%edi
 	addl	%edx,%ecx
-	shrdl	$7,%edi,%edi
-	addl	%ebp,%ecx
 	vpalignr	$8,%xmm1,%xmm2,%xmm5
 	vpxor	%xmm7,%xmm3,%xmm3
 	addl	48(%esp),%ebx
-	xorl	%eax,%esi
+	xorl	%edi,%esi
 	movl	%ecx,%ebp
 	shldl	$5,%ecx,%ecx
 	vpxor	%xmm4,%xmm3,%xmm3
 	vmovdqa	%xmm7,80(%esp)
-	xorl	%edi,%esi
-	addl	%ecx,%ebx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
 	vmovdqa	%xmm6,%xmm7
 	vpaddd	%xmm2,%xmm6,%xmm6
 	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
+	addl	%ecx,%ebx
 	vpxor	%xmm5,%xmm3,%xmm3
 	addl	52(%esp),%eax
-	xorl	%edi,%ebp
+	xorl	%edx,%ebp
 	movl	%ebx,%esi
 	shldl	$5,%ebx,%ebx
 	vpsrld	$30,%xmm3,%xmm5
 	vmovdqa	%xmm6,32(%esp)
-	xorl	%edx,%ebp
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%ebp,%eax
 	vpslld	$2,%xmm3,%xmm3
 	addl	56(%esp),%edi
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%ebp
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%esi
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%edi
-	shrdl	$7,%ebx,%ebx
-	addl	%esi,%edi
 	vpor	%xmm5,%xmm3,%xmm3
 	addl	60(%esp),%edx
-	xorl	%ecx,%ebp
+	xorl	%ebx,%ebp
 	vmovdqa	96(%esp),%xmm5
 	movl	%edi,%esi
 	shldl	$5,%edi,%edi
-	xorl	%ebx,%ebp
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
 	addl	%edi,%edx
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
 	vpalignr	$8,%xmm2,%xmm3,%xmm6
 	vpxor	%xmm0,%xmm4,%xmm4
 	addl	(%esp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	movl	%edx,%ebp
 	shldl	$5,%edx,%edx
 	vpxor	%xmm5,%xmm4,%xmm4
 	vmovdqa	%xmm0,96(%esp)
-	xorl	%eax,%esi
-	addl	%edx,%ecx
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
 	vmovdqa	%xmm7,%xmm0
 	vpaddd	%xmm3,%xmm7,%xmm7
 	shrdl	$7,%edi,%edi
-	addl	%esi,%ecx
+	addl	%edx,%ecx
 	vpxor	%xmm6,%xmm4,%xmm4
 	addl	4(%esp),%ebx
-	xorl	%eax,%ebp
+	xorl	%edi,%ebp
 	movl	%ecx,%esi
 	shldl	$5,%ecx,%ecx
 	vpsrld	$30,%xmm4,%xmm6
 	vmovdqa	%xmm7,48(%esp)
-	xorl	%edi,%ebp
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%ebp,%ebx
 	vpslld	$2,%xmm4,%xmm4
 	addl	8(%esp),%eax
-	xorl	%edi,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%ebp
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%esi
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	vpor	%xmm6,%xmm4,%xmm4
 	addl	12(%esp),%edi
-	xorl	%edx,%ebp
+	xorl	%ecx,%ebp
 	vmovdqa	64(%esp),%xmm6
 	movl	%eax,%esi
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%ebp
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%edi
-	shrdl	$7,%ebx,%ebx
-	addl	%ebp,%edi
 	vpalignr	$8,%xmm3,%xmm4,%xmm7
 	vpxor	%xmm1,%xmm5,%xmm5
 	addl	16(%esp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	movl	%edi,%ebp
 	shldl	$5,%edi,%edi
 	vpxor	%xmm6,%xmm5,%xmm5
 	vmovdqa	%xmm1,64(%esp)
-	xorl	%ebx,%esi
-	addl	%edi,%edx
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
 	vmovdqa	%xmm0,%xmm1
 	vpaddd	%xmm4,%xmm0,%xmm0
 	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
+	addl	%edi,%edx
 	vpxor	%xmm7,%xmm5,%xmm5
 	addl	20(%esp),%ecx
-	xorl	%ebx,%ebp
+	xorl	%eax,%ebp
 	movl	%edx,%esi
 	shldl	$5,%edx,%edx
 	vpsrld	$30,%xmm5,%xmm7
 	vmovdqa	%xmm0,(%esp)
-	xorl	%eax,%ebp
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%edi,%edi
 	addl	%edx,%ecx
-	shrdl	$7,%edi,%edi
-	addl	%ebp,%ecx
 	vpslld	$2,%xmm5,%xmm5
 	addl	24(%esp),%ebx
-	xorl	%eax,%esi
+	xorl	%edi,%esi
 	movl	%ecx,%ebp
 	shldl	$5,%ecx,%ecx
-	xorl	%edi,%esi
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	vpor	%xmm7,%xmm5,%xmm5
 	addl	28(%esp),%eax
-	xorl	%edi,%ebp
 	vmovdqa	80(%esp),%xmm7
+	shrdl	$7,%ecx,%ecx
 	movl	%ebx,%esi
+	xorl	%edx,%ebp
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%ebp
+	addl	%ebp,%eax
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%ebp,%eax
 	vpalignr	$8,%xmm4,%xmm5,%xmm0
 	vpxor	%xmm2,%xmm6,%xmm6
-	movl	%ecx,%ebp
+	addl	32(%esp),%edi
+	andl	%ecx,%esi
 	xorl	%edx,%ecx
-	addl	32(%esp),%edi
-	andl	%edx,%ebp
+	shrdl	$7,%ebx,%ebx
 	vpxor	%xmm7,%xmm6,%xmm6
 	vmovdqa	%xmm2,80(%esp)
-	andl	%ecx,%esi
-	shrdl	$7,%ebx,%ebx
+	movl	%eax,%ebp
+	xorl	%ecx,%esi
 	vmovdqa	%xmm1,%xmm2
 	vpaddd	%xmm5,%xmm1,%xmm1
-	addl	%ebp,%edi
-	movl	%eax,%ebp
-	vpxor	%xmm0,%xmm6,%xmm6
 	shldl	$5,%eax,%eax
 	addl	%esi,%edi
-	xorl	%edx,%ecx
+	vpxor	%xmm0,%xmm6,%xmm6
+	xorl	%ebx,%ebp
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
+	addl	36(%esp),%edx
 	vpsrld	$30,%xmm6,%xmm0
 	vmovdqa	%xmm1,16(%esp)
-	movl	%ebx,%esi
+	andl	%ebx,%ebp
 	xorl	%ecx,%ebx
-	addl	36(%esp),%edx
-	andl	%ecx,%esi
-	vpslld	$2,%xmm6,%xmm6
-	andl	%ebx,%ebp
 	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	movl	%edi,%esi
+	vpslld	$2,%xmm6,%xmm6
+	xorl	%ebx,%ebp
 	shldl	$5,%edi,%edi
 	addl	%ebp,%edx
-	xorl	%ecx,%ebx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
 	addl	%edi,%edx
+	addl	40(%esp),%ecx
+	andl	%eax,%esi
 	vpor	%xmm0,%xmm6,%xmm6
-	movl	%eax,%ebp
 	xorl	%ebx,%eax
+	shrdl	$7,%edi,%edi
 	vmovdqa	96(%esp),%xmm0
-	addl	40(%esp),%ecx
-	andl	%ebx,%ebp
-	andl	%eax,%esi
-	shrdl	$7,%edi,%edi
-	addl	%ebp,%ecx
 	movl	%edx,%ebp
+	xorl	%eax,%esi
 	shldl	$5,%edx,%edx
 	addl	%esi,%ecx
-	xorl	%ebx,%eax
+	xorl	%edi,%ebp
+	xorl	%eax,%edi
 	addl	%edx,%ecx
-	movl	%edi,%esi
-	xorl	%eax,%edi
 	addl	44(%esp),%ebx
-	andl	%eax,%esi
 	andl	%edi,%ebp
+	xorl	%eax,%edi
 	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	movl	%ecx,%esi
+	xorl	%edi,%ebp
 	shldl	$5,%ecx,%ecx
 	addl	%ebp,%ebx
-	xorl	%eax,%edi
+	xorl	%edx,%esi
+	xorl	%edi,%edx
 	addl	%ecx,%ebx
 	vpalignr	$8,%xmm5,%xmm6,%xmm1
 	vpxor	%xmm3,%xmm7,%xmm7
-	movl	%edx,%ebp
+	addl	48(%esp),%eax
+	andl	%edx,%esi
 	xorl	%edi,%edx
-	addl	48(%esp),%eax
-	andl	%edi,%ebp
+	shrdl	$7,%ecx,%ecx
 	vpxor	%xmm0,%xmm7,%xmm7
 	vmovdqa	%xmm3,96(%esp)
-	andl	%edx,%esi
-	shrdl	$7,%ecx,%ecx
+	movl	%ebx,%ebp
+	xorl	%edx,%esi
 	vmovdqa	144(%esp),%xmm3
 	vpaddd	%xmm6,%xmm2,%xmm2
-	addl	%ebp,%eax
-	movl	%ebx,%ebp
-	vpxor	%xmm1,%xmm7,%xmm7
 	shldl	$5,%ebx,%ebx
 	addl	%esi,%eax
-	xorl	%edi,%edx
+	vpxor	%xmm1,%xmm7,%xmm7
+	xorl	%ecx,%ebp
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
+	addl	52(%esp),%edi
 	vpsrld	$30,%xmm7,%xmm1
 	vmovdqa	%xmm2,32(%esp)
-	movl	%ecx,%esi
+	andl	%ecx,%ebp
 	xorl	%edx,%ecx
-	addl	52(%esp),%edi
-	andl	%edx,%esi
-	vpslld	$2,%xmm7,%xmm7
-	andl	%ecx,%ebp
 	shrdl	$7,%ebx,%ebx
-	addl	%esi,%edi
 	movl	%eax,%esi
+	vpslld	$2,%xmm7,%xmm7
+	xorl	%ecx,%ebp
 	shldl	$5,%eax,%eax
 	addl	%ebp,%edi
-	xorl	%edx,%ecx
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
+	addl	56(%esp),%edx
+	andl	%ebx,%esi
 	vpor	%xmm1,%xmm7,%xmm7
-	movl	%ebx,%ebp
 	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
 	vmovdqa	64(%esp),%xmm1
-	addl	56(%esp),%edx
-	andl	%ecx,%ebp
-	andl	%ebx,%esi
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
 	movl	%edi,%ebp
+	xorl	%ebx,%esi
 	shldl	$5,%edi,%edi
 	addl	%esi,%edx
-	xorl	%ecx,%ebx
+	xorl	%eax,%ebp
+	xorl	%ebx,%eax
 	addl	%edi,%edx
-	movl	%eax,%esi
-	xorl	%ebx,%eax
 	addl	60(%esp),%ecx
-	andl	%ebx,%esi
 	andl	%eax,%ebp
+	xorl	%ebx,%eax
 	shrdl	$7,%edi,%edi
-	addl	%esi,%ecx
 	movl	%edx,%esi
+	xorl	%eax,%ebp
 	shldl	$5,%edx,%edx
 	addl	%ebp,%ecx
-	xorl	%ebx,%eax
+	xorl	%edi,%esi
+	xorl	%eax,%edi
 	addl	%edx,%ecx
 	vpalignr	$8,%xmm6,%xmm7,%xmm2
 	vpxor	%xmm4,%xmm0,%xmm0
-	movl	%edi,%ebp
+	addl	(%esp),%ebx
+	andl	%edi,%esi
 	xorl	%eax,%edi
-	addl	(%esp),%ebx
-	andl	%eax,%ebp
+	shrdl	$7,%edx,%edx
 	vpxor	%xmm1,%xmm0,%xmm0
 	vmovdqa	%xmm4,64(%esp)
-	andl	%edi,%esi
-	shrdl	$7,%edx,%edx
+	movl	%ecx,%ebp
+	xorl	%edi,%esi
 	vmovdqa	%xmm3,%xmm4
 	vpaddd	%xmm7,%xmm3,%xmm3
-	addl	%ebp,%ebx
-	movl	%ecx,%ebp
-	vpxor	%xmm2,%xmm0,%xmm0
 	shldl	$5,%ecx,%ecx
 	addl	%esi,%ebx
-	xorl	%eax,%edi
+	vpxor	%xmm2,%xmm0,%xmm0
+	xorl	%edx,%ebp
+	xorl	%edi,%edx
 	addl	%ecx,%ebx
+	addl	4(%esp),%eax
 	vpsrld	$30,%xmm0,%xmm2
 	vmovdqa	%xmm3,48(%esp)
-	movl	%edx,%esi
+	andl	%edx,%ebp
 	xorl	%edi,%edx
-	addl	4(%esp),%eax
-	andl	%edi,%esi
-	vpslld	$2,%xmm0,%xmm0
-	andl	%edx,%ebp
 	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	movl	%ebx,%esi
+	vpslld	$2,%xmm0,%xmm0
+	xorl	%edx,%ebp
 	shldl	$5,%ebx,%ebx
 	addl	%ebp,%eax
-	xorl	%edi,%edx
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
+	addl	8(%esp),%edi
+	andl	%ecx,%esi
 	vpor	%xmm2,%xmm0,%xmm0
-	movl	%ecx,%ebp
 	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
 	vmovdqa	80(%esp),%xmm2
-	addl	8(%esp),%edi
-	andl	%edx,%ebp
-	andl	%ecx,%esi
-	shrdl	$7,%ebx,%ebx
-	addl	%ebp,%edi
 	movl	%eax,%ebp
+	xorl	%ecx,%esi
 	shldl	$5,%eax,%eax
 	addl	%esi,%edi
-	xorl	%edx,%ecx
+	xorl	%ebx,%ebp
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
-	movl	%ebx,%esi
-	xorl	%ecx,%ebx
 	addl	12(%esp),%edx
-	andl	%ecx,%esi
 	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
 	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	movl	%edi,%esi
+	xorl	%ebx,%ebp
 	shldl	$5,%edi,%edi
 	addl	%ebp,%edx
-	xorl	%ecx,%ebx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
 	addl	%edi,%edx
 	vpalignr	$8,%xmm7,%xmm0,%xmm3
 	vpxor	%xmm5,%xmm1,%xmm1
-	movl	%eax,%ebp
+	addl	16(%esp),%ecx
+	andl	%eax,%esi
 	xorl	%ebx,%eax
-	addl	16(%esp),%ecx
-	andl	%ebx,%ebp
+	shrdl	$7,%edi,%edi
 	vpxor	%xmm2,%xmm1,%xmm1
 	vmovdqa	%xmm5,80(%esp)
-	andl	%eax,%esi
-	shrdl	$7,%edi,%edi
+	movl	%edx,%ebp
+	xorl	%eax,%esi
 	vmovdqa	%xmm4,%xmm5
 	vpaddd	%xmm0,%xmm4,%xmm4
-	addl	%ebp,%ecx
-	movl	%edx,%ebp
-	vpxor	%xmm3,%xmm1,%xmm1
 	shldl	$5,%edx,%edx
 	addl	%esi,%ecx
-	xorl	%ebx,%eax
+	vpxor	%xmm3,%xmm1,%xmm1
+	xorl	%edi,%ebp
+	xorl	%eax,%edi
 	addl	%edx,%ecx
+	addl	20(%esp),%ebx
 	vpsrld	$30,%xmm1,%xmm3
 	vmovdqa	%xmm4,(%esp)
-	movl	%edi,%esi
+	andl	%edi,%ebp
 	xorl	%eax,%edi
-	addl	20(%esp),%ebx
-	andl	%eax,%esi
-	vpslld	$2,%xmm1,%xmm1
-	andl	%edi,%ebp
 	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	movl	%ecx,%esi
+	vpslld	$2,%xmm1,%xmm1
+	xorl	%edi,%ebp
 	shldl	$5,%ecx,%ecx
 	addl	%ebp,%ebx
-	xorl	%eax,%edi
+	xorl	%edx,%esi
+	xorl	%edi,%edx
 	addl	%ecx,%ebx
+	addl	24(%esp),%eax
+	andl	%edx,%esi
 	vpor	%xmm3,%xmm1,%xmm1
-	movl	%edx,%ebp
 	xorl	%edi,%edx
+	shrdl	$7,%ecx,%ecx
 	vmovdqa	96(%esp),%xmm3
-	addl	24(%esp),%eax
-	andl	%edi,%ebp
-	andl	%edx,%esi
-	shrdl	$7,%ecx,%ecx
-	addl	%ebp,%eax
 	movl	%ebx,%ebp
+	xorl	%edx,%esi
 	shldl	$5,%ebx,%ebx
 	addl	%esi,%eax
-	xorl	%edi,%edx
+	xorl	%ecx,%ebp
+	xorl	%edx,%ecx
 	addl	%ebx,%eax
-	movl	%ecx,%esi
-	xorl	%edx,%ecx
 	addl	28(%esp),%edi
-	andl	%edx,%esi
 	andl	%ecx,%ebp
+	xorl	%edx,%ecx
 	shrdl	$7,%ebx,%ebx
-	addl	%esi,%edi
 	movl	%eax,%esi
+	xorl	%ecx,%ebp
 	shldl	$5,%eax,%eax
 	addl	%ebp,%edi
-	xorl	%edx,%ecx
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
 	addl	%eax,%edi
 	vpalignr	$8,%xmm0,%xmm1,%xmm4
 	vpxor	%xmm6,%xmm2,%xmm2
-	movl	%ebx,%ebp
+	addl	32(%esp),%edx
+	andl	%ebx,%esi
 	xorl	%ecx,%ebx
-	addl	32(%esp),%edx
-	andl	%ecx,%ebp
+	shrdl	$7,%eax,%eax
 	vpxor	%xmm3,%xmm2,%xmm2
 	vmovdqa	%xmm6,96(%esp)
-	andl	%ebx,%esi
-	shrdl	$7,%eax,%eax
+	movl	%edi,%ebp
+	xorl	%ebx,%esi
 	vmovdqa	%xmm5,%xmm6
 	vpaddd	%xmm1,%xmm5,%xmm5
-	addl	%ebp,%edx
-	movl	%edi,%ebp
-	vpxor	%xmm4,%xmm2,%xmm2
 	shldl	$5,%edi,%edi
 	addl	%esi,%edx
-	xorl	%ecx,%ebx
+	vpxor	%xmm4,%xmm2,%xmm2
+	xorl	%eax,%ebp
+	xorl	%ebx,%eax
 	addl	%edi,%edx
+	addl	36(%esp),%ecx
 	vpsrld	$30,%xmm2,%xmm4
 	vmovdqa	%xmm5,16(%esp)
-	movl	%eax,%esi
+	andl	%eax,%ebp
 	xorl	%ebx,%eax
-	addl	36(%esp),%ecx
-	andl	%ebx,%esi
-	vpslld	$2,%xmm2,%xmm2
-	andl	%eax,%ebp
 	shrdl	$7,%edi,%edi
-	addl	%esi,%ecx
 	movl	%edx,%esi
+	vpslld	$2,%xmm2,%xmm2
+	xorl	%eax,%ebp
 	shldl	$5,%edx,%edx
 	addl	%ebp,%ecx
-	xorl	%ebx,%eax
+	xorl	%edi,%esi
+	xorl	%eax,%edi
 	addl	%edx,%ecx
+	addl	40(%esp),%ebx
+	andl	%edi,%esi
 	vpor	%xmm4,%xmm2,%xmm2
-	movl	%edi,%ebp
 	xorl	%eax,%edi
+	shrdl	$7,%edx,%edx
 	vmovdqa	64(%esp),%xmm4
-	addl	40(%esp),%ebx
-	andl	%eax,%ebp
-	andl	%edi,%esi
-	shrdl	$7,%edx,%edx
-	addl	%ebp,%ebx
 	movl	%ecx,%ebp
+	xorl	%edi,%esi
 	shldl	$5,%ecx,%ecx
 	addl	%esi,%ebx
-	xorl	%eax,%edi
+	xorl	%edx,%ebp
+	xorl	%edi,%edx
 	addl	%ecx,%ebx
-	movl	%edx,%esi
-	xorl	%edi,%edx
 	addl	44(%esp),%eax
-	andl	%edi,%esi
 	andl	%edx,%ebp
+	xorl	%edi,%edx
 	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	movl	%ebx,%esi
+	xorl	%edx,%ebp
 	shldl	$5,%ebx,%ebx
 	addl	%ebp,%eax
-	xorl	%edi,%edx
+	xorl	%edx,%esi
 	addl	%ebx,%eax
 	vpalignr	$8,%xmm1,%xmm2,%xmm5
 	vpxor	%xmm7,%xmm3,%xmm3
 	addl	48(%esp),%edi
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%ebp
 	shldl	$5,%eax,%eax
 	vpxor	%xmm4,%xmm3,%xmm3
 	vmovdqa	%xmm7,64(%esp)
-	xorl	%ecx,%esi
-	addl	%eax,%edi
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
 	vmovdqa	%xmm6,%xmm7
 	vpaddd	%xmm2,%xmm6,%xmm6
 	shrdl	$7,%ebx,%ebx
-	addl	%esi,%edi
+	addl	%eax,%edi
 	vpxor	%xmm5,%xmm3,%xmm3
 	addl	52(%esp),%edx
-	xorl	%ecx,%ebp
+	xorl	%ebx,%ebp
 	movl	%edi,%esi
 	shldl	$5,%edi,%edi
 	vpsrld	$30,%xmm3,%xmm5
 	vmovdqa	%xmm6,32(%esp)
-	xorl	%ebx,%ebp
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
 	addl	%edi,%edx
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
 	vpslld	$2,%xmm3,%xmm3
 	addl	56(%esp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	movl	%edx,%ebp
 	shldl	$5,%edx,%edx
-	xorl	%eax,%esi
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	shrdl	$7,%edi,%edi
 	addl	%edx,%ecx
-	shrdl	$7,%edi,%edi
-	addl	%esi,%ecx
 	vpor	%xmm5,%xmm3,%xmm3
 	addl	60(%esp),%ebx
-	xorl	%eax,%ebp
+	xorl	%edi,%ebp
 	movl	%ecx,%esi
 	shldl	$5,%ecx,%ecx
-	xorl	%edi,%ebp
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%ebp,%ebx
 	addl	(%esp),%eax
 	vpaddd	%xmm3,%xmm7,%xmm7
-	xorl	%edi,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%ebp
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%esi
+	addl	%esi,%eax
 	vmovdqa	%xmm7,48(%esp)
+	xorl	%edx,%ebp
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	addl	4(%esp),%edi
-	xorl	%edx,%ebp
+	xorl	%ecx,%ebp
 	movl	%eax,%esi
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%ebp
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%edi
-	shrdl	$7,%ebx,%ebx
-	addl	%ebp,%edi
 	addl	8(%esp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	movl	%edi,%ebp
 	shldl	$5,%edi,%edi
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	shrdl	$7,%eax,%eax
 	addl	%edi,%edx
-	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	addl	12(%esp),%ecx
-	xorl	%ebx,%ebp
+	xorl	%eax,%ebp
 	movl	%edx,%esi
 	shldl	$5,%edx,%edx
-	xorl	%eax,%ebp
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%edi,%edi
 	addl	%edx,%ecx
-	shrdl	$7,%edi,%edi
-	addl	%ebp,%ecx
 	movl	196(%esp),%ebp
 	cmpl	200(%esp),%ebp
-	je	.L008done
+	je	.L010done
 	vmovdqa	160(%esp),%xmm7
 	vmovdqa	176(%esp),%xmm6
 	vmovdqu	(%ebp),%xmm0
@@ -7418,110 +7694,109 @@
 	movl	%ebp,196(%esp)
 	vmovdqa	%xmm7,96(%esp)
 	addl	16(%esp),%ebx
-	xorl	%eax,%esi
+	xorl	%edi,%esi
 	vpshufb	%xmm6,%xmm1,%xmm1
 	movl	%ecx,%ebp
 	shldl	$5,%ecx,%ecx
 	vpaddd	%xmm7,%xmm0,%xmm4
-	xorl	%edi,%esi
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	vmovdqa	%xmm4,(%esp)
 	addl	20(%esp),%eax
-	xorl	%edi,%ebp
+	xorl	%edx,%ebp
 	movl	%ebx,%esi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%ebp
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%ebp,%eax
 	addl	24(%esp),%edi
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%ebp
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%esi
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%edi
-	shrdl	$7,%ebx,%ebx
-	addl	%esi,%edi
 	addl	28(%esp),%edx
-	xorl	%ecx,%ebp
+	xorl	%ebx,%ebp
 	movl	%edi,%esi
 	shldl	$5,%edi,%edi
-	xorl	%ebx,%ebp
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
 	addl	%edi,%edx
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
 	addl	32(%esp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	vpshufb	%xmm6,%xmm2,%xmm2
 	movl	%edx,%ebp
 	shldl	$5,%edx,%edx
 	vpaddd	%xmm7,%xmm1,%xmm5
-	xorl	%eax,%esi
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	shrdl	$7,%edi,%edi
 	addl	%edx,%ecx
-	shrdl	$7,%edi,%edi
-	addl	%esi,%ecx
 	vmovdqa	%xmm5,16(%esp)
 	addl	36(%esp),%ebx
-	xorl	%eax,%ebp
+	xorl	%edi,%ebp
 	movl	%ecx,%esi
 	shldl	$5,%ecx,%ecx
-	xorl	%edi,%ebp
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%ebp,%ebx
 	addl	40(%esp),%eax
-	xorl	%edi,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%ebp
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%esi
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	addl	44(%esp),%edi
-	xorl	%edx,%ebp
+	xorl	%ecx,%ebp
 	movl	%eax,%esi
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%ebp
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%edi
-	shrdl	$7,%ebx,%ebx
-	addl	%ebp,%edi
 	addl	48(%esp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	vpshufb	%xmm6,%xmm3,%xmm3
 	movl	%edi,%ebp
 	shldl	$5,%edi,%edi
 	vpaddd	%xmm7,%xmm2,%xmm6
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	shrdl	$7,%eax,%eax
 	addl	%edi,%edx
-	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	vmovdqa	%xmm6,32(%esp)
 	addl	52(%esp),%ecx
-	xorl	%ebx,%ebp
+	xorl	%eax,%ebp
 	movl	%edx,%esi
 	shldl	$5,%edx,%edx
-	xorl	%eax,%ebp
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%edi,%edi
 	addl	%edx,%ecx
-	shrdl	$7,%edi,%edi
-	addl	%ebp,%ecx
 	addl	56(%esp),%ebx
-	xorl	%eax,%esi
+	xorl	%edi,%esi
 	movl	%ecx,%ebp
 	shldl	$5,%ecx,%ecx
-	xorl	%edi,%esi
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	addl	60(%esp),%eax
-	xorl	%edi,%ebp
+	xorl	%edx,%ebp
 	movl	%ebx,%esi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%ebp
+	addl	%ebp,%eax
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%ebp,%eax
 	movl	192(%esp),%ebp
 	addl	(%ebp),%eax
 	addl	4(%ebp),%esi
@@ -7530,109 +7805,112 @@
 	addl	12(%ebp),%edx
 	movl	%esi,4(%ebp)
 	addl	16(%ebp),%edi
+	movl	%ecx,%ebx
 	movl	%ecx,8(%ebp)
-	movl	%esi,%ebx
+	xorl	%edx,%ebx
 	movl	%edx,12(%ebp)
 	movl	%edi,16(%ebp)
-	jmp	.L007loop
+	movl	%esi,%ebp
+	andl	%ebx,%esi
+	movl	%ebp,%ebx
+	jmp	.L009loop
 .align	16
-.L008done:
+.L010done:
 	addl	16(%esp),%ebx
-	xorl	%eax,%esi
+	xorl	%edi,%esi
 	movl	%ecx,%ebp
 	shldl	$5,%ecx,%ecx
-	xorl	%edi,%esi
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	addl	20(%esp),%eax
-	xorl	%edi,%ebp
+	xorl	%edx,%ebp
 	movl	%ebx,%esi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%ebp
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%ebp,%eax
 	addl	24(%esp),%edi
-	xorl	%edx,%esi
+	xorl	%ecx,%esi
 	movl	%eax,%ebp
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%esi
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%edi
-	shrdl	$7,%ebx,%ebx
-	addl	%esi,%edi
 	addl	28(%esp),%edx
-	xorl	%ecx,%ebp
+	xorl	%ebx,%ebp
 	movl	%edi,%esi
 	shldl	$5,%edi,%edi
-	xorl	%ebx,%ebp
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
 	addl	%edi,%edx
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
 	addl	32(%esp),%ecx
-	xorl	%ebx,%esi
+	xorl	%eax,%esi
 	movl	%edx,%ebp
 	shldl	$5,%edx,%edx
-	xorl	%eax,%esi
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	shrdl	$7,%edi,%edi
 	addl	%edx,%ecx
-	shrdl	$7,%edi,%edi
-	addl	%esi,%ecx
 	addl	36(%esp),%ebx
-	xorl	%eax,%ebp
+	xorl	%edi,%ebp
 	movl	%ecx,%esi
 	shldl	$5,%ecx,%ecx
-	xorl	%edi,%ebp
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%ebp,%ebx
 	addl	40(%esp),%eax
-	xorl	%edi,%esi
+	xorl	%edx,%esi
 	movl	%ebx,%ebp
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%esi
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%esi,%eax
 	addl	44(%esp),%edi
-	xorl	%edx,%ebp
+	xorl	%ecx,%ebp
 	movl	%eax,%esi
 	shldl	$5,%eax,%eax
-	xorl	%ecx,%ebp
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
 	addl	%eax,%edi
-	shrdl	$7,%ebx,%ebx
-	addl	%ebp,%edi
 	addl	48(%esp),%edx
-	xorl	%ecx,%esi
+	xorl	%ebx,%esi
 	movl	%edi,%ebp
 	shldl	$5,%edi,%edi
-	xorl	%ebx,%esi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	shrdl	$7,%eax,%eax
 	addl	%edi,%edx
-	shrdl	$7,%eax,%eax
-	addl	%esi,%edx
 	addl	52(%esp),%ecx
-	xorl	%ebx,%ebp
+	xorl	%eax,%ebp
 	movl	%edx,%esi
 	shldl	$5,%edx,%edx
-	xorl	%eax,%ebp
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%edi,%edi
 	addl	%edx,%ecx
-	shrdl	$7,%edi,%edi
-	addl	%ebp,%ecx
 	addl	56(%esp),%ebx
-	xorl	%eax,%esi
+	xorl	%edi,%esi
 	movl	%ecx,%ebp
 	shldl	$5,%ecx,%ecx
-	xorl	%edi,%esi
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	shrdl	$7,%edx,%edx
 	addl	%ecx,%ebx
-	shrdl	$7,%edx,%edx
-	addl	%esi,%ebx
 	addl	60(%esp),%eax
-	xorl	%edi,%ebp
+	xorl	%edx,%ebp
 	movl	%ebx,%esi
 	shldl	$5,%ebx,%ebx
-	xorl	%edx,%ebp
+	addl	%ebp,%eax
+	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%ebp,%eax
 	vzeroall
 	movl	192(%esp),%ebp
 	addl	(%ebp),%eax
@@ -7659,9 +7937,10 @@
 .long	2400959708,2400959708,2400959708,2400959708
 .long	3395469782,3395469782,3395469782,3395469782
 .long	66051,67438087,134810123,202182159
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
 .byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115
 .byte	102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82
 .byte	89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
 .byte	114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.comm	OPENSSL_ia32cap_P,8,4
+.comm	OPENSSL_ia32cap_P,16,4
 #endif

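For orientation while reading the sha256-586.S hunks below (the updated file adds OPENSSL_ia32cap_P-based dispatch to SHA-extension, AVX, SSSE3 and unrolled code paths, visible in the .L004shaext/.L005AVX/.L006SSSE3/.L007unrolled labels), here is a minimal plain-C sketch of the SHA-256 block transform that the generated assembly computes. It is a reference model only, not code from this commit, and the file, function and variable names in it are illustrative. The staged rotates in the assembly (for example rorl $14, $5, $6 on the e word) are a nested factoring of the Sigma1 rotate amounts 6, 11, 25 used directly in the sketch, and likewise rorl $9, $11, $2 factors Sigma0's 2, 13, 22.

/*
 * Reference sketch of the SHA-256 compression step implemented by the
 * generated sha256-586.S below.  Illustrative only; not part of the diff.
 * Build (hypothetical file name): cc -O2 sha256_sketch.c && ./a.out
 */
#include <stdint.h>
#include <stdio.h>

static const uint32_t K[64] = {
    0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,
    0x923f82a4,0xab1c5ed5,0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
    0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,0xe49b69c1,0xefbe4786,
    0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
    0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,
    0x06ca6351,0x14292967,0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
    0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,0xa2bfe8a1,0xa81a664b,
    0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
    0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,
    0x5b9cca4f,0x682e6ff3,0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
    0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
};

static uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

/* Process one 512-bit block (16 big-endian words); state[8] is updated. */
static void sha256_block(uint32_t state[8], const uint32_t blk[16])
{
    uint32_t w[64], a, b, c, d, e, f, g, h;
    int i;

    for (i = 0; i < 16; i++)
        w[i] = blk[i];
    for (i = 16; i < 64; i++) {
        /* Message schedule: sigma0 and sigma1 of earlier words. */
        uint32_t s0 = ror32(w[i-15], 7) ^ ror32(w[i-15], 18) ^ (w[i-15] >> 3);
        uint32_t s1 = ror32(w[i-2], 17) ^ ror32(w[i-2], 19) ^ (w[i-2] >> 10);
        w[i] = w[i-16] + s0 + w[i-7] + s1;
    }
    a = state[0]; b = state[1]; c = state[2]; d = state[3];
    e = state[4]; f = state[5]; g = state[6]; h = state[7];
    for (i = 0; i < 64; i++) {
        /* Round function: Sigma1, Ch, Sigma0, Maj, plus round constant. */
        uint32_t S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
        uint32_t ch  = (e & f) ^ (~e & g);
        uint32_t t1  = h + S1 + ch + K[i] + w[i];
        uint32_t S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
        uint32_t maj = (a & b) ^ (a & c) ^ (b & c);
        uint32_t t2  = S0 + maj;
        h = g; g = f; f = e; e = d + t1;
        d = c; c = b; b = a; a = t1 + t2;
    }
    state[0] += a; state[1] += b; state[2] += c; state[3] += d;
    state[4] += e; state[5] += f; state[6] += g; state[7] += h;
}

int main(void)
{
    /* "abc", already padded to a single block (length 24 bits in word 15). */
    uint32_t blk[16] = { 0x61626380, 0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0, 0x18 };
    uint32_t st[8]  = { 0x6a09e667,0xbb67ae85,0x3c6ef372,0xa54ff53a,
                        0x510e527f,0x9b05688c,0x1f83d9ab,0x5be0cd19 };
    int i;

    sha256_block(st, blk);
    for (i = 0; i < 8; i++)
        printf("%08x", st[i]);   /* expect ba7816bf ... f20015ad */
    printf("\n");
    return 0;
}

Running the sketch prints the standard "abc" test vector, which is a quick way to sanity-check a reading of the round logic against the K256 constant table that appears in the hunks below.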
Modified: trunk/secure/lib/libcrypto/i386/sha256-586.S
===================================================================
--- trunk/secure/lib/libcrypto/i386/sha256-586.S	2019-01-20 05:38:15 UTC (rev 12153)
+++ trunk/secure/lib/libcrypto/i386/sha256-586.S	2019-01-20 05:38:27 UTC (rev 12154)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/i386/sha256-586.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from sha256-586.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/i386/sha256-586.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from sha256-586.pl. */
 #ifdef PIC
 .file	"sha256-586.S"
 .text
@@ -29,237 +29,6762 @@
 	movl	%edi,4(%esp)
 	movl	%eax,8(%esp)
 	movl	%ebx,12(%esp)
+	leal	OPENSSL_ia32cap_P-.L001K256(%ebp),%edx
+	movl	(%edx),%ecx
+	movl	4(%edx),%ebx
+	testl	$1048576,%ecx
+	jnz	.L002loop
+	movl	8(%edx),%edx
+	testl	$16777216,%ecx
+	jz	.L003no_xmm
+	andl	$1073741824,%ecx
+	andl	$268435968,%ebx
+	testl	$536870912,%edx
+	jnz	.L004shaext
+	orl	%ebx,%ecx
+	andl	$1342177280,%ecx
+	cmpl	$1342177280,%ecx
+	je	.L005AVX
+	testl	$512,%ebx
+	jnz	.L006SSSE3
+.L003no_xmm:
+	subl	%edi,%eax
+	cmpl	$256,%eax
+	jae	.L007unrolled
+	jmp	.L002loop
 .align	16
 .L002loop:
 	movl	(%edi),%eax
 	movl	4(%edi),%ebx
 	movl	8(%edi),%ecx
+	bswap	%eax
 	movl	12(%edi),%edx
-	bswap	%eax
 	bswap	%ebx
+	pushl	%eax
 	bswap	%ecx
+	pushl	%ebx
 	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
 	pushl	%ecx
 	pushl	%edx
 	movl	16(%edi),%eax
 	movl	20(%edi),%ebx
 	movl	24(%edi),%ecx
+	bswap	%eax
 	movl	28(%edi),%edx
-	bswap	%eax
 	bswap	%ebx
+	pushl	%eax
 	bswap	%ecx
+	pushl	%ebx
 	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
 	pushl	%ecx
 	pushl	%edx
 	movl	32(%edi),%eax
 	movl	36(%edi),%ebx
 	movl	40(%edi),%ecx
+	bswap	%eax
 	movl	44(%edi),%edx
-	bswap	%eax
 	bswap	%ebx
+	pushl	%eax
 	bswap	%ecx
+	pushl	%ebx
 	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
 	pushl	%ecx
 	pushl	%edx
 	movl	48(%edi),%eax
 	movl	52(%edi),%ebx
 	movl	56(%edi),%ecx
+	bswap	%eax
 	movl	60(%edi),%edx
-	bswap	%eax
 	bswap	%ebx
+	pushl	%eax
 	bswap	%ecx
+	pushl	%ebx
 	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
 	pushl	%ecx
 	pushl	%edx
 	addl	$64,%edi
-	subl	$32,%esp
-	movl	%edi,100(%esp)
+	leal	-36(%esp),%esp
+	movl	%edi,104(%esp)
 	movl	(%esi),%eax
 	movl	4(%esi),%ebx
 	movl	8(%esi),%ecx
 	movl	12(%esi),%edi
-	movl	%ebx,4(%esp)
-	movl	%ecx,8(%esp)
-	movl	%edi,12(%esp)
+	movl	%ebx,8(%esp)
+	xorl	%ecx,%ebx
+	movl	%ecx,12(%esp)
+	movl	%edi,16(%esp)
+	movl	%ebx,(%esp)
 	movl	16(%esi),%edx
 	movl	20(%esi),%ebx
 	movl	24(%esi),%ecx
 	movl	28(%esi),%edi
-	movl	%ebx,20(%esp)
-	movl	%ecx,24(%esp)
-	movl	%edi,28(%esp)
+	movl	%ebx,24(%esp)
+	movl	%ecx,28(%esp)
+	movl	%edi,32(%esp)
 .align	16
-.L00300_15:
-	movl	92(%esp),%ebx
+.L00800_15:
 	movl	%edx,%ecx
+	movl	24(%esp),%esi
 	rorl	$14,%ecx
-	movl	20(%esp),%esi
+	movl	28(%esp),%edi
 	xorl	%edx,%ecx
+	xorl	%edi,%esi
+	movl	96(%esp),%ebx
 	rorl	$5,%ecx
-	xorl	%edx,%ecx
-	rorl	$6,%ecx
-	movl	24(%esp),%edi
+	andl	%edx,%esi
+	movl	%edx,20(%esp)
+	xorl	%ecx,%edx
+	addl	32(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%esi,%ebx
+	rorl	$9,%ecx
+	addl	%edx,%ebx
+	movl	8(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,4(%esp)
+	leal	-4(%esp),%esp
+	rorl	$11,%ecx
+	movl	(%ebp),%esi
+	xorl	%eax,%ecx
+	movl	20(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%esi,%ebx
+	movl	%eax,(%esp)
+	addl	%ebx,%edx
+	andl	4(%esp),%eax
 	addl	%ecx,%ebx
+	xorl	%edi,%eax
+	addl	$4,%ebp
+	addl	%ebx,%eax
+	cmpl	$3248222580,%esi
+	jne	.L00800_15
+	movl	156(%esp),%ecx
+	jmp	.L00916_63
+.align	16
+.L00916_63:
+	movl	%ecx,%ebx
+	movl	104(%esp),%esi
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
 	xorl	%edi,%esi
-	movl	%edx,16(%esp)
-	movl	%eax,%ecx
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	160(%esp),%ebx
+	shrl	$10,%edi
+	addl	124(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	24(%esp),%esi
+	rorl	$14,%ecx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%edx,%ecx
+	xorl	%edi,%esi
+	movl	%ebx,96(%esp)
+	rorl	$5,%ecx
 	andl	%edx,%esi
-	movl	12(%esp),%edx
+	movl	%edx,20(%esp)
+	xorl	%ecx,%edx
+	addl	32(%esp),%ebx
 	xorl	%edi,%esi
-	movl	%eax,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
 	addl	%esi,%ebx
 	rorl	$9,%ecx
+	addl	%edx,%ebx
+	movl	8(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,4(%esp)
+	leal	-4(%esp),%esp
+	rorl	$11,%ecx
+	movl	(%ebp),%esi
+	xorl	%eax,%ecx
+	movl	20(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%esi,%ebx
+	movl	%eax,(%esp)
+	addl	%ebx,%edx
+	andl	4(%esp),%eax
+	addl	%ecx,%ebx
+	xorl	%edi,%eax
+	movl	156(%esp),%ecx
+	addl	$4,%ebp
+	addl	%ebx,%eax
+	cmpl	$3329325298,%esi
+	jne	.L00916_63
+	movl	356(%esp),%esi
+	movl	8(%esp),%ebx
+	movl	16(%esp),%ecx
+	addl	(%esi),%eax
+	addl	4(%esi),%ebx
+	addl	8(%esi),%edi
+	addl	12(%esi),%ecx
+	movl	%eax,(%esi)
+	movl	%ebx,4(%esi)
+	movl	%edi,8(%esi)
+	movl	%ecx,12(%esi)
+	movl	24(%esp),%eax
+	movl	28(%esp),%ebx
+	movl	32(%esp),%ecx
+	movl	360(%esp),%edi
+	addl	16(%esi),%edx
+	addl	20(%esi),%eax
+	addl	24(%esi),%ebx
+	addl	28(%esi),%ecx
+	movl	%edx,16(%esi)
+	movl	%eax,20(%esi)
+	movl	%ebx,24(%esi)
+	movl	%ecx,28(%esi)
+	leal	356(%esp),%esp
+	subl	$256,%ebp
+	cmpl	8(%esp),%edi
+	jb	.L002loop
+	movl	12(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	64
+.L001K256:
+.long	1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298
+.long	66051,67438087,134810123,202182159
+.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
+.byte	110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
+.byte	67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+.byte	112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+.byte	62,0
+.align	16
+.L007unrolled:
+	leal	-96(%esp),%esp
+	movl	(%esi),%eax
+	movl	4(%esi),%ebp
+	movl	8(%esi),%ecx
+	movl	12(%esi),%ebx
+	movl	%ebp,4(%esp)
+	xorl	%ecx,%ebp
+	movl	%ecx,8(%esp)
+	movl	%ebx,12(%esp)
+	movl	16(%esi),%edx
+	movl	20(%esi),%ebx
+	movl	24(%esi),%ecx
+	movl	28(%esi),%esi
+	movl	%ebx,20(%esp)
+	movl	%ecx,24(%esp)
+	movl	%esi,28(%esp)
+	jmp	.L010grand_loop
+.align	16
+.L010grand_loop:
+	movl	(%edi),%ebx
+	movl	4(%edi),%ecx
+	bswap	%ebx
+	movl	8(%edi),%esi
+	bswap	%ecx
+	movl	%ebx,32(%esp)
+	bswap	%esi
+	movl	%ecx,36(%esp)
+	movl	%esi,40(%esp)
+	movl	12(%edi),%ebx
+	movl	16(%edi),%ecx
+	bswap	%ebx
+	movl	20(%edi),%esi
+	bswap	%ecx
+	movl	%ebx,44(%esp)
+	bswap	%esi
+	movl	%ecx,48(%esp)
+	movl	%esi,52(%esp)
+	movl	24(%edi),%ebx
+	movl	28(%edi),%ecx
+	bswap	%ebx
+	movl	32(%edi),%esi
+	bswap	%ecx
+	movl	%ebx,56(%esp)
+	bswap	%esi
+	movl	%ecx,60(%esp)
+	movl	%esi,64(%esp)
+	movl	36(%edi),%ebx
+	movl	40(%edi),%ecx
+	bswap	%ebx
+	movl	44(%edi),%esi
+	bswap	%ecx
+	movl	%ebx,68(%esp)
+	bswap	%esi
+	movl	%ecx,72(%esp)
+	movl	%esi,76(%esp)
+	movl	48(%edi),%ebx
+	movl	52(%edi),%ecx
+	bswap	%ebx
+	movl	56(%edi),%esi
+	bswap	%ecx
+	movl	%ebx,80(%esp)
+	bswap	%esi
+	movl	%ecx,84(%esp)
+	movl	%esi,88(%esp)
+	movl	60(%edi),%ebx
+	addl	$64,%edi
+	bswap	%ebx
+	movl	%edi,100(%esp)
+	movl	%ebx,92(%esp)
+	movl	%edx,%ecx
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	32(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
 	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
 	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
 	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1116352408(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	36(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1899447441(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	40(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3049323471(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	44(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3921009573(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
 	movl	4(%esp),%esi
+	rorl	$14,%edx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	48(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
 	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	961987163(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
 	rorl	$2,%ecx
-	addl	%ebx,%edx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	52(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1508970993(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	56(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2453635748(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	60(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
 	movl	8(%esp),%edi
-	addl	%ecx,%ebx
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2870763221(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	64(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
 	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3624381080(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	68(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	310598401(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	72(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
 	movl	%eax,%ecx
-	subl	$4,%esp
-	orl	%esi,%eax
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	607225278(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	76(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
 	andl	%esi,%ecx
-	andl	%edi,%eax
-	movl	(%ebp),%esi
-	orl	%ecx,%eax
-	addl	$4,%ebp
-	addl	%ebx,%eax
-	addl	%esi,%edx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1426881987(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
 	addl	%esi,%eax
-	cmpl	$3248222580,%esi
-	jne	.L00300_15
-	movl	152(%esp),%ebx
-.align	16
-.L00416_63:
-	movl	%ebx,%esi
-	movl	100(%esp),%ecx
+	movl	%edx,%ecx
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	80(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1925078388(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	84(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
 	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2162078206(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	88(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2614888103(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	92(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3248222580(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	36(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	88(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	32(%esp),%ebx
+	shrl	$10,%edi
+	addl	68(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,32(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3835390401(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	40(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	92(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
 	movl	%ecx,%edi
+	rorl	$2,%ecx
 	xorl	%ebx,%esi
+	shrl	$3,%ebx
 	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	36(%esp),%ebx
+	shrl	$10,%edi
+	addl	72(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,36(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	4022224774(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	44(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	32(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
 	shrl	$3,%ebx
-	rorl	$2,%edi
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	40(%esp),%ebx
+	shrl	$10,%edi
+	addl	76(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,40(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	264347078(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	48(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	36(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
 	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	44(%esp),%ebx
+	shrl	$10,%edi
+	addl	80(%esp),%ebx
+	movl	%edx,%esi
 	xorl	%ecx,%edi
-	rorl	$17,%edi
-	shrl	$10,%ecx
-	addl	156(%esp),%ebx
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,44(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
 	xorl	%ecx,%edi
-	addl	120(%esp),%ebx
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	604807628(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	52(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	40(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	48(%esp),%ebx
+	shrl	$10,%edi
+	addl	84(%esp),%ebx
 	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
 	addl	%edi,%ebx
-	rorl	$14,%ecx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,48(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	770255983(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	56(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	44(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	52(%esp),%ebx
+	shrl	$10,%edi
+	addl	88(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,52(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1249150122(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	60(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	48(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	56(%esp),%ebx
+	shrl	$10,%edi
+	addl	92(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,56(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1555081692(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	64(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	52(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	60(%esp),%ebx
+	shrl	$10,%edi
+	addl	32(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,60(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1996064986(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	68(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	56(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	64(%esp),%ebx
+	shrl	$10,%edi
+	addl	36(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
 	movl	20(%esp),%esi
-	xorl	%edx,%ecx
-	rorl	$5,%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,64(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2554220882(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	72(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	60(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	68(%esp),%ebx
+	shrl	$10,%edi
+	addl	40(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,68(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2821834349(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	76(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	64(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	72(%esp),%ebx
+	shrl	$10,%edi
+	addl	44(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,72(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2952996808(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	80(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	68(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	76(%esp),%ebx
+	shrl	$10,%edi
+	addl	48(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,76(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3210313671(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	84(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	72(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	80(%esp),%ebx
+	shrl	$10,%edi
+	addl	52(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,80(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3336571891(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	88(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	76(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	84(%esp),%ebx
+	shrl	$10,%edi
+	addl	56(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,84(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3584528711(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	92(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	80(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	88(%esp),%ebx
+	shrl	$10,%edi
+	addl	60(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,88(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	113926993(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	32(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	84(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	92(%esp),%ebx
+	shrl	$10,%edi
+	addl	64(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
 	movl	%ebx,92(%esp)
-	xorl	%edx,%ecx
-	rorl	$6,%ecx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	338241895(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	36(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	88(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	32(%esp),%ebx
+	shrl	$10,%edi
+	addl	68(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
 	movl	24(%esp),%edi
-	addl	%ecx,%ebx
+	xorl	%ecx,%edx
+	movl	%ebx,32(%esp)
 	xorl	%edi,%esi
-	movl	%edx,16(%esp)
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
 	movl	%eax,%ecx
-	andl	%edx,%esi
-	movl	12(%esp),%edx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	666307205(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	40(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	92(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	36(%esp),%ebx
+	shrl	$10,%edi
+	addl	72(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,36(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	773529912(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	44(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	32(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
 	xorl	%edi,%esi
-	movl	%eax,%edi
-	addl	%esi,%ebx
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	40(%esp),%ebx
+	shrl	$10,%edi
+	addl	76(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,40(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
 	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1294757372(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	48(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	36(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	44(%esp),%ebx
+	shrl	$10,%edi
+	addl	80(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,44(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1396182291(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	52(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	40(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	48(%esp),%ebx
+	shrl	$10,%edi
+	addl	84(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,48(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1695183700(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	56(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	44(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	52(%esp),%ebx
+	shrl	$10,%edi
+	addl	88(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,52(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1986661051(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	60(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	48(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	56(%esp),%ebx
+	shrl	$10,%edi
+	addl	92(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,56(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2177026350(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	64(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	52(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	60(%esp),%ebx
+	shrl	$10,%edi
+	addl	32(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,60(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2456956037(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	68(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	56(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	64(%esp),%ebx
+	shrl	$10,%edi
+	addl	36(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,64(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
 	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
 	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
 	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2730485921(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	72(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	60(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	68(%esp),%ebx
+	shrl	$10,%edi
+	addl	40(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,68(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2820302411(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	76(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	64(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	72(%esp),%ebx
+	shrl	$10,%edi
+	addl	44(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,72(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3259730800(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	80(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	68(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	76(%esp),%ebx
+	shrl	$10,%edi
+	addl	48(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,76(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3345764771(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	84(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	72(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	80(%esp),%ebx
+	shrl	$10,%edi
+	addl	52(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
 	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,80(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
 	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3516065817(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	88(%esp),%esi
 	rorl	$2,%ecx
-	addl	%ebx,%edx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	76(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	84(%esp),%ebx
+	shrl	$10,%edi
+	addl	56(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,84(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3600352804(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	92(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	80(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	88(%esp),%ebx
+	shrl	$10,%edi
+	addl	60(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,88(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	4094571909(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	32(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	84(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	92(%esp),%ebx
+	shrl	$10,%edi
+	addl	64(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,92(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
 	movl	8(%esp),%edi
-	addl	%ecx,%ebx
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	275423344(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	36(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	88(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	32(%esp),%ebx
+	shrl	$10,%edi
+	addl	68(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,32(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
 	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	430227734(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	40(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	92(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	36(%esp),%ebx
+	shrl	$10,%edi
+	addl	72(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,36(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	506948616(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	44(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	32(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	40(%esp),%ebx
+	shrl	$10,%edi
+	addl	76(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,40(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
 	movl	%eax,%ecx
-	subl	$4,%esp
-	orl	%esi,%eax
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	659060556(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	48(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	36(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	44(%esp),%ebx
+	shrl	$10,%edi
+	addl	80(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,44(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
 	andl	%esi,%ecx
-	andl	%edi,%eax
-	movl	(%ebp),%esi
-	orl	%ecx,%eax
-	addl	$4,%ebp
-	addl	%ebx,%eax
-	movl	152(%esp),%ebx
-	addl	%esi,%edx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	883997877(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	52(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
 	addl	%esi,%eax
-	cmpl	$3329325298,%esi
-	jne	.L00416_63
-	movl	352(%esp),%esi
-	movl	4(%esp),%ebx
+	movl	40(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	48(%esp),%ebx
+	shrl	$10,%edi
+	addl	84(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,48(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	958139571(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	56(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	44(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	52(%esp),%ebx
+	shrl	$10,%edi
+	addl	88(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,52(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1322822218(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	60(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	48(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	56(%esp),%ebx
+	shrl	$10,%edi
+	addl	92(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,56(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1537002063(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	64(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	52(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	60(%esp),%ebx
+	shrl	$10,%edi
+	addl	32(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,60(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1747873779(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	68(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	56(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	64(%esp),%ebx
+	shrl	$10,%edi
+	addl	36(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,64(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1955562222(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	72(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	60(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	68(%esp),%ebx
+	shrl	$10,%edi
+	addl	40(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,68(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2024104815(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	76(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	64(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	72(%esp),%ebx
+	shrl	$10,%edi
+	addl	44(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,72(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2227730452(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	80(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	68(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	76(%esp),%ebx
+	shrl	$10,%edi
+	addl	48(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
 	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
 	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,76(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2361852424(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	84(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	72(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	80(%esp),%ebx
+	shrl	$10,%edi
+	addl	52(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,80(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2428436474(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	88(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	76(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	84(%esp),%ebx
+	shrl	$10,%edi
+	addl	56(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,84(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2756734187(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	92(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	80(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	88(%esp),%ebx
+	shrl	$10,%edi
+	addl	60(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3204031479(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	32(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	84(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	92(%esp),%ebx
+	shrl	$10,%edi
+	addl	64(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3329325298(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	96(%esp),%esi
+	xorl	%edi,%ebp
+	movl	12(%esp),%ecx
 	addl	(%esi),%eax
-	addl	4(%esi),%ebx
-	addl	8(%esi),%ecx
-	addl	12(%esi),%edi
+	addl	4(%esi),%ebp
+	addl	8(%esi),%edi
+	addl	12(%esi),%ecx
 	movl	%eax,(%esi)
-	movl	%ebx,4(%esi)
-	movl	%ecx,8(%esi)
-	movl	%edi,12(%esi)
-	movl	20(%esp),%eax
+	movl	%ebp,4(%esi)
+	movl	%edi,8(%esi)
+	movl	%ecx,12(%esi)
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	movl	%edi,8(%esp)
+	movl	%ecx,12(%esp)
+	movl	20(%esp),%edi
 	movl	24(%esp),%ebx
 	movl	28(%esp),%ecx
-	movl	356(%esp),%edi
 	addl	16(%esi),%edx
-	addl	20(%esi),%eax
+	addl	20(%esi),%edi
 	addl	24(%esi),%ebx
 	addl	28(%esi),%ecx
 	movl	%edx,16(%esi)
-	movl	%eax,20(%esi)
+	movl	%edi,20(%esi)
 	movl	%ebx,24(%esi)
 	movl	%ecx,28(%esi)
-	addl	$352,%esp
-	subl	$256,%ebp
-	cmpl	8(%esp),%edi
-	jb	.L002loop
-	movl	12(%esp),%esp
+	movl	%edi,20(%esp)
+	movl	100(%esp),%edi
+	movl	%ebx,24(%esp)
+	movl	%ecx,28(%esp)
+	cmpl	104(%esp),%edi
+	jb	.L010grand_loop
+	movl	108(%esp),%esp
 	popl	%edi
 	popl	%esi
 	popl	%ebx
 	popl	%ebp
 	ret
-.align	64
-.L001K256:
-.long	1116352408,1899447441,3049323471,3921009573
-.long	961987163,1508970993,2453635748,2870763221
-.long	3624381080,310598401,607225278,1426881987
-.long	1925078388,2162078206,2614888103,3248222580
-.long	3835390401,4022224774,264347078,604807628
-.long	770255983,1249150122,1555081692,1996064986
-.long	2554220882,2821834349,2952996808,3210313671
-.long	3336571891,3584528711,113926993,338241895
-.long	666307205,773529912,1294757372,1396182291
-.long	1695183700,1986661051,2177026350,2456956037
-.long	2730485921,2820302411,3259730800,3345764771
-.long	3516065817,3600352804,4094571909,275423344
-.long	430227734,506948616,659060556,883997877
-.long	958139571,1322822218,1537002063,1747873779
-.long	1955562222,2024104815,2227730452,2361852424
-.long	2428436474,2756734187,3204031479,3329325298
+.align	32
+.L004shaext:
+	subl	$32,%esp
+	movdqu	(%esi),%xmm1
+	leal	128(%ebp),%ebp
+	movdqu	16(%esi),%xmm2
+	movdqa	128(%ebp),%xmm7
+	pshufd	$27,%xmm1,%xmm0
+	pshufd	$177,%xmm1,%xmm1
+	pshufd	$27,%xmm2,%xmm2
+.byte	102,15,58,15,202,8
+	punpcklqdq	%xmm0,%xmm2
+	jmp	.L011loop_shaext
+.align	16
+.L011loop_shaext:
+	movdqu	(%edi),%xmm3
+	movdqu	16(%edi),%xmm4
+	movdqu	32(%edi),%xmm5
+.byte	102,15,56,0,223
+	movdqu	48(%edi),%xmm6
+	movdqa	%xmm2,16(%esp)
+	movdqa	-128(%ebp),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	102,15,56,0,231
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	nop
+	movdqa	%xmm1,(%esp)
+.byte	15,56,203,202
+	movdqa	-112(%ebp),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	102,15,56,0,239
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	leal	64(%edi),%edi
+.byte	15,56,204,220
+.byte	15,56,203,202
+	movdqa	-96(%ebp),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	102,15,56,0,247
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	movdqa	%xmm6,%xmm7
+.byte	102,15,58,15,253,4
+	nop
+	paddd	%xmm7,%xmm3
+.byte	15,56,204,229
+.byte	15,56,203,202
+	movdqa	-80(%ebp),%xmm0
+	paddd	%xmm6,%xmm0
+.byte	15,56,205,222
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	movdqa	%xmm3,%xmm7
+.byte	102,15,58,15,254,4
+	nop
+	paddd	%xmm7,%xmm4
+.byte	15,56,204,238
+.byte	15,56,203,202
+	movdqa	-64(%ebp),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	15,56,205,227
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	movdqa	%xmm4,%xmm7
+.byte	102,15,58,15,251,4
+	nop
+	paddd	%xmm7,%xmm5
+.byte	15,56,204,243
+.byte	15,56,203,202
+	movdqa	-48(%ebp),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	15,56,205,236
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	movdqa	%xmm5,%xmm7
+.byte	102,15,58,15,252,4
+	nop
+	paddd	%xmm7,%xmm6
+.byte	15,56,204,220
+.byte	15,56,203,202
+	movdqa	-32(%ebp),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	15,56,205,245
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	movdqa	%xmm6,%xmm7
+.byte	102,15,58,15,253,4
+	nop
+	paddd	%xmm7,%xmm3
+.byte	15,56,204,229
+.byte	15,56,203,202
+	movdqa	-16(%ebp),%xmm0
+	paddd	%xmm6,%xmm0
+.byte	15,56,205,222
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	movdqa	%xmm3,%xmm7
+.byte	102,15,58,15,254,4
+	nop
+	paddd	%xmm7,%xmm4
+.byte	15,56,204,238
+.byte	15,56,203,202
+	movdqa	(%ebp),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	15,56,205,227
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	movdqa	%xmm4,%xmm7
+.byte	102,15,58,15,251,4
+	nop
+	paddd	%xmm7,%xmm5
+.byte	15,56,204,243
+.byte	15,56,203,202
+	movdqa	16(%ebp),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	15,56,205,236
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	movdqa	%xmm5,%xmm7
+.byte	102,15,58,15,252,4
+	nop
+	paddd	%xmm7,%xmm6
+.byte	15,56,204,220
+.byte	15,56,203,202
+	movdqa	32(%ebp),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	15,56,205,245
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	movdqa	%xmm6,%xmm7
+.byte	102,15,58,15,253,4
+	nop
+	paddd	%xmm7,%xmm3
+.byte	15,56,204,229
+.byte	15,56,203,202
+	movdqa	48(%ebp),%xmm0
+	paddd	%xmm6,%xmm0
+.byte	15,56,205,222
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	movdqa	%xmm3,%xmm7
+.byte	102,15,58,15,254,4
+	nop
+	paddd	%xmm7,%xmm4
+.byte	15,56,204,238
+.byte	15,56,203,202
+	movdqa	64(%ebp),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	15,56,205,227
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	movdqa	%xmm4,%xmm7
+.byte	102,15,58,15,251,4
+	nop
+	paddd	%xmm7,%xmm5
+.byte	15,56,204,243
+.byte	15,56,203,202
+	movdqa	80(%ebp),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	15,56,205,236
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	movdqa	%xmm5,%xmm7
+.byte	102,15,58,15,252,4
+.byte	15,56,203,202
+	paddd	%xmm7,%xmm6
+	movdqa	96(%ebp),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+.byte	15,56,205,245
+	movdqa	128(%ebp),%xmm7
+.byte	15,56,203,202
+	movdqa	112(%ebp),%xmm0
+	paddd	%xmm6,%xmm0
+	nop
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	cmpl	%edi,%eax
+	nop
+.byte	15,56,203,202
+	paddd	16(%esp),%xmm2
+	paddd	(%esp),%xmm1
+	jnz	.L011loop_shaext
+	pshufd	$177,%xmm2,%xmm2
+	pshufd	$27,%xmm1,%xmm7
+	pshufd	$177,%xmm1,%xmm1
+	punpckhqdq	%xmm2,%xmm1
+.byte	102,15,58,15,215,8
+	movl	44(%esp),%esp
+	movdqu	%xmm1,(%esi)
+	movdqu	%xmm2,16(%esi)
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	32
+.L006SSSE3:
+	leal	-96(%esp),%esp
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%edi
+	movl	%ebx,4(%esp)
+	xorl	%ecx,%ebx
+	movl	%ecx,8(%esp)
+	movl	%edi,12(%esp)
+	movl	16(%esi),%edx
+	movl	20(%esi),%edi
+	movl	24(%esi),%ecx
+	movl	28(%esi),%esi
+	movl	%edi,20(%esp)
+	movl	100(%esp),%edi
+	movl	%ecx,24(%esp)
+	movl	%esi,28(%esp)
+	movdqa	256(%ebp),%xmm7
+	jmp	.L012grand_ssse3
+.align	16
+.L012grand_ssse3:
+	movdqu	(%edi),%xmm0
+	movdqu	16(%edi),%xmm1
+	movdqu	32(%edi),%xmm2
+	movdqu	48(%edi),%xmm3
+	addl	$64,%edi
+.byte	102,15,56,0,199
+	movl	%edi,100(%esp)
+.byte	102,15,56,0,207
+	movdqa	(%ebp),%xmm4
+.byte	102,15,56,0,215
+	movdqa	16(%ebp),%xmm5
+	paddd	%xmm0,%xmm4
+.byte	102,15,56,0,223
+	movdqa	32(%ebp),%xmm6
+	paddd	%xmm1,%xmm5
+	movdqa	48(%ebp),%xmm7
+	movdqa	%xmm4,32(%esp)
+	paddd	%xmm2,%xmm6
+	movdqa	%xmm5,48(%esp)
+	paddd	%xmm3,%xmm7
+	movdqa	%xmm6,64(%esp)
+	movdqa	%xmm7,80(%esp)
+	jmp	.L013ssse3_00_47
+.align	16
+.L013ssse3_00_47:
+	addl	$64,%ebp
+	movl	%edx,%ecx
+	movdqa	%xmm1,%xmm4
+	rorl	$14,%edx
+	movl	20(%esp),%esi
+	movdqa	%xmm3,%xmm7
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+.byte	102,15,58,15,224,4
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+.byte	102,15,58,15,250,4
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	movdqa	%xmm4,%xmm5
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	movdqa	%xmm4,%xmm6
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	psrld	$3,%xmm4
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm0
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	psrld	$7,%xmm6
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	pshufd	$250,%xmm3,%xmm7
+	xorl	%esi,%ecx
+	addl	32(%esp),%edx
+	pslld	$14,%xmm5
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm4
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	psrld	$11,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm5,%xmm4
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	pslld	$11,%xmm5
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	pxor	%xmm6,%xmm4
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	movdqa	%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	pxor	%xmm5,%xmm4
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	psrld	$10,%xmm7
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm4,%xmm0
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	psrlq	$17,%xmm6
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	rorl	$11,%ecx
+	pxor	%xmm6,%xmm7
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	psrlq	$2,%xmm6
+	addl	36(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	pshufd	$128,%xmm7,%xmm7
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	psrldq	$8,%xmm7
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	paddd	%xmm7,%xmm0
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,24(%esp)
+	pshufd	$80,%xmm0,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	movdqa	%xmm7,%xmm6
+	rorl	$11,%ecx
+	psrld	$10,%xmm7
+	andl	%eax,%ebx
+	psrlq	$17,%xmm6
+	xorl	%esi,%ecx
+	addl	40(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	psrlq	$2,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm6,%xmm7
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	movdqa	(%ebp),%xmm6
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	pslldq	$8,%xmm7
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm0
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	paddd	%xmm0,%xmm6
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	44(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movdqa	%xmm6,32(%esp)
+	movl	%edx,%ecx
+	movdqa	%xmm2,%xmm4
+	rorl	$14,%edx
+	movl	4(%esp),%esi
+	movdqa	%xmm0,%xmm7
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+.byte	102,15,58,15,225,4
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+.byte	102,15,58,15,251,4
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	movdqa	%xmm4,%xmm5
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	movdqa	%xmm4,%xmm6
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	psrld	$3,%xmm4
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm1
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	psrld	$7,%xmm6
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	pshufd	$250,%xmm0,%xmm7
+	xorl	%esi,%ecx
+	addl	48(%esp),%edx
+	pslld	$14,%xmm5
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm4
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	psrld	$11,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm5,%xmm4
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	pslld	$11,%xmm5
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	pxor	%xmm6,%xmm4
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	movdqa	%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	pxor	%xmm5,%xmm4
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	psrld	$10,%xmm7
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm4,%xmm1
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	psrlq	$17,%xmm6
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	rorl	$11,%ecx
+	pxor	%xmm6,%xmm7
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	psrlq	$2,%xmm6
+	addl	52(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	pshufd	$128,%xmm7,%xmm7
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	psrldq	$8,%xmm7
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	paddd	%xmm7,%xmm1
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,8(%esp)
+	pshufd	$80,%xmm1,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	movdqa	%xmm7,%xmm6
+	rorl	$11,%ecx
+	psrld	$10,%xmm7
+	andl	%eax,%ebx
+	psrlq	$17,%xmm6
+	xorl	%esi,%ecx
+	addl	56(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	psrlq	$2,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm6,%xmm7
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	movdqa	16(%ebp),%xmm6
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	pslldq	$8,%xmm7
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm1
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	paddd	%xmm1,%xmm6
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	60(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movdqa	%xmm6,48(%esp)
+	movl	%edx,%ecx
+	movdqa	%xmm3,%xmm4
+	rorl	$14,%edx
+	movl	20(%esp),%esi
+	movdqa	%xmm1,%xmm7
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+.byte	102,15,58,15,226,4
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+.byte	102,15,58,15,248,4
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	movdqa	%xmm4,%xmm5
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	movdqa	%xmm4,%xmm6
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	psrld	$3,%xmm4
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm2
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	psrld	$7,%xmm6
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	pshufd	$250,%xmm1,%xmm7
+	xorl	%esi,%ecx
+	addl	64(%esp),%edx
+	pslld	$14,%xmm5
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm4
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	psrld	$11,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm5,%xmm4
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	pslld	$11,%xmm5
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	pxor	%xmm6,%xmm4
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	movdqa	%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	pxor	%xmm5,%xmm4
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	psrld	$10,%xmm7
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm4,%xmm2
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	psrlq	$17,%xmm6
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	rorl	$11,%ecx
+	pxor	%xmm6,%xmm7
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	psrlq	$2,%xmm6
+	addl	68(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	pshufd	$128,%xmm7,%xmm7
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	psrldq	$8,%xmm7
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	paddd	%xmm7,%xmm2
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,24(%esp)
+	pshufd	$80,%xmm2,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	movdqa	%xmm7,%xmm6
+	rorl	$11,%ecx
+	psrld	$10,%xmm7
+	andl	%eax,%ebx
+	psrlq	$17,%xmm6
+	xorl	%esi,%ecx
+	addl	72(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	psrlq	$2,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm6,%xmm7
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	movdqa	32(%ebp),%xmm6
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	pslldq	$8,%xmm7
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm2
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	paddd	%xmm2,%xmm6
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	76(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movdqa	%xmm6,64(%esp)
+	movl	%edx,%ecx
+	movdqa	%xmm0,%xmm4
+	rorl	$14,%edx
+	movl	4(%esp),%esi
+	movdqa	%xmm2,%xmm7
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+.byte	102,15,58,15,227,4
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+.byte	102,15,58,15,249,4
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	movdqa	%xmm4,%xmm5
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	movdqa	%xmm4,%xmm6
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	psrld	$3,%xmm4
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm3
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	psrld	$7,%xmm6
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	pshufd	$250,%xmm2,%xmm7
+	xorl	%esi,%ecx
+	addl	80(%esp),%edx
+	pslld	$14,%xmm5
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm4
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	psrld	$11,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm5,%xmm4
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	pslld	$11,%xmm5
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	pxor	%xmm6,%xmm4
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	movdqa	%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	pxor	%xmm5,%xmm4
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	psrld	$10,%xmm7
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm4,%xmm3
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	psrlq	$17,%xmm6
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	rorl	$11,%ecx
+	pxor	%xmm6,%xmm7
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	psrlq	$2,%xmm6
+	addl	84(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	pshufd	$128,%xmm7,%xmm7
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	psrldq	$8,%xmm7
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	paddd	%xmm7,%xmm3
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,8(%esp)
+	pshufd	$80,%xmm3,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	movdqa	%xmm7,%xmm6
+	rorl	$11,%ecx
+	psrld	$10,%xmm7
+	andl	%eax,%ebx
+	psrlq	$17,%xmm6
+	xorl	%esi,%ecx
+	addl	88(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	psrlq	$2,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm6,%xmm7
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	movdqa	48(%ebp),%xmm6
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	pslldq	$8,%xmm7
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm3
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	paddd	%xmm3,%xmm6
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	92(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movdqa	%xmm6,80(%esp)
+	cmpl	$66051,64(%ebp)
+	jne	.L013ssse3_00_47
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	20(%esp),%esi
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	32(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	36(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,24(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	40(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	44(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	4(%esp),%esi
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	48(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	52(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,8(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	56(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	60(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	20(%esp),%esi
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	64(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	68(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,24(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	72(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	76(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	4(%esp),%esi
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	80(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	84(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,8(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	88(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	92(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movl	96(%esp),%esi
+	xorl	%edi,%ebx
+	movl	12(%esp),%ecx
+	addl	(%esi),%eax
+	addl	4(%esi),%ebx
+	addl	8(%esi),%edi
+	addl	12(%esi),%ecx
+	movl	%eax,(%esi)
+	movl	%ebx,4(%esi)
+	movl	%edi,8(%esi)
+	movl	%ecx,12(%esi)
+	movl	%ebx,4(%esp)
+	xorl	%edi,%ebx
+	movl	%edi,8(%esp)
+	movl	%ecx,12(%esp)
+	movl	20(%esp),%edi
+	movl	24(%esp),%ecx
+	addl	16(%esi),%edx
+	addl	20(%esi),%edi
+	addl	24(%esi),%ecx
+	movl	%edx,16(%esi)
+	movl	%edi,20(%esi)
+	movl	%edi,20(%esp)
+	movl	28(%esp),%edi
+	movl	%ecx,24(%esi)
+	addl	28(%esi),%edi
+	movl	%ecx,24(%esp)
+	movl	%edi,28(%esi)
+	movl	%edi,28(%esp)
+	movl	100(%esp),%edi
+	movdqa	64(%ebp),%xmm7
+	subl	$192,%ebp
+	cmpl	104(%esp),%edi
+	jb	.L012grand_ssse3
+	movl	108(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	32
+.L005AVX:
+	andl	$264,%edx
+	cmpl	$264,%edx
+	je	.L014AVX_BMI
+	leal	-96(%esp),%esp
+	vzeroall
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%edi
+	movl	%ebx,4(%esp)
+	xorl	%ecx,%ebx
+	movl	%ecx,8(%esp)
+	movl	%edi,12(%esp)
+	movl	16(%esi),%edx
+	movl	20(%esi),%edi
+	movl	24(%esi),%ecx
+	movl	28(%esi),%esi
+	movl	%edi,20(%esp)
+	movl	100(%esp),%edi
+	movl	%ecx,24(%esp)
+	movl	%esi,28(%esp)
+	vmovdqa	256(%ebp),%xmm7
+	jmp	.L015grand_avx
+.align	32
+.L015grand_avx:
+	vmovdqu	(%edi),%xmm0
+	vmovdqu	16(%edi),%xmm1
+	vmovdqu	32(%edi),%xmm2
+	vmovdqu	48(%edi),%xmm3
+	addl	$64,%edi
+	vpshufb	%xmm7,%xmm0,%xmm0
+	movl	%edi,100(%esp)
+	vpshufb	%xmm7,%xmm1,%xmm1
+	vpshufb	%xmm7,%xmm2,%xmm2
+	vpaddd	(%ebp),%xmm0,%xmm4
+	vpshufb	%xmm7,%xmm3,%xmm3
+	vpaddd	16(%ebp),%xmm1,%xmm5
+	vpaddd	32(%ebp),%xmm2,%xmm6
+	vpaddd	48(%ebp),%xmm3,%xmm7
+	vmovdqa	%xmm4,32(%esp)
+	vmovdqa	%xmm5,48(%esp)
+	vmovdqa	%xmm6,64(%esp)
+	vmovdqa	%xmm7,80(%esp)
+	jmp	.L016avx_00_47
+.align	16
+.L016avx_00_47:
+	addl	$64,%ebp
+	vpalignr	$4,%xmm0,%xmm1,%xmm4
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	20(%esp),%esi
+	vpalignr	$4,%xmm2,%xmm3,%xmm7
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	vpaddd	%xmm7,%xmm0,%xmm0
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrld	$3,%xmm4,%xmm7
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	vpslld	$14,%xmm4,%xmm5
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,(%esp)
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	vpshufd	$250,%xmm3,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpsrld	$11,%xmm6,%xmm6
+	addl	32(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpxor	%xmm5,%xmm4,%xmm4
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	vpslld	$11,%xmm5,%xmm5
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	16(%esp),%esi
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	vpaddd	%xmm4,%xmm0,%xmm0
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,28(%esp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	vpsrlq	$19,%xmm7,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	36(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	vpshufd	$132,%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	vpsrldq	$8,%xmm7,%xmm7
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	12(%esp),%esi
+	vpaddd	%xmm7,%xmm0,%xmm0
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	vpshufd	$80,%xmm0,%xmm7
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,24(%esp)
+	vpsrlq	$19,%xmm7,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpshufd	$232,%xmm6,%xmm7
+	addl	40(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpslldq	$8,%xmm7,%xmm7
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	vpaddd	%xmm7,%xmm0,%xmm0
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	8(%esp),%esi
+	vpaddd	(%ebp),%xmm0,%xmm6
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	44(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	vmovdqa	%xmm6,32(%esp)
+	vpalignr	$4,%xmm1,%xmm2,%xmm4
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	4(%esp),%esi
+	vpalignr	$4,%xmm3,%xmm0,%xmm7
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	vpaddd	%xmm7,%xmm1,%xmm1
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrld	$3,%xmm4,%xmm7
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	vpslld	$14,%xmm4,%xmm5
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,16(%esp)
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	vpshufd	$250,%xmm0,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpsrld	$11,%xmm6,%xmm6
+	addl	48(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpxor	%xmm5,%xmm4,%xmm4
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	vpslld	$11,%xmm5,%xmm5
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	(%esp),%esi
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	vpaddd	%xmm4,%xmm1,%xmm1
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,12(%esp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	vpsrlq	$19,%xmm7,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	52(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	vpshufd	$132,%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	vpsrldq	$8,%xmm7,%xmm7
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	28(%esp),%esi
+	vpaddd	%xmm7,%xmm1,%xmm1
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	vpshufd	$80,%xmm1,%xmm7
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,8(%esp)
+	vpsrlq	$19,%xmm7,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpshufd	$232,%xmm6,%xmm7
+	addl	56(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpslldq	$8,%xmm7,%xmm7
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	vpaddd	%xmm7,%xmm1,%xmm1
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	24(%esp),%esi
+	vpaddd	16(%ebp),%xmm1,%xmm6
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	60(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	vmovdqa	%xmm6,48(%esp)
+	vpalignr	$4,%xmm2,%xmm3,%xmm4
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	20(%esp),%esi
+	vpalignr	$4,%xmm0,%xmm1,%xmm7
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	vpaddd	%xmm7,%xmm2,%xmm2
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrld	$3,%xmm4,%xmm7
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	vpslld	$14,%xmm4,%xmm5
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,(%esp)
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	vpshufd	$250,%xmm1,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpsrld	$11,%xmm6,%xmm6
+	addl	64(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpxor	%xmm5,%xmm4,%xmm4
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	vpslld	$11,%xmm5,%xmm5
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	16(%esp),%esi
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	vpaddd	%xmm4,%xmm2,%xmm2
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,28(%esp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	vpsrlq	$19,%xmm7,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	68(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	vpshufd	$132,%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	vpsrldq	$8,%xmm7,%xmm7
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	12(%esp),%esi
+	vpaddd	%xmm7,%xmm2,%xmm2
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	vpshufd	$80,%xmm2,%xmm7
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,24(%esp)
+	vpsrlq	$19,%xmm7,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpshufd	$232,%xmm6,%xmm7
+	addl	72(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpslldq	$8,%xmm7,%xmm7
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	vpaddd	%xmm7,%xmm2,%xmm2
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	8(%esp),%esi
+	vpaddd	32(%ebp),%xmm2,%xmm6
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	76(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	vmovdqa	%xmm6,64(%esp)
+	vpalignr	$4,%xmm3,%xmm0,%xmm4
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	4(%esp),%esi
+	vpalignr	$4,%xmm1,%xmm2,%xmm7
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	vpaddd	%xmm7,%xmm3,%xmm3
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrld	$3,%xmm4,%xmm7
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	vpslld	$14,%xmm4,%xmm5
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,16(%esp)
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	vpshufd	$250,%xmm2,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpsrld	$11,%xmm6,%xmm6
+	addl	80(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpxor	%xmm5,%xmm4,%xmm4
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	vpslld	$11,%xmm5,%xmm5
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	(%esp),%esi
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	vpaddd	%xmm4,%xmm3,%xmm3
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,12(%esp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	vpsrlq	$19,%xmm7,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	84(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	vpshufd	$132,%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	vpsrldq	$8,%xmm7,%xmm7
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	28(%esp),%esi
+	vpaddd	%xmm7,%xmm3,%xmm3
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	vpshufd	$80,%xmm3,%xmm7
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,8(%esp)
+	vpsrlq	$19,%xmm7,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpshufd	$232,%xmm6,%xmm7
+	addl	88(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpslldq	$8,%xmm7,%xmm7
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	vpaddd	%xmm7,%xmm3,%xmm3
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	24(%esp),%esi
+	vpaddd	48(%ebp),%xmm3,%xmm6
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	92(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	vmovdqa	%xmm6,80(%esp)
+	cmpl	$66051,64(%ebp)
+	jne	.L016avx_00_47
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	20(%esp),%esi
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	32(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	36(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,24(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	40(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	44(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	4(%esp),%esi
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	48(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	52(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,8(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	56(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	60(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	20(%esp),%esi
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	64(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	68(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,24(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	72(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	76(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	4(%esp),%esi
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	80(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	84(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,8(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	88(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	92(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movl	96(%esp),%esi
+	xorl	%edi,%ebx
+	movl	12(%esp),%ecx
+	addl	(%esi),%eax
+	addl	4(%esi),%ebx
+	addl	8(%esi),%edi
+	addl	12(%esi),%ecx
+	movl	%eax,(%esi)
+	movl	%ebx,4(%esi)
+	movl	%edi,8(%esi)
+	movl	%ecx,12(%esi)
+	movl	%ebx,4(%esp)
+	xorl	%edi,%ebx
+	movl	%edi,8(%esp)
+	movl	%ecx,12(%esp)
+	movl	20(%esp),%edi
+	movl	24(%esp),%ecx
+	addl	16(%esi),%edx
+	addl	20(%esi),%edi
+	addl	24(%esi),%ecx
+	movl	%edx,16(%esi)
+	movl	%edi,20(%esi)
+	movl	%edi,20(%esp)
+	movl	28(%esp),%edi
+	movl	%ecx,24(%esi)
+	addl	28(%esi),%edi
+	movl	%ecx,24(%esp)
+	movl	%edi,28(%esi)
+	movl	%edi,28(%esp)
+	movl	100(%esp),%edi
+	vmovdqa	64(%ebp),%xmm7
+	subl	$192,%ebp
+	cmpl	104(%esp),%edi
+	jb	.L015grand_avx
+	movl	108(%esp),%esp
+	vzeroall
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	32
+.L014AVX_BMI:
+	leal	-96(%esp),%esp
+	vzeroall
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%edi
+	movl	%ebx,4(%esp)
+	xorl	%ecx,%ebx
+	movl	%ecx,8(%esp)
+	movl	%edi,12(%esp)
+	movl	16(%esi),%edx
+	movl	20(%esi),%edi
+	movl	24(%esi),%ecx
+	movl	28(%esi),%esi
+	movl	%edi,20(%esp)
+	movl	100(%esp),%edi
+	movl	%ecx,24(%esp)
+	movl	%esi,28(%esp)
+	vmovdqa	256(%ebp),%xmm7
+	jmp	.L017grand_avx_bmi
+.align	32
+.L017grand_avx_bmi:
+	vmovdqu	(%edi),%xmm0
+	vmovdqu	16(%edi),%xmm1
+	vmovdqu	32(%edi),%xmm2
+	vmovdqu	48(%edi),%xmm3
+	addl	$64,%edi
+	vpshufb	%xmm7,%xmm0,%xmm0
+	movl	%edi,100(%esp)
+	vpshufb	%xmm7,%xmm1,%xmm1
+	vpshufb	%xmm7,%xmm2,%xmm2
+	vpaddd	(%ebp),%xmm0,%xmm4
+	vpshufb	%xmm7,%xmm3,%xmm3
+	vpaddd	16(%ebp),%xmm1,%xmm5
+	vpaddd	32(%ebp),%xmm2,%xmm6
+	vpaddd	48(%ebp),%xmm3,%xmm7
+	vmovdqa	%xmm4,32(%esp)
+	vmovdqa	%xmm5,48(%esp)
+	vmovdqa	%xmm6,64(%esp)
+	vmovdqa	%xmm7,80(%esp)
+	jmp	.L018avx_bmi_00_47
+.align	16
+.L018avx_bmi_00_47:
+	addl	$64,%ebp
+	vpalignr	$4,%xmm0,%xmm1,%xmm4
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,16(%esp)
+	vpalignr	$4,%xmm2,%xmm3,%xmm7
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	24(%esp),%edx,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	xorl	%edi,%ecx
+	andl	20(%esp),%edx
+	movl	%eax,(%esp)
+	vpaddd	%xmm7,%xmm0,%xmm0
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	vpsrld	$3,%xmm4,%xmm7
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	vpslld	$14,%xmm4,%xmm5
+	movl	4(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	vpxor	%xmm6,%xmm7,%xmm4
+	addl	28(%esp),%edx
+	andl	%eax,%ebx
+	addl	32(%esp),%edx
+	vpshufd	$250,%xmm3,%xmm7
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	12(%esp),%edx
+	vpsrld	$11,%xmm6,%xmm6
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%edx,12(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	vpslld	$11,%xmm5,%xmm5
+	andnl	20(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	16(%esp),%edx
+	vpxor	%xmm6,%xmm4,%xmm4
+	movl	%ebx,28(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	vpsrlq	$17,%xmm7,%xmm5
+	addl	24(%esp),%edx
+	andl	%ebx,%eax
+	addl	36(%esp),%edx
+	vpaddd	%xmm4,%xmm0,%xmm0
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	8(%esp),%edx
+	vpxor	%xmm5,%xmm6,%xmm6
+	leal	(%eax,%ecx,1),%eax
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	vpsrlq	$19,%xmm7,%xmm7
+	movl	%edx,8(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	andnl	16(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	12(%esp),%edx
+	vpshufd	$132,%xmm6,%xmm7
+	movl	%eax,24(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	vpsrldq	$8,%xmm7,%xmm7
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	vpaddd	%xmm7,%xmm0,%xmm0
+	movl	28(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	vpshufd	$80,%xmm0,%xmm7
+	addl	20(%esp),%edx
+	andl	%eax,%ebx
+	addl	40(%esp),%edx
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	4(%esp),%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%edx,4(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	vpsrlq	$19,%xmm7,%xmm7
+	andnl	12(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	8(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%ebx,20(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	vpshufd	$232,%xmm6,%xmm7
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	vpslldq	$8,%xmm7,%xmm7
+	movl	24(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	vpaddd	%xmm7,%xmm0,%xmm0
+	addl	16(%esp),%edx
+	andl	%ebx,%eax
+	addl	44(%esp),%edx
+	vpaddd	(%ebp),%xmm0,%xmm6
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	(%esp),%edx
+	leal	(%eax,%ecx,1),%eax
+	vmovdqa	%xmm6,32(%esp)
+	vpalignr	$4,%xmm1,%xmm2,%xmm4
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,(%esp)
+	vpalignr	$4,%xmm3,%xmm0,%xmm7
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	8(%esp),%edx,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	xorl	%edi,%ecx
+	andl	4(%esp),%edx
+	movl	%eax,16(%esp)
+	vpaddd	%xmm7,%xmm1,%xmm1
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	vpsrld	$3,%xmm4,%xmm7
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	vpslld	$14,%xmm4,%xmm5
+	movl	20(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	vpxor	%xmm6,%xmm7,%xmm4
+	addl	12(%esp),%edx
+	andl	%eax,%ebx
+	addl	48(%esp),%edx
+	vpshufd	$250,%xmm0,%xmm7
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	28(%esp),%edx
+	vpsrld	$11,%xmm6,%xmm6
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%edx,28(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	vpslld	$11,%xmm5,%xmm5
+	andnl	4(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	(%esp),%edx
+	vpxor	%xmm6,%xmm4,%xmm4
+	movl	%ebx,12(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	16(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	vpsrlq	$17,%xmm7,%xmm5
+	addl	8(%esp),%edx
+	andl	%ebx,%eax
+	addl	52(%esp),%edx
+	vpaddd	%xmm4,%xmm1,%xmm1
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	24(%esp),%edx
+	vpxor	%xmm5,%xmm6,%xmm6
+	leal	(%eax,%ecx,1),%eax
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	vpsrlq	$19,%xmm7,%xmm7
+	movl	%edx,24(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	andnl	(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	28(%esp),%edx
+	vpshufd	$132,%xmm6,%xmm7
+	movl	%eax,8(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	vpsrldq	$8,%xmm7,%xmm7
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	vpaddd	%xmm7,%xmm1,%xmm1
+	movl	12(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	vpshufd	$80,%xmm1,%xmm7
+	addl	4(%esp),%edx
+	andl	%eax,%ebx
+	addl	56(%esp),%edx
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	20(%esp),%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%edx,20(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	vpsrlq	$19,%xmm7,%xmm7
+	andnl	28(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	24(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%ebx,4(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	vpshufd	$232,%xmm6,%xmm7
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	vpslldq	$8,%xmm7,%xmm7
+	movl	8(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	vpaddd	%xmm7,%xmm1,%xmm1
+	addl	(%esp),%edx
+	andl	%ebx,%eax
+	addl	60(%esp),%edx
+	vpaddd	16(%ebp),%xmm1,%xmm6
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	16(%esp),%edx
+	leal	(%eax,%ecx,1),%eax
+	vmovdqa	%xmm6,48(%esp)
+	vpalignr	$4,%xmm2,%xmm3,%xmm4
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,16(%esp)
+	vpalignr	$4,%xmm0,%xmm1,%xmm7
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	24(%esp),%edx,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	xorl	%edi,%ecx
+	andl	20(%esp),%edx
+	movl	%eax,(%esp)
+	vpaddd	%xmm7,%xmm2,%xmm2
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	vpsrld	$3,%xmm4,%xmm7
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	vpslld	$14,%xmm4,%xmm5
+	movl	4(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	vpxor	%xmm6,%xmm7,%xmm4
+	addl	28(%esp),%edx
+	andl	%eax,%ebx
+	addl	64(%esp),%edx
+	vpshufd	$250,%xmm1,%xmm7
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	12(%esp),%edx
+	vpsrld	$11,%xmm6,%xmm6
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%edx,12(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	vpslld	$11,%xmm5,%xmm5
+	andnl	20(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	16(%esp),%edx
+	vpxor	%xmm6,%xmm4,%xmm4
+	movl	%ebx,28(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	vpsrlq	$17,%xmm7,%xmm5
+	addl	24(%esp),%edx
+	andl	%ebx,%eax
+	addl	68(%esp),%edx
+	vpaddd	%xmm4,%xmm2,%xmm2
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	8(%esp),%edx
+	vpxor	%xmm5,%xmm6,%xmm6
+	leal	(%eax,%ecx,1),%eax
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	vpsrlq	$19,%xmm7,%xmm7
+	movl	%edx,8(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	andnl	16(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	12(%esp),%edx
+	vpshufd	$132,%xmm6,%xmm7
+	movl	%eax,24(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	vpsrldq	$8,%xmm7,%xmm7
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	vpaddd	%xmm7,%xmm2,%xmm2
+	movl	28(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	vpshufd	$80,%xmm2,%xmm7
+	addl	20(%esp),%edx
+	andl	%eax,%ebx
+	addl	72(%esp),%edx
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	4(%esp),%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%edx,4(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	vpsrlq	$19,%xmm7,%xmm7
+	andnl	12(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	8(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%ebx,20(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	vpshufd	$232,%xmm6,%xmm7
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	vpslldq	$8,%xmm7,%xmm7
+	movl	24(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	vpaddd	%xmm7,%xmm2,%xmm2
+	addl	16(%esp),%edx
+	andl	%ebx,%eax
+	addl	76(%esp),%edx
+	vpaddd	32(%ebp),%xmm2,%xmm6
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	(%esp),%edx
+	leal	(%eax,%ecx,1),%eax
+	vmovdqa	%xmm6,64(%esp)
+	vpalignr	$4,%xmm3,%xmm0,%xmm4
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,(%esp)
+	vpalignr	$4,%xmm1,%xmm2,%xmm7
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	8(%esp),%edx,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	xorl	%edi,%ecx
+	andl	4(%esp),%edx
+	movl	%eax,16(%esp)
+	vpaddd	%xmm7,%xmm3,%xmm3
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	vpsrld	$3,%xmm4,%xmm7
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	vpslld	$14,%xmm4,%xmm5
+	movl	20(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	vpxor	%xmm6,%xmm7,%xmm4
+	addl	12(%esp),%edx
+	andl	%eax,%ebx
+	addl	80(%esp),%edx
+	vpshufd	$250,%xmm2,%xmm7
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	28(%esp),%edx
+	vpsrld	$11,%xmm6,%xmm6
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%edx,28(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	vpslld	$11,%xmm5,%xmm5
+	andnl	4(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	(%esp),%edx
+	vpxor	%xmm6,%xmm4,%xmm4
+	movl	%ebx,12(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	16(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	vpsrlq	$17,%xmm7,%xmm5
+	addl	8(%esp),%edx
+	andl	%ebx,%eax
+	addl	84(%esp),%edx
+	vpaddd	%xmm4,%xmm3,%xmm3
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	24(%esp),%edx
+	vpxor	%xmm5,%xmm6,%xmm6
+	leal	(%eax,%ecx,1),%eax
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	vpsrlq	$19,%xmm7,%xmm7
+	movl	%edx,24(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	andnl	(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	28(%esp),%edx
+	vpshufd	$132,%xmm6,%xmm7
+	movl	%eax,8(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	vpsrldq	$8,%xmm7,%xmm7
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	vpaddd	%xmm7,%xmm3,%xmm3
+	movl	12(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	vpshufd	$80,%xmm3,%xmm7
+	addl	4(%esp),%edx
+	andl	%eax,%ebx
+	addl	88(%esp),%edx
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	20(%esp),%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%edx,20(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	vpsrlq	$19,%xmm7,%xmm7
+	andnl	28(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	24(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%ebx,4(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	vpshufd	$232,%xmm6,%xmm7
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	vpslldq	$8,%xmm7,%xmm7
+	movl	8(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	vpaddd	%xmm7,%xmm3,%xmm3
+	addl	(%esp),%edx
+	andl	%ebx,%eax
+	addl	92(%esp),%edx
+	vpaddd	48(%ebp),%xmm3,%xmm6
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	16(%esp),%edx
+	leal	(%eax,%ecx,1),%eax
+	vmovdqa	%xmm6,80(%esp)
+	cmpl	$66051,64(%ebp)
+	jne	.L018avx_bmi_00_47
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,16(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	24(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	20(%esp),%edx
+	movl	%eax,(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	movl	4(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	andl	%eax,%ebx
+	addl	32(%esp),%edx
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	12(%esp),%edx
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,12(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	20(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	16(%esp),%edx
+	movl	%ebx,28(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	movl	(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	andl	%ebx,%eax
+	addl	36(%esp),%edx
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	8(%esp),%edx
+	leal	(%eax,%ecx,1),%eax
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,8(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	16(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	12(%esp),%edx
+	movl	%eax,24(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	movl	28(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	andl	%eax,%ebx
+	addl	40(%esp),%edx
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	4(%esp),%edx
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,4(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	12(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	8(%esp),%edx
+	movl	%ebx,20(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	movl	24(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	andl	%ebx,%eax
+	addl	44(%esp),%edx
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	(%esp),%edx
+	leal	(%eax,%ecx,1),%eax
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	8(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	4(%esp),%edx
+	movl	%eax,16(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	movl	20(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	andl	%eax,%ebx
+	addl	48(%esp),%edx
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	28(%esp),%edx
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,28(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	4(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	(%esp),%edx
+	movl	%ebx,12(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	movl	16(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	andl	%ebx,%eax
+	addl	52(%esp),%edx
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	24(%esp),%edx
+	leal	(%eax,%ecx,1),%eax
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,24(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	28(%esp),%edx
+	movl	%eax,8(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	movl	12(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	andl	%eax,%ebx
+	addl	56(%esp),%edx
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	20(%esp),%edx
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,20(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	28(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	24(%esp),%edx
+	movl	%ebx,4(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	movl	8(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	andl	%ebx,%eax
+	addl	60(%esp),%edx
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	16(%esp),%edx
+	leal	(%eax,%ecx,1),%eax
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,16(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	24(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	20(%esp),%edx
+	movl	%eax,(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	movl	4(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	andl	%eax,%ebx
+	addl	64(%esp),%edx
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	12(%esp),%edx
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,12(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	20(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	16(%esp),%edx
+	movl	%ebx,28(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	movl	(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	andl	%ebx,%eax
+	addl	68(%esp),%edx
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	8(%esp),%edx
+	leal	(%eax,%ecx,1),%eax
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,8(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	16(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	12(%esp),%edx
+	movl	%eax,24(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	movl	28(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	andl	%eax,%ebx
+	addl	72(%esp),%edx
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	4(%esp),%edx
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,4(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	12(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	8(%esp),%edx
+	movl	%ebx,20(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	movl	24(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	andl	%ebx,%eax
+	addl	76(%esp),%edx
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	(%esp),%edx
+	leal	(%eax,%ecx,1),%eax
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	8(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	4(%esp),%edx
+	movl	%eax,16(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	movl	20(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	andl	%eax,%ebx
+	addl	80(%esp),%edx
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	28(%esp),%edx
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,28(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	4(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	(%esp),%edx
+	movl	%ebx,12(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	movl	16(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	andl	%ebx,%eax
+	addl	84(%esp),%edx
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	24(%esp),%edx
+	leal	(%eax,%ecx,1),%eax
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,24(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	28(%esp),%edx
+	movl	%eax,8(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	movl	12(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	andl	%eax,%ebx
+	addl	88(%esp),%edx
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	20(%esp),%edx
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,20(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	28(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	24(%esp),%edx
+	movl	%ebx,4(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	movl	8(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	andl	%ebx,%eax
+	addl	92(%esp),%edx
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	16(%esp),%edx
+	leal	(%eax,%ecx,1),%eax
+	movl	96(%esp),%esi
+	xorl	%edi,%ebx
+	movl	12(%esp),%ecx
+	addl	(%esi),%eax
+	addl	4(%esi),%ebx
+	addl	8(%esi),%edi
+	addl	12(%esi),%ecx
+	movl	%eax,(%esi)
+	movl	%ebx,4(%esi)
+	movl	%edi,8(%esi)
+	movl	%ecx,12(%esi)
+	movl	%ebx,4(%esp)
+	xorl	%edi,%ebx
+	movl	%edi,8(%esp)
+	movl	%ecx,12(%esp)
+	movl	20(%esp),%edi
+	movl	24(%esp),%ecx
+	addl	16(%esi),%edx
+	addl	20(%esi),%edi
+	addl	24(%esi),%ecx
+	movl	%edx,16(%esi)
+	movl	%edi,20(%esi)
+	movl	%edi,20(%esp)
+	movl	28(%esp),%edi
+	movl	%ecx,24(%esi)
+	addl	28(%esi),%edi
+	movl	%ecx,24(%esp)
+	movl	%edi,28(%esi)
+	movl	%edi,28(%esp)
+	movl	100(%esp),%edi
+	vmovdqa	64(%ebp),%xmm7
+	subl	$192,%ebp
+	cmpl	104(%esp),%edi
+	jb	.L017grand_avx_bmi
+	movl	108(%esp),%esp
+	vzeroall
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
 .size	sha256_block_data_order,.-.L_sha256_block_data_order_begin
-.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
-.byte	110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
-.byte	67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
-.byte	112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
-.byte	62,0
+.comm	OPENSSL_ia32cap_P,16,4
 #else
 .file	"sha256-586.S"
 .text
@@ -288,235 +6813,6760 @@
 	movl	%edi,4(%esp)
 	movl	%eax,8(%esp)
 	movl	%ebx,12(%esp)
+	leal	OPENSSL_ia32cap_P,%edx
+	movl	(%edx),%ecx
+	movl	4(%edx),%ebx
+	testl	$1048576,%ecx
+	jnz	.L002loop
+	movl	8(%edx),%edx
+	testl	$16777216,%ecx
+	jz	.L003no_xmm
+	andl	$1073741824,%ecx
+	andl	$268435968,%ebx
+	testl	$536870912,%edx
+	jnz	.L004shaext
+	orl	%ebx,%ecx
+	andl	$1342177280,%ecx
+	cmpl	$1342177280,%ecx
+	je	.L005AVX
+	testl	$512,%ebx
+	jnz	.L006SSSE3
+.L003no_xmm:
+	subl	%edi,%eax
+	cmpl	$256,%eax
+	jae	.L007unrolled
+	jmp	.L002loop
 .align	16
 .L002loop:
 	movl	(%edi),%eax
 	movl	4(%edi),%ebx
 	movl	8(%edi),%ecx
+	bswap	%eax
 	movl	12(%edi),%edx
-	bswap	%eax
 	bswap	%ebx
+	pushl	%eax
 	bswap	%ecx
+	pushl	%ebx
 	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
 	pushl	%ecx
 	pushl	%edx
 	movl	16(%edi),%eax
 	movl	20(%edi),%ebx
 	movl	24(%edi),%ecx
+	bswap	%eax
 	movl	28(%edi),%edx
-	bswap	%eax
 	bswap	%ebx
+	pushl	%eax
 	bswap	%ecx
+	pushl	%ebx
 	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
 	pushl	%ecx
 	pushl	%edx
 	movl	32(%edi),%eax
 	movl	36(%edi),%ebx
 	movl	40(%edi),%ecx
+	bswap	%eax
 	movl	44(%edi),%edx
-	bswap	%eax
 	bswap	%ebx
+	pushl	%eax
 	bswap	%ecx
+	pushl	%ebx
 	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
 	pushl	%ecx
 	pushl	%edx
 	movl	48(%edi),%eax
 	movl	52(%edi),%ebx
 	movl	56(%edi),%ecx
+	bswap	%eax
 	movl	60(%edi),%edx
-	bswap	%eax
 	bswap	%ebx
+	pushl	%eax
 	bswap	%ecx
+	pushl	%ebx
 	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
 	pushl	%ecx
 	pushl	%edx
 	addl	$64,%edi
-	subl	$32,%esp
-	movl	%edi,100(%esp)
+	leal	-36(%esp),%esp
+	movl	%edi,104(%esp)
 	movl	(%esi),%eax
 	movl	4(%esi),%ebx
 	movl	8(%esi),%ecx
 	movl	12(%esi),%edi
-	movl	%ebx,4(%esp)
-	movl	%ecx,8(%esp)
-	movl	%edi,12(%esp)
+	movl	%ebx,8(%esp)
+	xorl	%ecx,%ebx
+	movl	%ecx,12(%esp)
+	movl	%edi,16(%esp)
+	movl	%ebx,(%esp)
 	movl	16(%esi),%edx
 	movl	20(%esi),%ebx
 	movl	24(%esi),%ecx
 	movl	28(%esi),%edi
-	movl	%ebx,20(%esp)
-	movl	%ecx,24(%esp)
-	movl	%edi,28(%esp)
+	movl	%ebx,24(%esp)
+	movl	%ecx,28(%esp)
+	movl	%edi,32(%esp)
 .align	16
-.L00300_15:
-	movl	92(%esp),%ebx
+.L00800_15:
 	movl	%edx,%ecx
+	movl	24(%esp),%esi
 	rorl	$14,%ecx
-	movl	20(%esp),%esi
+	movl	28(%esp),%edi
 	xorl	%edx,%ecx
+	xorl	%edi,%esi
+	movl	96(%esp),%ebx
 	rorl	$5,%ecx
-	xorl	%edx,%ecx
-	rorl	$6,%ecx
-	movl	24(%esp),%edi
+	andl	%edx,%esi
+	movl	%edx,20(%esp)
+	xorl	%ecx,%edx
+	addl	32(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%esi,%ebx
+	rorl	$9,%ecx
+	addl	%edx,%ebx
+	movl	8(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,4(%esp)
+	leal	-4(%esp),%esp
+	rorl	$11,%ecx
+	movl	(%ebp),%esi
+	xorl	%eax,%ecx
+	movl	20(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%esi,%ebx
+	movl	%eax,(%esp)
+	addl	%ebx,%edx
+	andl	4(%esp),%eax
 	addl	%ecx,%ebx
+	xorl	%edi,%eax
+	addl	$4,%ebp
+	addl	%ebx,%eax
+	cmpl	$3248222580,%esi
+	jne	.L00800_15
+	movl	156(%esp),%ecx
+	jmp	.L00916_63
+.align	16
+.L00916_63:
+	movl	%ecx,%ebx
+	movl	104(%esp),%esi
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
 	xorl	%edi,%esi
-	movl	%edx,16(%esp)
-	movl	%eax,%ecx
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	160(%esp),%ebx
+	shrl	$10,%edi
+	addl	124(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	24(%esp),%esi
+	rorl	$14,%ecx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%edx,%ecx
+	xorl	%edi,%esi
+	movl	%ebx,96(%esp)
+	rorl	$5,%ecx
 	andl	%edx,%esi
-	movl	12(%esp),%edx
+	movl	%edx,20(%esp)
+	xorl	%ecx,%edx
+	addl	32(%esp),%ebx
 	xorl	%edi,%esi
-	movl	%eax,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
 	addl	%esi,%ebx
 	rorl	$9,%ecx
+	addl	%edx,%ebx
+	movl	8(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,4(%esp)
+	leal	-4(%esp),%esp
+	rorl	$11,%ecx
+	movl	(%ebp),%esi
+	xorl	%eax,%ecx
+	movl	20(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%esi,%ebx
+	movl	%eax,(%esp)
+	addl	%ebx,%edx
+	andl	4(%esp),%eax
+	addl	%ecx,%ebx
+	xorl	%edi,%eax
+	movl	156(%esp),%ecx
+	addl	$4,%ebp
+	addl	%ebx,%eax
+	cmpl	$3329325298,%esi
+	jne	.L00916_63
+	movl	356(%esp),%esi
+	movl	8(%esp),%ebx
+	movl	16(%esp),%ecx
+	addl	(%esi),%eax
+	addl	4(%esi),%ebx
+	addl	8(%esi),%edi
+	addl	12(%esi),%ecx
+	movl	%eax,(%esi)
+	movl	%ebx,4(%esi)
+	movl	%edi,8(%esi)
+	movl	%ecx,12(%esi)
+	movl	24(%esp),%eax
+	movl	28(%esp),%ebx
+	movl	32(%esp),%ecx
+	movl	360(%esp),%edi
+	addl	16(%esi),%edx
+	addl	20(%esi),%eax
+	addl	24(%esi),%ebx
+	addl	28(%esi),%ecx
+	movl	%edx,16(%esi)
+	movl	%eax,20(%esi)
+	movl	%ebx,24(%esi)
+	movl	%ecx,28(%esi)
+	leal	356(%esp),%esp
+	subl	$256,%ebp
+	cmpl	8(%esp),%edi
+	jb	.L002loop
+	movl	12(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	64
+.L001K256:
+.long	1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298
+.long	66051,67438087,134810123,202182159
+.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
+.byte	110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
+.byte	67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+.byte	112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+.byte	62,0
+.align	16
+.L007unrolled:
+	leal	-96(%esp),%esp
+	movl	(%esi),%eax
+	movl	4(%esi),%ebp
+	movl	8(%esi),%ecx
+	movl	12(%esi),%ebx
+	movl	%ebp,4(%esp)
+	xorl	%ecx,%ebp
+	movl	%ecx,8(%esp)
+	movl	%ebx,12(%esp)
+	movl	16(%esi),%edx
+	movl	20(%esi),%ebx
+	movl	24(%esi),%ecx
+	movl	28(%esi),%esi
+	movl	%ebx,20(%esp)
+	movl	%ecx,24(%esp)
+	movl	%esi,28(%esp)
+	jmp	.L010grand_loop
+.align	16
+.L010grand_loop:
+	movl	(%edi),%ebx
+	movl	4(%edi),%ecx
+	bswap	%ebx
+	movl	8(%edi),%esi
+	bswap	%ecx
+	movl	%ebx,32(%esp)
+	bswap	%esi
+	movl	%ecx,36(%esp)
+	movl	%esi,40(%esp)
+	movl	12(%edi),%ebx
+	movl	16(%edi),%ecx
+	bswap	%ebx
+	movl	20(%edi),%esi
+	bswap	%ecx
+	movl	%ebx,44(%esp)
+	bswap	%esi
+	movl	%ecx,48(%esp)
+	movl	%esi,52(%esp)
+	movl	24(%edi),%ebx
+	movl	28(%edi),%ecx
+	bswap	%ebx
+	movl	32(%edi),%esi
+	bswap	%ecx
+	movl	%ebx,56(%esp)
+	bswap	%esi
+	movl	%ecx,60(%esp)
+	movl	%esi,64(%esp)
+	movl	36(%edi),%ebx
+	movl	40(%edi),%ecx
+	bswap	%ebx
+	movl	44(%edi),%esi
+	bswap	%ecx
+	movl	%ebx,68(%esp)
+	bswap	%esi
+	movl	%ecx,72(%esp)
+	movl	%esi,76(%esp)
+	movl	48(%edi),%ebx
+	movl	52(%edi),%ecx
+	bswap	%ebx
+	movl	56(%edi),%esi
+	bswap	%ecx
+	movl	%ebx,80(%esp)
+	bswap	%esi
+	movl	%ecx,84(%esp)
+	movl	%esi,88(%esp)
+	movl	60(%edi),%ebx
+	addl	$64,%edi
+	bswap	%ebx
+	movl	%edi,100(%esp)
+	movl	%ebx,92(%esp)
+	movl	%edx,%ecx
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	32(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
 	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
 	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
 	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1116352408(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	36(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1899447441(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	40(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3049323471(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	44(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3921009573(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
 	movl	4(%esp),%esi
+	rorl	$14,%edx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	48(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
 	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	961987163(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
 	rorl	$2,%ecx
-	addl	%ebx,%edx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	52(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1508970993(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	56(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2453635748(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	60(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
 	movl	8(%esp),%edi
-	addl	%ecx,%ebx
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2870763221(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	64(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
 	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3624381080(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	68(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	310598401(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	72(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
 	movl	%eax,%ecx
-	subl	$4,%esp
-	orl	%esi,%eax
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	607225278(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	76(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
 	andl	%esi,%ecx
-	andl	%edi,%eax
-	movl	(%ebp),%esi
-	orl	%ecx,%eax
-	addl	$4,%ebp
-	addl	%ebx,%eax
-	addl	%esi,%edx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1426881987(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
 	addl	%esi,%eax
-	cmpl	$3248222580,%esi
-	jne	.L00300_15
-	movl	152(%esp),%ebx
-.align	16
-.L00416_63:
-	movl	%ebx,%esi
-	movl	100(%esp),%ecx
+	movl	%edx,%ecx
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	80(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1925078388(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	84(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
 	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2162078206(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	88(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2614888103(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	92(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3248222580(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	36(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	88(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	32(%esp),%ebx
+	shrl	$10,%edi
+	addl	68(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,32(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3835390401(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	40(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	92(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
 	movl	%ecx,%edi
+	rorl	$2,%ecx
 	xorl	%ebx,%esi
+	shrl	$3,%ebx
 	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	36(%esp),%ebx
+	shrl	$10,%edi
+	addl	72(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,36(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	4022224774(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	44(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	32(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
 	shrl	$3,%ebx
-	rorl	$2,%edi
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	40(%esp),%ebx
+	shrl	$10,%edi
+	addl	76(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,40(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	264347078(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	48(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	36(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
 	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	44(%esp),%ebx
+	shrl	$10,%edi
+	addl	80(%esp),%ebx
+	movl	%edx,%esi
 	xorl	%ecx,%edi
-	rorl	$17,%edi
-	shrl	$10,%ecx
-	addl	156(%esp),%ebx
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,44(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
 	xorl	%ecx,%edi
-	addl	120(%esp),%ebx
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	604807628(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	52(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	40(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	48(%esp),%ebx
+	shrl	$10,%edi
+	addl	84(%esp),%ebx
 	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
 	addl	%edi,%ebx
-	rorl	$14,%ecx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,48(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	770255983(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	56(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	44(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	52(%esp),%ebx
+	shrl	$10,%edi
+	addl	88(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,52(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1249150122(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	60(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	48(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	56(%esp),%ebx
+	shrl	$10,%edi
+	addl	92(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,56(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1555081692(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	64(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	52(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	60(%esp),%ebx
+	shrl	$10,%edi
+	addl	32(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,60(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1996064986(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	68(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	56(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	64(%esp),%ebx
+	shrl	$10,%edi
+	addl	36(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
 	movl	20(%esp),%esi
-	xorl	%edx,%ecx
-	rorl	$5,%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,64(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2554220882(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	72(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	60(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	68(%esp),%ebx
+	shrl	$10,%edi
+	addl	40(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,68(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2821834349(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	76(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	64(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	72(%esp),%ebx
+	shrl	$10,%edi
+	addl	44(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,72(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2952996808(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	80(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	68(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	76(%esp),%ebx
+	shrl	$10,%edi
+	addl	48(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,76(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3210313671(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	84(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	72(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	80(%esp),%ebx
+	shrl	$10,%edi
+	addl	52(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,80(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3336571891(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	88(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	76(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	84(%esp),%ebx
+	shrl	$10,%edi
+	addl	56(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,84(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3584528711(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	92(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	80(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	88(%esp),%ebx
+	shrl	$10,%edi
+	addl	60(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,88(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	113926993(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	32(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	84(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	92(%esp),%ebx
+	shrl	$10,%edi
+	addl	64(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
 	movl	%ebx,92(%esp)
-	xorl	%edx,%ecx
-	rorl	$6,%ecx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	338241895(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	36(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	88(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	32(%esp),%ebx
+	shrl	$10,%edi
+	addl	68(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
 	movl	24(%esp),%edi
-	addl	%ecx,%ebx
+	xorl	%ecx,%edx
+	movl	%ebx,32(%esp)
 	xorl	%edi,%esi
-	movl	%edx,16(%esp)
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
 	movl	%eax,%ecx
-	andl	%edx,%esi
-	movl	12(%esp),%edx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	666307205(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	40(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	92(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	36(%esp),%ebx
+	shrl	$10,%edi
+	addl	72(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,36(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	773529912(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	44(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	32(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
 	xorl	%edi,%esi
-	movl	%eax,%edi
-	addl	%esi,%ebx
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	40(%esp),%ebx
+	shrl	$10,%edi
+	addl	76(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,40(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
 	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1294757372(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	48(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	36(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	44(%esp),%ebx
+	shrl	$10,%edi
+	addl	80(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,44(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1396182291(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	52(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	40(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	48(%esp),%ebx
+	shrl	$10,%edi
+	addl	84(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,48(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1695183700(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	56(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	44(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	52(%esp),%ebx
+	shrl	$10,%edi
+	addl	88(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,52(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1986661051(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	60(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	48(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	56(%esp),%ebx
+	shrl	$10,%edi
+	addl	92(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,56(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2177026350(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	64(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	52(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	60(%esp),%ebx
+	shrl	$10,%edi
+	addl	32(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,60(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2456956037(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	68(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	56(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	64(%esp),%ebx
+	shrl	$10,%edi
+	addl	36(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,64(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
 	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
 	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
 	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2730485921(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	72(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	60(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	68(%esp),%ebx
+	shrl	$10,%edi
+	addl	40(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,68(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2820302411(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	76(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	64(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	72(%esp),%ebx
+	shrl	$10,%edi
+	addl	44(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,72(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3259730800(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	80(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	68(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	76(%esp),%ebx
+	shrl	$10,%edi
+	addl	48(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,76(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3345764771(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	84(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	72(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	80(%esp),%ebx
+	shrl	$10,%edi
+	addl	52(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
 	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,80(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
 	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3516065817(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	88(%esp),%esi
 	rorl	$2,%ecx
-	addl	%ebx,%edx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	76(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	84(%esp),%ebx
+	shrl	$10,%edi
+	addl	56(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,84(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3600352804(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	92(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	80(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	88(%esp),%ebx
+	shrl	$10,%edi
+	addl	60(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,88(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	4094571909(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	32(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	84(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	92(%esp),%ebx
+	shrl	$10,%edi
+	addl	64(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,92(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
 	movl	8(%esp),%edi
-	addl	%ecx,%ebx
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	275423344(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	36(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	88(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	32(%esp),%ebx
+	shrl	$10,%edi
+	addl	68(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,32(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
 	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	430227734(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	40(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	92(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	36(%esp),%ebx
+	shrl	$10,%edi
+	addl	72(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,36(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	506948616(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	44(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	32(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	40(%esp),%ebx
+	shrl	$10,%edi
+	addl	76(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,40(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
 	movl	%eax,%ecx
-	subl	$4,%esp
-	orl	%esi,%eax
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	659060556(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	48(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	36(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	44(%esp),%ebx
+	shrl	$10,%edi
+	addl	80(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,44(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
 	andl	%esi,%ecx
-	andl	%edi,%eax
-	movl	(%ebp),%esi
-	orl	%ecx,%eax
-	addl	$4,%ebp
-	addl	%ebx,%eax
-	movl	152(%esp),%ebx
-	addl	%esi,%edx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	883997877(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	52(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
 	addl	%esi,%eax
-	cmpl	$3329325298,%esi
-	jne	.L00416_63
-	movl	352(%esp),%esi
-	movl	4(%esp),%ebx
+	movl	40(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	48(%esp),%ebx
+	shrl	$10,%edi
+	addl	84(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,48(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	958139571(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	56(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	44(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	52(%esp),%ebx
+	shrl	$10,%edi
+	addl	88(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,52(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1322822218(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	60(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	48(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	56(%esp),%ebx
+	shrl	$10,%edi
+	addl	92(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,56(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1537002063(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	64(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	52(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	60(%esp),%ebx
+	shrl	$10,%edi
+	addl	32(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,60(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1747873779(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	68(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	56(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	64(%esp),%ebx
+	shrl	$10,%edi
+	addl	36(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,64(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1955562222(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	72(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	60(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	68(%esp),%ebx
+	shrl	$10,%edi
+	addl	40(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,68(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2024104815(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	76(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	64(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	72(%esp),%ebx
+	shrl	$10,%edi
+	addl	44(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,72(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2227730452(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	80(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	68(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	76(%esp),%ebx
+	shrl	$10,%edi
+	addl	48(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
 	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
 	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,76(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2361852424(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	84(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	72(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	80(%esp),%ebx
+	shrl	$10,%edi
+	addl	52(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,80(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2428436474(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	88(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	76(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	84(%esp),%ebx
+	shrl	$10,%edi
+	addl	56(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,84(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2756734187(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	92(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	80(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	88(%esp),%ebx
+	shrl	$10,%edi
+	addl	60(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3204031479(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	32(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	84(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	92(%esp),%ebx
+	shrl	$10,%edi
+	addl	64(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3329325298(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	96(%esp),%esi
+	xorl	%edi,%ebp
+	movl	12(%esp),%ecx
 	addl	(%esi),%eax
-	addl	4(%esi),%ebx
-	addl	8(%esi),%ecx
-	addl	12(%esi),%edi
+	addl	4(%esi),%ebp
+	addl	8(%esi),%edi
+	addl	12(%esi),%ecx
 	movl	%eax,(%esi)
-	movl	%ebx,4(%esi)
-	movl	%ecx,8(%esi)
-	movl	%edi,12(%esi)
-	movl	20(%esp),%eax
+	movl	%ebp,4(%esi)
+	movl	%edi,8(%esi)
+	movl	%ecx,12(%esi)
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	movl	%edi,8(%esp)
+	movl	%ecx,12(%esp)
+	movl	20(%esp),%edi
 	movl	24(%esp),%ebx
 	movl	28(%esp),%ecx
-	movl	356(%esp),%edi
 	addl	16(%esi),%edx
-	addl	20(%esi),%eax
+	addl	20(%esi),%edi
 	addl	24(%esi),%ebx
 	addl	28(%esi),%ecx
 	movl	%edx,16(%esi)
-	movl	%eax,20(%esi)
+	movl	%edi,20(%esi)
 	movl	%ebx,24(%esi)
 	movl	%ecx,28(%esi)
-	addl	$352,%esp
-	subl	$256,%ebp
-	cmpl	8(%esp),%edi
-	jb	.L002loop
-	movl	12(%esp),%esp
+	movl	%edi,20(%esp)
+	movl	100(%esp),%edi
+	movl	%ebx,24(%esp)
+	movl	%ecx,28(%esp)
+	cmpl	104(%esp),%edi
+	jb	.L010grand_loop
+	movl	108(%esp),%esp
 	popl	%edi
 	popl	%esi
 	popl	%ebx
 	popl	%ebp
 	ret
-.align	64
-.L001K256:
-.long	1116352408,1899447441,3049323471,3921009573
-.long	961987163,1508970993,2453635748,2870763221
-.long	3624381080,310598401,607225278,1426881987
-.long	1925078388,2162078206,2614888103,3248222580
-.long	3835390401,4022224774,264347078,604807628
-.long	770255983,1249150122,1555081692,1996064986
-.long	2554220882,2821834349,2952996808,3210313671
-.long	3336571891,3584528711,113926993,338241895
-.long	666307205,773529912,1294757372,1396182291
-.long	1695183700,1986661051,2177026350,2456956037
-.long	2730485921,2820302411,3259730800,3345764771
-.long	3516065817,3600352804,4094571909,275423344
-.long	430227734,506948616,659060556,883997877
-.long	958139571,1322822218,1537002063,1747873779
-.long	1955562222,2024104815,2227730452,2361852424
-.long	2428436474,2756734187,3204031479,3329325298
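
The unrolled rounds above fold each SHA-256 round constant straight into a leal displacement (for example 3329325298 in the last round is 0xc67178f2, the final entry of the K256 table being dropped here), and the rorl/shrl chains are the Sigma/sigma rotations. Purely as a reading aid, here is a plain-C sketch of the same one-block compression; the function and variable names are illustrative only and are not taken from this tree:

    #include <stdint.h>
    #include <stdio.h>

    static const uint32_t K256[64] = {
        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
    };

    #define ROTR(x,n) (((x) >> (n)) | ((x) << (32 - (n))))

    /* one 64-byte block of the SHA-256 compression function; h[] is the chaining value */
    static void sha256_block(uint32_t h[8], const uint8_t p[64])
    {
        uint32_t w[64], a, b, c, d, e, f, g, hh;
        int i;

        for (i = 0; i < 16; i++)        /* load the 16 message words big-endian */
            w[i] = (uint32_t)p[4 * i] << 24 | (uint32_t)p[4 * i + 1] << 16 |
                   (uint32_t)p[4 * i + 2] << 8 | p[4 * i + 3];
        for (i = 16; i < 64; i++) {     /* schedule: sigma0 = ror7^ror18^shr3, sigma1 = ror17^ror19^shr10 */
            uint32_t s0 = ROTR(w[i - 15], 7) ^ ROTR(w[i - 15], 18) ^ (w[i - 15] >> 3);
            uint32_t s1 = ROTR(w[i - 2], 17) ^ ROTR(w[i - 2], 19) ^ (w[i - 2] >> 10);
            w[i] = w[i - 16] + s0 + w[i - 7] + s1;
        }
        a = h[0]; b = h[1]; c = h[2]; d = h[3];
        e = h[4]; f = h[5]; g = h[6]; hh = h[7];
        for (i = 0; i < 64; i++) {      /* 64 rounds; K256[i] is the constant folded into each leal above */
            uint32_t S1 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25);
            uint32_t ch = (e & f) ^ (~e & g);
            uint32_t S0 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22);
            uint32_t maj = (a & b) ^ (a & c) ^ (b & c);
            uint32_t t1 = hh + S1 + ch + K256[i] + w[i];
            uint32_t t2 = S0 + maj;
            hh = g; g = f; f = e; e = d + t1;
            d = c; c = b; b = a; a = t1 + t2;
        }
        h[0] += a; h[1] += b; h[2] += c; h[3] += d;
        h[4] += e; h[5] += f; h[6] += g; h[7] += hh;
    }

    int main(void)
    {
        uint8_t block[64] = { 'a', 'b', 'c', 0x80 };   /* "abc", padded to one block */
        uint32_t h[8] = {
            0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
            0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
        };
        int i;

        block[63] = 24;                 /* message length in bits */
        sha256_block(h, block);
        for (i = 0; i < 8; i++)         /* expect ba7816bf 8f01cfea ... f20015ad */
            printf("%08x", h[i]);
        printf("\n");
        return 0;
    }

The assembly unrolls the same 64 rounds and keeps the working variables rotating through the integer registers instead of shuffling a..h in memory.
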
+.align	32
+.L004shaext:
+	subl	$32,%esp
+	movdqu	(%esi),%xmm1
+	leal	128(%ebp),%ebp
+	movdqu	16(%esi),%xmm2
+	movdqa	128(%ebp),%xmm7
+	pshufd	$27,%xmm1,%xmm0
+	pshufd	$177,%xmm1,%xmm1
+	pshufd	$27,%xmm2,%xmm2
+.byte	102,15,58,15,202,8
+	punpcklqdq	%xmm0,%xmm2
+	jmp	.L011loop_shaext
+.align	16
+.L011loop_shaext:
+	movdqu	(%edi),%xmm3
+	movdqu	16(%edi),%xmm4
+	movdqu	32(%edi),%xmm5
+.byte	102,15,56,0,223
+	movdqu	48(%edi),%xmm6
+	movdqa	%xmm2,16(%esp)
+	movdqa	-128(%ebp),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	102,15,56,0,231
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	nop
+	movdqa	%xmm1,(%esp)
+.byte	15,56,203,202
+	movdqa	-112(%ebp),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	102,15,56,0,239
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	leal	64(%edi),%edi
+.byte	15,56,204,220
+.byte	15,56,203,202
+	movdqa	-96(%ebp),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	102,15,56,0,247
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	movdqa	%xmm6,%xmm7
+.byte	102,15,58,15,253,4
+	nop
+	paddd	%xmm7,%xmm3
+.byte	15,56,204,229
+.byte	15,56,203,202
+	movdqa	-80(%ebp),%xmm0
+	paddd	%xmm6,%xmm0
+.byte	15,56,205,222
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	movdqa	%xmm3,%xmm7
+.byte	102,15,58,15,254,4
+	nop
+	paddd	%xmm7,%xmm4
+.byte	15,56,204,238
+.byte	15,56,203,202
+	movdqa	-64(%ebp),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	15,56,205,227
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	movdqa	%xmm4,%xmm7
+.byte	102,15,58,15,251,4
+	nop
+	paddd	%xmm7,%xmm5
+.byte	15,56,204,243
+.byte	15,56,203,202
+	movdqa	-48(%ebp),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	15,56,205,236
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	movdqa	%xmm5,%xmm7
+.byte	102,15,58,15,252,4
+	nop
+	paddd	%xmm7,%xmm6
+.byte	15,56,204,220
+.byte	15,56,203,202
+	movdqa	-32(%ebp),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	15,56,205,245
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	movdqa	%xmm6,%xmm7
+.byte	102,15,58,15,253,4
+	nop
+	paddd	%xmm7,%xmm3
+.byte	15,56,204,229
+.byte	15,56,203,202
+	movdqa	-16(%ebp),%xmm0
+	paddd	%xmm6,%xmm0
+.byte	15,56,205,222
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	movdqa	%xmm3,%xmm7
+.byte	102,15,58,15,254,4
+	nop
+	paddd	%xmm7,%xmm4
+.byte	15,56,204,238
+.byte	15,56,203,202
+	movdqa	(%ebp),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	15,56,205,227
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	movdqa	%xmm4,%xmm7
+.byte	102,15,58,15,251,4
+	nop
+	paddd	%xmm7,%xmm5
+.byte	15,56,204,243
+.byte	15,56,203,202
+	movdqa	16(%ebp),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	15,56,205,236
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	movdqa	%xmm5,%xmm7
+.byte	102,15,58,15,252,4
+	nop
+	paddd	%xmm7,%xmm6
+.byte	15,56,204,220
+.byte	15,56,203,202
+	movdqa	32(%ebp),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	15,56,205,245
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	movdqa	%xmm6,%xmm7
+.byte	102,15,58,15,253,4
+	nop
+	paddd	%xmm7,%xmm3
+.byte	15,56,204,229
+.byte	15,56,203,202
+	movdqa	48(%ebp),%xmm0
+	paddd	%xmm6,%xmm0
+.byte	15,56,205,222
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	movdqa	%xmm3,%xmm7
+.byte	102,15,58,15,254,4
+	nop
+	paddd	%xmm7,%xmm4
+.byte	15,56,204,238
+.byte	15,56,203,202
+	movdqa	64(%ebp),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	15,56,205,227
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	movdqa	%xmm4,%xmm7
+.byte	102,15,58,15,251,4
+	nop
+	paddd	%xmm7,%xmm5
+.byte	15,56,204,243
+.byte	15,56,203,202
+	movdqa	80(%ebp),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	15,56,205,236
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	movdqa	%xmm5,%xmm7
+.byte	102,15,58,15,252,4
+.byte	15,56,203,202
+	paddd	%xmm7,%xmm6
+	movdqa	96(%ebp),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+.byte	15,56,205,245
+	movdqa	128(%ebp),%xmm7
+.byte	15,56,203,202
+	movdqa	112(%ebp),%xmm0
+	paddd	%xmm6,%xmm0
+	nop
+.byte	15,56,203,209
+	pshufd	$14,%xmm0,%xmm0
+	cmpl	%edi,%eax
+	nop
+.byte	15,56,203,202
+	paddd	16(%esp),%xmm2
+	paddd	(%esp),%xmm1
+	jnz	.L011loop_shaext
+	pshufd	$177,%xmm2,%xmm2
+	pshufd	$27,%xmm1,%xmm7
+	pshufd	$177,%xmm1,%xmm1
+	punpckhqdq	%xmm2,%xmm1
+.byte	102,15,58,15,215,8
+	movl	44(%esp),%esp
+	movdqu	%xmm1,(%esi)
+	movdqu	%xmm2,16(%esi)
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
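
The .L004shaext path that returns here uses the SHA instruction-set extensions: the .byte 15,56,203,... / 15,56,204,... / 15,56,205,... sequences are sha256rnds2, sha256msg1 and sha256msg2 emitted as raw opcode bytes (which keeps the file assembling on tools without those mnemonics), and 102,15,56,0,... is pshufb. The path is meant to run only on CPUs that advertise these instructions. A standalone feature probe, independent of how the generated code itself dispatches and assuming a compiler whose <cpuid.h> provides __get_cpuid_count (recent GCC/Clang), could look like:

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        /* CPUID.(EAX=7,ECX=0):EBX bit 29 is the SHA extensions flag */
        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
            puts("CPUID leaf 7 not available");
            return 1;
        }
        puts((ebx & (1u << 29)) ? "SHA extensions: present" : "SHA extensions: absent");
        return 0;
    }
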
+.align	32
+.L006SSSE3:
+	leal	-96(%esp),%esp
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%edi
+	movl	%ebx,4(%esp)
+	xorl	%ecx,%ebx
+	movl	%ecx,8(%esp)
+	movl	%edi,12(%esp)
+	movl	16(%esi),%edx
+	movl	20(%esi),%edi
+	movl	24(%esi),%ecx
+	movl	28(%esi),%esi
+	movl	%edi,20(%esp)
+	movl	100(%esp),%edi
+	movl	%ecx,24(%esp)
+	movl	%esi,28(%esp)
+	movdqa	256(%ebp),%xmm7
+	jmp	.L012grand_ssse3
+.align	16
+.L012grand_ssse3:
+	movdqu	(%edi),%xmm0
+	movdqu	16(%edi),%xmm1
+	movdqu	32(%edi),%xmm2
+	movdqu	48(%edi),%xmm3
+	addl	$64,%edi
+.byte	102,15,56,0,199
+	movl	%edi,100(%esp)
+.byte	102,15,56,0,207
+	movdqa	(%ebp),%xmm4
+.byte	102,15,56,0,215
+	movdqa	16(%ebp),%xmm5
+	paddd	%xmm0,%xmm4
+.byte	102,15,56,0,223
+	movdqa	32(%ebp),%xmm6
+	paddd	%xmm1,%xmm5
+	movdqa	48(%ebp),%xmm7
+	movdqa	%xmm4,32(%esp)
+	paddd	%xmm2,%xmm6
+	movdqa	%xmm5,48(%esp)
+	paddd	%xmm3,%xmm7
+	movdqa	%xmm6,64(%esp)
+	movdqa	%xmm7,80(%esp)
+	jmp	.L013ssse3_00_47
+.align	16
+.L013ssse3_00_47:
+	addl	$64,%ebp
+	movl	%edx,%ecx
+	movdqa	%xmm1,%xmm4
+	rorl	$14,%edx
+	movl	20(%esp),%esi
+	movdqa	%xmm3,%xmm7
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+.byte	102,15,58,15,224,4
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+.byte	102,15,58,15,250,4
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	movdqa	%xmm4,%xmm5
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	movdqa	%xmm4,%xmm6
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	psrld	$3,%xmm4
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm0
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	psrld	$7,%xmm6
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	pshufd	$250,%xmm3,%xmm7
+	xorl	%esi,%ecx
+	addl	32(%esp),%edx
+	pslld	$14,%xmm5
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm4
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	psrld	$11,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm5,%xmm4
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	pslld	$11,%xmm5
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	pxor	%xmm6,%xmm4
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	movdqa	%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	pxor	%xmm5,%xmm4
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	psrld	$10,%xmm7
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm4,%xmm0
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	psrlq	$17,%xmm6
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	rorl	$11,%ecx
+	pxor	%xmm6,%xmm7
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	psrlq	$2,%xmm6
+	addl	36(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	pshufd	$128,%xmm7,%xmm7
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	psrldq	$8,%xmm7
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	paddd	%xmm7,%xmm0
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,24(%esp)
+	pshufd	$80,%xmm0,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	movdqa	%xmm7,%xmm6
+	rorl	$11,%ecx
+	psrld	$10,%xmm7
+	andl	%eax,%ebx
+	psrlq	$17,%xmm6
+	xorl	%esi,%ecx
+	addl	40(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	psrlq	$2,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm6,%xmm7
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	movdqa	(%ebp),%xmm6
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	pslldq	$8,%xmm7
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm0
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	paddd	%xmm0,%xmm6
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	44(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movdqa	%xmm6,32(%esp)
+	movl	%edx,%ecx
+	movdqa	%xmm2,%xmm4
+	rorl	$14,%edx
+	movl	4(%esp),%esi
+	movdqa	%xmm0,%xmm7
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+.byte	102,15,58,15,225,4
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+.byte	102,15,58,15,251,4
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	movdqa	%xmm4,%xmm5
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	movdqa	%xmm4,%xmm6
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	psrld	$3,%xmm4
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm1
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	psrld	$7,%xmm6
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	pshufd	$250,%xmm0,%xmm7
+	xorl	%esi,%ecx
+	addl	48(%esp),%edx
+	pslld	$14,%xmm5
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm4
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	psrld	$11,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm5,%xmm4
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	pslld	$11,%xmm5
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	pxor	%xmm6,%xmm4
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	movdqa	%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	pxor	%xmm5,%xmm4
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	psrld	$10,%xmm7
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm4,%xmm1
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	psrlq	$17,%xmm6
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	rorl	$11,%ecx
+	pxor	%xmm6,%xmm7
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	psrlq	$2,%xmm6
+	addl	52(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	pshufd	$128,%xmm7,%xmm7
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	psrldq	$8,%xmm7
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	paddd	%xmm7,%xmm1
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,8(%esp)
+	pshufd	$80,%xmm1,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	movdqa	%xmm7,%xmm6
+	rorl	$11,%ecx
+	psrld	$10,%xmm7
+	andl	%eax,%ebx
+	psrlq	$17,%xmm6
+	xorl	%esi,%ecx
+	addl	56(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	psrlq	$2,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm6,%xmm7
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	movdqa	16(%ebp),%xmm6
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	pslldq	$8,%xmm7
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm1
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	paddd	%xmm1,%xmm6
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	60(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movdqa	%xmm6,48(%esp)
+	movl	%edx,%ecx
+	movdqa	%xmm3,%xmm4
+	rorl	$14,%edx
+	movl	20(%esp),%esi
+	movdqa	%xmm1,%xmm7
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+.byte	102,15,58,15,226,4
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+.byte	102,15,58,15,248,4
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	movdqa	%xmm4,%xmm5
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	movdqa	%xmm4,%xmm6
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	psrld	$3,%xmm4
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm2
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	psrld	$7,%xmm6
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	pshufd	$250,%xmm1,%xmm7
+	xorl	%esi,%ecx
+	addl	64(%esp),%edx
+	pslld	$14,%xmm5
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm4
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	psrld	$11,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm5,%xmm4
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	pslld	$11,%xmm5
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	pxor	%xmm6,%xmm4
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	movdqa	%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	pxor	%xmm5,%xmm4
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	psrld	$10,%xmm7
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm4,%xmm2
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	psrlq	$17,%xmm6
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	rorl	$11,%ecx
+	pxor	%xmm6,%xmm7
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	psrlq	$2,%xmm6
+	addl	68(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	pshufd	$128,%xmm7,%xmm7
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	psrldq	$8,%xmm7
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	paddd	%xmm7,%xmm2
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,24(%esp)
+	pshufd	$80,%xmm2,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	movdqa	%xmm7,%xmm6
+	rorl	$11,%ecx
+	psrld	$10,%xmm7
+	andl	%eax,%ebx
+	psrlq	$17,%xmm6
+	xorl	%esi,%ecx
+	addl	72(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	psrlq	$2,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm6,%xmm7
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	movdqa	32(%ebp),%xmm6
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	pslldq	$8,%xmm7
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm2
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	paddd	%xmm2,%xmm6
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	76(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movdqa	%xmm6,64(%esp)
+	movl	%edx,%ecx
+	movdqa	%xmm0,%xmm4
+	rorl	$14,%edx
+	movl	4(%esp),%esi
+	movdqa	%xmm2,%xmm7
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+.byte	102,15,58,15,227,4
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+.byte	102,15,58,15,249,4
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	movdqa	%xmm4,%xmm5
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	movdqa	%xmm4,%xmm6
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	psrld	$3,%xmm4
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm3
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	psrld	$7,%xmm6
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	pshufd	$250,%xmm2,%xmm7
+	xorl	%esi,%ecx
+	addl	80(%esp),%edx
+	pslld	$14,%xmm5
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm4
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	psrld	$11,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm5,%xmm4
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	pslld	$11,%xmm5
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	pxor	%xmm6,%xmm4
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	movdqa	%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	pxor	%xmm5,%xmm4
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	psrld	$10,%xmm7
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm4,%xmm3
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	psrlq	$17,%xmm6
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	rorl	$11,%ecx
+	pxor	%xmm6,%xmm7
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	psrlq	$2,%xmm6
+	addl	84(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	pshufd	$128,%xmm7,%xmm7
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	psrldq	$8,%xmm7
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	paddd	%xmm7,%xmm3
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,8(%esp)
+	pshufd	$80,%xmm3,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	movdqa	%xmm7,%xmm6
+	rorl	$11,%ecx
+	psrld	$10,%xmm7
+	andl	%eax,%ebx
+	psrlq	$17,%xmm6
+	xorl	%esi,%ecx
+	addl	88(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	psrlq	$2,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm6,%xmm7
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	movdqa	48(%ebp),%xmm6
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	pslldq	$8,%xmm7
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm3
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	paddd	%xmm3,%xmm6
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	92(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movdqa	%xmm6,80(%esp)
+	cmpl	$66051,64(%ebp)
+	jne	.L013ssse3_00_47
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	20(%esp),%esi
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	32(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	36(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,24(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	40(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	44(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	4(%esp),%esi
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	48(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	52(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,8(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	56(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	60(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	20(%esp),%esi
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	64(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	68(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,24(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	72(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	76(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	4(%esp),%esi
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	80(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	84(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,8(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	88(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	92(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movl	96(%esp),%esi
+	xorl	%edi,%ebx
+	movl	12(%esp),%ecx
+	addl	(%esi),%eax
+	addl	4(%esi),%ebx
+	addl	8(%esi),%edi
+	addl	12(%esi),%ecx
+	movl	%eax,(%esi)
+	movl	%ebx,4(%esi)
+	movl	%edi,8(%esi)
+	movl	%ecx,12(%esi)
+	movl	%ebx,4(%esp)
+	xorl	%edi,%ebx
+	movl	%edi,8(%esp)
+	movl	%ecx,12(%esp)
+	movl	20(%esp),%edi
+	movl	24(%esp),%ecx
+	addl	16(%esi),%edx
+	addl	20(%esi),%edi
+	addl	24(%esi),%ecx
+	movl	%edx,16(%esi)
+	movl	%edi,20(%esi)
+	movl	%edi,20(%esp)
+	movl	28(%esp),%edi
+	movl	%ecx,24(%esi)
+	addl	28(%esi),%edi
+	movl	%ecx,24(%esp)
+	movl	%edi,28(%esi)
+	movl	%edi,28(%esp)
+	movl	100(%esp),%edi
+	movdqa	64(%ebp),%xmm7
+	subl	$192,%ebp
+	cmpl	104(%esp),%edi
+	jb	.L012grand_ssse3
+	movl	108(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
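
The SSSE3 path that returns here vectorizes the message schedule four words at a time while the scalar rounds run interleaved in the integer registers: the psrld $3 / psrld $7 / pslld $14 steps, with the follow-up psrld $11 and pslld $11, build sigma0(x) = ror(x,7) ^ ror(x,18) ^ (x >> 3), and the psrld $10 / psrlq $17 / psrlq $2 steps build sigma1 on the upper lanes. A minimal C-with-intrinsics sketch of just the sigma0 step, with a made-up helper name and with the rotations written directly rather than built up incrementally as the generated code does, is:

    #include <emmintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    /* sigma0(x) = ror(x,7) ^ ror(x,18) ^ (x >> 3) on four 32-bit words at once */
    static __m128i sha256_sigma0_x4(__m128i x)
    {
        __m128i r7  = _mm_or_si128(_mm_srli_epi32(x, 7),  _mm_slli_epi32(x, 25));
        __m128i r18 = _mm_or_si128(_mm_srli_epi32(x, 18), _mm_slli_epi32(x, 14));
        return _mm_xor_si128(_mm_xor_si128(r7, r18), _mm_srli_epi32(x, 3));
    }

    int main(void)
    {
        uint32_t w[4] = { 0x61626380, 0, 0, 0 };   /* arbitrary test input */
        uint32_t out[4];

        _mm_storeu_si128((__m128i *)out,
                         sha256_sigma0_x4(_mm_loadu_si128((const __m128i *)w)));
        printf("%08x %08x %08x %08x\n", out[0], out[1], out[2], out[3]);
        return 0;
    }

The AVX path that follows computes the same schedule with three-operand vpsrld/vpslld/vpxor forms, which removes the register-copy movdqa steps seen above.
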
+.align	32
+.L005AVX:
+	andl	$264,%edx
+	cmpl	$264,%edx
+	je	.L014AVX_BMI
+	leal	-96(%esp),%esp
+	vzeroall
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%edi
+	movl	%ebx,4(%esp)
+	xorl	%ecx,%ebx
+	movl	%ecx,8(%esp)
+	movl	%edi,12(%esp)
+	movl	16(%esi),%edx
+	movl	20(%esi),%edi
+	movl	24(%esi),%ecx
+	movl	28(%esi),%esi
+	movl	%edi,20(%esp)
+	movl	100(%esp),%edi
+	movl	%ecx,24(%esp)
+	movl	%esi,28(%esp)
+	vmovdqa	256(%ebp),%xmm7
+	jmp	.L015grand_avx
+.align	32
+.L015grand_avx:
+	vmovdqu	(%edi),%xmm0
+	vmovdqu	16(%edi),%xmm1
+	vmovdqu	32(%edi),%xmm2
+	vmovdqu	48(%edi),%xmm3
+	addl	$64,%edi
+	vpshufb	%xmm7,%xmm0,%xmm0
+	movl	%edi,100(%esp)
+	vpshufb	%xmm7,%xmm1,%xmm1
+	vpshufb	%xmm7,%xmm2,%xmm2
+	vpaddd	(%ebp),%xmm0,%xmm4
+	vpshufb	%xmm7,%xmm3,%xmm3
+	vpaddd	16(%ebp),%xmm1,%xmm5
+	vpaddd	32(%ebp),%xmm2,%xmm6
+	vpaddd	48(%ebp),%xmm3,%xmm7
+	vmovdqa	%xmm4,32(%esp)
+	vmovdqa	%xmm5,48(%esp)
+	vmovdqa	%xmm6,64(%esp)
+	vmovdqa	%xmm7,80(%esp)
+	jmp	.L016avx_00_47
+.align	16
+.L016avx_00_47:
+	addl	$64,%ebp
+	vpalignr	$4,%xmm0,%xmm1,%xmm4
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	20(%esp),%esi
+	vpalignr	$4,%xmm2,%xmm3,%xmm7
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	vpaddd	%xmm7,%xmm0,%xmm0
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrld	$3,%xmm4,%xmm7
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	vpslld	$14,%xmm4,%xmm5
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,(%esp)
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	vpshufd	$250,%xmm3,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpsrld	$11,%xmm6,%xmm6
+	addl	32(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpxor	%xmm5,%xmm4,%xmm4
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	vpslld	$11,%xmm5,%xmm5
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	16(%esp),%esi
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	vpaddd	%xmm4,%xmm0,%xmm0
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,28(%esp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	vpsrlq	$19,%xmm7,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	36(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	vpshufd	$132,%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	vpsrldq	$8,%xmm7,%xmm7
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	12(%esp),%esi
+	vpaddd	%xmm7,%xmm0,%xmm0
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	vpshufd	$80,%xmm0,%xmm7
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,24(%esp)
+	vpsrlq	$19,%xmm7,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpshufd	$232,%xmm6,%xmm7
+	addl	40(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpslldq	$8,%xmm7,%xmm7
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	vpaddd	%xmm7,%xmm0,%xmm0
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	8(%esp),%esi
+	vpaddd	(%ebp),%xmm0,%xmm6
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	44(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	vmovdqa	%xmm6,32(%esp)
+	vpalignr	$4,%xmm1,%xmm2,%xmm4
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	4(%esp),%esi
+	vpalignr	$4,%xmm3,%xmm0,%xmm7
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	vpaddd	%xmm7,%xmm1,%xmm1
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrld	$3,%xmm4,%xmm7
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	vpslld	$14,%xmm4,%xmm5
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,16(%esp)
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	vpshufd	$250,%xmm0,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpsrld	$11,%xmm6,%xmm6
+	addl	48(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpxor	%xmm5,%xmm4,%xmm4
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	vpslld	$11,%xmm5,%xmm5
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	(%esp),%esi
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	vpaddd	%xmm4,%xmm1,%xmm1
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,12(%esp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	vpsrlq	$19,%xmm7,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	52(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	vpshufd	$132,%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	vpsrldq	$8,%xmm7,%xmm7
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	28(%esp),%esi
+	vpaddd	%xmm7,%xmm1,%xmm1
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	vpshufd	$80,%xmm1,%xmm7
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,8(%esp)
+	vpsrlq	$19,%xmm7,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpshufd	$232,%xmm6,%xmm7
+	addl	56(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpslldq	$8,%xmm7,%xmm7
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	vpaddd	%xmm7,%xmm1,%xmm1
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	24(%esp),%esi
+	vpaddd	16(%ebp),%xmm1,%xmm6
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	60(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	vmovdqa	%xmm6,48(%esp)
+	vpalignr	$4,%xmm2,%xmm3,%xmm4
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	20(%esp),%esi
+	vpalignr	$4,%xmm0,%xmm1,%xmm7
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	vpaddd	%xmm7,%xmm2,%xmm2
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrld	$3,%xmm4,%xmm7
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	vpslld	$14,%xmm4,%xmm5
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,(%esp)
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	vpshufd	$250,%xmm1,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpsrld	$11,%xmm6,%xmm6
+	addl	64(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpxor	%xmm5,%xmm4,%xmm4
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	vpslld	$11,%xmm5,%xmm5
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	16(%esp),%esi
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	vpaddd	%xmm4,%xmm2,%xmm2
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,28(%esp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	vpsrlq	$19,%xmm7,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	68(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	vpshufd	$132,%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	vpsrldq	$8,%xmm7,%xmm7
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	12(%esp),%esi
+	vpaddd	%xmm7,%xmm2,%xmm2
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	vpshufd	$80,%xmm2,%xmm7
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,24(%esp)
+	vpsrlq	$19,%xmm7,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpshufd	$232,%xmm6,%xmm7
+	addl	72(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpslldq	$8,%xmm7,%xmm7
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	vpaddd	%xmm7,%xmm2,%xmm2
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	8(%esp),%esi
+	vpaddd	32(%ebp),%xmm2,%xmm6
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	76(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	vmovdqa	%xmm6,64(%esp)
+	vpalignr	$4,%xmm3,%xmm0,%xmm4
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	4(%esp),%esi
+	vpalignr	$4,%xmm1,%xmm2,%xmm7
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	vpaddd	%xmm7,%xmm3,%xmm3
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrld	$3,%xmm4,%xmm7
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	vpslld	$14,%xmm4,%xmm5
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,16(%esp)
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	vpshufd	$250,%xmm2,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpsrld	$11,%xmm6,%xmm6
+	addl	80(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpxor	%xmm5,%xmm4,%xmm4
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	vpslld	$11,%xmm5,%xmm5
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	(%esp),%esi
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	vpaddd	%xmm4,%xmm3,%xmm3
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,12(%esp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	vpsrlq	$19,%xmm7,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	84(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	vpshufd	$132,%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	vpsrldq	$8,%xmm7,%xmm7
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	28(%esp),%esi
+	vpaddd	%xmm7,%xmm3,%xmm3
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	vpshufd	$80,%xmm3,%xmm7
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,8(%esp)
+	vpsrlq	$19,%xmm7,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpshufd	$232,%xmm6,%xmm7
+	addl	88(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpslldq	$8,%xmm7,%xmm7
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	vpaddd	%xmm7,%xmm3,%xmm3
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	24(%esp),%esi
+	vpaddd	48(%ebp),%xmm3,%xmm6
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	92(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	vmovdqa	%xmm6,80(%esp)
+	cmpl	$66051,64(%ebp)
+	jne	.L016avx_00_47
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	20(%esp),%esi
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	32(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	36(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,24(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	40(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	44(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	4(%esp),%esi
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	48(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	52(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,8(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	56(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	60(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	20(%esp),%esi
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	64(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	68(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,24(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	72(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	76(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	4(%esp),%esi
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	80(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	84(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,8(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	88(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	92(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movl	96(%esp),%esi
+	xorl	%edi,%ebx
+	movl	12(%esp),%ecx
+	addl	(%esi),%eax
+	addl	4(%esi),%ebx
+	addl	8(%esi),%edi
+	addl	12(%esi),%ecx
+	movl	%eax,(%esi)
+	movl	%ebx,4(%esi)
+	movl	%edi,8(%esi)
+	movl	%ecx,12(%esi)
+	movl	%ebx,4(%esp)
+	xorl	%edi,%ebx
+	movl	%edi,8(%esp)
+	movl	%ecx,12(%esp)
+	movl	20(%esp),%edi
+	movl	24(%esp),%ecx
+	addl	16(%esi),%edx
+	addl	20(%esi),%edi
+	addl	24(%esi),%ecx
+	movl	%edx,16(%esi)
+	movl	%edi,20(%esi)
+	movl	%edi,20(%esp)
+	movl	28(%esp),%edi
+	movl	%ecx,24(%esi)
+	addl	28(%esi),%edi
+	movl	%ecx,24(%esp)
+	movl	%edi,28(%esi)
+	movl	%edi,28(%esp)
+	movl	100(%esp),%edi
+	vmovdqa	64(%ebp),%xmm7
+	subl	$192,%ebp
+	cmpl	104(%esp),%edi
+	jb	.L015grand_avx
+	movl	108(%esp),%esp
+	vzeroall
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	32
+.L014AVX_BMI:
+	leal	-96(%esp),%esp
+	vzeroall
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%edi
+	movl	%ebx,4(%esp)
+	xorl	%ecx,%ebx
+	movl	%ecx,8(%esp)
+	movl	%edi,12(%esp)
+	movl	16(%esi),%edx
+	movl	20(%esi),%edi
+	movl	24(%esi),%ecx
+	movl	28(%esi),%esi
+	movl	%edi,20(%esp)
+	movl	100(%esp),%edi
+	movl	%ecx,24(%esp)
+	movl	%esi,28(%esp)
+	vmovdqa	256(%ebp),%xmm7
+	jmp	.L017grand_avx_bmi
+.align	32
+.L017grand_avx_bmi:
+	vmovdqu	(%edi),%xmm0
+	vmovdqu	16(%edi),%xmm1
+	vmovdqu	32(%edi),%xmm2
+	vmovdqu	48(%edi),%xmm3
+	addl	$64,%edi
+	vpshufb	%xmm7,%xmm0,%xmm0
+	movl	%edi,100(%esp)
+	vpshufb	%xmm7,%xmm1,%xmm1
+	vpshufb	%xmm7,%xmm2,%xmm2
+	vpaddd	(%ebp),%xmm0,%xmm4
+	vpshufb	%xmm7,%xmm3,%xmm3
+	vpaddd	16(%ebp),%xmm1,%xmm5
+	vpaddd	32(%ebp),%xmm2,%xmm6
+	vpaddd	48(%ebp),%xmm3,%xmm7
+	vmovdqa	%xmm4,32(%esp)
+	vmovdqa	%xmm5,48(%esp)
+	vmovdqa	%xmm6,64(%esp)
+	vmovdqa	%xmm7,80(%esp)
+	jmp	.L018avx_bmi_00_47
+.align	16
+.L018avx_bmi_00_47:
+	addl	$64,%ebp
+	vpalignr	$4,%xmm0,%xmm1,%xmm4
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,16(%esp)
+	vpalignr	$4,%xmm2,%xmm3,%xmm7
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	24(%esp),%edx,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	xorl	%edi,%ecx
+	andl	20(%esp),%edx
+	movl	%eax,(%esp)
+	vpaddd	%xmm7,%xmm0,%xmm0
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	vpsrld	$3,%xmm4,%xmm7
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	vpslld	$14,%xmm4,%xmm5
+	movl	4(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	vpxor	%xmm6,%xmm7,%xmm4
+	addl	28(%esp),%edx
+	andl	%eax,%ebx
+	addl	32(%esp),%edx
+	vpshufd	$250,%xmm3,%xmm7
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	12(%esp),%edx
+	vpsrld	$11,%xmm6,%xmm6
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%edx,12(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	vpslld	$11,%xmm5,%xmm5
+	andnl	20(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	16(%esp),%edx
+	vpxor	%xmm6,%xmm4,%xmm4
+	movl	%ebx,28(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	vpsrlq	$17,%xmm7,%xmm5
+	addl	24(%esp),%edx
+	andl	%ebx,%eax
+	addl	36(%esp),%edx
+	vpaddd	%xmm4,%xmm0,%xmm0
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	8(%esp),%edx
+	vpxor	%xmm5,%xmm6,%xmm6
+	leal	(%eax,%ecx,1),%eax
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	vpsrlq	$19,%xmm7,%xmm7
+	movl	%edx,8(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	andnl	16(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	12(%esp),%edx
+	vpshufd	$132,%xmm6,%xmm7
+	movl	%eax,24(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	vpsrldq	$8,%xmm7,%xmm7
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	vpaddd	%xmm7,%xmm0,%xmm0
+	movl	28(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	vpshufd	$80,%xmm0,%xmm7
+	addl	20(%esp),%edx
+	andl	%eax,%ebx
+	addl	40(%esp),%edx
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	4(%esp),%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%edx,4(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	vpsrlq	$19,%xmm7,%xmm7
+	andnl	12(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	8(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%ebx,20(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	vpshufd	$232,%xmm6,%xmm7
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	vpslldq	$8,%xmm7,%xmm7
+	movl	24(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	vpaddd	%xmm7,%xmm0,%xmm0
+	addl	16(%esp),%edx
+	andl	%ebx,%eax
+	addl	44(%esp),%edx
+	vpaddd	(%ebp),%xmm0,%xmm6
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	(%esp),%edx
+	leal	(%eax,%ecx,1),%eax
+	vmovdqa	%xmm6,32(%esp)
+	vpalignr	$4,%xmm1,%xmm2,%xmm4
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,(%esp)
+	vpalignr	$4,%xmm3,%xmm0,%xmm7
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	8(%esp),%edx,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	xorl	%edi,%ecx
+	andl	4(%esp),%edx
+	movl	%eax,16(%esp)
+	vpaddd	%xmm7,%xmm1,%xmm1
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	vpsrld	$3,%xmm4,%xmm7
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	vpslld	$14,%xmm4,%xmm5
+	movl	20(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	vpxor	%xmm6,%xmm7,%xmm4
+	addl	12(%esp),%edx
+	andl	%eax,%ebx
+	addl	48(%esp),%edx
+	vpshufd	$250,%xmm0,%xmm7
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	28(%esp),%edx
+	vpsrld	$11,%xmm6,%xmm6
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%edx,28(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	vpslld	$11,%xmm5,%xmm5
+	andnl	4(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	(%esp),%edx
+	vpxor	%xmm6,%xmm4,%xmm4
+	movl	%ebx,12(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	16(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	vpsrlq	$17,%xmm7,%xmm5
+	addl	8(%esp),%edx
+	andl	%ebx,%eax
+	addl	52(%esp),%edx
+	vpaddd	%xmm4,%xmm1,%xmm1
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	24(%esp),%edx
+	vpxor	%xmm5,%xmm6,%xmm6
+	leal	(%eax,%ecx,1),%eax
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	vpsrlq	$19,%xmm7,%xmm7
+	movl	%edx,24(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	andnl	(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	28(%esp),%edx
+	vpshufd	$132,%xmm6,%xmm7
+	movl	%eax,8(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	vpsrldq	$8,%xmm7,%xmm7
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	vpaddd	%xmm7,%xmm1,%xmm1
+	movl	12(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	vpshufd	$80,%xmm1,%xmm7
+	addl	4(%esp),%edx
+	andl	%eax,%ebx
+	addl	56(%esp),%edx
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	20(%esp),%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%edx,20(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	vpsrlq	$19,%xmm7,%xmm7
+	andnl	28(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	24(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%ebx,4(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	vpshufd	$232,%xmm6,%xmm7
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	vpslldq	$8,%xmm7,%xmm7
+	movl	8(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	vpaddd	%xmm7,%xmm1,%xmm1
+	addl	(%esp),%edx
+	andl	%ebx,%eax
+	addl	60(%esp),%edx
+	vpaddd	16(%ebp),%xmm1,%xmm6
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	16(%esp),%edx
+	leal	(%eax,%ecx,1),%eax
+	vmovdqa	%xmm6,48(%esp)
+	vpalignr	$4,%xmm2,%xmm3,%xmm4
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,16(%esp)
+	vpalignr	$4,%xmm0,%xmm1,%xmm7
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	24(%esp),%edx,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	xorl	%edi,%ecx
+	andl	20(%esp),%edx
+	movl	%eax,(%esp)
+	vpaddd	%xmm7,%xmm2,%xmm2
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	vpsrld	$3,%xmm4,%xmm7
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	vpslld	$14,%xmm4,%xmm5
+	movl	4(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	vpxor	%xmm6,%xmm7,%xmm4
+	addl	28(%esp),%edx
+	andl	%eax,%ebx
+	addl	64(%esp),%edx
+	vpshufd	$250,%xmm1,%xmm7
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	12(%esp),%edx
+	vpsrld	$11,%xmm6,%xmm6
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%edx,12(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	vpslld	$11,%xmm5,%xmm5
+	andnl	20(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	16(%esp),%edx
+	vpxor	%xmm6,%xmm4,%xmm4
+	movl	%ebx,28(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	vpsrlq	$17,%xmm7,%xmm5
+	addl	24(%esp),%edx
+	andl	%ebx,%eax
+	addl	68(%esp),%edx
+	vpaddd	%xmm4,%xmm2,%xmm2
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	8(%esp),%edx
+	vpxor	%xmm5,%xmm6,%xmm6
+	leal	(%eax,%ecx,1),%eax
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	vpsrlq	$19,%xmm7,%xmm7
+	movl	%edx,8(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	andnl	16(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	12(%esp),%edx
+	vpshufd	$132,%xmm6,%xmm7
+	movl	%eax,24(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	vpsrldq	$8,%xmm7,%xmm7
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	vpaddd	%xmm7,%xmm2,%xmm2
+	movl	28(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	vpshufd	$80,%xmm2,%xmm7
+	addl	20(%esp),%edx
+	andl	%eax,%ebx
+	addl	72(%esp),%edx
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	4(%esp),%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%edx,4(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	vpsrlq	$19,%xmm7,%xmm7
+	andnl	12(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	8(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%ebx,20(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	vpshufd	$232,%xmm6,%xmm7
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	vpslldq	$8,%xmm7,%xmm7
+	movl	24(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	vpaddd	%xmm7,%xmm2,%xmm2
+	addl	16(%esp),%edx
+	andl	%ebx,%eax
+	addl	76(%esp),%edx
+	vpaddd	32(%ebp),%xmm2,%xmm6
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	(%esp),%edx
+	leal	(%eax,%ecx,1),%eax
+	vmovdqa	%xmm6,64(%esp)
+	vpalignr	$4,%xmm3,%xmm0,%xmm4
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,(%esp)
+	vpalignr	$4,%xmm1,%xmm2,%xmm7
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	8(%esp),%edx,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	xorl	%edi,%ecx
+	andl	4(%esp),%edx
+	movl	%eax,16(%esp)
+	vpaddd	%xmm7,%xmm3,%xmm3
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	vpsrld	$3,%xmm4,%xmm7
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	vpslld	$14,%xmm4,%xmm5
+	movl	20(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	vpxor	%xmm6,%xmm7,%xmm4
+	addl	12(%esp),%edx
+	andl	%eax,%ebx
+	addl	80(%esp),%edx
+	vpshufd	$250,%xmm2,%xmm7
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	28(%esp),%edx
+	vpsrld	$11,%xmm6,%xmm6
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%edx,28(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	vpslld	$11,%xmm5,%xmm5
+	andnl	4(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	(%esp),%edx
+	vpxor	%xmm6,%xmm4,%xmm4
+	movl	%ebx,12(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	16(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	vpsrlq	$17,%xmm7,%xmm5
+	addl	8(%esp),%edx
+	andl	%ebx,%eax
+	addl	84(%esp),%edx
+	vpaddd	%xmm4,%xmm3,%xmm3
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	24(%esp),%edx
+	vpxor	%xmm5,%xmm6,%xmm6
+	leal	(%eax,%ecx,1),%eax
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	vpsrlq	$19,%xmm7,%xmm7
+	movl	%edx,24(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	andnl	(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	28(%esp),%edx
+	vpshufd	$132,%xmm6,%xmm7
+	movl	%eax,8(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	vpsrldq	$8,%xmm7,%xmm7
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	vpaddd	%xmm7,%xmm3,%xmm3
+	movl	12(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	vpshufd	$80,%xmm3,%xmm7
+	addl	4(%esp),%edx
+	andl	%eax,%ebx
+	addl	88(%esp),%edx
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	20(%esp),%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%edx,20(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	vpsrlq	$19,%xmm7,%xmm7
+	andnl	28(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	24(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%ebx,4(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	vpshufd	$232,%xmm6,%xmm7
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	vpslldq	$8,%xmm7,%xmm7
+	movl	8(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	vpaddd	%xmm7,%xmm3,%xmm3
+	addl	(%esp),%edx
+	andl	%ebx,%eax
+	addl	92(%esp),%edx
+	vpaddd	48(%ebp),%xmm3,%xmm6
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	16(%esp),%edx
+	leal	(%eax,%ecx,1),%eax
+	vmovdqa	%xmm6,80(%esp)
+	cmpl	$66051,64(%ebp)
+	jne	.L018avx_bmi_00_47
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,16(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	24(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	20(%esp),%edx
+	movl	%eax,(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	movl	4(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	andl	%eax,%ebx
+	addl	32(%esp),%edx
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	12(%esp),%edx
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,12(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	20(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	16(%esp),%edx
+	movl	%ebx,28(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	movl	(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	andl	%ebx,%eax
+	addl	36(%esp),%edx
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	8(%esp),%edx
+	leal	(%eax,%ecx,1),%eax
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,8(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	16(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	12(%esp),%edx
+	movl	%eax,24(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	movl	28(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	andl	%eax,%ebx
+	addl	40(%esp),%edx
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	4(%esp),%edx
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,4(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	12(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	8(%esp),%edx
+	movl	%ebx,20(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	movl	24(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	andl	%ebx,%eax
+	addl	44(%esp),%edx
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	(%esp),%edx
+	leal	(%eax,%ecx,1),%eax
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	8(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	4(%esp),%edx
+	movl	%eax,16(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	movl	20(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	andl	%eax,%ebx
+	addl	48(%esp),%edx
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	28(%esp),%edx
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,28(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	4(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	(%esp),%edx
+	movl	%ebx,12(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	movl	16(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	andl	%ebx,%eax
+	addl	52(%esp),%edx
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	24(%esp),%edx
+	leal	(%eax,%ecx,1),%eax
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,24(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	28(%esp),%edx
+	movl	%eax,8(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	movl	12(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	andl	%eax,%ebx
+	addl	56(%esp),%edx
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	20(%esp),%edx
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,20(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	28(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	24(%esp),%edx
+	movl	%ebx,4(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	movl	8(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	andl	%ebx,%eax
+	addl	60(%esp),%edx
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	16(%esp),%edx
+	leal	(%eax,%ecx,1),%eax
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,16(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	24(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	20(%esp),%edx
+	movl	%eax,(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	movl	4(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	andl	%eax,%ebx
+	addl	64(%esp),%edx
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	12(%esp),%edx
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,12(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	20(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	16(%esp),%edx
+	movl	%ebx,28(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	movl	(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	andl	%ebx,%eax
+	addl	68(%esp),%edx
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	8(%esp),%edx
+	leal	(%eax,%ecx,1),%eax
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,8(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	16(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	12(%esp),%edx
+	movl	%eax,24(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	movl	28(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	andl	%eax,%ebx
+	addl	72(%esp),%edx
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	4(%esp),%edx
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,4(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	12(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	8(%esp),%edx
+	movl	%ebx,20(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	movl	24(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	andl	%ebx,%eax
+	addl	76(%esp),%edx
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	(%esp),%edx
+	leal	(%eax,%ecx,1),%eax
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	8(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	4(%esp),%edx
+	movl	%eax,16(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	movl	20(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	andl	%eax,%ebx
+	addl	80(%esp),%edx
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	28(%esp),%edx
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,28(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	4(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	(%esp),%edx
+	movl	%ebx,12(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	movl	16(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	andl	%ebx,%eax
+	addl	84(%esp),%edx
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	24(%esp),%edx
+	leal	(%eax,%ecx,1),%eax
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,24(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	28(%esp),%edx
+	movl	%eax,8(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%eax,%edi
+	rorxl	$13,%eax,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%eax,%ecx
+	xorl	%edi,%esi
+	movl	12(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	andl	%eax,%ebx
+	addl	88(%esp),%edx
+	xorl	%edi,%ebx
+	addl	%edx,%ecx
+	addl	20(%esp),%edx
+	leal	(%ebx,%ecx,1),%ebx
+	rorxl	$6,%edx,%ecx
+	rorxl	$11,%edx,%esi
+	movl	%edx,20(%esp)
+	rorxl	$25,%edx,%edi
+	xorl	%esi,%ecx
+	andnl	28(%esp),%edx,%esi
+	xorl	%edi,%ecx
+	andl	24(%esp),%edx
+	movl	%ebx,4(%esp)
+	orl	%esi,%edx
+	rorxl	$2,%ebx,%edi
+	rorxl	$13,%ebx,%esi
+	leal	(%edx,%ecx,1),%edx
+	rorxl	$22,%ebx,%ecx
+	xorl	%edi,%esi
+	movl	8(%esp),%edi
+	xorl	%esi,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	andl	%ebx,%eax
+	addl	92(%esp),%edx
+	xorl	%edi,%eax
+	addl	%edx,%ecx
+	addl	16(%esp),%edx
+	leal	(%eax,%ecx,1),%eax
+	movl	96(%esp),%esi
+	xorl	%edi,%ebx
+	movl	12(%esp),%ecx
+	addl	(%esi),%eax
+	addl	4(%esi),%ebx
+	addl	8(%esi),%edi
+	addl	12(%esi),%ecx
+	movl	%eax,(%esi)
+	movl	%ebx,4(%esi)
+	movl	%edi,8(%esi)
+	movl	%ecx,12(%esi)
+	movl	%ebx,4(%esp)
+	xorl	%edi,%ebx
+	movl	%edi,8(%esp)
+	movl	%ecx,12(%esp)
+	movl	20(%esp),%edi
+	movl	24(%esp),%ecx
+	addl	16(%esi),%edx
+	addl	20(%esi),%edi
+	addl	24(%esi),%ecx
+	movl	%edx,16(%esi)
+	movl	%edi,20(%esi)
+	movl	%edi,20(%esp)
+	movl	28(%esp),%edi
+	movl	%ecx,24(%esi)
+	addl	28(%esi),%edi
+	movl	%ecx,24(%esp)
+	movl	%edi,28(%esi)
+	movl	%edi,28(%esp)
+	movl	100(%esp),%edi
+	vmovdqa	64(%ebp),%xmm7
+	subl	$192,%ebp
+	cmpl	104(%esp),%edi
+	jb	.L017grand_avx_bmi
+	movl	108(%esp),%esp
+	vzeroall
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
 .size	sha256_block_data_order,.-.L_sha256_block_data_order_begin
-.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
-.byte	110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
-.byte	67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
-.byte	112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
-.byte	62,0
+.comm	OPENSSL_ia32cap_P,16,4
 #endif
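
For orientation only (this note and the sketch below are a reader's aid, not part of the patch or of the generated sha256-586.pl output): the hunk above is the tail of the regenerated sha256-586.S, which adds SSSE3 (.L012grand_ssse3), AVX (.L005AVX / .L015grand_avx) and AVX+BMI (.L014AVX_BMI, rorxl/andnl) round loops for sha256_block_data_order. The rorxl $6/$11/$25 and rorxl $2/$13/$22 sequences are the SHA-256 Sigma1(e) and Sigma0(a) rotations, andnl supplies the (~e & g) half of Ch, and the vpsrld $7 / vpsrld $3 / vpslld $14 and vpsrld $10 / vpsrlq $17 / vpsrlq $19 blocks in the message-schedule code compute sigma0 and sigma1. A minimal C sketch of those helpers and of one round, taken from the SHA-256 specification (FIPS 180-4) rather than from this commit, looks like:

    #include <stdint.h>

    uint32_t rotr32(uint32_t x, unsigned n)
    {
        return (x >> n) | (x << (32 - n));
    }

    /* Big sigmas: rorxl $6/$11/$25 and rorxl $2/$13/$22 in the BMI path */
    uint32_t Sigma1(uint32_t e) { return rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25); }
    uint32_t Sigma0(uint32_t a) { return rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22); }

    /* Ch: andnl yields the ~e & g term; the asm folds Maj into an xor chain across rounds */
    uint32_t Ch(uint32_t e, uint32_t f, uint32_t g)  { return (e & f) ^ (~e & g); }
    uint32_t Maj(uint32_t a, uint32_t b, uint32_t c) { return (a & b) ^ (a & c) ^ (b & c); }

    /* Small sigmas from the message schedule (the vpsrld/vpslld/vpsrlq blocks) */
    uint32_t sigma0(uint32_t x) { return rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3); }
    uint32_t sigma1(uint32_t x) { return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10); }

    /* One round; Kt_plus_Wt is the pre-added key+message word the asm keeps at 32(%esp)..92(%esp) */
    void sha256_round(uint32_t s[8], uint32_t Kt_plus_Wt)
    {
        uint32_t T1 = s[7] + Sigma1(s[4]) + Ch(s[4], s[5], s[6]) + Kt_plus_Wt;
        uint32_t T2 = Sigma0(s[0]) + Maj(s[0], s[1], s[2]);
        s[7] = s[6]; s[6] = s[5]; s[5] = s[4]; s[4] = s[3] + T1;
        s[3] = s[2]; s[2] = s[1]; s[1] = s[0]; s[0] = T1 + T2;
    }

The non-BMI paths obtain the same rotations from shrdl chains, using the identity Sigma1(e) == rotr32(rotr32(rotr32(e, 14) ^ e, 5) ^ e, 6), which is the shrdl $14/$5/$6 pattern repeated throughout the hunk.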

Modified: trunk/secure/lib/libcrypto/i386/sha512-586.S
===================================================================
--- trunk/secure/lib/libcrypto/i386/sha512-586.S	2019-01-20 05:38:15 UTC (rev 12153)
+++ trunk/secure/lib/libcrypto/i386/sha512-586.S	2019-01-20 05:38:27 UTC (rev 12154)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/i386/sha512-586.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from sha512-586.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/i386/sha512-586.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from sha512-586.pl. */
 #ifdef PIC
 .file	"sha512-586.S"
 .text
@@ -29,251 +29,2244 @@
 	movl	%edi,4(%esp)
 	movl	%eax,8(%esp)
 	movl	%ebx,12(%esp)
-	leal	_GLOBAL_OFFSET_TABLE_+[.-.L001K512](%ebp),%edx
-	movl	OPENSSL_ia32cap_P@GOT(%edx),%edx
-	btl	$26,(%edx)
-	jnc	.L002loop_x86
+	leal	OPENSSL_ia32cap_P-.L001K512(%ebp),%edx
+	movl	(%edx),%ecx
+	testl	$67108864,%ecx
+	jz	.L002loop_x86
+	movl	4(%edx),%edx
 	movq	(%esi),%mm0
+	andl	$16777216,%ecx
 	movq	8(%esi),%mm1
+	andl	$512,%edx
 	movq	16(%esi),%mm2
+	orl	%edx,%ecx
 	movq	24(%esi),%mm3
 	movq	32(%esi),%mm4
 	movq	40(%esi),%mm5
 	movq	48(%esi),%mm6
 	movq	56(%esi),%mm7
+	cmpl	$16777728,%ecx
+	je	.L003SSSE3
 	subl	$80,%esp
+	jmp	.L004loop_sse2
 .align	16
-.L003loop_sse2:
+.L004loop_sse2:
 	movq	%mm1,8(%esp)
 	movq	%mm2,16(%esp)
 	movq	%mm3,24(%esp)
 	movq	%mm5,40(%esp)
 	movq	%mm6,48(%esp)
+	pxor	%mm1,%mm2
 	movq	%mm7,56(%esp)
-	movl	(%edi),%ecx
-	movl	4(%edi),%edx
+	movq	%mm0,%mm3
+	movl	(%edi),%eax
+	movl	4(%edi),%ebx
 	addl	$8,%edi
-	bswap	%ecx
-	bswap	%edx
-	movl	%ecx,76(%esp)
-	movl	%edx,72(%esp)
+	movl	$15,%edx
+	bswap	%eax
+	bswap	%ebx
+	jmp	.L00500_14_sse2
 .align	16
-.L00400_14_sse2:
+.L00500_14_sse2:
+	movd	%eax,%mm1
 	movl	(%edi),%eax
+	movd	%ebx,%mm7
 	movl	4(%edi),%ebx
 	addl	$8,%edi
 	bswap	%eax
 	bswap	%ebx
-	movl	%eax,68(%esp)
-	movl	%ebx,64(%esp)
+	punpckldq	%mm1,%mm7
+	movq	%mm4,%mm1
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	movq	%mm3,%mm0
+	movq	%mm7,72(%esp)
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	paddq	(%ebp),%mm7
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	subl	$8,%esp
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
 	movq	40(%esp),%mm5
+	paddq	%mm2,%mm3
+	movq	%mm0,%mm2
+	addl	$8,%ebp
+	paddq	%mm6,%mm3
 	movq	48(%esp),%mm6
-	movq	56(%esp),%mm7
+	decl	%edx
+	jnz	.L00500_14_sse2
+	movd	%eax,%mm1
+	movd	%ebx,%mm7
+	punpckldq	%mm1,%mm7
 	movq	%mm4,%mm1
-	movq	%mm4,%mm2
+	pxor	%mm6,%mm5
 	psrlq	$14,%mm1
 	movq	%mm4,32(%esp)
-	psllq	$23,%mm2
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	movq	%mm3,%mm0
+	movq	%mm7,72(%esp)
 	movq	%mm1,%mm3
 	psrlq	$4,%mm1
-	pxor	%mm2,%mm3
-	psllq	$23,%mm2
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
 	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
 	psrlq	$23,%mm1
-	pxor	%mm2,%mm3
-	psllq	$4,%mm2
+	paddq	56(%esp),%mm7
 	pxor	%mm1,%mm3
+	psllq	$4,%mm4
 	paddq	(%ebp),%mm7
-	pxor	%mm2,%mm3
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	subl	$8,%esp
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	192(%esp),%mm7
+	paddq	%mm2,%mm3
+	movq	%mm0,%mm2
+	addl	$8,%ebp
+	paddq	%mm6,%mm3
+	pxor	%mm0,%mm0
+	movl	$32,%edx
+	jmp	.L00616_79_sse2
+.align	16
+.L00616_79_sse2:
+	movq	88(%esp),%mm5
+	movq	%mm7,%mm1
+	psrlq	$1,%mm7
+	movq	%mm5,%mm6
+	psrlq	$6,%mm5
+	psllq	$56,%mm1
+	paddq	%mm3,%mm0
+	movq	%mm7,%mm3
+	psrlq	$6,%mm7
+	pxor	%mm1,%mm3
+	psllq	$7,%mm1
+	pxor	%mm7,%mm3
+	psrlq	$1,%mm7
+	pxor	%mm1,%mm3
+	movq	%mm5,%mm1
+	psrlq	$13,%mm5
+	pxor	%mm3,%mm7
+	psllq	$3,%mm6
+	pxor	%mm5,%mm1
+	paddq	200(%esp),%mm7
+	pxor	%mm6,%mm1
+	psrlq	$42,%mm5
+	paddq	128(%esp),%mm7
+	pxor	%mm5,%mm1
+	psllq	$42,%mm6
+	movq	40(%esp),%mm5
+	pxor	%mm6,%mm1
+	movq	48(%esp),%mm6
+	paddq	%mm1,%mm7
+	movq	%mm4,%mm1
 	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	movq	%mm7,72(%esp)
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	paddq	(%ebp),%mm7
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
 	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	subl	$8,%esp
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	192(%esp),%mm7
+	paddq	%mm6,%mm2
+	addl	$8,%ebp
+	movq	88(%esp),%mm5
+	movq	%mm7,%mm1
+	psrlq	$1,%mm7
+	movq	%mm5,%mm6
+	psrlq	$6,%mm5
+	psllq	$56,%mm1
+	paddq	%mm3,%mm2
+	movq	%mm7,%mm3
+	psrlq	$6,%mm7
+	pxor	%mm1,%mm3
+	psllq	$7,%mm1
+	pxor	%mm7,%mm3
+	psrlq	$1,%mm7
+	pxor	%mm1,%mm3
+	movq	%mm5,%mm1
+	psrlq	$13,%mm5
+	pxor	%mm3,%mm7
+	psllq	$3,%mm6
+	pxor	%mm5,%mm1
+	paddq	200(%esp),%mm7
+	pxor	%mm6,%mm1
+	psrlq	$42,%mm5
+	paddq	128(%esp),%mm7
+	pxor	%mm5,%mm1
+	psllq	$42,%mm6
+	movq	40(%esp),%mm5
+	pxor	%mm6,%mm1
+	movq	48(%esp),%mm6
+	paddq	%mm1,%mm7
+	movq	%mm4,%mm1
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
 	pand	%mm4,%mm5
-	movq	16(%esp),%mm2
+	psllq	$23,%mm4
+	movq	%mm7,72(%esp)
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
 	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	paddq	(%ebp),%mm7
+	pxor	%mm4,%mm3
 	movq	24(%esp),%mm4
-	paddq	%mm5,%mm3
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	subl	$8,%esp
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	192(%esp),%mm7
+	paddq	%mm6,%mm0
+	addl	$8,%ebp
+	decl	%edx
+	jnz	.L00616_79_sse2
+	paddq	%mm3,%mm0
+	movq	8(%esp),%mm1
+	movq	24(%esp),%mm3
+	movq	40(%esp),%mm5
+	movq	48(%esp),%mm6
+	movq	56(%esp),%mm7
+	pxor	%mm1,%mm2
+	paddq	(%esi),%mm0
+	paddq	8(%esi),%mm1
+	paddq	16(%esi),%mm2
+	paddq	24(%esi),%mm3
+	paddq	32(%esi),%mm4
+	paddq	40(%esi),%mm5
+	paddq	48(%esi),%mm6
+	paddq	56(%esi),%mm7
+	movl	$640,%eax
+	movq	%mm0,(%esi)
+	movq	%mm1,8(%esi)
+	movq	%mm2,16(%esi)
+	movq	%mm3,24(%esi)
+	movq	%mm4,32(%esi)
+	movq	%mm5,40(%esi)
+	movq	%mm6,48(%esi)
+	movq	%mm7,56(%esi)
+	leal	(%esp,%eax,1),%esp
+	subl	%eax,%ebp
+	cmpl	88(%esp),%edi
+	jb	.L004loop_sse2
+	movl	92(%esp),%esp
+	emms
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	32
+.L003SSSE3:
+	leal	-64(%esp),%edx
+	subl	$256,%esp
+	movdqa	640(%ebp),%xmm1
+	movdqu	(%edi),%xmm0
+.byte	102,15,56,0,193
+	movdqa	(%ebp),%xmm3
+	movdqa	%xmm1,%xmm2
+	movdqu	16(%edi),%xmm1
+	paddq	%xmm0,%xmm3
+.byte	102,15,56,0,202
+	movdqa	%xmm3,-128(%edx)
+	movdqa	16(%ebp),%xmm4
+	movdqa	%xmm2,%xmm3
+	movdqu	32(%edi),%xmm2
+	paddq	%xmm1,%xmm4
+.byte	102,15,56,0,211
+	movdqa	%xmm4,-112(%edx)
+	movdqa	32(%ebp),%xmm5
+	movdqa	%xmm3,%xmm4
+	movdqu	48(%edi),%xmm3
+	paddq	%xmm2,%xmm5
+.byte	102,15,56,0,220
+	movdqa	%xmm5,-96(%edx)
+	movdqa	48(%ebp),%xmm6
+	movdqa	%xmm4,%xmm5
+	movdqu	64(%edi),%xmm4
+	paddq	%xmm3,%xmm6
+.byte	102,15,56,0,229
+	movdqa	%xmm6,-80(%edx)
+	movdqa	64(%ebp),%xmm7
+	movdqa	%xmm5,%xmm6
+	movdqu	80(%edi),%xmm5
+	paddq	%xmm4,%xmm7
+.byte	102,15,56,0,238
+	movdqa	%xmm7,-64(%edx)
+	movdqa	%xmm0,(%edx)
+	movdqa	80(%ebp),%xmm0
+	movdqa	%xmm6,%xmm7
+	movdqu	96(%edi),%xmm6
+	paddq	%xmm5,%xmm0
+.byte	102,15,56,0,247
+	movdqa	%xmm0,-48(%edx)
+	movdqa	%xmm1,16(%edx)
+	movdqa	96(%ebp),%xmm1
+	movdqa	%xmm7,%xmm0
+	movdqu	112(%edi),%xmm7
+	paddq	%xmm6,%xmm1
+.byte	102,15,56,0,248
+	movdqa	%xmm1,-32(%edx)
+	movdqa	%xmm2,32(%edx)
+	movdqa	112(%ebp),%xmm2
+	movdqa	(%edx),%xmm0
+	paddq	%xmm7,%xmm2
+	movdqa	%xmm2,-16(%edx)
+	nop
+.align	32
+.L007loop_ssse3:
+	movdqa	16(%edx),%xmm2
+	movdqa	%xmm3,48(%edx)
+	leal	128(%ebp),%ebp
+	movq	%mm1,8(%esp)
+	movl	%edi,%ebx
+	movq	%mm2,16(%esp)
+	leal	128(%edi),%edi
+	movq	%mm3,24(%esp)
+	cmpl	%eax,%edi
+	movq	%mm5,40(%esp)
+	cmovbl	%edi,%ebx
+	movq	%mm6,48(%esp)
+	movl	$4,%ecx
+	pxor	%mm1,%mm2
+	movq	%mm7,56(%esp)
+	pxor	%mm3,%mm3
+	jmp	.L00800_47_ssse3
+.align	32
+.L00800_47_ssse3:
+	movdqa	%xmm5,%xmm3
+	movdqa	%xmm2,%xmm1
+.byte	102,15,58,15,208,8
+	movdqa	%xmm4,(%edx)
+.byte	102,15,58,15,220,8
+	movdqa	%xmm2,%xmm4
+	psrlq	$7,%xmm2
+	paddq	%xmm3,%xmm0
+	movdqa	%xmm4,%xmm3
+	psrlq	$1,%xmm4
+	psllq	$56,%xmm3
+	pxor	%xmm4,%xmm2
+	psrlq	$7,%xmm4
+	pxor	%xmm3,%xmm2
+	psllq	$7,%xmm3
+	pxor	%xmm4,%xmm2
+	movdqa	%xmm7,%xmm4
+	pxor	%xmm3,%xmm2
+	movdqa	%xmm7,%xmm3
+	psrlq	$6,%xmm4
+	paddq	%xmm2,%xmm0
+	movdqa	%xmm7,%xmm2
+	psrlq	$19,%xmm3
+	psllq	$3,%xmm2
+	pxor	%xmm3,%xmm4
+	psrlq	$42,%xmm3
+	pxor	%xmm2,%xmm4
+	psllq	$42,%xmm2
+	pxor	%xmm3,%xmm4
+	movdqa	32(%edx),%xmm3
+	pxor	%xmm2,%xmm4
+	movdqa	(%ebp),%xmm2
+	movq	%mm4,%mm1
+	paddq	%xmm4,%xmm0
+	movq	-128(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	paddq	%xmm0,%xmm2
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
 	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
 	paddq	%mm7,%mm3
 	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
 	movq	%mm0,%mm6
-	paddq	72(%esp),%mm3
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	32(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	40(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-120(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,24(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,56(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	48(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	16(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
 	psrlq	$28,%mm5
 	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
 	psllq	$25,%mm6
+	movq	(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	24(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	32(%esp),%mm6
+	movdqa	%xmm2,-128(%edx)
+	movdqa	%xmm6,%xmm4
+	movdqa	%xmm3,%xmm2
+.byte	102,15,58,15,217,8
+	movdqa	%xmm5,16(%edx)
+.byte	102,15,58,15,229,8
+	movdqa	%xmm3,%xmm5
+	psrlq	$7,%xmm3
+	paddq	%xmm4,%xmm1
+	movdqa	%xmm5,%xmm4
+	psrlq	$1,%xmm5
+	psllq	$56,%xmm4
+	pxor	%xmm5,%xmm3
+	psrlq	$7,%xmm5
+	pxor	%xmm4,%xmm3
+	psllq	$7,%xmm4
+	pxor	%xmm5,%xmm3
+	movdqa	%xmm0,%xmm5
+	pxor	%xmm4,%xmm3
+	movdqa	%xmm0,%xmm4
+	psrlq	$6,%xmm5
+	paddq	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm3
+	psrlq	$19,%xmm4
+	psllq	$3,%xmm3
+	pxor	%xmm4,%xmm5
+	psrlq	$42,%xmm4
+	pxor	%xmm3,%xmm5
+	psllq	$42,%xmm3
+	pxor	%xmm4,%xmm5
+	movdqa	48(%edx),%xmm4
+	pxor	%xmm3,%xmm5
+	movdqa	16(%ebp),%xmm3
+	movq	%mm4,%mm1
+	paddq	%xmm5,%xmm1
+	movq	-112(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,16(%esp)
+	paddq	%xmm1,%xmm3
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,48(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	40(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	8(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
 	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	56(%esp),%mm1
 	psrlq	$6,%mm5
 	pxor	%mm6,%mm7
 	psllq	$5,%mm6
 	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
 	psrlq	$5,%mm5
 	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
 	psllq	$6,%mm6
 	pxor	%mm5,%mm7
-	subl	$8,%esp
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	16(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	24(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-104(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,8(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,40(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	32(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	48(%esp),%mm1
+	psrlq	$6,%mm5
 	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	8(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	16(%esp),%mm6
+	movdqa	%xmm3,-112(%edx)
+	movdqa	%xmm7,%xmm5
+	movdqa	%xmm4,%xmm3
+.byte	102,15,58,15,226,8
+	movdqa	%xmm6,32(%edx)
+.byte	102,15,58,15,238,8
+	movdqa	%xmm4,%xmm6
+	psrlq	$7,%xmm4
+	paddq	%xmm5,%xmm2
+	movdqa	%xmm6,%xmm5
+	psrlq	$1,%xmm6
+	psllq	$56,%xmm5
+	pxor	%xmm6,%xmm4
+	psrlq	$7,%xmm6
+	pxor	%xmm5,%xmm4
+	psllq	$7,%xmm5
+	pxor	%xmm6,%xmm4
+	movdqa	%xmm1,%xmm6
+	pxor	%xmm5,%xmm4
+	movdqa	%xmm1,%xmm5
+	psrlq	$6,%xmm6
+	paddq	%xmm4,%xmm2
+	movdqa	%xmm1,%xmm4
+	psrlq	$19,%xmm5
+	psllq	$3,%xmm4
+	pxor	%xmm5,%xmm6
+	psrlq	$42,%xmm5
+	pxor	%xmm4,%xmm6
+	psllq	$42,%xmm4
+	pxor	%xmm5,%xmm6
+	movdqa	(%edx),%xmm5
+	pxor	%xmm4,%xmm6
+	movdqa	32(%ebp),%xmm4
+	movq	%mm4,%mm1
+	paddq	%xmm6,%xmm2
+	movq	-96(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,(%esp)
+	paddq	%xmm2,%xmm4
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,32(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	24(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	56(%esp),%mm4
+	paddq	%mm7,%mm3
 	movq	%mm0,%mm5
-	por	%mm2,%mm0
-	pand	%mm2,%mm5
-	pand	%mm1,%mm0
-	por	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	40(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	8(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-88(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,56(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,24(%esp)
 	paddq	%mm5,%mm7
-	movq	%mm3,%mm0
-	movb	(%ebp),%dl
-	paddq	%mm7,%mm0
-	addl	$8,%ebp
-	cmpb	$53,%dl
-	jne	.L00400_14_sse2
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	16(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	48(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	32(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	56(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	(%esp),%mm6
+	movdqa	%xmm4,-96(%edx)
+	movdqa	%xmm0,%xmm6
+	movdqa	%xmm5,%xmm4
+.byte	102,15,58,15,235,8
+	movdqa	%xmm7,48(%edx)
+.byte	102,15,58,15,247,8
+	movdqa	%xmm5,%xmm7
+	psrlq	$7,%xmm5
+	paddq	%xmm6,%xmm3
+	movdqa	%xmm7,%xmm6
+	psrlq	$1,%xmm7
+	psllq	$56,%xmm6
+	pxor	%xmm7,%xmm5
+	psrlq	$7,%xmm7
+	pxor	%xmm6,%xmm5
+	psllq	$7,%xmm6
+	pxor	%xmm7,%xmm5
+	movdqa	%xmm2,%xmm7
+	pxor	%xmm6,%xmm5
+	movdqa	%xmm2,%xmm6
+	psrlq	$6,%xmm7
+	paddq	%xmm5,%xmm3
+	movdqa	%xmm2,%xmm5
+	psrlq	$19,%xmm6
+	psllq	$3,%xmm5
+	pxor	%xmm6,%xmm7
+	psrlq	$42,%xmm6
+	pxor	%xmm5,%xmm7
+	psllq	$42,%xmm5
+	pxor	%xmm6,%xmm7
+	movdqa	16(%edx),%xmm6
+	pxor	%xmm5,%xmm7
+	movdqa	48(%ebp),%xmm5
+	movq	%mm4,%mm1
+	paddq	%xmm7,%xmm3
+	movq	-80(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,48(%esp)
+	paddq	%xmm3,%xmm5
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,16(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	8(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	40(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	24(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	48(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	56(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-72(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,40(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,8(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	32(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	16(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
 	movq	40(%esp),%mm5
+	paddq	%mm6,%mm0
 	movq	48(%esp),%mm6
-	movq	56(%esp),%mm7
+	movdqa	%xmm5,-80(%edx)
+	movdqa	%xmm1,%xmm7
+	movdqa	%xmm6,%xmm5
+.byte	102,15,58,15,244,8
+	movdqa	%xmm0,(%edx)
+.byte	102,15,58,15,248,8
+	movdqa	%xmm6,%xmm0
+	psrlq	$7,%xmm6
+	paddq	%xmm7,%xmm4
+	movdqa	%xmm0,%xmm7
+	psrlq	$1,%xmm0
+	psllq	$56,%xmm7
+	pxor	%xmm0,%xmm6
+	psrlq	$7,%xmm0
+	pxor	%xmm7,%xmm6
+	psllq	$7,%xmm7
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm3,%xmm0
+	pxor	%xmm7,%xmm6
+	movdqa	%xmm3,%xmm7
+	psrlq	$6,%xmm0
+	paddq	%xmm6,%xmm4
+	movdqa	%xmm3,%xmm6
+	psrlq	$19,%xmm7
+	psllq	$3,%xmm6
+	pxor	%xmm7,%xmm0
+	psrlq	$42,%xmm7
+	pxor	%xmm6,%xmm0
+	psllq	$42,%xmm6
+	pxor	%xmm7,%xmm0
+	movdqa	32(%edx),%xmm7
+	pxor	%xmm6,%xmm0
+	movdqa	64(%ebp),%xmm6
 	movq	%mm4,%mm1
-	movq	%mm4,%mm2
+	paddq	%xmm0,%xmm4
+	movq	-64(%edx),%mm7
+	pxor	%mm6,%mm5
 	psrlq	$14,%mm1
 	movq	%mm4,32(%esp)
-	psllq	$23,%mm2
+	paddq	%xmm4,%xmm6
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
 	movq	%mm1,%mm3
 	psrlq	$4,%mm1
-	pxor	%mm2,%mm3
-	psllq	$23,%mm2
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
 	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
 	psrlq	$23,%mm1
-	pxor	%mm2,%mm3
-	psllq	$4,%mm2
+	paddq	56(%esp),%mm7
 	pxor	%mm1,%mm3
-	paddq	(%ebp),%mm7
-	pxor	%mm2,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	32(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	40(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-56(%edx),%mm7
 	pxor	%mm6,%mm5
-	movq	8(%esp),%mm1
+	psrlq	$14,%mm1
+	movq	%mm4,24(%esp)
 	pand	%mm4,%mm5
-	movq	16(%esp),%mm2
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
 	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,56(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	48(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	16(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	24(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	32(%esp),%mm6
+	movdqa	%xmm6,-64(%edx)
+	movdqa	%xmm2,%xmm0
+	movdqa	%xmm7,%xmm6
+.byte	102,15,58,15,253,8
+	movdqa	%xmm1,16(%edx)
+.byte	102,15,58,15,193,8
+	movdqa	%xmm7,%xmm1
+	psrlq	$7,%xmm7
+	paddq	%xmm0,%xmm5
+	movdqa	%xmm1,%xmm0
+	psrlq	$1,%xmm1
+	psllq	$56,%xmm0
+	pxor	%xmm1,%xmm7
+	psrlq	$7,%xmm1
+	pxor	%xmm0,%xmm7
+	psllq	$7,%xmm0
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm4,%xmm1
+	pxor	%xmm0,%xmm7
+	movdqa	%xmm4,%xmm0
+	psrlq	$6,%xmm1
+	paddq	%xmm7,%xmm5
+	movdqa	%xmm4,%xmm7
+	psrlq	$19,%xmm0
+	psllq	$3,%xmm7
+	pxor	%xmm0,%xmm1
+	psrlq	$42,%xmm0
+	pxor	%xmm7,%xmm1
+	psllq	$42,%xmm7
+	pxor	%xmm0,%xmm1
+	movdqa	48(%edx),%xmm0
+	pxor	%xmm7,%xmm1
+	movdqa	80(%ebp),%xmm7
+	movq	%mm4,%mm1
+	paddq	%xmm1,%xmm5
+	movq	-48(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,16(%esp)
+	paddq	%xmm5,%xmm7
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,48(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	40(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	8(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	56(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	16(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	24(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-40(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,8(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,40(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	32(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	48(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	8(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	16(%esp),%mm6
+	movdqa	%xmm7,-48(%edx)
+	movdqa	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm7
+.byte	102,15,58,15,198,8
+	movdqa	%xmm2,32(%edx)
+.byte	102,15,58,15,202,8
+	movdqa	%xmm0,%xmm2
+	psrlq	$7,%xmm0
+	paddq	%xmm1,%xmm6
+	movdqa	%xmm2,%xmm1
+	psrlq	$1,%xmm2
+	psllq	$56,%xmm1
+	pxor	%xmm2,%xmm0
+	psrlq	$7,%xmm2
+	pxor	%xmm1,%xmm0
+	psllq	$7,%xmm1
+	pxor	%xmm2,%xmm0
+	movdqa	%xmm5,%xmm2
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm5,%xmm1
+	psrlq	$6,%xmm2
+	paddq	%xmm0,%xmm6
+	movdqa	%xmm5,%xmm0
+	psrlq	$19,%xmm1
+	psllq	$3,%xmm0
+	pxor	%xmm1,%xmm2
+	psrlq	$42,%xmm1
+	pxor	%xmm0,%xmm2
+	psllq	$42,%xmm0
+	pxor	%xmm1,%xmm2
+	movdqa	(%edx),%xmm1
+	pxor	%xmm0,%xmm2
+	movdqa	96(%ebp),%xmm0
+	movq	%mm4,%mm1
+	paddq	%xmm2,%xmm6
+	movq	-32(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,(%esp)
+	paddq	%xmm6,%xmm0
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,32(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	24(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	56(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	40(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	8(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-24(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,56(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,24(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	16(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	48(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	32(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	56(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	(%esp),%mm6
+	movdqa	%xmm0,-32(%edx)
+	movdqa	%xmm4,%xmm2
+	movdqa	%xmm1,%xmm0
+.byte	102,15,58,15,207,8
+	movdqa	%xmm3,48(%edx)
+.byte	102,15,58,15,211,8
+	movdqa	%xmm1,%xmm3
+	psrlq	$7,%xmm1
+	paddq	%xmm2,%xmm7
+	movdqa	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	psllq	$56,%xmm2
+	pxor	%xmm3,%xmm1
+	psrlq	$7,%xmm3
+	pxor	%xmm2,%xmm1
+	psllq	$7,%xmm2
+	pxor	%xmm3,%xmm1
+	movdqa	%xmm6,%xmm3
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm6,%xmm2
+	psrlq	$6,%xmm3
+	paddq	%xmm1,%xmm7
+	movdqa	%xmm6,%xmm1
+	psrlq	$19,%xmm2
+	psllq	$3,%xmm1
+	pxor	%xmm2,%xmm3
+	psrlq	$42,%xmm2
+	pxor	%xmm1,%xmm3
+	psllq	$42,%xmm1
+	pxor	%xmm2,%xmm3
+	movdqa	16(%edx),%xmm2
+	pxor	%xmm1,%xmm3
+	movdqa	112(%ebp),%xmm1
+	movq	%mm4,%mm1
+	paddq	%xmm3,%xmm7
+	movq	-16(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,48(%esp)
+	paddq	%xmm7,%xmm1
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,16(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	8(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	40(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	24(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	48(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	56(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-8(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,40(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,8(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	32(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	16(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	40(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	48(%esp),%mm6
+	movdqa	%xmm1,-16(%edx)
+	leal	128(%ebp),%ebp
+	decl	%ecx
+	jnz	.L00800_47_ssse3
+	movdqa	(%ebp),%xmm1
+	leal	-640(%ebp),%ebp
+	movdqu	(%ebx),%xmm0
+.byte	102,15,56,0,193
+	movdqa	(%ebp),%xmm3
+	movdqa	%xmm1,%xmm2
+	movdqu	16(%ebx),%xmm1
+	paddq	%xmm0,%xmm3
+.byte	102,15,56,0,202
+	movq	%mm4,%mm1
+	movq	-128(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
 	movq	24(%esp),%mm4
-	paddq	%mm5,%mm3
-	movq	%mm0,(%esp)
 	paddq	%mm7,%mm3
 	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
 	movq	%mm0,%mm6
-	paddq	72(%esp),%mm3
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	32(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	40(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-120(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,24(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,56(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	48(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	16(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
 	psrlq	$28,%mm5
 	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
 	psllq	$25,%mm6
+	movq	(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	24(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	32(%esp),%mm6
+	movdqa	%xmm3,-128(%edx)
+	movdqa	16(%ebp),%xmm4
+	movdqa	%xmm2,%xmm3
+	movdqu	32(%ebx),%xmm2
+	paddq	%xmm1,%xmm4
+.byte	102,15,56,0,211
+	movq	%mm4,%mm1
+	movq	-112(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,16(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,48(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	40(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	8(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
 	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	56(%esp),%mm1
 	psrlq	$6,%mm5
 	pxor	%mm6,%mm7
 	psllq	$5,%mm6
 	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
 	psrlq	$5,%mm5
 	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
 	psllq	$6,%mm6
 	pxor	%mm5,%mm7
-	subl	$8,%esp
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	16(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	24(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-104(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,8(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,40(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	32(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	48(%esp),%mm1
+	psrlq	$6,%mm5
 	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	8(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	16(%esp),%mm6
+	movdqa	%xmm4,-112(%edx)
+	movdqa	32(%ebp),%xmm5
+	movdqa	%xmm3,%xmm4
+	movdqu	48(%ebx),%xmm3
+	paddq	%xmm2,%xmm5
+.byte	102,15,56,0,220
+	movq	%mm4,%mm1
+	movq	-96(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,32(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	24(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	56(%esp),%mm4
+	paddq	%mm7,%mm3
 	movq	%mm0,%mm5
-	por	%mm2,%mm0
-	movq	88(%esp),%mm6
-	pand	%mm2,%mm5
-	pand	%mm1,%mm0
-	movq	192(%esp),%mm2
-	por	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	40(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	8(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-88(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,56(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,24(%esp)
 	paddq	%mm5,%mm7
-	movq	%mm3,%mm0
-	movb	(%ebp),%dl
-	paddq	%mm7,%mm0
-	addl	$8,%ebp
-.align	16
-.L00516_79_sse2:
-	movq	%mm2,%mm1
-	psrlq	$1,%mm2
-	movq	%mm6,%mm7
-	psrlq	$6,%mm6
-	movq	%mm2,%mm3
-	psrlq	$6,%mm2
-	movq	%mm6,%mm5
-	psrlq	$13,%mm6
-	pxor	%mm2,%mm3
-	psrlq	$1,%mm2
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	16(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	48(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	32(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	56(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	(%esp),%mm6
+	movdqa	%xmm5,-96(%edx)
+	movdqa	48(%ebp),%xmm6
+	movdqa	%xmm4,%xmm5
+	movdqu	64(%ebx),%xmm4
+	paddq	%xmm3,%xmm6
+.byte	102,15,56,0,229
+	movq	%mm4,%mm1
+	movq	-80(%edx),%mm7
 	pxor	%mm6,%mm5
-	psrlq	$42,%mm6
-	pxor	%mm2,%mm3
-	movq	200(%esp),%mm2
-	psllq	$56,%mm1
+	psrlq	$14,%mm1
+	movq	%mm4,48(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
 	pxor	%mm6,%mm5
-	psllq	$3,%mm7
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
 	pxor	%mm1,%mm3
-	paddq	128(%esp),%mm2
-	psllq	$7,%mm1
-	pxor	%mm7,%mm5
-	psllq	$42,%mm7
+	movq	%mm0,16(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	8(%esp),%mm7
 	pxor	%mm1,%mm3
-	pxor	%mm7,%mm5
-	paddq	%mm5,%mm3
-	paddq	%mm2,%mm3
-	movq	%mm3,72(%esp)
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	40(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	24(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	48(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	56(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-72(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,40(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,8(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	32(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	16(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
 	movq	40(%esp),%mm5
+	paddq	%mm6,%mm0
 	movq	48(%esp),%mm6
-	movq	56(%esp),%mm7
+	movdqa	%xmm6,-80(%edx)
+	movdqa	64(%ebp),%xmm7
+	movdqa	%xmm5,%xmm6
+	movdqu	80(%ebx),%xmm5
+	paddq	%xmm4,%xmm7
+.byte	102,15,56,0,238
 	movq	%mm4,%mm1
-	movq	%mm4,%mm2
+	movq	-64(%edx),%mm7
+	pxor	%mm6,%mm5
 	psrlq	$14,%mm1
 	movq	%mm4,32(%esp)
-	psllq	$23,%mm2
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
 	movq	%mm1,%mm3
 	psrlq	$4,%mm1
-	pxor	%mm2,%mm3
-	psllq	$23,%mm2
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
 	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
 	psrlq	$23,%mm1
-	pxor	%mm2,%mm3
-	psllq	$4,%mm2
+	paddq	56(%esp),%mm7
 	pxor	%mm1,%mm3
-	paddq	(%ebp),%mm7
-	pxor	%mm2,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	32(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	40(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-56(%edx),%mm7
 	pxor	%mm6,%mm5
-	movq	8(%esp),%mm1
+	psrlq	$14,%mm1
+	movq	%mm4,24(%esp)
 	pand	%mm4,%mm5
-	movq	16(%esp),%mm2
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
 	pxor	%mm6,%mm5
-	movq	24(%esp),%mm4
-	paddq	%mm5,%mm3
-	movq	%mm0,(%esp)
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,56(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	48(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	16(%esp),%mm4
 	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	24(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	32(%esp),%mm6
+	movdqa	%xmm7,-64(%edx)
+	movdqa	%xmm0,(%edx)
+	movdqa	80(%ebp),%xmm0
+	movdqa	%xmm6,%xmm7
+	movdqu	96(%ebx),%xmm6
+	paddq	%xmm5,%xmm0
+.byte	102,15,56,0,247
+	movq	%mm4,%mm1
+	movq	-48(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,16(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,48(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	40(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	8(%esp),%mm4
+	paddq	%mm7,%mm3
 	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
 	movq	%mm0,%mm6
-	paddq	72(%esp),%mm3
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	56(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	16(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	24(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-40(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,8(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,40(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	32(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
 	psrlq	$28,%mm5
 	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
 	psllq	$25,%mm6
+	movq	48(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	8(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	16(%esp),%mm6
+	movdqa	%xmm0,-48(%edx)
+	movdqa	%xmm1,16(%edx)
+	movdqa	96(%ebp),%xmm1
+	movdqa	%xmm7,%xmm0
+	movdqu	112(%ebx),%xmm7
+	paddq	%xmm6,%xmm1
+.byte	102,15,56,0,248
+	movq	%mm4,%mm1
+	movq	-32(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,32(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	24(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	56(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
 	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	40(%esp),%mm1
 	psrlq	$6,%mm5
 	pxor	%mm6,%mm7
 	psllq	$5,%mm6
 	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
 	psrlq	$5,%mm5
 	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
 	psllq	$6,%mm6
 	pxor	%mm5,%mm7
-	subl	$8,%esp
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	8(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-24(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,56(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,24(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	16(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	48(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	32(%esp),%mm1
+	psrlq	$6,%mm5
 	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	56(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	(%esp),%mm6
+	movdqa	%xmm1,-32(%edx)
+	movdqa	%xmm2,32(%edx)
+	movdqa	112(%ebp),%xmm2
+	movdqa	(%edx),%xmm0
+	paddq	%xmm7,%xmm2
+	movq	%mm4,%mm1
+	movq	-16(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,48(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,16(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	8(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	40(%esp),%mm4
+	paddq	%mm7,%mm3
 	movq	%mm0,%mm5
-	por	%mm2,%mm0
-	movq	88(%esp),%mm6
-	pand	%mm2,%mm5
-	pand	%mm1,%mm0
-	movq	192(%esp),%mm2
-	por	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	24(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	48(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	56(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-8(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,40(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,8(%esp)
 	paddq	%mm5,%mm7
-	movq	%mm3,%mm0
-	movb	(%ebp),%dl
-	paddq	%mm7,%mm0
-	addl	$8,%ebp
-	cmpb	$23,%dl
-	jne	.L00516_79_sse2
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	32(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	16(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	40(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	48(%esp),%mm6
+	movdqa	%xmm2,-16(%edx)
 	movq	8(%esp),%mm1
-	movq	16(%esp),%mm2
+	paddq	%mm3,%mm0
 	movq	24(%esp),%mm3
-	movq	40(%esp),%mm5
-	movq	48(%esp),%mm6
 	movq	56(%esp),%mm7
+	pxor	%mm1,%mm2
 	paddq	(%esi),%mm0
 	paddq	8(%esi),%mm1
 	paddq	16(%esi),%mm2
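
For orientation: the long paired psrlq/psllq/pxor runs in the hunk above synthesise SHA-512's 64-bit rotates from shifts, since neither MMX nor SSE2 provides a 64-bit rotate on i386. A minimal C sketch of the standard FIPS 180-4 functions the generated rounds are equivalent to (illustrative names, not symbols from this file; the asm computes algebraically equivalent forms):

#include <stdint.h>

/* 64-bit rotate right built from two shifts and an OR, exactly as the
 * MMX/SSE2 code does (valid for 0 < n < 64). */
static inline uint64_t rotr64(uint64_t x, unsigned n)
{
    return (x >> n) | (x << (64 - n));
}

/* Round-function sigmas applied to the working variables a and e. */
static inline uint64_t Sigma0(uint64_t a) { return rotr64(a,28) ^ rotr64(a,34) ^ rotr64(a,39); }
static inline uint64_t Sigma1(uint64_t e) { return rotr64(e,14) ^ rotr64(e,18) ^ rotr64(e,41); }

/* Message-schedule sigmas used in the 16..79 expansion (the xmm blocks). */
static inline uint64_t sigma0(uint64_t w) { return rotr64(w,1)  ^ rotr64(w,8)  ^ (w >> 7); }
static inline uint64_t sigma1(uint64_t w) { return rotr64(w,19) ^ rotr64(w,61) ^ (w >> 6); }

/* Ch and Maj, which the pand/pxor sequences compute in equivalent forms. */
static inline uint64_t Ch(uint64_t e, uint64_t f, uint64_t g)  { return g ^ (e & (f ^ g)); }
static inline uint64_t Maj(uint64_t a, uint64_t b, uint64_t c) { return (a & b) | (c & (a | b)); }
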
@@ -290,12 +2283,10 @@
 	movq	%mm5,40(%esi)
 	movq	%mm6,48(%esi)
 	movq	%mm7,56(%esi)
-	addl	$640,%esp
-	subl	$640,%ebp
-	cmpl	88(%esp),%edi
-	jb	.L003loop_sse2
+	cmpl	%eax,%edi
+	jb	.L007loop_ssse3
+	movl	76(%edx),%esp
 	emms
-	movl	92(%esp),%esp
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -406,7 +2397,7 @@
 	movl	$16,%ecx
 .long	2784229001
 .align	16
-.L00600_15_x86:
+.L00900_15_x86:
 	movl	40(%esp),%ecx
 	movl	44(%esp),%edx
 	movl	%ecx,%esi
@@ -513,9 +2504,9 @@
 	subl	$8,%esp
 	leal	8(%ebp),%ebp
 	cmpb	$148,%dl
-	jne	.L00600_15_x86
+	jne	.L00900_15_x86
 .align	16
-.L00716_79_x86:
+.L01016_79_x86:
 	movl	312(%esp),%ecx
 	movl	316(%esp),%edx
 	movl	%ecx,%esi
@@ -688,7 +2679,7 @@
 	subl	$8,%esp
 	leal	8(%ebp),%ebp
 	cmpb	$23,%dl
-	jne	.L00716_79_x86
+	jne	.L01016_79_x86
 	movl	840(%esp),%esi
 	movl	844(%esp),%edi
 	movl	(%esi),%eax
@@ -831,6 +2822,8 @@
 .long	4234509866,1501505948
 .long	987167468,1607167915
 .long	1246189591,1816402316
+.long	67438087,66051
+.long	202182159,134810123
 .size	sha512_block_data_order,.-.L_sha512_block_data_order_begin
 .byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97
 .byte	110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
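
The two .long pairs added above (0x04050607,0x00010203 and 0x0c0d0e0f,0x08090a0b) form a 16-byte pshufb index table placed right after the 80 K constants (offset 640 into the table), which the SSSE3 path loads to byte-swap each 64-bit message lane from big-endian input order. A rough scalar model, assuming the usual pshufb semantics (all indices here are below 0x80, so the zeroing case never arises):

#include <stdint.h>

/* The mask bytes, as laid out in memory by the two .long pairs above. */
static const uint8_t bswap64x2_mask[16] = {
    7, 6, 5, 4, 3, 2, 1, 0,        /* reverse the low  64-bit lane */
    15, 14, 13, 12, 11, 10, 9, 8   /* reverse the high 64-bit lane */
};

/* Scalar model of "pshufb xmm, mask": dst[i] = src[mask[i]] whenever the
 * index's top bit is clear (always true for this mask). */
static void pshufb_bswap64x2(uint8_t dst[16], const uint8_t src[16])
{
    for (int i = 0; i < 16; i++)
        dst[i] = src[bswap64x2_mask[i]];
}
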
@@ -837,7 +2830,7 @@
 .byte	67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
 .byte	112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
 .byte	62,0
-.comm	OPENSSL_ia32cap_P,8,4
+.comm	OPENSSL_ia32cap_P,16,4
 #else
 .file	"sha512-586.S"
 .text
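
The .comm widening above (8 -> 16 bytes) goes with the new dispatch in the following hunk: instead of the single btl $26 SSE2 test, the code now also reads the second OPENSSL_ia32cap_P word and takes the SSSE3 path only when bit 24 of word 0 and bit 9 of word 1 are both set, which correspond to the FXSR and SSSE3 CPUID feature bits; 16777728 is simply 0x1000200, the OR of those two masks. An illustrative C view of the same test (a sketch only; the cap argument stands in for the first two OPENSSL_ia32cap_P words and is not the library's API):

#include <stdint.h>

enum sha512_path { PATH_X86, PATH_SSE2, PATH_SSSE3 };

/* cap[0]/cap[1] model the first two 32-bit words of OPENSSL_ia32cap_P
 * (EDX- and ECX-derived CPUID(1) feature bits, respectively). */
static enum sha512_path pick_path(const uint32_t cap[2])
{
    if (!(cap[0] & (1u << 26)))                         /* testl $67108864: no SSE2 */
        return PATH_X86;                                /* -> .L002loop_x86 */
    if ((cap[0] & (1u << 24)) && (cap[1] & (1u << 9)))  /* cmpl $16777728 */
        return PATH_SSSE3;                              /* -> .L003SSSE3 */
    return PATH_SSE2;                                   /* -> .L004loop_sse2 */
}
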
@@ -867,249 +2860,2243 @@
 	movl	%eax,8(%esp)
 	movl	%ebx,12(%esp)
 	leal	OPENSSL_ia32cap_P,%edx
-	btl	$26,(%edx)
-	jnc	.L002loop_x86
+	movl	(%edx),%ecx
+	testl	$67108864,%ecx
+	jz	.L002loop_x86
+	movl	4(%edx),%edx
 	movq	(%esi),%mm0
+	andl	$16777216,%ecx
 	movq	8(%esi),%mm1
+	andl	$512,%edx
 	movq	16(%esi),%mm2
+	orl	%edx,%ecx
 	movq	24(%esi),%mm3
 	movq	32(%esi),%mm4
 	movq	40(%esi),%mm5
 	movq	48(%esi),%mm6
 	movq	56(%esi),%mm7
+	cmpl	$16777728,%ecx
+	je	.L003SSSE3
 	subl	$80,%esp
+	jmp	.L004loop_sse2
 .align	16
-.L003loop_sse2:
+.L004loop_sse2:
 	movq	%mm1,8(%esp)
 	movq	%mm2,16(%esp)
 	movq	%mm3,24(%esp)
 	movq	%mm5,40(%esp)
 	movq	%mm6,48(%esp)
+	pxor	%mm1,%mm2
 	movq	%mm7,56(%esp)
-	movl	(%edi),%ecx
-	movl	4(%edi),%edx
+	movq	%mm0,%mm3
+	movl	(%edi),%eax
+	movl	4(%edi),%ebx
 	addl	$8,%edi
-	bswap	%ecx
-	bswap	%edx
-	movl	%ecx,76(%esp)
-	movl	%edx,72(%esp)
+	movl	$15,%edx
+	bswap	%eax
+	bswap	%ebx
+	jmp	.L00500_14_sse2
 .align	16
-.L00400_14_sse2:
+.L00500_14_sse2:
+	movd	%eax,%mm1
 	movl	(%edi),%eax
+	movd	%ebx,%mm7
 	movl	4(%edi),%ebx
 	addl	$8,%edi
 	bswap	%eax
 	bswap	%ebx
-	movl	%eax,68(%esp)
-	movl	%ebx,64(%esp)
+	punpckldq	%mm1,%mm7
+	movq	%mm4,%mm1
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	movq	%mm3,%mm0
+	movq	%mm7,72(%esp)
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	paddq	(%ebp),%mm7
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	subl	$8,%esp
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
 	movq	40(%esp),%mm5
+	paddq	%mm2,%mm3
+	movq	%mm0,%mm2
+	addl	$8,%ebp
+	paddq	%mm6,%mm3
 	movq	48(%esp),%mm6
-	movq	56(%esp),%mm7
+	decl	%edx
+	jnz	.L00500_14_sse2
+	movd	%eax,%mm1
+	movd	%ebx,%mm7
+	punpckldq	%mm1,%mm7
 	movq	%mm4,%mm1
-	movq	%mm4,%mm2
+	pxor	%mm6,%mm5
 	psrlq	$14,%mm1
 	movq	%mm4,32(%esp)
-	psllq	$23,%mm2
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	movq	%mm3,%mm0
+	movq	%mm7,72(%esp)
 	movq	%mm1,%mm3
 	psrlq	$4,%mm1
-	pxor	%mm2,%mm3
-	psllq	$23,%mm2
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
 	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
 	psrlq	$23,%mm1
-	pxor	%mm2,%mm3
-	psllq	$4,%mm2
+	paddq	56(%esp),%mm7
 	pxor	%mm1,%mm3
+	psllq	$4,%mm4
 	paddq	(%ebp),%mm7
-	pxor	%mm2,%mm3
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	subl	$8,%esp
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	192(%esp),%mm7
+	paddq	%mm2,%mm3
+	movq	%mm0,%mm2
+	addl	$8,%ebp
+	paddq	%mm6,%mm3
+	pxor	%mm0,%mm0
+	movl	$32,%edx
+	jmp	.L00616_79_sse2
+.align	16
+.L00616_79_sse2:
+	movq	88(%esp),%mm5
+	movq	%mm7,%mm1
+	psrlq	$1,%mm7
+	movq	%mm5,%mm6
+	psrlq	$6,%mm5
+	psllq	$56,%mm1
+	paddq	%mm3,%mm0
+	movq	%mm7,%mm3
+	psrlq	$6,%mm7
+	pxor	%mm1,%mm3
+	psllq	$7,%mm1
+	pxor	%mm7,%mm3
+	psrlq	$1,%mm7
+	pxor	%mm1,%mm3
+	movq	%mm5,%mm1
+	psrlq	$13,%mm5
+	pxor	%mm3,%mm7
+	psllq	$3,%mm6
+	pxor	%mm5,%mm1
+	paddq	200(%esp),%mm7
+	pxor	%mm6,%mm1
+	psrlq	$42,%mm5
+	paddq	128(%esp),%mm7
+	pxor	%mm5,%mm1
+	psllq	$42,%mm6
+	movq	40(%esp),%mm5
+	pxor	%mm6,%mm1
+	movq	48(%esp),%mm6
+	paddq	%mm1,%mm7
+	movq	%mm4,%mm1
 	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	movq	%mm7,72(%esp)
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	paddq	(%ebp),%mm7
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
 	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	subl	$8,%esp
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	192(%esp),%mm7
+	paddq	%mm6,%mm2
+	addl	$8,%ebp
+	movq	88(%esp),%mm5
+	movq	%mm7,%mm1
+	psrlq	$1,%mm7
+	movq	%mm5,%mm6
+	psrlq	$6,%mm5
+	psllq	$56,%mm1
+	paddq	%mm3,%mm2
+	movq	%mm7,%mm3
+	psrlq	$6,%mm7
+	pxor	%mm1,%mm3
+	psllq	$7,%mm1
+	pxor	%mm7,%mm3
+	psrlq	$1,%mm7
+	pxor	%mm1,%mm3
+	movq	%mm5,%mm1
+	psrlq	$13,%mm5
+	pxor	%mm3,%mm7
+	psllq	$3,%mm6
+	pxor	%mm5,%mm1
+	paddq	200(%esp),%mm7
+	pxor	%mm6,%mm1
+	psrlq	$42,%mm5
+	paddq	128(%esp),%mm7
+	pxor	%mm5,%mm1
+	psllq	$42,%mm6
+	movq	40(%esp),%mm5
+	pxor	%mm6,%mm1
+	movq	48(%esp),%mm6
+	paddq	%mm1,%mm7
+	movq	%mm4,%mm1
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
 	pand	%mm4,%mm5
-	movq	16(%esp),%mm2
+	psllq	$23,%mm4
+	movq	%mm7,72(%esp)
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
 	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	paddq	(%ebp),%mm7
+	pxor	%mm4,%mm3
 	movq	24(%esp),%mm4
-	paddq	%mm5,%mm3
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	subl	$8,%esp
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	192(%esp),%mm7
+	paddq	%mm6,%mm0
+	addl	$8,%ebp
+	decl	%edx
+	jnz	.L00616_79_sse2
+	paddq	%mm3,%mm0
+	movq	8(%esp),%mm1
+	movq	24(%esp),%mm3
+	movq	40(%esp),%mm5
+	movq	48(%esp),%mm6
+	movq	56(%esp),%mm7
+	pxor	%mm1,%mm2
+	paddq	(%esi),%mm0
+	paddq	8(%esi),%mm1
+	paddq	16(%esi),%mm2
+	paddq	24(%esi),%mm3
+	paddq	32(%esi),%mm4
+	paddq	40(%esi),%mm5
+	paddq	48(%esi),%mm6
+	paddq	56(%esi),%mm7
+	movl	$640,%eax
+	movq	%mm0,(%esi)
+	movq	%mm1,8(%esi)
+	movq	%mm2,16(%esi)
+	movq	%mm3,24(%esi)
+	movq	%mm4,32(%esi)
+	movq	%mm5,40(%esi)
+	movq	%mm6,48(%esi)
+	movq	%mm7,56(%esi)
+	leal	(%esp,%eax,1),%esp
+	subl	%eax,%ebp
+	cmpl	88(%esp),%edi
+	jb	.L004loop_sse2
+	movl	92(%esp),%esp
+	emms
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	32
+.L003SSSE3:
+	leal	-64(%esp),%edx
+	subl	$256,%esp
+	movdqa	640(%ebp),%xmm1
+	movdqu	(%edi),%xmm0
+.byte	102,15,56,0,193
+	movdqa	(%ebp),%xmm3
+	movdqa	%xmm1,%xmm2
+	movdqu	16(%edi),%xmm1
+	paddq	%xmm0,%xmm3
+.byte	102,15,56,0,202
+	movdqa	%xmm3,-128(%edx)
+	movdqa	16(%ebp),%xmm4
+	movdqa	%xmm2,%xmm3
+	movdqu	32(%edi),%xmm2
+	paddq	%xmm1,%xmm4
+.byte	102,15,56,0,211
+	movdqa	%xmm4,-112(%edx)
+	movdqa	32(%ebp),%xmm5
+	movdqa	%xmm3,%xmm4
+	movdqu	48(%edi),%xmm3
+	paddq	%xmm2,%xmm5
+.byte	102,15,56,0,220
+	movdqa	%xmm5,-96(%edx)
+	movdqa	48(%ebp),%xmm6
+	movdqa	%xmm4,%xmm5
+	movdqu	64(%edi),%xmm4
+	paddq	%xmm3,%xmm6
+.byte	102,15,56,0,229
+	movdqa	%xmm6,-80(%edx)
+	movdqa	64(%ebp),%xmm7
+	movdqa	%xmm5,%xmm6
+	movdqu	80(%edi),%xmm5
+	paddq	%xmm4,%xmm7
+.byte	102,15,56,0,238
+	movdqa	%xmm7,-64(%edx)
+	movdqa	%xmm0,(%edx)
+	movdqa	80(%ebp),%xmm0
+	movdqa	%xmm6,%xmm7
+	movdqu	96(%edi),%xmm6
+	paddq	%xmm5,%xmm0
+.byte	102,15,56,0,247
+	movdqa	%xmm0,-48(%edx)
+	movdqa	%xmm1,16(%edx)
+	movdqa	96(%ebp),%xmm1
+	movdqa	%xmm7,%xmm0
+	movdqu	112(%edi),%xmm7
+	paddq	%xmm6,%xmm1
+.byte	102,15,56,0,248
+	movdqa	%xmm1,-32(%edx)
+	movdqa	%xmm2,32(%edx)
+	movdqa	112(%ebp),%xmm2
+	movdqa	(%edx),%xmm0
+	paddq	%xmm7,%xmm2
+	movdqa	%xmm2,-16(%edx)
+	nop
+.align	32
+.L007loop_ssse3:
+	movdqa	16(%edx),%xmm2
+	movdqa	%xmm3,48(%edx)
+	leal	128(%ebp),%ebp
+	movq	%mm1,8(%esp)
+	movl	%edi,%ebx
+	movq	%mm2,16(%esp)
+	leal	128(%edi),%edi
+	movq	%mm3,24(%esp)
+	cmpl	%eax,%edi
+	movq	%mm5,40(%esp)
+	cmovbl	%edi,%ebx
+	movq	%mm6,48(%esp)
+	movl	$4,%ecx
+	pxor	%mm1,%mm2
+	movq	%mm7,56(%esp)
+	pxor	%mm3,%mm3
+	jmp	.L00800_47_ssse3
+.align	32
+.L00800_47_ssse3:
+	movdqa	%xmm5,%xmm3
+	movdqa	%xmm2,%xmm1
+.byte	102,15,58,15,208,8
+	movdqa	%xmm4,(%edx)
+.byte	102,15,58,15,220,8
+	movdqa	%xmm2,%xmm4
+	psrlq	$7,%xmm2
+	paddq	%xmm3,%xmm0
+	movdqa	%xmm4,%xmm3
+	psrlq	$1,%xmm4
+	psllq	$56,%xmm3
+	pxor	%xmm4,%xmm2
+	psrlq	$7,%xmm4
+	pxor	%xmm3,%xmm2
+	psllq	$7,%xmm3
+	pxor	%xmm4,%xmm2
+	movdqa	%xmm7,%xmm4
+	pxor	%xmm3,%xmm2
+	movdqa	%xmm7,%xmm3
+	psrlq	$6,%xmm4
+	paddq	%xmm2,%xmm0
+	movdqa	%xmm7,%xmm2
+	psrlq	$19,%xmm3
+	psllq	$3,%xmm2
+	pxor	%xmm3,%xmm4
+	psrlq	$42,%xmm3
+	pxor	%xmm2,%xmm4
+	psllq	$42,%xmm2
+	pxor	%xmm3,%xmm4
+	movdqa	32(%edx),%xmm3
+	pxor	%xmm2,%xmm4
+	movdqa	(%ebp),%xmm2
+	movq	%mm4,%mm1
+	paddq	%xmm4,%xmm0
+	movq	-128(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	paddq	%xmm0,%xmm2
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
 	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
 	paddq	%mm7,%mm3
 	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
 	movq	%mm0,%mm6
-	paddq	72(%esp),%mm3
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	32(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	40(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-120(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,24(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,56(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	48(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	16(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
 	psrlq	$28,%mm5
 	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
 	psllq	$25,%mm6
+	movq	(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	24(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	32(%esp),%mm6
+	movdqa	%xmm2,-128(%edx)
+	movdqa	%xmm6,%xmm4
+	movdqa	%xmm3,%xmm2
+.byte	102,15,58,15,217,8
+	movdqa	%xmm5,16(%edx)
+.byte	102,15,58,15,229,8
+	movdqa	%xmm3,%xmm5
+	psrlq	$7,%xmm3
+	paddq	%xmm4,%xmm1
+	movdqa	%xmm5,%xmm4
+	psrlq	$1,%xmm5
+	psllq	$56,%xmm4
+	pxor	%xmm5,%xmm3
+	psrlq	$7,%xmm5
+	pxor	%xmm4,%xmm3
+	psllq	$7,%xmm4
+	pxor	%xmm5,%xmm3
+	movdqa	%xmm0,%xmm5
+	pxor	%xmm4,%xmm3
+	movdqa	%xmm0,%xmm4
+	psrlq	$6,%xmm5
+	paddq	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm3
+	psrlq	$19,%xmm4
+	psllq	$3,%xmm3
+	pxor	%xmm4,%xmm5
+	psrlq	$42,%xmm4
+	pxor	%xmm3,%xmm5
+	psllq	$42,%xmm3
+	pxor	%xmm4,%xmm5
+	movdqa	48(%edx),%xmm4
+	pxor	%xmm3,%xmm5
+	movdqa	16(%ebp),%xmm3
+	movq	%mm4,%mm1
+	paddq	%xmm5,%xmm1
+	movq	-112(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,16(%esp)
+	paddq	%xmm1,%xmm3
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,48(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	40(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	8(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
 	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	56(%esp),%mm1
 	psrlq	$6,%mm5
 	pxor	%mm6,%mm7
 	psllq	$5,%mm6
 	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
 	psrlq	$5,%mm5
 	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
 	psllq	$6,%mm6
 	pxor	%mm5,%mm7
-	subl	$8,%esp
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	16(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	24(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-104(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,8(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,40(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	32(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	48(%esp),%mm1
+	psrlq	$6,%mm5
 	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	8(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	16(%esp),%mm6
+	movdqa	%xmm3,-112(%edx)
+	movdqa	%xmm7,%xmm5
+	movdqa	%xmm4,%xmm3
+.byte	102,15,58,15,226,8
+	movdqa	%xmm6,32(%edx)
+.byte	102,15,58,15,238,8
+	movdqa	%xmm4,%xmm6
+	psrlq	$7,%xmm4
+	paddq	%xmm5,%xmm2
+	movdqa	%xmm6,%xmm5
+	psrlq	$1,%xmm6
+	psllq	$56,%xmm5
+	pxor	%xmm6,%xmm4
+	psrlq	$7,%xmm6
+	pxor	%xmm5,%xmm4
+	psllq	$7,%xmm5
+	pxor	%xmm6,%xmm4
+	movdqa	%xmm1,%xmm6
+	pxor	%xmm5,%xmm4
+	movdqa	%xmm1,%xmm5
+	psrlq	$6,%xmm6
+	paddq	%xmm4,%xmm2
+	movdqa	%xmm1,%xmm4
+	psrlq	$19,%xmm5
+	psllq	$3,%xmm4
+	pxor	%xmm5,%xmm6
+	psrlq	$42,%xmm5
+	pxor	%xmm4,%xmm6
+	psllq	$42,%xmm4
+	pxor	%xmm5,%xmm6
+	movdqa	(%edx),%xmm5
+	pxor	%xmm4,%xmm6
+	movdqa	32(%ebp),%xmm4
+	movq	%mm4,%mm1
+	paddq	%xmm6,%xmm2
+	movq	-96(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,(%esp)
+	paddq	%xmm2,%xmm4
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,32(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	24(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	56(%esp),%mm4
+	paddq	%mm7,%mm3
 	movq	%mm0,%mm5
-	por	%mm2,%mm0
-	pand	%mm2,%mm5
-	pand	%mm1,%mm0
-	por	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	40(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	8(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-88(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,56(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,24(%esp)
 	paddq	%mm5,%mm7
-	movq	%mm3,%mm0
-	movb	(%ebp),%dl
-	paddq	%mm7,%mm0
-	addl	$8,%ebp
-	cmpb	$53,%dl
-	jne	.L00400_14_sse2
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	16(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	48(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	32(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	56(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	(%esp),%mm6
+	movdqa	%xmm4,-96(%edx)
+	movdqa	%xmm0,%xmm6
+	movdqa	%xmm5,%xmm4
+.byte	102,15,58,15,235,8
+	movdqa	%xmm7,48(%edx)
+.byte	102,15,58,15,247,8
+	movdqa	%xmm5,%xmm7
+	psrlq	$7,%xmm5
+	paddq	%xmm6,%xmm3
+	movdqa	%xmm7,%xmm6
+	psrlq	$1,%xmm7
+	psllq	$56,%xmm6
+	pxor	%xmm7,%xmm5
+	psrlq	$7,%xmm7
+	pxor	%xmm6,%xmm5
+	psllq	$7,%xmm6
+	pxor	%xmm7,%xmm5
+	movdqa	%xmm2,%xmm7
+	pxor	%xmm6,%xmm5
+	movdqa	%xmm2,%xmm6
+	psrlq	$6,%xmm7
+	paddq	%xmm5,%xmm3
+	movdqa	%xmm2,%xmm5
+	psrlq	$19,%xmm6
+	psllq	$3,%xmm5
+	pxor	%xmm6,%xmm7
+	psrlq	$42,%xmm6
+	pxor	%xmm5,%xmm7
+	psllq	$42,%xmm5
+	pxor	%xmm6,%xmm7
+	movdqa	16(%edx),%xmm6
+	pxor	%xmm5,%xmm7
+	movdqa	48(%ebp),%xmm5
+	movq	%mm4,%mm1
+	paddq	%xmm7,%xmm3
+	movq	-80(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,48(%esp)
+	paddq	%xmm3,%xmm5
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,16(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	8(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	40(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	24(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	48(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	56(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-72(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,40(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,8(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	32(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	16(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
 	movq	40(%esp),%mm5
+	paddq	%mm6,%mm0
 	movq	48(%esp),%mm6
-	movq	56(%esp),%mm7
+	movdqa	%xmm5,-80(%edx)
+	movdqa	%xmm1,%xmm7
+	movdqa	%xmm6,%xmm5
+.byte	102,15,58,15,244,8
+	movdqa	%xmm0,(%edx)
+.byte	102,15,58,15,248,8
+	movdqa	%xmm6,%xmm0
+	psrlq	$7,%xmm6
+	paddq	%xmm7,%xmm4
+	movdqa	%xmm0,%xmm7
+	psrlq	$1,%xmm0
+	psllq	$56,%xmm7
+	pxor	%xmm0,%xmm6
+	psrlq	$7,%xmm0
+	pxor	%xmm7,%xmm6
+	psllq	$7,%xmm7
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm3,%xmm0
+	pxor	%xmm7,%xmm6
+	movdqa	%xmm3,%xmm7
+	psrlq	$6,%xmm0
+	paddq	%xmm6,%xmm4
+	movdqa	%xmm3,%xmm6
+	psrlq	$19,%xmm7
+	psllq	$3,%xmm6
+	pxor	%xmm7,%xmm0
+	psrlq	$42,%xmm7
+	pxor	%xmm6,%xmm0
+	psllq	$42,%xmm6
+	pxor	%xmm7,%xmm0
+	movdqa	32(%edx),%xmm7
+	pxor	%xmm6,%xmm0
+	movdqa	64(%ebp),%xmm6
 	movq	%mm4,%mm1
-	movq	%mm4,%mm2
+	paddq	%xmm0,%xmm4
+	movq	-64(%edx),%mm7
+	pxor	%mm6,%mm5
 	psrlq	$14,%mm1
 	movq	%mm4,32(%esp)
-	psllq	$23,%mm2
+	paddq	%xmm4,%xmm6
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
 	movq	%mm1,%mm3
 	psrlq	$4,%mm1
-	pxor	%mm2,%mm3
-	psllq	$23,%mm2
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
 	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
 	psrlq	$23,%mm1
-	pxor	%mm2,%mm3
-	psllq	$4,%mm2
+	paddq	56(%esp),%mm7
 	pxor	%mm1,%mm3
-	paddq	(%ebp),%mm7
-	pxor	%mm2,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	32(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	40(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-56(%edx),%mm7
 	pxor	%mm6,%mm5
-	movq	8(%esp),%mm1
+	psrlq	$14,%mm1
+	movq	%mm4,24(%esp)
 	pand	%mm4,%mm5
-	movq	16(%esp),%mm2
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
 	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,56(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	48(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	16(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	24(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	32(%esp),%mm6
+	movdqa	%xmm6,-64(%edx)
+	movdqa	%xmm2,%xmm0
+	movdqa	%xmm7,%xmm6
+.byte	102,15,58,15,253,8
+	movdqa	%xmm1,16(%edx)
+.byte	102,15,58,15,193,8
+	movdqa	%xmm7,%xmm1
+	psrlq	$7,%xmm7
+	paddq	%xmm0,%xmm5
+	movdqa	%xmm1,%xmm0
+	psrlq	$1,%xmm1
+	psllq	$56,%xmm0
+	pxor	%xmm1,%xmm7
+	psrlq	$7,%xmm1
+	pxor	%xmm0,%xmm7
+	psllq	$7,%xmm0
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm4,%xmm1
+	pxor	%xmm0,%xmm7
+	movdqa	%xmm4,%xmm0
+	psrlq	$6,%xmm1
+	paddq	%xmm7,%xmm5
+	movdqa	%xmm4,%xmm7
+	psrlq	$19,%xmm0
+	psllq	$3,%xmm7
+	pxor	%xmm0,%xmm1
+	psrlq	$42,%xmm0
+	pxor	%xmm7,%xmm1
+	psllq	$42,%xmm7
+	pxor	%xmm0,%xmm1
+	movdqa	48(%edx),%xmm0
+	pxor	%xmm7,%xmm1
+	movdqa	80(%ebp),%xmm7
+	movq	%mm4,%mm1
+	paddq	%xmm1,%xmm5
+	movq	-48(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,16(%esp)
+	paddq	%xmm5,%xmm7
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,48(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	40(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	8(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	56(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	16(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	24(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-40(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,8(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,40(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	32(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	48(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	8(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	16(%esp),%mm6
+	movdqa	%xmm7,-48(%edx)
+	movdqa	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm7
+.byte	102,15,58,15,198,8
+	movdqa	%xmm2,32(%edx)
+.byte	102,15,58,15,202,8
+	movdqa	%xmm0,%xmm2
+	psrlq	$7,%xmm0
+	paddq	%xmm1,%xmm6
+	movdqa	%xmm2,%xmm1
+	psrlq	$1,%xmm2
+	psllq	$56,%xmm1
+	pxor	%xmm2,%xmm0
+	psrlq	$7,%xmm2
+	pxor	%xmm1,%xmm0
+	psllq	$7,%xmm1
+	pxor	%xmm2,%xmm0
+	movdqa	%xmm5,%xmm2
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm5,%xmm1
+	psrlq	$6,%xmm2
+	paddq	%xmm0,%xmm6
+	movdqa	%xmm5,%xmm0
+	psrlq	$19,%xmm1
+	psllq	$3,%xmm0
+	pxor	%xmm1,%xmm2
+	psrlq	$42,%xmm1
+	pxor	%xmm0,%xmm2
+	psllq	$42,%xmm0
+	pxor	%xmm1,%xmm2
+	movdqa	(%edx),%xmm1
+	pxor	%xmm0,%xmm2
+	movdqa	96(%ebp),%xmm0
+	movq	%mm4,%mm1
+	paddq	%xmm2,%xmm6
+	movq	-32(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,(%esp)
+	paddq	%xmm6,%xmm0
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,32(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	24(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	56(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	40(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	8(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-24(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,56(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,24(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	16(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	48(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	32(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	56(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	(%esp),%mm6
+	movdqa	%xmm0,-32(%edx)
+	movdqa	%xmm4,%xmm2
+	movdqa	%xmm1,%xmm0
+.byte	102,15,58,15,207,8
+	movdqa	%xmm3,48(%edx)
+.byte	102,15,58,15,211,8
+	movdqa	%xmm1,%xmm3
+	psrlq	$7,%xmm1
+	paddq	%xmm2,%xmm7
+	movdqa	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	psllq	$56,%xmm2
+	pxor	%xmm3,%xmm1
+	psrlq	$7,%xmm3
+	pxor	%xmm2,%xmm1
+	psllq	$7,%xmm2
+	pxor	%xmm3,%xmm1
+	movdqa	%xmm6,%xmm3
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm6,%xmm2
+	psrlq	$6,%xmm3
+	paddq	%xmm1,%xmm7
+	movdqa	%xmm6,%xmm1
+	psrlq	$19,%xmm2
+	psllq	$3,%xmm1
+	pxor	%xmm2,%xmm3
+	psrlq	$42,%xmm2
+	pxor	%xmm1,%xmm3
+	psllq	$42,%xmm1
+	pxor	%xmm2,%xmm3
+	movdqa	16(%edx),%xmm2
+	pxor	%xmm1,%xmm3
+	movdqa	112(%ebp),%xmm1
+	movq	%mm4,%mm1
+	paddq	%xmm3,%xmm7
+	movq	-16(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,48(%esp)
+	paddq	%xmm7,%xmm1
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,16(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	8(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	40(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	24(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	48(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	56(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-8(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,40(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,8(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	32(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	16(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	40(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	48(%esp),%mm6
+	movdqa	%xmm1,-16(%edx)
+	leal	128(%ebp),%ebp
+	decl	%ecx
+	jnz	.L00800_47_ssse3
+	movdqa	(%ebp),%xmm1
+	leal	-640(%ebp),%ebp
+	movdqu	(%ebx),%xmm0
+.byte	102,15,56,0,193
+	movdqa	(%ebp),%xmm3
+	movdqa	%xmm1,%xmm2
+	movdqu	16(%ebx),%xmm1
+	paddq	%xmm0,%xmm3
+.byte	102,15,56,0,202
+	movq	%mm4,%mm1
+	movq	-128(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
 	movq	24(%esp),%mm4
-	paddq	%mm5,%mm3
-	movq	%mm0,(%esp)
 	paddq	%mm7,%mm3
 	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
 	movq	%mm0,%mm6
-	paddq	72(%esp),%mm3
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	32(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	40(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-120(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,24(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,56(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	48(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	16(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
 	psrlq	$28,%mm5
 	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
 	psllq	$25,%mm6
+	movq	(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	24(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	32(%esp),%mm6
+	movdqa	%xmm3,-128(%edx)
+	movdqa	16(%ebp),%xmm4
+	movdqa	%xmm2,%xmm3
+	movdqu	32(%ebx),%xmm2
+	paddq	%xmm1,%xmm4
+.byte	102,15,56,0,211
+	movq	%mm4,%mm1
+	movq	-112(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,16(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,48(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	40(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	8(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
 	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	56(%esp),%mm1
 	psrlq	$6,%mm5
 	pxor	%mm6,%mm7
 	psllq	$5,%mm6
 	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
 	psrlq	$5,%mm5
 	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
 	psllq	$6,%mm6
 	pxor	%mm5,%mm7
-	subl	$8,%esp
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	16(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	24(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-104(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,8(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,40(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	32(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	48(%esp),%mm1
+	psrlq	$6,%mm5
 	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	8(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	16(%esp),%mm6
+	movdqa	%xmm4,-112(%edx)
+	movdqa	32(%ebp),%xmm5
+	movdqa	%xmm3,%xmm4
+	movdqu	48(%ebx),%xmm3
+	paddq	%xmm2,%xmm5
+.byte	102,15,56,0,220
+	movq	%mm4,%mm1
+	movq	-96(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,32(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	24(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	56(%esp),%mm4
+	paddq	%mm7,%mm3
 	movq	%mm0,%mm5
-	por	%mm2,%mm0
-	movq	88(%esp),%mm6
-	pand	%mm2,%mm5
-	pand	%mm1,%mm0
-	movq	192(%esp),%mm2
-	por	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	40(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	8(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-88(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,56(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,24(%esp)
 	paddq	%mm5,%mm7
-	movq	%mm3,%mm0
-	movb	(%ebp),%dl
-	paddq	%mm7,%mm0
-	addl	$8,%ebp
-.align	16
-.L00516_79_sse2:
-	movq	%mm2,%mm1
-	psrlq	$1,%mm2
-	movq	%mm6,%mm7
-	psrlq	$6,%mm6
-	movq	%mm2,%mm3
-	psrlq	$6,%mm2
-	movq	%mm6,%mm5
-	psrlq	$13,%mm6
-	pxor	%mm2,%mm3
-	psrlq	$1,%mm2
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	16(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	48(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	32(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	56(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	(%esp),%mm6
+	movdqa	%xmm5,-96(%edx)
+	movdqa	48(%ebp),%xmm6
+	movdqa	%xmm4,%xmm5
+	movdqu	64(%ebx),%xmm4
+	paddq	%xmm3,%xmm6
+.byte	102,15,56,0,229
+	movq	%mm4,%mm1
+	movq	-80(%edx),%mm7
 	pxor	%mm6,%mm5
-	psrlq	$42,%mm6
-	pxor	%mm2,%mm3
-	movq	200(%esp),%mm2
-	psllq	$56,%mm1
+	psrlq	$14,%mm1
+	movq	%mm4,48(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
 	pxor	%mm6,%mm5
-	psllq	$3,%mm7
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
 	pxor	%mm1,%mm3
-	paddq	128(%esp),%mm2
-	psllq	$7,%mm1
-	pxor	%mm7,%mm5
-	psllq	$42,%mm7
+	movq	%mm0,16(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	8(%esp),%mm7
 	pxor	%mm1,%mm3
-	pxor	%mm7,%mm5
-	paddq	%mm5,%mm3
-	paddq	%mm2,%mm3
-	movq	%mm3,72(%esp)
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	40(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	24(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	48(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	56(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-72(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,40(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,8(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	32(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	16(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
 	movq	40(%esp),%mm5
+	paddq	%mm6,%mm0
 	movq	48(%esp),%mm6
-	movq	56(%esp),%mm7
+	movdqa	%xmm6,-80(%edx)
+	movdqa	64(%ebp),%xmm7
+	movdqa	%xmm5,%xmm6
+	movdqu	80(%ebx),%xmm5
+	paddq	%xmm4,%xmm7
+.byte	102,15,56,0,238
 	movq	%mm4,%mm1
-	movq	%mm4,%mm2
+	movq	-64(%edx),%mm7
+	pxor	%mm6,%mm5
 	psrlq	$14,%mm1
 	movq	%mm4,32(%esp)
-	psllq	$23,%mm2
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
 	movq	%mm1,%mm3
 	psrlq	$4,%mm1
-	pxor	%mm2,%mm3
-	psllq	$23,%mm2
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
 	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
 	psrlq	$23,%mm1
-	pxor	%mm2,%mm3
-	psllq	$4,%mm2
+	paddq	56(%esp),%mm7
 	pxor	%mm1,%mm3
-	paddq	(%ebp),%mm7
-	pxor	%mm2,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	32(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	40(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-56(%edx),%mm7
 	pxor	%mm6,%mm5
-	movq	8(%esp),%mm1
+	psrlq	$14,%mm1
+	movq	%mm4,24(%esp)
 	pand	%mm4,%mm5
-	movq	16(%esp),%mm2
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
 	pxor	%mm6,%mm5
-	movq	24(%esp),%mm4
-	paddq	%mm5,%mm3
-	movq	%mm0,(%esp)
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,56(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	48(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	16(%esp),%mm4
 	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	24(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	32(%esp),%mm6
+	movdqa	%xmm7,-64(%edx)
+	movdqa	%xmm0,(%edx)
+	movdqa	80(%ebp),%xmm0
+	movdqa	%xmm6,%xmm7
+	movdqu	96(%ebx),%xmm6
+	paddq	%xmm5,%xmm0
+.byte	102,15,56,0,247
+	movq	%mm4,%mm1
+	movq	-48(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,16(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,48(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	40(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	8(%esp),%mm4
+	paddq	%mm7,%mm3
 	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
 	movq	%mm0,%mm6
-	paddq	72(%esp),%mm3
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	56(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	16(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	24(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-40(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,8(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,40(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	32(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
 	psrlq	$28,%mm5
 	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
 	psllq	$25,%mm6
+	movq	48(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	8(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	16(%esp),%mm6
+	movdqa	%xmm0,-48(%edx)
+	movdqa	%xmm1,16(%edx)
+	movdqa	96(%ebp),%xmm1
+	movdqa	%xmm7,%xmm0
+	movdqu	112(%ebx),%xmm7
+	paddq	%xmm6,%xmm1
+.byte	102,15,56,0,248
+	movq	%mm4,%mm1
+	movq	-32(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,32(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	24(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	56(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
 	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	40(%esp),%mm1
 	psrlq	$6,%mm5
 	pxor	%mm6,%mm7
 	psllq	$5,%mm6
 	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
 	psrlq	$5,%mm5
 	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
 	psllq	$6,%mm6
 	pxor	%mm5,%mm7
-	subl	$8,%esp
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	8(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-24(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,56(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,24(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	16(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	48(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	32(%esp),%mm1
+	psrlq	$6,%mm5
 	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	56(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	(%esp),%mm6
+	movdqa	%xmm1,-32(%edx)
+	movdqa	%xmm2,32(%edx)
+	movdqa	112(%ebp),%xmm2
+	movdqa	(%edx),%xmm0
+	paddq	%xmm7,%xmm2
+	movq	%mm4,%mm1
+	movq	-16(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,48(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,16(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	8(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	40(%esp),%mm4
+	paddq	%mm7,%mm3
 	movq	%mm0,%mm5
-	por	%mm2,%mm0
-	movq	88(%esp),%mm6
-	pand	%mm2,%mm5
-	pand	%mm1,%mm0
-	movq	192(%esp),%mm2
-	por	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	24(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	48(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	56(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-8(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,40(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,8(%esp)
 	paddq	%mm5,%mm7
-	movq	%mm3,%mm0
-	movb	(%ebp),%dl
-	paddq	%mm7,%mm0
-	addl	$8,%ebp
-	cmpb	$23,%dl
-	jne	.L00516_79_sse2
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	32(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	16(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	40(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	48(%esp),%mm6
+	movdqa	%xmm2,-16(%edx)
 	movq	8(%esp),%mm1
-	movq	16(%esp),%mm2
+	paddq	%mm3,%mm0
 	movq	24(%esp),%mm3
-	movq	40(%esp),%mm5
-	movq	48(%esp),%mm6
 	movq	56(%esp),%mm7
+	pxor	%mm1,%mm2
 	paddq	(%esi),%mm0
 	paddq	8(%esi),%mm1
 	paddq	16(%esi),%mm2
@@ -1126,12 +5113,10 @@
 	movq	%mm5,40(%esi)
 	movq	%mm6,48(%esi)
 	movq	%mm7,56(%esi)
-	addl	$640,%esp
-	subl	$640,%ebp
-	cmpl	88(%esp),%edi
-	jb	.L003loop_sse2
+	cmpl	%eax,%edi
+	jb	.L007loop_ssse3
+	movl	76(%edx),%esp
 	emms
-	movl	92(%esp),%esp
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -1242,7 +5227,7 @@
 	movl	$16,%ecx
 .long	2784229001
 .align	16
-.L00600_15_x86:
+.L00900_15_x86:
 	movl	40(%esp),%ecx
 	movl	44(%esp),%edx
 	movl	%ecx,%esi
@@ -1349,9 +5334,9 @@
 	subl	$8,%esp
 	leal	8(%ebp),%ebp
 	cmpb	$148,%dl
-	jne	.L00600_15_x86
+	jne	.L00900_15_x86
 .align	16
-.L00716_79_x86:
+.L01016_79_x86:
 	movl	312(%esp),%ecx
 	movl	316(%esp),%edx
 	movl	%ecx,%esi
@@ -1524,7 +5509,7 @@
 	subl	$8,%esp
 	leal	8(%ebp),%ebp
 	cmpb	$23,%dl
-	jne	.L00716_79_x86
+	jne	.L01016_79_x86
 	movl	840(%esp),%esi
 	movl	844(%esp),%edi
 	movl	(%esi),%eax
@@ -1667,6 +5652,8 @@
 .long	4234509866,1501505948
 .long	987167468,1607167915
 .long	1246189591,1816402316
+.long	67438087,66051
+.long	202182159,134810123
 .size	sha512_block_data_order,.-.L_sha512_block_data_order_begin
 .byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97
 .byte	110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
@@ -1673,5 +5660,5 @@
 .byte	67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
 .byte	112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
 .byte	62,0
-.comm	OPENSSL_ia32cap_P,8,4
+.comm	OPENSSL_ia32cap_P,16,4
 #endif

Modified: trunk/secure/lib/libcrypto/i386/vpaes-x86.S
===================================================================
--- trunk/secure/lib/libcrypto/i386/vpaes-x86.S	2019-01-20 05:38:15 UTC (rev 12153)
+++ trunk/secure/lib/libcrypto/i386/vpaes-x86.S	2019-01-20 05:38:27 UTC (rev 12154)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/i386/vpaes-x86.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from vpaes-x86.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/i386/vpaes-x86.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from vpaes-x86.pl. */
 #ifdef PIC
 .file	"vpaes-x86.S"
 .text
@@ -77,33 +77,33 @@
 	movdqa	%xmm6,%xmm1
 	movdqa	(%ebp),%xmm2
 	pandn	%xmm0,%xmm1
+	pand	%xmm6,%xmm0
 	movdqu	(%edx),%xmm5
-	psrld	$4,%xmm1
-	pand	%xmm6,%xmm0
 .byte	102,15,56,0,208
 	movdqa	16(%ebp),%xmm0
-.byte	102,15,56,0,193
 	pxor	%xmm5,%xmm2
-	pxor	%xmm2,%xmm0
+	psrld	$4,%xmm1
 	addl	$16,%edx
+.byte	102,15,56,0,193
 	leal	192(%ebp),%ebx
+	pxor	%xmm2,%xmm0
 	jmp	.L000enc_entry
 .align	16
 .L001enc_loop:
 	movdqa	32(%ebp),%xmm4
+	movdqa	48(%ebp),%xmm0
 .byte	102,15,56,0,226
+.byte	102,15,56,0,195
 	pxor	%xmm5,%xmm4
-	movdqa	48(%ebp),%xmm0
-.byte	102,15,56,0,195
+	movdqa	64(%ebp),%xmm5
 	pxor	%xmm4,%xmm0
-	movdqa	64(%ebp),%xmm5
+	movdqa	-64(%ebx,%ecx,1),%xmm1
 .byte	102,15,56,0,234
-	movdqa	-64(%ebx,%ecx,1),%xmm1
 	movdqa	80(%ebp),%xmm2
+	movdqa	(%ebx,%ecx,1),%xmm4
 .byte	102,15,56,0,211
+	movdqa	%xmm0,%xmm3
 	pxor	%xmm5,%xmm2
-	movdqa	(%ebx,%ecx,1),%xmm4
-	movdqa	%xmm0,%xmm3
 .byte	102,15,56,0,193
 	addl	$16,%edx
 	pxor	%xmm2,%xmm0
@@ -112,28 +112,28 @@
 	pxor	%xmm0,%xmm3
 .byte	102,15,56,0,193
 	andl	$48,%ecx
+	subl	$1,%eax
 	pxor	%xmm3,%xmm0
-	subl	$1,%eax
 .L000enc_entry:
 	movdqa	%xmm6,%xmm1
+	movdqa	-32(%ebp),%xmm5
 	pandn	%xmm0,%xmm1
 	psrld	$4,%xmm1
 	pand	%xmm6,%xmm0
-	movdqa	-32(%ebp),%xmm5
 .byte	102,15,56,0,232
+	movdqa	%xmm7,%xmm3
 	pxor	%xmm1,%xmm0
-	movdqa	%xmm7,%xmm3
 .byte	102,15,56,0,217
+	movdqa	%xmm7,%xmm4
 	pxor	%xmm5,%xmm3
-	movdqa	%xmm7,%xmm4
 .byte	102,15,56,0,224
+	movdqa	%xmm7,%xmm2
 	pxor	%xmm5,%xmm4
-	movdqa	%xmm7,%xmm2
 .byte	102,15,56,0,211
+	movdqa	%xmm7,%xmm3
 	pxor	%xmm0,%xmm2
-	movdqa	%xmm7,%xmm3
+.byte	102,15,56,0,220
 	movdqu	(%edx),%xmm5
-.byte	102,15,56,0,220
 	pxor	%xmm1,%xmm3
 	jnz	.L001enc_loop
 	movdqa	96(%ebp),%xmm4
@@ -149,8 +149,8 @@
 .type	_vpaes_decrypt_core, at function
 .align	16
 _vpaes_decrypt_core:
+	leal	608(%ebp),%ebx
 	movl	240(%edx),%eax
-	leal	608(%ebp),%ebx
 	movdqa	%xmm6,%xmm1
 	movdqa	-64(%ebx),%xmm2
 	pandn	%xmm0,%xmm1
@@ -173,56 +173,56 @@
 .align	16
 .L003dec_loop:
 	movdqa	-32(%ebx),%xmm4
+	movdqa	-16(%ebx),%xmm1
 .byte	102,15,56,0,226
-	pxor	%xmm0,%xmm4
-	movdqa	-16(%ebx),%xmm0
-.byte	102,15,56,0,195
+.byte	102,15,56,0,203
 	pxor	%xmm4,%xmm0
-	addl	$16,%edx
-.byte	102,15,56,0,197
 	movdqa	(%ebx),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	16(%ebx),%xmm1
 .byte	102,15,56,0,226
-	pxor	%xmm0,%xmm4
-	movdqa	16(%ebx),%xmm0
-.byte	102,15,56,0,195
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
 	pxor	%xmm4,%xmm0
-	subl	$1,%eax
-.byte	102,15,56,0,197
 	movdqa	32(%ebx),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	48(%ebx),%xmm1
 .byte	102,15,56,0,226
-	pxor	%xmm0,%xmm4
-	movdqa	48(%ebx),%xmm0
-.byte	102,15,56,0,195
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
 	pxor	%xmm4,%xmm0
-.byte	102,15,56,0,197
 	movdqa	64(%ebx),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	80(%ebx),%xmm1
 .byte	102,15,56,0,226
-	pxor	%xmm0,%xmm4
-	movdqa	80(%ebx),%xmm0
-.byte	102,15,56,0,195
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
 	pxor	%xmm4,%xmm0
+	addl	$16,%edx
 .byte	102,15,58,15,237,12
+	pxor	%xmm1,%xmm0
+	subl	$1,%eax
 .L002dec_entry:
 	movdqa	%xmm6,%xmm1
+	movdqa	-32(%ebp),%xmm2
 	pandn	%xmm0,%xmm1
+	pand	%xmm6,%xmm0
 	psrld	$4,%xmm1
-	pand	%xmm6,%xmm0
-	movdqa	-32(%ebp),%xmm2
 .byte	102,15,56,0,208
+	movdqa	%xmm7,%xmm3
 	pxor	%xmm1,%xmm0
-	movdqa	%xmm7,%xmm3
 .byte	102,15,56,0,217
+	movdqa	%xmm7,%xmm4
 	pxor	%xmm2,%xmm3
-	movdqa	%xmm7,%xmm4
 .byte	102,15,56,0,224
 	pxor	%xmm2,%xmm4
 	movdqa	%xmm7,%xmm2
 .byte	102,15,56,0,211
+	movdqa	%xmm7,%xmm3
 	pxor	%xmm0,%xmm2
-	movdqa	%xmm7,%xmm3
 .byte	102,15,56,0,220
+	movdqu	(%edx),%xmm0
 	pxor	%xmm1,%xmm3
-	movdqu	(%edx),%xmm0
 	jnz	.L003dec_loop
 	movdqa	96(%ebx),%xmm4
 .byte	102,15,56,0,226
@@ -331,12 +331,12 @@
 .type	_vpaes_schedule_192_smear, at function
 .align	16
 _vpaes_schedule_192_smear:
-	pshufd	$128,%xmm6,%xmm0
-	pxor	%xmm0,%xmm6
+	pshufd	$128,%xmm6,%xmm1
 	pshufd	$254,%xmm7,%xmm0
+	pxor	%xmm1,%xmm6
+	pxor	%xmm1,%xmm1
 	pxor	%xmm0,%xmm6
 	movdqa	%xmm6,%xmm0
-	pxor	%xmm1,%xmm1
 	movhlps	%xmm1,%xmm6
 	ret
 .size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
@@ -739,33 +739,33 @@
 	movdqa	%xmm6,%xmm1
 	movdqa	(%ebp),%xmm2
 	pandn	%xmm0,%xmm1
+	pand	%xmm6,%xmm0
 	movdqu	(%edx),%xmm5
-	psrld	$4,%xmm1
-	pand	%xmm6,%xmm0
 .byte	102,15,56,0,208
 	movdqa	16(%ebp),%xmm0
-.byte	102,15,56,0,193
 	pxor	%xmm5,%xmm2
-	pxor	%xmm2,%xmm0
+	psrld	$4,%xmm1
 	addl	$16,%edx
+.byte	102,15,56,0,193
 	leal	192(%ebp),%ebx
+	pxor	%xmm2,%xmm0
 	jmp	.L000enc_entry
 .align	16
 .L001enc_loop:
 	movdqa	32(%ebp),%xmm4
+	movdqa	48(%ebp),%xmm0
 .byte	102,15,56,0,226
+.byte	102,15,56,0,195
 	pxor	%xmm5,%xmm4
-	movdqa	48(%ebp),%xmm0
-.byte	102,15,56,0,195
+	movdqa	64(%ebp),%xmm5
 	pxor	%xmm4,%xmm0
-	movdqa	64(%ebp),%xmm5
+	movdqa	-64(%ebx,%ecx,1),%xmm1
 .byte	102,15,56,0,234
-	movdqa	-64(%ebx,%ecx,1),%xmm1
 	movdqa	80(%ebp),%xmm2
+	movdqa	(%ebx,%ecx,1),%xmm4
 .byte	102,15,56,0,211
+	movdqa	%xmm0,%xmm3
 	pxor	%xmm5,%xmm2
-	movdqa	(%ebx,%ecx,1),%xmm4
-	movdqa	%xmm0,%xmm3
 .byte	102,15,56,0,193
 	addl	$16,%edx
 	pxor	%xmm2,%xmm0
@@ -774,28 +774,28 @@
 	pxor	%xmm0,%xmm3
 .byte	102,15,56,0,193
 	andl	$48,%ecx
+	subl	$1,%eax
 	pxor	%xmm3,%xmm0
-	subl	$1,%eax
 .L000enc_entry:
 	movdqa	%xmm6,%xmm1
+	movdqa	-32(%ebp),%xmm5
 	pandn	%xmm0,%xmm1
 	psrld	$4,%xmm1
 	pand	%xmm6,%xmm0
-	movdqa	-32(%ebp),%xmm5
 .byte	102,15,56,0,232
+	movdqa	%xmm7,%xmm3
 	pxor	%xmm1,%xmm0
-	movdqa	%xmm7,%xmm3
 .byte	102,15,56,0,217
+	movdqa	%xmm7,%xmm4
 	pxor	%xmm5,%xmm3
-	movdqa	%xmm7,%xmm4
 .byte	102,15,56,0,224
+	movdqa	%xmm7,%xmm2
 	pxor	%xmm5,%xmm4
-	movdqa	%xmm7,%xmm2
 .byte	102,15,56,0,211
+	movdqa	%xmm7,%xmm3
 	pxor	%xmm0,%xmm2
-	movdqa	%xmm7,%xmm3
+.byte	102,15,56,0,220
 	movdqu	(%edx),%xmm5
-.byte	102,15,56,0,220
 	pxor	%xmm1,%xmm3
 	jnz	.L001enc_loop
 	movdqa	96(%ebp),%xmm4
@@ -811,8 +811,8 @@
 .type	_vpaes_decrypt_core, at function
 .align	16
 _vpaes_decrypt_core:
+	leal	608(%ebp),%ebx
 	movl	240(%edx),%eax
-	leal	608(%ebp),%ebx
 	movdqa	%xmm6,%xmm1
 	movdqa	-64(%ebx),%xmm2
 	pandn	%xmm0,%xmm1
@@ -835,56 +835,56 @@
 .align	16
 .L003dec_loop:
 	movdqa	-32(%ebx),%xmm4
+	movdqa	-16(%ebx),%xmm1
 .byte	102,15,56,0,226
-	pxor	%xmm0,%xmm4
-	movdqa	-16(%ebx),%xmm0
-.byte	102,15,56,0,195
+.byte	102,15,56,0,203
 	pxor	%xmm4,%xmm0
-	addl	$16,%edx
-.byte	102,15,56,0,197
 	movdqa	(%ebx),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	16(%ebx),%xmm1
 .byte	102,15,56,0,226
-	pxor	%xmm0,%xmm4
-	movdqa	16(%ebx),%xmm0
-.byte	102,15,56,0,195
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
 	pxor	%xmm4,%xmm0
-	subl	$1,%eax
-.byte	102,15,56,0,197
 	movdqa	32(%ebx),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	48(%ebx),%xmm1
 .byte	102,15,56,0,226
-	pxor	%xmm0,%xmm4
-	movdqa	48(%ebx),%xmm0
-.byte	102,15,56,0,195
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
 	pxor	%xmm4,%xmm0
-.byte	102,15,56,0,197
 	movdqa	64(%ebx),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	80(%ebx),%xmm1
 .byte	102,15,56,0,226
-	pxor	%xmm0,%xmm4
-	movdqa	80(%ebx),%xmm0
-.byte	102,15,56,0,195
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
 	pxor	%xmm4,%xmm0
+	addl	$16,%edx
 .byte	102,15,58,15,237,12
+	pxor	%xmm1,%xmm0
+	subl	$1,%eax
 .L002dec_entry:
 	movdqa	%xmm6,%xmm1
+	movdqa	-32(%ebp),%xmm2
 	pandn	%xmm0,%xmm1
+	pand	%xmm6,%xmm0
 	psrld	$4,%xmm1
-	pand	%xmm6,%xmm0
-	movdqa	-32(%ebp),%xmm2
 .byte	102,15,56,0,208
+	movdqa	%xmm7,%xmm3
 	pxor	%xmm1,%xmm0
-	movdqa	%xmm7,%xmm3
 .byte	102,15,56,0,217
+	movdqa	%xmm7,%xmm4
 	pxor	%xmm2,%xmm3
-	movdqa	%xmm7,%xmm4
 .byte	102,15,56,0,224
 	pxor	%xmm2,%xmm4
 	movdqa	%xmm7,%xmm2
 .byte	102,15,56,0,211
+	movdqa	%xmm7,%xmm3
 	pxor	%xmm0,%xmm2
-	movdqa	%xmm7,%xmm3
 .byte	102,15,56,0,220
+	movdqu	(%edx),%xmm0
 	pxor	%xmm1,%xmm3
-	movdqu	(%edx),%xmm0
 	jnz	.L003dec_loop
 	movdqa	96(%ebx),%xmm4
 .byte	102,15,56,0,226
@@ -993,12 +993,12 @@
 .type	_vpaes_schedule_192_smear, at function
 .align	16
 _vpaes_schedule_192_smear:
-	pshufd	$128,%xmm6,%xmm0
-	pxor	%xmm0,%xmm6
+	pshufd	$128,%xmm6,%xmm1
 	pshufd	$254,%xmm7,%xmm0
+	pxor	%xmm1,%xmm6
+	pxor	%xmm1,%xmm1
 	pxor	%xmm0,%xmm6
 	movdqa	%xmm6,%xmm0
-	pxor	%xmm1,%xmm1
 	movhlps	%xmm1,%xmm6
 	ret
 .size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear

Modified: trunk/secure/lib/libcrypto/i386/wp-mmx.S
===================================================================
--- trunk/secure/lib/libcrypto/i386/wp-mmx.S	2019-01-20 05:38:15 UTC (rev 12153)
+++ trunk/secure/lib/libcrypto/i386/wp-mmx.S	2019-01-20 05:38:27 UTC (rev 12154)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/i386/wp-mmx.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from wp-mmx.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/i386/wp-mmx.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from wp-mmx.pl. */
 #ifdef PIC
 .file	"wp-mmx.S"
 .text
@@ -70,228 +70,230 @@
 	movq	4096(%ebp,%esi,8),%mm0
 	movl	(%esp),%eax
 	movl	4(%esp),%ebx
-	movb	%al,%cl
-	movb	%ah,%dl
+	movzbl	%al,%ecx
+	movzbl	%ah,%edx
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm0
 	movq	7(%ebp,%edi,8),%mm1
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	8(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	movq	6(%ebp,%esi,8),%mm2
 	movq	5(%ebp,%edi,8),%mm3
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	movq	4(%ebp,%esi,8),%mm4
 	movq	3(%ebp,%edi,8),%mm5
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	12(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	movq	2(%ebp,%esi,8),%mm6
 	movq	1(%ebp,%edi,8),%mm7
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm1
 	pxor	7(%ebp,%edi,8),%mm2
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	16(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm3
 	pxor	5(%ebp,%edi,8),%mm4
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm5
 	pxor	3(%ebp,%edi,8),%mm6
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	20(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm7
 	pxor	1(%ebp,%edi,8),%mm0
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm2
 	pxor	7(%ebp,%edi,8),%mm3
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	24(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm4
 	pxor	5(%ebp,%edi,8),%mm5
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm6
 	pxor	3(%ebp,%edi,8),%mm7
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	28(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm0
 	pxor	1(%ebp,%edi,8),%mm1
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm3
 	pxor	7(%ebp,%edi,8),%mm4
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	32(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm5
 	pxor	5(%ebp,%edi,8),%mm6
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm7
 	pxor	3(%ebp,%edi,8),%mm0
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	36(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm1
 	pxor	1(%ebp,%edi,8),%mm2
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm4
 	pxor	7(%ebp,%edi,8),%mm5
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	40(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm6
 	pxor	5(%ebp,%edi,8),%mm7
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm0
 	pxor	3(%ebp,%edi,8),%mm1
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	44(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm2
 	pxor	1(%ebp,%edi,8),%mm3
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm5
 	pxor	7(%ebp,%edi,8),%mm6
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	48(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm7
 	pxor	5(%ebp,%edi,8),%mm0
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm1
 	pxor	3(%ebp,%edi,8),%mm2
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	52(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm3
 	pxor	1(%ebp,%edi,8),%mm4
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm6
 	pxor	7(%ebp,%edi,8),%mm7
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	56(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm0
 	pxor	5(%ebp,%edi,8),%mm1
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm2
 	pxor	3(%ebp,%edi,8),%mm3
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	60(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm4
 	pxor	1(%ebp,%edi,8),%mm5
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm7
 	pxor	7(%ebp,%edi,8),%mm0
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	64(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm1
 	pxor	5(%ebp,%edi,8),%mm2
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm3
 	pxor	3(%ebp,%edi,8),%mm4
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	68(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm5
 	pxor	1(%ebp,%edi,8),%mm6
 	movq	%mm0,(%esp)
@@ -302,226 +304,226 @@
 	movq	%mm5,40(%esp)
 	movq	%mm6,48(%esp)
 	movq	%mm7,56(%esp)
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm0
 	pxor	7(%ebp,%edi,8),%mm1
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	72(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm2
 	pxor	5(%ebp,%edi,8),%mm3
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm4
 	pxor	3(%ebp,%edi,8),%mm5
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	76(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm6
 	pxor	1(%ebp,%edi,8),%mm7
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm1
 	pxor	7(%ebp,%edi,8),%mm2
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	80(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm3
 	pxor	5(%ebp,%edi,8),%mm4
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm5
 	pxor	3(%ebp,%edi,8),%mm6
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	84(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm7
 	pxor	1(%ebp,%edi,8),%mm0
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm2
 	pxor	7(%ebp,%edi,8),%mm3
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	88(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm4
 	pxor	5(%ebp,%edi,8),%mm5
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm6
 	pxor	3(%ebp,%edi,8),%mm7
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	92(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm0
 	pxor	1(%ebp,%edi,8),%mm1
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm3
 	pxor	7(%ebp,%edi,8),%mm4
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	96(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm5
 	pxor	5(%ebp,%edi,8),%mm6
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm7
 	pxor	3(%ebp,%edi,8),%mm0
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	100(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm1
 	pxor	1(%ebp,%edi,8),%mm2
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm4
 	pxor	7(%ebp,%edi,8),%mm5
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	104(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm6
 	pxor	5(%ebp,%edi,8),%mm7
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm0
 	pxor	3(%ebp,%edi,8),%mm1
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	108(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm2
 	pxor	1(%ebp,%edi,8),%mm3
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm5
 	pxor	7(%ebp,%edi,8),%mm6
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	112(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm7
 	pxor	5(%ebp,%edi,8),%mm0
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm1
 	pxor	3(%ebp,%edi,8),%mm2
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	116(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm3
 	pxor	1(%ebp,%edi,8),%mm4
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm6
 	pxor	7(%ebp,%edi,8),%mm7
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	120(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm0
 	pxor	5(%ebp,%edi,8),%mm1
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm2
 	pxor	3(%ebp,%edi,8),%mm3
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	124(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm4
 	pxor	1(%ebp,%edi,8),%mm5
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm7
 	pxor	7(%ebp,%edi,8),%mm0
-	movb	%al,%cl
-	movb	%ah,%dl
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm1
 	pxor	5(%ebp,%edi,8),%mm2
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm3
 	pxor	3(%ebp,%edi,8),%mm4
-	movb	%bl,%cl
-	movb	%bh,%dl
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm5
 	pxor	1(%ebp,%edi,8),%mm6
 	leal	128(%esp),%ebx
@@ -1176,228 +1178,230 @@
 	movq	4096(%ebp,%esi,8),%mm0
 	movl	(%esp),%eax
 	movl	4(%esp),%ebx
-	movb	%al,%cl
-	movb	%ah,%dl
+	movzbl	%al,%ecx
+	movzbl	%ah,%edx
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm0
 	movq	7(%ebp,%edi,8),%mm1
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	8(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	movq	6(%ebp,%esi,8),%mm2
 	movq	5(%ebp,%edi,8),%mm3
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	movq	4(%ebp,%esi,8),%mm4
 	movq	3(%ebp,%edi,8),%mm5
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	12(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	movq	2(%ebp,%esi,8),%mm6
 	movq	1(%ebp,%edi,8),%mm7
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm1
 	pxor	7(%ebp,%edi,8),%mm2
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	16(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm3
 	pxor	5(%ebp,%edi,8),%mm4
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm5
 	pxor	3(%ebp,%edi,8),%mm6
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	20(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm7
 	pxor	1(%ebp,%edi,8),%mm0
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm2
 	pxor	7(%ebp,%edi,8),%mm3
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	24(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm4
 	pxor	5(%ebp,%edi,8),%mm5
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm6
 	pxor	3(%ebp,%edi,8),%mm7
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	28(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm0
 	pxor	1(%ebp,%edi,8),%mm1
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm3
 	pxor	7(%ebp,%edi,8),%mm4
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	32(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm5
 	pxor	5(%ebp,%edi,8),%mm6
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm7
 	pxor	3(%ebp,%edi,8),%mm0
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	36(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm1
 	pxor	1(%ebp,%edi,8),%mm2
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm4
 	pxor	7(%ebp,%edi,8),%mm5
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	40(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm6
 	pxor	5(%ebp,%edi,8),%mm7
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm0
 	pxor	3(%ebp,%edi,8),%mm1
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	44(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm2
 	pxor	1(%ebp,%edi,8),%mm3
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm5
 	pxor	7(%ebp,%edi,8),%mm6
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	48(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm7
 	pxor	5(%ebp,%edi,8),%mm0
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm1
 	pxor	3(%ebp,%edi,8),%mm2
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	52(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm3
 	pxor	1(%ebp,%edi,8),%mm4
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm6
 	pxor	7(%ebp,%edi,8),%mm7
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	56(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm0
 	pxor	5(%ebp,%edi,8),%mm1
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm2
 	pxor	3(%ebp,%edi,8),%mm3
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	60(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm4
 	pxor	1(%ebp,%edi,8),%mm5
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm7
 	pxor	7(%ebp,%edi,8),%mm0
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	64(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm1
 	pxor	5(%ebp,%edi,8),%mm2
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm3
 	pxor	3(%ebp,%edi,8),%mm4
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	68(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm5
 	pxor	1(%ebp,%edi,8),%mm6
 	movq	%mm0,(%esp)
@@ -1408,226 +1412,226 @@
 	movq	%mm5,40(%esp)
 	movq	%mm6,48(%esp)
 	movq	%mm7,56(%esp)
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm0
 	pxor	7(%ebp,%edi,8),%mm1
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	72(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm2
 	pxor	5(%ebp,%edi,8),%mm3
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm4
 	pxor	3(%ebp,%edi,8),%mm5
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	76(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm6
 	pxor	1(%ebp,%edi,8),%mm7
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm1
 	pxor	7(%ebp,%edi,8),%mm2
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	80(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm3
 	pxor	5(%ebp,%edi,8),%mm4
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm5
 	pxor	3(%ebp,%edi,8),%mm6
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	84(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm7
 	pxor	1(%ebp,%edi,8),%mm0
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm2
 	pxor	7(%ebp,%edi,8),%mm3
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	88(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm4
 	pxor	5(%ebp,%edi,8),%mm5
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm6
 	pxor	3(%ebp,%edi,8),%mm7
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	92(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm0
 	pxor	1(%ebp,%edi,8),%mm1
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm3
 	pxor	7(%ebp,%edi,8),%mm4
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	96(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm5
 	pxor	5(%ebp,%edi,8),%mm6
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm7
 	pxor	3(%ebp,%edi,8),%mm0
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	100(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm1
 	pxor	1(%ebp,%edi,8),%mm2
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm4
 	pxor	7(%ebp,%edi,8),%mm5
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	104(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm6
 	pxor	5(%ebp,%edi,8),%mm7
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm0
 	pxor	3(%ebp,%edi,8),%mm1
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	108(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm2
 	pxor	1(%ebp,%edi,8),%mm3
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm5
 	pxor	7(%ebp,%edi,8),%mm6
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	112(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm7
 	pxor	5(%ebp,%edi,8),%mm0
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm1
 	pxor	3(%ebp,%edi,8),%mm2
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	116(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm3
 	pxor	1(%ebp,%edi,8),%mm4
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm6
 	pxor	7(%ebp,%edi,8),%mm7
-	movb	%al,%cl
-	movb	%ah,%dl
 	movl	120(%esp),%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm0
 	pxor	5(%ebp,%edi,8),%mm1
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm2
 	pxor	3(%ebp,%edi,8),%mm3
-	movb	%bl,%cl
-	movb	%bh,%dl
 	movl	124(%esp),%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm4
 	pxor	1(%ebp,%edi,8),%mm5
-	movb	%al,%cl
-	movb	%ah,%dl
+	shrl	$16,%eax
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%eax
+	movzbl	%ah,%edx
 	pxor	(%ebp,%esi,8),%mm7
 	pxor	7(%ebp,%edi,8),%mm0
-	movb	%al,%cl
-	movb	%ah,%dl
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%bh,%edx
 	pxor	6(%ebp,%esi,8),%mm1
 	pxor	5(%ebp,%edi,8),%mm2
-	movb	%bl,%cl
-	movb	%bh,%dl
+	shrl	$16,%ebx
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%bl,%ecx
 	leal	(%edx,%edx,1),%edi
-	shrl	$16,%ebx
+	movzbl	%bh,%edx
 	pxor	4(%ebp,%esi,8),%mm3
 	pxor	3(%ebp,%edi,8),%mm4
-	movb	%bl,%cl
-	movb	%bh,%dl
 	leal	(%ecx,%ecx,1),%esi
+	movzbl	%al,%ecx
 	leal	(%edx,%edx,1),%edi
+	movzbl	%ah,%edx
 	pxor	2(%ebp,%esi,8),%mm5
 	pxor	1(%ebp,%edi,8),%mm6
 	leal	128(%esp),%ebx

Modified: trunk/secure/lib/libcrypto/i386/x86-gf2m.S
===================================================================
--- trunk/secure/lib/libcrypto/i386/x86-gf2m.S	2019-01-20 05:38:15 UTC (rev 12153)
+++ trunk/secure/lib/libcrypto/i386/x86-gf2m.S	2019-01-20 05:38:27 UTC (rev 12154)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/i386/x86-gf2m.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from x86-gf2m.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/i386/x86-gf2m.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from x86-gf2m.pl. */
 #ifdef PIC
 .file	"x86-gf2m.S"
 .text
@@ -247,8 +247,7 @@
 	call	.L000PIC_me_up
 .L000PIC_me_up:
 	popl	%edx
-	leal	_GLOBAL_OFFSET_TABLE_+[.-.L000PIC_me_up](%edx),%edx
-	movl	OPENSSL_ia32cap_P at GOT(%edx),%edx
+	leal	OPENSSL_ia32cap_P-.L000PIC_me_up(%edx),%edx
 	movl	(%edx),%eax
 	movl	4(%edx),%edx
 	testl	$8388608,%eax
@@ -348,7 +347,7 @@
 .byte	67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
 .byte	112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
 .byte	62,0
-.comm	OPENSSL_ia32cap_P,8,4
+.comm	OPENSSL_ia32cap_P,16,4
 #else
 .file	"x86-gf2m.S"
 .text
@@ -692,5 +691,5 @@
 .byte	67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
 .byte	112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
 .byte	62,0
-.comm	OPENSSL_ia32cap_P,8,4
+.comm	OPENSSL_ia32cap_P,16,4
 #endif
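
Two things change in x86-gf2m.S. In the PIC build, the _GLOBAL_OFFSET_TABLE_/@GOT indirection for OPENSSL_ia32cap_P is replaced by a direct PC-relative leal; that works because the symbol is declared .hidden (see the x86cpuid.S diff below), so no GOT slot is needed. And the .comm for OPENSSL_ia32cap_P grows from 8 to 16 bytes, making room for the extra capability word that the new x86cpuid.S code fills in. The testl $8388608,%eax that follows the load checks bit 23 of the first capability word; a minimal C sketch of an equivalent probe, assuming GCC/Clang's <cpuid.h> and that bit 23 matches the raw CPUID.1:EDX MMX bit (an assumption about the bit layout, not taken from this diff):

#include <cpuid.h>

static int have_mmx(void)
{
    unsigned int eax, ebx, ecx, edx;
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))  /* CPUID leaf 1 */
        return 0;
    return (edx >> 23) & 1;   /* bit 23 == 0x800000 == 8388608 */
}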

Modified: trunk/secure/lib/libcrypto/i386/x86-mont.S
===================================================================
--- trunk/secure/lib/libcrypto/i386/x86-mont.S	2019-01-20 05:38:15 UTC (rev 12153)
+++ trunk/secure/lib/libcrypto/i386/x86-mont.S	2019-01-20 05:38:27 UTC (rev 12154)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/i386/x86-mont.S 299966 2016-05-16 19:30:27Z jkim $
-# Do not modify. This file is auto-generated from x86-mont.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/i386/x86-mont.S 337982 2018-08-17 18:32:53Z jkim $ */
+/* Do not modify. This file is auto-generated from x86-mont.pl. */
 #ifdef PIC
 .file	"x86-mont.S"
 .text
@@ -19,48 +19,54 @@
 	jl	.L000just_leave
 	leal	20(%esp),%esi
 	leal	24(%esp),%edx
-	movl	%esp,%ebp
 	addl	$2,%edi
 	negl	%edi
-	leal	-32(%esp,%edi,4),%esp
+	leal	-32(%esp,%edi,4),%ebp
 	negl	%edi
-	movl	%esp,%eax
+	movl	%ebp,%eax
 	subl	%edx,%eax
 	andl	$2047,%eax
-	subl	%eax,%esp
-	xorl	%esp,%edx
+	subl	%eax,%ebp
+	xorl	%ebp,%edx
 	andl	$2048,%edx
 	xorl	$2048,%edx
-	subl	%edx,%esp
-	andl	$-64,%esp
-	movl	%ebp,%eax
-	subl	%esp,%eax
+	subl	%edx,%ebp
+	andl	$-64,%ebp
+	movl	%esp,%eax
+	subl	%ebp,%eax
 	andl	$-4096,%eax
+	movl	%esp,%edx
+	leal	(%ebp,%eax,1),%esp
+	movl	(%esp),%eax
+	cmpl	%ebp,%esp
+	ja	.L001page_walk
+	jmp	.L002page_walk_done
+.align	16
 .L001page_walk:
-	movl	(%esp,%eax,1),%edx
-	subl	$4096,%eax
-.byte	46
-	jnc	.L001page_walk
+	leal	-4096(%esp),%esp
+	movl	(%esp),%eax
+	cmpl	%ebp,%esp
+	ja	.L001page_walk
+.L002page_walk_done:
 	movl	(%esi),%eax
 	movl	4(%esi),%ebx
 	movl	8(%esi),%ecx
-	movl	12(%esi),%edx
+	movl	12(%esi),%ebp
 	movl	16(%esi),%esi
 	movl	(%esi),%esi
 	movl	%eax,4(%esp)
 	movl	%ebx,8(%esp)
 	movl	%ecx,12(%esp)
-	movl	%edx,16(%esp)
+	movl	%ebp,16(%esp)
 	movl	%esi,20(%esp)
 	leal	-3(%edi),%ebx
-	movl	%ebp,24(%esp)
-	call	.L002PIC_me_up
-.L002PIC_me_up:
+	movl	%edx,24(%esp)
+	call	.L003PIC_me_up
+.L003PIC_me_up:
 	popl	%eax
-	leal	_GLOBAL_OFFSET_TABLE_+[.-.L002PIC_me_up](%eax),%eax
-	movl	OPENSSL_ia32cap_P@GOT(%eax),%eax
+	leal	OPENSSL_ia32cap_P-.L003PIC_me_up(%eax),%eax
 	btl	$26,(%eax)
-	jnc	.L003non_sse2
+	jnc	.L004non_sse2
 	movl	$-1,%eax
 	movd	%eax,%mm7
 	movl	8(%esp),%esi
@@ -84,7 +90,7 @@
 	psrlq	$32,%mm3
 	incl	%ecx
 .align	16
-.L0041st:
+.L0051st:
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
 	paddq	%mm0,%mm2
@@ -99,7 +105,7 @@
 	psrlq	$32,%mm3
 	leal	1(%ecx),%ecx
 	cmpl	%ebx,%ecx
-	jl	.L0041st
+	jl	.L0051st
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
 	paddq	%mm0,%mm2
@@ -113,7 +119,7 @@
 	paddq	%mm2,%mm3
 	movq	%mm3,32(%esp,%ebx,4)
 	incl	%edx
-.L005outer:
+.L006outer:
 	xorl	%ecx,%ecx
 	movd	(%edi,%edx,4),%mm4
 	movd	(%esi),%mm5
@@ -135,7 +141,7 @@
 	paddq	%mm6,%mm2
 	incl	%ecx
 	decl	%ebx
-.L006inner:
+.L007inner:
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
 	paddq	%mm0,%mm2
@@ -152,7 +158,7 @@
 	paddq	%mm6,%mm2
 	decl	%ebx
 	leal	1(%ecx),%ecx
-	jnz	.L006inner
+	jnz	.L007inner
 	movl	%ecx,%ebx
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
@@ -170,11 +176,11 @@
 	movq	%mm3,32(%esp,%ebx,4)
 	leal	1(%edx),%edx
 	cmpl	%ebx,%edx
-	jle	.L005outer
+	jle	.L006outer
 	emms
-	jmp	.L007common_tail
+	jmp	.L008common_tail
 .align	16
-.L003non_sse2:
+.L004non_sse2:
 	movl	8(%esp),%esi
 	leal	1(%ebx),%ebp
 	movl	12(%esp),%edi
@@ -185,12 +191,12 @@
 	leal	4(%edi,%ebx,4),%eax
 	orl	%edx,%ebp
 	movl	(%edi),%edi
-	jz	.L008bn_sqr_mont
+	jz	.L009bn_sqr_mont
 	movl	%eax,28(%esp)
 	movl	(%esi),%eax
 	xorl	%edx,%edx
 .align	16
-.L009mull:
+.L010mull:
 	movl	%edx,%ebp
 	mull	%edi
 	addl	%eax,%ebp
@@ -199,7 +205,7 @@
 	movl	(%esi,%ecx,4),%eax
 	cmpl	%ebx,%ecx
 	movl	%ebp,28(%esp,%ecx,4)
-	jl	.L009mull
+	jl	.L010mull
 	movl	%edx,%ebp
 	mull	%edi
 	movl	20(%esp),%edi
@@ -217,9 +223,9 @@
 	movl	4(%esi),%eax
 	adcl	$0,%edx
 	incl	%ecx
-	jmp	.L0102ndmadd
+	jmp	.L0112ndmadd
 .align	16
-.L0111stmadd:
+.L0121stmadd:
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ecx,4),%ebp
@@ -230,7 +236,7 @@
 	adcl	$0,%edx
 	cmpl	%ebx,%ecx
 	movl	%ebp,28(%esp,%ecx,4)
-	jl	.L0111stmadd
+	jl	.L0121stmadd
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ebx,4),%eax
@@ -253,7 +259,7 @@
 	adcl	$0,%edx
 	movl	$1,%ecx
 .align	16
-.L0102ndmadd:
+.L0112ndmadd:
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ecx,4),%ebp
@@ -264,7 +270,7 @@
 	adcl	$0,%edx
 	cmpl	%ebx,%ecx
 	movl	%ebp,24(%esp,%ecx,4)
-	jl	.L0102ndmadd
+	jl	.L0112ndmadd
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ebx,4),%ebp
@@ -280,7 +286,7 @@
 	movl	%edx,32(%esp,%ebx,4)
 	cmpl	28(%esp),%ecx
 	movl	%eax,36(%esp,%ebx,4)
-	je	.L007common_tail
+	je	.L008common_tail
 	movl	(%ecx),%edi
 	movl	8(%esp),%esi
 	movl	%ecx,12(%esp)
@@ -287,9 +293,9 @@
 	xorl	%ecx,%ecx
 	xorl	%edx,%edx
 	movl	(%esi),%eax
-	jmp	.L0111stmadd
+	jmp	.L0121stmadd
 .align	16
-.L008bn_sqr_mont:
+.L009bn_sqr_mont:
 	movl	%ebx,(%esp)
 	movl	%ecx,12(%esp)
 	movl	%edi,%eax
@@ -300,7 +306,7 @@
 	andl	$1,%ebx
 	incl	%ecx
 .align	16
-.L012sqr:
+.L013sqr:
 	movl	(%esi,%ecx,4),%eax
 	movl	%edx,%ebp
 	mull	%edi
@@ -312,7 +318,7 @@
 	cmpl	(%esp),%ecx
 	movl	%eax,%ebx
 	movl	%ebp,28(%esp,%ecx,4)
-	jl	.L012sqr
+	jl	.L013sqr
 	movl	(%esi,%ecx,4),%eax
 	movl	%edx,%ebp
 	mull	%edi
@@ -336,7 +342,7 @@
 	movl	4(%esi),%eax
 	movl	$1,%ecx
 .align	16
-.L0133rdmadd:
+.L0143rdmadd:
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ecx,4),%ebp
@@ -355,7 +361,7 @@
 	adcl	$0,%edx
 	cmpl	%ebx,%ecx
 	movl	%ebp,24(%esp,%ecx,4)
-	jl	.L0133rdmadd
+	jl	.L0143rdmadd
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ebx,4),%ebp
@@ -371,7 +377,7 @@
 	movl	%edx,32(%esp,%ebx,4)
 	cmpl	%ebx,%ecx
 	movl	%eax,36(%esp,%ebx,4)
-	je	.L007common_tail
+	je	.L008common_tail
 	movl	4(%esi,%ecx,4),%edi
 	leal	1(%ecx),%ecx
 	movl	%edi,%eax
@@ -383,12 +389,12 @@
 	xorl	%ebp,%ebp
 	cmpl	%ebx,%ecx
 	leal	1(%ecx),%ecx
-	je	.L014sqrlast
+	je	.L015sqrlast
 	movl	%edx,%ebx
 	shrl	$1,%edx
 	andl	$1,%ebx
 .align	16
-.L015sqradd:
+.L016sqradd:
 	movl	(%esi,%ecx,4),%eax
 	movl	%edx,%ebp
 	mull	%edi
@@ -404,13 +410,13 @@
 	cmpl	(%esp),%ecx
 	movl	%ebp,28(%esp,%ecx,4)
 	movl	%eax,%ebx
-	jle	.L015sqradd
+	jle	.L016sqradd
 	movl	%edx,%ebp
 	addl	%edx,%edx
 	shrl	$31,%ebp
 	addl	%ebx,%edx
 	adcl	$0,%ebp
-.L014sqrlast:
+.L015sqrlast:
 	movl	20(%esp),%edi
 	movl	16(%esp),%esi
 	imull	32(%esp),%edi
@@ -425,9 +431,9 @@
 	adcl	$0,%edx
 	movl	$1,%ecx
 	movl	4(%esi),%eax
-	jmp	.L0133rdmadd
+	jmp	.L0143rdmadd
 .align	16
-.L007common_tail:
+.L008common_tail:
 	movl	16(%esp),%ebp
 	movl	4(%esp),%edi
 	leal	32(%esp),%esi
@@ -435,26 +441,28 @@
 	movl	%ebx,%ecx
 	xorl	%edx,%edx
 .align	16
-.L016sub:
+.L017sub:
 	sbbl	(%ebp,%edx,4),%eax
 	movl	%eax,(%edi,%edx,4)
 	decl	%ecx
 	movl	4(%esi,%edx,4),%eax
 	leal	1(%edx),%edx
-	jge	.L016sub
+	jge	.L017sub
 	sbbl	$0,%eax
-	andl	%eax,%esi
-	notl	%eax
-	movl	%edi,%ebp
-	andl	%eax,%ebp
-	orl	%ebp,%esi
+	movl	$-1,%edx
+	xorl	%eax,%edx
+	jmp	.L018copy
 .align	16
-.L017copy:
-	movl	(%esi,%ebx,4),%eax
-	movl	%eax,(%edi,%ebx,4)
+.L018copy:
+	movl	32(%esp,%ebx,4),%esi
+	movl	(%edi,%ebx,4),%ebp
 	movl	%ecx,32(%esp,%ebx,4)
+	andl	%eax,%esi
+	andl	%edx,%ebp
+	orl	%esi,%ebp
+	movl	%ebp,(%edi,%ebx,4)
 	decl	%ebx
-	jge	.L017copy
+	jge	.L018copy
 	movl	24(%esp),%esp
 	movl	$1,%eax
 .L000just_leave:
@@ -469,7 +477,7 @@
 .byte	54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
 .byte	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
 .byte	111,114,103,62,0
-.comm	OPENSSL_ia32cap_P,8,4
+.comm	OPENSSL_ia32cap_P,16,4
 #else
 .file	"x86-mont.S"
 .text
@@ -488,44 +496,51 @@
 	jl	.L000just_leave
 	leal	20(%esp),%esi
 	leal	24(%esp),%edx
-	movl	%esp,%ebp
 	addl	$2,%edi
 	negl	%edi
-	leal	-32(%esp,%edi,4),%esp
+	leal	-32(%esp,%edi,4),%ebp
 	negl	%edi
-	movl	%esp,%eax
+	movl	%ebp,%eax
 	subl	%edx,%eax
 	andl	$2047,%eax
-	subl	%eax,%esp
-	xorl	%esp,%edx
+	subl	%eax,%ebp
+	xorl	%ebp,%edx
 	andl	$2048,%edx
 	xorl	$2048,%edx
-	subl	%edx,%esp
-	andl	$-64,%esp
-	movl	%ebp,%eax
-	subl	%esp,%eax
+	subl	%edx,%ebp
+	andl	$-64,%ebp
+	movl	%esp,%eax
+	subl	%ebp,%eax
 	andl	$-4096,%eax
+	movl	%esp,%edx
+	leal	(%ebp,%eax,1),%esp
+	movl	(%esp),%eax
+	cmpl	%ebp,%esp
+	ja	.L001page_walk
+	jmp	.L002page_walk_done
+.align	16
 .L001page_walk:
-	movl	(%esp,%eax,1),%edx
-	subl	$4096,%eax
-.byte	46
-	jnc	.L001page_walk
+	leal	-4096(%esp),%esp
+	movl	(%esp),%eax
+	cmpl	%ebp,%esp
+	ja	.L001page_walk
+.L002page_walk_done:
 	movl	(%esi),%eax
 	movl	4(%esi),%ebx
 	movl	8(%esi),%ecx
-	movl	12(%esi),%edx
+	movl	12(%esi),%ebp
 	movl	16(%esi),%esi
 	movl	(%esi),%esi
 	movl	%eax,4(%esp)
 	movl	%ebx,8(%esp)
 	movl	%ecx,12(%esp)
-	movl	%edx,16(%esp)
+	movl	%ebp,16(%esp)
 	movl	%esi,20(%esp)
 	leal	-3(%edi),%ebx
-	movl	%ebp,24(%esp)
+	movl	%edx,24(%esp)
 	leal	OPENSSL_ia32cap_P,%eax
 	btl	$26,(%eax)
-	jnc	.L002non_sse2
+	jnc	.L003non_sse2
 	movl	$-1,%eax
 	movd	%eax,%mm7
 	movl	8(%esp),%esi
@@ -549,7 +564,7 @@
 	psrlq	$32,%mm3
 	incl	%ecx
 .align	16
-.L0031st:
+.L0041st:
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
 	paddq	%mm0,%mm2
@@ -564,7 +579,7 @@
 	psrlq	$32,%mm3
 	leal	1(%ecx),%ecx
 	cmpl	%ebx,%ecx
-	jl	.L0031st
+	jl	.L0041st
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
 	paddq	%mm0,%mm2
@@ -578,7 +593,7 @@
 	paddq	%mm2,%mm3
 	movq	%mm3,32(%esp,%ebx,4)
 	incl	%edx
-.L004outer:
+.L005outer:
 	xorl	%ecx,%ecx
 	movd	(%edi,%edx,4),%mm4
 	movd	(%esi),%mm5
@@ -600,7 +615,7 @@
 	paddq	%mm6,%mm2
 	incl	%ecx
 	decl	%ebx
-.L005inner:
+.L006inner:
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
 	paddq	%mm0,%mm2
@@ -617,7 +632,7 @@
 	paddq	%mm6,%mm2
 	decl	%ebx
 	leal	1(%ecx),%ecx
-	jnz	.L005inner
+	jnz	.L006inner
 	movl	%ecx,%ebx
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
@@ -635,11 +650,11 @@
 	movq	%mm3,32(%esp,%ebx,4)
 	leal	1(%edx),%edx
 	cmpl	%ebx,%edx
-	jle	.L004outer
+	jle	.L005outer
 	emms
-	jmp	.L006common_tail
+	jmp	.L007common_tail
 .align	16
-.L002non_sse2:
+.L003non_sse2:
 	movl	8(%esp),%esi
 	leal	1(%ebx),%ebp
 	movl	12(%esp),%edi
@@ -650,12 +665,12 @@
 	leal	4(%edi,%ebx,4),%eax
 	orl	%edx,%ebp
 	movl	(%edi),%edi
-	jz	.L007bn_sqr_mont
+	jz	.L008bn_sqr_mont
 	movl	%eax,28(%esp)
 	movl	(%esi),%eax
 	xorl	%edx,%edx
 .align	16
-.L008mull:
+.L009mull:
 	movl	%edx,%ebp
 	mull	%edi
 	addl	%eax,%ebp
@@ -664,7 +679,7 @@
 	movl	(%esi,%ecx,4),%eax
 	cmpl	%ebx,%ecx
 	movl	%ebp,28(%esp,%ecx,4)
-	jl	.L008mull
+	jl	.L009mull
 	movl	%edx,%ebp
 	mull	%edi
 	movl	20(%esp),%edi
@@ -682,9 +697,9 @@
 	movl	4(%esi),%eax
 	adcl	$0,%edx
 	incl	%ecx
-	jmp	.L0092ndmadd
+	jmp	.L0102ndmadd
 .align	16
-.L0101stmadd:
+.L0111stmadd:
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ecx,4),%ebp
@@ -695,7 +710,7 @@
 	adcl	$0,%edx
 	cmpl	%ebx,%ecx
 	movl	%ebp,28(%esp,%ecx,4)
-	jl	.L0101stmadd
+	jl	.L0111stmadd
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ebx,4),%eax
@@ -718,7 +733,7 @@
 	adcl	$0,%edx
 	movl	$1,%ecx
 .align	16
-.L0092ndmadd:
+.L0102ndmadd:
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ecx,4),%ebp
@@ -729,7 +744,7 @@
 	adcl	$0,%edx
 	cmpl	%ebx,%ecx
 	movl	%ebp,24(%esp,%ecx,4)
-	jl	.L0092ndmadd
+	jl	.L0102ndmadd
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ebx,4),%ebp
@@ -745,7 +760,7 @@
 	movl	%edx,32(%esp,%ebx,4)
 	cmpl	28(%esp),%ecx
 	movl	%eax,36(%esp,%ebx,4)
-	je	.L006common_tail
+	je	.L007common_tail
 	movl	(%ecx),%edi
 	movl	8(%esp),%esi
 	movl	%ecx,12(%esp)
@@ -752,9 +767,9 @@
 	xorl	%ecx,%ecx
 	xorl	%edx,%edx
 	movl	(%esi),%eax
-	jmp	.L0101stmadd
+	jmp	.L0111stmadd
 .align	16
-.L007bn_sqr_mont:
+.L008bn_sqr_mont:
 	movl	%ebx,(%esp)
 	movl	%ecx,12(%esp)
 	movl	%edi,%eax
@@ -765,7 +780,7 @@
 	andl	$1,%ebx
 	incl	%ecx
 .align	16
-.L011sqr:
+.L012sqr:
 	movl	(%esi,%ecx,4),%eax
 	movl	%edx,%ebp
 	mull	%edi
@@ -777,7 +792,7 @@
 	cmpl	(%esp),%ecx
 	movl	%eax,%ebx
 	movl	%ebp,28(%esp,%ecx,4)
-	jl	.L011sqr
+	jl	.L012sqr
 	movl	(%esi,%ecx,4),%eax
 	movl	%edx,%ebp
 	mull	%edi
@@ -801,7 +816,7 @@
 	movl	4(%esi),%eax
 	movl	$1,%ecx
 .align	16
-.L0123rdmadd:
+.L0133rdmadd:
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ecx,4),%ebp
@@ -820,7 +835,7 @@
 	adcl	$0,%edx
 	cmpl	%ebx,%ecx
 	movl	%ebp,24(%esp,%ecx,4)
-	jl	.L0123rdmadd
+	jl	.L0133rdmadd
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ebx,4),%ebp
@@ -836,7 +851,7 @@
 	movl	%edx,32(%esp,%ebx,4)
 	cmpl	%ebx,%ecx
 	movl	%eax,36(%esp,%ebx,4)
-	je	.L006common_tail
+	je	.L007common_tail
 	movl	4(%esi,%ecx,4),%edi
 	leal	1(%ecx),%ecx
 	movl	%edi,%eax
@@ -848,12 +863,12 @@
 	xorl	%ebp,%ebp
 	cmpl	%ebx,%ecx
 	leal	1(%ecx),%ecx
-	je	.L013sqrlast
+	je	.L014sqrlast
 	movl	%edx,%ebx
 	shrl	$1,%edx
 	andl	$1,%ebx
 .align	16
-.L014sqradd:
+.L015sqradd:
 	movl	(%esi,%ecx,4),%eax
 	movl	%edx,%ebp
 	mull	%edi
@@ -869,13 +884,13 @@
 	cmpl	(%esp),%ecx
 	movl	%ebp,28(%esp,%ecx,4)
 	movl	%eax,%ebx
-	jle	.L014sqradd
+	jle	.L015sqradd
 	movl	%edx,%ebp
 	addl	%edx,%edx
 	shrl	$31,%ebp
 	addl	%ebx,%edx
 	adcl	$0,%ebp
-.L013sqrlast:
+.L014sqrlast:
 	movl	20(%esp),%edi
 	movl	16(%esp),%esi
 	imull	32(%esp),%edi
@@ -890,9 +905,9 @@
 	adcl	$0,%edx
 	movl	$1,%ecx
 	movl	4(%esi),%eax
-	jmp	.L0123rdmadd
+	jmp	.L0133rdmadd
 .align	16
-.L006common_tail:
+.L007common_tail:
 	movl	16(%esp),%ebp
 	movl	4(%esp),%edi
 	leal	32(%esp),%esi
@@ -900,26 +915,28 @@
 	movl	%ebx,%ecx
 	xorl	%edx,%edx
 .align	16
-.L015sub:
+.L016sub:
 	sbbl	(%ebp,%edx,4),%eax
 	movl	%eax,(%edi,%edx,4)
 	decl	%ecx
 	movl	4(%esi,%edx,4),%eax
 	leal	1(%edx),%edx
-	jge	.L015sub
+	jge	.L016sub
 	sbbl	$0,%eax
-	andl	%eax,%esi
-	notl	%eax
-	movl	%edi,%ebp
-	andl	%eax,%ebp
-	orl	%ebp,%esi
+	movl	$-1,%edx
+	xorl	%eax,%edx
+	jmp	.L017copy
 .align	16
-.L016copy:
-	movl	(%esi,%ebx,4),%eax
-	movl	%eax,(%edi,%ebx,4)
+.L017copy:
+	movl	32(%esp,%ebx,4),%esi
+	movl	(%edi,%ebx,4),%ebp
 	movl	%ecx,32(%esp,%ebx,4)
+	andl	%eax,%esi
+	andl	%edx,%ebp
+	orl	%esi,%ebp
+	movl	%ebp,(%edi,%ebx,4)
 	decl	%ebx
-	jge	.L016copy
+	jge	.L017copy
 	movl	24(%esp),%esp
 	movl	$1,%eax
 .L000just_leave:
@@ -934,5 +951,5 @@
 .byte	54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
 .byte	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
 .byte	111,114,103,62,0
-.comm	OPENSSL_ia32cap_P,8,4
+.comm	OPENSSL_ia32cap_P,16,4
 #endif
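
Beyond the local-label renumbering, the x86-mont.S patch makes two substantive changes, visible in both the PIC and non-PIC halves. The stack probe now computes the new frame address in %ebp and lowers %esp one 4096-byte page at a time (.L001page_walk / .L002page_walk_done), with the caller's %esp carried in %edx and stored at 24(%esp). And the tail of bn_mul_mont replaces the old pointer select before the copy loop with a masked, constant-time select: after the trial subtraction, sbbl $0,%eax leaves an all-zeros or all-ones mask, and the copy loop merges the reduced and unreduced values word by word. A minimal C sketch of that select (illustrative names, not OpenSSL's helper):

#include <stddef.h>
#include <stdint.h>

/* rp[] holds the trial subtraction t - n, tp[] the unreduced t.
 * borrow is 1 if the subtraction went negative, else 0. */
static void const_time_select(uint32_t *rp, const uint32_t *tp,
                              uint32_t borrow, size_t num)
{
    uint32_t mask = (uint32_t)0 - borrow;       /* 0x00000000 or 0xffffffff */
    for (size_t i = 0; i < num; i++)
        rp[i] = (tp[i] & mask) | (rp[i] & ~mask);
}

No branch depends on the comparison, so the choice between the reduced and unreduced result is not exposed through a data-dependent jump.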

Modified: trunk/secure/lib/libcrypto/i386/x86cpuid.S
===================================================================
--- trunk/secure/lib/libcrypto/i386/x86cpuid.S	2019-01-20 05:38:15 UTC (rev 12153)
+++ trunk/secure/lib/libcrypto/i386/x86cpuid.S	2019-01-20 05:38:27 UTC (rev 12154)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
-# $FreeBSD: stable/10/secure/lib/libcrypto/i386/x86cpuid.S 299983 2016-05-16 22:42:09Z jkim $
-# Do not modify. This file is auto-generated from x86cpuid.pl.
+/* $FreeBSD: stable/11/secure/lib/libcrypto/i386/x86cpuid.S 325335 2017-11-02 18:22:53Z jkim $ */
+/* Do not modify. This file is auto-generated from x86cpuid.pl. */
 #ifdef PIC
 .file	"x86cpuid.S"
 .text
@@ -24,6 +24,8 @@
 	popl	%eax
 	xorl	%eax,%ecx
 	xorl	%eax,%eax
+	movl	20(%esp),%esi
+	movl	%eax,8(%esi)
 	btl	$21,%ecx
 	jnc	.L000nocpuid
 	.byte	0x0f,0xa2
@@ -77,14 +79,14 @@
 	jmp	.L002generic
 .L001intel:
 	cmpl	$4,%edi
-	movl	$-1,%edi
+	movl	$-1,%esi
 	jb	.L003nocacheinfo
 	movl	$4,%eax
 	movl	$0,%ecx
 	.byte	0x0f,0xa2
-	movl	%eax,%edi
-	shrl	$14,%edi
-	andl	$4095,%edi
+	movl	%eax,%esi
+	shrl	$14,%esi
+	andl	$4095,%esi
 .L003nocacheinfo:
 	movl	$1,%eax
 	xorl	%ecx,%ecx
@@ -101,7 +103,7 @@
 	btl	$28,%edx
 	jnc	.L002generic
 	andl	$4026531839,%edx
-	cmpl	$0,%edi
+	cmpl	$0,%esi
 	je	.L002generic
 	orl	$268435456,%edx
 	shrl	$16,%ebx
@@ -113,21 +115,30 @@
 	andl	$4294965247,%ecx
 	movl	%edx,%esi
 	orl	%ecx,%ebp
-	btl	$27,%ecx
-	jnc	.L005clear_avx
+	cmpl	$7,%edi
+	movl	20(%esp),%edi
+	jb	.L005no_extended_info
+	movl	$7,%eax
 	xorl	%ecx,%ecx
+	.byte	0x0f,0xa2
+	movl	%ebx,8(%edi)
+.L005no_extended_info:
+	btl	$27,%ebp
+	jnc	.L006clear_avx
+	xorl	%ecx,%ecx
 .byte	15,1,208
 	andl	$6,%eax
 	cmpl	$6,%eax
-	je	.L006done
+	je	.L007done
 	cmpl	$2,%eax
-	je	.L005clear_avx
-.L007clear_xmm:
+	je	.L006clear_avx
+.L008clear_xmm:
 	andl	$4261412861,%ebp
 	andl	$4278190079,%esi
-.L005clear_avx:
+.L006clear_avx:
 	andl	$4026525695,%ebp
-.L006done:
+	andl	$4294967263,8(%edi)
+.L007done:
 	movl	%esi,%eax
 	movl	%ebp,%edx
 .L000nocpuid:
@@ -144,15 +155,14 @@
 .L_OPENSSL_rdtsc_begin:
 	xorl	%eax,%eax
 	xorl	%edx,%edx
-	call	.L008PIC_me_up
-.L008PIC_me_up:
+	call	.L009PIC_me_up
+.L009PIC_me_up:
 	popl	%ecx
-	leal	_GLOBAL_OFFSET_TABLE_+[.-.L008PIC_me_up](%ecx),%ecx
-	movl	OPENSSL_ia32cap_P@GOT(%ecx),%ecx
+	leal	OPENSSL_ia32cap_P-.L009PIC_me_up(%ecx),%ecx
 	btl	$4,(%ecx)
-	jnc	.L009notsc
+	jnc	.L010notsc
 	.byte	0x0f,0x31
-.L009notsc:
+.L010notsc:
 	ret
 .size	OPENSSL_rdtsc,.-.L_OPENSSL_rdtsc_begin
 .globl	OPENSSL_instrument_halt
@@ -160,20 +170,19 @@
 .align	16
 OPENSSL_instrument_halt:
 .L_OPENSSL_instrument_halt_begin:
-	call	.L010PIC_me_up
-.L010PIC_me_up:
+	call	.L011PIC_me_up
+.L011PIC_me_up:
 	popl	%ecx
-	leal	_GLOBAL_OFFSET_TABLE_+[.-.L010PIC_me_up](%ecx),%ecx
-	movl	OPENSSL_ia32cap_P@GOT(%ecx),%ecx
+	leal	OPENSSL_ia32cap_P-.L011PIC_me_up(%ecx),%ecx
 	btl	$4,(%ecx)
-	jnc	.L011nohalt
+	jnc	.L012nohalt
 .long	2421723150
 	andl	$3,%eax
-	jnz	.L011nohalt
+	jnz	.L012nohalt
 	pushfl
 	popl	%eax
 	btl	$9,%eax
-	jnc	.L011nohalt
+	jnc	.L012nohalt
 	.byte	0x0f,0x31
 	pushl	%edx
 	pushl	%eax
@@ -183,7 +192,7 @@
 	sbbl	4(%esp),%edx
 	addl	$8,%esp
 	ret
-.L011nohalt:
+.L012nohalt:
 	xorl	%eax,%eax
 	xorl	%edx,%edx
 	ret
@@ -196,21 +205,21 @@
 	pushfl
 	popl	%eax
 	btl	$9,%eax
-	jnc	.L012nospin
+	jnc	.L013nospin
 	movl	4(%esp),%eax
 	movl	8(%esp),%ecx
 .long	2430111262
 	xorl	%eax,%eax
 	movl	(%ecx),%edx
-	jmp	.L013spin
+	jmp	.L014spin
 .align	16
-.L013spin:
+.L014spin:
 	incl	%eax
 	cmpl	(%ecx),%edx
-	je	.L013spin
+	je	.L014spin
 .long	529567888
 	ret
-.L012nospin:
+.L013nospin:
 	xorl	%eax,%eax
 	xorl	%edx,%edx
 	ret
@@ -222,17 +231,16 @@
 .L_OPENSSL_wipe_cpu_begin:
 	xorl	%eax,%eax
 	xorl	%edx,%edx
-	call	.L014PIC_me_up
-.L014PIC_me_up:
+	call	.L015PIC_me_up
+.L015PIC_me_up:
 	popl	%ecx
-	leal	_GLOBAL_OFFSET_TABLE_+[.-.L014PIC_me_up](%ecx),%ecx
-	movl	OPENSSL_ia32cap_P@GOT(%ecx),%ecx
+	leal	OPENSSL_ia32cap_P-.L015PIC_me_up(%ecx),%ecx
 	movl	(%ecx),%ecx
 	btl	$1,(%ecx)
-	jnc	.L015no_x87
+	jnc	.L016no_x87
 	andl	$83886080,%ecx
 	cmpl	$83886080,%ecx
-	jne	.L016no_sse2
+	jne	.L017no_sse2
 	pxor	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
 	pxor	%xmm2,%xmm2
@@ -241,9 +249,9 @@
 	pxor	%xmm5,%xmm5
 	pxor	%xmm6,%xmm6
 	pxor	%xmm7,%xmm7
-.L016no_sse2:
+.L017no_sse2:
 .long	4007259865,4007259865,4007259865,4007259865,2430851995
-.L015no_x87:
+.L016no_x87:
 	leal	4(%esp),%eax
 	ret
 .size	OPENSSL_wipe_cpu,.-.L_OPENSSL_wipe_cpu_begin
@@ -257,11 +265,11 @@
 	pushl	%ebx
 	nop
 	movl	(%edx),%eax
-.L017spin:
+.L018spin:
 	leal	(%eax,%ecx,1),%ebx
 	nop
 .long	447811568
-	jne	.L017spin
+	jne	.L018spin
 	movl	%ebx,%eax
 	popl	%ebx
 	ret
@@ -302,32 +310,32 @@
 	movl	8(%esp),%ecx
 	xorl	%eax,%eax
 	cmpl	$7,%ecx
-	jae	.L018lot
+	jae	.L019lot
 	cmpl	$0,%ecx
-	je	.L019ret
-.L020little:
+	je	.L020ret
+.L021little:
 	movb	%al,(%edx)
 	subl	$1,%ecx
 	leal	1(%edx),%edx
-	jnz	.L020little
-.L019ret:
+	jnz	.L021little
+.L020ret:
 	ret
 .align	16
-.L018lot:
+.L019lot:
 	testl	$3,%edx
-	jz	.L021aligned
+	jz	.L022aligned
 	movb	%al,(%edx)
 	leal	-1(%ecx),%ecx
 	leal	1(%edx),%edx
-	jmp	.L018lot
-.L021aligned:
+	jmp	.L019lot
+.L022aligned:
 	movl	%eax,(%edx)
 	leal	-4(%ecx),%ecx
 	testl	$-4,%ecx
 	leal	4(%edx),%edx
-	jnz	.L021aligned
+	jnz	.L022aligned
 	cmpl	$0,%ecx
-	jne	.L020little
+	jne	.L021little
 	ret
 .size	OPENSSL_cleanse,.-.L_OPENSSL_cleanse_begin
 .globl	OPENSSL_ia32_rdrand
@@ -336,18 +344,33 @@
 OPENSSL_ia32_rdrand:
 .L_OPENSSL_ia32_rdrand_begin:
 	movl	$8,%ecx
-.L022loop:
+.L023loop:
 .byte	15,199,240
-	jc	.L023break
-	loop	.L022loop
-.L023break:
+	jc	.L024break
+	loop	.L023loop
+.L024break:
 	cmpl	$0,%eax
 	cmovel	%ecx,%eax
 	ret
 .size	OPENSSL_ia32_rdrand,.-.L_OPENSSL_ia32_rdrand_begin
+.globl	OPENSSL_ia32_rdseed
+.type	OPENSSL_ia32_rdseed,@function
+.align	16
+OPENSSL_ia32_rdseed:
+.L_OPENSSL_ia32_rdseed_begin:
+	movl	$8,%ecx
+.L025loop:
+.byte	15,199,248
+	jc	.L026break
+	loop	.L025loop
+.L026break:
+	cmpl	$0,%eax
+	cmovel	%ecx,%eax
+	ret
+.size	OPENSSL_ia32_rdseed,.-.L_OPENSSL_ia32_rdseed_begin
 .hidden	OPENSSL_cpuid_setup
 .hidden	OPENSSL_ia32cap_P
-.comm	OPENSSL_ia32cap_P,8,4
+.comm	OPENSSL_ia32cap_P,16,4
 .section	.init
 	call	OPENSSL_cpuid_setup
 #else
@@ -373,6 +396,8 @@
 	popl	%eax
 	xorl	%eax,%ecx
 	xorl	%eax,%eax
+	movl	20(%esp),%esi
+	movl	%eax,8(%esi)
 	btl	$21,%ecx
 	jnc	.L000nocpuid
 	.byte	0x0f,0xa2
@@ -426,14 +451,14 @@
 	jmp	.L002generic
 .L001intel:
 	cmpl	$4,%edi
-	movl	$-1,%edi
+	movl	$-1,%esi
 	jb	.L003nocacheinfo
 	movl	$4,%eax
 	movl	$0,%ecx
 	.byte	0x0f,0xa2
-	movl	%eax,%edi
-	shrl	$14,%edi
-	andl	$4095,%edi
+	movl	%eax,%esi
+	shrl	$14,%esi
+	andl	$4095,%esi
 .L003nocacheinfo:
 	movl	$1,%eax
 	xorl	%ecx,%ecx
@@ -450,7 +475,7 @@
 	btl	$28,%edx
 	jnc	.L002generic
 	andl	$4026531839,%edx
-	cmpl	$0,%edi
+	cmpl	$0,%esi
 	je	.L002generic
 	orl	$268435456,%edx
 	shrl	$16,%ebx
@@ -462,21 +487,30 @@
 	andl	$4294965247,%ecx
 	movl	%edx,%esi
 	orl	%ecx,%ebp
-	btl	$27,%ecx
-	jnc	.L005clear_avx
+	cmpl	$7,%edi
+	movl	20(%esp),%edi
+	jb	.L005no_extended_info
+	movl	$7,%eax
 	xorl	%ecx,%ecx
+	.byte	0x0f,0xa2
+	movl	%ebx,8(%edi)
+.L005no_extended_info:
+	btl	$27,%ebp
+	jnc	.L006clear_avx
+	xorl	%ecx,%ecx
 .byte	15,1,208
 	andl	$6,%eax
 	cmpl	$6,%eax
-	je	.L006done
+	je	.L007done
 	cmpl	$2,%eax
-	je	.L005clear_avx
-.L007clear_xmm:
+	je	.L006clear_avx
+.L008clear_xmm:
 	andl	$4261412861,%ebp
 	andl	$4278190079,%esi
-.L005clear_avx:
+.L006clear_avx:
 	andl	$4026525695,%ebp
-.L006done:
+	andl	$4294967263,8(%edi)
+.L007done:
 	movl	%esi,%eax
 	movl	%ebp,%edx
 .L000nocpuid:
@@ -495,9 +529,9 @@
 	xorl	%edx,%edx
 	leal	OPENSSL_ia32cap_P,%ecx
 	btl	$4,(%ecx)
-	jnc	.L008notsc
+	jnc	.L009notsc
 	.byte	0x0f,0x31
-.L008notsc:
+.L009notsc:
 	ret
 .size	OPENSSL_rdtsc,.-.L_OPENSSL_rdtsc_begin
 .globl	OPENSSL_instrument_halt
@@ -507,14 +541,14 @@
 .L_OPENSSL_instrument_halt_begin:
 	leal	OPENSSL_ia32cap_P,%ecx
 	btl	$4,(%ecx)
-	jnc	.L009nohalt
+	jnc	.L010nohalt
 .long	2421723150
 	andl	$3,%eax
-	jnz	.L009nohalt
+	jnz	.L010nohalt
 	pushfl
 	popl	%eax
 	btl	$9,%eax
-	jnc	.L009nohalt
+	jnc	.L010nohalt
 	.byte	0x0f,0x31
 	pushl	%edx
 	pushl	%eax
@@ -524,7 +558,7 @@
 	sbbl	4(%esp),%edx
 	addl	$8,%esp
 	ret
-.L009nohalt:
+.L010nohalt:
 	xorl	%eax,%eax
 	xorl	%edx,%edx
 	ret
@@ -537,21 +571,21 @@
 	pushfl
 	popl	%eax
 	btl	$9,%eax
-	jnc	.L010nospin
+	jnc	.L011nospin
 	movl	4(%esp),%eax
 	movl	8(%esp),%ecx
 .long	2430111262
 	xorl	%eax,%eax
 	movl	(%ecx),%edx
-	jmp	.L011spin
+	jmp	.L012spin
 .align	16
-.L011spin:
+.L012spin:
 	incl	%eax
 	cmpl	(%ecx),%edx
-	je	.L011spin
+	je	.L012spin
 .long	529567888
 	ret
-.L010nospin:
+.L011nospin:
 	xorl	%eax,%eax
 	xorl	%edx,%edx
 	ret
@@ -566,10 +600,10 @@
 	leal	OPENSSL_ia32cap_P,%ecx
 	movl	(%ecx),%ecx
 	btl	$1,(%ecx)
-	jnc	.L012no_x87
+	jnc	.L013no_x87
 	andl	$83886080,%ecx
 	cmpl	$83886080,%ecx
-	jne	.L013no_sse2
+	jne	.L014no_sse2
 	pxor	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
 	pxor	%xmm2,%xmm2
@@ -578,9 +612,9 @@
 	pxor	%xmm5,%xmm5
 	pxor	%xmm6,%xmm6
 	pxor	%xmm7,%xmm7
-.L013no_sse2:
+.L014no_sse2:
 .long	4007259865,4007259865,4007259865,4007259865,2430851995
-.L012no_x87:
+.L013no_x87:
 	leal	4(%esp),%eax
 	ret
 .size	OPENSSL_wipe_cpu,.-.L_OPENSSL_wipe_cpu_begin
@@ -594,11 +628,11 @@
 	pushl	%ebx
 	nop
 	movl	(%edx),%eax
-.L014spin:
+.L015spin:
 	leal	(%eax,%ecx,1),%ebx
 	nop
 .long	447811568
-	jne	.L014spin
+	jne	.L015spin
 	movl	%ebx,%eax
 	popl	%ebx
 	ret
@@ -639,32 +673,32 @@
 	movl	8(%esp),%ecx
 	xorl	%eax,%eax
 	cmpl	$7,%ecx
-	jae	.L015lot
+	jae	.L016lot
 	cmpl	$0,%ecx
-	je	.L016ret
-.L017little:
+	je	.L017ret
+.L018little:
 	movb	%al,(%edx)
 	subl	$1,%ecx
 	leal	1(%edx),%edx
-	jnz	.L017little
-.L016ret:
+	jnz	.L018little
+.L017ret:
 	ret
 .align	16
-.L015lot:
+.L016lot:
 	testl	$3,%edx
-	jz	.L018aligned
+	jz	.L019aligned
 	movb	%al,(%edx)
 	leal	-1(%ecx),%ecx
 	leal	1(%edx),%edx
-	jmp	.L015lot
-.L018aligned:
+	jmp	.L016lot
+.L019aligned:
 	movl	%eax,(%edx)
 	leal	-4(%ecx),%ecx
 	testl	$-4,%ecx
 	leal	4(%edx),%edx
-	jnz	.L018aligned
+	jnz	.L019aligned
 	cmpl	$0,%ecx
-	jne	.L017little
+	jne	.L018little
 	ret
 .size	OPENSSL_cleanse,.-.L_OPENSSL_cleanse_begin
 .globl	OPENSSL_ia32_rdrand
@@ -673,18 +707,33 @@
 OPENSSL_ia32_rdrand:
 .L_OPENSSL_ia32_rdrand_begin:
 	movl	$8,%ecx
-.L019loop:
+.L020loop:
 .byte	15,199,240
-	jc	.L020break
-	loop	.L019loop
-.L020break:
+	jc	.L021break
+	loop	.L020loop
+.L021break:
 	cmpl	$0,%eax
 	cmovel	%ecx,%eax
 	ret
 .size	OPENSSL_ia32_rdrand,.-.L_OPENSSL_ia32_rdrand_begin
+.globl	OPENSSL_ia32_rdseed
+.type	OPENSSL_ia32_rdseed,@function
+.align	16
+OPENSSL_ia32_rdseed:
+.L_OPENSSL_ia32_rdseed_begin:
+	movl	$8,%ecx
+.L022loop:
+.byte	15,199,248
+	jc	.L023break
+	loop	.L022loop
+.L023break:
+	cmpl	$0,%eax
+	cmovel	%ecx,%eax
+	ret
+.size	OPENSSL_ia32_rdseed,.-.L_OPENSSL_ia32_rdseed_begin
 .hidden	OPENSSL_cpuid_setup
 .hidden	OPENSSL_ia32cap_P
-.comm	OPENSSL_ia32cap_P,8,4
+.comm	OPENSSL_ia32cap_P,16,4
 .section	.init
 	call	OPENSSL_cpuid_setup
 #endif
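
The x86cpuid.S changes are the source of the .comm size bump seen throughout this commit. OPENSSL_ia32_cpuid now zeroes a third capability dword up front (movl %eax,8(%esi)), queries CPUID leaf 7 with ECX=0 when the CPU reports that leaf and stores EBX there, and clears bit 5 of that word (AVX2) on the paths where the AVX state is not usable. A new OPENSSL_ia32_rdseed routine retries the rdseed instruction (emitted as .byte 15,199,248) up to eight times, mirroring OPENSSL_ia32_rdrand; the final cmovel substitutes the nonzero loop counter when a successfully delivered value happens to be zero, so a zero return can still signal failure. A hedged C sketch of the same two ideas, assuming GCC/Clang's <cpuid.h> helpers and the RDSEED intrinsic from <immintrin.h> (build with -mrdseed); these helpers are illustrative, not OpenSSL's API:

#include <cpuid.h>
#include <immintrin.h>

static int have_rdseed(void)
{
    unsigned int a, b, c, d;
    /* CPUID.(EAX=7,ECX=0):EBX bit 18 = RDSEED; this EBX is the word the
     * patch stores into the third dword of OPENSSL_ia32cap_P. */
    return __get_cpuid_count(7, 0, &a, &b, &c, &d) && ((b >> 18) & 1);
}

static int rdseed32(unsigned int *out)
{
    for (int i = 0; i < 8; i++)       /* same bound as the movl $8,%ecx loop */
        if (_rdseed32_step(out))      /* 1 when CF=1, i.e. a seed was delivered */
            return 1;
    return 0;                         /* entropy temporarily unavailable */
}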


