[Midnightbsd-cvs] src [12156] trunk/secure/lib/libcrypto: add missing files

laffer1 at midnightbsd.org
Sun Jan 20 00:40:36 EST 2019


Revision: 12156
          http://svnweb.midnightbsd.org/src/?rev=12156
Author:   laffer1
Date:     2019-01-20 00:40:35 -0500 (Sun, 20 Jan 2019)
Log Message:
-----------
add missing files

Added Paths:
-----------
    trunk/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S
    trunk/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S
    trunk/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S
    trunk/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S
    trunk/secure/lib/libcrypto/amd64/rsaz-avx2.S
    trunk/secure/lib/libcrypto/amd64/rsaz-x86_64.S
    trunk/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S
    trunk/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S
    trunk/secure/lib/libcrypto/engines/lib4758cca/Makefile.depend
    trunk/secure/lib/libcrypto/engines/libaep/Makefile.depend
    trunk/secure/lib/libcrypto/engines/libatalla/Makefile.depend
    trunk/secure/lib/libcrypto/engines/libcapi/
    trunk/secure/lib/libcrypto/engines/libcapi/Makefile
    trunk/secure/lib/libcrypto/engines/libcapi/Makefile.depend
    trunk/secure/lib/libcrypto/engines/libchil/Makefile.depend
    trunk/secure/lib/libcrypto/engines/libcswift/Makefile.depend
    trunk/secure/lib/libcrypto/engines/libgost/Makefile.depend
    trunk/secure/lib/libcrypto/engines/libnuron/Makefile.depend
    trunk/secure/lib/libcrypto/engines/libsureware/Makefile.depend
    trunk/secure/lib/libcrypto/engines/libubsec/Makefile.depend
    trunk/secure/lib/libcrypto/man/ASN1_TIME_set.3
    trunk/secure/lib/libcrypto/man/EC_GFp_simple_method.3
    trunk/secure/lib/libcrypto/man/EC_GROUP_copy.3
    trunk/secure/lib/libcrypto/man/EC_GROUP_new.3
    trunk/secure/lib/libcrypto/man/EC_KEY_new.3
    trunk/secure/lib/libcrypto/man/EC_POINT_add.3
    trunk/secure/lib/libcrypto/man/EC_POINT_new.3
    trunk/secure/lib/libcrypto/man/EVP_PKEY_meth_new.3
    trunk/secure/lib/libcrypto/man/OPENSSL_instrument_bus.3
    trunk/secure/lib/libcrypto/man/X509_check_host.3
    trunk/secure/lib/libcrypto/man/X509_check_private_key.3
    trunk/secure/lib/libcrypto/man/X509_cmp_time.3
    trunk/secure/lib/libcrypto/man/d2i_ECPKParameters.3
    trunk/secure/lib/libcrypto/man/ec.3

Added: trunk/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S	                        (rev 0)
+++ trunk/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,756 @@
+/* $MidnightBSD$ */
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from aesni-gcm-x86_64.pl. */
+.text	
+
+.type	_aesni_ctr32_ghash_6x,@function
+.align	32
+_aesni_ctr32_ghash_6x:
+	vmovdqu	32(%r11),%xmm2
+	subq	$6,%rdx
+	vpxor	%xmm4,%xmm4,%xmm4
+	vmovdqu	0-128(%rcx),%xmm15
+	vpaddb	%xmm2,%xmm1,%xmm10
+	vpaddb	%xmm2,%xmm10,%xmm11
+	vpaddb	%xmm2,%xmm11,%xmm12
+	vpaddb	%xmm2,%xmm12,%xmm13
+	vpaddb	%xmm2,%xmm13,%xmm14
+	vpxor	%xmm15,%xmm1,%xmm9
+	vmovdqu	%xmm4,16+8(%rsp)
+	jmp	.Loop6x
+
+.align	32
+.Loop6x:
+	addl	$100663296,%ebx
+	jc	.Lhandle_ctr32
+	vmovdqu	0-32(%r9),%xmm3
+	vpaddb	%xmm2,%xmm14,%xmm1
+	vpxor	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm15,%xmm11,%xmm11
+
+.Lresume_ctr32:
+	vmovdqu	%xmm1,(%r8)
+	vpclmulqdq	$0x10,%xmm3,%xmm7,%xmm5
+	vpxor	%xmm15,%xmm12,%xmm12
+	vmovups	16-128(%rcx),%xmm2
+	vpclmulqdq	$0x01,%xmm3,%xmm7,%xmm6
+	xorq	%r12,%r12
+	cmpq	%r14,%r15
+
+	vaesenc	%xmm2,%xmm9,%xmm9
+	vmovdqu	48+8(%rsp),%xmm0
+	vpxor	%xmm15,%xmm13,%xmm13
+	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm1
+	vaesenc	%xmm2,%xmm10,%xmm10
+	vpxor	%xmm15,%xmm14,%xmm14
+	setnc	%r12b
+	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vmovdqu	16-32(%r9),%xmm3
+	negq	%r12
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm3,%xmm0,%xmm5
+	vpxor	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm2,%xmm13,%xmm13
+	vpxor	%xmm5,%xmm1,%xmm4
+	andq	$0x60,%r12
+	vmovups	32-128(%rcx),%xmm15
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm1
+	vaesenc	%xmm2,%xmm14,%xmm14
+
+	vpclmulqdq	$0x01,%xmm3,%xmm0,%xmm2
+	leaq	(%r14,%r12,1),%r14
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	16+8(%rsp),%xmm8,%xmm8
+	vpclmulqdq	$0x11,%xmm3,%xmm0,%xmm3
+	vmovdqu	64+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	88(%r14),%r13
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	80(%r14),%r12
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,32+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,40+8(%rsp)
+	vmovdqu	48-32(%r9),%xmm5
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	48-128(%rcx),%xmm15
+	vpxor	%xmm1,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm5,%xmm0,%xmm1
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm5,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm3,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm5,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpclmulqdq	$0x11,%xmm5,%xmm0,%xmm5
+	vmovdqu	80+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	%xmm1,%xmm4,%xmm4
+	vmovdqu	64-32(%r9),%xmm1
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	64-128(%rcx),%xmm15
+	vpxor	%xmm2,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm3,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	72(%r14),%r13
+	vpxor	%xmm5,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm5
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	64(%r14),%r12
+	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm1
+	vmovdqu	96+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,48+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,56+8(%rsp)
+	vpxor	%xmm2,%xmm4,%xmm4
+	vmovdqu	96-32(%r9),%xmm2
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	80-128(%rcx),%xmm15
+	vpxor	%xmm3,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm5
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	56(%r14),%r13
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm1
+	vpxor	112+8(%rsp),%xmm8,%xmm8
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	48(%r14),%r12
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,64+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,72+8(%rsp)
+	vpxor	%xmm3,%xmm4,%xmm4
+	vmovdqu	112-32(%r9),%xmm3
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	96-128(%rcx),%xmm15
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm5
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm6,%xmm6
+	vpclmulqdq	$0x01,%xmm3,%xmm8,%xmm1
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	40(%r14),%r13
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpclmulqdq	$0x00,%xmm3,%xmm8,%xmm2
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	32(%r14),%r12
+	vpclmulqdq	$0x11,%xmm3,%xmm8,%xmm8
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,80+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,88+8(%rsp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm6,%xmm6
+
+	vmovups	112-128(%rcx),%xmm15
+	vpslldq	$8,%xmm6,%xmm5
+	vpxor	%xmm2,%xmm4,%xmm4
+	vmovdqu	16(%r11),%xmm3
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm8,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm5,%xmm4,%xmm4
+	movbeq	24(%r14),%r13
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	16(%r14),%r12
+	vpalignr	$8,%xmm4,%xmm4,%xmm0
+	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
+	movq	%r13,96+8(%rsp)
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r12,104+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vmovups	128-128(%rcx),%xmm1
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vmovups	144-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vpsrldq	$8,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vpxor	%xmm6,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vpxor	%xmm0,%xmm4,%xmm4
+	movbeq	8(%r14),%r13
+	vaesenc	%xmm1,%xmm13,%xmm13
+	movbeq	0(%r14),%r12
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	160-128(%rcx),%xmm1
+	cmpl	$11,%ebp
+	jb	.Lenc_tail
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+	vmovups	176-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	192-128(%rcx),%xmm1
+	je	.Lenc_tail
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+	vmovups	208-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	224-128(%rcx),%xmm1
+	jmp	.Lenc_tail
+
+.align	32
+.Lhandle_ctr32:
+	vmovdqu	(%r11),%xmm0
+	vpshufb	%xmm0,%xmm1,%xmm6
+	vmovdqu	48(%r11),%xmm5
+	vpaddd	64(%r11),%xmm6,%xmm10
+	vpaddd	%xmm5,%xmm6,%xmm11
+	vmovdqu	0-32(%r9),%xmm3
+	vpaddd	%xmm5,%xmm10,%xmm12
+	vpshufb	%xmm0,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm11,%xmm13
+	vpshufb	%xmm0,%xmm11,%xmm11
+	vpxor	%xmm15,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm12,%xmm14
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vpxor	%xmm15,%xmm11,%xmm11
+	vpaddd	%xmm5,%xmm13,%xmm1
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vpshufb	%xmm0,%xmm14,%xmm14
+	vpshufb	%xmm0,%xmm1,%xmm1
+	jmp	.Lresume_ctr32
+
+.align	32
+.Lenc_tail:
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vmovdqu	%xmm7,16+8(%rsp)
+	vpalignr	$8,%xmm4,%xmm4,%xmm8
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
+	vpxor	0(%rdi),%xmm1,%xmm2
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpxor	16(%rdi),%xmm1,%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vpxor	32(%rdi),%xmm1,%xmm5
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	48(%rdi),%xmm1,%xmm6
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	64(%rdi),%xmm1,%xmm7
+	vpxor	80(%rdi),%xmm1,%xmm3
+	vmovdqu	(%r8),%xmm1
+
+	vaesenclast	%xmm2,%xmm9,%xmm9
+	vmovdqu	32(%r11),%xmm2
+	vaesenclast	%xmm0,%xmm10,%xmm10
+	vpaddb	%xmm2,%xmm1,%xmm0
+	movq	%r13,112+8(%rsp)
+	leaq	96(%rdi),%rdi
+	vaesenclast	%xmm5,%xmm11,%xmm11
+	vpaddb	%xmm2,%xmm0,%xmm5
+	movq	%r12,120+8(%rsp)
+	leaq	96(%rsi),%rsi
+	vmovdqu	0-128(%rcx),%xmm15
+	vaesenclast	%xmm6,%xmm12,%xmm12
+	vpaddb	%xmm2,%xmm5,%xmm6
+	vaesenclast	%xmm7,%xmm13,%xmm13
+	vpaddb	%xmm2,%xmm6,%xmm7
+	vaesenclast	%xmm3,%xmm14,%xmm14
+	vpaddb	%xmm2,%xmm7,%xmm3
+
+	addq	$0x60,%r10
+	subq	$0x6,%rdx
+	jc	.L6x_done
+
+	vmovups	%xmm9,-96(%rsi)
+	vpxor	%xmm15,%xmm1,%xmm9
+	vmovups	%xmm10,-80(%rsi)
+	vmovdqa	%xmm0,%xmm10
+	vmovups	%xmm11,-64(%rsi)
+	vmovdqa	%xmm5,%xmm11
+	vmovups	%xmm12,-48(%rsi)
+	vmovdqa	%xmm6,%xmm12
+	vmovups	%xmm13,-32(%rsi)
+	vmovdqa	%xmm7,%xmm13
+	vmovups	%xmm14,-16(%rsi)
+	vmovdqa	%xmm3,%xmm14
+	vmovdqu	32+8(%rsp),%xmm7
+	jmp	.Loop6x
+
+.L6x_done:
+	vpxor	16+8(%rsp),%xmm8,%xmm8
+	vpxor	%xmm4,%xmm8,%xmm8
+
+	.byte	0xf3,0xc3
+.size	_aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
+.globl	aesni_gcm_decrypt
+.type	aesni_gcm_decrypt,@function
+.align	32
+aesni_gcm_decrypt:
+	xorq	%r10,%r10
+	cmpq	$0x60,%rdx
+	jb	.Lgcm_dec_abort
+
+	leaq	(%rsp),%rax
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	vzeroupper
+
+	vmovdqu	(%r8),%xmm1
+	addq	$-128,%rsp
+	movl	12(%r8),%ebx
+	leaq	.Lbswap_mask(%rip),%r11
+	leaq	-128(%rcx),%r14
+	movq	$0xf80,%r15
+	vmovdqu	(%r9),%xmm8
+	andq	$-128,%rsp
+	vmovdqu	(%r11),%xmm0
+	leaq	128(%rcx),%rcx
+	leaq	32+32(%r9),%r9
+	movl	240-128(%rcx),%ebp
+	vpshufb	%xmm0,%xmm8,%xmm8
+
+	andq	%r15,%r14
+	andq	%rsp,%r15
+	subq	%r14,%r15
+	jc	.Ldec_no_key_aliasing
+	cmpq	$768,%r15
+	jnc	.Ldec_no_key_aliasing
+	subq	%r15,%rsp
+.Ldec_no_key_aliasing:
+
+	vmovdqu	80(%rdi),%xmm7
+	leaq	(%rdi),%r14
+	vmovdqu	64(%rdi),%xmm4
+	leaq	-192(%rdi,%rdx,1),%r15
+	vmovdqu	48(%rdi),%xmm5
+	shrq	$4,%rdx
+	xorq	%r10,%r10
+	vmovdqu	32(%rdi),%xmm6
+	vpshufb	%xmm0,%xmm7,%xmm7
+	vmovdqu	16(%rdi),%xmm2
+	vpshufb	%xmm0,%xmm4,%xmm4
+	vmovdqu	(%rdi),%xmm3
+	vpshufb	%xmm0,%xmm5,%xmm5
+	vmovdqu	%xmm4,48(%rsp)
+	vpshufb	%xmm0,%xmm6,%xmm6
+	vmovdqu	%xmm5,64(%rsp)
+	vpshufb	%xmm0,%xmm2,%xmm2
+	vmovdqu	%xmm6,80(%rsp)
+	vpshufb	%xmm0,%xmm3,%xmm3
+	vmovdqu	%xmm2,96(%rsp)
+	vmovdqu	%xmm3,112(%rsp)
+
+	call	_aesni_ctr32_ghash_6x
+
+	vmovups	%xmm9,-96(%rsi)
+	vmovups	%xmm10,-80(%rsi)
+	vmovups	%xmm11,-64(%rsi)
+	vmovups	%xmm12,-48(%rsi)
+	vmovups	%xmm13,-32(%rsi)
+	vmovups	%xmm14,-16(%rsi)
+
+	vpshufb	(%r11),%xmm8,%xmm8
+	vmovdqu	%xmm8,-64(%r9)
+
+	vzeroupper
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+.Lgcm_dec_abort:
+	movq	%r10,%rax
+	.byte	0xf3,0xc3
+.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
+.type	_aesni_ctr32_6x,@function
+.align	32
+_aesni_ctr32_6x:
+	vmovdqu	0-128(%rcx),%xmm4
+	vmovdqu	32(%r11),%xmm2
+	leaq	-1(%rbp),%r13
+	vmovups	16-128(%rcx),%xmm15
+	leaq	32-128(%rcx),%r12
+	vpxor	%xmm4,%xmm1,%xmm9
+	addl	$100663296,%ebx
+	jc	.Lhandle_ctr32_2
+	vpaddb	%xmm2,%xmm1,%xmm10
+	vpaddb	%xmm2,%xmm10,%xmm11
+	vpxor	%xmm4,%xmm10,%xmm10
+	vpaddb	%xmm2,%xmm11,%xmm12
+	vpxor	%xmm4,%xmm11,%xmm11
+	vpaddb	%xmm2,%xmm12,%xmm13
+	vpxor	%xmm4,%xmm12,%xmm12
+	vpaddb	%xmm2,%xmm13,%xmm14
+	vpxor	%xmm4,%xmm13,%xmm13
+	vpaddb	%xmm2,%xmm14,%xmm1
+	vpxor	%xmm4,%xmm14,%xmm14
+	jmp	.Loop_ctr32
+
+.align	16
+.Loop_ctr32:
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vmovups	(%r12),%xmm15
+	leaq	16(%r12),%r12
+	decl	%r13d
+	jnz	.Loop_ctr32
+
+	vmovdqu	(%r12),%xmm3
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	0(%rdi),%xmm3,%xmm4
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	16(%rdi),%xmm3,%xmm5
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpxor	32(%rdi),%xmm3,%xmm6
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vpxor	48(%rdi),%xmm3,%xmm8
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	64(%rdi),%xmm3,%xmm2
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	80(%rdi),%xmm3,%xmm3
+	leaq	96(%rdi),%rdi
+
+	vaesenclast	%xmm4,%xmm9,%xmm9
+	vaesenclast	%xmm5,%xmm10,%xmm10
+	vaesenclast	%xmm6,%xmm11,%xmm11
+	vaesenclast	%xmm8,%xmm12,%xmm12
+	vaesenclast	%xmm2,%xmm13,%xmm13
+	vaesenclast	%xmm3,%xmm14,%xmm14
+	vmovups	%xmm9,0(%rsi)
+	vmovups	%xmm10,16(%rsi)
+	vmovups	%xmm11,32(%rsi)
+	vmovups	%xmm12,48(%rsi)
+	vmovups	%xmm13,64(%rsi)
+	vmovups	%xmm14,80(%rsi)
+	leaq	96(%rsi),%rsi
+
+	.byte	0xf3,0xc3
+.align	32
+.Lhandle_ctr32_2:
+	vpshufb	%xmm0,%xmm1,%xmm6
+	vmovdqu	48(%r11),%xmm5
+	vpaddd	64(%r11),%xmm6,%xmm10
+	vpaddd	%xmm5,%xmm6,%xmm11
+	vpaddd	%xmm5,%xmm10,%xmm12
+	vpshufb	%xmm0,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm11,%xmm13
+	vpshufb	%xmm0,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm12,%xmm14
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vpxor	%xmm4,%xmm11,%xmm11
+	vpaddd	%xmm5,%xmm13,%xmm1
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vpxor	%xmm4,%xmm12,%xmm12
+	vpshufb	%xmm0,%xmm14,%xmm14
+	vpxor	%xmm4,%xmm13,%xmm13
+	vpshufb	%xmm0,%xmm1,%xmm1
+	vpxor	%xmm4,%xmm14,%xmm14
+	jmp	.Loop_ctr32
+.size	_aesni_ctr32_6x,.-_aesni_ctr32_6x
+
+.globl	aesni_gcm_encrypt
+.type	aesni_gcm_encrypt,@function
+.align	32
+aesni_gcm_encrypt:
+	xorq	%r10,%r10
+	cmpq	$288,%rdx
+	jb	.Lgcm_enc_abort
+
+	leaq	(%rsp),%rax
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	vzeroupper
+
+	vmovdqu	(%r8),%xmm1
+	addq	$-128,%rsp
+	movl	12(%r8),%ebx
+	leaq	.Lbswap_mask(%rip),%r11
+	leaq	-128(%rcx),%r14
+	movq	$0xf80,%r15
+	leaq	128(%rcx),%rcx
+	vmovdqu	(%r11),%xmm0
+	andq	$-128,%rsp
+	movl	240-128(%rcx),%ebp
+
+	andq	%r15,%r14
+	andq	%rsp,%r15
+	subq	%r14,%r15
+	jc	.Lenc_no_key_aliasing
+	cmpq	$768,%r15
+	jnc	.Lenc_no_key_aliasing
+	subq	%r15,%rsp
+.Lenc_no_key_aliasing:
+
+	leaq	(%rsi),%r14
+	leaq	-192(%rsi,%rdx,1),%r15
+	shrq	$4,%rdx
+
+	call	_aesni_ctr32_6x
+	vpshufb	%xmm0,%xmm9,%xmm8
+	vpshufb	%xmm0,%xmm10,%xmm2
+	vmovdqu	%xmm8,112(%rsp)
+	vpshufb	%xmm0,%xmm11,%xmm4
+	vmovdqu	%xmm2,96(%rsp)
+	vpshufb	%xmm0,%xmm12,%xmm5
+	vmovdqu	%xmm4,80(%rsp)
+	vpshufb	%xmm0,%xmm13,%xmm6
+	vmovdqu	%xmm5,64(%rsp)
+	vpshufb	%xmm0,%xmm14,%xmm7
+	vmovdqu	%xmm6,48(%rsp)
+
+	call	_aesni_ctr32_6x
+
+	vmovdqu	(%r9),%xmm8
+	leaq	32+32(%r9),%r9
+	subq	$12,%rdx
+	movq	$192,%r10
+	vpshufb	%xmm0,%xmm8,%xmm8
+
+	call	_aesni_ctr32_ghash_6x
+	vmovdqu	32(%rsp),%xmm7
+	vmovdqu	(%r11),%xmm0
+	vmovdqu	0-32(%r9),%xmm3
+	vpunpckhqdq	%xmm7,%xmm7,%xmm1
+	vmovdqu	32-32(%r9),%xmm15
+	vmovups	%xmm9,-96(%rsi)
+	vpshufb	%xmm0,%xmm9,%xmm9
+	vpxor	%xmm7,%xmm1,%xmm1
+	vmovups	%xmm10,-80(%rsi)
+	vpshufb	%xmm0,%xmm10,%xmm10
+	vmovups	%xmm11,-64(%rsi)
+	vpshufb	%xmm0,%xmm11,%xmm11
+	vmovups	%xmm12,-48(%rsi)
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vmovups	%xmm13,-32(%rsi)
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vmovups	%xmm14,-16(%rsi)
+	vpshufb	%xmm0,%xmm14,%xmm14
+	vmovdqu	%xmm9,16(%rsp)
+	vmovdqu	48(%rsp),%xmm6
+	vmovdqu	16-32(%r9),%xmm0
+	vpunpckhqdq	%xmm6,%xmm6,%xmm2
+	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm5
+	vpxor	%xmm6,%xmm2,%xmm2
+	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
+	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1
+
+	vmovdqu	64(%rsp),%xmm9
+	vpclmulqdq	$0x00,%xmm0,%xmm6,%xmm4
+	vmovdqu	48-32(%r9),%xmm3
+	vpxor	%xmm5,%xmm4,%xmm4
+	vpunpckhqdq	%xmm9,%xmm9,%xmm5
+	vpclmulqdq	$0x11,%xmm0,%xmm6,%xmm6
+	vpxor	%xmm9,%xmm5,%xmm5
+	vpxor	%xmm7,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
+	vmovdqu	80-32(%r9),%xmm15
+	vpxor	%xmm1,%xmm2,%xmm2
+
+	vmovdqu	80(%rsp),%xmm1
+	vpclmulqdq	$0x00,%xmm3,%xmm9,%xmm7
+	vmovdqu	64-32(%r9),%xmm0
+	vpxor	%xmm4,%xmm7,%xmm7
+	vpunpckhqdq	%xmm1,%xmm1,%xmm4
+	vpclmulqdq	$0x11,%xmm3,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpxor	%xmm6,%xmm9,%xmm9
+	vpclmulqdq	$0x00,%xmm15,%xmm5,%xmm5
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	96(%rsp),%xmm2
+	vpclmulqdq	$0x00,%xmm0,%xmm1,%xmm6
+	vmovdqu	96-32(%r9),%xmm3
+	vpxor	%xmm7,%xmm6,%xmm6
+	vpunpckhqdq	%xmm2,%xmm2,%xmm7
+	vpclmulqdq	$0x11,%xmm0,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpxor	%xmm9,%xmm1,%xmm1
+	vpclmulqdq	$0x10,%xmm15,%xmm4,%xmm4
+	vmovdqu	128-32(%r9),%xmm15
+	vpxor	%xmm5,%xmm4,%xmm4
+
+	vpxor	112(%rsp),%xmm8,%xmm8
+	vpclmulqdq	$0x00,%xmm3,%xmm2,%xmm5
+	vmovdqu	112-32(%r9),%xmm0
+	vpunpckhqdq	%xmm8,%xmm8,%xmm9
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x11,%xmm3,%xmm2,%xmm2
+	vpxor	%xmm8,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm15,%xmm7,%xmm7
+	vpxor	%xmm4,%xmm7,%xmm4
+
+	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm6
+	vmovdqu	0-32(%r9),%xmm3
+	vpunpckhqdq	%xmm14,%xmm14,%xmm1
+	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm8
+	vpxor	%xmm14,%xmm1,%xmm1
+	vpxor	%xmm5,%xmm6,%xmm5
+	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm9
+	vmovdqu	32-32(%r9),%xmm15
+	vpxor	%xmm2,%xmm8,%xmm7
+	vpxor	%xmm4,%xmm9,%xmm6
+
+	vmovdqu	16-32(%r9),%xmm0
+	vpxor	%xmm5,%xmm7,%xmm9
+	vpclmulqdq	$0x00,%xmm3,%xmm14,%xmm4
+	vpxor	%xmm9,%xmm6,%xmm6
+	vpunpckhqdq	%xmm13,%xmm13,%xmm2
+	vpclmulqdq	$0x11,%xmm3,%xmm14,%xmm14
+	vpxor	%xmm13,%xmm2,%xmm2
+	vpslldq	$8,%xmm6,%xmm9
+	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1
+	vpxor	%xmm9,%xmm5,%xmm8
+	vpsrldq	$8,%xmm6,%xmm6
+	vpxor	%xmm6,%xmm7,%xmm7
+
+	vpclmulqdq	$0x00,%xmm0,%xmm13,%xmm5
+	vmovdqu	48-32(%r9),%xmm3
+	vpxor	%xmm4,%xmm5,%xmm5
+	vpunpckhqdq	%xmm12,%xmm12,%xmm9
+	vpclmulqdq	$0x11,%xmm0,%xmm13,%xmm13
+	vpxor	%xmm12,%xmm9,%xmm9
+	vpxor	%xmm14,%xmm13,%xmm13
+	vpalignr	$8,%xmm8,%xmm8,%xmm14
+	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
+	vmovdqu	80-32(%r9),%xmm15
+	vpxor	%xmm1,%xmm2,%xmm2
+
+	vpclmulqdq	$0x00,%xmm3,%xmm12,%xmm4
+	vmovdqu	64-32(%r9),%xmm0
+	vpxor	%xmm5,%xmm4,%xmm4
+	vpunpckhqdq	%xmm11,%xmm11,%xmm1
+	vpclmulqdq	$0x11,%xmm3,%xmm12,%xmm12
+	vpxor	%xmm11,%xmm1,%xmm1
+	vpxor	%xmm13,%xmm12,%xmm12
+	vxorps	16(%rsp),%xmm7,%xmm7
+	vpclmulqdq	$0x00,%xmm15,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm9,%xmm9
+
+	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
+	vxorps	%xmm14,%xmm8,%xmm8
+
+	vpclmulqdq	$0x00,%xmm0,%xmm11,%xmm5
+	vmovdqu	96-32(%r9),%xmm3
+	vpxor	%xmm4,%xmm5,%xmm5
+	vpunpckhqdq	%xmm10,%xmm10,%xmm2
+	vpclmulqdq	$0x11,%xmm0,%xmm11,%xmm11
+	vpxor	%xmm10,%xmm2,%xmm2
+	vpalignr	$8,%xmm8,%xmm8,%xmm14
+	vpxor	%xmm12,%xmm11,%xmm11
+	vpclmulqdq	$0x10,%xmm15,%xmm1,%xmm1
+	vmovdqu	128-32(%r9),%xmm15
+	vpxor	%xmm9,%xmm1,%xmm1
+
+	vxorps	%xmm7,%xmm14,%xmm14
+	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
+	vxorps	%xmm14,%xmm8,%xmm8
+
+	vpclmulqdq	$0x00,%xmm3,%xmm10,%xmm4
+	vmovdqu	112-32(%r9),%xmm0
+	vpxor	%xmm5,%xmm4,%xmm4
+	vpunpckhqdq	%xmm8,%xmm8,%xmm9
+	vpclmulqdq	$0x11,%xmm3,%xmm10,%xmm10
+	vpxor	%xmm8,%xmm9,%xmm9
+	vpxor	%xmm11,%xmm10,%xmm10
+	vpclmulqdq	$0x00,%xmm15,%xmm2,%xmm2
+	vpxor	%xmm1,%xmm2,%xmm2
+
+	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm5
+	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm7
+	vpxor	%xmm4,%xmm5,%xmm5
+	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm6
+	vpxor	%xmm10,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm6,%xmm6
+
+	vpxor	%xmm5,%xmm7,%xmm4
+	vpxor	%xmm4,%xmm6,%xmm6
+	vpslldq	$8,%xmm6,%xmm1
+	vmovdqu	16(%r11),%xmm3
+	vpsrldq	$8,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm5,%xmm8
+	vpxor	%xmm6,%xmm7,%xmm7
+
+	vpalignr	$8,%xmm8,%xmm8,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
+	vpxor	%xmm2,%xmm8,%xmm8
+
+	vpalignr	$8,%xmm8,%xmm8,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
+	vpxor	%xmm7,%xmm2,%xmm2
+	vpxor	%xmm2,%xmm8,%xmm8
+	vpshufb	(%r11),%xmm8,%xmm8
+	vmovdqu	%xmm8,-64(%r9)
+
+	vzeroupper
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+.Lgcm_enc_abort:
+	movq	%r10,%rax
+	.byte	0xf3,0xc3
+.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
+.align	64
+.Lbswap_mask:
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lpoly:
+.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lone_msb:
+.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Ltwo_lsb:
+.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.Lone_lsb:
+.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.byte	65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	64


Property changes on: trunk/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S	                        (rev 0)
+++ trunk/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,1438 @@
+/* $MidnightBSD$ */
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from aesni-mb-x86_64.pl. */
+.text	
+
+
+
+.globl	aesni_multi_cbc_encrypt
+.type	aesni_multi_cbc_encrypt,@function
+.align	32
+aesni_multi_cbc_encrypt:
+	cmpl	$2,%edx
+	jb	.Lenc_non_avx
+	movl	OPENSSL_ia32cap_P+4(%rip),%ecx
+	testl	$268435456,%ecx
+	jnz	_avx_cbc_enc_shortcut
+	jmp	.Lenc_non_avx
+.align	16
+.Lenc_non_avx:
+	movq	%rsp,%rax
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+
+
+
+
+
+
+	subq	$48,%rsp
+	andq	$-64,%rsp
+	movq	%rax,16(%rsp)
+
+.Lenc4x_body:
+	movdqu	(%rsi),%xmm12
+	leaq	120(%rsi),%rsi
+	leaq	80(%rdi),%rdi
+
+.Lenc4x_loop_grande:
+	movl	%edx,24(%rsp)
+	xorl	%edx,%edx
+	movl	-64(%rdi),%ecx
+	movq	-80(%rdi),%r8
+	cmpl	%edx,%ecx
+	movq	-72(%rdi),%r12
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movdqu	-56(%rdi),%xmm2
+	movl	%ecx,32(%rsp)
+	cmovleq	%rsp,%r8
+	movl	-24(%rdi),%ecx
+	movq	-40(%rdi),%r9
+	cmpl	%edx,%ecx
+	movq	-32(%rdi),%r13
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movdqu	-16(%rdi),%xmm3
+	movl	%ecx,36(%rsp)
+	cmovleq	%rsp,%r9
+	movl	16(%rdi),%ecx
+	movq	0(%rdi),%r10
+	cmpl	%edx,%ecx
+	movq	8(%rdi),%r14
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movdqu	24(%rdi),%xmm4
+	movl	%ecx,40(%rsp)
+	cmovleq	%rsp,%r10
+	movl	56(%rdi),%ecx
+	movq	40(%rdi),%r11
+	cmpl	%edx,%ecx
+	movq	48(%rdi),%r15
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movdqu	64(%rdi),%xmm5
+	movl	%ecx,44(%rsp)
+	cmovleq	%rsp,%r11
+	testl	%edx,%edx
+	jz	.Lenc4x_done
+
+	movups	16-120(%rsi),%xmm1
+	pxor	%xmm12,%xmm2
+	movups	32-120(%rsi),%xmm0
+	pxor	%xmm12,%xmm3
+	movl	240-120(%rsi),%eax
+	pxor	%xmm12,%xmm4
+	movdqu	(%r8),%xmm6
+	pxor	%xmm12,%xmm5
+	movdqu	(%r9),%xmm7
+	pxor	%xmm6,%xmm2
+	movdqu	(%r10),%xmm8
+	pxor	%xmm7,%xmm3
+	movdqu	(%r11),%xmm9
+	pxor	%xmm8,%xmm4
+	pxor	%xmm9,%xmm5
+	movdqa	32(%rsp),%xmm10
+	xorq	%rbx,%rbx
+	jmp	.Loop_enc4x
+
+.align	32
+.Loop_enc4x:
+	addq	$16,%rbx
+	leaq	16(%rsp),%rbp
+	movl	$1,%ecx
+	subq	%rbx,%rbp
+
+.byte	102,15,56,220,209
+	prefetcht0	31(%r8,%rbx,1)
+	prefetcht0	31(%r9,%rbx,1)
+.byte	102,15,56,220,217
+	prefetcht0	31(%r10,%rbx,1)
+	prefetcht0	31(%r10,%rbx,1)
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movups	48-120(%rsi),%xmm1
+	cmpl	32(%rsp),%ecx
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+	cmovgeq	%rbp,%r8
+	cmovgq	%rbp,%r12
+.byte	102,15,56,220,232
+	movups	-56(%rsi),%xmm0
+	cmpl	36(%rsp),%ecx
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+	cmovgeq	%rbp,%r9
+	cmovgq	%rbp,%r13
+.byte	102,15,56,220,233
+	movups	-40(%rsi),%xmm1
+	cmpl	40(%rsp),%ecx
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+	cmovgeq	%rbp,%r10
+	cmovgq	%rbp,%r14
+.byte	102,15,56,220,232
+	movups	-24(%rsi),%xmm0
+	cmpl	44(%rsp),%ecx
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+	cmovgeq	%rbp,%r11
+	cmovgq	%rbp,%r15
+.byte	102,15,56,220,233
+	movups	-8(%rsi),%xmm1
+	movdqa	%xmm10,%xmm11
+.byte	102,15,56,220,208
+	prefetcht0	15(%r12,%rbx,1)
+	prefetcht0	15(%r13,%rbx,1)
+.byte	102,15,56,220,216
+	prefetcht0	15(%r14,%rbx,1)
+	prefetcht0	15(%r15,%rbx,1)
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movups	128-120(%rsi),%xmm0
+	pxor	%xmm12,%xmm12
+
+.byte	102,15,56,220,209
+	pcmpgtd	%xmm12,%xmm11
+	movdqu	-120(%rsi),%xmm12
+.byte	102,15,56,220,217
+	paddd	%xmm11,%xmm10
+	movdqa	%xmm10,32(%rsp)
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movups	144-120(%rsi),%xmm1
+
+	cmpl	$11,%eax
+
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movups	160-120(%rsi),%xmm0
+
+	jb	.Lenc4x_tail
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movups	176-120(%rsi),%xmm1
+
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movups	192-120(%rsi),%xmm0
+
+	je	.Lenc4x_tail
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movups	208-120(%rsi),%xmm1
+
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movups	224-120(%rsi),%xmm0
+	jmp	.Lenc4x_tail
+
+.align	32
+.Lenc4x_tail:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movdqu	(%r8,%rbx,1),%xmm6
+	movdqu	16-120(%rsi),%xmm1
+
+.byte	102,15,56,221,208
+	movdqu	(%r9,%rbx,1),%xmm7
+	pxor	%xmm12,%xmm6
+.byte	102,15,56,221,216
+	movdqu	(%r10,%rbx,1),%xmm8
+	pxor	%xmm12,%xmm7
+.byte	102,15,56,221,224
+	movdqu	(%r11,%rbx,1),%xmm9
+	pxor	%xmm12,%xmm8
+.byte	102,15,56,221,232
+	movdqu	32-120(%rsi),%xmm0
+	pxor	%xmm12,%xmm9
+
+	movups	%xmm2,-16(%r12,%rbx,1)
+	pxor	%xmm6,%xmm2
+	movups	%xmm3,-16(%r13,%rbx,1)
+	pxor	%xmm7,%xmm3
+	movups	%xmm4,-16(%r14,%rbx,1)
+	pxor	%xmm8,%xmm4
+	movups	%xmm5,-16(%r15,%rbx,1)
+	pxor	%xmm9,%xmm5
+
+	decl	%edx
+	jnz	.Loop_enc4x
+
+	movq	16(%rsp),%rax
+	movl	24(%rsp),%edx
+
+
+
+
+
+
+
+
+
+
+	leaq	160(%rdi),%rdi
+	decl	%edx
+	jnz	.Lenc4x_loop_grande
+
+.Lenc4x_done:
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+.Lenc4x_epilogue:
+	.byte	0xf3,0xc3
+.size	aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
+
+.globl	aesni_multi_cbc_decrypt
+.type	aesni_multi_cbc_decrypt,@function
+.align	32
+aesni_multi_cbc_decrypt:
+	cmpl	$2,%edx
+	jb	.Ldec_non_avx
+	movl	OPENSSL_ia32cap_P+4(%rip),%ecx
+	testl	$268435456,%ecx
+	jnz	_avx_cbc_dec_shortcut
+	jmp	.Ldec_non_avx
+.align	16
+.Ldec_non_avx:
+	movq	%rsp,%rax
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+
+
+
+
+
+
+	subq	$48,%rsp
+	andq	$-64,%rsp
+	movq	%rax,16(%rsp)
+
+.Ldec4x_body:
+	movdqu	(%rsi),%xmm12
+	leaq	120(%rsi),%rsi
+	leaq	80(%rdi),%rdi
+
+.Ldec4x_loop_grande:
+	movl	%edx,24(%rsp)
+	xorl	%edx,%edx
+	movl	-64(%rdi),%ecx
+	movq	-80(%rdi),%r8
+	cmpl	%edx,%ecx
+	movq	-72(%rdi),%r12
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movdqu	-56(%rdi),%xmm6
+	movl	%ecx,32(%rsp)
+	cmovleq	%rsp,%r8
+	movl	-24(%rdi),%ecx
+	movq	-40(%rdi),%r9
+	cmpl	%edx,%ecx
+	movq	-32(%rdi),%r13
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movdqu	-16(%rdi),%xmm7
+	movl	%ecx,36(%rsp)
+	cmovleq	%rsp,%r9
+	movl	16(%rdi),%ecx
+	movq	0(%rdi),%r10
+	cmpl	%edx,%ecx
+	movq	8(%rdi),%r14
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movdqu	24(%rdi),%xmm8
+	movl	%ecx,40(%rsp)
+	cmovleq	%rsp,%r10
+	movl	56(%rdi),%ecx
+	movq	40(%rdi),%r11
+	cmpl	%edx,%ecx
+	movq	48(%rdi),%r15
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movdqu	64(%rdi),%xmm9
+	movl	%ecx,44(%rsp)
+	cmovleq	%rsp,%r11
+	testl	%edx,%edx
+	jz	.Ldec4x_done
+
+	movups	16-120(%rsi),%xmm1
+	movups	32-120(%rsi),%xmm0
+	movl	240-120(%rsi),%eax
+	movdqu	(%r8),%xmm2
+	movdqu	(%r9),%xmm3
+	pxor	%xmm12,%xmm2
+	movdqu	(%r10),%xmm4
+	pxor	%xmm12,%xmm3
+	movdqu	(%r11),%xmm5
+	pxor	%xmm12,%xmm4
+	pxor	%xmm12,%xmm5
+	movdqa	32(%rsp),%xmm10
+	xorq	%rbx,%rbx
+	jmp	.Loop_dec4x
+
+.align	32
+.Loop_dec4x:
+	addq	$16,%rbx
+	leaq	16(%rsp),%rbp
+	movl	$1,%ecx
+	subq	%rbx,%rbp
+
+.byte	102,15,56,222,209
+	prefetcht0	31(%r8,%rbx,1)
+	prefetcht0	31(%r9,%rbx,1)
+.byte	102,15,56,222,217
+	prefetcht0	31(%r10,%rbx,1)
+	prefetcht0	31(%r11,%rbx,1)
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+	movups	48-120(%rsi),%xmm1
+	cmpl	32(%rsp),%ecx
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+	cmovgeq	%rbp,%r8
+	cmovgq	%rbp,%r12
+.byte	102,15,56,222,232
+	movups	-56(%rsi),%xmm0
+	cmpl	36(%rsp),%ecx
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+	cmovgeq	%rbp,%r9
+	cmovgq	%rbp,%r13
+.byte	102,15,56,222,233
+	movups	-40(%rsi),%xmm1
+	cmpl	40(%rsp),%ecx
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+	cmovgeq	%rbp,%r10
+	cmovgq	%rbp,%r14
+.byte	102,15,56,222,232
+	movups	-24(%rsi),%xmm0
+	cmpl	44(%rsp),%ecx
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+	cmovgeq	%rbp,%r11
+	cmovgq	%rbp,%r15
+.byte	102,15,56,222,233
+	movups	-8(%rsi),%xmm1
+	movdqa	%xmm10,%xmm11
+.byte	102,15,56,222,208
+	prefetcht0	15(%r12,%rbx,1)
+	prefetcht0	15(%r13,%rbx,1)
+.byte	102,15,56,222,216
+	prefetcht0	15(%r14,%rbx,1)
+	prefetcht0	15(%r15,%rbx,1)
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+	movups	128-120(%rsi),%xmm0
+	pxor	%xmm12,%xmm12
+
+.byte	102,15,56,222,209
+	pcmpgtd	%xmm12,%xmm11
+	movdqu	-120(%rsi),%xmm12
+.byte	102,15,56,222,217
+	paddd	%xmm11,%xmm10
+	movdqa	%xmm10,32(%rsp)
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+	movups	144-120(%rsi),%xmm1
+
+	cmpl	$11,%eax
+
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+	movups	160-120(%rsi),%xmm0
+
+	jb	.Ldec4x_tail
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+	movups	176-120(%rsi),%xmm1
+
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+	movups	192-120(%rsi),%xmm0
+
+	je	.Ldec4x_tail
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+	movups	208-120(%rsi),%xmm1
+
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+	movups	224-120(%rsi),%xmm0
+	jmp	.Ldec4x_tail
+
+.align	32
+.Ldec4x_tail:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+	pxor	%xmm0,%xmm6
+	pxor	%xmm0,%xmm7
+.byte	102,15,56,222,233
+	movdqu	16-120(%rsi),%xmm1
+	pxor	%xmm0,%xmm8
+	pxor	%xmm0,%xmm9
+	movdqu	32-120(%rsi),%xmm0
+
+.byte	102,15,56,223,214
+.byte	102,15,56,223,223
+	movdqu	-16(%r8,%rbx,1),%xmm6
+	movdqu	-16(%r9,%rbx,1),%xmm7
+.byte	102,65,15,56,223,224
+.byte	102,65,15,56,223,233
+	movdqu	-16(%r10,%rbx,1),%xmm8
+	movdqu	-16(%r11,%rbx,1),%xmm9
+
+	movups	%xmm2,-16(%r12,%rbx,1)
+	movdqu	(%r8,%rbx,1),%xmm2
+	movups	%xmm3,-16(%r13,%rbx,1)
+	movdqu	(%r9,%rbx,1),%xmm3
+	pxor	%xmm12,%xmm2
+	movups	%xmm4,-16(%r14,%rbx,1)
+	movdqu	(%r10,%rbx,1),%xmm4
+	pxor	%xmm12,%xmm3
+	movups	%xmm5,-16(%r15,%rbx,1)
+	movdqu	(%r11,%rbx,1),%xmm5
+	pxor	%xmm12,%xmm4
+	pxor	%xmm12,%xmm5
+
+	decl	%edx
+	jnz	.Loop_dec4x
+
+	movq	16(%rsp),%rax
+	movl	24(%rsp),%edx
+
+	leaq	160(%rdi),%rdi
+	decl	%edx
+	jnz	.Ldec4x_loop_grande
+
+.Ldec4x_done:
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+.Ldec4x_epilogue:
+	.byte	0xf3,0xc3
+.size	aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
+.type	aesni_multi_cbc_encrypt_avx,@function
+.align	32
+aesni_multi_cbc_encrypt_avx:
+_avx_cbc_enc_shortcut:
+	movq	%rsp,%rax
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+
+
+
+
+
+
+
+
+	subq	$192,%rsp
+	andq	$-128,%rsp
+	movq	%rax,16(%rsp)
+
+.Lenc8x_body:
+	vzeroupper
+	vmovdqu	(%rsi),%xmm15
+	leaq	120(%rsi),%rsi
+	leaq	160(%rdi),%rdi
+	shrl	$1,%edx
+
+.Lenc8x_loop_grande:
+
+	xorl	%edx,%edx
+	movl	-144(%rdi),%ecx
+	movq	-160(%rdi),%r8
+	cmpl	%edx,%ecx
+	movq	-152(%rdi),%rbx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	vmovdqu	-136(%rdi),%xmm2
+	movl	%ecx,32(%rsp)
+	cmovleq	%rsp,%r8
+	subq	%r8,%rbx
+	movq	%rbx,64(%rsp)
+	movl	-104(%rdi),%ecx
+	movq	-120(%rdi),%r9
+	cmpl	%edx,%ecx
+	movq	-112(%rdi),%rbp
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	vmovdqu	-96(%rdi),%xmm3
+	movl	%ecx,36(%rsp)
+	cmovleq	%rsp,%r9
+	subq	%r9,%rbp
+	movq	%rbp,72(%rsp)
+	movl	-64(%rdi),%ecx
+	movq	-80(%rdi),%r10
+	cmpl	%edx,%ecx
+	movq	-72(%rdi),%rbp
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	vmovdqu	-56(%rdi),%xmm4
+	movl	%ecx,40(%rsp)
+	cmovleq	%rsp,%r10
+	subq	%r10,%rbp
+	movq	%rbp,80(%rsp)
+	movl	-24(%rdi),%ecx
+	movq	-40(%rdi),%r11
+	cmpl	%edx,%ecx
+	movq	-32(%rdi),%rbp
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	vmovdqu	-16(%rdi),%xmm5
+	movl	%ecx,44(%rsp)
+	cmovleq	%rsp,%r11
+	subq	%r11,%rbp
+	movq	%rbp,88(%rsp)
+	movl	16(%rdi),%ecx
+	movq	0(%rdi),%r12
+	cmpl	%edx,%ecx
+	movq	8(%rdi),%rbp
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	vmovdqu	24(%rdi),%xmm6
+	movl	%ecx,48(%rsp)
+	cmovleq	%rsp,%r12
+	subq	%r12,%rbp
+	movq	%rbp,96(%rsp)
+	movl	56(%rdi),%ecx
+	movq	40(%rdi),%r13
+	cmpl	%edx,%ecx
+	movq	48(%rdi),%rbp
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	vmovdqu	64(%rdi),%xmm7
+	movl	%ecx,52(%rsp)
+	cmovleq	%rsp,%r13
+	subq	%r13,%rbp
+	movq	%rbp,104(%rsp)
+	movl	96(%rdi),%ecx
+	movq	80(%rdi),%r14
+	cmpl	%edx,%ecx
+	movq	88(%rdi),%rbp
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	vmovdqu	104(%rdi),%xmm8
+	movl	%ecx,56(%rsp)
+	cmovleq	%rsp,%r14
+	subq	%r14,%rbp
+	movq	%rbp,112(%rsp)
+	movl	136(%rdi),%ecx
+	movq	120(%rdi),%r15
+	cmpl	%edx,%ecx
+	movq	128(%rdi),%rbp
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	vmovdqu	144(%rdi),%xmm9
+	movl	%ecx,60(%rsp)
+	cmovleq	%rsp,%r15
+	subq	%r15,%rbp
+	movq	%rbp,120(%rsp)
+	testl	%edx,%edx
+	jz	.Lenc8x_done
+
+	vmovups	16-120(%rsi),%xmm1
+	vmovups	32-120(%rsi),%xmm0
+	movl	240-120(%rsi),%eax
+
+	vpxor	(%r8),%xmm15,%xmm10
+	leaq	128(%rsp),%rbp
+	vpxor	(%r9),%xmm15,%xmm11
+	vpxor	(%r10),%xmm15,%xmm12
+	vpxor	(%r11),%xmm15,%xmm13
+	vpxor	%xmm10,%xmm2,%xmm2
+	vpxor	(%r12),%xmm15,%xmm10
+	vpxor	%xmm11,%xmm3,%xmm3
+	vpxor	(%r13),%xmm15,%xmm11
+	vpxor	%xmm12,%xmm4,%xmm4
+	vpxor	(%r14),%xmm15,%xmm12
+	vpxor	%xmm13,%xmm5,%xmm5
+	vpxor	(%r15),%xmm15,%xmm13
+	vpxor	%xmm10,%xmm6,%xmm6
+	movl	$1,%ecx
+	vpxor	%xmm11,%xmm7,%xmm7
+	vpxor	%xmm12,%xmm8,%xmm8
+	vpxor	%xmm13,%xmm9,%xmm9
+	jmp	.Loop_enc8x
+
+.align	32
+.Loop_enc8x:
+	vaesenc	%xmm1,%xmm2,%xmm2
+	cmpl	32+0(%rsp),%ecx
+	vaesenc	%xmm1,%xmm3,%xmm3
+	prefetcht0	31(%r8)
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vaesenc	%xmm1,%xmm5,%xmm5
+	leaq	(%r8,%rbx,1),%rbx
+	cmovgeq	%rsp,%r8
+	vaesenc	%xmm1,%xmm6,%xmm6
+	cmovgq	%rsp,%rbx
+	vaesenc	%xmm1,%xmm7,%xmm7
+	subq	%r8,%rbx
+	vaesenc	%xmm1,%xmm8,%xmm8
+	vpxor	16(%r8),%xmm15,%xmm10
+	movq	%rbx,64+0(%rsp)
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vmovups	-72(%rsi),%xmm1
+	leaq	16(%r8,%rbx,1),%r8
+	vmovdqu	%xmm10,0(%rbp)
+	vaesenc	%xmm0,%xmm2,%xmm2
+	cmpl	32+4(%rsp),%ecx
+	movq	64+8(%rsp),%rbx
+	vaesenc	%xmm0,%xmm3,%xmm3
+	prefetcht0	31(%r9)
+	vaesenc	%xmm0,%xmm4,%xmm4
+	vaesenc	%xmm0,%xmm5,%xmm5
+	leaq	(%r9,%rbx,1),%rbx
+	cmovgeq	%rsp,%r9
+	vaesenc	%xmm0,%xmm6,%xmm6
+	cmovgq	%rsp,%rbx
+	vaesenc	%xmm0,%xmm7,%xmm7
+	subq	%r9,%rbx
+	vaesenc	%xmm0,%xmm8,%xmm8
+	vpxor	16(%r9),%xmm15,%xmm11
+	movq	%rbx,64+8(%rsp)
+	vaesenc	%xmm0,%xmm9,%xmm9
+	vmovups	-56(%rsi),%xmm0
+	leaq	16(%r9,%rbx,1),%r9
+	vmovdqu	%xmm11,16(%rbp)
+	vaesenc	%xmm1,%xmm2,%xmm2
+	cmpl	32+8(%rsp),%ecx
+	movq	64+16(%rsp),%rbx
+	vaesenc	%xmm1,%xmm3,%xmm3
+	prefetcht0	31(%r10)
+	vaesenc	%xmm1,%xmm4,%xmm4
+	prefetcht0	15(%r8)
+	vaesenc	%xmm1,%xmm5,%xmm5
+	leaq	(%r10,%rbx,1),%rbx
+	cmovgeq	%rsp,%r10
+	vaesenc	%xmm1,%xmm6,%xmm6
+	cmovgq	%rsp,%rbx
+	vaesenc	%xmm1,%xmm7,%xmm7
+	subq	%r10,%rbx
+	vaesenc	%xmm1,%xmm8,%xmm8
+	vpxor	16(%r10),%xmm15,%xmm12
+	movq	%rbx,64+16(%rsp)
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vmovups	-40(%rsi),%xmm1
+	leaq	16(%r10,%rbx,1),%r10
+	vmovdqu	%xmm12,32(%rbp)
+	vaesenc	%xmm0,%xmm2,%xmm2
+	cmpl	32+12(%rsp),%ecx
+	movq	64+24(%rsp),%rbx
+	vaesenc	%xmm0,%xmm3,%xmm3
+	prefetcht0	31(%r11)
+	vaesenc	%xmm0,%xmm4,%xmm4
+	prefetcht0	15(%r9)
+	vaesenc	%xmm0,%xmm5,%xmm5
+	leaq	(%r11,%rbx,1),%rbx
+	cmovgeq	%rsp,%r11
+	vaesenc	%xmm0,%xmm6,%xmm6
+	cmovgq	%rsp,%rbx
+	vaesenc	%xmm0,%xmm7,%xmm7
+	subq	%r11,%rbx
+	vaesenc	%xmm0,%xmm8,%xmm8
+	vpxor	16(%r11),%xmm15,%xmm13
+	movq	%rbx,64+24(%rsp)
+	vaesenc	%xmm0,%xmm9,%xmm9
+	vmovups	-24(%rsi),%xmm0
+	leaq	16(%r11,%rbx,1),%r11
+	vmovdqu	%xmm13,48(%rbp)
+	vaesenc	%xmm1,%xmm2,%xmm2
+	cmpl	32+16(%rsp),%ecx
+	movq	64+32(%rsp),%rbx
+	vaesenc	%xmm1,%xmm3,%xmm3
+	prefetcht0	31(%r12)
+	vaesenc	%xmm1,%xmm4,%xmm4
+	prefetcht0	15(%r10)
+	vaesenc	%xmm1,%xmm5,%xmm5
+	leaq	(%r12,%rbx,1),%rbx
+	cmovgeq	%rsp,%r12
+	vaesenc	%xmm1,%xmm6,%xmm6
+	cmovgq	%rsp,%rbx
+	vaesenc	%xmm1,%xmm7,%xmm7
+	subq	%r12,%rbx
+	vaesenc	%xmm1,%xmm8,%xmm8
+	vpxor	16(%r12),%xmm15,%xmm10
+	movq	%rbx,64+32(%rsp)
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vmovups	-8(%rsi),%xmm1
+	leaq	16(%r12,%rbx,1),%r12
+	vaesenc	%xmm0,%xmm2,%xmm2
+	cmpl	32+20(%rsp),%ecx
+	movq	64+40(%rsp),%rbx
+	vaesenc	%xmm0,%xmm3,%xmm3
+	prefetcht0	31(%r13)
+	vaesenc	%xmm0,%xmm4,%xmm4
+	prefetcht0	15(%r11)
+	vaesenc	%xmm0,%xmm5,%xmm5
+	leaq	(%rbx,%r13,1),%rbx
+	cmovgeq	%rsp,%r13
+	vaesenc	%xmm0,%xmm6,%xmm6
+	cmovgq	%rsp,%rbx
+	vaesenc	%xmm0,%xmm7,%xmm7
+	subq	%r13,%rbx
+	vaesenc	%xmm0,%xmm8,%xmm8
+	vpxor	16(%r13),%xmm15,%xmm11
+	movq	%rbx,64+40(%rsp)
+	vaesenc	%xmm0,%xmm9,%xmm9
+	vmovups	8(%rsi),%xmm0
+	leaq	16(%r13,%rbx,1),%r13
+	vaesenc	%xmm1,%xmm2,%xmm2
+	cmpl	32+24(%rsp),%ecx
+	movq	64+48(%rsp),%rbx
+	vaesenc	%xmm1,%xmm3,%xmm3
+	prefetcht0	31(%r14)
+	vaesenc	%xmm1,%xmm4,%xmm4
+	prefetcht0	15(%r12)
+	vaesenc	%xmm1,%xmm5,%xmm5
+	leaq	(%r14,%rbx,1),%rbx
+	cmovgeq	%rsp,%r14
+	vaesenc	%xmm1,%xmm6,%xmm6
+	cmovgq	%rsp,%rbx
+	vaesenc	%xmm1,%xmm7,%xmm7
+	subq	%r14,%rbx
+	vaesenc	%xmm1,%xmm8,%xmm8
+	vpxor	16(%r14),%xmm15,%xmm12
+	movq	%rbx,64+48(%rsp)
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vmovups	24(%rsi),%xmm1
+	leaq	16(%r14,%rbx,1),%r14
+	vaesenc	%xmm0,%xmm2,%xmm2
+	cmpl	32+28(%rsp),%ecx
+	movq	64+56(%rsp),%rbx
+	vaesenc	%xmm0,%xmm3,%xmm3
+	prefetcht0	31(%r15)
+	vaesenc	%xmm0,%xmm4,%xmm4
+	prefetcht0	15(%r13)
+	vaesenc	%xmm0,%xmm5,%xmm5
+	leaq	(%r15,%rbx,1),%rbx
+	cmovgeq	%rsp,%r15
+	vaesenc	%xmm0,%xmm6,%xmm6
+	cmovgq	%rsp,%rbx
+	vaesenc	%xmm0,%xmm7,%xmm7
+	subq	%r15,%rbx
+	vaesenc	%xmm0,%xmm8,%xmm8
+	vpxor	16(%r15),%xmm15,%xmm13
+	movq	%rbx,64+56(%rsp)
+	vaesenc	%xmm0,%xmm9,%xmm9
+	vmovups	40(%rsi),%xmm0
+	leaq	16(%r15,%rbx,1),%r15
+	vmovdqu	32(%rsp),%xmm14
+	prefetcht0	15(%r14)
+	prefetcht0	15(%r15)
+	cmpl	$11,%eax
+	jb	.Lenc8x_tail
+
+	vaesenc	%xmm1,%xmm2,%xmm2
+	vaesenc	%xmm1,%xmm3,%xmm3
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vaesenc	%xmm1,%xmm5,%xmm5
+	vaesenc	%xmm1,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm8,%xmm8
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vmovups	176-120(%rsi),%xmm1
+
+	vaesenc	%xmm0,%xmm2,%xmm2
+	vaesenc	%xmm0,%xmm3,%xmm3
+	vaesenc	%xmm0,%xmm4,%xmm4
+	vaesenc	%xmm0,%xmm5,%xmm5
+	vaesenc	%xmm0,%xmm6,%xmm6
+	vaesenc	%xmm0,%xmm7,%xmm7
+	vaesenc	%xmm0,%xmm8,%xmm8
+	vaesenc	%xmm0,%xmm9,%xmm9
+	vmovups	192-120(%rsi),%xmm0
+	je	.Lenc8x_tail
+
+	vaesenc	%xmm1,%xmm2,%xmm2
+	vaesenc	%xmm1,%xmm3,%xmm3
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vaesenc	%xmm1,%xmm5,%xmm5
+	vaesenc	%xmm1,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm8,%xmm8
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vmovups	208-120(%rsi),%xmm1
+
+	vaesenc	%xmm0,%xmm2,%xmm2
+	vaesenc	%xmm0,%xmm3,%xmm3
+	vaesenc	%xmm0,%xmm4,%xmm4
+	vaesenc	%xmm0,%xmm5,%xmm5
+	vaesenc	%xmm0,%xmm6,%xmm6
+	vaesenc	%xmm0,%xmm7,%xmm7
+	vaesenc	%xmm0,%xmm8,%xmm8
+	vaesenc	%xmm0,%xmm9,%xmm9
+	vmovups	224-120(%rsi),%xmm0
+
+.Lenc8x_tail:
+	vaesenc	%xmm1,%xmm2,%xmm2
+	vpxor	%xmm15,%xmm15,%xmm15
+	vaesenc	%xmm1,%xmm3,%xmm3
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vpcmpgtd	%xmm15,%xmm14,%xmm15
+	vaesenc	%xmm1,%xmm5,%xmm5
+	vaesenc	%xmm1,%xmm6,%xmm6
+	vpaddd	%xmm14,%xmm15,%xmm15
+	vmovdqu	48(%rsp),%xmm14
+	vaesenc	%xmm1,%xmm7,%xmm7
+	movq	64(%rsp),%rbx
+	vaesenc	%xmm1,%xmm8,%xmm8
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vmovups	16-120(%rsi),%xmm1
+
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vmovdqa	%xmm15,32(%rsp)
+	vpxor	%xmm15,%xmm15,%xmm15
+	vaesenclast	%xmm0,%xmm3,%xmm3
+	vaesenclast	%xmm0,%xmm4,%xmm4
+	vpcmpgtd	%xmm15,%xmm14,%xmm15
+	vaesenclast	%xmm0,%xmm5,%xmm5
+	vaesenclast	%xmm0,%xmm6,%xmm6
+	vpaddd	%xmm15,%xmm14,%xmm14
+	vmovdqu	-120(%rsi),%xmm15
+	vaesenclast	%xmm0,%xmm7,%xmm7
+	vaesenclast	%xmm0,%xmm8,%xmm8
+	vmovdqa	%xmm14,48(%rsp)
+	vaesenclast	%xmm0,%xmm9,%xmm9
+	vmovups	32-120(%rsi),%xmm0
+
+	vmovups	%xmm2,-16(%r8)
+	subq	%rbx,%r8
+	vpxor	0(%rbp),%xmm2,%xmm2
+	vmovups	%xmm3,-16(%r9)
+	subq	72(%rsp),%r9
+	vpxor	16(%rbp),%xmm3,%xmm3
+	vmovups	%xmm4,-16(%r10)
+	subq	80(%rsp),%r10
+	vpxor	32(%rbp),%xmm4,%xmm4
+	vmovups	%xmm5,-16(%r11)
+	subq	88(%rsp),%r11
+	vpxor	48(%rbp),%xmm5,%xmm5
+	vmovups	%xmm6,-16(%r12)
+	subq	96(%rsp),%r12
+	vpxor	%xmm10,%xmm6,%xmm6
+	vmovups	%xmm7,-16(%r13)
+	subq	104(%rsp),%r13
+	vpxor	%xmm11,%xmm7,%xmm7
+	vmovups	%xmm8,-16(%r14)
+	subq	112(%rsp),%r14
+	vpxor	%xmm12,%xmm8,%xmm8
+	vmovups	%xmm9,-16(%r15)
+	subq	120(%rsp),%r15
+	vpxor	%xmm13,%xmm9,%xmm9
+
+	decl	%edx
+	jnz	.Loop_enc8x
+
+	movq	16(%rsp),%rax
+
+
+
+
+
+.Lenc8x_done:
+	vzeroupper
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+.Lenc8x_epilogue:
+	.byte	0xf3,0xc3
+.size	aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
+
+.type	aesni_multi_cbc_decrypt_avx,@function
+.align	32
+aesni_multi_cbc_decrypt_avx:
+_avx_cbc_dec_shortcut:
+	movq	%rsp,%rax
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+
+
+
+
+
+
+
+
+
+	subq	$256,%rsp
+	andq	$-256,%rsp
+	subq	$192,%rsp
+	movq	%rax,16(%rsp)
+
+.Ldec8x_body:
+	vzeroupper
+	vmovdqu	(%rsi),%xmm15
+	leaq	120(%rsi),%rsi
+	leaq	160(%rdi),%rdi
+	shrl	$1,%edx
+
+.Ldec8x_loop_grande:
+
+	xorl	%edx,%edx
+	movl	-144(%rdi),%ecx
+	movq	-160(%rdi),%r8
+	cmpl	%edx,%ecx
+	movq	-152(%rdi),%rbx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	vmovdqu	-136(%rdi),%xmm2
+	movl	%ecx,32(%rsp)
+	cmovleq	%rsp,%r8
+	subq	%r8,%rbx
+	movq	%rbx,64(%rsp)
+	vmovdqu	%xmm2,192(%rsp)
+	movl	-104(%rdi),%ecx
+	movq	-120(%rdi),%r9
+	cmpl	%edx,%ecx
+	movq	-112(%rdi),%rbp
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	vmovdqu	-96(%rdi),%xmm3
+	movl	%ecx,36(%rsp)
+	cmovleq	%rsp,%r9
+	subq	%r9,%rbp
+	movq	%rbp,72(%rsp)
+	vmovdqu	%xmm3,208(%rsp)
+	movl	-64(%rdi),%ecx
+	movq	-80(%rdi),%r10
+	cmpl	%edx,%ecx
+	movq	-72(%rdi),%rbp
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	vmovdqu	-56(%rdi),%xmm4
+	movl	%ecx,40(%rsp)
+	cmovleq	%rsp,%r10
+	subq	%r10,%rbp
+	movq	%rbp,80(%rsp)
+	vmovdqu	%xmm4,224(%rsp)
+	movl	-24(%rdi),%ecx
+	movq	-40(%rdi),%r11
+	cmpl	%edx,%ecx
+	movq	-32(%rdi),%rbp
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	vmovdqu	-16(%rdi),%xmm5
+	movl	%ecx,44(%rsp)
+	cmovleq	%rsp,%r11
+	subq	%r11,%rbp
+	movq	%rbp,88(%rsp)
+	vmovdqu	%xmm5,240(%rsp)
+	movl	16(%rdi),%ecx
+	movq	0(%rdi),%r12
+	cmpl	%edx,%ecx
+	movq	8(%rdi),%rbp
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	vmovdqu	24(%rdi),%xmm6
+	movl	%ecx,48(%rsp)
+	cmovleq	%rsp,%r12
+	subq	%r12,%rbp
+	movq	%rbp,96(%rsp)
+	vmovdqu	%xmm6,256(%rsp)
+	movl	56(%rdi),%ecx
+	movq	40(%rdi),%r13
+	cmpl	%edx,%ecx
+	movq	48(%rdi),%rbp
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	vmovdqu	64(%rdi),%xmm7
+	movl	%ecx,52(%rsp)
+	cmovleq	%rsp,%r13
+	subq	%r13,%rbp
+	movq	%rbp,104(%rsp)
+	vmovdqu	%xmm7,272(%rsp)
+	movl	96(%rdi),%ecx
+	movq	80(%rdi),%r14
+	cmpl	%edx,%ecx
+	movq	88(%rdi),%rbp
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	vmovdqu	104(%rdi),%xmm8
+	movl	%ecx,56(%rsp)
+	cmovleq	%rsp,%r14
+	subq	%r14,%rbp
+	movq	%rbp,112(%rsp)
+	vmovdqu	%xmm8,288(%rsp)
+	movl	136(%rdi),%ecx
+	movq	120(%rdi),%r15
+	cmpl	%edx,%ecx
+	movq	128(%rdi),%rbp
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	vmovdqu	144(%rdi),%xmm9
+	movl	%ecx,60(%rsp)
+	cmovleq	%rsp,%r15
+	subq	%r15,%rbp
+	movq	%rbp,120(%rsp)
+	vmovdqu	%xmm9,304(%rsp)
+	testl	%edx,%edx
+	jz	.Ldec8x_done
+
+	vmovups	16-120(%rsi),%xmm1
+	vmovups	32-120(%rsi),%xmm0
+	movl	240-120(%rsi),%eax
+	leaq	192+128(%rsp),%rbp
+
+	vmovdqu	(%r8),%xmm2
+	vmovdqu	(%r9),%xmm3
+	vmovdqu	(%r10),%xmm4
+	vmovdqu	(%r11),%xmm5
+	vmovdqu	(%r12),%xmm6
+	vmovdqu	(%r13),%xmm7
+	vmovdqu	(%r14),%xmm8
+	vmovdqu	(%r15),%xmm9
+	vmovdqu	%xmm2,0(%rbp)
+	vpxor	%xmm15,%xmm2,%xmm2
+	vmovdqu	%xmm3,16(%rbp)
+	vpxor	%xmm15,%xmm3,%xmm3
+	vmovdqu	%xmm4,32(%rbp)
+	vpxor	%xmm15,%xmm4,%xmm4
+	vmovdqu	%xmm5,48(%rbp)
+	vpxor	%xmm15,%xmm5,%xmm5
+	vmovdqu	%xmm6,64(%rbp)
+	vpxor	%xmm15,%xmm6,%xmm6
+	vmovdqu	%xmm7,80(%rbp)
+	vpxor	%xmm15,%xmm7,%xmm7
+	vmovdqu	%xmm8,96(%rbp)
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	%xmm9,112(%rbp)
+	vpxor	%xmm15,%xmm9,%xmm9
+	xorq	$0x80,%rbp
+	movl	$1,%ecx
+	jmp	.Loop_dec8x
+
+.align	32
+.Loop_dec8x:
+	vaesdec	%xmm1,%xmm2,%xmm2
+	cmpl	32+0(%rsp),%ecx
+	vaesdec	%xmm1,%xmm3,%xmm3
+	prefetcht0	31(%r8)
+	vaesdec	%xmm1,%xmm4,%xmm4
+	vaesdec	%xmm1,%xmm5,%xmm5
+	leaq	(%r8,%rbx,1),%rbx
+	cmovgeq	%rsp,%r8
+	vaesdec	%xmm1,%xmm6,%xmm6
+	cmovgq	%rsp,%rbx
+	vaesdec	%xmm1,%xmm7,%xmm7
+	subq	%r8,%rbx
+	vaesdec	%xmm1,%xmm8,%xmm8
+	vmovdqu	16(%r8),%xmm10
+	movq	%rbx,64+0(%rsp)
+	vaesdec	%xmm1,%xmm9,%xmm9
+	vmovups	-72(%rsi),%xmm1
+	leaq	16(%r8,%rbx,1),%r8
+	vmovdqu	%xmm10,128(%rsp)
+	vaesdec	%xmm0,%xmm2,%xmm2
+	cmpl	32+4(%rsp),%ecx
+	movq	64+8(%rsp),%rbx
+	vaesdec	%xmm0,%xmm3,%xmm3
+	prefetcht0	31(%r9)
+	vaesdec	%xmm0,%xmm4,%xmm4
+	vaesdec	%xmm0,%xmm5,%xmm5
+	leaq	(%r9,%rbx,1),%rbx
+	cmovgeq	%rsp,%r9
+	vaesdec	%xmm0,%xmm6,%xmm6
+	cmovgq	%rsp,%rbx
+	vaesdec	%xmm0,%xmm7,%xmm7
+	subq	%r9,%rbx
+	vaesdec	%xmm0,%xmm8,%xmm8
+	vmovdqu	16(%r9),%xmm11
+	movq	%rbx,64+8(%rsp)
+	vaesdec	%xmm0,%xmm9,%xmm9
+	vmovups	-56(%rsi),%xmm0
+	leaq	16(%r9,%rbx,1),%r9
+	vmovdqu	%xmm11,144(%rsp)
+	vaesdec	%xmm1,%xmm2,%xmm2
+	cmpl	32+8(%rsp),%ecx
+	movq	64+16(%rsp),%rbx
+	vaesdec	%xmm1,%xmm3,%xmm3
+	prefetcht0	31(%r10)
+	vaesdec	%xmm1,%xmm4,%xmm4
+	prefetcht0	15(%r8)
+	vaesdec	%xmm1,%xmm5,%xmm5
+	leaq	(%r10,%rbx,1),%rbx
+	cmovgeq	%rsp,%r10
+	vaesdec	%xmm1,%xmm6,%xmm6
+	cmovgq	%rsp,%rbx
+	vaesdec	%xmm1,%xmm7,%xmm7
+	subq	%r10,%rbx
+	vaesdec	%xmm1,%xmm8,%xmm8
+	vmovdqu	16(%r10),%xmm12
+	movq	%rbx,64+16(%rsp)
+	vaesdec	%xmm1,%xmm9,%xmm9
+	vmovups	-40(%rsi),%xmm1
+	leaq	16(%r10,%rbx,1),%r10
+	vmovdqu	%xmm12,160(%rsp)
+	vaesdec	%xmm0,%xmm2,%xmm2
+	cmpl	32+12(%rsp),%ecx
+	movq	64+24(%rsp),%rbx
+	vaesdec	%xmm0,%xmm3,%xmm3
+	prefetcht0	31(%r11)
+	vaesdec	%xmm0,%xmm4,%xmm4
+	prefetcht0	15(%r9)
+	vaesdec	%xmm0,%xmm5,%xmm5
+	leaq	(%r11,%rbx,1),%rbx
+	cmovgeq	%rsp,%r11
+	vaesdec	%xmm0,%xmm6,%xmm6
+	cmovgq	%rsp,%rbx
+	vaesdec	%xmm0,%xmm7,%xmm7
+	subq	%r11,%rbx
+	vaesdec	%xmm0,%xmm8,%xmm8
+	vmovdqu	16(%r11),%xmm13
+	movq	%rbx,64+24(%rsp)
+	vaesdec	%xmm0,%xmm9,%xmm9
+	vmovups	-24(%rsi),%xmm0
+	leaq	16(%r11,%rbx,1),%r11
+	vmovdqu	%xmm13,176(%rsp)
+	vaesdec	%xmm1,%xmm2,%xmm2
+	cmpl	32+16(%rsp),%ecx
+	movq	64+32(%rsp),%rbx
+	vaesdec	%xmm1,%xmm3,%xmm3
+	prefetcht0	31(%r12)
+	vaesdec	%xmm1,%xmm4,%xmm4
+	prefetcht0	15(%r10)
+	vaesdec	%xmm1,%xmm5,%xmm5
+	leaq	(%r12,%rbx,1),%rbx
+	cmovgeq	%rsp,%r12
+	vaesdec	%xmm1,%xmm6,%xmm6
+	cmovgq	%rsp,%rbx
+	vaesdec	%xmm1,%xmm7,%xmm7
+	subq	%r12,%rbx
+	vaesdec	%xmm1,%xmm8,%xmm8
+	vmovdqu	16(%r12),%xmm10
+	movq	%rbx,64+32(%rsp)
+	vaesdec	%xmm1,%xmm9,%xmm9
+	vmovups	-8(%rsi),%xmm1
+	leaq	16(%r12,%rbx,1),%r12
+	vaesdec	%xmm0,%xmm2,%xmm2
+	cmpl	32+20(%rsp),%ecx
+	movq	64+40(%rsp),%rbx
+	vaesdec	%xmm0,%xmm3,%xmm3
+	prefetcht0	31(%r13)
+	vaesdec	%xmm0,%xmm4,%xmm4
+	prefetcht0	15(%r11)
+	vaesdec	%xmm0,%xmm5,%xmm5
+	leaq	(%rbx,%r13,1),%rbx
+	cmovgeq	%rsp,%r13
+	vaesdec	%xmm0,%xmm6,%xmm6
+	cmovgq	%rsp,%rbx
+	vaesdec	%xmm0,%xmm7,%xmm7
+	subq	%r13,%rbx
+	vaesdec	%xmm0,%xmm8,%xmm8
+	vmovdqu	16(%r13),%xmm11
+	movq	%rbx,64+40(%rsp)
+	vaesdec	%xmm0,%xmm9,%xmm9
+	vmovups	8(%rsi),%xmm0
+	leaq	16(%r13,%rbx,1),%r13
+	vaesdec	%xmm1,%xmm2,%xmm2
+	cmpl	32+24(%rsp),%ecx
+	movq	64+48(%rsp),%rbx
+	vaesdec	%xmm1,%xmm3,%xmm3
+	prefetcht0	31(%r14)
+	vaesdec	%xmm1,%xmm4,%xmm4
+	prefetcht0	15(%r12)
+	vaesdec	%xmm1,%xmm5,%xmm5
+	leaq	(%r14,%rbx,1),%rbx
+	cmovgeq	%rsp,%r14
+	vaesdec	%xmm1,%xmm6,%xmm6
+	cmovgq	%rsp,%rbx
+	vaesdec	%xmm1,%xmm7,%xmm7
+	subq	%r14,%rbx
+	vaesdec	%xmm1,%xmm8,%xmm8
+	vmovdqu	16(%r14),%xmm12
+	movq	%rbx,64+48(%rsp)
+	vaesdec	%xmm1,%xmm9,%xmm9
+	vmovups	24(%rsi),%xmm1
+	leaq	16(%r14,%rbx,1),%r14
+	vaesdec	%xmm0,%xmm2,%xmm2
+	cmpl	32+28(%rsp),%ecx
+	movq	64+56(%rsp),%rbx
+	vaesdec	%xmm0,%xmm3,%xmm3
+	prefetcht0	31(%r15)
+	vaesdec	%xmm0,%xmm4,%xmm4
+	prefetcht0	15(%r13)
+	vaesdec	%xmm0,%xmm5,%xmm5
+	leaq	(%r15,%rbx,1),%rbx
+	cmovgeq	%rsp,%r15
+	vaesdec	%xmm0,%xmm6,%xmm6
+	cmovgq	%rsp,%rbx
+	vaesdec	%xmm0,%xmm7,%xmm7
+	subq	%r15,%rbx
+	vaesdec	%xmm0,%xmm8,%xmm8
+	vmovdqu	16(%r15),%xmm13
+	movq	%rbx,64+56(%rsp)
+	vaesdec	%xmm0,%xmm9,%xmm9
+	vmovups	40(%rsi),%xmm0
+	leaq	16(%r15,%rbx,1),%r15
+	vmovdqu	32(%rsp),%xmm14
+	prefetcht0	15(%r14)
+	prefetcht0	15(%r15)
+	cmpl	$11,%eax
+	jb	.Ldec8x_tail
+
+	vaesdec	%xmm1,%xmm2,%xmm2
+	vaesdec	%xmm1,%xmm3,%xmm3
+	vaesdec	%xmm1,%xmm4,%xmm4
+	vaesdec	%xmm1,%xmm5,%xmm5
+	vaesdec	%xmm1,%xmm6,%xmm6
+	vaesdec	%xmm1,%xmm7,%xmm7
+	vaesdec	%xmm1,%xmm8,%xmm8
+	vaesdec	%xmm1,%xmm9,%xmm9
+	vmovups	176-120(%rsi),%xmm1
+
+	vaesdec	%xmm0,%xmm2,%xmm2
+	vaesdec	%xmm0,%xmm3,%xmm3
+	vaesdec	%xmm0,%xmm4,%xmm4
+	vaesdec	%xmm0,%xmm5,%xmm5
+	vaesdec	%xmm0,%xmm6,%xmm6
+	vaesdec	%xmm0,%xmm7,%xmm7
+	vaesdec	%xmm0,%xmm8,%xmm8
+	vaesdec	%xmm0,%xmm9,%xmm9
+	vmovups	192-120(%rsi),%xmm0
+	je	.Ldec8x_tail
+
+	vaesdec	%xmm1,%xmm2,%xmm2
+	vaesdec	%xmm1,%xmm3,%xmm3
+	vaesdec	%xmm1,%xmm4,%xmm4
+	vaesdec	%xmm1,%xmm5,%xmm5
+	vaesdec	%xmm1,%xmm6,%xmm6
+	vaesdec	%xmm1,%xmm7,%xmm7
+	vaesdec	%xmm1,%xmm8,%xmm8
+	vaesdec	%xmm1,%xmm9,%xmm9
+	vmovups	208-120(%rsi),%xmm1
+
+	vaesdec	%xmm0,%xmm2,%xmm2
+	vaesdec	%xmm0,%xmm3,%xmm3
+	vaesdec	%xmm0,%xmm4,%xmm4
+	vaesdec	%xmm0,%xmm5,%xmm5
+	vaesdec	%xmm0,%xmm6,%xmm6
+	vaesdec	%xmm0,%xmm7,%xmm7
+	vaesdec	%xmm0,%xmm8,%xmm8
+	vaesdec	%xmm0,%xmm9,%xmm9
+	vmovups	224-120(%rsi),%xmm0
+
+.Ldec8x_tail:
+	vaesdec	%xmm1,%xmm2,%xmm2
+	vpxor	%xmm15,%xmm15,%xmm15
+	vaesdec	%xmm1,%xmm3,%xmm3
+	vaesdec	%xmm1,%xmm4,%xmm4
+	vpcmpgtd	%xmm15,%xmm14,%xmm15
+	vaesdec	%xmm1,%xmm5,%xmm5
+	vaesdec	%xmm1,%xmm6,%xmm6
+	vpaddd	%xmm14,%xmm15,%xmm15
+	vmovdqu	48(%rsp),%xmm14
+	vaesdec	%xmm1,%xmm7,%xmm7
+	movq	64(%rsp),%rbx
+	vaesdec	%xmm1,%xmm8,%xmm8
+	vaesdec	%xmm1,%xmm9,%xmm9
+	vmovups	16-120(%rsi),%xmm1
+
+	vaesdeclast	%xmm0,%xmm2,%xmm2
+	vmovdqa	%xmm15,32(%rsp)
+	vpxor	%xmm15,%xmm15,%xmm15
+	vaesdeclast	%xmm0,%xmm3,%xmm3
+	vpxor	0(%rbp),%xmm2,%xmm2
+	vaesdeclast	%xmm0,%xmm4,%xmm4
+	vpxor	16(%rbp),%xmm3,%xmm3
+	vpcmpgtd	%xmm15,%xmm14,%xmm15
+	vaesdeclast	%xmm0,%xmm5,%xmm5
+	vpxor	32(%rbp),%xmm4,%xmm4
+	vaesdeclast	%xmm0,%xmm6,%xmm6
+	vpxor	48(%rbp),%xmm5,%xmm5
+	vpaddd	%xmm15,%xmm14,%xmm14
+	vmovdqu	-120(%rsi),%xmm15
+	vaesdeclast	%xmm0,%xmm7,%xmm7
+	vpxor	64(%rbp),%xmm6,%xmm6
+	vaesdeclast	%xmm0,%xmm8,%xmm8
+	vpxor	80(%rbp),%xmm7,%xmm7
+	vmovdqa	%xmm14,48(%rsp)
+	vaesdeclast	%xmm0,%xmm9,%xmm9
+	vpxor	96(%rbp),%xmm8,%xmm8
+	vmovups	32-120(%rsi),%xmm0
+
+	vmovups	%xmm2,-16(%r8)
+	subq	%rbx,%r8
+	vmovdqu	128+0(%rsp),%xmm2
+	vpxor	112(%rbp),%xmm9,%xmm9
+	vmovups	%xmm3,-16(%r9)
+	subq	72(%rsp),%r9
+	vmovdqu	%xmm2,0(%rbp)
+	vpxor	%xmm15,%xmm2,%xmm2
+	vmovdqu	128+16(%rsp),%xmm3
+	vmovups	%xmm4,-16(%r10)
+	subq	80(%rsp),%r10
+	vmovdqu	%xmm3,16(%rbp)
+	vpxor	%xmm15,%xmm3,%xmm3
+	vmovdqu	128+32(%rsp),%xmm4
+	vmovups	%xmm5,-16(%r11)
+	subq	88(%rsp),%r11
+	vmovdqu	%xmm4,32(%rbp)
+	vpxor	%xmm15,%xmm4,%xmm4
+	vmovdqu	128+48(%rsp),%xmm5
+	vmovups	%xmm6,-16(%r12)
+	subq	96(%rsp),%r12
+	vmovdqu	%xmm5,48(%rbp)
+	vpxor	%xmm15,%xmm5,%xmm5
+	vmovdqu	%xmm10,64(%rbp)
+	vpxor	%xmm10,%xmm15,%xmm6
+	vmovups	%xmm7,-16(%r13)
+	subq	104(%rsp),%r13
+	vmovdqu	%xmm11,80(%rbp)
+	vpxor	%xmm11,%xmm15,%xmm7
+	vmovups	%xmm8,-16(%r14)
+	subq	112(%rsp),%r14
+	vmovdqu	%xmm12,96(%rbp)
+	vpxor	%xmm12,%xmm15,%xmm8
+	vmovups	%xmm9,-16(%r15)
+	subq	120(%rsp),%r15
+	vmovdqu	%xmm13,112(%rbp)
+	vpxor	%xmm13,%xmm15,%xmm9
+
+	xorq	$128,%rbp
+	decl	%edx
+	jnz	.Loop_dec8x
+
+	movq	16(%rsp),%rax
+
+
+
+
+
+.Ldec8x_done:
+	vzeroupper
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+.Ldec8x_epilogue:
+	.byte	0xf3,0xc3
+.size	aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx


Property changes on: trunk/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S	                        (rev 0)
+++ trunk/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,4358 @@
+/* $MidnightBSD$ */
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S 325335 2017-11-02 18:22:53Z jkim $ */
+/* Do not modify. This file is auto-generated from aesni-sha256-x86_64.pl. */
+.text	
+
+
+.globl	aesni_cbc_sha256_enc
+.type	aesni_cbc_sha256_enc,@function
+.align	16
+aesni_cbc_sha256_enc:
+	leaq	OPENSSL_ia32cap_P(%rip),%r11
+	movl	$1,%eax
+	cmpq	$0,%rdi
+	je	.Lprobe
+	movl	0(%r11),%eax
+	movq	4(%r11),%r10
+	btq	$61,%r10
+	jc	aesni_cbc_sha256_enc_shaext
+	movq	%r10,%r11
+	shrq	$32,%r11
+
+	testl	$2048,%r10d
+	jnz	aesni_cbc_sha256_enc_xop
+	andl	$296,%r11d
+	cmpl	$296,%r11d
+	je	aesni_cbc_sha256_enc_avx2
+	andl	$268435456,%r10d
+	jnz	aesni_cbc_sha256_enc_avx
+	ud2
+	xorl	%eax,%eax
+	cmpq	$0,%rdi
+	je	.Lprobe
+	ud2
+.Lprobe:
+	.byte	0xf3,0xc3
+.size	aesni_cbc_sha256_enc,.-aesni_cbc_sha256_enc
+
+.align	64
+.type	K256,@object
+K256:
+.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long	0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
+.long	0,0,0,0,   0,0,0,0
+.byte	65,69,83,78,73,45,67,66,67,43,83,72,65,50,53,54,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	64
+.type	aesni_cbc_sha256_enc_xop,@function
+.align	64
+aesni_cbc_sha256_enc_xop:
+.Lxop_shortcut:
+	movq	8(%rsp),%r10
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	movq	%rsp,%r11
+	subq	$128,%rsp
+	andq	$-64,%rsp
+
+	shlq	$6,%rdx
+	subq	%rdi,%rsi
+	subq	%rdi,%r10
+	addq	%rdi,%rdx
+
+
+	movq	%rsi,64+8(%rsp)
+	movq	%rdx,64+16(%rsp)
+
+	movq	%r8,64+32(%rsp)
+	movq	%r9,64+40(%rsp)
+	movq	%r10,64+48(%rsp)
+	movq	%r11,64+56(%rsp)
+.Lprologue_xop:
+	vzeroall
+
+	movq	%rdi,%r12
+	leaq	128(%rcx),%rdi
+	leaq	K256+544(%rip),%r13
+	movl	240-128(%rdi),%r14d
+	movq	%r9,%r15
+	movq	%r10,%rsi
+	vmovdqu	(%r8),%xmm8
+	subq	$9,%r14
+
+	movl	0(%r15),%eax
+	movl	4(%r15),%ebx
+	movl	8(%r15),%ecx
+	movl	12(%r15),%edx
+	movl	16(%r15),%r8d
+	movl	20(%r15),%r9d
+	movl	24(%r15),%r10d
+	movl	28(%r15),%r11d
+
+	vmovdqa	0(%r13,%r14,8),%xmm14
+	vmovdqa	16(%r13,%r14,8),%xmm13
+	vmovdqa	32(%r13,%r14,8),%xmm12
+	vmovdqu	0-128(%rdi),%xmm10
+	jmp	.Lloop_xop
+.align	16
+.Lloop_xop:
+	vmovdqa	K256+512(%rip),%xmm7
+	vmovdqu	0(%rsi,%r12,1),%xmm0
+	vmovdqu	16(%rsi,%r12,1),%xmm1
+	vmovdqu	32(%rsi,%r12,1),%xmm2
+	vmovdqu	48(%rsi,%r12,1),%xmm3
+	vpshufb	%xmm7,%xmm0,%xmm0
+	leaq	K256(%rip),%rbp
+	vpshufb	%xmm7,%xmm1,%xmm1
+	vpshufb	%xmm7,%xmm2,%xmm2
+	vpaddd	0(%rbp),%xmm0,%xmm4
+	vpshufb	%xmm7,%xmm3,%xmm3
+	vpaddd	32(%rbp),%xmm1,%xmm5
+	vpaddd	64(%rbp),%xmm2,%xmm6
+	vpaddd	96(%rbp),%xmm3,%xmm7
+	vmovdqa	%xmm4,0(%rsp)
+	movl	%eax,%r14d
+	vmovdqa	%xmm5,16(%rsp)
+	movl	%ebx,%esi
+	vmovdqa	%xmm6,32(%rsp)
+	xorl	%ecx,%esi
+	vmovdqa	%xmm7,48(%rsp)
+	movl	%r8d,%r13d
+	jmp	.Lxop_00_47
+
+.align	16
+.Lxop_00_47:
+	subq	$-32*4,%rbp
+	vmovdqu	(%r12),%xmm9
+	movq	%r12,64+0(%rsp)
+	vpalignr	$4,%xmm0,%xmm1,%xmm4
+	rorl	$14,%r13d
+	movl	%r14d,%eax
+	vpalignr	$4,%xmm2,%xmm3,%xmm7
+	movl	%r9d,%r12d
+	xorl	%r8d,%r13d
+.byte	143,232,120,194,236,14
+	rorl	$9,%r14d
+	xorl	%r10d,%r12d
+	vpsrld	$3,%xmm4,%xmm4
+	rorl	$5,%r13d
+	xorl	%eax,%r14d
+	vpaddd	%xmm7,%xmm0,%xmm0
+	andl	%r8d,%r12d
+	vpxor	%xmm10,%xmm9,%xmm9
+	vmovdqu	16-128(%rdi),%xmm10
+	xorl	%r8d,%r13d
+	addl	0(%rsp),%r11d
+	movl	%eax,%r15d
+.byte	143,232,120,194,245,11
+	rorl	$11,%r14d
+	xorl	%r10d,%r12d
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%ebx,%r15d
+	rorl	$6,%r13d
+	addl	%r12d,%r11d
+	andl	%r15d,%esi
+.byte	143,232,120,194,251,13
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ebx,%esi
+	addl	%r11d,%edx
+	vpsrld	$10,%xmm3,%xmm6
+	rorl	$2,%r14d
+	addl	%esi,%r11d
+	vpaddd	%xmm4,%xmm0,%xmm0
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+.byte	143,232,120,194,239,2
+	rorl	$14,%r13d
+	movl	%r14d,%r11d
+	vpxor	%xmm6,%xmm7,%xmm7
+	movl	%r8d,%r12d
+	xorl	%edx,%r13d
+	rorl	$9,%r14d
+	xorl	%r9d,%r12d
+	vpxor	%xmm5,%xmm7,%xmm7
+	rorl	$5,%r13d
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	vpxor	%xmm8,%xmm9,%xmm9
+	xorl	%edx,%r13d
+	vpsrldq	$8,%xmm7,%xmm7
+	addl	4(%rsp),%r10d
+	movl	%r11d,%esi
+	rorl	$11,%r14d
+	xorl	%r9d,%r12d
+	vpaddd	%xmm7,%xmm0,%xmm0
+	xorl	%eax,%esi
+	rorl	$6,%r13d
+	addl	%r12d,%r10d
+	andl	%esi,%r15d
+.byte	143,232,120,194,248,13
+	xorl	%r11d,%r14d
+	addl	%r13d,%r10d
+	vpsrld	$10,%xmm0,%xmm6
+	xorl	%eax,%r15d
+	addl	%r10d,%ecx
+.byte	143,232,120,194,239,2
+	rorl	$2,%r14d
+	addl	%r15d,%r10d
+	vpxor	%xmm6,%xmm7,%xmm7
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r10d
+	vpxor	%xmm5,%xmm7,%xmm7
+	movl	%edx,%r12d
+	xorl	%ecx,%r13d
+	rorl	$9,%r14d
+	xorl	%r8d,%r12d
+	vpslldq	$8,%xmm7,%xmm7
+	rorl	$5,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	32-128(%rdi),%xmm10
+	xorl	%ecx,%r13d
+	vpaddd	%xmm7,%xmm0,%xmm0
+	addl	8(%rsp),%r9d
+	movl	%r10d,%r15d
+	rorl	$11,%r14d
+	xorl	%r8d,%r12d
+	vpaddd	0(%rbp),%xmm0,%xmm6
+	xorl	%r11d,%r15d
+	rorl	$6,%r13d
+	addl	%r12d,%r9d
+	andl	%r15d,%esi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	xorl	%r11d,%esi
+	addl	%r9d,%ebx
+	rorl	$2,%r14d
+	addl	%esi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	xorl	%ebx,%r13d
+	rorl	$9,%r14d
+	xorl	%edx,%r12d
+	rorl	$5,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	48-128(%rdi),%xmm10
+	xorl	%ebx,%r13d
+	addl	12(%rsp),%r8d
+	movl	%r9d,%esi
+	rorl	$11,%r14d
+	xorl	%edx,%r12d
+	xorl	%r10d,%esi
+	rorl	$6,%r13d
+	addl	%r12d,%r8d
+	andl	%esi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	addl	%r8d,%eax
+	rorl	$2,%r14d
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	vmovdqa	%xmm6,0(%rsp)
+	vpalignr	$4,%xmm1,%xmm2,%xmm4
+	rorl	$14,%r13d
+	movl	%r14d,%r8d
+	vpalignr	$4,%xmm3,%xmm0,%xmm7
+	movl	%ebx,%r12d
+	xorl	%eax,%r13d
+.byte	143,232,120,194,236,14
+	rorl	$9,%r14d
+	xorl	%ecx,%r12d
+	vpsrld	$3,%xmm4,%xmm4
+	rorl	$5,%r13d
+	xorl	%r8d,%r14d
+	vpaddd	%xmm7,%xmm1,%xmm1
+	andl	%eax,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	64-128(%rdi),%xmm10
+	xorl	%eax,%r13d
+	addl	16(%rsp),%edx
+	movl	%r8d,%r15d
+.byte	143,232,120,194,245,11
+	rorl	$11,%r14d
+	xorl	%ecx,%r12d
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%r9d,%r15d
+	rorl	$6,%r13d
+	addl	%r12d,%edx
+	andl	%r15d,%esi
+.byte	143,232,120,194,248,13
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%r9d,%esi
+	addl	%edx,%r11d
+	vpsrld	$10,%xmm0,%xmm6
+	rorl	$2,%r14d
+	addl	%esi,%edx
+	vpaddd	%xmm4,%xmm1,%xmm1
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+.byte	143,232,120,194,239,2
+	rorl	$14,%r13d
+	movl	%r14d,%edx
+	vpxor	%xmm6,%xmm7,%xmm7
+	movl	%eax,%r12d
+	xorl	%r11d,%r13d
+	rorl	$9,%r14d
+	xorl	%ebx,%r12d
+	vpxor	%xmm5,%xmm7,%xmm7
+	rorl	$5,%r13d
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	80-128(%rdi),%xmm10
+	xorl	%r11d,%r13d
+	vpsrldq	$8,%xmm7,%xmm7
+	addl	20(%rsp),%ecx
+	movl	%edx,%esi
+	rorl	$11,%r14d
+	xorl	%ebx,%r12d
+	vpaddd	%xmm7,%xmm1,%xmm1
+	xorl	%r8d,%esi
+	rorl	$6,%r13d
+	addl	%r12d,%ecx
+	andl	%esi,%r15d
+.byte	143,232,120,194,249,13
+	xorl	%edx,%r14d
+	addl	%r13d,%ecx
+	vpsrld	$10,%xmm1,%xmm6
+	xorl	%r8d,%r15d
+	addl	%ecx,%r10d
+.byte	143,232,120,194,239,2
+	rorl	$2,%r14d
+	addl	%r15d,%ecx
+	vpxor	%xmm6,%xmm7,%xmm7
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ecx
+	vpxor	%xmm5,%xmm7,%xmm7
+	movl	%r11d,%r12d
+	xorl	%r10d,%r13d
+	rorl	$9,%r14d
+	xorl	%eax,%r12d
+	vpslldq	$8,%xmm7,%xmm7
+	rorl	$5,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	96-128(%rdi),%xmm10
+	xorl	%r10d,%r13d
+	vpaddd	%xmm7,%xmm1,%xmm1
+	addl	24(%rsp),%ebx
+	movl	%ecx,%r15d
+	rorl	$11,%r14d
+	xorl	%eax,%r12d
+	vpaddd	32(%rbp),%xmm1,%xmm6
+	xorl	%edx,%r15d
+	rorl	$6,%r13d
+	addl	%r12d,%ebx
+	andl	%r15d,%esi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	xorl	%edx,%esi
+	addl	%ebx,%r9d
+	rorl	$2,%r14d
+	addl	%esi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	xorl	%r9d,%r13d
+	rorl	$9,%r14d
+	xorl	%r11d,%r12d
+	rorl	$5,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	112-128(%rdi),%xmm10
+	xorl	%r9d,%r13d
+	addl	28(%rsp),%eax
+	movl	%ebx,%esi
+	rorl	$11,%r14d
+	xorl	%r11d,%r12d
+	xorl	%ecx,%esi
+	rorl	$6,%r13d
+	addl	%r12d,%eax
+	andl	%esi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	addl	%eax,%r8d
+	rorl	$2,%r14d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	vmovdqa	%xmm6,16(%rsp)
+	vpalignr	$4,%xmm2,%xmm3,%xmm4
+	rorl	$14,%r13d
+	movl	%r14d,%eax
+	vpalignr	$4,%xmm0,%xmm1,%xmm7
+	movl	%r9d,%r12d
+	xorl	%r8d,%r13d
+.byte	143,232,120,194,236,14
+	rorl	$9,%r14d
+	xorl	%r10d,%r12d
+	vpsrld	$3,%xmm4,%xmm4
+	rorl	$5,%r13d
+	xorl	%eax,%r14d
+	vpaddd	%xmm7,%xmm2,%xmm2
+	andl	%r8d,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	128-128(%rdi),%xmm10
+	xorl	%r8d,%r13d
+	addl	32(%rsp),%r11d
+	movl	%eax,%r15d
+.byte	143,232,120,194,245,11
+	rorl	$11,%r14d
+	xorl	%r10d,%r12d
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%ebx,%r15d
+	rorl	$6,%r13d
+	addl	%r12d,%r11d
+	andl	%r15d,%esi
+.byte	143,232,120,194,249,13
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ebx,%esi
+	addl	%r11d,%edx
+	vpsrld	$10,%xmm1,%xmm6
+	rorl	$2,%r14d
+	addl	%esi,%r11d
+	vpaddd	%xmm4,%xmm2,%xmm2
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+.byte	143,232,120,194,239,2
+	rorl	$14,%r13d
+	movl	%r14d,%r11d
+	vpxor	%xmm6,%xmm7,%xmm7
+	movl	%r8d,%r12d
+	xorl	%edx,%r13d
+	rorl	$9,%r14d
+	xorl	%r9d,%r12d
+	vpxor	%xmm5,%xmm7,%xmm7
+	rorl	$5,%r13d
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	144-128(%rdi),%xmm10
+	xorl	%edx,%r13d
+	vpsrldq	$8,%xmm7,%xmm7
+	addl	36(%rsp),%r10d
+	movl	%r11d,%esi
+	rorl	$11,%r14d
+	xorl	%r9d,%r12d
+	vpaddd	%xmm7,%xmm2,%xmm2
+	xorl	%eax,%esi
+	rorl	$6,%r13d
+	addl	%r12d,%r10d
+	andl	%esi,%r15d
+.byte	143,232,120,194,250,13
+	xorl	%r11d,%r14d
+	addl	%r13d,%r10d
+	vpsrld	$10,%xmm2,%xmm6
+	xorl	%eax,%r15d
+	addl	%r10d,%ecx
+.byte	143,232,120,194,239,2
+	rorl	$2,%r14d
+	addl	%r15d,%r10d
+	vpxor	%xmm6,%xmm7,%xmm7
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r10d
+	vpxor	%xmm5,%xmm7,%xmm7
+	movl	%edx,%r12d
+	xorl	%ecx,%r13d
+	rorl	$9,%r14d
+	xorl	%r8d,%r12d
+	vpslldq	$8,%xmm7,%xmm7
+	rorl	$5,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	160-128(%rdi),%xmm10
+	xorl	%ecx,%r13d
+	vpaddd	%xmm7,%xmm2,%xmm2
+	addl	40(%rsp),%r9d
+	movl	%r10d,%r15d
+	rorl	$11,%r14d
+	xorl	%r8d,%r12d
+	vpaddd	64(%rbp),%xmm2,%xmm6
+	xorl	%r11d,%r15d
+	rorl	$6,%r13d
+	addl	%r12d,%r9d
+	andl	%r15d,%esi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	xorl	%r11d,%esi
+	addl	%r9d,%ebx
+	rorl	$2,%r14d
+	addl	%esi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	xorl	%ebx,%r13d
+	rorl	$9,%r14d
+	xorl	%edx,%r12d
+	rorl	$5,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	vaesenclast	%xmm10,%xmm9,%xmm11
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	176-128(%rdi),%xmm10
+	xorl	%ebx,%r13d
+	addl	44(%rsp),%r8d
+	movl	%r9d,%esi
+	rorl	$11,%r14d
+	xorl	%edx,%r12d
+	xorl	%r10d,%esi
+	rorl	$6,%r13d
+	addl	%r12d,%r8d
+	andl	%esi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	addl	%r8d,%eax
+	rorl	$2,%r14d
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	vmovdqa	%xmm6,32(%rsp)
+	vpalignr	$4,%xmm3,%xmm0,%xmm4
+	rorl	$14,%r13d
+	movl	%r14d,%r8d
+	vpalignr	$4,%xmm1,%xmm2,%xmm7
+	movl	%ebx,%r12d
+	xorl	%eax,%r13d
+.byte	143,232,120,194,236,14
+	rorl	$9,%r14d
+	xorl	%ecx,%r12d
+	vpsrld	$3,%xmm4,%xmm4
+	rorl	$5,%r13d
+	xorl	%r8d,%r14d
+	vpaddd	%xmm7,%xmm3,%xmm3
+	andl	%eax,%r12d
+	vpand	%xmm12,%xmm11,%xmm8
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	192-128(%rdi),%xmm10
+	xorl	%eax,%r13d
+	addl	48(%rsp),%edx
+	movl	%r8d,%r15d
+.byte	143,232,120,194,245,11
+	rorl	$11,%r14d
+	xorl	%ecx,%r12d
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%r9d,%r15d
+	rorl	$6,%r13d
+	addl	%r12d,%edx
+	andl	%r15d,%esi
+.byte	143,232,120,194,250,13
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%r9d,%esi
+	addl	%edx,%r11d
+	vpsrld	$10,%xmm2,%xmm6
+	rorl	$2,%r14d
+	addl	%esi,%edx
+	vpaddd	%xmm4,%xmm3,%xmm3
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+.byte	143,232,120,194,239,2
+	rorl	$14,%r13d
+	movl	%r14d,%edx
+	vpxor	%xmm6,%xmm7,%xmm7
+	movl	%eax,%r12d
+	xorl	%r11d,%r13d
+	rorl	$9,%r14d
+	xorl	%ebx,%r12d
+	vpxor	%xmm5,%xmm7,%xmm7
+	rorl	$5,%r13d
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	vaesenclast	%xmm10,%xmm9,%xmm11
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	208-128(%rdi),%xmm10
+	xorl	%r11d,%r13d
+	vpsrldq	$8,%xmm7,%xmm7
+	addl	52(%rsp),%ecx
+	movl	%edx,%esi
+	rorl	$11,%r14d
+	xorl	%ebx,%r12d
+	vpaddd	%xmm7,%xmm3,%xmm3
+	xorl	%r8d,%esi
+	rorl	$6,%r13d
+	addl	%r12d,%ecx
+	andl	%esi,%r15d
+.byte	143,232,120,194,251,13
+	xorl	%edx,%r14d
+	addl	%r13d,%ecx
+	vpsrld	$10,%xmm3,%xmm6
+	xorl	%r8d,%r15d
+	addl	%ecx,%r10d
+.byte	143,232,120,194,239,2
+	rorl	$2,%r14d
+	addl	%r15d,%ecx
+	vpxor	%xmm6,%xmm7,%xmm7
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ecx
+	vpxor	%xmm5,%xmm7,%xmm7
+	movl	%r11d,%r12d
+	xorl	%r10d,%r13d
+	rorl	$9,%r14d
+	xorl	%eax,%r12d
+	vpslldq	$8,%xmm7,%xmm7
+	rorl	$5,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	vpand	%xmm13,%xmm11,%xmm11
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	224-128(%rdi),%xmm10
+	xorl	%r10d,%r13d
+	vpaddd	%xmm7,%xmm3,%xmm3
+	addl	56(%rsp),%ebx
+	movl	%ecx,%r15d
+	rorl	$11,%r14d
+	xorl	%eax,%r12d
+	vpaddd	96(%rbp),%xmm3,%xmm6
+	xorl	%edx,%r15d
+	rorl	$6,%r13d
+	addl	%r12d,%ebx
+	andl	%r15d,%esi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	xorl	%edx,%esi
+	addl	%ebx,%r9d
+	rorl	$2,%r14d
+	addl	%esi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	xorl	%r9d,%r13d
+	rorl	$9,%r14d
+	xorl	%r11d,%r12d
+	rorl	$5,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	vpor	%xmm11,%xmm8,%xmm8
+	vaesenclast	%xmm10,%xmm9,%xmm11
+	vmovdqu	0-128(%rdi),%xmm10
+	xorl	%r9d,%r13d
+	addl	60(%rsp),%eax
+	movl	%ebx,%esi
+	rorl	$11,%r14d
+	xorl	%r11d,%r12d
+	xorl	%ecx,%esi
+	rorl	$6,%r13d
+	addl	%r12d,%eax
+	andl	%esi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	addl	%eax,%r8d
+	rorl	$2,%r14d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	vmovdqa	%xmm6,48(%rsp)
+	movq	64+0(%rsp),%r12
+	vpand	%xmm14,%xmm11,%xmm11
+	movq	64+8(%rsp),%r15
+	vpor	%xmm11,%xmm8,%xmm8
+	vmovdqu	%xmm8,(%r15,%r12,1)
+	leaq	16(%r12),%r12
+	cmpb	$0,131(%rbp)
+	jne	.Lxop_00_47
+	vmovdqu	(%r12),%xmm9
+	movq	%r12,64+0(%rsp)
+	rorl	$14,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	xorl	%r8d,%r13d
+	rorl	$9,%r14d
+	xorl	%r10d,%r12d
+	rorl	$5,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	vpxor	%xmm10,%xmm9,%xmm9
+	vmovdqu	16-128(%rdi),%xmm10
+	xorl	%r8d,%r13d
+	addl	0(%rsp),%r11d
+	movl	%eax,%r15d
+	rorl	$11,%r14d
+	xorl	%r10d,%r12d
+	xorl	%ebx,%r15d
+	rorl	$6,%r13d
+	addl	%r12d,%r11d
+	andl	%r15d,%esi
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%esi
+	addl	%r11d,%edx
+	rorl	$2,%r14d
+	addl	%esi,%r11d
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	xorl	%edx,%r13d
+	rorl	$9,%r14d
+	xorl	%r9d,%r12d
+	rorl	$5,%r13d
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	vpxor	%xmm8,%xmm9,%xmm9
+	xorl	%edx,%r13d
+	addl	4(%rsp),%r10d
+	movl	%r11d,%esi
+	rorl	$11,%r14d
+	xorl	%r9d,%r12d
+	xorl	%eax,%esi
+	rorl	$6,%r13d
+	addl	%r12d,%r10d
+	andl	%esi,%r15d
+	xorl	%r11d,%r14d
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	addl	%r10d,%ecx
+	rorl	$2,%r14d
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	xorl	%ecx,%r13d
+	rorl	$9,%r14d
+	xorl	%r8d,%r12d
+	rorl	$5,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	32-128(%rdi),%xmm10
+	xorl	%ecx,%r13d
+	addl	8(%rsp),%r9d
+	movl	%r10d,%r15d
+	rorl	$11,%r14d
+	xorl	%r8d,%r12d
+	xorl	%r11d,%r15d
+	rorl	$6,%r13d
+	addl	%r12d,%r9d
+	andl	%r15d,%esi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	xorl	%r11d,%esi
+	addl	%r9d,%ebx
+	rorl	$2,%r14d
+	addl	%esi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	xorl	%ebx,%r13d
+	rorl	$9,%r14d
+	xorl	%edx,%r12d
+	rorl	$5,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	48-128(%rdi),%xmm10
+	xorl	%ebx,%r13d
+	addl	12(%rsp),%r8d
+	movl	%r9d,%esi
+	rorl	$11,%r14d
+	xorl	%edx,%r12d
+	xorl	%r10d,%esi
+	rorl	$6,%r13d
+	addl	%r12d,%r8d
+	andl	%esi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	addl	%r8d,%eax
+	rorl	$2,%r14d
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	xorl	%eax,%r13d
+	rorl	$9,%r14d
+	xorl	%ecx,%r12d
+	rorl	$5,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	64-128(%rdi),%xmm10
+	xorl	%eax,%r13d
+	addl	16(%rsp),%edx
+	movl	%r8d,%r15d
+	rorl	$11,%r14d
+	xorl	%ecx,%r12d
+	xorl	%r9d,%r15d
+	rorl	$6,%r13d
+	addl	%r12d,%edx
+	andl	%r15d,%esi
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%esi
+	addl	%edx,%r11d
+	rorl	$2,%r14d
+	addl	%esi,%edx
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	xorl	%r11d,%r13d
+	rorl	$9,%r14d
+	xorl	%ebx,%r12d
+	rorl	$5,%r13d
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	80-128(%rdi),%xmm10
+	xorl	%r11d,%r13d
+	addl	20(%rsp),%ecx
+	movl	%edx,%esi
+	rorl	$11,%r14d
+	xorl	%ebx,%r12d
+	xorl	%r8d,%esi
+	rorl	$6,%r13d
+	addl	%r12d,%ecx
+	andl	%esi,%r15d
+	xorl	%edx,%r14d
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	addl	%ecx,%r10d
+	rorl	$2,%r14d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	xorl	%r10d,%r13d
+	rorl	$9,%r14d
+	xorl	%eax,%r12d
+	rorl	$5,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	96-128(%rdi),%xmm10
+	xorl	%r10d,%r13d
+	addl	24(%rsp),%ebx
+	movl	%ecx,%r15d
+	rorl	$11,%r14d
+	xorl	%eax,%r12d
+	xorl	%edx,%r15d
+	rorl	$6,%r13d
+	addl	%r12d,%ebx
+	andl	%r15d,%esi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	xorl	%edx,%esi
+	addl	%ebx,%r9d
+	rorl	$2,%r14d
+	addl	%esi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	xorl	%r9d,%r13d
+	rorl	$9,%r14d
+	xorl	%r11d,%r12d
+	rorl	$5,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	112-128(%rdi),%xmm10
+	xorl	%r9d,%r13d
+	addl	28(%rsp),%eax
+	movl	%ebx,%esi
+	rorl	$11,%r14d
+	xorl	%r11d,%r12d
+	xorl	%ecx,%esi
+	rorl	$6,%r13d
+	addl	%r12d,%eax
+	andl	%esi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	addl	%eax,%r8d
+	rorl	$2,%r14d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	xorl	%r8d,%r13d
+	rorl	$9,%r14d
+	xorl	%r10d,%r12d
+	rorl	$5,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	128-128(%rdi),%xmm10
+	xorl	%r8d,%r13d
+	addl	32(%rsp),%r11d
+	movl	%eax,%r15d
+	rorl	$11,%r14d
+	xorl	%r10d,%r12d
+	xorl	%ebx,%r15d
+	rorl	$6,%r13d
+	addl	%r12d,%r11d
+	andl	%r15d,%esi
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%esi
+	addl	%r11d,%edx
+	rorl	$2,%r14d
+	addl	%esi,%r11d
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	xorl	%edx,%r13d
+	rorl	$9,%r14d
+	xorl	%r9d,%r12d
+	rorl	$5,%r13d
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	144-128(%rdi),%xmm10
+	xorl	%edx,%r13d
+	addl	36(%rsp),%r10d
+	movl	%r11d,%esi
+	rorl	$11,%r14d
+	xorl	%r9d,%r12d
+	xorl	%eax,%esi
+	rorl	$6,%r13d
+	addl	%r12d,%r10d
+	andl	%esi,%r15d
+	xorl	%r11d,%r14d
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	addl	%r10d,%ecx
+	rorl	$2,%r14d
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	xorl	%ecx,%r13d
+	rorl	$9,%r14d
+	xorl	%r8d,%r12d
+	rorl	$5,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	160-128(%rdi),%xmm10
+	xorl	%ecx,%r13d
+	addl	40(%rsp),%r9d
+	movl	%r10d,%r15d
+	rorl	$11,%r14d
+	xorl	%r8d,%r12d
+	xorl	%r11d,%r15d
+	rorl	$6,%r13d
+	addl	%r12d,%r9d
+	andl	%r15d,%esi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	xorl	%r11d,%esi
+	addl	%r9d,%ebx
+	rorl	$2,%r14d
+	addl	%esi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	xorl	%ebx,%r13d
+	rorl	$9,%r14d
+	xorl	%edx,%r12d
+	rorl	$5,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	vaesenclast	%xmm10,%xmm9,%xmm11
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	176-128(%rdi),%xmm10
+	xorl	%ebx,%r13d
+	addl	44(%rsp),%r8d
+	movl	%r9d,%esi
+	rorl	$11,%r14d
+	xorl	%edx,%r12d
+	xorl	%r10d,%esi
+	rorl	$6,%r13d
+	addl	%r12d,%r8d
+	andl	%esi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	addl	%r8d,%eax
+	rorl	$2,%r14d
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	xorl	%eax,%r13d
+	rorl	$9,%r14d
+	xorl	%ecx,%r12d
+	rorl	$5,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	vpand	%xmm12,%xmm11,%xmm8
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	192-128(%rdi),%xmm10
+	xorl	%eax,%r13d
+	addl	48(%rsp),%edx
+	movl	%r8d,%r15d
+	rorl	$11,%r14d
+	xorl	%ecx,%r12d
+	xorl	%r9d,%r15d
+	rorl	$6,%r13d
+	addl	%r12d,%edx
+	andl	%r15d,%esi
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%esi
+	addl	%edx,%r11d
+	rorl	$2,%r14d
+	addl	%esi,%edx
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	xorl	%r11d,%r13d
+	rorl	$9,%r14d
+	xorl	%ebx,%r12d
+	rorl	$5,%r13d
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	vaesenclast	%xmm10,%xmm9,%xmm11
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	208-128(%rdi),%xmm10
+	xorl	%r11d,%r13d
+	addl	52(%rsp),%ecx
+	movl	%edx,%esi
+	rorl	$11,%r14d
+	xorl	%ebx,%r12d
+	xorl	%r8d,%esi
+	rorl	$6,%r13d
+	addl	%r12d,%ecx
+	andl	%esi,%r15d
+	xorl	%edx,%r14d
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	addl	%ecx,%r10d
+	rorl	$2,%r14d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	xorl	%r10d,%r13d
+	rorl	$9,%r14d
+	xorl	%eax,%r12d
+	rorl	$5,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	vpand	%xmm13,%xmm11,%xmm11
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	224-128(%rdi),%xmm10
+	xorl	%r10d,%r13d
+	addl	56(%rsp),%ebx
+	movl	%ecx,%r15d
+	rorl	$11,%r14d
+	xorl	%eax,%r12d
+	xorl	%edx,%r15d
+	rorl	$6,%r13d
+	addl	%r12d,%ebx
+	andl	%r15d,%esi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	xorl	%edx,%esi
+	addl	%ebx,%r9d
+	rorl	$2,%r14d
+	addl	%esi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	xorl	%r9d,%r13d
+	rorl	$9,%r14d
+	xorl	%r11d,%r12d
+	rorl	$5,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	vpor	%xmm11,%xmm8,%xmm8
+	vaesenclast	%xmm10,%xmm9,%xmm11
+	vmovdqu	0-128(%rdi),%xmm10
+	xorl	%r9d,%r13d
+	addl	60(%rsp),%eax
+	movl	%ebx,%esi
+	rorl	$11,%r14d
+	xorl	%r11d,%r12d
+	xorl	%ecx,%esi
+	rorl	$6,%r13d
+	addl	%r12d,%eax
+	andl	%esi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	addl	%eax,%r8d
+	rorl	$2,%r14d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	movq	64+0(%rsp),%r12
+	movq	64+8(%rsp),%r13
+	movq	64+40(%rsp),%r15
+	movq	64+48(%rsp),%rsi
+
+	vpand	%xmm14,%xmm11,%xmm11
+	movl	%r14d,%eax
+	vpor	%xmm11,%xmm8,%xmm8
+	vmovdqu	%xmm8,(%r12,%r13,1)
+	leaq	16(%r12),%r12
+
+	addl	0(%r15),%eax
+	addl	4(%r15),%ebx
+	addl	8(%r15),%ecx
+	addl	12(%r15),%edx
+	addl	16(%r15),%r8d
+	addl	20(%r15),%r9d
+	addl	24(%r15),%r10d
+	addl	28(%r15),%r11d
+
+	cmpq	64+16(%rsp),%r12
+
+	movl	%eax,0(%r15)
+	movl	%ebx,4(%r15)
+	movl	%ecx,8(%r15)
+	movl	%edx,12(%r15)
+	movl	%r8d,16(%r15)
+	movl	%r9d,20(%r15)
+	movl	%r10d,24(%r15)
+	movl	%r11d,28(%r15)
+
+	jb	.Lloop_xop
+
+	movq	64+32(%rsp),%r8
+	movq	64+56(%rsp),%rsi
+	vmovdqu	%xmm8,(%r8)
+	vzeroall
+	movq	(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lepilogue_xop:
+	.byte	0xf3,0xc3
+.size	aesni_cbc_sha256_enc_xop,.-aesni_cbc_sha256_enc_xop
+.type	aesni_cbc_sha256_enc_avx, at function
+.align	64
+aesni_cbc_sha256_enc_avx:
+.Lavx_shortcut:
+	movq	8(%rsp),%r10
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	movq	%rsp,%r11
+	subq	$128,%rsp
+	andq	$-64,%rsp
+
+	shlq	$6,%rdx
+	subq	%rdi,%rsi
+	subq	%rdi,%r10
+	addq	%rdi,%rdx
+
+
+	movq	%rsi,64+8(%rsp)
+	movq	%rdx,64+16(%rsp)
+
+	movq	%r8,64+32(%rsp)
+	movq	%r9,64+40(%rsp)
+	movq	%r10,64+48(%rsp)
+	movq	%r11,64+56(%rsp)
+.Lprologue_avx:
+	vzeroall
+
+	movq	%rdi,%r12
+	leaq	128(%rcx),%rdi
+	leaq	K256+544(%rip),%r13
+	movl	240-128(%rdi),%r14d
+	movq	%r9,%r15
+	movq	%r10,%rsi
+	vmovdqu	(%r8),%xmm8
+	subq	$9,%r14
+
+	movl	0(%r15),%eax
+	movl	4(%r15),%ebx
+	movl	8(%r15),%ecx
+	movl	12(%r15),%edx
+	movl	16(%r15),%r8d
+	movl	20(%r15),%r9d
+	movl	24(%r15),%r10d
+	movl	28(%r15),%r11d
+
+	vmovdqa	0(%r13,%r14,8),%xmm14
+	vmovdqa	16(%r13,%r14,8),%xmm13
+	vmovdqa	32(%r13,%r14,8),%xmm12
+	vmovdqu	0-128(%rdi),%xmm10
+	jmp	.Lloop_avx
+.align	16
+.Lloop_avx:
+	vmovdqa	K256+512(%rip),%xmm7
+	vmovdqu	0(%rsi,%r12,1),%xmm0
+	vmovdqu	16(%rsi,%r12,1),%xmm1
+	vmovdqu	32(%rsi,%r12,1),%xmm2
+	vmovdqu	48(%rsi,%r12,1),%xmm3
+	vpshufb	%xmm7,%xmm0,%xmm0
+	leaq	K256(%rip),%rbp
+	vpshufb	%xmm7,%xmm1,%xmm1
+	vpshufb	%xmm7,%xmm2,%xmm2
+	vpaddd	0(%rbp),%xmm0,%xmm4
+	vpshufb	%xmm7,%xmm3,%xmm3
+	vpaddd	32(%rbp),%xmm1,%xmm5
+	vpaddd	64(%rbp),%xmm2,%xmm6
+	vpaddd	96(%rbp),%xmm3,%xmm7
+	vmovdqa	%xmm4,0(%rsp)
+	movl	%eax,%r14d
+	vmovdqa	%xmm5,16(%rsp)
+	movl	%ebx,%esi
+	vmovdqa	%xmm6,32(%rsp)
+	xorl	%ecx,%esi
+	vmovdqa	%xmm7,48(%rsp)
+	movl	%r8d,%r13d
+	jmp	.Lavx_00_47
+
+.align	16
+.Lavx_00_47:
+	subq	$-32*4,%rbp
+	vmovdqu	(%r12),%xmm9
+	movq	%r12,64+0(%rsp)
+	vpalignr	$4,%xmm0,%xmm1,%xmm4
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	vpalignr	$4,%xmm2,%xmm3,%xmm7
+	xorl	%r8d,%r13d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r10d,%r12d
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%r13d,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	vpaddd	%xmm7,%xmm0,%xmm0
+	vpxor	%xmm10,%xmm9,%xmm9
+	vmovdqu	16-128(%rdi),%xmm10
+	xorl	%r8d,%r13d
+	addl	0(%rsp),%r11d
+	movl	%eax,%r15d
+	vpsrld	$3,%xmm4,%xmm7
+	shrdl	$11,%r14d,%r14d
+	xorl	%r10d,%r12d
+	xorl	%ebx,%r15d
+	vpslld	$14,%xmm4,%xmm5
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%r11d
+	andl	%r15d,%esi
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%esi
+	vpshufd	$250,%xmm3,%xmm7
+	addl	%r11d,%edx
+	shrdl	$2,%r14d,%r14d
+	addl	%esi,%r11d
+	vpsrld	$11,%xmm6,%xmm6
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	shrdl	$14,%r13d,%r13d
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	xorl	%edx,%r13d
+	vpslld	$11,%xmm5,%xmm5
+	shrdl	$9,%r14d,%r14d
+	xorl	%r9d,%r12d
+	shrdl	$5,%r13d,%r13d
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	vpxor	%xmm8,%xmm9,%xmm9
+	xorl	%edx,%r13d
+	vpsrld	$10,%xmm7,%xmm6
+	addl	4(%rsp),%r10d
+	movl	%r11d,%esi
+	shrdl	$11,%r14d,%r14d
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%r9d,%r12d
+	xorl	%eax,%esi
+	shrdl	$6,%r13d,%r13d
+	vpsrlq	$17,%xmm7,%xmm7
+	addl	%r12d,%r10d
+	andl	%esi,%r15d
+	xorl	%r11d,%r14d
+	vpaddd	%xmm4,%xmm0,%xmm0
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	addl	%r10d,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$2,%r14d,%r14d
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%r10d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r10d
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%edx,%r12d
+	xorl	%ecx,%r13d
+	shrdl	$9,%r14d,%r14d
+	vpshufd	$132,%xmm6,%xmm6
+	xorl	%r8d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r10d,%r14d
+	vpsrldq	$8,%xmm6,%xmm6
+	andl	%ecx,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	32-128(%rdi),%xmm10
+	xorl	%ecx,%r13d
+	addl	8(%rsp),%r9d
+	vpaddd	%xmm6,%xmm0,%xmm0
+	movl	%r10d,%r15d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r8d,%r12d
+	vpshufd	$80,%xmm0,%xmm7
+	xorl	%r11d,%r15d
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%r9d
+	vpsrld	$10,%xmm7,%xmm6
+	andl	%r15d,%esi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	vpsrlq	$17,%xmm7,%xmm7
+	xorl	%r11d,%esi
+	addl	%r9d,%ebx
+	shrdl	$2,%r14d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	%esi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	vpsrlq	$2,%xmm7,%xmm7
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	vpxor	%xmm7,%xmm6,%xmm6
+	xorl	%ebx,%r13d
+	shrdl	$9,%r14d,%r14d
+	xorl	%edx,%r12d
+	vpshufd	$232,%xmm6,%xmm6
+	shrdl	$5,%r13d,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	vpslldq	$8,%xmm6,%xmm6
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	48-128(%rdi),%xmm10
+	xorl	%ebx,%r13d
+	addl	12(%rsp),%r8d
+	movl	%r9d,%esi
+	vpaddd	%xmm6,%xmm0,%xmm0
+	shrdl	$11,%r14d,%r14d
+	xorl	%edx,%r12d
+	xorl	%r10d,%esi
+	vpaddd	0(%rbp),%xmm0,%xmm6
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%r8d
+	andl	%esi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	addl	%r8d,%eax
+	shrdl	$2,%r14d,%r14d
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	vmovdqa	%xmm6,0(%rsp)
+	vpalignr	$4,%xmm1,%xmm2,%xmm4
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	vpalignr	$4,%xmm3,%xmm0,%xmm7
+	xorl	%eax,%r13d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ecx,%r12d
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%r13d,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	vpaddd	%xmm7,%xmm1,%xmm1
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	64-128(%rdi),%xmm10
+	xorl	%eax,%r13d
+	addl	16(%rsp),%edx
+	movl	%r8d,%r15d
+	vpsrld	$3,%xmm4,%xmm7
+	shrdl	$11,%r14d,%r14d
+	xorl	%ecx,%r12d
+	xorl	%r9d,%r15d
+	vpslld	$14,%xmm4,%xmm5
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%edx
+	andl	%r15d,%esi
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%esi
+	vpshufd	$250,%xmm0,%xmm7
+	addl	%edx,%r11d
+	shrdl	$2,%r14d,%r14d
+	addl	%esi,%edx
+	vpsrld	$11,%xmm6,%xmm6
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	shrdl	$14,%r13d,%r13d
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	xorl	%r11d,%r13d
+	vpslld	$11,%xmm5,%xmm5
+	shrdl	$9,%r14d,%r14d
+	xorl	%ebx,%r12d
+	shrdl	$5,%r13d,%r13d
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	80-128(%rdi),%xmm10
+	xorl	%r11d,%r13d
+	vpsrld	$10,%xmm7,%xmm6
+	addl	20(%rsp),%ecx
+	movl	%edx,%esi
+	shrdl	$11,%r14d,%r14d
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%ebx,%r12d
+	xorl	%r8d,%esi
+	shrdl	$6,%r13d,%r13d
+	vpsrlq	$17,%xmm7,%xmm7
+	addl	%r12d,%ecx
+	andl	%esi,%r15d
+	xorl	%edx,%r14d
+	vpaddd	%xmm4,%xmm1,%xmm1
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	addl	%ecx,%r10d
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$2,%r14d,%r14d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%ecx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%r11d,%r12d
+	xorl	%r10d,%r13d
+	shrdl	$9,%r14d,%r14d
+	vpshufd	$132,%xmm6,%xmm6
+	xorl	%eax,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ecx,%r14d
+	vpsrldq	$8,%xmm6,%xmm6
+	andl	%r10d,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	96-128(%rdi),%xmm10
+	xorl	%r10d,%r13d
+	addl	24(%rsp),%ebx
+	vpaddd	%xmm6,%xmm1,%xmm1
+	movl	%ecx,%r15d
+	shrdl	$11,%r14d,%r14d
+	xorl	%eax,%r12d
+	vpshufd	$80,%xmm1,%xmm7
+	xorl	%edx,%r15d
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%ebx
+	vpsrld	$10,%xmm7,%xmm6
+	andl	%r15d,%esi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	vpsrlq	$17,%xmm7,%xmm7
+	xorl	%edx,%esi
+	addl	%ebx,%r9d
+	shrdl	$2,%r14d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	%esi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	vpsrlq	$2,%xmm7,%xmm7
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	vpxor	%xmm7,%xmm6,%xmm6
+	xorl	%r9d,%r13d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r11d,%r12d
+	vpshufd	$232,%xmm6,%xmm6
+	shrdl	$5,%r13d,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	vpslldq	$8,%xmm6,%xmm6
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	112-128(%rdi),%xmm10
+	xorl	%r9d,%r13d
+	addl	28(%rsp),%eax
+	movl	%ebx,%esi
+	vpaddd	%xmm6,%xmm1,%xmm1
+	shrdl	$11,%r14d,%r14d
+	xorl	%r11d,%r12d
+	xorl	%ecx,%esi
+	vpaddd	32(%rbp),%xmm1,%xmm6
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%eax
+	andl	%esi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	addl	%eax,%r8d
+	shrdl	$2,%r14d,%r14d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	vmovdqa	%xmm6,16(%rsp)
+	vpalignr	$4,%xmm2,%xmm3,%xmm4
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	vpalignr	$4,%xmm0,%xmm1,%xmm7
+	xorl	%r8d,%r13d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r10d,%r12d
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%r13d,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	vpaddd	%xmm7,%xmm2,%xmm2
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	128-128(%rdi),%xmm10
+	xorl	%r8d,%r13d
+	addl	32(%rsp),%r11d
+	movl	%eax,%r15d
+	vpsrld	$3,%xmm4,%xmm7
+	shrdl	$11,%r14d,%r14d
+	xorl	%r10d,%r12d
+	xorl	%ebx,%r15d
+	vpslld	$14,%xmm4,%xmm5
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%r11d
+	andl	%r15d,%esi
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%esi
+	vpshufd	$250,%xmm1,%xmm7
+	addl	%r11d,%edx
+	shrdl	$2,%r14d,%r14d
+	addl	%esi,%r11d
+	vpsrld	$11,%xmm6,%xmm6
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	shrdl	$14,%r13d,%r13d
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	xorl	%edx,%r13d
+	vpslld	$11,%xmm5,%xmm5
+	shrdl	$9,%r14d,%r14d
+	xorl	%r9d,%r12d
+	shrdl	$5,%r13d,%r13d
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	144-128(%rdi),%xmm10
+	xorl	%edx,%r13d
+	vpsrld	$10,%xmm7,%xmm6
+	addl	36(%rsp),%r10d
+	movl	%r11d,%esi
+	shrdl	$11,%r14d,%r14d
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%r9d,%r12d
+	xorl	%eax,%esi
+	shrdl	$6,%r13d,%r13d
+	vpsrlq	$17,%xmm7,%xmm7
+	addl	%r12d,%r10d
+	andl	%esi,%r15d
+	xorl	%r11d,%r14d
+	vpaddd	%xmm4,%xmm2,%xmm2
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	addl	%r10d,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$2,%r14d,%r14d
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%r10d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r10d
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%edx,%r12d
+	xorl	%ecx,%r13d
+	shrdl	$9,%r14d,%r14d
+	vpshufd	$132,%xmm6,%xmm6
+	xorl	%r8d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r10d,%r14d
+	vpsrldq	$8,%xmm6,%xmm6
+	andl	%ecx,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	160-128(%rdi),%xmm10
+	xorl	%ecx,%r13d
+	addl	40(%rsp),%r9d
+	vpaddd	%xmm6,%xmm2,%xmm2
+	movl	%r10d,%r15d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r8d,%r12d
+	vpshufd	$80,%xmm2,%xmm7
+	xorl	%r11d,%r15d
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%r9d
+	vpsrld	$10,%xmm7,%xmm6
+	andl	%r15d,%esi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	vpsrlq	$17,%xmm7,%xmm7
+	xorl	%r11d,%esi
+	addl	%r9d,%ebx
+	shrdl	$2,%r14d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	%esi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	vpsrlq	$2,%xmm7,%xmm7
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	vpxor	%xmm7,%xmm6,%xmm6
+	xorl	%ebx,%r13d
+	shrdl	$9,%r14d,%r14d
+	xorl	%edx,%r12d
+	vpshufd	$232,%xmm6,%xmm6
+	shrdl	$5,%r13d,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	vpslldq	$8,%xmm6,%xmm6
+	vaesenclast	%xmm10,%xmm9,%xmm11
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	176-128(%rdi),%xmm10
+	xorl	%ebx,%r13d
+	addl	44(%rsp),%r8d
+	movl	%r9d,%esi
+	vpaddd	%xmm6,%xmm2,%xmm2
+	shrdl	$11,%r14d,%r14d
+	xorl	%edx,%r12d
+	xorl	%r10d,%esi
+	vpaddd	64(%rbp),%xmm2,%xmm6
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%r8d
+	andl	%esi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	addl	%r8d,%eax
+	shrdl	$2,%r14d,%r14d
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	vmovdqa	%xmm6,32(%rsp)
+	vpalignr	$4,%xmm3,%xmm0,%xmm4
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	vpalignr	$4,%xmm1,%xmm2,%xmm7
+	xorl	%eax,%r13d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ecx,%r12d
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%r13d,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	vpaddd	%xmm7,%xmm3,%xmm3
+	vpand	%xmm12,%xmm11,%xmm8
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	192-128(%rdi),%xmm10
+	xorl	%eax,%r13d
+	addl	48(%rsp),%edx
+	movl	%r8d,%r15d
+	vpsrld	$3,%xmm4,%xmm7
+	shrdl	$11,%r14d,%r14d
+	xorl	%ecx,%r12d
+	xorl	%r9d,%r15d
+	vpslld	$14,%xmm4,%xmm5
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%edx
+	andl	%r15d,%esi
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%esi
+	vpshufd	$250,%xmm2,%xmm7
+	addl	%edx,%r11d
+	shrdl	$2,%r14d,%r14d
+	addl	%esi,%edx
+	vpsrld	$11,%xmm6,%xmm6
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	shrdl	$14,%r13d,%r13d
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	xorl	%r11d,%r13d
+	vpslld	$11,%xmm5,%xmm5
+	shrdl	$9,%r14d,%r14d
+	xorl	%ebx,%r12d
+	shrdl	$5,%r13d,%r13d
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	vaesenclast	%xmm10,%xmm9,%xmm11
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	208-128(%rdi),%xmm10
+	xorl	%r11d,%r13d
+	vpsrld	$10,%xmm7,%xmm6
+	addl	52(%rsp),%ecx
+	movl	%edx,%esi
+	shrdl	$11,%r14d,%r14d
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%ebx,%r12d
+	xorl	%r8d,%esi
+	shrdl	$6,%r13d,%r13d
+	vpsrlq	$17,%xmm7,%xmm7
+	addl	%r12d,%ecx
+	andl	%esi,%r15d
+	xorl	%edx,%r14d
+	vpaddd	%xmm4,%xmm3,%xmm3
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	addl	%ecx,%r10d
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$2,%r14d,%r14d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%ecx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%r11d,%r12d
+	xorl	%r10d,%r13d
+	shrdl	$9,%r14d,%r14d
+	vpshufd	$132,%xmm6,%xmm6
+	xorl	%eax,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ecx,%r14d
+	vpsrldq	$8,%xmm6,%xmm6
+	andl	%r10d,%r12d
+	vpand	%xmm13,%xmm11,%xmm11
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	224-128(%rdi),%xmm10
+	xorl	%r10d,%r13d
+	addl	56(%rsp),%ebx
+	vpaddd	%xmm6,%xmm3,%xmm3
+	movl	%ecx,%r15d
+	shrdl	$11,%r14d,%r14d
+	xorl	%eax,%r12d
+	vpshufd	$80,%xmm3,%xmm7
+	xorl	%edx,%r15d
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%ebx
+	vpsrld	$10,%xmm7,%xmm6
+	andl	%r15d,%esi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	vpsrlq	$17,%xmm7,%xmm7
+	xorl	%edx,%esi
+	addl	%ebx,%r9d
+	shrdl	$2,%r14d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	%esi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	vpsrlq	$2,%xmm7,%xmm7
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	vpxor	%xmm7,%xmm6,%xmm6
+	xorl	%r9d,%r13d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r11d,%r12d
+	vpshufd	$232,%xmm6,%xmm6
+	shrdl	$5,%r13d,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	vpslldq	$8,%xmm6,%xmm6
+	vpor	%xmm11,%xmm8,%xmm8
+	vaesenclast	%xmm10,%xmm9,%xmm11
+	vmovdqu	0-128(%rdi),%xmm10
+	xorl	%r9d,%r13d
+	addl	60(%rsp),%eax
+	movl	%ebx,%esi
+	vpaddd	%xmm6,%xmm3,%xmm3
+	shrdl	$11,%r14d,%r14d
+	xorl	%r11d,%r12d
+	xorl	%ecx,%esi
+	vpaddd	96(%rbp),%xmm3,%xmm6
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%eax
+	andl	%esi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	addl	%eax,%r8d
+	shrdl	$2,%r14d,%r14d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	vmovdqa	%xmm6,48(%rsp)
+	movq	64+0(%rsp),%r12
+	vpand	%xmm14,%xmm11,%xmm11
+	movq	64+8(%rsp),%r15
+	vpor	%xmm11,%xmm8,%xmm8
+	vmovdqu	%xmm8,(%r15,%r12,1)
+	leaq	16(%r12),%r12
+	cmpb	$0,131(%rbp)
+	jne	.Lavx_00_47
+	vmovdqu	(%r12),%xmm9
+	movq	%r12,64+0(%rsp)
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	xorl	%r8d,%r13d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r10d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	vpxor	%xmm10,%xmm9,%xmm9
+	vmovdqu	16-128(%rdi),%xmm10
+	xorl	%r8d,%r13d
+	addl	0(%rsp),%r11d
+	movl	%eax,%r15d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r10d,%r12d
+	xorl	%ebx,%r15d
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%r11d
+	andl	%r15d,%esi
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%esi
+	addl	%r11d,%edx
+	shrdl	$2,%r14d,%r14d
+	addl	%esi,%r11d
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	xorl	%edx,%r13d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r9d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	vpxor	%xmm8,%xmm9,%xmm9
+	xorl	%edx,%r13d
+	addl	4(%rsp),%r10d
+	movl	%r11d,%esi
+	shrdl	$11,%r14d,%r14d
+	xorl	%r9d,%r12d
+	xorl	%eax,%esi
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%r10d
+	andl	%esi,%r15d
+	xorl	%r11d,%r14d
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	addl	%r10d,%ecx
+	shrdl	$2,%r14d,%r14d
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	xorl	%ecx,%r13d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r8d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	32-128(%rdi),%xmm10
+	xorl	%ecx,%r13d
+	addl	8(%rsp),%r9d
+	movl	%r10d,%r15d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r8d,%r12d
+	xorl	%r11d,%r15d
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%r9d
+	andl	%r15d,%esi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	xorl	%r11d,%esi
+	addl	%r9d,%ebx
+	shrdl	$2,%r14d,%r14d
+	addl	%esi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	xorl	%ebx,%r13d
+	shrdl	$9,%r14d,%r14d
+	xorl	%edx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	48-128(%rdi),%xmm10
+	xorl	%ebx,%r13d
+	addl	12(%rsp),%r8d
+	movl	%r9d,%esi
+	shrdl	$11,%r14d,%r14d
+	xorl	%edx,%r12d
+	xorl	%r10d,%esi
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%r8d
+	andl	%esi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	addl	%r8d,%eax
+	shrdl	$2,%r14d,%r14d
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	xorl	%eax,%r13d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ecx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	64-128(%rdi),%xmm10
+	xorl	%eax,%r13d
+	addl	16(%rsp),%edx
+	movl	%r8d,%r15d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ecx,%r12d
+	xorl	%r9d,%r15d
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%edx
+	andl	%r15d,%esi
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%esi
+	addl	%edx,%r11d
+	shrdl	$2,%r14d,%r14d
+	addl	%esi,%edx
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	xorl	%r11d,%r13d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ebx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	80-128(%rdi),%xmm10
+	xorl	%r11d,%r13d
+	addl	20(%rsp),%ecx
+	movl	%edx,%esi
+	shrdl	$11,%r14d,%r14d
+	xorl	%ebx,%r12d
+	xorl	%r8d,%esi
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%ecx
+	andl	%esi,%r15d
+	xorl	%edx,%r14d
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	addl	%ecx,%r10d
+	shrdl	$2,%r14d,%r14d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	xorl	%r10d,%r13d
+	shrdl	$9,%r14d,%r14d
+	xorl	%eax,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	96-128(%rdi),%xmm10
+	xorl	%r10d,%r13d
+	addl	24(%rsp),%ebx
+	movl	%ecx,%r15d
+	shrdl	$11,%r14d,%r14d
+	xorl	%eax,%r12d
+	xorl	%edx,%r15d
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%ebx
+	andl	%r15d,%esi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	xorl	%edx,%esi
+	addl	%ebx,%r9d
+	shrdl	$2,%r14d,%r14d
+	addl	%esi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	xorl	%r9d,%r13d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r11d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	112-128(%rdi),%xmm10
+	xorl	%r9d,%r13d
+	addl	28(%rsp),%eax
+	movl	%ebx,%esi
+	shrdl	$11,%r14d,%r14d
+	xorl	%r11d,%r12d
+	xorl	%ecx,%esi
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%eax
+	andl	%esi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	addl	%eax,%r8d
+	shrdl	$2,%r14d,%r14d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	xorl	%r8d,%r13d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r10d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	128-128(%rdi),%xmm10
+	xorl	%r8d,%r13d
+	addl	32(%rsp),%r11d
+	movl	%eax,%r15d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r10d,%r12d
+	xorl	%ebx,%r15d
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%r11d
+	andl	%r15d,%esi
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%esi
+	addl	%r11d,%edx
+	shrdl	$2,%r14d,%r14d
+	addl	%esi,%r11d
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	xorl	%edx,%r13d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r9d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	144-128(%rdi),%xmm10
+	xorl	%edx,%r13d
+	addl	36(%rsp),%r10d
+	movl	%r11d,%esi
+	shrdl	$11,%r14d,%r14d
+	xorl	%r9d,%r12d
+	xorl	%eax,%esi
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%r10d
+	andl	%esi,%r15d
+	xorl	%r11d,%r14d
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	addl	%r10d,%ecx
+	shrdl	$2,%r14d,%r14d
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	xorl	%ecx,%r13d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r8d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	160-128(%rdi),%xmm10
+	xorl	%ecx,%r13d
+	addl	40(%rsp),%r9d
+	movl	%r10d,%r15d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r8d,%r12d
+	xorl	%r11d,%r15d
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%r9d
+	andl	%r15d,%esi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	xorl	%r11d,%esi
+	addl	%r9d,%ebx
+	shrdl	$2,%r14d,%r14d
+	addl	%esi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	xorl	%ebx,%r13d
+	shrdl	$9,%r14d,%r14d
+	xorl	%edx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	vaesenclast	%xmm10,%xmm9,%xmm11
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	176-128(%rdi),%xmm10
+	xorl	%ebx,%r13d
+	addl	44(%rsp),%r8d
+	movl	%r9d,%esi
+	shrdl	$11,%r14d,%r14d
+	xorl	%edx,%r12d
+	xorl	%r10d,%esi
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%r8d
+	andl	%esi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	addl	%r8d,%eax
+	shrdl	$2,%r14d,%r14d
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	xorl	%eax,%r13d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ecx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	vpand	%xmm12,%xmm11,%xmm8
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	192-128(%rdi),%xmm10
+	xorl	%eax,%r13d
+	addl	48(%rsp),%edx
+	movl	%r8d,%r15d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ecx,%r12d
+	xorl	%r9d,%r15d
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%edx
+	andl	%r15d,%esi
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%esi
+	addl	%edx,%r11d
+	shrdl	$2,%r14d,%r14d
+	addl	%esi,%edx
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	xorl	%r11d,%r13d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ebx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	vaesenclast	%xmm10,%xmm9,%xmm11
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	208-128(%rdi),%xmm10
+	xorl	%r11d,%r13d
+	addl	52(%rsp),%ecx
+	movl	%edx,%esi
+	shrdl	$11,%r14d,%r14d
+	xorl	%ebx,%r12d
+	xorl	%r8d,%esi
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%ecx
+	andl	%esi,%r15d
+	xorl	%edx,%r14d
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	addl	%ecx,%r10d
+	shrdl	$2,%r14d,%r14d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	xorl	%r10d,%r13d
+	shrdl	$9,%r14d,%r14d
+	xorl	%eax,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	vpand	%xmm13,%xmm11,%xmm11
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	224-128(%rdi),%xmm10
+	xorl	%r10d,%r13d
+	addl	56(%rsp),%ebx
+	movl	%ecx,%r15d
+	shrdl	$11,%r14d,%r14d
+	xorl	%eax,%r12d
+	xorl	%edx,%r15d
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%ebx
+	andl	%r15d,%esi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	xorl	%edx,%esi
+	addl	%ebx,%r9d
+	shrdl	$2,%r14d,%r14d
+	addl	%esi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	xorl	%r9d,%r13d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r11d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	vpor	%xmm11,%xmm8,%xmm8
+	vaesenclast	%xmm10,%xmm9,%xmm11
+	vmovdqu	0-128(%rdi),%xmm10
+	xorl	%r9d,%r13d
+	addl	60(%rsp),%eax
+	movl	%ebx,%esi
+	shrdl	$11,%r14d,%r14d
+	xorl	%r11d,%r12d
+	xorl	%ecx,%esi
+	shrdl	$6,%r13d,%r13d
+	addl	%r12d,%eax
+	andl	%esi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	addl	%eax,%r8d
+	shrdl	$2,%r14d,%r14d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	movq	64+0(%rsp),%r12
+	movq	64+8(%rsp),%r13
+	movq	64+40(%rsp),%r15
+	movq	64+48(%rsp),%rsi
+
+	vpand	%xmm14,%xmm11,%xmm11
+	movl	%r14d,%eax
+	vpor	%xmm11,%xmm8,%xmm8
+	vmovdqu	%xmm8,(%r12,%r13,1)
+	leaq	16(%r12),%r12
+
+	addl	0(%r15),%eax
+	addl	4(%r15),%ebx
+	addl	8(%r15),%ecx
+	addl	12(%r15),%edx
+	addl	16(%r15),%r8d
+	addl	20(%r15),%r9d
+	addl	24(%r15),%r10d
+	addl	28(%r15),%r11d
+
+	cmpq	64+16(%rsp),%r12
+
+	movl	%eax,0(%r15)
+	movl	%ebx,4(%r15)
+	movl	%ecx,8(%r15)
+	movl	%edx,12(%r15)
+	movl	%r8d,16(%r15)
+	movl	%r9d,20(%r15)
+	movl	%r10d,24(%r15)
+	movl	%r11d,28(%r15)
+	jb	.Lloop_avx
+
+	movq	64+32(%rsp),%r8
+	movq	64+56(%rsp),%rsi
+	vmovdqu	%xmm8,(%r8)
+	vzeroall
+	movq	(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lepilogue_avx:
+	.byte	0xf3,0xc3
+.size	aesni_cbc_sha256_enc_avx,.-aesni_cbc_sha256_enc_avx
+.type	aesni_cbc_sha256_enc_avx2, at function
+.align	64
+aesni_cbc_sha256_enc_avx2:
+.Lavx2_shortcut:
+	movq	8(%rsp),%r10
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	movq	%rsp,%r11
+	subq	$576,%rsp
+	andq	$-1024,%rsp
+	addq	$448,%rsp
+
+	shlq	$6,%rdx
+	subq	%rdi,%rsi
+	subq	%rdi,%r10
+	addq	%rdi,%rdx
+
+
+
+	movq	%rdx,64+16(%rsp)
+
+	movq	%r8,64+32(%rsp)
+	movq	%r9,64+40(%rsp)
+	movq	%r10,64+48(%rsp)
+	movq	%r11,64+56(%rsp)
+.Lprologue_avx2:
+	vzeroall
+
+	movq	%rdi,%r13
+	vpinsrq	$1,%rsi,%xmm15,%xmm15
+	leaq	128(%rcx),%rdi
+	leaq	K256+544(%rip),%r12
+	movl	240-128(%rdi),%r14d
+	movq	%r9,%r15
+	movq	%r10,%rsi
+	vmovdqu	(%r8),%xmm8
+	leaq	-9(%r14),%r14
+
+	vmovdqa	0(%r12,%r14,8),%xmm14
+	vmovdqa	16(%r12,%r14,8),%xmm13
+	vmovdqa	32(%r12,%r14,8),%xmm12
+
+	subq	$-64,%r13
+	movl	0(%r15),%eax
+	leaq	(%rsi,%r13,1),%r12
+	movl	4(%r15),%ebx
+	cmpq	%rdx,%r13
+	movl	8(%r15),%ecx
+	cmoveq	%rsp,%r12
+	movl	12(%r15),%edx
+	movl	16(%r15),%r8d
+	movl	20(%r15),%r9d
+	movl	24(%r15),%r10d
+	movl	28(%r15),%r11d
+	vmovdqu	0-128(%rdi),%xmm10
+	jmp	.Loop_avx2
+.align	16
+.Loop_avx2:
+	vmovdqa	K256+512(%rip),%ymm7
+	vmovdqu	-64+0(%rsi,%r13,1),%xmm0
+	vmovdqu	-64+16(%rsi,%r13,1),%xmm1
+	vmovdqu	-64+32(%rsi,%r13,1),%xmm2
+	vmovdqu	-64+48(%rsi,%r13,1),%xmm3
+
+	vinserti128	$1,(%r12),%ymm0,%ymm0
+	vinserti128	$1,16(%r12),%ymm1,%ymm1
+	vpshufb	%ymm7,%ymm0,%ymm0
+	vinserti128	$1,32(%r12),%ymm2,%ymm2
+	vpshufb	%ymm7,%ymm1,%ymm1
+	vinserti128	$1,48(%r12),%ymm3,%ymm3
+
+	leaq	K256(%rip),%rbp
+	vpshufb	%ymm7,%ymm2,%ymm2
+	leaq	-64(%r13),%r13
+	vpaddd	0(%rbp),%ymm0,%ymm4
+	vpshufb	%ymm7,%ymm3,%ymm3
+	vpaddd	32(%rbp),%ymm1,%ymm5
+	vpaddd	64(%rbp),%ymm2,%ymm6
+	vpaddd	96(%rbp),%ymm3,%ymm7
+	vmovdqa	%ymm4,0(%rsp)
+	xorl	%r14d,%r14d
+	vmovdqa	%ymm5,32(%rsp)
+	leaq	-64(%rsp),%rsp
+	movl	%ebx,%esi
+	vmovdqa	%ymm6,0(%rsp)
+	xorl	%ecx,%esi
+	vmovdqa	%ymm7,32(%rsp)
+	movl	%r9d,%r12d
+	subq	$-32*4,%rbp
+	jmp	.Lavx2_00_47
+
+.align	16
+.Lavx2_00_47:
+	vmovdqu	(%r13),%xmm9
+	vpinsrq	$0,%r13,%xmm15,%xmm15
+	leaq	-64(%rsp),%rsp
+	vpalignr	$4,%ymm0,%ymm1,%ymm4
+	addl	0+128(%rsp),%r11d
+	andl	%r8d,%r12d
+	rorxl	$25,%r8d,%r13d
+	vpalignr	$4,%ymm2,%ymm3,%ymm7
+	rorxl	$11,%r8d,%r15d
+	leal	(%rax,%r14,1),%eax
+	leal	(%r11,%r12,1),%r11d
+	vpsrld	$7,%ymm4,%ymm6
+	andnl	%r10d,%r8d,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%r8d,%r14d
+	vpaddd	%ymm7,%ymm0,%ymm0
+	leal	(%r11,%r12,1),%r11d
+	xorl	%r14d,%r13d
+	movl	%eax,%r15d
+	vpsrld	$3,%ymm4,%ymm7
+	rorxl	$22,%eax,%r12d
+	leal	(%r11,%r13,1),%r11d
+	xorl	%ebx,%r15d
+	vpslld	$14,%ymm4,%ymm5
+	rorxl	$13,%eax,%r14d
+	rorxl	$2,%eax,%r13d
+	leal	(%rdx,%r11,1),%edx
+	vpxor	%ymm6,%ymm7,%ymm4
+	andl	%r15d,%esi
+	vpxor	%xmm10,%xmm9,%xmm9
+	vmovdqu	16-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%ebx,%esi
+	vpshufd	$250,%ymm3,%ymm7
+	xorl	%r13d,%r14d
+	leal	(%r11,%rsi,1),%r11d
+	movl	%r8d,%r12d
+	vpsrld	$11,%ymm6,%ymm6
+	addl	4+128(%rsp),%r10d
+	andl	%edx,%r12d
+	rorxl	$25,%edx,%r13d
+	vpxor	%ymm5,%ymm4,%ymm4
+	rorxl	$11,%edx,%esi
+	leal	(%r11,%r14,1),%r11d
+	leal	(%r10,%r12,1),%r10d
+	vpslld	$11,%ymm5,%ymm5
+	andnl	%r9d,%edx,%r12d
+	xorl	%esi,%r13d
+	rorxl	$6,%edx,%r14d
+	vpxor	%ymm6,%ymm4,%ymm4
+	leal	(%r10,%r12,1),%r10d
+	xorl	%r14d,%r13d
+	movl	%r11d,%esi
+	vpsrld	$10,%ymm7,%ymm6
+	rorxl	$22,%r11d,%r12d
+	leal	(%r10,%r13,1),%r10d
+	xorl	%eax,%esi
+	vpxor	%ymm5,%ymm4,%ymm4
+	rorxl	$13,%r11d,%r14d
+	rorxl	$2,%r11d,%r13d
+	leal	(%rcx,%r10,1),%ecx
+	vpsrlq	$17,%ymm7,%ymm7
+	andl	%esi,%r15d
+	vpxor	%xmm8,%xmm9,%xmm9
+	xorl	%r12d,%r14d
+	xorl	%eax,%r15d
+	vpaddd	%ymm4,%ymm0,%ymm0
+	xorl	%r13d,%r14d
+	leal	(%r10,%r15,1),%r10d
+	movl	%edx,%r12d
+	vpxor	%ymm7,%ymm6,%ymm6
+	addl	8+128(%rsp),%r9d
+	andl	%ecx,%r12d
+	rorxl	$25,%ecx,%r13d
+	vpsrlq	$2,%ymm7,%ymm7
+	rorxl	$11,%ecx,%r15d
+	leal	(%r10,%r14,1),%r10d
+	leal	(%r9,%r12,1),%r9d
+	vpxor	%ymm7,%ymm6,%ymm6
+	andnl	%r8d,%ecx,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%ecx,%r14d
+	vpshufd	$132,%ymm6,%ymm6
+	leal	(%r9,%r12,1),%r9d
+	xorl	%r14d,%r13d
+	movl	%r10d,%r15d
+	vpsrldq	$8,%ymm6,%ymm6
+	rorxl	$22,%r10d,%r12d
+	leal	(%r9,%r13,1),%r9d
+	xorl	%r11d,%r15d
+	vpaddd	%ymm6,%ymm0,%ymm0
+	rorxl	$13,%r10d,%r14d
+	rorxl	$2,%r10d,%r13d
+	leal	(%rbx,%r9,1),%ebx
+	vpshufd	$80,%ymm0,%ymm7
+	andl	%r15d,%esi
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	32-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%r11d,%esi
+	vpsrld	$10,%ymm7,%ymm6
+	xorl	%r13d,%r14d
+	leal	(%r9,%rsi,1),%r9d
+	movl	%ecx,%r12d
+	vpsrlq	$17,%ymm7,%ymm7
+	addl	12+128(%rsp),%r8d
+	andl	%ebx,%r12d
+	rorxl	$25,%ebx,%r13d
+	vpxor	%ymm7,%ymm6,%ymm6
+	rorxl	$11,%ebx,%esi
+	leal	(%r9,%r14,1),%r9d
+	leal	(%r8,%r12,1),%r8d
+	vpsrlq	$2,%ymm7,%ymm7
+	andnl	%edx,%ebx,%r12d
+	xorl	%esi,%r13d
+	rorxl	$6,%ebx,%r14d
+	vpxor	%ymm7,%ymm6,%ymm6
+	leal	(%r8,%r12,1),%r8d
+	xorl	%r14d,%r13d
+	movl	%r9d,%esi
+	vpshufd	$232,%ymm6,%ymm6
+	rorxl	$22,%r9d,%r12d
+	leal	(%r8,%r13,1),%r8d
+	xorl	%r10d,%esi
+	vpslldq	$8,%ymm6,%ymm6
+	rorxl	$13,%r9d,%r14d
+	rorxl	$2,%r9d,%r13d
+	leal	(%rax,%r8,1),%eax
+	vpaddd	%ymm6,%ymm0,%ymm0
+	andl	%esi,%r15d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	48-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%r10d,%r15d
+	vpaddd	0(%rbp),%ymm0,%ymm6
+	xorl	%r13d,%r14d
+	leal	(%r8,%r15,1),%r8d
+	movl	%ebx,%r12d
+	vmovdqa	%ymm6,0(%rsp)
+	vpalignr	$4,%ymm1,%ymm2,%ymm4
+	addl	32+128(%rsp),%edx
+	andl	%eax,%r12d
+	rorxl	$25,%eax,%r13d
+	vpalignr	$4,%ymm3,%ymm0,%ymm7
+	rorxl	$11,%eax,%r15d
+	leal	(%r8,%r14,1),%r8d
+	leal	(%rdx,%r12,1),%edx
+	vpsrld	$7,%ymm4,%ymm6
+	andnl	%ecx,%eax,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%eax,%r14d
+	vpaddd	%ymm7,%ymm1,%ymm1
+	leal	(%rdx,%r12,1),%edx
+	xorl	%r14d,%r13d
+	movl	%r8d,%r15d
+	vpsrld	$3,%ymm4,%ymm7
+	rorxl	$22,%r8d,%r12d
+	leal	(%rdx,%r13,1),%edx
+	xorl	%r9d,%r15d
+	vpslld	$14,%ymm4,%ymm5
+	rorxl	$13,%r8d,%r14d
+	rorxl	$2,%r8d,%r13d
+	leal	(%r11,%rdx,1),%r11d
+	vpxor	%ymm6,%ymm7,%ymm4
+	andl	%r15d,%esi
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	64-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%r9d,%esi
+	vpshufd	$250,%ymm0,%ymm7
+	xorl	%r13d,%r14d
+	leal	(%rdx,%rsi,1),%edx
+	movl	%eax,%r12d
+	vpsrld	$11,%ymm6,%ymm6
+	addl	36+128(%rsp),%ecx
+	andl	%r11d,%r12d
+	rorxl	$25,%r11d,%r13d
+	vpxor	%ymm5,%ymm4,%ymm4
+	rorxl	$11,%r11d,%esi
+	leal	(%rdx,%r14,1),%edx
+	leal	(%rcx,%r12,1),%ecx
+	vpslld	$11,%ymm5,%ymm5
+	andnl	%ebx,%r11d,%r12d
+	xorl	%esi,%r13d
+	rorxl	$6,%r11d,%r14d
+	vpxor	%ymm6,%ymm4,%ymm4
+	leal	(%rcx,%r12,1),%ecx
+	xorl	%r14d,%r13d
+	movl	%edx,%esi
+	vpsrld	$10,%ymm7,%ymm6
+	rorxl	$22,%edx,%r12d
+	leal	(%rcx,%r13,1),%ecx
+	xorl	%r8d,%esi
+	vpxor	%ymm5,%ymm4,%ymm4
+	rorxl	$13,%edx,%r14d
+	rorxl	$2,%edx,%r13d
+	leal	(%r10,%rcx,1),%r10d
+	vpsrlq	$17,%ymm7,%ymm7
+	andl	%esi,%r15d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	80-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%r8d,%r15d
+	vpaddd	%ymm4,%ymm1,%ymm1
+	xorl	%r13d,%r14d
+	leal	(%rcx,%r15,1),%ecx
+	movl	%r11d,%r12d
+	vpxor	%ymm7,%ymm6,%ymm6
+	addl	40+128(%rsp),%ebx
+	andl	%r10d,%r12d
+	rorxl	$25,%r10d,%r13d
+	vpsrlq	$2,%ymm7,%ymm7
+	rorxl	$11,%r10d,%r15d
+	leal	(%rcx,%r14,1),%ecx
+	leal	(%rbx,%r12,1),%ebx
+	vpxor	%ymm7,%ymm6,%ymm6
+	andnl	%eax,%r10d,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%r10d,%r14d
+	vpshufd	$132,%ymm6,%ymm6
+	leal	(%rbx,%r12,1),%ebx
+	xorl	%r14d,%r13d
+	movl	%ecx,%r15d
+	vpsrldq	$8,%ymm6,%ymm6
+	rorxl	$22,%ecx,%r12d
+	leal	(%rbx,%r13,1),%ebx
+	xorl	%edx,%r15d
+	vpaddd	%ymm6,%ymm1,%ymm1
+	rorxl	$13,%ecx,%r14d
+	rorxl	$2,%ecx,%r13d
+	leal	(%r9,%rbx,1),%r9d
+	vpshufd	$80,%ymm1,%ymm7
+	andl	%r15d,%esi
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	96-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%edx,%esi
+	vpsrld	$10,%ymm7,%ymm6
+	xorl	%r13d,%r14d
+	leal	(%rbx,%rsi,1),%ebx
+	movl	%r10d,%r12d
+	vpsrlq	$17,%ymm7,%ymm7
+	addl	44+128(%rsp),%eax
+	andl	%r9d,%r12d
+	rorxl	$25,%r9d,%r13d
+	vpxor	%ymm7,%ymm6,%ymm6
+	rorxl	$11,%r9d,%esi
+	leal	(%rbx,%r14,1),%ebx
+	leal	(%rax,%r12,1),%eax
+	vpsrlq	$2,%ymm7,%ymm7
+	andnl	%r11d,%r9d,%r12d
+	xorl	%esi,%r13d
+	rorxl	$6,%r9d,%r14d
+	vpxor	%ymm7,%ymm6,%ymm6
+	leal	(%rax,%r12,1),%eax
+	xorl	%r14d,%r13d
+	movl	%ebx,%esi
+	vpshufd	$232,%ymm6,%ymm6
+	rorxl	$22,%ebx,%r12d
+	leal	(%rax,%r13,1),%eax
+	xorl	%ecx,%esi
+	vpslldq	$8,%ymm6,%ymm6
+	rorxl	$13,%ebx,%r14d
+	rorxl	$2,%ebx,%r13d
+	leal	(%r8,%rax,1),%r8d
+	vpaddd	%ymm6,%ymm1,%ymm1
+	andl	%esi,%r15d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	112-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%ecx,%r15d
+	vpaddd	32(%rbp),%ymm1,%ymm6
+	xorl	%r13d,%r14d
+	leal	(%rax,%r15,1),%eax
+	movl	%r9d,%r12d
+	vmovdqa	%ymm6,32(%rsp)
+	leaq	-64(%rsp),%rsp
+	vpalignr	$4,%ymm2,%ymm3,%ymm4
+	addl	0+128(%rsp),%r11d
+	andl	%r8d,%r12d
+	rorxl	$25,%r8d,%r13d
+	vpalignr	$4,%ymm0,%ymm1,%ymm7
+	rorxl	$11,%r8d,%r15d
+	leal	(%rax,%r14,1),%eax
+	leal	(%r11,%r12,1),%r11d
+	vpsrld	$7,%ymm4,%ymm6
+	andnl	%r10d,%r8d,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%r8d,%r14d
+	vpaddd	%ymm7,%ymm2,%ymm2
+	leal	(%r11,%r12,1),%r11d
+	xorl	%r14d,%r13d
+	movl	%eax,%r15d
+	vpsrld	$3,%ymm4,%ymm7
+	rorxl	$22,%eax,%r12d
+	leal	(%r11,%r13,1),%r11d
+	xorl	%ebx,%r15d
+	vpslld	$14,%ymm4,%ymm5
+	rorxl	$13,%eax,%r14d
+	rorxl	$2,%eax,%r13d
+	leal	(%rdx,%r11,1),%edx
+	vpxor	%ymm6,%ymm7,%ymm4
+	andl	%r15d,%esi
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	128-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%ebx,%esi
+	vpshufd	$250,%ymm1,%ymm7
+	xorl	%r13d,%r14d
+	leal	(%r11,%rsi,1),%r11d
+	movl	%r8d,%r12d
+	vpsrld	$11,%ymm6,%ymm6
+	addl	4+128(%rsp),%r10d
+	andl	%edx,%r12d
+	rorxl	$25,%edx,%r13d
+	vpxor	%ymm5,%ymm4,%ymm4
+	rorxl	$11,%edx,%esi
+	leal	(%r11,%r14,1),%r11d
+	leal	(%r10,%r12,1),%r10d
+	vpslld	$11,%ymm5,%ymm5
+	andnl	%r9d,%edx,%r12d
+	xorl	%esi,%r13d
+	rorxl	$6,%edx,%r14d
+	vpxor	%ymm6,%ymm4,%ymm4
+	leal	(%r10,%r12,1),%r10d
+	xorl	%r14d,%r13d
+	movl	%r11d,%esi
+	vpsrld	$10,%ymm7,%ymm6
+	rorxl	$22,%r11d,%r12d
+	leal	(%r10,%r13,1),%r10d
+	xorl	%eax,%esi
+	vpxor	%ymm5,%ymm4,%ymm4
+	rorxl	$13,%r11d,%r14d
+	rorxl	$2,%r11d,%r13d
+	leal	(%rcx,%r10,1),%ecx
+	vpsrlq	$17,%ymm7,%ymm7
+	andl	%esi,%r15d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	144-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%eax,%r15d
+	vpaddd	%ymm4,%ymm2,%ymm2
+	xorl	%r13d,%r14d
+	leal	(%r10,%r15,1),%r10d
+	movl	%edx,%r12d
+	vpxor	%ymm7,%ymm6,%ymm6
+	addl	8+128(%rsp),%r9d
+	andl	%ecx,%r12d
+	rorxl	$25,%ecx,%r13d
+	vpsrlq	$2,%ymm7,%ymm7
+	rorxl	$11,%ecx,%r15d
+	leal	(%r10,%r14,1),%r10d
+	leal	(%r9,%r12,1),%r9d
+	vpxor	%ymm7,%ymm6,%ymm6
+	andnl	%r8d,%ecx,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%ecx,%r14d
+	vpshufd	$132,%ymm6,%ymm6
+	leal	(%r9,%r12,1),%r9d
+	xorl	%r14d,%r13d
+	movl	%r10d,%r15d
+	vpsrldq	$8,%ymm6,%ymm6
+	rorxl	$22,%r10d,%r12d
+	leal	(%r9,%r13,1),%r9d
+	xorl	%r11d,%r15d
+	vpaddd	%ymm6,%ymm2,%ymm2
+	rorxl	$13,%r10d,%r14d
+	rorxl	$2,%r10d,%r13d
+	leal	(%rbx,%r9,1),%ebx
+	vpshufd	$80,%ymm2,%ymm7
+	andl	%r15d,%esi
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	160-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%r11d,%esi
+	vpsrld	$10,%ymm7,%ymm6
+	xorl	%r13d,%r14d
+	leal	(%r9,%rsi,1),%r9d
+	movl	%ecx,%r12d
+	vpsrlq	$17,%ymm7,%ymm7
+	addl	12+128(%rsp),%r8d
+	andl	%ebx,%r12d
+	rorxl	$25,%ebx,%r13d
+	vpxor	%ymm7,%ymm6,%ymm6
+	rorxl	$11,%ebx,%esi
+	leal	(%r9,%r14,1),%r9d
+	leal	(%r8,%r12,1),%r8d
+	vpsrlq	$2,%ymm7,%ymm7
+	andnl	%edx,%ebx,%r12d
+	xorl	%esi,%r13d
+	rorxl	$6,%ebx,%r14d
+	vpxor	%ymm7,%ymm6,%ymm6
+	leal	(%r8,%r12,1),%r8d
+	xorl	%r14d,%r13d
+	movl	%r9d,%esi
+	vpshufd	$232,%ymm6,%ymm6
+	rorxl	$22,%r9d,%r12d
+	leal	(%r8,%r13,1),%r8d
+	xorl	%r10d,%esi
+	vpslldq	$8,%ymm6,%ymm6
+	rorxl	$13,%r9d,%r14d
+	rorxl	$2,%r9d,%r13d
+	leal	(%rax,%r8,1),%eax
+	vpaddd	%ymm6,%ymm2,%ymm2
+	andl	%esi,%r15d
+	vaesenclast	%xmm10,%xmm9,%xmm11
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	176-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%r10d,%r15d
+	vpaddd	64(%rbp),%ymm2,%ymm6
+	xorl	%r13d,%r14d
+	leal	(%r8,%r15,1),%r8d
+	movl	%ebx,%r12d
+	vmovdqa	%ymm6,0(%rsp)
+	vpalignr	$4,%ymm3,%ymm0,%ymm4
+	addl	32+128(%rsp),%edx
+	andl	%eax,%r12d
+	rorxl	$25,%eax,%r13d
+	vpalignr	$4,%ymm1,%ymm2,%ymm7
+	rorxl	$11,%eax,%r15d
+	leal	(%r8,%r14,1),%r8d
+	leal	(%rdx,%r12,1),%edx
+	vpsrld	$7,%ymm4,%ymm6
+	andnl	%ecx,%eax,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%eax,%r14d
+	vpaddd	%ymm7,%ymm3,%ymm3
+	leal	(%rdx,%r12,1),%edx
+	xorl	%r14d,%r13d
+	movl	%r8d,%r15d
+	vpsrld	$3,%ymm4,%ymm7
+	rorxl	$22,%r8d,%r12d
+	leal	(%rdx,%r13,1),%edx
+	xorl	%r9d,%r15d
+	vpslld	$14,%ymm4,%ymm5
+	rorxl	$13,%r8d,%r14d
+	rorxl	$2,%r8d,%r13d
+	leal	(%r11,%rdx,1),%r11d
+	vpxor	%ymm6,%ymm7,%ymm4
+	andl	%r15d,%esi
+	vpand	%xmm12,%xmm11,%xmm8
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	192-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%r9d,%esi
+	vpshufd	$250,%ymm2,%ymm7
+	xorl	%r13d,%r14d
+	leal	(%rdx,%rsi,1),%edx
+	movl	%eax,%r12d
+	vpsrld	$11,%ymm6,%ymm6
+	addl	36+128(%rsp),%ecx
+	andl	%r11d,%r12d
+	rorxl	$25,%r11d,%r13d
+	vpxor	%ymm5,%ymm4,%ymm4
+	rorxl	$11,%r11d,%esi
+	leal	(%rdx,%r14,1),%edx
+	leal	(%rcx,%r12,1),%ecx
+	vpslld	$11,%ymm5,%ymm5
+	andnl	%ebx,%r11d,%r12d
+	xorl	%esi,%r13d
+	rorxl	$6,%r11d,%r14d
+	vpxor	%ymm6,%ymm4,%ymm4
+	leal	(%rcx,%r12,1),%ecx
+	xorl	%r14d,%r13d
+	movl	%edx,%esi
+	vpsrld	$10,%ymm7,%ymm6
+	rorxl	$22,%edx,%r12d
+	leal	(%rcx,%r13,1),%ecx
+	xorl	%r8d,%esi
+	vpxor	%ymm5,%ymm4,%ymm4
+	rorxl	$13,%edx,%r14d
+	rorxl	$2,%edx,%r13d
+	leal	(%r10,%rcx,1),%r10d
+	vpsrlq	$17,%ymm7,%ymm7
+	andl	%esi,%r15d
+	vaesenclast	%xmm10,%xmm9,%xmm11
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	208-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%r8d,%r15d
+	vpaddd	%ymm4,%ymm3,%ymm3
+	xorl	%r13d,%r14d
+	leal	(%rcx,%r15,1),%ecx
+	movl	%r11d,%r12d
+	vpxor	%ymm7,%ymm6,%ymm6
+	addl	40+128(%rsp),%ebx
+	andl	%r10d,%r12d
+	rorxl	$25,%r10d,%r13d
+	vpsrlq	$2,%ymm7,%ymm7
+	rorxl	$11,%r10d,%r15d
+	leal	(%rcx,%r14,1),%ecx
+	leal	(%rbx,%r12,1),%ebx
+	vpxor	%ymm7,%ymm6,%ymm6
+	andnl	%eax,%r10d,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%r10d,%r14d
+	vpshufd	$132,%ymm6,%ymm6
+	leal	(%rbx,%r12,1),%ebx
+	xorl	%r14d,%r13d
+	movl	%ecx,%r15d
+	vpsrldq	$8,%ymm6,%ymm6
+	rorxl	$22,%ecx,%r12d
+	leal	(%rbx,%r13,1),%ebx
+	xorl	%edx,%r15d
+	vpaddd	%ymm6,%ymm3,%ymm3
+	rorxl	$13,%ecx,%r14d
+	rorxl	$2,%ecx,%r13d
+	leal	(%r9,%rbx,1),%r9d
+	vpshufd	$80,%ymm3,%ymm7
+	andl	%r15d,%esi
+	vpand	%xmm13,%xmm11,%xmm11
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	224-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%edx,%esi
+	vpsrld	$10,%ymm7,%ymm6
+	xorl	%r13d,%r14d
+	leal	(%rbx,%rsi,1),%ebx
+	movl	%r10d,%r12d
+	vpsrlq	$17,%ymm7,%ymm7
+	addl	44+128(%rsp),%eax
+	andl	%r9d,%r12d
+	rorxl	$25,%r9d,%r13d
+	vpxor	%ymm7,%ymm6,%ymm6
+	rorxl	$11,%r9d,%esi
+	leal	(%rbx,%r14,1),%ebx
+	leal	(%rax,%r12,1),%eax
+	vpsrlq	$2,%ymm7,%ymm7
+	andnl	%r11d,%r9d,%r12d
+	xorl	%esi,%r13d
+	rorxl	$6,%r9d,%r14d
+	vpxor	%ymm7,%ymm6,%ymm6
+	leal	(%rax,%r12,1),%eax
+	xorl	%r14d,%r13d
+	movl	%ebx,%esi
+	vpshufd	$232,%ymm6,%ymm6
+	rorxl	$22,%ebx,%r12d
+	leal	(%rax,%r13,1),%eax
+	xorl	%ecx,%esi
+	vpslldq	$8,%ymm6,%ymm6
+	rorxl	$13,%ebx,%r14d
+	rorxl	$2,%ebx,%r13d
+	leal	(%r8,%rax,1),%r8d
+	vpaddd	%ymm6,%ymm3,%ymm3
+	andl	%esi,%r15d
+	vpor	%xmm11,%xmm8,%xmm8
+	vaesenclast	%xmm10,%xmm9,%xmm11
+	vmovdqu	0-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%ecx,%r15d
+	vpaddd	96(%rbp),%ymm3,%ymm6
+	xorl	%r13d,%r14d
+	leal	(%rax,%r15,1),%eax
+	movl	%r9d,%r12d
+	vmovdqa	%ymm6,32(%rsp)
+	vmovq	%xmm15,%r13
+	vpextrq	$1,%xmm15,%r15
+	vpand	%xmm14,%xmm11,%xmm11
+	vpor	%xmm11,%xmm8,%xmm8
+	vmovdqu	%xmm8,(%r15,%r13,1)
+	leaq	16(%r13),%r13
+	leaq	128(%rbp),%rbp
+	cmpb	$0,3(%rbp)
+	jne	.Lavx2_00_47
+	vmovdqu	(%r13),%xmm9
+	vpinsrq	$0,%r13,%xmm15,%xmm15
+	addl	0+64(%rsp),%r11d
+	andl	%r8d,%r12d
+	rorxl	$25,%r8d,%r13d
+	rorxl	$11,%r8d,%r15d
+	leal	(%rax,%r14,1),%eax
+	leal	(%r11,%r12,1),%r11d
+	andnl	%r10d,%r8d,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%r8d,%r14d
+	leal	(%r11,%r12,1),%r11d
+	xorl	%r14d,%r13d
+	movl	%eax,%r15d
+	rorxl	$22,%eax,%r12d
+	leal	(%r11,%r13,1),%r11d
+	xorl	%ebx,%r15d
+	rorxl	$13,%eax,%r14d
+	rorxl	$2,%eax,%r13d
+	leal	(%rdx,%r11,1),%edx
+	andl	%r15d,%esi
+	vpxor	%xmm10,%xmm9,%xmm9
+	vmovdqu	16-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%ebx,%esi
+	xorl	%r13d,%r14d
+	leal	(%r11,%rsi,1),%r11d
+	movl	%r8d,%r12d
+	addl	4+64(%rsp),%r10d
+	andl	%edx,%r12d
+	rorxl	$25,%edx,%r13d
+	rorxl	$11,%edx,%esi
+	leal	(%r11,%r14,1),%r11d
+	leal	(%r10,%r12,1),%r10d
+	andnl	%r9d,%edx,%r12d
+	xorl	%esi,%r13d
+	rorxl	$6,%edx,%r14d
+	leal	(%r10,%r12,1),%r10d
+	xorl	%r14d,%r13d
+	movl	%r11d,%esi
+	rorxl	$22,%r11d,%r12d
+	leal	(%r10,%r13,1),%r10d
+	xorl	%eax,%esi
+	rorxl	$13,%r11d,%r14d
+	rorxl	$2,%r11d,%r13d
+	leal	(%rcx,%r10,1),%ecx
+	andl	%esi,%r15d
+	vpxor	%xmm8,%xmm9,%xmm9
+	xorl	%r12d,%r14d
+	xorl	%eax,%r15d
+	xorl	%r13d,%r14d
+	leal	(%r10,%r15,1),%r10d
+	movl	%edx,%r12d
+	addl	8+64(%rsp),%r9d
+	andl	%ecx,%r12d
+	rorxl	$25,%ecx,%r13d
+	rorxl	$11,%ecx,%r15d
+	leal	(%r10,%r14,1),%r10d
+	leal	(%r9,%r12,1),%r9d
+	andnl	%r8d,%ecx,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%ecx,%r14d
+	leal	(%r9,%r12,1),%r9d
+	xorl	%r14d,%r13d
+	movl	%r10d,%r15d
+	rorxl	$22,%r10d,%r12d
+	leal	(%r9,%r13,1),%r9d
+	xorl	%r11d,%r15d
+	rorxl	$13,%r10d,%r14d
+	rorxl	$2,%r10d,%r13d
+	leal	(%rbx,%r9,1),%ebx
+	andl	%r15d,%esi
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	32-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%r11d,%esi
+	xorl	%r13d,%r14d
+	leal	(%r9,%rsi,1),%r9d
+	movl	%ecx,%r12d
+	addl	12+64(%rsp),%r8d
+	andl	%ebx,%r12d
+	rorxl	$25,%ebx,%r13d
+	rorxl	$11,%ebx,%esi
+	leal	(%r9,%r14,1),%r9d
+	leal	(%r8,%r12,1),%r8d
+	andnl	%edx,%ebx,%r12d
+	xorl	%esi,%r13d
+	rorxl	$6,%ebx,%r14d
+	leal	(%r8,%r12,1),%r8d
+	xorl	%r14d,%r13d
+	movl	%r9d,%esi
+	rorxl	$22,%r9d,%r12d
+	leal	(%r8,%r13,1),%r8d
+	xorl	%r10d,%esi
+	rorxl	$13,%r9d,%r14d
+	rorxl	$2,%r9d,%r13d
+	leal	(%rax,%r8,1),%eax
+	andl	%esi,%r15d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	48-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%r10d,%r15d
+	xorl	%r13d,%r14d
+	leal	(%r8,%r15,1),%r8d
+	movl	%ebx,%r12d
+	addl	32+64(%rsp),%edx
+	andl	%eax,%r12d
+	rorxl	$25,%eax,%r13d
+	rorxl	$11,%eax,%r15d
+	leal	(%r8,%r14,1),%r8d
+	leal	(%rdx,%r12,1),%edx
+	andnl	%ecx,%eax,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%eax,%r14d
+	leal	(%rdx,%r12,1),%edx
+	xorl	%r14d,%r13d
+	movl	%r8d,%r15d
+	rorxl	$22,%r8d,%r12d
+	leal	(%rdx,%r13,1),%edx
+	xorl	%r9d,%r15d
+	rorxl	$13,%r8d,%r14d
+	rorxl	$2,%r8d,%r13d
+	leal	(%r11,%rdx,1),%r11d
+	andl	%r15d,%esi
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	64-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%r9d,%esi
+	xorl	%r13d,%r14d
+	leal	(%rdx,%rsi,1),%edx
+	movl	%eax,%r12d
+	addl	36+64(%rsp),%ecx
+	andl	%r11d,%r12d
+	rorxl	$25,%r11d,%r13d
+	rorxl	$11,%r11d,%esi
+	leal	(%rdx,%r14,1),%edx
+	leal	(%rcx,%r12,1),%ecx
+	andnl	%ebx,%r11d,%r12d
+	xorl	%esi,%r13d
+	rorxl	$6,%r11d,%r14d
+	leal	(%rcx,%r12,1),%ecx
+	xorl	%r14d,%r13d
+	movl	%edx,%esi
+	rorxl	$22,%edx,%r12d
+	leal	(%rcx,%r13,1),%ecx
+	xorl	%r8d,%esi
+	rorxl	$13,%edx,%r14d
+	rorxl	$2,%edx,%r13d
+	leal	(%r10,%rcx,1),%r10d
+	andl	%esi,%r15d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	80-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%r8d,%r15d
+	xorl	%r13d,%r14d
+	leal	(%rcx,%r15,1),%ecx
+	movl	%r11d,%r12d
+	addl	40+64(%rsp),%ebx
+	andl	%r10d,%r12d
+	rorxl	$25,%r10d,%r13d
+	rorxl	$11,%r10d,%r15d
+	leal	(%rcx,%r14,1),%ecx
+	leal	(%rbx,%r12,1),%ebx
+	andnl	%eax,%r10d,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%r10d,%r14d
+	leal	(%rbx,%r12,1),%ebx
+	xorl	%r14d,%r13d
+	movl	%ecx,%r15d
+	rorxl	$22,%ecx,%r12d
+	leal	(%rbx,%r13,1),%ebx
+	xorl	%edx,%r15d
+	rorxl	$13,%ecx,%r14d
+	rorxl	$2,%ecx,%r13d
+	leal	(%r9,%rbx,1),%r9d
+	andl	%r15d,%esi
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	96-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%edx,%esi
+	xorl	%r13d,%r14d
+	leal	(%rbx,%rsi,1),%ebx
+	movl	%r10d,%r12d
+	addl	44+64(%rsp),%eax
+	andl	%r9d,%r12d
+	rorxl	$25,%r9d,%r13d
+	rorxl	$11,%r9d,%esi
+	leal	(%rbx,%r14,1),%ebx
+	leal	(%rax,%r12,1),%eax
+	andnl	%r11d,%r9d,%r12d
+	xorl	%esi,%r13d
+	rorxl	$6,%r9d,%r14d
+	leal	(%rax,%r12,1),%eax
+	xorl	%r14d,%r13d
+	movl	%ebx,%esi
+	rorxl	$22,%ebx,%r12d
+	leal	(%rax,%r13,1),%eax
+	xorl	%ecx,%esi
+	rorxl	$13,%ebx,%r14d
+	rorxl	$2,%ebx,%r13d
+	leal	(%r8,%rax,1),%r8d
+	andl	%esi,%r15d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	112-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%ecx,%r15d
+	xorl	%r13d,%r14d
+	leal	(%rax,%r15,1),%eax
+	movl	%r9d,%r12d
+	addl	0(%rsp),%r11d
+	andl	%r8d,%r12d
+	rorxl	$25,%r8d,%r13d
+	rorxl	$11,%r8d,%r15d
+	leal	(%rax,%r14,1),%eax
+	leal	(%r11,%r12,1),%r11d
+	andnl	%r10d,%r8d,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%r8d,%r14d
+	leal	(%r11,%r12,1),%r11d
+	xorl	%r14d,%r13d
+	movl	%eax,%r15d
+	rorxl	$22,%eax,%r12d
+	leal	(%r11,%r13,1),%r11d
+	xorl	%ebx,%r15d
+	rorxl	$13,%eax,%r14d
+	rorxl	$2,%eax,%r13d
+	leal	(%rdx,%r11,1),%edx
+	andl	%r15d,%esi
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	128-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%ebx,%esi
+	xorl	%r13d,%r14d
+	leal	(%r11,%rsi,1),%r11d
+	movl	%r8d,%r12d
+	addl	4(%rsp),%r10d
+	andl	%edx,%r12d
+	rorxl	$25,%edx,%r13d
+	rorxl	$11,%edx,%esi
+	leal	(%r11,%r14,1),%r11d
+	leal	(%r10,%r12,1),%r10d
+	andnl	%r9d,%edx,%r12d
+	xorl	%esi,%r13d
+	rorxl	$6,%edx,%r14d
+	leal	(%r10,%r12,1),%r10d
+	xorl	%r14d,%r13d
+	movl	%r11d,%esi
+	rorxl	$22,%r11d,%r12d
+	leal	(%r10,%r13,1),%r10d
+	xorl	%eax,%esi
+	rorxl	$13,%r11d,%r14d
+	rorxl	$2,%r11d,%r13d
+	leal	(%rcx,%r10,1),%ecx
+	andl	%esi,%r15d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	144-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%eax,%r15d
+	xorl	%r13d,%r14d
+	leal	(%r10,%r15,1),%r10d
+	movl	%edx,%r12d
+	addl	8(%rsp),%r9d
+	andl	%ecx,%r12d
+	rorxl	$25,%ecx,%r13d
+	rorxl	$11,%ecx,%r15d
+	leal	(%r10,%r14,1),%r10d
+	leal	(%r9,%r12,1),%r9d
+	andnl	%r8d,%ecx,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%ecx,%r14d
+	leal	(%r9,%r12,1),%r9d
+	xorl	%r14d,%r13d
+	movl	%r10d,%r15d
+	rorxl	$22,%r10d,%r12d
+	leal	(%r9,%r13,1),%r9d
+	xorl	%r11d,%r15d
+	rorxl	$13,%r10d,%r14d
+	rorxl	$2,%r10d,%r13d
+	leal	(%rbx,%r9,1),%ebx
+	andl	%r15d,%esi
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	160-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%r11d,%esi
+	xorl	%r13d,%r14d
+	leal	(%r9,%rsi,1),%r9d
+	movl	%ecx,%r12d
+	addl	12(%rsp),%r8d
+	andl	%ebx,%r12d
+	rorxl	$25,%ebx,%r13d
+	rorxl	$11,%ebx,%esi
+	leal	(%r9,%r14,1),%r9d
+	leal	(%r8,%r12,1),%r8d
+	andnl	%edx,%ebx,%r12d
+	xorl	%esi,%r13d
+	rorxl	$6,%ebx,%r14d
+	leal	(%r8,%r12,1),%r8d
+	xorl	%r14d,%r13d
+	movl	%r9d,%esi
+	rorxl	$22,%r9d,%r12d
+	leal	(%r8,%r13,1),%r8d
+	xorl	%r10d,%esi
+	rorxl	$13,%r9d,%r14d
+	rorxl	$2,%r9d,%r13d
+	leal	(%rax,%r8,1),%eax
+	andl	%esi,%r15d
+	vaesenclast	%xmm10,%xmm9,%xmm11
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	176-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%r10d,%r15d
+	xorl	%r13d,%r14d
+	leal	(%r8,%r15,1),%r8d
+	movl	%ebx,%r12d
+	addl	32(%rsp),%edx
+	andl	%eax,%r12d
+	rorxl	$25,%eax,%r13d
+	rorxl	$11,%eax,%r15d
+	leal	(%r8,%r14,1),%r8d
+	leal	(%rdx,%r12,1),%edx
+	andnl	%ecx,%eax,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%eax,%r14d
+	leal	(%rdx,%r12,1),%edx
+	xorl	%r14d,%r13d
+	movl	%r8d,%r15d
+	rorxl	$22,%r8d,%r12d
+	leal	(%rdx,%r13,1),%edx
+	xorl	%r9d,%r15d
+	rorxl	$13,%r8d,%r14d
+	rorxl	$2,%r8d,%r13d
+	leal	(%r11,%rdx,1),%r11d
+	andl	%r15d,%esi
+	vpand	%xmm12,%xmm11,%xmm8
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	192-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%r9d,%esi
+	xorl	%r13d,%r14d
+	leal	(%rdx,%rsi,1),%edx
+	movl	%eax,%r12d
+	addl	36(%rsp),%ecx
+	andl	%r11d,%r12d
+	rorxl	$25,%r11d,%r13d
+	rorxl	$11,%r11d,%esi
+	leal	(%rdx,%r14,1),%edx
+	leal	(%rcx,%r12,1),%ecx
+	andnl	%ebx,%r11d,%r12d
+	xorl	%esi,%r13d
+	rorxl	$6,%r11d,%r14d
+	leal	(%rcx,%r12,1),%ecx
+	xorl	%r14d,%r13d
+	movl	%edx,%esi
+	rorxl	$22,%edx,%r12d
+	leal	(%rcx,%r13,1),%ecx
+	xorl	%r8d,%esi
+	rorxl	$13,%edx,%r14d
+	rorxl	$2,%edx,%r13d
+	leal	(%r10,%rcx,1),%r10d
+	andl	%esi,%r15d
+	vaesenclast	%xmm10,%xmm9,%xmm11
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	208-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%r8d,%r15d
+	xorl	%r13d,%r14d
+	leal	(%rcx,%r15,1),%ecx
+	movl	%r11d,%r12d
+	addl	40(%rsp),%ebx
+	andl	%r10d,%r12d
+	rorxl	$25,%r10d,%r13d
+	rorxl	$11,%r10d,%r15d
+	leal	(%rcx,%r14,1),%ecx
+	leal	(%rbx,%r12,1),%ebx
+	andnl	%eax,%r10d,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%r10d,%r14d
+	leal	(%rbx,%r12,1),%ebx
+	xorl	%r14d,%r13d
+	movl	%ecx,%r15d
+	rorxl	$22,%ecx,%r12d
+	leal	(%rbx,%r13,1),%ebx
+	xorl	%edx,%r15d
+	rorxl	$13,%ecx,%r14d
+	rorxl	$2,%ecx,%r13d
+	leal	(%r9,%rbx,1),%r9d
+	andl	%r15d,%esi
+	vpand	%xmm13,%xmm11,%xmm11
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	224-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%edx,%esi
+	xorl	%r13d,%r14d
+	leal	(%rbx,%rsi,1),%ebx
+	movl	%r10d,%r12d
+	addl	44(%rsp),%eax
+	andl	%r9d,%r12d
+	rorxl	$25,%r9d,%r13d
+	rorxl	$11,%r9d,%esi
+	leal	(%rbx,%r14,1),%ebx
+	leal	(%rax,%r12,1),%eax
+	andnl	%r11d,%r9d,%r12d
+	xorl	%esi,%r13d
+	rorxl	$6,%r9d,%r14d
+	leal	(%rax,%r12,1),%eax
+	xorl	%r14d,%r13d
+	movl	%ebx,%esi
+	rorxl	$22,%ebx,%r12d
+	leal	(%rax,%r13,1),%eax
+	xorl	%ecx,%esi
+	rorxl	$13,%ebx,%r14d
+	rorxl	$2,%ebx,%r13d
+	leal	(%r8,%rax,1),%r8d
+	andl	%esi,%r15d
+	vpor	%xmm11,%xmm8,%xmm8
+	vaesenclast	%xmm10,%xmm9,%xmm11
+	vmovdqu	0-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%ecx,%r15d
+	xorl	%r13d,%r14d
+	leal	(%rax,%r15,1),%eax
+	movl	%r9d,%r12d
+	vpextrq	$1,%xmm15,%r12
+	vmovq	%xmm15,%r13
+	movq	552(%rsp),%r15
+	addl	%r14d,%eax
+	leaq	448(%rsp),%rbp
+
+	vpand	%xmm14,%xmm11,%xmm11
+	vpor	%xmm11,%xmm8,%xmm8
+	vmovdqu	%xmm8,(%r12,%r13,1)
+	leaq	16(%r13),%r13
+
+	addl	0(%r15),%eax
+	addl	4(%r15),%ebx
+	addl	8(%r15),%ecx
+	addl	12(%r15),%edx
+	addl	16(%r15),%r8d
+	addl	20(%r15),%r9d
+	addl	24(%r15),%r10d
+	addl	28(%r15),%r11d
+
+	movl	%eax,0(%r15)
+	movl	%ebx,4(%r15)
+	movl	%ecx,8(%r15)
+	movl	%edx,12(%r15)
+	movl	%r8d,16(%r15)
+	movl	%r9d,20(%r15)
+	movl	%r10d,24(%r15)
+	movl	%r11d,28(%r15)
+
+	cmpq	80(%rbp),%r13
+	je	.Ldone_avx2
+
+	xorl	%r14d,%r14d
+	movl	%ebx,%esi
+	movl	%r9d,%r12d
+	xorl	%ecx,%esi
+	jmp	.Lower_avx2
+.align	16
+.Lower_avx2:
+	vmovdqu	(%r13),%xmm9
+	vpinsrq	$0,%r13,%xmm15,%xmm15
+	addl	0+16(%rbp),%r11d
+	andl	%r8d,%r12d
+	rorxl	$25,%r8d,%r13d
+	rorxl	$11,%r8d,%r15d
+	leal	(%rax,%r14,1),%eax
+	leal	(%r11,%r12,1),%r11d
+	andnl	%r10d,%r8d,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%r8d,%r14d
+	leal	(%r11,%r12,1),%r11d
+	xorl	%r14d,%r13d
+	movl	%eax,%r15d
+	rorxl	$22,%eax,%r12d
+	leal	(%r11,%r13,1),%r11d
+	xorl	%ebx,%r15d
+	rorxl	$13,%eax,%r14d
+	rorxl	$2,%eax,%r13d
+	leal	(%rdx,%r11,1),%edx
+	andl	%r15d,%esi
+	vpxor	%xmm10,%xmm9,%xmm9
+	vmovdqu	16-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%ebx,%esi
+	xorl	%r13d,%r14d
+	leal	(%r11,%rsi,1),%r11d
+	movl	%r8d,%r12d
+	addl	4+16(%rbp),%r10d
+	andl	%edx,%r12d
+	rorxl	$25,%edx,%r13d
+	rorxl	$11,%edx,%esi
+	leal	(%r11,%r14,1),%r11d
+	leal	(%r10,%r12,1),%r10d
+	andnl	%r9d,%edx,%r12d
+	xorl	%esi,%r13d
+	rorxl	$6,%edx,%r14d
+	leal	(%r10,%r12,1),%r10d
+	xorl	%r14d,%r13d
+	movl	%r11d,%esi
+	rorxl	$22,%r11d,%r12d
+	leal	(%r10,%r13,1),%r10d
+	xorl	%eax,%esi
+	rorxl	$13,%r11d,%r14d
+	rorxl	$2,%r11d,%r13d
+	leal	(%rcx,%r10,1),%ecx
+	andl	%esi,%r15d
+	vpxor	%xmm8,%xmm9,%xmm9
+	xorl	%r12d,%r14d
+	xorl	%eax,%r15d
+	xorl	%r13d,%r14d
+	leal	(%r10,%r15,1),%r10d
+	movl	%edx,%r12d
+	addl	8+16(%rbp),%r9d
+	andl	%ecx,%r12d
+	rorxl	$25,%ecx,%r13d
+	rorxl	$11,%ecx,%r15d
+	leal	(%r10,%r14,1),%r10d
+	leal	(%r9,%r12,1),%r9d
+	andnl	%r8d,%ecx,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%ecx,%r14d
+	leal	(%r9,%r12,1),%r9d
+	xorl	%r14d,%r13d
+	movl	%r10d,%r15d
+	rorxl	$22,%r10d,%r12d
+	leal	(%r9,%r13,1),%r9d
+	xorl	%r11d,%r15d
+	rorxl	$13,%r10d,%r14d
+	rorxl	$2,%r10d,%r13d
+	leal	(%rbx,%r9,1),%ebx
+	andl	%r15d,%esi
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	32-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%r11d,%esi
+	xorl	%r13d,%r14d
+	leal	(%r9,%rsi,1),%r9d
+	movl	%ecx,%r12d
+	addl	12+16(%rbp),%r8d
+	andl	%ebx,%r12d
+	rorxl	$25,%ebx,%r13d
+	rorxl	$11,%ebx,%esi
+	leal	(%r9,%r14,1),%r9d
+	leal	(%r8,%r12,1),%r8d
+	andnl	%edx,%ebx,%r12d
+	xorl	%esi,%r13d
+	rorxl	$6,%ebx,%r14d
+	leal	(%r8,%r12,1),%r8d
+	xorl	%r14d,%r13d
+	movl	%r9d,%esi
+	rorxl	$22,%r9d,%r12d
+	leal	(%r8,%r13,1),%r8d
+	xorl	%r10d,%esi
+	rorxl	$13,%r9d,%r14d
+	rorxl	$2,%r9d,%r13d
+	leal	(%rax,%r8,1),%eax
+	andl	%esi,%r15d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	48-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%r10d,%r15d
+	xorl	%r13d,%r14d
+	leal	(%r8,%r15,1),%r8d
+	movl	%ebx,%r12d
+	addl	32+16(%rbp),%edx
+	andl	%eax,%r12d
+	rorxl	$25,%eax,%r13d
+	rorxl	$11,%eax,%r15d
+	leal	(%r8,%r14,1),%r8d
+	leal	(%rdx,%r12,1),%edx
+	andnl	%ecx,%eax,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%eax,%r14d
+	leal	(%rdx,%r12,1),%edx
+	xorl	%r14d,%r13d
+	movl	%r8d,%r15d
+	rorxl	$22,%r8d,%r12d
+	leal	(%rdx,%r13,1),%edx
+	xorl	%r9d,%r15d
+	rorxl	$13,%r8d,%r14d
+	rorxl	$2,%r8d,%r13d
+	leal	(%r11,%rdx,1),%r11d
+	andl	%r15d,%esi
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	64-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%r9d,%esi
+	xorl	%r13d,%r14d
+	leal	(%rdx,%rsi,1),%edx
+	movl	%eax,%r12d
+	addl	36+16(%rbp),%ecx
+	andl	%r11d,%r12d
+	rorxl	$25,%r11d,%r13d
+	rorxl	$11,%r11d,%esi
+	leal	(%rdx,%r14,1),%edx
+	leal	(%rcx,%r12,1),%ecx
+	andnl	%ebx,%r11d,%r12d
+	xorl	%esi,%r13d
+	rorxl	$6,%r11d,%r14d
+	leal	(%rcx,%r12,1),%ecx
+	xorl	%r14d,%r13d
+	movl	%edx,%esi
+	rorxl	$22,%edx,%r12d
+	leal	(%rcx,%r13,1),%ecx
+	xorl	%r8d,%esi
+	rorxl	$13,%edx,%r14d
+	rorxl	$2,%edx,%r13d
+	leal	(%r10,%rcx,1),%r10d
+	andl	%esi,%r15d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	80-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%r8d,%r15d
+	xorl	%r13d,%r14d
+	leal	(%rcx,%r15,1),%ecx
+	movl	%r11d,%r12d
+	addl	40+16(%rbp),%ebx
+	andl	%r10d,%r12d
+	rorxl	$25,%r10d,%r13d
+	rorxl	$11,%r10d,%r15d
+	leal	(%rcx,%r14,1),%ecx
+	leal	(%rbx,%r12,1),%ebx
+	andnl	%eax,%r10d,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%r10d,%r14d
+	leal	(%rbx,%r12,1),%ebx
+	xorl	%r14d,%r13d
+	movl	%ecx,%r15d
+	rorxl	$22,%ecx,%r12d
+	leal	(%rbx,%r13,1),%ebx
+	xorl	%edx,%r15d
+	rorxl	$13,%ecx,%r14d
+	rorxl	$2,%ecx,%r13d
+	leal	(%r9,%rbx,1),%r9d
+	andl	%r15d,%esi
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	96-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%edx,%esi
+	xorl	%r13d,%r14d
+	leal	(%rbx,%rsi,1),%ebx
+	movl	%r10d,%r12d
+	addl	44+16(%rbp),%eax
+	andl	%r9d,%r12d
+	rorxl	$25,%r9d,%r13d
+	rorxl	$11,%r9d,%esi
+	leal	(%rbx,%r14,1),%ebx
+	leal	(%rax,%r12,1),%eax
+	andnl	%r11d,%r9d,%r12d
+	xorl	%esi,%r13d
+	rorxl	$6,%r9d,%r14d
+	leal	(%rax,%r12,1),%eax
+	xorl	%r14d,%r13d
+	movl	%ebx,%esi
+	rorxl	$22,%ebx,%r12d
+	leal	(%rax,%r13,1),%eax
+	xorl	%ecx,%esi
+	rorxl	$13,%ebx,%r14d
+	rorxl	$2,%ebx,%r13d
+	leal	(%r8,%rax,1),%r8d
+	andl	%esi,%r15d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	112-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%ecx,%r15d
+	xorl	%r13d,%r14d
+	leal	(%rax,%r15,1),%eax
+	movl	%r9d,%r12d
+	leaq	-64(%rbp),%rbp
+	addl	0+16(%rbp),%r11d
+	andl	%r8d,%r12d
+	rorxl	$25,%r8d,%r13d
+	rorxl	$11,%r8d,%r15d
+	leal	(%rax,%r14,1),%eax
+	leal	(%r11,%r12,1),%r11d
+	andnl	%r10d,%r8d,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%r8d,%r14d
+	leal	(%r11,%r12,1),%r11d
+	xorl	%r14d,%r13d
+	movl	%eax,%r15d
+	rorxl	$22,%eax,%r12d
+	leal	(%r11,%r13,1),%r11d
+	xorl	%ebx,%r15d
+	rorxl	$13,%eax,%r14d
+	rorxl	$2,%eax,%r13d
+	leal	(%rdx,%r11,1),%edx
+	andl	%r15d,%esi
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	128-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%ebx,%esi
+	xorl	%r13d,%r14d
+	leal	(%r11,%rsi,1),%r11d
+	movl	%r8d,%r12d
+	addl	4+16(%rbp),%r10d
+	andl	%edx,%r12d
+	rorxl	$25,%edx,%r13d
+	rorxl	$11,%edx,%esi
+	leal	(%r11,%r14,1),%r11d
+	leal	(%r10,%r12,1),%r10d
+	andnl	%r9d,%edx,%r12d
+	xorl	%esi,%r13d
+	rorxl	$6,%edx,%r14d
+	leal	(%r10,%r12,1),%r10d
+	xorl	%r14d,%r13d
+	movl	%r11d,%esi
+	rorxl	$22,%r11d,%r12d
+	leal	(%r10,%r13,1),%r10d
+	xorl	%eax,%esi
+	rorxl	$13,%r11d,%r14d
+	rorxl	$2,%r11d,%r13d
+	leal	(%rcx,%r10,1),%ecx
+	andl	%esi,%r15d
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	144-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%eax,%r15d
+	xorl	%r13d,%r14d
+	leal	(%r10,%r15,1),%r10d
+	movl	%edx,%r12d
+	addl	8+16(%rbp),%r9d
+	andl	%ecx,%r12d
+	rorxl	$25,%ecx,%r13d
+	rorxl	$11,%ecx,%r15d
+	leal	(%r10,%r14,1),%r10d
+	leal	(%r9,%r12,1),%r9d
+	andnl	%r8d,%ecx,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%ecx,%r14d
+	leal	(%r9,%r12,1),%r9d
+	xorl	%r14d,%r13d
+	movl	%r10d,%r15d
+	rorxl	$22,%r10d,%r12d
+	leal	(%r9,%r13,1),%r9d
+	xorl	%r11d,%r15d
+	rorxl	$13,%r10d,%r14d
+	rorxl	$2,%r10d,%r13d
+	leal	(%rbx,%r9,1),%ebx
+	andl	%r15d,%esi
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	160-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%r11d,%esi
+	xorl	%r13d,%r14d
+	leal	(%r9,%rsi,1),%r9d
+	movl	%ecx,%r12d
+	addl	12+16(%rbp),%r8d
+	andl	%ebx,%r12d
+	rorxl	$25,%ebx,%r13d
+	rorxl	$11,%ebx,%esi
+	leal	(%r9,%r14,1),%r9d
+	leal	(%r8,%r12,1),%r8d
+	andnl	%edx,%ebx,%r12d
+	xorl	%esi,%r13d
+	rorxl	$6,%ebx,%r14d
+	leal	(%r8,%r12,1),%r8d
+	xorl	%r14d,%r13d
+	movl	%r9d,%esi
+	rorxl	$22,%r9d,%r12d
+	leal	(%r8,%r13,1),%r8d
+	xorl	%r10d,%esi
+	rorxl	$13,%r9d,%r14d
+	rorxl	$2,%r9d,%r13d
+	leal	(%rax,%r8,1),%eax
+	andl	%esi,%r15d
+	vaesenclast	%xmm10,%xmm9,%xmm11
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	176-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%r10d,%r15d
+	xorl	%r13d,%r14d
+	leal	(%r8,%r15,1),%r8d
+	movl	%ebx,%r12d
+	addl	32+16(%rbp),%edx
+	andl	%eax,%r12d
+	rorxl	$25,%eax,%r13d
+	rorxl	$11,%eax,%r15d
+	leal	(%r8,%r14,1),%r8d
+	leal	(%rdx,%r12,1),%edx
+	andnl	%ecx,%eax,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%eax,%r14d
+	leal	(%rdx,%r12,1),%edx
+	xorl	%r14d,%r13d
+	movl	%r8d,%r15d
+	rorxl	$22,%r8d,%r12d
+	leal	(%rdx,%r13,1),%edx
+	xorl	%r9d,%r15d
+	rorxl	$13,%r8d,%r14d
+	rorxl	$2,%r8d,%r13d
+	leal	(%r11,%rdx,1),%r11d
+	andl	%r15d,%esi
+	vpand	%xmm12,%xmm11,%xmm8
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	192-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%r9d,%esi
+	xorl	%r13d,%r14d
+	leal	(%rdx,%rsi,1),%edx
+	movl	%eax,%r12d
+	addl	36+16(%rbp),%ecx
+	andl	%r11d,%r12d
+	rorxl	$25,%r11d,%r13d
+	rorxl	$11,%r11d,%esi
+	leal	(%rdx,%r14,1),%edx
+	leal	(%rcx,%r12,1),%ecx
+	andnl	%ebx,%r11d,%r12d
+	xorl	%esi,%r13d
+	rorxl	$6,%r11d,%r14d
+	leal	(%rcx,%r12,1),%ecx
+	xorl	%r14d,%r13d
+	movl	%edx,%esi
+	rorxl	$22,%edx,%r12d
+	leal	(%rcx,%r13,1),%ecx
+	xorl	%r8d,%esi
+	rorxl	$13,%edx,%r14d
+	rorxl	$2,%edx,%r13d
+	leal	(%r10,%rcx,1),%r10d
+	andl	%esi,%r15d
+	vaesenclast	%xmm10,%xmm9,%xmm11
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	208-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%r8d,%r15d
+	xorl	%r13d,%r14d
+	leal	(%rcx,%r15,1),%ecx
+	movl	%r11d,%r12d
+	addl	40+16(%rbp),%ebx
+	andl	%r10d,%r12d
+	rorxl	$25,%r10d,%r13d
+	rorxl	$11,%r10d,%r15d
+	leal	(%rcx,%r14,1),%ecx
+	leal	(%rbx,%r12,1),%ebx
+	andnl	%eax,%r10d,%r12d
+	xorl	%r15d,%r13d
+	rorxl	$6,%r10d,%r14d
+	leal	(%rbx,%r12,1),%ebx
+	xorl	%r14d,%r13d
+	movl	%ecx,%r15d
+	rorxl	$22,%ecx,%r12d
+	leal	(%rbx,%r13,1),%ebx
+	xorl	%edx,%r15d
+	rorxl	$13,%ecx,%r14d
+	rorxl	$2,%ecx,%r13d
+	leal	(%r9,%rbx,1),%r9d
+	andl	%r15d,%esi
+	vpand	%xmm13,%xmm11,%xmm11
+	vaesenc	%xmm10,%xmm9,%xmm9
+	vmovdqu	224-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%edx,%esi
+	xorl	%r13d,%r14d
+	leal	(%rbx,%rsi,1),%ebx
+	movl	%r10d,%r12d
+	addl	44+16(%rbp),%eax
+	andl	%r9d,%r12d
+	rorxl	$25,%r9d,%r13d
+	rorxl	$11,%r9d,%esi
+	leal	(%rbx,%r14,1),%ebx
+	leal	(%rax,%r12,1),%eax
+	andnl	%r11d,%r9d,%r12d
+	xorl	%esi,%r13d
+	rorxl	$6,%r9d,%r14d
+	leal	(%rax,%r12,1),%eax
+	xorl	%r14d,%r13d
+	movl	%ebx,%esi
+	rorxl	$22,%ebx,%r12d
+	leal	(%rax,%r13,1),%eax
+	xorl	%ecx,%esi
+	rorxl	$13,%ebx,%r14d
+	rorxl	$2,%ebx,%r13d
+	leal	(%r8,%rax,1),%r8d
+	andl	%esi,%r15d
+	vpor	%xmm11,%xmm8,%xmm8
+	vaesenclast	%xmm10,%xmm9,%xmm11
+	vmovdqu	0-128(%rdi),%xmm10
+	xorl	%r12d,%r14d
+	xorl	%ecx,%r15d
+	xorl	%r13d,%r14d
+	leal	(%rax,%r15,1),%eax
+	movl	%r9d,%r12d
+	vmovq	%xmm15,%r13
+	vpextrq	$1,%xmm15,%r15
+	vpand	%xmm14,%xmm11,%xmm11
+	vpor	%xmm11,%xmm8,%xmm8
+	leaq	-64(%rbp),%rbp
+	vmovdqu	%xmm8,(%r15,%r13,1)
+	leaq	16(%r13),%r13
+	cmpq	%rsp,%rbp
+	jae	.Lower_avx2
+
+	movq	552(%rsp),%r15
+	leaq	64(%r13),%r13
+	movq	560(%rsp),%rsi
+	addl	%r14d,%eax
+	leaq	448(%rsp),%rsp
+
+	addl	0(%r15),%eax
+	addl	4(%r15),%ebx
+	addl	8(%r15),%ecx
+	addl	12(%r15),%edx
+	addl	16(%r15),%r8d
+	addl	20(%r15),%r9d
+	addl	24(%r15),%r10d
+	leaq	(%rsi,%r13,1),%r12
+	addl	28(%r15),%r11d
+
+	cmpq	64+16(%rsp),%r13
+
+	movl	%eax,0(%r15)
+	cmoveq	%rsp,%r12
+	movl	%ebx,4(%r15)
+	movl	%ecx,8(%r15)
+	movl	%edx,12(%r15)
+	movl	%r8d,16(%r15)
+	movl	%r9d,20(%r15)
+	movl	%r10d,24(%r15)
+	movl	%r11d,28(%r15)
+
+	jbe	.Loop_avx2
+	leaq	(%rsp),%rbp
+
+.Ldone_avx2:
+	leaq	(%rbp),%rsp
+	movq	64+32(%rsp),%r8
+	movq	64+56(%rsp),%rsi
+	vmovdqu	%xmm8,(%r8)
+	vzeroall
+	movq	(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lepilogue_avx2:
+	.byte	0xf3,0xc3
+.size	aesni_cbc_sha256_enc_avx2,.-aesni_cbc_sha256_enc_avx2
+.type	aesni_cbc_sha256_enc_shaext,@function
+.align	32
+aesni_cbc_sha256_enc_shaext:
+	movq	8(%rsp),%r10
+	leaq	K256+128(%rip),%rax
+	movdqu	(%r9),%xmm1
+	movdqu	16(%r9),%xmm2
+	movdqa	512-128(%rax),%xmm3
+
+	movl	240(%rcx),%r11d
+	subq	%rdi,%rsi
+	movups	(%rcx),%xmm15
+	movups	(%r8),%xmm6
+	movups	16(%rcx),%xmm4
+	leaq	112(%rcx),%rcx
+
+	pshufd	$0x1b,%xmm1,%xmm0
+	pshufd	$0xb1,%xmm1,%xmm1
+	pshufd	$0x1b,%xmm2,%xmm2
+	movdqa	%xmm3,%xmm7
+.byte	102,15,58,15,202,8
+	punpcklqdq	%xmm0,%xmm2
+
+	jmp	.Loop_shaext
+
+.align	16
+.Loop_shaext:
+	movdqu	(%r10),%xmm10
+	movdqu	16(%r10),%xmm11
+	movdqu	32(%r10),%xmm12
+.byte	102,68,15,56,0,211
+	movdqu	48(%r10),%xmm13
+
+	movdqa	0-128(%rax),%xmm0
+	paddd	%xmm10,%xmm0
+.byte	102,68,15,56,0,219
+	movdqa	%xmm2,%xmm9
+	movdqa	%xmm1,%xmm8
+	movups	0(%rdi),%xmm14
+	xorps	%xmm15,%xmm14
+	xorps	%xmm14,%xmm6
+	movups	-80(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movups	-64(%rcx),%xmm4
+	aesenc	%xmm5,%xmm6
+.byte	15,56,203,202
+
+	movdqa	32-128(%rax),%xmm0
+	paddd	%xmm11,%xmm0
+.byte	102,68,15,56,0,227
+	leaq	64(%r10),%r10
+	movups	-48(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movups	-32(%rcx),%xmm4
+	aesenc	%xmm5,%xmm6
+.byte	15,56,203,202
+
+	movdqa	64-128(%rax),%xmm0
+	paddd	%xmm12,%xmm0
+.byte	102,68,15,56,0,235
+.byte	69,15,56,204,211
+	movups	-16(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm13,%xmm3
+.byte	102,65,15,58,15,220,4
+	paddd	%xmm3,%xmm10
+	movups	0(%rcx),%xmm4
+	aesenc	%xmm5,%xmm6
+.byte	15,56,203,202
+
+	movdqa	96-128(%rax),%xmm0
+	paddd	%xmm13,%xmm0
+.byte	69,15,56,205,213
+.byte	69,15,56,204,220
+	movups	16(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movups	32(%rcx),%xmm4
+	aesenc	%xmm5,%xmm6
+	movdqa	%xmm10,%xmm3
+.byte	102,65,15,58,15,221,4
+	paddd	%xmm3,%xmm11
+.byte	15,56,203,202
+	movdqa	128-128(%rax),%xmm0
+	paddd	%xmm10,%xmm0
+.byte	69,15,56,205,218
+.byte	69,15,56,204,229
+	movups	48(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm11,%xmm3
+.byte	102,65,15,58,15,218,4
+	paddd	%xmm3,%xmm12
+	cmpl	$11,%r11d
+	jb	.Laesenclast1
+	movups	64(%rcx),%xmm4
+	aesenc	%xmm5,%xmm6
+	movups	80(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+	je	.Laesenclast1
+	movups	96(%rcx),%xmm4
+	aesenc	%xmm5,%xmm6
+	movups	112(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+.Laesenclast1:
+	aesenclast	%xmm5,%xmm6
+	movups	16-112(%rcx),%xmm4
+	nop
+.byte	15,56,203,202
+	movups	16(%rdi),%xmm14
+	xorps	%xmm15,%xmm14
+	movups	%xmm6,0(%rsi,%rdi,1)
+	xorps	%xmm14,%xmm6
+	movups	-80(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+	movdqa	160-128(%rax),%xmm0
+	paddd	%xmm11,%xmm0
+.byte	69,15,56,205,227
+.byte	69,15,56,204,234
+	movups	-64(%rcx),%xmm4
+	aesenc	%xmm5,%xmm6
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm12,%xmm3
+.byte	102,65,15,58,15,219,4
+	paddd	%xmm3,%xmm13
+	movups	-48(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+.byte	15,56,203,202
+	movdqa	192-128(%rax),%xmm0
+	paddd	%xmm12,%xmm0
+.byte	69,15,56,205,236
+.byte	69,15,56,204,211
+	movups	-32(%rcx),%xmm4
+	aesenc	%xmm5,%xmm6
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm13,%xmm3
+.byte	102,65,15,58,15,220,4
+	paddd	%xmm3,%xmm10
+	movups	-16(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+.byte	15,56,203,202
+	movdqa	224-128(%rax),%xmm0
+	paddd	%xmm13,%xmm0
+.byte	69,15,56,205,213
+.byte	69,15,56,204,220
+	movups	0(%rcx),%xmm4
+	aesenc	%xmm5,%xmm6
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm10,%xmm3
+.byte	102,65,15,58,15,221,4
+	paddd	%xmm3,%xmm11
+	movups	16(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+.byte	15,56,203,202
+	movdqa	256-128(%rax),%xmm0
+	paddd	%xmm10,%xmm0
+.byte	69,15,56,205,218
+.byte	69,15,56,204,229
+	movups	32(%rcx),%xmm4
+	aesenc	%xmm5,%xmm6
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm11,%xmm3
+.byte	102,65,15,58,15,218,4
+	paddd	%xmm3,%xmm12
+	movups	48(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+	cmpl	$11,%r11d
+	jb	.Laesenclast2
+	movups	64(%rcx),%xmm4
+	aesenc	%xmm5,%xmm6
+	movups	80(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+	je	.Laesenclast2
+	movups	96(%rcx),%xmm4
+	aesenc	%xmm5,%xmm6
+	movups	112(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+.Laesenclast2:
+	aesenclast	%xmm5,%xmm6
+	movups	16-112(%rcx),%xmm4
+	nop
+.byte	15,56,203,202
+	movups	32(%rdi),%xmm14
+	xorps	%xmm15,%xmm14
+	movups	%xmm6,16(%rsi,%rdi,1)
+	xorps	%xmm14,%xmm6
+	movups	-80(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+	movdqa	288-128(%rax),%xmm0
+	paddd	%xmm11,%xmm0
+.byte	69,15,56,205,227
+.byte	69,15,56,204,234
+	movups	-64(%rcx),%xmm4
+	aesenc	%xmm5,%xmm6
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm12,%xmm3
+.byte	102,65,15,58,15,219,4
+	paddd	%xmm3,%xmm13
+	movups	-48(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+.byte	15,56,203,202
+	movdqa	320-128(%rax),%xmm0
+	paddd	%xmm12,%xmm0
+.byte	69,15,56,205,236
+.byte	69,15,56,204,211
+	movups	-32(%rcx),%xmm4
+	aesenc	%xmm5,%xmm6
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm13,%xmm3
+.byte	102,65,15,58,15,220,4
+	paddd	%xmm3,%xmm10
+	movups	-16(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+.byte	15,56,203,202
+	movdqa	352-128(%rax),%xmm0
+	paddd	%xmm13,%xmm0
+.byte	69,15,56,205,213
+.byte	69,15,56,204,220
+	movups	0(%rcx),%xmm4
+	aesenc	%xmm5,%xmm6
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm10,%xmm3
+.byte	102,65,15,58,15,221,4
+	paddd	%xmm3,%xmm11
+	movups	16(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+.byte	15,56,203,202
+	movdqa	384-128(%rax),%xmm0
+	paddd	%xmm10,%xmm0
+.byte	69,15,56,205,218
+.byte	69,15,56,204,229
+	movups	32(%rcx),%xmm4
+	aesenc	%xmm5,%xmm6
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm11,%xmm3
+.byte	102,65,15,58,15,218,4
+	paddd	%xmm3,%xmm12
+	movups	48(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+.byte	15,56,203,202
+	movdqa	416-128(%rax),%xmm0
+	paddd	%xmm11,%xmm0
+.byte	69,15,56,205,227
+.byte	69,15,56,204,234
+	cmpl	$11,%r11d
+	jb	.Laesenclast3
+	movups	64(%rcx),%xmm4
+	aesenc	%xmm5,%xmm6
+	movups	80(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+	je	.Laesenclast3
+	movups	96(%rcx),%xmm4
+	aesenc	%xmm5,%xmm6
+	movups	112(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+.Laesenclast3:
+	aesenclast	%xmm5,%xmm6
+	movups	16-112(%rcx),%xmm4
+	nop
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm12,%xmm3
+.byte	102,65,15,58,15,219,4
+	paddd	%xmm3,%xmm13
+	movups	48(%rdi),%xmm14
+	xorps	%xmm15,%xmm14
+	movups	%xmm6,32(%rsi,%rdi,1)
+	xorps	%xmm14,%xmm6
+	movups	-80(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+	movups	-64(%rcx),%xmm4
+	aesenc	%xmm5,%xmm6
+.byte	15,56,203,202
+
+	movdqa	448-128(%rax),%xmm0
+	paddd	%xmm12,%xmm0
+.byte	69,15,56,205,236
+	movdqa	%xmm7,%xmm3
+	movups	-48(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movups	-32(%rcx),%xmm4
+	aesenc	%xmm5,%xmm6
+.byte	15,56,203,202
+
+	movdqa	480-128(%rax),%xmm0
+	paddd	%xmm13,%xmm0
+	movups	-16(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+	movups	0(%rcx),%xmm4
+	aesenc	%xmm5,%xmm6
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movups	16(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+.byte	15,56,203,202
+
+	movups	32(%rcx),%xmm4
+	aesenc	%xmm5,%xmm6
+	movups	48(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+	cmpl	$11,%r11d
+	jb	.Laesenclast4
+	movups	64(%rcx),%xmm4
+	aesenc	%xmm5,%xmm6
+	movups	80(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+	je	.Laesenclast4
+	movups	96(%rcx),%xmm4
+	aesenc	%xmm5,%xmm6
+	movups	112(%rcx),%xmm5
+	aesenc	%xmm4,%xmm6
+.Laesenclast4:
+	aesenclast	%xmm5,%xmm6
+	movups	16-112(%rcx),%xmm4
+	nop
+
+	paddd	%xmm9,%xmm2
+	paddd	%xmm8,%xmm1
+
+	decq	%rdx
+	movups	%xmm6,48(%rsi,%rdi,1)
+	leaq	64(%rdi),%rdi
+	jnz	.Loop_shaext
+
+	pshufd	$0xb1,%xmm2,%xmm2
+	pshufd	$0x1b,%xmm1,%xmm3
+	pshufd	$0xb1,%xmm1,%xmm1
+	punpckhqdq	%xmm2,%xmm1
+.byte	102,15,58,15,211,8
+
+	movups	%xmm6,(%r8)
+	movdqu	%xmm1,(%r9)
+	movdqu	%xmm2,16(%r9)
+	.byte	0xf3,0xc3
+.size	aesni_cbc_sha256_enc_shaext,.-aesni_cbc_sha256_enc_shaext


Property changes on: trunk/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S	                        (rev 0)
+++ trunk/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,3520 @@
+/* $MidnightBSD$ */
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S 325337 2017-11-02 18:30:41Z jkim $ */
+/* Do not modify. This file is auto-generated from ecp_nistz256-x86_64.pl. */
+.text	
+
+
+
+.align	64
+.Lpoly:
+.quad	0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
+
+
+.LRR:
+.quad	0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd
+
+.LOne:
+.long	1,1,1,1,1,1,1,1
+.LTwo:
+.long	2,2,2,2,2,2,2,2
+.LThree:
+.long	3,3,3,3,3,3,3,3
+.LONE_mont:
+.quad	0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
+
+.globl	ecp_nistz256_mul_by_2
+.type	ecp_nistz256_mul_by_2,@function
+.align	64
+ecp_nistz256_mul_by_2:
+	pushq	%r12
+	pushq	%r13
+
+	movq	0(%rsi),%r8
+	xorq	%r13,%r13
+	movq	8(%rsi),%r9
+	addq	%r8,%r8
+	movq	16(%rsi),%r10
+	adcq	%r9,%r9
+	movq	24(%rsi),%r11
+	leaq	.Lpoly(%rip),%rsi
+	movq	%r8,%rax
+	adcq	%r10,%r10
+	adcq	%r11,%r11
+	movq	%r9,%rdx
+	adcq	$0,%r13
+
+	subq	0(%rsi),%r8
+	movq	%r10,%rcx
+	sbbq	8(%rsi),%r9
+	sbbq	16(%rsi),%r10
+	movq	%r11,%r12
+	sbbq	24(%rsi),%r11
+	sbbq	$0,%r13
+
+	cmovcq	%rax,%r8
+	cmovcq	%rdx,%r9
+	movq	%r8,0(%rdi)
+	cmovcq	%rcx,%r10
+	movq	%r9,8(%rdi)
+	cmovcq	%r12,%r11
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+
+	popq	%r13
+	popq	%r12
+	.byte	0xf3,0xc3
+.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
+
+
+
+.globl	ecp_nistz256_div_by_2
+.type	ecp_nistz256_div_by_2,@function
+.align	32
+ecp_nistz256_div_by_2:
+	pushq	%r12
+	pushq	%r13
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	%r8,%rax
+	movq	24(%rsi),%r11
+	leaq	.Lpoly(%rip),%rsi
+
+	movq	%r9,%rdx
+	xorq	%r13,%r13
+	addq	0(%rsi),%r8
+	movq	%r10,%rcx
+	adcq	8(%rsi),%r9
+	adcq	16(%rsi),%r10
+	movq	%r11,%r12
+	adcq	24(%rsi),%r11
+	adcq	$0,%r13
+	xorq	%rsi,%rsi
+	testq	$1,%rax
+
+	cmovzq	%rax,%r8
+	cmovzq	%rdx,%r9
+	cmovzq	%rcx,%r10
+	cmovzq	%r12,%r11
+	cmovzq	%rsi,%r13
+
+	movq	%r9,%rax
+	shrq	$1,%r8
+	shlq	$63,%rax
+	movq	%r10,%rdx
+	shrq	$1,%r9
+	orq	%rax,%r8
+	shlq	$63,%rdx
+	movq	%r11,%rcx
+	shrq	$1,%r10
+	orq	%rdx,%r9
+	shlq	$63,%rcx
+	shrq	$1,%r11
+	shlq	$63,%r13
+	orq	%rcx,%r10
+	orq	%r13,%r11
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+
+	popq	%r13
+	popq	%r12
+	.byte	0xf3,0xc3
+.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
+
+
+
+.globl	ecp_nistz256_mul_by_3
+.type	ecp_nistz256_mul_by_3,@function
+.align	32
+ecp_nistz256_mul_by_3:
+	pushq	%r12
+	pushq	%r13
+
+	movq	0(%rsi),%r8
+	xorq	%r13,%r13
+	movq	8(%rsi),%r9
+	addq	%r8,%r8
+	movq	16(%rsi),%r10
+	adcq	%r9,%r9
+	movq	24(%rsi),%r11
+	movq	%r8,%rax
+	adcq	%r10,%r10
+	adcq	%r11,%r11
+	movq	%r9,%rdx
+	adcq	$0,%r13
+
+	subq	$-1,%r8
+	movq	%r10,%rcx
+	sbbq	.Lpoly+8(%rip),%r9
+	sbbq	$0,%r10
+	movq	%r11,%r12
+	sbbq	.Lpoly+24(%rip),%r11
+	sbbq	$0,%r13
+
+	cmovcq	%rax,%r8
+	cmovcq	%rdx,%r9
+	cmovcq	%rcx,%r10
+	cmovcq	%r12,%r11
+
+	xorq	%r13,%r13
+	addq	0(%rsi),%r8
+	adcq	8(%rsi),%r9
+	movq	%r8,%rax
+	adcq	16(%rsi),%r10
+	adcq	24(%rsi),%r11
+	movq	%r9,%rdx
+	adcq	$0,%r13
+
+	subq	$-1,%r8
+	movq	%r10,%rcx
+	sbbq	.Lpoly+8(%rip),%r9
+	sbbq	$0,%r10
+	movq	%r11,%r12
+	sbbq	.Lpoly+24(%rip),%r11
+	sbbq	$0,%r13
+
+	cmovcq	%rax,%r8
+	cmovcq	%rdx,%r9
+	movq	%r8,0(%rdi)
+	cmovcq	%rcx,%r10
+	movq	%r9,8(%rdi)
+	cmovcq	%r12,%r11
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+
+	popq	%r13
+	popq	%r12
+	.byte	0xf3,0xc3
+.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
+
+
+
+.globl	ecp_nistz256_add
+.type	ecp_nistz256_add,@function
+.align	32
+ecp_nistz256_add:
+	pushq	%r12
+	pushq	%r13
+
+	movq	0(%rsi),%r8
+	xorq	%r13,%r13
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	leaq	.Lpoly(%rip),%rsi
+
+	addq	0(%rdx),%r8
+	adcq	8(%rdx),%r9
+	movq	%r8,%rax
+	adcq	16(%rdx),%r10
+	adcq	24(%rdx),%r11
+	movq	%r9,%rdx
+	adcq	$0,%r13
+
+	subq	0(%rsi),%r8
+	movq	%r10,%rcx
+	sbbq	8(%rsi),%r9
+	sbbq	16(%rsi),%r10
+	movq	%r11,%r12
+	sbbq	24(%rsi),%r11
+	sbbq	$0,%r13
+
+	cmovcq	%rax,%r8
+	cmovcq	%rdx,%r9
+	movq	%r8,0(%rdi)
+	cmovcq	%rcx,%r10
+	movq	%r9,8(%rdi)
+	cmovcq	%r12,%r11
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+
+	popq	%r13
+	popq	%r12
+	.byte	0xf3,0xc3
+.size	ecp_nistz256_add,.-ecp_nistz256_add
+
+
+
+.globl	ecp_nistz256_sub
+.type	ecp_nistz256_sub,@function
+.align	32
+ecp_nistz256_sub:
+	pushq	%r12
+	pushq	%r13
+
+	movq	0(%rsi),%r8
+	xorq	%r13,%r13
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	leaq	.Lpoly(%rip),%rsi
+
+	subq	0(%rdx),%r8
+	sbbq	8(%rdx),%r9
+	movq	%r8,%rax
+	sbbq	16(%rdx),%r10
+	sbbq	24(%rdx),%r11
+	movq	%r9,%rdx
+	sbbq	$0,%r13
+
+	addq	0(%rsi),%r8
+	movq	%r10,%rcx
+	adcq	8(%rsi),%r9
+	adcq	16(%rsi),%r10
+	movq	%r11,%r12
+	adcq	24(%rsi),%r11
+	testq	%r13,%r13
+
+	cmovzq	%rax,%r8
+	cmovzq	%rdx,%r9
+	movq	%r8,0(%rdi)
+	cmovzq	%rcx,%r10
+	movq	%r9,8(%rdi)
+	cmovzq	%r12,%r11
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+
+	popq	%r13
+	popq	%r12
+	.byte	0xf3,0xc3
+.size	ecp_nistz256_sub,.-ecp_nistz256_sub
+
+
+
+.globl	ecp_nistz256_neg
+.type	ecp_nistz256_neg,@function
+.align	32
+ecp_nistz256_neg:
+	pushq	%r12
+	pushq	%r13
+
+	xorq	%r8,%r8
+	xorq	%r9,%r9
+	xorq	%r10,%r10
+	xorq	%r11,%r11
+	xorq	%r13,%r13
+
+	subq	0(%rsi),%r8
+	sbbq	8(%rsi),%r9
+	sbbq	16(%rsi),%r10
+	movq	%r8,%rax
+	sbbq	24(%rsi),%r11
+	leaq	.Lpoly(%rip),%rsi
+	movq	%r9,%rdx
+	sbbq	$0,%r13
+
+	addq	0(%rsi),%r8
+	movq	%r10,%rcx
+	adcq	8(%rsi),%r9
+	adcq	16(%rsi),%r10
+	movq	%r11,%r12
+	adcq	24(%rsi),%r11
+	testq	%r13,%r13
+
+	cmovzq	%rax,%r8
+	cmovzq	%rdx,%r9
+	movq	%r8,0(%rdi)
+	cmovzq	%rcx,%r10
+	movq	%r9,8(%rdi)
+	cmovzq	%r12,%r11
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+
+	popq	%r13
+	popq	%r12
+	.byte	0xf3,0xc3
+.size	ecp_nistz256_neg,.-ecp_nistz256_neg
+
+
+
+
+.globl	ecp_nistz256_to_mont
+.type	ecp_nistz256_to_mont,@function
+.align	32
+ecp_nistz256_to_mont:
+	movl	$0x80100,%ecx
+	andl	OPENSSL_ia32cap_P+8(%rip),%ecx
+	leaq	.LRR(%rip),%rdx
+	jmp	.Lmul_mont
+.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
+
+
+
+
+
+
+
+.globl	ecp_nistz256_mul_mont
+.type	ecp_nistz256_mul_mont,@function
+.align	32
+ecp_nistz256_mul_mont:
+	movl	$0x80100,%ecx
+	andl	OPENSSL_ia32cap_P+8(%rip),%ecx
+.Lmul_mont:
+	pushq	%rbp
+	pushq	%rbx
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	cmpl	$0x80100,%ecx
+	je	.Lmul_montx
+	movq	%rdx,%rbx
+	movq	0(%rdx),%rax
+	movq	0(%rsi),%r9
+	movq	8(%rsi),%r10
+	movq	16(%rsi),%r11
+	movq	24(%rsi),%r12
+
+	call	__ecp_nistz256_mul_montq
+	jmp	.Lmul_mont_done
+
+.align	32
+.Lmul_montx:
+	movq	%rdx,%rbx
+	movq	0(%rdx),%rdx
+	movq	0(%rsi),%r9
+	movq	8(%rsi),%r10
+	movq	16(%rsi),%r11
+	movq	24(%rsi),%r12
+	leaq	-128(%rsi),%rsi
+
+	call	__ecp_nistz256_mul_montx
+.Lmul_mont_done:
+	popq	%r15
+	popq	%r14
+	popq	%r13
+	popq	%r12
+	popq	%rbx
+	popq	%rbp
+	.byte	0xf3,0xc3
+.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
+
+.type	__ecp_nistz256_mul_montq,@function
+.align	32
+__ecp_nistz256_mul_montq:
+
+
+	movq	%rax,%rbp
+	mulq	%r9
+	movq	.Lpoly+8(%rip),%r14
+	movq	%rax,%r8
+	movq	%rbp,%rax
+	movq	%rdx,%r9
+
+	mulq	%r10
+	movq	.Lpoly+24(%rip),%r15
+	addq	%rax,%r9
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%r11
+	addq	%rax,%r10
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%r12
+	addq	%rax,%r11
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	xorq	%r13,%r13
+	movq	%rdx,%r12
+
+
+
+
+
+
+
+
+
+
+	movq	%r8,%rbp
+	shlq	$32,%r8
+	mulq	%r15
+	shrq	$32,%rbp
+	addq	%r8,%r9
+	adcq	%rbp,%r10
+	adcq	%rax,%r11
+	movq	8(%rbx),%rax
+	adcq	%rdx,%r12
+	adcq	$0,%r13
+	xorq	%r8,%r8
+
+
+
+	movq	%rax,%rbp
+	mulq	0(%rsi)
+	addq	%rax,%r9
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	8(%rsi)
+	addq	%rcx,%r10
+	adcq	$0,%rdx
+	addq	%rax,%r10
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	16(%rsi)
+	addq	%rcx,%r11
+	adcq	$0,%rdx
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	24(%rsi)
+	addq	%rcx,%r12
+	adcq	$0,%rdx
+	addq	%rax,%r12
+	movq	%r9,%rax
+	adcq	%rdx,%r13
+	adcq	$0,%r8
+
+
+
+	movq	%r9,%rbp
+	shlq	$32,%r9
+	mulq	%r15
+	shrq	$32,%rbp
+	addq	%r9,%r10
+	adcq	%rbp,%r11
+	adcq	%rax,%r12
+	movq	16(%rbx),%rax
+	adcq	%rdx,%r13
+	adcq	$0,%r8
+	xorq	%r9,%r9
+
+
+
+	movq	%rax,%rbp
+	mulq	0(%rsi)
+	addq	%rax,%r10
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	8(%rsi)
+	addq	%rcx,%r11
+	adcq	$0,%rdx
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	16(%rsi)
+	addq	%rcx,%r12
+	adcq	$0,%rdx
+	addq	%rax,%r12
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	24(%rsi)
+	addq	%rcx,%r13
+	adcq	$0,%rdx
+	addq	%rax,%r13
+	movq	%r10,%rax
+	adcq	%rdx,%r8
+	adcq	$0,%r9
+
+
+
+	movq	%r10,%rbp
+	shlq	$32,%r10
+	mulq	%r15
+	shrq	$32,%rbp
+	addq	%r10,%r11
+	adcq	%rbp,%r12
+	adcq	%rax,%r13
+	movq	24(%rbx),%rax
+	adcq	%rdx,%r8
+	adcq	$0,%r9
+	xorq	%r10,%r10
+
+
+
+	movq	%rax,%rbp
+	mulq	0(%rsi)
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	8(%rsi)
+	addq	%rcx,%r12
+	adcq	$0,%rdx
+	addq	%rax,%r12
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	16(%rsi)
+	addq	%rcx,%r13
+	adcq	$0,%rdx
+	addq	%rax,%r13
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	24(%rsi)
+	addq	%rcx,%r8
+	adcq	$0,%rdx
+	addq	%rax,%r8
+	movq	%r11,%rax
+	adcq	%rdx,%r9
+	adcq	$0,%r10
+
+
+
+	movq	%r11,%rbp
+	shlq	$32,%r11
+	mulq	%r15
+	shrq	$32,%rbp
+	addq	%r11,%r12
+	adcq	%rbp,%r13
+	movq	%r12,%rcx
+	adcq	%rax,%r8
+	adcq	%rdx,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r10
+
+
+
+	subq	$-1,%r12
+	movq	%r8,%rbx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%rdx
+	sbbq	%r15,%r9
+	sbbq	$0,%r10
+
+	cmovcq	%rcx,%r12
+	cmovcq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%rbx,%r8
+	movq	%r13,8(%rdi)
+	cmovcq	%rdx,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	.byte	0xf3,0xc3
+.size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
+
+
+
+
+
+
+
+
+.globl	ecp_nistz256_sqr_mont
+.type	ecp_nistz256_sqr_mont,@function
+.align	32
+ecp_nistz256_sqr_mont:
+	movl	$0x80100,%ecx
+	andl	OPENSSL_ia32cap_P+8(%rip),%ecx
+	pushq	%rbp
+	pushq	%rbx
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	cmpl	$0x80100,%ecx
+	je	.Lsqr_montx
+	movq	0(%rsi),%rax
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r15
+	movq	24(%rsi),%r8
+
+	call	__ecp_nistz256_sqr_montq
+	jmp	.Lsqr_mont_done
+
+.align	32
+.Lsqr_montx:
+	movq	0(%rsi),%rdx
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r15
+	movq	24(%rsi),%r8
+	leaq	-128(%rsi),%rsi
+
+	call	__ecp_nistz256_sqr_montx
+.Lsqr_mont_done:
+	popq	%r15
+	popq	%r14
+	popq	%r13
+	popq	%r12
+	popq	%rbx
+	popq	%rbp
+	.byte	0xf3,0xc3
+.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
+
+.type	__ecp_nistz256_sqr_montq,@function
+.align	32
+__ecp_nistz256_sqr_montq:
+	movq	%rax,%r13
+	mulq	%r14
+	movq	%rax,%r9
+	movq	%r15,%rax
+	movq	%rdx,%r10
+
+	mulq	%r13
+	addq	%rax,%r10
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%r13
+	addq	%rax,%r11
+	movq	%r15,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+
+
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	%r14
+	addq	%rax,%r12
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r12
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+
+	mulq	%r15
+	xorq	%r15,%r15
+	addq	%rax,%r13
+	movq	0(%rsi),%rax
+	movq	%rdx,%r14
+	adcq	$0,%r14
+
+	addq	%r9,%r9
+	adcq	%r10,%r10
+	adcq	%r11,%r11
+	adcq	%r12,%r12
+	adcq	%r13,%r13
+	adcq	%r14,%r14
+	adcq	$0,%r15
+
+	mulq	%rax
+	movq	%rax,%r8
+	movq	8(%rsi),%rax
+	movq	%rdx,%rcx
+
+	mulq	%rax
+	addq	%rcx,%r9
+	adcq	%rax,%r10
+	movq	16(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	%rax
+	addq	%rcx,%r11
+	adcq	%rax,%r12
+	movq	24(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	%rax
+	addq	%rcx,%r13
+	adcq	%rax,%r14
+	movq	%r8,%rax
+	adcq	%rdx,%r15
+
+	movq	.Lpoly+8(%rip),%rsi
+	movq	.Lpoly+24(%rip),%rbp
+
+
+
+
+	movq	%r8,%rcx
+	shlq	$32,%r8
+	mulq	%rbp
+	shrq	$32,%rcx
+	addq	%r8,%r9
+	adcq	%rcx,%r10
+	adcq	%rax,%r11
+	movq	%r9,%rax
+	adcq	$0,%rdx
+
+
+
+	movq	%r9,%rcx
+	shlq	$32,%r9
+	movq	%rdx,%r8
+	mulq	%rbp
+	shrq	$32,%rcx
+	addq	%r9,%r10
+	adcq	%rcx,%r11
+	adcq	%rax,%r8
+	movq	%r10,%rax
+	adcq	$0,%rdx
+
+
+
+	movq	%r10,%rcx
+	shlq	$32,%r10
+	movq	%rdx,%r9
+	mulq	%rbp
+	shrq	$32,%rcx
+	addq	%r10,%r11
+	adcq	%rcx,%r8
+	adcq	%rax,%r9
+	movq	%r11,%rax
+	adcq	$0,%rdx
+
+
+
+	movq	%r11,%rcx
+	shlq	$32,%r11
+	movq	%rdx,%r10
+	mulq	%rbp
+	shrq	$32,%rcx
+	addq	%r11,%r8
+	adcq	%rcx,%r9
+	adcq	%rax,%r10
+	adcq	$0,%rdx
+	xorq	%r11,%r11
+
+
+
+	addq	%r8,%r12
+	adcq	%r9,%r13
+	movq	%r12,%r8
+	adcq	%r10,%r14
+	adcq	%rdx,%r15
+	movq	%r13,%r9
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r14,%r10
+	sbbq	%rsi,%r13
+	sbbq	$0,%r14
+	movq	%r15,%rcx
+	sbbq	%rbp,%r15
+	sbbq	$0,%r11
+
+	cmovcq	%r8,%r12
+	cmovcq	%r9,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%r10,%r14
+	movq	%r13,8(%rdi)
+	cmovcq	%rcx,%r15
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+
+	.byte	0xf3,0xc3
+.size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
+.type	__ecp_nistz256_mul_montx,@function
+.align	32
+__ecp_nistz256_mul_montx:
+
+
+	mulxq	%r9,%r8,%r9
+	mulxq	%r10,%rcx,%r10
+	movq	$32,%r14
+	xorq	%r13,%r13
+	mulxq	%r11,%rbp,%r11
+	movq	.Lpoly+24(%rip),%r15
+	adcq	%rcx,%r9
+	mulxq	%r12,%rcx,%r12
+	movq	%r8,%rdx
+	adcq	%rbp,%r10
+	shlxq	%r14,%r8,%rbp
+	adcq	%rcx,%r11
+	shrxq	%r14,%r8,%rcx
+	adcq	$0,%r12
+
+
+
+	addq	%rbp,%r9
+	adcq	%rcx,%r10
+
+	mulxq	%r15,%rcx,%rbp
+	movq	8(%rbx),%rdx
+	adcq	%rcx,%r11
+	adcq	%rbp,%r12
+	adcq	$0,%r13
+	xorq	%r8,%r8
+
+
+
+	mulxq	0+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r9
+	adoxq	%rbp,%r10
+
+	mulxq	8+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	16+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	24+128(%rsi),%rcx,%rbp
+	movq	%r9,%rdx
+	adcxq	%rcx,%r12
+	shlxq	%r14,%r9,%rcx
+	adoxq	%rbp,%r13
+	shrxq	%r14,%r9,%rbp
+
+	adcxq	%r8,%r13
+	adoxq	%r8,%r8
+	adcq	$0,%r8
+
+
+
+	addq	%rcx,%r10
+	adcq	%rbp,%r11
+
+	mulxq	%r15,%rcx,%rbp
+	movq	16(%rbx),%rdx
+	adcq	%rcx,%r12
+	adcq	%rbp,%r13
+	adcq	$0,%r8
+	xorq	%r9,%r9
+
+
+
+	mulxq	0+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	8+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	16+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	24+128(%rsi),%rcx,%rbp
+	movq	%r10,%rdx
+	adcxq	%rcx,%r13
+	shlxq	%r14,%r10,%rcx
+	adoxq	%rbp,%r8
+	shrxq	%r14,%r10,%rbp
+
+	adcxq	%r9,%r8
+	adoxq	%r9,%r9
+	adcq	$0,%r9
+
+
+
+	addq	%rcx,%r11
+	adcq	%rbp,%r12
+
+	mulxq	%r15,%rcx,%rbp
+	movq	24(%rbx),%rdx
+	adcq	%rcx,%r13
+	adcq	%rbp,%r8
+	adcq	$0,%r9
+	xorq	%r10,%r10
+
+
+
+	mulxq	0+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	8+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	16+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r13
+	adoxq	%rbp,%r8
+
+	mulxq	24+128(%rsi),%rcx,%rbp
+	movq	%r11,%rdx
+	adcxq	%rcx,%r8
+	shlxq	%r14,%r11,%rcx
+	adoxq	%rbp,%r9
+	shrxq	%r14,%r11,%rbp
+
+	adcxq	%r10,%r9
+	adoxq	%r10,%r10
+	adcq	$0,%r10
+
+
+
+	addq	%rcx,%r12
+	adcq	%rbp,%r13
+
+	mulxq	%r15,%rcx,%rbp
+	movq	%r12,%rbx
+	movq	.Lpoly+8(%rip),%r14
+	adcq	%rcx,%r8
+	movq	%r13,%rdx
+	adcq	%rbp,%r9
+	adcq	$0,%r10
+
+
+
+	xorl	%eax,%eax
+	movq	%r8,%rcx
+	sbbq	$-1,%r12
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%rbp
+	sbbq	%r15,%r9
+	sbbq	$0,%r10
+
+	cmovcq	%rbx,%r12
+	cmovcq	%rdx,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovcq	%rbp,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	.byte	0xf3,0xc3
+.size	__ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
+
+.type	__ecp_nistz256_sqr_montx,@function
+.align	32
+__ecp_nistz256_sqr_montx:
+	mulxq	%r14,%r9,%r10
+	mulxq	%r15,%rcx,%r11
+	xorl	%eax,%eax
+	adcq	%rcx,%r10
+	mulxq	%r8,%rbp,%r12
+	movq	%r14,%rdx
+	adcq	%rbp,%r11
+	adcq	$0,%r12
+	xorq	%r13,%r13
+
+
+	mulxq	%r15,%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	%r8,%rcx,%rbp
+	movq	%r15,%rdx
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+	adcq	$0,%r13
+
+
+	mulxq	%r8,%rcx,%r14
+	movq	0+128(%rsi),%rdx
+	xorq	%r15,%r15
+	adcxq	%r9,%r9
+	adoxq	%rcx,%r13
+	adcxq	%r10,%r10
+	adoxq	%r15,%r14
+
+	mulxq	%rdx,%r8,%rbp
+	movq	8+128(%rsi),%rdx
+	adcxq	%r11,%r11
+	adoxq	%rbp,%r9
+	adcxq	%r12,%r12
+	mulxq	%rdx,%rcx,%rax
+	movq	16+128(%rsi),%rdx
+	adcxq	%r13,%r13
+	adoxq	%rcx,%r10
+	adcxq	%r14,%r14
+.byte	0x67
+	mulxq	%rdx,%rcx,%rbp
+	movq	24+128(%rsi),%rdx
+	adoxq	%rax,%r11
+	adcxq	%r15,%r15
+	adoxq	%rcx,%r12
+	movq	$32,%rsi
+	adoxq	%rbp,%r13
+.byte	0x67,0x67
+	mulxq	%rdx,%rcx,%rax
+	movq	.Lpoly+24(%rip),%rdx
+	adoxq	%rcx,%r14
+	shlxq	%rsi,%r8,%rcx
+	adoxq	%rax,%r15
+	shrxq	%rsi,%r8,%rax
+	movq	%rdx,%rbp
+
+
+	addq	%rcx,%r9
+	adcq	%rax,%r10
+
+	mulxq	%r8,%rcx,%r8
+	adcq	%rcx,%r11
+	shlxq	%rsi,%r9,%rcx
+	adcq	$0,%r8
+	shrxq	%rsi,%r9,%rax
+
+
+	addq	%rcx,%r10
+	adcq	%rax,%r11
+
+	mulxq	%r9,%rcx,%r9
+	adcq	%rcx,%r8
+	shlxq	%rsi,%r10,%rcx
+	adcq	$0,%r9
+	shrxq	%rsi,%r10,%rax
+
+
+	addq	%rcx,%r11
+	adcq	%rax,%r8
+
+	mulxq	%r10,%rcx,%r10
+	adcq	%rcx,%r9
+	shlxq	%rsi,%r11,%rcx
+	adcq	$0,%r10
+	shrxq	%rsi,%r11,%rax
+
+
+	addq	%rcx,%r8
+	adcq	%rax,%r9
+
+	mulxq	%r11,%rcx,%r11
+	adcq	%rcx,%r10
+	adcq	$0,%r11
+
+	xorq	%rdx,%rdx
+	addq	%r8,%r12
+	movq	.Lpoly+8(%rip),%rsi
+	adcq	%r9,%r13
+	movq	%r12,%r8
+	adcq	%r10,%r14
+	adcq	%r11,%r15
+	movq	%r13,%r9
+	adcq	$0,%rdx
+
+	subq	$-1,%r12
+	movq	%r14,%r10
+	sbbq	%rsi,%r13
+	sbbq	$0,%r14
+	movq	%r15,%r11
+	sbbq	%rbp,%r15
+	sbbq	$0,%rdx
+
+	cmovcq	%r8,%r12
+	cmovcq	%r9,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%r10,%r14
+	movq	%r13,8(%rdi)
+	cmovcq	%r11,%r15
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+
+	.byte	0xf3,0xc3
+.size	__ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
+
+
+
+
+
+
+.globl	ecp_nistz256_from_mont
+.type	ecp_nistz256_from_mont,@function
+.align	32
+ecp_nistz256_from_mont:
+	pushq	%r12
+	pushq	%r13
+
+	movq	0(%rsi),%rax
+	movq	.Lpoly+24(%rip),%r13
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	%rax,%r8
+	movq	.Lpoly+8(%rip),%r12
+
+
+
+	movq	%rax,%rcx
+	shlq	$32,%r8
+	mulq	%r13
+	shrq	$32,%rcx
+	addq	%r8,%r9
+	adcq	%rcx,%r10
+	adcq	%rax,%r11
+	movq	%r9,%rax
+	adcq	$0,%rdx
+
+
+
+	movq	%r9,%rcx
+	shlq	$32,%r9
+	movq	%rdx,%r8
+	mulq	%r13
+	shrq	$32,%rcx
+	addq	%r9,%r10
+	adcq	%rcx,%r11
+	adcq	%rax,%r8
+	movq	%r10,%rax
+	adcq	$0,%rdx
+
+
+
+	movq	%r10,%rcx
+	shlq	$32,%r10
+	movq	%rdx,%r9
+	mulq	%r13
+	shrq	$32,%rcx
+	addq	%r10,%r11
+	adcq	%rcx,%r8
+	adcq	%rax,%r9
+	movq	%r11,%rax
+	adcq	$0,%rdx
+
+
+
+	movq	%r11,%rcx
+	shlq	$32,%r11
+	movq	%rdx,%r10
+	mulq	%r13
+	shrq	$32,%rcx
+	addq	%r11,%r8
+	adcq	%rcx,%r9
+	movq	%r8,%rcx
+	adcq	%rax,%r10
+	movq	%r9,%rsi
+	adcq	$0,%rdx
+
+
+
+	subq	$-1,%r8
+	movq	%r10,%rax
+	sbbq	%r12,%r9
+	sbbq	$0,%r10
+	movq	%rdx,%r11
+	sbbq	%r13,%rdx
+	sbbq	%r13,%r13
+
+	cmovnzq	%rcx,%r8
+	cmovnzq	%rsi,%r9
+	movq	%r8,0(%rdi)
+	cmovnzq	%rax,%r10
+	movq	%r9,8(%rdi)
+	cmovzq	%rdx,%r11
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+
+	popq	%r13
+	popq	%r12
+	.byte	0xf3,0xc3
+.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
+
+
+.globl	ecp_nistz256_select_w5
+.type	ecp_nistz256_select_w5,@function
+.align	32
+ecp_nistz256_select_w5:
+	movl	OPENSSL_ia32cap_P+8(%rip),%eax
+	testl	$32,%eax
+	jnz	.Lavx2_select_w5
+	movdqa	.LOne(%rip),%xmm0
+	movd	%edx,%xmm1
+
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+
+	movdqa	%xmm0,%xmm8
+	pshufd	$0,%xmm1,%xmm1
+
+	movq	$16,%rax
+.Lselect_loop_sse_w5:
+
+	movdqa	%xmm8,%xmm15
+	paddd	%xmm0,%xmm8
+	pcmpeqd	%xmm1,%xmm15
+
+	movdqa	0(%rsi),%xmm9
+	movdqa	16(%rsi),%xmm10
+	movdqa	32(%rsi),%xmm11
+	movdqa	48(%rsi),%xmm12
+	movdqa	64(%rsi),%xmm13
+	movdqa	80(%rsi),%xmm14
+	leaq	96(%rsi),%rsi
+
+	pand	%xmm15,%xmm9
+	pand	%xmm15,%xmm10
+	por	%xmm9,%xmm2
+	pand	%xmm15,%xmm11
+	por	%xmm10,%xmm3
+	pand	%xmm15,%xmm12
+	por	%xmm11,%xmm4
+	pand	%xmm15,%xmm13
+	por	%xmm12,%xmm5
+	pand	%xmm15,%xmm14
+	por	%xmm13,%xmm6
+	por	%xmm14,%xmm7
+
+	decq	%rax
+	jnz	.Lselect_loop_sse_w5
+
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+	movdqu	%xmm4,32(%rdi)
+	movdqu	%xmm5,48(%rdi)
+	movdqu	%xmm6,64(%rdi)
+	movdqu	%xmm7,80(%rdi)
+	.byte	0xf3,0xc3
+.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
+
+
+
+.globl	ecp_nistz256_select_w7
+.type	ecp_nistz256_select_w7,@function
+.align	32
+ecp_nistz256_select_w7:
+	movl	OPENSSL_ia32cap_P+8(%rip),%eax
+	testl	$32,%eax
+	jnz	.Lavx2_select_w7
+	movdqa	.LOne(%rip),%xmm8
+	movd	%edx,%xmm1
+
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+
+	movdqa	%xmm8,%xmm0
+	pshufd	$0,%xmm1,%xmm1
+	movq	$64,%rax
+
+.Lselect_loop_sse_w7:
+	movdqa	%xmm8,%xmm15
+	paddd	%xmm0,%xmm8
+	movdqa	0(%rsi),%xmm9
+	movdqa	16(%rsi),%xmm10
+	pcmpeqd	%xmm1,%xmm15
+	movdqa	32(%rsi),%xmm11
+	movdqa	48(%rsi),%xmm12
+	leaq	64(%rsi),%rsi
+
+	pand	%xmm15,%xmm9
+	pand	%xmm15,%xmm10
+	por	%xmm9,%xmm2
+	pand	%xmm15,%xmm11
+	por	%xmm10,%xmm3
+	pand	%xmm15,%xmm12
+	por	%xmm11,%xmm4
+	prefetcht0	255(%rsi)
+	por	%xmm12,%xmm5
+
+	decq	%rax
+	jnz	.Lselect_loop_sse_w7
+
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+	movdqu	%xmm4,32(%rdi)
+	movdqu	%xmm5,48(%rdi)
+	.byte	0xf3,0xc3
+.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
+
+
+.type	ecp_nistz256_avx2_select_w5,@function
+.align	32
+ecp_nistz256_avx2_select_w5:
+.Lavx2_select_w5:
+	vzeroupper
+	vmovdqa	.LTwo(%rip),%ymm0
+
+	vpxor	%ymm2,%ymm2,%ymm2
+	vpxor	%ymm3,%ymm3,%ymm3
+	vpxor	%ymm4,%ymm4,%ymm4
+
+	vmovdqa	.LOne(%rip),%ymm5
+	vmovdqa	.LTwo(%rip),%ymm10
+
+	vmovd	%edx,%xmm1
+	vpermd	%ymm1,%ymm2,%ymm1
+
+	movq	$8,%rax
+.Lselect_loop_avx2_w5:
+
+	vmovdqa	0(%rsi),%ymm6
+	vmovdqa	32(%rsi),%ymm7
+	vmovdqa	64(%rsi),%ymm8
+
+	vmovdqa	96(%rsi),%ymm11
+	vmovdqa	128(%rsi),%ymm12
+	vmovdqa	160(%rsi),%ymm13
+
+	vpcmpeqd	%ymm1,%ymm5,%ymm9
+	vpcmpeqd	%ymm1,%ymm10,%ymm14
+
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpaddd	%ymm0,%ymm10,%ymm10
+	leaq	192(%rsi),%rsi
+
+	vpand	%ymm9,%ymm6,%ymm6
+	vpand	%ymm9,%ymm7,%ymm7
+	vpand	%ymm9,%ymm8,%ymm8
+	vpand	%ymm14,%ymm11,%ymm11
+	vpand	%ymm14,%ymm12,%ymm12
+	vpand	%ymm14,%ymm13,%ymm13
+
+	vpxor	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm7,%ymm3,%ymm3
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpxor	%ymm11,%ymm2,%ymm2
+	vpxor	%ymm12,%ymm3,%ymm3
+	vpxor	%ymm13,%ymm4,%ymm4
+
+	decq	%rax
+	jnz	.Lselect_loop_avx2_w5
+
+	vmovdqu	%ymm2,0(%rdi)
+	vmovdqu	%ymm3,32(%rdi)
+	vmovdqu	%ymm4,64(%rdi)
+	vzeroupper
+	.byte	0xf3,0xc3
+.size	ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
+
+
+
+.globl	ecp_nistz256_avx2_select_w7
+.type	ecp_nistz256_avx2_select_w7,@function
+.align	32
+ecp_nistz256_avx2_select_w7:
+.Lavx2_select_w7:
+	vzeroupper
+	vmovdqa	.LThree(%rip),%ymm0
+
+	vpxor	%ymm2,%ymm2,%ymm2
+	vpxor	%ymm3,%ymm3,%ymm3
+
+	vmovdqa	.LOne(%rip),%ymm4
+	vmovdqa	.LTwo(%rip),%ymm8
+	vmovdqa	.LThree(%rip),%ymm12
+
+	vmovd	%edx,%xmm1
+	vpermd	%ymm1,%ymm2,%ymm1
+
+
+	movq	$21,%rax
+.Lselect_loop_avx2_w7:
+
+	vmovdqa	0(%rsi),%ymm5
+	vmovdqa	32(%rsi),%ymm6
+
+	vmovdqa	64(%rsi),%ymm9
+	vmovdqa	96(%rsi),%ymm10
+
+	vmovdqa	128(%rsi),%ymm13
+	vmovdqa	160(%rsi),%ymm14
+
+	vpcmpeqd	%ymm1,%ymm4,%ymm7
+	vpcmpeqd	%ymm1,%ymm8,%ymm11
+	vpcmpeqd	%ymm1,%ymm12,%ymm15
+
+	vpaddd	%ymm0,%ymm4,%ymm4
+	vpaddd	%ymm0,%ymm8,%ymm8
+	vpaddd	%ymm0,%ymm12,%ymm12
+	leaq	192(%rsi),%rsi
+
+	vpand	%ymm7,%ymm5,%ymm5
+	vpand	%ymm7,%ymm6,%ymm6
+	vpand	%ymm11,%ymm9,%ymm9
+	vpand	%ymm11,%ymm10,%ymm10
+	vpand	%ymm15,%ymm13,%ymm13
+	vpand	%ymm15,%ymm14,%ymm14
+
+	vpxor	%ymm5,%ymm2,%ymm2
+	vpxor	%ymm6,%ymm3,%ymm3
+	vpxor	%ymm9,%ymm2,%ymm2
+	vpxor	%ymm10,%ymm3,%ymm3
+	vpxor	%ymm13,%ymm2,%ymm2
+	vpxor	%ymm14,%ymm3,%ymm3
+
+	decq	%rax
+	jnz	.Lselect_loop_avx2_w7
+
+
+	vmovdqa	0(%rsi),%ymm5
+	vmovdqa	32(%rsi),%ymm6
+
+	vpcmpeqd	%ymm1,%ymm4,%ymm7
+
+	vpand	%ymm7,%ymm5,%ymm5
+	vpand	%ymm7,%ymm6,%ymm6
+
+	vpxor	%ymm5,%ymm2,%ymm2
+	vpxor	%ymm6,%ymm3,%ymm3
+
+	vmovdqu	%ymm2,0(%rdi)
+	vmovdqu	%ymm3,32(%rdi)
+	vzeroupper
+	.byte	0xf3,0xc3
+.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
+.type	__ecp_nistz256_add_toq,@function
+.align	32
+__ecp_nistz256_add_toq:
+	xorq	%r11,%r11
+	addq	0(%rbx),%r12
+	adcq	8(%rbx),%r13
+	movq	%r12,%rax
+	adcq	16(%rbx),%r8
+	adcq	24(%rbx),%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	cmovcq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovcq	%r10,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	.byte	0xf3,0xc3
+.size	__ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
+
+.type	__ecp_nistz256_sub_fromq,@function
+.align	32
+__ecp_nistz256_sub_fromq:
+	subq	0(%rbx),%r12
+	sbbq	8(%rbx),%r13
+	movq	%r12,%rax
+	sbbq	16(%rbx),%r8
+	sbbq	24(%rbx),%r9
+	movq	%r13,%rbp
+	sbbq	%r11,%r11
+
+	addq	$-1,%r12
+	movq	%r8,%rcx
+	adcq	%r14,%r13
+	adcq	$0,%r8
+	movq	%r9,%r10
+	adcq	%r15,%r9
+	testq	%r11,%r11
+
+	cmovzq	%rax,%r12
+	cmovzq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovzq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovzq	%r10,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	.byte	0xf3,0xc3
+.size	__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
+
+.type	__ecp_nistz256_subq,@function
+.align	32
+__ecp_nistz256_subq:
+	subq	%r12,%rax
+	sbbq	%r13,%rbp
+	movq	%rax,%r12
+	sbbq	%r8,%rcx
+	sbbq	%r9,%r10
+	movq	%rbp,%r13
+	sbbq	%r11,%r11
+
+	addq	$-1,%rax
+	movq	%rcx,%r8
+	adcq	%r14,%rbp
+	adcq	$0,%rcx
+	movq	%r10,%r9
+	adcq	%r15,%r10
+	testq	%r11,%r11
+
+	cmovnzq	%rax,%r12
+	cmovnzq	%rbp,%r13
+	cmovnzq	%rcx,%r8
+	cmovnzq	%r10,%r9
+
+	.byte	0xf3,0xc3
+.size	__ecp_nistz256_subq,.-__ecp_nistz256_subq
+
+.type	__ecp_nistz256_mul_by_2q,@function
+.align	32
+__ecp_nistz256_mul_by_2q:
+	xorq	%r11,%r11
+	addq	%r12,%r12
+	adcq	%r13,%r13
+	movq	%r12,%rax
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	cmovcq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovcq	%r10,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	.byte	0xf3,0xc3
+.size	__ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
+.globl	ecp_nistz256_point_double
+.type	ecp_nistz256_point_double,@function
+.align	32
+ecp_nistz256_point_double:
+	movl	$0x80100,%ecx
+	andl	OPENSSL_ia32cap_P+8(%rip),%ecx
+	cmpl	$0x80100,%ecx
+	je	.Lpoint_doublex
+	pushq	%rbp
+	pushq	%rbx
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	subq	$160+8,%rsp
+
+.Lpoint_double_shortcutq:
+	movdqu	0(%rsi),%xmm0
+	movq	%rsi,%rbx
+	movdqu	16(%rsi),%xmm1
+	movq	32+0(%rsi),%r12
+	movq	32+8(%rsi),%r13
+	movq	32+16(%rsi),%r8
+	movq	32+24(%rsi),%r9
+	movq	.Lpoly+8(%rip),%r14
+	movq	.Lpoly+24(%rip),%r15
+	movdqa	%xmm0,96(%rsp)
+	movdqa	%xmm1,96+16(%rsp)
+	leaq	32(%rdi),%r10
+	leaq	64(%rdi),%r11
+.byte	102,72,15,110,199
+.byte	102,73,15,110,202
+.byte	102,73,15,110,211
+
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_by_2q
+
+	movq	64+0(%rsi),%rax
+	movq	64+8(%rsi),%r14
+	movq	64+16(%rsi),%r15
+	movq	64+24(%rsi),%r8
+	leaq	64-0(%rsi),%rsi
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	0+0(%rsp),%rax
+	movq	8+0(%rsp),%r14
+	leaq	0+0(%rsp),%rsi
+	movq	16+0(%rsp),%r15
+	movq	24+0(%rsp),%r8
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	32(%rbx),%rax
+	movq	64+0(%rbx),%r9
+	movq	64+8(%rbx),%r10
+	movq	64+16(%rbx),%r11
+	movq	64+24(%rbx),%r12
+	leaq	64-0(%rbx),%rsi
+	leaq	32(%rbx),%rbx
+.byte	102,72,15,126,215
+	call	__ecp_nistz256_mul_montq
+	call	__ecp_nistz256_mul_by_2q
+
+	movq	96+0(%rsp),%r12
+	movq	96+8(%rsp),%r13
+	leaq	64(%rsp),%rbx
+	movq	96+16(%rsp),%r8
+	movq	96+24(%rsp),%r9
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_add_toq
+
+	movq	96+0(%rsp),%r12
+	movq	96+8(%rsp),%r13
+	leaq	64(%rsp),%rbx
+	movq	96+16(%rsp),%r8
+	movq	96+24(%rsp),%r9
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	movq	0+0(%rsp),%rax
+	movq	8+0(%rsp),%r14
+	leaq	0+0(%rsp),%rsi
+	movq	16+0(%rsp),%r15
+	movq	24+0(%rsp),%r8
+.byte	102,72,15,126,207
+	call	__ecp_nistz256_sqr_montq
+	xorq	%r9,%r9
+	movq	%r12,%rax
+	addq	$-1,%r12
+	movq	%r13,%r10
+	adcq	%rsi,%r13
+	movq	%r14,%rcx
+	adcq	$0,%r14
+	movq	%r15,%r8
+	adcq	%rbp,%r15
+	adcq	$0,%r9
+	xorq	%rsi,%rsi
+	testq	$1,%rax
+
+	cmovzq	%rax,%r12
+	cmovzq	%r10,%r13
+	cmovzq	%rcx,%r14
+	cmovzq	%r8,%r15
+	cmovzq	%rsi,%r9
+
+	movq	%r13,%rax
+	shrq	$1,%r12
+	shlq	$63,%rax
+	movq	%r14,%r10
+	shrq	$1,%r13
+	orq	%rax,%r12
+	shlq	$63,%r10
+	movq	%r15,%rcx
+	shrq	$1,%r14
+	orq	%r10,%r13
+	shlq	$63,%rcx
+	movq	%r12,0(%rdi)
+	shrq	$1,%r15
+	movq	%r13,8(%rdi)
+	shlq	$63,%r9
+	orq	%rcx,%r14
+	orq	%r9,%r15
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+	movq	64(%rsp),%rax
+	leaq	64(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_mul_by_2q
+
+	leaq	32(%rsp),%rbx
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_add_toq
+
+	movq	96(%rsp),%rax
+	leaq	96(%rsp),%rbx
+	movq	0+0(%rsp),%r9
+	movq	8+0(%rsp),%r10
+	leaq	0+0(%rsp),%rsi
+	movq	16+0(%rsp),%r11
+	movq	24+0(%rsp),%r12
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_mul_by_2q
+
+	movq	0+32(%rsp),%rax
+	movq	8+32(%rsp),%r14
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r15
+	movq	24+32(%rsp),%r8
+.byte	102,72,15,126,199
+	call	__ecp_nistz256_sqr_montq
+
+	leaq	128(%rsp),%rbx
+	movq	%r14,%r8
+	movq	%r15,%r9
+	movq	%rsi,%r14
+	movq	%rbp,%r15
+	call	__ecp_nistz256_sub_fromq
+
+	movq	0+0(%rsp),%rax
+	movq	0+8(%rsp),%rbp
+	movq	0+16(%rsp),%rcx
+	movq	0+24(%rsp),%r10
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_subq
+
+	movq	32(%rsp),%rax
+	leaq	32(%rsp),%rbx
+	movq	%r12,%r14
+	xorl	%ecx,%ecx
+	movq	%r12,0+0(%rsp)
+	movq	%r13,%r10
+	movq	%r13,0+8(%rsp)
+	cmovzq	%r8,%r11
+	movq	%r8,0+16(%rsp)
+	leaq	0-0(%rsp),%rsi
+	cmovzq	%r9,%r12
+	movq	%r9,0+24(%rsp)
+	movq	%r14,%r9
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+.byte	102,72,15,126,203
+.byte	102,72,15,126,207
+	call	__ecp_nistz256_sub_fromq
+
+	addq	$160+8,%rsp
+	popq	%r15
+	popq	%r14
+	popq	%r13
+	popq	%r12
+	popq	%rbx
+	popq	%rbp
+	.byte	0xf3,0xc3
+.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
+.globl	ecp_nistz256_point_add
+.type	ecp_nistz256_point_add,@function
+.align	32
+ecp_nistz256_point_add:
+	movl	$0x80100,%ecx
+	andl	OPENSSL_ia32cap_P+8(%rip),%ecx
+	cmpl	$0x80100,%ecx
+	je	.Lpoint_addx
+	pushq	%rbp
+	pushq	%rbx
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	subq	$576+8,%rsp
+
+	movdqu	0(%rsi),%xmm0
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm3
+	movdqu	64(%rsi),%xmm4
+	movdqu	80(%rsi),%xmm5
+	movq	%rsi,%rbx
+	movq	%rdx,%rsi
+	movdqa	%xmm0,384(%rsp)
+	movdqa	%xmm1,384+16(%rsp)
+	movdqa	%xmm2,416(%rsp)
+	movdqa	%xmm3,416+16(%rsp)
+	movdqa	%xmm4,448(%rsp)
+	movdqa	%xmm5,448+16(%rsp)
+	por	%xmm4,%xmm5
+
+	movdqu	0(%rsi),%xmm0
+	pshufd	$0xb1,%xmm5,%xmm3
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	por	%xmm3,%xmm5
+	movdqu	48(%rsi),%xmm3
+	movq	64+0(%rsi),%rax
+	movq	64+8(%rsi),%r14
+	movq	64+16(%rsi),%r15
+	movq	64+24(%rsi),%r8
+	movdqa	%xmm0,480(%rsp)
+	pshufd	$0x1e,%xmm5,%xmm4
+	movdqa	%xmm1,480+16(%rsp)
+	movdqu	64(%rsi),%xmm0
+	movdqu	80(%rsi),%xmm1
+	movdqa	%xmm2,512(%rsp)
+	movdqa	%xmm3,512+16(%rsp)
+	por	%xmm4,%xmm5
+	pxor	%xmm4,%xmm4
+	por	%xmm0,%xmm1
+.byte	102,72,15,110,199
+
+	leaq	64-0(%rsi),%rsi
+	movq	%rax,544+0(%rsp)
+	movq	%r14,544+8(%rsp)
+	movq	%r15,544+16(%rsp)
+	movq	%r8,544+24(%rsp)
+	leaq	96(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	pcmpeqd	%xmm4,%xmm5
+	pshufd	$0xb1,%xmm1,%xmm4
+	por	%xmm1,%xmm4
+	pshufd	$0,%xmm5,%xmm5
+	pshufd	$0x1e,%xmm4,%xmm3
+	por	%xmm3,%xmm4
+	pxor	%xmm3,%xmm3
+	pcmpeqd	%xmm3,%xmm4
+	pshufd	$0,%xmm4,%xmm4
+	movq	64+0(%rbx),%rax
+	movq	64+8(%rbx),%r14
+	movq	64+16(%rbx),%r15
+	movq	64+24(%rbx),%r8
+.byte	102,72,15,110,203
+
+	leaq	64-0(%rbx),%rsi
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	544(%rsp),%rax
+	leaq	544(%rsp),%rbx
+	movq	0+96(%rsp),%r9
+	movq	8+96(%rsp),%r10
+	leaq	0+96(%rsp),%rsi
+	movq	16+96(%rsp),%r11
+	movq	24+96(%rsp),%r12
+	leaq	224(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	448(%rsp),%rax
+	leaq	448(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	416(%rsp),%rax
+	leaq	416(%rsp),%rbx
+	movq	0+224(%rsp),%r9
+	movq	8+224(%rsp),%r10
+	leaq	0+224(%rsp),%rsi
+	movq	16+224(%rsp),%r11
+	movq	24+224(%rsp),%r12
+	leaq	224(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	512(%rsp),%rax
+	leaq	512(%rsp),%rbx
+	movq	0+256(%rsp),%r9
+	movq	8+256(%rsp),%r10
+	leaq	0+256(%rsp),%rsi
+	movq	16+256(%rsp),%r11
+	movq	24+256(%rsp),%r12
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	224(%rsp),%rbx
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	orq	%r13,%r12
+	movdqa	%xmm4,%xmm2
+	orq	%r8,%r12
+	orq	%r9,%r12
+	por	%xmm5,%xmm2
+.byte	102,73,15,110,220
+
+	movq	384(%rsp),%rax
+	leaq	384(%rsp),%rbx
+	movq	0+96(%rsp),%r9
+	movq	8+96(%rsp),%r10
+	leaq	0+96(%rsp),%rsi
+	movq	16+96(%rsp),%r11
+	movq	24+96(%rsp),%r12
+	leaq	160(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	480(%rsp),%rax
+	leaq	480(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	192(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	160(%rsp),%rbx
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	orq	%r13,%r12
+	orq	%r8,%r12
+	orq	%r9,%r12
+
+.byte	0x3e
+	jnz	.Ladd_proceedq
+.byte	102,73,15,126,208
+.byte	102,73,15,126,217
+	testq	%r8,%r8
+	jnz	.Ladd_proceedq
+	testq	%r9,%r9
+	jz	.Ladd_doubleq
+
+.byte	102,72,15,126,199
+	pxor	%xmm0,%xmm0
+	movdqu	%xmm0,0(%rdi)
+	movdqu	%xmm0,16(%rdi)
+	movdqu	%xmm0,32(%rdi)
+	movdqu	%xmm0,48(%rdi)
+	movdqu	%xmm0,64(%rdi)
+	movdqu	%xmm0,80(%rdi)
+	jmp	.Ladd_doneq
+
+.align	32
+.Ladd_doubleq:
+.byte	102,72,15,126,206
+.byte	102,72,15,126,199
+	addq	$416,%rsp
+	jmp	.Lpoint_double_shortcutq
+
+.align	32
+.Ladd_proceedq:
+	movq	0+64(%rsp),%rax
+	movq	8+64(%rsp),%r14
+	leaq	0+64(%rsp),%rsi
+	movq	16+64(%rsp),%r15
+	movq	24+64(%rsp),%r8
+	leaq	96(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	448(%rsp),%rax
+	leaq	448(%rsp),%rbx
+	movq	0+0(%rsp),%r9
+	movq	8+0(%rsp),%r10
+	leaq	0+0(%rsp),%rsi
+	movq	16+0(%rsp),%r11
+	movq	24+0(%rsp),%r12
+	leaq	352(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	0+0(%rsp),%rax
+	movq	8+0(%rsp),%r14
+	leaq	0+0(%rsp),%rsi
+	movq	16+0(%rsp),%r15
+	movq	24+0(%rsp),%r8
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	544(%rsp),%rax
+	leaq	544(%rsp),%rbx
+	movq	0+352(%rsp),%r9
+	movq	8+352(%rsp),%r10
+	leaq	0+352(%rsp),%rsi
+	movq	16+352(%rsp),%r11
+	movq	24+352(%rsp),%r12
+	leaq	352(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	0(%rsp),%rax
+	leaq	0(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	160(%rsp),%rax
+	leaq	160(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	192(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+
+
+
+	xorq	%r11,%r11
+	addq	%r12,%r12
+	leaq	96(%rsp),%rsi
+	adcq	%r13,%r13
+	movq	%r12,%rax
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	movq	0(%rsi),%rax
+	cmovcq	%rbp,%r13
+	movq	8(%rsi),%rbp
+	cmovcq	%rcx,%r8
+	movq	16(%rsi),%rcx
+	cmovcq	%r10,%r9
+	movq	24(%rsi),%r10
+
+	call	__ecp_nistz256_subq
+
+	leaq	128(%rsp),%rbx
+	leaq	288(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	movq	192+0(%rsp),%rax
+	movq	192+8(%rsp),%rbp
+	movq	192+16(%rsp),%rcx
+	movq	192+24(%rsp),%r10
+	leaq	320(%rsp),%rdi
+
+	call	__ecp_nistz256_subq
+
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+	movq	128(%rsp),%rax
+	leaq	128(%rsp),%rbx
+	movq	0+224(%rsp),%r9
+	movq	8+224(%rsp),%r10
+	leaq	0+224(%rsp),%rsi
+	movq	16+224(%rsp),%r11
+	movq	24+224(%rsp),%r12
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	320(%rsp),%rax
+	leaq	320(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	0+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	320(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	256(%rsp),%rbx
+	leaq	320(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+.byte	102,72,15,126,199
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	352(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	352+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	544(%rsp),%xmm2
+	pand	544+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	448(%rsp),%xmm2
+	pand	448+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,64(%rdi)
+	movdqu	%xmm3,80(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	288(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	288+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	480(%rsp),%xmm2
+	pand	480+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	384(%rsp),%xmm2
+	pand	384+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	320(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	320+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	512(%rsp),%xmm2
+	pand	512+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	416(%rsp),%xmm2
+	pand	416+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm3,48(%rdi)
+
+.Ladd_doneq:
+	addq	$576+8,%rsp
+	popq	%r15
+	popq	%r14
+	popq	%r13
+	popq	%r12
+	popq	%rbx
+	popq	%rbp
+	.byte	0xf3,0xc3
+.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
+.globl	ecp_nistz256_point_add_affine
+.type	ecp_nistz256_point_add_affine,@function
+.align	32
+ecp_nistz256_point_add_affine:
+	movl	$0x80100,%ecx
+	andl	OPENSSL_ia32cap_P+8(%rip),%ecx
+	cmpl	$0x80100,%ecx
+	je	.Lpoint_add_affinex
+	pushq	%rbp
+	pushq	%rbx
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	subq	$480+8,%rsp
+
+	movdqu	0(%rsi),%xmm0
+	movq	%rdx,%rbx
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm3
+	movdqu	64(%rsi),%xmm4
+	movdqu	80(%rsi),%xmm5
+	movq	64+0(%rsi),%rax
+	movq	64+8(%rsi),%r14
+	movq	64+16(%rsi),%r15
+	movq	64+24(%rsi),%r8
+	movdqa	%xmm0,320(%rsp)
+	movdqa	%xmm1,320+16(%rsp)
+	movdqa	%xmm2,352(%rsp)
+	movdqa	%xmm3,352+16(%rsp)
+	movdqa	%xmm4,384(%rsp)
+	movdqa	%xmm5,384+16(%rsp)
+	por	%xmm4,%xmm5
+
+	movdqu	0(%rbx),%xmm0
+	pshufd	$0xb1,%xmm5,%xmm3
+	movdqu	16(%rbx),%xmm1
+	movdqu	32(%rbx),%xmm2
+	por	%xmm3,%xmm5
+	movdqu	48(%rbx),%xmm3
+	movdqa	%xmm0,416(%rsp)
+	pshufd	$0x1e,%xmm5,%xmm4
+	movdqa	%xmm1,416+16(%rsp)
+	por	%xmm0,%xmm1
+.byte	102,72,15,110,199
+	movdqa	%xmm2,448(%rsp)
+	movdqa	%xmm3,448+16(%rsp)
+	por	%xmm2,%xmm3
+	por	%xmm4,%xmm5
+	pxor	%xmm4,%xmm4
+	por	%xmm1,%xmm3
+
+	leaq	64-0(%rsi),%rsi
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	pcmpeqd	%xmm4,%xmm5
+	pshufd	$0xb1,%xmm3,%xmm4
+	movq	0(%rbx),%rax
+
+	movq	%r12,%r9
+	por	%xmm3,%xmm4
+	pshufd	$0,%xmm5,%xmm5
+	pshufd	$0x1e,%xmm4,%xmm3
+	movq	%r13,%r10
+	por	%xmm3,%xmm4
+	pxor	%xmm3,%xmm3
+	movq	%r14,%r11
+	pcmpeqd	%xmm3,%xmm4
+	pshufd	$0,%xmm4,%xmm4
+
+	leaq	32-0(%rsp),%rsi
+	movq	%r15,%r12
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	320(%rsp),%rbx
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	movq	384(%rsp),%rax
+	leaq	384(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	384(%rsp),%rax
+	leaq	384(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	0+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	288(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	448(%rsp),%rax
+	leaq	448(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	352(%rsp),%rbx
+	leaq	96(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	movq	0+64(%rsp),%rax
+	movq	8+64(%rsp),%r14
+	leaq	0+64(%rsp),%rsi
+	movq	16+64(%rsp),%r15
+	movq	24+64(%rsp),%r8
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	0+96(%rsp),%rax
+	movq	8+96(%rsp),%r14
+	leaq	0+96(%rsp),%rsi
+	movq	16+96(%rsp),%r15
+	movq	24+96(%rsp),%r8
+	leaq	192(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	128(%rsp),%rax
+	leaq	128(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	0+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	160(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	320(%rsp),%rax
+	leaq	320(%rsp),%rbx
+	movq	0+128(%rsp),%r9
+	movq	8+128(%rsp),%r10
+	leaq	0+128(%rsp),%rsi
+	movq	16+128(%rsp),%r11
+	movq	24+128(%rsp),%r12
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+
+
+
+	xorq	%r11,%r11
+	addq	%r12,%r12
+	leaq	192(%rsp),%rsi
+	adcq	%r13,%r13
+	movq	%r12,%rax
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	movq	0(%rsi),%rax
+	cmovcq	%rbp,%r13
+	movq	8(%rsi),%rbp
+	cmovcq	%rcx,%r8
+	movq	16(%rsi),%rcx
+	cmovcq	%r10,%r9
+	movq	24(%rsi),%r10
+
+	call	__ecp_nistz256_subq
+
+	leaq	160(%rsp),%rbx
+	leaq	224(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	movq	0+0(%rsp),%rax
+	movq	0+8(%rsp),%rbp
+	movq	0+16(%rsp),%rcx
+	movq	0+24(%rsp),%r10
+	leaq	64(%rsp),%rdi
+
+	call	__ecp_nistz256_subq
+
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+	movq	352(%rsp),%rax
+	leaq	352(%rsp),%rbx
+	movq	0+160(%rsp),%r9
+	movq	8+160(%rsp),%r10
+	leaq	0+160(%rsp),%rsi
+	movq	16+160(%rsp),%r11
+	movq	24+160(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	96(%rsp),%rax
+	leaq	96(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	0+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	32(%rsp),%rbx
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+.byte	102,72,15,126,199
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	288(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	288+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	.LONE_mont(%rip),%xmm2
+	pand	.LONE_mont+16(%rip),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	384(%rsp),%xmm2
+	pand	384+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,64(%rdi)
+	movdqu	%xmm3,80(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	224(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	224+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	416(%rsp),%xmm2
+	pand	416+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	320(%rsp),%xmm2
+	pand	320+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	256(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	256+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	448(%rsp),%xmm2
+	pand	448+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	352(%rsp),%xmm2
+	pand	352+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm3,48(%rdi)
+
+	addq	$480+8,%rsp
+	popq	%r15
+	popq	%r14
+	popq	%r13
+	popq	%r12
+	popq	%rbx
+	popq	%rbp
+	.byte	0xf3,0xc3
+.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
+.type	__ecp_nistz256_add_tox,@function
+.align	32
+__ecp_nistz256_add_tox:
+	xorq	%r11,%r11
+	adcq	0(%rbx),%r12
+	adcq	8(%rbx),%r13
+	movq	%r12,%rax
+	adcq	16(%rbx),%r8
+	adcq	24(%rbx),%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	xorq	%r10,%r10
+	sbbq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	cmovcq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovcq	%r10,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	.byte	0xf3,0xc3
+.size	__ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
+
+.type	__ecp_nistz256_sub_fromx,@function
+.align	32
+__ecp_nistz256_sub_fromx:
+	xorq	%r11,%r11
+	sbbq	0(%rbx),%r12
+	sbbq	8(%rbx),%r13
+	movq	%r12,%rax
+	sbbq	16(%rbx),%r8
+	sbbq	24(%rbx),%r9
+	movq	%r13,%rbp
+	sbbq	$0,%r11
+
+	xorq	%r10,%r10
+	adcq	$-1,%r12
+	movq	%r8,%rcx
+	adcq	%r14,%r13
+	adcq	$0,%r8
+	movq	%r9,%r10
+	adcq	%r15,%r9
+
+	btq	$0,%r11
+	cmovncq	%rax,%r12
+	cmovncq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovncq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovncq	%r10,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	.byte	0xf3,0xc3
+.size	__ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
+
+.type	__ecp_nistz256_subx,@function
+.align	32
+__ecp_nistz256_subx:
+	xorq	%r11,%r11
+	sbbq	%r12,%rax
+	sbbq	%r13,%rbp
+	movq	%rax,%r12
+	sbbq	%r8,%rcx
+	sbbq	%r9,%r10
+	movq	%rbp,%r13
+	sbbq	$0,%r11
+
+	xorq	%r9,%r9
+	adcq	$-1,%rax
+	movq	%rcx,%r8
+	adcq	%r14,%rbp
+	adcq	$0,%rcx
+	movq	%r10,%r9
+	adcq	%r15,%r10
+
+	btq	$0,%r11
+	cmovcq	%rax,%r12
+	cmovcq	%rbp,%r13
+	cmovcq	%rcx,%r8
+	cmovcq	%r10,%r9
+
+	.byte	0xf3,0xc3
+.size	__ecp_nistz256_subx,.-__ecp_nistz256_subx
+
+.type	__ecp_nistz256_mul_by_2x,@function
+.align	32
+__ecp_nistz256_mul_by_2x:
+	xorq	%r11,%r11
+	adcq	%r12,%r12
+	adcq	%r13,%r13
+	movq	%r12,%rax
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	xorq	%r10,%r10
+	sbbq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	cmovcq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovcq	%r10,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	.byte	0xf3,0xc3
+.size	__ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
+.type	ecp_nistz256_point_doublex,@function
+.align	32
+ecp_nistz256_point_doublex:
+.Lpoint_doublex:
+	pushq	%rbp
+	pushq	%rbx
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	subq	$160+8,%rsp
+
+.Lpoint_double_shortcutx:
+	movdqu	0(%rsi),%xmm0
+	movq	%rsi,%rbx
+	movdqu	16(%rsi),%xmm1
+	movq	32+0(%rsi),%r12
+	movq	32+8(%rsi),%r13
+	movq	32+16(%rsi),%r8
+	movq	32+24(%rsi),%r9
+	movq	.Lpoly+8(%rip),%r14
+	movq	.Lpoly+24(%rip),%r15
+	movdqa	%xmm0,96(%rsp)
+	movdqa	%xmm1,96+16(%rsp)
+	leaq	32(%rdi),%r10
+	leaq	64(%rdi),%r11
+.byte	102,72,15,110,199
+.byte	102,73,15,110,202
+.byte	102,73,15,110,211
+
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_by_2x
+
+	movq	64+0(%rsi),%rdx
+	movq	64+8(%rsi),%r14
+	movq	64+16(%rsi),%r15
+	movq	64+24(%rsi),%r8
+	leaq	64-128(%rsi),%rsi
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	0+0(%rsp),%rdx
+	movq	8+0(%rsp),%r14
+	leaq	-128+0(%rsp),%rsi
+	movq	16+0(%rsp),%r15
+	movq	24+0(%rsp),%r8
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	32(%rbx),%rdx
+	movq	64+0(%rbx),%r9
+	movq	64+8(%rbx),%r10
+	movq	64+16(%rbx),%r11
+	movq	64+24(%rbx),%r12
+	leaq	64-128(%rbx),%rsi
+	leaq	32(%rbx),%rbx
+.byte	102,72,15,126,215
+	call	__ecp_nistz256_mul_montx
+	call	__ecp_nistz256_mul_by_2x
+
+	movq	96+0(%rsp),%r12
+	movq	96+8(%rsp),%r13
+	leaq	64(%rsp),%rbx
+	movq	96+16(%rsp),%r8
+	movq	96+24(%rsp),%r9
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_add_tox
+
+	movq	96+0(%rsp),%r12
+	movq	96+8(%rsp),%r13
+	leaq	64(%rsp),%rbx
+	movq	96+16(%rsp),%r8
+	movq	96+24(%rsp),%r9
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	movq	0+0(%rsp),%rdx
+	movq	8+0(%rsp),%r14
+	leaq	-128+0(%rsp),%rsi
+	movq	16+0(%rsp),%r15
+	movq	24+0(%rsp),%r8
+.byte	102,72,15,126,207
+	call	__ecp_nistz256_sqr_montx
+	xorq	%r9,%r9
+	movq	%r12,%rax
+	addq	$-1,%r12
+	movq	%r13,%r10
+	adcq	%rsi,%r13
+	movq	%r14,%rcx
+	adcq	$0,%r14
+	movq	%r15,%r8
+	adcq	%rbp,%r15
+	adcq	$0,%r9
+	xorq	%rsi,%rsi
+	testq	$1,%rax
+
+	cmovzq	%rax,%r12
+	cmovzq	%r10,%r13
+	cmovzq	%rcx,%r14
+	cmovzq	%r8,%r15
+	cmovzq	%rsi,%r9
+
+	movq	%r13,%rax
+	shrq	$1,%r12
+	shlq	$63,%rax
+	movq	%r14,%r10
+	shrq	$1,%r13
+	orq	%rax,%r12
+	shlq	$63,%r10
+	movq	%r15,%rcx
+	shrq	$1,%r14
+	orq	%r10,%r13
+	shlq	$63,%rcx
+	movq	%r12,0(%rdi)
+	shrq	$1,%r15
+	movq	%r13,8(%rdi)
+	shlq	$63,%r9
+	orq	%rcx,%r14
+	orq	%r9,%r15
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+	movq	64(%rsp),%rdx
+	leaq	64(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_mul_by_2x
+
+	leaq	32(%rsp),%rbx
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_add_tox
+
+	movq	96(%rsp),%rdx
+	leaq	96(%rsp),%rbx
+	movq	0+0(%rsp),%r9
+	movq	8+0(%rsp),%r10
+	leaq	-128+0(%rsp),%rsi
+	movq	16+0(%rsp),%r11
+	movq	24+0(%rsp),%r12
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_mul_by_2x
+
+	movq	0+32(%rsp),%rdx
+	movq	8+32(%rsp),%r14
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r15
+	movq	24+32(%rsp),%r8
+.byte	102,72,15,126,199
+	call	__ecp_nistz256_sqr_montx
+
+	leaq	128(%rsp),%rbx
+	movq	%r14,%r8
+	movq	%r15,%r9
+	movq	%rsi,%r14
+	movq	%rbp,%r15
+	call	__ecp_nistz256_sub_fromx
+
+	movq	0+0(%rsp),%rax
+	movq	0+8(%rsp),%rbp
+	movq	0+16(%rsp),%rcx
+	movq	0+24(%rsp),%r10
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_subx
+
+	movq	32(%rsp),%rdx
+	leaq	32(%rsp),%rbx
+	movq	%r12,%r14
+	xorl	%ecx,%ecx
+	movq	%r12,0+0(%rsp)
+	movq	%r13,%r10
+	movq	%r13,0+8(%rsp)
+	cmovzq	%r8,%r11
+	movq	%r8,0+16(%rsp)
+	leaq	0-128(%rsp),%rsi
+	cmovzq	%r9,%r12
+	movq	%r9,0+24(%rsp)
+	movq	%r14,%r9
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+.byte	102,72,15,126,203
+.byte	102,72,15,126,207
+	call	__ecp_nistz256_sub_fromx
+
+	addq	$160+8,%rsp
+	popq	%r15
+	popq	%r14
+	popq	%r13
+	popq	%r12
+	popq	%rbx
+	popq	%rbp
+	.byte	0xf3,0xc3
+.size	ecp_nistz256_point_doublex,.-ecp_nistz256_point_doublex
+.type	ecp_nistz256_point_addx,@function
+.align	32
+ecp_nistz256_point_addx:
+.Lpoint_addx:
+	pushq	%rbp
+	pushq	%rbx
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	subq	$576+8,%rsp
+
+	movdqu	0(%rsi),%xmm0
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm3
+	movdqu	64(%rsi),%xmm4
+	movdqu	80(%rsi),%xmm5
+	movq	%rsi,%rbx
+	movq	%rdx,%rsi
+	movdqa	%xmm0,384(%rsp)
+	movdqa	%xmm1,384+16(%rsp)
+	movdqa	%xmm2,416(%rsp)
+	movdqa	%xmm3,416+16(%rsp)
+	movdqa	%xmm4,448(%rsp)
+	movdqa	%xmm5,448+16(%rsp)
+	por	%xmm4,%xmm5
+
+	movdqu	0(%rsi),%xmm0
+	pshufd	$0xb1,%xmm5,%xmm3
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	por	%xmm3,%xmm5
+	movdqu	48(%rsi),%xmm3
+	movq	64+0(%rsi),%rdx
+	movq	64+8(%rsi),%r14
+	movq	64+16(%rsi),%r15
+	movq	64+24(%rsi),%r8
+	movdqa	%xmm0,480(%rsp)
+	pshufd	$0x1e,%xmm5,%xmm4
+	movdqa	%xmm1,480+16(%rsp)
+	movdqu	64(%rsi),%xmm0
+	movdqu	80(%rsi),%xmm1
+	movdqa	%xmm2,512(%rsp)
+	movdqa	%xmm3,512+16(%rsp)
+	por	%xmm4,%xmm5
+	pxor	%xmm4,%xmm4
+	por	%xmm0,%xmm1
+.byte	102,72,15,110,199
+
+	leaq	64-128(%rsi),%rsi
+	movq	%rdx,544+0(%rsp)
+	movq	%r14,544+8(%rsp)
+	movq	%r15,544+16(%rsp)
+	movq	%r8,544+24(%rsp)
+	leaq	96(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	pcmpeqd	%xmm4,%xmm5
+	pshufd	$0xb1,%xmm1,%xmm4
+	por	%xmm1,%xmm4
+	pshufd	$0,%xmm5,%xmm5
+	pshufd	$0x1e,%xmm4,%xmm3
+	por	%xmm3,%xmm4
+	pxor	%xmm3,%xmm3
+	pcmpeqd	%xmm3,%xmm4
+	pshufd	$0,%xmm4,%xmm4
+	movq	64+0(%rbx),%rdx
+	movq	64+8(%rbx),%r14
+	movq	64+16(%rbx),%r15
+	movq	64+24(%rbx),%r8
+.byte	102,72,15,110,203
+
+	leaq	64-128(%rbx),%rsi
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	544(%rsp),%rdx
+	leaq	544(%rsp),%rbx
+	movq	0+96(%rsp),%r9
+	movq	8+96(%rsp),%r10
+	leaq	-128+96(%rsp),%rsi
+	movq	16+96(%rsp),%r11
+	movq	24+96(%rsp),%r12
+	leaq	224(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	448(%rsp),%rdx
+	leaq	448(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	416(%rsp),%rdx
+	leaq	416(%rsp),%rbx
+	movq	0+224(%rsp),%r9
+	movq	8+224(%rsp),%r10
+	leaq	-128+224(%rsp),%rsi
+	movq	16+224(%rsp),%r11
+	movq	24+224(%rsp),%r12
+	leaq	224(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	512(%rsp),%rdx
+	leaq	512(%rsp),%rbx
+	movq	0+256(%rsp),%r9
+	movq	8+256(%rsp),%r10
+	leaq	-128+256(%rsp),%rsi
+	movq	16+256(%rsp),%r11
+	movq	24+256(%rsp),%r12
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	224(%rsp),%rbx
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	orq	%r13,%r12
+	movdqa	%xmm4,%xmm2
+	orq	%r8,%r12
+	orq	%r9,%r12
+	por	%xmm5,%xmm2
+.byte	102,73,15,110,220
+
+	movq	384(%rsp),%rdx
+	leaq	384(%rsp),%rbx
+	movq	0+96(%rsp),%r9
+	movq	8+96(%rsp),%r10
+	leaq	-128+96(%rsp),%rsi
+	movq	16+96(%rsp),%r11
+	movq	24+96(%rsp),%r12
+	leaq	160(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	480(%rsp),%rdx
+	leaq	480(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	192(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	160(%rsp),%rbx
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	orq	%r13,%r12
+	orq	%r8,%r12
+	orq	%r9,%r12
+
+.byte	0x3e
+	jnz	.Ladd_proceedx
+.byte	102,73,15,126,208
+.byte	102,73,15,126,217
+	testq	%r8,%r8
+	jnz	.Ladd_proceedx
+	testq	%r9,%r9
+	jz	.Ladd_doublex
+
+.byte	102,72,15,126,199
+	pxor	%xmm0,%xmm0
+	movdqu	%xmm0,0(%rdi)
+	movdqu	%xmm0,16(%rdi)
+	movdqu	%xmm0,32(%rdi)
+	movdqu	%xmm0,48(%rdi)
+	movdqu	%xmm0,64(%rdi)
+	movdqu	%xmm0,80(%rdi)
+	jmp	.Ladd_donex
+
+.align	32
+.Ladd_doublex:
+.byte	102,72,15,126,206
+.byte	102,72,15,126,199
+	addq	$416,%rsp
+	jmp	.Lpoint_double_shortcutx
+
+.align	32
+.Ladd_proceedx:
+	movq	0+64(%rsp),%rdx
+	movq	8+64(%rsp),%r14
+	leaq	-128+64(%rsp),%rsi
+	movq	16+64(%rsp),%r15
+	movq	24+64(%rsp),%r8
+	leaq	96(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	448(%rsp),%rdx
+	leaq	448(%rsp),%rbx
+	movq	0+0(%rsp),%r9
+	movq	8+0(%rsp),%r10
+	leaq	-128+0(%rsp),%rsi
+	movq	16+0(%rsp),%r11
+	movq	24+0(%rsp),%r12
+	leaq	352(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	0+0(%rsp),%rdx
+	movq	8+0(%rsp),%r14
+	leaq	-128+0(%rsp),%rsi
+	movq	16+0(%rsp),%r15
+	movq	24+0(%rsp),%r8
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	544(%rsp),%rdx
+	leaq	544(%rsp),%rbx
+	movq	0+352(%rsp),%r9
+	movq	8+352(%rsp),%r10
+	leaq	-128+352(%rsp),%rsi
+	movq	16+352(%rsp),%r11
+	movq	24+352(%rsp),%r12
+	leaq	352(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	0(%rsp),%rdx
+	leaq	0(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	160(%rsp),%rdx
+	leaq	160(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	192(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+
+
+
+	xorq	%r11,%r11
+	addq	%r12,%r12
+	leaq	96(%rsp),%rsi
+	adcq	%r13,%r13
+	movq	%r12,%rax
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	movq	0(%rsi),%rax
+	cmovcq	%rbp,%r13
+	movq	8(%rsi),%rbp
+	cmovcq	%rcx,%r8
+	movq	16(%rsi),%rcx
+	cmovcq	%r10,%r9
+	movq	24(%rsi),%r10
+
+	call	__ecp_nistz256_subx
+
+	leaq	128(%rsp),%rbx
+	leaq	288(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	movq	192+0(%rsp),%rax
+	movq	192+8(%rsp),%rbp
+	movq	192+16(%rsp),%rcx
+	movq	192+24(%rsp),%r10
+	leaq	320(%rsp),%rdi
+
+	call	__ecp_nistz256_subx
+
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+	movq	128(%rsp),%rdx
+	leaq	128(%rsp),%rbx
+	movq	0+224(%rsp),%r9
+	movq	8+224(%rsp),%r10
+	leaq	-128+224(%rsp),%rsi
+	movq	16+224(%rsp),%r11
+	movq	24+224(%rsp),%r12
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	320(%rsp),%rdx
+	leaq	320(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	-128+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	320(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	256(%rsp),%rbx
+	leaq	320(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+.byte	102,72,15,126,199
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	352(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	352+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	544(%rsp),%xmm2
+	pand	544+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	448(%rsp),%xmm2
+	pand	448+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,64(%rdi)
+	movdqu	%xmm3,80(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	288(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	288+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	480(%rsp),%xmm2
+	pand	480+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	384(%rsp),%xmm2
+	pand	384+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	320(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	320+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	512(%rsp),%xmm2
+	pand	512+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	416(%rsp),%xmm2
+	pand	416+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm3,48(%rdi)
+
+.Ladd_donex:
+	addq	$576+8,%rsp
+	popq	%r15
+	popq	%r14
+	popq	%r13
+	popq	%r12
+	popq	%rbx
+	popq	%rbp
+	.byte	0xf3,0xc3
+.size	ecp_nistz256_point_addx,.-ecp_nistz256_point_addx
+.type	ecp_nistz256_point_add_affinex,@function
+.align	32
+ecp_nistz256_point_add_affinex:
+.Lpoint_add_affinex:
+	pushq	%rbp
+	pushq	%rbx
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	subq	$480+8,%rsp
+
+	movdqu	0(%rsi),%xmm0
+	movq	%rdx,%rbx
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm3
+	movdqu	64(%rsi),%xmm4
+	movdqu	80(%rsi),%xmm5
+	movq	64+0(%rsi),%rdx
+	movq	64+8(%rsi),%r14
+	movq	64+16(%rsi),%r15
+	movq	64+24(%rsi),%r8
+	movdqa	%xmm0,320(%rsp)
+	movdqa	%xmm1,320+16(%rsp)
+	movdqa	%xmm2,352(%rsp)
+	movdqa	%xmm3,352+16(%rsp)
+	movdqa	%xmm4,384(%rsp)
+	movdqa	%xmm5,384+16(%rsp)
+	por	%xmm4,%xmm5
+
+	movdqu	0(%rbx),%xmm0
+	pshufd	$0xb1,%xmm5,%xmm3
+	movdqu	16(%rbx),%xmm1
+	movdqu	32(%rbx),%xmm2
+	por	%xmm3,%xmm5
+	movdqu	48(%rbx),%xmm3
+	movdqa	%xmm0,416(%rsp)
+	pshufd	$0x1e,%xmm5,%xmm4
+	movdqa	%xmm1,416+16(%rsp)
+	por	%xmm0,%xmm1
+.byte	102,72,15,110,199
+	movdqa	%xmm2,448(%rsp)
+	movdqa	%xmm3,448+16(%rsp)
+	por	%xmm2,%xmm3
+	por	%xmm4,%xmm5
+	pxor	%xmm4,%xmm4
+	por	%xmm1,%xmm3
+
+	leaq	64-128(%rsi),%rsi
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	pcmpeqd	%xmm4,%xmm5
+	pshufd	$0xb1,%xmm3,%xmm4
+	movq	0(%rbx),%rdx
+
+	movq	%r12,%r9
+	por	%xmm3,%xmm4
+	pshufd	$0,%xmm5,%xmm5
+	pshufd	$0x1e,%xmm4,%xmm3
+	movq	%r13,%r10
+	por	%xmm3,%xmm4
+	pxor	%xmm3,%xmm3
+	movq	%r14,%r11
+	pcmpeqd	%xmm3,%xmm4
+	pshufd	$0,%xmm4,%xmm4
+
+	leaq	32-128(%rsp),%rsi
+	movq	%r15,%r12
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	320(%rsp),%rbx
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	movq	384(%rsp),%rdx
+	leaq	384(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	384(%rsp),%rdx
+	leaq	384(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	-128+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	288(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	448(%rsp),%rdx
+	leaq	448(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	352(%rsp),%rbx
+	leaq	96(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	movq	0+64(%rsp),%rdx
+	movq	8+64(%rsp),%r14
+	leaq	-128+64(%rsp),%rsi
+	movq	16+64(%rsp),%r15
+	movq	24+64(%rsp),%r8
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	0+96(%rsp),%rdx
+	movq	8+96(%rsp),%r14
+	leaq	-128+96(%rsp),%rsi
+	movq	16+96(%rsp),%r15
+	movq	24+96(%rsp),%r8
+	leaq	192(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	128(%rsp),%rdx
+	leaq	128(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	-128+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	160(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	320(%rsp),%rdx
+	leaq	320(%rsp),%rbx
+	movq	0+128(%rsp),%r9
+	movq	8+128(%rsp),%r10
+	leaq	-128+128(%rsp),%rsi
+	movq	16+128(%rsp),%r11
+	movq	24+128(%rsp),%r12
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+
+
+
+	xorq	%r11,%r11
+	addq	%r12,%r12
+	leaq	192(%rsp),%rsi
+	adcq	%r13,%r13
+	movq	%r12,%rax
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	movq	0(%rsi),%rax
+	cmovcq	%rbp,%r13
+	movq	8(%rsi),%rbp
+	cmovcq	%rcx,%r8
+	movq	16(%rsi),%rcx
+	cmovcq	%r10,%r9
+	movq	24(%rsi),%r10
+
+	call	__ecp_nistz256_subx
+
+	leaq	160(%rsp),%rbx
+	leaq	224(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	movq	0+0(%rsp),%rax
+	movq	0+8(%rsp),%rbp
+	movq	0+16(%rsp),%rcx
+	movq	0+24(%rsp),%r10
+	leaq	64(%rsp),%rdi
+
+	call	__ecp_nistz256_subx
+
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+	movq	352(%rsp),%rdx
+	leaq	352(%rsp),%rbx
+	movq	0+160(%rsp),%r9
+	movq	8+160(%rsp),%r10
+	leaq	-128+160(%rsp),%rsi
+	movq	16+160(%rsp),%r11
+	movq	24+160(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	96(%rsp),%rdx
+	leaq	96(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	-128+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	32(%rsp),%rbx
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+.byte	102,72,15,126,199
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	288(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	288+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	.LONE_mont(%rip),%xmm2
+	pand	.LONE_mont+16(%rip),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	384(%rsp),%xmm2
+	pand	384+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,64(%rdi)
+	movdqu	%xmm3,80(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	224(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	224+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	416(%rsp),%xmm2
+	pand	416+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	320(%rsp),%xmm2
+	pand	320+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	256(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	256+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	448(%rsp),%xmm2
+	pand	448+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	352(%rsp),%xmm2
+	pand	352+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm3,48(%rdi)
+
+	addq	$480+8,%rsp
+	popq	%r15
+	popq	%r14
+	popq	%r13
+	popq	%r12
+	popq	%rbx
+	popq	%rbp
+	.byte	0xf3,0xc3
+.size	ecp_nistz256_point_add_affinex,.-ecp_nistz256_point_add_affinex


Property changes on: trunk/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/amd64/rsaz-avx2.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/rsaz-avx2.S	                        (rev 0)
+++ trunk/secure/lib/libcrypto/amd64/rsaz-avx2.S	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,1700 @@
+/* $MidnightBSD$ */
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/rsaz-avx2.S 326663 2017-12-07 18:04:48Z jkim $ */
+/* Do not modify. This file is auto-generated from rsaz-avx2.pl. */
+.text	
+
+.globl	rsaz_1024_sqr_avx2
+.type	rsaz_1024_sqr_avx2,@function
+.align	64
+rsaz_1024_sqr_avx2:
+	leaq	(%rsp),%rax
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	vzeroupper
+	movq	%rax,%rbp
+	movq	%rdx,%r13
+	subq	$832,%rsp
+	movq	%r13,%r15
+	subq	$-128,%rdi
+	subq	$-128,%rsi
+	subq	$-128,%r13
+
+	andq	$4095,%r15
+	addq	$320,%r15
+	shrq	$12,%r15
+	vpxor	%ymm9,%ymm9,%ymm9
+	jz	.Lsqr_1024_no_n_copy
+
+
+
+
+
+	subq	$320,%rsp
+	vmovdqu	0-128(%r13),%ymm0
+	andq	$-2048,%rsp
+	vmovdqu	32-128(%r13),%ymm1
+	vmovdqu	64-128(%r13),%ymm2
+	vmovdqu	96-128(%r13),%ymm3
+	vmovdqu	128-128(%r13),%ymm4
+	vmovdqu	160-128(%r13),%ymm5
+	vmovdqu	192-128(%r13),%ymm6
+	vmovdqu	224-128(%r13),%ymm7
+	vmovdqu	256-128(%r13),%ymm8
+	leaq	832+128(%rsp),%r13
+	vmovdqu	%ymm0,0-128(%r13)
+	vmovdqu	%ymm1,32-128(%r13)
+	vmovdqu	%ymm2,64-128(%r13)
+	vmovdqu	%ymm3,96-128(%r13)
+	vmovdqu	%ymm4,128-128(%r13)
+	vmovdqu	%ymm5,160-128(%r13)
+	vmovdqu	%ymm6,192-128(%r13)
+	vmovdqu	%ymm7,224-128(%r13)
+	vmovdqu	%ymm8,256-128(%r13)
+	vmovdqu	%ymm9,288-128(%r13)
+
+.Lsqr_1024_no_n_copy:
+	andq	$-1024,%rsp
+
+	vmovdqu	32-128(%rsi),%ymm1
+	vmovdqu	64-128(%rsi),%ymm2
+	vmovdqu	96-128(%rsi),%ymm3
+	vmovdqu	128-128(%rsi),%ymm4
+	vmovdqu	160-128(%rsi),%ymm5
+	vmovdqu	192-128(%rsi),%ymm6
+	vmovdqu	224-128(%rsi),%ymm7
+	vmovdqu	256-128(%rsi),%ymm8
+
+	leaq	192(%rsp),%rbx
+	vmovdqu	.Land_mask(%rip),%ymm15
+	jmp	.LOOP_GRANDE_SQR_1024
+
+.align	32
+.LOOP_GRANDE_SQR_1024:
+	leaq	576+128(%rsp),%r9
+	leaq	448(%rsp),%r12
+
+
+
+
+	vpaddq	%ymm1,%ymm1,%ymm1
+	vpbroadcastq	0-128(%rsi),%ymm10
+	vpaddq	%ymm2,%ymm2,%ymm2
+	vmovdqa	%ymm1,0-128(%r9)
+	vpaddq	%ymm3,%ymm3,%ymm3
+	vmovdqa	%ymm2,32-128(%r9)
+	vpaddq	%ymm4,%ymm4,%ymm4
+	vmovdqa	%ymm3,64-128(%r9)
+	vpaddq	%ymm5,%ymm5,%ymm5
+	vmovdqa	%ymm4,96-128(%r9)
+	vpaddq	%ymm6,%ymm6,%ymm6
+	vmovdqa	%ymm5,128-128(%r9)
+	vpaddq	%ymm7,%ymm7,%ymm7
+	vmovdqa	%ymm6,160-128(%r9)
+	vpaddq	%ymm8,%ymm8,%ymm8
+	vmovdqa	%ymm7,192-128(%r9)
+	vpxor	%ymm9,%ymm9,%ymm9
+	vmovdqa	%ymm8,224-128(%r9)
+
+	vpmuludq	0-128(%rsi),%ymm10,%ymm0
+	vpbroadcastq	32-128(%rsi),%ymm11
+	vmovdqu	%ymm9,288-192(%rbx)
+	vpmuludq	%ymm10,%ymm1,%ymm1
+	vmovdqu	%ymm9,320-448(%r12)
+	vpmuludq	%ymm10,%ymm2,%ymm2
+	vmovdqu	%ymm9,352-448(%r12)
+	vpmuludq	%ymm10,%ymm3,%ymm3
+	vmovdqu	%ymm9,384-448(%r12)
+	vpmuludq	%ymm10,%ymm4,%ymm4
+	vmovdqu	%ymm9,416-448(%r12)
+	vpmuludq	%ymm10,%ymm5,%ymm5
+	vmovdqu	%ymm9,448-448(%r12)
+	vpmuludq	%ymm10,%ymm6,%ymm6
+	vmovdqu	%ymm9,480-448(%r12)
+	vpmuludq	%ymm10,%ymm7,%ymm7
+	vmovdqu	%ymm9,512-448(%r12)
+	vpmuludq	%ymm10,%ymm8,%ymm8
+	vpbroadcastq	64-128(%rsi),%ymm10
+	vmovdqu	%ymm9,544-448(%r12)
+
+	movq	%rsi,%r15
+	movl	$4,%r14d
+	jmp	.Lsqr_entry_1024
+.align	32
+.LOOP_SQR_1024:
+	vpbroadcastq	32-128(%r15),%ymm11
+	vpmuludq	0-128(%rsi),%ymm10,%ymm0
+	vpaddq	0-192(%rbx),%ymm0,%ymm0
+	vpmuludq	0-128(%r9),%ymm10,%ymm1
+	vpaddq	32-192(%rbx),%ymm1,%ymm1
+	vpmuludq	32-128(%r9),%ymm10,%ymm2
+	vpaddq	64-192(%rbx),%ymm2,%ymm2
+	vpmuludq	64-128(%r9),%ymm10,%ymm3
+	vpaddq	96-192(%rbx),%ymm3,%ymm3
+	vpmuludq	96-128(%r9),%ymm10,%ymm4
+	vpaddq	128-192(%rbx),%ymm4,%ymm4
+	vpmuludq	128-128(%r9),%ymm10,%ymm5
+	vpaddq	160-192(%rbx),%ymm5,%ymm5
+	vpmuludq	160-128(%r9),%ymm10,%ymm6
+	vpaddq	192-192(%rbx),%ymm6,%ymm6
+	vpmuludq	192-128(%r9),%ymm10,%ymm7
+	vpaddq	224-192(%rbx),%ymm7,%ymm7
+	vpmuludq	224-128(%r9),%ymm10,%ymm8
+	vpbroadcastq	64-128(%r15),%ymm10
+	vpaddq	256-192(%rbx),%ymm8,%ymm8
+.Lsqr_entry_1024:
+	vmovdqu	%ymm0,0-192(%rbx)
+	vmovdqu	%ymm1,32-192(%rbx)
+
+	vpmuludq	32-128(%rsi),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	32-128(%r9),%ymm11,%ymm14
+	vpaddq	%ymm14,%ymm3,%ymm3
+	vpmuludq	64-128(%r9),%ymm11,%ymm13
+	vpaddq	%ymm13,%ymm4,%ymm4
+	vpmuludq	96-128(%r9),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	128-128(%r9),%ymm11,%ymm14
+	vpaddq	%ymm14,%ymm6,%ymm6
+	vpmuludq	160-128(%r9),%ymm11,%ymm13
+	vpaddq	%ymm13,%ymm7,%ymm7
+	vpmuludq	192-128(%r9),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	224-128(%r9),%ymm11,%ymm0
+	vpbroadcastq	96-128(%r15),%ymm11
+	vpaddq	288-192(%rbx),%ymm0,%ymm0
+
+	vmovdqu	%ymm2,64-192(%rbx)
+	vmovdqu	%ymm3,96-192(%rbx)
+
+	vpmuludq	64-128(%rsi),%ymm10,%ymm13
+	vpaddq	%ymm13,%ymm4,%ymm4
+	vpmuludq	64-128(%r9),%ymm10,%ymm12
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	96-128(%r9),%ymm10,%ymm14
+	vpaddq	%ymm14,%ymm6,%ymm6
+	vpmuludq	128-128(%r9),%ymm10,%ymm13
+	vpaddq	%ymm13,%ymm7,%ymm7
+	vpmuludq	160-128(%r9),%ymm10,%ymm12
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	192-128(%r9),%ymm10,%ymm14
+	vpaddq	%ymm14,%ymm0,%ymm0
+	vpmuludq	224-128(%r9),%ymm10,%ymm1
+	vpbroadcastq	128-128(%r15),%ymm10
+	vpaddq	320-448(%r12),%ymm1,%ymm1
+
+	vmovdqu	%ymm4,128-192(%rbx)
+	vmovdqu	%ymm5,160-192(%rbx)
+
+	vpmuludq	96-128(%rsi),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm6,%ymm6
+	vpmuludq	96-128(%r9),%ymm11,%ymm14
+	vpaddq	%ymm14,%ymm7,%ymm7
+	vpmuludq	128-128(%r9),%ymm11,%ymm13
+	vpaddq	%ymm13,%ymm8,%ymm8
+	vpmuludq	160-128(%r9),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm0,%ymm0
+	vpmuludq	192-128(%r9),%ymm11,%ymm14
+	vpaddq	%ymm14,%ymm1,%ymm1
+	vpmuludq	224-128(%r9),%ymm11,%ymm2
+	vpbroadcastq	160-128(%r15),%ymm11
+	vpaddq	352-448(%r12),%ymm2,%ymm2
+
+	vmovdqu	%ymm6,192-192(%rbx)
+	vmovdqu	%ymm7,224-192(%rbx)
+
+	vpmuludq	128-128(%rsi),%ymm10,%ymm12
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	128-128(%r9),%ymm10,%ymm14
+	vpaddq	%ymm14,%ymm0,%ymm0
+	vpmuludq	160-128(%r9),%ymm10,%ymm13
+	vpaddq	%ymm13,%ymm1,%ymm1
+	vpmuludq	192-128(%r9),%ymm10,%ymm12
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	224-128(%r9),%ymm10,%ymm3
+	vpbroadcastq	192-128(%r15),%ymm10
+	vpaddq	384-448(%r12),%ymm3,%ymm3
+
+	vmovdqu	%ymm8,256-192(%rbx)
+	vmovdqu	%ymm0,288-192(%rbx)
+	leaq	8(%rbx),%rbx
+
+	vpmuludq	160-128(%rsi),%ymm11,%ymm13
+	vpaddq	%ymm13,%ymm1,%ymm1
+	vpmuludq	160-128(%r9),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	192-128(%r9),%ymm11,%ymm14
+	vpaddq	%ymm14,%ymm3,%ymm3
+	vpmuludq	224-128(%r9),%ymm11,%ymm4
+	vpbroadcastq	224-128(%r15),%ymm11
+	vpaddq	416-448(%r12),%ymm4,%ymm4
+
+	vmovdqu	%ymm1,320-448(%r12)
+	vmovdqu	%ymm2,352-448(%r12)
+
+	vpmuludq	192-128(%rsi),%ymm10,%ymm12
+	vpaddq	%ymm12,%ymm3,%ymm3
+	vpmuludq	192-128(%r9),%ymm10,%ymm14
+	vpbroadcastq	256-128(%r15),%ymm0
+	vpaddq	%ymm14,%ymm4,%ymm4
+	vpmuludq	224-128(%r9),%ymm10,%ymm5
+	vpbroadcastq	0+8-128(%r15),%ymm10
+	vpaddq	448-448(%r12),%ymm5,%ymm5
+
+	vmovdqu	%ymm3,384-448(%r12)
+	vmovdqu	%ymm4,416-448(%r12)
+	leaq	8(%r15),%r15
+
+	vpmuludq	224-128(%rsi),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	224-128(%r9),%ymm11,%ymm6
+	vpaddq	480-448(%r12),%ymm6,%ymm6
+
+	vpmuludq	256-128(%rsi),%ymm0,%ymm7
+	vmovdqu	%ymm5,448-448(%r12)
+	vpaddq	512-448(%r12),%ymm7,%ymm7
+	vmovdqu	%ymm6,480-448(%r12)
+	vmovdqu	%ymm7,512-448(%r12)
+	leaq	8(%r12),%r12
+
+	decl	%r14d
+	jnz	.LOOP_SQR_1024
+
+	vmovdqu	256(%rsp),%ymm8
+	vmovdqu	288(%rsp),%ymm1
+	vmovdqu	320(%rsp),%ymm2
+	leaq	192(%rsp),%rbx
+
+	vpsrlq	$29,%ymm8,%ymm14
+	vpand	%ymm15,%ymm8,%ymm8
+	vpsrlq	$29,%ymm1,%ymm11
+	vpand	%ymm15,%ymm1,%ymm1
+
+	vpermq	$0x93,%ymm14,%ymm14
+	vpxor	%ymm9,%ymm9,%ymm9
+	vpermq	$0x93,%ymm11,%ymm11
+
+	vpblendd	$3,%ymm9,%ymm14,%ymm10
+	vpblendd	$3,%ymm14,%ymm11,%ymm14
+	vpaddq	%ymm10,%ymm8,%ymm8
+	vpblendd	$3,%ymm11,%ymm9,%ymm11
+	vpaddq	%ymm14,%ymm1,%ymm1
+	vpaddq	%ymm11,%ymm2,%ymm2
+	vmovdqu	%ymm1,288-192(%rbx)
+	vmovdqu	%ymm2,320-192(%rbx)
+
+	movq	(%rsp),%rax
+	movq	8(%rsp),%r10
+	movq	16(%rsp),%r11
+	movq	24(%rsp),%r12
+	vmovdqu	32(%rsp),%ymm1
+	vmovdqu	64-192(%rbx),%ymm2
+	vmovdqu	96-192(%rbx),%ymm3
+	vmovdqu	128-192(%rbx),%ymm4
+	vmovdqu	160-192(%rbx),%ymm5
+	vmovdqu	192-192(%rbx),%ymm6
+	vmovdqu	224-192(%rbx),%ymm7
+
+	movq	%rax,%r9
+	imull	%ecx,%eax
+	andl	$0x1fffffff,%eax
+	vmovd	%eax,%xmm12
+
+	movq	%rax,%rdx
+	imulq	-128(%r13),%rax
+	vpbroadcastq	%xmm12,%ymm12
+	addq	%rax,%r9
+	movq	%rdx,%rax
+	imulq	8-128(%r13),%rax
+	shrq	$29,%r9
+	addq	%rax,%r10
+	movq	%rdx,%rax
+	imulq	16-128(%r13),%rax
+	addq	%r9,%r10
+	addq	%rax,%r11
+	imulq	24-128(%r13),%rdx
+	addq	%rdx,%r12
+
+	movq	%r10,%rax
+	imull	%ecx,%eax
+	andl	$0x1fffffff,%eax
+
+	movl	$9,%r14d
+	jmp	.LOOP_REDUCE_1024
+
+.align	32
+.LOOP_REDUCE_1024:
+	vmovd	%eax,%xmm13
+	vpbroadcastq	%xmm13,%ymm13
+
+	vpmuludq	32-128(%r13),%ymm12,%ymm10
+	movq	%rax,%rdx
+	imulq	-128(%r13),%rax
+	vpaddq	%ymm10,%ymm1,%ymm1
+	addq	%rax,%r10
+	vpmuludq	64-128(%r13),%ymm12,%ymm14
+	movq	%rdx,%rax
+	imulq	8-128(%r13),%rax
+	vpaddq	%ymm14,%ymm2,%ymm2
+	vpmuludq	96-128(%r13),%ymm12,%ymm11
+.byte	0x67
+	addq	%rax,%r11
+.byte	0x67
+	movq	%rdx,%rax
+	imulq	16-128(%r13),%rax
+	shrq	$29,%r10
+	vpaddq	%ymm11,%ymm3,%ymm3
+	vpmuludq	128-128(%r13),%ymm12,%ymm10
+	addq	%rax,%r12
+	addq	%r10,%r11
+	vpaddq	%ymm10,%ymm4,%ymm4
+	vpmuludq	160-128(%r13),%ymm12,%ymm14
+	movq	%r11,%rax
+	imull	%ecx,%eax
+	vpaddq	%ymm14,%ymm5,%ymm5
+	vpmuludq	192-128(%r13),%ymm12,%ymm11
+	andl	$0x1fffffff,%eax
+	vpaddq	%ymm11,%ymm6,%ymm6
+	vpmuludq	224-128(%r13),%ymm12,%ymm10
+	vpaddq	%ymm10,%ymm7,%ymm7
+	vpmuludq	256-128(%r13),%ymm12,%ymm14
+	vmovd	%eax,%xmm12
+
+	vpaddq	%ymm14,%ymm8,%ymm8
+
+	vpbroadcastq	%xmm12,%ymm12
+
+	vpmuludq	32-8-128(%r13),%ymm13,%ymm11
+	vmovdqu	96-8-128(%r13),%ymm14
+	movq	%rax,%rdx
+	imulq	-128(%r13),%rax
+	vpaddq	%ymm11,%ymm1,%ymm1
+	vpmuludq	64-8-128(%r13),%ymm13,%ymm10
+	vmovdqu	128-8-128(%r13),%ymm11
+	addq	%rax,%r11
+	movq	%rdx,%rax
+	imulq	8-128(%r13),%rax
+	vpaddq	%ymm10,%ymm2,%ymm2
+	addq	%r12,%rax
+	shrq	$29,%r11
+	vpmuludq	%ymm13,%ymm14,%ymm14
+	vmovdqu	160-8-128(%r13),%ymm10
+	addq	%r11,%rax
+	vpaddq	%ymm14,%ymm3,%ymm3
+	vpmuludq	%ymm13,%ymm11,%ymm11
+	vmovdqu	192-8-128(%r13),%ymm14
+.byte	0x67
+	movq	%rax,%r12
+	imull	%ecx,%eax
+	vpaddq	%ymm11,%ymm4,%ymm4
+	vpmuludq	%ymm13,%ymm10,%ymm10
+.byte	0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00
+	andl	$0x1fffffff,%eax
+	vpaddq	%ymm10,%ymm5,%ymm5
+	vpmuludq	%ymm13,%ymm14,%ymm14
+	vmovdqu	256-8-128(%r13),%ymm10
+	vpaddq	%ymm14,%ymm6,%ymm6
+	vpmuludq	%ymm13,%ymm11,%ymm11
+	vmovdqu	288-8-128(%r13),%ymm9
+	vmovd	%eax,%xmm0
+	imulq	-128(%r13),%rax
+	vpaddq	%ymm11,%ymm7,%ymm7
+	vpmuludq	%ymm13,%ymm10,%ymm10
+	vmovdqu	32-16-128(%r13),%ymm14
+	vpbroadcastq	%xmm0,%ymm0
+	vpaddq	%ymm10,%ymm8,%ymm8
+	vpmuludq	%ymm13,%ymm9,%ymm9
+	vmovdqu	64-16-128(%r13),%ymm11
+	addq	%rax,%r12
+
+	vmovdqu	32-24-128(%r13),%ymm13
+	vpmuludq	%ymm12,%ymm14,%ymm14
+	vmovdqu	96-16-128(%r13),%ymm10
+	vpaddq	%ymm14,%ymm1,%ymm1
+	vpmuludq	%ymm0,%ymm13,%ymm13
+	vpmuludq	%ymm12,%ymm11,%ymm11
+.byte	0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff
+	vpaddq	%ymm1,%ymm13,%ymm13
+	vpaddq	%ymm11,%ymm2,%ymm2
+	vpmuludq	%ymm12,%ymm10,%ymm10
+	vmovdqu	160-16-128(%r13),%ymm11
+.byte	0x67
+	vmovq	%xmm13,%rax
+	vmovdqu	%ymm13,(%rsp)
+	vpaddq	%ymm10,%ymm3,%ymm3
+	vpmuludq	%ymm12,%ymm14,%ymm14
+	vmovdqu	192-16-128(%r13),%ymm10
+	vpaddq	%ymm14,%ymm4,%ymm4
+	vpmuludq	%ymm12,%ymm11,%ymm11
+	vmovdqu	224-16-128(%r13),%ymm14
+	vpaddq	%ymm11,%ymm5,%ymm5
+	vpmuludq	%ymm12,%ymm10,%ymm10
+	vmovdqu	256-16-128(%r13),%ymm11
+	vpaddq	%ymm10,%ymm6,%ymm6
+	vpmuludq	%ymm12,%ymm14,%ymm14
+	shrq	$29,%r12
+	vmovdqu	288-16-128(%r13),%ymm10
+	addq	%r12,%rax
+	vpaddq	%ymm14,%ymm7,%ymm7
+	vpmuludq	%ymm12,%ymm11,%ymm11
+
+	movq	%rax,%r9
+	imull	%ecx,%eax
+	vpaddq	%ymm11,%ymm8,%ymm8
+	vpmuludq	%ymm12,%ymm10,%ymm10
+	andl	$0x1fffffff,%eax
+	vmovd	%eax,%xmm12
+	vmovdqu	96-24-128(%r13),%ymm11
+.byte	0x67
+	vpaddq	%ymm10,%ymm9,%ymm9
+	vpbroadcastq	%xmm12,%ymm12
+
+	vpmuludq	64-24-128(%r13),%ymm0,%ymm14
+	vmovdqu	128-24-128(%r13),%ymm10
+	movq	%rax,%rdx
+	imulq	-128(%r13),%rax
+	movq	8(%rsp),%r10
+	vpaddq	%ymm14,%ymm2,%ymm1
+	vpmuludq	%ymm0,%ymm11,%ymm11
+	vmovdqu	160-24-128(%r13),%ymm14
+	addq	%rax,%r9
+	movq	%rdx,%rax
+	imulq	8-128(%r13),%rax
+.byte	0x67
+	shrq	$29,%r9
+	movq	16(%rsp),%r11
+	vpaddq	%ymm11,%ymm3,%ymm2
+	vpmuludq	%ymm0,%ymm10,%ymm10
+	vmovdqu	192-24-128(%r13),%ymm11
+	addq	%rax,%r10
+	movq	%rdx,%rax
+	imulq	16-128(%r13),%rax
+	vpaddq	%ymm10,%ymm4,%ymm3
+	vpmuludq	%ymm0,%ymm14,%ymm14
+	vmovdqu	224-24-128(%r13),%ymm10
+	imulq	24-128(%r13),%rdx
+	addq	%rax,%r11
+	leaq	(%r9,%r10,1),%rax
+	vpaddq	%ymm14,%ymm5,%ymm4
+	vpmuludq	%ymm0,%ymm11,%ymm11
+	vmovdqu	256-24-128(%r13),%ymm14
+	movq	%rax,%r10
+	imull	%ecx,%eax
+	vpmuludq	%ymm0,%ymm10,%ymm10
+	vpaddq	%ymm11,%ymm6,%ymm5
+	vmovdqu	288-24-128(%r13),%ymm11
+	andl	$0x1fffffff,%eax
+	vpaddq	%ymm10,%ymm7,%ymm6
+	vpmuludq	%ymm0,%ymm14,%ymm14
+	addq	24(%rsp),%rdx
+	vpaddq	%ymm14,%ymm8,%ymm7
+	vpmuludq	%ymm0,%ymm11,%ymm11
+	vpaddq	%ymm11,%ymm9,%ymm8
+	vmovq	%r12,%xmm9
+	movq	%rdx,%r12
+
+	decl	%r14d
+	jnz	.LOOP_REDUCE_1024
+	leaq	448(%rsp),%r12
+	vpaddq	%ymm9,%ymm13,%ymm0
+	vpxor	%ymm9,%ymm9,%ymm9
+
+	vpaddq	288-192(%rbx),%ymm0,%ymm0
+	vpaddq	320-448(%r12),%ymm1,%ymm1
+	vpaddq	352-448(%r12),%ymm2,%ymm2
+	vpaddq	384-448(%r12),%ymm3,%ymm3
+	vpaddq	416-448(%r12),%ymm4,%ymm4
+	vpaddq	448-448(%r12),%ymm5,%ymm5
+	vpaddq	480-448(%r12),%ymm6,%ymm6
+	vpaddq	512-448(%r12),%ymm7,%ymm7
+	vpaddq	544-448(%r12),%ymm8,%ymm8
+
+	vpsrlq	$29,%ymm0,%ymm14
+	vpand	%ymm15,%ymm0,%ymm0
+	vpsrlq	$29,%ymm1,%ymm11
+	vpand	%ymm15,%ymm1,%ymm1
+	vpsrlq	$29,%ymm2,%ymm12
+	vpermq	$0x93,%ymm14,%ymm14
+	vpand	%ymm15,%ymm2,%ymm2
+	vpsrlq	$29,%ymm3,%ymm13
+	vpermq	$0x93,%ymm11,%ymm11
+	vpand	%ymm15,%ymm3,%ymm3
+	vpermq	$0x93,%ymm12,%ymm12
+
+	vpblendd	$3,%ymm9,%ymm14,%ymm10
+	vpermq	$0x93,%ymm13,%ymm13
+	vpblendd	$3,%ymm14,%ymm11,%ymm14
+	vpaddq	%ymm10,%ymm0,%ymm0
+	vpblendd	$3,%ymm11,%ymm12,%ymm11
+	vpaddq	%ymm14,%ymm1,%ymm1
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm11,%ymm2,%ymm2
+	vpblendd	$3,%ymm13,%ymm9,%ymm13
+	vpaddq	%ymm12,%ymm3,%ymm3
+	vpaddq	%ymm13,%ymm4,%ymm4
+
+	vpsrlq	$29,%ymm0,%ymm14
+	vpand	%ymm15,%ymm0,%ymm0
+	vpsrlq	$29,%ymm1,%ymm11
+	vpand	%ymm15,%ymm1,%ymm1
+	vpsrlq	$29,%ymm2,%ymm12
+	vpermq	$0x93,%ymm14,%ymm14
+	vpand	%ymm15,%ymm2,%ymm2
+	vpsrlq	$29,%ymm3,%ymm13
+	vpermq	$0x93,%ymm11,%ymm11
+	vpand	%ymm15,%ymm3,%ymm3
+	vpermq	$0x93,%ymm12,%ymm12
+
+	vpblendd	$3,%ymm9,%ymm14,%ymm10
+	vpermq	$0x93,%ymm13,%ymm13
+	vpblendd	$3,%ymm14,%ymm11,%ymm14
+	vpaddq	%ymm10,%ymm0,%ymm0
+	vpblendd	$3,%ymm11,%ymm12,%ymm11
+	vpaddq	%ymm14,%ymm1,%ymm1
+	vmovdqu	%ymm0,0-128(%rdi)
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm11,%ymm2,%ymm2
+	vmovdqu	%ymm1,32-128(%rdi)
+	vpblendd	$3,%ymm13,%ymm9,%ymm13
+	vpaddq	%ymm12,%ymm3,%ymm3
+	vmovdqu	%ymm2,64-128(%rdi)
+	vpaddq	%ymm13,%ymm4,%ymm4
+	vmovdqu	%ymm3,96-128(%rdi)
+	vpsrlq	$29,%ymm4,%ymm14
+	vpand	%ymm15,%ymm4,%ymm4
+	vpsrlq	$29,%ymm5,%ymm11
+	vpand	%ymm15,%ymm5,%ymm5
+	vpsrlq	$29,%ymm6,%ymm12
+	vpermq	$0x93,%ymm14,%ymm14
+	vpand	%ymm15,%ymm6,%ymm6
+	vpsrlq	$29,%ymm7,%ymm13
+	vpermq	$0x93,%ymm11,%ymm11
+	vpand	%ymm15,%ymm7,%ymm7
+	vpsrlq	$29,%ymm8,%ymm0
+	vpermq	$0x93,%ymm12,%ymm12
+	vpand	%ymm15,%ymm8,%ymm8
+	vpermq	$0x93,%ymm13,%ymm13
+
+	vpblendd	$3,%ymm9,%ymm14,%ymm10
+	vpermq	$0x93,%ymm0,%ymm0
+	vpblendd	$3,%ymm14,%ymm11,%ymm14
+	vpaddq	%ymm10,%ymm4,%ymm4
+	vpblendd	$3,%ymm11,%ymm12,%ymm11
+	vpaddq	%ymm14,%ymm5,%ymm5
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm11,%ymm6,%ymm6
+	vpblendd	$3,%ymm13,%ymm0,%ymm13
+	vpaddq	%ymm12,%ymm7,%ymm7
+	vpaddq	%ymm13,%ymm8,%ymm8
+
+	vpsrlq	$29,%ymm4,%ymm14
+	vpand	%ymm15,%ymm4,%ymm4
+	vpsrlq	$29,%ymm5,%ymm11
+	vpand	%ymm15,%ymm5,%ymm5
+	vpsrlq	$29,%ymm6,%ymm12
+	vpermq	$0x93,%ymm14,%ymm14
+	vpand	%ymm15,%ymm6,%ymm6
+	vpsrlq	$29,%ymm7,%ymm13
+	vpermq	$0x93,%ymm11,%ymm11
+	vpand	%ymm15,%ymm7,%ymm7
+	vpsrlq	$29,%ymm8,%ymm0
+	vpermq	$0x93,%ymm12,%ymm12
+	vpand	%ymm15,%ymm8,%ymm8
+	vpermq	$0x93,%ymm13,%ymm13
+
+	vpblendd	$3,%ymm9,%ymm14,%ymm10
+	vpermq	$0x93,%ymm0,%ymm0
+	vpblendd	$3,%ymm14,%ymm11,%ymm14
+	vpaddq	%ymm10,%ymm4,%ymm4
+	vpblendd	$3,%ymm11,%ymm12,%ymm11
+	vpaddq	%ymm14,%ymm5,%ymm5
+	vmovdqu	%ymm4,128-128(%rdi)
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm11,%ymm6,%ymm6
+	vmovdqu	%ymm5,160-128(%rdi)
+	vpblendd	$3,%ymm13,%ymm0,%ymm13
+	vpaddq	%ymm12,%ymm7,%ymm7
+	vmovdqu	%ymm6,192-128(%rdi)
+	vpaddq	%ymm13,%ymm8,%ymm8
+	vmovdqu	%ymm7,224-128(%rdi)
+	vmovdqu	%ymm8,256-128(%rdi)
+
+	movq	%rdi,%rsi
+	decl	%r8d
+	jne	.LOOP_GRANDE_SQR_1024
+
+	vzeroall
+	movq	%rbp,%rax
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+.Lsqr_1024_epilogue:
+	.byte	0xf3,0xc3
+.size	rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
+.globl	rsaz_1024_mul_avx2
+.type	rsaz_1024_mul_avx2,@function
+.align	64
+rsaz_1024_mul_avx2:
+	leaq	(%rsp),%rax
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	movq	%rax,%rbp
+	vzeroall
+	movq	%rdx,%r13
+	subq	$64,%rsp
+
+
+
+
+
+
+.byte	0x67,0x67
+	movq	%rsi,%r15
+	andq	$4095,%r15
+	addq	$320,%r15
+	shrq	$12,%r15
+	movq	%rsi,%r15
+	cmovnzq	%r13,%rsi
+	cmovnzq	%r15,%r13
+
+	movq	%rcx,%r15
+	subq	$-128,%rsi
+	subq	$-128,%rcx
+	subq	$-128,%rdi
+
+	andq	$4095,%r15
+	addq	$320,%r15
+.byte	0x67,0x67
+	shrq	$12,%r15
+	jz	.Lmul_1024_no_n_copy
+
+
+
+
+
+	subq	$320,%rsp
+	vmovdqu	0-128(%rcx),%ymm0
+	andq	$-512,%rsp
+	vmovdqu	32-128(%rcx),%ymm1
+	vmovdqu	64-128(%rcx),%ymm2
+	vmovdqu	96-128(%rcx),%ymm3
+	vmovdqu	128-128(%rcx),%ymm4
+	vmovdqu	160-128(%rcx),%ymm5
+	vmovdqu	192-128(%rcx),%ymm6
+	vmovdqu	224-128(%rcx),%ymm7
+	vmovdqu	256-128(%rcx),%ymm8
+	leaq	64+128(%rsp),%rcx
+	vmovdqu	%ymm0,0-128(%rcx)
+	vpxor	%ymm0,%ymm0,%ymm0
+	vmovdqu	%ymm1,32-128(%rcx)
+	vpxor	%ymm1,%ymm1,%ymm1
+	vmovdqu	%ymm2,64-128(%rcx)
+	vpxor	%ymm2,%ymm2,%ymm2
+	vmovdqu	%ymm3,96-128(%rcx)
+	vpxor	%ymm3,%ymm3,%ymm3
+	vmovdqu	%ymm4,128-128(%rcx)
+	vpxor	%ymm4,%ymm4,%ymm4
+	vmovdqu	%ymm5,160-128(%rcx)
+	vpxor	%ymm5,%ymm5,%ymm5
+	vmovdqu	%ymm6,192-128(%rcx)
+	vpxor	%ymm6,%ymm6,%ymm6
+	vmovdqu	%ymm7,224-128(%rcx)
+	vpxor	%ymm7,%ymm7,%ymm7
+	vmovdqu	%ymm8,256-128(%rcx)
+	vmovdqa	%ymm0,%ymm8
+	vmovdqu	%ymm9,288-128(%rcx)
+.Lmul_1024_no_n_copy:
+	andq	$-64,%rsp
+
+	movq	(%r13),%rbx
+	vpbroadcastq	(%r13),%ymm10
+	vmovdqu	%ymm0,(%rsp)
+	xorq	%r9,%r9
+.byte	0x67
+	xorq	%r10,%r10
+	xorq	%r11,%r11
+	xorq	%r12,%r12
+
+	vmovdqu	.Land_mask(%rip),%ymm15
+	movl	$9,%r14d
+	vmovdqu	%ymm9,288-128(%rdi)
+	jmp	.Loop_mul_1024
+
+.align	32
+.Loop_mul_1024:
+	vpsrlq	$29,%ymm3,%ymm9
+	movq	%rbx,%rax
+	imulq	-128(%rsi),%rax
+	addq	%r9,%rax
+	movq	%rbx,%r10
+	imulq	8-128(%rsi),%r10
+	addq	8(%rsp),%r10
+
+	movq	%rax,%r9
+	imull	%r8d,%eax
+	andl	$0x1fffffff,%eax
+
+	movq	%rbx,%r11
+	imulq	16-128(%rsi),%r11
+	addq	16(%rsp),%r11
+
+	movq	%rbx,%r12
+	imulq	24-128(%rsi),%r12
+	addq	24(%rsp),%r12
+	vpmuludq	32-128(%rsi),%ymm10,%ymm0
+	vmovd	%eax,%xmm11
+	vpaddq	%ymm0,%ymm1,%ymm1
+	vpmuludq	64-128(%rsi),%ymm10,%ymm12
+	vpbroadcastq	%xmm11,%ymm11
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	96-128(%rsi),%ymm10,%ymm13
+	vpand	%ymm15,%ymm3,%ymm3
+	vpaddq	%ymm13,%ymm3,%ymm3
+	vpmuludq	128-128(%rsi),%ymm10,%ymm0
+	vpaddq	%ymm0,%ymm4,%ymm4
+	vpmuludq	160-128(%rsi),%ymm10,%ymm12
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	192-128(%rsi),%ymm10,%ymm13
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpmuludq	224-128(%rsi),%ymm10,%ymm0
+	vpermq	$0x93,%ymm9,%ymm9
+	vpaddq	%ymm0,%ymm7,%ymm7
+	vpmuludq	256-128(%rsi),%ymm10,%ymm12
+	vpbroadcastq	8(%r13),%ymm10
+	vpaddq	%ymm12,%ymm8,%ymm8
+
+	movq	%rax,%rdx
+	imulq	-128(%rcx),%rax
+	addq	%rax,%r9
+	movq	%rdx,%rax
+	imulq	8-128(%rcx),%rax
+	addq	%rax,%r10
+	movq	%rdx,%rax
+	imulq	16-128(%rcx),%rax
+	addq	%rax,%r11
+	shrq	$29,%r9
+	imulq	24-128(%rcx),%rdx
+	addq	%rdx,%r12
+	addq	%r9,%r10
+
+	vpmuludq	32-128(%rcx),%ymm11,%ymm13
+	vmovq	%xmm10,%rbx
+	vpaddq	%ymm13,%ymm1,%ymm1
+	vpmuludq	64-128(%rcx),%ymm11,%ymm0
+	vpaddq	%ymm0,%ymm2,%ymm2
+	vpmuludq	96-128(%rcx),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm3,%ymm3
+	vpmuludq	128-128(%rcx),%ymm11,%ymm13
+	vpaddq	%ymm13,%ymm4,%ymm4
+	vpmuludq	160-128(%rcx),%ymm11,%ymm0
+	vpaddq	%ymm0,%ymm5,%ymm5
+	vpmuludq	192-128(%rcx),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm6,%ymm6
+	vpmuludq	224-128(%rcx),%ymm11,%ymm13
+	vpblendd	$3,%ymm14,%ymm9,%ymm12
+	vpaddq	%ymm13,%ymm7,%ymm7
+	vpmuludq	256-128(%rcx),%ymm11,%ymm0
+	vpaddq	%ymm12,%ymm3,%ymm3
+	vpaddq	%ymm0,%ymm8,%ymm8
+
+	movq	%rbx,%rax
+	imulq	-128(%rsi),%rax
+	addq	%rax,%r10
+	vmovdqu	-8+32-128(%rsi),%ymm12
+	movq	%rbx,%rax
+	imulq	8-128(%rsi),%rax
+	addq	%rax,%r11
+	vmovdqu	-8+64-128(%rsi),%ymm13
+
+	movq	%r10,%rax
+	vpblendd	$0xfc,%ymm14,%ymm9,%ymm9
+	imull	%r8d,%eax
+	vpaddq	%ymm9,%ymm4,%ymm4
+	andl	$0x1fffffff,%eax
+
+	imulq	16-128(%rsi),%rbx
+	addq	%rbx,%r12
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vmovd	%eax,%xmm11
+	vmovdqu	-8+96-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm1,%ymm1
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vpbroadcastq	%xmm11,%ymm11
+	vmovdqu	-8+128-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm2,%ymm2
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovdqu	-8+160-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm3,%ymm3
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vmovdqu	-8+192-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm4,%ymm4
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vmovdqu	-8+224-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm5,%ymm5
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovdqu	-8+256-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm6,%ymm6
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vmovdqu	-8+288-128(%rsi),%ymm9
+	vpaddq	%ymm12,%ymm7,%ymm7
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vpaddq	%ymm13,%ymm8,%ymm8
+	vpmuludq	%ymm10,%ymm9,%ymm9
+	vpbroadcastq	16(%r13),%ymm10
+
+	movq	%rax,%rdx
+	imulq	-128(%rcx),%rax
+	addq	%rax,%r10
+	vmovdqu	-8+32-128(%rcx),%ymm0
+	movq	%rdx,%rax
+	imulq	8-128(%rcx),%rax
+	addq	%rax,%r11
+	vmovdqu	-8+64-128(%rcx),%ymm12
+	shrq	$29,%r10
+	imulq	16-128(%rcx),%rdx
+	addq	%rdx,%r12
+	addq	%r10,%r11
+
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovq	%xmm10,%rbx
+	vmovdqu	-8+96-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm1,%ymm1
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	-8+128-128(%rcx),%ymm0
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-8+160-128(%rcx),%ymm12
+	vpaddq	%ymm13,%ymm3,%ymm3
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovdqu	-8+192-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm4,%ymm4
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	-8+224-128(%rcx),%ymm0
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-8+256-128(%rcx),%ymm12
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovdqu	-8+288-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm7,%ymm7
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vpaddq	%ymm13,%ymm9,%ymm9
+
+	vmovdqu	-16+32-128(%rsi),%ymm0
+	movq	%rbx,%rax
+	imulq	-128(%rsi),%rax
+	addq	%r11,%rax
+
+	vmovdqu	-16+64-128(%rsi),%ymm12
+	movq	%rax,%r11
+	imull	%r8d,%eax
+	andl	$0x1fffffff,%eax
+
+	imulq	8-128(%rsi),%rbx
+	addq	%rbx,%r12
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovd	%eax,%xmm11
+	vmovdqu	-16+96-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm1,%ymm1
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vpbroadcastq	%xmm11,%ymm11
+	vmovdqu	-16+128-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vmovdqu	-16+160-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm3,%ymm3
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovdqu	-16+192-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm4,%ymm4
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vmovdqu	-16+224-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vmovdqu	-16+256-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovdqu	-16+288-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm7,%ymm7
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vpbroadcastq	24(%r13),%ymm10
+	vpaddq	%ymm13,%ymm9,%ymm9
+
+	vmovdqu	-16+32-128(%rcx),%ymm0
+	movq	%rax,%rdx
+	imulq	-128(%rcx),%rax
+	addq	%rax,%r11
+	vmovdqu	-16+64-128(%rcx),%ymm12
+	imulq	8-128(%rcx),%rdx
+	addq	%rdx,%r12
+	shrq	$29,%r11
+
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovq	%xmm10,%rbx
+	vmovdqu	-16+96-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm1,%ymm1
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	-16+128-128(%rcx),%ymm0
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-16+160-128(%rcx),%ymm12
+	vpaddq	%ymm13,%ymm3,%ymm3
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovdqu	-16+192-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm4,%ymm4
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	-16+224-128(%rcx),%ymm0
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-16+256-128(%rcx),%ymm12
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovdqu	-16+288-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm7,%ymm7
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	-24+32-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-24+64-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm9,%ymm9
+
+	addq	%r11,%r12
+	imulq	-128(%rsi),%rbx
+	addq	%rbx,%r12
+
+	movq	%r12,%rax
+	imull	%r8d,%eax
+	andl	$0x1fffffff,%eax
+
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovd	%eax,%xmm11
+	vmovdqu	-24+96-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm1,%ymm1
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vpbroadcastq	%xmm11,%ymm11
+	vmovdqu	-24+128-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vmovdqu	-24+160-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm3,%ymm3
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovdqu	-24+192-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm4,%ymm4
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vmovdqu	-24+224-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vmovdqu	-24+256-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovdqu	-24+288-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm7,%ymm7
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vpbroadcastq	32(%r13),%ymm10
+	vpaddq	%ymm13,%ymm9,%ymm9
+	addq	$32,%r13
+
+	vmovdqu	-24+32-128(%rcx),%ymm0
+	imulq	-128(%rcx),%rax
+	addq	%rax,%r12
+	shrq	$29,%r12
+
+	vmovdqu	-24+64-128(%rcx),%ymm12
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovq	%xmm10,%rbx
+	vmovdqu	-24+96-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm1,%ymm0
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	%ymm0,(%rsp)
+	vpaddq	%ymm12,%ymm2,%ymm1
+	vmovdqu	-24+128-128(%rcx),%ymm0
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-24+160-128(%rcx),%ymm12
+	vpaddq	%ymm13,%ymm3,%ymm2
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovdqu	-24+192-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm4,%ymm3
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	-24+224-128(%rcx),%ymm0
+	vpaddq	%ymm12,%ymm5,%ymm4
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-24+256-128(%rcx),%ymm12
+	vpaddq	%ymm13,%ymm6,%ymm5
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovdqu	-24+288-128(%rcx),%ymm13
+	movq	%r12,%r9
+	vpaddq	%ymm0,%ymm7,%ymm6
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	addq	(%rsp),%r9
+	vpaddq	%ymm12,%ymm8,%ymm7
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovq	%r12,%xmm12
+	vpaddq	%ymm13,%ymm9,%ymm8
+
+	decl	%r14d
+	jnz	.Loop_mul_1024
+	vpaddq	(%rsp),%ymm12,%ymm0
+
+	vpsrlq	$29,%ymm0,%ymm12
+	vpand	%ymm15,%ymm0,%ymm0
+	vpsrlq	$29,%ymm1,%ymm13
+	vpand	%ymm15,%ymm1,%ymm1
+	vpsrlq	$29,%ymm2,%ymm10
+	vpermq	$0x93,%ymm12,%ymm12
+	vpand	%ymm15,%ymm2,%ymm2
+	vpsrlq	$29,%ymm3,%ymm11
+	vpermq	$0x93,%ymm13,%ymm13
+	vpand	%ymm15,%ymm3,%ymm3
+
+	vpblendd	$3,%ymm14,%ymm12,%ymm9
+	vpermq	$0x93,%ymm10,%ymm10
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpermq	$0x93,%ymm11,%ymm11
+	vpaddq	%ymm9,%ymm0,%ymm0
+	vpblendd	$3,%ymm13,%ymm10,%ymm13
+	vpaddq	%ymm12,%ymm1,%ymm1
+	vpblendd	$3,%ymm10,%ymm11,%ymm10
+	vpaddq	%ymm13,%ymm2,%ymm2
+	vpblendd	$3,%ymm11,%ymm14,%ymm11
+	vpaddq	%ymm10,%ymm3,%ymm3
+	vpaddq	%ymm11,%ymm4,%ymm4
+
+	vpsrlq	$29,%ymm0,%ymm12
+	vpand	%ymm15,%ymm0,%ymm0
+	vpsrlq	$29,%ymm1,%ymm13
+	vpand	%ymm15,%ymm1,%ymm1
+	vpsrlq	$29,%ymm2,%ymm10
+	vpermq	$0x93,%ymm12,%ymm12
+	vpand	%ymm15,%ymm2,%ymm2
+	vpsrlq	$29,%ymm3,%ymm11
+	vpermq	$0x93,%ymm13,%ymm13
+	vpand	%ymm15,%ymm3,%ymm3
+	vpermq	$0x93,%ymm10,%ymm10
+
+	vpblendd	$3,%ymm14,%ymm12,%ymm9
+	vpermq	$0x93,%ymm11,%ymm11
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm9,%ymm0,%ymm0
+	vpblendd	$3,%ymm13,%ymm10,%ymm13
+	vpaddq	%ymm12,%ymm1,%ymm1
+	vpblendd	$3,%ymm10,%ymm11,%ymm10
+	vpaddq	%ymm13,%ymm2,%ymm2
+	vpblendd	$3,%ymm11,%ymm14,%ymm11
+	vpaddq	%ymm10,%ymm3,%ymm3
+	vpaddq	%ymm11,%ymm4,%ymm4
+
+	vmovdqu	%ymm0,0-128(%rdi)
+	vmovdqu	%ymm1,32-128(%rdi)
+	vmovdqu	%ymm2,64-128(%rdi)
+	vmovdqu	%ymm3,96-128(%rdi)
+	vpsrlq	$29,%ymm4,%ymm12
+	vpand	%ymm15,%ymm4,%ymm4
+	vpsrlq	$29,%ymm5,%ymm13
+	vpand	%ymm15,%ymm5,%ymm5
+	vpsrlq	$29,%ymm6,%ymm10
+	vpermq	$0x93,%ymm12,%ymm12
+	vpand	%ymm15,%ymm6,%ymm6
+	vpsrlq	$29,%ymm7,%ymm11
+	vpermq	$0x93,%ymm13,%ymm13
+	vpand	%ymm15,%ymm7,%ymm7
+	vpsrlq	$29,%ymm8,%ymm0
+	vpermq	$0x93,%ymm10,%ymm10
+	vpand	%ymm15,%ymm8,%ymm8
+	vpermq	$0x93,%ymm11,%ymm11
+
+	vpblendd	$3,%ymm14,%ymm12,%ymm9
+	vpermq	$0x93,%ymm0,%ymm0
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm9,%ymm4,%ymm4
+	vpblendd	$3,%ymm13,%ymm10,%ymm13
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpblendd	$3,%ymm10,%ymm11,%ymm10
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpblendd	$3,%ymm11,%ymm0,%ymm11
+	vpaddq	%ymm10,%ymm7,%ymm7
+	vpaddq	%ymm11,%ymm8,%ymm8
+
+	vpsrlq	$29,%ymm4,%ymm12
+	vpand	%ymm15,%ymm4,%ymm4
+	vpsrlq	$29,%ymm5,%ymm13
+	vpand	%ymm15,%ymm5,%ymm5
+	vpsrlq	$29,%ymm6,%ymm10
+	vpermq	$0x93,%ymm12,%ymm12
+	vpand	%ymm15,%ymm6,%ymm6
+	vpsrlq	$29,%ymm7,%ymm11
+	vpermq	$0x93,%ymm13,%ymm13
+	vpand	%ymm15,%ymm7,%ymm7
+	vpsrlq	$29,%ymm8,%ymm0
+	vpermq	$0x93,%ymm10,%ymm10
+	vpand	%ymm15,%ymm8,%ymm8
+	vpermq	$0x93,%ymm11,%ymm11
+
+	vpblendd	$3,%ymm14,%ymm12,%ymm9
+	vpermq	$0x93,%ymm0,%ymm0
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm9,%ymm4,%ymm4
+	vpblendd	$3,%ymm13,%ymm10,%ymm13
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpblendd	$3,%ymm10,%ymm11,%ymm10
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpblendd	$3,%ymm11,%ymm0,%ymm11
+	vpaddq	%ymm10,%ymm7,%ymm7
+	vpaddq	%ymm11,%ymm8,%ymm8
+
+	vmovdqu	%ymm4,128-128(%rdi)
+	vmovdqu	%ymm5,160-128(%rdi)
+	vmovdqu	%ymm6,192-128(%rdi)
+	vmovdqu	%ymm7,224-128(%rdi)
+	vmovdqu	%ymm8,256-128(%rdi)
+	vzeroupper
+
+	movq	%rbp,%rax
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+.Lmul_1024_epilogue:
+	.byte	0xf3,0xc3
+.size	rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
+.globl	rsaz_1024_red2norm_avx2
+.type	rsaz_1024_red2norm_avx2,@function
+.align	32
+rsaz_1024_red2norm_avx2:
+	subq	$-128,%rsi
+	xorq	%rax,%rax
+	movq	-128(%rsi),%r8
+	movq	-120(%rsi),%r9
+	movq	-112(%rsi),%r10
+	shlq	$0,%r8
+	shlq	$29,%r9
+	movq	%r10,%r11
+	shlq	$58,%r10
+	shrq	$6,%r11
+	addq	%r8,%rax
+	addq	%r9,%rax
+	addq	%r10,%rax
+	adcq	$0,%r11
+	movq	%rax,0(%rdi)
+	movq	%r11,%rax
+	movq	-104(%rsi),%r8
+	movq	-96(%rsi),%r9
+	shlq	$23,%r8
+	movq	%r9,%r10
+	shlq	$52,%r9
+	shrq	$12,%r10
+	addq	%r8,%rax
+	addq	%r9,%rax
+	adcq	$0,%r10
+	movq	%rax,8(%rdi)
+	movq	%r10,%rax
+	movq	-88(%rsi),%r11
+	movq	-80(%rsi),%r8
+	shlq	$17,%r11
+	movq	%r8,%r9
+	shlq	$46,%r8
+	shrq	$18,%r9
+	addq	%r11,%rax
+	addq	%r8,%rax
+	adcq	$0,%r9
+	movq	%rax,16(%rdi)
+	movq	%r9,%rax
+	movq	-72(%rsi),%r10
+	movq	-64(%rsi),%r11
+	shlq	$11,%r10
+	movq	%r11,%r8
+	shlq	$40,%r11
+	shrq	$24,%r8
+	addq	%r10,%rax
+	addq	%r11,%rax
+	adcq	$0,%r8
+	movq	%rax,24(%rdi)
+	movq	%r8,%rax
+	movq	-56(%rsi),%r9
+	movq	-48(%rsi),%r10
+	movq	-40(%rsi),%r11
+	shlq	$5,%r9
+	shlq	$34,%r10
+	movq	%r11,%r8
+	shlq	$63,%r11
+	shrq	$1,%r8
+	addq	%r9,%rax
+	addq	%r10,%rax
+	addq	%r11,%rax
+	adcq	$0,%r8
+	movq	%rax,32(%rdi)
+	movq	%r8,%rax
+	movq	-32(%rsi),%r9
+	movq	-24(%rsi),%r10
+	shlq	$28,%r9
+	movq	%r10,%r11
+	shlq	$57,%r10
+	shrq	$7,%r11
+	addq	%r9,%rax
+	addq	%r10,%rax
+	adcq	$0,%r11
+	movq	%rax,40(%rdi)
+	movq	%r11,%rax
+	movq	-16(%rsi),%r8
+	movq	-8(%rsi),%r9
+	shlq	$22,%r8
+	movq	%r9,%r10
+	shlq	$51,%r9
+	shrq	$13,%r10
+	addq	%r8,%rax
+	addq	%r9,%rax
+	adcq	$0,%r10
+	movq	%rax,48(%rdi)
+	movq	%r10,%rax
+	movq	0(%rsi),%r11
+	movq	8(%rsi),%r8
+	shlq	$16,%r11
+	movq	%r8,%r9
+	shlq	$45,%r8
+	shrq	$19,%r9
+	addq	%r11,%rax
+	addq	%r8,%rax
+	adcq	$0,%r9
+	movq	%rax,56(%rdi)
+	movq	%r9,%rax
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	shlq	$10,%r10
+	movq	%r11,%r8
+	shlq	$39,%r11
+	shrq	$25,%r8
+	addq	%r10,%rax
+	addq	%r11,%rax
+	adcq	$0,%r8
+	movq	%rax,64(%rdi)
+	movq	%r8,%rax
+	movq	32(%rsi),%r9
+	movq	40(%rsi),%r10
+	movq	48(%rsi),%r11
+	shlq	$4,%r9
+	shlq	$33,%r10
+	movq	%r11,%r8
+	shlq	$62,%r11
+	shrq	$2,%r8
+	addq	%r9,%rax
+	addq	%r10,%rax
+	addq	%r11,%rax
+	adcq	$0,%r8
+	movq	%rax,72(%rdi)
+	movq	%r8,%rax
+	movq	56(%rsi),%r9
+	movq	64(%rsi),%r10
+	shlq	$27,%r9
+	movq	%r10,%r11
+	shlq	$56,%r10
+	shrq	$8,%r11
+	addq	%r9,%rax
+	addq	%r10,%rax
+	adcq	$0,%r11
+	movq	%rax,80(%rdi)
+	movq	%r11,%rax
+	movq	72(%rsi),%r8
+	movq	80(%rsi),%r9
+	shlq	$21,%r8
+	movq	%r9,%r10
+	shlq	$50,%r9
+	shrq	$14,%r10
+	addq	%r8,%rax
+	addq	%r9,%rax
+	adcq	$0,%r10
+	movq	%rax,88(%rdi)
+	movq	%r10,%rax
+	movq	88(%rsi),%r11
+	movq	96(%rsi),%r8
+	shlq	$15,%r11
+	movq	%r8,%r9
+	shlq	$44,%r8
+	shrq	$20,%r9
+	addq	%r11,%rax
+	addq	%r8,%rax
+	adcq	$0,%r9
+	movq	%rax,96(%rdi)
+	movq	%r9,%rax
+	movq	104(%rsi),%r10
+	movq	112(%rsi),%r11
+	shlq	$9,%r10
+	movq	%r11,%r8
+	shlq	$38,%r11
+	shrq	$26,%r8
+	addq	%r10,%rax
+	addq	%r11,%rax
+	adcq	$0,%r8
+	movq	%rax,104(%rdi)
+	movq	%r8,%rax
+	movq	120(%rsi),%r9
+	movq	128(%rsi),%r10
+	movq	136(%rsi),%r11
+	shlq	$3,%r9
+	shlq	$32,%r10
+	movq	%r11,%r8
+	shlq	$61,%r11
+	shrq	$3,%r8
+	addq	%r9,%rax
+	addq	%r10,%rax
+	addq	%r11,%rax
+	adcq	$0,%r8
+	movq	%rax,112(%rdi)
+	movq	%r8,%rax
+	movq	144(%rsi),%r9
+	movq	152(%rsi),%r10
+	shlq	$26,%r9
+	movq	%r10,%r11
+	shlq	$55,%r10
+	shrq	$9,%r11
+	addq	%r9,%rax
+	addq	%r10,%rax
+	adcq	$0,%r11
+	movq	%rax,120(%rdi)
+	movq	%r11,%rax
+	.byte	0xf3,0xc3
+.size	rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
+
+.globl	rsaz_1024_norm2red_avx2
+.type	rsaz_1024_norm2red_avx2,@function
+.align	32
+rsaz_1024_norm2red_avx2:
+	subq	$-128,%rdi
+	movq	(%rsi),%r8
+	movl	$0x1fffffff,%eax
+	movq	8(%rsi),%r9
+	movq	%r8,%r11
+	shrq	$0,%r11
+	andq	%rax,%r11
+	movq	%r11,-128(%rdi)
+	movq	%r8,%r10
+	shrq	$29,%r10
+	andq	%rax,%r10
+	movq	%r10,-120(%rdi)
+	shrdq	$58,%r9,%r8
+	andq	%rax,%r8
+	movq	%r8,-112(%rdi)
+	movq	16(%rsi),%r10
+	movq	%r9,%r8
+	shrq	$23,%r8
+	andq	%rax,%r8
+	movq	%r8,-104(%rdi)
+	shrdq	$52,%r10,%r9
+	andq	%rax,%r9
+	movq	%r9,-96(%rdi)
+	movq	24(%rsi),%r11
+	movq	%r10,%r9
+	shrq	$17,%r9
+	andq	%rax,%r9
+	movq	%r9,-88(%rdi)
+	shrdq	$46,%r11,%r10
+	andq	%rax,%r10
+	movq	%r10,-80(%rdi)
+	movq	32(%rsi),%r8
+	movq	%r11,%r10
+	shrq	$11,%r10
+	andq	%rax,%r10
+	movq	%r10,-72(%rdi)
+	shrdq	$40,%r8,%r11
+	andq	%rax,%r11
+	movq	%r11,-64(%rdi)
+	movq	40(%rsi),%r9
+	movq	%r8,%r11
+	shrq	$5,%r11
+	andq	%rax,%r11
+	movq	%r11,-56(%rdi)
+	movq	%r8,%r10
+	shrq	$34,%r10
+	andq	%rax,%r10
+	movq	%r10,-48(%rdi)
+	shrdq	$63,%r9,%r8
+	andq	%rax,%r8
+	movq	%r8,-40(%rdi)
+	movq	48(%rsi),%r10
+	movq	%r9,%r8
+	shrq	$28,%r8
+	andq	%rax,%r8
+	movq	%r8,-32(%rdi)
+	shrdq	$57,%r10,%r9
+	andq	%rax,%r9
+	movq	%r9,-24(%rdi)
+	movq	56(%rsi),%r11
+	movq	%r10,%r9
+	shrq	$22,%r9
+	andq	%rax,%r9
+	movq	%r9,-16(%rdi)
+	shrdq	$51,%r11,%r10
+	andq	%rax,%r10
+	movq	%r10,-8(%rdi)
+	movq	64(%rsi),%r8
+	movq	%r11,%r10
+	shrq	$16,%r10
+	andq	%rax,%r10
+	movq	%r10,0(%rdi)
+	shrdq	$45,%r8,%r11
+	andq	%rax,%r11
+	movq	%r11,8(%rdi)
+	movq	72(%rsi),%r9
+	movq	%r8,%r11
+	shrq	$10,%r11
+	andq	%rax,%r11
+	movq	%r11,16(%rdi)
+	shrdq	$39,%r9,%r8
+	andq	%rax,%r8
+	movq	%r8,24(%rdi)
+	movq	80(%rsi),%r10
+	movq	%r9,%r8
+	shrq	$4,%r8
+	andq	%rax,%r8
+	movq	%r8,32(%rdi)
+	movq	%r9,%r11
+	shrq	$33,%r11
+	andq	%rax,%r11
+	movq	%r11,40(%rdi)
+	shrdq	$62,%r10,%r9
+	andq	%rax,%r9
+	movq	%r9,48(%rdi)
+	movq	88(%rsi),%r11
+	movq	%r10,%r9
+	shrq	$27,%r9
+	andq	%rax,%r9
+	movq	%r9,56(%rdi)
+	shrdq	$56,%r11,%r10
+	andq	%rax,%r10
+	movq	%r10,64(%rdi)
+	movq	96(%rsi),%r8
+	movq	%r11,%r10
+	shrq	$21,%r10
+	andq	%rax,%r10
+	movq	%r10,72(%rdi)
+	shrdq	$50,%r8,%r11
+	andq	%rax,%r11
+	movq	%r11,80(%rdi)
+	movq	104(%rsi),%r9
+	movq	%r8,%r11
+	shrq	$15,%r11
+	andq	%rax,%r11
+	movq	%r11,88(%rdi)
+	shrdq	$44,%r9,%r8
+	andq	%rax,%r8
+	movq	%r8,96(%rdi)
+	movq	112(%rsi),%r10
+	movq	%r9,%r8
+	shrq	$9,%r8
+	andq	%rax,%r8
+	movq	%r8,104(%rdi)
+	shrdq	$38,%r10,%r9
+	andq	%rax,%r9
+	movq	%r9,112(%rdi)
+	movq	120(%rsi),%r11
+	movq	%r10,%r9
+	shrq	$3,%r9
+	andq	%rax,%r9
+	movq	%r9,120(%rdi)
+	movq	%r10,%r8
+	shrq	$32,%r8
+	andq	%rax,%r8
+	movq	%r8,128(%rdi)
+	shrdq	$61,%r11,%r10
+	andq	%rax,%r10
+	movq	%r10,136(%rdi)
+	xorq	%r8,%r8
+	movq	%r11,%r10
+	shrq	$26,%r10
+	andq	%rax,%r10
+	movq	%r10,144(%rdi)
+	shrdq	$55,%r8,%r11
+	andq	%rax,%r11
+	movq	%r11,152(%rdi)
+	movq	%r8,160(%rdi)
+	movq	%r8,168(%rdi)
+	movq	%r8,176(%rdi)
+	movq	%r8,184(%rdi)
+	.byte	0xf3,0xc3
+.size	rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
+.globl	rsaz_1024_scatter5_avx2
+.type	rsaz_1024_scatter5_avx2,@function
+.align	32
+rsaz_1024_scatter5_avx2:
+	vzeroupper
+	vmovdqu	.Lscatter_permd(%rip),%ymm5
+	shll	$4,%edx
+	leaq	(%rdi,%rdx,1),%rdi
+	movl	$9,%eax
+	jmp	.Loop_scatter_1024
+
+.align	32
+.Loop_scatter_1024:
+	vmovdqu	(%rsi),%ymm0
+	leaq	32(%rsi),%rsi
+	vpermd	%ymm0,%ymm5,%ymm0
+	vmovdqu	%xmm0,(%rdi)
+	leaq	512(%rdi),%rdi
+	decl	%eax
+	jnz	.Loop_scatter_1024
+
+	vzeroupper
+	.byte	0xf3,0xc3
+.size	rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
+
+.globl	rsaz_1024_gather5_avx2
+.type	rsaz_1024_gather5_avx2,@function
+.align	32
+rsaz_1024_gather5_avx2:
+	vzeroupper
+	movq	%rsp,%r11
+	leaq	-256(%rsp),%rsp
+	andq	$-32,%rsp
+	leaq	.Linc(%rip),%r10
+	leaq	-128(%rsp),%rax
+
+	vmovd	%edx,%xmm4
+	vmovdqa	(%r10),%ymm0
+	vmovdqa	32(%r10),%ymm1
+	vmovdqa	64(%r10),%ymm5
+	vpbroadcastd	%xmm4,%ymm4
+
+	vpaddd	%ymm5,%ymm0,%ymm2
+	vpcmpeqd	%ymm4,%ymm0,%ymm0
+	vpaddd	%ymm5,%ymm1,%ymm3
+	vpcmpeqd	%ymm4,%ymm1,%ymm1
+	vmovdqa	%ymm0,0+128(%rax)
+	vpaddd	%ymm5,%ymm2,%ymm0
+	vpcmpeqd	%ymm4,%ymm2,%ymm2
+	vmovdqa	%ymm1,32+128(%rax)
+	vpaddd	%ymm5,%ymm3,%ymm1
+	vpcmpeqd	%ymm4,%ymm3,%ymm3
+	vmovdqa	%ymm2,64+128(%rax)
+	vpaddd	%ymm5,%ymm0,%ymm2
+	vpcmpeqd	%ymm4,%ymm0,%ymm0
+	vmovdqa	%ymm3,96+128(%rax)
+	vpaddd	%ymm5,%ymm1,%ymm3
+	vpcmpeqd	%ymm4,%ymm1,%ymm1
+	vmovdqa	%ymm0,128+128(%rax)
+	vpaddd	%ymm5,%ymm2,%ymm8
+	vpcmpeqd	%ymm4,%ymm2,%ymm2
+	vmovdqa	%ymm1,160+128(%rax)
+	vpaddd	%ymm5,%ymm3,%ymm9
+	vpcmpeqd	%ymm4,%ymm3,%ymm3
+	vmovdqa	%ymm2,192+128(%rax)
+	vpaddd	%ymm5,%ymm8,%ymm10
+	vpcmpeqd	%ymm4,%ymm8,%ymm8
+	vmovdqa	%ymm3,224+128(%rax)
+	vpaddd	%ymm5,%ymm9,%ymm11
+	vpcmpeqd	%ymm4,%ymm9,%ymm9
+	vpaddd	%ymm5,%ymm10,%ymm12
+	vpcmpeqd	%ymm4,%ymm10,%ymm10
+	vpaddd	%ymm5,%ymm11,%ymm13
+	vpcmpeqd	%ymm4,%ymm11,%ymm11
+	vpaddd	%ymm5,%ymm12,%ymm14
+	vpcmpeqd	%ymm4,%ymm12,%ymm12
+	vpaddd	%ymm5,%ymm13,%ymm15
+	vpcmpeqd	%ymm4,%ymm13,%ymm13
+	vpcmpeqd	%ymm4,%ymm14,%ymm14
+	vpcmpeqd	%ymm4,%ymm15,%ymm15
+
+	vmovdqa	-32(%r10),%ymm7
+	leaq	128(%rsi),%rsi
+	movl	$9,%edx
+
+.Loop_gather_1024:
+	vmovdqa	0-128(%rsi),%ymm0
+	vmovdqa	32-128(%rsi),%ymm1
+	vmovdqa	64-128(%rsi),%ymm2
+	vmovdqa	96-128(%rsi),%ymm3
+	vpand	0+128(%rax),%ymm0,%ymm0
+	vpand	32+128(%rax),%ymm1,%ymm1
+	vpand	64+128(%rax),%ymm2,%ymm2
+	vpor	%ymm0,%ymm1,%ymm4
+	vpand	96+128(%rax),%ymm3,%ymm3
+	vmovdqa	128-128(%rsi),%ymm0
+	vmovdqa	160-128(%rsi),%ymm1
+	vpor	%ymm2,%ymm3,%ymm5
+	vmovdqa	192-128(%rsi),%ymm2
+	vmovdqa	224-128(%rsi),%ymm3
+	vpand	128+128(%rax),%ymm0,%ymm0
+	vpand	160+128(%rax),%ymm1,%ymm1
+	vpand	192+128(%rax),%ymm2,%ymm2
+	vpor	%ymm0,%ymm4,%ymm4
+	vpand	224+128(%rax),%ymm3,%ymm3
+	vpand	256-128(%rsi),%ymm8,%ymm0
+	vpor	%ymm1,%ymm5,%ymm5
+	vpand	288-128(%rsi),%ymm9,%ymm1
+	vpor	%ymm2,%ymm4,%ymm4
+	vpand	320-128(%rsi),%ymm10,%ymm2
+	vpor	%ymm3,%ymm5,%ymm5
+	vpand	352-128(%rsi),%ymm11,%ymm3
+	vpor	%ymm0,%ymm4,%ymm4
+	vpand	384-128(%rsi),%ymm12,%ymm0
+	vpor	%ymm1,%ymm5,%ymm5
+	vpand	416-128(%rsi),%ymm13,%ymm1
+	vpor	%ymm2,%ymm4,%ymm4
+	vpand	448-128(%rsi),%ymm14,%ymm2
+	vpor	%ymm3,%ymm5,%ymm5
+	vpand	480-128(%rsi),%ymm15,%ymm3
+	leaq	512(%rsi),%rsi
+	vpor	%ymm0,%ymm4,%ymm4
+	vpor	%ymm1,%ymm5,%ymm5
+	vpor	%ymm2,%ymm4,%ymm4
+	vpor	%ymm3,%ymm5,%ymm5
+
+	vpor	%ymm5,%ymm4,%ymm4
+	vextracti128	$1,%ymm4,%xmm5
+	vpor	%xmm4,%xmm5,%xmm5
+	vpermd	%ymm5,%ymm7,%ymm5
+	vmovdqu	%ymm5,(%rdi)
+	leaq	32(%rdi),%rdi
+	decl	%edx
+	jnz	.Loop_gather_1024
+
+	vpxor	%ymm0,%ymm0,%ymm0
+	vmovdqu	%ymm0,(%rdi)
+	vzeroupper
+	leaq	(%r11),%rsp
+	.byte	0xf3,0xc3
+.size	rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
+
+.globl	rsaz_avx2_eligible
+.type	rsaz_avx2_eligible,@function
+.align	32
+rsaz_avx2_eligible:
+	movl	OPENSSL_ia32cap_P+8(%rip),%eax
+	movl	$524544,%ecx
+	movl	$0,%edx
+	andl	%eax,%ecx
+	cmpl	$524544,%ecx
+	cmovel	%edx,%eax
+	andl	$32,%eax
+	shrl	$5,%eax
+	.byte	0xf3,0xc3
+.size	rsaz_avx2_eligible,.-rsaz_avx2_eligible
+
+.align	64
+.Land_mask:
+.quad	0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
+.Lscatter_permd:
+.long	0,2,4,6,7,7,7,7
+.Lgather_permd:
+.long	0,7,1,7,2,7,3,7
+.Linc:
+.long	0,0,0,0, 1,1,1,1
+.long	2,2,2,2, 3,3,3,3
+.long	4,4,4,4, 4,4,4,4
+.align	64


Property changes on: trunk/secure/lib/libcrypto/amd64/rsaz-avx2.S
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/amd64/rsaz-x86_64.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/rsaz-x86_64.S	                        (rev 0)
+++ trunk/secure/lib/libcrypto/amd64/rsaz-x86_64.S	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,1876 @@
+/* $MidnightBSD$ */
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/rsaz-x86_64.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from rsaz-x86_64.pl. */
+.text	
+
+
+
+.globl	rsaz_512_sqr
+.type	rsaz_512_sqr,@function
+.align	32
+rsaz_512_sqr:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+
+	subq	$128+24,%rsp
+.Lsqr_body:
+	movq	%rdx,%rbp
+	movq	(%rsi),%rdx
+	movq	8(%rsi),%rax
+	movq	%rcx,128(%rsp)
+	movl	$0x80100,%r11d
+	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
+	cmpl	$0x80100,%r11d
+	je	.Loop_sqrx
+	jmp	.Loop_sqr
+
+.align	32
+.Loop_sqr:
+	movl	%r8d,128+8(%rsp)
+
+	movq	%rdx,%rbx
+	mulq	%rdx
+	movq	%rax,%r8
+	movq	16(%rsi),%rax
+	movq	%rdx,%r9
+
+	mulq	%rbx
+	addq	%rax,%r9
+	movq	24(%rsi),%rax
+	movq	%rdx,%r10
+	adcq	$0,%r10
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	32(%rsi),%rax
+	movq	%rdx,%r11
+	adcq	$0,%r11
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	40(%rsi),%rax
+	movq	%rdx,%r12
+	adcq	$0,%r12
+
+	mulq	%rbx
+	addq	%rax,%r12
+	movq	48(%rsi),%rax
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%rbx
+	addq	%rax,%r13
+	movq	56(%rsi),%rax
+	movq	%rdx,%r14
+	adcq	$0,%r14
+
+	mulq	%rbx
+	addq	%rax,%r14
+	movq	%rbx,%rax
+	movq	%rdx,%r15
+	adcq	$0,%r15
+
+	addq	%r8,%r8
+	movq	%r9,%rcx
+	adcq	%r9,%r9
+
+	mulq	%rax
+	movq	%rax,(%rsp)
+	addq	%rdx,%r8
+	adcq	$0,%r9
+
+	movq	%r8,8(%rsp)
+	shrq	$63,%rcx
+
+
+	movq	8(%rsi),%r8
+	movq	16(%rsi),%rax
+	mulq	%r8
+	addq	%rax,%r10
+	movq	24(%rsi),%rax
+	movq	%rdx,%rbx
+	adcq	$0,%rbx
+
+	mulq	%r8
+	addq	%rax,%r11
+	movq	32(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%rbx,%r11
+	movq	%rdx,%rbx
+	adcq	$0,%rbx
+
+	mulq	%r8
+	addq	%rax,%r12
+	movq	40(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%rbx,%r12
+	movq	%rdx,%rbx
+	adcq	$0,%rbx
+
+	mulq	%r8
+	addq	%rax,%r13
+	movq	48(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%rbx,%r13
+	movq	%rdx,%rbx
+	adcq	$0,%rbx
+
+	mulq	%r8
+	addq	%rax,%r14
+	movq	56(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%rbx,%r14
+	movq	%rdx,%rbx
+	adcq	$0,%rbx
+
+	mulq	%r8
+	addq	%rax,%r15
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	addq	%rbx,%r15
+	movq	%rdx,%r8
+	movq	%r10,%rdx
+	adcq	$0,%r8
+
+	addq	%rdx,%rdx
+	leaq	(%rcx,%r10,2),%r10
+	movq	%r11,%rbx
+	adcq	%r11,%r11
+
+	mulq	%rax
+	addq	%rax,%r9
+	adcq	%rdx,%r10
+	adcq	$0,%r11
+
+	movq	%r9,16(%rsp)
+	movq	%r10,24(%rsp)
+	shrq	$63,%rbx
+
+
+	movq	16(%rsi),%r9
+	movq	24(%rsi),%rax
+	mulq	%r9
+	addq	%rax,%r12
+	movq	32(%rsi),%rax
+	movq	%rdx,%rcx
+	adcq	$0,%rcx
+
+	mulq	%r9
+	addq	%rax,%r13
+	movq	40(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%rcx,%r13
+	movq	%rdx,%rcx
+	adcq	$0,%rcx
+
+	mulq	%r9
+	addq	%rax,%r14
+	movq	48(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%rcx,%r14
+	movq	%rdx,%rcx
+	adcq	$0,%rcx
+
+	mulq	%r9
+	movq	%r12,%r10
+	leaq	(%rbx,%r12,2),%r12
+	addq	%rax,%r15
+	movq	56(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%rcx,%r15
+	movq	%rdx,%rcx
+	adcq	$0,%rcx
+
+	mulq	%r9
+	shrq	$63,%r10
+	addq	%rax,%r8
+	movq	%r9,%rax
+	adcq	$0,%rdx
+	addq	%rcx,%r8
+	movq	%rdx,%r9
+	adcq	$0,%r9
+
+	movq	%r13,%rcx
+	leaq	(%r10,%r13,2),%r13
+
+	mulq	%rax
+	addq	%rax,%r11
+	adcq	%rdx,%r12
+	adcq	$0,%r13
+
+	movq	%r11,32(%rsp)
+	movq	%r12,40(%rsp)
+	shrq	$63,%rcx
+
+
+	movq	24(%rsi),%r10
+	movq	32(%rsi),%rax
+	mulq	%r10
+	addq	%rax,%r14
+	movq	40(%rsi),%rax
+	movq	%rdx,%rbx
+	adcq	$0,%rbx
+
+	mulq	%r10
+	addq	%rax,%r15
+	movq	48(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%rbx,%r15
+	movq	%rdx,%rbx
+	adcq	$0,%rbx
+
+	mulq	%r10
+	movq	%r14,%r12
+	leaq	(%rcx,%r14,2),%r14
+	addq	%rax,%r8
+	movq	56(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%rbx,%r8
+	movq	%rdx,%rbx
+	adcq	$0,%rbx
+
+	mulq	%r10
+	shrq	$63,%r12
+	addq	%rax,%r9
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	addq	%rbx,%r9
+	movq	%rdx,%r10
+	adcq	$0,%r10
+
+	movq	%r15,%rbx
+	leaq	(%r12,%r15,2),%r15
+
+	mulq	%rax
+	addq	%rax,%r13
+	adcq	%rdx,%r14
+	adcq	$0,%r15
+
+	movq	%r13,48(%rsp)
+	movq	%r14,56(%rsp)
+	shrq	$63,%rbx
+
+
+	movq	32(%rsi),%r11
+	movq	40(%rsi),%rax
+	mulq	%r11
+	addq	%rax,%r8
+	movq	48(%rsi),%rax
+	movq	%rdx,%rcx
+	adcq	$0,%rcx
+
+	mulq	%r11
+	addq	%rax,%r9
+	movq	56(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%r8,%r12
+	leaq	(%rbx,%r8,2),%r8
+	addq	%rcx,%r9
+	movq	%rdx,%rcx
+	adcq	$0,%rcx
+
+	mulq	%r11
+	shrq	$63,%r12
+	addq	%rax,%r10
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	addq	%rcx,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+
+	movq	%r9,%rcx
+	leaq	(%r12,%r9,2),%r9
+
+	mulq	%rax
+	addq	%rax,%r15
+	adcq	%rdx,%r8
+	adcq	$0,%r9
+
+	movq	%r15,64(%rsp)
+	movq	%r8,72(%rsp)
+	shrq	$63,%rcx
+
+
+	movq	40(%rsi),%r12
+	movq	48(%rsi),%rax
+	mulq	%r12
+	addq	%rax,%r10
+	movq	56(%rsi),%rax
+	movq	%rdx,%rbx
+	adcq	$0,%rbx
+
+	mulq	%r12
+	addq	%rax,%r11
+	movq	%r12,%rax
+	movq	%r10,%r15
+	leaq	(%rcx,%r10,2),%r10
+	adcq	$0,%rdx
+	shrq	$63,%r15
+	addq	%rbx,%r11
+	movq	%rdx,%r12
+	adcq	$0,%r12
+
+	movq	%r11,%rbx
+	leaq	(%r15,%r11,2),%r11
+
+	mulq	%rax
+	addq	%rax,%r9
+	adcq	%rdx,%r10
+	adcq	$0,%r11
+
+	movq	%r9,80(%rsp)
+	movq	%r10,88(%rsp)
+
+
+	movq	48(%rsi),%r13
+	movq	56(%rsi),%rax
+	mulq	%r13
+	addq	%rax,%r12
+	movq	%r13,%rax
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	xorq	%r14,%r14
+	shlq	$1,%rbx
+	adcq	%r12,%r12
+	adcq	%r13,%r13
+	adcq	%r14,%r14
+
+	mulq	%rax
+	addq	%rax,%r11
+	adcq	%rdx,%r12
+	adcq	$0,%r13
+
+	movq	%r11,96(%rsp)
+	movq	%r12,104(%rsp)
+
+
+	movq	56(%rsi),%rax
+	mulq	%rax
+	addq	%rax,%r13
+	adcq	$0,%rdx
+
+	addq	%rdx,%r14
+
+	movq	%r13,112(%rsp)
+	movq	%r14,120(%rsp)
+
+	movq	(%rsp),%r8
+	movq	8(%rsp),%r9
+	movq	16(%rsp),%r10
+	movq	24(%rsp),%r11
+	movq	32(%rsp),%r12
+	movq	40(%rsp),%r13
+	movq	48(%rsp),%r14
+	movq	56(%rsp),%r15
+
+	call	__rsaz_512_reduce
+
+	addq	64(%rsp),%r8
+	adcq	72(%rsp),%r9
+	adcq	80(%rsp),%r10
+	adcq	88(%rsp),%r11
+	adcq	96(%rsp),%r12
+	adcq	104(%rsp),%r13
+	adcq	112(%rsp),%r14
+	adcq	120(%rsp),%r15
+	sbbq	%rcx,%rcx
+
+	call	__rsaz_512_subtract
+
+	movq	%r8,%rdx
+	movq	%r9,%rax
+	movl	128+8(%rsp),%r8d
+	movq	%rdi,%rsi
+
+	decl	%r8d
+	jnz	.Loop_sqr
+	jmp	.Lsqr_tail
+
+.align	32
+.Loop_sqrx:
+	movl	%r8d,128+8(%rsp)
+.byte	102,72,15,110,199
+.byte	102,72,15,110,205
+
+	mulxq	%rax,%r8,%r9
+
+	mulxq	16(%rsi),%rcx,%r10
+	xorq	%rbp,%rbp
+
+	mulxq	24(%rsi),%rax,%r11
+	adcxq	%rcx,%r9
+
+	mulxq	32(%rsi),%rcx,%r12
+	adcxq	%rax,%r10
+
+	mulxq	40(%rsi),%rax,%r13
+	adcxq	%rcx,%r11
+
+.byte	0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00
+	adcxq	%rax,%r12
+	adcxq	%rcx,%r13
+
+.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00
+	adcxq	%rax,%r14
+	adcxq	%rbp,%r15
+
+	movq	%r9,%rcx
+	shldq	$1,%r8,%r9
+	shlq	$1,%r8
+
+	xorl	%ebp,%ebp
+	mulxq	%rdx,%rax,%rdx
+	adcxq	%rdx,%r8
+	movq	8(%rsi),%rdx
+	adcxq	%rbp,%r9
+
+	movq	%rax,(%rsp)
+	movq	%r8,8(%rsp)
+
+
+	mulxq	16(%rsi),%rax,%rbx
+	adoxq	%rax,%r10
+	adcxq	%rbx,%r11
+
+.byte	0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00
+	adoxq	%rdi,%r11
+	adcxq	%r8,%r12
+
+	mulxq	32(%rsi),%rax,%rbx
+	adoxq	%rax,%r12
+	adcxq	%rbx,%r13
+
+	mulxq	40(%rsi),%rdi,%r8
+	adoxq	%rdi,%r13
+	adcxq	%r8,%r14
+
+.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
+	adoxq	%rax,%r14
+	adcxq	%rbx,%r15
+
+.byte	0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00
+	adoxq	%rdi,%r15
+	adcxq	%rbp,%r8
+	adoxq	%rbp,%r8
+
+	movq	%r11,%rbx
+	shldq	$1,%r10,%r11
+	shldq	$1,%rcx,%r10
+
+	xorl	%ebp,%ebp
+	mulxq	%rdx,%rax,%rcx
+	movq	16(%rsi),%rdx
+	adcxq	%rax,%r9
+	adcxq	%rcx,%r10
+	adcxq	%rbp,%r11
+
+	movq	%r9,16(%rsp)
+.byte	0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00
+
+
+.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00
+	adoxq	%rdi,%r12
+	adcxq	%r9,%r13
+
+	mulxq	32(%rsi),%rax,%rcx
+	adoxq	%rax,%r13
+	adcxq	%rcx,%r14
+
+	mulxq	40(%rsi),%rdi,%r9
+	adoxq	%rdi,%r14
+	adcxq	%r9,%r15
+
+.byte	0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00
+	adoxq	%rax,%r15
+	adcxq	%rcx,%r8
+
+.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00
+	adoxq	%rdi,%r8
+	adcxq	%rbp,%r9
+	adoxq	%rbp,%r9
+
+	movq	%r13,%rcx
+	shldq	$1,%r12,%r13
+	shldq	$1,%rbx,%r12
+
+	xorl	%ebp,%ebp
+	mulxq	%rdx,%rax,%rdx
+	adcxq	%rax,%r11
+	adcxq	%rdx,%r12
+	movq	24(%rsi),%rdx
+	adcxq	%rbp,%r13
+
+	movq	%r11,32(%rsp)
+.byte	0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00
+
+
+.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00
+	adoxq	%rax,%r14
+	adcxq	%rbx,%r15
+
+	mulxq	40(%rsi),%rdi,%r10
+	adoxq	%rdi,%r15
+	adcxq	%r10,%r8
+
+	mulxq	48(%rsi),%rax,%rbx
+	adoxq	%rax,%r8
+	adcxq	%rbx,%r9
+
+	mulxq	56(%rsi),%rdi,%r10
+	adoxq	%rdi,%r9
+	adcxq	%rbp,%r10
+	adoxq	%rbp,%r10
+
+.byte	0x66
+	movq	%r15,%rbx
+	shldq	$1,%r14,%r15
+	shldq	$1,%rcx,%r14
+
+	xorl	%ebp,%ebp
+	mulxq	%rdx,%rax,%rdx
+	adcxq	%rax,%r13
+	adcxq	%rdx,%r14
+	movq	32(%rsi),%rdx
+	adcxq	%rbp,%r15
+
+	movq	%r13,48(%rsp)
+	movq	%r14,56(%rsp)
+
+
+.byte	0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00
+	adoxq	%rdi,%r8
+	adcxq	%r11,%r9
+
+	mulxq	48(%rsi),%rax,%rcx
+	adoxq	%rax,%r9
+	adcxq	%rcx,%r10
+
+	mulxq	56(%rsi),%rdi,%r11
+	adoxq	%rdi,%r10
+	adcxq	%rbp,%r11
+	adoxq	%rbp,%r11
+
+	movq	%r9,%rcx
+	shldq	$1,%r8,%r9
+	shldq	$1,%rbx,%r8
+
+	xorl	%ebp,%ebp
+	mulxq	%rdx,%rax,%rdx
+	adcxq	%rax,%r15
+	adcxq	%rdx,%r8
+	movq	40(%rsi),%rdx
+	adcxq	%rbp,%r9
+
+	movq	%r15,64(%rsp)
+	movq	%r8,72(%rsp)
+
+
+.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
+	adoxq	%rax,%r10
+	adcxq	%rbx,%r11
+
+.byte	0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00
+	adoxq	%rdi,%r11
+	adcxq	%rbp,%r12
+	adoxq	%rbp,%r12
+
+	movq	%r11,%rbx
+	shldq	$1,%r10,%r11
+	shldq	$1,%rcx,%r10
+
+	xorl	%ebp,%ebp
+	mulxq	%rdx,%rax,%rdx
+	adcxq	%rax,%r9
+	adcxq	%rdx,%r10
+	movq	48(%rsi),%rdx
+	adcxq	%rbp,%r11
+
+	movq	%r9,80(%rsp)
+	movq	%r10,88(%rsp)
+
+
+.byte	0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00
+	adoxq	%rax,%r12
+	adoxq	%rbp,%r13
+
+	xorq	%r14,%r14
+	shldq	$1,%r13,%r14
+	shldq	$1,%r12,%r13
+	shldq	$1,%rbx,%r12
+
+	xorl	%ebp,%ebp
+	mulxq	%rdx,%rax,%rdx
+	adcxq	%rax,%r11
+	adcxq	%rdx,%r12
+	movq	56(%rsi),%rdx
+	adcxq	%rbp,%r13
+
+.byte	0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00
+.byte	0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00
+
+
+	mulxq	%rdx,%rax,%rdx
+	adoxq	%rax,%r13
+	adoxq	%rbp,%rdx
+
+.byte	0x66
+	addq	%rdx,%r14
+
+	movq	%r13,112(%rsp)
+	movq	%r14,120(%rsp)
+.byte	102,72,15,126,199
+.byte	102,72,15,126,205
+
+	movq	128(%rsp),%rdx
+	movq	(%rsp),%r8
+	movq	8(%rsp),%r9
+	movq	16(%rsp),%r10
+	movq	24(%rsp),%r11
+	movq	32(%rsp),%r12
+	movq	40(%rsp),%r13
+	movq	48(%rsp),%r14
+	movq	56(%rsp),%r15
+
+	call	__rsaz_512_reducex
+
+	addq	64(%rsp),%r8
+	adcq	72(%rsp),%r9
+	adcq	80(%rsp),%r10
+	adcq	88(%rsp),%r11
+	adcq	96(%rsp),%r12
+	adcq	104(%rsp),%r13
+	adcq	112(%rsp),%r14
+	adcq	120(%rsp),%r15
+	sbbq	%rcx,%rcx
+
+	call	__rsaz_512_subtract
+
+	movq	%r8,%rdx
+	movq	%r9,%rax
+	movl	128+8(%rsp),%r8d
+	movq	%rdi,%rsi
+
+	decl	%r8d
+	jnz	.Loop_sqrx
+
+.Lsqr_tail:
+
+	leaq	128+24+48(%rsp),%rax
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+.Lsqr_epilogue:
+	.byte	0xf3,0xc3
+.size	rsaz_512_sqr,.-rsaz_512_sqr
+.globl	rsaz_512_mul
+.type	rsaz_512_mul,@function
+.align	32
+rsaz_512_mul:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+
+	subq	$128+24,%rsp
+.Lmul_body:
+.byte	102,72,15,110,199
+.byte	102,72,15,110,201
+	movq	%r8,128(%rsp)
+	movl	$0x80100,%r11d
+	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
+	cmpl	$0x80100,%r11d
+	je	.Lmulx
+	movq	(%rdx),%rbx
+	movq	%rdx,%rbp
+	call	__rsaz_512_mul
+
+.byte	102,72,15,126,199
+.byte	102,72,15,126,205
+
+	movq	(%rsp),%r8
+	movq	8(%rsp),%r9
+	movq	16(%rsp),%r10
+	movq	24(%rsp),%r11
+	movq	32(%rsp),%r12
+	movq	40(%rsp),%r13
+	movq	48(%rsp),%r14
+	movq	56(%rsp),%r15
+
+	call	__rsaz_512_reduce
+	jmp	.Lmul_tail
+
+.align	32
+.Lmulx:
+	movq	%rdx,%rbp
+	movq	(%rdx),%rdx
+	call	__rsaz_512_mulx
+
+.byte	102,72,15,126,199
+.byte	102,72,15,126,205
+
+	movq	128(%rsp),%rdx
+	movq	(%rsp),%r8
+	movq	8(%rsp),%r9
+	movq	16(%rsp),%r10
+	movq	24(%rsp),%r11
+	movq	32(%rsp),%r12
+	movq	40(%rsp),%r13
+	movq	48(%rsp),%r14
+	movq	56(%rsp),%r15
+
+	call	__rsaz_512_reducex
+.Lmul_tail:
+	addq	64(%rsp),%r8
+	adcq	72(%rsp),%r9
+	adcq	80(%rsp),%r10
+	adcq	88(%rsp),%r11
+	adcq	96(%rsp),%r12
+	adcq	104(%rsp),%r13
+	adcq	112(%rsp),%r14
+	adcq	120(%rsp),%r15
+	sbbq	%rcx,%rcx
+
+	call	__rsaz_512_subtract
+
+	leaq	128+24+48(%rsp),%rax
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+.Lmul_epilogue:
+	.byte	0xf3,0xc3
+.size	rsaz_512_mul,.-rsaz_512_mul
+.globl	rsaz_512_mul_gather4
+.type	rsaz_512_mul_gather4,@function
+.align	32
+rsaz_512_mul_gather4:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+
+	subq	$152,%rsp
+.Lmul_gather4_body:
+	movd	%r9d,%xmm8
+	movdqa	.Linc+16(%rip),%xmm1
+	movdqa	.Linc(%rip),%xmm0
+
+	pshufd	$0,%xmm8,%xmm8
+	movdqa	%xmm1,%xmm7
+	movdqa	%xmm1,%xmm2
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm8,%xmm0
+	movdqa	%xmm7,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm8,%xmm1
+	movdqa	%xmm7,%xmm4
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm8,%xmm2
+	movdqa	%xmm7,%xmm5
+	paddd	%xmm3,%xmm4
+	pcmpeqd	%xmm8,%xmm3
+	movdqa	%xmm7,%xmm6
+	paddd	%xmm4,%xmm5
+	pcmpeqd	%xmm8,%xmm4
+	paddd	%xmm5,%xmm6
+	pcmpeqd	%xmm8,%xmm5
+	paddd	%xmm6,%xmm7
+	pcmpeqd	%xmm8,%xmm6
+	pcmpeqd	%xmm8,%xmm7
+
+	movdqa	0(%rdx),%xmm8
+	movdqa	16(%rdx),%xmm9
+	movdqa	32(%rdx),%xmm10
+	movdqa	48(%rdx),%xmm11
+	pand	%xmm0,%xmm8
+	movdqa	64(%rdx),%xmm12
+	pand	%xmm1,%xmm9
+	movdqa	80(%rdx),%xmm13
+	pand	%xmm2,%xmm10
+	movdqa	96(%rdx),%xmm14
+	pand	%xmm3,%xmm11
+	movdqa	112(%rdx),%xmm15
+	leaq	128(%rdx),%rbp
+	pand	%xmm4,%xmm12
+	pand	%xmm5,%xmm13
+	pand	%xmm6,%xmm14
+	pand	%xmm7,%xmm15
+	por	%xmm10,%xmm8
+	por	%xmm11,%xmm9
+	por	%xmm12,%xmm8
+	por	%xmm13,%xmm9
+	por	%xmm14,%xmm8
+	por	%xmm15,%xmm9
+
+	por	%xmm9,%xmm8
+	pshufd	$0x4e,%xmm8,%xmm9
+	por	%xmm9,%xmm8
+	movl	$0x80100,%r11d
+	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
+	cmpl	$0x80100,%r11d
+	je	.Lmulx_gather
+.byte	102,76,15,126,195
+
+	movq	%r8,128(%rsp)
+	movq	%rdi,128+8(%rsp)
+	movq	%rcx,128+16(%rsp)
+
+	movq	(%rsi),%rax
+	movq	8(%rsi),%rcx
+	mulq	%rbx
+	movq	%rax,(%rsp)
+	movq	%rcx,%rax
+	movq	%rdx,%r8
+
+	mulq	%rbx
+	addq	%rax,%r8
+	movq	16(%rsi),%rax
+	movq	%rdx,%r9
+	adcq	$0,%r9
+
+	mulq	%rbx
+	addq	%rax,%r9
+	movq	24(%rsi),%rax
+	movq	%rdx,%r10
+	adcq	$0,%r10
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	32(%rsi),%rax
+	movq	%rdx,%r11
+	adcq	$0,%r11
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	40(%rsi),%rax
+	movq	%rdx,%r12
+	adcq	$0,%r12
+
+	mulq	%rbx
+	addq	%rax,%r12
+	movq	48(%rsi),%rax
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%rbx
+	addq	%rax,%r13
+	movq	56(%rsi),%rax
+	movq	%rdx,%r14
+	adcq	$0,%r14
+
+	mulq	%rbx
+	addq	%rax,%r14
+	movq	(%rsi),%rax
+	movq	%rdx,%r15
+	adcq	$0,%r15
+
+	leaq	8(%rsp),%rdi
+	movl	$7,%ecx
+	jmp	.Loop_mul_gather
+
+.align	32
+.Loop_mul_gather:
+	movdqa	0(%rbp),%xmm8
+	movdqa	16(%rbp),%xmm9
+	movdqa	32(%rbp),%xmm10
+	movdqa	48(%rbp),%xmm11
+	pand	%xmm0,%xmm8
+	movdqa	64(%rbp),%xmm12
+	pand	%xmm1,%xmm9
+	movdqa	80(%rbp),%xmm13
+	pand	%xmm2,%xmm10
+	movdqa	96(%rbp),%xmm14
+	pand	%xmm3,%xmm11
+	movdqa	112(%rbp),%xmm15
+	leaq	128(%rbp),%rbp
+	pand	%xmm4,%xmm12
+	pand	%xmm5,%xmm13
+	pand	%xmm6,%xmm14
+	pand	%xmm7,%xmm15
+	por	%xmm10,%xmm8
+	por	%xmm11,%xmm9
+	por	%xmm12,%xmm8
+	por	%xmm13,%xmm9
+	por	%xmm14,%xmm8
+	por	%xmm15,%xmm9
+
+	por	%xmm9,%xmm8
+	pshufd	$0x4e,%xmm8,%xmm9
+	por	%xmm9,%xmm8
+.byte	102,76,15,126,195
+
+	mulq	%rbx
+	addq	%rax,%r8
+	movq	8(%rsi),%rax
+	movq	%r8,(%rdi)
+	movq	%rdx,%r8
+	adcq	$0,%r8
+
+	mulq	%rbx
+	addq	%rax,%r9
+	movq	16(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r9,%r8
+	movq	%rdx,%r9
+	adcq	$0,%r9
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	24(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r9
+	movq	%rdx,%r10
+	adcq	$0,%r10
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	32(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+
+	mulq	%rbx
+	addq	%rax,%r12
+	movq	40(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r12,%r11
+	movq	%rdx,%r12
+	adcq	$0,%r12
+
+	mulq	%rbx
+	addq	%rax,%r13
+	movq	48(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r13,%r12
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%rbx
+	addq	%rax,%r14
+	movq	56(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r14,%r13
+	movq	%rdx,%r14
+	adcq	$0,%r14
+
+	mulq	%rbx
+	addq	%rax,%r15
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r15,%r14
+	movq	%rdx,%r15
+	adcq	$0,%r15
+
+	leaq	8(%rdi),%rdi
+
+	decl	%ecx
+	jnz	.Loop_mul_gather
+
+	movq	%r8,(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+	movq	%r14,48(%rdi)
+	movq	%r15,56(%rdi)
+
+	movq	128+8(%rsp),%rdi
+	movq	128+16(%rsp),%rbp
+
+	movq	(%rsp),%r8
+	movq	8(%rsp),%r9
+	movq	16(%rsp),%r10
+	movq	24(%rsp),%r11
+	movq	32(%rsp),%r12
+	movq	40(%rsp),%r13
+	movq	48(%rsp),%r14
+	movq	56(%rsp),%r15
+
+	call	__rsaz_512_reduce
+	jmp	.Lmul_gather_tail
+
+.align	32
+.Lmulx_gather:
+.byte	102,76,15,126,194
+
+	movq	%r8,128(%rsp)
+	movq	%rdi,128+8(%rsp)
+	movq	%rcx,128+16(%rsp)
+
+	mulxq	(%rsi),%rbx,%r8
+	movq	%rbx,(%rsp)
+	xorl	%edi,%edi
+
+	mulxq	8(%rsi),%rax,%r9
+
+	mulxq	16(%rsi),%rbx,%r10
+	adcxq	%rax,%r8
+
+	mulxq	24(%rsi),%rax,%r11
+	adcxq	%rbx,%r9
+
+	mulxq	32(%rsi),%rbx,%r12
+	adcxq	%rax,%r10
+
+	mulxq	40(%rsi),%rax,%r13
+	adcxq	%rbx,%r11
+
+	mulxq	48(%rsi),%rbx,%r14
+	adcxq	%rax,%r12
+
+	mulxq	56(%rsi),%rax,%r15
+	adcxq	%rbx,%r13
+	adcxq	%rax,%r14
+.byte	0x67
+	movq	%r8,%rbx
+	adcxq	%rdi,%r15
+
+	movq	$-7,%rcx
+	jmp	.Loop_mulx_gather
+
+.align	32
+.Loop_mulx_gather:
+	movdqa	0(%rbp),%xmm8
+	movdqa	16(%rbp),%xmm9
+	movdqa	32(%rbp),%xmm10
+	movdqa	48(%rbp),%xmm11
+	pand	%xmm0,%xmm8
+	movdqa	64(%rbp),%xmm12
+	pand	%xmm1,%xmm9
+	movdqa	80(%rbp),%xmm13
+	pand	%xmm2,%xmm10
+	movdqa	96(%rbp),%xmm14
+	pand	%xmm3,%xmm11
+	movdqa	112(%rbp),%xmm15
+	leaq	128(%rbp),%rbp
+	pand	%xmm4,%xmm12
+	pand	%xmm5,%xmm13
+	pand	%xmm6,%xmm14
+	pand	%xmm7,%xmm15
+	por	%xmm10,%xmm8
+	por	%xmm11,%xmm9
+	por	%xmm12,%xmm8
+	por	%xmm13,%xmm9
+	por	%xmm14,%xmm8
+	por	%xmm15,%xmm9
+
+	por	%xmm9,%xmm8
+	pshufd	$0x4e,%xmm8,%xmm9
+	por	%xmm9,%xmm8
+.byte	102,76,15,126,194
+
+.byte	0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00
+	adcxq	%rax,%rbx
+	adoxq	%r9,%r8
+
+	mulxq	8(%rsi),%rax,%r9
+	adcxq	%rax,%r8
+	adoxq	%r10,%r9
+
+	mulxq	16(%rsi),%rax,%r10
+	adcxq	%rax,%r9
+	adoxq	%r11,%r10
+
+.byte	0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+
+	mulxq	32(%rsi),%rax,%r12
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+
+	mulxq	40(%rsi),%rax,%r13
+	adcxq	%rax,%r12
+	adoxq	%r14,%r13
+
+.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
+	adcxq	%rax,%r13
+.byte	0x67
+	adoxq	%r15,%r14
+
+	mulxq	56(%rsi),%rax,%r15
+	movq	%rbx,64(%rsp,%rcx,8)
+	adcxq	%rax,%r14
+	adoxq	%rdi,%r15
+	movq	%r8,%rbx
+	adcxq	%rdi,%r15
+
+	incq	%rcx
+	jnz	.Loop_mulx_gather
+
+	movq	%r8,64(%rsp)
+	movq	%r9,64+8(%rsp)
+	movq	%r10,64+16(%rsp)
+	movq	%r11,64+24(%rsp)
+	movq	%r12,64+32(%rsp)
+	movq	%r13,64+40(%rsp)
+	movq	%r14,64+48(%rsp)
+	movq	%r15,64+56(%rsp)
+
+	movq	128(%rsp),%rdx
+	movq	128+8(%rsp),%rdi
+	movq	128+16(%rsp),%rbp
+
+	movq	(%rsp),%r8
+	movq	8(%rsp),%r9
+	movq	16(%rsp),%r10
+	movq	24(%rsp),%r11
+	movq	32(%rsp),%r12
+	movq	40(%rsp),%r13
+	movq	48(%rsp),%r14
+	movq	56(%rsp),%r15
+
+	call	__rsaz_512_reducex
+
+.Lmul_gather_tail:
+	addq	64(%rsp),%r8
+	adcq	72(%rsp),%r9
+	adcq	80(%rsp),%r10
+	adcq	88(%rsp),%r11
+	adcq	96(%rsp),%r12
+	adcq	104(%rsp),%r13
+	adcq	112(%rsp),%r14
+	adcq	120(%rsp),%r15
+	sbbq	%rcx,%rcx
+
+	call	__rsaz_512_subtract
+
+	leaq	128+24+48(%rsp),%rax
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+.Lmul_gather4_epilogue:
+	.byte	0xf3,0xc3
+.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
+.globl	rsaz_512_mul_scatter4
+.type	rsaz_512_mul_scatter4,@function
+.align	32
+rsaz_512_mul_scatter4:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+
+	movl	%r9d,%r9d
+	subq	$128+24,%rsp
+.Lmul_scatter4_body:
+	leaq	(%r8,%r9,8),%r8
+.byte	102,72,15,110,199
+.byte	102,72,15,110,202
+.byte	102,73,15,110,208
+	movq	%rcx,128(%rsp)
+
+	movq	%rdi,%rbp
+	movl	$0x80100,%r11d
+	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
+	cmpl	$0x80100,%r11d
+	je	.Lmulx_scatter
+	movq	(%rdi),%rbx
+	call	__rsaz_512_mul
+
+.byte	102,72,15,126,199
+.byte	102,72,15,126,205
+
+	movq	(%rsp),%r8
+	movq	8(%rsp),%r9
+	movq	16(%rsp),%r10
+	movq	24(%rsp),%r11
+	movq	32(%rsp),%r12
+	movq	40(%rsp),%r13
+	movq	48(%rsp),%r14
+	movq	56(%rsp),%r15
+
+	call	__rsaz_512_reduce
+	jmp	.Lmul_scatter_tail
+
+.align	32
+.Lmulx_scatter:
+	movq	(%rdi),%rdx
+	call	__rsaz_512_mulx
+
+.byte	102,72,15,126,199
+.byte	102,72,15,126,205
+
+	movq	128(%rsp),%rdx
+	movq	(%rsp),%r8
+	movq	8(%rsp),%r9
+	movq	16(%rsp),%r10
+	movq	24(%rsp),%r11
+	movq	32(%rsp),%r12
+	movq	40(%rsp),%r13
+	movq	48(%rsp),%r14
+	movq	56(%rsp),%r15
+
+	call	__rsaz_512_reducex
+
+.Lmul_scatter_tail:
+	addq	64(%rsp),%r8
+	adcq	72(%rsp),%r9
+	adcq	80(%rsp),%r10
+	adcq	88(%rsp),%r11
+	adcq	96(%rsp),%r12
+	adcq	104(%rsp),%r13
+	adcq	112(%rsp),%r14
+	adcq	120(%rsp),%r15
+.byte	102,72,15,126,214
+	sbbq	%rcx,%rcx
+
+	call	__rsaz_512_subtract
+
+	movq	%r8,0(%rsi)
+	movq	%r9,128(%rsi)
+	movq	%r10,256(%rsi)
+	movq	%r11,384(%rsi)
+	movq	%r12,512(%rsi)
+	movq	%r13,640(%rsi)
+	movq	%r14,768(%rsi)
+	movq	%r15,896(%rsi)
+
+	leaq	128+24+48(%rsp),%rax
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+.Lmul_scatter4_epilogue:
+	.byte	0xf3,0xc3
+.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
+.globl	rsaz_512_mul_by_one
+.type	rsaz_512_mul_by_one,@function
+.align	32
+rsaz_512_mul_by_one:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+
+	subq	$128+24,%rsp
+.Lmul_by_one_body:
+	movl	OPENSSL_ia32cap_P+8(%rip),%eax
+	movq	%rdx,%rbp
+	movq	%rcx,128(%rsp)
+
+	movq	(%rsi),%r8
+	pxor	%xmm0,%xmm0
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+	movq	48(%rsi),%r14
+	movq	56(%rsi),%r15
+
+	movdqa	%xmm0,(%rsp)
+	movdqa	%xmm0,16(%rsp)
+	movdqa	%xmm0,32(%rsp)
+	movdqa	%xmm0,48(%rsp)
+	movdqa	%xmm0,64(%rsp)
+	movdqa	%xmm0,80(%rsp)
+	movdqa	%xmm0,96(%rsp)
+	andl	$0x80100,%eax
+	cmpl	$0x80100,%eax
+	je	.Lby_one_callx
+	call	__rsaz_512_reduce
+	jmp	.Lby_one_tail
+.align	32
+.Lby_one_callx:
+	movq	128(%rsp),%rdx
+	call	__rsaz_512_reducex
+.Lby_one_tail:
+	movq	%r8,(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+	movq	%r14,48(%rdi)
+	movq	%r15,56(%rdi)
+
+	leaq	128+24+48(%rsp),%rax
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+.Lmul_by_one_epilogue:
+	.byte	0xf3,0xc3
+.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
+.type	__rsaz_512_reduce,@function
+.align	32
+__rsaz_512_reduce:
+	movq	%r8,%rbx
+	imulq	128+8(%rsp),%rbx
+	movq	0(%rbp),%rax
+	movl	$8,%ecx
+	jmp	.Lreduction_loop
+
+.align	32
+.Lreduction_loop:
+	mulq	%rbx
+	movq	8(%rbp),%rax
+	negq	%r8
+	movq	%rdx,%r8
+	adcq	$0,%r8
+
+	mulq	%rbx
+	addq	%rax,%r9
+	movq	16(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r9,%r8
+	movq	%rdx,%r9
+	adcq	$0,%r9
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	24(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r9
+	movq	%rdx,%r10
+	adcq	$0,%r10
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	32(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r10
+	movq	128+8(%rsp),%rsi
+
+
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbx
+	addq	%rax,%r12
+	movq	40(%rbp),%rax
+	adcq	$0,%rdx
+	imulq	%r8,%rsi
+	addq	%r12,%r11
+	movq	%rdx,%r12
+	adcq	$0,%r12
+
+	mulq	%rbx
+	addq	%rax,%r13
+	movq	48(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r13,%r12
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%rbx
+	addq	%rax,%r14
+	movq	56(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r14,%r13
+	movq	%rdx,%r14
+	adcq	$0,%r14
+
+	mulq	%rbx
+	movq	%rsi,%rbx
+	addq	%rax,%r15
+	movq	0(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r15,%r14
+	movq	%rdx,%r15
+	adcq	$0,%r15
+
+	decl	%ecx
+	jne	.Lreduction_loop
+
+	.byte	0xf3,0xc3
+.size	__rsaz_512_reduce,.-__rsaz_512_reduce
+.type	__rsaz_512_reducex,@function
+.align	32
+__rsaz_512_reducex:
+
+	imulq	%r8,%rdx
+	xorq	%rsi,%rsi
+	movl	$8,%ecx
+	jmp	.Lreduction_loopx
+
+.align	32
+.Lreduction_loopx:
+	movq	%r8,%rbx
+	mulxq	0(%rbp),%rax,%r8
+	adcxq	%rbx,%rax
+	adoxq	%r9,%r8
+
+	mulxq	8(%rbp),%rax,%r9
+	adcxq	%rax,%r8
+	adoxq	%r10,%r9
+
+	mulxq	16(%rbp),%rbx,%r10
+	adcxq	%rbx,%r9
+	adoxq	%r11,%r10
+
+	mulxq	24(%rbp),%rbx,%r11
+	adcxq	%rbx,%r10
+	adoxq	%r12,%r11
+
+.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
+	movq	%rdx,%rax
+	movq	%r8,%rdx
+	adcxq	%rbx,%r11
+	adoxq	%r13,%r12
+
+	mulxq	128+8(%rsp),%rbx,%rdx
+	movq	%rax,%rdx
+
+	mulxq	40(%rbp),%rax,%r13
+	adcxq	%rax,%r12
+	adoxq	%r14,%r13
+
+.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00
+	adcxq	%rax,%r13
+	adoxq	%r15,%r14
+
+	mulxq	56(%rbp),%rax,%r15
+	movq	%rbx,%rdx
+	adcxq	%rax,%r14
+	adoxq	%rsi,%r15
+	adcxq	%rsi,%r15
+
+	decl	%ecx
+	jne	.Lreduction_loopx
+
+	.byte	0xf3,0xc3
+.size	__rsaz_512_reducex,.-__rsaz_512_reducex
+.type	__rsaz_512_subtract,@function
+.align	32
+__rsaz_512_subtract:
+	movq	%r8,(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+	movq	%r14,48(%rdi)
+	movq	%r15,56(%rdi)
+
+	movq	0(%rbp),%r8
+	movq	8(%rbp),%r9
+	negq	%r8
+	notq	%r9
+	andq	%rcx,%r8
+	movq	16(%rbp),%r10
+	andq	%rcx,%r9
+	notq	%r10
+	movq	24(%rbp),%r11
+	andq	%rcx,%r10
+	notq	%r11
+	movq	32(%rbp),%r12
+	andq	%rcx,%r11
+	notq	%r12
+	movq	40(%rbp),%r13
+	andq	%rcx,%r12
+	notq	%r13
+	movq	48(%rbp),%r14
+	andq	%rcx,%r13
+	notq	%r14
+	movq	56(%rbp),%r15
+	andq	%rcx,%r14
+	notq	%r15
+	andq	%rcx,%r15
+
+	addq	(%rdi),%r8
+	adcq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+
+	movq	%r8,(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+	movq	%r14,48(%rdi)
+	movq	%r15,56(%rdi)
+
+	.byte	0xf3,0xc3
+.size	__rsaz_512_subtract,.-__rsaz_512_subtract
+.type	__rsaz_512_mul,@function
+.align	32
+__rsaz_512_mul:
+	leaq	8(%rsp),%rdi
+
+	movq	(%rsi),%rax
+	mulq	%rbx
+	movq	%rax,(%rdi)
+	movq	8(%rsi),%rax
+	movq	%rdx,%r8
+
+	mulq	%rbx
+	addq	%rax,%r8
+	movq	16(%rsi),%rax
+	movq	%rdx,%r9
+	adcq	$0,%r9
+
+	mulq	%rbx
+	addq	%rax,%r9
+	movq	24(%rsi),%rax
+	movq	%rdx,%r10
+	adcq	$0,%r10
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	32(%rsi),%rax
+	movq	%rdx,%r11
+	adcq	$0,%r11
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	40(%rsi),%rax
+	movq	%rdx,%r12
+	adcq	$0,%r12
+
+	mulq	%rbx
+	addq	%rax,%r12
+	movq	48(%rsi),%rax
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%rbx
+	addq	%rax,%r13
+	movq	56(%rsi),%rax
+	movq	%rdx,%r14
+	adcq	$0,%r14
+
+	mulq	%rbx
+	addq	%rax,%r14
+	movq	(%rsi),%rax
+	movq	%rdx,%r15
+	adcq	$0,%r15
+
+	leaq	8(%rbp),%rbp
+	leaq	8(%rdi),%rdi
+
+	movl	$7,%ecx
+	jmp	.Loop_mul
+
+.align	32
+.Loop_mul:
+	movq	(%rbp),%rbx
+	mulq	%rbx
+	addq	%rax,%r8
+	movq	8(%rsi),%rax
+	movq	%r8,(%rdi)
+	movq	%rdx,%r8
+	adcq	$0,%r8
+
+	mulq	%rbx
+	addq	%rax,%r9
+	movq	16(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r9,%r8
+	movq	%rdx,%r9
+	adcq	$0,%r9
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	24(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r9
+	movq	%rdx,%r10
+	adcq	$0,%r10
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	32(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+
+	mulq	%rbx
+	addq	%rax,%r12
+	movq	40(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r12,%r11
+	movq	%rdx,%r12
+	adcq	$0,%r12
+
+	mulq	%rbx
+	addq	%rax,%r13
+	movq	48(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r13,%r12
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%rbx
+	addq	%rax,%r14
+	movq	56(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r14,%r13
+	movq	%rdx,%r14
+	leaq	8(%rbp),%rbp
+	adcq	$0,%r14
+
+	mulq	%rbx
+	addq	%rax,%r15
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r15,%r14
+	movq	%rdx,%r15
+	adcq	$0,%r15
+
+	leaq	8(%rdi),%rdi
+
+	decl	%ecx
+	jnz	.Loop_mul
+
+	movq	%r8,(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+	movq	%r14,48(%rdi)
+	movq	%r15,56(%rdi)
+
+	.byte	0xf3,0xc3
+.size	__rsaz_512_mul,.-__rsaz_512_mul
+.type	__rsaz_512_mulx,@function
+.align	32
+__rsaz_512_mulx:
+	mulxq	(%rsi),%rbx,%r8
+	movq	$-6,%rcx
+
+	mulxq	8(%rsi),%rax,%r9
+	movq	%rbx,8(%rsp)
+
+	mulxq	16(%rsi),%rbx,%r10
+	adcq	%rax,%r8
+
+	mulxq	24(%rsi),%rax,%r11
+	adcq	%rbx,%r9
+
+	mulxq	32(%rsi),%rbx,%r12
+	adcq	%rax,%r10
+
+	mulxq	40(%rsi),%rax,%r13
+	adcq	%rbx,%r11
+
+	mulxq	48(%rsi),%rbx,%r14
+	adcq	%rax,%r12
+
+	mulxq	56(%rsi),%rax,%r15
+	movq	8(%rbp),%rdx
+	adcq	%rbx,%r13
+	adcq	%rax,%r14
+	adcq	$0,%r15
+
+	xorq	%rdi,%rdi
+	jmp	.Loop_mulx
+
+.align	32
+.Loop_mulx:
+	movq	%r8,%rbx
+	mulxq	(%rsi),%rax,%r8
+	adcxq	%rax,%rbx
+	adoxq	%r9,%r8
+
+	mulxq	8(%rsi),%rax,%r9
+	adcxq	%rax,%r8
+	adoxq	%r10,%r9
+
+	mulxq	16(%rsi),%rax,%r10
+	adcxq	%rax,%r9
+	adoxq	%r11,%r10
+
+	mulxq	24(%rsi),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+
+.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+
+	mulxq	40(%rsi),%rax,%r13
+	adcxq	%rax,%r12
+	adoxq	%r14,%r13
+
+	mulxq	48(%rsi),%rax,%r14
+	adcxq	%rax,%r13
+	adoxq	%r15,%r14
+
+	mulxq	56(%rsi),%rax,%r15
+	movq	64(%rbp,%rcx,8),%rdx
+	movq	%rbx,8+64-8(%rsp,%rcx,8)
+	adcxq	%rax,%r14
+	adoxq	%rdi,%r15
+	adcxq	%rdi,%r15
+
+	incq	%rcx
+	jnz	.Loop_mulx
+
+	movq	%r8,%rbx
+	mulxq	(%rsi),%rax,%r8
+	adcxq	%rax,%rbx
+	adoxq	%r9,%r8
+
+.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00
+	adcxq	%rax,%r8
+	adoxq	%r10,%r9
+
+.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00
+	adcxq	%rax,%r9
+	adoxq	%r11,%r10
+
+	mulxq	24(%rsi),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+
+	mulxq	32(%rsi),%rax,%r12
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+
+	mulxq	40(%rsi),%rax,%r13
+	adcxq	%rax,%r12
+	adoxq	%r14,%r13
+
+.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
+	adcxq	%rax,%r13
+	adoxq	%r15,%r14
+
+.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00
+	adcxq	%rax,%r14
+	adoxq	%rdi,%r15
+	adcxq	%rdi,%r15
+
+	movq	%rbx,8+64-8(%rsp)
+	movq	%r8,8+64(%rsp)
+	movq	%r9,8+64+8(%rsp)
+	movq	%r10,8+64+16(%rsp)
+	movq	%r11,8+64+24(%rsp)
+	movq	%r12,8+64+32(%rsp)
+	movq	%r13,8+64+40(%rsp)
+	movq	%r14,8+64+48(%rsp)
+	movq	%r15,8+64+56(%rsp)
+
+	.byte	0xf3,0xc3
+.size	__rsaz_512_mulx,.-__rsaz_512_mulx
+.globl	rsaz_512_scatter4
+.type	rsaz_512_scatter4,@function
+.align	16
+rsaz_512_scatter4:
+	leaq	(%rdi,%rdx,8),%rdi
+	movl	$8,%r9d
+	jmp	.Loop_scatter
+.align	16
+.Loop_scatter:
+	movq	(%rsi),%rax
+	leaq	8(%rsi),%rsi
+	movq	%rax,(%rdi)
+	leaq	128(%rdi),%rdi
+	decl	%r9d
+	jnz	.Loop_scatter
+	.byte	0xf3,0xc3
+.size	rsaz_512_scatter4,.-rsaz_512_scatter4
+
+.globl	rsaz_512_gather4
+.type	rsaz_512_gather4,@function
+.align	16
+rsaz_512_gather4:
+	movd	%edx,%xmm8
+	movdqa	.Linc+16(%rip),%xmm1
+	movdqa	.Linc(%rip),%xmm0
+
+	pshufd	$0,%xmm8,%xmm8
+	movdqa	%xmm1,%xmm7
+	movdqa	%xmm1,%xmm2
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm8,%xmm0
+	movdqa	%xmm7,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm8,%xmm1
+	movdqa	%xmm7,%xmm4
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm8,%xmm2
+	movdqa	%xmm7,%xmm5
+	paddd	%xmm3,%xmm4
+	pcmpeqd	%xmm8,%xmm3
+	movdqa	%xmm7,%xmm6
+	paddd	%xmm4,%xmm5
+	pcmpeqd	%xmm8,%xmm4
+	paddd	%xmm5,%xmm6
+	pcmpeqd	%xmm8,%xmm5
+	paddd	%xmm6,%xmm7
+	pcmpeqd	%xmm8,%xmm6
+	pcmpeqd	%xmm8,%xmm7
+	movl	$8,%r9d
+	jmp	.Loop_gather
+.align	16
+.Loop_gather:
+	movdqa	0(%rsi),%xmm8
+	movdqa	16(%rsi),%xmm9
+	movdqa	32(%rsi),%xmm10
+	movdqa	48(%rsi),%xmm11
+	pand	%xmm0,%xmm8
+	movdqa	64(%rsi),%xmm12
+	pand	%xmm1,%xmm9
+	movdqa	80(%rsi),%xmm13
+	pand	%xmm2,%xmm10
+	movdqa	96(%rsi),%xmm14
+	pand	%xmm3,%xmm11
+	movdqa	112(%rsi),%xmm15
+	leaq	128(%rsi),%rsi
+	pand	%xmm4,%xmm12
+	pand	%xmm5,%xmm13
+	pand	%xmm6,%xmm14
+	pand	%xmm7,%xmm15
+	por	%xmm10,%xmm8
+	por	%xmm11,%xmm9
+	por	%xmm12,%xmm8
+	por	%xmm13,%xmm9
+	por	%xmm14,%xmm8
+	por	%xmm15,%xmm9
+
+	por	%xmm9,%xmm8
+	pshufd	$0x4e,%xmm8,%xmm9
+	por	%xmm9,%xmm8
+	movq	%xmm8,(%rdi)
+	leaq	8(%rdi),%rdi
+	decl	%r9d
+	jnz	.Loop_gather
+	.byte	0xf3,0xc3
+.LSEH_end_rsaz_512_gather4:
+.size	rsaz_512_gather4,.-rsaz_512_gather4
+
+.align	64
+.Linc:
+.long	0,0, 1,1
+.long	2,2, 2,2


Property changes on: trunk/secure/lib/libcrypto/amd64/rsaz-x86_64.S
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S	                        (rev 0)
+++ trunk/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,7224 @@
+/* $MidnightBSD$ */
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from sha1-mb-x86_64.pl. */
+.text	
+
+
+
+.globl	sha1_multi_block
+.type	sha1_multi_block,@function
+.align	32
+sha1_multi_block:
+	movq	OPENSSL_ia32cap_P+4(%rip),%rcx
+	btq	$61,%rcx
+	jc	_shaext_shortcut
+	testl	$268435456,%ecx
+	jnz	_avx_shortcut
+	movq	%rsp,%rax
+	pushq	%rbx
+	pushq	%rbp
+	subq	$288,%rsp
+	andq	$-256,%rsp
+	movq	%rax,272(%rsp)
+.Lbody:
+	leaq	K_XX_XX(%rip),%rbp
+	leaq	256(%rsp),%rbx
+
+.Loop_grande:
+	movl	%edx,280(%rsp)
+	xorl	%edx,%edx
+	movq	0(%rsi),%r8
+	movl	8(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,0(%rbx)
+	cmovleq	%rbp,%r8
+	movq	16(%rsi),%r9
+	movl	24(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,4(%rbx)
+	cmovleq	%rbp,%r9
+	movq	32(%rsi),%r10
+	movl	40(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,8(%rbx)
+	cmovleq	%rbp,%r10
+	movq	48(%rsi),%r11
+	movl	56(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,12(%rbx)
+	cmovleq	%rbp,%r11
+	testl	%edx,%edx
+	jz	.Ldone
+
+	movdqu	0(%rdi),%xmm10
+	leaq	128(%rsp),%rax
+	movdqu	32(%rdi),%xmm11
+	movdqu	64(%rdi),%xmm12
+	movdqu	96(%rdi),%xmm13
+	movdqu	128(%rdi),%xmm14
+	movdqa	96(%rbp),%xmm5
+	movdqa	-32(%rbp),%xmm15
+	jmp	.Loop
+
+.align	32
+.Loop:
+	movd	(%r8),%xmm0
+	leaq	64(%r8),%r8
+	movd	(%r9),%xmm2
+	leaq	64(%r9),%r9
+	movd	(%r10),%xmm3
+	leaq	64(%r10),%r10
+	movd	(%r11),%xmm4
+	leaq	64(%r11),%r11
+	punpckldq	%xmm3,%xmm0
+	movd	-60(%r8),%xmm1
+	punpckldq	%xmm4,%xmm2
+	movd	-60(%r9),%xmm9
+	punpckldq	%xmm2,%xmm0
+	movd	-60(%r10),%xmm8
+.byte	102,15,56,0,197
+	movd	-60(%r11),%xmm7
+	punpckldq	%xmm8,%xmm1
+	movdqa	%xmm10,%xmm8
+	paddd	%xmm15,%xmm14
+	punpckldq	%xmm7,%xmm9
+	movdqa	%xmm11,%xmm7
+	movdqa	%xmm11,%xmm6
+	pslld	$5,%xmm8
+	pandn	%xmm13,%xmm7
+	pand	%xmm12,%xmm6
+	punpckldq	%xmm9,%xmm1
+	movdqa	%xmm10,%xmm9
+
+	movdqa	%xmm0,0-128(%rax)
+	paddd	%xmm0,%xmm14
+	movd	-56(%r8),%xmm2
+	psrld	$27,%xmm9
+	pxor	%xmm7,%xmm6
+	movdqa	%xmm11,%xmm7
+
+	por	%xmm9,%xmm8
+	movd	-56(%r9),%xmm9
+	pslld	$30,%xmm7
+	paddd	%xmm6,%xmm14
+
+	psrld	$2,%xmm11
+	paddd	%xmm8,%xmm14
+.byte	102,15,56,0,205
+	movd	-56(%r10),%xmm8
+	por	%xmm7,%xmm11
+	movd	-56(%r11),%xmm7
+	punpckldq	%xmm8,%xmm2
+	movdqa	%xmm14,%xmm8
+	paddd	%xmm15,%xmm13
+	punpckldq	%xmm7,%xmm9
+	movdqa	%xmm10,%xmm7
+	movdqa	%xmm10,%xmm6
+	pslld	$5,%xmm8
+	pandn	%xmm12,%xmm7
+	pand	%xmm11,%xmm6
+	punpckldq	%xmm9,%xmm2
+	movdqa	%xmm14,%xmm9
+
+	movdqa	%xmm1,16-128(%rax)
+	paddd	%xmm1,%xmm13
+	movd	-52(%r8),%xmm3
+	psrld	$27,%xmm9
+	pxor	%xmm7,%xmm6
+	movdqa	%xmm10,%xmm7
+
+	por	%xmm9,%xmm8
+	movd	-52(%r9),%xmm9
+	pslld	$30,%xmm7
+	paddd	%xmm6,%xmm13
+
+	psrld	$2,%xmm10
+	paddd	%xmm8,%xmm13
+.byte	102,15,56,0,213
+	movd	-52(%r10),%xmm8
+	por	%xmm7,%xmm10
+	movd	-52(%r11),%xmm7
+	punpckldq	%xmm8,%xmm3
+	movdqa	%xmm13,%xmm8
+	paddd	%xmm15,%xmm12
+	punpckldq	%xmm7,%xmm9
+	movdqa	%xmm14,%xmm7
+	movdqa	%xmm14,%xmm6
+	pslld	$5,%xmm8
+	pandn	%xmm11,%xmm7
+	pand	%xmm10,%xmm6
+	punpckldq	%xmm9,%xmm3
+	movdqa	%xmm13,%xmm9
+
+	movdqa	%xmm2,32-128(%rax)
+	paddd	%xmm2,%xmm12
+	movd	-48(%r8),%xmm4
+	psrld	$27,%xmm9
+	pxor	%xmm7,%xmm6
+	movdqa	%xmm14,%xmm7
+
+	por	%xmm9,%xmm8
+	movd	-48(%r9),%xmm9
+	pslld	$30,%xmm7
+	paddd	%xmm6,%xmm12
+
+	psrld	$2,%xmm14
+	paddd	%xmm8,%xmm12
+.byte	102,15,56,0,221
+	movd	-48(%r10),%xmm8
+	por	%xmm7,%xmm14
+	movd	-48(%r11),%xmm7
+	punpckldq	%xmm8,%xmm4
+	movdqa	%xmm12,%xmm8
+	paddd	%xmm15,%xmm11
+	punpckldq	%xmm7,%xmm9
+	movdqa	%xmm13,%xmm7
+	movdqa	%xmm13,%xmm6
+	pslld	$5,%xmm8
+	pandn	%xmm10,%xmm7
+	pand	%xmm14,%xmm6
+	punpckldq	%xmm9,%xmm4
+	movdqa	%xmm12,%xmm9
+
+	movdqa	%xmm3,48-128(%rax)
+	paddd	%xmm3,%xmm11
+	movd	-44(%r8),%xmm0
+	psrld	$27,%xmm9
+	pxor	%xmm7,%xmm6
+	movdqa	%xmm13,%xmm7
+
+	por	%xmm9,%xmm8
+	movd	-44(%r9),%xmm9
+	pslld	$30,%xmm7
+	paddd	%xmm6,%xmm11
+
+	psrld	$2,%xmm13
+	paddd	%xmm8,%xmm11
+.byte	102,15,56,0,229
+	movd	-44(%r10),%xmm8
+	por	%xmm7,%xmm13
+	movd	-44(%r11),%xmm7
+	punpckldq	%xmm8,%xmm0
+	movdqa	%xmm11,%xmm8
+	paddd	%xmm15,%xmm10
+	punpckldq	%xmm7,%xmm9
+	movdqa	%xmm12,%xmm7
+	movdqa	%xmm12,%xmm6
+	pslld	$5,%xmm8
+	pandn	%xmm14,%xmm7
+	pand	%xmm13,%xmm6
+	punpckldq	%xmm9,%xmm0
+	movdqa	%xmm11,%xmm9
+
+	movdqa	%xmm4,64-128(%rax)
+	paddd	%xmm4,%xmm10
+	movd	-40(%r8),%xmm1
+	psrld	$27,%xmm9
+	pxor	%xmm7,%xmm6
+	movdqa	%xmm12,%xmm7
+
+	por	%xmm9,%xmm8
+	movd	-40(%r9),%xmm9
+	pslld	$30,%xmm7
+	paddd	%xmm6,%xmm10
+
+	psrld	$2,%xmm12
+	paddd	%xmm8,%xmm10
+.byte	102,15,56,0,197
+	movd	-40(%r10),%xmm8
+	por	%xmm7,%xmm12
+	movd	-40(%r11),%xmm7
+	punpckldq	%xmm8,%xmm1
+	movdqa	%xmm10,%xmm8
+	paddd	%xmm15,%xmm14
+	punpckldq	%xmm7,%xmm9
+	movdqa	%xmm11,%xmm7
+	movdqa	%xmm11,%xmm6
+	pslld	$5,%xmm8
+	pandn	%xmm13,%xmm7
+	pand	%xmm12,%xmm6
+	punpckldq	%xmm9,%xmm1
+	movdqa	%xmm10,%xmm9
+
+	movdqa	%xmm0,80-128(%rax)
+	paddd	%xmm0,%xmm14
+	movd	-36(%r8),%xmm2
+	psrld	$27,%xmm9
+	pxor	%xmm7,%xmm6
+	movdqa	%xmm11,%xmm7
+
+	por	%xmm9,%xmm8
+	movd	-36(%r9),%xmm9
+	pslld	$30,%xmm7
+	paddd	%xmm6,%xmm14
+
+	psrld	$2,%xmm11
+	paddd	%xmm8,%xmm14
+.byte	102,15,56,0,205
+	movd	-36(%r10),%xmm8
+	por	%xmm7,%xmm11
+	movd	-36(%r11),%xmm7
+	punpckldq	%xmm8,%xmm2
+	movdqa	%xmm14,%xmm8
+	paddd	%xmm15,%xmm13
+	punpckldq	%xmm7,%xmm9
+	movdqa	%xmm10,%xmm7
+	movdqa	%xmm10,%xmm6
+	pslld	$5,%xmm8
+	pandn	%xmm12,%xmm7
+	pand	%xmm11,%xmm6
+	punpckldq	%xmm9,%xmm2
+	movdqa	%xmm14,%xmm9
+
+	movdqa	%xmm1,96-128(%rax)
+	paddd	%xmm1,%xmm13
+	movd	-32(%r8),%xmm3
+	psrld	$27,%xmm9
+	pxor	%xmm7,%xmm6
+	movdqa	%xmm10,%xmm7
+
+	por	%xmm9,%xmm8
+	movd	-32(%r9),%xmm9
+	pslld	$30,%xmm7
+	paddd	%xmm6,%xmm13
+
+	psrld	$2,%xmm10
+	paddd	%xmm8,%xmm13
+.byte	102,15,56,0,213
+	movd	-32(%r10),%xmm8
+	por	%xmm7,%xmm10
+	movd	-32(%r11),%xmm7
+	punpckldq	%xmm8,%xmm3
+	movdqa	%xmm13,%xmm8
+	paddd	%xmm15,%xmm12
+	punpckldq	%xmm7,%xmm9
+	movdqa	%xmm14,%xmm7
+	movdqa	%xmm14,%xmm6
+	pslld	$5,%xmm8
+	pandn	%xmm11,%xmm7
+	pand	%xmm10,%xmm6
+	punpckldq	%xmm9,%xmm3
+	movdqa	%xmm13,%xmm9
+
+	movdqa	%xmm2,112-128(%rax)
+	paddd	%xmm2,%xmm12
+	movd	-28(%r8),%xmm4
+	psrld	$27,%xmm9
+	pxor	%xmm7,%xmm6
+	movdqa	%xmm14,%xmm7
+
+	por	%xmm9,%xmm8
+	movd	-28(%r9),%xmm9
+	pslld	$30,%xmm7
+	paddd	%xmm6,%xmm12
+
+	psrld	$2,%xmm14
+	paddd	%xmm8,%xmm12
+.byte	102,15,56,0,221
+	movd	-28(%r10),%xmm8
+	por	%xmm7,%xmm14
+	movd	-28(%r11),%xmm7
+	punpckldq	%xmm8,%xmm4
+	movdqa	%xmm12,%xmm8
+	paddd	%xmm15,%xmm11
+	punpckldq	%xmm7,%xmm9
+	movdqa	%xmm13,%xmm7
+	movdqa	%xmm13,%xmm6
+	pslld	$5,%xmm8
+	pandn	%xmm10,%xmm7
+	pand	%xmm14,%xmm6
+	punpckldq	%xmm9,%xmm4
+	movdqa	%xmm12,%xmm9
+
+	movdqa	%xmm3,128-128(%rax)
+	paddd	%xmm3,%xmm11
+	movd	-24(%r8),%xmm0
+	psrld	$27,%xmm9
+	pxor	%xmm7,%xmm6
+	movdqa	%xmm13,%xmm7
+
+	por	%xmm9,%xmm8
+	movd	-24(%r9),%xmm9
+	pslld	$30,%xmm7
+	paddd	%xmm6,%xmm11
+
+	psrld	$2,%xmm13
+	paddd	%xmm8,%xmm11
+.byte	102,15,56,0,229
+	movd	-24(%r10),%xmm8
+	por	%xmm7,%xmm13
+	movd	-24(%r11),%xmm7
+	punpckldq	%xmm8,%xmm0
+	movdqa	%xmm11,%xmm8
+	paddd	%xmm15,%xmm10
+	punpckldq	%xmm7,%xmm9
+	movdqa	%xmm12,%xmm7
+	movdqa	%xmm12,%xmm6
+	pslld	$5,%xmm8
+	pandn	%xmm14,%xmm7
+	pand	%xmm13,%xmm6
+	punpckldq	%xmm9,%xmm0
+	movdqa	%xmm11,%xmm9
+
+	movdqa	%xmm4,144-128(%rax)
+	paddd	%xmm4,%xmm10
+	movd	-20(%r8),%xmm1
+	psrld	$27,%xmm9
+	pxor	%xmm7,%xmm6
+	movdqa	%xmm12,%xmm7
+
+	por	%xmm9,%xmm8
+	movd	-20(%r9),%xmm9
+	pslld	$30,%xmm7
+	paddd	%xmm6,%xmm10
+
+	psrld	$2,%xmm12
+	paddd	%xmm8,%xmm10
+.byte	102,15,56,0,197
+	movd	-20(%r10),%xmm8
+	por	%xmm7,%xmm12
+	movd	-20(%r11),%xmm7
+	punpckldq	%xmm8,%xmm1
+	movdqa	%xmm10,%xmm8
+	paddd	%xmm15,%xmm14
+	punpckldq	%xmm7,%xmm9
+	movdqa	%xmm11,%xmm7
+	movdqa	%xmm11,%xmm6
+	pslld	$5,%xmm8
+	pandn	%xmm13,%xmm7
+	pand	%xmm12,%xmm6
+	punpckldq	%xmm9,%xmm1
+	movdqa	%xmm10,%xmm9
+
+	movdqa	%xmm0,160-128(%rax)
+	paddd	%xmm0,%xmm14
+	movd	-16(%r8),%xmm2
+	psrld	$27,%xmm9
+	pxor	%xmm7,%xmm6
+	movdqa	%xmm11,%xmm7
+
+	por	%xmm9,%xmm8
+	movd	-16(%r9),%xmm9
+	pslld	$30,%xmm7
+	paddd	%xmm6,%xmm14
+
+	psrld	$2,%xmm11
+	paddd	%xmm8,%xmm14
+.byte	102,15,56,0,205
+	movd	-16(%r10),%xmm8
+	por	%xmm7,%xmm11
+	movd	-16(%r11),%xmm7
+	punpckldq	%xmm8,%xmm2
+	movdqa	%xmm14,%xmm8
+	paddd	%xmm15,%xmm13
+	punpckldq	%xmm7,%xmm9
+	movdqa	%xmm10,%xmm7
+	movdqa	%xmm10,%xmm6
+	pslld	$5,%xmm8
+	pandn	%xmm12,%xmm7
+	pand	%xmm11,%xmm6
+	punpckldq	%xmm9,%xmm2
+	movdqa	%xmm14,%xmm9
+
+	movdqa	%xmm1,176-128(%rax)
+	paddd	%xmm1,%xmm13
+	movd	-12(%r8),%xmm3
+	psrld	$27,%xmm9
+	pxor	%xmm7,%xmm6
+	movdqa	%xmm10,%xmm7
+
+	por	%xmm9,%xmm8
+	movd	-12(%r9),%xmm9
+	pslld	$30,%xmm7
+	paddd	%xmm6,%xmm13
+
+	psrld	$2,%xmm10
+	paddd	%xmm8,%xmm13
+.byte	102,15,56,0,213
+	movd	-12(%r10),%xmm8
+	por	%xmm7,%xmm10
+	movd	-12(%r11),%xmm7
+	punpckldq	%xmm8,%xmm3
+	movdqa	%xmm13,%xmm8
+	paddd	%xmm15,%xmm12
+	punpckldq	%xmm7,%xmm9
+	movdqa	%xmm14,%xmm7
+	movdqa	%xmm14,%xmm6
+	pslld	$5,%xmm8
+	pandn	%xmm11,%xmm7
+	pand	%xmm10,%xmm6
+	punpckldq	%xmm9,%xmm3
+	movdqa	%xmm13,%xmm9
+
+	movdqa	%xmm2,192-128(%rax)
+	paddd	%xmm2,%xmm12
+	movd	-8(%r8),%xmm4
+	psrld	$27,%xmm9
+	pxor	%xmm7,%xmm6
+	movdqa	%xmm14,%xmm7
+
+	por	%xmm9,%xmm8
+	movd	-8(%r9),%xmm9
+	pslld	$30,%xmm7
+	paddd	%xmm6,%xmm12
+
+	psrld	$2,%xmm14
+	paddd	%xmm8,%xmm12
+.byte	102,15,56,0,221
+	movd	-8(%r10),%xmm8
+	por	%xmm7,%xmm14
+	movd	-8(%r11),%xmm7
+	punpckldq	%xmm8,%xmm4
+	movdqa	%xmm12,%xmm8
+	paddd	%xmm15,%xmm11
+	punpckldq	%xmm7,%xmm9
+	movdqa	%xmm13,%xmm7
+	movdqa	%xmm13,%xmm6
+	pslld	$5,%xmm8
+	pandn	%xmm10,%xmm7
+	pand	%xmm14,%xmm6
+	punpckldq	%xmm9,%xmm4
+	movdqa	%xmm12,%xmm9
+
+	movdqa	%xmm3,208-128(%rax)
+	paddd	%xmm3,%xmm11
+	movd	-4(%r8),%xmm0
+	psrld	$27,%xmm9
+	pxor	%xmm7,%xmm6
+	movdqa	%xmm13,%xmm7
+
+	por	%xmm9,%xmm8
+	movd	-4(%r9),%xmm9
+	pslld	$30,%xmm7
+	paddd	%xmm6,%xmm11
+
+	psrld	$2,%xmm13
+	paddd	%xmm8,%xmm11
+.byte	102,15,56,0,229
+	movd	-4(%r10),%xmm8
+	por	%xmm7,%xmm13
+	movdqa	0-128(%rax),%xmm1
+	movd	-4(%r11),%xmm7
+	punpckldq	%xmm8,%xmm0
+	movdqa	%xmm11,%xmm8
+	paddd	%xmm15,%xmm10
+	punpckldq	%xmm7,%xmm9
+	movdqa	%xmm12,%xmm7
+	movdqa	%xmm12,%xmm6
+	pslld	$5,%xmm8
+	prefetcht0	63(%r8)
+	pandn	%xmm14,%xmm7
+	pand	%xmm13,%xmm6
+	punpckldq	%xmm9,%xmm0
+	movdqa	%xmm11,%xmm9
+
+	movdqa	%xmm4,224-128(%rax)
+	paddd	%xmm4,%xmm10
+	psrld	$27,%xmm9
+	pxor	%xmm7,%xmm6
+	movdqa	%xmm12,%xmm7
+	prefetcht0	63(%r9)
+
+	por	%xmm9,%xmm8
+	pslld	$30,%xmm7
+	paddd	%xmm6,%xmm10
+	prefetcht0	63(%r10)
+
+	psrld	$2,%xmm12
+	paddd	%xmm8,%xmm10
+.byte	102,15,56,0,197
+	prefetcht0	63(%r11)
+	por	%xmm7,%xmm12
+	movdqa	16-128(%rax),%xmm2
+	pxor	%xmm3,%xmm1
+	movdqa	32-128(%rax),%xmm3
+
+	movdqa	%xmm10,%xmm8
+	pxor	128-128(%rax),%xmm1
+	paddd	%xmm15,%xmm14
+	movdqa	%xmm11,%xmm7
+	pslld	$5,%xmm8
+	pxor	%xmm3,%xmm1
+	movdqa	%xmm11,%xmm6
+	pandn	%xmm13,%xmm7
+	movdqa	%xmm1,%xmm5
+	pand	%xmm12,%xmm6
+	movdqa	%xmm10,%xmm9
+	psrld	$31,%xmm5
+	paddd	%xmm1,%xmm1
+
+	movdqa	%xmm0,240-128(%rax)
+	paddd	%xmm0,%xmm14
+	psrld	$27,%xmm9
+	pxor	%xmm7,%xmm6
+
+	movdqa	%xmm11,%xmm7
+	por	%xmm9,%xmm8
+	pslld	$30,%xmm7
+	paddd	%xmm6,%xmm14
+
+	psrld	$2,%xmm11
+	paddd	%xmm8,%xmm14
+	por	%xmm5,%xmm1
+	por	%xmm7,%xmm11
+	pxor	%xmm4,%xmm2
+	movdqa	48-128(%rax),%xmm4
+
+	movdqa	%xmm14,%xmm8
+	pxor	144-128(%rax),%xmm2
+	paddd	%xmm15,%xmm13
+	movdqa	%xmm10,%xmm7
+	pslld	$5,%xmm8
+	pxor	%xmm4,%xmm2
+	movdqa	%xmm10,%xmm6
+	pandn	%xmm12,%xmm7
+	movdqa	%xmm2,%xmm5
+	pand	%xmm11,%xmm6
+	movdqa	%xmm14,%xmm9
+	psrld	$31,%xmm5
+	paddd	%xmm2,%xmm2
+
+	movdqa	%xmm1,0-128(%rax)
+	paddd	%xmm1,%xmm13
+	psrld	$27,%xmm9
+	pxor	%xmm7,%xmm6
+
+	movdqa	%xmm10,%xmm7
+	por	%xmm9,%xmm8
+	pslld	$30,%xmm7
+	paddd	%xmm6,%xmm13
+
+	psrld	$2,%xmm10
+	paddd	%xmm8,%xmm13
+	por	%xmm5,%xmm2
+	por	%xmm7,%xmm10
+	pxor	%xmm0,%xmm3
+	movdqa	64-128(%rax),%xmm0
+
+	movdqa	%xmm13,%xmm8
+	pxor	160-128(%rax),%xmm3
+	paddd	%xmm15,%xmm12
+	movdqa	%xmm14,%xmm7
+	pslld	$5,%xmm8
+	pxor	%xmm0,%xmm3
+	movdqa	%xmm14,%xmm6
+	pandn	%xmm11,%xmm7
+	movdqa	%xmm3,%xmm5
+	pand	%xmm10,%xmm6
+	movdqa	%xmm13,%xmm9
+	psrld	$31,%xmm5
+	paddd	%xmm3,%xmm3
+
+	movdqa	%xmm2,16-128(%rax)
+	paddd	%xmm2,%xmm12
+	psrld	$27,%xmm9
+	pxor	%xmm7,%xmm6
+
+	movdqa	%xmm14,%xmm7
+	por	%xmm9,%xmm8
+	pslld	$30,%xmm7
+	paddd	%xmm6,%xmm12
+
+	psrld	$2,%xmm14
+	paddd	%xmm8,%xmm12
+	por	%xmm5,%xmm3
+	por	%xmm7,%xmm14
+	pxor	%xmm1,%xmm4
+	movdqa	80-128(%rax),%xmm1
+
+	movdqa	%xmm12,%xmm8
+	pxor	176-128(%rax),%xmm4
+	paddd	%xmm15,%xmm11
+	movdqa	%xmm13,%xmm7
+	pslld	$5,%xmm8
+	pxor	%xmm1,%xmm4
+	movdqa	%xmm13,%xmm6
+	pandn	%xmm10,%xmm7
+	movdqa	%xmm4,%xmm5
+	pand	%xmm14,%xmm6
+	movdqa	%xmm12,%xmm9
+	psrld	$31,%xmm5
+	paddd	%xmm4,%xmm4
+
+	movdqa	%xmm3,32-128(%rax)
+	paddd	%xmm3,%xmm11
+	psrld	$27,%xmm9
+	pxor	%xmm7,%xmm6
+
+	movdqa	%xmm13,%xmm7
+	por	%xmm9,%xmm8
+	pslld	$30,%xmm7
+	paddd	%xmm6,%xmm11
+
+	psrld	$2,%xmm13
+	paddd	%xmm8,%xmm11
+	por	%xmm5,%xmm4
+	por	%xmm7,%xmm13
+	pxor	%xmm2,%xmm0
+	movdqa	96-128(%rax),%xmm2
+
+	movdqa	%xmm11,%xmm8
+	pxor	192-128(%rax),%xmm0
+	paddd	%xmm15,%xmm10
+	movdqa	%xmm12,%xmm7
+	pslld	$5,%xmm8
+	pxor	%xmm2,%xmm0
+	movdqa	%xmm12,%xmm6
+	pandn	%xmm14,%xmm7
+	movdqa	%xmm0,%xmm5
+	pand	%xmm13,%xmm6
+	movdqa	%xmm11,%xmm9
+	psrld	$31,%xmm5
+	paddd	%xmm0,%xmm0
+
+	movdqa	%xmm4,48-128(%rax)
+	paddd	%xmm4,%xmm10
+	psrld	$27,%xmm9
+	pxor	%xmm7,%xmm6
+
+	movdqa	%xmm12,%xmm7
+	por	%xmm9,%xmm8
+	pslld	$30,%xmm7
+	paddd	%xmm6,%xmm10
+
+	psrld	$2,%xmm12
+	paddd	%xmm8,%xmm10
+	por	%xmm5,%xmm0
+	por	%xmm7,%xmm12
+	movdqa	0(%rbp),%xmm15
+	pxor	%xmm3,%xmm1
+	movdqa	112-128(%rax),%xmm3
+
+	movdqa	%xmm10,%xmm8
+	movdqa	%xmm13,%xmm6
+	pxor	208-128(%rax),%xmm1
+	paddd	%xmm15,%xmm14
+	pslld	$5,%xmm8
+	pxor	%xmm11,%xmm6
+
+	movdqa	%xmm10,%xmm9
+	movdqa	%xmm0,64-128(%rax)
+	paddd	%xmm0,%xmm14
+	pxor	%xmm3,%xmm1
+	psrld	$27,%xmm9
+	pxor	%xmm12,%xmm6
+	movdqa	%xmm11,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm1,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm14
+	paddd	%xmm1,%xmm1
+
+	psrld	$2,%xmm11
+	paddd	%xmm8,%xmm14
+	por	%xmm5,%xmm1
+	por	%xmm7,%xmm11
+	pxor	%xmm4,%xmm2
+	movdqa	128-128(%rax),%xmm4
+
+	movdqa	%xmm14,%xmm8
+	movdqa	%xmm12,%xmm6
+	pxor	224-128(%rax),%xmm2
+	paddd	%xmm15,%xmm13
+	pslld	$5,%xmm8
+	pxor	%xmm10,%xmm6
+
+	movdqa	%xmm14,%xmm9
+	movdqa	%xmm1,80-128(%rax)
+	paddd	%xmm1,%xmm13
+	pxor	%xmm4,%xmm2
+	psrld	$27,%xmm9
+	pxor	%xmm11,%xmm6
+	movdqa	%xmm10,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm2,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm13
+	paddd	%xmm2,%xmm2
+
+	psrld	$2,%xmm10
+	paddd	%xmm8,%xmm13
+	por	%xmm5,%xmm2
+	por	%xmm7,%xmm10
+	pxor	%xmm0,%xmm3
+	movdqa	144-128(%rax),%xmm0
+
+	movdqa	%xmm13,%xmm8
+	movdqa	%xmm11,%xmm6
+	pxor	240-128(%rax),%xmm3
+	paddd	%xmm15,%xmm12
+	pslld	$5,%xmm8
+	pxor	%xmm14,%xmm6
+
+	movdqa	%xmm13,%xmm9
+	movdqa	%xmm2,96-128(%rax)
+	paddd	%xmm2,%xmm12
+	pxor	%xmm0,%xmm3
+	psrld	$27,%xmm9
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm14,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm3,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm12
+	paddd	%xmm3,%xmm3
+
+	psrld	$2,%xmm14
+	paddd	%xmm8,%xmm12
+	por	%xmm5,%xmm3
+	por	%xmm7,%xmm14
+	pxor	%xmm1,%xmm4
+	movdqa	160-128(%rax),%xmm1
+
+	movdqa	%xmm12,%xmm8
+	movdqa	%xmm10,%xmm6
+	pxor	0-128(%rax),%xmm4
+	paddd	%xmm15,%xmm11
+	pslld	$5,%xmm8
+	pxor	%xmm13,%xmm6
+
+	movdqa	%xmm12,%xmm9
+	movdqa	%xmm3,112-128(%rax)
+	paddd	%xmm3,%xmm11
+	pxor	%xmm1,%xmm4
+	psrld	$27,%xmm9
+	pxor	%xmm14,%xmm6
+	movdqa	%xmm13,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm4,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm11
+	paddd	%xmm4,%xmm4
+
+	psrld	$2,%xmm13
+	paddd	%xmm8,%xmm11
+	por	%xmm5,%xmm4
+	por	%xmm7,%xmm13
+	pxor	%xmm2,%xmm0
+	movdqa	176-128(%rax),%xmm2
+
+	movdqa	%xmm11,%xmm8
+	movdqa	%xmm14,%xmm6
+	pxor	16-128(%rax),%xmm0
+	paddd	%xmm15,%xmm10
+	pslld	$5,%xmm8
+	pxor	%xmm12,%xmm6
+
+	movdqa	%xmm11,%xmm9
+	movdqa	%xmm4,128-128(%rax)
+	paddd	%xmm4,%xmm10
+	pxor	%xmm2,%xmm0
+	psrld	$27,%xmm9
+	pxor	%xmm13,%xmm6
+	movdqa	%xmm12,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm0,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm10
+	paddd	%xmm0,%xmm0
+
+	psrld	$2,%xmm12
+	paddd	%xmm8,%xmm10
+	por	%xmm5,%xmm0
+	por	%xmm7,%xmm12
+	pxor	%xmm3,%xmm1
+	movdqa	192-128(%rax),%xmm3
+
+	movdqa	%xmm10,%xmm8
+	movdqa	%xmm13,%xmm6
+	pxor	32-128(%rax),%xmm1
+	paddd	%xmm15,%xmm14
+	pslld	$5,%xmm8
+	pxor	%xmm11,%xmm6
+
+	movdqa	%xmm10,%xmm9
+	movdqa	%xmm0,144-128(%rax)
+	paddd	%xmm0,%xmm14
+	pxor	%xmm3,%xmm1
+	psrld	$27,%xmm9
+	pxor	%xmm12,%xmm6
+	movdqa	%xmm11,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm1,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm14
+	paddd	%xmm1,%xmm1
+
+	psrld	$2,%xmm11
+	paddd	%xmm8,%xmm14
+	por	%xmm5,%xmm1
+	por	%xmm7,%xmm11
+	pxor	%xmm4,%xmm2
+	movdqa	208-128(%rax),%xmm4
+
+	movdqa	%xmm14,%xmm8
+	movdqa	%xmm12,%xmm6
+	pxor	48-128(%rax),%xmm2
+	paddd	%xmm15,%xmm13
+	pslld	$5,%xmm8
+	pxor	%xmm10,%xmm6
+
+	movdqa	%xmm14,%xmm9
+	movdqa	%xmm1,160-128(%rax)
+	paddd	%xmm1,%xmm13
+	pxor	%xmm4,%xmm2
+	psrld	$27,%xmm9
+	pxor	%xmm11,%xmm6
+	movdqa	%xmm10,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm2,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm13
+	paddd	%xmm2,%xmm2
+
+	psrld	$2,%xmm10
+	paddd	%xmm8,%xmm13
+	por	%xmm5,%xmm2
+	por	%xmm7,%xmm10
+	pxor	%xmm0,%xmm3
+	movdqa	224-128(%rax),%xmm0
+
+	movdqa	%xmm13,%xmm8
+	movdqa	%xmm11,%xmm6
+	pxor	64-128(%rax),%xmm3
+	paddd	%xmm15,%xmm12
+	pslld	$5,%xmm8
+	pxor	%xmm14,%xmm6
+
+	movdqa	%xmm13,%xmm9
+	movdqa	%xmm2,176-128(%rax)
+	paddd	%xmm2,%xmm12
+	pxor	%xmm0,%xmm3
+	psrld	$27,%xmm9
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm14,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm3,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm12
+	paddd	%xmm3,%xmm3
+
+	psrld	$2,%xmm14
+	paddd	%xmm8,%xmm12
+	por	%xmm5,%xmm3
+	por	%xmm7,%xmm14
+	pxor	%xmm1,%xmm4
+	movdqa	240-128(%rax),%xmm1
+
+	movdqa	%xmm12,%xmm8
+	movdqa	%xmm10,%xmm6
+	pxor	80-128(%rax),%xmm4
+	paddd	%xmm15,%xmm11
+	pslld	$5,%xmm8
+	pxor	%xmm13,%xmm6
+
+	movdqa	%xmm12,%xmm9
+	movdqa	%xmm3,192-128(%rax)
+	paddd	%xmm3,%xmm11
+	pxor	%xmm1,%xmm4
+	psrld	$27,%xmm9
+	pxor	%xmm14,%xmm6
+	movdqa	%xmm13,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm4,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm11
+	paddd	%xmm4,%xmm4
+
+	psrld	$2,%xmm13
+	paddd	%xmm8,%xmm11
+	por	%xmm5,%xmm4
+	por	%xmm7,%xmm13
+	pxor	%xmm2,%xmm0
+	movdqa	0-128(%rax),%xmm2
+
+	movdqa	%xmm11,%xmm8
+	movdqa	%xmm14,%xmm6
+	pxor	96-128(%rax),%xmm0
+	paddd	%xmm15,%xmm10
+	pslld	$5,%xmm8
+	pxor	%xmm12,%xmm6
+
+	movdqa	%xmm11,%xmm9
+	movdqa	%xmm4,208-128(%rax)
+	paddd	%xmm4,%xmm10
+	pxor	%xmm2,%xmm0
+	psrld	$27,%xmm9
+	pxor	%xmm13,%xmm6
+	movdqa	%xmm12,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm0,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm10
+	paddd	%xmm0,%xmm0
+
+	psrld	$2,%xmm12
+	paddd	%xmm8,%xmm10
+	por	%xmm5,%xmm0
+	por	%xmm7,%xmm12
+	pxor	%xmm3,%xmm1
+	movdqa	16-128(%rax),%xmm3
+
+	movdqa	%xmm10,%xmm8
+	movdqa	%xmm13,%xmm6
+	pxor	112-128(%rax),%xmm1
+	paddd	%xmm15,%xmm14
+	pslld	$5,%xmm8
+	pxor	%xmm11,%xmm6
+
+	movdqa	%xmm10,%xmm9
+	movdqa	%xmm0,224-128(%rax)
+	paddd	%xmm0,%xmm14
+	pxor	%xmm3,%xmm1
+	psrld	$27,%xmm9
+	pxor	%xmm12,%xmm6
+	movdqa	%xmm11,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm1,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm14
+	paddd	%xmm1,%xmm1
+
+	psrld	$2,%xmm11
+	paddd	%xmm8,%xmm14
+	por	%xmm5,%xmm1
+	por	%xmm7,%xmm11
+	pxor	%xmm4,%xmm2
+	movdqa	32-128(%rax),%xmm4
+
+	movdqa	%xmm14,%xmm8
+	movdqa	%xmm12,%xmm6
+	pxor	128-128(%rax),%xmm2
+	paddd	%xmm15,%xmm13
+	pslld	$5,%xmm8
+	pxor	%xmm10,%xmm6
+
+	movdqa	%xmm14,%xmm9
+	movdqa	%xmm1,240-128(%rax)
+	paddd	%xmm1,%xmm13
+	pxor	%xmm4,%xmm2
+	psrld	$27,%xmm9
+	pxor	%xmm11,%xmm6
+	movdqa	%xmm10,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm2,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm13
+	paddd	%xmm2,%xmm2
+
+	psrld	$2,%xmm10
+	paddd	%xmm8,%xmm13
+	por	%xmm5,%xmm2
+	por	%xmm7,%xmm10
+	pxor	%xmm0,%xmm3
+	movdqa	48-128(%rax),%xmm0
+
+	movdqa	%xmm13,%xmm8
+	movdqa	%xmm11,%xmm6
+	pxor	144-128(%rax),%xmm3
+	paddd	%xmm15,%xmm12
+	pslld	$5,%xmm8
+	pxor	%xmm14,%xmm6
+
+	movdqa	%xmm13,%xmm9
+	movdqa	%xmm2,0-128(%rax)
+	paddd	%xmm2,%xmm12
+	pxor	%xmm0,%xmm3
+	psrld	$27,%xmm9
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm14,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm3,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm12
+	paddd	%xmm3,%xmm3
+
+	psrld	$2,%xmm14
+	paddd	%xmm8,%xmm12
+	por	%xmm5,%xmm3
+	por	%xmm7,%xmm14
+	pxor	%xmm1,%xmm4
+	movdqa	64-128(%rax),%xmm1
+
+	movdqa	%xmm12,%xmm8
+	movdqa	%xmm10,%xmm6
+	pxor	160-128(%rax),%xmm4
+	paddd	%xmm15,%xmm11
+	pslld	$5,%xmm8
+	pxor	%xmm13,%xmm6
+
+	movdqa	%xmm12,%xmm9
+	movdqa	%xmm3,16-128(%rax)
+	paddd	%xmm3,%xmm11
+	pxor	%xmm1,%xmm4
+	psrld	$27,%xmm9
+	pxor	%xmm14,%xmm6
+	movdqa	%xmm13,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm4,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm11
+	paddd	%xmm4,%xmm4
+
+	psrld	$2,%xmm13
+	paddd	%xmm8,%xmm11
+	por	%xmm5,%xmm4
+	por	%xmm7,%xmm13
+	pxor	%xmm2,%xmm0
+	movdqa	80-128(%rax),%xmm2
+
+	movdqa	%xmm11,%xmm8
+	movdqa	%xmm14,%xmm6
+	pxor	176-128(%rax),%xmm0
+	paddd	%xmm15,%xmm10
+	pslld	$5,%xmm8
+	pxor	%xmm12,%xmm6
+
+	movdqa	%xmm11,%xmm9
+	movdqa	%xmm4,32-128(%rax)
+	paddd	%xmm4,%xmm10
+	pxor	%xmm2,%xmm0
+	psrld	$27,%xmm9
+	pxor	%xmm13,%xmm6
+	movdqa	%xmm12,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm0,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm10
+	paddd	%xmm0,%xmm0
+
+	psrld	$2,%xmm12
+	paddd	%xmm8,%xmm10
+	por	%xmm5,%xmm0
+	por	%xmm7,%xmm12
+	pxor	%xmm3,%xmm1
+	movdqa	96-128(%rax),%xmm3
+
+	movdqa	%xmm10,%xmm8
+	movdqa	%xmm13,%xmm6
+	pxor	192-128(%rax),%xmm1
+	paddd	%xmm15,%xmm14
+	pslld	$5,%xmm8
+	pxor	%xmm11,%xmm6
+
+	movdqa	%xmm10,%xmm9
+	movdqa	%xmm0,48-128(%rax)
+	paddd	%xmm0,%xmm14
+	pxor	%xmm3,%xmm1
+	psrld	$27,%xmm9
+	pxor	%xmm12,%xmm6
+	movdqa	%xmm11,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm1,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm14
+	paddd	%xmm1,%xmm1
+
+	psrld	$2,%xmm11
+	paddd	%xmm8,%xmm14
+	por	%xmm5,%xmm1
+	por	%xmm7,%xmm11
+	pxor	%xmm4,%xmm2
+	movdqa	112-128(%rax),%xmm4
+
+	movdqa	%xmm14,%xmm8
+	movdqa	%xmm12,%xmm6
+	pxor	208-128(%rax),%xmm2
+	paddd	%xmm15,%xmm13
+	pslld	$5,%xmm8
+	pxor	%xmm10,%xmm6
+
+	movdqa	%xmm14,%xmm9
+	movdqa	%xmm1,64-128(%rax)
+	paddd	%xmm1,%xmm13
+	pxor	%xmm4,%xmm2
+	psrld	$27,%xmm9
+	pxor	%xmm11,%xmm6
+	movdqa	%xmm10,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm2,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm13
+	paddd	%xmm2,%xmm2
+
+	psrld	$2,%xmm10
+	paddd	%xmm8,%xmm13
+	por	%xmm5,%xmm2
+	por	%xmm7,%xmm10
+	pxor	%xmm0,%xmm3
+	movdqa	128-128(%rax),%xmm0
+
+	movdqa	%xmm13,%xmm8
+	movdqa	%xmm11,%xmm6
+	pxor	224-128(%rax),%xmm3
+	paddd	%xmm15,%xmm12
+	pslld	$5,%xmm8
+	pxor	%xmm14,%xmm6
+
+	movdqa	%xmm13,%xmm9
+	movdqa	%xmm2,80-128(%rax)
+	paddd	%xmm2,%xmm12
+	pxor	%xmm0,%xmm3
+	psrld	$27,%xmm9
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm14,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm3,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm12
+	paddd	%xmm3,%xmm3
+
+	psrld	$2,%xmm14
+	paddd	%xmm8,%xmm12
+	por	%xmm5,%xmm3
+	por	%xmm7,%xmm14
+	pxor	%xmm1,%xmm4
+	movdqa	144-128(%rax),%xmm1
+
+	movdqa	%xmm12,%xmm8
+	movdqa	%xmm10,%xmm6
+	pxor	240-128(%rax),%xmm4
+	paddd	%xmm15,%xmm11
+	pslld	$5,%xmm8
+	pxor	%xmm13,%xmm6
+
+	movdqa	%xmm12,%xmm9
+	movdqa	%xmm3,96-128(%rax)
+	paddd	%xmm3,%xmm11
+	pxor	%xmm1,%xmm4
+	psrld	$27,%xmm9
+	pxor	%xmm14,%xmm6
+	movdqa	%xmm13,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm4,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm11
+	paddd	%xmm4,%xmm4
+
+	psrld	$2,%xmm13
+	paddd	%xmm8,%xmm11
+	por	%xmm5,%xmm4
+	por	%xmm7,%xmm13
+	pxor	%xmm2,%xmm0
+	movdqa	160-128(%rax),%xmm2
+
+	movdqa	%xmm11,%xmm8
+	movdqa	%xmm14,%xmm6
+	pxor	0-128(%rax),%xmm0
+	paddd	%xmm15,%xmm10
+	pslld	$5,%xmm8
+	pxor	%xmm12,%xmm6
+
+	movdqa	%xmm11,%xmm9
+	movdqa	%xmm4,112-128(%rax)
+	paddd	%xmm4,%xmm10
+	pxor	%xmm2,%xmm0
+	psrld	$27,%xmm9
+	pxor	%xmm13,%xmm6
+	movdqa	%xmm12,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm0,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm10
+	paddd	%xmm0,%xmm0
+
+	psrld	$2,%xmm12
+	paddd	%xmm8,%xmm10
+	por	%xmm5,%xmm0
+	por	%xmm7,%xmm12
+	movdqa	32(%rbp),%xmm15
+	pxor	%xmm3,%xmm1
+	movdqa	176-128(%rax),%xmm3
+
+	movdqa	%xmm10,%xmm8
+	movdqa	%xmm13,%xmm7
+	pxor	16-128(%rax),%xmm1
+	pxor	%xmm3,%xmm1
+	paddd	%xmm15,%xmm14
+	pslld	$5,%xmm8
+	movdqa	%xmm10,%xmm9
+	pand	%xmm12,%xmm7
+
+	movdqa	%xmm13,%xmm6
+	movdqa	%xmm1,%xmm5
+	psrld	$27,%xmm9
+	paddd	%xmm7,%xmm14
+	pxor	%xmm12,%xmm6
+
+	movdqa	%xmm0,128-128(%rax)
+	paddd	%xmm0,%xmm14
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	pand	%xmm11,%xmm6
+	movdqa	%xmm11,%xmm7
+
+	pslld	$30,%xmm7
+	paddd	%xmm1,%xmm1
+	paddd	%xmm6,%xmm14
+
+	psrld	$2,%xmm11
+	paddd	%xmm8,%xmm14
+	por	%xmm5,%xmm1
+	por	%xmm7,%xmm11
+	pxor	%xmm4,%xmm2
+	movdqa	192-128(%rax),%xmm4
+
+	movdqa	%xmm14,%xmm8
+	movdqa	%xmm12,%xmm7
+	pxor	32-128(%rax),%xmm2
+	pxor	%xmm4,%xmm2
+	paddd	%xmm15,%xmm13
+	pslld	$5,%xmm8
+	movdqa	%xmm14,%xmm9
+	pand	%xmm11,%xmm7
+
+	movdqa	%xmm12,%xmm6
+	movdqa	%xmm2,%xmm5
+	psrld	$27,%xmm9
+	paddd	%xmm7,%xmm13
+	pxor	%xmm11,%xmm6
+
+	movdqa	%xmm1,144-128(%rax)
+	paddd	%xmm1,%xmm13
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	pand	%xmm10,%xmm6
+	movdqa	%xmm10,%xmm7
+
+	pslld	$30,%xmm7
+	paddd	%xmm2,%xmm2
+	paddd	%xmm6,%xmm13
+
+	psrld	$2,%xmm10
+	paddd	%xmm8,%xmm13
+	por	%xmm5,%xmm2
+	por	%xmm7,%xmm10
+	pxor	%xmm0,%xmm3
+	movdqa	208-128(%rax),%xmm0
+
+	movdqa	%xmm13,%xmm8
+	movdqa	%xmm11,%xmm7
+	pxor	48-128(%rax),%xmm3
+	pxor	%xmm0,%xmm3
+	paddd	%xmm15,%xmm12
+	pslld	$5,%xmm8
+	movdqa	%xmm13,%xmm9
+	pand	%xmm10,%xmm7
+
+	movdqa	%xmm11,%xmm6
+	movdqa	%xmm3,%xmm5
+	psrld	$27,%xmm9
+	paddd	%xmm7,%xmm12
+	pxor	%xmm10,%xmm6
+
+	movdqa	%xmm2,160-128(%rax)
+	paddd	%xmm2,%xmm12
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	pand	%xmm14,%xmm6
+	movdqa	%xmm14,%xmm7
+
+	pslld	$30,%xmm7
+	paddd	%xmm3,%xmm3
+	paddd	%xmm6,%xmm12
+
+	psrld	$2,%xmm14
+	paddd	%xmm8,%xmm12
+	por	%xmm5,%xmm3
+	por	%xmm7,%xmm14
+	pxor	%xmm1,%xmm4
+	movdqa	224-128(%rax),%xmm1
+
+	movdqa	%xmm12,%xmm8
+	movdqa	%xmm10,%xmm7
+	pxor	64-128(%rax),%xmm4
+	pxor	%xmm1,%xmm4
+	paddd	%xmm15,%xmm11
+	pslld	$5,%xmm8
+	movdqa	%xmm12,%xmm9
+	pand	%xmm14,%xmm7
+
+	movdqa	%xmm10,%xmm6
+	movdqa	%xmm4,%xmm5
+	psrld	$27,%xmm9
+	paddd	%xmm7,%xmm11
+	pxor	%xmm14,%xmm6
+
+	movdqa	%xmm3,176-128(%rax)
+	paddd	%xmm3,%xmm11
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	pand	%xmm13,%xmm6
+	movdqa	%xmm13,%xmm7
+
+	pslld	$30,%xmm7
+	paddd	%xmm4,%xmm4
+	paddd	%xmm6,%xmm11
+
+	psrld	$2,%xmm13
+	paddd	%xmm8,%xmm11
+	por	%xmm5,%xmm4
+	por	%xmm7,%xmm13
+	pxor	%xmm2,%xmm0
+	movdqa	240-128(%rax),%xmm2
+
+	movdqa	%xmm11,%xmm8
+	movdqa	%xmm14,%xmm7
+	pxor	80-128(%rax),%xmm0
+	pxor	%xmm2,%xmm0
+	paddd	%xmm15,%xmm10
+	pslld	$5,%xmm8
+	movdqa	%xmm11,%xmm9
+	pand	%xmm13,%xmm7
+
+	movdqa	%xmm14,%xmm6
+	movdqa	%xmm0,%xmm5
+	psrld	$27,%xmm9
+	paddd	%xmm7,%xmm10
+	pxor	%xmm13,%xmm6
+
+	movdqa	%xmm4,192-128(%rax)
+	paddd	%xmm4,%xmm10
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	pand	%xmm12,%xmm6
+	movdqa	%xmm12,%xmm7
+
+	pslld	$30,%xmm7
+	paddd	%xmm0,%xmm0
+	paddd	%xmm6,%xmm10
+
+	psrld	$2,%xmm12
+	paddd	%xmm8,%xmm10
+	por	%xmm5,%xmm0
+	por	%xmm7,%xmm12
+	pxor	%xmm3,%xmm1
+	movdqa	0-128(%rax),%xmm3
+
+	movdqa	%xmm10,%xmm8
+	movdqa	%xmm13,%xmm7
+	pxor	96-128(%rax),%xmm1
+	pxor	%xmm3,%xmm1
+	paddd	%xmm15,%xmm14
+	pslld	$5,%xmm8
+	movdqa	%xmm10,%xmm9
+	pand	%xmm12,%xmm7
+
+	movdqa	%xmm13,%xmm6
+	movdqa	%xmm1,%xmm5
+	psrld	$27,%xmm9
+	paddd	%xmm7,%xmm14
+	pxor	%xmm12,%xmm6
+
+	movdqa	%xmm0,208-128(%rax)
+	paddd	%xmm0,%xmm14
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	pand	%xmm11,%xmm6
+	movdqa	%xmm11,%xmm7
+
+	pslld	$30,%xmm7
+	paddd	%xmm1,%xmm1
+	paddd	%xmm6,%xmm14
+
+	psrld	$2,%xmm11
+	paddd	%xmm8,%xmm14
+	por	%xmm5,%xmm1
+	por	%xmm7,%xmm11
+	pxor	%xmm4,%xmm2
+	movdqa	16-128(%rax),%xmm4
+
+	movdqa	%xmm14,%xmm8
+	movdqa	%xmm12,%xmm7
+	pxor	112-128(%rax),%xmm2
+	pxor	%xmm4,%xmm2
+	paddd	%xmm15,%xmm13
+	pslld	$5,%xmm8
+	movdqa	%xmm14,%xmm9
+	pand	%xmm11,%xmm7
+
+	movdqa	%xmm12,%xmm6
+	movdqa	%xmm2,%xmm5
+	psrld	$27,%xmm9
+	paddd	%xmm7,%xmm13
+	pxor	%xmm11,%xmm6
+
+	movdqa	%xmm1,224-128(%rax)
+	paddd	%xmm1,%xmm13
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	pand	%xmm10,%xmm6
+	movdqa	%xmm10,%xmm7
+
+	pslld	$30,%xmm7
+	paddd	%xmm2,%xmm2
+	paddd	%xmm6,%xmm13
+
+	psrld	$2,%xmm10
+	paddd	%xmm8,%xmm13
+	por	%xmm5,%xmm2
+	por	%xmm7,%xmm10
+	pxor	%xmm0,%xmm3
+	movdqa	32-128(%rax),%xmm0
+
+	movdqa	%xmm13,%xmm8
+	movdqa	%xmm11,%xmm7
+	pxor	128-128(%rax),%xmm3
+	pxor	%xmm0,%xmm3
+	paddd	%xmm15,%xmm12
+	pslld	$5,%xmm8
+	movdqa	%xmm13,%xmm9
+	pand	%xmm10,%xmm7
+
+	movdqa	%xmm11,%xmm6
+	movdqa	%xmm3,%xmm5
+	psrld	$27,%xmm9
+	paddd	%xmm7,%xmm12
+	pxor	%xmm10,%xmm6
+
+	movdqa	%xmm2,240-128(%rax)
+	paddd	%xmm2,%xmm12
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	pand	%xmm14,%xmm6
+	movdqa	%xmm14,%xmm7
+
+	pslld	$30,%xmm7
+	paddd	%xmm3,%xmm3
+	paddd	%xmm6,%xmm12
+
+	psrld	$2,%xmm14
+	paddd	%xmm8,%xmm12
+	por	%xmm5,%xmm3
+	por	%xmm7,%xmm14
+	pxor	%xmm1,%xmm4
+	movdqa	48-128(%rax),%xmm1
+
+	movdqa	%xmm12,%xmm8
+	movdqa	%xmm10,%xmm7
+	pxor	144-128(%rax),%xmm4
+	pxor	%xmm1,%xmm4
+	paddd	%xmm15,%xmm11
+	pslld	$5,%xmm8
+	movdqa	%xmm12,%xmm9
+	pand	%xmm14,%xmm7
+
+	movdqa	%xmm10,%xmm6
+	movdqa	%xmm4,%xmm5
+	psrld	$27,%xmm9
+	paddd	%xmm7,%xmm11
+	pxor	%xmm14,%xmm6
+
+	movdqa	%xmm3,0-128(%rax)
+	paddd	%xmm3,%xmm11
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	pand	%xmm13,%xmm6
+	movdqa	%xmm13,%xmm7
+
+	pslld	$30,%xmm7
+	paddd	%xmm4,%xmm4
+	paddd	%xmm6,%xmm11
+
+	psrld	$2,%xmm13
+	paddd	%xmm8,%xmm11
+	por	%xmm5,%xmm4
+	por	%xmm7,%xmm13
+	pxor	%xmm2,%xmm0
+	movdqa	64-128(%rax),%xmm2
+
+	movdqa	%xmm11,%xmm8
+	movdqa	%xmm14,%xmm7
+	pxor	160-128(%rax),%xmm0
+	pxor	%xmm2,%xmm0
+	paddd	%xmm15,%xmm10
+	pslld	$5,%xmm8
+	movdqa	%xmm11,%xmm9
+	pand	%xmm13,%xmm7
+
+	movdqa	%xmm14,%xmm6
+	movdqa	%xmm0,%xmm5
+	psrld	$27,%xmm9
+	paddd	%xmm7,%xmm10
+	pxor	%xmm13,%xmm6
+
+	movdqa	%xmm4,16-128(%rax)
+	paddd	%xmm4,%xmm10
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	pand	%xmm12,%xmm6
+	movdqa	%xmm12,%xmm7
+
+	pslld	$30,%xmm7
+	paddd	%xmm0,%xmm0
+	paddd	%xmm6,%xmm10
+
+	psrld	$2,%xmm12
+	paddd	%xmm8,%xmm10
+	por	%xmm5,%xmm0
+	por	%xmm7,%xmm12
+	pxor	%xmm3,%xmm1
+	movdqa	80-128(%rax),%xmm3
+
+	movdqa	%xmm10,%xmm8
+	movdqa	%xmm13,%xmm7
+	pxor	176-128(%rax),%xmm1
+	pxor	%xmm3,%xmm1
+	paddd	%xmm15,%xmm14
+	pslld	$5,%xmm8
+	movdqa	%xmm10,%xmm9
+	pand	%xmm12,%xmm7
+
+	movdqa	%xmm13,%xmm6
+	movdqa	%xmm1,%xmm5
+	psrld	$27,%xmm9
+	paddd	%xmm7,%xmm14
+	pxor	%xmm12,%xmm6
+
+	movdqa	%xmm0,32-128(%rax)
+	paddd	%xmm0,%xmm14
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	pand	%xmm11,%xmm6
+	movdqa	%xmm11,%xmm7
+
+	pslld	$30,%xmm7
+	paddd	%xmm1,%xmm1
+	paddd	%xmm6,%xmm14
+
+	psrld	$2,%xmm11
+	paddd	%xmm8,%xmm14
+	por	%xmm5,%xmm1
+	por	%xmm7,%xmm11
+	pxor	%xmm4,%xmm2
+	movdqa	96-128(%rax),%xmm4
+
+	movdqa	%xmm14,%xmm8
+	movdqa	%xmm12,%xmm7
+	pxor	192-128(%rax),%xmm2
+	pxor	%xmm4,%xmm2
+	paddd	%xmm15,%xmm13
+	pslld	$5,%xmm8
+	movdqa	%xmm14,%xmm9
+	pand	%xmm11,%xmm7
+
+	movdqa	%xmm12,%xmm6
+	movdqa	%xmm2,%xmm5
+	psrld	$27,%xmm9
+	paddd	%xmm7,%xmm13
+	pxor	%xmm11,%xmm6
+
+	movdqa	%xmm1,48-128(%rax)
+	paddd	%xmm1,%xmm13
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	pand	%xmm10,%xmm6
+	movdqa	%xmm10,%xmm7
+
+	pslld	$30,%xmm7
+	paddd	%xmm2,%xmm2
+	paddd	%xmm6,%xmm13
+
+	psrld	$2,%xmm10
+	paddd	%xmm8,%xmm13
+	por	%xmm5,%xmm2
+	por	%xmm7,%xmm10
+	pxor	%xmm0,%xmm3
+	movdqa	112-128(%rax),%xmm0
+
+	movdqa	%xmm13,%xmm8
+	movdqa	%xmm11,%xmm7
+	pxor	208-128(%rax),%xmm3
+	pxor	%xmm0,%xmm3
+	paddd	%xmm15,%xmm12
+	pslld	$5,%xmm8
+	movdqa	%xmm13,%xmm9
+	pand	%xmm10,%xmm7
+
+	movdqa	%xmm11,%xmm6
+	movdqa	%xmm3,%xmm5
+	psrld	$27,%xmm9
+	paddd	%xmm7,%xmm12
+	pxor	%xmm10,%xmm6
+
+	movdqa	%xmm2,64-128(%rax)
+	paddd	%xmm2,%xmm12
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	pand	%xmm14,%xmm6
+	movdqa	%xmm14,%xmm7
+
+	pslld	$30,%xmm7
+	paddd	%xmm3,%xmm3
+	paddd	%xmm6,%xmm12
+
+	psrld	$2,%xmm14
+	paddd	%xmm8,%xmm12
+	por	%xmm5,%xmm3
+	por	%xmm7,%xmm14
+	pxor	%xmm1,%xmm4
+	movdqa	128-128(%rax),%xmm1
+
+	movdqa	%xmm12,%xmm8
+	movdqa	%xmm10,%xmm7
+	pxor	224-128(%rax),%xmm4
+	pxor	%xmm1,%xmm4
+	paddd	%xmm15,%xmm11
+	pslld	$5,%xmm8
+	movdqa	%xmm12,%xmm9
+	pand	%xmm14,%xmm7
+
+	movdqa	%xmm10,%xmm6
+	movdqa	%xmm4,%xmm5
+	psrld	$27,%xmm9
+	paddd	%xmm7,%xmm11
+	pxor	%xmm14,%xmm6
+
+	movdqa	%xmm3,80-128(%rax)
+	paddd	%xmm3,%xmm11
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	pand	%xmm13,%xmm6
+	movdqa	%xmm13,%xmm7
+
+	pslld	$30,%xmm7
+	paddd	%xmm4,%xmm4
+	paddd	%xmm6,%xmm11
+
+	psrld	$2,%xmm13
+	paddd	%xmm8,%xmm11
+	por	%xmm5,%xmm4
+	por	%xmm7,%xmm13
+	pxor	%xmm2,%xmm0
+	movdqa	144-128(%rax),%xmm2
+
+	movdqa	%xmm11,%xmm8
+	movdqa	%xmm14,%xmm7
+	pxor	240-128(%rax),%xmm0
+	pxor	%xmm2,%xmm0
+	paddd	%xmm15,%xmm10
+	pslld	$5,%xmm8
+	movdqa	%xmm11,%xmm9
+	pand	%xmm13,%xmm7
+
+	movdqa	%xmm14,%xmm6
+	movdqa	%xmm0,%xmm5
+	psrld	$27,%xmm9
+	paddd	%xmm7,%xmm10
+	pxor	%xmm13,%xmm6
+
+	movdqa	%xmm4,96-128(%rax)
+	paddd	%xmm4,%xmm10
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	pand	%xmm12,%xmm6
+	movdqa	%xmm12,%xmm7
+
+	pslld	$30,%xmm7
+	paddd	%xmm0,%xmm0
+	paddd	%xmm6,%xmm10
+
+	psrld	$2,%xmm12
+	paddd	%xmm8,%xmm10
+	por	%xmm5,%xmm0
+	por	%xmm7,%xmm12
+	pxor	%xmm3,%xmm1
+	movdqa	160-128(%rax),%xmm3
+
+	movdqa	%xmm10,%xmm8
+	movdqa	%xmm13,%xmm7
+	pxor	0-128(%rax),%xmm1
+	pxor	%xmm3,%xmm1
+	paddd	%xmm15,%xmm14
+	pslld	$5,%xmm8
+	movdqa	%xmm10,%xmm9
+	pand	%xmm12,%xmm7
+
+	movdqa	%xmm13,%xmm6
+	movdqa	%xmm1,%xmm5
+	psrld	$27,%xmm9
+	paddd	%xmm7,%xmm14
+	pxor	%xmm12,%xmm6
+
+	movdqa	%xmm0,112-128(%rax)
+	paddd	%xmm0,%xmm14
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	pand	%xmm11,%xmm6
+	movdqa	%xmm11,%xmm7
+
+	pslld	$30,%xmm7
+	paddd	%xmm1,%xmm1
+	paddd	%xmm6,%xmm14
+
+	psrld	$2,%xmm11
+	paddd	%xmm8,%xmm14
+	por	%xmm5,%xmm1
+	por	%xmm7,%xmm11
+	pxor	%xmm4,%xmm2
+	movdqa	176-128(%rax),%xmm4
+
+	movdqa	%xmm14,%xmm8
+	movdqa	%xmm12,%xmm7
+	pxor	16-128(%rax),%xmm2
+	pxor	%xmm4,%xmm2
+	paddd	%xmm15,%xmm13
+	pslld	$5,%xmm8
+	movdqa	%xmm14,%xmm9
+	pand	%xmm11,%xmm7
+
+	movdqa	%xmm12,%xmm6
+	movdqa	%xmm2,%xmm5
+	psrld	$27,%xmm9
+	paddd	%xmm7,%xmm13
+	pxor	%xmm11,%xmm6
+
+	movdqa	%xmm1,128-128(%rax)
+	paddd	%xmm1,%xmm13
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	pand	%xmm10,%xmm6
+	movdqa	%xmm10,%xmm7
+
+	pslld	$30,%xmm7
+	paddd	%xmm2,%xmm2
+	paddd	%xmm6,%xmm13
+
+	psrld	$2,%xmm10
+	paddd	%xmm8,%xmm13
+	por	%xmm5,%xmm2
+	por	%xmm7,%xmm10
+	pxor	%xmm0,%xmm3
+	movdqa	192-128(%rax),%xmm0
+
+	movdqa	%xmm13,%xmm8
+	movdqa	%xmm11,%xmm7
+	pxor	32-128(%rax),%xmm3
+	pxor	%xmm0,%xmm3
+	paddd	%xmm15,%xmm12
+	pslld	$5,%xmm8
+	movdqa	%xmm13,%xmm9
+	pand	%xmm10,%xmm7
+
+	movdqa	%xmm11,%xmm6
+	movdqa	%xmm3,%xmm5
+	psrld	$27,%xmm9
+	paddd	%xmm7,%xmm12
+	pxor	%xmm10,%xmm6
+
+	movdqa	%xmm2,144-128(%rax)
+	paddd	%xmm2,%xmm12
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	pand	%xmm14,%xmm6
+	movdqa	%xmm14,%xmm7
+
+	pslld	$30,%xmm7
+	paddd	%xmm3,%xmm3
+	paddd	%xmm6,%xmm12
+
+	psrld	$2,%xmm14
+	paddd	%xmm8,%xmm12
+	por	%xmm5,%xmm3
+	por	%xmm7,%xmm14
+	pxor	%xmm1,%xmm4
+	movdqa	208-128(%rax),%xmm1
+
+	movdqa	%xmm12,%xmm8
+	movdqa	%xmm10,%xmm7
+	pxor	48-128(%rax),%xmm4
+	pxor	%xmm1,%xmm4
+	paddd	%xmm15,%xmm11
+	pslld	$5,%xmm8
+	movdqa	%xmm12,%xmm9
+	pand	%xmm14,%xmm7
+
+	movdqa	%xmm10,%xmm6
+	movdqa	%xmm4,%xmm5
+	psrld	$27,%xmm9
+	paddd	%xmm7,%xmm11
+	pxor	%xmm14,%xmm6
+
+	movdqa	%xmm3,160-128(%rax)
+	paddd	%xmm3,%xmm11
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	pand	%xmm13,%xmm6
+	movdqa	%xmm13,%xmm7
+
+	pslld	$30,%xmm7
+	paddd	%xmm4,%xmm4
+	paddd	%xmm6,%xmm11
+
+	psrld	$2,%xmm13
+	paddd	%xmm8,%xmm11
+	por	%xmm5,%xmm4
+	por	%xmm7,%xmm13
+	pxor	%xmm2,%xmm0
+	movdqa	224-128(%rax),%xmm2
+
+	movdqa	%xmm11,%xmm8
+	movdqa	%xmm14,%xmm7
+	pxor	64-128(%rax),%xmm0
+	pxor	%xmm2,%xmm0
+	paddd	%xmm15,%xmm10
+	pslld	$5,%xmm8
+	movdqa	%xmm11,%xmm9
+	pand	%xmm13,%xmm7
+
+	movdqa	%xmm14,%xmm6
+	movdqa	%xmm0,%xmm5
+	psrld	$27,%xmm9
+	paddd	%xmm7,%xmm10
+	pxor	%xmm13,%xmm6
+
+	movdqa	%xmm4,176-128(%rax)
+	paddd	%xmm4,%xmm10
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	pand	%xmm12,%xmm6
+	movdqa	%xmm12,%xmm7
+
+	pslld	$30,%xmm7
+	paddd	%xmm0,%xmm0
+	paddd	%xmm6,%xmm10
+
+	psrld	$2,%xmm12
+	paddd	%xmm8,%xmm10
+	por	%xmm5,%xmm0
+	por	%xmm7,%xmm12
+	movdqa	64(%rbp),%xmm15
+	pxor	%xmm3,%xmm1
+	movdqa	240-128(%rax),%xmm3
+
+	movdqa	%xmm10,%xmm8
+	movdqa	%xmm13,%xmm6
+	pxor	80-128(%rax),%xmm1
+	paddd	%xmm15,%xmm14
+	pslld	$5,%xmm8
+	pxor	%xmm11,%xmm6
+
+	movdqa	%xmm10,%xmm9
+	movdqa	%xmm0,192-128(%rax)
+	paddd	%xmm0,%xmm14
+	pxor	%xmm3,%xmm1
+	psrld	$27,%xmm9
+	pxor	%xmm12,%xmm6
+	movdqa	%xmm11,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm1,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm14
+	paddd	%xmm1,%xmm1
+
+	psrld	$2,%xmm11
+	paddd	%xmm8,%xmm14
+	por	%xmm5,%xmm1
+	por	%xmm7,%xmm11
+	pxor	%xmm4,%xmm2
+	movdqa	0-128(%rax),%xmm4
+
+	movdqa	%xmm14,%xmm8
+	movdqa	%xmm12,%xmm6
+	pxor	96-128(%rax),%xmm2
+	paddd	%xmm15,%xmm13
+	pslld	$5,%xmm8
+	pxor	%xmm10,%xmm6
+
+	movdqa	%xmm14,%xmm9
+	movdqa	%xmm1,208-128(%rax)
+	paddd	%xmm1,%xmm13
+	pxor	%xmm4,%xmm2
+	psrld	$27,%xmm9
+	pxor	%xmm11,%xmm6
+	movdqa	%xmm10,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm2,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm13
+	paddd	%xmm2,%xmm2
+
+	psrld	$2,%xmm10
+	paddd	%xmm8,%xmm13
+	por	%xmm5,%xmm2
+	por	%xmm7,%xmm10
+	pxor	%xmm0,%xmm3
+	movdqa	16-128(%rax),%xmm0
+
+	movdqa	%xmm13,%xmm8
+	movdqa	%xmm11,%xmm6
+	pxor	112-128(%rax),%xmm3
+	paddd	%xmm15,%xmm12
+	pslld	$5,%xmm8
+	pxor	%xmm14,%xmm6
+
+	movdqa	%xmm13,%xmm9
+	movdqa	%xmm2,224-128(%rax)
+	paddd	%xmm2,%xmm12
+	pxor	%xmm0,%xmm3
+	psrld	$27,%xmm9
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm14,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm3,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm12
+	paddd	%xmm3,%xmm3
+
+	psrld	$2,%xmm14
+	paddd	%xmm8,%xmm12
+	por	%xmm5,%xmm3
+	por	%xmm7,%xmm14
+	pxor	%xmm1,%xmm4
+	movdqa	32-128(%rax),%xmm1
+
+	movdqa	%xmm12,%xmm8
+	movdqa	%xmm10,%xmm6
+	pxor	128-128(%rax),%xmm4
+	paddd	%xmm15,%xmm11
+	pslld	$5,%xmm8
+	pxor	%xmm13,%xmm6
+
+	movdqa	%xmm12,%xmm9
+	movdqa	%xmm3,240-128(%rax)
+	paddd	%xmm3,%xmm11
+	pxor	%xmm1,%xmm4
+	psrld	$27,%xmm9
+	pxor	%xmm14,%xmm6
+	movdqa	%xmm13,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm4,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm11
+	paddd	%xmm4,%xmm4
+
+	psrld	$2,%xmm13
+	paddd	%xmm8,%xmm11
+	por	%xmm5,%xmm4
+	por	%xmm7,%xmm13
+	pxor	%xmm2,%xmm0
+	movdqa	48-128(%rax),%xmm2
+
+	movdqa	%xmm11,%xmm8
+	movdqa	%xmm14,%xmm6
+	pxor	144-128(%rax),%xmm0
+	paddd	%xmm15,%xmm10
+	pslld	$5,%xmm8
+	pxor	%xmm12,%xmm6
+
+	movdqa	%xmm11,%xmm9
+	movdqa	%xmm4,0-128(%rax)
+	paddd	%xmm4,%xmm10
+	pxor	%xmm2,%xmm0
+	psrld	$27,%xmm9
+	pxor	%xmm13,%xmm6
+	movdqa	%xmm12,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm0,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm10
+	paddd	%xmm0,%xmm0
+
+	psrld	$2,%xmm12
+	paddd	%xmm8,%xmm10
+	por	%xmm5,%xmm0
+	por	%xmm7,%xmm12
+	pxor	%xmm3,%xmm1
+	movdqa	64-128(%rax),%xmm3
+
+	movdqa	%xmm10,%xmm8
+	movdqa	%xmm13,%xmm6
+	pxor	160-128(%rax),%xmm1
+	paddd	%xmm15,%xmm14
+	pslld	$5,%xmm8
+	pxor	%xmm11,%xmm6
+
+	movdqa	%xmm10,%xmm9
+	movdqa	%xmm0,16-128(%rax)
+	paddd	%xmm0,%xmm14
+	pxor	%xmm3,%xmm1
+	psrld	$27,%xmm9
+	pxor	%xmm12,%xmm6
+	movdqa	%xmm11,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm1,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm14
+	paddd	%xmm1,%xmm1
+
+	psrld	$2,%xmm11
+	paddd	%xmm8,%xmm14
+	por	%xmm5,%xmm1
+	por	%xmm7,%xmm11
+	pxor	%xmm4,%xmm2
+	movdqa	80-128(%rax),%xmm4
+
+	movdqa	%xmm14,%xmm8
+	movdqa	%xmm12,%xmm6
+	pxor	176-128(%rax),%xmm2
+	paddd	%xmm15,%xmm13
+	pslld	$5,%xmm8
+	pxor	%xmm10,%xmm6
+
+	movdqa	%xmm14,%xmm9
+	movdqa	%xmm1,32-128(%rax)
+	paddd	%xmm1,%xmm13
+	pxor	%xmm4,%xmm2
+	psrld	$27,%xmm9
+	pxor	%xmm11,%xmm6
+	movdqa	%xmm10,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm2,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm13
+	paddd	%xmm2,%xmm2
+
+	psrld	$2,%xmm10
+	paddd	%xmm8,%xmm13
+	por	%xmm5,%xmm2
+	por	%xmm7,%xmm10
+	pxor	%xmm0,%xmm3
+	movdqa	96-128(%rax),%xmm0
+
+	movdqa	%xmm13,%xmm8
+	movdqa	%xmm11,%xmm6
+	pxor	192-128(%rax),%xmm3
+	paddd	%xmm15,%xmm12
+	pslld	$5,%xmm8
+	pxor	%xmm14,%xmm6
+
+	movdqa	%xmm13,%xmm9
+	movdqa	%xmm2,48-128(%rax)
+	paddd	%xmm2,%xmm12
+	pxor	%xmm0,%xmm3
+	psrld	$27,%xmm9
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm14,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm3,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm12
+	paddd	%xmm3,%xmm3
+
+	psrld	$2,%xmm14
+	paddd	%xmm8,%xmm12
+	por	%xmm5,%xmm3
+	por	%xmm7,%xmm14
+	pxor	%xmm1,%xmm4
+	movdqa	112-128(%rax),%xmm1
+
+	movdqa	%xmm12,%xmm8
+	movdqa	%xmm10,%xmm6
+	pxor	208-128(%rax),%xmm4
+	paddd	%xmm15,%xmm11
+	pslld	$5,%xmm8
+	pxor	%xmm13,%xmm6
+
+	movdqa	%xmm12,%xmm9
+	movdqa	%xmm3,64-128(%rax)
+	paddd	%xmm3,%xmm11
+	pxor	%xmm1,%xmm4
+	psrld	$27,%xmm9
+	pxor	%xmm14,%xmm6
+	movdqa	%xmm13,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm4,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm11
+	paddd	%xmm4,%xmm4
+
+	psrld	$2,%xmm13
+	paddd	%xmm8,%xmm11
+	por	%xmm5,%xmm4
+	por	%xmm7,%xmm13
+	pxor	%xmm2,%xmm0
+	movdqa	128-128(%rax),%xmm2
+
+	movdqa	%xmm11,%xmm8
+	movdqa	%xmm14,%xmm6
+	pxor	224-128(%rax),%xmm0
+	paddd	%xmm15,%xmm10
+	pslld	$5,%xmm8
+	pxor	%xmm12,%xmm6
+
+	movdqa	%xmm11,%xmm9
+	movdqa	%xmm4,80-128(%rax)
+	paddd	%xmm4,%xmm10
+	pxor	%xmm2,%xmm0
+	psrld	$27,%xmm9
+	pxor	%xmm13,%xmm6
+	movdqa	%xmm12,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm0,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm10
+	paddd	%xmm0,%xmm0
+
+	psrld	$2,%xmm12
+	paddd	%xmm8,%xmm10
+	por	%xmm5,%xmm0
+	por	%xmm7,%xmm12
+	pxor	%xmm3,%xmm1
+	movdqa	144-128(%rax),%xmm3
+
+	movdqa	%xmm10,%xmm8
+	movdqa	%xmm13,%xmm6
+	pxor	240-128(%rax),%xmm1
+	paddd	%xmm15,%xmm14
+	pslld	$5,%xmm8
+	pxor	%xmm11,%xmm6
+
+	movdqa	%xmm10,%xmm9
+	movdqa	%xmm0,96-128(%rax)
+	paddd	%xmm0,%xmm14
+	pxor	%xmm3,%xmm1
+	psrld	$27,%xmm9
+	pxor	%xmm12,%xmm6
+	movdqa	%xmm11,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm1,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm14
+	paddd	%xmm1,%xmm1
+
+	psrld	$2,%xmm11
+	paddd	%xmm8,%xmm14
+	por	%xmm5,%xmm1
+	por	%xmm7,%xmm11
+	pxor	%xmm4,%xmm2
+	movdqa	160-128(%rax),%xmm4
+
+	movdqa	%xmm14,%xmm8
+	movdqa	%xmm12,%xmm6
+	pxor	0-128(%rax),%xmm2
+	paddd	%xmm15,%xmm13
+	pslld	$5,%xmm8
+	pxor	%xmm10,%xmm6
+
+	movdqa	%xmm14,%xmm9
+	movdqa	%xmm1,112-128(%rax)
+	paddd	%xmm1,%xmm13
+	pxor	%xmm4,%xmm2
+	psrld	$27,%xmm9
+	pxor	%xmm11,%xmm6
+	movdqa	%xmm10,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm2,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm13
+	paddd	%xmm2,%xmm2
+
+	psrld	$2,%xmm10
+	paddd	%xmm8,%xmm13
+	por	%xmm5,%xmm2
+	por	%xmm7,%xmm10
+	pxor	%xmm0,%xmm3
+	movdqa	176-128(%rax),%xmm0
+
+	movdqa	%xmm13,%xmm8
+	movdqa	%xmm11,%xmm6
+	pxor	16-128(%rax),%xmm3
+	paddd	%xmm15,%xmm12
+	pslld	$5,%xmm8
+	pxor	%xmm14,%xmm6
+
+	movdqa	%xmm13,%xmm9
+	paddd	%xmm2,%xmm12
+	pxor	%xmm0,%xmm3
+	psrld	$27,%xmm9
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm14,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm3,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm12
+	paddd	%xmm3,%xmm3
+
+	psrld	$2,%xmm14
+	paddd	%xmm8,%xmm12
+	por	%xmm5,%xmm3
+	por	%xmm7,%xmm14
+	pxor	%xmm1,%xmm4
+	movdqa	192-128(%rax),%xmm1
+
+	movdqa	%xmm12,%xmm8
+	movdqa	%xmm10,%xmm6
+	pxor	32-128(%rax),%xmm4
+	paddd	%xmm15,%xmm11
+	pslld	$5,%xmm8
+	pxor	%xmm13,%xmm6
+
+	movdqa	%xmm12,%xmm9
+	paddd	%xmm3,%xmm11
+	pxor	%xmm1,%xmm4
+	psrld	$27,%xmm9
+	pxor	%xmm14,%xmm6
+	movdqa	%xmm13,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm4,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm11
+	paddd	%xmm4,%xmm4
+
+	psrld	$2,%xmm13
+	paddd	%xmm8,%xmm11
+	por	%xmm5,%xmm4
+	por	%xmm7,%xmm13
+	pxor	%xmm2,%xmm0
+	movdqa	208-128(%rax),%xmm2
+
+	movdqa	%xmm11,%xmm8
+	movdqa	%xmm14,%xmm6
+	pxor	48-128(%rax),%xmm0
+	paddd	%xmm15,%xmm10
+	pslld	$5,%xmm8
+	pxor	%xmm12,%xmm6
+
+	movdqa	%xmm11,%xmm9
+	paddd	%xmm4,%xmm10
+	pxor	%xmm2,%xmm0
+	psrld	$27,%xmm9
+	pxor	%xmm13,%xmm6
+	movdqa	%xmm12,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm0,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm10
+	paddd	%xmm0,%xmm0
+
+	psrld	$2,%xmm12
+	paddd	%xmm8,%xmm10
+	por	%xmm5,%xmm0
+	por	%xmm7,%xmm12
+	pxor	%xmm3,%xmm1
+	movdqa	224-128(%rax),%xmm3
+
+	movdqa	%xmm10,%xmm8
+	movdqa	%xmm13,%xmm6
+	pxor	64-128(%rax),%xmm1
+	paddd	%xmm15,%xmm14
+	pslld	$5,%xmm8
+	pxor	%xmm11,%xmm6
+
+	movdqa	%xmm10,%xmm9
+	paddd	%xmm0,%xmm14
+	pxor	%xmm3,%xmm1
+	psrld	$27,%xmm9
+	pxor	%xmm12,%xmm6
+	movdqa	%xmm11,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm1,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm14
+	paddd	%xmm1,%xmm1
+
+	psrld	$2,%xmm11
+	paddd	%xmm8,%xmm14
+	por	%xmm5,%xmm1
+	por	%xmm7,%xmm11
+	pxor	%xmm4,%xmm2
+	movdqa	240-128(%rax),%xmm4
+
+	movdqa	%xmm14,%xmm8
+	movdqa	%xmm12,%xmm6
+	pxor	80-128(%rax),%xmm2
+	paddd	%xmm15,%xmm13
+	pslld	$5,%xmm8
+	pxor	%xmm10,%xmm6
+
+	movdqa	%xmm14,%xmm9
+	paddd	%xmm1,%xmm13
+	pxor	%xmm4,%xmm2
+	psrld	$27,%xmm9
+	pxor	%xmm11,%xmm6
+	movdqa	%xmm10,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm2,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm13
+	paddd	%xmm2,%xmm2
+
+	psrld	$2,%xmm10
+	paddd	%xmm8,%xmm13
+	por	%xmm5,%xmm2
+	por	%xmm7,%xmm10
+	pxor	%xmm0,%xmm3
+	movdqa	0-128(%rax),%xmm0
+
+	movdqa	%xmm13,%xmm8
+	movdqa	%xmm11,%xmm6
+	pxor	96-128(%rax),%xmm3
+	paddd	%xmm15,%xmm12
+	pslld	$5,%xmm8
+	pxor	%xmm14,%xmm6
+
+	movdqa	%xmm13,%xmm9
+	paddd	%xmm2,%xmm12
+	pxor	%xmm0,%xmm3
+	psrld	$27,%xmm9
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm14,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm3,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm12
+	paddd	%xmm3,%xmm3
+
+	psrld	$2,%xmm14
+	paddd	%xmm8,%xmm12
+	por	%xmm5,%xmm3
+	por	%xmm7,%xmm14
+	pxor	%xmm1,%xmm4
+	movdqa	16-128(%rax),%xmm1
+
+	movdqa	%xmm12,%xmm8
+	movdqa	%xmm10,%xmm6
+	pxor	112-128(%rax),%xmm4
+	paddd	%xmm15,%xmm11
+	pslld	$5,%xmm8
+	pxor	%xmm13,%xmm6
+
+	movdqa	%xmm12,%xmm9
+	paddd	%xmm3,%xmm11
+	pxor	%xmm1,%xmm4
+	psrld	$27,%xmm9
+	pxor	%xmm14,%xmm6
+	movdqa	%xmm13,%xmm7
+
+	pslld	$30,%xmm7
+	movdqa	%xmm4,%xmm5
+	por	%xmm9,%xmm8
+	psrld	$31,%xmm5
+	paddd	%xmm6,%xmm11
+	paddd	%xmm4,%xmm4
+
+	psrld	$2,%xmm13
+	paddd	%xmm8,%xmm11
+	por	%xmm5,%xmm4
+	por	%xmm7,%xmm13
+	movdqa	%xmm11,%xmm8
+	paddd	%xmm15,%xmm10
+	movdqa	%xmm14,%xmm6
+	pslld	$5,%xmm8
+	pxor	%xmm12,%xmm6
+
+	movdqa	%xmm11,%xmm9
+	paddd	%xmm4,%xmm10
+	psrld	$27,%xmm9
+	movdqa	%xmm12,%xmm7
+	pxor	%xmm13,%xmm6
+
+	pslld	$30,%xmm7
+	por	%xmm9,%xmm8
+	paddd	%xmm6,%xmm10
+
+	psrld	$2,%xmm12
+	paddd	%xmm8,%xmm10
+	por	%xmm7,%xmm12
+	movdqa	(%rbx),%xmm0
+	movl	$1,%ecx
+	cmpl	0(%rbx),%ecx
+	pxor	%xmm8,%xmm8
+	cmovgeq	%rbp,%r8
+	cmpl	4(%rbx),%ecx
+	movdqa	%xmm0,%xmm1
+	cmovgeq	%rbp,%r9
+	cmpl	8(%rbx),%ecx
+	pcmpgtd	%xmm8,%xmm1
+	cmovgeq	%rbp,%r10
+	cmpl	12(%rbx),%ecx
+	paddd	%xmm1,%xmm0
+	cmovgeq	%rbp,%r11
+
+	movdqu	0(%rdi),%xmm6
+	pand	%xmm1,%xmm10
+	movdqu	32(%rdi),%xmm7
+	pand	%xmm1,%xmm11
+	paddd	%xmm6,%xmm10
+	movdqu	64(%rdi),%xmm8
+	pand	%xmm1,%xmm12
+	paddd	%xmm7,%xmm11
+	movdqu	96(%rdi),%xmm9
+	pand	%xmm1,%xmm13
+	paddd	%xmm8,%xmm12
+	movdqu	128(%rdi),%xmm5
+	pand	%xmm1,%xmm14
+	movdqu	%xmm10,0(%rdi)
+	paddd	%xmm9,%xmm13
+	movdqu	%xmm11,32(%rdi)
+	paddd	%xmm5,%xmm14
+	movdqu	%xmm12,64(%rdi)
+	movdqu	%xmm13,96(%rdi)
+	movdqu	%xmm14,128(%rdi)
+
+	movdqa	%xmm0,(%rbx)
+	movdqa	96(%rbp),%xmm5
+	movdqa	-32(%rbp),%xmm15
+	decl	%edx
+	jnz	.Loop
+
+	movl	280(%rsp),%edx
+	leaq	16(%rdi),%rdi
+	leaq	64(%rsi),%rsi
+	decl	%edx
+	jnz	.Loop_grande
+
+.Ldone:
+	movq	272(%rsp),%rax
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+.Lepilogue:
+	.byte	0xf3,0xc3
+.size	sha1_multi_block,.-sha1_multi_block
+.type	sha1_multi_block_shaext,@function
+.align	32
+sha1_multi_block_shaext:
+_shaext_shortcut:
+	movq	%rsp,%rax
+	pushq	%rbx
+	pushq	%rbp
+	subq	$288,%rsp
+	shll	$1,%edx
+	andq	$-256,%rsp
+	leaq	64(%rdi),%rdi
+	movq	%rax,272(%rsp)
+.Lbody_shaext:
+	leaq	256(%rsp),%rbx
+	movdqa	K_XX_XX+128(%rip),%xmm3
+
+.Loop_grande_shaext:
+	movl	%edx,280(%rsp)
+	xorl	%edx,%edx
+	movq	0(%rsi),%r8
+	movl	8(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,0(%rbx)
+	cmovleq	%rsp,%r8
+	movq	16(%rsi),%r9
+	movl	24(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,4(%rbx)
+	cmovleq	%rsp,%r9
+	testl	%edx,%edx
+	jz	.Ldone_shaext
+
+	movq	0-64(%rdi),%xmm0
+	movq	32-64(%rdi),%xmm4
+	movq	64-64(%rdi),%xmm5
+	movq	96-64(%rdi),%xmm6
+	movq	128-64(%rdi),%xmm7
+
+	punpckldq	%xmm4,%xmm0
+	punpckldq	%xmm6,%xmm5
+
+	movdqa	%xmm0,%xmm8
+	punpcklqdq	%xmm5,%xmm0
+	punpckhqdq	%xmm5,%xmm8
+
+	pshufd	$63,%xmm7,%xmm1
+	pshufd	$127,%xmm7,%xmm9
+	pshufd	$27,%xmm0,%xmm0
+	pshufd	$27,%xmm8,%xmm8
+	jmp	.Loop_shaext
+
+.align	32
+.Loop_shaext:
+	movdqu	0(%r8),%xmm4
+	movdqu	0(%r9),%xmm11
+	movdqu	16(%r8),%xmm5
+	movdqu	16(%r9),%xmm12
+	movdqu	32(%r8),%xmm6
+.byte	102,15,56,0,227
+	movdqu	32(%r9),%xmm13
+.byte	102,68,15,56,0,219
+	movdqu	48(%r8),%xmm7
+	leaq	64(%r8),%r8
+.byte	102,15,56,0,235
+	movdqu	48(%r9),%xmm14
+	leaq	64(%r9),%r9
+.byte	102,68,15,56,0,227
+
+	movdqa	%xmm1,80(%rsp)
+	paddd	%xmm4,%xmm1
+	movdqa	%xmm9,112(%rsp)
+	paddd	%xmm11,%xmm9
+	movdqa	%xmm0,64(%rsp)
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm8,96(%rsp)
+	movdqa	%xmm8,%xmm10
+.byte	15,58,204,193,0
+.byte	15,56,200,213
+.byte	69,15,58,204,193,0
+.byte	69,15,56,200,212
+.byte	102,15,56,0,243
+	prefetcht0	127(%r8)
+.byte	15,56,201,229
+.byte	102,68,15,56,0,235
+	prefetcht0	127(%r9)
+.byte	69,15,56,201,220
+
+.byte	102,15,56,0,251
+	movdqa	%xmm0,%xmm1
+.byte	102,68,15,56,0,243
+	movdqa	%xmm8,%xmm9
+.byte	15,58,204,194,0
+.byte	15,56,200,206
+.byte	69,15,58,204,194,0
+.byte	69,15,56,200,205
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+	pxor	%xmm13,%xmm11
+.byte	69,15,56,201,229
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm8,%xmm10
+.byte	15,58,204,193,0
+.byte	15,56,200,215
+.byte	69,15,58,204,193,0
+.byte	69,15,56,200,214
+.byte	15,56,202,231
+.byte	69,15,56,202,222
+	pxor	%xmm7,%xmm5
+.byte	15,56,201,247
+	pxor	%xmm14,%xmm12
+.byte	69,15,56,201,238
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm8,%xmm9
+.byte	15,58,204,194,0
+.byte	15,56,200,204
+.byte	69,15,58,204,194,0
+.byte	69,15,56,200,203
+.byte	15,56,202,236
+.byte	69,15,56,202,227
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+	pxor	%xmm11,%xmm13
+.byte	69,15,56,201,243
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm8,%xmm10
+.byte	15,58,204,193,0
+.byte	15,56,200,213
+.byte	69,15,58,204,193,0
+.byte	69,15,56,200,212
+.byte	15,56,202,245
+.byte	69,15,56,202,236
+	pxor	%xmm5,%xmm7
+.byte	15,56,201,229
+	pxor	%xmm12,%xmm14
+.byte	69,15,56,201,220
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm8,%xmm9
+.byte	15,58,204,194,1
+.byte	15,56,200,206
+.byte	69,15,58,204,194,1
+.byte	69,15,56,200,205
+.byte	15,56,202,254
+.byte	69,15,56,202,245
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+	pxor	%xmm13,%xmm11
+.byte	69,15,56,201,229
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm8,%xmm10
+.byte	15,58,204,193,1
+.byte	15,56,200,215
+.byte	69,15,58,204,193,1
+.byte	69,15,56,200,214
+.byte	15,56,202,231
+.byte	69,15,56,202,222
+	pxor	%xmm7,%xmm5
+.byte	15,56,201,247
+	pxor	%xmm14,%xmm12
+.byte	69,15,56,201,238
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm8,%xmm9
+.byte	15,58,204,194,1
+.byte	15,56,200,204
+.byte	69,15,58,204,194,1
+.byte	69,15,56,200,203
+.byte	15,56,202,236
+.byte	69,15,56,202,227
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+	pxor	%xmm11,%xmm13
+.byte	69,15,56,201,243
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm8,%xmm10
+.byte	15,58,204,193,1
+.byte	15,56,200,213
+.byte	69,15,58,204,193,1
+.byte	69,15,56,200,212
+.byte	15,56,202,245
+.byte	69,15,56,202,236
+	pxor	%xmm5,%xmm7
+.byte	15,56,201,229
+	pxor	%xmm12,%xmm14
+.byte	69,15,56,201,220
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm8,%xmm9
+.byte	15,58,204,194,1
+.byte	15,56,200,206
+.byte	69,15,58,204,194,1
+.byte	69,15,56,200,205
+.byte	15,56,202,254
+.byte	69,15,56,202,245
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+	pxor	%xmm13,%xmm11
+.byte	69,15,56,201,229
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm8,%xmm10
+.byte	15,58,204,193,2
+.byte	15,56,200,215
+.byte	69,15,58,204,193,2
+.byte	69,15,56,200,214
+.byte	15,56,202,231
+.byte	69,15,56,202,222
+	pxor	%xmm7,%xmm5
+.byte	15,56,201,247
+	pxor	%xmm14,%xmm12
+.byte	69,15,56,201,238
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm8,%xmm9
+.byte	15,58,204,194,2
+.byte	15,56,200,204
+.byte	69,15,58,204,194,2
+.byte	69,15,56,200,203
+.byte	15,56,202,236
+.byte	69,15,56,202,227
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+	pxor	%xmm11,%xmm13
+.byte	69,15,56,201,243
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm8,%xmm10
+.byte	15,58,204,193,2
+.byte	15,56,200,213
+.byte	69,15,58,204,193,2
+.byte	69,15,56,200,212
+.byte	15,56,202,245
+.byte	69,15,56,202,236
+	pxor	%xmm5,%xmm7
+.byte	15,56,201,229
+	pxor	%xmm12,%xmm14
+.byte	69,15,56,201,220
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm8,%xmm9
+.byte	15,58,204,194,2
+.byte	15,56,200,206
+.byte	69,15,58,204,194,2
+.byte	69,15,56,200,205
+.byte	15,56,202,254
+.byte	69,15,56,202,245
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+	pxor	%xmm13,%xmm11
+.byte	69,15,56,201,229
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm8,%xmm10
+.byte	15,58,204,193,2
+.byte	15,56,200,215
+.byte	69,15,58,204,193,2
+.byte	69,15,56,200,214
+.byte	15,56,202,231
+.byte	69,15,56,202,222
+	pxor	%xmm7,%xmm5
+.byte	15,56,201,247
+	pxor	%xmm14,%xmm12
+.byte	69,15,56,201,238
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm8,%xmm9
+.byte	15,58,204,194,3
+.byte	15,56,200,204
+.byte	69,15,58,204,194,3
+.byte	69,15,56,200,203
+.byte	15,56,202,236
+.byte	69,15,56,202,227
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+	pxor	%xmm11,%xmm13
+.byte	69,15,56,201,243
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm8,%xmm10
+.byte	15,58,204,193,3
+.byte	15,56,200,213
+.byte	69,15,58,204,193,3
+.byte	69,15,56,200,212
+.byte	15,56,202,245
+.byte	69,15,56,202,236
+	pxor	%xmm5,%xmm7
+	pxor	%xmm12,%xmm14
+
+	movl	$1,%ecx
+	pxor	%xmm4,%xmm4
+	cmpl	0(%rbx),%ecx
+	cmovgeq	%rsp,%r8
+
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm8,%xmm9
+.byte	15,58,204,194,3
+.byte	15,56,200,206
+.byte	69,15,58,204,194,3
+.byte	69,15,56,200,205
+.byte	15,56,202,254
+.byte	69,15,56,202,245
+
+	cmpl	4(%rbx),%ecx
+	cmovgeq	%rsp,%r9
+	movq	(%rbx),%xmm6
+
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm8,%xmm10
+.byte	15,58,204,193,3
+.byte	15,56,200,215
+.byte	69,15,58,204,193,3
+.byte	69,15,56,200,214
+
+	pshufd	$0x00,%xmm6,%xmm11
+	pshufd	$0x55,%xmm6,%xmm12
+	movdqa	%xmm6,%xmm7
+	pcmpgtd	%xmm4,%xmm11
+	pcmpgtd	%xmm4,%xmm12
+
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm8,%xmm9
+.byte	15,58,204,194,3
+.byte	15,56,200,204
+.byte	69,15,58,204,194,3
+.byte	68,15,56,200,204
+
+	pcmpgtd	%xmm4,%xmm7
+	pand	%xmm11,%xmm0
+	pand	%xmm11,%xmm1
+	pand	%xmm12,%xmm8
+	pand	%xmm12,%xmm9
+	paddd	%xmm7,%xmm6
+
+	paddd	64(%rsp),%xmm0
+	paddd	80(%rsp),%xmm1
+	paddd	96(%rsp),%xmm8
+	paddd	112(%rsp),%xmm9
+
+	movq	%xmm6,(%rbx)
+	decl	%edx
+	jnz	.Loop_shaext
+
+	movl	280(%rsp),%edx
+
+	pshufd	$27,%xmm0,%xmm0
+	pshufd	$27,%xmm8,%xmm8
+
+	movdqa	%xmm0,%xmm6
+	punpckldq	%xmm8,%xmm0
+	punpckhdq	%xmm8,%xmm6
+	punpckhdq	%xmm9,%xmm1
+	movq	%xmm0,0-64(%rdi)
+	psrldq	$8,%xmm0
+	movq	%xmm6,64-64(%rdi)
+	psrldq	$8,%xmm6
+	movq	%xmm0,32-64(%rdi)
+	psrldq	$8,%xmm1
+	movq	%xmm6,96-64(%rdi)
+	movq	%xmm1,128-64(%rdi)
+
+	leaq	8(%rdi),%rdi
+	leaq	32(%rsi),%rsi
+	decl	%edx
+	jnz	.Loop_grande_shaext
+
+.Ldone_shaext:
+
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+.Lepilogue_shaext:
+	.byte	0xf3,0xc3
+.size	sha1_multi_block_shaext,.-sha1_multi_block_shaext
+.type	sha1_multi_block_avx,@function
+.align	32
+sha1_multi_block_avx:
+_avx_shortcut:
+	shrq	$32,%rcx
+	cmpl	$2,%edx
+	jb	.Lavx
+	testl	$32,%ecx
+	jnz	_avx2_shortcut
+	jmp	.Lavx
+.align	32
+.Lavx:
+	movq	%rsp,%rax
+	pushq	%rbx
+	pushq	%rbp
+	subq	$288,%rsp
+	andq	$-256,%rsp
+	movq	%rax,272(%rsp)
+.Lbody_avx:
+	leaq	K_XX_XX(%rip),%rbp
+	leaq	256(%rsp),%rbx
+
+	vzeroupper
+.Loop_grande_avx:
+	movl	%edx,280(%rsp)
+	xorl	%edx,%edx
+	movq	0(%rsi),%r8
+	movl	8(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,0(%rbx)
+	cmovleq	%rbp,%r8
+	movq	16(%rsi),%r9
+	movl	24(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,4(%rbx)
+	cmovleq	%rbp,%r9
+	movq	32(%rsi),%r10
+	movl	40(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,8(%rbx)
+	cmovleq	%rbp,%r10
+	movq	48(%rsi),%r11
+	movl	56(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,12(%rbx)
+	cmovleq	%rbp,%r11
+	testl	%edx,%edx
+	jz	.Ldone_avx
+
+	vmovdqu	0(%rdi),%xmm10
+	leaq	128(%rsp),%rax
+	vmovdqu	32(%rdi),%xmm11
+	vmovdqu	64(%rdi),%xmm12
+	vmovdqu	96(%rdi),%xmm13
+	vmovdqu	128(%rdi),%xmm14
+	vmovdqu	96(%rbp),%xmm5
+	jmp	.Loop_avx
+
+.align	32
+.Loop_avx:
+	vmovdqa	-32(%rbp),%xmm15
+	vmovd	(%r8),%xmm0
+	leaq	64(%r8),%r8
+	vmovd	(%r9),%xmm2
+	leaq	64(%r9),%r9
+	vpinsrd	$1,(%r10),%xmm0,%xmm0
+	leaq	64(%r10),%r10
+	vpinsrd	$1,(%r11),%xmm2,%xmm2
+	leaq	64(%r11),%r11
+	vmovd	-60(%r8),%xmm1
+	vpunpckldq	%xmm2,%xmm0,%xmm0
+	vmovd	-60(%r9),%xmm9
+	vpshufb	%xmm5,%xmm0,%xmm0
+	vpinsrd	$1,-60(%r10),%xmm1,%xmm1
+	vpinsrd	$1,-60(%r11),%xmm9,%xmm9
+	vpaddd	%xmm15,%xmm14,%xmm14
+	vpslld	$5,%xmm10,%xmm8
+	vpandn	%xmm13,%xmm11,%xmm7
+	vpand	%xmm12,%xmm11,%xmm6
+
+	vmovdqa	%xmm0,0-128(%rax)
+	vpaddd	%xmm0,%xmm14,%xmm14
+	vpunpckldq	%xmm9,%xmm1,%xmm1
+	vpsrld	$27,%xmm10,%xmm9
+	vpxor	%xmm7,%xmm6,%xmm6
+	vmovd	-56(%r8),%xmm2
+
+	vpslld	$30,%xmm11,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vmovd	-56(%r9),%xmm9
+	vpaddd	%xmm6,%xmm14,%xmm14
+
+	vpsrld	$2,%xmm11,%xmm11
+	vpaddd	%xmm8,%xmm14,%xmm14
+	vpshufb	%xmm5,%xmm1,%xmm1
+	vpor	%xmm7,%xmm11,%xmm11
+	vpinsrd	$1,-56(%r10),%xmm2,%xmm2
+	vpinsrd	$1,-56(%r11),%xmm9,%xmm9
+	vpaddd	%xmm15,%xmm13,%xmm13
+	vpslld	$5,%xmm14,%xmm8
+	vpandn	%xmm12,%xmm10,%xmm7
+	vpand	%xmm11,%xmm10,%xmm6
+
+	vmovdqa	%xmm1,16-128(%rax)
+	vpaddd	%xmm1,%xmm13,%xmm13
+	vpunpckldq	%xmm9,%xmm2,%xmm2
+	vpsrld	$27,%xmm14,%xmm9
+	vpxor	%xmm7,%xmm6,%xmm6
+	vmovd	-52(%r8),%xmm3
+
+	vpslld	$30,%xmm10,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vmovd	-52(%r9),%xmm9
+	vpaddd	%xmm6,%xmm13,%xmm13
+
+	vpsrld	$2,%xmm10,%xmm10
+	vpaddd	%xmm8,%xmm13,%xmm13
+	vpshufb	%xmm5,%xmm2,%xmm2
+	vpor	%xmm7,%xmm10,%xmm10
+	vpinsrd	$1,-52(%r10),%xmm3,%xmm3
+	vpinsrd	$1,-52(%r11),%xmm9,%xmm9
+	vpaddd	%xmm15,%xmm12,%xmm12
+	vpslld	$5,%xmm13,%xmm8
+	vpandn	%xmm11,%xmm14,%xmm7
+	vpand	%xmm10,%xmm14,%xmm6
+
+	vmovdqa	%xmm2,32-128(%rax)
+	vpaddd	%xmm2,%xmm12,%xmm12
+	vpunpckldq	%xmm9,%xmm3,%xmm3
+	vpsrld	$27,%xmm13,%xmm9
+	vpxor	%xmm7,%xmm6,%xmm6
+	vmovd	-48(%r8),%xmm4
+
+	vpslld	$30,%xmm14,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vmovd	-48(%r9),%xmm9
+	vpaddd	%xmm6,%xmm12,%xmm12
+
+	vpsrld	$2,%xmm14,%xmm14
+	vpaddd	%xmm8,%xmm12,%xmm12
+	vpshufb	%xmm5,%xmm3,%xmm3
+	vpor	%xmm7,%xmm14,%xmm14
+	vpinsrd	$1,-48(%r10),%xmm4,%xmm4
+	vpinsrd	$1,-48(%r11),%xmm9,%xmm9
+	vpaddd	%xmm15,%xmm11,%xmm11
+	vpslld	$5,%xmm12,%xmm8
+	vpandn	%xmm10,%xmm13,%xmm7
+	vpand	%xmm14,%xmm13,%xmm6
+
+	vmovdqa	%xmm3,48-128(%rax)
+	vpaddd	%xmm3,%xmm11,%xmm11
+	vpunpckldq	%xmm9,%xmm4,%xmm4
+	vpsrld	$27,%xmm12,%xmm9
+	vpxor	%xmm7,%xmm6,%xmm6
+	vmovd	-44(%r8),%xmm0
+
+	vpslld	$30,%xmm13,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vmovd	-44(%r9),%xmm9
+	vpaddd	%xmm6,%xmm11,%xmm11
+
+	vpsrld	$2,%xmm13,%xmm13
+	vpaddd	%xmm8,%xmm11,%xmm11
+	vpshufb	%xmm5,%xmm4,%xmm4
+	vpor	%xmm7,%xmm13,%xmm13
+	vpinsrd	$1,-44(%r10),%xmm0,%xmm0
+	vpinsrd	$1,-44(%r11),%xmm9,%xmm9
+	vpaddd	%xmm15,%xmm10,%xmm10
+	vpslld	$5,%xmm11,%xmm8
+	vpandn	%xmm14,%xmm12,%xmm7
+	vpand	%xmm13,%xmm12,%xmm6
+
+	vmovdqa	%xmm4,64-128(%rax)
+	vpaddd	%xmm4,%xmm10,%xmm10
+	vpunpckldq	%xmm9,%xmm0,%xmm0
+	vpsrld	$27,%xmm11,%xmm9
+	vpxor	%xmm7,%xmm6,%xmm6
+	vmovd	-40(%r8),%xmm1
+
+	vpslld	$30,%xmm12,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vmovd	-40(%r9),%xmm9
+	vpaddd	%xmm6,%xmm10,%xmm10
+
+	vpsrld	$2,%xmm12,%xmm12
+	vpaddd	%xmm8,%xmm10,%xmm10
+	vpshufb	%xmm5,%xmm0,%xmm0
+	vpor	%xmm7,%xmm12,%xmm12
+	vpinsrd	$1,-40(%r10),%xmm1,%xmm1
+	vpinsrd	$1,-40(%r11),%xmm9,%xmm9
+	vpaddd	%xmm15,%xmm14,%xmm14
+	vpslld	$5,%xmm10,%xmm8
+	vpandn	%xmm13,%xmm11,%xmm7
+	vpand	%xmm12,%xmm11,%xmm6
+
+	vmovdqa	%xmm0,80-128(%rax)
+	vpaddd	%xmm0,%xmm14,%xmm14
+	vpunpckldq	%xmm9,%xmm1,%xmm1
+	vpsrld	$27,%xmm10,%xmm9
+	vpxor	%xmm7,%xmm6,%xmm6
+	vmovd	-36(%r8),%xmm2
+
+	vpslld	$30,%xmm11,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vmovd	-36(%r9),%xmm9
+	vpaddd	%xmm6,%xmm14,%xmm14
+
+	vpsrld	$2,%xmm11,%xmm11
+	vpaddd	%xmm8,%xmm14,%xmm14
+	vpshufb	%xmm5,%xmm1,%xmm1
+	vpor	%xmm7,%xmm11,%xmm11
+	vpinsrd	$1,-36(%r10),%xmm2,%xmm2
+	vpinsrd	$1,-36(%r11),%xmm9,%xmm9
+	vpaddd	%xmm15,%xmm13,%xmm13
+	vpslld	$5,%xmm14,%xmm8
+	vpandn	%xmm12,%xmm10,%xmm7
+	vpand	%xmm11,%xmm10,%xmm6
+
+	vmovdqa	%xmm1,96-128(%rax)
+	vpaddd	%xmm1,%xmm13,%xmm13
+	vpunpckldq	%xmm9,%xmm2,%xmm2
+	vpsrld	$27,%xmm14,%xmm9
+	vpxor	%xmm7,%xmm6,%xmm6
+	vmovd	-32(%r8),%xmm3
+
+	vpslld	$30,%xmm10,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vmovd	-32(%r9),%xmm9
+	vpaddd	%xmm6,%xmm13,%xmm13
+
+	vpsrld	$2,%xmm10,%xmm10
+	vpaddd	%xmm8,%xmm13,%xmm13
+	vpshufb	%xmm5,%xmm2,%xmm2
+	vpor	%xmm7,%xmm10,%xmm10
+	vpinsrd	$1,-32(%r10),%xmm3,%xmm3
+	vpinsrd	$1,-32(%r11),%xmm9,%xmm9
+	vpaddd	%xmm15,%xmm12,%xmm12
+	vpslld	$5,%xmm13,%xmm8
+	vpandn	%xmm11,%xmm14,%xmm7
+	vpand	%xmm10,%xmm14,%xmm6
+
+	vmovdqa	%xmm2,112-128(%rax)
+	vpaddd	%xmm2,%xmm12,%xmm12
+	vpunpckldq	%xmm9,%xmm3,%xmm3
+	vpsrld	$27,%xmm13,%xmm9
+	vpxor	%xmm7,%xmm6,%xmm6
+	vmovd	-28(%r8),%xmm4
+
+	vpslld	$30,%xmm14,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vmovd	-28(%r9),%xmm9
+	vpaddd	%xmm6,%xmm12,%xmm12
+
+	vpsrld	$2,%xmm14,%xmm14
+	vpaddd	%xmm8,%xmm12,%xmm12
+	vpshufb	%xmm5,%xmm3,%xmm3
+	vpor	%xmm7,%xmm14,%xmm14
+	vpinsrd	$1,-28(%r10),%xmm4,%xmm4
+	vpinsrd	$1,-28(%r11),%xmm9,%xmm9
+	vpaddd	%xmm15,%xmm11,%xmm11
+	vpslld	$5,%xmm12,%xmm8
+	vpandn	%xmm10,%xmm13,%xmm7
+	vpand	%xmm14,%xmm13,%xmm6
+
+	vmovdqa	%xmm3,128-128(%rax)
+	vpaddd	%xmm3,%xmm11,%xmm11
+	vpunpckldq	%xmm9,%xmm4,%xmm4
+	vpsrld	$27,%xmm12,%xmm9
+	vpxor	%xmm7,%xmm6,%xmm6
+	vmovd	-24(%r8),%xmm0
+
+	vpslld	$30,%xmm13,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vmovd	-24(%r9),%xmm9
+	vpaddd	%xmm6,%xmm11,%xmm11
+
+	vpsrld	$2,%xmm13,%xmm13
+	vpaddd	%xmm8,%xmm11,%xmm11
+	vpshufb	%xmm5,%xmm4,%xmm4
+	vpor	%xmm7,%xmm13,%xmm13
+	vpinsrd	$1,-24(%r10),%xmm0,%xmm0
+	vpinsrd	$1,-24(%r11),%xmm9,%xmm9
+	vpaddd	%xmm15,%xmm10,%xmm10
+	vpslld	$5,%xmm11,%xmm8
+	vpandn	%xmm14,%xmm12,%xmm7
+	vpand	%xmm13,%xmm12,%xmm6
+
+	vmovdqa	%xmm4,144-128(%rax)
+	vpaddd	%xmm4,%xmm10,%xmm10
+	vpunpckldq	%xmm9,%xmm0,%xmm0
+	vpsrld	$27,%xmm11,%xmm9
+	vpxor	%xmm7,%xmm6,%xmm6
+	vmovd	-20(%r8),%xmm1
+
+	vpslld	$30,%xmm12,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vmovd	-20(%r9),%xmm9
+	vpaddd	%xmm6,%xmm10,%xmm10
+
+	vpsrld	$2,%xmm12,%xmm12
+	vpaddd	%xmm8,%xmm10,%xmm10
+	vpshufb	%xmm5,%xmm0,%xmm0
+	vpor	%xmm7,%xmm12,%xmm12
+	vpinsrd	$1,-20(%r10),%xmm1,%xmm1
+	vpinsrd	$1,-20(%r11),%xmm9,%xmm9
+	vpaddd	%xmm15,%xmm14,%xmm14
+	vpslld	$5,%xmm10,%xmm8
+	vpandn	%xmm13,%xmm11,%xmm7
+	vpand	%xmm12,%xmm11,%xmm6
+
+	vmovdqa	%xmm0,160-128(%rax)
+	vpaddd	%xmm0,%xmm14,%xmm14
+	vpunpckldq	%xmm9,%xmm1,%xmm1
+	vpsrld	$27,%xmm10,%xmm9
+	vpxor	%xmm7,%xmm6,%xmm6
+	vmovd	-16(%r8),%xmm2
+
+	vpslld	$30,%xmm11,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vmovd	-16(%r9),%xmm9
+	vpaddd	%xmm6,%xmm14,%xmm14
+
+	vpsrld	$2,%xmm11,%xmm11
+	vpaddd	%xmm8,%xmm14,%xmm14
+	vpshufb	%xmm5,%xmm1,%xmm1
+	vpor	%xmm7,%xmm11,%xmm11
+	vpinsrd	$1,-16(%r10),%xmm2,%xmm2
+	vpinsrd	$1,-16(%r11),%xmm9,%xmm9
+	vpaddd	%xmm15,%xmm13,%xmm13
+	vpslld	$5,%xmm14,%xmm8
+	vpandn	%xmm12,%xmm10,%xmm7
+	vpand	%xmm11,%xmm10,%xmm6
+
+	vmovdqa	%xmm1,176-128(%rax)
+	vpaddd	%xmm1,%xmm13,%xmm13
+	vpunpckldq	%xmm9,%xmm2,%xmm2
+	vpsrld	$27,%xmm14,%xmm9
+	vpxor	%xmm7,%xmm6,%xmm6
+	vmovd	-12(%r8),%xmm3
+
+	vpslld	$30,%xmm10,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vmovd	-12(%r9),%xmm9
+	vpaddd	%xmm6,%xmm13,%xmm13
+
+	vpsrld	$2,%xmm10,%xmm10
+	vpaddd	%xmm8,%xmm13,%xmm13
+	vpshufb	%xmm5,%xmm2,%xmm2
+	vpor	%xmm7,%xmm10,%xmm10
+	vpinsrd	$1,-12(%r10),%xmm3,%xmm3
+	vpinsrd	$1,-12(%r11),%xmm9,%xmm9
+	vpaddd	%xmm15,%xmm12,%xmm12
+	vpslld	$5,%xmm13,%xmm8
+	vpandn	%xmm11,%xmm14,%xmm7
+	vpand	%xmm10,%xmm14,%xmm6
+
+	vmovdqa	%xmm2,192-128(%rax)
+	vpaddd	%xmm2,%xmm12,%xmm12
+	vpunpckldq	%xmm9,%xmm3,%xmm3
+	vpsrld	$27,%xmm13,%xmm9
+	vpxor	%xmm7,%xmm6,%xmm6
+	vmovd	-8(%r8),%xmm4
+
+	vpslld	$30,%xmm14,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vmovd	-8(%r9),%xmm9
+	vpaddd	%xmm6,%xmm12,%xmm12
+
+	vpsrld	$2,%xmm14,%xmm14
+	vpaddd	%xmm8,%xmm12,%xmm12
+	vpshufb	%xmm5,%xmm3,%xmm3
+	vpor	%xmm7,%xmm14,%xmm14
+	vpinsrd	$1,-8(%r10),%xmm4,%xmm4
+	vpinsrd	$1,-8(%r11),%xmm9,%xmm9
+	vpaddd	%xmm15,%xmm11,%xmm11
+	vpslld	$5,%xmm12,%xmm8
+	vpandn	%xmm10,%xmm13,%xmm7
+	vpand	%xmm14,%xmm13,%xmm6
+
+	vmovdqa	%xmm3,208-128(%rax)
+	vpaddd	%xmm3,%xmm11,%xmm11
+	vpunpckldq	%xmm9,%xmm4,%xmm4
+	vpsrld	$27,%xmm12,%xmm9
+	vpxor	%xmm7,%xmm6,%xmm6
+	vmovd	-4(%r8),%xmm0
+
+	vpslld	$30,%xmm13,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vmovd	-4(%r9),%xmm9
+	vpaddd	%xmm6,%xmm11,%xmm11
+
+	vpsrld	$2,%xmm13,%xmm13
+	vpaddd	%xmm8,%xmm11,%xmm11
+	vpshufb	%xmm5,%xmm4,%xmm4
+	vpor	%xmm7,%xmm13,%xmm13
+	vmovdqa	0-128(%rax),%xmm1
+	vpinsrd	$1,-4(%r10),%xmm0,%xmm0
+	vpinsrd	$1,-4(%r11),%xmm9,%xmm9
+	vpaddd	%xmm15,%xmm10,%xmm10
+	prefetcht0	63(%r8)
+	vpslld	$5,%xmm11,%xmm8
+	vpandn	%xmm14,%xmm12,%xmm7
+	vpand	%xmm13,%xmm12,%xmm6
+
+	vmovdqa	%xmm4,224-128(%rax)
+	vpaddd	%xmm4,%xmm10,%xmm10
+	vpunpckldq	%xmm9,%xmm0,%xmm0
+	vpsrld	$27,%xmm11,%xmm9
+	prefetcht0	63(%r9)
+	vpxor	%xmm7,%xmm6,%xmm6
+
+	vpslld	$30,%xmm12,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	prefetcht0	63(%r10)
+	vpaddd	%xmm6,%xmm10,%xmm10
+
+	vpsrld	$2,%xmm12,%xmm12
+	vpaddd	%xmm8,%xmm10,%xmm10
+	prefetcht0	63(%r11)
+	vpshufb	%xmm5,%xmm0,%xmm0
+	vpor	%xmm7,%xmm12,%xmm12
+	vmovdqa	16-128(%rax),%xmm2
+	vpxor	%xmm3,%xmm1,%xmm1
+	vmovdqa	32-128(%rax),%xmm3
+
+	vpaddd	%xmm15,%xmm14,%xmm14
+	vpslld	$5,%xmm10,%xmm8
+	vpandn	%xmm13,%xmm11,%xmm7
+
+	vpand	%xmm12,%xmm11,%xmm6
+
+	vmovdqa	%xmm0,240-128(%rax)
+	vpaddd	%xmm0,%xmm14,%xmm14
+	vpxor	128-128(%rax),%xmm1,%xmm1
+	vpsrld	$27,%xmm10,%xmm9
+	vpxor	%xmm7,%xmm6,%xmm6
+	vpxor	%xmm3,%xmm1,%xmm1
+
+
+	vpslld	$30,%xmm11,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm14,%xmm14
+
+	vpsrld	$31,%xmm1,%xmm5
+	vpaddd	%xmm1,%xmm1,%xmm1
+
+	vpsrld	$2,%xmm11,%xmm11
+
+	vpaddd	%xmm8,%xmm14,%xmm14
+	vpor	%xmm5,%xmm1,%xmm1
+	vpor	%xmm7,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm2,%xmm2
+	vmovdqa	48-128(%rax),%xmm4
+
+	vpaddd	%xmm15,%xmm13,%xmm13
+	vpslld	$5,%xmm14,%xmm8
+	vpandn	%xmm12,%xmm10,%xmm7
+
+	vpand	%xmm11,%xmm10,%xmm6
+
+	vmovdqa	%xmm1,0-128(%rax)
+	vpaddd	%xmm1,%xmm13,%xmm13
+	vpxor	144-128(%rax),%xmm2,%xmm2
+	vpsrld	$27,%xmm14,%xmm9
+	vpxor	%xmm7,%xmm6,%xmm6
+	vpxor	%xmm4,%xmm2,%xmm2
+
+
+	vpslld	$30,%xmm10,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm13,%xmm13
+
+	vpsrld	$31,%xmm2,%xmm5
+	vpaddd	%xmm2,%xmm2,%xmm2
+
+	vpsrld	$2,%xmm10,%xmm10
+
+	vpaddd	%xmm8,%xmm13,%xmm13
+	vpor	%xmm5,%xmm2,%xmm2
+	vpor	%xmm7,%xmm10,%xmm10
+	vpxor	%xmm0,%xmm3,%xmm3
+	vmovdqa	64-128(%rax),%xmm0
+
+	vpaddd	%xmm15,%xmm12,%xmm12
+	vpslld	$5,%xmm13,%xmm8
+	vpandn	%xmm11,%xmm14,%xmm7
+
+	vpand	%xmm10,%xmm14,%xmm6
+
+	vmovdqa	%xmm2,16-128(%rax)
+	vpaddd	%xmm2,%xmm12,%xmm12
+	vpxor	160-128(%rax),%xmm3,%xmm3
+	vpsrld	$27,%xmm13,%xmm9
+	vpxor	%xmm7,%xmm6,%xmm6
+	vpxor	%xmm0,%xmm3,%xmm3
+
+
+	vpslld	$30,%xmm14,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm12,%xmm12
+
+	vpsrld	$31,%xmm3,%xmm5
+	vpaddd	%xmm3,%xmm3,%xmm3
+
+	vpsrld	$2,%xmm14,%xmm14
+
+	vpaddd	%xmm8,%xmm12,%xmm12
+	vpor	%xmm5,%xmm3,%xmm3
+	vpor	%xmm7,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vmovdqa	80-128(%rax),%xmm1
+
+	vpaddd	%xmm15,%xmm11,%xmm11
+	vpslld	$5,%xmm12,%xmm8
+	vpandn	%xmm10,%xmm13,%xmm7
+
+	vpand	%xmm14,%xmm13,%xmm6
+
+	vmovdqa	%xmm3,32-128(%rax)
+	vpaddd	%xmm3,%xmm11,%xmm11
+	vpxor	176-128(%rax),%xmm4,%xmm4
+	vpsrld	$27,%xmm12,%xmm9
+	vpxor	%xmm7,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm4,%xmm4
+
+
+	vpslld	$30,%xmm13,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm11,%xmm11
+
+	vpsrld	$31,%xmm4,%xmm5
+	vpaddd	%xmm4,%xmm4,%xmm4
+
+	vpsrld	$2,%xmm13,%xmm13
+
+	vpaddd	%xmm8,%xmm11,%xmm11
+	vpor	%xmm5,%xmm4,%xmm4
+	vpor	%xmm7,%xmm13,%xmm13
+	vpxor	%xmm2,%xmm0,%xmm0
+	vmovdqa	96-128(%rax),%xmm2
+
+	vpaddd	%xmm15,%xmm10,%xmm10
+	vpslld	$5,%xmm11,%xmm8
+	vpandn	%xmm14,%xmm12,%xmm7
+
+	vpand	%xmm13,%xmm12,%xmm6
+
+	vmovdqa	%xmm4,48-128(%rax)
+	vpaddd	%xmm4,%xmm10,%xmm10
+	vpxor	192-128(%rax),%xmm0,%xmm0
+	vpsrld	$27,%xmm11,%xmm9
+	vpxor	%xmm7,%xmm6,%xmm6
+	vpxor	%xmm2,%xmm0,%xmm0
+
+
+	vpslld	$30,%xmm12,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm10,%xmm10
+
+	vpsrld	$31,%xmm0,%xmm5
+	vpaddd	%xmm0,%xmm0,%xmm0
+
+	vpsrld	$2,%xmm12,%xmm12
+
+	vpaddd	%xmm8,%xmm10,%xmm10
+	vpor	%xmm5,%xmm0,%xmm0
+	vpor	%xmm7,%xmm12,%xmm12
+	vmovdqa	0(%rbp),%xmm15
+	vpxor	%xmm3,%xmm1,%xmm1
+	vmovdqa	112-128(%rax),%xmm3
+
+	vpslld	$5,%xmm10,%xmm8
+	vpaddd	%xmm15,%xmm14,%xmm14
+	vpxor	%xmm11,%xmm13,%xmm6
+	vmovdqa	%xmm0,64-128(%rax)
+	vpaddd	%xmm0,%xmm14,%xmm14
+	vpxor	208-128(%rax),%xmm1,%xmm1
+	vpsrld	$27,%xmm10,%xmm9
+	vpxor	%xmm12,%xmm6,%xmm6
+	vpxor	%xmm3,%xmm1,%xmm1
+
+	vpslld	$30,%xmm11,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm14,%xmm14
+	vpsrld	$31,%xmm1,%xmm5
+	vpaddd	%xmm1,%xmm1,%xmm1
+
+	vpsrld	$2,%xmm11,%xmm11
+	vpaddd	%xmm8,%xmm14,%xmm14
+	vpor	%xmm5,%xmm1,%xmm1
+	vpor	%xmm7,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm2,%xmm2
+	vmovdqa	128-128(%rax),%xmm4
+
+	vpslld	$5,%xmm14,%xmm8
+	vpaddd	%xmm15,%xmm13,%xmm13
+	vpxor	%xmm10,%xmm12,%xmm6
+	vmovdqa	%xmm1,80-128(%rax)
+	vpaddd	%xmm1,%xmm13,%xmm13
+	vpxor	224-128(%rax),%xmm2,%xmm2
+	vpsrld	$27,%xmm14,%xmm9
+	vpxor	%xmm11,%xmm6,%xmm6
+	vpxor	%xmm4,%xmm2,%xmm2
+
+	vpslld	$30,%xmm10,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm13,%xmm13
+	vpsrld	$31,%xmm2,%xmm5
+	vpaddd	%xmm2,%xmm2,%xmm2
+
+	vpsrld	$2,%xmm10,%xmm10
+	vpaddd	%xmm8,%xmm13,%xmm13
+	vpor	%xmm5,%xmm2,%xmm2
+	vpor	%xmm7,%xmm10,%xmm10
+	vpxor	%xmm0,%xmm3,%xmm3
+	vmovdqa	144-128(%rax),%xmm0
+
+	vpslld	$5,%xmm13,%xmm8
+	vpaddd	%xmm15,%xmm12,%xmm12
+	vpxor	%xmm14,%xmm11,%xmm6
+	vmovdqa	%xmm2,96-128(%rax)
+	vpaddd	%xmm2,%xmm12,%xmm12
+	vpxor	240-128(%rax),%xmm3,%xmm3
+	vpsrld	$27,%xmm13,%xmm9
+	vpxor	%xmm10,%xmm6,%xmm6
+	vpxor	%xmm0,%xmm3,%xmm3
+
+	vpslld	$30,%xmm14,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm12,%xmm12
+	vpsrld	$31,%xmm3,%xmm5
+	vpaddd	%xmm3,%xmm3,%xmm3
+
+	vpsrld	$2,%xmm14,%xmm14
+	vpaddd	%xmm8,%xmm12,%xmm12
+	vpor	%xmm5,%xmm3,%xmm3
+	vpor	%xmm7,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vmovdqa	160-128(%rax),%xmm1
+
+	vpslld	$5,%xmm12,%xmm8
+	vpaddd	%xmm15,%xmm11,%xmm11
+	vpxor	%xmm13,%xmm10,%xmm6
+	vmovdqa	%xmm3,112-128(%rax)
+	vpaddd	%xmm3,%xmm11,%xmm11
+	vpxor	0-128(%rax),%xmm4,%xmm4
+	vpsrld	$27,%xmm12,%xmm9
+	vpxor	%xmm14,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm4,%xmm4
+
+	vpslld	$30,%xmm13,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm11,%xmm11
+	vpsrld	$31,%xmm4,%xmm5
+	vpaddd	%xmm4,%xmm4,%xmm4
+
+	vpsrld	$2,%xmm13,%xmm13
+	vpaddd	%xmm8,%xmm11,%xmm11
+	vpor	%xmm5,%xmm4,%xmm4
+	vpor	%xmm7,%xmm13,%xmm13
+	vpxor	%xmm2,%xmm0,%xmm0
+	vmovdqa	176-128(%rax),%xmm2
+
+	vpslld	$5,%xmm11,%xmm8
+	vpaddd	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm12,%xmm14,%xmm6
+	vmovdqa	%xmm4,128-128(%rax)
+	vpaddd	%xmm4,%xmm10,%xmm10
+	vpxor	16-128(%rax),%xmm0,%xmm0
+	vpsrld	$27,%xmm11,%xmm9
+	vpxor	%xmm13,%xmm6,%xmm6
+	vpxor	%xmm2,%xmm0,%xmm0
+
+	vpslld	$30,%xmm12,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm10,%xmm10
+	vpsrld	$31,%xmm0,%xmm5
+	vpaddd	%xmm0,%xmm0,%xmm0
+
+	vpsrld	$2,%xmm12,%xmm12
+	vpaddd	%xmm8,%xmm10,%xmm10
+	vpor	%xmm5,%xmm0,%xmm0
+	vpor	%xmm7,%xmm12,%xmm12
+	vpxor	%xmm3,%xmm1,%xmm1
+	vmovdqa	192-128(%rax),%xmm3
+
+	vpslld	$5,%xmm10,%xmm8
+	vpaddd	%xmm15,%xmm14,%xmm14
+	vpxor	%xmm11,%xmm13,%xmm6
+	vmovdqa	%xmm0,144-128(%rax)
+	vpaddd	%xmm0,%xmm14,%xmm14
+	vpxor	32-128(%rax),%xmm1,%xmm1
+	vpsrld	$27,%xmm10,%xmm9
+	vpxor	%xmm12,%xmm6,%xmm6
+	vpxor	%xmm3,%xmm1,%xmm1
+
+	vpslld	$30,%xmm11,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm14,%xmm14
+	vpsrld	$31,%xmm1,%xmm5
+	vpaddd	%xmm1,%xmm1,%xmm1
+
+	vpsrld	$2,%xmm11,%xmm11
+	vpaddd	%xmm8,%xmm14,%xmm14
+	vpor	%xmm5,%xmm1,%xmm1
+	vpor	%xmm7,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm2,%xmm2
+	vmovdqa	208-128(%rax),%xmm4
+
+	vpslld	$5,%xmm14,%xmm8
+	vpaddd	%xmm15,%xmm13,%xmm13
+	vpxor	%xmm10,%xmm12,%xmm6
+	vmovdqa	%xmm1,160-128(%rax)
+	vpaddd	%xmm1,%xmm13,%xmm13
+	vpxor	48-128(%rax),%xmm2,%xmm2
+	vpsrld	$27,%xmm14,%xmm9
+	vpxor	%xmm11,%xmm6,%xmm6
+	vpxor	%xmm4,%xmm2,%xmm2
+
+	vpslld	$30,%xmm10,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm13,%xmm13
+	vpsrld	$31,%xmm2,%xmm5
+	vpaddd	%xmm2,%xmm2,%xmm2
+
+	vpsrld	$2,%xmm10,%xmm10
+	vpaddd	%xmm8,%xmm13,%xmm13
+	vpor	%xmm5,%xmm2,%xmm2
+	vpor	%xmm7,%xmm10,%xmm10
+	vpxor	%xmm0,%xmm3,%xmm3
+	vmovdqa	224-128(%rax),%xmm0
+
+	vpslld	$5,%xmm13,%xmm8
+	vpaddd	%xmm15,%xmm12,%xmm12
+	vpxor	%xmm14,%xmm11,%xmm6
+	vmovdqa	%xmm2,176-128(%rax)
+	vpaddd	%xmm2,%xmm12,%xmm12
+	vpxor	64-128(%rax),%xmm3,%xmm3
+	vpsrld	$27,%xmm13,%xmm9
+	vpxor	%xmm10,%xmm6,%xmm6
+	vpxor	%xmm0,%xmm3,%xmm3
+
+	vpslld	$30,%xmm14,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm12,%xmm12
+	vpsrld	$31,%xmm3,%xmm5
+	vpaddd	%xmm3,%xmm3,%xmm3
+
+	vpsrld	$2,%xmm14,%xmm14
+	vpaddd	%xmm8,%xmm12,%xmm12
+	vpor	%xmm5,%xmm3,%xmm3
+	vpor	%xmm7,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vmovdqa	240-128(%rax),%xmm1
+
+	vpslld	$5,%xmm12,%xmm8
+	vpaddd	%xmm15,%xmm11,%xmm11
+	vpxor	%xmm13,%xmm10,%xmm6
+	vmovdqa	%xmm3,192-128(%rax)
+	vpaddd	%xmm3,%xmm11,%xmm11
+	vpxor	80-128(%rax),%xmm4,%xmm4
+	vpsrld	$27,%xmm12,%xmm9
+	vpxor	%xmm14,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm4,%xmm4
+
+	vpslld	$30,%xmm13,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm11,%xmm11
+	vpsrld	$31,%xmm4,%xmm5
+	vpaddd	%xmm4,%xmm4,%xmm4
+
+	vpsrld	$2,%xmm13,%xmm13
+	vpaddd	%xmm8,%xmm11,%xmm11
+	vpor	%xmm5,%xmm4,%xmm4
+	vpor	%xmm7,%xmm13,%xmm13
+	vpxor	%xmm2,%xmm0,%xmm0
+	vmovdqa	0-128(%rax),%xmm2
+
+	vpslld	$5,%xmm11,%xmm8
+	vpaddd	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm12,%xmm14,%xmm6
+	vmovdqa	%xmm4,208-128(%rax)
+	vpaddd	%xmm4,%xmm10,%xmm10
+	vpxor	96-128(%rax),%xmm0,%xmm0
+	vpsrld	$27,%xmm11,%xmm9
+	vpxor	%xmm13,%xmm6,%xmm6
+	vpxor	%xmm2,%xmm0,%xmm0
+
+	vpslld	$30,%xmm12,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm10,%xmm10
+	vpsrld	$31,%xmm0,%xmm5
+	vpaddd	%xmm0,%xmm0,%xmm0
+
+	vpsrld	$2,%xmm12,%xmm12
+	vpaddd	%xmm8,%xmm10,%xmm10
+	vpor	%xmm5,%xmm0,%xmm0
+	vpor	%xmm7,%xmm12,%xmm12
+	vpxor	%xmm3,%xmm1,%xmm1
+	vmovdqa	16-128(%rax),%xmm3
+
+	vpslld	$5,%xmm10,%xmm8
+	vpaddd	%xmm15,%xmm14,%xmm14
+	vpxor	%xmm11,%xmm13,%xmm6
+	vmovdqa	%xmm0,224-128(%rax)
+	vpaddd	%xmm0,%xmm14,%xmm14
+	vpxor	112-128(%rax),%xmm1,%xmm1
+	vpsrld	$27,%xmm10,%xmm9
+	vpxor	%xmm12,%xmm6,%xmm6
+	vpxor	%xmm3,%xmm1,%xmm1
+
+	vpslld	$30,%xmm11,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm14,%xmm14
+	vpsrld	$31,%xmm1,%xmm5
+	vpaddd	%xmm1,%xmm1,%xmm1
+
+	vpsrld	$2,%xmm11,%xmm11
+	vpaddd	%xmm8,%xmm14,%xmm14
+	vpor	%xmm5,%xmm1,%xmm1
+	vpor	%xmm7,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm2,%xmm2
+	vmovdqa	32-128(%rax),%xmm4
+
+	vpslld	$5,%xmm14,%xmm8
+	vpaddd	%xmm15,%xmm13,%xmm13
+	vpxor	%xmm10,%xmm12,%xmm6
+	vmovdqa	%xmm1,240-128(%rax)
+	vpaddd	%xmm1,%xmm13,%xmm13
+	vpxor	128-128(%rax),%xmm2,%xmm2
+	vpsrld	$27,%xmm14,%xmm9
+	vpxor	%xmm11,%xmm6,%xmm6
+	vpxor	%xmm4,%xmm2,%xmm2
+
+	vpslld	$30,%xmm10,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm13,%xmm13
+	vpsrld	$31,%xmm2,%xmm5
+	vpaddd	%xmm2,%xmm2,%xmm2
+
+	vpsrld	$2,%xmm10,%xmm10
+	vpaddd	%xmm8,%xmm13,%xmm13
+	vpor	%xmm5,%xmm2,%xmm2
+	vpor	%xmm7,%xmm10,%xmm10
+	vpxor	%xmm0,%xmm3,%xmm3
+	vmovdqa	48-128(%rax),%xmm0
+
+	vpslld	$5,%xmm13,%xmm8
+	vpaddd	%xmm15,%xmm12,%xmm12
+	vpxor	%xmm14,%xmm11,%xmm6
+	vmovdqa	%xmm2,0-128(%rax)
+	vpaddd	%xmm2,%xmm12,%xmm12
+	vpxor	144-128(%rax),%xmm3,%xmm3
+	vpsrld	$27,%xmm13,%xmm9
+	vpxor	%xmm10,%xmm6,%xmm6
+	vpxor	%xmm0,%xmm3,%xmm3
+
+	vpslld	$30,%xmm14,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm12,%xmm12
+	vpsrld	$31,%xmm3,%xmm5
+	vpaddd	%xmm3,%xmm3,%xmm3
+
+	vpsrld	$2,%xmm14,%xmm14
+	vpaddd	%xmm8,%xmm12,%xmm12
+	vpor	%xmm5,%xmm3,%xmm3
+	vpor	%xmm7,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vmovdqa	64-128(%rax),%xmm1
+
+	vpslld	$5,%xmm12,%xmm8
+	vpaddd	%xmm15,%xmm11,%xmm11
+	vpxor	%xmm13,%xmm10,%xmm6
+	vmovdqa	%xmm3,16-128(%rax)
+	vpaddd	%xmm3,%xmm11,%xmm11
+	vpxor	160-128(%rax),%xmm4,%xmm4
+	vpsrld	$27,%xmm12,%xmm9
+	vpxor	%xmm14,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm4,%xmm4
+
+	vpslld	$30,%xmm13,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm11,%xmm11
+	vpsrld	$31,%xmm4,%xmm5
+	vpaddd	%xmm4,%xmm4,%xmm4
+
+	vpsrld	$2,%xmm13,%xmm13
+	vpaddd	%xmm8,%xmm11,%xmm11
+	vpor	%xmm5,%xmm4,%xmm4
+	vpor	%xmm7,%xmm13,%xmm13
+	vpxor	%xmm2,%xmm0,%xmm0
+	vmovdqa	80-128(%rax),%xmm2
+
+	vpslld	$5,%xmm11,%xmm8
+	vpaddd	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm12,%xmm14,%xmm6
+	vmovdqa	%xmm4,32-128(%rax)
+	vpaddd	%xmm4,%xmm10,%xmm10
+	vpxor	176-128(%rax),%xmm0,%xmm0
+	vpsrld	$27,%xmm11,%xmm9
+	vpxor	%xmm13,%xmm6,%xmm6
+	vpxor	%xmm2,%xmm0,%xmm0
+
+	vpslld	$30,%xmm12,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm10,%xmm10
+	vpsrld	$31,%xmm0,%xmm5
+	vpaddd	%xmm0,%xmm0,%xmm0
+
+	vpsrld	$2,%xmm12,%xmm12
+	vpaddd	%xmm8,%xmm10,%xmm10
+	vpor	%xmm5,%xmm0,%xmm0
+	vpor	%xmm7,%xmm12,%xmm12
+	vpxor	%xmm3,%xmm1,%xmm1
+	vmovdqa	96-128(%rax),%xmm3
+
+	vpslld	$5,%xmm10,%xmm8
+	vpaddd	%xmm15,%xmm14,%xmm14
+	vpxor	%xmm11,%xmm13,%xmm6
+	vmovdqa	%xmm0,48-128(%rax)
+	vpaddd	%xmm0,%xmm14,%xmm14
+	vpxor	192-128(%rax),%xmm1,%xmm1
+	vpsrld	$27,%xmm10,%xmm9
+	vpxor	%xmm12,%xmm6,%xmm6
+	vpxor	%xmm3,%xmm1,%xmm1
+
+	vpslld	$30,%xmm11,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm14,%xmm14
+	vpsrld	$31,%xmm1,%xmm5
+	vpaddd	%xmm1,%xmm1,%xmm1
+
+	vpsrld	$2,%xmm11,%xmm11
+	vpaddd	%xmm8,%xmm14,%xmm14
+	vpor	%xmm5,%xmm1,%xmm1
+	vpor	%xmm7,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm2,%xmm2
+	vmovdqa	112-128(%rax),%xmm4
+
+	vpslld	$5,%xmm14,%xmm8
+	vpaddd	%xmm15,%xmm13,%xmm13
+	vpxor	%xmm10,%xmm12,%xmm6
+	vmovdqa	%xmm1,64-128(%rax)
+	vpaddd	%xmm1,%xmm13,%xmm13
+	vpxor	208-128(%rax),%xmm2,%xmm2
+	vpsrld	$27,%xmm14,%xmm9
+	vpxor	%xmm11,%xmm6,%xmm6
+	vpxor	%xmm4,%xmm2,%xmm2
+
+	vpslld	$30,%xmm10,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm13,%xmm13
+	vpsrld	$31,%xmm2,%xmm5
+	vpaddd	%xmm2,%xmm2,%xmm2
+
+	vpsrld	$2,%xmm10,%xmm10
+	vpaddd	%xmm8,%xmm13,%xmm13
+	vpor	%xmm5,%xmm2,%xmm2
+	vpor	%xmm7,%xmm10,%xmm10
+	vpxor	%xmm0,%xmm3,%xmm3
+	vmovdqa	128-128(%rax),%xmm0
+
+	vpslld	$5,%xmm13,%xmm8
+	vpaddd	%xmm15,%xmm12,%xmm12
+	vpxor	%xmm14,%xmm11,%xmm6
+	vmovdqa	%xmm2,80-128(%rax)
+	vpaddd	%xmm2,%xmm12,%xmm12
+	vpxor	224-128(%rax),%xmm3,%xmm3
+	vpsrld	$27,%xmm13,%xmm9
+	vpxor	%xmm10,%xmm6,%xmm6
+	vpxor	%xmm0,%xmm3,%xmm3
+
+	vpslld	$30,%xmm14,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm12,%xmm12
+	vpsrld	$31,%xmm3,%xmm5
+	vpaddd	%xmm3,%xmm3,%xmm3
+
+	vpsrld	$2,%xmm14,%xmm14
+	vpaddd	%xmm8,%xmm12,%xmm12
+	vpor	%xmm5,%xmm3,%xmm3
+	vpor	%xmm7,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vmovdqa	144-128(%rax),%xmm1
+
+	vpslld	$5,%xmm12,%xmm8
+	vpaddd	%xmm15,%xmm11,%xmm11
+	vpxor	%xmm13,%xmm10,%xmm6
+	vmovdqa	%xmm3,96-128(%rax)
+	vpaddd	%xmm3,%xmm11,%xmm11
+	vpxor	240-128(%rax),%xmm4,%xmm4
+	vpsrld	$27,%xmm12,%xmm9
+	vpxor	%xmm14,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm4,%xmm4
+
+	vpslld	$30,%xmm13,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm11,%xmm11
+	vpsrld	$31,%xmm4,%xmm5
+	vpaddd	%xmm4,%xmm4,%xmm4
+
+	vpsrld	$2,%xmm13,%xmm13
+	vpaddd	%xmm8,%xmm11,%xmm11
+	vpor	%xmm5,%xmm4,%xmm4
+	vpor	%xmm7,%xmm13,%xmm13
+	vpxor	%xmm2,%xmm0,%xmm0
+	vmovdqa	160-128(%rax),%xmm2
+
+	vpslld	$5,%xmm11,%xmm8
+	vpaddd	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm12,%xmm14,%xmm6
+	vmovdqa	%xmm4,112-128(%rax)
+	vpaddd	%xmm4,%xmm10,%xmm10
+	vpxor	0-128(%rax),%xmm0,%xmm0
+	vpsrld	$27,%xmm11,%xmm9
+	vpxor	%xmm13,%xmm6,%xmm6
+	vpxor	%xmm2,%xmm0,%xmm0
+
+	vpslld	$30,%xmm12,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm10,%xmm10
+	vpsrld	$31,%xmm0,%xmm5
+	vpaddd	%xmm0,%xmm0,%xmm0
+
+	vpsrld	$2,%xmm12,%xmm12
+	vpaddd	%xmm8,%xmm10,%xmm10
+	vpor	%xmm5,%xmm0,%xmm0
+	vpor	%xmm7,%xmm12,%xmm12
+	vmovdqa	32(%rbp),%xmm15
+	vpxor	%xmm3,%xmm1,%xmm1
+	vmovdqa	176-128(%rax),%xmm3
+
+	vpaddd	%xmm15,%xmm14,%xmm14
+	vpslld	$5,%xmm10,%xmm8
+	vpand	%xmm12,%xmm13,%xmm7
+	vpxor	16-128(%rax),%xmm1,%xmm1
+
+	vpaddd	%xmm7,%xmm14,%xmm14
+	vpsrld	$27,%xmm10,%xmm9
+	vpxor	%xmm12,%xmm13,%xmm6
+	vpxor	%xmm3,%xmm1,%xmm1
+
+	vmovdqu	%xmm0,128-128(%rax)
+	vpaddd	%xmm0,%xmm14,%xmm14
+	vpor	%xmm9,%xmm8,%xmm8
+	vpsrld	$31,%xmm1,%xmm5
+	vpand	%xmm11,%xmm6,%xmm6
+	vpaddd	%xmm1,%xmm1,%xmm1
+
+	vpslld	$30,%xmm11,%xmm7
+	vpaddd	%xmm6,%xmm14,%xmm14
+
+	vpsrld	$2,%xmm11,%xmm11
+	vpaddd	%xmm8,%xmm14,%xmm14
+	vpor	%xmm5,%xmm1,%xmm1
+	vpor	%xmm7,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm2,%xmm2
+	vmovdqa	192-128(%rax),%xmm4
+
+	vpaddd	%xmm15,%xmm13,%xmm13
+	vpslld	$5,%xmm14,%xmm8
+	vpand	%xmm11,%xmm12,%xmm7
+	vpxor	32-128(%rax),%xmm2,%xmm2
+
+	vpaddd	%xmm7,%xmm13,%xmm13
+	vpsrld	$27,%xmm14,%xmm9
+	vpxor	%xmm11,%xmm12,%xmm6
+	vpxor	%xmm4,%xmm2,%xmm2
+
+	vmovdqu	%xmm1,144-128(%rax)
+	vpaddd	%xmm1,%xmm13,%xmm13
+	vpor	%xmm9,%xmm8,%xmm8
+	vpsrld	$31,%xmm2,%xmm5
+	vpand	%xmm10,%xmm6,%xmm6
+	vpaddd	%xmm2,%xmm2,%xmm2
+
+	vpslld	$30,%xmm10,%xmm7
+	vpaddd	%xmm6,%xmm13,%xmm13
+
+	vpsrld	$2,%xmm10,%xmm10
+	vpaddd	%xmm8,%xmm13,%xmm13
+	vpor	%xmm5,%xmm2,%xmm2
+	vpor	%xmm7,%xmm10,%xmm10
+	vpxor	%xmm0,%xmm3,%xmm3
+	vmovdqa	208-128(%rax),%xmm0
+
+	vpaddd	%xmm15,%xmm12,%xmm12
+	vpslld	$5,%xmm13,%xmm8
+	vpand	%xmm10,%xmm11,%xmm7
+	vpxor	48-128(%rax),%xmm3,%xmm3
+
+	vpaddd	%xmm7,%xmm12,%xmm12
+	vpsrld	$27,%xmm13,%xmm9
+	vpxor	%xmm10,%xmm11,%xmm6
+	vpxor	%xmm0,%xmm3,%xmm3
+
+	vmovdqu	%xmm2,160-128(%rax)
+	vpaddd	%xmm2,%xmm12,%xmm12
+	vpor	%xmm9,%xmm8,%xmm8
+	vpsrld	$31,%xmm3,%xmm5
+	vpand	%xmm14,%xmm6,%xmm6
+	vpaddd	%xmm3,%xmm3,%xmm3
+
+	vpslld	$30,%xmm14,%xmm7
+	vpaddd	%xmm6,%xmm12,%xmm12
+
+	vpsrld	$2,%xmm14,%xmm14
+	vpaddd	%xmm8,%xmm12,%xmm12
+	vpor	%xmm5,%xmm3,%xmm3
+	vpor	%xmm7,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vmovdqa	224-128(%rax),%xmm1
+
+	vpaddd	%xmm15,%xmm11,%xmm11
+	vpslld	$5,%xmm12,%xmm8
+	vpand	%xmm14,%xmm10,%xmm7
+	vpxor	64-128(%rax),%xmm4,%xmm4
+
+	vpaddd	%xmm7,%xmm11,%xmm11
+	vpsrld	$27,%xmm12,%xmm9
+	vpxor	%xmm14,%xmm10,%xmm6
+	vpxor	%xmm1,%xmm4,%xmm4
+
+	vmovdqu	%xmm3,176-128(%rax)
+	vpaddd	%xmm3,%xmm11,%xmm11
+	vpor	%xmm9,%xmm8,%xmm8
+	vpsrld	$31,%xmm4,%xmm5
+	vpand	%xmm13,%xmm6,%xmm6
+	vpaddd	%xmm4,%xmm4,%xmm4
+
+	vpslld	$30,%xmm13,%xmm7
+	vpaddd	%xmm6,%xmm11,%xmm11
+
+	vpsrld	$2,%xmm13,%xmm13
+	vpaddd	%xmm8,%xmm11,%xmm11
+	vpor	%xmm5,%xmm4,%xmm4
+	vpor	%xmm7,%xmm13,%xmm13
+	vpxor	%xmm2,%xmm0,%xmm0
+	vmovdqa	240-128(%rax),%xmm2
+
+	vpaddd	%xmm15,%xmm10,%xmm10
+	vpslld	$5,%xmm11,%xmm8
+	vpand	%xmm13,%xmm14,%xmm7
+	vpxor	80-128(%rax),%xmm0,%xmm0
+
+	vpaddd	%xmm7,%xmm10,%xmm10
+	vpsrld	$27,%xmm11,%xmm9
+	vpxor	%xmm13,%xmm14,%xmm6
+	vpxor	%xmm2,%xmm0,%xmm0
+
+	vmovdqu	%xmm4,192-128(%rax)
+	vpaddd	%xmm4,%xmm10,%xmm10
+	vpor	%xmm9,%xmm8,%xmm8
+	vpsrld	$31,%xmm0,%xmm5
+	vpand	%xmm12,%xmm6,%xmm6
+	vpaddd	%xmm0,%xmm0,%xmm0
+
+	vpslld	$30,%xmm12,%xmm7
+	vpaddd	%xmm6,%xmm10,%xmm10
+
+	vpsrld	$2,%xmm12,%xmm12
+	vpaddd	%xmm8,%xmm10,%xmm10
+	vpor	%xmm5,%xmm0,%xmm0
+	vpor	%xmm7,%xmm12,%xmm12
+	vpxor	%xmm3,%xmm1,%xmm1
+	vmovdqa	0-128(%rax),%xmm3
+
+	vpaddd	%xmm15,%xmm14,%xmm14
+	vpslld	$5,%xmm10,%xmm8
+	vpand	%xmm12,%xmm13,%xmm7
+	vpxor	96-128(%rax),%xmm1,%xmm1
+
+	vpaddd	%xmm7,%xmm14,%xmm14
+	vpsrld	$27,%xmm10,%xmm9
+	vpxor	%xmm12,%xmm13,%xmm6
+	vpxor	%xmm3,%xmm1,%xmm1
+
+	vmovdqu	%xmm0,208-128(%rax)
+	vpaddd	%xmm0,%xmm14,%xmm14
+	vpor	%xmm9,%xmm8,%xmm8
+	vpsrld	$31,%xmm1,%xmm5
+	vpand	%xmm11,%xmm6,%xmm6
+	vpaddd	%xmm1,%xmm1,%xmm1
+
+	vpslld	$30,%xmm11,%xmm7
+	vpaddd	%xmm6,%xmm14,%xmm14
+
+	vpsrld	$2,%xmm11,%xmm11
+	vpaddd	%xmm8,%xmm14,%xmm14
+	vpor	%xmm5,%xmm1,%xmm1
+	vpor	%xmm7,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm2,%xmm2
+	vmovdqa	16-128(%rax),%xmm4
+
+	vpaddd	%xmm15,%xmm13,%xmm13
+	vpslld	$5,%xmm14,%xmm8
+	vpand	%xmm11,%xmm12,%xmm7
+	vpxor	112-128(%rax),%xmm2,%xmm2
+
+	vpaddd	%xmm7,%xmm13,%xmm13
+	vpsrld	$27,%xmm14,%xmm9
+	vpxor	%xmm11,%xmm12,%xmm6
+	vpxor	%xmm4,%xmm2,%xmm2
+
+	vmovdqu	%xmm1,224-128(%rax)
+	vpaddd	%xmm1,%xmm13,%xmm13
+	vpor	%xmm9,%xmm8,%xmm8
+	vpsrld	$31,%xmm2,%xmm5
+	vpand	%xmm10,%xmm6,%xmm6
+	vpaddd	%xmm2,%xmm2,%xmm2
+
+	vpslld	$30,%xmm10,%xmm7
+	vpaddd	%xmm6,%xmm13,%xmm13
+
+	vpsrld	$2,%xmm10,%xmm10
+	vpaddd	%xmm8,%xmm13,%xmm13
+	vpor	%xmm5,%xmm2,%xmm2
+	vpor	%xmm7,%xmm10,%xmm10
+	vpxor	%xmm0,%xmm3,%xmm3
+	vmovdqa	32-128(%rax),%xmm0
+
+	vpaddd	%xmm15,%xmm12,%xmm12
+	vpslld	$5,%xmm13,%xmm8
+	vpand	%xmm10,%xmm11,%xmm7
+	vpxor	128-128(%rax),%xmm3,%xmm3
+
+	vpaddd	%xmm7,%xmm12,%xmm12
+	vpsrld	$27,%xmm13,%xmm9
+	vpxor	%xmm10,%xmm11,%xmm6
+	vpxor	%xmm0,%xmm3,%xmm3
+
+	vmovdqu	%xmm2,240-128(%rax)
+	vpaddd	%xmm2,%xmm12,%xmm12
+	vpor	%xmm9,%xmm8,%xmm8
+	vpsrld	$31,%xmm3,%xmm5
+	vpand	%xmm14,%xmm6,%xmm6
+	vpaddd	%xmm3,%xmm3,%xmm3
+
+	vpslld	$30,%xmm14,%xmm7
+	vpaddd	%xmm6,%xmm12,%xmm12
+
+	vpsrld	$2,%xmm14,%xmm14
+	vpaddd	%xmm8,%xmm12,%xmm12
+	vpor	%xmm5,%xmm3,%xmm3
+	vpor	%xmm7,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vmovdqa	48-128(%rax),%xmm1
+
+	vpaddd	%xmm15,%xmm11,%xmm11
+	vpslld	$5,%xmm12,%xmm8
+	vpand	%xmm14,%xmm10,%xmm7
+	vpxor	144-128(%rax),%xmm4,%xmm4
+
+	vpaddd	%xmm7,%xmm11,%xmm11
+	vpsrld	$27,%xmm12,%xmm9
+	vpxor	%xmm14,%xmm10,%xmm6
+	vpxor	%xmm1,%xmm4,%xmm4
+
+	vmovdqu	%xmm3,0-128(%rax)
+	vpaddd	%xmm3,%xmm11,%xmm11
+	vpor	%xmm9,%xmm8,%xmm8
+	vpsrld	$31,%xmm4,%xmm5
+	vpand	%xmm13,%xmm6,%xmm6
+	vpaddd	%xmm4,%xmm4,%xmm4
+
+	vpslld	$30,%xmm13,%xmm7
+	vpaddd	%xmm6,%xmm11,%xmm11
+
+	vpsrld	$2,%xmm13,%xmm13
+	vpaddd	%xmm8,%xmm11,%xmm11
+	vpor	%xmm5,%xmm4,%xmm4
+	vpor	%xmm7,%xmm13,%xmm13
+	vpxor	%xmm2,%xmm0,%xmm0
+	vmovdqa	64-128(%rax),%xmm2
+
+	vpaddd	%xmm15,%xmm10,%xmm10
+	vpslld	$5,%xmm11,%xmm8
+	vpand	%xmm13,%xmm14,%xmm7
+	vpxor	160-128(%rax),%xmm0,%xmm0
+
+	vpaddd	%xmm7,%xmm10,%xmm10
+	vpsrld	$27,%xmm11,%xmm9
+	vpxor	%xmm13,%xmm14,%xmm6
+	vpxor	%xmm2,%xmm0,%xmm0
+
+	vmovdqu	%xmm4,16-128(%rax)
+	vpaddd	%xmm4,%xmm10,%xmm10
+	vpor	%xmm9,%xmm8,%xmm8
+	vpsrld	$31,%xmm0,%xmm5
+	vpand	%xmm12,%xmm6,%xmm6
+	vpaddd	%xmm0,%xmm0,%xmm0
+
+	vpslld	$30,%xmm12,%xmm7
+	vpaddd	%xmm6,%xmm10,%xmm10
+
+	vpsrld	$2,%xmm12,%xmm12
+	vpaddd	%xmm8,%xmm10,%xmm10
+	vpor	%xmm5,%xmm0,%xmm0
+	vpor	%xmm7,%xmm12,%xmm12
+	vpxor	%xmm3,%xmm1,%xmm1
+	vmovdqa	80-128(%rax),%xmm3
+
+	vpaddd	%xmm15,%xmm14,%xmm14
+	vpslld	$5,%xmm10,%xmm8
+	vpand	%xmm12,%xmm13,%xmm7
+	vpxor	176-128(%rax),%xmm1,%xmm1
+
+	vpaddd	%xmm7,%xmm14,%xmm14
+	vpsrld	$27,%xmm10,%xmm9
+	vpxor	%xmm12,%xmm13,%xmm6
+	vpxor	%xmm3,%xmm1,%xmm1
+
+	vmovdqu	%xmm0,32-128(%rax)
+	vpaddd	%xmm0,%xmm14,%xmm14
+	vpor	%xmm9,%xmm8,%xmm8
+	vpsrld	$31,%xmm1,%xmm5
+	vpand	%xmm11,%xmm6,%xmm6
+	vpaddd	%xmm1,%xmm1,%xmm1
+
+	vpslld	$30,%xmm11,%xmm7
+	vpaddd	%xmm6,%xmm14,%xmm14
+
+	vpsrld	$2,%xmm11,%xmm11
+	vpaddd	%xmm8,%xmm14,%xmm14
+	vpor	%xmm5,%xmm1,%xmm1
+	vpor	%xmm7,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm2,%xmm2
+	vmovdqa	96-128(%rax),%xmm4
+
+	vpaddd	%xmm15,%xmm13,%xmm13
+	vpslld	$5,%xmm14,%xmm8
+	vpand	%xmm11,%xmm12,%xmm7
+	vpxor	192-128(%rax),%xmm2,%xmm2
+
+	vpaddd	%xmm7,%xmm13,%xmm13
+	vpsrld	$27,%xmm14,%xmm9
+	vpxor	%xmm11,%xmm12,%xmm6
+	vpxor	%xmm4,%xmm2,%xmm2
+
+	vmovdqu	%xmm1,48-128(%rax)
+	vpaddd	%xmm1,%xmm13,%xmm13
+	vpor	%xmm9,%xmm8,%xmm8
+	vpsrld	$31,%xmm2,%xmm5
+	vpand	%xmm10,%xmm6,%xmm6
+	vpaddd	%xmm2,%xmm2,%xmm2
+
+	vpslld	$30,%xmm10,%xmm7
+	vpaddd	%xmm6,%xmm13,%xmm13
+
+	vpsrld	$2,%xmm10,%xmm10
+	vpaddd	%xmm8,%xmm13,%xmm13
+	vpor	%xmm5,%xmm2,%xmm2
+	vpor	%xmm7,%xmm10,%xmm10
+	vpxor	%xmm0,%xmm3,%xmm3
+	vmovdqa	112-128(%rax),%xmm0
+
+	vpaddd	%xmm15,%xmm12,%xmm12
+	vpslld	$5,%xmm13,%xmm8
+	vpand	%xmm10,%xmm11,%xmm7
+	vpxor	208-128(%rax),%xmm3,%xmm3
+
+	vpaddd	%xmm7,%xmm12,%xmm12
+	vpsrld	$27,%xmm13,%xmm9
+	vpxor	%xmm10,%xmm11,%xmm6
+	vpxor	%xmm0,%xmm3,%xmm3
+
+	vmovdqu	%xmm2,64-128(%rax)
+	vpaddd	%xmm2,%xmm12,%xmm12
+	vpor	%xmm9,%xmm8,%xmm8
+	vpsrld	$31,%xmm3,%xmm5
+	vpand	%xmm14,%xmm6,%xmm6
+	vpaddd	%xmm3,%xmm3,%xmm3
+
+	vpslld	$30,%xmm14,%xmm7
+	vpaddd	%xmm6,%xmm12,%xmm12
+
+	vpsrld	$2,%xmm14,%xmm14
+	vpaddd	%xmm8,%xmm12,%xmm12
+	vpor	%xmm5,%xmm3,%xmm3
+	vpor	%xmm7,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vmovdqa	128-128(%rax),%xmm1
+
+	vpaddd	%xmm15,%xmm11,%xmm11
+	vpslld	$5,%xmm12,%xmm8
+	vpand	%xmm14,%xmm10,%xmm7
+	vpxor	224-128(%rax),%xmm4,%xmm4
+
+	vpaddd	%xmm7,%xmm11,%xmm11
+	vpsrld	$27,%xmm12,%xmm9
+	vpxor	%xmm14,%xmm10,%xmm6
+	vpxor	%xmm1,%xmm4,%xmm4
+
+	vmovdqu	%xmm3,80-128(%rax)
+	vpaddd	%xmm3,%xmm11,%xmm11
+	vpor	%xmm9,%xmm8,%xmm8
+	vpsrld	$31,%xmm4,%xmm5
+	vpand	%xmm13,%xmm6,%xmm6
+	vpaddd	%xmm4,%xmm4,%xmm4
+
+	vpslld	$30,%xmm13,%xmm7
+	vpaddd	%xmm6,%xmm11,%xmm11
+
+	vpsrld	$2,%xmm13,%xmm13
+	vpaddd	%xmm8,%xmm11,%xmm11
+	vpor	%xmm5,%xmm4,%xmm4
+	vpor	%xmm7,%xmm13,%xmm13
+	vpxor	%xmm2,%xmm0,%xmm0
+	vmovdqa	144-128(%rax),%xmm2
+
+	vpaddd	%xmm15,%xmm10,%xmm10
+	vpslld	$5,%xmm11,%xmm8
+	vpand	%xmm13,%xmm14,%xmm7
+	vpxor	240-128(%rax),%xmm0,%xmm0
+
+	vpaddd	%xmm7,%xmm10,%xmm10
+	vpsrld	$27,%xmm11,%xmm9
+	vpxor	%xmm13,%xmm14,%xmm6
+	vpxor	%xmm2,%xmm0,%xmm0
+
+	vmovdqu	%xmm4,96-128(%rax)
+	vpaddd	%xmm4,%xmm10,%xmm10
+	vpor	%xmm9,%xmm8,%xmm8
+	vpsrld	$31,%xmm0,%xmm5
+	vpand	%xmm12,%xmm6,%xmm6
+	vpaddd	%xmm0,%xmm0,%xmm0
+
+	vpslld	$30,%xmm12,%xmm7
+	vpaddd	%xmm6,%xmm10,%xmm10
+
+	vpsrld	$2,%xmm12,%xmm12
+	vpaddd	%xmm8,%xmm10,%xmm10
+	vpor	%xmm5,%xmm0,%xmm0
+	vpor	%xmm7,%xmm12,%xmm12
+	vpxor	%xmm3,%xmm1,%xmm1
+	vmovdqa	160-128(%rax),%xmm3
+
+	vpaddd	%xmm15,%xmm14,%xmm14
+	vpslld	$5,%xmm10,%xmm8
+	vpand	%xmm12,%xmm13,%xmm7
+	vpxor	0-128(%rax),%xmm1,%xmm1
+
+	vpaddd	%xmm7,%xmm14,%xmm14
+	vpsrld	$27,%xmm10,%xmm9
+	vpxor	%xmm12,%xmm13,%xmm6
+	vpxor	%xmm3,%xmm1,%xmm1
+
+	vmovdqu	%xmm0,112-128(%rax)
+	vpaddd	%xmm0,%xmm14,%xmm14
+	vpor	%xmm9,%xmm8,%xmm8
+	vpsrld	$31,%xmm1,%xmm5
+	vpand	%xmm11,%xmm6,%xmm6
+	vpaddd	%xmm1,%xmm1,%xmm1
+
+	vpslld	$30,%xmm11,%xmm7
+	vpaddd	%xmm6,%xmm14,%xmm14
+
+	vpsrld	$2,%xmm11,%xmm11
+	vpaddd	%xmm8,%xmm14,%xmm14
+	vpor	%xmm5,%xmm1,%xmm1
+	vpor	%xmm7,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm2,%xmm2
+	vmovdqa	176-128(%rax),%xmm4
+
+	vpaddd	%xmm15,%xmm13,%xmm13
+	vpslld	$5,%xmm14,%xmm8
+	vpand	%xmm11,%xmm12,%xmm7
+	vpxor	16-128(%rax),%xmm2,%xmm2
+
+	vpaddd	%xmm7,%xmm13,%xmm13
+	vpsrld	$27,%xmm14,%xmm9
+	vpxor	%xmm11,%xmm12,%xmm6
+	vpxor	%xmm4,%xmm2,%xmm2
+
+	vmovdqu	%xmm1,128-128(%rax)
+	vpaddd	%xmm1,%xmm13,%xmm13
+	vpor	%xmm9,%xmm8,%xmm8
+	vpsrld	$31,%xmm2,%xmm5
+	vpand	%xmm10,%xmm6,%xmm6
+	vpaddd	%xmm2,%xmm2,%xmm2
+
+	vpslld	$30,%xmm10,%xmm7
+	vpaddd	%xmm6,%xmm13,%xmm13
+
+	vpsrld	$2,%xmm10,%xmm10
+	vpaddd	%xmm8,%xmm13,%xmm13
+	vpor	%xmm5,%xmm2,%xmm2
+	vpor	%xmm7,%xmm10,%xmm10
+	vpxor	%xmm0,%xmm3,%xmm3
+	vmovdqa	192-128(%rax),%xmm0
+
+	vpaddd	%xmm15,%xmm12,%xmm12
+	vpslld	$5,%xmm13,%xmm8
+	vpand	%xmm10,%xmm11,%xmm7
+	vpxor	32-128(%rax),%xmm3,%xmm3
+
+	vpaddd	%xmm7,%xmm12,%xmm12
+	vpsrld	$27,%xmm13,%xmm9
+	vpxor	%xmm10,%xmm11,%xmm6
+	vpxor	%xmm0,%xmm3,%xmm3
+
+	vmovdqu	%xmm2,144-128(%rax)
+	vpaddd	%xmm2,%xmm12,%xmm12
+	vpor	%xmm9,%xmm8,%xmm8
+	vpsrld	$31,%xmm3,%xmm5
+	vpand	%xmm14,%xmm6,%xmm6
+	vpaddd	%xmm3,%xmm3,%xmm3
+
+	vpslld	$30,%xmm14,%xmm7
+	vpaddd	%xmm6,%xmm12,%xmm12
+
+	vpsrld	$2,%xmm14,%xmm14
+	vpaddd	%xmm8,%xmm12,%xmm12
+	vpor	%xmm5,%xmm3,%xmm3
+	vpor	%xmm7,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vmovdqa	208-128(%rax),%xmm1
+
+	vpaddd	%xmm15,%xmm11,%xmm11
+	vpslld	$5,%xmm12,%xmm8
+	vpand	%xmm14,%xmm10,%xmm7
+	vpxor	48-128(%rax),%xmm4,%xmm4
+
+	vpaddd	%xmm7,%xmm11,%xmm11
+	vpsrld	$27,%xmm12,%xmm9
+	vpxor	%xmm14,%xmm10,%xmm6
+	vpxor	%xmm1,%xmm4,%xmm4
+
+	vmovdqu	%xmm3,160-128(%rax)
+	vpaddd	%xmm3,%xmm11,%xmm11
+	vpor	%xmm9,%xmm8,%xmm8
+	vpsrld	$31,%xmm4,%xmm5
+	vpand	%xmm13,%xmm6,%xmm6
+	vpaddd	%xmm4,%xmm4,%xmm4
+
+	vpslld	$30,%xmm13,%xmm7
+	vpaddd	%xmm6,%xmm11,%xmm11
+
+	vpsrld	$2,%xmm13,%xmm13
+	vpaddd	%xmm8,%xmm11,%xmm11
+	vpor	%xmm5,%xmm4,%xmm4
+	vpor	%xmm7,%xmm13,%xmm13
+	vpxor	%xmm2,%xmm0,%xmm0
+	vmovdqa	224-128(%rax),%xmm2
+
+	vpaddd	%xmm15,%xmm10,%xmm10
+	vpslld	$5,%xmm11,%xmm8
+	vpand	%xmm13,%xmm14,%xmm7
+	vpxor	64-128(%rax),%xmm0,%xmm0
+
+	vpaddd	%xmm7,%xmm10,%xmm10
+	vpsrld	$27,%xmm11,%xmm9
+	vpxor	%xmm13,%xmm14,%xmm6
+	vpxor	%xmm2,%xmm0,%xmm0
+
+	vmovdqu	%xmm4,176-128(%rax)
+	vpaddd	%xmm4,%xmm10,%xmm10
+	vpor	%xmm9,%xmm8,%xmm8
+	vpsrld	$31,%xmm0,%xmm5
+	vpand	%xmm12,%xmm6,%xmm6
+	vpaddd	%xmm0,%xmm0,%xmm0
+
+	vpslld	$30,%xmm12,%xmm7
+	vpaddd	%xmm6,%xmm10,%xmm10
+
+	vpsrld	$2,%xmm12,%xmm12
+	vpaddd	%xmm8,%xmm10,%xmm10
+	vpor	%xmm5,%xmm0,%xmm0
+	vpor	%xmm7,%xmm12,%xmm12
+	vmovdqa	64(%rbp),%xmm15
+	vpxor	%xmm3,%xmm1,%xmm1
+	vmovdqa	240-128(%rax),%xmm3
+
+	vpslld	$5,%xmm10,%xmm8
+	vpaddd	%xmm15,%xmm14,%xmm14
+	vpxor	%xmm11,%xmm13,%xmm6
+	vmovdqa	%xmm0,192-128(%rax)
+	vpaddd	%xmm0,%xmm14,%xmm14
+	vpxor	80-128(%rax),%xmm1,%xmm1
+	vpsrld	$27,%xmm10,%xmm9
+	vpxor	%xmm12,%xmm6,%xmm6
+	vpxor	%xmm3,%xmm1,%xmm1
+
+	vpslld	$30,%xmm11,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm14,%xmm14
+	vpsrld	$31,%xmm1,%xmm5
+	vpaddd	%xmm1,%xmm1,%xmm1
+
+	vpsrld	$2,%xmm11,%xmm11
+	vpaddd	%xmm8,%xmm14,%xmm14
+	vpor	%xmm5,%xmm1,%xmm1
+	vpor	%xmm7,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm2,%xmm2
+	vmovdqa	0-128(%rax),%xmm4
+
+	vpslld	$5,%xmm14,%xmm8
+	vpaddd	%xmm15,%xmm13,%xmm13
+	vpxor	%xmm10,%xmm12,%xmm6
+	vmovdqa	%xmm1,208-128(%rax)
+	vpaddd	%xmm1,%xmm13,%xmm13
+	vpxor	96-128(%rax),%xmm2,%xmm2
+	vpsrld	$27,%xmm14,%xmm9
+	vpxor	%xmm11,%xmm6,%xmm6
+	vpxor	%xmm4,%xmm2,%xmm2
+
+	vpslld	$30,%xmm10,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm13,%xmm13
+	vpsrld	$31,%xmm2,%xmm5
+	vpaddd	%xmm2,%xmm2,%xmm2
+
+	vpsrld	$2,%xmm10,%xmm10
+	vpaddd	%xmm8,%xmm13,%xmm13
+	vpor	%xmm5,%xmm2,%xmm2
+	vpor	%xmm7,%xmm10,%xmm10
+	vpxor	%xmm0,%xmm3,%xmm3
+	vmovdqa	16-128(%rax),%xmm0
+
+	vpslld	$5,%xmm13,%xmm8
+	vpaddd	%xmm15,%xmm12,%xmm12
+	vpxor	%xmm14,%xmm11,%xmm6
+	vmovdqa	%xmm2,224-128(%rax)
+	vpaddd	%xmm2,%xmm12,%xmm12
+	vpxor	112-128(%rax),%xmm3,%xmm3
+	vpsrld	$27,%xmm13,%xmm9
+	vpxor	%xmm10,%xmm6,%xmm6
+	vpxor	%xmm0,%xmm3,%xmm3
+
+	vpslld	$30,%xmm14,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm12,%xmm12
+	vpsrld	$31,%xmm3,%xmm5
+	vpaddd	%xmm3,%xmm3,%xmm3
+
+	vpsrld	$2,%xmm14,%xmm14
+	vpaddd	%xmm8,%xmm12,%xmm12
+	vpor	%xmm5,%xmm3,%xmm3
+	vpor	%xmm7,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vmovdqa	32-128(%rax),%xmm1
+
+	vpslld	$5,%xmm12,%xmm8
+	vpaddd	%xmm15,%xmm11,%xmm11
+	vpxor	%xmm13,%xmm10,%xmm6
+	vmovdqa	%xmm3,240-128(%rax)
+	vpaddd	%xmm3,%xmm11,%xmm11
+	vpxor	128-128(%rax),%xmm4,%xmm4
+	vpsrld	$27,%xmm12,%xmm9
+	vpxor	%xmm14,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm4,%xmm4
+
+	vpslld	$30,%xmm13,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm11,%xmm11
+	vpsrld	$31,%xmm4,%xmm5
+	vpaddd	%xmm4,%xmm4,%xmm4
+
+	vpsrld	$2,%xmm13,%xmm13
+	vpaddd	%xmm8,%xmm11,%xmm11
+	vpor	%xmm5,%xmm4,%xmm4
+	vpor	%xmm7,%xmm13,%xmm13
+	vpxor	%xmm2,%xmm0,%xmm0
+	vmovdqa	48-128(%rax),%xmm2
+
+	vpslld	$5,%xmm11,%xmm8
+	vpaddd	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm12,%xmm14,%xmm6
+	vmovdqa	%xmm4,0-128(%rax)
+	vpaddd	%xmm4,%xmm10,%xmm10
+	vpxor	144-128(%rax),%xmm0,%xmm0
+	vpsrld	$27,%xmm11,%xmm9
+	vpxor	%xmm13,%xmm6,%xmm6
+	vpxor	%xmm2,%xmm0,%xmm0
+
+	vpslld	$30,%xmm12,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm10,%xmm10
+	vpsrld	$31,%xmm0,%xmm5
+	vpaddd	%xmm0,%xmm0,%xmm0
+
+	vpsrld	$2,%xmm12,%xmm12
+	vpaddd	%xmm8,%xmm10,%xmm10
+	vpor	%xmm5,%xmm0,%xmm0
+	vpor	%xmm7,%xmm12,%xmm12
+	vpxor	%xmm3,%xmm1,%xmm1
+	vmovdqa	64-128(%rax),%xmm3
+
+	vpslld	$5,%xmm10,%xmm8
+	vpaddd	%xmm15,%xmm14,%xmm14
+	vpxor	%xmm11,%xmm13,%xmm6
+	vmovdqa	%xmm0,16-128(%rax)
+	vpaddd	%xmm0,%xmm14,%xmm14
+	vpxor	160-128(%rax),%xmm1,%xmm1
+	vpsrld	$27,%xmm10,%xmm9
+	vpxor	%xmm12,%xmm6,%xmm6
+	vpxor	%xmm3,%xmm1,%xmm1
+
+	vpslld	$30,%xmm11,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm14,%xmm14
+	vpsrld	$31,%xmm1,%xmm5
+	vpaddd	%xmm1,%xmm1,%xmm1
+
+	vpsrld	$2,%xmm11,%xmm11
+	vpaddd	%xmm8,%xmm14,%xmm14
+	vpor	%xmm5,%xmm1,%xmm1
+	vpor	%xmm7,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm2,%xmm2
+	vmovdqa	80-128(%rax),%xmm4
+
+	vpslld	$5,%xmm14,%xmm8
+	vpaddd	%xmm15,%xmm13,%xmm13
+	vpxor	%xmm10,%xmm12,%xmm6
+	vmovdqa	%xmm1,32-128(%rax)
+	vpaddd	%xmm1,%xmm13,%xmm13
+	vpxor	176-128(%rax),%xmm2,%xmm2
+	vpsrld	$27,%xmm14,%xmm9
+	vpxor	%xmm11,%xmm6,%xmm6
+	vpxor	%xmm4,%xmm2,%xmm2
+
+	vpslld	$30,%xmm10,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm13,%xmm13
+	vpsrld	$31,%xmm2,%xmm5
+	vpaddd	%xmm2,%xmm2,%xmm2
+
+	vpsrld	$2,%xmm10,%xmm10
+	vpaddd	%xmm8,%xmm13,%xmm13
+	vpor	%xmm5,%xmm2,%xmm2
+	vpor	%xmm7,%xmm10,%xmm10
+	vpxor	%xmm0,%xmm3,%xmm3
+	vmovdqa	96-128(%rax),%xmm0
+
+	vpslld	$5,%xmm13,%xmm8
+	vpaddd	%xmm15,%xmm12,%xmm12
+	vpxor	%xmm14,%xmm11,%xmm6
+	vmovdqa	%xmm2,48-128(%rax)
+	vpaddd	%xmm2,%xmm12,%xmm12
+	vpxor	192-128(%rax),%xmm3,%xmm3
+	vpsrld	$27,%xmm13,%xmm9
+	vpxor	%xmm10,%xmm6,%xmm6
+	vpxor	%xmm0,%xmm3,%xmm3
+
+	vpslld	$30,%xmm14,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm12,%xmm12
+	vpsrld	$31,%xmm3,%xmm5
+	vpaddd	%xmm3,%xmm3,%xmm3
+
+	vpsrld	$2,%xmm14,%xmm14
+	vpaddd	%xmm8,%xmm12,%xmm12
+	vpor	%xmm5,%xmm3,%xmm3
+	vpor	%xmm7,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vmovdqa	112-128(%rax),%xmm1
+
+	vpslld	$5,%xmm12,%xmm8
+	vpaddd	%xmm15,%xmm11,%xmm11
+	vpxor	%xmm13,%xmm10,%xmm6
+	vmovdqa	%xmm3,64-128(%rax)
+	vpaddd	%xmm3,%xmm11,%xmm11
+	vpxor	208-128(%rax),%xmm4,%xmm4
+	vpsrld	$27,%xmm12,%xmm9
+	vpxor	%xmm14,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm4,%xmm4
+
+	vpslld	$30,%xmm13,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm11,%xmm11
+	vpsrld	$31,%xmm4,%xmm5
+	vpaddd	%xmm4,%xmm4,%xmm4
+
+	vpsrld	$2,%xmm13,%xmm13
+	vpaddd	%xmm8,%xmm11,%xmm11
+	vpor	%xmm5,%xmm4,%xmm4
+	vpor	%xmm7,%xmm13,%xmm13
+	vpxor	%xmm2,%xmm0,%xmm0
+	vmovdqa	128-128(%rax),%xmm2
+
+	vpslld	$5,%xmm11,%xmm8
+	vpaddd	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm12,%xmm14,%xmm6
+	vmovdqa	%xmm4,80-128(%rax)
+	vpaddd	%xmm4,%xmm10,%xmm10
+	vpxor	224-128(%rax),%xmm0,%xmm0
+	vpsrld	$27,%xmm11,%xmm9
+	vpxor	%xmm13,%xmm6,%xmm6
+	vpxor	%xmm2,%xmm0,%xmm0
+
+	vpslld	$30,%xmm12,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm10,%xmm10
+	vpsrld	$31,%xmm0,%xmm5
+	vpaddd	%xmm0,%xmm0,%xmm0
+
+	vpsrld	$2,%xmm12,%xmm12
+	vpaddd	%xmm8,%xmm10,%xmm10
+	vpor	%xmm5,%xmm0,%xmm0
+	vpor	%xmm7,%xmm12,%xmm12
+	vpxor	%xmm3,%xmm1,%xmm1
+	vmovdqa	144-128(%rax),%xmm3
+
+	vpslld	$5,%xmm10,%xmm8
+	vpaddd	%xmm15,%xmm14,%xmm14
+	vpxor	%xmm11,%xmm13,%xmm6
+	vmovdqa	%xmm0,96-128(%rax)
+	vpaddd	%xmm0,%xmm14,%xmm14
+	vpxor	240-128(%rax),%xmm1,%xmm1
+	vpsrld	$27,%xmm10,%xmm9
+	vpxor	%xmm12,%xmm6,%xmm6
+	vpxor	%xmm3,%xmm1,%xmm1
+
+	vpslld	$30,%xmm11,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm14,%xmm14
+	vpsrld	$31,%xmm1,%xmm5
+	vpaddd	%xmm1,%xmm1,%xmm1
+
+	vpsrld	$2,%xmm11,%xmm11
+	vpaddd	%xmm8,%xmm14,%xmm14
+	vpor	%xmm5,%xmm1,%xmm1
+	vpor	%xmm7,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm2,%xmm2
+	vmovdqa	160-128(%rax),%xmm4
+
+	vpslld	$5,%xmm14,%xmm8
+	vpaddd	%xmm15,%xmm13,%xmm13
+	vpxor	%xmm10,%xmm12,%xmm6
+	vmovdqa	%xmm1,112-128(%rax)
+	vpaddd	%xmm1,%xmm13,%xmm13
+	vpxor	0-128(%rax),%xmm2,%xmm2
+	vpsrld	$27,%xmm14,%xmm9
+	vpxor	%xmm11,%xmm6,%xmm6
+	vpxor	%xmm4,%xmm2,%xmm2
+
+	vpslld	$30,%xmm10,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm13,%xmm13
+	vpsrld	$31,%xmm2,%xmm5
+	vpaddd	%xmm2,%xmm2,%xmm2
+
+	vpsrld	$2,%xmm10,%xmm10
+	vpaddd	%xmm8,%xmm13,%xmm13
+	vpor	%xmm5,%xmm2,%xmm2
+	vpor	%xmm7,%xmm10,%xmm10
+	vpxor	%xmm0,%xmm3,%xmm3
+	vmovdqa	176-128(%rax),%xmm0
+
+	vpslld	$5,%xmm13,%xmm8
+	vpaddd	%xmm15,%xmm12,%xmm12
+	vpxor	%xmm14,%xmm11,%xmm6
+	vpaddd	%xmm2,%xmm12,%xmm12
+	vpxor	16-128(%rax),%xmm3,%xmm3
+	vpsrld	$27,%xmm13,%xmm9
+	vpxor	%xmm10,%xmm6,%xmm6
+	vpxor	%xmm0,%xmm3,%xmm3
+
+	vpslld	$30,%xmm14,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm12,%xmm12
+	vpsrld	$31,%xmm3,%xmm5
+	vpaddd	%xmm3,%xmm3,%xmm3
+
+	vpsrld	$2,%xmm14,%xmm14
+	vpaddd	%xmm8,%xmm12,%xmm12
+	vpor	%xmm5,%xmm3,%xmm3
+	vpor	%xmm7,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vmovdqa	192-128(%rax),%xmm1
+
+	vpslld	$5,%xmm12,%xmm8
+	vpaddd	%xmm15,%xmm11,%xmm11
+	vpxor	%xmm13,%xmm10,%xmm6
+	vpaddd	%xmm3,%xmm11,%xmm11
+	vpxor	32-128(%rax),%xmm4,%xmm4
+	vpsrld	$27,%xmm12,%xmm9
+	vpxor	%xmm14,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm4,%xmm4
+
+	vpslld	$30,%xmm13,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm11,%xmm11
+	vpsrld	$31,%xmm4,%xmm5
+	vpaddd	%xmm4,%xmm4,%xmm4
+
+	vpsrld	$2,%xmm13,%xmm13
+	vpaddd	%xmm8,%xmm11,%xmm11
+	vpor	%xmm5,%xmm4,%xmm4
+	vpor	%xmm7,%xmm13,%xmm13
+	vpxor	%xmm2,%xmm0,%xmm0
+	vmovdqa	208-128(%rax),%xmm2
+
+	vpslld	$5,%xmm11,%xmm8
+	vpaddd	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm12,%xmm14,%xmm6
+	vpaddd	%xmm4,%xmm10,%xmm10
+	vpxor	48-128(%rax),%xmm0,%xmm0
+	vpsrld	$27,%xmm11,%xmm9
+	vpxor	%xmm13,%xmm6,%xmm6
+	vpxor	%xmm2,%xmm0,%xmm0
+
+	vpslld	$30,%xmm12,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm10,%xmm10
+	vpsrld	$31,%xmm0,%xmm5
+	vpaddd	%xmm0,%xmm0,%xmm0
+
+	vpsrld	$2,%xmm12,%xmm12
+	vpaddd	%xmm8,%xmm10,%xmm10
+	vpor	%xmm5,%xmm0,%xmm0
+	vpor	%xmm7,%xmm12,%xmm12
+	vpxor	%xmm3,%xmm1,%xmm1
+	vmovdqa	224-128(%rax),%xmm3
+
+	vpslld	$5,%xmm10,%xmm8
+	vpaddd	%xmm15,%xmm14,%xmm14
+	vpxor	%xmm11,%xmm13,%xmm6
+	vpaddd	%xmm0,%xmm14,%xmm14
+	vpxor	64-128(%rax),%xmm1,%xmm1
+	vpsrld	$27,%xmm10,%xmm9
+	vpxor	%xmm12,%xmm6,%xmm6
+	vpxor	%xmm3,%xmm1,%xmm1
+
+	vpslld	$30,%xmm11,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm14,%xmm14
+	vpsrld	$31,%xmm1,%xmm5
+	vpaddd	%xmm1,%xmm1,%xmm1
+
+	vpsrld	$2,%xmm11,%xmm11
+	vpaddd	%xmm8,%xmm14,%xmm14
+	vpor	%xmm5,%xmm1,%xmm1
+	vpor	%xmm7,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm2,%xmm2
+	vmovdqa	240-128(%rax),%xmm4
+
+	vpslld	$5,%xmm14,%xmm8
+	vpaddd	%xmm15,%xmm13,%xmm13
+	vpxor	%xmm10,%xmm12,%xmm6
+	vpaddd	%xmm1,%xmm13,%xmm13
+	vpxor	80-128(%rax),%xmm2,%xmm2
+	vpsrld	$27,%xmm14,%xmm9
+	vpxor	%xmm11,%xmm6,%xmm6
+	vpxor	%xmm4,%xmm2,%xmm2
+
+	vpslld	$30,%xmm10,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm13,%xmm13
+	vpsrld	$31,%xmm2,%xmm5
+	vpaddd	%xmm2,%xmm2,%xmm2
+
+	vpsrld	$2,%xmm10,%xmm10
+	vpaddd	%xmm8,%xmm13,%xmm13
+	vpor	%xmm5,%xmm2,%xmm2
+	vpor	%xmm7,%xmm10,%xmm10
+	vpxor	%xmm0,%xmm3,%xmm3
+	vmovdqa	0-128(%rax),%xmm0
+
+	vpslld	$5,%xmm13,%xmm8
+	vpaddd	%xmm15,%xmm12,%xmm12
+	vpxor	%xmm14,%xmm11,%xmm6
+	vpaddd	%xmm2,%xmm12,%xmm12
+	vpxor	96-128(%rax),%xmm3,%xmm3
+	vpsrld	$27,%xmm13,%xmm9
+	vpxor	%xmm10,%xmm6,%xmm6
+	vpxor	%xmm0,%xmm3,%xmm3
+
+	vpslld	$30,%xmm14,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm12,%xmm12
+	vpsrld	$31,%xmm3,%xmm5
+	vpaddd	%xmm3,%xmm3,%xmm3
+
+	vpsrld	$2,%xmm14,%xmm14
+	vpaddd	%xmm8,%xmm12,%xmm12
+	vpor	%xmm5,%xmm3,%xmm3
+	vpor	%xmm7,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vmovdqa	16-128(%rax),%xmm1
+
+	vpslld	$5,%xmm12,%xmm8
+	vpaddd	%xmm15,%xmm11,%xmm11
+	vpxor	%xmm13,%xmm10,%xmm6
+	vpaddd	%xmm3,%xmm11,%xmm11
+	vpxor	112-128(%rax),%xmm4,%xmm4
+	vpsrld	$27,%xmm12,%xmm9
+	vpxor	%xmm14,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm4,%xmm4
+
+	vpslld	$30,%xmm13,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm11,%xmm11
+	vpsrld	$31,%xmm4,%xmm5
+	vpaddd	%xmm4,%xmm4,%xmm4
+
+	vpsrld	$2,%xmm13,%xmm13
+	vpaddd	%xmm8,%xmm11,%xmm11
+	vpor	%xmm5,%xmm4,%xmm4
+	vpor	%xmm7,%xmm13,%xmm13
+	vpslld	$5,%xmm11,%xmm8
+	vpaddd	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm12,%xmm14,%xmm6
+
+	vpsrld	$27,%xmm11,%xmm9
+	vpaddd	%xmm4,%xmm10,%xmm10
+	vpxor	%xmm13,%xmm6,%xmm6
+
+	vpslld	$30,%xmm12,%xmm7
+	vpor	%xmm9,%xmm8,%xmm8
+	vpaddd	%xmm6,%xmm10,%xmm10
+
+	vpsrld	$2,%xmm12,%xmm12
+	vpaddd	%xmm8,%xmm10,%xmm10
+	vpor	%xmm7,%xmm12,%xmm12
+	movl	$1,%ecx
+	cmpl	0(%rbx),%ecx
+	cmovgeq	%rbp,%r8
+	cmpl	4(%rbx),%ecx
+	cmovgeq	%rbp,%r9
+	cmpl	8(%rbx),%ecx
+	cmovgeq	%rbp,%r10
+	cmpl	12(%rbx),%ecx
+	cmovgeq	%rbp,%r11
+	vmovdqu	(%rbx),%xmm6
+	vpxor	%xmm8,%xmm8,%xmm8
+	vmovdqa	%xmm6,%xmm7
+	vpcmpgtd	%xmm8,%xmm7,%xmm7
+	vpaddd	%xmm7,%xmm6,%xmm6
+
+	vpand	%xmm7,%xmm10,%xmm10
+	vpand	%xmm7,%xmm11,%xmm11
+	vpaddd	0(%rdi),%xmm10,%xmm10
+	vpand	%xmm7,%xmm12,%xmm12
+	vpaddd	32(%rdi),%xmm11,%xmm11
+	vpand	%xmm7,%xmm13,%xmm13
+	vpaddd	64(%rdi),%xmm12,%xmm12
+	vpand	%xmm7,%xmm14,%xmm14
+	vpaddd	96(%rdi),%xmm13,%xmm13
+	vpaddd	128(%rdi),%xmm14,%xmm14
+	vmovdqu	%xmm10,0(%rdi)
+	vmovdqu	%xmm11,32(%rdi)
+	vmovdqu	%xmm12,64(%rdi)
+	vmovdqu	%xmm13,96(%rdi)
+	vmovdqu	%xmm14,128(%rdi)
+
+	vmovdqu	%xmm6,(%rbx)
+	vmovdqu	96(%rbp),%xmm5
+	decl	%edx
+	jnz	.Loop_avx
+
+	movl	280(%rsp),%edx
+	leaq	16(%rdi),%rdi
+	leaq	64(%rsi),%rsi
+	decl	%edx
+	jnz	.Loop_grande_avx
+
+.Ldone_avx:
+	movq	272(%rsp),%rax
+	vzeroupper
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+.Lepilogue_avx:
+	.byte	0xf3,0xc3
+.size	sha1_multi_block_avx,.-sha1_multi_block_avx
+.type	sha1_multi_block_avx2,@function
+.align	32
+sha1_multi_block_avx2:
+_avx2_shortcut:
+	movq	%rsp,%rax
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	subq	$576,%rsp
+	andq	$-256,%rsp
+	movq	%rax,544(%rsp)
+.Lbody_avx2:
+	leaq	K_XX_XX(%rip),%rbp
+	shrl	$1,%edx
+
+	vzeroupper
+.Loop_grande_avx2:
+	movl	%edx,552(%rsp)
+	xorl	%edx,%edx
+	leaq	512(%rsp),%rbx
+	movq	0(%rsi),%r12
+	movl	8(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,0(%rbx)
+	cmovleq	%rbp,%r12
+	movq	16(%rsi),%r13
+	movl	24(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,4(%rbx)
+	cmovleq	%rbp,%r13
+	movq	32(%rsi),%r14
+	movl	40(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,8(%rbx)
+	cmovleq	%rbp,%r14
+	movq	48(%rsi),%r15
+	movl	56(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,12(%rbx)
+	cmovleq	%rbp,%r15
+	movq	64(%rsi),%r8
+	movl	72(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,16(%rbx)
+	cmovleq	%rbp,%r8
+	movq	80(%rsi),%r9
+	movl	88(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,20(%rbx)
+	cmovleq	%rbp,%r9
+	movq	96(%rsi),%r10
+	movl	104(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,24(%rbx)
+	cmovleq	%rbp,%r10
+	movq	112(%rsi),%r11
+	movl	120(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,28(%rbx)
+	cmovleq	%rbp,%r11
+	vmovdqu	0(%rdi),%ymm0
+	leaq	128(%rsp),%rax
+	vmovdqu	32(%rdi),%ymm1
+	leaq	256+128(%rsp),%rbx
+	vmovdqu	64(%rdi),%ymm2
+	vmovdqu	96(%rdi),%ymm3
+	vmovdqu	128(%rdi),%ymm4
+	vmovdqu	96(%rbp),%ymm9
+	jmp	.Loop_avx2
+
+.align	32
+.Loop_avx2:
+	vmovdqa	-32(%rbp),%ymm15
+	vmovd	(%r12),%xmm10
+	leaq	64(%r12),%r12
+	vmovd	(%r8),%xmm12
+	leaq	64(%r8),%r8
+	vmovd	(%r13),%xmm7
+	leaq	64(%r13),%r13
+	vmovd	(%r9),%xmm6
+	leaq	64(%r9),%r9
+	vpinsrd	$1,(%r14),%xmm10,%xmm10
+	leaq	64(%r14),%r14
+	vpinsrd	$1,(%r10),%xmm12,%xmm12
+	leaq	64(%r10),%r10
+	vpinsrd	$1,(%r15),%xmm7,%xmm7
+	leaq	64(%r15),%r15
+	vpunpckldq	%ymm7,%ymm10,%ymm10
+	vpinsrd	$1,(%r11),%xmm6,%xmm6
+	leaq	64(%r11),%r11
+	vpunpckldq	%ymm6,%ymm12,%ymm12
+	vmovd	-60(%r12),%xmm11
+	vinserti128	$1,%xmm12,%ymm10,%ymm10
+	vmovd	-60(%r8),%xmm8
+	vpshufb	%ymm9,%ymm10,%ymm10
+	vmovd	-60(%r13),%xmm7
+	vmovd	-60(%r9),%xmm6
+	vpinsrd	$1,-60(%r14),%xmm11,%xmm11
+	vpinsrd	$1,-60(%r10),%xmm8,%xmm8
+	vpinsrd	$1,-60(%r15),%xmm7,%xmm7
+	vpunpckldq	%ymm7,%ymm11,%ymm11
+	vpinsrd	$1,-60(%r11),%xmm6,%xmm6
+	vpunpckldq	%ymm6,%ymm8,%ymm8
+	vpaddd	%ymm15,%ymm4,%ymm4
+	vpslld	$5,%ymm0,%ymm7
+	vpandn	%ymm3,%ymm1,%ymm6
+	vpand	%ymm2,%ymm1,%ymm5
+
+	vmovdqa	%ymm10,0-128(%rax)
+	vpaddd	%ymm10,%ymm4,%ymm4
+	vinserti128	$1,%xmm8,%ymm11,%ymm11
+	vpsrld	$27,%ymm0,%ymm8
+	vpxor	%ymm6,%ymm5,%ymm5
+	vmovd	-56(%r12),%xmm12
+
+	vpslld	$30,%ymm1,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vmovd	-56(%r8),%xmm8
+	vpaddd	%ymm5,%ymm4,%ymm4
+
+	vpsrld	$2,%ymm1,%ymm1
+	vpaddd	%ymm7,%ymm4,%ymm4
+	vpshufb	%ymm9,%ymm11,%ymm11
+	vpor	%ymm6,%ymm1,%ymm1
+	vmovd	-56(%r13),%xmm7
+	vmovd	-56(%r9),%xmm6
+	vpinsrd	$1,-56(%r14),%xmm12,%xmm12
+	vpinsrd	$1,-56(%r10),%xmm8,%xmm8
+	vpinsrd	$1,-56(%r15),%xmm7,%xmm7
+	vpunpckldq	%ymm7,%ymm12,%ymm12
+	vpinsrd	$1,-56(%r11),%xmm6,%xmm6
+	vpunpckldq	%ymm6,%ymm8,%ymm8
+	vpaddd	%ymm15,%ymm3,%ymm3
+	vpslld	$5,%ymm4,%ymm7
+	vpandn	%ymm2,%ymm0,%ymm6
+	vpand	%ymm1,%ymm0,%ymm5
+
+	vmovdqa	%ymm11,32-128(%rax)
+	vpaddd	%ymm11,%ymm3,%ymm3
+	vinserti128	$1,%xmm8,%ymm12,%ymm12
+	vpsrld	$27,%ymm4,%ymm8
+	vpxor	%ymm6,%ymm5,%ymm5
+	vmovd	-52(%r12),%xmm13
+
+	vpslld	$30,%ymm0,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vmovd	-52(%r8),%xmm8
+	vpaddd	%ymm5,%ymm3,%ymm3
+
+	vpsrld	$2,%ymm0,%ymm0
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpshufb	%ymm9,%ymm12,%ymm12
+	vpor	%ymm6,%ymm0,%ymm0
+	vmovd	-52(%r13),%xmm7
+	vmovd	-52(%r9),%xmm6
+	vpinsrd	$1,-52(%r14),%xmm13,%xmm13
+	vpinsrd	$1,-52(%r10),%xmm8,%xmm8
+	vpinsrd	$1,-52(%r15),%xmm7,%xmm7
+	vpunpckldq	%ymm7,%ymm13,%ymm13
+	vpinsrd	$1,-52(%r11),%xmm6,%xmm6
+	vpunpckldq	%ymm6,%ymm8,%ymm8
+	vpaddd	%ymm15,%ymm2,%ymm2
+	vpslld	$5,%ymm3,%ymm7
+	vpandn	%ymm1,%ymm4,%ymm6
+	vpand	%ymm0,%ymm4,%ymm5
+
+	vmovdqa	%ymm12,64-128(%rax)
+	vpaddd	%ymm12,%ymm2,%ymm2
+	vinserti128	$1,%xmm8,%ymm13,%ymm13
+	vpsrld	$27,%ymm3,%ymm8
+	vpxor	%ymm6,%ymm5,%ymm5
+	vmovd	-48(%r12),%xmm14
+
+	vpslld	$30,%ymm4,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vmovd	-48(%r8),%xmm8
+	vpaddd	%ymm5,%ymm2,%ymm2
+
+	vpsrld	$2,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm2,%ymm2
+	vpshufb	%ymm9,%ymm13,%ymm13
+	vpor	%ymm6,%ymm4,%ymm4
+	vmovd	-48(%r13),%xmm7
+	vmovd	-48(%r9),%xmm6
+	vpinsrd	$1,-48(%r14),%xmm14,%xmm14
+	vpinsrd	$1,-48(%r10),%xmm8,%xmm8
+	vpinsrd	$1,-48(%r15),%xmm7,%xmm7
+	vpunpckldq	%ymm7,%ymm14,%ymm14
+	vpinsrd	$1,-48(%r11),%xmm6,%xmm6
+	vpunpckldq	%ymm6,%ymm8,%ymm8
+	vpaddd	%ymm15,%ymm1,%ymm1
+	vpslld	$5,%ymm2,%ymm7
+	vpandn	%ymm0,%ymm3,%ymm6
+	vpand	%ymm4,%ymm3,%ymm5
+
+	vmovdqa	%ymm13,96-128(%rax)
+	vpaddd	%ymm13,%ymm1,%ymm1
+	vinserti128	$1,%xmm8,%ymm14,%ymm14
+	vpsrld	$27,%ymm2,%ymm8
+	vpxor	%ymm6,%ymm5,%ymm5
+	vmovd	-44(%r12),%xmm10
+
+	vpslld	$30,%ymm3,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vmovd	-44(%r8),%xmm8
+	vpaddd	%ymm5,%ymm1,%ymm1
+
+	vpsrld	$2,%ymm3,%ymm3
+	vpaddd	%ymm7,%ymm1,%ymm1
+	vpshufb	%ymm9,%ymm14,%ymm14
+	vpor	%ymm6,%ymm3,%ymm3
+	vmovd	-44(%r13),%xmm7
+	vmovd	-44(%r9),%xmm6
+	vpinsrd	$1,-44(%r14),%xmm10,%xmm10
+	vpinsrd	$1,-44(%r10),%xmm8,%xmm8
+	vpinsrd	$1,-44(%r15),%xmm7,%xmm7
+	vpunpckldq	%ymm7,%ymm10,%ymm10
+	vpinsrd	$1,-44(%r11),%xmm6,%xmm6
+	vpunpckldq	%ymm6,%ymm8,%ymm8
+	vpaddd	%ymm15,%ymm0,%ymm0
+	vpslld	$5,%ymm1,%ymm7
+	vpandn	%ymm4,%ymm2,%ymm6
+	vpand	%ymm3,%ymm2,%ymm5
+
+	vmovdqa	%ymm14,128-128(%rax)
+	vpaddd	%ymm14,%ymm0,%ymm0
+	vinserti128	$1,%xmm8,%ymm10,%ymm10
+	vpsrld	$27,%ymm1,%ymm8
+	vpxor	%ymm6,%ymm5,%ymm5
+	vmovd	-40(%r12),%xmm11
+
+	vpslld	$30,%ymm2,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vmovd	-40(%r8),%xmm8
+	vpaddd	%ymm5,%ymm0,%ymm0
+
+	vpsrld	$2,%ymm2,%ymm2
+	vpaddd	%ymm7,%ymm0,%ymm0
+	vpshufb	%ymm9,%ymm10,%ymm10
+	vpor	%ymm6,%ymm2,%ymm2
+	vmovd	-40(%r13),%xmm7
+	vmovd	-40(%r9),%xmm6
+	vpinsrd	$1,-40(%r14),%xmm11,%xmm11
+	vpinsrd	$1,-40(%r10),%xmm8,%xmm8
+	vpinsrd	$1,-40(%r15),%xmm7,%xmm7
+	vpunpckldq	%ymm7,%ymm11,%ymm11
+	vpinsrd	$1,-40(%r11),%xmm6,%xmm6
+	vpunpckldq	%ymm6,%ymm8,%ymm8
+	vpaddd	%ymm15,%ymm4,%ymm4
+	vpslld	$5,%ymm0,%ymm7
+	vpandn	%ymm3,%ymm1,%ymm6
+	vpand	%ymm2,%ymm1,%ymm5
+
+	vmovdqa	%ymm10,160-128(%rax)
+	vpaddd	%ymm10,%ymm4,%ymm4
+	vinserti128	$1,%xmm8,%ymm11,%ymm11
+	vpsrld	$27,%ymm0,%ymm8
+	vpxor	%ymm6,%ymm5,%ymm5
+	vmovd	-36(%r12),%xmm12
+
+	vpslld	$30,%ymm1,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vmovd	-36(%r8),%xmm8
+	vpaddd	%ymm5,%ymm4,%ymm4
+
+	vpsrld	$2,%ymm1,%ymm1
+	vpaddd	%ymm7,%ymm4,%ymm4
+	vpshufb	%ymm9,%ymm11,%ymm11
+	vpor	%ymm6,%ymm1,%ymm1
+	vmovd	-36(%r13),%xmm7
+	vmovd	-36(%r9),%xmm6
+	vpinsrd	$1,-36(%r14),%xmm12,%xmm12
+	vpinsrd	$1,-36(%r10),%xmm8,%xmm8
+	vpinsrd	$1,-36(%r15),%xmm7,%xmm7
+	vpunpckldq	%ymm7,%ymm12,%ymm12
+	vpinsrd	$1,-36(%r11),%xmm6,%xmm6
+	vpunpckldq	%ymm6,%ymm8,%ymm8
+	vpaddd	%ymm15,%ymm3,%ymm3
+	vpslld	$5,%ymm4,%ymm7
+	vpandn	%ymm2,%ymm0,%ymm6
+	vpand	%ymm1,%ymm0,%ymm5
+
+	vmovdqa	%ymm11,192-128(%rax)
+	vpaddd	%ymm11,%ymm3,%ymm3
+	vinserti128	$1,%xmm8,%ymm12,%ymm12
+	vpsrld	$27,%ymm4,%ymm8
+	vpxor	%ymm6,%ymm5,%ymm5
+	vmovd	-32(%r12),%xmm13
+
+	vpslld	$30,%ymm0,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vmovd	-32(%r8),%xmm8
+	vpaddd	%ymm5,%ymm3,%ymm3
+
+	vpsrld	$2,%ymm0,%ymm0
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpshufb	%ymm9,%ymm12,%ymm12
+	vpor	%ymm6,%ymm0,%ymm0
+	vmovd	-32(%r13),%xmm7
+	vmovd	-32(%r9),%xmm6
+	vpinsrd	$1,-32(%r14),%xmm13,%xmm13
+	vpinsrd	$1,-32(%r10),%xmm8,%xmm8
+	vpinsrd	$1,-32(%r15),%xmm7,%xmm7
+	vpunpckldq	%ymm7,%ymm13,%ymm13
+	vpinsrd	$1,-32(%r11),%xmm6,%xmm6
+	vpunpckldq	%ymm6,%ymm8,%ymm8
+	vpaddd	%ymm15,%ymm2,%ymm2
+	vpslld	$5,%ymm3,%ymm7
+	vpandn	%ymm1,%ymm4,%ymm6
+	vpand	%ymm0,%ymm4,%ymm5
+
+	vmovdqa	%ymm12,224-128(%rax)
+	vpaddd	%ymm12,%ymm2,%ymm2
+	vinserti128	$1,%xmm8,%ymm13,%ymm13
+	vpsrld	$27,%ymm3,%ymm8
+	vpxor	%ymm6,%ymm5,%ymm5
+	vmovd	-28(%r12),%xmm14
+
+	vpslld	$30,%ymm4,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vmovd	-28(%r8),%xmm8
+	vpaddd	%ymm5,%ymm2,%ymm2
+
+	vpsrld	$2,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm2,%ymm2
+	vpshufb	%ymm9,%ymm13,%ymm13
+	vpor	%ymm6,%ymm4,%ymm4
+	vmovd	-28(%r13),%xmm7
+	vmovd	-28(%r9),%xmm6
+	vpinsrd	$1,-28(%r14),%xmm14,%xmm14
+	vpinsrd	$1,-28(%r10),%xmm8,%xmm8
+	vpinsrd	$1,-28(%r15),%xmm7,%xmm7
+	vpunpckldq	%ymm7,%ymm14,%ymm14
+	vpinsrd	$1,-28(%r11),%xmm6,%xmm6
+	vpunpckldq	%ymm6,%ymm8,%ymm8
+	vpaddd	%ymm15,%ymm1,%ymm1
+	vpslld	$5,%ymm2,%ymm7
+	vpandn	%ymm0,%ymm3,%ymm6
+	vpand	%ymm4,%ymm3,%ymm5
+
+	vmovdqa	%ymm13,256-256-128(%rbx)
+	vpaddd	%ymm13,%ymm1,%ymm1
+	vinserti128	$1,%xmm8,%ymm14,%ymm14
+	vpsrld	$27,%ymm2,%ymm8
+	vpxor	%ymm6,%ymm5,%ymm5
+	vmovd	-24(%r12),%xmm10
+
+	vpslld	$30,%ymm3,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vmovd	-24(%r8),%xmm8
+	vpaddd	%ymm5,%ymm1,%ymm1
+
+	vpsrld	$2,%ymm3,%ymm3
+	vpaddd	%ymm7,%ymm1,%ymm1
+	vpshufb	%ymm9,%ymm14,%ymm14
+	vpor	%ymm6,%ymm3,%ymm3
+	vmovd	-24(%r13),%xmm7
+	vmovd	-24(%r9),%xmm6
+	vpinsrd	$1,-24(%r14),%xmm10,%xmm10
+	vpinsrd	$1,-24(%r10),%xmm8,%xmm8
+	vpinsrd	$1,-24(%r15),%xmm7,%xmm7
+	vpunpckldq	%ymm7,%ymm10,%ymm10
+	vpinsrd	$1,-24(%r11),%xmm6,%xmm6
+	vpunpckldq	%ymm6,%ymm8,%ymm8
+	vpaddd	%ymm15,%ymm0,%ymm0
+	vpslld	$5,%ymm1,%ymm7
+	vpandn	%ymm4,%ymm2,%ymm6
+	vpand	%ymm3,%ymm2,%ymm5
+
+	vmovdqa	%ymm14,288-256-128(%rbx)
+	vpaddd	%ymm14,%ymm0,%ymm0
+	vinserti128	$1,%xmm8,%ymm10,%ymm10
+	vpsrld	$27,%ymm1,%ymm8
+	vpxor	%ymm6,%ymm5,%ymm5
+	vmovd	-20(%r12),%xmm11
+
+	vpslld	$30,%ymm2,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vmovd	-20(%r8),%xmm8
+	vpaddd	%ymm5,%ymm0,%ymm0
+
+	vpsrld	$2,%ymm2,%ymm2
+	vpaddd	%ymm7,%ymm0,%ymm0
+	vpshufb	%ymm9,%ymm10,%ymm10
+	vpor	%ymm6,%ymm2,%ymm2
+	vmovd	-20(%r13),%xmm7
+	vmovd	-20(%r9),%xmm6
+	vpinsrd	$1,-20(%r14),%xmm11,%xmm11
+	vpinsrd	$1,-20(%r10),%xmm8,%xmm8
+	vpinsrd	$1,-20(%r15),%xmm7,%xmm7
+	vpunpckldq	%ymm7,%ymm11,%ymm11
+	vpinsrd	$1,-20(%r11),%xmm6,%xmm6
+	vpunpckldq	%ymm6,%ymm8,%ymm8
+	vpaddd	%ymm15,%ymm4,%ymm4
+	vpslld	$5,%ymm0,%ymm7
+	vpandn	%ymm3,%ymm1,%ymm6
+	vpand	%ymm2,%ymm1,%ymm5
+
+	vmovdqa	%ymm10,320-256-128(%rbx)
+	vpaddd	%ymm10,%ymm4,%ymm4
+	vinserti128	$1,%xmm8,%ymm11,%ymm11
+	vpsrld	$27,%ymm0,%ymm8
+	vpxor	%ymm6,%ymm5,%ymm5
+	vmovd	-16(%r12),%xmm12
+
+	vpslld	$30,%ymm1,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vmovd	-16(%r8),%xmm8
+	vpaddd	%ymm5,%ymm4,%ymm4
+
+	vpsrld	$2,%ymm1,%ymm1
+	vpaddd	%ymm7,%ymm4,%ymm4
+	vpshufb	%ymm9,%ymm11,%ymm11
+	vpor	%ymm6,%ymm1,%ymm1
+	vmovd	-16(%r13),%xmm7
+	vmovd	-16(%r9),%xmm6
+	vpinsrd	$1,-16(%r14),%xmm12,%xmm12
+	vpinsrd	$1,-16(%r10),%xmm8,%xmm8
+	vpinsrd	$1,-16(%r15),%xmm7,%xmm7
+	vpunpckldq	%ymm7,%ymm12,%ymm12
+	vpinsrd	$1,-16(%r11),%xmm6,%xmm6
+	vpunpckldq	%ymm6,%ymm8,%ymm8
+	vpaddd	%ymm15,%ymm3,%ymm3
+	vpslld	$5,%ymm4,%ymm7
+	vpandn	%ymm2,%ymm0,%ymm6
+	vpand	%ymm1,%ymm0,%ymm5
+
+	vmovdqa	%ymm11,352-256-128(%rbx)
+	vpaddd	%ymm11,%ymm3,%ymm3
+	vinserti128	$1,%xmm8,%ymm12,%ymm12
+	vpsrld	$27,%ymm4,%ymm8
+	vpxor	%ymm6,%ymm5,%ymm5
+	vmovd	-12(%r12),%xmm13
+
+	vpslld	$30,%ymm0,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vmovd	-12(%r8),%xmm8
+	vpaddd	%ymm5,%ymm3,%ymm3
+
+	vpsrld	$2,%ymm0,%ymm0
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpshufb	%ymm9,%ymm12,%ymm12
+	vpor	%ymm6,%ymm0,%ymm0
+	vmovd	-12(%r13),%xmm7
+	vmovd	-12(%r9),%xmm6
+	vpinsrd	$1,-12(%r14),%xmm13,%xmm13
+	vpinsrd	$1,-12(%r10),%xmm8,%xmm8
+	vpinsrd	$1,-12(%r15),%xmm7,%xmm7
+	vpunpckldq	%ymm7,%ymm13,%ymm13
+	vpinsrd	$1,-12(%r11),%xmm6,%xmm6
+	vpunpckldq	%ymm6,%ymm8,%ymm8
+	vpaddd	%ymm15,%ymm2,%ymm2
+	vpslld	$5,%ymm3,%ymm7
+	vpandn	%ymm1,%ymm4,%ymm6
+	vpand	%ymm0,%ymm4,%ymm5
+
+	vmovdqa	%ymm12,384-256-128(%rbx)
+	vpaddd	%ymm12,%ymm2,%ymm2
+	vinserti128	$1,%xmm8,%ymm13,%ymm13
+	vpsrld	$27,%ymm3,%ymm8
+	vpxor	%ymm6,%ymm5,%ymm5
+	vmovd	-8(%r12),%xmm14
+
+	vpslld	$30,%ymm4,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vmovd	-8(%r8),%xmm8
+	vpaddd	%ymm5,%ymm2,%ymm2
+
+	vpsrld	$2,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm2,%ymm2
+	vpshufb	%ymm9,%ymm13,%ymm13
+	vpor	%ymm6,%ymm4,%ymm4
+	vmovd	-8(%r13),%xmm7
+	vmovd	-8(%r9),%xmm6
+	vpinsrd	$1,-8(%r14),%xmm14,%xmm14
+	vpinsrd	$1,-8(%r10),%xmm8,%xmm8
+	vpinsrd	$1,-8(%r15),%xmm7,%xmm7
+	vpunpckldq	%ymm7,%ymm14,%ymm14
+	vpinsrd	$1,-8(%r11),%xmm6,%xmm6
+	vpunpckldq	%ymm6,%ymm8,%ymm8
+	vpaddd	%ymm15,%ymm1,%ymm1
+	vpslld	$5,%ymm2,%ymm7
+	vpandn	%ymm0,%ymm3,%ymm6
+	vpand	%ymm4,%ymm3,%ymm5
+
+	vmovdqa	%ymm13,416-256-128(%rbx)
+	vpaddd	%ymm13,%ymm1,%ymm1
+	vinserti128	$1,%xmm8,%ymm14,%ymm14
+	vpsrld	$27,%ymm2,%ymm8
+	vpxor	%ymm6,%ymm5,%ymm5
+	vmovd	-4(%r12),%xmm10
+
+	vpslld	$30,%ymm3,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vmovd	-4(%r8),%xmm8
+	vpaddd	%ymm5,%ymm1,%ymm1
+
+	vpsrld	$2,%ymm3,%ymm3
+	vpaddd	%ymm7,%ymm1,%ymm1
+	vpshufb	%ymm9,%ymm14,%ymm14
+	vpor	%ymm6,%ymm3,%ymm3
+	vmovdqa	0-128(%rax),%ymm11
+	vmovd	-4(%r13),%xmm7
+	vmovd	-4(%r9),%xmm6
+	vpinsrd	$1,-4(%r14),%xmm10,%xmm10
+	vpinsrd	$1,-4(%r10),%xmm8,%xmm8
+	vpinsrd	$1,-4(%r15),%xmm7,%xmm7
+	vpunpckldq	%ymm7,%ymm10,%ymm10
+	vpinsrd	$1,-4(%r11),%xmm6,%xmm6
+	vpunpckldq	%ymm6,%ymm8,%ymm8
+	vpaddd	%ymm15,%ymm0,%ymm0
+	prefetcht0	63(%r12)
+	vpslld	$5,%ymm1,%ymm7
+	vpandn	%ymm4,%ymm2,%ymm6
+	vpand	%ymm3,%ymm2,%ymm5
+
+	vmovdqa	%ymm14,448-256-128(%rbx)
+	vpaddd	%ymm14,%ymm0,%ymm0
+	vinserti128	$1,%xmm8,%ymm10,%ymm10
+	vpsrld	$27,%ymm1,%ymm8
+	prefetcht0	63(%r13)
+	vpxor	%ymm6,%ymm5,%ymm5
+
+	vpslld	$30,%ymm2,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	prefetcht0	63(%r14)
+	vpaddd	%ymm5,%ymm0,%ymm0
+
+	vpsrld	$2,%ymm2,%ymm2
+	vpaddd	%ymm7,%ymm0,%ymm0
+	prefetcht0	63(%r15)
+	vpshufb	%ymm9,%ymm10,%ymm10
+	vpor	%ymm6,%ymm2,%ymm2
+	vmovdqa	32-128(%rax),%ymm12
+	vpxor	%ymm13,%ymm11,%ymm11
+	vmovdqa	64-128(%rax),%ymm13
+
+	vpaddd	%ymm15,%ymm4,%ymm4
+	vpslld	$5,%ymm0,%ymm7
+	vpandn	%ymm3,%ymm1,%ymm6
+	prefetcht0	63(%r8)
+	vpand	%ymm2,%ymm1,%ymm5
+
+	vmovdqa	%ymm10,480-256-128(%rbx)
+	vpaddd	%ymm10,%ymm4,%ymm4
+	vpxor	256-256-128(%rbx),%ymm11,%ymm11
+	vpsrld	$27,%ymm0,%ymm8
+	vpxor	%ymm6,%ymm5,%ymm5
+	vpxor	%ymm13,%ymm11,%ymm11
+	prefetcht0	63(%r9)
+
+	vpslld	$30,%ymm1,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm4,%ymm4
+	prefetcht0	63(%r10)
+	vpsrld	$31,%ymm11,%ymm9
+	vpaddd	%ymm11,%ymm11,%ymm11
+
+	vpsrld	$2,%ymm1,%ymm1
+	prefetcht0	63(%r11)
+	vpaddd	%ymm7,%ymm4,%ymm4
+	vpor	%ymm9,%ymm11,%ymm11
+	vpor	%ymm6,%ymm1,%ymm1
+	vpxor	%ymm14,%ymm12,%ymm12
+	vmovdqa	96-128(%rax),%ymm14
+
+	vpaddd	%ymm15,%ymm3,%ymm3
+	vpslld	$5,%ymm4,%ymm7
+	vpandn	%ymm2,%ymm0,%ymm6
+
+	vpand	%ymm1,%ymm0,%ymm5
+
+	vmovdqa	%ymm11,0-128(%rax)
+	vpaddd	%ymm11,%ymm3,%ymm3
+	vpxor	288-256-128(%rbx),%ymm12,%ymm12
+	vpsrld	$27,%ymm4,%ymm8
+	vpxor	%ymm6,%ymm5,%ymm5
+	vpxor	%ymm14,%ymm12,%ymm12
+
+
+	vpslld	$30,%ymm0,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm3,%ymm3
+
+	vpsrld	$31,%ymm12,%ymm9
+	vpaddd	%ymm12,%ymm12,%ymm12
+
+	vpsrld	$2,%ymm0,%ymm0
+
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpor	%ymm9,%ymm12,%ymm12
+	vpor	%ymm6,%ymm0,%ymm0
+	vpxor	%ymm10,%ymm13,%ymm13
+	vmovdqa	128-128(%rax),%ymm10
+
+	vpaddd	%ymm15,%ymm2,%ymm2
+	vpslld	$5,%ymm3,%ymm7
+	vpandn	%ymm1,%ymm4,%ymm6
+
+	vpand	%ymm0,%ymm4,%ymm5
+
+	vmovdqa	%ymm12,32-128(%rax)
+	vpaddd	%ymm12,%ymm2,%ymm2
+	vpxor	320-256-128(%rbx),%ymm13,%ymm13
+	vpsrld	$27,%ymm3,%ymm8
+	vpxor	%ymm6,%ymm5,%ymm5
+	vpxor	%ymm10,%ymm13,%ymm13
+
+
+	vpslld	$30,%ymm4,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm2,%ymm2
+
+	vpsrld	$31,%ymm13,%ymm9
+	vpaddd	%ymm13,%ymm13,%ymm13
+
+	vpsrld	$2,%ymm4,%ymm4
+
+	vpaddd	%ymm7,%ymm2,%ymm2
+	vpor	%ymm9,%ymm13,%ymm13
+	vpor	%ymm6,%ymm4,%ymm4
+	vpxor	%ymm11,%ymm14,%ymm14
+	vmovdqa	160-128(%rax),%ymm11
+
+	vpaddd	%ymm15,%ymm1,%ymm1
+	vpslld	$5,%ymm2,%ymm7
+	vpandn	%ymm0,%ymm3,%ymm6
+
+	vpand	%ymm4,%ymm3,%ymm5
+
+	vmovdqa	%ymm13,64-128(%rax)
+	vpaddd	%ymm13,%ymm1,%ymm1
+	vpxor	352-256-128(%rbx),%ymm14,%ymm14
+	vpsrld	$27,%ymm2,%ymm8
+	vpxor	%ymm6,%ymm5,%ymm5
+	vpxor	%ymm11,%ymm14,%ymm14
+
+
+	vpslld	$30,%ymm3,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm1,%ymm1
+
+	vpsrld	$31,%ymm14,%ymm9
+	vpaddd	%ymm14,%ymm14,%ymm14
+
+	vpsrld	$2,%ymm3,%ymm3
+
+	vpaddd	%ymm7,%ymm1,%ymm1
+	vpor	%ymm9,%ymm14,%ymm14
+	vpor	%ymm6,%ymm3,%ymm3
+	vpxor	%ymm12,%ymm10,%ymm10
+	vmovdqa	192-128(%rax),%ymm12
+
+	vpaddd	%ymm15,%ymm0,%ymm0
+	vpslld	$5,%ymm1,%ymm7
+	vpandn	%ymm4,%ymm2,%ymm6
+
+	vpand	%ymm3,%ymm2,%ymm5
+
+	vmovdqa	%ymm14,96-128(%rax)
+	vpaddd	%ymm14,%ymm0,%ymm0
+	vpxor	384-256-128(%rbx),%ymm10,%ymm10
+	vpsrld	$27,%ymm1,%ymm8
+	vpxor	%ymm6,%ymm5,%ymm5
+	vpxor	%ymm12,%ymm10,%ymm10
+
+
+	vpslld	$30,%ymm2,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm0,%ymm0
+
+	vpsrld	$31,%ymm10,%ymm9
+	vpaddd	%ymm10,%ymm10,%ymm10
+
+	vpsrld	$2,%ymm2,%ymm2
+
+	vpaddd	%ymm7,%ymm0,%ymm0
+	vpor	%ymm9,%ymm10,%ymm10
+	vpor	%ymm6,%ymm2,%ymm2
+	vmovdqa	0(%rbp),%ymm15
+	vpxor	%ymm13,%ymm11,%ymm11
+	vmovdqa	224-128(%rax),%ymm13
+
+	vpslld	$5,%ymm0,%ymm7
+	vpaddd	%ymm15,%ymm4,%ymm4
+	vpxor	%ymm1,%ymm3,%ymm5
+	vmovdqa	%ymm10,128-128(%rax)
+	vpaddd	%ymm10,%ymm4,%ymm4
+	vpxor	416-256-128(%rbx),%ymm11,%ymm11
+	vpsrld	$27,%ymm0,%ymm8
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpxor	%ymm13,%ymm11,%ymm11
+
+	vpslld	$30,%ymm1,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm4,%ymm4
+	vpsrld	$31,%ymm11,%ymm9
+	vpaddd	%ymm11,%ymm11,%ymm11
+
+	vpsrld	$2,%ymm1,%ymm1
+	vpaddd	%ymm7,%ymm4,%ymm4
+	vpor	%ymm9,%ymm11,%ymm11
+	vpor	%ymm6,%ymm1,%ymm1
+	vpxor	%ymm14,%ymm12,%ymm12
+	vmovdqa	256-256-128(%rbx),%ymm14
+
+	vpslld	$5,%ymm4,%ymm7
+	vpaddd	%ymm15,%ymm3,%ymm3
+	vpxor	%ymm0,%ymm2,%ymm5
+	vmovdqa	%ymm11,160-128(%rax)
+	vpaddd	%ymm11,%ymm3,%ymm3
+	vpxor	448-256-128(%rbx),%ymm12,%ymm12
+	vpsrld	$27,%ymm4,%ymm8
+	vpxor	%ymm1,%ymm5,%ymm5
+	vpxor	%ymm14,%ymm12,%ymm12
+
+	vpslld	$30,%ymm0,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm3,%ymm3
+	vpsrld	$31,%ymm12,%ymm9
+	vpaddd	%ymm12,%ymm12,%ymm12
+
+	vpsrld	$2,%ymm0,%ymm0
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpor	%ymm9,%ymm12,%ymm12
+	vpor	%ymm6,%ymm0,%ymm0
+	vpxor	%ymm10,%ymm13,%ymm13
+	vmovdqa	288-256-128(%rbx),%ymm10
+
+	vpslld	$5,%ymm3,%ymm7
+	vpaddd	%ymm15,%ymm2,%ymm2
+	vpxor	%ymm4,%ymm1,%ymm5
+	vmovdqa	%ymm12,192-128(%rax)
+	vpaddd	%ymm12,%ymm2,%ymm2
+	vpxor	480-256-128(%rbx),%ymm13,%ymm13
+	vpsrld	$27,%ymm3,%ymm8
+	vpxor	%ymm0,%ymm5,%ymm5
+	vpxor	%ymm10,%ymm13,%ymm13
+
+	vpslld	$30,%ymm4,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm2,%ymm2
+	vpsrld	$31,%ymm13,%ymm9
+	vpaddd	%ymm13,%ymm13,%ymm13
+
+	vpsrld	$2,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm2,%ymm2
+	vpor	%ymm9,%ymm13,%ymm13
+	vpor	%ymm6,%ymm4,%ymm4
+	vpxor	%ymm11,%ymm14,%ymm14
+	vmovdqa	320-256-128(%rbx),%ymm11
+
+	vpslld	$5,%ymm2,%ymm7
+	vpaddd	%ymm15,%ymm1,%ymm1
+	vpxor	%ymm3,%ymm0,%ymm5
+	vmovdqa	%ymm13,224-128(%rax)
+	vpaddd	%ymm13,%ymm1,%ymm1
+	vpxor	0-128(%rax),%ymm14,%ymm14
+	vpsrld	$27,%ymm2,%ymm8
+	vpxor	%ymm4,%ymm5,%ymm5
+	vpxor	%ymm11,%ymm14,%ymm14
+
+	vpslld	$30,%ymm3,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpsrld	$31,%ymm14,%ymm9
+	vpaddd	%ymm14,%ymm14,%ymm14
+
+	vpsrld	$2,%ymm3,%ymm3
+	vpaddd	%ymm7,%ymm1,%ymm1
+	vpor	%ymm9,%ymm14,%ymm14
+	vpor	%ymm6,%ymm3,%ymm3
+	vpxor	%ymm12,%ymm10,%ymm10
+	vmovdqa	352-256-128(%rbx),%ymm12
+
+	vpslld	$5,%ymm1,%ymm7
+	vpaddd	%ymm15,%ymm0,%ymm0
+	vpxor	%ymm2,%ymm4,%ymm5
+	vmovdqa	%ymm14,256-256-128(%rbx)
+	vpaddd	%ymm14,%ymm0,%ymm0
+	vpxor	32-128(%rax),%ymm10,%ymm10
+	vpsrld	$27,%ymm1,%ymm8
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpxor	%ymm12,%ymm10,%ymm10
+
+	vpslld	$30,%ymm2,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm0,%ymm0
+	vpsrld	$31,%ymm10,%ymm9
+	vpaddd	%ymm10,%ymm10,%ymm10
+
+	vpsrld	$2,%ymm2,%ymm2
+	vpaddd	%ymm7,%ymm0,%ymm0
+	vpor	%ymm9,%ymm10,%ymm10
+	vpor	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm13,%ymm11,%ymm11
+	vmovdqa	384-256-128(%rbx),%ymm13
+
+	vpslld	$5,%ymm0,%ymm7
+	vpaddd	%ymm15,%ymm4,%ymm4
+	vpxor	%ymm1,%ymm3,%ymm5
+	vmovdqa	%ymm10,288-256-128(%rbx)
+	vpaddd	%ymm10,%ymm4,%ymm4
+	vpxor	64-128(%rax),%ymm11,%ymm11
+	vpsrld	$27,%ymm0,%ymm8
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpxor	%ymm13,%ymm11,%ymm11
+
+	vpslld	$30,%ymm1,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm4,%ymm4
+	vpsrld	$31,%ymm11,%ymm9
+	vpaddd	%ymm11,%ymm11,%ymm11
+
+	vpsrld	$2,%ymm1,%ymm1
+	vpaddd	%ymm7,%ymm4,%ymm4
+	vpor	%ymm9,%ymm11,%ymm11
+	vpor	%ymm6,%ymm1,%ymm1
+	vpxor	%ymm14,%ymm12,%ymm12
+	vmovdqa	416-256-128(%rbx),%ymm14
+
+	vpslld	$5,%ymm4,%ymm7
+	vpaddd	%ymm15,%ymm3,%ymm3
+	vpxor	%ymm0,%ymm2,%ymm5
+	vmovdqa	%ymm11,320-256-128(%rbx)
+	vpaddd	%ymm11,%ymm3,%ymm3
+	vpxor	96-128(%rax),%ymm12,%ymm12
+	vpsrld	$27,%ymm4,%ymm8
+	vpxor	%ymm1,%ymm5,%ymm5
+	vpxor	%ymm14,%ymm12,%ymm12
+
+	vpslld	$30,%ymm0,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm3,%ymm3
+	vpsrld	$31,%ymm12,%ymm9
+	vpaddd	%ymm12,%ymm12,%ymm12
+
+	vpsrld	$2,%ymm0,%ymm0
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpor	%ymm9,%ymm12,%ymm12
+	vpor	%ymm6,%ymm0,%ymm0
+	vpxor	%ymm10,%ymm13,%ymm13
+	vmovdqa	448-256-128(%rbx),%ymm10
+
+	vpslld	$5,%ymm3,%ymm7
+	vpaddd	%ymm15,%ymm2,%ymm2
+	vpxor	%ymm4,%ymm1,%ymm5
+	vmovdqa	%ymm12,352-256-128(%rbx)
+	vpaddd	%ymm12,%ymm2,%ymm2
+	vpxor	128-128(%rax),%ymm13,%ymm13
+	vpsrld	$27,%ymm3,%ymm8
+	vpxor	%ymm0,%ymm5,%ymm5
+	vpxor	%ymm10,%ymm13,%ymm13
+
+	vpslld	$30,%ymm4,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm2,%ymm2
+	vpsrld	$31,%ymm13,%ymm9
+	vpaddd	%ymm13,%ymm13,%ymm13
+
+	vpsrld	$2,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm2,%ymm2
+	vpor	%ymm9,%ymm13,%ymm13
+	vpor	%ymm6,%ymm4,%ymm4
+	vpxor	%ymm11,%ymm14,%ymm14
+	vmovdqa	480-256-128(%rbx),%ymm11
+
+	vpslld	$5,%ymm2,%ymm7
+	vpaddd	%ymm15,%ymm1,%ymm1
+	vpxor	%ymm3,%ymm0,%ymm5
+	vmovdqa	%ymm13,384-256-128(%rbx)
+	vpaddd	%ymm13,%ymm1,%ymm1
+	vpxor	160-128(%rax),%ymm14,%ymm14
+	vpsrld	$27,%ymm2,%ymm8
+	vpxor	%ymm4,%ymm5,%ymm5
+	vpxor	%ymm11,%ymm14,%ymm14
+
+	vpslld	$30,%ymm3,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpsrld	$31,%ymm14,%ymm9
+	vpaddd	%ymm14,%ymm14,%ymm14
+
+	vpsrld	$2,%ymm3,%ymm3
+	vpaddd	%ymm7,%ymm1,%ymm1
+	vpor	%ymm9,%ymm14,%ymm14
+	vpor	%ymm6,%ymm3,%ymm3
+	vpxor	%ymm12,%ymm10,%ymm10
+	vmovdqa	0-128(%rax),%ymm12
+
+	vpslld	$5,%ymm1,%ymm7
+	vpaddd	%ymm15,%ymm0,%ymm0
+	vpxor	%ymm2,%ymm4,%ymm5
+	vmovdqa	%ymm14,416-256-128(%rbx)
+	vpaddd	%ymm14,%ymm0,%ymm0
+	vpxor	192-128(%rax),%ymm10,%ymm10
+	vpsrld	$27,%ymm1,%ymm8
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpxor	%ymm12,%ymm10,%ymm10
+
+	vpslld	$30,%ymm2,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm0,%ymm0
+	vpsrld	$31,%ymm10,%ymm9
+	vpaddd	%ymm10,%ymm10,%ymm10
+
+	vpsrld	$2,%ymm2,%ymm2
+	vpaddd	%ymm7,%ymm0,%ymm0
+	vpor	%ymm9,%ymm10,%ymm10
+	vpor	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm13,%ymm11,%ymm11
+	vmovdqa	32-128(%rax),%ymm13
+
+	vpslld	$5,%ymm0,%ymm7
+	vpaddd	%ymm15,%ymm4,%ymm4
+	vpxor	%ymm1,%ymm3,%ymm5
+	vmovdqa	%ymm10,448-256-128(%rbx)
+	vpaddd	%ymm10,%ymm4,%ymm4
+	vpxor	224-128(%rax),%ymm11,%ymm11
+	vpsrld	$27,%ymm0,%ymm8
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpxor	%ymm13,%ymm11,%ymm11
+
+	vpslld	$30,%ymm1,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm4,%ymm4
+	vpsrld	$31,%ymm11,%ymm9
+	vpaddd	%ymm11,%ymm11,%ymm11
+
+	vpsrld	$2,%ymm1,%ymm1
+	vpaddd	%ymm7,%ymm4,%ymm4
+	vpor	%ymm9,%ymm11,%ymm11
+	vpor	%ymm6,%ymm1,%ymm1
+	vpxor	%ymm14,%ymm12,%ymm12
+	vmovdqa	64-128(%rax),%ymm14
+
+	vpslld	$5,%ymm4,%ymm7
+	vpaddd	%ymm15,%ymm3,%ymm3
+	vpxor	%ymm0,%ymm2,%ymm5
+	vmovdqa	%ymm11,480-256-128(%rbx)
+	vpaddd	%ymm11,%ymm3,%ymm3
+	vpxor	256-256-128(%rbx),%ymm12,%ymm12
+	vpsrld	$27,%ymm4,%ymm8
+	vpxor	%ymm1,%ymm5,%ymm5
+	vpxor	%ymm14,%ymm12,%ymm12
+
+	vpslld	$30,%ymm0,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm3,%ymm3
+	vpsrld	$31,%ymm12,%ymm9
+	vpaddd	%ymm12,%ymm12,%ymm12
+
+	vpsrld	$2,%ymm0,%ymm0
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpor	%ymm9,%ymm12,%ymm12
+	vpor	%ymm6,%ymm0,%ymm0
+	vpxor	%ymm10,%ymm13,%ymm13
+	vmovdqa	96-128(%rax),%ymm10
+
+	vpslld	$5,%ymm3,%ymm7
+	vpaddd	%ymm15,%ymm2,%ymm2
+	vpxor	%ymm4,%ymm1,%ymm5
+	vmovdqa	%ymm12,0-128(%rax)
+	vpaddd	%ymm12,%ymm2,%ymm2
+	vpxor	288-256-128(%rbx),%ymm13,%ymm13
+	vpsrld	$27,%ymm3,%ymm8
+	vpxor	%ymm0,%ymm5,%ymm5
+	vpxor	%ymm10,%ymm13,%ymm13
+
+	vpslld	$30,%ymm4,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm2,%ymm2
+	vpsrld	$31,%ymm13,%ymm9
+	vpaddd	%ymm13,%ymm13,%ymm13
+
+	vpsrld	$2,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm2,%ymm2
+	vpor	%ymm9,%ymm13,%ymm13
+	vpor	%ymm6,%ymm4,%ymm4
+	vpxor	%ymm11,%ymm14,%ymm14
+	vmovdqa	128-128(%rax),%ymm11
+
+	vpslld	$5,%ymm2,%ymm7
+	vpaddd	%ymm15,%ymm1,%ymm1
+	vpxor	%ymm3,%ymm0,%ymm5
+	vmovdqa	%ymm13,32-128(%rax)
+	vpaddd	%ymm13,%ymm1,%ymm1
+	vpxor	320-256-128(%rbx),%ymm14,%ymm14
+	vpsrld	$27,%ymm2,%ymm8
+	vpxor	%ymm4,%ymm5,%ymm5
+	vpxor	%ymm11,%ymm14,%ymm14
+
+	vpslld	$30,%ymm3,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpsrld	$31,%ymm14,%ymm9
+	vpaddd	%ymm14,%ymm14,%ymm14
+
+	vpsrld	$2,%ymm3,%ymm3
+	vpaddd	%ymm7,%ymm1,%ymm1
+	vpor	%ymm9,%ymm14,%ymm14
+	vpor	%ymm6,%ymm3,%ymm3
+	vpxor	%ymm12,%ymm10,%ymm10
+	vmovdqa	160-128(%rax),%ymm12
+
+	vpslld	$5,%ymm1,%ymm7
+	vpaddd	%ymm15,%ymm0,%ymm0
+	vpxor	%ymm2,%ymm4,%ymm5
+	vmovdqa	%ymm14,64-128(%rax)
+	vpaddd	%ymm14,%ymm0,%ymm0
+	vpxor	352-256-128(%rbx),%ymm10,%ymm10
+	vpsrld	$27,%ymm1,%ymm8
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpxor	%ymm12,%ymm10,%ymm10
+
+	vpslld	$30,%ymm2,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm0,%ymm0
+	vpsrld	$31,%ymm10,%ymm9
+	vpaddd	%ymm10,%ymm10,%ymm10
+
+	vpsrld	$2,%ymm2,%ymm2
+	vpaddd	%ymm7,%ymm0,%ymm0
+	vpor	%ymm9,%ymm10,%ymm10
+	vpor	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm13,%ymm11,%ymm11
+	vmovdqa	192-128(%rax),%ymm13
+
+	vpslld	$5,%ymm0,%ymm7
+	vpaddd	%ymm15,%ymm4,%ymm4
+	vpxor	%ymm1,%ymm3,%ymm5
+	vmovdqa	%ymm10,96-128(%rax)
+	vpaddd	%ymm10,%ymm4,%ymm4
+	vpxor	384-256-128(%rbx),%ymm11,%ymm11
+	vpsrld	$27,%ymm0,%ymm8
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpxor	%ymm13,%ymm11,%ymm11
+
+	vpslld	$30,%ymm1,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm4,%ymm4
+	vpsrld	$31,%ymm11,%ymm9
+	vpaddd	%ymm11,%ymm11,%ymm11
+
+	vpsrld	$2,%ymm1,%ymm1
+	vpaddd	%ymm7,%ymm4,%ymm4
+	vpor	%ymm9,%ymm11,%ymm11
+	vpor	%ymm6,%ymm1,%ymm1
+	vpxor	%ymm14,%ymm12,%ymm12
+	vmovdqa	224-128(%rax),%ymm14
+
+	vpslld	$5,%ymm4,%ymm7
+	vpaddd	%ymm15,%ymm3,%ymm3
+	vpxor	%ymm0,%ymm2,%ymm5
+	vmovdqa	%ymm11,128-128(%rax)
+	vpaddd	%ymm11,%ymm3,%ymm3
+	vpxor	416-256-128(%rbx),%ymm12,%ymm12
+	vpsrld	$27,%ymm4,%ymm8
+	vpxor	%ymm1,%ymm5,%ymm5
+	vpxor	%ymm14,%ymm12,%ymm12
+
+	vpslld	$30,%ymm0,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm3,%ymm3
+	vpsrld	$31,%ymm12,%ymm9
+	vpaddd	%ymm12,%ymm12,%ymm12
+
+	vpsrld	$2,%ymm0,%ymm0
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpor	%ymm9,%ymm12,%ymm12
+	vpor	%ymm6,%ymm0,%ymm0
+	vpxor	%ymm10,%ymm13,%ymm13
+	vmovdqa	256-256-128(%rbx),%ymm10
+
+	vpslld	$5,%ymm3,%ymm7
+	vpaddd	%ymm15,%ymm2,%ymm2
+	vpxor	%ymm4,%ymm1,%ymm5
+	vmovdqa	%ymm12,160-128(%rax)
+	vpaddd	%ymm12,%ymm2,%ymm2
+	vpxor	448-256-128(%rbx),%ymm13,%ymm13
+	vpsrld	$27,%ymm3,%ymm8
+	vpxor	%ymm0,%ymm5,%ymm5
+	vpxor	%ymm10,%ymm13,%ymm13
+
+	vpslld	$30,%ymm4,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm2,%ymm2
+	vpsrld	$31,%ymm13,%ymm9
+	vpaddd	%ymm13,%ymm13,%ymm13
+
+	vpsrld	$2,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm2,%ymm2
+	vpor	%ymm9,%ymm13,%ymm13
+	vpor	%ymm6,%ymm4,%ymm4
+	vpxor	%ymm11,%ymm14,%ymm14
+	vmovdqa	288-256-128(%rbx),%ymm11
+
+	vpslld	$5,%ymm2,%ymm7
+	vpaddd	%ymm15,%ymm1,%ymm1
+	vpxor	%ymm3,%ymm0,%ymm5
+	vmovdqa	%ymm13,192-128(%rax)
+	vpaddd	%ymm13,%ymm1,%ymm1
+	vpxor	480-256-128(%rbx),%ymm14,%ymm14
+	vpsrld	$27,%ymm2,%ymm8
+	vpxor	%ymm4,%ymm5,%ymm5
+	vpxor	%ymm11,%ymm14,%ymm14
+
+	vpslld	$30,%ymm3,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpsrld	$31,%ymm14,%ymm9
+	vpaddd	%ymm14,%ymm14,%ymm14
+
+	vpsrld	$2,%ymm3,%ymm3
+	vpaddd	%ymm7,%ymm1,%ymm1
+	vpor	%ymm9,%ymm14,%ymm14
+	vpor	%ymm6,%ymm3,%ymm3
+	vpxor	%ymm12,%ymm10,%ymm10
+	vmovdqa	320-256-128(%rbx),%ymm12
+
+	vpslld	$5,%ymm1,%ymm7
+	vpaddd	%ymm15,%ymm0,%ymm0
+	vpxor	%ymm2,%ymm4,%ymm5
+	vmovdqa	%ymm14,224-128(%rax)
+	vpaddd	%ymm14,%ymm0,%ymm0
+	vpxor	0-128(%rax),%ymm10,%ymm10
+	vpsrld	$27,%ymm1,%ymm8
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpxor	%ymm12,%ymm10,%ymm10
+
+	vpslld	$30,%ymm2,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm0,%ymm0
+	vpsrld	$31,%ymm10,%ymm9
+	vpaddd	%ymm10,%ymm10,%ymm10
+
+	vpsrld	$2,%ymm2,%ymm2
+	vpaddd	%ymm7,%ymm0,%ymm0
+	vpor	%ymm9,%ymm10,%ymm10
+	vpor	%ymm6,%ymm2,%ymm2
+	vmovdqa	32(%rbp),%ymm15
+	vpxor	%ymm13,%ymm11,%ymm11
+	vmovdqa	352-256-128(%rbx),%ymm13
+
+	vpaddd	%ymm15,%ymm4,%ymm4
+	vpslld	$5,%ymm0,%ymm7
+	vpand	%ymm2,%ymm3,%ymm6
+	vpxor	32-128(%rax),%ymm11,%ymm11
+
+	vpaddd	%ymm6,%ymm4,%ymm4
+	vpsrld	$27,%ymm0,%ymm8
+	vpxor	%ymm2,%ymm3,%ymm5
+	vpxor	%ymm13,%ymm11,%ymm11
+
+	vmovdqu	%ymm10,256-256-128(%rbx)
+	vpaddd	%ymm10,%ymm4,%ymm4
+	vpor	%ymm8,%ymm7,%ymm7
+	vpsrld	$31,%ymm11,%ymm9
+	vpand	%ymm1,%ymm5,%ymm5
+	vpaddd	%ymm11,%ymm11,%ymm11
+
+	vpslld	$30,%ymm1,%ymm6
+	vpaddd	%ymm5,%ymm4,%ymm4
+
+	vpsrld	$2,%ymm1,%ymm1
+	vpaddd	%ymm7,%ymm4,%ymm4
+	vpor	%ymm9,%ymm11,%ymm11
+	vpor	%ymm6,%ymm1,%ymm1
+	vpxor	%ymm14,%ymm12,%ymm12
+	vmovdqa	384-256-128(%rbx),%ymm14
+
+	vpaddd	%ymm15,%ymm3,%ymm3
+	vpslld	$5,%ymm4,%ymm7
+	vpand	%ymm1,%ymm2,%ymm6
+	vpxor	64-128(%rax),%ymm12,%ymm12
+
+	vpaddd	%ymm6,%ymm3,%ymm3
+	vpsrld	$27,%ymm4,%ymm8
+	vpxor	%ymm1,%ymm2,%ymm5
+	vpxor	%ymm14,%ymm12,%ymm12
+
+	vmovdqu	%ymm11,288-256-128(%rbx)
+	vpaddd	%ymm11,%ymm3,%ymm3
+	vpor	%ymm8,%ymm7,%ymm7
+	vpsrld	$31,%ymm12,%ymm9
+	vpand	%ymm0,%ymm5,%ymm5
+	vpaddd	%ymm12,%ymm12,%ymm12
+
+	vpslld	$30,%ymm0,%ymm6
+	vpaddd	%ymm5,%ymm3,%ymm3
+
+	vpsrld	$2,%ymm0,%ymm0
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpor	%ymm9,%ymm12,%ymm12
+	vpor	%ymm6,%ymm0,%ymm0
+	vpxor	%ymm10,%ymm13,%ymm13
+	vmovdqa	416-256-128(%rbx),%ymm10
+
+	vpaddd	%ymm15,%ymm2,%ymm2
+	vpslld	$5,%ymm3,%ymm7
+	vpand	%ymm0,%ymm1,%ymm6
+	vpxor	96-128(%rax),%ymm13,%ymm13
+
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpsrld	$27,%ymm3,%ymm8
+	vpxor	%ymm0,%ymm1,%ymm5
+	vpxor	%ymm10,%ymm13,%ymm13
+
+	vmovdqu	%ymm12,320-256-128(%rbx)
+	vpaddd	%ymm12,%ymm2,%ymm2
+	vpor	%ymm8,%ymm7,%ymm7
+	vpsrld	$31,%ymm13,%ymm9
+	vpand	%ymm4,%ymm5,%ymm5
+	vpaddd	%ymm13,%ymm13,%ymm13
+
+	vpslld	$30,%ymm4,%ymm6
+	vpaddd	%ymm5,%ymm2,%ymm2
+
+	vpsrld	$2,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm2,%ymm2
+	vpor	%ymm9,%ymm13,%ymm13
+	vpor	%ymm6,%ymm4,%ymm4
+	vpxor	%ymm11,%ymm14,%ymm14
+	vmovdqa	448-256-128(%rbx),%ymm11
+
+	vpaddd	%ymm15,%ymm1,%ymm1
+	vpslld	$5,%ymm2,%ymm7
+	vpand	%ymm4,%ymm0,%ymm6
+	vpxor	128-128(%rax),%ymm14,%ymm14
+
+	vpaddd	%ymm6,%ymm1,%ymm1
+	vpsrld	$27,%ymm2,%ymm8
+	vpxor	%ymm4,%ymm0,%ymm5
+	vpxor	%ymm11,%ymm14,%ymm14
+
+	vmovdqu	%ymm13,352-256-128(%rbx)
+	vpaddd	%ymm13,%ymm1,%ymm1
+	vpor	%ymm8,%ymm7,%ymm7
+	vpsrld	$31,%ymm14,%ymm9
+	vpand	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm14,%ymm14,%ymm14
+
+	vpslld	$30,%ymm3,%ymm6
+	vpaddd	%ymm5,%ymm1,%ymm1
+
+	vpsrld	$2,%ymm3,%ymm3
+	vpaddd	%ymm7,%ymm1,%ymm1
+	vpor	%ymm9,%ymm14,%ymm14
+	vpor	%ymm6,%ymm3,%ymm3
+	vpxor	%ymm12,%ymm10,%ymm10
+	vmovdqa	480-256-128(%rbx),%ymm12
+
+	vpaddd	%ymm15,%ymm0,%ymm0
+	vpslld	$5,%ymm1,%ymm7
+	vpand	%ymm3,%ymm4,%ymm6
+	vpxor	160-128(%rax),%ymm10,%ymm10
+
+	vpaddd	%ymm6,%ymm0,%ymm0
+	vpsrld	$27,%ymm1,%ymm8
+	vpxor	%ymm3,%ymm4,%ymm5
+	vpxor	%ymm12,%ymm10,%ymm10
+
+	vmovdqu	%ymm14,384-256-128(%rbx)
+	vpaddd	%ymm14,%ymm0,%ymm0
+	vpor	%ymm8,%ymm7,%ymm7
+	vpsrld	$31,%ymm10,%ymm9
+	vpand	%ymm2,%ymm5,%ymm5
+	vpaddd	%ymm10,%ymm10,%ymm10
+
+	vpslld	$30,%ymm2,%ymm6
+	vpaddd	%ymm5,%ymm0,%ymm0
+
+	vpsrld	$2,%ymm2,%ymm2
+	vpaddd	%ymm7,%ymm0,%ymm0
+	vpor	%ymm9,%ymm10,%ymm10
+	vpor	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm13,%ymm11,%ymm11
+	vmovdqa	0-128(%rax),%ymm13
+
+	vpaddd	%ymm15,%ymm4,%ymm4
+	vpslld	$5,%ymm0,%ymm7
+	vpand	%ymm2,%ymm3,%ymm6
+	vpxor	192-128(%rax),%ymm11,%ymm11
+
+	vpaddd	%ymm6,%ymm4,%ymm4
+	vpsrld	$27,%ymm0,%ymm8
+	vpxor	%ymm2,%ymm3,%ymm5
+	vpxor	%ymm13,%ymm11,%ymm11
+
+	vmovdqu	%ymm10,416-256-128(%rbx)
+	vpaddd	%ymm10,%ymm4,%ymm4
+	vpor	%ymm8,%ymm7,%ymm7
+	vpsrld	$31,%ymm11,%ymm9
+	vpand	%ymm1,%ymm5,%ymm5
+	vpaddd	%ymm11,%ymm11,%ymm11
+
+	vpslld	$30,%ymm1,%ymm6
+	vpaddd	%ymm5,%ymm4,%ymm4
+
+	vpsrld	$2,%ymm1,%ymm1
+	vpaddd	%ymm7,%ymm4,%ymm4
+	vpor	%ymm9,%ymm11,%ymm11
+	vpor	%ymm6,%ymm1,%ymm1
+	vpxor	%ymm14,%ymm12,%ymm12
+	vmovdqa	32-128(%rax),%ymm14
+
+	vpaddd	%ymm15,%ymm3,%ymm3
+	vpslld	$5,%ymm4,%ymm7
+	vpand	%ymm1,%ymm2,%ymm6
+	vpxor	224-128(%rax),%ymm12,%ymm12
+
+	vpaddd	%ymm6,%ymm3,%ymm3
+	vpsrld	$27,%ymm4,%ymm8
+	vpxor	%ymm1,%ymm2,%ymm5
+	vpxor	%ymm14,%ymm12,%ymm12
+
+	vmovdqu	%ymm11,448-256-128(%rbx)
+	vpaddd	%ymm11,%ymm3,%ymm3
+	vpor	%ymm8,%ymm7,%ymm7
+	vpsrld	$31,%ymm12,%ymm9
+	vpand	%ymm0,%ymm5,%ymm5
+	vpaddd	%ymm12,%ymm12,%ymm12
+
+	vpslld	$30,%ymm0,%ymm6
+	vpaddd	%ymm5,%ymm3,%ymm3
+
+	vpsrld	$2,%ymm0,%ymm0
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpor	%ymm9,%ymm12,%ymm12
+	vpor	%ymm6,%ymm0,%ymm0
+	vpxor	%ymm10,%ymm13,%ymm13
+	vmovdqa	64-128(%rax),%ymm10
+
+	vpaddd	%ymm15,%ymm2,%ymm2
+	vpslld	$5,%ymm3,%ymm7
+	vpand	%ymm0,%ymm1,%ymm6
+	vpxor	256-256-128(%rbx),%ymm13,%ymm13
+
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpsrld	$27,%ymm3,%ymm8
+	vpxor	%ymm0,%ymm1,%ymm5
+	vpxor	%ymm10,%ymm13,%ymm13
+
+	vmovdqu	%ymm12,480-256-128(%rbx)
+	vpaddd	%ymm12,%ymm2,%ymm2
+	vpor	%ymm8,%ymm7,%ymm7
+	vpsrld	$31,%ymm13,%ymm9
+	vpand	%ymm4,%ymm5,%ymm5
+	vpaddd	%ymm13,%ymm13,%ymm13
+
+	vpslld	$30,%ymm4,%ymm6
+	vpaddd	%ymm5,%ymm2,%ymm2
+
+	vpsrld	$2,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm2,%ymm2
+	vpor	%ymm9,%ymm13,%ymm13
+	vpor	%ymm6,%ymm4,%ymm4
+	vpxor	%ymm11,%ymm14,%ymm14
+	vmovdqa	96-128(%rax),%ymm11
+
+	vpaddd	%ymm15,%ymm1,%ymm1
+	vpslld	$5,%ymm2,%ymm7
+	vpand	%ymm4,%ymm0,%ymm6
+	vpxor	288-256-128(%rbx),%ymm14,%ymm14
+
+	vpaddd	%ymm6,%ymm1,%ymm1
+	vpsrld	$27,%ymm2,%ymm8
+	vpxor	%ymm4,%ymm0,%ymm5
+	vpxor	%ymm11,%ymm14,%ymm14
+
+	vmovdqu	%ymm13,0-128(%rax)
+	vpaddd	%ymm13,%ymm1,%ymm1
+	vpor	%ymm8,%ymm7,%ymm7
+	vpsrld	$31,%ymm14,%ymm9
+	vpand	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm14,%ymm14,%ymm14
+
+	vpslld	$30,%ymm3,%ymm6
+	vpaddd	%ymm5,%ymm1,%ymm1
+
+	vpsrld	$2,%ymm3,%ymm3
+	vpaddd	%ymm7,%ymm1,%ymm1
+	vpor	%ymm9,%ymm14,%ymm14
+	vpor	%ymm6,%ymm3,%ymm3
+	vpxor	%ymm12,%ymm10,%ymm10
+	vmovdqa	128-128(%rax),%ymm12
+
+	vpaddd	%ymm15,%ymm0,%ymm0
+	vpslld	$5,%ymm1,%ymm7
+	vpand	%ymm3,%ymm4,%ymm6
+	vpxor	320-256-128(%rbx),%ymm10,%ymm10
+
+	vpaddd	%ymm6,%ymm0,%ymm0
+	vpsrld	$27,%ymm1,%ymm8
+	vpxor	%ymm3,%ymm4,%ymm5
+	vpxor	%ymm12,%ymm10,%ymm10
+
+	vmovdqu	%ymm14,32-128(%rax)
+	vpaddd	%ymm14,%ymm0,%ymm0
+	vpor	%ymm8,%ymm7,%ymm7
+	vpsrld	$31,%ymm10,%ymm9
+	vpand	%ymm2,%ymm5,%ymm5
+	vpaddd	%ymm10,%ymm10,%ymm10
+
+	vpslld	$30,%ymm2,%ymm6
+	vpaddd	%ymm5,%ymm0,%ymm0
+
+	vpsrld	$2,%ymm2,%ymm2
+	vpaddd	%ymm7,%ymm0,%ymm0
+	vpor	%ymm9,%ymm10,%ymm10
+	vpor	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm13,%ymm11,%ymm11
+	vmovdqa	160-128(%rax),%ymm13
+
+	vpaddd	%ymm15,%ymm4,%ymm4
+	vpslld	$5,%ymm0,%ymm7
+	vpand	%ymm2,%ymm3,%ymm6
+	vpxor	352-256-128(%rbx),%ymm11,%ymm11
+
+	vpaddd	%ymm6,%ymm4,%ymm4
+	vpsrld	$27,%ymm0,%ymm8
+	vpxor	%ymm2,%ymm3,%ymm5
+	vpxor	%ymm13,%ymm11,%ymm11
+
+	vmovdqu	%ymm10,64-128(%rax)
+	vpaddd	%ymm10,%ymm4,%ymm4
+	vpor	%ymm8,%ymm7,%ymm7
+	vpsrld	$31,%ymm11,%ymm9
+	vpand	%ymm1,%ymm5,%ymm5
+	vpaddd	%ymm11,%ymm11,%ymm11
+
+	vpslld	$30,%ymm1,%ymm6
+	vpaddd	%ymm5,%ymm4,%ymm4
+
+	vpsrld	$2,%ymm1,%ymm1
+	vpaddd	%ymm7,%ymm4,%ymm4
+	vpor	%ymm9,%ymm11,%ymm11
+	vpor	%ymm6,%ymm1,%ymm1
+	vpxor	%ymm14,%ymm12,%ymm12
+	vmovdqa	192-128(%rax),%ymm14
+
+	vpaddd	%ymm15,%ymm3,%ymm3
+	vpslld	$5,%ymm4,%ymm7
+	vpand	%ymm1,%ymm2,%ymm6
+	vpxor	384-256-128(%rbx),%ymm12,%ymm12
+
+	vpaddd	%ymm6,%ymm3,%ymm3
+	vpsrld	$27,%ymm4,%ymm8
+	vpxor	%ymm1,%ymm2,%ymm5
+	vpxor	%ymm14,%ymm12,%ymm12
+
+	vmovdqu	%ymm11,96-128(%rax)
+	vpaddd	%ymm11,%ymm3,%ymm3
+	vpor	%ymm8,%ymm7,%ymm7
+	vpsrld	$31,%ymm12,%ymm9
+	vpand	%ymm0,%ymm5,%ymm5
+	vpaddd	%ymm12,%ymm12,%ymm12
+
+	vpslld	$30,%ymm0,%ymm6
+	vpaddd	%ymm5,%ymm3,%ymm3
+
+	vpsrld	$2,%ymm0,%ymm0
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpor	%ymm9,%ymm12,%ymm12
+	vpor	%ymm6,%ymm0,%ymm0
+	vpxor	%ymm10,%ymm13,%ymm13
+	vmovdqa	224-128(%rax),%ymm10
+
+	vpaddd	%ymm15,%ymm2,%ymm2
+	vpslld	$5,%ymm3,%ymm7
+	vpand	%ymm0,%ymm1,%ymm6
+	vpxor	416-256-128(%rbx),%ymm13,%ymm13
+
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpsrld	$27,%ymm3,%ymm8
+	vpxor	%ymm0,%ymm1,%ymm5
+	vpxor	%ymm10,%ymm13,%ymm13
+
+	vmovdqu	%ymm12,128-128(%rax)
+	vpaddd	%ymm12,%ymm2,%ymm2
+	vpor	%ymm8,%ymm7,%ymm7
+	vpsrld	$31,%ymm13,%ymm9
+	vpand	%ymm4,%ymm5,%ymm5
+	vpaddd	%ymm13,%ymm13,%ymm13
+
+	vpslld	$30,%ymm4,%ymm6
+	vpaddd	%ymm5,%ymm2,%ymm2
+
+	vpsrld	$2,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm2,%ymm2
+	vpor	%ymm9,%ymm13,%ymm13
+	vpor	%ymm6,%ymm4,%ymm4
+	vpxor	%ymm11,%ymm14,%ymm14
+	vmovdqa	256-256-128(%rbx),%ymm11
+
+	vpaddd	%ymm15,%ymm1,%ymm1
+	vpslld	$5,%ymm2,%ymm7
+	vpand	%ymm4,%ymm0,%ymm6
+	vpxor	448-256-128(%rbx),%ymm14,%ymm14
+
+	vpaddd	%ymm6,%ymm1,%ymm1
+	vpsrld	$27,%ymm2,%ymm8
+	vpxor	%ymm4,%ymm0,%ymm5
+	vpxor	%ymm11,%ymm14,%ymm14
+
+	vmovdqu	%ymm13,160-128(%rax)
+	vpaddd	%ymm13,%ymm1,%ymm1
+	vpor	%ymm8,%ymm7,%ymm7
+	vpsrld	$31,%ymm14,%ymm9
+	vpand	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm14,%ymm14,%ymm14
+
+	vpslld	$30,%ymm3,%ymm6
+	vpaddd	%ymm5,%ymm1,%ymm1
+
+	vpsrld	$2,%ymm3,%ymm3
+	vpaddd	%ymm7,%ymm1,%ymm1
+	vpor	%ymm9,%ymm14,%ymm14
+	vpor	%ymm6,%ymm3,%ymm3
+	vpxor	%ymm12,%ymm10,%ymm10
+	vmovdqa	288-256-128(%rbx),%ymm12
+
+	vpaddd	%ymm15,%ymm0,%ymm0
+	vpslld	$5,%ymm1,%ymm7
+	vpand	%ymm3,%ymm4,%ymm6
+	vpxor	480-256-128(%rbx),%ymm10,%ymm10
+
+	vpaddd	%ymm6,%ymm0,%ymm0
+	vpsrld	$27,%ymm1,%ymm8
+	vpxor	%ymm3,%ymm4,%ymm5
+	vpxor	%ymm12,%ymm10,%ymm10
+
+	vmovdqu	%ymm14,192-128(%rax)
+	vpaddd	%ymm14,%ymm0,%ymm0
+	vpor	%ymm8,%ymm7,%ymm7
+	vpsrld	$31,%ymm10,%ymm9
+	vpand	%ymm2,%ymm5,%ymm5
+	vpaddd	%ymm10,%ymm10,%ymm10
+
+	vpslld	$30,%ymm2,%ymm6
+	vpaddd	%ymm5,%ymm0,%ymm0
+
+	vpsrld	$2,%ymm2,%ymm2
+	vpaddd	%ymm7,%ymm0,%ymm0
+	vpor	%ymm9,%ymm10,%ymm10
+	vpor	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm13,%ymm11,%ymm11
+	vmovdqa	320-256-128(%rbx),%ymm13
+
+	vpaddd	%ymm15,%ymm4,%ymm4
+	vpslld	$5,%ymm0,%ymm7
+	vpand	%ymm2,%ymm3,%ymm6
+	vpxor	0-128(%rax),%ymm11,%ymm11
+
+	vpaddd	%ymm6,%ymm4,%ymm4
+	vpsrld	$27,%ymm0,%ymm8
+	vpxor	%ymm2,%ymm3,%ymm5
+	vpxor	%ymm13,%ymm11,%ymm11
+
+	vmovdqu	%ymm10,224-128(%rax)
+	vpaddd	%ymm10,%ymm4,%ymm4
+	vpor	%ymm8,%ymm7,%ymm7
+	vpsrld	$31,%ymm11,%ymm9
+	vpand	%ymm1,%ymm5,%ymm5
+	vpaddd	%ymm11,%ymm11,%ymm11
+
+	vpslld	$30,%ymm1,%ymm6
+	vpaddd	%ymm5,%ymm4,%ymm4
+
+	vpsrld	$2,%ymm1,%ymm1
+	vpaddd	%ymm7,%ymm4,%ymm4
+	vpor	%ymm9,%ymm11,%ymm11
+	vpor	%ymm6,%ymm1,%ymm1
+	vpxor	%ymm14,%ymm12,%ymm12
+	vmovdqa	352-256-128(%rbx),%ymm14
+
+	vpaddd	%ymm15,%ymm3,%ymm3
+	vpslld	$5,%ymm4,%ymm7
+	vpand	%ymm1,%ymm2,%ymm6
+	vpxor	32-128(%rax),%ymm12,%ymm12
+
+	vpaddd	%ymm6,%ymm3,%ymm3
+	vpsrld	$27,%ymm4,%ymm8
+	vpxor	%ymm1,%ymm2,%ymm5
+	vpxor	%ymm14,%ymm12,%ymm12
+
+	vmovdqu	%ymm11,256-256-128(%rbx)
+	vpaddd	%ymm11,%ymm3,%ymm3
+	vpor	%ymm8,%ymm7,%ymm7
+	vpsrld	$31,%ymm12,%ymm9
+	vpand	%ymm0,%ymm5,%ymm5
+	vpaddd	%ymm12,%ymm12,%ymm12
+
+	vpslld	$30,%ymm0,%ymm6
+	vpaddd	%ymm5,%ymm3,%ymm3
+
+	vpsrld	$2,%ymm0,%ymm0
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpor	%ymm9,%ymm12,%ymm12
+	vpor	%ymm6,%ymm0,%ymm0
+	vpxor	%ymm10,%ymm13,%ymm13
+	vmovdqa	384-256-128(%rbx),%ymm10
+
+	vpaddd	%ymm15,%ymm2,%ymm2
+	vpslld	$5,%ymm3,%ymm7
+	vpand	%ymm0,%ymm1,%ymm6
+	vpxor	64-128(%rax),%ymm13,%ymm13
+
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpsrld	$27,%ymm3,%ymm8
+	vpxor	%ymm0,%ymm1,%ymm5
+	vpxor	%ymm10,%ymm13,%ymm13
+
+	vmovdqu	%ymm12,288-256-128(%rbx)
+	vpaddd	%ymm12,%ymm2,%ymm2
+	vpor	%ymm8,%ymm7,%ymm7
+	vpsrld	$31,%ymm13,%ymm9
+	vpand	%ymm4,%ymm5,%ymm5
+	vpaddd	%ymm13,%ymm13,%ymm13
+
+	vpslld	$30,%ymm4,%ymm6
+	vpaddd	%ymm5,%ymm2,%ymm2
+
+	vpsrld	$2,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm2,%ymm2
+	vpor	%ymm9,%ymm13,%ymm13
+	vpor	%ymm6,%ymm4,%ymm4
+	vpxor	%ymm11,%ymm14,%ymm14
+	vmovdqa	416-256-128(%rbx),%ymm11
+
+	vpaddd	%ymm15,%ymm1,%ymm1
+	vpslld	$5,%ymm2,%ymm7
+	vpand	%ymm4,%ymm0,%ymm6
+	vpxor	96-128(%rax),%ymm14,%ymm14
+
+	vpaddd	%ymm6,%ymm1,%ymm1
+	vpsrld	$27,%ymm2,%ymm8
+	vpxor	%ymm4,%ymm0,%ymm5
+	vpxor	%ymm11,%ymm14,%ymm14
+
+	vmovdqu	%ymm13,320-256-128(%rbx)
+	vpaddd	%ymm13,%ymm1,%ymm1
+	vpor	%ymm8,%ymm7,%ymm7
+	vpsrld	$31,%ymm14,%ymm9
+	vpand	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm14,%ymm14,%ymm14
+
+	vpslld	$30,%ymm3,%ymm6
+	vpaddd	%ymm5,%ymm1,%ymm1
+
+	vpsrld	$2,%ymm3,%ymm3
+	vpaddd	%ymm7,%ymm1,%ymm1
+	vpor	%ymm9,%ymm14,%ymm14
+	vpor	%ymm6,%ymm3,%ymm3
+	vpxor	%ymm12,%ymm10,%ymm10
+	vmovdqa	448-256-128(%rbx),%ymm12
+
+	vpaddd	%ymm15,%ymm0,%ymm0
+	vpslld	$5,%ymm1,%ymm7
+	vpand	%ymm3,%ymm4,%ymm6
+	vpxor	128-128(%rax),%ymm10,%ymm10
+
+	vpaddd	%ymm6,%ymm0,%ymm0
+	vpsrld	$27,%ymm1,%ymm8
+	vpxor	%ymm3,%ymm4,%ymm5
+	vpxor	%ymm12,%ymm10,%ymm10
+
+	vmovdqu	%ymm14,352-256-128(%rbx)
+	vpaddd	%ymm14,%ymm0,%ymm0
+	vpor	%ymm8,%ymm7,%ymm7
+	vpsrld	$31,%ymm10,%ymm9
+	vpand	%ymm2,%ymm5,%ymm5
+	vpaddd	%ymm10,%ymm10,%ymm10
+
+	vpslld	$30,%ymm2,%ymm6
+	vpaddd	%ymm5,%ymm0,%ymm0
+
+	vpsrld	$2,%ymm2,%ymm2
+	vpaddd	%ymm7,%ymm0,%ymm0
+	vpor	%ymm9,%ymm10,%ymm10
+	vpor	%ymm6,%ymm2,%ymm2
+	vmovdqa	64(%rbp),%ymm15
+	vpxor	%ymm13,%ymm11,%ymm11
+	vmovdqa	480-256-128(%rbx),%ymm13
+
+	vpslld	$5,%ymm0,%ymm7
+	vpaddd	%ymm15,%ymm4,%ymm4
+	vpxor	%ymm1,%ymm3,%ymm5
+	vmovdqa	%ymm10,384-256-128(%rbx)
+	vpaddd	%ymm10,%ymm4,%ymm4
+	vpxor	160-128(%rax),%ymm11,%ymm11
+	vpsrld	$27,%ymm0,%ymm8
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpxor	%ymm13,%ymm11,%ymm11
+
+	vpslld	$30,%ymm1,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm4,%ymm4
+	vpsrld	$31,%ymm11,%ymm9
+	vpaddd	%ymm11,%ymm11,%ymm11
+
+	vpsrld	$2,%ymm1,%ymm1
+	vpaddd	%ymm7,%ymm4,%ymm4
+	vpor	%ymm9,%ymm11,%ymm11
+	vpor	%ymm6,%ymm1,%ymm1
+	vpxor	%ymm14,%ymm12,%ymm12
+	vmovdqa	0-128(%rax),%ymm14
+
+	vpslld	$5,%ymm4,%ymm7
+	vpaddd	%ymm15,%ymm3,%ymm3
+	vpxor	%ymm0,%ymm2,%ymm5
+	vmovdqa	%ymm11,416-256-128(%rbx)
+	vpaddd	%ymm11,%ymm3,%ymm3
+	vpxor	192-128(%rax),%ymm12,%ymm12
+	vpsrld	$27,%ymm4,%ymm8
+	vpxor	%ymm1,%ymm5,%ymm5
+	vpxor	%ymm14,%ymm12,%ymm12
+
+	vpslld	$30,%ymm0,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm3,%ymm3
+	vpsrld	$31,%ymm12,%ymm9
+	vpaddd	%ymm12,%ymm12,%ymm12
+
+	vpsrld	$2,%ymm0,%ymm0
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpor	%ymm9,%ymm12,%ymm12
+	vpor	%ymm6,%ymm0,%ymm0
+	vpxor	%ymm10,%ymm13,%ymm13
+	vmovdqa	32-128(%rax),%ymm10
+
+	vpslld	$5,%ymm3,%ymm7
+	vpaddd	%ymm15,%ymm2,%ymm2
+	vpxor	%ymm4,%ymm1,%ymm5
+	vmovdqa	%ymm12,448-256-128(%rbx)
+	vpaddd	%ymm12,%ymm2,%ymm2
+	vpxor	224-128(%rax),%ymm13,%ymm13
+	vpsrld	$27,%ymm3,%ymm8
+	vpxor	%ymm0,%ymm5,%ymm5
+	vpxor	%ymm10,%ymm13,%ymm13
+
+	vpslld	$30,%ymm4,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm2,%ymm2
+	vpsrld	$31,%ymm13,%ymm9
+	vpaddd	%ymm13,%ymm13,%ymm13
+
+	vpsrld	$2,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm2,%ymm2
+	vpor	%ymm9,%ymm13,%ymm13
+	vpor	%ymm6,%ymm4,%ymm4
+	vpxor	%ymm11,%ymm14,%ymm14
+	vmovdqa	64-128(%rax),%ymm11
+
+	vpslld	$5,%ymm2,%ymm7
+	vpaddd	%ymm15,%ymm1,%ymm1
+	vpxor	%ymm3,%ymm0,%ymm5
+	vmovdqa	%ymm13,480-256-128(%rbx)
+	vpaddd	%ymm13,%ymm1,%ymm1
+	vpxor	256-256-128(%rbx),%ymm14,%ymm14
+	vpsrld	$27,%ymm2,%ymm8
+	vpxor	%ymm4,%ymm5,%ymm5
+	vpxor	%ymm11,%ymm14,%ymm14
+
+	vpslld	$30,%ymm3,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpsrld	$31,%ymm14,%ymm9
+	vpaddd	%ymm14,%ymm14,%ymm14
+
+	vpsrld	$2,%ymm3,%ymm3
+	vpaddd	%ymm7,%ymm1,%ymm1
+	vpor	%ymm9,%ymm14,%ymm14
+	vpor	%ymm6,%ymm3,%ymm3
+	vpxor	%ymm12,%ymm10,%ymm10
+	vmovdqa	96-128(%rax),%ymm12
+
+	vpslld	$5,%ymm1,%ymm7
+	vpaddd	%ymm15,%ymm0,%ymm0
+	vpxor	%ymm2,%ymm4,%ymm5
+	vmovdqa	%ymm14,0-128(%rax)
+	vpaddd	%ymm14,%ymm0,%ymm0
+	vpxor	288-256-128(%rbx),%ymm10,%ymm10
+	vpsrld	$27,%ymm1,%ymm8
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpxor	%ymm12,%ymm10,%ymm10
+
+	vpslld	$30,%ymm2,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm0,%ymm0
+	vpsrld	$31,%ymm10,%ymm9
+	vpaddd	%ymm10,%ymm10,%ymm10
+
+	vpsrld	$2,%ymm2,%ymm2
+	vpaddd	%ymm7,%ymm0,%ymm0
+	vpor	%ymm9,%ymm10,%ymm10
+	vpor	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm13,%ymm11,%ymm11
+	vmovdqa	128-128(%rax),%ymm13
+
+	vpslld	$5,%ymm0,%ymm7
+	vpaddd	%ymm15,%ymm4,%ymm4
+	vpxor	%ymm1,%ymm3,%ymm5
+	vmovdqa	%ymm10,32-128(%rax)
+	vpaddd	%ymm10,%ymm4,%ymm4
+	vpxor	320-256-128(%rbx),%ymm11,%ymm11
+	vpsrld	$27,%ymm0,%ymm8
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpxor	%ymm13,%ymm11,%ymm11
+
+	vpslld	$30,%ymm1,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm4,%ymm4
+	vpsrld	$31,%ymm11,%ymm9
+	vpaddd	%ymm11,%ymm11,%ymm11
+
+	vpsrld	$2,%ymm1,%ymm1
+	vpaddd	%ymm7,%ymm4,%ymm4
+	vpor	%ymm9,%ymm11,%ymm11
+	vpor	%ymm6,%ymm1,%ymm1
+	vpxor	%ymm14,%ymm12,%ymm12
+	vmovdqa	160-128(%rax),%ymm14
+
+	vpslld	$5,%ymm4,%ymm7
+	vpaddd	%ymm15,%ymm3,%ymm3
+	vpxor	%ymm0,%ymm2,%ymm5
+	vmovdqa	%ymm11,64-128(%rax)
+	vpaddd	%ymm11,%ymm3,%ymm3
+	vpxor	352-256-128(%rbx),%ymm12,%ymm12
+	vpsrld	$27,%ymm4,%ymm8
+	vpxor	%ymm1,%ymm5,%ymm5
+	vpxor	%ymm14,%ymm12,%ymm12
+
+	vpslld	$30,%ymm0,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm3,%ymm3
+	vpsrld	$31,%ymm12,%ymm9
+	vpaddd	%ymm12,%ymm12,%ymm12
+
+	vpsrld	$2,%ymm0,%ymm0
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpor	%ymm9,%ymm12,%ymm12
+	vpor	%ymm6,%ymm0,%ymm0
+	vpxor	%ymm10,%ymm13,%ymm13
+	vmovdqa	192-128(%rax),%ymm10
+
+	vpslld	$5,%ymm3,%ymm7
+	vpaddd	%ymm15,%ymm2,%ymm2
+	vpxor	%ymm4,%ymm1,%ymm5
+	vmovdqa	%ymm12,96-128(%rax)
+	vpaddd	%ymm12,%ymm2,%ymm2
+	vpxor	384-256-128(%rbx),%ymm13,%ymm13
+	vpsrld	$27,%ymm3,%ymm8
+	vpxor	%ymm0,%ymm5,%ymm5
+	vpxor	%ymm10,%ymm13,%ymm13
+
+	vpslld	$30,%ymm4,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm2,%ymm2
+	vpsrld	$31,%ymm13,%ymm9
+	vpaddd	%ymm13,%ymm13,%ymm13
+
+	vpsrld	$2,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm2,%ymm2
+	vpor	%ymm9,%ymm13,%ymm13
+	vpor	%ymm6,%ymm4,%ymm4
+	vpxor	%ymm11,%ymm14,%ymm14
+	vmovdqa	224-128(%rax),%ymm11
+
+	vpslld	$5,%ymm2,%ymm7
+	vpaddd	%ymm15,%ymm1,%ymm1
+	vpxor	%ymm3,%ymm0,%ymm5
+	vmovdqa	%ymm13,128-128(%rax)
+	vpaddd	%ymm13,%ymm1,%ymm1
+	vpxor	416-256-128(%rbx),%ymm14,%ymm14
+	vpsrld	$27,%ymm2,%ymm8
+	vpxor	%ymm4,%ymm5,%ymm5
+	vpxor	%ymm11,%ymm14,%ymm14
+
+	vpslld	$30,%ymm3,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpsrld	$31,%ymm14,%ymm9
+	vpaddd	%ymm14,%ymm14,%ymm14
+
+	vpsrld	$2,%ymm3,%ymm3
+	vpaddd	%ymm7,%ymm1,%ymm1
+	vpor	%ymm9,%ymm14,%ymm14
+	vpor	%ymm6,%ymm3,%ymm3
+	vpxor	%ymm12,%ymm10,%ymm10
+	vmovdqa	256-256-128(%rbx),%ymm12
+
+	vpslld	$5,%ymm1,%ymm7
+	vpaddd	%ymm15,%ymm0,%ymm0
+	vpxor	%ymm2,%ymm4,%ymm5
+	vmovdqa	%ymm14,160-128(%rax)
+	vpaddd	%ymm14,%ymm0,%ymm0
+	vpxor	448-256-128(%rbx),%ymm10,%ymm10
+	vpsrld	$27,%ymm1,%ymm8
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpxor	%ymm12,%ymm10,%ymm10
+
+	vpslld	$30,%ymm2,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm0,%ymm0
+	vpsrld	$31,%ymm10,%ymm9
+	vpaddd	%ymm10,%ymm10,%ymm10
+
+	vpsrld	$2,%ymm2,%ymm2
+	vpaddd	%ymm7,%ymm0,%ymm0
+	vpor	%ymm9,%ymm10,%ymm10
+	vpor	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm13,%ymm11,%ymm11
+	vmovdqa	288-256-128(%rbx),%ymm13
+
+	vpslld	$5,%ymm0,%ymm7
+	vpaddd	%ymm15,%ymm4,%ymm4
+	vpxor	%ymm1,%ymm3,%ymm5
+	vmovdqa	%ymm10,192-128(%rax)
+	vpaddd	%ymm10,%ymm4,%ymm4
+	vpxor	480-256-128(%rbx),%ymm11,%ymm11
+	vpsrld	$27,%ymm0,%ymm8
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpxor	%ymm13,%ymm11,%ymm11
+
+	vpslld	$30,%ymm1,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm4,%ymm4
+	vpsrld	$31,%ymm11,%ymm9
+	vpaddd	%ymm11,%ymm11,%ymm11
+
+	vpsrld	$2,%ymm1,%ymm1
+	vpaddd	%ymm7,%ymm4,%ymm4
+	vpor	%ymm9,%ymm11,%ymm11
+	vpor	%ymm6,%ymm1,%ymm1
+	vpxor	%ymm14,%ymm12,%ymm12
+	vmovdqa	320-256-128(%rbx),%ymm14
+
+	vpslld	$5,%ymm4,%ymm7
+	vpaddd	%ymm15,%ymm3,%ymm3
+	vpxor	%ymm0,%ymm2,%ymm5
+	vmovdqa	%ymm11,224-128(%rax)
+	vpaddd	%ymm11,%ymm3,%ymm3
+	vpxor	0-128(%rax),%ymm12,%ymm12
+	vpsrld	$27,%ymm4,%ymm8
+	vpxor	%ymm1,%ymm5,%ymm5
+	vpxor	%ymm14,%ymm12,%ymm12
+
+	vpslld	$30,%ymm0,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm3,%ymm3
+	vpsrld	$31,%ymm12,%ymm9
+	vpaddd	%ymm12,%ymm12,%ymm12
+
+	vpsrld	$2,%ymm0,%ymm0
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpor	%ymm9,%ymm12,%ymm12
+	vpor	%ymm6,%ymm0,%ymm0
+	vpxor	%ymm10,%ymm13,%ymm13
+	vmovdqa	352-256-128(%rbx),%ymm10
+
+	vpslld	$5,%ymm3,%ymm7
+	vpaddd	%ymm15,%ymm2,%ymm2
+	vpxor	%ymm4,%ymm1,%ymm5
+	vpaddd	%ymm12,%ymm2,%ymm2
+	vpxor	32-128(%rax),%ymm13,%ymm13
+	vpsrld	$27,%ymm3,%ymm8
+	vpxor	%ymm0,%ymm5,%ymm5
+	vpxor	%ymm10,%ymm13,%ymm13
+
+	vpslld	$30,%ymm4,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm2,%ymm2
+	vpsrld	$31,%ymm13,%ymm9
+	vpaddd	%ymm13,%ymm13,%ymm13
+
+	vpsrld	$2,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm2,%ymm2
+	vpor	%ymm9,%ymm13,%ymm13
+	vpor	%ymm6,%ymm4,%ymm4
+	vpxor	%ymm11,%ymm14,%ymm14
+	vmovdqa	384-256-128(%rbx),%ymm11
+
+	vpslld	$5,%ymm2,%ymm7
+	vpaddd	%ymm15,%ymm1,%ymm1
+	vpxor	%ymm3,%ymm0,%ymm5
+	vpaddd	%ymm13,%ymm1,%ymm1
+	vpxor	64-128(%rax),%ymm14,%ymm14
+	vpsrld	$27,%ymm2,%ymm8
+	vpxor	%ymm4,%ymm5,%ymm5
+	vpxor	%ymm11,%ymm14,%ymm14
+
+	vpslld	$30,%ymm3,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpsrld	$31,%ymm14,%ymm9
+	vpaddd	%ymm14,%ymm14,%ymm14
+
+	vpsrld	$2,%ymm3,%ymm3
+	vpaddd	%ymm7,%ymm1,%ymm1
+	vpor	%ymm9,%ymm14,%ymm14
+	vpor	%ymm6,%ymm3,%ymm3
+	vpxor	%ymm12,%ymm10,%ymm10
+	vmovdqa	416-256-128(%rbx),%ymm12
+
+	vpslld	$5,%ymm1,%ymm7
+	vpaddd	%ymm15,%ymm0,%ymm0
+	vpxor	%ymm2,%ymm4,%ymm5
+	vpaddd	%ymm14,%ymm0,%ymm0
+	vpxor	96-128(%rax),%ymm10,%ymm10
+	vpsrld	$27,%ymm1,%ymm8
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpxor	%ymm12,%ymm10,%ymm10
+
+	vpslld	$30,%ymm2,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm0,%ymm0
+	vpsrld	$31,%ymm10,%ymm9
+	vpaddd	%ymm10,%ymm10,%ymm10
+
+	vpsrld	$2,%ymm2,%ymm2
+	vpaddd	%ymm7,%ymm0,%ymm0
+	vpor	%ymm9,%ymm10,%ymm10
+	vpor	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm13,%ymm11,%ymm11
+	vmovdqa	448-256-128(%rbx),%ymm13
+
+	vpslld	$5,%ymm0,%ymm7
+	vpaddd	%ymm15,%ymm4,%ymm4
+	vpxor	%ymm1,%ymm3,%ymm5
+	vpaddd	%ymm10,%ymm4,%ymm4
+	vpxor	128-128(%rax),%ymm11,%ymm11
+	vpsrld	$27,%ymm0,%ymm8
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpxor	%ymm13,%ymm11,%ymm11
+
+	vpslld	$30,%ymm1,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm4,%ymm4
+	vpsrld	$31,%ymm11,%ymm9
+	vpaddd	%ymm11,%ymm11,%ymm11
+
+	vpsrld	$2,%ymm1,%ymm1
+	vpaddd	%ymm7,%ymm4,%ymm4
+	vpor	%ymm9,%ymm11,%ymm11
+	vpor	%ymm6,%ymm1,%ymm1
+	vpxor	%ymm14,%ymm12,%ymm12
+	vmovdqa	480-256-128(%rbx),%ymm14
+
+	vpslld	$5,%ymm4,%ymm7
+	vpaddd	%ymm15,%ymm3,%ymm3
+	vpxor	%ymm0,%ymm2,%ymm5
+	vpaddd	%ymm11,%ymm3,%ymm3
+	vpxor	160-128(%rax),%ymm12,%ymm12
+	vpsrld	$27,%ymm4,%ymm8
+	vpxor	%ymm1,%ymm5,%ymm5
+	vpxor	%ymm14,%ymm12,%ymm12
+
+	vpslld	$30,%ymm0,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm3,%ymm3
+	vpsrld	$31,%ymm12,%ymm9
+	vpaddd	%ymm12,%ymm12,%ymm12
+
+	vpsrld	$2,%ymm0,%ymm0
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpor	%ymm9,%ymm12,%ymm12
+	vpor	%ymm6,%ymm0,%ymm0
+	vpxor	%ymm10,%ymm13,%ymm13
+	vmovdqa	0-128(%rax),%ymm10
+
+	vpslld	$5,%ymm3,%ymm7
+	vpaddd	%ymm15,%ymm2,%ymm2
+	vpxor	%ymm4,%ymm1,%ymm5
+	vpaddd	%ymm12,%ymm2,%ymm2
+	vpxor	192-128(%rax),%ymm13,%ymm13
+	vpsrld	$27,%ymm3,%ymm8
+	vpxor	%ymm0,%ymm5,%ymm5
+	vpxor	%ymm10,%ymm13,%ymm13
+
+	vpslld	$30,%ymm4,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm2,%ymm2
+	vpsrld	$31,%ymm13,%ymm9
+	vpaddd	%ymm13,%ymm13,%ymm13
+
+	vpsrld	$2,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm2,%ymm2
+	vpor	%ymm9,%ymm13,%ymm13
+	vpor	%ymm6,%ymm4,%ymm4
+	vpxor	%ymm11,%ymm14,%ymm14
+	vmovdqa	32-128(%rax),%ymm11
+
+	vpslld	$5,%ymm2,%ymm7
+	vpaddd	%ymm15,%ymm1,%ymm1
+	vpxor	%ymm3,%ymm0,%ymm5
+	vpaddd	%ymm13,%ymm1,%ymm1
+	vpxor	224-128(%rax),%ymm14,%ymm14
+	vpsrld	$27,%ymm2,%ymm8
+	vpxor	%ymm4,%ymm5,%ymm5
+	vpxor	%ymm11,%ymm14,%ymm14
+
+	vpslld	$30,%ymm3,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpsrld	$31,%ymm14,%ymm9
+	vpaddd	%ymm14,%ymm14,%ymm14
+
+	vpsrld	$2,%ymm3,%ymm3
+	vpaddd	%ymm7,%ymm1,%ymm1
+	vpor	%ymm9,%ymm14,%ymm14
+	vpor	%ymm6,%ymm3,%ymm3
+	vpslld	$5,%ymm1,%ymm7
+	vpaddd	%ymm15,%ymm0,%ymm0
+	vpxor	%ymm2,%ymm4,%ymm5
+
+	vpsrld	$27,%ymm1,%ymm8
+	vpaddd	%ymm14,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm5,%ymm5
+
+	vpslld	$30,%ymm2,%ymm6
+	vpor	%ymm8,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm0,%ymm0
+
+	vpsrld	$2,%ymm2,%ymm2
+	vpaddd	%ymm7,%ymm0,%ymm0
+	vpor	%ymm6,%ymm2,%ymm2
+	movl	$1,%ecx
+	leaq	512(%rsp),%rbx
+	cmpl	0(%rbx),%ecx
+	cmovgeq	%rbp,%r12
+	cmpl	4(%rbx),%ecx
+	cmovgeq	%rbp,%r13
+	cmpl	8(%rbx),%ecx
+	cmovgeq	%rbp,%r14
+	cmpl	12(%rbx),%ecx
+	cmovgeq	%rbp,%r15
+	cmpl	16(%rbx),%ecx
+	cmovgeq	%rbp,%r8
+	cmpl	20(%rbx),%ecx
+	cmovgeq	%rbp,%r9
+	cmpl	24(%rbx),%ecx
+	cmovgeq	%rbp,%r10
+	cmpl	28(%rbx),%ecx
+	cmovgeq	%rbp,%r11
+	vmovdqu	(%rbx),%ymm5
+	vpxor	%ymm7,%ymm7,%ymm7
+	vmovdqa	%ymm5,%ymm6
+	vpcmpgtd	%ymm7,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm5,%ymm5
+
+	vpand	%ymm6,%ymm0,%ymm0
+	vpand	%ymm6,%ymm1,%ymm1
+	vpaddd	0(%rdi),%ymm0,%ymm0
+	vpand	%ymm6,%ymm2,%ymm2
+	vpaddd	32(%rdi),%ymm1,%ymm1
+	vpand	%ymm6,%ymm3,%ymm3
+	vpaddd	64(%rdi),%ymm2,%ymm2
+	vpand	%ymm6,%ymm4,%ymm4
+	vpaddd	96(%rdi),%ymm3,%ymm3
+	vpaddd	128(%rdi),%ymm4,%ymm4
+	vmovdqu	%ymm0,0(%rdi)
+	vmovdqu	%ymm1,32(%rdi)
+	vmovdqu	%ymm2,64(%rdi)
+	vmovdqu	%ymm3,96(%rdi)
+	vmovdqu	%ymm4,128(%rdi)
+
+	vmovdqu	%ymm5,(%rbx)
+	leaq	256+128(%rsp),%rbx
+	vmovdqu	96(%rbp),%ymm9
+	decl	%edx
+	jnz	.Loop_avx2
+
+
+
+
+
+
+
+.Ldone_avx2:
+	movq	544(%rsp),%rax
+	vzeroupper
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+.Lepilogue_avx2:
+	.byte	0xf3,0xc3
+.size	sha1_multi_block_avx2,.-sha1_multi_block_avx2
+
+.align	256
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999
+K_XX_XX:
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
+.byte	83,72,65,49,32,109,117,108,116,105,45,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0


Property changes on: trunk/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S
===================================================================
--- trunk/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S	                        (rev 0)
+++ trunk/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,7905 @@
+/* $MidnightBSD$ */
+/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S 305153 2016-08-31 20:33:59Z jkim $ */
+/* Do not modify. This file is auto-generated from sha256-mb-x86_64.pl. */
+.text	
+
+
+
+.globl	sha256_multi_block
+.type	sha256_multi_block,@function
+.align	32
+sha256_multi_block:
+	movq	OPENSSL_ia32cap_P+4(%rip),%rcx
+	btq	$61,%rcx
+	jc	_shaext_shortcut
+	testl	$268435456,%ecx
+	jnz	_avx_shortcut
+	movq	%rsp,%rax
+	pushq	%rbx
+	pushq	%rbp
+	subq	$288,%rsp
+	andq	$-256,%rsp
+	movq	%rax,272(%rsp)
+.Lbody:
+	leaq	K256+128(%rip),%rbp
+	leaq	256(%rsp),%rbx
+	leaq	128(%rdi),%rdi
+
+.Loop_grande:
+	movl	%edx,280(%rsp)
+	xorl	%edx,%edx
+	movq	0(%rsi),%r8
+	movl	8(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,0(%rbx)
+	cmovleq	%rbp,%r8
+	movq	16(%rsi),%r9
+	movl	24(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,4(%rbx)
+	cmovleq	%rbp,%r9
+	movq	32(%rsi),%r10
+	movl	40(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,8(%rbx)
+	cmovleq	%rbp,%r10
+	movq	48(%rsi),%r11
+	movl	56(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,12(%rbx)
+	cmovleq	%rbp,%r11
+	testl	%edx,%edx
+	jz	.Ldone
+
+	movdqu	0-128(%rdi),%xmm8
+	leaq	128(%rsp),%rax
+	movdqu	32-128(%rdi),%xmm9
+	movdqu	64-128(%rdi),%xmm10
+	movdqu	96-128(%rdi),%xmm11
+	movdqu	128-128(%rdi),%xmm12
+	movdqu	160-128(%rdi),%xmm13
+	movdqu	192-128(%rdi),%xmm14
+	movdqu	224-128(%rdi),%xmm15
+	movdqu	.Lpbswap(%rip),%xmm6
+	jmp	.Loop
+
+.align	32
+.Loop:
+	movdqa	%xmm10,%xmm4
+	pxor	%xmm9,%xmm4
+	movd	0(%r8),%xmm5
+	movd	0(%r9),%xmm0
+	movd	0(%r10),%xmm1
+	movd	0(%r11),%xmm2
+	punpckldq	%xmm1,%xmm5
+	punpckldq	%xmm2,%xmm0
+	punpckldq	%xmm0,%xmm5
+	movdqa	%xmm12,%xmm7
+.byte	102,15,56,0,238
+	movdqa	%xmm12,%xmm2
+
+	psrld	$6,%xmm7
+	movdqa	%xmm12,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm5,0-128(%rax)
+	paddd	%xmm15,%xmm5
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	-128(%rbp),%xmm5
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm12,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm12,%xmm3
+	pslld	$26-21,%xmm2
+	pandn	%xmm14,%xmm0
+	pand	%xmm13,%xmm3
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm8,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm8,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm9,%xmm3
+	movdqa	%xmm8,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm8,%xmm3
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm5
+	pslld	$19-10,%xmm2
+	pand	%xmm3,%xmm4
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm9,%xmm15
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm4,%xmm15
+	paddd	%xmm5,%xmm11
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm5,%xmm15
+	paddd	%xmm7,%xmm15
+	movd	4(%r8),%xmm5
+	movd	4(%r9),%xmm0
+	movd	4(%r10),%xmm1
+	movd	4(%r11),%xmm2
+	punpckldq	%xmm1,%xmm5
+	punpckldq	%xmm2,%xmm0
+	punpckldq	%xmm0,%xmm5
+	movdqa	%xmm11,%xmm7
+
+	movdqa	%xmm11,%xmm2
+.byte	102,15,56,0,238
+	psrld	$6,%xmm7
+	movdqa	%xmm11,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm5,16-128(%rax)
+	paddd	%xmm14,%xmm5
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	-96(%rbp),%xmm5
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm11,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm11,%xmm4
+	pslld	$26-21,%xmm2
+	pandn	%xmm13,%xmm0
+	pand	%xmm12,%xmm4
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm15,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm15,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm8,%xmm4
+	movdqa	%xmm15,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm15,%xmm4
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm5
+	pslld	$19-10,%xmm2
+	pand	%xmm4,%xmm3
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm8,%xmm14
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm3,%xmm14
+	paddd	%xmm5,%xmm10
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm5,%xmm14
+	paddd	%xmm7,%xmm14
+	movd	8(%r8),%xmm5
+	movd	8(%r9),%xmm0
+	movd	8(%r10),%xmm1
+	movd	8(%r11),%xmm2
+	punpckldq	%xmm1,%xmm5
+	punpckldq	%xmm2,%xmm0
+	punpckldq	%xmm0,%xmm5
+	movdqa	%xmm10,%xmm7
+.byte	102,15,56,0,238
+	movdqa	%xmm10,%xmm2
+
+	psrld	$6,%xmm7
+	movdqa	%xmm10,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm5,32-128(%rax)
+	paddd	%xmm13,%xmm5
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	-64(%rbp),%xmm5
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm10,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm10,%xmm3
+	pslld	$26-21,%xmm2
+	pandn	%xmm12,%xmm0
+	pand	%xmm11,%xmm3
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm14,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm14,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm15,%xmm3
+	movdqa	%xmm14,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm14,%xmm3
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm5
+	pslld	$19-10,%xmm2
+	pand	%xmm3,%xmm4
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm15,%xmm13
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm4,%xmm13
+	paddd	%xmm5,%xmm9
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm5,%xmm13
+	paddd	%xmm7,%xmm13
+	movd	12(%r8),%xmm5
+	movd	12(%r9),%xmm0
+	movd	12(%r10),%xmm1
+	movd	12(%r11),%xmm2
+	punpckldq	%xmm1,%xmm5
+	punpckldq	%xmm2,%xmm0
+	punpckldq	%xmm0,%xmm5
+	movdqa	%xmm9,%xmm7
+
+	movdqa	%xmm9,%xmm2
+.byte	102,15,56,0,238
+	psrld	$6,%xmm7
+	movdqa	%xmm9,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm5,48-128(%rax)
+	paddd	%xmm12,%xmm5
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	-32(%rbp),%xmm5
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm9,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm9,%xmm4
+	pslld	$26-21,%xmm2
+	pandn	%xmm11,%xmm0
+	pand	%xmm10,%xmm4
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm13,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm13,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm14,%xmm4
+	movdqa	%xmm13,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm13,%xmm4
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm5
+	pslld	$19-10,%xmm2
+	pand	%xmm4,%xmm3
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm14,%xmm12
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm3,%xmm12
+	paddd	%xmm5,%xmm8
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm5,%xmm12
+	paddd	%xmm7,%xmm12
+	movd	16(%r8),%xmm5
+	movd	16(%r9),%xmm0
+	movd	16(%r10),%xmm1
+	movd	16(%r11),%xmm2
+	punpckldq	%xmm1,%xmm5
+	punpckldq	%xmm2,%xmm0
+	punpckldq	%xmm0,%xmm5
+	movdqa	%xmm8,%xmm7
+.byte	102,15,56,0,238
+	movdqa	%xmm8,%xmm2
+
+	psrld	$6,%xmm7
+	movdqa	%xmm8,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm5,64-128(%rax)
+	paddd	%xmm11,%xmm5
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	0(%rbp),%xmm5
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm8,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm8,%xmm3
+	pslld	$26-21,%xmm2
+	pandn	%xmm10,%xmm0
+	pand	%xmm9,%xmm3
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm12,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm12,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm13,%xmm3
+	movdqa	%xmm12,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm12,%xmm3
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm5
+	pslld	$19-10,%xmm2
+	pand	%xmm3,%xmm4
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm13,%xmm11
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm4,%xmm11
+	paddd	%xmm5,%xmm15
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm5,%xmm11
+	paddd	%xmm7,%xmm11
+	movd	20(%r8),%xmm5
+	movd	20(%r9),%xmm0
+	movd	20(%r10),%xmm1
+	movd	20(%r11),%xmm2
+	punpckldq	%xmm1,%xmm5
+	punpckldq	%xmm2,%xmm0
+	punpckldq	%xmm0,%xmm5
+	movdqa	%xmm15,%xmm7
+
+	movdqa	%xmm15,%xmm2
+.byte	102,15,56,0,238
+	psrld	$6,%xmm7
+	movdqa	%xmm15,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm5,80-128(%rax)
+	paddd	%xmm10,%xmm5
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	32(%rbp),%xmm5
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm15,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm15,%xmm4
+	pslld	$26-21,%xmm2
+	pandn	%xmm9,%xmm0
+	pand	%xmm8,%xmm4
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm11,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm11,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm12,%xmm4
+	movdqa	%xmm11,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm11,%xmm4
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm5
+	pslld	$19-10,%xmm2
+	pand	%xmm4,%xmm3
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm12,%xmm10
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm3,%xmm10
+	paddd	%xmm5,%xmm14
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm5,%xmm10
+	paddd	%xmm7,%xmm10
+	movd	24(%r8),%xmm5
+	movd	24(%r9),%xmm0
+	movd	24(%r10),%xmm1
+	movd	24(%r11),%xmm2
+	punpckldq	%xmm1,%xmm5
+	punpckldq	%xmm2,%xmm0
+	punpckldq	%xmm0,%xmm5
+	movdqa	%xmm14,%xmm7
+.byte	102,15,56,0,238
+	movdqa	%xmm14,%xmm2
+
+	psrld	$6,%xmm7
+	movdqa	%xmm14,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm5,96-128(%rax)
+	paddd	%xmm9,%xmm5
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	64(%rbp),%xmm5
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm14,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm14,%xmm3
+	pslld	$26-21,%xmm2
+	pandn	%xmm8,%xmm0
+	pand	%xmm15,%xmm3
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm10,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm10,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm11,%xmm3
+	movdqa	%xmm10,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm10,%xmm3
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm5
+	pslld	$19-10,%xmm2
+	pand	%xmm3,%xmm4
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm11,%xmm9
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm4,%xmm9
+	paddd	%xmm5,%xmm13
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm5,%xmm9
+	paddd	%xmm7,%xmm9
+	movd	28(%r8),%xmm5
+	movd	28(%r9),%xmm0
+	movd	28(%r10),%xmm1
+	movd	28(%r11),%xmm2
+	punpckldq	%xmm1,%xmm5
+	punpckldq	%xmm2,%xmm0
+	punpckldq	%xmm0,%xmm5
+	movdqa	%xmm13,%xmm7
+
+	movdqa	%xmm13,%xmm2
+.byte	102,15,56,0,238
+	psrld	$6,%xmm7
+	movdqa	%xmm13,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm5,112-128(%rax)
+	paddd	%xmm8,%xmm5
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	96(%rbp),%xmm5
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm13,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm13,%xmm4
+	pslld	$26-21,%xmm2
+	pandn	%xmm15,%xmm0
+	pand	%xmm14,%xmm4
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm9,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm9,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm10,%xmm4
+	movdqa	%xmm9,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm9,%xmm4
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm5
+	pslld	$19-10,%xmm2
+	pand	%xmm4,%xmm3
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm10,%xmm8
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm3,%xmm8
+	paddd	%xmm5,%xmm12
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm5,%xmm8
+	paddd	%xmm7,%xmm8
+	leaq	256(%rbp),%rbp
+	movd	32(%r8),%xmm5
+	movd	32(%r9),%xmm0
+	movd	32(%r10),%xmm1
+	movd	32(%r11),%xmm2
+	punpckldq	%xmm1,%xmm5
+	punpckldq	%xmm2,%xmm0
+	punpckldq	%xmm0,%xmm5
+	movdqa	%xmm12,%xmm7
+.byte	102,15,56,0,238
+	movdqa	%xmm12,%xmm2
+
+	psrld	$6,%xmm7
+	movdqa	%xmm12,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm5,128-128(%rax)
+	paddd	%xmm15,%xmm5
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	-128(%rbp),%xmm5
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm12,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm12,%xmm3
+	pslld	$26-21,%xmm2
+	pandn	%xmm14,%xmm0
+	pand	%xmm13,%xmm3
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm8,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm8,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm9,%xmm3
+	movdqa	%xmm8,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm8,%xmm3
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm5
+	pslld	$19-10,%xmm2
+	pand	%xmm3,%xmm4
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm9,%xmm15
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm4,%xmm15
+	paddd	%xmm5,%xmm11
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm5,%xmm15
+	paddd	%xmm7,%xmm15
+	movd	36(%r8),%xmm5
+	movd	36(%r9),%xmm0
+	movd	36(%r10),%xmm1
+	movd	36(%r11),%xmm2
+	punpckldq	%xmm1,%xmm5
+	punpckldq	%xmm2,%xmm0
+	punpckldq	%xmm0,%xmm5
+	movdqa	%xmm11,%xmm7
+
+	movdqa	%xmm11,%xmm2
+.byte	102,15,56,0,238
+	psrld	$6,%xmm7
+	movdqa	%xmm11,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm5,144-128(%rax)
+	paddd	%xmm14,%xmm5
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	-96(%rbp),%xmm5
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm11,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm11,%xmm4
+	pslld	$26-21,%xmm2
+	pandn	%xmm13,%xmm0
+	pand	%xmm12,%xmm4
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm15,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm15,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm8,%xmm4
+	movdqa	%xmm15,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm15,%xmm4
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm5
+	pslld	$19-10,%xmm2
+	pand	%xmm4,%xmm3
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm8,%xmm14
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm3,%xmm14
+	paddd	%xmm5,%xmm10
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm5,%xmm14
+	paddd	%xmm7,%xmm14
+	movd	40(%r8),%xmm5
+	movd	40(%r9),%xmm0
+	movd	40(%r10),%xmm1
+	movd	40(%r11),%xmm2
+	punpckldq	%xmm1,%xmm5
+	punpckldq	%xmm2,%xmm0
+	punpckldq	%xmm0,%xmm5
+	movdqa	%xmm10,%xmm7
+.byte	102,15,56,0,238
+	movdqa	%xmm10,%xmm2
+
+	psrld	$6,%xmm7
+	movdqa	%xmm10,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm5,160-128(%rax)
+	paddd	%xmm13,%xmm5
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	-64(%rbp),%xmm5
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm10,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm10,%xmm3
+	pslld	$26-21,%xmm2
+	pandn	%xmm12,%xmm0
+	pand	%xmm11,%xmm3
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm14,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm14,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm15,%xmm3
+	movdqa	%xmm14,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm14,%xmm3
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm5
+	pslld	$19-10,%xmm2
+	pand	%xmm3,%xmm4
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm15,%xmm13
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm4,%xmm13
+	paddd	%xmm5,%xmm9
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm5,%xmm13
+	paddd	%xmm7,%xmm13
+	movd	44(%r8),%xmm5
+	movd	44(%r9),%xmm0
+	movd	44(%r10),%xmm1
+	movd	44(%r11),%xmm2
+	punpckldq	%xmm1,%xmm5
+	punpckldq	%xmm2,%xmm0
+	punpckldq	%xmm0,%xmm5
+	movdqa	%xmm9,%xmm7
+
+	movdqa	%xmm9,%xmm2
+.byte	102,15,56,0,238
+	psrld	$6,%xmm7
+	movdqa	%xmm9,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm5,176-128(%rax)
+	paddd	%xmm12,%xmm5
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	-32(%rbp),%xmm5
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm9,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm9,%xmm4
+	pslld	$26-21,%xmm2
+	pandn	%xmm11,%xmm0
+	pand	%xmm10,%xmm4
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm13,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm13,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm14,%xmm4
+	movdqa	%xmm13,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm13,%xmm4
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm5
+	pslld	$19-10,%xmm2
+	pand	%xmm4,%xmm3
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm14,%xmm12
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm3,%xmm12
+	paddd	%xmm5,%xmm8
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm5,%xmm12
+	paddd	%xmm7,%xmm12
+	movd	48(%r8),%xmm5
+	movd	48(%r9),%xmm0
+	movd	48(%r10),%xmm1
+	movd	48(%r11),%xmm2
+	punpckldq	%xmm1,%xmm5
+	punpckldq	%xmm2,%xmm0
+	punpckldq	%xmm0,%xmm5
+	movdqa	%xmm8,%xmm7
+.byte	102,15,56,0,238
+	movdqa	%xmm8,%xmm2
+
+	psrld	$6,%xmm7
+	movdqa	%xmm8,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm5,192-128(%rax)
+	paddd	%xmm11,%xmm5
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	0(%rbp),%xmm5
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm8,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm8,%xmm3
+	pslld	$26-21,%xmm2
+	pandn	%xmm10,%xmm0
+	pand	%xmm9,%xmm3
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm12,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm12,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm13,%xmm3
+	movdqa	%xmm12,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm12,%xmm3
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm5
+	pslld	$19-10,%xmm2
+	pand	%xmm3,%xmm4
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm13,%xmm11
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm4,%xmm11
+	paddd	%xmm5,%xmm15
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm5,%xmm11
+	paddd	%xmm7,%xmm11
+	movd	52(%r8),%xmm5
+	movd	52(%r9),%xmm0
+	movd	52(%r10),%xmm1
+	movd	52(%r11),%xmm2
+	punpckldq	%xmm1,%xmm5
+	punpckldq	%xmm2,%xmm0
+	punpckldq	%xmm0,%xmm5
+	movdqa	%xmm15,%xmm7
+
+	movdqa	%xmm15,%xmm2
+.byte	102,15,56,0,238
+	psrld	$6,%xmm7
+	movdqa	%xmm15,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm5,208-128(%rax)
+	paddd	%xmm10,%xmm5
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	32(%rbp),%xmm5
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm15,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm15,%xmm4
+	pslld	$26-21,%xmm2
+	pandn	%xmm9,%xmm0
+	pand	%xmm8,%xmm4
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm11,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm11,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm12,%xmm4
+	movdqa	%xmm11,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm11,%xmm4
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm5
+	pslld	$19-10,%xmm2
+	pand	%xmm4,%xmm3
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm12,%xmm10
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm3,%xmm10
+	paddd	%xmm5,%xmm14
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm5,%xmm10
+	paddd	%xmm7,%xmm10
+	movd	56(%r8),%xmm5
+	movd	56(%r9),%xmm0
+	movd	56(%r10),%xmm1
+	movd	56(%r11),%xmm2
+	punpckldq	%xmm1,%xmm5
+	punpckldq	%xmm2,%xmm0
+	punpckldq	%xmm0,%xmm5
+	movdqa	%xmm14,%xmm7
+.byte	102,15,56,0,238
+	movdqa	%xmm14,%xmm2
+
+	psrld	$6,%xmm7
+	movdqa	%xmm14,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm5,224-128(%rax)
+	paddd	%xmm9,%xmm5
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	64(%rbp),%xmm5
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm14,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm14,%xmm3
+	pslld	$26-21,%xmm2
+	pandn	%xmm8,%xmm0
+	pand	%xmm15,%xmm3
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm10,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm10,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm11,%xmm3
+	movdqa	%xmm10,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm10,%xmm3
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm5
+	pslld	$19-10,%xmm2
+	pand	%xmm3,%xmm4
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm11,%xmm9
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm4,%xmm9
+	paddd	%xmm5,%xmm13
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm5,%xmm9
+	paddd	%xmm7,%xmm9
+	movd	60(%r8),%xmm5
+	leaq	64(%r8),%r8
+	movd	60(%r9),%xmm0
+	leaq	64(%r9),%r9
+	movd	60(%r10),%xmm1
+	leaq	64(%r10),%r10
+	movd	60(%r11),%xmm2
+	leaq	64(%r11),%r11
+	punpckldq	%xmm1,%xmm5
+	punpckldq	%xmm2,%xmm0
+	punpckldq	%xmm0,%xmm5
+	movdqa	%xmm13,%xmm7
+
+	movdqa	%xmm13,%xmm2
+.byte	102,15,56,0,238
+	psrld	$6,%xmm7
+	movdqa	%xmm13,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm5,240-128(%rax)
+	paddd	%xmm8,%xmm5
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	96(%rbp),%xmm5
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm13,%xmm0
+	prefetcht0	63(%r8)
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm13,%xmm4
+	pslld	$26-21,%xmm2
+	pandn	%xmm15,%xmm0
+	pand	%xmm14,%xmm4
+	pxor	%xmm1,%xmm7
+
+	prefetcht0	63(%r9)
+	movdqa	%xmm9,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm9,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm10,%xmm4
+	movdqa	%xmm9,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm9,%xmm4
+
+	prefetcht0	63(%r10)
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm5
+	pslld	$19-10,%xmm2
+	pand	%xmm4,%xmm3
+	pxor	%xmm7,%xmm1
+
+	prefetcht0	63(%r11)
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm10,%xmm8
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm3,%xmm8
+	paddd	%xmm5,%xmm12
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm5,%xmm8
+	paddd	%xmm7,%xmm8
+	leaq	256(%rbp),%rbp
+	movdqu	0-128(%rax),%xmm5
+	movl	$3,%ecx
+	jmp	.Loop_16_xx
+.align	32
+.Loop_16_xx:
+	movdqa	16-128(%rax),%xmm6
+	paddd	144-128(%rax),%xmm5
+
+	movdqa	%xmm6,%xmm7
+	movdqa	%xmm6,%xmm1
+	psrld	$3,%xmm7
+	movdqa	%xmm6,%xmm2
+
+	psrld	$7,%xmm1
+	movdqa	224-128(%rax),%xmm0
+	pslld	$14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$18-7,%xmm1
+	movdqa	%xmm0,%xmm3
+	pxor	%xmm2,%xmm7
+	pslld	$25-14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$10,%xmm0
+	movdqa	%xmm3,%xmm1
+
+	psrld	$17,%xmm3
+	pxor	%xmm2,%xmm7
+	pslld	$13,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm3,%xmm0
+	psrld	$19-17,%xmm3
+	pxor	%xmm1,%xmm0
+	pslld	$15-13,%xmm1
+	pxor	%xmm3,%xmm0
+	pxor	%xmm1,%xmm0
+	paddd	%xmm0,%xmm5
+	movdqa	%xmm12,%xmm7
+
+	movdqa	%xmm12,%xmm2
+
+	psrld	$6,%xmm7
+	movdqa	%xmm12,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm5,0-128(%rax)
+	paddd	%xmm15,%xmm5
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	-128(%rbp),%xmm5
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm12,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm12,%xmm3
+	pslld	$26-21,%xmm2
+	pandn	%xmm14,%xmm0
+	pand	%xmm13,%xmm3
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm8,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm8,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm9,%xmm3
+	movdqa	%xmm8,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm8,%xmm3
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm5
+	pslld	$19-10,%xmm2
+	pand	%xmm3,%xmm4
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm9,%xmm15
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm4,%xmm15
+	paddd	%xmm5,%xmm11
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm5,%xmm15
+	paddd	%xmm7,%xmm15
+	movdqa	32-128(%rax),%xmm5
+	paddd	160-128(%rax),%xmm6
+
+	movdqa	%xmm5,%xmm7
+	movdqa	%xmm5,%xmm1
+	psrld	$3,%xmm7
+	movdqa	%xmm5,%xmm2
+
+	psrld	$7,%xmm1
+	movdqa	240-128(%rax),%xmm0
+	pslld	$14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$18-7,%xmm1
+	movdqa	%xmm0,%xmm4
+	pxor	%xmm2,%xmm7
+	pslld	$25-14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$10,%xmm0
+	movdqa	%xmm4,%xmm1
+
+	psrld	$17,%xmm4
+	pxor	%xmm2,%xmm7
+	pslld	$13,%xmm1
+	paddd	%xmm7,%xmm6
+	pxor	%xmm4,%xmm0
+	psrld	$19-17,%xmm4
+	pxor	%xmm1,%xmm0
+	pslld	$15-13,%xmm1
+	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm0
+	paddd	%xmm0,%xmm6
+	movdqa	%xmm11,%xmm7
+
+	movdqa	%xmm11,%xmm2
+
+	psrld	$6,%xmm7
+	movdqa	%xmm11,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm6,16-128(%rax)
+	paddd	%xmm14,%xmm6
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	-96(%rbp),%xmm6
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm11,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm11,%xmm4
+	pslld	$26-21,%xmm2
+	pandn	%xmm13,%xmm0
+	pand	%xmm12,%xmm4
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm15,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm15,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm6
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm8,%xmm4
+	movdqa	%xmm15,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm15,%xmm4
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm6
+	pslld	$19-10,%xmm2
+	pand	%xmm4,%xmm3
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm8,%xmm14
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm3,%xmm14
+	paddd	%xmm6,%xmm10
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm6,%xmm14
+	paddd	%xmm7,%xmm14
+	movdqa	48-128(%rax),%xmm6
+	paddd	176-128(%rax),%xmm5
+
+	movdqa	%xmm6,%xmm7
+	movdqa	%xmm6,%xmm1
+	psrld	$3,%xmm7
+	movdqa	%xmm6,%xmm2
+
+	psrld	$7,%xmm1
+	movdqa	0-128(%rax),%xmm0
+	pslld	$14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$18-7,%xmm1
+	movdqa	%xmm0,%xmm3
+	pxor	%xmm2,%xmm7
+	pslld	$25-14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$10,%xmm0
+	movdqa	%xmm3,%xmm1
+
+	psrld	$17,%xmm3
+	pxor	%xmm2,%xmm7
+	pslld	$13,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm3,%xmm0
+	psrld	$19-17,%xmm3
+	pxor	%xmm1,%xmm0
+	pslld	$15-13,%xmm1
+	pxor	%xmm3,%xmm0
+	pxor	%xmm1,%xmm0
+	paddd	%xmm0,%xmm5
+	movdqa	%xmm10,%xmm7
+
+	movdqa	%xmm10,%xmm2
+
+	psrld	$6,%xmm7
+	movdqa	%xmm10,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm5,32-128(%rax)
+	paddd	%xmm13,%xmm5
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	-64(%rbp),%xmm5
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm10,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm10,%xmm3
+	pslld	$26-21,%xmm2
+	pandn	%xmm12,%xmm0
+	pand	%xmm11,%xmm3
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm14,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm14,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm15,%xmm3
+	movdqa	%xmm14,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm14,%xmm3
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm5
+	pslld	$19-10,%xmm2
+	pand	%xmm3,%xmm4
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm15,%xmm13
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm4,%xmm13
+	paddd	%xmm5,%xmm9
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm5,%xmm13
+	paddd	%xmm7,%xmm13
+	movdqa	64-128(%rax),%xmm5
+	paddd	192-128(%rax),%xmm6
+
+	movdqa	%xmm5,%xmm7
+	movdqa	%xmm5,%xmm1
+	psrld	$3,%xmm7
+	movdqa	%xmm5,%xmm2
+
+	psrld	$7,%xmm1
+	movdqa	16-128(%rax),%xmm0
+	pslld	$14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$18-7,%xmm1
+	movdqa	%xmm0,%xmm4
+	pxor	%xmm2,%xmm7
+	pslld	$25-14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$10,%xmm0
+	movdqa	%xmm4,%xmm1
+
+	psrld	$17,%xmm4
+	pxor	%xmm2,%xmm7
+	pslld	$13,%xmm1
+	paddd	%xmm7,%xmm6
+	pxor	%xmm4,%xmm0
+	psrld	$19-17,%xmm4
+	pxor	%xmm1,%xmm0
+	pslld	$15-13,%xmm1
+	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm0
+	paddd	%xmm0,%xmm6
+	movdqa	%xmm9,%xmm7
+
+	movdqa	%xmm9,%xmm2
+
+	psrld	$6,%xmm7
+	movdqa	%xmm9,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm6,48-128(%rax)
+	paddd	%xmm12,%xmm6
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	-32(%rbp),%xmm6
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm9,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm9,%xmm4
+	pslld	$26-21,%xmm2
+	pandn	%xmm11,%xmm0
+	pand	%xmm10,%xmm4
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm13,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm13,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm6
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm14,%xmm4
+	movdqa	%xmm13,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm13,%xmm4
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm6
+	pslld	$19-10,%xmm2
+	pand	%xmm4,%xmm3
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm14,%xmm12
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm3,%xmm12
+	paddd	%xmm6,%xmm8
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm6,%xmm12
+	paddd	%xmm7,%xmm12
+	movdqa	80-128(%rax),%xmm6
+	paddd	208-128(%rax),%xmm5
+
+	movdqa	%xmm6,%xmm7
+	movdqa	%xmm6,%xmm1
+	psrld	$3,%xmm7
+	movdqa	%xmm6,%xmm2
+
+	psrld	$7,%xmm1
+	movdqa	32-128(%rax),%xmm0
+	pslld	$14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$18-7,%xmm1
+	movdqa	%xmm0,%xmm3
+	pxor	%xmm2,%xmm7
+	pslld	$25-14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$10,%xmm0
+	movdqa	%xmm3,%xmm1
+
+	psrld	$17,%xmm3
+	pxor	%xmm2,%xmm7
+	pslld	$13,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm3,%xmm0
+	psrld	$19-17,%xmm3
+	pxor	%xmm1,%xmm0
+	pslld	$15-13,%xmm1
+	pxor	%xmm3,%xmm0
+	pxor	%xmm1,%xmm0
+	paddd	%xmm0,%xmm5
+	movdqa	%xmm8,%xmm7
+
+	movdqa	%xmm8,%xmm2
+
+	psrld	$6,%xmm7
+	movdqa	%xmm8,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm5,64-128(%rax)
+	paddd	%xmm11,%xmm5
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	0(%rbp),%xmm5
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm8,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm8,%xmm3
+	pslld	$26-21,%xmm2
+	pandn	%xmm10,%xmm0
+	pand	%xmm9,%xmm3
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm12,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm12,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm13,%xmm3
+	movdqa	%xmm12,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm12,%xmm3
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm5
+	pslld	$19-10,%xmm2
+	pand	%xmm3,%xmm4
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm13,%xmm11
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm4,%xmm11
+	paddd	%xmm5,%xmm15
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm5,%xmm11
+	paddd	%xmm7,%xmm11
+	movdqa	96-128(%rax),%xmm5
+	paddd	224-128(%rax),%xmm6
+
+	movdqa	%xmm5,%xmm7
+	movdqa	%xmm5,%xmm1
+	psrld	$3,%xmm7
+	movdqa	%xmm5,%xmm2
+
+	psrld	$7,%xmm1
+	movdqa	48-128(%rax),%xmm0
+	pslld	$14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$18-7,%xmm1
+	movdqa	%xmm0,%xmm4
+	pxor	%xmm2,%xmm7
+	pslld	$25-14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$10,%xmm0
+	movdqa	%xmm4,%xmm1
+
+	psrld	$17,%xmm4
+	pxor	%xmm2,%xmm7
+	pslld	$13,%xmm1
+	paddd	%xmm7,%xmm6
+	pxor	%xmm4,%xmm0
+	psrld	$19-17,%xmm4
+	pxor	%xmm1,%xmm0
+	pslld	$15-13,%xmm1
+	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm0
+	paddd	%xmm0,%xmm6
+	movdqa	%xmm15,%xmm7
+
+	movdqa	%xmm15,%xmm2
+
+	psrld	$6,%xmm7
+	movdqa	%xmm15,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm6,80-128(%rax)
+	paddd	%xmm10,%xmm6
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	32(%rbp),%xmm6
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm15,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm15,%xmm4
+	pslld	$26-21,%xmm2
+	pandn	%xmm9,%xmm0
+	pand	%xmm8,%xmm4
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm11,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm11,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm6
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm12,%xmm4
+	movdqa	%xmm11,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm11,%xmm4
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm6
+	pslld	$19-10,%xmm2
+	pand	%xmm4,%xmm3
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm12,%xmm10
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm3,%xmm10
+	paddd	%xmm6,%xmm14
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm6,%xmm10
+	paddd	%xmm7,%xmm10
+	movdqa	112-128(%rax),%xmm6
+	paddd	240-128(%rax),%xmm5
+
+	movdqa	%xmm6,%xmm7
+	movdqa	%xmm6,%xmm1
+	psrld	$3,%xmm7
+	movdqa	%xmm6,%xmm2
+
+	psrld	$7,%xmm1
+	movdqa	64-128(%rax),%xmm0
+	pslld	$14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$18-7,%xmm1
+	movdqa	%xmm0,%xmm3
+	pxor	%xmm2,%xmm7
+	pslld	$25-14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$10,%xmm0
+	movdqa	%xmm3,%xmm1
+
+	psrld	$17,%xmm3
+	pxor	%xmm2,%xmm7
+	pslld	$13,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm3,%xmm0
+	psrld	$19-17,%xmm3
+	pxor	%xmm1,%xmm0
+	pslld	$15-13,%xmm1
+	pxor	%xmm3,%xmm0
+	pxor	%xmm1,%xmm0
+	paddd	%xmm0,%xmm5
+	movdqa	%xmm14,%xmm7
+
+	movdqa	%xmm14,%xmm2
+
+	psrld	$6,%xmm7
+	movdqa	%xmm14,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm5,96-128(%rax)
+	paddd	%xmm9,%xmm5
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	64(%rbp),%xmm5
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm14,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm14,%xmm3
+	pslld	$26-21,%xmm2
+	pandn	%xmm8,%xmm0
+	pand	%xmm15,%xmm3
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm10,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm10,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm11,%xmm3
+	movdqa	%xmm10,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm10,%xmm3
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm5
+	pslld	$19-10,%xmm2
+	pand	%xmm3,%xmm4
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm11,%xmm9
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm4,%xmm9
+	paddd	%xmm5,%xmm13
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm5,%xmm9
+	paddd	%xmm7,%xmm9
+	movdqa	128-128(%rax),%xmm5
+	paddd	0-128(%rax),%xmm6
+
+	movdqa	%xmm5,%xmm7
+	movdqa	%xmm5,%xmm1
+	psrld	$3,%xmm7
+	movdqa	%xmm5,%xmm2
+
+	psrld	$7,%xmm1
+	movdqa	80-128(%rax),%xmm0
+	pslld	$14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$18-7,%xmm1
+	movdqa	%xmm0,%xmm4
+	pxor	%xmm2,%xmm7
+	pslld	$25-14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$10,%xmm0
+	movdqa	%xmm4,%xmm1
+
+	psrld	$17,%xmm4
+	pxor	%xmm2,%xmm7
+	pslld	$13,%xmm1
+	paddd	%xmm7,%xmm6
+	pxor	%xmm4,%xmm0
+	psrld	$19-17,%xmm4
+	pxor	%xmm1,%xmm0
+	pslld	$15-13,%xmm1
+	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm0
+	paddd	%xmm0,%xmm6
+	movdqa	%xmm13,%xmm7
+
+	movdqa	%xmm13,%xmm2
+
+	psrld	$6,%xmm7
+	movdqa	%xmm13,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm6,112-128(%rax)
+	paddd	%xmm8,%xmm6
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	96(%rbp),%xmm6
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm13,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm13,%xmm4
+	pslld	$26-21,%xmm2
+	pandn	%xmm15,%xmm0
+	pand	%xmm14,%xmm4
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm9,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm9,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm6
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm10,%xmm4
+	movdqa	%xmm9,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm9,%xmm4
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm6
+	pslld	$19-10,%xmm2
+	pand	%xmm4,%xmm3
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm10,%xmm8
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm3,%xmm8
+	paddd	%xmm6,%xmm12
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm6,%xmm8
+	paddd	%xmm7,%xmm8
+	leaq	256(%rbp),%rbp
+	movdqa	144-128(%rax),%xmm6
+	paddd	16-128(%rax),%xmm5
+
+	movdqa	%xmm6,%xmm7
+	movdqa	%xmm6,%xmm1
+	psrld	$3,%xmm7
+	movdqa	%xmm6,%xmm2
+
+	psrld	$7,%xmm1
+	movdqa	96-128(%rax),%xmm0
+	pslld	$14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$18-7,%xmm1
+	movdqa	%xmm0,%xmm3
+	pxor	%xmm2,%xmm7
+	pslld	$25-14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$10,%xmm0
+	movdqa	%xmm3,%xmm1
+
+	psrld	$17,%xmm3
+	pxor	%xmm2,%xmm7
+	pslld	$13,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm3,%xmm0
+	psrld	$19-17,%xmm3
+	pxor	%xmm1,%xmm0
+	pslld	$15-13,%xmm1
+	pxor	%xmm3,%xmm0
+	pxor	%xmm1,%xmm0
+	paddd	%xmm0,%xmm5
+	movdqa	%xmm12,%xmm7
+
+	movdqa	%xmm12,%xmm2
+
+	psrld	$6,%xmm7
+	movdqa	%xmm12,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm5,128-128(%rax)
+	paddd	%xmm15,%xmm5
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	-128(%rbp),%xmm5
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm12,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm12,%xmm3
+	pslld	$26-21,%xmm2
+	pandn	%xmm14,%xmm0
+	pand	%xmm13,%xmm3
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm8,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm8,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm9,%xmm3
+	movdqa	%xmm8,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm8,%xmm3
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm5
+	pslld	$19-10,%xmm2
+	pand	%xmm3,%xmm4
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm9,%xmm15
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm4,%xmm15
+	paddd	%xmm5,%xmm11
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm5,%xmm15
+	paddd	%xmm7,%xmm15
+	movdqa	160-128(%rax),%xmm5
+	paddd	32-128(%rax),%xmm6
+
+	movdqa	%xmm5,%xmm7
+	movdqa	%xmm5,%xmm1
+	psrld	$3,%xmm7
+	movdqa	%xmm5,%xmm2
+
+	psrld	$7,%xmm1
+	movdqa	112-128(%rax),%xmm0
+	pslld	$14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$18-7,%xmm1
+	movdqa	%xmm0,%xmm4
+	pxor	%xmm2,%xmm7
+	pslld	$25-14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$10,%xmm0
+	movdqa	%xmm4,%xmm1
+
+	psrld	$17,%xmm4
+	pxor	%xmm2,%xmm7
+	pslld	$13,%xmm1
+	paddd	%xmm7,%xmm6
+	pxor	%xmm4,%xmm0
+	psrld	$19-17,%xmm4
+	pxor	%xmm1,%xmm0
+	pslld	$15-13,%xmm1
+	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm0
+	paddd	%xmm0,%xmm6
+	movdqa	%xmm11,%xmm7
+
+	movdqa	%xmm11,%xmm2
+
+	psrld	$6,%xmm7
+	movdqa	%xmm11,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm6,144-128(%rax)
+	paddd	%xmm14,%xmm6
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	-96(%rbp),%xmm6
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm11,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm11,%xmm4
+	pslld	$26-21,%xmm2
+	pandn	%xmm13,%xmm0
+	pand	%xmm12,%xmm4
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm15,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm15,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm6
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm8,%xmm4
+	movdqa	%xmm15,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm15,%xmm4
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm6
+	pslld	$19-10,%xmm2
+	pand	%xmm4,%xmm3
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm8,%xmm14
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm3,%xmm14
+	paddd	%xmm6,%xmm10
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm6,%xmm14
+	paddd	%xmm7,%xmm14
+	movdqa	176-128(%rax),%xmm6
+	paddd	48-128(%rax),%xmm5
+
+	movdqa	%xmm6,%xmm7
+	movdqa	%xmm6,%xmm1
+	psrld	$3,%xmm7
+	movdqa	%xmm6,%xmm2
+
+	psrld	$7,%xmm1
+	movdqa	128-128(%rax),%xmm0
+	pslld	$14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$18-7,%xmm1
+	movdqa	%xmm0,%xmm3
+	pxor	%xmm2,%xmm7
+	pslld	$25-14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$10,%xmm0
+	movdqa	%xmm3,%xmm1
+
+	psrld	$17,%xmm3
+	pxor	%xmm2,%xmm7
+	pslld	$13,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm3,%xmm0
+	psrld	$19-17,%xmm3
+	pxor	%xmm1,%xmm0
+	pslld	$15-13,%xmm1
+	pxor	%xmm3,%xmm0
+	pxor	%xmm1,%xmm0
+	paddd	%xmm0,%xmm5
+	movdqa	%xmm10,%xmm7
+
+	movdqa	%xmm10,%xmm2
+
+	psrld	$6,%xmm7
+	movdqa	%xmm10,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm5,160-128(%rax)
+	paddd	%xmm13,%xmm5
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	-64(%rbp),%xmm5
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm10,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm10,%xmm3
+	pslld	$26-21,%xmm2
+	pandn	%xmm12,%xmm0
+	pand	%xmm11,%xmm3
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm14,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm14,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm15,%xmm3
+	movdqa	%xmm14,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm14,%xmm3
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm5
+	pslld	$19-10,%xmm2
+	pand	%xmm3,%xmm4
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm15,%xmm13
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm4,%xmm13
+	paddd	%xmm5,%xmm9
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm5,%xmm13
+	paddd	%xmm7,%xmm13
+	movdqa	192-128(%rax),%xmm5
+	paddd	64-128(%rax),%xmm6
+
+	movdqa	%xmm5,%xmm7
+	movdqa	%xmm5,%xmm1
+	psrld	$3,%xmm7
+	movdqa	%xmm5,%xmm2
+
+	psrld	$7,%xmm1
+	movdqa	144-128(%rax),%xmm0
+	pslld	$14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$18-7,%xmm1
+	movdqa	%xmm0,%xmm4
+	pxor	%xmm2,%xmm7
+	pslld	$25-14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$10,%xmm0
+	movdqa	%xmm4,%xmm1
+
+	psrld	$17,%xmm4
+	pxor	%xmm2,%xmm7
+	pslld	$13,%xmm1
+	paddd	%xmm7,%xmm6
+	pxor	%xmm4,%xmm0
+	psrld	$19-17,%xmm4
+	pxor	%xmm1,%xmm0
+	pslld	$15-13,%xmm1
+	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm0
+	paddd	%xmm0,%xmm6
+	movdqa	%xmm9,%xmm7
+
+	movdqa	%xmm9,%xmm2
+
+	psrld	$6,%xmm7
+	movdqa	%xmm9,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm6,176-128(%rax)
+	paddd	%xmm12,%xmm6
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	-32(%rbp),%xmm6
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm9,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm9,%xmm4
+	pslld	$26-21,%xmm2
+	pandn	%xmm11,%xmm0
+	pand	%xmm10,%xmm4
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm13,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm13,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm6
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm14,%xmm4
+	movdqa	%xmm13,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm13,%xmm4
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm6
+	pslld	$19-10,%xmm2
+	pand	%xmm4,%xmm3
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm14,%xmm12
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm3,%xmm12
+	paddd	%xmm6,%xmm8
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm6,%xmm12
+	paddd	%xmm7,%xmm12
+	movdqa	208-128(%rax),%xmm6
+	paddd	80-128(%rax),%xmm5
+
+	movdqa	%xmm6,%xmm7
+	movdqa	%xmm6,%xmm1
+	psrld	$3,%xmm7
+	movdqa	%xmm6,%xmm2
+
+	psrld	$7,%xmm1
+	movdqa	160-128(%rax),%xmm0
+	pslld	$14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$18-7,%xmm1
+	movdqa	%xmm0,%xmm3
+	pxor	%xmm2,%xmm7
+	pslld	$25-14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$10,%xmm0
+	movdqa	%xmm3,%xmm1
+
+	psrld	$17,%xmm3
+	pxor	%xmm2,%xmm7
+	pslld	$13,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm3,%xmm0
+	psrld	$19-17,%xmm3
+	pxor	%xmm1,%xmm0
+	pslld	$15-13,%xmm1
+	pxor	%xmm3,%xmm0
+	pxor	%xmm1,%xmm0
+	paddd	%xmm0,%xmm5
+	movdqa	%xmm8,%xmm7
+
+	movdqa	%xmm8,%xmm2
+
+	psrld	$6,%xmm7
+	movdqa	%xmm8,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm5,192-128(%rax)
+	paddd	%xmm11,%xmm5
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	0(%rbp),%xmm5
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm8,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm8,%xmm3
+	pslld	$26-21,%xmm2
+	pandn	%xmm10,%xmm0
+	pand	%xmm9,%xmm3
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm12,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm12,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm13,%xmm3
+	movdqa	%xmm12,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm12,%xmm3
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm5
+	pslld	$19-10,%xmm2
+	pand	%xmm3,%xmm4
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm13,%xmm11
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm4,%xmm11
+	paddd	%xmm5,%xmm15
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm5,%xmm11
+	paddd	%xmm7,%xmm11
+	movdqa	224-128(%rax),%xmm5
+	paddd	96-128(%rax),%xmm6
+
+	movdqa	%xmm5,%xmm7
+	movdqa	%xmm5,%xmm1
+	psrld	$3,%xmm7
+	movdqa	%xmm5,%xmm2
+
+	psrld	$7,%xmm1
+	movdqa	176-128(%rax),%xmm0
+	pslld	$14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$18-7,%xmm1
+	movdqa	%xmm0,%xmm4
+	pxor	%xmm2,%xmm7
+	pslld	$25-14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$10,%xmm0
+	movdqa	%xmm4,%xmm1
+
+	psrld	$17,%xmm4
+	pxor	%xmm2,%xmm7
+	pslld	$13,%xmm1
+	paddd	%xmm7,%xmm6
+	pxor	%xmm4,%xmm0
+	psrld	$19-17,%xmm4
+	pxor	%xmm1,%xmm0
+	pslld	$15-13,%xmm1
+	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm0
+	paddd	%xmm0,%xmm6
+	movdqa	%xmm15,%xmm7
+
+	movdqa	%xmm15,%xmm2
+
+	psrld	$6,%xmm7
+	movdqa	%xmm15,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm6,208-128(%rax)
+	paddd	%xmm10,%xmm6
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	32(%rbp),%xmm6
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm15,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm15,%xmm4
+	pslld	$26-21,%xmm2
+	pandn	%xmm9,%xmm0
+	pand	%xmm8,%xmm4
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm11,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm11,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm6
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm12,%xmm4
+	movdqa	%xmm11,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm11,%xmm4
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm6
+	pslld	$19-10,%xmm2
+	pand	%xmm4,%xmm3
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm12,%xmm10
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm3,%xmm10
+	paddd	%xmm6,%xmm14
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm6,%xmm10
+	paddd	%xmm7,%xmm10
+	movdqa	240-128(%rax),%xmm6
+	paddd	112-128(%rax),%xmm5
+
+	movdqa	%xmm6,%xmm7
+	movdqa	%xmm6,%xmm1
+	psrld	$3,%xmm7
+	movdqa	%xmm6,%xmm2
+
+	psrld	$7,%xmm1
+	movdqa	192-128(%rax),%xmm0
+	pslld	$14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$18-7,%xmm1
+	movdqa	%xmm0,%xmm3
+	pxor	%xmm2,%xmm7
+	pslld	$25-14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$10,%xmm0
+	movdqa	%xmm3,%xmm1
+
+	psrld	$17,%xmm3
+	pxor	%xmm2,%xmm7
+	pslld	$13,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm3,%xmm0
+	psrld	$19-17,%xmm3
+	pxor	%xmm1,%xmm0
+	pslld	$15-13,%xmm1
+	pxor	%xmm3,%xmm0
+	pxor	%xmm1,%xmm0
+	paddd	%xmm0,%xmm5
+	movdqa	%xmm14,%xmm7
+
+	movdqa	%xmm14,%xmm2
+
+	psrld	$6,%xmm7
+	movdqa	%xmm14,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm5,224-128(%rax)
+	paddd	%xmm9,%xmm5
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	64(%rbp),%xmm5
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm14,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm14,%xmm3
+	pslld	$26-21,%xmm2
+	pandn	%xmm8,%xmm0
+	pand	%xmm15,%xmm3
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm10,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm10,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm5
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm11,%xmm3
+	movdqa	%xmm10,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm10,%xmm3
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm5
+	pslld	$19-10,%xmm2
+	pand	%xmm3,%xmm4
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm11,%xmm9
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm4,%xmm9
+	paddd	%xmm5,%xmm13
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm5,%xmm9
+	paddd	%xmm7,%xmm9
+	movdqa	0-128(%rax),%xmm5
+	paddd	128-128(%rax),%xmm6
+
+	movdqa	%xmm5,%xmm7
+	movdqa	%xmm5,%xmm1
+	psrld	$3,%xmm7
+	movdqa	%xmm5,%xmm2
+
+	psrld	$7,%xmm1
+	movdqa	208-128(%rax),%xmm0
+	pslld	$14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$18-7,%xmm1
+	movdqa	%xmm0,%xmm4
+	pxor	%xmm2,%xmm7
+	pslld	$25-14,%xmm2
+	pxor	%xmm1,%xmm7
+	psrld	$10,%xmm0
+	movdqa	%xmm4,%xmm1
+
+	psrld	$17,%xmm4
+	pxor	%xmm2,%xmm7
+	pslld	$13,%xmm1
+	paddd	%xmm7,%xmm6
+	pxor	%xmm4,%xmm0
+	psrld	$19-17,%xmm4
+	pxor	%xmm1,%xmm0
+	pslld	$15-13,%xmm1
+	pxor	%xmm4,%xmm0
+	pxor	%xmm1,%xmm0
+	paddd	%xmm0,%xmm6
+	movdqa	%xmm13,%xmm7
+
+	movdqa	%xmm13,%xmm2
+
+	psrld	$6,%xmm7
+	movdqa	%xmm13,%xmm1
+	pslld	$7,%xmm2
+	movdqa	%xmm6,240-128(%rax)
+	paddd	%xmm8,%xmm6
+
+	psrld	$11,%xmm1
+	pxor	%xmm2,%xmm7
+	pslld	$21-7,%xmm2
+	paddd	96(%rbp),%xmm6
+	pxor	%xmm1,%xmm7
+
+	psrld	$25-11,%xmm1
+	movdqa	%xmm13,%xmm0
+
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm13,%xmm4
+	pslld	$26-21,%xmm2
+	pandn	%xmm15,%xmm0
+	pand	%xmm14,%xmm4
+	pxor	%xmm1,%xmm7
+
+
+	movdqa	%xmm9,%xmm1
+	pxor	%xmm2,%xmm7
+	movdqa	%xmm9,%xmm2
+	psrld	$2,%xmm1
+	paddd	%xmm7,%xmm6
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm10,%xmm4
+	movdqa	%xmm9,%xmm7
+	pslld	$10,%xmm2
+	pxor	%xmm9,%xmm4
+
+
+	psrld	$13,%xmm7
+	pxor	%xmm2,%xmm1
+	paddd	%xmm0,%xmm6
+	pslld	$19-10,%xmm2
+	pand	%xmm4,%xmm3
+	pxor	%xmm7,%xmm1
+
+
+	psrld	$22-13,%xmm7
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm10,%xmm8
+	pslld	$30-19,%xmm2
+	pxor	%xmm1,%xmm7
+	pxor	%xmm3,%xmm8
+	paddd	%xmm6,%xmm12
+	pxor	%xmm2,%xmm7
+
+	paddd	%xmm6,%xmm8
+	paddd	%xmm7,%xmm8
+	leaq	256(%rbp),%rbp
+	decl	%ecx
+	jnz	.Loop_16_xx
+
+	movl	$1,%ecx
+	leaq	K256+128(%rip),%rbp
+
+	movdqa	(%rbx),%xmm7
+	cmpl	0(%rbx),%ecx
+	pxor	%xmm0,%xmm0
+	cmovgeq	%rbp,%r8
+	cmpl	4(%rbx),%ecx
+	movdqa	%xmm7,%xmm6
+	cmovgeq	%rbp,%r9
+	cmpl	8(%rbx),%ecx
+	pcmpgtd	%xmm0,%xmm6
+	cmovgeq	%rbp,%r10
+	cmpl	12(%rbx),%ecx
+	paddd	%xmm6,%xmm7
+	cmovgeq	%rbp,%r11
+
+	movdqu	0-128(%rdi),%xmm0
+	pand	%xmm6,%xmm8
+	movdqu	32-128(%rdi),%xmm1
+	pand	%xmm6,%xmm9
+	movdqu	64-128(%rdi),%xmm2
+	pand	%xmm6,%xmm10
+	movdqu	96-128(%rdi),%xmm5
+	pand	%xmm6,%xmm11
+	paddd	%xmm0,%xmm8
+	movdqu	128-128(%rdi),%xmm0
+	pand	%xmm6,%xmm12
+	paddd	%xmm1,%xmm9
+	movdqu	160-128(%rdi),%xmm1
+	pand	%xmm6,%xmm13
+	paddd	%xmm2,%xmm10
+	movdqu	192-128(%rdi),%xmm2
+	pand	%xmm6,%xmm14
+	paddd	%xmm5,%xmm11
+	movdqu	224-128(%rdi),%xmm5
+	pand	%xmm6,%xmm15
+	paddd	%xmm0,%xmm12
+	paddd	%xmm1,%xmm13
+	movdqu	%xmm8,0-128(%rdi)
+	paddd	%xmm2,%xmm14
+	movdqu	%xmm9,32-128(%rdi)
+	paddd	%xmm5,%xmm15
+	movdqu	%xmm10,64-128(%rdi)
+	movdqu	%xmm11,96-128(%rdi)
+	movdqu	%xmm12,128-128(%rdi)
+	movdqu	%xmm13,160-128(%rdi)
+	movdqu	%xmm14,192-128(%rdi)
+	movdqu	%xmm15,224-128(%rdi)
+
+	movdqa	%xmm7,(%rbx)
+	movdqa	.Lpbswap(%rip),%xmm6
+	decl	%edx
+	jnz	.Loop
+
+	movl	280(%rsp),%edx
+	leaq	16(%rdi),%rdi
+	leaq	64(%rsi),%rsi
+	decl	%edx
+	jnz	.Loop_grande
+
+.Ldone:
+	movq	272(%rsp),%rax
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+.Lepilogue:
+	.byte	0xf3,0xc3
+.size	sha256_multi_block,.-sha256_multi_block
+.type	sha256_multi_block_shaext,@function
+.align	32
+sha256_multi_block_shaext:
+_shaext_shortcut:
+	movq	%rsp,%rax
+	pushq	%rbx
+	pushq	%rbp
+	subq	$288,%rsp
+	shll	$1,%edx
+	andq	$-256,%rsp
+	leaq	128(%rdi),%rdi
+	movq	%rax,272(%rsp)
+.Lbody_shaext:
+	leaq	256(%rsp),%rbx
+	leaq	K256_shaext+128(%rip),%rbp
+
+.Loop_grande_shaext:
+	movl	%edx,280(%rsp)
+	xorl	%edx,%edx
+	movq	0(%rsi),%r8
+	movl	8(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,0(%rbx)
+	cmovleq	%rsp,%r8
+	movq	16(%rsi),%r9
+	movl	24(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,4(%rbx)
+	cmovleq	%rsp,%r9
+	testl	%edx,%edx
+	jz	.Ldone_shaext
+
+	movq	0-128(%rdi),%xmm12
+	movq	32-128(%rdi),%xmm4
+	movq	64-128(%rdi),%xmm13
+	movq	96-128(%rdi),%xmm5
+	movq	128-128(%rdi),%xmm8
+	movq	160-128(%rdi),%xmm9
+	movq	192-128(%rdi),%xmm10
+	movq	224-128(%rdi),%xmm11
+
+	punpckldq	%xmm4,%xmm12
+	punpckldq	%xmm5,%xmm13
+	punpckldq	%xmm9,%xmm8
+	punpckldq	%xmm11,%xmm10
+	movdqa	K256_shaext-16(%rip),%xmm3
+
+	movdqa	%xmm12,%xmm14
+	movdqa	%xmm13,%xmm15
+	punpcklqdq	%xmm8,%xmm12
+	punpcklqdq	%xmm10,%xmm13
+	punpckhqdq	%xmm8,%xmm14
+	punpckhqdq	%xmm10,%xmm15
+
+	pshufd	$27,%xmm12,%xmm12
+	pshufd	$27,%xmm13,%xmm13
+	pshufd	$27,%xmm14,%xmm14
+	pshufd	$27,%xmm15,%xmm15
+	jmp	.Loop_shaext
+
+.align	32
+.Loop_shaext:
+	movdqu	0(%r8),%xmm4
+	movdqu	0(%r9),%xmm8
+	movdqu	16(%r8),%xmm5
+	movdqu	16(%r9),%xmm9
+	movdqu	32(%r8),%xmm6
+.byte	102,15,56,0,227
+	movdqu	32(%r9),%xmm10
+.byte	102,68,15,56,0,195
+	movdqu	48(%r8),%xmm7
+	leaq	64(%r8),%r8
+	movdqu	48(%r9),%xmm11
+	leaq	64(%r9),%r9
+
+	movdqa	0-128(%rbp),%xmm0
+.byte	102,15,56,0,235
+	paddd	%xmm4,%xmm0
+	pxor	%xmm12,%xmm4
+	movdqa	%xmm0,%xmm1
+	movdqa	0-128(%rbp),%xmm2
+.byte	102,68,15,56,0,203
+	paddd	%xmm8,%xmm2
+	movdqa	%xmm13,80(%rsp)
+.byte	69,15,56,203,236
+	pxor	%xmm14,%xmm8
+	movdqa	%xmm2,%xmm0
+	movdqa	%xmm15,112(%rsp)
+.byte	69,15,56,203,254
+	pshufd	$0x0e,%xmm1,%xmm0
+	pxor	%xmm12,%xmm4
+	movdqa	%xmm12,64(%rsp)
+.byte	69,15,56,203,229
+	pshufd	$0x0e,%xmm2,%xmm0
+	pxor	%xmm14,%xmm8
+	movdqa	%xmm14,96(%rsp)
+	movdqa	16-128(%rbp),%xmm1
+	paddd	%xmm5,%xmm1
+.byte	102,15,56,0,243
+.byte	69,15,56,203,247
+
+	movdqa	%xmm1,%xmm0
+	movdqa	16-128(%rbp),%xmm2
+	paddd	%xmm9,%xmm2
+.byte	69,15,56,203,236
+	movdqa	%xmm2,%xmm0
+	prefetcht0	127(%r8)
+.byte	102,15,56,0,251
+.byte	102,68,15,56,0,211
+	prefetcht0	127(%r9)
+.byte	69,15,56,203,254
+	pshufd	$0x0e,%xmm1,%xmm0
+.byte	102,68,15,56,0,219
+.byte	15,56,204,229
+.byte	69,15,56,203,229
+	pshufd	$0x0e,%xmm2,%xmm0
+	movdqa	32-128(%rbp),%xmm1
+	paddd	%xmm6,%xmm1
+.byte	69,15,56,203,247
+
+	movdqa	%xmm1,%xmm0
+	movdqa	32-128(%rbp),%xmm2
+	paddd	%xmm10,%xmm2
+.byte	69,15,56,203,236
+.byte	69,15,56,204,193
+	movdqa	%xmm2,%xmm0
+	movdqa	%xmm7,%xmm3
+.byte	69,15,56,203,254
+	pshufd	$0x0e,%xmm1,%xmm0
+.byte	102,15,58,15,222,4
+	paddd	%xmm3,%xmm4
+	movdqa	%xmm11,%xmm3
+.byte	102,65,15,58,15,218,4
+.byte	15,56,204,238
+.byte	69,15,56,203,229
+	pshufd	$0x0e,%xmm2,%xmm0
+	movdqa	48-128(%rbp),%xmm1
+	paddd	%xmm7,%xmm1
+.byte	69,15,56,203,247
+.byte	69,15,56,204,202
+
+	movdqa	%xmm1,%xmm0
+	movdqa	48-128(%rbp),%xmm2
+	paddd	%xmm3,%xmm8
+	paddd	%xmm11,%xmm2
+.byte	15,56,205,231
+.byte	69,15,56,203,236
+	movdqa	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm3
+.byte	102,15,58,15,223,4
+.byte	69,15,56,203,254
+.byte	69,15,56,205,195
+	pshufd	$0x0e,%xmm1,%xmm0
+	paddd	%xmm3,%xmm5
+	movdqa	%xmm8,%xmm3
+.byte	102,65,15,58,15,219,4
+.byte	15,56,204,247
+.byte	69,15,56,203,229
+	pshufd	$0x0e,%xmm2,%xmm0
+	movdqa	64-128(%rbp),%xmm1
+	paddd	%xmm4,%xmm1
+.byte	69,15,56,203,247
+.byte	69,15,56,204,211
+	movdqa	%xmm1,%xmm0
+	movdqa	64-128(%rbp),%xmm2
+	paddd	%xmm3,%xmm9
+	paddd	%xmm8,%xmm2
+.byte	15,56,205,236
+.byte	69,15,56,203,236
+	movdqa	%xmm2,%xmm0
+	movdqa	%xmm5,%xmm3
+.byte	102,15,58,15,220,4
+.byte	69,15,56,203,254
+.byte	69,15,56,205,200
+	pshufd	$0x0e,%xmm1,%xmm0
+	paddd	%xmm3,%xmm6
+	movdqa	%xmm9,%xmm3
+.byte	102,65,15,58,15,216,4
+.byte	15,56,204,252
+.byte	69,15,56,203,229
+	pshufd	$0x0e,%xmm2,%xmm0
+	movdqa	80-128(%rbp),%xmm1
+	paddd	%xmm5,%xmm1
+.byte	69,15,56,203,247
+.byte	69,15,56,204,216
+	movdqa	%xmm1,%xmm0
+	movdqa	80-128(%rbp),%xmm2
+	paddd	%xmm3,%xmm10
+	paddd	%xmm9,%xmm2
+.byte	15,56,205,245
+.byte	69,15,56,203,236
+	movdqa	%xmm2,%xmm0
+	movdqa	%xmm6,%xmm3
+.byte	102,15,58,15,221,4
+.byte	69,15,56,203,254
+.byte	69,15,56,205,209
+	pshufd	$0x0e,%xmm1,%xmm0
+	paddd	%xmm3,%xmm7
+	movdqa	%xmm10,%xmm3
+.byte	102,65,15,58,15,217,4
+.byte	15,56,204,229
+.byte	69,15,56,203,229
+	pshufd	$0x0e,%xmm2,%xmm0
+	movdqa	96-128(%rbp),%xmm1
+	paddd	%xmm6,%xmm1
+.byte	69,15,56,203,247
+.byte	69,15,56,204,193
+	movdqa	%xmm1,%xmm0
+	movdqa	96-128(%rbp),%xmm2
+	paddd	%xmm3,%xmm11
+	paddd	%xmm10,%xmm2
+.byte	15,56,205,254
+.byte	69,15,56,203,236
+	movdqa	%xmm2,%xmm0
+	movdqa	%xmm7,%xmm3
+.byte	102,15,58,15,222,4
+.byte	69,15,56,203,254
+.byte	69,15,56,205,218
+	pshufd	$0x0e,%xmm1,%xmm0
+	paddd	%xmm3,%xmm4
+	movdqa	%xmm11,%xmm3
+.byte	102,65,15,58,15,218,4
+.byte	15,56,204,238
+.byte	69,15,56,203,229
+	pshufd	$0x0e,%xmm2,%xmm0
+	movdqa	112-128(%rbp),%xmm1
+	paddd	%xmm7,%xmm1
+.byte	69,15,56,203,247
+.byte	69,15,56,204,202
+	movdqa	%xmm1,%xmm0
+	movdqa	112-128(%rbp),%xmm2
+	paddd	%xmm3,%xmm8
+	paddd	%xmm11,%xmm2
+.byte	15,56,205,231
+.byte	69,15,56,203,236
+	movdqa	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm3
+.byte	102,15,58,15,223,4
+.byte	69,15,56,203,254
+.byte	69,15,56,205,195
+	pshufd	$0x0e,%xmm1,%xmm0
+	paddd	%xmm3,%xmm5
+	movdqa	%xmm8,%xmm3
+.byte	102,65,15,58,15,219,4
+.byte	15,56,204,247
+.byte	69,15,56,203,229
+	pshufd	$0x0e,%xmm2,%xmm0
+	movdqa	128-128(%rbp),%xmm1
+	paddd	%xmm4,%xmm1
+.byte	69,15,56,203,247
+.byte	69,15,56,204,211
+	movdqa	%xmm1,%xmm0
+	movdqa	128-128(%rbp),%xmm2
+	paddd	%xmm3,%xmm9
+	paddd	%xmm8,%xmm2
+.byte	15,56,205,236
+.byte	69,15,56,203,236
+	movdqa	%xmm2,%xmm0
+	movdqa	%xmm5,%xmm3
+.byte	102,15,58,15,220,4
+.byte	69,15,56,203,254
+.byte	69,15,56,205,200
+	pshufd	$0x0e,%xmm1,%xmm0
+	paddd	%xmm3,%xmm6
+	movdqa	%xmm9,%xmm3
+.byte	102,65,15,58,15,216,4
+.byte	15,56,204,252
+.byte	69,15,56,203,229
+	pshufd	$0x0e,%xmm2,%xmm0
+	movdqa	144-128(%rbp),%xmm1
+	paddd	%xmm5,%xmm1
+.byte	69,15,56,203,247
+.byte	69,15,56,204,216
+	movdqa	%xmm1,%xmm0
+	movdqa	144-128(%rbp),%xmm2
+	paddd	%xmm3,%xmm10
+	paddd	%xmm9,%xmm2
+.byte	15,56,205,245
+.byte	69,15,56,203,236
+	movdqa	%xmm2,%xmm0
+	movdqa	%xmm6,%xmm3
+.byte	102,15,58,15,221,4
+.byte	69,15,56,203,254
+.byte	69,15,56,205,209
+	pshufd	$0x0e,%xmm1,%xmm0
+	paddd	%xmm3,%xmm7
+	movdqa	%xmm10,%xmm3
+.byte	102,65,15,58,15,217,4
+.byte	15,56,204,229
+.byte	69,15,56,203,229
+	pshufd	$0x0e,%xmm2,%xmm0
+	movdqa	160-128(%rbp),%xmm1
+	paddd	%xmm6,%xmm1
+.byte	69,15,56,203,247
+.byte	69,15,56,204,193
+	movdqa	%xmm1,%xmm0
+	movdqa	160-128(%rbp),%xmm2
+	paddd	%xmm3,%xmm11
+	paddd	%xmm10,%xmm2
+.byte	15,56,205,254
+.byte	69,15,56,203,236
+	movdqa	%xmm2,%xmm0
+	movdqa	%xmm7,%xmm3
+.byte	102,15,58,15,222,4
+.byte	69,15,56,203,254
+.byte	69,15,56,205,218
+	pshufd	$0x0e,%xmm1,%xmm0
+	paddd	%xmm3,%xmm4
+	movdqa	%xmm11,%xmm3
+.byte	102,65,15,58,15,218,4
+.byte	15,56,204,238
+.byte	69,15,56,203,229
+	pshufd	$0x0e,%xmm2,%xmm0
+	movdqa	176-128(%rbp),%xmm1
+	paddd	%xmm7,%xmm1
+.byte	69,15,56,203,247
+.byte	69,15,56,204,202
+	movdqa	%xmm1,%xmm0
+	movdqa	176-128(%rbp),%xmm2
+	paddd	%xmm3,%xmm8
+	paddd	%xmm11,%xmm2
+.byte	15,56,205,231
+.byte	69,15,56,203,236
+	movdqa	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm3
+.byte	102,15,58,15,223,4
+.byte	69,15,56,203,254
+.byte	69,15,56,205,195
+	pshufd	$0x0e,%xmm1,%xmm0
+	paddd	%xmm3,%xmm5
+	movdqa	%xmm8,%xmm3
+.byte	102,65,15,58,15,219,4
+.byte	15,56,204,247
+.byte	69,15,56,203,229
+	pshufd	$0x0e,%xmm2,%xmm0
+	movdqa	192-128(%rbp),%xmm1
+	paddd	%xmm4,%xmm1
+.byte	69,15,56,203,247
+.byte	69,15,56,204,211
+	movdqa	%xmm1,%xmm0
+	movdqa	192-128(%rbp),%xmm2
+	paddd	%xmm3,%xmm9
+	paddd	%xmm8,%xmm2
+.byte	15,56,205,236
+.byte	69,15,56,203,236
+	movdqa	%xmm2,%xmm0
+	movdqa	%xmm5,%xmm3
+.byte	102,15,58,15,220,4
+.byte	69,15,56,203,254
+.byte	69,15,56,205,200
+	pshufd	$0x0e,%xmm1,%xmm0
+	paddd	%xmm3,%xmm6
+	movdqa	%xmm9,%xmm3
+.byte	102,65,15,58,15,216,4
+.byte	15,56,204,252
+.byte	69,15,56,203,229
+	pshufd	$0x0e,%xmm2,%xmm0
+	movdqa	208-128(%rbp),%xmm1
+	paddd	%xmm5,%xmm1
+.byte	69,15,56,203,247
+.byte	69,15,56,204,216
+	movdqa	%xmm1,%xmm0
+	movdqa	208-128(%rbp),%xmm2
+	paddd	%xmm3,%xmm10
+	paddd	%xmm9,%xmm2
+.byte	15,56,205,245
+.byte	69,15,56,203,236
+	movdqa	%xmm2,%xmm0
+	movdqa	%xmm6,%xmm3
+.byte	102,15,58,15,221,4
+.byte	69,15,56,203,254
+.byte	69,15,56,205,209
+	pshufd	$0x0e,%xmm1,%xmm0
+	paddd	%xmm3,%xmm7
+	movdqa	%xmm10,%xmm3
+.byte	102,65,15,58,15,217,4
+	nop
+.byte	69,15,56,203,229
+	pshufd	$0x0e,%xmm2,%xmm0
+	movdqa	224-128(%rbp),%xmm1
+	paddd	%xmm6,%xmm1
+.byte	69,15,56,203,247
+
+	movdqa	%xmm1,%xmm0
+	movdqa	224-128(%rbp),%xmm2
+	paddd	%xmm3,%xmm11
+	paddd	%xmm10,%xmm2
+.byte	15,56,205,254
+	nop
+.byte	69,15,56,203,236
+	movdqa	%xmm2,%xmm0
+	movl	$1,%ecx
+	pxor	%xmm6,%xmm6
+.byte	69,15,56,203,254
+.byte	69,15,56,205,218
+	pshufd	$0x0e,%xmm1,%xmm0
+	movdqa	240-128(%rbp),%xmm1
+	paddd	%xmm7,%xmm1
+	movq	(%rbx),%xmm7
+	nop
+.byte	69,15,56,203,229
+	pshufd	$0x0e,%xmm2,%xmm0
+	movdqa	240-128(%rbp),%xmm2
+	paddd	%xmm11,%xmm2
+.byte	69,15,56,203,247
+
+	movdqa	%xmm1,%xmm0
+	cmpl	0(%rbx),%ecx
+	cmovgeq	%rsp,%r8
+	cmpl	4(%rbx),%ecx
+	cmovgeq	%rsp,%r9
+	pshufd	$0x00,%xmm7,%xmm9
+.byte	69,15,56,203,236
+	movdqa	%xmm2,%xmm0
+	pshufd	$0x55,%xmm7,%xmm10
+	movdqa	%xmm7,%xmm11
+.byte	69,15,56,203,254
+	pshufd	$0x0e,%xmm1,%xmm0
+	pcmpgtd	%xmm6,%xmm9
+	pcmpgtd	%xmm6,%xmm10
+.byte	69,15,56,203,229
+	pshufd	$0x0e,%xmm2,%xmm0
+	pcmpgtd	%xmm6,%xmm11
+	movdqa	K256_shaext-16(%rip),%xmm3
+.byte	69,15,56,203,247
+
+	pand	%xmm9,%xmm13
+	pand	%xmm10,%xmm15
+	pand	%xmm9,%xmm12
+	pand	%xmm10,%xmm14
+	paddd	%xmm7,%xmm11
+
+	paddd	80(%rsp),%xmm13
+	paddd	112(%rsp),%xmm15
+	paddd	64(%rsp),%xmm12
+	paddd	96(%rsp),%xmm14
+
+	movq	%xmm11,(%rbx)
+	decl	%edx
+	jnz	.Loop_shaext
+
+	movl	280(%rsp),%edx
+
+	pshufd	$27,%xmm12,%xmm12
+	pshufd	$27,%xmm13,%xmm13
+	pshufd	$27,%xmm14,%xmm14
+	pshufd	$27,%xmm15,%xmm15
+
+	movdqa	%xmm12,%xmm5
+	movdqa	%xmm13,%xmm6
+	punpckldq	%xmm14,%xmm12
+	punpckhdq	%xmm14,%xmm5
+	punpckldq	%xmm15,%xmm13
+	punpckhdq	%xmm15,%xmm6
+
+	movq	%xmm12,0-128(%rdi)
+	psrldq	$8,%xmm12
+	movq	%xmm5,128-128(%rdi)
+	psrldq	$8,%xmm5
+	movq	%xmm12,32-128(%rdi)
+	movq	%xmm5,160-128(%rdi)
+
+	movq	%xmm13,64-128(%rdi)
+	psrldq	$8,%xmm13
+	movq	%xmm6,192-128(%rdi)
+	psrldq	$8,%xmm6
+	movq	%xmm13,96-128(%rdi)
+	movq	%xmm6,224-128(%rdi)
+
+	leaq	8(%rdi),%rdi
+	leaq	32(%rsi),%rsi
+	decl	%edx
+	jnz	.Loop_grande_shaext
+
+.Ldone_shaext:
+
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+.Lepilogue_shaext:
+	.byte	0xf3,0xc3
+.size	sha256_multi_block_shaext,.-sha256_multi_block_shaext
+.type	sha256_multi_block_avx,@function
+.align	32
+sha256_multi_block_avx:
+_avx_shortcut:
+	shrq	$32,%rcx
+	cmpl	$2,%edx
+	jb	.Lavx
+	testl	$32,%ecx
+	jnz	_avx2_shortcut
+	jmp	.Lavx
+.align	32
+.Lavx:
+	movq	%rsp,%rax
+	pushq	%rbx
+	pushq	%rbp
+	subq	$288,%rsp
+	andq	$-256,%rsp
+	movq	%rax,272(%rsp)
+.Lbody_avx:
+	leaq	K256+128(%rip),%rbp
+	leaq	256(%rsp),%rbx
+	leaq	128(%rdi),%rdi
+
+.Loop_grande_avx:
+	movl	%edx,280(%rsp)
+	xorl	%edx,%edx
+	movq	0(%rsi),%r8
+	movl	8(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,0(%rbx)
+	cmovleq	%rbp,%r8
+	movq	16(%rsi),%r9
+	movl	24(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,4(%rbx)
+	cmovleq	%rbp,%r9
+	movq	32(%rsi),%r10
+	movl	40(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,8(%rbx)
+	cmovleq	%rbp,%r10
+	movq	48(%rsi),%r11
+	movl	56(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,12(%rbx)
+	cmovleq	%rbp,%r11
+	testl	%edx,%edx
+	jz	.Ldone_avx
+
+	vmovdqu	0-128(%rdi),%xmm8
+	leaq	128(%rsp),%rax
+	vmovdqu	32-128(%rdi),%xmm9
+	vmovdqu	64-128(%rdi),%xmm10
+	vmovdqu	96-128(%rdi),%xmm11
+	vmovdqu	128-128(%rdi),%xmm12
+	vmovdqu	160-128(%rdi),%xmm13
+	vmovdqu	192-128(%rdi),%xmm14
+	vmovdqu	224-128(%rdi),%xmm15
+	vmovdqu	.Lpbswap(%rip),%xmm6
+	jmp	.Loop_avx
+
+.align	32
+.Loop_avx:
+	vpxor	%xmm9,%xmm10,%xmm4
+	vmovd	0(%r8),%xmm5
+	vmovd	0(%r9),%xmm0
+	vpinsrd	$1,0(%r10),%xmm5,%xmm5
+	vpinsrd	$1,0(%r11),%xmm0,%xmm0
+	vpunpckldq	%xmm0,%xmm5,%xmm5
+	vpshufb	%xmm6,%xmm5,%xmm5
+	vpsrld	$6,%xmm12,%xmm7
+	vpslld	$26,%xmm12,%xmm2
+	vmovdqu	%xmm5,0-128(%rax)
+	vpaddd	%xmm15,%xmm5,%xmm5
+
+	vpsrld	$11,%xmm12,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm12,%xmm2
+	vpaddd	-128(%rbp),%xmm5,%xmm5
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm12,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm12,%xmm2
+	vpandn	%xmm14,%xmm12,%xmm0
+	vpand	%xmm13,%xmm12,%xmm3
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm8,%xmm15
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm8,%xmm1
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm8,%xmm9,%xmm3
+
+	vpxor	%xmm1,%xmm15,%xmm15
+	vpaddd	%xmm7,%xmm5,%xmm5
+
+	vpsrld	$13,%xmm8,%xmm1
+
+	vpslld	$19,%xmm8,%xmm2
+	vpaddd	%xmm0,%xmm5,%xmm5
+	vpand	%xmm3,%xmm4,%xmm4
+
+	vpxor	%xmm1,%xmm15,%xmm7
+
+	vpsrld	$22,%xmm8,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm8,%xmm2
+	vpxor	%xmm4,%xmm9,%xmm15
+	vpaddd	%xmm5,%xmm11,%xmm11
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm5,%xmm15,%xmm15
+	vpaddd	%xmm7,%xmm15,%xmm15
+	vmovd	4(%r8),%xmm5
+	vmovd	4(%r9),%xmm0
+	vpinsrd	$1,4(%r10),%xmm5,%xmm5
+	vpinsrd	$1,4(%r11),%xmm0,%xmm0
+	vpunpckldq	%xmm0,%xmm5,%xmm5
+	vpshufb	%xmm6,%xmm5,%xmm5
+	vpsrld	$6,%xmm11,%xmm7
+	vpslld	$26,%xmm11,%xmm2
+	vmovdqu	%xmm5,16-128(%rax)
+	vpaddd	%xmm14,%xmm5,%xmm5
+
+	vpsrld	$11,%xmm11,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm11,%xmm2
+	vpaddd	-96(%rbp),%xmm5,%xmm5
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm11,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm11,%xmm2
+	vpandn	%xmm13,%xmm11,%xmm0
+	vpand	%xmm12,%xmm11,%xmm4
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm15,%xmm14
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm15,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm4
+
+	vpxor	%xmm1,%xmm14,%xmm14
+	vpaddd	%xmm7,%xmm5,%xmm5
+
+	vpsrld	$13,%xmm15,%xmm1
+
+	vpslld	$19,%xmm15,%xmm2
+	vpaddd	%xmm0,%xmm5,%xmm5
+	vpand	%xmm4,%xmm3,%xmm3
+
+	vpxor	%xmm1,%xmm14,%xmm7
+
+	vpsrld	$22,%xmm15,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm15,%xmm2
+	vpxor	%xmm3,%xmm8,%xmm14
+	vpaddd	%xmm5,%xmm10,%xmm10
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm5,%xmm14,%xmm14
+	vpaddd	%xmm7,%xmm14,%xmm14
+	vmovd	8(%r8),%xmm5
+	vmovd	8(%r9),%xmm0
+	vpinsrd	$1,8(%r10),%xmm5,%xmm5
+	vpinsrd	$1,8(%r11),%xmm0,%xmm0
+	vpunpckldq	%xmm0,%xmm5,%xmm5
+	vpshufb	%xmm6,%xmm5,%xmm5
+	vpsrld	$6,%xmm10,%xmm7
+	vpslld	$26,%xmm10,%xmm2
+	vmovdqu	%xmm5,32-128(%rax)
+	vpaddd	%xmm13,%xmm5,%xmm5
+
+	vpsrld	$11,%xmm10,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm10,%xmm2
+	vpaddd	-64(%rbp),%xmm5,%xmm5
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm10,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm10,%xmm2
+	vpandn	%xmm12,%xmm10,%xmm0
+	vpand	%xmm11,%xmm10,%xmm3
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm14,%xmm13
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm14,%xmm1
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm14,%xmm15,%xmm3
+
+	vpxor	%xmm1,%xmm13,%xmm13
+	vpaddd	%xmm7,%xmm5,%xmm5
+
+	vpsrld	$13,%xmm14,%xmm1
+
+	vpslld	$19,%xmm14,%xmm2
+	vpaddd	%xmm0,%xmm5,%xmm5
+	vpand	%xmm3,%xmm4,%xmm4
+
+	vpxor	%xmm1,%xmm13,%xmm7
+
+	vpsrld	$22,%xmm14,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm14,%xmm2
+	vpxor	%xmm4,%xmm15,%xmm13
+	vpaddd	%xmm5,%xmm9,%xmm9
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm5,%xmm13,%xmm13
+	vpaddd	%xmm7,%xmm13,%xmm13
+	vmovd	12(%r8),%xmm5
+	vmovd	12(%r9),%xmm0
+	vpinsrd	$1,12(%r10),%xmm5,%xmm5
+	vpinsrd	$1,12(%r11),%xmm0,%xmm0
+	vpunpckldq	%xmm0,%xmm5,%xmm5
+	vpshufb	%xmm6,%xmm5,%xmm5
+	vpsrld	$6,%xmm9,%xmm7
+	vpslld	$26,%xmm9,%xmm2
+	vmovdqu	%xmm5,48-128(%rax)
+	vpaddd	%xmm12,%xmm5,%xmm5
+
+	vpsrld	$11,%xmm9,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm9,%xmm2
+	vpaddd	-32(%rbp),%xmm5,%xmm5
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm9,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm9,%xmm2
+	vpandn	%xmm11,%xmm9,%xmm0
+	vpand	%xmm10,%xmm9,%xmm4
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm13,%xmm12
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm13,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm13,%xmm14,%xmm4
+
+	vpxor	%xmm1,%xmm12,%xmm12
+	vpaddd	%xmm7,%xmm5,%xmm5
+
+	vpsrld	$13,%xmm13,%xmm1
+
+	vpslld	$19,%xmm13,%xmm2
+	vpaddd	%xmm0,%xmm5,%xmm5
+	vpand	%xmm4,%xmm3,%xmm3
+
+	vpxor	%xmm1,%xmm12,%xmm7
+
+	vpsrld	$22,%xmm13,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm13,%xmm2
+	vpxor	%xmm3,%xmm14,%xmm12
+	vpaddd	%xmm5,%xmm8,%xmm8
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm5,%xmm12,%xmm12
+	vpaddd	%xmm7,%xmm12,%xmm12
+	vmovd	16(%r8),%xmm5
+	vmovd	16(%r9),%xmm0
+	vpinsrd	$1,16(%r10),%xmm5,%xmm5
+	vpinsrd	$1,16(%r11),%xmm0,%xmm0
+	vpunpckldq	%xmm0,%xmm5,%xmm5
+	vpshufb	%xmm6,%xmm5,%xmm5
+	vpsrld	$6,%xmm8,%xmm7
+	vpslld	$26,%xmm8,%xmm2
+	vmovdqu	%xmm5,64-128(%rax)
+	vpaddd	%xmm11,%xmm5,%xmm5
+
+	vpsrld	$11,%xmm8,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm8,%xmm2
+	vpaddd	0(%rbp),%xmm5,%xmm5
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm8,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm8,%xmm2
+	vpandn	%xmm10,%xmm8,%xmm0
+	vpand	%xmm9,%xmm8,%xmm3
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm12,%xmm11
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm12,%xmm1
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm12,%xmm13,%xmm3
+
+	vpxor	%xmm1,%xmm11,%xmm11
+	vpaddd	%xmm7,%xmm5,%xmm5
+
+	vpsrld	$13,%xmm12,%xmm1
+
+	vpslld	$19,%xmm12,%xmm2
+	vpaddd	%xmm0,%xmm5,%xmm5
+	vpand	%xmm3,%xmm4,%xmm4
+
+	vpxor	%xmm1,%xmm11,%xmm7
+
+	vpsrld	$22,%xmm12,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm12,%xmm2
+	vpxor	%xmm4,%xmm13,%xmm11
+	vpaddd	%xmm5,%xmm15,%xmm15
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm5,%xmm11,%xmm11
+	vpaddd	%xmm7,%xmm11,%xmm11
+	vmovd	20(%r8),%xmm5
+	vmovd	20(%r9),%xmm0
+	vpinsrd	$1,20(%r10),%xmm5,%xmm5
+	vpinsrd	$1,20(%r11),%xmm0,%xmm0
+	vpunpckldq	%xmm0,%xmm5,%xmm5
+	vpshufb	%xmm6,%xmm5,%xmm5
+	vpsrld	$6,%xmm15,%xmm7
+	vpslld	$26,%xmm15,%xmm2
+	vmovdqu	%xmm5,80-128(%rax)
+	vpaddd	%xmm10,%xmm5,%xmm5
+
+	vpsrld	$11,%xmm15,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm15,%xmm2
+	vpaddd	32(%rbp),%xmm5,%xmm5
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm15,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm15,%xmm2
+	vpandn	%xmm9,%xmm15,%xmm0
+	vpand	%xmm8,%xmm15,%xmm4
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm11,%xmm10
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm11,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm11,%xmm12,%xmm4
+
+	vpxor	%xmm1,%xmm10,%xmm10
+	vpaddd	%xmm7,%xmm5,%xmm5
+
+	vpsrld	$13,%xmm11,%xmm1
+
+	vpslld	$19,%xmm11,%xmm2
+	vpaddd	%xmm0,%xmm5,%xmm5
+	vpand	%xmm4,%xmm3,%xmm3
+
+	vpxor	%xmm1,%xmm10,%xmm7
+
+	vpsrld	$22,%xmm11,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm11,%xmm2
+	vpxor	%xmm3,%xmm12,%xmm10
+	vpaddd	%xmm5,%xmm14,%xmm14
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm5,%xmm10,%xmm10
+	vpaddd	%xmm7,%xmm10,%xmm10
+	vmovd	24(%r8),%xmm5
+	vmovd	24(%r9),%xmm0
+	vpinsrd	$1,24(%r10),%xmm5,%xmm5
+	vpinsrd	$1,24(%r11),%xmm0,%xmm0
+	vpunpckldq	%xmm0,%xmm5,%xmm5
+	vpshufb	%xmm6,%xmm5,%xmm5
+	vpsrld	$6,%xmm14,%xmm7
+	vpslld	$26,%xmm14,%xmm2
+	vmovdqu	%xmm5,96-128(%rax)
+	vpaddd	%xmm9,%xmm5,%xmm5
+
+	vpsrld	$11,%xmm14,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm14,%xmm2
+	vpaddd	64(%rbp),%xmm5,%xmm5
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm14,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm14,%xmm2
+	vpandn	%xmm8,%xmm14,%xmm0
+	vpand	%xmm15,%xmm14,%xmm3
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm10,%xmm9
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm10,%xmm1
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm10,%xmm11,%xmm3
+
+	vpxor	%xmm1,%xmm9,%xmm9
+	vpaddd	%xmm7,%xmm5,%xmm5
+
+	vpsrld	$13,%xmm10,%xmm1
+
+	vpslld	$19,%xmm10,%xmm2
+	vpaddd	%xmm0,%xmm5,%xmm5
+	vpand	%xmm3,%xmm4,%xmm4
+
+	vpxor	%xmm1,%xmm9,%xmm7
+
+	vpsrld	$22,%xmm10,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm10,%xmm2
+	vpxor	%xmm4,%xmm11,%xmm9
+	vpaddd	%xmm5,%xmm13,%xmm13
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm5,%xmm9,%xmm9
+	vpaddd	%xmm7,%xmm9,%xmm9
+	vmovd	28(%r8),%xmm5
+	vmovd	28(%r9),%xmm0
+	vpinsrd	$1,28(%r10),%xmm5,%xmm5
+	vpinsrd	$1,28(%r11),%xmm0,%xmm0
+	vpunpckldq	%xmm0,%xmm5,%xmm5
+	vpshufb	%xmm6,%xmm5,%xmm5
+	vpsrld	$6,%xmm13,%xmm7
+	vpslld	$26,%xmm13,%xmm2
+	vmovdqu	%xmm5,112-128(%rax)
+	vpaddd	%xmm8,%xmm5,%xmm5
+
+	vpsrld	$11,%xmm13,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm13,%xmm2
+	vpaddd	96(%rbp),%xmm5,%xmm5
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm13,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm13,%xmm2
+	vpandn	%xmm15,%xmm13,%xmm0
+	vpand	%xmm14,%xmm13,%xmm4
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm9,%xmm8
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm9,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm9,%xmm10,%xmm4
+
+	vpxor	%xmm1,%xmm8,%xmm8
+	vpaddd	%xmm7,%xmm5,%xmm5
+
+	vpsrld	$13,%xmm9,%xmm1
+
+	vpslld	$19,%xmm9,%xmm2
+	vpaddd	%xmm0,%xmm5,%xmm5
+	vpand	%xmm4,%xmm3,%xmm3
+
+	vpxor	%xmm1,%xmm8,%xmm7
+
+	vpsrld	$22,%xmm9,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm9,%xmm2
+	vpxor	%xmm3,%xmm10,%xmm8
+	vpaddd	%xmm5,%xmm12,%xmm12
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm5,%xmm8,%xmm8
+	vpaddd	%xmm7,%xmm8,%xmm8
+	addq	$256,%rbp
+	vmovd	32(%r8),%xmm5
+	vmovd	32(%r9),%xmm0
+	vpinsrd	$1,32(%r10),%xmm5,%xmm5
+	vpinsrd	$1,32(%r11),%xmm0,%xmm0
+	vpunpckldq	%xmm0,%xmm5,%xmm5
+	vpshufb	%xmm6,%xmm5,%xmm5
+	vpsrld	$6,%xmm12,%xmm7
+	vpslld	$26,%xmm12,%xmm2
+	vmovdqu	%xmm5,128-128(%rax)
+	vpaddd	%xmm15,%xmm5,%xmm5
+
+	vpsrld	$11,%xmm12,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm12,%xmm2
+	vpaddd	-128(%rbp),%xmm5,%xmm5
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm12,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm12,%xmm2
+	vpandn	%xmm14,%xmm12,%xmm0
+	vpand	%xmm13,%xmm12,%xmm3
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm8,%xmm15
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm8,%xmm1
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm8,%xmm9,%xmm3
+
+	vpxor	%xmm1,%xmm15,%xmm15
+	vpaddd	%xmm7,%xmm5,%xmm5
+
+	vpsrld	$13,%xmm8,%xmm1
+
+	vpslld	$19,%xmm8,%xmm2
+	vpaddd	%xmm0,%xmm5,%xmm5
+	vpand	%xmm3,%xmm4,%xmm4
+
+	vpxor	%xmm1,%xmm15,%xmm7
+
+	vpsrld	$22,%xmm8,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm8,%xmm2
+	vpxor	%xmm4,%xmm9,%xmm15
+	vpaddd	%xmm5,%xmm11,%xmm11
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm5,%xmm15,%xmm15
+	vpaddd	%xmm7,%xmm15,%xmm15
+	vmovd	36(%r8),%xmm5
+	vmovd	36(%r9),%xmm0
+	vpinsrd	$1,36(%r10),%xmm5,%xmm5
+	vpinsrd	$1,36(%r11),%xmm0,%xmm0
+	vpunpckldq	%xmm0,%xmm5,%xmm5
+	vpshufb	%xmm6,%xmm5,%xmm5
+	vpsrld	$6,%xmm11,%xmm7
+	vpslld	$26,%xmm11,%xmm2
+	vmovdqu	%xmm5,144-128(%rax)
+	vpaddd	%xmm14,%xmm5,%xmm5
+
+	vpsrld	$11,%xmm11,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm11,%xmm2
+	vpaddd	-96(%rbp),%xmm5,%xmm5
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm11,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm11,%xmm2
+	vpandn	%xmm13,%xmm11,%xmm0
+	vpand	%xmm12,%xmm11,%xmm4
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm15,%xmm14
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm15,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm4
+
+	vpxor	%xmm1,%xmm14,%xmm14
+	vpaddd	%xmm7,%xmm5,%xmm5
+
+	vpsrld	$13,%xmm15,%xmm1
+
+	vpslld	$19,%xmm15,%xmm2
+	vpaddd	%xmm0,%xmm5,%xmm5
+	vpand	%xmm4,%xmm3,%xmm3
+
+	vpxor	%xmm1,%xmm14,%xmm7
+
+	vpsrld	$22,%xmm15,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm15,%xmm2
+	vpxor	%xmm3,%xmm8,%xmm14
+	vpaddd	%xmm5,%xmm10,%xmm10
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm5,%xmm14,%xmm14
+	vpaddd	%xmm7,%xmm14,%xmm14
+	vmovd	40(%r8),%xmm5
+	vmovd	40(%r9),%xmm0
+	vpinsrd	$1,40(%r10),%xmm5,%xmm5
+	vpinsrd	$1,40(%r11),%xmm0,%xmm0
+	vpunpckldq	%xmm0,%xmm5,%xmm5
+	vpshufb	%xmm6,%xmm5,%xmm5
+	vpsrld	$6,%xmm10,%xmm7
+	vpslld	$26,%xmm10,%xmm2
+	vmovdqu	%xmm5,160-128(%rax)
+	vpaddd	%xmm13,%xmm5,%xmm5
+
+	vpsrld	$11,%xmm10,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm10,%xmm2
+	vpaddd	-64(%rbp),%xmm5,%xmm5
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm10,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm10,%xmm2
+	vpandn	%xmm12,%xmm10,%xmm0
+	vpand	%xmm11,%xmm10,%xmm3
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm14,%xmm13
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm14,%xmm1
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm14,%xmm15,%xmm3
+
+	vpxor	%xmm1,%xmm13,%xmm13
+	vpaddd	%xmm7,%xmm5,%xmm5
+
+	vpsrld	$13,%xmm14,%xmm1
+
+	vpslld	$19,%xmm14,%xmm2
+	vpaddd	%xmm0,%xmm5,%xmm5
+	vpand	%xmm3,%xmm4,%xmm4
+
+	vpxor	%xmm1,%xmm13,%xmm7
+
+	vpsrld	$22,%xmm14,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm14,%xmm2
+	vpxor	%xmm4,%xmm15,%xmm13
+	vpaddd	%xmm5,%xmm9,%xmm9
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm5,%xmm13,%xmm13
+	vpaddd	%xmm7,%xmm13,%xmm13
+	vmovd	44(%r8),%xmm5
+	vmovd	44(%r9),%xmm0
+	vpinsrd	$1,44(%r10),%xmm5,%xmm5
+	vpinsrd	$1,44(%r11),%xmm0,%xmm0
+	vpunpckldq	%xmm0,%xmm5,%xmm5
+	vpshufb	%xmm6,%xmm5,%xmm5
+	vpsrld	$6,%xmm9,%xmm7
+	vpslld	$26,%xmm9,%xmm2
+	vmovdqu	%xmm5,176-128(%rax)
+	vpaddd	%xmm12,%xmm5,%xmm5
+
+	vpsrld	$11,%xmm9,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm9,%xmm2
+	vpaddd	-32(%rbp),%xmm5,%xmm5
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm9,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm9,%xmm2
+	vpandn	%xmm11,%xmm9,%xmm0
+	vpand	%xmm10,%xmm9,%xmm4
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm13,%xmm12
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm13,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm13,%xmm14,%xmm4
+
+	vpxor	%xmm1,%xmm12,%xmm12
+	vpaddd	%xmm7,%xmm5,%xmm5
+
+	vpsrld	$13,%xmm13,%xmm1
+
+	vpslld	$19,%xmm13,%xmm2
+	vpaddd	%xmm0,%xmm5,%xmm5
+	vpand	%xmm4,%xmm3,%xmm3
+
+	vpxor	%xmm1,%xmm12,%xmm7
+
+	vpsrld	$22,%xmm13,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm13,%xmm2
+	vpxor	%xmm3,%xmm14,%xmm12
+	vpaddd	%xmm5,%xmm8,%xmm8
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm5,%xmm12,%xmm12
+	vpaddd	%xmm7,%xmm12,%xmm12
+	vmovd	48(%r8),%xmm5
+	vmovd	48(%r9),%xmm0
+	vpinsrd	$1,48(%r10),%xmm5,%xmm5
+	vpinsrd	$1,48(%r11),%xmm0,%xmm0
+	vpunpckldq	%xmm0,%xmm5,%xmm5
+	vpshufb	%xmm6,%xmm5,%xmm5
+	vpsrld	$6,%xmm8,%xmm7
+	vpslld	$26,%xmm8,%xmm2
+	vmovdqu	%xmm5,192-128(%rax)
+	vpaddd	%xmm11,%xmm5,%xmm5
+
+	vpsrld	$11,%xmm8,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm8,%xmm2
+	vpaddd	0(%rbp),%xmm5,%xmm5
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm8,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm8,%xmm2
+	vpandn	%xmm10,%xmm8,%xmm0
+	vpand	%xmm9,%xmm8,%xmm3
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm12,%xmm11
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm12,%xmm1
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm12,%xmm13,%xmm3
+
+	vpxor	%xmm1,%xmm11,%xmm11
+	vpaddd	%xmm7,%xmm5,%xmm5
+
+	vpsrld	$13,%xmm12,%xmm1
+
+	vpslld	$19,%xmm12,%xmm2
+	vpaddd	%xmm0,%xmm5,%xmm5
+	vpand	%xmm3,%xmm4,%xmm4
+
+	vpxor	%xmm1,%xmm11,%xmm7
+
+	vpsrld	$22,%xmm12,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm12,%xmm2
+	vpxor	%xmm4,%xmm13,%xmm11
+	vpaddd	%xmm5,%xmm15,%xmm15
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm5,%xmm11,%xmm11
+	vpaddd	%xmm7,%xmm11,%xmm11
+	vmovd	52(%r8),%xmm5
+	vmovd	52(%r9),%xmm0
+	vpinsrd	$1,52(%r10),%xmm5,%xmm5
+	vpinsrd	$1,52(%r11),%xmm0,%xmm0
+	vpunpckldq	%xmm0,%xmm5,%xmm5
+	vpshufb	%xmm6,%xmm5,%xmm5
+	vpsrld	$6,%xmm15,%xmm7
+	vpslld	$26,%xmm15,%xmm2
+	vmovdqu	%xmm5,208-128(%rax)
+	vpaddd	%xmm10,%xmm5,%xmm5
+
+	vpsrld	$11,%xmm15,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm15,%xmm2
+	vpaddd	32(%rbp),%xmm5,%xmm5
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm15,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm15,%xmm2
+	vpandn	%xmm9,%xmm15,%xmm0
+	vpand	%xmm8,%xmm15,%xmm4
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm11,%xmm10
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm11,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm11,%xmm12,%xmm4
+
+	vpxor	%xmm1,%xmm10,%xmm10
+	vpaddd	%xmm7,%xmm5,%xmm5
+
+	vpsrld	$13,%xmm11,%xmm1
+
+	vpslld	$19,%xmm11,%xmm2
+	vpaddd	%xmm0,%xmm5,%xmm5
+	vpand	%xmm4,%xmm3,%xmm3
+
+	vpxor	%xmm1,%xmm10,%xmm7
+
+	vpsrld	$22,%xmm11,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm11,%xmm2
+	vpxor	%xmm3,%xmm12,%xmm10
+	vpaddd	%xmm5,%xmm14,%xmm14
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm5,%xmm10,%xmm10
+	vpaddd	%xmm7,%xmm10,%xmm10
+	vmovd	56(%r8),%xmm5
+	vmovd	56(%r9),%xmm0
+	vpinsrd	$1,56(%r10),%xmm5,%xmm5
+	vpinsrd	$1,56(%r11),%xmm0,%xmm0
+	vpunpckldq	%xmm0,%xmm5,%xmm5
+	vpshufb	%xmm6,%xmm5,%xmm5
+	vpsrld	$6,%xmm14,%xmm7
+	vpslld	$26,%xmm14,%xmm2
+	vmovdqu	%xmm5,224-128(%rax)
+	vpaddd	%xmm9,%xmm5,%xmm5
+
+	vpsrld	$11,%xmm14,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm14,%xmm2
+	vpaddd	64(%rbp),%xmm5,%xmm5
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm14,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm14,%xmm2
+	vpandn	%xmm8,%xmm14,%xmm0
+	vpand	%xmm15,%xmm14,%xmm3
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm10,%xmm9
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm10,%xmm1
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm10,%xmm11,%xmm3
+
+	vpxor	%xmm1,%xmm9,%xmm9
+	vpaddd	%xmm7,%xmm5,%xmm5
+
+	vpsrld	$13,%xmm10,%xmm1
+
+	vpslld	$19,%xmm10,%xmm2
+	vpaddd	%xmm0,%xmm5,%xmm5
+	vpand	%xmm3,%xmm4,%xmm4
+
+	vpxor	%xmm1,%xmm9,%xmm7
+
+	vpsrld	$22,%xmm10,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm10,%xmm2
+	vpxor	%xmm4,%xmm11,%xmm9
+	vpaddd	%xmm5,%xmm13,%xmm13
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm5,%xmm9,%xmm9
+	vpaddd	%xmm7,%xmm9,%xmm9
+	vmovd	60(%r8),%xmm5
+	leaq	64(%r8),%r8
+	vmovd	60(%r9),%xmm0
+	leaq	64(%r9),%r9
+	vpinsrd	$1,60(%r10),%xmm5,%xmm5
+	leaq	64(%r10),%r10
+	vpinsrd	$1,60(%r11),%xmm0,%xmm0
+	leaq	64(%r11),%r11
+	vpunpckldq	%xmm0,%xmm5,%xmm5
+	vpshufb	%xmm6,%xmm5,%xmm5
+	vpsrld	$6,%xmm13,%xmm7
+	vpslld	$26,%xmm13,%xmm2
+	vmovdqu	%xmm5,240-128(%rax)
+	vpaddd	%xmm8,%xmm5,%xmm5
+
+	vpsrld	$11,%xmm13,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm13,%xmm2
+	vpaddd	96(%rbp),%xmm5,%xmm5
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm13,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	prefetcht0	63(%r8)
+	vpslld	$7,%xmm13,%xmm2
+	vpandn	%xmm15,%xmm13,%xmm0
+	vpand	%xmm14,%xmm13,%xmm4
+	prefetcht0	63(%r9)
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm9,%xmm8
+	vpxor	%xmm2,%xmm7,%xmm7
+	prefetcht0	63(%r10)
+	vpslld	$30,%xmm9,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm9,%xmm10,%xmm4
+	prefetcht0	63(%r11)
+	vpxor	%xmm1,%xmm8,%xmm8
+	vpaddd	%xmm7,%xmm5,%xmm5
+
+	vpsrld	$13,%xmm9,%xmm1
+
+	vpslld	$19,%xmm9,%xmm2
+	vpaddd	%xmm0,%xmm5,%xmm5
+	vpand	%xmm4,%xmm3,%xmm3
+
+	vpxor	%xmm1,%xmm8,%xmm7
+
+	vpsrld	$22,%xmm9,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm9,%xmm2
+	vpxor	%xmm3,%xmm10,%xmm8
+	vpaddd	%xmm5,%xmm12,%xmm12
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm5,%xmm8,%xmm8
+	vpaddd	%xmm7,%xmm8,%xmm8
+	addq	$256,%rbp
+	vmovdqu	0-128(%rax),%xmm5
+	movl	$3,%ecx
+	jmp	.Loop_16_xx_avx
+.align	32
+.Loop_16_xx_avx:
+	vmovdqu	16-128(%rax),%xmm6
+	vpaddd	144-128(%rax),%xmm5,%xmm5
+
+	vpsrld	$3,%xmm6,%xmm7
+	vpsrld	$7,%xmm6,%xmm1
+	vpslld	$25,%xmm6,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$18,%xmm6,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$14,%xmm6,%xmm2
+	vmovdqu	224-128(%rax),%xmm0
+	vpsrld	$10,%xmm0,%xmm3
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$17,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$15,%xmm0,%xmm2
+	vpaddd	%xmm7,%xmm5,%xmm5
+	vpxor	%xmm1,%xmm3,%xmm7
+	vpsrld	$19,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$13,%xmm0,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpaddd	%xmm7,%xmm5,%xmm5
+	vpsrld	$6,%xmm12,%xmm7
+	vpslld	$26,%xmm12,%xmm2
+	vmovdqu	%xmm5,0-128(%rax)
+	vpaddd	%xmm15,%xmm5,%xmm5
+
+	vpsrld	$11,%xmm12,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm12,%xmm2
+	vpaddd	-128(%rbp),%xmm5,%xmm5
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm12,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm12,%xmm2
+	vpandn	%xmm14,%xmm12,%xmm0
+	vpand	%xmm13,%xmm12,%xmm3
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm8,%xmm15
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm8,%xmm1
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm8,%xmm9,%xmm3
+
+	vpxor	%xmm1,%xmm15,%xmm15
+	vpaddd	%xmm7,%xmm5,%xmm5
+
+	vpsrld	$13,%xmm8,%xmm1
+
+	vpslld	$19,%xmm8,%xmm2
+	vpaddd	%xmm0,%xmm5,%xmm5
+	vpand	%xmm3,%xmm4,%xmm4
+
+	vpxor	%xmm1,%xmm15,%xmm7
+
+	vpsrld	$22,%xmm8,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm8,%xmm2
+	vpxor	%xmm4,%xmm9,%xmm15
+	vpaddd	%xmm5,%xmm11,%xmm11
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm5,%xmm15,%xmm15
+	vpaddd	%xmm7,%xmm15,%xmm15
+	vmovdqu	32-128(%rax),%xmm5
+	vpaddd	160-128(%rax),%xmm6,%xmm6
+
+	vpsrld	$3,%xmm5,%xmm7
+	vpsrld	$7,%xmm5,%xmm1
+	vpslld	$25,%xmm5,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$18,%xmm5,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$14,%xmm5,%xmm2
+	vmovdqu	240-128(%rax),%xmm0
+	vpsrld	$10,%xmm0,%xmm4
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$17,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$15,%xmm0,%xmm2
+	vpaddd	%xmm7,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm4,%xmm7
+	vpsrld	$19,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$13,%xmm0,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpaddd	%xmm7,%xmm6,%xmm6
+	vpsrld	$6,%xmm11,%xmm7
+	vpslld	$26,%xmm11,%xmm2
+	vmovdqu	%xmm6,16-128(%rax)
+	vpaddd	%xmm14,%xmm6,%xmm6
+
+	vpsrld	$11,%xmm11,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm11,%xmm2
+	vpaddd	-96(%rbp),%xmm6,%xmm6
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm11,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm11,%xmm2
+	vpandn	%xmm13,%xmm11,%xmm0
+	vpand	%xmm12,%xmm11,%xmm4
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm15,%xmm14
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm15,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm4
+
+	vpxor	%xmm1,%xmm14,%xmm14
+	vpaddd	%xmm7,%xmm6,%xmm6
+
+	vpsrld	$13,%xmm15,%xmm1
+
+	vpslld	$19,%xmm15,%xmm2
+	vpaddd	%xmm0,%xmm6,%xmm6
+	vpand	%xmm4,%xmm3,%xmm3
+
+	vpxor	%xmm1,%xmm14,%xmm7
+
+	vpsrld	$22,%xmm15,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm15,%xmm2
+	vpxor	%xmm3,%xmm8,%xmm14
+	vpaddd	%xmm6,%xmm10,%xmm10
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm6,%xmm14,%xmm14
+	vpaddd	%xmm7,%xmm14,%xmm14
+	vmovdqu	48-128(%rax),%xmm6
+	vpaddd	176-128(%rax),%xmm5,%xmm5
+
+	vpsrld	$3,%xmm6,%xmm7
+	vpsrld	$7,%xmm6,%xmm1
+	vpslld	$25,%xmm6,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$18,%xmm6,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$14,%xmm6,%xmm2
+	vmovdqu	0-128(%rax),%xmm0
+	vpsrld	$10,%xmm0,%xmm3
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$17,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$15,%xmm0,%xmm2
+	vpaddd	%xmm7,%xmm5,%xmm5
+	vpxor	%xmm1,%xmm3,%xmm7
+	vpsrld	$19,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$13,%xmm0,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpaddd	%xmm7,%xmm5,%xmm5
+	vpsrld	$6,%xmm10,%xmm7
+	vpslld	$26,%xmm10,%xmm2
+	vmovdqu	%xmm5,32-128(%rax)
+	vpaddd	%xmm13,%xmm5,%xmm5
+
+	vpsrld	$11,%xmm10,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm10,%xmm2
+	vpaddd	-64(%rbp),%xmm5,%xmm5
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm10,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm10,%xmm2
+	vpandn	%xmm12,%xmm10,%xmm0
+	vpand	%xmm11,%xmm10,%xmm3
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm14,%xmm13
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm14,%xmm1
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm14,%xmm15,%xmm3
+
+	vpxor	%xmm1,%xmm13,%xmm13
+	vpaddd	%xmm7,%xmm5,%xmm5
+
+	vpsrld	$13,%xmm14,%xmm1
+
+	vpslld	$19,%xmm14,%xmm2
+	vpaddd	%xmm0,%xmm5,%xmm5
+	vpand	%xmm3,%xmm4,%xmm4
+
+	vpxor	%xmm1,%xmm13,%xmm7
+
+	vpsrld	$22,%xmm14,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm14,%xmm2
+	vpxor	%xmm4,%xmm15,%xmm13
+	vpaddd	%xmm5,%xmm9,%xmm9
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm5,%xmm13,%xmm13
+	vpaddd	%xmm7,%xmm13,%xmm13
+	vmovdqu	64-128(%rax),%xmm5
+	vpaddd	192-128(%rax),%xmm6,%xmm6
+
+	vpsrld	$3,%xmm5,%xmm7
+	vpsrld	$7,%xmm5,%xmm1
+	vpslld	$25,%xmm5,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$18,%xmm5,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$14,%xmm5,%xmm2
+	vmovdqu	16-128(%rax),%xmm0
+	vpsrld	$10,%xmm0,%xmm4
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$17,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$15,%xmm0,%xmm2
+	vpaddd	%xmm7,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm4,%xmm7
+	vpsrld	$19,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$13,%xmm0,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpaddd	%xmm7,%xmm6,%xmm6
+	vpsrld	$6,%xmm9,%xmm7
+	vpslld	$26,%xmm9,%xmm2
+	vmovdqu	%xmm6,48-128(%rax)
+	vpaddd	%xmm12,%xmm6,%xmm6
+
+	vpsrld	$11,%xmm9,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm9,%xmm2
+	vpaddd	-32(%rbp),%xmm6,%xmm6
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm9,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm9,%xmm2
+	vpandn	%xmm11,%xmm9,%xmm0
+	vpand	%xmm10,%xmm9,%xmm4
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm13,%xmm12
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm13,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm13,%xmm14,%xmm4
+
+	vpxor	%xmm1,%xmm12,%xmm12
+	vpaddd	%xmm7,%xmm6,%xmm6
+
+	vpsrld	$13,%xmm13,%xmm1
+
+	vpslld	$19,%xmm13,%xmm2
+	vpaddd	%xmm0,%xmm6,%xmm6
+	vpand	%xmm4,%xmm3,%xmm3
+
+	vpxor	%xmm1,%xmm12,%xmm7
+
+	vpsrld	$22,%xmm13,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm13,%xmm2
+	vpxor	%xmm3,%xmm14,%xmm12
+	vpaddd	%xmm6,%xmm8,%xmm8
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm6,%xmm12,%xmm12
+	vpaddd	%xmm7,%xmm12,%xmm12
+	vmovdqu	80-128(%rax),%xmm6
+	vpaddd	208-128(%rax),%xmm5,%xmm5
+
+	vpsrld	$3,%xmm6,%xmm7
+	vpsrld	$7,%xmm6,%xmm1
+	vpslld	$25,%xmm6,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$18,%xmm6,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$14,%xmm6,%xmm2
+	vmovdqu	32-128(%rax),%xmm0
+	vpsrld	$10,%xmm0,%xmm3
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$17,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$15,%xmm0,%xmm2
+	vpaddd	%xmm7,%xmm5,%xmm5
+	vpxor	%xmm1,%xmm3,%xmm7
+	vpsrld	$19,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$13,%xmm0,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpaddd	%xmm7,%xmm5,%xmm5
+	vpsrld	$6,%xmm8,%xmm7
+	vpslld	$26,%xmm8,%xmm2
+	vmovdqu	%xmm5,64-128(%rax)
+	vpaddd	%xmm11,%xmm5,%xmm5
+
+	vpsrld	$11,%xmm8,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm8,%xmm2
+	vpaddd	0(%rbp),%xmm5,%xmm5
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm8,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm8,%xmm2
+	vpandn	%xmm10,%xmm8,%xmm0
+	vpand	%xmm9,%xmm8,%xmm3
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm12,%xmm11
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm12,%xmm1
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm12,%xmm13,%xmm3
+
+	vpxor	%xmm1,%xmm11,%xmm11
+	vpaddd	%xmm7,%xmm5,%xmm5
+
+	vpsrld	$13,%xmm12,%xmm1
+
+	vpslld	$19,%xmm12,%xmm2
+	vpaddd	%xmm0,%xmm5,%xmm5
+	vpand	%xmm3,%xmm4,%xmm4
+
+	vpxor	%xmm1,%xmm11,%xmm7
+
+	vpsrld	$22,%xmm12,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm12,%xmm2
+	vpxor	%xmm4,%xmm13,%xmm11
+	vpaddd	%xmm5,%xmm15,%xmm15
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm5,%xmm11,%xmm11
+	vpaddd	%xmm7,%xmm11,%xmm11
+	vmovdqu	96-128(%rax),%xmm5
+	vpaddd	224-128(%rax),%xmm6,%xmm6
+
+	vpsrld	$3,%xmm5,%xmm7
+	vpsrld	$7,%xmm5,%xmm1
+	vpslld	$25,%xmm5,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$18,%xmm5,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$14,%xmm5,%xmm2
+	vmovdqu	48-128(%rax),%xmm0
+	vpsrld	$10,%xmm0,%xmm4
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$17,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$15,%xmm0,%xmm2
+	vpaddd	%xmm7,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm4,%xmm7
+	vpsrld	$19,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$13,%xmm0,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpaddd	%xmm7,%xmm6,%xmm6
+	vpsrld	$6,%xmm15,%xmm7
+	vpslld	$26,%xmm15,%xmm2
+	vmovdqu	%xmm6,80-128(%rax)
+	vpaddd	%xmm10,%xmm6,%xmm6
+
+	vpsrld	$11,%xmm15,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm15,%xmm2
+	vpaddd	32(%rbp),%xmm6,%xmm6
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm15,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm15,%xmm2
+	vpandn	%xmm9,%xmm15,%xmm0
+	vpand	%xmm8,%xmm15,%xmm4
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm11,%xmm10
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm11,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm11,%xmm12,%xmm4
+
+	vpxor	%xmm1,%xmm10,%xmm10
+	vpaddd	%xmm7,%xmm6,%xmm6
+
+	vpsrld	$13,%xmm11,%xmm1
+
+	vpslld	$19,%xmm11,%xmm2
+	vpaddd	%xmm0,%xmm6,%xmm6
+	vpand	%xmm4,%xmm3,%xmm3
+
+	vpxor	%xmm1,%xmm10,%xmm7
+
+	vpsrld	$22,%xmm11,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm11,%xmm2
+	vpxor	%xmm3,%xmm12,%xmm10
+	vpaddd	%xmm6,%xmm14,%xmm14
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm6,%xmm10,%xmm10
+	vpaddd	%xmm7,%xmm10,%xmm10
+	vmovdqu	112-128(%rax),%xmm6
+	vpaddd	240-128(%rax),%xmm5,%xmm5
+
+	vpsrld	$3,%xmm6,%xmm7
+	vpsrld	$7,%xmm6,%xmm1
+	vpslld	$25,%xmm6,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$18,%xmm6,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$14,%xmm6,%xmm2
+	vmovdqu	64-128(%rax),%xmm0
+	vpsrld	$10,%xmm0,%xmm3
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$17,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$15,%xmm0,%xmm2
+	vpaddd	%xmm7,%xmm5,%xmm5
+	vpxor	%xmm1,%xmm3,%xmm7
+	vpsrld	$19,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$13,%xmm0,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpaddd	%xmm7,%xmm5,%xmm5
+	vpsrld	$6,%xmm14,%xmm7
+	vpslld	$26,%xmm14,%xmm2
+	vmovdqu	%xmm5,96-128(%rax)
+	vpaddd	%xmm9,%xmm5,%xmm5
+
+	vpsrld	$11,%xmm14,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm14,%xmm2
+	vpaddd	64(%rbp),%xmm5,%xmm5
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm14,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm14,%xmm2
+	vpandn	%xmm8,%xmm14,%xmm0
+	vpand	%xmm15,%xmm14,%xmm3
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm10,%xmm9
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm10,%xmm1
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm10,%xmm11,%xmm3
+
+	vpxor	%xmm1,%xmm9,%xmm9
+	vpaddd	%xmm7,%xmm5,%xmm5
+
+	vpsrld	$13,%xmm10,%xmm1
+
+	vpslld	$19,%xmm10,%xmm2
+	vpaddd	%xmm0,%xmm5,%xmm5
+	vpand	%xmm3,%xmm4,%xmm4
+
+	vpxor	%xmm1,%xmm9,%xmm7
+
+	vpsrld	$22,%xmm10,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm10,%xmm2
+	vpxor	%xmm4,%xmm11,%xmm9
+	vpaddd	%xmm5,%xmm13,%xmm13
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm5,%xmm9,%xmm9
+	vpaddd	%xmm7,%xmm9,%xmm9
+	vmovdqu	128-128(%rax),%xmm5
+	vpaddd	0-128(%rax),%xmm6,%xmm6
+
+	vpsrld	$3,%xmm5,%xmm7
+	vpsrld	$7,%xmm5,%xmm1
+	vpslld	$25,%xmm5,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$18,%xmm5,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$14,%xmm5,%xmm2
+	vmovdqu	80-128(%rax),%xmm0
+	vpsrld	$10,%xmm0,%xmm4
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$17,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$15,%xmm0,%xmm2
+	vpaddd	%xmm7,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm4,%xmm7
+	vpsrld	$19,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$13,%xmm0,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpaddd	%xmm7,%xmm6,%xmm6
+	vpsrld	$6,%xmm13,%xmm7
+	vpslld	$26,%xmm13,%xmm2
+	vmovdqu	%xmm6,112-128(%rax)
+	vpaddd	%xmm8,%xmm6,%xmm6
+
+	vpsrld	$11,%xmm13,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm13,%xmm2
+	vpaddd	96(%rbp),%xmm6,%xmm6
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm13,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm13,%xmm2
+	vpandn	%xmm15,%xmm13,%xmm0
+	vpand	%xmm14,%xmm13,%xmm4
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm9,%xmm8
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm9,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm9,%xmm10,%xmm4
+
+	vpxor	%xmm1,%xmm8,%xmm8
+	vpaddd	%xmm7,%xmm6,%xmm6
+
+	vpsrld	$13,%xmm9,%xmm1
+
+	vpslld	$19,%xmm9,%xmm2
+	vpaddd	%xmm0,%xmm6,%xmm6
+	vpand	%xmm4,%xmm3,%xmm3
+
+	vpxor	%xmm1,%xmm8,%xmm7
+
+	vpsrld	$22,%xmm9,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm9,%xmm2
+	vpxor	%xmm3,%xmm10,%xmm8
+	vpaddd	%xmm6,%xmm12,%xmm12
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm6,%xmm8,%xmm8
+	vpaddd	%xmm7,%xmm8,%xmm8
+	addq	$256,%rbp
+	vmovdqu	144-128(%rax),%xmm6
+	vpaddd	16-128(%rax),%xmm5,%xmm5
+
+	vpsrld	$3,%xmm6,%xmm7
+	vpsrld	$7,%xmm6,%xmm1
+	vpslld	$25,%xmm6,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$18,%xmm6,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$14,%xmm6,%xmm2
+	vmovdqu	96-128(%rax),%xmm0
+	vpsrld	$10,%xmm0,%xmm3
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$17,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$15,%xmm0,%xmm2
+	vpaddd	%xmm7,%xmm5,%xmm5
+	vpxor	%xmm1,%xmm3,%xmm7
+	vpsrld	$19,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$13,%xmm0,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpaddd	%xmm7,%xmm5,%xmm5
+	vpsrld	$6,%xmm12,%xmm7
+	vpslld	$26,%xmm12,%xmm2
+	vmovdqu	%xmm5,128-128(%rax)
+	vpaddd	%xmm15,%xmm5,%xmm5
+
+	vpsrld	$11,%xmm12,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm12,%xmm2
+	vpaddd	-128(%rbp),%xmm5,%xmm5
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm12,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm12,%xmm2
+	vpandn	%xmm14,%xmm12,%xmm0
+	vpand	%xmm13,%xmm12,%xmm3
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm8,%xmm15
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm8,%xmm1
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm8,%xmm9,%xmm3
+
+	vpxor	%xmm1,%xmm15,%xmm15
+	vpaddd	%xmm7,%xmm5,%xmm5
+
+	vpsrld	$13,%xmm8,%xmm1
+
+	vpslld	$19,%xmm8,%xmm2
+	vpaddd	%xmm0,%xmm5,%xmm5
+	vpand	%xmm3,%xmm4,%xmm4
+
+	vpxor	%xmm1,%xmm15,%xmm7
+
+	vpsrld	$22,%xmm8,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm8,%xmm2
+	vpxor	%xmm4,%xmm9,%xmm15
+	vpaddd	%xmm5,%xmm11,%xmm11
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm5,%xmm15,%xmm15
+	vpaddd	%xmm7,%xmm15,%xmm15
+	vmovdqu	160-128(%rax),%xmm5
+	vpaddd	32-128(%rax),%xmm6,%xmm6
+
+	vpsrld	$3,%xmm5,%xmm7
+	vpsrld	$7,%xmm5,%xmm1
+	vpslld	$25,%xmm5,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$18,%xmm5,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$14,%xmm5,%xmm2
+	vmovdqu	112-128(%rax),%xmm0
+	vpsrld	$10,%xmm0,%xmm4
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$17,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$15,%xmm0,%xmm2
+	vpaddd	%xmm7,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm4,%xmm7
+	vpsrld	$19,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$13,%xmm0,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpaddd	%xmm7,%xmm6,%xmm6
+	vpsrld	$6,%xmm11,%xmm7
+	vpslld	$26,%xmm11,%xmm2
+	vmovdqu	%xmm6,144-128(%rax)
+	vpaddd	%xmm14,%xmm6,%xmm6
+
+	vpsrld	$11,%xmm11,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm11,%xmm2
+	vpaddd	-96(%rbp),%xmm6,%xmm6
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm11,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm11,%xmm2
+	vpandn	%xmm13,%xmm11,%xmm0
+	vpand	%xmm12,%xmm11,%xmm4
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm15,%xmm14
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm15,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm4
+
+	vpxor	%xmm1,%xmm14,%xmm14
+	vpaddd	%xmm7,%xmm6,%xmm6
+
+	vpsrld	$13,%xmm15,%xmm1
+
+	vpslld	$19,%xmm15,%xmm2
+	vpaddd	%xmm0,%xmm6,%xmm6
+	vpand	%xmm4,%xmm3,%xmm3
+
+	vpxor	%xmm1,%xmm14,%xmm7
+
+	vpsrld	$22,%xmm15,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm15,%xmm2
+	vpxor	%xmm3,%xmm8,%xmm14
+	vpaddd	%xmm6,%xmm10,%xmm10
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm6,%xmm14,%xmm14
+	vpaddd	%xmm7,%xmm14,%xmm14
+	vmovdqu	176-128(%rax),%xmm6
+	vpaddd	48-128(%rax),%xmm5,%xmm5
+
+	vpsrld	$3,%xmm6,%xmm7
+	vpsrld	$7,%xmm6,%xmm1
+	vpslld	$25,%xmm6,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$18,%xmm6,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$14,%xmm6,%xmm2
+	vmovdqu	128-128(%rax),%xmm0
+	vpsrld	$10,%xmm0,%xmm3
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$17,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$15,%xmm0,%xmm2
+	vpaddd	%xmm7,%xmm5,%xmm5
+	vpxor	%xmm1,%xmm3,%xmm7
+	vpsrld	$19,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$13,%xmm0,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpaddd	%xmm7,%xmm5,%xmm5
+	vpsrld	$6,%xmm10,%xmm7
+	vpslld	$26,%xmm10,%xmm2
+	vmovdqu	%xmm5,160-128(%rax)
+	vpaddd	%xmm13,%xmm5,%xmm5
+
+	vpsrld	$11,%xmm10,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm10,%xmm2
+	vpaddd	-64(%rbp),%xmm5,%xmm5
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm10,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm10,%xmm2
+	vpandn	%xmm12,%xmm10,%xmm0
+	vpand	%xmm11,%xmm10,%xmm3
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm14,%xmm13
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm14,%xmm1
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm14,%xmm15,%xmm3
+
+	vpxor	%xmm1,%xmm13,%xmm13
+	vpaddd	%xmm7,%xmm5,%xmm5
+
+	vpsrld	$13,%xmm14,%xmm1
+
+	vpslld	$19,%xmm14,%xmm2
+	vpaddd	%xmm0,%xmm5,%xmm5
+	vpand	%xmm3,%xmm4,%xmm4
+
+	vpxor	%xmm1,%xmm13,%xmm7
+
+	vpsrld	$22,%xmm14,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm14,%xmm2
+	vpxor	%xmm4,%xmm15,%xmm13
+	vpaddd	%xmm5,%xmm9,%xmm9
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm5,%xmm13,%xmm13
+	vpaddd	%xmm7,%xmm13,%xmm13
+	vmovdqu	192-128(%rax),%xmm5
+	vpaddd	64-128(%rax),%xmm6,%xmm6
+
+	vpsrld	$3,%xmm5,%xmm7
+	vpsrld	$7,%xmm5,%xmm1
+	vpslld	$25,%xmm5,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$18,%xmm5,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$14,%xmm5,%xmm2
+	vmovdqu	144-128(%rax),%xmm0
+	vpsrld	$10,%xmm0,%xmm4
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$17,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$15,%xmm0,%xmm2
+	vpaddd	%xmm7,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm4,%xmm7
+	vpsrld	$19,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$13,%xmm0,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpaddd	%xmm7,%xmm6,%xmm6
+	vpsrld	$6,%xmm9,%xmm7
+	vpslld	$26,%xmm9,%xmm2
+	vmovdqu	%xmm6,176-128(%rax)
+	vpaddd	%xmm12,%xmm6,%xmm6
+
+	vpsrld	$11,%xmm9,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm9,%xmm2
+	vpaddd	-32(%rbp),%xmm6,%xmm6
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm9,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm9,%xmm2
+	vpandn	%xmm11,%xmm9,%xmm0
+	vpand	%xmm10,%xmm9,%xmm4
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm13,%xmm12
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm13,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm13,%xmm14,%xmm4
+
+	vpxor	%xmm1,%xmm12,%xmm12
+	vpaddd	%xmm7,%xmm6,%xmm6
+
+	vpsrld	$13,%xmm13,%xmm1
+
+	vpslld	$19,%xmm13,%xmm2
+	vpaddd	%xmm0,%xmm6,%xmm6
+	vpand	%xmm4,%xmm3,%xmm3
+
+	vpxor	%xmm1,%xmm12,%xmm7
+
+	vpsrld	$22,%xmm13,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm13,%xmm2
+	vpxor	%xmm3,%xmm14,%xmm12
+	vpaddd	%xmm6,%xmm8,%xmm8
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm6,%xmm12,%xmm12
+	vpaddd	%xmm7,%xmm12,%xmm12
+	vmovdqu	208-128(%rax),%xmm6
+	vpaddd	80-128(%rax),%xmm5,%xmm5
+
+	vpsrld	$3,%xmm6,%xmm7
+	vpsrld	$7,%xmm6,%xmm1
+	vpslld	$25,%xmm6,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$18,%xmm6,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$14,%xmm6,%xmm2
+	vmovdqu	160-128(%rax),%xmm0
+	vpsrld	$10,%xmm0,%xmm3
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$17,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$15,%xmm0,%xmm2
+	vpaddd	%xmm7,%xmm5,%xmm5
+	vpxor	%xmm1,%xmm3,%xmm7
+	vpsrld	$19,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$13,%xmm0,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpaddd	%xmm7,%xmm5,%xmm5
+	vpsrld	$6,%xmm8,%xmm7
+	vpslld	$26,%xmm8,%xmm2
+	vmovdqu	%xmm5,192-128(%rax)
+	vpaddd	%xmm11,%xmm5,%xmm5
+
+	vpsrld	$11,%xmm8,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm8,%xmm2
+	vpaddd	0(%rbp),%xmm5,%xmm5
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm8,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm8,%xmm2
+	vpandn	%xmm10,%xmm8,%xmm0
+	vpand	%xmm9,%xmm8,%xmm3
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm12,%xmm11
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm12,%xmm1
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm12,%xmm13,%xmm3
+
+	vpxor	%xmm1,%xmm11,%xmm11
+	vpaddd	%xmm7,%xmm5,%xmm5
+
+	vpsrld	$13,%xmm12,%xmm1
+
+	vpslld	$19,%xmm12,%xmm2
+	vpaddd	%xmm0,%xmm5,%xmm5
+	vpand	%xmm3,%xmm4,%xmm4
+
+	vpxor	%xmm1,%xmm11,%xmm7
+
+	vpsrld	$22,%xmm12,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm12,%xmm2
+	vpxor	%xmm4,%xmm13,%xmm11
+	vpaddd	%xmm5,%xmm15,%xmm15
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm5,%xmm11,%xmm11
+	vpaddd	%xmm7,%xmm11,%xmm11
+	vmovdqu	224-128(%rax),%xmm5
+	vpaddd	96-128(%rax),%xmm6,%xmm6
+
+	vpsrld	$3,%xmm5,%xmm7
+	vpsrld	$7,%xmm5,%xmm1
+	vpslld	$25,%xmm5,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$18,%xmm5,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$14,%xmm5,%xmm2
+	vmovdqu	176-128(%rax),%xmm0
+	vpsrld	$10,%xmm0,%xmm4
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$17,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$15,%xmm0,%xmm2
+	vpaddd	%xmm7,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm4,%xmm7
+	vpsrld	$19,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$13,%xmm0,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpaddd	%xmm7,%xmm6,%xmm6
+	vpsrld	$6,%xmm15,%xmm7
+	vpslld	$26,%xmm15,%xmm2
+	vmovdqu	%xmm6,208-128(%rax)
+	vpaddd	%xmm10,%xmm6,%xmm6
+
+	vpsrld	$11,%xmm15,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm15,%xmm2
+	vpaddd	32(%rbp),%xmm6,%xmm6
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm15,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm15,%xmm2
+	vpandn	%xmm9,%xmm15,%xmm0
+	vpand	%xmm8,%xmm15,%xmm4
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm11,%xmm10
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm11,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm11,%xmm12,%xmm4
+
+	vpxor	%xmm1,%xmm10,%xmm10
+	vpaddd	%xmm7,%xmm6,%xmm6
+
+	vpsrld	$13,%xmm11,%xmm1
+
+	vpslld	$19,%xmm11,%xmm2
+	vpaddd	%xmm0,%xmm6,%xmm6
+	vpand	%xmm4,%xmm3,%xmm3
+
+	vpxor	%xmm1,%xmm10,%xmm7
+
+	vpsrld	$22,%xmm11,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm11,%xmm2
+	vpxor	%xmm3,%xmm12,%xmm10
+	vpaddd	%xmm6,%xmm14,%xmm14
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm6,%xmm10,%xmm10
+	vpaddd	%xmm7,%xmm10,%xmm10
+	vmovdqu	240-128(%rax),%xmm6
+	vpaddd	112-128(%rax),%xmm5,%xmm5
+
+	vpsrld	$3,%xmm6,%xmm7
+	vpsrld	$7,%xmm6,%xmm1
+	vpslld	$25,%xmm6,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$18,%xmm6,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$14,%xmm6,%xmm2
+	vmovdqu	192-128(%rax),%xmm0
+	vpsrld	$10,%xmm0,%xmm3
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$17,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$15,%xmm0,%xmm2
+	vpaddd	%xmm7,%xmm5,%xmm5
+	vpxor	%xmm1,%xmm3,%xmm7
+	vpsrld	$19,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$13,%xmm0,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpaddd	%xmm7,%xmm5,%xmm5
+	vpsrld	$6,%xmm14,%xmm7
+	vpslld	$26,%xmm14,%xmm2
+	vmovdqu	%xmm5,224-128(%rax)
+	vpaddd	%xmm9,%xmm5,%xmm5
+
+	vpsrld	$11,%xmm14,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm14,%xmm2
+	vpaddd	64(%rbp),%xmm5,%xmm5
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm14,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm14,%xmm2
+	vpandn	%xmm8,%xmm14,%xmm0
+	vpand	%xmm15,%xmm14,%xmm3
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm10,%xmm9
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm10,%xmm1
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm10,%xmm11,%xmm3
+
+	vpxor	%xmm1,%xmm9,%xmm9
+	vpaddd	%xmm7,%xmm5,%xmm5
+
+	vpsrld	$13,%xmm10,%xmm1
+
+	vpslld	$19,%xmm10,%xmm2
+	vpaddd	%xmm0,%xmm5,%xmm5
+	vpand	%xmm3,%xmm4,%xmm4
+
+	vpxor	%xmm1,%xmm9,%xmm7
+
+	vpsrld	$22,%xmm10,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm10,%xmm2
+	vpxor	%xmm4,%xmm11,%xmm9
+	vpaddd	%xmm5,%xmm13,%xmm13
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm5,%xmm9,%xmm9
+	vpaddd	%xmm7,%xmm9,%xmm9
+	vmovdqu	0-128(%rax),%xmm5
+	vpaddd	128-128(%rax),%xmm6,%xmm6
+
+	vpsrld	$3,%xmm5,%xmm7
+	vpsrld	$7,%xmm5,%xmm1
+	vpslld	$25,%xmm5,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$18,%xmm5,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$14,%xmm5,%xmm2
+	vmovdqu	208-128(%rax),%xmm0
+	vpsrld	$10,%xmm0,%xmm4
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpsrld	$17,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$15,%xmm0,%xmm2
+	vpaddd	%xmm7,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm4,%xmm7
+	vpsrld	$19,%xmm0,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$13,%xmm0,%xmm2
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpaddd	%xmm7,%xmm6,%xmm6
+	vpsrld	$6,%xmm13,%xmm7
+	vpslld	$26,%xmm13,%xmm2
+	vmovdqu	%xmm6,240-128(%rax)
+	vpaddd	%xmm8,%xmm6,%xmm6
+
+	vpsrld	$11,%xmm13,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpslld	$21,%xmm13,%xmm2
+	vpaddd	96(%rbp),%xmm6,%xmm6
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$25,%xmm13,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$7,%xmm13,%xmm2
+	vpandn	%xmm15,%xmm13,%xmm0
+	vpand	%xmm14,%xmm13,%xmm4
+
+	vpxor	%xmm1,%xmm7,%xmm7
+
+	vpsrld	$2,%xmm9,%xmm8
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$30,%xmm9,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm9,%xmm10,%xmm4
+
+	vpxor	%xmm1,%xmm8,%xmm8
+	vpaddd	%xmm7,%xmm6,%xmm6
+
+	vpsrld	$13,%xmm9,%xmm1
+
+	vpslld	$19,%xmm9,%xmm2
+	vpaddd	%xmm0,%xmm6,%xmm6
+	vpand	%xmm4,%xmm3,%xmm3
+
+	vpxor	%xmm1,%xmm8,%xmm7
+
+	vpsrld	$22,%xmm9,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpslld	$10,%xmm9,%xmm2
+	vpxor	%xmm3,%xmm10,%xmm8
+	vpaddd	%xmm6,%xmm12,%xmm12
+
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm7,%xmm7
+
+	vpaddd	%xmm6,%xmm8,%xmm8
+	vpaddd	%xmm7,%xmm8,%xmm8
+	addq	$256,%rbp
+	decl	%ecx
+	jnz	.Loop_16_xx_avx
+
+	movl	$1,%ecx
+	leaq	K256+128(%rip),%rbp
+	cmpl	0(%rbx),%ecx
+	cmovgeq	%rbp,%r8
+	cmpl	4(%rbx),%ecx
+	cmovgeq	%rbp,%r9
+	cmpl	8(%rbx),%ecx
+	cmovgeq	%rbp,%r10
+	cmpl	12(%rbx),%ecx
+	cmovgeq	%rbp,%r11
+	vmovdqa	(%rbx),%xmm7
+	vpxor	%xmm0,%xmm0,%xmm0
+	vmovdqa	%xmm7,%xmm6
+	vpcmpgtd	%xmm0,%xmm6,%xmm6
+	vpaddd	%xmm6,%xmm7,%xmm7
+
+	vmovdqu	0-128(%rdi),%xmm0
+	vpand	%xmm6,%xmm8,%xmm8
+	vmovdqu	32-128(%rdi),%xmm1
+	vpand	%xmm6,%xmm9,%xmm9
+	vmovdqu	64-128(%rdi),%xmm2
+	vpand	%xmm6,%xmm10,%xmm10
+	vmovdqu	96-128(%rdi),%xmm5
+	vpand	%xmm6,%xmm11,%xmm11
+	vpaddd	%xmm0,%xmm8,%xmm8
+	vmovdqu	128-128(%rdi),%xmm0
+	vpand	%xmm6,%xmm12,%xmm12
+	vpaddd	%xmm1,%xmm9,%xmm9
+	vmovdqu	160-128(%rdi),%xmm1
+	vpand	%xmm6,%xmm13,%xmm13
+	vpaddd	%xmm2,%xmm10,%xmm10
+	vmovdqu	192-128(%rdi),%xmm2
+	vpand	%xmm6,%xmm14,%xmm14
+	vpaddd	%xmm5,%xmm11,%xmm11
+	vmovdqu	224-128(%rdi),%xmm5
+	vpand	%xmm6,%xmm15,%xmm15
+	vpaddd	%xmm0,%xmm12,%xmm12
+	vpaddd	%xmm1,%xmm13,%xmm13
+	vmovdqu	%xmm8,0-128(%rdi)
+	vpaddd	%xmm2,%xmm14,%xmm14
+	vmovdqu	%xmm9,32-128(%rdi)
+	vpaddd	%xmm5,%xmm15,%xmm15
+	vmovdqu	%xmm10,64-128(%rdi)
+	vmovdqu	%xmm11,96-128(%rdi)
+	vmovdqu	%xmm12,128-128(%rdi)
+	vmovdqu	%xmm13,160-128(%rdi)
+	vmovdqu	%xmm14,192-128(%rdi)
+	vmovdqu	%xmm15,224-128(%rdi)
+
+	vmovdqu	%xmm7,(%rbx)
+	vmovdqu	.Lpbswap(%rip),%xmm6
+	decl	%edx
+	jnz	.Loop_avx
+
+	movl	280(%rsp),%edx
+	leaq	16(%rdi),%rdi
+	leaq	64(%rsi),%rsi
+	decl	%edx
+	jnz	.Loop_grande_avx
+
+.Ldone_avx:
+	movq	272(%rsp),%rax
+	vzeroupper
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+.Lepilogue_avx:
+	.byte	0xf3,0xc3
+.size	sha256_multi_block_avx,.-sha256_multi_block_avx
+.type	sha256_multi_block_avx2,@function
+.align	32
+sha256_multi_block_avx2:
+_avx2_shortcut:
+	movq	%rsp,%rax
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	subq	$576,%rsp
+	andq	$-256,%rsp
+	movq	%rax,544(%rsp)
+.Lbody_avx2:
+	leaq	K256+128(%rip),%rbp
+	leaq	128(%rdi),%rdi
+
+.Loop_grande_avx2:
+	movl	%edx,552(%rsp)
+	xorl	%edx,%edx
+	leaq	512(%rsp),%rbx
+	movq	0(%rsi),%r12
+	movl	8(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,0(%rbx)
+	cmovleq	%rbp,%r12
+	movq	16(%rsi),%r13
+	movl	24(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,4(%rbx)
+	cmovleq	%rbp,%r13
+	movq	32(%rsi),%r14
+	movl	40(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,8(%rbx)
+	cmovleq	%rbp,%r14
+	movq	48(%rsi),%r15
+	movl	56(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,12(%rbx)
+	cmovleq	%rbp,%r15
+	movq	64(%rsi),%r8
+	movl	72(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,16(%rbx)
+	cmovleq	%rbp,%r8
+	movq	80(%rsi),%r9
+	movl	88(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,20(%rbx)
+	cmovleq	%rbp,%r9
+	movq	96(%rsi),%r10
+	movl	104(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,24(%rbx)
+	cmovleq	%rbp,%r10
+	movq	112(%rsi),%r11
+	movl	120(%rsi),%ecx
+	cmpl	%edx,%ecx
+	cmovgl	%ecx,%edx
+	testl	%ecx,%ecx
+	movl	%ecx,28(%rbx)
+	cmovleq	%rbp,%r11
+	vmovdqu	0-128(%rdi),%ymm8
+	leaq	128(%rsp),%rax
+	vmovdqu	32-128(%rdi),%ymm9
+	leaq	256+128(%rsp),%rbx
+	vmovdqu	64-128(%rdi),%ymm10
+	vmovdqu	96-128(%rdi),%ymm11
+	vmovdqu	128-128(%rdi),%ymm12
+	vmovdqu	160-128(%rdi),%ymm13
+	vmovdqu	192-128(%rdi),%ymm14
+	vmovdqu	224-128(%rdi),%ymm15
+	vmovdqu	.Lpbswap(%rip),%ymm6
+	jmp	.Loop_avx2
+
+.align	32
+.Loop_avx2:
+	vpxor	%ymm9,%ymm10,%ymm4
+	vmovd	0(%r12),%xmm5
+	vmovd	0(%r8),%xmm0
+	vmovd	0(%r13),%xmm1
+	vmovd	0(%r9),%xmm2
+	vpinsrd	$1,0(%r14),%xmm5,%xmm5
+	vpinsrd	$1,0(%r10),%xmm0,%xmm0
+	vpinsrd	$1,0(%r15),%xmm1,%xmm1
+	vpunpckldq	%ymm1,%ymm5,%ymm5
+	vpinsrd	$1,0(%r11),%xmm2,%xmm2
+	vpunpckldq	%ymm2,%ymm0,%ymm0
+	vinserti128	$1,%xmm0,%ymm5,%ymm5
+	vpshufb	%ymm6,%ymm5,%ymm5
+	vpsrld	$6,%ymm12,%ymm7
+	vpslld	$26,%ymm12,%ymm2
+	vmovdqu	%ymm5,0-128(%rax)
+	vpaddd	%ymm15,%ymm5,%ymm5
+
+	vpsrld	$11,%ymm12,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm12,%ymm2
+	vpaddd	-128(%rbp),%ymm5,%ymm5
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm12,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm12,%ymm2
+	vpandn	%ymm14,%ymm12,%ymm0
+	vpand	%ymm13,%ymm12,%ymm3
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm8,%ymm15
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm8,%ymm1
+	vpxor	%ymm3,%ymm0,%ymm0
+	vpxor	%ymm8,%ymm9,%ymm3
+
+	vpxor	%ymm1,%ymm15,%ymm15
+	vpaddd	%ymm7,%ymm5,%ymm5
+
+	vpsrld	$13,%ymm8,%ymm1
+
+	vpslld	$19,%ymm8,%ymm2
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpand	%ymm3,%ymm4,%ymm4
+
+	vpxor	%ymm1,%ymm15,%ymm7
+
+	vpsrld	$22,%ymm8,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm8,%ymm2
+	vpxor	%ymm4,%ymm9,%ymm15
+	vpaddd	%ymm5,%ymm11,%ymm11
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm5,%ymm15,%ymm15
+	vpaddd	%ymm7,%ymm15,%ymm15
+	vmovd	4(%r12),%xmm5
+	vmovd	4(%r8),%xmm0
+	vmovd	4(%r13),%xmm1
+	vmovd	4(%r9),%xmm2
+	vpinsrd	$1,4(%r14),%xmm5,%xmm5
+	vpinsrd	$1,4(%r10),%xmm0,%xmm0
+	vpinsrd	$1,4(%r15),%xmm1,%xmm1
+	vpunpckldq	%ymm1,%ymm5,%ymm5
+	vpinsrd	$1,4(%r11),%xmm2,%xmm2
+	vpunpckldq	%ymm2,%ymm0,%ymm0
+	vinserti128	$1,%xmm0,%ymm5,%ymm5
+	vpshufb	%ymm6,%ymm5,%ymm5
+	vpsrld	$6,%ymm11,%ymm7
+	vpslld	$26,%ymm11,%ymm2
+	vmovdqu	%ymm5,32-128(%rax)
+	vpaddd	%ymm14,%ymm5,%ymm5
+
+	vpsrld	$11,%ymm11,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm11,%ymm2
+	vpaddd	-96(%rbp),%ymm5,%ymm5
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm11,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm11,%ymm2
+	vpandn	%ymm13,%ymm11,%ymm0
+	vpand	%ymm12,%ymm11,%ymm4
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm15,%ymm14
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm15,%ymm1
+	vpxor	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm15,%ymm8,%ymm4
+
+	vpxor	%ymm1,%ymm14,%ymm14
+	vpaddd	%ymm7,%ymm5,%ymm5
+
+	vpsrld	$13,%ymm15,%ymm1
+
+	vpslld	$19,%ymm15,%ymm2
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpand	%ymm4,%ymm3,%ymm3
+
+	vpxor	%ymm1,%ymm14,%ymm7
+
+	vpsrld	$22,%ymm15,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm15,%ymm2
+	vpxor	%ymm3,%ymm8,%ymm14
+	vpaddd	%ymm5,%ymm10,%ymm10
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm5,%ymm14,%ymm14
+	vpaddd	%ymm7,%ymm14,%ymm14
+	vmovd	8(%r12),%xmm5
+	vmovd	8(%r8),%xmm0
+	vmovd	8(%r13),%xmm1
+	vmovd	8(%r9),%xmm2
+	vpinsrd	$1,8(%r14),%xmm5,%xmm5
+	vpinsrd	$1,8(%r10),%xmm0,%xmm0
+	vpinsrd	$1,8(%r15),%xmm1,%xmm1
+	vpunpckldq	%ymm1,%ymm5,%ymm5
+	vpinsrd	$1,8(%r11),%xmm2,%xmm2
+	vpunpckldq	%ymm2,%ymm0,%ymm0
+	vinserti128	$1,%xmm0,%ymm5,%ymm5
+	vpshufb	%ymm6,%ymm5,%ymm5
+	vpsrld	$6,%ymm10,%ymm7
+	vpslld	$26,%ymm10,%ymm2
+	vmovdqu	%ymm5,64-128(%rax)
+	vpaddd	%ymm13,%ymm5,%ymm5
+
+	vpsrld	$11,%ymm10,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm10,%ymm2
+	vpaddd	-64(%rbp),%ymm5,%ymm5
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm10,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm10,%ymm2
+	vpandn	%ymm12,%ymm10,%ymm0
+	vpand	%ymm11,%ymm10,%ymm3
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm14,%ymm13
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm14,%ymm1
+	vpxor	%ymm3,%ymm0,%ymm0
+	vpxor	%ymm14,%ymm15,%ymm3
+
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpaddd	%ymm7,%ymm5,%ymm5
+
+	vpsrld	$13,%ymm14,%ymm1
+
+	vpslld	$19,%ymm14,%ymm2
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpand	%ymm3,%ymm4,%ymm4
+
+	vpxor	%ymm1,%ymm13,%ymm7
+
+	vpsrld	$22,%ymm14,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm14,%ymm2
+	vpxor	%ymm4,%ymm15,%ymm13
+	vpaddd	%ymm5,%ymm9,%ymm9
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm5,%ymm13,%ymm13
+	vpaddd	%ymm7,%ymm13,%ymm13
+	vmovd	12(%r12),%xmm5
+	vmovd	12(%r8),%xmm0
+	vmovd	12(%r13),%xmm1
+	vmovd	12(%r9),%xmm2
+	vpinsrd	$1,12(%r14),%xmm5,%xmm5
+	vpinsrd	$1,12(%r10),%xmm0,%xmm0
+	vpinsrd	$1,12(%r15),%xmm1,%xmm1
+	vpunpckldq	%ymm1,%ymm5,%ymm5
+	vpinsrd	$1,12(%r11),%xmm2,%xmm2
+	vpunpckldq	%ymm2,%ymm0,%ymm0
+	vinserti128	$1,%xmm0,%ymm5,%ymm5
+	vpshufb	%ymm6,%ymm5,%ymm5
+	vpsrld	$6,%ymm9,%ymm7
+	vpslld	$26,%ymm9,%ymm2
+	vmovdqu	%ymm5,96-128(%rax)
+	vpaddd	%ymm12,%ymm5,%ymm5
+
+	vpsrld	$11,%ymm9,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm9,%ymm2
+	vpaddd	-32(%rbp),%ymm5,%ymm5
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm9,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm9,%ymm2
+	vpandn	%ymm11,%ymm9,%ymm0
+	vpand	%ymm10,%ymm9,%ymm4
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm13,%ymm12
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm13,%ymm1
+	vpxor	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm13,%ymm14,%ymm4
+
+	vpxor	%ymm1,%ymm12,%ymm12
+	vpaddd	%ymm7,%ymm5,%ymm5
+
+	vpsrld	$13,%ymm13,%ymm1
+
+	vpslld	$19,%ymm13,%ymm2
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpand	%ymm4,%ymm3,%ymm3
+
+	vpxor	%ymm1,%ymm12,%ymm7
+
+	vpsrld	$22,%ymm13,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm13,%ymm2
+	vpxor	%ymm3,%ymm14,%ymm12
+	vpaddd	%ymm5,%ymm8,%ymm8
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm5,%ymm12,%ymm12
+	vpaddd	%ymm7,%ymm12,%ymm12
+	vmovd	16(%r12),%xmm5
+	vmovd	16(%r8),%xmm0
+	vmovd	16(%r13),%xmm1
+	vmovd	16(%r9),%xmm2
+	vpinsrd	$1,16(%r14),%xmm5,%xmm5
+	vpinsrd	$1,16(%r10),%xmm0,%xmm0
+	vpinsrd	$1,16(%r15),%xmm1,%xmm1
+	vpunpckldq	%ymm1,%ymm5,%ymm5
+	vpinsrd	$1,16(%r11),%xmm2,%xmm2
+	vpunpckldq	%ymm2,%ymm0,%ymm0
+	vinserti128	$1,%xmm0,%ymm5,%ymm5
+	vpshufb	%ymm6,%ymm5,%ymm5
+	vpsrld	$6,%ymm8,%ymm7
+	vpslld	$26,%ymm8,%ymm2
+	vmovdqu	%ymm5,128-128(%rax)
+	vpaddd	%ymm11,%ymm5,%ymm5
+
+	vpsrld	$11,%ymm8,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm8,%ymm2
+	vpaddd	0(%rbp),%ymm5,%ymm5
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm8,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm8,%ymm2
+	vpandn	%ymm10,%ymm8,%ymm0
+	vpand	%ymm9,%ymm8,%ymm3
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm12,%ymm11
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm12,%ymm1
+	vpxor	%ymm3,%ymm0,%ymm0
+	vpxor	%ymm12,%ymm13,%ymm3
+
+	vpxor	%ymm1,%ymm11,%ymm11
+	vpaddd	%ymm7,%ymm5,%ymm5
+
+	vpsrld	$13,%ymm12,%ymm1
+
+	vpslld	$19,%ymm12,%ymm2
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpand	%ymm3,%ymm4,%ymm4
+
+	vpxor	%ymm1,%ymm11,%ymm7
+
+	vpsrld	$22,%ymm12,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm12,%ymm2
+	vpxor	%ymm4,%ymm13,%ymm11
+	vpaddd	%ymm5,%ymm15,%ymm15
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm5,%ymm11,%ymm11
+	vpaddd	%ymm7,%ymm11,%ymm11
+	vmovd	20(%r12),%xmm5
+	vmovd	20(%r8),%xmm0
+	vmovd	20(%r13),%xmm1
+	vmovd	20(%r9),%xmm2
+	vpinsrd	$1,20(%r14),%xmm5,%xmm5
+	vpinsrd	$1,20(%r10),%xmm0,%xmm0
+	vpinsrd	$1,20(%r15),%xmm1,%xmm1
+	vpunpckldq	%ymm1,%ymm5,%ymm5
+	vpinsrd	$1,20(%r11),%xmm2,%xmm2
+	vpunpckldq	%ymm2,%ymm0,%ymm0
+	vinserti128	$1,%xmm0,%ymm5,%ymm5
+	vpshufb	%ymm6,%ymm5,%ymm5
+	vpsrld	$6,%ymm15,%ymm7
+	vpslld	$26,%ymm15,%ymm2
+	vmovdqu	%ymm5,160-128(%rax)
+	vpaddd	%ymm10,%ymm5,%ymm5
+
+	vpsrld	$11,%ymm15,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm15,%ymm2
+	vpaddd	32(%rbp),%ymm5,%ymm5
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm15,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm15,%ymm2
+	vpandn	%ymm9,%ymm15,%ymm0
+	vpand	%ymm8,%ymm15,%ymm4
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm11,%ymm10
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm11,%ymm1
+	vpxor	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm11,%ymm12,%ymm4
+
+	vpxor	%ymm1,%ymm10,%ymm10
+	vpaddd	%ymm7,%ymm5,%ymm5
+
+	vpsrld	$13,%ymm11,%ymm1
+
+	vpslld	$19,%ymm11,%ymm2
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpand	%ymm4,%ymm3,%ymm3
+
+	vpxor	%ymm1,%ymm10,%ymm7
+
+	vpsrld	$22,%ymm11,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm11,%ymm2
+	vpxor	%ymm3,%ymm12,%ymm10
+	vpaddd	%ymm5,%ymm14,%ymm14
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm5,%ymm10,%ymm10
+	vpaddd	%ymm7,%ymm10,%ymm10
+	vmovd	24(%r12),%xmm5
+	vmovd	24(%r8),%xmm0
+	vmovd	24(%r13),%xmm1
+	vmovd	24(%r9),%xmm2
+	vpinsrd	$1,24(%r14),%xmm5,%xmm5
+	vpinsrd	$1,24(%r10),%xmm0,%xmm0
+	vpinsrd	$1,24(%r15),%xmm1,%xmm1
+	vpunpckldq	%ymm1,%ymm5,%ymm5
+	vpinsrd	$1,24(%r11),%xmm2,%xmm2
+	vpunpckldq	%ymm2,%ymm0,%ymm0
+	vinserti128	$1,%xmm0,%ymm5,%ymm5
+	vpshufb	%ymm6,%ymm5,%ymm5
+	vpsrld	$6,%ymm14,%ymm7
+	vpslld	$26,%ymm14,%ymm2
+	vmovdqu	%ymm5,192-128(%rax)
+	vpaddd	%ymm9,%ymm5,%ymm5
+
+	vpsrld	$11,%ymm14,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm14,%ymm2
+	vpaddd	64(%rbp),%ymm5,%ymm5
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm14,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm14,%ymm2
+	vpandn	%ymm8,%ymm14,%ymm0
+	vpand	%ymm15,%ymm14,%ymm3
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm10,%ymm9
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm10,%ymm1
+	vpxor	%ymm3,%ymm0,%ymm0
+	vpxor	%ymm10,%ymm11,%ymm3
+
+	vpxor	%ymm1,%ymm9,%ymm9
+	vpaddd	%ymm7,%ymm5,%ymm5
+
+	vpsrld	$13,%ymm10,%ymm1
+
+	vpslld	$19,%ymm10,%ymm2
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpand	%ymm3,%ymm4,%ymm4
+
+	vpxor	%ymm1,%ymm9,%ymm7
+
+	vpsrld	$22,%ymm10,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm10,%ymm2
+	vpxor	%ymm4,%ymm11,%ymm9
+	vpaddd	%ymm5,%ymm13,%ymm13
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm5,%ymm9,%ymm9
+	vpaddd	%ymm7,%ymm9,%ymm9
+	vmovd	28(%r12),%xmm5
+	vmovd	28(%r8),%xmm0
+	vmovd	28(%r13),%xmm1
+	vmovd	28(%r9),%xmm2
+	vpinsrd	$1,28(%r14),%xmm5,%xmm5
+	vpinsrd	$1,28(%r10),%xmm0,%xmm0
+	vpinsrd	$1,28(%r15),%xmm1,%xmm1
+	vpunpckldq	%ymm1,%ymm5,%ymm5
+	vpinsrd	$1,28(%r11),%xmm2,%xmm2
+	vpunpckldq	%ymm2,%ymm0,%ymm0
+	vinserti128	$1,%xmm0,%ymm5,%ymm5
+	vpshufb	%ymm6,%ymm5,%ymm5
+	vpsrld	$6,%ymm13,%ymm7
+	vpslld	$26,%ymm13,%ymm2
+	vmovdqu	%ymm5,224-128(%rax)
+	vpaddd	%ymm8,%ymm5,%ymm5
+
+	vpsrld	$11,%ymm13,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm13,%ymm2
+	vpaddd	96(%rbp),%ymm5,%ymm5
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm13,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm13,%ymm2
+	vpandn	%ymm15,%ymm13,%ymm0
+	vpand	%ymm14,%ymm13,%ymm4
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm9,%ymm8
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm9,%ymm1
+	vpxor	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm9,%ymm10,%ymm4
+
+	vpxor	%ymm1,%ymm8,%ymm8
+	vpaddd	%ymm7,%ymm5,%ymm5
+
+	vpsrld	$13,%ymm9,%ymm1
+
+	vpslld	$19,%ymm9,%ymm2
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpand	%ymm4,%ymm3,%ymm3
+
+	vpxor	%ymm1,%ymm8,%ymm7
+
+	vpsrld	$22,%ymm9,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm9,%ymm2
+	vpxor	%ymm3,%ymm10,%ymm8
+	vpaddd	%ymm5,%ymm12,%ymm12
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm5,%ymm8,%ymm8
+	vpaddd	%ymm7,%ymm8,%ymm8
+	addq	$256,%rbp
+	vmovd	32(%r12),%xmm5
+	vmovd	32(%r8),%xmm0
+	vmovd	32(%r13),%xmm1
+	vmovd	32(%r9),%xmm2
+	vpinsrd	$1,32(%r14),%xmm5,%xmm5
+	vpinsrd	$1,32(%r10),%xmm0,%xmm0
+	vpinsrd	$1,32(%r15),%xmm1,%xmm1
+	vpunpckldq	%ymm1,%ymm5,%ymm5
+	vpinsrd	$1,32(%r11),%xmm2,%xmm2
+	vpunpckldq	%ymm2,%ymm0,%ymm0
+	vinserti128	$1,%xmm0,%ymm5,%ymm5
+	vpshufb	%ymm6,%ymm5,%ymm5
+	vpsrld	$6,%ymm12,%ymm7
+	vpslld	$26,%ymm12,%ymm2
+	vmovdqu	%ymm5,256-256-128(%rbx)
+	vpaddd	%ymm15,%ymm5,%ymm5
+
+	vpsrld	$11,%ymm12,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm12,%ymm2
+	vpaddd	-128(%rbp),%ymm5,%ymm5
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm12,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm12,%ymm2
+	vpandn	%ymm14,%ymm12,%ymm0
+	vpand	%ymm13,%ymm12,%ymm3
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm8,%ymm15
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm8,%ymm1
+	vpxor	%ymm3,%ymm0,%ymm0
+	vpxor	%ymm8,%ymm9,%ymm3
+
+	vpxor	%ymm1,%ymm15,%ymm15
+	vpaddd	%ymm7,%ymm5,%ymm5
+
+	vpsrld	$13,%ymm8,%ymm1
+
+	vpslld	$19,%ymm8,%ymm2
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpand	%ymm3,%ymm4,%ymm4
+
+	vpxor	%ymm1,%ymm15,%ymm7
+
+	vpsrld	$22,%ymm8,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm8,%ymm2
+	vpxor	%ymm4,%ymm9,%ymm15
+	vpaddd	%ymm5,%ymm11,%ymm11
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm5,%ymm15,%ymm15
+	vpaddd	%ymm7,%ymm15,%ymm15
+	vmovd	36(%r12),%xmm5
+	vmovd	36(%r8),%xmm0
+	vmovd	36(%r13),%xmm1
+	vmovd	36(%r9),%xmm2
+	vpinsrd	$1,36(%r14),%xmm5,%xmm5
+	vpinsrd	$1,36(%r10),%xmm0,%xmm0
+	vpinsrd	$1,36(%r15),%xmm1,%xmm1
+	vpunpckldq	%ymm1,%ymm5,%ymm5
+	vpinsrd	$1,36(%r11),%xmm2,%xmm2
+	vpunpckldq	%ymm2,%ymm0,%ymm0
+	vinserti128	$1,%xmm0,%ymm5,%ymm5
+	vpshufb	%ymm6,%ymm5,%ymm5
+	vpsrld	$6,%ymm11,%ymm7
+	vpslld	$26,%ymm11,%ymm2
+	vmovdqu	%ymm5,288-256-128(%rbx)
+	vpaddd	%ymm14,%ymm5,%ymm5
+
+	vpsrld	$11,%ymm11,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm11,%ymm2
+	vpaddd	-96(%rbp),%ymm5,%ymm5
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm11,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm11,%ymm2
+	vpandn	%ymm13,%ymm11,%ymm0
+	vpand	%ymm12,%ymm11,%ymm4
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm15,%ymm14
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm15,%ymm1
+	vpxor	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm15,%ymm8,%ymm4
+
+	vpxor	%ymm1,%ymm14,%ymm14
+	vpaddd	%ymm7,%ymm5,%ymm5
+
+	vpsrld	$13,%ymm15,%ymm1
+
+	vpslld	$19,%ymm15,%ymm2
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpand	%ymm4,%ymm3,%ymm3
+
+	vpxor	%ymm1,%ymm14,%ymm7
+
+	vpsrld	$22,%ymm15,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm15,%ymm2
+	vpxor	%ymm3,%ymm8,%ymm14
+	vpaddd	%ymm5,%ymm10,%ymm10
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm5,%ymm14,%ymm14
+	vpaddd	%ymm7,%ymm14,%ymm14
+	vmovd	40(%r12),%xmm5
+	vmovd	40(%r8),%xmm0
+	vmovd	40(%r13),%xmm1
+	vmovd	40(%r9),%xmm2
+	vpinsrd	$1,40(%r14),%xmm5,%xmm5
+	vpinsrd	$1,40(%r10),%xmm0,%xmm0
+	vpinsrd	$1,40(%r15),%xmm1,%xmm1
+	vpunpckldq	%ymm1,%ymm5,%ymm5
+	vpinsrd	$1,40(%r11),%xmm2,%xmm2
+	vpunpckldq	%ymm2,%ymm0,%ymm0
+	vinserti128	$1,%xmm0,%ymm5,%ymm5
+	vpshufb	%ymm6,%ymm5,%ymm5
+	vpsrld	$6,%ymm10,%ymm7
+	vpslld	$26,%ymm10,%ymm2
+	vmovdqu	%ymm5,320-256-128(%rbx)
+	vpaddd	%ymm13,%ymm5,%ymm5
+
+	vpsrld	$11,%ymm10,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm10,%ymm2
+	vpaddd	-64(%rbp),%ymm5,%ymm5
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm10,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm10,%ymm2
+	vpandn	%ymm12,%ymm10,%ymm0
+	vpand	%ymm11,%ymm10,%ymm3
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm14,%ymm13
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm14,%ymm1
+	vpxor	%ymm3,%ymm0,%ymm0
+	vpxor	%ymm14,%ymm15,%ymm3
+
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpaddd	%ymm7,%ymm5,%ymm5
+
+	vpsrld	$13,%ymm14,%ymm1
+
+	vpslld	$19,%ymm14,%ymm2
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpand	%ymm3,%ymm4,%ymm4
+
+	vpxor	%ymm1,%ymm13,%ymm7
+
+	vpsrld	$22,%ymm14,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm14,%ymm2
+	vpxor	%ymm4,%ymm15,%ymm13
+	vpaddd	%ymm5,%ymm9,%ymm9
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm5,%ymm13,%ymm13
+	vpaddd	%ymm7,%ymm13,%ymm13
+	vmovd	44(%r12),%xmm5
+	vmovd	44(%r8),%xmm0
+	vmovd	44(%r13),%xmm1
+	vmovd	44(%r9),%xmm2
+	vpinsrd	$1,44(%r14),%xmm5,%xmm5
+	vpinsrd	$1,44(%r10),%xmm0,%xmm0
+	vpinsrd	$1,44(%r15),%xmm1,%xmm1
+	vpunpckldq	%ymm1,%ymm5,%ymm5
+	vpinsrd	$1,44(%r11),%xmm2,%xmm2
+	vpunpckldq	%ymm2,%ymm0,%ymm0
+	vinserti128	$1,%xmm0,%ymm5,%ymm5
+	vpshufb	%ymm6,%ymm5,%ymm5
+	vpsrld	$6,%ymm9,%ymm7
+	vpslld	$26,%ymm9,%ymm2
+	vmovdqu	%ymm5,352-256-128(%rbx)
+	vpaddd	%ymm12,%ymm5,%ymm5
+
+	vpsrld	$11,%ymm9,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm9,%ymm2
+	vpaddd	-32(%rbp),%ymm5,%ymm5
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm9,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm9,%ymm2
+	vpandn	%ymm11,%ymm9,%ymm0
+	vpand	%ymm10,%ymm9,%ymm4
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm13,%ymm12
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm13,%ymm1
+	vpxor	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm13,%ymm14,%ymm4
+
+	vpxor	%ymm1,%ymm12,%ymm12
+	vpaddd	%ymm7,%ymm5,%ymm5
+
+	vpsrld	$13,%ymm13,%ymm1
+
+	vpslld	$19,%ymm13,%ymm2
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpand	%ymm4,%ymm3,%ymm3
+
+	vpxor	%ymm1,%ymm12,%ymm7
+
+	vpsrld	$22,%ymm13,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm13,%ymm2
+	vpxor	%ymm3,%ymm14,%ymm12
+	vpaddd	%ymm5,%ymm8,%ymm8
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm5,%ymm12,%ymm12
+	vpaddd	%ymm7,%ymm12,%ymm12
+	vmovd	48(%r12),%xmm5
+	vmovd	48(%r8),%xmm0
+	vmovd	48(%r13),%xmm1
+	vmovd	48(%r9),%xmm2
+	vpinsrd	$1,48(%r14),%xmm5,%xmm5
+	vpinsrd	$1,48(%r10),%xmm0,%xmm0
+	vpinsrd	$1,48(%r15),%xmm1,%xmm1
+	vpunpckldq	%ymm1,%ymm5,%ymm5
+	vpinsrd	$1,48(%r11),%xmm2,%xmm2
+	vpunpckldq	%ymm2,%ymm0,%ymm0
+	vinserti128	$1,%xmm0,%ymm5,%ymm5
+	vpshufb	%ymm6,%ymm5,%ymm5
+	vpsrld	$6,%ymm8,%ymm7
+	vpslld	$26,%ymm8,%ymm2
+	vmovdqu	%ymm5,384-256-128(%rbx)
+	vpaddd	%ymm11,%ymm5,%ymm5
+
+	vpsrld	$11,%ymm8,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm8,%ymm2
+	vpaddd	0(%rbp),%ymm5,%ymm5
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm8,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm8,%ymm2
+	vpandn	%ymm10,%ymm8,%ymm0
+	vpand	%ymm9,%ymm8,%ymm3
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm12,%ymm11
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm12,%ymm1
+	vpxor	%ymm3,%ymm0,%ymm0
+	vpxor	%ymm12,%ymm13,%ymm3
+
+	vpxor	%ymm1,%ymm11,%ymm11
+	vpaddd	%ymm7,%ymm5,%ymm5
+
+	vpsrld	$13,%ymm12,%ymm1
+
+	vpslld	$19,%ymm12,%ymm2
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpand	%ymm3,%ymm4,%ymm4
+
+	vpxor	%ymm1,%ymm11,%ymm7
+
+	vpsrld	$22,%ymm12,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm12,%ymm2
+	vpxor	%ymm4,%ymm13,%ymm11
+	vpaddd	%ymm5,%ymm15,%ymm15
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm5,%ymm11,%ymm11
+	vpaddd	%ymm7,%ymm11,%ymm11
+	vmovd	52(%r12),%xmm5
+	vmovd	52(%r8),%xmm0
+	vmovd	52(%r13),%xmm1
+	vmovd	52(%r9),%xmm2
+	vpinsrd	$1,52(%r14),%xmm5,%xmm5
+	vpinsrd	$1,52(%r10),%xmm0,%xmm0
+	vpinsrd	$1,52(%r15),%xmm1,%xmm1
+	vpunpckldq	%ymm1,%ymm5,%ymm5
+	vpinsrd	$1,52(%r11),%xmm2,%xmm2
+	vpunpckldq	%ymm2,%ymm0,%ymm0
+	vinserti128	$1,%xmm0,%ymm5,%ymm5
+	vpshufb	%ymm6,%ymm5,%ymm5
+	vpsrld	$6,%ymm15,%ymm7
+	vpslld	$26,%ymm15,%ymm2
+	vmovdqu	%ymm5,416-256-128(%rbx)
+	vpaddd	%ymm10,%ymm5,%ymm5
+
+	vpsrld	$11,%ymm15,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm15,%ymm2
+	vpaddd	32(%rbp),%ymm5,%ymm5
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm15,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm15,%ymm2
+	vpandn	%ymm9,%ymm15,%ymm0
+	vpand	%ymm8,%ymm15,%ymm4
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm11,%ymm10
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm11,%ymm1
+	vpxor	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm11,%ymm12,%ymm4
+
+	vpxor	%ymm1,%ymm10,%ymm10
+	vpaddd	%ymm7,%ymm5,%ymm5
+
+	vpsrld	$13,%ymm11,%ymm1
+
+	vpslld	$19,%ymm11,%ymm2
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpand	%ymm4,%ymm3,%ymm3
+
+	vpxor	%ymm1,%ymm10,%ymm7
+
+	vpsrld	$22,%ymm11,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm11,%ymm2
+	vpxor	%ymm3,%ymm12,%ymm10
+	vpaddd	%ymm5,%ymm14,%ymm14
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm5,%ymm10,%ymm10
+	vpaddd	%ymm7,%ymm10,%ymm10
+	vmovd	56(%r12),%xmm5
+	vmovd	56(%r8),%xmm0
+	vmovd	56(%r13),%xmm1
+	vmovd	56(%r9),%xmm2
+	vpinsrd	$1,56(%r14),%xmm5,%xmm5
+	vpinsrd	$1,56(%r10),%xmm0,%xmm0
+	vpinsrd	$1,56(%r15),%xmm1,%xmm1
+	vpunpckldq	%ymm1,%ymm5,%ymm5
+	vpinsrd	$1,56(%r11),%xmm2,%xmm2
+	vpunpckldq	%ymm2,%ymm0,%ymm0
+	vinserti128	$1,%xmm0,%ymm5,%ymm5
+	vpshufb	%ymm6,%ymm5,%ymm5
+	vpsrld	$6,%ymm14,%ymm7
+	vpslld	$26,%ymm14,%ymm2
+	vmovdqu	%ymm5,448-256-128(%rbx)
+	vpaddd	%ymm9,%ymm5,%ymm5
+
+	vpsrld	$11,%ymm14,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm14,%ymm2
+	vpaddd	64(%rbp),%ymm5,%ymm5
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm14,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm14,%ymm2
+	vpandn	%ymm8,%ymm14,%ymm0
+	vpand	%ymm15,%ymm14,%ymm3
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm10,%ymm9
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm10,%ymm1
+	vpxor	%ymm3,%ymm0,%ymm0
+	vpxor	%ymm10,%ymm11,%ymm3
+
+	vpxor	%ymm1,%ymm9,%ymm9
+	vpaddd	%ymm7,%ymm5,%ymm5
+
+	vpsrld	$13,%ymm10,%ymm1
+
+	vpslld	$19,%ymm10,%ymm2
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpand	%ymm3,%ymm4,%ymm4
+
+	vpxor	%ymm1,%ymm9,%ymm7
+
+	vpsrld	$22,%ymm10,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm10,%ymm2
+	vpxor	%ymm4,%ymm11,%ymm9
+	vpaddd	%ymm5,%ymm13,%ymm13
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm5,%ymm9,%ymm9
+	vpaddd	%ymm7,%ymm9,%ymm9
+	vmovd	60(%r12),%xmm5
+	leaq	64(%r12),%r12
+	vmovd	60(%r8),%xmm0
+	leaq	64(%r8),%r8
+	vmovd	60(%r13),%xmm1
+	leaq	64(%r13),%r13
+	vmovd	60(%r9),%xmm2
+	leaq	64(%r9),%r9
+	vpinsrd	$1,60(%r14),%xmm5,%xmm5
+	leaq	64(%r14),%r14
+	vpinsrd	$1,60(%r10),%xmm0,%xmm0
+	leaq	64(%r10),%r10
+	vpinsrd	$1,60(%r15),%xmm1,%xmm1
+	leaq	64(%r15),%r15
+	vpunpckldq	%ymm1,%ymm5,%ymm5
+	vpinsrd	$1,60(%r11),%xmm2,%xmm2
+	leaq	64(%r11),%r11
+	vpunpckldq	%ymm2,%ymm0,%ymm0
+	vinserti128	$1,%xmm0,%ymm5,%ymm5
+	vpshufb	%ymm6,%ymm5,%ymm5
+	vpsrld	$6,%ymm13,%ymm7
+	vpslld	$26,%ymm13,%ymm2
+	vmovdqu	%ymm5,480-256-128(%rbx)
+	vpaddd	%ymm8,%ymm5,%ymm5
+
+	vpsrld	$11,%ymm13,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm13,%ymm2
+	vpaddd	96(%rbp),%ymm5,%ymm5
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm13,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	prefetcht0	63(%r12)
+	vpslld	$7,%ymm13,%ymm2
+	vpandn	%ymm15,%ymm13,%ymm0
+	vpand	%ymm14,%ymm13,%ymm4
+	prefetcht0	63(%r13)
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm9,%ymm8
+	vpxor	%ymm2,%ymm7,%ymm7
+	prefetcht0	63(%r14)
+	vpslld	$30,%ymm9,%ymm1
+	vpxor	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm9,%ymm10,%ymm4
+	prefetcht0	63(%r15)
+	vpxor	%ymm1,%ymm8,%ymm8
+	vpaddd	%ymm7,%ymm5,%ymm5
+
+	vpsrld	$13,%ymm9,%ymm1
+	prefetcht0	63(%r8)
+	vpslld	$19,%ymm9,%ymm2
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpand	%ymm4,%ymm3,%ymm3
+	prefetcht0	63(%r9)
+	vpxor	%ymm1,%ymm8,%ymm7
+
+	vpsrld	$22,%ymm9,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	prefetcht0	63(%r10)
+	vpslld	$10,%ymm9,%ymm2
+	vpxor	%ymm3,%ymm10,%ymm8
+	vpaddd	%ymm5,%ymm12,%ymm12
+	prefetcht0	63(%r11)
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm5,%ymm8,%ymm8
+	vpaddd	%ymm7,%ymm8,%ymm8
+	addq	$256,%rbp
+	vmovdqu	0-128(%rax),%ymm5
+	movl	$3,%ecx
+	jmp	.Loop_16_xx_avx2
+.align	32
+.Loop_16_xx_avx2:
+	vmovdqu	32-128(%rax),%ymm6
+	vpaddd	288-256-128(%rbx),%ymm5,%ymm5
+
+	vpsrld	$3,%ymm6,%ymm7
+	vpsrld	$7,%ymm6,%ymm1
+	vpslld	$25,%ymm6,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$18,%ymm6,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$14,%ymm6,%ymm2
+	vmovdqu	448-256-128(%rbx),%ymm0
+	vpsrld	$10,%ymm0,%ymm3
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$17,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$15,%ymm0,%ymm2
+	vpaddd	%ymm7,%ymm5,%ymm5
+	vpxor	%ymm1,%ymm3,%ymm7
+	vpsrld	$19,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$13,%ymm0,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpaddd	%ymm7,%ymm5,%ymm5
+	vpsrld	$6,%ymm12,%ymm7
+	vpslld	$26,%ymm12,%ymm2
+	vmovdqu	%ymm5,0-128(%rax)
+	vpaddd	%ymm15,%ymm5,%ymm5
+
+	vpsrld	$11,%ymm12,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm12,%ymm2
+	vpaddd	-128(%rbp),%ymm5,%ymm5
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm12,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm12,%ymm2
+	vpandn	%ymm14,%ymm12,%ymm0
+	vpand	%ymm13,%ymm12,%ymm3
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm8,%ymm15
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm8,%ymm1
+	vpxor	%ymm3,%ymm0,%ymm0
+	vpxor	%ymm8,%ymm9,%ymm3
+
+	vpxor	%ymm1,%ymm15,%ymm15
+	vpaddd	%ymm7,%ymm5,%ymm5
+
+	vpsrld	$13,%ymm8,%ymm1
+
+	vpslld	$19,%ymm8,%ymm2
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpand	%ymm3,%ymm4,%ymm4
+
+	vpxor	%ymm1,%ymm15,%ymm7
+
+	vpsrld	$22,%ymm8,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm8,%ymm2
+	vpxor	%ymm4,%ymm9,%ymm15
+	vpaddd	%ymm5,%ymm11,%ymm11
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm5,%ymm15,%ymm15
+	vpaddd	%ymm7,%ymm15,%ymm15
+	vmovdqu	64-128(%rax),%ymm5
+	vpaddd	320-256-128(%rbx),%ymm6,%ymm6
+
+	vpsrld	$3,%ymm5,%ymm7
+	vpsrld	$7,%ymm5,%ymm1
+	vpslld	$25,%ymm5,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$18,%ymm5,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$14,%ymm5,%ymm2
+	vmovdqu	480-256-128(%rbx),%ymm0
+	vpsrld	$10,%ymm0,%ymm4
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$17,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$15,%ymm0,%ymm2
+	vpaddd	%ymm7,%ymm6,%ymm6
+	vpxor	%ymm1,%ymm4,%ymm7
+	vpsrld	$19,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$13,%ymm0,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpaddd	%ymm7,%ymm6,%ymm6
+	vpsrld	$6,%ymm11,%ymm7
+	vpslld	$26,%ymm11,%ymm2
+	vmovdqu	%ymm6,32-128(%rax)
+	vpaddd	%ymm14,%ymm6,%ymm6
+
+	vpsrld	$11,%ymm11,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm11,%ymm2
+	vpaddd	-96(%rbp),%ymm6,%ymm6
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm11,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm11,%ymm2
+	vpandn	%ymm13,%ymm11,%ymm0
+	vpand	%ymm12,%ymm11,%ymm4
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm15,%ymm14
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm15,%ymm1
+	vpxor	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm15,%ymm8,%ymm4
+
+	vpxor	%ymm1,%ymm14,%ymm14
+	vpaddd	%ymm7,%ymm6,%ymm6
+
+	vpsrld	$13,%ymm15,%ymm1
+
+	vpslld	$19,%ymm15,%ymm2
+	vpaddd	%ymm0,%ymm6,%ymm6
+	vpand	%ymm4,%ymm3,%ymm3
+
+	vpxor	%ymm1,%ymm14,%ymm7
+
+	vpsrld	$22,%ymm15,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm15,%ymm2
+	vpxor	%ymm3,%ymm8,%ymm14
+	vpaddd	%ymm6,%ymm10,%ymm10
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm6,%ymm14,%ymm14
+	vpaddd	%ymm7,%ymm14,%ymm14
+	vmovdqu	96-128(%rax),%ymm6
+	vpaddd	352-256-128(%rbx),%ymm5,%ymm5
+
+	vpsrld	$3,%ymm6,%ymm7
+	vpsrld	$7,%ymm6,%ymm1
+	vpslld	$25,%ymm6,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$18,%ymm6,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$14,%ymm6,%ymm2
+	vmovdqu	0-128(%rax),%ymm0
+	vpsrld	$10,%ymm0,%ymm3
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$17,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$15,%ymm0,%ymm2
+	vpaddd	%ymm7,%ymm5,%ymm5
+	vpxor	%ymm1,%ymm3,%ymm7
+	vpsrld	$19,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$13,%ymm0,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpaddd	%ymm7,%ymm5,%ymm5
+	vpsrld	$6,%ymm10,%ymm7
+	vpslld	$26,%ymm10,%ymm2
+	vmovdqu	%ymm5,64-128(%rax)
+	vpaddd	%ymm13,%ymm5,%ymm5
+
+	vpsrld	$11,%ymm10,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm10,%ymm2
+	vpaddd	-64(%rbp),%ymm5,%ymm5
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm10,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm10,%ymm2
+	vpandn	%ymm12,%ymm10,%ymm0
+	vpand	%ymm11,%ymm10,%ymm3
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm14,%ymm13
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm14,%ymm1
+	vpxor	%ymm3,%ymm0,%ymm0
+	vpxor	%ymm14,%ymm15,%ymm3
+
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpaddd	%ymm7,%ymm5,%ymm5
+
+	vpsrld	$13,%ymm14,%ymm1
+
+	vpslld	$19,%ymm14,%ymm2
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpand	%ymm3,%ymm4,%ymm4
+
+	vpxor	%ymm1,%ymm13,%ymm7
+
+	vpsrld	$22,%ymm14,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm14,%ymm2
+	vpxor	%ymm4,%ymm15,%ymm13
+	vpaddd	%ymm5,%ymm9,%ymm9
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm5,%ymm13,%ymm13
+	vpaddd	%ymm7,%ymm13,%ymm13
+	vmovdqu	128-128(%rax),%ymm5
+	vpaddd	384-256-128(%rbx),%ymm6,%ymm6
+
+	vpsrld	$3,%ymm5,%ymm7
+	vpsrld	$7,%ymm5,%ymm1
+	vpslld	$25,%ymm5,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$18,%ymm5,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$14,%ymm5,%ymm2
+	vmovdqu	32-128(%rax),%ymm0
+	vpsrld	$10,%ymm0,%ymm4
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$17,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$15,%ymm0,%ymm2
+	vpaddd	%ymm7,%ymm6,%ymm6
+	vpxor	%ymm1,%ymm4,%ymm7
+	vpsrld	$19,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$13,%ymm0,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpaddd	%ymm7,%ymm6,%ymm6
+	vpsrld	$6,%ymm9,%ymm7
+	vpslld	$26,%ymm9,%ymm2
+	vmovdqu	%ymm6,96-128(%rax)
+	vpaddd	%ymm12,%ymm6,%ymm6
+
+	vpsrld	$11,%ymm9,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm9,%ymm2
+	vpaddd	-32(%rbp),%ymm6,%ymm6
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm9,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm9,%ymm2
+	vpandn	%ymm11,%ymm9,%ymm0
+	vpand	%ymm10,%ymm9,%ymm4
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm13,%ymm12
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm13,%ymm1
+	vpxor	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm13,%ymm14,%ymm4
+
+	vpxor	%ymm1,%ymm12,%ymm12
+	vpaddd	%ymm7,%ymm6,%ymm6
+
+	vpsrld	$13,%ymm13,%ymm1
+
+	vpslld	$19,%ymm13,%ymm2
+	vpaddd	%ymm0,%ymm6,%ymm6
+	vpand	%ymm4,%ymm3,%ymm3
+
+	vpxor	%ymm1,%ymm12,%ymm7
+
+	vpsrld	$22,%ymm13,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm13,%ymm2
+	vpxor	%ymm3,%ymm14,%ymm12
+	vpaddd	%ymm6,%ymm8,%ymm8
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm6,%ymm12,%ymm12
+	vpaddd	%ymm7,%ymm12,%ymm12
+	vmovdqu	160-128(%rax),%ymm6
+	vpaddd	416-256-128(%rbx),%ymm5,%ymm5
+
+	vpsrld	$3,%ymm6,%ymm7
+	vpsrld	$7,%ymm6,%ymm1
+	vpslld	$25,%ymm6,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$18,%ymm6,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$14,%ymm6,%ymm2
+	vmovdqu	64-128(%rax),%ymm0
+	vpsrld	$10,%ymm0,%ymm3
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$17,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$15,%ymm0,%ymm2
+	vpaddd	%ymm7,%ymm5,%ymm5
+	vpxor	%ymm1,%ymm3,%ymm7
+	vpsrld	$19,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$13,%ymm0,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpaddd	%ymm7,%ymm5,%ymm5
+	vpsrld	$6,%ymm8,%ymm7
+	vpslld	$26,%ymm8,%ymm2
+	vmovdqu	%ymm5,128-128(%rax)
+	vpaddd	%ymm11,%ymm5,%ymm5
+
+	vpsrld	$11,%ymm8,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm8,%ymm2
+	vpaddd	0(%rbp),%ymm5,%ymm5
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm8,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm8,%ymm2
+	vpandn	%ymm10,%ymm8,%ymm0
+	vpand	%ymm9,%ymm8,%ymm3
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm12,%ymm11
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm12,%ymm1
+	vpxor	%ymm3,%ymm0,%ymm0
+	vpxor	%ymm12,%ymm13,%ymm3
+
+	vpxor	%ymm1,%ymm11,%ymm11
+	vpaddd	%ymm7,%ymm5,%ymm5
+
+	vpsrld	$13,%ymm12,%ymm1
+
+	vpslld	$19,%ymm12,%ymm2
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpand	%ymm3,%ymm4,%ymm4
+
+	vpxor	%ymm1,%ymm11,%ymm7
+
+	vpsrld	$22,%ymm12,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm12,%ymm2
+	vpxor	%ymm4,%ymm13,%ymm11
+	vpaddd	%ymm5,%ymm15,%ymm15
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm5,%ymm11,%ymm11
+	vpaddd	%ymm7,%ymm11,%ymm11
+	vmovdqu	192-128(%rax),%ymm5
+	vpaddd	448-256-128(%rbx),%ymm6,%ymm6
+
+	vpsrld	$3,%ymm5,%ymm7
+	vpsrld	$7,%ymm5,%ymm1
+	vpslld	$25,%ymm5,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$18,%ymm5,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$14,%ymm5,%ymm2
+	vmovdqu	96-128(%rax),%ymm0
+	vpsrld	$10,%ymm0,%ymm4
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$17,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$15,%ymm0,%ymm2
+	vpaddd	%ymm7,%ymm6,%ymm6
+	vpxor	%ymm1,%ymm4,%ymm7
+	vpsrld	$19,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$13,%ymm0,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpaddd	%ymm7,%ymm6,%ymm6
+	vpsrld	$6,%ymm15,%ymm7
+	vpslld	$26,%ymm15,%ymm2
+	vmovdqu	%ymm6,160-128(%rax)
+	vpaddd	%ymm10,%ymm6,%ymm6
+
+	vpsrld	$11,%ymm15,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm15,%ymm2
+	vpaddd	32(%rbp),%ymm6,%ymm6
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm15,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm15,%ymm2
+	vpandn	%ymm9,%ymm15,%ymm0
+	vpand	%ymm8,%ymm15,%ymm4
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm11,%ymm10
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm11,%ymm1
+	vpxor	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm11,%ymm12,%ymm4
+
+	vpxor	%ymm1,%ymm10,%ymm10
+	vpaddd	%ymm7,%ymm6,%ymm6
+
+	vpsrld	$13,%ymm11,%ymm1
+
+	vpslld	$19,%ymm11,%ymm2
+	vpaddd	%ymm0,%ymm6,%ymm6
+	vpand	%ymm4,%ymm3,%ymm3
+
+	vpxor	%ymm1,%ymm10,%ymm7
+
+	vpsrld	$22,%ymm11,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm11,%ymm2
+	vpxor	%ymm3,%ymm12,%ymm10
+	vpaddd	%ymm6,%ymm14,%ymm14
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm6,%ymm10,%ymm10
+	vpaddd	%ymm7,%ymm10,%ymm10
+	vmovdqu	224-128(%rax),%ymm6
+	vpaddd	480-256-128(%rbx),%ymm5,%ymm5
+
+	vpsrld	$3,%ymm6,%ymm7
+	vpsrld	$7,%ymm6,%ymm1
+	vpslld	$25,%ymm6,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$18,%ymm6,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$14,%ymm6,%ymm2
+	vmovdqu	128-128(%rax),%ymm0
+	vpsrld	$10,%ymm0,%ymm3
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$17,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$15,%ymm0,%ymm2
+	vpaddd	%ymm7,%ymm5,%ymm5
+	vpxor	%ymm1,%ymm3,%ymm7
+	vpsrld	$19,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$13,%ymm0,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpaddd	%ymm7,%ymm5,%ymm5
+	vpsrld	$6,%ymm14,%ymm7
+	vpslld	$26,%ymm14,%ymm2
+	vmovdqu	%ymm5,192-128(%rax)
+	vpaddd	%ymm9,%ymm5,%ymm5
+
+	vpsrld	$11,%ymm14,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm14,%ymm2
+	vpaddd	64(%rbp),%ymm5,%ymm5
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm14,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm14,%ymm2
+	vpandn	%ymm8,%ymm14,%ymm0
+	vpand	%ymm15,%ymm14,%ymm3
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm10,%ymm9
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm10,%ymm1
+	vpxor	%ymm3,%ymm0,%ymm0
+	vpxor	%ymm10,%ymm11,%ymm3
+
+	vpxor	%ymm1,%ymm9,%ymm9
+	vpaddd	%ymm7,%ymm5,%ymm5
+
+	vpsrld	$13,%ymm10,%ymm1
+
+	vpslld	$19,%ymm10,%ymm2
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpand	%ymm3,%ymm4,%ymm4
+
+	vpxor	%ymm1,%ymm9,%ymm7
+
+	vpsrld	$22,%ymm10,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm10,%ymm2
+	vpxor	%ymm4,%ymm11,%ymm9
+	vpaddd	%ymm5,%ymm13,%ymm13
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm5,%ymm9,%ymm9
+	vpaddd	%ymm7,%ymm9,%ymm9
+	vmovdqu	256-256-128(%rbx),%ymm5
+	vpaddd	0-128(%rax),%ymm6,%ymm6
+
+	vpsrld	$3,%ymm5,%ymm7
+	vpsrld	$7,%ymm5,%ymm1
+	vpslld	$25,%ymm5,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$18,%ymm5,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$14,%ymm5,%ymm2
+	vmovdqu	160-128(%rax),%ymm0
+	vpsrld	$10,%ymm0,%ymm4
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$17,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$15,%ymm0,%ymm2
+	vpaddd	%ymm7,%ymm6,%ymm6
+	vpxor	%ymm1,%ymm4,%ymm7
+	vpsrld	$19,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$13,%ymm0,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpaddd	%ymm7,%ymm6,%ymm6
+	vpsrld	$6,%ymm13,%ymm7
+	vpslld	$26,%ymm13,%ymm2
+	vmovdqu	%ymm6,224-128(%rax)
+	vpaddd	%ymm8,%ymm6,%ymm6
+
+	vpsrld	$11,%ymm13,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm13,%ymm2
+	vpaddd	96(%rbp),%ymm6,%ymm6
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm13,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm13,%ymm2
+	vpandn	%ymm15,%ymm13,%ymm0
+	vpand	%ymm14,%ymm13,%ymm4
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm9,%ymm8
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm9,%ymm1
+	vpxor	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm9,%ymm10,%ymm4
+
+	vpxor	%ymm1,%ymm8,%ymm8
+	vpaddd	%ymm7,%ymm6,%ymm6
+
+	vpsrld	$13,%ymm9,%ymm1
+
+	vpslld	$19,%ymm9,%ymm2
+	vpaddd	%ymm0,%ymm6,%ymm6
+	vpand	%ymm4,%ymm3,%ymm3
+
+	vpxor	%ymm1,%ymm8,%ymm7
+
+	vpsrld	$22,%ymm9,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm9,%ymm2
+	vpxor	%ymm3,%ymm10,%ymm8
+	vpaddd	%ymm6,%ymm12,%ymm12
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm6,%ymm8,%ymm8
+	vpaddd	%ymm7,%ymm8,%ymm8
+	addq	$256,%rbp
+	vmovdqu	288-256-128(%rbx),%ymm6
+	vpaddd	32-128(%rax),%ymm5,%ymm5
+
+	vpsrld	$3,%ymm6,%ymm7
+	vpsrld	$7,%ymm6,%ymm1
+	vpslld	$25,%ymm6,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$18,%ymm6,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$14,%ymm6,%ymm2
+	vmovdqu	192-128(%rax),%ymm0
+	vpsrld	$10,%ymm0,%ymm3
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$17,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$15,%ymm0,%ymm2
+	vpaddd	%ymm7,%ymm5,%ymm5
+	vpxor	%ymm1,%ymm3,%ymm7
+	vpsrld	$19,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$13,%ymm0,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpaddd	%ymm7,%ymm5,%ymm5
+	vpsrld	$6,%ymm12,%ymm7
+	vpslld	$26,%ymm12,%ymm2
+	vmovdqu	%ymm5,256-256-128(%rbx)
+	vpaddd	%ymm15,%ymm5,%ymm5
+
+	vpsrld	$11,%ymm12,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm12,%ymm2
+	vpaddd	-128(%rbp),%ymm5,%ymm5
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm12,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm12,%ymm2
+	vpandn	%ymm14,%ymm12,%ymm0
+	vpand	%ymm13,%ymm12,%ymm3
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm8,%ymm15
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm8,%ymm1
+	vpxor	%ymm3,%ymm0,%ymm0
+	vpxor	%ymm8,%ymm9,%ymm3
+
+	vpxor	%ymm1,%ymm15,%ymm15
+	vpaddd	%ymm7,%ymm5,%ymm5
+
+	vpsrld	$13,%ymm8,%ymm1
+
+	vpslld	$19,%ymm8,%ymm2
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpand	%ymm3,%ymm4,%ymm4
+
+	vpxor	%ymm1,%ymm15,%ymm7
+
+	vpsrld	$22,%ymm8,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm8,%ymm2
+	vpxor	%ymm4,%ymm9,%ymm15
+	vpaddd	%ymm5,%ymm11,%ymm11
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm5,%ymm15,%ymm15
+	vpaddd	%ymm7,%ymm15,%ymm15
+	vmovdqu	320-256-128(%rbx),%ymm5
+	vpaddd	64-128(%rax),%ymm6,%ymm6
+
+	vpsrld	$3,%ymm5,%ymm7
+	vpsrld	$7,%ymm5,%ymm1
+	vpslld	$25,%ymm5,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$18,%ymm5,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$14,%ymm5,%ymm2
+	vmovdqu	224-128(%rax),%ymm0
+	vpsrld	$10,%ymm0,%ymm4
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$17,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$15,%ymm0,%ymm2
+	vpaddd	%ymm7,%ymm6,%ymm6
+	vpxor	%ymm1,%ymm4,%ymm7
+	vpsrld	$19,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$13,%ymm0,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpaddd	%ymm7,%ymm6,%ymm6
+	vpsrld	$6,%ymm11,%ymm7
+	vpslld	$26,%ymm11,%ymm2
+	vmovdqu	%ymm6,288-256-128(%rbx)
+	vpaddd	%ymm14,%ymm6,%ymm6
+
+	vpsrld	$11,%ymm11,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm11,%ymm2
+	vpaddd	-96(%rbp),%ymm6,%ymm6
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm11,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm11,%ymm2
+	vpandn	%ymm13,%ymm11,%ymm0
+	vpand	%ymm12,%ymm11,%ymm4
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm15,%ymm14
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm15,%ymm1
+	vpxor	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm15,%ymm8,%ymm4
+
+	vpxor	%ymm1,%ymm14,%ymm14
+	vpaddd	%ymm7,%ymm6,%ymm6
+
+	vpsrld	$13,%ymm15,%ymm1
+
+	vpslld	$19,%ymm15,%ymm2
+	vpaddd	%ymm0,%ymm6,%ymm6
+	vpand	%ymm4,%ymm3,%ymm3
+
+	vpxor	%ymm1,%ymm14,%ymm7
+
+	vpsrld	$22,%ymm15,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm15,%ymm2
+	vpxor	%ymm3,%ymm8,%ymm14
+	vpaddd	%ymm6,%ymm10,%ymm10
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm6,%ymm14,%ymm14
+	vpaddd	%ymm7,%ymm14,%ymm14
+	vmovdqu	352-256-128(%rbx),%ymm6
+	vpaddd	96-128(%rax),%ymm5,%ymm5
+
+	vpsrld	$3,%ymm6,%ymm7
+	vpsrld	$7,%ymm6,%ymm1
+	vpslld	$25,%ymm6,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$18,%ymm6,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$14,%ymm6,%ymm2
+	vmovdqu	256-256-128(%rbx),%ymm0
+	vpsrld	$10,%ymm0,%ymm3
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$17,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$15,%ymm0,%ymm2
+	vpaddd	%ymm7,%ymm5,%ymm5
+	vpxor	%ymm1,%ymm3,%ymm7
+	vpsrld	$19,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$13,%ymm0,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpaddd	%ymm7,%ymm5,%ymm5
+	vpsrld	$6,%ymm10,%ymm7
+	vpslld	$26,%ymm10,%ymm2
+	vmovdqu	%ymm5,320-256-128(%rbx)
+	vpaddd	%ymm13,%ymm5,%ymm5
+
+	vpsrld	$11,%ymm10,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm10,%ymm2
+	vpaddd	-64(%rbp),%ymm5,%ymm5
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm10,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm10,%ymm2
+	vpandn	%ymm12,%ymm10,%ymm0
+	vpand	%ymm11,%ymm10,%ymm3
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm14,%ymm13
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm14,%ymm1
+	vpxor	%ymm3,%ymm0,%ymm0
+	vpxor	%ymm14,%ymm15,%ymm3
+
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpaddd	%ymm7,%ymm5,%ymm5
+
+	vpsrld	$13,%ymm14,%ymm1
+
+	vpslld	$19,%ymm14,%ymm2
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpand	%ymm3,%ymm4,%ymm4
+
+	vpxor	%ymm1,%ymm13,%ymm7
+
+	vpsrld	$22,%ymm14,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm14,%ymm2
+	vpxor	%ymm4,%ymm15,%ymm13
+	vpaddd	%ymm5,%ymm9,%ymm9
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm5,%ymm13,%ymm13
+	vpaddd	%ymm7,%ymm13,%ymm13
+	vmovdqu	384-256-128(%rbx),%ymm5
+	vpaddd	128-128(%rax),%ymm6,%ymm6
+
+	vpsrld	$3,%ymm5,%ymm7
+	vpsrld	$7,%ymm5,%ymm1
+	vpslld	$25,%ymm5,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$18,%ymm5,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$14,%ymm5,%ymm2
+	vmovdqu	288-256-128(%rbx),%ymm0
+	vpsrld	$10,%ymm0,%ymm4
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$17,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$15,%ymm0,%ymm2
+	vpaddd	%ymm7,%ymm6,%ymm6
+	vpxor	%ymm1,%ymm4,%ymm7
+	vpsrld	$19,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$13,%ymm0,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpaddd	%ymm7,%ymm6,%ymm6
+	vpsrld	$6,%ymm9,%ymm7
+	vpslld	$26,%ymm9,%ymm2
+	vmovdqu	%ymm6,352-256-128(%rbx)
+	vpaddd	%ymm12,%ymm6,%ymm6
+
+	vpsrld	$11,%ymm9,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm9,%ymm2
+	vpaddd	-32(%rbp),%ymm6,%ymm6
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm9,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm9,%ymm2
+	vpandn	%ymm11,%ymm9,%ymm0
+	vpand	%ymm10,%ymm9,%ymm4
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm13,%ymm12
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm13,%ymm1
+	vpxor	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm13,%ymm14,%ymm4
+
+	vpxor	%ymm1,%ymm12,%ymm12
+	vpaddd	%ymm7,%ymm6,%ymm6
+
+	vpsrld	$13,%ymm13,%ymm1
+
+	vpslld	$19,%ymm13,%ymm2
+	vpaddd	%ymm0,%ymm6,%ymm6
+	vpand	%ymm4,%ymm3,%ymm3
+
+	vpxor	%ymm1,%ymm12,%ymm7
+
+	vpsrld	$22,%ymm13,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm13,%ymm2
+	vpxor	%ymm3,%ymm14,%ymm12
+	vpaddd	%ymm6,%ymm8,%ymm8
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm6,%ymm12,%ymm12
+	vpaddd	%ymm7,%ymm12,%ymm12
+	vmovdqu	416-256-128(%rbx),%ymm6
+	vpaddd	160-128(%rax),%ymm5,%ymm5
+
+	vpsrld	$3,%ymm6,%ymm7
+	vpsrld	$7,%ymm6,%ymm1
+	vpslld	$25,%ymm6,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$18,%ymm6,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$14,%ymm6,%ymm2
+	vmovdqu	320-256-128(%rbx),%ymm0
+	vpsrld	$10,%ymm0,%ymm3
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$17,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$15,%ymm0,%ymm2
+	vpaddd	%ymm7,%ymm5,%ymm5
+	vpxor	%ymm1,%ymm3,%ymm7
+	vpsrld	$19,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$13,%ymm0,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpaddd	%ymm7,%ymm5,%ymm5
+	vpsrld	$6,%ymm8,%ymm7
+	vpslld	$26,%ymm8,%ymm2
+	vmovdqu	%ymm5,384-256-128(%rbx)
+	vpaddd	%ymm11,%ymm5,%ymm5
+
+	vpsrld	$11,%ymm8,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm8,%ymm2
+	vpaddd	0(%rbp),%ymm5,%ymm5
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm8,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm8,%ymm2
+	vpandn	%ymm10,%ymm8,%ymm0
+	vpand	%ymm9,%ymm8,%ymm3
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm12,%ymm11
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm12,%ymm1
+	vpxor	%ymm3,%ymm0,%ymm0
+	vpxor	%ymm12,%ymm13,%ymm3
+
+	vpxor	%ymm1,%ymm11,%ymm11
+	vpaddd	%ymm7,%ymm5,%ymm5
+
+	vpsrld	$13,%ymm12,%ymm1
+
+	vpslld	$19,%ymm12,%ymm2
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpand	%ymm3,%ymm4,%ymm4
+
+	vpxor	%ymm1,%ymm11,%ymm7
+
+	vpsrld	$22,%ymm12,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm12,%ymm2
+	vpxor	%ymm4,%ymm13,%ymm11
+	vpaddd	%ymm5,%ymm15,%ymm15
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm5,%ymm11,%ymm11
+	vpaddd	%ymm7,%ymm11,%ymm11
+	vmovdqu	448-256-128(%rbx),%ymm5
+	vpaddd	192-128(%rax),%ymm6,%ymm6
+
+	vpsrld	$3,%ymm5,%ymm7
+	vpsrld	$7,%ymm5,%ymm1
+	vpslld	$25,%ymm5,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$18,%ymm5,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$14,%ymm5,%ymm2
+	vmovdqu	352-256-128(%rbx),%ymm0
+	vpsrld	$10,%ymm0,%ymm4
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$17,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$15,%ymm0,%ymm2
+	vpaddd	%ymm7,%ymm6,%ymm6
+	vpxor	%ymm1,%ymm4,%ymm7
+	vpsrld	$19,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$13,%ymm0,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpaddd	%ymm7,%ymm6,%ymm6
+	vpsrld	$6,%ymm15,%ymm7
+	vpslld	$26,%ymm15,%ymm2
+	vmovdqu	%ymm6,416-256-128(%rbx)
+	vpaddd	%ymm10,%ymm6,%ymm6
+
+	vpsrld	$11,%ymm15,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm15,%ymm2
+	vpaddd	32(%rbp),%ymm6,%ymm6
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm15,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm15,%ymm2
+	vpandn	%ymm9,%ymm15,%ymm0
+	vpand	%ymm8,%ymm15,%ymm4
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm11,%ymm10
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm11,%ymm1
+	vpxor	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm11,%ymm12,%ymm4
+
+	vpxor	%ymm1,%ymm10,%ymm10
+	vpaddd	%ymm7,%ymm6,%ymm6
+
+	vpsrld	$13,%ymm11,%ymm1
+
+	vpslld	$19,%ymm11,%ymm2
+	vpaddd	%ymm0,%ymm6,%ymm6
+	vpand	%ymm4,%ymm3,%ymm3
+
+	vpxor	%ymm1,%ymm10,%ymm7
+
+	vpsrld	$22,%ymm11,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm11,%ymm2
+	vpxor	%ymm3,%ymm12,%ymm10
+	vpaddd	%ymm6,%ymm14,%ymm14
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm6,%ymm10,%ymm10
+	vpaddd	%ymm7,%ymm10,%ymm10
+	vmovdqu	480-256-128(%rbx),%ymm6
+	vpaddd	224-128(%rax),%ymm5,%ymm5
+
+	vpsrld	$3,%ymm6,%ymm7
+	vpsrld	$7,%ymm6,%ymm1
+	vpslld	$25,%ymm6,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$18,%ymm6,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$14,%ymm6,%ymm2
+	vmovdqu	384-256-128(%rbx),%ymm0
+	vpsrld	$10,%ymm0,%ymm3
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$17,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$15,%ymm0,%ymm2
+	vpaddd	%ymm7,%ymm5,%ymm5
+	vpxor	%ymm1,%ymm3,%ymm7
+	vpsrld	$19,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$13,%ymm0,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpaddd	%ymm7,%ymm5,%ymm5
+	vpsrld	$6,%ymm14,%ymm7
+	vpslld	$26,%ymm14,%ymm2
+	vmovdqu	%ymm5,448-256-128(%rbx)
+	vpaddd	%ymm9,%ymm5,%ymm5
+
+	vpsrld	$11,%ymm14,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm14,%ymm2
+	vpaddd	64(%rbp),%ymm5,%ymm5
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm14,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm14,%ymm2
+	vpandn	%ymm8,%ymm14,%ymm0
+	vpand	%ymm15,%ymm14,%ymm3
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm10,%ymm9
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm10,%ymm1
+	vpxor	%ymm3,%ymm0,%ymm0
+	vpxor	%ymm10,%ymm11,%ymm3
+
+	vpxor	%ymm1,%ymm9,%ymm9
+	vpaddd	%ymm7,%ymm5,%ymm5
+
+	vpsrld	$13,%ymm10,%ymm1
+
+	vpslld	$19,%ymm10,%ymm2
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpand	%ymm3,%ymm4,%ymm4
+
+	vpxor	%ymm1,%ymm9,%ymm7
+
+	vpsrld	$22,%ymm10,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm10,%ymm2
+	vpxor	%ymm4,%ymm11,%ymm9
+	vpaddd	%ymm5,%ymm13,%ymm13
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm5,%ymm9,%ymm9
+	vpaddd	%ymm7,%ymm9,%ymm9
+	vmovdqu	0-128(%rax),%ymm5
+	vpaddd	256-256-128(%rbx),%ymm6,%ymm6
+
+	vpsrld	$3,%ymm5,%ymm7
+	vpsrld	$7,%ymm5,%ymm1
+	vpslld	$25,%ymm5,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$18,%ymm5,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$14,%ymm5,%ymm2
+	vmovdqu	416-256-128(%rbx),%ymm0
+	vpsrld	$10,%ymm0,%ymm4
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpsrld	$17,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$15,%ymm0,%ymm2
+	vpaddd	%ymm7,%ymm6,%ymm6
+	vpxor	%ymm1,%ymm4,%ymm7
+	vpsrld	$19,%ymm0,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$13,%ymm0,%ymm2
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpaddd	%ymm7,%ymm6,%ymm6
+	vpsrld	$6,%ymm13,%ymm7
+	vpslld	$26,%ymm13,%ymm2
+	vmovdqu	%ymm6,480-256-128(%rbx)
+	vpaddd	%ymm8,%ymm6,%ymm6
+
+	vpsrld	$11,%ymm13,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+	vpslld	$21,%ymm13,%ymm2
+	vpaddd	96(%rbp),%ymm6,%ymm6
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$25,%ymm13,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$7,%ymm13,%ymm2
+	vpandn	%ymm15,%ymm13,%ymm0
+	vpand	%ymm14,%ymm13,%ymm4
+
+	vpxor	%ymm1,%ymm7,%ymm7
+
+	vpsrld	$2,%ymm9,%ymm8
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$30,%ymm9,%ymm1
+	vpxor	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm9,%ymm10,%ymm4
+
+	vpxor	%ymm1,%ymm8,%ymm8
+	vpaddd	%ymm7,%ymm6,%ymm6
+
+	vpsrld	$13,%ymm9,%ymm1
+
+	vpslld	$19,%ymm9,%ymm2
+	vpaddd	%ymm0,%ymm6,%ymm6
+	vpand	%ymm4,%ymm3,%ymm3
+
+	vpxor	%ymm1,%ymm8,%ymm7
+
+	vpsrld	$22,%ymm9,%ymm1
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpslld	$10,%ymm9,%ymm2
+	vpxor	%ymm3,%ymm10,%ymm8
+	vpaddd	%ymm6,%ymm12,%ymm12
+
+	vpxor	%ymm1,%ymm7,%ymm7
+	vpxor	%ymm2,%ymm7,%ymm7
+
+	vpaddd	%ymm6,%ymm8,%ymm8
+	vpaddd	%ymm7,%ymm8,%ymm8
+	addq	$256,%rbp
+	decl	%ecx
+	jnz	.Loop_16_xx_avx2
+
+	movl	$1,%ecx
+	leaq	512(%rsp),%rbx
+	leaq	K256+128(%rip),%rbp
+	cmpl	0(%rbx),%ecx
+	cmovgeq	%rbp,%r12
+	cmpl	4(%rbx),%ecx
+	cmovgeq	%rbp,%r13
+	cmpl	8(%rbx),%ecx
+	cmovgeq	%rbp,%r14
+	cmpl	12(%rbx),%ecx
+	cmovgeq	%rbp,%r15
+	cmpl	16(%rbx),%ecx
+	cmovgeq	%rbp,%r8
+	cmpl	20(%rbx),%ecx
+	cmovgeq	%rbp,%r9
+	cmpl	24(%rbx),%ecx
+	cmovgeq	%rbp,%r10
+	cmpl	28(%rbx),%ecx
+	cmovgeq	%rbp,%r11
+	vmovdqa	(%rbx),%ymm7
+	vpxor	%ymm0,%ymm0,%ymm0
+	vmovdqa	%ymm7,%ymm6
+	vpcmpgtd	%ymm0,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm7,%ymm7
+
+	vmovdqu	0-128(%rdi),%ymm0
+	vpand	%ymm6,%ymm8,%ymm8
+	vmovdqu	32-128(%rdi),%ymm1
+	vpand	%ymm6,%ymm9,%ymm9
+	vmovdqu	64-128(%rdi),%ymm2
+	vpand	%ymm6,%ymm10,%ymm10
+	vmovdqu	96-128(%rdi),%ymm5
+	vpand	%ymm6,%ymm11,%ymm11
+	vpaddd	%ymm0,%ymm8,%ymm8
+	vmovdqu	128-128(%rdi),%ymm0
+	vpand	%ymm6,%ymm12,%ymm12
+	vpaddd	%ymm1,%ymm9,%ymm9
+	vmovdqu	160-128(%rdi),%ymm1
+	vpand	%ymm6,%ymm13,%ymm13
+	vpaddd	%ymm2,%ymm10,%ymm10
+	vmovdqu	192-128(%rdi),%ymm2
+	vpand	%ymm6,%ymm14,%ymm14
+	vpaddd	%ymm5,%ymm11,%ymm11
+	vmovdqu	224-128(%rdi),%ymm5
+	vpand	%ymm6,%ymm15,%ymm15
+	vpaddd	%ymm0,%ymm12,%ymm12
+	vpaddd	%ymm1,%ymm13,%ymm13
+	vmovdqu	%ymm8,0-128(%rdi)
+	vpaddd	%ymm2,%ymm14,%ymm14
+	vmovdqu	%ymm9,32-128(%rdi)
+	vpaddd	%ymm5,%ymm15,%ymm15
+	vmovdqu	%ymm10,64-128(%rdi)
+	vmovdqu	%ymm11,96-128(%rdi)
+	vmovdqu	%ymm12,128-128(%rdi)
+	vmovdqu	%ymm13,160-128(%rdi)
+	vmovdqu	%ymm14,192-128(%rdi)
+	vmovdqu	%ymm15,224-128(%rdi)
+
+	vmovdqu	%ymm7,(%rbx)
+	leaq	256+128(%rsp),%rbx
+	vmovdqu	.Lpbswap(%rip),%ymm6
+	decl	%edx
+	jnz	.Loop_avx2
+
+
+
+
+
+
+
+.Ldone_avx2:
+	movq	544(%rsp),%rax
+	vzeroupper
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+.Lepilogue_avx2:
+	.byte	0xf3,0xc3
+.size	sha256_multi_block_avx2,.-sha256_multi_block_avx2
+.align	256
+K256:
+.long	1116352408,1116352408,1116352408,1116352408
+.long	1116352408,1116352408,1116352408,1116352408
+.long	1899447441,1899447441,1899447441,1899447441
+.long	1899447441,1899447441,1899447441,1899447441
+.long	3049323471,3049323471,3049323471,3049323471
+.long	3049323471,3049323471,3049323471,3049323471
+.long	3921009573,3921009573,3921009573,3921009573
+.long	3921009573,3921009573,3921009573,3921009573
+.long	961987163,961987163,961987163,961987163
+.long	961987163,961987163,961987163,961987163
+.long	1508970993,1508970993,1508970993,1508970993
+.long	1508970993,1508970993,1508970993,1508970993
+.long	2453635748,2453635748,2453635748,2453635748
+.long	2453635748,2453635748,2453635748,2453635748
+.long	2870763221,2870763221,2870763221,2870763221
+.long	2870763221,2870763221,2870763221,2870763221
+.long	3624381080,3624381080,3624381080,3624381080
+.long	3624381080,3624381080,3624381080,3624381080
+.long	310598401,310598401,310598401,310598401
+.long	310598401,310598401,310598401,310598401
+.long	607225278,607225278,607225278,607225278
+.long	607225278,607225278,607225278,607225278
+.long	1426881987,1426881987,1426881987,1426881987
+.long	1426881987,1426881987,1426881987,1426881987
+.long	1925078388,1925078388,1925078388,1925078388
+.long	1925078388,1925078388,1925078388,1925078388
+.long	2162078206,2162078206,2162078206,2162078206
+.long	2162078206,2162078206,2162078206,2162078206
+.long	2614888103,2614888103,2614888103,2614888103
+.long	2614888103,2614888103,2614888103,2614888103
+.long	3248222580,3248222580,3248222580,3248222580
+.long	3248222580,3248222580,3248222580,3248222580
+.long	3835390401,3835390401,3835390401,3835390401
+.long	3835390401,3835390401,3835390401,3835390401
+.long	4022224774,4022224774,4022224774,4022224774
+.long	4022224774,4022224774,4022224774,4022224774
+.long	264347078,264347078,264347078,264347078
+.long	264347078,264347078,264347078,264347078
+.long	604807628,604807628,604807628,604807628
+.long	604807628,604807628,604807628,604807628
+.long	770255983,770255983,770255983,770255983
+.long	770255983,770255983,770255983,770255983
+.long	1249150122,1249150122,1249150122,1249150122
+.long	1249150122,1249150122,1249150122,1249150122
+.long	1555081692,1555081692,1555081692,1555081692
+.long	1555081692,1555081692,1555081692,1555081692
+.long	1996064986,1996064986,1996064986,1996064986
+.long	1996064986,1996064986,1996064986,1996064986
+.long	2554220882,2554220882,2554220882,2554220882
+.long	2554220882,2554220882,2554220882,2554220882
+.long	2821834349,2821834349,2821834349,2821834349
+.long	2821834349,2821834349,2821834349,2821834349
+.long	2952996808,2952996808,2952996808,2952996808
+.long	2952996808,2952996808,2952996808,2952996808
+.long	3210313671,3210313671,3210313671,3210313671
+.long	3210313671,3210313671,3210313671,3210313671
+.long	3336571891,3336571891,3336571891,3336571891
+.long	3336571891,3336571891,3336571891,3336571891
+.long	3584528711,3584528711,3584528711,3584528711
+.long	3584528711,3584528711,3584528711,3584528711
+.long	113926993,113926993,113926993,113926993
+.long	113926993,113926993,113926993,113926993
+.long	338241895,338241895,338241895,338241895
+.long	338241895,338241895,338241895,338241895
+.long	666307205,666307205,666307205,666307205
+.long	666307205,666307205,666307205,666307205
+.long	773529912,773529912,773529912,773529912
+.long	773529912,773529912,773529912,773529912
+.long	1294757372,1294757372,1294757372,1294757372
+.long	1294757372,1294757372,1294757372,1294757372
+.long	1396182291,1396182291,1396182291,1396182291
+.long	1396182291,1396182291,1396182291,1396182291
+.long	1695183700,1695183700,1695183700,1695183700
+.long	1695183700,1695183700,1695183700,1695183700
+.long	1986661051,1986661051,1986661051,1986661051
+.long	1986661051,1986661051,1986661051,1986661051
+.long	2177026350,2177026350,2177026350,2177026350
+.long	2177026350,2177026350,2177026350,2177026350
+.long	2456956037,2456956037,2456956037,2456956037
+.long	2456956037,2456956037,2456956037,2456956037
+.long	2730485921,2730485921,2730485921,2730485921
+.long	2730485921,2730485921,2730485921,2730485921
+.long	2820302411,2820302411,2820302411,2820302411
+.long	2820302411,2820302411,2820302411,2820302411
+.long	3259730800,3259730800,3259730800,3259730800
+.long	3259730800,3259730800,3259730800,3259730800
+.long	3345764771,3345764771,3345764771,3345764771
+.long	3345764771,3345764771,3345764771,3345764771
+.long	3516065817,3516065817,3516065817,3516065817
+.long	3516065817,3516065817,3516065817,3516065817
+.long	3600352804,3600352804,3600352804,3600352804
+.long	3600352804,3600352804,3600352804,3600352804
+.long	4094571909,4094571909,4094571909,4094571909
+.long	4094571909,4094571909,4094571909,4094571909
+.long	275423344,275423344,275423344,275423344
+.long	275423344,275423344,275423344,275423344
+.long	430227734,430227734,430227734,430227734
+.long	430227734,430227734,430227734,430227734
+.long	506948616,506948616,506948616,506948616
+.long	506948616,506948616,506948616,506948616
+.long	659060556,659060556,659060556,659060556
+.long	659060556,659060556,659060556,659060556
+.long	883997877,883997877,883997877,883997877
+.long	883997877,883997877,883997877,883997877
+.long	958139571,958139571,958139571,958139571
+.long	958139571,958139571,958139571,958139571
+.long	1322822218,1322822218,1322822218,1322822218
+.long	1322822218,1322822218,1322822218,1322822218
+.long	1537002063,1537002063,1537002063,1537002063
+.long	1537002063,1537002063,1537002063,1537002063
+.long	1747873779,1747873779,1747873779,1747873779
+.long	1747873779,1747873779,1747873779,1747873779
+.long	1955562222,1955562222,1955562222,1955562222
+.long	1955562222,1955562222,1955562222,1955562222
+.long	2024104815,2024104815,2024104815,2024104815
+.long	2024104815,2024104815,2024104815,2024104815
+.long	2227730452,2227730452,2227730452,2227730452
+.long	2227730452,2227730452,2227730452,2227730452
+.long	2361852424,2361852424,2361852424,2361852424
+.long	2361852424,2361852424,2361852424,2361852424
+.long	2428436474,2428436474,2428436474,2428436474
+.long	2428436474,2428436474,2428436474,2428436474
+.long	2756734187,2756734187,2756734187,2756734187
+.long	2756734187,2756734187,2756734187,2756734187
+.long	3204031479,3204031479,3204031479,3204031479
+.long	3204031479,3204031479,3204031479,3204031479
+.long	3329325298,3329325298,3329325298,3329325298
+.long	3329325298,3329325298,3329325298,3329325298
+.Lpbswap:
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+K256_shaext:
+.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.byte	83,72,65,50,53,54,32,109,117,108,116,105,45,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0


Property changes on: trunk/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/engines/lib4758cca/Makefile.depend
===================================================================
--- trunk/secure/lib/libcrypto/engines/lib4758cca/Makefile.depend	                        (rev 0)
+++ trunk/secure/lib/libcrypto/engines/lib4758cca/Makefile.depend	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,20 @@
+# $MidnightBSD$
+# $FreeBSD: stable/11/secure/lib/libcrypto/engines/lib4758cca/Makefile.depend 284345 2015-06-13 19:20:56Z sjg $
+# Autogenerated - do NOT edit!
+
+DIRDEPS = \
+	gnu/lib/csu \
+	gnu/lib/libgcc \
+	include \
+	include/xlocale \
+	lib/${CSU_DIR} \
+	lib/libc \
+	lib/libcompiler_rt \
+	secure/lib/libcrypto \
+
+
+.include <dirdeps.mk>
+
+.if ${DEP_RELDIR} == ${_DEP_RELDIR}
+# local dependencies - needed for -jN in clean tree
+.endif


Property changes on: trunk/secure/lib/libcrypto/engines/lib4758cca/Makefile.depend
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/engines/libaep/Makefile.depend
===================================================================
--- trunk/secure/lib/libcrypto/engines/libaep/Makefile.depend	                        (rev 0)
+++ trunk/secure/lib/libcrypto/engines/libaep/Makefile.depend	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,20 @@
+# $MidnightBSD$
+# $FreeBSD: stable/11/secure/lib/libcrypto/engines/libaep/Makefile.depend 284345 2015-06-13 19:20:56Z sjg $
+# Autogenerated - do NOT edit!
+
+DIRDEPS = \
+	gnu/lib/csu \
+	gnu/lib/libgcc \
+	include \
+	include/xlocale \
+	lib/${CSU_DIR} \
+	lib/libc \
+	lib/libcompiler_rt \
+	secure/lib/libcrypto \
+
+
+.include <dirdeps.mk>
+
+.if ${DEP_RELDIR} == ${_DEP_RELDIR}
+# local dependencies - needed for -jN in clean tree
+.endif


Property changes on: trunk/secure/lib/libcrypto/engines/libaep/Makefile.depend
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/engines/libatalla/Makefile.depend
===================================================================
--- trunk/secure/lib/libcrypto/engines/libatalla/Makefile.depend	                        (rev 0)
+++ trunk/secure/lib/libcrypto/engines/libatalla/Makefile.depend	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,20 @@
+# $MidnightBSD$
+# $FreeBSD: stable/11/secure/lib/libcrypto/engines/libatalla/Makefile.depend 284345 2015-06-13 19:20:56Z sjg $
+# Autogenerated - do NOT edit!
+
+DIRDEPS = \
+	gnu/lib/csu \
+	gnu/lib/libgcc \
+	include \
+	include/xlocale \
+	lib/${CSU_DIR} \
+	lib/libc \
+	lib/libcompiler_rt \
+	secure/lib/libcrypto \
+
+
+.include <dirdeps.mk>
+
+.if ${DEP_RELDIR} == ${_DEP_RELDIR}
+# local dependencies - needed for -jN in clean tree
+.endif


Property changes on: trunk/secure/lib/libcrypto/engines/libatalla/Makefile.depend
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/engines/libcapi/Makefile
===================================================================
--- trunk/secure/lib/libcrypto/engines/libcapi/Makefile	                        (rev 0)
+++ trunk/secure/lib/libcrypto/engines/libcapi/Makefile	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,7 @@
+# $MidnightBSD$
+# $FreeBSD: stable/11/secure/lib/libcrypto/engines/libcapi/Makefile 290207 2015-10-30 20:51:33Z jkim $
+
+SHLIB_NAME?= libcapi.so
+SRCS=	e_capi.c
+
+.include <bsd.lib.mk>


Property changes on: trunk/secure/lib/libcrypto/engines/libcapi/Makefile
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/engines/libcapi/Makefile.depend
===================================================================
--- trunk/secure/lib/libcrypto/engines/libcapi/Makefile.depend	                        (rev 0)
+++ trunk/secure/lib/libcrypto/engines/libcapi/Makefile.depend	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,20 @@
+# $MidnightBSD$
+# $FreeBSD: stable/11/secure/lib/libcrypto/engines/libcapi/Makefile.depend 291563 2015-12-01 05:23:19Z bdrewery $
+# Autogenerated - do NOT edit!
+
+DIRDEPS = \
+	gnu/lib/csu \
+	gnu/lib/libgcc \
+	include \
+	include/xlocale \
+	lib/${CSU_DIR} \
+	lib/libc \
+	lib/libcompiler_rt \
+	secure/lib/libcrypto \
+
+
+.include <dirdeps.mk>
+
+.if ${DEP_RELDIR} == ${_DEP_RELDIR}
+# local dependencies - needed for -jN in clean tree
+.endif


Property changes on: trunk/secure/lib/libcrypto/engines/libcapi/Makefile.depend
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/engines/libchil/Makefile.depend
===================================================================
--- trunk/secure/lib/libcrypto/engines/libchil/Makefile.depend	                        (rev 0)
+++ trunk/secure/lib/libcrypto/engines/libchil/Makefile.depend	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,20 @@
+# $MidnightBSD$
+# $FreeBSD: stable/11/secure/lib/libcrypto/engines/libchil/Makefile.depend 284345 2015-06-13 19:20:56Z sjg $
+# Autogenerated - do NOT edit!
+
+DIRDEPS = \
+	gnu/lib/csu \
+	gnu/lib/libgcc \
+	include \
+	include/xlocale \
+	lib/${CSU_DIR} \
+	lib/libc \
+	lib/libcompiler_rt \
+	secure/lib/libcrypto \
+
+
+.include <dirdeps.mk>
+
+.if ${DEP_RELDIR} == ${_DEP_RELDIR}
+# local dependencies - needed for -jN in clean tree
+.endif


Property changes on: trunk/secure/lib/libcrypto/engines/libchil/Makefile.depend
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/engines/libcswift/Makefile.depend
===================================================================
--- trunk/secure/lib/libcrypto/engines/libcswift/Makefile.depend	                        (rev 0)
+++ trunk/secure/lib/libcrypto/engines/libcswift/Makefile.depend	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,20 @@
+# $MidnightBSD$
+# $FreeBSD: stable/11/secure/lib/libcrypto/engines/libcswift/Makefile.depend 284345 2015-06-13 19:20:56Z sjg $
+# Autogenerated - do NOT edit!
+
+DIRDEPS = \
+	gnu/lib/csu \
+	gnu/lib/libgcc \
+	include \
+	include/xlocale \
+	lib/${CSU_DIR} \
+	lib/libc \
+	lib/libcompiler_rt \
+	secure/lib/libcrypto \
+
+
+.include <dirdeps.mk>
+
+.if ${DEP_RELDIR} == ${_DEP_RELDIR}
+# local dependencies - needed for -jN in clean tree
+.endif


Property changes on: trunk/secure/lib/libcrypto/engines/libcswift/Makefile.depend
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/engines/libgost/Makefile.depend
===================================================================
--- trunk/secure/lib/libcrypto/engines/libgost/Makefile.depend	                        (rev 0)
+++ trunk/secure/lib/libcrypto/engines/libgost/Makefile.depend	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,20 @@
+# $MidnightBSD$
+# $FreeBSD: stable/11/secure/lib/libcrypto/engines/libgost/Makefile.depend 284345 2015-06-13 19:20:56Z sjg $
+# Autogenerated - do NOT edit!
+
+DIRDEPS = \
+	gnu/lib/csu \
+	gnu/lib/libgcc \
+	include \
+	include/xlocale \
+	lib/${CSU_DIR} \
+	lib/libc \
+	lib/libcompiler_rt \
+	secure/lib/libcrypto \
+
+
+.include <dirdeps.mk>
+
+.if ${DEP_RELDIR} == ${_DEP_RELDIR}
+# local dependencies - needed for -jN in clean tree
+.endif


Property changes on: trunk/secure/lib/libcrypto/engines/libgost/Makefile.depend
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/engines/libnuron/Makefile.depend
===================================================================
--- trunk/secure/lib/libcrypto/engines/libnuron/Makefile.depend	                        (rev 0)
+++ trunk/secure/lib/libcrypto/engines/libnuron/Makefile.depend	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,20 @@
+# $MidnightBSD$
+# $FreeBSD: stable/11/secure/lib/libcrypto/engines/libnuron/Makefile.depend 284345 2015-06-13 19:20:56Z sjg $
+# Autogenerated - do NOT edit!
+
+DIRDEPS = \
+	gnu/lib/csu \
+	gnu/lib/libgcc \
+	include \
+	include/xlocale \
+	lib/${CSU_DIR} \
+	lib/libc \
+	lib/libcompiler_rt \
+	secure/lib/libcrypto \
+
+
+.include <dirdeps.mk>
+
+.if ${DEP_RELDIR} == ${_DEP_RELDIR}
+# local dependencies - needed for -jN in clean tree
+.endif


Property changes on: trunk/secure/lib/libcrypto/engines/libnuron/Makefile.depend
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/engines/libsureware/Makefile.depend
===================================================================
--- trunk/secure/lib/libcrypto/engines/libsureware/Makefile.depend	                        (rev 0)
+++ trunk/secure/lib/libcrypto/engines/libsureware/Makefile.depend	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,20 @@
+# $MidnightBSD$
+# $FreeBSD: stable/11/secure/lib/libcrypto/engines/libsureware/Makefile.depend 284345 2015-06-13 19:20:56Z sjg $
+# Autogenerated - do NOT edit!
+
+DIRDEPS = \
+	gnu/lib/csu \
+	gnu/lib/libgcc \
+	include \
+	include/xlocale \
+	lib/${CSU_DIR} \
+	lib/libc \
+	lib/libcompiler_rt \
+	secure/lib/libcrypto \
+
+
+.include <dirdeps.mk>
+
+.if ${DEP_RELDIR} == ${_DEP_RELDIR}
+# local dependencies - needed for -jN in clean tree
+.endif


Property changes on: trunk/secure/lib/libcrypto/engines/libsureware/Makefile.depend
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/engines/libubsec/Makefile.depend
===================================================================
--- trunk/secure/lib/libcrypto/engines/libubsec/Makefile.depend	                        (rev 0)
+++ trunk/secure/lib/libcrypto/engines/libubsec/Makefile.depend	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,20 @@
+# $MidnightBSD$
+# $FreeBSD: stable/11/secure/lib/libcrypto/engines/libubsec/Makefile.depend 284345 2015-06-13 19:20:56Z sjg $
+# Autogenerated - do NOT edit!
+
+DIRDEPS = \
+	gnu/lib/csu \
+	gnu/lib/libgcc \
+	include \
+	include/xlocale \
+	lib/${CSU_DIR} \
+	lib/libc \
+	lib/libcompiler_rt \
+	secure/lib/libcrypto \
+
+
+.include <dirdeps.mk>
+
+.if ${DEP_RELDIR} == ${_DEP_RELDIR}
+# local dependencies - needed for -jN in clean tree
+.endif


Property changes on: trunk/secure/lib/libcrypto/engines/libubsec/Makefile.depend
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/man/ASN1_TIME_set.3
===================================================================
--- trunk/secure/lib/libcrypto/man/ASN1_TIME_set.3	                        (rev 0)
+++ trunk/secure/lib/libcrypto/man/ASN1_TIME_set.3	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,261 @@
+.\" $MidnightBSD$
+.\" Automatically generated by Pod::Man 4.09 (Pod::Simple 3.35)
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" Set up some character translations and predefined strings.  \*(-- will
+.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
+.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
+.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
+.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
+.\" nothing in troff, for use with C<>.
+.tr \(*W-
+.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
+.ie n \{\
+.    ds -- \(*W-
+.    ds PI pi
+.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
+.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
+.    ds L" ""
+.    ds R" ""
+.    ds C` ""
+.    ds C' ""
+'br\}
+.el\{\
+.    ds -- \|\(em\|
+.    ds PI \(*p
+.    ds L" ``
+.    ds R" ''
+.    ds C`
+.    ds C'
+'br\}
+.\"
+.\" Escape single quotes in literal strings from groff's Unicode transform.
+.ie \n(.g .ds Aq \(aq
+.el       .ds Aq '
+.\"
+.\" If the F register is >0, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
+.\" entries marked with X<> in POD.  Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.\"
+.\" Avoid warning from groff about undefined register 'F'.
+.de IX
+..
+.if !\nF .nr F 0
+.if \nF>0 \{\
+.    de IX
+.    tm Index:\\$1\t\\n%\t"\\$2"
+..
+.    if !\nF==2 \{\
+.        nr % 0
+.        nr F 2
+.    \}
+.\}
+.\"
+.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
+.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
+.    \" fudge factors for nroff and troff
+.if n \{\
+.    ds #H 0
+.    ds #V .8m
+.    ds #F .3m
+.    ds #[ \f1
+.    ds #] \fP
+.\}
+.if t \{\
+.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
+.    ds #V .6m
+.    ds #F 0
+.    ds #[ \&
+.    ds #] \&
+.\}
+.    \" simple accents for nroff and troff
+.if n \{\
+.    ds ' \&
+.    ds ` \&
+.    ds ^ \&
+.    ds , \&
+.    ds ~ ~
+.    ds /
+.\}
+.if t \{\
+.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
+.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
+.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
+.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
+.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
+.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
+.\}
+.    \" troff and (daisy-wheel) nroff accents
+.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
+.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
+.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
+.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
+.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
+.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
+.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
+.ds ae a\h'-(\w'a'u*4/10)'e
+.ds Ae A\h'-(\w'A'u*4/10)'E
+.    \" corrections for vroff
+.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
+.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
+.    \" for low resolution devices (crt and lpr)
+.if \n(.H>23 .if \n(.V>19 \
+\{\
+.    ds : e
+.    ds 8 ss
+.    ds o a
+.    ds d- d\h'-1'\(ga
+.    ds D- D\h'-1'\(hy
+.    ds th \o'bp'
+.    ds Th \o'LP'
+.    ds ae ae
+.    ds Ae AE
+.\}
+.rm #[ #] #H #V #F C
+.\" ========================================================================
+.\"
+.IX Title "ASN1_TIME_set 3"
+.TH ASN1_TIME_set 3 "2018-11-20" "1.0.2q" "OpenSSL"
+.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.if n .ad l
+.nh
+.SH "NAME"
+ASN1_TIME_set, ASN1_TIME_adj, ASN1_TIME_check, ASN1_TIME_set_string,
+ASN1_TIME_print, ASN1_TIME_diff \- ASN.1 Time functions.
+.SH "SYNOPSIS"
+.IX Header "SYNOPSIS"
+.Vb 6
+\& ASN1_TIME *ASN1_TIME_set(ASN1_TIME *s, time_t t);
+\& ASN1_TIME *ASN1_TIME_adj(ASN1_TIME *s, time_t t,
+\&                          int offset_day, long offset_sec);
+\& int ASN1_TIME_set_string(ASN1_TIME *s, const char *str);
+\& int ASN1_TIME_check(const ASN1_TIME *t);
+\& int ASN1_TIME_print(BIO *b, const ASN1_TIME *s);
+\&
+\& int ASN1_TIME_diff(int *pday, int *psec,
+\&                    const ASN1_TIME *from, const ASN1_TIME *to);
+.Ve
+.SH "DESCRIPTION"
+.IX Header "DESCRIPTION"
+The function \fIASN1_TIME_set()\fR sets the \s-1ASN1_TIME\s0 structure \fBs\fR to the
+time represented by the time_t value \fBt\fR. If \fBs\fR is \s-1NULL\s0 a new \s-1ASN1_TIME\s0
+structure is allocated and returned.
+.PP
+\&\fIASN1_TIME_adj()\fR sets the \s-1ASN1_TIME\s0 structure \fBs\fR to the time represented
+by the time \fBoffset_day\fR and \fBoffset_sec\fR after the time_t value \fBt\fR.
+The values of \fBoffset_day\fR or \fBoffset_sec\fR can be negative to set a
+time before \fBt\fR. The \fBoffset_sec\fR value can also exceed the number of
+seconds in a day. If \fBs\fR is \s-1NULL\s0 a new \s-1ASN1_TIME\s0 structure is allocated
+and returned.
+.PP
+\&\fIASN1_TIME_set_string()\fR sets \s-1ASN1_TIME\s0 structure \fBs\fR to the time
+represented by string \fBstr\fR which must be in appropriate \s-1ASN.1\s0 time
+format (for example \s-1YYMMDDHHMMSSZ\s0 or \s-1YYYYMMDDHHMMSSZ\s0).
+.PP
+\&\fIASN1_TIME_check()\fR checks the syntax of \s-1ASN1_TIME\s0 structure \fBs\fR.
+.PP
+\&\fIASN1_TIME_print()\fR prints out the time \fBs\fR to \s-1BIO\s0 \fBb\fR in human readable
+format. It will be of the format \s-1MMM DD HH:MM:SS YYYY\s0 [\s-1GMT\s0], for example
+\&\*(L"Feb  3 00:55:52 2015 \s-1GMT\*(R"\s0 it does not include a newline. If the time
+structure has invalid format it prints out \*(L"Bad time value\*(R" and returns
+an error.
+.PP
+\&\fIASN1_TIME_diff()\fR sets \fB*pday\fR and \fB*psec\fR to the time difference between
+\&\fBfrom\fR and \fBto\fR. If \fBto\fR represents a time later than \fBfrom\fR then
+one or both (depending on the time difference) of \fB*pday\fR and \fB*psec\fR
+will be positive. If \fBto\fR represents a time earlier than \fBfrom\fR then
+one or both of \fB*pday\fR and \fB*psec\fR will be negative. If \fBto\fR and \fBfrom\fR
+represent the same time then \fB*pday\fR and \fB*psec\fR will both be zero.
+If both \fB*pday\fR and \fB*psec\fR are non-zero they will always have the same
+sign. The value of \fB*psec\fR will always be less than the number of seconds
+in a day. If \fBfrom\fR or \fBto\fR is \s-1NULL\s0 the current time is used.
+.SH "NOTES"
+.IX Header "NOTES"
+The \s-1ASN1_TIME\s0 structure corresponds to the \s-1ASN.1\s0 structure \fBTime\fR
+defined in \s-1RFC5280\s0 et al. The time setting functions obey the rules outlined
+in \s-1RFC5280:\s0 if the date can be represented by UTCTime it is used, else
+GeneralizedTime is used.
+.PP
+The \s-1ASN1_TIME\s0 structure is represented as an \s-1ASN1_STRING\s0 internally and can
+be freed up using \fIASN1_STRING_free()\fR.
+.PP
+The \s-1ASN1_TIME\s0 structure can represent years from 0000 to 9999 but no attempt
+is made to correct ancient calendar changes (for example from Julian to
+Gregorian calendars).
+.PP
+Some applications add offset times directly to a time_t value and pass the
+results to \fIASN1_TIME_set()\fR (or equivalent). This can cause problems as the
+time_t value can overflow on some systems resulting in unexpected results.
+New applications should use \fIASN1_TIME_adj()\fR instead and pass the offset value
+in the \fBoffset_sec\fR and \fBoffset_day\fR parameters instead of directly
+manipulating a time_t value.
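+.PP
+For example, an offset of one day and 30 seconds can be applied without
+touching the time_t value itself (a minimal sketch; error checking omitted):
+.PP
+.Vb 7
+\& #include <time.h>
+\& #include <openssl/asn1.h>
+\&
+\& time_t now = time(NULL);
+\& /* one day plus 30 seconds after now, computed inside the library */
+\& ASN1_TIME *notafter = ASN1_TIME_adj(NULL, now, 1, 30);
+\& ASN1_STRING_free(notafter);
+.Ve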
+.SH "BUGS"
+.IX Header "BUGS"
+\&\fIASN1_TIME_print()\fR currently does not print out the time zone: it either prints
+out \*(L"\s-1GMT\*(R"\s0 or nothing. But all certificates complying with \s-1RFC5280\s0 et al use \s-1GMT\s0
+anyway.
+.SH "EXAMPLES"
+.IX Header "EXAMPLES"
+Set a time structure to one hour after the current time and print it out:
+.PP
+.Vb 11
+\& #include <time.h>
+\& #include <openssl/asn1.h>
+\& ASN1_TIME *tm;
+\& time_t t;
+\& BIO *b;
+\& t = time(NULL);
+\& tm = ASN1_TIME_adj(NULL, t, 0, 60 * 60);
+\& b = BIO_new_fp(stdout, BIO_NOCLOSE);
+\& ASN1_TIME_print(b, tm);
+\& ASN1_STRING_free(tm);
+\& BIO_free(b);
+.Ve
+.PP
+Determine if one time is later or sooner than the current time:
+.PP
+.Vb 1
+\& int day, sec;
+\&
+\& if (!ASN1_TIME_diff(&day, &sec, NULL, to))
+\&        /* Invalid time format */
+\&
+\& if (day > 0 || sec > 0)
+\&   printf("Later\en");
+\& else if (day < 0 || sec < 0)
+\&   printf("Sooner\en");
+\& else
+\&   printf("Same\en");
+.Ve
+.SH "RETURN VALUES"
+.IX Header "RETURN VALUES"
+\&\fIASN1_TIME_set()\fR and \fIASN1_TIME_adj()\fR return a pointer to an \s-1ASN1_TIME\s0 structure
+or \s-1NULL\s0 if an error occurred.
+.PP
+\&\fIASN1_TIME_set_string()\fR returns 1 if the time value is successfully set and
+0 otherwise.
+.PP
+\&\fIASN1_TIME_check()\fR returns 1 if the structure is syntactically correct and 0
+otherwise.
+.PP
+\&\fIASN1_TIME_print()\fR returns 1 if the time is successfully printed out and 0 if
+an error occurred (I/O error or invalid time format).
+.PP
+\&\fIASN1_TIME_diff()\fR returns 1 for success and 0 for failure. It can fail if the
+passed \s-1ASN1_TIME\s0 structure has invalid syntax, for example.


Property changes on: trunk/secure/lib/libcrypto/man/ASN1_TIME_set.3
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/man/EC_GFp_simple_method.3
===================================================================
--- trunk/secure/lib/libcrypto/man/EC_GFp_simple_method.3	                        (rev 0)
+++ trunk/secure/lib/libcrypto/man/EC_GFp_simple_method.3	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,190 @@
+.\" $MidnightBSD$
+.\" Automatically generated by Pod::Man 4.09 (Pod::Simple 3.35)
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" Set up some character translations and predefined strings.  \*(-- will
+.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
+.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
+.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
+.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
+.\" nothing in troff, for use with C<>.
+.tr \(*W-
+.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
+.ie n \{\
+.    ds -- \(*W-
+.    ds PI pi
+.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
+.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
+.    ds L" ""
+.    ds R" ""
+.    ds C` ""
+.    ds C' ""
+'br\}
+.el\{\
+.    ds -- \|\(em\|
+.    ds PI \(*p
+.    ds L" ``
+.    ds R" ''
+.    ds C`
+.    ds C'
+'br\}
+.\"
+.\" Escape single quotes in literal strings from groff's Unicode transform.
+.ie \n(.g .ds Aq \(aq
+.el       .ds Aq '
+.\"
+.\" If the F register is >0, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
+.\" entries marked with X<> in POD.  Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.\"
+.\" Avoid warning from groff about undefined register 'F'.
+.de IX
+..
+.if !\nF .nr F 0
+.if \nF>0 \{\
+.    de IX
+.    tm Index:\\$1\t\\n%\t"\\$2"
+..
+.    if !\nF==2 \{\
+.        nr % 0
+.        nr F 2
+.    \}
+.\}
+.\"
+.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
+.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
+.    \" fudge factors for nroff and troff
+.if n \{\
+.    ds #H 0
+.    ds #V .8m
+.    ds #F .3m
+.    ds #[ \f1
+.    ds #] \fP
+.\}
+.if t \{\
+.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
+.    ds #V .6m
+.    ds #F 0
+.    ds #[ \&
+.    ds #] \&
+.\}
+.    \" simple accents for nroff and troff
+.if n \{\
+.    ds ' \&
+.    ds ` \&
+.    ds ^ \&
+.    ds , \&
+.    ds ~ ~
+.    ds /
+.\}
+.if t \{\
+.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
+.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
+.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
+.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
+.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
+.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
+.\}
+.    \" troff and (daisy-wheel) nroff accents
+.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
+.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
+.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
+.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
+.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
+.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
+.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
+.ds ae a\h'-(\w'a'u*4/10)'e
+.ds Ae A\h'-(\w'A'u*4/10)'E
+.    \" corrections for vroff
+.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
+.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
+.    \" for low resolution devices (crt and lpr)
+.if \n(.H>23 .if \n(.V>19 \
+\{\
+.    ds : e
+.    ds 8 ss
+.    ds o a
+.    ds d- d\h'-1'\(ga
+.    ds D- D\h'-1'\(hy
+.    ds th \o'bp'
+.    ds Th \o'LP'
+.    ds ae ae
+.    ds Ae AE
+.\}
+.rm #[ #] #H #V #F C
+.\" ========================================================================
+.\"
+.IX Title "EC_GFp_simple_method 3"
+.TH EC_GFp_simple_method 3 "2018-11-20" "1.0.2q" "OpenSSL"
+.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.if n .ad l
+.nh
+.SH "NAME"
+EC_GFp_simple_method, EC_GFp_mont_method, EC_GFp_nist_method, EC_GFp_nistp224_method, EC_GFp_nistp256_method, EC_GFp_nistp521_method, EC_GF2m_simple_method, EC_METHOD_get_field_type \- Functions for obtaining EC_METHOD objects.
+.SH "SYNOPSIS"
+.IX Header "SYNOPSIS"
+.Vb 1
+\& #include <openssl/ec.h>
+\&
+\& const EC_METHOD *EC_GFp_simple_method(void);
+\& const EC_METHOD *EC_GFp_mont_method(void);
+\& const EC_METHOD *EC_GFp_nist_method(void);
+\& const EC_METHOD *EC_GFp_nistp224_method(void);
+\& const EC_METHOD *EC_GFp_nistp256_method(void);
+\& const EC_METHOD *EC_GFp_nistp521_method(void);
+\&
+\& const EC_METHOD *EC_GF2m_simple_method(void);
+\&
+\& int EC_METHOD_get_field_type(const EC_METHOD *meth);
+.Ve
+.SH "DESCRIPTION"
+.IX Header "DESCRIPTION"
+The Elliptic Curve library provides a number of different implementations through a single common interface.
+When constructing a curve using EC_GROUP_new (see \fIEC_GROUP_new\fR\|(3)) an
+implementation method must be provided. The functions described here all return a const pointer to an
+\&\fB\s-1EC_METHOD\s0\fR structure that can be passed to EC_GROUP_new. It is important that the correct implementation
+type for the form of curve selected is used.
+.PP
+For F2^m curves there is only one implementation choice, i.e. EC_GF2m_simple_method.
+.PP
+For Fp curves the lowest common denominator implementation is the EC_GFp_simple_method implementation. All
+other implementations are based on this one. EC_GFp_mont_method builds on EC_GFp_simple_method but adds the
+use of montgomery multiplication (see \fIBN_mod_mul_montgomery\fR\|(3)). EC_GFp_nist_method
+offers an implementation optimised for use with \s-1NIST\s0 recommended curves (\s-1NIST\s0 curves are available through
+EC_GROUP_new_by_curve_name as described in \fIEC_GROUP_new\fR\|(3)).
+.PP
+The functions EC_GFp_nistp224_method, EC_GFp_nistp256_method and EC_GFp_nistp521_method offer 64 bit
+optimised implementations for the \s-1NIST P224, P256\s0 and P521 curves respectively. Note, however, that these
+implementations are not available on all platforms.
+.PP
+EC_METHOD_get_field_type identifies what type of field the \s-1EC_METHOD\s0 structure supports, which will be either
+F2^m or Fp. If the field type is Fp then the value \fBNID_X9_62_prime_field\fR is returned. If the field type is
+F2^m then the value \fBNID_X9_62_characteristic_two_field\fR is returned. These values are defined in the
+obj_mac.h header file.
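+.PP
+For example, the field type of an implementation can be queried as follows (a
+minimal sketch):
+.PP
+.Vb 6
+\& #include <openssl/ec.h>
+\& #include <openssl/obj_mac.h>
+\&
+\& const EC_METHOD *meth = EC_GFp_mont_method();
+\& int field_type = EC_METHOD_get_field_type(meth);
+\& /* field_type is NID_X9_62_prime_field for this method (curves over Fp) */
+.Ve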
+.SH "RETURN VALUES"
+.IX Header "RETURN VALUES"
+All EC_GFp* functions and EC_GF2m_simple_method always return a const pointer to an \s-1EC_METHOD\s0 structure.
+.PP
+EC_METHOD_get_field_type returns an integer that identifies the type of field the \s-1EC_METHOD\s0 structure supports.
+.SH "SEE ALSO"
+.IX Header "SEE ALSO"
+\&\fIcrypto\fR\|(3), \fIec\fR\|(3), \fIEC_GROUP_new\fR\|(3), \fIEC_GROUP_copy\fR\|(3),
+\&\fIEC_POINT_new\fR\|(3), \fIEC_POINT_add\fR\|(3), \fIEC_KEY_new\fR\|(3),
+\&\fId2i_ECPKParameters\fR\|(3),
+\&\fIBN_mod_mul_montgomery\fR\|(3)


Property changes on: trunk/secure/lib/libcrypto/man/EC_GFp_simple_method.3
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/man/EC_GROUP_copy.3
===================================================================
--- trunk/secure/lib/libcrypto/man/EC_GROUP_copy.3	                        (rev 0)
+++ trunk/secure/lib/libcrypto/man/EC_GROUP_copy.3	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,305 @@
+.\" $MidnightBSD$
+.\" Automatically generated by Pod::Man 4.09 (Pod::Simple 3.35)
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" Set up some character translations and predefined strings.  \*(-- will
+.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
+.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
+.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
+.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
+.\" nothing in troff, for use with C<>.
+.tr \(*W-
+.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
+.ie n \{\
+.    ds -- \(*W-
+.    ds PI pi
+.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
+.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
+.    ds L" ""
+.    ds R" ""
+.    ds C` ""
+.    ds C' ""
+'br\}
+.el\{\
+.    ds -- \|\(em\|
+.    ds PI \(*p
+.    ds L" ``
+.    ds R" ''
+.    ds C`
+.    ds C'
+'br\}
+.\"
+.\" Escape single quotes in literal strings from groff's Unicode transform.
+.ie \n(.g .ds Aq \(aq
+.el       .ds Aq '
+.\"
+.\" If the F register is >0, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
+.\" entries marked with X<> in POD.  Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.\"
+.\" Avoid warning from groff about undefined register 'F'.
+.de IX
+..
+.if !\nF .nr F 0
+.if \nF>0 \{\
+.    de IX
+.    tm Index:\\$1\t\\n%\t"\\$2"
+..
+.    if !\nF==2 \{\
+.        nr % 0
+.        nr F 2
+.    \}
+.\}
+.\"
+.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
+.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
+.    \" fudge factors for nroff and troff
+.if n \{\
+.    ds #H 0
+.    ds #V .8m
+.    ds #F .3m
+.    ds #[ \f1
+.    ds #] \fP
+.\}
+.if t \{\
+.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
+.    ds #V .6m
+.    ds #F 0
+.    ds #[ \&
+.    ds #] \&
+.\}
+.    \" simple accents for nroff and troff
+.if n \{\
+.    ds ' \&
+.    ds ` \&
+.    ds ^ \&
+.    ds , \&
+.    ds ~ ~
+.    ds /
+.\}
+.if t \{\
+.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
+.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
+.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
+.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
+.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
+.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
+.\}
+.    \" troff and (daisy-wheel) nroff accents
+.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
+.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
+.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
+.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
+.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
+.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
+.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
+.ds ae a\h'-(\w'a'u*4/10)'e
+.ds Ae A\h'-(\w'A'u*4/10)'E
+.    \" corrections for vroff
+.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
+.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
+.    \" for low resolution devices (crt and lpr)
+.if \n(.H>23 .if \n(.V>19 \
+\{\
+.    ds : e
+.    ds 8 ss
+.    ds o a
+.    ds d- d\h'-1'\(ga
+.    ds D- D\h'-1'\(hy
+.    ds th \o'bp'
+.    ds Th \o'LP'
+.    ds ae ae
+.    ds Ae AE
+.\}
+.rm #[ #] #H #V #F C
+.\" ========================================================================
+.\"
+.IX Title "EC_GROUP_copy 3"
+.TH EC_GROUP_copy 3 "2018-11-20" "1.0.2q" "OpenSSL"
+.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.if n .ad l
+.nh
+.SH "NAME"
+EC_GROUP_copy, EC_GROUP_dup, EC_GROUP_method_of, EC_GROUP_set_generator, EC_GROUP_get0_generator, EC_GROUP_get_order, EC_GROUP_get_cofactor, EC_GROUP_set_curve_name, EC_GROUP_get_curve_name, EC_GROUP_set_asn1_flag, EC_GROUP_get_asn1_flag, EC_GROUP_set_point_conversion_form, EC_GROUP_get_point_conversion_form, EC_GROUP_get0_seed, EC_GROUP_get_seed_len, EC_GROUP_set_seed, EC_GROUP_get_degree, EC_GROUP_check, EC_GROUP_check_discriminant, EC_GROUP_cmp, EC_GROUP_get_basis_type, EC_GROUP_get_trinomial_basis, EC_GROUP_get_pentanomial_basis \- Functions for manipulating EC_GROUP objects.
+.SH "SYNOPSIS"
+.IX Header "SYNOPSIS"
+.Vb 2
+\& #include <openssl/ec.h>
+\& #include <openssl/bn.h>
+\&
+\& int EC_GROUP_copy(EC_GROUP *dst, const EC_GROUP *src);
+\& EC_GROUP *EC_GROUP_dup(const EC_GROUP *src);
+\&
+\& const EC_METHOD *EC_GROUP_method_of(const EC_GROUP *group);
+\&
+\& int EC_GROUP_set_generator(EC_GROUP *group, const EC_POINT *generator, const BIGNUM *order, const BIGNUM *cofactor);
+\& const EC_POINT *EC_GROUP_get0_generator(const EC_GROUP *group);
+\&
+\& int EC_GROUP_get_order(const EC_GROUP *group, BIGNUM *order, BN_CTX *ctx);
+\& int EC_GROUP_get_cofactor(const EC_GROUP *group, BIGNUM *cofactor, BN_CTX *ctx);
+\&
+\& void EC_GROUP_set_curve_name(EC_GROUP *group, int nid);
+\& int EC_GROUP_get_curve_name(const EC_GROUP *group);
+\&
+\& void EC_GROUP_set_asn1_flag(EC_GROUP *group, int flag);
+\& int EC_GROUP_get_asn1_flag(const EC_GROUP *group);
+\&
+\& void EC_GROUP_set_point_conversion_form(EC_GROUP *group, point_conversion_form_t form);
+\& point_conversion_form_t EC_GROUP_get_point_conversion_form(const EC_GROUP *);
+\&
+\& unsigned char *EC_GROUP_get0_seed(const EC_GROUP *x);
+\& size_t EC_GROUP_get_seed_len(const EC_GROUP *);
+\& size_t EC_GROUP_set_seed(EC_GROUP *, const unsigned char *, size_t len);
+\&
+\& int EC_GROUP_get_degree(const EC_GROUP *group);
+\&
+\& int EC_GROUP_check(const EC_GROUP *group, BN_CTX *ctx);
+\&
+\& int EC_GROUP_check_discriminant(const EC_GROUP *group, BN_CTX *ctx);
+\&
+\& int EC_GROUP_cmp(const EC_GROUP *a, const EC_GROUP *b, BN_CTX *ctx);
+\&
+\& int EC_GROUP_get_basis_type(const EC_GROUP *);
+\& int EC_GROUP_get_trinomial_basis(const EC_GROUP *, unsigned int *k);
+\& int EC_GROUP_get_pentanomial_basis(const EC_GROUP *, unsigned int *k1, 
+\&        unsigned int *k2, unsigned int *k3);
+.Ve
+.SH "DESCRIPTION"
+.IX Header "DESCRIPTION"
+EC_GROUP_copy copies the curve \fBsrc\fR into \fBdst\fR. Both \fBsrc\fR and \fBdst\fR must use the same \s-1EC_METHOD.\s0
+.PP
+EC_GROUP_dup creates a new \s-1EC_GROUP\s0 object and copies the content from \fBsrc\fR to the newly created
+\&\s-1EC_GROUP\s0 object.
+.PP
+EC_GROUP_method_of obtains the \s-1EC_METHOD\s0 of \fBgroup\fR.
+.PP
+EC_GROUP_set_generator sets curve parameters that must be agreed by all participants using the curve. These
+parameters include the \fBgenerator\fR, the \fBorder\fR and the \fBcofactor\fR. The \fBgenerator\fR is a well defined point on the
+curve chosen for cryptographic operations. Integers used for point multiplications will be between 0 and
+n\-1 where n is the \fBorder\fR. The \fBorder\fR multiplied by the \fBcofactor\fR gives the number of points on the curve.
+.PP
+EC_GROUP_get0_generator returns the generator for the identified \fBgroup\fR.
+.PP
+The functions EC_GROUP_get_order and EC_GROUP_get_cofactor populate the provided \fBorder\fR and \fBcofactor\fR parameters
+with the respective order and cofactors for the \fBgroup\fR.
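+.PP
+For example, the order and cofactor of a builtin curve can be retrieved as
+follows (a minimal sketch; error checking omitted):
+.PP
+.Vb 15
+\& #include <openssl/ec.h>
+\& #include <openssl/bn.h>
+\& #include <openssl/obj_mac.h>
+\&
+\& EC_GROUP *group = EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1);
+\& BN_CTX *ctx = BN_CTX_new();
+\& BIGNUM *order = BN_new();
+\& BIGNUM *cofactor = BN_new();
+\& EC_GROUP_get_order(group, order, ctx);
+\& EC_GROUP_get_cofactor(group, cofactor, ctx);
+\& /* ... use order and cofactor ... */
+\& BN_free(cofactor);
+\& BN_free(order);
+\& BN_CTX_free(ctx);
+\& EC_GROUP_free(group);
+.Ve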
+.PP
+The functions EC_GROUP_set_curve_name and EC_GROUP_get_curve_name, set and get the \s-1NID\s0 for the curve respectively
+(see \fIEC_GROUP_new\fR\|(3)). If a curve does not have a \s-1NID\s0 associated with it, then EC_GROUP_get_curve_name
+will return 0.
+.PP
+The asn1_flag value on a curve is used to determine whether there is a specific \s-1ASN1 OID\s0 to describe the curve or not.
+If the asn1_flag is 1 then this is a named curve with an associated \s-1ASN1 OID.\s0 If not then asn1_flag is 0. The functions
+EC_GROUP_get_asn1_flag and EC_GROUP_set_asn1_flag get and set the status of the asn1_flag for the curve. If set then
+the curve_name must also be set.
+.PP
+The point_conversion_form for a curve controls how \s-1EC_POINT\s0 data is encoded as \s-1ASN1\s0 as defined in X9.62 (\s-1ECDSA\s0).
+point_conversion_form_t is an enum defined as follows:
+.PP
+.Vb 10
+\& typedef enum {
+\&        /** the point is encoded as z||x, where the octet z specifies 
+\&         *   which solution of the quadratic equation y is  */
+\&        POINT_CONVERSION_COMPRESSED = 2,
+\&        /** the point is encoded as z||x||y, where z is the octet 0x04  */
+\&        POINT_CONVERSION_UNCOMPRESSED = 4,
+\&        /** the point is encoded as z||x||y, where the octet z specifies
+\&         *  which solution of the quadratic equation y is  */
+\&        POINT_CONVERSION_HYBRID = 6
+\& } point_conversion_form_t;
+.Ve
+.PP
+For \s-1POINT_CONVERSION_UNCOMPRESSED\s0 the point is encoded as an octet signifying the \s-1UNCOMPRESSED\s0 form has been used followed by
+the octets for x, followed by the octets for y.
+.PP
+For any given x co-ordinate for a point on a curve it is possible to derive two possible y values. For
+\&\s-1POINT_CONVERSION_COMPRESSED\s0 the point is encoded as an octet signifying that the \s-1COMPRESSED\s0 form has been used \s-1AND\s0 which of
+the two possible solutions for y has been used, followed by the octets for x.
+.PP
+For \s-1POINT_CONVERSION_HYBRID\s0 the point is encoded as an octet signifying the \s-1HYBRID\s0 form has been used \s-1AND\s0 which of the two
+possible solutions for y has been used, followed by the octets for x, followed by the octets for y.
+.PP
+The functions EC_GROUP_set_point_conversion_form and EC_GROUP_get_point_conversion_form set and get the point_conversion_form
+for the curve respectively.
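+.PP
+For example, compressed point encoding can be selected for a curve as follows
+(a minimal sketch):
+.PP
+.Vb 8
+\& #include <openssl/ec.h>
+\& #include <openssl/obj_mac.h>
+\&
+\& EC_GROUP *group = EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1);
+\& EC_GROUP_set_point_conversion_form(group, POINT_CONVERSION_COMPRESSED);
+\& /* subsequent encodings of points on this curve use the compressed form */
+\& point_conversion_form_t form = EC_GROUP_get_point_conversion_form(group);
+\& EC_GROUP_free(group);
+.Ve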
+.PP
+\&\s-1ANSI X9.62\s0 (\s-1ECDSA\s0 standard) defines a method of generating the curve parameter b from a random number. This provides advantages
+in that a parameter obtained in this way is highly unlikely to be susceptible to special purpose attacks, or have any trapdoors in it.
+If the seed is present for a curve then the b parameter was generated in a verifiable fashion using that seed. The OpenSSL \s-1EC\s0 library
+does not use this seed value but does enable you to inspect it using EC_GROUP_get0_seed. This returns a pointer to a memory block
+containing the seed that was used. The length of the memory block can be obtained using EC_GROUP_get_seed_len. A number of the
+builtin curves within the library provide seed values that can be obtained. It is also possible to set a custom seed using
+EC_GROUP_set_seed and passing a pointer to a memory block, along with the length of the seed. Again, the \s-1EC\s0 library will not use
+this seed value, although it will be preserved in any \s-1ASN1\s0 based communications.
+.PP
+EC_GROUP_get_degree gets the degree of the field. For Fp fields this will be the number of bits in p.  For F2^m fields this will be
+the value m.
+.PP
+The function EC_GROUP_check_discriminant calculates the discriminant for the curve and verifies that it is valid.
+For a curve defined over Fp the discriminant is given by the formula 4*a^3 + 27*b^2 whilst for F2^m curves the discriminant is
+simply b. In either case for the curve to be valid the discriminant must be non zero.
+.PP
+The function EC_GROUP_check performs a number of checks on a curve to verify that it is valid. Checks performed include
+verifying that the discriminant is non zero; that a generator has been defined; that the generator is on the curve and has
+the correct order.
+.PP
+EC_GROUP_cmp compares \fBa\fR and \fBb\fR to determine whether they represent the same curve or not.
+.PP
+The functions EC_GROUP_get_basis_type, EC_GROUP_get_trinomial_basis and EC_GROUP_get_pentanomial_basis should only be called for curves
+defined over an F2^m field. Addition and multiplication operations within an F2^m field are performed using an irreducible polynomial
+function f(x). This function is either a trinomial of the form:
+.PP
+f(x) = x^m + x^k + 1 with m > k >= 1
+.PP
+or a pentanomial of the form:
+.PP
+f(x) = x^m + x^k3 + x^k2 + x^k1 + 1 with m > k3 > k2 > k1 >= 1
+.PP
+The function EC_GROUP_get_basis_type returns a \s-1NID\s0 identifying whether a trinomial or pentanomial is in use for the field. The
+function EC_GROUP_get_trinomial_basis must only be called where f(x) is of the trinomial form, and returns the value of \fBk\fR. Similarly
+the function EC_GROUP_get_pentanomial_basis must only be called where f(x) is of the pentanomial form, and returns the values of \fBk1\fR,
+\&\fBk2\fR and \fBk3\fR respectively.
+.SH "RETURN VALUES"
+.IX Header "RETURN VALUES"
+The following functions return 1 on success or 0 on error: EC_GROUP_copy, EC_GROUP_set_generator, EC_GROUP_check,
+EC_GROUP_check_discriminant, EC_GROUP_get_trinomial_basis and EC_GROUP_get_pentanomial_basis.
+.PP
+EC_GROUP_dup returns a pointer to the duplicated curve, or \s-1NULL\s0 on error.
+.PP
+EC_GROUP_method_of returns the \s-1EC_METHOD\s0 implementation in use for the given curve or \s-1NULL\s0 on error.
+.PP
+EC_GROUP_get0_generator returns the generator for the given curve or \s-1NULL\s0 on error.
+.PP
+EC_GROUP_get_order, EC_GROUP_get_cofactor, EC_GROUP_get_curve_name, EC_GROUP_get_asn1_flag, EC_GROUP_get_point_conversion_form
+and EC_GROUP_get_degree return the order, cofactor, curve name (\s-1NID\s0), \s-1ASN1\s0 flag, point_conversion_form and degree for the
+specified curve respectively. If there is no curve name associated with a curve then EC_GROUP_get_curve_name will return 0.
+.PP
+EC_GROUP_get0_seed returns a pointer to the seed that was used to generate the parameter b, or \s-1NULL\s0 if the seed is not
+specified. EC_GROUP_get_seed_len returns the length of the seed or 0 if the seed is not specified.
+.PP
+EC_GROUP_set_seed returns the length of the seed that has been set. If the supplied seed is \s-1NULL,\s0 or the supplied seed length is
+0, the return value will be 1. On error 0 is returned.
+.PP
+EC_GROUP_cmp returns 0 if the curves are equal, 1 if they are not equal, or \-1 on error.
+.PP
+EC_GROUP_get_basis_type returns the values NID_X9_62_tpBasis or NID_X9_62_ppBasis (as defined in <openssl/obj_mac.h>) for a
+trinomial or pentanomial respectively. Alternatively in the event of an error a 0 is returned.
+.SH "SEE ALSO"
+.IX Header "SEE ALSO"
+\&\fIcrypto\fR\|(3), \fIec\fR\|(3), \fIEC_GROUP_new\fR\|(3),
+\&\fIEC_POINT_new\fR\|(3), \fIEC_POINT_add\fR\|(3), \fIEC_KEY_new\fR\|(3),
+\&\fIEC_GFp_simple_method\fR\|(3), \fId2i_ECPKParameters\fR\|(3)


Property changes on: trunk/secure/lib/libcrypto/man/EC_GROUP_copy.3
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/man/EC_GROUP_new.3
===================================================================
--- trunk/secure/lib/libcrypto/man/EC_GROUP_new.3	                        (rev 0)
+++ trunk/secure/lib/libcrypto/man/EC_GROUP_new.3	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,227 @@
+.\" $MidnightBSD$
+.\" Automatically generated by Pod::Man 4.09 (Pod::Simple 3.35)
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" Set up some character translations and predefined strings.  \*(-- will
+.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
+.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
+.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
+.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
+.\" nothing in troff, for use with C<>.
+.tr \(*W-
+.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
+.ie n \{\
+.    ds -- \(*W-
+.    ds PI pi
+.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
+.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
+.    ds L" ""
+.    ds R" ""
+.    ds C` ""
+.    ds C' ""
+'br\}
+.el\{\
+.    ds -- \|\(em\|
+.    ds PI \(*p
+.    ds L" ``
+.    ds R" ''
+.    ds C`
+.    ds C'
+'br\}
+.\"
+.\" Escape single quotes in literal strings from groff's Unicode transform.
+.ie \n(.g .ds Aq \(aq
+.el       .ds Aq '
+.\"
+.\" If the F register is >0, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
+.\" entries marked with X<> in POD.  Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.\"
+.\" Avoid warning from groff about undefined register 'F'.
+.de IX
+..
+.if !\nF .nr F 0
+.if \nF>0 \{\
+.    de IX
+.    tm Index:\\$1\t\\n%\t"\\$2"
+..
+.    if !\nF==2 \{\
+.        nr % 0
+.        nr F 2
+.    \}
+.\}
+.\"
+.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
+.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
+.    \" fudge factors for nroff and troff
+.if n \{\
+.    ds #H 0
+.    ds #V .8m
+.    ds #F .3m
+.    ds #[ \f1
+.    ds #] \fP
+.\}
+.if t \{\
+.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
+.    ds #V .6m
+.    ds #F 0
+.    ds #[ \&
+.    ds #] \&
+.\}
+.    \" simple accents for nroff and troff
+.if n \{\
+.    ds ' \&
+.    ds ` \&
+.    ds ^ \&
+.    ds , \&
+.    ds ~ ~
+.    ds /
+.\}
+.if t \{\
+.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
+.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
+.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
+.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
+.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
+.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
+.\}
+.    \" troff and (daisy-wheel) nroff accents
+.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
+.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
+.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
+.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
+.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
+.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
+.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
+.ds ae a\h'-(\w'a'u*4/10)'e
+.ds Ae A\h'-(\w'A'u*4/10)'E
+.    \" corrections for vroff
+.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
+.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
+.    \" for low resolution devices (crt and lpr)
+.if \n(.H>23 .if \n(.V>19 \
+\{\
+.    ds : e
+.    ds 8 ss
+.    ds o a
+.    ds d- d\h'-1'\(ga
+.    ds D- D\h'-1'\(hy
+.    ds th \o'bp'
+.    ds Th \o'LP'
+.    ds ae ae
+.    ds Ae AE
+.\}
+.rm #[ #] #H #V #F C
+.\" ========================================================================
+.\"
+.IX Title "EC_GROUP_new 3"
+.TH EC_GROUP_new 3 "2018-11-20" "1.0.2q" "OpenSSL"
+.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.if n .ad l
+.nh
+.SH "NAME"
+EC_GROUP_new, EC_GROUP_free, EC_GROUP_clear_free, EC_GROUP_new_curve_GFp, EC_GROUP_new_curve_GF2m, EC_GROUP_new_by_curve_name, EC_GROUP_set_curve_GFp, EC_GROUP_get_curve_GFp, EC_GROUP_set_curve_GF2m, EC_GROUP_get_curve_GF2m, EC_get_builtin_curves \- Functions for creating and destroying EC_GROUP objects.
+.SH "SYNOPSIS"
+.IX Header "SYNOPSIS"
+.Vb 2
+\& #include <openssl/ec.h>
+\& #include <openssl/bn.h>
+\&
+\& EC_GROUP *EC_GROUP_new(const EC_METHOD *meth);
+\& void EC_GROUP_free(EC_GROUP *group);
+\& void EC_GROUP_clear_free(EC_GROUP *group);
+\&
+\& EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
+\& EC_GROUP *EC_GROUP_new_curve_GF2m(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
+\& EC_GROUP *EC_GROUP_new_by_curve_name(int nid);
+\&
+\& int EC_GROUP_set_curve_GFp(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
+\& int EC_GROUP_get_curve_GFp(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM *b, BN_CTX *ctx);
+\& int EC_GROUP_set_curve_GF2m(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
+\& int EC_GROUP_get_curve_GF2m(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM *b, BN_CTX *ctx);
+\&
+\& size_t EC_get_builtin_curves(EC_builtin_curve *r, size_t nitems);
+.Ve
+.SH "DESCRIPTION"
+.IX Header "DESCRIPTION"
+Within the library there are two forms of elliptic curve that are of interest. The first form is those defined over the
+prime field Fp. The elements of Fp are the integers 0 to p\-1, where p is a prime number. This gives us a revised
+elliptic curve equation as follows:
+.PP
+y^2 mod p = x^3 + ax + b mod p
+.PP
+The second form is those defined over a binary field F2^m where the elements of the field are integers of length at
+most m bits. For this form the elliptic curve equation is modified to:
+.PP
+y^2 + xy = x^3 + ax^2 + b (where b != 0)
+.PP
+Operations in a binary field are performed relative to an \fBirreducible polynomial\fR. All such curves with OpenSSL
+use a trinomial or a pentanomial for this parameter.
+.PP
+A new curve can be constructed by calling EC_GROUP_new, using the implementation provided by \fBmeth\fR (see
+\&\fIEC_GFp_simple_method\fR\|(3)). It is then necessary to call either EC_GROUP_set_curve_GFp or
+EC_GROUP_set_curve_GF2m as appropriate to create a curve defined over Fp or over F2^m respectively.
+.PP
+EC_GROUP_set_curve_GFp sets the curve parameters \fBp\fR, \fBa\fR and \fBb\fR for a curve over Fp stored in \fBgroup\fR.
+EC_GROUP_get_curve_GFp obtains the previously set curve parameters.
+.PP
+EC_GROUP_set_curve_GF2m sets the equivalent curve parameters for a curve over F2^m. In this case \fBp\fR represents
+the irreducible polynomial \- each bit represents a term in the polynomial. Therefore there will either be three
+or five bits set dependent on whether the polynomial is a trinomial or a pentanomial.
+EC_GROUP_get_curve_GF2m obtains the previously set curve parameters.
+.PP
+The functions EC_GROUP_new_curve_GFp and EC_GROUP_new_curve_GF2m are shortcuts for calling EC_GROUP_new and the
+appropriate EC_GROUP_set_curve function. An appropriate default implementation method will be used.
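+.PP
+For example, the parameters of a builtin curve can be read back with
+EC_GROUP_get_curve_GFp and used to construct another curve over the same prime
+field (a minimal sketch; error checking is omitted, and the new group would
+still need a generator set via EC_GROUP_set_generator before use):
+.PP
+.Vb 14
+\& #include <openssl/ec.h>
+\& #include <openssl/bn.h>
+\& #include <openssl/obj_mac.h>
+\&
+\& EC_GROUP *named = EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1);
+\& BIGNUM *p = BN_new(), *a = BN_new(), *b = BN_new();
+\& BN_CTX *ctx = BN_CTX_new();
+\& EC_GROUP_get_curve_GFp(named, p, a, b, ctx);
+\& EC_GROUP *curve = EC_GROUP_new_curve_GFp(p, a, b, ctx);
+\& /* ... */
+\& EC_GROUP_free(curve);
+\& EC_GROUP_free(named);
+\& BN_free(b); BN_free(a); BN_free(p);
+\& BN_CTX_free(ctx);
+.Ve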
+.PP
+Whilst the library can be used to create any curve using the functions described above, there are also a number of
+predefined curves that are available. In order to obtain a list of all of the predefined curves, call the function
+EC_get_builtin_curves. The parameter \fBr\fR should be an array of EC_builtin_curve structures of size \fBnitems\fR. The function
+will populate the \fBr\fR array with information about the builtin curves. If \fBnitems\fR is less than the total number of
+curves available, then the first \fBnitems\fR curves will be returned. Otherwise the total number of curves will be
+provided. The return value is the total number of curves available (whether that number has been populated in \fBr\fR or
+not). Passing a \s-1NULL\s0 \fBr\fR, or setting \fBnitems\fR to 0 will do nothing other than return the total number of curves available.
+The EC_builtin_curve structure is defined as follows:
+.PP
+.Vb 4
+\& typedef struct { 
+\&        int nid;
+\&        const char *comment;
+\&        } EC_builtin_curve;
+.Ve
+.PP
+Each EC_builtin_curve item has a unique integer id (\fBnid\fR), and a human readable comment string describing the curve.
+.PP
+In order to construct a builtin curve use the function EC_GROUP_new_by_curve_name and provide the \fBnid\fR of the curve to
+be constructed.
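+.PP
+For example, the builtin curves can be enumerated and one of them constructed
+by name (a minimal sketch; error checking omitted):
+.PP
+.Vb 14
+\& #include <stdio.h>
+\& #include <openssl/ec.h>
+\& #include <openssl/objects.h>
+\& #include <openssl/crypto.h>
+\&
+\& size_t i, n = EC_get_builtin_curves(NULL, 0);
+\& EC_builtin_curve *curves = OPENSSL_malloc(n * sizeof(EC_builtin_curve));
+\& EC_get_builtin_curves(curves, n);
+\& for (i = 0; i < n; i++)
+\&     printf("%s: %s\en", OBJ_nid2sn(curves[i].nid), curves[i].comment);
+\&
+\& EC_GROUP *group = EC_GROUP_new_by_curve_name(curves[0].nid);
+\& EC_GROUP_free(group);
+\& OPENSSL_free(curves);
+.Ve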
+.PP
+EC_GROUP_free frees the memory associated with the \s-1EC_GROUP.\s0
+.PP
+EC_GROUP_clear_free destroys any sensitive data held within the \s-1EC_GROUP\s0 and then frees its memory.
+.SH "RETURN VALUES"
+.IX Header "RETURN VALUES"
+All EC_GROUP_new* functions return a pointer to the newly constructed group, or \s-1NULL\s0 on error.
+.PP
+EC_get_builtin_curves returns the number of builtin curves that are available.
+.PP
+EC_GROUP_set_curve_GFp, EC_GROUP_get_curve_GFp, EC_GROUP_set_curve_GF2m, EC_GROUP_get_curve_GF2m return 1 on success or 0 on error.
+.SH "SEE ALSO"
+.IX Header "SEE ALSO"
+\&\fIcrypto\fR\|(3), \fIec\fR\|(3), \fIEC_GROUP_copy\fR\|(3),
+\&\fIEC_POINT_new\fR\|(3), \fIEC_POINT_add\fR\|(3), \fIEC_KEY_new\fR\|(3),
+\&\fIEC_GFp_simple_method\fR\|(3), \fId2i_ECPKParameters\fR\|(3)


Property changes on: trunk/secure/lib/libcrypto/man/EC_GROUP_new.3
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/man/EC_KEY_new.3
===================================================================
--- trunk/secure/lib/libcrypto/man/EC_KEY_new.3	                        (rev 0)
+++ trunk/secure/lib/libcrypto/man/EC_KEY_new.3	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,236 @@
+.\" $MidnightBSD$
+.\" Automatically generated by Pod::Man 4.09 (Pod::Simple 3.35)
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" Set up some character translations and predefined strings.  \*(-- will
+.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
+.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
+.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
+.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
+.\" nothing in troff, for use with C<>.
+.tr \(*W-
+.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
+.ie n \{\
+.    ds -- \(*W-
+.    ds PI pi
+.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
+.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
+.    ds L" ""
+.    ds R" ""
+.    ds C` ""
+.    ds C' ""
+'br\}
+.el\{\
+.    ds -- \|\(em\|
+.    ds PI \(*p
+.    ds L" ``
+.    ds R" ''
+.    ds C`
+.    ds C'
+'br\}
+.\"
+.\" Escape single quotes in literal strings from groff's Unicode transform.
+.ie \n(.g .ds Aq \(aq
+.el       .ds Aq '
+.\"
+.\" If the F register is >0, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
+.\" entries marked with X<> in POD.  Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.\"
+.\" Avoid warning from groff about undefined register 'F'.
+.de IX
+..
+.if !\nF .nr F 0
+.if \nF>0 \{\
+.    de IX
+.    tm Index:\\$1\t\\n%\t"\\$2"
+..
+.    if !\nF==2 \{\
+.        nr % 0
+.        nr F 2
+.    \}
+.\}
+.\"
+.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
+.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
+.    \" fudge factors for nroff and troff
+.if n \{\
+.    ds #H 0
+.    ds #V .8m
+.    ds #F .3m
+.    ds #[ \f1
+.    ds #] \fP
+.\}
+.if t \{\
+.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
+.    ds #V .6m
+.    ds #F 0
+.    ds #[ \&
+.    ds #] \&
+.\}
+.    \" simple accents for nroff and troff
+.if n \{\
+.    ds ' \&
+.    ds ` \&
+.    ds ^ \&
+.    ds , \&
+.    ds ~ ~
+.    ds /
+.\}
+.if t \{\
+.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
+.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
+.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
+.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
+.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
+.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
+.\}
+.    \" troff and (daisy-wheel) nroff accents
+.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
+.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
+.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
+.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
+.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
+.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
+.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
+.ds ae a\h'-(\w'a'u*4/10)'e
+.ds Ae A\h'-(\w'A'u*4/10)'E
+.    \" corrections for vroff
+.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
+.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
+.    \" for low resolution devices (crt and lpr)
+.if \n(.H>23 .if \n(.V>19 \
+\{\
+.    ds : e
+.    ds 8 ss
+.    ds o a
+.    ds d- d\h'-1'\(ga
+.    ds D- D\h'-1'\(hy
+.    ds th \o'bp'
+.    ds Th \o'LP'
+.    ds ae ae
+.    ds Ae AE
+.\}
+.rm #[ #] #H #V #F C
+.\" ========================================================================
+.\"
+.IX Title "EC_KEY_new 3"
+.TH EC_KEY_new 3 "2018-11-20" "1.0.2q" "OpenSSL"
+.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.if n .ad l
+.nh
+.SH "NAME"
+EC_KEY_new, EC_KEY_get_flags, EC_KEY_set_flags, EC_KEY_clear_flags, EC_KEY_new_by_curve_name, EC_KEY_free, EC_KEY_copy, EC_KEY_dup, EC_KEY_up_ref, EC_KEY_get0_group, EC_KEY_set_group, EC_KEY_get0_private_key, EC_KEY_set_private_key, EC_KEY_get0_public_key, EC_KEY_set_public_key, EC_KEY_get_enc_flags, EC_KEY_set_enc_flags, EC_KEY_get_conv_form, EC_KEY_set_conv_form, EC_KEY_get_key_method_data, EC_KEY_insert_key_method_data, EC_KEY_set_asn1_flag, EC_KEY_precompute_mult, EC_KEY_generate_key, EC_KEY_check_key, EC_KEY_set_public_key_affine_coordinates \- Functions for creating, destroying and manipulating EC_KEY objects.
+.SH "SYNOPSIS"
+.IX Header "SYNOPSIS"
+.Vb 2
+\& #include <openssl/ec.h>
+\& #include <openssl/bn.h>
+\&
+\& EC_KEY *EC_KEY_new(void);
+\& int EC_KEY_get_flags(const EC_KEY *key);
+\& void EC_KEY_set_flags(EC_KEY *key, int flags);
+\& void EC_KEY_clear_flags(EC_KEY *key, int flags);
+\& EC_KEY *EC_KEY_new_by_curve_name(int nid);
+\& void EC_KEY_free(EC_KEY *key);
+\& EC_KEY *EC_KEY_copy(EC_KEY *dst, const EC_KEY *src);
+\& EC_KEY *EC_KEY_dup(const EC_KEY *src);
+\& int EC_KEY_up_ref(EC_KEY *key);
+\& const EC_GROUP *EC_KEY_get0_group(const EC_KEY *key);
+\& int EC_KEY_set_group(EC_KEY *key, const EC_GROUP *group);
+\& const BIGNUM *EC_KEY_get0_private_key(const EC_KEY *key);
+\& int EC_KEY_set_private_key(EC_KEY *key, const BIGNUM *prv);
+\& const EC_POINT *EC_KEY_get0_public_key(const EC_KEY *key);
+\& int EC_KEY_set_public_key(EC_KEY *key, const EC_POINT *pub);
+\& point_conversion_form_t EC_KEY_get_conv_form(const EC_KEY *key);
+\& void EC_KEY_set_conv_form(EC_KEY *eckey, point_conversion_form_t cform);
+\& void *EC_KEY_get_key_method_data(EC_KEY *key, 
+\&        void *(*dup_func)(void *), void (*free_func)(void *), void (*clear_free_func)(void *));
+\& void EC_KEY_insert_key_method_data(EC_KEY *key, void *data,
+\&        void *(*dup_func)(void *), void (*free_func)(void *), void (*clear_free_func)(void *));
+\& void EC_KEY_set_asn1_flag(EC_KEY *eckey, int asn1_flag);
+\& int EC_KEY_precompute_mult(EC_KEY *key, BN_CTX *ctx);
+\& int EC_KEY_generate_key(EC_KEY *key);
+\& int EC_KEY_check_key(const EC_KEY *key);
+\& int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x, BIGNUM *y);
+.Ve
+.SH "DESCRIPTION"
+.IX Header "DESCRIPTION"
+An \s-1EC_KEY\s0 represents a public key and (optionally) an associated private key. A new \s-1EC_KEY\s0 (with no associated curve) can be constructed by calling EC_KEY_new.
+The reference count for the newly created \s-1EC_KEY\s0 is initially set to 1. A curve can be associated with the \s-1EC_KEY\s0 by calling
+EC_KEY_set_group.
+.PP
+Alternatively a new \s-1EC_KEY\s0 can be constructed by calling EC_KEY_new_by_curve_name and supplying the nid of the associated curve. Refer to \fIEC_GROUP_new\fR\|(3) for a description of curve names. This function simply wraps calls to EC_KEY_new and 
+EC_GROUP_new_by_curve_name.
+.PP
+Calling EC_KEY_free decrements the reference count for the \s-1EC_KEY\s0 object, and if it has dropped to zero then frees the memory associated
+with it.
+.PP
+EC_KEY_copy copies the contents of the \s-1EC_KEY\s0 in \fBsrc\fR into \fBdst\fR.
+.PP
+EC_KEY_dup creates a new \s-1EC_KEY\s0 object and copies the contents of \fBsrc\fR into it.
+.PP
+EC_KEY_up_ref increments the reference count associated with the \s-1EC_KEY\s0 object.
+.PP
+EC_KEY_generate_key generates a new public and private key for the supplied \fBeckey\fR object. \fBeckey\fR must have an \s-1EC_GROUP\s0 object
+associated with it before calling this function. The private key is a random integer (0 < priv_key < order, where order is the order
+of the \s-1EC_GROUP\s0 object). The public key is an \s-1EC_POINT\s0 on the curve calculated by multiplying the generator for the curve by the
+private key.
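+.PP
+For example, a fresh key pair on a named curve can be generated as follows (a
+minimal sketch; error checking omitted):
+.PP
+.Vb 9
+\& #include <stdio.h>
+\& #include <openssl/ec.h>
+\& #include <openssl/obj_mac.h>
+\&
+\& EC_KEY *key = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1);
+\& EC_KEY_generate_key(key);
+\& if (!EC_KEY_check_key(key))
+\&     fprintf(stderr, "generated key failed validation\en");
+\& EC_KEY_free(key);
+.Ve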
+.PP
+EC_KEY_check_key performs various sanity checks on the \s-1EC_KEY\s0 object to confirm that it is valid.
+.PP
+EC_KEY_set_public_key_affine_coordinates sets the public key for \fBkey\fR based on its affine co-ordinates, i.e. it constructs an \s-1EC_POINT\s0
+object based on the supplied \fBx\fR and \fBy\fR values and sets the public key to be this \s-1EC_POINT.\s0 It will also perform certain sanity checks
+on the key to confirm that it is valid.
+.PP
+The functions EC_KEY_get0_group, EC_KEY_set_group, EC_KEY_get0_private_key, EC_KEY_set_private_key, EC_KEY_get0_public_key, and EC_KEY_set_public_key get and set the \s-1EC_GROUP\s0 object, the private key and the \s-1EC_POINT\s0 public key for the \fBkey\fR respectively.
+.PP
+The functions EC_KEY_get_conv_form and EC_KEY_set_conv_form get and set the point_conversion_form for the \fBkey\fR. For a description
+of point_conversion_forms please refer to \fIEC_POINT_new\fR\|(3).
+.PP
+EC_KEY_insert_key_method_data and EC_KEY_get_key_method_data enable the caller to associate arbitrary additional data specific to the
+elliptic curve scheme being used with the \s-1EC_KEY\s0 object. This data is treated as a \*(L"black box\*(R" by the ec library. The data to be stored by EC_KEY_insert_key_method_data is provided in the \fBdata\fR parameter, which must have associated functions for duplicating, freeing and \*(L"clear_freeing\*(R" the data item. If a subsequent EC_KEY_get_key_method_data call is issued, the functions for duplicating, freeing and \*(L"clear_freeing\*(R" the data item must be provided again, and they must be the same as they were when the data item was inserted.
+.PP
+EC_KEY_set_flags sets the flags in the \fBflags\fR parameter on the \s-1EC_KEY\s0 object. Any flags that are already set are left set. The currently defined standard flags are \s-1EC_FLAG_NON_FIPS_ALLOW\s0 and \s-1EC_FLAG_FIPS_CHECKED.\s0 In addition there is the flag \s-1EC_FLAG_COFACTOR_ECDH\s0 which is specific to \s-1ECDH\s0 and is defined in ecdh.h. EC_KEY_get_flags returns the current flags that are set for this \s-1EC_KEY.\s0 EC_KEY_clear_flags clears the flags indicated by the \fBflags\fR parameter. All other flags are left in their existing state.
+.PP
+EC_KEY_set_asn1_flag sets the asn1_flag on the underlying \s-1EC_GROUP\s0 object (if set). Refer to \fIEC_GROUP_copy\fR\|(3) for further information on the asn1_flag.
+.PP
+EC_KEY_precompute_mult stores multiples of the underlying \s-1EC_GROUP\s0 generator for faster point multiplication. See also \fIEC_POINT_add\fR\|(3).
+.SH "RETURN VALUES"
+.IX Header "RETURN VALUES"
+EC_KEY_new, EC_KEY_new_by_curve_name and EC_KEY_dup return a pointer to the newly created \s-1EC_KEY\s0 object, or \s-1NULL\s0 on error.
+.PP
+EC_KEY_get_flags returns the flags associated with the \s-1EC_KEY\s0 object as an integer.
+.PP
+EC_KEY_copy returns a pointer to the destination key, or \s-1NULL\s0 on error.
+.PP
+EC_KEY_up_ref, EC_KEY_set_group, EC_KEY_set_private_key, EC_KEY_set_public_key, EC_KEY_precompute_mult, EC_KEY_generate_key, EC_KEY_check_key and EC_KEY_set_public_key_affine_coordinates return 1 on success or 0 on error.
+.PP
+EC_KEY_get0_group returns the \s-1EC_GROUP\s0 associated with the \s-1EC_KEY.\s0
+.PP
+EC_KEY_get0_private_key returns the private key associated with the \s-1EC_KEY.\s0
+.PP
+EC_KEY_get_conv_form returns the point_conversion_form for the \s-1EC_KEY.\s0
+.SH "SEE ALSO"
+.IX Header "SEE ALSO"
+\&\fIcrypto\fR\|(3), \fIec\fR\|(3), \fIEC_GROUP_new\fR\|(3),
+\&\fIEC_GROUP_copy\fR\|(3), \fIEC_POINT_new\fR\|(3),
+\&\fIEC_POINT_add\fR\|(3),
+\&\fIEC_GFp_simple_method\fR\|(3),
+\&\fId2i_ECPKParameters\fR\|(3)


Property changes on: trunk/secure/lib/libcrypto/man/EC_KEY_new.3
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/man/EC_POINT_add.3
===================================================================
--- trunk/secure/lib/libcrypto/man/EC_POINT_add.3	                        (rev 0)
+++ trunk/secure/lib/libcrypto/man/EC_POINT_add.3	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,200 @@
+.\" $MidnightBSD$
+.\" Automatically generated by Pod::Man 4.09 (Pod::Simple 3.35)
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" Set up some character translations and predefined strings.  \*(-- will
+.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
+.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
+.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
+.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
+.\" nothing in troff, for use with C<>.
+.tr \(*W-
+.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
+.ie n \{\
+.    ds -- \(*W-
+.    ds PI pi
+.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
+.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
+.    ds L" ""
+.    ds R" ""
+.    ds C` ""
+.    ds C' ""
+'br\}
+.el\{\
+.    ds -- \|\(em\|
+.    ds PI \(*p
+.    ds L" ``
+.    ds R" ''
+.    ds C`
+.    ds C'
+'br\}
+.\"
+.\" Escape single quotes in literal strings from groff's Unicode transform.
+.ie \n(.g .ds Aq \(aq
+.el       .ds Aq '
+.\"
+.\" If the F register is >0, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
+.\" entries marked with X<> in POD.  Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.\"
+.\" Avoid warning from groff about undefined register 'F'.
+.de IX
+..
+.if !\nF .nr F 0
+.if \nF>0 \{\
+.    de IX
+.    tm Index:\\$1\t\\n%\t"\\$2"
+..
+.    if !\nF==2 \{\
+.        nr % 0
+.        nr F 2
+.    \}
+.\}
+.\"
+.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
+.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
+.    \" fudge factors for nroff and troff
+.if n \{\
+.    ds #H 0
+.    ds #V .8m
+.    ds #F .3m
+.    ds #[ \f1
+.    ds #] \fP
+.\}
+.if t \{\
+.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
+.    ds #V .6m
+.    ds #F 0
+.    ds #[ \&
+.    ds #] \&
+.\}
+.    \" simple accents for nroff and troff
+.if n \{\
+.    ds ' \&
+.    ds ` \&
+.    ds ^ \&
+.    ds , \&
+.    ds ~ ~
+.    ds /
+.\}
+.if t \{\
+.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
+.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
+.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
+.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
+.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
+.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
+.\}
+.    \" troff and (daisy-wheel) nroff accents
+.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
+.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
+.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
+.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
+.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
+.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
+.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
+.ds ae a\h'-(\w'a'u*4/10)'e
+.ds Ae A\h'-(\w'A'u*4/10)'E
+.    \" corrections for vroff
+.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
+.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
+.    \" for low resolution devices (crt and lpr)
+.if \n(.H>23 .if \n(.V>19 \
+\{\
+.    ds : e
+.    ds 8 ss
+.    ds o a
+.    ds d- d\h'-1'\(ga
+.    ds D- D\h'-1'\(hy
+.    ds th \o'bp'
+.    ds Th \o'LP'
+.    ds ae ae
+.    ds Ae AE
+.\}
+.rm #[ #] #H #V #F C
+.\" ========================================================================
+.\"
+.IX Title "EC_POINT_add 3"
+.TH EC_POINT_add 3 "2018-11-20" "1.0.2q" "OpenSSL"
+.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.if n .ad l
+.nh
+.SH "NAME"
+EC_POINT_add, EC_POINT_dbl, EC_POINT_invert, EC_POINT_is_at_infinity, EC_POINT_is_on_curve, EC_POINT_cmp, EC_POINT_make_affine, EC_POINTs_make_affine, EC_POINTs_mul, EC_POINT_mul, EC_GROUP_precompute_mult, EC_GROUP_have_precompute_mult \- Functions for performing mathematical operations and tests on EC_POINT objects.
+.SH "SYNOPSIS"
+.IX Header "SYNOPSIS"
+.Vb 2
+\& #include <openssl/ec.h>
+\& #include <openssl/bn.h>
+\&
+\& int EC_POINT_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a, const EC_POINT *b, BN_CTX *ctx);
+\& int EC_POINT_dbl(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a, BN_CTX *ctx);
+\& int EC_POINT_invert(const EC_GROUP *group, EC_POINT *a, BN_CTX *ctx);
+\& int EC_POINT_is_at_infinity(const EC_GROUP *group, const EC_POINT *p);
+\& int EC_POINT_is_on_curve(const EC_GROUP *group, const EC_POINT *point, BN_CTX *ctx);
+\& int EC_POINT_cmp(const EC_GROUP *group, const EC_POINT *a, const EC_POINT *b, BN_CTX *ctx);
+\& int EC_POINT_make_affine(const EC_GROUP *group, EC_POINT *point, BN_CTX *ctx);
+\& int EC_POINTs_make_affine(const EC_GROUP *group, size_t num, EC_POINT *points[], BN_CTX *ctx);
+\& int EC_POINTs_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *n, size_t num, const EC_POINT *p[], const BIGNUM *m[], BN_CTX *ctx);
+\& int EC_POINT_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *n, const EC_POINT *q, const BIGNUM *m, BN_CTX *ctx);
+\& int EC_GROUP_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+\& int EC_GROUP_have_precompute_mult(const EC_GROUP *group);
+.Ve
+.SH "DESCRIPTION"
+.IX Header "DESCRIPTION"
+EC_POINT_add adds the two points \fBa\fR and \fBb\fR and places the result in \fBr\fR. Similarly EC_POINT_dbl doubles the point \fBa\fR and places the
+result in \fBr\fR. In both cases it is valid for \fBr\fR to be one of \fBa\fR or \fBb\fR.
+.PP
+EC_POINT_invert calculates the inverse of the supplied point \fBa\fR. The result is placed back in \fBa\fR.
+.PP
+The function EC_POINT_is_at_infinity tests whether the supplied point is at infinity or not.
+.PP
+EC_POINT_is_on_curve tests whether the supplied point is on the curve or not.
+.PP
+EC_POINT_cmp compares the two supplied points and tests whether or not they are equal.
+.PP
+The functions EC_POINT_make_affine and EC_POINTs_make_affine force the internal representation of the \s-1EC_POINT\s0(s) into the affine
+co-ordinate system. In the case of EC_POINTs_make_affine the value \fBnum\fR provides the number of points in the array \fBpoints\fR to be
+forced.
+.PP
+EC_POINT_mul calculates the value generator * \fBn\fR + \fBq\fR * \fBm\fR and stores the result in \fBr\fR. The value \fBn\fR may be \s-1NULL\s0 in which case the result is just \fBq\fR * \fBm\fR.
+.PP
+EC_POINTs_mul calculates the value generator * \fBn\fR + \fBq[0]\fR * \fBm[0]\fR + ... + \fBq[num\-1]\fR * \fBm[num\-1]\fR. As for EC_POINT_mul the value
+\&\fBn\fR may be \s-1NULL.\s0
+.PP
+The function EC_GROUP_precompute_mult stores multiples of the generator for faster point multiplication, whilst
+EC_GROUP_have_precompute_mult tests whether precomputation has already been done. See \fIEC_GROUP_copy\fR\|(3) for information
+about the generator.
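+.PP
+As an illustrative sketch (not part of the original manual), the following fragment computes
+generator * \fBn\fR on a named curve; the curve choice (prime256v1) and the omitted error
+handling are assumptions made for brevity.
+.PP
+.Vb 10
+\& #include <openssl/ec.h>
+\& #include <openssl/bn.h>
+\& #include <openssl/obj_mac.h>
+\&
+\& EC_GROUP *group = EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1);
+\& EC_POINT *r = EC_POINT_new(group);
+\& BIGNUM *n = BN_new();
+\& BN_CTX *ctx = BN_CTX_new();
+\&
+\& BN_set_word(n, 42);
+\& /* r = generator * n; q and m are NULL, so only the generator term is used */
+\& if (!EC_POINT_mul(group, r, n, NULL, NULL, ctx)) {
+\&     /* handle the error */
+\& }
+\&
+\& EC_POINT_free(r);
+\& BN_free(n);
+\& BN_CTX_free(ctx);
+\& EC_GROUP_free(group);
+.Ve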
+.SH "RETURN VALUES"
+.IX Header "RETURN VALUES"
+The following functions return 1 on success or 0 on error: EC_POINT_add, EC_POINT_dbl, EC_POINT_invert, EC_POINT_make_affine,
+EC_POINTs_make_affine, EC_POINT_mul, EC_POINTs_mul and EC_GROUP_precompute_mult.
+.PP
+EC_POINT_is_at_infinity returns 1 if the point is at infinity, or 0 otherwise.
+.PP
+EC_POINT_is_on_curve returns 1 if the point is on the curve, 0 if not, or \-1 on error.
+.PP
+EC_POINT_cmp returns 1 if the points are not equal, 0 if they are, or \-1 on error.
+.PP
+EC_GROUP_have_precompute_mult returns 1 if a precomputation has been done, or 0 if not.
+.SH "SEE ALSO"
+.IX Header "SEE ALSO"
+\&\fIcrypto\fR\|(3), \fIec\fR\|(3), \fIEC_GROUP_new\fR\|(3), \fIEC_GROUP_copy\fR\|(3),
+\&\fIEC_POINT_new\fR\|(3), \fIEC_KEY_new\fR\|(3),
+\&\fIEC_GFp_simple_method\fR\|(3), \fId2i_ECPKParameters\fR\|(3)


Property changes on: trunk/secure/lib/libcrypto/man/EC_POINT_add.3
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/man/EC_POINT_new.3
===================================================================
--- trunk/secure/lib/libcrypto/man/EC_POINT_new.3	                        (rev 0)
+++ trunk/secure/lib/libcrypto/man/EC_POINT_new.3	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,257 @@
+.\" $MidnightBSD$
+.\" Automatically generated by Pod::Man 4.09 (Pod::Simple 3.35)
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" Set up some character translations and predefined strings.  \*(-- will
+.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
+.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
+.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
+.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
+.\" nothing in troff, for use with C<>.
+.tr \(*W-
+.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
+.ie n \{\
+.    ds -- \(*W-
+.    ds PI pi
+.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
+.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
+.    ds L" ""
+.    ds R" ""
+.    ds C` ""
+.    ds C' ""
+'br\}
+.el\{\
+.    ds -- \|\(em\|
+.    ds PI \(*p
+.    ds L" ``
+.    ds R" ''
+.    ds C`
+.    ds C'
+'br\}
+.\"
+.\" Escape single quotes in literal strings from groff's Unicode transform.
+.ie \n(.g .ds Aq \(aq
+.el       .ds Aq '
+.\"
+.\" If the F register is >0, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
+.\" entries marked with X<> in POD.  Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.\"
+.\" Avoid warning from groff about undefined register 'F'.
+.de IX
+..
+.if !\nF .nr F 0
+.if \nF>0 \{\
+.    de IX
+.    tm Index:\\$1\t\\n%\t"\\$2"
+..
+.    if !\nF==2 \{\
+.        nr % 0
+.        nr F 2
+.    \}
+.\}
+.\"
+.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
+.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
+.    \" fudge factors for nroff and troff
+.if n \{\
+.    ds #H 0
+.    ds #V .8m
+.    ds #F .3m
+.    ds #[ \f1
+.    ds #] \fP
+.\}
+.if t \{\
+.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
+.    ds #V .6m
+.    ds #F 0
+.    ds #[ \&
+.    ds #] \&
+.\}
+.    \" simple accents for nroff and troff
+.if n \{\
+.    ds ' \&
+.    ds ` \&
+.    ds ^ \&
+.    ds , \&
+.    ds ~ ~
+.    ds /
+.\}
+.if t \{\
+.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
+.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
+.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
+.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
+.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
+.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
+.\}
+.    \" troff and (daisy-wheel) nroff accents
+.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
+.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
+.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
+.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
+.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
+.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
+.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
+.ds ae a\h'-(\w'a'u*4/10)'e
+.ds Ae A\h'-(\w'A'u*4/10)'E
+.    \" corrections for vroff
+.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
+.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
+.    \" for low resolution devices (crt and lpr)
+.if \n(.H>23 .if \n(.V>19 \
+\{\
+.    ds : e
+.    ds 8 ss
+.    ds o a
+.    ds d- d\h'-1'\(ga
+.    ds D- D\h'-1'\(hy
+.    ds th \o'bp'
+.    ds Th \o'LP'
+.    ds ae ae
+.    ds Ae AE
+.\}
+.rm #[ #] #H #V #F C
+.\" ========================================================================
+.\"
+.IX Title "EC_POINT_new 3"
+.TH EC_POINT_new 3 "2018-11-20" "1.0.2q" "OpenSSL"
+.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.if n .ad l
+.nh
+.SH "NAME"
+EC_POINT_new, EC_POINT_free, EC_POINT_clear_free, EC_POINT_copy, EC_POINT_dup, EC_POINT_method_of, EC_POINT_set_to_infinity, EC_POINT_set_Jprojective_coordinates_GFp, EC_POINT_get_Jprojective_coordinates_GFp, EC_POINT_set_affine_coordinates_GFp, EC_POINT_get_affine_coordinates_GFp, EC_POINT_set_compressed_coordinates_GFp, EC_POINT_set_affine_coordinates_GF2m, EC_POINT_get_affine_coordinates_GF2m, EC_POINT_set_compressed_coordinates_GF2m, EC_POINT_point2oct, EC_POINT_oct2point, EC_POINT_point2bn, EC_POINT_bn2point, EC_POINT_point2hex, EC_POINT_hex2point \- Functions for creating, destroying and manipulating EC_POINT objects.
+.SH "SYNOPSIS"
+.IX Header "SYNOPSIS"
+.Vb 2
+\& #include <openssl/ec.h>
+\& #include <openssl/bn.h>
+\&
+\& EC_POINT *EC_POINT_new(const EC_GROUP *group);
+\& void EC_POINT_free(EC_POINT *point);
+\& void EC_POINT_clear_free(EC_POINT *point);
+\& int EC_POINT_copy(EC_POINT *dst, const EC_POINT *src);
+\& EC_POINT *EC_POINT_dup(const EC_POINT *src, const EC_GROUP *group);
+\& const EC_METHOD *EC_POINT_method_of(const EC_POINT *point);
+\& int EC_POINT_set_to_infinity(const EC_GROUP *group, EC_POINT *point);
+\& int EC_POINT_set_Jprojective_coordinates_GFp(const EC_GROUP *group, EC_POINT *p,
+\&        const BIGNUM *x, const BIGNUM *y, const BIGNUM *z, BN_CTX *ctx);
+\& int EC_POINT_get_Jprojective_coordinates_GFp(const EC_GROUP *group,
+\&        const EC_POINT *p, BIGNUM *x, BIGNUM *y, BIGNUM *z, BN_CTX *ctx);
+\& int EC_POINT_set_affine_coordinates_GFp(const EC_GROUP *group, EC_POINT *p,
+\&        const BIGNUM *x, const BIGNUM *y, BN_CTX *ctx);
+\& int EC_POINT_get_affine_coordinates_GFp(const EC_GROUP *group,
+\&        const EC_POINT *p, BIGNUM *x, BIGNUM *y, BN_CTX *ctx);
+\& int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *p,
+\&        const BIGNUM *x, int y_bit, BN_CTX *ctx);
+\& int EC_POINT_set_affine_coordinates_GF2m(const EC_GROUP *group, EC_POINT *p,
+\&        const BIGNUM *x, const BIGNUM *y, BN_CTX *ctx);
+\& int EC_POINT_get_affine_coordinates_GF2m(const EC_GROUP *group,
+\&        const EC_POINT *p, BIGNUM *x, BIGNUM *y, BN_CTX *ctx);
+\& int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *p,
+\&        const BIGNUM *x, int y_bit, BN_CTX *ctx);
+\& size_t EC_POINT_point2oct(const EC_GROUP *group, const EC_POINT *p,
+\&        point_conversion_form_t form,
+\&        unsigned char *buf, size_t len, BN_CTX *ctx);
+\& int EC_POINT_oct2point(const EC_GROUP *group, EC_POINT *p,
+\&        const unsigned char *buf, size_t len, BN_CTX *ctx);
+\& BIGNUM *EC_POINT_point2bn(const EC_GROUP *, const EC_POINT *,
+\&        point_conversion_form_t form, BIGNUM *, BN_CTX *);
+\& EC_POINT *EC_POINT_bn2point(const EC_GROUP *, const BIGNUM *,
+\&        EC_POINT *, BN_CTX *);
+\& char *EC_POINT_point2hex(const EC_GROUP *, const EC_POINT *,
+\&        point_conversion_form_t form, BN_CTX *);
+\& EC_POINT *EC_POINT_hex2point(const EC_GROUP *, const char *,
+\&        EC_POINT *, BN_CTX *);
+.Ve
+.SH "DESCRIPTION"
+.IX Header "DESCRIPTION"
+An \s-1EC_POINT\s0 represents a point on a curve. A new point is constructed by calling the function EC_POINT_new and providing the \fBgroup\fR
+object that the point relates to.
+.PP
+EC_POINT_free frees the memory associated with the \s-1EC_POINT.\s0
+.PP
+EC_POINT_clear_free destroys any sensitive data held within the \s-1EC_POINT\s0 and then frees its memory.
+.PP
+EC_POINT_copy copies the point \fBsrc\fR into \fBdst\fR. Both \fBsrc\fR and \fBdst\fR must use the same \s-1EC_METHOD.\s0
+.PP
+EC_POINT_dup creates a new \s-1EC_POINT\s0 object and copies the content from \fBsrc\fR to the newly created
+\&\s-1EC_POINT\s0 object.
+.PP
+EC_POINT_method_of obtains the \s-1EC_METHOD\s0 associated with \fBpoint\fR.
+.PP
+A valid point on a curve is the special point at infinity. A point is set to be at infinity by calling EC_POINT_set_to_infinity.
+.PP
+The affine co-ordinates for a point describe a point in terms of its x and y position. The functions
+EC_POINT_set_affine_coordinates_GFp and EC_POINT_set_affine_coordinates_GF2m set the \fBx\fR and \fBy\fR co-ordinates for the point
+\&\fBp\fR defined over the curve given in \fBgroup\fR.
+.PP
+As well as the affine co-ordinates, a point can alternatively be described in terms of its Jacobian
+projective co-ordinates (for Fp curves only). Jacobian projective co-ordinates are expressed as three values x, y and z. Working in
+this co-ordinate system provides more efficient point multiplication operations.
+A mapping exists between Jacobian projective co-ordinates and affine co-ordinates. A Jacobian projective co-ordinate (x, y, z) can be written as an affine co-ordinate as (x/(z^2), y/(z^3)). Conversion from affine to Jacobian projective co-ordinates is simple: the affine co-ordinate (x, y) is
+mapped to (x, y, 1). To set or get the projective co-ordinates use EC_POINT_set_Jprojective_coordinates_GFp and
+EC_POINT_get_Jprojective_coordinates_GFp respectively.
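+.PP
+For illustration only (not part of the original manual), an affine point (x, y) can be loaded
+in Jacobian projective form by supplying z = 1; the variables \fBgroup\fR, \fBp\fR, \fBx\fR,
+\&\fBy\fR and \fBctx\fR are assumed to have been created elsewhere.
+.PP
+.Vb 7
+\& BIGNUM *z = BN_new();
+\&
+\& /* the affine co-ordinate (x, y) corresponds to (x, y, 1) in Jacobian form */
+\& BN_one(z);
+\& if (!EC_POINT_set_Jprojective_coordinates_GFp(group, p, x, y, z, ctx)) {
+\&     /* handle the error */
+\& }
+\& BN_free(z);
+.Ve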
+.PP
+Points can also be described in terms of their compressed co-ordinates. For a point (x, y), for any given value for x such that the point is
+on the curve there will only ever be two possible values for y. Therefore a point can be set using the EC_POINT_set_compressed_coordinates_GFp
+and EC_POINT_set_compressed_coordinates_GF2m functions where \fBx\fR is the x co-ordinate and \fBy_bit\fR is a value 0 or 1 to identify which of
+the two possible values for y should be used.
+.PP
+In addition EC_POINTs can be converted to and from various external
+representations. Supported representations are octet strings, BIGNUMs and
+hexadecimal. Octet strings are stored in a buffer along with an associated
+buffer length. A point held in a \s-1BIGNUM\s0 is calculated by converting the point to
+an octet string and then converting that octet string into a \s-1BIGNUM\s0 integer.
+Points in hexadecimal format are stored in a \s-1NULL\s0 terminated character string
+where each character is one of the printable values 0\-9 or A\-F (or a\-f).
+.PP
+The functions EC_POINT_point2oct, EC_POINT_oct2point, EC_POINT_point2bn, EC_POINT_bn2point, EC_POINT_point2hex and EC_POINT_hex2point convert
+EC_POINTs to and from the octet string, \s-1BIGNUM\s0 and hexadecimal formats respectively.
+.PP
+The function EC_POINT_point2oct must be supplied with a buffer long enough to store the octet string. The return value provides the number of
+octets stored. Calling the function with a \s-1NULL\s0 buffer will not perform the conversion but will still return the required buffer length.
+.PP
+The function EC_POINT_point2hex will allocate sufficient memory to store the hexadecimal string. It is the caller's responsibility to free
+this memory with a subsequent call to \fIOPENSSL_free()\fR.
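+.PP
+As a brief sketch (not part of the original manual), the following fragment converts a point to
+hexadecimal form; \fBgroup\fR, \fBx\fR, \fBy\fR and \fBctx\fR are assumed to have been created
+elsewhere, and error handling is omitted for brevity.
+.PP
+.Vb 8
+\& EC_POINT *p = EC_POINT_new(group);
+\& char *hex;
+\&
+\& EC_POINT_set_affine_coordinates_GFp(group, p, x, y, ctx);
+\& hex = EC_POINT_point2hex(group, p, POINT_CONVERSION_UNCOMPRESSED, ctx);
+\& if (hex != NULL) {
+\&     /* hex now holds a printable form of the point */
+\&     OPENSSL_free(hex);    /* the caller must free the returned string */
+\& }
+\& EC_POINT_free(p);
+.Ve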
+.SH "RETURN VALUES"
+.IX Header "RETURN VALUES"
+EC_POINT_new and EC_POINT_dup return the newly allocated \s-1EC_POINT\s0 or \s-1NULL\s0 on error.
+.PP
+The following functions return 1 on success or 0 on error: EC_POINT_copy, EC_POINT_set_to_infinity, EC_POINT_set_Jprojective_coordinates_GFp,
+EC_POINT_get_Jprojective_coordinates_GFp, EC_POINT_set_affine_coordinates_GFp, EC_POINT_get_affine_coordinates_GFp,
+EC_POINT_set_compressed_coordinates_GFp, EC_POINT_set_affine_coordinates_GF2m, EC_POINT_get_affine_coordinates_GF2m,
+EC_POINT_set_compressed_coordinates_GF2m and EC_POINT_oct2point.
+.PP
+EC_POINT_method_of returns the \s-1EC_METHOD\s0 associated with the supplied \s-1EC_POINT.\s0
+.PP
+EC_POINT_point2oct returns the length of the required buffer, or 0 on error.
+.PP
+EC_POINT_point2bn returns the pointer to the \s-1BIGNUM\s0 supplied, or \s-1NULL\s0 on error.
+.PP
+EC_POINT_bn2point returns the pointer to the \s-1EC_POINT\s0 supplied, or \s-1NULL\s0 on error.
+.PP
+EC_POINT_point2hex returns a pointer to the hex string, or \s-1NULL\s0 on error.
+.PP
+EC_POINT_hex2point returns the pointer to the \s-1EC_POINT\s0 supplied, or \s-1NULL\s0 on error.
+.SH "SEE ALSO"
+.IX Header "SEE ALSO"
+\&\fIcrypto\fR\|(3), \fIec\fR\|(3), \fIEC_GROUP_new\fR\|(3), \fIEC_GROUP_copy\fR\|(3),
+\&\fIEC_POINT_add\fR\|(3), \fIEC_KEY_new\fR\|(3),
+\&\fIEC_GFp_simple_method\fR\|(3), \fId2i_ECPKParameters\fR\|(3)


Property changes on: trunk/secure/lib/libcrypto/man/EC_POINT_new.3
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/man/EVP_PKEY_meth_new.3
===================================================================
--- trunk/secure/lib/libcrypto/man/EVP_PKEY_meth_new.3	                        (rev 0)
+++ trunk/secure/lib/libcrypto/man/EVP_PKEY_meth_new.3	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,532 @@
+.\" $MidnightBSD$
+.\" Automatically generated by Pod::Man 4.09 (Pod::Simple 3.35)
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" Set up some character translations and predefined strings.  \*(-- will
+.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
+.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
+.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
+.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
+.\" nothing in troff, for use with C<>.
+.tr \(*W-
+.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
+.ie n \{\
+.    ds -- \(*W-
+.    ds PI pi
+.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
+.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
+.    ds L" ""
+.    ds R" ""
+.    ds C` ""
+.    ds C' ""
+'br\}
+.el\{\
+.    ds -- \|\(em\|
+.    ds PI \(*p
+.    ds L" ``
+.    ds R" ''
+.    ds C`
+.    ds C'
+'br\}
+.\"
+.\" Escape single quotes in literal strings from groff's Unicode transform.
+.ie \n(.g .ds Aq \(aq
+.el       .ds Aq '
+.\"
+.\" If the F register is >0, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
+.\" entries marked with X<> in POD.  Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.\"
+.\" Avoid warning from groff about undefined register 'F'.
+.de IX
+..
+.if !\nF .nr F 0
+.if \nF>0 \{\
+.    de IX
+.    tm Index:\\$1\t\\n%\t"\\$2"
+..
+.    if !\nF==2 \{\
+.        nr % 0
+.        nr F 2
+.    \}
+.\}
+.\"
+.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
+.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
+.    \" fudge factors for nroff and troff
+.if n \{\
+.    ds #H 0
+.    ds #V .8m
+.    ds #F .3m
+.    ds #[ \f1
+.    ds #] \fP
+.\}
+.if t \{\
+.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
+.    ds #V .6m
+.    ds #F 0
+.    ds #[ \&
+.    ds #] \&
+.\}
+.    \" simple accents for nroff and troff
+.if n \{\
+.    ds ' \&
+.    ds ` \&
+.    ds ^ \&
+.    ds , \&
+.    ds ~ ~
+.    ds /
+.\}
+.if t \{\
+.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
+.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
+.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
+.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
+.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
+.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
+.\}
+.    \" troff and (daisy-wheel) nroff accents
+.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
+.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
+.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
+.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
+.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
+.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
+.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
+.ds ae a\h'-(\w'a'u*4/10)'e
+.ds Ae A\h'-(\w'A'u*4/10)'E
+.    \" corrections for vroff
+.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
+.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
+.    \" for low resolution devices (crt and lpr)
+.if \n(.H>23 .if \n(.V>19 \
+\{\
+.    ds : e
+.    ds 8 ss
+.    ds o a
+.    ds d- d\h'-1'\(ga
+.    ds D- D\h'-1'\(hy
+.    ds th \o'bp'
+.    ds Th \o'LP'
+.    ds ae ae
+.    ds Ae AE
+.\}
+.rm #[ #] #H #V #F C
+.\" ========================================================================
+.\"
+.IX Title "EVP_PKEY_meth_new 3"
+.TH EVP_PKEY_meth_new 3 "2018-11-20" "1.0.2q" "OpenSSL"
+.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.if n .ad l
+.nh
+.SH "NAME"
+EVP_PKEY_meth_new, EVP_PKEY_meth_free, EVP_PKEY_meth_copy, EVP_PKEY_meth_find,
+EVP_PKEY_meth_add0, EVP_PKEY_METHOD,
+EVP_PKEY_meth_set_init, EVP_PKEY_meth_set_copy, EVP_PKEY_meth_set_cleanup,
+EVP_PKEY_meth_set_paramgen, EVP_PKEY_meth_set_keygen, EVP_PKEY_meth_set_sign,
+EVP_PKEY_meth_set_verify, EVP_PKEY_meth_set_verify_recover, EVP_PKEY_meth_set_signctx,
+EVP_PKEY_meth_set_verifyctx, EVP_PKEY_meth_set_encrypt, EVP_PKEY_meth_set_decrypt,
+EVP_PKEY_meth_set_derive, EVP_PKEY_meth_set_ctrl,
+EVP_PKEY_meth_get_init, EVP_PKEY_meth_get_copy, EVP_PKEY_meth_get_cleanup,
+EVP_PKEY_meth_get_paramgen, EVP_PKEY_meth_get_keygen, EVP_PKEY_meth_get_sign,
+EVP_PKEY_meth_get_verify, EVP_PKEY_meth_get_verify_recover, EVP_PKEY_meth_get_signctx,
+EVP_PKEY_meth_get_verifyctx, EVP_PKEY_meth_get_encrypt, EVP_PKEY_meth_get_decrypt,
+EVP_PKEY_meth_get_derive, EVP_PKEY_meth_get_ctrl
+\&\- manipulating EVP_PKEY_METHOD structure
+.SH "SYNOPSIS"
+.IX Header "SYNOPSIS"
+.Vb 1
+\& #include <openssl/evp.h>
+\&
+\& typedef struct evp_pkey_method_st EVP_PKEY_METHOD;
+\&
+\& EVP_PKEY_METHOD *EVP_PKEY_meth_new(int id, int flags);
+\& void EVP_PKEY_meth_free(EVP_PKEY_METHOD *pmeth);
+\& void EVP_PKEY_meth_copy(EVP_PKEY_METHOD *dst, const EVP_PKEY_METHOD *src);
+\& const EVP_PKEY_METHOD *EVP_PKEY_meth_find(int type);
+\& int EVP_PKEY_meth_add0(const EVP_PKEY_METHOD *pmeth);
+\&
+\& void EVP_PKEY_meth_set_init(EVP_PKEY_METHOD *pmeth,
+\&                             int (*init) (EVP_PKEY_CTX *ctx));
+\& void EVP_PKEY_meth_set_copy(EVP_PKEY_METHOD *pmeth,
+\&                             int (*copy) (EVP_PKEY_CTX *dst,
+\&                                          EVP_PKEY_CTX *src));
+\& void EVP_PKEY_meth_set_cleanup(EVP_PKEY_METHOD *pmeth,
+\&                                void (*cleanup) (EVP_PKEY_CTX *ctx));
+\& void EVP_PKEY_meth_set_paramgen(EVP_PKEY_METHOD *pmeth,
+\&                                 int (*paramgen_init) (EVP_PKEY_CTX *ctx),
+\&                                 int (*paramgen) (EVP_PKEY_CTX *ctx,
+\&                                                  EVP_PKEY *pkey));
+\& void EVP_PKEY_meth_set_keygen(EVP_PKEY_METHOD *pmeth,
+\&                               int (*keygen_init) (EVP_PKEY_CTX *ctx),
+\&                               int (*keygen) (EVP_PKEY_CTX *ctx,
+\&                                              EVP_PKEY *pkey));
+\& void EVP_PKEY_meth_set_sign(EVP_PKEY_METHOD *pmeth,
+\&                             int (*sign_init) (EVP_PKEY_CTX *ctx),
+\&                             int (*sign) (EVP_PKEY_CTX *ctx,
+\&                                          unsigned char *sig, size_t *siglen,
+\&                                          const unsigned char *tbs,
+\&                                          size_t tbslen));
+\& void EVP_PKEY_meth_set_verify(EVP_PKEY_METHOD *pmeth,
+\&                               int (*verify_init) (EVP_PKEY_CTX *ctx),
+\&                               int (*verify) (EVP_PKEY_CTX *ctx,
+\&                                              const unsigned char *sig,
+\&                                              size_t siglen,
+\&                                              const unsigned char *tbs,
+\&                                              size_t tbslen));
+\& void EVP_PKEY_meth_set_verify_recover(EVP_PKEY_METHOD *pmeth,
+\&                                       int (*verify_recover_init) (EVP_PKEY_CTX
+\&                                                                   *ctx),
+\&                                       int (*verify_recover) (EVP_PKEY_CTX
+\&                                                              *ctx,
+\&                                                              unsigned char
+\&                                                              *sig,
+\&                                                              size_t *siglen,
+\&                                                              const unsigned
+\&                                                              char *tbs,
+\&                                                              size_t tbslen));
+\& void EVP_PKEY_meth_set_signctx(EVP_PKEY_METHOD *pmeth,
+\&                                int (*signctx_init) (EVP_PKEY_CTX *ctx,
+\&                                                     EVP_MD_CTX *mctx),
+\&                                int (*signctx) (EVP_PKEY_CTX *ctx,
+\&                                                unsigned char *sig,
+\&                                                size_t *siglen,
+\&                                                EVP_MD_CTX *mctx));
+\& void EVP_PKEY_meth_set_verifyctx(EVP_PKEY_METHOD *pmeth,
+\&                                  int (*verifyctx_init) (EVP_PKEY_CTX *ctx,
+\&                                                         EVP_MD_CTX *mctx),
+\&                                  int (*verifyctx) (EVP_PKEY_CTX *ctx,
+\&                                                    const unsigned char *sig,
+\&                                                    int siglen,
+\&                                                    EVP_MD_CTX *mctx));
+\& void EVP_PKEY_meth_set_encrypt(EVP_PKEY_METHOD *pmeth,
+\&                                int (*encrypt_init) (EVP_PKEY_CTX *ctx),
+\&                                int (*encryptfn) (EVP_PKEY_CTX *ctx,
+\&                                                  unsigned char *out,
+\&                                                  size_t *outlen,
+\&                                                  const unsigned char *in,
+\&                                                  size_t inlen));
+\& void EVP_PKEY_meth_set_decrypt(EVP_PKEY_METHOD *pmeth,
+\&                                int (*decrypt_init) (EVP_PKEY_CTX *ctx),
+\&                                int (*decrypt) (EVP_PKEY_CTX *ctx,
+\&                                                unsigned char *out,
+\&                                                size_t *outlen,
+\&                                                const unsigned char *in,
+\&                                                size_t inlen));
+\& void EVP_PKEY_meth_set_derive(EVP_PKEY_METHOD *pmeth,
+\&                               int (*derive_init) (EVP_PKEY_CTX *ctx),
+\&                               int (*derive) (EVP_PKEY_CTX *ctx,
+\&                                              unsigned char *key,
+\&                                              size_t *keylen));
+\& void EVP_PKEY_meth_set_ctrl(EVP_PKEY_METHOD *pmeth,
+\&                             int (*ctrl) (EVP_PKEY_CTX *ctx, int type, int p1,
+\&                                          void *p2),
+\&                             int (*ctrl_str) (EVP_PKEY_CTX *ctx,
+\&                                              const char *type,
+\&                                              const char *value));
+\&
+\& void EVP_PKEY_meth_get_init(EVP_PKEY_METHOD *pmeth,
+\&                             int (**pinit) (EVP_PKEY_CTX *ctx));
+\& void EVP_PKEY_meth_get_copy(EVP_PKEY_METHOD *pmeth,
+\&                             int (**pcopy) (EVP_PKEY_CTX *dst,
+\&                                            EVP_PKEY_CTX *src));
+\& void EVP_PKEY_meth_get_cleanup(EVP_PKEY_METHOD *pmeth,
+\&                                void (**pcleanup) (EVP_PKEY_CTX *ctx));
+\& void EVP_PKEY_meth_get_paramgen(EVP_PKEY_METHOD *pmeth,
+\&                                 int (**pparamgen_init) (EVP_PKEY_CTX *ctx),
+\&                                 int (**pparamgen) (EVP_PKEY_CTX *ctx,
+\&                                                    EVP_PKEY *pkey));
+\& void EVP_PKEY_meth_get_keygen(EVP_PKEY_METHOD *pmeth,
+\&                               int (**pkeygen_init) (EVP_PKEY_CTX *ctx),
+\&                               int (**pkeygen) (EVP_PKEY_CTX *ctx,
+\&                                                EVP_PKEY *pkey));
+\& void EVP_PKEY_meth_get_sign(EVP_PKEY_METHOD *pmeth,
+\&                             int (**psign_init) (EVP_PKEY_CTX *ctx),
+\&                             int (**psign) (EVP_PKEY_CTX *ctx,
+\&                                            unsigned char *sig, size_t *siglen,
+\&                                            const unsigned char *tbs,
+\&                                            size_t tbslen));
+\& void EVP_PKEY_meth_get_verify(EVP_PKEY_METHOD *pmeth,
+\&                               int (**pverify_init) (EVP_PKEY_CTX *ctx),
+\&                               int (**pverify) (EVP_PKEY_CTX *ctx,
+\&                                                const unsigned char *sig,
+\&                                                size_t siglen,
+\&                                                const unsigned char *tbs,
+\&                                                size_t tbslen));
+\& void EVP_PKEY_meth_get_verify_recover(EVP_PKEY_METHOD *pmeth,
+\&                                       int (**pverify_recover_init) (EVP_PKEY_CTX
+\&                                                                     *ctx),
+\&                                       int (**pverify_recover) (EVP_PKEY_CTX
+\&                                                                *ctx,
+\&                                                                unsigned char
+\&                                                                *sig,
+\&                                                                size_t *siglen,
+\&                                                                const unsigned
+\&                                                                char *tbs,
+\&                                                                size_t tbslen));
+\& void EVP_PKEY_meth_get_signctx(EVP_PKEY_METHOD *pmeth,
+\&                                int (**psignctx_init) (EVP_PKEY_CTX *ctx,
+\&                                                       EVP_MD_CTX *mctx),
+\&                                int (**psignctx) (EVP_PKEY_CTX *ctx,
+\&                                                  unsigned char *sig,
+\&                                                  size_t *siglen,
+\&                                                  EVP_MD_CTX *mctx));
+\& void EVP_PKEY_meth_get_verifyctx(EVP_PKEY_METHOD *pmeth,
+\&                                  int (**pverifyctx_init) (EVP_PKEY_CTX *ctx,
+\&                                                           EVP_MD_CTX *mctx),
+\&                                  int (**pverifyctx) (EVP_PKEY_CTX *ctx,
+\&                                                      const unsigned char *sig,
+\&                                                      int siglen,
+\&                                                      EVP_MD_CTX *mctx));
+\& void EVP_PKEY_meth_get_encrypt(EVP_PKEY_METHOD *pmeth,
+\&                                int (**pencrypt_init) (EVP_PKEY_CTX *ctx),
+\&                                int (**pencryptfn) (EVP_PKEY_CTX *ctx,
+\&                                                    unsigned char *out,
+\&                                                    size_t *outlen,
+\&                                                    const unsigned char *in,
+\&                                                    size_t inlen));
+\& void EVP_PKEY_meth_get_decrypt(EVP_PKEY_METHOD *pmeth,
+\&                                int (**pdecrypt_init) (EVP_PKEY_CTX *ctx),
+\&                                int (**pdecrypt) (EVP_PKEY_CTX *ctx,
+\&                                                  unsigned char *out,
+\&                                                  size_t *outlen,
+\&                                                  const unsigned char *in,
+\&                                                  size_t inlen));
+\& void EVP_PKEY_meth_get_derive(EVP_PKEY_METHOD *pmeth,
+\&                               int (**pderive_init) (EVP_PKEY_CTX *ctx),
+\&                               int (**pderive) (EVP_PKEY_CTX *ctx,
+\&                                                unsigned char *key,
+\&                                                size_t *keylen));
+\& void EVP_PKEY_meth_get_ctrl(EVP_PKEY_METHOD *pmeth,
+\&                             int (**pctrl) (EVP_PKEY_CTX *ctx, int type, int p1,
+\&                                            void *p2),
+\&                             int (**pctrl_str) (EVP_PKEY_CTX *ctx,
+\&                                                const char *type,
+\&                                                const char *value));
+.Ve
+.SH "DESCRIPTION"
+.IX Header "DESCRIPTION"
+\&\fB\s-1EVP_PKEY_METHOD\s0\fR is a structure which holds a set of methods for a
+specific public key cryptographic algorithm. Those methods are usually
+used to perform different jobs, such as generating a key, signing or
+verifying, encrypting or decrypting, etc.
+.PP
+There are two places where the \fB\s-1EVP_PKEY_METHOD\s0\fR objects are stored: one
+is a built-in static array representing the standard methods for different
+algorithms, and the other one is a stack of user-defined application-specific
+methods, which can be manipulated by using \fIEVP_PKEY_meth_add0\fR\|(3).
+.PP
+The \fB\s-1EVP_PKEY_METHOD\s0\fR objects are usually referenced by \fB\s-1EVP_PKEY_CTX\s0\fR
+objects.
+.SS "Methods"
+.IX Subsection "Methods"
+The methods are the underlying implementations of a particular public key
+algorithm presented by the \fB\s-1EVP_PKEY_CTX\s0\fR object.
+.PP
+.Vb 3
+\& int (*init) (EVP_PKEY_CTX *ctx);
+\& int (*copy) (EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src);
+\& void (*cleanup) (EVP_PKEY_CTX *ctx);
+.Ve
+.PP
+The \fIinit()\fR method is called to initialize algorithm-specific data when a new
+\&\fB\s-1EVP_PKEY_CTX\s0\fR is created. As opposed to \fIinit()\fR, the \fIcleanup()\fR method is called
+when an \fB\s-1EVP_PKEY_CTX\s0\fR is freed. The \fIcopy()\fR method is called when an \fB\s-1EVP_PKEY_CTX\s0\fR
+is being duplicated. Refer to \fIEVP_PKEY_CTX_new\fR\|(3), \fIEVP_PKEY_CTX_new_id\fR\|(3),
+\&\fIEVP_PKEY_CTX_free\fR\|(3) and \fIEVP_PKEY_CTX_dup\fR\|(3).
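+.PP
+As a hypothetical sketch (not taken from this manual), an \fIinit()\fR method typically
+allocates per-context data and attaches it to the context; the \s-1MY_ALG_DATA\s0 structure
+below is an invented placeholder.
+.PP
+.Vb 10
+\& static int my_init(EVP_PKEY_CTX *ctx)
+\& {
+\&     MY_ALG_DATA *data = OPENSSL_malloc(sizeof(*data));
+\&
+\&     if (data == NULL)
+\&         return 0;
+\&     /* stash the algorithm-specific data in the context */
+\&     EVP_PKEY_CTX_set_data(ctx, data);
+\&     return 1;
+\& }
+.Ve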
+.PP
+.Vb 2
+\& int (*paramgen_init) (EVP_PKEY_CTX *ctx);
+\& int (*paramgen) (EVP_PKEY_CTX *ctx, EVP_PKEY *pkey);
+.Ve
+.PP
+The \fIparamgen_init()\fR and \fIparamgen()\fR methods deal with key parameter generation.
+They are called by \fIEVP_PKEY_paramgen_init\fR\|(3) and \fIEVP_PKEY_paramgen\fR\|(3) to
+handle the parameter generation process.
+.PP
+.Vb 2
+\& int (*keygen_init) (EVP_PKEY_CTX *ctx);
+\& int (*keygen) (EVP_PKEY_CTX *ctx, EVP_PKEY *pkey);
+.Ve
+.PP
+The \fIkeygen_init()\fR and \fIkeygen()\fR methods are used to generate the actual key for
+the specified algorithm. They are called by \fIEVP_PKEY_keygen_init\fR\|(3) and
+\&\fIEVP_PKEY_keygen\fR\|(3).
+.PP
+.Vb 3
+\& int (*sign_init) (EVP_PKEY_CTX *ctx);
+\& int (*sign) (EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen,
+\&              const unsigned char *tbs, size_t tbslen);
+.Ve
+.PP
+The \fIsign_init()\fR and \fIsign()\fR methods are used to generate the signature of a
+piece of data using a private key. They are called by \fIEVP_PKEY_sign_init\fR\|(3)
+and \fIEVP_PKEY_sign\fR\|(3).
+.PP
+.Vb 4
+\& int (*verify_init) (EVP_PKEY_CTX *ctx);
+\& int (*verify) (EVP_PKEY_CTX *ctx,
+\&                const unsigned char *sig, size_t siglen,
+\&                const unsigned char *tbs, size_t tbslen);
+.Ve
+.PP
+The \fIverify_init()\fR and \fIverify()\fR methods are used to verify whether a signature is
+valid. They are called by \fIEVP_PKEY_verify_init\fR\|(3) and \fIEVP_PKEY_verify\fR\|(3).
+.PP
+.Vb 4
+\& int (*verify_recover_init) (EVP_PKEY_CTX *ctx);
+\& int (*verify_recover) (EVP_PKEY_CTX *ctx,
+\&                        unsigned char *rout, size_t *routlen,
+\&                        const unsigned char *sig, size_t siglen);
+.Ve
+.PP
+The \fIverify_recover_init()\fR and \fIverify_recover()\fR methods are used to verify a
+signature and then recover the digest from the signature (for instance, a
+signature that was generated by the \s-1RSA\s0 signing algorithm). They are called by
+\&\fIEVP_PKEY_verify_recover_init\fR\|(3) and \fIEVP_PKEY_verify_recover\fR\|(3).
+.PP
+.Vb 3
+\& int (*signctx_init) (EVP_PKEY_CTX *ctx, EVP_MD_CTX *mctx);
+\& int (*signctx) (EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen,
+\&                 EVP_MD_CTX *mctx);
+.Ve
+.PP
+The \fIsignctx_init()\fR and \fIsignctx()\fR methods are used to sign a digest presented by
+an \fB\s-1EVP_MD_CTX\s0\fR object. They are called by the EVP_DigestSign functions. See
+\&\fIEVP_DigestSignInit\fR\|(3) for details.
+.PP
+.Vb 3
+\& int (*verifyctx_init) (EVP_PKEY_CTX *ctx, EVP_MD_CTX *mctx);
+\& int (*verifyctx) (EVP_PKEY_CTX *ctx, const unsigned char *sig, int siglen,
+\&                   EVP_MD_CTX *mctx);
+.Ve
+.PP
+The \fIverifyctx_init()\fR and \fIverifyctx()\fR methods are used to verify a signature
+against the data in an \fB\s-1EVP_MD_CTX\s0\fR object. They are called by the various
+EVP_DigestVerify functions. See \fIEVP_DigestVerifyInit\fR\|(3) for details.
+.PP
+.Vb 3
+\& int (*encrypt_init) (EVP_PKEY_CTX *ctx);
+\& int (*encrypt) (EVP_PKEY_CTX *ctx, unsigned char *out, size_t *outlen,
+\&                 const unsigned char *in, size_t inlen);
+.Ve
+.PP
+The \fIencrypt_init()\fR and \fIencrypt()\fR methods are used to encrypt a piece of data.
+They are called by \fIEVP_PKEY_encrypt_init\fR\|(3) and \fIEVP_PKEY_encrypt\fR\|(3).
+.PP
+.Vb 3
+\& int (*decrypt_init) (EVP_PKEY_CTX *ctx);
+\& int (*decrypt) (EVP_PKEY_CTX *ctx, unsigned char *out, size_t *outlen,
+\&                 const unsigned char *in, size_t inlen);
+.Ve
+.PP
+The \fIdecrypt_init()\fR and \fIdecrypt()\fR methods are used to decrypt a piece of data.
+They are called by \fIEVP_PKEY_decrypt_init\fR\|(3) and \fIEVP_PKEY_decrypt\fR\|(3).
+.PP
+.Vb 2
+\& int (*derive_init) (EVP_PKEY_CTX *ctx);
+\& int (*derive) (EVP_PKEY_CTX *ctx, unsigned char *key, size_t *keylen);
+.Ve
+.PP
+The \fIderive_init()\fR and \fIderive()\fR methods are used to derive the shared secret
+from a public key algorithm (for instance, the \s-1DH\s0 algorithm). They are called by
+\&\fIEVP_PKEY_derive_init\fR\|(3) and \fIEVP_PKEY_derive\fR\|(3).
+.PP
+.Vb 2
+\& int (*ctrl) (EVP_PKEY_CTX *ctx, int type, int p1, void *p2);
+\& int (*ctrl_str) (EVP_PKEY_CTX *ctx, const char *type, const char *value);
+.Ve
+.PP
+The \fIctrl()\fR and \fIctrl_str()\fR methods are used to adjust algorithm-specific
+settings. See \fIEVP_PKEY_CTX_ctrl\fR\|(3) and related functions for detail.
+.PP
+.Vb 5
+\& int (*digestsign) (EVP_MD_CTX *ctx, unsigned char *sig, size_t *siglen,
+\&                    const unsigned char *tbs, size_t tbslen);
+\& int (*digestverify) (EVP_MD_CTX *ctx, const unsigned char *sig,
+\&                      size_t siglen, const unsigned char *tbs,
+\&                      size_t tbslen);
+.Ve
+.PP
+The \fIdigestsign()\fR and \fIdigestverify()\fR methods are used to generate or verify
+a signature in a one-shot mode. They could be called by \fIEVP_DigestSign\fR\|(3)
+and \fIEVP_DigestVerify\fR\|(3).
+.SS "Functions"
+.IX Subsection "Functions"
+\&\fIEVP_PKEY_meth_new()\fR creates and returns a new \fB\s-1EVP_PKEY_METHOD\s0\fR object,
+and associates the given \fBid\fR and \fBflags\fR. The following flags are
+supported:
+.PP
+.Vb 2
+\& EVP_PKEY_FLAG_AUTOARGLEN
+\& EVP_PKEY_FLAG_SIGCTX_CUSTOM
+.Ve
+.PP
+If an \fB\s-1EVP_PKEY_METHOD\s0\fR is set with the \fB\s-1EVP_PKEY_FLAG_AUTOARGLEN\s0\fR flag, the
+maximum size of the output buffer will be automatically calculated or checked
+in the corresponding \s-1EVP\s0 methods by the \s-1EVP\s0 framework. Thus the implementations of
+these methods do not need to handle returning the output
+buffer size themselves. For details on the output buffer size, refer to
+\&\fIEVP_PKEY_sign\fR\|(3).
+.PP
+The \fB\s-1EVP_PKEY_FLAG_SIGCTX_CUSTOM\s0\fR flag is used to indicate that the \fIsignctx()\fR method
+of an \fB\s-1EVP_PKEY_METHOD\s0\fR is always called by the \s-1EVP\s0 framework while doing a
+digest signing operation by calling \fIEVP_DigestSignFinal\fR\|(3).
+.PP
+\&\fIEVP_PKEY_meth_free()\fR frees an existing \fB\s-1EVP_PKEY_METHOD\s0\fR pointed to by
+\&\fBpmeth\fR.
+.PP
+\&\fIEVP_PKEY_meth_copy()\fR copies an \fB\s-1EVP_PKEY_METHOD\s0\fR object from \fBsrc\fR
+to \fBdst\fR.
+.PP
+\&\fIEVP_PKEY_meth_find()\fR finds an \fB\s-1EVP_PKEY_METHOD\s0\fR object matching the given \fBid\fR.
+This function first searches through the user-defined method objects and
+then the built-in objects.
+.PP
+\&\fIEVP_PKEY_meth_add0()\fR adds \fBpmeth\fR to the user-defined stack of methods.
+.PP
+The EVP_PKEY_meth_set functions set the corresponding fields of the
+\&\fB\s-1EVP_PKEY_METHOD\s0\fR structure with the arguments passed.
+.PP
+The EVP_PKEY_meth_get functions copy the corresponding fields of the
+\&\fB\s-1EVP_PKEY_METHOD\s0\fR structure into the arguments provided.
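+.PP
+A minimal usage sketch (not part of the original manual): the numeric identifier
+\&\fBmy_pkey_nid\fR and the \fImy_init()\fR callback sketched above are invented placeholders
+for an application-defined algorithm.
+.PP
+.Vb 12
+\& int register_my_method(void)
+\& {
+\&     EVP_PKEY_METHOD *pmeth = EVP_PKEY_meth_new(my_pkey_nid, 0);
+\&
+\&     if (pmeth == NULL)
+\&         return 0;
+\&     EVP_PKEY_meth_set_init(pmeth, my_init);
+\&     /* on success, ownership of pmeth passes to the method stack */
+\&     if (!EVP_PKEY_meth_add0(pmeth)) {
+\&         EVP_PKEY_meth_free(pmeth);
+\&         return 0;
+\&     }
+\&     return 1;
+\& }
+.Ve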
+.SH "RETURN VALUES"
+.IX Header "RETURN VALUES"
+\&\fIEVP_PKEY_meth_new()\fR returns a pointer to a new \fB\s-1EVP_PKEY_METHOD\s0\fR
+object or returns \s-1NULL\s0 on error.
+.PP
+\&\fIEVP_PKEY_meth_free()\fR and \fIEVP_PKEY_meth_copy()\fR do not return values.
+.PP
+\&\fIEVP_PKEY_meth_find()\fR returns a pointer to the found \fB\s-1EVP_PKEY_METHOD\s0\fR
+object or returns \s-1NULL\s0 if not found.
+.PP
+\&\fIEVP_PKEY_meth_add0()\fR returns 1 if the method is added successfully or 0
+if an error occurred.
+.PP
+All EVP_PKEY_meth_set and EVP_PKEY_meth_get functions have no return
+values. For the 'get' functions, function pointers are returned through their
+arguments.
+.SH "COPYRIGHT"
+.IX Header "COPYRIGHT"
+Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
+.PP
+Licensed under the OpenSSL license (the \*(L"License\*(R").  You may not use
+this file except in compliance with the License.  You can obtain a copy
+in the file \s-1LICENSE\s0 in the source distribution or at
+<https://www.openssl.org/source/license.html>.


Property changes on: trunk/secure/lib/libcrypto/man/EVP_PKEY_meth_new.3
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/man/OPENSSL_instrument_bus.3
===================================================================
--- trunk/secure/lib/libcrypto/man/OPENSSL_instrument_bus.3	                        (rev 0)
+++ trunk/secure/lib/libcrypto/man/OPENSSL_instrument_bus.3	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,175 @@
+.\" $MidnightBSD$
+.\" Automatically generated by Pod::Man 4.09 (Pod::Simple 3.35)
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" Set up some character translations and predefined strings.  \*(-- will
+.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
+.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
+.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
+.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
+.\" nothing in troff, for use with C<>.
+.tr \(*W-
+.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
+.ie n \{\
+.    ds -- \(*W-
+.    ds PI pi
+.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
+.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
+.    ds L" ""
+.    ds R" ""
+.    ds C` ""
+.    ds C' ""
+'br\}
+.el\{\
+.    ds -- \|\(em\|
+.    ds PI \(*p
+.    ds L" ``
+.    ds R" ''
+.    ds C`
+.    ds C'
+'br\}
+.\"
+.\" Escape single quotes in literal strings from groff's Unicode transform.
+.ie \n(.g .ds Aq \(aq
+.el       .ds Aq '
+.\"
+.\" If the F register is >0, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
+.\" entries marked with X<> in POD.  Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.\"
+.\" Avoid warning from groff about undefined register 'F'.
+.de IX
+..
+.if !\nF .nr F 0
+.if \nF>0 \{\
+.    de IX
+.    tm Index:\\$1\t\\n%\t"\\$2"
+..
+.    if !\nF==2 \{\
+.        nr % 0
+.        nr F 2
+.    \}
+.\}
+.\"
+.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
+.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
+.    \" fudge factors for nroff and troff
+.if n \{\
+.    ds #H 0
+.    ds #V .8m
+.    ds #F .3m
+.    ds #[ \f1
+.    ds #] \fP
+.\}
+.if t \{\
+.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
+.    ds #V .6m
+.    ds #F 0
+.    ds #[ \&
+.    ds #] \&
+.\}
+.    \" simple accents for nroff and troff
+.if n \{\
+.    ds ' \&
+.    ds ` \&
+.    ds ^ \&
+.    ds , \&
+.    ds ~ ~
+.    ds /
+.\}
+.if t \{\
+.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
+.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
+.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
+.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
+.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
+.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
+.\}
+.    \" troff and (daisy-wheel) nroff accents
+.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
+.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
+.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
+.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
+.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
+.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
+.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
+.ds ae a\h'-(\w'a'u*4/10)'e
+.ds Ae A\h'-(\w'A'u*4/10)'E
+.    \" corrections for vroff
+.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
+.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
+.    \" for low resolution devices (crt and lpr)
+.if \n(.H>23 .if \n(.V>19 \
+\{\
+.    ds : e
+.    ds 8 ss
+.    ds o a
+.    ds d- d\h'-1'\(ga
+.    ds D- D\h'-1'\(hy
+.    ds th \o'bp'
+.    ds Th \o'LP'
+.    ds ae ae
+.    ds Ae AE
+.\}
+.rm #[ #] #H #V #F C
+.\" ========================================================================
+.\"
+.IX Title "OPENSSL_instrument_bus 3"
+.TH OPENSSL_instrument_bus 3 "2018-11-20" "1.0.2q" "OpenSSL"
+.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.if n .ad l
+.nh
+.SH "NAME"
+OPENSSL_instrument_bus, OPENSSL_instrument_bus2 \- instrument references to memory bus
+.SH "SYNOPSIS"
+.IX Header "SYNOPSIS"
+.Vb 4
+\& #ifdef OPENSSL_CPUID_OBJ
+\& size_t OPENSSL_instrument_bus (int *vector,size_t num);
+\& size_t OPENSSL_instrument_bus2(int *vector,size_t num,size_t max);
+\& #endif
+.Ve
+.SH "DESCRIPTION"
+.IX Header "DESCRIPTION"
+It was empirically found that timings of references to primary memory
+are subject to irregular, apparently non-deterministic variations. The
+subroutines in question instrument these references for the purpose of
+gathering entropy for the random number generator. In order to make the
+measurement bus-bound, a 'flush cache line' instruction is used between probes.
+In addition, probes are added to \fBvector\fR elements in an atomic or
+interlocked manner, which should contribute additional noise on
+multi-processor systems. This also means that \fBvector[num]\fR should be
+zeroed upon invocation (if you want to retrieve actual probe values).
+.PP
+OPENSSL_instrument_bus performs \fBnum\fR probes and records the number of
+oscillator cycles every probe took.
+.PP
+OPENSSL_instrument_bus2 on the other hand \fBaccumulates\fR consecutive
+probes with the same value, i.e. in a way it records the duration of
+periods when probe values appeared deterministic. The subroutine
+performs at most \fBmax\fR probes in an attempt to fill \fBvector[num]\fR,
+with a \fBmax\fR value of 0 meaning \*(L"as many as it takes.\*(R"
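+.PP
+A small illustrative fragment (not part of the original manual) collecting probe timings;
+the vector size of 1024 is an arbitrary assumption.
+.PP
+.Vb 7
+\& #ifdef OPENSSL_CPUID_OBJ
+\& int vector[1024] = { 0 };   /* zeroed so probe values can be read back */
+\& size_t n = OPENSSL_instrument_bus(vector, 1024);
+\&
+\& if (n == 0) {
+\&     /* the CPU cannot perform the benchmark on this platform */
+\& }
+\& #endif
+.Ve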
+.SH "RETURN VALUE"
+.IX Header "RETURN VALUE"
+A return value of 0 indicates that the \s-1CPU\s0 is not capable of performing the
+benchmark, either because the oscillator counter or the 'flush cache line'
+instruction is not available on the current platform. For reference, on x86
+'flush cache line' was introduced with the \s-1SSE2\s0 extensions.
+.PP
+Otherwise the number of recorded values is returned.


Property changes on: trunk/secure/lib/libcrypto/man/OPENSSL_instrument_bus.3
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/secure/lib/libcrypto/man/X509_check_host.3
===================================================================
--- trunk/secure/lib/libcrypto/man/X509_check_host.3	                        (rev 0)
+++ trunk/secure/lib/libcrypto/man/X509_check_host.3	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,266 @@
+.\" $MidnightBSD$
+.\" Automatically generated by Pod::Man 4.09 (Pod::Simple 3.35)
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" Set up some character translations and predefined strings.  \*(-- will
+.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
+.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
+.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
+.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
+.\" nothing in troff, for use with C<>.
+.tr \(*W-
+.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
+.ie n \{\
+.    ds -- \(*W-
+.    ds PI pi
+.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
+.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
+.    ds L" ""
+.    ds R" ""
+.    ds C` ""
+.    ds C' ""
+'br\}
+.el\{\
+.    ds -- \|\(em\|
+.    ds PI \(*p
+.    ds L" ``
+.    ds R" ''
+.    ds C`
+.    ds C'
+'br\}
+.\"
+.\" Escape single quotes in literal strings from groff's Unicode transform.
+.ie \n(.g .ds Aq \(aq
+.el       .ds Aq '
+.\"
+.\" If the F register is >0, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
+.\" entries marked with X<> in POD.  Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.\"
+.\" Avoid warning from groff about undefined register 'F'.
+.de IX
+..
+.if !\nF .nr F 0
+.if \nF>0 \{\
+.    de IX
+.    tm Index:\\$1\t\\n%\t"\\$2"
+..
+.    if !\nF==2 \{\
+.        nr % 0
+.        nr F 2
+.    \}
+.\}
+.\"
+.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
+.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
+.    \" fudge factors for nroff and troff
+.if n \{\
+.    ds #H 0
+.    ds #V .8m
+.    ds #F .3m
+.    ds #[ \f1
+.    ds #] \fP
+.\}
+.if t \{\
+.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
+.    ds #V .6m
+.    ds #F 0
+.    ds #[ \&
+.    ds #] \&
+.\}
+.    \" simple accents for nroff and troff
+.if n \{\
+.    ds ' \&
+.    ds ` \&
+.    ds ^ \&
+.    ds , \&
+.    ds ~ ~
+.    ds /
+.\}
+.if t \{\
+.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
+.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
+.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
+.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
+.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
+.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
+.\}
+.    \" troff and (daisy-wheel) nroff accents
+.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
+.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
+.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
+.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
+.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
+.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
+.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
+.ds ae a\h'-(\w'a'u*4/10)'e
+.ds Ae A\h'-(\w'A'u*4/10)'E
+.    \" corrections for vroff
+.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
+.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
+.    \" for low resolution devices (crt and lpr)
+.if \n(.H>23 .if \n(.V>19 \
+\{\
+.    ds : e
+.    ds 8 ss
+.    ds o a
+.    ds d- d\h'-1'\(ga
+.    ds D- D\h'-1'\(hy
+.    ds th \o'bp'
+.    ds Th \o'LP'
+.    ds ae ae
+.    ds Ae AE
+.\}
+.rm #[ #] #H #V #F C
+.\" ========================================================================
+.\"
+.IX Title "X509_check_host 3"
+.TH X509_check_host 3 "2018-11-20" "1.0.2q" "OpenSSL"
+.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.if n .ad l
+.nh
+.SH "NAME"
+X509_check_host, X509_check_email, X509_check_ip, X509_check_ip_asc \- X.509 certificate matching
+.SH "SYNOPSIS"
+.IX Header "SYNOPSIS"
+.Vb 1
+\& #include <openssl/x509.h>
+\&
+\& int X509_check_host(X509 *, const char *name, size_t namelen,
+\&                     unsigned int flags, char **peername);
+\& int X509_check_email(X509 *, const char *address, size_t addresslen,
+\&                      unsigned int flags);
+\& int X509_check_ip(X509 *, const unsigned char *address, size_t addresslen,
+\&                   unsigned int flags);
+\& int X509_check_ip_asc(X509 *, const char *address, unsigned int flags);
+.Ve
+.SH "DESCRIPTION"
+.IX Header "DESCRIPTION"
+The certificate matching functions are used to check whether a
+certificate matches a given host name, email address, or \s-1IP\s0 address.
+The validity of the certificate and its trust level has to be checked by
+other means.
+.PP
+\&\fIX509_check_host()\fR checks if the certificate Subject Alternative
+Name (\s-1SAN\s0) or Subject CommonName (\s-1CN\s0) matches the specified host
+name, which must be encoded in the preferred name syntax described
+in section 3.5 of \s-1RFC 1034.\s0  By default, wildcards are supported
+and they match only in the left-most label, but they may match
+part of that label with an explicit prefix or suffix.  For example,
+by default, the host \fBname\fR \*(L"www.example.com\*(R" would match a
+certificate with a \s-1SAN\s0 or \s-1CN\s0 value of \*(L"*.example.com\*(R", \*(L"w*.example.com\*(R"
+or \*(L"*w.example.com\*(R".
+.PP
+Per section 6.4.2 of \s-1RFC 6125,\s0 \fBname\fR values representing international
+domain names must be given in A\-label form.  The \fBnamelen\fR argument
+must be the number of characters in the name string or zero in which
+case the length is calculated with strlen(\fBname\fR).  When \fBname\fR starts
+with a dot (e.g. \*(L".example.com\*(R"), it will be matched by a certificate
+valid for any sub-domain of \fBname\fR, (see also
+\&\fBX509_CHECK_FLAG_SINGLE_LABEL_SUBDOMAINS\fR below).
+.PP
+When the certificate is matched, and \fBpeername\fR is not \s-1NULL,\s0 a
+pointer to a copy of the matching \s-1SAN\s0 or \s-1CN\s0 from the peer certificate
+is stored at the address passed in \fBpeername\fR.  The application
+is responsible for freeing the peername via \fIOPENSSL_free()\fR when it
+is no longer needed.
+.PP
+\&\fIX509_check_email()\fR checks if the certificate matches the specified
+email \fBaddress\fR.  Only the mailbox syntax of \s-1RFC 822\s0 is supported,
+comments are not allowed, and no attempt is made to normalize quoted
+characters.  The \fBaddresslen\fR argument must be the number of
+characters in the address string or zero in which case the length
+is calculated with strlen(\fBaddress\fR).
+.PP
+\&\fIX509_check_ip()\fR checks if the certificate matches a specified IPv4 or
+IPv6 address.  The \fBaddress\fR array is in binary format, in network
+byte order.  The length is either 4 (IPv4) or 16 (IPv6).  Only
+explicitly marked addresses in the certificates are considered; \s-1IP\s0
+addresses stored in \s-1DNS\s0 names and Common Names are ignored.
+.PP
+\&\fIX509_check_ip_asc()\fR is similar, except that the NUL-terminated
+string \fBaddress\fR is first converted to the internal representation.
+.PP
+The \fBflags\fR argument is usually 0.  It can be the bitwise \s-1OR\s0 of the
+flags:
+.IP "\fBX509_CHECK_FLAG_ALWAYS_CHECK_SUBJECT\fR," 4
+.IX Item "X509_CHECK_FLAG_ALWAYS_CHECK_SUBJECT,"
+.PD 0
+.IP "\fBX509_CHECK_FLAG_NO_WILDCARDS\fR," 4
+.IX Item "X509_CHECK_FLAG_NO_WILDCARDS,"
+.IP "\fBX509_CHECK_FLAG_NO_PARTIAL_WILDCARDS\fR," 4
+.IX Item "X509_CHECK_FLAG_NO_PARTIAL_WILDCARDS,"
+.IP "\fBX509_CHECK_FLAG_MULTI_LABEL_WILDCARDS\fR." 4
+.IX Item "X509_CHECK_FLAG_MULTI_LABEL_WILDCARDS."
+.IP "\fBX509_CHECK_FLAG_SINGLE_LABEL_SUBDOMAINS\fR." 4
+.IX Item "X509_CHECK_FLAG_SINGLE_LABEL_SUBDOMAINS."
+.PD
+.PP
+The \fBX509_CHECK_FLAG_ALWAYS_CHECK_SUBJECT\fR flag causes the function
+to consider the subject \s-1DN\s0 even if the certificate contains at least
+one subject alternative name of the right type (\s-1DNS\s0 name or email
+address as appropriate); the default is to ignore the subject \s-1DN\s0
+when at least one corresponding subject alternative name is present.
+.PP
+If set, \fBX509_CHECK_FLAG_NO_WILDCARDS\fR disables wildcard
+expansion; this only applies to \fBX509_check_host\fR.
+.PP
+If set, \fBX509_CHECK_FLAG_NO_PARTIAL_WILDCARDS\fR suppresses support
+for \*(L"*\*(R" as wildcard pattern in labels that have a prefix or suffix,
+such as: \*(L"www*\*(R" or \*(L"*www\*(R"; this only aplies to \fBX509_check_host\fR.
+.PP
+If set, \fBX509_CHECK_FLAG_MULTI_LABEL_WILDCARDS\fR allows a \*(L"*\*(R" that
+constitutes the complete label of a \s-1DNS\s0 name (e.g. \*(L"*.example.com\*(R")
+to match more than one label in \fBname\fR; this flag only applies
+to \fBX509_check_host\fR.
+.PP
+If set, \fBX509_CHECK_FLAG_SINGLE_LABEL_SUBDOMAINS\fR restricts \fBname\fR
+values which start with \*(L".\*(R", that would otherwise match any sub-domain
+in the peer certificate, to only match direct child sub-domains.
+Thus, for instance, with this flag set a \fBname\fR of \*(L".example.com\*(R"
+would match a peer certificate with a \s-1DNS\s0 name of \*(L"www.example.com\*(R",
+but would not match a peer certificate with a \s-1DNS\s0 name of
+\&\*(L"www.sub.example.com\*(R"; this flag only applies to \fBX509_check_host\fR.
+.SH "RETURN VALUES"
+.IX Header "RETURN VALUES"
+The functions return 1 for a successful match, 0 for a failed match
+and \-1 for an internal error: typically a memory allocation failure
+or an \s-1ASN.1\s0 decoding error.
+.PP
+All functions can also return \-2 if the input is malformed. For example,
+\&\fIX509_check_host()\fR returns \-2 if the provided \fBname\fR contains embedded
+NULs.
+.SH "NOTES"
+.IX Header "NOTES"
+Applications are encouraged to use \fIX509_VERIFY_PARAM_set1_host()\fR
+rather than explicitly calling \fIX509_check_host\fR\|(3). Host name
+checks are out of scope with the \s-1\fIDANE\-EE\s0\fR\|(3) certificate usage,
+and the internal checks will be suppressed as appropriate when
+\&\s-1DANE\s0 support is added to OpenSSL.
+.SH "SEE ALSO"
+.IX Header "SEE ALSO"
+\&\fISSL_get_verify_result\fR\|(3),
+\&\fIX509_VERIFY_PARAM_set1_host\fR\|(3),
+\&\fIX509_VERIFY_PARAM_add1_host\fR\|(3),
+\&\fIX509_VERIFY_PARAM_set1_email\fR\|(3),
+\&\fIX509_VERIFY_PARAM_set1_ip\fR\|(3),
+\&\fIX509_VERIFY_PARAM_set1_ipasc\fR\|(3)
+.SH "HISTORY"
+.IX Header "HISTORY"
+These functions were added in OpenSSL 1.0.2.


Property changes on: trunk/secure/lib/libcrypto/man/X509_check_host.3
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
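
A minimal sketch in C of the host check described above, assuming the peer certificate has already been obtained elsewhere (for example via SSL_get_peer_certificate()) and that certificate validity and trust are verified separately; error handling is abbreviated:

    /* Sketch: match a peer certificate against an expected host name. */
    #include <stdio.h>
    #include <openssl/x509.h>
    #include <openssl/x509v3.h>   /* X509_check_host(), X509_CHECK_FLAG_* */
    #include <openssl/crypto.h>   /* OPENSSL_free() */

    static int check_peer_host(X509 *cert, const char *host)
    {
        char *peername = NULL;
        /* namelen 0: the length is taken with strlen(host) */
        int rc = X509_check_host(cert, host, 0,
                                 X509_CHECK_FLAG_NO_PARTIAL_WILDCARDS,
                                 &peername);
        if (rc == 1) {
            printf("matched SAN/CN: %s\n", peername);
            OPENSSL_free(peername);   /* the caller frees the returned copy */
        } else if (rc == 0) {
            printf("no match for %s\n", host);
        } else {
            printf("internal error or malformed input (rc=%d)\n", rc);
        }
        return rc;
    }
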
Added: trunk/secure/lib/libcrypto/man/X509_check_private_key.3
===================================================================
--- trunk/secure/lib/libcrypto/man/X509_check_private_key.3	                        (rev 0)
+++ trunk/secure/lib/libcrypto/man/X509_check_private_key.3	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,182 @@
+.\" $MidnightBSD$
+.\" Automatically generated by Pod::Man 4.09 (Pod::Simple 3.35)
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" Set up some character translations and predefined strings.  \*(-- will
+.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
+.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
+.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
+.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
+.\" nothing in troff, for use with C<>.
+.tr \(*W-
+.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
+.ie n \{\
+.    ds -- \(*W-
+.    ds PI pi
+.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
+.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
+.    ds L" ""
+.    ds R" ""
+.    ds C` ""
+.    ds C' ""
+'br\}
+.el\{\
+.    ds -- \|\(em\|
+.    ds PI \(*p
+.    ds L" ``
+.    ds R" ''
+.    ds C`
+.    ds C'
+'br\}
+.\"
+.\" Escape single quotes in literal strings from groff's Unicode transform.
+.ie \n(.g .ds Aq \(aq
+.el       .ds Aq '
+.\"
+.\" If the F register is >0, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
+.\" entries marked with X<> in POD.  Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.\"
+.\" Avoid warning from groff about undefined register 'F'.
+.de IX
+..
+.if !\nF .nr F 0
+.if \nF>0 \{\
+.    de IX
+.    tm Index:\\$1\t\\n%\t"\\$2"
+..
+.    if !\nF==2 \{\
+.        nr % 0
+.        nr F 2
+.    \}
+.\}
+.\"
+.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
+.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
+.    \" fudge factors for nroff and troff
+.if n \{\
+.    ds #H 0
+.    ds #V .8m
+.    ds #F .3m
+.    ds #[ \f1
+.    ds #] \fP
+.\}
+.if t \{\
+.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
+.    ds #V .6m
+.    ds #F 0
+.    ds #[ \&
+.    ds #] \&
+.\}
+.    \" simple accents for nroff and troff
+.if n \{\
+.    ds ' \&
+.    ds ` \&
+.    ds ^ \&
+.    ds , \&
+.    ds ~ ~
+.    ds /
+.\}
+.if t \{\
+.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
+.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
+.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
+.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
+.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
+.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
+.\}
+.    \" troff and (daisy-wheel) nroff accents
+.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
+.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
+.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
+.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
+.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
+.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
+.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
+.ds ae a\h'-(\w'a'u*4/10)'e
+.ds Ae A\h'-(\w'A'u*4/10)'E
+.    \" corrections for vroff
+.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
+.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
+.    \" for low resolution devices (crt and lpr)
+.if \n(.H>23 .if \n(.V>19 \
+\{\
+.    ds : e
+.    ds 8 ss
+.    ds o a
+.    ds d- d\h'-1'\(ga
+.    ds D- D\h'-1'\(hy
+.    ds th \o'bp'
+.    ds Th \o'LP'
+.    ds ae ae
+.    ds Ae AE
+.\}
+.rm #[ #] #H #V #F C
+.\" ========================================================================
+.\"
+.IX Title "X509_check_private_key 3"
+.TH X509_check_private_key 3 "2018-11-20" "1.0.2q" "OpenSSL"
+.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.if n .ad l
+.nh
+.SH "NAME"
+X509_check_private_key, X509_REQ_check_private_key \- check the consistency
+of a private key with the public key in an X509 certificate or certificate
+request
+.SH "SYNOPSIS"
+.IX Header "SYNOPSIS"
+.Vb 1
+\& #include <openssl/x509.h>
+\&
+\& int X509_check_private_key(X509 *x, EVP_PKEY *k);
+\&
+\& int X509_REQ_check_private_key(X509_REQ *x, EVP_PKEY *k);
+.Ve
+.SH "DESCRIPTION"
+.IX Header "DESCRIPTION"
+\&\fIX509_check_private_key()\fR function checks the consistency of private
+key \fBk\fR with the public key in \fBx\fR.
+.PP
+\&\fIX509_REQ_check_private_key()\fR is equivalent to \fIX509_check_private_key()\fR
+except that \fBx\fR represents a certificate request of structure \fBX509_REQ\fR.
+.SH "RETURN VALUE"
+.IX Header "RETURN VALUE"
+\&\fIX509_check_private_key()\fR and \fIX509_REQ_check_private_key()\fR return 1 if
+the keys match each other, and 0 if not.
+.PP
+If the key is invalid or an error occurred, the reason code can be
+obtained using \fIERR_get_error\fR\|(3).
+.SH "BUGS"
+.IX Header "BUGS"
+The \fBcheck_private_key\fR functions do not check whether \fBk\fR itself is
+a private key. They merely compare the public material (e.g. the exponent
+and modulus of an \s-1RSA\s0 key) and/or key parameters (e.g. the \s-1EC\s0 parameters of an \s-1EC\s0 key)
+of a key pair. So if you pass a public key to these functions in \fBk\fR, they will
+return success.
+.SH "SEE ALSO"
+.IX Header "SEE ALSO"
+\&\fIERR_get_error\fR\|(3)
+.SH "COPYRIGHT"
+.IX Header "COPYRIGHT"
+Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
+.PP
+Licensed under the OpenSSL license (the \*(L"License\*(R").  You may not use
+this file except in compliance with the License.  You can obtain a copy
+in the file \s-1LICENSE\s0 in the source distribution or at
+<https://www.openssl.org/source/license.html>.


Property changes on: trunk/secure/lib/libcrypto/man/X509_check_private_key.3
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
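
A minimal sketch in C of the consistency check above; the PEM file names ("cert.pem", "key.pem") are placeholders and error handling is abbreviated:

    /* Sketch: check that a certificate and a private key belong together. */
    #include <stdio.h>
    #include <openssl/pem.h>
    #include <openssl/x509.h>
    #include <openssl/err.h>

    int main(void)
    {
        FILE *cf = fopen("cert.pem", "r");   /* placeholder input files */
        FILE *kf = fopen("key.pem", "r");
        if (cf == NULL || kf == NULL)
            return 1;

        X509 *cert = PEM_read_X509(cf, NULL, NULL, NULL);
        EVP_PKEY *pkey = PEM_read_PrivateKey(kf, NULL, NULL, NULL);
        if (cert != NULL && pkey != NULL) {
            if (X509_check_private_key(cert, pkey) == 1)
                printf("certificate and private key match\n");
            else
                printf("mismatch or error: %lu\n", ERR_get_error());
        }

        EVP_PKEY_free(pkey);
        X509_free(cert);
        fclose(kf);
        fclose(cf);
        return 0;
    }
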
Added: trunk/secure/lib/libcrypto/man/X509_cmp_time.3
===================================================================
--- trunk/secure/lib/libcrypto/man/X509_cmp_time.3	                        (rev 0)
+++ trunk/secure/lib/libcrypto/man/X509_cmp_time.3	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,168 @@
+.\" $MidnightBSD$
+.\" Automatically generated by Pod::Man 4.09 (Pod::Simple 3.35)
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" Set up some character translations and predefined strings.  \*(-- will
+.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
+.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
+.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
+.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
+.\" nothing in troff, for use with C<>.
+.tr \(*W-
+.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
+.ie n \{\
+.    ds -- \(*W-
+.    ds PI pi
+.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
+.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
+.    ds L" ""
+.    ds R" ""
+.    ds C` ""
+.    ds C' ""
+'br\}
+.el\{\
+.    ds -- \|\(em\|
+.    ds PI \(*p
+.    ds L" ``
+.    ds R" ''
+.    ds C`
+.    ds C'
+'br\}
+.\"
+.\" Escape single quotes in literal strings from groff's Unicode transform.
+.ie \n(.g .ds Aq \(aq
+.el       .ds Aq '
+.\"
+.\" If the F register is >0, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
+.\" entries marked with X<> in POD.  Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.\"
+.\" Avoid warning from groff about undefined register 'F'.
+.de IX
+..
+.if !\nF .nr F 0
+.if \nF>0 \{\
+.    de IX
+.    tm Index:\\$1\t\\n%\t"\\$2"
+..
+.    if !\nF==2 \{\
+.        nr % 0
+.        nr F 2
+.    \}
+.\}
+.\"
+.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
+.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
+.    \" fudge factors for nroff and troff
+.if n \{\
+.    ds #H 0
+.    ds #V .8m
+.    ds #F .3m
+.    ds #[ \f1
+.    ds #] \fP
+.\}
+.if t \{\
+.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
+.    ds #V .6m
+.    ds #F 0
+.    ds #[ \&
+.    ds #] \&
+.\}
+.    \" simple accents for nroff and troff
+.if n \{\
+.    ds ' \&
+.    ds ` \&
+.    ds ^ \&
+.    ds , \&
+.    ds ~ ~
+.    ds /
+.\}
+.if t \{\
+.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
+.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
+.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
+.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
+.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
+.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
+.\}
+.    \" troff and (daisy-wheel) nroff accents
+.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
+.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
+.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
+.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
+.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
+.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
+.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
+.ds ae a\h'-(\w'a'u*4/10)'e
+.ds Ae A\h'-(\w'A'u*4/10)'E
+.    \" corrections for vroff
+.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
+.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
+.    \" for low resolution devices (crt and lpr)
+.if \n(.H>23 .if \n(.V>19 \
+\{\
+.    ds : e
+.    ds 8 ss
+.    ds o a
+.    ds d- d\h'-1'\(ga
+.    ds D- D\h'-1'\(hy
+.    ds th \o'bp'
+.    ds Th \o'LP'
+.    ds ae ae
+.    ds Ae AE
+.\}
+.rm #[ #] #H #V #F C
+.\" ========================================================================
+.\"
+.IX Title "X509_cmp_time 3"
+.TH X509_cmp_time 3 "2018-11-20" "1.0.2q" "OpenSSL"
+.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.if n .ad l
+.nh
+.SH "NAME"
+X509_cmp_time \- X509 time functions
+.SH "SYNOPSIS"
+.IX Header "SYNOPSIS"
+.Vb 1
+\& X509_cmp_time(const ASN1_TIME *asn1_time, time_t *cmp_time);
+.Ve
+.SH "DESCRIPTION"
+.IX Header "DESCRIPTION"
+\&\fIX509_cmp_time()\fR compares the \s-1ASN1_TIME\s0 in \fBasn1_time\fR with the time in
+<cmp_time>.
+.PP
+\&\fBasn1_time\fR must satisfy the \s-1ASN1_TIME\s0 format mandated by \s-1RFC 5280,\s0 i.e.,
+its format must be either \s-1YYMMDDHHMMSSZ\s0 or \s-1YYYYMMDDHHMMSSZ.\s0
+.PP
+If \fBcmp_time\fR is \s-1NULL\s0 the current time is used.
+.SH "BUGS"
+.IX Header "BUGS"
+Unlike many standard comparison functions, \fIX509_cmp_time()\fR returns 0 on error.
+.SH "RETURN VALUES"
+.IX Header "RETURN VALUES"
+\&\fIX509_cmp_time()\fR returns \-1 if \fBasn1_time\fR is earlier than, or equal to,
+\&\fBcmp_time\fR, and 1 otherwise. It returns 0 on error.
+.SH "COPYRIGHT"
+.IX Header "COPYRIGHT"
+Copyright 2017\-2018 The OpenSSL Project Authors. All Rights Reserved.
+.PP
+Licensed under the OpenSSL license (the \*(L"License\*(R").  You may not use
+this file except in compliance with the License.  You can obtain a copy
+in the file \s-1LICENSE\s0 in the source distribution or at
+<https://www.openssl.org/source/license.html>.


Property changes on: trunk/secure/lib/libcrypto/man/X509_cmp_time.3
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
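
A minimal sketch in C of the comparison above, testing whether a certificate's notAfter time still lies in the future; the certificate is assumed to have been loaded elsewhere, and X509_get_notAfter() is the usual accessor for that field in this OpenSSL version:

    /* Sketch: has the certificate's notAfter time passed? */
    #include <stdio.h>
    #include <openssl/x509.h>

    static int cert_not_expired(X509 *cert)
    {
        /* A NULL cmp_time compares against the current time. */
        int rc = X509_cmp_time(X509_get_notAfter(cert), NULL);
        if (rc == 0) {
            printf("error parsing notAfter\n");
            return 0;
        }
        return rc > 0;   /* >0: notAfter is later than now */
    }
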
Added: trunk/secure/lib/libcrypto/man/d2i_ECPKParameters.3
===================================================================
--- trunk/secure/lib/libcrypto/man/d2i_ECPKParameters.3	                        (rev 0)
+++ trunk/secure/lib/libcrypto/man/d2i_ECPKParameters.3	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,213 @@
+.\" $MidnightBSD$
+.\" Automatically generated by Pod::Man 4.09 (Pod::Simple 3.35)
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" Set up some character translations and predefined strings.  \*(-- will
+.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
+.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
+.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
+.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
+.\" nothing in troff, for use with C<>.
+.tr \(*W-
+.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
+.ie n \{\
+.    ds -- \(*W-
+.    ds PI pi
+.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
+.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
+.    ds L" ""
+.    ds R" ""
+.    ds C` ""
+.    ds C' ""
+'br\}
+.el\{\
+.    ds -- \|\(em\|
+.    ds PI \(*p
+.    ds L" ``
+.    ds R" ''
+.    ds C`
+.    ds C'
+'br\}
+.\"
+.\" Escape single quotes in literal strings from groff's Unicode transform.
+.ie \n(.g .ds Aq \(aq
+.el       .ds Aq '
+.\"
+.\" If the F register is >0, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
+.\" entries marked with X<> in POD.  Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.\"
+.\" Avoid warning from groff about undefined register 'F'.
+.de IX
+..
+.if !\nF .nr F 0
+.if \nF>0 \{\
+.    de IX
+.    tm Index:\\$1\t\\n%\t"\\$2"
+..
+.    if !\nF==2 \{\
+.        nr % 0
+.        nr F 2
+.    \}
+.\}
+.\"
+.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
+.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
+.    \" fudge factors for nroff and troff
+.if n \{\
+.    ds #H 0
+.    ds #V .8m
+.    ds #F .3m
+.    ds #[ \f1
+.    ds #] \fP
+.\}
+.if t \{\
+.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
+.    ds #V .6m
+.    ds #F 0
+.    ds #[ \&
+.    ds #] \&
+.\}
+.    \" simple accents for nroff and troff
+.if n \{\
+.    ds ' \&
+.    ds ` \&
+.    ds ^ \&
+.    ds , \&
+.    ds ~ ~
+.    ds /
+.\}
+.if t \{\
+.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
+.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
+.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
+.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
+.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
+.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
+.\}
+.    \" troff and (daisy-wheel) nroff accents
+.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
+.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
+.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
+.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
+.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
+.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
+.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
+.ds ae a\h'-(\w'a'u*4/10)'e
+.ds Ae A\h'-(\w'A'u*4/10)'E
+.    \" corrections for vroff
+.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
+.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
+.    \" for low resolution devices (crt and lpr)
+.if \n(.H>23 .if \n(.V>19 \
+\{\
+.    ds : e
+.    ds 8 ss
+.    ds o a
+.    ds d- d\h'-1'\(ga
+.    ds D- D\h'-1'\(hy
+.    ds th \o'bp'
+.    ds Th \o'LP'
+.    ds ae ae
+.    ds Ae AE
+.\}
+.rm #[ #] #H #V #F C
+.\" ========================================================================
+.\"
+.IX Title "d2i_ECPKParameters 3"
+.TH d2i_ECPKParameters 3 "2018-11-20" "1.0.2q" "OpenSSL"
+.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.if n .ad l
+.nh
+.SH "NAME"
+d2i_ECPKParameters, i2d_ECPKParameters, d2i_ECPKParameters_bio, i2d_ECPKParameters_bio, d2i_ECPKParameters_fp, i2d_ECPKParameters_fp, ECPKParameters_print, ECPKParameters_print_fp \- Functions for decoding and encoding ASN1 representations of elliptic curve entities
+.SH "SYNOPSIS"
+.IX Header "SYNOPSIS"
+.Vb 1
+\& #include <openssl/ec.h>
+\&
+\& EC_GROUP *d2i_ECPKParameters(EC_GROUP **px, const unsigned char **in, long len);
+\& int i2d_ECPKParameters(const EC_GROUP *x, unsigned char **out);
+\& #define d2i_ECPKParameters_bio(bp,x) ASN1_d2i_bio_of(EC_GROUP,NULL,d2i_ECPKParameters,bp,x)
+\& #define i2d_ECPKParameters_bio(bp,x) ASN1_i2d_bio_of_const(EC_GROUP,i2d_ECPKParameters,bp,x)
+\& #define d2i_ECPKParameters_fp(fp,x) (EC_GROUP *)ASN1_d2i_fp(NULL, \e
+\&                (char *(*)())d2i_ECPKParameters,(fp),(unsigned char **)(x))
+\& #define i2d_ECPKParameters_fp(fp,x) ASN1_i2d_fp(i2d_ECPKParameters,(fp), \e
+\&                (unsigned char *)(x))
+\& int     ECPKParameters_print(BIO *bp, const EC_GROUP *x, int off);
+\& int     ECPKParameters_print_fp(FILE *fp, const EC_GROUP *x, int off);
+.Ve
+.SH "DESCRIPTION"
+.IX Header "DESCRIPTION"
+The ECPKParameters encode and decode routines encode and parse the public parameters for an
+\&\fB\s-1EC_GROUP\s0\fR structure, which represents a curve.
+.PP
+\&\fId2i_ECPKParameters()\fR attempts to decode \fBlen\fR bytes at \fB*in\fR. If 
+successful a pointer to the \fB\s-1EC_GROUP\s0\fR structure is returned. If an error
+occurred then \fB\s-1NULL\s0\fR is returned. If \fBpx\fR is not \fB\s-1NULL\s0\fR then the
+returned structure is written to \fB*px\fR. If \fB*px\fR is not \fB\s-1NULL\s0\fR
+then it is assumed that \fB*px\fR contains a valid \fB\s-1EC_GROUP\s0\fR
+structure and an attempt is made to reuse it. If the call is
+successful \fB*in\fR is incremented to the byte following the
+parsed data.
+.PP
+\&\fIi2d_ECPKParameters()\fR encodes the structure pointed to by \fBx\fR into \s-1DER\s0 format.
+If \fBout\fR is not \fB\s-1NULL\s0\fR it writes the \s-1DER\s0 encoded data to the buffer
+at \fB*out\fR, and increments it to point after the data just written.
+If the return value is negative an error occurred, otherwise it
+returns the length of the encoded data.
+.PP
+If \fB*out\fR is \fB\s-1NULL\s0\fR memory will be allocated for a buffer and the encoded
+data written to it. In this case \fB*out\fR is not incremented and it points to
+the start of the data just written.
+.PP
+\&\fId2i_ECPKParameters_bio()\fR is similar to \fId2i_ECPKParameters()\fR except it attempts
+to parse data from \s-1BIO\s0 \fBbp\fR.
+.PP
+\&\fId2i_ECPKParameters_fp()\fR is similar to \fId2i_ECPKParameters()\fR except it attempts
+to parse data from \s-1FILE\s0 pointer \fBfp\fR.
+.PP
+\&\fIi2d_ECPKParameters_bio()\fR is similar to \fIi2d_ECPKParameters()\fR except it writes
+the encoding of the structure \fBx\fR to \s-1BIO\s0 \fBbp\fR and it
+returns 1 for success and 0 for failure.
+.PP
+\&\fIi2d_ECPKParameters_fp()\fR is similar to \fIi2d_ECPKParameters()\fR except it writes
+the encoding of the structure \fBx\fR to \s-1FILE\s0 pointer \fBfp\fR and it
+returns 1 for success and 0 for failure.
+.PP
+These functions are very similar to the X509 functions described in \fId2i_X509\fR\|(3),
+where further notes and examples are available.
+.PP
+The \fIECPKParameters_print()\fR and \fIECPKParameters_print_fp()\fR functions print a human-readable output
+of the public parameters of the \s-1EC_GROUP\s0 to \fBbp\fR or \fBfp\fR. The output lines are indented by \fBoff\fR spaces.
+.SH "RETURN VALUES"
+.IX Header "RETURN VALUES"
+\&\fId2i_ECPKParameters()\fR, \fId2i_ECPKParameters_bio()\fR and \fId2i_ECPKParameters_fp()\fR return a valid \fB\s-1EC_GROUP\s0\fR structure
+or \fB\s-1NULL\s0\fR if an error occurs.
+.PP
+\&\fIi2d_ECPKParameters()\fR returns the number of bytes successfully encoded or a negative
+value if an error occurs.
+.PP
+\&\fIi2d_ECPKParameters_bio()\fR, \fIi2d_ECPKParameters_fp()\fR, ECPKParameters_print and ECPKParameters_print_fp
+return 1 for success and 0 if an error occurs.
+.SH "SEE ALSO"
+.IX Header "SEE ALSO"
+\&\fIcrypto\fR\|(3), \fIec\fR\|(3), \fIEC_GROUP_new\fR\|(3), \fIEC_GROUP_copy\fR\|(3),
+\&\fIEC_POINT_new\fR\|(3), \fIEC_POINT_add\fR\|(3), \fIEC_KEY_new\fR\|(3),
+\&\fIEC_GFp_simple_method\fR\|(3), \fId2i_X509\fR\|(3)


Property changes on: trunk/secure/lib/libcrypto/man/d2i_ECPKParameters.3
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
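
A minimal sketch in C of an encode/decode round trip with the functions above; the named curve NID_X9_62_prime256v1 is an arbitrary example and error handling is abbreviated:

    /* Sketch: DER-encode EC_GROUP parameters and decode them again. */
    #include <stdio.h>
    #include <openssl/ec.h>
    #include <openssl/objects.h>
    #include <openssl/crypto.h>

    int main(void)
    {
        EC_GROUP *group = EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1);
        unsigned char *der = NULL;
        int len;

        if (group == NULL)
            return 1;
        len = i2d_ECPKParameters(group, &der);   /* *out is NULL: a buffer is allocated */
        if (len > 0) {
            const unsigned char *p = der;        /* d2i advances this pointer */
            EC_GROUP *decoded = d2i_ECPKParameters(NULL, &p, len);
            printf("encoded %d bytes, decode %s\n", len,
                   decoded != NULL ? "ok" : "failed");
            EC_GROUP_free(decoded);
        }
        OPENSSL_free(der);
        EC_GROUP_free(group);
        return 0;
    }
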
Added: trunk/secure/lib/libcrypto/man/ec.3
===================================================================
--- trunk/secure/lib/libcrypto/man/ec.3	                        (rev 0)
+++ trunk/secure/lib/libcrypto/man/ec.3	2019-01-20 05:40:35 UTC (rev 12156)
@@ -0,0 +1,330 @@
+.\" $MidnightBSD$
+.\" Automatically generated by Pod::Man 4.09 (Pod::Simple 3.35)
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" Set up some character translations and predefined strings.  \*(-- will
+.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
+.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
+.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
+.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
+.\" nothing in troff, for use with C<>.
+.tr \(*W-
+.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
+.ie n \{\
+.    ds -- \(*W-
+.    ds PI pi
+.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
+.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
+.    ds L" ""
+.    ds R" ""
+.    ds C` ""
+.    ds C' ""
+'br\}
+.el\{\
+.    ds -- \|\(em\|
+.    ds PI \(*p
+.    ds L" ``
+.    ds R" ''
+.    ds C`
+.    ds C'
+'br\}
+.\"
+.\" Escape single quotes in literal strings from groff's Unicode transform.
+.ie \n(.g .ds Aq \(aq
+.el       .ds Aq '
+.\"
+.\" If the F register is >0, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
+.\" entries marked with X<> in POD.  Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.\"
+.\" Avoid warning from groff about undefined register 'F'.
+.de IX
+..
+.if !\nF .nr F 0
+.if \nF>0 \{\
+.    de IX
+.    tm Index:\\$1\t\\n%\t"\\$2"
+..
+.    if !\nF==2 \{\
+.        nr % 0
+.        nr F 2
+.    \}
+.\}
+.\"
+.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
+.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
+.    \" fudge factors for nroff and troff
+.if n \{\
+.    ds #H 0
+.    ds #V .8m
+.    ds #F .3m
+.    ds #[ \f1
+.    ds #] \fP
+.\}
+.if t \{\
+.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
+.    ds #V .6m
+.    ds #F 0
+.    ds #[ \&
+.    ds #] \&
+.\}
+.    \" simple accents for nroff and troff
+.if n \{\
+.    ds ' \&
+.    ds ` \&
+.    ds ^ \&
+.    ds , \&
+.    ds ~ ~
+.    ds /
+.\}
+.if t \{\
+.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
+.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
+.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
+.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
+.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
+.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
+.\}
+.    \" troff and (daisy-wheel) nroff accents
+.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
+.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
+.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
+.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
+.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
+.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
+.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
+.ds ae a\h'-(\w'a'u*4/10)'e
+.ds Ae A\h'-(\w'A'u*4/10)'E
+.    \" corrections for vroff
+.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
+.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
+.    \" for low resolution devices (crt and lpr)
+.if \n(.H>23 .if \n(.V>19 \
+\{\
+.    ds : e
+.    ds 8 ss
+.    ds o a
+.    ds d- d\h'-1'\(ga
+.    ds D- D\h'-1'\(hy
+.    ds th \o'bp'
+.    ds Th \o'LP'
+.    ds ae ae
+.    ds Ae AE
+.\}
+.rm #[ #] #H #V #F C
+.\" ========================================================================
+.\"
+.IX Title "ec 3"
+.TH ec 3 "2018-11-20" "1.0.2q" "OpenSSL"
+.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.if n .ad l
+.nh
+.SH "NAME"
+ec \- Elliptic Curve functions
+.SH "SYNOPSIS"
+.IX Header "SYNOPSIS"
+.Vb 2
+\& #include <openssl/ec.h>
+\& #include <openssl/bn.h>
+\&
+\& const EC_METHOD *EC_GFp_simple_method(void);
+\& const EC_METHOD *EC_GFp_mont_method(void);
+\& const EC_METHOD *EC_GFp_nist_method(void);
+\& const EC_METHOD *EC_GFp_nistp224_method(void);
+\& const EC_METHOD *EC_GFp_nistp256_method(void);
+\& const EC_METHOD *EC_GFp_nistp521_method(void);
+\&
+\& const EC_METHOD *EC_GF2m_simple_method(void);
+\&
+\& EC_GROUP *EC_GROUP_new(const EC_METHOD *meth);
+\& void EC_GROUP_free(EC_GROUP *group);
+\& void EC_GROUP_clear_free(EC_GROUP *group);
+\& int EC_GROUP_copy(EC_GROUP *dst, const EC_GROUP *src);
+\& EC_GROUP *EC_GROUP_dup(const EC_GROUP *src);
+\& const EC_METHOD *EC_GROUP_method_of(const EC_GROUP *group);
+\& int EC_METHOD_get_field_type(const EC_METHOD *meth);
+\& int EC_GROUP_set_generator(EC_GROUP *group, const EC_POINT *generator, const BIGNUM *order, const BIGNUM *cofactor);
+\& const EC_POINT *EC_GROUP_get0_generator(const EC_GROUP *group);
+\& int EC_GROUP_get_order(const EC_GROUP *group, BIGNUM *order, BN_CTX *ctx);
+\& int EC_GROUP_get_cofactor(const EC_GROUP *group, BIGNUM *cofactor, BN_CTX *ctx);
+\& void EC_GROUP_set_curve_name(EC_GROUP *group, int nid);
+\& int EC_GROUP_get_curve_name(const EC_GROUP *group);
+\& void EC_GROUP_set_asn1_flag(EC_GROUP *group, int flag);
+\& int EC_GROUP_get_asn1_flag(const EC_GROUP *group);
+\& void EC_GROUP_set_point_conversion_form(EC_GROUP *group, point_conversion_form_t form);
+\& point_conversion_form_t EC_GROUP_get_point_conversion_form(const EC_GROUP *);
+\& unsigned char *EC_GROUP_get0_seed(const EC_GROUP *x);
+\& size_t EC_GROUP_get_seed_len(const EC_GROUP *);
+\& size_t EC_GROUP_set_seed(EC_GROUP *, const unsigned char *, size_t len);
+\& int EC_GROUP_set_curve_GFp(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
+\& int EC_GROUP_get_curve_GFp(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM *b, BN_CTX *ctx);
+\& int EC_GROUP_set_curve_GF2m(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
+\& int EC_GROUP_get_curve_GF2m(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM *b, BN_CTX *ctx);
+\& int EC_GROUP_get_degree(const EC_GROUP *group);
+\& int EC_GROUP_check(const EC_GROUP *group, BN_CTX *ctx);
+\& int EC_GROUP_check_discriminant(const EC_GROUP *group, BN_CTX *ctx);
+\& int EC_GROUP_cmp(const EC_GROUP *a, const EC_GROUP *b, BN_CTX *ctx);
+\& EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
+\& EC_GROUP *EC_GROUP_new_curve_GF2m(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
+\& EC_GROUP *EC_GROUP_new_by_curve_name(int nid);
+\&
+\& size_t EC_get_builtin_curves(EC_builtin_curve *r, size_t nitems);
+\&
+\& EC_POINT *EC_POINT_new(const EC_GROUP *group);
+\& void EC_POINT_free(EC_POINT *point);
+\& void EC_POINT_clear_free(EC_POINT *point);
+\& int EC_POINT_copy(EC_POINT *dst, const EC_POINT *src);
+\& EC_POINT *EC_POINT_dup(const EC_POINT *src, const EC_GROUP *group);
+\& const EC_METHOD *EC_POINT_method_of(const EC_POINT *point);
+\& int EC_POINT_set_to_infinity(const EC_GROUP *group, EC_POINT *point);
+\& int EC_POINT_set_Jprojective_coordinates_GFp(const EC_GROUP *group, EC_POINT *p,
+\&        const BIGNUM *x, const BIGNUM *y, const BIGNUM *z, BN_CTX *ctx);
+\& int EC_POINT_get_Jprojective_coordinates_GFp(const EC_GROUP *group,
+\&        const EC_POINT *p, BIGNUM *x, BIGNUM *y, BIGNUM *z, BN_CTX *ctx);
+\& int EC_POINT_set_affine_coordinates_GFp(const EC_GROUP *group, EC_POINT *p,
+\&        const BIGNUM *x, const BIGNUM *y, BN_CTX *ctx);
+\& int EC_POINT_get_affine_coordinates_GFp(const EC_GROUP *group,
+\&        const EC_POINT *p, BIGNUM *x, BIGNUM *y, BN_CTX *ctx);
+\& int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *p,
+\&        const BIGNUM *x, int y_bit, BN_CTX *ctx);
+\& int EC_POINT_set_affine_coordinates_GF2m(const EC_GROUP *group, EC_POINT *p,
+\&        const BIGNUM *x, const BIGNUM *y, BN_CTX *ctx);
+\& int EC_POINT_get_affine_coordinates_GF2m(const EC_GROUP *group,
+\&        const EC_POINT *p, BIGNUM *x, BIGNUM *y, BN_CTX *ctx);
+\& int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *p,
+\&        const BIGNUM *x, int y_bit, BN_CTX *ctx);
+\& size_t EC_POINT_point2oct(const EC_GROUP *group, const EC_POINT *p,
+\&        point_conversion_form_t form,
+\&        unsigned char *buf, size_t len, BN_CTX *ctx);
+\& int EC_POINT_oct2point(const EC_GROUP *group, EC_POINT *p,
+\&        const unsigned char *buf, size_t len, BN_CTX *ctx);
+\& BIGNUM *EC_POINT_point2bn(const EC_GROUP *, const EC_POINT *,
+\&        point_conversion_form_t form, BIGNUM *, BN_CTX *);
+\& EC_POINT *EC_POINT_bn2point(const EC_GROUP *, const BIGNUM *,
+\&        EC_POINT *, BN_CTX *);
+\& char *EC_POINT_point2hex(const EC_GROUP *, const EC_POINT *,
+\&        point_conversion_form_t form, BN_CTX *);
+\& EC_POINT *EC_POINT_hex2point(const EC_GROUP *, const char *,
+\&        EC_POINT *, BN_CTX *);
+\&
+\& int EC_POINT_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a, const EC_POINT *b, BN_CTX *ctx);
+\& int EC_POINT_dbl(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a, BN_CTX *ctx);
+\& int EC_POINT_invert(const EC_GROUP *group, EC_POINT *a, BN_CTX *ctx);
+\& int EC_POINT_is_at_infinity(const EC_GROUP *group, const EC_POINT *p);
+\& int EC_POINT_is_on_curve(const EC_GROUP *group, const EC_POINT *point, BN_CTX *ctx);
+\& int EC_POINT_cmp(const EC_GROUP *group, const EC_POINT *a, const EC_POINT *b, BN_CTX *ctx);
+\& int EC_POINT_make_affine(const EC_GROUP *group, EC_POINT *point, BN_CTX *ctx);
+\& int EC_POINTs_make_affine(const EC_GROUP *group, size_t num, EC_POINT *points[], BN_CTX *ctx);
+\& int EC_POINTs_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *n, size_t num, const EC_POINT *p[], const BIGNUM *m[], BN_CTX *ctx);
+\& int EC_POINT_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *n, const EC_POINT *q, const BIGNUM *m, BN_CTX *ctx);
+\& int EC_GROUP_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+\& int EC_GROUP_have_precompute_mult(const EC_GROUP *group);
+\&
+\& int EC_GROUP_get_basis_type(const EC_GROUP *);
+\& int EC_GROUP_get_trinomial_basis(const EC_GROUP *, unsigned int *k);
+\& int EC_GROUP_get_pentanomial_basis(const EC_GROUP *, unsigned int *k1, 
+\&        unsigned int *k2, unsigned int *k3);
+\& EC_GROUP *d2i_ECPKParameters(EC_GROUP **, const unsigned char **in, long len);
+\& int i2d_ECPKParameters(const EC_GROUP *, unsigned char **out);
+\& #define d2i_ECPKParameters_bio(bp,x) ASN1_d2i_bio_of(EC_GROUP,NULL,d2i_ECPKParameters,bp,x)
+\& #define i2d_ECPKParameters_bio(bp,x) ASN1_i2d_bio_of_const(EC_GROUP,i2d_ECPKParameters,bp,x)
+\& #define d2i_ECPKParameters_fp(fp,x) (EC_GROUP *)ASN1_d2i_fp(NULL, \e
+\&                (char *(*)())d2i_ECPKParameters,(fp),(unsigned char **)(x))
+\& #define i2d_ECPKParameters_fp(fp,x) ASN1_i2d_fp(i2d_ECPKParameters,(fp), \e
+\&                (unsigned char *)(x))
+\& int     ECPKParameters_print(BIO *bp, const EC_GROUP *x, int off);
+\& int     ECPKParameters_print_fp(FILE *fp, const EC_GROUP *x, int off);
+\&
+\& EC_KEY *EC_KEY_new(void);
+\& int EC_KEY_get_flags(const EC_KEY *key);
+\& void EC_KEY_set_flags(EC_KEY *key, int flags);
+\& void EC_KEY_clear_flags(EC_KEY *key, int flags);
+\& EC_KEY *EC_KEY_new_by_curve_name(int nid);
+\& void EC_KEY_free(EC_KEY *key);
+\& EC_KEY *EC_KEY_copy(EC_KEY *dst, const EC_KEY *src);
+\& EC_KEY *EC_KEY_dup(const EC_KEY *src);
+\& int EC_KEY_up_ref(EC_KEY *key);
+\& const EC_GROUP *EC_KEY_get0_group(const EC_KEY *key);
+\& int EC_KEY_set_group(EC_KEY *key, const EC_GROUP *group);
+\& const BIGNUM *EC_KEY_get0_private_key(const EC_KEY *key);
+\& int EC_KEY_set_private_key(EC_KEY *key, const BIGNUM *prv);
+\& const EC_POINT *EC_KEY_get0_public_key(const EC_KEY *key);
+\& int EC_KEY_set_public_key(EC_KEY *key, const EC_POINT *pub);
+\& unsigned EC_KEY_get_enc_flags(const EC_KEY *key);
+\& void EC_KEY_set_enc_flags(EC_KEY *eckey, unsigned int flags);
+\& point_conversion_form_t EC_KEY_get_conv_form(const EC_KEY *key);
+\& void EC_KEY_set_conv_form(EC_KEY *eckey, point_conversion_form_t cform);
+\& void *EC_KEY_get_key_method_data(EC_KEY *key, 
+\&        void *(*dup_func)(void *), void (*free_func)(void *), void (*clear_free_func)(void *));
+\& void EC_KEY_insert_key_method_data(EC_KEY *key, void *data,
+\&        void *(*dup_func)(void *), void (*free_func)(void *), void (*clear_free_func)(void *));
+\& void EC_KEY_set_asn1_flag(EC_KEY *eckey, int asn1_flag);
+\& int EC_KEY_precompute_mult(EC_KEY *key, BN_CTX *ctx);
+\& int EC_KEY_generate_key(EC_KEY *key);
+\& int EC_KEY_check_key(const EC_KEY *key);
+\& int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x, BIGNUM *y);
+\&
+\& EC_KEY *d2i_ECPrivateKey(EC_KEY **key, const unsigned char **in, long len);
+\& int i2d_ECPrivateKey(EC_KEY *key, unsigned char **out);
+\&
+\& EC_KEY *d2i_ECParameters(EC_KEY **key, const unsigned char **in, long len);
+\& int i2d_ECParameters(EC_KEY *key, unsigned char **out);
+\&
+\& EC_KEY *o2i_ECPublicKey(EC_KEY **key, const unsigned char **in, long len);
+\& int i2o_ECPublicKey(EC_KEY *key, unsigned char **out);
+\& int    ECParameters_print(BIO *bp, const EC_KEY *key);
+\& int    EC_KEY_print(BIO *bp, const EC_KEY *key, int off);
+\& int    ECParameters_print_fp(FILE *fp, const EC_KEY *key);
+\& int    EC_KEY_print_fp(FILE *fp, const EC_KEY *key, int off);
+\& #define ECParameters_dup(x) ASN1_dup_of(EC_KEY,i2d_ECParameters,d2i_ECParameters,x)
+\& #define EVP_PKEY_CTX_set_ec_paramgen_curve_nid(ctx, nid) \e
+\&        EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_EC, EVP_PKEY_OP_PARAMGEN, \e
+\&                                EVP_PKEY_CTRL_EC_PARAMGEN_CURVE_NID, nid, NULL)
+.Ve
+.SH "DESCRIPTION"
+.IX Header "DESCRIPTION"
+This library provides an extensive set of functions for performing operations on elliptic curves over finite fields.
+In general an elliptic curve is one with an equation of the form:
+.PP
+y^2 = x^3 + ax + b
+.PP
+An \fB\s-1EC_GROUP\s0\fR structure is used to represent the definition of an elliptic curve. Points on a curve are stored using an
+\&\fB\s-1EC_POINT\s0\fR structure. An \fB\s-1EC_KEY\s0\fR is used to hold a private/public key pair, where a private key is simply a \s-1BIGNUM\s0 and a
+public key is a point on a curve (represented by an \fB\s-1EC_POINT\s0\fR).
+.PP
+The library contains a number of alternative implementations of the different functions. Each implementation is optimised
+for different scenarios. No matter which implementation is being used, the interface remains the same. The library
+handles calling the correct implementation when an interface function is invoked. An implementation is represented by
+an \fB\s-1EC_METHOD\s0\fR structure.
+.PP
+The creation and destruction of \fB\s-1EC_GROUP\s0\fR objects is described in \fIEC_GROUP_new\fR\|(3). Functions for
+manipulating \fB\s-1EC_GROUP\s0\fR objects are described in \fIEC_GROUP_copy\fR\|(3).
+.PP
+Functions for creating, destroying and manipulating \fB\s-1EC_POINT\s0\fR objects are explained in \fIEC_POINT_new\fR\|(3),
+whilst functions for performing mathematical operations and tests on \fBEC_POINTs\fR are covered in \fIEC_POINT_add\fR\|(3).
+.PP
+For working with private and public keys refer to \fIEC_KEY_new\fR\|(3). Implementations are covered in
+\&\fIEC_GFp_simple_method\fR\|(3).
+.PP
+For information on encoding and decoding curve parameters to and from \s-1ASN1\s0 see \fId2i_ECPKParameters\fR\|(3).
+.SH "SEE ALSO"
+.IX Header "SEE ALSO"
+\&\fIcrypto\fR\|(3), \fIEC_GROUP_new\fR\|(3), \fIEC_GROUP_copy\fR\|(3),
+\&\fIEC_POINT_new\fR\|(3), \fIEC_POINT_add\fR\|(3), \fIEC_KEY_new\fR\|(3),
+\&\fIEC_GFp_simple_method\fR\|(3), \fId2i_ECPKParameters\fR\|(3)


Property changes on: trunk/secure/lib/libcrypto/man/ec.3
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
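
A minimal sketch in C tying the EC_GROUP and EC_KEY pieces above together: create a key on a named curve, generate a key pair and sanity-check it. The EC calls used appear in the synopsis above; OBJ_nid2sn() from <openssl/objects.h> is only used to print the curve's short name, NID_X9_62_prime256v1 is an arbitrary example, and error handling is abbreviated:

    /* Sketch: create an EC key on a named curve, generate and check it. */
    #include <stdio.h>
    #include <openssl/ec.h>
    #include <openssl/objects.h>

    int main(void)
    {
        EC_KEY *key = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1);

        if (key == NULL)
            return 1;
        if (EC_KEY_generate_key(key) == 1 && EC_KEY_check_key(key) == 1) {
            const EC_GROUP *group = EC_KEY_get0_group(key);
            printf("generated key on curve %s\n",
                   OBJ_nid2sn(EC_GROUP_get_curve_name(group)));
        }
        EC_KEY_free(key);
        return 0;
    }
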

