summarylogtreecommitdiffstats
diff options
context:
space:
mode:
authorAllen Zhong2017-04-29 00:45:46 +0800
committerAllen Zhong2017-04-29 00:45:46 +0800
commit46413113645dd62645c7340822285f271128e43d (patch)
tree7230f36b7cd9ae90cfcfbd37242db0896b826f1d
parentc6c103f3886018ddd87846010b1a39359555fe39 (diff)
parent3d2c209b72a0908dad5122c848e1dacebb2c477f (diff)
downloadaur-46413113645dd62645c7340822285f271128e43d.tar.gz
Merge branch 'master' of https://aur.archlinux.org/openssl-chacha20
-rw-r--r--.SRCINFO20
-rw-r--r--PKGBUILD26
-rw-r--r--ca-dir.patch44
-rw-r--r--no-rpath.patch11
-rw-r--r--openssl__1.1.0_chacha20_poly1305.patch60
-rw-r--r--openssl__chacha20_poly1305_draft_and_rfc_ossl102j.patch4718
-rw-r--r--ssl3-test-failure.patch26
7 files changed, 97 insertions, 4808 deletions
diff --git a/.SRCINFO b/.SRCINFO
index 4c10efa46667..e82a51edb9d6 100644
--- a/.SRCINFO
+++ b/.SRCINFO
@@ -1,6 +1,6 @@
pkgbase = openssl-chacha20
pkgdesc = The Open Source toolkit for Secure Sockets Layer and Transport Layer Security with Chacha20 cipher
- pkgver = 1.0.2.k
+ pkgver = 1.1.0.e
pkgrel = 1
url = https://www.openssl.org
arch = i686
@@ -8,23 +8,19 @@ pkgbase = openssl-chacha20
license = custom:BSD
depends = perl
optdepends = ca-certificates
- provides = openssl=1.0.2.k
+ provides = openssl=1.1.0.e
conflicts = openssl
options = !makeflags
backup = etc/ssl/openssl.cnf
- source = https://www.openssl.org/source/openssl-1.0.2k.tar.gz
- source = https://www.openssl.org/source/openssl-1.0.2k.tar.gz.asc
- source = no-rpath.patch
- source = ssl3-test-failure.patch
+ source = https://www.openssl.org/source/openssl-1.1.0e.tar.gz
+ source = https://www.openssl.org/source/openssl-1.1.0e.tar.gz.asc
source = ca-dir.patch
- source = openssl__chacha20_poly1305_draft_and_rfc_ossl102j.patch
+ source = openssl__1.1.0_chacha20_poly1305.patch
validpgpkeys = 8657ABB260F056B1E5190839D9C4D26D0E604491
- sha256sums = 6b3977c61f2aedf0f96367dcfb5c6e578cf37e7b8d913b4ecb6643c3cb88d8c0
+ sha256sums = 57be8618979d80c910728cfc99369bf97b2a1abd8f366ab6ebdee8975ad3874c
sha256sums = SKIP
- sha256sums = 754d6107a306311e15a1db6a1cc031b81691c8b9865e8809ac60ca6f184c957c
- sha256sums = c54ae87c602eaa1530a336ab7c6e22e12898e1941012349c153e52553df64a13
- sha256sums = 9e8126f3a748f4c1d6fe34d4436de72b16a40e97a6d18234d2e88caa179d50c4
- sha256sums = d6f9427d5cb63c7299563c201cd8708c7166e0f8c98b57a1fee69767362bf0f7
+ sha256sums = 90c7411fed0157116f2df8f4be755aaf5a26e8484351b4e6a79492805d5f2790
+ sha256sums = 3c1b39f8d17dc384486ebe61aa783cc4a649ed9d7b633c02f36693b8af265160
pkgname = openssl-chacha20
diff --git a/PKGBUILD b/PKGBUILD
index 5423623dda02..b01e1325ff96 100644
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -3,7 +3,7 @@
_pkgname=openssl
pkgname=${_pkgname}-chacha20
-_ver=1.0.2k
+_ver=1.1.0e
# use a pacman compatible version scheme
pkgver=${_ver/[a-z]/.${_ver//[0-9.]/}}
#pkgver=$_ver
@@ -20,33 +20,23 @@ conflicts=('openssl')
provides=("openssl=${pkgver}")
source=("https://www.openssl.org/source/${_pkgname}-${_ver}.tar.gz"
"https://www.openssl.org/source/${_pkgname}-${_ver}.tar.gz.asc"
- 'no-rpath.patch'
- 'ssl3-test-failure.patch'
'ca-dir.patch'
- 'openssl__chacha20_poly1305_draft_and_rfc_ossl102j.patch')
-sha256sums=('6b3977c61f2aedf0f96367dcfb5c6e578cf37e7b8d913b4ecb6643c3cb88d8c0'
+ 'openssl__1.1.0_chacha20_poly1305.patch')
+sha256sums=('57be8618979d80c910728cfc99369bf97b2a1abd8f366ab6ebdee8975ad3874c'
'SKIP'
- '754d6107a306311e15a1db6a1cc031b81691c8b9865e8809ac60ca6f184c957c'
- 'c54ae87c602eaa1530a336ab7c6e22e12898e1941012349c153e52553df64a13'
- '9e8126f3a748f4c1d6fe34d4436de72b16a40e97a6d18234d2e88caa179d50c4'
- 'd6f9427d5cb63c7299563c201cd8708c7166e0f8c98b57a1fee69767362bf0f7')
+ '90c7411fed0157116f2df8f4be755aaf5a26e8484351b4e6a79492805d5f2790'
+ '3c1b39f8d17dc384486ebe61aa783cc4a649ed9d7b633c02f36693b8af265160')
validpgpkeys=('8657ABB260F056B1E5190839D9C4D26D0E604491')
prepare() {
cd $srcdir/$_pkgname-$_ver
- # remove rpath: http://bugs.archlinux.org/task/14367
- patch -p0 -i $srcdir/no-rpath.patch
-
- # disable a test that fails when ssl3 is disabled
- patch -p1 -i $srcdir/ssl3-test-failure.patch
-
# set ca dir to /etc/ssl by default
patch -p0 -i $srcdir/ca-dir.patch
# Cloudflare patch
- # https://github.com/cloudflare/sslconfig/blob/master/patches/openssl__chacha20_poly1305_draft_and_rfc_ossl102j.patch
- patch -p1 -i $srcdir/openssl__chacha20_poly1305_draft_and_rfc_ossl102j.patch
+ # https://raw.githubusercontent.com/cloudflare/sslconfig/master/patches/openssl__1.1.0_chacha20_poly1305.patch
+ patch -p1 -i $srcdir/openssl__1.1.0_chacha20_poly1305.patch
}
build() {
@@ -83,6 +73,6 @@ check() {
package() {
cd $srcdir/$_pkgname-$_ver
- make INSTALL_PREFIX=$pkgdir MANDIR=/usr/share/man MANSUFFIX=ssl install
+ make DESTDIR=$pkgdir MANDIR=/usr/share/man MANSUFFIX=ssl install_sw install_ssldirs install_man_docs
install -D -m644 LICENSE $pkgdir/usr/share/licenses/$_pkgname/LICENSE
}
diff --git a/ca-dir.patch b/ca-dir.patch
index 41d1386d3d06..1daba849b4ca 100644
--- a/ca-dir.patch
+++ b/ca-dir.patch
@@ -1,27 +1,16 @@
---- apps/CA.pl.in 2006-04-28 02:30:49.000000000 +0200
-+++ apps/CA.pl.in 2010-04-01 00:35:02.600553509 +0200
-@@ -53,7 +53,7 @@
- $X509="$openssl x509";
- $PKCS12="$openssl pkcs12";
+--- apps/CA.pl.in 2016-09-26 11:46:04.000000000 +0200
++++ apps/CA.pl.in 2016-11-01 16:02:16.709616823 +0100
+@@ -33,7 +33,7 @@
+ my $PKCS12 = "$openssl pkcs12";
--$CATOP="./demoCA";
-+$CATOP="/etc/ssl";
- $CAKEY="cakey.pem";
- $CAREQ="careq.pem";
- $CACERT="cacert.pem";
---- apps/CA.sh 2009-10-15 19:27:47.000000000 +0200
-+++ apps/CA.sh 2010-04-01 00:35:02.600553509 +0200
-@@ -68,7 +68,7 @@
- X509="$OPENSSL x509"
- PKCS12="openssl pkcs12"
-
--if [ -z "$CATOP" ] ; then CATOP=./demoCA ; fi
-+if [ -z "$CATOP" ] ; then CATOP=/etc/ssl ; fi
- CAKEY=./cakey.pem
- CAREQ=./careq.pem
- CACERT=./cacert.pem
---- apps/openssl.cnf 2009-04-04 20:09:43.000000000 +0200
-+++ apps/openssl.cnf 2010-04-01 00:35:02.607220681 +0200
+ # default openssl.cnf file has setup as per the following
+-my $CATOP = "./demoCA";
++my $CATOP = "/etc/ssl";
+ my $CAKEY = "cakey.pem";
+ my $CAREQ = "careq.pem";
+ my $CACERT = "cacert.pem";
+--- apps/openssl.cnf 2016-09-26 11:46:04.000000000 +0200
++++ apps/openssl.cnf 2016-11-01 16:02:48.378503427 +0100
@@ -39,7 +39,7 @@
####################################################################
[ CA_default ]
@@ -31,3 +20,12 @@
certs = $dir/certs # Where the issued certs are kept
crl_dir = $dir/crl # Where the issued crl are kept
database = $dir/index.txt # database index file.
+@@ -323,7 +323,7 @@
+ [ tsa_config1 ]
+
+ # These are used by the TSA reply generation only.
+-dir = ./demoCA # TSA root directory
++dir = /etc/ssl # TSA root directory
+ serial = $dir/tsaserial # The current serial number (mandatory)
+ crypto_device = builtin # OpenSSL engine to use for signing
+ signer_cert = $dir/tsacert.pem # The TSA signing certificate
diff --git a/no-rpath.patch b/no-rpath.patch
deleted file mode 100644
index ebd95e23d397..000000000000
--- a/no-rpath.patch
+++ /dev/null
@@ -1,11 +0,0 @@
---- Makefile.shared.no-rpath 2005-06-23 22:47:54.000000000 +0200
-+++ Makefile.shared 2005-11-16 22:35:37.000000000 +0100
-@@ -153,7 +153,7 @@
- NOALLSYMSFLAGS='-Wl,--no-whole-archive'; \
- SHAREDFLAGS="$(CFLAGS) $(SHARED_LDFLAGS) -shared -Wl,-Bsymbolic -Wl,-soname=$$SHLIB$$SHLIB_SOVER$$SHLIB_SUFFIX"
-
--DO_GNU_APP=LDFLAGS="$(CFLAGS) -Wl,-rpath,$(LIBRPATH)"
-+DO_GNU_APP=LDFLAGS="$(CFLAGS)"
-
- #This is rather special. It's a special target with which one can link
- #applications without bothering with any features that have anything to
diff --git a/openssl__1.1.0_chacha20_poly1305.patch b/openssl__1.1.0_chacha20_poly1305.patch
new file mode 100644
index 000000000000..34da57b4af1d
--- /dev/null
+++ b/openssl__1.1.0_chacha20_poly1305.patch
@@ -0,0 +1,60 @@
+diff --git a/ssl/s3_lib.c b/ssl/s3_lib.c
+index e94ee83..3cd7e3a 100644
+--- a/ssl/s3_lib.c
++++ b/ssl/s3_lib.c
+@@ -3582,6 +3582,7 @@ const SSL_CIPHER *ssl3_choose_cipher(SSL *s, STACK_OF(SSL_CIPHER) *clnt,
+ STACK_OF(SSL_CIPHER) *prio, *allow;
+ int i, ii, ok;
+ unsigned long alg_k, alg_a, mask_k, mask_a;
++ int use_chacha = 0;
+
+ /* Let's see which ciphers we can support */
+
+@@ -3610,13 +3611,20 @@ const SSL_CIPHER *ssl3_choose_cipher(SSL *s, STACK_OF(SSL_CIPHER) *clnt,
+ fprintf(stderr, "%p:%s\n", (void *)c, c->name);
+ }
+ #endif
+-
++retry:
+ if (s->options & SSL_OP_CIPHER_SERVER_PREFERENCE || tls1_suiteb(s)) {
+ prio = srvr;
+ allow = clnt;
++ /* Use ChaCha20+Poly1305 if it's client's most preferred cipher suite */
++ if (sk_SSL_CIPHER_num(clnt) > 0) {
++ c = sk_SSL_CIPHER_value(clnt, 0);
++ if (c->algorithm_enc == SSL_CHACHA20POLY1305)
++ use_chacha = 1;
++ }
+ } else {
+ prio = clnt;
+ allow = srvr;
++ use_chacha = 1;
+ }
+
+ tls1_set_cert_validity(s);
+@@ -3634,6 +3642,10 @@ const SSL_CIPHER *ssl3_choose_cipher(SSL *s, STACK_OF(SSL_CIPHER) *clnt,
+ DTLS_VERSION_GT(s->version, c->max_dtls)))
+ continue;
+
++ /* Skip ChaCha unless top client priority */
++ if (c->algorithm_enc == SSL_CHACHA20POLY1305 && !use_chacha)
++ continue;
++
+ mask_k = s->s3->tmp.mask_k;
+ mask_a = s->s3->tmp.mask_a;
+ #ifndef OPENSSL_NO_SRP
+@@ -3687,6 +3699,14 @@ const SSL_CIPHER *ssl3_choose_cipher(SSL *s, STACK_OF(SSL_CIPHER) *clnt,
+ break;
+ }
+ }
++
++ if (ret == NULL && !use_chacha) {
++ /* If no shared cipher was found due to some unusual preferences, try
++ * again with CHACHA enabled even if not top priority */
++ use_chacha = 1;
++ goto retry;
++ }
++
+ return (ret);
+ }
+
diff --git a/openssl__chacha20_poly1305_draft_and_rfc_ossl102j.patch b/openssl__chacha20_poly1305_draft_and_rfc_ossl102j.patch
deleted file mode 100644
index cdb767379f85..000000000000
--- a/openssl__chacha20_poly1305_draft_and_rfc_ossl102j.patch
+++ /dev/null
@@ -1,4718 +0,0 @@
-From dcf9b5698b8658c9248327b3fdb280090c5c78ec Mon Sep 17 00:00:00 2001
-From: vkrasnov <vlad@cloudflare.com>
-Date: Tue, 4 Oct 2016 15:47:32 -0700
-Subject: [PATCH] ChaCha20-Poly1305 draft and RFC cipher suites for OpenSSL
- 1.0.2j
-
----
- Configure | 44 +-
- Makefile.org | 4 +-
- crypto/chacha20_poly1305/Makefile | 89 +
- .../asm/chacha20_poly1305_x86_64.pl | 2299 ++++++++++++++++++++
- crypto/chacha20_poly1305/asm/chacha20_x86_64.pl | 415 ++++
- crypto/chacha20_poly1305/asm/poly1305_x86_64.pl | 280 +++
- crypto/chacha20_poly1305/chacha20.c | 142 ++
- crypto/chacha20_poly1305/chacha20poly1305.h | 64 +
- crypto/chacha20_poly1305/poly1305.c | 355 +++
- crypto/evp/Makefile | 8 +-
- crypto/evp/c_allc.c | 5 +
- crypto/evp/e_chacha20_poly1305.c | 362 +++
- crypto/evp/evp.h | 5 +
- crypto/objects/obj_dat.h | 13 +-
- crypto/objects/obj_mac.h | 8 +
- crypto/objects/obj_mac.num | 2 +
- crypto/objects/objects.txt | 2 +
- ssl/s3_lib.c | 128 +-
- ssl/ssl.h | 2 +
- ssl/ssl_ciph.c | 31 +-
- ssl/ssl_locl.h | 2 +
- ssl/tls1.h | 26 +
- 22 files changed, 4260 insertions(+), 26 deletions(-)
- create mode 100644 crypto/chacha20_poly1305/Makefile
- create mode 100755 crypto/chacha20_poly1305/asm/chacha20_poly1305_x86_64.pl
- create mode 100644 crypto/chacha20_poly1305/asm/chacha20_x86_64.pl
- create mode 100644 crypto/chacha20_poly1305/asm/poly1305_x86_64.pl
- create mode 100644 crypto/chacha20_poly1305/chacha20.c
- create mode 100644 crypto/chacha20_poly1305/chacha20poly1305.h
- create mode 100644 crypto/chacha20_poly1305/poly1305.c
- create mode 100644 crypto/evp/e_chacha20_poly1305.c
-
-diff --git a/Configure b/Configure
-index c39f71a..f5f7c06 100755
---- a/Configure
-+++ b/Configure
-@@ -150,25 +150,25 @@ my $tlib="-lnsl -lsocket";
- my $bits1="THIRTY_TWO_BIT ";
- my $bits2="SIXTY_FOUR_BIT ";
-
--my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o x86-gf2m.o::des-586.o crypt586.o:aes-586.o vpaes-x86.o aesni-x86.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:cmll-x86.o:ghash-x86.o:";
-+my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o x86-gf2m.o::des-586.o crypt586.o:aes-586.o vpaes-x86.o aesni-x86.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:cmll-x86.o:ghash-x86.o::";
-
- my $x86_elf_asm="$x86_asm:elf";
-
--my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o:ecp_nistz256.o ecp_nistz256-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o aesni-gcm-x86_64.o:";
--my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o:::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void";
--my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o::des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o::::::camellia.o cmll_misc.o cmll_cbc.o cmllt4-sparcv9.o:ghash-sparcv9.o::void";
--my $sparcv8_asm=":sparcv8.o::des_enc-sparc.o fcrypt_b.o:::::::::::::void";
--my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o::::::sha1-alpha.o:::::::ghash-alpha.o::void";
--my $mips64_asm=":bn-mips.o mips-mont.o:::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::";
-+my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o:ecp_nistz256.o ecp_nistz256-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o aesni-gcm-x86_64.o:chacha20_poly1305_x86_64.o poly1305_x86_64.o chacha20_x86_64.o:";
-+my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o:::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o:::void";
-+my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o::des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o::::::camellia.o cmll_misc.o cmll_cbc.o cmllt4-sparcv9.o:ghash-sparcv9.o:::void";
-+my $sparcv8_asm=":sparcv8.o::des_enc-sparc.o fcrypt_b.o::::::::::::::void";
-+my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o::::::sha1-alpha.o:::::::ghash-alpha.o:::void";
-+my $mips64_asm=":bn-mips.o mips-mont.o:::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o:::::::::";
- my $mips32_asm=$mips64_asm; $mips32_asm =~ s/\s*sha512\-mips\.o//;
--my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o:::aes-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:";
--my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o:::aes_cbc.o aes-armv4.o bsaes-armv7.o aesv8-armx.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o ghashv8-armx.o::void";
--my $aarch64_asm="armcap.o arm64cpuid.o mem_clr.o::::aes_core.o aes_cbc.o aesv8-armx.o:::sha1-armv8.o sha256-armv8.o sha512-armv8.o:::::::ghashv8-armx.o:";
--my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o:::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::32";
--my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o:::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::64";
--my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o:::aes_core.o aes_cbc.o aes-ppc.o vpaes-ppc.o aesp8-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o sha256p8-ppc.o sha512p8-ppc.o:::::::ghashp8-ppc.o:";
-+my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o:::aes-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o::";
-+my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o:::aes_cbc.o aes-armv4.o bsaes-armv7.o aesv8-armx.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o ghashv8-armx.o:::void";
-+my $aarch64_asm="armcap.o arm64cpuid.o mem_clr.o::::aes_core.o aes_cbc.o aesv8-armx.o:::sha1-armv8.o sha256-armv8.o sha512-armv8.o:::::::ghashv8-armx.o::";
-+my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o:::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:::32";
-+my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o:::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:::64";
-+my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o:::aes_core.o aes_cbc.o aes-ppc.o vpaes-ppc.o aesp8-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o sha256p8-ppc.o sha512p8-ppc.o:::::::ghashp8-ppc.o::";
- my $ppc32_asm=$ppc64_asm;
--my $no_asm="::::::::::::::::void";
-+my $no_asm=":::::::::::::::::void";
-
- # As for $BSDthreads. Idea is to maintain "collective" set of flags,
- # which would cover all BSD flavors. -pthread applies to them all,
-@@ -179,7 +179,7 @@ my $no_asm="::::::::::::::::void";
- # seems to be sufficient?
- my $BSDthreads="-pthread -D_THREAD_SAFE -D_REENTRANT";
-
--#config-string $cc : $cflags : $unistd : $thread_cflag : $sys_id : $lflags : $bn_ops : $cpuid_obj : $bn_obj : $ec_obj : $des_obj : $aes_obj : $bf_obj : $md5_obj : $sha1_obj : $cast_obj : $rc4_obj : $rmd160_obj : $rc5_obj : $wp_obj : $cmll_obj : $modes_obj : $engines_obj : $dso_scheme : $shared_target : $shared_cflag : $shared_ldflag : $shared_extension : $ranlib : $arflags : $multilib
-+#config-string $cc : $cflags : $unistd : $thread_cflag : $sys_id : $lflags : $bn_ops : $cpuid_obj : $bn_obj : $ec_obj : $des_obj : $aes_obj : $bf_obj : $md5_obj : $sha1_obj : $cast_obj : $rc4_obj : $rmd160_obj : $rc5_obj : $wp_obj : $cmll_obj : $modes_obj : $chapoly_obj : $engines_obj : $dso_scheme : $shared_target : $shared_cflag : $shared_ldflag : $shared_extension : $ranlib : $arflags : $multilib
-
- my %table=(
- # File 'TABLE' (created by 'make TABLE') contains the data from this list,
-@@ -713,6 +713,7 @@ my $idx_rc5_obj = $idx++;
- my $idx_wp_obj = $idx++;
- my $idx_cmll_obj = $idx++;
- my $idx_modes_obj = $idx++;
-+my $idx_chapoly_obj = $idx++;
- my $idx_engines_obj = $idx++;
- my $idx_perlasm_scheme = $idx++;
- my $idx_dso_scheme = $idx++;
-@@ -1239,6 +1240,7 @@ my $rc5_obj = $fields[$idx_rc5_obj];
- my $wp_obj = $fields[$idx_wp_obj];
- my $cmll_obj = $fields[$idx_cmll_obj];
- my $modes_obj = $fields[$idx_modes_obj];
-+my $chapoly_obj= $fields[$idx_chapoly_obj];
- my $engines_obj = $fields[$idx_engines_obj];
- my $perlasm_scheme = $fields[$idx_perlasm_scheme];
- my $dso_scheme = $fields[$idx_dso_scheme];
-@@ -1407,7 +1409,8 @@ if ($no_asm)
- {
- $cpuid_obj=$bn_obj=$ec_obj=
- $des_obj=$aes_obj=$bf_obj=$cast_obj=$rc4_obj=$rc5_obj=$cmll_obj=
-- $modes_obj=$sha1_obj=$md5_obj=$rmd160_obj=$wp_obj=$engines_obj="";
-+ $modes_obj=$sha1_obj=$md5_obj=$rmd160_obj=$wp_obj=$engines_obj=
-+ $chapoly_obj="";
- }
-
- if (!$no_shared)
-@@ -1622,6 +1625,10 @@ if ($ec_obj =~ /ecp_nistz256/)
- {
- $cflags.=" -DECP_NISTZ256_ASM";
- }
-+if ($chapoly_obj =~ /chacha20_poly1305/)
-+ {
-+ $cflags.=" -DCHAPOLY_ASM";
-+ }
-
- # "Stringify" the C flags string. This permits it to be made part of a string
- # and works as well on command lines.
-@@ -1751,6 +1758,7 @@ while (<IN>)
- s/^WP_ASM_OBJ=.*$/WP_ASM_OBJ= $wp_obj/;
- s/^CMLL_ENC=.*$/CMLL_ENC= $cmll_obj/;
- s/^MODES_ASM_OBJ.=*$/MODES_ASM_OBJ= $modes_obj/;
-+ s/^CHAPOLY_ASM=.*$/CHAPOLY_ASM= $chapoly_obj/;
- s/^ENGINES_ASM_OBJ.=*$/ENGINES_ASM_OBJ= $engines_obj/;
- s/^PERLASM_SCHEME=.*$/PERLASM_SCHEME= $perlasm_scheme/;
- s/^PROCESSOR=.*/PROCESSOR= $processor/;
-@@ -1812,6 +1820,7 @@ print "SHA1_OBJ_ASM =$sha1_obj\n";
- print "RMD160_OBJ_ASM=$rmd160_obj\n";
- print "CMLL_ENC =$cmll_obj\n";
- print "MODES_OBJ =$modes_obj\n";
-+print "CHAPOLY_ASM =$chapoly_obj\n";
- print "ENGINES_OBJ =$engines_obj\n";
- print "PROCESSOR =$processor\n";
- print "RANLIB =$ranlib\n";
-@@ -2211,7 +2220,7 @@ sub print_table_entry
- my ($cc, $cflags, $unistd, $thread_cflag, $sys_id, $lflags,
- $bn_ops, $cpuid_obj, $bn_obj, $ec_obj, $des_obj, $aes_obj, $bf_obj,
- $md5_obj, $sha1_obj, $cast_obj, $rc4_obj, $rmd160_obj,
-- $rc5_obj, $wp_obj, $cmll_obj, $modes_obj, $engines_obj,
-+ $rc5_obj, $wp_obj, $cmll_obj, $modes_obj, $chapoly_obj, $engines_obj,
- $perlasm_scheme, $dso_scheme, $shared_target, $shared_cflag,
- $shared_ldflag, $shared_extension, $ranlib, $arflags, $multilib)=
- split(/\s*:\s*/,$table{$target} . ":" x 30 , -1);
-@@ -2241,6 +2250,7 @@ sub print_table_entry
- \$wp_obj = $wp_obj
- \$cmll_obj = $cmll_obj
- \$modes_obj = $modes_obj
-+\$chapoly_obj = $chapoly_obj
- \$engines_obj = $engines_obj
- \$perlasm_scheme = $perlasm_scheme
- \$dso_scheme = $dso_scheme
-diff --git a/Makefile.org b/Makefile.org
-index 2377f50..1f20a61 100644
---- a/Makefile.org
-+++ b/Makefile.org
-@@ -103,6 +103,7 @@ WP_ASM_OBJ=
- CMLL_ENC=
- MODES_ASM_OBJ=
- ENGINES_ASM_OBJ=
-+CHAPOLY_ASM=
- PERLASM_SCHEME=
-
- # KRB5 stuff
-@@ -149,7 +150,7 @@ SDIRS= \
- bn ec rsa dsa ecdsa dh ecdh dso engine \
- buffer bio stack lhash rand err \
- evp asn1 pem x509 x509v3 conf txt_db pkcs7 pkcs12 comp ocsp ui krb5 \
-- cms pqueue ts jpake srp store cmac
-+ cms pqueue ts jpake srp store cmac chacha20_poly1305
- # keep in mind that the above list is adjusted by ./Configure
- # according to no-xxx arguments...
-
-@@ -240,6 +241,7 @@ BUILDENV= LC_ALL=C PLATFORM='$(PLATFORM)' PROCESSOR='$(PROCESSOR)'\
- FIPSLIBDIR='${FIPSLIBDIR}' \
- FIPSDIR='${FIPSDIR}' \
- FIPSCANLIB="$${FIPSCANLIB:-$(FIPSCANLIB)}" \
-+ CHAPOLY_ASM='$(CHAPOLY_ASM)' \
- THIS=$${THIS:-$@} MAKEFILE=Makefile MAKEOVERRIDES=
- # MAKEOVERRIDES= effectively "equalizes" GNU-ish and SysV-ish make flavors,
- # which in turn eliminates ambiguities in variable treatment with -e.
-diff --git a/crypto/chacha20_poly1305/Makefile b/crypto/chacha20_poly1305/Makefile
-new file mode 100644
-index 0000000..87f4ba3
---- /dev/null
-+++ b/crypto/chacha20_poly1305/Makefile
-@@ -0,0 +1,89 @@
-+#
-+# crypto/chacha20poly1305/Makefile
-+#
-+
-+DIR= chacha20poly1305
-+TOP= ../..
-+CC= cc
-+INCLUDES= -I.. -I$(TOP) -I../../include
-+CFLAG=-g
-+MAKEFILE= Makefile
-+AR= ar r
-+
-+CFLAGS= $(INCLUDES) $(CFLAG)
-+ASFLAGS= $(INCLUDES) $(ASFLAG)
-+AFLAGS= $(ASFLAGS)
-+
-+GENERAL=Makefile
-+TEST=
-+APPS=
-+
-+LIB=$(TOP)/libcrypto.a
-+LIBSRC= chacha20.c poly1305.c
-+LIBOBJ= chacha20.o poly1305.o $(CHAPOLY_ASM)
-+
-+SRC= $(LIBSRC)
-+
-+EXHEADER= chacha20poly1305.h
-+HEADER= $(EXHEADER)
-+
-+ALL= $(GENERAL) $(SRC) $(HEADER)
-+
-+top:
-+ (cd ../..; $(MAKE) DIRS=crypto SDIRS=$(DIR) sub_all)
-+
-+all: lib
-+
-+lib: $(LIBOBJ)
-+ $(AR) $(LIB) $(LIBOBJ)
-+ $(RANLIB) $(LIB) || echo Never mind.
-+ @touch lib
-+
-+chacha20_poly1305_x86_64.s: asm/chacha20_poly1305_x86_64.pl
-+ $(PERL) asm/chacha20_poly1305_x86_64.pl $(PERLASM_SCHEME) > $@
-+
-+poly1305_x86_64.s: asm/poly1305_x86_64.pl
-+ $(PERL) asm/poly1305_x86_64.pl $(PERLASM_SCHEME) > $@
-+
-+chacha20_x86_64.s: asm/chacha20_x86_64.pl
-+ $(PERL) asm/chacha20_x86_64.pl $(PERLASM_SCHEME) > $@
-+
-+files:
-+ $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
-+
-+links:
-+ @$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER)
-+ @$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST)
-+ @$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS)
-+
-+install:
-+ @[ -n "$(INSTALLTOP)" ] # should be set by top Makefile...
-+ @headerlist="$(EXHEADER)"; for i in $$headerlist ; \
-+ do \
-+ (cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \
-+ chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ); \
-+ done;
-+
-+tags:
-+ ctags $(SRC)
-+
-+tests:
-+
-+lint:
-+ lint -DLINT $(INCLUDES) $(SRC)>fluff
-+
-+depend:
-+ @[ -n "$(MAKEDEPEND)" ] # should be set by upper Makefile...
-+ $(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC)
-+
-+dclean:
-+ $(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new
-+ mv -f Makefile.new $(MAKEFILE)
-+
-+clean:
-+ rm -f *.s *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff
-+
-+# DO NOT DELETE THIS LINE -- make depend depends on it.
-+
-+chacha20.o: ../../include/openssl/chacha20poly1305.h chacha20.c
-+poly1305.o: ../../include/openssl/chacha20poly1305.h poly1305.c
-diff --git a/crypto/chacha20_poly1305/asm/chacha20_poly1305_x86_64.pl b/crypto/chacha20_poly1305/asm/chacha20_poly1305_x86_64.pl
-new file mode 100755
-index 0000000..ef90831
---- /dev/null
-+++ b/crypto/chacha20_poly1305/asm/chacha20_poly1305_x86_64.pl
-@@ -0,0 +1,2299 @@
-+#!/usr/bin/env perl
-+
-+##############################################################################
-+# #
-+# Copyright 2016 CloudFlare LTD #
-+# #
-+# Licensed under the Apache License, Version 2.0 (the "License"); #
-+# you may not use this file except in compliance with the License. #
-+# You may obtain a copy of the License at #
-+# #
-+# http://www.apache.org/licenses/LICENSE-2.0 #
-+# #
-+# Unless required by applicable law or agreed to in writing, software #
-+# distributed under the License is distributed on an "AS IS" BASIS, #
-+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
-+# See the License for the specific language governing permissions and #
-+# limitations under the License. #
-+# #
-+##############################################################################
-+# #
-+# Author: Vlad Krasnov #
-+# #
-+##############################################################################
-+
-+$flavour = shift;
-+$output = shift;
-+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-+
-+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-+
-+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
-+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
-+die "can't locate x86_64-xlate.pl";
-+
-+open OUT,"| \"$^X\" $xlate $flavour $output";
-+*STDOUT=*OUT;
-+
-+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
-+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
-+ $avx = ($1>=2.19) + ($1>=2.22);
-+}
-+
-+if ($win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
-+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
-+ $avx = ($1>=2.09) + ($1>=2.10);
-+}
-+
-+if ($win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
-+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
-+ $avx = ($1>=10) + ($1>=11);
-+}
-+
-+if (`$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
-+ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
-+ $avx = ($ver>=3.0) + ($ver>=3.01);
-+}
-+
-+$code.=<<___;
-+.text
-+.extern OPENSSL_ia32cap_P
-+.align 64
-+.chacha20_consts:
-+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
-+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
-+.rol8:
-+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
-+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
-+.rol16:
-+.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
-+.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
-+.avx2_init:
-+.long 0,0,0,0
-+.sse_inc:
-+.long 1,0,0,0
-+.avx2_inc:
-+.long 2,0,0,0,2,0,0,0
-+.clamp:
-+.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
-+.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
-+.align 16
-+.and_masks:
-+.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-+.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-+.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-+.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-+.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
-+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
-+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
-+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
-+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
-+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
-+___
-+
-+my ($oup,$inp,$inl,$adp,$keyp,$itr1,$itr2)=("%rdi","%rsi","%rbx","%rcx","%r9","%rcx","%r8");
-+my ($acc0,$acc1,$acc2)=map("%r$_",(10..12));
-+my ($t0,$t1,$t2,$t3)=("%r13","%r14","%r15","%r9");
-+my ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%xmm$_",(0..15));
-+my ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
-+my $r_store="0*16(%rbp)";
-+my $s_store="1*16(%rbp)";
-+my $len_store="2*16(%rbp)";
-+my $state1_store="3*16(%rbp)";
-+my $state2_store="4*16(%rbp)";
-+my $tmp_store="5*16(%rbp)";
-+my $ctr0_store="6*16(%rbp)";
-+my $ctr1_store="7*16(%rbp)";
-+my $ctr2_store="8*16(%rbp)";
-+my $ctr3_store="9*16(%rbp)";
-+
-+sub chacha_qr {
-+my ($a,$b,$c,$d,$t,$dir)=@_;
-+$code.="movdqa $t, $tmp_store\n" if ($dir =~ /store/);
-+$code.="paddd $b, $a
-+ pxor $a, $d
-+ pshufb .rol16(%rip), $d
-+ paddd $d, $c
-+ pxor $c, $b
-+ movdqa $b, $t
-+ pslld \$12, $t
-+ psrld \$20, $b
-+ pxor $t, $b
-+ paddd $b, $a
-+ pxor $a, $d
-+ pshufb .rol8(%rip), $d
-+ paddd $d, $c
-+ pxor $c, $b
-+ movdqa $b, $t
-+ pslld \$7, $t
-+ psrld \$25, $b
-+ pxor $t, $b\n";
-+$code.="palignr \$4, $b, $b
-+ palignr \$8, $c, $c
-+ palignr \$12, $d, $d\n" if ($dir =~ /left/);
-+$code.="palignr \$12, $b, $b
-+ palignr \$8, $c, $c
-+ palignr \$4, $d, $d\n" if ($dir =~ /right/);
-+$code.="movdqa $tmp_store, $t\n" if ($dir =~ /load/);
-+}
-+
-+sub poly_add {
-+my ($src)=@_;
-+$code.="add $src, $acc0
-+ adc 8+$src, $acc1
-+ adc \$1, $acc2\n";
-+}
-+
-+sub poly_stage1 {
-+$code.="mov 0+$r_store, %rax
-+ mov %rax, $t2
-+ mul $acc0
-+ mov %rax, $t0
-+ mov %rdx, $t1
-+ mov 0+$r_store, %rax
-+ mul $acc1
-+ imul $acc2, $t2
-+ add %rax, $t1
-+ adc %rdx, $t2\n";
-+}
-+
-+sub poly_stage2 {
-+$code.="mov 8+$r_store, %rax
-+ mov %rax, $t3
-+ mul $acc0
-+ add %rax, $t1
-+ adc \$0, %rdx
-+ mov %rdx, $acc0
-+ mov 8+$r_store, %rax
-+ mul $acc1
-+ add %rax, $t2
-+ adc \$0, %rdx\n";
-+}
-+
-+sub poly_stage3 {
-+$code.="imul $acc2, $t3
-+ add $acc0, $t2
-+ adc %rdx, $t3\n";
-+}
-+
-+sub poly_reduce_stage {
-+$code.="mov $t0, $acc0
-+ mov $t1, $acc1
-+ mov $t2, $acc2
-+ and \$3, $acc2
-+ mov $t2, $t0
-+ and \$-4, $t0
-+ mov $t3, $t1
-+ shrd \$2, $t3, $t2
-+ shr \$2, $t3
-+ add $t0, $acc0
-+ adc $t1, $acc1
-+ adc \$0, $acc2
-+ add $t2, $acc0
-+ adc $t3, $acc1
-+ adc \$0, $acc2\n";
-+}
-+
-+sub poly_mul {
-+ &poly_stage1();
-+ &poly_stage2();
-+ &poly_stage3();
-+ &poly_reduce_stage();
-+}
-+
-+# Append code that loads the ChaCha20 input state for $n blocks into the
-+# register sets ($A0..$D0, $A1..$D1, ...) and advances the stored counter.
-+#   $n - number of blocks to prepare (1..4)
-+# Rows A, B and C are copied from .chacha20_consts, $state1_store and
-+# $state2_store.  The D rows are produced by repeatedly adding .sse_inc to
-+# the value in $ctr0_store: the highest-numbered block gets the first
-+# increment and block 0 the last, after which every block's counter row is
-+# written back to its $ctrN_store slot.
-+# $n is a number, so numeric comparison operators are used (string "ge"
-+# would order "10" before "2").
-+sub prep_state {
-+my ($n)=@_;
-+$code.="movdqa .chacha20_consts(%rip), $A0
-+ movdqa $state1_store, $B0
-+ movdqa $state2_store, $C0\n";
-+$code.="movdqa $A0, $A1
-+ movdqa $B0, $B1
-+ movdqa $C0, $C1\n" if ($n >= 2);
-+$code.="movdqa $A0, $A2
-+ movdqa $B0, $B2
-+ movdqa $C0, $C2\n" if ($n >= 3);
-+$code.="movdqa $A0, $A3
-+ movdqa $B0, $B3
-+ movdqa $C0, $C3\n" if ($n >= 4);
-+$code.="movdqa $ctr0_store, $D0
-+ paddd .sse_inc(%rip), $D0
-+ movdqa $D0, $ctr0_store\n" if ($n == 1);
-+$code.="movdqa $ctr0_store, $D1
-+ paddd .sse_inc(%rip), $D1
-+ movdqa $D1, $D0
-+ paddd .sse_inc(%rip), $D0
-+ movdqa $D0, $ctr0_store
-+ movdqa $D1, $ctr1_store\n" if ($n == 2);
-+$code.="movdqa $ctr0_store, $D2
-+ paddd .sse_inc(%rip), $D2
-+ movdqa $D2, $D1
-+ paddd .sse_inc(%rip), $D1
-+ movdqa $D1, $D0
-+ paddd .sse_inc(%rip), $D0
-+ movdqa $D0, $ctr0_store
-+ movdqa $D1, $ctr1_store
-+ movdqa $D2, $ctr2_store\n" if ($n == 3);
-+$code.="movdqa $ctr0_store, $D3
-+ paddd .sse_inc(%rip), $D3
-+ movdqa $D3, $D2
-+ paddd .sse_inc(%rip), $D2
-+ movdqa $D2, $D1
-+ paddd .sse_inc(%rip), $D1
-+ movdqa $D1, $D0
-+ paddd .sse_inc(%rip), $D0
-+ movdqa $D0, $ctr0_store
-+ movdqa $D1, $ctr1_store
-+ movdqa $D2, $ctr2_store
-+ movdqa $D3, $ctr3_store\n" if ($n == 4);
-+}
-+
-+# Append code that adds the input state back onto the worked state of $n
-+# blocks: .chacha20_consts, $state1_store, $state2_store and the block's own
-+# $ctrN_store are added to its A, B, C and D rows.
-+#   $n - number of blocks to finalize (1..4); block 0 is always emitted.
-+# Block 3 is emitted only for $n == 4 exactly, as in the original code.
-+# $n is a number, so numeric comparison operators are used.
-+sub finalize_state {
-+my ($n)=@_;
-+$code.="paddd .chacha20_consts(%rip), $A3
-+ paddd $state1_store, $B3
-+ paddd $state2_store, $C3
-+ paddd $ctr3_store, $D3\n" if ($n == 4);
-+$code.="paddd .chacha20_consts(%rip), $A2
-+ paddd $state1_store, $B2
-+ paddd $state2_store, $C2
-+ paddd $ctr2_store, $D2\n" if ($n >= 3);
-+$code.="paddd .chacha20_consts(%rip), $A1
-+ paddd $state1_store, $B1
-+ paddd $state2_store, $C1
-+ paddd $ctr1_store, $D1\n" if ($n >= 2);
-+$code.="paddd .chacha20_consts(%rip), $A0
-+ paddd $state1_store, $B0
-+ paddd $state2_store, $C0
-+ paddd $ctr0_store, $D0\n";
-+}
-+
-+# Append code that XORs four 16-byte rows of key stream with the input at
-+# $offset($inp) and stores the result at $offset($oup).
-+#   $A,$B,$C,$D - operands holding the key stream rows
-+#   $offset     - displacement text placed in front of ($inp) / ($oup)
-+# $A3,$B3,$C3,$D3 are used as load registers and are clobbered.  The last row
-+# is XORed into $D3 rather than into $D, so $D is only ever read (one caller
-+# passes the memory operand $tmp_store for it).
-+sub xor_stream {
-+my ($row_a, $row_b, $row_c, $row_d, $disp)=@_;
-+$code .= <<___;
-+movdqu 0*16 + $disp($inp), $A3
-+ movdqu 1*16 + $disp($inp), $B3
-+ movdqu 2*16 + $disp($inp), $C3
-+ movdqu 3*16 + $disp($inp), $D3
-+ pxor $A3, $row_a
-+ pxor $B3, $row_b
-+ pxor $C3, $row_c
-+ pxor $row_d, $D3
-+ movdqu $row_a, 0*16 + $disp($oup)
-+ movdqu $row_b, 1*16 + $disp($oup)
-+ movdqu $row_c, 2*16 + $disp($oup)
-+ movdqu $D3, 3*16 + $disp($oup)
-+___
-+}
-+
-+# Same job as xor_stream, but with a single scratch register: $temp is first
-+# saved to $tmp_store, then used to load each input row, XOR it with the
-+# matching key stream row and store it.  $A..$D are left unmodified; the
-+# caller is responsible for reloading $temp from $tmp_store if it is needed.
-+#   $A,$B,$C,$D - key stream rows
-+#   $offset     - displacement text placed in front of ($inp) / ($oup)
-+#   $temp       - register to use as scratch
-+sub xor_stream_using_temp {
-+my ($row_a, $row_b, $row_c, $row_d, $disp, $scratch)=@_;
-+$code .= <<___;
-+movdqa $scratch, $tmp_store
-+ movdqu 0*16 + $disp($inp), $scratch
-+ pxor $row_a, $scratch
-+ movdqu $scratch, 0*16 + $disp($oup)
-+ movdqu 1*16 + $disp($inp), $scratch
-+ pxor $row_b, $scratch
-+ movdqu $scratch, 1*16 + $disp($oup)
-+ movdqu 2*16 + $disp($inp), $scratch
-+ pxor $row_c, $scratch
-+ movdqu $scratch, 2*16 + $disp($oup)
-+ movdqu 3*16 + $disp($inp), $scratch
-+ pxor $row_d, $scratch
-+ movdqu $scratch, 3*16 + $disp($oup)
-+___
-+}
-+
-+# Build (and return as a string, without touching $code) one half of a
-+# ChaCha20 quarter-round applied to all four register sets at once.
-+#   $rot1  - right-shift count for the B rows; together with the left shift
-+#            by 32-$rot1 this rotates them (20 or 25 in the callers)
-+#   $rot2  - memory operand of the pshufb mask used to rotate the D rows
-+#   $shift - optional; when it contains "left" or "right" the B, C and D rows
-+#            are additionally rotated lane-wise with palignr
-+# $C0 doubles as scratch, so its live value is parked in $tmp_store.  It is
-+# saved on entry only when $rot1 is 20; a $rot1 == 25 call relies on the
-+# preceding call having left the live $C0 in $tmp_store.  $C0 is reloaded at
-+# the end only when a lane rotation is emitted.
-+# The number of lines returned matters: emit_body consumes this text line by
-+# line.
-+sub gen_chacha_round {
-+my ($rot1, $rot2, $shift)=@_;
-+# Callers omit $shift for the first half; avoid matching against undef.
-+$shift = "" unless defined($shift);
-+# palignr immediates; lexical so they do not leak into package globals.
-+my ($s1,$s2,$s3);
-+my $round="";
-+$round.="movdqa $C0, $tmp_store\n" if ($rot1 == 20);
-+$round.="movdqa $rot2, $C0
-+ paddd $B3, $A3
-+ paddd $B2, $A2
-+ paddd $B1, $A1
-+ paddd $B0, $A0
-+ pxor $A3, $D3
-+ pxor $A2, $D2
-+ pxor $A1, $D1
-+ pxor $A0, $D0
-+ pshufb $C0, $D3
-+ pshufb $C0, $D2
-+ pshufb $C0, $D1
-+ pshufb $C0, $D0
-+ movdqa $tmp_store, $C0
-+ paddd $D3, $C3
-+ paddd $D2, $C2
-+ paddd $D1, $C1
-+ paddd $D0, $C0
-+ pxor $C3, $B3
-+ pxor $C2, $B2
-+ pxor $C1, $B1
-+ pxor $C0, $B0
-+ movdqa $C0, $tmp_store
-+ movdqa $B3, $C0
-+ psrld \$$rot1, $C0
-+ pslld \$32-$rot1, $B3
-+ pxor $C0, $B3
-+ movdqa $B2, $C0
-+ psrld \$$rot1, $C0
-+ pslld \$32-$rot1, $B2
-+ pxor $C0, $B2
-+ movdqa $B1, $C0
-+ psrld \$$rot1, $C0
-+ pslld \$32-$rot1, $B1
-+ pxor $C0, $B1
-+ movdqa $B0, $C0
-+ psrld \$$rot1, $C0
-+ pslld \$32-$rot1, $B0
-+ pxor $C0, $B0\n";
-+($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/);
-+($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/);
-+$round.="movdqa $tmp_store, $C0
-+ palignr \$$s1, $B3, $B3
-+ palignr \$$s2, $C3, $C3
-+ palignr \$$s3, $D3, $D3
-+ palignr \$$s1, $B2, $B2
-+ palignr \$$s2, $C2, $C2
-+ palignr \$$s3, $D2, $D2
-+ palignr \$$s1, $B1, $B1
-+ palignr \$$s2, $C1, $C1
-+ palignr \$$s3, $D1, $D1
-+ palignr \$$s1, $B0, $B0
-+ palignr \$$s2, $C0, $C0
-+ palignr \$$s3, $D0, $D0\n"
-+if (($shift =~ /left/) || ($shift =~ /right/));
-+return $round;
-+};
-+
-+# Text of one full ChaCha20 iteration for four interleaved blocks: two
-+# rot-20/rot-25 pairs, the first ending with a "left" lane rotation and the
-+# second with a "right" one.
-+$chacha_body = join("",
-+ gen_chacha_round(20, ".rol16(%rip)"),
-+ gen_chacha_round(25, ".rol8(%rip)", "left"),
-+ gen_chacha_round(20, ".rol16(%rip)"),
-+ gen_chacha_round(25, ".rol8(%rip)", "right"));
-+
-+# The same text as a queue of single lines, consumed by emit_body.
-+my @loop_body = split(/\n/, $chacha_body);
-+
-+# Move the next $n lines from the front of @loop_body into $code, each
-+# followed by a newline.  Used to interleave round code with other code.
-+#   $n - number of lines to emit
-+sub emit_body {
-+my $count = shift;
-+ foreach my $unused (1 .. $count) {
-+ $code .= shift(@loop_body) . "\n";
-+ }
-+}
-+
-+{
-+################################################################################
-+# void poly_hash_ad_internal();
-+# Emits an internal helper that zeroes the accumulator ($acc0..$acc2) and
-+# hashes $itr2 bytes of additional data starting at $adp.  The emitted code
-+# reads the key from $r_store (via poly_mul), so the caller must have stored
-+# it before calling.
-+$code.="
-+.type poly_hash_ad_internal,\@function,2
-+.align 64
-+poly_hash_ad_internal:
-+ xor $acc0, $acc0
-+ xor $acc1, $acc1
-+ xor $acc2, $acc2
-+ cmp \$13, $itr2
-+ jne hash_ad_loop
-+poly_fast_tls_ad:
-+ # Special treatment for the TLS case of 13 bytes
-+ mov ($adp), $acc0
-+ mov 5($adp), $acc1
-+ shr \$24, $acc1
-+ mov \$1, $acc2\n";
-+ # 13-byte fast path: the two loads above overlap (offsets 0 and 5); the
-+ # shift by 24 bits discards the three bytes of the second load that the
-+ # first one already covered (assuming 64-bit accumulator registers).
-+ &poly_mul(); $code.="
-+ ret
-+hash_ad_loop:
-+ # Hash in 16 byte chunk
-+ cmp \$16, $itr2
-+ jb hash_ad_tail\n";
-+ # One full 16-byte block per loop iteration.
-+ &poly_add("0($adp)");
-+ &poly_mul(); $code.="
-+ lea (1*16)($adp), $adp
-+ sub \$16, $itr2
-+ jmp hash_ad_loop
-+hash_ad_tail:
-+ cmp \$0, $itr2
-+ je 1f
-+ # Hash last < 16 byte tail
-+ xor $t0, $t0
-+ xor $t1, $t1
-+ xor $t2, $t2
-+ add $itr2, $adp
-+hash_ad_tail_loop:
-+ shld \$8, $t0, $t1
-+ shl \$8, $t0
-+ movzxb -1($adp), $t2
-+ xor $t2, $t0
-+ dec $adp
-+ dec $itr2
-+ jne hash_ad_tail_loop
-+
-+ add $t0, $acc0
-+ adc $t1, $acc1
-+ adc \$1, $acc2\n";
-+ # The partial block was assembled in $t0:$t1 one byte at a time, walking
-+ # backwards from the end of the data; multiply it in like a full block.
-+ &poly_mul(); $code.="
-+ # Finished AD
-+1:
-+ ret
-+.size poly_hash_ad_internal, .-poly_hash_ad_internal\n";
-+}
-+
-+{
-+################################################################################
-+# int chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
-+# The emitted prologue pushes %rbp, %rbx and %r12-%r15, reserves 288 + 32
-+# bytes of stack and points %rbp at a 32-byte aligned address inside that
-+# area; the *_store operands are used as scratch slots from here on.
-+# NOTE(review): the .type directive declares 2 arguments while the prototype
-+# above lists 6 - confirm what the assembler translator expects here.
-+$code.="
-+.globl chacha20_poly1305_open
-+.type chacha20_poly1305_open,\@function,2
-+.align 64
-+chacha20_poly1305_open:
-+ push %rbp
-+ push %rbx
-+ push %r12
-+ push %r13
-+ push %r14
-+ push %r15
-+ sub \$288 + 32, %rsp
-+ lea 32(%rsp), %rbp
-+ and \$-32, %rbp
-+ mov %rdx, 8+$len_store
-+ mov %r8, 0+$len_store
-+ mov %rdx, $inl\n"; $code.="
-+ mov OPENSSL_ia32cap_P+8(%rip), %eax
-+ test \$`1<<5`, %eax
-+ jnz chacha20_poly1305_open_avx2\n" if ($avx>1);
-+# The capability test and jump above are only emitted when $avx > 1.
-+$code.="
-+ cmp \$128, $inl
-+ jbe open_sse_128
-+ # For long buffers, prepare the poly key first
-+ movdqa .chacha20_consts(%rip), $A0
-+ movdqu 0*16($keyp), $B0
-+ movdqu 1*16($keyp), $C0
-+ movdqu 2*16($keyp), $D0
-+ movdqa $D0, $T1
-+ # Store on stack, to free keyp
-+ movdqa $B0, $state1_store
-+ movdqa $C0, $state2_store
-+ movdqa $D0, $ctr0_store
-+ mov \$10, $acc0
-+1: \n";
-+ # Body of the 10-iteration loop: one "left" and one "right" quarter-round
-+ # on a single block.
-+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
-+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
-+ dec $acc0
-+ jne 1b
-+ # A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
-+ paddd .chacha20_consts(%rip), $A0
-+ paddd $state1_store, $B0
-+ # Clamp and store the key
-+ pand .clamp(%rip), $A0
-+ movdqa $A0, $r_store
-+ movdqa $B0, $s_store
-+ # Hash
-+ mov %r8, $itr2
-+ call poly_hash_ad_internal
-+open_sse_main_loop:
-+ cmp \$16*16, $inl
-+ jb 2f
-+ # Load state, increment counter blocks\n";
-+ &prep_state(4); $code.="
-+ # There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we
-+ # hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
-+ mov \$4, $itr1
-+ mov $inp, $itr2
-+1: \n";
-+ # The round text is emitted 20 lines at a time with the stages of one
-+ # hash-block multiplication placed in between.
-+ &emit_body(20);
-+ &poly_add("0($itr2)"); $code.="
-+ lea 2*8($itr2), $itr2\n";
-+ &emit_body(20);
-+ &poly_stage1();
-+ &emit_body(20);
-+ &poly_stage2();
-+ &emit_body(20);
-+ &poly_stage3();
-+ &emit_body(20);
-+ &poly_reduce_stage();
-+ # Emit whatever round lines are left, then refill the line queue so the
-+ # next user of emit_body starts from the beginning again.
-+ foreach $l (@loop_body) {$code.=$l."\n";}
-+ @loop_body = split /\n/, $chacha_body; $code.="
-+ dec $itr1
-+ jge 1b\n";
-+ &poly_add("0($itr2)");
-+ &poly_mul(); $code.="
-+ lea 2*8($itr2), $itr2
-+ cmp \$-6, $itr1
-+ jg 1b\n";
-+ &finalize_state(4);
-+ # $D0 is used as the scratch register for the first 64 bytes, so its
-+ # value is in $tmp_store afterwards; the last xor_stream call therefore
-+ # takes $tmp_store in place of $D0.
-+ &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
-+ &xor_stream($A2, $B2, $C2, $D2, "4*16");
-+ &xor_stream($A1, $B1, $C1, $D1, "8*16");
-+ &xor_stream($A0, $B0, $C0, $tmp_store, "12*16"); $code.="
-+ lea 16*16($inp), $inp
-+ lea 16*16($oup), $oup
-+ sub \$16*16, $inl
-+ jmp open_sse_main_loop
-+2:
-+ # Handle the various tail sizes efficiently
-+ test $inl, $inl
-+ jz open_sse_finalize
-+ cmp \$4*16, $inl
-+ ja 3f\n";
-+###############################################################################
-+ # At most 64 bytes are left
-+ # One block of key stream is generated; in the emitted loop a 16-byte hash
-+ # step runs before a round pair for as long as $itr1 >= 16.
-+ &prep_state(1); $code.="
-+ xor $itr2, $itr2
-+ mov $inl, $itr1
-+ cmp \$16, $itr1
-+ jb 2f
-+1: \n";
-+ &poly_add("0($inp, $itr2)");
-+ &poly_mul(); $code.="
-+ sub \$16, $itr1
-+2:
-+ add \$16, $itr2\n";
-+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
-+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
-+ cmp \$16, $itr1
-+ jae 1b
-+ cmp \$10*16, $itr2
-+ jne 2b\n";
-+ &finalize_state(1); $code.="
-+ jmp open_sse_tail_64_dec_loop
-+3:
-+ cmp \$8*16, $inl
-+ ja 3f\n";
-+###############################################################################
-+ # 65 - 128 bytes are left
-+ # Two blocks of key stream; $itr1 is the input length rounded down to a
-+ # multiple of 16 and bounds how many hash steps run inside the round loop.
-+ &prep_state(2); $code.="
-+ mov $inl, $itr1
-+ and \$-16, $itr1
-+ xor $itr2, $itr2
-+1: \n";
-+ &poly_add("0($inp, $itr2)");
-+ &poly_mul(); $code.="
-+2:
-+ add \$16, $itr2\n";
-+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
-+ &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
-+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
-+ &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");$code.="
-+ cmp $itr1, $itr2
-+ jb 1b
-+ cmp \$10*16, $itr2
-+ jne 2b\n";
-+ &finalize_state(2);
-+ &xor_stream($A1, $B1, $C1, $D1, "0*16"); $code.="
-+ sub \$4*16, $inl
-+ lea 4*16($inp), $inp
-+ lea 4*16($oup), $oup
-+ jmp open_sse_tail_64_dec_loop
-+3:
-+ cmp \$12*16, $inl
-+ ja 3f\n";
-+###############################################################################
-+ # 129 - 192 bytes are left
-+ # Three blocks of key stream.  $itr1 is capped at 10*16 because the round
-+ # loop runs exactly ten times; the two conditional hash steps emitted after
-+ # the loop cover the 16-byte blocks at offsets 10*16 and 11*16.
-+ &prep_state(3); $code.="
-+ mov $inl, $itr1
-+ mov \$10*16, $itr2
-+ cmp \$10*16, $itr1
-+ cmovg $itr2, $itr1
-+ and \$-16, $itr1
-+ xor $itr2, $itr2
-+1: \n";
-+ &poly_add("0($inp, $itr2)");
-+ &poly_mul(); $code.="
-+2:
-+ add \$16, $itr2\n";
-+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
-+ &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
-+ &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
-+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
-+ &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
-+ &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
-+ cmp $itr1, $itr2
-+ jb 1b
-+ cmp \$10*16, $itr2
-+ jne 2b
-+ cmp \$11*16, $inl
-+ jb 1f\n";
-+ &poly_add("10*16($inp)");
-+ &poly_mul(); $code.="
-+ cmp \$12*16, $inl
-+ jb 1f\n";
-+ &poly_add("11*16($inp)");
-+ &poly_mul(); $code.="
-+1: \n";
-+ &finalize_state(3);
-+ &xor_stream($A2, $B2, $C2, $D2, "0*16");
-+ &xor_stream($A1, $B1, $C1, $D1, "4*16"); $code.="
-+ sub \$8*16, $inl
-+ lea 8*16($inp), $inp
-+ lea 8*16($oup), $oup
-+ jmp open_sse_tail_64_dec_loop
-+3:
-+###############################################################################\n";
-+ # 193 - 255 bytes are left
-+ # Four blocks of key stream.  All sixteen vector registers are live, so the
-+ # quarter-rounds borrow $C3 or $C1 as their temporary and the
-+ # "store"/"load" direction flags spill and restore it through $tmp_store.
-+ # The stages of one hash-block multiplication are spread between them.
-+ &prep_state(4); $code.="
-+ xor $itr2, $itr2
-+1: \n";
-+ &poly_add("0($inp, $itr2)");
-+ &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_left");
-+ &chacha_qr($A1,$B1,$C1,$D1,$C3,"left");
-+ &chacha_qr($A2,$B2,$C2,$D2,$C3,"left_load");
-+ &poly_stage1();
-+ &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_left_load");
-+ &poly_stage2();
-+ &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_right");
-+ &chacha_qr($A1,$B1,$C1,$D1,$C3,"right");
-+ &poly_stage3();
-+ &chacha_qr($A2,$B2,$C2,$D2,$C3,"right_load");
-+ &poly_reduce_stage();
-+ &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_right_load"); $code.="
-+ add \$16, $itr2
-+ cmp \$10*16, $itr2
-+ jb 1b
-+ mov $inl, $itr1
-+ and \$-16, $itr1
-+1: \n";
-+ # Second emitted loop: hash the remaining whole 16-byte blocks.
-+ &poly_add("0($inp, $itr2)");
-+ &poly_mul(); $code.="
-+ add \$16, $itr2
-+ cmp $itr1, $itr2
-+ jb 1b\n";
-+ &finalize_state(4);
-+ &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
-+ &xor_stream($A2, $B2, $C2, $D2, "4*16");
-+ # Last of the three full 64-byte chunks of the four-block tail.  The
-+ # emitted code then reloads $D0 from $tmp_store, where
-+ # xor_stream_using_temp saved it.
-+ &xor_stream($A1, $B1, $C1, $D1, "8*16"); $code.="
-+ movdqa $tmp_store, $D0
-+ sub \$12*16, $inl
-+ lea 12*16($inp), $inp
-+ lea 12*16($oup), $oup
-+###############################################################################
-+ # Decrypt the remaining data, 16B at a time, using existing stream
-+open_sse_tail_64_dec_loop:
-+ cmp \$16, $inl
-+ jb 1f
-+ sub \$16, $inl
-+ movdqu ($inp), $T0
-+ pxor $T0, $A0
-+ movdqu $A0, ($oup)
-+ lea 16($inp), $inp
-+ lea 16($oup), $oup
-+ movdqa $B0, $A0
-+ movdqa $C0, $B0
-+ movdqa $D0, $C0
-+ jmp open_sse_tail_64_dec_loop
-+1:
-+ movdqa $A0, $A1
-+ # Decrypt up to 16B
-+open_sse_tail_16:
-+ test $inl, $inl
-+ jz open_sse_finalize
-+ # We can safely load the CT from the end, because it is padded with the MAC
-+ mov $inl, $itr2
-+ shl \$4, $itr2
-+ lea .and_masks(%rip), $t0
-+ movdqu ($inp), $T0
-+ add $inl, $inp
-+ pand -16($t0, $itr2), $T0
-+ movq $T0, $t0
-+ pextrq \$1, $T0, $t1
-+ pxor $A1, $T0
-+ # We can only store 1 byte at a time, since plaintext can be shorter than 16 bytes
-+2:
-+ pextrb \$0, $T0, ($oup)
-+ psrldq \$1, $T0
-+ inc $oup
-+ dec $inl
-+ jne 2b
-+
-+ add $t0, $acc0
-+ adc $t1, $acc1
-+ adc \$1, $acc2\n";
-+ # The masked final partial block was added to the accumulator by the code
-+ # above; multiply it in.
-+ &poly_mul(); $code.="
-+
-+open_sse_finalize:\n";
-+ # Hash the 16 bytes at $len_store, which the prologue filled with the two
-+ # length arguments (%r8 at offset 0, %rdx at offset 8).
-+ &poly_add($len_store);
-+ &poly_mul(); $code.="
-+ # Final reduce
-+ mov $acc0, $t0
-+ mov $acc1, $t1
-+ mov $acc2, $t2
-+ sub \$-5, $acc0
-+ sbb \$-1, $acc1
-+ sbb \$3, $acc2
-+ cmovc $t0, $acc0
-+ cmovc $t1, $acc1
-+ cmovc $t2, $acc2
-+ # Add in s part of the key
-+ add 0+$s_store, $acc0
-+ adc 8+$s_store, $acc1
-+ # Constant time compare
-+ xor %rax, %rax
-+ mov \$1, %rdx
-+ xor 0*8($inp), $acc0
-+ xor 1*8($inp), $acc1
-+ or $acc1, $acc0
-+ cmovz %rdx, %rax
-+
-+ add \$288 + 32, %rsp
-+ pop %r15
-+ pop %r14
-+ pop %r13
-+ pop %r12
-+ pop %rbx
-+ pop %rbp
-+ ret
-+###############################################################################
-+open_sse_128:
-+ movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
-+ movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
-+ movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
-+ movdqu 2*16($keyp), $D0
-+ movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
-+ movdqa $D1, $D2\npaddd .sse_inc(%rip), $D2
-+ movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D1, $T3
-+ mov \$10, $acc0
-+1: \n";
-+ # Short-input path: three blocks are computed together.  Block 0 supplies
-+ # the hash key (only its A and B rows are finalized below); blocks 1 and 2
-+ # are the key stream.
-+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
-+ &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
-+ &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
-+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
-+ &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
-+ &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
-+ dec $acc0
-+ jnz 1b
-+ paddd .chacha20_consts(%rip), $A0
-+ paddd .chacha20_consts(%rip), $A1
-+ paddd .chacha20_consts(%rip), $A2
-+ paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
-+ paddd $T2, $C1\npaddd $T2, $C2
-+ paddd $T3, $D1
-+ paddd .sse_inc(%rip), $T3
-+ paddd $T3, $D2
-+ # Clamp and store the key
-+ pand .clamp(%rip), $A0
-+ movdqa $A0, $r_store
-+ movdqa $B0, $s_store
-+ # Hash
-+ mov %r8, $itr2
-+ call poly_hash_ad_internal
-+1:
-+ cmp \$16, $inl
-+ jb open_sse_tail_16
-+ sub \$16, $inl\n";
-+ # Load for hashing
-+ &poly_add("0*8($inp)"); $code.="
-+ # Load for decryption
-+ movdqu 0*16($inp), $T0
-+ pxor $T0, $A1
-+ movdqu $A1, 0*16($oup)
-+ lea 1*16($inp), $inp
-+ lea 1*16($oup), $oup\n";
-+ # NOTE(review): in the text emitted below, "jmp open_sse_tail_16" directly
-+ # follows the unconditional "jmp 1b" and therefore looks unreachable.
-+ # NOTE(review): as for the open function, the .type directive declares 2
-+ # arguments while the prototype comment lists 6 - confirm.
-+ &poly_mul(); $code.="
-+ # Shift the stream left
-+ movdqa $B1, $A1
-+ movdqa $C1, $B1
-+ movdqa $D1, $C1
-+ movdqa $A2, $D1
-+ movdqa $B2, $A2
-+ movdqa $C2, $B2
-+ movdqa $D2, $C2
-+ jmp 1b
-+ jmp open_sse_tail_16
-+.size chacha20_poly1305_open, .-chacha20_poly1305_open
-+################################################################################
-+################################################################################
-+# void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
-+.globl chacha20_poly1305_seal
-+.type chacha20_poly1305_seal,\@function,2
-+.align 64
-+chacha20_poly1305_seal:
-+ push %rbp
-+ push %rbx
-+ push %r12
-+ push %r13
-+ push %r14
-+ push %r15
-+ sub \$288 + 32, %rsp
-+ lea 32(%rsp), %rbp
-+ and \$-32, %rbp
-+ mov %rdx, 8+$len_store
-+ mov %r8, 0+$len_store
-+ mov %rdx, $inl\n"; $code.="
-+ mov OPENSSL_ia32cap_P+8(%rip), %eax
-+ test \$`1<<5`, %eax
-+ jnz chacha20_poly1305_seal_avx2\n" if ($avx>1);
-+# The capability test and jump above are only emitted when $avx > 1.
-+$code.="
-+ cmp \$128, $inl
-+ jbe seal_sse_128
-+ # For longer buffers, prepare the poly key + some stream
-+ movdqa .chacha20_consts(%rip), $A0
-+ movdqu 0*16($keyp), $B0
-+ movdqu 1*16($keyp), $C0
-+ movdqu 2*16($keyp), $D0
-+ movdqa $A0, $A1
-+ movdqa $A0, $A2
-+ movdqa $A0, $A3
-+ movdqa $B0, $B1
-+ movdqa $B0, $B2
-+ movdqa $B0, $B3
-+ movdqa $C0, $C1
-+ movdqa $C0, $C2
-+ movdqa $C0, $C3
-+ movdqa $D0, $D3
-+ paddd .sse_inc(%rip), $D0
-+ movdqa $D0, $D2
-+ paddd .sse_inc(%rip), $D0
-+ movdqa $D0, $D1
-+ paddd .sse_inc(%rip), $D0
-+ # Store on stack
-+ movdqa $B0, $state1_store
-+ movdqa $C0, $state2_store
-+ movdqa $D0, $ctr0_store
-+ movdqa $D1, $ctr1_store
-+ movdqa $D2, $ctr2_store
-+ movdqa $D3, $ctr3_store
-+ mov \$10, $acc0
-+1: \n";
-+ # The whole four-block round text is emitted as the loop body, then the
-+ # line queue is refilled for later emit_body calls.
-+ foreach $l (@loop_body) {$code.=$l."\n";}
-+ @loop_body = split /\n/, $chacha_body; $code.="
-+ dec $acc0
-+ jnz 1b\n";
-+ # Block 3 received the un-incremented counter above; its A and B rows
-+ # become the hash key.
-+ &finalize_state(4); $code.="
-+ # Clamp and store the key
-+ pand .clamp(%rip), $A3
-+ movdqa $A3, $r_store
-+ movdqa $B3, $s_store
-+ # Hash
-+ mov %r8, $itr2
-+ call poly_hash_ad_internal\n";
-+ &xor_stream($A2,$B2,$C2,$D2,"0*16");
-+ &xor_stream($A1,$B1,$C1,$D1,"4*16"); $code.="
-+ cmp \$12*16, $inl
-+ ja 1f
-+ mov \$8*16, $itr1
-+ sub \$8*16, $inl
-+ lea 8*16($inp), $inp
-+ jmp seal_sse_128_seal_hash
-+1: \n";
-+ # NOTE(review): in the text emitted below, "mov \$12*16, $itr1" is
-+ # overwritten by "mov \$2, $itr1" before $itr1 is read, so the first write
-+ # looks dead.
-+ &xor_stream($A0, $B0, $C0, $D0, "8*16"); $code.="
-+ mov \$12*16, $itr1
-+ sub \$12*16, $inl
-+ lea 12*16($inp), $inp
-+ mov \$2, $itr1
-+ mov \$8, $itr2
-+ cmp \$4*16, $inl
-+ jbe seal_sse_tail_64
-+ cmp \$8*16, $inl
-+ jbe seal_sse_tail_128
-+ cmp \$12*16, $inl
-+ jbe seal_sse_tail_192
-+
-+1: \n";
-+ # The main loop
-+ # Hashing reads from ($oup), i.e. from output that was already written,
-+ # and is interleaved with the rounds for the next four blocks.
-+ &prep_state(4); $code.="
-+2: \n";
-+ &emit_body(20);
-+ &poly_add("0($oup)");
-+ &emit_body(20);
-+ &poly_stage1();
-+ &emit_body(20);
-+ &poly_stage2();
-+ &emit_body(20);
-+ &poly_stage3();
-+ &emit_body(20);
-+ &poly_reduce_stage();
-+ foreach $l (@loop_body) {$code.=$l."\n";}
-+ @loop_body = split /\n/, $chacha_body; $code.="
-+ lea 16($oup), $oup
-+ dec $itr2
-+ jge 2b\n";
-+ &poly_add("0*8($oup)");
-+ &poly_mul(); $code.="
-+ lea 16($oup), $oup
-+ dec $itr1
-+ jg 2b\n";
-+
-+ # $D2 is used as the scratch register for the first 64 bytes.  It is saved
-+ # to $tmp_store (both by the text emitted here and by
-+ # xor_stream_using_temp itself) and reloaded afterwards.
-+ &finalize_state(4);$code.="
-+ movdqa $D2, $tmp_store\n";
-+ &xor_stream_using_temp($A3,$B3,$C3,$D3,0*16,$D2); $code.="
-+ movdqa $tmp_store, $D2\n";
-+ &xor_stream($A2,$B2,$C2,$D2, 4*16);
-+ &xor_stream($A1,$B1,$C1,$D1, 8*16); $code.="
-+ cmp \$16*16, $inl
-+ ja 3f
-+
-+ mov \$12*16, $itr1
-+ sub \$12*16, $inl
-+ lea 12*16($inp), $inp
-+ jmp seal_sse_128_seal_hash
-+3: \n";
-+ &xor_stream($A0,$B0,$C0,$D0,"12*16"); $code.="
-+ lea 16*16($inp), $inp
-+ sub \$16*16, $inl
-+ mov \$6, $itr1
-+ mov \$4, $itr2
-+ cmp \$12*16, $inl
-+ jg 1b
-+ mov $inl, $itr1
-+ test $inl, $inl
-+ je seal_sse_128_seal_hash
-+ mov \$6, $itr1
-+ cmp \$4*16, $inl
-+ jg 3f
-+###############################################################################
-+seal_sse_tail_64:\n";
-+ # One more block of key stream.  Each pass through label 2 emits one round
-+ # pair and one hash step; passes entering at label 1 hash an extra 16
-+ # bytes first.
-+ &prep_state(1); $code.="
-+1: \n";
-+ &poly_add("0($oup)");
-+ &poly_mul(); $code.="
-+ lea 16($oup), $oup
-+2: \n";
-+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
-+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
-+ &poly_add("0($oup)");
-+ &poly_mul(); $code.="
-+ lea 16($oup), $oup
-+ dec $itr1
-+ jg 1b
-+ dec $itr2
-+ jge 2b\n";
-+ &finalize_state(1); $code.="
-+ jmp seal_sse_128_seal
-+3:
-+ cmp \$8*16, $inl
-+ jg 3f
-+###############################################################################
-+seal_sse_tail_128:\n";
-+ # Two more blocks of key stream, same loop shape as the 64-byte tail.
-+ &prep_state(2); $code.="
-+1: \n";
-+ &poly_add("0($oup)");
-+ &poly_mul(); $code.="
-+ lea 16($oup), $oup
-+2: \n";
-+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
-+ &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
-+ &poly_add("0($oup)");
-+ &poly_mul();
-+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
-+ &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); $code.="
-+ lea 16($oup), $oup
-+ dec $itr1
-+ jg 1b
-+ dec $itr2
-+ jge 2b\n";
-+ &finalize_state(2);
-+ &xor_stream($A1,$B1,$C1,$D1,0*16); $code.="
-+ mov \$4*16, $itr1
-+ sub \$4*16, $inl
-+ lea 4*16($inp), $inp
-+ jmp seal_sse_128_seal_hash
-+3:
-+###############################################################################
-+seal_sse_tail_192:\n";
-+ # Three more blocks of key stream, same loop shape again.
-+ &prep_state(3); $code.="
-+1: \n";
-+ &poly_add("0($oup)");
-+ &poly_mul(); $code.="
-+ lea 16($oup), $oup
-+2: \n";
-+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
-+ &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
-+ &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
-+ &poly_add("0($oup)");
-+ &poly_mul();
-+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
-+ &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
-+ &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
-+ lea 16($oup), $oup
-+ dec $itr1
-+ jg 1b
-+ dec $itr2
-+ jge 2b\n";
-+ &finalize_state(3);
-+ &xor_stream($A2,$B2,$C2,$D2,0*16);
-+ # Second 64-byte chunk of the 192-byte tail.  The emitted code then sets
-+ # $itr1 to the number of already written output bytes that still have to
-+ # be hashed by seal_sse_128_seal_hash.
-+ &xor_stream($A1,$B1,$C1,$D1,4*16); $code.="
-+ mov \$8*16, $itr1
-+ sub \$8*16, $inl
-+ lea 8*16($inp), $inp
-+###############################################################################
-+seal_sse_128_seal_hash:
-+ cmp \$16, $itr1
-+ jb seal_sse_128_seal\n";
-+ &poly_add("0($oup)");
-+ # NOTE(review): the emitted comment "Load for decryption" below sits in the
-+ # seal path, where the input is XORed with the key stream and the stored
-+ # output is then hashed - the wording looks copied from the open path.
-+ &poly_mul(); $code.="
-+ sub \$16, $itr1
-+ lea 16($oup), $oup
-+ jmp seal_sse_128_seal_hash
-+
-+seal_sse_128_seal:
-+ cmp \$16, $inl
-+ jb seal_sse_tail_16
-+ sub \$16, $inl
-+ # Load for decryption
-+ movdqu 0*16($inp), $T0
-+ pxor $T0, $A0
-+ movdqu $A0, 0*16($oup)
-+ # Then hash
-+ add 0*8($oup), $acc0
-+ adc 1*8($oup), $acc1
-+ adc \$1, $acc2
-+ lea 1*16($inp), $inp
-+ lea 1*16($oup), $oup\n";
-+ &poly_mul(); $code.="
-+ # Shift the stream left
-+ movdqa $B0, $A0
-+ movdqa $C0, $B0
-+ movdqa $D0, $C0
-+ movdqa $A1, $D0
-+ movdqa $B1, $A1
-+ movdqa $C1, $B1
-+ movdqa $D1, $C1
-+ jmp seal_sse_128_seal
-+
-+seal_sse_tail_16:
-+ test $inl, $inl
-+ jz seal_sse_finalize
-+ # We can only load the PT one byte at a time to avoid buffer overread
-+ mov $inl, $itr2
-+ shl \$4, $itr2
-+ lea .and_masks(%rip), $t0
-+ mov $inl, $itr1
-+ lea -1($inp, $inl), $inp
-+ pxor $T3, $T3
-+1:
-+ pslldq \$1, $T3
-+ pinsrb \$0, ($inp), $T3
-+ lea -1($inp), $inp
-+ dec $itr1
-+ jne 1b
-+ pxor $A0, $T3
-+ movdqu $T3, ($oup)
-+ pand -16($t0, $itr2), $T3
-+ movq $T3, $t0
-+ pextrq \$1, $T3, $t1
-+ add $t0, $acc0
-+ adc $t1, $acc1
-+ adc \$1, $acc2
-+ lea ($inl, $oup), $oup\n";
-+ # The masked final partial block was added to the accumulator by the code
-+ # above; multiply it in.
-+ &poly_mul(); $code.="
-+seal_sse_finalize:\n";
-+ # Hash the 16 bytes at $len_store (the two length arguments saved by the
-+ # prologue); the emitted code then reduces, adds the s half of the key and
-+ # writes the 16-byte result at ($oup).
-+ &poly_add($len_store);
-+ &poly_mul(); $code.="
-+ # Final reduce
-+ mov $acc0, $t0
-+ mov $acc1, $t1
-+ mov $acc2, $t2
-+ sub \$-5, $acc0
-+ sbb \$-1, $acc1
-+ sbb \$3, $acc2
-+ cmovc $t0, $acc0
-+ cmovc $t1, $acc1
-+ cmovc $t2, $acc2
-+ # Add in s part of the key
-+ add 0+$s_store, $acc0
-+ adc 8+$s_store, $acc1
-+ mov $acc0, 0*8($oup)
-+ mov $acc1, 1*8($oup)
-+ add \$288 + 32, %rsp
-+ pop %r15
-+ pop %r14
-+ pop %r13
-+ pop %r12
-+ pop %rbx
-+ pop %rbp
-+ ret
-+################################################################################
-+seal_sse_128:
-+ movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
-+ movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
-+ movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
-+ movdqu 2*16($keyp), $D2
-+ movdqa $D2, $D0\npaddd .sse_inc(%rip), $D0
-+ movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
-+ movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D0, $T3
-+ mov \$10, $acc0
-+1:\n";
-+ # Short-input path: three blocks are computed together.  Block 2 keeps the
-+ # un-incremented counter and supplies the hash key (only its A and B rows
-+ # are finalized below); blocks 0 and 1 are the key stream.
-+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
-+ &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
-+ &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
-+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
-+ &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
-+ &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
-+ dec $acc0
-+ jnz 1b
-+ paddd .chacha20_consts(%rip), $A0
-+ paddd .chacha20_consts(%rip), $A1
-+ paddd .chacha20_consts(%rip), $A2
-+ paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
-+ paddd $T2, $C0\npaddd $T2, $C1
-+ paddd $T3, $D0
-+ paddd .sse_inc(%rip), $T3
-+ paddd $T3, $D1
-+ # Clamp and store the key
-+ pand .clamp(%rip), $A2
-+ movdqa $A2, $r_store
-+ movdqa $B2, $s_store
-+ # Hash
-+ mov %r8, $itr2
-+ call poly_hash_ad_internal
-+ jmp seal_sse_128_seal
-+.size chacha20_poly1305_seal, .-chacha20_poly1305_seal\n";
-+}
-+
-+if ($avx>1) {
-+
-+# Re-point the sixteen register-name variables at %ymm0..%ymm15 for the code
-+# generated from here on (no "my": the existing variables are reassigned).
-+($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%ymm$_",(0..15));
-+# %xmm names of the same sixteen registers, under an "x" suffix.
-+my ($A0x,$A1x,$A2x,$A3x,$B0x,$B1x,$B2x,$B3x,$C0x,$C1x,$C2x,$C3x,$D0x,$D1x,$D2x,$D3x)=map("%xmm$_",(0..15));
-+# The temporaries are aliases of the fourth block's registers, not extra ones.
-+($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
-+# Scratch slots relative to %rbp, now spaced 32 bytes apart.
-+$state1_store="2*32(%rbp)";
-+$state2_store="3*32(%rbp)";
-+$tmp_store="4*32(%rbp)";
-+$ctr0_store="5*32(%rbp)";
-+$ctr1_store="6*32(%rbp)";
-+$ctr2_store="7*32(%rbp)";
-+$ctr3_store="8*32(%rbp)";
-+
-+# Append one AVX2 ChaCha20 quarter-round on rows $a,$b,$c,$d to $code.
-+#   $t   - register used as temporary for the B-row rotations
-+#   $dir - flag string; any combination of:
-+#            "store" - save $t to $tmp_store before the round
-+#            "left"  - rotate rows afterwards (b by 4, c by 8, d by 12)
-+#            "right" - rotate rows afterwards (b by 12, c by 8, d by 4)
-+#            "load"  - reload $t from $tmp_store after the round
-+sub chacha_qr_avx2 {
-+my ($a,$b,$c,$d,$t,$dir)=@_;
-+my @asm;
-+push @asm, " vmovdqa $t, $tmp_store" if ($dir =~ /store/);
-+push @asm,
-+ " vpaddd $b, $a, $a",
-+ " vpxor $a, $d, $d",
-+ " vpshufb .rol16(%rip), $d, $d",
-+ " vpaddd $d, $c, $c",
-+ " vpxor $c, $b, $b",
-+ " vpsrld \$20, $b, $t",
-+ " vpslld \$12, $b, $b",
-+ " vpxor $t, $b, $b",
-+ " vpaddd $b, $a, $a",
-+ " vpxor $a, $d, $d",
-+ " vpshufb .rol8(%rip), $d, $d",
-+ " vpaddd $d, $c, $c",
-+ " vpxor $c, $b, $b",
-+ " vpslld \$7, $b, $t",
-+ " vpsrld \$25, $b, $b",
-+ " vpxor $t, $b, $b";
-+push @asm,
-+ " vpalignr \$12, $d, $d, $d",
-+ " vpalignr \$8, $c, $c, $c",
-+ " vpalignr \$4, $b, $b, $b" if ($dir =~ /left/);
-+push @asm,
-+ " vpalignr \$4, $d, $d, $d",
-+ " vpalignr \$8, $c, $c, $c",
-+ " vpalignr \$12, $b, $b, $b" if ($dir =~ /right/);
-+push @asm, " vmovdqa $tmp_store, $t" if ($dir =~ /load/);
-+$code .= "$_\n" foreach (@asm);
-+}
-+
-+# AVX2 counterpart of prep_state: append code that loads the input state for
-+# $n register sets and advances the stored counter.
-+#   $n - number of register sets to prepare (1..4)
-+# Rows A, B and C come from .chacha20_consts, $state1_store and
-+# $state2_store.  $D0 is first loaded with .avx2_inc; the D rows are then
-+# built by adding it repeatedly to the value in $ctr0_store, the
-+# highest-numbered set getting the first increment, and every set's counter
-+# row is written to its $ctrN_store slot.
-+# $n is a number, so numeric comparison operators are used (string "ge"
-+# would order "10" before "2").
-+sub prep_state_avx2 {
-+my ($n)=@_;
-+$code.=<<___;
-+ vmovdqa .chacha20_consts(%rip), $A0
-+ vmovdqa $state1_store, $B0
-+ vmovdqa $state2_store, $C0
-+___
-+$code.=<<___ if ($n >= 2);
-+ vmovdqa $A0, $A1
-+ vmovdqa $B0, $B1
-+ vmovdqa $C0, $C1
-+___
-+$code.=<<___ if ($n >= 3);
-+ vmovdqa $A0, $A2
-+ vmovdqa $B0, $B2
-+ vmovdqa $C0, $C2
-+___
-+$code.=<<___ if ($n >= 4);
-+ vmovdqa $A0, $A3
-+ vmovdqa $B0, $B3
-+ vmovdqa $C0, $C3
-+___
-+$code.=<<___ if ($n == 1);
-+ vmovdqa .avx2_inc(%rip), $D0
-+ vpaddd $ctr0_store, $D0, $D0
-+ vmovdqa $D0, $ctr0_store
-+___
-+$code.=<<___ if ($n == 2);
-+ vmovdqa .avx2_inc(%rip), $D0
-+ vpaddd $ctr0_store, $D0, $D1
-+ vpaddd $D1, $D0, $D0
-+ vmovdqa $D0, $ctr0_store
-+ vmovdqa $D1, $ctr1_store
-+___
-+$code.=<<___ if ($n == 3);
-+ vmovdqa .avx2_inc(%rip), $D0
-+ vpaddd $ctr0_store, $D0, $D2
-+ vpaddd $D2, $D0, $D1
-+ vpaddd $D1, $D0, $D0
-+ vmovdqa $D0, $ctr0_store
-+ vmovdqa $D1, $ctr1_store
-+ vmovdqa $D2, $ctr2_store
-+___
-+$code.=<<___ if ($n == 4);
-+ vmovdqa .avx2_inc(%rip), $D0
-+ vpaddd $ctr0_store, $D0, $D3
-+ vpaddd $D3, $D0, $D2
-+ vpaddd $D2, $D0, $D1
-+ vpaddd $D1, $D0, $D0
-+ vmovdqa $D3, $ctr3_store
-+ vmovdqa $D2, $ctr2_store
-+ vmovdqa $D1, $ctr1_store
-+ vmovdqa $D0, $ctr0_store
-+___
-+}
-+
-+# AVX2 counterpart of finalize_state: append code that adds the input state
-+# (.chacha20_consts, $state1_store, $state2_store and the set's own
-+# $ctrN_store) back onto the A, B, C and D rows of $n register sets.
-+#   $n - number of register sets to finalize (1..4); set 0 is always emitted.
-+# Set 3 is emitted only for $n == 4 exactly, as in the original code.
-+# $n is a number, so numeric comparison operators are used.
-+sub finalize_state_avx2 {
-+my ($n)=@_;
-+$code.=<<___ if ($n == 4);
-+ vpaddd .chacha20_consts(%rip), $A3, $A3
-+ vpaddd $state1_store, $B3, $B3
-+ vpaddd $state2_store, $C3, $C3
-+ vpaddd $ctr3_store, $D3, $D3
-+___
-+$code.=<<___ if ($n >= 3);
-+ vpaddd .chacha20_consts(%rip), $A2, $A2
-+ vpaddd $state1_store, $B2, $B2
-+ vpaddd $state2_store, $C2, $C2
-+ vpaddd $ctr2_store, $D2, $D2
-+___
-+$code.=<<___ if ($n >= 2);
-+ vpaddd .chacha20_consts(%rip), $A1, $A1
-+ vpaddd $state1_store, $B1, $B1
-+ vpaddd $state2_store, $C1, $C1
-+ vpaddd $ctr1_store, $D1, $D1
-+___
-+$code.=<<___;
-+ vpaddd .chacha20_consts(%rip), $A0, $A0
-+ vpaddd $state1_store, $B0, $B0
-+ vpaddd $state2_store, $C0, $C0
-+ vpaddd $ctr0_store, $D0, $D0
-+___
-+}
-+
-+# Append code that regroups the 128-bit halves of rows $A..$D with
-+# vperm2i128, XORs the four resulting 32-byte vectors with the input at
-+# $offset($inp) and stores them at $offset($oup).
-+#   $A,$B,$C,$D - registers holding the rows; $A, $B and $C are overwritten
-+#   $offset     - displacement text placed in front of ($inp) / ($oup)
-+#   $hlp        - extra register, receives the first output vector
-+sub xor_stream_avx2 {
-+my ($A, $B, $C, $D, $offset, $hlp)=@_;
-+$code .= join("\n",
-+ " vperm2i128 \$0x02, $A, $B, $hlp",
-+ " vperm2i128 \$0x13, $A, $B, $B",
-+ " vperm2i128 \$0x02, $C, $D, $A",
-+ " vperm2i128 \$0x13, $C, $D, $C",
-+ " vpxor 0*32+$offset($inp), $hlp, $hlp",
-+ " vpxor 1*32+$offset($inp), $A, $A",
-+ " vpxor 2*32+$offset($inp), $B, $B",
-+ " vpxor 3*32+$offset($inp), $C, $C",
-+ " vmovdqu $hlp, 0*32+$offset($oup)",
-+ " vmovdqu $A, 1*32+$offset($oup)",
-+ " vmovdqu $B, 2*32+$offset($oup)",
-+ " vmovdqu $C, 3*32+$offset($oup)",
-+ "");
-+}
-+
-+# Append code that regroups the 128-bit halves of rows $A..$D in place with
-+# vperm2i128 (selectors 0x02 and 0x13), without XORing or storing anything.
-+# All four of $A..$D are overwritten.
-+#   $hlp - scratch register; it receives the 0x13 combination of $A and $B,
-+#          which is finally moved into $C
-+sub finish_stream_avx2 {
-+my ($A, $B, $C, $D, $hlp)=@_;
-+$code .= join("\n",
-+ " vperm2i128 \$0x13, $A, $B, $hlp",
-+ " vperm2i128 \$0x02, $A, $B, $A",
-+ " vperm2i128 \$0x02, $C, $D, $B",
-+ " vperm2i128 \$0x13, $C, $D, $D",
-+ " vmovdqa $hlp, $C",
-+ "");
-+}
-+
-+# mulx-based variant of poly_stage1: the quadword at 0+$r_store is loaded
-+# into %rdx and multiplied by $acc0 (result in $t0/$t1), by $acc1 (result in
-+# %rax/%rdx) and by $acc2 (imul into $t2); the pieces are summed into $t1/$t2.
-+sub poly_stage1_mulx {
-+$code .= " mov 0+$r_store, %rdx\n"
-+       . " mov %rdx, $t2\n"
-+       . " mulx $acc0, $t0, $t1\n"
-+       . " mulx $acc1, %rax, %rdx\n"
-+       . " imul $acc2, $t2\n"
-+       . " add %rax, $t1\n"
-+       . " adc %rdx, $t2\n";
-+}
-+
-+# mulx-based variant of poly_stage2: the quadword at 8+$r_store is loaded
-+# into %rdx and multiplied by $acc0 and $acc1.  $acc0 and $acc1 are
-+# overwritten with the low halves, which are added into $t1/$t2; the high
-+# halves go to %rax and $t3.  %rdx is finally multiplied by $acc2 and left
-+# for poly_stage3_mulx.
-+sub poly_stage2_mulx {
-+$code .= " mov 8+$r_store, %rdx\n"
-+       . " mulx $acc0, $acc0, %rax\n"
-+       . " add $acc0, $t1\n"
-+       . " mulx $acc1, $acc1, $t3\n"
-+       . " adc $acc1, $t2\n"
-+       . " adc \$0, $t3\n"
-+       . " imul $acc2, %rdx\n";
-+}
-+
-+# mulx-based variant of poly_stage3: fold the values poly_stage2_mulx left
-+# in %rax and %rdx into $t2/$t3.
-+sub poly_stage3_mulx {
-+$code .= " add %rax, $t2\n"
-+       . " adc %rdx, $t3\n";
-+}
-+
-+# Append a complete multiply-and-reduce of the accumulator using the three
-+# mulx stages followed by the shared poly_reduce_stage.
-+sub poly_mul_mulx {
-+ poly_stage1_mulx();
-+ poly_stage2_mulx();
-+ poly_stage3_mulx();
-+ poly_reduce_stage();
-+}
-+
-+# AVX2 counterpart of gen_chacha_round: build and return (without touching
-+# $code) one half of a quarter-round applied to all four register sets.
-+#   $rot1  - right-shift count for the B rows; together with the left shift
-+#            by 32-$rot1 this rotates them (20 or 25 in the callers)
-+#   $rot2  - memory operand of the vpshufb mask used to rotate the D rows
-+#   $shift - optional; when it contains "left" or "right" the B, C and D rows
-+#            are additionally rotated with vpalignr
-+# $C0 doubles as scratch, so its live value is parked in $tmp_store.  It is
-+# saved on entry only when $rot1 is 20; a $rot1 == 25 call relies on the
-+# preceding call having left the live $C0 in $tmp_store.  $C0 is reloaded at
-+# the end only when a rotation is emitted.
-+# The number of lines returned matters: emit_body consumes this text line by
-+# line.
-+sub gen_chacha_round_avx2 {
-+my ($rot1, $rot2, $shift)=@_;
-+# Callers omit $shift for the first half; avoid matching against undef.
-+$shift = "" unless defined($shift);
-+# vpalignr immediates; lexical so they do not leak into package globals.
-+my ($s1,$s2,$s3);
-+my $round="";
-+$round=$round ."vmovdqa $C0, $tmp_store\n" if ($rot1 == 20);
-+$round=$round ."vmovdqa $rot2, $C0
-+ vpaddd $B3, $A3, $A3
-+ vpaddd $B2, $A2, $A2
-+ vpaddd $B1, $A1, $A1
-+ vpaddd $B0, $A0, $A0
-+ vpxor $A3, $D3, $D3
-+ vpxor $A2, $D2, $D2
-+ vpxor $A1, $D1, $D1
-+ vpxor $A0, $D0, $D0
-+ vpshufb $C0, $D3, $D3
-+ vpshufb $C0, $D2, $D2
-+ vpshufb $C0, $D1, $D1
-+ vpshufb $C0, $D0, $D0
-+ vmovdqa $tmp_store, $C0
-+ vpaddd $D3, $C3, $C3
-+ vpaddd $D2, $C2, $C2
-+ vpaddd $D1, $C1, $C1
-+ vpaddd $D0, $C0, $C0
-+ vpxor $C3, $B3, $B3
-+ vpxor $C2, $B2, $B2
-+ vpxor $C1, $B1, $B1
-+ vpxor $C0, $B0, $B0
-+ vmovdqa $C0, $tmp_store
-+ vpsrld \$$rot1, $B3, $C0
-+ vpslld \$32-$rot1, $B3, $B3
-+ vpxor $C0, $B3, $B3
-+ vpsrld \$$rot1, $B2, $C0
-+ vpslld \$32-$rot1, $B2, $B2
-+ vpxor $C0, $B2, $B2
-+ vpsrld \$$rot1, $B1, $C0
-+ vpslld \$32-$rot1, $B1, $B1
-+ vpxor $C0, $B1, $B1
-+ vpsrld \$$rot1, $B0, $C0
-+ vpslld \$32-$rot1, $B0, $B0
-+ vpxor $C0, $B0, $B0\n";
-+($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/);
-+($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/);
-+$round=$round ."vmovdqa $tmp_store, $C0
-+ vpalignr \$$s1, $B3, $B3, $B3
-+ vpalignr \$$s2, $C3, $C3, $C3
-+ vpalignr \$$s3, $D3, $D3, $D3
-+ vpalignr \$$s1, $B2, $B2, $B2
-+ vpalignr \$$s2, $C2, $C2, $C2
-+ vpalignr \$$s3, $D2, $D2, $D2
-+ vpalignr \$$s1, $B1, $B1, $B1
-+ vpalignr \$$s2, $C1, $C1, $C1
-+ vpalignr \$$s3, $D1, $D1, $D1
-+ vpalignr \$$s1, $B0, $B0, $B0
-+ vpalignr \$$s2, $C0, $C0, $C0
-+ vpalignr \$$s3, $D0, $D0, $D0\n"
-+if (($shift =~ /left/) || ($shift =~ /right/));
-+return $round;
-+};
-+
-+# Rebuild the round text with the AVX2 generator: two rot-20/rot-25 pairs,
-+# the first ending with a "left" rotation and the second with a "right" one.
-+$chacha_body = join("",
-+ gen_chacha_round_avx2(20, ".rol16(%rip)"),
-+ gen_chacha_round_avx2(25, ".rol8(%rip)", "left"),
-+ gen_chacha_round_avx2(20, ".rol16(%rip)"),
-+ gen_chacha_round_avx2(25, ".rol8(%rip)", "right"));
-+
-+# Refill the line queue that emit_body consumes.
-+@loop_body = split(/\n/, $chacha_body);
-+
-+$code.="
-+###############################################################################
-+.type chacha20_poly1305_open_avx2,\@function,2
-+.align 64
-+chacha20_poly1305_open_avx2:
-+ vzeroupper
-+ vmovdqa .chacha20_consts(%rip), $A0
-+ vbroadcasti128 0*16($keyp), $B0
-+ vbroadcasti128 1*16($keyp), $C0
-+ vbroadcasti128 2*16($keyp), $D0
-+ vpaddd .avx2_init(%rip), $D0, $D0
-+ cmp \$6*32, $inl
-+ jbe open_avx2_192
-+ cmp \$10*32, $inl
-+ jbe open_avx2_320
-+
-+ vmovdqa $B0, $state1_store
-+ vmovdqa $C0, $state2_store
-+ vmovdqa $D0, $ctr0_store
-+ mov \$10, $acc0
-+1: \n";
-+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
-+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
-+ dec $acc0
-+ jne 1b
-+ vpaddd .chacha20_consts(%rip), $A0, $A0
-+ vpaddd $state1_store, $B0, $B0
-+ vpaddd $state2_store, $C0, $C0
-+ vpaddd $ctr0_store, $D0, $D0
-+
-+ vperm2i128 \$0x02, $A0, $B0, $T0
-+ # Clamp and store key
-+ vpand .clamp(%rip), $T0, $T0
-+ vmovdqa $T0, $r_store
-+ # Stream for the first 64 bytes
-+ vperm2i128 \$0x13, $A0, $B0, $A0
-+ vperm2i128 \$0x13, $C0, $D0, $B0
-+ # Hash AD + first 64 bytes
-+ mov %r8, $itr2
-+ call poly_hash_ad_internal
-+ xor $itr1, $itr1
-+ # Hash first 64 bytes
-+1: \n";
-+ &poly_add("0($inp, $itr1)");
-+ &poly_mul(); $code.="
-+ add \$16, $itr1
-+ cmp \$2*32, $itr1
-+ jne 1b
-+ # Decrypt first 64 bytes
-+ vpxor 0*32($inp), $A0, $A0
-+ vpxor 1*32($inp), $B0, $B0
-+ vmovdqu $A0, 0*32($oup)
-+ vmovdqu $B0, 1*32($oup)
-+ lea 2*32($inp), $inp
-+ lea 2*32($oup), $oup
-+ sub \$2*32, $inl
-+1:
-+ # Hash and decrypt 512 bytes each iteration
-+ cmp \$16*32, $inl
-+ jb 3f\n";
-+ &prep_state_avx2(4); $code.="
-+ xor $itr1, $itr1
-+2: \n";
-+ &poly_add("0*8($inp, $itr1)");
-+ &emit_body(10);
-+ &poly_stage1_mulx();
-+ &emit_body(9);
-+ &poly_stage2_mulx();
-+ &emit_body(12);
-+ &poly_stage3_mulx();
-+ &emit_body(10);
-+ &poly_reduce_stage();
-+ &emit_body(9);
-+ &poly_add("2*8($inp, $itr1)");
-+ &emit_body(8);
-+ &poly_stage1_mulx();
-+ &emit_body(18);
-+ &poly_stage2_mulx();
-+ &emit_body(18);
-+ &poly_stage3_mulx();
-+ &emit_body(9);
-+ &poly_reduce_stage();
-+ &emit_body(8);
-+ &poly_add("4*8($inp, $itr1)"); $code.="
-+ lea 6*8($itr1), $itr1\n";
-+ &emit_body(18);
-+ &poly_stage1_mulx();
-+ &emit_body(8);
-+ &poly_stage2_mulx();
-+ &emit_body(8);
-+ &poly_stage3_mulx();
-+ &emit_body(18);
-+ &poly_reduce_stage();
-+ foreach $l (@loop_body) {$code.=$l."\n";}
-+ @loop_body = split /\n/, $chacha_body; $code.="
-+ cmp \$10*6*8, $itr1
-+ jne 2b\n";
-+ &finalize_state_avx2(4); $code.="
-+ vmovdqa $A0, $tmp_store\n";
-+ &poly_add("10*6*8($inp)");
-+ &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
-+ vmovdqa $tmp_store, $A0\n";
-+ &poly_mul();
-+ &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
-+ &poly_add("10*6*8+2*8($inp)");
-+ &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
-+ &poly_mul();
-+ &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
-+ lea 16*32($inp), $inp
-+ lea 16*32($oup), $oup
-+ sub \$16*32, $inl
-+ jmp 1b
-+3:
-+ test $inl, $inl
-+ vzeroupper
-+ je open_sse_finalize
-+3:
-+ cmp \$4*32, $inl
-+ ja 3f\n";
-+###############################################################################
-+ # 1-128 bytes left
-+ &prep_state_avx2(1); $code.="
-+ xor $itr2, $itr2
-+ mov $inl, $itr1
-+ and \$-16, $itr1
-+ test $itr1, $itr1
-+ je 2f
-+1: \n";
-+ &poly_add("0*8($inp, $itr2)");
-+ &poly_mul(); $code.="
-+2:
-+ add \$16, $itr2\n";
-+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
-+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
-+ cmp $itr1, $itr2
-+ jb 1b
-+ cmp \$160, $itr2
-+ jne 2b\n";
-+ &finalize_state_avx2(1);
-+ &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
-+ jmp open_avx2_tail_loop
-+3:
-+ cmp \$8*32, $inl
-+ ja 3f\n";
-+###############################################################################
-+ # 129-256 bytes left
-+ &prep_state_avx2(2); $code.="
-+ mov $inl, $tmp_store
-+ mov $inl, $itr1
-+ sub \$4*32, $itr1
-+ shr \$4, $itr1
-+ mov \$10, $itr2
-+ cmp \$10, $itr1
-+ cmovg $itr2, $itr1
-+ mov $inp, $inl
-+ xor $itr2, $itr2
-+1: \n";
-+ &poly_add("0*8($inl)");
-+ &poly_mul_mulx(); $code.="
-+ lea 16($inl), $inl
-+2: \n";
-+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
-+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); $code.="
-+ inc $itr2\n";
-+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
-+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
-+ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
-+ cmp $itr1, $itr2
-+ jb 1b
-+ cmp \$10, $itr2
-+ jne 2b
-+ mov $inl, $itr2
-+ sub $inp, $inl
-+ mov $inl, $itr1
-+ mov $tmp_store, $inl
-+1:
-+ add \$16, $itr1
-+ cmp $inl, $itr1
-+ jg 1f\n";
-+ &poly_add("0*8($itr2)");
-+ &poly_mul_mulx(); $code.="
-+ lea 16($itr2), $itr2
-+ jmp 1b
-+1: \n";
-+ &finalize_state_avx2(2);
-+ &xor_stream_avx2($A1, $B1, $C1, $D1, 0*32, $T0);
-+ &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
-+ lea 4*32($inp), $inp
-+ lea 4*32($oup), $oup
-+ sub \$4*32, $inl
-+ jmp open_avx2_tail_loop
-+3:
-+ cmp \$12*32, $inl
-+ ja 3f\n";
-+###############################################################################
-+ # 257-383 bytes left
-+ &prep_state_avx2(3); $code.="
-+ mov $inl, $tmp_store
-+ mov $inl, $itr1
-+ sub \$8*32, $itr1
-+ shr \$4, $itr1
-+ add \$6, $itr1
-+ mov \$10, $itr2
-+ cmp \$10, $itr1
-+ cmovg $itr2, $itr1
-+ mov $inp, $inl
-+ xor $itr2, $itr2
-+1: \n";
-+ &poly_add("0*8($inl)");
-+ &poly_mul_mulx(); $code.="
-+ lea 16($inl), $inl
-+2: \n";
-+ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
-+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
-+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
-+ &poly_add("0*8($inl)");
-+ &poly_mul(); $code.="
-+ lea 16($inl), $inl
-+ inc $itr2\n";
-+ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right");
-+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
-+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
-+ cmp $itr1, $itr2
-+ jb 1b
-+ cmp \$10, $itr2
-+ jne 2b
-+ mov $inl, $itr2
-+ sub $inp, $inl
-+ mov $inl, $itr1
-+ mov $tmp_store, $inl
-+1:
-+ add \$16, $itr1
-+ cmp $inl, $itr1
-+ jg 1f\n";
-+ &poly_add("0*8($itr2)");
-+ &poly_mul_mulx(); $code.="
-+ lea 16($itr2), $itr2
-+ jmp 1b
-+1: \n";
-+ &finalize_state_avx2(3);
-+ &xor_stream_avx2($A2, $B2, $C2, $D2, 0*32, $T0);
-+ &xor_stream_avx2($A1, $B1, $C1, $D1, 4*32, $T0);
-+ &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
-+ lea 8*32($inp), $inp
-+ lea 8*32($oup), $oup
-+ sub \$8*32, $inl
-+ jmp open_avx2_tail_loop
-+3: \n";
-+###############################################################################
-+ # 384-512 bytes left
-+ &prep_state_avx2(4); $code.="
-+ xor $itr1, $itr1
-+ mov $inp, $itr2
-+1: \n";
-+ &poly_add("0*8($itr2)");
-+ &poly_mul(); $code.="
-+ lea 2*8($itr2), $itr2
-+2: \n";
-+ &emit_body(37);
-+ &poly_add("0*8($itr2)");
-+ &poly_mul_mulx();
-+ &emit_body(48);
-+ &poly_add("2*8($itr2)");
-+ &poly_mul_mulx(); $code.="
-+ lea 4*8($itr2), $itr2\n";
-+ foreach $l (@loop_body) {$code.=$l."\n";}
-+ @loop_body = split /\n/, $chacha_body; $code.="
-+ inc $itr1
-+ cmp \$4, $itr1
-+ jl 1b
-+ cmp \$10, $itr1
-+ jne 2b
-+ mov $inl, $itr1
-+ sub \$12*32, $itr1
-+ and \$-16, $itr1
-+1:
-+ test $itr1, $itr1
-+ je 1f\n";
-+ &poly_add("0*8($itr2)");
-+ &poly_mul_mulx(); $code.="
-+ lea 2*8($itr2), $itr2
-+ sub \$2*8, $itr1
-+ jmp 1b
-+1: \n";
-+ &finalize_state_avx2(4); $code.="
-+ vmovdqa $A0, $tmp_store\n";
-+ &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
-+ vmovdqa $tmp_store, $A0\n";
-+ &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
-+ &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
-+ &finish_stream_avx2($A0, $B0, $C0, $D0, $A3); $code.="
-+ lea 12*32($inp), $inp
-+ lea 12*32($oup), $oup
-+ sub \$12*32, $inl
-+open_avx2_tail_loop:
-+ cmp \$32, $inl
-+ jb open_avx2_tail
-+ sub \$32, $inl
-+ vpxor ($inp), $A0, $A0
-+ vmovdqu $A0, ($oup)
-+ lea 1*32($inp), $inp
-+ lea 1*32($oup), $oup
-+ vmovdqa $B0, $A0
-+ vmovdqa $C0, $B0
-+ vmovdqa $D0, $C0
-+ jmp open_avx2_tail_loop
-+open_avx2_tail:
-+ cmp \$16, $inl
-+ vmovdqa $A0x, $A1x
-+ jb 1f
-+ sub \$16, $inl
-+ #load for decryption
-+ vpxor ($inp), $A0x, $A1x
-+ vmovdqu $A1x, ($oup)
-+ lea 1*16($inp), $inp
-+ lea 1*16($oup), $oup
-+ vperm2i128 \$0x11, $A0, $A0, $A0
-+ vmovdqa $A0x, $A1x
-+1:
-+ vzeroupper
-+ jmp open_sse_tail_16
-+###############################################################################
-+open_avx2_192:
-+ vmovdqa $A0, $A1
-+ vmovdqa $A0, $A2
-+ vmovdqa $B0, $B1
-+ vmovdqa $B0, $B2
-+ vmovdqa $C0, $C1
-+ vmovdqa $C0, $C2
-+ vpaddd .avx2_inc(%rip), $D0, $D1
-+ vmovdqa $D0, $T2
-+ vmovdqa $D1, $T3
-+ mov \$10, $acc0
-+1: \n";
-+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
-+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
-+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
-+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
-+ dec $acc0
-+ jne 1b
-+ vpaddd $A2, $A0, $A0
-+ vpaddd $A2, $A1, $A1
-+ vpaddd $B2, $B0, $B0
-+ vpaddd $B2, $B1, $B1
-+ vpaddd $C2, $C0, $C0
-+ vpaddd $C2, $C1, $C1
-+ vpaddd $T2, $D0, $D0
-+ vpaddd $T3, $D1, $D1
-+ vperm2i128 \$0x02, $A0, $B0, $T0
-+ # Clamp and store the key
-+ vpand .clamp(%rip), $T0, $T0
-+ vmovdqa $T0, $r_store
-+ # Stream for up to 192 bytes
-+ vperm2i128 \$0x13, $A0, $B0, $A0
-+ vperm2i128 \$0x13, $C0, $D0, $B0
-+ vperm2i128 \$0x02, $A1, $B1, $C0
-+ vperm2i128 \$0x02, $C1, $D1, $D0
-+ vperm2i128 \$0x13, $A1, $B1, $A1
-+ vperm2i128 \$0x13, $C1, $D1, $B1
-+open_avx2_short:
-+ mov %r8, $itr2
-+ call poly_hash_ad_internal
-+open_avx2_hash_and_xor_loop:
-+ cmp \$32, $inl
-+ jb open_avx2_short_tail_32
-+ sub \$32, $inl\n";
-+ # Load + hash
-+ &poly_add("0*8($inp)");
-+ &poly_mul();
-+ &poly_add("2*8($inp)");
-+ &poly_mul(); $code.="
-+ # Load + decrypt
-+ vpxor ($inp), $A0, $A0
-+ vmovdqu $A0, ($oup)
-+ lea 1*32($inp), $inp
-+ lea 1*32($oup), $oup
-+ # Shift stream
-+ vmovdqa $B0, $A0
-+ vmovdqa $C0, $B0
-+ vmovdqa $D0, $C0
-+ vmovdqa $A1, $D0
-+ vmovdqa $B1, $A1
-+ vmovdqa $C1, $B1
-+ vmovdqa $D1, $C1
-+ vmovdqa $A2, $D1
-+ vmovdqa $B2, $A2
-+ jmp open_avx2_hash_and_xor_loop
-+open_avx2_short_tail_32:
-+ cmp \$16, $inl
-+ vmovdqa $A0x, $A1x
-+ jb 1f
-+ sub \$16, $inl\n";
-+ &poly_add("0*8($inp)");
-+ &poly_mul(); $code.="
-+ vpxor ($inp), $A0x, $A3x
-+ vmovdqu $A3x, ($oup)
-+ lea 1*16($inp), $inp
-+ lea 1*16($oup), $oup
-+ vextracti128 \$1, $A0, $A1x
-+1:
-+ vzeroupper
-+ jmp open_sse_tail_16
-+###############################################################################
-+open_avx2_320:
-+ vmovdqa $A0, $A1
-+ vmovdqa $A0, $A2
-+ vmovdqa $B0, $B1
-+ vmovdqa $B0, $B2
-+ vmovdqa $C0, $C1
-+ vmovdqa $C0, $C2
-+ vpaddd .avx2_inc(%rip), $D0, $D1
-+ vpaddd .avx2_inc(%rip), $D1, $D2
-+ vmovdqa $B0, $T1
-+ vmovdqa $C0, $T2
-+ vmovdqa $D0, $ctr0_store
-+ vmovdqa $D1, $ctr1_store
-+ vmovdqa $D2, $ctr2_store
-+ mov \$10, $acc0
-+1: \n";
-+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
-+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
-+ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
-+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
-+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
-+ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
-+ dec $acc0
-+ jne 1b
-+ vpaddd .chacha20_consts(%rip), $A0, $A0
-+ vpaddd .chacha20_consts(%rip), $A1, $A1
-+ vpaddd .chacha20_consts(%rip), $A2, $A2
-+ vpaddd $T1, $B0, $B0
-+ vpaddd $T1, $B1, $B1
-+ vpaddd $T1, $B2, $B2
-+ vpaddd $T2, $C0, $C0
-+ vpaddd $T2, $C1, $C1
-+ vpaddd $T2, $C2, $C2
-+ vpaddd $ctr0_store, $D0, $D0
-+ vpaddd $ctr1_store, $D1, $D1
-+ vpaddd $ctr2_store, $D2, $D2
-+ vperm2i128 \$0x02, $A0, $B0, $T0
-+ # Clamp and store the key
-+ vpand .clamp(%rip), $T0, $T0
-+ vmovdqa $T0, $r_store
-+ # Stream for up to 320 bytes
-+ vperm2i128 \$0x13, $A0, $B0, $A0
-+ vperm2i128 \$0x13, $C0, $D0, $B0
-+ vperm2i128 \$0x02, $A1, $B1, $C0
-+ vperm2i128 \$0x02, $C1, $D1, $D0
-+ vperm2i128 \$0x13, $A1, $B1, $A1
-+ vperm2i128 \$0x13, $C1, $D1, $B1
-+ vperm2i128 \$0x02, $A2, $B2, $C1
-+ vperm2i128 \$0x02, $C2, $D2, $D1
-+ vperm2i128 \$0x13, $A2, $B2, $A2
-+ vperm2i128 \$0x13, $C2, $D2, $B2
-+ jmp open_avx2_short
-+.size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2
-+###############################################################################
-+###############################################################################
-+.type chacha20_poly1305_seal_avx2,\@function,2
-+.align 64
-+chacha20_poly1305_seal_avx2:
-+ vzeroupper
-+ vmovdqa .chacha20_consts(%rip), $A0
-+ vbroadcasti128 0*16($keyp), $B0
-+ vbroadcasti128 1*16($keyp), $C0
-+ vbroadcasti128 2*16($keyp), $D0
-+ vpaddd .avx2_init(%rip), $D0, $D0
-+ cmp \$6*32, $inl
-+ jbe seal_avx2_192
-+ cmp \$10*32, $inl
-+ jbe seal_avx2_320
-+ vmovdqa $A0, $A1
-+ vmovdqa $A0, $A2
-+ vmovdqa $A0, $A3
-+ vmovdqa $B0, $B1
-+ vmovdqa $B0, $B2
-+ vmovdqa $B0, $B3
-+ vmovdqa $B0, $state1_store
-+ vmovdqa $C0, $C1
-+ vmovdqa $C0, $C2
-+ vmovdqa $C0, $C3
-+ vmovdqa $C0, $state2_store
-+ vmovdqa $D0, $D3
-+ vpaddd .avx2_inc(%rip), $D3, $D2
-+ vpaddd .avx2_inc(%rip), $D2, $D1
-+ vpaddd .avx2_inc(%rip), $D1, $D0
-+ vmovdqa $D0, $ctr0_store
-+ vmovdqa $D1, $ctr1_store
-+ vmovdqa $D2, $ctr2_store
-+ vmovdqa $D3, $ctr3_store
-+ mov \$10, $acc0
-+1: \n";
-+ foreach $l (@loop_body) {$code.=$l."\n";}
-+ @loop_body = split /\n/, $chacha_body; $code.="
-+ dec $acc0
-+ jnz 1b\n";
-+ &finalize_state_avx2(4); $code.="
-+ vperm2i128 \$0x13, $C3, $D3, $C3
-+ vperm2i128 \$0x02, $A3, $B3, $D3
-+ vperm2i128 \$0x13, $A3, $B3, $A3
-+ vpand .clamp(%rip), $D3, $D3
-+ vmovdqa $D3, $r_store
-+ mov %r8, $itr2
-+ call poly_hash_ad_internal
-+ # Safely store 320 bytes (otherwise would handle with optimized call)
-+ vpxor 0*32($inp), $A3, $A3
-+ vpxor 1*32($inp), $C3, $C3
-+ vmovdqu $A3, 0*32($oup)
-+ vmovdqu $C3, 1*32($oup)\n";
-+ &xor_stream_avx2($A2,$B2,$C2,$D2,2*32,$T3);
-+ &xor_stream_avx2($A1,$B1,$C1,$D1,6*32,$T3);
-+ &finish_stream_avx2($A0,$B0,$C0,$D0,$T3); $code.="
-+ lea 10*32($inp), $inp
-+ sub \$10*32, $inl
-+ mov \$10*32, $itr1
-+ cmp \$4*32, $inl
-+ jbe seal_avx2_hash
-+ vpxor 0*32($inp), $A0, $A0
-+ vpxor 1*32($inp), $B0, $B0
-+ vpxor 2*32($inp), $C0, $C0
-+ vpxor 3*32($inp), $D0, $D0
-+ vmovdqu $A0, 10*32($oup)
-+ vmovdqu $B0, 11*32($oup)
-+ vmovdqu $C0, 12*32($oup)
-+ vmovdqu $D0, 13*32($oup)
-+ lea 4*32($inp), $inp
-+ sub \$4*32, $inl
-+ mov \$8, $itr1
-+ mov \$2, $itr2
-+ cmp \$4*32, $inl
-+ jbe seal_avx2_tail_128
-+ cmp \$8*32, $inl
-+ jbe seal_avx2_tail_256
-+ cmp \$12*32, $inl
-+ jbe seal_avx2_tail_384
-+ cmp \$16*32, $inl
-+ jbe seal_avx2_tail_512\n";
-+ # We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop
-+ &prep_state_avx2(4);
-+ foreach $l (@loop_body) {$code.=$l."\n";}
-+ @loop_body = split /\n/, $chacha_body;
-+ &emit_body(41);
-+ @loop_body = split /\n/, $chacha_body; $code.="
-+ sub \$16, $oup
-+ mov \$9, $itr1
-+ jmp 4f
-+1: \n";
-+ &prep_state_avx2(4); $code.="
-+ mov \$10, $itr1
-+2: \n";
-+ &poly_add("0*8($oup)");
-+ &emit_body(10);
-+ &poly_stage1_mulx();
-+ &emit_body(9);
-+ &poly_stage2_mulx();
-+ &emit_body(12);
-+ &poly_stage3_mulx();
-+ &emit_body(10);
-+ &poly_reduce_stage(); $code.="
-+4: \n";
-+ &emit_body(9);
-+ &poly_add("2*8($oup)");
-+ &emit_body(8);
-+ &poly_stage1_mulx();
-+ &emit_body(18);
-+ &poly_stage2_mulx();
-+ &emit_body(18);
-+ &poly_stage3_mulx();
-+ &emit_body(9);
-+ &poly_reduce_stage();
-+ &emit_body(8);
-+ &poly_add("4*8($oup)"); $code.="
-+ lea 6*8($oup), $oup\n";
-+ &emit_body(18);
-+ &poly_stage1_mulx();
-+ &emit_body(8);
-+ &poly_stage2_mulx();
-+ &emit_body(8);
-+ &poly_stage3_mulx();
-+ &emit_body(18);
-+ &poly_reduce_stage();
-+ foreach $l (@loop_body) {$code.=$l."\n";}
-+ @loop_body = split /\n/, $chacha_body; $code.="
-+ dec $itr1
-+ jne 2b\n";
-+ &finalize_state_avx2(4); $code.="
-+ lea 4*8($oup), $oup
-+ vmovdqa $A0, $tmp_store\n";
-+ &poly_add("-4*8($oup)");
-+ &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
-+ vmovdqa $tmp_store, $A0\n";
-+ &poly_mul();
-+ &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
-+ &poly_add("-2*8($oup)");
-+ &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
-+ &poly_mul();
-+ &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
-+ lea 16*32($inp), $inp
-+ sub \$16*32, $inl
-+ cmp \$16*32, $inl
-+ jg 1b\n";
-+ &poly_add("0*8($oup)");
-+ &poly_mul();
-+ &poly_add("2*8($oup)");
-+ &poly_mul(); $code.="
-+ lea 4*8($oup), $oup
-+ mov \$10, $itr1
-+ xor $itr2, $itr2
-+ cmp \$4*32, $inl
-+ ja 3f
-+###############################################################################
-+seal_avx2_tail_128:\n";
-+ &prep_state_avx2(1); $code.="
-+1: \n";
-+ &poly_add("0($oup)");
-+ &poly_mul(); $code.="
-+ lea 2*8($oup), $oup
-+2: \n";
-+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
-+ &poly_add("0*8($oup)");
-+ &poly_mul();
-+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
-+ &poly_add("2*8($oup)");
-+ &poly_mul(); $code.="
-+ lea 4*8($oup), $oup
-+ dec $itr1
-+ jg 1b
-+ dec $itr2
-+ jge 2b\n";
-+ &finalize_state_avx2(1);
-+ &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
-+ jmp seal_avx2_short_loop
-+3:
-+ cmp \$8*32, $inl
-+ ja 3f
-+###############################################################################
-+seal_avx2_tail_256:\n";
-+ &prep_state_avx2(2); $code.="
-+1: \n";
-+ &poly_add("0($oup)");
-+ &poly_mul(); $code.="
-+ lea 2*8($oup), $oup
-+2: \n";
-+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
-+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
-+ &poly_add("0*8($oup)");
-+ &poly_mul();
-+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
-+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
-+ &poly_add("2*8($oup)");
-+ &poly_mul(); $code.="
-+ lea 4*8($oup), $oup
-+ dec $itr1
-+ jg 1b
-+ dec $itr2
-+ jge 2b\n";
-+ &finalize_state_avx2(2);
-+ &xor_stream_avx2($A1,$B1,$C1,$D1,0*32,$T0);
-+ &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
-+ mov \$4*32, $itr1
-+ lea 4*32($inp), $inp
-+ sub \$4*32, $inl
-+ jmp seal_avx2_hash
-+3:
-+ cmp \$12*32, $inl
-+ ja seal_avx2_tail_512
-+###############################################################################
-+seal_avx2_tail_384:\n";
-+ &prep_state_avx2(3); $code.="
-+1: \n";
-+ &poly_add("0($oup)");
-+ &poly_mul(); $code.="
-+ lea 2*8($oup), $oup
-+2: \n";
-+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
-+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
-+ &poly_add("0*8($oup)");
-+ &poly_mul();
-+ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
-+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
-+ &poly_add("2*8($oup)");
-+ &poly_mul();
-+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
-+ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
-+ lea 4*8($oup), $oup
-+ dec $itr1
-+ jg 1b
-+ dec $itr2
-+ jge 2b\n";
-+ &finalize_state_avx2(3);
-+ &xor_stream_avx2($A2,$B2,$C2,$D2,0*32,$T0);
-+ &xor_stream_avx2($A1,$B1,$C1,$D1,4*32,$T0);
-+ &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
-+ mov \$8*32, $itr1
-+ lea 8*32($inp), $inp
-+ sub \$8*32, $inl
-+ jmp seal_avx2_hash
-+###############################################################################
-+seal_avx2_tail_512:\n";
-+ &prep_state_avx2(4); $code.="
-+1: \n";
-+ &poly_add("0($oup)");
-+ &poly_mul_mulx(); $code.="
-+ lea 2*8($oup), $oup
-+2: \n";
-+ &emit_body(20);
-+ &poly_add("0*8($oup)");
-+ &emit_body(20);
-+ &poly_stage1_mulx();
-+ &emit_body(20);
-+ &poly_stage2_mulx();
-+ &emit_body(20);
-+ &poly_stage3_mulx();
-+ &emit_body(20);
-+ &poly_reduce_stage();
-+ &emit_body(20);
-+ &poly_add("2*8($oup)");
-+ &emit_body(20);
-+ &poly_stage1_mulx();
-+ &emit_body(20);
-+ &poly_stage2_mulx();
-+ &emit_body(20);
-+ &poly_stage3_mulx();
-+ &emit_body(20);
-+ &poly_reduce_stage();
-+ foreach $l (@loop_body) {$code.=$l."\n";}
-+ @loop_body = split /\n/, $chacha_body; $code.="
-+ lea 4*8($oup), $oup
-+ dec $itr1
-+ jg 1b
-+ dec $itr2
-+ jge 2b\n";
-+ &finalize_state_avx2(4); $code.="
-+ vmovdqa $A0, $tmp_store\n";
-+ &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
-+ vmovdqa $tmp_store, $A0\n";
-+ &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
-+ &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
-+ &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
-+ mov \$12*32, $itr1
-+ lea 12*32($inp), $inp
-+ sub \$12*32, $inl
-+ jmp seal_avx2_hash
-+################################################################################
-+seal_avx2_320:
-+ vmovdqa $A0, $A1
-+ vmovdqa $A0, $A2
-+ vmovdqa $B0, $B1
-+ vmovdqa $B0, $B2
-+ vmovdqa $C0, $C1
-+ vmovdqa $C0, $C2
-+ vpaddd .avx2_inc(%rip), $D0, $D1
-+ vpaddd .avx2_inc(%rip), $D1, $D2
-+ vmovdqa $B0, $T1
-+ vmovdqa $C0, $T2
-+ vmovdqa $D0, $ctr0_store
-+ vmovdqa $D1, $ctr1_store
-+ vmovdqa $D2, $ctr2_store
-+ mov \$10, $acc0
-+1: \n";
-+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
-+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
-+ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
-+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
-+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
-+ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
-+ dec $acc0
-+ jne 1b
-+ vpaddd .chacha20_consts(%rip), $A0, $A0
-+ vpaddd .chacha20_consts(%rip), $A1, $A1
-+ vpaddd .chacha20_consts(%rip), $A2, $A2
-+ vpaddd $T1, $B0, $B0
-+ vpaddd $T1, $B1, $B1
-+ vpaddd $T1, $B2, $B2
-+ vpaddd $T2, $C0, $C0
-+ vpaddd $T2, $C1, $C1
-+ vpaddd $T2, $C2, $C2
-+ vpaddd $ctr0_store, $D0, $D0
-+ vpaddd $ctr1_store, $D1, $D1
-+ vpaddd $ctr2_store, $D2, $D2
-+ vperm2i128 \$0x02, $A0, $B0, $T0
-+ # Clamp and store the key
-+ vpand .clamp(%rip), $T0, $T0
-+ vmovdqa $T0, $r_store
-+ # Stream for up to 320 bytes
-+ vperm2i128 \$0x13, $A0, $B0, $A0
-+ vperm2i128 \$0x13, $C0, $D0, $B0
-+ vperm2i128 \$0x02, $A1, $B1, $C0
-+ vperm2i128 \$0x02, $C1, $D1, $D0
-+ vperm2i128 \$0x13, $A1, $B1, $A1
-+ vperm2i128 \$0x13, $C1, $D1, $B1
-+ vperm2i128 \$0x02, $A2, $B2, $C1
-+ vperm2i128 \$0x02, $C2, $D2, $D1
-+ vperm2i128 \$0x13, $A2, $B2, $A2
-+ vperm2i128 \$0x13, $C2, $D2, $B2
-+ jmp seal_avx2_short
-+################################################################################
-+seal_avx2_192:
-+ vmovdqa $A0, $A1
-+ vmovdqa $A0, $A2
-+ vmovdqa $B0, $B1
-+ vmovdqa $B0, $B2
-+ vmovdqa $C0, $C1
-+ vmovdqa $C0, $C2
-+ vpaddd .avx2_inc(%rip), $D0, $D1
-+ vmovdqa $D0, $T2
-+ vmovdqa $D1, $T3
-+ mov \$10, $acc0
-+1: \n";
-+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
-+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
-+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
-+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
-+ dec $acc0
-+ jne 1b
-+ vpaddd $A2, $A0, $A0
-+ vpaddd $A2, $A1, $A1
-+ vpaddd $B2, $B0, $B0
-+ vpaddd $B2, $B1, $B1
-+ vpaddd $C2, $C0, $C0
-+ vpaddd $C2, $C1, $C1
-+ vpaddd $T2, $D0, $D0
-+ vpaddd $T3, $D1, $D1
-+ vperm2i128 \$0x02, $A0, $B0, $T0
-+ # Clamp and store the key
-+ vpand .clamp(%rip), $T0, $T0
-+ vmovdqa $T0, $r_store
-+ # Stream for up to 192 bytes
-+ vperm2i128 \$0x13, $A0, $B0, $A0
-+ vperm2i128 \$0x13, $C0, $D0, $B0
-+ vperm2i128 \$0x02, $A1, $B1, $C0
-+ vperm2i128 \$0x02, $C1, $D1, $D0
-+ vperm2i128 \$0x13, $A1, $B1, $A1
-+ vperm2i128 \$0x13, $C1, $D1, $B1
-+seal_avx2_short:
-+ mov %r8, $itr2
-+ call poly_hash_ad_internal
-+ xor $itr1, $itr1
-+seal_avx2_hash:
-+ cmp \$16, $itr1
-+ jb seal_avx2_short_loop\n";
-+ &poly_add("0($oup)");
-+ &poly_mul(); $code.="
-+ sub \$16, $itr1
-+ add \$16, $oup
-+ jmp seal_avx2_hash
-+seal_avx2_short_loop:
-+ cmp \$32, $inl
-+ jb seal_avx2_short_tail
-+ sub \$32, $inl
-+ # Encrypt
-+ vpxor ($inp), $A0, $A0
-+ vmovdqu $A0, ($oup)
-+ lea 1*32($inp), $inp
-+ # Load + hash\n";
-+ &poly_add("0*8($oup)");
-+ &poly_mul();
-+ &poly_add("2*8($oup)");
-+ &poly_mul(); $code.="
-+ lea 1*32($oup), $oup
-+ # Shift stream
-+ vmovdqa $B0, $A0
-+ vmovdqa $C0, $B0
-+ vmovdqa $D0, $C0
-+ vmovdqa $A1, $D0
-+ vmovdqa $B1, $A1
-+ vmovdqa $C1, $B1
-+ vmovdqa $D1, $C1
-+ vmovdqa $A2, $D1
-+ vmovdqa $B2, $A2
-+ jmp seal_avx2_short_loop
-+seal_avx2_short_tail:
-+ cmp \$16, $inl
-+ jb 1f
-+ sub \$16, $inl
-+ vpxor ($inp), $A0x, $A3x
-+ vmovdqu $A3x, ($oup)
-+ lea 1*16($inp), $inp\n";
-+ &poly_add("0*8($oup)");
-+ &poly_mul(); $code.="
-+ lea 1*16($oup), $oup
-+ vextracti128 \$1, $A0, $A0x
-+1:
-+ vzeroupper
-+ jmp seal_sse_tail_16
-+";
-+}
-+
-+$code =~ s/\`([^\`]*)\`/eval $1/gem;
-+print $code;
-+close STDOUT;
-diff --git a/crypto/chacha20_poly1305/asm/chacha20_x86_64.pl b/crypto/chacha20_poly1305/asm/chacha20_x86_64.pl
-new file mode 100644
-index 0000000..538af42
---- /dev/null
-+++ b/crypto/chacha20_poly1305/asm/chacha20_x86_64.pl
-@@ -0,0 +1,415 @@
-+#!/usr/bin/env perl
-+
-+##############################################################################
-+# #
-+# Copyright 2014 Intel Corporation #
-+# #
-+# Licensed under the Apache License, Version 2.0 (the "License"); #
-+# you may not use this file except in compliance with the License. #
-+# You may obtain a copy of the License at #
-+# #
-+# http://www.apache.org/licenses/LICENSE-2.0 #
-+# #
-+# Unless required by applicable law or agreed to in writing, software #
-+# distributed under the License is distributed on an "AS IS" BASIS, #
-+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
-+# See the License for the specific language governing permissions and #
-+# limitations under the License. #
-+# #
-+##############################################################################
-+# #
-+# Developers and authors: #
-+# Shay Gueron (1, 2), and Vlad Krasnov (1) #
-+# (1) Intel Corporation, Israel Development Center #
-+# (2) University of Haifa #
-+# #
-+# Related work: #
-+# M. Goll, S. Gueron, "Vectorization on ChaCha Stream Cipher", IEEE #
-+# Proceedings of 11th International Conference on Information #
-+# Technology: New Generations (ITNG 2014), 612-615 (2014). #
-+# M. Goll, S. Gueron, "Vectorization on Poly1305 Message Authentication Code"#
-+# to be published. #
-+# A. Langley, chacha20poly1305 for the AEAD head #
-+# https://git.openssl.org/gitweb/?p=openssl.git;a=commit;h=9a8646510b3d0a48e950748f7a2aaa12ed40d5e0 #
-+##############################################################################
-+
-+
-+$flavour = shift;
-+$output = shift;
-+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-+
-+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-+
-+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
-+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
-+die "can't locate x86_64-xlate.pl";
-+
-+open OUT,"| \"$^X\" $xlate $flavour $output";
-+*STDOUT=*OUT;
-+
-+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
-+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
-+ $avx = ($1>=2.19) + ($1>=2.22);
-+}
-+
-+if ($win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
-+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
-+ $avx = ($1>=2.09) + ($1>=2.10);
-+}
-+
-+if ($win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
-+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
-+ $avx = ($1>=10) + ($1>=11);
-+}
-+
-+if (`$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
-+ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
-+ $avx = ($ver>=3.0) + ($ver>=3.01);
-+}
-+
-+{
-+
-+my ($rol8, $rol16, $state_cdef, $tmp,
-+ $v0, $v1, $v2, $v3, $v4, $v5, $v6, $v7,
-+ $v8, $v9, $v10, $v11)=map("%xmm$_",(0..15));
-+
-+sub chacha_qr {
-+
-+my ($a,$b,$c,$d)=@_;
-+$code.=<<___;
-+ paddd $b, $a # a += b
-+ pxor $a, $d # d ^= a
-+ pshufb $rol16, $d # d <<<= 16
-+
-+ paddd $d, $c # c += d
-+ pxor $c, $b # b ^= c
-+
-+ movdqa $b, $tmp
-+ pslld \$12, $tmp
-+ psrld \$20, $b
-+ pxor $tmp, $b # b <<<= 12
-+
-+ paddd $b, $a # a += b
-+ pxor $a, $d # d ^= a
-+ pshufb $rol8, $d # d <<<= 8
-+
-+ paddd $d, $c # c += d
-+ pxor $c, $b # b ^= c
-+
-+ movdqa $b, $tmp
-+ pslld \$7, $tmp
-+ psrld \$25, $b
-+ pxor $tmp, $b # b <<<= 7
-+___
-+
-+}
-+
-+$code.=<<___;
-+.text
-+.align 16
-+chacha20_consts:
-+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
-+.rol8:
-+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
-+.rol16:
-+.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
-+.avxInc:
-+.quad 1,0
-+___
-+
-+{
-+my ($out, $in, $in_len, $key_ptr, $nr)
-+ =("%rdi", "%rsi", "%rdx", "%rcx", "%r8");
-+
-+$code.=<<___;
-+.globl chacha_20_core_asm
-+.type chacha_20_core_asm ,\@function,2
-+.align 64
-+chacha_20_core_asm:
-+
-+ # Init state
-+ movdqa .rol8(%rip), $rol8
-+ movdqa .rol16(%rip), $rol16
-+ movdqu 2*16($key_ptr), $state_cdef
-+
-+2:
-+ cmp \$3*64, $in_len
-+ jb 2f
-+
-+ movdqa chacha20_consts(%rip), $v0
-+ movdqu 0*16($key_ptr), $v1
-+ movdqu 1*16($key_ptr), $v2
-+ movdqa $state_cdef, $v3
-+ movdqa $v0, $v4
-+ movdqa $v0, $v8
-+ movdqa $v1, $v5
-+ movdqa $v1, $v9
-+ movdqa $v2, $v6
-+ movdqa $v2, $v10
-+ movdqa $v3, $v7
-+ paddd .avxInc(%rip), $v7
-+ movdqa $v7, $v11
-+ paddd .avxInc(%rip), $v11
-+
-+ mov \$10, $nr
-+
-+ 1:
-+___
-+ &chacha_qr( $v0, $v1, $v2, $v3);
-+ &chacha_qr( $v4, $v5, $v6, $v7);
-+ &chacha_qr( $v8, $v9,$v10,$v11);
-+$code.=<<___;
-+ palignr \$4, $v1, $v1
-+ palignr \$8, $v2, $v2
-+ palignr \$12, $v3, $v3
-+ palignr \$4, $v5, $v5
-+ palignr \$8, $v6, $v6
-+ palignr \$12, $v7, $v7
-+ palignr \$4, $v9, $v9
-+ palignr \$8, $v10, $v10
-+ palignr \$12, $v11, $v11
-+___
-+ &chacha_qr( $v0, $v1, $v2, $v3);
-+ &chacha_qr( $v4, $v5, $v6, $v7);
-+ &chacha_qr( $v8, $v9,$v10,$v11);
-+$code.=<<___;
-+ palignr \$12, $v1, $v1
-+ palignr \$8, $v2, $v2
-+ palignr \$4, $v3, $v3
-+ palignr \$12, $v5, $v5
-+ palignr \$8, $v6, $v6
-+ palignr \$4, $v7, $v7
-+ palignr \$12, $v9, $v9
-+ palignr \$8, $v10, $v10
-+ palignr \$4, $v11, $v11
-+ dec $nr
-+
-+ jnz 1b
-+ paddd chacha20_consts(%rip), $v0
-+ paddd chacha20_consts(%rip), $v4
-+ paddd chacha20_consts(%rip), $v8
-+
-+ movdqu 16*0($key_ptr), $tmp
-+ paddd $tmp, $v1
-+ paddd $tmp, $v5
-+ paddd $tmp, $v9
-+
-+ movdqu 16*1($key_ptr), $tmp
-+ paddd $tmp, $v2
-+ paddd $tmp, $v6
-+ paddd $tmp, $v10
-+
-+ paddd $state_cdef, $v3
-+ paddq .avxInc(%rip), $state_cdef
-+ paddd $state_cdef, $v7
-+ paddq .avxInc(%rip), $state_cdef
-+ paddd $state_cdef, $v11
-+ paddq .avxInc(%rip), $state_cdef
-+
-+ movdqu 16*0($in), $tmp
-+ pxor $tmp, $v0
-+ movdqu 16*1($in), $tmp
-+ pxor $tmp, $v1
-+ movdqu 16*2($in), $tmp
-+ pxor $tmp, $v2
-+ movdqu 16*3($in), $tmp
-+ pxor $tmp, $v3
-+
-+ movdqu $v0, 16*0($out)
-+ movdqu $v1, 16*1($out)
-+ movdqu $v2, 16*2($out)
-+ movdqu $v3, 16*3($out)
-+
-+ movdqu 16*4($in), $tmp
-+ pxor $tmp, $v4
-+ movdqu 16*5($in), $tmp
-+ pxor $tmp, $v5
-+ movdqu 16*6($in), $tmp
-+ pxor $tmp, $v6
-+ movdqu 16*7($in), $tmp
-+ pxor $tmp, $v7
-+
-+ movdqu $v4, 16*4($out)
-+ movdqu $v5, 16*5($out)
-+ movdqu $v6, 16*6($out)
-+ movdqu $v7, 16*7($out)
-+
-+ movdqu 16*8($in), $tmp
-+ pxor $tmp, $v8
-+ movdqu 16*9($in), $tmp
-+ pxor $tmp, $v9
-+ movdqu 16*10($in), $tmp
-+ pxor $tmp, $v10
-+ movdqu 16*11($in), $tmp
-+ pxor $tmp, $v11
-+
-+ movdqu $v8, 16*8($out)
-+ movdqu $v9, 16*9($out)
-+ movdqu $v10, 16*10($out)
-+ movdqu $v11, 16*11($out)
-+
-+ lea 16*12($in), $in
-+ lea 16*12($out), $out
-+ sub \$16*12, $in_len
-+
-+ jmp 2b
-+
-+2:
-+ cmp \$2*64, $in_len
-+ jb 2f
-+
-+ movdqa chacha20_consts(%rip), $v0
-+ movdqa chacha20_consts(%rip), $v4
-+ movdqu 16*0($key_ptr), $v1
-+ movdqu 16*0($key_ptr), $v5
-+ movdqu 16*1($key_ptr), $v2
-+ movdqu 16*1($key_ptr), $v6
-+ movdqa $state_cdef, $v3
-+ movdqa $v3, $v7
-+ paddd .avxInc(%rip), $v7
-+
-+ mov \$10, $nr
-+ 1:
-+___
-+ &chacha_qr($v0,$v1,$v2,$v3);
-+ &chacha_qr($v4,$v5,$v6,$v7);
-+$code.=<<___;
-+ palignr \$4, $v1, $v1
-+ palignr \$8, $v2, $v2
-+ palignr \$12, $v3, $v3
-+ palignr \$4, $v5, $v5
-+ palignr \$8, $v6, $v6
-+ palignr \$12, $v7, $v7
-+___
-+ &chacha_qr($v0,$v1,$v2,$v3);
-+ &chacha_qr($v4,$v5,$v6,$v7);
-+$code.=<<___;
-+ palignr \$12, $v1, $v1
-+ palignr \$8, $v2, $v2
-+ palignr \$4, $v3, $v3
-+ palignr \$12, $v5, $v5
-+ palignr \$8, $v6, $v6
-+ palignr \$4, $v7, $v7
-+ dec $nr
-+ jnz 1b
-+
-+ paddd chacha20_consts(%rip), $v0
-+ paddd chacha20_consts(%rip), $v4
-+
-+ movdqu 16*0($key_ptr), $tmp
-+ paddd $tmp, $v1
-+ paddd $tmp, $v5
-+
-+ movdqu 16*1($key_ptr), $tmp
-+ paddd $tmp, $v2
-+ paddd $tmp, $v6
-+
-+ paddd $state_cdef, $v3
-+ paddq .avxInc(%rip), $state_cdef
-+ paddd $state_cdef, $v7
-+ paddq .avxInc(%rip), $state_cdef
-+
-+ movdqu 16*0($in), $tmp
-+ pxor $tmp, $v0
-+ movdqu 16*1($in), $tmp
-+ pxor $tmp, $v1
-+ movdqu 16*2($in), $tmp
-+ pxor $tmp, $v2
-+ movdqu 16*3($in), $tmp
-+ pxor $tmp, $v3
-+
-+ movdqu $v0, 16*0($out)
-+ movdqu $v1, 16*1($out)
-+ movdqu $v2, 16*2($out)
-+ movdqu $v3, 16*3($out)
-+
-+ movdqu 16*4($in), $tmp
-+ pxor $tmp, $v4
-+ movdqu 16*5($in), $tmp
-+ pxor $tmp, $v5
-+ movdqu 16*6($in), $tmp
-+ pxor $tmp, $v6
-+ movdqu 16*7($in), $tmp
-+ pxor $tmp, $v7
-+
-+ movdqu $v4, 16*4($out)
-+ movdqu $v5, 16*5($out)
-+ movdqu $v6, 16*6($out)
-+ movdqu $v7, 16*7($out)
-+
-+ lea 16*8($in), $in
-+ lea 16*8($out), $out
-+ sub \$16*8, $in_len
-+
-+ jmp 2b
-+2:
-+ cmp \$64, $in_len
-+ jb 2f
-+
-+ movdqa chacha20_consts(%rip), $v0
-+ movdqu 16*0($key_ptr), $v1
-+ movdqu 16*1($key_ptr), $v2
-+ movdqa $state_cdef, $v3
-+
-+ mov \$10, $nr
-+
-+ 1:
-+___
-+ &chacha_qr($v0,$v1,$v2,$v3);
-+$code.=<<___;
-+ palignr \$4, $v1, $v1
-+ palignr \$8, $v2, $v2
-+ palignr \$12, $v3, $v3
-+___
-+ &chacha_qr($v0,$v1,$v2,$v3);
-+$code.=<<___;
-+ palignr \$12, $v1, $v1
-+ palignr \$8, $v2, $v2
-+ palignr \$4, $v3, $v3
-+ dec $nr
-+ jnz 1b
-+
-+ paddd chacha20_consts(%rip), $v0
-+
-+ movdqu 16*0($key_ptr), $tmp
-+ paddd $tmp, $v1
-+
-+ movdqu 16*1($key_ptr), $tmp
-+ paddd $tmp, $v2
-+
-+ paddd $state_cdef, $v3
-+ paddq .avxInc(%rip), $state_cdef
-+
-+ movdqu 16*0($in), $tmp
-+ pxor $tmp, $v0
-+ movdqu 16*1($in), $tmp
-+ pxor $tmp, $v1
-+ movdqu 16*2($in), $tmp
-+ pxor $tmp, $v2
-+ movdqu 16*3($in), $tmp
-+ pxor $tmp, $v3
-+
-+ movdqu $v0, 16*0($out)
-+ movdqu $v1, 16*1($out)
-+ movdqu $v2, 16*2($out)
-+ movdqu $v3, 16*3($out)
-+
-+ lea 16*4($in), $in
-+ lea 16*4($out), $out
-+ sub \$16*4, $in_len
-+ jmp 2b
-+
-+2:
-+ movdqu $state_cdef, 16*2($key_ptr)
-+ ret
-+.size chacha_20_core_asm,.-chacha_20_core_asm
-+___
-+}
-+}
-+
-+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
-+
-+print $code;
-+
-+close STDOUT;
-diff --git a/crypto/chacha20_poly1305/asm/poly1305_x86_64.pl b/crypto/chacha20_poly1305/asm/poly1305_x86_64.pl
-new file mode 100644
-index 0000000..05e4bc5
---- /dev/null
-+++ b/crypto/chacha20_poly1305/asm/poly1305_x86_64.pl
-@@ -0,0 +1,280 @@
-+##############################################################################
-+# #
-+# Copyright 2016 CloudFlare LTD #
-+# #
-+# Licensed under the Apache License, Version 2.0 (the "License"); #
-+# you may not use this file except in compliance with the License. #
-+# You may obtain a copy of the License at #
-+# #
-+# http://www.apache.org/licenses/LICENSE-2.0 #
-+# #
-+# Unless required by applicable law or agreed to in writing, software #
-+# distributed under the License is distributed on an "AS IS" BASIS, #
-+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
-+# See the License for the specific language governing permissions and #
-+# limitations under the License. #
-+# #
-+##############################################################################
-+# #
-+# Author: Vlad Krasnov #
-+# #
-+##############################################################################
-+
-+$flavour = shift;
-+$output = shift;
-+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-+
-+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-+
-+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
-+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
-+die "can't locate x86_64-xlate.pl";
-+
-+open OUT,"| \"$^X\" $xlate $flavour $output";
-+*STDOUT=*OUT;
-+
-+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
-+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
-+ $avx = ($1>=2.19) + ($1>=2.22);
-+}
-+
-+if ($win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
-+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
-+ $avx = ($1>=2.09) + ($1>=2.10);
-+}
-+
-+if ($win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
-+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
-+ $avx = ($1>=10) + ($1>=11);
-+}
-+
-+if (`$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
-+ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
-+ $avx = ($ver>=3.0) + ($ver>=3.01);
-+}
-+
-+
-+{
-+{
-+
-+my ($state, $key)
-+ =("%rdi", "%rsi");
-+
-+$code.=<<___;
-+
-+.LrSet:
-+.align 16
-+.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
-+###############################################################################
-+# void poly1305_init_x64(void *state, uint8_t key[32])
-+
-+.globl poly1305_init_x64
-+.type poly1305_init_x64, \@function, 2
-+.align 64
-+poly1305_init_x64:
-+
-+ xor %rax, %rax
-+ mov %rax, 8*0($state)
-+ mov %rax, 8*1($state)
-+ mov %rax, 8*2($state)
-+
-+ movdqu 16*0($key), %xmm0
-+ movdqu 16*1($key), %xmm1
-+ pand .LrSet(%rip), %xmm0
-+
-+ movdqu %xmm0, 8*3($state)
-+ movdqu %xmm1, 8*3+16($state)
-+ movq \$0, 8*7($state)
-+
-+ ret
-+.size poly1305_init_x64,.-poly1305_init_x64
-+___
-+}
-+
-+{
-+
-+my ($state, $inp)
-+ =("%rdi", "%rsi");
-+
-+my ($acc0, $acc1, $acc2, $inl, $t0, $t1, $t2, $t3, $r0)
-+ =("%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15");
-+
-+my ($r1)
-+ =("8*4($state)");
-+
-+$code.=<<___;
-+###############################################################################
-+# void* poly1305_update_x64(void* state, void* in, uint64_t in_len)
-+.globl poly1305_update_x64
-+.type poly1305_update_x64, \@function, 2
-+.align 64
-+poly1305_update_x64:
-+
-+ push %r11
-+ push %r12
-+ push %r13
-+ push %r14
-+ push %r15
-+
-+ mov %rdx, $inl
-+
-+ mov 8*0($state), $acc0
-+ mov 8*1($state), $acc1
-+ mov 8*2($state), $acc2
-+ mov 8*3($state), $r0
-+
-+ cmp \$16, $inl
-+ jb 2f
-+ jmp 1f
-+
-+.align 64
-+1:
-+############################
-+ add 8*0($inp), $acc0
-+ adc 8*1($inp), $acc1
-+ lea 16($inp), $inp
-+ adc \$1, $acc2
-+
-+5:
-+ mov $r0, %rax
-+ mulq $acc0
-+ mov %rax, $t0
-+ mov %rdx, $t1
-+
-+ mov $r0, %rax
-+ mulq $acc1
-+ add %rax, $t1
-+ adc \$0, %rdx
-+
-+ mov $r0, $t2
-+ imul $acc2, $t2
-+ add %rdx, $t2
-+############################
-+ mov $r1, %rax
-+ mulq $acc0
-+ add %rax, $t1
-+ adc \$0, %rdx
-+ mov %rdx, $acc0
-+
-+ mov $r1, %rax
-+ mulq $acc1
-+ add $acc0, $t2
-+ adc \$0, %rdx
-+ add %rax, $t2
-+ adc \$0, %rdx
-+
-+ mov $r1, $t3
-+ imul $acc2, $t3
-+ add %rdx, $t3
-+############################
-+
-+ mov $t0, $acc0
-+ mov $t1, $acc1
-+ mov $t2, $acc2
-+ and \$3, $acc2
-+
-+ mov $t2, $t0
-+ mov $t3, $t1
-+
-+ and \$-4, $t0
-+ shrd \$2, $t3, $t2
-+ shr \$2, $t3
-+
-+ add $t0, $acc0
-+ adc $t1, $acc1
-+ adc \$0, $acc2
-+
-+ add $t2, $acc0
-+ adc $t3, $acc1
-+ adc \$0, $acc2
-+
-+ sub \$16, $inl
-+ cmp \$16, $inl
-+ jae 1b
-+
-+2:
-+ test $inl, $inl
-+ jz 3f
-+
-+ mov \$1, $t0
-+ xor $t1, $t1
-+ xor $t2, $t2
-+ add $inl, $inp
-+
-+4:
-+ shld \$8, $t0, $t1
-+ shl \$8, $t0
-+ movzxb -1($inp), $t2
-+ xor $t2, $t0
-+ dec $inp
-+ dec $inl
-+ jnz 4b
-+
-+ add $t0, $acc0
-+ adc $t1, $acc1
-+ adc \$0, $acc2
-+
-+ mov \$16, $inl
-+ jmp 5b
-+
-+3:
-+
-+ mov $acc0, 8*0($state)
-+ mov $acc1, 8*1($state)
-+ mov $acc2, 8*2($state)
-+
-+ pop %r15
-+ pop %r14
-+ pop %r13
-+ pop %r12
-+ pop %r11
-+ ret
-+.size poly1305_update_x64, .-poly1305_update_x64
-+___
-+}
-+
-+{
-+
-+my ($mac, $state)=("%rsi", "%rdi");
-+
-+my ($acc0, $acc1, $acc2, $t0, $t1, $t2)
-+ =("%rcx", "%rax", "%rdx", "%r8", "%r9", "%r10");
-+
-+$code.=<<___;
-+###############################################################################
-+# void poly1305_finish_x64(void* state, uint64_t mac[2]);
-+.type poly1305_finish_x64,\@function, 2
-+.align 64
-+.globl poly1305_finish_x64
-+poly1305_finish_x64:
-+
-+ mov 8*0($state), $acc0
-+ mov 8*1($state), $acc1
-+ mov 8*2($state), $acc2
-+
-+ mov $acc0, $t0
-+ mov $acc1, $t1
-+ mov $acc2, $t2
-+
-+ sub \$-5, $acc0
-+ sbb \$-1, $acc1
-+ sbb \$3, $acc2
-+
-+ cmovc $t0, $acc0
-+ cmovc $t1, $acc1
-+ cmovc $t2, $acc2
-+
-+ add 8*5($state), $acc0
-+ adc 8*6($state), $acc1
-+ mov $acc0, ($mac)
-+ mov $acc1, 8($mac)
-+
-+ ret
-+.size poly1305_finish_x64, .-poly1305_finish_x64
-+___
-+}
-+}
-+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
-+print $code;
-+close STDOUT;
-diff --git a/crypto/chacha20_poly1305/chacha20.c b/crypto/chacha20_poly1305/chacha20.c
-new file mode 100644
-index 0000000..b48d857
---- /dev/null
-+++ b/crypto/chacha20_poly1305/chacha20.c
-@@ -0,0 +1,142 @@
-+/* Copyright (c) 2014, Google Inc.
-+ *
-+ * Permission to use, copy, modify, and/or distribute this software for any
-+ * purpose with or without fee is hereby granted, provided that the above
-+ * copyright notice and this permission notice appear in all copies.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
-+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
-+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
-+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-+
-+/* Adapted from the public domain, estream code by D. Bernstein. */
-+
-+#include "chacha20poly1305.h"
-+
-+/* sigma contains the ChaCha constants, which happen to be an ASCII string. */
-+static const char sigma[16] = "expand 32-byte k";
-+
-+#define ROTATE(v, n) (((v) << (n)) | ((v) >> (32 - (n))))
-+#define XOR(v, w) ((v) ^ (w))
-+#define PLUS(x, y) ((x) + (y))
-+#define PLUSONE(v) (PLUS((v), 1))
-+
-+#define U32TO8_LITTLE(p, v) \
-+ { \
-+ (p)[0] = (v >> 0) & 0xff; \
-+ (p)[1] = (v >> 8) & 0xff; \
-+ (p)[2] = (v >> 16) & 0xff; \
-+ (p)[3] = (v >> 24) & 0xff; \
-+ }
-+
-+#define U8TO32_LITTLE(p) \
-+ (((uint32_t)((p)[0])) | ((uint32_t)((p)[1]) << 8) | \
-+ ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24))
-+
-+/* QUARTERROUND updates a, b, c, d with a ChaCha "quarter" round. */
-+#define QUARTERROUND(a,b,c,d) \
-+ x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]),16); \
-+ x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]),12); \
-+ x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]), 8); \
-+ x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]), 7);
-+
-+/* chacha_core performs |num_rounds| rounds of ChaCha20 on the input words in
-+ * |input| and writes the 64 output bytes to |output|. */
-+static void chacha_core(uint8_t output[64], const uint32_t input[16]) {
-+ uint32_t x[16];
-+ int i;
-+
-+ memcpy(x, input, sizeof(uint32_t) * 16);
-+ for (i = 20; i > 0; i -= 2) {
-+ QUARTERROUND(0, 4, 8, 12)
-+ QUARTERROUND(1, 5, 9, 13)
-+ QUARTERROUND(2, 6, 10, 14)
-+ QUARTERROUND(3, 7, 11, 15)
-+ QUARTERROUND(0, 5, 10, 15)
-+ QUARTERROUND(1, 6, 11, 12)
-+ QUARTERROUND(2, 7, 8, 13)
-+ QUARTERROUND(3, 4, 9, 14)
-+ }
-+
-+ for (i = 0; i < 16; ++i) {
-+ x[i] = PLUS(x[i], input[i]);
-+ }
-+ for (i = 0; i < 16; ++i) {
-+ U32TO8_LITTLE(output + 4 * i, x[i]);
-+ }
-+}
-+
-+#if CHAPOLY_ASM
-+void chacha_20_core_asm(uint8_t *out, const uint8_t *in, size_t in_len,
-+ uint8_t nonce[48]);
-+#endif
-+
-+void CRYPTO_chacha_20(uint8_t *out, const uint8_t *in, size_t in_len,
-+ uint8_t nonce[48]) {
-+
-+ uint8_t buf[64];
-+ uint32_t input[16];
-+ size_t todo, i;
-+
-+#ifdef CHAPOLY_ASM
-+ chacha_20_core_asm(out, in, in_len, nonce);
-+ todo = in_len & (63);
-+
-+ if(todo) {
-+ out += in_len - todo;
-+ in += in_len - todo;
-+ memcpy(buf, in, todo);
-+
-+ chacha_20_core_asm(buf, buf, sizeof(buf), nonce);
-+
-+ memcpy(out, buf, todo);
-+ memset(buf, 0, sizeof(buf));
-+ }
-+ return;
-+#endif
-+
-+ input[0] = U8TO32_LITTLE(sigma + 0);
-+ input[1] = U8TO32_LITTLE(sigma + 4);
-+ input[2] = U8TO32_LITTLE(sigma + 8);
-+ input[3] = U8TO32_LITTLE(sigma + 12);
-+
-+ input[4] = U8TO32_LITTLE(nonce + 0);
-+ input[5] = U8TO32_LITTLE(nonce + 4);
-+ input[6] = U8TO32_LITTLE(nonce + 8);
-+ input[7] = U8TO32_LITTLE(nonce + 12);
-+
-+ input[8] = U8TO32_LITTLE(nonce + 16);
-+ input[9] = U8TO32_LITTLE(nonce + 20);
-+ input[10] = U8TO32_LITTLE(nonce + 24);
-+ input[11] = U8TO32_LITTLE(nonce + 28);
-+
-+ input[12] = U8TO32_LITTLE(nonce + 32);
-+ input[13] = U8TO32_LITTLE(nonce + 36);
-+ input[14] = U8TO32_LITTLE(nonce + 40);
-+ input[15] = U8TO32_LITTLE(nonce + 44);
-+
-+ while (in_len > 0) {
-+ todo = 64;
-+ if (in_len < todo) {
-+ todo = in_len;
-+ }
-+
-+ chacha_core(buf, input);
-+ for (i = 0; i < todo; i++) {
-+ out[i] = in[i] ^ buf[i];
-+ }
-+
-+ out += todo;
-+ in += todo;
-+ in_len -= todo;
-+
-+ ((uint64_t*)input)[6]++;
-+ }
-+
-+ U32TO8_LITTLE(nonce + 32, input[12]);
-+ U32TO8_LITTLE(nonce + 36, input[13]);
-+}
-+
-diff --git a/crypto/chacha20_poly1305/chacha20poly1305.h b/crypto/chacha20_poly1305/chacha20poly1305.h
-new file mode 100644
-index 0000000..3968c40
---- /dev/null
-+++ b/crypto/chacha20_poly1305/chacha20poly1305.h
-@@ -0,0 +1,64 @@
-+/* Copyright (c) 2014, Google Inc.
-+ *
-+ * Permission to use, copy, modify, and/or distribute this software for any
-+ * purpose with or without fee is hereby granted, provided that the above
-+ * copyright notice and this permission notice appear in all copies.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
-+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
-+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
-+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-+
-+#ifndef OPENSSL_HEADER_POLY1305_H
-+#define OPENSSL_HEADER_POLY1305_H
-+
-+#include <stdint.h>
-+#include <stddef.h>
-+#include <string.h>
-+#include "crypto.h"
-+
-+#ifdef __cplusplus
-+extern "C" {
-+#endif
-+
-+#define POLY1305_MAC_LEN (16)
-+#define POLY1305_PAD_LEN (16)
-+
-+typedef unsigned char poly1305_state[92];
-+
-+
-+/* CRYPTO_poly1305_init sets up |state| so that it can be used to calculate an
-+ * authentication tag with the one-time key |key|. Note that |key| is a
-+ * one-time key and therefore there is no `reset' method because that would
-+ * enable several messages to be authenticated with the same key. */
-+void CRYPTO_poly1305_init(poly1305_state* state, const uint8_t key[32]);
-+
-+/* CRYPTO_poly1305_update processes |in_len| bytes from |in|. It can be called
-+ * zero or more times after poly1305_init. */
-+void CRYPTO_poly1305_update(poly1305_state* state, const uint8_t* in,
-+ size_t in_len);
-+
-+/* CRYPTO_poly1305_finish completes the poly1305 calculation and writes a 16
-+ * byte authentication tag to |mac|. */
-+void CRYPTO_poly1305_finish(poly1305_state* state,
-+ uint8_t mac[POLY1305_MAC_LEN]);
-+
-+/* CRYPTO_chacha_20 encrypts |in_len| bytes from |in| with the given key and
-+ * nonce and writes the result to |out|, which may be equal to |in|. The
-+ * initial block counter is specified by |counter|. */
-+void CRYPTO_chacha_20(uint8_t *out, const uint8_t *in, size_t in_len,
-+ uint8_t nonce[48]);
-+
-+#if CHAPOLY_ASM
-+int chacha20_poly1305_open(uint8_t *pt, const uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *key);
-+void chacha20_poly1305_seal(uint8_t *ct, const uint8_t *pt, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *key);
-+#endif
-+
-+#if defined(__cplusplus)
-+} /* extern C */
-+#endif
-+
-+#endif /* OPENSSL_HEADER_POLY1305_H */
-diff --git a/crypto/chacha20_poly1305/poly1305.c b/crypto/chacha20_poly1305/poly1305.c
-new file mode 100644
-index 0000000..6bd553b
---- /dev/null
-+++ b/crypto/chacha20_poly1305/poly1305.c
-@@ -0,0 +1,355 @@
-+/* Copyright (c) 2014, Google Inc.
-+ *
-+ * Permission to use, copy, modify, and/or distribute this software for any
-+ * purpose with or without fee is hereby granted, provided that the above
-+ * copyright notice and this permission notice appear in all copies.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
-+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
-+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
-+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-+
-+/* This implementation of poly1305 is by Andrew Moon
-+ * (https://github.com/floodyberry/poly1305-donna) and released as public
-+ * domain. */
-+
-+#include "chacha20poly1305.h"
-+
-+#include <string.h>
-+#ifndef CHAPOLY_ASM
-+
-+#if !defined(B_ENDIAN)
-+/* We can assume little-endian. */
-+static uint32_t U8TO32_LE(const uint8_t *m) {
-+ uint32_t r;
-+ memcpy(&r, m, sizeof(r));
-+ return r;
-+}
-+
-+static void U32TO8_LE(uint8_t *m, uint32_t v) { memcpy(m, &v, sizeof(v)); }
-+#else
-+static uint32_t U8TO32_LE(const uint8_t *m) {
-+ return (uint32_t)m[0] | (uint32_t)m[1] << 8 | (uint32_t)m[2] << 16 |
-+ (uint32_t)m[3] << 24;
-+}
-+
-+static void U32TO8_LE(uint8_t *m, uint32_t v) {
-+ m[0] = v;
-+ m[1] = v >> 8;
-+ m[2] = v >> 16;
-+ m[3] = v >> 24;
-+}
-+#endif
-+
-+static uint64_t mul32x32_64(uint32_t a, uint32_t b) { return (uint64_t)a * b; }
-+
-+struct poly1305_state_st {
-+ uint32_t r0, r1, r2, r3, r4;
-+ uint32_t s1, s2, s3, s4;
-+ uint32_t h0, h1, h2, h3, h4;
-+ uint8_t buf[16];
-+ unsigned int buf_used;
-+ uint8_t key[16];
-+};
-+
-+/* poly1305_blocks updates |state| given some amount of input data. This
-+ * function may only be called with a |len| that is not a multiple of 16 at the
-+ * end of the data. Otherwise the input must be buffered into 16 byte blocks. */
-+static void poly1305_update(struct poly1305_state_st *state, const uint8_t *in,
-+ size_t len) {
-+ uint32_t t0, t1, t2, t3;
-+ uint64_t t[5];
-+ uint32_t b;
-+ uint64_t c;
-+ size_t j;
-+ uint8_t mp[16];
-+
-+ if (len < 16) {
-+ goto poly1305_donna_atmost15bytes;
-+ }
-+
-+poly1305_donna_16bytes:
-+ t0 = U8TO32_LE(in);
-+ t1 = U8TO32_LE(in + 4);
-+ t2 = U8TO32_LE(in + 8);
-+ t3 = U8TO32_LE(in + 12);
-+
-+ in += 16;
-+ len -= 16;
-+
-+ state->h0 += t0 & 0x3ffffff;
-+ state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff;
-+ state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff;
-+ state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff;
-+ state->h4 += (t3 >> 8) | (1 << 24);
-+
-+poly1305_donna_mul:
-+ t[0] = mul32x32_64(state->h0, state->r0) + mul32x32_64(state->h1, state->s4) +
-+ mul32x32_64(state->h2, state->s3) + mul32x32_64(state->h3, state->s2) +
-+ mul32x32_64(state->h4, state->s1);
-+ t[1] = mul32x32_64(state->h0, state->r1) + mul32x32_64(state->h1, state->r0) +
-+ mul32x32_64(state->h2, state->s4) + mul32x32_64(state->h3, state->s3) +
-+ mul32x32_64(state->h4, state->s2);
-+ t[2] = mul32x32_64(state->h0, state->r2) + mul32x32_64(state->h1, state->r1) +
-+ mul32x32_64(state->h2, state->r0) + mul32x32_64(state->h3, state->s4) +
-+ mul32x32_64(state->h4, state->s3);
-+ t[3] = mul32x32_64(state->h0, state->r3) + mul32x32_64(state->h1, state->r2) +
-+ mul32x32_64(state->h2, state->r1) + mul32x32_64(state->h3, state->r0) +
-+ mul32x32_64(state->h4, state->s4);
-+ t[4] = mul32x32_64(state->h0, state->r4) + mul32x32_64(state->h1, state->r3) +
-+ mul32x32_64(state->h2, state->r2) + mul32x32_64(state->h3, state->r1) +
-+ mul32x32_64(state->h4, state->r0);
-+
-+ state->h0 = (uint32_t)t[0] & 0x3ffffff;
-+ c = (t[0] >> 26);
-+ t[1] += c;
-+ state->h1 = (uint32_t)t[1] & 0x3ffffff;
-+ b = (uint32_t)(t[1] >> 26);
-+ t[2] += b;
-+ state->h2 = (uint32_t)t[2] & 0x3ffffff;
-+ b = (uint32_t)(t[2] >> 26);
-+ t[3] += b;
-+ state->h3 = (uint32_t)t[3] & 0x3ffffff;
-+ b = (uint32_t)(t[3] >> 26);
-+ t[4] += b;
-+ state->h4 = (uint32_t)t[4] & 0x3ffffff;
-+ b = (uint32_t)(t[4] >> 26);
-+ state->h0 += b * 5;
-+
-+ if (len >= 16)
-+ goto poly1305_donna_16bytes;
-+
-+/* final bytes */
-+poly1305_donna_atmost15bytes:
-+ if (!len)
-+ return;
-+
-+ for (j = 0; j < len; j++)
-+ mp[j] = in[j];
-+ mp[j++] = 1;
-+ for (; j < 16; j++)
-+ mp[j] = 0;
-+ len = 0;
-+
-+ t0 = U8TO32_LE(mp + 0);
-+ t1 = U8TO32_LE(mp + 4);
-+ t2 = U8TO32_LE(mp + 8);
-+ t3 = U8TO32_LE(mp + 12);
-+
-+ state->h0 += t0 & 0x3ffffff;
-+ state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff;
-+ state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff;
-+ state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff;
-+ state->h4 += (t3 >> 8);
-+
-+ goto poly1305_donna_mul;
-+}
-+
-+void CRYPTO_poly1305_init(poly1305_state *statep, const uint8_t key[32]) {
-+ struct poly1305_state_st *state = (struct poly1305_state_st *)statep;
-+ uint32_t t0, t1, t2, t3;
-+
-+ t0 = U8TO32_LE(key + 0);
-+ t1 = U8TO32_LE(key + 4);
-+ t2 = U8TO32_LE(key + 8);
-+ t3 = U8TO32_LE(key + 12);
-+
-+ /* precompute multipliers */
-+ state->r0 = t0 & 0x3ffffff;
-+ t0 >>= 26;
-+ t0 |= t1 << 6;
-+ state->r1 = t0 & 0x3ffff03;
-+ t1 >>= 20;
-+ t1 |= t2 << 12;
-+ state->r2 = t1 & 0x3ffc0ff;
-+ t2 >>= 14;
-+ t2 |= t3 << 18;
-+ state->r3 = t2 & 0x3f03fff;
-+ t3 >>= 8;
-+ state->r4 = t3 & 0x00fffff;
-+
-+ state->s1 = state->r1 * 5;
-+ state->s2 = state->r2 * 5;
-+ state->s3 = state->r3 * 5;
-+ state->s4 = state->r4 * 5;
-+
-+ /* init state */
-+ state->h0 = 0;
-+ state->h1 = 0;
-+ state->h2 = 0;
-+ state->h3 = 0;
-+ state->h4 = 0;
-+
-+ state->buf_used = 0;
-+ memcpy(state->key, key + 16, sizeof(state->key));
-+}
-+
-+void CRYPTO_poly1305_update(poly1305_state *statep, const uint8_t *in,
-+ size_t in_len) {
-+ unsigned int i;
-+ struct poly1305_state_st *state = (struct poly1305_state_st *)statep;
-+
-+ if (state->buf_used) {
-+ unsigned int todo = 16 - state->buf_used;
-+ if (todo > in_len)
-+ todo = in_len;
-+ for (i = 0; i < todo; i++)
-+ state->buf[state->buf_used + i] = in[i];
-+ state->buf_used += todo;
-+ in_len -= todo;
-+ in += todo;
-+
-+ if (state->buf_used == 16) {
-+ poly1305_update(state, state->buf, 16);
-+ state->buf_used = 0;
-+ }
-+ }
-+
-+ if (in_len >= 16) {
-+ size_t todo = in_len & ~0xf;
-+ poly1305_update(state, in, todo);
-+ in += todo;
-+ in_len &= 0xf;
-+ }
-+
-+ if (in_len) {
-+ for (i = 0; i < in_len; i++)
-+ state->buf[i] = in[i];
-+ state->buf_used = in_len;
-+ }
-+}
-+
-+void CRYPTO_poly1305_finish(poly1305_state *statep, uint8_t mac[16]) {
-+ struct poly1305_state_st *state = (struct poly1305_state_st *)statep;
-+
-+ uint64_t f0, f1, f2, f3;
-+ uint32_t g0, g1, g2, g3, g4;
-+ uint32_t b, nb;
-+
-+ if (state->buf_used)
-+ poly1305_update(state, state->buf, state->buf_used);
-+
-+ b = state->h0 >> 26;
-+ state->h0 = state->h0 & 0x3ffffff;
-+ state->h1 += b;
-+ b = state->h1 >> 26;
-+ state->h1 = state->h1 & 0x3ffffff;
-+ state->h2 += b;
-+ b = state->h2 >> 26;
-+ state->h2 = state->h2 & 0x3ffffff;
-+ state->h3 += b;
-+ b = state->h3 >> 26;
-+ state->h3 = state->h3 & 0x3ffffff;
-+ state->h4 += b;
-+ b = state->h4 >> 26;
-+ state->h4 = state->h4 & 0x3ffffff;
-+ state->h0 += b * 5;
-+
-+ g0 = state->h0 + 5;
-+ b = g0 >> 26;
-+ g0 &= 0x3ffffff;
-+ g1 = state->h1 + b;
-+ b = g1 >> 26;
-+ g1 &= 0x3ffffff;
-+ g2 = state->h2 + b;
-+ b = g2 >> 26;
-+ g2 &= 0x3ffffff;
-+ g3 = state->h3 + b;
-+ b = g3 >> 26;
-+ g3 &= 0x3ffffff;
-+ g4 = state->h4 + b - (1 << 26);
-+
-+ b = (g4 >> 31) - 1;
-+ nb = ~b;
-+ state->h0 = (state->h0 & nb) | (g0 & b);
-+ state->h1 = (state->h1 & nb) | (g1 & b);
-+ state->h2 = (state->h2 & nb) | (g2 & b);
-+ state->h3 = (state->h3 & nb) | (g3 & b);
-+ state->h4 = (state->h4 & nb) | (g4 & b);
-+
-+ f0 = ((state->h0) | (state->h1 << 26)) + (uint64_t)U8TO32_LE(&state->key[0]);
-+ f1 = ((state->h1 >> 6) | (state->h2 << 20)) +
-+ (uint64_t)U8TO32_LE(&state->key[4]);
-+ f2 = ((state->h2 >> 12) | (state->h3 << 14)) +
-+ (uint64_t)U8TO32_LE(&state->key[8]);
-+ f3 = ((state->h3 >> 18) | (state->h4 << 8)) +
-+ (uint64_t)U8TO32_LE(&state->key[12]);
-+
-+ U32TO8_LE(&mac[0], f0);
-+ f1 += (f0 >> 32);
-+ U32TO8_LE(&mac[4], f1);
-+ f2 += (f1 >> 32);
-+ U32TO8_LE(&mac[8], f2);
-+ f3 += (f2 >> 32);
-+ U32TO8_LE(&mac[12], f3);
-+}
-+
-+#else
-+
-+struct poly1305_state_st {
-+ uint8_t opaque[8*8];
-+ uint8_t buf[16];
-+ unsigned int buf_used;
-+};
-+
-+void poly1305_init_x64(struct poly1305_state_st* state, const uint8_t key[32]);
-+void poly1305_update_x64(struct poly1305_state_st* state, const uint8_t *in, size_t in_len);
-+void poly1305_finish_x64(struct poly1305_state_st* state, uint8_t mac[16]);
-+
-+#define poly1305_update poly1305_update_x64
-+
-+void CRYPTO_poly1305_init(poly1305_state *statep, const uint8_t key[32]) {
-+ struct poly1305_state_st *state = (struct poly1305_state_st *)statep;
-+ state->buf_used = 0;
-+ return poly1305_init_x64(state, key);
-+}
-+
-+void CRYPTO_poly1305_update(poly1305_state *statep, const uint8_t *in,
-+ size_t in_len) {
-+ struct poly1305_state_st *state = (struct poly1305_state_st *)statep;
-+ int todo;
-+ /* Attempt to fill as many bytes as possible before calling the update
-+ function */
-+ if (in_len < 16 || state->buf_used) {
-+ todo = 16 - state->buf_used;
-+ todo = in_len < todo ? in_len : todo;
-+ memcpy(state->buf + state->buf_used, in, todo);
-+ state->buf_used += todo;
-+ in += todo;
-+ in_len -= todo;
-+
-+ if (state->buf_used == 16) {
-+ poly1305_update_x64(state, state->buf, 16);
-+ state->buf_used = 0;
-+ }
-+ }
-+
-+ if (in_len >= 16) {
-+ poly1305_update_x64(state, in, in_len & (-16));
-+ in += in_len & (-16);
-+ in_len &= (15);
-+ }
-+
-+ if (in_len) {
-+ memcpy(state->buf, in, in_len);
-+ state->buf_used = in_len;
-+ }
-+}
-+
-+void CRYPTO_poly1305_finish(poly1305_state *statep, uint8_t mac[16]) {
-+ struct poly1305_state_st *state = (struct poly1305_state_st *)statep;
-+
-+ if (state->buf_used) {
-+ if (state->buf_used % POLY1305_PAD_LEN) {
-+ memset(state->buf + state->buf_used, 0,
-+ POLY1305_PAD_LEN - (state->buf_used % POLY1305_PAD_LEN));
-+ }
-+ poly1305_update_x64(state, state->buf, state->buf_used);
-+ }
-+
-+ poly1305_finish_x64(state, mac);
-+}
-+#endif
-diff --git a/crypto/evp/Makefile b/crypto/evp/Makefile
-index fa138d0..c87896b 100644
---- a/crypto/evp/Makefile
-+++ b/crypto/evp/Makefile
-@@ -29,7 +29,8 @@ LIBSRC= encode.c digest.c evp_enc.c evp_key.c evp_acnf.c evp_cnf.c \
- c_all.c c_allc.c c_alld.c evp_lib.c bio_ok.c \
- evp_pkey.c evp_pbe.c p5_crpt.c p5_crpt2.c \
- e_old.c pmeth_lib.c pmeth_fn.c pmeth_gn.c m_sigver.c \
-- e_aes_cbc_hmac_sha1.c e_aes_cbc_hmac_sha256.c e_rc4_hmac_md5.c
-+ e_aes_cbc_hmac_sha1.c e_aes_cbc_hmac_sha256.c e_rc4_hmac_md5.c \
-+ e_chacha20_poly1305.c
-
- LIBOBJ= encode.o digest.o evp_enc.o evp_key.o evp_acnf.o evp_cnf.o \
- e_des.o e_bf.o e_idea.o e_des3.o e_camellia.o\
-@@ -42,7 +43,8 @@ LIBOBJ= encode.o digest.o evp_enc.o evp_key.o evp_acnf.o evp_cnf.o \
- c_all.o c_allc.o c_alld.o evp_lib.o bio_ok.o \
- evp_pkey.o evp_pbe.o p5_crpt.o p5_crpt2.o \
- e_old.o pmeth_lib.o pmeth_fn.o pmeth_gn.o m_sigver.o \
-- e_aes_cbc_hmac_sha1.o e_aes_cbc_hmac_sha256.o e_rc4_hmac_md5.o
-+ e_aes_cbc_hmac_sha1.o e_aes_cbc_hmac_sha256.o e_rc4_hmac_md5.o \
-+ e_chacha20_poly1305.o
-
- SRC= $(LIBSRC)
-
-@@ -793,3 +795,5 @@ pmeth_lib.o: ../../include/openssl/sha.h ../../include/openssl/stack.h
- pmeth_lib.o: ../../include/openssl/symhacks.h ../../include/openssl/x509.h
- pmeth_lib.o: ../../include/openssl/x509_vfy.h ../asn1/asn1_locl.h ../cryptlib.h
- pmeth_lib.o: evp_locl.h pmeth_lib.c
-+e_chacha20_poly1305.o: ../../include/openssl/chacha20poly1305.h
-+e_chacha20_poly1305.o: e_chacha20_poly1305.c
-diff --git a/crypto/evp/c_allc.c b/crypto/evp/c_allc.c
-index 280e584..694f168 100644
---- a/crypto/evp/c_allc.c
-+++ b/crypto/evp/c_allc.c
-@@ -238,4 +238,9 @@ void OpenSSL_add_all_ciphers(void)
- EVP_add_cipher_alias(SN_camellia_256_cbc, "CAMELLIA256");
- EVP_add_cipher_alias(SN_camellia_256_cbc, "camellia256");
- #endif
-+
-+#ifndef OPENSSL_NO_CHACHA_POLY
-+ EVP_add_cipher(EVP_chacha20_poly1305());
-+ EVP_add_cipher(EVP_chacha20_poly1305_draft());
-+#endif
- }
-diff --git a/crypto/evp/e_chacha20_poly1305.c b/crypto/evp/e_chacha20_poly1305.c
-new file mode 100644
-index 0000000..1e072ec
---- /dev/null
-+++ b/crypto/evp/e_chacha20_poly1305.c
-@@ -0,0 +1,362 @@
-+/* ====================================================================
-+ * Copyright (c) 2001-2014 The OpenSSL Project. All rights reserved.
-+ *
-+ * Redistribution and use in source and binary forms, with or without
-+ * modification, are permitted provided that the following conditions
-+ * are met:
-+ *
-+ * 1. Redistributions of source code must retain the above copyright
-+ * notice, this list of conditions and the following disclaimer.
-+ *
-+ * 2. Redistributions in binary form must reproduce the above copyright
-+ * notice, this list of conditions and the following disclaimer in
-+ * the documentation and/or other materials provided with the
-+ * distribution.
-+ *
-+ * 3. All advertising materials mentioning features or use of this
-+ * software must display the following acknowledgment:
-+ * "This product includes software developed by the OpenSSL Project
-+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
-+ *
-+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
-+ * endorse or promote products derived from this software without
-+ * prior written permission. For written permission, please contact
-+ * openssl-core@openssl.org.
-+ *
-+ * 5. Products derived from this software may not be called "OpenSSL"
-+ * nor may "OpenSSL" appear in their names without prior written
-+ * permission of the OpenSSL Project.
-+ *
-+ * 6. Redistributions of any form whatsoever must retain the following
-+ * acknowledgment:
-+ * "This product includes software developed by the OpenSSL Project
-+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
-+ *
-+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
-+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
-+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
-+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
-+ * OF THE POSSIBILITY OF SUCH DAMAGE.
-+ * ====================================================================
-+ *
-+ */
-+
-+#include <openssl/opensslconf.h>
-+#ifndef OPENSSL_NO_CHACHA_POLY
-+# include <openssl/evp.h>
-+# include <openssl/chacha20poly1305.h>
-+
-+#define FILL_BUFFER ((size_t)128)
-+
-+typedef struct {
-+ uint8_t iv[12];
-+ uint8_t nonce[48];
-+ size_t aad_l;
-+ size_t ct_l;
-+ unsigned valid:1;
-+ unsigned draft:1;
-+ uint8_t poly_buffer[FILL_BUFFER];
-+ uint8_t chacha_buffer[FILL_BUFFER];
-+ uint16_t poly_buffer_used;
-+ uint16_t chacha_used;
-+ poly1305_state poly_state;
-+ #define poly_finish(c,m) CRYPTO_poly1305_finish(&c->poly_state,m)
-+} EVP_CHACHA20_POLY1305_CTX;
-+
-+static int EVP_chacha20_poly1305_init_draft(EVP_CIPHER_CTX *ctx,
-+ const unsigned char *key,
-+ const unsigned char *iv,
-+ int enc)
-+{
-+ EVP_CHACHA20_POLY1305_CTX *aead_ctx = ctx->cipher_data;
-+ memcpy(aead_ctx->nonce, key, 32);
-+ aead_ctx->valid = 0;
-+ aead_ctx->draft = 1;
-+ return 1;
-+}
-+
-+static int EVP_chacha20_poly1305_init(EVP_CIPHER_CTX *ctx,
-+ const unsigned char *key,
-+ const unsigned char *iv,
-+ int enc)
-+{
-+ EVP_CHACHA20_POLY1305_CTX *aead_ctx = ctx->cipher_data;
-+ memcpy(aead_ctx->nonce, key, 32);
-+ memcpy(aead_ctx->iv, iv, 12);
-+ aead_ctx->valid = 0;
-+ aead_ctx->draft = 0;
-+ return 1;
-+}
-+
-+static int EVP_chacha20_poly1305_cipher(EVP_CIPHER_CTX *ctx,
-+ unsigned char *out,
-+ const unsigned char *in,
-+ size_t inl)
-+{
-+ EVP_CHACHA20_POLY1305_CTX *aead_ctx = ctx->cipher_data;
-+ uint8_t poly_mac[POLY1305_MAC_LEN];
-+ uint8_t zero[POLY1305_PAD_LEN] = {0};
-+ uint64_t cmp;
-+ int i, todo;
-+
-+ if (!aead_ctx->valid)
-+ return 0;
-+
-+ if (inl < POLY1305_MAC_LEN)
-+ return -1;
-+
-+ /* Fix for MAC */
-+ inl -= POLY1305_MAC_LEN;
-+
-+#if (CHAPOLY_ASM)
-+ if (!aead_ctx->draft) {
-+ aead_ctx->valid = 0;
-+ if (ctx->encrypt) {
-+ chacha20_poly1305_seal(out, in, inl,
-+ aead_ctx->poly_buffer,
-+ aead_ctx->poly_buffer_used,
-+ aead_ctx->nonce);
-+ } else {
-+ int cmp = chacha20_poly1305_open(out, in, inl,
-+ aead_ctx->poly_buffer,
-+ aead_ctx->poly_buffer_used,
-+ aead_ctx->nonce);
-+ if (!cmp) {
-+ OPENSSL_cleanse(out, inl);
-+ return -1;
-+ }
-+ }
-+ return inl;
-+ }
-+#endif
-+
-+ if (!ctx->encrypt) {
-+ CRYPTO_poly1305_update(&aead_ctx->poly_state, in, inl);
-+ }
-+
-+ i = 0;
-+ if (inl < 256) {
-+ /* Consume the buffer we computed during poly initialization */
-+ todo = inl > (FILL_BUFFER - aead_ctx->chacha_used) ?
-+ FILL_BUFFER - aead_ctx->chacha_used :
-+ inl;
-+
-+ for (; i < todo; i++) {
-+ out[i] = in[i] ^ aead_ctx->chacha_buffer[i + 64 /*aead_ctx->chacha_used*/];
-+ }
-+
-+ } else {
-+ /* For long messages don't use precomputed buffer */
-+ ((uint64_t *)(aead_ctx->nonce))[4]--;
-+ }
-+
-+ todo = inl - i;
-+
-+ if (todo) {
-+ CRYPTO_chacha_20(&out[i], &in[i], todo, aead_ctx->nonce);
-+ }
-+
-+ if (ctx->encrypt) {
-+ CRYPTO_poly1305_update(&aead_ctx->poly_state, out, inl);
-+ }
-+
-+ aead_ctx->ct_l += inl;
-+
-+ if (!aead_ctx->draft) {
-+ /* For RFC padd ciphertext with zeroes, then mac len(aad)||len(ct) */
-+ todo = aead_ctx->ct_l % POLY1305_PAD_LEN ?
-+ POLY1305_PAD_LEN - (aead_ctx->ct_l % POLY1305_PAD_LEN) :
-+ 0;
-+
-+ if (todo) {
-+ CRYPTO_poly1305_update(&aead_ctx->poly_state, zero, todo);
-+ }
-+
-+ CRYPTO_poly1305_update(&aead_ctx->poly_state, (uint8_t*)&aead_ctx->aad_l, 8);
-+ CRYPTO_poly1305_update(&aead_ctx->poly_state, (uint8_t*)&aead_ctx->ct_l, 8);
-+
-+ } else {
-+ /* For the draft don't pad, mac len(ct) */
-+ CRYPTO_poly1305_update(&aead_ctx->poly_state, (uint8_t*)&aead_ctx->ct_l, 8);
-+ }
-+ aead_ctx->valid = 0;
-+
-+ if (ctx->encrypt) {
-+ poly_finish(aead_ctx, &out[inl]);
-+ return inl + POLY1305_MAC_LEN;
-+
-+ } else { /* Decryption */
-+ poly_finish(aead_ctx, poly_mac);
-+ /* Constant time comparison */
-+ cmp = (*(uint64_t *)(poly_mac)) ^ (*(uint64_t *)(in + inl));
-+ cmp |= (*(uint64_t *)(poly_mac + 8)) ^ (*(uint64_t *)(in + inl + 8));
-+
-+ if (cmp) {
-+ OPENSSL_cleanse(out, inl);
-+ return -1;
-+ }
-+
-+ return inl;
-+ }
-+}
-+
-+
-+static int EVP_chacha20_poly1305_cleanup(EVP_CIPHER_CTX *ctx)
-+{
-+ return 1;
-+}
-+
-+
-+static int EVP_chacha20_poly1305_ctrl(EVP_CIPHER_CTX *ctx,
-+ int type,
-+ int arg,
-+ void *ptr)
-+{
-+ EVP_CHACHA20_POLY1305_CTX *aead_ctx = ctx->cipher_data;
-+ uint8_t aad[EVP_AEAD_TLS1_AAD_LEN + 8];
-+ uint64_t thirteen = EVP_AEAD_TLS1_AAD_LEN;
-+
-+ switch (type) {
-+ case EVP_CTRL_AEAD_TLS1_AAD:
-+
-+ /* Initialize poly keys */
-+ memset(aead_ctx->chacha_buffer, 0, FILL_BUFFER);
-+
-+ if (!aead_ctx->draft) {
-+ /* RFC IV = (0 || iv) ^ seq_num */
-+ memset(aead_ctx->nonce + 32, 0, 4);
-+ memcpy(aead_ctx->nonce + 36, aead_ctx->iv, 12);
-+ *(uint64_t *)(aead_ctx->nonce + 40) ^= *(uint64_t *)(ptr);
-+
-+ } else {
-+ /* draft IV = 0 || seq_num */
-+ memset(aead_ctx->nonce + 32, 0, 8);
-+ memcpy(aead_ctx->nonce + 40, ptr, 8);
-+ }
-+
-+#if (CHAPOLY_ASM)
-+ if (!aead_ctx->draft) {
-+ if (arg == EVP_AEAD_TLS1_AAD_LEN) {
-+ /* For RFC, use optimized seal/open */
-+ memcpy(aad, ptr, arg);
-+ unsigned int len = (aad[arg-2] << 8) | aad[arg-1];
-+ if (!ctx->encrypt) {
-+ len -= POLY1305_MAC_LEN;
-+ aad[arg-2] = len>>8;
-+ aad[arg-1] = len & 0xff;
-+ }
-+ memcpy(aead_ctx->poly_buffer, aad, arg);
-+ } else if (arg <= FILL_BUFFER) {
-+ memcpy(aead_ctx->poly_buffer, ptr, arg);
-+ } else {
-+ aead_ctx->valid = 0;
-+ return 0;
-+ }
-+ aead_ctx->valid = 1;
-+ aead_ctx->poly_buffer_used = arg;
-+ return POLY1305_MAC_LEN;
-+ }
-+#endif
-+ /* Poly keys = ENC(0) */
-+ CRYPTO_chacha_20(aead_ctx->chacha_buffer,
-+ aead_ctx->chacha_buffer,
-+ FILL_BUFFER,
-+ aead_ctx->nonce);
-+
-+ CRYPTO_poly1305_init(&aead_ctx->poly_state, aead_ctx->chacha_buffer);
-+
-+ aead_ctx->chacha_used = 64;
-+ aead_ctx->poly_buffer_used = 0;
-+ aead_ctx->aad_l = arg;
-+ aead_ctx->ct_l = 0;
-+
-+ /* Absorb AAD */
-+ memcpy(aad, ptr, arg);
-+ memset(aad + arg, 0, sizeof(aad) - arg);
-+
-+ /* If decrypting fix length for tag */
-+ if (!ctx->encrypt) {
-+ unsigned int len = (aad[arg-2] << 8) | aad[arg-1];
-+ len -= POLY1305_MAC_LEN;
-+ aad[arg-2] = len>>8;
-+ aad[arg-1] = len & 0xff;
-+ }
-+
-+ if (!aead_ctx->draft) {
-+ /* In the RFC, AAD is padded with zeroes */
-+ CRYPTO_poly1305_update(&aead_ctx->poly_state, aad, POLY1305_PAD_LEN);
-+
-+ } else {
-+ /* In the draft AAD is followed by len(AAD) */
-+ memcpy(&aad[arg], &thirteen, sizeof(thirteen));
-+ CRYPTO_poly1305_update(&aead_ctx->poly_state, aad, arg + sizeof(thirteen));
-+ }
-+
-+ aead_ctx->valid = 1;
-+ return POLY1305_MAC_LEN;
-+
-+ break;
-+
-+ default:
-+ return 0;
-+ break;
-+ }
-+
-+ return 0;
-+}
-+
-+
-+#define CUSTOM_FLAGS (\
-+ EVP_CIPH_CUSTOM_IV | EVP_CIPH_FLAG_CUSTOM_CIPHER \
-+ | EVP_CIPH_ALWAYS_CALL_INIT \
-+ | EVP_CIPH_CUSTOM_COPY)
-+
-+
-+static const EVP_CIPHER chacha20_poly1305_d = {
-+ NID_chacha20_poly1305_draft,
-+ 1, /* block size, sorta */
-+ 32, /* key len */
-+ 0, /* iv len */
-+ CUSTOM_FLAGS|EVP_CIPH_FLAG_AEAD_CIPHER, /* flags */
-+ EVP_chacha20_poly1305_init_draft,
-+ EVP_chacha20_poly1305_cipher,
-+ EVP_chacha20_poly1305_cleanup,
-+ sizeof(EVP_CHACHA20_POLY1305_CTX), /* ctx size */
-+ NULL,
-+ NULL,
-+ EVP_chacha20_poly1305_ctrl,
-+ NULL
-+ };
-+
-+
-+static const EVP_CIPHER chacha20_poly1305 = {
-+ NID_chacha20_poly1305,
-+ 1, /* block size, sorta */
-+ 32, /* key len */
-+ 12, /* iv len */
-+ CUSTOM_FLAGS|EVP_CIPH_FLAG_AEAD_CIPHER, /* flags */
-+ EVP_chacha20_poly1305_init,
-+ EVP_chacha20_poly1305_cipher,
-+ EVP_chacha20_poly1305_cleanup,
-+ sizeof(EVP_CHACHA20_POLY1305_CTX), /* ctx size */
-+ NULL,
-+ NULL,
-+ EVP_chacha20_poly1305_ctrl,
-+ NULL
-+ };
-+
-+
-+const EVP_CIPHER *EVP_chacha20_poly1305_draft(void)
-+{ return &chacha20_poly1305_d; }
-+
-+
-+const EVP_CIPHER *EVP_chacha20_poly1305(void)
-+{ return &chacha20_poly1305; }
-+#endif
-diff --git a/crypto/evp/evp.h b/crypto/evp/evp.h
-index 39ab793..8feaabc 100644
---- a/crypto/evp/evp.h
-+++ b/crypto/evp/evp.h
-@@ -902,6 +902,11 @@ const EVP_CIPHER *EVP_seed_cfb128(void);
- const EVP_CIPHER *EVP_seed_ofb(void);
- # endif
-
-+# ifndef OPENSSL_NO_CHACHA_POLY
-+const EVP_CIPHER *EVP_chacha20_poly1305(void);
-+const EVP_CIPHER *EVP_chacha20_poly1305_draft(void);
-+# endif
-+
- void OPENSSL_add_all_algorithms_noconf(void);
- void OPENSSL_add_all_algorithms_conf(void);
-
-diff --git a/crypto/objects/obj_dat.h b/crypto/objects/obj_dat.h
-index b7e3cf2..26612e2 100644
---- a/crypto/objects/obj_dat.h
-+++ b/crypto/objects/obj_dat.h
-@@ -62,9 +62,9 @@
- * [including the GNU Public Licence.]
- */
-
--#define NUM_NID 958
--#define NUM_SN 951
--#define NUM_LN 951
-+#define NUM_NID 960
-+#define NUM_SN 953
-+#define NUM_LN 953
- #define NUM_OBJ 890
-
- static const unsigned char lvalues[6255]={
-@@ -2514,6 +2514,9 @@ static const ASN1_OBJECT nid_objs[NUM_NID]={
- NID_jurisdictionStateOrProvinceName,11,&(lvalues[6232]),0},
- {"jurisdictionC","jurisdictionCountryName",
- NID_jurisdictionCountryName,11,&(lvalues[6243]),0},
-+{"CHACHA20-POLY1305","chacha20-poly1305",NID_chacha20_poly1305,0,NULL,0},
-+{"CHACHA20-POLY1305-D","chacha20-poly1305-draft",
-+ NID_chacha20_poly1305_draft,0,NULL,0},
- };
-
- static const unsigned int sn_objs[NUM_SN]={
-@@ -2574,6 +2577,8 @@ static const unsigned int sn_objs[NUM_SN]={
- 110, /* "CAST5-CFB" */
- 109, /* "CAST5-ECB" */
- 111, /* "CAST5-OFB" */
-+958, /* "CHACHA20-POLY1305" */
-+959, /* "CHACHA20-POLY1305-D" */
- 894, /* "CMAC" */
- 13, /* "CN" */
- 141, /* "CRLReason" */
-@@ -3728,6 +3733,8 @@ static const unsigned int ln_objs[NUM_LN]={
- 677, /* "certicom-arc" */
- 517, /* "certificate extensions" */
- 883, /* "certificateRevocationList" */
-+958, /* "chacha20-poly1305" */
-+959, /* "chacha20-poly1305-draft" */
- 54, /* "challengePassword" */
- 407, /* "characteristic-two-field" */
- 395, /* "clearance" */
-diff --git a/crypto/objects/obj_mac.h b/crypto/objects/obj_mac.h
-index 779c309..35a2364 100644
---- a/crypto/objects/obj_mac.h
-+++ b/crypto/objects/obj_mac.h
-@@ -4047,6 +4047,14 @@
- #define LN_aes_256_cbc_hmac_sha256 "aes-256-cbc-hmac-sha256"
- #define NID_aes_256_cbc_hmac_sha256 950
-
-+#define SN_chacha20_poly1305 "CHACHA20-POLY1305"
-+#define LN_chacha20_poly1305 "chacha20-poly1305"
-+#define NID_chacha20_poly1305 958
-+
-+#define SN_chacha20_poly1305_draft "CHACHA20-POLY1305-D"
-+#define LN_chacha20_poly1305_draft "chacha20-poly1305-draft"
-+#define NID_chacha20_poly1305_draft 959
-+
- #define SN_dhpublicnumber "dhpublicnumber"
- #define LN_dhpublicnumber "X9.42 DH"
- #define NID_dhpublicnumber 920
-diff --git a/crypto/objects/obj_mac.num b/crypto/objects/obj_mac.num
-index 8e5ea83..a3da329 100644
---- a/crypto/objects/obj_mac.num
-+++ b/crypto/objects/obj_mac.num
-@@ -955,3 +955,5 @@ ct_cert_scts 954
- jurisdictionLocalityName 955
- jurisdictionStateOrProvinceName 956
- jurisdictionCountryName 957
-+chacha20_poly1305 958
-+chacha20_poly1305_draft 959
-diff --git a/crypto/objects/objects.txt b/crypto/objects/objects.txt
-index b57aabb..6a34a33 100644
---- a/crypto/objects/objects.txt
-+++ b/crypto/objects/objects.txt
-@@ -1294,6 +1294,8 @@ kisa 1 6 : SEED-OFB : seed-ofb
- : AES-128-CBC-HMAC-SHA256 : aes-128-cbc-hmac-sha256
- : AES-192-CBC-HMAC-SHA256 : aes-192-cbc-hmac-sha256
- : AES-256-CBC-HMAC-SHA256 : aes-256-cbc-hmac-sha256
-+ : CHACHA20-POLY1305 : chacha20-poly1305
-+ : CHACHA20-POLY1305-D : chacha20-poly1305-draft
-
- ISO-US 10046 2 1 : dhpublicnumber : X9.42 DH
-
-diff --git a/ssl/s3_lib.c b/ssl/s3_lib.c
-index 0385e03..65fdc59 100644
---- a/ssl/s3_lib.c
-+++ b/ssl/s3_lib.c
-@@ -2945,6 +2945,110 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
- 256},
- #endif
-
-+#if !defined(OPENSSL_NO_CHACHA_POLY)
-+/* Draft ciphers */
-+ {
-+ 1,
-+ TLS1_TXT_ECDHE_RSA_WITH_CHACHA20_POLY1305_D,
-+ TLS1_CK_ECDHE_RSA_WITH_CHACHA20_POLY1305_D,
-+ SSL_kEECDH,
-+ SSL_aRSA,
-+ SSL_CHACHA20POLY1305_D,
-+ SSL_AEAD,
-+ SSL_TLSV1_2,
-+ SSL_HIGH,
-+ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256,
-+ 256,
-+ 256,
-+ },
-+
-+ {
-+ 1,
-+ TLS1_TXT_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_D,
-+ TLS1_CK_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_D,
-+ SSL_kEECDH,
-+ SSL_aECDSA,
-+ SSL_CHACHA20POLY1305_D,
-+ SSL_AEAD,
-+ SSL_TLSV1_2,
-+ SSL_HIGH,
-+ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256,
-+ 256,
-+ 256,
-+ },
-+
-+ {
-+ 1,
-+ TLS1_TXT_DHE_RSA_WITH_CHACHA20_POLY1305_D,
-+ TLS1_CK_DHE_RSA_WITH_CHACHA20_POLY1305_D,
-+ SSL_kEDH,
-+ SSL_aRSA,
-+ SSL_CHACHA20POLY1305_D,
-+ SSL_AEAD,
-+ SSL_TLSV1_2,
-+ SSL_HIGH,
-+ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256,
-+ 256,
-+ 256,
-+ },
-+ /* RFC ciphers */
-+ {
-+ 1,
-+ TLS1_TXT_ECDHE_RSA_WITH_CHACHA20_POLY1305,
-+ TLS1_CK_ECDHE_RSA_WITH_CHACHA20_POLY1305,
-+ SSL_kECDHE,
-+ SSL_aRSA,
-+ SSL_CHACHA20POLY1305,
-+ SSL_AEAD,
-+ SSL_TLSV1_2,
-+ SSL_HIGH,
-+ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256,
-+ 256,
-+ 256,
-+ },
-+ {
-+ 1,
-+ TLS1_TXT_ECDHE_ECDSA_WITH_CHACHA20_POLY1305,
-+ TLS1_CK_ECDHE_ECDSA_WITH_CHACHA20_POLY1305,
-+ SSL_kECDHE,
-+ SSL_aECDSA,
-+ SSL_CHACHA20POLY1305,
-+ SSL_AEAD,
-+ SSL_TLSV1_2,
-+ SSL_HIGH,
-+ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256,
-+ 256,
-+ 256,
-+ },
-+ {
-+ 1,
-+ TLS1_TXT_DHE_RSA_WITH_CHACHA20_POLY1305,
-+ TLS1_CK_DHE_RSA_WITH_CHACHA20_POLY1305,
-+ SSL_kDHE,
-+ SSL_aRSA,
-+ SSL_CHACHA20POLY1305,
-+ SSL_AEAD,
-+ SSL_TLSV1_2,
-+ SSL_HIGH,
-+ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256,
-+ 256,
-+ 256,
-+ },
-+ {
-+ 1,
-+ TLS1_TXT_PSK_WITH_CHACHA20_POLY1305,
-+ TLS1_CK_PSK_WITH_CHACHA20_POLY1305,
-+ SSL_kPSK,
-+ SSL_aPSK,
-+ SSL_CHACHA20POLY1305,
-+ SSL_AEAD,
-+ SSL_TLSV1_2,
-+ SSL_HIGH,
-+ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256,
-+ 256,
-+ 256,
-+ },
-+#endif
- /* end of list */
- };
-
-@@ -4090,6 +4194,7 @@ SSL_CIPHER *ssl3_choose_cipher(SSL *s, STACK_OF(SSL_CIPHER) *clnt,
- int i, ii, ok;
- CERT *cert;
- unsigned long alg_k, alg_a, mask_k, mask_a, emask_k, emask_a;
-+ int use_chacha = 0;
-
- /* Let's see which ciphers we can support */
- cert = s->cert;
-@@ -4119,13 +4224,21 @@ SSL_CIPHER *ssl3_choose_cipher(SSL *s, STACK_OF(SSL_CIPHER) *clnt,
- fprintf(stderr, "%p:%s\n", (void *)c, c->name);
- }
- #endif
--
-+retry:
- if (s->options & SSL_OP_CIPHER_SERVER_PREFERENCE || tls1_suiteb(s)) {
- prio = srvr;
- allow = clnt;
-+ /* Use ChaCha20+Poly1305 iff it's client's most preferred cipher suite */
-+ if (sk_SSL_CIPHER_num(clnt) > 0) {
-+ c = sk_SSL_CIPHER_value(clnt, 0);
-+ if (c->algorithm_enc == SSL_CHACHA20POLY1305 ||
-+ c->algorithm_enc == SSL_CHACHA20POLY1305_D)
-+ use_chacha = 1;
-+ }
- } else {
- prio = clnt;
- allow = srvr;
-+ use_chacha = 1;
- }
-
- tls1_set_cert_validity(s);
-@@ -4137,6 +4250,11 @@ SSL_CIPHER *ssl3_choose_cipher(SSL *s, STACK_OF(SSL_CIPHER) *clnt,
- if ((c->algorithm_ssl & SSL_TLSV1_2) && !SSL_USE_TLS1_2_CIPHERS(s))
- continue;
-
-+ /* Skip ChaCha unless top client priority */
-+ if ((c->algorithm_enc == SSL_CHACHA20POLY1305 ||
-+ c->algorithm_enc == SSL_CHACHA20POLY1305_D) && !use_chacha)
-+ continue;
-+
- ssl_set_cert_masks(cert, c);
- mask_k = cert->mask_k;
- mask_a = cert->mask_a;
-@@ -4216,6 +4334,14 @@ SSL_CIPHER *ssl3_choose_cipher(SSL *s, STACK_OF(SSL_CIPHER) *clnt,
- break;
- }
- }
-+
-+ if (ret == NULL && !use_chacha) {
-+ /* If no shared cipher was found due to some unusual preferences, try
-+ * again with CHACHA enabled even if not top priority */
-+ use_chacha = 1;
-+ goto retry;
-+ }
-+
- return (ret);
- }
-
-diff --git a/ssl/ssl.h b/ssl/ssl.h
-index 90aeb0c..f783baa 100644
---- a/ssl/ssl.h
-+++ b/ssl/ssl.h
-@@ -297,6 +297,8 @@ extern "C" {
- # define SSL_TXT_CAMELLIA128 "CAMELLIA128"
- # define SSL_TXT_CAMELLIA256 "CAMELLIA256"
- # define SSL_TXT_CAMELLIA "CAMELLIA"
-+# define SSL_TXT_CHACHA20_D "CHACHA20-draft"
-+# define SSL_TXT_CHACHA20 "CHACHA20"
-
- # define SSL_TXT_MD5 "MD5"
- # define SSL_TXT_SHA1 "SHA1"
-diff --git a/ssl/ssl_ciph.c b/ssl/ssl_ciph.c
-index 2ad8f43..23c1c68 100644
---- a/ssl/ssl_ciph.c
-+++ b/ssl/ssl_ciph.c
-@@ -164,11 +164,13 @@
- #define SSL_ENC_SEED_IDX 11
- #define SSL_ENC_AES128GCM_IDX 12
- #define SSL_ENC_AES256GCM_IDX 13
--#define SSL_ENC_NUM_IDX 14
-+#define SSL_ENC_CHACHA20POLY1305_DRAFT_IDX 14
-+#define SSL_ENC_CHACHA20POLY1305_IDX 15
-+#define SSL_ENC_NUM_IDX 16
-
- static const EVP_CIPHER *ssl_cipher_methods[SSL_ENC_NUM_IDX] = {
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-- NULL, NULL
-+ NULL, NULL, NULL, NULL
- };
-
- #define SSL_COMP_NULL_IDX 0
-@@ -315,6 +317,8 @@ static const SSL_CIPHER cipher_aliases[] = {
- {0, SSL_TXT_CAMELLIA256, 0, 0, 0, SSL_CAMELLIA256, 0, 0, 0, 0, 0, 0},
- {0, SSL_TXT_CAMELLIA, 0, 0, 0, SSL_CAMELLIA128 | SSL_CAMELLIA256, 0, 0, 0,
- 0, 0, 0},
-+ {0, SSL_TXT_CHACHA20_D, 0, 0, 0, SSL_CHACHA20POLY1305_D, 0, 0, 0, 0, 0, 0},
-+ {0, SSL_TXT_CHACHA20, 0, 0, 0, SSL_CHACHA20POLY1305, 0, 0, 0, 0, 0, 0},
-
- /* MAC aliases */
- {0, SSL_TXT_MD5, 0, 0, 0, 0, SSL_MD5, 0, 0, 0, 0, 0},
-@@ -431,6 +435,11 @@ void ssl_load_ciphers(void)
- ssl_cipher_methods[SSL_ENC_AES256GCM_IDX] =
- EVP_get_cipherbyname(SN_aes_256_gcm);
-
-+ ssl_cipher_methods[SSL_ENC_CHACHA20POLY1305_DRAFT_IDX] =
-+ EVP_chacha20_poly1305_draft();
-+ ssl_cipher_methods[SSL_ENC_CHACHA20POLY1305_IDX] =
-+ EVP_chacha20_poly1305();
-+
- ssl_digest_methods[SSL_MD_MD5_IDX] = EVP_get_digestbyname(SN_md5);
- ssl_mac_secret_size[SSL_MD_MD5_IDX] =
- EVP_MD_size(ssl_digest_methods[SSL_MD_MD5_IDX]);
-@@ -581,6 +590,12 @@ int ssl_cipher_get_evp(const SSL_SESSION *s, const EVP_CIPHER **enc,
- case SSL_AES256GCM:
- i = SSL_ENC_AES256GCM_IDX;
- break;
-+ case SSL_CHACHA20POLY1305_D:
-+ i = SSL_ENC_CHACHA20POLY1305_DRAFT_IDX;
-+ break;
-+ case SSL_CHACHA20POLY1305:
-+ i = SSL_ENC_CHACHA20POLY1305_IDX;
-+ break;
- default:
- i = -1;
- break;
-@@ -805,6 +820,12 @@ static void ssl_cipher_get_disabled(unsigned long *mkey, unsigned long *auth,
- (ssl_cipher_methods[SSL_ENC_GOST89_IDX] ==
- NULL) ? SSL_eGOST2814789CNT : 0;
- *enc |= (ssl_cipher_methods[SSL_ENC_SEED_IDX] == NULL) ? SSL_SEED : 0;
-+ *enc |=
-+ (ssl_cipher_methods[SSL_ENC_CHACHA20POLY1305_DRAFT_IDX] ==
-+ NULL) ? SSL_CHACHA20POLY1305_D : 0;
-+ *enc |=
-+ (ssl_cipher_methods[SSL_ENC_CHACHA20POLY1305_IDX] ==
-+ NULL) ? SSL_CHACHA20POLY1305 : 0;
-
- *mac |= (ssl_digest_methods[SSL_MD_MD5_IDX] == NULL) ? SSL_MD5 : 0;
- *mac |= (ssl_digest_methods[SSL_MD_SHA1_IDX] == NULL) ? SSL_SHA1 : 0;
-@@ -1824,6 +1845,12 @@ char *SSL_CIPHER_description(const SSL_CIPHER *cipher, char *buf, int len)
- case SSL_eGOST2814789CNT:
- enc = "GOST89(256)";
- break;
-+ case SSL_CHACHA20POLY1305_D:
-+ enc = "ChaCha20-Poly1305-draft";
-+ break;
-+ case SSL_CHACHA20POLY1305:
-+ enc = "ChaCha20-Poly1305";
-+ break;
- default:
- enc = "unknown";
- break;
-diff --git a/ssl/ssl_locl.h b/ssl/ssl_locl.h
-index 6df725f..dbe68f2 100644
---- a/ssl/ssl_locl.h
-+++ b/ssl/ssl_locl.h
-@@ -354,6 +354,8 @@
- # define SSL_SEED 0x00000800L
- # define SSL_AES128GCM 0x00001000L
- # define SSL_AES256GCM 0x00002000L
-+# define SSL_CHACHA20POLY1305_D 0x00040000L
-+# define SSL_CHACHA20POLY1305 0x00080000L /* Value from openssl */
-
- # define SSL_AES (SSL_AES128|SSL_AES256|SSL_AES128GCM|SSL_AES256GCM)
- # define SSL_CAMELLIA (SSL_CAMELLIA128|SSL_CAMELLIA256)
-diff --git a/ssl/tls1.h b/ssl/tls1.h
-index 7e237d0..ff2e259 100644
---- a/ssl/tls1.h
-+++ b/ssl/tls1.h
-@@ -563,6 +563,19 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb)
- # define TLS1_CK_ECDH_RSA_WITH_AES_128_GCM_SHA256 0x0300C031
- # define TLS1_CK_ECDH_RSA_WITH_AES_256_GCM_SHA384 0x0300C032
-
-+/* ChaCha20-Poly1305 ciphersuites draft-agl-tls-chacha20poly1305-01 */
-+# define TLS1_CK_ECDHE_RSA_WITH_CHACHA20_POLY1305_D 0x0300CC13
-+# define TLS1_CK_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_D 0x0300CC14
-+# define TLS1_CK_DHE_RSA_WITH_CHACHA20_POLY1305_D 0x0300CC15
-+/* ChaCha20-Poly1305 ciphersuites from RFC */
-+# define TLS1_CK_ECDHE_RSA_WITH_CHACHA20_POLY1305 0x0300CCA8
-+# define TLS1_CK_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 0x0300CCA9
-+# define TLS1_CK_DHE_RSA_WITH_CHACHA20_POLY1305 0x0300CCAA
-+# define TLS1_CK_PSK_WITH_CHACHA20_POLY1305 0x0300CCAB
-+# define TLS1_CK_ECDHE_PSK_WITH_CHACHA20_POLY1305 0x0300CCAC
-+# define TLS1_CK_DHE_PSK_WITH_CHACHA20_POLY1305 0x0300CCAD
-+# define TLS1_CK_RSA_PSK_WITH_CHACHA20_POLY1305 0x0300CCAE
-+
- /*
- * XXX * Backward compatibility alert: + * Older versions of OpenSSL gave
- * some DHE ciphers names with "EDH" + * instead of "DHE". Going forward, we
-@@ -713,6 +726,19 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb)
- # define TLS1_TXT_ECDH_RSA_WITH_AES_128_GCM_SHA256 "ECDH-RSA-AES128-GCM-SHA256"
- # define TLS1_TXT_ECDH_RSA_WITH_AES_256_GCM_SHA384 "ECDH-RSA-AES256-GCM-SHA384"
-
-+/* ChaCha20-Poly1305 ciphersuites draft-agl-tls-chacha20poly1305-01 */
-+# define TLS1_TXT_ECDHE_RSA_WITH_CHACHA20_POLY1305_D "ECDHE-RSA-CHACHA20-POLY1305-D"
-+# define TLS1_TXT_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_D "ECDHE-ECDSA-CHACHA20-POLY1305-D"
-+# define TLS1_TXT_DHE_RSA_WITH_CHACHA20_POLY1305_D "DHE-RSA-CHACHA20-POLY1305-D"
-+/* Chacha20-Poly1305 ciphersuites from RFC */
-+# define TLS1_TXT_ECDHE_RSA_WITH_CHACHA20_POLY1305 "ECDHE-RSA-CHACHA20-POLY1305"
-+# define TLS1_TXT_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 "ECDHE-ECDSA-CHACHA20-POLY1305"
-+# define TLS1_TXT_DHE_RSA_WITH_CHACHA20_POLY1305 "DHE-RSA-CHACHA20-POLY1305"
-+# define TLS1_TXT_PSK_WITH_CHACHA20_POLY1305 "PSK-CHACHA20-POLY1305"
-+# define TLS1_TXT_ECDHE_PSK_WITH_CHACHA20_POLY1305 "ECDHE-PSK-CHACHA20-POLY1305"
-+# define TLS1_TXT_DHE_PSK_WITH_CHACHA20_POLY1305 "DHE-PSK-CHACHA20-POLY1305"
-+# define TLS1_TXT_RSA_PSK_WITH_CHACHA20_POLY1305 "RSA-PSK-CHACHA20-POLY1305"
-+
- # define TLS_CT_RSA_SIGN 1
- # define TLS_CT_DSS_SIGN 2
- # define TLS_CT_RSA_FIXED_DH 3
---
-2.10.1
-
diff --git a/ssl3-test-failure.patch b/ssl3-test-failure.patch
deleted file mode 100644
index d161c3d4a593..000000000000
--- a/ssl3-test-failure.patch
+++ /dev/null
@@ -1,26 +0,0 @@
-From: Kurt Roeckx <kurt@roeckx.be>
-Date: Sun, 6 Sep 2015 16:04:11 +0200
-Subject: Disable SSLv3 test in test suite
-
-When testing SSLv3 the test program returns 0 for skip. The test for weak DH
-expects a failure, but gets success.
-
-It should probably be changed to return something other than 0 for a skipped
-test.
----
- test/testssl | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/test/testssl b/test/testssl
-index 747e4ba..1e4370b 100644
---- a/test/testssl
-+++ b/test/testssl
-@@ -160,7 +160,7 @@ test_cipher() {
- }
-
- echo "Testing ciphersuites"
--for protocol in TLSv1.2 SSLv3; do
-+for protocol in TLSv1.2; do
- echo "Testing ciphersuites for $protocol"
- for cipher in `../util/shlib_wrap.sh ../apps/openssl ciphers "RSA+$protocol" | tr ':' ' '`; do
- test_cipher $cipher $protocol