diff options
author | Yishen Miao | 2016-10-27 21:22:06 +0000 |
---|---|---|
committer | Yishen Miao | 2016-10-27 21:22:06 +0000 |
commit | dbe7d2e073b7739e386082bed79bdf24eb793544 (patch) | |
tree | 900895c3b1d8eee1a440232b0e6ef45c693f04a2 | |
parent | 17901dca257db3ee62335df84cb2da4d5082e098 (diff) | |
download | aur-dbe7d2e073b7739e386082bed79bdf24eb793544.tar.gz |
Update to 1.0.2j
Update to 1.0.2j.
modified: .SRCINFO
modified: PKGBUILD
deleted: openssl__chacha20_poly1305_draft_and_rfc_ossl102g.patch
new file: openssl__chacha20_poly1305_draft_and_rfc_ossl102j.patch
-rw-r--r-- | .SRCINFO | 14 | ||||
-rw-r--r-- | PKGBUILD | 12 | ||||
-rw-r--r-- | openssl__chacha20_poly1305_draft_and_rfc_ossl102g.patch | 5227 | ||||
-rw-r--r-- | openssl__chacha20_poly1305_draft_and_rfc_ossl102j.patch | 4718 |
4 files changed, 4731 insertions, 5240 deletions
@@ -1,6 +1,6 @@ pkgbase = openssl-chacha20 pkgdesc = The Open Source toolkit for Secure Sockets Layer and Transport Layer Security with Chacha20 cipher - pkgver = 1.0.2.h + pkgver = 1.0.2.j pkgrel = 1 url = https://www.openssl.org arch = i686 @@ -8,23 +8,23 @@ pkgbase = openssl-chacha20 license = custom:BSD depends = perl optdepends = ca-certificates - provides = openssl=1.0.2.h + provides = openssl=1.0.2.j conflicts = openssl options = !makeflags backup = etc/ssl/openssl.cnf - source = https://www.openssl.org/source/openssl-1.0.2h.tar.gz - source = https://www.openssl.org/source/openssl-1.0.2h.tar.gz.asc + source = https://www.openssl.org/source/openssl-1.0.2j.tar.gz + source = https://www.openssl.org/source/openssl-1.0.2j.tar.gz.asc source = no-rpath.patch source = ssl3-test-failure.patch source = ca-dir.patch - source = openssl__chacha20_poly1305_draft_and_rfc_ossl102g.patch + source = openssl__chacha20_poly1305_draft_and_rfc_ossl102j.patch validpgpkeys = 8657ABB260F056B1E5190839D9C4D26D0E604491 - sha256sums = 1d4007e53aad94a5b2002fe045ee7bb0b3d98f1a47f8b2bc851dcd1c74332919 + sha256sums = e7aff292be21c259c6af26469c7a9b3ba26e9abaaffd325e3dccc9785256c431 sha256sums = SKIP sha256sums = 754d6107a306311e15a1db6a1cc031b81691c8b9865e8809ac60ca6f184c957c sha256sums = c54ae87c602eaa1530a336ab7c6e22e12898e1941012349c153e52553df64a13 sha256sums = 9e8126f3a748f4c1d6fe34d4436de72b16a40e97a6d18234d2e88caa179d50c4 - sha256sums = 09a2e88f95d8cd12bd9c23cd87554ab700fb1625a848c0502951849fb1d564fc + sha256sums = d6f9427d5cb63c7299563c201cd8708c7166e0f8c98b57a1fee69767362bf0f7 pkgname = openssl-chacha20 @@ -3,7 +3,7 @@ _pkgname=openssl pkgname=${_pkgname}-chacha20 -_ver=1.0.2h +_ver=1.0.2j # use a pacman compatible version scheme pkgver=${_ver/[a-z]/.${_ver//[0-9.]/}} #pkgver=$_ver @@ -23,13 +23,13 @@ source=("https://www.openssl.org/source/${_pkgname}-${_ver}.tar.gz" 'no-rpath.patch' 'ssl3-test-failure.patch' 'ca-dir.patch' - 'openssl__chacha20_poly1305_draft_and_rfc_ossl102g.patch') -sha256sums=('1d4007e53aad94a5b2002fe045ee7bb0b3d98f1a47f8b2bc851dcd1c74332919' + 'openssl__chacha20_poly1305_draft_and_rfc_ossl102j.patch') +sha256sums=('e7aff292be21c259c6af26469c7a9b3ba26e9abaaffd325e3dccc9785256c431' 'SKIP' '754d6107a306311e15a1db6a1cc031b81691c8b9865e8809ac60ca6f184c957c' 'c54ae87c602eaa1530a336ab7c6e22e12898e1941012349c153e52553df64a13' '9e8126f3a748f4c1d6fe34d4436de72b16a40e97a6d18234d2e88caa179d50c4' - '09a2e88f95d8cd12bd9c23cd87554ab700fb1625a848c0502951849fb1d564fc') + 'd6f9427d5cb63c7299563c201cd8708c7166e0f8c98b57a1fee69767362bf0f7') validpgpkeys=('8657ABB260F056B1E5190839D9C4D26D0E604491') prepare() { @@ -45,8 +45,8 @@ prepare() { patch -p0 -i $srcdir/ca-dir.patch # Cloudflare patch - # https://github.com/cloudflare/sslconfig/blob/master/patches/openssl__chacha20_poly1305_draft_and_rfc_ossl102g.patch - patch -p1 -i $srcdir/openssl__chacha20_poly1305_draft_and_rfc_ossl102g.patch + # https://github.com/cloudflare/sslconfig/blob/master/patches/openssl__chacha20_poly1305_draft_and_rfc_ossl102j.patch + patch -p1 -i $srcdir/openssl__chacha20_poly1305_draft_and_rfc_ossl102j.patch } build() { diff --git a/openssl__chacha20_poly1305_draft_and_rfc_ossl102g.patch b/openssl__chacha20_poly1305_draft_and_rfc_ossl102g.patch deleted file mode 100644 index f67652325c1a..000000000000 --- a/openssl__chacha20_poly1305_draft_and_rfc_ossl102g.patch +++ /dev/null @@ -1,5227 +0,0 @@ -From d21c75c622eb13a80080ed15fa30474f806a0a0f Mon Sep 17 00:00:00 2001 -From: Vlad Krasnov <vlad@cloudflare.com> -Date: Fri, 12 Feb 2016 18:25:11 -0800 -Subject: [PATCH] Implementation of draft and RFC versions of CHACHA20-POLY1305 - ciphers - ---- - Configure | 48 +- - Makefile.org | 4 +- - apps/speed.c | 30 +- - crypto/chacha20poly1305/Makefile | 97 +++ - crypto/chacha20poly1305/asm/chacha20_avx.pl | 408 +++++++++++ - crypto/chacha20poly1305/asm/chacha20_avx2.pl | 443 ++++++++++++ - crypto/chacha20poly1305/asm/poly1305_avx.pl | 732 ++++++++++++++++++++ - crypto/chacha20poly1305/asm/poly1305_avx2.pl | 984 +++++++++++++++++++++++++++ - crypto/chacha20poly1305/asm/poly1305_x64.pl | 281 ++++++++ - crypto/chacha20poly1305/chacha20.c | 162 +++++ - crypto/chacha20poly1305/chacha20poly1305.h | 79 +++ - crypto/chacha20poly1305/chapolytest.c | 470 +++++++++++++ - crypto/chacha20poly1305/poly1305.c | 287 ++++++++ - crypto/cryptlib.c | 10 - - crypto/evp/Makefile | 7 +- - crypto/evp/e_chacha20poly1305.c | 435 ++++++++++++ - crypto/evp/evp.h | 4 + - ssl/s3_lib.c | 119 ++++ - ssl/ssl.h | 2 + - ssl/ssl_ciph.c | 60 +- - ssl/ssl_locl.h | 2 + - ssl/tls1.h | 28 + - test/Makefile | 17 +- - 23 files changed, 4655 insertions(+), 54 deletions(-) - create mode 100644 crypto/chacha20poly1305/Makefile - create mode 100644 crypto/chacha20poly1305/asm/chacha20_avx.pl - create mode 100644 crypto/chacha20poly1305/asm/chacha20_avx2.pl - create mode 100644 crypto/chacha20poly1305/asm/poly1305_avx.pl - create mode 100644 crypto/chacha20poly1305/asm/poly1305_avx2.pl - create mode 100644 crypto/chacha20poly1305/asm/poly1305_x64.pl - create mode 100644 crypto/chacha20poly1305/chacha20.c - create mode 100644 crypto/chacha20poly1305/chacha20poly1305.h - create mode 100644 crypto/chacha20poly1305/chapolytest.c - create mode 100644 crypto/chacha20poly1305/poly1305.c - create mode 100644 crypto/evp/e_chacha20poly1305.c - -diff --git a/Configure b/Configure -index 4a715dc..f3ab6cd 100755 ---- a/Configure -+++ b/Configure -@@ -146,25 +146,25 @@ my $tlib="-lnsl -lsocket"; - my $bits1="THIRTY_TWO_BIT "; - my $bits2="SIXTY_FOUR_BIT "; - --my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o x86-gf2m.o::des-586.o crypt586.o:aes-586.o vpaes-x86.o aesni-x86.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:cmll-x86.o:ghash-x86.o:"; -+my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o x86-gf2m.o::des-586.o crypt586.o:aes-586.o vpaes-x86.o aesni-x86.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:cmll-x86.o:ghash-x86.o::"; - - my $x86_elf_asm="$x86_asm:elf"; - --my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o:ecp_nistz256.o ecp_nistz256-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o aesni-gcm-x86_64.o:"; --my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o:::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void"; --my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o::des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o::::::camellia.o cmll_misc.o cmll_cbc.o cmllt4-sparcv9.o:ghash-sparcv9.o::void"; --my $sparcv8_asm=":sparcv8.o::des_enc-sparc.o fcrypt_b.o:::::::::::::void"; --my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o::::::sha1-alpha.o:::::::ghash-alpha.o::void"; --my $mips64_asm=":bn-mips.o mips-mont.o:::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::"; -+my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o:ecp_nistz256.o ecp_nistz256-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o aesni-gcm-x86_64.o::chacha20_avx.o chacha20_avx2.o poly1305_x64.o poly1305_avx2.o"; -+my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o:::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o:::void"; -+my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o::des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o::::::camellia.o cmll_misc.o cmll_cbc.o cmllt4-sparcv9.o:ghash-sparcv9.o:::void"; -+my $sparcv8_asm=":sparcv8.o::des_enc-sparc.o fcrypt_b.o::::::::::::::void"; -+my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o::::::sha1-alpha.o:::::::ghash-alpha.o:::void"; -+my $mips64_asm=":bn-mips.o mips-mont.o:::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o:::::::::"; - my $mips32_asm=$mips64_asm; $mips32_asm =~ s/\s*sha512\-mips\.o//; --my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o:::aes-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:"; --my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o:::aes_cbc.o aes-armv4.o bsaes-armv7.o aesv8-armx.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o ghashv8-armx.o::void"; --my $aarch64_asm="armcap.o arm64cpuid.o mem_clr.o::::aes_core.o aes_cbc.o aesv8-armx.o:::sha1-armv8.o sha256-armv8.o sha512-armv8.o:::::::ghashv8-armx.o:"; --my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o:::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::32"; --my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o:::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::64"; --my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o:::aes_core.o aes_cbc.o aes-ppc.o vpaes-ppc.o aesp8-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o sha256p8-ppc.o sha512p8-ppc.o:::::::ghashp8-ppc.o:"; -+my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o:::aes-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o::"; -+my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o:::aes_cbc.o aes-armv4.o bsaes-armv7.o aesv8-armx.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o ghashv8-armx.o:::void"; -+my $aarch64_asm="armcap.o arm64cpuid.o mem_clr.o::::aes_core.o aes_cbc.o aesv8-armx.o:::sha1-armv8.o sha256-armv8.o sha512-armv8.o:::::::ghashv8-armx.o::"; -+my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o:::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:::32"; -+my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o:::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:::64"; -+my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o:::aes_core.o aes_cbc.o aes-ppc.o vpaes-ppc.o aesp8-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o sha256p8-ppc.o sha512p8-ppc.o:::::::ghashp8-ppc.o::"; - my $ppc32_asm=$ppc64_asm; --my $no_asm="::::::::::::::::void"; -+my $no_asm=":::::::::::::::::void"; - - # As for $BSDthreads. Idea is to maintain "collective" set of flags, - # which would cover all BSD flavors. -pthread applies to them all, -@@ -710,6 +710,7 @@ my $idx_wp_obj = $idx++; - my $idx_cmll_obj = $idx++; - my $idx_modes_obj = $idx++; - my $idx_engines_obj = $idx++; -+my $idx_chapoly_obj = $idx++; - my $idx_perlasm_scheme = $idx++; - my $idx_dso_scheme = $idx++; - my $idx_shared_target = $idx++; -@@ -752,6 +753,7 @@ my $bf ="crypto/bf/bf_locl.h"; - my $bn_asm ="bn_asm.o"; - my $des_enc="des_enc.o fcrypt_b.o"; - my $aes_enc="aes_core.o aes_cbc.o"; -+my $chapoly_enc=""; - my $bf_enc ="bf_enc.o"; - my $cast_enc="c_enc.o"; - my $rc4_enc="rc4_enc.o rc4_skey.o"; -@@ -1210,7 +1212,7 @@ $openssldir=$prefix . "/" . $openssldir if $openssldir !~ /(^\/|^[a-zA-Z]:[\\\/] - - print "IsMK1MF=$IsMK1MF\n"; - --my @fields = split(/\s*:\s*/,$table{$target} . ":" x 30 , -1); -+my @fields = split(/\s*:\s*/,$table{$target} . ":" x 31 , -1); - my $cc = $fields[$idx_cc]; - # Allow environment CC to override compiler... - if($ENV{CC}) { -@@ -1239,6 +1241,7 @@ my $wp_obj = $fields[$idx_wp_obj]; - my $cmll_obj = $fields[$idx_cmll_obj]; - my $modes_obj = $fields[$idx_modes_obj]; - my $engines_obj = $fields[$idx_engines_obj]; -+my $chapoly_obj = $fields[$idx_chapoly_obj]; - my $perlasm_scheme = $fields[$idx_perlasm_scheme]; - my $dso_scheme = $fields[$idx_dso_scheme]; - my $shared_target = $fields[$idx_shared_target]; -@@ -1405,7 +1408,7 @@ if ($no_asm) - { - $cpuid_obj=$bn_obj=$ec_obj= - $des_obj=$aes_obj=$bf_obj=$cast_obj=$rc4_obj=$rc5_obj=$cmll_obj= -- $modes_obj=$sha1_obj=$md5_obj=$rmd160_obj=$wp_obj=$engines_obj=""; -+ $modes_obj=$sha1_obj=$md5_obj=$rmd160_obj=$wp_obj=$engines_obj=$chapoly_obj=""; - } - - if (!$no_shared) -@@ -1558,6 +1561,14 @@ $bf_obj=$bf_enc unless ($bf_obj =~ /\.o$/); - $cast_obj=$cast_enc unless ($cast_obj =~ /\.o$/); - $rc4_obj=$rc4_enc unless ($rc4_obj =~ /\.o$/); - $rc5_obj=$rc5_enc unless ($rc5_obj =~ /\.o$/); -+if ($chapoly_obj =~ /\.o$/) -+ { -+ $cflags.=" -DCHAPOLY_x86_64_ASM"; -+ } -+else -+ { -+ $chapoly_obj=$chapoly_enc; -+ } - if ($sha1_obj =~ /\.o$/) - { - # $sha1_obj=$sha1_enc; -@@ -1740,6 +1751,7 @@ while (<IN>) - s/^WP_ASM_OBJ=.*$/WP_ASM_OBJ= $wp_obj/; - s/^CMLL_ENC=.*$/CMLL_ENC= $cmll_obj/; - s/^MODES_ASM_OBJ.=*$/MODES_ASM_OBJ= $modes_obj/; -+ s/^CHAPOLY_ENC=.*$/CHAPOLY_ENC= $chapoly_obj/; - s/^ENGINES_ASM_OBJ.=*$/ENGINES_ASM_OBJ= $engines_obj/; - s/^PERLASM_SCHEME=.*$/PERLASM_SCHEME= $perlasm_scheme/; - s/^PROCESSOR=.*/PROCESSOR= $processor/; -@@ -1802,6 +1814,7 @@ print "RMD160_OBJ_ASM=$rmd160_obj\n"; - print "CMLL_ENC =$cmll_obj\n"; - print "MODES_OBJ =$modes_obj\n"; - print "ENGINES_OBJ =$engines_obj\n"; -+print "CHAPOLY_ENC =$chapoly_obj\n"; - print "PROCESSOR =$processor\n"; - print "RANLIB =$ranlib\n"; - print "ARFLAGS =$arflags\n"; -@@ -2200,7 +2213,7 @@ sub print_table_entry - my ($cc, $cflags, $unistd, $thread_cflag, $sys_id, $lflags, - $bn_ops, $cpuid_obj, $bn_obj, $ec_obj, $des_obj, $aes_obj, $bf_obj, - $md5_obj, $sha1_obj, $cast_obj, $rc4_obj, $rmd160_obj, -- $rc5_obj, $wp_obj, $cmll_obj, $modes_obj, $engines_obj, -+ $rc5_obj, $wp_obj, $cmll_obj, $modes_obj, $engines_obj, $chapoly_obj, - $perlasm_scheme, $dso_scheme, $shared_target, $shared_cflag, - $shared_ldflag, $shared_extension, $ranlib, $arflags, $multilib)= - split(/\s*:\s*/,$table{$target} . ":" x 30 , -1); -@@ -2231,6 +2244,7 @@ sub print_table_entry - \$cmll_obj = $cmll_obj - \$modes_obj = $modes_obj - \$engines_obj = $engines_obj -+\$chapoly_obj = $chapoly_obj - \$perlasm_scheme = $perlasm_scheme - \$dso_scheme = $dso_scheme - \$shared_target= $shared_target -diff --git a/Makefile.org b/Makefile.org -index 76fdbdf..6556ef6 100644 ---- a/Makefile.org -+++ b/Makefile.org -@@ -91,6 +91,7 @@ BN_ASM= bn_asm.o - EC_ASM= - DES_ENC= des_enc.o fcrypt_b.o - AES_ENC= aes_core.o aes_cbc.o -+CHAPOLY_ENC= - BF_ENC= bf_enc.o - CAST_ENC= c_enc.o - RC4_ENC= rc4_enc.o -@@ -148,7 +149,7 @@ SDIRS= \ - bn ec rsa dsa ecdsa dh ecdh dso engine \ - buffer bio stack lhash rand err \ - evp asn1 pem x509 x509v3 conf txt_db pkcs7 pkcs12 comp ocsp ui krb5 \ -- cms pqueue ts jpake srp store cmac -+ cms pqueue ts jpake srp store cmac chacha20poly1305 - # keep in mind that the above list is adjusted by ./Configure - # according to no-xxx arguments... - -@@ -234,6 +235,7 @@ BUILDENV= LC_ALL=C PLATFORM='$(PLATFORM)' PROCESSOR='$(PROCESSOR)'\ - WP_ASM_OBJ='$(WP_ASM_OBJ)' \ - MODES_ASM_OBJ='$(MODES_ASM_OBJ)' \ - ENGINES_ASM_OBJ='$(ENGINES_ASM_OBJ)' \ -+ CHAPOLY_ENC='$(CHAPOLY_ENC)' \ - PERLASM_SCHEME='$(PERLASM_SCHEME)' \ - FIPSLIBDIR='${FIPSLIBDIR}' \ - FIPSDIR='${FIPSDIR}' \ -diff --git a/apps/speed.c b/apps/speed.c -index 95adcc1..94b80a1 100644 ---- a/apps/speed.c -+++ b/apps/speed.c -@@ -1,4 +1,4 @@ --/* apps/speed.c */ -+/* apps/speed.c -*- mode:C; c-file-style: "eay" -*- */ - /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * -@@ -226,7 +226,7 @@ - # endif - - # undef BUFSIZE --# define BUFSIZE ((long)1024*8+1) -+# define BUFSIZE ((long)1024*8+16) - static volatile int run = 0; - - static int mr = 0; -@@ -241,7 +241,7 @@ static void print_result(int alg, int run_no, int count, double time_used); - static int do_multi(int multi); - # endif - --# define ALGOR_NUM 30 -+# define ALGOR_NUM 31 - # define SIZE_NUM 5 - # define RSA_NUM 4 - # define DSA_NUM 3 -@@ -256,7 +256,7 @@ static const char *names[ALGOR_NUM] = { - "aes-128 cbc", "aes-192 cbc", "aes-256 cbc", - "camellia-128 cbc", "camellia-192 cbc", "camellia-256 cbc", - "evp", "sha256", "sha512", "whirlpool", -- "aes-128 ige", "aes-192 ige", "aes-256 ige", "ghash" -+ "aes-128 ige", "aes-192 ige", "aes-256 ige", "ghash", "chacha20-poly1305" - }; - - static double results[ALGOR_NUM][SIZE_NUM]; -@@ -516,6 +516,7 @@ int MAIN(int argc, char **argv) - # define D_IGE_192_AES 27 - # define D_IGE_256_AES 28 - # define D_GHASH 29 -+# define D_CHAPOLY 30 - double d = 0.0; - long c[ALGOR_NUM][SIZE_NUM]; - # define R_DSA_512 0 -@@ -972,6 +973,9 @@ int MAIN(int argc, char **argv) - doit[D_CBC_256_CML] = 1; - } else - # endif -+ if (strcmp(*argv, "chacha20-poly1305") == 0) { -+ doit[D_CHAPOLY] = 1; -+ } else - # ifndef OPENSSL_NO_RSA - if (strcmp(*argv, "rsa") == 0) { - rsa_doit[R_RSA_512] = 1; -@@ -1139,6 +1143,7 @@ int MAIN(int argc, char **argv) - BIO_printf(bio_err, "rc4"); - # endif - BIO_printf(bio_err, "\n"); -+ BIO_printf(bio_err, "chacha20-poly1305\n"); - - # ifndef OPENSSL_NO_RSA - BIO_printf(bio_err, "rsa512 rsa1024 rsa2048 rsa4096\n"); -@@ -1370,6 +1375,7 @@ int MAIN(int argc, char **argv) - c[D_IGE_192_AES][0] = count; - c[D_IGE_256_AES][0] = count; - c[D_GHASH][0] = count; -+ c[D_CHAPOLY][0] = count; - - for (i = 1; i < SIZE_NUM; i++) { - c[D_MD2][i] = c[D_MD2][0] * 4 * lengths[0] / lengths[i]; -@@ -1821,6 +1827,22 @@ int MAIN(int argc, char **argv) - CRYPTO_gcm128_release(ctx); - } - # endif -+ if (doit[D_CHAPOLY]) { -+ EVP_CIPHER_CTX ctx; -+ EVP_CIPHER_CTX_init(&ctx); -+ EVP_CipherInit_ex(&ctx, EVP_chacha20_poly1305(), NULL, key32, iv, 1); -+ -+ for (j = 0; j < SIZE_NUM; j++) { -+ print_message(names[D_CHAPOLY], c[D_CHAPOLY][j], lengths[j]); -+ Time_F(START); -+ for (count = 0, run = 1; COND(c[D_CHAPOLY][j]); count++) { -+ EVP_CIPHER_CTX_ctrl(&ctx, EVP_CTRL_AEAD_TLS1_AAD, 13, buf); -+ EVP_Cipher(&ctx, buf, buf, (unsigned long)lengths[j] + 16); -+ } -+ d = Time_F(STOP); -+ print_result(D_CHAPOLY, j, count, d); -+ } -+ } - # ifndef OPENSSL_NO_CAMELLIA - if (doit[D_CBC_128_CML]) { - for (j = 0; j < SIZE_NUM; j++) { -diff --git a/crypto/chacha20poly1305/Makefile b/crypto/chacha20poly1305/Makefile -new file mode 100644 -index 0000000..446eb27 ---- /dev/null -+++ b/crypto/chacha20poly1305/Makefile -@@ -0,0 +1,97 @@ -+# -+# crypto/chacha20poly1305/Makefile -+# -+ -+DIR= chacha20poly1305 -+TOP= ../.. -+CC= cc -+CPP= $(CC) -E -+INCLUDES= -+CFLAG=-g -+MAKEFILE= Makefile -+AR= ar r -+ -+CHAPOLY_ENC= -+ -+CFLAGS= $(INCLUDES) $(CFLAG) -+ASFLAGS= $(INCLUDES) $(ASFLAG) -+AFLAGS= $(ASFLAGS) -+ -+GENERAL=Makefile -+TEST=chapolytest.c -+APPS= -+ -+LIB=$(TOP)/libcrypto.a -+LIBSRC=chacha20.c poly1305.c -+LIBOBJ=chacha20.o poly1305.o $(CHAPOLY_ENC) -+ -+SRC= $(LIBSRC) -+ -+EXHEADER=chacha20poly1305.h -+HEADER= $(EXHEADER) -+ -+ALL= $(GENERAL) $(SRC) $(HEADER) -+ -+top: -+ (cd ../..; $(MAKE) DIRS=crypto SDIRS=$(DIR) sub_all) -+ -+all: lib -+ -+lib: $(LIBOBJ) -+ $(AR) $(LIB) $(LIBOBJ) -+ $(RANLIB) $(LIB) || echo Never mind. -+ @touch lib -+ -+poly1305_x64.s:asm/poly1305_x64.pl -+ $(PERL) asm/poly1305_x64.pl $(PERLASM_SCHEME) > $@ -+chacha20_avx.s:asm/chacha20_avx.pl -+ $(PERL) asm/chacha20_avx.pl $(PERLASM_SCHEME) > $@ -+poly1305_avx.s:asm/poly1305_avx.pl -+ $(PERL) asm/poly1305_avx.pl $(PERLASM_SCHEME) > $@ -+chacha20_avx2.s:asm/chacha20_avx2.pl -+ $(PERL) asm/chacha20_avx2.pl $(PERLASM_SCHEME) > $@ -+poly1305_avx2.s:asm/poly1305_avx2.pl -+ $(PERL) asm/poly1305_avx2.pl $(PERLASM_SCHEME) > $@ -+ -+files: -+ $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO -+ -+links: -+ @$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER) -+ @$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST) -+ @$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS) -+ -+install: -+ @[ -n "$(INSTALLTOP)" ] # should be set by top Makefile... -+ @headerlist="$(EXHEADER)"; for i in $$headerlist ; \ -+ do \ -+ (cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \ -+ chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ); \ -+ done; -+ -+tags: -+ ctags $(SRC) -+ -+tests: -+ -+chapolytest: top chapolytest.c $(LIB) -+ $(CC) $(CFLAGS) -Wall -Werror -g -o chapolytest cahpolytest.c $(LIB) -+ -+lint: -+ lint -DLINT $(INCLUDES) $(SRC)>fluff -+ -+depend: -+ @[ -n "$(MAKEDEPEND)" ] # should be set by upper Makefile... -+ $(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC) -+ -+dclean: -+ $(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new -+ mv -f Makefile.new $(MAKEFILE) -+ -+clean: -+ rm -f *.s *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff -+ -+# DO NOT DELETE THIS LINE -- make depend depends on it. -+ -+chacha20.o: ../../include/openssl/chacha20poly1305.h chacha20.c -+poly1305.o: ../../include/openssl/chacha20poly1305.h poly1305.c -diff --git a/crypto/chacha20poly1305/asm/chacha20_avx.pl b/crypto/chacha20poly1305/asm/chacha20_avx.pl -new file mode 100644 -index 0000000..bf3e3f0 ---- /dev/null -+++ b/crypto/chacha20poly1305/asm/chacha20_avx.pl -@@ -0,0 +1,408 @@ -+#!/usr/bin/env perl -+ -+############################################################################## -+# # -+# Copyright 2014 Intel Corporation # -+# # -+# Licensed under the Apache License, Version 2.0 (the "License"); # -+# you may not use this file except in compliance with the License. # -+# You may obtain a copy of the License at # -+# # -+# http://www.apache.org/licenses/LICENSE-2.0 # -+# # -+# Unless required by applicable law or agreed to in writing, software # -+# distributed under the License is distributed on an "AS IS" BASIS, # -+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # -+# See the License for the specific language governing permissions and # -+# limitations under the License. # -+# # -+############################################################################## -+# # -+# Developers and authors: # -+# Shay Gueron (1, 2), and Vlad Krasnov (1) # -+# (1) Intel Corporation, Israel Development Center # -+# (2) University of Haifa # -+# # -+# Related work: # -+# M. Goll, S. Gueron, "Vectorization on ChaCha Stream Cipher", IEEE # -+# Proceedings of 11th International Conference on Information # -+# Technology: New Generations (ITNG 2014), 612-615 (2014). # -+# M. Goll, S. Gueron, "Vectorization on Poly1305 Message Authentication Code"# -+# to be published. # -+# A. Langley, chacha20poly1305 for the AEAD head # -+# https://git.openssl.org/gitweb/?p=openssl.git;a=commit;h=9a8646510b3d0a48e950748f7a2aaa12ed40d5e0 # -+############################################################################## -+ -+ -+$flavour = shift; -+$output = shift; -+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } -+ -+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); -+ -+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -+die "can't locate x86_64-xlate.pl"; -+ -+open OUT,"| \"$^X\" $xlate $flavour $output"; -+*STDOUT=*OUT; -+ -+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` -+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) { -+ $avx = ($1>=2.19) + ($1>=2.22); -+} -+ -+if ($win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && -+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { -+ $avx = ($1>=2.09) + ($1>=2.10); -+} -+ -+if ($win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && -+ `ml64 2>&1` =~ /Version ([0-9]+)\./) { -+ $avx = ($1>=10) + ($1>=11); -+} -+ -+if (`$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) { -+ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 -+ $avx = ($ver>=3.0) + ($ver>=3.01); -+} -+ -+if ($avx>=1) {{ -+ -+my ($rol8, $rol16, $state_cdef, $tmp, -+ $v0, $v1, $v2, $v3, $v4, $v5, $v6, $v7, -+ $v8, $v9, $v10, $v11)=map("%xmm$_",(0..15)); -+ -+sub chacha_qr { -+ -+my ($a,$b,$c,$d)=@_; -+$code.=<<___; -+ -+ vpaddd $b, $a, $a # a += b -+ vpxor $a, $d, $d # d ^= a -+ vpshufb $rol16, $d, $d # d <<<= 16 -+ -+ vpaddd $d, $c, $c # c += d -+ vpxor $c, $b, $b # b ^= c -+ vpslld \$12, $b, $tmp -+ vpsrld \$20, $b, $b -+ vpxor $tmp, $b, $b # b <<<= 12 -+ -+ vpaddd $b, $a, $a # a += b -+ vpxor $a, $d, $d # d ^= a -+ vpshufb $rol8, $d, $d # d <<<= 8 -+ -+ vpaddd $d, $c, $c # c += d -+ vpxor $c, $b, $b # b ^= c -+ -+ vpslld \$7, $b, $tmp -+ vpsrld \$25, $b, $b -+ vpxor $tmp, $b, $b # b <<<= 7 -+___ -+ -+} -+ -+$code.=<<___; -+ -+.text -+.align 16 -+chacha20_consts: -+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' -+.rol8: -+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 -+.rol16: -+.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 -+.avxInc: -+.quad 1,0 -+ -+___ -+ -+{ -+ -+ -+my ($out, $in, $in_len, $key_ptr, $nr) -+ =("%rdi", "%rsi", "%rdx", "%rcx", "%r8"); -+ -+$code.=<<___; -+.globl chacha_20_core_avx -+.type chacha_20_core_avx ,\@function,2 -+.align 64 -+chacha_20_core_avx: -+ vzeroupper -+ -+ # Init state -+ vmovdqa .rol8(%rip), $rol8 -+ vmovdqa .rol16(%rip), $rol16 -+ vmovdqu 16*2($key_ptr), $state_cdef -+ -+2: -+ cmp \$64*3, $in_len -+ jb 2f -+ -+ vmovdqa chacha20_consts(%rip), $v0 -+ vmovdqu 16*0($key_ptr), $v1 -+ vmovdqu 16*1($key_ptr), $v2 -+ vmovdqa $state_cdef, $v3 -+ -+ vmovdqa $v0, $v4 -+ vmovdqa $v0, $v8 -+ -+ vmovdqa $v1, $v5 -+ vmovdqa $v1, $v9 -+ -+ vmovdqa $v2, $v6 -+ vmovdqa $v2, $v10 -+ -+ vpaddq .avxInc(%rip), $v3, $v7 -+ vpaddq .avxInc(%rip), $v7, $v11 -+ -+ mov \$10, $nr -+ -+ 1: -+___ -+ -+ &chacha_qr( $v0, $v1, $v2, $v3); -+ &chacha_qr( $v4, $v5, $v6, $v7); -+ &chacha_qr( $v8, $v9,$v10,$v11); -+ -+$code.=<<___; -+ vpalignr \$4, $v1, $v1, $v1 -+ vpalignr \$8, $v2, $v2, $v2 -+ vpalignr \$12, $v3, $v3, $v3 -+ vpalignr \$4, $v5, $v5, $v5 -+ vpalignr \$8, $v6, $v6, $v6 -+ vpalignr \$12, $v7, $v7, $v7 -+ vpalignr \$4, $v9, $v9, $v9 -+ vpalignr \$8, $v10, $v10, $v10 -+ vpalignr \$12, $v11, $v11, $v11 -+___ -+ -+ &chacha_qr( $v0, $v1, $v2, $v3); -+ &chacha_qr( $v4, $v5, $v6, $v7); -+ &chacha_qr( $v8, $v9,$v10,$v11); -+ -+$code.=<<___; -+ vpalignr \$12, $v1, $v1, $v1 -+ vpalignr \$8, $v2, $v2, $v2 -+ vpalignr \$4, $v3, $v3, $v3 -+ vpalignr \$12, $v5, $v5, $v5 -+ vpalignr \$8, $v6, $v6, $v6 -+ vpalignr \$4, $v7, $v7, $v7 -+ vpalignr \$12, $v9, $v9, $v9 -+ vpalignr \$8, $v10, $v10, $v10 -+ vpalignr \$4, $v11, $v11, $v11 -+ -+ dec $nr -+ -+ jnz 1b -+ -+ vpaddd chacha20_consts(%rip), $v0, $v0 -+ vpaddd chacha20_consts(%rip), $v4, $v4 -+ vpaddd chacha20_consts(%rip), $v8, $v8 -+ -+ vpaddd 16*0($key_ptr), $v1, $v1 -+ vpaddd 16*0($key_ptr), $v5, $v5 -+ vpaddd 16*0($key_ptr), $v9, $v9 -+ -+ vpaddd 16*1($key_ptr), $v2, $v2 -+ vpaddd 16*1($key_ptr), $v6, $v6 -+ vpaddd 16*1($key_ptr), $v10, $v10 -+ -+ vpaddd $state_cdef, $v3, $v3 -+ vpaddq .avxInc(%rip), $state_cdef, $state_cdef -+ vpaddd $state_cdef, $v7, $v7 -+ vpaddq .avxInc(%rip), $state_cdef, $state_cdef -+ vpaddd $state_cdef, $v11, $v11 -+ vpaddq .avxInc(%rip), $state_cdef, $state_cdef -+ -+ vpxor 16*0($in), $v0, $v0 -+ vpxor 16*1($in), $v1, $v1 -+ vpxor 16*2($in), $v2, $v2 -+ vpxor 16*3($in), $v3, $v3 -+ -+ vmovdqu $v0, 16*0($out) -+ vmovdqu $v1, 16*1($out) -+ vmovdqu $v2, 16*2($out) -+ vmovdqu $v3, 16*3($out) -+ -+ vpxor 16*4($in), $v4, $v4 -+ vpxor 16*5($in), $v5, $v5 -+ vpxor 16*6($in), $v6, $v6 -+ vpxor 16*7($in), $v7, $v7 -+ -+ vmovdqu $v4, 16*4($out) -+ vmovdqu $v5, 16*5($out) -+ vmovdqu $v6, 16*6($out) -+ vmovdqu $v7, 16*7($out) -+ -+ vpxor 16*8($in), $v8, $v8 -+ vpxor 16*9($in), $v9, $v9 -+ vpxor 16*10($in), $v10, $v10 -+ vpxor 16*11($in), $v11, $v11 -+ -+ vmovdqu $v8, 16*8($out) -+ vmovdqu $v9, 16*9($out) -+ vmovdqu $v10, 16*10($out) -+ vmovdqu $v11, 16*11($out) -+ -+ lea 16*12($in), $in -+ lea 16*12($out), $out -+ sub \$16*12, $in_len -+ -+ jmp 2b -+ -+2: -+ cmp \$64*2, $in_len -+ jb 2f -+ -+ vmovdqa chacha20_consts(%rip), $v0 -+ vmovdqa chacha20_consts(%rip), $v4 -+ vmovdqu 16*0($key_ptr), $v1 -+ vmovdqu 16*0($key_ptr), $v5 -+ vmovdqu 16*1($key_ptr), $v2 -+ vmovdqu 16*1($key_ptr), $v6 -+ vmovdqu 16*1($key_ptr), $v10 -+ vmovdqa $state_cdef, $v3 -+ vpaddq .avxInc(%rip), $v3, $v7 -+ -+ mov \$10, $nr -+ -+ 1: -+___ -+ -+ &chacha_qr($v0,$v1,$v2,$v3); -+ &chacha_qr($v4,$v5,$v6,$v7); -+ -+$code.=<<___; -+ vpalignr \$4, $v1, $v1, $v1 -+ vpalignr \$8, $v2, $v2, $v2 -+ vpalignr \$12, $v3, $v3, $v3 -+ vpalignr \$4, $v5, $v5, $v5 -+ vpalignr \$8, $v6, $v6, $v6 -+ vpalignr \$12, $v7, $v7, $v7 -+___ -+ -+ &chacha_qr($v0,$v1,$v2,$v3); -+ &chacha_qr($v4,$v5,$v6,$v7); -+ -+$code.=<<___; -+ vpalignr \$12, $v1, $v1, $v1 -+ vpalignr \$8, $v2, $v2, $v2 -+ vpalignr \$4, $v3, $v3, $v3 -+ vpalignr \$12, $v5, $v5, $v5 -+ vpalignr \$8, $v6, $v6, $v6 -+ vpalignr \$4, $v7, $v7, $v7 -+ -+ dec $nr -+ -+ jnz 1b -+ -+ vpaddd chacha20_consts(%rip), $v0, $v0 -+ vpaddd chacha20_consts(%rip), $v4, $v4 -+ -+ vpaddd 16*0($key_ptr), $v1, $v1 -+ vpaddd 16*0($key_ptr), $v5, $v5 -+ -+ vpaddd 16*1($key_ptr), $v2, $v2 -+ vpaddd 16*1($key_ptr), $v6, $v6 -+ -+ vpaddd $state_cdef, $v3, $v3 -+ vpaddq .avxInc(%rip), $state_cdef, $state_cdef -+ vpaddd $state_cdef, $v7, $v7 -+ vpaddq .avxInc(%rip), $state_cdef, $state_cdef -+ -+ vpxor 16*0($in), $v0, $v0 -+ vpxor 16*1($in), $v1, $v1 -+ vpxor 16*2($in), $v2, $v2 -+ vpxor 16*3($in), $v3, $v3 -+ -+ vmovdqu $v0, 16*0($out) -+ vmovdqu $v1, 16*1($out) -+ vmovdqu $v2, 16*2($out) -+ vmovdqu $v3, 16*3($out) -+ -+ vpxor 16*4($in), $v4, $v4 -+ vpxor 16*5($in), $v5, $v5 -+ vpxor 16*6($in), $v6, $v6 -+ vpxor 16*7($in), $v7, $v7 -+ -+ vmovdqu $v4, 16*4($out) -+ vmovdqu $v5, 16*5($out) -+ vmovdqu $v6, 16*6($out) -+ vmovdqu $v7, 16*7($out) -+ -+ lea 16*8($in), $in -+ lea 16*8($out), $out -+ sub \$16*8, $in_len -+ -+ jmp 2b -+2: -+ cmp \$64, $in_len -+ jb 2f -+ -+ vmovdqa chacha20_consts(%rip), $v0 -+ vmovdqu 16*0($key_ptr), $v1 -+ vmovdqu 16*1($key_ptr), $v2 -+ vmovdqa $state_cdef, $v3 -+ -+ mov \$10, $nr -+ -+ 1: -+___ -+ -+ &chacha_qr($v0,$v1,$v2,$v3); -+ -+$code.=<<___; -+ vpalignr \$4, $v1, $v1, $v1 -+ vpalignr \$8, $v2, $v2, $v2 -+ vpalignr \$12, $v3, $v3, $v3 -+___ -+ -+ &chacha_qr($v0,$v1,$v2,$v3); -+$code.=<<___; -+ vpalignr \$12, $v1, $v1, $v1 -+ vpalignr \$8, $v2, $v2, $v2 -+ vpalignr \$4, $v3, $v3, $v3 -+ -+ dec $nr -+ jnz 1b -+ -+ vpaddd chacha20_consts(%rip), $v0, $v0 -+ vpaddd 16*0($key_ptr), $v1, $v1 -+ vpaddd 16*1($key_ptr), $v2, $v2 -+ vpaddd $state_cdef, $v3, $v3 -+ vpaddq .avxInc(%rip), $state_cdef, $state_cdef -+ -+ vpxor 16*0($in), $v0, $v0 -+ vpxor 16*1($in), $v1, $v1 -+ vpxor 16*2($in), $v2, $v2 -+ vpxor 16*3($in), $v3, $v3 -+ -+ vmovdqu $v0, 16*0($out) -+ vmovdqu $v1, 16*1($out) -+ vmovdqu $v2, 16*2($out) -+ vmovdqu $v3, 16*3($out) -+ -+ lea 16*4($in), $in -+ lea 16*4($out), $out -+ sub \$16*4, $in_len -+ jmp 2b -+ -+2: -+ vmovdqu $state_cdef, 16*2($key_ptr) -+ -+ vzeroupper -+ ret -+.size chacha_20_core_avx,.-chacha_20_core_avx -+___ -+} -+}} -+ -+ -+$code =~ s/\`([^\`]*)\`/eval($1)/gem; -+ -+print $code; -+ -+close STDOUT; -+ -diff --git a/crypto/chacha20poly1305/asm/chacha20_avx2.pl b/crypto/chacha20poly1305/asm/chacha20_avx2.pl -new file mode 100644 -index 0000000..9f31f86 ---- /dev/null -+++ b/crypto/chacha20poly1305/asm/chacha20_avx2.pl -@@ -0,0 +1,443 @@ -+#!/usr/bin/env perl -+ -+############################################################################## -+# # -+# Copyright 2014 Intel Corporation # -+# # -+# Licensed under the Apache License, Version 2.0 (the "License"); # -+# you may not use this file except in compliance with the License. # -+# You may obtain a copy of the License at # -+# # -+# http://www.apache.org/licenses/LICENSE-2.0 # -+# # -+# Unless required by applicable law or agreed to in writing, software # -+# distributed under the License is distributed on an "AS IS" BASIS, # -+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # -+# See the License for the specific language governing permissions and # -+# limitations under the License. # -+# # -+############################################################################## -+# # -+# Developers and authors: # -+# Shay Gueron (1, 2), and Vlad Krasnov (1) # -+# (1) Intel Corporation, Israel Development Center # -+# (2) University of Haifa # -+# # -+# Related work: # -+# M. Goll, S. Gueron, "Vectorization on ChaCha Stream Cipher", IEEE # -+# Proceedings of 11th International Conference on Information # -+# Technology: New Generations (ITNG 2014), 612-615 (2014). # -+# M. Goll, S. Gueron, "Vectorization on Poly1305 Message Authentication Code"# -+# to be published. # -+# A. Langley, chacha20poly1305 for the AEAD head # -+# https://git.openssl.org/gitweb/?p=openssl.git;a=commit;h=9a8646510b3d0a48e950748f7a2aaa12ed40d5e0 # -+############################################################################## -+ -+$flavour = shift; -+$output = shift; -+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } -+ -+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); -+ -+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -+die "can't locate x86_64-xlate.pl"; -+ -+open OUT,"| \"$^X\" $xlate $flavour $output"; -+*STDOUT=*OUT; -+ -+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` -+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) { -+ $avx = ($1>=2.19) + ($1>=2.22); -+} -+ -+if ($win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && -+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { -+ $avx = ($1>=2.09) + ($1>=2.10); -+} -+ -+if ($win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && -+ `ml64 2>&1` =~ /Version ([0-9]+)\./) { -+ $avx = ($1>=10) + ($1>=11); -+} -+ -+if (`$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) { -+ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 -+ $avx = ($ver>=3.0) + ($ver>=3.01); -+} -+ -+if ($avx>=2) {{ -+ -+my ($state_4567, $state_89ab, $state_cdef, $tmp, -+ $v0, $v1, $v2, $v3, $v4, $v5, $v6, $v7, -+ $v8, $v9, $v10, $v11)=map("%ymm$_",(0..15)); -+ -+sub chacha_qr { -+ -+my ($a,$b,$c,$d)=@_; -+ -+$code.=<<___; -+ -+ vpaddd $b, $a, $a # a += b -+ vpxor $a, $d, $d # d ^= a -+ vpshufb .rol16(%rip), $d, $d # d <<<= 16 -+ -+ vpaddd $d, $c, $c # c += d -+ vpxor $c, $b, $b # b ^= c -+ vpslld \$12, $b, $tmp -+ vpsrld \$20, $b, $b -+ vpxor $tmp, $b, $b # b <<<= 12 -+ -+ vpaddd $b, $a, $a # a += b -+ vpxor $a, $d, $d # d ^= a -+ vpshufb .rol8(%rip), $d, $d # d <<<= 8 -+ -+ vpaddd $d, $c, $c # c += d -+ vpxor $c, $b, $b # b ^= c -+ -+ vpslld \$7, $b, $tmp -+ vpsrld \$25, $b, $b -+ vpxor $tmp, $b, $b # b <<<= 7 -+___ -+} -+ -+ -+$code.=<<___; -+.text -+.align 32 -+chacha20_consts: -+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' -+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' -+.rol8: -+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 -+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 -+.rol16: -+.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 -+.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 -+.avx2Init: -+.quad 0,0,1,0 -+.avx2Inc: -+.quad 2,0,2,0 -+___ -+ -+{ -+ -+my $state_cdef_xmm=$state_cdef; -+ -+substr($state_cdef_xmm, 1, 1, "x"); -+ -+my ($out, $in, $in_len, $key_ptr, $nr) -+ =("%rdi", "%rsi", "%rdx", "%rcx", "%r8"); -+ -+$code.=<<___; -+.globl chacha_20_core_avx2 -+.type chacha_20_core_avx2 ,\@function,2 -+.align 64 -+chacha_20_core_avx2: -+ -+ vzeroupper -+ -+ # Init state -+ vbroadcasti128 16*0($key_ptr), $state_4567 -+ vbroadcasti128 16*1($key_ptr), $state_89ab -+ vbroadcasti128 16*2($key_ptr), $state_cdef -+ vpaddq .avx2Init(%rip), $state_cdef, $state_cdef -+ -+2: -+ cmp \$6*64, $in_len -+ jb 2f -+ -+ vmovdqa chacha20_consts(%rip), $v0 -+ vmovdqa chacha20_consts(%rip), $v4 -+ vmovdqa chacha20_consts(%rip), $v8 -+ -+ vmovdqa $state_4567, $v1 -+ vmovdqa $state_4567, $v5 -+ vmovdqa $state_4567, $v9 -+ -+ vmovdqa $state_89ab, $v2 -+ vmovdqa $state_89ab, $v6 -+ vmovdqa $state_89ab, $v10 -+ -+ vmovdqa $state_cdef, $v3 -+ vpaddq .avx2Inc(%rip), $v3, $v7 -+ vpaddq .avx2Inc(%rip), $v7, $v11 -+ -+ mov \$10, $nr -+ -+ 1: -+___ -+ -+ &chacha_qr( $v0, $v1, $v2, $v3); -+ &chacha_qr( $v4, $v5, $v6, $v7); -+ &chacha_qr( $v8, $v9,$v10,$v11); -+ -+$code.=<<___; -+ vpalignr \$4, $v1, $v1, $v1 -+ vpalignr \$8, $v2, $v2, $v2 -+ vpalignr \$12, $v3, $v3, $v3 -+ vpalignr \$4, $v5, $v5, $v5 -+ vpalignr \$8, $v6, $v6, $v6 -+ vpalignr \$12, $v7, $v7, $v7 -+ vpalignr \$4, $v9, $v9, $v9 -+ vpalignr \$8, $v10, $v10, $v10 -+ vpalignr \$12, $v11, $v11, $v11 -+___ -+ -+ &chacha_qr( $v0, $v1, $v2, $v3); -+ &chacha_qr( $v4, $v5, $v6, $v7); -+ &chacha_qr( $v8, $v9,$v10,$v11); -+ -+$code.=<<___; -+ vpalignr \$12, $v1, $v1, $v1 -+ vpalignr \$8, $v2, $v2, $v2 -+ vpalignr \$4, $v3, $v3, $v3 -+ vpalignr \$12, $v5, $v5, $v5 -+ vpalignr \$8, $v6, $v6, $v6 -+ vpalignr \$4, $v7, $v7, $v7 -+ vpalignr \$12, $v9, $v9, $v9 -+ vpalignr \$8, $v10, $v10, $v10 -+ vpalignr \$4, $v11, $v11, $v11 -+ -+ dec $nr -+ -+ jnz 1b -+ -+ vpaddd chacha20_consts(%rip), $v0, $v0 -+ vpaddd chacha20_consts(%rip), $v4, $v4 -+ vpaddd chacha20_consts(%rip), $v8, $v8 -+ -+ vpaddd $state_4567, $v1, $v1 -+ vpaddd $state_4567, $v5, $v5 -+ vpaddd $state_4567, $v9, $v9 -+ -+ vpaddd $state_89ab, $v2, $v2 -+ vpaddd $state_89ab, $v6, $v6 -+ vpaddd $state_89ab, $v10, $v10 -+ -+ vpaddd $state_cdef, $v3, $v3 -+ vpaddq .avx2Inc(%rip), $state_cdef, $state_cdef -+ vpaddd $state_cdef, $v7, $v7 -+ vpaddq .avx2Inc(%rip), $state_cdef, $state_cdef -+ vpaddd $state_cdef, $v11, $v11 -+ vpaddq .avx2Inc(%rip), $state_cdef, $state_cdef -+ -+ vperm2i128 \$0x02, $v0, $v1, $tmp -+ vpxor 32*0($in), $tmp, $tmp -+ vmovdqu $tmp, 32*0($out) -+ vperm2i128 \$0x02, $v2, $v3, $tmp -+ vpxor 32*1($in), $tmp, $tmp -+ vmovdqu $tmp, 32*1($out) -+ vperm2i128 \$0x13, $v0, $v1, $tmp -+ vpxor 32*2($in), $tmp, $tmp -+ vmovdqu $tmp, 32*2($out) -+ vperm2i128 \$0x13, $v2, $v3, $tmp -+ vpxor 32*3($in), $tmp, $tmp -+ vmovdqu $tmp, 32*3($out) -+ -+ vperm2i128 \$0x02, $v4, $v5, $v0 -+ vperm2i128 \$0x02, $v6, $v7, $v1 -+ vperm2i128 \$0x13, $v4, $v5, $v2 -+ vperm2i128 \$0x13, $v6, $v7, $v3 -+ -+ vpxor 32*4($in), $v0, $v0 -+ vpxor 32*5($in), $v1, $v1 -+ vpxor 32*6($in), $v2, $v2 -+ vpxor 32*7($in), $v3, $v3 -+ -+ vmovdqu $v0, 32*4($out) -+ vmovdqu $v1, 32*5($out) -+ vmovdqu $v2, 32*6($out) -+ vmovdqu $v3, 32*7($out) -+ -+ vperm2i128 \$0x02, $v8, $v9, $v0 -+ vperm2i128 \$0x02, $v10, $v11, $v1 -+ vperm2i128 \$0x13, $v8, $v9, $v2 -+ vperm2i128 \$0x13, $v10, $v11, $v3 -+ -+ vpxor 32*8($in), $v0, $v0 -+ vpxor 32*9($in), $v1, $v1 -+ vpxor 32*10($in), $v2, $v2 -+ vpxor 32*11($in), $v3, $v3 -+ -+ vmovdqu $v0, 32*8($out) -+ vmovdqu $v1, 32*9($out) -+ vmovdqu $v2, 32*10($out) -+ vmovdqu $v3, 32*11($out) -+ -+ lea 64*6($in), $in -+ lea 64*6($out), $out -+ sub \$64*6, $in_len -+ -+ jmp 2b -+ -+2: -+ cmp \$4*64, $in_len -+ jb 2f -+ -+ vmovdqa chacha20_consts(%rip), $v0 -+ vmovdqa chacha20_consts(%rip), $v4 -+ vmovdqa $state_4567, $v1 -+ vmovdqa $state_4567, $v5 -+ vmovdqa $state_89ab, $v2 -+ vmovdqa $state_89ab, $v6 -+ vmovdqa $state_89ab, $v10 -+ vmovdqa $state_cdef, $v3 -+ vpaddq .avx2Inc(%rip), $v3, $v7 -+ -+ mov \$10, $nr -+ -+ 1: -+___ -+ -+ &chacha_qr($v0,$v1,$v2,$v3); -+ &chacha_qr($v4,$v5,$v6,$v7); -+ -+$code.=<<___; -+ vpalignr \$4, $v1, $v1, $v1 -+ vpalignr \$8, $v2, $v2, $v2 -+ vpalignr \$12, $v3, $v3, $v3 -+ vpalignr \$4, $v5, $v5, $v5 -+ vpalignr \$8, $v6, $v6, $v6 -+ vpalignr \$12, $v7, $v7, $v7 -+___ -+ -+ &chacha_qr($v0,$v1,$v2,$v3); -+ &chacha_qr($v4,$v5,$v6,$v7); -+ -+$code.=<<___; -+ vpalignr \$12, $v1, $v1, $v1 -+ vpalignr \$8, $v2, $v2, $v2 -+ vpalignr \$4, $v3, $v3, $v3 -+ vpalignr \$12, $v5, $v5, $v5 -+ vpalignr \$8, $v6, $v6, $v6 -+ vpalignr \$4, $v7, $v7, $v7 -+ -+ dec $nr -+ -+ jnz 1b -+ -+ vpaddd chacha20_consts(%rip), $v0, $v0 -+ vpaddd chacha20_consts(%rip), $v4, $v4 -+ -+ vpaddd $state_4567, $v1, $v1 -+ vpaddd $state_4567, $v5, $v5 -+ -+ vpaddd $state_89ab, $v2, $v2 -+ vpaddd $state_89ab, $v6, $v6 -+ -+ vpaddd $state_cdef, $v3, $v3 -+ vpaddq .avx2Inc(%rip), $state_cdef, $state_cdef -+ vpaddd $state_cdef, $v7, $v7 -+ vpaddq .avx2Inc(%rip), $state_cdef, $state_cdef -+ -+ vperm2i128 \$0x02, $v0, $v1, $v8 -+ vperm2i128 \$0x02, $v2, $v3, $v9 -+ vperm2i128 \$0x13, $v0, $v1, $v10 -+ vperm2i128 \$0x13, $v2, $v3, $v11 -+ -+ vpxor 32*0($in), $v8, $v8 -+ vpxor 32*1($in), $v9, $v9 -+ vpxor 32*2($in), $v10, $v10 -+ vpxor 32*3($in), $v11, $v11 -+ -+ vmovdqu $v8, 32*0($out) -+ vmovdqu $v9, 32*1($out) -+ vmovdqu $v10, 32*2($out) -+ vmovdqu $v11, 32*3($out) -+ -+ vperm2i128 \$0x02, $v4, $v5, $v0 -+ vperm2i128 \$0x02, $v6, $v7, $v1 -+ vperm2i128 \$0x13, $v4, $v5, $v2 -+ vperm2i128 \$0x13, $v6, $v7, $v3 -+ -+ vpxor 32*4($in), $v0, $v0 -+ vpxor 32*5($in), $v1, $v1 -+ vpxor 32*6($in), $v2, $v2 -+ vpxor 32*7($in), $v3, $v3 -+ -+ vmovdqu $v0, 32*4($out) -+ vmovdqu $v1, 32*5($out) -+ vmovdqu $v2, 32*6($out) -+ vmovdqu $v3, 32*7($out) -+ -+ lea 64*4($in), $in -+ lea 64*4($out), $out -+ sub \$64*4, $in_len -+ -+ jmp 2b -+2: -+ cmp \$128, $in_len -+ jb 2f -+ -+ vmovdqa chacha20_consts(%rip), $v0 -+ vmovdqa $state_4567, $v1 -+ vmovdqa $state_89ab, $v2 -+ vmovdqa $state_cdef, $v3 -+ -+ mov \$10, $nr -+ -+ 1: -+___ -+ -+ &chacha_qr($v0,$v1,$v2,$v3); -+ -+$code.=<<___; -+ vpalignr \$4, $v1, $v1, $v1 -+ vpalignr \$8, $v2, $v2, $v2 -+ vpalignr \$12, $v3, $v3, $v3 -+___ -+ -+ &chacha_qr($v0,$v1,$v2,$v3); -+$code.=<<___; -+ vpalignr \$12, $v1, $v1, $v1 -+ vpalignr \$8, $v2, $v2, $v2 -+ vpalignr \$4, $v3, $v3, $v3 -+ -+ dec $nr -+ jnz 1b -+ -+ vpaddd chacha20_consts(%rip), $v0, $v0 -+ vpaddd $state_4567, $v1, $v1 -+ vpaddd $state_89ab, $v2, $v2 -+ vpaddd $state_cdef, $v3, $v3 -+ vpaddq .avx2Inc(%rip), $state_cdef, $state_cdef -+ -+ vperm2i128 \$0x02, $v0, $v1, $v8 -+ vperm2i128 \$0x02, $v2, $v3, $v9 -+ vperm2i128 \$0x13, $v0, $v1, $v10 -+ vperm2i128 \$0x13, $v2, $v3, $v11 -+ -+ vpxor 32*0($in), $v8, $v8 -+ vpxor 32*1($in), $v9, $v9 -+ vpxor 32*2($in), $v10, $v10 -+ vpxor 32*3($in), $v11, $v11 -+ -+ vmovdqu $v8, 32*0($out) -+ vmovdqu $v9, 32*1($out) -+ vmovdqu $v10, 32*2($out) -+ vmovdqu $v11, 32*3($out) -+ -+ lea 64*2($in), $in -+ lea 64*2($out), $out -+ sub \$64*2, $in_len -+ jmp 2b -+ -+2: -+ vmovdqu $state_cdef_xmm, 16*2($key_ptr) -+ -+ vzeroupper -+ ret -+.size chacha_20_core_avx2,.-chacha_20_core_avx2 -+___ -+} -+}} -+ -+ -+$code =~ s/\`([^\`]*)\`/eval($1)/gem; -+ -+print $code; -+ -+close STDOUT; -+ -diff --git a/crypto/chacha20poly1305/asm/poly1305_avx.pl b/crypto/chacha20poly1305/asm/poly1305_avx.pl -new file mode 100644 -index 0000000..cedeca1 ---- /dev/null -+++ b/crypto/chacha20poly1305/asm/poly1305_avx.pl -@@ -0,0 +1,732 @@ -+############################################################################## -+# # -+# Copyright 2014 Intel Corporation # -+# # -+# Licensed under the Apache License, Version 2.0 (the "License"); # -+# you may not use this file except in compliance with the License. # -+# You may obtain a copy of the License at # -+# # -+# http://www.apache.org/licenses/LICENSE-2.0 # -+# # -+# Unless required by applicable law or agreed to in writing, software # -+# distributed under the License is distributed on an "AS IS" BASIS, # -+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # -+# See the License for the specific language governing permissions and # -+# limitations under the License. # -+# # -+############################################################################## -+# # -+# Developers and authors: # -+# Shay Gueron (1, 2), and Vlad Krasnov (1) # -+# (1) Intel Corporation, Israel Development Center # -+# (2) University of Haifa # -+# # -+############################################################################## -+# state: -+# 0: r[0] || r^2[0] -+# 16: r[1] || r^2[1] -+# 32: r[2] || r^2[2] -+# 48: r[3] || r^2[3] -+# 64: r[4] || r^2[4] -+# 80: r[1]*5 || r^2[1]*5 -+# 96: r[2]*5 || r^2[2]*5 -+#112: r[3]*5 || r^2[3]*5 -+#128: r[4]*5 || r^2[4]*5 -+#144: k -+#160: A0 -+#164: A1 -+#168: A2 -+#172: A3 -+#176: A4 -+#180: END -+ -+$flavour = shift; -+$output = shift; -+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } -+ -+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); -+ -+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -+die "can't locate x86_64-xlate.pl"; -+ -+open OUT,"| \"$^X\" $xlate $flavour $output"; -+*STDOUT=*OUT; -+ -+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` -+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) { -+ $avx = ($1>=2.19) + ($1>=2.22); -+} -+ -+if ($win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && -+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { -+ $avx = ($1>=2.09) + ($1>=2.10); -+} -+ -+if ($win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && -+ `ml64 2>&1` =~ /Version ([0-9]+)\./) { -+ $avx = ($1>=10) + ($1>=11); -+} -+ -+if (`$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) { -+ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 -+ $avx = ($ver>=3.0) + ($ver>=3.01); -+} -+ -+if ($avx>=1) {{ -+ -+my ($_r0_, $_r1_, $_r2_, $_r3_, $_r4_, -+ $_r1_x5, $_r2_x5, $_r3_x5, $_r4_x5, -+ $_k_, -+ $_A0_, $_A1_, $_A2_, $_A3_, $_A4_) -+ = (0, 16, 32, 48, 64, -+ 80, 96, 112, 128, -+ 144, -+ 160, 164, 168, 172, 176); -+ -+$code.=<<___; -+.text -+.align 32 -+.LandMask: -+.quad 0x3FFFFFF, 0x3FFFFFF -+.LsetBit: -+.quad 0x1000000, 0x1000000 -+.LrSet: -+.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFF -+.quad 0x0FFFFFFC0FFFFFFC, 0x0FFFFFFC0FFFFFFC -+.Lone: -+.quad 1,0 -+.Lpoly: -+.quad 0xfffffffffffffffb, 0xffffffffffffffff -+___ -+ -+ -+{ -+my ($A0, $A1, $A2, $A3, $A4, -+ $r0, $r1, $r2, $r3, $r4, -+ $T0, $T1, $A5, $A6, $A7, $A8)=map("%xmm$_",(0..15)); -+my ($state, $key) -+ =("%rdi", "%rsi"); -+ -+$code.=<<___; -+################################################################################ -+# void poly1305_init_avx(void *state, uint8_t key[32]) -+ -+.globl poly1305_init_avx -+.type poly1305_init_avx, \@function, 2 -+.align 64 -+poly1305_init_avx: -+ vzeroupper -+ # load and convert r -+ vmovq 8*0($key), $r0 -+ vmovq 8*1($key), $T0 -+ vpand .LrSet(%rip), $r0, $r0 -+ vpand .LrSet+16(%rip), $T0, $T0 -+ -+ vpsrlq \$26, $r0, $r1 -+ vpand .LandMask(%rip), $r0, $r0 -+ vpsrlq \$26, $r1, $r2 -+ vpand .LandMask(%rip), $r1, $r1 -+ vpsllq \$12, $T0, $T1 -+ vpxor $T1, $r2, $r2 -+ vpsrlq \$26, $r2, $r3 -+ vpsrlq \$40, $T0, $r4 -+ vpand .LandMask(%rip), $r2, $r2 -+ vpand .LandMask(%rip), $r3, $r3 -+ -+ # SQR R -+ vpmuludq $r0, $r0, $A0 -+ vpmuludq $r1, $r0, $A1 -+ vpmuludq $r2, $r0, $A2 -+ vpmuludq $r3, $r0, $A3 -+ vpmuludq $r4, $r0, $A4 -+ -+ vpsllq \$1, $A1, $A1 -+ vpsllq \$1, $A2, $A2 -+ vpmuludq $r1, $r1, $T0 -+ vpaddq $T0, $A2, $A2 -+ vpmuludq $r2, $r1, $T0 -+ vpaddq $T0, $A3, $A3 -+ vpmuludq $r3, $r1, $T0 -+ vpaddq $T0, $A4, $A4 -+ vpmuludq $r4, $r1, $A5 -+ -+ vpsllq \$1, $A3, $A3 -+ vpsllq \$1, $A4, $A4 -+ vpmuludq $r2, $r2, $T0 -+ vpaddq $T0, $A4, $A4 -+ vpmuludq $r3, $r2, $T0 -+ vpaddq $T0, $A5, $A5 -+ vpmuludq $r4, $r2, $A6 -+ -+ vpsllq \$1, $A5, $A5 -+ vpsllq \$1, $A6, $A6 -+ vpmuludq $r3, $r3, $T0 -+ vpaddq $T0, $A6, $A6 -+ vpmuludq $r4, $r3, $A7 -+ -+ vpsllq \$1, $A7, $A7 -+ vpmuludq $r4, $r4, $A8 -+ -+ # Reduce -+ vpsrlq \$26, $A4, $T0 -+ vpand .LandMask(%rip), $A4, $A4 -+ vpaddq $T0, $A5, $A5 -+ -+ vpsllq \$2, $A5, $T0 -+ vpaddq $T0, $A5, $A5 -+ vpsllq \$2, $A6, $T0 -+ vpaddq $T0, $A6, $A6 -+ vpsllq \$2, $A7, $T0 -+ vpaddq $T0, $A7, $A7 -+ vpsllq \$2, $A8, $T0 -+ vpaddq $T0, $A8, $A8 -+ -+ vpaddq $A5, $A0, $A0 -+ vpaddq $A6, $A1, $A1 -+ vpaddq $A7, $A2, $A2 -+ vpaddq $A8, $A3, $A3 -+ -+ vpsrlq \$26, $A0, $T0 -+ vpand .LandMask(%rip), $A0, $A0 -+ vpaddq $T0, $A1, $A1 -+ vpsrlq \$26, $A1, $T0 -+ vpand .LandMask(%rip), $A1, $A1 -+ vpaddq $T0, $A2, $A2 -+ vpsrlq \$26, $A2, $T0 -+ vpand .LandMask(%rip), $A2, $A2 -+ vpaddq $T0, $A3, $A3 -+ vpsrlq \$26, $A3, $T0 -+ vpand .LandMask(%rip), $A3, $A3 -+ vpaddq $T0, $A4, $A4 -+ -+ vpunpcklqdq $r0, $A0, $r0 -+ vpunpcklqdq $r1, $A1, $r1 -+ vpunpcklqdq $r2, $A2, $r2 -+ vpunpcklqdq $r3, $A3, $r3 -+ vpunpcklqdq $r4, $A4, $r4 -+ -+ vmovdqu $r0, $_r0_($state) -+ vmovdqu $r1, $_r1_($state) -+ vmovdqu $r2, $_r2_($state) -+ vmovdqu $r3, $_r3_($state) -+ vmovdqu $r4, $_r4_($state) -+ -+ vpsllq \$2, $r1, $A1 -+ vpsllq \$2, $r2, $A2 -+ vpsllq \$2, $r3, $A3 -+ vpsllq \$2, $r4, $A4 -+ -+ vpaddq $A1, $r1, $A1 -+ vpaddq $A2, $r2, $A2 -+ vpaddq $A3, $r3, $A3 -+ vpaddq $A4, $r4, $A4 -+ -+ vmovdqu $A1, $_r1_x5($state) -+ vmovdqu $A2, $_r2_x5($state) -+ vmovdqu $A3, $_r3_x5($state) -+ vmovdqu $A4, $_r4_x5($state) -+ # Store k -+ vmovdqu 16*1($key), $T0 -+ vmovdqu $T0, $_k_($state) -+ # Init the MAC value -+ vpxor $T0, $T0, $T0 -+ vmovdqu $T0, $_A0_($state) -+ vmovd $T0, $_A4_($state) -+ vzeroupper -+ ret -+.size poly1305_init_avx,.-poly1305_init_avx -+___ -+} -+ -+{ -+ -+my ($A0, $A1, $A2, $A3, $A4, -+ $T0, $T1, $R0, $R1, $R2, -+ $R3, $R4, $AND_MASK) -+ = map("%xmm$_",(0..12)); -+ -+my ($state, $in, $in_len) -+ =("%rdi", "%rsi", "%rdx"); -+ -+$code.=<<___; -+ -+############################################################################### -+# void* poly1305_update_avx(void* state, void* in, uint64_t in_len) -+.globl poly1305_update_avx -+.type poly1305_update_avx, \@function, 2 -+.align 64 -+poly1305_update_avx: -+ -+ vzeroupper -+ vmovd $_A0_($state), $A0 -+ vmovd $_A1_($state), $A1 -+ vmovd $_A2_($state), $A2 -+ vmovd $_A3_($state), $A3 -+ vmovd $_A4_($state), $A4 -+ vmovdqa .LandMask(%rip), $AND_MASK -+ # Skip to single block case -+ cmp \$32, $in_len -+ jb 3f -+1: -+ cmp \$16*4, $in_len -+ jb 1f -+ sub \$16*2, $in_len -+ # load the next two blocks -+ vmovdqu 16*0($in), $R2 -+ vmovdqu 16*1($in), $R3 -+ add \$16*2, $in -+ -+ vpunpcklqdq $R3, $R2, $R0 -+ vpunpckhqdq $R3, $R2, $R1 -+ -+ vpsrlq \$26, $R0, $R2 -+ vpand $AND_MASK, $R0, $R0 -+ vpaddq $R0, $A0, $A0 -+ -+ vpsrlq \$26, $R2, $R0 -+ vpand $AND_MASK, $R2, $R2 -+ vpaddq $R2, $A1, $A1 -+ -+ vpsllq \$12, $R1, $R2 -+ vpxor $R2, $R0, $R0 -+ vpand $AND_MASK, $R0, $R0 -+ vpaddq $R0, $A2, $A2 -+ -+ vpsrlq \$26, $R2, $R0 -+ vpsrlq \$40, $R1, $R2 -+ vpand $AND_MASK, $R0, $R0 -+ vpxor .LsetBit(%rip), $R2, $R2 -+ vpaddq $R0, $A3, $A3 -+ vpaddq $R2, $A4, $A4 -+ -+ # Multiply input by R[0] -+ vbroadcastss $_r0_($state), $T0 -+ vpmuludq $T0, $A0, $R0 -+ vpmuludq $T0, $A1, $R1 -+ vpmuludq $T0, $A2, $R2 -+ vpmuludq $T0, $A3, $R3 -+ vpmuludq $T0, $A4, $R4 -+ # Multiply input by R[1] (and R[1]*5) -+ vbroadcastss $_r1_x5($state), $T0 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $R0, $R0 -+ vbroadcastss $_r1_($state), $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $R1, $R1 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $R2, $R2 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $R3, $R3 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $R4, $R4 -+ # Etc -+ vbroadcastss $_r2_x5($state), $T0 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $R0, $R0 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $R1, $R1 -+ vbroadcastss $_r2_($state), $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $R2, $R2 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $R3, $R3 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $R4, $R4 -+ -+ vbroadcastss $_r3_x5($state), $T0 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $R0, $R0 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $R1, $R1 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $R2, $R2 -+ vbroadcastss $_r3_($state), $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $R3, $R3 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $R4, $R4 -+ -+ vbroadcastss $_r4_x5($state), $T0 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $R0, $R0 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $R1, $R1 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $R2, $R2 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $R3, $R3 -+ vbroadcastss $_r4_($state), $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $R4, $R4 -+ # Reduce -+ vpsrlq \$26, $R3, $T0 -+ vpaddq $T0, $R4, $R4 -+ vpand $AND_MASK, $R3, $R3 -+ -+ vpsrlq \$26, $R4, $T0 -+ vpsllq \$2, $T0, $T1 -+ vpaddq $T1, $T0, $T0 -+ vpaddq $T0, $R0, $R0 -+ vpand $AND_MASK, $R4, $R4 -+ -+ vpsrlq \$26, $R0, $T0 -+ vpand $AND_MASK, $R0, $A0 -+ vpaddq $T0, $R1, $R1 -+ vpsrlq \$26, $R1, $T0 -+ vpand $AND_MASK, $R1, $A1 -+ vpaddq $T0, $R2, $R2 -+ vpsrlq \$26, $R2, $T0 -+ vpand $AND_MASK, $R2, $A2 -+ vpaddq $T0, $R3, $R3 -+ vpsrlq \$26, $R3, $T0 -+ vpand $AND_MASK, $R3, $A3 -+ vpaddq $T0, $R4, $A4 -+ jmp 1b -+1: -+ cmp \$16*2, $in_len -+ jb 1f -+ sub \$16*2, $in_len -+ # load the next two blocks -+ vmovdqu 16*0($in), $R2 -+ vmovdqu 16*1($in), $R3 -+ add \$16*2, $in -+ -+ vpunpcklqdq $R3, $R2, $R0 -+ vpunpckhqdq $R3, $R2, $R1 -+ -+ vpsrlq \$26, $R0, $R2 -+ vpand $AND_MASK, $R0, $R0 -+ vpaddq $R0, $A0, $A0 -+ -+ vpsrlq \$26, $R2, $R0 -+ vpand $AND_MASK, $R2, $R2 -+ vpaddq $R2, $A1, $A1 -+ -+ vpsllq \$12, $R1, $R2 -+ vpxor $R2, $R0, $R0 -+ vpand $AND_MASK, $R0, $R0 -+ vpaddq $R0, $A2, $A2 -+ -+ vpsrlq \$26, $R2, $R0 -+ vpsrlq \$40, $R1, $R2 -+ vpand $AND_MASK, $R0, $R0 -+ vpxor .LsetBit(%rip), $R2, $R2 -+ vpaddq $R0, $A3, $A3 -+ vpaddq $R2, $A4, $A4 -+ -+ # Multiply input by R[0] -+ vmovdqu $_r0_($state), $T0 -+ vpmuludq $T0, $A0, $R0 -+ vpmuludq $T0, $A1, $R1 -+ vpmuludq $T0, $A2, $R2 -+ vpmuludq $T0, $A3, $R3 -+ vpmuludq $T0, $A4, $R4 -+ # Multiply input by R[1] (and R[1]*5) -+ vmovdqu $_r1_x5($state), $T0 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $R0, $R0 -+ vmovdqu $_r1_($state), $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $R1, $R1 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $R2, $R2 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $R3, $R3 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $R4, $R4 -+ # Etc -+ vmovdqu $_r2_x5($state), $T0 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $R0, $R0 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $R1, $R1 -+ vmovdqu $_r2_($state), $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $R2, $R2 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $R3, $R3 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $R4, $R4 -+ -+ vmovdqu $_r3_x5($state), $T0 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $R0, $R0 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $R1, $R1 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $R2, $R2 -+ vmovdqu $_r3_($state), $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $R3, $R3 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $R4, $R4 -+ -+ vmovdqu $_r4_x5($state), $T0 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $R0, $R0 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $R1, $R1 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $R2, $R2 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $R3, $R3 -+ vmovdqu $_r4_($state), $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $R4, $R4 -+1: -+ vpsrldq \$8, $R0, $A0 -+ vpsrldq \$8, $R1, $A1 -+ vpsrldq \$8, $R2, $A2 -+ vpsrldq \$8, $R3, $A3 -+ vpsrldq \$8, $R4, $A4 -+ -+ vpaddq $R0, $A0, $A0 -+ vpaddq $R1, $A1, $A1 -+ vpaddq $R2, $A2, $A2 -+ vpaddq $R3, $A3, $A3 -+ vpaddq $R4, $A4, $A4 -+ # Reduce -+ vpsrlq \$26, $A3, $T0 -+ vpaddq $T0, $A4, $A4 -+ vpand $AND_MASK, $A3, $A3 -+ vpsrlq \$26, $A4, $T0 -+ vpsllq \$2, $T0, $T1 -+ vpaddq $T1, $T0, $T0 -+ vpaddq $T0, $A0, $A0 -+ vpand $AND_MASK, $A4, $A4 -+ vpsrlq \$26, $A0, $T0 -+ vpand $AND_MASK, $A0, $A0 -+ vpaddq $T0, $A1, $A1 -+ vpsrlq \$26, $A1, $T0 -+ vpand $AND_MASK, $A1, $A1 -+ vpaddq $T0, $A2, $A2 -+ vpsrlq \$26, $A2, $T0 -+ vpand $AND_MASK, $A2, $A2 -+ vpaddq $T0, $A3, $A3 -+ vpsrlq \$26, $A3, $T0 -+ vpand $AND_MASK, $A3, $A3 -+ vpaddq $T0, $A4, $A4 -+3: -+ cmp \$16, $in_len -+ jb 1f -+ -+ # load the next block -+ vmovq 8*0($in), $R0 -+ vmovq 8*1($in), $R1 -+ add \$16, $in -+ sub \$16, $in_len -+ -+ vpsrlq \$26, $R0, $R2 -+ vpand $AND_MASK, $R0, $R0 -+ vpaddq $R0, $A0, $A0 -+ -+ vpsrlq \$26, $R2, $R0 -+ vpand $AND_MASK, $R2, $R2 -+ vpaddq $R2, $A1, $A1 -+ -+ vpsllq \$12, $R1, $R2 -+ vpxor $R2, $R0, $R0 -+ vpand $AND_MASK, $R0, $R0 -+ vpaddq $R0, $A2, $A2 -+ -+ vpsrlq \$26, $R2, $R0 -+ vpsrlq \$40, $R1, $R2 -+ vpand $AND_MASK, $R0, $R0 -+ vpxor .LsetBit(%rip), $R2, $R2 -+ vpaddq $R0, $A3, $A3 -+ vpaddq $R2, $A4, $A4 -+2: -+ # Multiply input by R[0] -+ vmovq $_r0_+8($state), $T0 -+ vpmuludq $T0, $A0, $R0 -+ vpmuludq $T0, $A1, $R1 -+ vpmuludq $T0, $A2, $R2 -+ vpmuludq $T0, $A3, $R3 -+ vpmuludq $T0, $A4, $R4 -+ # Multiply input by R[1] (and R[1]*5) -+ vmovq $_r1_x5+8($state), $T0 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $R0, $R0 -+ vmovq $_r1_+8($state), $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $R1, $R1 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $R2, $R2 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $R3, $R3 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $R4, $R4 -+ # Etc -+ vmovq $_r2_x5+8($state), $T0 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $R0, $R0 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $R1, $R1 -+ vmovq $_r2_+8($state), $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $R2, $R2 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $R3, $R3 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $R4, $R4 -+ -+ vmovq $_r3_x5+8($state), $T0 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $R0, $R0 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $R1, $R1 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $R2, $R2 -+ vmovq $_r3_+8($state), $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $R3, $R3 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $R4, $R4 -+ -+ vmovq $_r4_x5+8($state), $T0 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $R0, $R0 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $R1, $R1 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $R2, $R2 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $R3, $R3 -+ vmovq $_r4_+8($state), $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $R4, $R4 -+ # Reduce -+ vpsrlq \$26, $R3, $T0 -+ vpaddq $T0, $R4, $R4 -+ vpand $AND_MASK, $R3, $R3 -+ vpsrlq \$26, $R4, $T0 -+ vpsllq \$2, $T0, $T1 -+ vpaddq $T1, $T0, $T0 -+ vpaddq $T0, $R0, $R0 -+ vpand $AND_MASK, $R4, $R4 -+ vpsrlq \$26, $R0, $T0 -+ vpand $AND_MASK, $R0, $A0 -+ vpaddq $T0, $R1, $R1 -+ vpsrlq \$26, $R1, $T0 -+ vpand $AND_MASK, $R1, $A1 -+ vpaddq $T0, $R2, $R2 -+ vpsrlq \$26, $R2, $T0 -+ vpand $AND_MASK, $R2, $A2 -+ vpaddq $T0, $R3, $R3 -+ vpsrlq \$26, $R3, $T0 -+ vpand $AND_MASK, $R3, $A3 -+ vpaddq $T0, $R4, $A4 -+ -+1: -+ test $in_len, $in_len -+ jz 1f -+ -+ vmovdqa .Lone(%rip), $R0 -+3: -+ dec $in_len -+ vpslldq \$1, $R0, $R0 -+ vpinsrb \$0, ($in, $in_len), $R0, $R0 -+ test $in_len, $in_len -+ jnz 3b -+ -+ vpsrldq \$8, $R0, $R1 -+ vpsrlq \$26, $R0, $R2 -+ vpand $AND_MASK, $R0, $R0 -+ vpaddq $R0, $A0, $A0 -+ -+ vpsrlq \$26, $R2, $R0 -+ vpand $AND_MASK, $R2, $R2 -+ vpaddq $R2, $A1, $A1 -+ -+ vpsllq \$12, $R1, $R2 -+ vpxor $R2, $R0, $R0 -+ vpand $AND_MASK, $R0, $R0 -+ vpaddq $R0, $A2, $A2 -+ -+ vpsrlq \$26, $R2, $R0 -+ vpsrlq \$40, $R1, $R2 -+ vpand $AND_MASK, $R0, $R0 -+ vpaddq $R0, $A3, $A3 -+ vpaddq $R2, $A4, $A4 -+ xor $in_len, $in_len -+ jmp 2b -+1: -+ -+ vmovd $A0, $_A0_($state) -+ vmovd $A1, $_A1_($state) -+ vmovd $A2, $_A2_($state) -+ vmovd $A3, $_A3_($state) -+ vmovd $A4, $_A4_($state) -+ -+ mov $in, %rax -+ vzeroupper -+ ret -+.size poly1305_update_avx,.-poly1305_update_avx -+############################################################################### -+# void poly1305_finish_avx(void* $state, uint64_t mac[2]); -+.type poly1305_finish_avx,\@function, 2 -+.globl poly1305_finish_avx -+poly1305_finish_avx: -+___ -+my $mac="%rsi"; -+$code.=<<___; -+ mov $_A0_($state), %r8d -+ mov $_A1_($state), %eax -+ mov $_A2_($state), %r9d -+ mov $_A3_($state), %ecx -+ mov $_A4_($state), %r10d -+ -+ shl \$26, %rax -+ add %rax, %r8 -+ -+ mov %r9, %rax -+ shl \$52, %rax -+ shr \$12, %r9 -+ add %rax, %r8 -+ adc \$0, %r9 -+ -+ mov %r10, %rax -+ shl \$14, %rcx -+ shl \$40, %rax -+ shr \$24, %r10 -+ -+ add %rcx, %r9 -+ add %rax, %r9 -+ adc \$0, %r10 -+ -+ mov %r10, %rax -+ shr \$2, %rax -+ and \$3, %r10 -+ -+ mov %rax, %rcx -+ shl \$2, %rax -+ add %rcx, %rax -+ -+ add %rax, %r8 -+ adc \$0, %r9 -+ adc \$0, %r10 -+ -+ mov %r8, %rax -+ mov %r9, %rcx -+ sub \$-5, %rax -+ sbb \$-1, %rcx -+ sbb \$3, %r10 -+ -+ cmovc %r8, %rax -+ cmovc %r9, %rcx -+ add $_k_($state), %rax -+ adc $_k_+8($state), %rcx -+ mov %rax, ($mac) -+ mov %rcx, 8($mac) -+ ret -+.size poly1305_finish_avx,.-poly1305_finish_avx -+___ -+} -+}} -+ -+$code =~ s/\`([^\`]*)\`/eval($1)/gem; -+print $code; -+close STDOUT; -+ -diff --git a/crypto/chacha20poly1305/asm/poly1305_avx2.pl b/crypto/chacha20poly1305/asm/poly1305_avx2.pl -new file mode 100644 -index 0000000..d2dd51f ---- /dev/null -+++ b/crypto/chacha20poly1305/asm/poly1305_avx2.pl -@@ -0,0 +1,984 @@ -+############################################################################## -+# # -+# Copyright 2014 Intel Corporation # -+# # -+# Licensed under the Apache License, Version 2.0 (the "License"); # -+# you may not use this file except in compliance with the License. # -+# You may obtain a copy of the License at # -+# # -+# http://www.apache.org/licenses/LICENSE-2.0 # -+# # -+# Unless required by applicable law or agreed to in writing, software # -+# distributed under the License is distributed on an "AS IS" BASIS, # -+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # -+# See the License for the specific language governing permissions and # -+# limitations under the License. # -+# # -+############################################################################## -+# # -+# Developers and authors: # -+# Shay Gueron (1, 2), and Vlad Krasnov (1) # -+# (1) Intel Corporation, Israel Development Center # -+# (2) University of Haifa # -+# # -+############################################################################## -+ -+$flavour = shift; -+$output = shift; -+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } -+ -+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); -+ -+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -+die "can't locate x86_64-xlate.pl"; -+ -+open OUT,"| \"$^X\" $xlate $flavour $output"; -+*STDOUT=*OUT; -+ -+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` -+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) { -+ $avx = ($1>=2.19) + ($1>=2.22); -+} -+ -+if ($win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && -+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { -+ $avx = ($1>=2.09) + ($1>=2.10); -+} -+ -+if ($win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && -+ `ml64 2>&1` =~ /Version ([0-9]+)\./) { -+ $avx = ($1>=10) + ($1>=11); -+} -+ -+if (`$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) { -+ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 -+ $avx = ($ver>=3.0) + ($ver>=3.01); -+} -+ -+if ($avx>=1) {{ -+ -+my ($_r0_, $_r1_, $_r2_, $_r3_, $_r4_, -+ $_r1_x5, $_r2_x5, $_r3_x5, $_r4_x5, -+ $_k_, -+ $_A0_, $_A1_, $_A2_, $_A3_, $_A4_) -+ = (64, 96, 128, 160, 192, -+ 224, 256, 288, 320, -+ 40, -+ 352, 356, 360, 364, 368); -+ -+$code.=<<___; -+.text -+.align 32 -+.LandMask: -+.quad 0x3FFFFFF, 0x3FFFFFF, 0x3FFFFFF, 0x3FFFFFF -+.LsetBit: -+.quad 0x1000000, 0x1000000, 0x1000000, 0x1000000 -+.LrSet: -+.quad 0xFFFFFFC0FFFFFFF, 0xFFFFFFC0FFFFFFF, 0xFFFFFFC0FFFFFFF, 0xFFFFFFC0FFFFFFF -+.quad 0xFFFFFFC0FFFFFFC, 0xFFFFFFC0FFFFFFC, 0xFFFFFFC0FFFFFFC, 0xFFFFFFC0FFFFFFC -+ -+.LpermFix: -+.long 6,7,6,7,6,7,6,7 -+.long 4,5,6,7,6,7,6,7 -+.long 2,3,6,7,4,5,6,7 -+.long 0,1,4,5,2,3,6,7 -+___ -+ -+ -+{ -+my ($A0, $A1, $A2, $A3, $A4, -+ $r0, $r1, $r2, $r3, $r4, -+ $T0, $T1, $A5, $A6, $A7, $A8)=map("%xmm$_",(0..15)); -+ -+my ($A0_y, $A1_y, $A2_y, $A3_y, $A4_y, -+ $r0_y, $r1_y, $r2_y, $r3_y, $r4_y)=map("%ymm$_",(0..9)); -+ -+my ($state, $key) -+ =("%rdi", "%rsi"); -+ -+$code.=<<___; -+################################################################################ -+# void poly1305_init_avx2(void *state) -+ -+.type poly1305_init_avx2, \@function, 2 -+.align 64 -+poly1305_init_avx2: -+ vzeroupper -+ -+ # Init the MAC value -+ mov 8*0($state), %r8 -+ mov 8*1($state), %r9 -+ mov 8*2($state), %r10 -+ -+ mov %r8, %rax -+ mov %r9, %rcx -+ -+ sub \$-5, %r8 -+ sbb \$-1, %r9 -+ sbb \$3, %r10 -+ -+ cmovc %rax, %r8 -+ cmovc %rcx, %r9 -+ cmovc 8*2($state), %r10 -+ -+ mov %r8, %rax -+ and \$0x3ffffff, %rax -+ mov %eax, $_A0_($state) -+ -+ shrd \$26, %r9, %r8 -+ shrd \$26, %r10, %r9 -+ -+ mov %r8, %rax -+ and \$0x3ffffff, %rax -+ mov %eax, $_A1_($state) -+ -+ shrd \$26, %r9, %r8 -+ shr \$26, %r9 -+ -+ mov %r8, %rax -+ and \$0x3ffffff, %rax -+ mov %eax, $_A2_($state) -+ -+ shrd \$26, %r9, %r8 -+ -+ mov %r8, %rax -+ and \$0x3ffffff, %rax -+ mov %eax, $_A3_($state) -+ -+ shr \$26, %r8 -+ -+ mov %r8, %rax -+ and \$0x3ffffff, %rax -+ mov %eax, $_A4_($state) -+ -+ # load and convert r -+ vmovq 8*3($state), $r0 -+ vmovq 8*4($state), $T0 -+ vpand .LrSet(%rip), $r0, $r0 -+ vpand .LrSet+32(%rip), $T0, $T0 -+ -+ vpsrlq \$26, $r0, $r1 -+ vpand .LandMask(%rip), $r0, $r0 -+ vpsrlq \$26, $r1, $r2 -+ vpand .LandMask(%rip), $r1, $r1 -+ vpsllq \$12, $T0, $T1 -+ vpxor $T1, $r2, $r2 -+ vpsrlq \$26, $r2, $r3 -+ vpsrlq \$40, $T0, $r4 -+ vpand .LandMask(%rip), $r2, $r2 -+ vpand .LandMask(%rip), $r3, $r3 -+ # SQR R -+ vpmuludq $r0, $r0, $A0 -+ vpmuludq $r1, $r0, $A1 -+ vpmuludq $r2, $r0, $A2 -+ vpmuludq $r3, $r0, $A3 -+ vpmuludq $r4, $r0, $A4 -+ -+ vpsllq \$1, $A1, $A1 -+ vpsllq \$1, $A2, $A2 -+ vpmuludq $r1, $r1, $T0 -+ vpaddq $T0, $A2, $A2 -+ vpmuludq $r2, $r1, $T0 -+ vpaddq $T0, $A3, $A3 -+ vpmuludq $r3, $r1, $T0 -+ vpaddq $T0, $A4, $A4 -+ vpmuludq $r4, $r1, $A5 -+ -+ vpsllq \$1, $A3, $A3 -+ vpsllq \$1, $A4, $A4 -+ vpmuludq $r2, $r2, $T0 -+ vpaddq $T0, $A4, $A4 -+ vpmuludq $r3, $r2, $T0 -+ vpaddq $T0, $A5, $A5 -+ vpmuludq $r4, $r2, $A6 -+ -+ vpsllq \$1, $A5, $A5 -+ vpsllq \$1, $A6, $A6 -+ vpmuludq $r3, $r3, $T0 -+ vpaddq $T0, $A6, $A6 -+ vpmuludq $r4, $r3, $A7 -+ -+ vpsllq \$1, $A7, $A7 -+ vpmuludq $r4, $r4, $A8 -+ -+ # Reduce -+ vpsrlq \$26, $A4, $T0 -+ vpand .LandMask(%rip), $A4, $A4 -+ vpaddq $T0, $A5, $A5 -+ -+ vpsllq \$2, $A5, $T0 -+ vpaddq $T0, $A5, $A5 -+ vpsllq \$2, $A6, $T0 -+ vpaddq $T0, $A6, $A6 -+ vpsllq \$2, $A7, $T0 -+ vpaddq $T0, $A7, $A7 -+ vpsllq \$2, $A8, $T0 -+ vpaddq $T0, $A8, $A8 -+ -+ vpaddq $A5, $A0, $A0 -+ vpaddq $A6, $A1, $A1 -+ vpaddq $A7, $A2, $A2 -+ vpaddq $A8, $A3, $A3 -+ -+ vpsrlq \$26, $A0, $T0 -+ vpand .LandMask(%rip), $A0, $A0 -+ vpaddq $T0, $A1, $A1 -+ vpsrlq \$26, $A1, $T0 -+ vpand .LandMask(%rip), $A1, $A1 -+ vpaddq $T0, $A2, $A2 -+ vpsrlq \$26, $A2, $T0 -+ vpand .LandMask(%rip), $A2, $A2 -+ vpaddq $T0, $A3, $A3 -+ vpsrlq \$26, $A3, $T0 -+ vpand .LandMask(%rip), $A3, $A3 -+ vpaddq $T0, $A4, $A4 -+ -+ vpunpcklqdq $r0, $A0, $r0 -+ vpunpcklqdq $r1, $A1, $r1 -+ vpunpcklqdq $r2, $A2, $r2 -+ vpunpcklqdq $r3, $A3, $r3 -+ vpunpcklqdq $r4, $A4, $r4 -+ -+ vmovdqu $r0, $_r0_+16($state) -+ vmovdqu $r1, $_r1_+16($state) -+ vmovdqu $r2, $_r2_+16($state) -+ vmovdqu $r3, $_r3_+16($state) -+ vmovdqu $r4, $_r4_+16($state) -+ -+ vpsllq \$2, $r1, $A1 -+ vpsllq \$2, $r2, $A2 -+ vpsllq \$2, $r3, $A3 -+ vpsllq \$2, $r4, $A4 -+ -+ vpaddq $A1, $r1, $A1 -+ vpaddq $A2, $r2, $A2 -+ vpaddq $A3, $r3, $A3 -+ vpaddq $A4, $r4, $A4 -+ -+ vmovdqu $A1, $_r1_x5+16($state) -+ vmovdqu $A2, $_r2_x5+16($state) -+ vmovdqu $A3, $_r3_x5+16($state) -+ vmovdqu $A4, $_r4_x5+16($state) -+ -+ # Compute r^3 and r^4 -+ vpshufd \$0x44, $r0, $A0 -+ vpshufd \$0x44, $r1, $A1 -+ vpshufd \$0x44, $r2, $A2 -+ vpshufd \$0x44, $r3, $A3 -+ vpshufd \$0x44, $r4, $A4 -+ -+ # Multiply input by R[0] -+ vmovdqu $_r0_+16($state), $T0 -+ vpmuludq $T0, $A0, $r0 -+ vpmuludq $T0, $A1, $r1 -+ vpmuludq $T0, $A2, $r2 -+ vpmuludq $T0, $A3, $r3 -+ vpmuludq $T0, $A4, $r4 -+ # Multiply input by R[1] (and R[1]*5) -+ vmovdqu $_r1_x5+16($state), $T0 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $r0, $r0 -+ vmovdqu $_r1_+16($state), $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $r1, $r1 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $r2, $r2 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $r3, $r3 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $r4, $r4 -+ # Etc -+ vmovdqu $_r2_x5+16($state), $T0 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $r0, $r0 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $r1, $r1 -+ vmovdqu $_r2_+16($state), $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $r2, $r2 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $r3, $r3 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $r4, $r4 -+ -+ vmovdqu $_r3_x5+16($state), $T0 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $r0, $r0 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $r1, $r1 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $r2, $r2 -+ vmovdqu $_r3_+16($state), $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $r3, $r3 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $r4, $r4 -+ -+ vmovdqu $_r4_x5+16($state), $T0 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $r0, $r0 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $r1, $r1 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $r2, $r2 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $r3, $r3 -+ vmovdqu $_r4_+16($state), $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $r4, $r4 -+ # Reduce -+ vpsrlq \$26, $r3, $T0 -+ vpaddq $T0, $r4, $r4 -+ vpand .LandMask(%rip), $r3, $r3 -+ vpsrlq \$26, $r4, $T0 -+ vpsllq \$2, $T0, $T1 -+ vpaddq $T1, $T0, $T0 -+ vpaddq $T0, $r0, $r0 -+ vpand .LandMask(%rip), $r4, $r4 -+ vpsrlq \$26, $r0, $T0 -+ vpand .LandMask(%rip), $r0, $r0 -+ vpaddq $T0, $r1, $r1 -+ vpsrlq \$26, $r1, $T0 -+ vpand .LandMask(%rip), $r1, $r1 -+ vpaddq $T0, $r2, $r2 -+ vpsrlq \$26, $r2, $T0 -+ vpand .LandMask(%rip), $r2, $r2 -+ vpaddq $T0, $r3, $r3 -+ vpsrlq \$26, $r3, $T0 -+ vpand .LandMask(%rip), $r3, $r3 -+ vpaddq $T0, $r4, $r4 -+ -+ vmovdqu $r0, $_r0_($state) -+ vmovdqu $r1, $_r1_($state) -+ vmovdqu $r2, $_r2_($state) -+ vmovdqu $r3, $_r3_($state) -+ vmovdqu $r4, $_r4_($state) -+ -+ vpsllq \$2, $r1, $A1 -+ vpsllq \$2, $r2, $A2 -+ vpsllq \$2, $r3, $A3 -+ vpsllq \$2, $r4, $A4 -+ -+ vpaddq $A1, $r1, $A1 -+ vpaddq $A2, $r2, $A2 -+ vpaddq $A3, $r3, $A3 -+ vpaddq $A4, $r4, $A4 -+ -+ vmovdqu $A1, $_r1_x5($state) -+ vmovdqu $A2, $_r2_x5($state) -+ vmovdqu $A3, $_r3_x5($state) -+ vmovdqu $A4, $_r4_x5($state) -+ -+ movq \$1, 8*7($state) -+ -+ ret -+.size poly1305_init_avx2,.-poly1305_init_avx2 -+___ -+} -+ -+{ -+ -+my ($A0, $A1, $A2, $A3, $A4, -+ $T0, $T1, $R0, $R1, $R2, -+ $R3, $R4, $AND_MASK, $PERM_MASK, $SET_MASK) -+ =map("%ymm$_",(0..14)); -+ -+my ($A0_x, $A1_x, $A2_x, $A3_x, $A4_x, -+ $T0_x, $T1_x, $R0_x, $R1_x, $R2_x, -+ $R3_x, $R4_x, $AND_MASK_x, $PERM_MASK_x, $SET_MASK_x) -+ =map("%xmm$_",(0..14)); -+ -+my ($state, $in, $in_len, $hlp, $rsp_save) -+ =("%rdi", "%rsi", "%rdx", "%rcx", "%rax"); -+ -+$code.=<<___; -+ -+############################################################################### -+# void poly1305_update_avx2(void* $state, void* in, uint64_t in_len) -+.globl poly1305_update_avx2 -+.type poly1305_update_avx2, \@function, 2 -+.align 64 -+poly1305_update_avx2: -+ -+ -+ test $in_len, $in_len -+ jz 6f -+ -+ cmpq \$0, 8*7($state) -+ jne 1f # This means we already started avx2, and must finish -+ -+ # If we are here we need to check in_len is longer than the min for avx2 -+ -+ cmp \$512, $in_len -+ jge 2f -+ -+ jmp poly1305_update_x64 -+ -+2: -+ call poly1305_init_avx2 -+ -+1: -+ vmovd $_A0_($state), $A0_x -+ vmovd $_A1_($state), $A1_x -+ vmovd $_A2_($state), $A2_x -+ vmovd $_A3_($state), $A3_x -+ vmovd $_A4_($state), $A4_x -+ -+ vmovdqa .LandMask(%rip), $AND_MASK -+1: -+ cmp \$32*4, $in_len -+ jb 1f -+ sub \$32*2, $in_len -+ -+ # load the next four blocks -+ vmovdqu 32*0($in), $R2 -+ vmovdqu 32*1($in), $R3 -+ add \$32*2, $in -+ -+ vpunpcklqdq $R3, $R2, $R0 -+ vpunpckhqdq $R3, $R2, $R1 -+ -+ vpermq \$0xD8, $R0, $R0 # it is possible to rearrange the precomputations, and save this shuffle -+ vpermq \$0xD8, $R1, $R1 -+ -+ vpsrlq \$26, $R0, $R2 -+ vpand $AND_MASK, $R0, $R0 -+ vpaddq $R0, $A0, $A0 -+ -+ vpsrlq \$26, $R2, $R0 -+ vpand $AND_MASK, $R2, $R2 -+ vpaddq $R2, $A1, $A1 -+ -+ vpsllq \$12, $R1, $R2 -+ vpxor $R2, $R0, $R0 -+ vpand $AND_MASK, $R0, $R0 -+ vpaddq $R0, $A2, $A2 -+ -+ vpsrlq \$26, $R2, $R0 -+ vpsrlq \$40, $R1, $R2 -+ vpand $AND_MASK, $R0, $R0 -+ vpxor .LsetBit(%rip), $R2, $R2 -+ vpaddq $R0, $A3, $A3 -+ vpaddq $R2, $A4, $A4 -+ -+ # Multiply input by R[0] -+ vpbroadcastq $_r0_($state), $T0 -+ vpmuludq $T0, $A0, $R0 -+ vpmuludq $T0, $A1, $R1 -+ vpmuludq $T0, $A2, $R2 -+ vpmuludq $T0, $A3, $R3 -+ vpmuludq $T0, $A4, $R4 -+ # Multiply input by R[1] (and R[1]*5) -+ vpbroadcastq $_r1_x5($state), $T0 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $R0, $R0 -+ vpbroadcastq $_r1_($state), $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $R1, $R1 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $R2, $R2 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $R3, $R3 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $R4, $R4 -+ # Etc -+ vpbroadcastq $_r2_x5($state), $T0 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $R0, $R0 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $R1, $R1 -+ vpbroadcastq $_r2_($state), $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $R2, $R2 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $R3, $R3 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $R4, $R4 -+ -+ vpbroadcastq $_r3_x5($state), $T0 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $R0, $R0 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $R1, $R1 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $R2, $R2 -+ vpbroadcastq $_r3_($state), $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $R3, $R3 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $R4, $R4 -+ -+ vpbroadcastq $_r4_x5($state), $T0 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $R0, $R0 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $R1, $R1 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $R2, $R2 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $R3, $R3 -+ vpbroadcastq $_r4_($state), $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $R4, $R4 -+ # Reduce -+ vpsrlq \$26, $R3, $T0 -+ vpaddq $T0, $R4, $R4 -+ vpand $AND_MASK, $R3, $R3 -+ -+ vpsrlq \$26, $R4, $T0 -+ vpsllq \$2, $T0, $T1 -+ vpaddq $T1, $T0, $T0 -+ vpaddq $T0, $R0, $R0 -+ vpand $AND_MASK, $R4, $R4 -+ -+ vpsrlq \$26, $R0, $T0 -+ vpand $AND_MASK, $R0, $A0 -+ vpaddq $T0, $R1, $R1 -+ vpsrlq \$26, $R1, $T0 -+ vpand $AND_MASK, $R1, $A1 -+ vpaddq $T0, $R2, $R2 -+ vpsrlq \$26, $R2, $T0 -+ vpand $AND_MASK, $R2, $A2 -+ vpaddq $T0, $R3, $R3 -+ vpsrlq \$26, $R3, $T0 -+ vpand $AND_MASK, $R3, $A3 -+ vpaddq $T0, $R4, $A4 -+ jmp 1b -+1: -+ -+ cmp \$32*2, $in_len -+ jb 1f -+ sub \$32*2, $in_len -+ # load the next four blocks -+ vmovdqu 32*0($in), $R2 -+ vmovdqu 32*1($in), $R3 -+ add \$32*2, $in -+ -+ vpunpcklqdq $R3, $R2, $R0 -+ vpunpckhqdq $R3, $R2, $R1 -+ -+ vpermq \$0xD8, $R0, $R0 -+ vpermq \$0xD8, $R1, $R1 -+ -+ vpsrlq \$26, $R0, $R2 -+ vpand $AND_MASK, $R0, $R0 -+ vpaddq $R0, $A0, $A0 -+ -+ vpsrlq \$26, $R2, $R0 -+ vpand $AND_MASK, $R2, $R2 -+ vpaddq $R2, $A1, $A1 -+ -+ vpsllq \$12, $R1, $R2 -+ vpxor $R2, $R0, $R0 -+ vpand $AND_MASK, $R0, $R0 -+ vpaddq $R0, $A2, $A2 -+ -+ vpsrlq \$26, $R2, $R0 -+ vpsrlq \$40, $R1, $R2 -+ vpand $AND_MASK, $R0, $R0 -+ vpxor .LsetBit(%rip), $R2, $R2 -+ vpaddq $R0, $A3, $A3 -+ vpaddq $R2, $A4, $A4 -+ -+ # Multiply input by R[0] -+ vmovdqu $_r0_($state), $T0 -+ vpmuludq $T0, $A0, $R0 -+ vpmuludq $T0, $A1, $R1 -+ vpmuludq $T0, $A2, $R2 -+ vpmuludq $T0, $A3, $R3 -+ vpmuludq $T0, $A4, $R4 -+ # Multiply input by R[1] (and R[1]*5) -+ vmovdqu $_r1_x5($state), $T0 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $R0, $R0 -+ vmovdqu $_r1_($state), $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $R1, $R1 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $R2, $R2 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $R3, $R3 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $R4, $R4 -+ # Etc -+ vmovdqu $_r2_x5($state), $T0 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $R0, $R0 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $R1, $R1 -+ vmovdqu $_r2_($state), $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $R2, $R2 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $R3, $R3 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $R4, $R4 -+ -+ vmovdqu $_r3_x5($state), $T0 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $R0, $R0 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $R1, $R1 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $R2, $R2 -+ vmovdqu $_r3_($state), $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $R3, $R3 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $R4, $R4 -+ -+ vmovdqu $_r4_x5($state), $T0 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $R0, $R0 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $R1, $R1 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $R2, $R2 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $R3, $R3 -+ vmovdqu $_r4_($state), $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $R4, $R4 -+ # Reduce -+ vpsrlq \$26, $R3, $T0 -+ vpaddq $T0, $R4, $R4 -+ vpand $AND_MASK, $R3, $R3 -+ vpsrlq \$26, $R4, $T0 -+ vpsllq \$2, $T0, $T1 -+ vpaddq $T1, $T0, $T0 -+ vpaddq $T0, $R0, $R0 -+ vpand $AND_MASK, $R4, $R4 -+ vpsrlq \$26, $R0, $T0 -+ vpand $AND_MASK, $R0, $A0 -+ vpaddq $T0, $R1, $R1 -+ vpsrlq \$26, $R1, $T0 -+ vpand $AND_MASK, $R1, $A1 -+ vpaddq $T0, $R2, $R2 -+ vpsrlq \$26, $R2, $T0 -+ vpand $AND_MASK, $R2, $A2 -+ vpaddq $T0, $R3, $R3 -+ vpsrlq \$26, $R3, $T0 -+ vpand $AND_MASK, $R3, $A3 -+ vpaddq $T0, $R4, $A4 -+ -+ vpsrldq \$8, $A0, $R0 -+ vpsrldq \$8, $A1, $R1 -+ vpsrldq \$8, $A2, $R2 -+ vpsrldq \$8, $A3, $R3 -+ vpsrldq \$8, $A4, $R4 -+ -+ vpaddq $R0, $A0, $A0 -+ vpaddq $R1, $A1, $A1 -+ vpaddq $R2, $A2, $A2 -+ vpaddq $R3, $A3, $A3 -+ vpaddq $R4, $A4, $A4 -+ -+ vpermq \$0xAA, $A0, $R0 -+ vpermq \$0xAA, $A1, $R1 -+ vpermq \$0xAA, $A2, $R2 -+ vpermq \$0xAA, $A3, $R3 -+ vpermq \$0xAA, $A4, $R4 -+ -+ vpaddq $R0, $A0, $A0 -+ vpaddq $R1, $A1, $A1 -+ vpaddq $R2, $A2, $A2 -+ vpaddq $R3, $A3, $A3 -+ vpaddq $R4, $A4, $A4 -+1: -+ test $in_len, $in_len -+ jz 5f -+ # In case 1,2 or 3 blocks remain, we want to multiply them correctly -+ vmovq $A0_x, $A0_x -+ vmovq $A1_x, $A1_x -+ vmovq $A2_x, $A2_x -+ vmovq $A3_x, $A3_x -+ vmovq $A4_x, $A4_x -+ -+ mov .LsetBit(%rip), $hlp -+ mov %rsp, $rsp_save -+ test \$15, $in_len -+ jz 1f -+ xor $hlp, $hlp -+ sub \$64, %rsp -+ vpxor $R0, $R0, $R0 -+ vmovdqu $R0, (%rsp) -+ vmovdqu $R0, 32(%rsp) -+3: -+ movb ($in, $hlp), %r8b -+ movb %r8b, (%rsp, $hlp) -+ inc $hlp -+ cmp $hlp, $in_len -+ jne 3b -+ -+ movb \$1, (%rsp, $hlp) -+ xor $hlp, $hlp -+ mov %rsp, $in -+ -+1: -+ -+ cmp \$16, $in_len -+ ja 2f -+ vmovq 8*0($in), $R0_x -+ vmovq 8*1($in), $R1_x -+ vmovq $hlp, $SET_MASK_x -+ vmovdqa .LpermFix(%rip), $PERM_MASK -+ jmp 1f -+2: -+ cmp \$32, $in_len -+ ja 2f -+ vmovdqu 16*0($in), $R2_x -+ vmovdqu 16*1($in), $R3_x -+ vmovq .LsetBit(%rip), $SET_MASK_x -+ vpinsrq \$1, $hlp, $SET_MASK_x, $SET_MASK_x -+ vmovdqa .LpermFix+32(%rip), $PERM_MASK -+ -+ vpunpcklqdq $R3, $R2, $R0 -+ vpunpckhqdq $R3, $R2, $R1 -+ jmp 1f -+2: -+ cmp \$48, $in_len -+ ja 2f -+ vmovdqu 32*0($in), $R2 -+ vmovdqu 32*1($in), $R3_x -+ vmovq .LsetBit(%rip), $SET_MASK_x -+ vpinsrq \$1, $hlp, $SET_MASK_x, $SET_MASK_x -+ vpermq \$0xc4, $SET_MASK, $SET_MASK -+ vmovdqa .LpermFix+64(%rip), $PERM_MASK -+ -+ vpunpcklqdq $R3, $R2, $R0 -+ vpunpckhqdq $R3, $R2, $R1 -+ jmp 1f -+2: -+ vmovdqu 32*0($in), $R2 -+ vmovdqu 32*1($in), $R3 -+ vmovq .LsetBit(%rip), $SET_MASK_x -+ vpinsrq \$1, $hlp, $SET_MASK_x, $SET_MASK_x -+ vpermq \$0x40, $SET_MASK, $SET_MASK -+ vmovdqa .LpermFix+96(%rip), $PERM_MASK -+ -+ vpunpcklqdq $R3, $R2, $R0 -+ vpunpckhqdq $R3, $R2, $R1 -+ -+1: -+ mov $rsp_save, %rsp -+ -+ vpsrlq \$26, $R0, $R2 -+ vpand $AND_MASK, $R0, $R0 -+ vpaddq $R0, $A0, $A0 -+ -+ vpsrlq \$26, $R2, $R0 -+ vpand $AND_MASK, $R2, $R2 -+ vpaddq $R2, $A1, $A1 -+ -+ vpsllq \$12, $R1, $R2 -+ vpxor $R2, $R0, $R0 -+ vpand $AND_MASK, $R0, $R0 -+ vpaddq $R0, $A2, $A2 -+ -+ vpsrlq \$26, $R2, $R0 -+ vpsrlq \$40, $R1, $R2 -+ vpand $AND_MASK, $R0, $R0 -+ vpxor $SET_MASK, $R2, $R2 -+ vpaddq $R0, $A3, $A3 -+ vpaddq $R2, $A4, $A4 -+ -+ # Multiply input by R[0] -+ vmovdqu $_r0_($state), $T0 -+ vpermd $T0, $PERM_MASK, $T0 -+ vpmuludq $T0, $A0, $R0 -+ vpmuludq $T0, $A1, $R1 -+ vpmuludq $T0, $A2, $R2 -+ vpmuludq $T0, $A3, $R3 -+ vpmuludq $T0, $A4, $R4 -+ # Multiply input by R[1] (and R[1]*5) -+ vmovdqu $_r1_x5($state), $T0 -+ vpermd $T0, $PERM_MASK, $T0 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $R0, $R0 -+ vmovdqu $_r1_($state), $T0 -+ vpermd $T0, $PERM_MASK, $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $R1, $R1 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $R2, $R2 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $R3, $R3 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $R4, $R4 -+ # Etc -+ vmovdqu $_r2_x5($state), $T0 -+ vpermd $T0, $PERM_MASK, $T0 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $R0, $R0 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $R1, $R1 -+ vmovdqu $_r2_($state), $T0 -+ vpermd $T0, $PERM_MASK, $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $R2, $R2 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $R3, $R3 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $R4, $R4 -+ -+ vmovdqu $_r3_x5($state), $T0 -+ vpermd $T0, $PERM_MASK, $T0 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $R0, $R0 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $R1, $R1 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $R2, $R2 -+ vmovdqu $_r3_($state), $T0 -+ vpermd $T0, $PERM_MASK, $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $R3, $R3 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $R4, $R4 -+ -+ vmovdqu $_r4_x5($state), $T0 -+ vpermd $T0, $PERM_MASK, $T0 -+ vpmuludq $T0, $A1, $T1 -+ vpaddq $T1, $R0, $R0 -+ vpmuludq $T0, $A2, $T1 -+ vpaddq $T1, $R1, $R1 -+ vpmuludq $T0, $A3, $T1 -+ vpaddq $T1, $R2, $R2 -+ vpmuludq $T0, $A4, $T1 -+ vpaddq $T1, $R3, $R3 -+ vmovdqu $_r4_($state), $T0 -+ vpermd $T0, $PERM_MASK, $T0 -+ vpmuludq $T0, $A0, $T1 -+ vpaddq $T1, $R4, $R4 -+ # Reduce -+ vpsrlq \$26, $R3, $T0 -+ vpaddq $T0, $R4, $R4 -+ vpand $AND_MASK, $R3, $R3 -+ vpsrlq \$26, $R4, $T0 -+ vpsllq \$2, $T0, $T1 -+ vpaddq $T1, $T0, $T0 -+ vpaddq $T0, $R0, $R0 -+ vpand $AND_MASK, $R4, $R4 -+ vpsrlq \$26, $R0, $T0 -+ vpand $AND_MASK, $R0, $A0 -+ vpaddq $T0, $R1, $R1 -+ vpsrlq \$26, $R1, $T0 -+ vpand $AND_MASK, $R1, $A1 -+ vpaddq $T0, $R2, $R2 -+ vpsrlq \$26, $R2, $T0 -+ vpand $AND_MASK, $R2, $A2 -+ vpaddq $T0, $R3, $R3 -+ vpsrlq \$26, $R3, $T0 -+ vpand $AND_MASK, $R3, $A3 -+ vpaddq $T0, $R4, $A4 -+ -+ vpsrldq \$8, $A0, $R0 -+ vpsrldq \$8, $A1, $R1 -+ vpsrldq \$8, $A2, $R2 -+ vpsrldq \$8, $A3, $R3 -+ vpsrldq \$8, $A4, $R4 -+ -+ vpaddq $R0, $A0, $A0 -+ vpaddq $R1, $A1, $A1 -+ vpaddq $R2, $A2, $A2 -+ vpaddq $R3, $A3, $A3 -+ vpaddq $R4, $A4, $A4 -+ -+ vpermq \$0xAA, $A0, $R0 -+ vpermq \$0xAA, $A1, $R1 -+ vpermq \$0xAA, $A2, $R2 -+ vpermq \$0xAA, $A3, $R3 -+ vpermq \$0xAA, $A4, $R4 -+ -+ vpaddq $R0, $A0, $A0 -+ vpaddq $R1, $A1, $A1 -+ vpaddq $R2, $A2, $A2 -+ vpaddq $R3, $A3, $A3 -+ vpaddq $R4, $A4, $A4 -+ -+5: -+ vmovd $A0_x, $_A0_($state) -+ vmovd $A1_x, $_A1_($state) -+ vmovd $A2_x, $_A2_($state) -+ vmovd $A3_x, $_A3_($state) -+ vmovd $A4_x, $_A4_($state) -+6: -+ ret -+.size poly1305_update_avx2,.-poly1305_update_avx2 -+############################################################################### -+# void poly1305_finish_avx2(void* $state, uint8_t mac[16]); -+.type poly1305_finish_avx2,\@function,2 -+.globl poly1305_finish_avx2 -+poly1305_finish_avx2: -+___ -+my $mac="%rsi"; -+my ($A0, $A1, $A2, $A3, $A4, $T0, $T1) -+ =map("%xmm$_",(0..6)); -+ -+$code.=<<___; -+ cmpq \$0, 8*7($state) -+ jne 1f -+ jmp poly1305_finish_x64 -+ -+1: -+ mov $_A0_($state), %r8d -+ mov $_A1_($state), %eax -+ mov $_A2_($state), %r9d -+ mov $_A3_($state), %ecx -+ mov $_A4_($state), %r10d -+ -+ shl \$26, %rax -+ add %rax, %r8 -+ -+ mov %r9, %rax -+ shl \$52, %rax -+ shr \$12, %r9 -+ add %rax, %r8 -+ adc \$0, %r9 -+ -+ mov %r10, %rax -+ shl \$14, %rcx -+ shl \$40, %rax -+ shr \$24, %r10 -+ -+ add %rcx, %r9 -+ add %rax, %r9 -+ adc \$0, %r10 -+ -+ mov %r10, %rax -+ shr \$2, %rax -+ and \$3, %r10 -+ -+ mov %rax, %rcx -+ shl \$2, %rax -+ add %rcx, %rax -+ -+ add %rax, %r8 -+ adc \$0, %r9 -+ adc \$0, %r10 -+ -+ mov %r8, %rax -+ mov %r9, %rcx -+ sub \$-5, %rax -+ sbb \$-1, %rcx -+ sbb \$3, %r10 -+ -+ cmovc %r8, %rax -+ cmovc %r9, %rcx -+ add $_k_($state), %rax -+ adc $_k_+8($state), %rcx -+ mov %rax, ($mac) -+ mov %rcx, 8($mac) -+ -+ ret -+.size poly1305_finish_avx2,.-poly1305_finish_avx2 -+___ -+} -+}} -+ -+$code =~ s/\`([^\`]*)\`/eval(\$1)/gem; -+print $code; -+close STDOUT; -+ -diff --git a/crypto/chacha20poly1305/asm/poly1305_x64.pl b/crypto/chacha20poly1305/asm/poly1305_x64.pl -new file mode 100644 -index 0000000..31c4c47 ---- /dev/null -+++ b/crypto/chacha20poly1305/asm/poly1305_x64.pl -@@ -0,0 +1,281 @@ -+ -+############################################################################## -+# # -+# Copyright 2016 CloudFlare LTD # -+# # -+# Licensed under the Apache License, Version 2.0 (the "License"); # -+# you may not use this file except in compliance with the License. # -+# You may obtain a copy of the License at # -+# # -+# http://www.apache.org/licenses/LICENSE-2.0 # -+# # -+# Unless required by applicable law or agreed to in writing, software # -+# distributed under the License is distributed on an "AS IS" BASIS, # -+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # -+# See the License for the specific language governing permissions and # -+# limitations under the License. # -+# # -+############################################################################## -+# # -+# Author: Vlad Krasnov # -+# # -+############################################################################## -+ -+$flavour = shift; -+$output = shift; -+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } -+ -+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); -+ -+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -+die "can't locate x86_64-xlate.pl"; -+ -+open OUT,"| \"$^X\" $xlate $flavour $output"; -+*STDOUT=*OUT; -+ -+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` -+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) { -+ $avx = ($1>=2.19) + ($1>=2.22); -+} -+ -+if ($win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && -+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { -+ $avx = ($1>=2.09) + ($1>=2.10); -+} -+ -+if ($win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && -+ `ml64 2>&1` =~ /Version ([0-9]+)\./) { -+ $avx = ($1>=10) + ($1>=11); -+} -+ -+if (`$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) { -+ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 -+ $avx = ($ver>=3.0) + ($ver>=3.01); -+} -+ -+ -+{ -+{ -+ -+my ($state, $key) -+ =("%rdi", "%rsi"); -+ -+$code.=<<___; -+ -+.LrSet: -+.align 16 -+.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC -+############################################################################### -+# void poly1305_init_x64(void *state, uint8_t key[32]) -+ -+.globl poly1305_init_x64 -+.type poly1305_init_x64, \@function, 2 -+.align 64 -+poly1305_init_x64: -+ -+ xor %rax, %rax -+ mov %rax, 8*0($state) -+ mov %rax, 8*1($state) -+ mov %rax, 8*2($state) -+ -+ movdqu 16*0($key), %xmm0 -+ movdqu 16*1($key), %xmm1 -+ pand .LrSet(%rip), %xmm0 -+ -+ movdqu %xmm0, 8*3($state) -+ movdqu %xmm1, 8*3+16($state) -+ movq \$0, 8*7($state) -+ -+ ret -+.size poly1305_init_x64,.-poly1305_init_x64 -+___ -+} -+ -+{ -+ -+my ($state, $inp) -+ =("%rdi", "%rsi"); -+ -+my ($acc0, $acc1, $acc2, $inl, $t0, $t1, $t2, $t3, $r0) -+ =("%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"); -+ -+my ($r1) -+ =("8*4($state)"); -+ -+$code.=<<___; -+############################################################################### -+# void* poly1305_update_x64(void* state, void* in, uint64_t in_len) -+.globl poly1305_update_x64 -+.type poly1305_update_x64, \@function, 2 -+.align 64 -+poly1305_update_x64: -+ -+ push %r11 -+ push %r12 -+ push %r13 -+ push %r14 -+ push %r15 -+ -+ mov %rdx, $inl -+ -+ mov 8*0($state), $acc0 -+ mov 8*1($state), $acc1 -+ mov 8*2($state), $acc2 -+ mov 8*3($state), $r0 -+ -+ cmp \$16, $inl -+ jb 2f -+ jmp 1f -+ -+.align 64 -+1: -+############################ -+ add 8*0($inp), $acc0 -+ adc 8*1($inp), $acc1 -+ lea 16($inp), $inp -+ adc \$1, $acc2 -+ -+5: -+ mov $r0, %rax -+ mulq $acc0 -+ mov %rax, $t0 -+ mov %rdx, $t1 -+ -+ mov $r0, %rax -+ mulq $acc1 -+ add %rax, $t1 -+ adc \$0, %rdx -+ -+ mov $r0, $t2 -+ imul $acc2, $t2 -+ add %rdx, $t2 -+############################ -+ mov $r1, %rax -+ mulq $acc0 -+ add %rax, $t1 -+ adc \$0, %rdx -+ mov %rdx, $acc0 -+ -+ mov $r1, %rax -+ mulq $acc1 -+ add $acc0, $t2 -+ adc \$0, %rdx -+ add %rax, $t2 -+ adc \$0, %rdx -+ -+ mov $r1, $t3 -+ imul $acc2, $t3 -+ add %rdx, $t3 -+############################ -+ -+ mov $t0, $acc0 -+ mov $t1, $acc1 -+ mov $t2, $acc2 -+ and \$3, $acc2 -+ -+ mov $t2, $t0 -+ mov $t3, $t1 -+ -+ and \$-4, $t0 -+ shrd \$2, $t3, $t2 -+ shr \$2, $t3 -+ -+ add $t0, $acc0 -+ adc $t1, $acc1 -+ adc \$0, $acc2 -+ -+ add $t2, $acc0 -+ adc $t3, $acc1 -+ adc \$0, $acc2 -+ -+ sub \$16, $inl -+ cmp \$16, $inl -+ jae 1b -+ -+2: -+ test $inl, $inl -+ jz 3f -+ -+ mov \$1, $t0 -+ xor $t1, $t1 -+ xor $t2, $t2 -+ add $inl, $inp -+ -+4: -+ shld \$8, $t0, $t1 -+ shl \$8, $t0 -+ movzxb -1($inp), $t2 -+ xor $t2, $t0 -+ dec $inp -+ dec $inl -+ jnz 4b -+ -+ add $t0, $acc0 -+ adc $t1, $acc1 -+ adc \$0, $acc2 -+ -+ mov \$16, $inl -+ jmp 5b -+ -+3: -+ -+ mov $acc0, 8*0($state) -+ mov $acc1, 8*1($state) -+ mov $acc2, 8*2($state) -+ -+ pop %r15 -+ pop %r14 -+ pop %r13 -+ pop %r12 -+ pop %r11 -+ ret -+.size poly1305_update_x64, .-poly1305_update_x64 -+___ -+} -+ -+{ -+ -+my ($mac, $state)=("%rsi", "%rdi"); -+ -+my ($acc0, $acc1, $acc2, $t0, $t1, $t2) -+ =("%rcx", "%rax", "%rdx", "%r8", "%r9", "%r10"); -+ -+$code.=<<___; -+############################################################################### -+# void poly1305_finish_x64(void* state, uint64_t mac[2]); -+.type poly1305_finish_x64,\@function, 2 -+.align 64 -+.globl poly1305_finish_x64 -+poly1305_finish_x64: -+ -+ mov 8*0($state), $acc0 -+ mov 8*1($state), $acc1 -+ mov 8*2($state), $acc2 -+ -+ mov $acc0, $t0 -+ mov $acc1, $t1 -+ mov $acc2, $t2 -+ -+ sub \$-5, $acc0 -+ sbb \$-1, $acc1 -+ sbb \$3, $acc2 -+ -+ cmovc $t0, $acc0 -+ cmovc $t1, $acc1 -+ cmovc $t2, $acc2 -+ -+ add 8*5($state), $acc0 -+ adc 8*6($state), $acc1 -+ mov $acc0, ($mac) -+ mov $acc1, 8($mac) -+ -+ ret -+.size poly1305_finish_x64, .-poly1305_finish_x64 -+___ -+} -+} -+$code =~ s/\`([^\`]*)\`/eval($1)/gem; -+print $code; -+close STDOUT; -diff --git a/crypto/chacha20poly1305/chacha20.c b/crypto/chacha20poly1305/chacha20.c -new file mode 100644 -index 0000000..3044751 ---- /dev/null -+++ b/crypto/chacha20poly1305/chacha20.c -@@ -0,0 +1,162 @@ -+/* Copyright (c) 2014, Google Inc. -+ * -+ * Permission to use, copy, modify, and/or distribute this software for any -+ * purpose with or without fee is hereby granted, provided that the above -+ * copyright notice and this permission notice appear in all copies. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -+ -+/* Adapted from the public domain, estream code by D. Bernstein. */ -+ -+#include "chacha20poly1305.h" -+ -+/* sigma contains the ChaCha constants, which happen to be an ASCII string. */ -+static const char sigma[16] = "expand 32-byte k"; -+ -+#define ROTATE(v, n) (((v) << (n)) | ((v) >> (32 - (n)))) -+#define XOR(v, w) ((v) ^ (w)) -+#define PLUS(x, y) ((x) + (y)) -+#define PLUSONE(v) (PLUS((v), 1)) -+ -+#define U32TO8_LITTLE(p, v) \ -+ { \ -+ (p)[0] = (v >> 0) & 0xff; \ -+ (p)[1] = (v >> 8) & 0xff; \ -+ (p)[2] = (v >> 16) & 0xff; \ -+ (p)[3] = (v >> 24) & 0xff; \ -+ } -+ -+#define U8TO32_LITTLE(p) \ -+ (((uint32_t)((p)[0])) | ((uint32_t)((p)[1]) << 8) | \ -+ ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24)) -+ -+/* QUARTERROUND updates a, b, c, d with a ChaCha "quarter" round. */ -+#define QUARTERROUND(a,b,c,d) \ -+ x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]),16); \ -+ x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]),12); \ -+ x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]), 8); \ -+ x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]), 7); -+ -+/* chacha_core performs |num_rounds| rounds of ChaCha20 on the input words in -+ * |input| and writes the 64 output bytes to |output|. */ -+static void chacha_core(uint8_t output[64], const uint32_t input[16]) { -+ uint32_t x[16]; -+ int i; -+ -+ memcpy(x, input, sizeof(uint32_t) * 16); -+ for (i = 20; i > 0; i -= 2) { -+ QUARTERROUND(0, 4, 8, 12) -+ QUARTERROUND(1, 5, 9, 13) -+ QUARTERROUND(2, 6, 10, 14) -+ QUARTERROUND(3, 7, 11, 15) -+ QUARTERROUND(0, 5, 10, 15) -+ QUARTERROUND(1, 6, 11, 12) -+ QUARTERROUND(2, 7, 8, 13) -+ QUARTERROUND(3, 4, 9, 14) -+ } -+ -+ for (i = 0; i < 16; ++i) { -+ x[i] = PLUS(x[i], input[i]); -+ } -+ for (i = 0; i < 16; ++i) { -+ U32TO8_LITTLE(output + 4 * i, x[i]); -+ } -+} -+ -+void CRYPTO_chacha_20(uint8_t *out, const uint8_t *in, size_t in_len, -+ uint8_t nonce[48]) { -+ -+#ifdef CHAPOLY_x86_64_ASM -+ const int AVX2_threshold = 256; -+ const int AVX2_min_buf = 128; -+ const int AVX_min_buf = 64; -+ -+ uint8_t buf[128]; -+ size_t buf_size; -+ -+ void (*core_func)(uint8_t *out, const uint8_t *in, size_t in_len, -+ uint8_t nonce[48]) = NULL; -+#else /* !CHAPOLY_x86_64_ASM */ -+ -+ uint8_t buf[64]; -+ -+#endif -+ -+ uint32_t input[16]; -+ size_t todo, i; -+ -+#ifdef CHAPOLY_x86_64_ASM -+ if (in_len >= AVX2_threshold && ((OPENSSL_ia32cap_loc()[1] >> 5) & 1)) { -+ buf_size = AVX2_min_buf; -+ core_func = chacha_20_core_avx2; -+ } else if ((OPENSSL_ia32cap_loc()[0] >> 60) & 1) { -+ buf_size = AVX_min_buf; -+ core_func = chacha_20_core_avx; -+ } else goto do_legacy; -+ -+ core_func(out, in, in_len, nonce); -+ todo = in_len & (buf_size - 1); -+ -+ if(todo) { -+ out += in_len - todo; -+ in += in_len - todo; -+ memcpy(buf, in, todo); -+ -+ core_func(buf, buf, buf_size, nonce); -+ -+ memcpy(out, buf, todo); -+ memset(buf, 0, buf_size); -+ } -+ return; -+ -+do_legacy: -+#endif /* CHAPOLY_x86_64_ASM */ -+ -+ input[0] = U8TO32_LITTLE(sigma + 0); -+ input[1] = U8TO32_LITTLE(sigma + 4); -+ input[2] = U8TO32_LITTLE(sigma + 8); -+ input[3] = U8TO32_LITTLE(sigma + 12); -+ -+ input[4] = U8TO32_LITTLE(nonce + 0); -+ input[5] = U8TO32_LITTLE(nonce + 4); -+ input[6] = U8TO32_LITTLE(nonce + 8); -+ input[7] = U8TO32_LITTLE(nonce + 12); -+ -+ input[8] = U8TO32_LITTLE(nonce + 16); -+ input[9] = U8TO32_LITTLE(nonce + 20); -+ input[10] = U8TO32_LITTLE(nonce + 24); -+ input[11] = U8TO32_LITTLE(nonce + 28); -+ -+ input[12] = U8TO32_LITTLE(nonce + 32); -+ input[13] = U8TO32_LITTLE(nonce + 36); -+ input[14] = U8TO32_LITTLE(nonce + 40); -+ input[15] = U8TO32_LITTLE(nonce + 44); -+ -+ while (in_len > 0) { -+ todo = 64; -+ if (in_len < todo) { -+ todo = in_len; -+ } -+ -+ chacha_core(buf, input); -+ for (i = 0; i < todo; i++) { -+ out[i] = in[i] ^ buf[i]; -+ } -+ -+ out += todo; -+ in += todo; -+ in_len -= todo; -+ -+ ((uint64_t*)input)[6]++; -+ } -+ -+ U32TO8_LITTLE(nonce + 32, input[12]); -+ U32TO8_LITTLE(nonce + 36, input[13]); -+} -+ -diff --git a/crypto/chacha20poly1305/chacha20poly1305.h b/crypto/chacha20poly1305/chacha20poly1305.h -new file mode 100644 -index 0000000..09b0450 ---- /dev/null -+++ b/crypto/chacha20poly1305/chacha20poly1305.h -@@ -0,0 +1,79 @@ -+/* Copyright (c) 2014, Google Inc. -+ * -+ * Permission to use, copy, modify, and/or distribute this software for any -+ * purpose with or without fee is hereby granted, provided that the above -+ * copyright notice and this permission notice appear in all copies. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -+ -+#ifndef OPENSSL_HEADER_POLY1305_H -+#define OPENSSL_HEADER_POLY1305_H -+ -+#include <stdint.h> -+#include <stddef.h> -+#include <string.h> -+#include "crypto.h" -+ -+#ifdef __cplusplus -+extern "C" { -+#endif -+ -+#define POLY1305_MAC_LEN (16) -+#define POLY1305_PAD_LEN (16) -+ -+typedef unsigned char poly1305_state[372]; -+ -+ -+/* CRYPTO_poly1305_init sets up |state| so that it can be used to calculate an -+ * authentication tag with the one-time key |key|. Note that |key| is a -+ * one-time key and therefore there is no `reset' method because that would -+ * enable several messages to be authenticated with the same key. */ -+void CRYPTO_poly1305_init(poly1305_state* state, const uint8_t key[32]); -+ -+/* CRYPTO_poly1305_update processes |in_len| bytes from |in|. It can be called -+ * zero or more times after poly1305_init. */ -+void CRYPTO_poly1305_update(poly1305_state* state, const uint8_t* in, -+ size_t in_len); -+ -+/* CRYPTO_poly1305_finish completes the poly1305 calculation and writes a 16 -+ * byte authentication tag to |mac|. */ -+void CRYPTO_poly1305_finish(poly1305_state* state, -+ uint8_t mac[POLY1305_MAC_LEN]); -+ -+/* CRYPTO_chacha_20 encrypts |in_len| bytes from |in| with the given key and -+ * nonce and writes the result to |out|, which may be equal to |in|. The -+ * initial block counter is specified by |counter|. */ -+void CRYPTO_chacha_20(uint8_t *out, const uint8_t *in, size_t in_len, -+ uint8_t nonce[48]); -+ -+#ifdef CHAPOLY_x86_64_ASM -+void poly1305_init_x64(poly1305_state* state, const uint8_t key[32]); -+void poly1305_update_x64(poly1305_state* state, const uint8_t *in, size_t in_len); -+void poly1305_finish_x64(poly1305_state* state, uint8_t mac[16]); -+ -+void poly1305_init_avx(poly1305_state* state, const uint8_t key[32]); -+void poly1305_update_avx(poly1305_state* state, const uint8_t *in, size_t in_len); -+void poly1305_finish_avx(poly1305_state* state, uint8_t mac[16]); -+ -+void poly1305_update_avx2(poly1305_state* state, const uint8_t *in, size_t in_len); -+void poly1305_finish_avx2(poly1305_state* state, uint8_t mac[16]); -+ -+void chacha_20_core_avx(uint8_t *out, const uint8_t *in, size_t in_len, -+ uint8_t nonce[48]); -+ -+void chacha_20_core_avx2(uint8_t *out, const uint8_t *in, size_t in_len, -+ uint8_t nonce[48]); -+#endif -+ -+ -+#if defined(__cplusplus) -+} /* extern C */ -+#endif -+ -+#endif /* OPENSSL_HEADER_POLY1305_H */ -diff --git a/crypto/chacha20poly1305/chapolytest.c b/crypto/chacha20poly1305/chapolytest.c -new file mode 100644 -index 0000000..7e2933f ---- /dev/null -+++ b/crypto/chacha20poly1305/chapolytest.c -@@ -0,0 +1,470 @@ -+/* ==================================================================== -+ * Copyright (c) 2011-2013 The OpenSSL Project. All rights reserved. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in -+ * the documentation and/or other materials provided with the -+ * distribution. -+ * -+ * 3. All advertising materials mentioning features or use of this -+ * software must display the following acknowledgment: -+ * "This product includes software developed by the OpenSSL Project -+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" -+ * -+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to -+ * endorse or promote products derived from this software without -+ * prior written permission. For written permission, please contact -+ * licensing@OpenSSL.org. -+ * -+ * 5. Products derived from this software may not be called "OpenSSL" -+ * nor may "OpenSSL" appear in their names without prior written -+ * permission of the OpenSSL Project. -+ * -+ * 6. Redistributions of any form whatsoever must retain the following -+ * acknowledgment: -+ * "This product includes software developed by the OpenSSL Project -+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" -+ * -+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY -+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR -+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT -+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED -+ * OF THE POSSIBILITY OF SUCH DAMAGE. -+ * ==================================================================== -+ */ -+ -+ -+#include <stdio.h> -+#include <stdlib.h> -+#include <string.h> -+#include <stdint.h> -+ -+#include <openssl/chacha20poly1305.h> -+ -+struct chacha_test { -+ const char *noncehex; -+ const char *outhex; -+}; -+ -+struct poly1305_test { -+ const char *inputhex; -+ const char *keyhex; -+ const char *outhex; -+}; -+ -+static const struct chacha_test chacha_tests[] = { -+ { -+ "00000000000000000000000000000000""00000000000000000000000000000000" -+ "00000000000000000000000000000000", -+ "76b8e0ada0f13d90405d6ae55386bd28""bdd219b8a08ded1aa836efcc8b770dc7" -+ "da41597c5157488d7724e03fb8d84a37""6a43b8f41518a11cc387b669b2ee6586", -+ }, -+ { -+ "00000000000000000000000000000000""00000000000000000000000000000001" -+ "00000000000000000000000000000000", -+ "4540f05a9f1fb296d7736e7b208e3c96""eb4fe1834688d2604f450952ed432d41" -+ "bbe2a0b6ea7566d2a5d1e7e20d42af2c""53d792b1c43fea817e9ad275ae546963", -+ }, -+ { -+ "00000000000000000000000000000000""00000000000000000000000000000000" -+ "00000000000000000000000000000001", -+ "de9cba7bf3d69ef5e786dc63973f653a""0b49e015adbff7134fcb7df137821031" -+ "e85a050278a7084527214f73efc7fa5b""5277062eb7a0433e445f41e31afab757", -+ }, -+ { -+ "00000000000000000000000000000000""00000000000000000000000000000000" -+ "00000000000000000100000000000000", -+ "ef3fdfd6c61578fbf5cf35bd3dd33b80""09631634d21e42ac33960bd138e50d32" -+ "111e4caf237ee53ca8ad6426194a8854""5ddc497a0b466e7d6bbdb0041b2f586b", -+ }, -+ { -+ "000102030405060708090a0b0c0d0e0f""101112131415161718191a1b1c1d1e1f" -+ "00000000000000000001020304050607", -+ "f798a189f195e66982105ffb640bb775""7f579da31602fc93ec01ac56f85ac3c1" -+ "34a4547b733b46413042c94400491769""05d3be59ea1c53f15916155c2be8241a" -+ "38008b9a26bc35941e2444177c8ade66""89de95264986d95889fb60e84629c9bd" -+ "9a5acb1cc118be563eb9b3a4a472f82e""09a7e778492b562ef7130e88dfe031c7" -+ "9db9d4f7c7a899151b9a475032b63fc3""85245fe054e3dd5a97a5f576fe064025" -+ "d3ce042c566ab2c507b138db853e3d69""59660996546cc9c4a6eafdc777c040d7" -+ "0eaf46f76dad3979e5c5360c3317166a""1c894c94a371876a94df7628fe4eaaf2" -+ "ccb27d5aaae0ad7ad0f9d4b6ad3b5409""8746d4524d38407a6deb", -+ }, -+}; -+ -+static const struct poly1305_test poly1305_tests[] = { -+ { -+ "", -+ "c8afaac331ee372cd6082de134943b17""4710130e9f6fea8d72293850a667d86c", -+ "4710130e9f6fea8d72293850a667d86c", -+ }, -+ { -+ "48656c6c6f20776f726c6421", -+ "746869732069732033322d6279746520""6b657920666f7220506f6c7931333035", -+ "a6f745008f81c916a20dcc74eef2b2f0", -+ }, -+ { -+ "00000000000000000000000000000000""00000000000000000000000000000000", -+ "746869732069732033322d6279746520""6b657920666f7220506f6c7931333035", -+ "49ec78090e481ec6c26b33b91ccc0307", -+ }, -+ { -+ "43727970746f6772617068696320466f""72756d2052657365617263682047726f" -+ "7570", -+ "85d6be7857556d337f4452fe42d506a8""0103808afb0db2fd4abff6af4149f51b", -+ "a8061dc1305136c6c22b8baf0c0127a9" -+ }, -+ { -+ "f3f6", -+ "851fc40c3467ac0be05cc20404f3f700""580b3b0f9447bb1e69d095b5928b6dbc", -+ "f4c633c3044fc145f84f335cb81953de" -+ }, -+ { -+ "", -+ "a0f3080000f46400d0c7e9076c834403""dd3fab2251f11ac759f0887129cc2ee7", -+ "dd3fab2251f11ac759f0887129cc2ee7" -+ }, -+ { -+ "663cea190ffb83d89593f3f476b6bc24""d7e679107ea26adb8caf6652d0656136", -+ "48443d0bb0d21109c89a100b5ce2c208""83149c69b561dd88298a1798b10716ef", -+ "0ee1c16bb73f0f4fd19881753c01cdbe" -+ }, -+ { -+ "ab0812724a7f1e342742cbed374d94d1""36c6b8795d45b3819830f2c04491faf0" -+ "990c62e48b8018b2c3e4a0fa3134cb67""fa83e158c994d961c4cb21095c1bf9", -+ "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57", -+ "5154ad0d2cb26e01274fc51148491f1b" -+ }, -+ /* -+ * self-generated -+ */ -+ { -+ "ab0812724a7f1e342742cbed374d94d1""36c6b8795d45b3819830f2c04491faf0" -+ "990c62e48b8018b2c3e4a0fa3134cb67""fa83e158c994d961c4cb21095c1bf9af", -+ "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57", -+ "812059a5da198637cac7c4a631bee466" -+ }, -+ { -+ "ab0812724a7f1e342742cbed374d94d1""36c6b8795d45b3819830f2c04491faf0" -+ "990c62e48b8018b2c3e4a0fa3134cb67", -+ "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57", -+ "5b88d7f6228b11e2e28579a5c0c1f761" -+ }, -+ { -+ "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0" -+ "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af" -+ "663cea190ffb83d89593f3f476b6bc24d7e679107ea26adb8caf6652d0656136", -+ "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57", -+ "bbb613b2b6d753ba07395b916aaece15" -+ }, -+ { -+ "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0" -+ "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af" -+ "48443d0bb0d21109c89a100b5ce2c20883149c69b561dd88298a1798b10716ef" -+ "663cea190ffb83d89593f3f476b6bc24", -+ "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57", -+ "c794d7057d1778c4bbee0a39b3d97342" -+ }, -+ { -+ "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0" -+ "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af" -+ "48443d0bb0d21109c89a100b5ce2c20883149c69b561dd88298a1798b10716ef" -+ "663cea190ffb83d89593f3f476b6bc24d7e679107ea26adb8caf6652d0656136", -+ "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57", -+ "ffbcb9b371423152d7fca5ad042fbaa9" -+ }, -+ { -+ "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0" -+ "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af" -+ "48443d0bb0d21109c89a100b5ce2c20883149c69b561dd88298a1798b10716ef" -+ "663cea190ffb83d89593f3f476b6bc24d7e679107ea26adb8caf6652d0656136" -+ "812059a5da198637cac7c4a631bee466", -+ "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57", -+ "069ed6b8ef0f207b3e243bb1019fe632" -+ }, -+ { -+ "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0" -+ "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af" -+ "48443d0bb0d21109c89a100b5ce2c20883149c69b561dd88298a1798b10716ef" -+ "663cea190ffb83d89593f3f476b6bc24d7e679107ea26adb8caf6652d0656136" -+ "812059a5da198637cac7c4a631bee4665b88d7f6228b11e2e28579a5c0c1f761", -+ "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57", -+ "cca339d9a45fa2368c2c68b3a4179133" -+ }, -+ { -+ "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0" -+ "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af" -+ "48443d0bb0d21109c89a100b5ce2c20883149c69b561dd88298a1798b10716ef" -+ "663cea190ffb83d89593f3f476b6bc24d7e679107ea26adb8caf6652d0656136" -+ "812059a5da198637cac7c4a631bee4665b88d7f6228b11e2e28579a5c0c1f761" -+ "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0" -+ "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af" -+ "48443d0bb0d21109c89a100b5ce2c20883149c69b561dd88298a1798b10716ef" -+ "663cea190ffb83d89593f3f476b6bc24d7e679107ea26adb8caf6652d0656136", -+ "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57", -+ "53f6e828a2f0fe0ee815bf0bd5841a34" -+ }, -+ { -+ "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0" -+ "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af" -+ "48443d0bb0d21109c89a100b5ce2c20883149c69b561dd88298a1798b10716ef" -+ "663cea190ffb83d89593f3f476b6bc24d7e679107ea26adb8caf6652d0656136" -+ "812059a5da198637cac7c4a631bee4665b88d7f6228b11e2e28579a5c0c1f761" -+ "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0" -+ "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af" -+ "48443d0bb0d21109c89a100b5ce2c20883149c69b561dd88298a1798b10716ef" -+ "663cea190ffb83d89593f3f476b6bc24d7e679107ea26adb8caf6652d0656136" -+ "812059a5da198637cac7c4a631bee4665b88d7f6228b11e2e28579a5c0c1f761", -+ "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57", -+ "b846d44e9bbd53cedffbfbb6b7fa4933" -+ }, -+ { -+ /* -+ * poly1305_ieee754.c failed this in final stage -+ */ -+ "842364e156336c0998b933a6237726180d9e3fdcbde4cd5d17080fc3beb49614" -+ "d7122c037463ff104d73f19c12704628d417c4c54a3fe30d3c3d7714382d43b0" -+ "382a50a5dee54be844b076e8df88201a1cd43b90eb21643fa96f39b518aa8340" -+ "c942ff3c31baf7c9bdbf0f31ae3fa096bf8c63030609829fe72e179824890bc8" -+ "e08c315c1cce2a83144dbbff09f74e3efc770b54d0984a8f19b14719e6363564" -+ "1d6b1eedf63efbf080e1783d32445412114c20de0b837a0dfa33d6b82825fff4" -+ "4c9a70ea54ce47f07df698e6b03323b53079364a5fc3e9dd034392bdde86dccd" -+ "da94321c5e44060489336cb65bf3989c36f7282c2f5d2b882c171e74", -+ "95d5c005503e510d8cd0aa072c4a4d06""6eabc52d11653df47fbf63ab198bcc26", -+ "f248312e578d9d58f8b7bb4d19105431" -+ }, -+ /* -+ * test vectors from Google -+ */ -+ { -+ "", -+ "c8afaac331ee372cd6082de134943b17""4710130e9f6fea8d72293850a667d86c", -+ "4710130e9f6fea8d72293850a667d86c", -+ }, -+ { -+ "48656c6c6f20776f726c6421", -+ "746869732069732033322d6279746520""6b657920666f7220506f6c7931333035", -+ "a6f745008f81c916a20dcc74eef2b2f0" -+ }, -+ { -+ "0000000000000000000000000000000000000000000000000000000000000000", -+ "746869732069732033322d6279746520""6b657920666f7220506f6c7931333035", -+ "49ec78090e481ec6c26b33b91ccc0307" -+ }, -+ /* -+ * test vectors from Andrew Moon -+ */ -+ { /* nacl */ -+ "8e993b9f48681273c29650ba32fc76ce48332ea7164d96a4476fb8c531a1186a" -+ "c0dfc17c98dce87b4da7f011ec48c97271d2c20f9b928fe2270d6fb863d51738" -+ "b48eeee314a7cc8ab932164548e526ae90224368517acfeabd6bb3732bc0e9da" -+ "99832b61ca01b6de56244a9e88d5f9b37973f622a43d14a6599b1f654cb45a74" -+ "e355a5", -+ "eea6a7251c1e72916d11c2cb214d3c25""2539121d8e234e652d651fa4c8cff880", -+ "f3ffc7703f9400e52a7dfb4b3d3305d9" -+ }, -+ { /* wrap 2^130-5 */ -+ "ffffffffffffffffffffffffffffffff", -+ "02000000000000000000000000000000""00000000000000000000000000000000", -+ "03000000000000000000000000000000" -+ }, -+ { /* wrap 2^128 */ -+ "02000000000000000000000000000000", -+ "02000000000000000000000000000000""ffffffffffffffffffffffffffffffff", -+ "03000000000000000000000000000000" -+ }, -+ { /* limb carry */ -+ "fffffffffffffffffffffffffffffffff0ffffffffffffffffffffffffffffff" -+ "11000000000000000000000000000000", -+ "01000000000000000000000000000000""00000000000000000000000000000000", -+ "05000000000000000000000000000000" -+ }, -+ { /* 2^130-5 */ -+ "fffffffffffffffffffffffffffffffffbfefefefefefefefefefefefefefefe" -+ "01010101010101010101010101010101", -+ "01000000000000000000000000000000""00000000000000000000000000000000", -+ "00000000000000000000000000000000" -+ }, -+ { /* 2^130-6 */ -+ "fdffffffffffffffffffffffffffffff", -+ "02000000000000000000000000000000""00000000000000000000000000000000", -+ "faffffffffffffffffffffffffffffff" -+ }, -+ { /* 5*H+L reduction intermediate */ -+ "e33594d7505e43b900000000000000003394d7505e4379cd0100000000000000" -+ "0000000000000000000000000000000001000000000000000000000000000000", -+ "01000000000000000400000000000000""00000000000000000000000000000000", -+ "14000000000000005500000000000000" -+ }, -+ { /* 5*H+L reduction final */ -+ "e33594d7505e43b900000000000000003394d7505e4379cd0100000000000000" -+ "00000000000000000000000000000000", -+ "01000000000000000400000000000000""00000000000000000000000000000000", -+ "13000000000000000000000000000000" -+ } -+}; -+ -+static unsigned char hex_digit(char h) -+{ -+ if (h >= '0' && h <= '9') -+ return h - '0'; -+ else if (h >= 'a' && h <= 'f') -+ return h - 'a' + 10; -+ else if (h >= 'A' && h <= 'F') -+ return h - 'A' + 10; -+ else -+ abort(); -+} -+ -+static void hex_decode(unsigned char *out, const char* hex) -+{ -+ size_t j = 0; -+ -+ while (*hex != 0) { -+ unsigned char v = hex_digit(*hex++); -+ v <<= 4; -+ v |= hex_digit(*hex++); -+ out[j++] = v; -+ } -+} -+ -+static void hexdump(unsigned char *a, size_t len) -+{ -+ size_t i; -+ -+ for (i = 0; i < len; i++) { -+ printf("%02x", a[i]); -+ } -+} -+ -+/* misalign returns a pointer that points 0 to 15 bytes into |in| such that the -+ * returned pointer has alignment 1 mod 16. */ -+static void* misalign(void* in) -+{ -+ intptr_t x = (intptr_t) in; -+ x += (17 - (x % 16)) % 16; -+ return (void*) x; -+} -+ -+int main() -+{ -+ -+ unsigned num_tests = -+ sizeof(chacha_tests) / sizeof(struct chacha_test); -+ unsigned i; -+ unsigned char nonce_bytes[48 + 16] = {0}; -+ -+ for (i = 0; i < num_tests; i++) { -+ unsigned char *nonce = misalign(nonce_bytes); -+ -+ printf("ChaCha20 test #%d\n", i); -+ const struct chacha_test *test = &chacha_tests[i]; -+ unsigned char *expected, *out_bytes, *zero_bytes, *out, *zeros; -+ size_t len = strlen(test->outhex); -+ -+ if (strlen(test->noncehex) != 48*2 || (len & 1) == 1) -+ return 1; -+ -+ len /= 2; -+ -+ hex_decode(nonce, test->noncehex); -+ -+ expected = malloc(len); -+ out_bytes = malloc(len+16); -+ zero_bytes = malloc(len+16); -+ /* Attempt to test unaligned inputs. */ -+ out = misalign(out_bytes); -+ zeros = misalign(zero_bytes); -+ memset(zeros, 0, len); -+ -+ hex_decode(expected, test->outhex); -+ CRYPTO_chacha_20(out, zeros, len, nonce); -+ -+ if (memcmp(out, expected, len) != 0) { -+ printf("ChaCha20 test #%d failed.\n", i); -+ printf("got: "); -+ hexdump(out, len); -+ printf("\nexpected: "); -+ hexdump(expected, len); -+ printf("\n"); -+ return 1; -+ } -+ -+ -+ free(expected); -+ free(zero_bytes); -+ free(out_bytes); -+ } -+ -+ num_tests = -+ sizeof(poly1305_tests) / sizeof(struct poly1305_test); -+ unsigned char key[32], out[16], expected[16]; -+ poly1305_state poly1305; -+ -+ for (i = 0; i < num_tests; i++) { -+ printf("Poly1305 test #%d\n", i); -+ const struct poly1305_test *test = &poly1305_tests[i]; -+ unsigned char *in; -+ size_t inlen = strlen(test->inputhex); -+ -+ if (strlen(test->keyhex) != sizeof(key)*2 || -+ strlen(test->outhex) != sizeof(out)*2 || -+ (inlen & 1) == 1) -+ return 1; -+ -+ inlen /= 2; -+ -+ hex_decode(key, test->keyhex); -+ hex_decode(expected, test->outhex); -+ -+ in = malloc(inlen); -+ -+ hex_decode(in, test->inputhex); -+ -+#ifdef CHAPOLY_x86_64_ASM -+ if((OPENSSL_ia32cap_loc()[1] >> 5) & 1) { -+ poly1305_init_x64(&poly1305, key); -+ poly1305_update_avx2(&poly1305, in, inlen); -+ poly1305_finish_avx2(&poly1305, out); -+ } else { -+ poly1305_init_x64(&poly1305, key); -+ poly1305_update_x64(&poly1305, in, inlen); -+ poly1305_finish_x64(&poly1305, out); -+ } -+#else -+ { -+ CRYPTO_poly1305_init(&poly1305, key); -+ CRYPTO_poly1305_update(&poly1305, in, inlen); -+ CRYPTO_poly1305_finish(&poly1305, out); -+ } -+#endif -+ if (memcmp(out, expected, sizeof(expected)) != 0) { -+ printf("Poly1305 test #%d failed.\n", i); -+ printf("got: "); -+ hexdump(out, sizeof(out)); -+ printf("\nexpected: "); -+ hexdump(expected, sizeof(expected)); -+ printf("\n"); -+ return 1; -+ } -+ -+ free(in); -+ } -+ -+ printf("PASS\n"); -+ return 0; -+} -+ -diff --git a/crypto/chacha20poly1305/poly1305.c b/crypto/chacha20poly1305/poly1305.c -new file mode 100644 -index 0000000..50bc4a0 ---- /dev/null -+++ b/crypto/chacha20poly1305/poly1305.c -@@ -0,0 +1,287 @@ -+/* Copyright (c) 2014, Google Inc. -+ * -+ * Permission to use, copy, modify, and/or distribute this software for any -+ * purpose with or without fee is hereby granted, provided that the above -+ * copyright notice and this permission notice appear in all copies. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -+ -+/* This implementation of poly1305 is by Andrew Moon -+ * (https://github.com/floodyberry/poly1305-donna) and released as public -+ * domain. */ -+ -+#include "chacha20poly1305.h" -+ -+#include <string.h> -+ -+#if !defined(B_ENDIAN) -+/* We can assume little-endian. */ -+static uint32_t U8TO32_LE(const uint8_t *m) { -+ uint32_t r; -+ memcpy(&r, m, sizeof(r)); -+ return r; -+} -+ -+static void U32TO8_LE(uint8_t *m, uint32_t v) { memcpy(m, &v, sizeof(v)); } -+#else -+static uint32_t U8TO32_LE(const uint8_t *m) { -+ return (uint32_t)m[0] | (uint32_t)m[1] << 8 | (uint32_t)m[2] << 16 | -+ (uint32_t)m[3] << 24; -+} -+ -+static void U32TO8_LE(uint8_t *m, uint32_t v) { -+ m[0] = v; -+ m[1] = v >> 8; -+ m[2] = v >> 16; -+ m[3] = v >> 24; -+} -+#endif -+ -+static uint64_t mul32x32_64(uint32_t a, uint32_t b) { return (uint64_t)a * b; } -+ -+struct poly1305_state_st { -+ uint32_t r0, r1, r2, r3, r4; -+ uint32_t s1, s2, s3, s4; -+ uint32_t h0, h1, h2, h3, h4; -+ uint8_t buf[16]; -+ unsigned int buf_used; -+ uint8_t key[16]; -+}; -+ -+/* poly1305_blocks updates |state| given some amount of input data. This -+ * function may only be called with a |len| that is not a multiple of 16 at the -+ * end of the data. Otherwise the input must be buffered into 16 byte blocks. */ -+static void poly1305_update(struct poly1305_state_st *state, const uint8_t *in, -+ size_t len) { -+ uint32_t t0, t1, t2, t3; -+ uint64_t t[5]; -+ uint32_t b; -+ uint64_t c; -+ size_t j; -+ uint8_t mp[16]; -+ -+ if (len < 16) { -+ goto poly1305_donna_atmost15bytes; -+ } -+ -+poly1305_donna_16bytes: -+ t0 = U8TO32_LE(in); -+ t1 = U8TO32_LE(in + 4); -+ t2 = U8TO32_LE(in + 8); -+ t3 = U8TO32_LE(in + 12); -+ -+ in += 16; -+ len -= 16; -+ -+ state->h0 += t0 & 0x3ffffff; -+ state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; -+ state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; -+ state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; -+ state->h4 += (t3 >> 8) | (1 << 24); -+ -+poly1305_donna_mul: -+ t[0] = mul32x32_64(state->h0, state->r0) + mul32x32_64(state->h1, state->s4) + -+ mul32x32_64(state->h2, state->s3) + mul32x32_64(state->h3, state->s2) + -+ mul32x32_64(state->h4, state->s1); -+ t[1] = mul32x32_64(state->h0, state->r1) + mul32x32_64(state->h1, state->r0) + -+ mul32x32_64(state->h2, state->s4) + mul32x32_64(state->h3, state->s3) + -+ mul32x32_64(state->h4, state->s2); -+ t[2] = mul32x32_64(state->h0, state->r2) + mul32x32_64(state->h1, state->r1) + -+ mul32x32_64(state->h2, state->r0) + mul32x32_64(state->h3, state->s4) + -+ mul32x32_64(state->h4, state->s3); -+ t[3] = mul32x32_64(state->h0, state->r3) + mul32x32_64(state->h1, state->r2) + -+ mul32x32_64(state->h2, state->r1) + mul32x32_64(state->h3, state->r0) + -+ mul32x32_64(state->h4, state->s4); -+ t[4] = mul32x32_64(state->h0, state->r4) + mul32x32_64(state->h1, state->r3) + -+ mul32x32_64(state->h2, state->r2) + mul32x32_64(state->h3, state->r1) + -+ mul32x32_64(state->h4, state->r0); -+ -+ state->h0 = (uint32_t)t[0] & 0x3ffffff; -+ c = (t[0] >> 26); -+ t[1] += c; -+ state->h1 = (uint32_t)t[1] & 0x3ffffff; -+ b = (uint32_t)(t[1] >> 26); -+ t[2] += b; -+ state->h2 = (uint32_t)t[2] & 0x3ffffff; -+ b = (uint32_t)(t[2] >> 26); -+ t[3] += b; -+ state->h3 = (uint32_t)t[3] & 0x3ffffff; -+ b = (uint32_t)(t[3] >> 26); -+ t[4] += b; -+ state->h4 = (uint32_t)t[4] & 0x3ffffff; -+ b = (uint32_t)(t[4] >> 26); -+ state->h0 += b * 5; -+ -+ if (len >= 16) -+ goto poly1305_donna_16bytes; -+ -+/* final bytes */ -+poly1305_donna_atmost15bytes: -+ if (!len) -+ return; -+ -+ for (j = 0; j < len; j++) -+ mp[j] = in[j]; -+ mp[j++] = 1; -+ for (; j < 16; j++) -+ mp[j] = 0; -+ len = 0; -+ -+ t0 = U8TO32_LE(mp + 0); -+ t1 = U8TO32_LE(mp + 4); -+ t2 = U8TO32_LE(mp + 8); -+ t3 = U8TO32_LE(mp + 12); -+ -+ state->h0 += t0 & 0x3ffffff; -+ state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; -+ state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; -+ state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; -+ state->h4 += (t3 >> 8); -+ -+ goto poly1305_donna_mul; -+} -+ -+void CRYPTO_poly1305_init(poly1305_state *statep, const uint8_t key[32]) { -+ struct poly1305_state_st *state = (struct poly1305_state_st *)statep; -+ uint32_t t0, t1, t2, t3; -+ -+ t0 = U8TO32_LE(key + 0); -+ t1 = U8TO32_LE(key + 4); -+ t2 = U8TO32_LE(key + 8); -+ t3 = U8TO32_LE(key + 12); -+ -+ /* precompute multipliers */ -+ state->r0 = t0 & 0x3ffffff; -+ t0 >>= 26; -+ t0 |= t1 << 6; -+ state->r1 = t0 & 0x3ffff03; -+ t1 >>= 20; -+ t1 |= t2 << 12; -+ state->r2 = t1 & 0x3ffc0ff; -+ t2 >>= 14; -+ t2 |= t3 << 18; -+ state->r3 = t2 & 0x3f03fff; -+ t3 >>= 8; -+ state->r4 = t3 & 0x00fffff; -+ -+ state->s1 = state->r1 * 5; -+ state->s2 = state->r2 * 5; -+ state->s3 = state->r3 * 5; -+ state->s4 = state->r4 * 5; -+ -+ /* init state */ -+ state->h0 = 0; -+ state->h1 = 0; -+ state->h2 = 0; -+ state->h3 = 0; -+ state->h4 = 0; -+ -+ state->buf_used = 0; -+ memcpy(state->key, key + 16, sizeof(state->key)); -+} -+ -+void CRYPTO_poly1305_update(poly1305_state *statep, const uint8_t *in, -+ size_t in_len) { -+ unsigned int i; -+ struct poly1305_state_st *state = (struct poly1305_state_st *)statep; -+ -+ if (state->buf_used) { -+ unsigned int todo = 16 - state->buf_used; -+ if (todo > in_len) -+ todo = in_len; -+ for (i = 0; i < todo; i++) -+ state->buf[state->buf_used + i] = in[i]; -+ state->buf_used += todo; -+ in_len -= todo; -+ in += todo; -+ -+ if (state->buf_used == 16) { -+ poly1305_update(state, state->buf, 16); -+ state->buf_used = 0; -+ } -+ } -+ -+ if (in_len >= 16) { -+ size_t todo = in_len & ~0xf; -+ poly1305_update(state, in, todo); -+ in += todo; -+ in_len &= 0xf; -+ } -+ -+ if (in_len) { -+ for (i = 0; i < in_len; i++) -+ state->buf[i] = in[i]; -+ state->buf_used = in_len; -+ } -+} -+ -+void CRYPTO_poly1305_finish(poly1305_state *statep, uint8_t mac[16]) { -+ struct poly1305_state_st *state = (struct poly1305_state_st *)statep; -+ uint64_t f0, f1, f2, f3; -+ uint32_t g0, g1, g2, g3, g4; -+ uint32_t b, nb; -+ -+ if (state->buf_used) -+ poly1305_update(state, state->buf, state->buf_used); -+ -+ b = state->h0 >> 26; -+ state->h0 = state->h0 & 0x3ffffff; -+ state->h1 += b; -+ b = state->h1 >> 26; -+ state->h1 = state->h1 & 0x3ffffff; -+ state->h2 += b; -+ b = state->h2 >> 26; -+ state->h2 = state->h2 & 0x3ffffff; -+ state->h3 += b; -+ b = state->h3 >> 26; -+ state->h3 = state->h3 & 0x3ffffff; -+ state->h4 += b; -+ b = state->h4 >> 26; -+ state->h4 = state->h4 & 0x3ffffff; -+ state->h0 += b * 5; -+ -+ g0 = state->h0 + 5; -+ b = g0 >> 26; -+ g0 &= 0x3ffffff; -+ g1 = state->h1 + b; -+ b = g1 >> 26; -+ g1 &= 0x3ffffff; -+ g2 = state->h2 + b; -+ b = g2 >> 26; -+ g2 &= 0x3ffffff; -+ g3 = state->h3 + b; -+ b = g3 >> 26; -+ g3 &= 0x3ffffff; -+ g4 = state->h4 + b - (1 << 26); -+ -+ b = (g4 >> 31) - 1; -+ nb = ~b; -+ state->h0 = (state->h0 & nb) | (g0 & b); -+ state->h1 = (state->h1 & nb) | (g1 & b); -+ state->h2 = (state->h2 & nb) | (g2 & b); -+ state->h3 = (state->h3 & nb) | (g3 & b); -+ state->h4 = (state->h4 & nb) | (g4 & b); -+ -+ f0 = ((state->h0) | (state->h1 << 26)) + (uint64_t)U8TO32_LE(&state->key[0]); -+ f1 = ((state->h1 >> 6) | (state->h2 << 20)) + -+ (uint64_t)U8TO32_LE(&state->key[4]); -+ f2 = ((state->h2 >> 12) | (state->h3 << 14)) + -+ (uint64_t)U8TO32_LE(&state->key[8]); -+ f3 = ((state->h3 >> 18) | (state->h4 << 8)) + -+ (uint64_t)U8TO32_LE(&state->key[12]); -+ -+ U32TO8_LE(&mac[0], f0); -+ f1 += (f0 >> 32); -+ U32TO8_LE(&mac[4], f1); -+ f2 += (f1 >> 32); -+ U32TO8_LE(&mac[8], f2); -+ f3 += (f2 >> 32); -+ U32TO8_LE(&mac[12], f3); -+} -+ -diff --git a/crypto/cryptlib.c b/crypto/cryptlib.c -index c9f674b..c6ff6ff 100644 ---- a/crypto/cryptlib.c -+++ b/crypto/cryptlib.c -@@ -656,16 +656,6 @@ const char *CRYPTO_get_lock_name(int type) - extern unsigned int OPENSSL_ia32cap_P[4]; - unsigned long *OPENSSL_ia32cap_loc(void) - { -- if (sizeof(long) == 4) -- /* -- * If 32-bit application pulls address of OPENSSL_ia32cap_P[0] -- * clear second element to maintain the illusion that vector -- * is 32-bit. -- */ -- OPENSSL_ia32cap_P[1] = 0; -- -- OPENSSL_ia32cap_P[2] = 0; -- - return (unsigned long *)OPENSSL_ia32cap_P; - } - -diff --git a/crypto/evp/Makefile b/crypto/evp/Makefile -index aaaad98..e30b588 100644 ---- a/crypto/evp/Makefile -+++ b/crypto/evp/Makefile -@@ -29,7 +29,8 @@ LIBSRC= encode.c digest.c evp_enc.c evp_key.c evp_acnf.c evp_cnf.c \ - c_all.c c_allc.c c_alld.c evp_lib.c bio_ok.c \ - evp_pkey.c evp_pbe.c p5_crpt.c p5_crpt2.c \ - e_old.c pmeth_lib.c pmeth_fn.c pmeth_gn.c m_sigver.c \ -- e_aes_cbc_hmac_sha1.c e_aes_cbc_hmac_sha256.c e_rc4_hmac_md5.c -+ e_aes_cbc_hmac_sha1.c e_aes_cbc_hmac_sha256.c e_rc4_hmac_md5.c \ -+ e_chacha20poly1305.c - - LIBOBJ= encode.o digest.o evp_enc.o evp_key.o evp_acnf.o evp_cnf.o \ - e_des.o e_bf.o e_idea.o e_des3.o e_camellia.o\ -@@ -42,7 +43,8 @@ LIBOBJ= encode.o digest.o evp_enc.o evp_key.o evp_acnf.o evp_cnf.o \ - c_all.o c_allc.o c_alld.o evp_lib.o bio_ok.o \ - evp_pkey.o evp_pbe.o p5_crpt.o p5_crpt2.o \ - e_old.o pmeth_lib.o pmeth_fn.o pmeth_gn.o m_sigver.o \ -- e_aes_cbc_hmac_sha1.o e_aes_cbc_hmac_sha256.o e_rc4_hmac_md5.o -+ e_aes_cbc_hmac_sha1.o e_aes_cbc_hmac_sha256.o e_rc4_hmac_md5.o \ -+ e_chacha20poly1305.o - - SRC= $(LIBSRC) - -@@ -263,6 +265,7 @@ e_cast.o: ../../include/openssl/objects.h ../../include/openssl/opensslconf.h - e_cast.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h - e_cast.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h - e_cast.o: ../../include/openssl/symhacks.h ../cryptlib.h e_cast.c evp_locl.h -+e_chacha20poly1305.o: ../../include/openssl/chacha20poly1305.h e_chacha20poly1305.c - e_des.o: ../../e_os.h ../../include/openssl/asn1.h ../../include/openssl/bio.h - e_des.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h - e_des.o: ../../include/openssl/des.h ../../include/openssl/des_old.h -diff --git a/crypto/evp/e_chacha20poly1305.c b/crypto/evp/e_chacha20poly1305.c -new file mode 100644 -index 0000000..17f8cb4 ---- /dev/null -+++ b/crypto/evp/e_chacha20poly1305.c -@@ -0,0 +1,435 @@ -+/* ==================================================================== -+ * Copyright (c) 2001-2014 The OpenSSL Project. All rights reserved. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in -+ * the documentation and/or other materials provided with the -+ * distribution. -+ * -+ * 3. All advertising materials mentioning features or use of this -+ * software must display the following acknowledgment: -+ * "This product includes software developed by the OpenSSL Project -+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" -+ * -+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to -+ * endorse or promote products derived from this software without -+ * prior written permission. For written permission, please contact -+ * openssl-core@openssl.org. -+ * -+ * 5. Products derived from this software may not be called "OpenSSL" -+ * nor may "OpenSSL" appear in their names without prior written -+ * permission of the OpenSSL Project. -+ * -+ * 6. Redistributions of any form whatsoever must retain the following -+ * acknowledgment: -+ * "This product includes software developed by the OpenSSL Project -+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)" -+ * -+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY -+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR -+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT -+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED -+ * OF THE POSSIBILITY OF SUCH DAMAGE. -+ * ==================================================================== -+ * -+ */ -+ -+#include <openssl/opensslconf.h> -+#ifndef OPENSSL_NO_CHACHA_POLY -+#include <openssl/evp.h> -+#include <openssl/err.h> -+#include <openssl/chacha20poly1305.h> -+#include "evp_locl.h" -+#include <openssl/rand.h> -+ -+#define FILL_BUFFER ((size_t)128) -+ -+typedef struct { -+ uint8_t iv[12]; -+ uint8_t nonce[48]; -+ size_t aad_l; -+ size_t ct_l; -+ unsigned valid:1; -+ unsigned draft:1; -+ uint8_t poly_buffer[FILL_BUFFER]; -+ uint8_t chacha_buffer[FILL_BUFFER]; -+ uint16_t poly_buffer_used; -+ uint16_t chacha_used; -+#ifdef CHAPOLY_x86_64_ASM -+ void (*poly1305_init_ptr)(poly1305_state *, const uint8_t *); -+ void (*poly1305_update_ptr)(poly1305_state *, const uint8_t *, size_t); -+ void (*poly1305_finish_ptr)(poly1305_state *, uint8_t *); -+ poly1305_state poly_state; -+ #define poly_init aead_ctx->poly1305_init_ptr -+ #define poly_update poly1305_update_wrapper -+ #define poly_finish poly1305_finish_wrapper -+#else -+ #define poly_init CRYPTO_poly1305_init -+ #define poly_update(c,i,l) CRYPTO_poly1305_update(&c->poly_state,i,l) -+ #define poly_finish(c,m) CRYPTO_poly1305_finish(&c->poly_state,m) -+#endif -+} EVP_CHACHA20_POLY1305_CTX; -+ -+ -+#ifdef CHAPOLY_x86_64_ASM -+#include <immintrin.h> -+ -+static void poly1305_update_wrapper(EVP_CHACHA20_POLY1305_CTX *ctx, -+ const uint8_t *in, -+ size_t in_len) -+{ -+ int todo; -+ /* Attempt to fill as many bytes as possible before calling the update -+ function */ -+ if (in_len < FILL_BUFFER || ctx->poly_buffer_used) { -+ todo = FILL_BUFFER - ctx->poly_buffer_used; -+ todo = in_len < todo? in_len : todo; -+ memcpy(ctx->poly_buffer + ctx->poly_buffer_used, in, todo); -+ ctx->poly_buffer_used += todo; -+ in += todo; -+ in_len -= todo; -+ -+ if (ctx->poly_buffer_used == FILL_BUFFER) { -+ ctx->poly1305_update_ptr(&ctx->poly_state, -+ ctx->poly_buffer, -+ FILL_BUFFER); -+ ctx->poly_buffer_used = 0; -+ } -+ } -+ -+ if (in_len >= FILL_BUFFER) { -+ ctx->poly1305_update_ptr(&ctx->poly_state, in, in_len & (-FILL_BUFFER)); -+ in += in_len & (-FILL_BUFFER); -+ in_len &= (FILL_BUFFER - 1); -+ } -+ -+ if (in_len) { -+ memcpy(ctx->poly_buffer, in, in_len); -+ ctx->poly_buffer_used = in_len; -+ } -+} -+ -+ -+static void poly1305_finish_wrapper(EVP_CHACHA20_POLY1305_CTX *ctx, -+ uint8_t mac[POLY1305_MAC_LEN]) -+{ -+ if (ctx->poly_buffer_used) { -+ -+ if (ctx->poly_buffer_used % POLY1305_PAD_LEN) { -+ memset(ctx->poly_buffer + ctx->poly_buffer_used, 0, -+ POLY1305_PAD_LEN - (ctx->poly_buffer_used % POLY1305_PAD_LEN)); -+ } -+ -+ ctx->poly1305_update_ptr(&ctx->poly_state, -+ ctx->poly_buffer, -+ ctx->poly_buffer_used); -+ } -+ -+ ctx->poly1305_finish_ptr(&ctx->poly_state, mac); -+ memset(ctx->poly_buffer, 0, FILL_BUFFER); -+} -+#endif -+ -+ -+#ifdef CHAPOLY_x86_64_ASM -+static void EVP_chacha20_poly1305_cpuid(EVP_CHACHA20_POLY1305_CTX *ctx) -+{ -+ if ((OPENSSL_ia32cap_loc()[1] >> 5) & 1) { /* AVX2 */ -+ ctx->poly1305_init_ptr = poly1305_init_x64; /* Lazy init */ -+ ctx->poly1305_update_ptr = poly1305_update_avx2; -+ ctx->poly1305_finish_ptr = poly1305_finish_avx2; -+/* -+ } else if (0 && (OPENSSL_ia32cap_loc()[0] >> 60) & 1) { // AVX -disabled -+ ctx->poly1305_init_ptr = poly1305_init_avx; -+ ctx->poly1305_update_ptr = poly1305_update_avx; -+ ctx->poly1305_finish_ptr = poly1305_finish_avx; -+*/ -+ } else { /* x64 code */ -+ ctx->poly1305_init_ptr = poly1305_init_x64; -+ ctx->poly1305_update_ptr = poly1305_update_x64; -+ ctx->poly1305_finish_ptr = poly1305_finish_x64; -+ } -+} -+#endif -+ -+ -+static int EVP_chacha20_poly1305_init_draft(EVP_CIPHER_CTX *ctx, -+ const unsigned char *key, -+ const unsigned char *iv, -+ int enc) -+{ -+ EVP_CHACHA20_POLY1305_CTX *aead_ctx = ctx->cipher_data; -+ memcpy(aead_ctx->nonce, key, 32); -+ aead_ctx->valid = 0; -+ aead_ctx->draft = 1; -+ -+#ifdef CHAPOLY_x86_64_ASM -+ EVP_chacha20_poly1305_cpuid(aead_ctx); -+#endif -+ -+ return 1; -+} -+ -+ -+static int EVP_chacha20_poly1305_init(EVP_CIPHER_CTX *ctx, -+ const unsigned char *key, -+ const unsigned char *iv, -+ int enc) -+{ -+ EVP_CHACHA20_POLY1305_CTX *aead_ctx = ctx->cipher_data; -+ memcpy(aead_ctx->nonce, key, 32); -+ memcpy(aead_ctx->iv, iv, 12); -+ aead_ctx->valid = 0; -+ aead_ctx->draft = 0; -+ -+#ifdef CHAPOLY_x86_64_ASM -+ EVP_chacha20_poly1305_cpuid(aead_ctx); -+#endif -+ -+ return 1; -+} -+ -+ -+static int EVP_chacha20_poly1305_cipher(EVP_CIPHER_CTX *ctx, -+ unsigned char *out, -+ const unsigned char *in, -+ size_t inl) -+{ -+ EVP_CHACHA20_POLY1305_CTX *aead_ctx = ctx->cipher_data; -+ uint8_t poly_mac[POLY1305_MAC_LEN]; -+ uint8_t zero[POLY1305_PAD_LEN] = {0}; -+ uint64_t cmp; -+ int i, todo; -+ -+ if (!aead_ctx->valid) -+ return 0; -+ -+ if (inl < POLY1305_MAC_LEN) -+ return -1; -+ -+ /* Fix for MAC */ -+ inl -= POLY1305_MAC_LEN; -+ -+ if (!ctx->encrypt) { -+ poly_update(aead_ctx, in, inl); -+ } -+ -+ i = 0; -+ if (inl < 256) { -+ /* Consume the buffer we computed during poly initialization */ -+ todo = inl > (FILL_BUFFER - aead_ctx->chacha_used) ? -+ FILL_BUFFER - aead_ctx->chacha_used : -+ inl; -+ -+#ifdef CHAPOLY_x86_64_ASM -+ for (; i <= todo - 16; i+=16) { -+ _mm_storeu_si128((__m128i*)&out[i], -+ _mm_xor_si128(_mm_loadu_si128((__m128i *)&in[i]), -+ _mm_loadu_si128((__m128i *)&aead_ctx->chacha_buffer[i + 64]))); -+ } -+#endif -+ for (; i < todo; i++) { -+ out[i] = in[i] ^ aead_ctx->chacha_buffer[i + 64 /*aead_ctx->chacha_used*/]; -+ } -+ -+ } else { -+ /* For long messages don't use precomputed buffer */ -+ ((uint64_t *)(aead_ctx->nonce))[4]--; -+ } -+ -+ todo = inl - i; -+ -+ if (todo) { -+ CRYPTO_chacha_20(&out[i], &in[i], todo, aead_ctx->nonce); -+ } -+ -+ if (ctx->encrypt) { -+ poly_update(aead_ctx, out, inl); -+ } -+ -+ aead_ctx->ct_l += inl; -+ -+ if (!aead_ctx->draft) { -+ /* For RFC padd ciphertext with zeroes, then mac len(aad)||len(ct) */ -+ todo = aead_ctx->ct_l % POLY1305_PAD_LEN ? -+ POLY1305_PAD_LEN - (aead_ctx->ct_l % POLY1305_PAD_LEN) : -+ 0; -+ -+ if (todo) { -+ poly_update(aead_ctx, zero, todo); -+ } -+ -+ poly_update(aead_ctx, (uint8_t*)&aead_ctx->aad_l, sizeof(uint64_t)); -+ poly_update(aead_ctx, (uint8_t*)&aead_ctx->ct_l, sizeof(uint64_t)); -+ -+ } else { -+ /* For the draft don't pad, mac len(ct) */ -+ poly_update(aead_ctx, (uint8_t*)&aead_ctx->ct_l, sizeof(uint64_t)); -+ } -+ aead_ctx->valid = 0; -+ -+ if (ctx->encrypt) { -+ poly_finish(aead_ctx, &out[inl]); -+ return inl + POLY1305_MAC_LEN; -+ -+ } else { /* Decryption */ -+ poly_finish(aead_ctx, poly_mac); -+ /* Constant time comparison */ -+ cmp = (*(uint64_t *)(poly_mac)) ^ (*(uint64_t *)(in + inl)); -+ cmp |= (*(uint64_t *)(poly_mac + 8)) ^ (*(uint64_t *)(in + inl + 8)); -+ -+ if (cmp) { -+ OPENSSL_cleanse(out, inl); -+ return -1; -+ } -+ -+ return inl; -+ } -+} -+ -+ -+static int EVP_chacha20_poly1305_cleanup(EVP_CIPHER_CTX *ctx) -+{ -+ return 1; -+} -+ -+ -+static int EVP_chacha20_poly1305_ctrl(EVP_CIPHER_CTX *ctx, -+ int type, -+ int arg, -+ void *ptr) -+{ -+ EVP_CHACHA20_POLY1305_CTX *aead_ctx = ctx->cipher_data; -+ uint8_t aad[EVP_AEAD_TLS1_AAD_LEN + 8]; -+ uint64_t thirteen = EVP_AEAD_TLS1_AAD_LEN; -+ -+ switch (type) { -+ case EVP_CTRL_AEAD_TLS1_AAD: -+ -+ if (arg != EVP_AEAD_TLS1_AAD_LEN) -+ return 0; -+ -+ /* Initialize poly keys */ -+ memset(aead_ctx->chacha_buffer, 0, FILL_BUFFER); -+ -+ if (!aead_ctx->draft) { -+ /* RFC IV = (0 || iv) ^ seq_num */ -+ memset(aead_ctx->nonce + 32, 0, 4); -+ memcpy(aead_ctx->nonce + 36, aead_ctx->iv, 12); -+ *(uint64_t *)(aead_ctx->nonce + 40) ^= *(uint64_t *)(ptr); -+ -+ } else { -+ /* draft IV = 0 || seq_num */ -+ memset(aead_ctx->nonce + 32, 0, 8); -+ memcpy(aead_ctx->nonce + 40, ptr, 8); -+ } -+ /* Poly keys = ENC(0) */ -+ CRYPTO_chacha_20(aead_ctx->chacha_buffer, -+ aead_ctx->chacha_buffer, -+ FILL_BUFFER, -+ aead_ctx->nonce); -+ -+ poly_init(&aead_ctx->poly_state, aead_ctx->chacha_buffer); -+ -+ aead_ctx->chacha_used = 64; -+ aead_ctx->poly_buffer_used = 0; -+ aead_ctx->aad_l = arg; -+ aead_ctx->ct_l = 0; -+ -+ /* Absorb AAD */ -+ memcpy(aad, ptr, arg); -+ memset(aad + arg, 0, sizeof(aad) - arg); -+ -+ /* If decrypting fix length for tag */ -+ if (!ctx->encrypt) { -+ unsigned int len = (aad[arg-2] << 8) | aad[arg-1]; -+ len -= POLY1305_MAC_LEN; -+ aad[arg-2] = len>>8; -+ aad[arg-1] = len & 0xff; -+ } -+ -+ if (!aead_ctx->draft) { -+ /* In the RFC, AAD is padded with zeroes */ -+ poly_update(aead_ctx, aad, POLY1305_PAD_LEN); -+ -+ } else { -+ /* In the draft AAD is followed by len(AAD) */ -+ memcpy(&aad[arg], &thirteen, sizeof(thirteen)); -+ poly_update(aead_ctx, aad, arg + sizeof(thirteen)); -+ } -+ -+ aead_ctx->valid = 1; -+ return POLY1305_MAC_LEN; -+ -+ break; -+ -+ default: -+ return 0; -+ break; -+ } -+ -+ return 0; -+} -+ -+ -+#define CUSTOM_FLAGS (\ -+ EVP_CIPH_CUSTOM_IV | EVP_CIPH_FLAG_CUSTOM_CIPHER \ -+ | EVP_CIPH_ALWAYS_CALL_INIT \ -+ | EVP_CIPH_CUSTOM_COPY) -+ -+ -+static const EVP_CIPHER chacha20_poly1305_d = { -+ 0, /* nid ??? */ -+ 1, /* block size, sorta */ -+ 32, /* key len */ -+ 0, /* iv len */ -+ CUSTOM_FLAGS|EVP_CIPH_FLAG_AEAD_CIPHER, /* flags */ -+ EVP_chacha20_poly1305_init_draft, -+ EVP_chacha20_poly1305_cipher, -+ EVP_chacha20_poly1305_cleanup, -+ sizeof(EVP_CHACHA20_POLY1305_CTX), /* ctx size */ -+ NULL, -+ NULL, -+ EVP_chacha20_poly1305_ctrl, -+ NULL -+ }; -+ -+ -+static const EVP_CIPHER chacha20_poly1305 = { -+ 0, /* nid ??? */ -+ 1, /* block size, sorta */ -+ 32, /* key len */ -+ 12, /* iv len */ -+ CUSTOM_FLAGS|EVP_CIPH_FLAG_AEAD_CIPHER, /* flags */ -+ EVP_chacha20_poly1305_init, -+ EVP_chacha20_poly1305_cipher, -+ EVP_chacha20_poly1305_cleanup, -+ sizeof(EVP_CHACHA20_POLY1305_CTX), /* ctx size */ -+ NULL, -+ NULL, -+ EVP_chacha20_poly1305_ctrl, -+ NULL -+ }; -+ -+ -+const EVP_CIPHER *EVP_chacha20_poly1305_draft(void) -+{ return &chacha20_poly1305_d; } -+ -+ -+const EVP_CIPHER *EVP_chacha20_poly1305(void) -+{ return &chacha20_poly1305; } -+#endif -diff --git a/crypto/evp/evp.h b/crypto/evp/evp.h -index 39ab793..53ed671 100644 ---- a/crypto/evp/evp.h -+++ b/crypto/evp/evp.h -@@ -893,6 +893,10 @@ const EVP_CIPHER *EVP_camellia_256_cfb128(void); - # define EVP_camellia_256_cfb EVP_camellia_256_cfb128 - const EVP_CIPHER *EVP_camellia_256_ofb(void); - # endif -+# ifndef OPENSSL_NO_CHACHA_POLY -+const EVP_CIPHER *EVP_chacha20_poly1305(void); -+const EVP_CIPHER *EVP_chacha20_poly1305_draft(void); -+# endif - - # ifndef OPENSSL_NO_SEED - const EVP_CIPHER *EVP_seed_ecb(void); -diff --git a/ssl/s3_lib.c b/ssl/s3_lib.c -index f846cb5..b0364c1 100644 ---- a/ssl/s3_lib.c -+++ b/ssl/s3_lib.c -@@ -2891,6 +2891,111 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { - 256}, - #endif - -+#if !defined(OPENSSL_NO_CHACHA_POLY) -+ /* Draft ciphers */ -+ { -+ 1, -+ TLS1_TXT_ECDHE_RSA_WITH_CHACHA20_POLY1305_D, -+ TLS1_CK_ECDHE_RSA_WITH_CHACHA20_POLY1305_D, -+ SSL_kEECDH, -+ SSL_aRSA, -+ SSL_CHACHA20POLY1305_D, -+ SSL_AEAD, -+ SSL_TLSV1_2, -+ SSL_HIGH, -+ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256, -+ 256, -+ 256}, -+ { -+ 1, -+ TLS1_TXT_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_D, -+ TLS1_CK_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_D, -+ SSL_kEECDH, -+ SSL_aECDSA, -+ SSL_CHACHA20POLY1305_D, -+ SSL_AEAD, -+ SSL_TLSV1_2, -+ SSL_HIGH, -+ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256, -+ 256, -+ 256}, -+ { -+ 1, -+ TLS1_TXT_DHE_RSA_WITH_CHACHA20_POLY1305_D, -+ TLS1_CK_DHE_RSA_WITH_CHACHA20_POLY1305_D, -+ SSL_kEDH, -+ SSL_aRSA, -+ SSL_CHACHA20POLY1305_D, -+ SSL_AEAD, -+ SSL_TLSV1_2, -+ SSL_HIGH, -+ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256, -+ 256, -+ 256}, -+ /* RFC ciphers */ -+ /* Cipher CCA8 as per draft-ietf-tls-chacha20-poly1305-03 */ -+ { -+ 1, -+ TLS1_TXT_ECDHE_RSA_WITH_CHACHA20_POLY1305, -+ TLS1_CK_ECDHE_RSA_WITH_CHACHA20_POLY1305, -+ SSL_kECDHE, -+ SSL_aRSA, -+ SSL_CHACHA20POLY1305, -+ SSL_AEAD, -+ SSL_TLSV1_2, -+ SSL_HIGH, -+ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256, -+ 256, -+ 256, -+ }, -+ /* Cipher CCA9 */ -+ { -+ 1, -+ TLS1_TXT_ECDHE_ECDSA_WITH_CHACHA20_POLY1305, -+ TLS1_CK_ECDHE_ECDSA_WITH_CHACHA20_POLY1305, -+ SSL_kECDHE, -+ SSL_aECDSA, -+ SSL_CHACHA20POLY1305, -+ SSL_AEAD, -+ SSL_TLSV1_2, -+ SSL_HIGH, -+ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256, -+ 256, -+ 256, -+ }, -+ /* Cipher CCAA */ -+ { -+ 1, -+ TLS1_TXT_DHE_RSA_WITH_CHACHA20_POLY1305, -+ TLS1_CK_DHE_RSA_WITH_CHACHA20_POLY1305, -+ SSL_kDHE, -+ SSL_aRSA, -+ SSL_CHACHA20POLY1305, -+ SSL_AEAD, -+ SSL_TLSV1_2, -+ SSL_HIGH, -+ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256, -+ 256, -+ 256, -+ }, -+ /* Cipher CCAB */ -+ { -+ 1, -+ TLS1_TXT_PSK_WITH_CHACHA20_POLY1305, -+ TLS1_CK_PSK_WITH_CHACHA20_POLY1305, -+ SSL_kPSK, -+ SSL_aPSK, -+ SSL_CHACHA20POLY1305, -+ SSL_AEAD, -+ SSL_TLSV1_2, -+ SSL_HIGH, -+ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256, -+ 256, -+ 256, -+ }, -+#endif -+ -+ - /* end of list */ - }; - -@@ -4036,6 +4141,7 @@ SSL_CIPHER *ssl3_choose_cipher(SSL *s, STACK_OF(SSL_CIPHER) *clnt, - int i, ii, ok; - CERT *cert; - unsigned long alg_k, alg_a, mask_k, mask_a, emask_k, emask_a; -+ int use_chacha = 0; - - /* Let's see which ciphers we can support */ - cert = s->cert; -@@ -4069,9 +4175,17 @@ SSL_CIPHER *ssl3_choose_cipher(SSL *s, STACK_OF(SSL_CIPHER) *clnt, - if (s->options & SSL_OP_CIPHER_SERVER_PREFERENCE || tls1_suiteb(s)) { - prio = srvr; - allow = clnt; -+ /* Use ChaCha20+Poly1305 iff it's client's most preferred cipher suite */ -+ if (sk_SSL_CIPHER_num(clnt) > 0) { -+ c = sk_SSL_CIPHER_value(clnt, 0); -+ if (c->algorithm_enc == SSL_CHACHA20POLY1305 || -+ c->algorithm_enc == SSL_CHACHA20POLY1305_D) -+ use_chacha = 1; -+ } - } else { - prio = clnt; - allow = srvr; -+ use_chacha = 1; - } - - tls1_set_cert_validity(s); -@@ -4083,6 +4197,11 @@ SSL_CIPHER *ssl3_choose_cipher(SSL *s, STACK_OF(SSL_CIPHER) *clnt, - if ((c->algorithm_ssl & SSL_TLSV1_2) && !SSL_USE_TLS1_2_CIPHERS(s)) - continue; - -+ /* Skip ChaCha unless top client priority */ -+ if ((c->algorithm_enc == SSL_CHACHA20POLY1305 || -+ c->algorithm_enc == SSL_CHACHA20POLY1305_D) && !use_chacha) -+ continue; -+ - ssl_set_cert_masks(cert, c); - mask_k = cert->mask_k; - mask_a = cert->mask_a; -diff --git a/ssl/ssl.h b/ssl/ssl.h -index ae8c925..c7635b6 100644 ---- a/ssl/ssl.h -+++ b/ssl/ssl.h -@@ -294,6 +294,8 @@ extern "C" { - # define SSL_TXT_AES256 "AES256" - # define SSL_TXT_AES "AES" - # define SSL_TXT_AES_GCM "AESGCM" -+# define SSL_TXT_CHACHA20_D "CHACHA20-draft" -+# define SSL_TXT_CHACHA20 "CHACHA20" - # define SSL_TXT_CAMELLIA128 "CAMELLIA128" - # define SSL_TXT_CAMELLIA256 "CAMELLIA256" - # define SSL_TXT_CAMELLIA "CAMELLIA" -diff --git a/ssl/ssl_ciph.c b/ssl/ssl_ciph.c -index 6957bda..21eae5c 100644 ---- a/ssl/ssl_ciph.c -+++ b/ssl/ssl_ciph.c -@@ -150,25 +150,27 @@ - #endif - #include "ssl_locl.h" - --#define SSL_ENC_DES_IDX 0 --#define SSL_ENC_3DES_IDX 1 --#define SSL_ENC_RC4_IDX 2 --#define SSL_ENC_RC2_IDX 3 --#define SSL_ENC_IDEA_IDX 4 --#define SSL_ENC_NULL_IDX 5 --#define SSL_ENC_AES128_IDX 6 --#define SSL_ENC_AES256_IDX 7 --#define SSL_ENC_CAMELLIA128_IDX 8 --#define SSL_ENC_CAMELLIA256_IDX 9 --#define SSL_ENC_GOST89_IDX 10 --#define SSL_ENC_SEED_IDX 11 --#define SSL_ENC_AES128GCM_IDX 12 --#define SSL_ENC_AES256GCM_IDX 13 --#define SSL_ENC_NUM_IDX 14 -+#define SSL_ENC_DES_IDX 0 -+#define SSL_ENC_3DES_IDX 1 -+#define SSL_ENC_RC4_IDX 2 -+#define SSL_ENC_RC2_IDX 3 -+#define SSL_ENC_IDEA_IDX 4 -+#define SSL_ENC_NULL_IDX 5 -+#define SSL_ENC_AES128_IDX 6 -+#define SSL_ENC_AES256_IDX 7 -+#define SSL_ENC_CAMELLIA128_IDX 8 -+#define SSL_ENC_CAMELLIA256_IDX 9 -+#define SSL_ENC_GOST89_IDX 10 -+#define SSL_ENC_SEED_IDX 11 -+#define SSL_ENC_AES128GCM_IDX 12 -+#define SSL_ENC_AES256GCM_IDX 13 -+#define SSL_ENC_CHACHA20POLY1305_DRAFT_IDX 14 -+#define SSL_ENC_CHACHA20POLY1305_IDX 15 -+#define SSL_ENC_NUM_IDX 16 - - static const EVP_CIPHER *ssl_cipher_methods[SSL_ENC_NUM_IDX] = { - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, -- NULL, NULL -+ NULL, NULL, NULL, NULL - }; - - #define SSL_COMP_NULL_IDX 0 -@@ -363,6 +365,9 @@ static const SSL_CIPHER cipher_aliases[] = { - {0, SSL3_TXT_DHE_RSA_DES_192_CBC3_SHA, 0, - SSL_kDHE, SSL_aRSA, SSL_3DES, SSL_SHA1, SSL_SSLV3, - SSL_NOT_EXP | SSL_HIGH | SSL_FIPS, 0, 0, 0,}, -+ -+ {0, SSL_TXT_CHACHA20_D, 0, 0, 0, SSL_CHACHA20POLY1305_D, 0, 0, 0, 0, 0, 0}, -+ {0, SSL_TXT_CHACHA20, 0, 0, 0, SSL_CHACHA20POLY1305, 0, 0, 0, 0, 0, 0}, - }; - - /* -@@ -432,6 +437,11 @@ void ssl_load_ciphers(void) - ssl_cipher_methods[SSL_ENC_AES256GCM_IDX] = - EVP_get_cipherbyname(SN_aes_256_gcm); - -+ ssl_cipher_methods[SSL_ENC_CHACHA20POLY1305_DRAFT_IDX] = -+ EVP_chacha20_poly1305_draft(); -+ ssl_cipher_methods[SSL_ENC_CHACHA20POLY1305_IDX] = -+ EVP_chacha20_poly1305(); -+ - ssl_digest_methods[SSL_MD_MD5_IDX] = EVP_get_digestbyname(SN_md5); - ssl_mac_secret_size[SSL_MD_MD5_IDX] = - EVP_MD_size(ssl_digest_methods[SSL_MD_MD5_IDX]); -@@ -582,6 +592,12 @@ int ssl_cipher_get_evp(const SSL_SESSION *s, const EVP_CIPHER **enc, - case SSL_AES256GCM: - i = SSL_ENC_AES256GCM_IDX; - break; -+ case SSL_CHACHA20POLY1305_D: -+ i = SSL_ENC_CHACHA20POLY1305_DRAFT_IDX; -+ break; -+ case SSL_CHACHA20POLY1305: -+ i = SSL_ENC_CHACHA20POLY1305_IDX; -+ break; - default: - i = -1; - break; -@@ -797,6 +813,12 @@ static void ssl_cipher_get_disabled(unsigned long *mkey, unsigned long *auth, - (ssl_cipher_methods[SSL_ENC_AES256GCM_IDX] == - NULL) ? SSL_AES256GCM : 0; - *enc |= -+ (ssl_cipher_methods[SSL_ENC_CHACHA20POLY1305_DRAFT_IDX] == -+ NULL) ? SSL_CHACHA20POLY1305_D : 0; -+ *enc |= -+ (ssl_cipher_methods[SSL_ENC_CHACHA20POLY1305_IDX] == -+ NULL) ? SSL_CHACHA20POLY1305 : 0; -+ *enc |= - (ssl_cipher_methods[SSL_ENC_CAMELLIA128_IDX] == - NULL) ? SSL_CAMELLIA128 : 0; - *enc |= -@@ -1812,6 +1834,12 @@ char *SSL_CIPHER_description(const SSL_CIPHER *cipher, char *buf, int len) - case SSL_AES256GCM: - enc = "AESGCM(256)"; - break; -+ case SSL_CHACHA20POLY1305_D: -+ enc = "ChaCha20-Poly1305-draft"; -+ break; -+ case SSL_CHACHA20POLY1305: -+ enc = "ChaCha20-Poly1305"; -+ break; - case SSL_CAMELLIA128: - enc = "Camellia(128)"; - break; -diff --git a/ssl/ssl_locl.h b/ssl/ssl_locl.h -index a8e4efc..67d355c 100644 ---- a/ssl/ssl_locl.h -+++ b/ssl/ssl_locl.h -@@ -354,6 +354,8 @@ - # define SSL_SEED 0x00000800L - # define SSL_AES128GCM 0x00001000L - # define SSL_AES256GCM 0x00002000L -+# define SSL_CHACHA20POLY1305_D 0x00004000L -+# define SSL_CHACHA20POLY1305 0x00080000U /* Value from openssl */ - - # define SSL_AES (SSL_AES128|SSL_AES256|SSL_AES128GCM|SSL_AES256GCM) - # define SSL_CAMELLIA (SSL_CAMELLIA128|SSL_CAMELLIA256) -diff --git a/ssl/tls1.h b/ssl/tls1.h -index 7e237d0..fb0c981 100644 ---- a/ssl/tls1.h -+++ b/ssl/tls1.h -@@ -563,6 +563,20 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb) - # define TLS1_CK_ECDH_RSA_WITH_AES_128_GCM_SHA256 0x0300C031 - # define TLS1_CK_ECDH_RSA_WITH_AES_256_GCM_SHA384 0x0300C032 - -+/* ChaCha20-Poly1305 ciphersuites draft-agl-tls-chacha20poly1305-01 */ -+# define TLS1_CK_ECDHE_RSA_WITH_CHACHA20_POLY1305_D 0x0300CC13 -+# define TLS1_CK_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_D 0x0300CC14 -+# define TLS1_CK_DHE_RSA_WITH_CHACHA20_POLY1305_D 0x0300CC15 -+ -+/* ChaCha20-Poly1305 ciphersuites from RFC */ -+# define TLS1_CK_ECDHE_RSA_WITH_CHACHA20_POLY1305 0x0300CCA8 -+# define TLS1_CK_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 0x0300CCA9 -+# define TLS1_CK_DHE_RSA_WITH_CHACHA20_POLY1305 0x0300CCAA -+# define TLS1_CK_PSK_WITH_CHACHA20_POLY1305 0x0300CCAB -+# define TLS1_CK_ECDHE_PSK_WITH_CHACHA20_POLY1305 0x0300CCAC -+# define TLS1_CK_DHE_PSK_WITH_CHACHA20_POLY1305 0x0300CCAD -+# define TLS1_CK_RSA_PSK_WITH_CHACHA20_POLY1305 0x0300CCAE -+ - /* - * XXX * Backward compatibility alert: + * Older versions of OpenSSL gave - * some DHE ciphers names with "EDH" + * instead of "DHE". Going forward, we -@@ -713,6 +727,20 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb) - # define TLS1_TXT_ECDH_RSA_WITH_AES_128_GCM_SHA256 "ECDH-RSA-AES128-GCM-SHA256" - # define TLS1_TXT_ECDH_RSA_WITH_AES_256_GCM_SHA384 "ECDH-RSA-AES256-GCM-SHA384" - -+/* ChaCha20-Poly1305 ciphersuites draft-agl-tls-chacha20poly1305-01 */ -+# define TLS1_TXT_ECDHE_RSA_WITH_CHACHA20_POLY1305_D "ECDHE-RSA-CHACHA20-POLY1305-D" -+# define TLS1_TXT_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_D "ECDHE-ECDSA-CHACHA20-POLY1305-D" -+# define TLS1_TXT_DHE_RSA_WITH_CHACHA20_POLY1305_D "DHE-RSA-CHACHA20-POLY1305-D" -+ -+/* Chacha20-Poly1305 ciphersuites from RFC */ -+# define TLS1_TXT_ECDHE_RSA_WITH_CHACHA20_POLY1305 "ECDHE-RSA-CHACHA20-POLY1305" -+# define TLS1_TXT_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 "ECDHE-ECDSA-CHACHA20-POLY1305" -+# define TLS1_TXT_DHE_RSA_WITH_CHACHA20_POLY1305 "DHE-RSA-CHACHA20-POLY1305" -+# define TLS1_TXT_PSK_WITH_CHACHA20_POLY1305 "PSK-CHACHA20-POLY1305" -+# define TLS1_TXT_ECDHE_PSK_WITH_CHACHA20_POLY1305 "ECDHE-PSK-CHACHA20-POLY1305" -+# define TLS1_TXT_DHE_PSK_WITH_CHACHA20_POLY1305 "DHE-PSK-CHACHA20-POLY1305" -+# define TLS1_TXT_RSA_PSK_WITH_CHACHA20_POLY1305 "RSA-PSK-CHACHA20-POLY1305" -+ - # define TLS_CT_RSA_SIGN 1 - # define TLS_CT_DSS_SIGN 2 - # define TLS_CT_RSA_FIXED_DH 3 -diff --git a/test/Makefile b/test/Makefile -index b180971..554d536 100644 ---- a/test/Makefile -+++ b/test/Makefile -@@ -71,6 +71,7 @@ - VERIFYEXTRATEST= verify_extra_test - CLIENTHELLOTEST= clienthellotest - SSLV2CONFTEST = sslv2conftest -+CHAPOLYTEST= chapolytest - - TESTS= alltests - -@@ -84,7 +85,7 @@ - $(EVPTEST)$(EXE_EXT) $(EVPEXTRATEST)$(EXE_EXT) $(IGETEST)$(EXE_EXT) $(JPAKETEST)$(EXE_EXT) $(SRPTEST)$(EXE_EXT) \ - $(ASN1TEST)$(EXE_EXT) $(V3NAMETEST)$(EXE_EXT) $(HEARTBEATTEST)$(EXE_EXT) \ - $(CONSTTIMETEST)$(EXE_EXT) $(VERIFYEXTRATEST)$(EXE_EXT) \ -- $(CLIENTHELLOTEST)$(EXE_EXT) $(SSLV2CONFTEST)$(EXE_EXT) -+ $(CLIENTHELLOTEST)$(EXE_EXT) $(SSLV2CONFTEST)$(EXE_EXT) $(CHAPOLYTEST)$(EXE_EXT) - - # $(METHTEST)$(EXE_EXT) - -@@ -98,7 +99,7 @@ - $(BFTEST).o $(SSLTEST).o $(DSATEST).o $(EXPTEST).o $(RSATEST).o \ - $(EVPTEST).o $(EVPEXTRATEST).o $(IGETEST).o $(JPAKETEST).o $(ASN1TEST).o $(V3NAMETEST).o \ - $(HEARTBEATTEST).o $(CONSTTIMETEST).o $(VERIFYEXTRATEST).o \ -- $(CLIENTHELLOTEST).o $(SSLV2CONFTEST).o -+ $(CLIENTHELLOTEST).o $(SSLV2CONFTEST).o $(CHAPOLYTEST).o - - SRC= $(BNTEST).c $(ECTEST).c $(ECDSATEST).c $(ECDHTEST).c $(IDEATEST).c \ - $(MD2TEST).c $(MD4TEST).c $(MD5TEST).c \ -@@ -109,7 +110,7 @@ - $(BFTEST).c $(SSLTEST).c $(DSATEST).c $(EXPTEST).c $(RSATEST).c \ - $(EVPTEST).c $(EVPEXTRATEST).c $(IGETEST).c $(JPAKETEST).c $(SRPTEST).c $(ASN1TEST).c \ - $(V3NAMETEST).c $(HEARTBEATTEST).c $(CONSTTIMETEST).c $(VERIFYEXTRATEST).c \ -- $(CLIENTHELLOTEST).c $(SSLV2CONFTEST).c -+ $(CLIENTHELLOTEST).c $(SSLV2CONFTEST).c $(CHAPOLYTEST).c - - EXHEADER= - HEADER= testutil.h $(EXHEADER) -@@ -145,7 +146,7 @@ - @(cd ..; $(MAKE) DIRS=apps all) - - alltests: \ -- test_des test_idea test_sha test_md4 test_md5 test_hmac \ -+ test_des test_idea test_sha test_md4 test_md5 test_hmac test_chapoly \ - test_md2 test_mdc2 test_wp \ - test_rmd test_rc2 test_rc4 test_rc5 test_bf test_cast test_aes \ - test_rand test_bn test_ec test_ecdsa test_ecdh \ -@@ -366,6 +367,10 @@ - @echo $(START) $@ - ../util/shlib_wrap.sh ./$(SSLV2CONFTEST) - -+test_chapoly: $(CHAPOLYTEST)$(EXE_EXT) -+ @echo "Test ChaCha20 and Poly1305" -+ ../util/shlib_wrap.sh ./$(CHAPOLYTEST) -+ - lint: - lint -DLINT $(INCLUDES) $(SRC)>fluff - -@@ -546,6 +551,9 @@ - $(SSLV2CONFTEST)$(EXE_EXT): $(SSLV2CONFTEST).o - @target=$(SSLV2CONFTEST) $(BUILD_CMD) - -+$(CHAPOLYTEST)$(EXE_EXT): $(CHAPOLYTEST).o -+ @target=$(CHAPOLYTEST); $(BUILD_CMD) -+ - #$(AESTEST).o: $(AESTEST).c - # $(CC) -c $(CFLAGS) -DINTERMEDIATE_VALUE_KAT -DTRACE_KAT_MCT $(AESTEST).c - -@@ -614,6 +622,7 @@ - constant_time_test.o: ../crypto/constant_time_locl.h ../e_os.h - constant_time_test.o: ../include/openssl/e_os2.h - constant_time_test.o: ../include/openssl/opensslconf.h constant_time_test.c -+chapolytest.o: ../include/openssl/chacha20poly1305.h chapolytest.c - destest.o: ../include/openssl/des.h ../include/openssl/des_old.h - destest.o: ../include/openssl/e_os2.h ../include/openssl/opensslconf.h - destest.o: ../include/openssl/ossl_typ.h ../include/openssl/safestack.h --- -2.5.0 - diff --git a/openssl__chacha20_poly1305_draft_and_rfc_ossl102j.patch b/openssl__chacha20_poly1305_draft_and_rfc_ossl102j.patch new file mode 100644 index 000000000000..cdb767379f85 --- /dev/null +++ b/openssl__chacha20_poly1305_draft_and_rfc_ossl102j.patch @@ -0,0 +1,4718 @@ +From dcf9b5698b8658c9248327b3fdb280090c5c78ec Mon Sep 17 00:00:00 2001 +From: vkrasnov <vlad@cloudflare.com> +Date: Tue, 4 Oct 2016 15:47:32 -0700 +Subject: [PATCH] ChaCha20-Poly1305 draft and RFC cipher suites for OpenSSL + 1.0.2j + +--- + Configure | 44 +- + Makefile.org | 4 +- + crypto/chacha20_poly1305/Makefile | 89 + + .../asm/chacha20_poly1305_x86_64.pl | 2299 ++++++++++++++++++++ + crypto/chacha20_poly1305/asm/chacha20_x86_64.pl | 415 ++++ + crypto/chacha20_poly1305/asm/poly1305_x86_64.pl | 280 +++ + crypto/chacha20_poly1305/chacha20.c | 142 ++ + crypto/chacha20_poly1305/chacha20poly1305.h | 64 + + crypto/chacha20_poly1305/poly1305.c | 355 +++ + crypto/evp/Makefile | 8 +- + crypto/evp/c_allc.c | 5 + + crypto/evp/e_chacha20_poly1305.c | 362 +++ + crypto/evp/evp.h | 5 + + crypto/objects/obj_dat.h | 13 +- + crypto/objects/obj_mac.h | 8 + + crypto/objects/obj_mac.num | 2 + + crypto/objects/objects.txt | 2 + + ssl/s3_lib.c | 128 +- + ssl/ssl.h | 2 + + ssl/ssl_ciph.c | 31 +- + ssl/ssl_locl.h | 2 + + ssl/tls1.h | 26 + + 22 files changed, 4260 insertions(+), 26 deletions(-) + create mode 100644 crypto/chacha20_poly1305/Makefile + create mode 100755 crypto/chacha20_poly1305/asm/chacha20_poly1305_x86_64.pl + create mode 100644 crypto/chacha20_poly1305/asm/chacha20_x86_64.pl + create mode 100644 crypto/chacha20_poly1305/asm/poly1305_x86_64.pl + create mode 100644 crypto/chacha20_poly1305/chacha20.c + create mode 100644 crypto/chacha20_poly1305/chacha20poly1305.h + create mode 100644 crypto/chacha20_poly1305/poly1305.c + create mode 100644 crypto/evp/e_chacha20_poly1305.c + +diff --git a/Configure b/Configure +index c39f71a..f5f7c06 100755 +--- a/Configure ++++ b/Configure +@@ -150,25 +150,25 @@ my $tlib="-lnsl -lsocket"; + my $bits1="THIRTY_TWO_BIT "; + my $bits2="SIXTY_FOUR_BIT "; + +-my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o x86-gf2m.o::des-586.o crypt586.o:aes-586.o vpaes-x86.o aesni-x86.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:cmll-x86.o:ghash-x86.o:"; ++my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o x86-gf2m.o::des-586.o crypt586.o:aes-586.o vpaes-x86.o aesni-x86.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:cmll-x86.o:ghash-x86.o::"; + + my $x86_elf_asm="$x86_asm:elf"; + +-my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o:ecp_nistz256.o ecp_nistz256-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o aesni-gcm-x86_64.o:"; +-my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o:::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void"; +-my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o::des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o::::::camellia.o cmll_misc.o cmll_cbc.o cmllt4-sparcv9.o:ghash-sparcv9.o::void"; +-my $sparcv8_asm=":sparcv8.o::des_enc-sparc.o fcrypt_b.o:::::::::::::void"; +-my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o::::::sha1-alpha.o:::::::ghash-alpha.o::void"; +-my $mips64_asm=":bn-mips.o mips-mont.o:::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::"; ++my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o:ecp_nistz256.o ecp_nistz256-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o aesni-gcm-x86_64.o:chacha20_poly1305_x86_64.o poly1305_x86_64.o chacha20_x86_64.o:"; ++my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o:::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o:::void"; ++my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o::des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o::::::camellia.o cmll_misc.o cmll_cbc.o cmllt4-sparcv9.o:ghash-sparcv9.o:::void"; ++my $sparcv8_asm=":sparcv8.o::des_enc-sparc.o fcrypt_b.o::::::::::::::void"; ++my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o::::::sha1-alpha.o:::::::ghash-alpha.o:::void"; ++my $mips64_asm=":bn-mips.o mips-mont.o:::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o:::::::::"; + my $mips32_asm=$mips64_asm; $mips32_asm =~ s/\s*sha512\-mips\.o//; +-my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o:::aes-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:"; +-my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o:::aes_cbc.o aes-armv4.o bsaes-armv7.o aesv8-armx.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o ghashv8-armx.o::void"; +-my $aarch64_asm="armcap.o arm64cpuid.o mem_clr.o::::aes_core.o aes_cbc.o aesv8-armx.o:::sha1-armv8.o sha256-armv8.o sha512-armv8.o:::::::ghashv8-armx.o:"; +-my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o:::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::32"; +-my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o:::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::64"; +-my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o:::aes_core.o aes_cbc.o aes-ppc.o vpaes-ppc.o aesp8-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o sha256p8-ppc.o sha512p8-ppc.o:::::::ghashp8-ppc.o:"; ++my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o:::aes-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o::"; ++my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o:::aes_cbc.o aes-armv4.o bsaes-armv7.o aesv8-armx.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o ghashv8-armx.o:::void"; ++my $aarch64_asm="armcap.o arm64cpuid.o mem_clr.o::::aes_core.o aes_cbc.o aesv8-armx.o:::sha1-armv8.o sha256-armv8.o sha512-armv8.o:::::::ghashv8-armx.o::"; ++my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o:::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:::32"; ++my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o:::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:::64"; ++my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o:::aes_core.o aes_cbc.o aes-ppc.o vpaes-ppc.o aesp8-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o sha256p8-ppc.o sha512p8-ppc.o:::::::ghashp8-ppc.o::"; + my $ppc32_asm=$ppc64_asm; +-my $no_asm="::::::::::::::::void"; ++my $no_asm=":::::::::::::::::void"; + + # As for $BSDthreads. Idea is to maintain "collective" set of flags, + # which would cover all BSD flavors. -pthread applies to them all, +@@ -179,7 +179,7 @@ my $no_asm="::::::::::::::::void"; + # seems to be sufficient? + my $BSDthreads="-pthread -D_THREAD_SAFE -D_REENTRANT"; + +-#config-string $cc : $cflags : $unistd : $thread_cflag : $sys_id : $lflags : $bn_ops : $cpuid_obj : $bn_obj : $ec_obj : $des_obj : $aes_obj : $bf_obj : $md5_obj : $sha1_obj : $cast_obj : $rc4_obj : $rmd160_obj : $rc5_obj : $wp_obj : $cmll_obj : $modes_obj : $engines_obj : $dso_scheme : $shared_target : $shared_cflag : $shared_ldflag : $shared_extension : $ranlib : $arflags : $multilib ++#config-string $cc : $cflags : $unistd : $thread_cflag : $sys_id : $lflags : $bn_ops : $cpuid_obj : $bn_obj : $ec_obj : $des_obj : $aes_obj : $bf_obj : $md5_obj : $sha1_obj : $cast_obj : $rc4_obj : $rmd160_obj : $rc5_obj : $wp_obj : $cmll_obj : $modes_obj : $chapoly_obj : $engines_obj : $dso_scheme : $shared_target : $shared_cflag : $shared_ldflag : $shared_extension : $ranlib : $arflags : $multilib + + my %table=( + # File 'TABLE' (created by 'make TABLE') contains the data from this list, +@@ -713,6 +713,7 @@ my $idx_rc5_obj = $idx++; + my $idx_wp_obj = $idx++; + my $idx_cmll_obj = $idx++; + my $idx_modes_obj = $idx++; ++my $idx_chapoly_obj = $idx++; + my $idx_engines_obj = $idx++; + my $idx_perlasm_scheme = $idx++; + my $idx_dso_scheme = $idx++; +@@ -1239,6 +1240,7 @@ my $rc5_obj = $fields[$idx_rc5_obj]; + my $wp_obj = $fields[$idx_wp_obj]; + my $cmll_obj = $fields[$idx_cmll_obj]; + my $modes_obj = $fields[$idx_modes_obj]; ++my $chapoly_obj= $fields[$idx_chapoly_obj]; + my $engines_obj = $fields[$idx_engines_obj]; + my $perlasm_scheme = $fields[$idx_perlasm_scheme]; + my $dso_scheme = $fields[$idx_dso_scheme]; +@@ -1407,7 +1409,8 @@ if ($no_asm) + { + $cpuid_obj=$bn_obj=$ec_obj= + $des_obj=$aes_obj=$bf_obj=$cast_obj=$rc4_obj=$rc5_obj=$cmll_obj= +- $modes_obj=$sha1_obj=$md5_obj=$rmd160_obj=$wp_obj=$engines_obj=""; ++ $modes_obj=$sha1_obj=$md5_obj=$rmd160_obj=$wp_obj=$engines_obj= ++ $chapoly_obj=""; + } + + if (!$no_shared) +@@ -1622,6 +1625,10 @@ if ($ec_obj =~ /ecp_nistz256/) + { + $cflags.=" -DECP_NISTZ256_ASM"; + } ++if ($chapoly_obj =~ /chacha20_poly1305/) ++ { ++ $cflags.=" -DCHAPOLY_ASM"; ++ } + + # "Stringify" the C flags string. This permits it to be made part of a string + # and works as well on command lines. +@@ -1751,6 +1758,7 @@ while (<IN>) + s/^WP_ASM_OBJ=.*$/WP_ASM_OBJ= $wp_obj/; + s/^CMLL_ENC=.*$/CMLL_ENC= $cmll_obj/; + s/^MODES_ASM_OBJ.=*$/MODES_ASM_OBJ= $modes_obj/; ++ s/^CHAPOLY_ASM=.*$/CHAPOLY_ASM= $chapoly_obj/; + s/^ENGINES_ASM_OBJ.=*$/ENGINES_ASM_OBJ= $engines_obj/; + s/^PERLASM_SCHEME=.*$/PERLASM_SCHEME= $perlasm_scheme/; + s/^PROCESSOR=.*/PROCESSOR= $processor/; +@@ -1812,6 +1820,7 @@ print "SHA1_OBJ_ASM =$sha1_obj\n"; + print "RMD160_OBJ_ASM=$rmd160_obj\n"; + print "CMLL_ENC =$cmll_obj\n"; + print "MODES_OBJ =$modes_obj\n"; ++print "CHAPOLY_ASM =$chapoly_obj\n"; + print "ENGINES_OBJ =$engines_obj\n"; + print "PROCESSOR =$processor\n"; + print "RANLIB =$ranlib\n"; +@@ -2211,7 +2220,7 @@ sub print_table_entry + my ($cc, $cflags, $unistd, $thread_cflag, $sys_id, $lflags, + $bn_ops, $cpuid_obj, $bn_obj, $ec_obj, $des_obj, $aes_obj, $bf_obj, + $md5_obj, $sha1_obj, $cast_obj, $rc4_obj, $rmd160_obj, +- $rc5_obj, $wp_obj, $cmll_obj, $modes_obj, $engines_obj, ++ $rc5_obj, $wp_obj, $cmll_obj, $modes_obj, $chapoly_obj, $engines_obj, + $perlasm_scheme, $dso_scheme, $shared_target, $shared_cflag, + $shared_ldflag, $shared_extension, $ranlib, $arflags, $multilib)= + split(/\s*:\s*/,$table{$target} . ":" x 30 , -1); +@@ -2241,6 +2250,7 @@ sub print_table_entry + \$wp_obj = $wp_obj + \$cmll_obj = $cmll_obj + \$modes_obj = $modes_obj ++\$chapoly_obj = $chapoly_obj + \$engines_obj = $engines_obj + \$perlasm_scheme = $perlasm_scheme + \$dso_scheme = $dso_scheme +diff --git a/Makefile.org b/Makefile.org +index 2377f50..1f20a61 100644 +--- a/Makefile.org ++++ b/Makefile.org +@@ -103,6 +103,7 @@ WP_ASM_OBJ= + CMLL_ENC= + MODES_ASM_OBJ= + ENGINES_ASM_OBJ= ++CHAPOLY_ASM= + PERLASM_SCHEME= + + # KRB5 stuff +@@ -149,7 +150,7 @@ SDIRS= \ + bn ec rsa dsa ecdsa dh ecdh dso engine \ + buffer bio stack lhash rand err \ + evp asn1 pem x509 x509v3 conf txt_db pkcs7 pkcs12 comp ocsp ui krb5 \ +- cms pqueue ts jpake srp store cmac ++ cms pqueue ts jpake srp store cmac chacha20_poly1305 + # keep in mind that the above list is adjusted by ./Configure + # according to no-xxx arguments... + +@@ -240,6 +241,7 @@ BUILDENV= LC_ALL=C PLATFORM='$(PLATFORM)' PROCESSOR='$(PROCESSOR)'\ + FIPSLIBDIR='${FIPSLIBDIR}' \ + FIPSDIR='${FIPSDIR}' \ + FIPSCANLIB="$${FIPSCANLIB:-$(FIPSCANLIB)}" \ ++ CHAPOLY_ASM='$(CHAPOLY_ASM)' \ + THIS=$${THIS:-$@} MAKEFILE=Makefile MAKEOVERRIDES= + # MAKEOVERRIDES= effectively "equalizes" GNU-ish and SysV-ish make flavors, + # which in turn eliminates ambiguities in variable treatment with -e. +diff --git a/crypto/chacha20_poly1305/Makefile b/crypto/chacha20_poly1305/Makefile +new file mode 100644 +index 0000000..87f4ba3 +--- /dev/null ++++ b/crypto/chacha20_poly1305/Makefile +@@ -0,0 +1,89 @@ ++#
++# crypto/chacha20poly1305/Makefile
++#
++
++DIR= chacha20poly1305
++TOP= ../..
++CC= cc
++INCLUDES= -I.. -I$(TOP) -I../../include
++CFLAG=-g
++MAKEFILE= Makefile
++AR= ar r
++
++CFLAGS= $(INCLUDES) $(CFLAG)
++ASFLAGS= $(INCLUDES) $(ASFLAG)
++AFLAGS= $(ASFLAGS)
++
++GENERAL=Makefile
++TEST=
++APPS=
++
++LIB=$(TOP)/libcrypto.a
++LIBSRC= chacha20.c poly1305.c
++LIBOBJ= chacha20.o poly1305.o $(CHAPOLY_ASM)
++
++SRC= $(LIBSRC)
++
++EXHEADER= chacha20poly1305.h
++HEADER= $(EXHEADER)
++
++ALL= $(GENERAL) $(SRC) $(HEADER)
++
++top:
++ (cd ../..; $(MAKE) DIRS=crypto SDIRS=$(DIR) sub_all)
++
++all: lib
++
++lib: $(LIBOBJ)
++ $(AR) $(LIB) $(LIBOBJ)
++ $(RANLIB) $(LIB) || echo Never mind.
++ @touch lib
++
++chacha20_poly1305_x86_64.s: asm/chacha20_poly1305_x86_64.pl
++ $(PERL) asm/chacha20_poly1305_x86_64.pl $(PERLASM_SCHEME) > $@
++
++poly1305_x86_64.s: asm/poly1305_x86_64.pl
++ $(PERL) asm/poly1305_x86_64.pl $(PERLASM_SCHEME) > $@
++
++chacha20_x86_64.s: asm/chacha20_x86_64.pl
++ $(PERL) asm/chacha20_x86_64.pl $(PERLASM_SCHEME) > $@
++
++files:
++ $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
++
++links:
++ @$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER)
++ @$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST)
++ @$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS)
++
++install:
++ @[ -n "$(INSTALLTOP)" ] # should be set by top Makefile...
++ @headerlist="$(EXHEADER)"; for i in $$headerlist ; \
++ do \
++ (cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \
++ chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ); \
++ done;
++
++tags:
++ ctags $(SRC)
++
++tests:
++
++lint:
++ lint -DLINT $(INCLUDES) $(SRC)>fluff
++
++depend:
++ @[ -n "$(MAKEDEPEND)" ] # should be set by upper Makefile...
++ $(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC)
++
++dclean:
++ $(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new
++ mv -f Makefile.new $(MAKEFILE)
++
++clean:
++ rm -f *.s *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff
++
++# DO NOT DELETE THIS LINE -- make depend depends on it.
++
++chacha20.o: ../../include/openssl/chacha20poly1305.h chacha20.c
++poly1305.o: ../../include/openssl/chacha20poly1305.h poly1305.c
+diff --git a/crypto/chacha20_poly1305/asm/chacha20_poly1305_x86_64.pl b/crypto/chacha20_poly1305/asm/chacha20_poly1305_x86_64.pl +new file mode 100755 +index 0000000..ef90831 +--- /dev/null ++++ b/crypto/chacha20_poly1305/asm/chacha20_poly1305_x86_64.pl +@@ -0,0 +1,2299 @@ ++#!/usr/bin/env perl ++ ++############################################################################## ++# # ++# Copyright 2016 CloudFlare LTD # ++# # ++# Licensed under the Apache License, Version 2.0 (the "License"); # ++# you may not use this file except in compliance with the License. # ++# You may obtain a copy of the License at # ++# # ++# http://www.apache.org/licenses/LICENSE-2.0 # ++# # ++# Unless required by applicable law or agreed to in writing, software # ++# distributed under the License is distributed on an "AS IS" BASIS, # ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # ++# See the License for the specific language governing permissions and # ++# limitations under the License. # ++# # ++############################################################################## ++# # ++# Author: Vlad Krasnov # ++# # ++############################################################################## ++ ++$flavour = shift; ++$output = shift; ++if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } ++ ++$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or ++die "can't locate x86_64-xlate.pl"; ++ ++open OUT,"| \"$^X\" $xlate $flavour $output"; ++*STDOUT=*OUT; ++ ++if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` ++ =~ /GNU assembler version ([2-9]\.[0-9]+)/) { ++ $avx = ($1>=2.19) + ($1>=2.22); ++} ++ ++if ($win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && ++ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { ++ $avx = ($1>=2.09) + ($1>=2.10); ++} ++ ++if ($win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && ++ `ml64 2>&1` =~ /Version ([0-9]+)\./) { ++ $avx = ($1>=10) + ($1>=11); ++} ++ ++if (`$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) { ++ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 ++ $avx = ($ver>=3.0) + ($ver>=3.01); ++} ++ ++$code.=<<___; ++.text ++.extern OPENSSL_ia32cap_P ++.align 64 ++.chacha20_consts: ++.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' ++.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' ++.rol8: ++.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 ++.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 ++.rol16: ++.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 ++.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 ++.avx2_init: ++.long 0,0,0,0 ++.sse_inc: ++.long 1,0,0,0 ++.avx2_inc: ++.long 2,0,0,0,2,0,0,0 ++.clamp: ++.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC ++.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF ++.align 16 ++.and_masks: ++.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 ++.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 ++.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 ++.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 ++.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 ++.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 ++.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 ++.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 ++.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00 ++.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00 ++.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00 ++.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 ++.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00 ++.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 ++.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 ++___ ++ ++my ($oup,$inp,$inl,$adp,$keyp,$itr1,$itr2)=("%rdi","%rsi","%rbx","%rcx","%r9","%rcx","%r8"); ++my ($acc0,$acc1,$acc2)=map("%r$_",(10..12)); ++my ($t0,$t1,$t2,$t3)=("%r13","%r14","%r15","%r9"); ++my ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%xmm$_",(0..15)); ++my ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3); ++my $r_store="0*16(%rbp)"; ++my $s_store="1*16(%rbp)"; ++my $len_store="2*16(%rbp)"; ++my $state1_store="3*16(%rbp)"; ++my $state2_store="4*16(%rbp)"; ++my $tmp_store="5*16(%rbp)"; ++my $ctr0_store="6*16(%rbp)"; ++my $ctr1_store="7*16(%rbp)"; ++my $ctr2_store="8*16(%rbp)"; ++my $ctr3_store="9*16(%rbp)"; ++ ++sub chacha_qr { ++my ($a,$b,$c,$d,$t,$dir)=@_; ++$code.="movdqa $t, $tmp_store\n" if ($dir =~ /store/); ++$code.="paddd $b, $a ++ pxor $a, $d ++ pshufb .rol16(%rip), $d ++ paddd $d, $c ++ pxor $c, $b ++ movdqa $b, $t ++ pslld \$12, $t ++ psrld \$20, $b ++ pxor $t, $b ++ paddd $b, $a ++ pxor $a, $d ++ pshufb .rol8(%rip), $d ++ paddd $d, $c ++ pxor $c, $b ++ movdqa $b, $t ++ pslld \$7, $t ++ psrld \$25, $b ++ pxor $t, $b\n"; ++$code.="palignr \$4, $b, $b ++ palignr \$8, $c, $c ++ palignr \$12, $d, $d\n" if ($dir =~ /left/); ++$code.="palignr \$12, $b, $b ++ palignr \$8, $c, $c ++ palignr \$4, $d, $d\n" if ($dir =~ /right/); ++$code.="movdqa $tmp_store, $t\n" if ($dir =~ /load/); ++} ++ ++sub poly_add { ++my ($src)=@_; ++$code.="add $src, $acc0 ++ adc 8+$src, $acc1 ++ adc \$1, $acc2\n"; ++} ++ ++sub poly_stage1 { ++$code.="mov 0+$r_store, %rax ++ mov %rax, $t2 ++ mul $acc0 ++ mov %rax, $t0 ++ mov %rdx, $t1 ++ mov 0+$r_store, %rax ++ mul $acc1 ++ imul $acc2, $t2 ++ add %rax, $t1 ++ adc %rdx, $t2\n"; ++} ++ ++sub poly_stage2 { ++$code.="mov 8+$r_store, %rax ++ mov %rax, $t3 ++ mul $acc0 ++ add %rax, $t1 ++ adc \$0, %rdx ++ mov %rdx, $acc0 ++ mov 8+$r_store, %rax ++ mul $acc1 ++ add %rax, $t2 ++ adc \$0, %rdx\n"; ++} ++ ++sub poly_stage3 { ++$code.="imul $acc2, $t3 ++ add $acc0, $t2 ++ adc %rdx, $t3\n"; ++} ++ ++sub poly_reduce_stage { ++$code.="mov $t0, $acc0 ++ mov $t1, $acc1 ++ mov $t2, $acc2 ++ and \$3, $acc2 ++ mov $t2, $t0 ++ and \$-4, $t0 ++ mov $t3, $t1 ++ shrd \$2, $t3, $t2 ++ shr \$2, $t3 ++ add $t0, $acc0 ++ adc $t1, $acc1 ++ adc \$0, $acc2 ++ add $t2, $acc0 ++ adc $t3, $acc1 ++ adc \$0, $acc2\n"; ++} ++ ++sub poly_mul { ++ &poly_stage1(); ++ &poly_stage2(); ++ &poly_stage3(); ++ &poly_reduce_stage(); ++} ++ ++sub prep_state { ++my ($n)=@_; ++$code.="movdqa .chacha20_consts(%rip), $A0 ++ movdqa $state1_store, $B0 ++ movdqa $state2_store, $C0\n"; ++$code.="movdqa $A0, $A1 ++ movdqa $B0, $B1 ++ movdqa $C0, $C1\n" if ($n ge 2); ++$code.="movdqa $A0, $A2 ++ movdqa $B0, $B2 ++ movdqa $C0, $C2\n" if ($n ge 3); ++$code.="movdqa $A0, $A3 ++ movdqa $B0, $B3 ++ movdqa $C0, $C3\n" if ($n ge 4); ++$code.="movdqa $ctr0_store, $D0 ++ paddd .sse_inc(%rip), $D0 ++ movdqa $D0, $ctr0_store\n" if ($n eq 1); ++$code.="movdqa $ctr0_store, $D1 ++ paddd .sse_inc(%rip), $D1 ++ movdqa $D1, $D0 ++ paddd .sse_inc(%rip), $D0 ++ movdqa $D0, $ctr0_store ++ movdqa $D1, $ctr1_store\n" if ($n eq 2); ++$code.="movdqa $ctr0_store, $D2 ++ paddd .sse_inc(%rip), $D2 ++ movdqa $D2, $D1 ++ paddd .sse_inc(%rip), $D1 ++ movdqa $D1, $D0 ++ paddd .sse_inc(%rip), $D0 ++ movdqa $D0, $ctr0_store ++ movdqa $D1, $ctr1_store ++ movdqa $D2, $ctr2_store\n" if ($n eq 3); ++$code.="movdqa $ctr0_store, $D3 ++ paddd .sse_inc(%rip), $D3 ++ movdqa $D3, $D2 ++ paddd .sse_inc(%rip), $D2 ++ movdqa $D2, $D1 ++ paddd .sse_inc(%rip), $D1 ++ movdqa $D1, $D0 ++ paddd .sse_inc(%rip), $D0 ++ movdqa $D0, $ctr0_store ++ movdqa $D1, $ctr1_store ++ movdqa $D2, $ctr2_store ++ movdqa $D3, $ctr3_store\n" if ($n eq 4); ++} ++ ++sub finalize_state { ++my ($n)=@_; ++$code.="paddd .chacha20_consts(%rip), $A3 ++ paddd $state1_store, $B3 ++ paddd $state2_store, $C3 ++ paddd $ctr3_store, $D3\n" if ($n eq 4); ++$code.="paddd .chacha20_consts(%rip), $A2 ++ paddd $state1_store, $B2 ++ paddd $state2_store, $C2 ++ paddd $ctr2_store, $D2\n" if ($n ge 3); ++$code.="paddd .chacha20_consts(%rip), $A1 ++ paddd $state1_store, $B1 ++ paddd $state2_store, $C1 ++ paddd $ctr1_store, $D1\n" if ($n ge 2); ++$code.="paddd .chacha20_consts(%rip), $A0 ++ paddd $state1_store, $B0 ++ paddd $state2_store, $C0 ++ paddd $ctr0_store, $D0\n"; ++} ++ ++sub xor_stream { ++my ($A, $B, $C, $D, $offset)=@_; ++$code.="movdqu 0*16 + $offset($inp), $A3 ++ movdqu 1*16 + $offset($inp), $B3 ++ movdqu 2*16 + $offset($inp), $C3 ++ movdqu 3*16 + $offset($inp), $D3 ++ pxor $A3, $A ++ pxor $B3, $B ++ pxor $C3, $C ++ pxor $D, $D3 ++ movdqu $A, 0*16 + $offset($oup) ++ movdqu $B, 1*16 + $offset($oup) ++ movdqu $C, 2*16 + $offset($oup) ++ movdqu $D3, 3*16 + $offset($oup)\n"; ++} ++ ++sub xor_stream_using_temp { ++my ($A, $B, $C, $D, $offset, $temp)=@_; ++$code.="movdqa $temp, $tmp_store ++ movdqu 0*16 + $offset($inp), $temp ++ pxor $A, $temp ++ movdqu $temp, 0*16 + $offset($oup) ++ movdqu 1*16 + $offset($inp), $temp ++ pxor $B, $temp ++ movdqu $temp, 1*16 + $offset($oup) ++ movdqu 2*16 + $offset($inp), $temp ++ pxor $C, $temp ++ movdqu $temp, 2*16 + $offset($oup) ++ movdqu 3*16 + $offset($inp), $temp ++ pxor $D, $temp ++ movdqu $temp, 3*16 + $offset($oup)\n"; ++} ++ ++sub gen_chacha_round { ++my ($rot1, $rot2, $shift)=@_; ++my $round=""; ++$round.="movdqa $C0, $tmp_store\n" if ($rot1 eq 20); ++$round.="movdqa $rot2, $C0 ++ paddd $B3, $A3 ++ paddd $B2, $A2 ++ paddd $B1, $A1 ++ paddd $B0, $A0 ++ pxor $A3, $D3 ++ pxor $A2, $D2 ++ pxor $A1, $D1 ++ pxor $A0, $D0 ++ pshufb $C0, $D3 ++ pshufb $C0, $D2 ++ pshufb $C0, $D1 ++ pshufb $C0, $D0 ++ movdqa $tmp_store, $C0 ++ paddd $D3, $C3 ++ paddd $D2, $C2 ++ paddd $D1, $C1 ++ paddd $D0, $C0 ++ pxor $C3, $B3 ++ pxor $C2, $B2 ++ pxor $C1, $B1 ++ pxor $C0, $B0 ++ movdqa $C0, $tmp_store ++ movdqa $B3, $C0 ++ psrld \$$rot1, $C0 ++ pslld \$32-$rot1, $B3 ++ pxor $C0, $B3 ++ movdqa $B2, $C0 ++ psrld \$$rot1, $C0 ++ pslld \$32-$rot1, $B2 ++ pxor $C0, $B2 ++ movdqa $B1, $C0 ++ psrld \$$rot1, $C0 ++ pslld \$32-$rot1, $B1 ++ pxor $C0, $B1 ++ movdqa $B0, $C0 ++ psrld \$$rot1, $C0 ++ pslld \$32-$rot1, $B0 ++ pxor $C0, $B0\n"; ++($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/); ++($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/); ++$round.="movdqa $tmp_store, $C0 ++ palignr \$$s1, $B3, $B3 ++ palignr \$$s2, $C3, $C3 ++ palignr \$$s3, $D3, $D3 ++ palignr \$$s1, $B2, $B2 ++ palignr \$$s2, $C2, $C2 ++ palignr \$$s3, $D2, $D2 ++ palignr \$$s1, $B1, $B1 ++ palignr \$$s2, $C1, $C1 ++ palignr \$$s3, $D1, $D1 ++ palignr \$$s1, $B0, $B0 ++ palignr \$$s2, $C0, $C0 ++ palignr \$$s3, $D0, $D0\n" ++if (($shift =~ /left/) || ($shift =~ /right/)); ++return $round; ++}; ++ ++$chacha_body = &gen_chacha_round(20, ".rol16(%rip)") . ++ &gen_chacha_round(25, ".rol8(%rip)", "left") . ++ &gen_chacha_round(20, ".rol16(%rip)") . ++ &gen_chacha_round(25, ".rol8(%rip)", "right"); ++ ++my @loop_body = split /\n/, $chacha_body; ++ ++sub emit_body { ++my ($n)=@_; ++ for (my $i=0; $i < $n; $i++) { ++ $code=$code.shift(@loop_body)."\n"; ++ }; ++} ++ ++{ ++################################################################################ ++# void poly_hash_ad_internal(); ++$code.=" ++.type poly_hash_ad_internal,\@function,2 ++.align 64 ++poly_hash_ad_internal: ++ xor $acc0, $acc0 ++ xor $acc1, $acc1 ++ xor $acc2, $acc2 ++ cmp \$13, $itr2 ++ jne hash_ad_loop ++poly_fast_tls_ad: ++ # Special treatment for the TLS case of 13 bytes ++ mov ($adp), $acc0 ++ mov 5($adp), $acc1 ++ shr \$24, $acc1 ++ mov \$1, $acc2\n"; ++ &poly_mul(); $code.=" ++ ret ++hash_ad_loop: ++ # Hash in 16 byte chunk ++ cmp \$16, $itr2 ++ jb hash_ad_tail\n"; ++ &poly_add("0($adp)"); ++ &poly_mul(); $code.=" ++ lea (1*16)($adp), $adp ++ sub \$16, $itr2 ++ jmp hash_ad_loop ++hash_ad_tail: ++ cmp \$0, $itr2 ++ je 1f ++ # Hash last < 16 byte tail ++ xor $t0, $t0 ++ xor $t1, $t1 ++ xor $t2, $t2 ++ add $itr2, $adp ++hash_ad_tail_loop: ++ shld \$8, $t0, $t1 ++ shl \$8, $t0 ++ movzxb -1($adp), $t2 ++ xor $t2, $t0 ++ dec $adp ++ dec $itr2 ++ jne hash_ad_tail_loop ++ ++ add $t0, $acc0 ++ adc $t1, $acc1 ++ adc \$1, $acc2\n"; ++ &poly_mul(); $code.=" ++ # Finished AD ++1: ++ ret ++.size poly_hash_ad_internal, .-poly_hash_ad_internal\n"; ++} ++ ++{ ++################################################################################ ++# int chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp); ++$code.=" ++.globl chacha20_poly1305_open ++.type chacha20_poly1305_open,\@function,2 ++.align 64 ++chacha20_poly1305_open: ++ push %rbp ++ push %rbx ++ push %r12 ++ push %r13 ++ push %r14 ++ push %r15 ++ sub \$288 + 32, %rsp ++ lea 32(%rsp), %rbp ++ and \$-32, %rbp ++ mov %rdx, 8+$len_store ++ mov %r8, 0+$len_store ++ mov %rdx, $inl\n"; $code.=" ++ mov OPENSSL_ia32cap_P+8(%rip), %eax ++ test \$`1<<5`, %eax ++ jnz chacha20_poly1305_open_avx2\n" if ($avx>1); ++$code.=" ++ cmp \$128, $inl ++ jbe open_sse_128 ++ # For long buffers, prepare the poly key first ++ movdqa .chacha20_consts(%rip), $A0 ++ movdqu 0*16($keyp), $B0 ++ movdqu 1*16($keyp), $C0 ++ movdqu 2*16($keyp), $D0 ++ movdqa $D0, $T1 ++ # Store on stack, to free keyp ++ movdqa $B0, $state1_store ++ movdqa $C0, $state2_store ++ movdqa $D0, $ctr0_store ++ mov \$10, $acc0 ++1: \n"; ++ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); ++ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.=" ++ dec $acc0 ++ jne 1b ++ # A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded ++ paddd .chacha20_consts(%rip), $A0 ++ paddd $state1_store, $B0 ++ # Clamp and store the key ++ pand .clamp(%rip), $A0 ++ movdqa $A0, $r_store ++ movdqa $B0, $s_store ++ # Hash ++ mov %r8, $itr2 ++ call poly_hash_ad_internal ++open_sse_main_loop: ++ cmp \$16*16, $inl ++ jb 2f ++ # Load state, increment counter blocks\n"; ++ &prep_state(4); $code.=" ++ # There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we ++ # hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16 ++ mov \$4, $itr1 ++ mov $inp, $itr2 ++1: \n"; ++ &emit_body(20); ++ &poly_add("0($itr2)"); $code.=" ++ lea 2*8($itr2), $itr2\n"; ++ &emit_body(20); ++ &poly_stage1(); ++ &emit_body(20); ++ &poly_stage2(); ++ &emit_body(20); ++ &poly_stage3(); ++ &emit_body(20); ++ &poly_reduce_stage(); ++ foreach $l (@loop_body) {$code.=$l."\n";} ++ @loop_body = split /\n/, $chacha_body; $code.=" ++ dec $itr1 ++ jge 1b\n"; ++ &poly_add("0($itr2)"); ++ &poly_mul(); $code.=" ++ lea 2*8($itr2), $itr2 ++ cmp \$-6, $itr1 ++ jg 1b\n"; ++ &finalize_state(4); ++ &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0); ++ &xor_stream($A2, $B2, $C2, $D2, "4*16"); ++ &xor_stream($A1, $B1, $C1, $D1, "8*16"); ++ &xor_stream($A0, $B0, $C0, $tmp_store, "12*16"); $code.=" ++ lea 16*16($inp), $inp ++ lea 16*16($oup), $oup ++ sub \$16*16, $inl ++ jmp open_sse_main_loop ++2: ++ # Handle the various tail sizes efficiently ++ test $inl, $inl ++ jz open_sse_finalize ++ cmp \$4*16, $inl ++ ja 3f\n"; ++############################################################################### ++ # At most 64 bytes are left ++ &prep_state(1); $code.=" ++ xor $itr2, $itr2 ++ mov $inl, $itr1 ++ cmp \$16, $itr1 ++ jb 2f ++1: \n"; ++ &poly_add("0($inp, $itr2)"); ++ &poly_mul(); $code.=" ++ sub \$16, $itr1 ++2: ++ add \$16, $itr2\n"; ++ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); ++ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.=" ++ cmp \$16, $itr1 ++ jae 1b ++ cmp \$10*16, $itr2 ++ jne 2b\n"; ++ &finalize_state(1); $code.=" ++ jmp open_sse_tail_64_dec_loop ++3: ++ cmp \$8*16, $inl ++ ja 3f\n"; ++############################################################################### ++ # 65 - 128 bytes are left ++ &prep_state(2); $code.=" ++ mov $inl, $itr1 ++ and \$-16, $itr1 ++ xor $itr2, $itr2 ++1: \n"; ++ &poly_add("0($inp, $itr2)"); ++ &poly_mul(); $code.=" ++2: ++ add \$16, $itr2\n"; ++ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); ++ &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); ++ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); ++ &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");$code.=" ++ cmp $itr1, $itr2 ++ jb 1b ++ cmp \$10*16, $itr2 ++ jne 2b\n"; ++ &finalize_state(2); ++ &xor_stream($A1, $B1, $C1, $D1, "0*16"); $code.=" ++ sub \$4*16, $inl ++ lea 4*16($inp), $inp ++ lea 4*16($oup), $oup ++ jmp open_sse_tail_64_dec_loop ++3: ++ cmp \$12*16, $inl ++ ja 3f\n"; ++############################################################################### ++ # 129 - 192 bytes are left ++ &prep_state(3); $code.=" ++ mov $inl, $itr1 ++ mov \$10*16, $itr2 ++ cmp \$10*16, $itr1 ++ cmovg $itr2, $itr1 ++ and \$-16, $itr1 ++ xor $itr2, $itr2 ++1: \n"; ++ &poly_add("0($inp, $itr2)"); ++ &poly_mul(); $code.=" ++2: ++ add \$16, $itr2\n"; ++ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); ++ &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); ++ &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); ++ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); ++ &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); ++ &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" ++ cmp $itr1, $itr2 ++ jb 1b ++ cmp \$10*16, $itr2 ++ jne 2b ++ cmp \$11*16, $inl ++ jb 1f\n"; ++ &poly_add("10*16($inp)"); ++ &poly_mul(); $code.=" ++ cmp \$12*16, $inl ++ jb 1f\n"; ++ &poly_add("11*16($inp)"); ++ &poly_mul(); $code.=" ++1: \n"; ++ &finalize_state(3); ++ &xor_stream($A2, $B2, $C2, $D2, "0*16"); ++ &xor_stream($A1, $B1, $C1, $D1, "4*16"); $code.=" ++ sub \$8*16, $inl ++ lea 8*16($inp), $inp ++ lea 8*16($oup), $oup ++ jmp open_sse_tail_64_dec_loop ++3: ++###############################################################################\n"; ++ # 193 - 255 bytes are left ++ &prep_state(4); $code.=" ++ xor $itr2, $itr2 ++1: \n"; ++ &poly_add("0($inp, $itr2)"); ++ &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_left"); ++ &chacha_qr($A1,$B1,$C1,$D1,$C3,"left"); ++ &chacha_qr($A2,$B2,$C2,$D2,$C3,"left_load"); ++ &poly_stage1(); ++ &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_left_load"); ++ &poly_stage2(); ++ &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_right"); ++ &chacha_qr($A1,$B1,$C1,$D1,$C3,"right"); ++ &poly_stage3(); ++ &chacha_qr($A2,$B2,$C2,$D2,$C3,"right_load"); ++ &poly_reduce_stage(); ++ &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_right_load"); $code.=" ++ add \$16, $itr2 ++ cmp \$10*16, $itr2 ++ jb 1b ++ mov $inl, $itr1 ++ and \$-16, $itr1 ++1: \n"; ++ &poly_add("0($inp, $itr2)"); ++ &poly_mul(); $code.=" ++ add \$16, $itr2 ++ cmp $itr1, $itr2 ++ jb 1b\n"; ++ &finalize_state(4); ++ &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0); ++ &xor_stream($A2, $B2, $C2, $D2, "4*16"); ++ &xor_stream($A1, $B1, $C1, $D1, "8*16"); $code.=" ++ movdqa $tmp_store, $D0 ++ sub \$12*16, $inl ++ lea 12*16($inp), $inp ++ lea 12*16($oup), $oup ++############################################################################### ++ # Decrypt the remaining data, 16B at a time, using existing stream ++open_sse_tail_64_dec_loop: ++ cmp \$16, $inl ++ jb 1f ++ sub \$16, $inl ++ movdqu ($inp), $T0 ++ pxor $T0, $A0 ++ movdqu $A0, ($oup) ++ lea 16($inp), $inp ++ lea 16($oup), $oup ++ movdqa $B0, $A0 ++ movdqa $C0, $B0 ++ movdqa $D0, $C0 ++ jmp open_sse_tail_64_dec_loop ++1: ++ movdqa $A0, $A1 ++ # Decrypt up to 16B ++open_sse_tail_16: ++ test $inl, $inl ++ jz open_sse_finalize ++ # We can safely load the CT from the end, because it is padded with the MAC ++ mov $inl, $itr2 ++ shl \$4, $itr2 ++ lea .and_masks(%rip), $t0 ++ movdqu ($inp), $T0 ++ add $inl, $inp ++ pand -16($t0, $itr2), $T0 ++ movq $T0, $t0 ++ pextrq \$1, $T0, $t1 ++ pxor $A1, $T0 ++ # We can only store 1 byte at a time, since plaintext can be shorter than 16 bytes ++2: ++ pextrb \$0, $T0, ($oup) ++ psrldq \$1, $T0 ++ inc $oup ++ dec $inl ++ jne 2b ++ ++ add $t0, $acc0 ++ adc $t1, $acc1 ++ adc \$1, $acc2\n"; ++ &poly_mul(); $code.=" ++ ++open_sse_finalize:\n"; ++ &poly_add($len_store); ++ &poly_mul(); $code.=" ++ # Final reduce ++ mov $acc0, $t0 ++ mov $acc1, $t1 ++ mov $acc2, $t2 ++ sub \$-5, $acc0 ++ sbb \$-1, $acc1 ++ sbb \$3, $acc2 ++ cmovc $t0, $acc0 ++ cmovc $t1, $acc1 ++ cmovc $t2, $acc2 ++ # Add in s part of the key ++ add 0+$s_store, $acc0 ++ adc 8+$s_store, $acc1 ++ # Constant time compare ++ xor %rax, %rax ++ mov \$1, %rdx ++ xor 0*8($inp), $acc0 ++ xor 1*8($inp), $acc1 ++ or $acc1, $acc0 ++ cmovz %rdx, %rax ++ ++ add \$288 + 32, %rsp ++ pop %r15 ++ pop %r14 ++ pop %r13 ++ pop %r12 ++ pop %rbx ++ pop %rbp ++ ret ++############################################################################### ++open_sse_128: ++ movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2 ++ movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2 ++ movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2 ++ movdqu 2*16($keyp), $D0 ++ movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1 ++ movdqa $D1, $D2\npaddd .sse_inc(%rip), $D2 ++ movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D1, $T3 ++ mov \$10, $acc0 ++1: \n"; ++ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); ++ &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); ++ &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); ++ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); ++ &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); ++ &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" ++ dec $acc0 ++ jnz 1b ++ paddd .chacha20_consts(%rip), $A0 ++ paddd .chacha20_consts(%rip), $A1 ++ paddd .chacha20_consts(%rip), $A2 ++ paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2 ++ paddd $T2, $C1\npaddd $T2, $C2 ++ paddd $T3, $D1 ++ paddd .sse_inc(%rip), $T3 ++ paddd $T3, $D2 ++ # Clamp and store the key ++ pand .clamp(%rip), $A0 ++ movdqa $A0, $r_store ++ movdqa $B0, $s_store ++ # Hash ++ mov %r8, $itr2 ++ call poly_hash_ad_internal ++1: ++ cmp \$16, $inl ++ jb open_sse_tail_16 ++ sub \$16, $inl\n"; ++ # Load for hashing ++ &poly_add("0*8($inp)"); $code.=" ++ # Load for decryption ++ movdqu 0*16($inp), $T0 ++ pxor $T0, $A1 ++ movdqu $A1, 0*16($oup) ++ lea 1*16($inp), $inp ++ lea 1*16($oup), $oup\n"; ++ &poly_mul(); $code.=" ++ # Shift the stream left ++ movdqa $B1, $A1 ++ movdqa $C1, $B1 ++ movdqa $D1, $C1 ++ movdqa $A2, $D1 ++ movdqa $B2, $A2 ++ movdqa $C2, $B2 ++ movdqa $D2, $C2 ++ jmp 1b ++ jmp open_sse_tail_16 ++.size chacha20_poly1305_open, .-chacha20_poly1305_open ++################################################################################ ++################################################################################ ++# void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp); ++.globl chacha20_poly1305_seal ++.type chacha20_poly1305_seal,\@function,2 ++.align 64 ++chacha20_poly1305_seal: ++ push %rbp ++ push %rbx ++ push %r12 ++ push %r13 ++ push %r14 ++ push %r15 ++ sub \$288 + 32, %rsp ++ lea 32(%rsp), %rbp ++ and \$-32, %rbp ++ mov %rdx, 8+$len_store ++ mov %r8, 0+$len_store ++ mov %rdx, $inl\n"; $code.=" ++ mov OPENSSL_ia32cap_P+8(%rip), %eax ++ test \$`1<<5`, %eax ++ jnz chacha20_poly1305_seal_avx2\n" if ($avx>1); ++$code.=" ++ cmp \$128, $inl ++ jbe seal_sse_128 ++ # For longer buffers, prepare the poly key + some stream ++ movdqa .chacha20_consts(%rip), $A0 ++ movdqu 0*16($keyp), $B0 ++ movdqu 1*16($keyp), $C0 ++ movdqu 2*16($keyp), $D0 ++ movdqa $A0, $A1 ++ movdqa $A0, $A2 ++ movdqa $A0, $A3 ++ movdqa $B0, $B1 ++ movdqa $B0, $B2 ++ movdqa $B0, $B3 ++ movdqa $C0, $C1 ++ movdqa $C0, $C2 ++ movdqa $C0, $C3 ++ movdqa $D0, $D3 ++ paddd .sse_inc(%rip), $D0 ++ movdqa $D0, $D2 ++ paddd .sse_inc(%rip), $D0 ++ movdqa $D0, $D1 ++ paddd .sse_inc(%rip), $D0 ++ # Store on stack ++ movdqa $B0, $state1_store ++ movdqa $C0, $state2_store ++ movdqa $D0, $ctr0_store ++ movdqa $D1, $ctr1_store ++ movdqa $D2, $ctr2_store ++ movdqa $D3, $ctr3_store ++ mov \$10, $acc0 ++1: \n"; ++ foreach $l (@loop_body) {$code.=$l."\n";} ++ @loop_body = split /\n/, $chacha_body; $code.=" ++ dec $acc0 ++ jnz 1b\n"; ++ &finalize_state(4); $code.=" ++ # Clamp and store the key ++ pand .clamp(%rip), $A3 ++ movdqa $A3, $r_store ++ movdqa $B3, $s_store ++ # Hash ++ mov %r8, $itr2 ++ call poly_hash_ad_internal\n"; ++ &xor_stream($A2,$B2,$C2,$D2,"0*16"); ++ &xor_stream($A1,$B1,$C1,$D1,"4*16"); $code.=" ++ cmp \$12*16, $inl ++ ja 1f ++ mov \$8*16, $itr1 ++ sub \$8*16, $inl ++ lea 8*16($inp), $inp ++ jmp seal_sse_128_seal_hash ++1: \n"; ++ &xor_stream($A0, $B0, $C0, $D0, "8*16"); $code.=" ++ mov \$12*16, $itr1 ++ sub \$12*16, $inl ++ lea 12*16($inp), $inp ++ mov \$2, $itr1 ++ mov \$8, $itr2 ++ cmp \$4*16, $inl ++ jbe seal_sse_tail_64 ++ cmp \$8*16, $inl ++ jbe seal_sse_tail_128 ++ cmp \$12*16, $inl ++ jbe seal_sse_tail_192 ++ ++1: \n"; ++ # The main loop ++ &prep_state(4); $code.=" ++2: \n"; ++ &emit_body(20); ++ &poly_add("0($oup)"); ++ &emit_body(20); ++ &poly_stage1(); ++ &emit_body(20); ++ &poly_stage2(); ++ &emit_body(20); ++ &poly_stage3(); ++ &emit_body(20); ++ &poly_reduce_stage(); ++ foreach $l (@loop_body) {$code.=$l."\n";} ++ @loop_body = split /\n/, $chacha_body; $code.=" ++ lea 16($oup), $oup ++ dec $itr2 ++ jge 2b\n"; ++ &poly_add("0*8($oup)"); ++ &poly_mul(); $code.=" ++ lea 16($oup), $oup ++ dec $itr1 ++ jg 2b\n"; ++ ++ &finalize_state(4);$code.=" ++ movdqa $D2, $tmp_store\n"; ++ &xor_stream_using_temp($A3,$B3,$C3,$D3,0*16,$D2); $code.=" ++ movdqa $tmp_store, $D2\n"; ++ &xor_stream($A2,$B2,$C2,$D2, 4*16); ++ &xor_stream($A1,$B1,$C1,$D1, 8*16); $code.=" ++ cmp \$16*16, $inl ++ ja 3f ++ ++ mov \$12*16, $itr1 ++ sub \$12*16, $inl ++ lea 12*16($inp), $inp ++ jmp seal_sse_128_seal_hash ++3: \n"; ++ &xor_stream($A0,$B0,$C0,$D0,"12*16"); $code.=" ++ lea 16*16($inp), $inp ++ sub \$16*16, $inl ++ mov \$6, $itr1 ++ mov \$4, $itr2 ++ cmp \$12*16, $inl ++ jg 1b ++ mov $inl, $itr1 ++ test $inl, $inl ++ je seal_sse_128_seal_hash ++ mov \$6, $itr1 ++ cmp \$4*16, $inl ++ jg 3f ++############################################################################### ++seal_sse_tail_64:\n"; ++ &prep_state(1); $code.=" ++1: \n"; ++ &poly_add("0($oup)"); ++ &poly_mul(); $code.=" ++ lea 16($oup), $oup ++2: \n"; ++ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); ++ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); ++ &poly_add("0($oup)"); ++ &poly_mul(); $code.=" ++ lea 16($oup), $oup ++ dec $itr1 ++ jg 1b ++ dec $itr2 ++ jge 2b\n"; ++ &finalize_state(1); $code.=" ++ jmp seal_sse_128_seal ++3: ++ cmp \$8*16, $inl ++ jg 3f ++############################################################################### ++seal_sse_tail_128:\n"; ++ &prep_state(2); $code.=" ++1: \n"; ++ &poly_add("0($oup)"); ++ &poly_mul(); $code.=" ++ lea 16($oup), $oup ++2: \n"; ++ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); ++ &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); ++ &poly_add("0($oup)"); ++ &poly_mul(); ++ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); ++ &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); $code.=" ++ lea 16($oup), $oup ++ dec $itr1 ++ jg 1b ++ dec $itr2 ++ jge 2b\n"; ++ &finalize_state(2); ++ &xor_stream($A1,$B1,$C1,$D1,0*16); $code.=" ++ mov \$4*16, $itr1 ++ sub \$4*16, $inl ++ lea 4*16($inp), $inp ++ jmp seal_sse_128_seal_hash ++3: ++############################################################################### ++seal_sse_tail_192:\n"; ++ &prep_state(3); $code.=" ++1: \n"; ++ &poly_add("0($oup)"); ++ &poly_mul(); $code.=" ++ lea 16($oup), $oup ++2: \n"; ++ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); ++ &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); ++ &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); ++ &poly_add("0($oup)"); ++ &poly_mul(); ++ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); ++ &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); ++ &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" ++ lea 16($oup), $oup ++ dec $itr1 ++ jg 1b ++ dec $itr2 ++ jge 2b\n"; ++ &finalize_state(3); ++ &xor_stream($A2,$B2,$C2,$D2,0*16); ++ &xor_stream($A1,$B1,$C1,$D1,4*16); $code.=" ++ mov \$8*16, $itr1 ++ sub \$8*16, $inl ++ lea 8*16($inp), $inp ++############################################################################### ++seal_sse_128_seal_hash: ++ cmp \$16, $itr1 ++ jb seal_sse_128_seal\n"; ++ &poly_add("0($oup)"); ++ &poly_mul(); $code.=" ++ sub \$16, $itr1 ++ lea 16($oup), $oup ++ jmp seal_sse_128_seal_hash ++ ++seal_sse_128_seal: ++ cmp \$16, $inl ++ jb seal_sse_tail_16 ++ sub \$16, $inl ++ # Load for decryption ++ movdqu 0*16($inp), $T0 ++ pxor $T0, $A0 ++ movdqu $A0, 0*16($oup) ++ # Then hash ++ add 0*8($oup), $acc0 ++ adc 1*8($oup), $acc1 ++ adc \$1, $acc2 ++ lea 1*16($inp), $inp ++ lea 1*16($oup), $oup\n"; ++ &poly_mul(); $code.=" ++ # Shift the stream left ++ movdqa $B0, $A0 ++ movdqa $C0, $B0 ++ movdqa $D0, $C0 ++ movdqa $A1, $D0 ++ movdqa $B1, $A1 ++ movdqa $C1, $B1 ++ movdqa $D1, $C1 ++ jmp seal_sse_128_seal ++ ++seal_sse_tail_16: ++ test $inl, $inl ++ jz seal_sse_finalize ++ # We can only load the PT one byte at a time to avoid buffer overread ++ mov $inl, $itr2 ++ shl \$4, $itr2 ++ lea .and_masks(%rip), $t0 ++ mov $inl, $itr1 ++ lea -1($inp, $inl), $inp ++ pxor $T3, $T3 ++1: ++ pslldq \$1, $T3 ++ pinsrb \$0, ($inp), $T3 ++ lea -1($inp), $inp ++ dec $itr1 ++ jne 1b ++ pxor $A0, $T3 ++ movdqu $T3, ($oup) ++ pand -16($t0, $itr2), $T3 ++ movq $T3, $t0 ++ pextrq \$1, $T3, $t1 ++ add $t0, $acc0 ++ adc $t1, $acc1 ++ adc \$1, $acc2 ++ lea ($inl, $oup), $oup\n"; ++ &poly_mul(); $code.=" ++seal_sse_finalize:\n"; ++ &poly_add($len_store); ++ &poly_mul(); $code.=" ++ # Final reduce ++ mov $acc0, $t0 ++ mov $acc1, $t1 ++ mov $acc2, $t2 ++ sub \$-5, $acc0 ++ sbb \$-1, $acc1 ++ sbb \$3, $acc2 ++ cmovc $t0, $acc0 ++ cmovc $t1, $acc1 ++ cmovc $t2, $acc2 ++ # Add in s part of the key ++ add 0+$s_store, $acc0 ++ adc 8+$s_store, $acc1 ++ mov $acc0, 0*8($oup) ++ mov $acc1, 1*8($oup) ++ add \$288 + 32, %rsp ++ pop %r15 ++ pop %r14 ++ pop %r13 ++ pop %r12 ++ pop %rbx ++ pop %rbp ++ ret ++################################################################################ ++seal_sse_128: ++ movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2 ++ movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2 ++ movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2 ++ movdqu 2*16($keyp), $D2 ++ movdqa $D2, $D0\npaddd .sse_inc(%rip), $D0 ++ movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1 ++ movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D0, $T3 ++ mov \$10, $acc0 ++1:\n"; ++ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); ++ &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); ++ &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); ++ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); ++ &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); ++ &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" ++ dec $acc0 ++ jnz 1b ++ paddd .chacha20_consts(%rip), $A0 ++ paddd .chacha20_consts(%rip), $A1 ++ paddd .chacha20_consts(%rip), $A2 ++ paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2 ++ paddd $T2, $C0\npaddd $T2, $C1 ++ paddd $T3, $D0 ++ paddd .sse_inc(%rip), $T3 ++ paddd $T3, $D1 ++ # Clamp and store the key ++ pand .clamp(%rip), $A2 ++ movdqa $A2, $r_store ++ movdqa $B2, $s_store ++ # Hash ++ mov %r8, $itr2 ++ call poly_hash_ad_internal ++ jmp seal_sse_128_seal ++.size chacha20_poly1305_seal, .-chacha20_poly1305_seal\n"; ++} ++ ++if ($avx>1) { ++ ++($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%ymm$_",(0..15)); ++my ($A0x,$A1x,$A2x,$A3x,$B0x,$B1x,$B2x,$B3x,$C0x,$C1x,$C2x,$C3x,$D0x,$D1x,$D2x,$D3x)=map("%xmm$_",(0..15)); ++($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3); ++$state1_store="2*32(%rbp)"; ++$state2_store="3*32(%rbp)"; ++$tmp_store="4*32(%rbp)"; ++$ctr0_store="5*32(%rbp)"; ++$ctr1_store="6*32(%rbp)"; ++$ctr2_store="7*32(%rbp)"; ++$ctr3_store="8*32(%rbp)"; ++ ++sub chacha_qr_avx2 { ++my ($a,$b,$c,$d,$t,$dir)=@_; ++$code.=<<___ if ($dir =~ /store/); ++ vmovdqa $t, $tmp_store ++___ ++$code.=<<___; ++ vpaddd $b, $a, $a ++ vpxor $a, $d, $d ++ vpshufb .rol16(%rip), $d, $d ++ vpaddd $d, $c, $c ++ vpxor $c, $b, $b ++ vpsrld \$20, $b, $t ++ vpslld \$12, $b, $b ++ vpxor $t, $b, $b ++ vpaddd $b, $a, $a ++ vpxor $a, $d, $d ++ vpshufb .rol8(%rip), $d, $d ++ vpaddd $d, $c, $c ++ vpxor $c, $b, $b ++ vpslld \$7, $b, $t ++ vpsrld \$25, $b, $b ++ vpxor $t, $b, $b ++___ ++$code.=<<___ if ($dir =~ /left/); ++ vpalignr \$12, $d, $d, $d ++ vpalignr \$8, $c, $c, $c ++ vpalignr \$4, $b, $b, $b ++___ ++$code.=<<___ if ($dir =~ /right/); ++ vpalignr \$4, $d, $d, $d ++ vpalignr \$8, $c, $c, $c ++ vpalignr \$12, $b, $b, $b ++___ ++$code.=<<___ if ($dir =~ /load/); ++ vmovdqa $tmp_store, $t ++___ ++} ++ ++sub prep_state_avx2 { ++my ($n)=@_; ++$code.=<<___; ++ vmovdqa .chacha20_consts(%rip), $A0 ++ vmovdqa $state1_store, $B0 ++ vmovdqa $state2_store, $C0 ++___ ++$code.=<<___ if ($n ge 2); ++ vmovdqa $A0, $A1 ++ vmovdqa $B0, $B1 ++ vmovdqa $C0, $C1 ++___ ++$code.=<<___ if ($n ge 3); ++ vmovdqa $A0, $A2 ++ vmovdqa $B0, $B2 ++ vmovdqa $C0, $C2 ++___ ++$code.=<<___ if ($n ge 4); ++ vmovdqa $A0, $A3 ++ vmovdqa $B0, $B3 ++ vmovdqa $C0, $C3 ++___ ++$code.=<<___ if ($n eq 1); ++ vmovdqa .avx2_inc(%rip), $D0 ++ vpaddd $ctr0_store, $D0, $D0 ++ vmovdqa $D0, $ctr0_store ++___ ++$code.=<<___ if ($n eq 2); ++ vmovdqa .avx2_inc(%rip), $D0 ++ vpaddd $ctr0_store, $D0, $D1 ++ vpaddd $D1, $D0, $D0 ++ vmovdqa $D0, $ctr0_store ++ vmovdqa $D1, $ctr1_store ++___ ++$code.=<<___ if ($n eq 3); ++ vmovdqa .avx2_inc(%rip), $D0 ++ vpaddd $ctr0_store, $D0, $D2 ++ vpaddd $D2, $D0, $D1 ++ vpaddd $D1, $D0, $D0 ++ vmovdqa $D0, $ctr0_store ++ vmovdqa $D1, $ctr1_store ++ vmovdqa $D2, $ctr2_store ++___ ++$code.=<<___ if ($n eq 4); ++ vmovdqa .avx2_inc(%rip), $D0 ++ vpaddd $ctr0_store, $D0, $D3 ++ vpaddd $D3, $D0, $D2 ++ vpaddd $D2, $D0, $D1 ++ vpaddd $D1, $D0, $D0 ++ vmovdqa $D3, $ctr3_store ++ vmovdqa $D2, $ctr2_store ++ vmovdqa $D1, $ctr1_store ++ vmovdqa $D0, $ctr0_store ++___ ++} ++ ++sub finalize_state_avx2 { ++my ($n)=@_; ++$code.=<<___ if ($n eq 4); ++ vpaddd .chacha20_consts(%rip), $A3, $A3 ++ vpaddd $state1_store, $B3, $B3 ++ vpaddd $state2_store, $C3, $C3 ++ vpaddd $ctr3_store, $D3, $D3 ++___ ++$code.=<<___ if ($n ge 3); ++ vpaddd .chacha20_consts(%rip), $A2, $A2 ++ vpaddd $state1_store, $B2, $B2 ++ vpaddd $state2_store, $C2, $C2 ++ vpaddd $ctr2_store, $D2, $D2 ++___ ++$code.=<<___ if ($n ge 2); ++ vpaddd .chacha20_consts(%rip), $A1, $A1 ++ vpaddd $state1_store, $B1, $B1 ++ vpaddd $state2_store, $C1, $C1 ++ vpaddd $ctr1_store, $D1, $D1 ++___ ++$code.=<<___; ++ vpaddd .chacha20_consts(%rip), $A0, $A0 ++ vpaddd $state1_store, $B0, $B0 ++ vpaddd $state2_store, $C0, $C0 ++ vpaddd $ctr0_store, $D0, $D0 ++___ ++} ++ ++sub xor_stream_avx2 { ++my ($A, $B, $C, $D, $offset, $hlp)=@_; ++$code.=<<___; ++ vperm2i128 \$0x02, $A, $B, $hlp ++ vperm2i128 \$0x13, $A, $B, $B ++ vperm2i128 \$0x02, $C, $D, $A ++ vperm2i128 \$0x13, $C, $D, $C ++ vpxor 0*32+$offset($inp), $hlp, $hlp ++ vpxor 1*32+$offset($inp), $A, $A ++ vpxor 2*32+$offset($inp), $B, $B ++ vpxor 3*32+$offset($inp), $C, $C ++ vmovdqu $hlp, 0*32+$offset($oup) ++ vmovdqu $A, 1*32+$offset($oup) ++ vmovdqu $B, 2*32+$offset($oup) ++ vmovdqu $C, 3*32+$offset($oup) ++___ ++} ++ ++sub finish_stream_avx2 { ++my ($A, $B, $C, $D, $hlp)=@_; ++$code.=<<___; ++ vperm2i128 \$0x13, $A, $B, $hlp ++ vperm2i128 \$0x02, $A, $B, $A ++ vperm2i128 \$0x02, $C, $D, $B ++ vperm2i128 \$0x13, $C, $D, $D ++ vmovdqa $hlp, $C ++___ ++} ++ ++sub poly_stage1_mulx { ++$code.=<<___; ++ mov 0+$r_store, %rdx ++ mov %rdx, $t2 ++ mulx $acc0, $t0, $t1 ++ mulx $acc1, %rax, %rdx ++ imul $acc2, $t2 ++ add %rax, $t1 ++ adc %rdx, $t2 ++___ ++} ++ ++sub poly_stage2_mulx { ++$code.=<<___; ++ mov 8+$r_store, %rdx ++ mulx $acc0, $acc0, %rax ++ add $acc0, $t1 ++ mulx $acc1, $acc1, $t3 ++ adc $acc1, $t2 ++ adc \$0, $t3 ++ imul $acc2, %rdx ++___ ++} ++ ++sub poly_stage3_mulx { ++$code.=<<___; ++ add %rax, $t2 ++ adc %rdx, $t3 ++___ ++} ++ ++sub poly_mul_mulx { ++ &poly_stage1_mulx(); ++ &poly_stage2_mulx(); ++ &poly_stage3_mulx(); ++ &poly_reduce_stage(); ++} ++ ++sub gen_chacha_round_avx2 { ++my ($rot1, $rot2, $shift)=@_; ++my $round=""; ++$round=$round ."vmovdqa $C0, $tmp_store\n" if ($rot1 eq 20); ++$round=$round ."vmovdqa $rot2, $C0 ++ vpaddd $B3, $A3, $A3 ++ vpaddd $B2, $A2, $A2 ++ vpaddd $B1, $A1, $A1 ++ vpaddd $B0, $A0, $A0 ++ vpxor $A3, $D3, $D3 ++ vpxor $A2, $D2, $D2 ++ vpxor $A1, $D1, $D1 ++ vpxor $A0, $D0, $D0 ++ vpshufb $C0, $D3, $D3 ++ vpshufb $C0, $D2, $D2 ++ vpshufb $C0, $D1, $D1 ++ vpshufb $C0, $D0, $D0 ++ vmovdqa $tmp_store, $C0 ++ vpaddd $D3, $C3, $C3 ++ vpaddd $D2, $C2, $C2 ++ vpaddd $D1, $C1, $C1 ++ vpaddd $D0, $C0, $C0 ++ vpxor $C3, $B3, $B3 ++ vpxor $C2, $B2, $B2 ++ vpxor $C1, $B1, $B1 ++ vpxor $C0, $B0, $B0 ++ vmovdqa $C0, $tmp_store ++ vpsrld \$$rot1, $B3, $C0 ++ vpslld \$32-$rot1, $B3, $B3 ++ vpxor $C0, $B3, $B3 ++ vpsrld \$$rot1, $B2, $C0 ++ vpslld \$32-$rot1, $B2, $B2 ++ vpxor $C0, $B2, $B2 ++ vpsrld \$$rot1, $B1, $C0 ++ vpslld \$32-$rot1, $B1, $B1 ++ vpxor $C0, $B1, $B1 ++ vpsrld \$$rot1, $B0, $C0 ++ vpslld \$32-$rot1, $B0, $B0 ++ vpxor $C0, $B0, $B0\n"; ++($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/); ++($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/); ++$round=$round ."vmovdqa $tmp_store, $C0 ++ vpalignr \$$s1, $B3, $B3, $B3 ++ vpalignr \$$s2, $C3, $C3, $C3 ++ vpalignr \$$s3, $D3, $D3, $D3 ++ vpalignr \$$s1, $B2, $B2, $B2 ++ vpalignr \$$s2, $C2, $C2, $C2 ++ vpalignr \$$s3, $D2, $D2, $D2 ++ vpalignr \$$s1, $B1, $B1, $B1 ++ vpalignr \$$s2, $C1, $C1, $C1 ++ vpalignr \$$s3, $D1, $D1, $D1 ++ vpalignr \$$s1, $B0, $B0, $B0 ++ vpalignr \$$s2, $C0, $C0, $C0 ++ vpalignr \$$s3, $D0, $D0, $D0\n" ++if (($shift =~ /left/) || ($shift =~ /right/)); ++return $round; ++}; ++ ++$chacha_body = &gen_chacha_round_avx2(20, ".rol16(%rip)") . ++ &gen_chacha_round_avx2(25, ".rol8(%rip)", "left") . ++ &gen_chacha_round_avx2(20, ".rol16(%rip)") . ++ &gen_chacha_round_avx2(25, ".rol8(%rip)", "right"); ++ ++@loop_body = split /\n/, $chacha_body; ++ ++$code.=" ++############################################################################### ++.type chacha20_poly1305_open_avx2,\@function,2 ++.align 64 ++chacha20_poly1305_open_avx2: ++ vzeroupper ++ vmovdqa .chacha20_consts(%rip), $A0 ++ vbroadcasti128 0*16($keyp), $B0 ++ vbroadcasti128 1*16($keyp), $C0 ++ vbroadcasti128 2*16($keyp), $D0 ++ vpaddd .avx2_init(%rip), $D0, $D0 ++ cmp \$6*32, $inl ++ jbe open_avx2_192 ++ cmp \$10*32, $inl ++ jbe open_avx2_320 ++ ++ vmovdqa $B0, $state1_store ++ vmovdqa $C0, $state2_store ++ vmovdqa $D0, $ctr0_store ++ mov \$10, $acc0 ++1: \n"; ++ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); ++ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" ++ dec $acc0 ++ jne 1b ++ vpaddd .chacha20_consts(%rip), $A0, $A0 ++ vpaddd $state1_store, $B0, $B0 ++ vpaddd $state2_store, $C0, $C0 ++ vpaddd $ctr0_store, $D0, $D0 ++ ++ vperm2i128 \$0x02, $A0, $B0, $T0 ++ # Clamp and store key ++ vpand .clamp(%rip), $T0, $T0 ++ vmovdqa $T0, $r_store ++ # Stream for the first 64 bytes ++ vperm2i128 \$0x13, $A0, $B0, $A0 ++ vperm2i128 \$0x13, $C0, $D0, $B0 ++ # Hash AD + first 64 bytes ++ mov %r8, $itr2 ++ call poly_hash_ad_internal ++ xor $itr1, $itr1 ++ # Hash first 64 bytes ++1: \n"; ++ &poly_add("0($inp, $itr1)"); ++ &poly_mul(); $code.=" ++ add \$16, $itr1 ++ cmp \$2*32, $itr1 ++ jne 1b ++ # Decrypt first 64 bytes ++ vpxor 0*32($inp), $A0, $A0 ++ vpxor 1*32($inp), $B0, $B0 ++ vmovdqu $A0, 0*32($oup) ++ vmovdqu $B0, 1*32($oup) ++ lea 2*32($inp), $inp ++ lea 2*32($oup), $oup ++ sub \$2*32, $inl ++1: ++ # Hash and decrypt 512 bytes each iteration ++ cmp \$16*32, $inl ++ jb 3f\n"; ++ &prep_state_avx2(4); $code.=" ++ xor $itr1, $itr1 ++2: \n"; ++ &poly_add("0*8($inp, $itr1)"); ++ &emit_body(10); ++ &poly_stage1_mulx(); ++ &emit_body(9); ++ &poly_stage2_mulx(); ++ &emit_body(12); ++ &poly_stage3_mulx(); ++ &emit_body(10); ++ &poly_reduce_stage(); ++ &emit_body(9); ++ &poly_add("2*8($inp, $itr1)"); ++ &emit_body(8); ++ &poly_stage1_mulx(); ++ &emit_body(18); ++ &poly_stage2_mulx(); ++ &emit_body(18); ++ &poly_stage3_mulx(); ++ &emit_body(9); ++ &poly_reduce_stage(); ++ &emit_body(8); ++ &poly_add("4*8($inp, $itr1)"); $code.=" ++ lea 6*8($itr1), $itr1\n"; ++ &emit_body(18); ++ &poly_stage1_mulx(); ++ &emit_body(8); ++ &poly_stage2_mulx(); ++ &emit_body(8); ++ &poly_stage3_mulx(); ++ &emit_body(18); ++ &poly_reduce_stage(); ++ foreach $l (@loop_body) {$code.=$l."\n";} ++ @loop_body = split /\n/, $chacha_body; $code.=" ++ cmp \$10*6*8, $itr1 ++ jne 2b\n"; ++ &finalize_state_avx2(4); $code.=" ++ vmovdqa $A0, $tmp_store\n"; ++ &poly_add("10*6*8($inp)"); ++ &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" ++ vmovdqa $tmp_store, $A0\n"; ++ &poly_mul(); ++ &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); ++ &poly_add("10*6*8+2*8($inp)"); ++ &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); ++ &poly_mul(); ++ &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.=" ++ lea 16*32($inp), $inp ++ lea 16*32($oup), $oup ++ sub \$16*32, $inl ++ jmp 1b ++3: ++ test $inl, $inl ++ vzeroupper ++ je open_sse_finalize ++3: ++ cmp \$4*32, $inl ++ ja 3f\n"; ++############################################################################### ++ # 1-128 bytes left ++ &prep_state_avx2(1); $code.=" ++ xor $itr2, $itr2 ++ mov $inl, $itr1 ++ and \$-16, $itr1 ++ test $itr1, $itr1 ++ je 2f ++1: \n"; ++ &poly_add("0*8($inp, $itr2)"); ++ &poly_mul(); $code.=" ++2: ++ add \$16, $itr2\n"; ++ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); ++ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" ++ cmp $itr1, $itr2 ++ jb 1b ++ cmp \$160, $itr2 ++ jne 2b\n"; ++ &finalize_state_avx2(1); ++ &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" ++ jmp open_avx2_tail_loop ++3: ++ cmp \$8*32, $inl ++ ja 3f\n"; ++############################################################################### ++ # 129-256 bytes left ++ &prep_state_avx2(2); $code.=" ++ mov $inl, $tmp_store ++ mov $inl, $itr1 ++ sub \$4*32, $itr1 ++ shr \$4, $itr1 ++ mov \$10, $itr2 ++ cmp \$10, $itr1 ++ cmovg $itr2, $itr1 ++ mov $inp, $inl ++ xor $itr2, $itr2 ++1: \n"; ++ &poly_add("0*8($inl)"); ++ &poly_mul_mulx(); $code.=" ++ lea 16($inl), $inl ++2: \n"; ++ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); ++ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); $code.=" ++ inc $itr2\n"; ++ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); ++ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); ++ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" ++ cmp $itr1, $itr2 ++ jb 1b ++ cmp \$10, $itr2 ++ jne 2b ++ mov $inl, $itr2 ++ sub $inp, $inl ++ mov $inl, $itr1 ++ mov $tmp_store, $inl ++1: ++ add \$16, $itr1 ++ cmp $inl, $itr1 ++ jg 1f\n"; ++ &poly_add("0*8($itr2)"); ++ &poly_mul_mulx(); $code.=" ++ lea 16($itr2), $itr2 ++ jmp 1b ++1: \n"; ++ &finalize_state_avx2(2); ++ &xor_stream_avx2($A1, $B1, $C1, $D1, 0*32, $T0); ++ &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.=" ++ lea 4*32($inp), $inp ++ lea 4*32($oup), $oup ++ sub \$4*32, $inl ++ jmp open_avx2_tail_loop ++3: ++ cmp \$12*32, $inl ++ ja 3f\n"; ++############################################################################### ++ # 257-383 bytes left ++ &prep_state_avx2(3); $code.=" ++ mov $inl, $tmp_store ++ mov $inl, $itr1 ++ sub \$8*32, $itr1 ++ shr \$4, $itr1 ++ add \$6, $itr1 ++ mov \$10, $itr2 ++ cmp \$10, $itr1 ++ cmovg $itr2, $itr1 ++ mov $inp, $inl ++ xor $itr2, $itr2 ++1: \n"; ++ &poly_add("0*8($inl)"); ++ &poly_mul_mulx(); $code.=" ++ lea 16($inl), $inl ++2: \n"; ++ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); ++ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); ++ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); ++ &poly_add("0*8($inl)"); ++ &poly_mul(); $code.=" ++ lea 16($inl), $inl ++ inc $itr2\n"; ++ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); ++ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); ++ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" ++ cmp $itr1, $itr2 ++ jb 1b ++ cmp \$10, $itr2 ++ jne 2b ++ mov $inl, $itr2 ++ sub $inp, $inl ++ mov $inl, $itr1 ++ mov $tmp_store, $inl ++1: ++ add \$16, $itr1 ++ cmp $inl, $itr1 ++ jg 1f\n"; ++ &poly_add("0*8($itr2)"); ++ &poly_mul_mulx(); $code.=" ++ lea 16($itr2), $itr2 ++ jmp 1b ++1: \n"; ++ &finalize_state_avx2(3); ++ &xor_stream_avx2($A2, $B2, $C2, $D2, 0*32, $T0); ++ &xor_stream_avx2($A1, $B1, $C1, $D1, 4*32, $T0); ++ &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.=" ++ lea 8*32($inp), $inp ++ lea 8*32($oup), $oup ++ sub \$8*32, $inl ++ jmp open_avx2_tail_loop ++3: \n"; ++############################################################################### ++ # 384-512 bytes left ++ &prep_state_avx2(4); $code.=" ++ xor $itr1, $itr1 ++ mov $inp, $itr2 ++1: \n"; ++ &poly_add("0*8($itr2)"); ++ &poly_mul(); $code.=" ++ lea 2*8($itr2), $itr2 ++2: \n"; ++ &emit_body(37); ++ &poly_add("0*8($itr2)"); ++ &poly_mul_mulx(); ++ &emit_body(48); ++ &poly_add("2*8($itr2)"); ++ &poly_mul_mulx(); $code.=" ++ lea 4*8($itr2), $itr2\n"; ++ foreach $l (@loop_body) {$code.=$l."\n";} ++ @loop_body = split /\n/, $chacha_body; $code.=" ++ inc $itr1 ++ cmp \$4, $itr1 ++ jl 1b ++ cmp \$10, $itr1 ++ jne 2b ++ mov $inl, $itr1 ++ sub \$12*32, $itr1 ++ and \$-16, $itr1 ++1: ++ test $itr1, $itr1 ++ je 1f\n"; ++ &poly_add("0*8($itr2)"); ++ &poly_mul_mulx(); $code.=" ++ lea 2*8($itr2), $itr2 ++ sub \$2*8, $itr1 ++ jmp 1b ++1: \n"; ++ &finalize_state_avx2(4); $code.=" ++ vmovdqa $A0, $tmp_store\n"; ++ &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" ++ vmovdqa $tmp_store, $A0\n"; ++ &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); ++ &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); ++ &finish_stream_avx2($A0, $B0, $C0, $D0, $A3); $code.=" ++ lea 12*32($inp), $inp ++ lea 12*32($oup), $oup ++ sub \$12*32, $inl ++open_avx2_tail_loop: ++ cmp \$32, $inl ++ jb open_avx2_tail ++ sub \$32, $inl ++ vpxor ($inp), $A0, $A0 ++ vmovdqu $A0, ($oup) ++ lea 1*32($inp), $inp ++ lea 1*32($oup), $oup ++ vmovdqa $B0, $A0 ++ vmovdqa $C0, $B0 ++ vmovdqa $D0, $C0 ++ jmp open_avx2_tail_loop ++open_avx2_tail: ++ cmp \$16, $inl ++ vmovdqa $A0x, $A1x ++ jb 1f ++ sub \$16, $inl ++ #load for decryption ++ vpxor ($inp), $A0x, $A1x ++ vmovdqu $A1x, ($oup) ++ lea 1*16($inp), $inp ++ lea 1*16($oup), $oup ++ vperm2i128 \$0x11, $A0, $A0, $A0 ++ vmovdqa $A0x, $A1x ++1: ++ vzeroupper ++ jmp open_sse_tail_16 ++############################################################################### ++open_avx2_192: ++ vmovdqa $A0, $A1 ++ vmovdqa $A0, $A2 ++ vmovdqa $B0, $B1 ++ vmovdqa $B0, $B2 ++ vmovdqa $C0, $C1 ++ vmovdqa $C0, $C2 ++ vpaddd .avx2_inc(%rip), $D0, $D1 ++ vmovdqa $D0, $T2 ++ vmovdqa $D1, $T3 ++ mov \$10, $acc0 ++1: \n"; ++ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); ++ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); ++ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); ++ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.=" ++ dec $acc0 ++ jne 1b ++ vpaddd $A2, $A0, $A0 ++ vpaddd $A2, $A1, $A1 ++ vpaddd $B2, $B0, $B0 ++ vpaddd $B2, $B1, $B1 ++ vpaddd $C2, $C0, $C0 ++ vpaddd $C2, $C1, $C1 ++ vpaddd $T2, $D0, $D0 ++ vpaddd $T3, $D1, $D1 ++ vperm2i128 \$0x02, $A0, $B0, $T0 ++ # Clamp and store the key ++ vpand .clamp(%rip), $T0, $T0 ++ vmovdqa $T0, $r_store ++ # Stream for up to 192 bytes ++ vperm2i128 \$0x13, $A0, $B0, $A0 ++ vperm2i128 \$0x13, $C0, $D0, $B0 ++ vperm2i128 \$0x02, $A1, $B1, $C0 ++ vperm2i128 \$0x02, $C1, $D1, $D0 ++ vperm2i128 \$0x13, $A1, $B1, $A1 ++ vperm2i128 \$0x13, $C1, $D1, $B1 ++open_avx2_short: ++ mov %r8, $itr2 ++ call poly_hash_ad_internal ++open_avx2_hash_and_xor_loop: ++ cmp \$32, $inl ++ jb open_avx2_short_tail_32 ++ sub \$32, $inl\n"; ++ # Load + hash ++ &poly_add("0*8($inp)"); ++ &poly_mul(); ++ &poly_add("2*8($inp)"); ++ &poly_mul(); $code.=" ++ # Load + decrypt ++ vpxor ($inp), $A0, $A0 ++ vmovdqu $A0, ($oup) ++ lea 1*32($inp), $inp ++ lea 1*32($oup), $oup ++ # Shift stream ++ vmovdqa $B0, $A0 ++ vmovdqa $C0, $B0 ++ vmovdqa $D0, $C0 ++ vmovdqa $A1, $D0 ++ vmovdqa $B1, $A1 ++ vmovdqa $C1, $B1 ++ vmovdqa $D1, $C1 ++ vmovdqa $A2, $D1 ++ vmovdqa $B2, $A2 ++ jmp open_avx2_hash_and_xor_loop ++open_avx2_short_tail_32: ++ cmp \$16, $inl ++ vmovdqa $A0x, $A1x ++ jb 1f ++ sub \$16, $inl\n"; ++ &poly_add("0*8($inp)"); ++ &poly_mul(); $code.=" ++ vpxor ($inp), $A0x, $A3x ++ vmovdqu $A3x, ($oup) ++ lea 1*16($inp), $inp ++ lea 1*16($oup), $oup ++ vextracti128 \$1, $A0, $A1x ++1: ++ vzeroupper ++ jmp open_sse_tail_16 ++############################################################################### ++open_avx2_320: ++ vmovdqa $A0, $A1 ++ vmovdqa $A0, $A2 ++ vmovdqa $B0, $B1 ++ vmovdqa $B0, $B2 ++ vmovdqa $C0, $C1 ++ vmovdqa $C0, $C2 ++ vpaddd .avx2_inc(%rip), $D0, $D1 ++ vpaddd .avx2_inc(%rip), $D1, $D2 ++ vmovdqa $B0, $T1 ++ vmovdqa $C0, $T2 ++ vmovdqa $D0, $ctr0_store ++ vmovdqa $D1, $ctr1_store ++ vmovdqa $D2, $ctr2_store ++ mov \$10, $acc0 ++1: \n"; ++ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); ++ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); ++ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); ++ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); ++ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); ++ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" ++ dec $acc0 ++ jne 1b ++ vpaddd .chacha20_consts(%rip), $A0, $A0 ++ vpaddd .chacha20_consts(%rip), $A1, $A1 ++ vpaddd .chacha20_consts(%rip), $A2, $A2 ++ vpaddd $T1, $B0, $B0 ++ vpaddd $T1, $B1, $B1 ++ vpaddd $T1, $B2, $B2 ++ vpaddd $T2, $C0, $C0 ++ vpaddd $T2, $C1, $C1 ++ vpaddd $T2, $C2, $C2 ++ vpaddd $ctr0_store, $D0, $D0 ++ vpaddd $ctr1_store, $D1, $D1 ++ vpaddd $ctr2_store, $D2, $D2 ++ vperm2i128 \$0x02, $A0, $B0, $T0 ++ # Clamp and store the key ++ vpand .clamp(%rip), $T0, $T0 ++ vmovdqa $T0, $r_store ++ # Stream for up to 320 bytes ++ vperm2i128 \$0x13, $A0, $B0, $A0 ++ vperm2i128 \$0x13, $C0, $D0, $B0 ++ vperm2i128 \$0x02, $A1, $B1, $C0 ++ vperm2i128 \$0x02, $C1, $D1, $D0 ++ vperm2i128 \$0x13, $A1, $B1, $A1 ++ vperm2i128 \$0x13, $C1, $D1, $B1 ++ vperm2i128 \$0x02, $A2, $B2, $C1 ++ vperm2i128 \$0x02, $C2, $D2, $D1 ++ vperm2i128 \$0x13, $A2, $B2, $A2 ++ vperm2i128 \$0x13, $C2, $D2, $B2 ++ jmp open_avx2_short ++.size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2 ++############################################################################### ++############################################################################### ++.type chacha20_poly1305_seal_avx2,\@function,2 ++.align 64 ++chacha20_poly1305_seal_avx2: ++ vzeroupper ++ vmovdqa .chacha20_consts(%rip), $A0 ++ vbroadcasti128 0*16($keyp), $B0 ++ vbroadcasti128 1*16($keyp), $C0 ++ vbroadcasti128 2*16($keyp), $D0 ++ vpaddd .avx2_init(%rip), $D0, $D0 ++ cmp \$6*32, $inl ++ jbe seal_avx2_192 ++ cmp \$10*32, $inl ++ jbe seal_avx2_320 ++ vmovdqa $A0, $A1 ++ vmovdqa $A0, $A2 ++ vmovdqa $A0, $A3 ++ vmovdqa $B0, $B1 ++ vmovdqa $B0, $B2 ++ vmovdqa $B0, $B3 ++ vmovdqa $B0, $state1_store ++ vmovdqa $C0, $C1 ++ vmovdqa $C0, $C2 ++ vmovdqa $C0, $C3 ++ vmovdqa $C0, $state2_store ++ vmovdqa $D0, $D3 ++ vpaddd .avx2_inc(%rip), $D3, $D2 ++ vpaddd .avx2_inc(%rip), $D2, $D1 ++ vpaddd .avx2_inc(%rip), $D1, $D0 ++ vmovdqa $D0, $ctr0_store ++ vmovdqa $D1, $ctr1_store ++ vmovdqa $D2, $ctr2_store ++ vmovdqa $D3, $ctr3_store ++ mov \$10, $acc0 ++1: \n"; ++ foreach $l (@loop_body) {$code.=$l."\n";} ++ @loop_body = split /\n/, $chacha_body; $code.=" ++ dec $acc0 ++ jnz 1b\n"; ++ &finalize_state_avx2(4); $code.=" ++ vperm2i128 \$0x13, $C3, $D3, $C3 ++ vperm2i128 \$0x02, $A3, $B3, $D3 ++ vperm2i128 \$0x13, $A3, $B3, $A3 ++ vpand .clamp(%rip), $D3, $D3 ++ vmovdqa $D3, $r_store ++ mov %r8, $itr2 ++ call poly_hash_ad_internal ++ # Safely store 320 bytes (otherwise would handle with optimized call) ++ vpxor 0*32($inp), $A3, $A3 ++ vpxor 1*32($inp), $C3, $C3 ++ vmovdqu $A3, 0*32($oup) ++ vmovdqu $C3, 1*32($oup)\n"; ++ &xor_stream_avx2($A2,$B2,$C2,$D2,2*32,$T3); ++ &xor_stream_avx2($A1,$B1,$C1,$D1,6*32,$T3); ++ &finish_stream_avx2($A0,$B0,$C0,$D0,$T3); $code.=" ++ lea 10*32($inp), $inp ++ sub \$10*32, $inl ++ mov \$10*32, $itr1 ++ cmp \$4*32, $inl ++ jbe seal_avx2_hash ++ vpxor 0*32($inp), $A0, $A0 ++ vpxor 1*32($inp), $B0, $B0 ++ vpxor 2*32($inp), $C0, $C0 ++ vpxor 3*32($inp), $D0, $D0 ++ vmovdqu $A0, 10*32($oup) ++ vmovdqu $B0, 11*32($oup) ++ vmovdqu $C0, 12*32($oup) ++ vmovdqu $D0, 13*32($oup) ++ lea 4*32($inp), $inp ++ sub \$4*32, $inl ++ mov \$8, $itr1 ++ mov \$2, $itr2 ++ cmp \$4*32, $inl ++ jbe seal_avx2_tail_128 ++ cmp \$8*32, $inl ++ jbe seal_avx2_tail_256 ++ cmp \$12*32, $inl ++ jbe seal_avx2_tail_384 ++ cmp \$16*32, $inl ++ jbe seal_avx2_tail_512\n"; ++ # We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop ++ &prep_state_avx2(4); ++ foreach $l (@loop_body) {$code.=$l."\n";} ++ @loop_body = split /\n/, $chacha_body; ++ &emit_body(41); ++ @loop_body = split /\n/, $chacha_body; $code.=" ++ sub \$16, $oup ++ mov \$9, $itr1 ++ jmp 4f ++1: \n"; ++ &prep_state_avx2(4); $code.=" ++ mov \$10, $itr1 ++2: \n"; ++ &poly_add("0*8($oup)"); ++ &emit_body(10); ++ &poly_stage1_mulx(); ++ &emit_body(9); ++ &poly_stage2_mulx(); ++ &emit_body(12); ++ &poly_stage3_mulx(); ++ &emit_body(10); ++ &poly_reduce_stage(); $code.=" ++4: \n"; ++ &emit_body(9); ++ &poly_add("2*8($oup)"); ++ &emit_body(8); ++ &poly_stage1_mulx(); ++ &emit_body(18); ++ &poly_stage2_mulx(); ++ &emit_body(18); ++ &poly_stage3_mulx(); ++ &emit_body(9); ++ &poly_reduce_stage(); ++ &emit_body(8); ++ &poly_add("4*8($oup)"); $code.=" ++ lea 6*8($oup), $oup\n"; ++ &emit_body(18); ++ &poly_stage1_mulx(); ++ &emit_body(8); ++ &poly_stage2_mulx(); ++ &emit_body(8); ++ &poly_stage3_mulx(); ++ &emit_body(18); ++ &poly_reduce_stage(); ++ foreach $l (@loop_body) {$code.=$l."\n";} ++ @loop_body = split /\n/, $chacha_body; $code.=" ++ dec $itr1 ++ jne 2b\n"; ++ &finalize_state_avx2(4); $code.=" ++ lea 4*8($oup), $oup ++ vmovdqa $A0, $tmp_store\n"; ++ &poly_add("-4*8($oup)"); ++ &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" ++ vmovdqa $tmp_store, $A0\n"; ++ &poly_mul(); ++ &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); ++ &poly_add("-2*8($oup)"); ++ &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); ++ &poly_mul(); ++ &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.=" ++ lea 16*32($inp), $inp ++ sub \$16*32, $inl ++ cmp \$16*32, $inl ++ jg 1b\n"; ++ &poly_add("0*8($oup)"); ++ &poly_mul(); ++ &poly_add("2*8($oup)"); ++ &poly_mul(); $code.=" ++ lea 4*8($oup), $oup ++ mov \$10, $itr1 ++ xor $itr2, $itr2 ++ cmp \$4*32, $inl ++ ja 3f ++############################################################################### ++seal_avx2_tail_128:\n"; ++ &prep_state_avx2(1); $code.=" ++1: \n"; ++ &poly_add("0($oup)"); ++ &poly_mul(); $code.=" ++ lea 2*8($oup), $oup ++2: \n"; ++ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); ++ &poly_add("0*8($oup)"); ++ &poly_mul(); ++ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); ++ &poly_add("2*8($oup)"); ++ &poly_mul(); $code.=" ++ lea 4*8($oup), $oup ++ dec $itr1 ++ jg 1b ++ dec $itr2 ++ jge 2b\n"; ++ &finalize_state_avx2(1); ++ &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" ++ jmp seal_avx2_short_loop ++3: ++ cmp \$8*32, $inl ++ ja 3f ++############################################################################### ++seal_avx2_tail_256:\n"; ++ &prep_state_avx2(2); $code.=" ++1: \n"; ++ &poly_add("0($oup)"); ++ &poly_mul(); $code.=" ++ lea 2*8($oup), $oup ++2: \n"; ++ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); ++ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); ++ &poly_add("0*8($oup)"); ++ &poly_mul(); ++ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); ++ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); ++ &poly_add("2*8($oup)"); ++ &poly_mul(); $code.=" ++ lea 4*8($oup), $oup ++ dec $itr1 ++ jg 1b ++ dec $itr2 ++ jge 2b\n"; ++ &finalize_state_avx2(2); ++ &xor_stream_avx2($A1,$B1,$C1,$D1,0*32,$T0); ++ &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" ++ mov \$4*32, $itr1 ++ lea 4*32($inp), $inp ++ sub \$4*32, $inl ++ jmp seal_avx2_hash ++3: ++ cmp \$12*32, $inl ++ ja seal_avx2_tail_512 ++############################################################################### ++seal_avx2_tail_384:\n"; ++ &prep_state_avx2(3); $code.=" ++1: \n"; ++ &poly_add("0($oup)"); ++ &poly_mul(); $code.=" ++ lea 2*8($oup), $oup ++2: \n"; ++ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); ++ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); ++ &poly_add("0*8($oup)"); ++ &poly_mul(); ++ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); ++ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); ++ &poly_add("2*8($oup)"); ++ &poly_mul(); ++ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); ++ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" ++ lea 4*8($oup), $oup ++ dec $itr1 ++ jg 1b ++ dec $itr2 ++ jge 2b\n"; ++ &finalize_state_avx2(3); ++ &xor_stream_avx2($A2,$B2,$C2,$D2,0*32,$T0); ++ &xor_stream_avx2($A1,$B1,$C1,$D1,4*32,$T0); ++ &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" ++ mov \$8*32, $itr1 ++ lea 8*32($inp), $inp ++ sub \$8*32, $inl ++ jmp seal_avx2_hash ++############################################################################### ++seal_avx2_tail_512:\n"; ++ &prep_state_avx2(4); $code.=" ++1: \n"; ++ &poly_add("0($oup)"); ++ &poly_mul_mulx(); $code.=" ++ lea 2*8($oup), $oup ++2: \n"; ++ &emit_body(20); ++ &poly_add("0*8($oup)"); ++ &emit_body(20); ++ &poly_stage1_mulx(); ++ &emit_body(20); ++ &poly_stage2_mulx(); ++ &emit_body(20); ++ &poly_stage3_mulx(); ++ &emit_body(20); ++ &poly_reduce_stage(); ++ &emit_body(20); ++ &poly_add("2*8($oup)"); ++ &emit_body(20); ++ &poly_stage1_mulx(); ++ &emit_body(20); ++ &poly_stage2_mulx(); ++ &emit_body(20); ++ &poly_stage3_mulx(); ++ &emit_body(20); ++ &poly_reduce_stage(); ++ foreach $l (@loop_body) {$code.=$l."\n";} ++ @loop_body = split /\n/, $chacha_body; $code.=" ++ lea 4*8($oup), $oup ++ dec $itr1 ++ jg 1b ++ dec $itr2 ++ jge 2b\n"; ++ &finalize_state_avx2(4); $code.=" ++ vmovdqa $A0, $tmp_store\n"; ++ &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" ++ vmovdqa $tmp_store, $A0\n"; ++ &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); ++ &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); ++ &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" ++ mov \$12*32, $itr1 ++ lea 12*32($inp), $inp ++ sub \$12*32, $inl ++ jmp seal_avx2_hash ++################################################################################ ++seal_avx2_320: ++ vmovdqa $A0, $A1 ++ vmovdqa $A0, $A2 ++ vmovdqa $B0, $B1 ++ vmovdqa $B0, $B2 ++ vmovdqa $C0, $C1 ++ vmovdqa $C0, $C2 ++ vpaddd .avx2_inc(%rip), $D0, $D1 ++ vpaddd .avx2_inc(%rip), $D1, $D2 ++ vmovdqa $B0, $T1 ++ vmovdqa $C0, $T2 ++ vmovdqa $D0, $ctr0_store ++ vmovdqa $D1, $ctr1_store ++ vmovdqa $D2, $ctr2_store ++ mov \$10, $acc0 ++1: \n"; ++ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); ++ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); ++ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); ++ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); ++ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); ++ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" ++ dec $acc0 ++ jne 1b ++ vpaddd .chacha20_consts(%rip), $A0, $A0 ++ vpaddd .chacha20_consts(%rip), $A1, $A1 ++ vpaddd .chacha20_consts(%rip), $A2, $A2 ++ vpaddd $T1, $B0, $B0 ++ vpaddd $T1, $B1, $B1 ++ vpaddd $T1, $B2, $B2 ++ vpaddd $T2, $C0, $C0 ++ vpaddd $T2, $C1, $C1 ++ vpaddd $T2, $C2, $C2 ++ vpaddd $ctr0_store, $D0, $D0 ++ vpaddd $ctr1_store, $D1, $D1 ++ vpaddd $ctr2_store, $D2, $D2 ++ vperm2i128 \$0x02, $A0, $B0, $T0 ++ # Clamp and store the key ++ vpand .clamp(%rip), $T0, $T0 ++ vmovdqa $T0, $r_store ++ # Stream for up to 320 bytes ++ vperm2i128 \$0x13, $A0, $B0, $A0 ++ vperm2i128 \$0x13, $C0, $D0, $B0 ++ vperm2i128 \$0x02, $A1, $B1, $C0 ++ vperm2i128 \$0x02, $C1, $D1, $D0 ++ vperm2i128 \$0x13, $A1, $B1, $A1 ++ vperm2i128 \$0x13, $C1, $D1, $B1 ++ vperm2i128 \$0x02, $A2, $B2, $C1 ++ vperm2i128 \$0x02, $C2, $D2, $D1 ++ vperm2i128 \$0x13, $A2, $B2, $A2 ++ vperm2i128 \$0x13, $C2, $D2, $B2 ++ jmp seal_avx2_short ++################################################################################ ++seal_avx2_192: ++ vmovdqa $A0, $A1 ++ vmovdqa $A0, $A2 ++ vmovdqa $B0, $B1 ++ vmovdqa $B0, $B2 ++ vmovdqa $C0, $C1 ++ vmovdqa $C0, $C2 ++ vpaddd .avx2_inc(%rip), $D0, $D1 ++ vmovdqa $D0, $T2 ++ vmovdqa $D1, $T3 ++ mov \$10, $acc0 ++1: \n"; ++ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); ++ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); ++ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); ++ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.=" ++ dec $acc0 ++ jne 1b ++ vpaddd $A2, $A0, $A0 ++ vpaddd $A2, $A1, $A1 ++ vpaddd $B2, $B0, $B0 ++ vpaddd $B2, $B1, $B1 ++ vpaddd $C2, $C0, $C0 ++ vpaddd $C2, $C1, $C1 ++ vpaddd $T2, $D0, $D0 ++ vpaddd $T3, $D1, $D1 ++ vperm2i128 \$0x02, $A0, $B0, $T0 ++ # Clamp and store the key ++ vpand .clamp(%rip), $T0, $T0 ++ vmovdqa $T0, $r_store ++ # Stream for up to 192 bytes ++ vperm2i128 \$0x13, $A0, $B0, $A0 ++ vperm2i128 \$0x13, $C0, $D0, $B0 ++ vperm2i128 \$0x02, $A1, $B1, $C0 ++ vperm2i128 \$0x02, $C1, $D1, $D0 ++ vperm2i128 \$0x13, $A1, $B1, $A1 ++ vperm2i128 \$0x13, $C1, $D1, $B1 ++seal_avx2_short: ++ mov %r8, $itr2 ++ call poly_hash_ad_internal ++ xor $itr1, $itr1 ++seal_avx2_hash: ++ cmp \$16, $itr1 ++ jb seal_avx2_short_loop\n"; ++ &poly_add("0($oup)"); ++ &poly_mul(); $code.=" ++ sub \$16, $itr1 ++ add \$16, $oup ++ jmp seal_avx2_hash ++seal_avx2_short_loop: ++ cmp \$32, $inl ++ jb seal_avx2_short_tail ++ sub \$32, $inl ++ # Encrypt ++ vpxor ($inp), $A0, $A0 ++ vmovdqu $A0, ($oup) ++ lea 1*32($inp), $inp ++ # Load + hash\n"; ++ &poly_add("0*8($oup)"); ++ &poly_mul(); ++ &poly_add("2*8($oup)"); ++ &poly_mul(); $code.=" ++ lea 1*32($oup), $oup ++ # Shift stream ++ vmovdqa $B0, $A0 ++ vmovdqa $C0, $B0 ++ vmovdqa $D0, $C0 ++ vmovdqa $A1, $D0 ++ vmovdqa $B1, $A1 ++ vmovdqa $C1, $B1 ++ vmovdqa $D1, $C1 ++ vmovdqa $A2, $D1 ++ vmovdqa $B2, $A2 ++ jmp seal_avx2_short_loop ++seal_avx2_short_tail: ++ cmp \$16, $inl ++ jb 1f ++ sub \$16, $inl ++ vpxor ($inp), $A0x, $A3x ++ vmovdqu $A3x, ($oup) ++ lea 1*16($inp), $inp\n"; ++ &poly_add("0*8($oup)"); ++ &poly_mul(); $code.=" ++ lea 1*16($oup), $oup ++ vextracti128 \$1, $A0, $A0x ++1: ++ vzeroupper ++ jmp seal_sse_tail_16 ++"; ++} ++ ++$code =~ s/\`([^\`]*)\`/eval $1/gem; ++print $code; ++close STDOUT; +diff --git a/crypto/chacha20_poly1305/asm/chacha20_x86_64.pl b/crypto/chacha20_poly1305/asm/chacha20_x86_64.pl +new file mode 100644 +index 0000000..538af42 +--- /dev/null ++++ b/crypto/chacha20_poly1305/asm/chacha20_x86_64.pl +@@ -0,0 +1,415 @@ ++#!/usr/bin/env perl ++ ++############################################################################## ++# # ++# Copyright 2014 Intel Corporation # ++# # ++# Licensed under the Apache License, Version 2.0 (the "License"); # ++# you may not use this file except in compliance with the License. # ++# You may obtain a copy of the License at # ++# # ++# http://www.apache.org/licenses/LICENSE-2.0 # ++# # ++# Unless required by applicable law or agreed to in writing, software # ++# distributed under the License is distributed on an "AS IS" BASIS, # ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # ++# See the License for the specific language governing permissions and # ++# limitations under the License. # ++# # ++############################################################################## ++# # ++# Developers and authors: # ++# Shay Gueron (1, 2), and Vlad Krasnov (1) # ++# (1) Intel Corporation, Israel Development Center # ++# (2) University of Haifa # ++# # ++# Related work: # ++# M. Goll, S. Gueron, "Vectorization on ChaCha Stream Cipher", IEEE # ++# Proceedings of 11th International Conference on Information # ++# Technology: New Generations (ITNG 2014), 612-615 (2014). # ++# M. Goll, S. Gueron, "Vectorization on Poly1305 Message Authentication Code"# ++# to be published. # ++# A. Langley, chacha20poly1305 for the AEAD head # ++# https://git.openssl.org/gitweb/?p=openssl.git;a=commit;h=9a8646510b3d0a48e950748f7a2aaa12ed40d5e0 # ++############################################################################## ++ ++ ++$flavour = shift; ++$output = shift; ++if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } ++ ++$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or ++die "can't locate x86_64-xlate.pl"; ++ ++open OUT,"| \"$^X\" $xlate $flavour $output"; ++*STDOUT=*OUT; ++ ++if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` ++ =~ /GNU assembler version ([2-9]\.[0-9]+)/) { ++ $avx = ($1>=2.19) + ($1>=2.22); ++} ++ ++if ($win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && ++ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { ++ $avx = ($1>=2.09) + ($1>=2.10); ++} ++ ++if ($win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && ++ `ml64 2>&1` =~ /Version ([0-9]+)\./) { ++ $avx = ($1>=10) + ($1>=11); ++} ++ ++if (`$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) { ++ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 ++ $avx = ($ver>=3.0) + ($ver>=3.01); ++} ++ ++{ ++ ++my ($rol8, $rol16, $state_cdef, $tmp, ++ $v0, $v1, $v2, $v3, $v4, $v5, $v6, $v7, ++ $v8, $v9, $v10, $v11)=map("%xmm$_",(0..15)); ++ ++sub chacha_qr { ++ ++my ($a,$b,$c,$d)=@_; ++$code.=<<___; ++ paddd $b, $a # a += b ++ pxor $a, $d # d ^= a ++ pshufb $rol16, $d # d <<<= 16 ++ ++ paddd $d, $c # c += d ++ pxor $c, $b # b ^= c ++ ++ movdqa $b, $tmp ++ pslld \$12, $tmp ++ psrld \$20, $b ++ pxor $tmp, $b # b <<<= 12 ++ ++ paddd $b, $a # a += b ++ pxor $a, $d # d ^= a ++ pshufb $rol8, $d # d <<<= 8 ++ ++ paddd $d, $c # c += d ++ pxor $c, $b # b ^= c ++ ++ movdqa $b, $tmp ++ pslld \$7, $tmp ++ psrld \$25, $b ++ pxor $tmp, $b # b <<<= 7 ++___ ++ ++} ++ ++$code.=<<___; ++.text ++.align 16 ++chacha20_consts: ++.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' ++.rol8: ++.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 ++.rol16: ++.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 ++.avxInc: ++.quad 1,0 ++___ ++ ++{ ++my ($out, $in, $in_len, $key_ptr, $nr) ++ =("%rdi", "%rsi", "%rdx", "%rcx", "%r8"); ++ ++$code.=<<___; ++.globl chacha_20_core_asm ++.type chacha_20_core_asm ,\@function,2 ++.align 64 ++chacha_20_core_asm: ++ ++ # Init state ++ movdqa .rol8(%rip), $rol8 ++ movdqa .rol16(%rip), $rol16 ++ movdqu 2*16($key_ptr), $state_cdef ++ ++2: ++ cmp \$3*64, $in_len ++ jb 2f ++ ++ movdqa chacha20_consts(%rip), $v0 ++ movdqu 0*16($key_ptr), $v1 ++ movdqu 1*16($key_ptr), $v2 ++ movdqa $state_cdef, $v3 ++ movdqa $v0, $v4 ++ movdqa $v0, $v8 ++ movdqa $v1, $v5 ++ movdqa $v1, $v9 ++ movdqa $v2, $v6 ++ movdqa $v2, $v10 ++ movdqa $v3, $v7 ++ paddd .avxInc(%rip), $v7 ++ movdqa $v7, $v11 ++ paddd .avxInc(%rip), $v11 ++ ++ mov \$10, $nr ++ ++ 1: ++___ ++ &chacha_qr( $v0, $v1, $v2, $v3); ++ &chacha_qr( $v4, $v5, $v6, $v7); ++ &chacha_qr( $v8, $v9,$v10,$v11); ++$code.=<<___; ++ palignr \$4, $v1, $v1 ++ palignr \$8, $v2, $v2 ++ palignr \$12, $v3, $v3 ++ palignr \$4, $v5, $v5 ++ palignr \$8, $v6, $v6 ++ palignr \$12, $v7, $v7 ++ palignr \$4, $v9, $v9 ++ palignr \$8, $v10, $v10 ++ palignr \$12, $v11, $v11 ++___ ++ &chacha_qr( $v0, $v1, $v2, $v3); ++ &chacha_qr( $v4, $v5, $v6, $v7); ++ &chacha_qr( $v8, $v9,$v10,$v11); ++$code.=<<___; ++ palignr \$12, $v1, $v1 ++ palignr \$8, $v2, $v2 ++ palignr \$4, $v3, $v3 ++ palignr \$12, $v5, $v5 ++ palignr \$8, $v6, $v6 ++ palignr \$4, $v7, $v7 ++ palignr \$12, $v9, $v9 ++ palignr \$8, $v10, $v10 ++ palignr \$4, $v11, $v11 ++ dec $nr ++ ++ jnz 1b ++ paddd chacha20_consts(%rip), $v0 ++ paddd chacha20_consts(%rip), $v4 ++ paddd chacha20_consts(%rip), $v8 ++ ++ movdqu 16*0($key_ptr), $tmp ++ paddd $tmp, $v1 ++ paddd $tmp, $v5 ++ paddd $tmp, $v9 ++ ++ movdqu 16*1($key_ptr), $tmp ++ paddd $tmp, $v2 ++ paddd $tmp, $v6 ++ paddd $tmp, $v10 ++ ++ paddd $state_cdef, $v3 ++ paddq .avxInc(%rip), $state_cdef ++ paddd $state_cdef, $v7 ++ paddq .avxInc(%rip), $state_cdef ++ paddd $state_cdef, $v11 ++ paddq .avxInc(%rip), $state_cdef ++ ++ movdqu 16*0($in), $tmp ++ pxor $tmp, $v0 ++ movdqu 16*1($in), $tmp ++ pxor $tmp, $v1 ++ movdqu 16*2($in), $tmp ++ pxor $tmp, $v2 ++ movdqu 16*3($in), $tmp ++ pxor $tmp, $v3 ++ ++ movdqu $v0, 16*0($out) ++ movdqu $v1, 16*1($out) ++ movdqu $v2, 16*2($out) ++ movdqu $v3, 16*3($out) ++ ++ movdqu 16*4($in), $tmp ++ pxor $tmp, $v4 ++ movdqu 16*5($in), $tmp ++ pxor $tmp, $v5 ++ movdqu 16*6($in), $tmp ++ pxor $tmp, $v6 ++ movdqu 16*7($in), $tmp ++ pxor $tmp, $v7 ++ ++ movdqu $v4, 16*4($out) ++ movdqu $v5, 16*5($out) ++ movdqu $v6, 16*6($out) ++ movdqu $v7, 16*7($out) ++ ++ movdqu 16*8($in), $tmp ++ pxor $tmp, $v8 ++ movdqu 16*9($in), $tmp ++ pxor $tmp, $v9 ++ movdqu 16*10($in), $tmp ++ pxor $tmp, $v10 ++ movdqu 16*11($in), $tmp ++ pxor $tmp, $v11 ++ ++ movdqu $v8, 16*8($out) ++ movdqu $v9, 16*9($out) ++ movdqu $v10, 16*10($out) ++ movdqu $v11, 16*11($out) ++ ++ lea 16*12($in), $in ++ lea 16*12($out), $out ++ sub \$16*12, $in_len ++ ++ jmp 2b ++ ++2: ++ cmp \$2*64, $in_len ++ jb 2f ++ ++ movdqa chacha20_consts(%rip), $v0 ++ movdqa chacha20_consts(%rip), $v4 ++ movdqu 16*0($key_ptr), $v1 ++ movdqu 16*0($key_ptr), $v5 ++ movdqu 16*1($key_ptr), $v2 ++ movdqu 16*1($key_ptr), $v6 ++ movdqa $state_cdef, $v3 ++ movdqa $v3, $v7 ++ paddd .avxInc(%rip), $v7 ++ ++ mov \$10, $nr ++ 1: ++___ ++ &chacha_qr($v0,$v1,$v2,$v3); ++ &chacha_qr($v4,$v5,$v6,$v7); ++$code.=<<___; ++ palignr \$4, $v1, $v1 ++ palignr \$8, $v2, $v2 ++ palignr \$12, $v3, $v3 ++ palignr \$4, $v5, $v5 ++ palignr \$8, $v6, $v6 ++ palignr \$12, $v7, $v7 ++___ ++ &chacha_qr($v0,$v1,$v2,$v3); ++ &chacha_qr($v4,$v5,$v6,$v7); ++$code.=<<___; ++ palignr \$12, $v1, $v1 ++ palignr \$8, $v2, $v2 ++ palignr \$4, $v3, $v3 ++ palignr \$12, $v5, $v5 ++ palignr \$8, $v6, $v6 ++ palignr \$4, $v7, $v7 ++ dec $nr ++ jnz 1b ++ ++ paddd chacha20_consts(%rip), $v0 ++ paddd chacha20_consts(%rip), $v4 ++ ++ movdqu 16*0($key_ptr), $tmp ++ paddd $tmp, $v1 ++ paddd $tmp, $v5 ++ ++ movdqu 16*1($key_ptr), $tmp ++ paddd $tmp, $v2 ++ paddd $tmp, $v6 ++ ++ paddd $state_cdef, $v3 ++ paddq .avxInc(%rip), $state_cdef ++ paddd $state_cdef, $v7 ++ paddq .avxInc(%rip), $state_cdef ++ ++ movdqu 16*0($in), $tmp ++ pxor $tmp, $v0 ++ movdqu 16*1($in), $tmp ++ pxor $tmp, $v1 ++ movdqu 16*2($in), $tmp ++ pxor $tmp, $v2 ++ movdqu 16*3($in), $tmp ++ pxor $tmp, $v3 ++ ++ movdqu $v0, 16*0($out) ++ movdqu $v1, 16*1($out) ++ movdqu $v2, 16*2($out) ++ movdqu $v3, 16*3($out) ++ ++ movdqu 16*4($in), $tmp ++ pxor $tmp, $v4 ++ movdqu 16*5($in), $tmp ++ pxor $tmp, $v5 ++ movdqu 16*6($in), $tmp ++ pxor $tmp, $v6 ++ movdqu 16*7($in), $tmp ++ pxor $tmp, $v7 ++ ++ movdqu $v4, 16*4($out) ++ movdqu $v5, 16*5($out) ++ movdqu $v6, 16*6($out) ++ movdqu $v7, 16*7($out) ++ ++ lea 16*8($in), $in ++ lea 16*8($out), $out ++ sub \$16*8, $in_len ++ ++ jmp 2b ++2: ++ cmp \$64, $in_len ++ jb 2f ++ ++ movdqa chacha20_consts(%rip), $v0 ++ movdqu 16*0($key_ptr), $v1 ++ movdqu 16*1($key_ptr), $v2 ++ movdqa $state_cdef, $v3 ++ ++ mov \$10, $nr ++ ++ 1: ++___ ++ &chacha_qr($v0,$v1,$v2,$v3); ++$code.=<<___; ++ palignr \$4, $v1, $v1 ++ palignr \$8, $v2, $v2 ++ palignr \$12, $v3, $v3 ++___ ++ &chacha_qr($v0,$v1,$v2,$v3); ++$code.=<<___; ++ palignr \$12, $v1, $v1 ++ palignr \$8, $v2, $v2 ++ palignr \$4, $v3, $v3 ++ dec $nr ++ jnz 1b ++ ++ paddd chacha20_consts(%rip), $v0 ++ ++ movdqu 16*0($key_ptr), $tmp ++ paddd $tmp, $v1 ++ ++ movdqu 16*1($key_ptr), $tmp ++ paddd $tmp, $v2 ++ ++ paddd $state_cdef, $v3 ++ paddq .avxInc(%rip), $state_cdef ++ ++ movdqu 16*0($in), $tmp ++ pxor $tmp, $v0 ++ movdqu 16*1($in), $tmp ++ pxor $tmp, $v1 ++ movdqu 16*2($in), $tmp ++ pxor $tmp, $v2 ++ movdqu 16*3($in), $tmp ++ pxor $tmp, $v3 ++ ++ movdqu $v0, 16*0($out) ++ movdqu $v1, 16*1($out) ++ movdqu $v2, 16*2($out) ++ movdqu $v3, 16*3($out) ++ ++ lea 16*4($in), $in ++ lea 16*4($out), $out ++ sub \$16*4, $in_len ++ jmp 2b ++ ++2: ++ movdqu $state_cdef, 16*2($key_ptr) ++ ret ++.size chacha_20_core_asm,.-chacha_20_core_asm ++___ ++} ++} ++ ++$code =~ s/\`([^\`]*)\`/eval($1)/gem; ++ ++print $code; ++ ++close STDOUT; +diff --git a/crypto/chacha20_poly1305/asm/poly1305_x86_64.pl b/crypto/chacha20_poly1305/asm/poly1305_x86_64.pl +new file mode 100644 +index 0000000..05e4bc5 +--- /dev/null ++++ b/crypto/chacha20_poly1305/asm/poly1305_x86_64.pl +@@ -0,0 +1,280 @@ ++############################################################################## ++# # ++# Copyright 2016 CloudFlare LTD # ++# # ++# Licensed under the Apache License, Version 2.0 (the "License"); # ++# you may not use this file except in compliance with the License. # ++# You may obtain a copy of the License at # ++# # ++# http://www.apache.org/licenses/LICENSE-2.0 # ++# # ++# Unless required by applicable law or agreed to in writing, software # ++# distributed under the License is distributed on an "AS IS" BASIS, # ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # ++# See the License for the specific language governing permissions and # ++# limitations under the License. # ++# # ++############################################################################## ++# # ++# Author: Vlad Krasnov # ++# # ++############################################################################## ++ ++$flavour = shift; ++$output = shift; ++if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } ++ ++$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or ++die "can't locate x86_64-xlate.pl"; ++ ++open OUT,"| \"$^X\" $xlate $flavour $output"; ++*STDOUT=*OUT; ++ ++if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` ++ =~ /GNU assembler version ([2-9]\.[0-9]+)/) { ++ $avx = ($1>=2.19) + ($1>=2.22); ++} ++ ++if ($win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && ++ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { ++ $avx = ($1>=2.09) + ($1>=2.10); ++} ++ ++if ($win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && ++ `ml64 2>&1` =~ /Version ([0-9]+)\./) { ++ $avx = ($1>=10) + ($1>=11); ++} ++ ++if (`$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) { ++ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 ++ $avx = ($ver>=3.0) + ($ver>=3.01); ++} ++ ++ ++{ ++{ ++ ++my ($state, $key) ++ =("%rdi", "%rsi"); ++ ++$code.=<<___; ++ ++.LrSet: ++.align 16 ++.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC ++############################################################################### ++# void poly1305_init_x64(void *state, uint8_t key[32]) ++ ++.globl poly1305_init_x64 ++.type poly1305_init_x64, \@function, 2 ++.align 64 ++poly1305_init_x64: ++ ++ xor %rax, %rax ++ mov %rax, 8*0($state) ++ mov %rax, 8*1($state) ++ mov %rax, 8*2($state) ++ ++ movdqu 16*0($key), %xmm0 ++ movdqu 16*1($key), %xmm1 ++ pand .LrSet(%rip), %xmm0 ++ ++ movdqu %xmm0, 8*3($state) ++ movdqu %xmm1, 8*3+16($state) ++ movq \$0, 8*7($state) ++ ++ ret ++.size poly1305_init_x64,.-poly1305_init_x64 ++___ ++} ++ ++{ ++ ++my ($state, $inp) ++ =("%rdi", "%rsi"); ++ ++my ($acc0, $acc1, $acc2, $inl, $t0, $t1, $t2, $t3, $r0) ++ =("%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"); ++ ++my ($r1) ++ =("8*4($state)"); ++ ++$code.=<<___; ++############################################################################### ++# void* poly1305_update_x64(void* state, void* in, uint64_t in_len) ++.globl poly1305_update_x64 ++.type poly1305_update_x64, \@function, 2 ++.align 64 ++poly1305_update_x64: ++ ++ push %r11 ++ push %r12 ++ push %r13 ++ push %r14 ++ push %r15 ++ ++ mov %rdx, $inl ++ ++ mov 8*0($state), $acc0 ++ mov 8*1($state), $acc1 ++ mov 8*2($state), $acc2 ++ mov 8*3($state), $r0 ++ ++ cmp \$16, $inl ++ jb 2f ++ jmp 1f ++ ++.align 64 ++1: ++############################ ++ add 8*0($inp), $acc0 ++ adc 8*1($inp), $acc1 ++ lea 16($inp), $inp ++ adc \$1, $acc2 ++ ++5: ++ mov $r0, %rax ++ mulq $acc0 ++ mov %rax, $t0 ++ mov %rdx, $t1 ++ ++ mov $r0, %rax ++ mulq $acc1 ++ add %rax, $t1 ++ adc \$0, %rdx ++ ++ mov $r0, $t2 ++ imul $acc2, $t2 ++ add %rdx, $t2 ++############################ ++ mov $r1, %rax ++ mulq $acc0 ++ add %rax, $t1 ++ adc \$0, %rdx ++ mov %rdx, $acc0 ++ ++ mov $r1, %rax ++ mulq $acc1 ++ add $acc0, $t2 ++ adc \$0, %rdx ++ add %rax, $t2 ++ adc \$0, %rdx ++ ++ mov $r1, $t3 ++ imul $acc2, $t3 ++ add %rdx, $t3 ++############################ ++ ++ mov $t0, $acc0 ++ mov $t1, $acc1 ++ mov $t2, $acc2 ++ and \$3, $acc2 ++ ++ mov $t2, $t0 ++ mov $t3, $t1 ++ ++ and \$-4, $t0 ++ shrd \$2, $t3, $t2 ++ shr \$2, $t3 ++ ++ add $t0, $acc0 ++ adc $t1, $acc1 ++ adc \$0, $acc2 ++ ++ add $t2, $acc0 ++ adc $t3, $acc1 ++ adc \$0, $acc2 ++ ++ sub \$16, $inl ++ cmp \$16, $inl ++ jae 1b ++ ++2: ++ test $inl, $inl ++ jz 3f ++ ++ mov \$1, $t0 ++ xor $t1, $t1 ++ xor $t2, $t2 ++ add $inl, $inp ++ ++4: ++ shld \$8, $t0, $t1 ++ shl \$8, $t0 ++ movzxb -1($inp), $t2 ++ xor $t2, $t0 ++ dec $inp ++ dec $inl ++ jnz 4b ++ ++ add $t0, $acc0 ++ adc $t1, $acc1 ++ adc \$0, $acc2 ++ ++ mov \$16, $inl ++ jmp 5b ++ ++3: ++ ++ mov $acc0, 8*0($state) ++ mov $acc1, 8*1($state) ++ mov $acc2, 8*2($state) ++ ++ pop %r15 ++ pop %r14 ++ pop %r13 ++ pop %r12 ++ pop %r11 ++ ret ++.size poly1305_update_x64, .-poly1305_update_x64 ++___ ++} ++ ++{ ++ ++my ($mac, $state)=("%rsi", "%rdi"); ++ ++my ($acc0, $acc1, $acc2, $t0, $t1, $t2) ++ =("%rcx", "%rax", "%rdx", "%r8", "%r9", "%r10"); ++ ++$code.=<<___; ++############################################################################### ++# void poly1305_finish_x64(void* state, uint64_t mac[2]); ++.type poly1305_finish_x64,\@function, 2 ++.align 64 ++.globl poly1305_finish_x64 ++poly1305_finish_x64: ++ ++ mov 8*0($state), $acc0 ++ mov 8*1($state), $acc1 ++ mov 8*2($state), $acc2 ++ ++ mov $acc0, $t0 ++ mov $acc1, $t1 ++ mov $acc2, $t2 ++ ++ sub \$-5, $acc0 ++ sbb \$-1, $acc1 ++ sbb \$3, $acc2 ++ ++ cmovc $t0, $acc0 ++ cmovc $t1, $acc1 ++ cmovc $t2, $acc2 ++ ++ add 8*5($state), $acc0 ++ adc 8*6($state), $acc1 ++ mov $acc0, ($mac) ++ mov $acc1, 8($mac) ++ ++ ret ++.size poly1305_finish_x64, .-poly1305_finish_x64 ++___ ++} ++} ++$code =~ s/\`([^\`]*)\`/eval($1)/gem; ++print $code; ++close STDOUT; +diff --git a/crypto/chacha20_poly1305/chacha20.c b/crypto/chacha20_poly1305/chacha20.c +new file mode 100644 +index 0000000..b48d857 +--- /dev/null ++++ b/crypto/chacha20_poly1305/chacha20.c +@@ -0,0 +1,142 @@ ++/* Copyright (c) 2014, Google Inc. ++ * ++ * Permission to use, copy, modify, and/or distribute this software for any ++ * purpose with or without fee is hereby granted, provided that the above ++ * copyright notice and this permission notice appear in all copies. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES ++ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY ++ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ++ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION ++ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN ++ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ ++ ++/* Adapted from the public domain, estream code by D. Bernstein. */ ++ ++#include "chacha20poly1305.h" ++ ++/* sigma contains the ChaCha constants, which happen to be an ASCII string. */ ++static const char sigma[16] = "expand 32-byte k"; ++ ++#define ROTATE(v, n) (((v) << (n)) | ((v) >> (32 - (n)))) ++#define XOR(v, w) ((v) ^ (w)) ++#define PLUS(x, y) ((x) + (y)) ++#define PLUSONE(v) (PLUS((v), 1)) ++ ++#define U32TO8_LITTLE(p, v) \ ++ { \ ++ (p)[0] = (v >> 0) & 0xff; \ ++ (p)[1] = (v >> 8) & 0xff; \ ++ (p)[2] = (v >> 16) & 0xff; \ ++ (p)[3] = (v >> 24) & 0xff; \ ++ } ++ ++#define U8TO32_LITTLE(p) \ ++ (((uint32_t)((p)[0])) | ((uint32_t)((p)[1]) << 8) | \ ++ ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24)) ++ ++/* QUARTERROUND updates a, b, c, d with a ChaCha "quarter" round. */ ++#define QUARTERROUND(a,b,c,d) \ ++ x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]),16); \ ++ x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]),12); \ ++ x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]), 8); \ ++ x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]), 7); ++ ++/* chacha_core performs |num_rounds| rounds of ChaCha20 on the input words in ++ * |input| and writes the 64 output bytes to |output|. */ ++static void chacha_core(uint8_t output[64], const uint32_t input[16]) { ++ uint32_t x[16]; ++ int i; ++ ++ memcpy(x, input, sizeof(uint32_t) * 16); ++ for (i = 20; i > 0; i -= 2) { ++ QUARTERROUND(0, 4, 8, 12) ++ QUARTERROUND(1, 5, 9, 13) ++ QUARTERROUND(2, 6, 10, 14) ++ QUARTERROUND(3, 7, 11, 15) ++ QUARTERROUND(0, 5, 10, 15) ++ QUARTERROUND(1, 6, 11, 12) ++ QUARTERROUND(2, 7, 8, 13) ++ QUARTERROUND(3, 4, 9, 14) ++ } ++ ++ for (i = 0; i < 16; ++i) { ++ x[i] = PLUS(x[i], input[i]); ++ } ++ for (i = 0; i < 16; ++i) { ++ U32TO8_LITTLE(output + 4 * i, x[i]); ++ } ++} ++ ++#if CHAPOLY_ASM ++void chacha_20_core_asm(uint8_t *out, const uint8_t *in, size_t in_len, ++ uint8_t nonce[48]); ++#endif ++ ++void CRYPTO_chacha_20(uint8_t *out, const uint8_t *in, size_t in_len, ++ uint8_t nonce[48]) { ++ ++ uint8_t buf[64]; ++ uint32_t input[16]; ++ size_t todo, i; ++ ++#ifdef CHAPOLY_ASM ++ chacha_20_core_asm(out, in, in_len, nonce); ++ todo = in_len & (63); ++ ++ if(todo) { ++ out += in_len - todo; ++ in += in_len - todo; ++ memcpy(buf, in, todo); ++ ++ chacha_20_core_asm(buf, buf, sizeof(buf), nonce); ++ ++ memcpy(out, buf, todo); ++ memset(buf, 0, sizeof(buf)); ++ } ++ return; ++#endif ++ ++ input[0] = U8TO32_LITTLE(sigma + 0); ++ input[1] = U8TO32_LITTLE(sigma + 4); ++ input[2] = U8TO32_LITTLE(sigma + 8); ++ input[3] = U8TO32_LITTLE(sigma + 12); ++ ++ input[4] = U8TO32_LITTLE(nonce + 0); ++ input[5] = U8TO32_LITTLE(nonce + 4); ++ input[6] = U8TO32_LITTLE(nonce + 8); ++ input[7] = U8TO32_LITTLE(nonce + 12); ++ ++ input[8] = U8TO32_LITTLE(nonce + 16); ++ input[9] = U8TO32_LITTLE(nonce + 20); ++ input[10] = U8TO32_LITTLE(nonce + 24); ++ input[11] = U8TO32_LITTLE(nonce + 28); ++ ++ input[12] = U8TO32_LITTLE(nonce + 32); ++ input[13] = U8TO32_LITTLE(nonce + 36); ++ input[14] = U8TO32_LITTLE(nonce + 40); ++ input[15] = U8TO32_LITTLE(nonce + 44); ++ ++ while (in_len > 0) { ++ todo = 64; ++ if (in_len < todo) { ++ todo = in_len; ++ } ++ ++ chacha_core(buf, input); ++ for (i = 0; i < todo; i++) { ++ out[i] = in[i] ^ buf[i]; ++ } ++ ++ out += todo; ++ in += todo; ++ in_len -= todo; ++ ++ ((uint64_t*)input)[6]++; ++ } ++ ++ U32TO8_LITTLE(nonce + 32, input[12]); ++ U32TO8_LITTLE(nonce + 36, input[13]); ++} ++ +diff --git a/crypto/chacha20_poly1305/chacha20poly1305.h b/crypto/chacha20_poly1305/chacha20poly1305.h +new file mode 100644 +index 0000000..3968c40 +--- /dev/null ++++ b/crypto/chacha20_poly1305/chacha20poly1305.h +@@ -0,0 +1,64 @@ ++/* Copyright (c) 2014, Google Inc. ++ * ++ * Permission to use, copy, modify, and/or distribute this software for any ++ * purpose with or without fee is hereby granted, provided that the above ++ * copyright notice and this permission notice appear in all copies. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES ++ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY ++ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ++ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION ++ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN ++ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ ++ ++#ifndef OPENSSL_HEADER_POLY1305_H ++#define OPENSSL_HEADER_POLY1305_H ++ ++#include <stdint.h> ++#include <stddef.h> ++#include <string.h> ++#include "crypto.h" ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#define POLY1305_MAC_LEN (16) ++#define POLY1305_PAD_LEN (16) ++ ++typedef unsigned char poly1305_state[92]; ++ ++ ++/* CRYPTO_poly1305_init sets up |state| so that it can be used to calculate an ++ * authentication tag with the one-time key |key|. Note that |key| is a ++ * one-time key and therefore there is no `reset' method because that would ++ * enable several messages to be authenticated with the same key. */ ++void CRYPTO_poly1305_init(poly1305_state* state, const uint8_t key[32]); ++ ++/* CRYPTO_poly1305_update processes |in_len| bytes from |in|. It can be called ++ * zero or more times after poly1305_init. */ ++void CRYPTO_poly1305_update(poly1305_state* state, const uint8_t* in, ++ size_t in_len); ++ ++/* CRYPTO_poly1305_finish completes the poly1305 calculation and writes a 16 ++ * byte authentication tag to |mac|. */ ++void CRYPTO_poly1305_finish(poly1305_state* state, ++ uint8_t mac[POLY1305_MAC_LEN]); ++ ++/* CRYPTO_chacha_20 encrypts |in_len| bytes from |in| with the given key and ++ * nonce and writes the result to |out|, which may be equal to |in|. The ++ * initial block counter is specified by |counter|. */ ++void CRYPTO_chacha_20(uint8_t *out, const uint8_t *in, size_t in_len, ++ uint8_t nonce[48]); ++ ++#if CHAPOLY_ASM ++int chacha20_poly1305_open(uint8_t *pt, const uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *key); ++void chacha20_poly1305_seal(uint8_t *ct, const uint8_t *pt, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *key); ++#endif ++ ++#if defined(__cplusplus) ++} /* extern C */ ++#endif ++ ++#endif /* OPENSSL_HEADER_POLY1305_H */ +diff --git a/crypto/chacha20_poly1305/poly1305.c b/crypto/chacha20_poly1305/poly1305.c +new file mode 100644 +index 0000000..6bd553b +--- /dev/null ++++ b/crypto/chacha20_poly1305/poly1305.c +@@ -0,0 +1,355 @@ ++/* Copyright (c) 2014, Google Inc. ++ * ++ * Permission to use, copy, modify, and/or distribute this software for any ++ * purpose with or without fee is hereby granted, provided that the above ++ * copyright notice and this permission notice appear in all copies. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES ++ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY ++ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ++ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION ++ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN ++ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ ++ ++/* This implementation of poly1305 is by Andrew Moon ++ * (https://github.com/floodyberry/poly1305-donna) and released as public ++ * domain. */ ++ ++#include "chacha20poly1305.h" ++ ++#include <string.h> ++#ifndef CHAPOLY_ASM ++ ++#if !defined(B_ENDIAN) ++/* We can assume little-endian. */ ++static uint32_t U8TO32_LE(const uint8_t *m) { ++ uint32_t r; ++ memcpy(&r, m, sizeof(r)); ++ return r; ++} ++ ++static void U32TO8_LE(uint8_t *m, uint32_t v) { memcpy(m, &v, sizeof(v)); } ++#else ++static uint32_t U8TO32_LE(const uint8_t *m) { ++ return (uint32_t)m[0] | (uint32_t)m[1] << 8 | (uint32_t)m[2] << 16 | ++ (uint32_t)m[3] << 24; ++} ++ ++static void U32TO8_LE(uint8_t *m, uint32_t v) { ++ m[0] = v; ++ m[1] = v >> 8; ++ m[2] = v >> 16; ++ m[3] = v >> 24; ++} ++#endif ++ ++static uint64_t mul32x32_64(uint32_t a, uint32_t b) { return (uint64_t)a * b; } ++ ++struct poly1305_state_st { ++ uint32_t r0, r1, r2, r3, r4; ++ uint32_t s1, s2, s3, s4; ++ uint32_t h0, h1, h2, h3, h4; ++ uint8_t buf[16]; ++ unsigned int buf_used; ++ uint8_t key[16]; ++}; ++ ++/* poly1305_blocks updates |state| given some amount of input data. This ++ * function may only be called with a |len| that is not a multiple of 16 at the ++ * end of the data. Otherwise the input must be buffered into 16 byte blocks. */ ++static void poly1305_update(struct poly1305_state_st *state, const uint8_t *in, ++ size_t len) { ++ uint32_t t0, t1, t2, t3; ++ uint64_t t[5]; ++ uint32_t b; ++ uint64_t c; ++ size_t j; ++ uint8_t mp[16]; ++ ++ if (len < 16) { ++ goto poly1305_donna_atmost15bytes; ++ } ++ ++poly1305_donna_16bytes: ++ t0 = U8TO32_LE(in); ++ t1 = U8TO32_LE(in + 4); ++ t2 = U8TO32_LE(in + 8); ++ t3 = U8TO32_LE(in + 12); ++ ++ in += 16; ++ len -= 16; ++ ++ state->h0 += t0 & 0x3ffffff; ++ state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; ++ state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; ++ state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; ++ state->h4 += (t3 >> 8) | (1 << 24); ++ ++poly1305_donna_mul: ++ t[0] = mul32x32_64(state->h0, state->r0) + mul32x32_64(state->h1, state->s4) + ++ mul32x32_64(state->h2, state->s3) + mul32x32_64(state->h3, state->s2) + ++ mul32x32_64(state->h4, state->s1); ++ t[1] = mul32x32_64(state->h0, state->r1) + mul32x32_64(state->h1, state->r0) + ++ mul32x32_64(state->h2, state->s4) + mul32x32_64(state->h3, state->s3) + ++ mul32x32_64(state->h4, state->s2); ++ t[2] = mul32x32_64(state->h0, state->r2) + mul32x32_64(state->h1, state->r1) + ++ mul32x32_64(state->h2, state->r0) + mul32x32_64(state->h3, state->s4) + ++ mul32x32_64(state->h4, state->s3); ++ t[3] = mul32x32_64(state->h0, state->r3) + mul32x32_64(state->h1, state->r2) + ++ mul32x32_64(state->h2, state->r1) + mul32x32_64(state->h3, state->r0) + ++ mul32x32_64(state->h4, state->s4); ++ t[4] = mul32x32_64(state->h0, state->r4) + mul32x32_64(state->h1, state->r3) + ++ mul32x32_64(state->h2, state->r2) + mul32x32_64(state->h3, state->r1) + ++ mul32x32_64(state->h4, state->r0); ++ ++ state->h0 = (uint32_t)t[0] & 0x3ffffff; ++ c = (t[0] >> 26); ++ t[1] += c; ++ state->h1 = (uint32_t)t[1] & 0x3ffffff; ++ b = (uint32_t)(t[1] >> 26); ++ t[2] += b; ++ state->h2 = (uint32_t)t[2] & 0x3ffffff; ++ b = (uint32_t)(t[2] >> 26); ++ t[3] += b; ++ state->h3 = (uint32_t)t[3] & 0x3ffffff; ++ b = (uint32_t)(t[3] >> 26); ++ t[4] += b; ++ state->h4 = (uint32_t)t[4] & 0x3ffffff; ++ b = (uint32_t)(t[4] >> 26); ++ state->h0 += b * 5; ++ ++ if (len >= 16) ++ goto poly1305_donna_16bytes; ++ ++/* final bytes */ ++poly1305_donna_atmost15bytes: ++ if (!len) ++ return; ++ ++ for (j = 0; j < len; j++) ++ mp[j] = in[j]; ++ mp[j++] = 1; ++ for (; j < 16; j++) ++ mp[j] = 0; ++ len = 0; ++ ++ t0 = U8TO32_LE(mp + 0); ++ t1 = U8TO32_LE(mp + 4); ++ t2 = U8TO32_LE(mp + 8); ++ t3 = U8TO32_LE(mp + 12); ++ ++ state->h0 += t0 & 0x3ffffff; ++ state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; ++ state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; ++ state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; ++ state->h4 += (t3 >> 8); ++ ++ goto poly1305_donna_mul; ++} ++ ++void CRYPTO_poly1305_init(poly1305_state *statep, const uint8_t key[32]) { ++ struct poly1305_state_st *state = (struct poly1305_state_st *)statep; ++ uint32_t t0, t1, t2, t3; ++ ++ t0 = U8TO32_LE(key + 0); ++ t1 = U8TO32_LE(key + 4); ++ t2 = U8TO32_LE(key + 8); ++ t3 = U8TO32_LE(key + 12); ++ ++ /* precompute multipliers */ ++ state->r0 = t0 & 0x3ffffff; ++ t0 >>= 26; ++ t0 |= t1 << 6; ++ state->r1 = t0 & 0x3ffff03; ++ t1 >>= 20; ++ t1 |= t2 << 12; ++ state->r2 = t1 & 0x3ffc0ff; ++ t2 >>= 14; ++ t2 |= t3 << 18; ++ state->r3 = t2 & 0x3f03fff; ++ t3 >>= 8; ++ state->r4 = t3 & 0x00fffff; ++ ++ state->s1 = state->r1 * 5; ++ state->s2 = state->r2 * 5; ++ state->s3 = state->r3 * 5; ++ state->s4 = state->r4 * 5; ++ ++ /* init state */ ++ state->h0 = 0; ++ state->h1 = 0; ++ state->h2 = 0; ++ state->h3 = 0; ++ state->h4 = 0; ++ ++ state->buf_used = 0; ++ memcpy(state->key, key + 16, sizeof(state->key)); ++} ++ ++void CRYPTO_poly1305_update(poly1305_state *statep, const uint8_t *in, ++ size_t in_len) { ++ unsigned int i; ++ struct poly1305_state_st *state = (struct poly1305_state_st *)statep; ++ ++ if (state->buf_used) { ++ unsigned int todo = 16 - state->buf_used; ++ if (todo > in_len) ++ todo = in_len; ++ for (i = 0; i < todo; i++) ++ state->buf[state->buf_used + i] = in[i]; ++ state->buf_used += todo; ++ in_len -= todo; ++ in += todo; ++ ++ if (state->buf_used == 16) { ++ poly1305_update(state, state->buf, 16); ++ state->buf_used = 0; ++ } ++ } ++ ++ if (in_len >= 16) { ++ size_t todo = in_len & ~0xf; ++ poly1305_update(state, in, todo); ++ in += todo; ++ in_len &= 0xf; ++ } ++ ++ if (in_len) { ++ for (i = 0; i < in_len; i++) ++ state->buf[i] = in[i]; ++ state->buf_used = in_len; ++ } ++} ++ ++void CRYPTO_poly1305_finish(poly1305_state *statep, uint8_t mac[16]) { ++ struct poly1305_state_st *state = (struct poly1305_state_st *)statep; ++ ++ uint64_t f0, f1, f2, f3; ++ uint32_t g0, g1, g2, g3, g4; ++ uint32_t b, nb; ++ ++ if (state->buf_used) ++ poly1305_update(state, state->buf, state->buf_used); ++ ++ b = state->h0 >> 26; ++ state->h0 = state->h0 & 0x3ffffff; ++ state->h1 += b; ++ b = state->h1 >> 26; ++ state->h1 = state->h1 & 0x3ffffff; ++ state->h2 += b; ++ b = state->h2 >> 26; ++ state->h2 = state->h2 & 0x3ffffff; ++ state->h3 += b; ++ b = state->h3 >> 26; ++ state->h3 = state->h3 & 0x3ffffff; ++ state->h4 += b; ++ b = state->h4 >> 26; ++ state->h4 = state->h4 & 0x3ffffff; ++ state->h0 += b * 5; ++ ++ g0 = state->h0 + 5; ++ b = g0 >> 26; ++ g0 &= 0x3ffffff; ++ g1 = state->h1 + b; ++ b = g1 >> 26; ++ g1 &= 0x3ffffff; ++ g2 = state->h2 + b; ++ b = g2 >> 26; ++ g2 &= 0x3ffffff; ++ g3 = state->h3 + b; ++ b = g3 >> 26; ++ g3 &= 0x3ffffff; ++ g4 = state->h4 + b - (1 << 26); ++ ++ b = (g4 >> 31) - 1; ++ nb = ~b; ++ state->h0 = (state->h0 & nb) | (g0 & b); ++ state->h1 = (state->h1 & nb) | (g1 & b); ++ state->h2 = (state->h2 & nb) | (g2 & b); ++ state->h3 = (state->h3 & nb) | (g3 & b); ++ state->h4 = (state->h4 & nb) | (g4 & b); ++ ++ f0 = ((state->h0) | (state->h1 << 26)) + (uint64_t)U8TO32_LE(&state->key[0]); ++ f1 = ((state->h1 >> 6) | (state->h2 << 20)) + ++ (uint64_t)U8TO32_LE(&state->key[4]); ++ f2 = ((state->h2 >> 12) | (state->h3 << 14)) + ++ (uint64_t)U8TO32_LE(&state->key[8]); ++ f3 = ((state->h3 >> 18) | (state->h4 << 8)) + ++ (uint64_t)U8TO32_LE(&state->key[12]); ++ ++ U32TO8_LE(&mac[0], f0); ++ f1 += (f0 >> 32); ++ U32TO8_LE(&mac[4], f1); ++ f2 += (f1 >> 32); ++ U32TO8_LE(&mac[8], f2); ++ f3 += (f2 >> 32); ++ U32TO8_LE(&mac[12], f3); ++} ++ ++#else ++ ++struct poly1305_state_st { ++ uint8_t opaque[8*8]; ++ uint8_t buf[16]; ++ unsigned int buf_used; ++}; ++ ++void poly1305_init_x64(struct poly1305_state_st* state, const uint8_t key[32]); ++void poly1305_update_x64(struct poly1305_state_st* state, const uint8_t *in, size_t in_len); ++void poly1305_finish_x64(struct poly1305_state_st* state, uint8_t mac[16]); ++ ++#define poly1305_update poly1305_update_x64 ++ ++void CRYPTO_poly1305_init(poly1305_state *statep, const uint8_t key[32]) { ++ struct poly1305_state_st *state = (struct poly1305_state_st *)statep; ++ state->buf_used = 0; ++ return poly1305_init_x64(state, key); ++} ++ ++void CRYPTO_poly1305_update(poly1305_state *statep, const uint8_t *in, ++ size_t in_len) { ++ struct poly1305_state_st *state = (struct poly1305_state_st *)statep; ++ int todo; ++ /* Attempt to fill as many bytes as possible before calling the update ++ function */ ++ if (in_len < 16 || state->buf_used) { ++ todo = 16 - state->buf_used; ++ todo = in_len < todo ? in_len : todo; ++ memcpy(state->buf + state->buf_used, in, todo); ++ state->buf_used += todo; ++ in += todo; ++ in_len -= todo; ++ ++ if (state->buf_used == 16) { ++ poly1305_update_x64(state, state->buf, 16); ++ state->buf_used = 0; ++ } ++ } ++ ++ if (in_len >= 16) { ++ poly1305_update_x64(state, in, in_len & (-16)); ++ in += in_len & (-16); ++ in_len &= (15); ++ } ++ ++ if (in_len) { ++ memcpy(state->buf, in, in_len); ++ state->buf_used = in_len; ++ } ++} ++ ++void CRYPTO_poly1305_finish(poly1305_state *statep, uint8_t mac[16]) { ++ struct poly1305_state_st *state = (struct poly1305_state_st *)statep; ++ ++ if (state->buf_used) { ++ if (state->buf_used % POLY1305_PAD_LEN) { ++ memset(state->buf + state->buf_used, 0, ++ POLY1305_PAD_LEN - (state->buf_used % POLY1305_PAD_LEN)); ++ } ++ poly1305_update_x64(state, state->buf, state->buf_used); ++ } ++ ++ poly1305_finish_x64(state, mac); ++} ++#endif +diff --git a/crypto/evp/Makefile b/crypto/evp/Makefile +index fa138d0..c87896b 100644 +--- a/crypto/evp/Makefile ++++ b/crypto/evp/Makefile +@@ -29,7 +29,8 @@ LIBSRC= encode.c digest.c evp_enc.c evp_key.c evp_acnf.c evp_cnf.c \ + c_all.c c_allc.c c_alld.c evp_lib.c bio_ok.c \ + evp_pkey.c evp_pbe.c p5_crpt.c p5_crpt2.c \ + e_old.c pmeth_lib.c pmeth_fn.c pmeth_gn.c m_sigver.c \ +- e_aes_cbc_hmac_sha1.c e_aes_cbc_hmac_sha256.c e_rc4_hmac_md5.c ++ e_aes_cbc_hmac_sha1.c e_aes_cbc_hmac_sha256.c e_rc4_hmac_md5.c \ ++ e_chacha20_poly1305.c + + LIBOBJ= encode.o digest.o evp_enc.o evp_key.o evp_acnf.o evp_cnf.o \ + e_des.o e_bf.o e_idea.o e_des3.o e_camellia.o\ +@@ -42,7 +43,8 @@ LIBOBJ= encode.o digest.o evp_enc.o evp_key.o evp_acnf.o evp_cnf.o \ + c_all.o c_allc.o c_alld.o evp_lib.o bio_ok.o \ + evp_pkey.o evp_pbe.o p5_crpt.o p5_crpt2.o \ + e_old.o pmeth_lib.o pmeth_fn.o pmeth_gn.o m_sigver.o \ +- e_aes_cbc_hmac_sha1.o e_aes_cbc_hmac_sha256.o e_rc4_hmac_md5.o ++ e_aes_cbc_hmac_sha1.o e_aes_cbc_hmac_sha256.o e_rc4_hmac_md5.o \ ++ e_chacha20_poly1305.o + + SRC= $(LIBSRC) + +@@ -793,3 +795,5 @@ pmeth_lib.o: ../../include/openssl/sha.h ../../include/openssl/stack.h + pmeth_lib.o: ../../include/openssl/symhacks.h ../../include/openssl/x509.h + pmeth_lib.o: ../../include/openssl/x509_vfy.h ../asn1/asn1_locl.h ../cryptlib.h + pmeth_lib.o: evp_locl.h pmeth_lib.c ++e_chacha20_poly1305.o: ../../include/openssl/chacha20poly1305.h ++e_chacha20_poly1305.o: e_chacha20_poly1305.c +diff --git a/crypto/evp/c_allc.c b/crypto/evp/c_allc.c +index 280e584..694f168 100644 +--- a/crypto/evp/c_allc.c ++++ b/crypto/evp/c_allc.c +@@ -238,4 +238,9 @@ void OpenSSL_add_all_ciphers(void) + EVP_add_cipher_alias(SN_camellia_256_cbc, "CAMELLIA256"); + EVP_add_cipher_alias(SN_camellia_256_cbc, "camellia256"); + #endif ++ ++#ifndef OPENSSL_NO_CHACHA_POLY ++ EVP_add_cipher(EVP_chacha20_poly1305()); ++ EVP_add_cipher(EVP_chacha20_poly1305_draft()); ++#endif + } +diff --git a/crypto/evp/e_chacha20_poly1305.c b/crypto/evp/e_chacha20_poly1305.c +new file mode 100644 +index 0000000..1e072ec +--- /dev/null ++++ b/crypto/evp/e_chacha20_poly1305.c +@@ -0,0 +1,362 @@ ++/* ==================================================================== ++ * Copyright (c) 2001-2014 The OpenSSL Project. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in ++ * the documentation and/or other materials provided with the ++ * distribution. ++ * ++ * 3. All advertising materials mentioning features or use of this ++ * software must display the following acknowledgment: ++ * "This product includes software developed by the OpenSSL Project ++ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" ++ * ++ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to ++ * endorse or promote products derived from this software without ++ * prior written permission. For written permission, please contact ++ * openssl-core@openssl.org. ++ * ++ * 5. Products derived from this software may not be called "OpenSSL" ++ * nor may "OpenSSL" appear in their names without prior written ++ * permission of the OpenSSL Project. ++ * ++ * 6. Redistributions of any form whatsoever must retain the following ++ * acknowledgment: ++ * "This product includes software developed by the OpenSSL Project ++ * for use in the OpenSSL Toolkit (http://www.openssl.org/)" ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY ++ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ++ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR ++ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT ++ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, ++ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ++ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED ++ * OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ==================================================================== ++ * ++ */ ++ ++#include <openssl/opensslconf.h> ++#ifndef OPENSSL_NO_CHACHA_POLY ++# include <openssl/evp.h> ++# include <openssl/chacha20poly1305.h> ++ ++#define FILL_BUFFER ((size_t)128) ++ ++typedef struct { ++ uint8_t iv[12]; ++ uint8_t nonce[48]; ++ size_t aad_l; ++ size_t ct_l; ++ unsigned valid:1; ++ unsigned draft:1; ++ uint8_t poly_buffer[FILL_BUFFER]; ++ uint8_t chacha_buffer[FILL_BUFFER]; ++ uint16_t poly_buffer_used; ++ uint16_t chacha_used; ++ poly1305_state poly_state; ++ #define poly_finish(c,m) CRYPTO_poly1305_finish(&c->poly_state,m) ++} EVP_CHACHA20_POLY1305_CTX; ++ ++static int EVP_chacha20_poly1305_init_draft(EVP_CIPHER_CTX *ctx, ++ const unsigned char *key, ++ const unsigned char *iv, ++ int enc) ++{ ++ EVP_CHACHA20_POLY1305_CTX *aead_ctx = ctx->cipher_data; ++ memcpy(aead_ctx->nonce, key, 32); ++ aead_ctx->valid = 0; ++ aead_ctx->draft = 1; ++ return 1; ++} ++ ++static int EVP_chacha20_poly1305_init(EVP_CIPHER_CTX *ctx, ++ const unsigned char *key, ++ const unsigned char *iv, ++ int enc) ++{ ++ EVP_CHACHA20_POLY1305_CTX *aead_ctx = ctx->cipher_data; ++ memcpy(aead_ctx->nonce, key, 32); ++ memcpy(aead_ctx->iv, iv, 12); ++ aead_ctx->valid = 0; ++ aead_ctx->draft = 0; ++ return 1; ++} ++ ++static int EVP_chacha20_poly1305_cipher(EVP_CIPHER_CTX *ctx, ++ unsigned char *out, ++ const unsigned char *in, ++ size_t inl) ++{ ++ EVP_CHACHA20_POLY1305_CTX *aead_ctx = ctx->cipher_data; ++ uint8_t poly_mac[POLY1305_MAC_LEN]; ++ uint8_t zero[POLY1305_PAD_LEN] = {0}; ++ uint64_t cmp; ++ int i, todo; ++ ++ if (!aead_ctx->valid) ++ return 0; ++ ++ if (inl < POLY1305_MAC_LEN) ++ return -1; ++ ++ /* Fix for MAC */ ++ inl -= POLY1305_MAC_LEN; ++ ++#if (CHAPOLY_ASM) ++ if (!aead_ctx->draft) { ++ aead_ctx->valid = 0; ++ if (ctx->encrypt) { ++ chacha20_poly1305_seal(out, in, inl, ++ aead_ctx->poly_buffer, ++ aead_ctx->poly_buffer_used, ++ aead_ctx->nonce); ++ } else { ++ int cmp = chacha20_poly1305_open(out, in, inl, ++ aead_ctx->poly_buffer, ++ aead_ctx->poly_buffer_used, ++ aead_ctx->nonce); ++ if (!cmp) { ++ OPENSSL_cleanse(out, inl); ++ return -1; ++ } ++ } ++ return inl; ++ } ++#endif ++ ++ if (!ctx->encrypt) { ++ CRYPTO_poly1305_update(&aead_ctx->poly_state, in, inl); ++ } ++ ++ i = 0; ++ if (inl < 256) { ++ /* Consume the buffer we computed during poly initialization */ ++ todo = inl > (FILL_BUFFER - aead_ctx->chacha_used) ? ++ FILL_BUFFER - aead_ctx->chacha_used : ++ inl; ++ ++ for (; i < todo; i++) { ++ out[i] = in[i] ^ aead_ctx->chacha_buffer[i + 64 /*aead_ctx->chacha_used*/]; ++ } ++ ++ } else { ++ /* For long messages don't use precomputed buffer */ ++ ((uint64_t *)(aead_ctx->nonce))[4]--; ++ } ++ ++ todo = inl - i; ++ ++ if (todo) { ++ CRYPTO_chacha_20(&out[i], &in[i], todo, aead_ctx->nonce); ++ } ++ ++ if (ctx->encrypt) { ++ CRYPTO_poly1305_update(&aead_ctx->poly_state, out, inl); ++ } ++ ++ aead_ctx->ct_l += inl; ++ ++ if (!aead_ctx->draft) { ++ /* For RFC padd ciphertext with zeroes, then mac len(aad)||len(ct) */ ++ todo = aead_ctx->ct_l % POLY1305_PAD_LEN ? ++ POLY1305_PAD_LEN - (aead_ctx->ct_l % POLY1305_PAD_LEN) : ++ 0; ++ ++ if (todo) { ++ CRYPTO_poly1305_update(&aead_ctx->poly_state, zero, todo); ++ } ++ ++ CRYPTO_poly1305_update(&aead_ctx->poly_state, (uint8_t*)&aead_ctx->aad_l, 8); ++ CRYPTO_poly1305_update(&aead_ctx->poly_state, (uint8_t*)&aead_ctx->ct_l, 8); ++ ++ } else { ++ /* For the draft don't pad, mac len(ct) */ ++ CRYPTO_poly1305_update(&aead_ctx->poly_state, (uint8_t*)&aead_ctx->ct_l, 8); ++ } ++ aead_ctx->valid = 0; ++ ++ if (ctx->encrypt) { ++ poly_finish(aead_ctx, &out[inl]); ++ return inl + POLY1305_MAC_LEN; ++ ++ } else { /* Decryption */ ++ poly_finish(aead_ctx, poly_mac); ++ /* Constant time comparison */ ++ cmp = (*(uint64_t *)(poly_mac)) ^ (*(uint64_t *)(in + inl)); ++ cmp |= (*(uint64_t *)(poly_mac + 8)) ^ (*(uint64_t *)(in + inl + 8)); ++ ++ if (cmp) { ++ OPENSSL_cleanse(out, inl); ++ return -1; ++ } ++ ++ return inl; ++ } ++} ++ ++ ++static int EVP_chacha20_poly1305_cleanup(EVP_CIPHER_CTX *ctx) ++{ ++ return 1; ++} ++ ++ ++static int EVP_chacha20_poly1305_ctrl(EVP_CIPHER_CTX *ctx, ++ int type, ++ int arg, ++ void *ptr) ++{ ++ EVP_CHACHA20_POLY1305_CTX *aead_ctx = ctx->cipher_data; ++ uint8_t aad[EVP_AEAD_TLS1_AAD_LEN + 8]; ++ uint64_t thirteen = EVP_AEAD_TLS1_AAD_LEN; ++ ++ switch (type) { ++ case EVP_CTRL_AEAD_TLS1_AAD: ++ ++ /* Initialize poly keys */ ++ memset(aead_ctx->chacha_buffer, 0, FILL_BUFFER); ++ ++ if (!aead_ctx->draft) { ++ /* RFC IV = (0 || iv) ^ seq_num */ ++ memset(aead_ctx->nonce + 32, 0, 4); ++ memcpy(aead_ctx->nonce + 36, aead_ctx->iv, 12); ++ *(uint64_t *)(aead_ctx->nonce + 40) ^= *(uint64_t *)(ptr); ++ ++ } else { ++ /* draft IV = 0 || seq_num */ ++ memset(aead_ctx->nonce + 32, 0, 8); ++ memcpy(aead_ctx->nonce + 40, ptr, 8); ++ } ++ ++#if (CHAPOLY_ASM) ++ if (!aead_ctx->draft) { ++ if (arg == EVP_AEAD_TLS1_AAD_LEN) { ++ /* For RFC, use optimized seal/open */ ++ memcpy(aad, ptr, arg); ++ unsigned int len = (aad[arg-2] << 8) | aad[arg-1]; ++ if (!ctx->encrypt) { ++ len -= POLY1305_MAC_LEN; ++ aad[arg-2] = len>>8; ++ aad[arg-1] = len & 0xff; ++ } ++ memcpy(aead_ctx->poly_buffer, aad, arg); ++ } else if (arg <= FILL_BUFFER) { ++ memcpy(aead_ctx->poly_buffer, ptr, arg); ++ } else { ++ aead_ctx->valid = 0; ++ return 0; ++ } ++ aead_ctx->valid = 1; ++ aead_ctx->poly_buffer_used = arg; ++ return POLY1305_MAC_LEN; ++ } ++#endif ++ /* Poly keys = ENC(0) */ ++ CRYPTO_chacha_20(aead_ctx->chacha_buffer, ++ aead_ctx->chacha_buffer, ++ FILL_BUFFER, ++ aead_ctx->nonce); ++ ++ CRYPTO_poly1305_init(&aead_ctx->poly_state, aead_ctx->chacha_buffer); ++ ++ aead_ctx->chacha_used = 64; ++ aead_ctx->poly_buffer_used = 0; ++ aead_ctx->aad_l = arg; ++ aead_ctx->ct_l = 0; ++ ++ /* Absorb AAD */ ++ memcpy(aad, ptr, arg); ++ memset(aad + arg, 0, sizeof(aad) - arg); ++ ++ /* If decrypting fix length for tag */ ++ if (!ctx->encrypt) { ++ unsigned int len = (aad[arg-2] << 8) | aad[arg-1]; ++ len -= POLY1305_MAC_LEN; ++ aad[arg-2] = len>>8; ++ aad[arg-1] = len & 0xff; ++ } ++ ++ if (!aead_ctx->draft) { ++ /* In the RFC, AAD is padded with zeroes */ ++ CRYPTO_poly1305_update(&aead_ctx->poly_state, aad, POLY1305_PAD_LEN); ++ ++ } else { ++ /* In the draft AAD is followed by len(AAD) */ ++ memcpy(&aad[arg], &thirteen, sizeof(thirteen)); ++ CRYPTO_poly1305_update(&aead_ctx->poly_state, aad, arg + sizeof(thirteen)); ++ } ++ ++ aead_ctx->valid = 1; ++ return POLY1305_MAC_LEN; ++ ++ break; ++ ++ default: ++ return 0; ++ break; ++ } ++ ++ return 0; ++} ++ ++ ++#define CUSTOM_FLAGS (\ ++ EVP_CIPH_CUSTOM_IV | EVP_CIPH_FLAG_CUSTOM_CIPHER \ ++ | EVP_CIPH_ALWAYS_CALL_INIT \ ++ | EVP_CIPH_CUSTOM_COPY) ++ ++ ++static const EVP_CIPHER chacha20_poly1305_d = { ++ NID_chacha20_poly1305_draft, ++ 1, /* block size, sorta */ ++ 32, /* key len */ ++ 0, /* iv len */ ++ CUSTOM_FLAGS|EVP_CIPH_FLAG_AEAD_CIPHER, /* flags */ ++ EVP_chacha20_poly1305_init_draft, ++ EVP_chacha20_poly1305_cipher, ++ EVP_chacha20_poly1305_cleanup, ++ sizeof(EVP_CHACHA20_POLY1305_CTX), /* ctx size */ ++ NULL, ++ NULL, ++ EVP_chacha20_poly1305_ctrl, ++ NULL ++ }; ++ ++ ++static const EVP_CIPHER chacha20_poly1305 = { ++ NID_chacha20_poly1305, ++ 1, /* block size, sorta */ ++ 32, /* key len */ ++ 12, /* iv len */ ++ CUSTOM_FLAGS|EVP_CIPH_FLAG_AEAD_CIPHER, /* flags */ ++ EVP_chacha20_poly1305_init, ++ EVP_chacha20_poly1305_cipher, ++ EVP_chacha20_poly1305_cleanup, ++ sizeof(EVP_CHACHA20_POLY1305_CTX), /* ctx size */ ++ NULL, ++ NULL, ++ EVP_chacha20_poly1305_ctrl, ++ NULL ++ }; ++ ++ ++const EVP_CIPHER *EVP_chacha20_poly1305_draft(void) ++{ return &chacha20_poly1305_d; } ++ ++ ++const EVP_CIPHER *EVP_chacha20_poly1305(void) ++{ return &chacha20_poly1305; } ++#endif +diff --git a/crypto/evp/evp.h b/crypto/evp/evp.h +index 39ab793..8feaabc 100644 +--- a/crypto/evp/evp.h ++++ b/crypto/evp/evp.h +@@ -902,6 +902,11 @@ const EVP_CIPHER *EVP_seed_cfb128(void); + const EVP_CIPHER *EVP_seed_ofb(void); + # endif + ++# ifndef OPENSSL_NO_CHACHA_POLY ++const EVP_CIPHER *EVP_chacha20_poly1305(void); ++const EVP_CIPHER *EVP_chacha20_poly1305_draft(void); ++# endif ++ + void OPENSSL_add_all_algorithms_noconf(void); + void OPENSSL_add_all_algorithms_conf(void); + +diff --git a/crypto/objects/obj_dat.h b/crypto/objects/obj_dat.h +index b7e3cf2..26612e2 100644 +--- a/crypto/objects/obj_dat.h ++++ b/crypto/objects/obj_dat.h +@@ -62,9 +62,9 @@ + * [including the GNU Public Licence.] + */ + +-#define NUM_NID 958 +-#define NUM_SN 951 +-#define NUM_LN 951 ++#define NUM_NID 960 ++#define NUM_SN 953 ++#define NUM_LN 953 + #define NUM_OBJ 890 + + static const unsigned char lvalues[6255]={ +@@ -2514,6 +2514,9 @@ static const ASN1_OBJECT nid_objs[NUM_NID]={ + NID_jurisdictionStateOrProvinceName,11,&(lvalues[6232]),0}, + {"jurisdictionC","jurisdictionCountryName", + NID_jurisdictionCountryName,11,&(lvalues[6243]),0}, ++{"CHACHA20-POLY1305","chacha20-poly1305",NID_chacha20_poly1305,0,NULL,0}, ++{"CHACHA20-POLY1305-D","chacha20-poly1305-draft", ++ NID_chacha20_poly1305_draft,0,NULL,0}, + }; + + static const unsigned int sn_objs[NUM_SN]={ +@@ -2574,6 +2577,8 @@ static const unsigned int sn_objs[NUM_SN]={ + 110, /* "CAST5-CFB" */ + 109, /* "CAST5-ECB" */ + 111, /* "CAST5-OFB" */ ++958, /* "CHACHA20-POLY1305" */ ++959, /* "CHACHA20-POLY1305-D" */ + 894, /* "CMAC" */ + 13, /* "CN" */ + 141, /* "CRLReason" */ +@@ -3728,6 +3733,8 @@ static const unsigned int ln_objs[NUM_LN]={ + 677, /* "certicom-arc" */ + 517, /* "certificate extensions" */ + 883, /* "certificateRevocationList" */ ++958, /* "chacha20-poly1305" */ ++959, /* "chacha20-poly1305-draft" */ + 54, /* "challengePassword" */ + 407, /* "characteristic-two-field" */ + 395, /* "clearance" */ +diff --git a/crypto/objects/obj_mac.h b/crypto/objects/obj_mac.h +index 779c309..35a2364 100644 +--- a/crypto/objects/obj_mac.h ++++ b/crypto/objects/obj_mac.h +@@ -4047,6 +4047,14 @@ + #define LN_aes_256_cbc_hmac_sha256 "aes-256-cbc-hmac-sha256" + #define NID_aes_256_cbc_hmac_sha256 950 + ++#define SN_chacha20_poly1305 "CHACHA20-POLY1305" ++#define LN_chacha20_poly1305 "chacha20-poly1305" ++#define NID_chacha20_poly1305 958 ++ ++#define SN_chacha20_poly1305_draft "CHACHA20-POLY1305-D" ++#define LN_chacha20_poly1305_draft "chacha20-poly1305-draft" ++#define NID_chacha20_poly1305_draft 959 ++ + #define SN_dhpublicnumber "dhpublicnumber" + #define LN_dhpublicnumber "X9.42 DH" + #define NID_dhpublicnumber 920 +diff --git a/crypto/objects/obj_mac.num b/crypto/objects/obj_mac.num +index 8e5ea83..a3da329 100644 +--- a/crypto/objects/obj_mac.num ++++ b/crypto/objects/obj_mac.num +@@ -955,3 +955,5 @@ ct_cert_scts 954 + jurisdictionLocalityName 955 + jurisdictionStateOrProvinceName 956 + jurisdictionCountryName 957 ++chacha20_poly1305 958 ++chacha20_poly1305_draft 959 +diff --git a/crypto/objects/objects.txt b/crypto/objects/objects.txt +index b57aabb..6a34a33 100644 +--- a/crypto/objects/objects.txt ++++ b/crypto/objects/objects.txt +@@ -1294,6 +1294,8 @@ kisa 1 6 : SEED-OFB : seed-ofb + : AES-128-CBC-HMAC-SHA256 : aes-128-cbc-hmac-sha256 + : AES-192-CBC-HMAC-SHA256 : aes-192-cbc-hmac-sha256 + : AES-256-CBC-HMAC-SHA256 : aes-256-cbc-hmac-sha256 ++ : CHACHA20-POLY1305 : chacha20-poly1305 ++ : CHACHA20-POLY1305-D : chacha20-poly1305-draft + + ISO-US 10046 2 1 : dhpublicnumber : X9.42 DH + +diff --git a/ssl/s3_lib.c b/ssl/s3_lib.c +index 0385e03..65fdc59 100644 +--- a/ssl/s3_lib.c ++++ b/ssl/s3_lib.c +@@ -2945,6 +2945,110 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { + 256}, + #endif + ++#if !defined(OPENSSL_NO_CHACHA_POLY) ++/* Draft ciphers */ ++ { ++ 1, ++ TLS1_TXT_ECDHE_RSA_WITH_CHACHA20_POLY1305_D, ++ TLS1_CK_ECDHE_RSA_WITH_CHACHA20_POLY1305_D, ++ SSL_kEECDH, ++ SSL_aRSA, ++ SSL_CHACHA20POLY1305_D, ++ SSL_AEAD, ++ SSL_TLSV1_2, ++ SSL_HIGH, ++ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256, ++ 256, ++ 256, ++ }, ++ ++ { ++ 1, ++ TLS1_TXT_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_D, ++ TLS1_CK_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_D, ++ SSL_kEECDH, ++ SSL_aECDSA, ++ SSL_CHACHA20POLY1305_D, ++ SSL_AEAD, ++ SSL_TLSV1_2, ++ SSL_HIGH, ++ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256, ++ 256, ++ 256, ++ }, ++ ++ { ++ 1, ++ TLS1_TXT_DHE_RSA_WITH_CHACHA20_POLY1305_D, ++ TLS1_CK_DHE_RSA_WITH_CHACHA20_POLY1305_D, ++ SSL_kEDH, ++ SSL_aRSA, ++ SSL_CHACHA20POLY1305_D, ++ SSL_AEAD, ++ SSL_TLSV1_2, ++ SSL_HIGH, ++ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256, ++ 256, ++ 256, ++ }, ++ /* RFC ciphers */ ++ { ++ 1, ++ TLS1_TXT_ECDHE_RSA_WITH_CHACHA20_POLY1305, ++ TLS1_CK_ECDHE_RSA_WITH_CHACHA20_POLY1305, ++ SSL_kECDHE, ++ SSL_aRSA, ++ SSL_CHACHA20POLY1305, ++ SSL_AEAD, ++ SSL_TLSV1_2, ++ SSL_HIGH, ++ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256, ++ 256, ++ 256, ++ }, ++ { ++ 1, ++ TLS1_TXT_ECDHE_ECDSA_WITH_CHACHA20_POLY1305, ++ TLS1_CK_ECDHE_ECDSA_WITH_CHACHA20_POLY1305, ++ SSL_kECDHE, ++ SSL_aECDSA, ++ SSL_CHACHA20POLY1305, ++ SSL_AEAD, ++ SSL_TLSV1_2, ++ SSL_HIGH, ++ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256, ++ 256, ++ 256, ++ }, ++ { ++ 1, ++ TLS1_TXT_DHE_RSA_WITH_CHACHA20_POLY1305, ++ TLS1_CK_DHE_RSA_WITH_CHACHA20_POLY1305, ++ SSL_kDHE, ++ SSL_aRSA, ++ SSL_CHACHA20POLY1305, ++ SSL_AEAD, ++ SSL_TLSV1_2, ++ SSL_HIGH, ++ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256, ++ 256, ++ 256, ++ }, ++ { ++ 1, ++ TLS1_TXT_PSK_WITH_CHACHA20_POLY1305, ++ TLS1_CK_PSK_WITH_CHACHA20_POLY1305, ++ SSL_kPSK, ++ SSL_aPSK, ++ SSL_CHACHA20POLY1305, ++ SSL_AEAD, ++ SSL_TLSV1_2, ++ SSL_HIGH, ++ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256, ++ 256, ++ 256, ++ }, ++#endif + /* end of list */ + }; + +@@ -4090,6 +4194,7 @@ SSL_CIPHER *ssl3_choose_cipher(SSL *s, STACK_OF(SSL_CIPHER) *clnt, + int i, ii, ok; + CERT *cert; + unsigned long alg_k, alg_a, mask_k, mask_a, emask_k, emask_a; ++ int use_chacha = 0; + + /* Let's see which ciphers we can support */ + cert = s->cert; +@@ -4119,13 +4224,21 @@ SSL_CIPHER *ssl3_choose_cipher(SSL *s, STACK_OF(SSL_CIPHER) *clnt, + fprintf(stderr, "%p:%s\n", (void *)c, c->name); + } + #endif +- ++retry: + if (s->options & SSL_OP_CIPHER_SERVER_PREFERENCE || tls1_suiteb(s)) { + prio = srvr; + allow = clnt; ++ /* Use ChaCha20+Poly1305 iff it's client's most preferred cipher suite */ ++ if (sk_SSL_CIPHER_num(clnt) > 0) { ++ c = sk_SSL_CIPHER_value(clnt, 0); ++ if (c->algorithm_enc == SSL_CHACHA20POLY1305 || ++ c->algorithm_enc == SSL_CHACHA20POLY1305_D) ++ use_chacha = 1; ++ } + } else { + prio = clnt; + allow = srvr; ++ use_chacha = 1; + } + + tls1_set_cert_validity(s); +@@ -4137,6 +4250,11 @@ SSL_CIPHER *ssl3_choose_cipher(SSL *s, STACK_OF(SSL_CIPHER) *clnt, + if ((c->algorithm_ssl & SSL_TLSV1_2) && !SSL_USE_TLS1_2_CIPHERS(s)) + continue; + ++ /* Skip ChaCha unless top client priority */ ++ if ((c->algorithm_enc == SSL_CHACHA20POLY1305 || ++ c->algorithm_enc == SSL_CHACHA20POLY1305_D) && !use_chacha) ++ continue; ++ + ssl_set_cert_masks(cert, c); + mask_k = cert->mask_k; + mask_a = cert->mask_a; +@@ -4216,6 +4334,14 @@ SSL_CIPHER *ssl3_choose_cipher(SSL *s, STACK_OF(SSL_CIPHER) *clnt, + break; + } + } ++ ++ if (ret == NULL && !use_chacha) { ++ /* If no shared cipher was found due to some unusual preferences, try ++ * again with CHACHA enabled even if not top priority */ ++ use_chacha = 1; ++ goto retry; ++ } ++ + return (ret); + } + +diff --git a/ssl/ssl.h b/ssl/ssl.h +index 90aeb0c..f783baa 100644 +--- a/ssl/ssl.h ++++ b/ssl/ssl.h +@@ -297,6 +297,8 @@ extern "C" { + # define SSL_TXT_CAMELLIA128 "CAMELLIA128" + # define SSL_TXT_CAMELLIA256 "CAMELLIA256" + # define SSL_TXT_CAMELLIA "CAMELLIA" ++# define SSL_TXT_CHACHA20_D "CHACHA20-draft" ++# define SSL_TXT_CHACHA20 "CHACHA20" + + # define SSL_TXT_MD5 "MD5" + # define SSL_TXT_SHA1 "SHA1" +diff --git a/ssl/ssl_ciph.c b/ssl/ssl_ciph.c +index 2ad8f43..23c1c68 100644 +--- a/ssl/ssl_ciph.c ++++ b/ssl/ssl_ciph.c +@@ -164,11 +164,13 @@ + #define SSL_ENC_SEED_IDX 11 + #define SSL_ENC_AES128GCM_IDX 12 + #define SSL_ENC_AES256GCM_IDX 13 +-#define SSL_ENC_NUM_IDX 14 ++#define SSL_ENC_CHACHA20POLY1305_DRAFT_IDX 14 ++#define SSL_ENC_CHACHA20POLY1305_IDX 15 ++#define SSL_ENC_NUM_IDX 16 + + static const EVP_CIPHER *ssl_cipher_methods[SSL_ENC_NUM_IDX] = { + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +- NULL, NULL ++ NULL, NULL, NULL, NULL + }; + + #define SSL_COMP_NULL_IDX 0 +@@ -315,6 +317,8 @@ static const SSL_CIPHER cipher_aliases[] = { + {0, SSL_TXT_CAMELLIA256, 0, 0, 0, SSL_CAMELLIA256, 0, 0, 0, 0, 0, 0}, + {0, SSL_TXT_CAMELLIA, 0, 0, 0, SSL_CAMELLIA128 | SSL_CAMELLIA256, 0, 0, 0, + 0, 0, 0}, ++ {0, SSL_TXT_CHACHA20_D, 0, 0, 0, SSL_CHACHA20POLY1305_D, 0, 0, 0, 0, 0, 0}, ++ {0, SSL_TXT_CHACHA20, 0, 0, 0, SSL_CHACHA20POLY1305, 0, 0, 0, 0, 0, 0}, + + /* MAC aliases */ + {0, SSL_TXT_MD5, 0, 0, 0, 0, SSL_MD5, 0, 0, 0, 0, 0}, +@@ -431,6 +435,11 @@ void ssl_load_ciphers(void) + ssl_cipher_methods[SSL_ENC_AES256GCM_IDX] = + EVP_get_cipherbyname(SN_aes_256_gcm); + ++ ssl_cipher_methods[SSL_ENC_CHACHA20POLY1305_DRAFT_IDX] = ++ EVP_chacha20_poly1305_draft(); ++ ssl_cipher_methods[SSL_ENC_CHACHA20POLY1305_IDX] = ++ EVP_chacha20_poly1305(); ++ + ssl_digest_methods[SSL_MD_MD5_IDX] = EVP_get_digestbyname(SN_md5); + ssl_mac_secret_size[SSL_MD_MD5_IDX] = + EVP_MD_size(ssl_digest_methods[SSL_MD_MD5_IDX]); +@@ -581,6 +590,12 @@ int ssl_cipher_get_evp(const SSL_SESSION *s, const EVP_CIPHER **enc, + case SSL_AES256GCM: + i = SSL_ENC_AES256GCM_IDX; + break; ++ case SSL_CHACHA20POLY1305_D: ++ i = SSL_ENC_CHACHA20POLY1305_DRAFT_IDX; ++ break; ++ case SSL_CHACHA20POLY1305: ++ i = SSL_ENC_CHACHA20POLY1305_IDX; ++ break; + default: + i = -1; + break; +@@ -805,6 +820,12 @@ static void ssl_cipher_get_disabled(unsigned long *mkey, unsigned long *auth, + (ssl_cipher_methods[SSL_ENC_GOST89_IDX] == + NULL) ? SSL_eGOST2814789CNT : 0; + *enc |= (ssl_cipher_methods[SSL_ENC_SEED_IDX] == NULL) ? SSL_SEED : 0; ++ *enc |= ++ (ssl_cipher_methods[SSL_ENC_CHACHA20POLY1305_DRAFT_IDX] == ++ NULL) ? SSL_CHACHA20POLY1305_D : 0; ++ *enc |= ++ (ssl_cipher_methods[SSL_ENC_CHACHA20POLY1305_IDX] == ++ NULL) ? SSL_CHACHA20POLY1305 : 0; + + *mac |= (ssl_digest_methods[SSL_MD_MD5_IDX] == NULL) ? SSL_MD5 : 0; + *mac |= (ssl_digest_methods[SSL_MD_SHA1_IDX] == NULL) ? SSL_SHA1 : 0; +@@ -1824,6 +1845,12 @@ char *SSL_CIPHER_description(const SSL_CIPHER *cipher, char *buf, int len) + case SSL_eGOST2814789CNT: + enc = "GOST89(256)"; + break; ++ case SSL_CHACHA20POLY1305_D: ++ enc = "ChaCha20-Poly1305-draft"; ++ break; ++ case SSL_CHACHA20POLY1305: ++ enc = "ChaCha20-Poly1305"; ++ break; + default: + enc = "unknown"; + break; +diff --git a/ssl/ssl_locl.h b/ssl/ssl_locl.h +index 6df725f..dbe68f2 100644 +--- a/ssl/ssl_locl.h ++++ b/ssl/ssl_locl.h +@@ -354,6 +354,8 @@ + # define SSL_SEED 0x00000800L + # define SSL_AES128GCM 0x00001000L + # define SSL_AES256GCM 0x00002000L ++# define SSL_CHACHA20POLY1305_D 0x00040000L ++# define SSL_CHACHA20POLY1305 0x00080000L /* Value from openssl */ + + # define SSL_AES (SSL_AES128|SSL_AES256|SSL_AES128GCM|SSL_AES256GCM) + # define SSL_CAMELLIA (SSL_CAMELLIA128|SSL_CAMELLIA256) +diff --git a/ssl/tls1.h b/ssl/tls1.h +index 7e237d0..ff2e259 100644 +--- a/ssl/tls1.h ++++ b/ssl/tls1.h +@@ -563,6 +563,19 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb) + # define TLS1_CK_ECDH_RSA_WITH_AES_128_GCM_SHA256 0x0300C031 + # define TLS1_CK_ECDH_RSA_WITH_AES_256_GCM_SHA384 0x0300C032 + ++/* ChaCha20-Poly1305 ciphersuites draft-agl-tls-chacha20poly1305-01 */ ++# define TLS1_CK_ECDHE_RSA_WITH_CHACHA20_POLY1305_D 0x0300CC13 ++# define TLS1_CK_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_D 0x0300CC14 ++# define TLS1_CK_DHE_RSA_WITH_CHACHA20_POLY1305_D 0x0300CC15 ++/* ChaCha20-Poly1305 ciphersuites from RFC */ ++# define TLS1_CK_ECDHE_RSA_WITH_CHACHA20_POLY1305 0x0300CCA8 ++# define TLS1_CK_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 0x0300CCA9 ++# define TLS1_CK_DHE_RSA_WITH_CHACHA20_POLY1305 0x0300CCAA ++# define TLS1_CK_PSK_WITH_CHACHA20_POLY1305 0x0300CCAB ++# define TLS1_CK_ECDHE_PSK_WITH_CHACHA20_POLY1305 0x0300CCAC ++# define TLS1_CK_DHE_PSK_WITH_CHACHA20_POLY1305 0x0300CCAD ++# define TLS1_CK_RSA_PSK_WITH_CHACHA20_POLY1305 0x0300CCAE ++ + /* + * XXX * Backward compatibility alert: + * Older versions of OpenSSL gave + * some DHE ciphers names with "EDH" + * instead of "DHE". Going forward, we +@@ -713,6 +726,19 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb) + # define TLS1_TXT_ECDH_RSA_WITH_AES_128_GCM_SHA256 "ECDH-RSA-AES128-GCM-SHA256" + # define TLS1_TXT_ECDH_RSA_WITH_AES_256_GCM_SHA384 "ECDH-RSA-AES256-GCM-SHA384" + ++/* ChaCha20-Poly1305 ciphersuites draft-agl-tls-chacha20poly1305-01 */ ++# define TLS1_TXT_ECDHE_RSA_WITH_CHACHA20_POLY1305_D "ECDHE-RSA-CHACHA20-POLY1305-D" ++# define TLS1_TXT_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_D "ECDHE-ECDSA-CHACHA20-POLY1305-D" ++# define TLS1_TXT_DHE_RSA_WITH_CHACHA20_POLY1305_D "DHE-RSA-CHACHA20-POLY1305-D" ++/* Chacha20-Poly1305 ciphersuites from RFC */ ++# define TLS1_TXT_ECDHE_RSA_WITH_CHACHA20_POLY1305 "ECDHE-RSA-CHACHA20-POLY1305" ++# define TLS1_TXT_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 "ECDHE-ECDSA-CHACHA20-POLY1305" ++# define TLS1_TXT_DHE_RSA_WITH_CHACHA20_POLY1305 "DHE-RSA-CHACHA20-POLY1305" ++# define TLS1_TXT_PSK_WITH_CHACHA20_POLY1305 "PSK-CHACHA20-POLY1305" ++# define TLS1_TXT_ECDHE_PSK_WITH_CHACHA20_POLY1305 "ECDHE-PSK-CHACHA20-POLY1305" ++# define TLS1_TXT_DHE_PSK_WITH_CHACHA20_POLY1305 "DHE-PSK-CHACHA20-POLY1305" ++# define TLS1_TXT_RSA_PSK_WITH_CHACHA20_POLY1305 "RSA-PSK-CHACHA20-POLY1305" ++ + # define TLS_CT_RSA_SIGN 1 + # define TLS_CT_DSS_SIGN 2 + # define TLS_CT_RSA_FIXED_DH 3 +-- +2.10.1 + |