diff --git a/crypto/fipsmodule/aes/asm/aesni-xts-avx512.pl b/crypto/fipsmodule/aes/asm/aesni-xts-avx512.pl index 3800dfa1a9..b47a25fbb1 100644 --- a/crypto/fipsmodule/aes/asm/aesni-xts-avx512.pl +++ b/crypto/fipsmodule/aes/asm/aesni-xts-avx512.pl @@ -1,5 +1,5 @@ #! /usr/bin/env perl -# Copyright (C) 2023 Intel Corporation +# Copyright (C) 2025 Intel Corporation # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy @@ -48,48 +48,33 @@ if ($avx512vaes) { - my $GP_STORAGE = $win64 ? (16 * 33) : (16 * 23); # store rbx - my $XMM_STORAGE = $win64 ? (16 * 23) : 0; # store xmm6:xmm15 - my $VARIABLE_OFFSET = $win64 ? (16 *8 + 16* 15 + 16 * 10 + 8*3) : - (16*8 + 16 * 15 + 8 * 1); + my $GP_STORAGE = $win64 ? (16 * 18) : (16 * 8); # store rbx + my $XMM_STORAGE = $win64 ? (16 * 8) : 0; # store xmm6:xmm15 + my $VARIABLE_OFFSET = $win64 ? (16*8 + 16*10 + 8*3) : + (16*8 + 8*1); + # All usages of rsp should be invoked via $TW, not shadowed by any + # other name or used directly. my $TW = "%rsp"; - my $TWTEMPH = "%rbx"; - my $TWTEMPL = "%rax"; + my $TEMPHIGH = "%rbx"; + my $TEMPLOW = "%rax"; my $ZPOLY = "%zmm25"; # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; # ;;; Function arguments abstraction # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; my ($key2, $key1, $tweak, $length, $input, $output); - - if ($win64) { - $input = "%rcx"; - $output = "%rdx"; - $length = "%r8"; - $key1 = "%r9"; - $key2 = "%r10"; - $tweak = "%r11"; - } else { - $input = "%rdi"; - $output = "%rsi"; - $length = "%rdx"; - $key1 = "%rcx"; - $key2 = "%r8"; - $tweak = "%r9"; - } + $input = "%rdi"; + $output = "%rsi"; + $length = "%rdx"; + $key1 = "%rcx"; + $key2 = "%r8"; + $tweak = "%r9"; # arguments for temp parameters - my ($tmp1, $gf_poly_8b, $gf_poly_8b_temp); - if ($win64) { - $tmp1 = "%r10"; - $gf_poly_8b = "%rdi"; - $gf_poly_8b_temp = "%rsi"; - } else { - $tmp1 = "%r8"; - $gf_poly_8b = "%r10"; - $gf_poly_8b_temp = "%r11"; - } + my $tmp1 = "%r8"; + my $gf_poly_8b = "%r10"; + my $decLength = "%r11"; # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; # ;;; Helper functions @@ -107,787 +92,107 @@ # ; Seed the RNG so the labels are generated deterministically srand(12345); - sub encrypt_tweak_for_encryption { - my $key2 = $_[0]; - my $state_tweak = $_[1]; - my $key1 = $_[2]; - my $raw_key = $_[3]; - my $tmp = $_[4]; - my $ptr_key2 = $_[5]; - my $ptr_key1 = $_[6]; - my $ptr_expanded_keys = $_[7]; + sub encrypt_tweak { + my $state_tweak = $_[0]; $code.=<<___; - vmovdqu ($ptr_key2), $key2 - vpxor $key2, $state_tweak, $state_tweak # AddRoundKey(ARK) for tweak encryption - - vmovdqu ($ptr_key1), $key1 - vmovdqa $key1, 0x80($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0x10($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 1 for tweak encryption - - vmovdqu 0x10($ptr_key1), $key1 - vmovdqa $key1, 0x90($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0x20($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 2 for tweak encryption - - vmovdqu 0x20($ptr_key1), $key1 - vmovdqa $key1, 0xa0($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0x30($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 3 for tweak encryption - - vmovdqu 0x30($ptr_key1), $key1 - vmovdqa $key1, 0xb0($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0x40($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 4 for tweak encryption - - vmovdqu 0x40($ptr_key1), $key1 - vmovdqa $key1, 
0xc0($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0x50($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 5 for tweak encryption - - vmovdqu 0x50($ptr_key1), $key1 - vmovdqa $key1, 0xd0($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0x60($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 6 for tweak encryption - - vmovdqu 0x60($ptr_key1), $key1 - vmovdqa $key1, 0xe0($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0x70($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 7 for tweak encryption - - vmovdqu 0x70($ptr_key1), $key1 - vmovdqa $key1, 0xf0($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0x80($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 8 for tweak encryption - - vmovdqu 0x80($ptr_key1), $key1 - vmovdqa $key1, 0x100($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0x90($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 9 for tweak encryption - - vmovdqu 0x90($ptr_key1), $key1 - vmovdqa $key1, 0x110($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0xa0($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 10 for tweak encryption - - vmovdqu 0xa0($ptr_key1), $key1 - vmovdqa $key1, 0x120($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0xb0($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 11 for tweak encryption - - vmovdqu 0xb0($ptr_key1), $key1 - vmovdqa $key1, 0x130($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0xc0($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 12 for tweak encryption - - vmovdqu 0xc0($ptr_key1), $key1 - vmovdqa $key1, 0x140($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0xd0($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 13 for tweak encryption - - vmovdqu 0xd0($ptr_key1), $key1 - vmovdqa $key1, 0x150($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0xe0($ptr_key2), $key2 - vaesenclast $key2, $state_tweak, $state_tweak # round 14 for tweak encryption - - vmovdqu 0xe0($ptr_key1), $key1 - vmovdqa $key1, 0x160($ptr_expanded_keys) # store round keys in stack - - vmovdqa $state_tweak, ($ptr_expanded_keys) # Store the encrypted Tweak value -___ - } - - sub initialize { - my @st; - $st[0] = $_[0]; - $st[1] = $_[1]; - $st[2] = $_[2]; - $st[3] = $_[3]; - $st[4] = $_[4]; - $st[5] = $_[5]; - $st[6] = $_[6]; - $st[7] = $_[7]; - - my @tw; - $tw[0] = $_[8]; - $tw[1] = $_[9]; - $tw[2] = $_[10]; - $tw[3] = $_[11]; - $tw[4] = $_[12]; - $tw[5] = $_[13]; - $tw[6] = $_[14]; - my $num_initial_blocks = $_[15]; - - $code .= <<___; - vmovdqa 0x0($TW), $tw[0] - mov 0x0($TW), $TWTEMPL - mov 0x08($TW), $TWTEMPH - vmovdqu 0x0($input), $st[0] -___ - - if ($num_initial_blocks >= 2) { - for (my $i = 1; $i < $num_initial_blocks; $i++) { - $code .= "xor $gf_poly_8b_temp, $gf_poly_8b_temp\n"; - $code .= "shl \$1, $TWTEMPL\n"; - $code .= "adc $TWTEMPH, $TWTEMPH\n"; - $code .= "cmovc $gf_poly_8b, $gf_poly_8b_temp\n"; - $code .= "xor $gf_poly_8b_temp, $TWTEMPL\n"; - - my $offset = $i * 16; - $code .= "mov $TWTEMPL, $offset($TW)\n"; - $code .= "mov $TWTEMPH, `$offset + 8`($TW)\n"; - $code .= "vmovdqa $offset($TW), $tw[$i]\n"; - $code .= "vmovdqu $offset($input), $st[$i]\n"; - } - } - } - - # encrypt initial blocks of AES - # 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted - # next 8 Tweak values are generated - sub encrypt_initial { - my @st; - $st[0] = $_[0]; - $st[1] = $_[1]; - $st[2] = $_[2]; - $st[3] = 
$_[3]; - $st[4] = $_[4]; - $st[5] = $_[5]; - $st[6] = $_[6]; - $st[7] = $_[7]; - - my @tw; - $tw[0] = $_[8]; - $tw[1] = $_[9]; - $tw[2] = $_[10]; - $tw[3] = $_[11]; - $tw[4] = $_[12]; - $tw[5] = $_[13]; - $tw[6] = $_[14]; - my $t0 = $_[15]; - my $num_blocks = $_[16]; - my $lt128 = $_[17]; - - # num_blocks blocks encrypted - # num_blocks can be 1, 2, 3, 4, 5, 6, 7 + vpxor ($key2), $state_tweak, $state_tweak + vaesenc 0x10($key2), $state_tweak, $state_tweak + vaesenc 0x20($key2), $state_tweak, $state_tweak + vaesenc 0x30($key2), $state_tweak, $state_tweak + vaesenc 0x40($key2), $state_tweak, $state_tweak + vaesenc 0x50($key2), $state_tweak, $state_tweak + vaesenc 0x60($key2), $state_tweak, $state_tweak + vaesenc 0x70($key2), $state_tweak, $state_tweak + vaesenc 0x80($key2), $state_tweak, $state_tweak + vaesenc 0x90($key2), $state_tweak, $state_tweak + vaesenc 0xa0($key2), $state_tweak, $state_tweak + vaesenc 0xb0($key2), $state_tweak, $state_tweak + vaesenc 0xc0($key2), $state_tweak, $state_tweak + vaesenc 0xd0($key2), $state_tweak, $state_tweak + vaesenclast 0xe0($key2), $state_tweak, $state_tweak + vmovdqa $state_tweak, ($TW) +___ + } + + sub encrypt_final { + my $st = $_[0]; + my $tw = $_[1]; # xor Tweak value - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vpxor $tw[$i], $st[$i], $st[$i]\n"; - } - $code .= "vmovdqa 0x80($TW), $t0\n"; - - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vpxor $t0, $st[$i], $st[$i]\n"; - } - - if (0 == $lt128) { - $code .= <<___; - xor $gf_poly_8b_temp, $gf_poly_8b_temp - shl \$1, $TWTEMPL - adc $TWTEMPH, $TWTEMPH -___ - } - # round 1 - $code .= "vmovdqa 0x90($TW), $t0\n"; - - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; - } - - if (0 == $lt128) { - $code .= <<___; - cmovc $gf_poly_8b, $gf_poly_8b_temp - xor $gf_poly_8b_temp, $TWTEMPL - mov $TWTEMPL, 0x0($TW) # next Tweak1 generated - mov $TWTEMPL, 0x08($TW) - xor $gf_poly_8b_temp, $gf_poly_8b_temp -___ - } - - # round 2 - $code .= "vmovdqa 0xa0($TW), $t0\n"; - - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; - } - - if (0 == $lt128) { - $code .= <<___; - shl \$1, $TWTEMPL - adc $TWTEMPH, $TWTEMPH - cmovc $gf_poly_8b, $gf_poly_8b_temp - xor $gf_poly_8b_temp, $TWTEMPL - mov $TWTEMPL, 0x10($TW) # next Tweak2 generated -___ - } - - # round 3 - $code .= "vmovdqa 0xb0($TW), $t0\n"; - - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; - } - - if (0 == $lt128) { - $code .= <<___; - mov $TWTEMPH, 0x18($TW) - xor $gf_poly_8b_temp, $gf_poly_8b_temp - shl \$1, $TWTEMPL - adc $TWTEMPH, $TWTEMPH - cmovc $gf_poly_8b, $gf_poly_8b_temp -___ - } - - # round 4 - $code .= "vmovdqa 0xc0($TW), $t0\n"; - - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; - } - - if (0 == $lt128) { - $code .= <<___; - xor $gf_poly_8b_temp, $TWTEMPL - mov $TWTEMPL, 0x20($TW) # next Tweak3 generated - mov $TWTEMPH, 0x28($TW) - xor $gf_poly_8b_temp, $gf_poly_8b_temp - shl \$1, $TWTEMPL -___ - } - - # round 5 - $code .= "vmovdqa 0xd0($TW), $t0\n"; - - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; - } - - if (0 == $lt128) { - $code .= <<___; - adc $TWTEMPH, $TWTEMPH - cmovc $gf_poly_8b, $gf_poly_8b_temp - xor $gf_poly_8b_temp, $TWTEMPL - mov $TWTEMPL, 0x30($TW) # next Tweak4 generated - mov $TWTEMPH, 0x38($TW) -___ - } - - # round 6 - $code .= "vmovdqa 0xe0($TW), $t0\n"; - - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesenc 
$t0, $st[$i], $st[$i]\n"; - } - - if (0 == $lt128) { - $code .= <<___; - xor $gf_poly_8b_temp, $gf_poly_8b_temp - shl \$1, $TWTEMPL - adc $TWTEMPH, $TWTEMPH - cmovc $gf_poly_8b, $gf_poly_8b_temp - xor $gf_poly_8b_temp, $TWTEMPL - mov $TWTEMPL, 0x40($TW) # next Tweak5 generated - mov $TWTEMPH, 0x48($TW) -___ - } - - # round 7 - $code .= "vmovdqa 0xf0($TW), $t0\n"; - - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; - } - - if (0 == $lt128) { - $code .= <<___; - xor $gf_poly_8b_temp, $gf_poly_8b_temp - shl \$1, $TWTEMPL - adc $TWTEMPH, $TWTEMPH - cmovc $gf_poly_8b, $gf_poly_8b_temp - xor $gf_poly_8b_temp, $TWTEMPL - mov $TWTEMPL, 0x50($TW) # next Tweak6 generated - mov $TWTEMPH, 0x58($TW) -___ - } - - # round 8 - $code .= "vmovdqa 0x100($TW), $t0\n"; - - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; - } + $code .= "vpxor $tw, $st, $st\n"; + $code .= "vpxor ($key1), $st, $st\n"; - if (0 == $lt128) { - $code .= <<___; - xor $gf_poly_8b_temp, $gf_poly_8b_temp - shl \$1, $TWTEMPL - adc $TWTEMPH, $TWTEMPH - cmovc $gf_poly_8b, $gf_poly_8b_temp - xor $gf_poly_8b_temp, $TWTEMPL - mov $TWTEMPL, 0x60($TW) # next Tweak7 generated - mov $TWTEMPH, 0x68($TW) -___ - } - - # round 9 - $code .= "vmovdqa 0x110($TW), $t0\n"; - - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; + for (my $i = 1; $i < 14; $i++) { + $code .= "vaesenc 16*$i($key1), $st, $st\n"; } - if (0 == $lt128) { - $code .= <<___; - xor $gf_poly_8b_temp, $gf_poly_8b_temp - shl \$1, $TWTEMPL - adc $TWTEMPH, $TWTEMPH - cmovc $gf_poly_8b, $gf_poly_8b_temp - xor $gf_poly_8b_temp, $TWTEMPL - mov $TWTEMPL, 0x70($TW) # next Tweak8 generated - mov $TWTEMPH, 0x78($TW) + $code .=<<___; + vaesenclast 16*14($key1), $st, $st + vpxor $tw, $st, $st ___ - } - - # round 10 - $code .= "vmovdqa 0x120($TW), $t0\n"; - - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; - } - - # round 11 - $code .= "vmovdqa 0x130($TW), $t0\n"; - - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; - } - - # round 12 - $code .= "vmovdqa 0x140($TW), $t0\n"; - - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; - } - - # round 13 - $code .= "vmovdqa 0x150($TW), $t0\n"; - - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; - } - - # round 14 - $code .= "vmovdqa 0x160($TW), $t0\n"; - - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesenclast $t0, $st[$i], $st[$i]\n"; - } - - # xor Tweak values - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vpxor $tw[$i], $st[$i], $st[$i]\n"; - } - - if (0 == $lt128) { - # load next Tweak values - $code .= <<___; - vmovdqa 0x0($TW), $tw[0] - vmovdqa 0x10($TW), $tw[1] - vmovdqa 0x20($TW), $tw[2] - vmovdqa 0x30($TW), $tw[3] - vmovdqa 0x40($TW), $tw[4] - vmovdqa 0x50($TW), $tw[5] - vmovdqa 0x60($TW), $tw[6] -___ - } } - sub encrypt_tweak_for_decryption { - my $key2 = $_[0]; - my $state_tweak = $_[1]; - my $key1 = $_[2]; - my $raw_key = $_[3]; - my $tmp = $_[4]; - my $ptr_key2 = $_[5]; - my $ptr_key1 = $_[6]; - my $ptr_expanded_keys = $_[7]; - - $code.=<<___; - vmovdqu ($ptr_key2), $key2 - vpxor $key2, $state_tweak, $state_tweak # ARK for tweak encryption - - vmovdqu 0xe0($ptr_key1), $key1 - vmovdqa $key1, 0x160($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0x10($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 1 for tweak encryption - - 
vmovdqu 0xd0($ptr_key1), $key1 - vmovdqa $key1, 0x150($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0x20($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 2 for tweak encryption - - vmovdqu 0xc0($ptr_key1), $key1 - vmovdqa $key1, 0x140($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0x30($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 3 for tweak encryption - - vmovdqu 0xb0($ptr_key1), $key1 - vmovdqa $key1, 0x130($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0x40($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 4 for tweak encryption - - vmovdqu 0xa0($ptr_key1), $key1 - vmovdqa $key1, 0x120($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0x50($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 5 for tweak encryption - - vmovdqu 0x90($ptr_key1), $key1 - vmovdqa $key1, 0x110($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0x60($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 6 for tweak encryption - - vmovdqu 0x80($ptr_key1), $key1 - vmovdqa $key1, 0x100($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0x70($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 7 for tweak encryption - - vmovdqu 0x70($ptr_key1), $key1 - vmovdqa $key1, 0xf0($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0x80($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 8 for tweak encryption - - vmovdqu 0x60($ptr_key1), $key1 - vmovdqa $key1, 0xe0($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0x90($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 9 for tweak encryption - - vmovdqu 0x50($ptr_key1), $key1 - vmovdqa $key1, 0xd0($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0xa0($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 10 for tweak encryption - - vmovdqu 0x40($ptr_key1), $key1 - vmovdqa $key1, 0xc0($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0xb0($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 11 for tweak encryption - - vmovdqu 0x30($ptr_key1), $key1 - vmovdqa $key1, 0xb0($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0xc0($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 12 for tweak encryption - - vmovdqu 0x20($ptr_key1), $key1 - vmovdqa $key1, 0xa0($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0xd0($ptr_key2), $key2 - vaesenc $key2, $state_tweak, $state_tweak # round 13 for tweak encryption - - vmovdqu 0x10($ptr_key1), $key1 - vmovdqa $key1, 0x90($ptr_expanded_keys) # store round keys in stack - - vmovdqu 0xe0($ptr_key2), $key2 - vaesenclast $key2, $state_tweak, $state_tweak # round 14 for tweak encryption - - vmovdqu ($ptr_key1), $key1 - vmovdqa $key1, 0x80($ptr_expanded_keys) # store round keys in stack - - vmovdqa $state_tweak, ($ptr_expanded_keys) # Store the encrypted Tweak value -___ - } - - # decrypt initial blocks of AES - # 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted - # next 8 Tweak values are generated - sub decrypt_initial { - my @st; - $st[0] = $_[0]; - $st[1] = $_[1]; - $st[2] = $_[2]; - $st[3] = $_[3]; - $st[4] = $_[4]; - $st[5] = $_[5]; - $st[6] = $_[6]; - $st[7] = $_[7]; - - my @tw; - $tw[0] = $_[8]; - $tw[1] = $_[9]; - $tw[2] = $_[10]; - $tw[3] = $_[11]; - $tw[4] = $_[12]; - $tw[5] = $_[13]; - $tw[6] = $_[14]; - my $t0 = $_[15]; - my $num_blocks = $_[16]; - my $lt128 = $_[17]; - - # num_blocks blocks encrypted - # 
num_blocks can be 1, 2, 3, 4, 5, 6, 7 - - # xor Tweak value - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vpxor $tw[$i], $st[$i], $st[$i]\n"; - } - - $code .= "vmovdqa 0x80($TW), $t0\n"; - - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vpxor $t0, $st[$i], $st[$i]\n"; - } - - if (0 == $lt128) { - $code .= <<___; - xor $gf_poly_8b_temp, $gf_poly_8b_temp - shl \$1, $TWTEMPL - adc $TWTEMPH, $TWTEMPH -___ - } - # round 1 - $code .= "vmovdqa 0x90($TW), $t0\n"; - - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; - } - - if (0 == $lt128) { - $code .= <<___; - cmovc $gf_poly_8b, $gf_poly_8b_temp - xor $gf_poly_8b_temp, $TWTEMPL - mov $TWTEMPL, ($TW) # next Tweak1 generated - mov $TWTEMPL, 0x08($TW) - xor $gf_poly_8b_temp, $gf_poly_8b_temp -___ - } - - # round 2 - $code .= "vmovdqa 0xa0($TW), $t0\n"; - - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; - } - - if (0 == $lt128) { - $code .= <<___; - shl \$1, $TWTEMPL - adc $TWTEMPH, $TWTEMPH - cmovc $gf_poly_8b, $gf_poly_8b_temp - xor $gf_poly_8b_temp, $TWTEMPL - mov $TWTEMPL, 0x10($TW) # next Tweak2 generated -___ - } - - # round 3 - $code .= "vmovdqa 0xb0($TW), $t0\n"; + sub decrypt_final { + my $st = $_[0]; + my $tw = $_[1]; - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; - } - - if (0 == $lt128) { - $code .= <<___; - mov $TWTEMPH, 0x18($TW) - xor $gf_poly_8b_temp, $gf_poly_8b_temp - shl \$1, $TWTEMPL - adc $TWTEMPH, $TWTEMPH - cmovc $gf_poly_8b, $gf_poly_8b_temp -___ - } - - # round 4 - $code .= "vmovdqa 0xc0($TW), $t0\n"; - - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; - } - - if (0 == $lt128) { - $code .= <<___; - xor $gf_poly_8b_temp, $TWTEMPL - mov $TWTEMPL, 0x20($TW) # next Tweak3 generated - mov $TWTEMPH, 0x28($TW) - xor $gf_poly_8b_temp, $gf_poly_8b_temp - shl \$1, $TWTEMPL -___ - } - - # round 5 - $code .= "vmovdqa 0xd0($TW), $t0\n"; - - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; - } - - if (0 == $lt128) { - $code .= <<___; - adc $TWTEMPH, $TWTEMPH - cmovc $gf_poly_8b, $gf_poly_8b_temp - xor $gf_poly_8b_temp, $TWTEMPL - mov $TWTEMPL, 0x30($TW) # next Tweak4 generated - mov $TWTEMPH, 0x38($TW) -___ - } - - # round 6 - $code .= "vmovdqa 0xe0($TW), $t0\n"; - - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; - } - - if (0 == $lt128) { - $code .= <<___; - xor $gf_poly_8b_temp, $gf_poly_8b_temp - shl \$1, $TWTEMPL - adc $TWTEMPH, $TWTEMPH - cmovc $gf_poly_8b, $gf_poly_8b_temp - xor $gf_poly_8b_temp, $TWTEMPL - mov $TWTEMPL, 0x40($TW) # next Tweak5 generated - mov $TWTEMPH, 0x48($TW) -___ - } - - # round 7 - $code .= "vmovdqa 0xf0($TW), $t0\n"; - - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; - } - - if (0 == $lt128) { - $code .= <<___; - xor $gf_poly_8b_temp, $gf_poly_8b_temp - shl \$1, $TWTEMPL - adc $TWTEMPH, $TWTEMPH - cmovc $gf_poly_8b, $gf_poly_8b_temp - xor $gf_poly_8b_temp, $TWTEMPL - mov $TWTEMPL, 0x50($TW) # next Tweak6 generated - mov $TWTEMPH, 0x58($TW) -___ - } - - # round 8 - $code .= "vmovdqa 0x100($TW), $t0\n"; + # xor Tweak value + $code .= "vpxor $tw, $st, $st\n"; + $code .= "vpxor ($key1), $st, $st\n"; - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; + for (my $i = 1; $i < 14; $i++) { + $code .= "vaesdec 16*$i($key1), $st, $st\n"; } - if (0 == $lt128) { - $code .= <<___; - xor 
$gf_poly_8b_temp, $gf_poly_8b_temp - shl \$1, $TWTEMPL - adc $TWTEMPH, $TWTEMPH - cmovc $gf_poly_8b, $gf_poly_8b_temp - xor $gf_poly_8b_temp, $TWTEMPL - mov $TWTEMPL, 0x60($TW) # next Tweak7 generated - mov $TWTEMPH, 0x68($TW) + $code .=<<___; + vaesdeclast 16*14($key1), $st, $st + vpxor $tw, $st, $st ___ - } + } - # round 9 - $code .= "vmovdqa 0x110($TW), $t0\n"; + # Encrypt 4 blocks in parallel + sub encrypt_by_four { + my $st1 = $_[0]; # state 1 + my $tw1 = $_[1]; # tweak 1 + my $tmp = $_[2]; - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; - } + $code .= "vbroadcasti32x4 ($key1), $tmp\n"; + $code .= "vpternlogq \$0x96, $tmp, $tw1, $st1\n"; - if (0 == $lt128) { - $code .= <<___; - xor $gf_poly_8b_temp, $gf_poly_8b_temp - shl \$1, $TWTEMPL - adc $TWTEMPH, $TWTEMPH - cmovc $gf_poly_8b, $gf_poly_8b_temp - xor $gf_poly_8b_temp, $TWTEMPL - mov $TWTEMPL, 0x70($TW) # next Tweak8 generated - mov $TWTEMPH, 0x78($TW) -___ + for (my $i = 1; $i < 14; $i++) { + $code .= "vbroadcasti32x4 16*$i($key1), $tmp\n"; + $code .= "vaesenc $tmp, $st1, $st1\n"; } - # round 10 - $code .= "vmovdqa 0x120($TW), $t0\n"; - - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; - } + $code .= "vbroadcasti32x4 16*14($key1), $tmp\n"; + $code .= "vaesenclast $tmp, $st1, $st1\n"; - # round 11 - $code .= "vmovdqa 0x130($TW), $t0\n"; - - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; - } - - # round 12 - $code .= "vmovdqa 0x140($TW), $t0\n"; - - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; - } - - # round 13 - $code .= "vmovdqa 0x150($TW), $t0\n"; + $code .= "vpxorq $tw1, $st1, $st1\n"; + } - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; - } + sub decrypt_by_four { + my $st1 = $_[0]; # state 1 + my $tw1 = $_[1]; # tweak 1 + my $tmp = $_[2]; - # round 14 - $code .= "vmovdqa 0x160($TW), $t0\n"; + $code .= "vbroadcasti32x4 ($key1), $tmp\n"; + $code .= "vpternlogq \$0x96, $tmp, $tw1, $st1\n"; - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vaesdeclast $t0, $st[$i], $st[$i]\n"; + for (my $i = 1; $i < 14; $i++) { + $code .= "vbroadcasti32x4 16*$i($key1), $tmp\n"; + $code .= "vaesdec $tmp, $st1, $st1\n"; } - # xor Tweak values - for (my $i = 0; $i < $num_blocks; $i++) { - $code .= "vpxor $tw[$i], $st[$i], $st[$i]\n"; - } + $code .= "vbroadcasti32x4 16*14($key1), $tmp\n"; + $code .= "vaesdeclast $tmp, $st1, $st1\n"; - if (0 == $lt128) { - # load next Tweak values - $code .= <<___; - vmovdqa ($TW), $tw1 - vmovdqa 0x10($TW), $tw2 - vmovdqa 0x20($TW), $tw3 - vmovdqa 0x30($TW), $tw4 - vmovdqa 0x40($TW), $tw5 - vmovdqa 0x50($TW), $tw6 - vmovdqa 0x60($TW), $tw7 -___ - } + $code .= "vpxorq $tw1, $st1, $st1\n"; } # Encrypt 8 blocks in parallel # generate next 8 tweak values - sub encrypt_by_eight_zmm { + sub encrypt_by_eight { my $st1 = $_[0]; my $st2 = $_[1]; my $tw1 = $_[2]; @@ -896,14 +201,9 @@ my $last_eight = $_[5]; $code .= <<___; - # xor Tweak values - vpxorq $tw1, $st1, $st1 - vpxorq $tw2, $st2, $st2 - - # ARK - vbroadcasti32x4 0x80($TW), $t0 - vpxorq $t0, $st1, $st1 - vpxorq $t0, $st2, $st2 + vbroadcasti32x4 ($key1), $t0 + vpternlogq \$0x96, $t0, $tw1, $st1 + vpternlogq \$0x96, $t0, $tw2, $st2 ___ if (0 == $last_eight) { @@ -916,17 +216,17 @@ } # round 1 $code .= <<___; - vbroadcasti32x4 0x90($TW), $t0 + vbroadcasti32x4 0x10($key1), $t0 vaesenc $t0, $st1, $st1 vaesenc $t0, $st2, $st2 # round 2 - vbroadcasti32x4 0xa0($TW), 
$t0 + vbroadcasti32x4 0x20($key1), $t0 vaesenc $t0, $st1, $st1 vaesenc $t0, $st2, $st2 # round 3 - vbroadcasti32x4 0xb0($TW), $t0 + vbroadcasti32x4 0x30($key1), $t0 vaesenc $t0, $st1, $st1 vaesenc $t0, $st2, $st2 ___ @@ -942,73 +242,74 @@ $code .= <<___; # round 4 - vbroadcasti32x4 0xc0($TW), $t0 + vbroadcasti32x4 0x40($key1), $t0 vaesenc $t0, $st1, $st1 vaesenc $t0, $st2, $st2 # round 5 - vbroadcasti32x4 0xd0($TW), $t0 + vbroadcasti32x4 0x50($key1), $t0 vaesenc $t0, $st1, $st1 vaesenc $t0, $st2, $st2 # round 6 - vbroadcasti32x4 0xe0($TW), $t0 + vbroadcasti32x4 0x60($key1), $t0 vaesenc $t0, $st1, $st1 vaesenc $t0, $st2, $st2 # round 7 - vbroadcasti32x4 0xf0($TW), $t0 + vbroadcasti32x4 0x70($key1), $t0 vaesenc $t0, $st1, $st1 vaesenc $t0, $st2, $st2 # round 8 - vbroadcasti32x4 0x100($TW), $t0 + vbroadcasti32x4 0x80($key1), $t0 vaesenc $t0, $st1, $st1 vaesenc $t0, $st2, $st2 # round 9 - vbroadcasti32x4 0x110($TW), $t0 + vbroadcasti32x4 0x90($key1), $t0 vaesenc $t0, $st1, $st1 vaesenc $t0, $st2, $st2 # round 10 - vbroadcasti32x4 0x120($TW), $t0 + vbroadcasti32x4 0xa0($key1), $t0 vaesenc $t0, $st1, $st1 vaesenc $t0, $st2, $st2 # round 11 - vbroadcasti32x4 0x130($TW), $t0 + vbroadcasti32x4 0xb0($key1), $t0 vaesenc $t0, $st1, $st1 vaesenc $t0, $st2, $st2 # round 12 - vbroadcasti32x4 0x140($TW), $t0 + vbroadcasti32x4 0xc0($key1), $t0 vaesenc $t0, $st1, $st1 vaesenc $t0, $st2, $st2 # round 13 - vbroadcasti32x4 0x150($TW), $t0 + vbroadcasti32x4 0xd0($key1), $t0 vaesenc $t0, $st1, $st1 vaesenc $t0, $st2, $st2 # round 14 - vbroadcasti32x4 0x160($TW), $t0 - vaesenclast $t0, $st1, $st1 - vaesenclast $t0, $st2, $st2 - + vbroadcasti32x4 0xe0($key1), $t0 + vaesenclast $t0, $st1, $st1 + vaesenclast $t0, $st2, $st2 + vpxorq $tw1, $st1, $st1 + vpxorq $tw2, $st2, $st2 +___ # xor Tweak values - vpxorq $tw1, $st1, $st1 - vpxorq $tw2, $st2, $st2 - - # load next Tweak values - vmovdqa32 %zmm15, $tw1 - vmovdqa32 %zmm16, $tw2 + if (0 == $last_eight) { + # load next Tweak values + $code .= <<___; + vmovdqa32 %zmm15, $tw1 + vmovdqa32 %zmm16, $tw2 ___ + } } - # Decrypt 8 blocks in parallel - # generate next 8 tweak values - sub decrypt_by_eight_zmm { + # Decrypt 8 blocks in paralle and generate next 8 tweak values. 
+ sub decrypt_by_eight { my $st1 = $_[0]; my $st2 = $_[1]; my $tw1 = $_[2]; @@ -1017,37 +318,32 @@ my $last_eight = $_[5]; $code .= <<___; - # xor Tweak values - vpxorq $tw1, $st1, $st1 - vpxorq $tw2, $st2, $st2 - - # ARK - vbroadcasti32x4 0x80($TW), $t0 - vpxorq $t0, $st1, $st1 - vpxorq $t0, $st2, $st2 + vbroadcasti32x4 ($key1), $t0 + vpternlogq \$0x96, $t0, $tw1, $st1 + vpternlogq \$0x96, $t0, $tw2, $st2 ___ if (0 == $last_eight) { $code .= <<___; vpsrldq \$0xf, $tw1, %zmm13 - vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 + vpclmulqdq \$0x0, $ZPOLY, %zmm13, %zmm14 vpslldq \$0x1, $tw1, %zmm15 vpxord %zmm14, %zmm15, %zmm15 ___ } # round 1 $code .= <<___; - vbroadcasti32x4 0x90($TW), $t0 + vbroadcasti32x4 0x10($key1), $t0 vaesdec $t0, $st1, $st1 vaesdec $t0, $st2, $st2 # round 2 - vbroadcasti32x4 0xa0($TW), $t0 + vbroadcasti32x4 0x20($key1), $t0 vaesdec $t0, $st1, $st1 vaesdec $t0, $st2, $st2 # round 3 - vbroadcasti32x4 0xb0($TW), $t0 + vbroadcasti32x4 0x30($key1), $t0 vaesdec $t0, $st1, $st1 vaesdec $t0, $st2, $st2 ___ @@ -1055,7 +351,7 @@ if (0 == $last_eight) { $code .= <<___; vpsrldq \$0xf, $tw2, %zmm13 - vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 + vpclmulqdq \$0x0, $ZPOLY, %zmm13, %zmm14 vpslldq \$0x1, $tw2, %zmm16 vpxord %zmm14, %zmm16, %zmm16 ___ @@ -1063,73 +359,74 @@ $code .= <<___; # round 4 - vbroadcasti32x4 0xc0($TW), $t0 + vbroadcasti32x4 0x40($key1), $t0 vaesdec $t0, $st1, $st1 vaesdec $t0, $st2, $st2 # round 5 - vbroadcasti32x4 0xd0($TW), $t0 + vbroadcasti32x4 0x50($key1), $t0 vaesdec $t0, $st1, $st1 vaesdec $t0, $st2, $st2 # round 6 - vbroadcasti32x4 0xe0($TW), $t0 + vbroadcasti32x4 0x60($key1), $t0 vaesdec $t0, $st1, $st1 vaesdec $t0, $st2, $st2 # round 7 - vbroadcasti32x4 0xf0($TW), $t0 + vbroadcasti32x4 0x70($key1), $t0 vaesdec $t0, $st1, $st1 vaesdec $t0, $st2, $st2 # round 8 - vbroadcasti32x4 0x100($TW), $t0 + vbroadcasti32x4 0x80($key1), $t0 vaesdec $t0, $st1, $st1 vaesdec $t0, $st2, $st2 # round 9 - vbroadcasti32x4 0x110($TW), $t0 + vbroadcasti32x4 0x90($key1), $t0 vaesdec $t0, $st1, $st1 vaesdec $t0, $st2, $st2 # round 10 - vbroadcasti32x4 0x120($TW), $t0 + vbroadcasti32x4 0xa0($key1), $t0 vaesdec $t0, $st1, $st1 vaesdec $t0, $st2, $st2 # round 11 - vbroadcasti32x4 0x130($TW), $t0 + vbroadcasti32x4 0xb0($key1), $t0 vaesdec $t0, $st1, $st1 vaesdec $t0, $st2, $st2 # round 12 - vbroadcasti32x4 0x140($TW), $t0 + vbroadcasti32x4 0xc0($key1), $t0 vaesdec $t0, $st1, $st1 vaesdec $t0, $st2, $st2 # round 13 - vbroadcasti32x4 0x150($TW), $t0 + vbroadcasti32x4 0xd0($key1), $t0 vaesdec $t0, $st1, $st1 vaesdec $t0, $st2, $st2 # round 14 - vbroadcasti32x4 0x160($TW), $t0 - vaesdeclast $t0, $st1, $st1 - vaesdeclast $t0, $st2, $st2 - + vbroadcasti32x4 0xe0($key1), $t0 + vaesdeclast $t0, $st1, $st1 + vaesdeclast $t0, $st2, $st2 + vpxorq $tw1, $st1, $st1 + vpxorq $tw2, $st2, $st2 +___ # xor Tweak values - vpxorq $tw1, $st1, $st1 - vpxorq $tw2, $st2, $st2 - - # load next Tweak values - vmovdqa32 %zmm15, $tw1 - vmovdqa32 %zmm16, $tw2 + if (0 == $last_eight) { + # load next Tweak values + $code .= <<___; + vmovdqa32 %zmm15, $tw1 + vmovdqa32 %zmm16, $tw2 ___ + } } - # Encrypt 16 blocks in parallel - # generate next 16 tweak values - sub encrypt_by_16_zmm { + # Encrypt 16 blocks in parallel and generate next 16 tweak values. 
+ sub encrypt_by_16 { my @st; $st[0] = $_[0]; $st[1] = $_[1]; @@ -1151,7 +448,7 @@ } # ARK - $code .= "vbroadcasti32x4 0x80($TW), $t0\n"; + $code .= "vbroadcasti32x4 ($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vpxorq $t0, $st[$i], $st[$i]\n"; } @@ -1166,19 +463,19 @@ } # round 1 - $code .= "vbroadcasti32x4 0x90($TW), $t0\n"; + $code .= "vbroadcasti32x4 0x10($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; } # round 2 - $code .= "vbroadcasti32x4 0xa0($TW), $t0\n"; + $code .= "vbroadcasti32x4 0x20($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; } # round 3 - $code .= "vbroadcasti32x4 0xb0($TW), $t0\n"; + $code .= "vbroadcasti32x4 0x30($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; } @@ -1192,19 +489,19 @@ ___ } # round 4 - $code .= "vbroadcasti32x4 0xc0($TW), $t0\n"; + $code .= "vbroadcasti32x4 0x40($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; } # round 5 - $code .= "vbroadcasti32x4 0xd0($TW), $t0\n"; + $code .= "vbroadcasti32x4 0x50($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; } # round 6 - $code .= "vbroadcasti32x4 0xe0($TW), $t0\n"; + $code .= "vbroadcasti32x4 0x60($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; } @@ -1218,19 +515,19 @@ ___ } # round 7 - $code .= "vbroadcasti32x4 0xf0($TW), $t0\n"; + $code .= "vbroadcasti32x4 0x70($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; } # round 8 - $code .= "vbroadcasti32x4 0x100($TW), $t0\n"; + $code .= "vbroadcasti32x4 0x80($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; } # round 9 - $code .= "vbroadcasti32x4 0x110($TW), $t0\n"; + $code .= "vbroadcasti32x4 0x90($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; } @@ -1244,36 +541,31 @@ ___ } # round 10 - $code .= "vbroadcasti32x4 0x120($TW), $t0\n"; + $code .= "vbroadcasti32x4 0xa0($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; } - # round 11 - $code .= "vbroadcasti32x4 0x130($TW), $t0\n"; + $code .= "vbroadcasti32x4 0xb0($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; } - # round 12 - $code .= "vbroadcasti32x4 0x140($TW), $t0\n"; + $code .= "vbroadcasti32x4 0xc0($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; } - # round 13 - $code .= "vbroadcasti32x4 0x150($TW), $t0\n"; + $code .= "vbroadcasti32x4 0xd0($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; } - # round 14 - $code .= "vbroadcasti32x4 0x160($TW), $t0\n"; + $code .= "vbroadcasti32x4 0xe0($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesenclast $t0, $st[$i], $st[$i]\n"; } - # xor Tweak values for (my $i = 0; $i < 4; $i++) { $code .= "vpxorq $tw[$i], $st[$i], $st[$i]\n"; @@ -1288,9 +580,8 @@ ___ } - # Decrypt 16 blocks in parallel - # generate next 8 tweak values - sub decrypt_by_16_zmm { + # Decrypt 16 blocks in parallel and generate next 16 tweak values. 
+ sub decrypt_by_16 { my @st; $st[0] = $_[0]; $st[1] = $_[1]; @@ -1312,7 +603,7 @@ } # ARK - $code .= "vbroadcasti32x4 0x80($TW), $t0\n"; + $code .= "vbroadcasti32x4 ($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vpxorq $t0, $st[$i], $st[$i]\n"; } @@ -1327,19 +618,19 @@ } # round 1 - $code .= "vbroadcasti32x4 0x90($TW), $t0\n"; + $code .= "vbroadcasti32x4 0x10($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; } # round 2 - $code .= "vbroadcasti32x4 0xa0($TW), $t0\n"; + $code .= "vbroadcasti32x4 0x20($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; } # round 3 - $code .= "vbroadcasti32x4 0xb0($TW), $t0\n"; + $code .= "vbroadcasti32x4 0x30($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; } @@ -1353,19 +644,19 @@ ___ } # round 4 - $code .= "vbroadcasti32x4 0xc0($TW), $t0\n"; + $code .= "vbroadcasti32x4 0x40($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; } # round 5 - $code .= "vbroadcasti32x4 0xd0($TW), $t0\n"; + $code .= "vbroadcasti32x4 0x50($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; } # round 6 - $code .= "vbroadcasti32x4 0xe0($TW), $t0\n"; + $code .= "vbroadcasti32x4 0x60($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; } @@ -1379,19 +670,19 @@ ___ } # round 7 - $code .= "vbroadcasti32x4 0xf0($TW), $t0\n"; + $code .= "vbroadcasti32x4 0x70($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; } # round 8 - $code .= "vbroadcasti32x4 0x100($TW), $t0\n"; + $code .= "vbroadcasti32x4 0x80($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; } # round 9 - $code .= "vbroadcasti32x4 0x110($TW), $t0\n"; + $code .= "vbroadcasti32x4 0x90($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; } @@ -1405,31 +696,27 @@ ___ } # round 10 - $code .= "vbroadcasti32x4 0x120($TW), $t0\n"; + $code .= "vbroadcasti32x4 0xa0($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; } - # round 11 - $code .= "vbroadcasti32x4 0x130($TW), $t0\n"; + $code .= "vbroadcasti32x4 0xb0($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; } - # round 12 - $code .= "vbroadcasti32x4 0x140($TW), $t0\n"; + $code .= "vbroadcasti32x4 0xc0($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; } - # round 13 - $code .= "vbroadcasti32x4 0x150($TW), $t0\n"; + $code .= "vbroadcasti32x4 0xd0($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; } - # round 14 - $code .= "vbroadcasti32x4 0x160($TW), $t0\n"; + $code .= "vbroadcasti32x4 0xe0($key1), $t0\n"; for (my $i = 0; $i < 4; $i++) { $code .= "vaesdeclast $t0, $st[$i], $st[$i]\n"; } @@ -1442,1649 +729,1442 @@ $code .= <<___; # load next Tweak values vmovdqa32 %zmm15, $tw[0] - vmovdqa32 %zmm16, $tw[1] - vmovdqa32 %zmm17, $tw[2] - vmovdqa32 %zmm18, $tw[3] -___ - } - - # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - # ;void aes_hw_xts_encrypt_avx512( - # ; const uint8_t *in, // input data - # ; uint8_t *out, // output data - # ; size_t length, // sector size, in bytes - # ; const AES_KEY *key1, // key used for "ECB" encryption, 16*2 bytes - # ; const AES_KEY *key2, // key used for tweaking, 16*2 bytes - # ; const uint8_t iv[16]) // 
initial tweak value, 16 bytes - # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - my $rndsuffix = &random_string(); - - $code .= <<___; -#ifndef MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX -.text -___ - - { - $code.=<<___; - .globl aes_hw_xts_encrypt_avx512 - .hidden aes_hw_xts_encrypt_avx512 - .type aes_hw_xts_encrypt_avx512,\@abi-omnipotent - .align 32 - aes_hw_xts_encrypt_avx512: - .cfi_startproc - endbranch -___ - } - $code .= "push %rbp\n"; - $code .= "mov %rsp,%rbp\n"; - $code .= "sub \$$VARIABLE_OFFSET,%rsp\n"; - $code .= "and \$0xffffffffffffffc0,%rsp\n"; - $code .= "mov %rbx,$GP_STORAGE($TW)\n"; - - if ($win64) { - $code .= "mov %rdi,$GP_STORAGE + 8*1($TW)\n"; - $code .= "mov %rsi,$GP_STORAGE + 8*2($TW)\n"; - $code .= "vmovdqa %xmm6, $XMM_STORAGE + 16*0($TW)\n"; - $code .= "vmovdqa %xmm7, $XMM_STORAGE + 16*1($TW)\n"; - $code .= "vmovdqa %xmm8, $XMM_STORAGE + 16*2($TW)\n"; - $code .= "vmovdqa %xmm9, $XMM_STORAGE + 16*3($TW)\n"; - $code .= "vmovdqa %xmm10, $XMM_STORAGE + 16*4($TW)\n"; - $code .= "vmovdqa %xmm11, $XMM_STORAGE + 16*5($TW)\n"; - $code .= "vmovdqa %xmm12, $XMM_STORAGE + 16*6($TW)\n"; - $code .= "vmovdqa %xmm13, $XMM_STORAGE + 16*7($TW)\n"; - $code .= "vmovdqa %xmm14, $XMM_STORAGE + 16*8($TW)\n"; - $code .= "vmovdqa %xmm15, $XMM_STORAGE + 16*9($TW)\n"; - } - - $code .= "mov \$0x87, $gf_poly_8b\n"; - $code .= "vmovdqu ($tweak),%xmm1\n"; # read initial tweak values - $code .= "vpxor %xmm4,%xmm4,%xmm4\n"; # for key expansion - - encrypt_tweak_for_encryption("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", - $key2, $key1, $TW); - - if ($win64) { - $code .= "mov $input, 8 + 8*5(%rbp)\n"; # ciphertext pointer - $code .= "mov $output, 8 + 8*6(%rbp)\n"; # plaintext pointer - } - - { - $code.=<<___; - - cmp \$0x80,$length - jl .L_less_than_128_bytes_${rndsuffix} - vpbroadcastq $gf_poly_8b,$ZPOLY - cmp \$0x100,$length - jge .L_start_by16_${rndsuffix} - cmp \$0x80,$length - jge .L_start_by8_${rndsuffix} - - .L_do_n_blocks_${rndsuffix}: - cmp \$0x0,$length - je .L_ret_${rndsuffix} - cmp \$0x70,$length - jge .L_remaining_num_blocks_is_7_${rndsuffix} - cmp \$0x60,$length - jge .L_remaining_num_blocks_is_6_${rndsuffix} - cmp \$0x50,$length - jge .L_remaining_num_blocks_is_5_${rndsuffix} - cmp \$0x40,$length - jge .L_remaining_num_blocks_is_4_${rndsuffix} - cmp \$0x30,$length - jge .L_remaining_num_blocks_is_3_${rndsuffix} - cmp \$0x20,$length - jge .L_remaining_num_blocks_is_2_${rndsuffix} - cmp \$0x10,$length - jge .L_remaining_num_blocks_is_1_${rndsuffix} - vmovdqa %xmm0,%xmm8 - vmovdqa %xmm9,%xmm0 - jmp .L_steal_cipher_${rndsuffix} - - .L_remaining_num_blocks_is_7_${rndsuffix}: - mov \$0xffffffffffffffff,$tmp1 - shr \$0x10,$tmp1 - kmovq $tmp1,%k1 - vmovdqu8 ($input),%zmm1 - vmovdqu8 0x40($input),%zmm2{%k1} - add \$0x70,$input -___ - } - - encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1); - - { - $code .= <<___; - vmovdqu8 %zmm1,($output) - vmovdqu8 %zmm2,0x40($output){%k1} - add \$0x70,$output - vextracti32x4 \$0x2,%zmm2,%xmm8 - vextracti32x4 \$0x3,%zmm10,%xmm0 - and \$0xf,$length - je .L_ret_${rndsuffix} - jmp .L_steal_cipher_${rndsuffix} - - .L_remaining_num_blocks_is_6_${rndsuffix}: - vmovdqu8 ($input),%zmm1 - vmovdqu8 0x40($input),%ymm2 - add \$0x60,$input -___ - } - - encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1); - - { - $code .= <<___; - vmovdqu8 %zmm1,($output) - vmovdqu8 %ymm2,0x40($output) - add \$0x60,$output - vextracti32x4 \$0x1,%zmm2,%xmm8 - vextracti32x4 \$0x2,%zmm10,%xmm0 - and \$0xf,$length - je 
.L_ret_${rndsuffix} - jmp .L_steal_cipher_${rndsuffix} - - .L_remaining_num_blocks_is_5_${rndsuffix}: - vmovdqu8 ($input),%zmm1 - vmovdqu 0x40($input),%xmm2 - add \$0x50,$input -___ - } - - encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1); - - { - $code .= <<___; - vmovdqu8 %zmm1,($output) - vmovdqu %xmm2,0x40($output) - add \$0x50,$output - vmovdqa %xmm2,%xmm8 - vextracti32x4 \$0x1,%zmm10,%xmm0 - and \$0xf,$length - je .L_ret_${rndsuffix} - jmp .L_steal_cipher_${rndsuffix} - - .L_remaining_num_blocks_is_4_${rndsuffix}: - vmovdqu8 ($input),%zmm1 - add \$0x40,$input -___ - } - - encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1); - - { - $code .= <<___; - vmovdqu8 %zmm1,($output) - add \$0x40,$output - vextracti32x4 \$0x3,%zmm1,%xmm8 - vextracti32x4 \$0x0,%zmm10,%xmm0 - and \$0xf,$length - je .L_ret_${rndsuffix} - jmp .L_steal_cipher_${rndsuffix} -___ - } - - { - $code .= <<___; - .L_remaining_num_blocks_is_3_${rndsuffix}: - vextracti32x4 \$0x1,%zmm9,%xmm10 - vextracti32x4 \$0x2,%zmm9,%xmm11 - vmovdqu ($input),%xmm1 - vmovdqu 0x10($input),%xmm2 - vmovdqu 0x20($input),%xmm3 - add \$0x30,$input -___ - } - - encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1); - - { - $code .= <<___; - vmovdqu %xmm1,($output) - vmovdqu %xmm2,0x10($output) - vmovdqu %xmm3,0x20($output) - add \$0x30,$output - vmovdqa %xmm3,%xmm8 - vextracti32x4 \$0x3,%zmm9,%xmm0 - and \$0xf,$length - je .L_ret_${rndsuffix} - jmp .L_steal_cipher_${rndsuffix} -___ - } - - { - $code .= <<___; - .L_remaining_num_blocks_is_2_${rndsuffix}: - vextracti32x4 \$0x1,%zmm9,%xmm10 - vmovdqu ($input),%xmm1 - vmovdqu 0x10($input),%xmm2 - add \$0x20,$input -___ - } - - encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1); - - { - $code .= <<___; - vmovdqu %xmm1,($output) - vmovdqu %xmm2,0x10($output) - add \$0x20,$output - vmovdqa %xmm2,%xmm8 - vextracti32x4 \$0x2,%zmm9,%xmm0 - and \$0xf,$length - je .L_ret_${rndsuffix} - jmp .L_steal_cipher_${rndsuffix} -___ - } - - { - $code .= <<___; - .L_remaining_num_blocks_is_1_${rndsuffix}: - vmovdqu ($input),%xmm1 - add \$0x10,$input -___ - } - - encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1); - { - $code .= <<___; - vmovdqu %xmm1,($output) - add \$0x10,$output - vmovdqa %xmm1,%xmm8 - vextracti32x4 \$0x1,%zmm9,%xmm0 - and \$0xf,$length - je .L_ret_${rndsuffix} - jmp .L_steal_cipher_${rndsuffix} - - .L_start_by16_${rndsuffix}: - vbroadcasti32x4 (%rsp),%zmm0 - vbroadcasti32x4 shufb_15_7(%rip),%zmm8 - mov \$0xaa,$tmp1 - kmovq $tmp1,%k2 - vpshufb %zmm8,%zmm0,%zmm1 - vpsllvq const_dq3210(%rip),%zmm0,%zmm4 - vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 - vpclmulqdq \$0x0,%zmm25,%zmm2,%zmm3 - vpxorq %zmm2,%zmm4,%zmm4{%k2} - vpxord %zmm4,%zmm3,%zmm9 - vpsllvq const_dq7654(%rip),%zmm0,%zmm5 - vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 - vpclmulqdq \$0x0,%zmm25,%zmm6,%zmm7 - vpxorq %zmm6,%zmm5,%zmm5{%k2} - vpxord %zmm5,%zmm7,%zmm10 - vpsrldq \$0xf,%zmm9,%zmm13 - vpclmulqdq \$0x0,%zmm25,%zmm13,%zmm14 - vpslldq \$0x1,%zmm9,%zmm11 - vpxord %zmm14,%zmm11,%zmm11 - vpsrldq \$0xf,%zmm10,%zmm15 - vpclmulqdq \$0x0,%zmm25,%zmm15,%zmm16 - vpslldq \$0x1,%zmm10,%zmm12 - vpxord %zmm16,%zmm12,%zmm12 - - 
.L_main_loop_run_16_${rndsuffix}: - vmovdqu8 ($input),%zmm1 - vmovdqu8 0x40($input),%zmm2 - vmovdqu8 0x80($input),%zmm3 - vmovdqu8 0xc0($input),%zmm4 - add \$0x100,$input -___ - } - - encrypt_by_16_zmm("%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm9", - "%zmm10", "%zmm11", "%zmm12", "%zmm0", 0); - - { - $code .= <<___; - vmovdqu8 %zmm1,($output) - vmovdqu8 %zmm2,0x40($output) - vmovdqu8 %zmm3,0x80($output) - vmovdqu8 %zmm4,0xc0($output) - add \$0x100,$output - sub \$0x100,$length - cmp \$0x100,$length - jge .L_main_loop_run_16_${rndsuffix} - cmp \$0x80,$length - jge .L_main_loop_run_8_${rndsuffix} - vextracti32x4 \$0x3,%zmm4,%xmm0 - jmp .L_do_n_blocks_${rndsuffix} - - .L_start_by8_${rndsuffix}: - vbroadcasti32x4 (%rsp),%zmm0 - vbroadcasti32x4 shufb_15_7(%rip),%zmm8 - mov \$0xaa,$tmp1 - kmovq $tmp1,%k2 - vpshufb %zmm8,%zmm0,%zmm1 - vpsllvq const_dq3210(%rip),%zmm0,%zmm4 - vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 - vpclmulqdq \$0x0,%zmm25,%zmm2,%zmm3 - vpxorq %zmm2,%zmm4,%zmm4{%k2} - vpxord %zmm4,%zmm3,%zmm9 - vpsllvq const_dq7654(%rip),%zmm0,%zmm5 - vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 - vpclmulqdq \$0x0,%zmm25,%zmm6,%zmm7 - vpxorq %zmm6,%zmm5,%zmm5{%k2} - vpxord %zmm5,%zmm7,%zmm10 - - .L_main_loop_run_8_${rndsuffix}: - vmovdqu8 ($input),%zmm1 - vmovdqu8 0x40($input),%zmm2 - add \$0x80,$input -___ - } - - encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 0); - - { - $code .= <<___; - vmovdqu8 %zmm1,($output) - vmovdqu8 %zmm2,0x40($output) - add \$0x80,$output - sub \$0x80,$length - cmp \$0x80,$length - jge .L_main_loop_run_8_${rndsuffix} - vextracti32x4 \$0x3,%zmm2,%xmm0 - jmp .L_do_n_blocks_${rndsuffix} - - .L_steal_cipher_next_${rndsuffix}: - xor $gf_poly_8b_temp,$gf_poly_8b_temp - shl \$1, $TWTEMPL - adc $TWTEMPH,$TWTEMPH - cmovc $gf_poly_8b,$gf_poly_8b_temp - xor $gf_poly_8b_temp,$TWTEMPL - mov $TWTEMPL,($TW) - mov $TWTEMPH,0x8($TW) - vmovdqa ($TW),%xmm0 - - .L_steal_cipher_${rndsuffix}: - vmovdqa %xmm8,%xmm2 - lea vpshufb_shf_table(%rip),$TWTEMPL - vmovdqu ($TWTEMPL,$length,1),%xmm10 - vpshufb %xmm10,%xmm8,%xmm8 - vmovdqu -0x10($input,$length,1),%xmm3 - vmovdqu %xmm8,-0x10($output,$length,1) - lea vpshufb_shf_table(%rip),$TWTEMPL - add \$16, $TWTEMPL - sub $length,$TWTEMPL - vmovdqu ($TWTEMPL),%xmm10 - vpxor mask1(%rip),%xmm10,%xmm10 - vpshufb %xmm10,%xmm3,%xmm3 - vpblendvb %xmm10,%xmm2,%xmm3,%xmm3 - vpxor %xmm0,%xmm3,%xmm8 - vpxor 0x80(%rsp),%xmm8,%xmm8 - vaesenc 0x90(%rsp),%xmm8,%xmm8 - vaesenc 0xa0(%rsp),%xmm8,%xmm8 - vaesenc 0xb0(%rsp),%xmm8,%xmm8 - vaesenc 0xc0(%rsp),%xmm8,%xmm8 - vaesenc 0xd0(%rsp),%xmm8,%xmm8 - vaesenc 0xe0(%rsp),%xmm8,%xmm8 - vaesenc 0xf0(%rsp),%xmm8,%xmm8 - vaesenc 0x100(%rsp),%xmm8,%xmm8 - vaesenc 0x110(%rsp),%xmm8,%xmm8 - vaesenc 0x120(%rsp),%xmm8,%xmm8 - vaesenc 0x130(%rsp),%xmm8,%xmm8 - vaesenc 0x140(%rsp),%xmm8,%xmm8 - vaesenc 0x150(%rsp),%xmm8,%xmm8 - vaesenclast 0x160(%rsp),%xmm8,%xmm8 - vpxor %xmm0,%xmm8,%xmm8 - vmovdqu %xmm8,-0x10($output) -___ - } - - { - $code .= <<___; - .L_ret_${rndsuffix}: - mov $GP_STORAGE($TW),%rbx - xor $tmp1,$tmp1 - mov $tmp1,$GP_STORAGE($TW) - # Zero-out the whole of `%zmm0`. 
- vpxorq %zmm0,%zmm0,%zmm0 -___ - } - - if ($win64) { - $code .= <<___; - mov $GP_STORAGE + 8*1($TW),%rdi - mov $tmp1,$GP_STORAGE + 8*1($TW) - mov $GP_STORAGE + 8*2($TW),%rsi - mov $tmp1,$GP_STORAGE + 8*2($TW) - - vmovdqa $XMM_STORAGE + 16 * 0($TW), %xmm6 - vmovdqa $XMM_STORAGE + 16 * 1($TW), %xmm7 - vmovdqa $XMM_STORAGE + 16 * 2($TW), %xmm8 - vmovdqa $XMM_STORAGE + 16 * 3($TW), %xmm9 - - # Zero the 64 bytes we just restored to the xmm registers. - vmovdqa64 %zmm0,$XMM_STORAGE($TW) - - vmovdqa $XMM_STORAGE + 16 * 4($TW), %xmm10 - vmovdqa $XMM_STORAGE + 16 * 5($TW), %xmm11 - vmovdqa $XMM_STORAGE + 16 * 6($TW), %xmm12 - vmovdqa $XMM_STORAGE + 16 * 7($TW), %xmm13 - - # And again. - vmovdqa64 %zmm0,$XMM_STORAGE + 16 * 4($TW) - - vmovdqa $XMM_STORAGE + 16 * 8($TW), %xmm14 - vmovdqa $XMM_STORAGE + 16 * 9($TW), %xmm15 - - # Last round is only 32 bytes (256-bits), so we use `%ymm` as the - # source operand. - vmovdqa %ymm0,$XMM_STORAGE + 16 * 8($TW) -___ - } - - { - $code .= <<___; - # Zero-out the stack frames used for `key1`, 64 bytes at a time. - vmovdqa64 %zmm0,0x80(%rsp) - vmovdqa64 %zmm0,0xc0(%rsp) - vmovdqa64 %zmm0,0x100(%rsp) - - # Stack usage is not divisible by 64, so we use a kmask register to - # only mov 48 of the bytes (6 quad-words). - mov \$0x3f,$tmp1 - kmovq $tmp1,%k2 - vmovdqa64 %zmm0,0x140(%rsp){%k2} - - mov %rbp,%rsp - pop %rbp - vzeroupper - ret - - .L_less_than_128_bytes_${rndsuffix}: - cmp \$0x10,$length - jb .L_ret_${rndsuffix} - mov $length,$tmp1 - and \$0x70,$tmp1 - cmp \$0x60,$tmp1 - je .L_num_blocks_is_6_${rndsuffix} - cmp \$0x50,$tmp1 - je .L_num_blocks_is_5_${rndsuffix} - cmp \$0x40,$tmp1 - je .L_num_blocks_is_4_${rndsuffix} - cmp \$0x30,$tmp1 - je .L_num_blocks_is_3_${rndsuffix} - cmp \$0x20,$tmp1 - je .L_num_blocks_is_2_${rndsuffix} - cmp \$0x10,$tmp1 - je .L_num_blocks_is_1_${rndsuffix} -___ - } - - $code .= "\n.L_num_blocks_is_7_${rndsuffix}:\n"; - initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", 7); - - $code .= "add \$0x70,$input\n"; - - encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 7, 1); - - { - $code .= <<___; - vmovdqu %xmm1,($output) - vmovdqu %xmm2,0x10($output) - vmovdqu %xmm3,0x20($output) - vmovdqu %xmm4,0x30($output) - vmovdqu %xmm5,0x40($output) - vmovdqu %xmm6,0x50($output) - vmovdqu %xmm7,0x60($output) - add \$0x70,$output - vmovdqa %xmm7,%xmm8 - and \$0xf,$length - je .L_ret_${rndsuffix} - jmp .L_steal_cipher_next_${rndsuffix} -___ - } - - $code .= "\n.L_num_blocks_is_6_${rndsuffix}:\n"; - initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", 6); - - $code .= "add \$0x60,$input\n"; - - encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 6, 1); - - { - $code .= <<___; - vmovdqu %xmm1,($output) - vmovdqu %xmm2,0x10($output) - vmovdqu %xmm3,0x20($output) - vmovdqu %xmm4,0x30($output) - vmovdqu %xmm5,0x40($output) - vmovdqu %xmm6,0x50($output) - add \$0x60,$output - vmovdqa %xmm6,%xmm8 - and \$0xf,$length - je .L_ret_${rndsuffix} - jmp .L_steal_cipher_next_${rndsuffix} -___ - } - - $code .= "\n.L_num_blocks_is_5_${rndsuffix}:\n"; - initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", 
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", 5); - - $code .= "add \$0x50,$input\n"; - - encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 5, 1); - - { - $code .= <<___; - vmovdqu %xmm1,($output) - vmovdqu %xmm2,0x10($output) - vmovdqu %xmm3,0x20($output) - vmovdqu %xmm4,0x30($output) - vmovdqu %xmm5,0x40($output) - add \$0x50,$output - vmovdqa %xmm5,%xmm8 - and \$0xf,$length - je .L_ret_${rndsuffix} - jmp .L_steal_cipher_next_${rndsuffix} + vmovdqa32 %zmm16, $tw[1] + vmovdqa32 %zmm17, $tw[2] + vmovdqa32 %zmm18, $tw[3] ___ } - $code .= "\n.L_num_blocks_is_4_${rndsuffix}:\n"; - - initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", 4); + $code .= "#ifndef MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX\n"; + $code .= ".text\n"; - $code .= "add \$0x40, $input\n"; - - encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 4, 1); + # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + # ;void aes_hw_xts_encrypt_avx512( + # ; const uint8_t *in, // input data + # ; uint8_t *out, // output data + # ; size_t length, // sector size, in bytes + # ; const AES_KEY *key1, // key used for "ECB" encryption + # ; const AES_KEY *key2, // key used for tweaking + # ; const uint8_t iv[16]) // initial tweak value, 16 bytes + # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + sub enc { + my $rndsuffix = &random_string(); - { - $code .= <<___; - vmovdqu %xmm1,($output) - vmovdqu %xmm2,0x10($output) - vmovdqu %xmm3,0x20($output) - vmovdqu %xmm4,0x30($output) - add \$0x40,$output - vmovdqa %xmm4,%xmm8 - and \$0xf,$length - je .L_ret_${rndsuffix} - jmp .L_steal_cipher_next_${rndsuffix} + $code.=<<___; + .globl aes_hw_xts_encrypt_avx512 + .hidden aes_hw_xts_encrypt_avx512 + .type aes_hw_xts_encrypt_avx512,\@function,6 + .align 32 + aes_hw_xts_encrypt_avx512: + .cfi_startproc + endbranch ___ - } + $code .= "push %rbp\n"; + $code .= "mov $TW,%rbp\n"; + $code .= "sub \$$VARIABLE_OFFSET,$TW\n"; + $code .= "and \$0xffffffffffffffc0,$TW\n"; + $code .= "mov %rbx,$GP_STORAGE($TW)\n"; - $code .= "\n.L_num_blocks_is_3_${rndsuffix}:\n"; + if ($win64) { + $code .= "mov %rdi,$GP_STORAGE + 8*1($TW)\n"; + $code .= "mov %rsi,$GP_STORAGE + 8*2($TW)\n"; + $code .= "vmovdqa %xmm6, $XMM_STORAGE + 16*0($TW)\n"; + $code .= "vmovdqa %xmm7, $XMM_STORAGE + 16*1($TW)\n"; + $code .= "vmovdqa %xmm8, $XMM_STORAGE + 16*2($TW)\n"; + $code .= "vmovdqa %xmm9, $XMM_STORAGE + 16*3($TW)\n"; + $code .= "vmovdqa %xmm10, $XMM_STORAGE + 16*4($TW)\n"; + $code .= "vmovdqa %xmm11, $XMM_STORAGE + 16*5($TW)\n"; + $code .= "vmovdqa %xmm12, $XMM_STORAGE + 16*6($TW)\n"; + $code .= "vmovdqa %xmm13, $XMM_STORAGE + 16*7($TW)\n"; + $code .= "vmovdqa %xmm14, $XMM_STORAGE + 16*8($TW)\n"; + $code .= "vmovdqa %xmm15, $XMM_STORAGE + 16*9($TW)\n"; + } - initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", 3); + $code .= "mov \$0x87, $gf_poly_8b\n"; + $code .= "vmovdqu ($tweak),%xmm1\n"; # read initial tweak values - $code .= "add \$0x30,$input\n"; + encrypt_tweak("%xmm1"); - encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", 
"%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1); + if ($win64) { + $code .= "mov $input, 8 + 8*5(%rbp)\n"; # ciphertext pointer + $code .= "mov $output, 8 + 8*6(%rbp)\n"; # plaintext pointer + } - { - $code .= <<___; - vmovdqu %xmm1,($output) - vmovdqu %xmm2,0x10($output) - vmovdqu %xmm3,0x20($output) - add \$0x30,$output - vmovdqa %xmm3,%xmm8 - and \$0xf,$length - je .L_ret_${rndsuffix} - jmp .L_steal_cipher_next_${rndsuffix} -___ - } + { + $code.=<<___; - $code .= "\n.L_num_blocks_is_2_${rndsuffix}:\n"; + cmp \$0x80,$length + jl .L_less_than_128_bytes_${rndsuffix} + vpbroadcastq $gf_poly_8b,$ZPOLY + cmp \$0x100,$length + jge .L_start_by16_${rndsuffix} + cmp \$0x80,$length + jge .L_start_by8_${rndsuffix} + + .L_do_n_blocks_${rndsuffix}: + cmp \$0x0,$length + je .L_ret_${rndsuffix} + cmp \$0x70,$length + jge .L_remaining_num_blocks_is_7_${rndsuffix} + cmp \$0x60,$length + jge .L_remaining_num_blocks_is_6_${rndsuffix} + cmp \$0x50,$length + jge .L_remaining_num_blocks_is_5_${rndsuffix} + cmp \$0x40,$length + jge .L_remaining_num_blocks_is_4_${rndsuffix} + cmp \$0x30,$length + jge .L_remaining_num_blocks_is_3_${rndsuffix} + cmp \$0x20,$length + jge .L_remaining_num_blocks_is_2_${rndsuffix} + cmp \$0x10,$length + jge .L_remaining_num_blocks_is_1_${rndsuffix} + vmovdqa %xmm0,%xmm8 + vmovdqa %xmm9,%xmm0 + jmp .L_steal_cipher_${rndsuffix} + + .L_remaining_num_blocks_is_7_${rndsuffix}: + mov \$0x0000ffffffffffff,$tmp1 + kmovq $tmp1,%k1 + vmovdqu8 ($input),%zmm1 + vmovdqu8 0x40($input),%zmm2{%k1} + add \$0x70,$input +___ + } + + encrypt_by_eight("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1); + + { + $code .= <<___; + vmovdqu8 %zmm1,($output) + vmovdqu8 %zmm2,0x40($output){%k1} + add \$0x70,$output + vextracti32x4 \$0x2,%zmm2,%xmm8 + vextracti32x4 \$0x3,%zmm10,%xmm0 + and \$0xf,$length + je .L_ret_${rndsuffix} + jmp .L_steal_cipher_${rndsuffix} - initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", 2); + .L_remaining_num_blocks_is_6_${rndsuffix}: + vmovdqu8 ($input),%zmm1 + vmovdqu8 0x40($input),%ymm2 + add \$0x60,$input +___ + } - $code .= "add \$0x20,$input\n"; + encrypt_by_eight("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1); - encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1); + { + $code .= <<___; + vmovdqu8 %zmm1,($output) + vmovdqu8 %ymm2,0x40($output) + add \$0x60,$output + vextracti32x4 \$0x1,%zmm2,%xmm8 + vextracti32x4 \$0x2,%zmm10,%xmm0 + and \$0xf,$length + je .L_ret_${rndsuffix} + jmp .L_steal_cipher_${rndsuffix} - { - $code .= <<___; - vmovdqu %xmm1,($output) - vmovdqu %xmm2,0x10($output) - add \$0x20,$output - vmovdqa %xmm2,%xmm8 - and \$0xf,$length - je .L_ret_${rndsuffix} - jmp .L_steal_cipher_next_${rndsuffix} + .L_remaining_num_blocks_is_5_${rndsuffix}: + vmovdqu8 ($input),%zmm1 + vmovdqu 0x40($input),%xmm2 + add \$0x50,$input ___ - } + } - $code .= "\n.L_num_blocks_is_1_${rndsuffix}:\n"; + encrypt_by_eight("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1); - initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", 1); + { + $code .= <<___; + vmovdqu8 %zmm1,($output) + vmovdqu %xmm2,0x40($output) + add \$0x50,$output + vmovdqa %xmm2,%xmm8 + vextracti32x4 \$0x1,%zmm10,%xmm0 + and \$0xf,$length + je .L_ret_${rndsuffix} + jmp 
.L_steal_cipher_${rndsuffix} - $code .= "add \$0x10,$input\n"; + .L_remaining_num_blocks_is_4_${rndsuffix}: + vmovdqu8 ($input),%zmm1 + add \$0x40,$input +___ + } - encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1); + encrypt_by_four("%zmm1", "%zmm9", "%zmm0"); - { - $code .= <<___; - vmovdqu %xmm1,($output) - add \$0x10,$output - vmovdqa %xmm1,%xmm8 - and \$0xf,$length - je .L_ret_${rndsuffix} - jmp .L_steal_cipher_next_${rndsuffix} - .cfi_endproc + { + $code .= <<___; + vmovdqu8 %zmm1,($output) + add \$0x40,$output + vextracti32x4 \$0x3,%zmm1,%xmm8 + vmovdqa64 %xmm10, %xmm0 + and \$0xf,$length + je .L_ret_${rndsuffix} + jmp .L_steal_cipher_${rndsuffix} ___ - } + } - # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - # ;void aes_hw_xts_decrypt_avx512( - # ; const uint8_t *in, // input data - # ; uint8_t *out, // output data - # ; size_t length, // sector size, in bytes - # ; const AES_KEY *key1, // key used for "ECB" encryption, 16*2 bytes - # ; const AES_KEY *key2, // key used for tweaking, 16*2 bytes - # ; const uint8_t iv[16]) // initial tweak value, 16 bytes - # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + { + $code .= <<___; + .L_remaining_num_blocks_is_3_${rndsuffix}: + mov \$-1, $tmp1 + shr \$0x10, $tmp1 + kmovq $tmp1, %k1 + vmovdqu8 ($input), %zmm1{%k1} + add \$0x30, $input +___ + } - my $rndsuffix = &random_string(); + encrypt_by_four("%zmm1", "%zmm9", "%zmm0"); - { - $code.=<<___; - .globl aes_hw_xts_decrypt_avx512 - .hidden aes_hw_xts_decrypt_avx512 - .type aes_hw_xts_decrypt_avx512,\@abi-omnipotent - .align 32 - aes_hw_xts_decrypt_avx512: - .cfi_startproc - endbranch + { + $code .= <<___; + vmovdqu8 %zmm1, ($output){%k1} + add \$0x30, $output + vextracti32x4 \$0x2, %zmm1, %xmm8 + vextracti32x4 \$0x3, %zmm9, %xmm0 + and \$0xf, $length + je .L_ret_${rndsuffix} + jmp .L_steal_cipher_${rndsuffix} ___ - } - $code .= "push %rbp\n"; - $code .= "mov %rsp,%rbp\n"; - $code .= "sub \$$VARIABLE_OFFSET,%rsp\n"; - $code .= "and \$0xffffffffffffffc0,%rsp\n"; - $code .= "mov %rbx,$GP_STORAGE($TW)\n"; - - if ($win64) { - $code .= "mov %rdi,$GP_STORAGE + 8*1($TW)\n"; - $code .= "mov %rsi,$GP_STORAGE + 8*2($TW)\n"; - $code .= "vmovdqa %xmm6, $XMM_STORAGE + 16*0($TW)\n"; - $code .= "vmovdqa %xmm7, $XMM_STORAGE + 16*1($TW)\n"; - $code .= "vmovdqa %xmm8, $XMM_STORAGE + 16*2($TW)\n"; - $code .= "vmovdqa %xmm9, $XMM_STORAGE + 16*3($TW)\n"; - $code .= "vmovdqa %xmm10, $XMM_STORAGE + 16*4($TW)\n"; - $code .= "vmovdqa %xmm11, $XMM_STORAGE + 16*5($TW)\n"; - $code .= "vmovdqa %xmm12, $XMM_STORAGE + 16*6($TW)\n"; - $code .= "vmovdqa %xmm13, $XMM_STORAGE + 16*7($TW)\n"; - $code .= "vmovdqa %xmm14, $XMM_STORAGE + 16*8($TW)\n"; - $code .= "vmovdqa %xmm15, $XMM_STORAGE + 16*9($TW)\n"; - } - - $code .= "mov \$0x87, $gf_poly_8b\n"; - $code .= "vmovdqu ($tweak),%xmm1\n"; # read initial tweak values - $code .= "vpxor %xmm4,%xmm4,%xmm4\n"; # for key expansion + } - encrypt_tweak_for_decryption("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", - $key2, $key1, $TW); + { + $code .= <<___; + .L_remaining_num_blocks_is_2_${rndsuffix}: + vmovdqu8 ($input), %ymm1 + add \$0x20, $input +___ + } - if ($win64) { - $code .= "mov $input, 8 + 8*5(%rbp)\n"; # ciphertext pointer - $code .= "mov $output, 8 + 8*6(%rbp)\n"; # plaintext pointer - } + encrypt_by_four("%ymm1", "%ymm9", "%ymm0"); - { - $code.=<<___; - - cmp \$0x80,$length - jb 
.L_less_than_128_bytes_${rndsuffix} - vpbroadcastq $gf_poly_8b,$ZPOLY - cmp \$0x100,$length - jge .L_start_by16_${rndsuffix} - jmp .L_start_by8_${rndsuffix} - - .L_do_n_blocks_${rndsuffix}: - cmp \$0x0,$length - je .L_ret_${rndsuffix} - cmp \$0x70,$length - jge .L_remaining_num_blocks_is_7_${rndsuffix} - cmp \$0x60,$length - jge .L_remaining_num_blocks_is_6_${rndsuffix} - cmp \$0x50,$length - jge .L_remaining_num_blocks_is_5_${rndsuffix} - cmp \$0x40,$length - jge .L_remaining_num_blocks_is_4_${rndsuffix} - cmp \$0x30,$length - jge .L_remaining_num_blocks_is_3_${rndsuffix} - cmp \$0x20,$length - jge .L_remaining_num_blocks_is_2_${rndsuffix} - cmp \$0x10,$length - jge .L_remaining_num_blocks_is_1_${rndsuffix} - - # _remaining_num_blocks_is_0: - vmovdqu %xmm5, %xmm1 - # xmm5 contains last full block to decrypt with next teawk + { + $code .= <<___; + vmovdqu %ymm1,($output) + add \$0x20,$output + vextracti32x4 \$0x1, %zmm1, %xmm8 + vextracti32x4 \$0x2,%zmm9,%xmm0 + and \$0xf,$length + je .L_ret_${rndsuffix} + jmp .L_steal_cipher_${rndsuffix} ___ - } - decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1); + } - { - $code .= <<___; - vmovdqu %xmm1, -0x10($output) - vmovdqa %xmm1, %xmm8 - - # Calc previous tweak - mov \$0x1,$tmp1 - kmovq $tmp1, %k1 - vpsllq \$0x3f,%xmm9,%xmm13 - vpsraq \$0x3f,%xmm13,%xmm14 - vpandq %xmm25,%xmm14,%xmm5 - vpxorq %xmm5,%xmm9,%xmm9{%k1} - vpsrldq \$0x8,%xmm9,%xmm10 - .byte 98, 211, 181, 8, 115, 194, 1 #vpshrdq \$0x1,%xmm10,%xmm9,%xmm0 - vpslldq \$0x8,%xmm13,%xmm13 - vpxorq %xmm13,%xmm0,%xmm0 - jmp .L_steal_cipher_${rndsuffix} - - .L_remaining_num_blocks_is_7_${rndsuffix}: - mov \$0xffffffffffffffff,$tmp1 - shr \$0x10,$tmp1 - kmovq $tmp1,%k1 - vmovdqu8 ($input),%zmm1 - vmovdqu8 0x40($input),%zmm2{%k1} - add \$0x70,$input - and \$0xf,$length - je .L_done_7_remain_${rndsuffix} - vextracti32x4 \$0x2,%zmm10,%xmm12 - vextracti32x4 \$0x3,%zmm10,%xmm13 - vinserti32x4 \$0x2,%xmm13,%zmm10,%zmm10 + { + $code .= <<___; + .L_remaining_num_blocks_is_1_${rndsuffix}: + vmovdqu ($input),%xmm1 + add \$0x10,$input ___ - } + } - decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1); + encrypt_final("%xmm1", "%xmm9"); - { - $code .= <<___; - vmovdqu8 %zmm1, ($output) - vmovdqu8 %zmm2, 0x40($output){%k1} - add \$0x70, $output - vextracti32x4 \$0x2,%zmm2,%xmm8 - vmovdqa %xmm12,%xmm0 - jmp .L_steal_cipher_${rndsuffix} -___ - } + { + $code .= <<___; + vmovdqu %xmm1,($output) + add \$0x10,$output + vmovdqa %xmm1,%xmm8 + vextracti32x4 \$0x1,%zmm9,%xmm0 + and \$0xf,$length + je .L_ret_${rndsuffix} + jmp .L_steal_cipher_${rndsuffix} + + + # Set up for and then generation of 16 tweaks. 
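As background for the by-16 tweak pipeline that begins at the .L_start_by16 label just below: each XTS tweak is the previous tweak multiplied by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1, which is where the 0x87 constant loaded into $gf_poly_8b (and broadcast into $ZPOLY) comes from. A minimal scalar sketch of one such doubling step in C; the helper name xts_mul_x is illustrative, not part of this patch, and it assumes a little-endian host:

    #include <stdint.h>
    #include <string.h>

    /* Multiply a 128-bit XTS tweak (stored little-endian) by x in GF(2^128),
     * reducing by x^128 + x^7 + x^2 + x + 1, i.e. the 0x87 constant above. */
    static void xts_mul_x(uint8_t tweak[16]) {
        uint64_t lo, hi;
        memcpy(&lo, tweak, 8);
        memcpy(&hi, tweak + 8, 8);

        uint64_t carry = hi >> 63;   /* bit 127, about to fall off the top */
        hi = (hi << 1) | (lo >> 63);
        lo = lo << 1;
        lo ^= 0x87 * carry;          /* XOR in 0x87 only when bit 127 was set */

        memcpy(tweak, &lo, 8);
        memcpy(tweak + 8, &hi, 8);
    }

The vectorized setup below produces the same sequence of tweaks, several at a time: the vpsllvq/vpsrlvq pairs shift four broadcast copies of the tweak by different amounts per 128-bit lane, and a single vpclmulqdq against $ZPOLY folds the overflow bits of every lane back in one instruction.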
+ .L_start_by16_${rndsuffix}: + vbroadcasti32x4 ($TW),%zmm0 + vbroadcasti32x4 shufb_15_7(%rip),%zmm8 + mov \$0xaa,$tmp1 + kmovq $tmp1,%k2 + vpshufb %zmm8,%zmm0,%zmm1 + + # Tweaks 0-3 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 + vpclmulqdq \$0x0,$ZPOLY,%zmm2,%zmm3 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + + # Tweaks 4-7 + vpsllvq const_dq7654(%rip),%zmm0,%zmm5 + vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 + vpclmulqdq \$0x0,$ZPOLY,%zmm6,%zmm7 + vpxorq %zmm6,%zmm5,%zmm5{%k2} + vpxord %zmm5,%zmm7,%zmm10 + + # Tweaks 8-11 + vpsrldq \$0xf,%zmm9,%zmm13 + vpclmulqdq \$0x0,$ZPOLY,%zmm13,%zmm14 + vpslldq \$0x1,%zmm9,%zmm11 + vpxord %zmm14,%zmm11,%zmm11 + + # Tweaks 12-15 + vpsrldq \$0xf,%zmm10,%zmm15 + vpclmulqdq \$0x0,$ZPOLY,%zmm15,%zmm16 + vpslldq \$0x1,%zmm10,%zmm12 + vpxord %zmm16,%zmm12,%zmm12 + + .L_main_loop_run_16_${rndsuffix}: + vmovdqu8 ($input),%zmm1 + vmovdqu8 0x40($input),%zmm2 + vmovdqu8 0x80($input),%zmm3 + vmovdqu8 0xc0($input),%zmm4 + add \$0x100,$input +___ + } + + encrypt_by_16("%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm9", + "%zmm10", "%zmm11", "%zmm12", "%zmm0", 0); + + { + $code .= <<___; + vmovdqu8 %zmm1,($output) + vmovdqu8 %zmm2,0x40($output) + vmovdqu8 %zmm3,0x80($output) + vmovdqu8 %zmm4,0xc0($output) + add \$0x100,$output + sub \$0x100,$length + cmp \$0x100,$length + jae .L_main_loop_run_16_${rndsuffix} + cmp \$0x80,$length + jae .L_main_loop_run_8_${rndsuffix} + vextracti32x4 \$0x3,%zmm4,%xmm0 + jmp .L_do_n_blocks_${rndsuffix} + + .L_start_by8_${rndsuffix}: + vbroadcasti32x4 ($TW),%zmm0 + vbroadcasti32x4 shufb_15_7(%rip),%zmm8 + mov \$0xaa,$tmp1 + kmovq $tmp1,%k2 + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 + vpclmulqdq \$0x0,%zmm25,%zmm2,%zmm3 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + vpsllvq const_dq7654(%rip),%zmm0,%zmm5 + vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 + vpclmulqdq \$0x0,%zmm25,%zmm6,%zmm7 + vpxorq %zmm6,%zmm5,%zmm5{%k2} + vpxord %zmm5,%zmm7,%zmm10 + + .L_main_loop_run_8_${rndsuffix}: + vmovdqu8 ($input),%zmm1 + vmovdqu8 0x40($input),%zmm2 + add \$0x80,$input +___ + } + + encrypt_by_eight("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 0); + + { + $code .= <<___; + vmovdqu8 %zmm1,($output) + vmovdqu8 %zmm2,0x40($output) + add \$0x80,$output + sub \$0x80,$length + cmp \$0x80,$length + jae .L_main_loop_run_8_${rndsuffix} + vextracti32x4 \$0x3,%zmm2,%xmm0 + jmp .L_do_n_blocks_${rndsuffix} + + .L_steal_cipher_${rndsuffix}: + vmovdqa %xmm8,%xmm2 + lea vpshufb_shf_table(%rip),$TEMPLOW + vmovdqu ($TEMPLOW,$length,1),%xmm10 + vpshufb %xmm10,%xmm8,%xmm8 + vmovdqu -0x10($input,$length,1),%xmm3 + vmovdqu %xmm8,-0x10($output,$length,1) + lea vpshufb_shf_table(%rip),$TEMPLOW + add \$16, $TEMPLOW + sub $length,$TEMPLOW + vmovdqu ($TEMPLOW),%xmm10 + vpxor mask1(%rip),%xmm10,%xmm10 + vpshufb %xmm10,%xmm3,%xmm3 + vpblendvb %xmm10,%xmm2,%xmm3,%xmm3 + vpxor %xmm0,%xmm3,%xmm8 + vpxor ($key1),%xmm8,%xmm8 + vaesenc 0x10($key1),%xmm8,%xmm8 + vaesenc 0x20($key1),%xmm8,%xmm8 + vaesenc 0x30($key1),%xmm8,%xmm8 + vaesenc 0x40($key1),%xmm8,%xmm8 + vaesenc 0x50($key1),%xmm8,%xmm8 + vaesenc 0x60($key1),%xmm8,%xmm8 + vaesenc 0x70($key1),%xmm8,%xmm8 + vaesenc 0x80($key1),%xmm8,%xmm8 + vaesenc 0x90($key1),%xmm8,%xmm8 + vaesenc 0xa0($key1),%xmm8,%xmm8 + vaesenc 0xb0($key1),%xmm8,%xmm8 + vaesenc 0xc0($key1),%xmm8,%xmm8 + vaesenc 0xd0($key1),%xmm8,%xmm8 + vaesenclast 0xe0($key1),%xmm8,%xmm8 + vpxor %xmm0,%xmm8,%xmm8 + vmovdqu %xmm8,-0x10($output) + + 
.L_ret_${rndsuffix}: + mov $GP_STORAGE($TW),%rbx + xor $tmp1,$tmp1 + mov $tmp1,$GP_STORAGE($TW) + vpxorq %zmm0,%zmm0,%zmm0 +___ + } + + if ($win64) { + $code .= <<___; + mov $GP_STORAGE + 8*1($TW),%rdi + mov $tmp1,$GP_STORAGE + 8*1($TW) + mov $GP_STORAGE + 8*2($TW),%rsi + mov $tmp1,$GP_STORAGE + 8*2($TW) - $code .= "\n.L_done_7_remain_${rndsuffix}:\n"; - decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1); + vmovdqa $XMM_STORAGE + 16 * 0($TW), %xmm6 + vmovdqa $XMM_STORAGE + 16 * 1($TW), %xmm7 + vmovdqa $XMM_STORAGE + 16 * 2($TW), %xmm8 + vmovdqa $XMM_STORAGE + 16 * 3($TW), %xmm9 - { - $code .= <<___; - vmovdqu8 %zmm1, ($output) - vmovdqu8 %zmm2, 0x40($output){%k1} - jmp .L_ret_${rndsuffix} - - .L_remaining_num_blocks_is_6_${rndsuffix}: - vmovdqu8 ($input),%zmm1 - vmovdqu8 0x40($input),%ymm2 - add \$0x60,$input - and \$0xf, $length - je .L_done_6_remain_${rndsuffix} - vextracti32x4 \$0x1,%zmm10,%xmm12 - vextracti32x4 \$0x2,%zmm10,%xmm13 - vinserti32x4 \$0x1,%xmm13,%zmm10,%zmm10 -___ - } + # Zero the 64 bytes we just restored to the xmm registers. + vmovdqa64 %zmm0,$XMM_STORAGE($TW) - decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1); + vmovdqa $XMM_STORAGE + 16 * 4($TW), %xmm10 + vmovdqa $XMM_STORAGE + 16 * 5($TW), %xmm11 + vmovdqa $XMM_STORAGE + 16 * 6($TW), %xmm12 + vmovdqa $XMM_STORAGE + 16 * 7($TW), %xmm13 - { - $code .= <<___; - vmovdqu8 %zmm1, ($output) - vmovdqu8 %ymm2, 0x40($output) - add \$0x60,$output - vextracti32x4 \$0x1,%zmm2,%xmm8 - vmovdqa %xmm12,%xmm0 - jmp .L_steal_cipher_${rndsuffix} -___ - } + # And again. + vmovdqa64 %zmm0,$XMM_STORAGE + 16 * 4($TW) - $code .= "\n.L_done_6_remain_${rndsuffix}:\n"; - decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1); + vmovdqa $XMM_STORAGE + 16 * 8($TW), %xmm14 + vmovdqa $XMM_STORAGE + 16 * 9($TW), %xmm15 - { - $code .= <<___; - vmovdqu8 %zmm1, ($output) - vmovdqu8 %ymm2,0x40($output) - jmp .L_ret_${rndsuffix} - - .L_remaining_num_blocks_is_5_${rndsuffix}: - vmovdqu8 ($input),%zmm1 - vmovdqu 0x40($input),%xmm2 - add \$0x50,$input - and \$0xf,$length - je .L_done_5_remain_${rndsuffix} - vmovdqa %xmm10,%xmm12 - vextracti32x4 \$0x1,%zmm10,%xmm10 + # Last round is only 32 bytes (256-bits), so we use `%ymm` as the + # source operand. 
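(As a cross-check on the sizes in this epilogue: the Win64 prologue saved xmm6 through xmm15, that is 10 registers x 16 bytes = 160 bytes, so the save area is cleared as two full 64-byte %zmm stores plus this final 32-byte %ymm store, 64 + 64 + 32 = 160.)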
+ vmovdqa %ymm0,$XMM_STORAGE + 16 * 8($TW) ___ - } + } - decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1); + { + $code .= <<___; + mov %rbp,$TW + pop %rbp + vzeroupper + ret - { - $code .= <<___; - vmovdqu8 %zmm1, ($output) - vmovdqu %xmm2, 0x40($output) - add \$0x50, $output - vmovdqa %xmm2,%xmm8 - vmovdqa %xmm12,%xmm0 - jmp .L_steal_cipher_${rndsuffix} + .L_less_than_128_bytes_${rndsuffix}: + vpbroadcastq $gf_poly_8b, $ZPOLY + cmp \$0x10,$length + jb .L_ret_${rndsuffix} + vbroadcasti32x4 ($TW), %zmm0 + vbroadcasti32x4 shufb_15_7(%rip), %zmm8 + movl \$0xaa, %r8d + kmovq %r8, %k2 + mov $length,$tmp1 + and \$0x70,$tmp1 + cmp \$0x60,$tmp1 + je .L_num_blocks_is_6_${rndsuffix} + cmp \$0x50,$tmp1 + je .L_num_blocks_is_5_${rndsuffix} + cmp \$0x40,$tmp1 + je .L_num_blocks_is_4_${rndsuffix} + cmp \$0x30,$tmp1 + je .L_num_blocks_is_3_${rndsuffix} + cmp \$0x20,$tmp1 + je .L_num_blocks_is_2_${rndsuffix} + cmp \$0x10,$tmp1 + je .L_num_blocks_is_1_${rndsuffix} + + .L_num_blocks_is_7_${rndsuffix}: + vpshufb %zmm8, %zmm0, %zmm1 + vpsllvq const_dq3210(%rip), %zmm0, %zmm4 + vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 + vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 + vpxorq %zmm2, %zmm4, %zmm4{%k2} + vpxord %zmm4, %zmm3, %zmm9 + vpsllvq const_dq7654(%rip), %zmm0, %zmm5 + vpsrlvq const_dq1234(%rip), %zmm1, %zmm6 + vpclmulqdq \$0x00, $ZPOLY, %zmm6, %zmm7 + vpxorq %zmm6, %zmm5, %zmm5{%k2} + vpxord %zmm5, %zmm7, %zmm10 + mov \$0x0000ffffffffffff, $tmp1 + kmovq $tmp1, %k1 + vmovdqu8 16*0($input), %zmm1 + vmovdqu8 16*4($input), %zmm2{%k1} + + add \$0x70,$input +___ + } + + encrypt_by_eight("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1); + + { + $code .= <<___; + vmovdqu8 %zmm1, 16*0($output) + vmovdqu8 %zmm2, 16*4($output){%k1} + add \$0x70,$output + vextracti32x4 \$0x2, %zmm2, %xmm8 + vextracti32x4 \$0x3, %zmm10, %xmm0 + and \$0xf,$length + je .L_ret_${rndsuffix} + jmp .L_steal_cipher_${rndsuffix} ___ - } + } - $code .= "\n.L_done_5_remain_${rndsuffix}:\n"; - decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1); + { + $code .= <<___; + .L_num_blocks_is_6_${rndsuffix}: + vpshufb %zmm8, %zmm0, %zmm1 + vpsllvq const_dq3210(%rip), %zmm0, %zmm4 + vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 + vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 + vpxorq %zmm2, %zmm4, %zmm4{%k2} + vpxord %zmm4, %zmm3, %zmm9 + vpsllvq const_dq7654(%rip), %zmm0, %zmm5 + vpsrlvq const_dq1234(%rip), %zmm1, %zmm6 + vpclmulqdq \$0x00, $ZPOLY, %zmm6, %zmm7 + vpxorq %zmm6, %zmm5, %zmm5{%k2} + vpxord %zmm5, %zmm7, %zmm10 + vmovdqu8 16*0($input), %zmm1 + vmovdqu8 16*4($input), %ymm2 + add \$96, $input +___ + } + + encrypt_by_eight("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1); + + { + $code .= <<___; + vmovdqu8 %zmm1, 16*0($output) + vmovdqu8 %ymm2, 16*4($output) + add \$96, $output - { - $code .= <<___; - vmovdqu8 %zmm1, ($output) - vmovdqu8 %xmm2, 0x40($output) - jmp .L_ret_${rndsuffix} - - .L_remaining_num_blocks_is_4_${rndsuffix}: - vmovdqu8 ($input),%zmm1 - add \$0x40,$input - and \$0xf, $length - je .L_done_4_remain_${rndsuffix} - vextracti32x4 \$0x3,%zmm9,%xmm12 - vinserti32x4 \$0x3,%xmm10,%zmm9,%zmm9 + vextracti32x4 \$0x1, %ymm2, %xmm8 + vextracti32x4 \$0x2, %zmm10, %xmm0 + and \$0xf,$length + je .L_ret_${rndsuffix} + jmp .L_steal_cipher_${rndsuffix} ___ - } + } - decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1); + { + $code .= <<___; + .L_num_blocks_is_5_${rndsuffix}: + vpshufb %zmm8, %zmm0, %zmm1 + vpsllvq const_dq3210(%rip), %zmm0, %zmm4 + vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 + vpclmulqdq 
\$0x00, $ZPOLY, %zmm2, %zmm3 + vpxorq %zmm2, %zmm4, %zmm4{%k2} + vpxord %zmm4, %zmm3, %zmm9 + vpsllvq const_dq7654(%rip), %zmm0, %zmm5 + vpsrlvq const_dq1234(%rip), %zmm1, %zmm6 + vpclmulqdq \$0x00, $ZPOLY, %zmm6, %zmm7 + vpxorq %zmm6, %zmm5, %zmm5{%k2} + vpxord %zmm5, %zmm7, %zmm10 + vmovdqu8 16*0($input), %zmm1 + vmovdqu8 16*4($input), %xmm2 + add \$80, $input +___ + } + + encrypt_by_eight("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1); + + { + $code .= <<___; + vmovdqu8 %zmm1, 16*0($output) + vmovdqu8 %xmm2, 16*4($output) + add \$80, $output - { - $code .= <<___; - vmovdqu8 %zmm1,($output) - add \$0x40,$output - vextracti32x4 \$0x3,%zmm1,%xmm8 - vmovdqa %xmm12,%xmm0 - jmp .L_steal_cipher_${rndsuffix} + vmovdqa %xmm2, %xmm8 + vextracti32x4 \$0x1, %zmm10, %xmm0 + and \$0xf,$length + je .L_ret_${rndsuffix} + jmp .L_steal_cipher_${rndsuffix} ___ - } - - $code .= "\n.L_done_4_remain_${rndsuffix}:\n"; - decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1); + } - { - $code .= <<___; - vmovdqu8 %zmm1, ($output) - jmp .L_ret_${rndsuffix} - - .L_remaining_num_blocks_is_3_${rndsuffix}: - vmovdqu ($input),%xmm1 - vmovdqu 0x10($input),%xmm2 - vmovdqu 0x20($input),%xmm3 - add \$0x30,$input - and \$0xf,$length - je .L_done_3_remain_${rndsuffix} - vextracti32x4 \$0x2,%zmm9,%xmm13 - vextracti32x4 \$0x1,%zmm9,%xmm10 - vextracti32x4 \$0x3,%zmm9,%xmm11 + { + $code .= <<___; + .L_num_blocks_is_4_${rndsuffix}: + vpshufb %zmm8, %zmm0, %zmm1 + vpsllvq const_dq3210(%rip), %zmm0, %zmm4 + vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 + vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 + vpxorq %zmm2, %zmm4, %zmm4{%k2} + vpxord %zmm4, %zmm3, %zmm9 + vpsllvq const_dq7654(%rip), %zmm0, %zmm5 + vpsrlvq const_dq1234(%rip), %zmm1, %zmm6 + vpclmulqdq \$0x00, $ZPOLY, %zmm6, %zmm7 + vpxorq %zmm6, %zmm5, %zmm5{%k2} + vpxord %zmm5, %zmm7, %zmm10 + vmovdqu8 16*0($input), %zmm1 + add \$64, $input +___ + } + + encrypt_by_four("%zmm1", "%zmm9", "%zmm0"); + + { + $code .= <<___; + vmovdqu8 %zmm1, 16*0($output) + add \$64, $output + vextracti32x4 \$0x3, %zmm1, %xmm8 + vmovdqa %xmm10, %xmm0 + and \$0xf,$length + je .L_ret_${rndsuffix} + jmp .L_steal_cipher_${rndsuffix} ___ - } - - decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1); + } - { - $code .= <<___; - vmovdqu %xmm1,($output) - vmovdqu %xmm2,0x10($output) - vmovdqu %xmm3,0x20($output) - add \$0x30,$output - vmovdqa %xmm3,%xmm8 - vmovdqa %xmm13,%xmm0 - jmp .L_steal_cipher_${rndsuffix} + { + $code .= <<___; + .L_num_blocks_is_3_${rndsuffix}: + vpshufb %zmm8, %zmm0, %zmm1 + vpsllvq const_dq3210(%rip), %zmm0, %zmm4 + vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 + vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 + vpxorq %zmm2, %zmm4, %zmm4{%k2} + vpxord %zmm4, %zmm3, %zmm9 + mov \$0x0000ffffffffffff, $tmp1 + kmovq $tmp1, %k1 + vmovdqu8 16*0($input), %zmm1{%k1} + add \$48, $input ___ - } - $code .= "\n.L_done_3_remain_${rndsuffix}:\n"; - $code .= "vextracti32x4 \$0x1,%zmm9,%xmm10\n"; - $code .= "vextracti32x4 \$0x2,%zmm9,%xmm11\n"; + } - decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1); + encrypt_by_four("%zmm1", "%zmm9", "%zmm0"); - { - $code .= <<___; - vmovdqu %xmm1,($output) - vmovdqu %xmm2,0x10($output) - vmovdqu %xmm3,0x20($output) - jmp .L_ret_${rndsuffix} - - .L_remaining_num_blocks_is_2_${rndsuffix}: - vmovdqu ($input),%xmm1 - 
vmovdqu 0x10($input),%xmm2 - add \$0x20,$input - and \$0xf,$length - je .L_done_2_remain_${rndsuffix} - vextracti32x4 \$0x2,%zmm9,%xmm10 - vextracti32x4 \$0x1,%zmm9,%xmm12 + { + $code .= <<___; + vmovdqu8 %zmm1, 16*0($output){%k1} + add \$48, $output + vextracti32x4 \$2, %zmm1, %xmm8 + vextracti32x4 \$3, %zmm9, %xmm0 + and \$0xf,$length + je .L_ret_${rndsuffix} + jmp .L_steal_cipher_${rndsuffix} ___ - } + } - decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1); + { + $code .= <<___; + .L_num_blocks_is_2_${rndsuffix}: + vpshufb %zmm8, %zmm0, %zmm1 + vpsllvq const_dq3210(%rip), %zmm0, %zmm4 + vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 + vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 + vpxorq %zmm2, %zmm4, %zmm4{%k2} + vpxord %zmm4, %zmm3, %zmm9 - { - $code .= <<___; - vmovdqu %xmm1,($output) - vmovdqu %xmm2,0x10($output) - add \$0x20,$output - vmovdqa %xmm2,%xmm8 - vmovdqa %xmm12,%xmm0 - jmp .L_steal_cipher_${rndsuffix} + vmovdqu8 16*0($input), %ymm1 + add \$32, $input ___ - } - $code .= "\n.L_done_2_remain_${rndsuffix}:\n"; - $code .= "vextracti32x4 \$0x1,%zmm9,%xmm10\n"; + } - decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1); + encrypt_by_four("%ymm1", "%ymm9", "%ymm0"); - { - $code .= <<___; - vmovdqu %xmm1,($output) - vmovdqu %xmm2,0x10($output) - jmp .L_ret_${rndsuffix} + { + $code .= <<___; + vmovdqu8 %ymm1, 16*0($output) + add \$32, $output - .L_remaining_num_blocks_is_1_${rndsuffix}: - vmovdqu ($input),%xmm1 - add \$0x10,$input - and \$0xf,$length - je .L_done_1_remain_${rndsuffix} - vextracti32x4 \$0x1,%zmm9,%xmm11 + vextracti32x4 \$1, %ymm1, %xmm8 + vextracti32x4 \$2, %zmm9, %xmm0 + and \$0xf,$length + je .L_ret_${rndsuffix} + jmp .L_steal_cipher_${rndsuffix} ___ - } + } - decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm11", "%xmm10", "%xmm9", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1); - { - $code .= <<___; - vmovdqu %xmm1,($output) - add \$0x10,$output - vmovdqa %xmm1,%xmm8 - vmovdqa %xmm9,%xmm0 - jmp .L_steal_cipher_${rndsuffix} + { + $code .= <<___; + .L_num_blocks_is_1_${rndsuffix}: + vpshufb %zmm8, %zmm0, %zmm1 + vpsllvq const_dq3210(%rip), %zmm0, %zmm4 + vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 + vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 + vpxorq %zmm2, %zmm4, %zmm4{%k2} + vpxord %zmm4, %zmm3, %zmm9 + + vmovdqu8 16*0($input), %xmm1 + add \$16, $input ___ - } + } - $code .= "\n.L_done_1_remain_${rndsuffix}:\n"; + encrypt_by_four("%ymm1", "%ymm9", "%ymm0"); - decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1); + { + $code .= <<___; + vmovdqu8 %xmm1, 16*0($output) + add \$16, $output - { - $code .= <<___; - vmovdqu %xmm1, ($output) - jmp .L_ret_${rndsuffix} - - .L_start_by16_${rndsuffix}: - vbroadcasti32x4 ($TW),%zmm0 - vbroadcasti32x4 shufb_15_7(%rip),%zmm8 - mov \$0xaa,$tmp1 - kmovq $tmp1,%k2 - - # Mult tweak by 2^{3, 2, 1, 0} - vpshufb %zmm8,%zmm0,%zmm1 - vpsllvq const_dq3210(%rip),%zmm0,%zmm4 - vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 - vpclmulqdq \$0x0,$ZPOLY,%zmm2,%zmm3 - vpxorq %zmm2,%zmm4,%zmm4{%k2} - vpxord %zmm4,%zmm3,%zmm9 - - # Mult tweak by 2^{7, 6, 5, 4} - vpsllvq const_dq7654(%rip),%zmm0,%zmm5 - vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 - vpclmulqdq 
\$0x0,%zmm25,%zmm6,%zmm7 - vpxorq %zmm6,%zmm5,%zmm5{%k2} - vpxord %zmm5,%zmm7,%zmm10 - - # Make next 8 tweek values by all x 2^8 - vpsrldq \$0xf,%zmm9,%zmm13 - vpclmulqdq \$0x0,%zmm25,%zmm13,%zmm14 - vpslldq \$0x1,%zmm9,%zmm11 - vpxord %zmm14,%zmm11,%zmm11 - - vpsrldq \$0xf,%zmm10,%zmm15 - vpclmulqdq \$0x0,%zmm25,%zmm15,%zmm16 - vpslldq \$0x1,%zmm10,%zmm12 - vpxord %zmm16,%zmm12,%zmm12 - - .L_main_loop_run_16_${rndsuffix}: - vmovdqu8 ($input),%zmm1 - vmovdqu8 0x40($input),%zmm2 - vmovdqu8 0x80($input),%zmm3 - vmovdqu8 0xc0($input),%zmm4 - vmovdqu8 0xf0($input),%xmm5 - add \$0x100,$input + vmovdqa %xmm1, %xmm8 + vextracti32x4 \$1, %zmm9, %xmm0 + and \$0xf,$length + je .L_ret_${rndsuffix} + jmp .L_steal_cipher_${rndsuffix} + .cfi_endproc ___ + } } - decrypt_by_16_zmm("%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm9", - "%zmm10", "%zmm11", "%zmm12", "%zmm0", 0); + # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + # ;void aes_hw_xts_decrypt_avx512( + # ; const uint8_t *in, // input data + # ; uint8_t *out, // output data + # ; size_t length, // sector size, in bytes + # ; const AES_KEY *key1, // key used for "ECB" encryption, 16*2 bytes + # ; const AES_KEY *key2, // key used for tweaking, 16*2 bytes + # ; const uint8_t iv[16]) // initial tweak value, 16 bytes + # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + sub dec { + my $rndsuffix = &random_string(); - { - $code .= <<___; - vmovdqu8 %zmm1,($output) - vmovdqu8 %zmm2,0x40($output) - vmovdqu8 %zmm3,0x80($output) - vmovdqu8 %zmm4,0xc0($output) - add \$0x100,$output - sub \$0x100,$length - cmp \$0x100,$length - jge .L_main_loop_run_16_${rndsuffix} - - cmp \$0x80,$length - jge .L_main_loop_run_8_${rndsuffix} - jmp .L_do_n_blocks_${rndsuffix} - - .L_start_by8_${rndsuffix}: - # Make first 7 tweek values - vbroadcasti32x4 ($TW),%zmm0 - vbroadcasti32x4 shufb_15_7(%rip),%zmm8 - mov \$0xaa,$tmp1 - kmovq $tmp1,%k2 - - # Mult tweak by 2^{3, 2, 1, 0} - vpshufb %zmm8,%zmm0,%zmm1 - vpsllvq const_dq3210(%rip),%zmm0,%zmm4 - vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 - vpclmulqdq \$0x0,%zmm25,%zmm2,%zmm3 - vpxorq %zmm2,%zmm4,%zmm4{%k2} - vpxord %zmm4,%zmm3,%zmm9 - - # Mult tweak by 2^{7, 6, 5, 4} - vpsllvq const_dq7654(%rip),%zmm0,%zmm5 - vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 - vpclmulqdq \$0x0,%zmm25,%zmm6,%zmm7 - vpxorq %zmm6,%zmm5,%zmm5{%k2} - vpxord %zmm5,%zmm7,%zmm10 - - .L_main_loop_run_8_${rndsuffix}: - vmovdqu8 ($input),%zmm1 - vmovdqu8 0x40($input),%zmm2 - vmovdqu8 0x70($input),%xmm5 - add \$0x80,$input + $code.=<<___; + .globl aes_hw_xts_decrypt_avx512 + .hidden aes_hw_xts_decrypt_avx512 + .type aes_hw_xts_decrypt_avx512,\@function,6 + .align 32 + aes_hw_xts_decrypt_avx512: + .cfi_startproc + endbranch ___ - } + $code .= "push %rbp\n"; + $code .= "mov $TW,%rbp\n"; + $code .= "sub \$$VARIABLE_OFFSET,$TW\n"; + $code .= "and \$0xffffffffffffffc0,$TW\n"; + $code .= "mov %rbx,$GP_STORAGE($TW)\n"; + if ($win64) { + $code .= "mov %rdi,$GP_STORAGE + 8*1($TW)\n"; + $code .= "mov %rsi,$GP_STORAGE + 8*2($TW)\n"; + $code .= "vmovdqa %xmm6, $XMM_STORAGE + 16*0($TW)\n"; + $code .= "vmovdqa %xmm7, $XMM_STORAGE + 16*1($TW)\n"; + $code .= "vmovdqa %xmm8, $XMM_STORAGE + 16*2($TW)\n"; + $code .= "vmovdqa %xmm9, $XMM_STORAGE + 16*3($TW)\n"; + $code .= "vmovdqa %xmm10, $XMM_STORAGE + 16*4($TW)\n"; + $code .= "vmovdqa %xmm11, $XMM_STORAGE + 16*5($TW)\n"; + $code .= "vmovdqa %xmm12, $XMM_STORAGE + 16*6($TW)\n"; + $code .= "vmovdqa %xmm13, $XMM_STORAGE + 16*7($TW)\n"; + $code .= "vmovdqa %xmm14, $XMM_STORAGE + 16*8($TW)\n"; + $code .= 
"vmovdqa %xmm15, $XMM_STORAGE + 16*9($TW)\n"; + } - decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 0); + $code .= "mov \$0x87, $gf_poly_8b\n"; + $code .= "vmovdqu ($tweak),%xmm1\n"; # read initial tweak values - { - $code .= <<___; - vmovdqu8 %zmm1,($output) - vmovdqu8 %zmm2,0x40($output) - add \$0x80,$output - sub \$0x80,$length - cmp \$0x80,$length - jge .L_main_loop_run_8_${rndsuffix} - jmp .L_do_n_blocks_${rndsuffix} - - .L_steal_cipher_${rndsuffix}: - # start cipher stealing simplified: xmm8-last cipher block, xmm0-next tweak - vmovdqa %xmm8,%xmm2 - - # shift xmm8 to the left by 16-N_val bytes - lea vpshufb_shf_table(%rip),$TWTEMPL - vmovdqu ($TWTEMPL,$length,1),%xmm10 - vpshufb %xmm10,%xmm8,%xmm8 - - - vmovdqu -0x10($input,$length,1),%xmm3 - vmovdqu %xmm8,-0x10($output,$length,1) - - # shift xmm3 to the right by 16-N_val bytes - lea vpshufb_shf_table(%rip), $TWTEMPL - add \$16, $TWTEMPL - sub $length,$TWTEMPL - vmovdqu ($TWTEMPL),%xmm10 - vpxor mask1(%rip),%xmm10,%xmm10 - vpshufb %xmm10,%xmm3,%xmm3 - - vpblendvb %xmm10,%xmm2,%xmm3,%xmm3 - - # xor Tweak value - vpxor %xmm0,%xmm3,%xmm8 - - # decrypt last block with cipher stealing - vpxor 0x80(%rsp),%xmm8,%xmm8 - vaesdec 0x90(%rsp),%xmm8,%xmm8 - vaesdec 0xa0(%rsp),%xmm8,%xmm8 - vaesdec 0xb0(%rsp),%xmm8,%xmm8 - vaesdec 0xc0(%rsp),%xmm8,%xmm8 - vaesdec 0xd0(%rsp),%xmm8,%xmm8 - vaesdec 0xe0(%rsp),%xmm8,%xmm8 - vaesdec 0xf0(%rsp),%xmm8,%xmm8 - vaesdec 0x100(%rsp),%xmm8,%xmm8 - vaesdec 0x110(%rsp),%xmm8,%xmm8 - vaesdec 0x120(%rsp),%xmm8,%xmm8 - vaesdec 0x130(%rsp),%xmm8,%xmm8 - vaesdec 0x140(%rsp),%xmm8,%xmm8 - vaesdec 0x150(%rsp),%xmm8,%xmm8 - vaesdeclast 0x160(%rsp),%xmm8,%xmm8 - - # xor Tweak value - vpxor %xmm0,%xmm8,%xmm8 - - .L_done_${rndsuffix}: - # store last ciphertext value - vmovdqu %xmm8,-0x10($output) -___ - } + encrypt_tweak("%xmm1"); - { - $code .= <<___; - .L_ret_${rndsuffix}: - mov $GP_STORAGE($TW),%rbx - xor $tmp1,$tmp1 - mov $tmp1,$GP_STORAGE($TW) - # Zero-out the whole of `%zmm0`. - vpxorq %zmm0,%zmm0,%zmm0 -___ - } + if ($win64) { + $code .= "mov $input, 8 + 8*5(%rbp)\n"; # ciphertext pointer + $code .= "mov $output, 8 + 8*6(%rbp)\n"; # plaintext pointer + } - if ($win64) { + { + $code.=<<___; + # XTS decryption involves special tweak handling for the final block, so if + # there is /only/ one block, we just jump straight to that handling. + cmp \$0x20,$length + jl .L_final_block_is_only_block_${rndsuffix} + + # Otherwise, we reduce length by `to the (nearest multple of 16) - 16`, + # leaving the final block + any bytes that need cipher stealing and leave + # those for the special tweak handling. 
+ mov $length, $decLength + and \$0xfffffffffffffff0,$decLength + sub \$16,$decLength + cmp \$0x80,$decLength + jl .L_less_than_128_bytes_${rndsuffix} + vpbroadcastq $gf_poly_8b,$ZPOLY + cmp \$0x100,$decLength + jge .L_start_by16_${rndsuffix} + cmp \$0x80,$decLength + jge .L_start_by8_${rndsuffix} + + .L_do_n_blocks_${rndsuffix}: + cmp \$0x70,$decLength + je .L_remaining_num_blocks_is_7_${rndsuffix} + cmp \$0x60,$decLength + je .L_remaining_num_blocks_is_6_${rndsuffix} + cmp \$0x50,$decLength + je .L_remaining_num_blocks_is_5_${rndsuffix} + cmp \$0x40,$decLength + je .L_remaining_num_blocks_is_4_${rndsuffix} + cmp \$0x30,$decLength + je .L_remaining_num_blocks_is_3_${rndsuffix} + cmp \$0x20,$decLength + je .L_remaining_num_blocks_is_2_${rndsuffix} + cmp \$0x10,$decLength + je .L_remaining_num_blocks_is_1_${rndsuffix} + and \$0xf,$length + je .L_final_block_${rndsuffix} + vextracti32x4 \$0x0,%zmm9,%xmm0 + vextracti32x4 \$0x1,%zmm9,%xmm15 + jmp .L_steal_cipher_${rndsuffix} + + .L_remaining_num_blocks_is_7_${rndsuffix}: + mov \$0x0000ffffffffffff,$tmp1 + kmovq $tmp1,%k1 + vmovdqu8 ($input),%zmm1 + vmovdqu8 0x40($input),%zmm2{%k1} + add \$0x70,$input +___ + } + + decrypt_by_eight("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1); + + { $code .= <<___; - mov $GP_STORAGE + 8*1($TW),%rdi - mov $tmp1,$GP_STORAGE + 8*1($TW) - mov $GP_STORAGE + 8*2($TW),%rsi - mov $tmp1,$GP_STORAGE + 8*2($TW) + vmovdqu8 %zmm1,($output) + vmovdqu8 %zmm2,0x40($output){%k1} + add \$0x70,$output + vextracti32x4 \$0x3,%zmm10,%xmm0 + and \$0xf,$length + je .L_final_block_${rndsuffix} + vpsrldq \$0xf,%zmm9,%zmm13 + vpclmulqdq \$0x0,$ZPOLY,%zmm13,%zmm14 + vpslldq \$0x1,%zmm9,%zmm11 + vpxord %zmm14,%zmm11,%zmm11 + vextracti32x4 \$0x0,%zmm11,%xmm15 + jmp .L_steal_cipher_${rndsuffix} - vmovdqa $XMM_STORAGE + 16 * 0($TW), %xmm6 - vmovdqa $XMM_STORAGE + 16 * 1($TW), %xmm7 - vmovdqa $XMM_STORAGE + 16 * 2($TW), %xmm8 - vmovdqa $XMM_STORAGE + 16 * 3($TW), %xmm9 + .L_remaining_num_blocks_is_6_${rndsuffix}: + vmovdqu8 ($input),%zmm1 + vmovdqu8 0x40($input),%ymm2 + add \$0x60,$input +___ + } - # Zero the 64 bytes we just restored to the xmm registers. - vmovdqa64 %zmm0,$XMM_STORAGE($TW) + decrypt_by_eight("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1); - vmovdqa $XMM_STORAGE + 16 * 4($TW), %xmm10 - vmovdqa $XMM_STORAGE + 16 * 5($TW), %xmm11 - vmovdqa $XMM_STORAGE + 16 * 6($TW), %xmm12 - vmovdqa $XMM_STORAGE + 16 * 7($TW), %xmm13 + { + $code .= <<___; + vmovdqu8 %zmm1,($output) + vmovdqu8 %ymm2,0x40($output) + add \$0x60,$output + vextracti32x4 \$0x2,%zmm10,%xmm0 + vextracti32x4 \$0x3,%zmm10,%xmm15 + and \$0xf,$length + je .L_final_block_${rndsuffix} + jmp .L_steal_cipher_${rndsuffix} - # And again. - vmovdqa64 %zmm0,$XMM_STORAGE + 16 * 4($TW) + .L_remaining_num_blocks_is_5_${rndsuffix}: + vmovdqu8 ($input),%zmm1 + vmovdqu 0x40($input),%xmm2 + add \$0x50,$input +___ + } - vmovdqa $XMM_STORAGE + 16 * 8($TW), %xmm14 - vmovdqa $XMM_STORAGE + 16 * 9($TW), %xmm15 + decrypt_by_eight("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1); - # Last round is only 32 bytes (256-bits), so we use `%ymm` as the - # source operand. - vmovdqa %ymm0,$XMM_STORAGE + 16 * 8($TW) -___ - } + { + $code .= <<___; + vmovdqu8 %zmm1,($output) + vmovdqu %xmm2,0x40($output) + add \$0x50,$output + vextracti32x4 \$0x1,%zmm10,%xmm0 + vextracti32x4 \$0x2,%zmm10,%xmm15 + and \$0xf,$length + je .L_final_block_${rndsuffix} + jmp .L_steal_cipher_${rndsuffix} - { - $code .= <<___; - # Zero-out the stack frames used for `key1`, 64 bytes at a time. 
- vmovdqa64 %zmm0,0x80(%rsp) - vmovdqa64 %zmm0,0xc0(%rsp) - vmovdqa64 %zmm0,0x100(%rsp) - - # Stack usage is not divisible by 64, so we use a kmask register to - # only mov 48 of the bytes (6 quad-words). - mov \$0x3f,$tmp1 - kmovq $tmp1,%k2 - vmovdqa64 %zmm0,0x140(%rsp){%k2} - - mov %rbp,%rsp - pop %rbp - vzeroupper - ret - - .L_less_than_128_bytes_${rndsuffix}: - cmp \$0x10,$length - jb .L_ret_${rndsuffix} - - mov $length,$tmp1 - and \$0x70,$tmp1 - cmp \$0x60,$tmp1 - je .L_num_blocks_is_6_${rndsuffix} - cmp \$0x50,$tmp1 - je .L_num_blocks_is_5_${rndsuffix} - cmp \$0x40,$tmp1 - je .L_num_blocks_is_4_${rndsuffix} - cmp \$0x30,$tmp1 - je .L_num_blocks_is_3_${rndsuffix} - cmp \$0x20,$tmp1 - je .L_num_blocks_is_2_${rndsuffix} - cmp \$0x10,$tmp1 - je .L_num_blocks_is_1_${rndsuffix} + .L_remaining_num_blocks_is_4_${rndsuffix}: + vmovdqu8 ($input),%zmm1 + add \$0x40,$input ___ - } + } - $code .= "\n.L_num_blocks_is_7_${rndsuffix}:\n"; - initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", 7); + decrypt_by_four("%zmm1", "%zmm9", "%zmm0"); - { - $code .= <<___; - add \$0x70,$input - and \$0xf,$length - je .L_done_7_${rndsuffix} - - .L_steal_cipher_7_${rndsuffix}: - xor $gf_poly_8b_temp, $gf_poly_8b_temp - shl \$1, $TWTEMPL - adc $TWTEMPH, $TWTEMPH - cmovc $gf_poly_8b, $gf_poly_8b_temp - xor $gf_poly_8b_temp, $TWTEMPL - mov $TWTEMPL,0x10($TW) - mov $TWTEMPH,0x18($TW) - vmovdqa64 %xmm15,%xmm16 - vmovdqa 0x10(%rsp),%xmm15 + { + $code .= <<___; + vmovdqu8 %zmm1,($output) + add \$0x40,$output + vextracti32x4 \$0x0,%zmm10,%xmm0 + vextracti32x4 \$0x1,%zmm10,%xmm15 + and \$0xf,$length + je .L_final_block_${rndsuffix} + jmp .L_steal_cipher_${rndsuffix} ___ - } - - decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 7, 1); + } - { - $code .= <<___; - vmovdqu %xmm1,($output) - vmovdqu %xmm2,0x10($output) - vmovdqu %xmm3,0x20($output) - vmovdqu %xmm4,0x30($output) - vmovdqu %xmm5,0x40($output) - vmovdqu %xmm6,0x50($output) - add \$0x70,$output - vmovdqa64 %xmm16,%xmm0 - vmovdqa %xmm7,%xmm8 - jmp .L_steal_cipher_${rndsuffix} + { + $code .= <<___; + .L_remaining_num_blocks_is_3_${rndsuffix}: + mov \$-1, $tmp1 + shr \$0x10, $tmp1 + kmovq $tmp1, %k1 + vmovdqu8 ($input), %zmm1{%k1} + add \$0x30, $input ___ - } + } - $code .= "\n.L_done_7_${rndsuffix}:\n"; - decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 7, 1); + decrypt_by_four("%zmm1", "%zmm9", "%zmm0"); - { - $code .= <<___; - vmovdqu %xmm1,($output) - vmovdqu %xmm2,0x10($output) - vmovdqu %xmm3,0x20($output) - vmovdqu %xmm4,0x30($output) - vmovdqu %xmm5,0x40($output) - vmovdqu %xmm6,0x50($output) - add \$0x70,$output - vmovdqa %xmm7,%xmm8 - jmp .L_done_${rndsuffix} + { + $code .= <<___; + vmovdqu8 %zmm1, ($output){%k1} + add \$0x30, $output + vextracti32x4 \$0x3, %zmm9, %xmm0 + vextracti32x4 \$0x0, %zmm10, %xmm15 + and \$0xf, $length + je .L_final_block_${rndsuffix} + jmp .L_steal_cipher_${rndsuffix} ___ - } - - $code .= "\n.L_num_blocks_is_6_${rndsuffix}:\n"; - initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", 6); + } - { - $code .= <<___; - add \$0x60,$input - and \$0xf,$length - je .L_done_6_${rndsuffix} - - 
.L_steal_cipher_6_${rndsuffix}: - xor $gf_poly_8b_temp, $gf_poly_8b_temp - shl \$1, $TWTEMPL - adc $TWTEMPH, $TWTEMPH - cmovc $gf_poly_8b, $gf_poly_8b_temp - xor $gf_poly_8b_temp, $TWTEMPL - mov $TWTEMPL,0x10($TW) - mov $TWTEMPH,0x18($TW) - vmovdqa64 %xmm14,%xmm15 - vmovdqa 0x10(%rsp),%xmm14 + { + $code .= <<___; + .L_remaining_num_blocks_is_2_${rndsuffix}: + vmovdqu8 ($input), %ymm1 + add \$0x20, $input ___ - } + } - decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 6, 1); + decrypt_by_four("%ymm1", "%ymm9", "%ymm0"); - { - $code .= <<___; - vmovdqu %xmm1,($output) - vmovdqu %xmm2,0x10($output) - vmovdqu %xmm3,0x20($output) - vmovdqu %xmm4,0x30($output) - vmovdqu %xmm5,0x40($output) - add \$0x60,$output - vmovdqa %xmm15,%xmm0 - vmovdqa %xmm6,%xmm8 - jmp .L_steal_cipher_${rndsuffix} + { + $code .= <<___; + vmovdqu %ymm1,($output) + add \$0x20,$output + vextracti32x4 \$0x2,%zmm9,%xmm0 + vextracti32x4 \$0x3,%zmm9,%xmm15 + and \$0xf,$length + je .L_final_block_${rndsuffix} + jmp .L_steal_cipher_${rndsuffix} ___ - } - $code .= "\n.L_done_6_${rndsuffix}:\n"; - decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 6, 1); + } - { - $code .= <<___; - vmovdqu %xmm1,($output) - vmovdqu %xmm2,0x10($output) - vmovdqu %xmm3,0x20($output) - vmovdqu %xmm4,0x30($output) - vmovdqu %xmm5,0x40($output) - add \$0x60,$output - vmovdqa %xmm6,%xmm8 - jmp .L_done_${rndsuffix} + { + $code .= <<___; + .L_remaining_num_blocks_is_1_${rndsuffix}: + vmovdqu ($input),%xmm1 + add \$0x10,$input ___ - } - - $code .= "\n.L_num_blocks_is_5_${rndsuffix}:\n"; - initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", 5); + } - { - $code .= <<___; - add \$0x50,$input - and \$0xf,$length - je .L_done_5_${rndsuffix} - - .L_steal_cipher_5_${rndsuffix}: - xor $gf_poly_8b_temp, $gf_poly_8b_temp - shl \$1, $TWTEMPL - adc $TWTEMPH, $TWTEMPH - cmovc $gf_poly_8b, $gf_poly_8b_temp - xor $gf_poly_8b_temp, $TWTEMPL - mov $TWTEMPL,0x10($TW) - mov $TWTEMPH,0x18($TW) - vmovdqa64 %xmm13,%xmm14 - vmovdqa 0x10($TW),%xmm13 -___ - } + decrypt_final("%xmm1", "%xmm9"); - decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 5, 1); + { + $code .= <<___; + vmovdqu %xmm1,($output) + add \$0x10,$output + vextracti32x4 \$0x1,%zmm9,%xmm0 + vextracti32x4 \$0x2,%zmm9,%xmm15 + and \$0xf,$length + je .L_final_block_${rndsuffix} + jmp .L_steal_cipher_${rndsuffix} + + # Setup for and then generation of 16 tweaks. 
+ .L_start_by16_${rndsuffix}: + vbroadcasti32x4 ($TW),%zmm0 + vbroadcasti32x4 shufb_15_7(%rip),%zmm8 + mov \$0xaa,$tmp1 + kmovq $tmp1,%k2 + vpshufb %zmm8,%zmm0,%zmm1 + + # Tweaks 0-3 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 + vpclmulqdq \$0x0,$ZPOLY,%zmm2,%zmm3 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + + # Tweaks 4-7 + vpsllvq const_dq7654(%rip),%zmm0,%zmm5 + vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 + vpclmulqdq \$0x0,$ZPOLY,%zmm6,%zmm7 + vpxorq %zmm6,%zmm5,%zmm5{%k2} + vpxord %zmm5,%zmm7,%zmm10 + + # Tweaks 8-11 + vpsrldq \$0xf,%zmm9,%zmm13 + vpclmulqdq \$0x0,$ZPOLY,%zmm13,%zmm14 + vpslldq \$0x1,%zmm9,%zmm11 + vpxord %zmm14,%zmm11,%zmm11 + + # Tweaks 12-15 + vpsrldq \$0xf,%zmm10,%zmm15 + vpclmulqdq \$0x0,$ZPOLY,%zmm15,%zmm16 + vpslldq \$0x1,%zmm10,%zmm12 + vpxord %zmm16,%zmm12,%zmm12 + + .L_main_loop_run_16_${rndsuffix}: + vmovdqu8 ($input),%zmm1 + vmovdqu8 0x40($input),%zmm2 + vmovdqu8 0x80($input),%zmm3 + vmovdqu8 0xc0($input),%zmm4 + add \$0x100,$input +___ + } + + decrypt_by_16("%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm9", + "%zmm10", "%zmm11", "%zmm12", "%zmm0", 0); + + { + $code .= <<___; + vmovdqu8 %zmm1,($output) + vmovdqu8 %zmm2,0x40($output) + vmovdqu8 %zmm3,0x80($output) + vmovdqu8 %zmm4,0xc0($output) + add \$0x100,$output + sub \$0x100,$decLength + cmp \$0x100,$decLength + jae .L_main_loop_run_16_${rndsuffix} + cmp \$0x80,$decLength + jae .L_main_loop_run_8_${rndsuffix} + jmp .L_do_n_blocks_${rndsuffix} + + .L_start_by8_${rndsuffix}: + vbroadcasti32x4 ($TW),%zmm0 + vbroadcasti32x4 shufb_15_7(%rip),%zmm8 + mov \$0xaa,$tmp1 + kmovq $tmp1,%k2 + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 + vpclmulqdq \$0x0,%zmm25,%zmm2,%zmm3 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + vpsllvq const_dq7654(%rip),%zmm0,%zmm5 + vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 + vpclmulqdq \$0x0,%zmm25,%zmm6,%zmm7 + vpxorq %zmm6,%zmm5,%zmm5{%k2} + vpxord %zmm5,%zmm7,%zmm10 + + .L_main_loop_run_8_${rndsuffix}: + vmovdqu8 ($input),%zmm1 + vmovdqu8 0x40($input),%zmm2 + add \$0x80,$input +___ + } + + decrypt_by_eight("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 0); + + { + $code .= <<___; + vmovdqu8 %zmm1,($output) + vmovdqu8 %zmm2,0x40($output) + add \$0x80,$output + sub \$0x80,$decLength + cmp \$0x80,$decLength + jae .L_main_loop_run_8_${rndsuffix} + vextracti32x4 \$0x0,%zmm9,%xmm0 + vextracti32x4 \$0x1,%zmm9,%xmm15 + jmp .L_do_n_blocks_${rndsuffix} + + .L_steal_cipher_with_tweak_${rndsuffix}: + # %xmm0 holds tweak, %xmm15 holds tweak' + vmovdqa shufb_15_7(%rip),%xmm11 + vpshufb %xmm11,%xmm0,%xmm12 + vpsllq \$0x1,%xmm0,%xmm13 + vpsrlq \$0x7,%xmm12,%xmm14 + vpclmulqdq \$0x0,%xmm25,%xmm14,%xmm15 # just the first lane of ZPOLY + vpxord %xmm13,%xmm15,%xmm15 + + .L_steal_cipher_${rndsuffix}: + # 1. Decrypt the final complete block with tweak', result is held in xmm8. + vmovdqu ($input), %xmm8 + vpxor %xmm15,%xmm8,%xmm8 + vpxor ($key1),%xmm8,%xmm8 + vaesdec 0x10($key1),%xmm8,%xmm8 + vaesdec 0x20($key1),%xmm8,%xmm8 + vaesdec 0x30($key1),%xmm8,%xmm8 + vaesdec 0x40($key1),%xmm8,%xmm8 + vaesdec 0x50($key1),%xmm8,%xmm8 + vaesdec 0x60($key1),%xmm8,%xmm8 + vaesdec 0x70($key1),%xmm8,%xmm8 + vaesdec 0x80($key1),%xmm8,%xmm8 + vaesdec 0x90($key1),%xmm8,%xmm8 + vaesdec 0xa0($key1),%xmm8,%xmm8 + vaesdec 0xb0($key1),%xmm8,%xmm8 + vaesdec 0xc0($key1),%xmm8,%xmm8 + vaesdec 0xd0($key1),%xmm8,%xmm8 + vaesdeclast 0xe0($key1),%xmm8,%xmm8 + vpxor %xmm15,%xmm8,%xmm8 + + # 2. Take the n (s.t. 
n < 16) leftover bytes from the cipher text and + # replace the front n bytes of the decrypted block from step 1, held in + # xmm9. + mov \$1,%r11 + mov $key1,$tmp1 + mov $length,$key1 # shl's shift op has to be in %cl. + shlq %cl,%r11 + sub \$1,%r11 + kmovq %r11,%k1 + vmovdqu8 0x10($input),%xmm9{%k1}{z} + vmovdqu8 %xmm8,%xmm10{%k1}{z} # save front n bytes to append later + vpblendmb %xmm9,%xmm8,%xmm9{%k1} + + # 3. Run decrypt on that block again, with tweak. + mov $tmp1,$key1 # put the pointer to the keys back into %rcx + vpxor %xmm0,%xmm9,%xmm9 + vpxor ($key1),%xmm9,%xmm9 + vaesdec 0x10($key1),%xmm9,%xmm9 + vaesdec 0x20($key1),%xmm9,%xmm9 + vaesdec 0x30($key1),%xmm9,%xmm9 + vaesdec 0x40($key1),%xmm9,%xmm9 + vaesdec 0x50($key1),%xmm9,%xmm9 + vaesdec 0x60($key1),%xmm9,%xmm9 + vaesdec 0x70($key1),%xmm9,%xmm9 + vaesdec 0x80($key1),%xmm9,%xmm9 + vaesdec 0x90($key1),%xmm9,%xmm9 + vaesdec 0xa0($key1),%xmm9,%xmm9 + vaesdec 0xb0($key1),%xmm9,%xmm9 + vaesdec 0xc0($key1),%xmm9,%xmm9 + vaesdec 0xd0($key1),%xmm9,%xmm9 + vaesdeclast 0xe0($key1),%xmm9,%xmm9 + vpxor %xmm0,%xmm9,%xmm9 + + # 4. Final output is that block, + the original front n bytes from the last + # complete block. + vmovdqu %xmm9,($output) + vmovdqu8 %xmm10,0x10($output){%k1} + jmp .L_ret_${rndsuffix} + + .L_final_block_is_only_block_${rndsuffix}: + vmovdqa ($TW),%xmm0 + and \$0xf,$length + jne .L_steal_cipher_with_tweak_${rndsuffix} + + .L_final_block_${rndsuffix}: + vmovdqa ($input), %xmm8 + vpxor %xmm0,%xmm8,%xmm8 + vpxor ($key1),%xmm8,%xmm8 + vaesdec 0x10($key1),%xmm8,%xmm8 + vaesdec 0x20($key1),%xmm8,%xmm8 + vaesdec 0x30($key1),%xmm8,%xmm8 + vaesdec 0x40($key1),%xmm8,%xmm8 + vaesdec 0x50($key1),%xmm8,%xmm8 + vaesdec 0x60($key1),%xmm8,%xmm8 + vaesdec 0x70($key1),%xmm8,%xmm8 + vaesdec 0x80($key1),%xmm8,%xmm8 + vaesdec 0x90($key1),%xmm8,%xmm8 + vaesdec 0xa0($key1),%xmm8,%xmm8 + vaesdec 0xb0($key1),%xmm8,%xmm8 + vaesdec 0xc0($key1),%xmm8,%xmm8 + vaesdec 0xd0($key1),%xmm8,%xmm8 + vaesdeclast 0xe0($key1),%xmm8,%xmm8 + vpxor %xmm0,%xmm8,%xmm8 + vmovdqa %xmm8,($output) + + .L_ret_${rndsuffix}: + mov $GP_STORAGE($TW),%rbx + xor $tmp1,$tmp1 + mov $tmp1,$GP_STORAGE($TW) + vpxorq %zmm0,%zmm0,%zmm0 +___ + } + + if ($win64) { + $code .= <<___; + mov $GP_STORAGE + 8*1($TW),%rdi + mov $tmp1,$GP_STORAGE + 8*1($TW) + mov $GP_STORAGE + 8*2($TW),%rsi + mov $tmp1,$GP_STORAGE + 8*2($TW) - { - $code .= <<___; - vmovdqu %xmm1,($output) - vmovdqu %xmm2,0x10($output) - vmovdqu %xmm3,0x20($output) - vmovdqu %xmm4,0x30($output) - add \$0x50,$output - vmovdqa %xmm14,%xmm0 - vmovdqa %xmm5,%xmm8 - jmp .L_steal_cipher_${rndsuffix} -___ - } + vmovdqa $XMM_STORAGE + 16 * 0($TW), %xmm6 + vmovdqa $XMM_STORAGE + 16 * 1($TW), %xmm7 + vmovdqa $XMM_STORAGE + 16 * 2($TW), %xmm8 + vmovdqa $XMM_STORAGE + 16 * 3($TW), %xmm9 - $code .= "\n.L_done_5_${rndsuffix}:\n"; - decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 5, 1); + # Zero the 64 bytes we just restored to the xmm registers. 
+ vmovdqa64 %zmm0,$XMM_STORAGE($TW) - { - $code .= <<___; - vmovdqu %xmm1,($output) - vmovdqu %xmm2,0x10($output) - vmovdqu %xmm3,0x20($output) - vmovdqu %xmm4,0x30($output) - add \$0x50,$output - vmovdqa %xmm5,%xmm8 - jmp .L_done_${rndsuffix} -___ - } + vmovdqa $XMM_STORAGE + 16 * 4($TW), %xmm10 + vmovdqa $XMM_STORAGE + 16 * 5($TW), %xmm11 + vmovdqa $XMM_STORAGE + 16 * 6($TW), %xmm12 + vmovdqa $XMM_STORAGE + 16 * 7($TW), %xmm13 - $code .= "\n.L_num_blocks_is_4_${rndsuffix}:\n"; + # And again. + vmovdqa64 %zmm0,$XMM_STORAGE + 16 * 4($TW) - initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", 4); + vmovdqa $XMM_STORAGE + 16 * 8($TW), %xmm14 + vmovdqa $XMM_STORAGE + 16 * 9($TW), %xmm15 - { - $code .= <<___; - add \$0x40,$input - and \$0xf,$length - je .L_done_4_${rndsuffix} - - .L_steal_cipher_4_${rndsuffix}: - xor $gf_poly_8b_temp, $gf_poly_8b_temp - shl \$1, $TWTEMPL - adc $TWTEMPH, $TWTEMPH - cmovc $gf_poly_8b, $gf_poly_8b_temp - xor $gf_poly_8b_temp, $TWTEMPL - mov $TWTEMPL,0x10($TW) - mov $TWTEMPH,0x18($TW) - vmovdqa64 %xmm12,%xmm13 - vmovdqa 0x10($TW),%xmm12 + # Last round is only 32 bytes (256-bits), so we use `%ymm` as the + # source operand. + vmovdqa %ymm0,$XMM_STORAGE + 16 * 8($TW) ___ - } + } - decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 4, 1); + { + $code .= <<___; + mov %rbp,$TW + pop %rbp + vzeroupper + ret - { - $code .= <<___; - vmovdqu %xmm1,($output) - vmovdqu %xmm2,0x10($output) - vmovdqu %xmm3,0x20($output) - add \$0x40,$output - vmovdqa %xmm13,%xmm0 - vmovdqa %xmm4,%xmm8 - jmp .L_steal_cipher_${rndsuffix} -___ - } + .L_less_than_128_bytes_${rndsuffix}: + vpbroadcastq $gf_poly_8b, $ZPOLY + cmp \$0x10,$decLength + jb .L_ret_${rndsuffix} + vbroadcasti32x4 ($TW), %zmm0 + vbroadcasti32x4 shufb_15_7(%rip), %zmm8 + movl \$0xaa, %r8d + kmovq %r8, %k2 + mov $decLength,$tmp1 + and \$0x70,$tmp1 + cmp \$0x60,$tmp1 + je .L_num_blocks_is_6_${rndsuffix} + cmp \$0x50,$tmp1 + je .L_num_blocks_is_5_${rndsuffix} + cmp \$0x40,$tmp1 + je .L_num_blocks_is_4_${rndsuffix} + cmp \$0x30,$tmp1 + je .L_num_blocks_is_3_${rndsuffix} + cmp \$0x20,$tmp1 + je .L_num_blocks_is_2_${rndsuffix} + cmp \$0x10,$tmp1 + je .L_num_blocks_is_1_${rndsuffix} + + .L_num_blocks_is_7_${rndsuffix}: + vpshufb %zmm8, %zmm0, %zmm1 + vpsllvq const_dq3210(%rip), %zmm0, %zmm4 + vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 + vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 + vpxorq %zmm2, %zmm4, %zmm4{%k2} + vpxord %zmm4, %zmm3, %zmm9 + vpsllvq const_dq7654(%rip), %zmm0, %zmm5 + vpsrlvq const_dq1234(%rip), %zmm1, %zmm6 + vpclmulqdq \$0x00, $ZPOLY, %zmm6, %zmm7 + vpxorq %zmm6, %zmm5, %zmm5{%k2} + vpxord %zmm5, %zmm7, %zmm10 + mov \$0x0000ffffffffffff, $tmp1 + kmovq $tmp1, %k1 + vmovdqu8 16*0($input), %zmm1 + vmovdqu8 16*4($input), %zmm2{%k1} + + add \$0x70,$input +___ + } + + decrypt_by_eight("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1); + + { + $code .= <<___; + vmovdqu8 %zmm1, 16*0($output) + vmovdqu8 %zmm2, 16*4($output){%k1} + add \$0x70,$output - $code .= "\n.L_done_4_${rndsuffix}:\n"; - decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 4, 1); + vextracti32x4 \$0x3, %zmm10, %xmm0 - { - $code .= <<___; - vmovdqu %xmm1,($output) - vmovdqu %xmm2,0x10($output) - vmovdqu %xmm3,0x20($output) - 
add \$0x40,$output - vmovdqa %xmm4,%xmm8 - jmp .L_done_${rndsuffix} -___ - } + and \$0xf,$length + je .L_final_block_${rndsuffix} - $code .= "\n.L_num_blocks_is_3_${rndsuffix}:\n"; + vpsrldq \$0xf,%zmm9,%zmm13 + vpclmulqdq \$0x0,$ZPOLY,%zmm13,%zmm14 + vpslldq \$0x1,%zmm9,%zmm11 + vpxord %zmm14,%zmm11,%zmm11 + vextracti32x4 \$0x0, %zmm11, %xmm15 + jmp .L_steal_cipher_${rndsuffix} +___ + } - initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", 3); + { + $code .= <<___; + .L_num_blocks_is_6_${rndsuffix}: + vpshufb %zmm8, %zmm0, %zmm1 + vpsllvq const_dq3210(%rip), %zmm0, %zmm4 + vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 + vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 + vpxorq %zmm2, %zmm4, %zmm4{%k2} + vpxord %zmm4, %zmm3, %zmm9 + vpsllvq const_dq7654(%rip), %zmm0, %zmm5 + vpsrlvq const_dq1234(%rip), %zmm1, %zmm6 + vpclmulqdq \$0x00, $ZPOLY, %zmm6, %zmm7 + vpxorq %zmm6, %zmm5, %zmm5{%k2} + vpxord %zmm5, %zmm7, %zmm10 + vmovdqu8 16*0($input), %zmm1 + vmovdqu8 16*4($input), %ymm2 + add \$96, $input +___ + } + + decrypt_by_eight("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1); + + { + $code .= <<___; + vmovdqu8 %zmm1, 16*0($output) + vmovdqu8 %ymm2, 16*4($output) + add \$96, $output - { - $code .= <<___; - add \$0x30,$input - and \$0xf,$length - je .L_done_3_${rndsuffix} - - .L_steal_cipher_3_${rndsuffix}: - xor $gf_poly_8b_temp, $gf_poly_8b_temp - shl \$1, $TWTEMPL - adc $TWTEMPH, $TWTEMPH - cmovc $gf_poly_8b, $gf_poly_8b_temp - xor $gf_poly_8b_temp, $TWTEMPL - mov $TWTEMPL,0x10($TW) - mov $TWTEMPH,0x18($TW) - vmovdqa64 %xmm11,%xmm12 - vmovdqa 0x10($TW),%xmm11 + vextracti32x4 \$0x2, %zmm10, %xmm0 + vextracti32x4 \$0x3, %zmm10, %xmm15 + and \$0xf,$length + je .L_final_block_${rndsuffix} + jmp .L_steal_cipher_${rndsuffix} ___ - } + } - decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1); + { + $code .= <<___; + .L_num_blocks_is_5_${rndsuffix}: + vpshufb %zmm8, %zmm0, %zmm1 + vpsllvq const_dq3210(%rip), %zmm0, %zmm4 + vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 + vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 + vpxorq %zmm2, %zmm4, %zmm4{%k2} + vpxord %zmm4, %zmm3, %zmm9 + vpsllvq const_dq7654(%rip), %zmm0, %zmm5 + vpsrlvq const_dq1234(%rip), %zmm1, %zmm6 + vpclmulqdq \$0x00, $ZPOLY, %zmm6, %zmm7 + vpxorq %zmm6, %zmm5, %zmm5{%k2} + vpxord %zmm5, %zmm7, %zmm10 + vmovdqu8 16*0($input), %zmm1 + vmovdqu8 16*4($input), %xmm2 + add \$80, $input +___ + } + + decrypt_by_eight("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1); + + { + $code .= <<___; + vmovdqu8 %zmm1, 16*0($output) + vmovdqu8 %xmm2, 16*4($output) + add \$80, $output - { - $code .= <<___; - vmovdqu %xmm1,($output) - vmovdqu %xmm2,0x10($output) - add \$0x30,$output - vmovdqa %xmm12,%xmm0 - vmovdqa %xmm3,%xmm8 - jmp .L_steal_cipher_${rndsuffix} + vmovdqa %xmm2, %xmm8 + vextracti32x4 \$0x1, %zmm10, %xmm0 + vextracti32x4 \$0x2, %zmm10, %xmm15 + and \$0xf,$length + je .L_final_block_${rndsuffix} + jmp .L_steal_cipher_${rndsuffix} ___ - } - $code .= "\n.L_done_3_${rndsuffix}:\n"; - decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1); + } - { - $code .= <<___; - vmovdqu %xmm1,($output) - vmovdqu %xmm2,0x10($output) - add \$0x30,$output - vmovdqa %xmm3,%xmm8 - jmp .L_done_${rndsuffix} + { + $code .= <<___; + 
.L_num_blocks_is_4_${rndsuffix}: + vpshufb %zmm8, %zmm0, %zmm1 + vpsllvq const_dq3210(%rip), %zmm0, %zmm4 + vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 + vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 + vpxorq %zmm2, %zmm4, %zmm4{%k2} + vpxord %zmm4, %zmm3, %zmm9 + vpsllvq const_dq7654(%rip), %zmm0, %zmm5 + vpsrlvq const_dq1234(%rip), %zmm1, %zmm6 + vpclmulqdq \$0x00, $ZPOLY, %zmm6, %zmm7 + vpxorq %zmm6, %zmm5, %zmm5{%k2} + vpxord %zmm5, %zmm7, %zmm10 + vmovdqu8 16*0($input), %zmm1 + add \$64, $input +___ + } + + decrypt_by_four("%zmm1", "%zmm9", "%zmm0"); + + { + $code .= <<___; + vmovdqu8 %zmm1, 16*0($output) + add \$64, $output + vmovdqa %xmm10, %xmm0 + vextracti32x4 \$0x1, %zmm10, %xmm15 + and \$0xf,$length + je .L_final_block_${rndsuffix} + jmp .L_steal_cipher_${rndsuffix} ___ - } + } - $code .= "\n.L_num_blocks_is_2_${rndsuffix}:\n"; + { + $code .= <<___; + .L_num_blocks_is_3_${rndsuffix}: + vpshufb %zmm8, %zmm0, %zmm1 + vpsllvq const_dq3210(%rip), %zmm0, %zmm4 + vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 + vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 + vpxorq %zmm2, %zmm4, %zmm4{%k2} + vpxord %zmm4, %zmm3, %zmm9 + vpsllvq const_dq7654(%rip), %zmm0, %zmm5 + vpsrlvq const_dq1234(%rip), %zmm1, %zmm6 + vpclmulqdq \$0x00, $ZPOLY, %zmm6, %zmm7 + vpxorq %zmm6, %zmm5, %zmm5{%k2} + vpxord %zmm5, %zmm7, %zmm10 + mov \$0x0000ffffffffffff, $tmp1 + kmovq $tmp1, %k1 + vmovdqu8 16*0($input), %zmm1{%k1} + add \$48, $input +___ + } + + decrypt_by_four("%zmm1", "%zmm9", "%zmm0"); + + { + $code .= <<___; + vmovdqu8 %zmm1, 16*0($output){%k1} + add \$48, $output + vextracti32x4 \$3, %zmm9, %xmm0 + vextracti32x4 \$0, %zmm10, %xmm15 + and \$0xf,$length + je .L_final_block_${rndsuffix} + jmp .L_steal_cipher_${rndsuffix} +___ + } - initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", 2); + { + $code .= <<___; + .L_num_blocks_is_2_${rndsuffix}: + vpshufb %zmm8, %zmm0, %zmm1 + vpsllvq const_dq3210(%rip), %zmm0, %zmm4 + vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 + vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 + vpxorq %zmm2, %zmm4, %zmm4{%k2} + vpxord %zmm4, %zmm3, %zmm9 - { - $code .= <<___; - add \$0x20,$input - and \$0xf,$length - je .L_done_2_${rndsuffix} - - .L_steal_cipher_2_${rndsuffix}: - xor $gf_poly_8b_temp, $gf_poly_8b_temp - shl \$1, $TWTEMPL - adc $TWTEMPH, $TWTEMPH - cmovc $gf_poly_8b, $gf_poly_8b_temp - xor $gf_poly_8b_temp, $TWTEMPL - mov $TWTEMPL,0x10($TW) - mov $TWTEMPH,0x18($TW) - vmovdqa64 %xmm10,%xmm11 - vmovdqa 0x10($TW),%xmm10 + vmovdqu8 16*0($input), %ymm1 + add \$32, $input ___ - } + } - decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1); + decrypt_by_four("%ymm1", "%ymm9", "%ymm0"); - { - $code .= <<___; - vmovdqu %xmm1,($output) - add \$0x20,$output - vmovdqa %xmm11,%xmm0 - vmovdqa %xmm2,%xmm8 - jmp .L_steal_cipher_${rndsuffix} + { + $code .= <<___; + vmovdqu8 %ymm1, 16*0($output) + add \$32, $output + + vextracti32x4 \$2, %zmm9, %xmm0 + vextracti32x4 \$3, %zmm9, %xmm15 + and \$0xf,$length + je .L_final_block_${rndsuffix} + jmp .L_steal_cipher_${rndsuffix} ___ - } + } - $code .= "\n.L_done_2_${rndsuffix}:\n"; - decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1); + { + $code .= <<___; + .L_num_blocks_is_1_${rndsuffix}: + vpshufb %zmm8, %zmm0, %zmm1 + vpsllvq 
const_dq3210(%rip), %zmm0, %zmm4 + vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 + vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 + vpxorq %zmm2, %zmm4, %zmm4{%k2} + vpxord %zmm4, %zmm3, %zmm9 - { - $code .= <<___; - vmovdqu %xmm1,($output) - add \$0x20,$output - vmovdqa %xmm2,%xmm8 - jmp .L_done_${rndsuffix} + vmovdqu8 16*0($input), %xmm1 + add \$16, $input ___ - } - - $code .= "\n.L_num_blocks_is_1_${rndsuffix}:\n"; + } - initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", 1); + decrypt_by_four("%ymm1", "%ymm9", "%ymm0"); - { - $code .= <<___; - add \$0x10,$input - and \$0xf,$length - je .L_done_1_${rndsuffix} - - .L_steal_cipher_1_${rndsuffix}: - xor $gf_poly_8b_temp, $gf_poly_8b_temp - shl \$1, $TWTEMPL - adc $TWTEMPH, $TWTEMPH - cmovc $gf_poly_8b, $gf_poly_8b_temp - xor $gf_poly_8b_temp, $TWTEMPL - mov $TWTEMPL,0x10($TW) - mov $TWTEMPH,0x18($TW) - vmovdqa64 %xmm9,%xmm10 - vmovdqa 0x10($TW),%xmm9 -___ - } - decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1); + { + $code .= <<___; + vmovdqu8 %xmm1, 16*0($output) + add \$16, $output - { - $code .= <<___; - add \$0x10,$output - vmovdqa %xmm10,%xmm0 - vmovdqa %xmm1,%xmm8 - jmp .L_steal_cipher_${rndsuffix} + vmovdqa %xmm1, %xmm8 + vextracti32x4 \$1, %zmm9, %xmm0 + vextracti32x4 \$2, %zmm9, %xmm15 + and \$0xf,$length + je .L_final_block_${rndsuffix} + jmp .L_steal_cipher_${rndsuffix} + .cfi_endproc ___ + } } - $code .= "\n.L_done_1_${rndsuffix}:\n"; - decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", - "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", - "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1); - { - $code .= <<___; - add \$0x10,$output - vmovdqa %xmm1,%xmm8 - jmp .L_done_${rndsuffix} - .cfi_endproc -___ - } + enc(); + dec(); $code .= <<___; .section .rodata @@ -3113,6 +2193,7 @@ .text #endif ___ + } else { $code .= <<___; .text diff --git a/crypto/fipsmodule/aes/internal.h b/crypto/fipsmodule/aes/internal.h index 97d260f9bf..8c808c32bf 100644 --- a/crypto/fipsmodule/aes/internal.h +++ b/crypto/fipsmodule/aes/internal.h @@ -161,7 +161,7 @@ OPENSSL_EXPORT int aes_hw_xts_cipher(const uint8_t *in, uint8_t *out, size_t len const AES_KEY *key1, const AES_KEY *key2, const uint8_t iv[16], int enc); -#if defined(OPENSSL_X86_64) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) && !defined(OPENSSL_WINDOWS) +#if defined(OPENSSL_X86_64) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) #define AES_XTS_X86_64_AVX512 void aes_hw_xts_encrypt_avx512(const uint8_t *in, uint8_t *out, size_t length, const AES_KEY *key1, const AES_KEY *key2, diff --git a/generated-src/linux-x86_64/crypto/fipsmodule/aesni-xts-avx512.S b/generated-src/linux-x86_64/crypto/fipsmodule/aesni-xts-avx512.S index e827997a41..84cc7b90c7 100644 --- a/generated-src/linux-x86_64/crypto/fipsmodule/aesni-xts-avx512.S +++ b/generated-src/linux-x86_64/crypto/fipsmodule/aesni-xts-avx512.S @@ -16,102 +16,26 @@ aes_hw_xts_encrypt_avx512: .byte 243,15,30,250 pushq %rbp movq %rsp,%rbp - subq $376,%rsp + subq $136,%rsp andq $0xffffffffffffffc0,%rsp - movq %rbx,368(%rsp) + movq %rbx,128(%rsp) movq $0x87,%r10 vmovdqu (%r9),%xmm1 - vpxor %xmm4,%xmm4,%xmm4 - vmovdqu (%r8),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - - vmovdqu (%rcx),%xmm2 - vmovdqa %xmm2,128(%rsp) - - vmovdqu 16(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 16(%rcx),%xmm2 - vmovdqa %xmm2,144(%rsp) - - 
vmovdqu 32(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 32(%rcx),%xmm2 - vmovdqa %xmm2,160(%rsp) - - vmovdqu 48(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 48(%rcx),%xmm2 - vmovdqa %xmm2,176(%rsp) - - vmovdqu 64(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 64(%rcx),%xmm2 - vmovdqa %xmm2,192(%rsp) - - vmovdqu 80(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 80(%rcx),%xmm2 - vmovdqa %xmm2,208(%rsp) - - vmovdqu 96(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 96(%rcx),%xmm2 - vmovdqa %xmm2,224(%rsp) - - vmovdqu 112(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 112(%rcx),%xmm2 - vmovdqa %xmm2,240(%rsp) - - vmovdqu 128(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 128(%rcx),%xmm2 - vmovdqa %xmm2,256(%rsp) - - vmovdqu 144(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 144(%rcx),%xmm2 - vmovdqa %xmm2,272(%rsp) - - vmovdqu 160(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 160(%rcx),%xmm2 - vmovdqa %xmm2,288(%rsp) - - vmovdqu 176(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 176(%rcx),%xmm2 - vmovdqa %xmm2,304(%rsp) - - vmovdqu 192(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 192(%rcx),%xmm2 - vmovdqa %xmm2,320(%rsp) - - vmovdqu 208(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 208(%rcx),%xmm2 - vmovdqa %xmm2,336(%rsp) - - vmovdqu 224(%r8),%xmm0 -.byte 98,242,117,8,221,200 - - vmovdqu 224(%rcx),%xmm2 - vmovdqa %xmm2,352(%rsp) - + vpxor (%r8),%xmm1,%xmm1 + vaesenc 16(%r8),%xmm1,%xmm1 + vaesenc 32(%r8),%xmm1,%xmm1 + vaesenc 48(%r8),%xmm1,%xmm1 + vaesenc 64(%r8),%xmm1,%xmm1 + vaesenc 80(%r8),%xmm1,%xmm1 + vaesenc 96(%r8),%xmm1,%xmm1 + vaesenc 112(%r8),%xmm1,%xmm1 + vaesenc 128(%r8),%xmm1,%xmm1 + vaesenc 144(%r8),%xmm1,%xmm1 + vaesenc 160(%r8),%xmm1,%xmm1 + vaesenc 176(%r8),%xmm1,%xmm1 + vaesenc 192(%r8),%xmm1,%xmm1 + vaesenc 208(%r8),%xmm1,%xmm1 + vaesenclast 224(%r8),%xmm1,%xmm1 vmovdqa %xmm1,(%rsp) cmpq $0x80,%rdx @@ -144,95 +68,83 @@ aes_hw_xts_encrypt_avx512: jmp .L_steal_cipher_hEgxyDlCngwrfFe .L_remaining_num_blocks_is_7_hEgxyDlCngwrfFe: - movq $0xffffffffffffffff,%r8 - shrq $0x10,%r8 + movq $0x0000ffffffffffff,%r8 kmovq %r8,%k1 vmovdqu8 (%rdi),%zmm1 vmovdqu8 64(%rdi),%zmm2{%k1} addq $0x70,%rdi - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 - vbroadcasti32x4 144(%rsp),%zmm0 + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 160(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 176(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 192(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 208(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 
- vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 160(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 176(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 192(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 208(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 352(%rsp),%zmm0 + vbroadcasti32x4 224(%rcx),%zmm0 .byte 98,242,117,72,221,200 .byte 98,242,109,72,221,208 - - vpxorq %zmm9,%zmm1,%zmm1 vpxorq %zmm10,%zmm2,%zmm2 - - - vmovdqa32 %zmm15,%zmm9 - vmovdqa32 %zmm16,%zmm10 vmovdqu8 %zmm1,(%rsi) vmovdqu8 %zmm2,64(%rsi){%k1} addq $0x70,%rsi @@ -246,89 +158,78 @@ aes_hw_xts_encrypt_avx512: vmovdqu8 (%rdi),%zmm1 vmovdqu8 64(%rdi),%ymm2 addq $0x60,%rdi - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 - vbroadcasti32x4 144(%rsp),%zmm0 + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 160(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 176(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 192(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 208(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 160(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 176(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 192(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 208(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 352(%rsp),%zmm0 + vbroadcasti32x4 224(%rcx),%zmm0 .byte 98,242,117,72,221,200 .byte 98,242,109,72,221,208 - - vpxorq %zmm9,%zmm1,%zmm1 vpxorq %zmm10,%zmm2,%zmm2 - - - vmovdqa32 %zmm15,%zmm9 - vmovdqa32 %zmm16,%zmm10 vmovdqu8 %zmm1,(%rsi) vmovdqu8 %ymm2,64(%rsi) addq $0x60,%rsi @@ -342,89 +243,78 @@ aes_hw_xts_encrypt_avx512: vmovdqu8 (%rdi),%zmm1 vmovdqu 64(%rdi),%xmm2 addq $0x50,%rdi - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 - vbroadcasti32x4 144(%rsp),%zmm0 + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - 
vbroadcasti32x4 160(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 176(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 192(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 208(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 160(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 176(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 192(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 208(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 352(%rsp),%zmm0 + vbroadcasti32x4 224(%rcx),%zmm0 .byte 98,242,117,72,221,200 .byte 98,242,109,72,221,208 - - vpxorq %zmm9,%zmm1,%zmm1 vpxorq %zmm10,%zmm2,%zmm2 - - - vmovdqa32 %zmm15,%zmm9 - vmovdqa32 %zmm16,%zmm10 vmovdqu8 %zmm1,(%rsi) vmovdqu %xmm2,64(%rsi) addq $0x50,%rsi @@ -437,236 +327,125 @@ aes_hw_xts_encrypt_avx512: .L_remaining_num_blocks_is_4_hEgxyDlCngwrfFe: vmovdqu8 (%rdi),%zmm1 addq $0x40,%rdi - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 - vbroadcasti32x4 144(%rsp),%zmm0 + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - - vbroadcasti32x4 160(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - - vbroadcasti32x4 176(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - vbroadcasti32x4 192(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - - vbroadcasti32x4 208(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - - vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - - vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 160(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 176(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - - 
vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 192(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 208(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - - vbroadcasti32x4 352(%rsp),%zmm0 + vbroadcasti32x4 224(%rcx),%zmm0 .byte 98,242,117,72,221,200 -.byte 98,242,109,72,221,208 - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vmovdqa32 %zmm15,%zmm9 - vmovdqa32 %zmm16,%zmm10 vmovdqu8 %zmm1,(%rsi) addq $0x40,%rsi vextracti32x4 $0x3,%zmm1,%xmm8 - vextracti32x4 $0x0,%zmm10,%xmm0 + vmovdqa64 %xmm10,%xmm0 andq $0xf,%rdx je .L_ret_hEgxyDlCngwrfFe jmp .L_steal_cipher_hEgxyDlCngwrfFe .L_remaining_num_blocks_is_3_hEgxyDlCngwrfFe: - vextracti32x4 $0x1,%zmm9,%xmm10 - vextracti32x4 $0x2,%zmm9,%xmm11 - vmovdqu (%rdi),%xmm1 - vmovdqu 16(%rdi),%xmm2 - vmovdqu 32(%rdi),%xmm3 + movq $-1,%r8 + shrq $0x10,%r8 + kmovq %r8,%k1 + vmovdqu8 (%rdi),%zmm1{%k1} addq $0x30,%rdi - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,221,200 -.byte 98,242,109,8,221,208 -.byte 98,242,101,8,221,216 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vbroadcasti32x4 16(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 32(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 48(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 64(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 80(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 96(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 112(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 128(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 144(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 160(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + 
vbroadcasti32x4 176(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 192(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 208(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 224(%rcx),%zmm0 +.byte 98,242,117,72,221,200 + vpxorq %zmm9,%zmm1,%zmm1 + vmovdqu8 %zmm1,(%rsi){%k1} addq $0x30,%rsi - vmovdqa %xmm3,%xmm8 + vextracti32x4 $0x2,%zmm1,%xmm8 vextracti32x4 $0x3,%zmm9,%xmm0 andq $0xf,%rdx je .L_ret_hEgxyDlCngwrfFe jmp .L_steal_cipher_hEgxyDlCngwrfFe .L_remaining_num_blocks_is_2_hEgxyDlCngwrfFe: - vextracti32x4 $0x1,%zmm9,%xmm10 - vmovdqu (%rdi),%xmm1 - vmovdqu 16(%rdi),%xmm2 + vmovdqu8 (%rdi),%ymm1 addq $0x20,%rdi - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,221,200 -.byte 98,242,109,8,221,208 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) + vbroadcasti32x4 (%rcx),%ymm0 + vpternlogq $0x96,%ymm0,%ymm9,%ymm1 + vbroadcasti32x4 16(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 32(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 48(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 64(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 80(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 96(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 112(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 128(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 144(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 160(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 176(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 192(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 208(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 224(%rcx),%ymm0 +.byte 98,242,117,40,221,200 + vpxorq %ymm9,%ymm1,%ymm1 + vmovdqu %ymm1,(%rsi) addq $0x20,%rsi - vmovdqa %xmm2,%xmm8 + vextracti32x4 $0x1,%zmm1,%xmm8 vextracti32x4 $0x2,%zmm9,%xmm0 andq $0xf,%rdx je .L_ret_hEgxyDlCngwrfFe @@ -675,36 +454,21 @@ aes_hw_xts_encrypt_avx512: vmovdqu (%rdi),%xmm1 addq $0x10,%rdi vpxor %xmm9,%xmm1,%xmm1 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - 
vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,221,200 + vpxor (%rcx),%xmm1,%xmm1 + vaesenc 16(%rcx),%xmm1,%xmm1 + vaesenc 32(%rcx),%xmm1,%xmm1 + vaesenc 48(%rcx),%xmm1,%xmm1 + vaesenc 64(%rcx),%xmm1,%xmm1 + vaesenc 80(%rcx),%xmm1,%xmm1 + vaesenc 96(%rcx),%xmm1,%xmm1 + vaesenc 112(%rcx),%xmm1,%xmm1 + vaesenc 128(%rcx),%xmm1,%xmm1 + vaesenc 144(%rcx),%xmm1,%xmm1 + vaesenc 160(%rcx),%xmm1,%xmm1 + vaesenc 176(%rcx),%xmm1,%xmm1 + vaesenc 192(%rcx),%xmm1,%xmm1 + vaesenc 208(%rcx),%xmm1,%xmm1 + vaesenclast 224(%rcx),%xmm1,%xmm1 vpxor %xmm9,%xmm1,%xmm1 vmovdqu %xmm1,(%rsi) addq $0x10,%rsi @@ -714,26 +478,36 @@ aes_hw_xts_encrypt_avx512: je .L_ret_hEgxyDlCngwrfFe jmp .L_steal_cipher_hEgxyDlCngwrfFe + + .L_start_by16_hEgxyDlCngwrfFe: vbroadcasti32x4 (%rsp),%zmm0 vbroadcasti32x4 shufb_15_7(%rip),%zmm8 movq $0xaa,%r8 kmovq %r8,%k2 vpshufb %zmm8,%zmm0,%zmm1 + + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 .byte 98,147,109,72,68,217,0 vpxorq %zmm2,%zmm4,%zmm4{%k2} vpxord %zmm4,%zmm3,%zmm9 + + vpsllvq const_dq7654(%rip),%zmm0,%zmm5 vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 .byte 98,147,77,72,68,249,0 vpxorq %zmm6,%zmm5,%zmm5{%k2} vpxord %zmm5,%zmm7,%zmm10 + + vpsrldq $0xf,%zmm9,%zmm13 .byte 98,19,21,72,68,241,0 vpslldq $0x1,%zmm9,%zmm11 vpxord %zmm14,%zmm11,%zmm11 + + vpsrldq $0xf,%zmm10,%zmm15 .byte 98,131,5,72,68,193,0 vpslldq $0x1,%zmm10,%zmm12 @@ -749,7 +523,7 @@ aes_hw_xts_encrypt_avx512: vpxorq %zmm10,%zmm2,%zmm2 vpxorq %zmm11,%zmm3,%zmm3 vpxorq %zmm12,%zmm4,%zmm4 - vbroadcasti32x4 128(%rsp),%zmm0 + vbroadcasti32x4 (%rcx),%zmm0 vpxorq %zmm0,%zmm1,%zmm1 vpxorq %zmm0,%zmm2,%zmm2 vpxorq %zmm0,%zmm3,%zmm3 @@ -758,17 +532,17 @@ aes_hw_xts_encrypt_avx512: .byte 98,19,21,72,68,241,0 vpslldq $0x1,%zmm11,%zmm15 vpxord %zmm14,%zmm15,%zmm15 - vbroadcasti32x4 144(%rsp),%zmm0 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 .byte 98,242,93,72,220,224 - vbroadcasti32x4 160(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 .byte 98,242,93,72,220,224 - vbroadcasti32x4 176(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 @@ -777,17 +551,17 @@ aes_hw_xts_encrypt_avx512: .byte 98,19,21,72,68,241,0 vpslldq $0x1,%zmm12,%zmm16 vpxord %zmm14,%zmm16,%zmm16 - vbroadcasti32x4 192(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 .byte 98,242,93,72,220,224 - vbroadcasti32x4 208(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 .byte 98,242,93,72,220,224 - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 @@ -796,17 +570,17 @@ aes_hw_xts_encrypt_avx512: .byte 98,19,21,72,68,241,0 vpslldq $0x1,%zmm15,%zmm17 vpxord %zmm14,%zmm17,%zmm17 - 
vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 .byte 98,242,93,72,220,224 - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 .byte 98,242,93,72,220,224 - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 @@ -815,27 +589,27 @@ aes_hw_xts_encrypt_avx512: .byte 98,19,21,72,68,241,0 vpslldq $0x1,%zmm16,%zmm18 vpxord %zmm14,%zmm18,%zmm18 - vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 160(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 .byte 98,242,93,72,220,224 - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 176(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 .byte 98,242,93,72,220,224 - vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 192(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 .byte 98,242,93,72,220,224 - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 208(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 .byte 98,242,93,72,220,224 - vbroadcasti32x4 352(%rsp),%zmm0 + vbroadcasti32x4 224(%rcx),%zmm0 .byte 98,242,117,72,221,200 .byte 98,242,109,72,221,208 .byte 98,242,101,72,221,216 @@ -856,9 +630,9 @@ aes_hw_xts_encrypt_avx512: addq $0x100,%rsi subq $0x100,%rdx cmpq $0x100,%rdx - jge .L_main_loop_run_16_hEgxyDlCngwrfFe + jae .L_main_loop_run_16_hEgxyDlCngwrfFe cmpq $0x80,%rdx - jge .L_main_loop_run_8_hEgxyDlCngwrfFe + jae .L_main_loop_run_8_hEgxyDlCngwrfFe vextracti32x4 $0x3,%zmm4,%xmm0 jmp .L_do_n_blocks_hEgxyDlCngwrfFe @@ -883,29 +657,24 @@ aes_hw_xts_encrypt_avx512: vmovdqu8 (%rdi),%zmm1 vmovdqu8 64(%rdi),%zmm2 addq $0x80,%rdi - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 vpsrldq $0xf,%zmm9,%zmm13 .byte 98,19,21,72,68,241,0 vpslldq $0x1,%zmm9,%zmm15 vpxord %zmm14,%zmm15,%zmm15 - vbroadcasti32x4 144(%rsp),%zmm0 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 160(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 176(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 vpsrldq $0xf,%zmm10,%zmm13 @@ -913,65 +682,61 @@ aes_hw_xts_encrypt_avx512: vpslldq $0x1,%zmm10,%zmm16 vpxord %zmm14,%zmm16,%zmm16 - vbroadcasti32x4 192(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 208(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 
98,242,109,72,220,208 - vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 160(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 176(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 192(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 208(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 352(%rsp),%zmm0 + vbroadcasti32x4 224(%rcx),%zmm0 .byte 98,242,117,72,221,200 .byte 98,242,109,72,221,208 - - vpxorq %zmm9,%zmm1,%zmm1 vpxorq %zmm10,%zmm2,%zmm2 - - vmovdqa32 %zmm15,%zmm9 vmovdqa32 %zmm16,%zmm10 vmovdqu8 %zmm1,(%rsi) @@ -979,20 +744,10 @@ aes_hw_xts_encrypt_avx512: addq $0x80,%rsi subq $0x80,%rdx cmpq $0x80,%rdx - jge .L_main_loop_run_8_hEgxyDlCngwrfFe + jae .L_main_loop_run_8_hEgxyDlCngwrfFe vextracti32x4 $0x3,%zmm2,%xmm0 jmp .L_do_n_blocks_hEgxyDlCngwrfFe -.L_steal_cipher_next_hEgxyDlCngwrfFe: - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,(%rsp) - movq %rbx,8(%rsp) - vmovdqa (%rsp),%xmm0 - .L_steal_cipher_hEgxyDlCngwrfFe: vmovdqa %xmm8,%xmm2 leaq vpshufb_shf_table(%rip),%rax @@ -1008,48 +763,42 @@ aes_hw_xts_encrypt_avx512: vpshufb %xmm10,%xmm3,%xmm3 vpblendvb %xmm10,%xmm2,%xmm3,%xmm3 vpxor %xmm0,%xmm3,%xmm8 - vpxor 128(%rsp),%xmm8,%xmm8 -.byte 98,114,61,8,220,132,36,144,0,0,0 -.byte 98,114,61,8,220,132,36,160,0,0,0 -.byte 98,114,61,8,220,132,36,176,0,0,0 -.byte 98,114,61,8,220,132,36,192,0,0,0 -.byte 98,114,61,8,220,132,36,208,0,0,0 -.byte 98,114,61,8,220,132,36,224,0,0,0 -.byte 98,114,61,8,220,132,36,240,0,0,0 -.byte 98,114,61,8,220,132,36,0,1,0,0 -.byte 98,114,61,8,220,132,36,16,1,0,0 -.byte 98,114,61,8,220,132,36,32,1,0,0 -.byte 98,114,61,8,220,132,36,48,1,0,0 -.byte 98,114,61,8,220,132,36,64,1,0,0 -.byte 98,114,61,8,220,132,36,80,1,0,0 -.byte 98,114,61,8,221,132,36,96,1,0,0 + vpxor (%rcx),%xmm8,%xmm8 + vaesenc 16(%rcx),%xmm8,%xmm8 + vaesenc 32(%rcx),%xmm8,%xmm8 + vaesenc 48(%rcx),%xmm8,%xmm8 + vaesenc 64(%rcx),%xmm8,%xmm8 + vaesenc 80(%rcx),%xmm8,%xmm8 + vaesenc 96(%rcx),%xmm8,%xmm8 + vaesenc 112(%rcx),%xmm8,%xmm8 + vaesenc 128(%rcx),%xmm8,%xmm8 + vaesenc 144(%rcx),%xmm8,%xmm8 + vaesenc 160(%rcx),%xmm8,%xmm8 + vaesenc 176(%rcx),%xmm8,%xmm8 + vaesenc 192(%rcx),%xmm8,%xmm8 + vaesenc 208(%rcx),%xmm8,%xmm8 + vaesenclast 224(%rcx),%xmm8,%xmm8 vpxor %xmm0,%xmm8,%xmm8 vmovdqu %xmm8,-16(%rsi) + .L_ret_hEgxyDlCngwrfFe: - movq 368(%rsp),%rbx + movq 128(%rsp),%rbx xorq %r8,%r8 - movq %r8,368(%rsp) - + movq %r8,128(%rsp) vpxorq %zmm0,%zmm0,%zmm0 - - vmovdqa64 %zmm0,128(%rsp) - vmovdqa64 %zmm0,192(%rsp) - vmovdqa64 %zmm0,256(%rsp) - - - - movq $0x3f,%r8 - kmovq %r8,%k2 - vmovdqa64 %zmm0,320(%rsp){%k2} - movq %rbp,%rsp popq %rbp vzeroupper .byte 0xf3,0xc3 .L_less_than_128_bytes_hEgxyDlCngwrfFe: + vpbroadcastq %r10,%zmm25 cmpq $0x10,%rdx jb .L_ret_hEgxyDlCngwrfFe + vbroadcasti32x4 (%rsp),%zmm0 + vbroadcasti32x4 shufb_15_7(%rip),%zmm8 + movl $0xaa,%r8d + kmovq %r8,%k2 movq %rdx,%r8 andq $0x70,%r8 cmpq $0x60,%r8 @@ -1066,2182 +815,990 @@ aes_hw_xts_encrypt_avx512: je .L_num_blocks_is_1_hEgxyDlCngwrfFe .L_num_blocks_is_7_hEgxyDlCngwrfFe: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa 16(%rsp),%xmm10 - vmovdqu 16(%rdi),%xmm2 - xorq 
%r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,32(%rsp) - movq %rbx,40(%rsp) - vmovdqa 32(%rsp),%xmm11 - vmovdqu 32(%rdi),%xmm3 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,48(%rsp) - movq %rbx,56(%rsp) - vmovdqa 48(%rsp),%xmm12 - vmovdqu 48(%rdi),%xmm4 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,64(%rsp) - movq %rbx,72(%rsp) - vmovdqa 64(%rsp),%xmm13 - vmovdqu 64(%rdi),%xmm5 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,80(%rsp) - movq %rbx,88(%rsp) - vmovdqa 80(%rsp),%xmm14 - vmovdqu 80(%rdi),%xmm6 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,96(%rsp) - movq %rbx,104(%rsp) - vmovdqa 96(%rsp),%xmm15 - vmovdqu 96(%rdi),%xmm7 + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + vpsllvq const_dq7654(%rip),%zmm0,%zmm5 + vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 +.byte 98,147,77,72,68,249,0 + vpxorq %zmm6,%zmm5,%zmm5{%k2} + vpxord %zmm5,%zmm7,%zmm10 + movq $0x0000ffffffffffff,%r8 + kmovq %r8,%k1 + vmovdqu8 0(%rdi),%zmm1 + vmovdqu8 64(%rdi),%zmm2{%k1} + addq $0x70,%rdi - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vpxor %xmm14,%xmm6,%xmm6 - vpxor %xmm15,%xmm7,%xmm7 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm0,%xmm4,%xmm4 - vpxor %xmm0,%xmm5,%xmm5 - vpxor %xmm0,%xmm6,%xmm6 - vpxor %xmm0,%xmm7,%xmm7 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 272(%rsp),%xmm0 -.byte 
98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,221,200 -.byte 98,242,109,8,221,208 -.byte 98,242,101,8,221,216 -.byte 98,242,93,8,221,224 -.byte 98,242,85,8,221,232 -.byte 98,242,77,8,221,240 -.byte 98,242,69,8,221,248 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vpxor %xmm14,%xmm6,%xmm6 - vpxor %xmm15,%xmm7,%xmm7 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - vmovdqu %xmm4,48(%rsi) - vmovdqu %xmm5,64(%rsi) - vmovdqu %xmm6,80(%rsi) - vmovdqu %xmm7,96(%rsi) - addq $0x70,%rsi - vmovdqa %xmm7,%xmm8 - andq $0xf,%rdx - je .L_ret_hEgxyDlCngwrfFe - jmp .L_steal_cipher_next_hEgxyDlCngwrfFe + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 + vbroadcasti32x4 16(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 -.L_num_blocks_is_6_hEgxyDlCngwrfFe: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa 16(%rsp),%xmm10 - vmovdqu 16(%rdi),%xmm2 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,32(%rsp) - movq %rbx,40(%rsp) - vmovdqa 32(%rsp),%xmm11 - vmovdqu 32(%rdi),%xmm3 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,48(%rsp) - movq %rbx,56(%rsp) - vmovdqa 48(%rsp),%xmm12 - vmovdqu 48(%rdi),%xmm4 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,64(%rsp) - movq %rbx,72(%rsp) - vmovdqa 64(%rsp),%xmm13 - vmovdqu 64(%rdi),%xmm5 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,80(%rsp) - movq %rbx,88(%rsp) - vmovdqa 80(%rsp),%xmm14 - vmovdqu 80(%rdi),%xmm6 - addq $0x60,%rdi - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vpxor %xmm14,%xmm6,%xmm6 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm0,%xmm4,%xmm4 - vpxor %xmm0,%xmm5,%xmm5 - vpxor %xmm0,%xmm6,%xmm6 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 160(%rsp),%xmm0 
-.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,221,200 -.byte 98,242,109,8,221,208 -.byte 98,242,101,8,221,216 -.byte 98,242,93,8,221,224 -.byte 98,242,85,8,221,232 -.byte 98,242,77,8,221,240 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vpxor %xmm14,%xmm6,%xmm6 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - vmovdqu %xmm4,48(%rsi) - vmovdqu %xmm5,64(%rsi) - vmovdqu %xmm6,80(%rsi) - addq $0x60,%rsi - vmovdqa %xmm6,%xmm8 - andq $0xf,%rdx - je .L_ret_hEgxyDlCngwrfFe - jmp .L_steal_cipher_next_hEgxyDlCngwrfFe -.L_num_blocks_is_5_hEgxyDlCngwrfFe: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa 16(%rsp),%xmm10 - vmovdqu 16(%rdi),%xmm2 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,32(%rsp) - movq %rbx,40(%rsp) - vmovdqa 32(%rsp),%xmm11 - vmovdqu 32(%rdi),%xmm3 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,48(%rsp) - movq %rbx,56(%rsp) - vmovdqa 48(%rsp),%xmm12 - vmovdqu 48(%rdi),%xmm4 - 
xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,64(%rsp) - movq %rbx,72(%rsp) - vmovdqa 64(%rsp),%xmm13 - vmovdqu 64(%rdi),%xmm5 - addq $0x50,%rdi - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm0,%xmm4,%xmm4 - vpxor %xmm0,%xmm5,%xmm5 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,221,200 -.byte 98,242,109,8,221,208 -.byte 98,242,101,8,221,216 -.byte 98,242,93,8,221,224 -.byte 98,242,85,8,221,232 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - vmovdqu %xmm4,48(%rsi) - vmovdqu %xmm5,64(%rsi) - addq $0x50,%rsi - vmovdqa %xmm5,%xmm8 - andq $0xf,%rdx - je .L_ret_hEgxyDlCngwrfFe - jmp .L_steal_cipher_next_hEgxyDlCngwrfFe + vbroadcasti32x4 32(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 -.L_num_blocks_is_4_hEgxyDlCngwrfFe: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa 16(%rsp),%xmm10 - vmovdqu 
16(%rdi),%xmm2 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,32(%rsp) - movq %rbx,40(%rsp) - vmovdqa 32(%rsp),%xmm11 - vmovdqu 32(%rdi),%xmm3 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,48(%rsp) - movq %rbx,56(%rsp) - vmovdqa 48(%rsp),%xmm12 - vmovdqu 48(%rdi),%xmm4 - addq $0x40,%rdi - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm0,%xmm4,%xmm4 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,221,200 -.byte 98,242,109,8,221,208 -.byte 98,242,101,8,221,216 -.byte 98,242,93,8,221,224 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - vmovdqu %xmm4,48(%rsi) - addq $0x40,%rsi - vmovdqa %xmm4,%xmm8 - andq $0xf,%rdx - je .L_ret_hEgxyDlCngwrfFe - jmp .L_steal_cipher_next_hEgxyDlCngwrfFe -.L_num_blocks_is_3_hEgxyDlCngwrfFe: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa 16(%rsp),%xmm10 - vmovdqu 16(%rdi),%xmm2 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,32(%rsp) - movq %rbx,40(%rsp) - vmovdqa 32(%rsp),%xmm11 - vmovdqu 32(%rdi),%xmm3 - addq $0x30,%rdi - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor 
%xmm0,%xmm3,%xmm3 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,221,200 -.byte 98,242,109,8,221,208 -.byte 98,242,101,8,221,216 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - addq $0x30,%rsi - vmovdqa %xmm3,%xmm8 - andq $0xf,%rdx - je .L_ret_hEgxyDlCngwrfFe - jmp .L_steal_cipher_next_hEgxyDlCngwrfFe + vbroadcasti32x4 48(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 -.L_num_blocks_is_2_hEgxyDlCngwrfFe: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa 16(%rsp),%xmm10 - vmovdqu 16(%rdi),%xmm2 - addq $0x20,%rdi - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,221,200 -.byte 98,242,109,8,221,208 - vpxor 
%xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - addq $0x20,%rsi - vmovdqa %xmm2,%xmm8 - andq $0xf,%rdx - je .L_ret_hEgxyDlCngwrfFe - jmp .L_steal_cipher_next_hEgxyDlCngwrfFe + vbroadcasti32x4 64(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 -.L_num_blocks_is_1_hEgxyDlCngwrfFe: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - addq $0x10,%rdi - vpxor %xmm9,%xmm1,%xmm1 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,221,200 - vpxor %xmm9,%xmm1,%xmm1 - vmovdqu %xmm1,(%rsi) - addq $0x10,%rsi - vmovdqa %xmm1,%xmm8 - andq $0xf,%rdx - je .L_ret_hEgxyDlCngwrfFe - jmp .L_steal_cipher_next_hEgxyDlCngwrfFe -.cfi_endproc -.globl aes_hw_xts_decrypt_avx512 -.hidden aes_hw_xts_decrypt_avx512 -.hidden aes_hw_xts_decrypt_avx512 -.type aes_hw_xts_decrypt_avx512,@function -.align 32 -aes_hw_xts_decrypt_avx512: -.cfi_startproc -.byte 243,15,30,250 - pushq %rbp - movq %rsp,%rbp - subq $376,%rsp - andq $0xffffffffffffffc0,%rsp - movq %rbx,368(%rsp) - movq $0x87,%r10 - vmovdqu (%r9),%xmm1 - vpxor %xmm4,%xmm4,%xmm4 - vmovdqu (%r8),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - - vmovdqu 224(%rcx),%xmm2 - vmovdqa %xmm2,352(%rsp) - - vmovdqu 16(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 208(%rcx),%xmm2 - vmovdqa %xmm2,336(%rsp) - - vmovdqu 32(%r8),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqu 192(%rcx),%xmm2 - vmovdqa %xmm2,320(%rsp) - - vmovdqu 48(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 176(%rcx),%xmm2 - vmovdqa %xmm2,304(%rsp) - - vmovdqu 64(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 160(%rcx),%xmm2 - vmovdqa %xmm2,288(%rsp) - - vmovdqu 80(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 144(%rcx),%xmm2 - vmovdqa %xmm2,272(%rsp) - - vmovdqu 96(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 128(%rcx),%xmm2 - vmovdqa %xmm2,256(%rsp) - - vmovdqu 112(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 112(%rcx),%xmm2 - vmovdqa %xmm2,240(%rsp) - - vmovdqu 128(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 96(%rcx),%xmm2 - vmovdqa %xmm2,224(%rsp) - - vmovdqu 144(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 80(%rcx),%xmm2 - vmovdqa %xmm2,208(%rsp) - - vmovdqu 160(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 64(%rcx),%xmm2 - vmovdqa %xmm2,192(%rsp) + vbroadcasti32x4 80(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vmovdqu 176(%r8),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqu 48(%rcx),%xmm2 - vmovdqa %xmm2,176(%rsp) + vbroadcasti32x4 96(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vmovdqu 192(%r8),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqu 32(%rcx),%xmm2 - vmovdqa %xmm2,160(%rsp) + vbroadcasti32x4 112(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vmovdqu 
208(%r8),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqu 16(%rcx),%xmm2 - vmovdqa %xmm2,144(%rsp) + vbroadcasti32x4 128(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vmovdqu 224(%r8),%xmm0 -.byte 98,242,117,8,221,200 - vmovdqu (%rcx),%xmm2 - vmovdqa %xmm2,128(%rsp) + vbroadcasti32x4 144(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vmovdqa %xmm1,(%rsp) - cmpq $0x80,%rdx - jb .L_less_than_128_bytes_amivrujEyduiFoi - vpbroadcastq %r10,%zmm25 - cmpq $0x100,%rdx - jge .L_start_by16_amivrujEyduiFoi - jmp .L_start_by8_amivrujEyduiFoi + vbroadcasti32x4 160(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 -.L_do_n_blocks_amivrujEyduiFoi: - cmpq $0x0,%rdx - je .L_ret_amivrujEyduiFoi - cmpq $0x70,%rdx - jge .L_remaining_num_blocks_is_7_amivrujEyduiFoi - cmpq $0x60,%rdx - jge .L_remaining_num_blocks_is_6_amivrujEyduiFoi - cmpq $0x50,%rdx - jge .L_remaining_num_blocks_is_5_amivrujEyduiFoi - cmpq $0x40,%rdx - jge .L_remaining_num_blocks_is_4_amivrujEyduiFoi - cmpq $0x30,%rdx - jge .L_remaining_num_blocks_is_3_amivrujEyduiFoi - cmpq $0x20,%rdx - jge .L_remaining_num_blocks_is_2_amivrujEyduiFoi - cmpq $0x10,%rdx - jge .L_remaining_num_blocks_is_1_amivrujEyduiFoi + vbroadcasti32x4 176(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vmovdqu %xmm5,%xmm1 - vpxor %xmm9,%xmm1,%xmm1 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 - vpxor %xmm9,%xmm1,%xmm1 - vmovdqu %xmm1,-16(%rsi) - vmovdqa %xmm1,%xmm8 + vbroadcasti32x4 192(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - movq $0x1,%r8 - kmovq %r8,%k1 - vpsllq $0x3f,%xmm9,%xmm13 - vpsraq $0x3f,%xmm13,%xmm14 - vpandq %xmm25,%xmm14,%xmm5 - vpxorq %xmm5,%xmm9,%xmm9{%k1} - vpsrldq $0x8,%xmm9,%xmm10 -.byte 98, 211, 181, 8, 115, 194, 1 - vpslldq $0x8,%xmm13,%xmm13 - vpxorq %xmm13,%xmm0,%xmm0 - jmp .L_steal_cipher_amivrujEyduiFoi + vbroadcasti32x4 208(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 -.L_remaining_num_blocks_is_7_amivrujEyduiFoi: - movq $0xffffffffffffffff,%r8 - shrq $0x10,%r8 - kmovq %r8,%k1 - vmovdqu8 (%rdi),%zmm1 - vmovdqu8 64(%rdi),%zmm2{%k1} - addq $0x70,%rdi - andq $0xf,%rdx - je .L_done_7_remain_amivrujEyduiFoi - vextracti32x4 $0x2,%zmm10,%xmm12 - vextracti32x4 $0x3,%zmm10,%xmm13 - vinserti32x4 $0x2,%xmm13,%zmm10,%zmm10 + vbroadcasti32x4 224(%rcx),%zmm0 +.byte 98,242,117,72,221,200 +.byte 98,242,109,72,221,208 vpxorq %zmm9,%zmm1,%zmm1 vpxorq %zmm10,%zmm2,%zmm2 + vmovdqu8 %zmm1,0(%rsi) + vmovdqu8 %zmm2,64(%rsi){%k1} + addq $0x70,%rsi + vextracti32x4 $0x2,%zmm2,%xmm8 + vextracti32x4 $0x3,%zmm10,%xmm0 + andq $0xf,%rdx + je .L_ret_hEgxyDlCngwrfFe + jmp .L_steal_cipher_hEgxyDlCngwrfFe +.L_num_blocks_is_6_hEgxyDlCngwrfFe: + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq 
const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + vpsllvq const_dq7654(%rip),%zmm0,%zmm5 + vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 +.byte 98,147,77,72,68,249,0 + vpxorq %zmm6,%zmm5,%zmm5{%k2} + vpxord %zmm5,%zmm7,%zmm10 + vmovdqu8 0(%rdi),%zmm1 + vmovdqu8 64(%rdi),%ymm2 + addq $96,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 + vbroadcasti32x4 16(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 - vbroadcasti32x4 144(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 160(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 176(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - vbroadcasti32x4 192(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 208(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - + vbroadcasti32x4 32(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 224(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 48(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 240(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 64(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 256(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 80(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 272(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 96(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 288(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 112(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 304(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 128(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 320(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 144(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 336(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 160(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 352(%rsp),%zmm0 -.byte 98,242,117,72,223,200 -.byte 98,242,109,72,223,208 + vbroadcasti32x4 176(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 + vbroadcasti32x4 192(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vmovdqa32 %zmm15,%zmm9 - vmovdqa32 %zmm16,%zmm10 - vmovdqu8 %zmm1,(%rsi) - vmovdqu8 %zmm2,64(%rsi){%k1} - addq $0x70,%rsi - vextracti32x4 $0x2,%zmm2,%xmm8 - vmovdqa %xmm12,%xmm0 - jmp .L_steal_cipher_amivrujEyduiFoi + vbroadcasti32x4 208(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 -.L_done_7_remain_amivrujEyduiFoi: + vbroadcasti32x4 224(%rcx),%zmm0 +.byte 98,242,117,72,221,200 +.byte 98,242,109,72,221,208 vpxorq %zmm9,%zmm1,%zmm1 vpxorq %zmm10,%zmm2,%zmm2 + vmovdqu8 
%zmm1,0(%rsi) + vmovdqu8 %ymm2,64(%rsi) + addq $96,%rsi - - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 - vbroadcasti32x4 144(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 160(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vextracti32x4 $0x1,%ymm2,%xmm8 + vextracti32x4 $0x2,%zmm10,%xmm0 + andq $0xf,%rdx + je .L_ret_hEgxyDlCngwrfFe + jmp .L_steal_cipher_hEgxyDlCngwrfFe +.L_num_blocks_is_5_hEgxyDlCngwrfFe: + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + vpsllvq const_dq7654(%rip),%zmm0,%zmm5 + vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 +.byte 98,147,77,72,68,249,0 + vpxorq %zmm6,%zmm5,%zmm5{%k2} + vpxord %zmm5,%zmm7,%zmm10 + vmovdqu8 0(%rdi),%zmm1 + vmovdqu8 64(%rdi),%xmm2 + addq $80,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 + vbroadcasti32x4 16(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 176(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 32(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 192(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 48(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 208(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 64(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 224(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 80(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 240(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 96(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 256(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 112(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 272(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 128(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 288(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 144(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 304(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 160(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 320(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 176(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 336(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 192(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 352(%rsp),%zmm0 -.byte 98,242,117,72,223,200 -.byte 98,242,109,72,223,208 + vbroadcasti32x4 208(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 + vbroadcasti32x4 224(%rcx),%zmm0 +.byte 98,242,117,72,221,200 +.byte 98,242,109,72,221,208 vpxorq %zmm9,%zmm1,%zmm1 vpxorq %zmm10,%zmm2,%zmm2 + vmovdqu8 %zmm1,0(%rsi) + vmovdqu8 %xmm2,64(%rsi) + 
addq $80,%rsi - - vmovdqa32 %zmm15,%zmm9 - vmovdqa32 %zmm16,%zmm10 - vmovdqu8 %zmm1,(%rsi) - vmovdqu8 %zmm2,64(%rsi){%k1} - jmp .L_ret_amivrujEyduiFoi - -.L_remaining_num_blocks_is_6_amivrujEyduiFoi: - vmovdqu8 (%rdi),%zmm1 - vmovdqu8 64(%rdi),%ymm2 - addq $0x60,%rdi + vmovdqa %xmm2,%xmm8 + vextracti32x4 $0x1,%zmm10,%xmm0 andq $0xf,%rdx - je .L_done_6_remain_amivrujEyduiFoi - vextracti32x4 $0x1,%zmm10,%xmm12 - vextracti32x4 $0x2,%zmm10,%xmm13 - vinserti32x4 $0x1,%xmm13,%zmm10,%zmm10 - + je .L_ret_hEgxyDlCngwrfFe + jmp .L_steal_cipher_hEgxyDlCngwrfFe +.L_num_blocks_is_4_hEgxyDlCngwrfFe: + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + vpsllvq const_dq7654(%rip),%zmm0,%zmm5 + vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 +.byte 98,147,77,72,68,249,0 + vpxorq %zmm6,%zmm5,%zmm5{%k2} + vpxord %zmm5,%zmm7,%zmm10 + vmovdqu8 0(%rdi),%zmm1 + addq $64,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vbroadcasti32x4 16(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 32(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 48(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 64(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 80(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 96(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 112(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 128(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 144(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 160(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 176(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 192(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 208(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 224(%rcx),%zmm0 +.byte 98,242,117,72,221,200 vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 - vbroadcasti32x4 144(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 160(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 176(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - vbroadcasti32x4 192(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 208(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 224(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 240(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 256(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 272(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 288(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 304(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 320(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - + vmovdqu8 %zmm1,0(%rsi) + addq $64,%rsi + vextracti32x4 $0x3,%zmm1,%xmm8 + vmovdqa %xmm10,%xmm0 + andq $0xf,%rdx + je .L_ret_hEgxyDlCngwrfFe + jmp .L_steal_cipher_hEgxyDlCngwrfFe +.L_num_blocks_is_3_hEgxyDlCngwrfFe: + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq 
const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + movq $0x0000ffffffffffff,%r8 + kmovq %r8,%k1 + vmovdqu8 0(%rdi),%zmm1{%k1} + addq $48,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vbroadcasti32x4 16(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 32(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 48(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 64(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 80(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 96(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 112(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 128(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 144(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 160(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 176(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 192(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 208(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 224(%rcx),%zmm0 +.byte 98,242,117,72,221,200 + vpxorq %zmm9,%zmm1,%zmm1 + vmovdqu8 %zmm1,0(%rsi){%k1} + addq $48,%rsi + vextracti32x4 $2,%zmm1,%xmm8 + vextracti32x4 $3,%zmm9,%xmm0 + andq $0xf,%rdx + je .L_ret_hEgxyDlCngwrfFe + jmp .L_steal_cipher_hEgxyDlCngwrfFe +.L_num_blocks_is_2_hEgxyDlCngwrfFe: + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 - vbroadcasti32x4 336(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vmovdqu8 0(%rdi),%ymm1 + addq $32,%rdi + vbroadcasti32x4 (%rcx),%ymm0 + vpternlogq $0x96,%ymm0,%ymm9,%ymm1 + vbroadcasti32x4 16(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 32(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 48(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 64(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 80(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 96(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 112(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 128(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 144(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 160(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 176(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 192(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 208(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 224(%rcx),%ymm0 +.byte 98,242,117,40,221,200 + vpxorq %ymm9,%ymm1,%ymm1 + vmovdqu8 %ymm1,0(%rsi) + addq $32,%rsi + + vextracti32x4 $1,%ymm1,%xmm8 + vextracti32x4 $2,%zmm9,%xmm0 + andq $0xf,%rdx + je .L_ret_hEgxyDlCngwrfFe + jmp .L_steal_cipher_hEgxyDlCngwrfFe +.L_num_blocks_is_1_hEgxyDlCngwrfFe: + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + vmovdqu8 0(%rdi),%xmm1 + addq $16,%rdi + vbroadcasti32x4 (%rcx),%ymm0 + vpternlogq $0x96,%ymm0,%ymm9,%ymm1 + vbroadcasti32x4 16(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 32(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 48(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 64(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 80(%rcx),%ymm0 +.byte 
98,242,117,40,220,200 + vbroadcasti32x4 96(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 112(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 128(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 144(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 160(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 176(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 192(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 208(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 224(%rcx),%ymm0 +.byte 98,242,117,40,221,200 + vpxorq %ymm9,%ymm1,%ymm1 + vmovdqu8 %xmm1,0(%rsi) + addq $16,%rsi - vbroadcasti32x4 352(%rsp),%zmm0 -.byte 98,242,117,72,223,200 -.byte 98,242,109,72,223,208 + vmovdqa %xmm1,%xmm8 + vextracti32x4 $1,%zmm9,%xmm0 + andq $0xf,%rdx + je .L_ret_hEgxyDlCngwrfFe + jmp .L_steal_cipher_hEgxyDlCngwrfFe +.cfi_endproc +.globl aes_hw_xts_decrypt_avx512 +.hidden aes_hw_xts_decrypt_avx512 +.hidden aes_hw_xts_decrypt_avx512 +.type aes_hw_xts_decrypt_avx512,@function +.align 32 +aes_hw_xts_decrypt_avx512: +.cfi_startproc +.byte 243,15,30,250 + pushq %rbp + movq %rsp,%rbp + subq $136,%rsp + andq $0xffffffffffffffc0,%rsp + movq %rbx,128(%rsp) + movq $0x87,%r10 + vmovdqu (%r9),%xmm1 + vpxor (%r8),%xmm1,%xmm1 + vaesenc 16(%r8),%xmm1,%xmm1 + vaesenc 32(%r8),%xmm1,%xmm1 + vaesenc 48(%r8),%xmm1,%xmm1 + vaesenc 64(%r8),%xmm1,%xmm1 + vaesenc 80(%r8),%xmm1,%xmm1 + vaesenc 96(%r8),%xmm1,%xmm1 + vaesenc 112(%r8),%xmm1,%xmm1 + vaesenc 128(%r8),%xmm1,%xmm1 + vaesenc 144(%r8),%xmm1,%xmm1 + vaesenc 160(%r8),%xmm1,%xmm1 + vaesenc 176(%r8),%xmm1,%xmm1 + vaesenc 192(%r8),%xmm1,%xmm1 + vaesenc 208(%r8),%xmm1,%xmm1 + vaesenclast 224(%r8),%xmm1,%xmm1 + vmovdqa %xmm1,(%rsp) - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 + cmpq $0x20,%rdx + jl .L_final_block_is_only_block_amivrujEyduiFoi - vmovdqa32 %zmm15,%zmm9 - vmovdqa32 %zmm16,%zmm10 - vmovdqu8 %zmm1,(%rsi) - vmovdqu8 %ymm2,64(%rsi) - addq $0x60,%rsi - vextracti32x4 $0x1,%zmm2,%xmm8 - vmovdqa %xmm12,%xmm0 - jmp .L_steal_cipher_amivrujEyduiFoi -.L_done_6_remain_amivrujEyduiFoi: - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 + movq %rdx,%r11 + andq $0xfffffffffffffff0,%r11 + subq $16,%r11 + cmpq $0x80,%r11 + jl .L_less_than_128_bytes_amivrujEyduiFoi + vpbroadcastq %r10,%zmm25 + cmpq $0x100,%r11 + jge .L_start_by16_amivrujEyduiFoi + cmpq $0x80,%r11 + jge .L_start_by8_amivrujEyduiFoi +.L_do_n_blocks_amivrujEyduiFoi: + cmpq $0x70,%r11 + je .L_remaining_num_blocks_is_7_amivrujEyduiFoi + cmpq $0x60,%r11 + je .L_remaining_num_blocks_is_6_amivrujEyduiFoi + cmpq $0x50,%r11 + je .L_remaining_num_blocks_is_5_amivrujEyduiFoi + cmpq $0x40,%r11 + je .L_remaining_num_blocks_is_4_amivrujEyduiFoi + cmpq $0x30,%r11 + je .L_remaining_num_blocks_is_3_amivrujEyduiFoi + cmpq $0x20,%r11 + je .L_remaining_num_blocks_is_2_amivrujEyduiFoi + cmpq $0x10,%r11 + je .L_remaining_num_blocks_is_1_amivrujEyduiFoi + andq $0xf,%rdx + je .L_final_block_amivrujEyduiFoi + vextracti32x4 $0x0,%zmm9,%xmm0 + vextracti32x4 $0x1,%zmm9,%xmm15 + jmp .L_steal_cipher_amivrujEyduiFoi - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 - vbroadcasti32x4 144(%rsp),%zmm0 +.L_remaining_num_blocks_is_7_amivrujEyduiFoi: + movq $0x0000ffffffffffff,%r8 + kmovq %r8,%k1 + vmovdqu8 (%rdi),%zmm1 + vmovdqu8 64(%rdi),%zmm2{%k1} + addq $0x70,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,222,200 
.byte 98,242,109,72,222,208 - vbroadcasti32x4 160(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 176(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 192(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 208(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 160(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 176(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 192(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 208(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 352(%rsp),%zmm0 + vbroadcasti32x4 224(%rcx),%zmm0 .byte 98,242,117,72,223,200 .byte 98,242,109,72,223,208 - - vpxorq %zmm9,%zmm1,%zmm1 vpxorq %zmm10,%zmm2,%zmm2 - - - vmovdqa32 %zmm15,%zmm9 - vmovdqa32 %zmm16,%zmm10 vmovdqu8 %zmm1,(%rsi) - vmovdqu8 %ymm2,64(%rsi) - jmp .L_ret_amivrujEyduiFoi - -.L_remaining_num_blocks_is_5_amivrujEyduiFoi: - vmovdqu8 (%rdi),%zmm1 - vmovdqu 64(%rdi),%xmm2 - addq $0x50,%rdi + vmovdqu8 %zmm2,64(%rsi){%k1} + addq $0x70,%rsi + vextracti32x4 $0x3,%zmm10,%xmm0 andq $0xf,%rdx - je .L_done_5_remain_amivrujEyduiFoi - vmovdqa %xmm10,%xmm12 - vextracti32x4 $0x1,%zmm10,%xmm10 - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - + je .L_final_block_amivrujEyduiFoi + vpsrldq $0xf,%zmm9,%zmm13 +.byte 98,19,21,72,68,241,0 + vpslldq $0x1,%zmm9,%zmm11 + vpxord %zmm14,%zmm11,%zmm11 + vextracti32x4 $0x0,%zmm11,%xmm15 + jmp .L_steal_cipher_amivrujEyduiFoi - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 - vbroadcasti32x4 144(%rsp),%zmm0 +.L_remaining_num_blocks_is_6_amivrujEyduiFoi: + vmovdqu8 (%rdi),%zmm1 + vmovdqu8 64(%rdi),%ymm2 + addq $0x60,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 160(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 176(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 192(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 208(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 
98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 160(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 176(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 192(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 208(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 352(%rsp),%zmm0 + vbroadcasti32x4 224(%rcx),%zmm0 .byte 98,242,117,72,223,200 .byte 98,242,109,72,223,208 - - vpxorq %zmm9,%zmm1,%zmm1 vpxorq %zmm10,%zmm2,%zmm2 - - - vmovdqa32 %zmm15,%zmm9 - vmovdqa32 %zmm16,%zmm10 vmovdqu8 %zmm1,(%rsi) - vmovdqu %xmm2,64(%rsi) - addq $0x50,%rsi - vmovdqa %xmm2,%xmm8 - vmovdqa %xmm12,%xmm0 + vmovdqu8 %ymm2,64(%rsi) + addq $0x60,%rsi + vextracti32x4 $0x2,%zmm10,%xmm0 + vextracti32x4 $0x3,%zmm10,%xmm15 + andq $0xf,%rdx + je .L_final_block_amivrujEyduiFoi jmp .L_steal_cipher_amivrujEyduiFoi -.L_done_5_remain_amivrujEyduiFoi: - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 - vbroadcasti32x4 144(%rsp),%zmm0 +.L_remaining_num_blocks_is_5_amivrujEyduiFoi: + vmovdqu8 (%rdi),%zmm1 + vmovdqu 64(%rdi),%xmm2 + addq $0x50,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 160(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 176(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 192(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 208(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 160(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 176(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 192(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 208(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 352(%rsp),%zmm0 + vbroadcasti32x4 224(%rcx),%zmm0 .byte 98,242,117,72,223,200 .byte 98,242,109,72,223,208 - - vpxorq %zmm9,%zmm1,%zmm1 vpxorq %zmm10,%zmm2,%zmm2 - - - vmovdqa32 
%zmm15,%zmm9 - vmovdqa32 %zmm16,%zmm10 vmovdqu8 %zmm1,(%rsi) - vmovdqu8 %xmm2,64(%rsi) - jmp .L_ret_amivrujEyduiFoi + vmovdqu %xmm2,64(%rsi) + addq $0x50,%rsi + vextracti32x4 $0x1,%zmm10,%xmm0 + vextracti32x4 $0x2,%zmm10,%xmm15 + andq $0xf,%rdx + je .L_final_block_amivrujEyduiFoi + jmp .L_steal_cipher_amivrujEyduiFoi .L_remaining_num_blocks_is_4_amivrujEyduiFoi: vmovdqu8 (%rdi),%zmm1 addq $0x40,%rdi - andq $0xf,%rdx - je .L_done_4_remain_amivrujEyduiFoi - vextracti32x4 $0x3,%zmm9,%xmm12 - vinserti32x4 $0x3,%xmm10,%zmm9,%zmm9 - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 - vbroadcasti32x4 144(%rsp),%zmm0 + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 160(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 176(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - vbroadcasti32x4 192(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 208(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 160(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 176(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 192(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 208(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 352(%rsp),%zmm0 + vbroadcasti32x4 224(%rcx),%zmm0 .byte 98,242,117,72,223,200 -.byte 98,242,109,72,223,208 - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vmovdqa32 %zmm15,%zmm9 - vmovdqa32 %zmm16,%zmm10 vmovdqu8 %zmm1,(%rsi) addq $0x40,%rsi - vextracti32x4 $0x3,%zmm1,%xmm8 - vmovdqa %xmm12,%xmm0 + vextracti32x4 $0x0,%zmm10,%xmm0 + vextracti32x4 $0x1,%zmm10,%xmm15 + andq $0xf,%rdx + je .L_final_block_amivrujEyduiFoi jmp .L_steal_cipher_amivrujEyduiFoi - -.L_done_4_remain_amivrujEyduiFoi: - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 - vbroadcasti32x4 144(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 160(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 176(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - vbroadcasti32x4 192(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 208(%rsp),%zmm0 +.L_remaining_num_blocks_is_3_amivrujEyduiFoi: 
+ movq $-1,%r8 + shrq $0x10,%r8 + kmovq %r8,%k1 + vmovdqu8 (%rdi),%zmm1{%k1} + addq $0x30,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 352(%rsp),%zmm0 -.byte 98,242,117,72,223,200 -.byte 98,242,109,72,223,208 - - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vmovdqa32 %zmm15,%zmm9 - vmovdqa32 %zmm16,%zmm10 - vmovdqu8 %zmm1,(%rsi) - jmp .L_ret_amivrujEyduiFoi - -.L_remaining_num_blocks_is_3_amivrujEyduiFoi: - vmovdqu (%rdi),%xmm1 - vmovdqu 16(%rdi),%xmm2 - vmovdqu 32(%rdi),%xmm3 - addq $0x30,%rdi - andq $0xf,%rdx - je .L_done_3_remain_amivrujEyduiFoi - vextracti32x4 $0x2,%zmm9,%xmm13 - vextracti32x4 $0x1,%zmm9,%xmm10 - vextracti32x4 $0x3,%zmm9,%xmm11 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 -.byte 
98,242,101,8,223,216 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) + vbroadcasti32x4 160(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 176(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 192(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 208(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 224(%rcx),%zmm0 +.byte 98,242,117,72,223,200 + vpxorq %zmm9,%zmm1,%zmm1 + vmovdqu8 %zmm1,(%rsi){%k1} addq $0x30,%rsi - vmovdqa %xmm3,%xmm8 - vmovdqa %xmm13,%xmm0 + vextracti32x4 $0x3,%zmm9,%xmm0 + vextracti32x4 $0x0,%zmm10,%xmm15 + andq $0xf,%rdx + je .L_final_block_amivrujEyduiFoi jmp .L_steal_cipher_amivrujEyduiFoi - -.L_done_3_remain_amivrujEyduiFoi: - vextracti32x4 $0x1,%zmm9,%xmm10 - vextracti32x4 $0x2,%zmm9,%xmm11 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 -.byte 98,242,101,8,223,216 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - jmp .L_ret_amivrujEyduiFoi - .L_remaining_num_blocks_is_2_amivrujEyduiFoi: - vmovdqu (%rdi),%xmm1 - vmovdqu 16(%rdi),%xmm2 + vmovdqu8 (%rdi),%ymm1 addq $0x20,%rdi - andq $0xf,%rdx - je .L_done_2_remain_amivrujEyduiFoi - vextracti32x4 $0x2,%zmm9,%xmm10 - vextracti32x4 $0x1,%zmm9,%xmm12 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 
98,242,109,8,222,208 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) + vbroadcasti32x4 (%rcx),%ymm0 + vpternlogq $0x96,%ymm0,%ymm9,%ymm1 + vbroadcasti32x4 16(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 32(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 48(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 64(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 80(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 96(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 112(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 128(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 144(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 160(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 176(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 192(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 208(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 224(%rcx),%ymm0 +.byte 98,242,117,40,223,200 + vpxorq %ymm9,%ymm1,%ymm1 + vmovdqu %ymm1,(%rsi) addq $0x20,%rsi - vmovdqa %xmm2,%xmm8 - vmovdqa %xmm12,%xmm0 + vextracti32x4 $0x2,%zmm9,%xmm0 + vextracti32x4 $0x3,%zmm9,%xmm15 + andq $0xf,%rdx + je .L_final_block_amivrujEyduiFoi jmp .L_steal_cipher_amivrujEyduiFoi - -.L_done_2_remain_amivrujEyduiFoi: - vextracti32x4 $0x1,%zmm9,%xmm10 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - jmp .L_ret_amivrujEyduiFoi - 
.L_remaining_num_blocks_is_1_amivrujEyduiFoi: vmovdqu (%rdi),%xmm1 addq $0x10,%rdi - andq $0xf,%rdx - je .L_done_1_remain_amivrujEyduiFoi - vextracti32x4 $0x1,%zmm9,%xmm11 - vpxor %xmm11,%xmm1,%xmm1 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 - vpxor %xmm11,%xmm1,%xmm1 + vpxor %xmm9,%xmm1,%xmm1 + vpxor (%rcx),%xmm1,%xmm1 + vaesdec 16(%rcx),%xmm1,%xmm1 + vaesdec 32(%rcx),%xmm1,%xmm1 + vaesdec 48(%rcx),%xmm1,%xmm1 + vaesdec 64(%rcx),%xmm1,%xmm1 + vaesdec 80(%rcx),%xmm1,%xmm1 + vaesdec 96(%rcx),%xmm1,%xmm1 + vaesdec 112(%rcx),%xmm1,%xmm1 + vaesdec 128(%rcx),%xmm1,%xmm1 + vaesdec 144(%rcx),%xmm1,%xmm1 + vaesdec 160(%rcx),%xmm1,%xmm1 + vaesdec 176(%rcx),%xmm1,%xmm1 + vaesdec 192(%rcx),%xmm1,%xmm1 + vaesdec 208(%rcx),%xmm1,%xmm1 + vaesdeclast 224(%rcx),%xmm1,%xmm1 + vpxor %xmm9,%xmm1,%xmm1 vmovdqu %xmm1,(%rsi) addq $0x10,%rsi - vmovdqa %xmm1,%xmm8 - vmovdqa %xmm9,%xmm0 + vextracti32x4 $0x1,%zmm9,%xmm0 + vextracti32x4 $0x2,%zmm9,%xmm15 + andq $0xf,%rdx + je .L_final_block_amivrujEyduiFoi jmp .L_steal_cipher_amivrujEyduiFoi -.L_done_1_remain_amivrujEyduiFoi: - vpxor %xmm9,%xmm1,%xmm1 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 - vpxor %xmm9,%xmm1,%xmm1 - vmovdqu %xmm1,(%rsi) - jmp .L_ret_amivrujEyduiFoi .L_start_by16_amivrujEyduiFoi: vbroadcasti32x4 (%rsp),%zmm0 vbroadcasti32x4 shufb_15_7(%rip),%zmm8 movq $0xaa,%r8 kmovq %r8,%k2 + vpshufb %zmm8,%zmm0,%zmm1 - vpshufb %zmm8,%zmm0,%zmm1 vpsllvq const_dq3210(%rip),%zmm0,%zmm4 vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 .byte 98,147,109,72,68,217,0 @@ -3261,6 +1818,7 @@ aes_hw_xts_decrypt_avx512: vpslldq $0x1,%zmm9,%zmm11 vpxord %zmm14,%zmm11,%zmm11 + vpsrldq $0xf,%zmm10,%zmm15 .byte 98,131,5,72,68,193,0 vpslldq $0x1,%zmm10,%zmm12 @@ -3271,13 +1829,12 @@ aes_hw_xts_decrypt_avx512: vmovdqu8 64(%rdi),%zmm2 vmovdqu8 128(%rdi),%zmm3 vmovdqu8 192(%rdi),%zmm4 - vmovdqu8 240(%rdi),%xmm5 addq $0x100,%rdi vpxorq %zmm9,%zmm1,%zmm1 vpxorq %zmm10,%zmm2,%zmm2 vpxorq %zmm11,%zmm3,%zmm3 vpxorq %zmm12,%zmm4,%zmm4 - vbroadcasti32x4 128(%rsp),%zmm0 + vbroadcasti32x4 
(%rcx),%zmm0 vpxorq %zmm0,%zmm1,%zmm1 vpxorq %zmm0,%zmm2,%zmm2 vpxorq %zmm0,%zmm3,%zmm3 @@ -3286,17 +1843,17 @@ aes_hw_xts_decrypt_avx512: .byte 98,19,21,72,68,241,0 vpslldq $0x1,%zmm11,%zmm15 vpxord %zmm14,%zmm15,%zmm15 - vbroadcasti32x4 144(%rsp),%zmm0 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 .byte 98,242,93,72,222,224 - vbroadcasti32x4 160(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 .byte 98,242,93,72,222,224 - vbroadcasti32x4 176(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 @@ -3305,17 +1862,17 @@ aes_hw_xts_decrypt_avx512: .byte 98,19,21,72,68,241,0 vpslldq $0x1,%zmm12,%zmm16 vpxord %zmm14,%zmm16,%zmm16 - vbroadcasti32x4 192(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 .byte 98,242,93,72,222,224 - vbroadcasti32x4 208(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 .byte 98,242,93,72,222,224 - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 @@ -3324,17 +1881,17 @@ aes_hw_xts_decrypt_avx512: .byte 98,19,21,72,68,241,0 vpslldq $0x1,%zmm15,%zmm17 vpxord %zmm14,%zmm17,%zmm17 - vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 .byte 98,242,93,72,222,224 - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 .byte 98,242,93,72,222,224 - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 @@ -3343,27 +1900,27 @@ aes_hw_xts_decrypt_avx512: .byte 98,19,21,72,68,241,0 vpslldq $0x1,%zmm16,%zmm18 vpxord %zmm14,%zmm18,%zmm18 - vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 160(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 .byte 98,242,93,72,222,224 - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 176(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 .byte 98,242,93,72,222,224 - vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 192(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 .byte 98,242,93,72,222,224 - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 208(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 .byte 98,242,93,72,222,224 - vbroadcasti32x4 352(%rsp),%zmm0 + vbroadcasti32x4 224(%rcx),%zmm0 .byte 98,242,117,72,223,200 .byte 98,242,109,72,223,208 .byte 98,242,101,72,223,216 @@ -3382,30 +1939,24 @@ aes_hw_xts_decrypt_avx512: vmovdqu8 %zmm3,128(%rsi) vmovdqu8 %zmm4,192(%rsi) addq $0x100,%rsi - subq $0x100,%rdx - cmpq $0x100,%rdx - jge .L_main_loop_run_16_amivrujEyduiFoi - - cmpq $0x80,%rdx - jge .L_main_loop_run_8_amivrujEyduiFoi + subq $0x100,%r11 + cmpq $0x100,%r11 + jae .L_main_loop_run_16_amivrujEyduiFoi + cmpq $0x80,%r11 + jae .L_main_loop_run_8_amivrujEyduiFoi jmp .L_do_n_blocks_amivrujEyduiFoi .L_start_by8_amivrujEyduiFoi: - vbroadcasti32x4 (%rsp),%zmm0 vbroadcasti32x4 shufb_15_7(%rip),%zmm8 movq 
$0xaa,%r8 kmovq %r8,%k2 - - vpshufb %zmm8,%zmm0,%zmm1 vpsllvq const_dq3210(%rip),%zmm0,%zmm4 vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 .byte 98,147,109,72,68,217,0 vpxorq %zmm2,%zmm4,%zmm4{%k2} vpxord %zmm4,%zmm3,%zmm9 - - vpsllvq const_dq7654(%rip),%zmm0,%zmm5 vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 .byte 98,147,77,72,68,249,0 @@ -3415,31 +1966,25 @@ aes_hw_xts_decrypt_avx512: .L_main_loop_run_8_amivrujEyduiFoi: vmovdqu8 (%rdi),%zmm1 vmovdqu8 64(%rdi),%zmm2 - vmovdqu8 112(%rdi),%xmm5 addq $0x80,%rdi - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 vpsrldq $0xf,%zmm9,%zmm13 .byte 98,19,21,72,68,241,0 vpslldq $0x1,%zmm9,%zmm15 vpxord %zmm14,%zmm15,%zmm15 - vbroadcasti32x4 144(%rsp),%zmm0 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 160(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 176(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 vpsrldq $0xf,%zmm10,%zmm13 @@ -3447,1760 +1992,704 @@ aes_hw_xts_decrypt_avx512: vpslldq $0x1,%zmm10,%zmm16 vpxord %zmm14,%zmm16,%zmm16 - vbroadcasti32x4 192(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 80(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 96(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 112(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 128(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 144(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 160(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 176(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 192(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 208(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 224(%rcx),%zmm0 +.byte 98,242,117,72,223,200 +.byte 98,242,109,72,223,208 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm2,%zmm2 + vmovdqa32 %zmm15,%zmm9 + vmovdqa32 %zmm16,%zmm10 + vmovdqu8 %zmm1,(%rsi) + vmovdqu8 %zmm2,64(%rsi) + addq $0x80,%rsi + subq $0x80,%r11 + cmpq $0x80,%r11 + jae .L_main_loop_run_8_amivrujEyduiFoi + vextracti32x4 $0x0,%zmm9,%xmm0 + vextracti32x4 $0x1,%zmm9,%xmm15 + jmp .L_do_n_blocks_amivrujEyduiFoi + +.L_steal_cipher_with_tweak_amivrujEyduiFoi: + + vmovdqa shufb_15_7(%rip),%xmm11 + vpshufb %xmm11,%xmm0,%xmm12 + vpsllq $0x1,%xmm0,%xmm13 + vpsrlq $0x7,%xmm12,%xmm14 +.byte 98,19,13,8,68,249,0 + vpxord %xmm13,%xmm15,%xmm15 + +.L_steal_cipher_amivrujEyduiFoi: + + vmovdqu (%rdi),%xmm8 + vpxor %xmm15,%xmm8,%xmm8 + vpxor (%rcx),%xmm8,%xmm8 + vaesdec 16(%rcx),%xmm8,%xmm8 + vaesdec 32(%rcx),%xmm8,%xmm8 + vaesdec 48(%rcx),%xmm8,%xmm8 + vaesdec 64(%rcx),%xmm8,%xmm8 + vaesdec 80(%rcx),%xmm8,%xmm8 + vaesdec 96(%rcx),%xmm8,%xmm8 + vaesdec 112(%rcx),%xmm8,%xmm8 + vaesdec 128(%rcx),%xmm8,%xmm8 + vaesdec 144(%rcx),%xmm8,%xmm8 + vaesdec 160(%rcx),%xmm8,%xmm8 + vaesdec 176(%rcx),%xmm8,%xmm8 + vaesdec 192(%rcx),%xmm8,%xmm8 + vaesdec 
208(%rcx),%xmm8,%xmm8 + vaesdeclast 224(%rcx),%xmm8,%xmm8 + vpxor %xmm15,%xmm8,%xmm8 + + + + + movq $1,%r11 + movq %rcx,%r8 + movq %rdx,%rcx + shlq %cl,%r11 + subq $1,%r11 + kmovq %r11,%k1 + vmovdqu8 16(%rdi),%xmm9{%k1}{z} + vmovdqu8 %xmm8,%xmm10{%k1}{z} + vpblendmb %xmm9,%xmm8,%xmm9{%k1} + + + movq %r8,%rcx + vpxor %xmm0,%xmm9,%xmm9 + vpxor (%rcx),%xmm9,%xmm9 + vaesdec 16(%rcx),%xmm9,%xmm9 + vaesdec 32(%rcx),%xmm9,%xmm9 + vaesdec 48(%rcx),%xmm9,%xmm9 + vaesdec 64(%rcx),%xmm9,%xmm9 + vaesdec 80(%rcx),%xmm9,%xmm9 + vaesdec 96(%rcx),%xmm9,%xmm9 + vaesdec 112(%rcx),%xmm9,%xmm9 + vaesdec 128(%rcx),%xmm9,%xmm9 + vaesdec 144(%rcx),%xmm9,%xmm9 + vaesdec 160(%rcx),%xmm9,%xmm9 + vaesdec 176(%rcx),%xmm9,%xmm9 + vaesdec 192(%rcx),%xmm9,%xmm9 + vaesdec 208(%rcx),%xmm9,%xmm9 + vaesdeclast 224(%rcx),%xmm9,%xmm9 + vpxor %xmm0,%xmm9,%xmm9 + + + + vmovdqu %xmm9,(%rsi) + vmovdqu8 %xmm10,16(%rsi){%k1} + jmp .L_ret_amivrujEyduiFoi + +.L_final_block_is_only_block_amivrujEyduiFoi: + vmovdqa (%rsp),%xmm0 + andq $0xf,%rdx + jne .L_steal_cipher_with_tweak_amivrujEyduiFoi + +.L_final_block_amivrujEyduiFoi: + vmovdqa (%rdi),%xmm8 + vpxor %xmm0,%xmm8,%xmm8 + vpxor (%rcx),%xmm8,%xmm8 + vaesdec 16(%rcx),%xmm8,%xmm8 + vaesdec 32(%rcx),%xmm8,%xmm8 + vaesdec 48(%rcx),%xmm8,%xmm8 + vaesdec 64(%rcx),%xmm8,%xmm8 + vaesdec 80(%rcx),%xmm8,%xmm8 + vaesdec 96(%rcx),%xmm8,%xmm8 + vaesdec 112(%rcx),%xmm8,%xmm8 + vaesdec 128(%rcx),%xmm8,%xmm8 + vaesdec 144(%rcx),%xmm8,%xmm8 + vaesdec 160(%rcx),%xmm8,%xmm8 + vaesdec 176(%rcx),%xmm8,%xmm8 + vaesdec 192(%rcx),%xmm8,%xmm8 + vaesdec 208(%rcx),%xmm8,%xmm8 + vaesdeclast 224(%rcx),%xmm8,%xmm8 + vpxor %xmm0,%xmm8,%xmm8 + vmovdqa %xmm8,(%rsi) + +.L_ret_amivrujEyduiFoi: + movq 128(%rsp),%rbx + xorq %r8,%r8 + movq %r8,128(%rsp) + vpxorq %zmm0,%zmm0,%zmm0 + movq %rbp,%rsp + popq %rbp + vzeroupper + .byte 0xf3,0xc3 + +.L_less_than_128_bytes_amivrujEyduiFoi: + vpbroadcastq %r10,%zmm25 + cmpq $0x10,%r11 + jb .L_ret_amivrujEyduiFoi + vbroadcasti32x4 (%rsp),%zmm0 + vbroadcasti32x4 shufb_15_7(%rip),%zmm8 + movl $0xaa,%r8d + kmovq %r8,%k2 + movq %r11,%r8 + andq $0x70,%r8 + cmpq $0x60,%r8 + je .L_num_blocks_is_6_amivrujEyduiFoi + cmpq $0x50,%r8 + je .L_num_blocks_is_5_amivrujEyduiFoi + cmpq $0x40,%r8 + je .L_num_blocks_is_4_amivrujEyduiFoi + cmpq $0x30,%r8 + je .L_num_blocks_is_3_amivrujEyduiFoi + cmpq $0x20,%r8 + je .L_num_blocks_is_2_amivrujEyduiFoi + cmpq $0x10,%r8 + je .L_num_blocks_is_1_amivrujEyduiFoi + +.L_num_blocks_is_7_amivrujEyduiFoi: + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + vpsllvq const_dq7654(%rip),%zmm0,%zmm5 + vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 +.byte 98,147,77,72,68,249,0 + vpxorq %zmm6,%zmm5,%zmm5{%k2} + vpxord %zmm5,%zmm7,%zmm10 + movq $0x0000ffffffffffff,%r8 + kmovq %r8,%k1 + vmovdqu8 0(%rdi),%zmm1 + vmovdqu8 64(%rdi),%zmm2{%k1} + + addq $0x70,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 + vbroadcasti32x4 16(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 32(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 48(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + vbroadcasti32x4 64(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 80(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + 
vbroadcasti32x4 96(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 112(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 128(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 144(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 160(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 176(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 192(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 208(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 224(%rcx),%zmm0 +.byte 98,242,117,72,223,200 +.byte 98,242,109,72,223,208 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm2,%zmm2 + vmovdqu8 %zmm1,0(%rsi) + vmovdqu8 %zmm2,64(%rsi){%k1} + addq $0x70,%rsi + + vextracti32x4 $0x3,%zmm10,%xmm0 + + andq $0xf,%rdx + je .L_final_block_amivrujEyduiFoi + + vpsrldq $0xf,%zmm9,%zmm13 +.byte 98,19,21,72,68,241,0 + vpslldq $0x1,%zmm9,%zmm11 + vpxord %zmm14,%zmm11,%zmm11 + vextracti32x4 $0x0,%zmm11,%xmm15 + jmp .L_steal_cipher_amivrujEyduiFoi +.L_num_blocks_is_6_amivrujEyduiFoi: + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + vpsllvq const_dq7654(%rip),%zmm0,%zmm5 + vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 +.byte 98,147,77,72,68,249,0 + vpxorq %zmm6,%zmm5,%zmm5{%k2} + vpxord %zmm5,%zmm7,%zmm10 + vmovdqu8 0(%rdi),%zmm1 + vmovdqu8 64(%rdi),%ymm2 + addq $96,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 + vbroadcasti32x4 16(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 32(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 48(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 208(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 160(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 176(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 192(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 208(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 352(%rsp),%zmm0 + vbroadcasti32x4 224(%rcx),%zmm0 .byte 98,242,117,72,223,200 .byte 98,242,109,72,223,208 - - vpxorq %zmm9,%zmm1,%zmm1 vpxorq %zmm10,%zmm2,%zmm2 + vmovdqu8 %zmm1,0(%rsi) + vmovdqu8 
%ymm2,64(%rsi) + addq $96,%rsi - - vmovdqa32 %zmm15,%zmm9 - vmovdqa32 %zmm16,%zmm10 - vmovdqu8 %zmm1,(%rsi) - vmovdqu8 %zmm2,64(%rsi) - addq $0x80,%rsi - subq $0x80,%rdx - cmpq $0x80,%rdx - jge .L_main_loop_run_8_amivrujEyduiFoi - jmp .L_do_n_blocks_amivrujEyduiFoi - -.L_steal_cipher_amivrujEyduiFoi: - - vmovdqa %xmm8,%xmm2 - - - leaq vpshufb_shf_table(%rip),%rax - vmovdqu (%rax,%rdx,1),%xmm10 - vpshufb %xmm10,%xmm8,%xmm8 + vextracti32x4 $0x2,%zmm10,%xmm0 + vextracti32x4 $0x3,%zmm10,%xmm15 + andq $0xf,%rdx + je .L_final_block_amivrujEyduiFoi + jmp .L_steal_cipher_amivrujEyduiFoi +.L_num_blocks_is_5_amivrujEyduiFoi: + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + vpsllvq const_dq7654(%rip),%zmm0,%zmm5 + vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 +.byte 98,147,77,72,68,249,0 + vpxorq %zmm6,%zmm5,%zmm5{%k2} + vpxord %zmm5,%zmm7,%zmm10 + vmovdqu8 0(%rdi),%zmm1 + vmovdqu8 64(%rdi),%xmm2 + addq $80,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 + vbroadcasti32x4 16(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 - vmovdqu -16(%rdi,%rdx,1),%xmm3 - vmovdqu %xmm8,-16(%rsi,%rdx,1) + vbroadcasti32x4 32(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 - leaq vpshufb_shf_table(%rip),%rax - addq $16,%rax - subq %rdx,%rax - vmovdqu (%rax),%xmm10 - vpxor mask1(%rip),%xmm10,%xmm10 - vpshufb %xmm10,%xmm3,%xmm3 + vbroadcasti32x4 48(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 - vpblendvb %xmm10,%xmm2,%xmm3,%xmm3 + vbroadcasti32x4 64(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 - vpxor %xmm0,%xmm3,%xmm8 + vbroadcasti32x4 80(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 - vpxor 128(%rsp),%xmm8,%xmm8 -.byte 98,114,61,8,222,132,36,144,0,0,0 -.byte 98,114,61,8,222,132,36,160,0,0,0 -.byte 98,114,61,8,222,132,36,176,0,0,0 -.byte 98,114,61,8,222,132,36,192,0,0,0 -.byte 98,114,61,8,222,132,36,208,0,0,0 -.byte 98,114,61,8,222,132,36,224,0,0,0 -.byte 98,114,61,8,222,132,36,240,0,0,0 -.byte 98,114,61,8,222,132,36,0,1,0,0 -.byte 98,114,61,8,222,132,36,16,1,0,0 -.byte 98,114,61,8,222,132,36,32,1,0,0 -.byte 98,114,61,8,222,132,36,48,1,0,0 -.byte 98,114,61,8,222,132,36,64,1,0,0 -.byte 98,114,61,8,222,132,36,80,1,0,0 -.byte 98,114,61,8,223,132,36,96,1,0,0 + vbroadcasti32x4 96(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 - vpxor %xmm0,%xmm8,%xmm8 + vbroadcasti32x4 112(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 -.L_done_amivrujEyduiFoi: - vmovdqu %xmm8,-16(%rsi) -.L_ret_amivrujEyduiFoi: - movq 368(%rsp),%rbx - xorq %r8,%r8 - movq %r8,368(%rsp) + vbroadcasti32x4 128(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 - vpxorq %zmm0,%zmm0,%zmm0 - vmovdqa64 %zmm0,128(%rsp) - vmovdqa64 %zmm0,192(%rsp) - vmovdqa64 %zmm0,256(%rsp) + vbroadcasti32x4 144(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + vbroadcasti32x4 160(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 - movq $0x3f,%r8 - kmovq %r8,%k2 - vmovdqa64 %zmm0,320(%rsp){%k2} - movq %rbp,%rsp - popq %rbp - vzeroupper - .byte 0xf3,0xc3 + vbroadcasti32x4 176(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 -.L_less_than_128_bytes_amivrujEyduiFoi: - cmpq $0x10,%rdx - jb .L_ret_amivrujEyduiFoi - movq %rdx,%r8 - andq 
$0x70,%r8 - cmpq $0x60,%r8 - je .L_num_blocks_is_6_amivrujEyduiFoi - cmpq $0x50,%r8 - je .L_num_blocks_is_5_amivrujEyduiFoi - cmpq $0x40,%r8 - je .L_num_blocks_is_4_amivrujEyduiFoi - cmpq $0x30,%r8 - je .L_num_blocks_is_3_amivrujEyduiFoi - cmpq $0x20,%r8 - je .L_num_blocks_is_2_amivrujEyduiFoi - cmpq $0x10,%r8 - je .L_num_blocks_is_1_amivrujEyduiFoi + vbroadcasti32x4 192(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 -.L_num_blocks_is_7_amivrujEyduiFoi: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa 16(%rsp),%xmm10 - vmovdqu 16(%rdi),%xmm2 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,32(%rsp) - movq %rbx,40(%rsp) - vmovdqa 32(%rsp),%xmm11 - vmovdqu 32(%rdi),%xmm3 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,48(%rsp) - movq %rbx,56(%rsp) - vmovdqa 48(%rsp),%xmm12 - vmovdqu 48(%rdi),%xmm4 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,64(%rsp) - movq %rbx,72(%rsp) - vmovdqa 64(%rsp),%xmm13 - vmovdqu 64(%rdi),%xmm5 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,80(%rsp) - movq %rbx,88(%rsp) - vmovdqa 80(%rsp),%xmm14 - vmovdqu 80(%rdi),%xmm6 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,96(%rsp) - movq %rbx,104(%rsp) - vmovdqa 96(%rsp),%xmm15 - vmovdqu 96(%rdi),%xmm7 - addq $0x70,%rdi - andq $0xf,%rdx - je .L_done_7_amivrujEyduiFoi - -.L_steal_cipher_7_amivrujEyduiFoi: - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa64 %xmm15,%xmm16 - vmovdqa 16(%rsp),%xmm15 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vpxor %xmm14,%xmm6,%xmm6 - vpxor %xmm15,%xmm7,%xmm7 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm0,%xmm4,%xmm4 - vpxor %xmm0,%xmm5,%xmm5 - vpxor %xmm0,%xmm6,%xmm6 - vpxor %xmm0,%xmm7,%xmm7 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 
98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 -.byte 98,242,101,8,223,216 -.byte 98,242,93,8,223,224 -.byte 98,242,85,8,223,232 -.byte 98,242,77,8,223,240 -.byte 98,242,69,8,223,248 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vpxor %xmm14,%xmm6,%xmm6 - vpxor %xmm15,%xmm7,%xmm7 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - vmovdqu %xmm4,48(%rsi) - vmovdqu %xmm5,64(%rsi) - vmovdqu %xmm6,80(%rsi) - addq $0x70,%rsi - vmovdqa64 %xmm16,%xmm0 - vmovdqa %xmm7,%xmm8 - jmp .L_steal_cipher_amivrujEyduiFoi -.L_done_7_amivrujEyduiFoi: - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vpxor %xmm14,%xmm6,%xmm6 - vpxor %xmm15,%xmm7,%xmm7 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm0,%xmm4,%xmm4 - vpxor %xmm0,%xmm5,%xmm5 - vpxor %xmm0,%xmm6,%xmm6 - vpxor %xmm0,%xmm7,%xmm7 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - 
vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 -.byte 98,242,101,8,223,216 -.byte 98,242,93,8,223,224 -.byte 98,242,85,8,223,232 -.byte 98,242,77,8,223,240 -.byte 98,242,69,8,223,248 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vpxor %xmm14,%xmm6,%xmm6 - vpxor %xmm15,%xmm7,%xmm7 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - vmovdqu %xmm4,48(%rsi) - vmovdqu %xmm5,64(%rsi) - vmovdqu %xmm6,80(%rsi) - addq $0x70,%rsi - vmovdqa %xmm7,%xmm8 - jmp .L_done_amivrujEyduiFoi + vbroadcasti32x4 208(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 -.L_num_blocks_is_6_amivrujEyduiFoi: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa 16(%rsp),%xmm10 - vmovdqu 16(%rdi),%xmm2 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,32(%rsp) - movq %rbx,40(%rsp) - vmovdqa 32(%rsp),%xmm11 - vmovdqu 32(%rdi),%xmm3 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,48(%rsp) - movq %rbx,56(%rsp) - vmovdqa 48(%rsp),%xmm12 - vmovdqu 48(%rdi),%xmm4 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,64(%rsp) - movq %rbx,72(%rsp) - vmovdqa 64(%rsp),%xmm13 - vmovdqu 64(%rdi),%xmm5 - xorq %r11,%r11 - shlq $1,%rax - adcq 
%rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,80(%rsp) - movq %rbx,88(%rsp) - vmovdqa 80(%rsp),%xmm14 - vmovdqu 80(%rdi),%xmm6 - addq $0x60,%rdi - andq $0xf,%rdx - je .L_done_6_amivrujEyduiFoi - -.L_steal_cipher_6_amivrujEyduiFoi: - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa64 %xmm14,%xmm15 - vmovdqa 16(%rsp),%xmm14 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vpxor %xmm14,%xmm6,%xmm6 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm0,%xmm4,%xmm4 - vpxor %xmm0,%xmm5,%xmm5 - vpxor %xmm0,%xmm6,%xmm6 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 -.byte 98,242,101,8,223,216 -.byte 98,242,93,8,223,224 -.byte 98,242,85,8,223,232 -.byte 98,242,77,8,223,240 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor 
%xmm13,%xmm5,%xmm5 - vpxor %xmm14,%xmm6,%xmm6 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - vmovdqu %xmm4,48(%rsi) - vmovdqu %xmm5,64(%rsi) - addq $0x60,%rsi - vmovdqa %xmm15,%xmm0 - vmovdqa %xmm6,%xmm8 - jmp .L_steal_cipher_amivrujEyduiFoi -.L_done_6_amivrujEyduiFoi: - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vpxor %xmm14,%xmm6,%xmm6 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm0,%xmm4,%xmm4 - vpxor %xmm0,%xmm5,%xmm5 - vpxor %xmm0,%xmm6,%xmm6 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 -.byte 98,242,101,8,223,216 -.byte 98,242,93,8,223,224 -.byte 98,242,85,8,223,232 -.byte 98,242,77,8,223,240 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vpxor %xmm14,%xmm6,%xmm6 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - 
vmovdqu %xmm4,48(%rsi) - vmovdqu %xmm5,64(%rsi) - addq $0x60,%rsi - vmovdqa %xmm6,%xmm8 - jmp .L_done_amivrujEyduiFoi + vbroadcasti32x4 224(%rcx),%zmm0 +.byte 98,242,117,72,223,200 +.byte 98,242,109,72,223,208 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm2,%zmm2 + vmovdqu8 %zmm1,0(%rsi) + vmovdqu8 %xmm2,64(%rsi) + addq $80,%rsi -.L_num_blocks_is_5_amivrujEyduiFoi: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa 16(%rsp),%xmm10 - vmovdqu 16(%rdi),%xmm2 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,32(%rsp) - movq %rbx,40(%rsp) - vmovdqa 32(%rsp),%xmm11 - vmovdqu 32(%rdi),%xmm3 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,48(%rsp) - movq %rbx,56(%rsp) - vmovdqa 48(%rsp),%xmm12 - vmovdqu 48(%rdi),%xmm4 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,64(%rsp) - movq %rbx,72(%rsp) - vmovdqa 64(%rsp),%xmm13 - vmovdqu 64(%rdi),%xmm5 - addq $0x50,%rdi + vmovdqa %xmm2,%xmm8 + vextracti32x4 $0x1,%zmm10,%xmm0 + vextracti32x4 $0x2,%zmm10,%xmm15 andq $0xf,%rdx - je .L_done_5_amivrujEyduiFoi - -.L_steal_cipher_5_amivrujEyduiFoi: - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa64 %xmm13,%xmm14 - vmovdqa 16(%rsp),%xmm13 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm0,%xmm4,%xmm4 - vpxor %xmm0,%xmm5,%xmm5 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 
-.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 -.byte 98,242,101,8,223,216 -.byte 98,242,93,8,223,224 -.byte 98,242,85,8,223,232 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - vmovdqu %xmm4,48(%rsi) - addq $0x50,%rsi - vmovdqa %xmm14,%xmm0 - vmovdqa %xmm5,%xmm8 + je .L_final_block_amivrujEyduiFoi jmp .L_steal_cipher_amivrujEyduiFoi - -.L_done_5_amivrujEyduiFoi: - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm0,%xmm4,%xmm4 - vpxor %xmm0,%xmm5,%xmm5 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 -.byte 98,242,101,8,223,216 -.byte 98,242,93,8,223,224 -.byte 
98,242,85,8,223,232 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - vmovdqu %xmm4,48(%rsi) - addq $0x50,%rsi - vmovdqa %xmm5,%xmm8 - jmp .L_done_amivrujEyduiFoi - .L_num_blocks_is_4_amivrujEyduiFoi: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa 16(%rsp),%xmm10 - vmovdqu 16(%rdi),%xmm2 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,32(%rsp) - movq %rbx,40(%rsp) - vmovdqa 32(%rsp),%xmm11 - vmovdqu 32(%rdi),%xmm3 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,48(%rsp) - movq %rbx,56(%rsp) - vmovdqa 48(%rsp),%xmm12 - vmovdqu 48(%rdi),%xmm4 - addq $0x40,%rdi + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + vpsllvq const_dq7654(%rip),%zmm0,%zmm5 + vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 +.byte 98,147,77,72,68,249,0 + vpxorq %zmm6,%zmm5,%zmm5{%k2} + vpxord %zmm5,%zmm7,%zmm10 + vmovdqu8 0(%rdi),%zmm1 + addq $64,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vbroadcasti32x4 16(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 32(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 48(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 64(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 80(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 96(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 112(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 128(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 144(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 160(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 176(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 192(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 208(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 224(%rcx),%zmm0 +.byte 98,242,117,72,223,200 + vpxorq %zmm9,%zmm1,%zmm1 + vmovdqu8 %zmm1,0(%rsi) + addq $64,%rsi + vmovdqa %xmm10,%xmm0 + vextracti32x4 $0x1,%zmm10,%xmm15 andq $0xf,%rdx - je .L_done_4_amivrujEyduiFoi - -.L_steal_cipher_4_amivrujEyduiFoi: - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa64 %xmm12,%xmm13 - vmovdqa 16(%rsp),%xmm12 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm0,%xmm4,%xmm4 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 208(%rsp),%xmm0 
-.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 -.byte 98,242,101,8,223,216 -.byte 98,242,93,8,223,224 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - addq $0x40,%rsi - vmovdqa %xmm13,%xmm0 - vmovdqa %xmm4,%xmm8 + je .L_final_block_amivrujEyduiFoi jmp .L_steal_cipher_amivrujEyduiFoi - -.L_done_4_amivrujEyduiFoi: - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm0,%xmm4,%xmm4 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 
98,242,93,8,222,224 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 -.byte 98,242,101,8,223,216 -.byte 98,242,93,8,223,224 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - addq $0x40,%rsi - vmovdqa %xmm4,%xmm8 - jmp .L_done_amivrujEyduiFoi - .L_num_blocks_is_3_amivrujEyduiFoi: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa 16(%rsp),%xmm10 - vmovdqu 16(%rdi),%xmm2 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,32(%rsp) - movq %rbx,40(%rsp) - vmovdqa 32(%rsp),%xmm11 - vmovdqu 32(%rdi),%xmm3 - addq $0x30,%rdi + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + vpsllvq const_dq7654(%rip),%zmm0,%zmm5 + vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 +.byte 98,147,77,72,68,249,0 + vpxorq %zmm6,%zmm5,%zmm5{%k2} + vpxord %zmm5,%zmm7,%zmm10 + movq $0x0000ffffffffffff,%r8 + kmovq %r8,%k1 + vmovdqu8 0(%rdi),%zmm1{%k1} + addq $48,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vbroadcasti32x4 16(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 32(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 48(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 64(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 80(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 96(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 112(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 128(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 144(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 160(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 176(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 192(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 208(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 224(%rcx),%zmm0 +.byte 98,242,117,72,223,200 + vpxorq %zmm9,%zmm1,%zmm1 + vmovdqu8 %zmm1,0(%rsi){%k1} + addq $48,%rsi + vextracti32x4 $3,%zmm9,%xmm0 + vextracti32x4 $0,%zmm10,%xmm15 andq $0xf,%rdx - je .L_done_3_amivrujEyduiFoi - -.L_steal_cipher_3_amivrujEyduiFoi: - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa64 %xmm11,%xmm12 - vmovdqa 16(%rsp),%xmm11 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 
98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 -.byte 98,242,101,8,223,216 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - addq $0x30,%rsi - vmovdqa %xmm12,%xmm0 - vmovdqa %xmm3,%xmm8 + je .L_final_block_amivrujEyduiFoi jmp .L_steal_cipher_amivrujEyduiFoi - -.L_done_3_amivrujEyduiFoi: - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 -.byte 98,242,101,8,223,216 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - addq $0x30,%rsi - vmovdqa %xmm3,%xmm8 - jmp .L_done_amivrujEyduiFoi - .L_num_blocks_is_2_amivrujEyduiFoi: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa 16(%rsp),%xmm10 - vmovdqu 
16(%rdi),%xmm2 - addq $0x20,%rdi + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + + vmovdqu8 0(%rdi),%ymm1 + addq $32,%rdi + vbroadcasti32x4 (%rcx),%ymm0 + vpternlogq $0x96,%ymm0,%ymm9,%ymm1 + vbroadcasti32x4 16(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 32(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 48(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 64(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 80(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 96(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 112(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 128(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 144(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 160(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 176(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 192(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 208(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 224(%rcx),%ymm0 +.byte 98,242,117,40,223,200 + vpxorq %ymm9,%ymm1,%ymm1 + vmovdqu8 %ymm1,0(%rsi) + addq $32,%rsi + + vextracti32x4 $2,%zmm9,%xmm0 + vextracti32x4 $3,%zmm9,%xmm15 andq $0xf,%rdx - je .L_done_2_amivrujEyduiFoi - -.L_steal_cipher_2_amivrujEyduiFoi: - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa64 %xmm10,%xmm11 - vmovdqa 16(%rsp),%xmm10 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vmovdqu %xmm1,(%rsi) - addq $0x20,%rsi - vmovdqa %xmm11,%xmm0 - vmovdqa %xmm2,%xmm8 + je .L_final_block_amivrujEyduiFoi jmp .L_steal_cipher_amivrujEyduiFoi +.L_num_blocks_is_1_amivrujEyduiFoi: + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 -.L_done_2_amivrujEyduiFoi: - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 
-.byte 98,242,109,8,222,208 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vmovdqu %xmm1,(%rsi) - addq $0x20,%rsi - vmovdqa %xmm2,%xmm8 - jmp .L_done_amivrujEyduiFoi + vmovdqu8 0(%rdi),%xmm1 + addq $16,%rdi + vbroadcasti32x4 (%rcx),%ymm0 + vpternlogq $0x96,%ymm0,%ymm9,%ymm1 + vbroadcasti32x4 16(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 32(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 48(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 64(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 80(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 96(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 112(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 128(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 144(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 160(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 176(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 192(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 208(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 224(%rcx),%ymm0 +.byte 98,242,117,40,223,200 + vpxorq %ymm9,%ymm1,%ymm1 + vmovdqu8 %xmm1,0(%rsi) + addq $16,%rsi -.L_num_blocks_is_1_amivrujEyduiFoi: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - addq $0x10,%rdi - andq $0xf,%rdx - je .L_done_1_amivrujEyduiFoi - -.L_steal_cipher_1_amivrujEyduiFoi: - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa64 %xmm9,%xmm10 - vmovdqa 16(%rsp),%xmm9 - vpxor %xmm9,%xmm1,%xmm1 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 - vpxor 
%xmm9,%xmm1,%xmm1 - addq $0x10,%rsi - vmovdqa %xmm10,%xmm0 vmovdqa %xmm1,%xmm8 + vextracti32x4 $1,%zmm9,%xmm0 + vextracti32x4 $2,%zmm9,%xmm15 + andq $0xf,%rdx + je .L_final_block_amivrujEyduiFoi jmp .L_steal_cipher_amivrujEyduiFoi - -.L_done_1_amivrujEyduiFoi: - vpxor %xmm9,%xmm1,%xmm1 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 - vpxor %xmm9,%xmm1,%xmm1 - addq $0x10,%rsi - vmovdqa %xmm1,%xmm8 - jmp .L_done_amivrujEyduiFoi .cfi_endproc .section .rodata .align 16 diff --git a/generated-src/mac-x86_64/crypto/fipsmodule/aesni-xts-avx512.S b/generated-src/mac-x86_64/crypto/fipsmodule/aesni-xts-avx512.S index a3ce4e45f5..0139635d81 100644 --- a/generated-src/mac-x86_64/crypto/fipsmodule/aesni-xts-avx512.S +++ b/generated-src/mac-x86_64/crypto/fipsmodule/aesni-xts-avx512.S @@ -16,102 +16,26 @@ _aes_hw_xts_encrypt_avx512: .byte 243,15,30,250 pushq %rbp movq %rsp,%rbp - subq $376,%rsp + subq $136,%rsp andq $0xffffffffffffffc0,%rsp - movq %rbx,368(%rsp) + movq %rbx,128(%rsp) movq $0x87,%r10 vmovdqu (%r9),%xmm1 - vpxor %xmm4,%xmm4,%xmm4 - vmovdqu (%r8),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - - vmovdqu (%rcx),%xmm2 - vmovdqa %xmm2,128(%rsp) - - vmovdqu 16(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 16(%rcx),%xmm2 - vmovdqa %xmm2,144(%rsp) - - vmovdqu 32(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 32(%rcx),%xmm2 - vmovdqa %xmm2,160(%rsp) - - vmovdqu 48(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 48(%rcx),%xmm2 - vmovdqa %xmm2,176(%rsp) - - vmovdqu 64(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 64(%rcx),%xmm2 - vmovdqa %xmm2,192(%rsp) - - vmovdqu 80(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 80(%rcx),%xmm2 - vmovdqa %xmm2,208(%rsp) - - vmovdqu 96(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 96(%rcx),%xmm2 - vmovdqa %xmm2,224(%rsp) - - vmovdqu 112(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 112(%rcx),%xmm2 - vmovdqa %xmm2,240(%rsp) - - vmovdqu 128(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 128(%rcx),%xmm2 - vmovdqa %xmm2,256(%rsp) - - vmovdqu 144(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 144(%rcx),%xmm2 - vmovdqa %xmm2,272(%rsp) - - vmovdqu 160(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 160(%rcx),%xmm2 - vmovdqa %xmm2,288(%rsp) - - vmovdqu 176(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 176(%rcx),%xmm2 - vmovdqa %xmm2,304(%rsp) - - vmovdqu 192(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 192(%rcx),%xmm2 - vmovdqa %xmm2,320(%rsp) - - vmovdqu 208(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 208(%rcx),%xmm2 - vmovdqa %xmm2,336(%rsp) - - vmovdqu 224(%r8),%xmm0 -.byte 98,242,117,8,221,200 - - vmovdqu 224(%rcx),%xmm2 - vmovdqa %xmm2,352(%rsp) - + vpxor (%r8),%xmm1,%xmm1 + vaesenc 16(%r8),%xmm1,%xmm1 + vaesenc 32(%r8),%xmm1,%xmm1 + vaesenc 48(%r8),%xmm1,%xmm1 + vaesenc 
64(%r8),%xmm1,%xmm1 + vaesenc 80(%r8),%xmm1,%xmm1 + vaesenc 96(%r8),%xmm1,%xmm1 + vaesenc 112(%r8),%xmm1,%xmm1 + vaesenc 128(%r8),%xmm1,%xmm1 + vaesenc 144(%r8),%xmm1,%xmm1 + vaesenc 160(%r8),%xmm1,%xmm1 + vaesenc 176(%r8),%xmm1,%xmm1 + vaesenc 192(%r8),%xmm1,%xmm1 + vaesenc 208(%r8),%xmm1,%xmm1 + vaesenclast 224(%r8),%xmm1,%xmm1 vmovdqa %xmm1,(%rsp) cmpq $0x80,%rdx @@ -144,95 +68,83 @@ L$_do_n_blocks_hEgxyDlCngwrfFe: jmp L$_steal_cipher_hEgxyDlCngwrfFe L$_remaining_num_blocks_is_7_hEgxyDlCngwrfFe: - movq $0xffffffffffffffff,%r8 - shrq $0x10,%r8 + movq $0x0000ffffffffffff,%r8 kmovq %r8,%k1 vmovdqu8 (%rdi),%zmm1 vmovdqu8 64(%rdi),%zmm2{%k1} addq $0x70,%rdi - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 - vbroadcasti32x4 144(%rsp),%zmm0 + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 160(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 176(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 192(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 208(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 160(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 176(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 192(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 208(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 352(%rsp),%zmm0 + vbroadcasti32x4 224(%rcx),%zmm0 .byte 98,242,117,72,221,200 .byte 98,242,109,72,221,208 - - vpxorq %zmm9,%zmm1,%zmm1 vpxorq %zmm10,%zmm2,%zmm2 - - - vmovdqa32 %zmm15,%zmm9 - vmovdqa32 %zmm16,%zmm10 vmovdqu8 %zmm1,(%rsi) vmovdqu8 %zmm2,64(%rsi){%k1} addq $0x70,%rsi @@ -246,89 +158,78 @@ L$_remaining_num_blocks_is_6_hEgxyDlCngwrfFe: vmovdqu8 (%rdi),%zmm1 vmovdqu8 64(%rdi),%ymm2 addq $0x60,%rdi - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 - vbroadcasti32x4 144(%rsp),%zmm0 + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 160(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 176(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,220,200 
.byte 98,242,109,72,220,208 - vbroadcasti32x4 192(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 208(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 160(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 176(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 192(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 208(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 352(%rsp),%zmm0 + vbroadcasti32x4 224(%rcx),%zmm0 .byte 98,242,117,72,221,200 .byte 98,242,109,72,221,208 - - vpxorq %zmm9,%zmm1,%zmm1 vpxorq %zmm10,%zmm2,%zmm2 - - - vmovdqa32 %zmm15,%zmm9 - vmovdqa32 %zmm16,%zmm10 vmovdqu8 %zmm1,(%rsi) vmovdqu8 %ymm2,64(%rsi) addq $0x60,%rsi @@ -342,89 +243,78 @@ L$_remaining_num_blocks_is_5_hEgxyDlCngwrfFe: vmovdqu8 (%rdi),%zmm1 vmovdqu 64(%rdi),%xmm2 addq $0x50,%rdi - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 - vbroadcasti32x4 144(%rsp),%zmm0 + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 160(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 176(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 192(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 208(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 160(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 176(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 192(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 208(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - 
vbroadcasti32x4 352(%rsp),%zmm0 + vbroadcasti32x4 224(%rcx),%zmm0 .byte 98,242,117,72,221,200 .byte 98,242,109,72,221,208 - - vpxorq %zmm9,%zmm1,%zmm1 vpxorq %zmm10,%zmm2,%zmm2 - - - vmovdqa32 %zmm15,%zmm9 - vmovdqa32 %zmm16,%zmm10 vmovdqu8 %zmm1,(%rsi) vmovdqu %xmm2,64(%rsi) addq $0x50,%rsi @@ -437,236 +327,125 @@ L$_remaining_num_blocks_is_5_hEgxyDlCngwrfFe: L$_remaining_num_blocks_is_4_hEgxyDlCngwrfFe: vmovdqu8 (%rdi),%zmm1 addq $0x40,%rdi - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 - vbroadcasti32x4 144(%rsp),%zmm0 + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - - vbroadcasti32x4 160(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - - vbroadcasti32x4 176(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - vbroadcasti32x4 192(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - - vbroadcasti32x4 208(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - - vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - - vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 160(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 176(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - - vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 192(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 208(%rcx),%zmm0 .byte 98,242,117,72,220,200 -.byte 98,242,109,72,220,208 - - - vbroadcasti32x4 352(%rsp),%zmm0 + vbroadcasti32x4 224(%rcx),%zmm0 .byte 98,242,117,72,221,200 -.byte 98,242,109,72,221,208 - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vmovdqa32 %zmm15,%zmm9 - vmovdqa32 %zmm16,%zmm10 vmovdqu8 %zmm1,(%rsi) addq $0x40,%rsi vextracti32x4 $0x3,%zmm1,%xmm8 - vextracti32x4 $0x0,%zmm10,%xmm0 + vmovdqa64 %xmm10,%xmm0 andq $0xf,%rdx je L$_ret_hEgxyDlCngwrfFe jmp L$_steal_cipher_hEgxyDlCngwrfFe L$_remaining_num_blocks_is_3_hEgxyDlCngwrfFe: - vextracti32x4 $0x1,%zmm9,%xmm10 - vextracti32x4 $0x2,%zmm9,%xmm11 - vmovdqu (%rdi),%xmm1 - vmovdqu 16(%rdi),%xmm2 - vmovdqu 32(%rdi),%xmm3 + movq $-1,%r8 + shrq $0x10,%r8 + kmovq %r8,%k1 + vmovdqu8 (%rdi),%zmm1{%k1} addq $0x30,%rdi - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 
98,242,101,8,220,216 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,221,200 -.byte 98,242,109,8,221,208 -.byte 98,242,101,8,221,216 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vbroadcasti32x4 16(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 32(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 48(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 64(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 80(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 96(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 112(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 128(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 144(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 160(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 176(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 192(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 208(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 224(%rcx),%zmm0 +.byte 98,242,117,72,221,200 + vpxorq %zmm9,%zmm1,%zmm1 + vmovdqu8 %zmm1,(%rsi){%k1} addq $0x30,%rsi - vmovdqa %xmm3,%xmm8 + vextracti32x4 $0x2,%zmm1,%xmm8 vextracti32x4 $0x3,%zmm9,%xmm0 andq $0xf,%rdx je L$_ret_hEgxyDlCngwrfFe jmp L$_steal_cipher_hEgxyDlCngwrfFe L$_remaining_num_blocks_is_2_hEgxyDlCngwrfFe: - vextracti32x4 $0x1,%zmm9,%xmm10 - vmovdqu (%rdi),%xmm1 - vmovdqu 16(%rdi),%xmm2 + vmovdqu8 (%rdi),%ymm1 addq $0x20,%rdi - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 272(%rsp),%xmm0 -.byte 
98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,221,200 -.byte 98,242,109,8,221,208 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) + vbroadcasti32x4 (%rcx),%ymm0 + vpternlogq $0x96,%ymm0,%ymm9,%ymm1 + vbroadcasti32x4 16(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 32(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 48(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 64(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 80(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 96(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 112(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 128(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 144(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 160(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 176(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 192(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 208(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 224(%rcx),%ymm0 +.byte 98,242,117,40,221,200 + vpxorq %ymm9,%ymm1,%ymm1 + vmovdqu %ymm1,(%rsi) addq $0x20,%rsi - vmovdqa %xmm2,%xmm8 + vextracti32x4 $0x1,%zmm1,%xmm8 vextracti32x4 $0x2,%zmm9,%xmm0 andq $0xf,%rdx je L$_ret_hEgxyDlCngwrfFe @@ -675,36 +454,21 @@ L$_remaining_num_blocks_is_1_hEgxyDlCngwrfFe: vmovdqu (%rdi),%xmm1 addq $0x10,%rdi vpxor %xmm9,%xmm1,%xmm1 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,221,200 + vpxor (%rcx),%xmm1,%xmm1 + vaesenc 16(%rcx),%xmm1,%xmm1 + vaesenc 32(%rcx),%xmm1,%xmm1 + vaesenc 48(%rcx),%xmm1,%xmm1 + vaesenc 64(%rcx),%xmm1,%xmm1 + vaesenc 80(%rcx),%xmm1,%xmm1 + vaesenc 96(%rcx),%xmm1,%xmm1 + vaesenc 112(%rcx),%xmm1,%xmm1 + vaesenc 128(%rcx),%xmm1,%xmm1 + vaesenc 144(%rcx),%xmm1,%xmm1 + vaesenc 160(%rcx),%xmm1,%xmm1 + vaesenc 176(%rcx),%xmm1,%xmm1 + vaesenc 192(%rcx),%xmm1,%xmm1 + vaesenc 208(%rcx),%xmm1,%xmm1 + vaesenclast 224(%rcx),%xmm1,%xmm1 vpxor %xmm9,%xmm1,%xmm1 vmovdqu %xmm1,(%rsi) addq $0x10,%rsi @@ -714,26 +478,36 @@ L$_remaining_num_blocks_is_1_hEgxyDlCngwrfFe: je L$_ret_hEgxyDlCngwrfFe jmp L$_steal_cipher_hEgxyDlCngwrfFe + + L$_start_by16_hEgxyDlCngwrfFe: vbroadcasti32x4 (%rsp),%zmm0 vbroadcasti32x4 shufb_15_7(%rip),%zmm8 movq $0xaa,%r8 kmovq %r8,%k2 vpshufb %zmm8,%zmm0,%zmm1 + + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 .byte 
98,147,109,72,68,217,0 vpxorq %zmm2,%zmm4,%zmm4{%k2} vpxord %zmm4,%zmm3,%zmm9 + + vpsllvq const_dq7654(%rip),%zmm0,%zmm5 vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 .byte 98,147,77,72,68,249,0 vpxorq %zmm6,%zmm5,%zmm5{%k2} vpxord %zmm5,%zmm7,%zmm10 + + vpsrldq $0xf,%zmm9,%zmm13 .byte 98,19,21,72,68,241,0 vpslldq $0x1,%zmm9,%zmm11 vpxord %zmm14,%zmm11,%zmm11 + + vpsrldq $0xf,%zmm10,%zmm15 .byte 98,131,5,72,68,193,0 vpslldq $0x1,%zmm10,%zmm12 @@ -749,7 +523,7 @@ L$_main_loop_run_16_hEgxyDlCngwrfFe: vpxorq %zmm10,%zmm2,%zmm2 vpxorq %zmm11,%zmm3,%zmm3 vpxorq %zmm12,%zmm4,%zmm4 - vbroadcasti32x4 128(%rsp),%zmm0 + vbroadcasti32x4 (%rcx),%zmm0 vpxorq %zmm0,%zmm1,%zmm1 vpxorq %zmm0,%zmm2,%zmm2 vpxorq %zmm0,%zmm3,%zmm3 @@ -758,17 +532,17 @@ L$_main_loop_run_16_hEgxyDlCngwrfFe: .byte 98,19,21,72,68,241,0 vpslldq $0x1,%zmm11,%zmm15 vpxord %zmm14,%zmm15,%zmm15 - vbroadcasti32x4 144(%rsp),%zmm0 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 .byte 98,242,93,72,220,224 - vbroadcasti32x4 160(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 .byte 98,242,93,72,220,224 - vbroadcasti32x4 176(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 @@ -777,17 +551,17 @@ L$_main_loop_run_16_hEgxyDlCngwrfFe: .byte 98,19,21,72,68,241,0 vpslldq $0x1,%zmm12,%zmm16 vpxord %zmm14,%zmm16,%zmm16 - vbroadcasti32x4 192(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 .byte 98,242,93,72,220,224 - vbroadcasti32x4 208(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 .byte 98,242,93,72,220,224 - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 @@ -796,17 +570,17 @@ L$_main_loop_run_16_hEgxyDlCngwrfFe: .byte 98,19,21,72,68,241,0 vpslldq $0x1,%zmm15,%zmm17 vpxord %zmm14,%zmm17,%zmm17 - vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 .byte 98,242,93,72,220,224 - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 .byte 98,242,93,72,220,224 - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 @@ -815,27 +589,27 @@ L$_main_loop_run_16_hEgxyDlCngwrfFe: .byte 98,19,21,72,68,241,0 vpslldq $0x1,%zmm16,%zmm18 vpxord %zmm14,%zmm18,%zmm18 - vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 160(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 .byte 98,242,93,72,220,224 - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 176(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 .byte 98,242,93,72,220,224 - vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 192(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 .byte 98,242,93,72,220,224 - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 208(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 .byte 98,242,101,72,220,216 .byte 98,242,93,72,220,224 - vbroadcasti32x4 352(%rsp),%zmm0 + 
vbroadcasti32x4 224(%rcx),%zmm0 .byte 98,242,117,72,221,200 .byte 98,242,109,72,221,208 .byte 98,242,101,72,221,216 @@ -856,9 +630,9 @@ L$_main_loop_run_16_hEgxyDlCngwrfFe: addq $0x100,%rsi subq $0x100,%rdx cmpq $0x100,%rdx - jge L$_main_loop_run_16_hEgxyDlCngwrfFe + jae L$_main_loop_run_16_hEgxyDlCngwrfFe cmpq $0x80,%rdx - jge L$_main_loop_run_8_hEgxyDlCngwrfFe + jae L$_main_loop_run_8_hEgxyDlCngwrfFe vextracti32x4 $0x3,%zmm4,%xmm0 jmp L$_do_n_blocks_hEgxyDlCngwrfFe @@ -883,29 +657,24 @@ L$_main_loop_run_8_hEgxyDlCngwrfFe: vmovdqu8 (%rdi),%zmm1 vmovdqu8 64(%rdi),%zmm2 addq $0x80,%rdi - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 vpsrldq $0xf,%zmm9,%zmm13 .byte 98,19,21,72,68,241,0 vpslldq $0x1,%zmm9,%zmm15 vpxord %zmm14,%zmm15,%zmm15 - vbroadcasti32x4 144(%rsp),%zmm0 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 160(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 176(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 vpsrldq $0xf,%zmm10,%zmm13 @@ -913,65 +682,61 @@ L$_main_loop_run_8_hEgxyDlCngwrfFe: vpslldq $0x1,%zmm10,%zmm16 vpxord %zmm14,%zmm16,%zmm16 - vbroadcasti32x4 192(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 208(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 160(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 176(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 192(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 208(%rcx),%zmm0 .byte 98,242,117,72,220,200 .byte 98,242,109,72,220,208 - vbroadcasti32x4 352(%rsp),%zmm0 + vbroadcasti32x4 224(%rcx),%zmm0 .byte 98,242,117,72,221,200 .byte 98,242,109,72,221,208 - - vpxorq %zmm9,%zmm1,%zmm1 vpxorq %zmm10,%zmm2,%zmm2 - - vmovdqa32 %zmm15,%zmm9 vmovdqa32 %zmm16,%zmm10 vmovdqu8 %zmm1,(%rsi) @@ -979,20 +744,10 @@ L$_main_loop_run_8_hEgxyDlCngwrfFe: addq $0x80,%rsi subq $0x80,%rdx cmpq $0x80,%rdx - jge L$_main_loop_run_8_hEgxyDlCngwrfFe + jae L$_main_loop_run_8_hEgxyDlCngwrfFe vextracti32x4 $0x3,%zmm2,%xmm0 jmp L$_do_n_blocks_hEgxyDlCngwrfFe -L$_steal_cipher_next_hEgxyDlCngwrfFe: - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,(%rsp) - movq %rbx,8(%rsp) - vmovdqa (%rsp),%xmm0 - L$_steal_cipher_hEgxyDlCngwrfFe: vmovdqa %xmm8,%xmm2 leaq vpshufb_shf_table(%rip),%rax @@ -1008,48 +763,42 @@ L$_steal_cipher_hEgxyDlCngwrfFe: vpshufb 
%xmm10,%xmm3,%xmm3 vpblendvb %xmm10,%xmm2,%xmm3,%xmm3 vpxor %xmm0,%xmm3,%xmm8 - vpxor 128(%rsp),%xmm8,%xmm8 -.byte 98,114,61,8,220,132,36,144,0,0,0 -.byte 98,114,61,8,220,132,36,160,0,0,0 -.byte 98,114,61,8,220,132,36,176,0,0,0 -.byte 98,114,61,8,220,132,36,192,0,0,0 -.byte 98,114,61,8,220,132,36,208,0,0,0 -.byte 98,114,61,8,220,132,36,224,0,0,0 -.byte 98,114,61,8,220,132,36,240,0,0,0 -.byte 98,114,61,8,220,132,36,0,1,0,0 -.byte 98,114,61,8,220,132,36,16,1,0,0 -.byte 98,114,61,8,220,132,36,32,1,0,0 -.byte 98,114,61,8,220,132,36,48,1,0,0 -.byte 98,114,61,8,220,132,36,64,1,0,0 -.byte 98,114,61,8,220,132,36,80,1,0,0 -.byte 98,114,61,8,221,132,36,96,1,0,0 + vpxor (%rcx),%xmm8,%xmm8 + vaesenc 16(%rcx),%xmm8,%xmm8 + vaesenc 32(%rcx),%xmm8,%xmm8 + vaesenc 48(%rcx),%xmm8,%xmm8 + vaesenc 64(%rcx),%xmm8,%xmm8 + vaesenc 80(%rcx),%xmm8,%xmm8 + vaesenc 96(%rcx),%xmm8,%xmm8 + vaesenc 112(%rcx),%xmm8,%xmm8 + vaesenc 128(%rcx),%xmm8,%xmm8 + vaesenc 144(%rcx),%xmm8,%xmm8 + vaesenc 160(%rcx),%xmm8,%xmm8 + vaesenc 176(%rcx),%xmm8,%xmm8 + vaesenc 192(%rcx),%xmm8,%xmm8 + vaesenc 208(%rcx),%xmm8,%xmm8 + vaesenclast 224(%rcx),%xmm8,%xmm8 vpxor %xmm0,%xmm8,%xmm8 vmovdqu %xmm8,-16(%rsi) + L$_ret_hEgxyDlCngwrfFe: - movq 368(%rsp),%rbx + movq 128(%rsp),%rbx xorq %r8,%r8 - movq %r8,368(%rsp) - + movq %r8,128(%rsp) vpxorq %zmm0,%zmm0,%zmm0 - - vmovdqa64 %zmm0,128(%rsp) - vmovdqa64 %zmm0,192(%rsp) - vmovdqa64 %zmm0,256(%rsp) - - - - movq $0x3f,%r8 - kmovq %r8,%k2 - vmovdqa64 %zmm0,320(%rsp){%k2} - movq %rbp,%rsp popq %rbp vzeroupper .byte 0xf3,0xc3 L$_less_than_128_bytes_hEgxyDlCngwrfFe: + vpbroadcastq %r10,%zmm25 cmpq $0x10,%rdx jb L$_ret_hEgxyDlCngwrfFe + vbroadcasti32x4 (%rsp),%zmm0 + vbroadcasti32x4 shufb_15_7(%rip),%zmm8 + movl $0xaa,%r8d + kmovq %r8,%k2 movq %rdx,%r8 andq $0x70,%r8 cmpq $0x60,%r8 @@ -1066,2182 +815,990 @@ L$_less_than_128_bytes_hEgxyDlCngwrfFe: je L$_num_blocks_is_1_hEgxyDlCngwrfFe L$_num_blocks_is_7_hEgxyDlCngwrfFe: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa 16(%rsp),%xmm10 - vmovdqu 16(%rdi),%xmm2 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,32(%rsp) - movq %rbx,40(%rsp) - vmovdqa 32(%rsp),%xmm11 - vmovdqu 32(%rdi),%xmm3 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,48(%rsp) - movq %rbx,56(%rsp) - vmovdqa 48(%rsp),%xmm12 - vmovdqu 48(%rdi),%xmm4 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,64(%rsp) - movq %rbx,72(%rsp) - vmovdqa 64(%rsp),%xmm13 - vmovdqu 64(%rdi),%xmm5 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,80(%rsp) - movq %rbx,88(%rsp) - vmovdqa 80(%rsp),%xmm14 - vmovdqu 80(%rdi),%xmm6 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,96(%rsp) - movq %rbx,104(%rsp) - vmovdqa 96(%rsp),%xmm15 - vmovdqu 96(%rdi),%xmm7 + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + vpsllvq const_dq7654(%rip),%zmm0,%zmm5 + vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 +.byte 98,147,77,72,68,249,0 + vpxorq %zmm6,%zmm5,%zmm5{%k2} + vpxord %zmm5,%zmm7,%zmm10 + movq $0x0000ffffffffffff,%r8 + kmovq %r8,%k1 + vmovdqu8 0(%rdi),%zmm1 + vmovdqu8 
64(%rdi),%zmm2{%k1} + addq $0x70,%rdi - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vpxor %xmm14,%xmm6,%xmm6 - vpxor %xmm15,%xmm7,%xmm7 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm0,%xmm4,%xmm4 - vpxor %xmm0,%xmm5,%xmm5 - vpxor %xmm0,%xmm6,%xmm6 - vpxor %xmm0,%xmm7,%xmm7 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 -.byte 98,242,69,8,220,248 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,221,200 -.byte 98,242,109,8,221,208 -.byte 98,242,101,8,221,216 -.byte 98,242,93,8,221,224 -.byte 98,242,85,8,221,232 -.byte 98,242,77,8,221,240 -.byte 98,242,69,8,221,248 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor 
%xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vpxor %xmm14,%xmm6,%xmm6 - vpxor %xmm15,%xmm7,%xmm7 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - vmovdqu %xmm4,48(%rsi) - vmovdqu %xmm5,64(%rsi) - vmovdqu %xmm6,80(%rsi) - vmovdqu %xmm7,96(%rsi) - addq $0x70,%rsi - vmovdqa %xmm7,%xmm8 - andq $0xf,%rdx - je L$_ret_hEgxyDlCngwrfFe - jmp L$_steal_cipher_next_hEgxyDlCngwrfFe + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 + vbroadcasti32x4 16(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 -L$_num_blocks_is_6_hEgxyDlCngwrfFe: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa 16(%rsp),%xmm10 - vmovdqu 16(%rdi),%xmm2 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,32(%rsp) - movq %rbx,40(%rsp) - vmovdqa 32(%rsp),%xmm11 - vmovdqu 32(%rdi),%xmm3 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,48(%rsp) - movq %rbx,56(%rsp) - vmovdqa 48(%rsp),%xmm12 - vmovdqu 48(%rdi),%xmm4 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,64(%rsp) - movq %rbx,72(%rsp) - vmovdqa 64(%rsp),%xmm13 - vmovdqu 64(%rdi),%xmm5 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,80(%rsp) - movq %rbx,88(%rsp) - vmovdqa 80(%rsp),%xmm14 - vmovdqu 80(%rdi),%xmm6 - addq $0x60,%rdi - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vpxor %xmm14,%xmm6,%xmm6 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm0,%xmm4,%xmm4 - vpxor %xmm0,%xmm5,%xmm5 - vpxor %xmm0,%xmm6,%xmm6 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 272(%rsp),%xmm0 
-.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 -.byte 98,242,77,8,220,240 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,221,200 -.byte 98,242,109,8,221,208 -.byte 98,242,101,8,221,216 -.byte 98,242,93,8,221,224 -.byte 98,242,85,8,221,232 -.byte 98,242,77,8,221,240 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vpxor %xmm14,%xmm6,%xmm6 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - vmovdqu %xmm4,48(%rsi) - vmovdqu %xmm5,64(%rsi) - vmovdqu %xmm6,80(%rsi) - addq $0x60,%rsi - vmovdqa %xmm6,%xmm8 - andq $0xf,%rdx - je L$_ret_hEgxyDlCngwrfFe - jmp L$_steal_cipher_next_hEgxyDlCngwrfFe -L$_num_blocks_is_5_hEgxyDlCngwrfFe: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa 16(%rsp),%xmm10 - vmovdqu 16(%rdi),%xmm2 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,32(%rsp) - movq %rbx,40(%rsp) - vmovdqa 32(%rsp),%xmm11 - vmovdqu 32(%rdi),%xmm3 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,48(%rsp) - movq %rbx,56(%rsp) - vmovdqa 48(%rsp),%xmm12 - vmovdqu 48(%rdi),%xmm4 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,64(%rsp) - movq %rbx,72(%rsp) - vmovdqa 64(%rsp),%xmm13 - vmovdqu 64(%rdi),%xmm5 - addq $0x50,%rdi - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm0,%xmm4,%xmm4 - vpxor %xmm0,%xmm5,%xmm5 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 224(%rsp),%xmm0 -.byte 
98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 -.byte 98,242,85,8,220,232 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,221,200 -.byte 98,242,109,8,221,208 -.byte 98,242,101,8,221,216 -.byte 98,242,93,8,221,224 -.byte 98,242,85,8,221,232 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - vmovdqu %xmm4,48(%rsi) - vmovdqu %xmm5,64(%rsi) - addq $0x50,%rsi - vmovdqa %xmm5,%xmm8 - andq $0xf,%rdx - je L$_ret_hEgxyDlCngwrfFe - jmp L$_steal_cipher_next_hEgxyDlCngwrfFe + vbroadcasti32x4 32(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 -L$_num_blocks_is_4_hEgxyDlCngwrfFe: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa 16(%rsp),%xmm10 - vmovdqu 16(%rdi),%xmm2 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,32(%rsp) - movq %rbx,40(%rsp) - vmovdqa 32(%rsp),%xmm11 - vmovdqu 32(%rdi),%xmm3 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,48(%rsp) - movq %rbx,56(%rsp) - vmovdqa 48(%rsp),%xmm12 - vmovdqu 48(%rdi),%xmm4 - addq $0x40,%rdi - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm0,%xmm4,%xmm4 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 224(%rsp),%xmm0 -.byte 
98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 -.byte 98,242,93,8,220,224 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,221,200 -.byte 98,242,109,8,221,208 -.byte 98,242,101,8,221,216 -.byte 98,242,93,8,221,224 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - vmovdqu %xmm4,48(%rsi) - addq $0x40,%rsi - vmovdqa %xmm4,%xmm8 - andq $0xf,%rdx - je L$_ret_hEgxyDlCngwrfFe - jmp L$_steal_cipher_next_hEgxyDlCngwrfFe -L$_num_blocks_is_3_hEgxyDlCngwrfFe: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa 16(%rsp),%xmm10 - vmovdqu 16(%rdi),%xmm2 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,32(%rsp) - movq %rbx,40(%rsp) - vmovdqa 32(%rsp),%xmm11 - vmovdqu 32(%rdi),%xmm3 - addq $0x30,%rdi - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 
336(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 -.byte 98,242,101,8,220,216 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,221,200 -.byte 98,242,109,8,221,208 -.byte 98,242,101,8,221,216 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - addq $0x30,%rsi - vmovdqa %xmm3,%xmm8 - andq $0xf,%rdx - je L$_ret_hEgxyDlCngwrfFe - jmp L$_steal_cipher_next_hEgxyDlCngwrfFe + vbroadcasti32x4 48(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 -L$_num_blocks_is_2_hEgxyDlCngwrfFe: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa 16(%rsp),%xmm10 - vmovdqu 16(%rdi),%xmm2 - addq $0x20,%rdi - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,220,200 -.byte 98,242,109,8,220,208 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,221,200 -.byte 98,242,109,8,221,208 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - addq $0x20,%rsi - vmovdqa %xmm2,%xmm8 - andq $0xf,%rdx - je L$_ret_hEgxyDlCngwrfFe - jmp L$_steal_cipher_next_hEgxyDlCngwrfFe + vbroadcasti32x4 64(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 -L$_num_blocks_is_1_hEgxyDlCngwrfFe: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - addq $0x10,%rdi - vpxor %xmm9,%xmm1,%xmm1 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,221,200 - vpxor %xmm9,%xmm1,%xmm1 - vmovdqu %xmm1,(%rsi) - addq $0x10,%rsi - 
vmovdqa %xmm1,%xmm8 - andq $0xf,%rdx - je L$_ret_hEgxyDlCngwrfFe - jmp L$_steal_cipher_next_hEgxyDlCngwrfFe - -.globl _aes_hw_xts_decrypt_avx512 -.private_extern _aes_hw_xts_decrypt_avx512 -.private_extern _aes_hw_xts_decrypt_avx512 - -.p2align 5 -_aes_hw_xts_decrypt_avx512: - -.byte 243,15,30,250 - pushq %rbp - movq %rsp,%rbp - subq $376,%rsp - andq $0xffffffffffffffc0,%rsp - movq %rbx,368(%rsp) - movq $0x87,%r10 - vmovdqu (%r9),%xmm1 - vpxor %xmm4,%xmm4,%xmm4 - vmovdqu (%r8),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - - vmovdqu 224(%rcx),%xmm2 - vmovdqa %xmm2,352(%rsp) - - vmovdqu 16(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 208(%rcx),%xmm2 - vmovdqa %xmm2,336(%rsp) - - vmovdqu 32(%r8),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqu 192(%rcx),%xmm2 - vmovdqa %xmm2,320(%rsp) - - vmovdqu 48(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 176(%rcx),%xmm2 - vmovdqa %xmm2,304(%rsp) - - vmovdqu 64(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 160(%rcx),%xmm2 - vmovdqa %xmm2,288(%rsp) - - vmovdqu 80(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 144(%rcx),%xmm2 - vmovdqa %xmm2,272(%rsp) - - vmovdqu 96(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 128(%rcx),%xmm2 - vmovdqa %xmm2,256(%rsp) - - vmovdqu 112(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 112(%rcx),%xmm2 - vmovdqa %xmm2,240(%rsp) - - vmovdqu 128(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 96(%rcx),%xmm2 - vmovdqa %xmm2,224(%rsp) - - vmovdqu 144(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 80(%rcx),%xmm2 - vmovdqa %xmm2,208(%rsp) - - vmovdqu 160(%r8),%xmm0 -.byte 98,242,117,8,220,200 - - vmovdqu 64(%rcx),%xmm2 - vmovdqa %xmm2,192(%rsp) + vbroadcasti32x4 80(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vmovdqu 176(%r8),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqu 48(%rcx),%xmm2 - vmovdqa %xmm2,176(%rsp) + vbroadcasti32x4 96(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vmovdqu 192(%r8),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqu 32(%rcx),%xmm2 - vmovdqa %xmm2,160(%rsp) + vbroadcasti32x4 112(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vmovdqu 208(%r8),%xmm0 -.byte 98,242,117,8,220,200 - vmovdqu 16(%rcx),%xmm2 - vmovdqa %xmm2,144(%rsp) + vbroadcasti32x4 128(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vmovdqu 224(%r8),%xmm0 -.byte 98,242,117,8,221,200 - vmovdqu (%rcx),%xmm2 - vmovdqa %xmm2,128(%rsp) + vbroadcasti32x4 144(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vmovdqa %xmm1,(%rsp) - cmpq $0x80,%rdx - jb L$_less_than_128_bytes_amivrujEyduiFoi - vpbroadcastq %r10,%zmm25 - cmpq $0x100,%rdx - jge L$_start_by16_amivrujEyduiFoi - jmp L$_start_by8_amivrujEyduiFoi + vbroadcasti32x4 160(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 -L$_do_n_blocks_amivrujEyduiFoi: - cmpq $0x0,%rdx - je L$_ret_amivrujEyduiFoi - cmpq $0x70,%rdx - jge L$_remaining_num_blocks_is_7_amivrujEyduiFoi - cmpq $0x60,%rdx - jge L$_remaining_num_blocks_is_6_amivrujEyduiFoi - cmpq $0x50,%rdx - jge L$_remaining_num_blocks_is_5_amivrujEyduiFoi - cmpq $0x40,%rdx - jge L$_remaining_num_blocks_is_4_amivrujEyduiFoi - cmpq $0x30,%rdx - jge L$_remaining_num_blocks_is_3_amivrujEyduiFoi - cmpq $0x20,%rdx - jge L$_remaining_num_blocks_is_2_amivrujEyduiFoi - cmpq $0x10,%rdx - jge L$_remaining_num_blocks_is_1_amivrujEyduiFoi + vbroadcasti32x4 176(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vmovdqu %xmm5,%xmm1 - vpxor %xmm9,%xmm1,%xmm1 - vmovdqa 128(%rsp),%xmm0 - vpxor 
%xmm0,%xmm1,%xmm1 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 - vpxor %xmm9,%xmm1,%xmm1 - vmovdqu %xmm1,-16(%rsi) - vmovdqa %xmm1,%xmm8 + vbroadcasti32x4 192(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - movq $0x1,%r8 - kmovq %r8,%k1 - vpsllq $0x3f,%xmm9,%xmm13 - vpsraq $0x3f,%xmm13,%xmm14 - vpandq %xmm25,%xmm14,%xmm5 - vpxorq %xmm5,%xmm9,%xmm9{%k1} - vpsrldq $0x8,%xmm9,%xmm10 -.byte 98, 211, 181, 8, 115, 194, 1 - vpslldq $0x8,%xmm13,%xmm13 - vpxorq %xmm13,%xmm0,%xmm0 - jmp L$_steal_cipher_amivrujEyduiFoi + vbroadcasti32x4 208(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 -L$_remaining_num_blocks_is_7_amivrujEyduiFoi: - movq $0xffffffffffffffff,%r8 - shrq $0x10,%r8 - kmovq %r8,%k1 - vmovdqu8 (%rdi),%zmm1 - vmovdqu8 64(%rdi),%zmm2{%k1} - addq $0x70,%rdi - andq $0xf,%rdx - je L$_done_7_remain_amivrujEyduiFoi - vextracti32x4 $0x2,%zmm10,%xmm12 - vextracti32x4 $0x3,%zmm10,%xmm13 - vinserti32x4 $0x2,%xmm13,%zmm10,%zmm10 + vbroadcasti32x4 224(%rcx),%zmm0 +.byte 98,242,117,72,221,200 +.byte 98,242,109,72,221,208 vpxorq %zmm9,%zmm1,%zmm1 vpxorq %zmm10,%zmm2,%zmm2 + vmovdqu8 %zmm1,0(%rsi) + vmovdqu8 %zmm2,64(%rsi){%k1} + addq $0x70,%rsi + vextracti32x4 $0x2,%zmm2,%xmm8 + vextracti32x4 $0x3,%zmm10,%xmm0 + andq $0xf,%rdx + je L$_ret_hEgxyDlCngwrfFe + jmp L$_steal_cipher_hEgxyDlCngwrfFe +L$_num_blocks_is_6_hEgxyDlCngwrfFe: + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + vpsllvq const_dq7654(%rip),%zmm0,%zmm5 + vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 +.byte 98,147,77,72,68,249,0 + vpxorq %zmm6,%zmm5,%zmm5{%k2} + vpxord %zmm5,%zmm7,%zmm10 + vmovdqu8 0(%rdi),%zmm1 + vmovdqu8 64(%rdi),%ymm2 + addq $96,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 + vbroadcasti32x4 16(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 - vbroadcasti32x4 144(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 160(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 176(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - vbroadcasti32x4 192(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 208(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - + vbroadcasti32x4 32(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 224(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 48(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 
98,242,109,72,220,208 - vbroadcasti32x4 240(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 64(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 256(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 80(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 272(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 96(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 288(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 112(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 304(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 128(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 320(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 144(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 336(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 160(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 352(%rsp),%zmm0 -.byte 98,242,117,72,223,200 -.byte 98,242,109,72,223,208 + vbroadcasti32x4 176(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 + vbroadcasti32x4 192(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vmovdqa32 %zmm15,%zmm9 - vmovdqa32 %zmm16,%zmm10 - vmovdqu8 %zmm1,(%rsi) - vmovdqu8 %zmm2,64(%rsi){%k1} - addq $0x70,%rsi - vextracti32x4 $0x2,%zmm2,%xmm8 - vmovdqa %xmm12,%xmm0 - jmp L$_steal_cipher_amivrujEyduiFoi + vbroadcasti32x4 208(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 -L$_done_7_remain_amivrujEyduiFoi: + vbroadcasti32x4 224(%rcx),%zmm0 +.byte 98,242,117,72,221,200 +.byte 98,242,109,72,221,208 vpxorq %zmm9,%zmm1,%zmm1 vpxorq %zmm10,%zmm2,%zmm2 + vmovdqu8 %zmm1,0(%rsi) + vmovdqu8 %ymm2,64(%rsi) + addq $96,%rsi - - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 - vbroadcasti32x4 144(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 160(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vextracti32x4 $0x1,%ymm2,%xmm8 + vextracti32x4 $0x2,%zmm10,%xmm0 + andq $0xf,%rdx + je L$_ret_hEgxyDlCngwrfFe + jmp L$_steal_cipher_hEgxyDlCngwrfFe +L$_num_blocks_is_5_hEgxyDlCngwrfFe: + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + vpsllvq const_dq7654(%rip),%zmm0,%zmm5 + vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 +.byte 98,147,77,72,68,249,0 + vpxorq %zmm6,%zmm5,%zmm5{%k2} + vpxord %zmm5,%zmm7,%zmm10 + vmovdqu8 0(%rdi),%zmm1 + vmovdqu8 64(%rdi),%xmm2 + addq $80,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 + vbroadcasti32x4 16(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 176(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 32(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 192(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 
98,242,109,72,222,208 + vbroadcasti32x4 48(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 208(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 64(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 224(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 80(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 240(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 96(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 256(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 112(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 272(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 128(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 288(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 144(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 304(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 160(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 320(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 176(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 336(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vbroadcasti32x4 192(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 - vbroadcasti32x4 352(%rsp),%zmm0 -.byte 98,242,117,72,223,200 -.byte 98,242,109,72,223,208 + vbroadcasti32x4 208(%rcx),%zmm0 +.byte 98,242,117,72,220,200 +.byte 98,242,109,72,220,208 + vbroadcasti32x4 224(%rcx),%zmm0 +.byte 98,242,117,72,221,200 +.byte 98,242,109,72,221,208 vpxorq %zmm9,%zmm1,%zmm1 vpxorq %zmm10,%zmm2,%zmm2 + vmovdqu8 %zmm1,0(%rsi) + vmovdqu8 %xmm2,64(%rsi) + addq $80,%rsi - - vmovdqa32 %zmm15,%zmm9 - vmovdqa32 %zmm16,%zmm10 - vmovdqu8 %zmm1,(%rsi) - vmovdqu8 %zmm2,64(%rsi){%k1} - jmp L$_ret_amivrujEyduiFoi - -L$_remaining_num_blocks_is_6_amivrujEyduiFoi: - vmovdqu8 (%rdi),%zmm1 - vmovdqu8 64(%rdi),%ymm2 - addq $0x60,%rdi + vmovdqa %xmm2,%xmm8 + vextracti32x4 $0x1,%zmm10,%xmm0 andq $0xf,%rdx - je L$_done_6_remain_amivrujEyduiFoi - vextracti32x4 $0x1,%zmm10,%xmm12 - vextracti32x4 $0x2,%zmm10,%xmm13 - vinserti32x4 $0x1,%xmm13,%zmm10,%zmm10 - + je L$_ret_hEgxyDlCngwrfFe + jmp L$_steal_cipher_hEgxyDlCngwrfFe +L$_num_blocks_is_4_hEgxyDlCngwrfFe: + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + vpsllvq const_dq7654(%rip),%zmm0,%zmm5 + vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 +.byte 98,147,77,72,68,249,0 + vpxorq %zmm6,%zmm5,%zmm5{%k2} + vpxord %zmm5,%zmm7,%zmm10 + vmovdqu8 0(%rdi),%zmm1 + addq $64,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vbroadcasti32x4 16(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 32(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 48(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 64(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 80(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 
96(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 112(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 128(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 144(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 160(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 176(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 192(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 208(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 224(%rcx),%zmm0 +.byte 98,242,117,72,221,200 vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 - vbroadcasti32x4 144(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 160(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 176(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - vbroadcasti32x4 192(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 208(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 224(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 240(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 256(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 272(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 288(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 304(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vmovdqu8 %zmm1,0(%rsi) + addq $64,%rsi + vextracti32x4 $0x3,%zmm1,%xmm8 + vmovdqa %xmm10,%xmm0 + andq $0xf,%rdx + je L$_ret_hEgxyDlCngwrfFe + jmp L$_steal_cipher_hEgxyDlCngwrfFe +L$_num_blocks_is_3_hEgxyDlCngwrfFe: + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + movq $0x0000ffffffffffff,%r8 + kmovq %r8,%k1 + vmovdqu8 0(%rdi),%zmm1{%k1} + addq $48,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vbroadcasti32x4 16(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 32(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 48(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 64(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 80(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 96(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 112(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 128(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 144(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 160(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 176(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 192(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 208(%rcx),%zmm0 +.byte 98,242,117,72,220,200 + vbroadcasti32x4 224(%rcx),%zmm0 +.byte 98,242,117,72,221,200 + vpxorq %zmm9,%zmm1,%zmm1 + vmovdqu8 %zmm1,0(%rsi){%k1} + addq $48,%rsi + vextracti32x4 $2,%zmm1,%xmm8 + vextracti32x4 $3,%zmm9,%xmm0 + andq $0xf,%rdx + je L$_ret_hEgxyDlCngwrfFe + jmp L$_steal_cipher_hEgxyDlCngwrfFe +L$_num_blocks_is_2_hEgxyDlCngwrfFe: + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq 
const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + vmovdqu8 0(%rdi),%ymm1 + addq $32,%rdi + vbroadcasti32x4 (%rcx),%ymm0 + vpternlogq $0x96,%ymm0,%ymm9,%ymm1 + vbroadcasti32x4 16(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 32(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 48(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 64(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 80(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 96(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 112(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 128(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 144(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 160(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 176(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 192(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 208(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 224(%rcx),%ymm0 +.byte 98,242,117,40,221,200 + vpxorq %ymm9,%ymm1,%ymm1 + vmovdqu8 %ymm1,0(%rsi) + addq $32,%rsi + + vextracti32x4 $1,%ymm1,%xmm8 + vextracti32x4 $2,%zmm9,%xmm0 + andq $0xf,%rdx + je L$_ret_hEgxyDlCngwrfFe + jmp L$_steal_cipher_hEgxyDlCngwrfFe +L$_num_blocks_is_1_hEgxyDlCngwrfFe: + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 - vbroadcasti32x4 320(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 + vmovdqu8 0(%rdi),%xmm1 + addq $16,%rdi + vbroadcasti32x4 (%rcx),%ymm0 + vpternlogq $0x96,%ymm0,%ymm9,%ymm1 + vbroadcasti32x4 16(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 32(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 48(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 64(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 80(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 96(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 112(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 128(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 144(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 160(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 176(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 192(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 208(%rcx),%ymm0 +.byte 98,242,117,40,220,200 + vbroadcasti32x4 224(%rcx),%ymm0 +.byte 98,242,117,40,221,200 + vpxorq %ymm9,%ymm1,%ymm1 + vmovdqu8 %xmm1,0(%rsi) + addq $16,%rsi + vmovdqa %xmm1,%xmm8 + vextracti32x4 $1,%zmm9,%xmm0 + andq $0xf,%rdx + je L$_ret_hEgxyDlCngwrfFe + jmp L$_steal_cipher_hEgxyDlCngwrfFe - vbroadcasti32x4 336(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 +.globl _aes_hw_xts_decrypt_avx512 +.private_extern _aes_hw_xts_decrypt_avx512 +.private_extern _aes_hw_xts_decrypt_avx512 +.p2align 5 +_aes_hw_xts_decrypt_avx512: - vbroadcasti32x4 352(%rsp),%zmm0 -.byte 98,242,117,72,223,200 -.byte 98,242,109,72,223,208 +.byte 243,15,30,250 + pushq %rbp + movq %rsp,%rbp + subq $136,%rsp + andq $0xffffffffffffffc0,%rsp + movq %rbx,128(%rsp) + movq $0x87,%r10 + vmovdqu (%r9),%xmm1 + vpxor (%r8),%xmm1,%xmm1 + vaesenc 16(%r8),%xmm1,%xmm1 + vaesenc 32(%r8),%xmm1,%xmm1 + vaesenc 48(%r8),%xmm1,%xmm1 + vaesenc 64(%r8),%xmm1,%xmm1 + vaesenc 80(%r8),%xmm1,%xmm1 + vaesenc 96(%r8),%xmm1,%xmm1 + 
vaesenc 112(%r8),%xmm1,%xmm1 + vaesenc 128(%r8),%xmm1,%xmm1 + vaesenc 144(%r8),%xmm1,%xmm1 + vaesenc 160(%r8),%xmm1,%xmm1 + vaesenc 176(%r8),%xmm1,%xmm1 + vaesenc 192(%r8),%xmm1,%xmm1 + vaesenc 208(%r8),%xmm1,%xmm1 + vaesenclast 224(%r8),%xmm1,%xmm1 + vmovdqa %xmm1,(%rsp) - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 + cmpq $0x20,%rdx + jl L$_final_block_is_only_block_amivrujEyduiFoi - vmovdqa32 %zmm15,%zmm9 - vmovdqa32 %zmm16,%zmm10 - vmovdqu8 %zmm1,(%rsi) - vmovdqu8 %ymm2,64(%rsi) - addq $0x60,%rsi - vextracti32x4 $0x1,%zmm2,%xmm8 - vmovdqa %xmm12,%xmm0 - jmp L$_steal_cipher_amivrujEyduiFoi -L$_done_6_remain_amivrujEyduiFoi: - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 + movq %rdx,%r11 + andq $0xfffffffffffffff0,%r11 + subq $16,%r11 + cmpq $0x80,%r11 + jl L$_less_than_128_bytes_amivrujEyduiFoi + vpbroadcastq %r10,%zmm25 + cmpq $0x100,%r11 + jge L$_start_by16_amivrujEyduiFoi + cmpq $0x80,%r11 + jge L$_start_by8_amivrujEyduiFoi +L$_do_n_blocks_amivrujEyduiFoi: + cmpq $0x70,%r11 + je L$_remaining_num_blocks_is_7_amivrujEyduiFoi + cmpq $0x60,%r11 + je L$_remaining_num_blocks_is_6_amivrujEyduiFoi + cmpq $0x50,%r11 + je L$_remaining_num_blocks_is_5_amivrujEyduiFoi + cmpq $0x40,%r11 + je L$_remaining_num_blocks_is_4_amivrujEyduiFoi + cmpq $0x30,%r11 + je L$_remaining_num_blocks_is_3_amivrujEyduiFoi + cmpq $0x20,%r11 + je L$_remaining_num_blocks_is_2_amivrujEyduiFoi + cmpq $0x10,%r11 + je L$_remaining_num_blocks_is_1_amivrujEyduiFoi + andq $0xf,%rdx + je L$_final_block_amivrujEyduiFoi + vextracti32x4 $0x0,%zmm9,%xmm0 + vextracti32x4 $0x1,%zmm9,%xmm15 + jmp L$_steal_cipher_amivrujEyduiFoi - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 - vbroadcasti32x4 144(%rsp),%zmm0 +L$_remaining_num_blocks_is_7_amivrujEyduiFoi: + movq $0x0000ffffffffffff,%r8 + kmovq %r8,%k1 + vmovdqu8 (%rdi),%zmm1 + vmovdqu8 64(%rdi),%zmm2{%k1} + addq $0x70,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 160(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 176(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 192(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 208(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 160(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 176(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 192(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 208(%rcx),%zmm0 .byte 
98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 352(%rsp),%zmm0 + vbroadcasti32x4 224(%rcx),%zmm0 .byte 98,242,117,72,223,200 .byte 98,242,109,72,223,208 - - vpxorq %zmm9,%zmm1,%zmm1 vpxorq %zmm10,%zmm2,%zmm2 - - - vmovdqa32 %zmm15,%zmm9 - vmovdqa32 %zmm16,%zmm10 vmovdqu8 %zmm1,(%rsi) - vmovdqu8 %ymm2,64(%rsi) - jmp L$_ret_amivrujEyduiFoi - -L$_remaining_num_blocks_is_5_amivrujEyduiFoi: - vmovdqu8 (%rdi),%zmm1 - vmovdqu 64(%rdi),%xmm2 - addq $0x50,%rdi + vmovdqu8 %zmm2,64(%rsi){%k1} + addq $0x70,%rsi + vextracti32x4 $0x3,%zmm10,%xmm0 andq $0xf,%rdx - je L$_done_5_remain_amivrujEyduiFoi - vmovdqa %xmm10,%xmm12 - vextracti32x4 $0x1,%zmm10,%xmm10 - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - + je L$_final_block_amivrujEyduiFoi + vpsrldq $0xf,%zmm9,%zmm13 +.byte 98,19,21,72,68,241,0 + vpslldq $0x1,%zmm9,%zmm11 + vpxord %zmm14,%zmm11,%zmm11 + vextracti32x4 $0x0,%zmm11,%xmm15 + jmp L$_steal_cipher_amivrujEyduiFoi - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 - vbroadcasti32x4 144(%rsp),%zmm0 +L$_remaining_num_blocks_is_6_amivrujEyduiFoi: + vmovdqu8 (%rdi),%zmm1 + vmovdqu8 64(%rdi),%ymm2 + addq $0x60,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 160(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 176(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 192(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 208(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 160(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 176(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 192(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 208(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 352(%rsp),%zmm0 + vbroadcasti32x4 224(%rcx),%zmm0 .byte 98,242,117,72,223,200 .byte 98,242,109,72,223,208 - - vpxorq %zmm9,%zmm1,%zmm1 vpxorq %zmm10,%zmm2,%zmm2 - - - vmovdqa32 %zmm15,%zmm9 - vmovdqa32 %zmm16,%zmm10 vmovdqu8 %zmm1,(%rsi) - vmovdqu %xmm2,64(%rsi) - addq $0x50,%rsi - vmovdqa %xmm2,%xmm8 - vmovdqa %xmm12,%xmm0 + vmovdqu8 %ymm2,64(%rsi) + addq $0x60,%rsi + vextracti32x4 $0x2,%zmm10,%xmm0 + vextracti32x4 $0x3,%zmm10,%xmm15 + andq $0xf,%rdx + je L$_final_block_amivrujEyduiFoi jmp L$_steal_cipher_amivrujEyduiFoi -L$_done_5_remain_amivrujEyduiFoi: - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq 
%zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 - vbroadcasti32x4 144(%rsp),%zmm0 +L$_remaining_num_blocks_is_5_amivrujEyduiFoi: + vmovdqu8 (%rdi),%zmm1 + vmovdqu 64(%rdi),%xmm2 + addq $0x50,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 160(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 176(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 192(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 208(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 160(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 176(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 192(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 208(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 352(%rsp),%zmm0 + vbroadcasti32x4 224(%rcx),%zmm0 .byte 98,242,117,72,223,200 .byte 98,242,109,72,223,208 - - vpxorq %zmm9,%zmm1,%zmm1 vpxorq %zmm10,%zmm2,%zmm2 - - - vmovdqa32 %zmm15,%zmm9 - vmovdqa32 %zmm16,%zmm10 vmovdqu8 %zmm1,(%rsi) - vmovdqu8 %xmm2,64(%rsi) - jmp L$_ret_amivrujEyduiFoi + vmovdqu %xmm2,64(%rsi) + addq $0x50,%rsi + vextracti32x4 $0x1,%zmm10,%xmm0 + vextracti32x4 $0x2,%zmm10,%xmm15 + andq $0xf,%rdx + je L$_final_block_amivrujEyduiFoi + jmp L$_steal_cipher_amivrujEyduiFoi L$_remaining_num_blocks_is_4_amivrujEyduiFoi: vmovdqu8 (%rdi),%zmm1 addq $0x40,%rdi - andq $0xf,%rdx - je L$_done_4_remain_amivrujEyduiFoi - vextracti32x4 $0x3,%zmm9,%xmm12 - vinserti32x4 $0x3,%xmm10,%zmm9,%zmm9 - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 - vbroadcasti32x4 144(%rsp),%zmm0 + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 160(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 176(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - vbroadcasti32x4 192(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 208(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 
98,242,109,72,222,208 - - - vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 160(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 176(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 192(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 208(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 352(%rsp),%zmm0 + vbroadcasti32x4 224(%rcx),%zmm0 .byte 98,242,117,72,223,200 -.byte 98,242,109,72,223,208 - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vmovdqa32 %zmm15,%zmm9 - vmovdqa32 %zmm16,%zmm10 vmovdqu8 %zmm1,(%rsi) addq $0x40,%rsi - vextracti32x4 $0x3,%zmm1,%xmm8 - vmovdqa %xmm12,%xmm0 + vextracti32x4 $0x0,%zmm10,%xmm0 + vextracti32x4 $0x1,%zmm10,%xmm15 + andq $0xf,%rdx + je L$_final_block_amivrujEyduiFoi jmp L$_steal_cipher_amivrujEyduiFoi - -L$_done_4_remain_amivrujEyduiFoi: - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 - vbroadcasti32x4 144(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 160(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 176(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - vbroadcasti32x4 192(%rsp),%zmm0 -.byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 208(%rsp),%zmm0 +L$_remaining_num_blocks_is_3_amivrujEyduiFoi: + movq $-1,%r8 + shrq $0x10,%r8 + kmovq %r8,%k1 + vmovdqu8 (%rdi),%zmm1{%k1} + addq $0x30,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,222,200 -.byte 98,242,109,72,222,208 - - - vbroadcasti32x4 352(%rsp),%zmm0 -.byte 98,242,117,72,223,200 -.byte 98,242,109,72,223,208 - - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vmovdqa32 %zmm15,%zmm9 - vmovdqa32 
%zmm16,%zmm10 - vmovdqu8 %zmm1,(%rsi) - jmp L$_ret_amivrujEyduiFoi - -L$_remaining_num_blocks_is_3_amivrujEyduiFoi: - vmovdqu (%rdi),%xmm1 - vmovdqu 16(%rdi),%xmm2 - vmovdqu 32(%rdi),%xmm3 - addq $0x30,%rdi - andq $0xf,%rdx - je L$_done_3_remain_amivrujEyduiFoi - vextracti32x4 $0x2,%zmm9,%xmm13 - vextracti32x4 $0x1,%zmm9,%xmm10 - vextracti32x4 $0x3,%zmm9,%xmm11 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 -.byte 98,242,101,8,223,216 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) + vbroadcasti32x4 160(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 176(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 192(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 208(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 224(%rcx),%zmm0 +.byte 98,242,117,72,223,200 + vpxorq %zmm9,%zmm1,%zmm1 + vmovdqu8 %zmm1,(%rsi){%k1} addq $0x30,%rsi - vmovdqa %xmm3,%xmm8 - vmovdqa %xmm13,%xmm0 + vextracti32x4 $0x3,%zmm9,%xmm0 + vextracti32x4 $0x0,%zmm10,%xmm15 + andq $0xf,%rdx + je L$_final_block_amivrujEyduiFoi jmp L$_steal_cipher_amivrujEyduiFoi - -L$_done_3_remain_amivrujEyduiFoi: - vextracti32x4 $0x1,%zmm9,%xmm10 - vextracti32x4 $0x2,%zmm9,%xmm11 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 
98,242,101,8,222,216 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 -.byte 98,242,101,8,223,216 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - jmp L$_ret_amivrujEyduiFoi - L$_remaining_num_blocks_is_2_amivrujEyduiFoi: - vmovdqu (%rdi),%xmm1 - vmovdqu 16(%rdi),%xmm2 + vmovdqu8 (%rdi),%ymm1 addq $0x20,%rdi - andq $0xf,%rdx - je L$_done_2_remain_amivrujEyduiFoi - vextracti32x4 $0x2,%zmm9,%xmm10 - vextracti32x4 $0x1,%zmm9,%xmm12 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) + vbroadcasti32x4 (%rcx),%ymm0 + vpternlogq $0x96,%ymm0,%ymm9,%ymm1 + vbroadcasti32x4 16(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 32(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 48(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 64(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 80(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 96(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 112(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 128(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 144(%rcx),%ymm0 +.byte 
98,242,117,40,222,200 + vbroadcasti32x4 160(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 176(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 192(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 208(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 224(%rcx),%ymm0 +.byte 98,242,117,40,223,200 + vpxorq %ymm9,%ymm1,%ymm1 + vmovdqu %ymm1,(%rsi) addq $0x20,%rsi - vmovdqa %xmm2,%xmm8 - vmovdqa %xmm12,%xmm0 + vextracti32x4 $0x2,%zmm9,%xmm0 + vextracti32x4 $0x3,%zmm9,%xmm15 + andq $0xf,%rdx + je L$_final_block_amivrujEyduiFoi jmp L$_steal_cipher_amivrujEyduiFoi - -L$_done_2_remain_amivrujEyduiFoi: - vextracti32x4 $0x1,%zmm9,%xmm10 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - jmp L$_ret_amivrujEyduiFoi - L$_remaining_num_blocks_is_1_amivrujEyduiFoi: vmovdqu (%rdi),%xmm1 addq $0x10,%rdi - andq $0xf,%rdx - je L$_done_1_remain_amivrujEyduiFoi - vextracti32x4 $0x1,%zmm9,%xmm11 - vpxor %xmm11,%xmm1,%xmm1 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 - vpxor %xmm11,%xmm1,%xmm1 + vpxor %xmm9,%xmm1,%xmm1 + vpxor (%rcx),%xmm1,%xmm1 + vaesdec 16(%rcx),%xmm1,%xmm1 + vaesdec 32(%rcx),%xmm1,%xmm1 + vaesdec 48(%rcx),%xmm1,%xmm1 + vaesdec 64(%rcx),%xmm1,%xmm1 + vaesdec 80(%rcx),%xmm1,%xmm1 + vaesdec 96(%rcx),%xmm1,%xmm1 + vaesdec 112(%rcx),%xmm1,%xmm1 + vaesdec 128(%rcx),%xmm1,%xmm1 + vaesdec 144(%rcx),%xmm1,%xmm1 + vaesdec 160(%rcx),%xmm1,%xmm1 + vaesdec 176(%rcx),%xmm1,%xmm1 + vaesdec 192(%rcx),%xmm1,%xmm1 + 
vaesdec 208(%rcx),%xmm1,%xmm1 + vaesdeclast 224(%rcx),%xmm1,%xmm1 + vpxor %xmm9,%xmm1,%xmm1 vmovdqu %xmm1,(%rsi) addq $0x10,%rsi - vmovdqa %xmm1,%xmm8 - vmovdqa %xmm9,%xmm0 + vextracti32x4 $0x1,%zmm9,%xmm0 + vextracti32x4 $0x2,%zmm9,%xmm15 + andq $0xf,%rdx + je L$_final_block_amivrujEyduiFoi jmp L$_steal_cipher_amivrujEyduiFoi -L$_done_1_remain_amivrujEyduiFoi: - vpxor %xmm9,%xmm1,%xmm1 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 - vpxor %xmm9,%xmm1,%xmm1 - vmovdqu %xmm1,(%rsi) - jmp L$_ret_amivrujEyduiFoi L$_start_by16_amivrujEyduiFoi: vbroadcasti32x4 (%rsp),%zmm0 vbroadcasti32x4 shufb_15_7(%rip),%zmm8 movq $0xaa,%r8 kmovq %r8,%k2 + vpshufb %zmm8,%zmm0,%zmm1 - vpshufb %zmm8,%zmm0,%zmm1 vpsllvq const_dq3210(%rip),%zmm0,%zmm4 vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 .byte 98,147,109,72,68,217,0 @@ -3261,6 +1818,7 @@ L$_start_by16_amivrujEyduiFoi: vpslldq $0x1,%zmm9,%zmm11 vpxord %zmm14,%zmm11,%zmm11 + vpsrldq $0xf,%zmm10,%zmm15 .byte 98,131,5,72,68,193,0 vpslldq $0x1,%zmm10,%zmm12 @@ -3271,13 +1829,12 @@ L$_main_loop_run_16_amivrujEyduiFoi: vmovdqu8 64(%rdi),%zmm2 vmovdqu8 128(%rdi),%zmm3 vmovdqu8 192(%rdi),%zmm4 - vmovdqu8 240(%rdi),%xmm5 addq $0x100,%rdi vpxorq %zmm9,%zmm1,%zmm1 vpxorq %zmm10,%zmm2,%zmm2 vpxorq %zmm11,%zmm3,%zmm3 vpxorq %zmm12,%zmm4,%zmm4 - vbroadcasti32x4 128(%rsp),%zmm0 + vbroadcasti32x4 (%rcx),%zmm0 vpxorq %zmm0,%zmm1,%zmm1 vpxorq %zmm0,%zmm2,%zmm2 vpxorq %zmm0,%zmm3,%zmm3 @@ -3286,17 +1843,17 @@ L$_main_loop_run_16_amivrujEyduiFoi: .byte 98,19,21,72,68,241,0 vpslldq $0x1,%zmm11,%zmm15 vpxord %zmm14,%zmm15,%zmm15 - vbroadcasti32x4 144(%rsp),%zmm0 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 .byte 98,242,93,72,222,224 - vbroadcasti32x4 160(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 .byte 98,242,93,72,222,224 - vbroadcasti32x4 176(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 @@ -3305,17 +1862,17 @@ L$_main_loop_run_16_amivrujEyduiFoi: .byte 98,19,21,72,68,241,0 vpslldq $0x1,%zmm12,%zmm16 vpxord %zmm14,%zmm16,%zmm16 - vbroadcasti32x4 192(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 .byte 98,242,93,72,222,224 - vbroadcasti32x4 208(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 .byte 98,242,93,72,222,224 - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 @@ -3324,17 +1881,17 @@ L$_main_loop_run_16_amivrujEyduiFoi: .byte 
98,19,21,72,68,241,0 vpslldq $0x1,%zmm15,%zmm17 vpxord %zmm14,%zmm17,%zmm17 - vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 .byte 98,242,93,72,222,224 - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 .byte 98,242,93,72,222,224 - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 @@ -3343,27 +1900,27 @@ L$_main_loop_run_16_amivrujEyduiFoi: .byte 98,19,21,72,68,241,0 vpslldq $0x1,%zmm16,%zmm18 vpxord %zmm14,%zmm18,%zmm18 - vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 160(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 .byte 98,242,93,72,222,224 - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 176(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 .byte 98,242,93,72,222,224 - vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 192(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 .byte 98,242,93,72,222,224 - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 208(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 .byte 98,242,101,72,222,216 .byte 98,242,93,72,222,224 - vbroadcasti32x4 352(%rsp),%zmm0 + vbroadcasti32x4 224(%rcx),%zmm0 .byte 98,242,117,72,223,200 .byte 98,242,109,72,223,208 .byte 98,242,101,72,223,216 @@ -3382,30 +1939,24 @@ L$_main_loop_run_16_amivrujEyduiFoi: vmovdqu8 %zmm3,128(%rsi) vmovdqu8 %zmm4,192(%rsi) addq $0x100,%rsi - subq $0x100,%rdx - cmpq $0x100,%rdx - jge L$_main_loop_run_16_amivrujEyduiFoi - - cmpq $0x80,%rdx - jge L$_main_loop_run_8_amivrujEyduiFoi + subq $0x100,%r11 + cmpq $0x100,%r11 + jae L$_main_loop_run_16_amivrujEyduiFoi + cmpq $0x80,%r11 + jae L$_main_loop_run_8_amivrujEyduiFoi jmp L$_do_n_blocks_amivrujEyduiFoi L$_start_by8_amivrujEyduiFoi: - vbroadcasti32x4 (%rsp),%zmm0 vbroadcasti32x4 shufb_15_7(%rip),%zmm8 movq $0xaa,%r8 kmovq %r8,%k2 - - vpshufb %zmm8,%zmm0,%zmm1 vpsllvq const_dq3210(%rip),%zmm0,%zmm4 vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 .byte 98,147,109,72,68,217,0 vpxorq %zmm2,%zmm4,%zmm4{%k2} vpxord %zmm4,%zmm3,%zmm9 - - vpsllvq const_dq7654(%rip),%zmm0,%zmm5 vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 .byte 98,147,77,72,68,249,0 @@ -3415,31 +1966,25 @@ L$_start_by8_amivrujEyduiFoi: L$_main_loop_run_8_amivrujEyduiFoi: vmovdqu8 (%rdi),%zmm1 vmovdqu8 64(%rdi),%zmm2 - vmovdqu8 112(%rdi),%xmm5 addq $0x80,%rdi - - vpxorq %zmm9,%zmm1,%zmm1 - vpxorq %zmm10,%zmm2,%zmm2 - - - vbroadcasti32x4 128(%rsp),%zmm0 - vpxorq %zmm0,%zmm1,%zmm1 - vpxorq %zmm0,%zmm2,%zmm2 + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 vpsrldq $0xf,%zmm9,%zmm13 .byte 98,19,21,72,68,241,0 vpslldq $0x1,%zmm9,%zmm15 vpxord %zmm14,%zmm15,%zmm15 - vbroadcasti32x4 144(%rsp),%zmm0 + vbroadcasti32x4 16(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 160(%rsp),%zmm0 + vbroadcasti32x4 32(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 176(%rsp),%zmm0 + vbroadcasti32x4 48(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 vpsrldq $0xf,%zmm10,%zmm13 @@ -3447,1761 +1992,705 @@ L$_main_loop_run_8_amivrujEyduiFoi: vpslldq $0x1,%zmm10,%zmm16 vpxord %zmm14,%zmm16,%zmm16 - vbroadcasti32x4 
192(%rsp),%zmm0 + vbroadcasti32x4 64(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 80(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 96(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 112(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 128(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 144(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 160(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 176(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 192(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 208(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 224(%rcx),%zmm0 +.byte 98,242,117,72,223,200 +.byte 98,242,109,72,223,208 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm2,%zmm2 + vmovdqa32 %zmm15,%zmm9 + vmovdqa32 %zmm16,%zmm10 + vmovdqu8 %zmm1,(%rsi) + vmovdqu8 %zmm2,64(%rsi) + addq $0x80,%rsi + subq $0x80,%r11 + cmpq $0x80,%r11 + jae L$_main_loop_run_8_amivrujEyduiFoi + vextracti32x4 $0x0,%zmm9,%xmm0 + vextracti32x4 $0x1,%zmm9,%xmm15 + jmp L$_do_n_blocks_amivrujEyduiFoi + +L$_steal_cipher_with_tweak_amivrujEyduiFoi: + + vmovdqa shufb_15_7(%rip),%xmm11 + vpshufb %xmm11,%xmm0,%xmm12 + vpsllq $0x1,%xmm0,%xmm13 + vpsrlq $0x7,%xmm12,%xmm14 +.byte 98,19,13,8,68,249,0 + vpxord %xmm13,%xmm15,%xmm15 + +L$_steal_cipher_amivrujEyduiFoi: + + vmovdqu (%rdi),%xmm8 + vpxor %xmm15,%xmm8,%xmm8 + vpxor (%rcx),%xmm8,%xmm8 + vaesdec 16(%rcx),%xmm8,%xmm8 + vaesdec 32(%rcx),%xmm8,%xmm8 + vaesdec 48(%rcx),%xmm8,%xmm8 + vaesdec 64(%rcx),%xmm8,%xmm8 + vaesdec 80(%rcx),%xmm8,%xmm8 + vaesdec 96(%rcx),%xmm8,%xmm8 + vaesdec 112(%rcx),%xmm8,%xmm8 + vaesdec 128(%rcx),%xmm8,%xmm8 + vaesdec 144(%rcx),%xmm8,%xmm8 + vaesdec 160(%rcx),%xmm8,%xmm8 + vaesdec 176(%rcx),%xmm8,%xmm8 + vaesdec 192(%rcx),%xmm8,%xmm8 + vaesdec 208(%rcx),%xmm8,%xmm8 + vaesdeclast 224(%rcx),%xmm8,%xmm8 + vpxor %xmm15,%xmm8,%xmm8 + + + + + movq $1,%r11 + movq %rcx,%r8 + movq %rdx,%rcx + shlq %cl,%r11 + subq $1,%r11 + kmovq %r11,%k1 + vmovdqu8 16(%rdi),%xmm9{%k1}{z} + vmovdqu8 %xmm8,%xmm10{%k1}{z} + vpblendmb %xmm9,%xmm8,%xmm9{%k1} + + + movq %r8,%rcx + vpxor %xmm0,%xmm9,%xmm9 + vpxor (%rcx),%xmm9,%xmm9 + vaesdec 16(%rcx),%xmm9,%xmm9 + vaesdec 32(%rcx),%xmm9,%xmm9 + vaesdec 48(%rcx),%xmm9,%xmm9 + vaesdec 64(%rcx),%xmm9,%xmm9 + vaesdec 80(%rcx),%xmm9,%xmm9 + vaesdec 96(%rcx),%xmm9,%xmm9 + vaesdec 112(%rcx),%xmm9,%xmm9 + vaesdec 128(%rcx),%xmm9,%xmm9 + vaesdec 144(%rcx),%xmm9,%xmm9 + vaesdec 160(%rcx),%xmm9,%xmm9 + vaesdec 176(%rcx),%xmm9,%xmm9 + vaesdec 192(%rcx),%xmm9,%xmm9 + vaesdec 208(%rcx),%xmm9,%xmm9 + vaesdeclast 224(%rcx),%xmm9,%xmm9 + vpxor %xmm0,%xmm9,%xmm9 + + + + vmovdqu %xmm9,(%rsi) + vmovdqu8 %xmm10,16(%rsi){%k1} + jmp L$_ret_amivrujEyduiFoi + +L$_final_block_is_only_block_amivrujEyduiFoi: + vmovdqa (%rsp),%xmm0 + andq $0xf,%rdx + jne L$_steal_cipher_with_tweak_amivrujEyduiFoi + +L$_final_block_amivrujEyduiFoi: + vmovdqa (%rdi),%xmm8 + vpxor %xmm0,%xmm8,%xmm8 + vpxor (%rcx),%xmm8,%xmm8 + vaesdec 16(%rcx),%xmm8,%xmm8 + vaesdec 32(%rcx),%xmm8,%xmm8 + vaesdec 48(%rcx),%xmm8,%xmm8 + vaesdec 64(%rcx),%xmm8,%xmm8 + vaesdec 80(%rcx),%xmm8,%xmm8 + vaesdec 96(%rcx),%xmm8,%xmm8 + vaesdec 112(%rcx),%xmm8,%xmm8 + vaesdec 
128(%rcx),%xmm8,%xmm8 + vaesdec 144(%rcx),%xmm8,%xmm8 + vaesdec 160(%rcx),%xmm8,%xmm8 + vaesdec 176(%rcx),%xmm8,%xmm8 + vaesdec 192(%rcx),%xmm8,%xmm8 + vaesdec 208(%rcx),%xmm8,%xmm8 + vaesdeclast 224(%rcx),%xmm8,%xmm8 + vpxor %xmm0,%xmm8,%xmm8 + vmovdqa %xmm8,(%rsi) + +L$_ret_amivrujEyduiFoi: + movq 128(%rsp),%rbx + xorq %r8,%r8 + movq %r8,128(%rsp) + vpxorq %zmm0,%zmm0,%zmm0 + movq %rbp,%rsp + popq %rbp + vzeroupper + .byte 0xf3,0xc3 + +L$_less_than_128_bytes_amivrujEyduiFoi: + vpbroadcastq %r10,%zmm25 + cmpq $0x10,%r11 + jb L$_ret_amivrujEyduiFoi + vbroadcasti32x4 (%rsp),%zmm0 + vbroadcasti32x4 shufb_15_7(%rip),%zmm8 + movl $0xaa,%r8d + kmovq %r8,%k2 + movq %r11,%r8 + andq $0x70,%r8 + cmpq $0x60,%r8 + je L$_num_blocks_is_6_amivrujEyduiFoi + cmpq $0x50,%r8 + je L$_num_blocks_is_5_amivrujEyduiFoi + cmpq $0x40,%r8 + je L$_num_blocks_is_4_amivrujEyduiFoi + cmpq $0x30,%r8 + je L$_num_blocks_is_3_amivrujEyduiFoi + cmpq $0x20,%r8 + je L$_num_blocks_is_2_amivrujEyduiFoi + cmpq $0x10,%r8 + je L$_num_blocks_is_1_amivrujEyduiFoi + +L$_num_blocks_is_7_amivrujEyduiFoi: + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + vpsllvq const_dq7654(%rip),%zmm0,%zmm5 + vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 +.byte 98,147,77,72,68,249,0 + vpxorq %zmm6,%zmm5,%zmm5{%k2} + vpxord %zmm5,%zmm7,%zmm10 + movq $0x0000ffffffffffff,%r8 + kmovq %r8,%k1 + vmovdqu8 0(%rdi),%zmm1 + vmovdqu8 64(%rdi),%zmm2{%k1} + + addq $0x70,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 + vbroadcasti32x4 16(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 32(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 48(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + vbroadcasti32x4 64(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 80(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 96(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 112(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 128(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 144(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 160(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 176(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 192(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 208(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 224(%rcx),%zmm0 +.byte 98,242,117,72,223,200 +.byte 98,242,109,72,223,208 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm2,%zmm2 + vmovdqu8 %zmm1,0(%rsi) + vmovdqu8 %zmm2,64(%rsi){%k1} + addq $0x70,%rsi + + vextracti32x4 $0x3,%zmm10,%xmm0 + + andq $0xf,%rdx + je L$_final_block_amivrujEyduiFoi + + vpsrldq $0xf,%zmm9,%zmm13 +.byte 98,19,21,72,68,241,0 + vpslldq $0x1,%zmm9,%zmm11 + vpxord %zmm14,%zmm11,%zmm11 + vextracti32x4 $0x0,%zmm11,%xmm15 + jmp L$_steal_cipher_amivrujEyduiFoi +L$_num_blocks_is_6_amivrujEyduiFoi: + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq 
const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + vpsllvq const_dq7654(%rip),%zmm0,%zmm5 + vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 +.byte 98,147,77,72,68,249,0 + vpxorq %zmm6,%zmm5,%zmm5{%k2} + vpxord %zmm5,%zmm7,%zmm10 + vmovdqu8 0(%rdi),%zmm1 + vmovdqu8 64(%rdi),%ymm2 + addq $96,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 + vbroadcasti32x4 16(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 32(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + + vbroadcasti32x4 48(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + + vbroadcasti32x4 64(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 208(%rsp),%zmm0 + vbroadcasti32x4 80(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 224(%rsp),%zmm0 + vbroadcasti32x4 96(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 240(%rsp),%zmm0 + vbroadcasti32x4 112(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 256(%rsp),%zmm0 + vbroadcasti32x4 128(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 272(%rsp),%zmm0 + vbroadcasti32x4 144(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 288(%rsp),%zmm0 + vbroadcasti32x4 160(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 304(%rsp),%zmm0 + vbroadcasti32x4 176(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 320(%rsp),%zmm0 + vbroadcasti32x4 192(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 336(%rsp),%zmm0 + vbroadcasti32x4 208(%rcx),%zmm0 .byte 98,242,117,72,222,200 .byte 98,242,109,72,222,208 - vbroadcasti32x4 352(%rsp),%zmm0 + vbroadcasti32x4 224(%rcx),%zmm0 .byte 98,242,117,72,223,200 .byte 98,242,109,72,223,208 - - vpxorq %zmm9,%zmm1,%zmm1 vpxorq %zmm10,%zmm2,%zmm2 + vmovdqu8 %zmm1,0(%rsi) + vmovdqu8 %ymm2,64(%rsi) + addq $96,%rsi - - vmovdqa32 %zmm15,%zmm9 - vmovdqa32 %zmm16,%zmm10 - vmovdqu8 %zmm1,(%rsi) - vmovdqu8 %zmm2,64(%rsi) - addq $0x80,%rsi - subq $0x80,%rdx - cmpq $0x80,%rdx - jge L$_main_loop_run_8_amivrujEyduiFoi - jmp L$_do_n_blocks_amivrujEyduiFoi - -L$_steal_cipher_amivrujEyduiFoi: - - vmovdqa %xmm8,%xmm2 - - - leaq vpshufb_shf_table(%rip),%rax - vmovdqu (%rax,%rdx,1),%xmm10 - vpshufb %xmm10,%xmm8,%xmm8 + vextracti32x4 $0x2,%zmm10,%xmm0 + vextracti32x4 $0x3,%zmm10,%xmm15 + andq $0xf,%rdx + je L$_final_block_amivrujEyduiFoi + jmp L$_steal_cipher_amivrujEyduiFoi +L$_num_blocks_is_5_amivrujEyduiFoi: + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + vpsllvq const_dq7654(%rip),%zmm0,%zmm5 + vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 +.byte 98,147,77,72,68,249,0 + vpxorq %zmm6,%zmm5,%zmm5{%k2} + vpxord %zmm5,%zmm7,%zmm10 + vmovdqu8 0(%rdi),%zmm1 + vmovdqu8 64(%rdi),%xmm2 + addq $80,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vpternlogq $0x96,%zmm0,%zmm10,%zmm2 + vbroadcasti32x4 16(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 - vmovdqu -16(%rdi,%rdx,1),%xmm3 - vmovdqu %xmm8,-16(%rsi,%rdx,1) + vbroadcasti32x4 32(%rcx),%zmm0 +.byte 98,242,117,72,222,200 
+.byte 98,242,109,72,222,208 - leaq vpshufb_shf_table(%rip),%rax - addq $16,%rax - subq %rdx,%rax - vmovdqu (%rax),%xmm10 - vpxor mask1(%rip),%xmm10,%xmm10 - vpshufb %xmm10,%xmm3,%xmm3 + vbroadcasti32x4 48(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 - vpblendvb %xmm10,%xmm2,%xmm3,%xmm3 + vbroadcasti32x4 64(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 - vpxor %xmm0,%xmm3,%xmm8 + vbroadcasti32x4 80(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 - vpxor 128(%rsp),%xmm8,%xmm8 -.byte 98,114,61,8,222,132,36,144,0,0,0 -.byte 98,114,61,8,222,132,36,160,0,0,0 -.byte 98,114,61,8,222,132,36,176,0,0,0 -.byte 98,114,61,8,222,132,36,192,0,0,0 -.byte 98,114,61,8,222,132,36,208,0,0,0 -.byte 98,114,61,8,222,132,36,224,0,0,0 -.byte 98,114,61,8,222,132,36,240,0,0,0 -.byte 98,114,61,8,222,132,36,0,1,0,0 -.byte 98,114,61,8,222,132,36,16,1,0,0 -.byte 98,114,61,8,222,132,36,32,1,0,0 -.byte 98,114,61,8,222,132,36,48,1,0,0 -.byte 98,114,61,8,222,132,36,64,1,0,0 -.byte 98,114,61,8,222,132,36,80,1,0,0 -.byte 98,114,61,8,223,132,36,96,1,0,0 + vbroadcasti32x4 96(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 - vpxor %xmm0,%xmm8,%xmm8 + vbroadcasti32x4 112(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 -L$_done_amivrujEyduiFoi: - vmovdqu %xmm8,-16(%rsi) -L$_ret_amivrujEyduiFoi: - movq 368(%rsp),%rbx - xorq %r8,%r8 - movq %r8,368(%rsp) + vbroadcasti32x4 128(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 - vpxorq %zmm0,%zmm0,%zmm0 - vmovdqa64 %zmm0,128(%rsp) - vmovdqa64 %zmm0,192(%rsp) - vmovdqa64 %zmm0,256(%rsp) + vbroadcasti32x4 144(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 + vbroadcasti32x4 160(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 - movq $0x3f,%r8 - kmovq %r8,%k2 - vmovdqa64 %zmm0,320(%rsp){%k2} - movq %rbp,%rsp - popq %rbp - vzeroupper - .byte 0xf3,0xc3 + vbroadcasti32x4 176(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 -L$_less_than_128_bytes_amivrujEyduiFoi: - cmpq $0x10,%rdx - jb L$_ret_amivrujEyduiFoi - movq %rdx,%r8 - andq $0x70,%r8 - cmpq $0x60,%r8 - je L$_num_blocks_is_6_amivrujEyduiFoi - cmpq $0x50,%r8 - je L$_num_blocks_is_5_amivrujEyduiFoi - cmpq $0x40,%r8 - je L$_num_blocks_is_4_amivrujEyduiFoi - cmpq $0x30,%r8 - je L$_num_blocks_is_3_amivrujEyduiFoi - cmpq $0x20,%r8 - je L$_num_blocks_is_2_amivrujEyduiFoi - cmpq $0x10,%r8 - je L$_num_blocks_is_1_amivrujEyduiFoi + vbroadcasti32x4 192(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 -L$_num_blocks_is_7_amivrujEyduiFoi: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa 16(%rsp),%xmm10 - vmovdqu 16(%rdi),%xmm2 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,32(%rsp) - movq %rbx,40(%rsp) - vmovdqa 32(%rsp),%xmm11 - vmovdqu 32(%rdi),%xmm3 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,48(%rsp) - movq %rbx,56(%rsp) - vmovdqa 48(%rsp),%xmm12 - vmovdqu 48(%rdi),%xmm4 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,64(%rsp) - movq %rbx,72(%rsp) - vmovdqa 64(%rsp),%xmm13 - vmovdqu 64(%rdi),%xmm5 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,80(%rsp) - 
movq %rbx,88(%rsp) - vmovdqa 80(%rsp),%xmm14 - vmovdqu 80(%rdi),%xmm6 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,96(%rsp) - movq %rbx,104(%rsp) - vmovdqa 96(%rsp),%xmm15 - vmovdqu 96(%rdi),%xmm7 - addq $0x70,%rdi - andq $0xf,%rdx - je L$_done_7_amivrujEyduiFoi - -L$_steal_cipher_7_amivrujEyduiFoi: - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa64 %xmm15,%xmm16 - vmovdqa 16(%rsp),%xmm15 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vpxor %xmm14,%xmm6,%xmm6 - vpxor %xmm15,%xmm7,%xmm7 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm0,%xmm4,%xmm4 - vpxor %xmm0,%xmm5,%xmm5 - vpxor %xmm0,%xmm6,%xmm6 - vpxor %xmm0,%xmm7,%xmm7 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 
336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 -.byte 98,242,101,8,223,216 -.byte 98,242,93,8,223,224 -.byte 98,242,85,8,223,232 -.byte 98,242,77,8,223,240 -.byte 98,242,69,8,223,248 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vpxor %xmm14,%xmm6,%xmm6 - vpxor %xmm15,%xmm7,%xmm7 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - vmovdqu %xmm4,48(%rsi) - vmovdqu %xmm5,64(%rsi) - vmovdqu %xmm6,80(%rsi) - addq $0x70,%rsi - vmovdqa64 %xmm16,%xmm0 - vmovdqa %xmm7,%xmm8 - jmp L$_steal_cipher_amivrujEyduiFoi -L$_done_7_amivrujEyduiFoi: - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vpxor %xmm14,%xmm6,%xmm6 - vpxor %xmm15,%xmm7,%xmm7 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm0,%xmm4,%xmm4 - vpxor %xmm0,%xmm5,%xmm5 - vpxor %xmm0,%xmm6,%xmm6 - vpxor %xmm0,%xmm7,%xmm7 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 
-.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 -.byte 98,242,69,8,222,248 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 -.byte 98,242,101,8,223,216 -.byte 98,242,93,8,223,224 -.byte 98,242,85,8,223,232 -.byte 98,242,77,8,223,240 -.byte 98,242,69,8,223,248 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vpxor %xmm14,%xmm6,%xmm6 - vpxor %xmm15,%xmm7,%xmm7 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - vmovdqu %xmm4,48(%rsi) - vmovdqu %xmm5,64(%rsi) - vmovdqu %xmm6,80(%rsi) - addq $0x70,%rsi - vmovdqa %xmm7,%xmm8 - jmp L$_done_amivrujEyduiFoi + vbroadcasti32x4 208(%rcx),%zmm0 +.byte 98,242,117,72,222,200 +.byte 98,242,109,72,222,208 -L$_num_blocks_is_6_amivrujEyduiFoi: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa 16(%rsp),%xmm10 - vmovdqu 16(%rdi),%xmm2 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,32(%rsp) - movq %rbx,40(%rsp) - vmovdqa 32(%rsp),%xmm11 - vmovdqu 32(%rdi),%xmm3 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,48(%rsp) - movq %rbx,56(%rsp) - vmovdqa 48(%rsp),%xmm12 - vmovdqu 48(%rdi),%xmm4 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,64(%rsp) - movq %rbx,72(%rsp) - vmovdqa 64(%rsp),%xmm13 - vmovdqu 64(%rdi),%xmm5 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,80(%rsp) - movq %rbx,88(%rsp) - vmovdqa 80(%rsp),%xmm14 - vmovdqu 80(%rdi),%xmm6 - addq $0x60,%rdi - andq $0xf,%rdx - je L$_done_6_amivrujEyduiFoi - -L$_steal_cipher_6_amivrujEyduiFoi: - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa64 %xmm14,%xmm15 - vmovdqa 16(%rsp),%xmm14 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vpxor %xmm14,%xmm6,%xmm6 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm0,%xmm4,%xmm4 - vpxor %xmm0,%xmm5,%xmm5 - vpxor %xmm0,%xmm6,%xmm6 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 192(%rsp),%xmm0 -.byte 
98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 -.byte 98,242,101,8,223,216 -.byte 98,242,93,8,223,224 -.byte 98,242,85,8,223,232 -.byte 98,242,77,8,223,240 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vpxor %xmm14,%xmm6,%xmm6 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - vmovdqu %xmm4,48(%rsi) - vmovdqu %xmm5,64(%rsi) - addq $0x60,%rsi - vmovdqa %xmm15,%xmm0 - vmovdqa %xmm6,%xmm8 - jmp L$_steal_cipher_amivrujEyduiFoi -L$_done_6_amivrujEyduiFoi: - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vpxor %xmm14,%xmm6,%xmm6 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm0,%xmm4,%xmm4 - vpxor %xmm0,%xmm5,%xmm5 - vpxor %xmm0,%xmm6,%xmm6 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 
98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 -.byte 98,242,77,8,222,240 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 -.byte 98,242,101,8,223,216 -.byte 98,242,93,8,223,224 -.byte 98,242,85,8,223,232 -.byte 98,242,77,8,223,240 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vpxor %xmm14,%xmm6,%xmm6 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - vmovdqu %xmm4,48(%rsi) - vmovdqu %xmm5,64(%rsi) - addq $0x60,%rsi - vmovdqa %xmm6,%xmm8 - jmp L$_done_amivrujEyduiFoi + vbroadcasti32x4 224(%rcx),%zmm0 +.byte 98,242,117,72,223,200 +.byte 98,242,109,72,223,208 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm2,%zmm2 + vmovdqu8 %zmm1,0(%rsi) + vmovdqu8 %xmm2,64(%rsi) + addq $80,%rsi -L$_num_blocks_is_5_amivrujEyduiFoi: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa 16(%rsp),%xmm10 - vmovdqu 16(%rdi),%xmm2 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,32(%rsp) - movq %rbx,40(%rsp) - vmovdqa 32(%rsp),%xmm11 - vmovdqu 32(%rdi),%xmm3 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,48(%rsp) - movq %rbx,56(%rsp) - vmovdqa 48(%rsp),%xmm12 - vmovdqu 48(%rdi),%xmm4 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,64(%rsp) - movq %rbx,72(%rsp) - vmovdqa 64(%rsp),%xmm13 - vmovdqu 64(%rdi),%xmm5 - addq $0x50,%rdi + vmovdqa %xmm2,%xmm8 + vextracti32x4 $0x1,%zmm10,%xmm0 + vextracti32x4 $0x2,%zmm10,%xmm15 andq $0xf,%rdx - je L$_done_5_amivrujEyduiFoi - -L$_steal_cipher_5_amivrujEyduiFoi: 
- xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa64 %xmm13,%xmm14 - vmovdqa 16(%rsp),%xmm13 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm0,%xmm4,%xmm4 - vpxor %xmm0,%xmm5,%xmm5 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 -.byte 98,242,101,8,223,216 -.byte 98,242,93,8,223,224 -.byte 98,242,85,8,223,232 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - vmovdqu %xmm4,48(%rsi) - addq $0x50,%rsi - vmovdqa %xmm14,%xmm0 - vmovdqa %xmm5,%xmm8 + je L$_final_block_amivrujEyduiFoi jmp L$_steal_cipher_amivrujEyduiFoi - -L$_done_5_amivrujEyduiFoi: - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm0,%xmm4,%xmm4 - vpxor %xmm0,%xmm5,%xmm5 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 
98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 -.byte 98,242,85,8,222,232 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 -.byte 98,242,101,8,223,216 -.byte 98,242,93,8,223,224 -.byte 98,242,85,8,223,232 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vpxor %xmm13,%xmm5,%xmm5 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - vmovdqu %xmm4,48(%rsi) - addq $0x50,%rsi - vmovdqa %xmm5,%xmm8 - jmp L$_done_amivrujEyduiFoi - L$_num_blocks_is_4_amivrujEyduiFoi: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa 16(%rsp),%xmm10 - vmovdqu 16(%rdi),%xmm2 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,32(%rsp) - movq %rbx,40(%rsp) - vmovdqa 32(%rsp),%xmm11 - vmovdqu 32(%rdi),%xmm3 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,48(%rsp) - movq %rbx,56(%rsp) - vmovdqa 48(%rsp),%xmm12 - vmovdqu 48(%rdi),%xmm4 - addq $0x40,%rdi + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + vpsllvq const_dq7654(%rip),%zmm0,%zmm5 + vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 +.byte 98,147,77,72,68,249,0 + vpxorq %zmm6,%zmm5,%zmm5{%k2} + vpxord 
%zmm5,%zmm7,%zmm10 + vmovdqu8 0(%rdi),%zmm1 + addq $64,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vbroadcasti32x4 16(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 32(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 48(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 64(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 80(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 96(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 112(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 128(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 144(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 160(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 176(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 192(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 208(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 224(%rcx),%zmm0 +.byte 98,242,117,72,223,200 + vpxorq %zmm9,%zmm1,%zmm1 + vmovdqu8 %zmm1,0(%rsi) + addq $64,%rsi + vmovdqa %xmm10,%xmm0 + vextracti32x4 $0x1,%zmm10,%xmm15 andq $0xf,%rdx - je L$_done_4_amivrujEyduiFoi - -L$_steal_cipher_4_amivrujEyduiFoi: - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa64 %xmm12,%xmm13 - vmovdqa 16(%rsp),%xmm12 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm0,%xmm4,%xmm4 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 -.byte 98,242,101,8,223,216 -.byte 98,242,93,8,223,224 - 
vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - addq $0x40,%rsi - vmovdqa %xmm13,%xmm0 - vmovdqa %xmm4,%xmm8 + je L$_final_block_amivrujEyduiFoi jmp L$_steal_cipher_amivrujEyduiFoi - -L$_done_4_amivrujEyduiFoi: - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm0,%xmm4,%xmm4 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 -.byte 98,242,93,8,222,224 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 -.byte 98,242,101,8,223,216 -.byte 98,242,93,8,223,224 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vpxor %xmm12,%xmm4,%xmm4 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - addq $0x40,%rsi - vmovdqa %xmm4,%xmm8 - jmp L$_done_amivrujEyduiFoi - L$_num_blocks_is_3_amivrujEyduiFoi: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa 16(%rsp),%xmm10 - vmovdqu 16(%rdi),%xmm2 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,32(%rsp) - movq %rbx,40(%rsp) - vmovdqa 32(%rsp),%xmm11 - vmovdqu 32(%rdi),%xmm3 - addq $0x30,%rdi + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + vpsllvq const_dq7654(%rip),%zmm0,%zmm5 + vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 +.byte 
98,147,77,72,68,249,0 + vpxorq %zmm6,%zmm5,%zmm5{%k2} + vpxord %zmm5,%zmm7,%zmm10 + movq $0x0000ffffffffffff,%r8 + kmovq %r8,%k1 + vmovdqu8 0(%rdi),%zmm1{%k1} + addq $48,%rdi + vbroadcasti32x4 (%rcx),%zmm0 + vpternlogq $0x96,%zmm0,%zmm9,%zmm1 + vbroadcasti32x4 16(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 32(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 48(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 64(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 80(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 96(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 112(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 128(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 144(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 160(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 176(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 192(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 208(%rcx),%zmm0 +.byte 98,242,117,72,222,200 + vbroadcasti32x4 224(%rcx),%zmm0 +.byte 98,242,117,72,223,200 + vpxorq %zmm9,%zmm1,%zmm1 + vmovdqu8 %zmm1,0(%rsi){%k1} + addq $48,%rsi + vextracti32x4 $3,%zmm9,%xmm0 + vextracti32x4 $0,%zmm10,%xmm15 andq $0xf,%rdx - je L$_done_3_amivrujEyduiFoi - -L$_steal_cipher_3_amivrujEyduiFoi: - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa64 %xmm11,%xmm12 - vmovdqa 16(%rsp),%xmm11 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 -.byte 98,242,101,8,223,216 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - addq $0x30,%rsi - vmovdqa %xmm12,%xmm0 - vmovdqa %xmm3,%xmm8 + je L$_final_block_amivrujEyduiFoi jmp L$_steal_cipher_amivrujEyduiFoi - -L$_done_3_amivrujEyduiFoi: - vpxor 
%xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 -.byte 98,242,101,8,222,216 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 -.byte 98,242,101,8,223,216 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vpxor %xmm11,%xmm3,%xmm3 - vmovdqu %xmm1,(%rsi) - vmovdqu %xmm2,16(%rsi) - addq $0x30,%rsi - vmovdqa %xmm3,%xmm8 - jmp L$_done_amivrujEyduiFoi - L$_num_blocks_is_2_amivrujEyduiFoi: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa 16(%rsp),%xmm10 - vmovdqu 16(%rdi),%xmm2 - addq $0x20,%rdi + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 + + vmovdqu8 0(%rdi),%ymm1 + addq $32,%rdi + vbroadcasti32x4 (%rcx),%ymm0 + vpternlogq $0x96,%ymm0,%ymm9,%ymm1 + vbroadcasti32x4 16(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 32(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 48(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 64(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 80(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 96(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 112(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 128(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 144(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 160(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 176(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 192(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 208(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 224(%rcx),%ymm0 +.byte 98,242,117,40,223,200 + vpxorq %ymm9,%ymm1,%ymm1 + vmovdqu8 %ymm1,0(%rsi) + addq $32,%rsi + + vextracti32x4 $2,%zmm9,%xmm0 + vextracti32x4 $3,%zmm9,%xmm15 
andq $0xf,%rdx - je L$_done_2_amivrujEyduiFoi - -L$_steal_cipher_2_amivrujEyduiFoi: - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa64 %xmm10,%xmm11 - vmovdqa 16(%rsp),%xmm10 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vmovdqu %xmm1,(%rsi) - addq $0x20,%rsi - vmovdqa %xmm11,%xmm0 - vmovdqa %xmm2,%xmm8 + je L$_final_block_amivrujEyduiFoi jmp L$_steal_cipher_amivrujEyduiFoi +L$_num_blocks_is_1_amivrujEyduiFoi: + vpshufb %zmm8,%zmm0,%zmm1 + vpsllvq const_dq3210(%rip),%zmm0,%zmm4 + vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 +.byte 98,147,109,72,68,217,0 + vpxorq %zmm2,%zmm4,%zmm4{%k2} + vpxord %zmm4,%zmm3,%zmm9 -L$_done_2_amivrujEyduiFoi: - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm0,%xmm2,%xmm2 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 -.byte 98,242,109,8,222,208 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 -.byte 98,242,109,8,223,208 - vpxor %xmm9,%xmm1,%xmm1 - vpxor %xmm10,%xmm2,%xmm2 - vmovdqu %xmm1,(%rsi) - addq $0x20,%rsi - vmovdqa %xmm2,%xmm8 - jmp L$_done_amivrujEyduiFoi + vmovdqu8 0(%rdi),%xmm1 + addq $16,%rdi + vbroadcasti32x4 (%rcx),%ymm0 + vpternlogq $0x96,%ymm0,%ymm9,%ymm1 + 
vbroadcasti32x4 16(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 32(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 48(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 64(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 80(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 96(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 112(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 128(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 144(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 160(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 176(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 192(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 208(%rcx),%ymm0 +.byte 98,242,117,40,222,200 + vbroadcasti32x4 224(%rcx),%ymm0 +.byte 98,242,117,40,223,200 + vpxorq %ymm9,%ymm1,%ymm1 + vmovdqu8 %xmm1,0(%rsi) + addq $16,%rsi -L$_num_blocks_is_1_amivrujEyduiFoi: - vmovdqa 0(%rsp),%xmm9 - movq 0(%rsp),%rax - movq 8(%rsp),%rbx - vmovdqu 0(%rdi),%xmm1 - addq $0x10,%rdi - andq $0xf,%rdx - je L$_done_1_amivrujEyduiFoi - -L$_steal_cipher_1_amivrujEyduiFoi: - xorq %r11,%r11 - shlq $1,%rax - adcq %rbx,%rbx - cmovcq %r10,%r11 - xorq %r11,%rax - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - vmovdqa64 %xmm9,%xmm10 - vmovdqa 16(%rsp),%xmm9 - vpxor %xmm9,%xmm1,%xmm1 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 - vpxor %xmm9,%xmm1,%xmm1 - addq $0x10,%rsi - vmovdqa %xmm10,%xmm0 vmovdqa %xmm1,%xmm8 + vextracti32x4 $1,%zmm9,%xmm0 + vextracti32x4 $2,%zmm9,%xmm15 + andq $0xf,%rdx + je L$_final_block_amivrujEyduiFoi jmp L$_steal_cipher_amivrujEyduiFoi -L$_done_1_amivrujEyduiFoi: - vpxor %xmm9,%xmm1,%xmm1 - vmovdqa 128(%rsp),%xmm0 - vpxor %xmm0,%xmm1,%xmm1 - vmovdqa 144(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 160(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 176(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 192(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 208(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 224(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 240(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 256(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 272(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 288(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 304(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 320(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 336(%rsp),%xmm0 -.byte 98,242,117,8,222,200 - vmovdqa 352(%rsp),%xmm0 -.byte 98,242,117,8,223,200 - vpxor %xmm9,%xmm1,%xmm1 - addq $0x10,%rsi - vmovdqa %xmm1,%xmm8 - jmp L$_done_amivrujEyduiFoi - .section __DATA,__const .p2align 4 diff --git a/generated-src/win-x86_64/crypto/fipsmodule/aesni-xts-avx512.asm 
b/generated-src/win-x86_64/crypto/fipsmodule/aesni-xts-avx512.asm index 3c3c6cbd65..0b9ccad9be 100644 --- a/generated-src/win-x86_64/crypto/fipsmodule/aesni-xts-avx512.asm +++ b/generated-src/win-x86_64/crypto/fipsmodule/aesni-xts-avx512.asm @@ -17,758 +17,544 @@ global aes_hw_xts_encrypt_avx512 ALIGN 32 aes_hw_xts_encrypt_avx512: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes_hw_xts_encrypt_avx512: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + DB 243,15,30,250 push rbp mov rbp,rsp - sub rsp,552 + sub rsp,312 and rsp,0xffffffffffffffc0 - mov QWORD[528+rsp],rbx - mov QWORD[((528 + 8))+rsp],rdi - mov QWORD[((528 + 16))+rsp],rsi - vmovdqa XMMWORD[(368 + 0)+rsp],xmm6 - vmovdqa XMMWORD[(368 + 16)+rsp],xmm7 - vmovdqa XMMWORD[(368 + 32)+rsp],xmm8 - vmovdqa XMMWORD[(368 + 48)+rsp],xmm9 - vmovdqa XMMWORD[(368 + 64)+rsp],xmm10 - vmovdqa XMMWORD[(368 + 80)+rsp],xmm11 - vmovdqa XMMWORD[(368 + 96)+rsp],xmm12 - vmovdqa XMMWORD[(368 + 112)+rsp],xmm13 - vmovdqa XMMWORD[(368 + 128)+rsp],xmm14 - vmovdqa XMMWORD[(368 + 144)+rsp],xmm15 - mov rdi,0x87 - vmovdqu xmm1,XMMWORD[r11] - vpxor xmm4,xmm4,xmm4 - vmovdqu xmm0,XMMWORD[r10] - vpxor xmm1,xmm1,xmm0 - - vmovdqu xmm2,XMMWORD[r9] - vmovdqa XMMWORD[128+rsp],xmm2 - - vmovdqu xmm0,XMMWORD[16+r10] - DB 98,242,117,8,220,200 - - vmovdqu xmm2,XMMWORD[16+r9] - vmovdqa XMMWORD[144+rsp],xmm2 - - vmovdqu xmm0,XMMWORD[32+r10] - DB 98,242,117,8,220,200 - - vmovdqu xmm2,XMMWORD[32+r9] - vmovdqa XMMWORD[160+rsp],xmm2 - - vmovdqu xmm0,XMMWORD[48+r10] - DB 98,242,117,8,220,200 - - vmovdqu xmm2,XMMWORD[48+r9] - vmovdqa XMMWORD[176+rsp],xmm2 - - vmovdqu xmm0,XMMWORD[64+r10] - DB 98,242,117,8,220,200 - - vmovdqu xmm2,XMMWORD[64+r9] - vmovdqa XMMWORD[192+rsp],xmm2 - - vmovdqu xmm0,XMMWORD[80+r10] - DB 98,242,117,8,220,200 - - vmovdqu xmm2,XMMWORD[80+r9] - vmovdqa XMMWORD[208+rsp],xmm2 - - vmovdqu xmm0,XMMWORD[96+r10] - DB 98,242,117,8,220,200 - - vmovdqu xmm2,XMMWORD[96+r9] - vmovdqa XMMWORD[224+rsp],xmm2 - - vmovdqu xmm0,XMMWORD[112+r10] - DB 98,242,117,8,220,200 - - vmovdqu xmm2,XMMWORD[112+r9] - vmovdqa XMMWORD[240+rsp],xmm2 - - vmovdqu xmm0,XMMWORD[128+r10] - DB 98,242,117,8,220,200 - - vmovdqu xmm2,XMMWORD[128+r9] - vmovdqa XMMWORD[256+rsp],xmm2 - - vmovdqu xmm0,XMMWORD[144+r10] - DB 98,242,117,8,220,200 - - vmovdqu xmm2,XMMWORD[144+r9] - vmovdqa XMMWORD[272+rsp],xmm2 - - vmovdqu xmm0,XMMWORD[160+r10] - DB 98,242,117,8,220,200 - - vmovdqu xmm2,XMMWORD[160+r9] - vmovdqa XMMWORD[288+rsp],xmm2 - - vmovdqu xmm0,XMMWORD[176+r10] - DB 98,242,117,8,220,200 - - vmovdqu xmm2,XMMWORD[176+r9] - vmovdqa XMMWORD[304+rsp],xmm2 - - vmovdqu xmm0,XMMWORD[192+r10] - DB 98,242,117,8,220,200 - - vmovdqu xmm2,XMMWORD[192+r9] - vmovdqa XMMWORD[320+rsp],xmm2 - - vmovdqu xmm0,XMMWORD[208+r10] - DB 98,242,117,8,220,200 - - vmovdqu xmm2,XMMWORD[208+r9] - vmovdqa XMMWORD[336+rsp],xmm2 - - vmovdqu xmm0,XMMWORD[224+r10] - DB 98,242,117,8,221,200 - - vmovdqu xmm2,XMMWORD[224+r9] - vmovdqa XMMWORD[352+rsp],xmm2 - + mov QWORD[288+rsp],rbx + mov QWORD[((288 + 8))+rsp],rdi + mov QWORD[((288 + 16))+rsp],rsi + vmovdqa XMMWORD[(128 + 0)+rsp],xmm6 + vmovdqa XMMWORD[(128 + 16)+rsp],xmm7 + vmovdqa XMMWORD[(128 + 32)+rsp],xmm8 + vmovdqa XMMWORD[(128 + 48)+rsp],xmm9 + vmovdqa XMMWORD[(128 + 64)+rsp],xmm10 + vmovdqa XMMWORD[(128 + 80)+rsp],xmm11 + vmovdqa XMMWORD[(128 + 96)+rsp],xmm12 + vmovdqa XMMWORD[(128 + 112)+rsp],xmm13 + vmovdqa XMMWORD[(128 + 128)+rsp],xmm14 + vmovdqa XMMWORD[(128 + 144)+rsp],xmm15 + mov 
r10,0x87 + vmovdqu xmm1,XMMWORD[r9] + vpxor xmm1,xmm1,XMMWORD[r8] + vaesenc xmm1,xmm1,XMMWORD[16+r8] + vaesenc xmm1,xmm1,XMMWORD[32+r8] + vaesenc xmm1,xmm1,XMMWORD[48+r8] + vaesenc xmm1,xmm1,XMMWORD[64+r8] + vaesenc xmm1,xmm1,XMMWORD[80+r8] + vaesenc xmm1,xmm1,XMMWORD[96+r8] + vaesenc xmm1,xmm1,XMMWORD[112+r8] + vaesenc xmm1,xmm1,XMMWORD[128+r8] + vaesenc xmm1,xmm1,XMMWORD[144+r8] + vaesenc xmm1,xmm1,XMMWORD[160+r8] + vaesenc xmm1,xmm1,XMMWORD[176+r8] + vaesenc xmm1,xmm1,XMMWORD[192+r8] + vaesenc xmm1,xmm1,XMMWORD[208+r8] + vaesenclast xmm1,xmm1,XMMWORD[224+r8] vmovdqa XMMWORD[rsp],xmm1 - mov QWORD[((8 + 40))+rbp],rcx - mov QWORD[((8 + 48))+rbp],rdx + mov QWORD[((8 + 40))+rbp],rdi + mov QWORD[((8 + 48))+rbp],rsi - cmp r8,0x80 + cmp rdx,0x80 jl NEAR $L$_less_than_128_bytes_hEgxyDlCngwrfFe - vpbroadcastq zmm25,rdi - cmp r8,0x100 + vpbroadcastq zmm25,r10 + cmp rdx,0x100 jge NEAR $L$_start_by16_hEgxyDlCngwrfFe - cmp r8,0x80 + cmp rdx,0x80 jge NEAR $L$_start_by8_hEgxyDlCngwrfFe $L$_do_n_blocks_hEgxyDlCngwrfFe: - cmp r8,0x0 + cmp rdx,0x0 je NEAR $L$_ret_hEgxyDlCngwrfFe - cmp r8,0x70 + cmp rdx,0x70 jge NEAR $L$_remaining_num_blocks_is_7_hEgxyDlCngwrfFe - cmp r8,0x60 + cmp rdx,0x60 jge NEAR $L$_remaining_num_blocks_is_6_hEgxyDlCngwrfFe - cmp r8,0x50 + cmp rdx,0x50 jge NEAR $L$_remaining_num_blocks_is_5_hEgxyDlCngwrfFe - cmp r8,0x40 + cmp rdx,0x40 jge NEAR $L$_remaining_num_blocks_is_4_hEgxyDlCngwrfFe - cmp r8,0x30 + cmp rdx,0x30 jge NEAR $L$_remaining_num_blocks_is_3_hEgxyDlCngwrfFe - cmp r8,0x20 + cmp rdx,0x20 jge NEAR $L$_remaining_num_blocks_is_2_hEgxyDlCngwrfFe - cmp r8,0x10 + cmp rdx,0x10 jge NEAR $L$_remaining_num_blocks_is_1_hEgxyDlCngwrfFe vmovdqa xmm8,xmm0 vmovdqa xmm0,xmm9 jmp NEAR $L$_steal_cipher_hEgxyDlCngwrfFe $L$_remaining_num_blocks_is_7_hEgxyDlCngwrfFe: - mov r10,0xffffffffffffffff - shr r10,0x10 - kmovq k1,r10 - vmovdqu8 zmm1,ZMMWORD[rcx] - vmovdqu8 zmm2{k1},[64+rcx] - add rcx,0x70 - - vpxorq zmm1,zmm1,zmm9 - vpxorq zmm2,zmm2,zmm10 - - - vbroadcasti32x4 zmm0,ZMMWORD[128+rsp] - vpxorq zmm1,zmm1,zmm0 - vpxorq zmm2,zmm2,zmm0 - vbroadcasti32x4 zmm0,ZMMWORD[144+rsp] + mov r8,0x0000ffffffffffff + kmovq k1,r8 + vmovdqu8 zmm1,ZMMWORD[rdi] + vmovdqu8 zmm2{k1},[64+rdi] + add rdi,0x70 + vbroadcasti32x4 zmm0,ZMMWORD[rcx] + vpternlogq zmm1,zmm9,zmm0,0x96 + vpternlogq zmm2,zmm10,zmm0,0x96 + vbroadcasti32x4 zmm0,ZMMWORD[16+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[160+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[32+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[176+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[48+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[192+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[64+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[208+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[80+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[224+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[96+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[240+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[112+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[256+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[128+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[272+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[144+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[288+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[160+rcx] DB 98,242,117,72,220,200 
DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[304+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[176+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[320+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[192+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[336+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[208+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[352+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[224+rcx] DB 98,242,117,72,221,200 DB 98,242,109,72,221,208 - - vpxorq zmm1,zmm1,zmm9 vpxorq zmm2,zmm2,zmm10 - - - vmovdqa32 zmm9,zmm15 - vmovdqa32 zmm10,zmm16 - vmovdqu8 ZMMWORD[rdx],zmm1 - vmovdqu8 ZMMWORD[64+rdx]{k1},zmm2 - add rdx,0x70 + vmovdqu8 ZMMWORD[rsi],zmm1 + vmovdqu8 ZMMWORD[64+rsi]{k1},zmm2 + add rsi,0x70 vextracti32x4 xmm8,zmm2,0x2 vextracti32x4 xmm0,zmm10,0x3 - and r8,0xf + and rdx,0xf je NEAR $L$_ret_hEgxyDlCngwrfFe jmp NEAR $L$_steal_cipher_hEgxyDlCngwrfFe $L$_remaining_num_blocks_is_6_hEgxyDlCngwrfFe: - vmovdqu8 zmm1,ZMMWORD[rcx] - vmovdqu8 ymm2,YMMWORD[64+rcx] - add rcx,0x60 - - vpxorq zmm1,zmm1,zmm9 - vpxorq zmm2,zmm2,zmm10 - - - vbroadcasti32x4 zmm0,ZMMWORD[128+rsp] - vpxorq zmm1,zmm1,zmm0 - vpxorq zmm2,zmm2,zmm0 - vbroadcasti32x4 zmm0,ZMMWORD[144+rsp] + vmovdqu8 zmm1,ZMMWORD[rdi] + vmovdqu8 ymm2,YMMWORD[64+rdi] + add rdi,0x60 + vbroadcasti32x4 zmm0,ZMMWORD[rcx] + vpternlogq zmm1,zmm9,zmm0,0x96 + vpternlogq zmm2,zmm10,zmm0,0x96 + vbroadcasti32x4 zmm0,ZMMWORD[16+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[160+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[32+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[176+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[48+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[192+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[64+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[208+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[80+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[224+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[96+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[240+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[112+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[256+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[128+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[272+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[144+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[288+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[160+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[304+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[176+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[320+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[192+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[336+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[208+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[352+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[224+rcx] DB 98,242,117,72,221,200 DB 98,242,109,72,221,208 - - vpxorq zmm1,zmm1,zmm9 vpxorq zmm2,zmm2,zmm10 - - - vmovdqa32 zmm9,zmm15 - vmovdqa32 zmm10,zmm16 - vmovdqu8 ZMMWORD[rdx],zmm1 - vmovdqu8 YMMWORD[64+rdx],ymm2 - add rdx,0x60 + vmovdqu8 ZMMWORD[rsi],zmm1 + vmovdqu8 YMMWORD[64+rsi],ymm2 + add rsi,0x60 vextracti32x4 xmm8,zmm2,0x1 vextracti32x4 xmm0,zmm10,0x2 - and r8,0xf + and 
rdx,0xf je NEAR $L$_ret_hEgxyDlCngwrfFe jmp NEAR $L$_steal_cipher_hEgxyDlCngwrfFe $L$_remaining_num_blocks_is_5_hEgxyDlCngwrfFe: - vmovdqu8 zmm1,ZMMWORD[rcx] - vmovdqu xmm2,XMMWORD[64+rcx] - add rcx,0x50 - - vpxorq zmm1,zmm1,zmm9 - vpxorq zmm2,zmm2,zmm10 - - - vbroadcasti32x4 zmm0,ZMMWORD[128+rsp] - vpxorq zmm1,zmm1,zmm0 - vpxorq zmm2,zmm2,zmm0 - vbroadcasti32x4 zmm0,ZMMWORD[144+rsp] + vmovdqu8 zmm1,ZMMWORD[rdi] + vmovdqu xmm2,XMMWORD[64+rdi] + add rdi,0x50 + vbroadcasti32x4 zmm0,ZMMWORD[rcx] + vpternlogq zmm1,zmm9,zmm0,0x96 + vpternlogq zmm2,zmm10,zmm0,0x96 + vbroadcasti32x4 zmm0,ZMMWORD[16+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[160+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[32+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[176+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[48+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[192+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[64+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[208+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[80+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[224+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[96+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[240+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[112+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[256+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[128+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[272+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[144+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[288+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[160+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[304+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[176+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[320+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[192+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[336+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[208+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[352+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[224+rcx] DB 98,242,117,72,221,200 DB 98,242,109,72,221,208 - - vpxorq zmm1,zmm1,zmm9 vpxorq zmm2,zmm2,zmm10 - - - vmovdqa32 zmm9,zmm15 - vmovdqa32 zmm10,zmm16 - vmovdqu8 ZMMWORD[rdx],zmm1 - vmovdqu XMMWORD[64+rdx],xmm2 - add rdx,0x50 + vmovdqu8 ZMMWORD[rsi],zmm1 + vmovdqu XMMWORD[64+rsi],xmm2 + add rsi,0x50 vmovdqa xmm8,xmm2 vextracti32x4 xmm0,zmm10,0x1 - and r8,0xf + and rdx,0xf je NEAR $L$_ret_hEgxyDlCngwrfFe jmp NEAR $L$_steal_cipher_hEgxyDlCngwrfFe $L$_remaining_num_blocks_is_4_hEgxyDlCngwrfFe: - vmovdqu8 zmm1,ZMMWORD[rcx] - add rcx,0x40 - - vpxorq zmm1,zmm1,zmm9 - vpxorq zmm2,zmm2,zmm10 - - - vbroadcasti32x4 zmm0,ZMMWORD[128+rsp] - vpxorq zmm1,zmm1,zmm0 - vpxorq zmm2,zmm2,zmm0 - vbroadcasti32x4 zmm0,ZMMWORD[144+rsp] + vmovdqu8 zmm1,ZMMWORD[rdi] + add rdi,0x40 + vbroadcasti32x4 zmm0,ZMMWORD[rcx] + vpternlogq zmm1,zmm9,zmm0,0x96 + vbroadcasti32x4 zmm0,ZMMWORD[16+rcx] DB 98,242,117,72,220,200 - DB 98,242,109,72,220,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[160+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[32+rcx] DB 98,242,117,72,220,200 - DB 98,242,109,72,220,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[176+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[48+rcx] DB 98,242,117,72,220,200 - DB 98,242,109,72,220,208 - - vbroadcasti32x4 zmm0,ZMMWORD[192+rsp] + vbroadcasti32x4 
zmm0,ZMMWORD[64+rcx] DB 98,242,117,72,220,200 - DB 98,242,109,72,220,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[208+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[80+rcx] DB 98,242,117,72,220,200 - DB 98,242,109,72,220,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[224+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[96+rcx] DB 98,242,117,72,220,200 - DB 98,242,109,72,220,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[240+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[112+rcx] DB 98,242,117,72,220,200 - DB 98,242,109,72,220,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[256+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[128+rcx] DB 98,242,117,72,220,200 - DB 98,242,109,72,220,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[272+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[144+rcx] DB 98,242,117,72,220,200 - DB 98,242,109,72,220,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[288+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[160+rcx] DB 98,242,117,72,220,200 - DB 98,242,109,72,220,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[304+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[176+rcx] DB 98,242,117,72,220,200 - DB 98,242,109,72,220,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[320+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[192+rcx] DB 98,242,117,72,220,200 - DB 98,242,109,72,220,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[336+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[208+rcx] DB 98,242,117,72,220,200 - DB 98,242,109,72,220,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[352+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[224+rcx] DB 98,242,117,72,221,200 - DB 98,242,109,72,221,208 - - vpxorq zmm1,zmm1,zmm9 - vpxorq zmm2,zmm2,zmm10 - - - vmovdqa32 zmm9,zmm15 - vmovdqa32 zmm10,zmm16 - vmovdqu8 ZMMWORD[rdx],zmm1 - add rdx,0x40 + vmovdqu8 ZMMWORD[rsi],zmm1 + add rsi,0x40 vextracti32x4 xmm8,zmm1,0x3 - vextracti32x4 xmm0,zmm10,0x0 - and r8,0xf + vmovdqa64 xmm0,xmm10 + and rdx,0xf je NEAR $L$_ret_hEgxyDlCngwrfFe jmp NEAR $L$_steal_cipher_hEgxyDlCngwrfFe $L$_remaining_num_blocks_is_3_hEgxyDlCngwrfFe: - vextracti32x4 xmm10,zmm9,0x1 - vextracti32x4 xmm11,zmm9,0x2 - vmovdqu xmm1,XMMWORD[rcx] - vmovdqu xmm2,XMMWORD[16+rcx] - vmovdqu xmm3,XMMWORD[32+rcx] - add rcx,0x30 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vpxor xmm2,xmm2,xmm0 - vpxor xmm3,xmm3,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 
98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,221,200 - DB 98,242,109,8,221,208 - DB 98,242,101,8,221,216 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vmovdqu XMMWORD[rdx],xmm1 - vmovdqu XMMWORD[16+rdx],xmm2 - vmovdqu XMMWORD[32+rdx],xmm3 - add rdx,0x30 - vmovdqa xmm8,xmm3 + mov r8,-1 + shr r8,0x10 + kmovq k1,r8 + vmovdqu8 zmm1{k1},[rdi] + add rdi,0x30 + vbroadcasti32x4 zmm0,ZMMWORD[rcx] + vpternlogq zmm1,zmm9,zmm0,0x96 + vbroadcasti32x4 zmm0,ZMMWORD[16+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[32+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[48+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[64+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[80+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[96+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[112+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[128+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[144+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[160+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[176+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[192+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[208+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[224+rcx] + DB 98,242,117,72,221,200 + vpxorq zmm1,zmm1,zmm9 + vmovdqu8 ZMMWORD[rsi]{k1},zmm1 + add rsi,0x30 + vextracti32x4 xmm8,zmm1,0x2 vextracti32x4 xmm0,zmm9,0x3 - and r8,0xf + and rdx,0xf je NEAR $L$_ret_hEgxyDlCngwrfFe jmp NEAR $L$_steal_cipher_hEgxyDlCngwrfFe $L$_remaining_num_blocks_is_2_hEgxyDlCngwrfFe: - vextracti32x4 xmm10,zmm9,0x1 - vmovdqu xmm1,XMMWORD[rcx] - vmovdqu xmm2,XMMWORD[16+rcx] - add rcx,0x20 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vpxor xmm2,xmm2,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,221,200 - DB 98,242,109,8,221,208 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vmovdqu XMMWORD[rdx],xmm1 - vmovdqu XMMWORD[16+rdx],xmm2 - add rdx,0x20 - vmovdqa xmm8,xmm2 + vmovdqu8 ymm1,YMMWORD[rdi] + add rdi,0x20 + vbroadcasti32x4 ymm0,YMMWORD[rcx] + vpternlogq ymm1,ymm9,ymm0,0x96 + vbroadcasti32x4 ymm0,YMMWORD[16+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[32+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 
ymm0,YMMWORD[48+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[64+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[80+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[96+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[112+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[128+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[144+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[160+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[176+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[192+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[208+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[224+rcx] + DB 98,242,117,40,221,200 + vpxorq ymm1,ymm1,ymm9 + vmovdqu YMMWORD[rsi],ymm1 + add rsi,0x20 + vextracti32x4 xmm8,zmm1,0x1 vextracti32x4 xmm0,zmm9,0x2 - and r8,0xf + and rdx,0xf je NEAR $L$_ret_hEgxyDlCngwrfFe jmp NEAR $L$_steal_cipher_hEgxyDlCngwrfFe $L$_remaining_num_blocks_is_1_hEgxyDlCngwrfFe: - vmovdqu xmm1,XMMWORD[rcx] - add rcx,0x10 + vmovdqu xmm1,XMMWORD[rdi] + add rdi,0x10 vpxor xmm1,xmm1,xmm9 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,221,200 + vpxor xmm1,xmm1,XMMWORD[rcx] + vaesenc xmm1,xmm1,XMMWORD[16+rcx] + vaesenc xmm1,xmm1,XMMWORD[32+rcx] + vaesenc xmm1,xmm1,XMMWORD[48+rcx] + vaesenc xmm1,xmm1,XMMWORD[64+rcx] + vaesenc xmm1,xmm1,XMMWORD[80+rcx] + vaesenc xmm1,xmm1,XMMWORD[96+rcx] + vaesenc xmm1,xmm1,XMMWORD[112+rcx] + vaesenc xmm1,xmm1,XMMWORD[128+rcx] + vaesenc xmm1,xmm1,XMMWORD[144+rcx] + vaesenc xmm1,xmm1,XMMWORD[160+rcx] + vaesenc xmm1,xmm1,XMMWORD[176+rcx] + vaesenc xmm1,xmm1,XMMWORD[192+rcx] + vaesenc xmm1,xmm1,XMMWORD[208+rcx] + vaesenclast xmm1,xmm1,XMMWORD[224+rcx] vpxor xmm1,xmm1,xmm9 - vmovdqu XMMWORD[rdx],xmm1 - add rdx,0x10 + vmovdqu XMMWORD[rsi],xmm1 + add rsi,0x10 vmovdqa xmm8,xmm1 vextracti32x4 xmm0,zmm9,0x1 - and r8,0xf + and rdx,0xf je NEAR $L$_ret_hEgxyDlCngwrfFe jmp NEAR $L$_steal_cipher_hEgxyDlCngwrfFe + + $L$_start_by16_hEgxyDlCngwrfFe: vbroadcasti32x4 zmm0,ZMMWORD[rsp] vbroadcasti32x4 zmm8,ZMMWORD[shufb_15_7] - mov r10,0xaa - kmovq k2,r10 + mov r8,0xaa + kmovq k2,r8 vpshufb zmm1,zmm0,zmm8 + + vpsllvq zmm4,zmm0,ZMMWORD[const_dq3210] vpsrlvq zmm2,zmm1,ZMMWORD[const_dq5678] DB 98,147,109,72,68,217,0 vpxorq zmm4{k2},zmm4,zmm2 vpxord zmm9,zmm3,zmm4 + + vpsllvq zmm5,zmm0,ZMMWORD[const_dq7654] vpsrlvq zmm6,zmm1,ZMMWORD[const_dq1234] DB 98,147,77,72,68,249,0 vpxorq zmm5{k2},zmm5,zmm6 vpxord zmm10,zmm7,zmm5 + + vpsrldq zmm13,zmm9,0xf DB 98,19,21,72,68,241,0 vpslldq zmm11,zmm9,0x1 vpxord zmm11,zmm11,zmm14 + + vpsrldq zmm15,zmm10,0xf DB 98,131,5,72,68,193,0 vpslldq zmm12,zmm10,0x1 vpxord zmm12,zmm12,zmm16 
$L$_main_loop_run_16_hEgxyDlCngwrfFe: - vmovdqu8 zmm1,ZMMWORD[rcx] - vmovdqu8 zmm2,ZMMWORD[64+rcx] - vmovdqu8 zmm3,ZMMWORD[128+rcx] - vmovdqu8 zmm4,ZMMWORD[192+rcx] - add rcx,0x100 + vmovdqu8 zmm1,ZMMWORD[rdi] + vmovdqu8 zmm2,ZMMWORD[64+rdi] + vmovdqu8 zmm3,ZMMWORD[128+rdi] + vmovdqu8 zmm4,ZMMWORD[192+rdi] + add rdi,0x100 vpxorq zmm1,zmm1,zmm9 vpxorq zmm2,zmm2,zmm10 vpxorq zmm3,zmm3,zmm11 vpxorq zmm4,zmm4,zmm12 - vbroadcasti32x4 zmm0,ZMMWORD[128+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[rcx] vpxorq zmm1,zmm1,zmm0 vpxorq zmm2,zmm2,zmm0 vpxorq zmm3,zmm3,zmm0 @@ -777,17 +563,17 @@ $L$_main_loop_run_16_hEgxyDlCngwrfFe: DB 98,19,21,72,68,241,0 vpslldq zmm15,zmm11,0x1 vpxord zmm15,zmm15,zmm14 - vbroadcasti32x4 zmm0,ZMMWORD[144+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[16+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 DB 98,242,101,72,220,216 DB 98,242,93,72,220,224 - vbroadcasti32x4 zmm0,ZMMWORD[160+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[32+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 DB 98,242,101,72,220,216 DB 98,242,93,72,220,224 - vbroadcasti32x4 zmm0,ZMMWORD[176+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[48+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 DB 98,242,101,72,220,216 @@ -796,17 +582,17 @@ $L$_main_loop_run_16_hEgxyDlCngwrfFe: DB 98,19,21,72,68,241,0 vpslldq zmm16,zmm12,0x1 vpxord zmm16,zmm16,zmm14 - vbroadcasti32x4 zmm0,ZMMWORD[192+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[64+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 DB 98,242,101,72,220,216 DB 98,242,93,72,220,224 - vbroadcasti32x4 zmm0,ZMMWORD[208+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[80+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 DB 98,242,101,72,220,216 DB 98,242,93,72,220,224 - vbroadcasti32x4 zmm0,ZMMWORD[224+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[96+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 DB 98,242,101,72,220,216 @@ -815,17 +601,17 @@ $L$_main_loop_run_16_hEgxyDlCngwrfFe: DB 98,19,21,72,68,241,0 vpslldq zmm17,zmm15,0x1 vpxord zmm17,zmm17,zmm14 - vbroadcasti32x4 zmm0,ZMMWORD[240+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[112+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 DB 98,242,101,72,220,216 DB 98,242,93,72,220,224 - vbroadcasti32x4 zmm0,ZMMWORD[256+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[128+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 DB 98,242,101,72,220,216 DB 98,242,93,72,220,224 - vbroadcasti32x4 zmm0,ZMMWORD[272+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[144+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 DB 98,242,101,72,220,216 @@ -834,27 +620,27 @@ $L$_main_loop_run_16_hEgxyDlCngwrfFe: DB 98,19,21,72,68,241,0 vpslldq zmm18,zmm16,0x1 vpxord zmm18,zmm18,zmm14 - vbroadcasti32x4 zmm0,ZMMWORD[288+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[160+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 DB 98,242,101,72,220,216 DB 98,242,93,72,220,224 - vbroadcasti32x4 zmm0,ZMMWORD[304+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[176+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 DB 98,242,101,72,220,216 DB 98,242,93,72,220,224 - vbroadcasti32x4 zmm0,ZMMWORD[320+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[192+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 DB 98,242,101,72,220,216 DB 98,242,93,72,220,224 - vbroadcasti32x4 zmm0,ZMMWORD[336+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[208+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 DB 98,242,101,72,220,216 DB 98,242,93,72,220,224 - vbroadcasti32x4 zmm0,ZMMWORD[352+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[224+rcx] DB 98,242,117,72,221,200 DB 98,242,109,72,221,208 DB 98,242,101,72,221,216 @@ -868,24 +654,24 @@ $L$_main_loop_run_16_hEgxyDlCngwrfFe: 
vmovdqa32 zmm10,zmm16 vmovdqa32 zmm11,zmm17 vmovdqa32 zmm12,zmm18 - vmovdqu8 ZMMWORD[rdx],zmm1 - vmovdqu8 ZMMWORD[64+rdx],zmm2 - vmovdqu8 ZMMWORD[128+rdx],zmm3 - vmovdqu8 ZMMWORD[192+rdx],zmm4 - add rdx,0x100 - sub r8,0x100 - cmp r8,0x100 - jge NEAR $L$_main_loop_run_16_hEgxyDlCngwrfFe - cmp r8,0x80 - jge NEAR $L$_main_loop_run_8_hEgxyDlCngwrfFe + vmovdqu8 ZMMWORD[rsi],zmm1 + vmovdqu8 ZMMWORD[64+rsi],zmm2 + vmovdqu8 ZMMWORD[128+rsi],zmm3 + vmovdqu8 ZMMWORD[192+rsi],zmm4 + add rsi,0x100 + sub rdx,0x100 + cmp rdx,0x100 + jae NEAR $L$_main_loop_run_16_hEgxyDlCngwrfFe + cmp rdx,0x80 + jae NEAR $L$_main_loop_run_8_hEgxyDlCngwrfFe vextracti32x4 xmm0,zmm4,0x3 jmp NEAR $L$_do_n_blocks_hEgxyDlCngwrfFe $L$_start_by8_hEgxyDlCngwrfFe: vbroadcasti32x4 zmm0,ZMMWORD[rsp] vbroadcasti32x4 zmm8,ZMMWORD[shufb_15_7] - mov r10,0xaa - kmovq k2,r10 + mov r8,0xaa + kmovq k2,r8 vpshufb zmm1,zmm0,zmm8 vpsllvq zmm4,zmm0,ZMMWORD[const_dq3210] vpsrlvq zmm2,zmm1,ZMMWORD[const_dq5678] @@ -899,32 +685,27 @@ $L$_start_by8_hEgxyDlCngwrfFe: vpxord zmm10,zmm7,zmm5 $L$_main_loop_run_8_hEgxyDlCngwrfFe: - vmovdqu8 zmm1,ZMMWORD[rcx] - vmovdqu8 zmm2,ZMMWORD[64+rcx] - add rcx,0x80 - - vpxorq zmm1,zmm1,zmm9 - vpxorq zmm2,zmm2,zmm10 - - - vbroadcasti32x4 zmm0,ZMMWORD[128+rsp] - vpxorq zmm1,zmm1,zmm0 - vpxorq zmm2,zmm2,zmm0 + vmovdqu8 zmm1,ZMMWORD[rdi] + vmovdqu8 zmm2,ZMMWORD[64+rdi] + add rdi,0x80 + vbroadcasti32x4 zmm0,ZMMWORD[rcx] + vpternlogq zmm1,zmm9,zmm0,0x96 + vpternlogq zmm2,zmm10,zmm0,0x96 vpsrldq zmm13,zmm9,0xf DB 98,19,21,72,68,241,0 vpslldq zmm15,zmm9,0x1 vpxord zmm15,zmm15,zmm14 - vbroadcasti32x4 zmm0,ZMMWORD[144+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[16+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[160+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[32+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[176+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[48+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 vpsrldq zmm13,zmm10,0xf @@ -932,4362 +713,2098 @@ $L$_main_loop_run_8_hEgxyDlCngwrfFe: vpslldq zmm16,zmm10,0x1 vpxord zmm16,zmm16,zmm14 - vbroadcasti32x4 zmm0,ZMMWORD[192+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[64+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[208+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[80+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[224+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[96+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[240+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[112+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[256+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[128+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[272+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[144+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[288+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[160+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[304+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[176+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[320+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[192+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[336+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[208+rcx] DB 98,242,117,72,220,200 DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[352+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[224+rcx] DB 98,242,117,72,221,200 DB 98,242,109,72,221,208 - - vpxorq zmm1,zmm1,zmm9 
vpxorq zmm2,zmm2,zmm10 - - vmovdqa32 zmm9,zmm15 vmovdqa32 zmm10,zmm16 - vmovdqu8 ZMMWORD[rdx],zmm1 - vmovdqu8 ZMMWORD[64+rdx],zmm2 - add rdx,0x80 - sub r8,0x80 - cmp r8,0x80 - jge NEAR $L$_main_loop_run_8_hEgxyDlCngwrfFe + vmovdqu8 ZMMWORD[rsi],zmm1 + vmovdqu8 ZMMWORD[64+rsi],zmm2 + add rsi,0x80 + sub rdx,0x80 + cmp rdx,0x80 + jae NEAR $L$_main_loop_run_8_hEgxyDlCngwrfFe vextracti32x4 xmm0,zmm2,0x3 jmp NEAR $L$_do_n_blocks_hEgxyDlCngwrfFe -$L$_steal_cipher_next_hEgxyDlCngwrfFe: - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[rsp],rax - mov QWORD[8+rsp],rbx - vmovdqa xmm0,XMMWORD[rsp] - $L$_steal_cipher_hEgxyDlCngwrfFe: vmovdqa xmm2,xmm8 lea rax,[vpshufb_shf_table] - vmovdqu xmm10,XMMWORD[r8*1+rax] + vmovdqu xmm10,XMMWORD[rdx*1+rax] vpshufb xmm8,xmm8,xmm10 - vmovdqu xmm3,XMMWORD[((-16))+r8*1+rcx] - vmovdqu XMMWORD[(-16)+r8*1+rdx],xmm8 + vmovdqu xmm3,XMMWORD[((-16))+rdx*1+rdi] + vmovdqu XMMWORD[(-16)+rdx*1+rsi],xmm8 lea rax,[vpshufb_shf_table] add rax,16 - sub rax,r8 + sub rax,rdx vmovdqu xmm10,XMMWORD[rax] vpxor xmm10,xmm10,XMMWORD[mask1] vpshufb xmm3,xmm3,xmm10 vpblendvb xmm3,xmm3,xmm2,xmm10 vpxor xmm8,xmm3,xmm0 - vpxor xmm8,xmm8,XMMWORD[128+rsp] - DB 98,114,61,8,220,132,36,144,0,0,0 - DB 98,114,61,8,220,132,36,160,0,0,0 - DB 98,114,61,8,220,132,36,176,0,0,0 - DB 98,114,61,8,220,132,36,192,0,0,0 - DB 98,114,61,8,220,132,36,208,0,0,0 - DB 98,114,61,8,220,132,36,224,0,0,0 - DB 98,114,61,8,220,132,36,240,0,0,0 - DB 98,114,61,8,220,132,36,0,1,0,0 - DB 98,114,61,8,220,132,36,16,1,0,0 - DB 98,114,61,8,220,132,36,32,1,0,0 - DB 98,114,61,8,220,132,36,48,1,0,0 - DB 98,114,61,8,220,132,36,64,1,0,0 - DB 98,114,61,8,220,132,36,80,1,0,0 - DB 98,114,61,8,221,132,36,96,1,0,0 + vpxor xmm8,xmm8,XMMWORD[rcx] + vaesenc xmm8,xmm8,XMMWORD[16+rcx] + vaesenc xmm8,xmm8,XMMWORD[32+rcx] + vaesenc xmm8,xmm8,XMMWORD[48+rcx] + vaesenc xmm8,xmm8,XMMWORD[64+rcx] + vaesenc xmm8,xmm8,XMMWORD[80+rcx] + vaesenc xmm8,xmm8,XMMWORD[96+rcx] + vaesenc xmm8,xmm8,XMMWORD[112+rcx] + vaesenc xmm8,xmm8,XMMWORD[128+rcx] + vaesenc xmm8,xmm8,XMMWORD[144+rcx] + vaesenc xmm8,xmm8,XMMWORD[160+rcx] + vaesenc xmm8,xmm8,XMMWORD[176+rcx] + vaesenc xmm8,xmm8,XMMWORD[192+rcx] + vaesenc xmm8,xmm8,XMMWORD[208+rcx] + vaesenclast xmm8,xmm8,XMMWORD[224+rcx] vpxor xmm8,xmm8,xmm0 - vmovdqu XMMWORD[(-16)+rdx],xmm8 -$L$_ret_hEgxyDlCngwrfFe: - mov rbx,QWORD[528+rsp] - xor r10,r10 - mov QWORD[528+rsp],r10 + vmovdqu XMMWORD[(-16)+rsi],xmm8 +$L$_ret_hEgxyDlCngwrfFe: + mov rbx,QWORD[288+rsp] + xor r8,r8 + mov QWORD[288+rsp],r8 vpxorq zmm0,zmm0,zmm0 - mov rdi,QWORD[((528 + 8))+rsp] - mov QWORD[((528 + 8))+rsp],r10 - mov rsi,QWORD[((528 + 16))+rsp] - mov QWORD[((528 + 16))+rsp],r10 + mov rdi,QWORD[((288 + 8))+rsp] + mov QWORD[((288 + 8))+rsp],r8 + mov rsi,QWORD[((288 + 16))+rsp] + mov QWORD[((288 + 16))+rsp],r8 - vmovdqa xmm6,XMMWORD[((368 + 0))+rsp] - vmovdqa xmm7,XMMWORD[((368 + 16))+rsp] - vmovdqa xmm8,XMMWORD[((368 + 32))+rsp] - vmovdqa xmm9,XMMWORD[((368 + 48))+rsp] + vmovdqa xmm6,XMMWORD[((128 + 0))+rsp] + vmovdqa xmm7,XMMWORD[((128 + 16))+rsp] + vmovdqa xmm8,XMMWORD[((128 + 32))+rsp] + vmovdqa xmm9,XMMWORD[((128 + 48))+rsp] - vmovdqa64 ZMMWORD[368+rsp],zmm0 - - vmovdqa xmm10,XMMWORD[((368 + 64))+rsp] - vmovdqa xmm11,XMMWORD[((368 + 80))+rsp] - vmovdqa xmm12,XMMWORD[((368 + 96))+rsp] - vmovdqa xmm13,XMMWORD[((368 + 112))+rsp] - - - vmovdqa64 ZMMWORD[(368 + 64)+rsp],zmm0 - - vmovdqa xmm14,XMMWORD[((368 + 128))+rsp] - vmovdqa xmm15,XMMWORD[((368 + 144))+rsp] - + vmovdqa64 ZMMWORD[128+rsp],zmm0 + vmovdqa xmm10,XMMWORD[((128 
+ 64))+rsp] + vmovdqa xmm11,XMMWORD[((128 + 80))+rsp] + vmovdqa xmm12,XMMWORD[((128 + 96))+rsp] + vmovdqa xmm13,XMMWORD[((128 + 112))+rsp] - vmovdqa YMMWORD[(368 + 128)+rsp],ymm0 - vmovdqa64 ZMMWORD[128+rsp],zmm0 - vmovdqa64 ZMMWORD[192+rsp],zmm0 - vmovdqa64 ZMMWORD[256+rsp],zmm0 + vmovdqa64 ZMMWORD[(128 + 64)+rsp],zmm0 + vmovdqa xmm14,XMMWORD[((128 + 128))+rsp] + vmovdqa xmm15,XMMWORD[((128 + 144))+rsp] - mov r10,0x3f - kmovq k2,r10 - vmovdqa64 ZMMWORD[320+rsp]{k2},zmm0 + vmovdqa YMMWORD[(128 + 128)+rsp],ymm0 mov rsp,rbp pop rbp vzeroupper + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret $L$_less_than_128_bytes_hEgxyDlCngwrfFe: - cmp r8,0x10 + vpbroadcastq zmm25,r10 + cmp rdx,0x10 jb NEAR $L$_ret_hEgxyDlCngwrfFe - mov r10,r8 - and r10,0x70 - cmp r10,0x60 + vbroadcasti32x4 zmm0,ZMMWORD[rsp] + vbroadcasti32x4 zmm8,ZMMWORD[shufb_15_7] + mov r8d,0xaa + kmovq k2,r8 + mov r8,rdx + and r8,0x70 + cmp r8,0x60 je NEAR $L$_num_blocks_is_6_hEgxyDlCngwrfFe - cmp r10,0x50 + cmp r8,0x50 je NEAR $L$_num_blocks_is_5_hEgxyDlCngwrfFe - cmp r10,0x40 + cmp r8,0x40 je NEAR $L$_num_blocks_is_4_hEgxyDlCngwrfFe - cmp r10,0x30 + cmp r8,0x30 je NEAR $L$_num_blocks_is_3_hEgxyDlCngwrfFe - cmp r10,0x20 + cmp r8,0x20 je NEAR $L$_num_blocks_is_2_hEgxyDlCngwrfFe - cmp r10,0x10 + cmp r8,0x10 je NEAR $L$_num_blocks_is_1_hEgxyDlCngwrfFe $L$_num_blocks_is_7_hEgxyDlCngwrfFe: - vmovdqa xmm9,XMMWORD[rsp] - mov rax,QWORD[rsp] - mov rbx,QWORD[8+rsp] - vmovdqu xmm1,XMMWORD[rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[16+rsp],rax - mov QWORD[24+rsp],rbx - vmovdqa xmm10,XMMWORD[16+rsp] - vmovdqu xmm2,XMMWORD[16+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[32+rsp],rax - mov QWORD[40+rsp],rbx - vmovdqa xmm11,XMMWORD[32+rsp] - vmovdqu xmm3,XMMWORD[32+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[48+rsp],rax - mov QWORD[56+rsp],rbx - vmovdqa xmm12,XMMWORD[48+rsp] - vmovdqu xmm4,XMMWORD[48+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[64+rsp],rax - mov QWORD[72+rsp],rbx - vmovdqa xmm13,XMMWORD[64+rsp] - vmovdqu xmm5,XMMWORD[64+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[80+rsp],rax - mov QWORD[88+rsp],rbx - vmovdqa xmm14,XMMWORD[80+rsp] - vmovdqu xmm6,XMMWORD[80+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[96+rsp],rax - mov QWORD[104+rsp],rbx - vmovdqa xmm15,XMMWORD[96+rsp] - vmovdqu xmm7,XMMWORD[96+rcx] - add rcx,0x70 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vpxor xmm4,xmm4,xmm12 - vpxor xmm5,xmm5,xmm13 - vpxor xmm6,xmm6,xmm14 - vpxor xmm7,xmm7,xmm15 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vpxor xmm2,xmm2,xmm0 - vpxor xmm3,xmm3,xmm0 - vpxor xmm4,xmm4,xmm0 - vpxor xmm5,xmm5,xmm0 - vpxor xmm6,xmm6,xmm0 - vpxor xmm7,xmm7,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 98,242,77,8,220,240 - DB 98,242,69,8,220,248 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 98,242,77,8,220,240 - DB 98,242,69,8,220,248 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 
98,242,77,8,220,240 - DB 98,242,69,8,220,248 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 98,242,77,8,220,240 - DB 98,242,69,8,220,248 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 98,242,77,8,220,240 - DB 98,242,69,8,220,248 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 98,242,77,8,220,240 - DB 98,242,69,8,220,248 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 98,242,77,8,220,240 - DB 98,242,69,8,220,248 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 98,242,77,8,220,240 - DB 98,242,69,8,220,248 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 98,242,77,8,220,240 - DB 98,242,69,8,220,248 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 98,242,77,8,220,240 - DB 98,242,69,8,220,248 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 98,242,77,8,220,240 - DB 98,242,69,8,220,248 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 98,242,77,8,220,240 - DB 98,242,69,8,220,248 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 98,242,77,8,220,240 - DB 98,242,69,8,220,248 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,221,200 - DB 98,242,109,8,221,208 - DB 98,242,101,8,221,216 - DB 98,242,93,8,221,224 - DB 98,242,85,8,221,232 - DB 98,242,77,8,221,240 - DB 98,242,69,8,221,248 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vpxor xmm4,xmm4,xmm12 - vpxor xmm5,xmm5,xmm13 - vpxor xmm6,xmm6,xmm14 - vpxor xmm7,xmm7,xmm15 - vmovdqu XMMWORD[rdx],xmm1 - vmovdqu XMMWORD[16+rdx],xmm2 - vmovdqu XMMWORD[32+rdx],xmm3 - vmovdqu XMMWORD[48+rdx],xmm4 - vmovdqu XMMWORD[64+rdx],xmm5 - vmovdqu XMMWORD[80+rdx],xmm6 - vmovdqu XMMWORD[96+rdx],xmm7 - add rdx,0x70 - vmovdqa xmm8,xmm7 - and r8,0xf - je NEAR $L$_ret_hEgxyDlCngwrfFe - jmp NEAR $L$_steal_cipher_next_hEgxyDlCngwrfFe - -$L$_num_blocks_is_6_hEgxyDlCngwrfFe: - vmovdqa xmm9,XMMWORD[rsp] - mov rax,QWORD[rsp] - mov rbx,QWORD[8+rsp] - vmovdqu xmm1,XMMWORD[rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[16+rsp],rax - mov QWORD[24+rsp],rbx - vmovdqa xmm10,XMMWORD[16+rsp] - vmovdqu xmm2,XMMWORD[16+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[32+rsp],rax - mov QWORD[40+rsp],rbx - vmovdqa xmm11,XMMWORD[32+rsp] - vmovdqu xmm3,XMMWORD[32+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[48+rsp],rax - mov QWORD[56+rsp],rbx - vmovdqa xmm12,XMMWORD[48+rsp] - vmovdqu xmm4,XMMWORD[48+rcx] - 
xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[64+rsp],rax - mov QWORD[72+rsp],rbx - vmovdqa xmm13,XMMWORD[64+rsp] - vmovdqu xmm5,XMMWORD[64+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[80+rsp],rax - mov QWORD[88+rsp],rbx - vmovdqa xmm14,XMMWORD[80+rsp] - vmovdqu xmm6,XMMWORD[80+rcx] - add rcx,0x60 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vpxor xmm4,xmm4,xmm12 - vpxor xmm5,xmm5,xmm13 - vpxor xmm6,xmm6,xmm14 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vpxor xmm2,xmm2,xmm0 - vpxor xmm3,xmm3,xmm0 - vpxor xmm4,xmm4,xmm0 - vpxor xmm5,xmm5,xmm0 - vpxor xmm6,xmm6,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 98,242,77,8,220,240 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 98,242,77,8,220,240 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 98,242,77,8,220,240 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 98,242,77,8,220,240 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 98,242,77,8,220,240 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 98,242,77,8,220,240 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 98,242,77,8,220,240 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 98,242,77,8,220,240 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 98,242,77,8,220,240 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 98,242,77,8,220,240 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 98,242,77,8,220,240 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 98,242,77,8,220,240 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - DB 98,242,77,8,220,240 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,221,200 - DB 98,242,109,8,221,208 - DB 98,242,101,8,221,216 - DB 98,242,93,8,221,224 - DB 98,242,85,8,221,232 - DB 98,242,77,8,221,240 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vpxor xmm4,xmm4,xmm12 - vpxor xmm5,xmm5,xmm13 - vpxor xmm6,xmm6,xmm14 - vmovdqu XMMWORD[rdx],xmm1 - vmovdqu XMMWORD[16+rdx],xmm2 - vmovdqu XMMWORD[32+rdx],xmm3 - vmovdqu XMMWORD[48+rdx],xmm4 - vmovdqu 
XMMWORD[64+rdx],xmm5 - vmovdqu XMMWORD[80+rdx],xmm6 - add rdx,0x60 - vmovdqa xmm8,xmm6 - and r8,0xf - je NEAR $L$_ret_hEgxyDlCngwrfFe - jmp NEAR $L$_steal_cipher_next_hEgxyDlCngwrfFe + vpshufb zmm1,zmm0,zmm8 + vpsllvq zmm4,zmm0,ZMMWORD[const_dq3210] + vpsrlvq zmm2,zmm1,ZMMWORD[const_dq5678] + DB 98,147,109,72,68,217,0 + vpxorq zmm4{k2},zmm4,zmm2 + vpxord zmm9,zmm3,zmm4 + vpsllvq zmm5,zmm0,ZMMWORD[const_dq7654] + vpsrlvq zmm6,zmm1,ZMMWORD[const_dq1234] + DB 98,147,77,72,68,249,0 + vpxorq zmm5{k2},zmm5,zmm6 + vpxord zmm10,zmm7,zmm5 + mov r8,0x0000ffffffffffff + kmovq k1,r8 + vmovdqu8 zmm1,ZMMWORD[rdi] + vmovdqu8 zmm2{k1},[64+rdi] -$L$_num_blocks_is_5_hEgxyDlCngwrfFe: - vmovdqa xmm9,XMMWORD[rsp] - mov rax,QWORD[rsp] - mov rbx,QWORD[8+rsp] - vmovdqu xmm1,XMMWORD[rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[16+rsp],rax - mov QWORD[24+rsp],rbx - vmovdqa xmm10,XMMWORD[16+rsp] - vmovdqu xmm2,XMMWORD[16+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[32+rsp],rax - mov QWORD[40+rsp],rbx - vmovdqa xmm11,XMMWORD[32+rsp] - vmovdqu xmm3,XMMWORD[32+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[48+rsp],rax - mov QWORD[56+rsp],rbx - vmovdqa xmm12,XMMWORD[48+rsp] - vmovdqu xmm4,XMMWORD[48+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[64+rsp],rax - mov QWORD[72+rsp],rbx - vmovdqa xmm13,XMMWORD[64+rsp] - vmovdqu xmm5,XMMWORD[64+rcx] - add rcx,0x50 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vpxor xmm4,xmm4,xmm12 - vpxor xmm5,xmm5,xmm13 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vpxor xmm2,xmm2,xmm0 - vpxor xmm3,xmm3,xmm0 - vpxor xmm4,xmm4,xmm0 - vpxor xmm5,xmm5,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - vmovdqa 
xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - DB 98,242,85,8,220,232 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,221,200 - DB 98,242,109,8,221,208 - DB 98,242,101,8,221,216 - DB 98,242,93,8,221,224 - DB 98,242,85,8,221,232 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vpxor xmm4,xmm4,xmm12 - vpxor xmm5,xmm5,xmm13 - vmovdqu XMMWORD[rdx],xmm1 - vmovdqu XMMWORD[16+rdx],xmm2 - vmovdqu XMMWORD[32+rdx],xmm3 - vmovdqu XMMWORD[48+rdx],xmm4 - vmovdqu XMMWORD[64+rdx],xmm5 - add rdx,0x50 - vmovdqa xmm8,xmm5 - and r8,0xf - je NEAR $L$_ret_hEgxyDlCngwrfFe - jmp NEAR $L$_steal_cipher_next_hEgxyDlCngwrfFe + add rdi,0x70 + vbroadcasti32x4 zmm0,ZMMWORD[rcx] + vpternlogq zmm1,zmm9,zmm0,0x96 + vpternlogq zmm2,zmm10,zmm0,0x96 + vbroadcasti32x4 zmm0,ZMMWORD[16+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 -$L$_num_blocks_is_4_hEgxyDlCngwrfFe: - vmovdqa xmm9,XMMWORD[rsp] - mov rax,QWORD[rsp] - mov rbx,QWORD[8+rsp] - vmovdqu xmm1,XMMWORD[rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[16+rsp],rax - mov QWORD[24+rsp],rbx - vmovdqa xmm10,XMMWORD[16+rsp] - vmovdqu xmm2,XMMWORD[16+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[32+rsp],rax - mov QWORD[40+rsp],rbx - vmovdqa xmm11,XMMWORD[32+rsp] - vmovdqu xmm3,XMMWORD[32+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[48+rsp],rax - mov QWORD[56+rsp],rbx - vmovdqa xmm12,XMMWORD[48+rsp] - vmovdqu xmm4,XMMWORD[48+rcx] - add rcx,0x40 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vpxor xmm4,xmm4,xmm12 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vpxor xmm2,xmm2,xmm0 - vpxor xmm3,xmm3,xmm0 - vpxor xmm4,xmm4,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,220,200 - DB 
98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - DB 98,242,93,8,220,224 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,221,200 - DB 98,242,109,8,221,208 - DB 98,242,101,8,221,216 - DB 98,242,93,8,221,224 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vpxor xmm4,xmm4,xmm12 - vmovdqu XMMWORD[rdx],xmm1 - vmovdqu XMMWORD[16+rdx],xmm2 - vmovdqu XMMWORD[32+rdx],xmm3 - vmovdqu XMMWORD[48+rdx],xmm4 - add rdx,0x40 - vmovdqa xmm8,xmm4 - and r8,0xf - je NEAR $L$_ret_hEgxyDlCngwrfFe - jmp NEAR $L$_steal_cipher_next_hEgxyDlCngwrfFe -$L$_num_blocks_is_3_hEgxyDlCngwrfFe: - vmovdqa xmm9,XMMWORD[rsp] - mov rax,QWORD[rsp] - mov rbx,QWORD[8+rsp] - vmovdqu xmm1,XMMWORD[rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[16+rsp],rax - mov QWORD[24+rsp],rbx - vmovdqa xmm10,XMMWORD[16+rsp] - vmovdqu xmm2,XMMWORD[16+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[32+rsp],rax - mov QWORD[40+rsp],rbx - vmovdqa xmm11,XMMWORD[32+rsp] - vmovdqu xmm3,XMMWORD[32+rcx] - add rcx,0x30 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vpxor xmm2,xmm2,xmm0 - vpxor xmm3,xmm3,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - DB 98,242,101,8,220,216 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,221,200 - DB 98,242,109,8,221,208 - DB 98,242,101,8,221,216 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vmovdqu XMMWORD[rdx],xmm1 - vmovdqu XMMWORD[16+rdx],xmm2 - vmovdqu XMMWORD[32+rdx],xmm3 - add rdx,0x30 - vmovdqa xmm8,xmm3 - and r8,0xf - je NEAR $L$_ret_hEgxyDlCngwrfFe - jmp NEAR $L$_steal_cipher_next_hEgxyDlCngwrfFe + vbroadcasti32x4 zmm0,ZMMWORD[32+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 -$L$_num_blocks_is_2_hEgxyDlCngwrfFe: - vmovdqa xmm9,XMMWORD[rsp] - mov rax,QWORD[rsp] - mov rbx,QWORD[8+rsp] - vmovdqu xmm1,XMMWORD[rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[16+rsp],rax - mov 
QWORD[24+rsp],rbx - vmovdqa xmm10,XMMWORD[16+rsp] - vmovdqu xmm2,XMMWORD[16+rcx] - add rcx,0x20 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vpxor xmm2,xmm2,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,220,200 - DB 98,242,109,8,220,208 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,221,200 - DB 98,242,109,8,221,208 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vmovdqu XMMWORD[rdx],xmm1 - vmovdqu XMMWORD[16+rdx],xmm2 - add rdx,0x20 - vmovdqa xmm8,xmm2 - and r8,0xf - je NEAR $L$_ret_hEgxyDlCngwrfFe - jmp NEAR $L$_steal_cipher_next_hEgxyDlCngwrfFe -$L$_num_blocks_is_1_hEgxyDlCngwrfFe: - vmovdqa xmm9,XMMWORD[rsp] - mov rax,QWORD[rsp] - mov rbx,QWORD[8+rsp] - vmovdqu xmm1,XMMWORD[rcx] - add rcx,0x10 - vpxor xmm1,xmm1,xmm9 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,220,200 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,221,200 - vpxor xmm1,xmm1,xmm9 - vmovdqu XMMWORD[rdx],xmm1 - add rdx,0x10 - vmovdqa xmm8,xmm1 - and r8,0xf - je NEAR $L$_ret_hEgxyDlCngwrfFe - jmp NEAR $L$_steal_cipher_next_hEgxyDlCngwrfFe + vbroadcasti32x4 zmm0,ZMMWORD[48+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 -global aes_hw_xts_decrypt_avx512 + vbroadcasti32x4 zmm0,ZMMWORD[64+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 -ALIGN 32 -aes_hw_xts_decrypt_avx512: + vbroadcasti32x4 zmm0,ZMMWORD[80+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 -DB 243,15,30,250 - push rbp - mov rbp,rsp - sub rsp,552 - and rsp,0xffffffffffffffc0 - mov QWORD[528+rsp],rbx - mov QWORD[((528 + 8))+rsp],rdi - mov QWORD[((528 + 16))+rsp],rsi - vmovdqa XMMWORD[(368 + 0)+rsp],xmm6 - vmovdqa XMMWORD[(368 + 16)+rsp],xmm7 - vmovdqa XMMWORD[(368 + 32)+rsp],xmm8 - vmovdqa XMMWORD[(368 + 48)+rsp],xmm9 - vmovdqa 
XMMWORD[(368 + 64)+rsp],xmm10 - vmovdqa XMMWORD[(368 + 80)+rsp],xmm11 - vmovdqa XMMWORD[(368 + 96)+rsp],xmm12 - vmovdqa XMMWORD[(368 + 112)+rsp],xmm13 - vmovdqa XMMWORD[(368 + 128)+rsp],xmm14 - vmovdqa XMMWORD[(368 + 144)+rsp],xmm15 - mov rdi,0x87 - vmovdqu xmm1,XMMWORD[r11] - vpxor xmm4,xmm4,xmm4 - vmovdqu xmm0,XMMWORD[r10] - vpxor xmm1,xmm1,xmm0 - vmovdqu xmm2,XMMWORD[224+r9] - vmovdqa XMMWORD[352+rsp],xmm2 + vbroadcasti32x4 zmm0,ZMMWORD[96+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vmovdqu xmm0,XMMWORD[16+r10] - DB 98,242,117,8,220,200 - vmovdqu xmm2,XMMWORD[208+r9] - vmovdqa XMMWORD[336+rsp],xmm2 + vbroadcasti32x4 zmm0,ZMMWORD[112+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vmovdqu xmm0,XMMWORD[32+r10] - DB 98,242,117,8,220,200 - vmovdqu xmm2,XMMWORD[192+r9] - vmovdqa XMMWORD[320+rsp],xmm2 + vbroadcasti32x4 zmm0,ZMMWORD[128+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vmovdqu xmm0,XMMWORD[48+r10] - DB 98,242,117,8,220,200 - vmovdqu xmm2,XMMWORD[176+r9] - vmovdqa XMMWORD[304+rsp],xmm2 + vbroadcasti32x4 zmm0,ZMMWORD[144+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vmovdqu xmm0,XMMWORD[64+r10] - DB 98,242,117,8,220,200 - vmovdqu xmm2,XMMWORD[160+r9] - vmovdqa XMMWORD[288+rsp],xmm2 + vbroadcasti32x4 zmm0,ZMMWORD[160+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vmovdqu xmm0,XMMWORD[80+r10] - DB 98,242,117,8,220,200 - vmovdqu xmm2,XMMWORD[144+r9] - vmovdqa XMMWORD[272+rsp],xmm2 + vbroadcasti32x4 zmm0,ZMMWORD[176+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vmovdqu xmm0,XMMWORD[96+r10] - DB 98,242,117,8,220,200 - vmovdqu xmm2,XMMWORD[128+r9] - vmovdqa XMMWORD[256+rsp],xmm2 + vbroadcasti32x4 zmm0,ZMMWORD[192+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vmovdqu xmm0,XMMWORD[112+r10] - DB 98,242,117,8,220,200 - vmovdqu xmm2,XMMWORD[112+r9] - vmovdqa XMMWORD[240+rsp],xmm2 + vbroadcasti32x4 zmm0,ZMMWORD[208+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vmovdqu xmm0,XMMWORD[128+r10] - DB 98,242,117,8,220,200 - vmovdqu xmm2,XMMWORD[96+r9] - vmovdqa XMMWORD[224+rsp],xmm2 + vbroadcasti32x4 zmm0,ZMMWORD[224+rcx] + DB 98,242,117,72,221,200 + DB 98,242,109,72,221,208 + vpxorq zmm1,zmm1,zmm9 + vpxorq zmm2,zmm2,zmm10 + vmovdqu8 ZMMWORD[rsi],zmm1 + vmovdqu8 ZMMWORD[64+rsi]{k1},zmm2 + add rsi,0x70 + vextracti32x4 xmm8,zmm2,0x2 + vextracti32x4 xmm0,zmm10,0x3 + and rdx,0xf + je NEAR $L$_ret_hEgxyDlCngwrfFe + jmp NEAR $L$_steal_cipher_hEgxyDlCngwrfFe +$L$_num_blocks_is_6_hEgxyDlCngwrfFe: + vpshufb zmm1,zmm0,zmm8 + vpsllvq zmm4,zmm0,ZMMWORD[const_dq3210] + vpsrlvq zmm2,zmm1,ZMMWORD[const_dq5678] + DB 98,147,109,72,68,217,0 + vpxorq zmm4{k2},zmm4,zmm2 + vpxord zmm9,zmm3,zmm4 + vpsllvq zmm5,zmm0,ZMMWORD[const_dq7654] + vpsrlvq zmm6,zmm1,ZMMWORD[const_dq1234] + DB 98,147,77,72,68,249,0 + vpxorq zmm5{k2},zmm5,zmm6 + vpxord zmm10,zmm7,zmm5 + vmovdqu8 zmm1,ZMMWORD[rdi] + vmovdqu8 ymm2,YMMWORD[64+rdi] + add rdi,96 + vbroadcasti32x4 zmm0,ZMMWORD[rcx] + vpternlogq zmm1,zmm9,zmm0,0x96 + vpternlogq zmm2,zmm10,zmm0,0x96 + vbroadcasti32x4 zmm0,ZMMWORD[16+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vmovdqu xmm0,XMMWORD[144+r10] - DB 98,242,117,8,220,200 - vmovdqu xmm2,XMMWORD[80+r9] - vmovdqa XMMWORD[208+rsp],xmm2 + vbroadcasti32x4 zmm0,ZMMWORD[32+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vmovdqu xmm0,XMMWORD[160+r10] - DB 98,242,117,8,220,200 - vmovdqu xmm2,XMMWORD[64+r9] - vmovdqa XMMWORD[192+rsp],xmm2 + vbroadcasti32x4 zmm0,ZMMWORD[48+rcx] + DB 
98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vmovdqu xmm0,XMMWORD[176+r10] - DB 98,242,117,8,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[64+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vmovdqu xmm2,XMMWORD[48+r9] - vmovdqa XMMWORD[176+rsp],xmm2 - vmovdqu xmm0,XMMWORD[192+r10] - DB 98,242,117,8,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[80+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vmovdqu xmm2,XMMWORD[32+r9] - vmovdqa XMMWORD[160+rsp],xmm2 - vmovdqu xmm0,XMMWORD[208+r10] - DB 98,242,117,8,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[96+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vmovdqu xmm2,XMMWORD[16+r9] - vmovdqa XMMWORD[144+rsp],xmm2 - vmovdqu xmm0,XMMWORD[224+r10] - DB 98,242,117,8,221,200 + vbroadcasti32x4 zmm0,ZMMWORD[112+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vmovdqu xmm2,XMMWORD[r9] - vmovdqa XMMWORD[128+rsp],xmm2 - vmovdqa XMMWORD[rsp],xmm1 - mov QWORD[((8 + 40))+rbp],rcx - mov QWORD[((8 + 48))+rbp],rdx + vbroadcasti32x4 zmm0,ZMMWORD[128+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - cmp r8,0x80 - jb NEAR $L$_less_than_128_bytes_amivrujEyduiFoi - vpbroadcastq zmm25,rdi - cmp r8,0x100 - jge NEAR $L$_start_by16_amivrujEyduiFoi - jmp NEAR $L$_start_by8_amivrujEyduiFoi -$L$_do_n_blocks_amivrujEyduiFoi: - cmp r8,0x0 - je NEAR $L$_ret_amivrujEyduiFoi - cmp r8,0x70 - jge NEAR $L$_remaining_num_blocks_is_7_amivrujEyduiFoi - cmp r8,0x60 - jge NEAR $L$_remaining_num_blocks_is_6_amivrujEyduiFoi - cmp r8,0x50 - jge NEAR $L$_remaining_num_blocks_is_5_amivrujEyduiFoi - cmp r8,0x40 - jge NEAR $L$_remaining_num_blocks_is_4_amivrujEyduiFoi - cmp r8,0x30 - jge NEAR $L$_remaining_num_blocks_is_3_amivrujEyduiFoi - cmp r8,0x20 - jge NEAR $L$_remaining_num_blocks_is_2_amivrujEyduiFoi - cmp r8,0x10 - jge NEAR $L$_remaining_num_blocks_is_1_amivrujEyduiFoi + vbroadcasti32x4 zmm0,ZMMWORD[144+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vmovdqu xmm1,xmm5 + vbroadcasti32x4 zmm0,ZMMWORD[160+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vpxor xmm1,xmm1,xmm9 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,223,200 - vpxor xmm1,xmm1,xmm9 - vmovdqu XMMWORD[(-16)+rdx],xmm1 - vmovdqa xmm8,xmm1 + vbroadcasti32x4 zmm0,ZMMWORD[176+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - mov r10,0x1 - kmovq k1,r10 - vpsllq xmm13,xmm9,0x3f - vpsraq xmm14,xmm13,0x3f - vpandq xmm5,xmm14,xmm25 - vpxorq xmm9{k1},xmm9,xmm5 - vpsrldq xmm10,xmm9,0x8 - DB 98,211,181,8,115,194,1 - vpslldq xmm13,xmm13,0x8 - vpxorq xmm0,xmm0,xmm13 - jmp NEAR $L$_steal_cipher_amivrujEyduiFoi -$L$_remaining_num_blocks_is_7_amivrujEyduiFoi: - mov r10,0xffffffffffffffff - shr r10,0x10 - kmovq k1,r10 - vmovdqu8 zmm1,ZMMWORD[rcx] - vmovdqu8 
zmm2{k1},[64+rcx] - add rcx,0x70 - and r8,0xf - je NEAR $L$_done_7_remain_amivrujEyduiFoi - vextracti32x4 xmm12,zmm10,0x2 - vextracti32x4 xmm13,zmm10,0x3 - vinserti32x4 zmm10,zmm10,xmm13,0x2 + vbroadcasti32x4 zmm0,ZMMWORD[192+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vpxorq zmm1,zmm1,zmm9 - vpxorq zmm2,zmm2,zmm10 + vbroadcasti32x4 zmm0,ZMMWORD[208+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[128+rsp] - vpxorq zmm1,zmm1,zmm0 - vpxorq zmm2,zmm2,zmm0 - vbroadcasti32x4 zmm0,ZMMWORD[144+rsp] - DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 + vbroadcasti32x4 zmm0,ZMMWORD[224+rcx] + DB 98,242,117,72,221,200 + DB 98,242,109,72,221,208 + vpxorq zmm1,zmm1,zmm9 + vpxorq zmm2,zmm2,zmm10 + vmovdqu8 ZMMWORD[rsi],zmm1 + vmovdqu8 YMMWORD[64+rsi],ymm2 + add rsi,96 - vbroadcasti32x4 zmm0,ZMMWORD[160+rsp] - DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 + vextracti32x4 xmm8,ymm2,0x1 + vextracti32x4 xmm0,zmm10,0x2 + and rdx,0xf + je NEAR $L$_ret_hEgxyDlCngwrfFe + jmp NEAR $L$_steal_cipher_hEgxyDlCngwrfFe +$L$_num_blocks_is_5_hEgxyDlCngwrfFe: + vpshufb zmm1,zmm0,zmm8 + vpsllvq zmm4,zmm0,ZMMWORD[const_dq3210] + vpsrlvq zmm2,zmm1,ZMMWORD[const_dq5678] + DB 98,147,109,72,68,217,0 + vpxorq zmm4{k2},zmm4,zmm2 + vpxord zmm9,zmm3,zmm4 + vpsllvq zmm5,zmm0,ZMMWORD[const_dq7654] + vpsrlvq zmm6,zmm1,ZMMWORD[const_dq1234] + DB 98,147,77,72,68,249,0 + vpxorq zmm5{k2},zmm5,zmm6 + vpxord zmm10,zmm7,zmm5 + vmovdqu8 zmm1,ZMMWORD[rdi] + vmovdqu8 xmm2,XMMWORD[64+rdi] + add rdi,80 + vbroadcasti32x4 zmm0,ZMMWORD[rcx] + vpternlogq zmm1,zmm9,zmm0,0x96 + vpternlogq zmm2,zmm10,zmm0,0x96 + vbroadcasti32x4 zmm0,ZMMWORD[16+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[176+rsp] - DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 + vbroadcasti32x4 zmm0,ZMMWORD[32+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[192+rsp] - DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 + vbroadcasti32x4 zmm0,ZMMWORD[48+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[208+rsp] - DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 + vbroadcasti32x4 zmm0,ZMMWORD[64+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[224+rsp] - DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 + vbroadcasti32x4 zmm0,ZMMWORD[80+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[240+rsp] - DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 + vbroadcasti32x4 zmm0,ZMMWORD[96+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[256+rsp] - DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 + vbroadcasti32x4 zmm0,ZMMWORD[112+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[272+rsp] - DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 + vbroadcasti32x4 zmm0,ZMMWORD[128+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[288+rsp] - DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 + vbroadcasti32x4 zmm0,ZMMWORD[144+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[304+rsp] - DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 + vbroadcasti32x4 zmm0,ZMMWORD[160+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[320+rsp] - DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 + vbroadcasti32x4 
zmm0,ZMMWORD[176+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[336+rsp] - DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 + vbroadcasti32x4 zmm0,ZMMWORD[192+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 - vbroadcasti32x4 zmm0,ZMMWORD[352+rsp] - DB 98,242,117,72,223,200 - DB 98,242,109,72,223,208 + vbroadcasti32x4 zmm0,ZMMWORD[208+rcx] + DB 98,242,117,72,220,200 + DB 98,242,109,72,220,208 + vbroadcasti32x4 zmm0,ZMMWORD[224+rcx] + DB 98,242,117,72,221,200 + DB 98,242,109,72,221,208 vpxorq zmm1,zmm1,zmm9 vpxorq zmm2,zmm2,zmm10 + vmovdqu8 ZMMWORD[rsi],zmm1 + vmovdqu8 XMMWORD[64+rsi],xmm2 + add rsi,80 + vmovdqa xmm8,xmm2 + vextracti32x4 xmm0,zmm10,0x1 + and rdx,0xf + je NEAR $L$_ret_hEgxyDlCngwrfFe + jmp NEAR $L$_steal_cipher_hEgxyDlCngwrfFe +$L$_num_blocks_is_4_hEgxyDlCngwrfFe: + vpshufb zmm1,zmm0,zmm8 + vpsllvq zmm4,zmm0,ZMMWORD[const_dq3210] + vpsrlvq zmm2,zmm1,ZMMWORD[const_dq5678] + DB 98,147,109,72,68,217,0 + vpxorq zmm4{k2},zmm4,zmm2 + vpxord zmm9,zmm3,zmm4 + vpsllvq zmm5,zmm0,ZMMWORD[const_dq7654] + vpsrlvq zmm6,zmm1,ZMMWORD[const_dq1234] + DB 98,147,77,72,68,249,0 + vpxorq zmm5{k2},zmm5,zmm6 + vpxord zmm10,zmm7,zmm5 + vmovdqu8 zmm1,ZMMWORD[rdi] + add rdi,64 + vbroadcasti32x4 zmm0,ZMMWORD[rcx] + vpternlogq zmm1,zmm9,zmm0,0x96 + vbroadcasti32x4 zmm0,ZMMWORD[16+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[32+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[48+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[64+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[80+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[96+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[112+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[128+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[144+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[160+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[176+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[192+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[208+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[224+rcx] + DB 98,242,117,72,221,200 + vpxorq zmm1,zmm1,zmm9 + vmovdqu8 ZMMWORD[rsi],zmm1 + add rsi,64 + vextracti32x4 xmm8,zmm1,0x3 + vmovdqa xmm0,xmm10 + and rdx,0xf + je NEAR $L$_ret_hEgxyDlCngwrfFe + jmp NEAR $L$_steal_cipher_hEgxyDlCngwrfFe +$L$_num_blocks_is_3_hEgxyDlCngwrfFe: + vpshufb zmm1,zmm0,zmm8 + vpsllvq zmm4,zmm0,ZMMWORD[const_dq3210] + vpsrlvq zmm2,zmm1,ZMMWORD[const_dq5678] + DB 98,147,109,72,68,217,0 + vpxorq zmm4{k2},zmm4,zmm2 + vpxord zmm9,zmm3,zmm4 + mov r8,0x0000ffffffffffff + kmovq k1,r8 + vmovdqu8 zmm1{k1},[rdi] + add rdi,48 + vbroadcasti32x4 zmm0,ZMMWORD[rcx] + vpternlogq zmm1,zmm9,zmm0,0x96 + vbroadcasti32x4 zmm0,ZMMWORD[16+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[32+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[48+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[64+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[80+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[96+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[112+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[128+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[144+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[160+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[176+rcx] + DB 
98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[192+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[208+rcx] + DB 98,242,117,72,220,200 + vbroadcasti32x4 zmm0,ZMMWORD[224+rcx] + DB 98,242,117,72,221,200 + vpxorq zmm1,zmm1,zmm9 + vmovdqu8 ZMMWORD[rsi]{k1},zmm1 + add rsi,48 + vextracti32x4 xmm8,zmm1,2 + vextracti32x4 xmm0,zmm9,3 + and rdx,0xf + je NEAR $L$_ret_hEgxyDlCngwrfFe + jmp NEAR $L$_steal_cipher_hEgxyDlCngwrfFe +$L$_num_blocks_is_2_hEgxyDlCngwrfFe: + vpshufb zmm1,zmm0,zmm8 + vpsllvq zmm4,zmm0,ZMMWORD[const_dq3210] + vpsrlvq zmm2,zmm1,ZMMWORD[const_dq5678] + DB 98,147,109,72,68,217,0 + vpxorq zmm4{k2},zmm4,zmm2 + vpxord zmm9,zmm3,zmm4 - vmovdqa32 zmm9,zmm15 - vmovdqa32 zmm10,zmm16 - vmovdqu8 ZMMWORD[rdx],zmm1 - vmovdqu8 ZMMWORD[64+rdx]{k1},zmm2 - add rdx,0x70 - vextracti32x4 xmm8,zmm2,0x2 - vmovdqa xmm0,xmm12 - jmp NEAR $L$_steal_cipher_amivrujEyduiFoi + vmovdqu8 ymm1,YMMWORD[rdi] + add rdi,32 + vbroadcasti32x4 ymm0,YMMWORD[rcx] + vpternlogq ymm1,ymm9,ymm0,0x96 + vbroadcasti32x4 ymm0,YMMWORD[16+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[32+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[48+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[64+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[80+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[96+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[112+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[128+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[144+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[160+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[176+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[192+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[208+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[224+rcx] + DB 98,242,117,40,221,200 + vpxorq ymm1,ymm1,ymm9 + vmovdqu8 YMMWORD[rsi],ymm1 + add rsi,32 + + vextracti32x4 xmm8,ymm1,1 + vextracti32x4 xmm0,zmm9,2 + and rdx,0xf + je NEAR $L$_ret_hEgxyDlCngwrfFe + jmp NEAR $L$_steal_cipher_hEgxyDlCngwrfFe +$L$_num_blocks_is_1_hEgxyDlCngwrfFe: + vpshufb zmm1,zmm0,zmm8 + vpsllvq zmm4,zmm0,ZMMWORD[const_dq3210] + vpsrlvq zmm2,zmm1,ZMMWORD[const_dq5678] + DB 98,147,109,72,68,217,0 + vpxorq zmm4{k2},zmm4,zmm2 + vpxord zmm9,zmm3,zmm4 -$L$_done_7_remain_amivrujEyduiFoi: + vmovdqu8 xmm1,XMMWORD[rdi] + add rdi,16 + vbroadcasti32x4 ymm0,YMMWORD[rcx] + vpternlogq ymm1,ymm9,ymm0,0x96 + vbroadcasti32x4 ymm0,YMMWORD[16+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[32+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[48+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[64+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[80+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[96+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[112+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[128+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[144+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[160+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[176+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[192+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[208+rcx] + DB 98,242,117,40,220,200 + vbroadcasti32x4 ymm0,YMMWORD[224+rcx] + DB 98,242,117,40,221,200 + vpxorq ymm1,ymm1,ymm9 + vmovdqu8 XMMWORD[rsi],xmm1 + add rsi,16 - vpxorq zmm1,zmm1,zmm9 - vpxorq 
zmm2,zmm2,zmm10 + vmovdqa xmm8,xmm1 + vextracti32x4 xmm0,zmm9,1 + and rdx,0xf + je NEAR $L$_ret_hEgxyDlCngwrfFe + jmp NEAR $L$_steal_cipher_hEgxyDlCngwrfFe +global aes_hw_xts_decrypt_avx512 - vbroadcasti32x4 zmm0,ZMMWORD[128+rsp] - vpxorq zmm1,zmm1,zmm0 - vpxorq zmm2,zmm2,zmm0 - vbroadcasti32x4 zmm0,ZMMWORD[144+rsp] + +ALIGN 32 +aes_hw_xts_decrypt_avx512: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes_hw_xts_decrypt_avx512: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +DB 243,15,30,250 + push rbp + mov rbp,rsp + sub rsp,312 + and rsp,0xffffffffffffffc0 + mov QWORD[288+rsp],rbx + mov QWORD[((288 + 8))+rsp],rdi + mov QWORD[((288 + 16))+rsp],rsi + vmovdqa XMMWORD[(128 + 0)+rsp],xmm6 + vmovdqa XMMWORD[(128 + 16)+rsp],xmm7 + vmovdqa XMMWORD[(128 + 32)+rsp],xmm8 + vmovdqa XMMWORD[(128 + 48)+rsp],xmm9 + vmovdqa XMMWORD[(128 + 64)+rsp],xmm10 + vmovdqa XMMWORD[(128 + 80)+rsp],xmm11 + vmovdqa XMMWORD[(128 + 96)+rsp],xmm12 + vmovdqa XMMWORD[(128 + 112)+rsp],xmm13 + vmovdqa XMMWORD[(128 + 128)+rsp],xmm14 + vmovdqa XMMWORD[(128 + 144)+rsp],xmm15 + mov r10,0x87 + vmovdqu xmm1,XMMWORD[r9] + vpxor xmm1,xmm1,XMMWORD[r8] + vaesenc xmm1,xmm1,XMMWORD[16+r8] + vaesenc xmm1,xmm1,XMMWORD[32+r8] + vaesenc xmm1,xmm1,XMMWORD[48+r8] + vaesenc xmm1,xmm1,XMMWORD[64+r8] + vaesenc xmm1,xmm1,XMMWORD[80+r8] + vaesenc xmm1,xmm1,XMMWORD[96+r8] + vaesenc xmm1,xmm1,XMMWORD[112+r8] + vaesenc xmm1,xmm1,XMMWORD[128+r8] + vaesenc xmm1,xmm1,XMMWORD[144+r8] + vaesenc xmm1,xmm1,XMMWORD[160+r8] + vaesenc xmm1,xmm1,XMMWORD[176+r8] + vaesenc xmm1,xmm1,XMMWORD[192+r8] + vaesenc xmm1,xmm1,XMMWORD[208+r8] + vaesenclast xmm1,xmm1,XMMWORD[224+r8] + vmovdqa XMMWORD[rsp],xmm1 + mov QWORD[((8 + 40))+rbp],rdi + mov QWORD[((8 + 48))+rbp],rsi + + + cmp rdx,0x20 + jl NEAR $L$_final_block_is_only_block_amivrujEyduiFoi + + + + + mov r11,rdx + and r11,0xfffffffffffffff0 + sub r11,16 + cmp r11,0x80 + jl NEAR $L$_less_than_128_bytes_amivrujEyduiFoi + vpbroadcastq zmm25,r10 + cmp r11,0x100 + jge NEAR $L$_start_by16_amivrujEyduiFoi + cmp r11,0x80 + jge NEAR $L$_start_by8_amivrujEyduiFoi + +$L$_do_n_blocks_amivrujEyduiFoi: + cmp r11,0x70 + je NEAR $L$_remaining_num_blocks_is_7_amivrujEyduiFoi + cmp r11,0x60 + je NEAR $L$_remaining_num_blocks_is_6_amivrujEyduiFoi + cmp r11,0x50 + je NEAR $L$_remaining_num_blocks_is_5_amivrujEyduiFoi + cmp r11,0x40 + je NEAR $L$_remaining_num_blocks_is_4_amivrujEyduiFoi + cmp r11,0x30 + je NEAR $L$_remaining_num_blocks_is_3_amivrujEyduiFoi + cmp r11,0x20 + je NEAR $L$_remaining_num_blocks_is_2_amivrujEyduiFoi + cmp r11,0x10 + je NEAR $L$_remaining_num_blocks_is_1_amivrujEyduiFoi + and rdx,0xf + je NEAR $L$_final_block_amivrujEyduiFoi + vextracti32x4 xmm0,zmm9,0x0 + vextracti32x4 xmm15,zmm9,0x1 + jmp NEAR $L$_steal_cipher_amivrujEyduiFoi + +$L$_remaining_num_blocks_is_7_amivrujEyduiFoi: + mov r8,0x0000ffffffffffff + kmovq k1,r8 + vmovdqu8 zmm1,ZMMWORD[rdi] + vmovdqu8 zmm2{k1},[64+rdi] + add rdi,0x70 + vbroadcasti32x4 zmm0,ZMMWORD[rcx] + vpternlogq zmm1,zmm9,zmm0,0x96 + vpternlogq zmm2,zmm10,zmm0,0x96 + vbroadcasti32x4 zmm0,ZMMWORD[16+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[160+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[32+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[176+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[48+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[192+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[64+rcx] 
DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[208+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[80+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[224+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[96+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[240+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[112+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[256+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[128+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[272+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[144+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[288+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[160+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[304+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[176+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[320+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[192+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[336+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[208+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[352+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[224+rcx] DB 98,242,117,72,223,200 DB 98,242,109,72,223,208 - - vpxorq zmm1,zmm1,zmm9 vpxorq zmm2,zmm2,zmm10 - - - vmovdqa32 zmm9,zmm15 - vmovdqa32 zmm10,zmm16 - vmovdqu8 ZMMWORD[rdx],zmm1 - vmovdqu8 ZMMWORD[64+rdx]{k1},zmm2 - jmp NEAR $L$_ret_amivrujEyduiFoi + vmovdqu8 ZMMWORD[rsi],zmm1 + vmovdqu8 ZMMWORD[64+rsi]{k1},zmm2 + add rsi,0x70 + vextracti32x4 xmm0,zmm10,0x3 + and rdx,0xf + je NEAR $L$_final_block_amivrujEyduiFoi + vpsrldq zmm13,zmm9,0xf + DB 98,19,21,72,68,241,0 + vpslldq zmm11,zmm9,0x1 + vpxord zmm11,zmm11,zmm14 + vextracti32x4 xmm15,zmm11,0x0 + jmp NEAR $L$_steal_cipher_amivrujEyduiFoi $L$_remaining_num_blocks_is_6_amivrujEyduiFoi: - vmovdqu8 zmm1,ZMMWORD[rcx] - vmovdqu8 ymm2,YMMWORD[64+rcx] - add rcx,0x60 - and r8,0xf - je NEAR $L$_done_6_remain_amivrujEyduiFoi - vextracti32x4 xmm12,zmm10,0x1 - vextracti32x4 xmm13,zmm10,0x2 - vinserti32x4 zmm10,zmm10,xmm13,0x1 - - vpxorq zmm1,zmm1,zmm9 - vpxorq zmm2,zmm2,zmm10 - - - vbroadcasti32x4 zmm0,ZMMWORD[128+rsp] - vpxorq zmm1,zmm1,zmm0 - vpxorq zmm2,zmm2,zmm0 - vbroadcasti32x4 zmm0,ZMMWORD[144+rsp] + vmovdqu8 zmm1,ZMMWORD[rdi] + vmovdqu8 ymm2,YMMWORD[64+rdi] + add rdi,0x60 + vbroadcasti32x4 zmm0,ZMMWORD[rcx] + vpternlogq zmm1,zmm9,zmm0,0x96 + vpternlogq zmm2,zmm10,zmm0,0x96 + vbroadcasti32x4 zmm0,ZMMWORD[16+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[160+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[32+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[176+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[48+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[192+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[64+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[208+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[80+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[224+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[96+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[240+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[112+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[256+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[128+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 
zmm0,ZMMWORD[272+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[144+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[288+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[160+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[304+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[176+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[320+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[192+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[336+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[208+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[352+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[224+rcx] DB 98,242,117,72,223,200 DB 98,242,109,72,223,208 - - vpxorq zmm1,zmm1,zmm9 vpxorq zmm2,zmm2,zmm10 - - - vmovdqa32 zmm9,zmm15 - vmovdqa32 zmm10,zmm16 - vmovdqu8 ZMMWORD[rdx],zmm1 - vmovdqu8 YMMWORD[64+rdx],ymm2 - add rdx,0x60 - vextracti32x4 xmm8,zmm2,0x1 - vmovdqa xmm0,xmm12 + vmovdqu8 ZMMWORD[rsi],zmm1 + vmovdqu8 YMMWORD[64+rsi],ymm2 + add rsi,0x60 + vextracti32x4 xmm0,zmm10,0x2 + vextracti32x4 xmm15,zmm10,0x3 + and rdx,0xf + je NEAR $L$_final_block_amivrujEyduiFoi jmp NEAR $L$_steal_cipher_amivrujEyduiFoi -$L$_done_6_remain_amivrujEyduiFoi: - - vpxorq zmm1,zmm1,zmm9 - vpxorq zmm2,zmm2,zmm10 - - - vbroadcasti32x4 zmm0,ZMMWORD[128+rsp] - vpxorq zmm1,zmm1,zmm0 - vpxorq zmm2,zmm2,zmm0 - vbroadcasti32x4 zmm0,ZMMWORD[144+rsp] +$L$_remaining_num_blocks_is_5_amivrujEyduiFoi: + vmovdqu8 zmm1,ZMMWORD[rdi] + vmovdqu xmm2,XMMWORD[64+rdi] + add rdi,0x50 + vbroadcasti32x4 zmm0,ZMMWORD[rcx] + vpternlogq zmm1,zmm9,zmm0,0x96 + vpternlogq zmm2,zmm10,zmm0,0x96 + vbroadcasti32x4 zmm0,ZMMWORD[16+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[160+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[32+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[176+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[48+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[192+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[64+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[208+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[80+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[224+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[96+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[240+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[112+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[256+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[128+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[272+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[144+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[288+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[160+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[304+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[176+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[320+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[192+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[336+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[208+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[352+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[224+rcx] DB 98,242,117,72,223,200 DB 98,242,109,72,223,208 - - - vpxorq zmm1,zmm1,zmm9 - vpxorq zmm2,zmm2,zmm10 - - - vmovdqa32 zmm9,zmm15 - vmovdqa32 zmm10,zmm16 - 
vmovdqu8 ZMMWORD[rdx],zmm1 - vmovdqu8 YMMWORD[64+rdx],ymm2 - jmp NEAR $L$_ret_amivrujEyduiFoi - -$L$_remaining_num_blocks_is_5_amivrujEyduiFoi: - vmovdqu8 zmm1,ZMMWORD[rcx] - vmovdqu xmm2,XMMWORD[64+rcx] - add rcx,0x50 - and r8,0xf - je NEAR $L$_done_5_remain_amivrujEyduiFoi - vmovdqa xmm12,xmm10 - vextracti32x4 xmm10,zmm10,0x1 - vpxorq zmm1,zmm1,zmm9 vpxorq zmm2,zmm2,zmm10 + vmovdqu8 ZMMWORD[rsi],zmm1 + vmovdqu XMMWORD[64+rsi],xmm2 + add rsi,0x50 + vextracti32x4 xmm0,zmm10,0x1 + vextracti32x4 xmm15,zmm10,0x2 + and rdx,0xf + je NEAR $L$_final_block_amivrujEyduiFoi + jmp NEAR $L$_steal_cipher_amivrujEyduiFoi - - vbroadcasti32x4 zmm0,ZMMWORD[128+rsp] - vpxorq zmm1,zmm1,zmm0 - vpxorq zmm2,zmm2,zmm0 - vbroadcasti32x4 zmm0,ZMMWORD[144+rsp] +$L$_remaining_num_blocks_is_4_amivrujEyduiFoi: + vmovdqu8 zmm1,ZMMWORD[rdi] + add rdi,0x40 + vbroadcasti32x4 zmm0,ZMMWORD[rcx] + vpternlogq zmm1,zmm9,zmm0,0x96 + vbroadcasti32x4 zmm0,ZMMWORD[16+rcx] DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[160+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[32+rcx] DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[176+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[48+rcx] DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 - - vbroadcasti32x4 zmm0,ZMMWORD[192+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[64+rcx] DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[208+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[80+rcx] DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[224+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[96+rcx] DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[240+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[112+rcx] DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[256+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[128+rcx] DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[272+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[144+rcx] DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[288+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[160+rcx] DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[304+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[176+rcx] DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[320+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[192+rcx] DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[336+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[208+rcx] DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[352+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[224+rcx] DB 98,242,117,72,223,200 - DB 98,242,109,72,223,208 - - vpxorq zmm1,zmm1,zmm9 - vpxorq zmm2,zmm2,zmm10 - - - vmovdqa32 zmm9,zmm15 - vmovdqa32 zmm10,zmm16 - vmovdqu8 ZMMWORD[rdx],zmm1 - vmovdqu XMMWORD[64+rdx],xmm2 - add rdx,0x50 - vmovdqa xmm8,xmm2 - vmovdqa xmm0,xmm12 + vmovdqu8 ZMMWORD[rsi],zmm1 + add rsi,0x40 + vextracti32x4 xmm0,zmm10,0x0 + vextracti32x4 xmm15,zmm10,0x1 + and rdx,0xf + je NEAR $L$_final_block_amivrujEyduiFoi jmp NEAR $L$_steal_cipher_amivrujEyduiFoi - -$L$_done_5_remain_amivrujEyduiFoi: - - vpxorq zmm1,zmm1,zmm9 - vpxorq zmm2,zmm2,zmm10 - - - vbroadcasti32x4 zmm0,ZMMWORD[128+rsp] - vpxorq zmm1,zmm1,zmm0 - vpxorq zmm2,zmm2,zmm0 - vbroadcasti32x4 zmm0,ZMMWORD[144+rsp] +$L$_remaining_num_blocks_is_3_amivrujEyduiFoi: + mov r8,-1 + shr r8,0x10 + kmovq k1,r8 + vmovdqu8 zmm1{k1},[rdi] + add rdi,0x30 + vbroadcasti32x4 
zmm0,ZMMWORD[rcx] + vpternlogq zmm1,zmm9,zmm0,0x96 + vbroadcasti32x4 zmm0,ZMMWORD[16+rcx] DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[160+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[32+rcx] DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[176+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[48+rcx] DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 - - vbroadcasti32x4 zmm0,ZMMWORD[192+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[64+rcx] DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[208+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[80+rcx] DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[224+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[96+rcx] DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[240+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[112+rcx] DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[256+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[128+rcx] DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[272+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[144+rcx] DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[288+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[160+rcx] DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[304+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[176+rcx] DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[320+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[192+rcx] DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 + vbroadcasti32x4 zmm0,ZMMWORD[208+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[224+rcx] + DB 98,242,117,72,223,200 + vpxorq zmm1,zmm1,zmm9 + vmovdqu8 ZMMWORD[rsi]{k1},zmm1 + add rsi,0x30 + vextracti32x4 xmm0,zmm9,0x3 + vextracti32x4 xmm15,zmm10,0x0 + and rdx,0xf + je NEAR $L$_final_block_amivrujEyduiFoi + jmp NEAR $L$_steal_cipher_amivrujEyduiFoi +$L$_remaining_num_blocks_is_2_amivrujEyduiFoi: + vmovdqu8 ymm1,YMMWORD[rdi] + add rdi,0x20 + vbroadcasti32x4 ymm0,YMMWORD[rcx] + vpternlogq ymm1,ymm9,ymm0,0x96 + vbroadcasti32x4 ymm0,YMMWORD[16+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[32+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[48+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[64+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[80+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[96+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[112+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[128+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[144+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[160+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[176+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[192+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[208+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[224+rcx] + DB 98,242,117,40,223,200 + vpxorq ymm1,ymm1,ymm9 + vmovdqu YMMWORD[rsi],ymm1 + add rsi,0x20 + vextracti32x4 xmm0,zmm9,0x2 + vextracti32x4 xmm15,zmm9,0x3 + and rdx,0xf + je NEAR $L$_final_block_amivrujEyduiFoi + jmp NEAR $L$_steal_cipher_amivrujEyduiFoi +$L$_remaining_num_blocks_is_1_amivrujEyduiFoi: + vmovdqu xmm1,XMMWORD[rdi] + add rdi,0x10 + vpxor xmm1,xmm1,xmm9 + vpxor xmm1,xmm1,XMMWORD[rcx] + vaesdec xmm1,xmm1,XMMWORD[16+rcx] + vaesdec xmm1,xmm1,XMMWORD[32+rcx] + vaesdec 
xmm1,xmm1,XMMWORD[48+rcx] + vaesdec xmm1,xmm1,XMMWORD[64+rcx] + vaesdec xmm1,xmm1,XMMWORD[80+rcx] + vaesdec xmm1,xmm1,XMMWORD[96+rcx] + vaesdec xmm1,xmm1,XMMWORD[112+rcx] + vaesdec xmm1,xmm1,XMMWORD[128+rcx] + vaesdec xmm1,xmm1,XMMWORD[144+rcx] + vaesdec xmm1,xmm1,XMMWORD[160+rcx] + vaesdec xmm1,xmm1,XMMWORD[176+rcx] + vaesdec xmm1,xmm1,XMMWORD[192+rcx] + vaesdec xmm1,xmm1,XMMWORD[208+rcx] + vaesdeclast xmm1,xmm1,XMMWORD[224+rcx] + vpxor xmm1,xmm1,xmm9 + vmovdqu XMMWORD[rsi],xmm1 + add rsi,0x10 + vextracti32x4 xmm0,zmm9,0x1 + vextracti32x4 xmm15,zmm9,0x2 + and rdx,0xf + je NEAR $L$_final_block_amivrujEyduiFoi + jmp NEAR $L$_steal_cipher_amivrujEyduiFoi - vbroadcasti32x4 zmm0,ZMMWORD[336+rsp] - DB 98,242,117,72,222,200 - DB 98,242,109,72,222,208 +$L$_start_by16_amivrujEyduiFoi: + vbroadcasti32x4 zmm0,ZMMWORD[rsp] + vbroadcasti32x4 zmm8,ZMMWORD[shufb_15_7] + mov r8,0xaa + kmovq k2,r8 + vpshufb zmm1,zmm0,zmm8 - vbroadcasti32x4 zmm0,ZMMWORD[352+rsp] - DB 98,242,117,72,223,200 - DB 98,242,109,72,223,208 + vpsllvq zmm4,zmm0,ZMMWORD[const_dq3210] + vpsrlvq zmm2,zmm1,ZMMWORD[const_dq5678] + DB 98,147,109,72,68,217,0 + vpxorq zmm4{k2},zmm4,zmm2 + vpxord zmm9,zmm3,zmm4 - vpxorq zmm1,zmm1,zmm9 - vpxorq zmm2,zmm2,zmm10 + vpsllvq zmm5,zmm0,ZMMWORD[const_dq7654] + vpsrlvq zmm6,zmm1,ZMMWORD[const_dq1234] + DB 98,147,77,72,68,249,0 + vpxorq zmm5{k2},zmm5,zmm6 + vpxord zmm10,zmm7,zmm5 - vmovdqa32 zmm9,zmm15 - vmovdqa32 zmm10,zmm16 - vmovdqu8 ZMMWORD[rdx],zmm1 - vmovdqu8 XMMWORD[64+rdx],xmm2 - jmp NEAR $L$_ret_amivrujEyduiFoi + vpsrldq zmm13,zmm9,0xf + DB 98,19,21,72,68,241,0 + vpslldq zmm11,zmm9,0x1 + vpxord zmm11,zmm11,zmm14 -$L$_remaining_num_blocks_is_4_amivrujEyduiFoi: - vmovdqu8 zmm1,ZMMWORD[rcx] - add rcx,0x40 - and r8,0xf - je NEAR $L$_done_4_remain_amivrujEyduiFoi - vextracti32x4 xmm12,zmm9,0x3 - vinserti32x4 zmm9,zmm9,xmm10,0x3 + vpsrldq zmm15,zmm10,0xf + DB 98,131,5,72,68,193,0 + vpslldq zmm12,zmm10,0x1 + vpxord zmm12,zmm12,zmm16 + +$L$_main_loop_run_16_amivrujEyduiFoi: + vmovdqu8 zmm1,ZMMWORD[rdi] + vmovdqu8 zmm2,ZMMWORD[64+rdi] + vmovdqu8 zmm3,ZMMWORD[128+rdi] + vmovdqu8 zmm4,ZMMWORD[192+rdi] + add rdi,0x100 vpxorq zmm1,zmm1,zmm9 vpxorq zmm2,zmm2,zmm10 - - - vbroadcasti32x4 zmm0,ZMMWORD[128+rsp] + vpxorq zmm3,zmm3,zmm11 + vpxorq zmm4,zmm4,zmm12 + vbroadcasti32x4 zmm0,ZMMWORD[rcx] vpxorq zmm1,zmm1,zmm0 vpxorq zmm2,zmm2,zmm0 - vbroadcasti32x4 zmm0,ZMMWORD[144+rsp] + vpxorq zmm3,zmm3,zmm0 + vpxorq zmm4,zmm4,zmm0 + vpsrldq zmm13,zmm11,0xf + DB 98,19,21,72,68,241,0 + vpslldq zmm15,zmm11,0x1 + vpxord zmm15,zmm15,zmm14 + vbroadcasti32x4 zmm0,ZMMWORD[16+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[160+rsp] + DB 98,242,101,72,222,216 + DB 98,242,93,72,222,224 + vbroadcasti32x4 zmm0,ZMMWORD[32+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[176+rsp] + DB 98,242,101,72,222,216 + DB 98,242,93,72,222,224 + vbroadcasti32x4 zmm0,ZMMWORD[48+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - - vbroadcasti32x4 zmm0,ZMMWORD[192+rsp] + DB 98,242,101,72,222,216 + DB 98,242,93,72,222,224 + vpsrldq zmm13,zmm12,0xf + DB 98,19,21,72,68,241,0 + vpslldq zmm16,zmm12,0x1 + vpxord zmm16,zmm16,zmm14 + vbroadcasti32x4 zmm0,ZMMWORD[64+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[208+rsp] + DB 98,242,101,72,222,216 + DB 98,242,93,72,222,224 + vbroadcasti32x4 zmm0,ZMMWORD[80+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[224+rsp] + DB 
98,242,101,72,222,216 + DB 98,242,93,72,222,224 + vbroadcasti32x4 zmm0,ZMMWORD[96+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[240+rsp] + DB 98,242,101,72,222,216 + DB 98,242,93,72,222,224 + vpsrldq zmm13,zmm15,0xf + DB 98,19,21,72,68,241,0 + vpslldq zmm17,zmm15,0x1 + vpxord zmm17,zmm17,zmm14 + vbroadcasti32x4 zmm0,ZMMWORD[112+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[256+rsp] + DB 98,242,101,72,222,216 + DB 98,242,93,72,222,224 + vbroadcasti32x4 zmm0,ZMMWORD[128+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[272+rsp] + DB 98,242,101,72,222,216 + DB 98,242,93,72,222,224 + vbroadcasti32x4 zmm0,ZMMWORD[144+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[288+rsp] + DB 98,242,101,72,222,216 + DB 98,242,93,72,222,224 + vpsrldq zmm13,zmm16,0xf + DB 98,19,21,72,68,241,0 + vpslldq zmm18,zmm16,0x1 + vpxord zmm18,zmm18,zmm14 + vbroadcasti32x4 zmm0,ZMMWORD[160+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[304+rsp] + DB 98,242,101,72,222,216 + DB 98,242,93,72,222,224 + vbroadcasti32x4 zmm0,ZMMWORD[176+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[320+rsp] + DB 98,242,101,72,222,216 + DB 98,242,93,72,222,224 + vbroadcasti32x4 zmm0,ZMMWORD[192+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[336+rsp] + DB 98,242,101,72,222,216 + DB 98,242,93,72,222,224 + vbroadcasti32x4 zmm0,ZMMWORD[208+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - - - vbroadcasti32x4 zmm0,ZMMWORD[352+rsp] + DB 98,242,101,72,222,216 + DB 98,242,93,72,222,224 + vbroadcasti32x4 zmm0,ZMMWORD[224+rcx] DB 98,242,117,72,223,200 DB 98,242,109,72,223,208 - - + DB 98,242,101,72,223,216 + DB 98,242,93,72,223,224 vpxorq zmm1,zmm1,zmm9 vpxorq zmm2,zmm2,zmm10 - + vpxorq zmm3,zmm3,zmm11 + vpxorq zmm4,zmm4,zmm12 vmovdqa32 zmm9,zmm15 vmovdqa32 zmm10,zmm16 - vmovdqu8 ZMMWORD[rdx],zmm1 - add rdx,0x40 - vextracti32x4 xmm8,zmm1,0x3 - vmovdqa xmm0,xmm12 - jmp NEAR $L$_steal_cipher_amivrujEyduiFoi - -$L$_done_4_remain_amivrujEyduiFoi: - - vpxorq zmm1,zmm1,zmm9 - vpxorq zmm2,zmm2,zmm10 + vmovdqa32 zmm11,zmm17 + vmovdqa32 zmm12,zmm18 + vmovdqu8 ZMMWORD[rsi],zmm1 + vmovdqu8 ZMMWORD[64+rsi],zmm2 + vmovdqu8 ZMMWORD[128+rsi],zmm3 + vmovdqu8 ZMMWORD[192+rsi],zmm4 + add rsi,0x100 + sub r11,0x100 + cmp r11,0x100 + jae NEAR $L$_main_loop_run_16_amivrujEyduiFoi + cmp r11,0x80 + jae NEAR $L$_main_loop_run_8_amivrujEyduiFoi + jmp NEAR $L$_do_n_blocks_amivrujEyduiFoi +$L$_start_by8_amivrujEyduiFoi: + vbroadcasti32x4 zmm0,ZMMWORD[rsp] + vbroadcasti32x4 zmm8,ZMMWORD[shufb_15_7] + mov r8,0xaa + kmovq k2,r8 + vpshufb zmm1,zmm0,zmm8 + vpsllvq zmm4,zmm0,ZMMWORD[const_dq3210] + vpsrlvq zmm2,zmm1,ZMMWORD[const_dq5678] + DB 98,147,109,72,68,217,0 + vpxorq zmm4{k2},zmm4,zmm2 + vpxord zmm9,zmm3,zmm4 + vpsllvq zmm5,zmm0,ZMMWORD[const_dq7654] + vpsrlvq zmm6,zmm1,ZMMWORD[const_dq1234] + DB 98,147,77,72,68,249,0 + vpxorq zmm5{k2},zmm5,zmm6 + vpxord zmm10,zmm7,zmm5 - vbroadcasti32x4 zmm0,ZMMWORD[128+rsp] - vpxorq zmm1,zmm1,zmm0 - vpxorq zmm2,zmm2,zmm0 - vbroadcasti32x4 zmm0,ZMMWORD[144+rsp] +$L$_main_loop_run_8_amivrujEyduiFoi: + vmovdqu8 zmm1,ZMMWORD[rdi] + vmovdqu8 zmm2,ZMMWORD[64+rdi] + add rdi,0x80 + vbroadcasti32x4 zmm0,ZMMWORD[rcx] + vpternlogq zmm1,zmm9,zmm0,0x96 + vpternlogq zmm2,zmm10,zmm0,0x96 + vpsrldq zmm13,zmm9,0xf + DB 98,19,21,72,68,241,0 + vpslldq 
zmm15,zmm9,0x1 + vpxord zmm15,zmm15,zmm14 + vbroadcasti32x4 zmm0,ZMMWORD[16+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[160+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[32+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[176+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[48+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 + vpsrldq zmm13,zmm10,0xf + DB 98,19,21,72,68,241,0 + vpslldq zmm16,zmm10,0x1 + vpxord zmm16,zmm16,zmm14 - vbroadcasti32x4 zmm0,ZMMWORD[192+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[64+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[208+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[80+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[224+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[96+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[240+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[112+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[256+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[128+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[272+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[144+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[288+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[160+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[304+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[176+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[320+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[192+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[336+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[208+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[352+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[224+rcx] DB 98,242,117,72,223,200 DB 98,242,109,72,223,208 - - vpxorq zmm1,zmm1,zmm9 vpxorq zmm2,zmm2,zmm10 - - vmovdqa32 zmm9,zmm15 vmovdqa32 zmm10,zmm16 - vmovdqu8 ZMMWORD[rdx],zmm1 - jmp NEAR $L$_ret_amivrujEyduiFoi + vmovdqu8 ZMMWORD[rsi],zmm1 + vmovdqu8 ZMMWORD[64+rsi],zmm2 + add rsi,0x80 + sub r11,0x80 + cmp r11,0x80 + jae NEAR $L$_main_loop_run_8_amivrujEyduiFoi + vextracti32x4 xmm0,zmm9,0x0 + vextracti32x4 xmm15,zmm9,0x1 + jmp NEAR $L$_do_n_blocks_amivrujEyduiFoi -$L$_remaining_num_blocks_is_3_amivrujEyduiFoi: - vmovdqu xmm1,XMMWORD[rcx] - vmovdqu xmm2,XMMWORD[16+rcx] - vmovdqu xmm3,XMMWORD[32+rcx] - add rcx,0x30 - and r8,0xf - je NEAR $L$_done_3_remain_amivrujEyduiFoi - vextracti32x4 xmm13,zmm9,0x2 - vextracti32x4 xmm10,zmm9,0x1 - vextracti32x4 xmm11,zmm9,0x3 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vpxor xmm2,xmm2,xmm0 - vpxor xmm3,xmm3,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,222,200 - DB 
98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,223,200 - DB 98,242,109,8,223,208 - DB 98,242,101,8,223,216 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vmovdqu XMMWORD[rdx],xmm1 - vmovdqu XMMWORD[16+rdx],xmm2 - vmovdqu XMMWORD[32+rdx],xmm3 - add rdx,0x30 - vmovdqa xmm8,xmm3 - vmovdqa xmm0,xmm13 - jmp NEAR $L$_steal_cipher_amivrujEyduiFoi +$L$_steal_cipher_with_tweak_amivrujEyduiFoi: -$L$_done_3_remain_amivrujEyduiFoi: - vextracti32x4 xmm10,zmm9,0x1 - vextracti32x4 xmm11,zmm9,0x2 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vpxor xmm2,xmm2,xmm0 - vpxor xmm3,xmm3,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,223,200 - DB 98,242,109,8,223,208 - DB 98,242,101,8,223,216 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vmovdqu XMMWORD[rdx],xmm1 - vmovdqu XMMWORD[16+rdx],xmm2 - vmovdqu XMMWORD[32+rdx],xmm3 - jmp NEAR $L$_ret_amivrujEyduiFoi + vmovdqa xmm11,XMMWORD[shufb_15_7] + vpshufb xmm12,xmm0,xmm11 + vpsllq xmm13,xmm0,0x1 + vpsrlq xmm14,xmm12,0x7 + DB 98,19,13,8,68,249,0 + vpxord xmm15,xmm15,xmm13 -$L$_remaining_num_blocks_is_2_amivrujEyduiFoi: - vmovdqu xmm1,XMMWORD[rcx] - vmovdqu xmm2,XMMWORD[16+rcx] - add rcx,0x20 - and r8,0xf - je NEAR $L$_done_2_remain_amivrujEyduiFoi - vextracti32x4 xmm10,zmm9,0x2 - vextracti32x4 xmm12,zmm9,0x1 - vpxor 
xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vpxor xmm2,xmm2,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,223,200 - DB 98,242,109,8,223,208 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vmovdqu XMMWORD[rdx],xmm1 - vmovdqu XMMWORD[16+rdx],xmm2 - add rdx,0x20 - vmovdqa xmm8,xmm2 - vmovdqa xmm0,xmm12 - jmp NEAR $L$_steal_cipher_amivrujEyduiFoi +$L$_steal_cipher_amivrujEyduiFoi: -$L$_done_2_remain_amivrujEyduiFoi: - vextracti32x4 xmm10,zmm9,0x1 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vpxor xmm2,xmm2,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,223,200 - DB 98,242,109,8,223,208 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vmovdqu XMMWORD[rdx],xmm1 - vmovdqu XMMWORD[16+rdx],xmm2 + vmovdqu xmm8,XMMWORD[rdi] + vpxor xmm8,xmm8,xmm15 + vpxor xmm8,xmm8,XMMWORD[rcx] + vaesdec xmm8,xmm8,XMMWORD[16+rcx] + vaesdec xmm8,xmm8,XMMWORD[32+rcx] + vaesdec xmm8,xmm8,XMMWORD[48+rcx] + vaesdec xmm8,xmm8,XMMWORD[64+rcx] + vaesdec xmm8,xmm8,XMMWORD[80+rcx] + vaesdec xmm8,xmm8,XMMWORD[96+rcx] + vaesdec xmm8,xmm8,XMMWORD[112+rcx] + vaesdec xmm8,xmm8,XMMWORD[128+rcx] + vaesdec xmm8,xmm8,XMMWORD[144+rcx] + vaesdec xmm8,xmm8,XMMWORD[160+rcx] + vaesdec xmm8,xmm8,XMMWORD[176+rcx] + vaesdec xmm8,xmm8,XMMWORD[192+rcx] + vaesdec 
xmm8,xmm8,XMMWORD[208+rcx] + vaesdeclast xmm8,xmm8,XMMWORD[224+rcx] + vpxor xmm8,xmm8,xmm15 + + + + + mov r11,1 + mov r8,rcx + mov rcx,rdx + shl r11,cl + sub r11,1 + kmovq k1,r11 + vmovdqu8 xmm9{k1}{z},[16+rdi] + vmovdqu8 xmm10{k1}{z},xmm8 + vpblendmb xmm9{k1},xmm8,xmm9 + + + mov rcx,r8 + vpxor xmm9,xmm9,xmm0 + vpxor xmm9,xmm9,XMMWORD[rcx] + vaesdec xmm9,xmm9,XMMWORD[16+rcx] + vaesdec xmm9,xmm9,XMMWORD[32+rcx] + vaesdec xmm9,xmm9,XMMWORD[48+rcx] + vaesdec xmm9,xmm9,XMMWORD[64+rcx] + vaesdec xmm9,xmm9,XMMWORD[80+rcx] + vaesdec xmm9,xmm9,XMMWORD[96+rcx] + vaesdec xmm9,xmm9,XMMWORD[112+rcx] + vaesdec xmm9,xmm9,XMMWORD[128+rcx] + vaesdec xmm9,xmm9,XMMWORD[144+rcx] + vaesdec xmm9,xmm9,XMMWORD[160+rcx] + vaesdec xmm9,xmm9,XMMWORD[176+rcx] + vaesdec xmm9,xmm9,XMMWORD[192+rcx] + vaesdec xmm9,xmm9,XMMWORD[208+rcx] + vaesdeclast xmm9,xmm9,XMMWORD[224+rcx] + vpxor xmm9,xmm9,xmm0 + + + + vmovdqu XMMWORD[rsi],xmm9 + vmovdqu8 XMMWORD[16+rsi]{k1},xmm10 jmp NEAR $L$_ret_amivrujEyduiFoi -$L$_remaining_num_blocks_is_1_amivrujEyduiFoi: - vmovdqu xmm1,XMMWORD[rcx] - add rcx,0x10 - and r8,0xf - je NEAR $L$_done_1_remain_amivrujEyduiFoi - vextracti32x4 xmm11,zmm9,0x1 - vpxor xmm1,xmm1,xmm11 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,223,200 - vpxor xmm1,xmm1,xmm11 - vmovdqu XMMWORD[rdx],xmm1 - add rdx,0x10 - vmovdqa xmm8,xmm1 - vmovdqa xmm0,xmm9 - jmp NEAR $L$_steal_cipher_amivrujEyduiFoi +$L$_final_block_is_only_block_amivrujEyduiFoi: + vmovdqa xmm0,XMMWORD[rsp] + and rdx,0xf + jne NEAR $L$_steal_cipher_with_tweak_amivrujEyduiFoi -$L$_done_1_remain_amivrujEyduiFoi: - vpxor xmm1,xmm1,xmm9 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,223,200 - vpxor xmm1,xmm1,xmm9 - vmovdqu XMMWORD[rdx],xmm1 - jmp NEAR $L$_ret_amivrujEyduiFoi +$L$_final_block_amivrujEyduiFoi: + vmovdqa xmm8,XMMWORD[rdi] + vpxor xmm8,xmm8,xmm0 + vpxor xmm8,xmm8,XMMWORD[rcx] + vaesdec xmm8,xmm8,XMMWORD[16+rcx] + vaesdec xmm8,xmm8,XMMWORD[32+rcx] + 
vaesdec xmm8,xmm8,XMMWORD[48+rcx] + vaesdec xmm8,xmm8,XMMWORD[64+rcx] + vaesdec xmm8,xmm8,XMMWORD[80+rcx] + vaesdec xmm8,xmm8,XMMWORD[96+rcx] + vaesdec xmm8,xmm8,XMMWORD[112+rcx] + vaesdec xmm8,xmm8,XMMWORD[128+rcx] + vaesdec xmm8,xmm8,XMMWORD[144+rcx] + vaesdec xmm8,xmm8,XMMWORD[160+rcx] + vaesdec xmm8,xmm8,XMMWORD[176+rcx] + vaesdec xmm8,xmm8,XMMWORD[192+rcx] + vaesdec xmm8,xmm8,XMMWORD[208+rcx] + vaesdeclast xmm8,xmm8,XMMWORD[224+rcx] + vpxor xmm8,xmm8,xmm0 + vmovdqa XMMWORD[rsi],xmm8 -$L$_start_by16_amivrujEyduiFoi: +$L$_ret_amivrujEyduiFoi: + mov rbx,QWORD[288+rsp] + xor r8,r8 + mov QWORD[288+rsp],r8 + vpxorq zmm0,zmm0,zmm0 + mov rdi,QWORD[((288 + 8))+rsp] + mov QWORD[((288 + 8))+rsp],r8 + mov rsi,QWORD[((288 + 16))+rsp] + mov QWORD[((288 + 16))+rsp],r8 + + vmovdqa xmm6,XMMWORD[((128 + 0))+rsp] + vmovdqa xmm7,XMMWORD[((128 + 16))+rsp] + vmovdqa xmm8,XMMWORD[((128 + 32))+rsp] + vmovdqa xmm9,XMMWORD[((128 + 48))+rsp] + + + vmovdqa64 ZMMWORD[128+rsp],zmm0 + + vmovdqa xmm10,XMMWORD[((128 + 64))+rsp] + vmovdqa xmm11,XMMWORD[((128 + 80))+rsp] + vmovdqa xmm12,XMMWORD[((128 + 96))+rsp] + vmovdqa xmm13,XMMWORD[((128 + 112))+rsp] + + + vmovdqa64 ZMMWORD[(128 + 64)+rsp],zmm0 + + vmovdqa xmm14,XMMWORD[((128 + 128))+rsp] + vmovdqa xmm15,XMMWORD[((128 + 144))+rsp] + + + + vmovdqa YMMWORD[(128 + 128)+rsp],ymm0 + mov rsp,rbp + pop rbp + vzeroupper + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$_less_than_128_bytes_amivrujEyduiFoi: + vpbroadcastq zmm25,r10 + cmp r11,0x10 + jb NEAR $L$_ret_amivrujEyduiFoi vbroadcasti32x4 zmm0,ZMMWORD[rsp] vbroadcasti32x4 zmm8,ZMMWORD[shufb_15_7] - mov r10,0xaa - kmovq k2,r10 - + mov r8d,0xaa + kmovq k2,r8 + mov r8,r11 + and r8,0x70 + cmp r8,0x60 + je NEAR $L$_num_blocks_is_6_amivrujEyduiFoi + cmp r8,0x50 + je NEAR $L$_num_blocks_is_5_amivrujEyduiFoi + cmp r8,0x40 + je NEAR $L$_num_blocks_is_4_amivrujEyduiFoi + cmp r8,0x30 + je NEAR $L$_num_blocks_is_3_amivrujEyduiFoi + cmp r8,0x20 + je NEAR $L$_num_blocks_is_2_amivrujEyduiFoi + cmp r8,0x10 + je NEAR $L$_num_blocks_is_1_amivrujEyduiFoi +$L$_num_blocks_is_7_amivrujEyduiFoi: vpshufb zmm1,zmm0,zmm8 vpsllvq zmm4,zmm0,ZMMWORD[const_dq3210] vpsrlvq zmm2,zmm1,ZMMWORD[const_dq5678] DB 98,147,109,72,68,217,0 vpxorq zmm4{k2},zmm4,zmm2 vpxord zmm9,zmm3,zmm4 - - vpsllvq zmm5,zmm0,ZMMWORD[const_dq7654] vpsrlvq zmm6,zmm1,ZMMWORD[const_dq1234] DB 98,147,77,72,68,249,0 vpxorq zmm5{k2},zmm5,zmm6 vpxord zmm10,zmm7,zmm5 + mov r8,0x0000ffffffffffff + kmovq k1,r8 + vmovdqu8 zmm1,ZMMWORD[rdi] + vmovdqu8 zmm2{k1},[64+rdi] - - vpsrldq zmm13,zmm9,0xf - DB 98,19,21,72,68,241,0 - vpslldq zmm11,zmm9,0x1 - vpxord zmm11,zmm11,zmm14 - - vpsrldq zmm15,zmm10,0xf - DB 98,131,5,72,68,193,0 - vpslldq zmm12,zmm10,0x1 - vpxord zmm12,zmm12,zmm16 - -$L$_main_loop_run_16_amivrujEyduiFoi: - vmovdqu8 zmm1,ZMMWORD[rcx] - vmovdqu8 zmm2,ZMMWORD[64+rcx] - vmovdqu8 zmm3,ZMMWORD[128+rcx] - vmovdqu8 zmm4,ZMMWORD[192+rcx] - vmovdqu8 xmm5,XMMWORD[240+rcx] - add rcx,0x100 - vpxorq zmm1,zmm1,zmm9 - vpxorq zmm2,zmm2,zmm10 - vpxorq zmm3,zmm3,zmm11 - vpxorq zmm4,zmm4,zmm12 - vbroadcasti32x4 zmm0,ZMMWORD[128+rsp] - vpxorq zmm1,zmm1,zmm0 - vpxorq zmm2,zmm2,zmm0 - vpxorq zmm3,zmm3,zmm0 - vpxorq zmm4,zmm4,zmm0 - vpsrldq zmm13,zmm11,0xf - DB 98,19,21,72,68,241,0 - vpslldq zmm15,zmm11,0x1 - vpxord zmm15,zmm15,zmm14 - vbroadcasti32x4 zmm0,ZMMWORD[144+rsp] + add rdi,0x70 + vbroadcasti32x4 zmm0,ZMMWORD[rcx] + vpternlogq zmm1,zmm9,zmm0,0x96 + vpternlogq zmm2,zmm10,zmm0,0x96 + vbroadcasti32x4 zmm0,ZMMWORD[16+rcx] DB 98,242,117,72,222,200 DB 
98,242,109,72,222,208 - DB 98,242,101,72,222,216 - DB 98,242,93,72,222,224 - vbroadcasti32x4 zmm0,ZMMWORD[160+rsp] + + + vbroadcasti32x4 zmm0,ZMMWORD[32+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - DB 98,242,101,72,222,216 - DB 98,242,93,72,222,224 - vbroadcasti32x4 zmm0,ZMMWORD[176+rsp] + + + vbroadcasti32x4 zmm0,ZMMWORD[48+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - DB 98,242,101,72,222,216 - DB 98,242,93,72,222,224 - vpsrldq zmm13,zmm12,0xf - DB 98,19,21,72,68,241,0 - vpslldq zmm16,zmm12,0x1 - vpxord zmm16,zmm16,zmm14 - vbroadcasti32x4 zmm0,ZMMWORD[192+rsp] + + vbroadcasti32x4 zmm0,ZMMWORD[64+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - DB 98,242,101,72,222,216 - DB 98,242,93,72,222,224 - vbroadcasti32x4 zmm0,ZMMWORD[208+rsp] + + + vbroadcasti32x4 zmm0,ZMMWORD[80+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - DB 98,242,101,72,222,216 - DB 98,242,93,72,222,224 - vbroadcasti32x4 zmm0,ZMMWORD[224+rsp] + + + vbroadcasti32x4 zmm0,ZMMWORD[96+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - DB 98,242,101,72,222,216 - DB 98,242,93,72,222,224 - vpsrldq zmm13,zmm15,0xf - DB 98,19,21,72,68,241,0 - vpslldq zmm17,zmm15,0x1 - vpxord zmm17,zmm17,zmm14 - vbroadcasti32x4 zmm0,ZMMWORD[240+rsp] + + + vbroadcasti32x4 zmm0,ZMMWORD[112+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - DB 98,242,101,72,222,216 - DB 98,242,93,72,222,224 - vbroadcasti32x4 zmm0,ZMMWORD[256+rsp] + + + vbroadcasti32x4 zmm0,ZMMWORD[128+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - DB 98,242,101,72,222,216 - DB 98,242,93,72,222,224 - vbroadcasti32x4 zmm0,ZMMWORD[272+rsp] + + + vbroadcasti32x4 zmm0,ZMMWORD[144+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - DB 98,242,101,72,222,216 - DB 98,242,93,72,222,224 - vpsrldq zmm13,zmm16,0xf - DB 98,19,21,72,68,241,0 - vpslldq zmm18,zmm16,0x1 - vpxord zmm18,zmm18,zmm14 - vbroadcasti32x4 zmm0,ZMMWORD[288+rsp] + + + vbroadcasti32x4 zmm0,ZMMWORD[160+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - DB 98,242,101,72,222,216 - DB 98,242,93,72,222,224 - vbroadcasti32x4 zmm0,ZMMWORD[304+rsp] + + + vbroadcasti32x4 zmm0,ZMMWORD[176+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - DB 98,242,101,72,222,216 - DB 98,242,93,72,222,224 - vbroadcasti32x4 zmm0,ZMMWORD[320+rsp] + + + vbroadcasti32x4 zmm0,ZMMWORD[192+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - DB 98,242,101,72,222,216 - DB 98,242,93,72,222,224 - vbroadcasti32x4 zmm0,ZMMWORD[336+rsp] + + + vbroadcasti32x4 zmm0,ZMMWORD[208+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - DB 98,242,101,72,222,216 - DB 98,242,93,72,222,224 - vbroadcasti32x4 zmm0,ZMMWORD[352+rsp] + + + vbroadcasti32x4 zmm0,ZMMWORD[224+rcx] DB 98,242,117,72,223,200 DB 98,242,109,72,223,208 - DB 98,242,101,72,223,216 - DB 98,242,93,72,223,224 vpxorq zmm1,zmm1,zmm9 vpxorq zmm2,zmm2,zmm10 - vpxorq zmm3,zmm3,zmm11 - vpxorq zmm4,zmm4,zmm12 - - vmovdqa32 zmm9,zmm15 - vmovdqa32 zmm10,zmm16 - vmovdqa32 zmm11,zmm17 - vmovdqa32 zmm12,zmm18 - vmovdqu8 ZMMWORD[rdx],zmm1 - vmovdqu8 ZMMWORD[64+rdx],zmm2 - vmovdqu8 ZMMWORD[128+rdx],zmm3 - vmovdqu8 ZMMWORD[192+rdx],zmm4 - add rdx,0x100 - sub r8,0x100 - cmp r8,0x100 - jge NEAR $L$_main_loop_run_16_amivrujEyduiFoi - - cmp r8,0x80 - jge NEAR $L$_main_loop_run_8_amivrujEyduiFoi - jmp NEAR $L$_do_n_blocks_amivrujEyduiFoi + vmovdqu8 ZMMWORD[rsi],zmm1 + vmovdqu8 ZMMWORD[64+rsi]{k1},zmm2 + add rsi,0x70 -$L$_start_by8_amivrujEyduiFoi: - - vbroadcasti32x4 zmm0,ZMMWORD[rsp] - vbroadcasti32x4 zmm8,ZMMWORD[shufb_15_7] - mov r10,0xaa - kmovq 
k2,r10 + vextracti32x4 xmm0,zmm10,0x3 + and rdx,0xf + je NEAR $L$_final_block_amivrujEyduiFoi + vpsrldq zmm13,zmm9,0xf + DB 98,19,21,72,68,241,0 + vpslldq zmm11,zmm9,0x1 + vpxord zmm11,zmm11,zmm14 + vextracti32x4 xmm15,zmm11,0x0 + jmp NEAR $L$_steal_cipher_amivrujEyduiFoi +$L$_num_blocks_is_6_amivrujEyduiFoi: vpshufb zmm1,zmm0,zmm8 vpsllvq zmm4,zmm0,ZMMWORD[const_dq3210] vpsrlvq zmm2,zmm1,ZMMWORD[const_dq5678] DB 98,147,109,72,68,217,0 vpxorq zmm4{k2},zmm4,zmm2 vpxord zmm9,zmm3,zmm4 - - vpsllvq zmm5,zmm0,ZMMWORD[const_dq7654] vpsrlvq zmm6,zmm1,ZMMWORD[const_dq1234] DB 98,147,77,72,68,249,0 vpxorq zmm5{k2},zmm5,zmm6 vpxord zmm10,zmm7,zmm5 - -$L$_main_loop_run_8_amivrujEyduiFoi: - vmovdqu8 zmm1,ZMMWORD[rcx] - vmovdqu8 zmm2,ZMMWORD[64+rcx] - vmovdqu8 xmm5,XMMWORD[112+rcx] - add rcx,0x80 - - vpxorq zmm1,zmm1,zmm9 - vpxorq zmm2,zmm2,zmm10 - - - vbroadcasti32x4 zmm0,ZMMWORD[128+rsp] - vpxorq zmm1,zmm1,zmm0 - vpxorq zmm2,zmm2,zmm0 - vpsrldq zmm13,zmm9,0xf - DB 98,19,21,72,68,241,0 - vpslldq zmm15,zmm9,0x1 - vpxord zmm15,zmm15,zmm14 - vbroadcasti32x4 zmm0,ZMMWORD[144+rsp] + vmovdqu8 zmm1,ZMMWORD[rdi] + vmovdqu8 ymm2,YMMWORD[64+rdi] + add rdi,96 + vbroadcasti32x4 zmm0,ZMMWORD[rcx] + vpternlogq zmm1,zmm9,zmm0,0x96 + vpternlogq zmm2,zmm10,zmm0,0x96 + vbroadcasti32x4 zmm0,ZMMWORD[16+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[160+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[32+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[176+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[48+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vpsrldq zmm13,zmm10,0xf - DB 98,19,21,72,68,241,0 - vpslldq zmm16,zmm10,0x1 - vpxord zmm16,zmm16,zmm14 - vbroadcasti32x4 zmm0,ZMMWORD[192+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[64+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[208+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[80+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[224+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[96+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[240+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[112+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[256+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[128+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[272+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[144+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[288+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[160+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[304+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[176+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[320+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[192+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[336+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[208+rcx] DB 98,242,117,72,222,200 DB 98,242,109,72,222,208 - vbroadcasti32x4 zmm0,ZMMWORD[352+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[224+rcx] DB 98,242,117,72,223,200 DB 98,242,109,72,223,208 - - vpxorq zmm1,zmm1,zmm9 vpxorq zmm2,zmm2,zmm10 + vmovdqu8 ZMMWORD[rsi],zmm1 + vmovdqu8 YMMWORD[64+rsi],ymm2 + add rsi,96 + vextracti32x4 xmm0,zmm10,0x2 + vextracti32x4 xmm15,zmm10,0x3 + and rdx,0xf + je NEAR $L$_final_block_amivrujEyduiFoi + jmp NEAR $L$_steal_cipher_amivrujEyduiFoi +$L$_num_blocks_is_5_amivrujEyduiFoi: + vpshufb zmm1,zmm0,zmm8 + vpsllvq zmm4,zmm0,ZMMWORD[const_dq3210] + vpsrlvq 
zmm2,zmm1,ZMMWORD[const_dq5678] + DB 98,147,109,72,68,217,0 + vpxorq zmm4{k2},zmm4,zmm2 + vpxord zmm9,zmm3,zmm4 + vpsllvq zmm5,zmm0,ZMMWORD[const_dq7654] + vpsrlvq zmm6,zmm1,ZMMWORD[const_dq1234] + DB 98,147,77,72,68,249,0 + vpxorq zmm5{k2},zmm5,zmm6 + vpxord zmm10,zmm7,zmm5 + vmovdqu8 zmm1,ZMMWORD[rdi] + vmovdqu8 xmm2,XMMWORD[64+rdi] + add rdi,80 + vbroadcasti32x4 zmm0,ZMMWORD[rcx] + vpternlogq zmm1,zmm9,zmm0,0x96 + vpternlogq zmm2,zmm10,zmm0,0x96 + vbroadcasti32x4 zmm0,ZMMWORD[16+rcx] + DB 98,242,117,72,222,200 + DB 98,242,109,72,222,208 - vmovdqa32 zmm9,zmm15 - vmovdqa32 zmm10,zmm16 - vmovdqu8 ZMMWORD[rdx],zmm1 - vmovdqu8 ZMMWORD[64+rdx],zmm2 - add rdx,0x80 - sub r8,0x80 - cmp r8,0x80 - jge NEAR $L$_main_loop_run_8_amivrujEyduiFoi - jmp NEAR $L$_do_n_blocks_amivrujEyduiFoi - -$L$_steal_cipher_amivrujEyduiFoi: - - vmovdqa xmm2,xmm8 - - - lea rax,[vpshufb_shf_table] - vmovdqu xmm10,XMMWORD[r8*1+rax] - vpshufb xmm8,xmm8,xmm10 - - - vmovdqu xmm3,XMMWORD[((-16))+r8*1+rcx] - vmovdqu XMMWORD[(-16)+r8*1+rdx],xmm8 - - - lea rax,[vpshufb_shf_table] - add rax,16 - sub rax,r8 - vmovdqu xmm10,XMMWORD[rax] - vpxor xmm10,xmm10,XMMWORD[mask1] - vpshufb xmm3,xmm3,xmm10 - - vpblendvb xmm3,xmm3,xmm2,xmm10 - - - vpxor xmm8,xmm3,xmm0 - - - vpxor xmm8,xmm8,XMMWORD[128+rsp] - DB 98,114,61,8,222,132,36,144,0,0,0 - DB 98,114,61,8,222,132,36,160,0,0,0 - DB 98,114,61,8,222,132,36,176,0,0,0 - DB 98,114,61,8,222,132,36,192,0,0,0 - DB 98,114,61,8,222,132,36,208,0,0,0 - DB 98,114,61,8,222,132,36,224,0,0,0 - DB 98,114,61,8,222,132,36,240,0,0,0 - DB 98,114,61,8,222,132,36,0,1,0,0 - DB 98,114,61,8,222,132,36,16,1,0,0 - DB 98,114,61,8,222,132,36,32,1,0,0 - DB 98,114,61,8,222,132,36,48,1,0,0 - DB 98,114,61,8,222,132,36,64,1,0,0 - DB 98,114,61,8,222,132,36,80,1,0,0 - DB 98,114,61,8,223,132,36,96,1,0,0 - - - vpxor xmm8,xmm8,xmm0 -$L$_done_amivrujEyduiFoi: + vbroadcasti32x4 zmm0,ZMMWORD[32+rcx] + DB 98,242,117,72,222,200 + DB 98,242,109,72,222,208 - vmovdqu XMMWORD[(-16)+rdx],xmm8 -$L$_ret_amivrujEyduiFoi: - mov rbx,QWORD[528+rsp] - xor r10,r10 - mov QWORD[528+rsp],r10 - vpxorq zmm0,zmm0,zmm0 - mov rdi,QWORD[((528 + 8))+rsp] - mov QWORD[((528 + 8))+rsp],r10 - mov rsi,QWORD[((528 + 16))+rsp] - mov QWORD[((528 + 16))+rsp],r10 + vbroadcasti32x4 zmm0,ZMMWORD[48+rcx] + DB 98,242,117,72,222,200 + DB 98,242,109,72,222,208 - vmovdqa xmm6,XMMWORD[((368 + 0))+rsp] - vmovdqa xmm7,XMMWORD[((368 + 16))+rsp] - vmovdqa xmm8,XMMWORD[((368 + 32))+rsp] - vmovdqa xmm9,XMMWORD[((368 + 48))+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[64+rcx] + DB 98,242,117,72,222,200 + DB 98,242,109,72,222,208 - vmovdqa64 ZMMWORD[368+rsp],zmm0 + vbroadcasti32x4 zmm0,ZMMWORD[80+rcx] + DB 98,242,117,72,222,200 + DB 98,242,109,72,222,208 - vmovdqa xmm10,XMMWORD[((368 + 64))+rsp] - vmovdqa xmm11,XMMWORD[((368 + 80))+rsp] - vmovdqa xmm12,XMMWORD[((368 + 96))+rsp] - vmovdqa xmm13,XMMWORD[((368 + 112))+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[96+rcx] + DB 98,242,117,72,222,200 + DB 98,242,109,72,222,208 - vmovdqa64 ZMMWORD[(368 + 64)+rsp],zmm0 - vmovdqa xmm14,XMMWORD[((368 + 128))+rsp] - vmovdqa xmm15,XMMWORD[((368 + 144))+rsp] + vbroadcasti32x4 zmm0,ZMMWORD[112+rcx] + DB 98,242,117,72,222,200 + DB 98,242,109,72,222,208 + vbroadcasti32x4 zmm0,ZMMWORD[128+rcx] + DB 98,242,117,72,222,200 + DB 98,242,109,72,222,208 - vmovdqa YMMWORD[(368 + 128)+rsp],ymm0 - vmovdqa64 ZMMWORD[128+rsp],zmm0 - vmovdqa64 ZMMWORD[192+rsp],zmm0 - vmovdqa64 ZMMWORD[256+rsp],zmm0 + vbroadcasti32x4 zmm0,ZMMWORD[144+rcx] + DB 98,242,117,72,222,200 + DB 98,242,109,72,222,208 + vbroadcasti32x4 
zmm0,ZMMWORD[160+rcx] + DB 98,242,117,72,222,200 + DB 98,242,109,72,222,208 - mov r10,0x3f - kmovq k2,r10 - vmovdqa64 ZMMWORD[320+rsp]{k2},zmm0 - mov rsp,rbp - pop rbp - vzeroupper - DB 0F3h,0C3h ;repret + vbroadcasti32x4 zmm0,ZMMWORD[176+rcx] + DB 98,242,117,72,222,200 + DB 98,242,109,72,222,208 -$L$_less_than_128_bytes_amivrujEyduiFoi: - cmp r8,0x10 - jb NEAR $L$_ret_amivrujEyduiFoi - mov r10,r8 - and r10,0x70 - cmp r10,0x60 - je NEAR $L$_num_blocks_is_6_amivrujEyduiFoi - cmp r10,0x50 - je NEAR $L$_num_blocks_is_5_amivrujEyduiFoi - cmp r10,0x40 - je NEAR $L$_num_blocks_is_4_amivrujEyduiFoi - cmp r10,0x30 - je NEAR $L$_num_blocks_is_3_amivrujEyduiFoi - cmp r10,0x20 - je NEAR $L$_num_blocks_is_2_amivrujEyduiFoi - cmp r10,0x10 - je NEAR $L$_num_blocks_is_1_amivrujEyduiFoi + vbroadcasti32x4 zmm0,ZMMWORD[192+rcx] + DB 98,242,117,72,222,200 + DB 98,242,109,72,222,208 -$L$_num_blocks_is_7_amivrujEyduiFoi: - vmovdqa xmm9,XMMWORD[rsp] - mov rax,QWORD[rsp] - mov rbx,QWORD[8+rsp] - vmovdqu xmm1,XMMWORD[rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[16+rsp],rax - mov QWORD[24+rsp],rbx - vmovdqa xmm10,XMMWORD[16+rsp] - vmovdqu xmm2,XMMWORD[16+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[32+rsp],rax - mov QWORD[40+rsp],rbx - vmovdqa xmm11,XMMWORD[32+rsp] - vmovdqu xmm3,XMMWORD[32+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[48+rsp],rax - mov QWORD[56+rsp],rbx - vmovdqa xmm12,XMMWORD[48+rsp] - vmovdqu xmm4,XMMWORD[48+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[64+rsp],rax - mov QWORD[72+rsp],rbx - vmovdqa xmm13,XMMWORD[64+rsp] - vmovdqu xmm5,XMMWORD[64+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[80+rsp],rax - mov QWORD[88+rsp],rbx - vmovdqa xmm14,XMMWORD[80+rsp] - vmovdqu xmm6,XMMWORD[80+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[96+rsp],rax - mov QWORD[104+rsp],rbx - vmovdqa xmm15,XMMWORD[96+rsp] - vmovdqu xmm7,XMMWORD[96+rcx] - add rcx,0x70 - and r8,0xf - je NEAR $L$_done_7_amivrujEyduiFoi - -$L$_steal_cipher_7_amivrujEyduiFoi: - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[16+rsp],rax - mov QWORD[24+rsp],rbx - vmovdqa64 xmm16,xmm15 - vmovdqa xmm15,XMMWORD[16+rsp] - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vpxor xmm4,xmm4,xmm12 - vpxor xmm5,xmm5,xmm13 - vpxor xmm6,xmm6,xmm14 - vpxor xmm7,xmm7,xmm15 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vpxor xmm2,xmm2,xmm0 - vpxor xmm3,xmm3,xmm0 - vpxor xmm4,xmm4,xmm0 - vpxor xmm5,xmm5,xmm0 - vpxor xmm6,xmm6,xmm0 - vpxor xmm7,xmm7,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 
- DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,223,200 - DB 98,242,109,8,223,208 - DB 98,242,101,8,223,216 - DB 98,242,93,8,223,224 - DB 98,242,85,8,223,232 - DB 98,242,77,8,223,240 - DB 98,242,69,8,223,248 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vpxor xmm4,xmm4,xmm12 - vpxor xmm5,xmm5,xmm13 - vpxor xmm6,xmm6,xmm14 - vpxor xmm7,xmm7,xmm15 - vmovdqu XMMWORD[rdx],xmm1 - vmovdqu XMMWORD[16+rdx],xmm2 - vmovdqu XMMWORD[32+rdx],xmm3 - vmovdqu XMMWORD[48+rdx],xmm4 - vmovdqu XMMWORD[64+rdx],xmm5 - vmovdqu XMMWORD[80+rdx],xmm6 - add rdx,0x70 - vmovdqa64 xmm0,xmm16 - vmovdqa xmm8,xmm7 - jmp NEAR $L$_steal_cipher_amivrujEyduiFoi -$L$_done_7_amivrujEyduiFoi: - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vpxor xmm4,xmm4,xmm12 - vpxor xmm5,xmm5,xmm13 - vpxor xmm6,xmm6,xmm14 - vpxor xmm7,xmm7,xmm15 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vpxor xmm2,xmm2,xmm0 - vpxor xmm3,xmm3,xmm0 - vpxor xmm4,xmm4,xmm0 - vpxor xmm5,xmm5,xmm0 - vpxor xmm6,xmm6,xmm0 - vpxor xmm7,xmm7,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 
98,242,77,8,222,240 - DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - DB 98,242,69,8,222,248 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,223,200 - DB 98,242,109,8,223,208 - DB 98,242,101,8,223,216 - DB 98,242,93,8,223,224 - DB 98,242,85,8,223,232 - DB 98,242,77,8,223,240 - DB 98,242,69,8,223,248 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vpxor xmm4,xmm4,xmm12 - vpxor xmm5,xmm5,xmm13 - vpxor xmm6,xmm6,xmm14 - vpxor xmm7,xmm7,xmm15 - vmovdqu XMMWORD[rdx],xmm1 - vmovdqu XMMWORD[16+rdx],xmm2 - vmovdqu XMMWORD[32+rdx],xmm3 - vmovdqu XMMWORD[48+rdx],xmm4 - vmovdqu XMMWORD[64+rdx],xmm5 - vmovdqu XMMWORD[80+rdx],xmm6 - add rdx,0x70 - vmovdqa xmm8,xmm7 - jmp NEAR $L$_done_amivrujEyduiFoi + vbroadcasti32x4 zmm0,ZMMWORD[208+rcx] + DB 98,242,117,72,222,200 + DB 98,242,109,72,222,208 -$L$_num_blocks_is_6_amivrujEyduiFoi: - vmovdqa xmm9,XMMWORD[rsp] - mov rax,QWORD[rsp] - mov rbx,QWORD[8+rsp] - vmovdqu xmm1,XMMWORD[rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[16+rsp],rax - mov QWORD[24+rsp],rbx - vmovdqa xmm10,XMMWORD[16+rsp] - vmovdqu xmm2,XMMWORD[16+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[32+rsp],rax - mov QWORD[40+rsp],rbx - vmovdqa xmm11,XMMWORD[32+rsp] - vmovdqu xmm3,XMMWORD[32+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[48+rsp],rax - mov QWORD[56+rsp],rbx - vmovdqa xmm12,XMMWORD[48+rsp] - vmovdqu xmm4,XMMWORD[48+rcx] - 
xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[64+rsp],rax - mov QWORD[72+rsp],rbx - vmovdqa xmm13,XMMWORD[64+rsp] - vmovdqu xmm5,XMMWORD[64+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[80+rsp],rax - mov QWORD[88+rsp],rbx - vmovdqa xmm14,XMMWORD[80+rsp] - vmovdqu xmm6,XMMWORD[80+rcx] - add rcx,0x60 - and r8,0xf - je NEAR $L$_done_6_amivrujEyduiFoi - -$L$_steal_cipher_6_amivrujEyduiFoi: - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[16+rsp],rax - mov QWORD[24+rsp],rbx - vmovdqa64 xmm15,xmm14 - vmovdqa xmm14,XMMWORD[16+rsp] - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vpxor xmm4,xmm4,xmm12 - vpxor xmm5,xmm5,xmm13 - vpxor xmm6,xmm6,xmm14 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vpxor xmm2,xmm2,xmm0 - vpxor xmm3,xmm3,xmm0 - vpxor xmm4,xmm4,xmm0 - vpxor xmm5,xmm5,xmm0 - vpxor xmm6,xmm6,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,223,200 - DB 98,242,109,8,223,208 - DB 98,242,101,8,223,216 - DB 98,242,93,8,223,224 - DB 98,242,85,8,223,232 - DB 98,242,77,8,223,240 - vpxor 
xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vpxor xmm4,xmm4,xmm12 - vpxor xmm5,xmm5,xmm13 - vpxor xmm6,xmm6,xmm14 - vmovdqu XMMWORD[rdx],xmm1 - vmovdqu XMMWORD[16+rdx],xmm2 - vmovdqu XMMWORD[32+rdx],xmm3 - vmovdqu XMMWORD[48+rdx],xmm4 - vmovdqu XMMWORD[64+rdx],xmm5 - add rdx,0x60 - vmovdqa xmm0,xmm15 - vmovdqa xmm8,xmm6 - jmp NEAR $L$_steal_cipher_amivrujEyduiFoi -$L$_done_6_amivrujEyduiFoi: - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vpxor xmm4,xmm4,xmm12 - vpxor xmm5,xmm5,xmm13 - vpxor xmm6,xmm6,xmm14 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vpxor xmm2,xmm2,xmm0 - vpxor xmm3,xmm3,xmm0 - vpxor xmm4,xmm4,xmm0 - vpxor xmm5,xmm5,xmm0 - vpxor xmm6,xmm6,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - DB 98,242,77,8,222,240 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,223,200 - DB 98,242,109,8,223,208 - DB 98,242,101,8,223,216 - DB 98,242,93,8,223,224 - DB 98,242,85,8,223,232 - DB 98,242,77,8,223,240 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vpxor xmm4,xmm4,xmm12 - vpxor xmm5,xmm5,xmm13 - vpxor xmm6,xmm6,xmm14 - vmovdqu XMMWORD[rdx],xmm1 - vmovdqu XMMWORD[16+rdx],xmm2 - vmovdqu XMMWORD[32+rdx],xmm3 - 
vmovdqu XMMWORD[48+rdx],xmm4 - vmovdqu XMMWORD[64+rdx],xmm5 - add rdx,0x60 - vmovdqa xmm8,xmm6 - jmp NEAR $L$_done_amivrujEyduiFoi + vbroadcasti32x4 zmm0,ZMMWORD[224+rcx] + DB 98,242,117,72,223,200 + DB 98,242,109,72,223,208 + vpxorq zmm1,zmm1,zmm9 + vpxorq zmm2,zmm2,zmm10 + vmovdqu8 ZMMWORD[rsi],zmm1 + vmovdqu8 XMMWORD[64+rsi],xmm2 + add rsi,80 -$L$_num_blocks_is_5_amivrujEyduiFoi: - vmovdqa xmm9,XMMWORD[rsp] - mov rax,QWORD[rsp] - mov rbx,QWORD[8+rsp] - vmovdqu xmm1,XMMWORD[rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[16+rsp],rax - mov QWORD[24+rsp],rbx - vmovdqa xmm10,XMMWORD[16+rsp] - vmovdqu xmm2,XMMWORD[16+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[32+rsp],rax - mov QWORD[40+rsp],rbx - vmovdqa xmm11,XMMWORD[32+rsp] - vmovdqu xmm3,XMMWORD[32+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[48+rsp],rax - mov QWORD[56+rsp],rbx - vmovdqa xmm12,XMMWORD[48+rsp] - vmovdqu xmm4,XMMWORD[48+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[64+rsp],rax - mov QWORD[72+rsp],rbx - vmovdqa xmm13,XMMWORD[64+rsp] - vmovdqu xmm5,XMMWORD[64+rcx] - add rcx,0x50 - and r8,0xf - je NEAR $L$_done_5_amivrujEyduiFoi - -$L$_steal_cipher_5_amivrujEyduiFoi: - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[16+rsp],rax - mov QWORD[24+rsp],rbx - vmovdqa64 xmm14,xmm13 - vmovdqa xmm13,XMMWORD[16+rsp] - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vpxor xmm4,xmm4,xmm12 - vpxor xmm5,xmm5,xmm13 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vpxor xmm2,xmm2,xmm0 - vpxor xmm3,xmm3,xmm0 - vpxor xmm4,xmm4,xmm0 - vpxor xmm5,xmm5,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 
98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,223,200 - DB 98,242,109,8,223,208 - DB 98,242,101,8,223,216 - DB 98,242,93,8,223,224 - DB 98,242,85,8,223,232 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vpxor xmm4,xmm4,xmm12 - vpxor xmm5,xmm5,xmm13 - vmovdqu XMMWORD[rdx],xmm1 - vmovdqu XMMWORD[16+rdx],xmm2 - vmovdqu XMMWORD[32+rdx],xmm3 - vmovdqu XMMWORD[48+rdx],xmm4 - add rdx,0x50 - vmovdqa xmm0,xmm14 - vmovdqa xmm8,xmm5 + vmovdqa xmm8,xmm2 + vextracti32x4 xmm0,zmm10,0x1 + vextracti32x4 xmm15,zmm10,0x2 + and rdx,0xf + je NEAR $L$_final_block_amivrujEyduiFoi jmp NEAR $L$_steal_cipher_amivrujEyduiFoi - -$L$_done_5_amivrujEyduiFoi: - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vpxor xmm4,xmm4,xmm12 - vpxor xmm5,xmm5,xmm13 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vpxor xmm2,xmm2,xmm0 - vpxor xmm3,xmm3,xmm0 - vpxor xmm4,xmm4,xmm0 - vpxor xmm5,xmm5,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - DB 98,242,85,8,222,232 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,223,200 - DB 98,242,109,8,223,208 - DB 98,242,101,8,223,216 - DB 98,242,93,8,223,224 - DB 98,242,85,8,223,232 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vpxor xmm4,xmm4,xmm12 - 
vpxor xmm5,xmm5,xmm13 - vmovdqu XMMWORD[rdx],xmm1 - vmovdqu XMMWORD[16+rdx],xmm2 - vmovdqu XMMWORD[32+rdx],xmm3 - vmovdqu XMMWORD[48+rdx],xmm4 - add rdx,0x50 - vmovdqa xmm8,xmm5 - jmp NEAR $L$_done_amivrujEyduiFoi - $L$_num_blocks_is_4_amivrujEyduiFoi: - vmovdqa xmm9,XMMWORD[rsp] - mov rax,QWORD[rsp] - mov rbx,QWORD[8+rsp] - vmovdqu xmm1,XMMWORD[rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[16+rsp],rax - mov QWORD[24+rsp],rbx - vmovdqa xmm10,XMMWORD[16+rsp] - vmovdqu xmm2,XMMWORD[16+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[32+rsp],rax - mov QWORD[40+rsp],rbx - vmovdqa xmm11,XMMWORD[32+rsp] - vmovdqu xmm3,XMMWORD[32+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[48+rsp],rax - mov QWORD[56+rsp],rbx - vmovdqa xmm12,XMMWORD[48+rsp] - vmovdqu xmm4,XMMWORD[48+rcx] - add rcx,0x40 - and r8,0xf - je NEAR $L$_done_4_amivrujEyduiFoi - -$L$_steal_cipher_4_amivrujEyduiFoi: - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[16+rsp],rax - mov QWORD[24+rsp],rbx - vmovdqa64 xmm13,xmm12 - vmovdqa xmm12,XMMWORD[16+rsp] - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vpxor xmm4,xmm4,xmm12 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vpxor xmm2,xmm2,xmm0 - vpxor xmm3,xmm3,xmm0 - vpxor xmm4,xmm4,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,223,200 - DB 98,242,109,8,223,208 - DB 98,242,101,8,223,216 - DB 98,242,93,8,223,224 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vpxor xmm4,xmm4,xmm12 - vmovdqu XMMWORD[rdx],xmm1 - vmovdqu XMMWORD[16+rdx],xmm2 - vmovdqu XMMWORD[32+rdx],xmm3 - add rdx,0x40 - vmovdqa xmm0,xmm13 - vmovdqa xmm8,xmm4 + vpshufb zmm1,zmm0,zmm8 + 
vpsllvq zmm4,zmm0,ZMMWORD[const_dq3210] + vpsrlvq zmm2,zmm1,ZMMWORD[const_dq5678] + DB 98,147,109,72,68,217,0 + vpxorq zmm4{k2},zmm4,zmm2 + vpxord zmm9,zmm3,zmm4 + vpsllvq zmm5,zmm0,ZMMWORD[const_dq7654] + vpsrlvq zmm6,zmm1,ZMMWORD[const_dq1234] + DB 98,147,77,72,68,249,0 + vpxorq zmm5{k2},zmm5,zmm6 + vpxord zmm10,zmm7,zmm5 + vmovdqu8 zmm1,ZMMWORD[rdi] + add rdi,64 + vbroadcasti32x4 zmm0,ZMMWORD[rcx] + vpternlogq zmm1,zmm9,zmm0,0x96 + vbroadcasti32x4 zmm0,ZMMWORD[16+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[32+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[48+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[64+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[80+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[96+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[112+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[128+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[144+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[160+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[176+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[192+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[208+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[224+rcx] + DB 98,242,117,72,223,200 + vpxorq zmm1,zmm1,zmm9 + vmovdqu8 ZMMWORD[rsi],zmm1 + add rsi,64 + vmovdqa xmm0,xmm10 + vextracti32x4 xmm15,zmm10,0x1 + and rdx,0xf + je NEAR $L$_final_block_amivrujEyduiFoi jmp NEAR $L$_steal_cipher_amivrujEyduiFoi - -$L$_done_4_amivrujEyduiFoi: - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vpxor xmm4,xmm4,xmm12 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vpxor xmm2,xmm2,xmm0 - vpxor xmm3,xmm3,xmm0 - vpxor xmm4,xmm4,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 
98,242,101,8,222,216 - DB 98,242,93,8,222,224 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,223,200 - DB 98,242,109,8,223,208 - DB 98,242,101,8,223,216 - DB 98,242,93,8,223,224 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vpxor xmm4,xmm4,xmm12 - vmovdqu XMMWORD[rdx],xmm1 - vmovdqu XMMWORD[16+rdx],xmm2 - vmovdqu XMMWORD[32+rdx],xmm3 - add rdx,0x40 - vmovdqa xmm8,xmm4 - jmp NEAR $L$_done_amivrujEyduiFoi - $L$_num_blocks_is_3_amivrujEyduiFoi: - vmovdqa xmm9,XMMWORD[rsp] - mov rax,QWORD[rsp] - mov rbx,QWORD[8+rsp] - vmovdqu xmm1,XMMWORD[rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[16+rsp],rax - mov QWORD[24+rsp],rbx - vmovdqa xmm10,XMMWORD[16+rsp] - vmovdqu xmm2,XMMWORD[16+rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[32+rsp],rax - mov QWORD[40+rsp],rbx - vmovdqa xmm11,XMMWORD[32+rsp] - vmovdqu xmm3,XMMWORD[32+rcx] - add rcx,0x30 - and r8,0xf - je NEAR $L$_done_3_amivrujEyduiFoi - -$L$_steal_cipher_3_amivrujEyduiFoi: - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[16+rsp],rax - mov QWORD[24+rsp],rbx - vmovdqa64 xmm12,xmm11 - vmovdqa xmm11,XMMWORD[16+rsp] - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vpxor xmm2,xmm2,xmm0 - vpxor xmm3,xmm3,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,223,200 - DB 98,242,109,8,223,208 - DB 98,242,101,8,223,216 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vmovdqu XMMWORD[rdx],xmm1 - vmovdqu XMMWORD[16+rdx],xmm2 - add rdx,0x30 - vmovdqa xmm0,xmm12 - vmovdqa xmm8,xmm3 + vpshufb zmm1,zmm0,zmm8 + vpsllvq zmm4,zmm0,ZMMWORD[const_dq3210] + vpsrlvq zmm2,zmm1,ZMMWORD[const_dq5678] + DB 98,147,109,72,68,217,0 + vpxorq zmm4{k2},zmm4,zmm2 + vpxord zmm9,zmm3,zmm4 + vpsllvq zmm5,zmm0,ZMMWORD[const_dq7654] + vpsrlvq zmm6,zmm1,ZMMWORD[const_dq1234] + DB 98,147,77,72,68,249,0 + vpxorq zmm5{k2},zmm5,zmm6 + vpxord zmm10,zmm7,zmm5 + mov r8,0x0000ffffffffffff + kmovq k1,r8 + vmovdqu8 zmm1{k1},[rdi] + add rdi,48 + 
vbroadcasti32x4 zmm0,ZMMWORD[rcx] + vpternlogq zmm1,zmm9,zmm0,0x96 + vbroadcasti32x4 zmm0,ZMMWORD[16+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[32+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[48+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[64+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[80+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[96+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[112+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[128+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[144+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[160+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[176+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[192+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[208+rcx] + DB 98,242,117,72,222,200 + vbroadcasti32x4 zmm0,ZMMWORD[224+rcx] + DB 98,242,117,72,223,200 + vpxorq zmm1,zmm1,zmm9 + vmovdqu8 ZMMWORD[rsi]{k1},zmm1 + add rsi,48 + vextracti32x4 xmm0,zmm9,3 + vextracti32x4 xmm15,zmm10,0 + and rdx,0xf + je NEAR $L$_final_block_amivrujEyduiFoi jmp NEAR $L$_steal_cipher_amivrujEyduiFoi - -$L$_done_3_amivrujEyduiFoi: - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vpxor xmm2,xmm2,xmm0 - vpxor xmm3,xmm3,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - DB 98,242,101,8,222,216 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,223,200 - DB 98,242,109,8,223,208 - DB 98,242,101,8,223,216 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vpxor xmm3,xmm3,xmm11 - vmovdqu XMMWORD[rdx],xmm1 - vmovdqu XMMWORD[16+rdx],xmm2 - add rdx,0x30 - vmovdqa xmm8,xmm3 - jmp NEAR $L$_done_amivrujEyduiFoi - $L$_num_blocks_is_2_amivrujEyduiFoi: - vmovdqa xmm9,XMMWORD[rsp] - mov rax,QWORD[rsp] - mov rbx,QWORD[8+rsp] - vmovdqu xmm1,XMMWORD[rcx] - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[16+rsp],rax - mov QWORD[24+rsp],rbx - vmovdqa xmm10,XMMWORD[16+rsp] - vmovdqu xmm2,XMMWORD[16+rcx] - add rcx,0x20 - and r8,0xf - je NEAR $L$_done_2_amivrujEyduiFoi 
- -$L$_steal_cipher_2_amivrujEyduiFoi: - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[16+rsp],rax - mov QWORD[24+rsp],rbx - vmovdqa64 xmm11,xmm10 - vmovdqa xmm10,XMMWORD[16+rsp] - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vpxor xmm2,xmm2,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,223,200 - DB 98,242,109,8,223,208 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vmovdqu XMMWORD[rdx],xmm1 - add rdx,0x20 - vmovdqa xmm0,xmm11 - vmovdqa xmm8,xmm2 + vpshufb zmm1,zmm0,zmm8 + vpsllvq zmm4,zmm0,ZMMWORD[const_dq3210] + vpsrlvq zmm2,zmm1,ZMMWORD[const_dq5678] + DB 98,147,109,72,68,217,0 + vpxorq zmm4{k2},zmm4,zmm2 + vpxord zmm9,zmm3,zmm4 + + vmovdqu8 ymm1,YMMWORD[rdi] + add rdi,32 + vbroadcasti32x4 ymm0,YMMWORD[rcx] + vpternlogq ymm1,ymm9,ymm0,0x96 + vbroadcasti32x4 ymm0,YMMWORD[16+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[32+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[48+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[64+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[80+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[96+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[112+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[128+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[144+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[160+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[176+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[192+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[208+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[224+rcx] + DB 98,242,117,40,223,200 + vpxorq ymm1,ymm1,ymm9 + vmovdqu8 YMMWORD[rsi],ymm1 + add rsi,32 + + vextracti32x4 xmm0,zmm9,2 + vextracti32x4 xmm15,zmm9,3 + and rdx,0xf + je NEAR $L$_final_block_amivrujEyduiFoi jmp NEAR $L$_steal_cipher_amivrujEyduiFoi +$L$_num_blocks_is_1_amivrujEyduiFoi: + vpshufb zmm1,zmm0,zmm8 + vpsllvq zmm4,zmm0,ZMMWORD[const_dq3210] + vpsrlvq zmm2,zmm1,ZMMWORD[const_dq5678] + DB 98,147,109,72,68,217,0 + vpxorq zmm4{k2},zmm4,zmm2 + vpxord zmm9,zmm3,zmm4 -$L$_done_2_amivrujEyduiFoi: - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vpxor xmm2,xmm2,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,222,200 - DB 
98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,222,200 - DB 98,242,109,8,222,208 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,223,200 - DB 98,242,109,8,223,208 - vpxor xmm1,xmm1,xmm9 - vpxor xmm2,xmm2,xmm10 - vmovdqu XMMWORD[rdx],xmm1 - add rdx,0x20 - vmovdqa xmm8,xmm2 - jmp NEAR $L$_done_amivrujEyduiFoi + vmovdqu8 xmm1,XMMWORD[rdi] + add rdi,16 + vbroadcasti32x4 ymm0,YMMWORD[rcx] + vpternlogq ymm1,ymm9,ymm0,0x96 + vbroadcasti32x4 ymm0,YMMWORD[16+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[32+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[48+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[64+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[80+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[96+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[112+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[128+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[144+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[160+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[176+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[192+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[208+rcx] + DB 98,242,117,40,222,200 + vbroadcasti32x4 ymm0,YMMWORD[224+rcx] + DB 98,242,117,40,223,200 + vpxorq ymm1,ymm1,ymm9 + vmovdqu8 XMMWORD[rsi],xmm1 + add rsi,16 -$L$_num_blocks_is_1_amivrujEyduiFoi: - vmovdqa xmm9,XMMWORD[rsp] - mov rax,QWORD[rsp] - mov rbx,QWORD[8+rsp] - vmovdqu xmm1,XMMWORD[rcx] - add rcx,0x10 - and r8,0xf - je NEAR $L$_done_1_amivrujEyduiFoi - -$L$_steal_cipher_1_amivrujEyduiFoi: - xor rsi,rsi - shl rax,1 - adc rbx,rbx - cmovc rsi,rdi - xor rax,rsi - mov QWORD[16+rsp],rax - mov QWORD[24+rsp],rbx - vmovdqa64 xmm10,xmm9 - vmovdqa xmm9,XMMWORD[16+rsp] - vpxor xmm1,xmm1,xmm9 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 
98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,223,200 - vpxor xmm1,xmm1,xmm9 - add rdx,0x10 - vmovdqa xmm0,xmm10 vmovdqa xmm8,xmm1 + vextracti32x4 xmm0,zmm9,1 + vextracti32x4 xmm15,zmm9,2 + and rdx,0xf + je NEAR $L$_final_block_amivrujEyduiFoi jmp NEAR $L$_steal_cipher_amivrujEyduiFoi -$L$_done_1_amivrujEyduiFoi: - vpxor xmm1,xmm1,xmm9 - vmovdqa xmm0,XMMWORD[128+rsp] - vpxor xmm1,xmm1,xmm0 - vmovdqa xmm0,XMMWORD[144+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[160+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[176+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[192+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[208+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[224+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[240+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[256+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[272+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[288+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[304+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[320+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[336+rsp] - DB 98,242,117,8,222,200 - vmovdqa xmm0,XMMWORD[352+rsp] - DB 98,242,117,8,223,200 - vpxor xmm1,xmm1,xmm9 - add rdx,0x10 - vmovdqa xmm8,xmm1 - jmp NEAR $L$_done_amivrujEyduiFoi - section .rdata rdata align=8 ALIGN 16