crypto: x86/aes-xts - add AES-XTS assembly macro for modern CPUs

author Eric Biggers <ebiggers@google.com>

Fri, 29 Mar 2024 08:03:50 +0000 (01:03 -0700)

committer Herbert Xu <herbert@gondor.apana.org.au>

Fri, 5 Apr 2024 07:46:33 +0000 (15:46 +0800)
author Eric Biggers <ebiggers@google.com>
Fri, 29 Mar 2024 08:03:50 +0000 (01:03 -0700)
committer Herbert Xu <herbert@gondor.apana.org.au>
Fri, 5 Apr 2024 07:46:33 +0000 (15:46 +0800)
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile

index 9aa46093c91b619da0d1641cefa73afa7d97d2e6..9c5ce56137385b8bfbdef5fe7ec219af0a13d789 100644 (file)
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -48,7 +48,8 @@ chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o
  
  obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
  aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
-aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
+aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o \
+       aes_ctrby8_avx-x86_64.o aes-xts-avx-x86_64.o
  
  obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
  sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ssse3_glue.o
diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S

new file mode 100644 (file)

index 0000000..a5e2783
--- /dev/null
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -0,0 +1,800 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * AES-XTS for modern x86_64 CPUs
+ *
+ * Copyright 2024 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+/*
+ * This file implements AES-XTS for modern x86_64 CPUs.  To handle the
+ * complexities of coding for x86 SIMD, e.g. where every vector length needs
+ * different code, it uses a macro to generate several implementations that
+ * share similar source code but are targeted at different CPUs, listed below:
+ *
+ * AES-NI + AVX
+ *    - 128-bit vectors (1 AES block per vector)
+ *    - VEX-coded instructions
+ *    - xmm0-xmm15
+ *    - This is for older CPUs that lack VAES but do have AVX.
+ *
+ * VAES + VPCLMULQDQ + AVX2
+ *    - 256-bit vectors (2 AES blocks per vector)
+ *    - VEX-coded instructions
+ *    - ymm0-ymm15
+ *    - This is for CPUs that have VAES but lack AVX512 or AVX10,
+ *      e.g. Intel's Alder Lake and AMD's Zen 3.
+ *
+ * VAES + VPCLMULQDQ + AVX10/256 + BMI2
+ *    - 256-bit vectors (2 AES blocks per vector)
+ *    - EVEX-coded instructions
+ *    - ymm0-ymm31
+ *    - This is for CPUs that have AVX512 but where using zmm registers causes
+ *      downclocking, and for CPUs that have AVX10/256 but not AVX10/512.
+ *    - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256.
+ *      To avoid confusion with 512-bit, we just write AVX10/256.
+ *
+ * VAES + VPCLMULQDQ + AVX10/512 + BMI2
+ *    - Same as the previous one, but upgrades to 512-bit vectors
+ *      (4 AES blocks per vector) in zmm0-zmm31.
+ *    - This is for CPUs that have good AVX512 or AVX10/512 support.
+ *
+ * This file doesn't have an implementation for AES-NI alone (without AVX), as
+ * the lack of VEX would make all the assembly code different.
+ *
+ * When we use VAES, we also use VPCLMULQDQ to parallelize the computation of
+ * the XTS tweaks.  This avoids a bottleneck.  Currently there don't seem to be
+ * any CPUs that support VAES but not VPCLMULQDQ.  If that changes, we might
+ * need to start also providing an implementation using VAES alone.
+ *
+ * The AES-XTS implementations in this file support everything required by the
+ * crypto API, including support for arbitrary input lengths and multi-part
+ * processing.  However, they are most heavily optimized for the common case of
+ * power-of-2 length inputs that are processed in a single part (disk sectors).
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+
+.section .rodata
+.p2align 4
+.Lgf_poly:
+       // The low 64 bits of this value represent the polynomial x^7 + x^2 + x
+       // + 1.  It is the value that must be XOR'd into the low 64 bits of the
+       // tweak each time a 1 is carried out of the high 64 bits.
+       //
+       // The high 64 bits of this value is just the internal carry bit that
+       // exists when there's a carry out of the low 64 bits of the tweak.
+       .quad   0x87, 1
+
+       // This table contains constants for vpshufb and vpblendvb, used to
+       // handle variable byte shifts and blending during ciphertext stealing
+       // on CPUs that don't support AVX10-style masking.
+.Lcts_permute_table:
+       .byte   0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+       .byte   0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+       .byte   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+       .byte   0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+       .byte   0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+       .byte   0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+.text
+
+// Function parameters
+.set   KEY,            %rdi    // Initially points to crypto_aes_ctx, then is
+                               // advanced to point directly to the round keys
+.set   SRC,            %rsi    // Pointer to next source data
+.set   DST,            %rdx    // Pointer to next destination data
+.set   LEN,            %rcx    // Remaining length in bytes
+.set   TWEAK,          %r8     // Pointer to next tweak
+
+// %r9d holds the AES key length in bytes.
+.set   KEYLEN,         %r9d
+
+// %rax and %r10-r11 are available as temporaries.
+
+.macro _define_Vi      i
+.if VL == 16
+       .set    V\i,            %xmm\i
+.elseif VL == 32
+       .set    V\i,            %ymm\i
+.elseif VL == 64
+       .set    V\i,            %zmm\i
+.else
+       .error "Unsupported Vector Length (VL)"
+.endif
+.endm
+
+.macro _define_aliases
+       // Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers
+       // are available, that map to the xmm, ymm, or zmm registers according
+       // to the selected Vector Length (VL).
+       _define_Vi      0
+       _define_Vi      1
+       _define_Vi      2
+       _define_Vi      3
+       _define_Vi      4
+       _define_Vi      5
+       _define_Vi      6
+       _define_Vi      7
+       _define_Vi      8
+       _define_Vi      9
+       _define_Vi      10
+       _define_Vi      11
+       _define_Vi      12
+       _define_Vi      13
+       _define_Vi      14
+       _define_Vi      15
+.if USE_AVX10
+       _define_Vi      16
+       _define_Vi      17
+       _define_Vi      18
+       _define_Vi      19
+       _define_Vi      20
+       _define_Vi      21
+       _define_Vi      22
+       _define_Vi      23
+       _define_Vi      24
+       _define_Vi      25
+       _define_Vi      26
+       _define_Vi      27
+       _define_Vi      28
+       _define_Vi      29
+       _define_Vi      30
+       _define_Vi      31
+.endif
+
+       // V0-V3 hold the data blocks during the main loop, or temporary values
+       // otherwise.  V4-V5 hold temporary values.
+
+       // V6-V9 hold XTS tweaks.  Each 128-bit lane holds one tweak.
+       .set    TWEAK0_XMM,     %xmm6
+       .set    TWEAK0,         V6
+       .set    TWEAK1_XMM,     %xmm7
+       .set    TWEAK1,         V7
+       .set    TWEAK2,         V8
+       .set    TWEAK3,         V9
+
+       // V10-V13 are used for computing the next values of TWEAK[0-3].
+       .set    NEXT_TWEAK0,    V10
+       .set    NEXT_TWEAK1,    V11
+       .set    NEXT_TWEAK2,    V12
+       .set    NEXT_TWEAK3,    V13
+
+       // V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes.
+       .set    GF_POLY_XMM,    %xmm14
+       .set    GF_POLY,        V14
+
+       // V15 holds the first AES round key, copied to all 128-bit lanes.
+       .set    KEY0_XMM,       %xmm15
+       .set    KEY0,           V15
+
+       // If 32 SIMD registers are available, then V16-V29 hold the remaining
+       // AES round keys, copied to all 128-bit lanes.
+.if USE_AVX10
+       .set    KEY1_XMM,       %xmm16
+       .set    KEY1,           V16
+       .set    KEY2_XMM,       %xmm17
+       .set    KEY2,           V17
+       .set    KEY3_XMM,       %xmm18
+       .set    KEY3,           V18
+       .set    KEY4_XMM,       %xmm19
+       .set    KEY4,           V19
+       .set    KEY5_XMM,       %xmm20
+       .set    KEY5,           V20
+       .set    KEY6_XMM,       %xmm21
+       .set    KEY6,           V21
+       .set    KEY7_XMM,       %xmm22
+       .set    KEY7,           V22
+       .set    KEY8_XMM,       %xmm23
+       .set    KEY8,           V23
+       .set    KEY9_XMM,       %xmm24
+       .set    KEY9,           V24
+       .set    KEY10_XMM,      %xmm25
+       .set    KEY10,          V25
+       .set    KEY11_XMM,      %xmm26
+       .set    KEY11,          V26
+       .set    KEY12_XMM,      %xmm27
+       .set    KEY12,          V27
+       .set    KEY13_XMM,      %xmm28
+       .set    KEY13,          V28
+       .set    KEY14_XMM,      %xmm29
+       .set    KEY14,          V29
+.endif
+       // V30-V31 are currently unused.
+.endm
+
+// Move a vector between memory and a register.
+.macro _vmovdqu        src, dst
+.if VL < 64
+       vmovdqu         \src, \dst
+.else
+       vmovdqu8        \src, \dst
+.endif
+.endm
+
+// Broadcast a 128-bit value into a vector.
+.macro _vbroadcast128  src, dst
+.if VL == 16 && !USE_AVX10
+       vmovdqu         \src, \dst
+.elseif VL == 32 && !USE_AVX10
+       vbroadcasti128  \src, \dst
+.else
+       vbroadcasti32x4 \src, \dst
+.endif
+.endm
+
+// XOR two vectors together.
+.macro _vpxor  src1, src2, dst
+.if USE_AVX10
+       vpxord          \src1, \src2, \dst
+.else
+       vpxor           \src1, \src2, \dst
+.endif
+.endm
+
+// XOR three vectors together.
+.macro _xor3   src1, src2, src3_and_dst
+.if USE_AVX10
+       // vpternlogd with immediate 0x96 is a three-argument XOR.
+       vpternlogd      $0x96, \src1, \src2, \src3_and_dst
+.else
+       vpxor           \src1, \src3_and_dst, \src3_and_dst
+       vpxor           \src2, \src3_and_dst, \src3_and_dst
+.endif
+.endm
+
+// Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak
+// (by multiplying by the polynomial 'x') and write it to \dst.
+.macro _next_tweak     src, tmp, dst
+       vpshufd         $0x13, \src, \tmp
+       vpaddq          \src, \src, \dst
+       vpsrad          $31, \tmp, \tmp
+       vpand           GF_POLY_XMM, \tmp, \tmp
+       vpxor           \tmp, \dst, \dst
+.endm
+
+// Given the XTS tweak(s) in the vector \src, compute the next vector of
+// tweak(s) (by multiplying by the polynomial 'x^(VL/16)') and write it to \dst.
+//
+// If VL > 16, then there are multiple tweaks, and we use vpclmulqdq to compute
+// all tweaks in the vector in parallel.  If VL=16, we just do the regular
+// computation without vpclmulqdq, as it's the faster method for a single tweak.
+.macro _next_tweakvec  src, tmp1, tmp2, dst
+.if VL == 16
+       _next_tweak     \src, \tmp1, \dst
+.else
+       vpsrlq          $64 - VL/16, \src, \tmp1
+       vpclmulqdq      $0x01, GF_POLY, \tmp1, \tmp2
+       vpslldq         $8, \tmp1, \tmp1
+       vpsllq          $VL/16, \src, \dst
+       _xor3           \tmp1, \tmp2, \dst
+.endif
+.endm
+
+// Given the first XTS tweak at (TWEAK), compute the first set of tweaks and
+// store them in the vector registers TWEAK0-TWEAK3.  Clobbers V0-V5.
+.macro _compute_first_set_of_tweaks
+       vmovdqu         (TWEAK), TWEAK0_XMM
+       _vbroadcast128  .Lgf_poly(%rip), GF_POLY
+.if VL == 16
+       // With VL=16, multiplying by x serially is fastest.
+       _next_tweak     TWEAK0, %xmm0, TWEAK1
+       _next_tweak     TWEAK1, %xmm0, TWEAK2
+       _next_tweak     TWEAK2, %xmm0, TWEAK3
+.else
+.if VL == 32
+       // Compute the second block of TWEAK0.
+       _next_tweak     TWEAK0_XMM, %xmm0, %xmm1
+       vinserti128     $1, %xmm1, TWEAK0, TWEAK0
+.elseif VL == 64
+       // Compute the remaining blocks of TWEAK0.
+       _next_tweak     TWEAK0_XMM, %xmm0, %xmm1
+       _next_tweak     %xmm1, %xmm0, %xmm2
+       _next_tweak     %xmm2, %xmm0, %xmm3
+       vinserti32x4    $1, %xmm1, TWEAK0, TWEAK0
+       vinserti32x4    $2, %xmm2, TWEAK0, TWEAK0
+       vinserti32x4    $3, %xmm3, TWEAK0, TWEAK0
+.endif
+       // Compute TWEAK[1-3] from TWEAK0.
+       vpsrlq          $64 - 1*VL/16, TWEAK0, V0
+       vpsrlq          $64 - 2*VL/16, TWEAK0, V2
+       vpsrlq          $64 - 3*VL/16, TWEAK0, V4
+       vpclmulqdq      $0x01, GF_POLY, V0, V1
+       vpclmulqdq      $0x01, GF_POLY, V2, V3
+       vpclmulqdq      $0x01, GF_POLY, V4, V5
+       vpslldq         $8, V0, V0
+       vpslldq         $8, V2, V2
+       vpslldq         $8, V4, V4
+       vpsllq          $1*VL/16, TWEAK0, TWEAK1
+       vpsllq          $2*VL/16, TWEAK0, TWEAK2
+       vpsllq          $3*VL/16, TWEAK0, TWEAK3
+.if USE_AVX10
+       vpternlogd      $0x96, V0, V1, TWEAK1
+       vpternlogd      $0x96, V2, V3, TWEAK2
+       vpternlogd      $0x96, V4, V5, TWEAK3
+.else
+       vpxor           V0, TWEAK1, TWEAK1
+       vpxor           V2, TWEAK2, TWEAK2
+       vpxor           V4, TWEAK3, TWEAK3
+       vpxor           V1, TWEAK1, TWEAK1
+       vpxor           V3, TWEAK2, TWEAK2
+       vpxor           V5, TWEAK3, TWEAK3
+.endif
+.endif
+.endm
+
+// Do one step in computing the next set of tweaks using the method of just
+// multiplying by x repeatedly (the same method _next_tweak uses).
+.macro _tweak_step_mulx        i
+.if \i == 0
+       .set PREV_TWEAK, TWEAK3
+       .set NEXT_TWEAK, NEXT_TWEAK0
+.elseif \i == 5
+       .set PREV_TWEAK, NEXT_TWEAK0
+       .set NEXT_TWEAK, NEXT_TWEAK1
+.elseif \i == 10
+       .set PREV_TWEAK, NEXT_TWEAK1
+       .set NEXT_TWEAK, NEXT_TWEAK2
+.elseif \i == 15
+       .set PREV_TWEAK, NEXT_TWEAK2
+       .set NEXT_TWEAK, NEXT_TWEAK3
+.endif
+.if \i < 20 && \i % 5 == 0
+       vpshufd         $0x13, PREV_TWEAK, V5
+.elseif \i < 20 && \i % 5 == 1
+       vpaddq          PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK
+.elseif \i < 20 && \i % 5 == 2
+       vpsrad          $31, V5, V5
+.elseif \i < 20 && \i % 5 == 3
+       vpand           GF_POLY, V5, V5
+.elseif \i < 20 && \i % 5 == 4
+       vpxor           V5, NEXT_TWEAK, NEXT_TWEAK
+.elseif \i == 1000
+       vmovdqa         NEXT_TWEAK0, TWEAK0
+       vmovdqa         NEXT_TWEAK1, TWEAK1
+       vmovdqa         NEXT_TWEAK2, TWEAK2
+       vmovdqa         NEXT_TWEAK3, TWEAK3
+.endif
+.endm
+
+// Do one step in computing the next set of tweaks using the VPCLMULQDQ method
+// (the same method _next_tweakvec uses for VL > 16).  This means multiplying
+// each tweak by x^(4*VL/16) independently.  Since 4*VL/16 is a multiple of 8
+// when VL > 16 (which it is here), the needed shift amounts are byte-aligned,
+// which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts.
+.macro _tweak_step_pclmul      i
+.if \i == 2
+       vpsrldq         $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
+.elseif \i == 4
+       vpsrldq         $(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1
+.elseif \i == 6
+       vpsrldq         $(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2
+.elseif \i == 8
+       vpsrldq         $(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3
+.elseif \i == 10
+       vpclmulqdq      $0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0
+.elseif \i == 12
+       vpclmulqdq      $0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1
+.elseif \i == 14
+       vpclmulqdq      $0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2
+.elseif \i == 16
+       vpclmulqdq      $0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3
+.elseif \i == 1000
+       vpslldq         $(4*VL/16) / 8, TWEAK0, TWEAK0
+       vpslldq         $(4*VL/16) / 8, TWEAK1, TWEAK1
+       vpslldq         $(4*VL/16) / 8, TWEAK2, TWEAK2
+       vpslldq         $(4*VL/16) / 8, TWEAK3, TWEAK3
+       _vpxor          NEXT_TWEAK0, TWEAK0, TWEAK0
+       _vpxor          NEXT_TWEAK1, TWEAK1, TWEAK1
+       _vpxor          NEXT_TWEAK2, TWEAK2, TWEAK2
+       _vpxor          NEXT_TWEAK3, TWEAK3, TWEAK3
+.endif
+.endm
+
+// _tweak_step does one step of the computation of the next set of tweaks from
+// TWEAK[0-3].  To complete all steps, this must be invoked with \i values 0
+// through at least 19, then 1000 which signals the last step.
+//
+// This is used to interleave the computation of the next set of tweaks with the
+// AES en/decryptions, which increases performance in some cases.
+.macro _tweak_step     i
+.if VL == 16
+       _tweak_step_mulx        \i
+.else
+       _tweak_step_pclmul      \i
+.endif
+.endm
+
+// Load the round keys: just the first one if !USE_AVX10, otherwise all of them.
+.macro _load_round_keys
+       _vbroadcast128  0*16(KEY), KEY0
+.if USE_AVX10
+       _vbroadcast128  1*16(KEY), KEY1
+       _vbroadcast128  2*16(KEY), KEY2
+       _vbroadcast128  3*16(KEY), KEY3
+       _vbroadcast128  4*16(KEY), KEY4
+       _vbroadcast128  5*16(KEY), KEY5
+       _vbroadcast128  6*16(KEY), KEY6
+       _vbroadcast128  7*16(KEY), KEY7
+       _vbroadcast128  8*16(KEY), KEY8
+       _vbroadcast128  9*16(KEY), KEY9
+       _vbroadcast128  10*16(KEY), KEY10
+       // Note: if it's AES-128 or AES-192, the last several round keys won't
+       // be used.  We do the loads anyway to save a conditional jump.
+       _vbroadcast128  11*16(KEY), KEY11
+       _vbroadcast128  12*16(KEY), KEY12
+       _vbroadcast128  13*16(KEY), KEY13
+       _vbroadcast128  14*16(KEY), KEY14
+.endif
+.endm
+
+// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0)
+// on the block(s) in \data using the round key(s) in \key.  The register length
+// determines the number of AES blocks en/decrypted.
+.macro _vaes   enc, last, key, data
+.if \enc
+.if \last
+       vaesenclast     \key, \data, \data
+.else
+       vaesenc         \key, \data, \data
+.endif
+.else
+.if \last
+       vaesdeclast     \key, \data, \data
+.else
+       vaesdec         \key, \data, \data
+.endif
+.endif
+.endm
+
+// Do a single round of AES en/decryption on the block(s) in \data, using the
+// same key for all block(s).  The round key is loaded from the appropriate
+// register or memory location for round \i.  May clobber V4.
+.macro _vaes_1x                enc, last, i, xmm_suffix, data
+.if USE_AVX10
+       _vaes           \enc, \last, KEY\i\xmm_suffix, \data
+.else
+.ifnb \xmm_suffix
+       _vaes           \enc, \last, \i*16(KEY), \data
+.else
+       _vbroadcast128  \i*16(KEY), V4
+       _vaes           \enc, \last, V4, \data
+.endif
+.endif
+.endm
+
+// Do a single round of AES en/decryption on the blocks in registers V0-V3,
+// using the same key for all blocks.  The round key is loaded from the
+// appropriate register or memory location for round \i.  In addition, does step
+// \i of the computation of the next set of tweaks.  May clobber V4.
+.macro _vaes_4x        enc, last, i
+.if USE_AVX10
+       _tweak_step     (2*(\i-1))
+       _vaes           \enc, \last, KEY\i, V0
+       _vaes           \enc, \last, KEY\i, V1
+       _tweak_step     (2*(\i-1) + 1)
+       _vaes           \enc, \last, KEY\i, V2
+       _vaes           \enc, \last, KEY\i, V3
+.else
+       _vbroadcast128  \i*16(KEY), V4
+       _tweak_step     (2*(\i-1))
+       _vaes           \enc, \last, V4, V0
+       _vaes           \enc, \last, V4, V1
+       _tweak_step     (2*(\i-1) + 1)
+       _vaes           \enc, \last, V4, V2
+       _vaes           \enc, \last, V4, V3
+.endif
+.endm
+
+// Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt,
+// then XOR with \tweak again) of the block(s) in \data.  To process a single
+// block, use xmm registers and set \xmm_suffix=_XMM.  To process a vector of
+// length VL, use V* registers and leave \xmm_suffix empty.  May clobber V4.
+.macro _aes_crypt      enc, xmm_suffix, tweak, data
+       _xor3           KEY0\xmm_suffix, \tweak, \data
+       _vaes_1x        \enc, 0, 1, \xmm_suffix, \data
+       _vaes_1x        \enc, 0, 2, \xmm_suffix, \data
+       _vaes_1x        \enc, 0, 3, \xmm_suffix, \data
+       _vaes_1x        \enc, 0, 4, \xmm_suffix, \data
+       _vaes_1x        \enc, 0, 5, \xmm_suffix, \data
+       _vaes_1x        \enc, 0, 6, \xmm_suffix, \data
+       _vaes_1x        \enc, 0, 7, \xmm_suffix, \data
+       _vaes_1x        \enc, 0, 8, \xmm_suffix, \data
+       _vaes_1x        \enc, 0, 9, \xmm_suffix, \data
+       cmp             $24, KEYLEN
+       jle             .Laes_128_or_192\@
+       _vaes_1x        \enc, 0, 10, \xmm_suffix, \data
+       _vaes_1x        \enc, 0, 11, \xmm_suffix, \data
+       _vaes_1x        \enc, 0, 12, \xmm_suffix, \data
+       _vaes_1x        \enc, 0, 13, \xmm_suffix, \data
+       _vaes_1x        \enc, 1, 14, \xmm_suffix, \data
+       jmp             .Laes_done\@
+.Laes_128_or_192\@:
+       je              .Laes_192\@
+       _vaes_1x        \enc, 1, 10, \xmm_suffix, \data
+       jmp             .Laes_done\@
+.Laes_192\@:
+       _vaes_1x        \enc, 0, 10, \xmm_suffix, \data
+       _vaes_1x        \enc, 0, 11, \xmm_suffix, \data
+       _vaes_1x        \enc, 1, 12, \xmm_suffix, \data
+.Laes_done\@:
+       _vpxor          \tweak, \data, \data
+.endm
+
+.macro _aes_xts_crypt  enc
+       _define_aliases
+
+       // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
+       movl            480(KEY), KEYLEN
+
+       // If decrypting, advance KEY to the decryption round keys.
+.if !\enc
+       add             $240, KEY
+.endif
+
+       // Check whether the data length is a multiple of the AES block length.
+       test            $15, LEN
+       jnz             .Lneed_cts\@
+.Lxts_init\@:
+
+       // Cache as many round keys as possible.
+       _load_round_keys
+
+       // Compute the first set of tweaks TWEAK[0-3].
+       _compute_first_set_of_tweaks
+
+       sub             $4*VL, LEN
+       jl              .Lhandle_remainder\@
+
+.Lmain_loop\@:
+       // This is the main loop, en/decrypting 4*VL bytes per iteration.
+
+       // XOR each source block with its tweak and the first round key.
+.if USE_AVX10
+       vmovdqu8        0*VL(SRC), V0
+       vmovdqu8        1*VL(SRC), V1
+       vmovdqu8        2*VL(SRC), V2
+       vmovdqu8        3*VL(SRC), V3
+       vpternlogd      $0x96, TWEAK0, KEY0, V0
+       vpternlogd      $0x96, TWEAK1, KEY0, V1
+       vpternlogd      $0x96, TWEAK2, KEY0, V2
+       vpternlogd      $0x96, TWEAK3, KEY0, V3
+.else
+       vpxor           0*VL(SRC), KEY0, V0
+       vpxor           1*VL(SRC), KEY0, V1
+       vpxor           2*VL(SRC), KEY0, V2
+       vpxor           3*VL(SRC), KEY0, V3
+       vpxor           TWEAK0, V0, V0
+       vpxor           TWEAK1, V1, V1
+       vpxor           TWEAK2, V2, V2
+       vpxor           TWEAK3, V3, V3
+.endif
+       // Do all the AES rounds on the data blocks, interleaved with
+       // the computation of the next set of tweaks.
+       _vaes_4x        \enc, 0, 1
+       _vaes_4x        \enc, 0, 2
+       _vaes_4x        \enc, 0, 3
+       _vaes_4x        \enc, 0, 4
+       _vaes_4x        \enc, 0, 5
+       _vaes_4x        \enc, 0, 6
+       _vaes_4x        \enc, 0, 7
+       _vaes_4x        \enc, 0, 8
+       _vaes_4x        \enc, 0, 9
+       // Try to optimize for AES-256 by keeping the code for AES-128 and
+       // AES-192 out-of-line.
+       cmp             $24, KEYLEN
+       jle             .Lencrypt_4x_aes_128_or_192\@
+       _vaes_4x        \enc, 0, 10
+       _vaes_4x        \enc, 0, 11
+       _vaes_4x        \enc, 0, 12
+       _vaes_4x        \enc, 0, 13
+       _vaes_4x        \enc, 1, 14
+.Lencrypt_4x_done\@:
+
+       // XOR in the tweaks again.
+       _vpxor          TWEAK0, V0, V0
+       _vpxor          TWEAK1, V1, V1
+       _vpxor          TWEAK2, V2, V2
+       _vpxor          TWEAK3, V3, V3
+
+       // Store the destination blocks.
+       _vmovdqu        V0, 0*VL(DST)
+       _vmovdqu        V1, 1*VL(DST)
+       _vmovdqu        V2, 2*VL(DST)
+       _vmovdqu        V3, 3*VL(DST)
+
+       // Finish computing the next set of tweaks.
+       _tweak_step     1000
+
+       add             $4*VL, SRC
+       add             $4*VL, DST
+       sub             $4*VL, LEN
+       jge             .Lmain_loop\@
+
+       // Check for the uncommon case where the data length isn't a multiple of
+       // 4*VL.  Handle it out-of-line in order to optimize for the common
+       // case.  In the common case, just fall through to the ret.
+       test            $4*VL-1, LEN
+       jnz             .Lhandle_remainder\@
+.Ldone\@:
+       // Store the next tweak back to *TWEAK to support continuation calls.
+       vmovdqu         TWEAK0_XMM, (TWEAK)
+.if VL > 16
+       vzeroupper
+.endif
+       RET
+
+.Lhandle_remainder\@:
+       add             $4*VL, LEN      // Undo the extra sub from earlier.
+
+       // En/decrypt any remaining full blocks, one vector at a time.
+.if VL > 16
+       sub             $VL, LEN
+       jl              .Lvec_at_a_time_done\@
+.Lvec_at_a_time\@:
+       _vmovdqu        (SRC), V0
+       _aes_crypt      \enc, , TWEAK0, V0
+       _vmovdqu        V0, (DST)
+       _next_tweakvec  TWEAK0, V0, V1, TWEAK0
+       add             $VL, SRC
+       add             $VL, DST
+       sub             $VL, LEN
+       jge             .Lvec_at_a_time\@
+.Lvec_at_a_time_done\@:
+       add             $VL-16, LEN     // Undo the extra sub from earlier.
+.else
+       sub             $16, LEN
+.endif
+
+       // En/decrypt any remaining full blocks, one at a time.
+       jl              .Lblock_at_a_time_done\@
+.Lblock_at_a_time\@:
+       vmovdqu         (SRC), %xmm0
+       _aes_crypt      \enc, _XMM, TWEAK0_XMM, %xmm0
+       vmovdqu         %xmm0, (DST)
+       _next_tweak     TWEAK0_XMM, %xmm0, TWEAK0_XMM
+       add             $16, SRC
+       add             $16, DST
+       sub             $16, LEN
+       jge             .Lblock_at_a_time\@
+.Lblock_at_a_time_done\@:
+       add             $16, LEN        // Undo the extra sub from earlier.
+
+.Lfull_blocks_done\@:
+       // Now 0 <= LEN <= 15.  If LEN is nonzero, do ciphertext stealing to
+       // process the last 16 + LEN bytes.  If LEN is zero, we're done.
+       test            LEN, LEN
+       jnz             .Lcts\@
+       jmp             .Ldone\@
+
+       // Out-of-line handling of AES-128 and AES-192
+.Lencrypt_4x_aes_128_or_192\@:
+       jz              .Lencrypt_4x_aes_192\@
+       _vaes_4x        \enc, 1, 10
+       jmp             .Lencrypt_4x_done\@
+.Lencrypt_4x_aes_192\@:
+       _vaes_4x        \enc, 0, 10
+       _vaes_4x        \enc, 0, 11
+       _vaes_4x        \enc, 1, 12
+       jmp             .Lencrypt_4x_done\@
+
+.Lneed_cts\@:
+       // The data length isn't a multiple of the AES block length, so
+       // ciphertext stealing (CTS) will be needed.  Subtract one block from
+       // LEN so that the main loop doesn't process the last full block.  The
+       // CTS step will process it specially along with the partial block.
+       sub             $16, LEN
+       jmp             .Lxts_init\@
+
+.Lcts\@:
+       // Do ciphertext stealing (CTS) to en/decrypt the last full block and
+       // the partial block.  CTS needs two tweaks.  TWEAK0_XMM contains the
+       // next tweak; compute the one after that.  Decryption uses these two
+       // tweaks in reverse order, so also define aliases to handle that.
+       _next_tweak     TWEAK0_XMM, %xmm0, TWEAK1_XMM
+.if \enc
+       .set            CTS_TWEAK0,     TWEAK0_XMM
+       .set            CTS_TWEAK1,     TWEAK1_XMM
+.else
+       .set            CTS_TWEAK0,     TWEAK1_XMM
+       .set            CTS_TWEAK1,     TWEAK0_XMM
+.endif
+
+       // En/decrypt the last full block.
+       vmovdqu         (SRC), %xmm0
+       _aes_crypt      \enc, _XMM, CTS_TWEAK0, %xmm0
+
+.if USE_AVX10
+       // Create a mask that has the first LEN bits set.
+       mov             $-1, %rax
+       bzhi            LEN, %rax, %rax
+       kmovq           %rax, %k1
+
+       // Swap the first LEN bytes of the above result with the partial block.
+       // Note that to support in-place en/decryption, the load from the src
+       // partial block must happen before the store to the dst partial block.
+       vmovdqa         %xmm0, %xmm1
+       vmovdqu8        16(SRC), %xmm0{%k1}
+       vmovdqu8        %xmm1, 16(DST){%k1}
+.else
+       lea             .Lcts_permute_table(%rip), %rax
+
+       // Load the src partial block, left-aligned.  Note that to support
+       // in-place en/decryption, this must happen before the store to the dst
+       // partial block.
+       vmovdqu         (SRC, LEN, 1), %xmm1
+
+       // Shift the first LEN bytes of the en/decryption of the last full block
+       // to the end of a register, then store it to DST+LEN.  This stores the
+       // dst partial block.  It also writes to the second part of the dst last
+       // full block, but that part is overwritten later.
+       vpshufb         (%rax, LEN, 1), %xmm0, %xmm2
+       vmovdqu         %xmm2, (DST, LEN, 1)
+
+       // Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
+       sub             LEN, %rax
+       vmovdqu         32(%rax), %xmm3
+
+       // Shift the src partial block to the beginning of its register.
+       vpshufb         %xmm3, %xmm1, %xmm1
+
+       // Do a blend to generate the src partial block followed by the second
+       // part of the en/decryption of the last full block.
+       vpblendvb       %xmm3, %xmm0, %xmm1, %xmm0
+.endif
+       // En/decrypt again and store the last full block.
+       _aes_crypt      \enc, _XMM, CTS_TWEAK1, %xmm0
+       vmovdqu         %xmm0, (DST)
+       jmp             .Ldone\@
+.endm
+
+// void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
+//                        u8 iv[AES_BLOCK_SIZE]);
+SYM_FUNC_START(aes_xts_encrypt_iv)
+       vmovdqu         (%rsi), %xmm0
+       vpxor           0*16(%rdi), %xmm0, %xmm0
+       vaesenc         1*16(%rdi), %xmm0, %xmm0
+       vaesenc         2*16(%rdi), %xmm0, %xmm0
+       vaesenc         3*16(%rdi), %xmm0, %xmm0
+       vaesenc         4*16(%rdi), %xmm0, %xmm0
+       vaesenc         5*16(%rdi), %xmm0, %xmm0
+       vaesenc         6*16(%rdi), %xmm0, %xmm0
+       vaesenc         7*16(%rdi), %xmm0, %xmm0
+       vaesenc         8*16(%rdi), %xmm0, %xmm0
+       vaesenc         9*16(%rdi), %xmm0, %xmm0
+       cmpl            $24, 480(%rdi)
+       jle             .Lencrypt_iv_aes_128_or_192
+       vaesenc         10*16(%rdi), %xmm0, %xmm0
+       vaesenc         11*16(%rdi), %xmm0, %xmm0
+       vaesenc         12*16(%rdi), %xmm0, %xmm0
+       vaesenc         13*16(%rdi), %xmm0, %xmm0
+       vaesenclast     14*16(%rdi), %xmm0, %xmm0
+.Lencrypt_iv_done:
+       vmovdqu         %xmm0, (%rsi)
+       RET
+
+       // Out-of-line handling of AES-128 and AES-192
+.Lencrypt_iv_aes_128_or_192:
+       jz              .Lencrypt_iv_aes_192
+       vaesenclast     10*16(%rdi), %xmm0, %xmm0
+       jmp             .Lencrypt_iv_done
+.Lencrypt_iv_aes_192:
+       vaesenc         10*16(%rdi), %xmm0, %xmm0
+       vaesenc         11*16(%rdi), %xmm0, %xmm0
+       vaesenclast     12*16(%rdi), %xmm0, %xmm0
+       jmp             .Lencrypt_iv_done
+SYM_FUNC_END(aes_xts_encrypt_iv)
+
+// Below are the actual AES-XTS encryption and decryption functions,
+// instantiated from the above macro.  They all have the following prototype:
+//
+// void (*xts_asm_func)(const struct crypto_aes_ctx *key,
+//                     const u8 *src, u8 *dst, size_t len,
+//                     u8 tweak[AES_BLOCK_SIZE]);
+//
+// |key| is the data key.  |tweak| contains the next tweak; the encryption of
+// the original IV with the tweak key was already done.  This function supports
+// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
+// |len| must be a multiple of 16 except on the last call.  If |len| is a
+// multiple of 16, then this function updates |tweak| to contain the next tweak.
author	Eric Biggers <ebiggers@google.com>
	Fri, 29 Mar 2024 08:03:50 +0000 (01:03 -0700)
committer	Herbert Xu <herbert@gondor.apana.org.au>
	Fri, 5 Apr 2024 07:46:33 +0000 (15:46 +0800)
arch/x86/crypto/Makefile		patch \| blob \| history
arch/x86/crypto/aes-xts-avx-x86_64.S	[new file with mode: 0644]	patch \| blob