1
0
Fork 0
mirror of https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git synced 2025-01-22 07:53:11 -05:00

crypto: x86/crc32c - eliminate jump table and excessive unrolling

crc32c-pcl-intel-asm_64.S has a loop with 1 to 127 iterations fully
unrolled and uses a jump table to jump into the correct location.  This
optimization is misguided, as it bloats the binary code size and
introduces an indirect call.  x86_64 CPUs can predict loops well, so it
is fine to just use a loop instead.  Loop bookkeeping instructions can
compete with the crc instructions for the ALUs, but this is easily
mitigated by unrolling the loop by a smaller amount, such as 4 times.

Therefore, re-roll the loop and make related tweaks to the code.

This reduces the binary code size of crc_pclmul() from 4546 bytes to 418
bytes, a 91% reduction.  In general it also makes the code faster, with
some large improvements seen when retpoline is enabled.

More detailed performance results are shown below.  They are given as
percent improvement in throughput (negative means regressed) for CPU
microarchitecture vs. input length in bytes.  E.g. an improvement from
40 GB/s to 50 GB/s would be listed as 25%.

Table 1: Results with retpoline enabled (the default):

                       |   512 |   833 |  1024 |  2000 |  3173 |  4096 |
  ---------------------+-------+-------+-------+------ +-------+-------+
  Intel Haswell        | 35.0% | 20.7% | 17.8% |  9.7% | -0.2% |  4.4% |
  Intel Emerald Rapids | 66.8% | 45.2% | 36.3% | 19.3% |  0.0% |  5.4% |
  AMD Zen 2            | 29.5% | 17.2% | 13.5% |  8.6% | -0.5% |  2.8% |

Table 2: Results with retpoline disabled:

                       |   512 |   833 |  1024 |  2000 |  3173 |  4096 |
  ---------------------+-------+-------+-------+------ +-------+-------+
  Intel Haswell        |  3.3% |  4.8% |  4.5% |  0.9% | -2.9% |  0.3% |
  Intel Emerald Rapids |  7.5% |  6.4% |  5.2% |  2.3% | -0.0% |  0.6% |
  AMD Zen 2            | 11.8% |  1.4% |  0.2% |  1.3% | -0.9% | -0.2% |

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
Eric Biggers 2024-10-13 21:24:47 -07:00 committed by Herbert Xu
parent eebcadfa21
commit 84dd048cf8

View file

@ -7,6 +7,7 @@
* http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
*
* Copyright (C) 2012 Intel Corporation.
* Copyright 2024 Google LLC
*
* Authors:
* Wajdi Feghali <wajdi.k.feghali@intel.com>
@ -44,18 +45,9 @@
*/
#include <linux/linkage.h>
#include <asm/nospec-branch.h>
## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
.macro LABEL prefix n
.L\prefix\n\():
.endm
.macro JMPTBL_ENTRY i
.quad .Lcrc_\i
.endm
# Define threshold below which buffers are considered "small" and routed to
# regular CRC code that does not interleave the CRC instructions.
#define SMALL_SIZE 200
@ -64,139 +56,116 @@
.text
SYM_FUNC_START(crc_pcl)
#define bufp rdi
#define bufp_dw %edi
#define bufp_w %di
#define bufp_b %dil
#define bufptmp %rcx
#define block_0 %rcx
#define block_1 %rdx
#define block_2 %r11
#define len %esi
#define crc_init_arg %edx
#define tmp %rbx
#define crc_init %r8d
#define crc_init_q %r8
#define crc1 %r9
#define crc2 %r10
#define bufp %rdi
#define bufp_d %edi
#define len %esi
#define crc_init %edx
#define crc_init_q %rdx
#define n_misaligned %ecx /* overlaps chunk_bytes! */
#define n_misaligned_q %rcx
#define chunk_bytes %ecx /* overlaps n_misaligned! */
#define chunk_bytes_q %rcx
#define crc1 %r8
#define crc2 %r9
pushq %rbx
pushq %rdi
pushq %rsi
## Move crc_init for Linux to a different
mov crc_init_arg, crc_init
mov %bufp, bufptmp # rdi = *buf
cmp $SMALL_SIZE, len
jb .Lsmall
################################################################
## 1) ALIGN:
################################################################
neg %bufp
and $7, %bufp # calculate the unalignment amount of
mov bufp_d, n_misaligned
neg n_misaligned
and $7, n_misaligned # calculate the misalignment amount of
# the address
je .Lproc_block # Skip if aligned
je .Laligned # Skip if aligned
# Process 1 <= n_misaligned <= 7 bytes individually in order to align
# the remaining data to an 8-byte boundary.
.Ldo_align:
#### Calculate CRC of unaligned bytes of the buffer (if any)
movq (bufptmp), tmp # load a quadward from the buffer
add %bufp, bufptmp # align buffer pointer for quadword
# processing
sub bufp_dw, len # update buffer length
movq (bufp), %rax
add n_misaligned_q, bufp
sub n_misaligned, len
.Lalign_loop:
crc32b %bl, crc_init # compute crc32 of 1-byte
shr $8, tmp # get next byte
dec %bufp
crc32b %al, crc_init # compute crc32 of 1-byte
shr $8, %rax # get next byte
dec n_misaligned
jne .Lalign_loop
.Lproc_block:
.Laligned:
################################################################
## 2) PROCESS BLOCKS:
## 2) PROCESS BLOCK:
################################################################
## compute num of bytes to be processed
cmp $128*24, len
jae .Lfull_block
.Lcontinue_block:
## len < 128*24
movq $2731, %rax # 2731 = ceil(2^16 / 24)
mul len
shrq $16, %rax
.Lpartial_block:
# Compute floor(len / 24) to get num qwords to process from each lane.
imul $2731, len, %eax # 2731 = ceil(2^16 / 24)
shr $16, %eax
jmp .Lcrc_3lanes
## eax contains floor(bytes / 24) = num 24-byte chunks to do
## process rax 24-byte chunks (128 >= rax >= 0)
## compute end address of each block
## block 0 (base addr + RAX * 8)
## block 1 (base addr + RAX * 16)
## block 2 (base addr + RAX * 24)
lea (bufptmp, %rax, 8), block_0
lea (block_0, %rax, 8), block_1
lea (block_1, %rax, 8), block_2
xor crc1, crc1
xor crc2, crc2
## branch into array
leaq jump_table(%rip), %bufp
mov (%bufp,%rax,8), %bufp
JMP_NOSPEC bufp
################################################################
## 2a) PROCESS FULL BLOCKS:
################################################################
.Lfull_block:
movl $128,%eax
lea 128*8*2(block_0), block_1
lea 128*8*3(block_0), block_2
add $128*8*1, block_0
# Processing 128 qwords from each lane.
mov $128, %eax
xor crc1,crc1
################################################################
## 3) CRC each of three lanes:
################################################################
.Lcrc_3lanes:
xor crc1,crc1
xor crc2,crc2
mov %eax, chunk_bytes
shl $3, chunk_bytes # num bytes to process from each lane
sub $5, %eax # 4 for 4x_loop, 1 for special last iter
jl .Lcrc_3lanes_4x_done
# Fall through into top of crc array (crc_128)
# Unroll the loop by a factor of 4 to reduce the overhead of the loop
# bookkeeping instructions, which can compete with crc32q for the ALUs.
.Lcrc_3lanes_4x_loop:
crc32q (bufp), crc_init_q
crc32q (bufp,chunk_bytes_q), crc1
crc32q (bufp,chunk_bytes_q,2), crc2
crc32q 8(bufp), crc_init_q
crc32q 8(bufp,chunk_bytes_q), crc1
crc32q 8(bufp,chunk_bytes_q,2), crc2
crc32q 16(bufp), crc_init_q
crc32q 16(bufp,chunk_bytes_q), crc1
crc32q 16(bufp,chunk_bytes_q,2), crc2
crc32q 24(bufp), crc_init_q
crc32q 24(bufp,chunk_bytes_q), crc1
crc32q 24(bufp,chunk_bytes_q,2), crc2
add $32, bufp
sub $4, %eax
jge .Lcrc_3lanes_4x_loop
################################################################
## 3) CRC Array:
################################################################
.Lcrc_3lanes_4x_done:
add $4, %eax
jz .Lcrc_3lanes_last_qword
i=128
.rept 128-1
.altmacro
LABEL crc_ %i
.noaltmacro
ENDBR
crc32q -i*8(block_0), crc_init_q
crc32q -i*8(block_1), crc1
crc32q -i*8(block_2), crc2
i=(i-1)
.endr
.Lcrc_3lanes_1x_loop:
crc32q (bufp), crc_init_q
crc32q (bufp,chunk_bytes_q), crc1
crc32q (bufp,chunk_bytes_q,2), crc2
add $8, bufp
dec %eax
jnz .Lcrc_3lanes_1x_loop
.altmacro
LABEL crc_ %i
.noaltmacro
ENDBR
crc32q -i*8(block_0), crc_init_q
crc32q -i*8(block_1), crc1
# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet
mov block_2, block_0
.Lcrc_3lanes_last_qword:
crc32q (bufp), crc_init_q
crc32q (bufp,chunk_bytes_q), crc1
# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
################################################################
## 4) Combine three results:
################################################################
lea (K_table-8)(%rip), %bufp # first entry is for idx 1
shlq $3, %rax # rax *= 8
pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2
leal (%eax,%eax,2), %eax # rax *= 3 (total *24)
sub %eax, len # len -= rax*24
lea (K_table-8)(%rip), %rax # first entry is for idx 1
pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
sub %eax, len # len -= chunk_bytes * 3
movq crc_init_q, %xmm1 # CRC for block 1
pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
@ -206,20 +175,19 @@ LABEL crc_ %i
pxor %xmm2,%xmm1
movq %xmm1, %rax
xor -i*8(block_2), %rax
xor (bufp,chunk_bytes_q,2), %rax
mov crc2, crc_init_q
crc32 %rax, crc_init_q
lea 8(bufp,chunk_bytes_q,2), bufp
################################################################
## 5) Check for end:
## 5) If more blocks remain, goto (2):
################################################################
LABEL crc_ 0
ENDBR
cmp $128*24, len
jae .Lfull_block
jae .Lfull_block
cmp $SMALL_SIZE, len
jae .Lcontinue_block
jae .Lpartial_block
#######################################################################
## 6) Process any remainder without interleaving:
@ -231,47 +199,30 @@ LABEL crc_ 0
shr $3, %eax
jz .Ldo_dword
.Ldo_qwords:
crc32q (bufptmp), crc_init_q
add $8, bufptmp
crc32q (bufp), crc_init_q
add $8, bufp
dec %eax
jnz .Ldo_qwords
.Ldo_dword:
test $4, len
jz .Ldo_word
crc32l (bufptmp), crc_init
add $4, bufptmp
crc32l (bufp), crc_init
add $4, bufp
.Ldo_word:
test $2, len
jz .Ldo_byte
crc32w (bufptmp), crc_init
add $2, bufptmp
crc32w (bufp), crc_init
add $2, bufp
.Ldo_byte:
test $1, len
jz .Ldone
crc32b (bufptmp), crc_init
crc32b (bufp), crc_init
.Ldone:
mov crc_init, %eax
popq %rsi
popq %rdi
popq %rbx
RET
SYM_FUNC_END(crc_pcl)
.section .rodata, "a", @progbits
################################################################
## jump table Table is 129 entries x 2 bytes each
################################################################
.align 4
jump_table:
i=0
.rept 129
.altmacro
JMPTBL_ENTRY %i
.noaltmacro
i=i+1
.endr
################################################################
## PCLMULQDQ tables
## Table is 128 entries x 2 words (8 bytes) each