mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-22 16:06:04 -05:00
crypto: x86/crc32c - eliminate jump table and excessive unrolling
crc32c-pcl-intel-asm_64.S has a loop with 1 to 127 iterations fully unrolled and uses a jump table to jump into the correct location. This optimization is misguided, as it bloats the binary code size and introduces an indirect jump. x86_64 CPUs can predict loops well, so it is fine to just use a loop instead. Loop bookkeeping instructions can compete with the crc instructions for the ALUs, but this is easily mitigated by unrolling the loop by a smaller amount, such as 4 times.

Therefore, re-roll the loop and make related tweaks to the code.

This reduces the binary code size of crc_pcl() from 4546 bytes to 418 bytes, a 91% reduction. In general it also makes the code faster, with some large improvements seen when retpoline is enabled.

More detailed performance results are shown below. They are given as percent improvement in throughput (negative means regressed) for CPU microarchitecture vs. input length in bytes. E.g. an improvement from 40 GB/s to 50 GB/s would be listed as 25%.

Table 1: Results with retpoline enabled (the default):

                     |  512  |  833  | 1024  | 2000  | 3173  | 4096  |
---------------------+-------+-------+-------+-------+-------+-------+
Intel Haswell        | 35.0% | 20.7% | 17.8% |  9.7% | -0.2% |  4.4% |
Intel Emerald Rapids | 66.8% | 45.2% | 36.3% | 19.3% |  0.0% |  5.4% |
AMD Zen 2            | 29.5% | 17.2% | 13.5% |  8.6% | -0.5% |  2.8% |

Table 2: Results with retpoline disabled:

                     |  512  |  833  | 1024  | 2000  | 3173  | 4096  |
---------------------+-------+-------+-------+-------+-------+-------+
Intel Haswell        |  3.3% |  4.8% |  4.5% |  0.9% | -2.9% |  0.3% |
Intel Emerald Rapids |  7.5% |  6.4% |  5.2% |  2.3% | -0.0% |  0.6% |
AMD Zen 2            | 11.8% |  1.4% |  0.2% |  1.3% | -0.9% | -0.2% |

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
parent
eebcadfa21
commit
84dd048cf8
1 changed file with 91 additions and 140 deletions
|
@ -7,6 +7,7 @@
|
|||
* http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
|
||||
*
|
||||
* Copyright (C) 2012 Intel Corporation.
|
||||
* Copyright 2024 Google LLC
|
||||
*
|
||||
* Authors:
|
||||
* Wajdi Feghali <wajdi.k.feghali@intel.com>
|
||||
|
@ -44,18 +45,9 @@
|
|||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
#include <asm/nospec-branch.h>
|
||||
|
||||
## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
|
||||
|
||||
.macro LABEL prefix n
|
||||
.L\prefix\n\():
|
||||
.endm
|
||||
|
||||
.macro JMPTBL_ENTRY i
|
||||
.quad .Lcrc_\i
|
||||
.endm
|
||||
|
||||
# Define threshold below which buffers are considered "small" and routed to
|
||||
# regular CRC code that does not interleave the CRC instructions.
|
||||
#define SMALL_SIZE 200
|
||||
|
@ -64,139 +56,116 @@
|
|||
|
||||
.text
|
||||
SYM_FUNC_START(crc_pcl)
|
||||
#define bufp rdi
|
||||
#define bufp_dw %edi
|
||||
#define bufp_w %di
|
||||
#define bufp_b %dil
|
||||
#define bufptmp %rcx
|
||||
#define block_0 %rcx
|
||||
#define block_1 %rdx
|
||||
#define block_2 %r11
|
||||
#define len %esi
|
||||
#define crc_init_arg %edx
|
||||
#define tmp %rbx
|
||||
#define crc_init %r8d
|
||||
#define crc_init_q %r8
|
||||
#define crc1 %r9
|
||||
#define crc2 %r10
|
||||
#define bufp %rdi
|
||||
#define bufp_d %edi
|
||||
#define len %esi
|
||||
#define crc_init %edx
|
||||
#define crc_init_q %rdx
|
||||
#define n_misaligned %ecx /* overlaps chunk_bytes! */
|
||||
#define n_misaligned_q %rcx
|
||||
#define chunk_bytes %ecx /* overlaps n_misaligned! */
|
||||
#define chunk_bytes_q %rcx
|
||||
#define crc1 %r8
|
||||
#define crc2 %r9
|
||||
|
||||
pushq %rbx
|
||||
pushq %rdi
|
||||
pushq %rsi
|
||||
|
||||
## Move crc_init for Linux to a different register
|
||||
mov crc_init_arg, crc_init
|
||||
|
||||
mov %bufp, bufptmp # rdi = *buf
|
||||
cmp $SMALL_SIZE, len
|
||||
jb .Lsmall
|
||||
|
||||
################################################################
|
||||
## 1) ALIGN:
|
||||
################################################################
|
||||
neg %bufp
|
||||
and $7, %bufp # calculate the unalignment amount of
|
||||
mov bufp_d, n_misaligned
|
||||
neg n_misaligned
|
||||
and $7, n_misaligned # calculate the misalignment amount of
|
||||
# the address
|
||||
je .Lproc_block # Skip if aligned
|
||||
je .Laligned # Skip if aligned
|
||||
|
||||
# Process 1 <= n_misaligned <= 7 bytes individually in order to align
|
||||
# the remaining data to an 8-byte boundary.
|
||||
.Ldo_align:
|
||||
#### Calculate CRC of unaligned bytes of the buffer (if any)
|
||||
movq (bufptmp), tmp # load a quadword from the buffer
|
||||
add %bufp, bufptmp # align buffer pointer for quadword
|
||||
# processing
|
||||
sub bufp_dw, len # update buffer length
|
||||
movq (bufp), %rax
|
||||
add n_misaligned_q, bufp
|
||||
sub n_misaligned, len
|
||||
.Lalign_loop:
|
||||
crc32b %bl, crc_init # compute crc32 of 1-byte
|
||||
shr $8, tmp # get next byte
|
||||
dec %bufp
|
||||
crc32b %al, crc_init # compute crc32 of 1-byte
|
||||
shr $8, %rax # get next byte
|
||||
dec n_misaligned
|
||||
jne .Lalign_loop
|
||||
|
||||
.Lproc_block:
|
||||
.Laligned:
|
||||
|
||||
################################################################
|
||||
## 2) PROCESS BLOCKS:
|
||||
## 2) PROCESS BLOCK:
|
||||
################################################################
|
||||
|
||||
## compute num of bytes to be processed
|
||||
|
||||
cmp $128*24, len
|
||||
jae .Lfull_block
|
||||
|
||||
.Lcontinue_block:
|
||||
## len < 128*24
|
||||
movq $2731, %rax # 2731 = ceil(2^16 / 24)
|
||||
mul len
|
||||
shrq $16, %rax
|
||||
.Lpartial_block:
|
||||
# Compute floor(len / 24) to get num qwords to process from each lane.
|
||||
imul $2731, len, %eax # 2731 = ceil(2^16 / 24)
|
||||
shr $16, %eax
|
||||
jmp .Lcrc_3lanes
|
||||
|
||||
## eax contains floor(bytes / 24) = num 24-byte chunks to do
|
||||
|
||||
## process rax 24-byte chunks (128 >= rax >= 0)
|
||||
|
||||
## compute end address of each block
|
||||
## block 0 (base addr + RAX * 8)
|
||||
## block 1 (base addr + RAX * 16)
|
||||
## block 2 (base addr + RAX * 24)
|
||||
lea (bufptmp, %rax, 8), block_0
|
||||
lea (block_0, %rax, 8), block_1
|
||||
lea (block_1, %rax, 8), block_2
|
||||
|
||||
xor crc1, crc1
|
||||
xor crc2, crc2
|
||||
|
||||
## branch into array
|
||||
leaq jump_table(%rip), %bufp
|
||||
mov (%bufp,%rax,8), %bufp
|
||||
JMP_NOSPEC bufp
|
||||
|
||||
################################################################
|
||||
## 2a) PROCESS FULL BLOCKS:
|
||||
################################################################
|
||||
.Lfull_block:
|
||||
movl $128,%eax
|
||||
lea 128*8*2(block_0), block_1
|
||||
lea 128*8*3(block_0), block_2
|
||||
add $128*8*1, block_0
|
||||
# Processing 128 qwords from each lane.
|
||||
mov $128, %eax
|
||||
|
||||
xor crc1,crc1
|
||||
################################################################
|
||||
## 3) CRC each of three lanes:
|
||||
################################################################
|
||||
|
||||
.Lcrc_3lanes:
|
||||
xor crc1,crc1
|
||||
xor crc2,crc2
|
||||
mov %eax, chunk_bytes
|
||||
shl $3, chunk_bytes # num bytes to process from each lane
|
||||
sub $5, %eax # 4 for 4x_loop, 1 for special last iter
|
||||
jl .Lcrc_3lanes_4x_done
|
||||
|
||||
# Fall through into top of crc array (crc_128)
|
||||
# Unroll the loop by a factor of 4 to reduce the overhead of the loop
|
||||
# bookkeeping instructions, which can compete with crc32q for the ALUs.
|
||||
.Lcrc_3lanes_4x_loop:
|
||||
crc32q (bufp), crc_init_q
|
||||
crc32q (bufp,chunk_bytes_q), crc1
|
||||
crc32q (bufp,chunk_bytes_q,2), crc2
|
||||
crc32q 8(bufp), crc_init_q
|
||||
crc32q 8(bufp,chunk_bytes_q), crc1
|
||||
crc32q 8(bufp,chunk_bytes_q,2), crc2
|
||||
crc32q 16(bufp), crc_init_q
|
||||
crc32q 16(bufp,chunk_bytes_q), crc1
|
||||
crc32q 16(bufp,chunk_bytes_q,2), crc2
|
||||
crc32q 24(bufp), crc_init_q
|
||||
crc32q 24(bufp,chunk_bytes_q), crc1
|
||||
crc32q 24(bufp,chunk_bytes_q,2), crc2
|
||||
add $32, bufp
|
||||
sub $4, %eax
|
||||
jge .Lcrc_3lanes_4x_loop
|
||||
|
||||
################################################################
|
||||
## 3) CRC Array:
|
||||
################################################################
|
||||
.Lcrc_3lanes_4x_done:
|
||||
add $4, %eax
|
||||
jz .Lcrc_3lanes_last_qword
|
||||
|
||||
i=128
|
||||
.rept 128-1
|
||||
.altmacro
|
||||
LABEL crc_ %i
|
||||
.noaltmacro
|
||||
ENDBR
|
||||
crc32q -i*8(block_0), crc_init_q
|
||||
crc32q -i*8(block_1), crc1
|
||||
crc32q -i*8(block_2), crc2
|
||||
i=(i-1)
|
||||
.endr
|
||||
.Lcrc_3lanes_1x_loop:
|
||||
crc32q (bufp), crc_init_q
|
||||
crc32q (bufp,chunk_bytes_q), crc1
|
||||
crc32q (bufp,chunk_bytes_q,2), crc2
|
||||
add $8, bufp
|
||||
dec %eax
|
||||
jnz .Lcrc_3lanes_1x_loop
|
||||
|
||||
.altmacro
|
||||
LABEL crc_ %i
|
||||
.noaltmacro
|
||||
ENDBR
|
||||
crc32q -i*8(block_0), crc_init_q
|
||||
crc32q -i*8(block_1), crc1
|
||||
# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet
|
||||
|
||||
mov block_2, block_0
|
||||
.Lcrc_3lanes_last_qword:
|
||||
crc32q (bufp), crc_init_q
|
||||
crc32q (bufp,chunk_bytes_q), crc1
|
||||
# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
|
||||
|
||||
################################################################
|
||||
## 4) Combine three results:
|
||||
################################################################
|
||||
|
||||
lea (K_table-8)(%rip), %bufp # first entry is for idx 1
|
||||
shlq $3, %rax # rax *= 8
|
||||
pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2
|
||||
leal (%eax,%eax,2), %eax # rax *= 3 (total *24)
|
||||
sub %eax, len # len -= rax*24
|
||||
lea (K_table-8)(%rip), %rax # first entry is for idx 1
|
||||
pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
|
||||
lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
|
||||
sub %eax, len # len -= chunk_bytes * 3
|
||||
|
||||
movq crc_init_q, %xmm1 # CRC for block 1
|
||||
pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
|
||||
|
@ -206,20 +175,19 @@ LABEL crc_ %i
|
|||
|
||||
pxor %xmm2,%xmm1
|
||||
movq %xmm1, %rax
|
||||
xor -i*8(block_2), %rax
|
||||
xor (bufp,chunk_bytes_q,2), %rax
|
||||
mov crc2, crc_init_q
|
||||
crc32 %rax, crc_init_q
|
||||
lea 8(bufp,chunk_bytes_q,2), bufp
|
||||
|
||||
################################################################
|
||||
## 5) Check for end:
|
||||
## 5) If more blocks remain, goto (2):
|
||||
################################################################
|
||||
|
||||
LABEL crc_ 0
|
||||
ENDBR
|
||||
cmp $128*24, len
|
||||
jae .Lfull_block
|
||||
jae .Lfull_block
|
||||
cmp $SMALL_SIZE, len
|
||||
jae .Lcontinue_block
|
||||
jae .Lpartial_block
|
||||
|
||||
#######################################################################
|
||||
## 6) Process any remainder without interleaving:
|
||||
|
@ -231,47 +199,30 @@ LABEL crc_ 0
|
|||
shr $3, %eax
|
||||
jz .Ldo_dword
|
||||
.Ldo_qwords:
|
||||
crc32q (bufptmp), crc_init_q
|
||||
add $8, bufptmp
|
||||
crc32q (bufp), crc_init_q
|
||||
add $8, bufp
|
||||
dec %eax
|
||||
jnz .Ldo_qwords
|
||||
.Ldo_dword:
|
||||
test $4, len
|
||||
jz .Ldo_word
|
||||
crc32l (bufptmp), crc_init
|
||||
add $4, bufptmp
|
||||
crc32l (bufp), crc_init
|
||||
add $4, bufp
|
||||
.Ldo_word:
|
||||
test $2, len
|
||||
jz .Ldo_byte
|
||||
crc32w (bufptmp), crc_init
|
||||
add $2, bufptmp
|
||||
crc32w (bufp), crc_init
|
||||
add $2, bufp
|
||||
.Ldo_byte:
|
||||
test $1, len
|
||||
jz .Ldone
|
||||
crc32b (bufptmp), crc_init
|
||||
crc32b (bufp), crc_init
|
||||
.Ldone:
|
||||
mov crc_init, %eax
|
||||
popq %rsi
|
||||
popq %rdi
|
||||
popq %rbx
|
||||
RET
|
||||
SYM_FUNC_END(crc_pcl)
|
||||
|
||||
.section .rodata, "a", @progbits
|
||||
################################################################
|
||||
## jump table: 129 entries x 8 bytes each (one .quad per entry)
|
||||
################################################################
|
||||
.align 4
|
||||
jump_table:
|
||||
i=0
|
||||
.rept 129
|
||||
.altmacro
|
||||
JMPTBL_ENTRY %i
|
||||
.noaltmacro
|
||||
i=i+1
|
||||
.endr
|
||||
|
||||
|
||||
################################################################
|
||||
## PCLMULQDQ tables
|
||||
## Table is 128 entries x 2 dwords (8 bytes) each
|
||||
|
|
Loading…
Reference in a new issue