crypto: x86/crc32c - eliminate jump table and excessive unrolling

crc32c-pcl-intel-asm_64.S has a loop with 1 to 127 iterations fully unrolled and uses a jump table to jump into the correct location. This optimization is misguided, as it bloats the binary code size and introduces an indirect call. x86_64 CPUs can predict loops well, so it is fine to just use a loop instead. Loop bookkeeping instructions can compete with the crc instructions for the ALUs, but this is easily mitigated by unrolling the loop by a smaller amount, such as 4 times. Therefore, re-roll the loop and make related tweaks to the code. This reduces the binary code size of crc_pclmul() from 4546 bytes to 418 bytes, a 91% reduction. In general it also makes the code faster, with some large improvements seen when retpoline is enabled. More detailed performance results are shown below. They are given as percent improvement in throughput (negative means regressed) for CPU microarchitecture vs. input length in bytes. E.g. an improvement from 40 GB/s to 50 GB/s would be listed as 25%. Table 1: Results with retpoline enabled (the default): | 512 | 833 | 1024 | 2000 | 3173 | 4096 | ---------------------+-------+-------+-------+------ +-------+-------+ Intel Haswell | 35.0% | 20.7% | 17.8% | 9.7% | -0.2% | 4.4% | Intel Emerald Rapids | 66.8% | 45.2% | 36.3% | 19.3% | 0.0% | 5.4% | AMD Zen 2 | 29.5% | 17.2% | 13.5% | 8.6% | -0.5% | 2.8% | Table 2: Results with retpoline disabled: | 512 | 833 | 1024 | 2000 | 3173 | 4096 | ---------------------+-------+-------+-------+------ +-------+-------+ Intel Haswell | 3.3% | 4.8% | 4.5% | 0.9% | -2.9% | 0.3% | Intel Emerald Rapids | 7.5% | 6.4% | 5.2% | 2.3% | -0.0% | 0.6% | AMD Zen 2 | 11.8% | 1.4% | 0.2% | 1.3% | -0.9% | -0.2% | Signed-off-by: Eric Biggers <ebiggers@google.com> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2025-01-22 16:06:04 -05:00 · 2024-10-13 21:24:47 -07:00 · 2024-10-13 21:24:47 -07:00 · 84dd048cf8
commit 84dd048cf8
parent eebcadfa21
1 changed files with 91 additions and 140 deletions
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@ -7,6 +7,7 @@
 * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
 *
 * Copyright (C) 2012 Intel Corporation.
+ * Copyright 2024 Google LLC
 *
 * Authors:
 *	Wajdi Feghali <wajdi.k.feghali@intel.com>
@ -44,18 +45,9 @@
 */

 #include <linux/linkage.h>
-#include <asm/nospec-branch.h>

 ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction

-.macro LABEL prefix n
-.L\prefix\n\():
-.endm
-
-.macro JMPTBL_ENTRY i
-.quad .Lcrc_\i
-.endm
-
 # Define threshold below which buffers are considered "small" and routed to
 # regular CRC code that does not interleave the CRC instructions.
 #define SMALL_SIZE 200
@ -64,139 +56,116 @@

 .text
 SYM_FUNC_START(crc_pcl)
-#define    bufp		rdi
-#define    bufp_dw	%edi
-#define    bufp_w	%di
-#define    bufp_b	%dil
-#define    bufptmp	%rcx
-#define    block_0	%rcx
-#define    block_1	%rdx
-#define    block_2	%r11
-#define    len		%esi
-#define    crc_init_arg %edx
-#define    tmp		%rbx
-#define    crc_init	%r8d
-#define    crc_init_q	%r8
-#define    crc1		%r9
-#define    crc2		%r10
+#define    bufp		  %rdi
+#define    bufp_d	  %edi
+#define    len		  %esi
+#define    crc_init	  %edx
+#define    crc_init_q	  %rdx
+#define    n_misaligned	  %ecx /* overlaps chunk_bytes! */
+#define    n_misaligned_q %rcx
+#define    chunk_bytes	  %ecx /* overlaps n_misaligned! */
+#define    chunk_bytes_q  %rcx
+#define    crc1		  %r8
+#define    crc2		  %r9

-	pushq   %rbx
-	pushq   %rdi
-	pushq   %rsi
-
-	## Move crc_init for Linux to a different
-	mov     crc_init_arg, crc_init
-
-	mov	%bufp, bufptmp		# rdi = *buf
 	cmp	$SMALL_SIZE, len
 	jb	.Lsmall

 	################################################################
 	## 1) ALIGN:
 	################################################################
-	neg     %bufp
-	and     $7, %bufp		# calculate the unalignment amount of
+	mov	bufp_d, n_misaligned
+	neg	n_misaligned
+	and	$7, n_misaligned	# calculate the misalignment amount of
 					# the address
-	je      .Lproc_block		# Skip if aligned
+	je	.Laligned		# Skip if aligned

+	# Process 1 <= n_misaligned <= 7 bytes individually in order to align
+	# the remaining data to an 8-byte boundary.
 .Ldo_align:
-	#### Calculate CRC of unaligned bytes of the buffer (if any)
-	movq    (bufptmp), tmp		# load a quadward from the buffer
-	add     %bufp, bufptmp		# align buffer pointer for quadword
-					# processing
-	sub	bufp_dw, len		# update buffer length
+	movq	(bufp), %rax
+	add	n_misaligned_q, bufp
+	sub	n_misaligned, len
 .Lalign_loop:
-	crc32b	%bl, crc_init		# compute crc32 of 1-byte
-	shr     $8, tmp			# get next byte
-	dec     %bufp
+	crc32b	%al, crc_init		# compute crc32 of 1-byte
+	shr	$8, %rax		# get next byte
+	dec	n_misaligned
 	jne     .Lalign_loop
-
-.Lproc_block:
+.Laligned:

 	################################################################
-	## 2) PROCESS  BLOCKS:
+	## 2) PROCESS BLOCK:
 	################################################################

-	## compute num of bytes to be processed
-
 	cmp	$128*24, len
 	jae     .Lfull_block

-.Lcontinue_block:
-	## len < 128*24
-	movq    $2731, %rax		# 2731 = ceil(2^16 / 24)
-	mul	len
-	shrq    $16, %rax
+.Lpartial_block:
+	# Compute floor(len / 24) to get num qwords to process from each lane.
+	imul	$2731, len, %eax	# 2731 = ceil(2^16 / 24)
+	shr	$16, %eax
+	jmp	.Lcrc_3lanes

-	## eax contains floor(bytes / 24) = num 24-byte chunks to do
-
-	## process rax 24-byte chunks (128 >= rax >= 0)
-
-	## compute end address of each block
-	## block 0 (base addr + RAX * 8)
-	## block 1 (base addr + RAX * 16)
-	## block 2 (base addr + RAX * 24)
-	lea     (bufptmp, %rax, 8), block_0
-	lea     (block_0, %rax, 8), block_1
-	lea     (block_1, %rax, 8), block_2
-
-	xor     crc1, crc1
-	xor     crc2, crc2
-
-	## branch into array
-	leaq	jump_table(%rip), %bufp
-	mov	(%bufp,%rax,8), %bufp
-	JMP_NOSPEC bufp
-
-	################################################################
-	## 2a) PROCESS FULL BLOCKS:
-	################################################################
 .Lfull_block:
-	movl    $128,%eax
-	lea     128*8*2(block_0), block_1
-	lea     128*8*3(block_0), block_2
-	add     $128*8*1, block_0
+	# Processing 128 qwords from each lane.
+	mov	$128, %eax

-	xor     crc1,crc1
+	################################################################
+	## 3) CRC each of three lanes:
+	################################################################
+
+.Lcrc_3lanes:
+	xor	crc1,crc1
 	xor     crc2,crc2
+	mov	%eax, chunk_bytes
+	shl	$3, chunk_bytes		# num bytes to process from each lane
+	sub	$5, %eax		# 4 for 4x_loop, 1 for special last iter
+	jl	.Lcrc_3lanes_4x_done

-	# Fall through into top of crc array (crc_128)
+	# Unroll the loop by a factor of 4 to reduce the overhead of the loop
+	# bookkeeping instructions, which can compete with crc32q for the ALUs.
+.Lcrc_3lanes_4x_loop:
+	crc32q	(bufp), crc_init_q
+	crc32q	(bufp,chunk_bytes_q), crc1
+	crc32q	(bufp,chunk_bytes_q,2), crc2
+	crc32q	8(bufp), crc_init_q
+	crc32q	8(bufp,chunk_bytes_q), crc1
+	crc32q	8(bufp,chunk_bytes_q,2), crc2
+	crc32q	16(bufp), crc_init_q
+	crc32q	16(bufp,chunk_bytes_q), crc1
+	crc32q	16(bufp,chunk_bytes_q,2), crc2
+	crc32q	24(bufp), crc_init_q
+	crc32q	24(bufp,chunk_bytes_q), crc1
+	crc32q	24(bufp,chunk_bytes_q,2), crc2
+	add	$32, bufp
+	sub	$4, %eax
+	jge	.Lcrc_3lanes_4x_loop

-	################################################################
-	## 3) CRC Array:
-	################################################################
+.Lcrc_3lanes_4x_done:
+	add	$4, %eax
+	jz	.Lcrc_3lanes_last_qword

-	i=128
-.rept 128-1
-.altmacro
-LABEL crc_ %i
-.noaltmacro
-	ENDBR
-	crc32q   -i*8(block_0), crc_init_q
-	crc32q   -i*8(block_1), crc1
-	crc32q   -i*8(block_2), crc2
-	i=(i-1)
-.endr
+.Lcrc_3lanes_1x_loop:
+	crc32q	(bufp), crc_init_q
+	crc32q	(bufp,chunk_bytes_q), crc1
+	crc32q	(bufp,chunk_bytes_q,2), crc2
+	add	$8, bufp
+	dec	%eax
+	jnz	.Lcrc_3lanes_1x_loop

-.altmacro
-LABEL crc_ %i
-.noaltmacro
-	ENDBR
-	crc32q   -i*8(block_0), crc_init_q
-	crc32q   -i*8(block_1), crc1
-# SKIP  crc32  -i*8(block_2), crc2 ; Don't do this one yet
-
-	mov     block_2, block_0
+.Lcrc_3lanes_last_qword:
+	crc32q	(bufp), crc_init_q
+	crc32q	(bufp,chunk_bytes_q), crc1
+# SKIP  crc32q	(bufp,chunk_bytes_q,2), crc2	; Don't do this one yet

 	################################################################
 	## 4) Combine three results:
 	################################################################

-	lea	(K_table-8)(%rip), %bufp		# first entry is for idx 1
-	shlq    $3, %rax			# rax *= 8
-	pmovzxdq (%bufp,%rax), %xmm0		# 2 consts: K1:K2
-	leal	(%eax,%eax,2), %eax		# rax *= 3 (total *24)
-	sub	%eax, len			# len -= rax*24
+	lea	(K_table-8)(%rip), %rax		# first entry is for idx 1
+	pmovzxdq (%rax,chunk_bytes_q), %xmm0	# 2 consts: K1:K2
+	lea	(chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
+	sub	%eax, len			# len -= chunk_bytes * 3

 	movq	crc_init_q, %xmm1		# CRC for block 1
 	pclmulqdq $0x00, %xmm0, %xmm1		# Multiply by K2
@ -206,20 +175,19 @@ LABEL crc_ %i

 	pxor    %xmm2,%xmm1
 	movq    %xmm1, %rax
-	xor     -i*8(block_2), %rax
+	xor	(bufp,chunk_bytes_q,2), %rax
 	mov	crc2, crc_init_q
 	crc32	%rax, crc_init_q
+	lea	8(bufp,chunk_bytes_q,2), bufp

 	################################################################
-	## 5) Check for end:
+	## 5) If more blocks remain, goto (2):
 	################################################################

-LABEL crc_ 0
-	ENDBR
 	cmp	$128*24, len
-	jae     .Lfull_block
+	jae	.Lfull_block
 	cmp	$SMALL_SIZE, len
-	jae     .Lcontinue_block
+	jae	.Lpartial_block

 	#######################################################################
 	## 6) Process any remainder without interleaving:
@ -231,47 +199,30 @@ LABEL crc_ 0
 	shr	$3, %eax
 	jz	.Ldo_dword
 .Ldo_qwords:
-	crc32q	(bufptmp), crc_init_q
-	add	$8, bufptmp
+	crc32q	(bufp), crc_init_q
+	add	$8, bufp
 	dec	%eax
 	jnz	.Ldo_qwords
 .Ldo_dword:
 	test	$4, len
 	jz	.Ldo_word
-	crc32l	(bufptmp), crc_init
-	add	$4, bufptmp
+	crc32l	(bufp), crc_init
+	add	$4, bufp
 .Ldo_word:
 	test	$2, len
 	jz	.Ldo_byte
-	crc32w	(bufptmp), crc_init
-	add	$2, bufptmp
+	crc32w	(bufp), crc_init
+	add	$2, bufp
 .Ldo_byte:
 	test	$1, len
 	jz	.Ldone
-	crc32b	(bufptmp), crc_init
+	crc32b	(bufp), crc_init
 .Ldone:
 	mov	crc_init, %eax
-	popq    %rsi
-	popq    %rdi
-	popq    %rbx
        RET
 SYM_FUNC_END(crc_pcl)

 .section	.rodata, "a", @progbits
-        ################################################################
-        ## jump table        Table is 129 entries x 2 bytes each
-        ################################################################
-.align 4
-jump_table:
-	i=0
-.rept 129
-.altmacro
-JMPTBL_ENTRY %i
-.noaltmacro
-	i=i+1
-.endr
-
-
 	################################################################
 	## PCLMULQDQ tables
 	## Table is 128 entries x 2 words (8 bytes) each