Linux-Crypto Archive mirror

* [PATCH 0/2] crypto: x86/sha256-ni - cleanup and optimization
@ 2024-04-09 12:42 Eric Biggers
  2024-04-09 12:42 ` [PATCH 1/2] crypto: x86/sha256-ni - convert to use rounds macros Eric Biggers
  2024-04-09 12:42 ` [PATCH 2/2] crypto: x86/sha256-ni - optimize code size Eric Biggers
  0 siblings, 2 replies; 7+ messages in thread
From: Eric Biggers @ 2024-04-09 12:42 UTC
  To: linux-crypto; +Cc: Stefan Kanthak, linux-kernel

This patchset reduces the amount of source code duplication in
sha256_ni_asm.S and also reduces the binary code size slightly.

Eric Biggers (2):
  crypto: x86/sha256-ni - convert to use rounds macros
  crypto: x86/sha256-ni - optimize code size

 arch/x86/crypto/sha256_ni_asm.S | 242 +++++++-------------------------
 1 file changed, 47 insertions(+), 195 deletions(-)


base-commit: 4ad27a8be9dbefd4820da0f60da879d512b2f659
-- 
2.44.0



* [PATCH 1/2] crypto: x86/sha256-ni - convert to use rounds macros
  2024-04-09 12:42 [PATCH 0/2] crypto: x86/sha256-ni - cleanup and optimization Eric Biggers
@ 2024-04-09 12:42 ` Eric Biggers
  2024-04-09 16:52   ` Stefan Kanthak
  2024-04-09 12:42 ` [PATCH 2/2] crypto: x86/sha256-ni - optimize code size Eric Biggers
  1 sibling, 1 reply; 7+ messages in thread
From: Eric Biggers @ 2024-04-09 12:42 UTC
  To: linux-crypto; +Cc: Stefan Kanthak, linux-kernel

From: Eric Biggers <ebiggers@google.com>

To avoid source code duplication, do the SHA-256 rounds using macros.
This reduces the length of sha256_ni_asm.S by 148 lines while still
producing the exact same object file.
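
For illustration (not part of the patch), the invocation
"do_4rounds 16, MSGTMP0, MSGTMP1, MSGTMP2, MSGTMP3" expands to the same
instruction sequence as the old hand-written "Rounds 16-19" block:

	movdqa		MSGTMP0, MSG
	paddd		16*4(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	movdqa		MSGTMP0, MSGTMP4
	palignr		$4, MSGTMP3, MSGTMP4
	paddd		MSGTMP4, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
	pshufd		$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

Only the source-level spelling of the constant offset differs (16*4 here
versus 4*16 before); the assembled bytes are the same.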

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/sha256_ni_asm.S | 216 +++++---------------------------
 1 file changed, 34 insertions(+), 182 deletions(-)

diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
index 537b6dcd7ed8..e485520e3b49 100644
--- a/arch/x86/crypto/sha256_ni_asm.S
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -74,23 +74,50 @@
 #define SHUF_MASK	%xmm8
 
 #define ABEF_SAVE	%xmm9
 #define CDGH_SAVE	%xmm10
 
+.macro do_4rounds	i, m0, m1, m2, m3
+.if \i < 16
+	movdqu		\i*4(DATA_PTR), MSG
+	pshufb		SHUF_MASK, MSG
+	movdqa		MSG, \m0
+.else
+	movdqa		\m0, MSG
+.endif
+	paddd		\i*4(SHA256CONSTANTS), MSG
+	sha256rnds2	STATE0, STATE1
+.if \i >= 12 && \i < 60
+	movdqa		\m0, MSGTMP4
+	palignr		$4, \m3, MSGTMP4
+	paddd		MSGTMP4, \m1
+	sha256msg2	\m0, \m1
+.endif
+	pshufd 		$0x0E, MSG, MSG
+	sha256rnds2	STATE1, STATE0
+.if \i >= 4 && \i < 52
+	sha256msg1	\m0, \m3
+.endif
+.endm
+
+.macro do_16rounds	i
+	do_4rounds	(\i + 0),  MSGTMP0, MSGTMP1, MSGTMP2, MSGTMP3
+	do_4rounds	(\i + 4),  MSGTMP1, MSGTMP2, MSGTMP3, MSGTMP0
+	do_4rounds	(\i + 8),  MSGTMP2, MSGTMP3, MSGTMP0, MSGTMP1
+	do_4rounds	(\i + 12), MSGTMP3, MSGTMP0, MSGTMP1, MSGTMP2
+.endm
+
 /*
  * Intel SHA Extensions optimized implementation of a SHA-256 update function
  *
  * The function takes a pointer to the current hash values, a pointer to the
  * input data, and a number of 64 byte blocks to process.  Once all blocks have
  * been processed, the digest pointer is  updated with the resulting hash value.
  * The function only processes complete blocks, there is no functionality to
  * store partial blocks.  All message padding and hash value initialization must
  * be done outside the update function.
  *
- * The indented lines in the loop are instructions related to rounds processing.
- * The non-indented lines are instructions related to the message schedule.
- *
  * void sha256_ni_transform(uint32_t *digest, const void *data,
 		uint32_t numBlocks);
  * digest : pointer to digest
  * data: pointer to input data
  * numBlocks: Number of blocks to process
@@ -123,189 +150,14 @@ SYM_TYPED_FUNC_START(sha256_ni_transform)
 .Lloop0:
 	/* Save hash values for addition after rounds */
 	movdqa		STATE0, ABEF_SAVE
 	movdqa		STATE1, CDGH_SAVE
 
-	/* Rounds 0-3 */
-	movdqu		0*16(DATA_PTR), MSG
-	pshufb		SHUF_MASK, MSG
-	movdqa		MSG, MSGTMP0
-		paddd		0*16(SHA256CONSTANTS), MSG
-		sha256rnds2	STATE0, STATE1
-		pshufd 		$0x0E, MSG, MSG
-		sha256rnds2	STATE1, STATE0
-
-	/* Rounds 4-7 */
-	movdqu		1*16(DATA_PTR), MSG
-	pshufb		SHUF_MASK, MSG
-	movdqa		MSG, MSGTMP1
-		paddd		1*16(SHA256CONSTANTS), MSG
-		sha256rnds2	STATE0, STATE1
-		pshufd 		$0x0E, MSG, MSG
-		sha256rnds2	STATE1, STATE0
-	sha256msg1	MSGTMP1, MSGTMP0
-
-	/* Rounds 8-11 */
-	movdqu		2*16(DATA_PTR), MSG
-	pshufb		SHUF_MASK, MSG
-	movdqa		MSG, MSGTMP2
-		paddd		2*16(SHA256CONSTANTS), MSG
-		sha256rnds2	STATE0, STATE1
-		pshufd 		$0x0E, MSG, MSG
-		sha256rnds2	STATE1, STATE0
-	sha256msg1	MSGTMP2, MSGTMP1
-
-	/* Rounds 12-15 */
-	movdqu		3*16(DATA_PTR), MSG
-	pshufb		SHUF_MASK, MSG
-	movdqa		MSG, MSGTMP3
-		paddd		3*16(SHA256CONSTANTS), MSG
-		sha256rnds2	STATE0, STATE1
-	movdqa		MSGTMP3, MSGTMP4
-	palignr		$4, MSGTMP2, MSGTMP4
-	paddd		MSGTMP4, MSGTMP0
-	sha256msg2	MSGTMP3, MSGTMP0
-		pshufd 		$0x0E, MSG, MSG
-		sha256rnds2	STATE1, STATE0
-	sha256msg1	MSGTMP3, MSGTMP2
-
-	/* Rounds 16-19 */
-	movdqa		MSGTMP0, MSG
-		paddd		4*16(SHA256CONSTANTS), MSG
-		sha256rnds2	STATE0, STATE1
-	movdqa		MSGTMP0, MSGTMP4
-	palignr		$4, MSGTMP3, MSGTMP4
-	paddd		MSGTMP4, MSGTMP1
-	sha256msg2	MSGTMP0, MSGTMP1
-		pshufd 		$0x0E, MSG, MSG
-		sha256rnds2	STATE1, STATE0
-	sha256msg1	MSGTMP0, MSGTMP3
-
-	/* Rounds 20-23 */
-	movdqa		MSGTMP1, MSG
-		paddd		5*16(SHA256CONSTANTS), MSG
-		sha256rnds2	STATE0, STATE1
-	movdqa		MSGTMP1, MSGTMP4
-	palignr		$4, MSGTMP0, MSGTMP4
-	paddd		MSGTMP4, MSGTMP2
-	sha256msg2	MSGTMP1, MSGTMP2
-		pshufd 		$0x0E, MSG, MSG
-		sha256rnds2	STATE1, STATE0
-	sha256msg1	MSGTMP1, MSGTMP0
-
-	/* Rounds 24-27 */
-	movdqa		MSGTMP2, MSG
-		paddd		6*16(SHA256CONSTANTS), MSG
-		sha256rnds2	STATE0, STATE1
-	movdqa		MSGTMP2, MSGTMP4
-	palignr		$4, MSGTMP1, MSGTMP4
-	paddd		MSGTMP4, MSGTMP3
-	sha256msg2	MSGTMP2, MSGTMP3
-		pshufd 		$0x0E, MSG, MSG
-		sha256rnds2	STATE1, STATE0
-	sha256msg1	MSGTMP2, MSGTMP1
-
-	/* Rounds 28-31 */
-	movdqa		MSGTMP3, MSG
-		paddd		7*16(SHA256CONSTANTS), MSG
-		sha256rnds2	STATE0, STATE1
-	movdqa		MSGTMP3, MSGTMP4
-	palignr		$4, MSGTMP2, MSGTMP4
-	paddd		MSGTMP4, MSGTMP0
-	sha256msg2	MSGTMP3, MSGTMP0
-		pshufd 		$0x0E, MSG, MSG
-		sha256rnds2	STATE1, STATE0
-	sha256msg1	MSGTMP3, MSGTMP2
-
-	/* Rounds 32-35 */
-	movdqa		MSGTMP0, MSG
-		paddd		8*16(SHA256CONSTANTS), MSG
-		sha256rnds2	STATE0, STATE1
-	movdqa		MSGTMP0, MSGTMP4
-	palignr		$4, MSGTMP3, MSGTMP4
-	paddd		MSGTMP4, MSGTMP1
-	sha256msg2	MSGTMP0, MSGTMP1
-		pshufd 		$0x0E, MSG, MSG
-		sha256rnds2	STATE1, STATE0
-	sha256msg1	MSGTMP0, MSGTMP3
-
-	/* Rounds 36-39 */
-	movdqa		MSGTMP1, MSG
-		paddd		9*16(SHA256CONSTANTS), MSG
-		sha256rnds2	STATE0, STATE1
-	movdqa		MSGTMP1, MSGTMP4
-	palignr		$4, MSGTMP0, MSGTMP4
-	paddd		MSGTMP4, MSGTMP2
-	sha256msg2	MSGTMP1, MSGTMP2
-		pshufd 		$0x0E, MSG, MSG
-		sha256rnds2	STATE1, STATE0
-	sha256msg1	MSGTMP1, MSGTMP0
-
-	/* Rounds 40-43 */
-	movdqa		MSGTMP2, MSG
-		paddd		10*16(SHA256CONSTANTS), MSG
-		sha256rnds2	STATE0, STATE1
-	movdqa		MSGTMP2, MSGTMP4
-	palignr		$4, MSGTMP1, MSGTMP4
-	paddd		MSGTMP4, MSGTMP3
-	sha256msg2	MSGTMP2, MSGTMP3
-		pshufd 		$0x0E, MSG, MSG
-		sha256rnds2	STATE1, STATE0
-	sha256msg1	MSGTMP2, MSGTMP1
-
-	/* Rounds 44-47 */
-	movdqa		MSGTMP3, MSG
-		paddd		11*16(SHA256CONSTANTS), MSG
-		sha256rnds2	STATE0, STATE1
-	movdqa		MSGTMP3, MSGTMP4
-	palignr		$4, MSGTMP2, MSGTMP4
-	paddd		MSGTMP4, MSGTMP0
-	sha256msg2	MSGTMP3, MSGTMP0
-		pshufd 		$0x0E, MSG, MSG
-		sha256rnds2	STATE1, STATE0
-	sha256msg1	MSGTMP3, MSGTMP2
-
-	/* Rounds 48-51 */
-	movdqa		MSGTMP0, MSG
-		paddd		12*16(SHA256CONSTANTS), MSG
-		sha256rnds2	STATE0, STATE1
-	movdqa		MSGTMP0, MSGTMP4
-	palignr		$4, MSGTMP3, MSGTMP4
-	paddd		MSGTMP4, MSGTMP1
-	sha256msg2	MSGTMP0, MSGTMP1
-		pshufd 		$0x0E, MSG, MSG
-		sha256rnds2	STATE1, STATE0
-	sha256msg1	MSGTMP0, MSGTMP3
-
-	/* Rounds 52-55 */
-	movdqa		MSGTMP1, MSG
-		paddd		13*16(SHA256CONSTANTS), MSG
-		sha256rnds2	STATE0, STATE1
-	movdqa		MSGTMP1, MSGTMP4
-	palignr		$4, MSGTMP0, MSGTMP4
-	paddd		MSGTMP4, MSGTMP2
-	sha256msg2	MSGTMP1, MSGTMP2
-		pshufd 		$0x0E, MSG, MSG
-		sha256rnds2	STATE1, STATE0
-
-	/* Rounds 56-59 */
-	movdqa		MSGTMP2, MSG
-		paddd		14*16(SHA256CONSTANTS), MSG
-		sha256rnds2	STATE0, STATE1
-	movdqa		MSGTMP2, MSGTMP4
-	palignr		$4, MSGTMP1, MSGTMP4
-	paddd		MSGTMP4, MSGTMP3
-	sha256msg2	MSGTMP2, MSGTMP3
-		pshufd 		$0x0E, MSG, MSG
-		sha256rnds2	STATE1, STATE0
-
-	/* Rounds 60-63 */
-	movdqa		MSGTMP3, MSG
-		paddd		15*16(SHA256CONSTANTS), MSG
-		sha256rnds2	STATE0, STATE1
-		pshufd 		$0x0E, MSG, MSG
-		sha256rnds2	STATE1, STATE0
+	do_16rounds	0
+	do_16rounds	16
+	do_16rounds	32
+	do_16rounds	48
 
 	/* Add current hash values with previously saved */
 	paddd		ABEF_SAVE, STATE0
 	paddd		CDGH_SAVE, STATE1
 
-- 
2.44.0



* [PATCH 2/2] crypto: x86/sha256-ni - optimize code size
  2024-04-09 12:42 [PATCH 0/2] crypto: x86/sha256-ni - cleanup and optimization Eric Biggers
  2024-04-09 12:42 ` [PATCH 1/2] crypto: x86/sha256-ni - convert to use rounds macros Eric Biggers
@ 2024-04-09 12:42 ` Eric Biggers
  1 sibling, 0 replies; 7+ messages in thread
From: Eric Biggers @ 2024-04-09 12:42 UTC
  To: linux-crypto; +Cc: Stefan Kanthak, linux-kernel

From: Eric Biggers <ebiggers@google.com>

- Load the SHA-256 round constants relative to a pointer that points
  into the middle of the constants rather than to the beginning.  Since
  x86 instructions use signed offsets, this decreases the instruction
  length required to access some of the later round constants.

- Use punpcklqdq or punpckhqdq instead of longer instructions such as
  pshufd, pblendw, and palignr.  This doesn't harm performance.

The end result is that sha256_ni_transform shrinks from 839 bytes to 791
bytes, with no loss in performance.
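
For illustration (not part of the patch), the encoding sizes behind these
two changes, assuming the legacy SSE encodings and no REX prefix:

	# A ModRM displacement is either 1 signed byte (-128..127) or 4 bytes.
	# With SHA256CONSTANTS = K256, the round-constant offsets run 0..240,
	# so the later loads need 4-byte displacements:
	paddd		60*4(SHA256CONSTANTS), MSG	# disp32: 8-byte instruction
	# With SHA256CONSTANTS = K256+32*4, the offsets run -128..112 and
	# always fit in a 1-byte displacement:
	paddd		(60-32)*4(SHA256CONSTANTS), MSG	# disp8: 5-byte instruction

	# punpck[lh]qdq need neither an immediate byte nor the 0F 3A escape:
	pshufd		$0x0E, MSG, MSG			# 5 bytes
	punpckhqdq	MSG, MSG			# 4 bytes
	palignr		$8, STATE1, STATE0		# 6 bytes
	pblendw		$0xF0, MSGTMP4, STATE1		# 6 bytes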

Suggested-by: Stefan Kanthak <stefan.kanthak@nexgo.de>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/sha256_ni_asm.S | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
index e485520e3b49..4d373069448d 100644
--- a/arch/x86/crypto/sha256_ni_asm.S
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -82,19 +82,19 @@
 	pshufb		SHUF_MASK, MSG
 	movdqa		MSG, \m0
 .else
 	movdqa		\m0, MSG
 .endif
-	paddd		\i*4(SHA256CONSTANTS), MSG
+	paddd		(\i-32)*4(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
 .if \i >= 12 && \i < 60
 	movdqa		\m0, MSGTMP4
 	palignr		$4, \m3, MSGTMP4
 	paddd		MSGTMP4, \m1
 	sha256msg2	\m0, \m1
 .endif
-	pshufd 		$0x0E, MSG, MSG
+	punpckhqdq	MSG, MSG
 	sha256rnds2	STATE1, STATE0
 .if \i >= 4 && \i < 52
 	sha256msg1	\m0, \m3
 .endif
 .endm
@@ -133,21 +133,21 @@ SYM_TYPED_FUNC_START(sha256_ni_transform)
 	/*
 	 * load initial hash values
 	 * Need to reorder these appropriately
 	 * DCBA, HGFE -> ABEF, CDGH
 	 */
-	movdqu		0*16(DIGEST_PTR), STATE0
-	movdqu		1*16(DIGEST_PTR), STATE1
+	movdqu		0*16(DIGEST_PTR), STATE0	/* DCBA */
+	movdqu		1*16(DIGEST_PTR), STATE1	/* HGFE */
 
-	pshufd		$0xB1, STATE0,  STATE0		/* CDAB */
-	pshufd		$0x1B, STATE1,  STATE1		/* EFGH */
 	movdqa		STATE0, MSGTMP4
-	palignr		$8, STATE1,  STATE0		/* ABEF */
-	pblendw		$0xF0, MSGTMP4, STATE1		/* CDGH */
+	punpcklqdq	STATE1, STATE0			/* FEBA */
+	punpckhqdq	MSGTMP4, STATE1			/* DCHG */
+	pshufd		$0x1B, STATE0, STATE0		/* ABEF */
+	pshufd		$0xB1, STATE1, STATE1		/* CDGH */
 
 	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
-	lea		K256(%rip), SHA256CONSTANTS
+	lea		K256+32*4(%rip), SHA256CONSTANTS
 
 .Lloop0:
 	/* Save hash values for addition after rounds */
 	movdqa		STATE0, ABEF_SAVE
 	movdqa		STATE1, CDGH_SAVE
@@ -165,18 +165,18 @@ SYM_TYPED_FUNC_START(sha256_ni_transform)
 	add		$64, DATA_PTR
 	cmp		NUM_BLKS, DATA_PTR
 	jne		.Lloop0
 
 	/* Write hash values back in the correct order */
-	pshufd		$0x1B, STATE0,  STATE0		/* FEBA */
-	pshufd		$0xB1, STATE1,  STATE1		/* DCHG */
 	movdqa		STATE0, MSGTMP4
-	pblendw		$0xF0, STATE1,  STATE0		/* DCBA */
-	palignr		$8, MSGTMP4, STATE1		/* HGFE */
+	punpcklqdq	STATE1, STATE0			/* GHEF */
+	punpckhqdq	MSGTMP4, STATE1			/* ABCD */
+	pshufd		$0xB1, STATE0, STATE0		/* HGFE */
+	pshufd		$0x1B, STATE1, STATE1		/* DCBA */
 
-	movdqu		STATE0, 0*16(DIGEST_PTR)
-	movdqu		STATE1, 1*16(DIGEST_PTR)
+	movdqu		STATE1, 0*16(DIGEST_PTR)
+	movdqu		STATE0, 1*16(DIGEST_PTR)
 
 .Ldone_hash:
 
 	RET
 SYM_FUNC_END(sha256_ni_transform)
-- 
2.44.0



* Re: [PATCH 1/2] crypto: x86/sha256-ni - convert to use rounds macros
  2024-04-09 12:42 ` [PATCH 1/2] crypto: x86/sha256-ni - convert to use rounds macros Eric Biggers
@ 2024-04-09 16:52   ` Stefan Kanthak
  2024-04-09 23:36     ` Eric Biggers
  0 siblings, 1 reply; 7+ messages in thread
From: Stefan Kanthak @ 2024-04-09 16:52 UTC
  To: Eric Biggers, linux-crypto; +Cc: linux-kernel, ardb

"Eric Biggers" <ebiggers@kernel.org> wrote:

> +.macro do_4rounds i, m0, m1, m2, m3
> +.if \i < 16
> +        movdqu  \i*4(DATA_PTR), MSG
> +        pshufb  SHUF_MASK, MSG
> +        movdqa  MSG, \m0
> +.else
> +        movdqa  \m0, MSG
> +.endif
> +        paddd   \i*4(SHA256CONSTANTS), MSG

To load the round constant independently of, and in parallel with, the
previous instructions that use \m0, I recommend changing the first lines
of the do_4rounds macro as follows (this might save a cycle or more per
macro invocation, and most obviously two lines):

.macro do_4rounds i, m0, m1, m2, m3
.if \i < 16
        movdqu  \i*4(DATA_PTR), \m0
        pshufb  SHUF_MASK, \m0
.endif
        movdqa  \i*4(SHA256CONSTANTS), MSG
        paddd   \m0, MSG
...

regards
Stefan


* Re: [PATCH 1/2] crypto: x86/sha256-ni - convert to use rounds macros
  2024-04-09 16:52   ` Stefan Kanthak
@ 2024-04-09 23:36     ` Eric Biggers
  2024-04-11  7:42       ` Stefan Kanthak
  0 siblings, 1 reply; 7+ messages in thread
From: Eric Biggers @ 2024-04-09 23:36 UTC
  To: Stefan Kanthak; +Cc: linux-crypto, linux-kernel, ardb

On Tue, Apr 09, 2024 at 06:52:02PM +0200, Stefan Kanthak wrote:
> "Eric Biggers" <ebiggers@kernel.org> wrote:
> 
> > +.macro do_4rounds i, m0, m1, m2, m3
> > +.if \i < 16
> > +        movdqu  \i*4(DATA_PTR), MSG
> > +        pshufb  SHUF_MASK, MSG
> > +        movdqa  MSG, \m0
> > +.else
> > +        movdqa  \m0, MSG
> > +.endif
> > +        paddd   \i*4(SHA256CONSTANTS), MSG
> 
> To load the round constant independently of, and in parallel with, the
> previous instructions that use \m0, I recommend changing the first lines
> of the do_4rounds macro as follows (this might save a cycle or more per
> macro invocation, and most obviously two lines):
> 
> .macro do_4rounds i, m0, m1, m2, m3
> .if \i < 16
>         movdqu  \i*4(DATA_PTR), \m0
>         pshufb  SHUF_MASK, \m0
> .endif
>         movdqa  \i*4(SHA256CONSTANTS), MSG
>         paddd   \m0, MSG
> ...

Yes, your suggestion looks good.  I don't see any performance difference on
Ice Lake, but it does shorten the source code.  It belongs in a separate patch
though, since this patch isn't meant to change the output.

- Eric


* Re: [PATCH 1/2] crypto: x86/sha256-ni - convert to use rounds macros
  2024-04-09 23:36     ` Eric Biggers
@ 2024-04-11  7:42       ` Stefan Kanthak
  2024-04-11 16:16         ` Eric Biggers
  0 siblings, 1 reply; 7+ messages in thread
From: Stefan Kanthak @ 2024-04-11  7:42 UTC
  To: Eric Biggers; +Cc: linux-crypto, linux-kernel, ardb

"Eric Biggers" <ebiggers@kernel.org> wrote:

> On Tue, Apr 09, 2024 at 06:52:02PM +0200, Stefan Kanthak wrote:
>> "Eric Biggers" <ebiggers@kernel.org> wrote:
>> 
>> > +.macro do_4rounds i, m0, m1, m2, m3
>> > +.if \i < 16
>> > +        movdqu  \i*4(DATA_PTR), MSG
>> > +        pshufb  SHUF_MASK, MSG
>> > +        movdqa  MSG, \m0
>> > +.else
>> > +        movdqa  \m0, MSG
>> > +.endif
>> > +        paddd   \i*4(SHA256CONSTANTS), MSG
>> 
>> To load the round constant independently of, and in parallel with, the
>> previous instructions that use \m0, I recommend changing the first lines
>> of the do_4rounds macro as follows (this might save a cycle or more per
>> macro invocation, and most obviously two lines):
>> 
>> .macro do_4rounds i, m0, m1, m2, m3
>> .if \i < 16
>>         movdqu  \i*4(DATA_PTR), \m0
>>         pshufb  SHUF_MASK, \m0
>> .endif
>>         movdqa  \i*4(SHA256CONSTANTS), MSG
>>         paddd   \m0, MSG
>> ...
> 
> Yes, your suggestion looks good.  I don't see any performance difference on
> Ice Lake, but it does shorten the source code.  It belongs in a separate patch
> though, since this patch isn't meant to change the output.

Hmmm... the output was already changed: 2 palignr/pblendw and 16 pshufd
have been replaced with punpck?qdq, and 17 displacements changed.

Next simplification, and 5 more lines gone: replace the macro do_16rounds
with a repetition

@@ ...
-.macro do_16rounds i
-        do_4rounds (\i + 0),  MSGTMP0, MSGTMP1, MSGTMP2, MSGTMP3
-        do_4rounds (\i + 4),  MSGTMP1, MSGTMP2, MSGTMP3, MSGTMP0
-        do_4rounds (\i + 8),  MSGTMP2, MSGTMP3, MSGTMP0, MSGTMP1
-        do_4rounds (\i + 12), MSGTMP3, MSGTMP0, MSGTMP1, MSGTMP2
-.endm
-
@@ ...
-        do_16rounds 0
-        do_16rounds 16
-        do_16rounds 32
-        do_16rounds 48
+.irp i, 0, 16, 32, 48
+        do_4rounds (\i + 0),  MSGTMP0, MSGTMP1, MSGTMP2, MSGTMP3
+        do_4rounds (\i + 4),  MSGTMP1, MSGTMP2, MSGTMP3, MSGTMP0
+        do_4rounds (\i + 8),  MSGTMP2, MSGTMP3, MSGTMP0, MSGTMP1
+        do_4rounds (\i + 12), MSGTMP3, MSGTMP0, MSGTMP1, MSGTMP2
+.endr

This doesn't change the instructions generated, so it belongs in this patch.


The following suggestion changes instructions: AFAIK all processors which
support the SHA extensions support AVX too

@@ ...
+.ifnotdef AVX
         movdqa          STATE0, MSGTMP4
         punpcklqdq      STATE1, STATE0                  /* FEBA */
         punpckhqdq      MSGTMP4, STATE1                 /* DCHG */
         pshufd          $0x1B, STATE0,  STATE0          /* ABEF */
         pshufd          $0xB1, STATE1,  STATE1          /* CDGH */
+.else
+        vpunpckhqdq     STATE0, STATE1, MSGTMP0         /* DCHG */
+        vpunpcklqdq     STATE0, STATE1, MSGTMP1         /* BAFE */
+        pshufd          $0xB1, MSGTMP0, STATE0          /* CDGH */
+        pshufd          $0xB1, MSGTMP1, STATE1          /* ABEF */
+.endif
@@ ...
+.ifnotdef AVX
         movdqa  \i*4(SHA256CONSTANTS), MSG
         paddd   \m0, MSG
+.else
+        vpaddd  \i*4(SHA256CONSTANTS), \m0, MSG
+.endif
@@ ...
+.ifnotdef AVX
         movdqa  \m0, MSGTMP4
         palignr $4, \m3, MSGTMP4
+.else
+        vpalignr $4, \m3, \m0, MSGTMP4
+.endif
@@ ...
+.ifnotdef AVX
         movdqa          STATE1, MSGTMP4
         punpcklqdq      STATE0, STATE1                  /* EFGH */
         punpckhqdq      MSGTMP4, STATE0                 /* CDAB */
         pshufd          $0x1B, STATE0,  STATE0          /* DCBA */
         pshufd          $0xB1, STATE1,  STATE1          /* HGFE */
+.else
+        vpunpckhqdq     STATE0, STATE1, MSGTMP0         /* ABCD */
+        vpunpcklqdq     STATE0, STATE1, MSGTMP1         /* EFGH */
+        pshufd          $0x1B, MSGTMP0, STATE0          /* DCBA */
+        pshufd          $0x1B, MSGTMP1, STATE1          /* HGFE */
+.endif


And lastly: are the "#define ... %xmm?" lines really necessary?

- MSG can't be anything but %xmm0;
- MSGTMP4 is, despite its MSG prefix, also used to shuffle STATE0 and STATE1,
  so it should be named TMP instead (if kept);
- MSGTMP0 to MSGTMP3 hold the circular message schedule, so they should be
  named MSG0 to MSG3 instead (if kept).

I suggest removing at least those that are now encapsulated in the macro
and the repetition.


regards
Stefan


* Re: [PATCH 1/2] crypto: x86/sha256-ni - convert to use rounds macros
  2024-04-11  7:42       ` Stefan Kanthak
@ 2024-04-11 16:16         ` Eric Biggers
  0 siblings, 0 replies; 7+ messages in thread
From: Eric Biggers @ 2024-04-11 16:16 UTC
  To: Stefan Kanthak; +Cc: linux-crypto, linux-kernel, ardb

Hi Stefan,

On Thu, Apr 11, 2024 at 09:42:00AM +0200, Stefan Kanthak wrote:
> "Eric Biggers" <ebiggers@kernel.org> wrote:
> 
> > On Tue, Apr 09, 2024 at 06:52:02PM +0200, Stefan Kanthak wrote:
> >> "Eric Biggers" <ebiggers@kernel.org> wrote:
> >> 
> >> > +.macro do_4rounds i, m0, m1, m2, m3
> >> > +.if \i < 16
> >> > +        movdqu  \i*4(DATA_PTR), MSG
> >> > +        pshufb  SHUF_MASK, MSG
> >> > +        movdqa  MSG, \m0
> >> > +.else
> >> > +        movdqa  \m0, MSG
> >> > +.endif
> >> > +        paddd   \i*4(SHA256CONSTANTS), MSG
> >> 
> >> To load the round constant independently of, and in parallel with, the
> >> previous instructions that use \m0, I recommend changing the first lines
> >> of the do_4rounds macro as follows (this might save a cycle or more per
> >> macro invocation, and most obviously two lines):
> >> 
> >> .macro do_4rounds i, m0, m1, m2, m3
> >> .if \i < 16
> >>         movdqu  \i*4(DATA_PTR), \m0
> >>         pshufb  SHUF_MASK, \m0
> >> .endif
> >>         movdqa  \i*4(SHA256CONSTANTS), MSG
> >>         paddd   \m0, MSG
> >> ...
> > 
> > Yes, your suggestion looks good.  I don't see any performance difference on
> > Ice Lake, but it does shorten the source code.  It belongs in a separate patch
> > though, since this patch isn't meant to change the output.
> 
> Hmmm... the output was already changed: 2 palignr/pblendw and 16 pshufd
> have been replaced with punpck?qdq, and 17 displacements changed.

Yes, the second patch does that.  Your comment is on the first patch, so I
thought you were suggesting changing the first patch.  I'll handle your
suggestion in another patch.  I'm just trying to keep changes to the actual
output separate from source-code-only cleanups.

> Next simplification, and 5 more lines gone: replace the macro do_16rounds
> with a repetition
> 
> @@ ...
> -.macro do_16rounds i
> -        do_4rounds (\i + 0),  MSGTMP0, MSGTMP1, MSGTMP2, MSGTMP3
> -        do_4rounds (\i + 4),  MSGTMP1, MSGTMP2, MSGTMP3, MSGTMP0
> -        do_4rounds (\i + 8),  MSGTMP2, MSGTMP3, MSGTMP0, MSGTMP1
> -        do_4rounds (\i + 12), MSGTMP3, MSGTMP0, MSGTMP1, MSGTMP2
> -.endm
> -
> @@ ...
> -        do_16rounds 0
> -        do_16rounds 16
> -        do_16rounds 32
> -        do_16rounds 48
> +.irp i, 0, 16, 32, 48
> +        do_4rounds (\i + 0),  MSGTMP0, MSGTMP1, MSGTMP2, MSGTMP3
> +        do_4rounds (\i + 4),  MSGTMP1, MSGTMP2, MSGTMP3, MSGTMP0
> +        do_4rounds (\i + 8),  MSGTMP2, MSGTMP3, MSGTMP0, MSGTMP1
> +        do_4rounds (\i + 12), MSGTMP3, MSGTMP0, MSGTMP1, MSGTMP2
> +.endr
> 
> This doesn't change the instructions generated, so it belongs in this patch.

Yes, that makes sense.

> The following suggestion changes instructions: AFAIK all processors which
> support the SHA extensions support AVX too

No (unfortunately).  Several generations of Intel's low-power CPUs support SHA
but not AVX, namely Goldmont, Goldmont Plus, and Tremont.

We could provide two SHA-256 implementations, one with AVX and one without.  I
think it's not worthwhile, though.
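
Purely for illustration (and not something this patchset does), glue code
dispatching between the two could look roughly like this; the
sha256_ni_avx_transform and sha256_xform names are made up:

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/linkage.h>
#include <linux/types.h>
#include <asm/cpufeature.h>

asmlinkage void sha256_ni_transform(u32 *digest, const void *data, u32 nblocks);
asmlinkage void sha256_ni_avx_transform(u32 *digest, const void *data, u32 nblocks); /* hypothetical */

/* Set once at init, then used for every batch of 64-byte blocks. */
static void (*sha256_xform)(u32 *digest, const void *data, u32 nblocks);

static int __init sha256_ni_pick_impl(void)
{
	if (!boot_cpu_has(X86_FEATURE_SHA_NI))
		return -ENODEV;
	/* Goldmont, Goldmont Plus, and Tremont report SHA-NI but not AVX. */
	sha256_xform = boot_cpu_has(X86_FEATURE_AVX) ? sha256_ni_avx_transform
						     : sha256_ni_transform;
	return 0;
}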

We ran into this issue with AES-NI too; I was hoping that we could just provide
the new AES-NI + AVX implementation of AES-XTS, and remove the older
implementation that uses AES-NI alone.  However, apparently the above-mentioned
low-power CPUs do need the implementation that uses AES-NI alone.

> And lastly: are the "#define ... %xmm?" lines really necessary?
> 
> - MSG can't be anything but %xmm0;
> - MSGTMP4 is, despite its MSG prefix, also used to shuffle STATE0 and STATE1,
>   so it should be named TMP instead (if kept);
> - MSGTMP0 to MSGTMP3 hold the circular message schedule, so they should be
>   named MSG0 to MSG3 instead (if kept).
> 
> I suggest removing at least those that are now encapsulated in the macro
> and the repetition.

Yes, I noticed the weird naming too.  I'll rename MSGTMP[0-3] to MSG[0-3], and
MSGTMP4 to TMP.

You're correct that MSG has to be xmm0 because of how sha256rnds2 uses xmm0 as
an implicit operand.  I think I'll leave the '#define' for MSG anyway because
the list of aliases helps make it clear what each register is used for.
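
For reference, the implicit-operand form that forces this, per the Intel
SDM, is:

	# SHA256RNDS2 xmm1, xmm2/m128, <XMM0>	(Intel syntax)
	# XMM0 is an implicit third source: the next two rounds' message
	# dwords already summed with their round constants.  That is why the
	# macro computes MSG (%xmm0) with paddd right before each
	sha256rnds2	STATE0, STATE1		# implicitly reads MSG in %xmm0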

Thanks,

- Eric
