1//go:build !noasm && gc && amd64 && !arm64
  2
  3#include "textflag.h"
  4
// License information for the original SHA1 amd64 implementation:
  6// Copyright 2024 The Go Authors. All rights reserved.
  7// Use of this source code is governed by a BSD-style
  8// license that can be found at:
  9// 	- https://github.com/golang/go/blob/master/LICENSE
 10//
 11// Reference implementations:
 12// 	- https://github.com/golang/go/blob/master/src/crypto/sha1/sha1block_amd64.s
 13
 14// Reverse the dword order in abcd via PSHUFD then store the 16 bytes in one
 15// move, instead of issuing four VPEXTRD's that each go through the store port.
 16#define LOADCS(abcd, e, index, target) \
 17	VPSHUFD $0x1B, abcd, X8; \
 18	VMOVDQU X8, ((index*20)+0)(target); \
 19	MOVL e, ((index*20)+16)(target);
 20
 21#define LOADM1(m1, index, target) \
 22	VPSHUFD $0x1B, m1, X8; \
 23	VMOVDQU X8, ((index*16)+0)(target);
 24	
 25// func blockAMD64(h []uint32, p []byte, m1 []uint32, cs [][5]uint32)
 26// Requires: AVX, SHA, SSE2, SSE4.1, SSSE3
 27TEXT ·blockAMD64(SB), NOSPLIT, $80-96
 28	MOVQ h_base+0(FP), DI
 29	MOVQ p_base+24(FP), SI
 30	MOVQ p_len+32(FP), DX
 31	MOVQ m1_base+48(FP), R13
 32	MOVQ cs_base+72(FP), R15
 33	CMPQ DX, $0x00
 34	JEQ  done
 35	ADDQ SI, DX
 36
 37	// Allocate space on the stack for saving ABCD and E0, and align it to 16 bytes
 38	LEAQ 15(SP), AX
 39	MOVQ $0x000000000000000f, CX
 40	NOTQ CX
 41	ANDQ CX, AX
 42
 43	// Load initial hash state
 44	PINSRD  $0x03, 16(DI), X5
 45	VMOVDQU (DI), X0
 46	PAND    upper_mask<>+0(SB), X5
 47	PSHUFD  $0x1b, X0, X0
 48	VMOVDQA shuffle_mask<>+0(SB), X7
 49
 50loop:
 51	// Save ABCD and E working values
 52	VMOVDQA X5, (AX)
 53	VMOVDQA X0, 16(AX)
 54
 55	// LOAD CS 0
 56	VPEXTRD $3, X5, R12
 57	LOADCS(X0, R12, 0, R15)
 58
 59	// Rounds 0-3
 60	VMOVDQU   (SI), X1
 61	PSHUFB    X7, X1
 62	PADDD     X1, X5
 63	VMOVDQA   X0, X6
 64	SHA1RNDS4 $0x00, X5, X0
 65	LOADM1(X1, 0, R13)
 66
 67	// Rounds 4-7
 68	VMOVDQU   16(SI), X2
 69	PSHUFB    X7, X2
 70	SHA1NEXTE X2, X6
 71	VMOVDQA   X0, X5
 72	SHA1RNDS4 $0x00, X6, X0
 73	SHA1MSG1  X2, X1
 74	LOADM1(X2, 1, R13)
 75
 76	// Rounds 8-11
 77	VMOVDQU   32(SI), X3
 78	PSHUFB    X7, X3
 79	SHA1NEXTE X3, X5
 80	VMOVDQA   X0, X6
 81	SHA1RNDS4 $0x00, X5, X0
 82	SHA1MSG1  X3, X2
 83	PXOR      X3, X1
 84	LOADM1(X3, 2, R13)
 85
 86	// Rounds 12-15
 87	VMOVDQU   48(SI), X4
 88	PSHUFB    X7, X4
 89	SHA1NEXTE X4, X6
 90	VMOVDQA   X0, X5
 91	SHA1MSG2  X4, X1
 92	SHA1RNDS4 $0x00, X6, X0
 93	SHA1MSG1  X4, X3
 94	PXOR      X4, X2
 95	LOADM1(X4, 3, R13)
 96
 97	// Rounds 16-19
 98	SHA1NEXTE X1, X5
 99	VMOVDQA   X0, X6
100	SHA1MSG2  X1, X2
101	SHA1RNDS4 $0x00, X5, X0
102	SHA1MSG1  X1, X4
103	PXOR      X1, X3
104	LOADM1(X1, 4, R13)
105
106	// Rounds 20-23
107	SHA1NEXTE X2, X6
108	VMOVDQA   X0, X5
109	SHA1MSG2  X2, X3
110	SHA1RNDS4 $0x01, X6, X0
111	SHA1MSG1  X2, X1
112	PXOR      X2, X4
113	LOADM1(X2, 5, R13)
114
115	// Rounds 24-27
116	SHA1NEXTE X3, X5
117	VMOVDQA   X0, X6
118	SHA1MSG2  X3, X4
119	SHA1RNDS4 $0x01, X5, X0
120	SHA1MSG1  X3, X2
121	PXOR      X3, X1
122	LOADM1(X3, 6, R13)
123
124	// Rounds 28-31
125	SHA1NEXTE X4, X6
126	VMOVDQA   X0, X5
127	SHA1MSG2  X4, X1
128	SHA1RNDS4 $0x01, X6, X0
129	SHA1MSG1  X4, X3
130	PXOR      X4, X2
131	LOADM1(X4, 7, R13)
132
133	// Rounds 32-35
134	SHA1NEXTE X1, X5
135	VMOVDQA   X0, X6
136	SHA1MSG2  X1, X2
137	SHA1RNDS4 $0x01, X5, X0
138	SHA1MSG1  X1, X4
139	PXOR      X1, X3
140	LOADM1(X1, 8, R13)
141
142	// Rounds 36-39
143	SHA1NEXTE X2, X6
144	VMOVDQA   X0, X5
145	SHA1MSG2  X2, X3
146	SHA1RNDS4 $0x01, X6, X0
147	SHA1MSG1  X2, X1
148	PXOR      X2, X4
149	LOADM1(X2, 9, R13)
150
151	// Rounds 40-43
152	SHA1NEXTE X3, X5
153	VMOVDQA   X0, X6
154	SHA1MSG2  X3, X4
155	SHA1RNDS4 $0x02, X5, X0
156	SHA1MSG1  X3, X2
157	PXOR      X3, X1
158	LOADM1(X3, 10, R13)
159
160	// Rounds 44-47
161	SHA1NEXTE X4, X6
162	VMOVDQA   X0, X5
163	SHA1MSG2  X4, X1
164	SHA1RNDS4 $0x02, X6, X0
165	SHA1MSG1  X4, X3
166	PXOR      X4, X2
167	LOADM1(X4, 11, R13)
168
169	// Rounds 48-51
170	SHA1NEXTE X1, X5
171	VMOVDQA   X0, X6
172	SHA1MSG2  X1, X2
173	SHA1RNDS4 $0x02, X5, X0
174	VPEXTRD $0, X5, R12
175	SHA1MSG1  X1, X4
176	PXOR      X1, X3
177	LOADM1(X1, 12, R13)
178
179	// derive pre-round 56's E out of round 51's A.
180	VPEXTRD $3, X0, R12
181	ROLL $30, R12
182
183	// Rounds 52-55
184	SHA1NEXTE X2, X6
185	VMOVDQA   X0, X5
186	SHA1MSG2  X2, X3
187	SHA1RNDS4 $0x02, X6, X0
188	SHA1MSG1  X2, X1
189	PXOR      X2, X4
190	LOADM1(X2, 13, R13)
191
192	// LOAD CS 58 (gathers 56 which will be rectified in Go)
193	LOADCS(X0, R12, 1, R15)
194
195	// Rounds 56-59
196	SHA1NEXTE X3, X5
197	VMOVDQA   X0, X6
198	SHA1MSG2  X3, X4
199	SHA1RNDS4 $0x02, X5, X0
200	VPEXTRD $0, X5, R12
201	SHA1MSG1  X3, X2
202	PXOR      X3, X1
203	LOADM1(X3, 14, R13)
204
205	// derive pre-round 64's E out of round 59's A.
206	VPEXTRD $3, X0, R12
207	ROLL $30, R12
208
209	// Rounds 60-63
210	SHA1NEXTE X4, X6
211	VMOVDQA   X0, X5
212	SHA1MSG2  X4, X1
213	SHA1RNDS4 $0x03, X6, X0
214	SHA1MSG1  X4, X3
215	PXOR      X4, X2
216	LOADM1(X4, 15, R13)
217
218	// LOAD CS 65 (gathers 64 which will be rectified in Go)
219	LOADCS(X0, R12, 2, R15)
220
221	// Rounds 64-67
222	SHA1NEXTE X1, X5
223	VMOVDQA   X0, X6
224	SHA1MSG2  X1, X2
225	SHA1RNDS4 $0x03, X5, X0
226	SHA1MSG1  X1, X4
227	PXOR      X1, X3
228	LOADM1(X1, 16, R13)
229
230	// Rounds 68-71
231	SHA1NEXTE X2, X6
232	VMOVDQA   X0, X5
233	SHA1MSG2  X2, X3
234	SHA1RNDS4 $0x03, X6, X0
235	PXOR      X2, X4
236	LOADM1(X2, 17, R13)
237
238	// Rounds 72-75
239	SHA1NEXTE X3, X5
240	VMOVDQA   X0, X6
241	SHA1MSG2  X3, X4
242	SHA1RNDS4 $0x03, X5, X0
243	LOADM1(X3, 18, R13)
244
245	// Rounds 76-79
246	SHA1NEXTE X4, X6
247	VMOVDQA   X0, X5
248	SHA1RNDS4 $0x03, X6, X0
249	LOADM1(X4, 19, R13)
250
251	// Add saved E and ABCD
252	SHA1NEXTE (AX), X5
253	PADDD     16(AX), X0
254
255	// Check if we are done, if not return to the loop
256	ADDQ $0x40, SI
257	CMPQ SI, DX
258	JNE  loop
259
260	// Write the hash state back to digest
261	PSHUFD  $0x1b, X0, X0
262	VMOVDQU X0, (DI)
263	PEXTRD  $0x03, X5, 16(DI)
264
265done:
266	RET
267
// upper_mask keeps only the highest 32-bit lane (bits 127:96) of an XMM
// register; blockAMD64 ANDs it into the register that holds E in that lane.
DATA upper_mask<>+0(SB)/4, $0x00000000
DATA upper_mask<>+4(SB)/4, $0x00000000
DATA upper_mask<>+8(SB)/4, $0x00000000
DATA upper_mask<>+12(SB)/4, $0xffffffff
GLOBL upper_mask<>(SB), RODATA, $16
271
// shuffle_mask is a PSHUFB control whose byte i selects source byte 15-i,
// i.e. it reverses all 16 bytes of the register it is applied to.
DATA shuffle_mask<>+0(SB)/4, $0x0c0d0e0f
DATA shuffle_mask<>+4(SB)/4, $0x08090a0b
DATA shuffle_mask<>+8(SB)/4, $0x04050607
DATA shuffle_mask<>+12(SB)/4, $0x00010203
GLOBL shuffle_mask<>(SB), RODATA, $16