1//go:build !noasm && gc && amd64 && !arm64
2
3#include "textflag.h"
4
// License information for the original SHA1 amd64 implementation:
6// Copyright 2024 The Go Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style
8// license that can be found at:
9// - https://github.com/golang/go/blob/master/LICENSE
10//
11// Reference implementations:
12// - https://github.com/golang/go/blob/master/src/crypto/sha1/sha1block_amd64.s
13
// LOADCS writes one 20-byte record into slot `slot` of the table at `dst`:
// the four dwords of `words`, lane 3 first down to lane 0, followed by the
// 32-bit value held in `ereg`. The lanes are reversed with a single shuffle
// and written with one 16-byte store rather than four per-lane extracts.
// Clobbers X8.
#define LOADCS(words, ereg, slot, dst) \
	VPSHUFD $0x1B, words, X8; \
	VMOVDQU X8, ((slot*20)+0)(dst); \
	MOVL ereg, ((slot*20)+16)(dst);
20
// LOADM1 writes the four dwords of `words`, lane 3 first down to lane 0, into
// 16-byte slot `slot` of the buffer at `dst`. Clobbers X8.
#define LOADM1(words, slot, dst) \
	VPSHUFD $0x1B, words, X8; \
	VMOVDQU X8, ((slot*16)+0)(dst);
24
// func blockAMD64(h []uint32, p []byte, m1 []uint32, cs [][5]uint32)
// Requires: AVX, SHA, SSE2, SSE4.1, SSSE3
//
// Runs the SHA-1 compression function over p in 64-byte blocks using the
// SHA extensions, updating the five state words in h. While doing so it also
// records, for the block being processed:
//   - m1: the message-schedule words of every 4-round group (20 slots of 16
//     bytes, written by LOADM1 right after the group that consumes them);
//   - cs: three 20-byte state records written by LOADCS (slot 0 before round
//     0, slot 1 before round 56, slot 2 before round 64).
//
// Caller contract visible from the code:
//   - p_len == 0 returns immediately without touching h, m1 or cs.
//   - The loop advances 64 bytes at a time and exits only when the read
//     pointer equals the end pointer, so p_len is expected to be a multiple
//     of 64.
//   - R13 (m1) and R15 (cs) are never advanced, so both outputs are rewritten
//     on every block and describe the last block processed on return.
//   - h must hold at least 5 words, m1 at least 80 words and cs at least 3
//     entries; no lengths other than p_len are checked here.
//
// Register use:
//   DI  = h base            SI  = current block in p     DX = end of p
//   R13 = m1 base           R15 = cs base                R12 = scalar E for LOADCS
//   AX  = 16-byte aligned save area in the frame (E at 0, ABCD at 16)
//   X0  = ABCD working state, X5/X6 = alternating E operand
//   X1-X4 = message schedule, X7 = byte-shuffle mask, X8 = macro scratch
TEXT ·blockAMD64(SB), NOSPLIT, $80-96
	MOVQ h_base+0(FP), DI
	MOVQ p_base+24(FP), SI
	MOVQ p_len+32(FP), DX
	MOVQ m1_base+48(FP), R13
	MOVQ cs_base+72(FP), R15
	TESTQ DX, DX
	JEQ done
	ADDQ SI, DX

	// Carve a 16-byte aligned save area for E and ABCD out of the frame:
	// round SP+15 down to a multiple of 16.
	LEAQ 15(SP), AX
	ANDQ $~15, AX

	// Load initial hash state: E into the top lane of X5 (other lanes
	// cleared by the mask), ABCD lane-reversed into X0.
	PINSRD $0x03, 16(DI), X5
	VMOVDQU (DI), X0
	PAND upper_mask<>+0(SB), X5
	PSHUFD $0x1b, X0, X0
	VMOVDQA shuffle_mask<>+0(SB), X7

loop:
	// Save ABCD and E working values
	VMOVDQA X5, (AX)
	VMOVDQA X0, 16(AX)

	// LOAD CS 0
	VPEXTRD $3, X5, R12
	LOADCS(X0, R12, 0, R15)

	// Rounds 0-3
	VMOVDQU (SI), X1
	PSHUFB X7, X1
	PADDD X1, X5
	VMOVDQA X0, X6
	SHA1RNDS4 $0x00, X5, X0
	LOADM1(X1, 0, R13)

	// Rounds 4-7
	VMOVDQU 16(SI), X2
	PSHUFB X7, X2
	SHA1NEXTE X2, X6
	VMOVDQA X0, X5
	SHA1RNDS4 $0x00, X6, X0
	SHA1MSG1 X2, X1
	LOADM1(X2, 1, R13)

	// Rounds 8-11
	VMOVDQU 32(SI), X3
	PSHUFB X7, X3
	SHA1NEXTE X3, X5
	VMOVDQA X0, X6
	SHA1RNDS4 $0x00, X5, X0
	SHA1MSG1 X3, X2
	PXOR X3, X1
	LOADM1(X3, 2, R13)

	// Rounds 12-15
	VMOVDQU 48(SI), X4
	PSHUFB X7, X4
	SHA1NEXTE X4, X6
	VMOVDQA X0, X5
	SHA1MSG2 X4, X1
	SHA1RNDS4 $0x00, X6, X0
	SHA1MSG1 X4, X3
	PXOR X4, X2
	LOADM1(X4, 3, R13)

	// Rounds 16-19
	SHA1NEXTE X1, X5
	VMOVDQA X0, X6
	SHA1MSG2 X1, X2
	SHA1RNDS4 $0x00, X5, X0
	SHA1MSG1 X1, X4
	PXOR X1, X3
	LOADM1(X1, 4, R13)

	// Rounds 20-23
	SHA1NEXTE X2, X6
	VMOVDQA X0, X5
	SHA1MSG2 X2, X3
	SHA1RNDS4 $0x01, X6, X0
	SHA1MSG1 X2, X1
	PXOR X2, X4
	LOADM1(X2, 5, R13)

	// Rounds 24-27
	SHA1NEXTE X3, X5
	VMOVDQA X0, X6
	SHA1MSG2 X3, X4
	SHA1RNDS4 $0x01, X5, X0
	SHA1MSG1 X3, X2
	PXOR X3, X1
	LOADM1(X3, 6, R13)

	// Rounds 28-31
	SHA1NEXTE X4, X6
	VMOVDQA X0, X5
	SHA1MSG2 X4, X1
	SHA1RNDS4 $0x01, X6, X0
	SHA1MSG1 X4, X3
	PXOR X4, X2
	LOADM1(X4, 7, R13)

	// Rounds 32-35
	SHA1NEXTE X1, X5
	VMOVDQA X0, X6
	SHA1MSG2 X1, X2
	SHA1RNDS4 $0x01, X5, X0
	SHA1MSG1 X1, X4
	PXOR X1, X3
	LOADM1(X1, 8, R13)

	// Rounds 36-39
	SHA1NEXTE X2, X6
	VMOVDQA X0, X5
	SHA1MSG2 X2, X3
	SHA1RNDS4 $0x01, X6, X0
	SHA1MSG1 X2, X1
	PXOR X2, X4
	LOADM1(X2, 9, R13)

	// Rounds 40-43
	SHA1NEXTE X3, X5
	VMOVDQA X0, X6
	SHA1MSG2 X3, X4
	SHA1RNDS4 $0x02, X5, X0
	SHA1MSG1 X3, X2
	PXOR X3, X1
	LOADM1(X3, 10, R13)

	// Rounds 44-47
	SHA1NEXTE X4, X6
	VMOVDQA X0, X5
	SHA1MSG2 X4, X1
	SHA1RNDS4 $0x02, X6, X0
	SHA1MSG1 X4, X3
	PXOR X4, X2
	LOADM1(X4, 11, R13)

	// Rounds 48-51
	SHA1NEXTE X1, X5
	VMOVDQA X0, X6
	SHA1MSG2 X1, X2
	SHA1RNDS4 $0x02, X5, X0
	SHA1MSG1 X1, X4
	PXOR X1, X3
	LOADM1(X1, 12, R13)

	// derive pre-round 56's E out of round 51's A.
	VPEXTRD $3, X0, R12
	ROLL $30, R12

	// Rounds 52-55
	SHA1NEXTE X2, X6
	VMOVDQA X0, X5
	SHA1MSG2 X2, X3
	SHA1RNDS4 $0x02, X6, X0
	SHA1MSG1 X2, X1
	PXOR X2, X4
	LOADM1(X2, 13, R13)

	// LOAD CS 58 (gathers 56 which will be rectified in Go)
	LOADCS(X0, R12, 1, R15)

	// Rounds 56-59
	SHA1NEXTE X3, X5
	VMOVDQA X0, X6
	SHA1MSG2 X3, X4
	SHA1RNDS4 $0x02, X5, X0
	SHA1MSG1 X3, X2
	PXOR X3, X1
	LOADM1(X3, 14, R13)

	// derive pre-round 64's E out of round 59's A.
	VPEXTRD $3, X0, R12
	ROLL $30, R12

	// Rounds 60-63
	SHA1NEXTE X4, X6
	VMOVDQA X0, X5
	SHA1MSG2 X4, X1
	SHA1RNDS4 $0x03, X6, X0
	SHA1MSG1 X4, X3
	PXOR X4, X2
	LOADM1(X4, 15, R13)

	// LOAD CS 65 (gathers 64 which will be rectified in Go)
	LOADCS(X0, R12, 2, R15)

	// Rounds 64-67
	SHA1NEXTE X1, X5
	VMOVDQA X0, X6
	SHA1MSG2 X1, X2
	SHA1RNDS4 $0x03, X5, X0
	SHA1MSG1 X1, X4
	PXOR X1, X3
	LOADM1(X1, 16, R13)

	// Rounds 68-71
	SHA1NEXTE X2, X6
	VMOVDQA X0, X5
	SHA1MSG2 X2, X3
	SHA1RNDS4 $0x03, X6, X0
	PXOR X2, X4
	LOADM1(X2, 17, R13)

	// Rounds 72-75
	SHA1NEXTE X3, X5
	VMOVDQA X0, X6
	SHA1MSG2 X3, X4
	SHA1RNDS4 $0x03, X5, X0
	LOADM1(X3, 18, R13)

	// Rounds 76-79
	SHA1NEXTE X4, X6
	VMOVDQA X0, X5
	SHA1RNDS4 $0x03, X6, X0
	LOADM1(X4, 19, R13)

	// Add saved E and ABCD
	SHA1NEXTE (AX), X5
	PADDD 16(AX), X0

	// Check if we are done, if not return to the loop
	ADDQ $0x40, SI
	CMPQ SI, DX
	JNE loop

	// Write the hash state back to digest
	PSHUFD $0x1b, X0, X0
	VMOVDQU X0, (DI)
	PEXTRD $0x03, X5, 16(DI)

done:
	RET
267
// upper_mask keeps only the highest 32-bit lane (bytes 12-15) of a 128-bit
// register and clears the other three lanes when applied with PAND.
DATA upper_mask<>+0(SB)/4, $0x00000000
DATA upper_mask<>+4(SB)/4, $0x00000000
DATA upper_mask<>+8(SB)/4, $0x00000000
DATA upper_mask<>+12(SB)/4, $0xffffffff
GLOBL upper_mask<>(SB), RODATA, $16
271
// shuffle_mask is a PSHUFB control that reverses all 16 bytes of a register
// (destination byte i takes source byte 15-i).
DATA shuffle_mask<>+0(SB)/4, $0x0c0d0e0f
DATA shuffle_mask<>+4(SB)/4, $0x08090a0b
DATA shuffle_mask<>+8(SB)/4, $0x04050607
DATA shuffle_mask<>+12(SB)/4, $0x00010203
GLOBL shuffle_mask<>(SB), RODATA, $16