Crypto++ 8.7
Free C++ class library of cryptographic schemes
sha_simd.cpp
1// sha_simd.cpp - written and placed in the public domain by
2// Jeffrey Walton, Uri Blumenthal and Marcel Raad.
3//
4// This source file uses intrinsics to gain access to SHA-NI and
5// ARMv8a SHA instructions. A separate source file is needed
6// because additional CXXFLAGS are required to enable the
7// appropriate instruction sets in some build configurations.
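//
// Illustrative only: the exact flags come from the library's build files,
// but GCC-compatible toolchains typically need something along the lines of
// "-msse4.2 -msha" (x86/x64) or "-march=armv8-a+crypto" (ARMv8) to compile
// the intrinsics used below.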
8
9#include "pch.h"
10#include "config.h"
11#include "sha.h"
12#include "misc.h"
13
14#if defined(CRYPTOPP_DISABLE_SHA_ASM)
15# undef CRYPTOPP_X86_ASM_AVAILABLE
16# undef CRYPTOPP_X32_ASM_AVAILABLE
17# undef CRYPTOPP_X64_ASM_AVAILABLE
18# undef CRYPTOPP_SSE2_ASM_AVAILABLE
19#endif
20
21#if (CRYPTOPP_SHANI_AVAILABLE)
22# include <nmmintrin.h>
23# include <immintrin.h>
24#endif
25
26// Android makes <arm_acle.h> available with ARMv7-a
27#if (CRYPTOPP_BOOL_ARMV8)
28# if (CRYPTOPP_ARM_NEON_HEADER)
29# include <arm_neon.h>
30# endif
31# if (CRYPTOPP_ARM_ACLE_HEADER)
32# include <stdint.h>
33# include <arm_acle.h>
34# endif
35#endif
36
37#if CRYPTOPP_POWER8_SHA_AVAILABLE
38# include "ppc_simd.h"
39#endif
40
41#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
42# include <signal.h>
43# include <setjmp.h>
44#endif
45
46#ifndef EXCEPTION_EXECUTE_HANDLER
47# define EXCEPTION_EXECUTE_HANDLER 1
48#endif
49
50// Squash MS LNK4221 and libtool warnings
51extern const char SHA_SIMD_FNAME[] = __FILE__;
52
53NAMESPACE_BEGIN(CryptoPP)
54
55// ***************** SHA key tables ********************
56
57extern const word32 SHA256_K[64];
58extern const word64 SHA512_K[80];
59
60// ***************** SIGILL probes ********************
61
62#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
63extern "C" {
64 typedef void (*SigHandler)(int);
65
66 static jmp_buf s_jmpSIGILL;
67 static void SigIllHandler(int)
68 {
69 longjmp(s_jmpSIGILL, 1);
70 }
71}
72#endif // CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
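// Probe pattern used below: install a SIGILL handler, record a setjmp()
// return point, then execute the instruction under test. If the CPU lacks
// the instruction the kernel raises SIGILL, the handler longjmp()s back,
// and the probe reports the feature as unavailable.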
73
74#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8)
75bool CPU_ProbeSHA1()
76{
77#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
78 return false;
79#elif (CRYPTOPP_ARM_SHA1_AVAILABLE)
80# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
81 volatile bool result = true;
82 __try
83 {
84 unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
85 uint32x4_t data1 = vld1q_u32(w+0);
86 uint32x4_t data2 = vld1q_u32(w+4);
87 uint32x4_t data3 = vld1q_u32(w+8);
88
89 uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
90 uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
91 uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
92 uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
93 uint32x4_t r5 = vsha1su1q_u32 (data1, data2);
94
95 result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
96 }
97 __except (EXCEPTION_EXECUTE_HANDLER)
98 {
99 return false;
100 }
101 return result;
102# else
103
104 // longjmp and clobber warnings. Volatile is required.
105 // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
106 volatile bool result = true;
107
108 volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
109 if (oldHandler == SIG_ERR)
110 return false;
111
112 volatile sigset_t oldMask;
113 if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
114 {
115 signal(SIGILL, oldHandler);
116 return false;
117 }
118
119 if (setjmp(s_jmpSIGILL))
120 result = false;
121 else
122 {
123 unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
124 uint32x4_t data1 = vld1q_u32(w+0);
125 uint32x4_t data2 = vld1q_u32(w+4);
126 uint32x4_t data3 = vld1q_u32(w+8);
127
128 uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
129 uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
130 uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
131 uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
132 uint32x4_t r5 = vsha1su1q_u32 (data1, data2);
133
134 result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
135 }
136
137 sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
138 signal(SIGILL, oldHandler);
139 return result;
140# endif
141#else
142 return false;
143#endif // CRYPTOPP_ARM_SHA1_AVAILABLE
144}
145
146bool CPU_ProbeSHA256()
147{
148#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
149 return false;
150#elif (CRYPTOPP_ARM_SHA2_AVAILABLE)
151# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
152 volatile bool result = true;
153 __try
154 {
155 unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
156 uint32x4_t data1 = vld1q_u32(w+0);
157 uint32x4_t data2 = vld1q_u32(w+4);
158 uint32x4_t data3 = vld1q_u32(w+8);
159
160 uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
161 uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
162 uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
163 uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);
164
165 result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
166 }
167 __except (EXCEPTION_EXECUTE_HANDLER)
168 {
169 return false;
170 }
171 return result;
172# else
173
174 // longjmp and clobber warnings. Volatile is required.
175 // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
176 volatile bool result = true;
177
178 volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
179 if (oldHandler == SIG_ERR)
180 return false;
181
182 volatile sigset_t oldMask;
183 if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
184 {
185 signal(SIGILL, oldHandler);
186 return false;
187 }
188
189 if (setjmp(s_jmpSIGILL))
190 result = false;
191 else
192 {
193 unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
194 uint32x4_t data1 = vld1q_u32(w+0);
195 uint32x4_t data2 = vld1q_u32(w+4);
196 uint32x4_t data3 = vld1q_u32(w+8);
197
198 uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
199 uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
200 uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
201 uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);
202
203 result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
204 }
205
206 sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
207 signal(SIGILL, oldHandler);
208 return result;
209# endif
210#else
211 return false;
212#endif // CRYPTOPP_ARM_SHA2_AVAILABLE
213}
214#endif // ARM32 or ARM64
215
216// ***************** Intel x86 SHA ********************
217
218/////////////////////////////////////
219// start of Walton and Gulley code //
220/////////////////////////////////////
221
222#if CRYPTOPP_SHANI_AVAILABLE
223// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
224void SHA1_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
225{
226 CRYPTOPP_ASSERT(state);
227 CRYPTOPP_ASSERT(data);
228 CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);
229
230 __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
231 __m128i MASK, MSG0, MSG1, MSG2, MSG3;
232
233 // Load initial values
234 ABCD = _mm_loadu_si128(CONST_M128_CAST(state));
235 E0 = _mm_set_epi32(state[4], 0, 0, 0);
236 ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
237
238 // IA-32 SHA is little endian, SHA::Transform is big endian,
239 // and SHA::HashMultipleBlocks can be either. ByteOrder
240 // allows us to avoid extra endian reversals. It saves 1.0 cpb.
241 MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement
242 _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15) :
243 _mm_set_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12) ;
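    // Rough reading of the masks above: with BIG_ENDIAN_ORDER input the
    // shuffle reverses the order of the four 32-bit words and byte-swaps
    // each word; with LITTLE_ENDIAN_ORDER input only the word order is
    // reversed, since the bytes are already in machine order.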
244
245 while (length >= SHA1::BLOCKSIZE)
246 {
247 // Save current hash
248 ABCD_SAVE = ABCD;
249 E0_SAVE = E0;
250
251 // Rounds 0-3
252 MSG0 = _mm_loadu_si128(CONST_M128_CAST(data+0));
253 MSG0 = _mm_shuffle_epi8(MSG0, MASK);
254 E0 = _mm_add_epi32(E0, MSG0);
255 E1 = ABCD;
256 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
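        // The immediate on _mm_sha1rnds4_epu32 selects the round function and
        // constant: 0 for rounds 0-19, 1 for 20-39, 2 for 40-59, 3 for 60-79.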
257
258 // Rounds 4-7
259 MSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
260 MSG1 = _mm_shuffle_epi8(MSG1, MASK);
261 E1 = _mm_sha1nexte_epu32(E1, MSG1);
262 E0 = ABCD;
263 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
264 MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
265
266 // Rounds 8-11
267 MSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
268 MSG2 = _mm_shuffle_epi8(MSG2, MASK);
269 E0 = _mm_sha1nexte_epu32(E0, MSG2);
270 E1 = ABCD;
271 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
272 MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
273 MSG0 = _mm_xor_si128(MSG0, MSG2);
274
275 // Rounds 12-15
276 MSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
277 MSG3 = _mm_shuffle_epi8(MSG3, MASK);
278 E1 = _mm_sha1nexte_epu32(E1, MSG3);
279 E0 = ABCD;
280 MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
281 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
282 MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
283 MSG1 = _mm_xor_si128(MSG1, MSG3);
284
285 // Rounds 16-19
286 E0 = _mm_sha1nexte_epu32(E0, MSG0);
287 E1 = ABCD;
288 MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
289 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
290 MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
291 MSG2 = _mm_xor_si128(MSG2, MSG0);
292
293 // Rounds 20-23
294 E1 = _mm_sha1nexte_epu32(E1, MSG1);
295 E0 = ABCD;
296 MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
297 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
298 MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
299 MSG3 = _mm_xor_si128(MSG3, MSG1);
300
301 // Rounds 24-27
302 E0 = _mm_sha1nexte_epu32(E0, MSG2);
303 E1 = ABCD;
304 MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
305 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
306 MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
307 MSG0 = _mm_xor_si128(MSG0, MSG2);
308
309 // Rounds 28-31
310 E1 = _mm_sha1nexte_epu32(E1, MSG3);
311 E0 = ABCD;
312 MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
313 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
314 MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
315 MSG1 = _mm_xor_si128(MSG1, MSG3);
316
317 // Rounds 32-35
318 E0 = _mm_sha1nexte_epu32(E0, MSG0);
319 E1 = ABCD;
320 MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
321 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
322 MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
323 MSG2 = _mm_xor_si128(MSG2, MSG0);
324
325 // Rounds 36-39
326 E1 = _mm_sha1nexte_epu32(E1, MSG1);
327 E0 = ABCD;
328 MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
329 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
330 MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
331 MSG3 = _mm_xor_si128(MSG3, MSG1);
332
333 // Rounds 40-43
334 E0 = _mm_sha1nexte_epu32(E0, MSG2);
335 E1 = ABCD;
336 MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
337 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
338 MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
339 MSG0 = _mm_xor_si128(MSG0, MSG2);
340
341 // Rounds 44-47
342 E1 = _mm_sha1nexte_epu32(E1, MSG3);
343 E0 = ABCD;
344 MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
345 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
346 MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
347 MSG1 = _mm_xor_si128(MSG1, MSG3);
348
349 // Rounds 48-51
350 E0 = _mm_sha1nexte_epu32(E0, MSG0);
351 E1 = ABCD;
352 MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
353 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
354 MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
355 MSG2 = _mm_xor_si128(MSG2, MSG0);
356
357 // Rounds 52-55
358 E1 = _mm_sha1nexte_epu32(E1, MSG1);
359 E0 = ABCD;
360 MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
361 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
362 MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
363 MSG3 = _mm_xor_si128(MSG3, MSG1);
364
365 // Rounds 56-59
366 E0 = _mm_sha1nexte_epu32(E0, MSG2);
367 E1 = ABCD;
368 MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
369 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
370 MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
371 MSG0 = _mm_xor_si128(MSG0, MSG2);
372
373 // Rounds 60-63
374 E1 = _mm_sha1nexte_epu32(E1, MSG3);
375 E0 = ABCD;
376 MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
377 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
378 MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
379 MSG1 = _mm_xor_si128(MSG1, MSG3);
380
381 // Rounds 64-67
382 E0 = _mm_sha1nexte_epu32(E0, MSG0);
383 E1 = ABCD;
384 MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
385 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
386 MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
387 MSG2 = _mm_xor_si128(MSG2, MSG0);
388
389 // Rounds 68-71
390 E1 = _mm_sha1nexte_epu32(E1, MSG1);
391 E0 = ABCD;
392 MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
393 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
394 MSG3 = _mm_xor_si128(MSG3, MSG1);
395
396 // Rounds 72-75
397 E0 = _mm_sha1nexte_epu32(E0, MSG2);
398 E1 = ABCD;
399 MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
400 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
401
402 // Rounds 76-79
403 E1 = _mm_sha1nexte_epu32(E1, MSG3);
404 E0 = ABCD;
405 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
406
407 // Add values back to state
408 E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
409 ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);
410
411 data += SHA1::BLOCKSIZE/sizeof(word32);
412 length -= SHA1::BLOCKSIZE;
413 }
414
415 // Save state
416 ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
417 _mm_storeu_si128(M128_CAST(state), ABCD);
418 state[4] = _mm_extract_epi32(E0, 3);
419}
420
421// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
422void SHA256_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
423{
424 CRYPTOPP_ASSERT(state);
425 CRYPTOPP_ASSERT(data);
426 CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
427
428 __m128i STATE0, STATE1;
429 __m128i MSG, TMP, MASK;
430 __m128i TMSG0, TMSG1, TMSG2, TMSG3;
431 __m128i ABEF_SAVE, CDGH_SAVE;
432
433 // Load initial values
434 TMP = _mm_loadu_si128(M128_CAST(&state[0]));
435 STATE1 = _mm_loadu_si128(M128_CAST(&state[4]));
436
437 // IA-32 SHA is little endian, SHA::Transform is big endian,
438 // and SHA::HashMultipleBlocks can be either. ByteOrder
439 // allows us to avoid extra endian reversals. It saves 1.0 cpb.
440 MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement
441 _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3) :
442 _mm_set_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0) ;
443
444 TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
445 STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
446 STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
447 STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
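    // The shuffles above split the state the way _mm_sha256rnds2_epu32
    // expects it: STATE0 holds {A,B,E,F} and STATE1 holds {C,D,G,H}.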
448
449 while (length >= SHA256::BLOCKSIZE)
450 {
451 // Save current hash
452 ABEF_SAVE = STATE0;
453 CDGH_SAVE = STATE1;
454
455 // Rounds 0-3
456 MSG = _mm_loadu_si128(CONST_M128_CAST(data+0));
457 TMSG0 = _mm_shuffle_epi8(MSG, MASK);
458 MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98)));
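        // The 64-bit literals pack the round constants two at a time, e.g.
        // W64LIT(0x71374491428A2F98) is SHA256_K[1]:SHA256_K[0]; each round
        // block below embeds its four constants the same way.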
459 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
460 MSG = _mm_shuffle_epi32(MSG, 0x0E);
461 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
462
463 // Rounds 4-7
464 TMSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
465 TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
466 MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B)));
467 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
468 MSG = _mm_shuffle_epi32(MSG, 0x0E);
469 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
470 TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
471
472 // Rounds 8-11
473 TMSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
474 TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
475 MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98)));
476 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
477 MSG = _mm_shuffle_epi32(MSG, 0x0E);
478 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
479 TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
480
481 // Rounds 12-15
482 TMSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
483 TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
484 MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74)));
485 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
486 TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
487 TMSG0 = _mm_add_epi32(TMSG0, TMP);
488 TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
489 MSG = _mm_shuffle_epi32(MSG, 0x0E);
490 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
491 TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
492
493 // Rounds 16-19
494 MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x240CA1CC0FC19DC6), W64LIT(0xEFBE4786E49B69C1)));
495 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
496 TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
497 TMSG1 = _mm_add_epi32(TMSG1, TMP);
498 TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
499 MSG = _mm_shuffle_epi32(MSG, 0x0E);
500 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
501 TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
502
503 // Rounds 20-23
504 MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x76F988DA5CB0A9DC), W64LIT(0x4A7484AA2DE92C6F)));
505 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
506 TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
507 TMSG2 = _mm_add_epi32(TMSG2, TMP);
508 TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
509 MSG = _mm_shuffle_epi32(MSG, 0x0E);
510 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
511 TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
512
513 // Rounds 24-27
514 MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xBF597FC7B00327C8), W64LIT(0xA831C66D983E5152)));
515 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
516 TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
517 TMSG3 = _mm_add_epi32(TMSG3, TMP);
518 TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
519 MSG = _mm_shuffle_epi32(MSG, 0x0E);
520 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
521 TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
522
523 // Rounds 28-31
524 MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x1429296706CA6351), W64LIT(0xD5A79147C6E00BF3)));
525 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
526 TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
527 TMSG0 = _mm_add_epi32(TMSG0, TMP);
528 TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
529 MSG = _mm_shuffle_epi32(MSG, 0x0E);
530 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
531 TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
532
533 // Rounds 32-35
534 MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x53380D134D2C6DFC), W64LIT(0x2E1B213827B70A85)));
535 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
536 TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
537 TMSG1 = _mm_add_epi32(TMSG1, TMP);
538 TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
539 MSG = _mm_shuffle_epi32(MSG, 0x0E);
540 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
541 TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
542
543 // Rounds 36-39
544 MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x92722C8581C2C92E), W64LIT(0x766A0ABB650A7354)));
545 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
546 TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
547 TMSG2 = _mm_add_epi32(TMSG2, TMP);
548 TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
549 MSG = _mm_shuffle_epi32(MSG, 0x0E);
550 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
551 TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
552
553 // Rounds 40-43
554 MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xC76C51A3C24B8B70), W64LIT(0xA81A664BA2BFE8A1)));
555 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
556 TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
557 TMSG3 = _mm_add_epi32(TMSG3, TMP);
558 TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
559 MSG = _mm_shuffle_epi32(MSG, 0x0E);
560 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
561 TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
562
563 // Rounds 44-47
564 MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x106AA070F40E3585), W64LIT(0xD6990624D192E819)));
565 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
566 TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
567 TMSG0 = _mm_add_epi32(TMSG0, TMP);
568 TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
569 MSG = _mm_shuffle_epi32(MSG, 0x0E);
570 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
571 TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
572
573 // Rounds 48-51
574 MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x34B0BCB52748774C), W64LIT(0x1E376C0819A4C116)));
575 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
576 TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
577 TMSG1 = _mm_add_epi32(TMSG1, TMP);
578 TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
579 MSG = _mm_shuffle_epi32(MSG, 0x0E);
580 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
581 TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
582
583 // Rounds 52-55
584 MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x682E6FF35B9CCA4F), W64LIT(0x4ED8AA4A391C0CB3)));
585 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
586 TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
587 TMSG2 = _mm_add_epi32(TMSG2, TMP);
588 TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
589 MSG = _mm_shuffle_epi32(MSG, 0x0E);
590 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
591
592 // Rounds 56-59
593 MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x8CC7020884C87814), W64LIT(0x78A5636F748F82EE)));
594 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
595 TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
596 TMSG3 = _mm_add_epi32(TMSG3, TMP);
597 TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
598 MSG = _mm_shuffle_epi32(MSG, 0x0E);
599 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
600
601 // Rounds 60-63
602 MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC67178F2BEF9A3F7), W64LIT(0xA4506CEB90BEFFFA)));
603 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
604 MSG = _mm_shuffle_epi32(MSG, 0x0E);
605 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
606
607 // Add values back to state
608 STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
609 STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
610
611 data += SHA256::BLOCKSIZE/sizeof(word32);
612 length -= SHA256::BLOCKSIZE;
613 }
614
615 TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
616 STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
617 STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
618 STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // HGFE
619
620 // Save state
621 _mm_storeu_si128(M128_CAST(&state[0]), STATE0);
622 _mm_storeu_si128(M128_CAST(&state[4]), STATE1);
623}
624#endif // CRYPTOPP_SHANI_AVAILABLE
625
626///////////////////////////////////
627// end of Walton and Gulley code //
628///////////////////////////////////
629
630// ***************** ARMV8 SHA ********************
631
632/////////////////////////////////////////////////////////////
633// start of Walton, Schneiders, O'Rourke and Hovsmith code //
634/////////////////////////////////////////////////////////////
635
636#if CRYPTOPP_ARM_SHA1_AVAILABLE
637void SHA1_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
638{
639 CRYPTOPP_ASSERT(state);
640 CRYPTOPP_ASSERT(data);
641 CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);
642
643 uint32x4_t C0, C1, C2, C3;
644 uint32x4_t ABCD, ABCD_SAVED;
645 uint32x4_t MSG0, MSG1, MSG2, MSG3;
646 uint32x4_t TMP0, TMP1;
647 uint32_t E0, E0_SAVED, E1;
648
649 // Load initial values
650 C0 = vdupq_n_u32(0x5A827999);
651 C1 = vdupq_n_u32(0x6ED9EBA1);
652 C2 = vdupq_n_u32(0x8F1BBCDC);
653 C3 = vdupq_n_u32(0xCA62C1D6);
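    // C0..C3 hold the four standard SHA-1 round constants, one per 20-round group.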
654
655 ABCD = vld1q_u32(&state[0]);
656 E0 = state[4];
657
658 while (length >= SHA1::BLOCKSIZE)
659 {
660 // Save current hash
661 ABCD_SAVED = ABCD;
662 E0_SAVED = E0;
663
664 MSG0 = vld1q_u32(data + 0);
665 MSG1 = vld1q_u32(data + 4);
666 MSG2 = vld1q_u32(data + 8);
667 MSG3 = vld1q_u32(data + 12);
668
669 if (order == BIG_ENDIAN_ORDER) // Data arrangement
670 {
671 MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
672 MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
673 MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
674 MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
675 }
676
677 TMP0 = vaddq_u32(MSG0, C0);
678 TMP1 = vaddq_u32(MSG1, C0);
679
680 // Rounds 0-3
681 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
682 ABCD = vsha1cq_u32(ABCD, E0, TMP0);
683 TMP0 = vaddq_u32(MSG2, C0);
684 MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
685
686 // Rounds 4-7
687 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
688 ABCD = vsha1cq_u32(ABCD, E1, TMP1);
689 TMP1 = vaddq_u32(MSG3, C0);
690 MSG0 = vsha1su1q_u32(MSG0, MSG3);
691 MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
692
693 // Rounds 8-11
694 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
695 ABCD = vsha1cq_u32(ABCD, E0, TMP0);
696 TMP0 = vaddq_u32(MSG0, C0);
697 MSG1 = vsha1su1q_u32(MSG1, MSG0);
698 MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
699
700 // Rounds 12-15
701 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
702 ABCD = vsha1cq_u32(ABCD, E1, TMP1);
703 TMP1 = vaddq_u32(MSG1, C1);
704 MSG2 = vsha1su1q_u32(MSG2, MSG1);
705 MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
706
707 // Rounds 16-19
708 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
709 ABCD = vsha1cq_u32(ABCD, E0, TMP0);
710 TMP0 = vaddq_u32(MSG2, C1);
711 MSG3 = vsha1su1q_u32(MSG3, MSG2);
712 MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
713
714 // Rounds 20-23
715 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
716 ABCD = vsha1pq_u32(ABCD, E1, TMP1);
717 TMP1 = vaddq_u32(MSG3, C1);
718 MSG0 = vsha1su1q_u32(MSG0, MSG3);
719 MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
720
721 // Rounds 24-27
722 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
723 ABCD = vsha1pq_u32(ABCD, E0, TMP0);
724 TMP0 = vaddq_u32(MSG0, C1);
725 MSG1 = vsha1su1q_u32(MSG1, MSG0);
726 MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
727
728 // Rounds 28-31
729 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
730 ABCD = vsha1pq_u32(ABCD, E1, TMP1);
731 TMP1 = vaddq_u32(MSG1, C1);
732 MSG2 = vsha1su1q_u32(MSG2, MSG1);
733 MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
734
735 // Rounds 32-35
736 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
737 ABCD = vsha1pq_u32(ABCD, E0, TMP0);
738 TMP0 = vaddq_u32(MSG2, C2);
739 MSG3 = vsha1su1q_u32(MSG3, MSG2);
740 MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
741
742 // Rounds 36-39
743 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
744 ABCD = vsha1pq_u32(ABCD, E1, TMP1);
745 TMP1 = vaddq_u32(MSG3, C2);
746 MSG0 = vsha1su1q_u32(MSG0, MSG3);
747 MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
748
749 // Rounds 40-43
750 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
751 ABCD = vsha1mq_u32(ABCD, E0, TMP0);
752 TMP0 = vaddq_u32(MSG0, C2);
753 MSG1 = vsha1su1q_u32(MSG1, MSG0);
754 MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
755
756 // Rounds 44-47
757 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
758 ABCD = vsha1mq_u32(ABCD, E1, TMP1);
759 TMP1 = vaddq_u32(MSG1, C2);
760 MSG2 = vsha1su1q_u32(MSG2, MSG1);
761 MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
762
763 // Rounds 48-51
764 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
765 ABCD = vsha1mq_u32(ABCD, E0, TMP0);
766 TMP0 = vaddq_u32(MSG2, C2);
767 MSG3 = vsha1su1q_u32(MSG3, MSG2);
768 MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
769
770 // Rounds 52-55
771 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
772 ABCD = vsha1mq_u32(ABCD, E1, TMP1);
773 TMP1 = vaddq_u32(MSG3, C3);
774 MSG0 = vsha1su1q_u32(MSG0, MSG3);
775 MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
776
777 // Rounds 56-59
778 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
779 ABCD = vsha1mq_u32(ABCD, E0, TMP0);
780 TMP0 = vaddq_u32(MSG0, C3);
781 MSG1 = vsha1su1q_u32(MSG1, MSG0);
782 MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
783
784 // Rounds 60-63
785 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
786 ABCD = vsha1pq_u32(ABCD, E1, TMP1);
787 TMP1 = vaddq_u32(MSG1, C3);
788 MSG2 = vsha1su1q_u32(MSG2, MSG1);
789 MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
790
791 // Rounds 64-67
792 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
793 ABCD = vsha1pq_u32(ABCD, E0, TMP0);
794 TMP0 = vaddq_u32(MSG2, C3);
795 MSG3 = vsha1su1q_u32(MSG3, MSG2);
796 MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
797
798 // Rounds 68-71
799 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
800 ABCD = vsha1pq_u32(ABCD, E1, TMP1);
801 TMP1 = vaddq_u32(MSG3, C3);
802 MSG0 = vsha1su1q_u32(MSG0, MSG3);
803
804 // Rounds 72-75
805 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
806 ABCD = vsha1pq_u32(ABCD, E0, TMP0);
807
808 // Rounds 76-79
809 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
810 ABCD = vsha1pq_u32(ABCD, E1, TMP1);
811
812 E0 += E0_SAVED;
813 ABCD = vaddq_u32(ABCD_SAVED, ABCD);
814
815 data += SHA1::BLOCKSIZE/sizeof(word32);
816 length -= SHA1::BLOCKSIZE;
817 }
818
819 // Save state
820 vst1q_u32(&state[0], ABCD);
821 state[4] = E0;
822}
823#endif // CRYPTOPP_ARM_SHA1_AVAILABLE
824
825#if CRYPTOPP_ARM_SHA2_AVAILABLE
826void SHA256_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
827{
828 CRYPTOPP_ASSERT(state);
829 CRYPTOPP_ASSERT(data);
830 CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
831
832 uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
833 uint32x4_t MSG0, MSG1, MSG2, MSG3;
834 uint32x4_t TMP0, TMP1, TMP2;
835
836 // Load initial values
837 STATE0 = vld1q_u32(&state[0]);
838 STATE1 = vld1q_u32(&state[4]);
839
840 while (length >= SHA256::BLOCKSIZE)
841 {
842 // Save current hash
843 ABEF_SAVE = STATE0;
844 CDGH_SAVE = STATE1;
845
846 // Load message
847 MSG0 = vld1q_u32(data + 0);
848 MSG1 = vld1q_u32(data + 4);
849 MSG2 = vld1q_u32(data + 8);
850 MSG3 = vld1q_u32(data + 12);
851
852 if (order == BIG_ENDIAN_ORDER) // Data arrangement
853 {
854 MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
855 MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
856 MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
857 MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
858 }
859
860 TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x00]));
861
862 // Rounds 0-3
863 MSG0 = vsha256su0q_u32(MSG0, MSG1);
864 TMP2 = STATE0;
865 TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x04]));
866 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
867 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
868 MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
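        // Each vsha256hq_u32/vsha256h2q_u32 pair performs four rounds: the
        // first updates the {A,B,C,D} half, the second the {E,F,G,H} half.
        // TMP2 preserves the pre-round STATE0 that the second instruction needs.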
869
870 // Rounds 4-7
871 MSG1 = vsha256su0q_u32(MSG1, MSG2);
872 TMP2 = STATE0;
873 TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x08]));
874 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
875 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
876 MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
877
878 // Rounds 8-11
879 MSG2 = vsha256su0q_u32(MSG2, MSG3);
880 TMP2 = STATE0;
881 TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x0c]));
882 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
883 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
884 MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
885
886 // Rounds 12-15
887 MSG3 = vsha256su0q_u32(MSG3, MSG0);
888 TMP2 = STATE0;
889 TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x10]));
890 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
891 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
892 MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
893
894 // Rounds 16-19
895 MSG0 = vsha256su0q_u32(MSG0, MSG1);
896 TMP2 = STATE0;
897 TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x14]));
898 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
899 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
900 MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
901
902 // Rounds 20-23
903 MSG1 = vsha256su0q_u32(MSG1, MSG2);
904 TMP2 = STATE0;
905 TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x18]));
906 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
907 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
908 MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
909
910 // Rounds 24-27
911 MSG2 = vsha256su0q_u32(MSG2, MSG3);
912 TMP2 = STATE0;
913 TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x1c]));
914 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
915 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
916 MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
917
918 // Rounds 28-31
919 MSG3 = vsha256su0q_u32(MSG3, MSG0);
920 TMP2 = STATE0;
921 TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x20]));
922 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
923 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
924 MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
925
926 // Rounds 32-35
927 MSG0 = vsha256su0q_u32(MSG0, MSG1);
928 TMP2 = STATE0;
929 TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x24]));
930 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
931 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
932 MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
933
934 // Rounds 36-39
935 MSG1 = vsha256su0q_u32(MSG1, MSG2);
936 TMP2 = STATE0;
937 TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x28]));
938 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
939 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
940 MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
941
942 // Rounds 40-43
943 MSG2 = vsha256su0q_u32(MSG2, MSG3);
944 TMP2 = STATE0;
945 TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x2c]));
946 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
947 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
948 MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
949
950 // Rounds 44-47
951 MSG3 = vsha256su0q_u32(MSG3, MSG0);
952 TMP2 = STATE0;
953 TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x30]));
954 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
955 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
956 MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
957
958 // Rounds 48-51
959 TMP2 = STATE0;
960 TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x34]));
961 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
962 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
963
964 // Rounds 52-55
965 TMP2 = STATE0;
966 TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x38]));
967 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
968 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
969
970 // Rounds 56-59
971 TMP2 = STATE0;
972 TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x3c]));
973 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
974 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
975
976 // Rounds 60-63
977 TMP2 = STATE0;
978 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
979 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
980
981 // Add back to state
982 STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
983 STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
984
985 data += SHA256::BLOCKSIZE/sizeof(word32);
986 length -= SHA256::BLOCKSIZE;
987 }
988
989 // Save state
990 vst1q_u32(&state[0], STATE0);
991 vst1q_u32(&state[4], STATE1);
992}
993#endif // CRYPTOPP_ARM_SHA2_AVAILABLE
994
995///////////////////////////////////////////////////////////
996// end of Walton, Schneiders, O'Rourke and Hovsmith code //
997///////////////////////////////////////////////////////////
998
999// ***************** Power8 SHA ********************
1000
1001//////////////////////////////////////////////////
1002// start Gustavo, Serra, Scalet and Walton code //
1003//////////////////////////////////////////////////
1004
1005#if CRYPTOPP_POWER8_SHA_AVAILABLE
1006
1007// Indexes into the S[] array
1008enum {A=0, B=1, C, D, E, F, G, H};
1009
1010inline
1011uint32x4_p VecLoad32(const word32* data, int offset)
1012{
1013#if (CRYPTOPP_LITTLE_ENDIAN)
1014 const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
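    // On little-endian targets the permute below fixes the byte order so each
    // 32-bit message word is read as the big-endian value SHA-256 expects.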
1015 const uint32x4_p val = VecLoad(offset, data);
1016 return (uint32x4_p)VecPermute(val, val, mask);
1017#else
1018 return VecLoad(offset, data);
1019#endif
1020}
1021
1022template<class T> inline
1023void VecStore32(const T data, word32 dest[4])
1024{
1025 VecStore(data, dest);
1026}
1027
1028inline
1029uint32x4_p VectorCh(const uint32x4_p x, const uint32x4_p y, const uint32x4_p z)
1030{
1031 // The trick below is due to Andy Polyakov and Jack Lloyd
1032 return vec_sel(z,y,x);
1033}
1034
1035inline
1036uint32x4_p VectorMaj(const uint32x4_p x, const uint32x4_p y, const uint32x4_p z)
1037{
1038 // The trick below is due to Andy Polyakov and Jack Lloyd
1039 return vec_sel(y, z, VecXor(x, y));
1040}
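// Why the vec_sel() trick works:
//   Ch(x,y,z)  = (x & y) ^ (~x & z)          -> vec_sel(z, y, x) picks y where
//                                               x is 1 and z where x is 0.
//   Maj(x,y,z) = (x & y) ^ (x & z) ^ (y & z) -> vec_sel(y, z, x ^ y): where x
//                                               and y agree the result is y,
//                                               otherwise z breaks the tie.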
1041
1042inline
1043uint32x4_p Vector_sigma0(const uint32x4_p val)
1044{
1045 return VecSHA256<0,0>(val);
1046}
1047
1048inline
1049uint32x4_p Vector_sigma1(const uint32x4_p val)
1050{
1051 return VecSHA256<0,0xf>(val);
1052}
1053
1054inline
1055uint32x4_p VectorSigma0(const uint32x4_p val)
1056{
1057 return VecSHA256<1,0>(val);
1058}
1059
1060inline
1061uint32x4_p VectorSigma1(const uint32x4_p val)
1062{
1063 return VecSHA256<1,0xf>(val);
1064}
1065
1066inline
1067uint32x4_p VectorPack(const uint32x4_p a, const uint32x4_p b,
1068 const uint32x4_p c, const uint32x4_p d)
1069{
1070 const uint8x16_p m1 = {0,1,2,3, 16,17,18,19, 0,0,0,0, 0,0,0,0};
1071 const uint8x16_p m2 = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
1072 return VecPermute(VecPermute(a,b,m1), VecPermute(c,d,m1), m2);
1073}
1074
1075template <unsigned int R> inline
1076void SHA256_ROUND1(uint32x4_p W[16], uint32x4_p S[8], const uint32x4_p K, const uint32x4_p M)
1077{
1078 uint32x4_p T1, T2;
1079
1080 W[R] = M;
1081 T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
1082 T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1083
1084 S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1085 S[E] = S[D] + T1;
1086 S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1087 S[A] = T1 + T2;
1088}
1089
1090template <unsigned int R> inline
1091void SHA256_ROUND2(uint32x4_p W[16], uint32x4_p S[8], const uint32x4_p K)
1092{
1093 // Indexes into the W[] array
1094 enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};
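    // W[] is a 16-entry ring buffer for the message schedule
    //   W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16].
    // With R = t mod 16: IDX14 -> t-2, IDX9 -> t-7, IDX1 -> t-15, IDX0 -> t-16.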
1095
1096 const uint32x4_p s0 = Vector_sigma0(W[IDX1]);
1097 const uint32x4_p s1 = Vector_sigma1(W[IDX14]);
1098
1099 uint32x4_p T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
1100 T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
1101 uint32x4_p T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1102
1103 S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1104 S[E] = S[D] + T1;
1105 S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1106 S[A] = T1 + T2;
1107}
1108
1109void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t length, ByteOrder order)
1110{
1111 CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
1112 CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
1113 CRYPTOPP_UNUSED(order);
1114
1115 const uint32_t* k = reinterpret_cast<const uint32_t*>(SHA256_K);
1116 const uint32_t* m = reinterpret_cast<const uint32_t*>(data);
1117
1118 uint32x4_p abcd = VecLoad(state+0);
1119 uint32x4_p efgh = VecLoad(state+4);
1120 uint32x4_p W[16], S[8], vm, vk;
1121
1122 size_t blocks = length / SHA256::BLOCKSIZE;
1123 while (blocks--)
1124 {
1125 unsigned int offset=0;
1126
1127 S[A] = abcd; S[E] = efgh;
1128 S[B] = VecShiftLeftOctet<4>(S[A]);
1129 S[F] = VecShiftLeftOctet<4>(S[E]);
1130 S[C] = VecShiftLeftOctet<4>(S[B]);
1131 S[G] = VecShiftLeftOctet<4>(S[F]);
1132 S[D] = VecShiftLeftOctet<4>(S[C]);
1133 S[H] = VecShiftLeftOctet<4>(S[G]);
1134
1135 // Rounds 0-15
1136 vk = VecLoad(offset, k);
1137 vm = VecLoad32(m, offset);
1138 SHA256_ROUND1<0>(W,S, vk,vm);
1139 offset+=16;
1140
1141 vk = VecShiftLeftOctet<4>(vk);
1142 vm = VecShiftLeftOctet<4>(vm);
1143 SHA256_ROUND1<1>(W,S, vk,vm);
1144
1145 vk = VecShiftLeftOctet<4>(vk);
1146 vm = VecShiftLeftOctet<4>(vm);
1147 SHA256_ROUND1<2>(W,S, vk,vm);
1148
1149 vk = VecShiftLeftOctet<4>(vk);
1150 vm = VecShiftLeftOctet<4>(vm);
1151 SHA256_ROUND1<3>(W,S, vk,vm);
1152
1153 vk = VecLoad(offset, k);
1154 vm = VecLoad32(m, offset);
1155 SHA256_ROUND1<4>(W,S, vk,vm);
1156 offset+=16;
1157
1158 vk = VecShiftLeftOctet<4>(vk);
1159 vm = VecShiftLeftOctet<4>(vm);
1160 SHA256_ROUND1<5>(W,S, vk,vm);
1161
1162 vk = VecShiftLeftOctet<4>(vk);
1163 vm = VecShiftLeftOctet<4>(vm);
1164 SHA256_ROUND1<6>(W,S, vk,vm);
1165
1166 vk = VecShiftLeftOctet<4>(vk);
1167 vm = VecShiftLeftOctet<4>(vm);
1168 SHA256_ROUND1<7>(W,S, vk,vm);
1169
1170 vk = VecLoad(offset, k);
1171 vm = VecLoad32(m, offset);
1172 SHA256_ROUND1<8>(W,S, vk,vm);
1173 offset+=16;
1174
1175 vk = VecShiftLeftOctet<4>(vk);
1176 vm = VecShiftLeftOctet<4>(vm);
1177 SHA256_ROUND1<9>(W,S, vk,vm);
1178
1179 vk = VecShiftLeftOctet<4>(vk);
1180 vm = VecShiftLeftOctet<4>(vm);
1181 SHA256_ROUND1<10>(W,S, vk,vm);
1182
1183 vk = VecShiftLeftOctet<4>(vk);
1184 vm = VecShiftLeftOctet<4>(vm);
1185 SHA256_ROUND1<11>(W,S, vk,vm);
1186
1187 vk = VecLoad(offset, k);
1188 vm = VecLoad32(m, offset);
1189 SHA256_ROUND1<12>(W,S, vk,vm);
1190 offset+=16;
1191
1192 vk = VecShiftLeftOctet<4>(vk);
1193 vm = VecShiftLeftOctet<4>(vm);
1194 SHA256_ROUND1<13>(W,S, vk,vm);
1195
1196 vk = VecShiftLeftOctet<4>(vk);
1197 vm = VecShiftLeftOctet<4>(vm);
1198 SHA256_ROUND1<14>(W,S, vk,vm);
1199
1200 vk = VecShiftLeftOctet<4>(vk);
1201 vm = VecShiftLeftOctet<4>(vm);
1202 SHA256_ROUND1<15>(W,S, vk,vm);
1203
1204 m += 16; // 32-bit words, not bytes
1205
1206 // Rounds 16-63
1207 for (unsigned int i=16; i<64; i+=16)
1208 {
1209 vk = VecLoad(offset, k);
1210 SHA256_ROUND2<0>(W,S, vk);
1211 SHA256_ROUND2<1>(W,S, VecShiftLeftOctet<4>(vk));
1212 SHA256_ROUND2<2>(W,S, VecShiftLeftOctet<8>(vk));
1213 SHA256_ROUND2<3>(W,S, VecShiftLeftOctet<12>(vk));
1214 offset+=16;
1215
1216 vk = VecLoad(offset, k);
1217 SHA256_ROUND2<4>(W,S, vk);
1218 SHA256_ROUND2<5>(W,S, VecShiftLeftOctet<4>(vk));
1219 SHA256_ROUND2<6>(W,S, VecShiftLeftOctet<8>(vk));
1220 SHA256_ROUND2<7>(W,S, VecShiftLeftOctet<12>(vk));
1221 offset+=16;
1222
1223 vk = VecLoad(offset, k);
1224 SHA256_ROUND2<8>(W,S, vk);
1225 SHA256_ROUND2<9>(W,S, VecShiftLeftOctet<4>(vk));
1226 SHA256_ROUND2<10>(W,S, VecShiftLeftOctet<8>(vk));
1227 SHA256_ROUND2<11>(W,S, VecShiftLeftOctet<12>(vk));
1228 offset+=16;
1229
1230 vk = VecLoad(offset, k);
1231 SHA256_ROUND2<12>(W,S, vk);
1232 SHA256_ROUND2<13>(W,S, VecShiftLeftOctet<4>(vk));
1233 SHA256_ROUND2<14>(W,S, VecShiftLeftOctet<8>(vk));
1234 SHA256_ROUND2<15>(W,S, VecShiftLeftOctet<12>(vk));
1235 offset+=16;
1236 }
1237
1238 abcd += VectorPack(S[A],S[B],S[C],S[D]);
1239 efgh += VectorPack(S[E],S[F],S[G],S[H]);
1240 }
1241
1242 VecStore32(abcd, state+0);
1243 VecStore32(efgh, state+4);
1244}
1245
1246inline
1247void VecStore64(const uint64x2_p val, word64* data)
1248{
1249 VecStore(val, data);
1250}
1251
1252inline
1253uint64x2_p VecLoad64(const word64* data, int offset)
1254{
1255#if (CRYPTOPP_LITTLE_ENDIAN)
1256 const uint8x16_p mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
1257 return VecPermute(VecLoad(offset, data), mask);
1258#else
1259 return VecLoad(offset, data);
1260#endif
1261}
1262
1263inline
1264uint64x2_p VectorCh(const uint64x2_p x, const uint64x2_p y, const uint64x2_p z)
1265{
1266 // The trick below is due to Andy Polyakov and Jack Lloyd
1267 return vec_sel(z,y,x);
1268}
1269
1270inline
1271uint64x2_p VectorMaj(const uint64x2_p x, const uint64x2_p y, const uint64x2_p z)
1272{
1273 // The trick below is due to Andy Polyakov and Jack Lloyd
1274 return vec_sel(y, z, VecXor(x, y));
1275}
1276
1277inline
1278uint64x2_p Vector_sigma0(const uint64x2_p val)
1279{
1280 return VecSHA512<0,0>(val);
1281}
1282
1283inline
1284uint64x2_p Vector_sigma1(const uint64x2_p val)
1285{
1286 return VecSHA512<0,0xf>(val);
1287}
1288
1289inline
1290uint64x2_p VectorSigma0(const uint64x2_p val)
1291{
1292 return VecSHA512<1,0>(val);
1293}
1294
1295inline
1296uint64x2_p VectorSigma1(const uint64x2_p val)
1297{
1298 return VecSHA512<1,0xf>(val);
1299}
1300
1301inline
1302uint64x2_p VectorPack(const uint64x2_p x, const uint64x2_p y)
1303{
1304 const uint8x16_p m = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
1305 return VecPermute(x,y,m);
1306}
1307
1308template <unsigned int R> inline
1309void SHA512_ROUND1(uint64x2_p W[16], uint64x2_p S[8], const uint64x2_p K, const uint64x2_p M)
1310{
1311 uint64x2_p T1, T2;
1312
1313 W[R] = M;
1314 T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
1315 T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1316
1317 S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1318 S[E] = S[D] + T1;
1319 S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1320 S[A] = T1 + T2;
1321}
1322
1323template <unsigned int R> inline
1324void SHA512_ROUND2(uint64x2_p W[16], uint64x2_p S[8], const uint64x2_p K)
1325{
1326 // Indexes into the W[] array
1327 enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};
1328
1329 const uint64x2_p s0 = Vector_sigma0(W[IDX1]);
1330 const uint64x2_p s1 = Vector_sigma1(W[IDX14]);
1331
1332 uint64x2_p T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
1333 T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
1334 uint64x2_p T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1335
1336 S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1337 S[E] = S[D] + T1;
1338 S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1339 S[A] = T1 + T2;
1340}
1341
1342void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t length, ByteOrder order)
1343{
1344 CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
1345 CRYPTOPP_ASSERT(length >= SHA512::BLOCKSIZE);
1346 CRYPTOPP_UNUSED(order);
1347
1348 const uint64_t* k = reinterpret_cast<const uint64_t*>(SHA512_K);
1349 const uint64_t* m = reinterpret_cast<const uint64_t*>(data);
1350
1351 uint64x2_p ab = VecLoad(state+0);
1352 uint64x2_p cd = VecLoad(state+2);
1353 uint64x2_p ef = VecLoad(state+4);
1354 uint64x2_p gh = VecLoad(state+6);
1355 uint64x2_p W[16], S[8], vm, vk;
1356
1357 size_t blocks = length / SHA512::BLOCKSIZE;
1358 while (blocks--)
1359 {
1360 unsigned int offset=0;
1361
1362 S[A] = ab; S[C] = cd;
1363 S[E] = ef; S[G] = gh;
1364 S[B] = VecShiftLeftOctet<8>(S[A]);
1365 S[D] = VecShiftLeftOctet<8>(S[C]);
1366 S[F] = VecShiftLeftOctet<8>(S[E]);
1367 S[H] = VecShiftLeftOctet<8>(S[G]);
1368
1369 // Rounds 0-15
1370 vk = VecLoad(offset, k);
1371 vm = VecLoad64(m, offset);
1372 SHA512_ROUND1<0>(W,S, vk,vm);
1373 offset+=16;
1374
1375 vk = VecShiftLeftOctet<8>(vk);
1376 vm = VecShiftLeftOctet<8>(vm);
1377 SHA512_ROUND1<1>(W,S, vk,vm);
1378
1379 vk = VecLoad(offset, k);
1380 vm = VecLoad64(m, offset);
1381 SHA512_ROUND1<2>(W,S, vk,vm);
1382 offset+=16;
1383
1384 vk = VecShiftLeftOctet<8>(vk);
1385 vm = VecShiftLeftOctet<8>(vm);
1386 SHA512_ROUND1<3>(W,S, vk,vm);
1387
1388 vk = VecLoad(offset, k);
1389 vm = VecLoad64(m, offset);
1390 SHA512_ROUND1<4>(W,S, vk,vm);
1391 offset+=16;
1392
1393 vk = VecShiftLeftOctet<8>(vk);
1394 vm = VecShiftLeftOctet<8>(vm);
1395 SHA512_ROUND1<5>(W,S, vk,vm);
1396
1397 vk = VecLoad(offset, k);
1398 vm = VecLoad64(m, offset);
1399 SHA512_ROUND1<6>(W,S, vk,vm);
1400 offset+=16;
1401
1402 vk = VecShiftLeftOctet<8>(vk);
1403 vm = VecShiftLeftOctet<8>(vm);
1404 SHA512_ROUND1<7>(W,S, vk,vm);
1405
1406 vk = VecLoad(offset, k);
1407 vm = VecLoad64(m, offset);
1408 SHA512_ROUND1<8>(W,S, vk,vm);
1409 offset+=16;
1410
1411 vk = VecShiftLeftOctet<8>(vk);
1412 vm = VecShiftLeftOctet<8>(vm);
1413 SHA512_ROUND1<9>(W,S, vk,vm);
1414
1415 vk = VecLoad(offset, k);
1416 vm = VecLoad64(m, offset);
1417 SHA512_ROUND1<10>(W,S, vk,vm);
1418 offset+=16;
1419
1420 vk = VecShiftLeftOctet<8>(vk);
1421 vm = VecShiftLeftOctet<8>(vm);
1422 SHA512_ROUND1<11>(W,S, vk,vm);
1423
1424 vk = VecLoad(offset, k);
1425 vm = VecLoad64(m, offset);
1426 SHA512_ROUND1<12>(W,S, vk,vm);
1427 offset+=16;
1428
1429 vk = VecShiftLeftOctet<8>(vk);
1430 vm = VecShiftLeftOctet<8>(vm);
1431 SHA512_ROUND1<13>(W,S, vk,vm);
1432
1433 vk = VecLoad(offset, k);
1434 vm = VecLoad64(m, offset);
1435 SHA512_ROUND1<14>(W,S, vk,vm);
1436 offset+=16;
1437
1438 vk = VecShiftLeftOctet<8>(vk);
1439 vm = VecShiftLeftOctet<8>(vm);
1440 SHA512_ROUND1<15>(W,S, vk,vm);
1441
1442 m += 16; // 64-bit words, not bytes
1443
1444 // Rounds 16-79
1445 for (unsigned int i=16; i<80; i+=16)
1446 {
1447 vk = VecLoad(offset, k);
1448 SHA512_ROUND2<0>(W,S, vk);
1449 SHA512_ROUND2<1>(W,S, VecShiftLeftOctet<8>(vk));
1450 offset+=16;
1451
1452 vk = VecLoad(offset, k);
1453 SHA512_ROUND2<2>(W,S, vk);
1454 SHA512_ROUND2<3>(W,S, VecShiftLeftOctet<8>(vk));
1455 offset+=16;
1456
1457 vk = VecLoad(offset, k);
1458 SHA512_ROUND2<4>(W,S, vk);
1459 SHA512_ROUND2<5>(W,S, VecShiftLeftOctet<8>(vk));
1460 offset+=16;
1461
1462 vk = VecLoad(offset, k);
1463 SHA512_ROUND2<6>(W,S, vk);
1464 SHA512_ROUND2<7>(W,S, VecShiftLeftOctet<8>(vk));
1465 offset+=16;
1466
1467 vk = VecLoad(offset, k);
1468 SHA512_ROUND2<8>(W,S, vk);
1469 SHA512_ROUND2<9>(W,S, VecShiftLeftOctet<8>(vk));
1470 offset+=16;
1471
1472 vk = VecLoad(offset, k);
1473 SHA512_ROUND2<10>(W,S, vk);
1474 SHA512_ROUND2<11>(W,S, VecShiftLeftOctet<8>(vk));
1475 offset+=16;
1476
1477 vk = VecLoad(offset, k);
1478 SHA512_ROUND2<12>(W,S, vk);
1479 SHA512_ROUND2<13>(W,S, VecShiftLeftOctet<8>(vk));
1480 offset+=16;
1481
1482 vk = VecLoad(offset, k);
1483 SHA512_ROUND2<14>(W,S, vk);
1484 SHA512_ROUND2<15>(W,S, VecShiftLeftOctet<8>(vk));
1485 offset+=16;
1486 }
1487
1488 ab += VectorPack(S[A],S[B]);
1489 cd += VectorPack(S[C],S[D]);
1490 ef += VectorPack(S[E],S[F]);
1491 gh += VectorPack(S[G],S[H]);
1492 }
1493
1494 VecStore64(ab, state+0);
1495 VecStore64(cd, state+2);
1496 VecStore64(ef, state+4);
1497 VecStore64(gh, state+6);
1498}
1499
1500#endif // CRYPTOPP_POWER8_SHA_AVAILABLE
1501
1502////////////////////////////////////////////////
1503// end Gustavo, Serra, Scalet and Walton code //
1504////////////////////////////////////////////////
1505
1506NAMESPACE_END