Crypto++ 8.7
Free C++ class library of cryptographic schemes
adv_simd.h
// adv_simd.h - written and placed in the public domain by Jeffrey Walton

/// \file adv_simd.h
/// \brief Template for AdvancedProcessBlocks and SIMD processing

// The SIMD based implementations for ciphers that use SSE, NEON and Power7
// have a common pattern. Namely, they have a specialized implementation of
// AdvancedProcessBlocks which processes multiple blocks using hardware
// acceleration. After several implementations we noticed a lot of copy and
// paste occurring. adv_simd.h provides a template to avoid the copy and paste.
//
// There are 6 templates provided in this file. The number following the
// function name, 128, is the block size in bits. The name following the
// block size is the arrangement and acceleration. For example 4x1_SSE means
// Intel SSE using two encrypt (or decrypt) functions: one that operates on
// 4 SIMD words, and one that operates on 1 SIMD word.
//
//  * AdvancedProcessBlocks128_4x1_SSE
//  * AdvancedProcessBlocks128_6x2_SSE
//  * AdvancedProcessBlocks128_4x1_NEON
//  * AdvancedProcessBlocks128_6x1_NEON
//  * AdvancedProcessBlocks128_4x1_ALTIVEC
//  * AdvancedProcessBlocks128_6x1_ALTIVEC
//
// If an arrangement ends in 2, like 6x2, then the template will handle the
// single block case by padding with 0's and using the two SIMD word
// function. This happens at most one time when processing multiple blocks.
// The extra processing of a zero block is trivial and worth the tradeoff.
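//
// For illustration only, a cipher typically wires its specialized
// AdvancedProcessBlocks into one of these templates as shown below. The
// cipher name XYZ128 and its block functions are hypothetical:
//
//   size_t XYZ128_Enc_AdvancedProcessBlocks(const word32 *subKeys,
//       size_t rounds, const byte *inBlocks, const byte *xorBlocks,
//       byte *outBlocks, size_t length, word32 flags)
//   {
//       return AdvancedProcessBlocks128_6x2_SSE(XYZ128_Enc_Block,
//           XYZ128_Enc_6_Blocks, subKeys, rounds, inBlocks, xorBlocks,
//           outBlocks, length, flags);
//   }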
//
// The MAYBE_CONST macro present on x86 is a SunCC workaround. Some versions
// of SunCC lose/drop the const-ness in the F1 and F4 functions. It eventually
// results in a failed link due to the const/non-const mismatch.
//
// In July 2020 the library stopped using the 64-bit block version of
// AdvancedProcessBlocks. Testing showed unreliable results and failed
// self tests on occasion. Also see Issue 945 and
// https://github.com/weidai11/cryptopp/commit/dd7598e638bb.

#ifndef CRYPTOPP_ADVANCED_SIMD_TEMPLATES
#define CRYPTOPP_ADVANCED_SIMD_TEMPLATES

#include "config.h"
#include "misc.h"
#include "stdcpp.h"

#if (CRYPTOPP_ARM_NEON_HEADER)
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_HEADER)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)
# include <emmintrin.h>
# include <xmmintrin.h>
#endif

// SunCC needs CRYPTOPP_SSSE3_AVAILABLE, too
#if (CRYPTOPP_SSSE3_AVAILABLE)
# include <emmintrin.h>
# include <pmmintrin.h>
# include <xmmintrin.h>
#endif

#if defined(__ALTIVEC__)
# include "ppc_simd.h"
#endif

// ************************ All block ciphers *********************** //

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::BlockTransformation;

CRYPTOPP_CONSTANT(BT_XorInput = BlockTransformation::BT_XorInput);
CRYPTOPP_CONSTANT(BT_AllowParallel = BlockTransformation::BT_AllowParallel);
CRYPTOPP_CONSTANT(BT_InBlockIsCounter = BlockTransformation::BT_InBlockIsCounter);
CRYPTOPP_CONSTANT(BT_ReverseDirection = BlockTransformation::BT_ReverseDirection);
CRYPTOPP_CONSTANT(BT_DontIncrementInOutPointers = BlockTransformation::BT_DontIncrementInOutPointers);

ANONYMOUS_NAMESPACE_END

// *************************** ARM NEON ************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE) || (CRYPTOPP_ARM_ASIMD_AVAILABLE) || \
    defined(CRYPTOPP_DOXYGEN_PROCESSING)
NAMESPACE_BEGIN(CryptoPP)

/// \brief AdvancedProcessBlocks for 1 and 6 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x1_NEON processes 6 and 1 NEON SIMD words
/// at a time.
/// \details The subkey type is usually word32 or word64. F1 and F6 must use the
/// same word type.
template <typename F1, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x1_NEON(F1 func1, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

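    // Increment of 1 in big-endian compatible with the ctr byte array.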
    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
    const uint32x4_t s_one = vld1q_u32(w_one);

    const size_t blockSize = 16;
    // const size_t neonBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint64x2_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                block2 = vaddq_u64(block1, one);
                block3 = vaddq_u64(block2, one);
                block4 = vaddq_u64(block3, one);
                block5 = vaddq_u64(block4, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block5, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint64x2_t block;
        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}
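
// A minimal sketch of the callback shapes these NEON templates expect (the
// cipher name XYZ128 is hypothetical). Blocks are passed by reference and
// transformed in place, with the subkey table and round count last:
//
//   inline void XYZ128_Enc_Block(uint64x2_t &block,
//       const word64 *subkeys, unsigned int rounds);
//   inline void XYZ128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
//       uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4,
//       uint64x2_t &block5, const word64 *subkeys, unsigned int rounds);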

/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_4x1_NEON processes 4 and 1 NEON SIMD words
/// at a time.
/// \details The subkey type is usually word32 or word64. The vector type is
/// usually uint32x4_t or uint64x2_t. F1, F4, and W must use the same word and
/// vector type.
template <typename F1, typename F4, typename W>
inline size_t AdvancedProcessBlocks128_4x1_NEON(F1 func1, F4 func4,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

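    // Increment of 1 in big-endian compatible with the ctr byte array.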
    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
    const uint32x4_t s_one = vld1q_u32(w_one);

    const size_t blockSize = 16;
    // const size_t neonBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            uint32x4_t block0, block1, block2, block3;
            if (flags & BT_InBlockIsCounter)
            {
                const uint32x4_t one = s_one;
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                block1 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block0), vreinterpretq_u64_u32(one)));
                block2 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block1), vreinterpretq_u64_u32(one)));
                block3 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block2), vreinterpretq_u64_u32(one)));
                vst1q_u8(const_cast<byte*>(inBlocks), vreinterpretq_u8_u64(vaddq_u64(
                    vreinterpretq_u64_u32(block3), vreinterpretq_u64_u32(one))));
            }
            else
            {
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint32x4_t block = vreinterpretq_u32_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u32(block, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u32(block, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u32(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 128-bit blocks
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x2_NEON processes 6 and 2 NEON SIMD words
/// at a time. For a single block the template uses F2 with a zero block.
/// \details The subkey type is usually word32 or word64. F2 and F6 must use the
/// same word type.
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

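    // Increment of 1 in big-endian compatible with the ctr byte array.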
    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
    const uint32x4_t s_one = vld1q_u32(w_one);

    const size_t blockSize = 16;
    // const size_t neonBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint64x2_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                block2 = vaddq_u64(block1, one);
                block3 = vaddq_u64(block2, one);
                block4 = vaddq_u64(block3, one);
                block5 = vaddq_u64(block4, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block5, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }

        while (length >= 2*blockSize)
        {
            uint64x2_t block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block1, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint64x2_t block, zero = {0,0};
        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

NAMESPACE_END // CryptoPP

#endif // CRYPTOPP_ARM_NEON_AVAILABLE

// *************************** Intel SSE ************************** //

#if defined(CRYPTOPP_SSSE3_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)

#if defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief SunCC workaround
/// \details SunCC loses the const on AES_Enc_Block and AES_Dec_Block
/// \sa <A HREF="http://github.com/weidai11/cryptopp/issues/224">Issue
/// 224, SunCC and failed compile for rijndael.cpp</A>
# define MAYBE_CONST const
/// \brief SunCC workaround
/// \details SunCC loses the const on AES_Enc_Block and AES_Dec_Block
/// \sa <A HREF="http://github.com/weidai11/cryptopp/issues/224">Issue
/// 224, SunCC and failed compile for rijndael.cpp</A>
# define MAYBE_UNCONST_CAST(T, x) (x)
#elif (__SUNPRO_CC >= 0x5130)
# define MAYBE_CONST
# define MAYBE_UNCONST_CAST(T, x) const_cast<MAYBE_CONST T>(x)
#else
# define MAYBE_CONST const
# define MAYBE_UNCONST_CAST(T, x) (x)
#endif
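
// For example, when __SUNPRO_CC >= 0x5130 the macros expand so that
// MAYBE_CONST W *subKeys declares a non-const pointer and
// MAYBE_UNCONST_CAST(W*, x) performs const_cast<W*>(x). Everywhere else
// MAYBE_CONST expands to const and MAYBE_UNCONST_CAST is a no-op.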

#if defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Clang workaround
/// \details Clang issues spurious alignment warnings
/// \sa <A HREF="http://bugs.llvm.org/show_bug.cgi?id=20670">Issue
/// 20670, _mm_loadu_si128 parameter has wrong type</A>
# define M128_CAST(x) ((__m128i *)(void *)(x))
/// \brief Clang workaround
/// \details Clang issues spurious alignment warnings
/// \sa <A HREF="http://bugs.llvm.org/show_bug.cgi?id=20670">Issue
/// 20670, _mm_loadu_si128 parameter has wrong type</A>
# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
#else
# ifndef M128_CAST
#  define M128_CAST(x) ((__m128i *)(void *)(x))
# endif
# ifndef CONST_M128_CAST
#  define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
# endif
#endif

NAMESPACE_BEGIN(CryptoPP)

/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 128-bit blocks
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x2_SSE processes 6 and 2 SSE SIMD words
/// at a time. For a single block the template uses F2 with a zero block.
/// \details The subkey type is usually word32 or word64. F2 and F6 must use the
/// same word type.
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
    MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const size_t blockSize = 16;
    // const size_t xmmBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            __m128i block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, s_one);
                block2 = _mm_add_epi32(block1, s_one);
                block3 = _mm_add_epi32(block2, s_one);
                block4 = _mm_add_epi32(block3, s_one);
                block5 = _mm_add_epi32(block4, s_one);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, s_one));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block4);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block5);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }

        while (length >= 2*blockSize)
        {
            __m128i block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, s_one);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, s_one));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*blockSize;
        }
    }

    while (length >= blockSize)
    {
        __m128i block, zero = _mm_setzero_si128();
        block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));

        if (xorInput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        _mm_storeu_si128(M128_CAST(outBlocks), block);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_4x1_SSE processes 4 and 1 SSE SIMD words
/// at a time.
/// \details The subkey type is usually word32 or word64. F1 and F4 must use the
/// same word type.
template <typename F1, typename F4, typename W>
inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4,
    MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const size_t blockSize = 16;
    // const size_t xmmBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            __m128i block0, block1, block2, block3;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, s_one);
                block2 = _mm_add_epi32(block1, s_one);
                block3 = _mm_add_epi32(block2, s_one);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, s_one));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        __m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));

        if (xorInput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        _mm_storeu_si128(M128_CAST(outBlocks), block);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

NAMESPACE_END // CryptoPP

#endif // CRYPTOPP_SSSE3_AVAILABLE

// ************************** Altivec/Power 4 ************************** //

#if defined(__ALTIVEC__) || defined(CRYPTOPP_DOXYGEN_PROCESSING)

NAMESPACE_BEGIN(CryptoPP)

/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_4x1_ALTIVEC processes 4 and 1 Altivec SIMD words
/// at a time.
/// \details The subkey type is usually word32 or word64. F1 and F4 must use the
/// same word type.
template <typename F1, typename F4, typename W>
inline size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_p s_one = {1,0,0,0};
#else
    const uint32x4_p s_one = {0,0,0,1};
#endif
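    // The two initializers above differ because vector element order depends
    // on machine endianness; in both cases the +1 lands in the lane holding
    // the low-order bytes of the big-endian counter loaded by VecLoadBE.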

    const size_t blockSize = 16;
    // const size_t simdBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            uint32x4_p block0, block1, block2, block3;

            if (flags & BT_InBlockIsCounter)
            {
                block0 = VecLoadBE(inBlocks);
                block1 = VecAdd(block0, s_one);
                block2 = VecAdd(block1, s_one);
                block3 = VecAdd(block2, s_one);

                // Hack due to big-endian loads used by POWER8 (and maybe ARM-BE).
                // CTR_ModePolicy::OperateKeystream is wired such that after
                // returning from this function CTR_ModePolicy will detect wrap
                // on the last counter byte and increment the next to last byte.
                // The problem is, with a big-endian load, inBlocks[15] is really
                // located at index 15. The vector addition using a 32-bit element
                // generates a carry into inBlocks[14] and then CTR_ModePolicy
                // increments inBlocks[14] too.
                const_cast<byte*>(inBlocks)[15] += 6;
            }
            else
            {
                block0 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func4(block0, block1, block2, block3, subKeys, rounds);

            if (xorOutput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            VecStoreBE(block0, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block1, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block2, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block3, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint32x4_p block = VecLoadBE(inBlocks);

        if (xorInput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, rounds);

        if (xorOutput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        VecStoreBE(block, outBlocks);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

/// \brief AdvancedProcessBlocks for 1 and 6 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x1_ALTIVEC processes 6 and 1 Altivec SIMD words
/// at a time.
/// \details The subkey type is usually word32 or word64. F1 and F6 must use the
/// same word type.
template <typename F1, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_p s_one = {1,0,0,0};
#else
    const uint32x4_p s_one = {0,0,0,1};
#endif

    const size_t blockSize = 16;
    // const size_t simdBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint32x4_p block0, block1, block2, block3, block4, block5;

            if (flags & BT_InBlockIsCounter)
            {
                block0 = VecLoadBE(inBlocks);
                block1 = VecAdd(block0, s_one);
                block2 = VecAdd(block1, s_one);
                block3 = VecAdd(block2, s_one);
                block4 = VecAdd(block3, s_one);
                block5 = VecAdd(block4, s_one);

                // Hack due to big-endian loads used by POWER8 (and maybe ARM-BE).
                // CTR_ModePolicy::OperateKeystream is wired such that after
                // returning from this function CTR_ModePolicy will detect wrap
                // on the last counter byte and increment the next to last byte.
                // The problem is, with a big-endian load, inBlocks[15] is really
                // located at index 15. The vector addition using a 32-bit element
                // generates a carry into inBlocks[14] and then CTR_ModePolicy
                // increments inBlocks[14] too.
                //
                // To find this bug we needed a test case with a ctr of 0xNN...FA.
                // The last octet is 0xFA and adding 6 creates the wrap that
                // triggers the issue. If the last octet were 0xFC then 4 would
                // trigger it. We dumb-lucked into the test with SPECK-128. The
                // test case of interest is the one with IV
                // 348ECA9766C09F04 826520DE47A212FA.
                uint8x16_p temp = VecAdd((uint8x16_p)block5, (uint8x16_p)s_one);
                VecStoreBE(temp, const_cast<byte*>(inBlocks));
            }
            else
            {
                block0 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = VecXor(block4, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = VecXor(block5, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);

            if (xorOutput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = VecXor(block4, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = VecXor(block5, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            VecStoreBE(block0, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block1, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block2, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block3, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block4, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block5, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint32x4_p block = VecLoadBE(inBlocks);

        if (xorInput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, rounds);

        if (xorOutput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        VecStoreBE(block, outBlocks);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

NAMESPACE_END // CryptoPP

#endif // __ALTIVEC__

#endif // CRYPTOPP_ADVANCED_SIMD_TEMPLATES