19#if (CRYPTOPP_SSSE3_AVAILABLE)
21# include <pmmintrin.h>
22# include <tmmintrin.h>
26# include <ammintrin.h>
28# include <x86intrin.h>
32#if (CRYPTOPP_ARM_NEON_HEADER)
37#if (CRYPTOPP_ARM_ACLE_HEADER)
46#if (CRYPTOPP_ALTIVEC_AVAILABLE)
52extern const char SIMON128_SIMD_FNAME[] = __FILE__;
54ANONYMOUS_NAMESPACE_BEGIN
63#if (CRYPTOPP_ARM_NEON_AVAILABLE)
66#if defined(_MSC_VER) && !defined(_M_ARM64)
// Local definition of vld1q_dup_u64, placed under the MSVC / non-ARM64 guard
// on the preceding preprocessor line. Reads the 64-bit value at ptr and
// returns a vector holding that value in both lanes (vmovq_n_u64).
67inline uint64x2_t vld1q_dup_u64(
const uint64_t* ptr)
69 return vmovq_n_u64(*ptr);
// Builds a vector from the high 64-bit halves of a and b: lane 0 receives the
// high half of a, lane 1 the high half of b. Both inputs are cast to
// uint64x2_t and the combined result is cast back to T.
// NOTE(review): T is a template parameter; its declaration line is not
// present in this listing.
74inline T UnpackHigh64(
const T& a,
const T& b)
76 const uint64x1_t x(vget_high_u64((uint64x2_t)a));
77 const uint64x1_t y(vget_high_u64((uint64x2_t)b));
78 return (T)vcombine_u64(x, y);
// Builds a vector from the low 64-bit halves of a and b: lane 0 receives the
// low half of a, lane 1 the low half of b. Both inputs are cast to
// uint64x2_t and the combined result is cast back to T.
// NOTE(review): T is a template parameter; its declaration line is not
// present in this listing.
82inline T UnpackLow64(
const T& a,
const T& b)
84 const uint64x1_t x(vget_low_u64((uint64x2_t)a));
85 const uint64x1_t y(vget_low_u64((uint64x2_t)b));
86 return (T)vcombine_u64(x, y);
// Rotates each 64-bit lane of val left by R bits, computed as
// (val << R) | (val >> (64 - R)).
// R is a compile-time constant passed as the shift immediate.
// NOTE(review): presumably R must lie in 1..63 for both immediates to be
// valid - confirm against the intrinsic documentation.
89template <
unsigned int R>
90inline uint64x2_t RotateLeft64(
const uint64x2_t& val)
92 const uint64x2_t a(vshlq_n_u64(val, R));
93 const uint64x2_t b(vshrq_n_u64(val, 64 - R));
94 return vorrq_u64(a, b);
// Rotates each 64-bit lane of val right by R bits, computed as
// (val << (64 - R)) | (val >> R).
// R is a compile-time constant passed as the shift immediate.
// NOTE(review): presumably R must lie in 1..63 for both immediates to be
// valid - confirm against the intrinsic documentation.
97template <
unsigned int R>
98inline uint64x2_t RotateRight64(
const uint64x2_t& val)
100 const uint64x2_t a(vshlq_n_u64(val, 64 - R));
101 const uint64x2_t b(vshrq_n_u64(val, R));
102 return vorrq_u64(a, b);
105#if defined(__aarch32__) || defined(__aarch64__)
// Variant of RotateLeft64 for R == 8, placed under the __aarch32__ /
// __aarch64__ guard on the preceding preprocessor line. Instead of two shifts
// and an OR it performs a single byte-table lookup: output byte j is input
// byte maskb[j], so within each 8-byte lane byte 7 moves to position 0 and
// bytes 0..6 move up by one. With little-endian lane layout that is a left
// rotate by 8 bits.
// NOTE(review): the "template <>" line is not present in this listing.
108inline uint64x2_t RotateLeft64<8>(
const uint64x2_t& val)
110 const uint8_t maskb[16] = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
111 const uint8x16_t mask = vld1q_u8(maskb);
113 return vreinterpretq_u64_u8(
114 vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
// Variant of RotateRight64 for R == 8 implemented as a byte-table lookup:
// output byte j is input byte maskb[j], so within each 8-byte lane bytes 1..7
// move down by one and byte 0 moves to position 7. With little-endian lane
// layout that is a right rotate by 8 bits.
// NOTE(review): the "template <>" line is not present in this listing.
119inline uint64x2_t RotateRight64<8>(
const uint64x2_t& val)
121 const uint8_t maskb[16] = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
122 const uint8x16_t mask = vld1q_u8(maskb);
124 return vreinterpretq_u64_u8(
125 vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
// Round function applied independently to each 64-bit lane:
//   f(v) = rotl(v, 2) ^ (rotl(v, 1) & rotl(v, 8))
129inline uint64x2_t SIMON128_f(
const uint64x2_t& val)
131 return veorq_u64(RotateLeft64<2>(val),
132 vandq_u64(RotateLeft64<1>(val), RotateLeft64<8>(val)));
// NEON encryption of the two vectors block0/block1, updated in place.
//   subkeys - round keys, one word64 per round, read at subkeys[i]
//   rounds  - number of rounds
// The high halves of both inputs are gathered into x1 and the low halves
// into y1, so every step processes both inputs at once.
// NOTE(review): for rounds < 2 the loop bound (rounds & ~1) - 1 wraps around
// in size_t and the loop would read far past the key array; callers
// presumably always pass the cipher's real round counts - confirm.
135inline void SIMON128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
136 const word64 *subkeys,
unsigned int rounds)
139 uint64x2_t x1 = UnpackHigh64(block0, block1);
140 uint64x2_t y1 = UnpackLow64(block0, block1);
// Two rounds per iteration: subkeys[i] updates y1, subkeys[i+1] updates x1.
// Each subkey is broadcast to both lanes before use.
142 for (
size_t i = 0; i < static_cast<size_t>(rounds & ~1)-1; i += 2)
144 const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i);
145 y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk1);
147 const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i+1);
148 x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk2);
// Final step with the last subkey. The lines guarding it are not present in
// this listing; given the even loop bound above it presumably runs only when
// rounds is odd - confirm.
153 const uint64x2_t rk = vld1q_dup_u64(subkeys+rounds-1);
155 y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk);
// Re-interleave the halves back into the two output vectors.
160 block0 = UnpackLow64(y1, x1);
161 block1 = UnpackHigh64(y1, x1);
// NEON encryption of six vectors (block0..block5), updated in place.
// Same sequence of steps as the two-vector SIMON128_Enc_Block, applied to
// three (x, y) pairs with each broadcast subkey shared by all three.
//   subkeys - round keys, one word64 per round, read at subkeys[i]
//   rounds  - number of rounds
// NOTE(review): for rounds < 2 the loop bound (rounds & ~1) - 1 wraps around
// in size_t; callers presumably always pass real round counts - confirm.
164inline void SIMON128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
165 uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
166 const word64 *subkeys,
unsigned int rounds)
// Gather high halves into x*, low halves into y*, one pair per two inputs.
169 uint64x2_t x1 = UnpackHigh64(block0, block1);
170 uint64x2_t y1 = UnpackLow64(block0, block1);
171 uint64x2_t x2 = UnpackHigh64(block2, block3);
172 uint64x2_t y2 = UnpackLow64(block2, block3);
173 uint64x2_t x3 = UnpackHigh64(block4, block5);
174 uint64x2_t y3 = UnpackLow64(block4, block5);
// Two rounds per iteration: subkeys[i] updates the y's, subkeys[i+1] the x's.
176 for (
size_t i = 0; i < static_cast<size_t>(rounds & ~1) - 1; i += 2)
178 const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i);
179 y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk1);
180 y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk1);
181 y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk1);
183 const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i+1);
184 x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk2);
185 x2 = veorq_u64(veorq_u64(x2, SIMON128_f(y2)), rk2);
186 x3 = veorq_u64(veorq_u64(x3, SIMON128_f(y3)), rk2);
// Final step with the last subkey. The guarding lines are not present in
// this listing; presumably it runs only when rounds is odd - confirm.
191 const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1);
193 y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk);
194 y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk);
195 y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk);
// Re-interleave the halves back into the six output vectors.
200 block0 = UnpackLow64(y1, x1);
201 block1 = UnpackHigh64(y1, x1);
202 block2 = UnpackLow64(y2, x2);
203 block3 = UnpackHigh64(y2, x2);
204 block4 = UnpackLow64(y3, x3);
205 block5 = UnpackHigh64(y3, x3);
// NEON decryption of the two vectors block0/block1, updated in place.
// Subkeys are consumed from the end of the array towards index 0.
//   subkeys - round keys, one word64 per round, read at subkeys[i]
//   rounds  - number of rounds
208inline void SIMON128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
209 const word64 *subkeys,
unsigned int rounds)
212 uint64x2_t x1 = UnpackHigh64(block0, block1);
213 uint64x2_t y1 = UnpackLow64(block0, block1);
// Step using the last subkey. The lines guarding it (and anything else done
// alongside it) are not present in this listing; presumably it handles an
// odd round count - confirm.
218 const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1);
220 y1 = veorq_u64(veorq_u64(y1, rk), SIMON128_f(x1));
// Two rounds per iteration, walking downwards: subkeys[i+1] updates x1,
// then subkeys[i] updates y1. The signed index lets the loop end once i
// drops below zero.
224 for (
int i =
static_cast<int>(rounds-2); i >= 0; i -= 2)
226 const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i+1);
227 x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk1);
229 const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i);
230 y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk2);
// Re-interleave the halves back into the two output vectors.
234 block0 = UnpackLow64(y1, x1);
235 block1 = UnpackHigh64(y1, x1);
// NEON decryption of six vectors (block0..block5), updated in place.
// Same sequence of steps as the two-vector SIMON128_Dec_Block, applied to
// three (x, y) pairs with each broadcast subkey shared by all three.
//   subkeys - round keys, one word64 per round, read at subkeys[i]
//   rounds  - number of rounds
238inline void SIMON128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
239 uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
240 const word64 *subkeys,
unsigned int rounds)
// Gather high halves into x*, low halves into y*, one pair per two inputs.
243 uint64x2_t x1 = UnpackHigh64(block0, block1);
244 uint64x2_t y1 = UnpackLow64(block0, block1);
245 uint64x2_t x2 = UnpackHigh64(block2, block3);
246 uint64x2_t y2 = UnpackLow64(block2, block3);
247 uint64x2_t x3 = UnpackHigh64(block4, block5);
248 uint64x2_t y3 = UnpackLow64(block4, block5);
// Step using the last subkey. The guarding lines are not present in this
// listing; presumably it handles an odd round count - confirm.
253 const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1);
255 y1 = veorq_u64(veorq_u64(y1, rk), SIMON128_f(x1));
256 y2 = veorq_u64(veorq_u64(y2, rk), SIMON128_f(x2));
257 y3 = veorq_u64(veorq_u64(y3, rk), SIMON128_f(x3));
// Two rounds per iteration, walking downwards: subkeys[i+1] updates the
// x's, then subkeys[i] updates the y's.
261 for (
int i =
static_cast<int>(rounds-2); i >= 0; i -= 2)
263 const uint64x2_t rk1 = vld1q_dup_u64(subkeys + i + 1);
264 x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk1);
265 x2 = veorq_u64(veorq_u64(x2, SIMON128_f(y2)), rk1);
266 x3 = veorq_u64(veorq_u64(x3, SIMON128_f(y3)), rk1);
268 const uint64x2_t rk2 = vld1q_dup_u64(subkeys + i);
269 y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk2);
270 y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk2);
271 y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk2);
// Re-interleave the halves back into the six output vectors.
275 block0 = UnpackLow64(y1, x1);
276 block1 = UnpackHigh64(y1, x1);
277 block2 = UnpackLow64(y2, x2);
278 block3 = UnpackHigh64(y2, x2);
279 block4 = UnpackLow64(y3, x3);
280 block5 = UnpackHigh64(y3, x3);
287#if (CRYPTOPP_SSSE3_AVAILABLE)
// Pointer-cast helpers: convert any pointer to (const) double* by way of
// (const) void*. CONST_DOUBLE_CAST is what the decryption routines in this
// section pass to _mm_loaddup_pd when loading a 64-bit subkey.
// NOTE(review): the guard line for DOUBLE_CAST and the closing #endif lines
// are not present in this listing.
291# define DOUBLE_CAST(x) ((double *)(void *)(x))
293#ifndef CONST_DOUBLE_CAST
294# define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x))
// Takes two __m128i values by mutable reference.
// NOTE(review): the body is not present in this listing - only a SunPro
// compiler-version check is visible. From the name and the call sites it
// presumably exchanges a and b; confirm against the full source.
297inline void Swap128(__m128i& a,__m128i& b)
299#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
// Rotates each 64-bit lane of val left by R bits (R is a compile-time
// constant). Two variants are visible: a single _mm_roti_epi64 rotate, and a
// left-shift-by-R / right-shift-by-(64-R) pair.
// NOTE(review): the preprocessor lines that select between the variants and
// the call that combines the two shifts are not present in this listing.
308template <
unsigned int R>
309inline __m128i RotateLeft64(
const __m128i& val)
312 return _mm_roti_epi64(val, R);
315 _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R));
// Rotates each 64-bit lane of val right by R bits (R is a compile-time
// constant). Two variants are visible: a single _mm_roti_epi64 call with a
// count of 64-R (a right rotate expressed as a left rotate), and a
// left-shift-by-(64-R) / right-shift-by-R pair.
// NOTE(review): the preprocessor lines that select between the variants and
// the call that combines the two shifts are not present in this listing.
319template <
unsigned int R>
320inline __m128i RotateRight64(
const __m128i& val)
323 return _mm_roti_epi64(val, 64-R);
326 _mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R));
// Variant of RotateLeft64 for R == 8. Besides the _mm_roti_epi64 form, it
// uses a byte shuffle: _mm_set_epi8 lists the selector for byte 15 first and
// byte 0 last, so within each 8-byte lane output byte 0 takes input byte 7
// and output bytes 1..7 take input bytes 0..6 - a left rotate by 8 bits.
// NOTE(review): the "template <>" line and the preprocessor lines choosing
// between the two forms are not present in this listing.
332__m128i RotateLeft64<8>(
const __m128i& val)
335 return _mm_roti_epi64(val, 8);
337 const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7);
338 return _mm_shuffle_epi8(val, mask);
// Variant of RotateRight64 for R == 8. Besides the _mm_roti_epi64 form
// (count 64-8), it uses a byte shuffle: _mm_set_epi8 lists the selector for
// byte 15 first and byte 0 last, so within each 8-byte lane output bytes 0..6
// take input bytes 1..7 and output byte 7 takes input byte 0 - a right
// rotate by 8 bits.
// NOTE(review): the "template <>" line and the preprocessor lines choosing
// between the two forms are not present in this listing.
344__m128i RotateRight64<8>(
const __m128i& val)
347 return _mm_roti_epi64(val, 64-8);
349 const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1);
350 return _mm_shuffle_epi8(val, mask);
// Round function applied independently to each 64-bit lane:
//   f(v) = rotl(v, 2) ^ (rotl(v, 1) & rotl(v, 8))
354inline __m128i SIMON128_f(
const __m128i& v)
356 return _mm_xor_si128(RotateLeft64<2>(v),
357 _mm_and_si128(RotateLeft64<1>(v), RotateLeft64<8>(v)));
// SSE encryption of the two vectors block0/block1, updated in place.
//   subkeys - round-key array; the visible load indexes it at
//             subkeys + (rounds-1)*2, i.e. two word64s (16 bytes) per round
//   rounds  - number of rounds
// NOTE(review): presumably each round key is stored duplicated in both
// 64-bit lanes for this path - confirm against the key schedule.
// NOTE(review): for rounds < 2 the loop bound (rounds & ~1) - 1 wraps around
// in size_t; callers presumably always pass real round counts - confirm.
360inline void SIMON128_Enc_Block(__m128i &block0, __m128i &block1,
361 const word64 *subkeys,
unsigned int rounds)
// High halves of both inputs go to x1, low halves to y1.
364 __m128i x1 = _mm_unpackhi_epi64(block0, block1);
365 __m128i y1 = _mm_unpacklo_epi64(block0, block1);
// Two rounds per iteration. The statements defining rk1 and rk2 are not
// present in this listing.
367 for (
size_t i = 0; i < static_cast<size_t>(rounds & ~1)-1; i += 2)
371 y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1);
374 x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2);
// Final step with the last round key, fetched with an aligned 128-bit load
// (_mm_load_si128 needs a 16-byte-aligned address). The guarding lines are
// not present in this listing; presumably it runs only when rounds is odd.
380 const __m128i rk = _mm_load_si128(
CONST_M128_CAST(subkeys+(rounds-1)*2));
382 y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk);
// Re-interleave the halves back into the two output vectors.
387 block0 = _mm_unpacklo_epi64(y1, x1);
388 block1 = _mm_unpackhi_epi64(y1, x1);
// SSE encryption of six vectors (block0..block5), updated in place, as three
// (x, y) pairs that share each round key.
//   subkeys - round-key array; the visible load indexes it at
//             subkeys + (rounds-1)*2, i.e. two word64s (16 bytes) per round
//   rounds  - number of rounds
// NOTE(review): presumably each round key is stored duplicated in both
// 64-bit lanes for this path - confirm against the key schedule.
// NOTE(review): for rounds < 2 the loop bound (rounds & ~1) - 1 wraps around
// in size_t; callers presumably always pass real round counts - confirm.
391inline void SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
392 __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
393 const word64 *subkeys,
unsigned int rounds)
// Gather high halves into x*, low halves into y*, one pair per two inputs.
396 __m128i x1 = _mm_unpackhi_epi64(block0, block1);
397 __m128i y1 = _mm_unpacklo_epi64(block0, block1);
398 __m128i x2 = _mm_unpackhi_epi64(block2, block3);
399 __m128i y2 = _mm_unpacklo_epi64(block2, block3);
400 __m128i x3 = _mm_unpackhi_epi64(block4, block5);
401 __m128i y3 = _mm_unpacklo_epi64(block4, block5);
// Two rounds per iteration. The statements defining rk1 and rk2 are not
// present in this listing.
403 for (
size_t i = 0; i < static_cast<size_t>(rounds & ~1) - 1; i += 2)
407 y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1);
408 y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk1);
409 y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk1);
413 x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2);
414 x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk2);
415 x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk2);
// Final step with the last round key (aligned 128-bit load), followed by
// Swap128 on each x/y pair. The guarding lines are not present in this
// listing; presumably this runs only when rounds is odd - confirm.
421 const __m128i rk = _mm_load_si128(
CONST_M128_CAST(subkeys+(rounds-1)*2));
422 y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk);
423 y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk);
424 y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk);
425 Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
// Re-interleave the halves back into the six output vectors.
429 block0 = _mm_unpacklo_epi64(y1, x1);
430 block1 = _mm_unpackhi_epi64(y1, x1);
431 block2 = _mm_unpacklo_epi64(y2, x2);
432 block3 = _mm_unpackhi_epi64(y2, x2);
433 block4 = _mm_unpacklo_epi64(y3, x3);
434 block5 = _mm_unpackhi_epi64(y3, x3);
// SSE decryption of the two vectors block0/block1, updated in place.
// Subkeys are consumed from the end of the array towards index 0. Here they
// are read one word64 per round (subkeys + i) and duplicated into both lanes
// by _mm_loaddup_pd; the cast to __m128i only reinterprets the bits.
//   subkeys - round keys, one word64 per round on this path
//   rounds  - number of rounds
437inline void SIMON128_Dec_Block(__m128i &block0, __m128i &block1,
438 const word64 *subkeys,
unsigned int rounds)
// High halves of both inputs go to x1, low halves to y1.
441 __m128i x1 = _mm_unpackhi_epi64(block0, block1);
442 __m128i y1 = _mm_unpacklo_epi64(block0, block1);
// Step using the last subkey. The lines guarding it (and anything else done
// alongside it) are not present in this listing; presumably it handles an
// odd round count - confirm.
446 const __m128i rk = _mm_castpd_si128(
447 _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));
450 y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1));
// Two rounds per iteration, walking downwards: subkeys[i+1] updates x1,
// then subkeys[i] updates y1.
454 for (
int i =
static_cast<int>(rounds-2); i >= 0; i -= 2)
456 const __m128i rk1 = _mm_castpd_si128(
457 _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i+1)));
458 x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1);
460 const __m128i rk2 = _mm_castpd_si128(
461 _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
462 y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2);
// Re-interleave the halves back into the two output vectors.
466 block0 = _mm_unpacklo_epi64(y1, x1);
467 block1 = _mm_unpackhi_epi64(y1, x1);
// SSE decryption of six vectors (block0..block5), updated in place, as three
// (x, y) pairs that share each round key. Subkeys are read one word64 per
// round and duplicated into both lanes by _mm_loaddup_pd.
//   subkeys - round keys, one word64 per round on this path
//   rounds  - number of rounds
470inline void SIMON128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
471 __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
472 const word64 *subkeys,
unsigned int rounds)
// Gather high halves into x*, low halves into y*, one pair per two inputs.
475 __m128i x1 = _mm_unpackhi_epi64(block0, block1);
476 __m128i y1 = _mm_unpacklo_epi64(block0, block1);
477 __m128i x2 = _mm_unpackhi_epi64(block2, block3);
478 __m128i y2 = _mm_unpacklo_epi64(block2, block3);
479 __m128i x3 = _mm_unpackhi_epi64(block4, block5);
480 __m128i y3 = _mm_unpacklo_epi64(block4, block5);
// Step using the last subkey: Swap128 on each pair first, then the update.
// The guarding lines are not present in this listing; presumably this
// handles an odd round count - confirm.
484 const __m128i rk = _mm_castpd_si128(
485 _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));
487 Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
488 y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1));
489 y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON128_f(x2));
490 y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON128_f(x3));
// Two rounds per iteration, walking downwards: subkeys[i+1] updates the
// x's, then subkeys[i] updates the y's.
494 for (
int i =
static_cast<int>(rounds-2); i >= 0; i -= 2)
496 const __m128i rk1 = _mm_castpd_si128(
497 _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i + 1)));
498 x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1);
499 x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk1);
500 x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk1);
502 const __m128i rk2 = _mm_castpd_si128(
503 _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i)));
504 y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2);
505 y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk2);
506 y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk2);
// Re-interleave the halves back into the six output vectors.
510 block0 = _mm_unpacklo_epi64(y1, x1);
511 block1 = _mm_unpackhi_epi64(y1, x1);
512 block2 = _mm_unpacklo_epi64(y2, x2);
513 block3 = _mm_unpackhi_epi64(y2, x2);
514 block4 = _mm_unpacklo_epi64(y3, x3);
515 block5 = _mm_unpackhi_epi64(y3, x3);
522#if (CRYPTOPP_ALTIVEC_AVAILABLE)
539#if defined(_ARCH_PWR8)
// simon128_t names the vector type used by the AltiVec routines in this
// section: uint64x2_p when _ARCH_PWR8 is defined; uint32x4_p is the other
// visible definition.
// NOTE(review): the #else / #endif lines are not present in this listing.
555#if defined(_ARCH_PWR8)
556#define simon128_t uint64x2_p
558#define simon128_t uint32x4_p
// AltiVec round function; the result is cast to simon128_t.
// NOTE(review): the expression is cut off in this listing - only the
// rotate-left-by-2 operand of the XOR is visible. Presumably the second
// operand is the AND of the rotate-by-1 and rotate-by-8 terms, as in the
// NEON and SSE versions; confirm against the full source.
561inline simon128_t SIMON128_f(
const simon128_t val)
563 return (simon128_t)
VecXor64(VecRotateLeft64<2>(val),
// AltiVec encryption of a single vector, passed by reference in block.
//   subkeys - round-key array; pointers are formed at subkeys + i*2, i.e.
//             two word64s (16 bytes) per round on this path
//   rounds  - number of rounds
// NOTE(review): most statements of this function (the key loads, the round
// updates and the final store to block) are not present in this listing;
// only the permute masks, the initial split and the key pointers are shown.
// NOTE(review): for rounds < 2 the loop bound (rounds & ~1) - 1 wraps around
// in size_t; callers presumably always pass real round counts - confirm.
567inline void SIMON128_Enc_Block(
uint32x4_p &block,
const word64 *subkeys,
unsigned int rounds)
// Permute masks, one pair per endianness. Each run of eight descending
// indices picks one 8-byte half of the permute inputs in reversed byte order.
569#if (CRYPTOPP_BIG_ENDIAN)
570 const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
571 const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
573 const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
574 const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
// block is supplied as both permute inputs, so x1 and y1 each hold one half
// of the same input twice.
578 simon128_t x1 = (simon128_t)
VecPermute(block, block, m1);
579 simon128_t y1 = (simon128_t)
VecPermute(block, block, m2);
// Two rounds per iteration; ptr1/ptr2 address the keys for rounds i and i+1.
581 for (
size_t i = 0; i < static_cast<size_t>(rounds & ~1)-1; i += 2)
584 const word32* ptr1 =
reinterpret_cast<const word32*
>(subkeys+i*2);
586 const word32* ptr2 =
reinterpret_cast<const word32*
>(subkeys+(i+1)*2);
// Pointer to the last round key; presumably used only for an odd round
// count (the guarding lines are not visible).
596 const word32* ptr =
reinterpret_cast<const word32*
>(subkeys+(rounds-1)*2);
// Mask for the output permute (its use is not visible in this listing).
604#if (CRYPTOPP_BIG_ENDIAN)
605 const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
608 const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
// AltiVec decryption of a single vector, passed by reference in block.
// Subkeys are consumed from the end of the array towards index 0.
//   subkeys - round keys, addressed one word64 per round on this path
//   rounds  - number of rounds
// NOTE(review): the round-update statements and the final store to block
// are not present in this listing.
616inline void SIMON128_Dec_Block(
uint32x4_p &block,
const word64 *subkeys,
unsigned int rounds)
// Permute masks, one pair per endianness. Each run of eight descending
// indices picks one 8-byte half of the permute inputs in reversed byte order.
618#if (CRYPTOPP_BIG_ENDIAN)
619 const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
620 const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
622 const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
623 const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
627 simon128_t x1 = (simon128_t)
VecPermute(block, block, m1);
628 simon128_t y1 = (simon128_t)
VecPermute(block, block, m2);
// Last subkey: load a full vector starting at subkeys[rounds-1] and
// broadcast its 64-bit element 0.
// NOTE(review): a full vector load at the last word64 reads 8 bytes beyond
// subkeys[rounds-1]; confirm the key buffer has that much slack.
634 const word32* ptr =
reinterpret_cast<const word32*
>(subkeys+rounds-1);
635 const simon128_t tk = (simon128_t)
VecLoad(ptr);
636 const simon128_t rk = (simon128_t)VecSplatElement64<0>(tk);
// Two rounds per iteration, walking downwards: one vector load fetches
// subkeys[i] and subkeys[i+1]; element 1 is broadcast into rk1 and
// element 0 into rk2.
642 for (
int i =
static_cast<int>(rounds-2); i >= 0; i -= 2)
644 const word32* ptr =
reinterpret_cast<const word32*
>(subkeys+i);
645 const simon128_t tk = (simon128_t)
VecLoad(ptr);
646 const simon128_t rk1 = (simon128_t)VecSplatElement64<1>(tk);
647 const simon128_t rk2 = (simon128_t)VecSplatElement64<0>(tk);
// Mask for the output permute (its use is not visible in this listing).
653#if (CRYPTOPP_BIG_ENDIAN)
654 const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
657 const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
// NOTE(review): the function header for this body is not present in this
// listing. From the forward key order and the block0..block5 operands it is
// presumably the AltiVec six-block encryption routine - confirm.
// Key pointers are formed at subkeys + i*2, i.e. two word64s per round.
// The key loads, round updates and output stores are not visible either.
// Permute masks, one pair per endianness. Each run of eight descending
// indices picks one 8-byte half of the permute inputs in reversed byte order.
669#if (CRYPTOPP_BIG_ENDIAN)
670 const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
671 const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
673 const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
674 const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
// Each (x, y) pair is gathered from two adjacent input vectors.
678 simon128_t x1 = (simon128_t)
VecPermute(block0, block1, m1);
679 simon128_t y1 = (simon128_t)
VecPermute(block0, block1, m2);
680 simon128_t x2 = (simon128_t)
VecPermute(block2, block3, m1);
681 simon128_t y2 = (simon128_t)
VecPermute(block2, block3, m2);
682 simon128_t x3 = (simon128_t)
VecPermute(block4, block5, m1);
683 simon128_t y3 = (simon128_t)
VecPermute(block4, block5, m2);
// Two rounds per iteration; ptr1/ptr2 address the keys for rounds i and i+1.
// NOTE(review): for rounds < 2 the bound (rounds & ~1) - 1 wraps in size_t.
685 for (
size_t i = 0; i < static_cast<size_t>(rounds & ~1)-1; i += 2)
688 const word32* ptr1 =
reinterpret_cast<const word32*
>(subkeys+i*2);
691 const word32* ptr2 =
reinterpret_cast<const word32*
>(subkeys+(i+1)*2);
// Pointer to the last round key; presumably used only for an odd round
// count (the guarding lines are not visible).
706 const word32* ptr =
reinterpret_cast<const word32*
>(subkeys+(rounds-1)*2);
// Masks for the output permutes (their use is not visible in this listing).
716#if (CRYPTOPP_BIG_ENDIAN)
717 const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
718 const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
720 const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
721 const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
// NOTE(review): the function header for this body is not present in this
// listing. From the reverse key order and the block0..block5 operands it is
// presumably the AltiVec six-block decryption routine - confirm.
// The round updates and output stores are not visible either.
// Permute masks, one pair per endianness. Each run of eight descending
// indices picks one 8-byte half of the permute inputs in reversed byte order.
737#if (CRYPTOPP_BIG_ENDIAN)
738 const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
739 const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
741 const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
742 const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
// Each (x, y) pair is gathered from two adjacent input vectors.
746 simon128_t x1 = (simon128_t)
VecPermute(block0, block1, m1);
747 simon128_t y1 = (simon128_t)
VecPermute(block0, block1, m2);
748 simon128_t x2 = (simon128_t)
VecPermute(block2, block3, m1);
749 simon128_t y2 = (simon128_t)
VecPermute(block2, block3, m2);
750 simon128_t x3 = (simon128_t)
VecPermute(block4, block5, m1);
751 simon128_t y3 = (simon128_t)
VecPermute(block4, block5, m2);
// Last subkey: load a full vector starting at subkeys[rounds-1] and
// broadcast its 64-bit element 0.
// NOTE(review): a full vector load at the last word64 reads 8 bytes beyond
// subkeys[rounds-1]; confirm the key buffer has that much slack.
757 const word32* ptr =
reinterpret_cast<const word32*
>(subkeys+rounds-1);
758 const simon128_t tk = (simon128_t)
VecLoad(ptr);
759 const simon128_t rk = (simon128_t)VecSplatElement64<0>(tk);
// Two rounds per iteration, walking downwards: one vector load fetches
// subkeys[i] and subkeys[i+1]; element 1 is broadcast into rk1 and
// element 0 into rk2.
767 for (
int i =
static_cast<int>(rounds-2); i >= 0; i -= 2)
769 const word32* ptr =
reinterpret_cast<const word32*
>(subkeys+i);
770 const simon128_t tk = (simon128_t)
VecLoad(ptr);
771 const simon128_t rk1 = (simon128_t)VecSplatElement64<1>(tk);
772 const simon128_t rk2 = (simon128_t)VecSplatElement64<0>(tk);
// Masks for the output permutes (their use is not visible in this listing).
783#if (CRYPTOPP_BIG_ENDIAN)
784 const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
785 const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
787 const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
788 const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
802ANONYMOUS_NAMESPACE_END
810#if (CRYPTOPP_ARM_NEON_AVAILABLE)
// NEON encryption entry point, defined after the anonymous namespace closes.
// Passes all seven arguments through unchanged to a single call.
// NOTE(review): the line naming the callee is not present in this listing;
// presumably it is AdvancedProcessBlocks128_6x2_NEON with the NEON
// SIMON128_Enc_Block / SIMON128_Enc_6_Blocks routines - confirm.
811size_t SIMON128_Enc_AdvancedProcessBlocks_NEON(
const word64* subKeys,
size_t rounds,
812 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
815 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
// NEON decryption entry point, defined after the anonymous namespace closes.
// Passes all seven arguments through unchanged to a single call.
// NOTE(review): the line naming the callee is not present in this listing;
// presumably it is AdvancedProcessBlocks128_6x2_NEON with the NEON
// SIMON128_Dec_Block / SIMON128_Dec_6_Blocks routines - confirm.
818size_t SIMON128_Dec_AdvancedProcessBlocks_NEON(
const word64* subKeys,
size_t rounds,
819 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
822 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
828#if (CRYPTOPP_SSSE3_AVAILABLE)
// SSSE3 encryption entry point, defined after the anonymous namespace closes.
// Passes all seven arguments through unchanged to a single call.
// NOTE(review): the line naming the callee is not present in this listing;
// presumably it is AdvancedProcessBlocks128_6x2_SSE with the SSE
// SIMON128_Enc_Block / SIMON128_Enc_6_Blocks routines - confirm.
829size_t SIMON128_Enc_AdvancedProcessBlocks_SSSE3(
const word64* subKeys,
size_t rounds,
830 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
833 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
// SSSE3 decryption entry point, defined after the anonymous namespace closes.
// Passes all seven arguments through unchanged to a single call.
// NOTE(review): the line naming the callee is not present in this listing;
// presumably it is AdvancedProcessBlocks128_6x2_SSE with the SSE
// SIMON128_Dec_Block / SIMON128_Dec_6_Blocks routines - confirm.
836size_t SIMON128_Dec_AdvancedProcessBlocks_SSSE3(
const word64* subKeys,
size_t rounds,
837 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
840 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
846#if (CRYPTOPP_ALTIVEC_AVAILABLE)
// AltiVec encryption entry point, defined after the anonymous namespace
// closes. Passes all seven arguments through unchanged to a single call.
// NOTE(review): the line naming the callee is not present in this listing;
// presumably it is AdvancedProcessBlocks128_6x1_ALTIVEC with the AltiVec
// SIMON128_Enc_Block / SIMON128_Enc_6_Blocks routines - confirm.
847size_t SIMON128_Enc_AdvancedProcessBlocks_ALTIVEC(
const word64* subKeys,
size_t rounds,
848 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
851 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
// AltiVec decryption entry point, defined after the anonymous namespace
// closes. Passes all seven arguments through unchanged to a single call.
// NOTE(review): the line naming the callee is not present in this listing;
// presumably it is AdvancedProcessBlocks128_6x1_ALTIVEC with the AltiVec
// SIMON128_Dec_Block / SIMON128_Dec_6_Blocks routines - confirm.
854size_t SIMON128_Dec_AdvancedProcessBlocks_ALTIVEC(
const word64* subKeys,
size_t rounds,
855 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
858 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
Template for AdvancedProcessBlocks and SIMD processing.
size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6, const W *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
AdvancedProcessBlocks for 2 and 6 blocks.
size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, const W *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
AdvancedProcessBlocks for 1 and 6 blocks.
size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6, const W *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
AdvancedProcessBlocks for 2 and 6 blocks.
#define CONST_M128_CAST(x)
Clang workaround.
Library configuration file.
unsigned char byte
8-bit unsigned datatype
unsigned int word32
32-bit unsigned datatype
unsigned long long word64
64-bit unsigned datatype
Utility functions for the Crypto++ library.
void vec_swap(T &a, T &b)
Swaps two variables which are arrays.
Crypto++ library namespace.
Support functions for PowerPC and vector operations.
T1 VecOr64(const T1 vec1, const T2 vec2)
OR two vectors as if uint64x2_p.
uint32x4_p VecLoadAligned(const byte src[16])
Loads a vector from an aligned byte array.
__vector unsigned int uint32x4_p
Vector of 32-bit elements.
uint32x4_p VecSub64(const uint32x4_p &vec1, const uint32x4_p &vec2)
Subtract two vectors as if uint64x2_p.
T1 VecPermute(const T1 vec, const T2 mask)
Permutes a vector.
__vector unsigned char uint8x16_p
Vector of 8-bit elements.
__vector unsigned long long uint64x2_p
Vector of 64-bit elements.
uint32x4_p VecSplatElement64(const uint32x4_p val)
Broadcast 64-bit element to a vector as if uint64x2_p.
T1 VecXor64(const T1 vec1, const T2 vec2)
XOR two vectors as if uint64x2_p.
uint32x4_p VecRotateRight64(const uint32x4_p vec)
Rotate a vector right as if uint64x2_p.
uint32x4_p VecAdd64(const uint32x4_p &vec1, const uint32x4_p &vec2)
Add two vectors as if uint64x2_p.
uint32x4_p VecLoad(const byte src[16])
Loads a vector from a byte array.
uint32x4_p VecRotateLeft64(const uint32x4_p vec)
Rotate a vector left as if uint64x2_p.
uint32x4_p VecRotateLeft64< 8 >(const uint32x4_p vec)
Rotate a vector left as if uint64x2_p.
T1 VecAnd64(const T1 vec1, const T2 vec2)
AND two vectors as if uint64x2_p.
void swap(::SecBlock< T, A > &a, ::SecBlock< T, A > &b)
Swap two SecBlocks.
Classes for the Simon block cipher.