19#if (CRYPTOPP_SSSE3_AVAILABLE)
21# include <pmmintrin.h>
22# include <tmmintrin.h>
26# include <ammintrin.h>
28# include <x86intrin.h>
// Pointer casts used with the double-precision load/store intrinsics.
// The cast goes through void* so an arbitrary object pointer can be
// reinterpreted as (const) double* in a single expression.
#define DOUBLE_CAST(x) ((double*)(void*)(x))
#define CONST_DOUBLE_CAST(x) ((const double*)(const void*)(x))

// Externally visible symbol holding this file's name.
// NOTE(review): presumably present so the translation unit always exports
// at least one symbol, even when the SIMD code is compiled out -- confirm.
extern const char CHAM_SIMD_FNAME[] = __FILE__;
39ANONYMOUS_NAMESPACE_BEGIN
44#if (CRYPTOPP_SSSE3_AVAILABLE)
// Rotates every 32-bit lane of val left by R bits.
//
// R is a compile-time constant; the shift-based path is meaningful for
// 0 <= R <= 32 (a shift count of 32 yields zero in that lane).
//
// NOTE(review): the __XOP__ guard and the _mm_or_si128 combining step were
// reconstructed; only the two return expressions survive in the listing.
template <unsigned int R>
inline __m128i RotateLeft32(const __m128i& val)
{
#if defined(__XOP__)
    // XOP offers a native per-lane rotate.
    return _mm_roti_epi32(val, R);
#else
    // Emulate the rotate: bits shifted out on the left re-enter on the right.
    return _mm_or_si128(
        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
#endif
}
// Rotates every 32-bit lane of val right by R bits.
//
// R is a compile-time constant; the shift-based path is meaningful for
// 0 <= R <= 32 (a shift count of 32 yields zero in that lane).
//
// NOTE(review): the __XOP__ guard and the _mm_or_si128 combining step were
// reconstructed; only the two return expressions survive in the listing.
template <unsigned int R>
inline __m128i RotateRight32(const __m128i& val)
{
#if defined(__XOP__)
    // A right rotate by R is a left rotate by 32-R.
    return _mm_roti_epi32(val, 32-R);
#else
    // Emulate the rotate: bits shifted out on the right re-enter on the left.
    return _mm_or_si128(
        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
#endif
}
74inline __m128i RotateLeft32<8>(
const __m128i& val)
77 return _mm_roti_epi32(val, 8);
79 const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
80 return _mm_shuffle_epi8(val, mask);
86inline __m128i RotateRight32<8>(
const __m128i& val)
89 return _mm_roti_epi32(val, 32-8);
91 const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
92 return _mm_shuffle_epi8(val, mask);
96template <
unsigned int IDX>
97inline __m128i UnpackXMM(
const __m128i& a,
const __m128i& b,
const __m128i& c,
const __m128i& d)
100 CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
101 CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
103 return _mm_setzero_si128();
107inline __m128i UnpackXMM<0>(
const __m128i& a,
const __m128i& b,
const __m128i& c,
const __m128i& d)
113 const __m128i r1 = _mm_unpacklo_epi32(a, b);
114 const __m128i r2 = _mm_unpacklo_epi32(c, d);
115 return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
116 _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
120inline __m128i UnpackXMM<1>(
const __m128i& a,
const __m128i& b,
const __m128i& c,
const __m128i& d)
126 const __m128i r1 = _mm_unpacklo_epi32(a, b);
127 const __m128i r2 = _mm_unpacklo_epi32(c, d);
128 return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
129 _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
133inline __m128i UnpackXMM<2>(
const __m128i& a,
const __m128i& b,
const __m128i& c,
const __m128i& d)
139 const __m128i r1 = _mm_unpackhi_epi32(a, b);
140 const __m128i r2 = _mm_unpackhi_epi32(c, d);
141 return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
142 _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
146inline __m128i UnpackXMM<3>(
const __m128i& a,
const __m128i& b,
const __m128i& c,
const __m128i& d)
152 const __m128i r1 = _mm_unpackhi_epi32(a, b);
153 const __m128i r2 = _mm_unpackhi_epi32(c, d);
154 return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
155 _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
158template <
unsigned int IDX>
159inline __m128i UnpackXMM(
const __m128i& v)
163 return _mm_setzero_si128();
167inline __m128i UnpackXMM<0>(
const __m128i& v)
169 return _mm_shuffle_epi8(v, _mm_set_epi8(0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3));
173inline __m128i UnpackXMM<1>(
const __m128i& v)
175 return _mm_shuffle_epi8(v, _mm_set_epi8(4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7));
179inline __m128i UnpackXMM<2>(
const __m128i& v)
181 return _mm_shuffle_epi8(v, _mm_set_epi8(8,9,10,11, 8,9,10,11, 8,9,10,11, 8,9,10,11));
185inline __m128i UnpackXMM<3>(
const __m128i& v)
187 return _mm_shuffle_epi8(v, _mm_set_epi8(12,13,14,15, 12,13,14,15, 12,13,14,15, 12,13,14,15));
190template <
unsigned int IDX>
191inline __m128i RepackXMM(
const __m128i& a,
const __m128i& b,
const __m128i& c,
const __m128i& d)
193 return UnpackXMM<IDX>(a, b, c, d);
196template <
unsigned int IDX>
197inline __m128i RepackXMM(
const __m128i& v)
199 return UnpackXMM<IDX>(v);
202inline void CHAM128_Enc_Block(__m128i &block0,
203 const word32 *subkeys,
unsigned int rounds)
209 __m128i a = UnpackXMM<0>(block0);
210 __m128i b = UnpackXMM<1>(block0);
211 __m128i c = UnpackXMM<2>(block0);
212 __m128i d = UnpackXMM<3>(block0);
214 __m128i counter = _mm_set_epi32(0,0,0,0);
215 __m128i increment = _mm_set_epi32(1,1,1,1);
217 const unsigned int MASK = (rounds == 80 ? 7 : 15);
218 for (
int i=0; i<static_cast<int>(rounds); i+=4)
220 __m128i k, k1, k2, t1, t2;
221 k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+0) & MASK])));
224 k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
225 k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
227 t1 = _mm_xor_si128(a, counter);
228 t2 = _mm_xor_si128(RotateLeft32<1>(b), k1);
229 a = RotateLeft32<8>(_mm_add_epi32(t1, t2));
231 counter = _mm_add_epi32(counter, increment);
233 t1 = _mm_xor_si128(b, counter);
234 t2 = _mm_xor_si128(RotateLeft32<8>(c), k2);
235 b = RotateLeft32<1>(_mm_add_epi32(t1, t2));
237 counter = _mm_add_epi32(counter, increment);
238 k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+2) & MASK])));
241 k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
242 k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
244 t1 = _mm_xor_si128(c, counter);
245 t2 = _mm_xor_si128(RotateLeft32<1>(d), k1);
246 c = RotateLeft32<8>(_mm_add_epi32(t1, t2));
248 counter = _mm_add_epi32(counter, increment);
250 t1 = _mm_xor_si128(d, counter);
251 t2 = _mm_xor_si128(RotateLeft32<8>(a), k2);
252 d = RotateLeft32<1>(_mm_add_epi32(t1, t2));
254 counter = _mm_add_epi32(counter, increment);
258 block0 = RepackXMM<0>(a,b,c,d);
261inline void CHAM128_Dec_Block(__m128i &block0,
262 const word32 *subkeys,
unsigned int rounds)
268 __m128i a = UnpackXMM<0>(block0);
269 __m128i b = UnpackXMM<1>(block0);
270 __m128i c = UnpackXMM<2>(block0);
271 __m128i d = UnpackXMM<3>(block0);
273 __m128i counter = _mm_set_epi32(rounds-1,rounds-1,rounds-1,rounds-1);
274 __m128i decrement = _mm_set_epi32(1,1,1,1);
276 const unsigned int MASK = (rounds == 80 ? 7 : 15);
277 for (
int i =
static_cast<int>(rounds)-1; i >= 0; i-=4)
279 __m128i k, k1, k2, t1, t2;
280 k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-1) & MASK])));
283 k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
284 k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
287 t1 = RotateRight32<1>(d);
288 t2 = _mm_xor_si128(RotateLeft32<8>(a), k1);
289 d = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
291 counter = _mm_sub_epi32(counter, decrement);
294 t1 = RotateRight32<8>(c);
295 t2 = _mm_xor_si128(RotateLeft32<1>(d), k2);
296 c = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
298 counter = _mm_sub_epi32(counter, decrement);
299 k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-3) & MASK])));
302 k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
303 k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
306 t1 = RotateRight32<1>(b);
307 t2 = _mm_xor_si128(RotateLeft32<8>(c), k1);
308 b = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
310 counter = _mm_sub_epi32(counter, decrement);
313 t1 = RotateRight32<8>(a);
314 t2 = _mm_xor_si128(RotateLeft32<1>(b), k2);
315 a = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
317 counter = _mm_sub_epi32(counter, decrement);
321 block0 = RepackXMM<0>(a,b,c,d);
324inline void CHAM128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
325 __m128i &block2, __m128i &block3,
const word32 *subkeys,
unsigned int rounds)
331 __m128i a = UnpackXMM<0>(block0, block1, block2, block3);
332 __m128i b = UnpackXMM<1>(block0, block1, block2, block3);
333 __m128i c = UnpackXMM<2>(block0, block1, block2, block3);
334 __m128i d = UnpackXMM<3>(block0, block1, block2, block3);
336 __m128i counter = _mm_set_epi32(0,0,0,0);
337 __m128i increment = _mm_set_epi32(1,1,1,1);
339 const unsigned int MASK = (rounds == 80 ? 7 : 15);
340 for (
int i=0; i<static_cast<int>(rounds); i+=4)
342 __m128i k, k1, k2, t1, t2;
343 k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+0) & MASK])));
346 k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
347 k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
349 t1 = _mm_xor_si128(a, counter);
350 t2 = _mm_xor_si128(RotateLeft32<1>(b), k1);
351 a = RotateLeft32<8>(_mm_add_epi32(t1, t2));
353 counter = _mm_add_epi32(counter, increment);
355 t1 = _mm_xor_si128(b, counter);
356 t2 = _mm_xor_si128(RotateLeft32<8>(c), k2);
357 b = RotateLeft32<1>(_mm_add_epi32(t1, t2));
359 counter = _mm_add_epi32(counter, increment);
360 k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+2) & MASK])));
363 k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
364 k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
366 t1 = _mm_xor_si128(c, counter);
367 t2 = _mm_xor_si128(RotateLeft32<1>(d), k1);
368 c = RotateLeft32<8>(_mm_add_epi32(t1, t2));
370 counter = _mm_add_epi32(counter, increment);
372 t1 = _mm_xor_si128(d, counter);
373 t2 = _mm_xor_si128(RotateLeft32<8>(a), k2);
374 d = RotateLeft32<1>(_mm_add_epi32(t1, t2));
376 counter = _mm_add_epi32(counter, increment);
380 block0 = RepackXMM<0>(a,b,c,d);
381 block1 = RepackXMM<1>(a,b,c,d);
382 block2 = RepackXMM<2>(a,b,c,d);
383 block3 = RepackXMM<3>(a,b,c,d);
386inline void CHAM128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
387 __m128i &block2, __m128i &block3,
const word32 *subkeys,
unsigned int rounds)
393 __m128i a = UnpackXMM<0>(block0, block1, block2, block3);
394 __m128i b = UnpackXMM<1>(block0, block1, block2, block3);
395 __m128i c = UnpackXMM<2>(block0, block1, block2, block3);
396 __m128i d = UnpackXMM<3>(block0, block1, block2, block3);
398 __m128i counter = _mm_set_epi32(rounds-1,rounds-1,rounds-1,rounds-1);
399 __m128i decrement = _mm_set_epi32(1,1,1,1);
401 const unsigned int MASK = (rounds == 80 ? 7 : 15);
402 for (
int i =
static_cast<int>(rounds)-1; i >= 0; i-=4)
404 __m128i k, k1, k2, t1, t2;
405 k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-1) & MASK])));
408 k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
409 k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
412 t1 = RotateRight32<1>(d);
413 t2 = _mm_xor_si128(RotateLeft32<8>(a), k1);
414 d = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
416 counter = _mm_sub_epi32(counter, decrement);
419 t1 = RotateRight32<8>(c);
420 t2 = _mm_xor_si128(RotateLeft32<1>(d), k2);
421 c = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
423 counter = _mm_sub_epi32(counter, decrement);
424 k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-3) & MASK])));
427 k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
428 k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
431 t1 = RotateRight32<1>(b);
432 t2 = _mm_xor_si128(RotateLeft32<8>(c), k1);
433 b = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
435 counter = _mm_sub_epi32(counter, decrement);
438 t1 = RotateRight32<8>(a);
439 t2 = _mm_xor_si128(RotateLeft32<1>(b), k2);
440 a = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
442 counter = _mm_sub_epi32(counter, decrement);
446 block0 = RepackXMM<0>(a,b,c,d);
447 block1 = RepackXMM<1>(a,b,c,d);
448 block2 = RepackXMM<2>(a,b,c,d);
449 block3 = RepackXMM<3>(a,b,c,d);
458ANONYMOUS_NAMESPACE_END
462#if defined(CRYPTOPP_SSSE3_AVAILABLE)
463size_t CHAM128_Enc_AdvancedProcessBlocks_SSSE3(
const word32* subKeys,
size_t rounds,
464 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
467 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
470size_t CHAM128_Dec_AdvancedProcessBlocks_SSSE3(
const word32* subKeys,
size_t rounds,
471 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
474 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
Template for AdvancedProcessBlocks and SIMD processing.
size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4, const W *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
AdvancedProcessBlocks for 1 and 4 blocks.
Classes for the CHAM block cipher.
Library configuration file.
unsigned int word32
32-bit unsigned datatype
unsigned short word16
16-bit unsigned datatype
Utility functions for the Crypto++ library.
Crypto++ library namespace.
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.