6#ifndef CRYPTOPP_ARM_SIMD_H
7#define CRYPTOPP_ARM_SIMD_H
11#if (CRYPTOPP_ARM_NEON_HEADER)
16#if (CRYPTOPP_ARM_ACLE_HEADER)
21#if (CRYPTOPP_ARM_CRC32_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
30inline uint32_t
CRC32B (uint32_t crc, uint8_t val)
33 return __crc32b(crc, val);
35 __asm__ (
"crc32b %w0, %w0, %w1 \n\t"
36 :
"+r" (crc) :
"r" (val) );
46inline uint32_t
CRC32W (uint32_t crc, uint32_t val)
49 return __crc32w(crc, val);
51 __asm__ (
"crc32w %w0, %w0, %w1 \n\t"
52 :
"+r" (crc) :
"r" (val) );
62inline uint32_t
CRC32Wx4 (uint32_t crc,
const uint32_t vals[4])
65 return __crc32w(__crc32w(__crc32w(__crc32w(
66 crc, vals[0]), vals[1]), vals[2]), vals[3]);
68 __asm__ (
"crc32w %w0, %w0, %w1 \n\t"
69 "crc32w %w0, %w0, %w2 \n\t"
70 "crc32w %w0, %w0, %w3 \n\t"
71 "crc32w %w0, %w0, %w4 \n\t"
72 :
"+r" (crc) :
"r" (vals[0]),
"r" (vals[1]),
73 "r" (vals[2]),
"r" (vals[3]));
86inline uint32_t
CRC32CB (uint32_t crc, uint8_t val)
89 return __crc32cb(crc, val);
91 __asm__ (
"crc32cb %w0, %w0, %w1 \n\t"
92 :
"+r" (crc) :
"r" (val) );
102inline uint32_t
CRC32CW (uint32_t crc, uint32_t val)
105 return __crc32cw(crc, val);
107 __asm__ (
"crc32cw %w0, %w0, %w1 \n\t"
108 :
"+r" (crc) :
"r" (val) );
118inline uint32_t
CRC32CWx4 (uint32_t crc,
const uint32_t vals[4])
121 return __crc32cw(__crc32cw(__crc32cw(__crc32cw(
122 crc, vals[0]), vals[1]), vals[2]), vals[3]);
124 __asm__ (
"crc32cw %w0, %w0, %w1 \n\t"
125 "crc32cw %w0, %w0, %w2 \n\t"
126 "crc32cw %w0, %w0, %w3 \n\t"
127 "crc32cw %w0, %w0, %w4 \n\t"
128 :
"+r" (crc) :
"r" (vals[0]),
"r" (vals[1]),
129 "r" (vals[2]),
"r" (vals[3]));
136#if (CRYPTOPP_ARM_PMULL_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
152inline uint64x2_t
PMULL_00(
const uint64x2_t a,
const uint64x2_t b)
155 const __n64 x = { vgetq_lane_u64(a, 0) };
156 const __n64 y = { vgetq_lane_u64(b, 0) };
157 return vmull_p64(x, y);
158#elif defined(__GNUC__)
160 __asm__ (
"pmull %0.1q, %1.1d, %2.1d \n\t"
161 :
"=w" (r) :
"w" (a),
"w" (b) );
164 return (uint64x2_t)(vmull_p64(
165 vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
166 vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
182inline uint64x2_t
PMULL_01(
const uint64x2_t a,
const uint64x2_t b)
185 const __n64 x = { vgetq_lane_u64(a, 0) };
186 const __n64 y = { vgetq_lane_u64(b, 1) };
187 return vmull_p64(x, y);
188#elif defined(__GNUC__)
190 __asm__ (
"pmull %0.1q, %1.1d, %2.1d \n\t"
191 :
"=w" (r) :
"w" (a),
"w" (vget_high_u64(b)) );
194 return (uint64x2_t)(vmull_p64(
195 vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
196 vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
212inline uint64x2_t
PMULL_10(
const uint64x2_t a,
const uint64x2_t b)
215 const __n64 x = { vgetq_lane_u64(a, 1) };
216 const __n64 y = { vgetq_lane_u64(b, 0) };
217 return vmull_p64(x, y);
218#elif defined(__GNUC__)
220 __asm__ (
"pmull %0.1q, %1.1d, %2.1d \n\t"
221 :
"=w" (r) :
"w" (vget_high_u64(a)),
"w" (b) );
224 return (uint64x2_t)(vmull_p64(
225 vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
226 vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
242inline uint64x2_t
PMULL_11(
const uint64x2_t a,
const uint64x2_t b)
245 const __n64 x = { vgetq_lane_u64(a, 1) };
246 const __n64 y = { vgetq_lane_u64(b, 1) };
247 return vmull_p64(x, y);
248#elif defined(__GNUC__)
250 __asm__ (
"pmull2 %0.1q, %1.2d, %2.2d \n\t"
251 :
"=w" (r) :
"w" (a),
"w" (b) );
254 return (uint64x2_t)(vmull_p64(
255 vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
256 vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
267inline uint64x2_t
PMULL(
const uint64x2_t a,
const uint64x2_t b)
270 const __n64 x = { vgetq_lane_u64(a, 0) };
271 const __n64 y = { vgetq_lane_u64(b, 0) };
272 return vmull_p64(x, y);
273#elif defined(__GNUC__)
275 __asm__ (
"pmull %0.1q, %1.1d, %2.1d \n\t"
276 :
"=w" (r) :
"w" (a),
"w" (b) );
279 return (uint64x2_t)(vmull_p64(
280 vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
281 vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
292inline uint64x2_t
PMULL_HIGH(
const uint64x2_t a,
const uint64x2_t b)
295 const __n64 x = { vgetq_lane_u64(a, 1) };
296 const __n64 y = { vgetq_lane_u64(b, 1) };
297 return vmull_p64(x, y);
298#elif defined(__GNUC__)
300 __asm__ (
"pmull2 %0.1q, %1.2d, %2.2d \n\t"
301 :
"=w" (r) :
"w" (a),
"w" (b) );
304 return (uint64x2_t)(vmull_p64(
305 vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
306 vgetq_lane_u64(vreinterpretq_u64_u8(b),1))));
319inline uint64x2_t
VEXT_U8(uint64x2_t a, uint64x2_t b,
unsigned int c)
322 return vreinterpretq_u64_u8(vextq_u8(
323 vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), c));
326 __asm__ (
"ext %0.16b, %1.16b, %2.16b, %3 \n\t"
327 :
"=w" (r) :
"w" (a),
"w" (b),
"I" (c) );
341template <
unsigned int C>
342inline uint64x2_t
VEXT_U8(uint64x2_t a, uint64x2_t b)
346 return vreinterpretq_u64_u8(vextq_u8(
347 vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), C));
350 __asm__ (
"ext %0.16b, %1.16b, %2.16b, %3 \n\t"
351 :
"=w" (r) :
"w" (a),
"w" (b),
"I" (C) );
359#if CRYPTOPP_ARM_SHA3_AVAILABLE || defined(CRYPTOPP_DOXYGEN_PROCESSING)
372inline uint64x2_t
VEOR3(uint64x2_t a, uint64x2_t b, uint64x2_t c)
375 return veor3q_u64(a, b, c);
378 __asm__ (
"eor3 %0.16b, %1.16b, %2.16b, %3.16b \n\t"
379 :
"=w" (r) :
"w" (a),
"w" (b),
"w" (c));
393inline uint64x2_t
VXAR(uint64x2_t a, uint64x2_t b,
const int c)
396 return vxarq_u64(a, b, c);
399 __asm__ (
"xar %0.2d, %1.2d, %2.2d, %3 \n\t"
400 :
"=w" (r) :
"w" (a),
"w" (b),
"I" (c));
414template <
unsigned int C>
415inline uint64x2_t
VXAR(uint64x2_t a, uint64x2_t b)
418 return vxarq_u64(a, b, C);
421 __asm__ (
"xar %0.2d, %1.2d, %2.2d, %3 \n\t"
422 :
"=w" (r) :
"w" (a),
"w" (b),
"I" (C));
435inline uint64x2_t
VRAX1(uint64x2_t a, uint64x2_t b)
438 return vrax1q_u64(a, b);
441 __asm__ (
"rax1 %0.2d, %1.2d, %2.2d \n\t"
442 :
"=w" (r) :
"w" (a),
"w" (b));
uint64x2_t VXAR(uint64x2_t a, uint64x2_t b, const int c)
XOR and rotate.
uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
uint64x2_t VRAX1(uint64x2_t a, uint64x2_t b)
Rotate and XOR.
uint32_t CRC32CWx4(uint32_t crc, const uint32_t vals[4])
CRC32-C checksum.
uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
uint32_t CRC32CB(uint32_t crc, uint8_t val)
CRC32-C checksum.
uint64x2_t PMULL_HIGH(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
uint64x2_t VEOR3(uint64x2_t a, uint64x2_t b, uint64x2_t c)
Three-way XOR.
uint32_t CRC32W(uint32_t crc, uint32_t val)
CRC32 checksum.
uint32_t CRC32B(uint32_t crc, uint8_t val)
CRC32 checksum.
uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
uint64x2_t PMULL(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
uint32_t CRC32CW(uint32_t crc, uint32_t val)
CRC32-C checksum.
uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c)
Vector extraction.
uint32_t CRC32Wx4(uint32_t crc, const uint32_t vals[4])
CRC32 checksum.
Library configuration file.