Crypto++ 8.2
Free C++ class library of cryptographic schemes
arm_simd.h
// arm_simd.h - written and placed in public domain by Jeffrey Walton

/// \file arm_simd.h
/// \brief Support functions for ARM and vector operations

#ifndef CRYPTOPP_ARM_SIMD_H
#define CRYPTOPP_ARM_SIMD_H

#include "config.h"

// C1189: error: This header is specific to ARM targets
#if (CRYPTOPP_ARM_NEON_AVAILABLE) && !defined(_M_ARM64)
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if (CRYPTOPP_ARM_PMULL_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)

/// \brief Polynomial multiplication
/// \param a the first term
/// \param b the second term
/// \returns vector product
/// \details PMULL_00() performs polynomial multiplication and presents
/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x00)</tt>.
/// The <tt>0x00</tt> indicates the low 64-bits of <tt>a</tt> and <tt>b</tt>
/// are multiplied.
/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
/// is MSB and numbered 127, while the rightmost bit is LSB and
/// numbered 0.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 0) };
    const __n64 y = { vgetq_lane_u64(b, 0) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t"
        :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(a, 0),
        vgetq_lane_u64(b, 0)));
#endif
}

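// --- Editor's illustrative sketch; not part of arm_simd.h ---
// PMULL_00() computes a carry-less (GF(2)[x]) product of the two low 64-bit
// lanes. A portable reference of that operation, handy for testing, might
// look like the following. The name CLMUL64_Ref is hypothetical and assumes
// <stdint.h> has been included.
inline void CLMUL64_Ref(uint64_t a, uint64_t b, uint64_t& lo, uint64_t& hi)
{
    lo = hi = 0;
    for (unsigned int i = 0; i < 64; ++i)
    {
        if (b & ((uint64_t)1 << i))
        {
            lo ^= a << i;                  // bits 0..63 of a*x^i
            if (i) hi ^= a >> (64 - i);    // bits that spill past bit 63
        }
    }
    // Under this sketch, lane 0 of PMULL_00(x, y) equals 'lo' and lane 1
    // equals 'hi' when a = vgetq_lane_u64(x, 0) and b = vgetq_lane_u64(y, 0).
}
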
/// \brief Polynomial multiplication
/// \param a the first term
/// \param b the second term
/// \returns vector product
/// \details PMULL_01() performs polynomial multiplication and presents
/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x01)</tt>.
/// The <tt>0x01</tt> indicates the low 64-bits of <tt>a</tt> and high
/// 64-bits of <tt>b</tt> are multiplied.
/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
/// is MSB and numbered 127, while the rightmost bit is LSB and
/// numbered 0.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 0) };
    const __n64 y = { vgetq_lane_u64(b, 1) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t"
        :"=w" (r) : "w" (a), "w" (vget_high_u64(b)) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(a, 0),
        vgetq_lane_u64(b, 1)));
#endif
}

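// --- Editor's illustrative sketch; not part of arm_simd.h ---
// PMULL_01() pairs the low lane of 'a' with the high lane of 'b', so it can
// also be expressed through PMULL_00() by rotating b's lanes first. The name
// PMULL_01_ViaRotate is hypothetical.
inline uint64x2_t PMULL_01_ViaRotate(const uint64x2_t a, const uint64x2_t b)
{
    // vextq_u64(b, b, 1) swaps b's 64-bit lanes, making the high lane lane 0.
    return PMULL_00(a, vextq_u64(b, b, 1));
}
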
/// \brief Polynomial multiplication
/// \param a the first term
/// \param b the second term
/// \returns vector product
/// \details PMULL_10() performs polynomial multiplication and presents
/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x10)</tt>.
/// The <tt>0x10</tt> indicates the high 64-bits of <tt>a</tt> and low
/// 64-bits of <tt>b</tt> are multiplied.
/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
/// is MSB and numbered 127, while the rightmost bit is LSB and
/// numbered 0.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 1) };
    const __n64 y = { vgetq_lane_u64(b, 0) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t"
        :"=w" (r) : "w" (vget_high_u64(a)), "w" (b) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(a, 1),
        vgetq_lane_u64(b, 0)));
#endif
}

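// --- Editor's illustrative sketch; not part of arm_simd.h ---
// Symmetrically, PMULL_10() pairs the high lane of 'a' with the low lane of
// 'b', which matches PMULL_00() applied to a lane-rotated 'a'. The name
// PMULL_10_ViaRotate is hypothetical.
inline uint64x2_t PMULL_10_ViaRotate(const uint64x2_t a, const uint64x2_t b)
{
    // vextq_u64(a, a, 1) swaps a's 64-bit lanes, making the high lane lane 0.
    return PMULL_00(vextq_u64(a, a, 1), b);
}
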
/// \brief Polynomial multiplication
/// \param a the first term
/// \param b the second term
/// \returns vector product
/// \details PMULL_11() performs polynomial multiplication and presents
/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x11)</tt>.
/// The <tt>0x11</tt> indicates the high 64-bits of <tt>a</tt> and <tt>b</tt>
/// are multiplied.
/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
/// is MSB and numbered 127, while the rightmost bit is LSB and
/// numbered 0.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 1) };
    const __n64 y = { vgetq_lane_u64(b, 1) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm __volatile("pmull2 %0.1q, %1.2d, %2.2d \n\t"
        :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(a, 1),
        vgetq_lane_u64(b, 1)));
#endif
}

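// --- Editor's illustrative sketch; not part of arm_simd.h ---
// Together the four variants cover all four lane pairings, which is exactly
// what a schoolbook 128x128 -> 256-bit carry-less multiply needs (the core
// step of GHASH-style code). The name CLMUL128_Sketch is hypothetical.
inline void CLMUL128_Sketch(const uint64x2_t a, const uint64x2_t b,
    uint64x2_t& lo, uint64x2_t& mid, uint64x2_t& hi)
{
    lo  = PMULL_00(a, b);              // low(a)  * low(b)
    hi  = PMULL_11(a, b);              // high(a) * high(b)
    mid = veorq_u64(PMULL_01(a, b),    // low(a)  * high(b)
                    PMULL_10(a, b));   // high(a) * low(b)
    // A caller would fold 'mid' into the top half of 'lo' and the bottom half
    // of 'hi' (for example with VEXT_U8<8>() below) and then reduce modulo
    // the field polynomial.
}
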
/// \brief Vector extraction
/// \param a the first term
/// \param b the second term
/// \param c the byte count
/// \returns vector
/// \details VEXT_U8() extracts the first <tt>c</tt> bytes of vector
/// <tt>a</tt> and the remaining bytes in <tt>b</tt>.
/// \since Crypto++ 8.0
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c)
{
#if defined(_MSC_VER)
    return (uint64x2_t)vextq_u8(
        vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), c);
#else
    uint64x2_t r;
    __asm __volatile("ext %0.16b, %1.16b, %2.16b, %3 \n\t"
        :"=w" (r) : "w" (a), "w" (b), "I" (c) );
    return r;
#endif
}

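// --- Editor's illustrative sketch; not part of arm_simd.h ---
// A portable byte-level reference of the extraction: result byte i is taken
// from 'a' while i+c < 16 and from 'b' afterwards, matching vextq_u8(). The
// name VEXT_Ref is hypothetical and assumes <stdint.h> has been included.
inline void VEXT_Ref(const uint8_t a[16], const uint8_t b[16],
    unsigned int c, uint8_t r[16])
{
    for (unsigned int i = 0; i < 16; ++i)
        r[i] = (i + c < 16) ? a[i + c] : b[i + c - 16];
}
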
/// \brief Vector extraction
/// \tparam C the byte count
/// \param a the first term
/// \param b the second term
/// \returns vector
/// \details VEXT_U8() extracts the first <tt>C</tt> bytes of vector
/// <tt>a</tt> and the remaining bytes in <tt>b</tt>.
/// \since Crypto++ 8.0
template <unsigned int C>
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
{
    // https://github.com/weidai11/cryptopp/issues/366
#if defined(_MSC_VER)
    return (uint64x2_t)vextq_u8(
        vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), C);
#else
    uint64x2_t r;
    __asm __volatile("ext %0.16b, %1.16b, %2.16b, %3 \n\t"
        :"=w" (r) : "w" (a), "w" (b), "I" (C) );
    return r;
#endif
}

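// --- Editor's illustrative sketch; not part of arm_simd.h ---
// With C = 8 the template form pairs the high lane of 'a' with the low lane
// of 'b'. GHASH-style code typically uses it to fold the middle 128-bit
// product from a schoolbook multiply into the low and high halves. The name
// FoldMiddle_Sketch and the zero vector are the editor's assumptions.
inline void FoldMiddle_Sketch(uint64x2_t& lo, const uint64x2_t mid, uint64x2_t& hi)
{
    const uint64x2_t zero = vdupq_n_u64(0);
    lo = veorq_u64(lo, VEXT_U8<8>(zero, mid));  // xor (mid << 64) into the low 128 bits
    hi = veorq_u64(hi, VEXT_U8<8>(mid, zero));  // xor (mid >> 64) into the high 128 bits
}
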
#endif // CRYPTOPP_ARM_PMULL_AVAILABLE

#endif // CRYPTOPP_ARM_SIMD_H