use crate::soft::{x2, x4};
use crate::types::*;
use crate::vec128_storage;
use crate::x86_64::Avx2Machine;
use crate::x86_64::SseMachine as Machine86;
use crate::x86_64::{NoS3, NoS4, YesS3, YesS4};
use core::arch::x86_64::*;
use core::marker::PhantomData;
use core::ops::{
    Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not,
};
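// SSE2-backed SIMD vectors, written once and made generic over the available ISA
// extensions: the zero-sized marker parameters S3, S4, and NI record whether SSSE3,
// SSE4.1, and (presumably) the AES-NI extensions may be used, and the impls below
// specialize on them to pick the cheapest instruction sequence.

// impl_binop!/impl_binop_assign! wire a core::ops operator trait (and its *Assign
// form) to the corresponding 128-bit intrinsic.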
macro_rules! impl_binop {
    ($vec:ident, $trait:ident, $fn:ident, $impl_fn:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn $fn(self, rhs: Self) -> Self::Output {
                Self::new(unsafe { $impl_fn(self.x, rhs.x) })
            }
        }
    };
}

macro_rules! impl_binop_assign {
    ($vec:ident, $trait:ident, $fn_assign:ident, $fn:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI>
        where
            $vec<S3, S4, NI>: Copy,
        {
            #[inline(always)]
            fn $fn_assign(&mut self, rhs: Self) {
                *self = self.$fn(rhs);
            }
        }
    };
}
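// def_vec! declares one 128-bit vector type: a raw __m128i plus the zero-sized
// extension markers, together with storage packing/unpacking, endian loads and
// stores, Default, Not, and the bitwise operators.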
macro_rules! def_vec {
    ($vec:ident, $word:ident) => {
        #[allow(non_camel_case_types)]
        #[derive(Copy, Clone)]
        pub struct $vec<S3, S4, NI> {
            x: __m128i,
            s3: PhantomData<S3>,
            s4: PhantomData<S4>,
            ni: PhantomData<NI>,
        }

        impl<S3, S4, NI> Store<vec128_storage> for $vec<S3, S4, NI> {
            #[inline(always)]
            unsafe fn unpack(x: vec128_storage) -> Self {
                Self::new(x.sse2)
            }
        }
        impl<S3, S4, NI> From<$vec<S3, S4, NI>> for vec128_storage {
            #[inline(always)]
            fn from(x: $vec<S3, S4, NI>) -> Self {
                vec128_storage { sse2: x.x }
            }
        }
        impl<S3, S4, NI> $vec<S3, S4, NI> {
            #[inline(always)]
            fn new(x: __m128i) -> Self {
                $vec {
                    x,
                    s3: PhantomData,
                    s4: PhantomData,
                    ni: PhantomData,
                }
            }
        }

        impl<S3, S4, NI> StoreBytes for $vec<S3, S4, NI>
        where
            Self: BSwap,
        {
            #[inline(always)]
            unsafe fn unsafe_read_le(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _))
            }
            #[inline(always)]
            unsafe fn unsafe_read_be(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _)).bswap()
            }
            #[inline(always)]
            fn write_le(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut _, self.x) }
            }
            #[inline(always)]
            fn write_be(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                let x = self.bswap().x;
                unsafe {
                    _mm_storeu_si128(out.as_mut_ptr() as *mut _, x);
                }
            }
        }

        impl<S3, S4, NI> Default for $vec<S3, S4, NI> {
            #[inline(always)]
            fn default() -> Self {
                Self::new(unsafe { _mm_setzero_si128() })
            }
        }

        impl<S3, S4, NI> Not for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn not(self) -> Self::Output {
                unsafe {
                    let ff = _mm_set1_epi64x(-1i64);
                    self ^ Self::new(ff)
                }
            }
        }

        impl<S3: Copy, S4: Copy, NI: Copy> BitOps0 for $vec<S3, S4, NI> {}
        impl_binop!($vec, BitAnd, bitand, _mm_and_si128);
        impl_binop!($vec, BitOr, bitor, _mm_or_si128);
        impl_binop!($vec, BitXor, bitxor, _mm_xor_si128);
        impl_binop_assign!($vec, BitAndAssign, bitand_assign, bitand);
        impl_binop_assign!($vec, BitOrAssign, bitor_assign, bitor);
        impl_binop_assign!($vec, BitXorAssign, bitxor_assign, bitxor);
        impl<S3: Copy, S4: Copy, NI: Copy> AndNot for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn andnot(self, rhs: Self) -> Self {
                Self::new(unsafe { _mm_andnot_si128(self.x, rhs.x) })
            }
        }
    };
}

macro_rules! impl_bitops32 {
    ($vec:ident) => {
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps32 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord32
        {
        }
    };
}

macro_rules! impl_bitops64 {
    ($vec:ident) => {
        impl_bitops32!($vec);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps64 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord64 + RotateEachWord32
        {
        }
    };
}

macro_rules! impl_bitops128 {
    ($vec:ident) => {
        impl_bitops64!($vec);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps128 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord128
        {
        }
    };
}
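// 32-bit lane rotations. Under SSSE3, a rotation by a whole number of bytes is a
// single pshufb with a per-byte source index (rotr_32_s3); rotr_32 is the generic
// two-shifts-and-an-OR form used for the remaining distances.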
macro_rules! rotr_32_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
macro_rules! rotr_32 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi32(self.x, $i as i32),
                    _mm_slli_epi32(self.x, 32 - $i as i32),
                )
            })
        }
    };
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32_s3!(
        rotate_each_word_right8,
        0x0c0f0e0d_080b0a09,
        0x04070605_00030201
    );
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    rotr_32_s3!(
        rotate_each_word_right16,
        0x0d0c0f0e_09080b0a,
        0x05040706_01000302
    );
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32_s3!(
        rotate_each_word_right24,
        0x0e0d0c0f_0a09080b,
        0x06050407_02010003
    );
    rotr_32!(rotate_each_word_right25, 25);
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32!(rotate_each_word_right8, 8);
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    #[inline(always)]
    fn rotate_each_word_right16(self) -> Self {
        Self::new(swap16_s2(self.x))
    }
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32!(rotate_each_word_right24, 24);
    rotr_32!(rotate_each_word_right25, 25);
}
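// The 64-bit lane rotations follow the same pattern: pshufb for byte-multiple
// distances under SSSE3, shift-and-OR otherwise.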
macro_rules! rotr_64_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
macro_rules! rotr_64 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi64(self.x, $i as i32),
                    _mm_slli_epi64(self.x, 64 - $i as i32),
                )
            })
        }
    };
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64_s3!(
        rotate_each_word_right8,
        0x080f_0e0d_0c0b_0a09,
        0x0007_0605_0403_0201
    );
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    rotr_64_s3!(
        rotate_each_word_right16,
        0x0908_0f0e_0d0c_0b0a,
        0x0100_0706_0504_0302
    );
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64_s3!(
        rotate_each_word_right24,
        0x0a09_080f_0e0d_0c0b,
        0x0201_0007_0605_0403
    );
    rotr_64!(rotate_each_word_right25, 25);
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<NoS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64!(rotate_each_word_right8, 8);
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    // swap16_s2 swaps adjacent 16-bit words, which is not a 16-bit rotation of each
    // 64-bit lane; use the generic shift-and-OR rotation instead.
    rotr_64!(rotate_each_word_right16, 16);
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64!(rotate_each_word_right24, 24);
    rotr_64!(rotate_each_word_right25, 25);
}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    fn rotate_each_word_right32(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b10110001) })
    }
}
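// Rotation of the whole 128-bit word. SSE2 has no 128-bit shift by bits (the si128
// shifts move whole bytes), so the wrapped-around bits are taken from the
// lane-swapped value instead.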
macro_rules! rotr_128 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                // (x >> i) | (x << (128 - i)): shift both 64-bit halves right, then
                // OR in the bits that cross halves, taken from the half-swapped
                // value shifted left.
                let swapped = _mm_shuffle_epi32(self.x, 0b0100_1110);
                _mm_or_si128(
                    _mm_srli_epi64(self.x, $i as i32),
                    _mm_slli_epi64(swapped, 64 - $i as i32),
                )
            })
        }
    };
}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord32 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right7, 7);
    rotr_128!(rotate_each_word_right8, 8);
    rotr_128!(rotate_each_word_right11, 11);
    rotr_128!(rotate_each_word_right12, 12);
    rotr_128!(rotate_each_word_right16, 16);
    rotr_128!(rotate_each_word_right20, 20);
    rotr_128!(rotate_each_word_right24, 24);
    rotr_128!(rotate_each_word_right25, 25);
}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right32, 32);
}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord128 for u128x1_sse2<S3, S4, NI> {}

def_vec!(u32x4_sse2, u32);
def_vec!(u64x2_sse2, u64);
def_vec!(u128x1_sse2, u128);
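// MultiLane converts between a vector and the array of its lanes. With SSE4.1 the
// 64-bit halves move via _mm_extract_epi64/_mm_insert_epi64 (pextrq/pinsrq);
// without it they are assembled from movq, shuffles, and byte shifts.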
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_extract_epi64(self.x, 1) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            let mut x = _mm_cvtsi64_si128((xs[0] as u64 | ((xs[1] as u64) << 32)) as i64);
            x = _mm_insert_epi64(x, (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            let x = (xs[0] as u64 | ((xs[1] as u64) << 32)) as i64;
            let y = (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64;
            let x = _mm_cvtsi64_si128(x);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(y), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_extract_epi64(self.x, 1) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let mut x = _mm_cvtsi64_si128(xs[0] as i64);
            x = _mm_insert_epi64(x, xs[1] as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_cvtsi128_si64(_mm_srli_si128(self.x, 8)) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let x = _mm_cvtsi64_si128(xs[0] as i64);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(xs[1] as i64), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u128; 1] {
        unimplemented!()
    }
    #[inline(always)]
    fn from_lanes(xs: [u128; 1]) -> Self {
        unimplemented!("{:?}", xs)
    }
}

impl<S3, S4, NI> MultiLane<[u64; 4]> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: MultiLane<[u64; 2]> + Copy,
{
    #[inline(always)]
    fn to_lanes(self) -> [u64; 4] {
        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
        [a[0], a[1], b[0], b[1]]
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 4]) -> Self {
        let (a, b) = (
            u64x2_sse2::from_lanes([xs[0], xs[1]]),
            u64x2_sse2::from_lanes([xs[2], xs[3]]),
        );
        x2::new([a, b])
    }
}
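// u128x1 wraps the same __m128i as the narrower vectors, so converting to u32x4 or
// u64x2 is a free reinterpretation of the register.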
macro_rules! impl_into {
    ($from:ident, $to:ident) => {
        impl<S3, S4, NI> From<$from<S3, S4, NI>> for $to<S3, S4, NI> {
            #[inline(always)]
            fn from(x: $from<S3, S4, NI>) -> Self {
                $to::new(x.x)
            }
        }
    };
}

impl_into!(u128x1_sse2, u32x4_sse2);
impl_into!(u128x1_sse2, u64x2_sse2);

impl_bitops32!(u32x4_sse2);
impl_bitops64!(u64x2_sse2);
impl_bitops128!(u128x1_sse2);

impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u32x4_sse2<S3, S4, NI> where
    u32x4_sse2<S3, S4, NI>: BSwap
{
}
impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u64x2_sse2<S3, S4, NI> where
    u64x2_sse2<S3, S4, NI>: BSwap
{
}
impl_binop!(u32x4_sse2, Add, add, _mm_add_epi32);
impl_binop!(u64x2_sse2, Add, add, _mm_add_epi64);
impl_binop_assign!(u32x4_sse2, AddAssign, add_assign, add);
impl_binop_assign!(u64x2_sse2, AddAssign, add_assign, add);

impl<S3: Copy, S4: Copy, NI: Copy> u32x4<Machine86<S3, S4, NI>> for u32x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2<Machine86<S3, S4, NI>> for u64x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x1<Machine86<S3, S4, NI>> for u128x1_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}

impl<NI: Copy> u32x4<Avx2Machine<NI>> for u32x4_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u64x2<Avx2Machine<NI>> for u64x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u128x1<Avx2Machine<NI>> for u128x1_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<YesS3, YesS4, NI>: Machine,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u32x4>,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u64x2>,
{
}

impl<S3, S4, NI> UnsafeFrom<[u32; 4]> for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u32; 4]) -> Self {
        Self::new(_mm_set_epi32(
            xs[3] as i32,
            xs[2] as i32,
            xs[1] as i32,
            xs[0] as i32,
        ))
    }
}

impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, YesS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi32(self.x, v as i32, 0),
                1 => _mm_insert_epi32(self.x, v as i32, 1),
                2 => _mm_insert_epi32(self.x, v as i32, 2),
                3 => _mm_insert_epi32(self.x, v as i32, 3),
                _ => unreachable!(),
            }
        })
    }
}
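// Without SSE4.1 there is no pinsrd: insert() permutes the target lane to the top,
// left-shifts it off the end (freeing lane 0), ORs the new value into lane 0, and
// then permutes the lanes back into order.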
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, NoS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => {
                    let x = _mm_andnot_si128(_mm_cvtsi32_si128(-1), self.x);
                    _mm_or_si128(x, _mm_cvtsi32_si128(v as i32))
                }
                1 => {
                    let mut x = _mm_shuffle_epi32(self.x, 0b0111_1000);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1110_0001)
                }
                2 => {
                    let mut x = _mm_shuffle_epi32(self.x, 0b1011_0100);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1100_1001)
                }
                3 => {
                    let mut x = _mm_slli_si128(self.x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b0011_1001)
                }
                _ => unreachable!(),
            }
        })
    }
}

impl<S3, S4, NI> LaneWords4 for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn shuffle_lane_words2301(self) -> Self {
        self.shuffle2301()
    }
    #[inline(always)]
    fn shuffle_lane_words1230(self) -> Self {
        self.shuffle1230()
    }
    #[inline(always)]
    fn shuffle_lane_words3012(self) -> Self {
        self.shuffle3012()
    }
}

impl<S3, S4, NI> Words4 for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1001_0011) })
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0011_1001) })
    }
}
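// Lane shuffles across a pair of 128-bit registers. With SSSE3, palignr
// (_mm_alignr_epi8) stitches the two halves together in one instruction; the SSE2
// fallback rebuilds each half from byte shifts and ORs.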
impl<S4, NI> Words4 for u64x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
            ])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
            ])
        }
    }
}
impl<S4, NI> Words4 for u64x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        unsafe {
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(da), u64x2_sse2::new(bc)])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        unsafe {
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(bc), u64x2_sse2::new(da)])
        }
    }
}

impl<S3, S4, NI> UnsafeFrom<[u64; 2]> for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u64; 2]) -> Self {
        Self::new(_mm_set_epi64x(xs[1] as i64, xs[0] as i64))
    }
}

impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                1 => _mm_extract_epi64(self.x, 1) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi64(self.x, x as i64, 0),
                1 => _mm_insert_epi64(self.x, x as i64, 1),
                _ => unreachable!(),
            }
        })
    }
}
impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                1 => _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_or_si128(
                    _mm_andnot_si128(_mm_cvtsi64_si128(-1), self.x),
                    _mm_cvtsi64_si128(x as i64),
                ),
                1 => _mm_or_si128(
                    _mm_move_epi64(self.x),
                    _mm_slli_si128(_mm_cvtsi64_si128(x as i64), 8),
                ),
                _ => unreachable!(),
            }
        })
    }
}

impl<S4, NI> BSwap for u32x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
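// SSE2 byte swap for 32-bit lanes: zero-extend each byte to 16 bits, reverse each
// group of four with 16-bit shuffles, and pack back down. Every 16-bit lane holds a
// value <= 255, so the saturating pack is lossless.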
#[inline(always)]
fn bswap32_s2(x: __m128i) -> __m128i {
    unsafe {
        let mut y = _mm_unpacklo_epi8(x, _mm_setzero_si128());
        y = _mm_shufflehi_epi16(y, 0b0001_1011);
        y = _mm_shufflelo_epi16(y, 0b0001_1011);
        let mut z = _mm_unpackhi_epi8(x, _mm_setzero_si128());
        z = _mm_shufflehi_epi16(z, 0b0001_1011);
        z = _mm_shufflelo_epi16(z, 0b0001_1011);
        _mm_packus_epi16(y, z)
    }
}
impl<S4, NI> BSwap for u32x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(bswap32_s2(self.x))
    }
}

impl<S4, NI> BSwap for u64x2_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u64x2_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe { bswap32_s2(_mm_shuffle_epi32(self.x, 0b1011_0001)) })
    }
}
impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            // Control bytes 0x0f down to 0x00: reverse all sixteen bytes (the
            // ascending 0x00..0x0f constant would be the identity shuffle).
            let k = _mm_set_epi64x(0x0001_0203_0405_0607, 0x0809_0a0b_0c0d_0e0f);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        unimplemented!()
    }
}
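// swapi! exchanges adjacent bit groups of width $i using the byte mask $k (e.g.
// swap1 with 0xaa swaps even and odd bits); swap16_s2 swaps adjacent 16-bit words
// using only SSE2 shuffles.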
macro_rules! swapi {
    ($x:expr, $i:expr, $k:expr) => {
        unsafe {
            const K: u8 = $k;
            let k = _mm_set1_epi8(K as i8);
            u128x1_sse2::new(_mm_or_si128(
                _mm_srli_epi16(_mm_and_si128($x.x, k), $i),
                _mm_and_si128(_mm_slli_epi16($x.x, $i), k),
            ))
        }
    };
}
#[inline(always)]
fn swap16_s2(x: __m128i) -> __m128i {
    unsafe { _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0b1011_0001), 0b1011_0001) }
}
impl<S4, NI> Swap64 for u128x1_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        u128x1_sse2::new(unsafe {
            let k = _mm_set_epi64x(0x0e0f_0c0d_0a0b_0809, 0x0607_0405_0203_0001);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        u128x1_sse2::new(unsafe {
            let k = _mm_set_epi64x(0x0d0c_0f0e_0908_0b0a, 0x0504_0706_0100_0302);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}
impl<S4, NI> Swap64 for u128x1_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        u128x1_sse2::new(unsafe {
            _mm_or_si128(_mm_slli_epi16(self.x, 8), _mm_srli_epi16(self.x, 8))
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        u128x1_sse2::new(swap16_s2(self.x))
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}

#[derive(Copy, Clone)]
pub struct G0;
#[derive(Copy, Clone)]
pub struct G1;

#[allow(non_camel_case_types)]
pub type u32x4x2_sse2<S3, S4, NI> = x2<u32x4_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x2x2_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x4_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G1>;
#[allow(non_camel_case_types)]
pub type u128x2_sse2<S3, S4, NI> = x2<u128x1_sse2<S3, S4, NI>, G0>;

#[allow(non_camel_case_types)]
pub type u32x4x4_sse2<S3, S4, NI> = x4<u32x4_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>;

impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u32x4>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x2<Machine86<S3, S4, NI>> for u64x2x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x4<Machine86<S3, S4, NI>> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x4_sse2<S3, S4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x2<Machine86<S3, S4, NI>> for u128x2_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x4>,
{
}

impl<NI: Copy> u32x4x2<Avx2Machine<NI>> for u32x4x2_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u32x4x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u32x4>,
{
}
impl<NI: Copy> u64x2x2<Avx2Machine<NI>> for u64x2x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u64x4<Avx2Machine<NI>> for u64x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x4_sse2<YesS3, YesS4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<NI: Copy> u128x2<Avx2Machine<NI>> for u128x2_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x4>,
{
}

impl<S3, S4, NI> Vec4<u64> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: Copy + Vec2<u64>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        match i {
            0 => self.0[0].extract(0),
            1 => self.0[0].extract(1),
            2 => self.0[1].extract(0),
            3 => self.0[1].extract(1),
            _ => panic!(),
        }
    }
    #[inline(always)]
    fn insert(mut self, w: u64, i: u32) -> Self {
        match i {
            0 => self.0[0] = self.0[0].insert(w, 0),
            1 => self.0[0] = self.0[0].insert(w, 1),
            2 => self.0[1] = self.0[1].insert(w, 0),
            3 => self.0[1] = self.0[1].insert(w, 1),
            _ => panic!(),
        };
        self
    }
}

impl<S3: Copy, S4: Copy, NI: Copy> u32x4x4<Machine86<S3, S4, NI>> for u32x4x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>,
    u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x4<Machine86<S3, S4, NI>> for u128x4_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x4>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x4>,
{
}

impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u32x4x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 4]>,
    u32x4x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u32x4>,
{
}
impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u128x4<Avx2Machine<NI>> for u128x4_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x4>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x4>,
{
}

macro_rules! impl_into_x {
    ($from:ident, $to:ident) => {
        impl<S3: Copy, S4: Copy, NI: Copy, Gf, Gt> From<x2<$from<S3, S4, NI>, Gf>>
            for x2<$to<S3, S4, NI>, Gt>
        {
            #[inline(always)]
            fn from(x: x2<$from<S3, S4, NI>, Gf>) -> Self {
                x2::new([$to::from(x.0[0]), $to::from(x.0[1])])
            }
        }
        impl<S3: Copy, S4: Copy, NI: Copy> From<x4<$from<S3, S4, NI>>> for x4<$to<S3, S4, NI>> {
            #[inline(always)]
            fn from(x: x4<$from<S3, S4, NI>>) -> Self {
                x4::new([
                    $to::from(x.0[0]),
                    $to::from(x.0[1]),
                    $to::from(x.0[2]),
                    $to::from(x.0[3]),
                ])
            }
        }
    };
}
impl_into_x!(u128x1_sse2, u64x2_sse2);
impl_into_x!(u128x1_sse2, u32x4_sse2);

use core::fmt::{Debug, Formatter, Result};

impl<W: PartialEq, G> PartialEq for x2<W, G> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        self.0[0] == rhs.0[0] && self.0[1] == rhs.0[1]
    }
}
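// Whole-register equality. eq128_s4 uses the SSE4.1 64-bit compare and a shuffle
// that lets a single 64-bit read test both lane masks; eq128_s2 needs only SSE2,
// ANDing the two halves of a 32-bit lanewise compare.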
#[allow(unused)]
#[inline(always)]
unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool {
    let q = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110);
    _mm_cvtsi128_si64(q) == -1
}

#[inline(always)]
unsafe fn eq128_s2(x: __m128i, y: __m128i) -> bool {
    let q = _mm_cmpeq_epi32(x, y);
    let p = _mm_cvtsi128_si64(_mm_srli_si128(q, 8));
    let q = _mm_cvtsi128_si64(q);
    (p & q) == -1
}

impl<S3, S4, NI> PartialEq for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
impl<S3, S4, NI> Debug for u32x4_sse2<S3, S4, NI>
where
    Self: Copy + MultiLane<[u32; 4]>,
{
    #[cold]
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        fmt.write_fmt(format_args!("{:08x?}", &self.to_lanes()))
    }
}

impl<S3, S4, NI> PartialEq for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
impl<S3, S4, NI> Debug for u64x2_sse2<S3, S4, NI>
where
    Self: Copy + MultiLane<[u64; 2]>,
{
    #[cold]
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        fmt.write_fmt(format_args!("{:016x?}", &self.to_lanes()))
    }
}

impl<S3, S4, NI> Debug for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: Copy + MultiLane<[u64; 2]>,
{
    #[cold]
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
        fmt.write_fmt(format_args!("{:016x?}", &[a[0], a[1], b[0], b[1]]))
    }
}

#[cfg(test)]
#[cfg(target_arch = "x86_64")]
mod test {
    use super::*;
    use crate::x86_64::{SSE2, SSE41, SSSE3};
    use crate::Machine;

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_bswap32_s2_vs_s3() {
        let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100];
        let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.bswap()
        };

        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.bswap()
        };

        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
        assert_eq!(x_s2, s2.vec(ys));
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_bswap64_s2_vs_s3() {
        let xs = [0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100];
        let ys = [0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
            x_s2.bswap()
        };

        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
            x_s3.bswap()
        };

        assert_eq!(x_s2, s2.vec(ys));
        // Cross-check the SSSE3 result against the SSE2 result.
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_shuffle32_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.shuffle2301()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.shuffle2301()
        };
        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.shuffle3012()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.shuffle3012()
        };
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_shuffle64_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
            x_s2.shuffle2301()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
            x_s3.shuffle2301()
        };
        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
            x_s2.shuffle3012()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
            x_s3.shuffle3012()
        };
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
    }

    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    #[test]
    fn test_lanes_u32x4() {
        let xs = [0x1, 0x2, 0x3, 0x4];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            let y_s2 = <SSE2 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s2, y_s2);
            assert_eq!(xs, y_s2.to_lanes());
        }

        {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            let y_s3 = <SSSE3 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s3, y_s3);
            assert_eq!(xs, y_s3.to_lanes());
        }

        {
            let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
            let y_s4 = <SSE41 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s4, y_s4);
            assert_eq!(xs, y_s4.to_lanes());
        }
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_lanes_u64x2() {
        let xs = [0x1, 0x2];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        {
            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
            let y_s2 = <SSE2 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s2, y_s2);
            assert_eq!(xs, y_s2.to_lanes());
        }

        {
            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
            let y_s3 = <SSSE3 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s3, y_s3);
            assert_eq!(xs, y_s3.to_lanes());
        }

        {
            let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
            let y_s4 = <SSE41 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s4, y_s4);
            assert_eq!(xs, y_s4.to_lanes());
        }
    }

    #[test]
    fn test_vec4_u32x4_s2() {
        let xs = [1, 2, 3, 4];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.extract(2), 3);
        assert_eq!(x_s2.extract(3), 4);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 2), s2.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s2.insert(0xf, 3), s2.vec([1, 2, 3, 0xf]));
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_vec4_u32x4_s4() {
        let xs = [1, 2, 3, 4];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.extract(2), 3);
        assert_eq!(x_s4.extract(3), 4);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 2), s4.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s4.insert(0xf, 3), s4.vec([1, 2, 3, 0xf]));
    }

    #[test]
    fn test_vec2_u64x2_s2() {
        let xs = [0x1, 0x2];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf]));
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_vec4_u64x2_s4() {
        let xs = [0x1, 0x2];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf]));
    }
}

pub mod avx2 {
    #![allow(non_camel_case_types)]
    use crate::soft::x4;
    use crate::types::*;
    use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2};
    use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4};
    use core::arch::x86_64::*;
    use core::marker::PhantomData;
    use core::ops::*;
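    // Four u32x4 lanes packed into a pair of 256-bit registers. AVX2 implies SSSE3
    // and SSE4.1, so the single-vector lane type is u32x4_sse2<YesS3, YesS4, NI>.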
    #[derive(Copy, Clone)]
    pub struct u32x4x4_avx2<NI> {
        x: [__m256i; 2],
        ni: PhantomData<NI>,
    }

    impl<NI> u32x4x4_avx2<NI> {
        #[inline(always)]
        fn new(x: [__m256i; 2]) -> Self {
            Self { x, ni: PhantomData }
        }
    }

    impl<NI> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> where NI: Copy {}
    impl<NI> Store<vec512_storage> for u32x4x4_avx2<NI> {
        #[inline(always)]
        unsafe fn unpack(p: vec512_storage) -> Self {
            Self::new([p.avx[0].avx, p.avx[1].avx])
        }
    }
    impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> {
        #[inline(always)]
        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] {
            unsafe {
                [
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
                ]
            }
        }
        #[inline(always)]
        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self {
            Self::new(unsafe {
                [
                    _mm256_setr_m128i(x[0].x, x[1].x),
                    _mm256_setr_m128i(x[2].x, x[3].x),
                ]
            })
        }
    }
    impl<NI> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
        #[inline(always)]
        fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
            unsafe {
                match i {
                    0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
                    1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
                    2 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
                    3 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
                    _ => panic!(),
                }
            }
        }
        #[inline(always)]
        fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
            Self::new(unsafe {
                match i {
                    0 => [_mm256_inserti128_si256(self.x[0], w.x, 0), self.x[1]],
                    1 => [_mm256_inserti128_si256(self.x[0], w.x, 1), self.x[1]],
                    2 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 0)],
                    3 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 1)],
                    _ => panic!(),
                }
            })
        }
    }
    impl<NI> LaneWords4 for u32x4x4_avx2<NI> {
        #[inline(always)]
        fn shuffle_lane_words1230(self) -> Self {
            Self::new(unsafe {
                [
                    _mm256_shuffle_epi32(self.x[0], 0b1001_0011),
                    _mm256_shuffle_epi32(self.x[1], 0b1001_0011),
                ]
            })
        }
        #[inline(always)]
        fn shuffle_lane_words2301(self) -> Self {
            Self::new(unsafe {
                [
                    _mm256_shuffle_epi32(self.x[0], 0b0100_1110),
                    _mm256_shuffle_epi32(self.x[1], 0b0100_1110),
                ]
            })
        }
        #[inline(always)]
        fn shuffle_lane_words3012(self) -> Self {
            Self::new(unsafe {
                [
                    _mm256_shuffle_epi32(self.x[0], 0b0011_1001),
                    _mm256_shuffle_epi32(self.x[1], 0b0011_1001),
                ]
            })
        }
    }
    impl<NI> BitOps32 for u32x4x4_avx2<NI> where NI: Copy {}
    impl<NI> ArithOps for u32x4x4_avx2<NI> where NI: Copy {}
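    // AVX2 versions of the rotation helpers: shuf_lane_bytes broadcasts one pshufb
    // control to both 128-bit halves of each register; rotr_32 is the shift-and-OR
    // fallback for distances that are not a whole number of bytes.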
    macro_rules! shuf_lane_bytes {
        ($name:ident, $k0:expr, $k1:expr) => {
            #[inline(always)]
            fn $name(self) -> Self {
                Self::new(unsafe {
                    [
                        _mm256_shuffle_epi8(self.x[0], _mm256_set_epi64x($k0, $k1, $k0, $k1)),
                        _mm256_shuffle_epi8(self.x[1], _mm256_set_epi64x($k0, $k1, $k0, $k1)),
                    ]
                })
            }
        };
    }
    macro_rules! rotr_32 {
        ($name:ident, $i:expr) => {
            #[inline(always)]
            fn $name(self) -> Self {
                Self::new(unsafe {
                    [
                        _mm256_or_si256(
                            _mm256_srli_epi32(self.x[0], $i as i32),
                            _mm256_slli_epi32(self.x[0], 32 - $i as i32),
                        ),
                        _mm256_or_si256(
                            _mm256_srli_epi32(self.x[1], $i as i32),
                            _mm256_slli_epi32(self.x[1], 32 - $i as i32),
                        ),
                    ]
                })
            }
        };
    }
    impl<NI: Copy> RotateEachWord32 for u32x4x4_avx2<NI> {
        rotr_32!(rotate_each_word_right7, 7);
        shuf_lane_bytes!(
            rotate_each_word_right8,
            0x0c0f0e0d_080b0a09,
            0x04070605_00030201
        );
        rotr_32!(rotate_each_word_right11, 11);
        rotr_32!(rotate_each_word_right12, 12);
        shuf_lane_bytes!(
            rotate_each_word_right16,
            0x0d0c0f0e_09080b0a,
            0x05040706_01000302
        );
        rotr_32!(rotate_each_word_right20, 20);
        shuf_lane_bytes!(
            rotate_each_word_right24,
            0x0e0d0c0f_0a09080b,
            0x06050407_02010003
        );
        rotr_32!(rotate_each_word_right25, 25);
    }
    impl<NI> BitOps0 for u32x4x4_avx2<NI> where NI: Copy {}
    impl<NI> From<u32x4x4_avx2<NI>> for vec512_storage {
        #[inline(always)]
        fn from(x: u32x4x4_avx2<NI>) -> Self {
            Self {
                avx: [
                    vec256_storage { avx: x.x[0] },
                    vec256_storage { avx: x.x[1] },
                ],
            }
        }
    }

    macro_rules! impl_assign {
        ($vec:ident, $Assign:ident, $assign_fn:ident, $bin_fn:ident) => {
            impl<NI> $Assign for $vec<NI>
            where
                NI: Copy,
            {
                #[inline(always)]
                fn $assign_fn(&mut self, rhs: Self) {
                    *self = self.$bin_fn(rhs);
                }
            }
        };
    }
    impl_assign!(u32x4x4_avx2, BitXorAssign, bitxor_assign, bitxor);
    impl_assign!(u32x4x4_avx2, BitOrAssign, bitor_assign, bitor);
    impl_assign!(u32x4x4_avx2, BitAndAssign, bitand_assign, bitand);
    impl_assign!(u32x4x4_avx2, AddAssign, add_assign, add);

    macro_rules! impl_bitop_x2 {
        ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => {
            impl<NI> $Op for $vec<NI> {
                type Output = Self;
                #[inline(always)]
                fn $op_fn(self, rhs: Self) -> Self::Output {
                    Self::new(unsafe {
                        [$impl_fn(self.x[0], rhs.x[0]), $impl_fn(self.x[1], rhs.x[1])]
                    })
                }
            }
        };
    }
    impl_bitop_x2!(u32x4x4_avx2, BitXor, bitxor, _mm256_xor_si256);
    impl_bitop_x2!(u32x4x4_avx2, BitOr, bitor, _mm256_or_si256);
    impl_bitop_x2!(u32x4x4_avx2, BitAnd, bitand, _mm256_and_si256);
    impl_bitop_x2!(u32x4x4_avx2, AndNot, andnot, _mm256_andnot_si256);
    impl_bitop_x2!(u32x4x4_avx2, Add, add, _mm256_add_epi32);
    impl<NI> Not for u32x4x4_avx2<NI> {
        type Output = Self;
        #[inline(always)]
        fn not(self) -> Self::Output {
            unsafe {
                // XOR with all-ones complements every bit.
                let f = _mm256_set1_epi64x(-1);
                Self::new([f, f]) ^ self
            }
        }
    }

    impl<NI> BSwap for u32x4x4_avx2<NI> {
        shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
    }

    impl<NI> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI>
    where
        NI: Copy,
    {
        #[inline(always)]
        fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self {
            Self::new(unsafe {
                [
                    _mm256_setr_m128i(x.0[0].x, x.0[1].x),
                    _mm256_setr_m128i(x.0[2].x, x.0[3].x),
                ]
            })
        }
    }
}