// ppv_lite86/x86_64/sse2.rs

1use crate::soft::{x2, x4};
2use crate::types::*;
3use crate::vec128_storage;
4use crate::x86_64::Avx2Machine;
5use crate::x86_64::SseMachine as Machine86;
6use crate::x86_64::{NoS3, NoS4, YesS3, YesS4};
7use core::arch::x86_64::*;
8use core::marker::PhantomData;
9use core::ops::{
10    Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not,
11};
12
/// Generates a binary-operator trait impl (`Add`, `BitAnd`, …) for one of the
/// SSE2 wrapper types by delegating to a single intrinsic on the raw `__m128i`.
macro_rules! impl_binop {
    ($vec:ident, $trait:ident, $fn:ident, $impl_fn:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn $fn(self, rhs: Self) -> Self::Output {
                Self::new(unsafe { $impl_fn(self.x, rhs.x) })
            }
        }
    };
}

/// Generates the matching `*Assign` impl in terms of the plain operator:
/// `a op= b` becomes `a = a.op(b)`.
macro_rules! impl_binop_assign {
    ($vec:ident, $trait:ident, $fn_assign:ident, $fn:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI>
        where
            $vec<S3, S4, NI>: Copy,
        {
            #[inline(always)]
            fn $fn_assign(&mut self, rhs: Self) {
                *self = self.$fn(rhs);
            }
        }
    };
}
38
/// Defines one 128-bit SSE2 vector wrapper (`u32x4_sse2`, `u64x2_sse2`,
/// `u128x1_sse2`) plus the impls every wrapper shares: storage packing,
/// unaligned byte loads/stores, `Default` (zero), `Not`, and bitwise ops.
///
/// `S3`/`S4`/`NI` are zero-sized markers threading the machine's feature
/// level through the type (`YesS3`/`NoS3` select SSSE3 code paths,
/// `YesS4`/`NoS4` select SSE4.1; `NI` presumably marks AES-NI — defined
/// elsewhere in the crate). `$word` names the lane type but is unused by
/// the expansion.
macro_rules! def_vec {
    ($vec:ident, $word:ident) => {
        #[allow(non_camel_case_types)]
        #[derive(Copy, Clone)]
        pub struct $vec<S3, S4, NI> {
            x: __m128i,
            s3: PhantomData<S3>,
            s4: PhantomData<S4>,
            ni: PhantomData<NI>,
        }

        impl<S3, S4, NI> Store<vec128_storage> for $vec<S3, S4, NI> {
            #[inline(always)]
            unsafe fn unpack(x: vec128_storage) -> Self {
                // Caller guarantees the storage union holds an SSE2 value.
                Self::new(x.sse2)
            }
        }
        impl<S3, S4, NI> From<$vec<S3, S4, NI>> for vec128_storage {
            #[inline(always)]
            fn from(x: $vec<S3, S4, NI>) -> Self {
                vec128_storage { sse2: x.x }
            }
        }
        impl<S3, S4, NI> $vec<S3, S4, NI> {
            /// Wraps a raw `__m128i`, fabricating the phantom markers.
            #[inline(always)]
            fn new(x: __m128i) -> Self {
                $vec {
                    x,
                    s3: PhantomData,
                    s4: PhantomData,
                    ni: PhantomData,
                }
            }
        }

        impl<S3, S4, NI> StoreBytes for $vec<S3, S4, NI>
        where
            Self: BSwap,
        {
            #[inline(always)]
            unsafe fn unsafe_read_le(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                // Unaligned load; x86 lanes are natively little-endian.
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _))
            }
            #[inline(always)]
            unsafe fn unsafe_read_be(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                // Big-endian input: load, then byte-swap each lane.
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _)).bswap()
            }
            #[inline(always)]
            fn write_le(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut _, self.x) }
            }
            #[inline(always)]
            fn write_be(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                let x = self.bswap().x;
                unsafe {
                    _mm_storeu_si128(out.as_mut_ptr() as *mut _, x);
                }
            }
        }

        impl<S3, S4, NI> Default for $vec<S3, S4, NI> {
            #[inline(always)]
            fn default() -> Self {
                Self::new(unsafe { _mm_setzero_si128() })
            }
        }

        impl<S3, S4, NI> Not for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn not(self) -> Self::Output {
                // !x == x ^ 0xff…ff
                unsafe {
                    let ff = _mm_set1_epi64x(-1i64);
                    self ^ Self::new(ff)
                }
            }
        }

        impl<S3: Copy, S4: Copy, NI: Copy> BitOps0 for $vec<S3, S4, NI> {}
        impl_binop!($vec, BitAnd, bitand, _mm_and_si128);
        impl_binop!($vec, BitOr, bitor, _mm_or_si128);
        impl_binop!($vec, BitXor, bitxor, _mm_xor_si128);
        impl_binop_assign!($vec, BitAndAssign, bitand_assign, bitand);
        impl_binop_assign!($vec, BitOrAssign, bitor_assign, bitor);
        impl_binop_assign!($vec, BitXorAssign, bitxor_assign, bitxor);
        impl<S3: Copy, S4: Copy, NI: Copy> AndNot for $vec<S3, S4, NI> {
            type Output = Self;
            /// Computes `!self & rhs` (matches `_mm_andnot_si128` operand order).
            #[inline(always)]
            fn andnot(self, rhs: Self) -> Self {
                Self::new(unsafe { _mm_andnot_si128(self.x, rhs.x) })
            }
        }
    };
}
137
/// `BitOps32/64/128` are marker traits; each macro below emits an empty impl
/// whose `where` clause asserts the required rotate traits exist, and each
/// wider level builds on the narrower one.
macro_rules! impl_bitops32 {
    ($vec:ident) => {
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps32 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord32
        {
        }
    };
}

macro_rules! impl_bitops64 {
    ($vec:ident) => {
        impl_bitops32!($vec);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps64 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord64 + RotateEachWord32
        {
        }
    };
}

macro_rules! impl_bitops128 {
    ($vec:ident) => {
        impl_bitops64!($vec);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps128 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord128
        {
        }
    };
}
166
/// SSSE3 rotate of each 32-bit lane by a multiple of 8 bits: a single
/// `pshufb` with the byte-permutation `$k1:$k0` (one byte index per output
/// byte, low qword in `$k1`).
macro_rules! rotr_32_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
/// Generic SSE2 rotate-right of each 32-bit lane by `$i` bits:
/// `(x >> i) | (x << (32 - i))` per lane.
macro_rules! rotr_32 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi32(self.x, $i as i32),
                    _mm_slli_epi32(self.x, 32 - $i as i32),
                )
            })
        }
    };
}
// SSSE3 path: rotations by whole bytes (8/16/24) are a single byte shuffle;
// other amounts use the shift-and-or form.
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32_s3!(
        rotate_each_word_right8,
        0x0c0f0e0d_080b0a09,
        0x04070605_00030201
    );
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    rotr_32_s3!(
        rotate_each_word_right16,
        0x0d0c0f0e_09080b0a,
        0x05040706_01000302
    );
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32_s3!(
        rotate_each_word_right24,
        0x0e0d0c0f_0a09080b,
        0x06050407_02010003
    );
    rotr_32!(rotate_each_word_right25, 25);
}
// SSE2-only path: everything is shift-and-or, except rotate-by-16, which is
// cheaper as the 16-bit word shuffle in `swap16_s2`.
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32!(rotate_each_word_right8, 8);
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    #[inline(always)]
    fn rotate_each_word_right16(self) -> Self {
        Self::new(swap16_s2(self.x))
    }
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32!(rotate_each_word_right24, 24);
    rotr_32!(rotate_each_word_right25, 25);
}
223
/// SSSE3 rotate of each 64-bit lane by a multiple of 8 bits via one `pshufb`
/// with byte-permutation `$k1:$k0` (low qword in `$k1`).
macro_rules! rotr_64_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
/// Generic SSE2 rotate-right of each 64-bit lane by `$i` bits:
/// `(x >> i) | (x << (64 - i))` per lane.
macro_rules! rotr_64 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi64(self.x, $i as i32),
                    _mm_slli_epi64(self.x, 64 - $i as i32),
                )
            })
        }
    };
}
// SSSE3 path for u64x2: byte-multiple rotations are one byte shuffle.
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64_s3!(
        rotate_each_word_right8,
        0x080f_0e0d_0c0b_0a09,
        0x0007_0605_0403_0201
    );
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    rotr_64_s3!(
        rotate_each_word_right16,
        0x0908_0f0e_0d0c_0b0a,
        0x0100_0706_0504_0302
    );
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64_s3!(
        rotate_each_word_right24,
        0x0a09_080f_0e0d_0c0b,
        0x0201_0007_0605_0403
    );
    rotr_64!(rotate_each_word_right25, 25);
}
// SSE2-only path; rotate-by-16 again reuses the word shuffle.
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<NoS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64!(rotate_each_word_right8, 8);
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    #[inline(always)]
    fn rotate_each_word_right16(self) -> Self {
        Self::new(swap16_s2(self.x))
    }
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64!(rotate_each_word_right24, 24);
    rotr_64!(rotate_each_word_right25, 25);
}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    fn rotate_each_word_right32(self) -> Self {
        // Swapping the 32-bit halves of each 64-bit lane is a rotate by 32.
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b10110001) })
    }
}
286
/// Rotate the whole 128-bit lane right by `$i` **bits** (`0 < $i < 64`).
///
/// Bug fix: the previous version used `_mm_srli_si128`/`_mm_slli_si128`,
/// which shift by *bytes*, not bits — and `128 - $i` (113..121) is outside
/// the instructions' 0–15 byte range, which zeroes that operand entirely.
/// A 128-bit bit-rotate is instead assembled from 64-bit lane shifts:
/// each qword is shifted right by `$i`, and the `$i` bits falling off the
/// bottom of the *other* qword are OR'd into its top. Swapping the two
/// qwords first (shuffle 0b0100_1110) lets a single left shift produce both
/// carry terms at once.
macro_rules! rotr_128 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi64(self.x, $i as i32),
                    _mm_slli_epi64(_mm_shuffle_epi32(self.x, 0b0100_1110), 64 - $i as i32),
                )
            })
        }
    };
}
// Rotations of the single 128-bit "word", all via `rotr_128!`.
// TODO: completely unoptimized
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord32 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right7, 7);
    rotr_128!(rotate_each_word_right8, 8);
    rotr_128!(rotate_each_word_right11, 11);
    rotr_128!(rotate_each_word_right12, 12);
    rotr_128!(rotate_each_word_right16, 16);
    rotr_128!(rotate_each_word_right20, 20);
    rotr_128!(rotate_each_word_right24, 24);
    rotr_128!(rotate_each_word_right25, 25);
}
// TODO: completely unoptimized
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right32, 32);
}
// Marker trait; no 128-bit-specific rotations beyond the above.
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord128 for u128x1_sse2<S3, S4, NI> {}
316
// Instantiate the three 128-bit wrapper types.
def_vec!(u32x4_sse2, u32);
def_vec!(u64x2_sse2, u64);
def_vec!(u128x1_sse2, u128);
320
// Lane conversion for u32x4. The `YesS4` variant uses SSE4.1's 64-bit
// extract/insert; the `NoS4` variant reaches the high qword with shuffles
// and byte shifts instead.
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            // Read the vector as two little-endian qwords, then split each.
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_extract_epi64(self.x, 1) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            // Pack lane pairs into qwords, then assemble the vector.
            let mut x = _mm_cvtsi64_si128((xs[0] as u64 | ((xs[1] as u64) << 32)) as i64);
            x = _mm_insert_epi64(x, (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            let x = _mm_cvtsi128_si64(self.x) as u64;
            // Control 0b11101110 copies the high qword into the low position.
            let y = _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            let x = (xs[0] as u64 | ((xs[1] as u64) << 32)) as i64;
            let y = (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64;
            let x = _mm_cvtsi64_si128(x);
            // Move `y` up into the high qword (8-byte shift), then merge.
            let y = _mm_slli_si128(_mm_cvtsi64_si128(y), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
// Lane conversion for u64x2, same SSE4.1 / SSE2-fallback split as above.
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_extract_epi64(self.x, 1) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let mut x = _mm_cvtsi64_si128(xs[0] as i64);
            x = _mm_insert_epi64(x, xs[1] as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                // Shift the high qword down by 8 bytes before reading it.
                _mm_cvtsi128_si64(_mm_srli_si128(self.x, 8)) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let x = _mm_cvtsi64_si128(xs[0] as i64);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(xs[1] as i64), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
// Lane access for the single-u128 view is not provided; nothing in the
// crate's SSE2 path needs it, so both directions panic.
impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u128; 1] {
        unimplemented!()
    }
    #[inline(always)]
    fn from_lanes(xs: [u128; 1]) -> Self {
        unimplemented!("{:?}", xs)
    }
}
407
408impl<S3, S4, NI> MultiLane<[u64; 4]> for u64x4_sse2<S3, S4, NI>
409where
410    u64x2_sse2<S3, S4, NI>: MultiLane<[u64; 2]> + Copy,
411{
412    #[inline(always)]
413    fn to_lanes(self) -> [u64; 4] {
414        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
415        [a[0], a[1], b[0], b[1]]
416    }
417    #[inline(always)]
418    fn from_lanes(xs: [u64; 4]) -> Self {
419        let (a, b) = (
420            u64x2_sse2::from_lanes([xs[0], xs[1]]),
421            u64x2_sse2::from_lanes([xs[2], xs[3]]),
422        );
423        x2::new([a, b])
424    }
425}
426
/// Bit-reinterpreting conversion between wrapper types over the same
/// `__m128i` (no lane reshuffling happens).
macro_rules! impl_into {
    ($from:ident, $to:ident) => {
        impl<S3, S4, NI> From<$from<S3, S4, NI>> for $to<S3, S4, NI> {
            #[inline(always)]
            fn from(x: $from<S3, S4, NI>) -> Self {
                $to::new(x.x)
            }
        }
    };
}

// A u128x1 can be viewed as four u32 lanes or two u64 lanes for free.
impl_into!(u128x1_sse2, u32x4_sse2);
impl_into!(u128x1_sse2, u64x2_sse2);

impl_bitops32!(u32x4_sse2);
impl_bitops64!(u64x2_sse2);
impl_bitops128!(u128x1_sse2);

// `ArithOps` is a marker gathering the arithmetic ops implemented below.
impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u32x4_sse2<S3, S4, NI> where
    u32x4_sse2<S3, S4, NI>: BSwap
{
}
impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u64x2_sse2<S3, S4, NI> where
    u64x2_sse2<S3, S4, NI>: BSwap
{
}
// Lane-wise wrapping addition.
impl_binop!(u32x4_sse2, Add, add, _mm_add_epi32);
impl_binop!(u64x2_sse2, Add, add, _mm_add_epi64);
impl_binop_assign!(u32x4_sse2, AddAssign, add_assign, add);
impl_binop_assign!(u64x2_sse2, AddAssign, add_assign, add);
457
// Empty marker impls wiring the concrete SSE2 types into the crate's
// `Machine` abstraction; the where clauses assert the building blocks exist.
impl<S3: Copy, S4: Copy, NI: Copy> u32x4<Machine86<S3, S4, NI>> for u32x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2<Machine86<S3, S4, NI>> for u64x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x1<Machine86<S3, S4, NI>> for u128x1_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}

// The AVX2 machine reuses the SSE2 types (with SSSE3/SSE4.1 available) for
// its 128-bit lanes.
// NOTE(review): these bound on `Machine86<…>: Machine` while the composite
// impls further down bound on `Avx2Machine<NI>: Machine` — presumably
// equivalent here; confirm intent.
impl<NI: Copy> u32x4<Avx2Machine<NI>> for u32x4_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u64x2<Avx2Machine<NI>> for u64x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u128x1<Avx2Machine<NI>> for u128x1_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<YesS3, YesS4, NI>: Machine,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u32x4>,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u64x2>,
{
}
501
502impl<S3, S4, NI> UnsafeFrom<[u32; 4]> for u32x4_sse2<S3, S4, NI> {
503    #[inline(always)]
504    unsafe fn unsafe_from(xs: [u32; 4]) -> Self {
505        Self::new(_mm_set_epi32(
506            xs[3] as i32,
507            xs[2] as i32,
508            xs[1] as i32,
509            xs[0] as i32,
510        ))
511    }
512}
513
// Lane get/set for u32x4 on SSE4.1. The match is needed because
// `_mm_insert_epi32` requires the lane index to be an immediate constant.
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, YesS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi32(self.x, v as i32, 0),
                1 => _mm_insert_epi32(self.x, v as i32, 1),
                2 => _mm_insert_epi32(self.x, v as i32, 2),
                3 => _mm_insert_epi32(self.x, v as i32, 3),
                _ => unreachable!(),
            }
        })
    }
}
// SSE2-only lane set: without `_mm_insert_epi32`, each lane is replaced by
// permuting the old lanes out of the way, OR-ing the new value into lane 0,
// and permuting back.
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, NoS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => {
                    // Clear lane 0 (andnot with an all-ones lane-0 mask), then OR in `v`.
                    let x = _mm_andnot_si128(_mm_cvtsi32_si128(-1), self.x);
                    _mm_or_si128(x, _mm_cvtsi32_si128(v as i32))
                }
                1 => {
                    // Route the surviving lanes around position 0, insert `v`,
                    // then permute everything back into place.
                    let mut x = _mm_shuffle_epi32(self.x, 0b0111_1000);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1110_0001)
                }
                2 => {
                    let mut x = _mm_shuffle_epi32(self.x, 0b1011_0100);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1100_1001)
                }
                3 => {
                    // Shift lanes 0..2 up, put `v` in lane 0, rotate it to the top.
                    let mut x = _mm_slli_si128(self.x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b0011_1001)
                }
                _ => unreachable!(),
            }
        })
    }
}
573
// For a 128-bit row of four 32-bit words, "lane words" and "words" coincide,
// so LaneWords4 just delegates to Words4.
impl<S3, S4, NI> LaneWords4 for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn shuffle_lane_words2301(self) -> Self {
        self.shuffle2301()
    }
    #[inline(always)]
    fn shuffle_lane_words1230(self) -> Self {
        self.shuffle1230()
    }
    #[inline(always)]
    fn shuffle_lane_words3012(self) -> Self {
        self.shuffle3012()
    }
}

impl<S3, S4, NI> Words4 for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        // Result lanes (low to high): [w2, w3, w0, w1] — swaps the 64-bit halves.
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        // Result lanes (low to high): [w3, w0, w1, w2] — each word moves up one slot.
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1001_0011) })
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        // Result lanes (low to high): [w1, w2, w3, w0] — each word moves down one slot.
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0011_1001) })
    }
}
603
// Words4 over four 64-bit words spread across a pair of u64x2. The SSSE3
// path uses `palignr` to stitch neighbouring qwords; the SSE2 path builds
// the same results from byte shifts and ORs.
impl<S4, NI> Words4 for u64x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        // [w2, w3, w0, w1]: just swap the two halves of the pair.
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        // [w1, w2, w3, w0]: _mm_alignr_epi8(a, b, 8) yields [b_hi, a_lo].
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
            ])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        // [w3, w0, w1, w2]: same stitch with the operands exchanged.
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
            ])
        }
    }
}
impl<S4, NI> Words4 for u64x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        // a=[w1,0] b=[0,w0] c=[w3,0] d=[0,w2]; d|a=[w1,w2], b|c=[w3,w0].
        unsafe {
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(da), u64x2_sse2::new(bc)])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        // Same halves combined in the opposite order: [w3, w0, w1, w2].
        unsafe {
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(bc), u64x2_sse2::new(da)])
        }
    }
}
658
impl<S3, S4, NI> UnsafeFrom<[u64; 2]> for u64x2_sse2<S3, S4, NI> {
    /// Builds a vector from two `u64` lanes; `xs[0]` becomes the low qword
    /// (`_mm_set_epi64x` takes the high element first).
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u64; 2]) -> Self {
        Self::new(_mm_set_epi64x(xs[1] as i64, xs[0] as i64))
    }
}
665
// Lane get/set for u64x2. With SSE4.1 the dedicated extract/insert
// instructions are used; the match on `i` exists because the lane index
// must be an immediate constant.
impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                1 => _mm_extract_epi64(self.x, 1) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi64(self.x, x as i64, 0),
                1 => _mm_insert_epi64(self.x, x as i64, 1),
                _ => unreachable!(),
            }
        })
    }
}
// SSE2-only fallback built from moves, shifts and ORs.
impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                // Copy the high qword to the low position before reading.
                1 => _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                // Clear the low qword (andnot with a low-qword all-ones mask),
                // then OR in the new value.
                0 => _mm_or_si128(
                    _mm_andnot_si128(_mm_cvtsi64_si128(-1), self.x),
                    _mm_cvtsi64_si128(x as i64),
                ),
                // Keep only the low qword, then OR `x` into the high half.
                1 => _mm_or_si128(
                    _mm_move_epi64(self.x),
                    _mm_slli_si128(_mm_cvtsi64_si128(x as i64), 8),
                ),
                _ => unreachable!(),
            }
        })
    }
}
716
// SSSE3 byte swap of each 32-bit lane: one `pshufb` whose mask reverses
// every 4-byte group in place.
impl<S4, NI> BSwap for u32x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
/// Byte-swaps each 32-bit lane of `x` using only SSE2 instructions.
///
/// Fallback for machines without SSSE3's `_mm_shuffle_epi8`: widen the bytes
/// to 16-bit words, reverse every group of four words, then repack.
#[inline(always)]
fn bswap32_s2(x: __m128i) -> __m128i {
    unsafe {
        let zero = _mm_setzero_si128();
        // Low eight bytes, zero-extended to word lanes, with each 4-word
        // group reversed (control 0b00_01_10_11).
        let lo = _mm_shufflelo_epi16(
            _mm_shufflehi_epi16(_mm_unpacklo_epi8(x, zero), 0b0001_1011),
            0b0001_1011,
        );
        // Same treatment for the high eight bytes.
        let hi = _mm_shufflelo_epi16(
            _mm_shufflehi_epi16(_mm_unpackhi_epi8(x, zero), 0b0001_1011),
            0b0001_1011,
        );
        // Every lane is 0..=255, so unsigned saturation is lossless here.
        _mm_packus_epi16(lo, hi)
    }
}
// SSE2-only byte swap of each 32-bit lane, via the shared helper.
impl<S4, NI> BSwap for u32x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(bswap32_s2(self.x))
    }
}
744
// Byte swap of each 64-bit lane: one `pshufb` on SSSE3; on SSE2 it is a
// 32-bit-half swap followed by the 32-bit byte swap.
impl<S4, NI> BSwap for u64x2_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u64x2_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        // Swap the 32-bit halves of each qword, then byte-swap each 32-bit lane.
        Self::new(unsafe { bswap32_s2(_mm_shuffle_epi32(self.x, 0b1011_0001)) })
    }
}

// Byte swap of the full 128-bit value; only the SSSE3 shuffle form exists.
impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> {
    // No SSE2 fallback is provided; nothing in this path appears to call it.
    #[inline(always)]
    fn bswap(self) -> Self {
        unimplemented!()
    }
}
776
/// Swaps adjacent `$i`-bit groups within every byte of a `u128x1`.
/// `$k` is a per-byte mask selecting the *high* group of each pair: the
/// masked bits are shifted down while the low group is shifted up under the
/// same mask. The 16-bit shifts are safe because the mask keeps every bit
/// inside its own byte.
macro_rules! swapi {
    ($x:expr, $i:expr, $k:expr) => {
        unsafe {
            const K: u8 = $k;
            let k = _mm_set1_epi8(K as i8);
            u128x1_sse2::new(_mm_or_si128(
                _mm_srli_epi16(_mm_and_si128($x.x, k), $i),
                _mm_and_si128(_mm_slli_epi16($x.x, $i), k),
            ))
        }
    };
}
/// Swaps the two 16-bit halves of every 32-bit lane (SSE2-only path).
#[inline(always)]
fn swap16_s2(x: __m128i) -> __m128i {
    // Control 0b10_11_00_01 exchanges adjacent word pairs; applying it to
    // the low four and high four words covers the whole register.
    let low_done = unsafe { _mm_shufflelo_epi16(x, 0b1011_0001) };
    unsafe { _mm_shufflehi_epi16(low_done, 0b1011_0001) }
}
// `Swap64` over the full 128-bit lane: `swapN` exchanges adjacent N-bit
// groups. Sub-byte granularities use `swapi!`; byte and wider granularities
// are single shuffles on SSSE3 and shift/shuffle combinations on SSE2.
impl<S4, NI> Swap64 for u128x1_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn swap1(self) -> Self {
        // Swap adjacent bits (0xaa selects the high bit of each pair).
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        // Swap adjacent 2-bit groups.
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        // Swap the nibbles of each byte.
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        // Swap adjacent bytes via `pshufb`.
        u128x1_sse2::new(unsafe {
            let k = _mm_set_epi64x(0x0e0f_0c0d_0a0b_0809, 0x0607_0405_0203_0001);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        // Swap adjacent 16-bit units via `pshufb`.
        u128x1_sse2::new(unsafe {
            let k = _mm_set_epi64x(0x0d0c_0f0e_0908_0b0a, 0x0504_0706_0100_0302);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        // Swap the 32-bit halves of each 64-bit lane.
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        // Swap the two 64-bit halves.
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}
// SSE2-only path: identical results from 16-bit shifts and word shuffles.
impl<S4, NI> Swap64 for u128x1_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        // Swap adjacent bytes: exchange the halves of every 16-bit unit.
        u128x1_sse2::new(unsafe {
            _mm_or_si128(_mm_slli_epi16(self.x, 8), _mm_srli_epi16(self.x, 8))
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        u128x1_sse2::new(swap16_s2(self.x))
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}
861
// Type-level tags distinguishing otherwise-identical `x2` wrappers: e.g.
// `u64x2x2` (G0) and `u64x4` (G1) are both pairs of `u64x2_sse2` but must be
// distinct types.
#[derive(Copy, Clone)]
pub struct G0;
#[derive(Copy, Clone)]
pub struct G1;

// Wider vectors emulated as pairs / quadruples of 128-bit registers.
#[allow(non_camel_case_types)]
pub type u32x4x2_sse2<S3, S4, NI> = x2<u32x4_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x2x2_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x4_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G1>;
#[allow(non_camel_case_types)]
pub type u128x2_sse2<S3, S4, NI> = x2<u128x1_sse2<S3, S4, NI>, G0>;

#[allow(non_camel_case_types)]
pub type u32x4x4_sse2<S3, S4, NI> = x4<u32x4_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>;
882
// Empty marker impls plugging the composite (x2) types into the `Machine`
// API, for generic SSE machines and then for the AVX2 machine; the where
// clauses only assert that the building blocks exist.
impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u32x4>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x2<Machine86<S3, S4, NI>> for u64x2x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x4<Machine86<S3, S4, NI>> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x4_sse2<S3, S4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x2<Machine86<S3, S4, NI>> for u128x2_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x4>,
{
}

// AVX2 machine: the same composites, fixed at the SSSE3+SSE4.1 feature level.
impl<NI: Copy> u32x4x2<Avx2Machine<NI>> for u32x4x2_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u32x4x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u32x4>,
{
}
impl<NI: Copy> u64x2x2<Avx2Machine<NI>> for u64x2x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u64x4<Avx2Machine<NI>> for u64x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x4_sse2<YesS3, YesS4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<NI: Copy> u128x2<Avx2Machine<NI>> for u128x2_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x4>,
{
}
952
953impl<S3, S4, NI> Vec4<u64> for u64x4_sse2<S3, S4, NI>
954where
955    u64x2_sse2<S3, S4, NI>: Copy + Vec2<u64>,
956{
957    #[inline(always)]
958    fn extract(self, i: u32) -> u64 {
959        match i {
960            0 => self.0[0].extract(0),
961            1 => self.0[0].extract(1),
962            2 => self.0[1].extract(0),
963            3 => self.0[1].extract(1),
964            _ => panic!(),
965        }
966    }
967    #[inline(always)]
968    fn insert(mut self, w: u64, i: u32) -> Self {
969        match i {
970            0 => self.0[0] = self.0[0].insert(w, 0),
971            1 => self.0[0] = self.0[0].insert(w, 1),
972            2 => self.0[1] = self.0[1].insert(w, 0),
973            3 => self.0[1] = self.0[1].insert(w, 1),
974            _ => panic!(),
975        };
976        self
977    }
978}
979
// Quad-width marker-trait impls for `Machine86`, mirroring the x2-width
// set above: the traits carry no methods and the `where` clauses list
// the operations the lane types must already implement.
impl<S3: Copy, S4: Copy, NI: Copy> u32x4x4<Machine86<S3, S4, NI>> for u32x4x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>,
    u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
// Unlike `u128x2`, no `Into<u64x4>` bound here: there is no u64x8 type
// for a quad of u128 lanes to widen into.
impl<S3: Copy, S4: Copy, NI: Copy> u128x4<Machine86<S3, S4, NI>> for u128x4_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x4>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x4>,
{
}
1006
// Quad-width marker-trait impls for `Avx2Machine`, again pinned to
// `YesS3, YesS4` since AVX2 implies both SSSE3 and SSE4.1.
// Note: `Avx2Machine::u32x4x4` is the native AVX2 type defined in the
// `avx2` module below; these impls cover the soft (SSE2-pair) fallback
// representation.
impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u32x4x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 4]>,
    u32x4x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u32x4>,
{
}
impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u128x4<Avx2Machine<NI>> for u128x4_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x4>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x4>,
{
}
1033
// Generates lane-wise `From` conversions between the soft double/quad
// wrappers (`x2`, `x4`) of two 128-bit lane types: each lane is converted
// with `$to::from`, so the generated impls exist wherever
// `$to: From<$from>` holds for the lane type.
macro_rules! impl_into_x {
    ($from:ident, $to:ident) => {
        // x2 -> x2: the `Gf`/`Gt` phantom "group" parameters of the two
        // wrappers are independent, so the conversion may change them.
        impl<S3: Copy, S4: Copy, NI: Copy, Gf, Gt> From<x2<$from<S3, S4, NI>, Gf>>
            for x2<$to<S3, S4, NI>, Gt>
        {
            #[inline(always)]
            fn from(x: x2<$from<S3, S4, NI>, Gf>) -> Self {
                x2::new([$to::from(x.0[0]), $to::from(x.0[1])])
            }
        }
        // x4 -> x4: same lane-wise conversion over four lanes.
        impl<S3: Copy, S4: Copy, NI: Copy> From<x4<$from<S3, S4, NI>>> for x4<$to<S3, S4, NI>> {
            #[inline(always)]
            fn from(x: x4<$from<S3, S4, NI>>) -> Self {
                x4::new([
                    $to::from(x.0[0]),
                    $to::from(x.0[1]),
                    $to::from(x.0[2]),
                    $to::from(x.0[3]),
                ])
            }
        }
    };
}
// u128 lanes reinterpret as two u64 or four u32 words respectively.
impl_into_x!(u128x1_sse2, u64x2_sse2);
impl_into_x!(u128x1_sse2, u32x4_sse2);
1059
1060///// Debugging
1061
1062use core::fmt::{Debug, Formatter, Result};
1063
1064impl<W: PartialEq, G> PartialEq for x2<W, G> {
1065    #[inline(always)]
1066    fn eq(&self, rhs: &Self) -> bool {
1067        self.0[0] == rhs.0[0] && self.0[1] == rhs.0[1]
1068    }
1069}
1070
#[allow(unused)]
#[inline(always)]
unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool {
    // SSE4.1 128-bit equality: compare as two 64-bit lanes, then require
    // both lane masks (all-ones or all-zeros each) to be all-ones.
    let m = _mm_cmpeq_epi64(x, y);
    let lo = _mm_cvtsi128_si64(m);
    let hi = _mm_cvtsi128_si64(_mm_srli_si128(m, 8));
    (lo & hi) == -1
}
1077
#[inline(always)]
unsafe fn eq128_s2(x: __m128i, y: __m128i) -> bool {
    // SSE2-only 128-bit equality: per-dword compare yields 0xffffffff or 0
    // per lane; collapsing the byte mask gives 0xffff iff every byte (and
    // hence every dword) matched.
    let m = _mm_cmpeq_epi32(x, y);
    _mm_movemask_epi8(m) == 0xffff
}
1085
impl<S3, S4, NI> PartialEq for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        // Bitwise 128-bit comparison; the SSE2-only helper suffices for
        // every capability level.
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
1092impl<S3, S4, NI> Debug for u32x4_sse2<S3, S4, NI>
1093where
1094    Self: Copy + MultiLane<[u32; 4]>,
1095{
1096    #[cold]
1097    fn fmt(&self, fmt: &mut Formatter) -> Result {
1098        fmt.write_fmt(format_args!("{:08x?}", &self.to_lanes()))
1099    }
1100}
1101
impl<S3, S4, NI> PartialEq for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        // Equality is lane-width agnostic: a 32-bit-granular bitwise
        // compare of the full 128 bits is exact for u64 lanes too.
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
1108impl<S3, S4, NI> Debug for u64x2_sse2<S3, S4, NI>
1109where
1110    Self: Copy + MultiLane<[u64; 2]>,
1111{
1112    #[cold]
1113    fn fmt(&self, fmt: &mut Formatter) -> Result {
1114        fmt.write_fmt(format_args!("{:016x?}", &self.to_lanes()))
1115    }
1116}
1117
1118impl<S3, S4, NI> Debug for u64x4_sse2<S3, S4, NI>
1119where
1120    u64x2_sse2<S3, S4, NI>: Copy + MultiLane<[u64; 2]>,
1121{
1122    #[cold]
1123    fn fmt(&self, fmt: &mut Formatter) -> Result {
1124        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
1125        fmt.write_fmt(format_args!("{:016x?}", &[a[0], a[1], b[0], b[1]]))
1126    }
1127}
1128
#[cfg(test)]
#[cfg(target_arch = "x86_64")]
mod test {
    use super::*;
    use crate::x86_64::{SSE2, SSE41, SSSE3};
    use crate::Machine;

    // Each *_s2_vs_s3 test runs the same operation on the plain-SSE2 and
    // SSSE3 machines and checks (a) the result against a hand-computed
    // expectation and (b) that the two implementations agree bit-for-bit.
    // The two lane types differ only in their capability phantom
    // parameters, so the cross-check transmutes the SSSE3 value to the
    // SSE2 type before comparing.
    //
    // Fix: the later tests asserted `x_s3 == transmute(x_s3)` — a
    // comparison of a value with a no-op transmute of itself, which can
    // never fail. They now compare against `x_s2`, as `test_bswap32`
    // already did.

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_bswap32_s2_vs_s3() {
        let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100];
        let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.bswap()
        };

        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.bswap()
        };

        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
        assert_eq!(x_s2, s2.vec(ys));
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_bswap64_s2_vs_s3() {
        let xs = [0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100];
        let ys = [0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
            x_s2.bswap()
        };

        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
            x_s3.bswap()
        };

        assert_eq!(x_s2, s2.vec(ys));
        // Cross-check: SSSE3 result must match the SSE2 one.
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_shuffle32_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.shuffle2301()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.shuffle2301()
        };
        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.shuffle3012()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.shuffle3012()
        };
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        // shuffle1230 is the inverse of shuffle3012, so this round-trips.
        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_shuffle64_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
            x_s2.shuffle2301()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
            x_s3.shuffle2301()
        };
        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
            x_s2.shuffle3012()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
            x_s3.shuffle3012()
        };
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        // Round-trip back to the original ordering.
        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_lanes_u32x4() {
        let xs = [0x1, 0x2, 0x3, 0x4];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        // from_lanes/to_lanes must round-trip and agree with Machine::vec
        // at every capability level.
        {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            let y_s2 = <SSE2 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s2, y_s2);
            assert_eq!(xs, y_s2.to_lanes());
        }

        {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            let y_s3 = <SSSE3 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s3, y_s3);
            assert_eq!(xs, y_s3.to_lanes());
        }

        {
            let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
            let y_s4 = <SSE41 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s4, y_s4);
            assert_eq!(xs, y_s4.to_lanes());
        }
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_lanes_u64x2() {
        let xs = [0x1, 0x2];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        {
            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
            let y_s2 = <SSE2 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s2, y_s2);
            assert_eq!(xs, y_s2.to_lanes());
        }

        {
            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
            let y_s3 = <SSSE3 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s3, y_s3);
            assert_eq!(xs, y_s3.to_lanes());
        }

        {
            let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
            let y_s4 = <SSE41 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s4, y_s4);
            assert_eq!(xs, y_s4.to_lanes());
        }
    }

    #[test]
    fn test_vec4_u32x4_s2() {
        let xs = [1, 2, 3, 4];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.extract(2), 3);
        assert_eq!(x_s2.extract(3), 4);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 2), s2.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s2.insert(0xf, 3), s2.vec([1, 2, 3, 0xf]));
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_vec4_u32x4_s4() {
        let xs = [1, 2, 3, 4];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.extract(2), 3);
        assert_eq!(x_s4.extract(3), 4);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 2), s4.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s4.insert(0xf, 3), s4.vec([1, 2, 3, 0xf]));
    }

    #[test]
    fn test_vec2_u64x2_s2() {
        let xs = [0x1, 0x2];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf]));
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_vec4_u64x2_s4() {
        let xs = [0x1, 0x2];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf]));
    }
}
1374
1375pub mod avx2 {
1376    #![allow(non_camel_case_types)]
1377    use crate::soft::x4;
1378    use crate::types::*;
1379    use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2};
1380    use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4};
1381    use core::arch::x86_64::*;
1382    use core::marker::PhantomData;
1383    use core::ops::*;
1384
1385    #[derive(Copy, Clone)]
1386    pub struct u32x4x4_avx2<NI> {
1387        x: [__m256i; 2],
1388        ni: PhantomData<NI>,
1389    }
1390
1391    impl<NI> u32x4x4_avx2<NI> {
1392        #[inline(always)]
1393        fn new(x: [__m256i; 2]) -> Self {
1394            Self { x, ni: PhantomData }
1395        }
1396    }
1397
1398    impl<NI> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> where NI: Copy {}
1399    impl<NI> Store<vec512_storage> for u32x4x4_avx2<NI> {
1400        #[inline(always)]
1401        unsafe fn unpack(p: vec512_storage) -> Self {
1402            Self::new([p.avx[0].avx, p.avx[1].avx])
1403        }
1404    }
1405    impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> {
1406        #[inline(always)]
1407        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] {
1408            unsafe {
1409                [
1410                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
1411                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
1412                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
1413                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
1414                ]
1415            }
1416        }
1417        #[inline(always)]
1418        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self {
1419            Self::new(unsafe {
1420                [
1421                    _mm256_setr_m128i(x[0].x, x[1].x),
1422                    _mm256_setr_m128i(x[2].x, x[3].x),
1423                ]
1424            })
1425        }
1426    }
1427    impl<NI> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
1428        #[inline(always)]
1429        fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
1430            unsafe {
1431                match i {
1432                    0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
1433                    1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
1434                    2 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
1435                    3 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
1436                    _ => panic!(),
1437                }
1438            }
1439        }
1440        #[inline(always)]
1441        fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
1442            Self::new(unsafe {
1443                match i {
1444                    0 => [_mm256_inserti128_si256(self.x[0], w.x, 0), self.x[1]],
1445                    1 => [_mm256_inserti128_si256(self.x[0], w.x, 1), self.x[1]],
1446                    2 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 0)],
1447                    3 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 1)],
1448                    _ => panic!(),
1449                }
1450            })
1451        }
1452    }
1453    impl<NI> LaneWords4 for u32x4x4_avx2<NI> {
1454        #[inline(always)]
1455        fn shuffle_lane_words1230(self) -> Self {
1456            Self::new(unsafe {
1457                [
1458                    _mm256_shuffle_epi32(self.x[0], 0b1001_0011),
1459                    _mm256_shuffle_epi32(self.x[1], 0b1001_0011),
1460                ]
1461            })
1462        }
1463        #[inline(always)]
1464        fn shuffle_lane_words2301(self) -> Self {
1465            Self::new(unsafe {
1466                [
1467                    _mm256_shuffle_epi32(self.x[0], 0b0100_1110),
1468                    _mm256_shuffle_epi32(self.x[1], 0b0100_1110),
1469                ]
1470            })
1471        }
1472        #[inline(always)]
1473        fn shuffle_lane_words3012(self) -> Self {
1474            Self::new(unsafe {
1475                [
1476                    _mm256_shuffle_epi32(self.x[0], 0b0011_1001),
1477                    _mm256_shuffle_epi32(self.x[1], 0b0011_1001),
1478                ]
1479            })
1480        }
1481    }
1482    impl<NI> BitOps32 for u32x4x4_avx2<NI> where NI: Copy {}
1483    impl<NI> ArithOps for u32x4x4_avx2<NI> where NI: Copy {}
1484    macro_rules! shuf_lane_bytes {
1485        ($name:ident, $k0:expr, $k1:expr) => {
1486            #[inline(always)]
1487            fn $name(self) -> Self {
1488                Self::new(unsafe {
1489                    [
1490                        _mm256_shuffle_epi8(self.x[0], _mm256_set_epi64x($k0, $k1, $k0, $k1)),
1491                        _mm256_shuffle_epi8(self.x[1], _mm256_set_epi64x($k0, $k1, $k0, $k1)),
1492                    ]
1493                })
1494            }
1495        };
1496    }
1497    macro_rules! rotr_32 {
1498        ($name:ident, $i:expr) => {
1499            #[inline(always)]
1500            fn $name(self) -> Self {
1501                Self::new(unsafe {
1502                    [
1503                        _mm256_or_si256(
1504                            _mm256_srli_epi32(self.x[0], $i as i32),
1505                            _mm256_slli_epi32(self.x[0], 32 - $i as i32),
1506                        ),
1507                        _mm256_or_si256(
1508                            _mm256_srli_epi32(self.x[1], $i as i32),
1509                            _mm256_slli_epi32(self.x[1], 32 - $i as i32),
1510                        ),
1511                    ]
1512                })
1513            }
1514        };
1515    }
1516    impl<NI: Copy> RotateEachWord32 for u32x4x4_avx2<NI> {
1517        rotr_32!(rotate_each_word_right7, 7);
1518        shuf_lane_bytes!(
1519            rotate_each_word_right8,
1520            0x0c0f0e0d_080b0a09,
1521            0x04070605_00030201
1522        );
1523        rotr_32!(rotate_each_word_right11, 11);
1524        rotr_32!(rotate_each_word_right12, 12);
1525        shuf_lane_bytes!(
1526            rotate_each_word_right16,
1527            0x0d0c0f0e_09080b0a,
1528            0x05040706_01000302
1529        );
1530        rotr_32!(rotate_each_word_right20, 20);
1531        shuf_lane_bytes!(
1532            rotate_each_word_right24,
1533            0x0e0d0c0f_0a09080b,
1534            0x06050407_02010003
1535        );
1536        rotr_32!(rotate_each_word_right25, 25);
1537    }
1538    impl<NI> BitOps0 for u32x4x4_avx2<NI> where NI: Copy {}
1539    impl<NI> From<u32x4x4_avx2<NI>> for vec512_storage {
1540        #[inline(always)]
1541        fn from(x: u32x4x4_avx2<NI>) -> Self {
1542            Self {
1543                avx: [
1544                    vec256_storage { avx: x.x[0] },
1545                    vec256_storage { avx: x.x[1] },
1546                ],
1547            }
1548        }
1549    }
1550
1551    macro_rules! impl_assign {
1552        ($vec:ident, $Assign:ident, $assign_fn:ident, $bin_fn:ident) => {
1553            impl<NI> $Assign for $vec<NI>
1554            where
1555                NI: Copy,
1556            {
1557                #[inline(always)]
1558                fn $assign_fn(&mut self, rhs: Self) {
1559                    *self = self.$bin_fn(rhs);
1560                }
1561            }
1562        };
1563    }
1564    impl_assign!(u32x4x4_avx2, BitXorAssign, bitxor_assign, bitxor);
1565    impl_assign!(u32x4x4_avx2, BitOrAssign, bitor_assign, bitor);
1566    impl_assign!(u32x4x4_avx2, BitAndAssign, bitand_assign, bitand);
1567    impl_assign!(u32x4x4_avx2, AddAssign, add_assign, add);
1568
1569    macro_rules! impl_bitop_x2 {
1570        ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => {
1571            impl<NI> $Op for $vec<NI> {
1572                type Output = Self;
1573                #[inline(always)]
1574                fn $op_fn(self, rhs: Self) -> Self::Output {
1575                    Self::new(unsafe {
1576                        [$impl_fn(self.x[0], rhs.x[0]), $impl_fn(self.x[1], rhs.x[1])]
1577                    })
1578                }
1579            }
1580        };
1581    }
1582    impl_bitop_x2!(u32x4x4_avx2, BitXor, bitxor, _mm256_xor_si256);
1583    impl_bitop_x2!(u32x4x4_avx2, BitOr, bitor, _mm256_or_si256);
1584    impl_bitop_x2!(u32x4x4_avx2, BitAnd, bitand, _mm256_and_si256);
1585    impl_bitop_x2!(u32x4x4_avx2, AndNot, andnot, _mm256_andnot_si256);
1586    impl_bitop_x2!(u32x4x4_avx2, Add, add, _mm256_add_epi32);
1587
1588    impl<NI> Not for u32x4x4_avx2<NI> {
1589        type Output = Self;
1590        #[inline(always)]
1591        fn not(self) -> Self::Output {
1592            unsafe {
1593                let f = _mm256_set1_epi8(-0x7f);
1594                Self::new([f, f]) ^ self
1595            }
1596        }
1597    }
1598
1599    impl<NI> BSwap for u32x4x4_avx2<NI> {
1600        shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
1601    }
1602
1603    impl<NI> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI>
1604    where
1605        NI: Copy,
1606    {
1607        #[inline(always)]
1608        fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self {
1609            Self::new(unsafe {
1610                [
1611                    _mm256_setr_m128i(x.0[0].x, x.0[1].x),
1612                    _mm256_setr_m128i(x.0[2].x, x.0[3].x),
1613                ]
1614            })
1615        }
1616    }
1617}