polyval/backend/
clmul.rs

1//! Intel `CLMUL`-accelerated implementation for modern x86/x86_64 CPUs
2//! (i.e. Intel Sandy Bridge-compatible or newer)
3
4use crate::{Block, Key, Tag};
5use universal_hash::{
6    consts::{U1, U16},
7    crypto_common::{BlockSizeUser, KeySizeUser, ParBlocksSizeUser},
8    KeyInit, Reset, UhfBackend,
9};
10
11#[cfg(target_arch = "x86")]
12use core::arch::x86::*;
13#[cfg(target_arch = "x86_64")]
14use core::arch::x86_64::*;
15
/// **POLYVAL**: GHASH-like universal hash over GF(2^128).
///
/// The state is two 128-bit SIMD values: the key and a running accumulator.
#[derive(Clone)]
pub struct Polyval {
    /// Hash key `H`, loaded once at construction time.
    h: __m128i,
    /// Running accumulator; starts at zero and is updated for every block.
    y: __m128i,
}
22
23impl KeySizeUser for Polyval {
24    type KeySize = U16;
25}
26
27impl KeyInit for Polyval {
28    /// Initialize POLYVAL with the given `H` field element
29    fn new(h: &Key) -> Self {
30        unsafe {
31            // `_mm_loadu_si128` performs an unaligned load
32            #[allow(clippy::cast_ptr_alignment)]
33            Self {
34                h: _mm_loadu_si128(h.as_ptr() as *const __m128i),
35                y: _mm_setzero_si128(),
36            }
37        }
38    }
39}
40
41impl BlockSizeUser for Polyval {
42    type BlockSize = U16;
43}
44
45impl ParBlocksSizeUser for Polyval {
46    type ParBlocksSize = U1;
47}
48
49impl UhfBackend for Polyval {
50    fn proc_block(&mut self, x: &Block) {
51        unsafe {
52            self.mul(x);
53        }
54    }
55}
56
57impl Polyval {
58    /// Get GHASH output
59    pub(crate) fn finalize(self) -> Tag {
60        unsafe { core::mem::transmute(self.y) }
61    }
62}
63
64impl Polyval {
65    #[inline]
66    #[target_feature(enable = "pclmulqdq")]
67    unsafe fn mul(&mut self, x: &Block) {
68        let h = self.h;
69
70        // `_mm_loadu_si128` performs an unaligned load
71        #[allow(clippy::cast_ptr_alignment)]
72        let x = _mm_loadu_si128(x.as_ptr() as *const __m128i);
73        let y = _mm_xor_si128(self.y, x);
74
75        let h0 = h;
76        let h1 = _mm_shuffle_epi32(h, 0x0E);
77        let h2 = _mm_xor_si128(h0, h1);
78        let y0 = y;
79
80        // Multiply values partitioned to 64-bit parts
81        let y1 = _mm_shuffle_epi32(y, 0x0E);
82        let y2 = _mm_xor_si128(y0, y1);
83        let t0 = _mm_clmulepi64_si128(y0, h0, 0x00);
84        let t1 = _mm_clmulepi64_si128(y, h, 0x11);
85        let t2 = _mm_clmulepi64_si128(y2, h2, 0x00);
86        let t2 = _mm_xor_si128(t2, _mm_xor_si128(t0, t1));
87        let v0 = t0;
88        let v1 = _mm_xor_si128(_mm_shuffle_epi32(t0, 0x0E), t2);
89        let v2 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
90        let v3 = _mm_shuffle_epi32(t1, 0x0E);
91
92        // Polynomial reduction
93        let v2 = xor5(
94            v2,
95            v0,
96            _mm_srli_epi64(v0, 1),
97            _mm_srli_epi64(v0, 2),
98            _mm_srli_epi64(v0, 7),
99        );
100
101        let v1 = xor4(
102            v1,
103            _mm_slli_epi64(v0, 63),
104            _mm_slli_epi64(v0, 62),
105            _mm_slli_epi64(v0, 57),
106        );
107
108        let v3 = xor5(
109            v3,
110            v1,
111            _mm_srli_epi64(v1, 1),
112            _mm_srli_epi64(v1, 2),
113            _mm_srli_epi64(v1, 7),
114        );
115
116        let v2 = xor4(
117            v2,
118            _mm_slli_epi64(v1, 63),
119            _mm_slli_epi64(v1, 62),
120            _mm_slli_epi64(v1, 57),
121        );
122
123        self.y = _mm_unpacklo_epi64(v2, v3);
124    }
125}
126
127impl Reset for Polyval {
128    fn reset(&mut self) {
129        unsafe {
130            self.y = _mm_setzero_si128();
131        }
132    }
133}
134
#[cfg(feature = "zeroize")]
impl Drop for Polyval {
    /// Wipe both the key and the accumulator when the hasher goes away.
    ///
    /// Only compiled when the `zeroize` feature is enabled.
    fn drop(&mut self) {
        zeroize::Zeroize::zeroize(&mut self.h);
        zeroize::Zeroize::zeroize(&mut self.y);
    }
}
143
/// XOR four 128-bit values together.
///
/// # Safety
///
/// Calls `_mm_xor_si128`, so the CPU must support the instruction set that
/// intrinsic requires.
#[inline(always)]
unsafe fn xor4(e1: __m128i, e2: __m128i, e3: __m128i, e4: __m128i) -> __m128i {
    // XOR is associative, so a linear chain gives the same bits as any pairing.
    let acc = _mm_xor_si128(e1, e2);
    let acc = _mm_xor_si128(acc, e3);
    _mm_xor_si128(acc, e4)
}
148
/// XOR five 128-bit values together.
///
/// # Safety
///
/// Calls `_mm_xor_si128`, so the CPU must support the instruction set that
/// intrinsic requires.
#[inline(always)]
unsafe fn xor5(e1: __m128i, e2: __m128i, e3: __m128i, e4: __m128i, e5: __m128i) -> __m128i {
    // XOR is associative, so a linear chain gives the same bits as any pairing.
    let acc = _mm_xor_si128(e1, e2);
    let acc = _mm_xor_si128(acc, e3);
    let acc = _mm_xor_si128(acc, e4);
    _mm_xor_si128(acc, e5)
}