aes/soft/fixslice64.rs

//! Fixsliced implementations of AES-128, AES-192 and AES-256 (64-bit)
//! adapted from the C implementation.
//!
//! All implementations are fully bitsliced and do not rely on any
//! Look-Up Table (LUT).
//!
//! See the paper at <https://eprint.iacr.org/2020/1123.pdf> for more details.
//!
//! # Author (original C code)
//!
//! Alexandre Adomnicai, Nanyang Technological University, Singapore
//! <alexandre.adomnicai@ntu.edu.sg>
//!
//! Originally licensed MIT. Relicensed as Apache 2.0+MIT with permission.

#![allow(clippy::unreadable_literal)]

use crate::Block;
use cipher::{consts::U4, generic_array::GenericArray};

/// AES block batch size for this implementation
pub(crate) type FixsliceBlocks = U4;

pub(crate) type BatchBlocks = GenericArray<Block, FixsliceBlocks>;

/// AES-128 round keys
pub(crate) type FixsliceKeys128 = [u64; 88];

/// AES-192 round keys
pub(crate) type FixsliceKeys192 = [u64; 104];

/// AES-256 round keys
pub(crate) type FixsliceKeys256 = [u64; 120];

/// 512-bit internal state
pub(crate) type State = [u64; 8];

/// Fully bitsliced AES-128 key schedule to match the fully-fixsliced representation.
pub(crate) fn aes128_key_schedule(key: &[u8; 16]) -> FixsliceKeys128 {
    let mut rkeys = [0u64; 88];

    bitslice(&mut rkeys[..8], key, key, key, key);

    let mut rk_off = 0;
    for rcon in 0..10 {
        memshift32(&mut rkeys, rk_off);
        rk_off += 8;

        sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
        sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);

        if rcon < 8 {
            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon);
        } else {
            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 8);
            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 7);
            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 5);
            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 4);
        }

        xor_columns(&mut rkeys, rk_off, 8, ror_distance(1, 3));
    }

    // Adjust to match fixslicing format
    #[cfg(aes_compact)]
    {
        for i in (8..88).step_by(16) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
        }
    }
    #[cfg(not(aes_compact))]
    {
        for i in (8..72).step_by(32) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
            inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]);
            inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]);
        }
        inv_shift_rows_1(&mut rkeys[72..80]);
    }

    // Account for NOTs removed from sub_bytes
    for i in 1..11 {
        sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
    }

    rkeys
}

/// Fully bitsliced AES-192 key schedule to match the fully-fixsliced representation.
pub(crate) fn aes192_key_schedule(key: &[u8; 24]) -> FixsliceKeys192 {
    let mut rkeys = [0u64; 104];
    let mut tmp = [0u64; 8];

    bitslice(
        &mut rkeys[..8],
        &key[..16],
        &key[..16],
        &key[..16],
        &key[..16],
    );
    bitslice(&mut tmp, &key[8..], &key[8..], &key[8..], &key[8..]);

    let mut rcon = 0;
    let mut rk_off = 8;

    loop {
        for i in 0..8 {
            rkeys[rk_off + i] = (0x00ff00ff00ff00ff & (tmp[i] >> 8))
                | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8));
        }

        sub_bytes(&mut tmp);
        sub_bytes_nots(&mut tmp);

        add_round_constant_bit(&mut tmp, rcon);
        rcon += 1;

        for i in 0..8 {
            let mut ti = rkeys[rk_off + i];
            ti ^= 0x0f000f000f000f00 & ror(tmp[i], ror_distance(1, 1));
            ti ^= 0xf000f000f000f000 & (ti << 4);
            tmp[i] = ti;
        }
        rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp);
        rk_off += 8;

        for i in 0..8 {
            let ui = tmp[i];
            let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8))
                | (0xff00ff00ff00ff00 & (ui << 8));
            ti ^= 0x000f000f000f000f & (ui >> 12);
            tmp[i] = ti
                ^ (0xfff0fff0fff0fff0 & (ti << 4))
                ^ (0xff00ff00ff00ff00 & (ti << 8))
                ^ (0xf000f000f000f000 & (ti << 12));
        }
        rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp);
        rk_off += 8;

        sub_bytes(&mut tmp);
        sub_bytes_nots(&mut tmp);

        add_round_constant_bit(&mut tmp, rcon);
        rcon += 1;

        for i in 0..8 {
            let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8))
                | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8));
            ti ^= 0x000f000f000f000f & ror(tmp[i], ror_distance(1, 3));
            rkeys[rk_off + i] = ti
                ^ (0xfff0fff0fff0fff0 & (ti << 4))
                ^ (0xff00ff00ff00ff00 & (ti << 8))
                ^ (0xf000f000f000f000 & (ti << 12));
        }
        rk_off += 8;

        if rcon >= 8 {
            break;
        }

        for i in 0..8 {
            let ui = rkeys[(rk_off - 8) + i];
            let mut ti = rkeys[(rk_off - 16) + i];
            ti ^= 0x0f000f000f000f00 & (ui >> 4);
            ti ^= 0xf000f000f000f000 & (ti << 4);
            tmp[i] = ti;
        }
    }

    // Adjust to match fixslicing format
    #[cfg(aes_compact)]
    {
        for i in (8..104).step_by(16) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
        }
    }
    #[cfg(not(aes_compact))]
    {
        for i in (0..96).step_by(32) {
            inv_shift_rows_1(&mut rkeys[(i + 8)..(i + 16)]);
            inv_shift_rows_2(&mut rkeys[(i + 16)..(i + 24)]);
            inv_shift_rows_3(&mut rkeys[(i + 24)..(i + 32)]);
        }
    }

    // Account for NOTs removed from sub_bytes
    for i in 1..13 {
        sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
    }

    rkeys
}

/// Fully bitsliced AES-256 key schedule to match the fully-fixsliced representation.
pub(crate) fn aes256_key_schedule(key: &[u8; 32]) -> FixsliceKeys256 {
    let mut rkeys = [0u64; 120];

    bitslice(
        &mut rkeys[..8],
        &key[..16],
        &key[..16],
        &key[..16],
        &key[..16],
    );
    bitslice(
        &mut rkeys[8..16],
        &key[16..],
        &key[16..],
        &key[16..],
        &key[16..],
    );

    let mut rk_off = 8;

    let mut rcon = 0;
    loop {
        memshift32(&mut rkeys, rk_off);
        rk_off += 8;

        sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
        sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);

        add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon);
        xor_columns(&mut rkeys, rk_off, 16, ror_distance(1, 3));
        rcon += 1;

        if rcon == 7 {
            break;
        }

        memshift32(&mut rkeys, rk_off);
        rk_off += 8;

        sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
        sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);

        xor_columns(&mut rkeys, rk_off, 16, ror_distance(0, 3));
    }

    // Adjust to match fixslicing format
    #[cfg(aes_compact)]
    {
        for i in (8..120).step_by(16) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
        }
    }
    #[cfg(not(aes_compact))]
    {
        for i in (8..104).step_by(32) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
            inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]);
            inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]);
        }
        inv_shift_rows_1(&mut rkeys[104..112]);
    }

    // Account for NOTs removed from sub_bytes
    for i in 1..15 {
        sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
    }

    rkeys
}

/// Fully-fixsliced AES-128 decryption (the InvShiftRows is completely omitted).
///
/// Decrypts four blocks in parallel.
pub(crate) fn aes128_decrypt(rkeys: &FixsliceKeys128, blocks: &BatchBlocks) -> BatchBlocks {
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    add_round_key(&mut state, &rkeys[80..]);
    inv_sub_bytes(&mut state);

    #[cfg(not(aes_compact))]
    {
        inv_shift_rows_2(&mut state);
    }

    let mut rk_off = 72;
    loop {
        #[cfg(aes_compact)]
        {
            inv_shift_rows_2(&mut state);
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_1(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        if rk_off == 0 {
            break;
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_0(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        #[cfg(not(aes_compact))]
        {
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_3(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;

            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_2(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;
        }
    }

    add_round_key(&mut state, &rkeys[..8]);

    inv_bitslice(&state)
}

/// Fully-fixsliced AES-128 encryption (the ShiftRows is completely omitted).
///
/// Encrypts four blocks in parallel.
pub(crate) fn aes128_encrypt(rkeys: &FixsliceKeys128, blocks: &BatchBlocks) -> BatchBlocks {
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    add_round_key(&mut state, &rkeys[..8]);

    let mut rk_off = 8;
    loop {
        sub_bytes(&mut state);
        mix_columns_1(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;

        #[cfg(aes_compact)]
        {
            shift_rows_2(&mut state);
        }

        if rk_off == 80 {
            break;
        }

        #[cfg(not(aes_compact))]
        {
            sub_bytes(&mut state);
            mix_columns_2(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;

            sub_bytes(&mut state);
            mix_columns_3(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;
        }

        sub_bytes(&mut state);
        mix_columns_0(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;
    }

    #[cfg(not(aes_compact))]
    {
        shift_rows_2(&mut state);
    }

    sub_bytes(&mut state);
    add_round_key(&mut state, &rkeys[80..]);

    inv_bitslice(&state)
}
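
// A minimal known-answer sketch for the AES-128 path, using the FIPS-197
// Appendix C.1 example vector. The batch API always processes four blocks,
// so the plaintext is simply replicated across the batch.
#[cfg(test)]
mod aes128_kat_sketch {
    use super::*;

    #[test]
    fn fips197_appendix_c1() {
        let key = [
            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
            0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        ];
        let pt = [
            0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
            0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
        ];
        let ct = [
            0x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30,
            0xd8, 0xcd, 0xb7, 0x80, 0x70, 0xb4, 0xc5, 0x5a,
        ];

        let rkeys = aes128_key_schedule(&key);

        let mut blocks = BatchBlocks::default();
        for block in blocks.iter_mut() {
            block.copy_from_slice(&pt);
        }

        // All four lanes carry the same block, so all four outputs must match.
        let enc = aes128_encrypt(&rkeys, &blocks);
        for block in enc.iter() {
            assert_eq!(block.as_slice(), &ct[..]);
        }

        let dec = aes128_decrypt(&rkeys, &enc);
        for block in dec.iter() {
            assert_eq!(block.as_slice(), &pt[..]);
        }
    }
}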

/// Fully-fixsliced AES-192 decryption (the InvShiftRows is completely omitted).
///
/// Decrypts four blocks in parallel.
pub(crate) fn aes192_decrypt(rkeys: &FixsliceKeys192, blocks: &BatchBlocks) -> BatchBlocks {
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    add_round_key(&mut state, &rkeys[96..]);
    inv_sub_bytes(&mut state);

    let mut rk_off = 88;
    loop {
        #[cfg(aes_compact)]
        {
            inv_shift_rows_2(&mut state);
        }
        #[cfg(not(aes_compact))]
        {
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_3(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;

            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_2(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_1(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        if rk_off == 0 {
            break;
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_0(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;
    }

    add_round_key(&mut state, &rkeys[..8]);

    inv_bitslice(&state)
}

/// Fully-fixsliced AES-192 encryption (the ShiftRows is completely omitted).
///
/// Encrypts four blocks in parallel.
pub(crate) fn aes192_encrypt(rkeys: &FixsliceKeys192, blocks: &BatchBlocks) -> BatchBlocks {
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    add_round_key(&mut state, &rkeys[..8]);

    let mut rk_off = 8;
    loop {
        sub_bytes(&mut state);
        mix_columns_1(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;

        #[cfg(aes_compact)]
        {
            shift_rows_2(&mut state);
        }
        #[cfg(not(aes_compact))]
        {
            sub_bytes(&mut state);
            mix_columns_2(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;

            sub_bytes(&mut state);
            mix_columns_3(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;
        }

        if rk_off == 96 {
            break;
        }

        sub_bytes(&mut state);
        mix_columns_0(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;
    }

    sub_bytes(&mut state);
    add_round_key(&mut state, &rkeys[96..]);

    inv_bitslice(&state)
}
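
// The analogous known-answer sketch for AES-192, using the FIPS-197 Appendix
// C.2 example vector (same plaintext as the C.1 sketch above).
#[cfg(test)]
mod aes192_kat_sketch {
    use super::*;

    #[test]
    fn fips197_appendix_c2() {
        // The C.2 key is simply the byte sequence 0x00..=0x17.
        let mut key = [0u8; 24];
        for (i, b) in key.iter_mut().enumerate() {
            *b = i as u8;
        }
        let pt = [
            0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
            0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
        ];
        let ct = [
            0xdd, 0xa9, 0x7c, 0xa4, 0x86, 0x4c, 0xdf, 0xe0,
            0x6e, 0xaf, 0x70, 0xa0, 0xec, 0x0d, 0x71, 0x91,
        ];

        let rkeys = aes192_key_schedule(&key);

        let mut blocks = BatchBlocks::default();
        for block in blocks.iter_mut() {
            block.copy_from_slice(&pt);
        }

        let enc = aes192_encrypt(&rkeys, &blocks);
        assert_eq!(enc[0].as_slice(), &ct[..]);
        assert_eq!(aes192_decrypt(&rkeys, &enc)[0].as_slice(), &pt[..]);
    }
}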

/// Fully-fixsliced AES-256 decryption (the InvShiftRows is completely omitted).
///
/// Decrypts four blocks in parallel.
pub(crate) fn aes256_decrypt(rkeys: &FixsliceKeys256, blocks: &BatchBlocks) -> BatchBlocks {
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    add_round_key(&mut state, &rkeys[112..]);
    inv_sub_bytes(&mut state);

    #[cfg(not(aes_compact))]
    {
        inv_shift_rows_2(&mut state);
    }

    let mut rk_off = 104;
    loop {
        #[cfg(aes_compact)]
        {
            inv_shift_rows_2(&mut state);
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_1(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        if rk_off == 0 {
            break;
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_0(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        #[cfg(not(aes_compact))]
        {
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_3(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;

            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_2(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;
        }
    }

    add_round_key(&mut state, &rkeys[..8]);

    inv_bitslice(&state)
}

/// Fully-fixsliced AES-256 encryption (the ShiftRows is completely omitted).
///
/// Encrypts four blocks in parallel.
pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &BatchBlocks) -> BatchBlocks {
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    add_round_key(&mut state, &rkeys[..8]);

    let mut rk_off = 8;
    loop {
        sub_bytes(&mut state);
        mix_columns_1(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;

        #[cfg(aes_compact)]
        {
            shift_rows_2(&mut state);
        }

        if rk_off == 112 {
            break;
        }

        #[cfg(not(aes_compact))]
        {
            sub_bytes(&mut state);
            mix_columns_2(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;

            sub_bytes(&mut state);
            mix_columns_3(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;
        }

        sub_bytes(&mut state);
        mix_columns_0(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;
    }

    #[cfg(not(aes_compact))]
    {
        shift_rows_2(&mut state);
    }

    sub_bytes(&mut state);
    add_round_key(&mut state, &rkeys[112..]);

    inv_bitslice(&state)
}
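
// And the AES-256 counterpart, using the FIPS-197 Appendix C.3 example vector.
#[cfg(test)]
mod aes256_kat_sketch {
    use super::*;

    #[test]
    fn fips197_appendix_c3() {
        // The C.3 key is the byte sequence 0x00..=0x1f.
        let mut key = [0u8; 32];
        for (i, b) in key.iter_mut().enumerate() {
            *b = i as u8;
        }
        let pt = [
            0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
            0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
        ];
        let ct = [
            0x8e, 0xa2, 0xb7, 0xca, 0x51, 0x67, 0x45, 0xbf,
            0xea, 0xfc, 0x49, 0x90, 0x4b, 0x49, 0x60, 0x89,
        ];

        let rkeys = aes256_key_schedule(&key);

        let mut blocks = BatchBlocks::default();
        for block in blocks.iter_mut() {
            block.copy_from_slice(&pt);
        }

        let enc = aes256_encrypt(&rkeys, &blocks);
        assert_eq!(enc[0].as_slice(), &ct[..]);
        assert_eq!(aes256_decrypt(&rkeys, &enc)[0].as_slice(), &pt[..]);
    }
}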

/// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are accounted for here so that it is a true
/// inverse of `sub_bytes`.
fn inv_sub_bytes(state: &mut [u64]) {
    debug_assert_eq!(state.len(), 8);

    // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler
    // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4)

    let u7 = state[0];
    let u6 = state[1];
    let u5 = state[2];
    let u4 = state[3];
    let u3 = state[4];
    let u2 = state[5];
    let u1 = state[6];
    let u0 = state[7];

    let t23 = u0 ^ u3;
    let t8 = u1 ^ t23;
    let m2 = t23 & t8;
    let t4 = u4 ^ t8;
    let t22 = u1 ^ u3;
    let t2 = u0 ^ u1;
    let t1 = u3 ^ u4;
    // t23 -> stack
    let t9 = u7 ^ t1;
    // t8 -> stack
    let m7 = t22 & t9;
    // t9 -> stack
    let t24 = u4 ^ u7;
    // m7 -> stack
    let t10 = t2 ^ t24;
    // u4 -> stack
    let m14 = t2 & t10;
    let r5 = u6 ^ u7;
    // m2 -> stack
    let t3 = t1 ^ r5;
    // t2 -> stack
    let t13 = t2 ^ r5;
    let t19 = t22 ^ r5;
    // t3 -> stack
    let t17 = u2 ^ t19;
    // t4 -> stack
    let t25 = u2 ^ t1;
    let r13 = u1 ^ u6;
    // t25 -> stack
    let t20 = t24 ^ r13;
    // t17 -> stack
    let m9 = t20 & t17;
    // t20 -> stack
    let r17 = u2 ^ u5;
    // t22 -> stack
    let t6 = t22 ^ r17;
    // t13 -> stack
    let m1 = t13 & t6;
    let y5 = u0 ^ r17;
    let m4 = t19 & y5;
    let m5 = m4 ^ m1;
    let m17 = m5 ^ t24;
    let r18 = u5 ^ u6;
    let t27 = t1 ^ r18;
    let t15 = t10 ^ t27;
    // t6 -> stack
    let m11 = t1 & t15;
    let m15 = m14 ^ m11;
    let m21 = m17 ^ m15;
    // t1 -> stack
    // t4 <- stack
    let m12 = t4 & t27;
    let m13 = m12 ^ m11;
    let t14 = t10 ^ r18;
    let m3 = t14 ^ m1;
    // m2 <- stack
    let m16 = m3 ^ m2;
    let m20 = m16 ^ m13;
    // u4 <- stack
    let r19 = u2 ^ u4;
    let t16 = r13 ^ r19;
    // t3 <- stack
    let t26 = t3 ^ t16;
    let m6 = t3 & t16;
    let m8 = t26 ^ m6;
    // t10 -> stack
    // m7 <- stack
    let m18 = m8 ^ m7;
    let m22 = m18 ^ m13;
    let m25 = m22 & m20;
    let m26 = m21 ^ m25;
    let m10 = m9 ^ m6;
    let m19 = m10 ^ m15;
    // t25 <- stack
    let m23 = m19 ^ t25;
    let m28 = m23 ^ m25;
    let m24 = m22 ^ m23;
    let m30 = m26 & m24;
    let m39 = m23 ^ m30;
    let m48 = m39 & y5;
    let m57 = m39 & t19;
    // m48 -> stack
    let m36 = m24 ^ m25;
    let m31 = m20 & m23;
    let m27 = m20 ^ m21;
    let m32 = m27 & m31;
    let m29 = m28 & m27;
    let m37 = m21 ^ m29;
    // m39 -> stack
    let m42 = m37 ^ m39;
    let m52 = m42 & t15;
    // t27 -> stack
    // t1 <- stack
    let m61 = m42 & t1;
    let p0 = m52 ^ m61;
    let p16 = m57 ^ m61;
    // m57 -> stack
    // t20 <- stack
    let m60 = m37 & t20;
    // p16 -> stack
    // t17 <- stack
    let m51 = m37 & t17;
    let m33 = m27 ^ m25;
    let m38 = m32 ^ m33;
    let m43 = m37 ^ m38;
    let m49 = m43 & t16;
    let p6 = m49 ^ m60;
    let p13 = m49 ^ m51;
    let m58 = m43 & t3;
    // t9 <- stack
    let m50 = m38 & t9;
    // t22 <- stack
    let m59 = m38 & t22;
    // p6 -> stack
    let p1 = m58 ^ m59;
    let p7 = p0 ^ p1;
    let m34 = m21 & m22;
    let m35 = m24 & m34;
    let m40 = m35 ^ m36;
    let m41 = m38 ^ m40;
    let m45 = m42 ^ m41;
    // t27 <- stack
    let m53 = m45 & t27;
    let p8 = m50 ^ m53;
    let p23 = p7 ^ p8;
    // t4 <- stack
    let m62 = m45 & t4;
    let p14 = m49 ^ m62;
    let s6 = p14 ^ p23;
    // t10 <- stack
    let m54 = m41 & t10;
    let p2 = m54 ^ m62;
    let p22 = p2 ^ p7;
    let s0 = p13 ^ p22;
    let p17 = m58 ^ p2;
    let p15 = m54 ^ m59;
    // t2 <- stack
    let m63 = m41 & t2;
    // m39 <- stack
    let m44 = m39 ^ m40;
    // p17 -> stack
    // t6 <- stack
    let m46 = m44 & t6;
    let p5 = m46 ^ m51;
    // p23 -> stack
    let p18 = m63 ^ p5;
    let p24 = p5 ^ p7;
    // m48 <- stack
    let p12 = m46 ^ m48;
    let s3 = p12 ^ p22;
    // t13 <- stack
    let m55 = m44 & t13;
    let p9 = m55 ^ m63;
    // p16 <- stack
    let s7 = p9 ^ p16;
    // t8 <- stack
    let m47 = m40 & t8;
    let p3 = m47 ^ m50;
    let p19 = p2 ^ p3;
    let s5 = p19 ^ p24;
    let p11 = p0 ^ p3;
    let p26 = p9 ^ p11;
    // t23 <- stack
    let m56 = m40 & t23;
    let p4 = m48 ^ m56;
    // p6 <- stack
    let p20 = p4 ^ p6;
    let p29 = p15 ^ p20;
    let s1 = p26 ^ p29;
    // m57 <- stack
    let p10 = m57 ^ p4;
    let p27 = p10 ^ p18;
    // p23 <- stack
    let s4 = p23 ^ p27;
    let p25 = p6 ^ p10;
    let p28 = p11 ^ p25;
    // p17 <- stack
    let s2 = p17 ^ p28;

    state[0] = s7;
    state[1] = s6;
    state[2] = s5;
    state[3] = s4;
    state[4] = s3;
    state[5] = s2;
    state[6] = s1;
    state[7] = s0;
}

/// Bitsliced implementation of the AES Sbox based on Boyar, Peralta and Calik.
///
/// See: <http://www.cs.yale.edu/homes/peralta/CircuitStuff/SLP_AES_113.txt>
///
/// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are moved to the key schedule.
fn sub_bytes(state: &mut [u64]) {
    debug_assert_eq!(state.len(), 8);

    // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler
    // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4)

    let u7 = state[0];
    let u6 = state[1];
    let u5 = state[2];
    let u4 = state[3];
    let u3 = state[4];
    let u2 = state[5];
    let u1 = state[6];
    let u0 = state[7];

    let y14 = u3 ^ u5;
    let y13 = u0 ^ u6;
    let y12 = y13 ^ y14;
    let t1 = u4 ^ y12;
    let y15 = t1 ^ u5;
    let t2 = y12 & y15;
    let y6 = y15 ^ u7;
    let y20 = t1 ^ u1;
    // y12 -> stack
    let y9 = u0 ^ u3;
    // y20 -> stack
    let y11 = y20 ^ y9;
    // y9 -> stack
    let t12 = y9 & y11;
    // y6 -> stack
    let y7 = u7 ^ y11;
    let y8 = u0 ^ u5;
    let t0 = u1 ^ u2;
    let y10 = y15 ^ t0;
    // y15 -> stack
    let y17 = y10 ^ y11;
    // y14 -> stack
    let t13 = y14 & y17;
    let t14 = t13 ^ t12;
    // y17 -> stack
    let y19 = y10 ^ y8;
    // y10 -> stack
    let t15 = y8 & y10;
    let t16 = t15 ^ t12;
    let y16 = t0 ^ y11;
    // y11 -> stack
    let y21 = y13 ^ y16;
    // y13 -> stack
    let t7 = y13 & y16;
    // y16 -> stack
    let y18 = u0 ^ y16;
    let y1 = t0 ^ u7;
    let y4 = y1 ^ u3;
    // u7 -> stack
    let t5 = y4 & u7;
    let t6 = t5 ^ t2;
    let t18 = t6 ^ t16;
    let t22 = t18 ^ y19;
    let y2 = y1 ^ u0;
    let t10 = y2 & y7;
    let t11 = t10 ^ t7;
    let t20 = t11 ^ t16;
    let t24 = t20 ^ y18;
    let y5 = y1 ^ u6;
    let t8 = y5 & y1;
    let t9 = t8 ^ t7;
    let t19 = t9 ^ t14;
    let t23 = t19 ^ y21;
    let y3 = y5 ^ y8;
    // y6 <- stack
    let t3 = y3 & y6;
    let t4 = t3 ^ t2;
    // y20 <- stack
    let t17 = t4 ^ y20;
    let t21 = t17 ^ t14;
    let t26 = t21 & t23;
    let t27 = t24 ^ t26;
    let t31 = t22 ^ t26;
    let t25 = t21 ^ t22;
    // y4 -> stack
    let t28 = t25 & t27;
    let t29 = t28 ^ t22;
    let z14 = t29 & y2;
    let z5 = t29 & y7;
    let t30 = t23 ^ t24;
    let t32 = t31 & t30;
    let t33 = t32 ^ t24;
    let t35 = t27 ^ t33;
    let t36 = t24 & t35;
    let t38 = t27 ^ t36;
    let t39 = t29 & t38;
    let t40 = t25 ^ t39;
    let t43 = t29 ^ t40;
    // y16 <- stack
    let z3 = t43 & y16;
    let tc12 = z3 ^ z5;
    // tc12 -> stack
    // y13 <- stack
    let z12 = t43 & y13;
    let z13 = t40 & y5;
    let z4 = t40 & y1;
    let tc6 = z3 ^ z4;
    let t34 = t23 ^ t33;
    let t37 = t36 ^ t34;
    let t41 = t40 ^ t37;
    // y10 <- stack
    let z8 = t41 & y10;
    let z17 = t41 & y8;
    let t44 = t33 ^ t37;
    // y15 <- stack
    let z0 = t44 & y15;
    // z17 -> stack
    // y12 <- stack
    let z9 = t44 & y12;
    let z10 = t37 & y3;
    let z1 = t37 & y6;
    let tc5 = z1 ^ z0;
    let tc11 = tc6 ^ tc5;
    // y4 <- stack
    let z11 = t33 & y4;
    let t42 = t29 ^ t33;
    let t45 = t42 ^ t41;
    // y17 <- stack
    let z7 = t45 & y17;
    let tc8 = z7 ^ tc6;
    // y14 <- stack
    let z16 = t45 & y14;
    // y11 <- stack
    let z6 = t42 & y11;
    let tc16 = z6 ^ tc8;
    // z14 -> stack
    // y9 <- stack
    let z15 = t42 & y9;
    let tc20 = z15 ^ tc16;
    let tc1 = z15 ^ z16;
    let tc2 = z10 ^ tc1;
    let tc21 = tc2 ^ z11;
    let tc3 = z9 ^ tc2;
    let s0 = tc3 ^ tc16;
    let s3 = tc3 ^ tc11;
    let s1 = s3 ^ tc16;
    let tc13 = z13 ^ tc1;
    // u7 <- stack
    let z2 = t33 & u7;
    let tc4 = z0 ^ z2;
    let tc7 = z12 ^ tc4;
    let tc9 = z8 ^ tc7;
    let tc10 = tc8 ^ tc9;
    // z14 <- stack
    let tc17 = z14 ^ tc10;
    let s5 = tc21 ^ tc17;
    let tc26 = tc17 ^ tc20;
    // z17 <- stack
    let s2 = tc26 ^ z17;
    // tc12 <- stack
    let tc14 = tc4 ^ tc12;
    let tc18 = tc13 ^ tc14;
    let s6 = tc10 ^ tc18;
    let s7 = z12 ^ tc18;
    let s4 = tc14 ^ s3;

    state[0] = s7;
    state[1] = s6;
    state[2] = s5;
    state[3] = s4;
    state[4] = s3;
    state[5] = s2;
    state[6] = s1;
    state[7] = s0;
}

/// NOT operations that are omitted in S-box
#[inline]
fn sub_bytes_nots(state: &mut [u64]) {
    debug_assert_eq!(state.len(), 8);
    state[0] ^= 0xffffffffffffffff;
    state[1] ^= 0xffffffffffffffff;
    state[5] ^= 0xffffffffffffffff;
    state[6] ^= 0xffffffffffffffff;
}
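
// Sketch of the inverse relationship documented above: `inv_sub_bytes`
// already compensates for the omitted NOTs, so applying `sub_bytes` and then
// `inv_sub_bytes` should return any state unchanged.
#[cfg(test)]
mod sub_bytes_inverse_sketch {
    use super::*;

    #[test]
    fn sub_bytes_then_inv_sub_bytes_is_identity() {
        // Arbitrary state values; the inverse property must hold for all of them.
        let mut state: State = [
            0x0123456789abcdef,
            0xfedcba9876543210,
            0x0f0f0f0f0f0f0f0f,
            0xf0f0f0f0f0f0f0f0,
            0x00ff00ff00ff00ff,
            0xff00ff00ff00ff00,
            0xdeadbeefdeadbeef,
            0x0000000000000000,
        ];
        let expected = state;
        sub_bytes(&mut state);
        inv_sub_bytes(&mut state);
        assert_eq!(state, expected);
    }
}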

/// Computation of the MixColumns transformation in the fixsliced representation, with different
/// rotations used according to the round number mod 4.
///
/// Based on Käsper-Schwabe, similar to <https://github.com/Ko-/aes-armcortexm>.
macro_rules! define_mix_columns {
    (
        $name:ident,
        $name_inv:ident,
        $first_rotate:path,
        $second_rotate:path
    ) => {
        #[rustfmt::skip]
        fn $name(state: &mut State) {
            let (a0, a1, a2, a3, a4, a5, a6, a7) = (
                state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7]
            );
            let (b0, b1, b2, b3, b4, b5, b6, b7) = (
                $first_rotate(a0),
                $first_rotate(a1),
                $first_rotate(a2),
                $first_rotate(a3),
                $first_rotate(a4),
                $first_rotate(a5),
                $first_rotate(a6),
                $first_rotate(a7),
            );
            let (c0, c1, c2, c3, c4, c5, c6, c7) = (
                a0 ^ b0,
                a1 ^ b1,
                a2 ^ b2,
                a3 ^ b3,
                a4 ^ b4,
                a5 ^ b5,
                a6 ^ b6,
                a7 ^ b7,
            );
            state[0] = b0      ^ c7 ^ $second_rotate(c0);
            state[1] = b1 ^ c0 ^ c7 ^ $second_rotate(c1);
            state[2] = b2 ^ c1      ^ $second_rotate(c2);
            state[3] = b3 ^ c2 ^ c7 ^ $second_rotate(c3);
            state[4] = b4 ^ c3 ^ c7 ^ $second_rotate(c4);
            state[5] = b5 ^ c4      ^ $second_rotate(c5);
            state[6] = b6 ^ c5      ^ $second_rotate(c6);
            state[7] = b7 ^ c6      ^ $second_rotate(c7);
        }

        #[rustfmt::skip]
        fn $name_inv(state: &mut State) {
            let (a0, a1, a2, a3, a4, a5, a6, a7) = (
                state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7]
            );
            let (b0, b1, b2, b3, b4, b5, b6, b7) = (
                $first_rotate(a0),
                $first_rotate(a1),
                $first_rotate(a2),
                $first_rotate(a3),
                $first_rotate(a4),
                $first_rotate(a5),
                $first_rotate(a6),
                $first_rotate(a7),
            );
            let (c0, c1, c2, c3, c4, c5, c6, c7) = (
                a0 ^ b0,
                a1 ^ b1,
                a2 ^ b2,
                a3 ^ b3,
                a4 ^ b4,
                a5 ^ b5,
                a6 ^ b6,
                a7 ^ b7,
            );
            let (d0, d1, d2, d3, d4, d5, d6, d7) = (
                a0      ^ c7,
                a1 ^ c0 ^ c7,
                a2 ^ c1,
                a3 ^ c2 ^ c7,
                a4 ^ c3 ^ c7,
                a5 ^ c4,
                a6 ^ c5,
                a7 ^ c6,
            );
            let (e0, e1, e2, e3, e4, e5, e6, e7) = (
                c0      ^ d6,
                c1      ^ d6 ^ d7,
                c2 ^ d0      ^ d7,
                c3 ^ d1 ^ d6,
                c4 ^ d2 ^ d6 ^ d7,
                c5 ^ d3      ^ d7,
                c6 ^ d4,
                c7 ^ d5,
            );
            state[0] = d0 ^ e0 ^ $second_rotate(e0);
            state[1] = d1 ^ e1 ^ $second_rotate(e1);
            state[2] = d2 ^ e2 ^ $second_rotate(e2);
            state[3] = d3 ^ e3 ^ $second_rotate(e3);
            state[4] = d4 ^ e4 ^ $second_rotate(e4);
            state[5] = d5 ^ e5 ^ $second_rotate(e5);
            state[6] = d6 ^ e6 ^ $second_rotate(e6);
            state[7] = d7 ^ e7 ^ $second_rotate(e7);
        }
    }
}

define_mix_columns!(
    mix_columns_0,
    inv_mix_columns_0,
    rotate_rows_1,
    rotate_rows_2
);

define_mix_columns!(
    mix_columns_1,
    inv_mix_columns_1,
    rotate_rows_and_columns_1_1,
    rotate_rows_and_columns_2_2
);

#[cfg(not(aes_compact))]
define_mix_columns!(
    mix_columns_2,
    inv_mix_columns_2,
    rotate_rows_and_columns_1_2,
    rotate_rows_2
);

#[cfg(not(aes_compact))]
define_mix_columns!(
    mix_columns_3,
    inv_mix_columns_3,
    rotate_rows_and_columns_1_3,
    rotate_rows_and_columns_2_2
);
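
// Sketch: each generated `inv_mix_columns_N` should undo the matching
// `mix_columns_N`; checking the always-available round-0 pair on an arbitrary
// state is a cheap sanity test of the macro above.
#[cfg(test)]
mod mix_columns_inverse_sketch {
    use super::*;

    #[test]
    fn mix_columns_0_roundtrip() {
        let mut state: State = [
            0x0123456789abcdef,
            0x02468ace13579bdf,
            0x559955aa66bb77cc,
            0x8844221100ffeedd,
            0xfedcba9876543210,
            0x0918273645546372,
            0xa5a5a5a55a5a5a5a,
            0x1020304050607080,
        ];
        let expected = state;
        mix_columns_0(&mut state);
        inv_mix_columns_0(&mut state);
        assert_eq!(state, expected);
    }
}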

/// Swaps the bits of `a` selected by `mask` with the bits `shift` positions
/// above them (a classic delta swap).
#[inline]
fn delta_swap_1(a: &mut u64, shift: u32, mask: u64) {
    let t = (*a ^ ((*a) >> shift)) & mask;
    *a ^= t ^ (t << shift);
}

/// Swaps the bits of `a` selected by `mask` with the bits of `b` found `shift`
/// positions above the corresponding positions.
#[inline]
fn delta_swap_2(a: &mut u64, b: &mut u64, shift: u32, mask: u64) {
    let t = (*a ^ ((*b) >> shift)) & mask;
    *a ^= t;
    *b ^= t << shift;
}
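
// Worked example of the delta swaps on small values: with `shift = 4` and
// `mask = 0x0f`, `delta_swap_1` exchanges the low nibble of a byte with the
// nibble above it, and `delta_swap_2` exchanges it with the high nibble of a
// second word.
#[cfg(test)]
mod delta_swap_sketch {
    use super::{delta_swap_1, delta_swap_2};

    #[test]
    fn nibble_swaps() {
        let mut x = 0xab_u64;
        delta_swap_1(&mut x, 4, 0x0f);
        assert_eq!(x, 0xba);

        let (mut a, mut b) = (0x12_u64, 0x34_u64);
        delta_swap_2(&mut a, &mut b, 4, 0x0f);
        assert_eq!((a, b), (0x13, 0x24));
    }
}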

/// Applies ShiftRows once on an AES state (or key).
#[cfg(any(not(aes_compact), feature = "hazmat"))]
#[inline]
fn shift_rows_1(state: &mut [u64]) {
    debug_assert_eq!(state.len(), 8);
    for x in state.iter_mut() {
        delta_swap_1(x, 8, 0x00f000ff000f0000);
        delta_swap_1(x, 4, 0x0f0f00000f0f0000);
    }
}

/// Applies ShiftRows twice on an AES state (or key).
#[inline]
fn shift_rows_2(state: &mut [u64]) {
    debug_assert_eq!(state.len(), 8);
    for x in state.iter_mut() {
        delta_swap_1(x, 8, 0x00ff000000ff0000);
    }
}

/// Applies ShiftRows three times on an AES state (or key).
#[inline]
fn shift_rows_3(state: &mut [u64]) {
    debug_assert_eq!(state.len(), 8);
    for x in state.iter_mut() {
        delta_swap_1(x, 8, 0x000f00ff00f00000);
        delta_swap_1(x, 4, 0x0f0f00000f0f0000);
    }
}

#[inline(always)]
fn inv_shift_rows_1(state: &mut [u64]) {
    shift_rows_3(state);
}

#[inline(always)]
fn inv_shift_rows_2(state: &mut [u64]) {
    shift_rows_2(state);
}

#[cfg(not(aes_compact))]
#[inline(always)]
fn inv_shift_rows_3(state: &mut [u64]) {
    shift_rows_1(state);
}
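
// Sketch of why `inv_shift_rows_2` can simply reuse `shift_rows_2`: ShiftRows
// applied four times is the identity, so the double application is its own
// inverse.
#[cfg(test)]
mod shift_rows_sketch {
    use super::shift_rows_2;

    #[test]
    fn shift_rows_2_is_self_inverse() {
        let mut state = [0u64; 8];
        for (i, word) in state.iter_mut().enumerate() {
            *word = 0x0123456789abcdef_u64.rotate_left(i as u32);
        }
        let expected = state;
        shift_rows_2(&mut state);
        shift_rows_2(&mut state);
        assert_eq!(state, expected);
    }
}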

/// XOR the columns after the S-box during the key schedule round function.
///
/// The `idx_xor` parameter refers to the index of the previous round key that is
/// involved in the XOR computation (should be 8 and 16 for AES-128 and AES-256,
/// respectively).
///
/// The `idx_ror` parameter refers to the rotation value, which varies between the
/// different key schedules.
fn xor_columns(rkeys: &mut [u64], offset: usize, idx_xor: usize, idx_ror: u32) {
    for i in 0..8 {
        let off_i = offset + i;
        let rk = rkeys[off_i - idx_xor] ^ (0x000f000f000f000f & ror(rkeys[off_i], idx_ror));
        rkeys[off_i] = rk
            ^ (0xfff0fff0fff0fff0 & (rk << 4))
            ^ (0xff00ff00ff00ff00 & (rk << 8))
            ^ (0xf000f000f000f000 & (rk << 12));
    }
}

/// Bitslice four 128-bit input blocks input0, input1, input2, input3 into a 512-bit internal state.
fn bitslice(output: &mut [u64], input0: &[u8], input1: &[u8], input2: &[u8], input3: &[u8]) {
    debug_assert_eq!(output.len(), 8);
    debug_assert_eq!(input0.len(), 16);
    debug_assert_eq!(input1.len(), 16);
    debug_assert_eq!(input2.len(), 16);
    debug_assert_eq!(input3.len(), 16);

    // Bitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at a
    // 9-bit index. AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the
    // index is initially ([b]lock, [c]olumn, [r]ow, [p]osition):
    //     b1 b0 c1 c0 r1 r0 p2 p1 p0
    //
    // The desired bitsliced data groups first by bit position, then row, column, block:
    //     p2 p1 p0 r1 r0 c1 c0 b1 b0

    #[rustfmt::skip]
    fn read_reordered(input: &[u8]) -> u64 {
        (u64::from(input[0x0])        ) |
        (u64::from(input[0x1]) << 0x10) |
        (u64::from(input[0x2]) << 0x20) |
        (u64::from(input[0x3]) << 0x30) |
        (u64::from(input[0x8]) << 0x08) |
        (u64::from(input[0x9]) << 0x18) |
        (u64::from(input[0xa]) << 0x28) |
        (u64::from(input[0xb]) << 0x38)
    }

    // Reorder each block's bytes on input
    //     __ __ c1 c0 r1 r0 __ __ __ => __ __ c0 r1 r0 c1 __ __ __
    // Reorder by relabeling (note the order of input)
    //     b1 b0 c0 __ __ __ __ __ __ => c0 b1 b0 __ __ __ __ __ __
    let mut t0 = read_reordered(&input0[0x00..0x0c]);
    let mut t4 = read_reordered(&input0[0x04..0x10]);
    let mut t1 = read_reordered(&input1[0x00..0x0c]);
    let mut t5 = read_reordered(&input1[0x04..0x10]);
    let mut t2 = read_reordered(&input2[0x00..0x0c]);
    let mut t6 = read_reordered(&input2[0x04..0x10]);
    let mut t3 = read_reordered(&input3[0x00..0x0c]);
    let mut t7 = read_reordered(&input3[0x04..0x10]);

    // Bit Index Swap 6 <-> 0:
    //     __ __ b0 __ __ __ __ __ p0 => __ __ p0 __ __ __ __ __ b0
    let m0 = 0x5555555555555555;
    delta_swap_2(&mut t1, &mut t0, 1, m0);
    delta_swap_2(&mut t3, &mut t2, 1, m0);
    delta_swap_2(&mut t5, &mut t4, 1, m0);
    delta_swap_2(&mut t7, &mut t6, 1, m0);

    // Bit Index Swap 7 <-> 1:
    //     __ b1 __ __ __ __ __ p1 __ => __ p1 __ __ __ __ __ b1 __
    let m1 = 0x3333333333333333;
    delta_swap_2(&mut t2, &mut t0, 2, m1);
    delta_swap_2(&mut t3, &mut t1, 2, m1);
    delta_swap_2(&mut t6, &mut t4, 2, m1);
    delta_swap_2(&mut t7, &mut t5, 2, m1);

    // Bit Index Swap 8 <-> 2:
    //     c0 __ __ __ __ __ p2 __ __ => p2 __ __ __ __ __ c0 __ __
    let m2 = 0x0f0f0f0f0f0f0f0f;
    delta_swap_2(&mut t4, &mut t0, 4, m2);
    delta_swap_2(&mut t5, &mut t1, 4, m2);
    delta_swap_2(&mut t6, &mut t2, 4, m2);
    delta_swap_2(&mut t7, &mut t3, 4, m2);

    // Final bitsliced bit index, as desired:
    //     p2 p1 p0 r1 r0 c1 c0 b1 b0
    output[0] = t0;
    output[1] = t1;
    output[2] = t2;
    output[3] = t3;
    output[4] = t4;
    output[5] = t5;
    output[6] = t6;
    output[7] = t7;
}

/// Un-bitslice a 512-bit internal state into four 128-bit blocks of output.
fn inv_bitslice(input: &[u64]) -> BatchBlocks {
    debug_assert_eq!(input.len(), 8);

    // Unbitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at
    // a 9-bit index. AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the
    // desired index for the output is ([b]lock, [c]olumn, [r]ow, [p]osition):
    //     b1 b0 c1 c0 r1 r0 p2 p1 p0
    //
    // The initially bitsliced data groups first by bit position, then row, column, block:
    //     p2 p1 p0 r1 r0 c1 c0 b1 b0

    let mut t0 = input[0];
    let mut t1 = input[1];
    let mut t2 = input[2];
    let mut t3 = input[3];
    let mut t4 = input[4];
    let mut t5 = input[5];
    let mut t6 = input[6];
    let mut t7 = input[7];

    // TODO: these bit index swaps are identical to those in 'packing'

    // Bit Index Swap 6 <-> 0:
    //     __ __ p0 __ __ __ __ __ b0 => __ __ b0 __ __ __ __ __ p0
    let m0 = 0x5555555555555555;
    delta_swap_2(&mut t1, &mut t0, 1, m0);
    delta_swap_2(&mut t3, &mut t2, 1, m0);
    delta_swap_2(&mut t5, &mut t4, 1, m0);
    delta_swap_2(&mut t7, &mut t6, 1, m0);

    // Bit Index Swap 7 <-> 1:
    //     __ p1 __ __ __ __ __ b1 __ => __ b1 __ __ __ __ __ p1 __
    let m1 = 0x3333333333333333;
    delta_swap_2(&mut t2, &mut t0, 2, m1);
    delta_swap_2(&mut t3, &mut t1, 2, m1);
    delta_swap_2(&mut t6, &mut t4, 2, m1);
    delta_swap_2(&mut t7, &mut t5, 2, m1);

    // Bit Index Swap 8 <-> 2:
    //     p2 __ __ __ __ __ c0 __ __ => c0 __ __ __ __ __ p2 __ __
    let m2 = 0x0f0f0f0f0f0f0f0f;
    delta_swap_2(&mut t4, &mut t0, 4, m2);
    delta_swap_2(&mut t5, &mut t1, 4, m2);
    delta_swap_2(&mut t6, &mut t2, 4, m2);
    delta_swap_2(&mut t7, &mut t3, 4, m2);

    #[rustfmt::skip]
    fn write_reordered(columns: u64, output: &mut [u8]) {
        output[0x0] = (columns        ) as u8;
        output[0x1] = (columns >> 0x10) as u8;
        output[0x2] = (columns >> 0x20) as u8;
        output[0x3] = (columns >> 0x30) as u8;
        output[0x8] = (columns >> 0x08) as u8;
        output[0x9] = (columns >> 0x18) as u8;
        output[0xa] = (columns >> 0x28) as u8;
        output[0xb] = (columns >> 0x38) as u8;
    }

    let mut output = BatchBlocks::default();
    // Reorder by relabeling (note the order of output)
    //     c0 b1 b0 __ __ __ __ __ __ => b1 b0 c0 __ __ __ __ __ __
    // Reorder each block's bytes on output
    //     __ __ c0 r1 r0 c1 __ __ __ => __ __ c1 c0 r1 r0 __ __ __
    write_reordered(t0, &mut output[0][0x00..0x0c]);
    write_reordered(t4, &mut output[0][0x04..0x10]);
    write_reordered(t1, &mut output[1][0x00..0x0c]);
    write_reordered(t5, &mut output[1][0x04..0x10]);
    write_reordered(t2, &mut output[2][0x00..0x0c]);
    write_reordered(t6, &mut output[2][0x04..0x10]);
    write_reordered(t3, &mut output[3][0x00..0x0c]);
    write_reordered(t7, &mut output[3][0x04..0x10]);

    // Final AES bit index, as desired:
    //     b1 b0 c1 c0 r1 r0 p2 p1 p0
    output
}
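
// Round-trip sketch for the packing above: `inv_bitslice` undoes `bitslice`,
// so packing four distinct blocks and immediately unpacking them must be the
// identity.
#[cfg(test)]
mod bitslice_roundtrip_sketch {
    use super::*;

    #[test]
    fn bitslice_then_inv_bitslice_is_identity() {
        let mut blocks = BatchBlocks::default();
        for (i, block) in blocks.iter_mut().enumerate() {
            for (j, byte) in block.iter_mut().enumerate() {
                *byte = (16 * i + j) as u8;
            }
        }

        let mut state = State::default();
        bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
        assert_eq!(inv_bitslice(&state), blocks);
    }
}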

/// Copy the eight `u64` words starting at `src_offset` forward by eight words,
/// i.e. duplicate the previous round key into the next round-key slot.
fn memshift32(buffer: &mut [u64], src_offset: usize) {
    debug_assert_eq!(src_offset % 8, 0);

    let dst_offset = src_offset + 8;
    debug_assert!(dst_offset + 8 <= buffer.len());

    for i in (0..8).rev() {
        buffer[dst_offset + i] = buffer[src_offset + i];
    }
}
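
// Quick sketch of `memshift32`'s effect: words `src..src + 8` end up
// duplicated eight positions later, leaving the source words untouched.
#[cfg(test)]
mod memshift_sketch {
    use super::memshift32;

    #[test]
    fn duplicates_words_forward() {
        let mut buf = [0u64; 16];
        for (i, word) in buf[..8].iter_mut().enumerate() {
            *word = i as u64 + 1;
        }
        memshift32(&mut buf, 0);
        assert_eq!(buf[..8], buf[8..]);
        assert_eq!(buf[8], 1);
    }
}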

/// XOR the round key to the internal state. The round keys are expected to be
/// pre-computed and to be packed in the fixsliced representation.
#[inline]
fn add_round_key(state: &mut State, rkey: &[u64]) {
    debug_assert_eq!(rkey.len(), 8);
    for (a, b) in state.iter_mut().zip(rkey) {
        *a ^= b;
    }
}

#[inline(always)]
fn add_round_constant_bit(state: &mut [u64], bit: usize) {
    state[bit] ^= 0x00000000f0000000;
}

#[inline(always)]
fn ror(x: u64, y: u32) -> u64 {
    x.rotate_right(y)
}

#[inline(always)]
fn ror_distance(rows: u32, cols: u32) -> u32 {
    (rows << 4) + (cols << 2)
}
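
// In this packing a row step is 16 bits and a column step is 4 bits, so
// `ror_distance(rows, cols)` is simply `16 * rows + 4 * cols`; a few examples:
#[cfg(test)]
mod ror_distance_sketch {
    use super::ror_distance;

    #[test]
    fn rotation_distances() {
        assert_eq!(ror_distance(1, 0), 16); // one row
        assert_eq!(ror_distance(0, 3), 12); // three columns
        assert_eq!(ror_distance(1, 3), 28); // mixed, as used in the key schedules
    }
}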

#[inline(always)]
fn rotate_rows_1(x: u64) -> u64 {
    ror(x, ror_distance(1, 0))
}

#[inline(always)]
fn rotate_rows_2(x: u64) -> u64 {
    ror(x, ror_distance(2, 0))
}

#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_1_1(x: u64) -> u64 {
    (ror(x, ror_distance(1, 1)) & 0x0fff0fff0fff0fff) |
    (ror(x, ror_distance(0, 1)) & 0xf000f000f000f000)
}

#[cfg(not(aes_compact))]
#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_1_2(x: u64) -> u64 {
    (ror(x, ror_distance(1, 2)) & 0x00ff00ff00ff00ff) |
    (ror(x, ror_distance(0, 2)) & 0xff00ff00ff00ff00)
}

#[cfg(not(aes_compact))]
#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_1_3(x: u64) -> u64 {
    (ror(x, ror_distance(1, 3)) & 0x000f000f000f000f) |
    (ror(x, ror_distance(0, 3)) & 0xfff0fff0fff0fff0)
}

#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_2_2(x: u64) -> u64 {
    (ror(x, ror_distance(2, 2)) & 0x00ff00ff00ff00ff) |
    (ror(x, ror_distance(1, 2)) & 0xff00ff00ff00ff00)
}

/// Low-level "hazmat" AES functions.
///
/// Note: this isn't actually used in the `Aes128`/`Aes192`/`Aes256`
/// implementations in this crate, but instead provides raw access to
/// the AES round function gated under the `hazmat` crate feature.
#[cfg(feature = "hazmat")]
pub(crate) mod hazmat {
    use super::{
        bitslice, inv_bitslice, inv_mix_columns_0, inv_shift_rows_1, inv_sub_bytes, mix_columns_0,
        shift_rows_1, sub_bytes, sub_bytes_nots, State,
    };
    use crate::{Block, Block8};

    /// XOR the `src` block into the `dst` block in-place.
    fn xor_in_place(dst: &mut Block, src: &Block) {
        for (a, b) in dst.iter_mut().zip(src.as_slice()) {
            *a ^= *b;
        }
    }

    /// Perform a bitslice operation, loading a single block.
    fn bitslice_block(block: &Block) -> State {
        let mut state = State::default();
        bitslice(&mut state, block, block, block, block);
        state
    }

    /// Perform an inverse bitslice operation, extracting a single block.
    fn inv_bitslice_block(block: &mut Block, state: &State) {
        block.copy_from_slice(&inv_bitslice(state)[0]);
    }

    /// AES cipher (encrypt) round function.
    #[inline]
    pub(crate) fn cipher_round(block: &mut Block, round_key: &Block) {
        let mut state = bitslice_block(block);
        sub_bytes(&mut state);
        sub_bytes_nots(&mut state);
        shift_rows_1(&mut state);
        mix_columns_0(&mut state);
        inv_bitslice_block(block, &state);
        xor_in_place(block, round_key);
    }

    /// AES cipher (encrypt) round function: parallel version.
    #[inline]
    pub(crate) fn cipher_round_par(blocks: &mut Block8, round_keys: &Block8) {
        for (chunk, keys) in blocks.chunks_exact_mut(4).zip(round_keys.chunks_exact(4)) {
            let mut state = State::default();
            bitslice(&mut state, &chunk[0], &chunk[1], &chunk[2], &chunk[3]);
            sub_bytes(&mut state);
            sub_bytes_nots(&mut state);
            shift_rows_1(&mut state);
            mix_columns_0(&mut state);
            let res = inv_bitslice(&state);

            for i in 0..4 {
                chunk[i] = res[i];
                xor_in_place(&mut chunk[i], &keys[i]);
            }
        }
    }

    /// AES equivalent inverse cipher (decrypt) round function.
    #[inline]
    pub(crate) fn equiv_inv_cipher_round(block: &mut Block, round_key: &Block) {
        let mut state = State::default();
        bitslice(&mut state, block, block, block, block);
        sub_bytes_nots(&mut state);
        inv_sub_bytes(&mut state);
        inv_shift_rows_1(&mut state);
        inv_mix_columns_0(&mut state);
        inv_bitslice_block(block, &state);
        xor_in_place(block, round_key);
    }

    /// AES equivalent inverse cipher (decrypt) round function: parallel version.
    #[inline]
    pub(crate) fn equiv_inv_cipher_round_par(blocks: &mut Block8, round_keys: &Block8) {
        for (chunk, keys) in blocks.chunks_exact_mut(4).zip(round_keys.chunks_exact(4)) {
            let mut state = State::default();
            bitslice(&mut state, &chunk[0], &chunk[1], &chunk[2], &chunk[3]);
            sub_bytes_nots(&mut state);
            inv_sub_bytes(&mut state);
            inv_shift_rows_1(&mut state);
            inv_mix_columns_0(&mut state);
            let res = inv_bitslice(&state);

            for i in 0..4 {
                chunk[i] = res[i];
                xor_in_place(&mut chunk[i], &keys[i]);
            }
        }
    }

    /// AES mix columns function.
    #[inline]
    pub(crate) fn mix_columns(block: &mut Block) {
        let mut state = bitslice_block(block);
        mix_columns_0(&mut state);
        inv_bitslice_block(block, &state);
    }

    /// AES inverse mix columns function.
    #[inline]
    pub(crate) fn inv_mix_columns(block: &mut Block) {
        let mut state = bitslice_block(block);
        inv_mix_columns_0(&mut state);
        inv_bitslice_block(block, &state);
    }
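
    // Sketch: the two single-block helpers above should invert one another,
    // mirroring the `mix_columns_0`/`inv_mix_columns_0` pair they wrap.
    #[cfg(test)]
    mod hazmat_mix_columns_sketch {
        use super::{inv_mix_columns, mix_columns};
        use crate::Block;

        #[test]
        fn mix_columns_roundtrip() {
            let mut block = Block::default();
            for (i, byte) in block.iter_mut().enumerate() {
                *byte = i as u8;
            }
            let expected = block.clone();
            mix_columns(&mut block);
            inv_mix_columns(&mut block);
            assert_eq!(block, expected);
        }
    }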
}