use super::{super::overlapping::IndexError, Counter, Key, Overlapping, BLOCK_LEN};
use crate::{constant_time, polyfill::sliceutil};
use core::mem::size_of;
pub(super) fn ChaCha20_ctr32(key: &Key, counter: Counter, mut in_out: Overlapping<'_>) {
const SIGMA: [u32; 4] = [
u32::from_le_bytes(*b"expa"),
u32::from_le_bytes(*b"nd 3"),
u32::from_le_bytes(*b"2-by"),
u32::from_le_bytes(*b"te k"),
];
let key = key.words_less_safe();
let counter = counter.into_words_less_safe();
let mut state = [
SIGMA[0], SIGMA[1], SIGMA[2], SIGMA[3], key[0], key[1], key[2], key[3], key[4], key[5],
key[6], key[7], counter[0], counter[1], counter[2], counter[3],
];
let mut in_out_len = in_out.len();
let mut buf = [0u8; BLOCK_LEN];
while in_out_len > 0 {
chacha_core(&mut buf, &state);
state[12] += 1;
debug_assert_eq!(in_out_len, in_out.len());
if in_out_len >= BLOCK_LEN {
in_out = in_out
.split_first_chunk::<BLOCK_LEN>(|in_out| {
constant_time::xor_assign_at_start(&mut buf, in_out.input());
sliceutil::overwrite_at_start(in_out.into_unwritten_output(), &buf);
})
.unwrap_or_else(|IndexError { .. }| {
unreachable!()
});
} else {
constant_time::xor_assign_at_start(&mut buf, in_out.input());
sliceutil::overwrite_at_start(in_out.into_unwritten_output(), &buf);
break;
}
in_out_len -= BLOCK_LEN;
}
}
#[inline(always)]
fn chacha_core(output: &mut [u8; BLOCK_LEN], input: &State) {
let mut x = *input;
for _ in (0..20).step_by(2) {
quarterround(&mut x, 0, 4, 8, 12);
quarterround(&mut x, 1, 5, 9, 13);
quarterround(&mut x, 2, 6, 10, 14);
quarterround(&mut x, 3, 7, 11, 15);
quarterround(&mut x, 0, 5, 10, 15);
quarterround(&mut x, 1, 6, 11, 12);
quarterround(&mut x, 2, 7, 8, 13);
quarterround(&mut x, 3, 4, 9, 14);
}
for (x, input) in x.iter_mut().zip(input.iter()) {
*x = x.wrapping_add(*input);
}
output
.chunks_exact_mut(size_of::<u32>())
.zip(x.iter())
.for_each(|(output, &x)| output.copy_from_slice(&x.to_le_bytes()));
}
#[inline(always)]
fn quarterround(x: &mut State, a: usize, b: usize, c: usize, d: usize) {
#[inline(always)]
fn step(x: &mut State, a: usize, b: usize, c: usize, rotation: u32) {
x[a] = x[a].wrapping_add(x[b]);
x[c] = (x[c] ^ x[a]).rotate_left(rotation);
}
step(x, a, b, d, 16);
step(x, c, d, b, 12);
step(x, a, b, d, 8);
step(x, c, d, b, 7);
}
type State = [u32; BLOCK_LEN / 4];