aesni: Avoid loading AES/GHASH round keys into local variables

The performance impact is not measurable, as the compiler loads these values
into xmm registers in the unrolled loops anyway.

However, we avoid loading these sensitive keys onto the stack. This spilling
happens for the larger key schedules, where the register count is insufficient.
If that key material never ends up on the stack, we can avoid wiping it
explicitly after crypto operations.
Martin Willi 2015-04-14 12:38:18 +02:00
parent 93f0080265
commit 37794878cc
6 changed files with 1192 additions and 1516 deletions
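The pattern applied throughout the diff below is the same in every function: drop the per-round local __m128i copies (k0 ... k14) and index the key schedule through a single pointer (ks). As a minimal sketch, assuming a simplified key struct and placeholder function names that are not the plugin's actual API, the old and new shapes of an AES-128 block encryption look like this:

/* Standalone sketch, not part of the diff. Compile with: gcc -maes -c roundkey_demo.c */
#include <wmmintrin.h>

/* Hypothetical stand-in for the plugin's aesni_key_t; AES-128 uses 11 round keys. */
typedef struct {
	__m128i schedule[11];
} demo_key_t;

/* Old pattern: every round key is copied into a local __m128i first. For the
 * larger AES-192/256 schedules the register count is insufficient, so some of
 * these locals get spilled to the stack, where key material lingers and would
 * have to be wiped explicitly. */
__m128i encrypt_block_locals(demo_key_t *key, __m128i b)
{
	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;

	k0 = key->schedule[0]; k1 = key->schedule[1]; k2 = key->schedule[2];
	k3 = key->schedule[3]; k4 = key->schedule[4]; k5 = key->schedule[5];
	k6 = key->schedule[6]; k7 = key->schedule[7]; k8 = key->schedule[8];
	k9 = key->schedule[9]; k10 = key->schedule[10];

	b = _mm_xor_si128(b, k0);
	b = _mm_aesenc_si128(b, k1);
	b = _mm_aesenc_si128(b, k2);
	b = _mm_aesenc_si128(b, k3);
	b = _mm_aesenc_si128(b, k4);
	b = _mm_aesenc_si128(b, k5);
	b = _mm_aesenc_si128(b, k6);
	b = _mm_aesenc_si128(b, k7);
	b = _mm_aesenc_si128(b, k8);
	b = _mm_aesenc_si128(b, k9);
	return _mm_aesenclast_si128(b, k10);
}

/* New pattern: keep only a pointer to the schedule and reference ks[i] in each
 * round. The round keys stay in the key schedule's own (wiped) memory, and the
 * AESENC instruction can take its key operand from memory, so no stack copies
 * of key material are created. */
__m128i encrypt_block_pointer(demo_key_t *key, __m128i b)
{
	__m128i *ks = key->schedule;

	b = _mm_xor_si128(b, ks[0]);
	b = _mm_aesenc_si128(b, ks[1]);
	b = _mm_aesenc_si128(b, ks[2]);
	b = _mm_aesenc_si128(b, ks[3]);
	b = _mm_aesenc_si128(b, ks[4]);
	b = _mm_aesenc_si128(b, ks[5]);
	b = _mm_aesenc_si128(b, ks[6]);
	b = _mm_aesenc_si128(b, ks[7]);
	b = _mm_aesenc_si128(b, ks[8]);
	b = _mm_aesenc_si128(b, ks[9]);
	return _mm_aesenclast_si128(b, ks[10]);
}

Inside the hot, unrolled loops the compiler still keeps ks[i] in xmm registers, which is why the change is performance-neutral; the point is only that no copies of the round keys are forced onto the stack for the larger schedules.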


@ -70,22 +70,10 @@ struct private_aesni_cbc_t {
static void encrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
u_char *iv, u_char *out)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
__m128i t, fb, *bi, *bo;
__m128i *ks, t, fb, *bi, *bo;
int i;
k0 = key->schedule[0];
k1 = key->schedule[1];
k2 = key->schedule[2];
k3 = key->schedule[3];
k4 = key->schedule[4];
k5 = key->schedule[5];
k6 = key->schedule[6];
k7 = key->schedule[7];
k8 = key->schedule[8];
k9 = key->schedule[9];
k10 = key->schedule[10];
ks = key->schedule;
bi = (__m128i*)in;
bo = (__m128i*)out;
@ -94,19 +82,19 @@ static void encrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
{
t = _mm_loadu_si128(bi + i);
fb = _mm_xor_si128(t, fb);
fb = _mm_xor_si128(fb, k0);
fb = _mm_xor_si128(fb, ks[0]);
fb = _mm_aesenc_si128(fb, k1);
fb = _mm_aesenc_si128(fb, k2);
fb = _mm_aesenc_si128(fb, k3);
fb = _mm_aesenc_si128(fb, k4);
fb = _mm_aesenc_si128(fb, k5);
fb = _mm_aesenc_si128(fb, k6);
fb = _mm_aesenc_si128(fb, k7);
fb = _mm_aesenc_si128(fb, k8);
fb = _mm_aesenc_si128(fb, k9);
fb = _mm_aesenc_si128(fb, ks[1]);
fb = _mm_aesenc_si128(fb, ks[2]);
fb = _mm_aesenc_si128(fb, ks[3]);
fb = _mm_aesenc_si128(fb, ks[4]);
fb = _mm_aesenc_si128(fb, ks[5]);
fb = _mm_aesenc_si128(fb, ks[6]);
fb = _mm_aesenc_si128(fb, ks[7]);
fb = _mm_aesenc_si128(fb, ks[8]);
fb = _mm_aesenc_si128(fb, ks[9]);
fb = _mm_aesenclast_si128(fb, k10);
fb = _mm_aesenclast_si128(fb, ks[10]);
_mm_storeu_si128(bo + i, fb);
}
}
@ -117,24 +105,12 @@ static void encrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
static void decrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
u_char *iv, u_char *out)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
__m128i last, *bi, *bo;
__m128i *ks, last, *bi, *bo;
__m128i t1, t2, t3, t4;
__m128i f1, f2, f3, f4;
u_int i, pblocks;
k0 = key->schedule[0];
k1 = key->schedule[1];
k2 = key->schedule[2];
k3 = key->schedule[3];
k4 = key->schedule[4];
k5 = key->schedule[5];
k6 = key->schedule[6];
k7 = key->schedule[7];
k8 = key->schedule[8];
k9 = key->schedule[9];
k10 = key->schedule[10];
ks = key->schedule;
bi = (__m128i*)in;
bo = (__m128i*)out;
pblocks = blocks - (blocks % CBC_DECRYPT_PARALLELISM);
@ -153,52 +129,52 @@ static void decrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
f4 = t3;
last = t4;
t1 = _mm_xor_si128(t1, k0);
t2 = _mm_xor_si128(t2, k0);
t3 = _mm_xor_si128(t3, k0);
t4 = _mm_xor_si128(t4, k0);
t1 = _mm_xor_si128(t1, ks[0]);
t2 = _mm_xor_si128(t2, ks[0]);
t3 = _mm_xor_si128(t3, ks[0]);
t4 = _mm_xor_si128(t4, ks[0]);
t1 = _mm_aesdec_si128(t1, k1);
t2 = _mm_aesdec_si128(t2, k1);
t3 = _mm_aesdec_si128(t3, k1);
t4 = _mm_aesdec_si128(t4, k1);
t1 = _mm_aesdec_si128(t1, k2);
t2 = _mm_aesdec_si128(t2, k2);
t3 = _mm_aesdec_si128(t3, k2);
t4 = _mm_aesdec_si128(t4, k2);
t1 = _mm_aesdec_si128(t1, k3);
t2 = _mm_aesdec_si128(t2, k3);
t3 = _mm_aesdec_si128(t3, k3);
t4 = _mm_aesdec_si128(t4, k3);
t1 = _mm_aesdec_si128(t1, k4);
t2 = _mm_aesdec_si128(t2, k4);
t3 = _mm_aesdec_si128(t3, k4);
t4 = _mm_aesdec_si128(t4, k4);
t1 = _mm_aesdec_si128(t1, k5);
t2 = _mm_aesdec_si128(t2, k5);
t3 = _mm_aesdec_si128(t3, k5);
t4 = _mm_aesdec_si128(t4, k5);
t1 = _mm_aesdec_si128(t1, k6);
t2 = _mm_aesdec_si128(t2, k6);
t3 = _mm_aesdec_si128(t3, k6);
t4 = _mm_aesdec_si128(t4, k6);
t1 = _mm_aesdec_si128(t1, k7);
t2 = _mm_aesdec_si128(t2, k7);
t3 = _mm_aesdec_si128(t3, k7);
t4 = _mm_aesdec_si128(t4, k7);
t1 = _mm_aesdec_si128(t1, k8);
t2 = _mm_aesdec_si128(t2, k8);
t3 = _mm_aesdec_si128(t3, k8);
t4 = _mm_aesdec_si128(t4, k8);
t1 = _mm_aesdec_si128(t1, k9);
t2 = _mm_aesdec_si128(t2, k9);
t3 = _mm_aesdec_si128(t3, k9);
t4 = _mm_aesdec_si128(t4, k9);
t1 = _mm_aesdec_si128(t1, ks[1]);
t2 = _mm_aesdec_si128(t2, ks[1]);
t3 = _mm_aesdec_si128(t3, ks[1]);
t4 = _mm_aesdec_si128(t4, ks[1]);
t1 = _mm_aesdec_si128(t1, ks[2]);
t2 = _mm_aesdec_si128(t2, ks[2]);
t3 = _mm_aesdec_si128(t3, ks[2]);
t4 = _mm_aesdec_si128(t4, ks[2]);
t1 = _mm_aesdec_si128(t1, ks[3]);
t2 = _mm_aesdec_si128(t2, ks[3]);
t3 = _mm_aesdec_si128(t3, ks[3]);
t4 = _mm_aesdec_si128(t4, ks[3]);
t1 = _mm_aesdec_si128(t1, ks[4]);
t2 = _mm_aesdec_si128(t2, ks[4]);
t3 = _mm_aesdec_si128(t3, ks[4]);
t4 = _mm_aesdec_si128(t4, ks[4]);
t1 = _mm_aesdec_si128(t1, ks[5]);
t2 = _mm_aesdec_si128(t2, ks[5]);
t3 = _mm_aesdec_si128(t3, ks[5]);
t4 = _mm_aesdec_si128(t4, ks[5]);
t1 = _mm_aesdec_si128(t1, ks[6]);
t2 = _mm_aesdec_si128(t2, ks[6]);
t3 = _mm_aesdec_si128(t3, ks[6]);
t4 = _mm_aesdec_si128(t4, ks[6]);
t1 = _mm_aesdec_si128(t1, ks[7]);
t2 = _mm_aesdec_si128(t2, ks[7]);
t3 = _mm_aesdec_si128(t3, ks[7]);
t4 = _mm_aesdec_si128(t4, ks[7]);
t1 = _mm_aesdec_si128(t1, ks[8]);
t2 = _mm_aesdec_si128(t2, ks[8]);
t3 = _mm_aesdec_si128(t3, ks[8]);
t4 = _mm_aesdec_si128(t4, ks[8]);
t1 = _mm_aesdec_si128(t1, ks[9]);
t2 = _mm_aesdec_si128(t2, ks[9]);
t3 = _mm_aesdec_si128(t3, ks[9]);
t4 = _mm_aesdec_si128(t4, ks[9]);
t1 = _mm_aesdeclast_si128(t1, k10);
t2 = _mm_aesdeclast_si128(t2, k10);
t3 = _mm_aesdeclast_si128(t3, k10);
t4 = _mm_aesdeclast_si128(t4, k10);
t1 = _mm_aesdeclast_si128(t1, ks[10]);
t2 = _mm_aesdeclast_si128(t2, ks[10]);
t3 = _mm_aesdeclast_si128(t3, ks[10]);
t4 = _mm_aesdeclast_si128(t4, ks[10]);
t1 = _mm_xor_si128(t1, f1);
t2 = _mm_xor_si128(t2, f2);
t3 = _mm_xor_si128(t3, f3);
@ -213,19 +189,19 @@ static void decrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
for (i = pblocks; i < blocks; i++)
{
last = _mm_loadu_si128(bi + i);
t1 = _mm_xor_si128(last, k0);
t1 = _mm_xor_si128(last, ks[0]);
t1 = _mm_aesdec_si128(t1, k1);
t1 = _mm_aesdec_si128(t1, k2);
t1 = _mm_aesdec_si128(t1, k3);
t1 = _mm_aesdec_si128(t1, k4);
t1 = _mm_aesdec_si128(t1, k5);
t1 = _mm_aesdec_si128(t1, k6);
t1 = _mm_aesdec_si128(t1, k7);
t1 = _mm_aesdec_si128(t1, k8);
t1 = _mm_aesdec_si128(t1, k9);
t1 = _mm_aesdec_si128(t1, ks[1]);
t1 = _mm_aesdec_si128(t1, ks[2]);
t1 = _mm_aesdec_si128(t1, ks[3]);
t1 = _mm_aesdec_si128(t1, ks[4]);
t1 = _mm_aesdec_si128(t1, ks[5]);
t1 = _mm_aesdec_si128(t1, ks[6]);
t1 = _mm_aesdec_si128(t1, ks[7]);
t1 = _mm_aesdec_si128(t1, ks[8]);
t1 = _mm_aesdec_si128(t1, ks[9]);
t1 = _mm_aesdeclast_si128(t1, k10);
t1 = _mm_aesdeclast_si128(t1, ks[10]);
t1 = _mm_xor_si128(t1, f1);
_mm_storeu_si128(bo + i, t1);
f1 = last;
@ -238,24 +214,10 @@ static void decrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
static void encrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
u_char *iv, u_char *out)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
__m128i t, fb, *bi, *bo;
__m128i *ks, t, fb, *bi, *bo;
int i;
k0 = key->schedule[0];
k1 = key->schedule[1];
k2 = key->schedule[2];
k3 = key->schedule[3];
k4 = key->schedule[4];
k5 = key->schedule[5];
k6 = key->schedule[6];
k7 = key->schedule[7];
k8 = key->schedule[8];
k9 = key->schedule[9];
k10 = key->schedule[10];
k11 = key->schedule[11];
k12 = key->schedule[12];
ks = key->schedule;
bi = (__m128i*)in;
bo = (__m128i*)out;
@ -264,21 +226,21 @@ static void encrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
{
t = _mm_loadu_si128(bi + i);
fb = _mm_xor_si128(t, fb);
fb = _mm_xor_si128(fb, k0);
fb = _mm_xor_si128(fb, ks[0]);
fb = _mm_aesenc_si128(fb, k1);
fb = _mm_aesenc_si128(fb, k2);
fb = _mm_aesenc_si128(fb, k3);
fb = _mm_aesenc_si128(fb, k4);
fb = _mm_aesenc_si128(fb, k5);
fb = _mm_aesenc_si128(fb, k6);
fb = _mm_aesenc_si128(fb, k7);
fb = _mm_aesenc_si128(fb, k8);
fb = _mm_aesenc_si128(fb, k9);
fb = _mm_aesenc_si128(fb, k10);
fb = _mm_aesenc_si128(fb, k11);
fb = _mm_aesenc_si128(fb, ks[1]);
fb = _mm_aesenc_si128(fb, ks[2]);
fb = _mm_aesenc_si128(fb, ks[3]);
fb = _mm_aesenc_si128(fb, ks[4]);
fb = _mm_aesenc_si128(fb, ks[5]);
fb = _mm_aesenc_si128(fb, ks[6]);
fb = _mm_aesenc_si128(fb, ks[7]);
fb = _mm_aesenc_si128(fb, ks[8]);
fb = _mm_aesenc_si128(fb, ks[9]);
fb = _mm_aesenc_si128(fb, ks[10]);
fb = _mm_aesenc_si128(fb, ks[11]);
fb = _mm_aesenclast_si128(fb, k12);
fb = _mm_aesenclast_si128(fb, ks[12]);
_mm_storeu_si128(bo + i, fb);
}
}
@ -289,26 +251,12 @@ static void encrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
static void decrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
u_char *iv, u_char *out)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
__m128i last, *bi, *bo;
__m128i *ks, last, *bi, *bo;
__m128i t1, t2, t3, t4;
__m128i f1, f2, f3, f4;
u_int i, pblocks;
k0 = key->schedule[0];
k1 = key->schedule[1];
k2 = key->schedule[2];
k3 = key->schedule[3];
k4 = key->schedule[4];
k5 = key->schedule[5];
k6 = key->schedule[6];
k7 = key->schedule[7];
k8 = key->schedule[8];
k9 = key->schedule[9];
k10 = key->schedule[10];
k11 = key->schedule[11];
k12 = key->schedule[12];
ks = key->schedule;
bi = (__m128i*)in;
bo = (__m128i*)out;
pblocks = blocks - (blocks % CBC_DECRYPT_PARALLELISM);
@ -327,60 +275,60 @@ static void decrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
f4 = t3;
last = t4;
t1 = _mm_xor_si128(t1, k0);
t2 = _mm_xor_si128(t2, k0);
t3 = _mm_xor_si128(t3, k0);
t4 = _mm_xor_si128(t4, k0);
t1 = _mm_xor_si128(t1, ks[0]);
t2 = _mm_xor_si128(t2, ks[0]);
t3 = _mm_xor_si128(t3, ks[0]);
t4 = _mm_xor_si128(t4, ks[0]);
t1 = _mm_aesdec_si128(t1, k1);
t2 = _mm_aesdec_si128(t2, k1);
t3 = _mm_aesdec_si128(t3, k1);
t4 = _mm_aesdec_si128(t4, k1);
t1 = _mm_aesdec_si128(t1, k2);
t2 = _mm_aesdec_si128(t2, k2);
t3 = _mm_aesdec_si128(t3, k2);
t4 = _mm_aesdec_si128(t4, k2);
t1 = _mm_aesdec_si128(t1, k3);
t2 = _mm_aesdec_si128(t2, k3);
t3 = _mm_aesdec_si128(t3, k3);
t4 = _mm_aesdec_si128(t4, k3);
t1 = _mm_aesdec_si128(t1, k4);
t2 = _mm_aesdec_si128(t2, k4);
t3 = _mm_aesdec_si128(t3, k4);
t4 = _mm_aesdec_si128(t4, k4);
t1 = _mm_aesdec_si128(t1, k5);
t2 = _mm_aesdec_si128(t2, k5);
t3 = _mm_aesdec_si128(t3, k5);
t4 = _mm_aesdec_si128(t4, k5);
t1 = _mm_aesdec_si128(t1, k6);
t2 = _mm_aesdec_si128(t2, k6);
t3 = _mm_aesdec_si128(t3, k6);
t4 = _mm_aesdec_si128(t4, k6);
t1 = _mm_aesdec_si128(t1, k7);
t2 = _mm_aesdec_si128(t2, k7);
t3 = _mm_aesdec_si128(t3, k7);
t4 = _mm_aesdec_si128(t4, k7);
t1 = _mm_aesdec_si128(t1, k8);
t2 = _mm_aesdec_si128(t2, k8);
t3 = _mm_aesdec_si128(t3, k8);
t4 = _mm_aesdec_si128(t4, k8);
t1 = _mm_aesdec_si128(t1, k9);
t2 = _mm_aesdec_si128(t2, k9);
t3 = _mm_aesdec_si128(t3, k9);
t4 = _mm_aesdec_si128(t4, k9);
t1 = _mm_aesdec_si128(t1, k10);
t2 = _mm_aesdec_si128(t2, k10);
t3 = _mm_aesdec_si128(t3, k10);
t4 = _mm_aesdec_si128(t4, k10);
t1 = _mm_aesdec_si128(t1, k11);
t2 = _mm_aesdec_si128(t2, k11);
t3 = _mm_aesdec_si128(t3, k11);
t4 = _mm_aesdec_si128(t4, k11);
t1 = _mm_aesdec_si128(t1, ks[1]);
t2 = _mm_aesdec_si128(t2, ks[1]);
t3 = _mm_aesdec_si128(t3, ks[1]);
t4 = _mm_aesdec_si128(t4, ks[1]);
t1 = _mm_aesdec_si128(t1, ks[2]);
t2 = _mm_aesdec_si128(t2, ks[2]);
t3 = _mm_aesdec_si128(t3, ks[2]);
t4 = _mm_aesdec_si128(t4, ks[2]);
t1 = _mm_aesdec_si128(t1, ks[3]);
t2 = _mm_aesdec_si128(t2, ks[3]);
t3 = _mm_aesdec_si128(t3, ks[3]);
t4 = _mm_aesdec_si128(t4, ks[3]);
t1 = _mm_aesdec_si128(t1, ks[4]);
t2 = _mm_aesdec_si128(t2, ks[4]);
t3 = _mm_aesdec_si128(t3, ks[4]);
t4 = _mm_aesdec_si128(t4, ks[4]);
t1 = _mm_aesdec_si128(t1, ks[5]);
t2 = _mm_aesdec_si128(t2, ks[5]);
t3 = _mm_aesdec_si128(t3, ks[5]);
t4 = _mm_aesdec_si128(t4, ks[5]);
t1 = _mm_aesdec_si128(t1, ks[6]);
t2 = _mm_aesdec_si128(t2, ks[6]);
t3 = _mm_aesdec_si128(t3, ks[6]);
t4 = _mm_aesdec_si128(t4, ks[6]);
t1 = _mm_aesdec_si128(t1, ks[7]);
t2 = _mm_aesdec_si128(t2, ks[7]);
t3 = _mm_aesdec_si128(t3, ks[7]);
t4 = _mm_aesdec_si128(t4, ks[7]);
t1 = _mm_aesdec_si128(t1, ks[8]);
t2 = _mm_aesdec_si128(t2, ks[8]);
t3 = _mm_aesdec_si128(t3, ks[8]);
t4 = _mm_aesdec_si128(t4, ks[8]);
t1 = _mm_aesdec_si128(t1, ks[9]);
t2 = _mm_aesdec_si128(t2, ks[9]);
t3 = _mm_aesdec_si128(t3, ks[9]);
t4 = _mm_aesdec_si128(t4, ks[9]);
t1 = _mm_aesdec_si128(t1, ks[10]);
t2 = _mm_aesdec_si128(t2, ks[10]);
t3 = _mm_aesdec_si128(t3, ks[10]);
t4 = _mm_aesdec_si128(t4, ks[10]);
t1 = _mm_aesdec_si128(t1, ks[11]);
t2 = _mm_aesdec_si128(t2, ks[11]);
t3 = _mm_aesdec_si128(t3, ks[11]);
t4 = _mm_aesdec_si128(t4, ks[11]);
t1 = _mm_aesdeclast_si128(t1, k12);
t2 = _mm_aesdeclast_si128(t2, k12);
t3 = _mm_aesdeclast_si128(t3, k12);
t4 = _mm_aesdeclast_si128(t4, k12);
t1 = _mm_aesdeclast_si128(t1, ks[12]);
t2 = _mm_aesdeclast_si128(t2, ks[12]);
t3 = _mm_aesdeclast_si128(t3, ks[12]);
t4 = _mm_aesdeclast_si128(t4, ks[12]);
t1 = _mm_xor_si128(t1, f1);
t2 = _mm_xor_si128(t2, f2);
t3 = _mm_xor_si128(t3, f3);
@ -395,21 +343,21 @@ static void decrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
for (i = pblocks; i < blocks; i++)
{
last = _mm_loadu_si128(bi + i);
t1 = _mm_xor_si128(last, k0);
t1 = _mm_xor_si128(last, ks[0]);
t1 = _mm_aesdec_si128(t1, k1);
t1 = _mm_aesdec_si128(t1, k2);
t1 = _mm_aesdec_si128(t1, k3);
t1 = _mm_aesdec_si128(t1, k4);
t1 = _mm_aesdec_si128(t1, k5);
t1 = _mm_aesdec_si128(t1, k6);
t1 = _mm_aesdec_si128(t1, k7);
t1 = _mm_aesdec_si128(t1, k8);
t1 = _mm_aesdec_si128(t1, k9);
t1 = _mm_aesdec_si128(t1, k10);
t1 = _mm_aesdec_si128(t1, k11);
t1 = _mm_aesdec_si128(t1, ks[1]);
t1 = _mm_aesdec_si128(t1, ks[2]);
t1 = _mm_aesdec_si128(t1, ks[3]);
t1 = _mm_aesdec_si128(t1, ks[4]);
t1 = _mm_aesdec_si128(t1, ks[5]);
t1 = _mm_aesdec_si128(t1, ks[6]);
t1 = _mm_aesdec_si128(t1, ks[7]);
t1 = _mm_aesdec_si128(t1, ks[8]);
t1 = _mm_aesdec_si128(t1, ks[9]);
t1 = _mm_aesdec_si128(t1, ks[10]);
t1 = _mm_aesdec_si128(t1, ks[11]);
t1 = _mm_aesdeclast_si128(t1, k12);
t1 = _mm_aesdeclast_si128(t1, ks[12]);
t1 = _mm_xor_si128(t1, f1);
_mm_storeu_si128(bo + i, t1);
f1 = last;
@ -422,26 +370,10 @@ static void decrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
static void encrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
u_char *iv, u_char *out)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
__m128i t, fb, *bi, *bo;
__m128i *ks, t, fb, *bi, *bo;
int i;
k0 = key->schedule[0];
k1 = key->schedule[1];
k2 = key->schedule[2];
k3 = key->schedule[3];
k4 = key->schedule[4];
k5 = key->schedule[5];
k6 = key->schedule[6];
k7 = key->schedule[7];
k8 = key->schedule[8];
k9 = key->schedule[9];
k10 = key->schedule[10];
k11 = key->schedule[11];
k12 = key->schedule[12];
k13 = key->schedule[13];
k14 = key->schedule[14];
ks = key->schedule;
bi = (__m128i*)in;
bo = (__m128i*)out;
@ -450,23 +382,23 @@ static void encrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
{
t = _mm_loadu_si128(bi + i);
fb = _mm_xor_si128(t, fb);
fb = _mm_xor_si128(fb, k0);
fb = _mm_xor_si128(fb, ks[0]);
fb = _mm_aesenc_si128(fb, k1);
fb = _mm_aesenc_si128(fb, k2);
fb = _mm_aesenc_si128(fb, k3);
fb = _mm_aesenc_si128(fb, k4);
fb = _mm_aesenc_si128(fb, k5);
fb = _mm_aesenc_si128(fb, k6);
fb = _mm_aesenc_si128(fb, k7);
fb = _mm_aesenc_si128(fb, k8);
fb = _mm_aesenc_si128(fb, k9);
fb = _mm_aesenc_si128(fb, k10);
fb = _mm_aesenc_si128(fb, k11);
fb = _mm_aesenc_si128(fb, k12);
fb = _mm_aesenc_si128(fb, k13);
fb = _mm_aesenc_si128(fb, ks[1]);
fb = _mm_aesenc_si128(fb, ks[2]);
fb = _mm_aesenc_si128(fb, ks[3]);
fb = _mm_aesenc_si128(fb, ks[4]);
fb = _mm_aesenc_si128(fb, ks[5]);
fb = _mm_aesenc_si128(fb, ks[6]);
fb = _mm_aesenc_si128(fb, ks[7]);
fb = _mm_aesenc_si128(fb, ks[8]);
fb = _mm_aesenc_si128(fb, ks[9]);
fb = _mm_aesenc_si128(fb, ks[10]);
fb = _mm_aesenc_si128(fb, ks[11]);
fb = _mm_aesenc_si128(fb, ks[12]);
fb = _mm_aesenc_si128(fb, ks[13]);
fb = _mm_aesenclast_si128(fb, k14);
fb = _mm_aesenclast_si128(fb, ks[14]);
_mm_storeu_si128(bo + i, fb);
}
}
@ -477,28 +409,12 @@ static void encrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
static void decrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
u_char *iv, u_char *out)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
__m128i last, *bi, *bo;
__m128i *ks, last, *bi, *bo;
__m128i t1, t2, t3, t4;
__m128i f1, f2, f3, f4;
u_int i, pblocks;
k0 = key->schedule[0];
k1 = key->schedule[1];
k2 = key->schedule[2];
k3 = key->schedule[3];
k4 = key->schedule[4];
k5 = key->schedule[5];
k6 = key->schedule[6];
k7 = key->schedule[7];
k8 = key->schedule[8];
k9 = key->schedule[9];
k10 = key->schedule[10];
k11 = key->schedule[11];
k12 = key->schedule[12];
k13 = key->schedule[13];
k14 = key->schedule[14];
ks = key->schedule;
bi = (__m128i*)in;
bo = (__m128i*)out;
pblocks = blocks - (blocks % CBC_DECRYPT_PARALLELISM);
@ -517,68 +433,68 @@ static void decrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
f4 = t3;
last = t4;
t1 = _mm_xor_si128(t1, k0);
t2 = _mm_xor_si128(t2, k0);
t3 = _mm_xor_si128(t3, k0);
t4 = _mm_xor_si128(t4, k0);
t1 = _mm_xor_si128(t1, ks[0]);
t2 = _mm_xor_si128(t2, ks[0]);
t3 = _mm_xor_si128(t3, ks[0]);
t4 = _mm_xor_si128(t4, ks[0]);
t1 = _mm_aesdec_si128(t1, k1);
t2 = _mm_aesdec_si128(t2, k1);
t3 = _mm_aesdec_si128(t3, k1);
t4 = _mm_aesdec_si128(t4, k1);
t1 = _mm_aesdec_si128(t1, k2);
t2 = _mm_aesdec_si128(t2, k2);
t3 = _mm_aesdec_si128(t3, k2);
t4 = _mm_aesdec_si128(t4, k2);
t1 = _mm_aesdec_si128(t1, k3);
t2 = _mm_aesdec_si128(t2, k3);
t3 = _mm_aesdec_si128(t3, k3);
t4 = _mm_aesdec_si128(t4, k3);
t1 = _mm_aesdec_si128(t1, k4);
t2 = _mm_aesdec_si128(t2, k4);
t3 = _mm_aesdec_si128(t3, k4);
t4 = _mm_aesdec_si128(t4, k4);
t1 = _mm_aesdec_si128(t1, k5);
t2 = _mm_aesdec_si128(t2, k5);
t3 = _mm_aesdec_si128(t3, k5);
t4 = _mm_aesdec_si128(t4, k5);
t1 = _mm_aesdec_si128(t1, k6);
t2 = _mm_aesdec_si128(t2, k6);
t3 = _mm_aesdec_si128(t3, k6);
t4 = _mm_aesdec_si128(t4, k6);
t1 = _mm_aesdec_si128(t1, k7);
t2 = _mm_aesdec_si128(t2, k7);
t3 = _mm_aesdec_si128(t3, k7);
t4 = _mm_aesdec_si128(t4, k7);
t1 = _mm_aesdec_si128(t1, k8);
t2 = _mm_aesdec_si128(t2, k8);
t3 = _mm_aesdec_si128(t3, k8);
t4 = _mm_aesdec_si128(t4, k8);
t1 = _mm_aesdec_si128(t1, k9);
t2 = _mm_aesdec_si128(t2, k9);
t3 = _mm_aesdec_si128(t3, k9);
t4 = _mm_aesdec_si128(t4, k9);
t1 = _mm_aesdec_si128(t1, k10);
t2 = _mm_aesdec_si128(t2, k10);
t3 = _mm_aesdec_si128(t3, k10);
t4 = _mm_aesdec_si128(t4, k10);
t1 = _mm_aesdec_si128(t1, k11);
t2 = _mm_aesdec_si128(t2, k11);
t3 = _mm_aesdec_si128(t3, k11);
t4 = _mm_aesdec_si128(t4, k11);
t1 = _mm_aesdec_si128(t1, k12);
t2 = _mm_aesdec_si128(t2, k12);
t3 = _mm_aesdec_si128(t3, k12);
t4 = _mm_aesdec_si128(t4, k12);
t1 = _mm_aesdec_si128(t1, k13);
t2 = _mm_aesdec_si128(t2, k13);
t3 = _mm_aesdec_si128(t3, k13);
t4 = _mm_aesdec_si128(t4, k13);
t1 = _mm_aesdec_si128(t1, ks[1]);
t2 = _mm_aesdec_si128(t2, ks[1]);
t3 = _mm_aesdec_si128(t3, ks[1]);
t4 = _mm_aesdec_si128(t4, ks[1]);
t1 = _mm_aesdec_si128(t1, ks[2]);
t2 = _mm_aesdec_si128(t2, ks[2]);
t3 = _mm_aesdec_si128(t3, ks[2]);
t4 = _mm_aesdec_si128(t4, ks[2]);
t1 = _mm_aesdec_si128(t1, ks[3]);
t2 = _mm_aesdec_si128(t2, ks[3]);
t3 = _mm_aesdec_si128(t3, ks[3]);
t4 = _mm_aesdec_si128(t4, ks[3]);
t1 = _mm_aesdec_si128(t1, ks[4]);
t2 = _mm_aesdec_si128(t2, ks[4]);
t3 = _mm_aesdec_si128(t3, ks[4]);
t4 = _mm_aesdec_si128(t4, ks[4]);
t1 = _mm_aesdec_si128(t1, ks[5]);
t2 = _mm_aesdec_si128(t2, ks[5]);
t3 = _mm_aesdec_si128(t3, ks[5]);
t4 = _mm_aesdec_si128(t4, ks[5]);
t1 = _mm_aesdec_si128(t1, ks[6]);
t2 = _mm_aesdec_si128(t2, ks[6]);
t3 = _mm_aesdec_si128(t3, ks[6]);
t4 = _mm_aesdec_si128(t4, ks[6]);
t1 = _mm_aesdec_si128(t1, ks[7]);
t2 = _mm_aesdec_si128(t2, ks[7]);
t3 = _mm_aesdec_si128(t3, ks[7]);
t4 = _mm_aesdec_si128(t4, ks[7]);
t1 = _mm_aesdec_si128(t1, ks[8]);
t2 = _mm_aesdec_si128(t2, ks[8]);
t3 = _mm_aesdec_si128(t3, ks[8]);
t4 = _mm_aesdec_si128(t4, ks[8]);
t1 = _mm_aesdec_si128(t1, ks[9]);
t2 = _mm_aesdec_si128(t2, ks[9]);
t3 = _mm_aesdec_si128(t3, ks[9]);
t4 = _mm_aesdec_si128(t4, ks[9]);
t1 = _mm_aesdec_si128(t1, ks[10]);
t2 = _mm_aesdec_si128(t2, ks[10]);
t3 = _mm_aesdec_si128(t3, ks[10]);
t4 = _mm_aesdec_si128(t4, ks[10]);
t1 = _mm_aesdec_si128(t1, ks[11]);
t2 = _mm_aesdec_si128(t2, ks[11]);
t3 = _mm_aesdec_si128(t3, ks[11]);
t4 = _mm_aesdec_si128(t4, ks[11]);
t1 = _mm_aesdec_si128(t1, ks[12]);
t2 = _mm_aesdec_si128(t2, ks[12]);
t3 = _mm_aesdec_si128(t3, ks[12]);
t4 = _mm_aesdec_si128(t4, ks[12]);
t1 = _mm_aesdec_si128(t1, ks[13]);
t2 = _mm_aesdec_si128(t2, ks[13]);
t3 = _mm_aesdec_si128(t3, ks[13]);
t4 = _mm_aesdec_si128(t4, ks[13]);
t1 = _mm_aesdeclast_si128(t1, k14);
t2 = _mm_aesdeclast_si128(t2, k14);
t3 = _mm_aesdeclast_si128(t3, k14);
t4 = _mm_aesdeclast_si128(t4, k14);
t1 = _mm_aesdeclast_si128(t1, ks[14]);
t2 = _mm_aesdeclast_si128(t2, ks[14]);
t3 = _mm_aesdeclast_si128(t3, ks[14]);
t4 = _mm_aesdeclast_si128(t4, ks[14]);
t1 = _mm_xor_si128(t1, f1);
t2 = _mm_xor_si128(t2, f2);
t3 = _mm_xor_si128(t3, f3);
@ -593,23 +509,23 @@ static void decrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
for (i = pblocks; i < blocks; i++)
{
last = _mm_loadu_si128(bi + i);
t1 = _mm_xor_si128(last, k0);
t1 = _mm_xor_si128(last, ks[0]);
t1 = _mm_aesdec_si128(t1, k1);
t1 = _mm_aesdec_si128(t1, k2);
t1 = _mm_aesdec_si128(t1, k3);
t1 = _mm_aesdec_si128(t1, k4);
t1 = _mm_aesdec_si128(t1, k5);
t1 = _mm_aesdec_si128(t1, k6);
t1 = _mm_aesdec_si128(t1, k7);
t1 = _mm_aesdec_si128(t1, k8);
t1 = _mm_aesdec_si128(t1, k9);
t1 = _mm_aesdec_si128(t1, k10);
t1 = _mm_aesdec_si128(t1, k11);
t1 = _mm_aesdec_si128(t1, k12);
t1 = _mm_aesdec_si128(t1, k13);
t1 = _mm_aesdec_si128(t1, ks[1]);
t1 = _mm_aesdec_si128(t1, ks[2]);
t1 = _mm_aesdec_si128(t1, ks[3]);
t1 = _mm_aesdec_si128(t1, ks[4]);
t1 = _mm_aesdec_si128(t1, ks[5]);
t1 = _mm_aesdec_si128(t1, ks[6]);
t1 = _mm_aesdec_si128(t1, ks[7]);
t1 = _mm_aesdec_si128(t1, ks[8]);
t1 = _mm_aesdec_si128(t1, ks[9]);
t1 = _mm_aesdec_si128(t1, ks[10]);
t1 = _mm_aesdec_si128(t1, ks[11]);
t1 = _mm_aesdec_si128(t1, ks[12]);
t1 = _mm_aesdec_si128(t1, ks[13]);
t1 = _mm_aesdeclast_si128(t1, k14);
t1 = _mm_aesdeclast_si128(t1, ks[14]);
t1 = _mm_xor_si128(t1, f1);
_mm_storeu_si128(bo + i, t1);
f1 = last;


@ -159,17 +159,18 @@ static void build_ctr(private_aesni_ccm_t *this, u_int32_t i, u_char *iv,
static __m128i icv_header(private_aesni_ccm_t *this, size_t len, u_char *iv,
u_int16_t alen, u_char *assoc)
{
__m128i b, t, c;
__m128i *ks, b, t, c;
u_int i, round, blocks, rem;
ks = this->key->schedule;
build_b0(this, len, alen, iv, &b);
c = _mm_loadu_si128(&b);
c = _mm_xor_si128(c, this->key->schedule[0]);
c = _mm_xor_si128(c, ks[0]);
for (round = 1; round < this->key->rounds; round++)
{
c = _mm_aesenc_si128(c, this->key->schedule[round]);
c = _mm_aesenc_si128(c, ks[round]);
}
c = _mm_aesenclast_si128(c, this->key->schedule[this->key->rounds]);
c = _mm_aesenclast_si128(c, ks[this->key->rounds]);
if (alen)
{
@ -200,12 +201,12 @@ static __m128i icv_header(private_aesni_ccm_t *this, size_t len, u_char *iv,
t = _mm_loadu_si128(((__m128i*)(assoc - sizeof(alen))) + i);
}
c = _mm_xor_si128(t, c);
c = _mm_xor_si128(c, this->key->schedule[0]);
c = _mm_xor_si128(c, ks[0]);
for (round = 1; round < this->key->rounds; round++)
{
c = _mm_aesenc_si128(c, this->key->schedule[round]);
c = _mm_aesenc_si128(c, ks[round]);
}
c = _mm_aesenclast_si128(c, this->key->schedule[this->key->rounds]);
c = _mm_aesenclast_si128(c, ks[this->key->rounds]);
}
}
return c;
@ -217,18 +218,19 @@ static __m128i icv_header(private_aesni_ccm_t *this, size_t len, u_char *iv,
static void crypt_icv(private_aesni_ccm_t *this, u_char *iv,
__m128i c, u_char *icv)
{
__m128i b, t;
__m128i *ks, b, t;
u_int round;
ks = this->key->schedule;
build_ctr(this, 0, iv, &b);
t = _mm_loadu_si128(&b);
t = _mm_xor_si128(t, this->key->schedule[0]);
t = _mm_xor_si128(t, ks[0]);
for (round = 1; round < this->key->rounds; round++)
{
t = _mm_aesenc_si128(t, this->key->schedule[round]);
t = _mm_aesenc_si128(t, ks[round]);
}
t = _mm_aesenclast_si128(t, this->key->schedule[this->key->rounds]);
t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
t = _mm_xor_si128(t, c);
@ -258,23 +260,24 @@ static inline __m128i increment_be(__m128i x)
static __m128i encrypt_ccm_rem(aesni_key_t *key, u_int rem, __m128i state,
void *in, void *out, __m128i c)
{
__m128i t, b, d;
__m128i *ks, t, b, d;
u_int round;
ks = key->schedule;
memset(&b, 0, sizeof(b));
memcpy(&b, in, rem);
d = _mm_loadu_si128(&b);
c = _mm_xor_si128(d, c);
c = _mm_xor_si128(c, key->schedule[0]);
t = _mm_xor_si128(state, key->schedule[0]);
c = _mm_xor_si128(c, ks[0]);
t = _mm_xor_si128(state, ks[0]);
for (round = 1; round < key->rounds; round++)
{
c = _mm_aesenc_si128(c, key->schedule[round]);
t = _mm_aesenc_si128(t, key->schedule[round]);
c = _mm_aesenc_si128(c, ks[round]);
t = _mm_aesenc_si128(t, ks[round]);
}
c = _mm_aesenclast_si128(c, key->schedule[key->rounds]);
t = _mm_aesenclast_si128(t, key->schedule[key->rounds]);
c = _mm_aesenclast_si128(c, ks[key->rounds]);
t = _mm_aesenclast_si128(t, ks[key->rounds]);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(&b, t);
@ -290,31 +293,32 @@ static __m128i encrypt_ccm_rem(aesni_key_t *key, u_int rem, __m128i state,
static __m128i decrypt_ccm_rem(aesni_key_t *key, u_int rem, __m128i state,
void *in, void *out, __m128i c)
{
__m128i t, b, d;
__m128i *ks, t, b, d;
u_int round;
ks = key->schedule;
memset(&b, 0, sizeof(b));
memcpy(&b, in, rem);
d = _mm_loadu_si128(&b);
t = _mm_xor_si128(state, key->schedule[0]);
t = _mm_xor_si128(state, ks[0]);
for (round = 1; round < key->rounds; round++)
{
t = _mm_aesenc_si128(t, key->schedule[round]);
t = _mm_aesenc_si128(t, ks[round]);
}
t = _mm_aesenclast_si128(t, key->schedule[key->rounds]);
t = _mm_aesenclast_si128(t, ks[key->rounds]);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(&b, t);
memset((u_char*)&b + rem, 0, sizeof(b) - rem);
t = _mm_loadu_si128(&b);
c = _mm_xor_si128(t, c);
c = _mm_xor_si128(c, key->schedule[0]);
c = _mm_xor_si128(c, ks[0]);
for (round = 1; round < key->rounds; round++)
{
c = _mm_aesenc_si128(c, key->schedule[round]);
c = _mm_aesenc_si128(c, ks[round]);
}
c = _mm_aesenclast_si128(c, key->schedule[key->rounds]);
c = _mm_aesenclast_si128(c, ks[key->rounds]);
memcpy(out, &b, rem);
@ -328,8 +332,7 @@ static void encrypt_ccm128(private_aesni_ccm_t *this,
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
__m128i d, t, c, b, state, *bi, *bo;
__m128i *ks, d, t, c, b, state, *bi, *bo;
u_int blocks, rem, i;
c = icv_header(this, len, iv, alen, assoc);
@ -340,47 +343,37 @@ static void encrypt_ccm128(private_aesni_ccm_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
k0 = this->key->schedule[0];
k1 = this->key->schedule[1];
k2 = this->key->schedule[2];
k3 = this->key->schedule[3];
k4 = this->key->schedule[4];
k5 = this->key->schedule[5];
k6 = this->key->schedule[6];
k7 = this->key->schedule[7];
k8 = this->key->schedule[8];
k9 = this->key->schedule[9];
k10 = this->key->schedule[10];
ks = this->key->schedule;
for (i = 0; i < blocks; i++)
{
d = _mm_loadu_si128(bi + i);
c = _mm_xor_si128(d, c);
c = _mm_xor_si128(c, k0);
t = _mm_xor_si128(state, k0);
c = _mm_xor_si128(c, ks[0]);
t = _mm_xor_si128(state, ks[0]);
c = _mm_aesenc_si128(c, k1);
t = _mm_aesenc_si128(t, k1);
c = _mm_aesenc_si128(c, k2);
t = _mm_aesenc_si128(t, k2);
c = _mm_aesenc_si128(c, k3);
t = _mm_aesenc_si128(t, k3);
c = _mm_aesenc_si128(c, k4);
t = _mm_aesenc_si128(t, k4);
c = _mm_aesenc_si128(c, k5);
t = _mm_aesenc_si128(t, k5);
c = _mm_aesenc_si128(c, k6);
t = _mm_aesenc_si128(t, k6);
c = _mm_aesenc_si128(c, k7);
t = _mm_aesenc_si128(t, k7);
c = _mm_aesenc_si128(c, k8);
t = _mm_aesenc_si128(t, k8);
c = _mm_aesenc_si128(c, k9);
t = _mm_aesenc_si128(t, k9);
c = _mm_aesenc_si128(c, ks[1]);
t = _mm_aesenc_si128(t, ks[1]);
c = _mm_aesenc_si128(c, ks[2]);
t = _mm_aesenc_si128(t, ks[2]);
c = _mm_aesenc_si128(c, ks[3]);
t = _mm_aesenc_si128(t, ks[3]);
c = _mm_aesenc_si128(c, ks[4]);
t = _mm_aesenc_si128(t, ks[4]);
c = _mm_aesenc_si128(c, ks[5]);
t = _mm_aesenc_si128(t, ks[5]);
c = _mm_aesenc_si128(c, ks[6]);
t = _mm_aesenc_si128(t, ks[6]);
c = _mm_aesenc_si128(c, ks[7]);
t = _mm_aesenc_si128(t, ks[7]);
c = _mm_aesenc_si128(c, ks[8]);
t = _mm_aesenc_si128(t, ks[8]);
c = _mm_aesenc_si128(c, ks[9]);
t = _mm_aesenc_si128(t, ks[9]);
c = _mm_aesenclast_si128(c, k10);
t = _mm_aesenclast_si128(t, k10);
c = _mm_aesenclast_si128(c, ks[10]);
t = _mm_aesenclast_si128(t, ks[10]);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(bo + i, t);
@ -402,8 +395,7 @@ static void decrypt_ccm128(private_aesni_ccm_t *this,
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
__m128i d, t, c, b, state, *bi, *bo;
__m128i *ks, d, t, c, b, state, *bi, *bo;
u_int blocks, rem, i;
c = icv_header(this, len, iv, alen, assoc);
@ -414,52 +406,42 @@ static void decrypt_ccm128(private_aesni_ccm_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
k0 = this->key->schedule[0];
k1 = this->key->schedule[1];
k2 = this->key->schedule[2];
k3 = this->key->schedule[3];
k4 = this->key->schedule[4];
k5 = this->key->schedule[5];
k6 = this->key->schedule[6];
k7 = this->key->schedule[7];
k8 = this->key->schedule[8];
k9 = this->key->schedule[9];
k10 = this->key->schedule[10];
ks = this->key->schedule;
for (i = 0; i < blocks; i++)
{
d = _mm_loadu_si128(bi + i);
t = _mm_xor_si128(state, k0);
t = _mm_xor_si128(state, ks[0]);
t = _mm_aesenc_si128(t, k1);
t = _mm_aesenc_si128(t, k2);
t = _mm_aesenc_si128(t, k3);
t = _mm_aesenc_si128(t, k4);
t = _mm_aesenc_si128(t, k5);
t = _mm_aesenc_si128(t, k6);
t = _mm_aesenc_si128(t, k7);
t = _mm_aesenc_si128(t, k8);
t = _mm_aesenc_si128(t, k9);
t = _mm_aesenc_si128(t, ks[1]);
t = _mm_aesenc_si128(t, ks[2]);
t = _mm_aesenc_si128(t, ks[3]);
t = _mm_aesenc_si128(t, ks[4]);
t = _mm_aesenc_si128(t, ks[5]);
t = _mm_aesenc_si128(t, ks[6]);
t = _mm_aesenc_si128(t, ks[7]);
t = _mm_aesenc_si128(t, ks[8]);
t = _mm_aesenc_si128(t, ks[9]);
t = _mm_aesenclast_si128(t, k10);
t = _mm_aesenclast_si128(t, ks[10]);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(bo + i, t);
c = _mm_xor_si128(t, c);
c = _mm_xor_si128(c, k0);
c = _mm_xor_si128(c, ks[0]);
c = _mm_aesenc_si128(c, k1);
c = _mm_aesenc_si128(c, k2);
c = _mm_aesenc_si128(c, k3);
c = _mm_aesenc_si128(c, k4);
c = _mm_aesenc_si128(c, k5);
c = _mm_aesenc_si128(c, k6);
c = _mm_aesenc_si128(c, k7);
c = _mm_aesenc_si128(c, k8);
c = _mm_aesenc_si128(c, k9);
c = _mm_aesenc_si128(c, ks[1]);
c = _mm_aesenc_si128(c, ks[2]);
c = _mm_aesenc_si128(c, ks[3]);
c = _mm_aesenc_si128(c, ks[4]);
c = _mm_aesenc_si128(c, ks[5]);
c = _mm_aesenc_si128(c, ks[6]);
c = _mm_aesenc_si128(c, ks[7]);
c = _mm_aesenc_si128(c, ks[8]);
c = _mm_aesenc_si128(c, ks[9]);
c = _mm_aesenclast_si128(c, k10);
c = _mm_aesenclast_si128(c, ks[10]);
state = increment_be(state);
}
@ -478,8 +460,7 @@ static void encrypt_ccm192(private_aesni_ccm_t *this,
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
__m128i d, t, c, b, state, *bi, *bo;
__m128i *ks, d, t, c, b, state, *bi, *bo;
u_int blocks, rem, i;
c = icv_header(this, len, iv, alen, assoc);
@ -490,53 +471,41 @@ static void encrypt_ccm192(private_aesni_ccm_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
k0 = this->key->schedule[0];
k1 = this->key->schedule[1];
k2 = this->key->schedule[2];
k3 = this->key->schedule[3];
k4 = this->key->schedule[4];
k5 = this->key->schedule[5];
k6 = this->key->schedule[6];
k7 = this->key->schedule[7];
k8 = this->key->schedule[8];
k9 = this->key->schedule[9];
k10 = this->key->schedule[10];
k11 = this->key->schedule[11];
k12 = this->key->schedule[12];
ks = this->key->schedule;
for (i = 0; i < blocks; i++)
{
d = _mm_loadu_si128(bi + i);
c = _mm_xor_si128(d, c);
c = _mm_xor_si128(c, k0);
t = _mm_xor_si128(state, k0);
c = _mm_xor_si128(c, ks[0]);
t = _mm_xor_si128(state, ks[0]);
c = _mm_aesenc_si128(c, k1);
t = _mm_aesenc_si128(t, k1);
c = _mm_aesenc_si128(c, k2);
t = _mm_aesenc_si128(t, k2);
c = _mm_aesenc_si128(c, k3);
t = _mm_aesenc_si128(t, k3);
c = _mm_aesenc_si128(c, k4);
t = _mm_aesenc_si128(t, k4);
c = _mm_aesenc_si128(c, k5);
t = _mm_aesenc_si128(t, k5);
c = _mm_aesenc_si128(c, k6);
t = _mm_aesenc_si128(t, k6);
c = _mm_aesenc_si128(c, k7);
t = _mm_aesenc_si128(t, k7);
c = _mm_aesenc_si128(c, k8);
t = _mm_aesenc_si128(t, k8);
c = _mm_aesenc_si128(c, k9);
t = _mm_aesenc_si128(t, k9);
c = _mm_aesenc_si128(c, k10);
t = _mm_aesenc_si128(t, k10);
c = _mm_aesenc_si128(c, k11);
t = _mm_aesenc_si128(t, k11);
c = _mm_aesenc_si128(c, ks[1]);
t = _mm_aesenc_si128(t, ks[1]);
c = _mm_aesenc_si128(c, ks[2]);
t = _mm_aesenc_si128(t, ks[2]);
c = _mm_aesenc_si128(c, ks[3]);
t = _mm_aesenc_si128(t, ks[3]);
c = _mm_aesenc_si128(c, ks[4]);
t = _mm_aesenc_si128(t, ks[4]);
c = _mm_aesenc_si128(c, ks[5]);
t = _mm_aesenc_si128(t, ks[5]);
c = _mm_aesenc_si128(c, ks[6]);
t = _mm_aesenc_si128(t, ks[6]);
c = _mm_aesenc_si128(c, ks[7]);
t = _mm_aesenc_si128(t, ks[7]);
c = _mm_aesenc_si128(c, ks[8]);
t = _mm_aesenc_si128(t, ks[8]);
c = _mm_aesenc_si128(c, ks[9]);
t = _mm_aesenc_si128(t, ks[9]);
c = _mm_aesenc_si128(c, ks[10]);
t = _mm_aesenc_si128(t, ks[10]);
c = _mm_aesenc_si128(c, ks[11]);
t = _mm_aesenc_si128(t, ks[11]);
c = _mm_aesenclast_si128(c, k12);
t = _mm_aesenclast_si128(t, k12);
c = _mm_aesenclast_si128(c, ks[12]);
t = _mm_aesenclast_si128(t, ks[12]);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(bo + i, t);
@ -558,8 +527,7 @@ static void decrypt_ccm192(private_aesni_ccm_t *this,
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
__m128i d, t, c, b, state, *bi, *bo;
__m128i *ks, d, t, c, b, state, *bi, *bo;
u_int blocks, rem, i;
c = icv_header(this, len, iv, alen, assoc);
@ -570,58 +538,46 @@ static void decrypt_ccm192(private_aesni_ccm_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
k0 = this->key->schedule[0];
k1 = this->key->schedule[1];
k2 = this->key->schedule[2];
k3 = this->key->schedule[3];
k4 = this->key->schedule[4];
k5 = this->key->schedule[5];
k6 = this->key->schedule[6];
k7 = this->key->schedule[7];
k8 = this->key->schedule[8];
k9 = this->key->schedule[9];
k10 = this->key->schedule[10];
k11 = this->key->schedule[11];
k12 = this->key->schedule[12];
ks = this->key->schedule;
for (i = 0; i < blocks; i++)
{
d = _mm_loadu_si128(bi + i);
t = _mm_xor_si128(state, k0);
t = _mm_xor_si128(state, ks[0]);
t = _mm_aesenc_si128(t, k1);
t = _mm_aesenc_si128(t, k2);
t = _mm_aesenc_si128(t, k3);
t = _mm_aesenc_si128(t, k4);
t = _mm_aesenc_si128(t, k5);
t = _mm_aesenc_si128(t, k6);
t = _mm_aesenc_si128(t, k7);
t = _mm_aesenc_si128(t, k8);
t = _mm_aesenc_si128(t, k9);
t = _mm_aesenc_si128(t, k10);
t = _mm_aesenc_si128(t, k11);
t = _mm_aesenc_si128(t, ks[1]);
t = _mm_aesenc_si128(t, ks[2]);
t = _mm_aesenc_si128(t, ks[3]);
t = _mm_aesenc_si128(t, ks[4]);
t = _mm_aesenc_si128(t, ks[5]);
t = _mm_aesenc_si128(t, ks[6]);
t = _mm_aesenc_si128(t, ks[7]);
t = _mm_aesenc_si128(t, ks[8]);
t = _mm_aesenc_si128(t, ks[9]);
t = _mm_aesenc_si128(t, ks[10]);
t = _mm_aesenc_si128(t, ks[11]);
t = _mm_aesenclast_si128(t, k12);
t = _mm_aesenclast_si128(t, ks[12]);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(bo + i, t);
c = _mm_xor_si128(t, c);
c = _mm_xor_si128(c, k0);
c = _mm_xor_si128(c, ks[0]);
c = _mm_aesenc_si128(c, k1);
c = _mm_aesenc_si128(c, k2);
c = _mm_aesenc_si128(c, k3);
c = _mm_aesenc_si128(c, k4);
c = _mm_aesenc_si128(c, k5);
c = _mm_aesenc_si128(c, k6);
c = _mm_aesenc_si128(c, k7);
c = _mm_aesenc_si128(c, k8);
c = _mm_aesenc_si128(c, k9);
c = _mm_aesenc_si128(c, k10);
c = _mm_aesenc_si128(c, k11);
c = _mm_aesenc_si128(c, ks[1]);
c = _mm_aesenc_si128(c, ks[2]);
c = _mm_aesenc_si128(c, ks[3]);
c = _mm_aesenc_si128(c, ks[4]);
c = _mm_aesenc_si128(c, ks[5]);
c = _mm_aesenc_si128(c, ks[6]);
c = _mm_aesenc_si128(c, ks[7]);
c = _mm_aesenc_si128(c, ks[8]);
c = _mm_aesenc_si128(c, ks[9]);
c = _mm_aesenc_si128(c, ks[10]);
c = _mm_aesenc_si128(c, ks[11]);
c = _mm_aesenclast_si128(c, k12);
c = _mm_aesenclast_si128(c, ks[12]);
state = increment_be(state);
}
@ -640,8 +596,7 @@ static void encrypt_ccm256(private_aesni_ccm_t *this,
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
__m128i d, t, c, b, state, *bi, *bo;
__m128i *ks, d, t, c, b, state, *bi, *bo;
u_int blocks, rem, i;
c = icv_header(this, len, iv, alen, assoc);
@ -652,59 +607,45 @@ static void encrypt_ccm256(private_aesni_ccm_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
k0 = this->key->schedule[0];
k1 = this->key->schedule[1];
k2 = this->key->schedule[2];
k3 = this->key->schedule[3];
k4 = this->key->schedule[4];
k5 = this->key->schedule[5];
k6 = this->key->schedule[6];
k7 = this->key->schedule[7];
k8 = this->key->schedule[8];
k9 = this->key->schedule[9];
k10 = this->key->schedule[10];
k11 = this->key->schedule[11];
k12 = this->key->schedule[12];
k13 = this->key->schedule[13];
k14 = this->key->schedule[14];
ks = this->key->schedule;
for (i = 0; i < blocks; i++)
{
d = _mm_loadu_si128(bi + i);
c = _mm_xor_si128(d, c);
c = _mm_xor_si128(c, k0);
t = _mm_xor_si128(state, k0);
c = _mm_xor_si128(c, ks[0]);
t = _mm_xor_si128(state, ks[0]);
c = _mm_aesenc_si128(c, k1);
t = _mm_aesenc_si128(t, k1);
c = _mm_aesenc_si128(c, k2);
t = _mm_aesenc_si128(t, k2);
c = _mm_aesenc_si128(c, k3);
t = _mm_aesenc_si128(t, k3);
c = _mm_aesenc_si128(c, k4);
t = _mm_aesenc_si128(t, k4);
c = _mm_aesenc_si128(c, k5);
t = _mm_aesenc_si128(t, k5);
c = _mm_aesenc_si128(c, k6);
t = _mm_aesenc_si128(t, k6);
c = _mm_aesenc_si128(c, k7);
t = _mm_aesenc_si128(t, k7);
c = _mm_aesenc_si128(c, k8);
t = _mm_aesenc_si128(t, k8);
c = _mm_aesenc_si128(c, k9);
t = _mm_aesenc_si128(t, k9);
c = _mm_aesenc_si128(c, k10);
t = _mm_aesenc_si128(t, k10);
c = _mm_aesenc_si128(c, k11);
t = _mm_aesenc_si128(t, k11);
c = _mm_aesenc_si128(c, k12);
t = _mm_aesenc_si128(t, k12);
c = _mm_aesenc_si128(c, k13);
t = _mm_aesenc_si128(t, k13);
c = _mm_aesenc_si128(c, ks[1]);
t = _mm_aesenc_si128(t, ks[1]);
c = _mm_aesenc_si128(c, ks[2]);
t = _mm_aesenc_si128(t, ks[2]);
c = _mm_aesenc_si128(c, ks[3]);
t = _mm_aesenc_si128(t, ks[3]);
c = _mm_aesenc_si128(c, ks[4]);
t = _mm_aesenc_si128(t, ks[4]);
c = _mm_aesenc_si128(c, ks[5]);
t = _mm_aesenc_si128(t, ks[5]);
c = _mm_aesenc_si128(c, ks[6]);
t = _mm_aesenc_si128(t, ks[6]);
c = _mm_aesenc_si128(c, ks[7]);
t = _mm_aesenc_si128(t, ks[7]);
c = _mm_aesenc_si128(c, ks[8]);
t = _mm_aesenc_si128(t, ks[8]);
c = _mm_aesenc_si128(c, ks[9]);
t = _mm_aesenc_si128(t, ks[9]);
c = _mm_aesenc_si128(c, ks[10]);
t = _mm_aesenc_si128(t, ks[10]);
c = _mm_aesenc_si128(c, ks[11]);
t = _mm_aesenc_si128(t, ks[11]);
c = _mm_aesenc_si128(c, ks[12]);
t = _mm_aesenc_si128(t, ks[12]);
c = _mm_aesenc_si128(c, ks[13]);
t = _mm_aesenc_si128(t, ks[13]);
c = _mm_aesenclast_si128(c, k14);
t = _mm_aesenclast_si128(t, k14);
c = _mm_aesenclast_si128(c, ks[14]);
t = _mm_aesenclast_si128(t, ks[14]);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(bo + i, t);
@ -726,8 +667,7 @@ static void decrypt_ccm256(private_aesni_ccm_t *this,
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
__m128i d, t, c, b, state, *bi, *bo;
__m128i *ks, d, t, c, b, state, *bi, *bo;
u_int blocks, rem, i;
c = icv_header(this, len, iv, alen, assoc);
@ -738,64 +678,50 @@ static void decrypt_ccm256(private_aesni_ccm_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
k0 = this->key->schedule[0];
k1 = this->key->schedule[1];
k2 = this->key->schedule[2];
k3 = this->key->schedule[3];
k4 = this->key->schedule[4];
k5 = this->key->schedule[5];
k6 = this->key->schedule[6];
k7 = this->key->schedule[7];
k8 = this->key->schedule[8];
k9 = this->key->schedule[9];
k10 = this->key->schedule[10];
k11 = this->key->schedule[11];
k12 = this->key->schedule[12];
k13 = this->key->schedule[13];
k14 = this->key->schedule[14];
ks = this->key->schedule;
for (i = 0; i < blocks; i++)
{
d = _mm_loadu_si128(bi + i);
t = _mm_xor_si128(state, k0);
t = _mm_xor_si128(state, ks[0]);
t = _mm_aesenc_si128(t, k1);
t = _mm_aesenc_si128(t, k2);
t = _mm_aesenc_si128(t, k3);
t = _mm_aesenc_si128(t, k4);
t = _mm_aesenc_si128(t, k5);
t = _mm_aesenc_si128(t, k6);
t = _mm_aesenc_si128(t, k7);
t = _mm_aesenc_si128(t, k8);
t = _mm_aesenc_si128(t, k9);
t = _mm_aesenc_si128(t, k10);
t = _mm_aesenc_si128(t, k11);
t = _mm_aesenc_si128(t, k12);
t = _mm_aesenc_si128(t, k13);
t = _mm_aesenc_si128(t, ks[1]);
t = _mm_aesenc_si128(t, ks[2]);
t = _mm_aesenc_si128(t, ks[3]);
t = _mm_aesenc_si128(t, ks[4]);
t = _mm_aesenc_si128(t, ks[5]);
t = _mm_aesenc_si128(t, ks[6]);
t = _mm_aesenc_si128(t, ks[7]);
t = _mm_aesenc_si128(t, ks[8]);
t = _mm_aesenc_si128(t, ks[9]);
t = _mm_aesenc_si128(t, ks[10]);
t = _mm_aesenc_si128(t, ks[11]);
t = _mm_aesenc_si128(t, ks[12]);
t = _mm_aesenc_si128(t, ks[13]);
t = _mm_aesenclast_si128(t, k14);
t = _mm_aesenclast_si128(t, ks[14]);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(bo + i, t);
c = _mm_xor_si128(t, c);
c = _mm_xor_si128(c, k0);
c = _mm_xor_si128(c, ks[0]);
c = _mm_aesenc_si128(c, k1);
c = _mm_aesenc_si128(c, k2);
c = _mm_aesenc_si128(c, k3);
c = _mm_aesenc_si128(c, k4);
c = _mm_aesenc_si128(c, k5);
c = _mm_aesenc_si128(c, k6);
c = _mm_aesenc_si128(c, k7);
c = _mm_aesenc_si128(c, k8);
c = _mm_aesenc_si128(c, k9);
c = _mm_aesenc_si128(c, k10);
c = _mm_aesenc_si128(c, k11);
c = _mm_aesenc_si128(c, k12);
c = _mm_aesenc_si128(c, k13);
c = _mm_aesenc_si128(c, ks[1]);
c = _mm_aesenc_si128(c, ks[2]);
c = _mm_aesenc_si128(c, ks[3]);
c = _mm_aesenc_si128(c, ks[4]);
c = _mm_aesenc_si128(c, ks[5]);
c = _mm_aesenc_si128(c, ks[6]);
c = _mm_aesenc_si128(c, ks[7]);
c = _mm_aesenc_si128(c, ks[8]);
c = _mm_aesenc_si128(c, ks[9]);
c = _mm_aesenc_si128(c, ks[10]);
c = _mm_aesenc_si128(c, ks[11]);
c = _mm_aesenc_si128(c, ks[12]);
c = _mm_aesenc_si128(c, ks[13]);
c = _mm_aesenclast_si128(c, k14);
c = _mm_aesenclast_si128(c, ks[14]);
state = increment_be(state);
}


@ -67,8 +67,7 @@ struct private_mac_t {
METHOD(mac_t, get_mac, bool,
private_mac_t *this, chunk_t data, u_int8_t *out)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
__m128i t, l, *bi;
__m128i *ks, t, l, *bi;
u_int blocks, rem, i;
if (!this->k)
@ -76,18 +75,7 @@ METHOD(mac_t, get_mac, bool,
return FALSE;
}
k0 = this->k->schedule[0];
k1 = this->k->schedule[1];
k2 = this->k->schedule[2];
k3 = this->k->schedule[3];
k4 = this->k->schedule[4];
k5 = this->k->schedule[5];
k6 = this->k->schedule[6];
k7 = this->k->schedule[7];
k8 = this->k->schedule[8];
k9 = this->k->schedule[9];
k10 = this->k->schedule[10];
ks = this->k->schedule;
t = this->t;
if (this->rem_size + data.len > AES_BLOCK_SIZE)
@ -105,17 +93,17 @@ METHOD(mac_t, get_mac, bool,
t = _mm_xor_si128(t, _mm_loadu_si128((__m128i*)this->rem));
t = _mm_xor_si128(t, k0);
t = _mm_aesenc_si128(t, k1);
t = _mm_aesenc_si128(t, k2);
t = _mm_aesenc_si128(t, k3);
t = _mm_aesenc_si128(t, k4);
t = _mm_aesenc_si128(t, k5);
t = _mm_aesenc_si128(t, k6);
t = _mm_aesenc_si128(t, k7);
t = _mm_aesenc_si128(t, k8);
t = _mm_aesenc_si128(t, k9);
t = _mm_aesenclast_si128(t, k10);
t = _mm_xor_si128(t, ks[0]);
t = _mm_aesenc_si128(t, ks[1]);
t = _mm_aesenc_si128(t, ks[2]);
t = _mm_aesenc_si128(t, ks[3]);
t = _mm_aesenc_si128(t, ks[4]);
t = _mm_aesenc_si128(t, ks[5]);
t = _mm_aesenc_si128(t, ks[6]);
t = _mm_aesenc_si128(t, ks[7]);
t = _mm_aesenc_si128(t, ks[8]);
t = _mm_aesenc_si128(t, ks[9]);
t = _mm_aesenclast_si128(t, ks[10]);
/* process blocks M_2 ... M_n-1 */
bi = (__m128i*)data.ptr;
@ -132,17 +120,17 @@ METHOD(mac_t, get_mac, bool,
{
t = _mm_xor_si128(t, _mm_loadu_si128(bi + i));
t = _mm_xor_si128(t, k0);
t = _mm_aesenc_si128(t, k1);
t = _mm_aesenc_si128(t, k2);
t = _mm_aesenc_si128(t, k3);
t = _mm_aesenc_si128(t, k4);
t = _mm_aesenc_si128(t, k5);
t = _mm_aesenc_si128(t, k6);
t = _mm_aesenc_si128(t, k7);
t = _mm_aesenc_si128(t, k8);
t = _mm_aesenc_si128(t, k9);
t = _mm_aesenclast_si128(t, k10);
t = _mm_xor_si128(t, ks[0]);
t = _mm_aesenc_si128(t, ks[1]);
t = _mm_aesenc_si128(t, ks[2]);
t = _mm_aesenc_si128(t, ks[3]);
t = _mm_aesenc_si128(t, ks[4]);
t = _mm_aesenc_si128(t, ks[5]);
t = _mm_aesenc_si128(t, ks[6]);
t = _mm_aesenc_si128(t, ks[7]);
t = _mm_aesenc_si128(t, ks[8]);
t = _mm_aesenc_si128(t, ks[9]);
t = _mm_aesenclast_si128(t, ks[10]);
}
/* store remaining bytes of block M_n */
@ -188,17 +176,17 @@ METHOD(mac_t, get_mac, bool,
*/
t = _mm_xor_si128(l, t);
t = _mm_xor_si128(t, k0);
t = _mm_aesenc_si128(t, k1);
t = _mm_aesenc_si128(t, k2);
t = _mm_aesenc_si128(t, k3);
t = _mm_aesenc_si128(t, k4);
t = _mm_aesenc_si128(t, k5);
t = _mm_aesenc_si128(t, k6);
t = _mm_aesenc_si128(t, k7);
t = _mm_aesenc_si128(t, k8);
t = _mm_aesenc_si128(t, k9);
t = _mm_aesenclast_si128(t, k10);
t = _mm_xor_si128(t, ks[0]);
t = _mm_aesenc_si128(t, ks[1]);
t = _mm_aesenc_si128(t, ks[2]);
t = _mm_aesenc_si128(t, ks[3]);
t = _mm_aesenc_si128(t, ks[4]);
t = _mm_aesenc_si128(t, ks[5]);
t = _mm_aesenc_si128(t, ks[6]);
t = _mm_aesenc_si128(t, ks[7]);
t = _mm_aesenc_si128(t, ks[8]);
t = _mm_aesenc_si128(t, ks[9]);
t = _mm_aesenclast_si128(t, ks[10]);
_mm_storeu_si128((__m128i*)out, t);


@ -87,10 +87,9 @@ static inline __m128i increment_be(__m128i x)
static void encrypt_ctr128(private_aesni_ctr_t *this,
size_t len, u_char *in, u_char *out)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
__m128i t1, t2, t3, t4;
__m128i d1, d2, d3, d4;
__m128i state, b, *bi, *bo;
__m128i *ks, state, b, *bi, *bo;
u_int i, blocks, pblocks, rem;
state = _mm_load_si128((__m128i*)&this->state);
@ -100,17 +99,7 @@ static void encrypt_ctr128(private_aesni_ctr_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
k0 = this->key->schedule[0];
k1 = this->key->schedule[1];
k2 = this->key->schedule[2];
k3 = this->key->schedule[3];
k4 = this->key->schedule[4];
k5 = this->key->schedule[5];
k6 = this->key->schedule[6];
k7 = this->key->schedule[7];
k8 = this->key->schedule[8];
k9 = this->key->schedule[9];
k10 = this->key->schedule[10];
ks = this->key->schedule;
for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
{
@ -119,56 +108,56 @@ static void encrypt_ctr128(private_aesni_ctr_t *this,
d3 = _mm_loadu_si128(bi + i + 2);
d4 = _mm_loadu_si128(bi + i + 3);
t1 = _mm_xor_si128(state, k0);
t1 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
t2 = _mm_xor_si128(state, k0);
t2 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
t3 = _mm_xor_si128(state, k0);
t3 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
t4 = _mm_xor_si128(state, k0);
t4 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
t1 = _mm_aesenc_si128(t1, k1);
t2 = _mm_aesenc_si128(t2, k1);
t3 = _mm_aesenc_si128(t3, k1);
t4 = _mm_aesenc_si128(t4, k1);
t1 = _mm_aesenc_si128(t1, k2);
t2 = _mm_aesenc_si128(t2, k2);
t3 = _mm_aesenc_si128(t3, k2);
t4 = _mm_aesenc_si128(t4, k2);
t1 = _mm_aesenc_si128(t1, k3);
t2 = _mm_aesenc_si128(t2, k3);
t3 = _mm_aesenc_si128(t3, k3);
t4 = _mm_aesenc_si128(t4, k3);
t1 = _mm_aesenc_si128(t1, k4);
t2 = _mm_aesenc_si128(t2, k4);
t3 = _mm_aesenc_si128(t3, k4);
t4 = _mm_aesenc_si128(t4, k4);
t1 = _mm_aesenc_si128(t1, k5);
t2 = _mm_aesenc_si128(t2, k5);
t3 = _mm_aesenc_si128(t3, k5);
t4 = _mm_aesenc_si128(t4, k5);
t1 = _mm_aesenc_si128(t1, k6);
t2 = _mm_aesenc_si128(t2, k6);
t3 = _mm_aesenc_si128(t3, k6);
t4 = _mm_aesenc_si128(t4, k6);
t1 = _mm_aesenc_si128(t1, k7);
t2 = _mm_aesenc_si128(t2, k7);
t3 = _mm_aesenc_si128(t3, k7);
t4 = _mm_aesenc_si128(t4, k7);
t1 = _mm_aesenc_si128(t1, k8);
t2 = _mm_aesenc_si128(t2, k8);
t3 = _mm_aesenc_si128(t3, k8);
t4 = _mm_aesenc_si128(t4, k8);
t1 = _mm_aesenc_si128(t1, k9);
t2 = _mm_aesenc_si128(t2, k9);
t3 = _mm_aesenc_si128(t3, k9);
t4 = _mm_aesenc_si128(t4, k9);
t1 = _mm_aesenc_si128(t1, ks[1]);
t2 = _mm_aesenc_si128(t2, ks[1]);
t3 = _mm_aesenc_si128(t3, ks[1]);
t4 = _mm_aesenc_si128(t4, ks[1]);
t1 = _mm_aesenc_si128(t1, ks[2]);
t2 = _mm_aesenc_si128(t2, ks[2]);
t3 = _mm_aesenc_si128(t3, ks[2]);
t4 = _mm_aesenc_si128(t4, ks[2]);
t1 = _mm_aesenc_si128(t1, ks[3]);
t2 = _mm_aesenc_si128(t2, ks[3]);
t3 = _mm_aesenc_si128(t3, ks[3]);
t4 = _mm_aesenc_si128(t4, ks[3]);
t1 = _mm_aesenc_si128(t1, ks[4]);
t2 = _mm_aesenc_si128(t2, ks[4]);
t3 = _mm_aesenc_si128(t3, ks[4]);
t4 = _mm_aesenc_si128(t4, ks[4]);
t1 = _mm_aesenc_si128(t1, ks[5]);
t2 = _mm_aesenc_si128(t2, ks[5]);
t3 = _mm_aesenc_si128(t3, ks[5]);
t4 = _mm_aesenc_si128(t4, ks[5]);
t1 = _mm_aesenc_si128(t1, ks[6]);
t2 = _mm_aesenc_si128(t2, ks[6]);
t3 = _mm_aesenc_si128(t3, ks[6]);
t4 = _mm_aesenc_si128(t4, ks[6]);
t1 = _mm_aesenc_si128(t1, ks[7]);
t2 = _mm_aesenc_si128(t2, ks[7]);
t3 = _mm_aesenc_si128(t3, ks[7]);
t4 = _mm_aesenc_si128(t4, ks[7]);
t1 = _mm_aesenc_si128(t1, ks[8]);
t2 = _mm_aesenc_si128(t2, ks[8]);
t3 = _mm_aesenc_si128(t3, ks[8]);
t4 = _mm_aesenc_si128(t4, ks[8]);
t1 = _mm_aesenc_si128(t1, ks[9]);
t2 = _mm_aesenc_si128(t2, ks[9]);
t3 = _mm_aesenc_si128(t3, ks[9]);
t4 = _mm_aesenc_si128(t4, ks[9]);
t1 = _mm_aesenclast_si128(t1, k10);
t2 = _mm_aesenclast_si128(t2, k10);
t3 = _mm_aesenclast_si128(t3, k10);
t4 = _mm_aesenclast_si128(t4, k10);
t1 = _mm_aesenclast_si128(t1, ks[10]);
t2 = _mm_aesenclast_si128(t2, ks[10]);
t3 = _mm_aesenclast_si128(t3, ks[10]);
t4 = _mm_aesenclast_si128(t4, ks[10]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
t3 = _mm_xor_si128(t3, d3);
@ -183,20 +172,20 @@ static void encrypt_ctr128(private_aesni_ctr_t *this,
{
d1 = _mm_loadu_si128(bi + i);
t1 = _mm_xor_si128(state, k0);
t1 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
t1 = _mm_aesenc_si128(t1, k1);
t1 = _mm_aesenc_si128(t1, k2);
t1 = _mm_aesenc_si128(t1, k3);
t1 = _mm_aesenc_si128(t1, k4);
t1 = _mm_aesenc_si128(t1, k5);
t1 = _mm_aesenc_si128(t1, k6);
t1 = _mm_aesenc_si128(t1, k7);
t1 = _mm_aesenc_si128(t1, k8);
t1 = _mm_aesenc_si128(t1, k9);
t1 = _mm_aesenc_si128(t1, ks[1]);
t1 = _mm_aesenc_si128(t1, ks[2]);
t1 = _mm_aesenc_si128(t1, ks[3]);
t1 = _mm_aesenc_si128(t1, ks[4]);
t1 = _mm_aesenc_si128(t1, ks[5]);
t1 = _mm_aesenc_si128(t1, ks[6]);
t1 = _mm_aesenc_si128(t1, ks[7]);
t1 = _mm_aesenc_si128(t1, ks[8]);
t1 = _mm_aesenc_si128(t1, ks[9]);
t1 = _mm_aesenclast_si128(t1, k10);
t1 = _mm_aesenclast_si128(t1, ks[10]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
}
@ -207,19 +196,19 @@ static void encrypt_ctr128(private_aesni_ctr_t *this,
memcpy(&b, bi + blocks, rem);
d1 = _mm_loadu_si128(&b);
t1 = _mm_xor_si128(state, k0);
t1 = _mm_xor_si128(state, ks[0]);
t1 = _mm_aesenc_si128(t1, k1);
t1 = _mm_aesenc_si128(t1, k2);
t1 = _mm_aesenc_si128(t1, k3);
t1 = _mm_aesenc_si128(t1, k4);
t1 = _mm_aesenc_si128(t1, k5);
t1 = _mm_aesenc_si128(t1, k6);
t1 = _mm_aesenc_si128(t1, k7);
t1 = _mm_aesenc_si128(t1, k8);
t1 = _mm_aesenc_si128(t1, k9);
t1 = _mm_aesenc_si128(t1, ks[1]);
t1 = _mm_aesenc_si128(t1, ks[2]);
t1 = _mm_aesenc_si128(t1, ks[3]);
t1 = _mm_aesenc_si128(t1, ks[4]);
t1 = _mm_aesenc_si128(t1, ks[5]);
t1 = _mm_aesenc_si128(t1, ks[6]);
t1 = _mm_aesenc_si128(t1, ks[7]);
t1 = _mm_aesenc_si128(t1, ks[8]);
t1 = _mm_aesenc_si128(t1, ks[9]);
t1 = _mm_aesenclast_si128(t1, k10);
t1 = _mm_aesenclast_si128(t1, ks[10]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(&b, t1);
@ -233,10 +222,9 @@ static void encrypt_ctr128(private_aesni_ctr_t *this,
static void encrypt_ctr192(private_aesni_ctr_t *this,
size_t len, u_char *in, u_char *out)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
__m128i t1, t2, t3, t4;
__m128i d1, d2, d3, d4;
__m128i state, b, *bi, *bo;
__m128i *ks, state, b, *bi, *bo;
u_int i, blocks, pblocks, rem;
state = _mm_load_si128((__m128i*)&this->state);
@ -246,19 +234,7 @@ static void encrypt_ctr192(private_aesni_ctr_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
k0 = this->key->schedule[0];
k1 = this->key->schedule[1];
k2 = this->key->schedule[2];
k3 = this->key->schedule[3];
k4 = this->key->schedule[4];
k5 = this->key->schedule[5];
k6 = this->key->schedule[6];
k7 = this->key->schedule[7];
k8 = this->key->schedule[8];
k9 = this->key->schedule[9];
k10 = this->key->schedule[10];
k11 = this->key->schedule[11];
k12 = this->key->schedule[12];
ks = this->key->schedule;
for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
{
@ -267,64 +243,64 @@ static void encrypt_ctr192(private_aesni_ctr_t *this,
d3 = _mm_loadu_si128(bi + i + 2);
d4 = _mm_loadu_si128(bi + i + 3);
t1 = _mm_xor_si128(state, k0);
t1 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
t2 = _mm_xor_si128(state, k0);
t2 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
t3 = _mm_xor_si128(state, k0);
t3 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
t4 = _mm_xor_si128(state, k0);
t4 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
t1 = _mm_aesenc_si128(t1, k1);
t2 = _mm_aesenc_si128(t2, k1);
t3 = _mm_aesenc_si128(t3, k1);
t4 = _mm_aesenc_si128(t4, k1);
t1 = _mm_aesenc_si128(t1, k2);
t2 = _mm_aesenc_si128(t2, k2);
t3 = _mm_aesenc_si128(t3, k2);
t4 = _mm_aesenc_si128(t4, k2);
t1 = _mm_aesenc_si128(t1, k3);
t2 = _mm_aesenc_si128(t2, k3);
t3 = _mm_aesenc_si128(t3, k3);
t4 = _mm_aesenc_si128(t4, k3);
t1 = _mm_aesenc_si128(t1, k4);
t2 = _mm_aesenc_si128(t2, k4);
t3 = _mm_aesenc_si128(t3, k4);
t4 = _mm_aesenc_si128(t4, k4);
t1 = _mm_aesenc_si128(t1, k5);
t2 = _mm_aesenc_si128(t2, k5);
t3 = _mm_aesenc_si128(t3, k5);
t4 = _mm_aesenc_si128(t4, k5);
t1 = _mm_aesenc_si128(t1, k6);
t2 = _mm_aesenc_si128(t2, k6);
t3 = _mm_aesenc_si128(t3, k6);
t4 = _mm_aesenc_si128(t4, k6);
t1 = _mm_aesenc_si128(t1, k7);
t2 = _mm_aesenc_si128(t2, k7);
t3 = _mm_aesenc_si128(t3, k7);
t4 = _mm_aesenc_si128(t4, k7);
t1 = _mm_aesenc_si128(t1, k8);
t2 = _mm_aesenc_si128(t2, k8);
t3 = _mm_aesenc_si128(t3, k8);
t4 = _mm_aesenc_si128(t4, k8);
t1 = _mm_aesenc_si128(t1, k9);
t2 = _mm_aesenc_si128(t2, k9);
t3 = _mm_aesenc_si128(t3, k9);
t4 = _mm_aesenc_si128(t4, k9);
t1 = _mm_aesenc_si128(t1, k10);
t2 = _mm_aesenc_si128(t2, k10);
t3 = _mm_aesenc_si128(t3, k10);
t4 = _mm_aesenc_si128(t4, k10);
t1 = _mm_aesenc_si128(t1, k11);
t2 = _mm_aesenc_si128(t2, k11);
t3 = _mm_aesenc_si128(t3, k11);
t4 = _mm_aesenc_si128(t4, k11);
t1 = _mm_aesenc_si128(t1, ks[1]);
t2 = _mm_aesenc_si128(t2, ks[1]);
t3 = _mm_aesenc_si128(t3, ks[1]);
t4 = _mm_aesenc_si128(t4, ks[1]);
t1 = _mm_aesenc_si128(t1, ks[2]);
t2 = _mm_aesenc_si128(t2, ks[2]);
t3 = _mm_aesenc_si128(t3, ks[2]);
t4 = _mm_aesenc_si128(t4, ks[2]);
t1 = _mm_aesenc_si128(t1, ks[3]);
t2 = _mm_aesenc_si128(t2, ks[3]);
t3 = _mm_aesenc_si128(t3, ks[3]);
t4 = _mm_aesenc_si128(t4, ks[3]);
t1 = _mm_aesenc_si128(t1, ks[4]);
t2 = _mm_aesenc_si128(t2, ks[4]);
t3 = _mm_aesenc_si128(t3, ks[4]);
t4 = _mm_aesenc_si128(t4, ks[4]);
t1 = _mm_aesenc_si128(t1, ks[5]);
t2 = _mm_aesenc_si128(t2, ks[5]);
t3 = _mm_aesenc_si128(t3, ks[5]);
t4 = _mm_aesenc_si128(t4, ks[5]);
t1 = _mm_aesenc_si128(t1, ks[6]);
t2 = _mm_aesenc_si128(t2, ks[6]);
t3 = _mm_aesenc_si128(t3, ks[6]);
t4 = _mm_aesenc_si128(t4, ks[6]);
t1 = _mm_aesenc_si128(t1, ks[7]);
t2 = _mm_aesenc_si128(t2, ks[7]);
t3 = _mm_aesenc_si128(t3, ks[7]);
t4 = _mm_aesenc_si128(t4, ks[7]);
t1 = _mm_aesenc_si128(t1, ks[8]);
t2 = _mm_aesenc_si128(t2, ks[8]);
t3 = _mm_aesenc_si128(t3, ks[8]);
t4 = _mm_aesenc_si128(t4, ks[8]);
t1 = _mm_aesenc_si128(t1, ks[9]);
t2 = _mm_aesenc_si128(t2, ks[9]);
t3 = _mm_aesenc_si128(t3, ks[9]);
t4 = _mm_aesenc_si128(t4, ks[9]);
t1 = _mm_aesenc_si128(t1, ks[10]);
t2 = _mm_aesenc_si128(t2, ks[10]);
t3 = _mm_aesenc_si128(t3, ks[10]);
t4 = _mm_aesenc_si128(t4, ks[10]);
t1 = _mm_aesenc_si128(t1, ks[11]);
t2 = _mm_aesenc_si128(t2, ks[11]);
t3 = _mm_aesenc_si128(t3, ks[11]);
t4 = _mm_aesenc_si128(t4, ks[11]);
t1 = _mm_aesenclast_si128(t1, k12);
t2 = _mm_aesenclast_si128(t2, k12);
t3 = _mm_aesenclast_si128(t3, k12);
t4 = _mm_aesenclast_si128(t4, k12);
t1 = _mm_aesenclast_si128(t1, ks[12]);
t2 = _mm_aesenclast_si128(t2, ks[12]);
t3 = _mm_aesenclast_si128(t3, ks[12]);
t4 = _mm_aesenclast_si128(t4, ks[12]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
t3 = _mm_xor_si128(t3, d3);
@ -339,22 +315,22 @@ static void encrypt_ctr192(private_aesni_ctr_t *this,
{
d1 = _mm_loadu_si128(bi + i);
t1 = _mm_xor_si128(state, k0);
t1 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
t1 = _mm_aesenc_si128(t1, k1);
t1 = _mm_aesenc_si128(t1, k2);
t1 = _mm_aesenc_si128(t1, k3);
t1 = _mm_aesenc_si128(t1, k4);
t1 = _mm_aesenc_si128(t1, k5);
t1 = _mm_aesenc_si128(t1, k6);
t1 = _mm_aesenc_si128(t1, k7);
t1 = _mm_aesenc_si128(t1, k8);
t1 = _mm_aesenc_si128(t1, k9);
t1 = _mm_aesenc_si128(t1, k10);
t1 = _mm_aesenc_si128(t1, k11);
t1 = _mm_aesenc_si128(t1, ks[1]);
t1 = _mm_aesenc_si128(t1, ks[2]);
t1 = _mm_aesenc_si128(t1, ks[3]);
t1 = _mm_aesenc_si128(t1, ks[4]);
t1 = _mm_aesenc_si128(t1, ks[5]);
t1 = _mm_aesenc_si128(t1, ks[6]);
t1 = _mm_aesenc_si128(t1, ks[7]);
t1 = _mm_aesenc_si128(t1, ks[8]);
t1 = _mm_aesenc_si128(t1, ks[9]);
t1 = _mm_aesenc_si128(t1, ks[10]);
t1 = _mm_aesenc_si128(t1, ks[11]);
t1 = _mm_aesenclast_si128(t1, k12);
t1 = _mm_aesenclast_si128(t1, ks[12]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
}
@@ -365,21 +341,21 @@ static void encrypt_ctr192(private_aesni_ctr_t *this,
memcpy(&b, bi + blocks, rem);
d1 = _mm_loadu_si128(&b);
t1 = _mm_xor_si128(state, k0);
t1 = _mm_xor_si128(state, ks[0]);
t1 = _mm_aesenc_si128(t1, k1);
t1 = _mm_aesenc_si128(t1, k2);
t1 = _mm_aesenc_si128(t1, k3);
t1 = _mm_aesenc_si128(t1, k4);
t1 = _mm_aesenc_si128(t1, k5);
t1 = _mm_aesenc_si128(t1, k6);
t1 = _mm_aesenc_si128(t1, k7);
t1 = _mm_aesenc_si128(t1, k8);
t1 = _mm_aesenc_si128(t1, k9);
t1 = _mm_aesenc_si128(t1, k10);
t1 = _mm_aesenc_si128(t1, k11);
t1 = _mm_aesenc_si128(t1, ks[1]);
t1 = _mm_aesenc_si128(t1, ks[2]);
t1 = _mm_aesenc_si128(t1, ks[3]);
t1 = _mm_aesenc_si128(t1, ks[4]);
t1 = _mm_aesenc_si128(t1, ks[5]);
t1 = _mm_aesenc_si128(t1, ks[6]);
t1 = _mm_aesenc_si128(t1, ks[7]);
t1 = _mm_aesenc_si128(t1, ks[8]);
t1 = _mm_aesenc_si128(t1, ks[9]);
t1 = _mm_aesenc_si128(t1, ks[10]);
t1 = _mm_aesenc_si128(t1, ks[11]);
t1 = _mm_aesenclast_si128(t1, k12);
t1 = _mm_aesenclast_si128(t1, ks[12]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(&b, t1);
@@ -393,10 +369,9 @@ static void encrypt_ctr192(private_aesni_ctr_t *this,
static void encrypt_ctr256(private_aesni_ctr_t *this,
size_t len, u_char *in, u_char *out)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
__m128i t1, t2, t3, t4;
__m128i d1, d2, d3, d4;
__m128i state, b, *bi, *bo;
__m128i *ks, state, b, *bi, *bo;
u_int i, blocks, pblocks, rem;
state = _mm_load_si128((__m128i*)&this->state);
@@ -406,21 +381,7 @@ static void encrypt_ctr256(private_aesni_ctr_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;
k0 = this->key->schedule[0];
k1 = this->key->schedule[1];
k2 = this->key->schedule[2];
k3 = this->key->schedule[3];
k4 = this->key->schedule[4];
k5 = this->key->schedule[5];
k6 = this->key->schedule[6];
k7 = this->key->schedule[7];
k8 = this->key->schedule[8];
k9 = this->key->schedule[9];
k10 = this->key->schedule[10];
k11 = this->key->schedule[11];
k12 = this->key->schedule[12];
k13 = this->key->schedule[13];
k14 = this->key->schedule[14];
ks = this->key->schedule;
for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
{
@@ -429,72 +390,72 @@ static void encrypt_ctr256(private_aesni_ctr_t *this,
d3 = _mm_loadu_si128(bi + i + 2);
d4 = _mm_loadu_si128(bi + i + 3);
t1 = _mm_xor_si128(state, k0);
t1 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
t2 = _mm_xor_si128(state, k0);
t2 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
t3 = _mm_xor_si128(state, k0);
t3 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
t4 = _mm_xor_si128(state, k0);
t4 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
t1 = _mm_aesenc_si128(t1, k1);
t2 = _mm_aesenc_si128(t2, k1);
t3 = _mm_aesenc_si128(t3, k1);
t4 = _mm_aesenc_si128(t4, k1);
t1 = _mm_aesenc_si128(t1, k2);
t2 = _mm_aesenc_si128(t2, k2);
t3 = _mm_aesenc_si128(t3, k2);
t4 = _mm_aesenc_si128(t4, k2);
t1 = _mm_aesenc_si128(t1, k3);
t2 = _mm_aesenc_si128(t2, k3);
t3 = _mm_aesenc_si128(t3, k3);
t4 = _mm_aesenc_si128(t4, k3);
t1 = _mm_aesenc_si128(t1, k4);
t2 = _mm_aesenc_si128(t2, k4);
t3 = _mm_aesenc_si128(t3, k4);
t4 = _mm_aesenc_si128(t4, k4);
t1 = _mm_aesenc_si128(t1, k5);
t2 = _mm_aesenc_si128(t2, k5);
t3 = _mm_aesenc_si128(t3, k5);
t4 = _mm_aesenc_si128(t4, k5);
t1 = _mm_aesenc_si128(t1, k6);
t2 = _mm_aesenc_si128(t2, k6);
t3 = _mm_aesenc_si128(t3, k6);
t4 = _mm_aesenc_si128(t4, k6);
t1 = _mm_aesenc_si128(t1, k7);
t2 = _mm_aesenc_si128(t2, k7);
t3 = _mm_aesenc_si128(t3, k7);
t4 = _mm_aesenc_si128(t4, k7);
t1 = _mm_aesenc_si128(t1, k8);
t2 = _mm_aesenc_si128(t2, k8);
t3 = _mm_aesenc_si128(t3, k8);
t4 = _mm_aesenc_si128(t4, k8);
t1 = _mm_aesenc_si128(t1, k9);
t2 = _mm_aesenc_si128(t2, k9);
t3 = _mm_aesenc_si128(t3, k9);
t4 = _mm_aesenc_si128(t4, k9);
t1 = _mm_aesenc_si128(t1, k10);
t2 = _mm_aesenc_si128(t2, k10);
t3 = _mm_aesenc_si128(t3, k10);
t4 = _mm_aesenc_si128(t4, k10);
t1 = _mm_aesenc_si128(t1, k11);
t2 = _mm_aesenc_si128(t2, k11);
t3 = _mm_aesenc_si128(t3, k11);
t4 = _mm_aesenc_si128(t4, k11);
t1 = _mm_aesenc_si128(t1, k12);
t2 = _mm_aesenc_si128(t2, k12);
t3 = _mm_aesenc_si128(t3, k12);
t4 = _mm_aesenc_si128(t4, k12);
t1 = _mm_aesenc_si128(t1, k13);
t2 = _mm_aesenc_si128(t2, k13);
t3 = _mm_aesenc_si128(t3, k13);
t4 = _mm_aesenc_si128(t4, k13);
t1 = _mm_aesenc_si128(t1, ks[1]);
t2 = _mm_aesenc_si128(t2, ks[1]);
t3 = _mm_aesenc_si128(t3, ks[1]);
t4 = _mm_aesenc_si128(t4, ks[1]);
t1 = _mm_aesenc_si128(t1, ks[2]);
t2 = _mm_aesenc_si128(t2, ks[2]);
t3 = _mm_aesenc_si128(t3, ks[2]);
t4 = _mm_aesenc_si128(t4, ks[2]);
t1 = _mm_aesenc_si128(t1, ks[3]);
t2 = _mm_aesenc_si128(t2, ks[3]);
t3 = _mm_aesenc_si128(t3, ks[3]);
t4 = _mm_aesenc_si128(t4, ks[3]);
t1 = _mm_aesenc_si128(t1, ks[4]);
t2 = _mm_aesenc_si128(t2, ks[4]);
t3 = _mm_aesenc_si128(t3, ks[4]);
t4 = _mm_aesenc_si128(t4, ks[4]);
t1 = _mm_aesenc_si128(t1, ks[5]);
t2 = _mm_aesenc_si128(t2, ks[5]);
t3 = _mm_aesenc_si128(t3, ks[5]);
t4 = _mm_aesenc_si128(t4, ks[5]);
t1 = _mm_aesenc_si128(t1, ks[6]);
t2 = _mm_aesenc_si128(t2, ks[6]);
t3 = _mm_aesenc_si128(t3, ks[6]);
t4 = _mm_aesenc_si128(t4, ks[6]);
t1 = _mm_aesenc_si128(t1, ks[7]);
t2 = _mm_aesenc_si128(t2, ks[7]);
t3 = _mm_aesenc_si128(t3, ks[7]);
t4 = _mm_aesenc_si128(t4, ks[7]);
t1 = _mm_aesenc_si128(t1, ks[8]);
t2 = _mm_aesenc_si128(t2, ks[8]);
t3 = _mm_aesenc_si128(t3, ks[8]);
t4 = _mm_aesenc_si128(t4, ks[8]);
t1 = _mm_aesenc_si128(t1, ks[9]);
t2 = _mm_aesenc_si128(t2, ks[9]);
t3 = _mm_aesenc_si128(t3, ks[9]);
t4 = _mm_aesenc_si128(t4, ks[9]);
t1 = _mm_aesenc_si128(t1, ks[10]);
t2 = _mm_aesenc_si128(t2, ks[10]);
t3 = _mm_aesenc_si128(t3, ks[10]);
t4 = _mm_aesenc_si128(t4, ks[10]);
t1 = _mm_aesenc_si128(t1, ks[11]);
t2 = _mm_aesenc_si128(t2, ks[11]);
t3 = _mm_aesenc_si128(t3, ks[11]);
t4 = _mm_aesenc_si128(t4, ks[11]);
t1 = _mm_aesenc_si128(t1, ks[12]);
t2 = _mm_aesenc_si128(t2, ks[12]);
t3 = _mm_aesenc_si128(t3, ks[12]);
t4 = _mm_aesenc_si128(t4, ks[12]);
t1 = _mm_aesenc_si128(t1, ks[13]);
t2 = _mm_aesenc_si128(t2, ks[13]);
t3 = _mm_aesenc_si128(t3, ks[13]);
t4 = _mm_aesenc_si128(t4, ks[13]);
t1 = _mm_aesenclast_si128(t1, k14);
t2 = _mm_aesenclast_si128(t2, k14);
t3 = _mm_aesenclast_si128(t3, k14);
t4 = _mm_aesenclast_si128(t4, k14);
t1 = _mm_aesenclast_si128(t1, ks[14]);
t2 = _mm_aesenclast_si128(t2, ks[14]);
t3 = _mm_aesenclast_si128(t3, ks[14]);
t4 = _mm_aesenclast_si128(t4, ks[14]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
t3 = _mm_xor_si128(t3, d3);
@@ -509,24 +470,24 @@ static void encrypt_ctr256(private_aesni_ctr_t *this,
{
d1 = _mm_loadu_si128(bi + i);
t1 = _mm_xor_si128(state, k0);
t1 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
t1 = _mm_aesenc_si128(t1, k1);
t1 = _mm_aesenc_si128(t1, k2);
t1 = _mm_aesenc_si128(t1, k3);
t1 = _mm_aesenc_si128(t1, k4);
t1 = _mm_aesenc_si128(t1, k5);
t1 = _mm_aesenc_si128(t1, k6);
t1 = _mm_aesenc_si128(t1, k7);
t1 = _mm_aesenc_si128(t1, k8);
t1 = _mm_aesenc_si128(t1, k9);
t1 = _mm_aesenc_si128(t1, k10);
t1 = _mm_aesenc_si128(t1, k11);
t1 = _mm_aesenc_si128(t1, k12);
t1 = _mm_aesenc_si128(t1, k13);
t1 = _mm_aesenc_si128(t1, ks[1]);
t1 = _mm_aesenc_si128(t1, ks[2]);
t1 = _mm_aesenc_si128(t1, ks[3]);
t1 = _mm_aesenc_si128(t1, ks[4]);
t1 = _mm_aesenc_si128(t1, ks[5]);
t1 = _mm_aesenc_si128(t1, ks[6]);
t1 = _mm_aesenc_si128(t1, ks[7]);
t1 = _mm_aesenc_si128(t1, ks[8]);
t1 = _mm_aesenc_si128(t1, ks[9]);
t1 = _mm_aesenc_si128(t1, ks[10]);
t1 = _mm_aesenc_si128(t1, ks[11]);
t1 = _mm_aesenc_si128(t1, ks[12]);
t1 = _mm_aesenc_si128(t1, ks[13]);
t1 = _mm_aesenclast_si128(t1, k14);
t1 = _mm_aesenclast_si128(t1, ks[14]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
}
@@ -537,23 +498,23 @@ static void encrypt_ctr256(private_aesni_ctr_t *this,
memcpy(&b, bi + blocks, rem);
d1 = _mm_loadu_si128(&b);
t1 = _mm_xor_si128(state, k0);
t1 = _mm_xor_si128(state, ks[0]);
t1 = _mm_aesenc_si128(t1, k1);
t1 = _mm_aesenc_si128(t1, k2);
t1 = _mm_aesenc_si128(t1, k3);
t1 = _mm_aesenc_si128(t1, k4);
t1 = _mm_aesenc_si128(t1, k5);
t1 = _mm_aesenc_si128(t1, k6);
t1 = _mm_aesenc_si128(t1, k7);
t1 = _mm_aesenc_si128(t1, k8);
t1 = _mm_aesenc_si128(t1, k9);
t1 = _mm_aesenc_si128(t1, k10);
t1 = _mm_aesenc_si128(t1, k11);
t1 = _mm_aesenc_si128(t1, k12);
t1 = _mm_aesenc_si128(t1, k13);
t1 = _mm_aesenc_si128(t1, ks[1]);
t1 = _mm_aesenc_si128(t1, ks[2]);
t1 = _mm_aesenc_si128(t1, ks[3]);
t1 = _mm_aesenc_si128(t1, ks[4]);
t1 = _mm_aesenc_si128(t1, ks[5]);
t1 = _mm_aesenc_si128(t1, ks[6]);
t1 = _mm_aesenc_si128(t1, ks[7]);
t1 = _mm_aesenc_si128(t1, ks[8]);
t1 = _mm_aesenc_si128(t1, ks[9]);
t1 = _mm_aesenc_si128(t1, ks[10]);
t1 = _mm_aesenc_si128(t1, ks[11]);
t1 = _mm_aesenc_si128(t1, ks[12]);
t1 = _mm_aesenc_si128(t1, ks[13]);
t1 = _mm_aesenclast_si128(t1, k14);
t1 = _mm_aesenclast_si128(t1, ks[14]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(&b, t1);

File diff suppressed because it is too large

@@ -72,8 +72,7 @@ struct private_aesni_mac_t {
METHOD(mac_t, get_mac, bool,
private_aesni_mac_t *this, chunk_t data, u_int8_t *out)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
__m128i e, *bi;
__m128i *ks, e, *bi;
u_int blocks, rem, i;
if (!this->k1)
@@ -81,17 +80,7 @@ METHOD(mac_t, get_mac, bool,
return FALSE;
}
k0 = this->k1->schedule[0];
k1 = this->k1->schedule[1];
k2 = this->k1->schedule[2];
k3 = this->k1->schedule[3];
k4 = this->k1->schedule[4];
k5 = this->k1->schedule[5];
k6 = this->k1->schedule[6];
k7 = this->k1->schedule[7];
k8 = this->k1->schedule[8];
k9 = this->k1->schedule[9];
k10 = this->k1->schedule[10];
ks = this->k1->schedule;
e = this->e;
@@ -114,17 +103,17 @@ METHOD(mac_t, get_mac, bool,
e = _mm_xor_si128(e, _mm_loadu_si128((__m128i*)this->rem));
e = _mm_xor_si128(e, k0);
e = _mm_aesenc_si128(e, k1);
e = _mm_aesenc_si128(e, k2);
e = _mm_aesenc_si128(e, k3);
e = _mm_aesenc_si128(e, k4);
e = _mm_aesenc_si128(e, k5);
e = _mm_aesenc_si128(e, k6);
e = _mm_aesenc_si128(e, k7);
e = _mm_aesenc_si128(e, k8);
e = _mm_aesenc_si128(e, k9);
e = _mm_aesenclast_si128(e, k10);
e = _mm_xor_si128(e, ks[0]);
e = _mm_aesenc_si128(e, ks[1]);
e = _mm_aesenc_si128(e, ks[2]);
e = _mm_aesenc_si128(e, ks[3]);
e = _mm_aesenc_si128(e, ks[4]);
e = _mm_aesenc_si128(e, ks[5]);
e = _mm_aesenc_si128(e, ks[6]);
e = _mm_aesenc_si128(e, ks[7]);
e = _mm_aesenc_si128(e, ks[8]);
e = _mm_aesenc_si128(e, ks[9]);
e = _mm_aesenclast_si128(e, ks[10]);
bi = (__m128i*)data.ptr;
rem = data.len % AES_BLOCK_SIZE;
@@ -140,17 +129,17 @@ METHOD(mac_t, get_mac, bool,
{
e = _mm_xor_si128(e, _mm_loadu_si128(bi + i));
e = _mm_xor_si128(e, k0);
e = _mm_aesenc_si128(e, k1);
e = _mm_aesenc_si128(e, k2);
e = _mm_aesenc_si128(e, k3);
e = _mm_aesenc_si128(e, k4);
e = _mm_aesenc_si128(e, k5);
e = _mm_aesenc_si128(e, k6);
e = _mm_aesenc_si128(e, k7);
e = _mm_aesenc_si128(e, k8);
e = _mm_aesenc_si128(e, k9);
e = _mm_aesenclast_si128(e, k10);
e = _mm_xor_si128(e, ks[0]);
e = _mm_aesenc_si128(e, ks[1]);
e = _mm_aesenc_si128(e, ks[2]);
e = _mm_aesenc_si128(e, ks[3]);
e = _mm_aesenc_si128(e, ks[4]);
e = _mm_aesenc_si128(e, ks[5]);
e = _mm_aesenc_si128(e, ks[6]);
e = _mm_aesenc_si128(e, ks[7]);
e = _mm_aesenc_si128(e, ks[8]);
e = _mm_aesenc_si128(e, ks[9]);
e = _mm_aesenclast_si128(e, ks[10]);
}
/* store remaining bytes of block M[n] */
@@ -196,17 +185,17 @@ METHOD(mac_t, get_mac, bool,
}
e = _mm_xor_si128(e, _mm_loadu_si128((__m128i*)this->rem));
e = _mm_xor_si128(e, k0);
e = _mm_aesenc_si128(e, k1);
e = _mm_aesenc_si128(e, k2);
e = _mm_aesenc_si128(e, k3);
e = _mm_aesenc_si128(e, k4);
e = _mm_aesenc_si128(e, k5);
e = _mm_aesenc_si128(e, k6);
e = _mm_aesenc_si128(e, k7);
e = _mm_aesenc_si128(e, k8);
e = _mm_aesenc_si128(e, k9);
e = _mm_aesenclast_si128(e, k10);
e = _mm_xor_si128(e, ks[0]);
e = _mm_aesenc_si128(e, ks[1]);
e = _mm_aesenc_si128(e, ks[2]);
e = _mm_aesenc_si128(e, ks[3]);
e = _mm_aesenc_si128(e, ks[4]);
e = _mm_aesenc_si128(e, ks[5]);
e = _mm_aesenc_si128(e, ks[6]);
e = _mm_aesenc_si128(e, ks[7]);
e = _mm_aesenc_si128(e, ks[8]);
e = _mm_aesenc_si128(e, ks[9]);
e = _mm_aesenclast_si128(e, ks[10]);
_mm_storeu_si128((__m128i*)out, e);
/* (2) Define E[0] = 0x00000000000000000000000000000000 */