aesni: Avoid loading AES/GHASH round keys into local variables
The performance impact is not measurable, as the compiler loads these values into xmm registers in unrolled loops anyway. However, we avoid spilling these sensitive keys onto the stack, which happens for larger key schedules, where the register count is insufficient. If that key material is never on the stack, we can avoid wiping it explicitly after crypto operations.
parent 93f0080265
commit 37794878cc
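The change is purely mechanical and is easiest to see in isolation. The sketch below is illustrative only and not part of the patch (the helper name and the round loop are invented for brevity; the patched code unrolls the rounds explicitly): instead of copying every round key into its own local variable, the schedule is indexed through a pointer, so no spilled stack copies of the key material are created that would have to be wiped afterwards.

	#include <wmmintrin.h>	/* AES-NI intrinsics, build with -maes */

	/* Hypothetical helper showing the pattern after this commit: round keys
	 * are read through the schedule pointer (ks[n]) rather than from k0..k10
	 * locals. The compiler still keeps them in xmm registers in unrolled
	 * loops, but never spills the whole schedule to stack slots. */
	static __m128i aes128_encrypt_block(const __m128i *ks, __m128i b)
	{
		int round;

		b = _mm_xor_si128(b, ks[0]);			/* key whitening */
		for (round = 1; round < 10; round++)
		{
			b = _mm_aesenc_si128(b, ks[round]);	/* rounds 1-9 */
		}
		return _mm_aesenclast_si128(b, ks[10]);	/* final round */
	}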
@@ -70,22 +70,10 @@ struct private_aesni_cbc_t {
 static void encrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
 						   u_char *iv, u_char *out)
 {
-	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
-	__m128i t, fb, *bi, *bo;
+	__m128i *ks, t, fb, *bi, *bo;
 	int i;
 
-	k0 = key->schedule[0];
-	k1 = key->schedule[1];
-	k2 = key->schedule[2];
-	k3 = key->schedule[3];
-	k4 = key->schedule[4];
-	k5 = key->schedule[5];
-	k6 = key->schedule[6];
-	k7 = key->schedule[7];
-	k8 = key->schedule[8];
-	k9 = key->schedule[9];
-	k10 = key->schedule[10];
-
+	ks = key->schedule;
 	bi = (__m128i*)in;
 	bo = (__m128i*)out;
 
@@ -94,19 +82,19 @@ static void encrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
 	{
 		t = _mm_loadu_si128(bi + i);
 		fb = _mm_xor_si128(t, fb);
-		fb = _mm_xor_si128(fb, k0);
+		fb = _mm_xor_si128(fb, ks[0]);
 
-		fb = _mm_aesenc_si128(fb, k1);
-		fb = _mm_aesenc_si128(fb, k2);
-		fb = _mm_aesenc_si128(fb, k3);
-		fb = _mm_aesenc_si128(fb, k4);
-		fb = _mm_aesenc_si128(fb, k5);
-		fb = _mm_aesenc_si128(fb, k6);
-		fb = _mm_aesenc_si128(fb, k7);
-		fb = _mm_aesenc_si128(fb, k8);
-		fb = _mm_aesenc_si128(fb, k9);
+		fb = _mm_aesenc_si128(fb, ks[1]);
+		fb = _mm_aesenc_si128(fb, ks[2]);
+		fb = _mm_aesenc_si128(fb, ks[3]);
+		fb = _mm_aesenc_si128(fb, ks[4]);
+		fb = _mm_aesenc_si128(fb, ks[5]);
+		fb = _mm_aesenc_si128(fb, ks[6]);
+		fb = _mm_aesenc_si128(fb, ks[7]);
+		fb = _mm_aesenc_si128(fb, ks[8]);
+		fb = _mm_aesenc_si128(fb, ks[9]);
 
-		fb = _mm_aesenclast_si128(fb, k10);
+		fb = _mm_aesenclast_si128(fb, ks[10]);
 		_mm_storeu_si128(bo + i, fb);
 	}
 }
@@ -117,24 +105,12 @@ static void encrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
 static void decrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
 						   u_char *iv, u_char *out)
 {
-	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
-	__m128i last, *bi, *bo;
+	__m128i *ks, last, *bi, *bo;
 	__m128i t1, t2, t3, t4;
 	__m128i f1, f2, f3, f4;
 	u_int i, pblocks;
 
-	k0 = key->schedule[0];
-	k1 = key->schedule[1];
-	k2 = key->schedule[2];
-	k3 = key->schedule[3];
-	k4 = key->schedule[4];
-	k5 = key->schedule[5];
-	k6 = key->schedule[6];
-	k7 = key->schedule[7];
-	k8 = key->schedule[8];
-	k9 = key->schedule[9];
-	k10 = key->schedule[10];
-
+	ks = key->schedule;
 	bi = (__m128i*)in;
 	bo = (__m128i*)out;
 	pblocks = blocks - (blocks % CBC_DECRYPT_PARALLELISM);
@@ -153,52 +129,52 @@ static void decrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
 		f4 = t3;
 		last = t4;
 
-		t1 = _mm_xor_si128(t1, k0);
-		t2 = _mm_xor_si128(t2, k0);
-		t3 = _mm_xor_si128(t3, k0);
-		t4 = _mm_xor_si128(t4, k0);
+		t1 = _mm_xor_si128(t1, ks[0]);
+		t2 = _mm_xor_si128(t2, ks[0]);
+		t3 = _mm_xor_si128(t3, ks[0]);
+		t4 = _mm_xor_si128(t4, ks[0]);
 
-		t1 = _mm_aesdec_si128(t1, k1);
-		t2 = _mm_aesdec_si128(t2, k1);
-		t3 = _mm_aesdec_si128(t3, k1);
-		t4 = _mm_aesdec_si128(t4, k1);
-		t1 = _mm_aesdec_si128(t1, k2);
-		t2 = _mm_aesdec_si128(t2, k2);
-		t3 = _mm_aesdec_si128(t3, k2);
-		t4 = _mm_aesdec_si128(t4, k2);
-		t1 = _mm_aesdec_si128(t1, k3);
-		t2 = _mm_aesdec_si128(t2, k3);
-		t3 = _mm_aesdec_si128(t3, k3);
-		t4 = _mm_aesdec_si128(t4, k3);
-		t1 = _mm_aesdec_si128(t1, k4);
-		t2 = _mm_aesdec_si128(t2, k4);
-		t3 = _mm_aesdec_si128(t3, k4);
-		t4 = _mm_aesdec_si128(t4, k4);
-		t1 = _mm_aesdec_si128(t1, k5);
-		t2 = _mm_aesdec_si128(t2, k5);
-		t3 = _mm_aesdec_si128(t3, k5);
-		t4 = _mm_aesdec_si128(t4, k5);
-		t1 = _mm_aesdec_si128(t1, k6);
-		t2 = _mm_aesdec_si128(t2, k6);
-		t3 = _mm_aesdec_si128(t3, k6);
-		t4 = _mm_aesdec_si128(t4, k6);
-		t1 = _mm_aesdec_si128(t1, k7);
-		t2 = _mm_aesdec_si128(t2, k7);
-		t3 = _mm_aesdec_si128(t3, k7);
-		t4 = _mm_aesdec_si128(t4, k7);
-		t1 = _mm_aesdec_si128(t1, k8);
-		t2 = _mm_aesdec_si128(t2, k8);
-		t3 = _mm_aesdec_si128(t3, k8);
-		t4 = _mm_aesdec_si128(t4, k8);
-		t1 = _mm_aesdec_si128(t1, k9);
-		t2 = _mm_aesdec_si128(t2, k9);
-		t3 = _mm_aesdec_si128(t3, k9);
-		t4 = _mm_aesdec_si128(t4, k9);
+		t1 = _mm_aesdec_si128(t1, ks[1]);
+		t2 = _mm_aesdec_si128(t2, ks[1]);
+		t3 = _mm_aesdec_si128(t3, ks[1]);
+		t4 = _mm_aesdec_si128(t4, ks[1]);
+		t1 = _mm_aesdec_si128(t1, ks[2]);
+		t2 = _mm_aesdec_si128(t2, ks[2]);
+		t3 = _mm_aesdec_si128(t3, ks[2]);
+		t4 = _mm_aesdec_si128(t4, ks[2]);
+		t1 = _mm_aesdec_si128(t1, ks[3]);
+		t2 = _mm_aesdec_si128(t2, ks[3]);
+		t3 = _mm_aesdec_si128(t3, ks[3]);
+		t4 = _mm_aesdec_si128(t4, ks[3]);
+		t1 = _mm_aesdec_si128(t1, ks[4]);
+		t2 = _mm_aesdec_si128(t2, ks[4]);
+		t3 = _mm_aesdec_si128(t3, ks[4]);
+		t4 = _mm_aesdec_si128(t4, ks[4]);
+		t1 = _mm_aesdec_si128(t1, ks[5]);
+		t2 = _mm_aesdec_si128(t2, ks[5]);
+		t3 = _mm_aesdec_si128(t3, ks[5]);
+		t4 = _mm_aesdec_si128(t4, ks[5]);
+		t1 = _mm_aesdec_si128(t1, ks[6]);
+		t2 = _mm_aesdec_si128(t2, ks[6]);
+		t3 = _mm_aesdec_si128(t3, ks[6]);
+		t4 = _mm_aesdec_si128(t4, ks[6]);
+		t1 = _mm_aesdec_si128(t1, ks[7]);
+		t2 = _mm_aesdec_si128(t2, ks[7]);
+		t3 = _mm_aesdec_si128(t3, ks[7]);
+		t4 = _mm_aesdec_si128(t4, ks[7]);
+		t1 = _mm_aesdec_si128(t1, ks[8]);
+		t2 = _mm_aesdec_si128(t2, ks[8]);
+		t3 = _mm_aesdec_si128(t3, ks[8]);
+		t4 = _mm_aesdec_si128(t4, ks[8]);
+		t1 = _mm_aesdec_si128(t1, ks[9]);
+		t2 = _mm_aesdec_si128(t2, ks[9]);
+		t3 = _mm_aesdec_si128(t3, ks[9]);
+		t4 = _mm_aesdec_si128(t4, ks[9]);
 
-		t1 = _mm_aesdeclast_si128(t1, k10);
-		t2 = _mm_aesdeclast_si128(t2, k10);
-		t3 = _mm_aesdeclast_si128(t3, k10);
-		t4 = _mm_aesdeclast_si128(t4, k10);
+		t1 = _mm_aesdeclast_si128(t1, ks[10]);
+		t2 = _mm_aesdeclast_si128(t2, ks[10]);
+		t3 = _mm_aesdeclast_si128(t3, ks[10]);
+		t4 = _mm_aesdeclast_si128(t4, ks[10]);
 		t1 = _mm_xor_si128(t1, f1);
 		t2 = _mm_xor_si128(t2, f2);
 		t3 = _mm_xor_si128(t3, f3);
@@ -213,19 +189,19 @@ static void decrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
 	for (i = pblocks; i < blocks; i++)
 	{
 		last = _mm_loadu_si128(bi + i);
-		t1 = _mm_xor_si128(last, k0);
+		t1 = _mm_xor_si128(last, ks[0]);
 
-		t1 = _mm_aesdec_si128(t1, k1);
-		t1 = _mm_aesdec_si128(t1, k2);
-		t1 = _mm_aesdec_si128(t1, k3);
-		t1 = _mm_aesdec_si128(t1, k4);
-		t1 = _mm_aesdec_si128(t1, k5);
-		t1 = _mm_aesdec_si128(t1, k6);
-		t1 = _mm_aesdec_si128(t1, k7);
-		t1 = _mm_aesdec_si128(t1, k8);
-		t1 = _mm_aesdec_si128(t1, k9);
+		t1 = _mm_aesdec_si128(t1, ks[1]);
+		t1 = _mm_aesdec_si128(t1, ks[2]);
+		t1 = _mm_aesdec_si128(t1, ks[3]);
+		t1 = _mm_aesdec_si128(t1, ks[4]);
+		t1 = _mm_aesdec_si128(t1, ks[5]);
+		t1 = _mm_aesdec_si128(t1, ks[6]);
+		t1 = _mm_aesdec_si128(t1, ks[7]);
+		t1 = _mm_aesdec_si128(t1, ks[8]);
+		t1 = _mm_aesdec_si128(t1, ks[9]);
 
-		t1 = _mm_aesdeclast_si128(t1, k10);
+		t1 = _mm_aesdeclast_si128(t1, ks[10]);
 		t1 = _mm_xor_si128(t1, f1);
 		_mm_storeu_si128(bo + i, t1);
 		f1 = last;
@@ -238,24 +214,10 @@ static void decrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
 static void encrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
 						   u_char *iv, u_char *out)
 {
-	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
-	__m128i t, fb, *bi, *bo;
+	__m128i *ks, t, fb, *bi, *bo;
 	int i;
 
-	k0 = key->schedule[0];
-	k1 = key->schedule[1];
-	k2 = key->schedule[2];
-	k3 = key->schedule[3];
-	k4 = key->schedule[4];
-	k5 = key->schedule[5];
-	k6 = key->schedule[6];
-	k7 = key->schedule[7];
-	k8 = key->schedule[8];
-	k9 = key->schedule[9];
-	k10 = key->schedule[10];
-	k11 = key->schedule[11];
-	k12 = key->schedule[12];
-
+	ks = key->schedule;
 	bi = (__m128i*)in;
 	bo = (__m128i*)out;
 
@@ -264,21 +226,21 @@ static void encrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
 	{
 		t = _mm_loadu_si128(bi + i);
 		fb = _mm_xor_si128(t, fb);
-		fb = _mm_xor_si128(fb, k0);
+		fb = _mm_xor_si128(fb, ks[0]);
 
-		fb = _mm_aesenc_si128(fb, k1);
-		fb = _mm_aesenc_si128(fb, k2);
-		fb = _mm_aesenc_si128(fb, k3);
-		fb = _mm_aesenc_si128(fb, k4);
-		fb = _mm_aesenc_si128(fb, k5);
-		fb = _mm_aesenc_si128(fb, k6);
-		fb = _mm_aesenc_si128(fb, k7);
-		fb = _mm_aesenc_si128(fb, k8);
-		fb = _mm_aesenc_si128(fb, k9);
-		fb = _mm_aesenc_si128(fb, k10);
-		fb = _mm_aesenc_si128(fb, k11);
+		fb = _mm_aesenc_si128(fb, ks[1]);
+		fb = _mm_aesenc_si128(fb, ks[2]);
+		fb = _mm_aesenc_si128(fb, ks[3]);
+		fb = _mm_aesenc_si128(fb, ks[4]);
+		fb = _mm_aesenc_si128(fb, ks[5]);
+		fb = _mm_aesenc_si128(fb, ks[6]);
+		fb = _mm_aesenc_si128(fb, ks[7]);
+		fb = _mm_aesenc_si128(fb, ks[8]);
+		fb = _mm_aesenc_si128(fb, ks[9]);
+		fb = _mm_aesenc_si128(fb, ks[10]);
+		fb = _mm_aesenc_si128(fb, ks[11]);
 
-		fb = _mm_aesenclast_si128(fb, k12);
+		fb = _mm_aesenclast_si128(fb, ks[12]);
 		_mm_storeu_si128(bo + i, fb);
 	}
 }
@@ -289,26 +251,12 @@ static void encrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
 static void decrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
 						   u_char *iv, u_char *out)
 {
-	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
-	__m128i last, *bi, *bo;
+	__m128i *ks, last, *bi, *bo;
 	__m128i t1, t2, t3, t4;
 	__m128i f1, f2, f3, f4;
 	u_int i, pblocks;
 
-	k0 = key->schedule[0];
-	k1 = key->schedule[1];
-	k2 = key->schedule[2];
-	k3 = key->schedule[3];
-	k4 = key->schedule[4];
-	k5 = key->schedule[5];
-	k6 = key->schedule[6];
-	k7 = key->schedule[7];
-	k8 = key->schedule[8];
-	k9 = key->schedule[9];
-	k10 = key->schedule[10];
-	k11 = key->schedule[11];
-	k12 = key->schedule[12];
-
+	ks = key->schedule;
 	bi = (__m128i*)in;
 	bo = (__m128i*)out;
 	pblocks = blocks - (blocks % CBC_DECRYPT_PARALLELISM);
@@ -327,60 +275,60 @@ static void decrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
 		f4 = t3;
 		last = t4;
 
-		t1 = _mm_xor_si128(t1, k0);
-		t2 = _mm_xor_si128(t2, k0);
-		t3 = _mm_xor_si128(t3, k0);
-		t4 = _mm_xor_si128(t4, k0);
+		t1 = _mm_xor_si128(t1, ks[0]);
+		t2 = _mm_xor_si128(t2, ks[0]);
+		t3 = _mm_xor_si128(t3, ks[0]);
+		t4 = _mm_xor_si128(t4, ks[0]);
 
-		t1 = _mm_aesdec_si128(t1, k1);
-		t2 = _mm_aesdec_si128(t2, k1);
-		t3 = _mm_aesdec_si128(t3, k1);
-		t4 = _mm_aesdec_si128(t4, k1);
-		t1 = _mm_aesdec_si128(t1, k2);
-		t2 = _mm_aesdec_si128(t2, k2);
-		t3 = _mm_aesdec_si128(t3, k2);
-		t4 = _mm_aesdec_si128(t4, k2);
-		t1 = _mm_aesdec_si128(t1, k3);
-		t2 = _mm_aesdec_si128(t2, k3);
-		t3 = _mm_aesdec_si128(t3, k3);
-		t4 = _mm_aesdec_si128(t4, k3);
-		t1 = _mm_aesdec_si128(t1, k4);
-		t2 = _mm_aesdec_si128(t2, k4);
-		t3 = _mm_aesdec_si128(t3, k4);
-		t4 = _mm_aesdec_si128(t4, k4);
-		t1 = _mm_aesdec_si128(t1, k5);
-		t2 = _mm_aesdec_si128(t2, k5);
-		t3 = _mm_aesdec_si128(t3, k5);
-		t4 = _mm_aesdec_si128(t4, k5);
-		t1 = _mm_aesdec_si128(t1, k6);
-		t2 = _mm_aesdec_si128(t2, k6);
-		t3 = _mm_aesdec_si128(t3, k6);
-		t4 = _mm_aesdec_si128(t4, k6);
-		t1 = _mm_aesdec_si128(t1, k7);
-		t2 = _mm_aesdec_si128(t2, k7);
-		t3 = _mm_aesdec_si128(t3, k7);
-		t4 = _mm_aesdec_si128(t4, k7);
-		t1 = _mm_aesdec_si128(t1, k8);
-		t2 = _mm_aesdec_si128(t2, k8);
-		t3 = _mm_aesdec_si128(t3, k8);
-		t4 = _mm_aesdec_si128(t4, k8);
-		t1 = _mm_aesdec_si128(t1, k9);
-		t2 = _mm_aesdec_si128(t2, k9);
-		t3 = _mm_aesdec_si128(t3, k9);
-		t4 = _mm_aesdec_si128(t4, k9);
-		t1 = _mm_aesdec_si128(t1, k10);
-		t2 = _mm_aesdec_si128(t2, k10);
-		t3 = _mm_aesdec_si128(t3, k10);
-		t4 = _mm_aesdec_si128(t4, k10);
-		t1 = _mm_aesdec_si128(t1, k11);
-		t2 = _mm_aesdec_si128(t2, k11);
-		t3 = _mm_aesdec_si128(t3, k11);
-		t4 = _mm_aesdec_si128(t4, k11);
+		t1 = _mm_aesdec_si128(t1, ks[1]);
+		t2 = _mm_aesdec_si128(t2, ks[1]);
+		t3 = _mm_aesdec_si128(t3, ks[1]);
+		t4 = _mm_aesdec_si128(t4, ks[1]);
+		t1 = _mm_aesdec_si128(t1, ks[2]);
+		t2 = _mm_aesdec_si128(t2, ks[2]);
+		t3 = _mm_aesdec_si128(t3, ks[2]);
+		t4 = _mm_aesdec_si128(t4, ks[2]);
+		t1 = _mm_aesdec_si128(t1, ks[3]);
+		t2 = _mm_aesdec_si128(t2, ks[3]);
+		t3 = _mm_aesdec_si128(t3, ks[3]);
+		t4 = _mm_aesdec_si128(t4, ks[3]);
+		t1 = _mm_aesdec_si128(t1, ks[4]);
+		t2 = _mm_aesdec_si128(t2, ks[4]);
+		t3 = _mm_aesdec_si128(t3, ks[4]);
+		t4 = _mm_aesdec_si128(t4, ks[4]);
+		t1 = _mm_aesdec_si128(t1, ks[5]);
+		t2 = _mm_aesdec_si128(t2, ks[5]);
+		t3 = _mm_aesdec_si128(t3, ks[5]);
+		t4 = _mm_aesdec_si128(t4, ks[5]);
+		t1 = _mm_aesdec_si128(t1, ks[6]);
+		t2 = _mm_aesdec_si128(t2, ks[6]);
+		t3 = _mm_aesdec_si128(t3, ks[6]);
+		t4 = _mm_aesdec_si128(t4, ks[6]);
+		t1 = _mm_aesdec_si128(t1, ks[7]);
+		t2 = _mm_aesdec_si128(t2, ks[7]);
+		t3 = _mm_aesdec_si128(t3, ks[7]);
+		t4 = _mm_aesdec_si128(t4, ks[7]);
+		t1 = _mm_aesdec_si128(t1, ks[8]);
+		t2 = _mm_aesdec_si128(t2, ks[8]);
+		t3 = _mm_aesdec_si128(t3, ks[8]);
+		t4 = _mm_aesdec_si128(t4, ks[8]);
+		t1 = _mm_aesdec_si128(t1, ks[9]);
+		t2 = _mm_aesdec_si128(t2, ks[9]);
+		t3 = _mm_aesdec_si128(t3, ks[9]);
+		t4 = _mm_aesdec_si128(t4, ks[9]);
+		t1 = _mm_aesdec_si128(t1, ks[10]);
+		t2 = _mm_aesdec_si128(t2, ks[10]);
+		t3 = _mm_aesdec_si128(t3, ks[10]);
+		t4 = _mm_aesdec_si128(t4, ks[10]);
+		t1 = _mm_aesdec_si128(t1, ks[11]);
+		t2 = _mm_aesdec_si128(t2, ks[11]);
+		t3 = _mm_aesdec_si128(t3, ks[11]);
+		t4 = _mm_aesdec_si128(t4, ks[11]);
 
-		t1 = _mm_aesdeclast_si128(t1, k12);
-		t2 = _mm_aesdeclast_si128(t2, k12);
-		t3 = _mm_aesdeclast_si128(t3, k12);
-		t4 = _mm_aesdeclast_si128(t4, k12);
+		t1 = _mm_aesdeclast_si128(t1, ks[12]);
+		t2 = _mm_aesdeclast_si128(t2, ks[12]);
+		t3 = _mm_aesdeclast_si128(t3, ks[12]);
+		t4 = _mm_aesdeclast_si128(t4, ks[12]);
 		t1 = _mm_xor_si128(t1, f1);
 		t2 = _mm_xor_si128(t2, f2);
 		t3 = _mm_xor_si128(t3, f3);
@@ -395,21 +343,21 @@ static void decrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
 	for (i = pblocks; i < blocks; i++)
 	{
 		last = _mm_loadu_si128(bi + i);
-		t1 = _mm_xor_si128(last, k0);
+		t1 = _mm_xor_si128(last, ks[0]);
 
-		t1 = _mm_aesdec_si128(t1, k1);
-		t1 = _mm_aesdec_si128(t1, k2);
-		t1 = _mm_aesdec_si128(t1, k3);
-		t1 = _mm_aesdec_si128(t1, k4);
-		t1 = _mm_aesdec_si128(t1, k5);
-		t1 = _mm_aesdec_si128(t1, k6);
-		t1 = _mm_aesdec_si128(t1, k7);
-		t1 = _mm_aesdec_si128(t1, k8);
-		t1 = _mm_aesdec_si128(t1, k9);
-		t1 = _mm_aesdec_si128(t1, k10);
-		t1 = _mm_aesdec_si128(t1, k11);
+		t1 = _mm_aesdec_si128(t1, ks[1]);
+		t1 = _mm_aesdec_si128(t1, ks[2]);
+		t1 = _mm_aesdec_si128(t1, ks[3]);
+		t1 = _mm_aesdec_si128(t1, ks[4]);
+		t1 = _mm_aesdec_si128(t1, ks[5]);
+		t1 = _mm_aesdec_si128(t1, ks[6]);
+		t1 = _mm_aesdec_si128(t1, ks[7]);
+		t1 = _mm_aesdec_si128(t1, ks[8]);
+		t1 = _mm_aesdec_si128(t1, ks[9]);
+		t1 = _mm_aesdec_si128(t1, ks[10]);
+		t1 = _mm_aesdec_si128(t1, ks[11]);
 
-		t1 = _mm_aesdeclast_si128(t1, k12);
+		t1 = _mm_aesdeclast_si128(t1, ks[12]);
 		t1 = _mm_xor_si128(t1, f1);
 		_mm_storeu_si128(bo + i, t1);
 		f1 = last;
@@ -422,26 +370,10 @@ static void decrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
 static void encrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
 						   u_char *iv, u_char *out)
 {
-	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
-	__m128i t, fb, *bi, *bo;
+	__m128i *ks, t, fb, *bi, *bo;
 	int i;
 
-	k0 = key->schedule[0];
-	k1 = key->schedule[1];
-	k2 = key->schedule[2];
-	k3 = key->schedule[3];
-	k4 = key->schedule[4];
-	k5 = key->schedule[5];
-	k6 = key->schedule[6];
-	k7 = key->schedule[7];
-	k8 = key->schedule[8];
-	k9 = key->schedule[9];
-	k10 = key->schedule[10];
-	k11 = key->schedule[11];
-	k12 = key->schedule[12];
-	k13 = key->schedule[13];
-	k14 = key->schedule[14];
-
+	ks = key->schedule;
 	bi = (__m128i*)in;
 	bo = (__m128i*)out;
 
@@ -450,23 +382,23 @@ static void encrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
 	{
 		t = _mm_loadu_si128(bi + i);
 		fb = _mm_xor_si128(t, fb);
-		fb = _mm_xor_si128(fb, k0);
+		fb = _mm_xor_si128(fb, ks[0]);
 
-		fb = _mm_aesenc_si128(fb, k1);
-		fb = _mm_aesenc_si128(fb, k2);
-		fb = _mm_aesenc_si128(fb, k3);
-		fb = _mm_aesenc_si128(fb, k4);
-		fb = _mm_aesenc_si128(fb, k5);
-		fb = _mm_aesenc_si128(fb, k6);
-		fb = _mm_aesenc_si128(fb, k7);
-		fb = _mm_aesenc_si128(fb, k8);
-		fb = _mm_aesenc_si128(fb, k9);
-		fb = _mm_aesenc_si128(fb, k10);
-		fb = _mm_aesenc_si128(fb, k11);
-		fb = _mm_aesenc_si128(fb, k12);
-		fb = _mm_aesenc_si128(fb, k13);
+		fb = _mm_aesenc_si128(fb, ks[1]);
+		fb = _mm_aesenc_si128(fb, ks[2]);
+		fb = _mm_aesenc_si128(fb, ks[3]);
+		fb = _mm_aesenc_si128(fb, ks[4]);
+		fb = _mm_aesenc_si128(fb, ks[5]);
+		fb = _mm_aesenc_si128(fb, ks[6]);
+		fb = _mm_aesenc_si128(fb, ks[7]);
+		fb = _mm_aesenc_si128(fb, ks[8]);
+		fb = _mm_aesenc_si128(fb, ks[9]);
+		fb = _mm_aesenc_si128(fb, ks[10]);
+		fb = _mm_aesenc_si128(fb, ks[11]);
+		fb = _mm_aesenc_si128(fb, ks[12]);
+		fb = _mm_aesenc_si128(fb, ks[13]);
 
-		fb = _mm_aesenclast_si128(fb, k14);
+		fb = _mm_aesenclast_si128(fb, ks[14]);
 		_mm_storeu_si128(bo + i, fb);
 	}
 }
@@ -477,28 +409,12 @@ static void encrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
 static void decrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
 						   u_char *iv, u_char *out)
 {
-	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
-	__m128i last, *bi, *bo;
+	__m128i *ks, last, *bi, *bo;
 	__m128i t1, t2, t3, t4;
 	__m128i f1, f2, f3, f4;
 	u_int i, pblocks;
 
-	k0 = key->schedule[0];
-	k1 = key->schedule[1];
-	k2 = key->schedule[2];
-	k3 = key->schedule[3];
-	k4 = key->schedule[4];
-	k5 = key->schedule[5];
-	k6 = key->schedule[6];
-	k7 = key->schedule[7];
-	k8 = key->schedule[8];
-	k9 = key->schedule[9];
-	k10 = key->schedule[10];
-	k11 = key->schedule[11];
-	k12 = key->schedule[12];
-	k13 = key->schedule[13];
-	k14 = key->schedule[14];
-
+	ks = key->schedule;
 	bi = (__m128i*)in;
 	bo = (__m128i*)out;
 	pblocks = blocks - (blocks % CBC_DECRYPT_PARALLELISM);
@@ -517,68 +433,68 @@ static void decrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
 		f4 = t3;
 		last = t4;
 
-		t1 = _mm_xor_si128(t1, k0);
-		t2 = _mm_xor_si128(t2, k0);
-		t3 = _mm_xor_si128(t3, k0);
-		t4 = _mm_xor_si128(t4, k0);
+		t1 = _mm_xor_si128(t1, ks[0]);
+		t2 = _mm_xor_si128(t2, ks[0]);
+		t3 = _mm_xor_si128(t3, ks[0]);
+		t4 = _mm_xor_si128(t4, ks[0]);
 
-		t1 = _mm_aesdec_si128(t1, k1);
-		t2 = _mm_aesdec_si128(t2, k1);
-		t3 = _mm_aesdec_si128(t3, k1);
-		t4 = _mm_aesdec_si128(t4, k1);
-		t1 = _mm_aesdec_si128(t1, k2);
-		t2 = _mm_aesdec_si128(t2, k2);
-		t3 = _mm_aesdec_si128(t3, k2);
-		t4 = _mm_aesdec_si128(t4, k2);
-		t1 = _mm_aesdec_si128(t1, k3);
-		t2 = _mm_aesdec_si128(t2, k3);
-		t3 = _mm_aesdec_si128(t3, k3);
-		t4 = _mm_aesdec_si128(t4, k3);
-		t1 = _mm_aesdec_si128(t1, k4);
-		t2 = _mm_aesdec_si128(t2, k4);
-		t3 = _mm_aesdec_si128(t3, k4);
-		t4 = _mm_aesdec_si128(t4, k4);
-		t1 = _mm_aesdec_si128(t1, k5);
-		t2 = _mm_aesdec_si128(t2, k5);
-		t3 = _mm_aesdec_si128(t3, k5);
-		t4 = _mm_aesdec_si128(t4, k5);
-		t1 = _mm_aesdec_si128(t1, k6);
-		t2 = _mm_aesdec_si128(t2, k6);
-		t3 = _mm_aesdec_si128(t3, k6);
-		t4 = _mm_aesdec_si128(t4, k6);
-		t1 = _mm_aesdec_si128(t1, k7);
-		t2 = _mm_aesdec_si128(t2, k7);
-		t3 = _mm_aesdec_si128(t3, k7);
-		t4 = _mm_aesdec_si128(t4, k7);
-		t1 = _mm_aesdec_si128(t1, k8);
-		t2 = _mm_aesdec_si128(t2, k8);
-		t3 = _mm_aesdec_si128(t3, k8);
-		t4 = _mm_aesdec_si128(t4, k8);
-		t1 = _mm_aesdec_si128(t1, k9);
-		t2 = _mm_aesdec_si128(t2, k9);
-		t3 = _mm_aesdec_si128(t3, k9);
-		t4 = _mm_aesdec_si128(t4, k9);
-		t1 = _mm_aesdec_si128(t1, k10);
-		t2 = _mm_aesdec_si128(t2, k10);
-		t3 = _mm_aesdec_si128(t3, k10);
-		t4 = _mm_aesdec_si128(t4, k10);
-		t1 = _mm_aesdec_si128(t1, k11);
-		t2 = _mm_aesdec_si128(t2, k11);
-		t3 = _mm_aesdec_si128(t3, k11);
-		t4 = _mm_aesdec_si128(t4, k11);
-		t1 = _mm_aesdec_si128(t1, k12);
-		t2 = _mm_aesdec_si128(t2, k12);
-		t3 = _mm_aesdec_si128(t3, k12);
-		t4 = _mm_aesdec_si128(t4, k12);
-		t1 = _mm_aesdec_si128(t1, k13);
-		t2 = _mm_aesdec_si128(t2, k13);
-		t3 = _mm_aesdec_si128(t3, k13);
-		t4 = _mm_aesdec_si128(t4, k13);
+		t1 = _mm_aesdec_si128(t1, ks[1]);
+		t2 = _mm_aesdec_si128(t2, ks[1]);
+		t3 = _mm_aesdec_si128(t3, ks[1]);
+		t4 = _mm_aesdec_si128(t4, ks[1]);
+		t1 = _mm_aesdec_si128(t1, ks[2]);
+		t2 = _mm_aesdec_si128(t2, ks[2]);
+		t3 = _mm_aesdec_si128(t3, ks[2]);
+		t4 = _mm_aesdec_si128(t4, ks[2]);
+		t1 = _mm_aesdec_si128(t1, ks[3]);
+		t2 = _mm_aesdec_si128(t2, ks[3]);
+		t3 = _mm_aesdec_si128(t3, ks[3]);
+		t4 = _mm_aesdec_si128(t4, ks[3]);
+		t1 = _mm_aesdec_si128(t1, ks[4]);
+		t2 = _mm_aesdec_si128(t2, ks[4]);
+		t3 = _mm_aesdec_si128(t3, ks[4]);
+		t4 = _mm_aesdec_si128(t4, ks[4]);
+		t1 = _mm_aesdec_si128(t1, ks[5]);
+		t2 = _mm_aesdec_si128(t2, ks[5]);
+		t3 = _mm_aesdec_si128(t3, ks[5]);
+		t4 = _mm_aesdec_si128(t4, ks[5]);
+		t1 = _mm_aesdec_si128(t1, ks[6]);
+		t2 = _mm_aesdec_si128(t2, ks[6]);
+		t3 = _mm_aesdec_si128(t3, ks[6]);
+		t4 = _mm_aesdec_si128(t4, ks[6]);
+		t1 = _mm_aesdec_si128(t1, ks[7]);
+		t2 = _mm_aesdec_si128(t2, ks[7]);
+		t3 = _mm_aesdec_si128(t3, ks[7]);
+		t4 = _mm_aesdec_si128(t4, ks[7]);
+		t1 = _mm_aesdec_si128(t1, ks[8]);
+		t2 = _mm_aesdec_si128(t2, ks[8]);
+		t3 = _mm_aesdec_si128(t3, ks[8]);
+		t4 = _mm_aesdec_si128(t4, ks[8]);
+		t1 = _mm_aesdec_si128(t1, ks[9]);
+		t2 = _mm_aesdec_si128(t2, ks[9]);
+		t3 = _mm_aesdec_si128(t3, ks[9]);
+		t4 = _mm_aesdec_si128(t4, ks[9]);
+		t1 = _mm_aesdec_si128(t1, ks[10]);
+		t2 = _mm_aesdec_si128(t2, ks[10]);
+		t3 = _mm_aesdec_si128(t3, ks[10]);
+		t4 = _mm_aesdec_si128(t4, ks[10]);
+		t1 = _mm_aesdec_si128(t1, ks[11]);
+		t2 = _mm_aesdec_si128(t2, ks[11]);
+		t3 = _mm_aesdec_si128(t3, ks[11]);
+		t4 = _mm_aesdec_si128(t4, ks[11]);
+		t1 = _mm_aesdec_si128(t1, ks[12]);
+		t2 = _mm_aesdec_si128(t2, ks[12]);
+		t3 = _mm_aesdec_si128(t3, ks[12]);
+		t4 = _mm_aesdec_si128(t4, ks[12]);
+		t1 = _mm_aesdec_si128(t1, ks[13]);
+		t2 = _mm_aesdec_si128(t2, ks[13]);
+		t3 = _mm_aesdec_si128(t3, ks[13]);
+		t4 = _mm_aesdec_si128(t4, ks[13]);
 
-		t1 = _mm_aesdeclast_si128(t1, k14);
-		t2 = _mm_aesdeclast_si128(t2, k14);
-		t3 = _mm_aesdeclast_si128(t3, k14);
-		t4 = _mm_aesdeclast_si128(t4, k14);
+		t1 = _mm_aesdeclast_si128(t1, ks[14]);
+		t2 = _mm_aesdeclast_si128(t2, ks[14]);
+		t3 = _mm_aesdeclast_si128(t3, ks[14]);
+		t4 = _mm_aesdeclast_si128(t4, ks[14]);
 		t1 = _mm_xor_si128(t1, f1);
 		t2 = _mm_xor_si128(t2, f2);
 		t3 = _mm_xor_si128(t3, f3);
@@ -593,23 +509,23 @@ static void decrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
 	for (i = pblocks; i < blocks; i++)
 	{
 		last = _mm_loadu_si128(bi + i);
-		t1 = _mm_xor_si128(last, k0);
+		t1 = _mm_xor_si128(last, ks[0]);
 
-		t1 = _mm_aesdec_si128(t1, k1);
-		t1 = _mm_aesdec_si128(t1, k2);
-		t1 = _mm_aesdec_si128(t1, k3);
-		t1 = _mm_aesdec_si128(t1, k4);
-		t1 = _mm_aesdec_si128(t1, k5);
-		t1 = _mm_aesdec_si128(t1, k6);
-		t1 = _mm_aesdec_si128(t1, k7);
-		t1 = _mm_aesdec_si128(t1, k8);
-		t1 = _mm_aesdec_si128(t1, k9);
-		t1 = _mm_aesdec_si128(t1, k10);
-		t1 = _mm_aesdec_si128(t1, k11);
-		t1 = _mm_aesdec_si128(t1, k12);
-		t1 = _mm_aesdec_si128(t1, k13);
+		t1 = _mm_aesdec_si128(t1, ks[1]);
+		t1 = _mm_aesdec_si128(t1, ks[2]);
+		t1 = _mm_aesdec_si128(t1, ks[3]);
+		t1 = _mm_aesdec_si128(t1, ks[4]);
+		t1 = _mm_aesdec_si128(t1, ks[5]);
+		t1 = _mm_aesdec_si128(t1, ks[6]);
+		t1 = _mm_aesdec_si128(t1, ks[7]);
+		t1 = _mm_aesdec_si128(t1, ks[8]);
+		t1 = _mm_aesdec_si128(t1, ks[9]);
+		t1 = _mm_aesdec_si128(t1, ks[10]);
+		t1 = _mm_aesdec_si128(t1, ks[11]);
+		t1 = _mm_aesdec_si128(t1, ks[12]);
+		t1 = _mm_aesdec_si128(t1, ks[13]);
 
-		t1 = _mm_aesdeclast_si128(t1, k14);
+		t1 = _mm_aesdeclast_si128(t1, ks[14]);
 		t1 = _mm_xor_si128(t1, f1);
 		_mm_storeu_si128(bo + i, t1);
 		f1 = last;
@@ -159,17 +159,18 @@ static void build_ctr(private_aesni_ccm_t *this, u_int32_t i, u_char *iv,
 static __m128i icv_header(private_aesni_ccm_t *this, size_t len, u_char *iv,
 						  u_int16_t alen, u_char *assoc)
 {
-	__m128i b, t, c;
+	__m128i *ks, b, t, c;
 	u_int i, round, blocks, rem;
 
+	ks = this->key->schedule;
 	build_b0(this, len, alen, iv, &b);
 	c = _mm_loadu_si128(&b);
-	c = _mm_xor_si128(c, this->key->schedule[0]);
+	c = _mm_xor_si128(c, ks[0]);
 	for (round = 1; round < this->key->rounds; round++)
 	{
-		c = _mm_aesenc_si128(c, this->key->schedule[round]);
+		c = _mm_aesenc_si128(c, ks[round]);
 	}
-	c = _mm_aesenclast_si128(c, this->key->schedule[this->key->rounds]);
+	c = _mm_aesenclast_si128(c, ks[this->key->rounds]);
 
 	if (alen)
 	{
@@ -200,12 +201,12 @@ static __m128i icv_header(private_aesni_ccm_t *this, size_t len, u_char *iv,
 				t = _mm_loadu_si128(((__m128i*)(assoc - sizeof(alen))) + i);
 			}
 			c = _mm_xor_si128(t, c);
-			c = _mm_xor_si128(c, this->key->schedule[0]);
+			c = _mm_xor_si128(c, ks[0]);
 			for (round = 1; round < this->key->rounds; round++)
 			{
-				c = _mm_aesenc_si128(c, this->key->schedule[round]);
+				c = _mm_aesenc_si128(c, ks[round]);
 			}
-			c = _mm_aesenclast_si128(c, this->key->schedule[this->key->rounds]);
+			c = _mm_aesenclast_si128(c, ks[this->key->rounds]);
 		}
 	}
 	return c;
@@ -217,18 +218,19 @@ static __m128i icv_header(private_aesni_ccm_t *this, size_t len, u_char *iv,
 static void crypt_icv(private_aesni_ccm_t *this, u_char *iv,
 					  __m128i c, u_char *icv)
 {
-	__m128i b, t;
+	__m128i *ks, b, t;
 	u_int round;
 
+	ks = this->key->schedule;
 	build_ctr(this, 0, iv, &b);
 
 	t = _mm_loadu_si128(&b);
-	t = _mm_xor_si128(t, this->key->schedule[0]);
+	t = _mm_xor_si128(t, ks[0]);
 	for (round = 1; round < this->key->rounds; round++)
 	{
-		t = _mm_aesenc_si128(t, this->key->schedule[round]);
+		t = _mm_aesenc_si128(t, ks[round]);
 	}
-	t = _mm_aesenclast_si128(t, this->key->schedule[this->key->rounds]);
+	t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
 
 	t = _mm_xor_si128(t, c);
 
@@ -258,23 +260,24 @@ static inline __m128i increment_be(__m128i x)
 static __m128i encrypt_ccm_rem(aesni_key_t *key, u_int rem, __m128i state,
 							   void *in, void *out, __m128i c)
 {
-	__m128i t, b, d;
+	__m128i *ks, t, b, d;
 	u_int round;
 
+	ks = key->schedule;
 	memset(&b, 0, sizeof(b));
 	memcpy(&b, in, rem);
 	d = _mm_loadu_si128(&b);
 
 	c = _mm_xor_si128(d, c);
-	c = _mm_xor_si128(c, key->schedule[0]);
-	t = _mm_xor_si128(state, key->schedule[0]);
+	c = _mm_xor_si128(c, ks[0]);
+	t = _mm_xor_si128(state, ks[0]);
 	for (round = 1; round < key->rounds; round++)
 	{
-		c = _mm_aesenc_si128(c, key->schedule[round]);
-		t = _mm_aesenc_si128(t, key->schedule[round]);
+		c = _mm_aesenc_si128(c, ks[round]);
+		t = _mm_aesenc_si128(t, ks[round]);
 	}
-	c = _mm_aesenclast_si128(c, key->schedule[key->rounds]);
-	t = _mm_aesenclast_si128(t, key->schedule[key->rounds]);
+	c = _mm_aesenclast_si128(c, ks[key->rounds]);
+	t = _mm_aesenclast_si128(t, ks[key->rounds]);
 
 	t = _mm_xor_si128(t, d);
 	_mm_storeu_si128(&b, t);
@@ -290,31 +293,32 @@ static __m128i encrypt_ccm_rem(aesni_key_t *key, u_int rem, __m128i state,
 static __m128i decrypt_ccm_rem(aesni_key_t *key, u_int rem, __m128i state,
 							   void *in, void *out, __m128i c)
 {
-	__m128i t, b, d;
+	__m128i *ks, t, b, d;
 	u_int round;
 
+	ks = key->schedule;
 	memset(&b, 0, sizeof(b));
 	memcpy(&b, in, rem);
 	d = _mm_loadu_si128(&b);
 
-	t = _mm_xor_si128(state, key->schedule[0]);
+	t = _mm_xor_si128(state, ks[0]);
 	for (round = 1; round < key->rounds; round++)
 	{
-		t = _mm_aesenc_si128(t, key->schedule[round]);
+		t = _mm_aesenc_si128(t, ks[round]);
 	}
-	t = _mm_aesenclast_si128(t, key->schedule[key->rounds]);
+	t = _mm_aesenclast_si128(t, ks[key->rounds]);
 	t = _mm_xor_si128(t, d);
 	_mm_storeu_si128(&b, t);
 
 	memset((u_char*)&b + rem, 0, sizeof(b) - rem);
 	t = _mm_loadu_si128(&b);
 	c = _mm_xor_si128(t, c);
-	c = _mm_xor_si128(c, key->schedule[0]);
+	c = _mm_xor_si128(c, ks[0]);
 	for (round = 1; round < key->rounds; round++)
 	{
-		c = _mm_aesenc_si128(c, key->schedule[round]);
+		c = _mm_aesenc_si128(c, ks[round]);
 	}
-	c = _mm_aesenclast_si128(c, key->schedule[key->rounds]);
+	c = _mm_aesenclast_si128(c, ks[key->rounds]);
 
 	memcpy(out, &b, rem);
 
@@ -328,8 +332,7 @@ static void encrypt_ccm128(private_aesni_ccm_t *this,
 						   size_t len, u_char *in, u_char *out, u_char *iv,
 						   size_t alen, u_char *assoc, u_char *icv)
 {
-	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
-	__m128i d, t, c, b, state, *bi, *bo;
+	__m128i *ks, d, t, c, b, state, *bi, *bo;
 	u_int blocks, rem, i;
 
 	c = icv_header(this, len, iv, alen, assoc);
@@ -340,47 +343,37 @@ static void encrypt_ccm128(private_aesni_ccm_t *this,
 	bi = (__m128i*)in;
 	bo = (__m128i*)out;
 
-	k0 = this->key->schedule[0];
-	k1 = this->key->schedule[1];
-	k2 = this->key->schedule[2];
-	k3 = this->key->schedule[3];
-	k4 = this->key->schedule[4];
-	k5 = this->key->schedule[5];
-	k6 = this->key->schedule[6];
-	k7 = this->key->schedule[7];
-	k8 = this->key->schedule[8];
-	k9 = this->key->schedule[9];
-	k10 = this->key->schedule[10];
+	ks = this->key->schedule;
 
 	for (i = 0; i < blocks; i++)
 	{
 		d = _mm_loadu_si128(bi + i);
 
 		c = _mm_xor_si128(d, c);
-		c = _mm_xor_si128(c, k0);
-		t = _mm_xor_si128(state, k0);
+		c = _mm_xor_si128(c, ks[0]);
+		t = _mm_xor_si128(state, ks[0]);
 
-		c = _mm_aesenc_si128(c, k1);
-		t = _mm_aesenc_si128(t, k1);
-		c = _mm_aesenc_si128(c, k2);
-		t = _mm_aesenc_si128(t, k2);
-		c = _mm_aesenc_si128(c, k3);
-		t = _mm_aesenc_si128(t, k3);
-		c = _mm_aesenc_si128(c, k4);
-		t = _mm_aesenc_si128(t, k4);
-		c = _mm_aesenc_si128(c, k5);
-		t = _mm_aesenc_si128(t, k5);
-		c = _mm_aesenc_si128(c, k6);
-		t = _mm_aesenc_si128(t, k6);
-		c = _mm_aesenc_si128(c, k7);
-		t = _mm_aesenc_si128(t, k7);
-		c = _mm_aesenc_si128(c, k8);
-		t = _mm_aesenc_si128(t, k8);
-		c = _mm_aesenc_si128(c, k9);
-		t = _mm_aesenc_si128(t, k9);
+		c = _mm_aesenc_si128(c, ks[1]);
+		t = _mm_aesenc_si128(t, ks[1]);
+		c = _mm_aesenc_si128(c, ks[2]);
+		t = _mm_aesenc_si128(t, ks[2]);
+		c = _mm_aesenc_si128(c, ks[3]);
+		t = _mm_aesenc_si128(t, ks[3]);
+		c = _mm_aesenc_si128(c, ks[4]);
+		t = _mm_aesenc_si128(t, ks[4]);
+		c = _mm_aesenc_si128(c, ks[5]);
+		t = _mm_aesenc_si128(t, ks[5]);
+		c = _mm_aesenc_si128(c, ks[6]);
+		t = _mm_aesenc_si128(t, ks[6]);
+		c = _mm_aesenc_si128(c, ks[7]);
+		t = _mm_aesenc_si128(t, ks[7]);
+		c = _mm_aesenc_si128(c, ks[8]);
+		t = _mm_aesenc_si128(t, ks[8]);
+		c = _mm_aesenc_si128(c, ks[9]);
+		t = _mm_aesenc_si128(t, ks[9]);
 
-		c = _mm_aesenclast_si128(c, k10);
-		t = _mm_aesenclast_si128(t, k10);
+		c = _mm_aesenclast_si128(c, ks[10]);
+		t = _mm_aesenclast_si128(t, ks[10]);
 
 		t = _mm_xor_si128(t, d);
 		_mm_storeu_si128(bo + i, t);
@@ -402,8 +395,7 @@ static void decrypt_ccm128(private_aesni_ccm_t *this,
 						   size_t len, u_char *in, u_char *out, u_char *iv,
 						   size_t alen, u_char *assoc, u_char *icv)
 {
-	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
-	__m128i d, t, c, b, state, *bi, *bo;
+	__m128i *ks, d, t, c, b, state, *bi, *bo;
 	u_int blocks, rem, i;
 
 	c = icv_header(this, len, iv, alen, assoc);
@@ -414,52 +406,42 @@ static void decrypt_ccm128(private_aesni_ccm_t *this,
 	bi = (__m128i*)in;
 	bo = (__m128i*)out;
 
-	k0 = this->key->schedule[0];
-	k1 = this->key->schedule[1];
-	k2 = this->key->schedule[2];
-	k3 = this->key->schedule[3];
-	k4 = this->key->schedule[4];
-	k5 = this->key->schedule[5];
-	k6 = this->key->schedule[6];
-	k7 = this->key->schedule[7];
-	k8 = this->key->schedule[8];
-	k9 = this->key->schedule[9];
-	k10 = this->key->schedule[10];
+	ks = this->key->schedule;
 
 	for (i = 0; i < blocks; i++)
 	{
 		d = _mm_loadu_si128(bi + i);
 
-		t = _mm_xor_si128(state, k0);
+		t = _mm_xor_si128(state, ks[0]);
 
-		t = _mm_aesenc_si128(t, k1);
-		t = _mm_aesenc_si128(t, k2);
-		t = _mm_aesenc_si128(t, k3);
-		t = _mm_aesenc_si128(t, k4);
-		t = _mm_aesenc_si128(t, k5);
-		t = _mm_aesenc_si128(t, k6);
-		t = _mm_aesenc_si128(t, k7);
-		t = _mm_aesenc_si128(t, k8);
-		t = _mm_aesenc_si128(t, k9);
+		t = _mm_aesenc_si128(t, ks[1]);
+		t = _mm_aesenc_si128(t, ks[2]);
+		t = _mm_aesenc_si128(t, ks[3]);
+		t = _mm_aesenc_si128(t, ks[4]);
+		t = _mm_aesenc_si128(t, ks[5]);
+		t = _mm_aesenc_si128(t, ks[6]);
+		t = _mm_aesenc_si128(t, ks[7]);
+		t = _mm_aesenc_si128(t, ks[8]);
+		t = _mm_aesenc_si128(t, ks[9]);
 
-		t = _mm_aesenclast_si128(t, k10);
+		t = _mm_aesenclast_si128(t, ks[10]);
 		t = _mm_xor_si128(t, d);
 		_mm_storeu_si128(bo + i, t);
 
 		c = _mm_xor_si128(t, c);
-		c = _mm_xor_si128(c, k0);
+		c = _mm_xor_si128(c, ks[0]);
 
-		c = _mm_aesenc_si128(c, k1);
-		c = _mm_aesenc_si128(c, k2);
-		c = _mm_aesenc_si128(c, k3);
-		c = _mm_aesenc_si128(c, k4);
-		c = _mm_aesenc_si128(c, k5);
-		c = _mm_aesenc_si128(c, k6);
-		c = _mm_aesenc_si128(c, k7);
-		c = _mm_aesenc_si128(c, k8);
-		c = _mm_aesenc_si128(c, k9);
+		c = _mm_aesenc_si128(c, ks[1]);
+		c = _mm_aesenc_si128(c, ks[2]);
+		c = _mm_aesenc_si128(c, ks[3]);
+		c = _mm_aesenc_si128(c, ks[4]);
+		c = _mm_aesenc_si128(c, ks[5]);
+		c = _mm_aesenc_si128(c, ks[6]);
+		c = _mm_aesenc_si128(c, ks[7]);
+		c = _mm_aesenc_si128(c, ks[8]);
+		c = _mm_aesenc_si128(c, ks[9]);
 
-		c = _mm_aesenclast_si128(c, k10);
+		c = _mm_aesenclast_si128(c, ks[10]);
 
 		state = increment_be(state);
 	}
@@ -478,8 +460,7 @@ static void encrypt_ccm192(private_aesni_ccm_t *this,
 						   size_t len, u_char *in, u_char *out, u_char *iv,
 						   size_t alen, u_char *assoc, u_char *icv)
 {
-	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
-	__m128i d, t, c, b, state, *bi, *bo;
+	__m128i *ks, d, t, c, b, state, *bi, *bo;
 	u_int blocks, rem, i;
 
 	c = icv_header(this, len, iv, alen, assoc);
@@ -490,53 +471,41 @@ static void encrypt_ccm192(private_aesni_ccm_t *this,
 	bi = (__m128i*)in;
 	bo = (__m128i*)out;
 
-	k0 = this->key->schedule[0];
-	k1 = this->key->schedule[1];
-	k2 = this->key->schedule[2];
-	k3 = this->key->schedule[3];
-	k4 = this->key->schedule[4];
-	k5 = this->key->schedule[5];
-	k6 = this->key->schedule[6];
-	k7 = this->key->schedule[7];
-	k8 = this->key->schedule[8];
-	k9 = this->key->schedule[9];
-	k10 = this->key->schedule[10];
-	k11 = this->key->schedule[11];
-	k12 = this->key->schedule[12];
+	ks = this->key->schedule;
 
 	for (i = 0; i < blocks; i++)
 	{
 		d = _mm_loadu_si128(bi + i);
 
 		c = _mm_xor_si128(d, c);
-		c = _mm_xor_si128(c, k0);
-		t = _mm_xor_si128(state, k0);
+		c = _mm_xor_si128(c, ks[0]);
+		t = _mm_xor_si128(state, ks[0]);
 
-		c = _mm_aesenc_si128(c, k1);
-		t = _mm_aesenc_si128(t, k1);
-		c = _mm_aesenc_si128(c, k2);
-		t = _mm_aesenc_si128(t, k2);
-		c = _mm_aesenc_si128(c, k3);
-		t = _mm_aesenc_si128(t, k3);
-		c = _mm_aesenc_si128(c, k4);
-		t = _mm_aesenc_si128(t, k4);
-		c = _mm_aesenc_si128(c, k5);
-		t = _mm_aesenc_si128(t, k5);
-		c = _mm_aesenc_si128(c, k6);
-		t = _mm_aesenc_si128(t, k6);
-		c = _mm_aesenc_si128(c, k7);
-		t = _mm_aesenc_si128(t, k7);
-		c = _mm_aesenc_si128(c, k8);
-		t = _mm_aesenc_si128(t, k8);
-		c = _mm_aesenc_si128(c, k9);
-		t = _mm_aesenc_si128(t, k9);
-		c = _mm_aesenc_si128(c, k10);
-		t = _mm_aesenc_si128(t, k10);
-		c = _mm_aesenc_si128(c, k11);
-		t = _mm_aesenc_si128(t, k11);
+		c = _mm_aesenc_si128(c, ks[1]);
+		t = _mm_aesenc_si128(t, ks[1]);
+		c = _mm_aesenc_si128(c, ks[2]);
+		t = _mm_aesenc_si128(t, ks[2]);
+		c = _mm_aesenc_si128(c, ks[3]);
+		t = _mm_aesenc_si128(t, ks[3]);
+		c = _mm_aesenc_si128(c, ks[4]);
+		t = _mm_aesenc_si128(t, ks[4]);
+		c = _mm_aesenc_si128(c, ks[5]);
+		t = _mm_aesenc_si128(t, ks[5]);
+		c = _mm_aesenc_si128(c, ks[6]);
+		t = _mm_aesenc_si128(t, ks[6]);
+		c = _mm_aesenc_si128(c, ks[7]);
+		t = _mm_aesenc_si128(t, ks[7]);
+		c = _mm_aesenc_si128(c, ks[8]);
+		t = _mm_aesenc_si128(t, ks[8]);
+		c = _mm_aesenc_si128(c, ks[9]);
+		t = _mm_aesenc_si128(t, ks[9]);
+		c = _mm_aesenc_si128(c, ks[10]);
+		t = _mm_aesenc_si128(t, ks[10]);
+		c = _mm_aesenc_si128(c, ks[11]);
+		t = _mm_aesenc_si128(t, ks[11]);
 
-		c = _mm_aesenclast_si128(c, k12);
-		t = _mm_aesenclast_si128(t, k12);
+		c = _mm_aesenclast_si128(c, ks[12]);
+		t = _mm_aesenclast_si128(t, ks[12]);
 
 		t = _mm_xor_si128(t, d);
 		_mm_storeu_si128(bo + i, t);
@@ -558,8 +527,7 @@ static void decrypt_ccm192(private_aesni_ccm_t *this,
 						   size_t len, u_char *in, u_char *out, u_char *iv,
 						   size_t alen, u_char *assoc, u_char *icv)
 {
-	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
-	__m128i d, t, c, b, state, *bi, *bo;
+	__m128i *ks, d, t, c, b, state, *bi, *bo;
 	u_int blocks, rem, i;
 
 	c = icv_header(this, len, iv, alen, assoc);
@@ -570,58 +538,46 @@ static void decrypt_ccm192(private_aesni_ccm_t *this,
 	bi = (__m128i*)in;
 	bo = (__m128i*)out;
 
-	k0 = this->key->schedule[0];
-	k1 = this->key->schedule[1];
-	k2 = this->key->schedule[2];
-	k3 = this->key->schedule[3];
-	k4 = this->key->schedule[4];
-	k5 = this->key->schedule[5];
-	k6 = this->key->schedule[6];
-	k7 = this->key->schedule[7];
-	k8 = this->key->schedule[8];
-	k9 = this->key->schedule[9];
-	k10 = this->key->schedule[10];
-	k11 = this->key->schedule[11];
-	k12 = this->key->schedule[12];
+	ks = this->key->schedule;
 
 	for (i = 0; i < blocks; i++)
 	{
 		d = _mm_loadu_si128(bi + i);
 
-		t = _mm_xor_si128(state, k0);
+		t = _mm_xor_si128(state, ks[0]);
 
-		t = _mm_aesenc_si128(t, k1);
-		t = _mm_aesenc_si128(t, k2);
-		t = _mm_aesenc_si128(t, k3);
-		t = _mm_aesenc_si128(t, k4);
-		t = _mm_aesenc_si128(t, k5);
-		t = _mm_aesenc_si128(t, k6);
-		t = _mm_aesenc_si128(t, k7);
-		t = _mm_aesenc_si128(t, k8);
-		t = _mm_aesenc_si128(t, k9);
-		t = _mm_aesenc_si128(t, k10);
-		t = _mm_aesenc_si128(t, k11);
+		t = _mm_aesenc_si128(t, ks[1]);
+		t = _mm_aesenc_si128(t, ks[2]);
+		t = _mm_aesenc_si128(t, ks[3]);
+		t = _mm_aesenc_si128(t, ks[4]);
+		t = _mm_aesenc_si128(t, ks[5]);
+		t = _mm_aesenc_si128(t, ks[6]);
+		t = _mm_aesenc_si128(t, ks[7]);
+		t = _mm_aesenc_si128(t, ks[8]);
+		t = _mm_aesenc_si128(t, ks[9]);
+		t = _mm_aesenc_si128(t, ks[10]);
+		t = _mm_aesenc_si128(t, ks[11]);
 
-		t = _mm_aesenclast_si128(t, k12);
+		t = _mm_aesenclast_si128(t, ks[12]);
 		t = _mm_xor_si128(t, d);
 		_mm_storeu_si128(bo + i, t);
 
 		c = _mm_xor_si128(t, c);
-		c = _mm_xor_si128(c, k0);
+		c = _mm_xor_si128(c, ks[0]);
 
-		c = _mm_aesenc_si128(c, k1);
-		c = _mm_aesenc_si128(c, k2);
-		c = _mm_aesenc_si128(c, k3);
-		c = _mm_aesenc_si128(c, k4);
-		c = _mm_aesenc_si128(c, k5);
-		c = _mm_aesenc_si128(c, k6);
-		c = _mm_aesenc_si128(c, k7);
-		c = _mm_aesenc_si128(c, k8);
-		c = _mm_aesenc_si128(c, k9);
-		c = _mm_aesenc_si128(c, k10);
-		c = _mm_aesenc_si128(c, k11);
+		c = _mm_aesenc_si128(c, ks[1]);
+		c = _mm_aesenc_si128(c, ks[2]);
+		c = _mm_aesenc_si128(c, ks[3]);
+		c = _mm_aesenc_si128(c, ks[4]);
+		c = _mm_aesenc_si128(c, ks[5]);
+		c = _mm_aesenc_si128(c, ks[6]);
+		c = _mm_aesenc_si128(c, ks[7]);
+		c = _mm_aesenc_si128(c, ks[8]);
+		c = _mm_aesenc_si128(c, ks[9]);
+		c = _mm_aesenc_si128(c, ks[10]);
+		c = _mm_aesenc_si128(c, ks[11]);
 
-		c = _mm_aesenclast_si128(c, k12);
+		c = _mm_aesenclast_si128(c, ks[12]);
 
 		state = increment_be(state);
 	}
@@ -640,8 +596,7 @@ static void encrypt_ccm256(private_aesni_ccm_t *this,
 						   size_t len, u_char *in, u_char *out, u_char *iv,
 						   size_t alen, u_char *assoc, u_char *icv)
 {
-	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
-	__m128i d, t, c, b, state, *bi, *bo;
+	__m128i *ks, d, t, c, b, state, *bi, *bo;
 	u_int blocks, rem, i;
 
 	c = icv_header(this, len, iv, alen, assoc);
@@ -652,59 +607,45 @@ static void encrypt_ccm256(private_aesni_ccm_t *this,
 	bi = (__m128i*)in;
 	bo = (__m128i*)out;
 
-	k0 = this->key->schedule[0];
-	k1 = this->key->schedule[1];
-	k2 = this->key->schedule[2];
-	k3 = this->key->schedule[3];
-	k4 = this->key->schedule[4];
-	k5 = this->key->schedule[5];
-	k6 = this->key->schedule[6];
-	k7 = this->key->schedule[7];
-	k8 = this->key->schedule[8];
-	k9 = this->key->schedule[9];
-	k10 = this->key->schedule[10];
-	k11 = this->key->schedule[11];
-	k12 = this->key->schedule[12];
-	k13 = this->key->schedule[13];
-	k14 = this->key->schedule[14];
+	ks = this->key->schedule;
 
 	for (i = 0; i < blocks; i++)
 	{
 		d = _mm_loadu_si128(bi + i);
 
		c = _mm_xor_si128(d, c);
-		c = _mm_xor_si128(c, k0);
-		t = _mm_xor_si128(state, k0);
+		c = _mm_xor_si128(c, ks[0]);
+		t = _mm_xor_si128(state, ks[0]);
 
-		c = _mm_aesenc_si128(c, k1);
-		t = _mm_aesenc_si128(t, k1);
-		c = _mm_aesenc_si128(c, k2);
-		t = _mm_aesenc_si128(t, k2);
-		c = _mm_aesenc_si128(c, k3);
-		t = _mm_aesenc_si128(t, k3);
-		c = _mm_aesenc_si128(c, k4);
-		t = _mm_aesenc_si128(t, k4);
-		c = _mm_aesenc_si128(c, k5);
-		t = _mm_aesenc_si128(t, k5);
-		c = _mm_aesenc_si128(c, k6);
-		t = _mm_aesenc_si128(t, k6);
-		c = _mm_aesenc_si128(c, k7);
-		t = _mm_aesenc_si128(t, k7);
-		c = _mm_aesenc_si128(c, k8);
-		t = _mm_aesenc_si128(t, k8);
-		c = _mm_aesenc_si128(c, k9);
-		t = _mm_aesenc_si128(t, k9);
-		c = _mm_aesenc_si128(c, k10);
-		t = _mm_aesenc_si128(t, k10);
-		c = _mm_aesenc_si128(c, k11);
-		t = _mm_aesenc_si128(t, k11);
-		c = _mm_aesenc_si128(c, k12);
-		t = _mm_aesenc_si128(t, k12);
-		c = _mm_aesenc_si128(c, k13);
-		t = _mm_aesenc_si128(t, k13);
+		c = _mm_aesenc_si128(c, ks[1]);
+		t = _mm_aesenc_si128(t, ks[1]);
+		c = _mm_aesenc_si128(c, ks[2]);
+		t = _mm_aesenc_si128(t, ks[2]);
+		c = _mm_aesenc_si128(c, ks[3]);
+		t = _mm_aesenc_si128(t, ks[3]);
+		c = _mm_aesenc_si128(c, ks[4]);
+		t = _mm_aesenc_si128(t, ks[4]);
+		c = _mm_aesenc_si128(c, ks[5]);
+		t = _mm_aesenc_si128(t, ks[5]);
+		c = _mm_aesenc_si128(c, ks[6]);
+		t = _mm_aesenc_si128(t, ks[6]);
+		c = _mm_aesenc_si128(c, ks[7]);
+		t = _mm_aesenc_si128(t, ks[7]);
+		c = _mm_aesenc_si128(c, ks[8]);
+		t = _mm_aesenc_si128(t, ks[8]);
+		c = _mm_aesenc_si128(c, ks[9]);
+		t = _mm_aesenc_si128(t, ks[9]);
+		c = _mm_aesenc_si128(c, ks[10]);
+		t = _mm_aesenc_si128(t, ks[10]);
+		c = _mm_aesenc_si128(c, ks[11]);
+		t = _mm_aesenc_si128(t, ks[11]);
+		c = _mm_aesenc_si128(c, ks[12]);
+		t = _mm_aesenc_si128(t, ks[12]);
+		c = _mm_aesenc_si128(c, ks[13]);
+		t = _mm_aesenc_si128(t, ks[13]);
 
-		c = _mm_aesenclast_si128(c, k14);
-		t = _mm_aesenclast_si128(t, k14);
+		c = _mm_aesenclast_si128(c, ks[14]);
+		t = _mm_aesenclast_si128(t, ks[14]);
 
 		t = _mm_xor_si128(t, d);
 		_mm_storeu_si128(bo + i, t);
@@ -726,8 +667,7 @@ static void decrypt_ccm256(private_aesni_ccm_t *this,
 						   size_t len, u_char *in, u_char *out, u_char *iv,
 						   size_t alen, u_char *assoc, u_char *icv)
 {
-	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
-	__m128i d, t, c, b, state, *bi, *bo;
+	__m128i *ks, d, t, c, b, state, *bi, *bo;
 	u_int blocks, rem, i;
 
 	c = icv_header(this, len, iv, alen, assoc);
@@ -738,64 +678,50 @@ static void decrypt_ccm256(private_aesni_ccm_t *this,
 	bi = (__m128i*)in;
 	bo = (__m128i*)out;
 
-	k0 = this->key->schedule[0];
-	k1 = this->key->schedule[1];
-	k2 = this->key->schedule[2];
-	k3 = this->key->schedule[3];
-	k4 = this->key->schedule[4];
-	k5 = this->key->schedule[5];
-	k6 = this->key->schedule[6];
-	k7 = this->key->schedule[7];
-	k8 = this->key->schedule[8];
-	k9 = this->key->schedule[9];
-	k10 = this->key->schedule[10];
-	k11 = this->key->schedule[11];
-	k12 = this->key->schedule[12];
-	k13 = this->key->schedule[13];
-	k14 = this->key->schedule[14];
+	ks = this->key->schedule;
 
 	for (i = 0; i < blocks; i++)
 	{
 		d = _mm_loadu_si128(bi + i);
 
-		t = _mm_xor_si128(state, k0);
+		t = _mm_xor_si128(state, ks[0]);
 
-		t = _mm_aesenc_si128(t, k1);
-		t = _mm_aesenc_si128(t, k2);
-		t = _mm_aesenc_si128(t, k3);
-		t = _mm_aesenc_si128(t, k4);
-		t = _mm_aesenc_si128(t, k5);
-		t = _mm_aesenc_si128(t, k6);
-		t = _mm_aesenc_si128(t, k7);
-		t = _mm_aesenc_si128(t, k8);
-		t = _mm_aesenc_si128(t, k9);
-		t = _mm_aesenc_si128(t, k10);
-		t = _mm_aesenc_si128(t, k11);
-		t = _mm_aesenc_si128(t, k12);
-		t = _mm_aesenc_si128(t, k13);
+		t = _mm_aesenc_si128(t, ks[1]);
+		t = _mm_aesenc_si128(t, ks[2]);
+		t = _mm_aesenc_si128(t, ks[3]);
+		t = _mm_aesenc_si128(t, ks[4]);
+		t = _mm_aesenc_si128(t, ks[5]);
+		t = _mm_aesenc_si128(t, ks[6]);
+		t = _mm_aesenc_si128(t, ks[7]);
+		t = _mm_aesenc_si128(t, ks[8]);
+		t = _mm_aesenc_si128(t, ks[9]);
+		t = _mm_aesenc_si128(t, ks[10]);
+		t = _mm_aesenc_si128(t, ks[11]);
+		t = _mm_aesenc_si128(t, ks[12]);
+		t = _mm_aesenc_si128(t, ks[13]);
 
-		t = _mm_aesenclast_si128(t, k14);
+		t = _mm_aesenclast_si128(t, ks[14]);
 		t = _mm_xor_si128(t, d);
 		_mm_storeu_si128(bo + i, t);
 
 		c = _mm_xor_si128(t, c);
-		c = _mm_xor_si128(c, k0);
+		c = _mm_xor_si128(c, ks[0]);
 
-		c = _mm_aesenc_si128(c, k1);
-		c = _mm_aesenc_si128(c, k2);
-		c = _mm_aesenc_si128(c, k3);
-		c = _mm_aesenc_si128(c, k4);
-		c = _mm_aesenc_si128(c, k5);
-		c = _mm_aesenc_si128(c, k6);
-		c = _mm_aesenc_si128(c, k7);
-		c = _mm_aesenc_si128(c, k8);
-		c = _mm_aesenc_si128(c, k9);
-		c = _mm_aesenc_si128(c, k10);
-		c = _mm_aesenc_si128(c, k11);
-		c = _mm_aesenc_si128(c, k12);
-		c = _mm_aesenc_si128(c, k13);
+		c = _mm_aesenc_si128(c, ks[1]);
+		c = _mm_aesenc_si128(c, ks[2]);
+		c = _mm_aesenc_si128(c, ks[3]);
+		c = _mm_aesenc_si128(c, ks[4]);
+		c = _mm_aesenc_si128(c, ks[5]);
+		c = _mm_aesenc_si128(c, ks[6]);
+		c = _mm_aesenc_si128(c, ks[7]);
+		c = _mm_aesenc_si128(c, ks[8]);
+		c = _mm_aesenc_si128(c, ks[9]);
+		c = _mm_aesenc_si128(c, ks[10]);
+		c = _mm_aesenc_si128(c, ks[11]);
+		c = _mm_aesenc_si128(c, ks[12]);
+		c = _mm_aesenc_si128(c, ks[13]);
 
-		c = _mm_aesenclast_si128(c, k14);
+		c = _mm_aesenclast_si128(c, ks[14]);
 
 		state = increment_be(state);
 	}
@@ -67,8 +67,7 @@ struct private_mac_t {
 METHOD(mac_t, get_mac, bool,
 	private_mac_t *this, chunk_t data, u_int8_t *out)
 {
-	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
-	__m128i t, l, *bi;
+	__m128i *ks, t, l, *bi;
 	u_int blocks, rem, i;
 
 	if (!this->k)
@@ -76,18 +75,7 @@ METHOD(mac_t, get_mac, bool,
 		return FALSE;
 	}
 
-	k0 = this->k->schedule[0];
-	k1 = this->k->schedule[1];
-	k2 = this->k->schedule[2];
-	k3 = this->k->schedule[3];
-	k4 = this->k->schedule[4];
-	k5 = this->k->schedule[5];
-	k6 = this->k->schedule[6];
-	k7 = this->k->schedule[7];
-	k8 = this->k->schedule[8];
-	k9 = this->k->schedule[9];
-	k10 = this->k->schedule[10];
-
+	ks = this->k->schedule;
 	t = this->t;
 
 	if (this->rem_size + data.len > AES_BLOCK_SIZE)
@@ -105,17 +93,17 @@ METHOD(mac_t, get_mac, bool,
 
 		t = _mm_xor_si128(t, _mm_loadu_si128((__m128i*)this->rem));
 
-		t = _mm_xor_si128(t, k0);
-		t = _mm_aesenc_si128(t, k1);
-		t = _mm_aesenc_si128(t, k2);
-		t = _mm_aesenc_si128(t, k3);
-		t = _mm_aesenc_si128(t, k4);
-		t = _mm_aesenc_si128(t, k5);
-		t = _mm_aesenc_si128(t, k6);
-		t = _mm_aesenc_si128(t, k7);
-		t = _mm_aesenc_si128(t, k8);
-		t = _mm_aesenc_si128(t, k9);
-		t = _mm_aesenclast_si128(t, k10);
+		t = _mm_xor_si128(t, ks[0]);
+		t = _mm_aesenc_si128(t, ks[1]);
+		t = _mm_aesenc_si128(t, ks[2]);
+		t = _mm_aesenc_si128(t, ks[3]);
+		t = _mm_aesenc_si128(t, ks[4]);
+		t = _mm_aesenc_si128(t, ks[5]);
+		t = _mm_aesenc_si128(t, ks[6]);
+		t = _mm_aesenc_si128(t, ks[7]);
+		t = _mm_aesenc_si128(t, ks[8]);
+		t = _mm_aesenc_si128(t, ks[9]);
+		t = _mm_aesenclast_si128(t, ks[10]);
 
 		/* process blocks M_2 ... M_n-1 */
 		bi = (__m128i*)data.ptr;
@@ -132,17 +120,17 @@ METHOD(mac_t, get_mac, bool,
 		{
 			t = _mm_xor_si128(t, _mm_loadu_si128(bi + i));
 
-			t = _mm_xor_si128(t, k0);
-			t = _mm_aesenc_si128(t, k1);
-			t = _mm_aesenc_si128(t, k2);
-			t = _mm_aesenc_si128(t, k3);
-			t = _mm_aesenc_si128(t, k4);
-			t = _mm_aesenc_si128(t, k5);
-			t = _mm_aesenc_si128(t, k6);
-			t = _mm_aesenc_si128(t, k7);
-			t = _mm_aesenc_si128(t, k8);
-			t = _mm_aesenc_si128(t, k9);
-			t = _mm_aesenclast_si128(t, k10);
+			t = _mm_xor_si128(t, ks[0]);
+			t = _mm_aesenc_si128(t, ks[1]);
+			t = _mm_aesenc_si128(t, ks[2]);
+			t = _mm_aesenc_si128(t, ks[3]);
+			t = _mm_aesenc_si128(t, ks[4]);
+			t = _mm_aesenc_si128(t, ks[5]);
+			t = _mm_aesenc_si128(t, ks[6]);
+			t = _mm_aesenc_si128(t, ks[7]);
+			t = _mm_aesenc_si128(t, ks[8]);
+			t = _mm_aesenc_si128(t, ks[9]);
+			t = _mm_aesenclast_si128(t, ks[10]);
 		}
 
 		/* store remaining bytes of block M_n */
@@ -188,17 +176,17 @@ METHOD(mac_t, get_mac, bool,
 	 */
 	t = _mm_xor_si128(l, t);
 
-	t = _mm_xor_si128(t, k0);
-	t = _mm_aesenc_si128(t, k1);
-	t = _mm_aesenc_si128(t, k2);
-	t = _mm_aesenc_si128(t, k3);
-	t = _mm_aesenc_si128(t, k4);
-	t = _mm_aesenc_si128(t, k5);
-	t = _mm_aesenc_si128(t, k6);
-	t = _mm_aesenc_si128(t, k7);
-	t = _mm_aesenc_si128(t, k8);
-	t = _mm_aesenc_si128(t, k9);
-	t = _mm_aesenclast_si128(t, k10);
+	t = _mm_xor_si128(t, ks[0]);
+	t = _mm_aesenc_si128(t, ks[1]);
+	t = _mm_aesenc_si128(t, ks[2]);
+	t = _mm_aesenc_si128(t, ks[3]);
+	t = _mm_aesenc_si128(t, ks[4]);
+	t = _mm_aesenc_si128(t, ks[5]);
+	t = _mm_aesenc_si128(t, ks[6]);
+	t = _mm_aesenc_si128(t, ks[7]);
+	t = _mm_aesenc_si128(t, ks[8]);
+	t = _mm_aesenc_si128(t, ks[9]);
+	t = _mm_aesenclast_si128(t, ks[10]);
 
 	_mm_storeu_si128((__m128i*)out, t);
 
@@ -87,10 +87,9 @@ static inline __m128i increment_be(__m128i x)
 static void encrypt_ctr128(private_aesni_ctr_t *this,
 						   size_t len, u_char *in, u_char *out)
 {
-	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
 	__m128i t1, t2, t3, t4;
 	__m128i d1, d2, d3, d4;
-	__m128i state, b, *bi, *bo;
+	__m128i *ks, state, b, *bi, *bo;
 	u_int i, blocks, pblocks, rem;
 
 	state = _mm_load_si128((__m128i*)&this->state);
@@ -100,17 +99,7 @@ static void encrypt_ctr128(private_aesni_ctr_t *this,
 	bi = (__m128i*)in;
 	bo = (__m128i*)out;
 
-	k0 = this->key->schedule[0];
-	k1 = this->key->schedule[1];
-	k2 = this->key->schedule[2];
-	k3 = this->key->schedule[3];
-	k4 = this->key->schedule[4];
-	k5 = this->key->schedule[5];
-	k6 = this->key->schedule[6];
-	k7 = this->key->schedule[7];
-	k8 = this->key->schedule[8];
-	k9 = this->key->schedule[9];
-	k10 = this->key->schedule[10];
+	ks = this->key->schedule;
 
 	for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
 	{
@@ -119,56 +108,56 @@ static void encrypt_ctr128(private_aesni_ctr_t *this,
d3 = _mm_loadu_si128(bi + i + 2);
d4 = _mm_loadu_si128(bi + i + 3);

t1 = _mm_xor_si128(state, k0);
t1 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
t2 = _mm_xor_si128(state, k0);
t2 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
t3 = _mm_xor_si128(state, k0);
t3 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
t4 = _mm_xor_si128(state, k0);
t4 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);

t1 = _mm_aesenc_si128(t1, k1);
t2 = _mm_aesenc_si128(t2, k1);
t3 = _mm_aesenc_si128(t3, k1);
t4 = _mm_aesenc_si128(t4, k1);
t1 = _mm_aesenc_si128(t1, k2);
t2 = _mm_aesenc_si128(t2, k2);
t3 = _mm_aesenc_si128(t3, k2);
t4 = _mm_aesenc_si128(t4, k2);
t1 = _mm_aesenc_si128(t1, k3);
t2 = _mm_aesenc_si128(t2, k3);
t3 = _mm_aesenc_si128(t3, k3);
t4 = _mm_aesenc_si128(t4, k3);
t1 = _mm_aesenc_si128(t1, k4);
t2 = _mm_aesenc_si128(t2, k4);
t3 = _mm_aesenc_si128(t3, k4);
t4 = _mm_aesenc_si128(t4, k4);
t1 = _mm_aesenc_si128(t1, k5);
t2 = _mm_aesenc_si128(t2, k5);
t3 = _mm_aesenc_si128(t3, k5);
t4 = _mm_aesenc_si128(t4, k5);
t1 = _mm_aesenc_si128(t1, k6);
t2 = _mm_aesenc_si128(t2, k6);
t3 = _mm_aesenc_si128(t3, k6);
t4 = _mm_aesenc_si128(t4, k6);
t1 = _mm_aesenc_si128(t1, k7);
t2 = _mm_aesenc_si128(t2, k7);
t3 = _mm_aesenc_si128(t3, k7);
t4 = _mm_aesenc_si128(t4, k7);
t1 = _mm_aesenc_si128(t1, k8);
t2 = _mm_aesenc_si128(t2, k8);
t3 = _mm_aesenc_si128(t3, k8);
t4 = _mm_aesenc_si128(t4, k8);
t1 = _mm_aesenc_si128(t1, k9);
t2 = _mm_aesenc_si128(t2, k9);
t3 = _mm_aesenc_si128(t3, k9);
t4 = _mm_aesenc_si128(t4, k9);
t1 = _mm_aesenc_si128(t1, ks[1]);
t2 = _mm_aesenc_si128(t2, ks[1]);
t3 = _mm_aesenc_si128(t3, ks[1]);
t4 = _mm_aesenc_si128(t4, ks[1]);
t1 = _mm_aesenc_si128(t1, ks[2]);
t2 = _mm_aesenc_si128(t2, ks[2]);
t3 = _mm_aesenc_si128(t3, ks[2]);
t4 = _mm_aesenc_si128(t4, ks[2]);
t1 = _mm_aesenc_si128(t1, ks[3]);
t2 = _mm_aesenc_si128(t2, ks[3]);
t3 = _mm_aesenc_si128(t3, ks[3]);
t4 = _mm_aesenc_si128(t4, ks[3]);
t1 = _mm_aesenc_si128(t1, ks[4]);
t2 = _mm_aesenc_si128(t2, ks[4]);
t3 = _mm_aesenc_si128(t3, ks[4]);
t4 = _mm_aesenc_si128(t4, ks[4]);
t1 = _mm_aesenc_si128(t1, ks[5]);
t2 = _mm_aesenc_si128(t2, ks[5]);
t3 = _mm_aesenc_si128(t3, ks[5]);
t4 = _mm_aesenc_si128(t4, ks[5]);
t1 = _mm_aesenc_si128(t1, ks[6]);
t2 = _mm_aesenc_si128(t2, ks[6]);
t3 = _mm_aesenc_si128(t3, ks[6]);
t4 = _mm_aesenc_si128(t4, ks[6]);
t1 = _mm_aesenc_si128(t1, ks[7]);
t2 = _mm_aesenc_si128(t2, ks[7]);
t3 = _mm_aesenc_si128(t3, ks[7]);
t4 = _mm_aesenc_si128(t4, ks[7]);
t1 = _mm_aesenc_si128(t1, ks[8]);
t2 = _mm_aesenc_si128(t2, ks[8]);
t3 = _mm_aesenc_si128(t3, ks[8]);
t4 = _mm_aesenc_si128(t4, ks[8]);
t1 = _mm_aesenc_si128(t1, ks[9]);
t2 = _mm_aesenc_si128(t2, ks[9]);
t3 = _mm_aesenc_si128(t3, ks[9]);
t4 = _mm_aesenc_si128(t4, ks[9]);

t1 = _mm_aesenclast_si128(t1, k10);
t2 = _mm_aesenclast_si128(t2, k10);
t3 = _mm_aesenclast_si128(t3, k10);
t4 = _mm_aesenclast_si128(t4, k10);
t1 = _mm_aesenclast_si128(t1, ks[10]);
t2 = _mm_aesenclast_si128(t2, ks[10]);
t3 = _mm_aesenclast_si128(t3, ks[10]);
t4 = _mm_aesenclast_si128(t4, ks[10]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
t3 = _mm_xor_si128(t3, d3);
@@ -183,20 +172,20 @@ static void encrypt_ctr128(private_aesni_ctr_t *this,
{
d1 = _mm_loadu_si128(bi + i);

t1 = _mm_xor_si128(state, k0);
t1 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);

t1 = _mm_aesenc_si128(t1, k1);
t1 = _mm_aesenc_si128(t1, k2);
t1 = _mm_aesenc_si128(t1, k3);
t1 = _mm_aesenc_si128(t1, k4);
t1 = _mm_aesenc_si128(t1, k5);
t1 = _mm_aesenc_si128(t1, k6);
t1 = _mm_aesenc_si128(t1, k7);
t1 = _mm_aesenc_si128(t1, k8);
t1 = _mm_aesenc_si128(t1, k9);
t1 = _mm_aesenc_si128(t1, ks[1]);
t1 = _mm_aesenc_si128(t1, ks[2]);
t1 = _mm_aesenc_si128(t1, ks[3]);
t1 = _mm_aesenc_si128(t1, ks[4]);
t1 = _mm_aesenc_si128(t1, ks[5]);
t1 = _mm_aesenc_si128(t1, ks[6]);
t1 = _mm_aesenc_si128(t1, ks[7]);
t1 = _mm_aesenc_si128(t1, ks[8]);
t1 = _mm_aesenc_si128(t1, ks[9]);

t1 = _mm_aesenclast_si128(t1, k10);
t1 = _mm_aesenclast_si128(t1, ks[10]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
}
@@ -207,19 +196,19 @@ static void encrypt_ctr128(private_aesni_ctr_t *this,
memcpy(&b, bi + blocks, rem);

d1 = _mm_loadu_si128(&b);
t1 = _mm_xor_si128(state, k0);
t1 = _mm_xor_si128(state, ks[0]);

t1 = _mm_aesenc_si128(t1, k1);
t1 = _mm_aesenc_si128(t1, k2);
t1 = _mm_aesenc_si128(t1, k3);
t1 = _mm_aesenc_si128(t1, k4);
t1 = _mm_aesenc_si128(t1, k5);
t1 = _mm_aesenc_si128(t1, k6);
t1 = _mm_aesenc_si128(t1, k7);
t1 = _mm_aesenc_si128(t1, k8);
t1 = _mm_aesenc_si128(t1, k9);
t1 = _mm_aesenc_si128(t1, ks[1]);
t1 = _mm_aesenc_si128(t1, ks[2]);
t1 = _mm_aesenc_si128(t1, ks[3]);
t1 = _mm_aesenc_si128(t1, ks[4]);
t1 = _mm_aesenc_si128(t1, ks[5]);
t1 = _mm_aesenc_si128(t1, ks[6]);
t1 = _mm_aesenc_si128(t1, ks[7]);
t1 = _mm_aesenc_si128(t1, ks[8]);
t1 = _mm_aesenc_si128(t1, ks[9]);

t1 = _mm_aesenclast_si128(t1, k10);
t1 = _mm_aesenclast_si128(t1, ks[10]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(&b, t1);
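The CTR hunks call increment_be() to advance the big-endian counter block; its body is untouched by this commit and not shown in this diff. For context, a typical SSE implementation byte-swaps the counter to little-endian, increments the low 64-bit lane and swaps back. The following is a sketch under that assumption (ctr_increment_be is an illustrative name, not necessarily the exact strongSwan code):

#include <tmmintrin.h>	/* SSSE3 _mm_shuffle_epi8; available on all AES-NI CPUs */

static inline __m128i ctr_increment_be(__m128i x)
{
	const __m128i bswap = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
										7, 6, 5, 4, 3, 2, 1, 0);

	x = _mm_shuffle_epi8(x, bswap);				/* to little-endian */
	x = _mm_add_epi64(x, _mm_set_epi32(0, 0, 0, 1));	/* increment low lane */
	return _mm_shuffle_epi8(x, bswap);			/* back to big-endian */
}

Carry out of the low 64 bits is ignored in this sketch, which is harmless for realistic per-message block counts.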
@@ -233,10 +222,9 @@ static void encrypt_ctr128(private_aesni_ctr_t *this,
static void encrypt_ctr192(private_aesni_ctr_t *this,
size_t len, u_char *in, u_char *out)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
__m128i t1, t2, t3, t4;
__m128i d1, d2, d3, d4;
__m128i state, b, *bi, *bo;
__m128i *ks, state, b, *bi, *bo;
u_int i, blocks, pblocks, rem;

state = _mm_load_si128((__m128i*)&this->state);
@@ -246,19 +234,7 @@ static void encrypt_ctr192(private_aesni_ctr_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;

k0 = this->key->schedule[0];
k1 = this->key->schedule[1];
k2 = this->key->schedule[2];
k3 = this->key->schedule[3];
k4 = this->key->schedule[4];
k5 = this->key->schedule[5];
k6 = this->key->schedule[6];
k7 = this->key->schedule[7];
k8 = this->key->schedule[8];
k9 = this->key->schedule[9];
k10 = this->key->schedule[10];
k11 = this->key->schedule[11];
k12 = this->key->schedule[12];
ks = this->key->schedule;

for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
{
@@ -267,64 +243,64 @@ static void encrypt_ctr192(private_aesni_ctr_t *this,
d3 = _mm_loadu_si128(bi + i + 2);
d4 = _mm_loadu_si128(bi + i + 3);

t1 = _mm_xor_si128(state, k0);
t1 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
t2 = _mm_xor_si128(state, k0);
t2 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
t3 = _mm_xor_si128(state, k0);
t3 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
t4 = _mm_xor_si128(state, k0);
t4 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);

t1 = _mm_aesenc_si128(t1, k1);
t2 = _mm_aesenc_si128(t2, k1);
t3 = _mm_aesenc_si128(t3, k1);
t4 = _mm_aesenc_si128(t4, k1);
t1 = _mm_aesenc_si128(t1, k2);
t2 = _mm_aesenc_si128(t2, k2);
t3 = _mm_aesenc_si128(t3, k2);
t4 = _mm_aesenc_si128(t4, k2);
t1 = _mm_aesenc_si128(t1, k3);
t2 = _mm_aesenc_si128(t2, k3);
t3 = _mm_aesenc_si128(t3, k3);
t4 = _mm_aesenc_si128(t4, k3);
t1 = _mm_aesenc_si128(t1, k4);
t2 = _mm_aesenc_si128(t2, k4);
t3 = _mm_aesenc_si128(t3, k4);
t4 = _mm_aesenc_si128(t4, k4);
t1 = _mm_aesenc_si128(t1, k5);
t2 = _mm_aesenc_si128(t2, k5);
t3 = _mm_aesenc_si128(t3, k5);
t4 = _mm_aesenc_si128(t4, k5);
t1 = _mm_aesenc_si128(t1, k6);
t2 = _mm_aesenc_si128(t2, k6);
t3 = _mm_aesenc_si128(t3, k6);
t4 = _mm_aesenc_si128(t4, k6);
t1 = _mm_aesenc_si128(t1, k7);
t2 = _mm_aesenc_si128(t2, k7);
t3 = _mm_aesenc_si128(t3, k7);
t4 = _mm_aesenc_si128(t4, k7);
t1 = _mm_aesenc_si128(t1, k8);
t2 = _mm_aesenc_si128(t2, k8);
t3 = _mm_aesenc_si128(t3, k8);
t4 = _mm_aesenc_si128(t4, k8);
t1 = _mm_aesenc_si128(t1, k9);
t2 = _mm_aesenc_si128(t2, k9);
t3 = _mm_aesenc_si128(t3, k9);
t4 = _mm_aesenc_si128(t4, k9);
t1 = _mm_aesenc_si128(t1, k10);
t2 = _mm_aesenc_si128(t2, k10);
t3 = _mm_aesenc_si128(t3, k10);
t4 = _mm_aesenc_si128(t4, k10);
t1 = _mm_aesenc_si128(t1, k11);
t2 = _mm_aesenc_si128(t2, k11);
t3 = _mm_aesenc_si128(t3, k11);
t4 = _mm_aesenc_si128(t4, k11);
t1 = _mm_aesenc_si128(t1, ks[1]);
t2 = _mm_aesenc_si128(t2, ks[1]);
t3 = _mm_aesenc_si128(t3, ks[1]);
t4 = _mm_aesenc_si128(t4, ks[1]);
t1 = _mm_aesenc_si128(t1, ks[2]);
t2 = _mm_aesenc_si128(t2, ks[2]);
t3 = _mm_aesenc_si128(t3, ks[2]);
t4 = _mm_aesenc_si128(t4, ks[2]);
t1 = _mm_aesenc_si128(t1, ks[3]);
t2 = _mm_aesenc_si128(t2, ks[3]);
t3 = _mm_aesenc_si128(t3, ks[3]);
t4 = _mm_aesenc_si128(t4, ks[3]);
t1 = _mm_aesenc_si128(t1, ks[4]);
t2 = _mm_aesenc_si128(t2, ks[4]);
t3 = _mm_aesenc_si128(t3, ks[4]);
t4 = _mm_aesenc_si128(t4, ks[4]);
t1 = _mm_aesenc_si128(t1, ks[5]);
t2 = _mm_aesenc_si128(t2, ks[5]);
t3 = _mm_aesenc_si128(t3, ks[5]);
t4 = _mm_aesenc_si128(t4, ks[5]);
t1 = _mm_aesenc_si128(t1, ks[6]);
t2 = _mm_aesenc_si128(t2, ks[6]);
t3 = _mm_aesenc_si128(t3, ks[6]);
t4 = _mm_aesenc_si128(t4, ks[6]);
t1 = _mm_aesenc_si128(t1, ks[7]);
t2 = _mm_aesenc_si128(t2, ks[7]);
t3 = _mm_aesenc_si128(t3, ks[7]);
t4 = _mm_aesenc_si128(t4, ks[7]);
t1 = _mm_aesenc_si128(t1, ks[8]);
t2 = _mm_aesenc_si128(t2, ks[8]);
t3 = _mm_aesenc_si128(t3, ks[8]);
t4 = _mm_aesenc_si128(t4, ks[8]);
t1 = _mm_aesenc_si128(t1, ks[9]);
t2 = _mm_aesenc_si128(t2, ks[9]);
t3 = _mm_aesenc_si128(t3, ks[9]);
t4 = _mm_aesenc_si128(t4, ks[9]);
t1 = _mm_aesenc_si128(t1, ks[10]);
t2 = _mm_aesenc_si128(t2, ks[10]);
t3 = _mm_aesenc_si128(t3, ks[10]);
t4 = _mm_aesenc_si128(t4, ks[10]);
t1 = _mm_aesenc_si128(t1, ks[11]);
t2 = _mm_aesenc_si128(t2, ks[11]);
t3 = _mm_aesenc_si128(t3, ks[11]);
t4 = _mm_aesenc_si128(t4, ks[11]);

t1 = _mm_aesenclast_si128(t1, k12);
t2 = _mm_aesenclast_si128(t2, k12);
t3 = _mm_aesenclast_si128(t3, k12);
t4 = _mm_aesenclast_si128(t4, k12);
t1 = _mm_aesenclast_si128(t1, ks[12]);
t2 = _mm_aesenclast_si128(t2, ks[12]);
t3 = _mm_aesenclast_si128(t3, ks[12]);
t4 = _mm_aesenclast_si128(t4, ks[12]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
t3 = _mm_xor_si128(t3, d3);
@@ -339,22 +315,22 @@ static void encrypt_ctr192(private_aesni_ctr_t *this,
{
d1 = _mm_loadu_si128(bi + i);

t1 = _mm_xor_si128(state, k0);
t1 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);

t1 = _mm_aesenc_si128(t1, k1);
t1 = _mm_aesenc_si128(t1, k2);
t1 = _mm_aesenc_si128(t1, k3);
t1 = _mm_aesenc_si128(t1, k4);
t1 = _mm_aesenc_si128(t1, k5);
t1 = _mm_aesenc_si128(t1, k6);
t1 = _mm_aesenc_si128(t1, k7);
t1 = _mm_aesenc_si128(t1, k8);
t1 = _mm_aesenc_si128(t1, k9);
t1 = _mm_aesenc_si128(t1, k10);
t1 = _mm_aesenc_si128(t1, k11);
t1 = _mm_aesenc_si128(t1, ks[1]);
t1 = _mm_aesenc_si128(t1, ks[2]);
t1 = _mm_aesenc_si128(t1, ks[3]);
t1 = _mm_aesenc_si128(t1, ks[4]);
t1 = _mm_aesenc_si128(t1, ks[5]);
t1 = _mm_aesenc_si128(t1, ks[6]);
t1 = _mm_aesenc_si128(t1, ks[7]);
t1 = _mm_aesenc_si128(t1, ks[8]);
t1 = _mm_aesenc_si128(t1, ks[9]);
t1 = _mm_aesenc_si128(t1, ks[10]);
t1 = _mm_aesenc_si128(t1, ks[11]);

t1 = _mm_aesenclast_si128(t1, k12);
t1 = _mm_aesenclast_si128(t1, ks[12]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
}
@@ -365,21 +341,21 @@ static void encrypt_ctr192(private_aesni_ctr_t *this,
memcpy(&b, bi + blocks, rem);

d1 = _mm_loadu_si128(&b);
t1 = _mm_xor_si128(state, k0);
t1 = _mm_xor_si128(state, ks[0]);

t1 = _mm_aesenc_si128(t1, k1);
t1 = _mm_aesenc_si128(t1, k2);
t1 = _mm_aesenc_si128(t1, k3);
t1 = _mm_aesenc_si128(t1, k4);
t1 = _mm_aesenc_si128(t1, k5);
t1 = _mm_aesenc_si128(t1, k6);
t1 = _mm_aesenc_si128(t1, k7);
t1 = _mm_aesenc_si128(t1, k8);
t1 = _mm_aesenc_si128(t1, k9);
t1 = _mm_aesenc_si128(t1, k10);
t1 = _mm_aesenc_si128(t1, k11);
t1 = _mm_aesenc_si128(t1, ks[1]);
t1 = _mm_aesenc_si128(t1, ks[2]);
t1 = _mm_aesenc_si128(t1, ks[3]);
t1 = _mm_aesenc_si128(t1, ks[4]);
t1 = _mm_aesenc_si128(t1, ks[5]);
t1 = _mm_aesenc_si128(t1, ks[6]);
t1 = _mm_aesenc_si128(t1, ks[7]);
t1 = _mm_aesenc_si128(t1, ks[8]);
t1 = _mm_aesenc_si128(t1, ks[9]);
t1 = _mm_aesenc_si128(t1, ks[10]);
t1 = _mm_aesenc_si128(t1, ks[11]);

t1 = _mm_aesenclast_si128(t1, k12);
t1 = _mm_aesenclast_si128(t1, ks[12]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(&b, t1);
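The same rewrite follows for the larger key sizes, and this is where it matters most: AES-192 uses 13 round keys (ks[0] through ks[12]) and AES-256 uses 15 (ks[0] through ks[14]), while the four-block loops additionally keep t1-t4, d1-d4 and the counter state live. Against 16 XMM registers, the old per-key locals could not all stay in registers, so precisely these key schedules were the ones being spilled.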
@@ -393,10 +369,9 @@ static void encrypt_ctr192(private_aesni_ctr_t *this,
static void encrypt_ctr256(private_aesni_ctr_t *this,
size_t len, u_char *in, u_char *out)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
__m128i t1, t2, t3, t4;
__m128i d1, d2, d3, d4;
__m128i state, b, *bi, *bo;
__m128i *ks, state, b, *bi, *bo;
u_int i, blocks, pblocks, rem;

state = _mm_load_si128((__m128i*)&this->state);
@@ -406,21 +381,7 @@ static void encrypt_ctr256(private_aesni_ctr_t *this,
bi = (__m128i*)in;
bo = (__m128i*)out;

k0 = this->key->schedule[0];
k1 = this->key->schedule[1];
k2 = this->key->schedule[2];
k3 = this->key->schedule[3];
k4 = this->key->schedule[4];
k5 = this->key->schedule[5];
k6 = this->key->schedule[6];
k7 = this->key->schedule[7];
k8 = this->key->schedule[8];
k9 = this->key->schedule[9];
k10 = this->key->schedule[10];
k11 = this->key->schedule[11];
k12 = this->key->schedule[12];
k13 = this->key->schedule[13];
k14 = this->key->schedule[14];
ks = this->key->schedule;

for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
{
@@ -429,72 +390,72 @@ static void encrypt_ctr256(private_aesni_ctr_t *this,
d3 = _mm_loadu_si128(bi + i + 2);
d4 = _mm_loadu_si128(bi + i + 3);

t1 = _mm_xor_si128(state, k0);
t1 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
t2 = _mm_xor_si128(state, k0);
t2 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
t3 = _mm_xor_si128(state, k0);
t3 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
t4 = _mm_xor_si128(state, k0);
t4 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);

t1 = _mm_aesenc_si128(t1, k1);
t2 = _mm_aesenc_si128(t2, k1);
t3 = _mm_aesenc_si128(t3, k1);
t4 = _mm_aesenc_si128(t4, k1);
t1 = _mm_aesenc_si128(t1, k2);
t2 = _mm_aesenc_si128(t2, k2);
t3 = _mm_aesenc_si128(t3, k2);
t4 = _mm_aesenc_si128(t4, k2);
t1 = _mm_aesenc_si128(t1, k3);
t2 = _mm_aesenc_si128(t2, k3);
t3 = _mm_aesenc_si128(t3, k3);
t4 = _mm_aesenc_si128(t4, k3);
t1 = _mm_aesenc_si128(t1, k4);
t2 = _mm_aesenc_si128(t2, k4);
t3 = _mm_aesenc_si128(t3, k4);
t4 = _mm_aesenc_si128(t4, k4);
t1 = _mm_aesenc_si128(t1, k5);
t2 = _mm_aesenc_si128(t2, k5);
t3 = _mm_aesenc_si128(t3, k5);
t4 = _mm_aesenc_si128(t4, k5);
t1 = _mm_aesenc_si128(t1, k6);
t2 = _mm_aesenc_si128(t2, k6);
t3 = _mm_aesenc_si128(t3, k6);
t4 = _mm_aesenc_si128(t4, k6);
t1 = _mm_aesenc_si128(t1, k7);
t2 = _mm_aesenc_si128(t2, k7);
t3 = _mm_aesenc_si128(t3, k7);
t4 = _mm_aesenc_si128(t4, k7);
t1 = _mm_aesenc_si128(t1, k8);
t2 = _mm_aesenc_si128(t2, k8);
t3 = _mm_aesenc_si128(t3, k8);
t4 = _mm_aesenc_si128(t4, k8);
t1 = _mm_aesenc_si128(t1, k9);
t2 = _mm_aesenc_si128(t2, k9);
t3 = _mm_aesenc_si128(t3, k9);
t4 = _mm_aesenc_si128(t4, k9);
t1 = _mm_aesenc_si128(t1, k10);
t2 = _mm_aesenc_si128(t2, k10);
t3 = _mm_aesenc_si128(t3, k10);
t4 = _mm_aesenc_si128(t4, k10);
t1 = _mm_aesenc_si128(t1, k11);
t2 = _mm_aesenc_si128(t2, k11);
t3 = _mm_aesenc_si128(t3, k11);
t4 = _mm_aesenc_si128(t4, k11);
t1 = _mm_aesenc_si128(t1, k12);
t2 = _mm_aesenc_si128(t2, k12);
t3 = _mm_aesenc_si128(t3, k12);
t4 = _mm_aesenc_si128(t4, k12);
t1 = _mm_aesenc_si128(t1, k13);
t2 = _mm_aesenc_si128(t2, k13);
t3 = _mm_aesenc_si128(t3, k13);
t4 = _mm_aesenc_si128(t4, k13);
t1 = _mm_aesenc_si128(t1, ks[1]);
t2 = _mm_aesenc_si128(t2, ks[1]);
t3 = _mm_aesenc_si128(t3, ks[1]);
t4 = _mm_aesenc_si128(t4, ks[1]);
t1 = _mm_aesenc_si128(t1, ks[2]);
t2 = _mm_aesenc_si128(t2, ks[2]);
t3 = _mm_aesenc_si128(t3, ks[2]);
t4 = _mm_aesenc_si128(t4, ks[2]);
t1 = _mm_aesenc_si128(t1, ks[3]);
t2 = _mm_aesenc_si128(t2, ks[3]);
t3 = _mm_aesenc_si128(t3, ks[3]);
t4 = _mm_aesenc_si128(t4, ks[3]);
t1 = _mm_aesenc_si128(t1, ks[4]);
t2 = _mm_aesenc_si128(t2, ks[4]);
t3 = _mm_aesenc_si128(t3, ks[4]);
t4 = _mm_aesenc_si128(t4, ks[4]);
t1 = _mm_aesenc_si128(t1, ks[5]);
t2 = _mm_aesenc_si128(t2, ks[5]);
t3 = _mm_aesenc_si128(t3, ks[5]);
t4 = _mm_aesenc_si128(t4, ks[5]);
t1 = _mm_aesenc_si128(t1, ks[6]);
t2 = _mm_aesenc_si128(t2, ks[6]);
t3 = _mm_aesenc_si128(t3, ks[6]);
t4 = _mm_aesenc_si128(t4, ks[6]);
t1 = _mm_aesenc_si128(t1, ks[7]);
t2 = _mm_aesenc_si128(t2, ks[7]);
t3 = _mm_aesenc_si128(t3, ks[7]);
t4 = _mm_aesenc_si128(t4, ks[7]);
t1 = _mm_aesenc_si128(t1, ks[8]);
t2 = _mm_aesenc_si128(t2, ks[8]);
t3 = _mm_aesenc_si128(t3, ks[8]);
t4 = _mm_aesenc_si128(t4, ks[8]);
t1 = _mm_aesenc_si128(t1, ks[9]);
t2 = _mm_aesenc_si128(t2, ks[9]);
t3 = _mm_aesenc_si128(t3, ks[9]);
t4 = _mm_aesenc_si128(t4, ks[9]);
t1 = _mm_aesenc_si128(t1, ks[10]);
t2 = _mm_aesenc_si128(t2, ks[10]);
t3 = _mm_aesenc_si128(t3, ks[10]);
t4 = _mm_aesenc_si128(t4, ks[10]);
t1 = _mm_aesenc_si128(t1, ks[11]);
t2 = _mm_aesenc_si128(t2, ks[11]);
t3 = _mm_aesenc_si128(t3, ks[11]);
t4 = _mm_aesenc_si128(t4, ks[11]);
t1 = _mm_aesenc_si128(t1, ks[12]);
t2 = _mm_aesenc_si128(t2, ks[12]);
t3 = _mm_aesenc_si128(t3, ks[12]);
t4 = _mm_aesenc_si128(t4, ks[12]);
t1 = _mm_aesenc_si128(t1, ks[13]);
t2 = _mm_aesenc_si128(t2, ks[13]);
t3 = _mm_aesenc_si128(t3, ks[13]);
t4 = _mm_aesenc_si128(t4, ks[13]);

t1 = _mm_aesenclast_si128(t1, k14);
t2 = _mm_aesenclast_si128(t2, k14);
t3 = _mm_aesenclast_si128(t3, k14);
t4 = _mm_aesenclast_si128(t4, k14);
t1 = _mm_aesenclast_si128(t1, ks[14]);
t2 = _mm_aesenclast_si128(t2, ks[14]);
t3 = _mm_aesenclast_si128(t3, ks[14]);
t4 = _mm_aesenclast_si128(t4, ks[14]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
t3 = _mm_xor_si128(t3, d3);
@@ -509,24 +470,24 @@ static void encrypt_ctr256(private_aesni_ctr_t *this,
{
d1 = _mm_loadu_si128(bi + i);

t1 = _mm_xor_si128(state, k0);
t1 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);

t1 = _mm_aesenc_si128(t1, k1);
t1 = _mm_aesenc_si128(t1, k2);
t1 = _mm_aesenc_si128(t1, k3);
t1 = _mm_aesenc_si128(t1, k4);
t1 = _mm_aesenc_si128(t1, k5);
t1 = _mm_aesenc_si128(t1, k6);
t1 = _mm_aesenc_si128(t1, k7);
t1 = _mm_aesenc_si128(t1, k8);
t1 = _mm_aesenc_si128(t1, k9);
t1 = _mm_aesenc_si128(t1, k10);
t1 = _mm_aesenc_si128(t1, k11);
t1 = _mm_aesenc_si128(t1, k12);
t1 = _mm_aesenc_si128(t1, k13);
t1 = _mm_aesenc_si128(t1, ks[1]);
t1 = _mm_aesenc_si128(t1, ks[2]);
t1 = _mm_aesenc_si128(t1, ks[3]);
t1 = _mm_aesenc_si128(t1, ks[4]);
t1 = _mm_aesenc_si128(t1, ks[5]);
t1 = _mm_aesenc_si128(t1, ks[6]);
t1 = _mm_aesenc_si128(t1, ks[7]);
t1 = _mm_aesenc_si128(t1, ks[8]);
t1 = _mm_aesenc_si128(t1, ks[9]);
t1 = _mm_aesenc_si128(t1, ks[10]);
t1 = _mm_aesenc_si128(t1, ks[11]);
t1 = _mm_aesenc_si128(t1, ks[12]);
t1 = _mm_aesenc_si128(t1, ks[13]);

t1 = _mm_aesenclast_si128(t1, k14);
t1 = _mm_aesenclast_si128(t1, ks[14]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
}
@@ -537,23 +498,23 @@ static void encrypt_ctr256(private_aesni_ctr_t *this,
memcpy(&b, bi + blocks, rem);

d1 = _mm_loadu_si128(&b);
t1 = _mm_xor_si128(state, k0);
t1 = _mm_xor_si128(state, ks[0]);

t1 = _mm_aesenc_si128(t1, k1);
t1 = _mm_aesenc_si128(t1, k2);
t1 = _mm_aesenc_si128(t1, k3);
t1 = _mm_aesenc_si128(t1, k4);
t1 = _mm_aesenc_si128(t1, k5);
t1 = _mm_aesenc_si128(t1, k6);
t1 = _mm_aesenc_si128(t1, k7);
t1 = _mm_aesenc_si128(t1, k8);
t1 = _mm_aesenc_si128(t1, k9);
t1 = _mm_aesenc_si128(t1, k10);
t1 = _mm_aesenc_si128(t1, k11);
t1 = _mm_aesenc_si128(t1, k12);
t1 = _mm_aesenc_si128(t1, k13);
t1 = _mm_aesenc_si128(t1, ks[1]);
t1 = _mm_aesenc_si128(t1, ks[2]);
t1 = _mm_aesenc_si128(t1, ks[3]);
t1 = _mm_aesenc_si128(t1, ks[4]);
t1 = _mm_aesenc_si128(t1, ks[5]);
t1 = _mm_aesenc_si128(t1, ks[6]);
t1 = _mm_aesenc_si128(t1, ks[7]);
t1 = _mm_aesenc_si128(t1, ks[8]);
t1 = _mm_aesenc_si128(t1, ks[9]);
t1 = _mm_aesenc_si128(t1, ks[10]);
t1 = _mm_aesenc_si128(t1, ks[11]);
t1 = _mm_aesenc_si128(t1, ks[12]);
t1 = _mm_aesenc_si128(t1, ks[13]);

t1 = _mm_aesenclast_si128(t1, k14);
t1 = _mm_aesenclast_si128(t1, ks[14]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(&b, t1);
File diff suppressed because it is too large
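The suppressed diff is presumably the GCM backend (aesni_gcm.c), which receives the same pointer-indexing treatment for its key material. To illustrate the cleanup burden that stack-resident key copies would otherwise create, here is a hypothetical sketch (crypt_with_local_keys is a made-up name; none of this is strongSwan code):

#include <string.h>
#include <emmintrin.h>

/* Hypothetical: keys copied into locals may be spilled to the stack by
 * the compiler, so the frame has to be wiped explicitly after use. */
static void crypt_with_local_keys(const __m128i *schedule, int nkeys)
{
	__m128i k[15];

	memcpy(k, schedule, nkeys * sizeof(k[0]));
	/* ... AES rounds using k[0] .. k[nkeys - 1] ... */
	memset(k, 0, sizeof(k));	/* a plain memset before return may be
					 * optimized away; real code needs a
					 * non-elidable zeroization helper */
}

Indexing through the schedule pointer sidesteps the problem: there is no key copy in the frame to begin with.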
@@ -72,8 +72,7 @@ struct private_aesni_mac_t {
METHOD(mac_t, get_mac, bool,
private_aesni_mac_t *this, chunk_t data, u_int8_t *out)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
__m128i e, *bi;
__m128i *ks, e, *bi;
u_int blocks, rem, i;

if (!this->k1)
@@ -81,17 +80,7 @@ METHOD(mac_t, get_mac, bool,
return FALSE;
}

k0 = this->k1->schedule[0];
k1 = this->k1->schedule[1];
k2 = this->k1->schedule[2];
k3 = this->k1->schedule[3];
k4 = this->k1->schedule[4];
k5 = this->k1->schedule[5];
k6 = this->k1->schedule[6];
k7 = this->k1->schedule[7];
k8 = this->k1->schedule[8];
k9 = this->k1->schedule[9];
k10 = this->k1->schedule[10];
ks = this->k1->schedule;

e = this->e;
@@ -114,17 +103,17 @@ METHOD(mac_t, get_mac, bool,
e = _mm_xor_si128(e, _mm_loadu_si128((__m128i*)this->rem));

e = _mm_xor_si128(e, k0);
e = _mm_aesenc_si128(e, k1);
e = _mm_aesenc_si128(e, k2);
e = _mm_aesenc_si128(e, k3);
e = _mm_aesenc_si128(e, k4);
e = _mm_aesenc_si128(e, k5);
e = _mm_aesenc_si128(e, k6);
e = _mm_aesenc_si128(e, k7);
e = _mm_aesenc_si128(e, k8);
e = _mm_aesenc_si128(e, k9);
e = _mm_aesenclast_si128(e, k10);
e = _mm_xor_si128(e, ks[0]);
e = _mm_aesenc_si128(e, ks[1]);
e = _mm_aesenc_si128(e, ks[2]);
e = _mm_aesenc_si128(e, ks[3]);
e = _mm_aesenc_si128(e, ks[4]);
e = _mm_aesenc_si128(e, ks[5]);
e = _mm_aesenc_si128(e, ks[6]);
e = _mm_aesenc_si128(e, ks[7]);
e = _mm_aesenc_si128(e, ks[8]);
e = _mm_aesenc_si128(e, ks[9]);
e = _mm_aesenclast_si128(e, ks[10]);

bi = (__m128i*)data.ptr;
rem = data.len % AES_BLOCK_SIZE;
@@ -140,17 +129,17 @@ METHOD(mac_t, get_mac, bool,
{
e = _mm_xor_si128(e, _mm_loadu_si128(bi + i));

e = _mm_xor_si128(e, k0);
e = _mm_aesenc_si128(e, k1);
e = _mm_aesenc_si128(e, k2);
e = _mm_aesenc_si128(e, k3);
e = _mm_aesenc_si128(e, k4);
e = _mm_aesenc_si128(e, k5);
e = _mm_aesenc_si128(e, k6);
e = _mm_aesenc_si128(e, k7);
e = _mm_aesenc_si128(e, k8);
e = _mm_aesenc_si128(e, k9);
e = _mm_aesenclast_si128(e, k10);
e = _mm_xor_si128(e, ks[0]);
e = _mm_aesenc_si128(e, ks[1]);
e = _mm_aesenc_si128(e, ks[2]);
e = _mm_aesenc_si128(e, ks[3]);
e = _mm_aesenc_si128(e, ks[4]);
e = _mm_aesenc_si128(e, ks[5]);
e = _mm_aesenc_si128(e, ks[6]);
e = _mm_aesenc_si128(e, ks[7]);
e = _mm_aesenc_si128(e, ks[8]);
e = _mm_aesenc_si128(e, ks[9]);
e = _mm_aesenclast_si128(e, ks[10]);
}

/* store remaining bytes of block M[n] */
@@ -196,17 +185,17 @@ METHOD(mac_t, get_mac, bool,
}
e = _mm_xor_si128(e, _mm_loadu_si128((__m128i*)this->rem));

e = _mm_xor_si128(e, k0);
e = _mm_aesenc_si128(e, k1);
e = _mm_aesenc_si128(e, k2);
e = _mm_aesenc_si128(e, k3);
e = _mm_aesenc_si128(e, k4);
e = _mm_aesenc_si128(e, k5);
e = _mm_aesenc_si128(e, k6);
e = _mm_aesenc_si128(e, k7);
e = _mm_aesenc_si128(e, k8);
e = _mm_aesenc_si128(e, k9);
e = _mm_aesenclast_si128(e, k10);
e = _mm_xor_si128(e, ks[0]);
e = _mm_aesenc_si128(e, ks[1]);
e = _mm_aesenc_si128(e, ks[2]);
e = _mm_aesenc_si128(e, ks[3]);
e = _mm_aesenc_si128(e, ks[4]);
e = _mm_aesenc_si128(e, ks[5]);
e = _mm_aesenc_si128(e, ks[6]);
e = _mm_aesenc_si128(e, ks[7]);
e = _mm_aesenc_si128(e, ks[8]);
e = _mm_aesenc_si128(e, ks[9]);
e = _mm_aesenclast_si128(e, ks[10]);
_mm_storeu_si128((__m128i*)out, e);

/* (2) Define E[0] = 0x00000000000000000000000000000000 */
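Unlike the CTR loops, this MAC code is inherently serial: each message block is XORed into the running value e before it is encrypted, so one block cannot start until the previous one has finished and there is no four-way parallel variant. The register savings here come purely from dropping the eleven key locals.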