From bd0aa66e454246a1781c27f6d82273e62e121110 Mon Sep 17 00:00:00 2001
From: Martin Willi <martin@revosec.ch>
Date: Tue, 7 Apr 2015 18:12:52 +0200
Subject: [PATCH] curve22519: Add a portable backend implemented in plain C

---
 .../plugins/curve25519/Makefile.am            |   1 +
 .../plugins/curve25519/curve25519_drv.c       |   2 +
 .../curve25519/curve25519_drv_portable.c      | 613 ++++++++++++++++++
 .../curve25519/curve25519_drv_portable.h      |  31 +
 4 files changed, 647 insertions(+)
 create mode 100644 src/libstrongswan/plugins/curve25519/curve25519_drv_portable.c
 create mode 100644 src/libstrongswan/plugins/curve25519/curve25519_drv_portable.h

diff --git a/src/libstrongswan/plugins/curve25519/Makefile.am b/src/libstrongswan/plugins/curve25519/Makefile.am
index a8003128a..e97a61c2c 100644
--- a/src/libstrongswan/plugins/curve25519/Makefile.am
+++ b/src/libstrongswan/plugins/curve25519/Makefile.am
@@ -13,6 +13,7 @@ endif
 libstrongswan_curve25519_la_SOURCES = \
 	curve25519_dh.h curve25519_dh.c \
 	curve25519_drv.h curve25519_drv.c \
+	curve25519_drv_portable.h curve25519_drv_portable.c \
 	curve25519_plugin.h curve25519_plugin.c
 
 libstrongswan_curve25519_la_LDFLAGS = -module -avoid-version
diff --git a/src/libstrongswan/plugins/curve25519/curve25519_drv.c b/src/libstrongswan/plugins/curve25519/curve25519_drv.c
index 438a24b40..df39e711f 100644
--- a/src/libstrongswan/plugins/curve25519/curve25519_drv.c
+++ b/src/libstrongswan/plugins/curve25519/curve25519_drv.c
@@ -14,6 +14,7 @@
  */
 
 #include "curve25519_drv.h"
+#include "curve25519_drv_portable.h"
 
 typedef curve25519_drv_t*(*curve25519_drv_create)();
 
@@ -23,6 +24,7 @@ typedef curve25519_drv_t*(*curve25519_drv_create)();
 curve25519_drv_t *curve25519_drv_probe()
 {
 	curve25519_drv_create drivers[] = {
+		curve25519_drv_portable_create,
 	};
 	curve25519_drv_t *driver;
 	int i;
diff --git a/src/libstrongswan/plugins/curve25519/curve25519_drv_portable.c b/src/libstrongswan/plugins/curve25519/curve25519_drv_portable.c
new file mode 100644
index 000000000..9182de5a7
--- /dev/null
+++ b/src/libstrongswan/plugins/curve25519/curve25519_drv_portable.c
@@ -0,0 +1,613 @@
+/*
+ * Copyright (C) 2015 Martin Willi
+ * Copyright (C) 2015 revosec AG
+ *
+ * Based on public domain code by Andrew Moon (curve22519-donna).
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include "curve25519_drv_portable.h"
+
+typedef struct private_curve25519_drv_t private_curve25519_drv_t;
+
+/**
+ * Private data of an curve25519_drv_portable_t object.
+ */
+struct private_curve25519_drv_t {
+
+	/**
+	 * Public curve25519_drv_t interface.
+	 */
+	curve25519_drv_t public;
+
+	/**
+	 * Private key
+	 */
+	u_char key[CURVE25519_KEY_SIZE];
+};
+
+METHOD(curve25519_drv_t, set_key, bool,
+	private_curve25519_drv_t *this, u_char *key)
+{
+	memcpy(this->key, key, sizeof(this->key));
+
+	this->key[0] &= 0xf8;
+	this->key[31] &= 0x7f;
+	this->key[31] |= 0x40;
+	return TRUE;
+}
+
+/**
+ * OR a 32-bit integer to an unaligned little-endian
+ */
+static inline void horule32(void *p, uint32_t x)
+{
+	uint32_t r;
+
+	memcpy(&r, p, sizeof(r));
+	r |= htole32(x);
+	memcpy(p, &r, sizeof(r));
+}
+
+/**
+ * Reduce a 32-bit integer to 26 bits
+ */
+static inline uint32_t rdc26(uint32_t v)
+{
+	return v & ((1 << 26) - 1);
+}
+
+/**
+ * Reduce a 32-bit integer to 25 bits
+ */
+static inline uint32_t rdc25(uint32_t v)
+{
+	return v & ((1 << 25) - 1);
+}
+
+/**
+ * Shift right a 64-bit integer by 26 bits
+ */
+static inline uint32_t sr26(uint64_t v)
+{
+	return v >> 26;
+}
+
+/**
+ * Shift right a 64-bit integer by 25 bits
+ */
+static inline uint32_t sr25(uint64_t v)
+{
+	return v >> 25;
+}
+
+/**
+ * Multiply a 64-bit integer with a 32-bit integer
+ */
+static inline uint64_t mul64(uint64_t a, uint32_t b)
+{
+	return a * b;
+}
+
+/**
+ * out = a + b
+ */
+static inline void add(uint32_t out[10], uint32_t a[10], uint32_t b[10])
+{
+	out[0] = a[0] + b[0];
+	out[1] = a[1] + b[1];
+	out[2] = a[2] + b[2];
+	out[3] = a[3] + b[3];
+	out[4] = a[4] + b[4];
+	out[5] = a[5] + b[5];
+	out[6] = a[6] + b[6];
+	out[7] = a[7] + b[7];
+	out[8] = a[8] + b[8];
+	out[9] = a[9] + b[9];
+}
+
+/**
+ * out = a - b
+ */
+static inline void sub(uint32_t out[10], uint32_t a[10], uint32_t b[10])
+{
+	uint32_t x;
+
+	x = 0x7ffffda + a[0] - b[0];           out[0] = rdc26(x);
+	x = 0x3fffffe + a[1] - b[1] + sr26(x); out[1] = rdc25(x);
+	x = 0x7fffffe + a[2] - b[2] + sr25(x); out[2] = rdc26(x);
+	x = 0x3fffffe + a[3] - b[3] + sr26(x); out[3] = rdc25(x);
+	x = 0x7fffffe + a[4] - b[4] + sr25(x); out[4] = rdc26(x);
+	x = 0x3fffffe + a[5] - b[5] + sr26(x); out[5] = rdc25(x);
+	x = 0x7fffffe + a[6] - b[6] + sr25(x); out[6] = rdc26(x);
+	x = 0x3fffffe + a[7] - b[7] + sr26(x); out[7] = rdc25(x);
+	x = 0x7fffffe + a[8] - b[8] + sr25(x); out[8] = rdc26(x);
+	x = 0x3fffffe + a[9] - b[9] + sr26(x); out[9] = rdc25(x);
+	                    out[0] += sr25(x) * 19;
+}
+
+/**
+ * out = in * scalar
+ */
+static void scalar_product(uint32_t out[10], uint32_t in[10], uint32_t scalar)
+{
+	uint64_t x;
+
+	x = mul64(in[0], scalar);           out[0] = rdc26(x);
+	x = mul64(in[1], scalar) + sr26(x); out[1] = rdc25(x);
+	x = mul64(in[2], scalar) + sr25(x); out[2] = rdc26(x);
+	x = mul64(in[3], scalar) + sr26(x); out[3] = rdc25(x);
+	x = mul64(in[4], scalar) + sr25(x); out[4] = rdc26(x);
+	x = mul64(in[5], scalar) + sr26(x); out[5] = rdc25(x);
+	x = mul64(in[6], scalar) + sr25(x); out[6] = rdc26(x);
+	x = mul64(in[7], scalar) + sr26(x); out[7] = rdc25(x);
+	x = mul64(in[8], scalar) + sr25(x); out[8] = rdc26(x);
+	x = mul64(in[9], scalar) + sr26(x); out[9] = rdc25(x);
+	                 out[0] += sr25(x) * 19;
+}
+
+/**
+ * out = a * b
+ */
+static inline void mul(uint32_t out[10], uint32_t a[10], uint32_t b[10])
+{
+	uint32_t r0, r1, r2, r3, r4, r5, r6, r7, r8, r9;
+	uint32_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9;
+	uint64_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9;
+
+	r0 = b[0];
+	r1 = b[1];
+	r2 = b[2];
+	r3 = b[3];
+	r4 = b[4];
+	r5 = b[5];
+	r6 = b[6];
+	r7 = b[7];
+	r8 = b[8];
+	r9 = b[9];
+
+	s0 = a[0];
+	s1 = a[1];
+	s2 = a[2];
+	s3 = a[3];
+	s4 = a[4];
+	s5 = a[5];
+	s6 = a[6];
+	s7 = a[7];
+	s8 = a[8];
+	s9 = a[9];
+
+	m1 = mul64(r0, s1) + mul64(r1, s0);
+	m3 = mul64(r0, s3) + mul64(r1, s2) + mul64(r2, s1) + mul64(r3, s0);
+	m5 = mul64(r0, s5) + mul64(r1, s4) + mul64(r2, s3) + mul64(r3, s2)
+	   + mul64(r4, s1) + mul64(r5, s0);
+	m7 = mul64(r0, s7) + mul64(r1, s6) + mul64(r2, s5) + mul64(r3, s4)
+	   + mul64(r4, s3) + mul64(r5, s2) + mul64(r6, s1) + mul64(r7, s0);
+	m9 = mul64(r0, s9) + mul64(r1, s8) + mul64(r2, s7) + mul64(r3, s6)
+	   + mul64(r4, s5) + mul64(r5, s4) + mul64(r6, s3) + mul64(r7, s2)
+	   + mul64(r8, s1) + mul64(r9, s0);
+
+	r1 *= 2;
+	r3 *= 2;
+	r5 *= 2;
+	r7 *= 2;
+
+	m0 = mul64(r0, s0);
+	m2 = mul64(r0, s2) + mul64(r1, s1) + mul64(r2, s0);
+	m4 = mul64(r0, s4) + mul64(r1, s3) + mul64(r2, s2) + mul64(r3, s1)
+	   + mul64(r4, s0);
+	m6 = mul64(r0, s6) + mul64(r1, s5) + mul64(r2, s4) + mul64(r3, s3)
+	   + mul64(r4, s2) + mul64(r5, s1) + mul64(r6, s0);
+	m8 = mul64(r0, s8) + mul64(r1, s7) + mul64(r2, s6) + mul64(r3, s5)
+	   + mul64(r4, s4) + mul64(r5, s3) + mul64(r6, s2) + mul64(r7, s1)
+	   + mul64(r8, s0);
+
+	r1 *= 19;
+	r2 *= 19;
+	r3 = (r3 / 2) * 19;
+	r4 *= 19;
+	r5 = (r5 / 2) * 19;
+	r6 *= 19;
+	r7 = (r7 / 2) * 19;
+	r8 *= 19;
+	r9 *= 19;
+
+	m1 += mul64(r9, s2) + mul64(r8, s3) + mul64(r7, s4) + mul64(r6, s5)
+	   +  mul64(r5, s6) + mul64(r4, s7) + mul64(r3, s8) + mul64(r2, s9);
+	m3 += mul64(r9, s4) + mul64(r8, s5) + mul64(r7, s6) + mul64(r6, s7)
+	   +  mul64(r5, s8) + mul64(r4, s9);
+	m5 += mul64(r9, s6) + mul64(r8, s7) + mul64(r7, s8) + mul64(r6, s9);
+	m7 += mul64(r9, s8) + mul64(r8, s9);
+
+	r3 *= 2;
+	r5 *= 2;
+	r7 *= 2;
+	r9 *= 2;
+
+	m0 += mul64(r9, s1) + mul64(r8, s2) + mul64(r7, s3) + mul64(r6, s4)
+	   +  mul64(r5, s5) + mul64(r4, s6) + mul64(r3, s7) + mul64(r2, s8)
+	   +  mul64(r1, s9);
+	m2 += mul64(r9, s3) + mul64(r8, s4) + mul64(r7, s5) + mul64(r6, s6)
+	   +  mul64(r5, s7) + mul64(r4, s8) + mul64(r3, s9);
+	m4 += mul64(r9, s5) + mul64(r8, s6) + mul64(r7, s7) + mul64(r6, s8)
+	   +  mul64(r5, s9);
+	m6 += mul64(r9, s7) + mul64(r8, s8) + mul64(r7, s9);
+	m8 += mul64(r9, s9);
+
+	m1 += m0 >> 26; r1 = rdc25(m1);
+	m2 += m1 >> 25; r2 = rdc26(m2);
+	m3 += m2 >> 26; r3 = rdc25(m3);
+	m4 += m3 >> 25; r4 = rdc26(m4);
+	m5 += m4 >> 26; r5 = rdc25(m5);
+	m6 += m5 >> 25; r6 = rdc26(m6);
+	m7 += m6 >> 26; r7 = rdc25(m7);
+	m8 += m7 >> 25; r8 = rdc26(m8);
+	m9 += m8 >> 26; r9 = rdc25(m9);
+	m0 = rdc26(m0) + mul64(m9 >> 25, 19);
+	r0 = rdc26(m0); r1 += m0 >> 26;
+
+	out[0] = r0;
+	out[1] = r1;
+	out[2] = r2;
+	out[3] = r3;
+	out[4] = r4;
+	out[5] = r5;
+	out[6] = r6;
+	out[7] = r7;
+	out[8] = r8;
+	out[9] = r9;
+}
+
+/**
+ * out = in^(2 * count), inlining
+ */
+static inline void square_times(uint32_t out[10], uint32_t in[10], int count)
+{
+	uint32_t r0, r1, r2, r3, r4, r5, r6, r7, r8, r9;
+	uint32_t d6, d7, d8, d9;
+	uint64_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9;
+
+	r0 = in[0];
+	r1 = in[1];
+	r2 = in[2];
+	r3 = in[3];
+	r4 = in[4];
+	r5 = in[5];
+	r6 = in[6];
+	r7 = in[7];
+	r8 = in[8];
+	r9 = in[9];
+
+	while (count--)
+	{
+		m0 = mul64(r0, r0    );
+		r0 *= 2;
+		m1 = mul64(r0, r1    );
+		m2 = mul64(r0, r2    ) + mul64(r1, r1 * 2);
+		r1 *= 2;
+		m3 = mul64(r0, r3    ) + mul64(r1, r2    );
+		m4 = mul64(r0, r4    ) + mul64(r1, r3 * 2) + mul64(r2, r2);
+		r2 *= 2;
+		m5 = mul64(r0, r5    ) + mul64(r1, r4    ) + mul64(r2, r3);
+		m6 = mul64(r0, r6    ) + mul64(r1, r5 * 2) + mul64(r2, r4)
+		   + mul64(r3, r3 * 2);
+		r3 *= 2;
+		m7 = mul64(r0, r7    ) + mul64(r1, r6    ) + mul64(r2, r5)
+		   + mul64(r3, r4    );
+		m8 = mul64(r0, r8    ) + mul64(r1, r7 * 2) + mul64(r2, r6)
+		   + mul64(r3, r5 * 2) + mul64(r4, r4    );
+		m9 = mul64(r0, r9    ) + mul64(r1, r8    ) + mul64(r2, r7)
+		   + mul64(r3, r6    ) + mul64(r4, r5 * 2);
+
+		d6 = r6 * 19;
+		d7 = r7 * 2 * 19;
+		d8 = r8 * 19;
+		d9 = r9 * 2 * 19;
+
+		m0 += mul64(d9, r1    ) + mul64(d8, r2    ) + mul64(d7, r3    )
+		   +  mul64(d6, r4 * 2) + mul64(r5, r5 * 2 * 19);
+		m1 += mul64(d9, r2 / 2) + mul64(d8, r3    ) + mul64(d7, r4    )
+		   +  mul64(d6, r5 * 2);
+		m2 += mul64(d9, r3    ) + mul64(d8, r4 * 2) + mul64(d7, r5 * 2)
+		   +  mul64(d6, r6    );
+		m3 += mul64(d9, r4    ) + mul64(d8, r5 * 2) + mul64(d7, r6    );
+		m4 += mul64(d9, r5 * 2) + mul64(d8, r6 * 2) + mul64(d7, r7    );
+		m5 += mul64(d9, r6    ) + mul64(d8, r7 * 2);
+		m6 += mul64(d9, r7 * 2) + mul64(d8, r8    );
+		m7 += mul64(d9, r8    );
+		m8 += mul64(d9, r9    );
+
+		m1 += m0 >> 26; r1 = rdc25(m1);
+		m2 += m1 >> 25; r2 = rdc26(m2);
+		m3 += m2 >> 26; r3 = rdc25(m3);
+		m4 += m3 >> 25; r4 = rdc26(m4);
+		m5 += m4 >> 26; r5 = rdc25(m5);
+		m6 += m5 >> 25; r6 = rdc26(m6);
+		m7 += m6 >> 26; r7 = rdc25(m7);
+		m8 += m7 >> 25; r8 = rdc26(m8);
+		m9 += m8 >> 26; r9 = rdc25(m9);
+		m0 = rdc26(m0) + mul64(sr25(m9), 19);
+		r0 = rdc26(m0); r1 += sr26(m0);
+	}
+
+	out[0] = r0;
+	out[1] = r1;
+	out[2] = r2;
+	out[3] = r3;
+	out[4] = r4;
+	out[5] = r5;
+	out[6] = r6;
+	out[7] = r7;
+	out[8] = r8;
+	out[9] = r9;
+}
+
+/**
+ * out = in * in
+ */
+static void square(uint32_t out[10], uint32_t in[10])
+{
+	return square_times(out, in, 1);
+}
+
+/**
+ * Take a little-endian, 32-byte number and expand it into polynomial form
+ */
+static void expand(uint32_t out[10], u_char *in)
+{
+	uint32_t x0, x1, x2, x3, x4, x5, x6, x7;
+
+	x0 = uletoh32(in + 0);
+	x1 = uletoh32(in + 4);
+	x2 = uletoh32(in + 8);
+	x3 = uletoh32(in + 12);
+	x4 = uletoh32(in + 16);
+	x5 = uletoh32(in + 20);
+	x6 = uletoh32(in + 24);
+	x7 = uletoh32(in + 28);
+
+	out[0] = rdc26(                         x0       );
+	out[1] = rdc25((((uint64_t)x1 << 32) | x0) >> 26);
+	out[2] = rdc26((((uint64_t)x2 << 32) | x1) >> 19);
+	out[3] = rdc25((((uint64_t)x3 << 32) | x2) >> 13);
+	out[4] = rdc26((                        x3) >>  6);
+	out[5] = rdc25(                         x4       );
+	out[6] = rdc26((((uint64_t)x5 << 32) | x4) >> 25);
+	out[7] = rdc25((((uint64_t)x6 << 32) | x5) >> 19);
+	out[8] = rdc26((((uint64_t)x7 << 32) | x6) >> 12);
+	out[9] = rdc25((                        x7) >>  6);
+}
+
+/**
+ * Propagate carries in f
+ */
+static inline void carry(uint32_t f[10])
+{
+	f[1] += f[0] >> 26; f[0] = rdc26(f[0]);
+	f[2] += f[1] >> 25; f[1] = rdc25(f[1]);
+	f[3] += f[2] >> 26; f[2] = rdc26(f[2]);
+	f[4] += f[3] >> 25; f[3] = rdc25(f[3]);
+	f[5] += f[4] >> 26; f[4] = rdc26(f[4]);
+	f[6] += f[5] >> 25; f[5] = rdc25(f[5]);
+	f[7] += f[6] >> 26; f[6] = rdc26(f[6]);
+	f[8] += f[7] >> 25; f[7] = rdc25(f[7]);
+	f[9] += f[8] >> 26; f[8] = rdc26(f[8]);
+}
+
+/**
+ * Take a fully reduced polynomial form number and contract it into a
+ * little-endian, 32-byte array
+ */
+static void contract(u_char *out, uint32_t f[10])
+{
+	carry(f);
+	f[0] += 19 * (f[9] >> 25); f[9] = rdc25(f[9]);
+	carry(f);
+	f[0] += 19 * (f[9] >> 25); f[9] = rdc25(f[9]);
+
+	/* now t is between 0 and 2^255-1, properly carried.
+	 * case 1: between 0 and 2^255-20.
+	 * case 2: between 2^255-19 and 2^255-1.
+	 */
+	f[0] += 19;
+	carry(f);
+	f[0] += 19 * (f[9] >> 25); f[9] = rdc25(f[9]);
+
+	/* now between 19 and 2^255-1 in both cases, and offset by 19. */
+	f[0] += (1 << 26) - 19;
+	f[1] += (1 << 25) - 1;
+	f[2] += (1 << 26) - 1;
+	f[3] += (1 << 25) - 1;
+	f[4] += (1 << 26) - 1;
+	f[5] += (1 << 25) - 1;
+	f[6] += (1 << 26) - 1;
+	f[7] += (1 << 25) - 1;
+	f[8] += (1 << 26) - 1;
+	f[9] += (1 << 25) - 1;
+
+	/* now between 2^255 and 2^256-20, and offset by 2^255. */
+	carry(f);
+	f[9] = rdc25(f[9]);
+
+	f[1] <<= 2;
+	f[2] <<= 3;
+	f[3] <<= 5;
+	f[4] <<= 6;
+	f[6] <<= 1;
+	f[7] <<= 3;
+	f[8] <<= 4;
+	f[9] <<= 6;
+
+	memset(out, 0, 32);
+	horule32(out +  0, f[0]);
+	horule32(out +  3, f[1]);
+	horule32(out +  6, f[2]);
+	horule32(out +  9, f[3]);
+	horule32(out + 12, f[4]);
+	horule32(out + 16, f[5]);
+	horule32(out + 19, f[6]);
+	horule32(out + 22, f[7]);
+	horule32(out + 25, f[8]);
+	horule32(out + 28, f[9]);
+}
+
+/**
+ * Swap the contents of x and q if swap is non-zero
+ */
+static void swap_conditional(uint32_t a[10], uint32_t b[10], uint32_t swap)
+{
+	uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9;
+
+	swap = -swap;
+
+	x0 = swap & (a[0] ^ b[0]); a[0] ^= x0; b[0] ^= x0;
+	x1 = swap & (a[1] ^ b[1]); a[1] ^= x1; b[1] ^= x1;
+	x2 = swap & (a[2] ^ b[2]); a[2] ^= x2; b[2] ^= x2;
+	x3 = swap & (a[3] ^ b[3]); a[3] ^= x3; b[3] ^= x3;
+	x4 = swap & (a[4] ^ b[4]); a[4] ^= x4; b[4] ^= x4;
+	x5 = swap & (a[5] ^ b[5]); a[5] ^= x5; b[5] ^= x5;
+	x6 = swap & (a[6] ^ b[6]); a[6] ^= x6; b[6] ^= x6;
+	x7 = swap & (a[7] ^ b[7]); a[7] ^= x7; b[7] ^= x7;
+	x8 = swap & (a[8] ^ b[8]); a[8] ^= x8; b[8] ^= x8;
+	x9 = swap & (a[9] ^ b[9]); a[9] ^= x9; b[9] ^= x9;
+}
+
+/*
+ * In:  b =   2^5 - 2^0
+ * Out: b = 2^250 - 2^0
+ */
+static void pow_two5mtwo0_two250mtwo0(uint32_t b[10])
+{
+	uint32_t t0[10], c[10];
+
+	/* 2^5  - 2^0 */ /* b */
+	/* 2^10 - 2^5 */ square_times(t0, b, 5);
+	/* 2^10 - 2^0 */ mul(b, t0, b);
+	/* 2^20 - 2^10 */ square_times(t0, b, 10);
+	/* 2^20 - 2^0 */ mul(c, t0, b);
+	/* 2^40 - 2^20 */ square_times(t0, c, 20);
+	/* 2^40 - 2^0 */ mul(t0, t0, c);
+	/* 2^50 - 2^10 */ square_times(t0, t0, 10);
+	/* 2^50 - 2^0 */ mul(b, t0, b);
+	/* 2^100 - 2^50 */ square_times(t0, b, 50);
+	/* 2^100 - 2^0 */ mul(c, t0, b);
+	/* 2^200 - 2^100 */ square_times(t0, c, 100);
+	/* 2^200 - 2^0 */ mul(t0, t0, c);
+	/* 2^250 - 2^50 */ square_times(t0, t0, 50);
+	/* 2^250 - 2^0 */ mul(b, t0, b);
+}
+
+/*
+ * z^(p - 2) = z(2^255 - 21)
+ */
+static void recip(uint32_t out[10], uint32_t z[10])
+{
+	uint32_t a[10], t0[10], b[10];
+
+	/* 2 */ square(a, z); /* a = 2 */
+	/* 8 */ square_times(t0, a, 2);
+	/* 9 */ mul(b, t0, z); /* b = 9 */
+	/* 11 */ mul(a, b, a); /* a = 11 */
+	/* 22 */ square(t0, a);
+	/* 2^5 - 2^0 = 31 */ mul(b, t0, b);
+	/* 2^250 - 2^0 */ pow_two5mtwo0_two250mtwo0(b);
+	/* 2^255 - 2^5 */ square_times(b, b, 5);
+	/* 2^255 - 21 */ mul(out, b, a);
+}
+
+METHOD(curve25519_drv_t, curve25519, bool,
+	private_curve25519_drv_t *this, u_char *in, u_char *out)
+{
+	uint32_t nqpqx[10] = {1}, nqpqz[10] = {0}, nqz[10] = {1}, nqx[10];
+	uint32_t q[10], qx[10], qpqx[10], qqx[10], zzz[10], zmone[10];
+	uint32_t bit, lastbit, i;
+
+	expand(q, in);
+	memcpy(nqx, q, sizeof(nqx));
+
+	/* bit 255 is always 0, and bit 254 is always 1, so skip bit 255 and
+	 * start pre-swapped on bit 254 */
+	lastbit = 1;
+
+	/* we are doing bits 254..3 in the loop, but are swapping in bits 253..2 */
+	for (i = 253; i >= 2; i--)
+	{
+		add(qx, nqx, nqz);
+		sub(nqz, nqx, nqz);
+		add(qpqx, nqpqx, nqpqz);
+		sub(nqpqz, nqpqx, nqpqz);
+		mul(nqpqx, qpqx, nqz);
+		mul(nqpqz, qx, nqpqz);
+		add(qqx, nqpqx, nqpqz);
+		sub(nqpqz, nqpqx, nqpqz);
+		square(nqpqz, nqpqz);
+		square(nqpqx, qqx);
+		mul(nqpqz, nqpqz, q);
+		square(qx, qx);
+		square(nqz, nqz);
+		mul(nqx, qx, nqz);
+		sub(nqz, qx, nqz);
+		scalar_product(zzz, nqz, 121665);
+		add(zzz, zzz, qx);
+		mul(nqz, nqz, zzz);
+
+		bit = (this->key[i/8] >> (i & 7)) & 1;
+		swap_conditional(nqx, nqpqx, bit ^ lastbit);
+		swap_conditional(nqz, nqpqz, bit ^ lastbit);
+		lastbit = bit;
+	}
+
+	/* the final 3 bits are always zero, so we only need to double */
+	for (i = 0; i < 3; i++)
+	{
+		add(qx, nqx, nqz);
+		sub(nqz, nqx, nqz);
+		square(qx, qx);
+		square(nqz, nqz);
+		mul(nqx, qx, nqz);
+		sub(nqz, qx, nqz);
+		scalar_product(zzz, nqz, 121665);
+		add(zzz, zzz, qx);
+		mul(nqz, nqz, zzz);
+	}
+
+	recip(zmone, nqz);
+	mul(nqz, nqx, zmone);
+	contract(out, nqz);
+
+	return TRUE;
+}
+
+METHOD(curve25519_drv_t, destroy, void,
+	private_curve25519_drv_t *this)
+{
+	memwipe(this->key, sizeof(this->key));
+	free(this);
+}
+
+/**
+ * See header
+ */
+curve25519_drv_t *curve25519_drv_portable_create()
+{
+	private_curve25519_drv_t *this;
+
+	INIT(this,
+		.public = {
+			.set_key = _set_key,
+			.curve25519 = _curve25519,
+			.destroy = _destroy,
+		},
+	);
+
+	return &this->public;
+}
diff --git a/src/libstrongswan/plugins/curve25519/curve25519_drv_portable.h b/src/libstrongswan/plugins/curve25519/curve25519_drv_portable.h
new file mode 100644
index 000000000..45ad1d904
--- /dev/null
+++ b/src/libstrongswan/plugins/curve25519/curve25519_drv_portable.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) 2015 Martin Willi
+ * Copyright (C) 2015 revosec AG
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+/**
+ * @defgroup curve25519_drv_portable curve25519_drv_portable
+ * @{ @ingroup curve25519
+ */
+
+#include "curve25519_drv.h"
+
+#ifndef CURVE25519_DRV_PORTABLE_H_
+#define CURVE25519_DRV_PORTABLE_H_
+
+/**
+ * Create a curve25519_drv_portable instance.
+ */
+curve25519_drv_t *curve25519_drv_portable_create();
+
+#endif /** CURVE25519_DRV_PORTABLE_H_ @}*/