initial version of SQIsign

Co-authored-by: Jorge Chavez-Saab <jorgechavezsaab@gmail.com> Co-authored-by: Maria Corte-Real Santos <36373796+mariascrs@users.noreply.github.com> Co-authored-by: Luca De Feo <github@defeo.lu> Co-authored-by: Jonathan Komada Eriksen <jonathan.eriksen97@gmail.com> Co-authored-by: Basil Hess <bhe@zurich.ibm.com> Co-authored-by: Antonin Leroux <18654258+tonioecto@users.noreply.github.com> Co-authored-by: Patrick Longa <plonga@microsoft.com> Co-authored-by: Lorenz Panny <lorenz@yx7.cc> Co-authored-by: Francisco Rodríguez-Henríquez <francisco.rodriguez@tii.ae> Co-authored-by: Sina Schaeffler <108983332+syndrakon@users.noreply.github.com> Co-authored-by: Benjamin Wesolowski <19474926+Calodeon@users.noreply.github.com>
2023-06-01 00:00:00 +00:00
commit 28ff420dd0
285 changed files with 70301 additions and 0 deletions
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -0,0 +1,92 @@
+# Dependency graph of the modules built below (an arrow points from a module to one it depends on):
+#     ┌─┬──────┬─┐           ┌─┬────┬─┐            ┌─┬──────┬─┐
+#     │ ├──────┤ │           │ ├────┤ │            │ ├──────┤ │
+#     │ │Keygen│ │           │ │Sign│ │            │ │Verify│ │
+#     │ ├──────┤ │           │ ├────┤ │            │ ├──────┤ │
+#     └─┴───┬──┴─┘           └─┴─┬──┴─┘            └─┴───┬──┴─┘
+#           │                    │                       │
+#           │                    │                       │
+#           ├────────────────────┼─────────────────┐     │
+#           │                    │                 │     │
+#           │                    │                 │     │
+#       ┌───▼──┐          ┌──────▼────────┐   ┌────▼─────▼───────────┐
+#       │ PRNG ◄────┬─────┤ Iso <-> Ideal ├───►   Elliptic Curves,   │
+#       └───▲──┘    │     └──────┬────────┘   │ Pairings & Isogenies │
+#           │       │            │            └───▲──────┬───────────┘
+#           │       │            │                │      │
+#       ┌───┴──┐    │            │                │      │
+#       │ KLPT ◄────┘            │     ┌──────────┘      │
+#       └───┬──┘                 │     │                 │
+#           │                    │     │                 │
+# ┌─────────▼─────────┐          │     │                 │
+# │ Quaternion orders │          │     │            ┌────▼───┐
+# │     and ideals    │          │     │            │ GF(p²) │
+# └─────────┬─────────┘          │     │            └────┬───┘
+#           │           ┌─┬──────▼─────┴──┬─┐            │
+#     ┌─────▼─────┐     │ ├───────────────┤ │      ┌─────▼─────┐
+#     │ MP BigInt │     │ │Precomputations│ │      │ FP BigInt │
+#     └───────────┘     │ ├───────────────┤ │      └───────────┘
+#                       └─┴───────────────┴─┘                    
+
+add_subdirectory(common)      # processed first; see common/CMakeLists.txt
+add_subdirectory(intbig)
+add_subdirectory(quaternion)
+add_subdirectory(precomp)
+add_subdirectory(klpt)
+add_subdirectory(gf)
+add_subdirectory(ec)
+# Added last; presumably these layers sit on top of the modules above (cf. the dependency diagram) - TODO confirm.
+add_subdirectory(id2iso)
+add_subdirectory(protocols)
+
+# For every variant listed in SVARIANT_S, build four libraries:
+#   sqisign_<v>              - sqisign.c linked with sqisign_common_sys
+#   sqisign_<v>_test         - sqisign.c linked with sqisign_common_test
+#   sqisign_<v>_nistapi      - nistapi/<v>/api.c on top of sqisign_<v>
+#   sqisign_<v>_test_nistapi - nistapi/<v>/api.c on top of sqisign_<v>_test
+foreach(SVARIANT ${SVARIANT_S})
+    string(TOLOWER ${SVARIANT} SVARIANT_LOWER)
+    string(TOUPPER ${SVARIANT} SVARIANT_UPPER)
+    set(SOURCE_FILES_VARIANT sqisign.c)
+
+    # Link list shared by the regular and the test library (only the common library differs).
+    set(SQISIGN_VARIANT_LIBS
+        ${LIB_PROTOCOLS_${SVARIANT_UPPER}}
+        ${LIB_ID2ISO_${SVARIANT_UPPER}}
+        ${LIB_KLPT_${SVARIANT_UPPER}}
+        ${LIB_QUATERNION}
+        ${LIB_PRECOMP_${SVARIANT_UPPER}}
+        ${LIB_INTBIG}
+        ${LIB_GF_${SVARIANT_UPPER}}
+        ${LIB_EC_${SVARIANT_UPPER}}
+        ${GMP}
+    )
+
+    # Public include directories shared by the regular and the test library.
+    set(SQISIGN_VARIANT_PUBLIC_INCLUDES
+        ${INC_PROTOCOLS}
+        ${INC_INTBIG}
+        ${INC_QUATERNION}
+        ${INC_PRECOMP_${SVARIANT_UPPER}}
+        ${INC_EC}
+        ${INC_GF_${SVARIANT_UPPER}}
+        ${INC_COMMON}
+        ${INC_KLPT}
+        ${INC_ID2ISO}
+        ../include
+    )
+
+    # Library for SQIsign variant
+    add_library(sqisign_${SVARIANT_LOWER} ${SOURCE_FILES_VARIANT})
+    target_link_libraries(sqisign_${SVARIANT_LOWER} PUBLIC
+        ${SQISIGN_VARIANT_LIBS}
+        sqisign_common_sys
+    )
+    target_include_directories(sqisign_${SVARIANT_LOWER}
+        PUBLIC ${SQISIGN_VARIANT_PUBLIC_INCLUDES}
+        PRIVATE common/generic internal
+    )
+    target_compile_definitions(sqisign_${SVARIANT_LOWER} PUBLIC SQISIGN_VARIANT=${SVARIANT})
+
+    # Library for SQIsign variant (test)
+    add_library(sqisign_${SVARIANT_LOWER}_test ${SOURCE_FILES_VARIANT})
+    target_link_libraries(sqisign_${SVARIANT_LOWER}_test PUBLIC
+        ${SQISIGN_VARIANT_LIBS}
+        sqisign_common_test
+    )
+    target_include_directories(sqisign_${SVARIANT_LOWER}_test
+        PUBLIC ${SQISIGN_VARIANT_PUBLIC_INCLUDES}
+        PRIVATE common/generic internal
+    )
+    target_compile_definitions(sqisign_${SVARIANT_LOWER}_test PUBLIC SQISIGN_VARIANT=${SVARIANT})
+
+    # Library with NIST API
+    set(SOURCE_FILE_NISTAPI nistapi/${SVARIANT_LOWER}/api.c)
+    add_library(sqisign_${SVARIANT_LOWER}_nistapi ${SOURCE_FILE_NISTAPI})
+    target_link_libraries(sqisign_${SVARIANT_LOWER}_nistapi PRIVATE sqisign_${SVARIANT_LOWER})
+    target_include_directories(sqisign_${SVARIANT_LOWER}_nistapi PUBLIC nistapi/${SVARIANT_LOWER} ../include)
+
+    # Library with NIST API (test)
+    add_library(sqisign_${SVARIANT_LOWER}_test_nistapi ${SOURCE_FILE_NISTAPI})
+    target_link_libraries(sqisign_${SVARIANT_LOWER}_test_nistapi PRIVATE sqisign_${SVARIANT_LOWER}_test)
+    # Export ../include here as well, so consumers of the test flavour see the same headers
+    # as consumers of sqisign_<v>_nistapi (the link above is PRIVATE and does not propagate it).
+    target_include_directories(sqisign_${SVARIANT_LOWER}_test_nistapi PUBLIC nistapi/${SVARIANT_LOWER} ../include)
+endforeach()
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -0,0 +1,3 @@
+get_filename_component(CCSD_NAME ${CMAKE_CURRENT_SOURCE_DIR} NAME)  # last path component of this directory
+string(TOUPPER ${CCSD_NAME} CCSD_NAME_UPPER)  # upper-case copy of that name
+include(${SELECT_SQISIGN_VARIANT})  # script path is set outside this file; presumably it consumes CCSD_NAME / CCSD_NAME_UPPER - TODO confirm
--- a/src/common/generic/CMakeLists.txt
+++ b/src/common/generic/CMakeLists.txt
@@ -0,0 +1,26 @@
+# Common support library, system flavour: uses randombytes_system.c as its randombytes source.
+set(SOURCE_FILES_COMMON_SYS
+    randombytes_system.c
+    aes_c.c
+    fips202.c
+    mem.c
+)
+
+add_library(sqisign_common_sys ${SOURCE_FILES_COMMON_SYS})
+target_include_directories(sqisign_common_sys PRIVATE include ../../include)
+target_compile_options(sqisign_common_sys PUBLIC ${C_OPT_FLAGS})
+
+# Common support library, test flavour: identical except that randombytes comes from
+# randombytes_ctrdrbg.c.
+set(SOURCE_FILES_COMMON_TEST
+    randombytes_ctrdrbg.c
+    aes_c.c
+    fips202.c
+    mem.c
+)
+
+add_library(sqisign_common_test ${SOURCE_FILES_COMMON_TEST})
+# The system flavour searches ../../include while this target only listed ../include.
+# Search both so the two flavours resolve the same headers; the previous path is kept
+# for backward compatibility.
+target_include_directories(sqisign_common_test PRIVATE include ../include ../../include)
+target_compile_options(sqisign_common_test PUBLIC ${C_OPT_FLAGS})
+
+# Propagate the ENABLE_CT_TESTING define to both flavours and to everything linking them.
+if (ENABLE_CT_TESTING)
+    target_compile_definitions(sqisign_common_sys PUBLIC ENABLE_CT_TESTING)
+    target_compile_definitions(sqisign_common_test PUBLIC ENABLE_CT_TESTING)
+endif()
--- a/src/common/generic/aes_c.c
+++ b/src/common/generic/aes_c.c
@@ -0,0 +1,740 @@
+// SPDX-License-Identifier: MIT and Apache-2.0
+
+/*
+ * AES implementation based on code from PQClean,
+ * which is in turn based on BearSSL (https://bearssl.org/)
+ * by Thomas Pornin.
+ *
+ *
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define AES128_KEYBYTES 16 /* key sizes in bytes */
+#define AES192_KEYBYTES 24
+#define AES256_KEYBYTES 32
+#define AESCTR_NONCEBYTES 12 /* aes_ctr() reads 3 32-bit words of nonce */
+#define AES_BLOCKBYTES 16
+
+// We've put these states on the heap to make sure ctx_release is used. STATESIZE counts uint64_t words: 8 per round key, (rounds + 1) round keys.
+#define PQC_AES128_STATESIZE 88 /* 8 * (10 + 1) */
+typedef struct {
+    uint64_t *sk_exp; /* heap buffer of PQC_AES128_STATESIZE words, set by aes128_ecb_keyexp */
+} aes128ctx;
+
+#define PQC_AES192_STATESIZE 104 /* 8 * (12 + 1) */
+typedef struct {
+    uint64_t  *sk_exp; /* heap buffer of PQC_AES192_STATESIZE words, set by aes192_ecb_keyexp */
+} aes192ctx;
+
+#define PQC_AES256_STATESIZE 120 /* 8 * (14 + 1) */
+typedef struct {
+    uint64_t *sk_exp; /* heap buffer of PQC_AES256_STATESIZE words, set by aes256_ecb_keyexp */
+} aes256ctx;
+
+
+/** Initializes the context: allocates r->sk_exp and fills it with the expanded AES-128 key schedule; release it with aes128_ctx_release **/
+void aes128_ecb_keyexp(aes128ctx *r, const unsigned char *key);
+/** Same setup as aes128_ecb_keyexp (the definition forwards to it) **/
+void aes128_ctr_keyexp(aes128ctx *r, const unsigned char *key);
+/** Encrypts nblocks 16-byte blocks from in to out, each block independently **/
+void aes128_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes128ctx *ctx);
+/** Writes outlen bytes of AES-CTR key stream to out; iv supplies 12 nonce bytes and the 32-bit block counter starts at 0 **/
+void aes128_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes128ctx *ctx);
+
+/** Frees the context **/
+void aes128_ctx_release(aes128ctx *r);
+
+/* The AES-192 and AES-256 families below mirror the AES-128 functions above. */
+/** Initializes the context **/
+void aes192_ecb_keyexp(aes192ctx *r, const unsigned char *key);
+
+void aes192_ctr_keyexp(aes192ctx *r, const unsigned char *key);
+
+void aes192_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes192ctx *ctx);
+
+void aes192_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes192ctx *ctx);
+/** Frees the context **/
+void aes192_ctx_release(aes192ctx *r);
+
+
+/** Initializes the context **/
+void aes256_ecb_keyexp(aes256ctx *r, const unsigned char *key);
+
+void aes256_ctr_keyexp(aes256ctx *r, const unsigned char *key);
+
+void aes256_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes256ctx *ctx);
+
+void aes256_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes256ctx *ctx);
+
+/** Frees the context **/
+void aes256_ctx_release(aes256ctx *r);
+
+/* Read the four bytes at src as one little-endian 32-bit unsigned value. */
+static inline uint32_t br_dec32le(const unsigned char *src) {
+    uint32_t word = 0;
+    int idx;
+
+    /* Fold in from the most significant byte (src[3]) down to src[0]. */
+    for (idx = 3; idx >= 0; idx--) {
+        word = (word << 8) | (uint32_t)src[idx];
+    }
+    return word;
+}
+
+
+/* Decode num consecutive little-endian 32-bit words from src into v. */
+static void br_range_dec32le(uint32_t *v, size_t num, const unsigned char *src) {
+    size_t idx;
+
+    for (idx = 0; idx < num; idx++) {
+        v[idx] = br_dec32le(src + 4 * idx);
+    }
+}
+
+
+/* Reverse the byte order of a 32-bit word. */
+static inline uint32_t br_swap32(uint32_t x) {
+    const uint32_t even_bytes = x & (uint32_t)0x00FF00FF;
+    const uint32_t odd_bytes = (x >> 8) & (uint32_t)0x00FF00FF;
+    /* Bytes are now swapped inside each 16-bit half; swapping the halves finishes the job. */
+    const uint32_t pair_swapped = (even_bytes << 8) | odd_bytes;
+
+    return (pair_swapped >> 16) | (pair_swapped << 16);
+}
+
+
+/* Store x at dst as four bytes, least significant byte first. */
+static inline void br_enc32le(unsigned char *dst, uint32_t x) {
+    int idx;
+
+    for (idx = 0; idx < 4; idx++) {
+        dst[idx] = (unsigned char)(x >> (8 * idx));
+    }
+}
+
+
+/* Encode num 32-bit words from v into dst, each as four little-endian bytes. */
+static void br_range_enc32le(unsigned char *dst, const uint32_t *v, size_t num) {
+    size_t idx;
+
+    for (idx = 0; idx < num; idx++) {
+        br_enc32le(dst + 4 * idx, v[idx]);
+    }
+}
+
+
+static void br_aes_ct64_bitslice_Sbox(uint64_t *q) { /* applies the S-box circuit in place to the eight words q[0..7] */
+    /*
+     * This S-box implementation is a straightforward translation of
+     * the circuit described by Boyar and Peralta in "A new
+     * combinational logic minimization technique with applications
+     * to cryptology" (https://eprint.iacr.org/2009/191.pdf).
+     *
+     * Note that variables x* (input) and s* (output) are numbered
+     * in "reverse" order (x0 is the high bit, x7 is the low bit).
+     */
+    /* Only whole-word XOR, AND and NOT are used below: no table lookups, no data-dependent branches. */
+    uint64_t x0, x1, x2, x3, x4, x5, x6, x7;
+    uint64_t y1, y2, y3, y4, y5, y6, y7, y8, y9;
+    uint64_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19;
+    uint64_t y20, y21;
+    uint64_t z0, z1, z2, z3, z4, z5, z6, z7, z8, z9;
+    uint64_t z10, z11, z12, z13, z14, z15, z16, z17;
+    uint64_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
+    uint64_t t10, t11, t12, t13, t14, t15, t16, t17, t18, t19;
+    uint64_t t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
+    uint64_t t30, t31, t32, t33, t34, t35, t36, t37, t38, t39;
+    uint64_t t40, t41, t42, t43, t44, t45, t46, t47, t48, t49;
+    uint64_t t50, t51, t52, t53, t54, t55, t56, t57, t58, t59;
+    uint64_t t60, t61, t62, t63, t64, t65, t66, t67;
+    uint64_t s0, s1, s2, s3, s4, s5, s6, s7;
+    /* Load the eight input words: x0 takes q[7] and x7 takes q[0], per the reversed numbering noted above. */
+    x0 = q[7];
+    x1 = q[6];
+    x2 = q[5];
+    x3 = q[4];
+    x4 = q[3];
+    x5 = q[2];
+    x6 = q[1];
+    x7 = q[0];
+
+    /*
+     * Top linear transformation.
+     */
+    y14 = x3 ^ x5;
+    y13 = x0 ^ x6;
+    y9 = x0 ^ x3;
+    y8 = x0 ^ x5;
+    t0 = x1 ^ x2;
+    y1 = t0 ^ x7;
+    y4 = y1 ^ x3;
+    y12 = y13 ^ y14;
+    y2 = y1 ^ x0;
+    y5 = y1 ^ x6;
+    y3 = y5 ^ y8;
+    t1 = x4 ^ y12;
+    y15 = t1 ^ x5;
+    y20 = t1 ^ x1;
+    y6 = y15 ^ x7;
+    y10 = y15 ^ t0;
+    y11 = y20 ^ y9;
+    y7 = x7 ^ y11;
+    y17 = y10 ^ y11;
+    y19 = y10 ^ y8;
+    y16 = t0 ^ y11;
+    y21 = y13 ^ y16;
+    y18 = x0 ^ y16;
+
+    /*
+     * Non-linear section.
+     */
+    t2 = y12 & y15;
+    t3 = y3 & y6;
+    t4 = t3 ^ t2;
+    t5 = y4 & x7;
+    t6 = t5 ^ t2;
+    t7 = y13 & y16;
+    t8 = y5 & y1;
+    t9 = t8 ^ t7;
+    t10 = y2 & y7;
+    t11 = t10 ^ t7;
+    t12 = y9 & y11;
+    t13 = y14 & y17;
+    t14 = t13 ^ t12;
+    t15 = y8 & y10;
+    t16 = t15 ^ t12;
+    t17 = t4 ^ t14;
+    t18 = t6 ^ t16;
+    t19 = t9 ^ t14;
+    t20 = t11 ^ t16;
+    t21 = t17 ^ y20;
+    t22 = t18 ^ y19;
+    t23 = t19 ^ y21;
+    t24 = t20 ^ y18;
+    /* Middle stage: works only on the four intermediate words t21..t24. */
+    t25 = t21 ^ t22;
+    t26 = t21 & t23;
+    t27 = t24 ^ t26;
+    t28 = t25 & t27;
+    t29 = t28 ^ t22;
+    t30 = t23 ^ t24;
+    t31 = t22 ^ t26;
+    t32 = t31 & t30;
+    t33 = t32 ^ t24;
+    t34 = t23 ^ t33;
+    t35 = t27 ^ t33;
+    t36 = t24 & t35;
+    t37 = t36 ^ t34;
+    t38 = t27 ^ t36;
+    t39 = t29 & t38;
+    t40 = t25 ^ t39;
+    /* Last stage of the non-linear section: AND the middle-stage results with the y*/x7 values from the top layer. */
+    t41 = t40 ^ t37;
+    t42 = t29 ^ t33;
+    t43 = t29 ^ t40;
+    t44 = t33 ^ t37;
+    t45 = t42 ^ t41;
+    z0 = t44 & y15;
+    z1 = t37 & y6;
+    z2 = t33 & x7;
+    z3 = t43 & y16;
+    z4 = t40 & y1;
+    z5 = t29 & y7;
+    z6 = t42 & y11;
+    z7 = t45 & y17;
+    z8 = t41 & y10;
+    z9 = t44 & y12;
+    z10 = t37 & y3;
+    z11 = t33 & y4;
+    z12 = t43 & y13;
+    z13 = t40 & y5;
+    z14 = t29 & y2;
+    z15 = t42 & y9;
+    z16 = t45 & y14;
+    z17 = t41 & y8;
+
+    /*
+     * Bottom linear transformation.
+     */
+    t46 = z15 ^ z16;
+    t47 = z10 ^ z11;
+    t48 = z5 ^ z13;
+    t49 = z9 ^ z10;
+    t50 = z2 ^ z12;
+    t51 = z2 ^ z5;
+    t52 = z7 ^ z8;
+    t53 = z0 ^ z3;
+    t54 = z6 ^ z7;
+    t55 = z16 ^ z17;
+    t56 = z12 ^ t48;
+    t57 = t50 ^ t53;
+    t58 = z4 ^ t46;
+    t59 = z3 ^ t54;
+    t60 = t46 ^ t57;
+    t61 = z14 ^ t57;
+    t62 = t52 ^ t58;
+    t63 = t49 ^ t58;
+    t64 = z4 ^ t59;
+    t65 = t61 ^ t62;
+    t66 = z1 ^ t63;
+    s0 = t59 ^ t63;
+    s6 = t56 ^ ~t62;
+    s7 = t48 ^ ~t60;
+    t67 = t64 ^ t65;
+    s3 = t53 ^ t66;
+    s4 = t51 ^ t66;
+    s5 = t47 ^ t65;
+    s1 = t64 ^ ~s3;
+    s2 = t55 ^ ~t67;
+    /* Store the outputs back using the same reversed order as the loads. */
+    q[7] = s0;
+    q[6] = s1;
+    q[5] = s2;
+    q[4] = s3;
+    q[3] = s4;
+    q[2] = s5;
+    q[1] = s6;
+    q[0] = s7;
+}
+
+static void br_aes_ct64_ortho(uint64_t *q) { /* in-place bit permutation across q[0..7]; callers in this file apply it before and after the bitsliced rounds */
+#define SWAPN(cl, ch, s, x, y)   do { \
+        uint64_t a, b; \
+        a = (x); \
+        b = (y); \
+        (x) = (a & (uint64_t)(cl)) | ((b & (uint64_t)(cl)) << (s)); \
+        (y) = ((a & (uint64_t)(ch)) >> (s)) | (b & (uint64_t)(ch)); \
+    } while (0)
+    /* SWAPN exchanges the ch-masked bits of x with the cl-masked bits of y (s apart); the macros stay defined after this function. */
+#define SWAP2(x, y)    SWAPN(0x5555555555555555, 0xAAAAAAAAAAAAAAAA,  1, x, y)
+#define SWAP4(x, y)    SWAPN(0x3333333333333333, 0xCCCCCCCCCCCCCCCC,  2, x, y)
+#define SWAP8(x, y)    SWAPN(0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0,  4, x, y)
+    /* Stage 1: single bits between neighbouring words. */
+    SWAP2(q[0], q[1]);
+    SWAP2(q[2], q[3]);
+    SWAP2(q[4], q[5]);
+    SWAP2(q[6], q[7]);
+    /* Stage 2: bit pairs between words two apart. */
+    SWAP4(q[0], q[2]);
+    SWAP4(q[1], q[3]);
+    SWAP4(q[4], q[6]);
+    SWAP4(q[5], q[7]);
+    /* Stage 3: nibbles between words four apart. */
+    SWAP8(q[0], q[4]);
+    SWAP8(q[1], q[5]);
+    SWAP8(q[2], q[6]);
+    SWAP8(q[3], q[7]);
+}
+
+
+static void br_aes_ct64_interleave_in(uint64_t *q0, uint64_t *q1, const uint32_t *w) { /* packs the four 32-bit words w[0..3] into two 64-bit words */
+    uint64_t x0, x1, x2, x3;
+    /* Spread each word so that every input byte ends up alone in its own 16-bit lane. */
+    x0 = w[0];
+    x1 = w[1];
+    x2 = w[2];
+    x3 = w[3];
+    x0 |= (x0 << 16);
+    x1 |= (x1 << 16);
+    x2 |= (x2 << 16);
+    x3 |= (x3 << 16);
+    x0 &= (uint64_t)0x0000FFFF0000FFFF;
+    x1 &= (uint64_t)0x0000FFFF0000FFFF;
+    x2 &= (uint64_t)0x0000FFFF0000FFFF;
+    x3 &= (uint64_t)0x0000FFFF0000FFFF;
+    x0 |= (x0 << 8);
+    x1 |= (x1 << 8);
+    x2 |= (x2 << 8);
+    x3 |= (x3 << 8);
+    x0 &= (uint64_t)0x00FF00FF00FF00FF;
+    x1 &= (uint64_t)0x00FF00FF00FF00FF;
+    x2 &= (uint64_t)0x00FF00FF00FF00FF;
+    x3 &= (uint64_t)0x00FF00FF00FF00FF;
+    *q0 = x0 | (x2 << 8); /* even lanes' low bytes from w[0], high bytes from w[2] */
+    *q1 = x1 | (x3 << 8); /* likewise for w[1] and w[3] */
+}
+
+
+static void br_aes_ct64_interleave_out(uint32_t *w, uint64_t q0, uint64_t q1) { /* unpacks q0/q1 into w[0..3], undoing the packing done by br_aes_ct64_interleave_in */
+    uint64_t x0, x1, x2, x3;
+    /* Separate the low bytes (w[0], w[1]) and high bytes (w[2], w[3]) of every 16-bit lane, then squeeze them together. */
+    x0 = q0 & (uint64_t)0x00FF00FF00FF00FF;
+    x1 = q1 & (uint64_t)0x00FF00FF00FF00FF;
+    x2 = (q0 >> 8) & (uint64_t)0x00FF00FF00FF00FF;
+    x3 = (q1 >> 8) & (uint64_t)0x00FF00FF00FF00FF;
+    x0 |= (x0 >> 8);
+    x1 |= (x1 >> 8);
+    x2 |= (x2 >> 8);
+    x3 |= (x3 >> 8);
+    x0 &= (uint64_t)0x0000FFFF0000FFFF;
+    x1 &= (uint64_t)0x0000FFFF0000FFFF;
+    x2 &= (uint64_t)0x0000FFFF0000FFFF;
+    x3 &= (uint64_t)0x0000FFFF0000FFFF;
+    w[0] = (uint32_t)x0 | (uint32_t)(x0 >> 16);
+    w[1] = (uint32_t)x1 | (uint32_t)(x1 >> 16);
+    w[2] = (uint32_t)x2 | (uint32_t)(x2 >> 16);
+    w[3] = (uint32_t)x3 | (uint32_t)(x3 >> 16);
+}
+
+static const unsigned char Rcon[] = { /* round constants XORed in by br_aes_ct64_keysched, indexed by its counter k */
+    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
+};
+
+static uint32_t sub_word(uint32_t x) { /* runs the bitsliced S-box over one 32-bit word */
+    uint64_t q[8];
+    /* Only q[0] carries data; the other seven words are zero filler for the 8-word bitslice routines. */
+    memset(q, 0, sizeof q);
+    q[0] = x;
+    br_aes_ct64_ortho(q);
+    br_aes_ct64_bitslice_Sbox(q);
+    br_aes_ct64_ortho(q);
+    return (uint32_t)q[0];
+}
+
+static void br_aes_ct64_keysched(uint64_t *comp_skey, const unsigned char *key, unsigned int key_len) { /* key_len is 16, 24 or 32 at the call sites; writes 2 * (nrounds + 1) words to comp_skey */
+    unsigned int i, j, k, nk, nkf;
+    uint32_t tmp;
+    uint32_t skey[60];
+    unsigned nrounds = 10 + ((key_len - 16) >> 2);
+    /* nk = key length in 32-bit words; nkf = total round-key words (4 per round key, nrounds + 1 round keys; at most 60). */
+    nk = (key_len >> 2);
+    nkf = ((nrounds + 1) << 2);
+    br_range_dec32le(skey, (key_len >> 2), key);
+    tmp = skey[(key_len >> 2) - 1];
+    for (i = nk, j = 0, k = 0; i < nkf; i ++) {
+        if (j == 0) {
+            tmp = (tmp << 24) | (tmp >> 8);
+            tmp = sub_word(tmp) ^ Rcon[k];
+        } else if (nk > 6 && j == 4) {
+            tmp = sub_word(tmp);
+        }
+        tmp ^= skey[i - nk];
+        skey[i] = tmp;
+        if (++ j == nk) {
+            j = 0;
+            k ++;
+        }
+    }
+    /* Compress: every group of 4 round-key words is bitsliced and packed into 2 words of comp_skey. */
+    for (i = 0, j = 0; i < nkf; i += 4, j += 2) {
+        uint64_t q[8];
+        /* The interleaved key is copied into all four word slots; the masks below then keep one bit per nibble from each. */
+        br_aes_ct64_interleave_in(&q[0], &q[4], skey + i);
+        q[1] = q[0];
+        q[2] = q[0];
+        q[3] = q[0];
+        q[5] = q[4];
+        q[6] = q[4];
+        q[7] = q[4];
+        br_aes_ct64_ortho(q);
+        comp_skey[j + 0] =
+            (q[0] & (uint64_t)0x1111111111111111)
+            | (q[1] & (uint64_t)0x2222222222222222)
+            | (q[2] & (uint64_t)0x4444444444444444)
+            | (q[3] & (uint64_t)0x8888888888888888);
+        comp_skey[j + 1] =
+            (q[4] & (uint64_t)0x1111111111111111)
+            | (q[5] & (uint64_t)0x2222222222222222)
+            | (q[6] & (uint64_t)0x4444444444444444)
+            | (q[7] & (uint64_t)0x8888888888888888);
+    }
+}
+
+static void br_aes_ct64_skey_expand(uint64_t *skey, const uint64_t *comp_skey, unsigned int nrounds) { /* expands 2 * (nrounds + 1) compressed words into 8 * (nrounds + 1) words of skey */
+    unsigned u, v, n;
+    /* n compressed words in, four expanded words out per compressed word. */
+    n = (nrounds + 1) << 1;
+    for (u = 0, v = 0; u < n; u ++, v += 4) {
+        uint64_t x0, x1, x2, x3;
+        /* Isolate bit 0, 1, 2 and 3 of every nibble, each moved down to bit 0 of its nibble. */
+        x0 = x1 = x2 = x3 = comp_skey[u];
+        x0 &= (uint64_t)0x1111111111111111;
+        x1 &= (uint64_t)0x2222222222222222;
+        x2 &= (uint64_t)0x4444444444444444;
+        x3 &= (uint64_t)0x8888888888888888;
+        x1 >>= 1;
+        x2 >>= 2;
+        x3 >>= 3;
+        skey[v + 0] = (x0 << 4) - x0; /* multiply by 15: copies the isolated bit into all four bits of its nibble */
+        skey[v + 1] = (x1 << 4) - x1;
+        skey[v + 2] = (x2 << 4) - x2;
+        skey[v + 3] = (x3 << 4) - x3;
+    }
+}
+
+
+/* XOR the eight round-key words sk[0..7] into the state words q[0..7]. */
+static inline void add_round_key(uint64_t *q, const uint64_t *sk) {
+    int word;
+
+    for (word = 0; word < 8; word++) {
+        q[word] ^= sk[word];
+    }
+}
+
+static inline void shift_rows(uint64_t *q) { /* applies the same bit permutation to each of the eight state words */
+    int i;
+    /* Per word: bits 0-15 stay; the 16-bit groups at bits 16-31, 32-47 and 48-63 are rotated right by 4, 8 and 12 bits. */
+    for (i = 0; i < 8; i ++) {
+        uint64_t x;
+        /* Each pair of masked terms below forms one of those 16-bit rotations. */
+        x = q[i];
+        q[i] = (x & (uint64_t)0x000000000000FFFF)
+               | ((x & (uint64_t)0x00000000FFF00000) >> 4)
+               | ((x & (uint64_t)0x00000000000F0000) << 12)
+               | ((x & (uint64_t)0x0000FF0000000000) >> 8)
+               | ((x & (uint64_t)0x000000FF00000000) << 8)
+               | ((x & (uint64_t)0xF000000000000000) >> 12)
+               | ((x & (uint64_t)0x0FFF000000000000) << 4);
+    }
+}
+
+static inline uint64_t rotr32(uint64_t x) { /* rotate a 64-bit word by 32 bits, i.e. swap its two halves */
+    return (x << 32) | (x >> 32);
+}
+
+static inline void mix_columns(uint64_t *q) { /* bitsliced MixColumns step, in place on q[0..7] */
+    uint64_t q0, q1, q2, q3, q4, q5, q6, q7;
+    uint64_t r0, r1, r2, r3, r4, r5, r6, r7;
+    /* Snapshot the inputs first: every output word depends on several input words. */
+    q0 = q[0];
+    q1 = q[1];
+    q2 = q[2];
+    q3 = q[3];
+    q4 = q[4];
+    q5 = q[5];
+    q6 = q[6];
+    q7 = q[7];
+    r0 = (q0 >> 16) | (q0 << 48);
+    r1 = (q1 >> 16) | (q1 << 48);
+    r2 = (q2 >> 16) | (q2 << 48);
+    r3 = (q3 >> 16) | (q3 << 48);
+    r4 = (q4 >> 16) | (q4 << 48);
+    r5 = (q5 >> 16) | (q5 << 48);
+    r6 = (q6 >> 16) | (q6 << 48);
+    r7 = (q7 >> 16) | (q7 << 48);
+    /* rN is qN rotated right by 16 bits; words 0, 1, 3 and 4 additionally fold in q7 ^ r7. */
+    q[0] = q7 ^ r7 ^ r0 ^ rotr32(q0 ^ r0);
+    q[1] = q0 ^ r0 ^ q7 ^ r7 ^ r1 ^ rotr32(q1 ^ r1);
+    q[2] = q1 ^ r1 ^ r2 ^ rotr32(q2 ^ r2);
+    q[3] = q2 ^ r2 ^ q7 ^ r7 ^ r3 ^ rotr32(q3 ^ r3);
+    q[4] = q3 ^ r3 ^ q7 ^ r7 ^ r4 ^ rotr32(q4 ^ r4);
+    q[5] = q4 ^ r4 ^ r5 ^ rotr32(q5 ^ r5);
+    q[6] = q5 ^ r5 ^ r6 ^ rotr32(q6 ^ r6);
+    q[7] = q6 ^ r6 ^ r7 ^ rotr32(q7 ^ r7);
+}
+
+
+/* Add 4 to a counter word that is stored byte-swapped (big-endian value held in *x). */
+static void inc4_be(uint32_t *x) {
+    uint32_t counter = br_swap32(*x);
+
+    counter += 4;
+    *x = br_swap32(counter);
+}
+
+
+static void aes_ecb4x(unsigned char out[64], const uint32_t ivw[16], const uint64_t *sk_exp, unsigned int nrounds) { /* encrypts four 16-byte blocks, given as 16 words, in one bitsliced pass */
+    uint32_t w[16];
+    uint64_t q[8];
+    unsigned int i;
+    /* Work on a copy so the caller's words are left untouched. */
+    memcpy(w, ivw, sizeof(w));
+    for (i = 0; i < 4; i++) {
+        br_aes_ct64_interleave_in(&q[i], &q[i + 4], w + (i << 2));
+    }
+    br_aes_ct64_ortho(q);
+    /* Initial round key, then nrounds - 1 full rounds; every round key occupies 8 words of sk_exp. */
+
+    add_round_key(q, sk_exp);
+    for (i = 1; i < nrounds; i++) {
+        br_aes_ct64_bitslice_Sbox(q);
+        shift_rows(q);
+        mix_columns(q);
+        add_round_key(q, sk_exp + (i << 3));
+    }
+    br_aes_ct64_bitslice_Sbox(q); /* final round: no mix_columns */
+    shift_rows(q);
+    add_round_key(q, sk_exp + 8 * nrounds);
+    /* Leave the bitsliced representation and serialise the 16 words little-endian. */
+    br_aes_ct64_ortho(q);
+    for (i = 0; i < 4; i ++) {
+        br_aes_ct64_interleave_out(w + (i << 2), q[i], q[i + 4]);
+    }
+    br_range_enc32le(out, w, 16);
+}
+
+
+static void aes_ctr4x(unsigned char out[64], uint32_t ivw[16], const uint64_t *sk_exp, unsigned int nrounds) { /* emits 64 key-stream bytes and advances the four counter blocks in ivw */
+    aes_ecb4x(out, ivw, sk_exp, nrounds);
+    /* Word 3 of each 4-word block is its counter (ivw[3], ivw[7], ivw[11], ivw[15]). */
+    /* Increase counter for next 4 blocks */
+    inc4_be(ivw + 3);
+    inc4_be(ivw + 7);
+    inc4_be(ivw + 11);
+    inc4_be(ivw + 15);
+}
+
+
+/*
+ * ECB-encrypt nblocks 16-byte blocks from in to out with the expanded key rkeys.
+ * Blocks are processed four at a time by aes_ecb4x; a final group of 1-3 blocks is
+ * encrypted through a temporary buffer so that only nblocks * 16 bytes of out are written.
+ */
+static void aes_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const uint64_t *rkeys, unsigned int nrounds) {
+    uint32_t blocks[16];
+    unsigned char t[64];
+
+    while (nblocks >= 4) {
+        br_range_dec32le(blocks, 16, in);
+        aes_ecb4x(out, blocks, rkeys, nrounds);
+        nblocks -= 4;
+        in += 64;
+        out += 64;
+    }
+
+    if (nblocks) {
+        /*
+         * Only nblocks * 4 words are loaded below, but aes_ecb4x consumes all 16.
+         * Zero the array first so the unused slots hold defined values instead of
+         * indeterminate stack contents.
+         */
+        memset(blocks, 0, sizeof blocks);
+        br_range_dec32le(blocks, nblocks * 4, in);
+        aes_ecb4x(t, blocks, rkeys, nrounds);
+        memcpy(out, t, nblocks * 16);
+    }
+}
+
+
+/*
+ * Write outlen bytes of AES-CTR key stream to out.
+ * The first three words of every counter block come from iv; the fourth word is a
+ * byte-swapped 32-bit block counter that starts at 0 for the first block.
+ */
+static void aes_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const uint64_t *rkeys, unsigned int nrounds) {
+    uint32_t ctr_blocks[16];
+    uint32_t first_counter = 0;
+    unsigned int lane;
+
+    /* Build four consecutive counter blocks that share the same nonce words. */
+    br_range_dec32le(ctr_blocks, 3, iv);
+    for (lane = 1; lane < 4; lane++) {
+        memcpy(ctr_blocks + 4 * lane, ctr_blocks, 3 * sizeof(uint32_t));
+    }
+    for (lane = 0; lane < 4; lane++) {
+        ctr_blocks[4 * lane + 3] = br_swap32(first_counter + lane);
+    }
+
+    /* Full 64-byte chunks go straight to out; an exact trailing 64 bytes takes the tail path. */
+    for (; outlen > 64; outlen -= 64, out += 64) {
+        aes_ctr4x(out, ctr_blocks, rkeys, nrounds);
+    }
+    if (outlen > 0) {
+        unsigned char last[64];
+
+        aes_ctr4x(last, ctr_blocks, rkeys, nrounds);
+        memcpy(out, last, outlen);
+    }
+}
+
+void aes128_ecb_keyexp(aes128ctx *r, const unsigned char *key) { /* key: 16 bytes; allocates r->sk_exp, to be freed with aes128_ctx_release */
+    uint64_t skey[22];
+    /* 22 = 2 words per round key for 11 round keys (compressed form). */
+    r->sk_exp = malloc(sizeof(uint64_t) * PQC_AES128_STATESIZE);
+    if (r->sk_exp == NULL) {
+        exit(111);
+    }
+    /* Allocation failure terminates the process above; there is no error return. */
+    br_aes_ct64_keysched(skey, key, 16);
+    br_aes_ct64_skey_expand(r->sk_exp, skey, 10);
+}
+/* CTR mode uses the same expanded key as ECB. */
+void aes128_ctr_keyexp(aes128ctx *r, const unsigned char *key) {
+    aes128_ecb_keyexp(r, key);
+}
+
+
+void aes192_ecb_keyexp(aes192ctx *r, const unsigned char *key) { /* key: 24 bytes; allocates r->sk_exp, to be freed with aes192_ctx_release */
+    uint64_t skey[26]; /* 2 words per round key for 13 round keys (compressed form) */
+    r->sk_exp = malloc(sizeof(uint64_t) * PQC_AES192_STATESIZE);
+    if (r->sk_exp == NULL) {
+        exit(111);
+    }
+    /* Allocation failure terminates the process above; there is no error return. */
+    br_aes_ct64_keysched(skey, key, 24);
+    br_aes_ct64_skey_expand(r->sk_exp, skey, 12);
+}
+
+/* CTR mode uses the same expanded key as ECB. */
+void aes192_ctr_keyexp(aes192ctx *r, const unsigned char *key) {
+    aes192_ecb_keyexp(r, key);
+}
+
+
+void aes256_ecb_keyexp(aes256ctx *r, const unsigned char *key) { /* key: 32 bytes; allocates r->sk_exp, to be freed with aes256_ctx_release */
+    uint64_t skey[30]; /* 2 words per round key for 15 round keys (compressed form) */
+    r->sk_exp = malloc(sizeof(uint64_t) * PQC_AES256_STATESIZE);
+    if (r->sk_exp == NULL) {
+        exit(111);
+    }
+    /* Allocation failure terminates the process above; there is no error return. */
+    br_aes_ct64_keysched(skey, key, 32);
+    br_aes_ct64_skey_expand(r->sk_exp, skey, 14);
+}
+
+/* CTR mode uses the same expanded key as ECB. */
+void aes256_ctr_keyexp(aes256ctx *r, const unsigned char *key) {
+    aes256_ecb_keyexp(r, key);
+}
+
+
+void aes128_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes128ctx *ctx) {
+    aes_ecb(out, in, nblocks, ctx->sk_exp, 10); /* AES-128: 10 rounds */
+}
+/* The wrappers in this group differ only in context type and round count. */
+void aes128_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes128ctx *ctx) {
+    aes_ctr(out, outlen, iv, ctx->sk_exp, 10); /* AES-128: 10 rounds */
+}
+
+void aes192_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes192ctx *ctx) {
+    aes_ecb(out, in, nblocks, ctx->sk_exp, 12); /* AES-192: 12 rounds */
+}
+
+void aes192_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes192ctx *ctx) {
+    aes_ctr(out, outlen, iv, ctx->sk_exp, 12); /* AES-192: 12 rounds */
+}
+
+void aes256_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes256ctx *ctx) {
+    aes_ecb(out, in, nblocks, ctx->sk_exp, 14); /* AES-256: 14 rounds */
+}
+
+void aes256_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes256ctx *ctx) {
+    aes_ctr(out, outlen, iv, ctx->sk_exp, 14); /* AES-256: 14 rounds */
+}
+
+/*
+ * Wipe an expanded key schedule of `words` 64-bit words and release it.
+ * memset is reached through a volatile function pointer so the compiler cannot discard
+ * the clearing as a dead store ahead of free(). The pointer is reset to NULL afterwards,
+ * which makes a repeated release of the same context harmless.
+ */
+static void aes_sk_exp_release(uint64_t **sk_exp, size_t words) {
+    typedef void *(*memset_t)(void *, int, size_t);
+    static volatile memset_t memset_func = memset;
+
+    if (*sk_exp != NULL) {
+        memset_func(*sk_exp, 0, words * sizeof(uint64_t));
+        free(*sk_exp);
+        *sk_exp = NULL;
+    }
+}
+
+/* Frees the AES-128 context created by aes128_ecb_keyexp / aes128_ctr_keyexp. */
+void aes128_ctx_release(aes128ctx *r) {
+    aes_sk_exp_release(&r->sk_exp, PQC_AES128_STATESIZE);
+}
+
+/* Frees the AES-192 context created by aes192_ecb_keyexp / aes192_ctr_keyexp. */
+void aes192_ctx_release(aes192ctx *r) {
+    aes_sk_exp_release(&r->sk_exp, PQC_AES192_STATESIZE);
+}
+
+/* Frees the AES-256 context created by aes256_ecb_keyexp / aes256_ctr_keyexp. */
+void aes256_ctx_release(aes256ctx *r) {
+    aes_sk_exp_release(&r->sk_exp, PQC_AES256_STATESIZE);
+}
+
+int AES_128_CTR(unsigned char *output, size_t outputByteLen, /* fills output with outputByteLen key-stream bytes; returns outputByteLen cast to int */
+                const unsigned char *input, size_t inputByteLen) { /* first 16 bytes of input are the AES-128 key; inputByteLen is not read */
+    aes128ctx ctx;
+    unsigned char iv[16] = { 0 };
+    /* All-zero nonce: the key stream depends on the key only. */
+    aes128_ctr_keyexp(&ctx, input);
+    aes128_ctr(output, outputByteLen, iv, &ctx);
+    aes128_ctx_release(&ctx);
+
+    return (int)outputByteLen;
+}
+
+void AES_256_ECB(const uint8_t *input, const unsigned char *key, unsigned char *output) { /* encrypts exactly one 16-byte block of input under a 32-byte key */
+    aes256ctx ctx;
+    /* The key schedule is rebuilt and released on every call. */
+    aes256_ecb_keyexp(&ctx, key);
+    aes256_ecb(output, input, 1, &ctx);
+    aes256_ctx_release(&ctx);
+}
--- a/src/common/generic/fips202.c
+++ b/src/common/generic/fips202.c
--- a/src/common/generic/include/aes.h
+++ b/src/common/generic/include/aes.h
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef AES_H
+#define AES_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output); /* the aes_c.c definition encrypts one 16-byte block under a 32-byte key */
+#define AES_ECB_encrypt AES_256_ECB
+/* AES_128_CTR resolves to the *_NI function when ENABLE_AESNI is defined, otherwise to the plain function declared in the #else branch. */
+#ifdef ENABLE_AESNI
+int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen,
+                   const unsigned char *input, size_t inputByteLen);
+int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen,
+                      const unsigned char *input, size_t inputByteLen); /* presumably a reduced-round (4-round) variant - TODO confirm against its definition */
+#define AES_128_CTR AES_128_CTR_NI
+#else
+int AES_128_CTR(unsigned char *output, size_t outputByteLen,
+                const unsigned char *input, size_t inputByteLen); /* input holds the 16-byte key; output receives outputByteLen key-stream bytes */
+#endif
+
+#endif
--- a/src/common/generic/include/bench.h
+++ b/src/common/generic/include/bench.h
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <inttypes.h>
+
+
+/* time.h declares clock_gettime() for the targets that time with the wall clock. */
+#if defined(TARGET_OS_UNIX) && (defined(TARGET_ARM) || defined(TARGET_ARM64) || defined(TARGET_OTHER))
+#include <time.h>
+#endif
+/* Targets on which cpucycles() returns nanoseconds rather than a cycle count. */
+#if (defined(TARGET_ARM) || defined(TARGET_ARM64) || defined(TARGET_S390X) || defined(TARGET_OTHER))
+#define print_bench_unit printf("nsec\n");
+#else
+#define print_bench_unit printf("cycles\n");
+#endif
+
+/*
+ * Unit label printed by BENCH_CODE_2. It must use the same target list as
+ * print_bench_unit: TARGET_OTHER is timed with clock_gettime(), i.e. in nanoseconds.
+ */
+#if (defined(TARGET_ARM) || defined(TARGET_ARM64) || defined(TARGET_S390X) || defined(TARGET_OTHER))
+#define BENCH_UNITS "nsec"
+#else
+#define BENCH_UNITS "cycles"
+#endif
+
+/*
+ * Timestamp used by the benchmark macros.
+ *  - TARGET_AMD64 / TARGET_X86: the 64-bit counter read with rdtsc.
+ *  - TARGET_S390X: the value stored by stckf, scaled by 1000 / 4096.
+ *  - everything else: CLOCK_REALTIME expressed in nanoseconds.
+ */
+static inline int64_t cpucycles(void) {
+#if (defined(TARGET_AMD64) || defined(TARGET_X86))
+    unsigned int hi, lo;
+
+    asm volatile ("rdtsc" : "=a" (lo), "=d"(hi));
+    return ((int64_t) lo) | (((int64_t) hi) << 32);
+#elif (defined(TARGET_S390X))
+    uint64_t tod;
+    asm volatile("stckf %0\n" : "=Q" (tod) : : "cc");
+    return (tod * 1000 / 4096);
+#else
+    struct timespec time;
+    clock_gettime(CLOCK_REALTIME, &time);
+    /*
+     * Stay in 64-bit integer arithmetic: tv_sec * 1e9 evaluated in double exceeds 2^53
+     * for current epoch times, which rounds away the low nanosecond bits.
+     */
+    return (int64_t)time.tv_sec * 1000000000 + (int64_t)time.tv_nsec;
+#endif
+}
+
+/*
+ * qsort comparator for uint64_t values in ascending order.
+ * Returns -1, 0 or 1. The values are compared rather than subtracted, because a 64-bit
+ * difference truncated to int can flip sign or collapse to 0 for unequal inputs.
+ */
+static inline int cmpfunc (const void *a, const void *b) {
+    const uint64_t lhs = *(const uint64_t *)a;
+    const uint64_t rhs = *(const uint64_t *)b;
+
+    return (lhs > rhs) - (lhs < rhs);
+}
+
+#define BENCH_CODE_1(r) /* opens a loop of r timed iterations; needs i, cycles and cycles1 declared by the caller; close it with BENCH_CODE_2 */ \
+    cycles = 0; \
+    for (i = 0; i < (r); ++i) { \
+        cycles1 = cpucycles();
+/* BENCH_CODE_2 closes the loop and prints median and average; besides the names above it needs cycles2, cycles_list, LIST_SIZE and runs (not r) in scope, and runs must be non-zero. */
+#define BENCH_CODE_2(name, csv) \
+        cycles2 = cpucycles(); \
+        if(i < LIST_SIZE) \
+          cycles_list[i] = (cycles2 - cycles1);\
+        cycles = cycles + (cycles2 - cycles1); \
+    } \
+    qsort(cycles_list, (runs < LIST_SIZE)? runs : LIST_SIZE, sizeof(uint64_t), cmpfunc);\
+    if (csv) \
+      printf("%2" PRId64 ",", cycles_list[(runs < LIST_SIZE)? runs/2 : LIST_SIZE/2]); \
+    else { \
+      printf("  %-20s-> median: %2" PRId64 ", average: %2" PRId64 " ", name, \
+      cycles_list[(runs < LIST_SIZE)? runs/2 : LIST_SIZE/2], (cycles / runs)); \
+      printf("%s\n", BENCH_UNITS); \
+    }
--- a/src/common/generic/include/fips202.h
+++ b/src/common/generic/include/fips202.h
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef FIPS202_H
+#define FIPS202_H
+
+#include <stddef.h>
+
+int SHAKE128(unsigned char *output, size_t outputByteLen, const unsigned char *input, size_t inputByteLen); /* presumably writes outputByteLen bytes of SHAKE128(input) to output; return value not visible here - TODO confirm in fips202.c */
+int SHAKE256(unsigned char *output, size_t outputByteLen, const unsigned char *input, size_t inputByteLen); /* same interface, presumably SHAKE256 - TODO confirm in fips202.c */
+
+#endif
--- a/src/common/generic/include/tutil.h
+++ b/src/common/generic/include/tutil.h
@@ -0,0 +1,33 @@
+#ifndef TUTIL_H
+#define TUTIL_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* Byte-order reversal of 32-bit (BSWAP32) and 64-bit (BSWAP64) values. */
+#if defined(__GNUC__) || defined(__clang__)
+#define BSWAP32(i) __builtin_bswap32((i))
+#define BSWAP64(i) __builtin_bswap64((i))
+#else
+/*
+ * Portable fallback. The argument is narrowed to uint32_t before shifting so that wider
+ * or signed operands cannot leak bits above bit 31 or overflow on the left shift.
+ */
+#define BSWAP32(i) ((((uint32_t)(i) >> 24) & 0xff) | (((uint32_t)(i) >> 8) & 0xff00) | (((uint32_t)(i) & 0xff00) << 8) | ((uint32_t)(i) << 24))
+/* Swap each 32-bit half, then exchange the halves; the swapped low half is widened before it is shifted up. */
+#define BSWAP64(i) (((uint64_t)BSWAP32((uint64_t)(i) >> 32)) | ((uint64_t)BSWAP32(i) << 32))
+#endif
+
+#if defined(RADIX_64) /* exactly one of RADIX_64 / RADIX_32 must be defined by the build */
+#define digit_t uint64_t /* unsigned machine word */
+#define sdigit_t int64_t /* signed word of the same width */
+#define DIGIT_LEN 8 /* bytes per digit */
+#define RADIX 64 /* bits per digit */
+#define LOG2RADIX 6 /* log2(RADIX) */
+#define BSWAP_DIGIT(i) BSWAP64(i)
+#elif defined(RADIX_32)
+#define digit_t uint32_t /* unsigned machine word */
+#define sdigit_t int32_t /* signed word of the same width */
+#define DIGIT_LEN 4 /* bytes per digit */
+#define RADIX 32 /* bits per digit */
+#define LOG2RADIX 5 /* log2(RADIX) */
+#define BSWAP_DIGIT(i) BSWAP32(i)
+#else
+#error "Radix must be 32bit or 64 bit"
+#endif
+
+#endif
--- a/src/common/generic/mem.c
+++ b/src/common/generic/mem.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <string.h>
+#include <stdlib.h>
+
+/*
+ * Overwrite `size` bytes at `mem` with zeros, then release the allocation
+ * with free(). A NULL `mem` is ignored.
+ */
+void sqisign_secure_free(void *mem, size_t size) {
+    typedef void *(*wipe_fn_t)(void *, int, size_t);
+    /* memset is reached through a volatile function pointer; this is
+     * intended to stop the compiler from eliding the wipe before free(). */
+    static volatile wipe_fn_t wipe = memset;
+
+    if (mem == NULL) {
+        return;
+    }
+    wipe(mem, 0, size);
+    free(mem);
+}
+/*
+ * Overwrite `size` bytes at `mem` with zeros without freeing the buffer.
+ * A NULL `mem` or a zero `size` is a no-op, matching the NULL handling of
+ * sqisign_secure_free().
+ */
+void sqisign_secure_clear(void *mem, size_t size) {
+    typedef void *(*memset_t)(void *, int, size_t);
+    /* memset is reached through a volatile function pointer; this is
+     * intended to stop the compiler from eliding the wipe as a dead store. */
+    static volatile memset_t memset_func = memset;
+
+    if (mem == NULL || size == 0) {
+        return;
+    }
+    memset_func(mem, 0, size);
+}
--- a/src/common/generic/randombytes_ctrdrbg.c
+++ b/src/common/generic/randombytes_ctrdrbg.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: Apache-2.0 and Unknown
+//
+/*
+NIST-developed software is provided by NIST as a public service. You may use, copy, and distribute copies of the software in any medium, provided that you keep intact this entire notice. You may improve, modify, and create derivative works of the software or any portion of the software, and you may copy and distribute such modifications or works. Modified works should carry a notice stating that you changed the software and should note the date and nature of any such change. Please explicitly acknowledge the National Institute of Standards and Technology as the source of the software.
+
+NIST-developed software is expressly provided "AS IS." NIST MAKES NO WARRANTY OF ANY KIND, EXPRESS, IMPLIED, IN FACT, OR ARISING BY OPERATION OF LAW, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND DATA ACCURACY. NIST NEITHER REPRESENTS NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST DOES NOT WARRANT OR MAKE ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE OR THE RESULTS THEREOF, INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY, RELIABILITY, OR USEFULNESS OF THE SOFTWARE.
+
+You are solely responsible for determining the appropriateness of using and distributing the software and you assume all risks associated with its use, including but not limited to the risks and costs of program errors, compliance with applicable laws, damage to or loss of data, programs or equipment, and the unavailability or interruption of operation. This software is not intended to be used in any situation where a failure could cause risk of injury or damage to property. The software developed by NIST employees is not subject to copyright protection within the United States.
+*/
+
+#include <string.h>
+
+#include <aes.h>
+
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+// Status codes. Within the visible part of this file none of these names is
+// referenced (randombytes_nist returns a literal 0).
+// NOTE(review): presumably kept from the NIST reference code - confirm whether
+// they are still needed.
+#define RNG_SUCCESS      0
+#define RNG_BAD_MAXLEN  -1
+#define RNG_BAD_OUTBUF  -2
+#define RNG_BAD_REQ_LEN -3
+
+// Thin adapter around AES_ECB_encrypt() from <aes.h>: note the argument order
+// changes from (key, ctr, buffer) here to (ctr, key, buffer) in the callee.
+// NOTE(review): presumably encrypts the single 16-byte block `ctr` under the
+// 32-byte `key` into `buffer` - confirm against aes.h.
+static __inline void AES256_ECB(unsigned char *key, unsigned char *ctr, unsigned char *buffer) {
+    AES_ECB_encrypt(ctr, key, buffer);
+}
+
+// State for an AES-based expandable output stream. This type is not used by
+// any code visible in this file.
+// NOTE(review): field meanings below are inferred from names - confirm.
+typedef struct {
+    unsigned char   buffer[16];
+    int             buffer_pos;
+    unsigned long   length_remaining;
+    unsigned char   key[32];
+    unsigned char   ctr[16];
+} AES_XOF_struct;
+
+// DRBG working state: a 32-byte AES key, a 16-byte counter block V and a
+// counter that is set to 1 on initialisation and incremented once per
+// randombytes_nist() call.
+typedef struct {
+    unsigned char   Key[32];
+    unsigned char   V[16];
+    int             reseed_counter;
+} AES256_CTR_DRBG_struct;
+
+
+// Forward declaration: the definition appears further down in this file, but
+// the function is called by the init and generate routines above it.
+void
+AES256_CTR_DRBG_Update(unsigned char *provided_data,
+                       unsigned char *Key,
+                       unsigned char *V);
+
+// Single global DRBG state shared by every function in this file. It has
+// external linkage and no locking is performed around it here.
+AES256_CTR_DRBG_struct  DRBG_ctx;
+
+/*
+ * (Re)initialise the global DRBG state from 48 bytes of entropy.
+ *
+ * entropy_input          - 48 bytes of seed material (all 48 are read)
+ * personalization_string - optional 48 bytes XORed into the seed; may be NULL
+ * security_strength      - accepted for interface compatibility, ignored
+ */
+static void
+randombytes_init_nist(unsigned char *entropy_input,
+                      unsigned char *personalization_string,
+                      int security_strength) {
+    enum { SEED_BYTES = 48 };
+    unsigned char seed[SEED_BYTES];
+
+    (void)security_strength;
+
+    memcpy(seed, entropy_input, SEED_BYTES);
+    if (personalization_string != NULL) {
+        for (int idx = 0; idx < SEED_BYTES; idx++) {
+            seed[idx] ^= personalization_string[idx];
+        }
+    }
+
+    /* Start from an all-zero key and counter, then absorb the seed */
+    memset(DRBG_ctx.Key, 0x00, sizeof DRBG_ctx.Key);
+    memset(DRBG_ctx.V, 0x00, sizeof DRBG_ctx.V);
+    AES256_CTR_DRBG_Update(seed, DRBG_ctx.Key, DRBG_ctx.V);
+
+    DRBG_ctx.reseed_counter = 1;
+}
+
+/*
+ * Fill x[0..xlen) with output from the global DRBG, then refresh the DRBG
+ * key and counter and bump the reseed counter. Always returns 0.
+ */
+static int
+randombytes_nist(unsigned char *x, size_t xlen) {
+    unsigned char   keystream[16];
+    size_t          written = 0;
+    size_t          remaining = xlen;
+
+    while ( remaining != 0 ) {
+        /* Add one to V, treated as a 16-byte big-endian counter */
+        int pos = 15;
+        while ( pos >= 0 && DRBG_ctx.V[pos] == 0xff ) {
+            DRBG_ctx.V[pos] = 0x00;
+            pos--;
+        }
+        if ( pos >= 0 ) {
+            DRBG_ctx.V[pos]++;
+        }
+
+        AES256_ECB(DRBG_ctx.Key, DRBG_ctx.V, keystream);
+
+        /* Copy a whole block, or just the tail on the final iteration */
+        size_t take = ( remaining > 15 ) ? 16 : remaining;
+        memcpy(x + written, keystream, take);
+        written += take;
+        remaining -= take;
+    }
+
+    AES256_CTR_DRBG_Update(NULL, DRBG_ctx.Key, DRBG_ctx.V);
+    DRBG_ctx.reseed_counter++;
+
+    return 0;
+}
+
+/*
+ * Derive a fresh (Key, V) pair in place.
+ *
+ * Three consecutive counter blocks are encrypted under the current Key to
+ * produce 48 bytes; if provided_data is non-NULL its 48 bytes are XORed in.
+ * The first 32 bytes become the new Key and the last 16 the new V.
+ */
+void
+AES256_CTR_DRBG_Update(unsigned char *provided_data,
+                       unsigned char *Key,
+                       unsigned char *V) {
+    unsigned char   fresh[48];
+    unsigned char  *dst = fresh;
+
+    for (int blk = 0; blk < 3; blk++, dst += 16) {
+        /* Add one to V, treated as a 16-byte big-endian counter */
+        int pos = 15;
+        while ( pos >= 0 && V[pos] == 0xff ) {
+            V[pos] = 0x00;
+            pos--;
+        }
+        if ( pos >= 0 ) {
+            V[pos]++;
+        }
+
+        AES256_ECB(Key, V, dst);
+    }
+
+    if ( provided_data != NULL ) {
+        for (int idx = 0; idx < 48; idx++) {
+            fresh[idx] ^= provided_data[idx];
+        }
+    }
+
+    memcpy(Key, fresh, 32);
+    memcpy(V, fresh + 32, 16);
+}
+
+/*
+ * Public entry point: write nbytes of DRBG output to random_array.
+ * Returns the status of randombytes_nist(), which is always 0.
+ *
+ * With ENABLE_CT_TESTING the produced bytes are marked as "undefined" for
+ * Valgrind's memcheck. The length passed to the macro must be the buffer
+ * length (nbytes); passing the status code would mark zero bytes.
+ */
+int randombytes(unsigned char *random_array, unsigned long long nbytes) {
+    int ret = randombytes_nist(random_array, nbytes);
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_UNDEFINED(random_array, nbytes);
+#endif
+    return ret;
+}
+
+/*
+ * Public seeding entry point; forwards all arguments unchanged to
+ * randombytes_init_nist().
+ *
+ * entropy_input          - 48 bytes of seed material
+ * personalization_string - optional 48 bytes mixed into the seed, or NULL
+ * security_strength      - ignored by the callee
+ */
+void
+randombytes_init(unsigned char *entropy_input,
+                 unsigned char *personalization_string,
+                 int security_strength) {
+    /* No `return <expr>;` here: a void function may not return an expression
+     * in ISO C, even one of type void. */
+    randombytes_init_nist(entropy_input, personalization_string, security_strength);
+}
--- a/src/common/generic/randombytes_system.c
+++ b/src/common/generic/randombytes_system.c
@@ -0,0 +1,396 @@
+// SPDX-License-Identifier: MIT
+
+/*
+The MIT License
+Copyright (c) 2017 Daan Sprenkels <hello@dsprenkels.com>
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+// When compiling on Linux, we need to define _GNU_SOURCE *before* any system
+// header is included. Otherwise SYS_getrandom will not be
+// declared.
+#if defined(__linux__) || defined(__GNU__)
+# define _GNU_SOURCE
+#endif /* defined(__linux__) || defined(__GNU__) */
+
+#if defined(_WIN32)
+/* Windows */
+# include <windows.h>
+# include <wincrypt.h> /* CryptAcquireContext, CryptGenRandom */
+#endif /* defined(_WIN32) */
+
+/* wasi */
+#if defined(__wasi__)
+#include <stdlib.h>
+#endif
+
+/* kFreeBSD */
+#if defined(__FreeBSD_kernel__) && defined(__GLIBC__)
+# define GNU_KFREEBSD
+#endif
+
+#if defined(__linux__) || defined(__GNU__) || defined(GNU_KFREEBSD)
+/* Linux */
+// We would need to include <linux/random.h>, but not every target has access
+// to the linux headers. We only need RNDGETENTCNT, so we instead inline it.
+// RNDGETENTCNT is originally defined in `include/uapi/linux/random.h` in the
+// linux repo.
+# define RNDGETENTCNT 0x80045200
+
+# include <assert.h>
+# include <errno.h>
+# include <fcntl.h>
+# include <poll.h>
+# include <stdint.h>
+# include <stdio.h>
+# include <sys/ioctl.h>
+# if (defined(__linux__) || defined(__GNU__)) && defined(__GLIBC__) && ((__GLIBC__ > 2) || (__GLIBC_MINOR__ > 24))
+#  define USE_GLIBC
+#  include <sys/random.h>
+# endif /* (defined(__linux__) || defined(__GNU__)) && defined(__GLIBC__) && ((__GLIBC__ > 2) || (__GLIBC_MINOR__ > 24)) */
+# include <sys/stat.h>
+# include <sys/syscall.h>
+# include <sys/types.h>
+# include <unistd.h>
+
+// We need SSIZE_MAX as the maximum read len from /dev/urandom
+# if !defined(SSIZE_MAX)
+#  define SSIZE_MAX (SIZE_MAX / 2 - 1)
+# endif /* !defined(SSIZE_MAX) */
+
+#endif /* defined(__linux__) || defined(__GNU__) || defined(GNU_KFREEBSD) */
+
+
+#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))
+/* Dragonfly, FreeBSD, NetBSD, OpenBSD (has arc4random) */
+# include <sys/param.h>
+# if defined(BSD)
+#  include <stdlib.h>
+# endif
+/* GNU/Hurd defines BSD in sys/param.h which causes problems later */
+# if defined(__GNU__)
+#  undef BSD
+# endif
+#endif
+
+#if defined(__EMSCRIPTEN__)
+# include <assert.h>
+# include <emscripten.h>
+# include <errno.h>
+# include <stdbool.h>
+#endif /* defined(__EMSCRIPTEN__) */
+
+
+#if defined(_WIN32)
+/*
+ * Fill buf[0..n) using the Windows CryptoAPI.
+ *
+ * Returns 0 on success and -1 if acquiring the provider, generating bytes or
+ * releasing the provider fails. The provider handle is released on every
+ * path after it has been acquired.
+ */
+static int randombytes_win32_randombytes(void* buf, size_t n)
+{
+	HCRYPTPROV ctx;
+	BOOL tmp;
+	DWORD to_read = 0;
+	const size_t MAX_DWORD = 0xFFFFFFFF;
+
+	tmp = CryptAcquireContext(&ctx, NULL, NULL, PROV_RSA_FULL,
+	                          CRYPT_VERIFYCONTEXT);
+	if (tmp == FALSE) return -1;
+
+	while (n > 0) {
+		/* CryptGenRandom takes a DWORD length, so split large requests */
+		to_read = (DWORD)(n < MAX_DWORD ? n : MAX_DWORD);
+		tmp = CryptGenRandom(ctx, to_read, (BYTE*) buf);
+		if (tmp == FALSE) {
+			/* Do not leak the provider handle on failure */
+			CryptReleaseContext(ctx, 0);
+			return -1;
+		}
+		buf = ((char*)buf) + to_read;
+		n -= to_read;
+	}
+
+	tmp = CryptReleaseContext(ctx, 0);
+	if (tmp == FALSE) return -1;
+
+	return 0;
+}
+#endif /* defined(_WIN32) */
+
+#if defined(__wasi__)
+/* WASI backend: fill buf[0..n) via arc4random_buf(). Always returns 0, since
+ * arc4random_buf() has no return value to report. */
+static int randombytes_wasi_randombytes(void *buf, size_t n) {
+	arc4random_buf(buf, n);
+	return 0;
+}
+#endif /* defined(__wasi__) */
+
+#if (defined(__linux__) || defined(__GNU__)) && (defined(USE_GLIBC) || defined(SYS_getrandom))
+# if defined(USE_GLIBC)
+// getrandom is declared in glibc.
+# elif defined(SYS_getrandom)
+/* Local getrandom() shim, compiled only when glibc's declaration is not used
+ * (USE_GLIBC undefined) but SYS_getrandom is defined: it forwards directly to
+ * the raw system call and returns its result. */
+static ssize_t getrandom(void *buf, size_t buflen, unsigned int flags) {
+	return syscall(SYS_getrandom, buf, buflen, flags);
+}
+# endif
+
+/*
+ * Fill buf[0..n) using getrandom().
+ *
+ * The request is issued in chunks of at most 33554431 bytes, and a call
+ * interrupted by a signal (EINTR) is retried. Returns 0 on success, or the
+ * negative getrandom() result on any other failure.
+ */
+static int randombytes_linux_randombytes_getrandom(void *buf, size_t n)
+{
+	/* getrandom does not allow chunks larger than 33554431 */
+	const size_t max_chunk = 33554431;
+	char *out = (char *)buf;
+	size_t remaining = n;
+
+	while (remaining != 0) {
+		const size_t want = remaining <= max_chunk ? remaining : max_chunk;
+		int got;
+
+		do {
+			got = getrandom(out, want, 0);
+		} while (got == -1 && errno == EINTR);
+
+		if (got < 0) {
+			return got;
+		}
+		/* A short read is fine: advance and ask for the rest */
+		out += got;
+		remaining -= got;
+	}
+	assert(remaining == 0);
+	return 0;
+}
+#endif /* (defined(__linux__) || defined(__GNU__)) && (defined(USE_GLIBC) || defined(SYS_getrandom)) */
+
+#if (defined(__linux__) || defined(GNU_KFREEBSD)) && !defined(SYS_getrandom)
+
+# if defined(__linux__)
+/* Issue the RNDGETENTCNT ioctl on the open descriptor `device`, storing the
+ * result in *entropy. Returns whatever ioctl() returns. */
+static int randombytes_linux_read_entropy_ioctl(int device, int *entropy)
+{
+	return ioctl(device, RNDGETENTCNT, entropy);
+}
+
+/*
+ * Re-read one decimal integer from the start of `stream` into *entropy.
+ *
+ * A failed parse is retried for as long as errno reads EINTR. Returns 0 once
+ * a value has been parsed and -1 on any other failure.
+ */
+static int randombytes_linux_read_entropy_proc(FILE *stream, int *entropy)
+{
+	for (;;) {
+		rewind(stream);
+		if (fscanf(stream, "%d", entropy) == 1) {
+			return 0;
+		}
+		if (errno != EINTR) {
+			return -1;
+		}
+	}
+}
+
+/*
+ * Block until the kernel reports at least 128 bits of entropy.
+ *
+ * `device` is an open descriptor on which the RNDGETENTCNT ioctl is tried
+ * first. If that ioctl fails with ENOTTY or ENOSYS, the count is instead read
+ * from /proc/sys/kernel/random/entropy_avail. Waiting is done by poll()ing
+ * /dev/random without reading from it.
+ *
+ * Returns 0 on success and a non-zero value on failure. Every descriptor and
+ * stream opened by this function is closed on every return path.
+ */
+static int randombytes_linux_wait_for_entropy(int device)
+{
+	/* We will block on /dev/random, because any increase in the OS' entropy
+	 * level will unblock the request. I use poll here (as does libsodium),
+	 * because we don't *actually* want to read from the device. */
+	enum { IOCTL, PROC } strategy = IOCTL;
+	const int bits = 128;
+	struct pollfd pfd;
+	int fd;
+	FILE *proc_file = NULL;
+	int retcode, retcode_error = 0; // Used as return codes throughout this function
+	int entropy = 0;
+
+	/* If the device has enough entropy already, we will want to return early */
+	retcode = randombytes_linux_read_entropy_ioctl(device, &entropy);
+	if (retcode != 0 && (errno == ENOTTY || errno == ENOSYS)) {
+		// The ioctl call on /dev/urandom has failed due to a
+		//   - ENOTTY (unsupported action), or
+		//   - ENOSYS (invalid ioctl; this happens on MIPS, see #22).
+		//
+		// We will fall back to reading from
+		// `/proc/sys/kernel/random/entropy_avail`.  This is less ideal,
+		// because it allocates a file descriptor, and it may not work
+		// in a chroot.  But at this point it seems we have no better
+		// options left.
+		strategy = PROC;
+		// Open the entropy count file
+		proc_file = fopen("/proc/sys/kernel/random/entropy_avail", "r");
+		if (proc_file == NULL) {
+			return -1;
+		}
+	} else if (retcode != 0) {
+		// Unrecoverable ioctl error
+		return -1;
+	}
+	if (entropy >= bits) {
+		// Do not leak the proc stream if it was opened above
+		if (strategy == PROC) {
+			fclose(proc_file);
+		}
+		return 0;
+	}
+
+	do {
+		fd = open("/dev/random", O_RDONLY);
+	} while (fd == -1 && errno == EINTR); /* EAGAIN will not occur */
+	if (fd == -1) {
+		/* Unrecoverable IO error; release the proc stream before leaving */
+		if (strategy == PROC) {
+			fclose(proc_file);
+		}
+		return -1;
+	}
+
+	pfd.fd = fd;
+	pfd.events = POLLIN;
+	for (;;) {
+		retcode = poll(&pfd, 1, -1);
+		if (retcode == -1 && (errno == EINTR || errno == EAGAIN)) {
+			continue;
+		} else if (retcode == 1) {
+			if (strategy == IOCTL) {
+				retcode = randombytes_linux_read_entropy_ioctl(device, &entropy);
+			} else if (strategy == PROC) {
+				retcode = randombytes_linux_read_entropy_proc(proc_file, &entropy);
+			} else {
+				// Unreachable; report an error through the common
+				// cleanup path instead of returning with fd open
+				retcode = -1;
+			}
+
+			if (retcode != 0) {
+				// Unrecoverable I/O error
+				retcode_error = retcode;
+				break;
+			}
+			if (entropy >= bits) {
+				break;
+			}
+		} else {
+			// Unreachable: poll() should only return -1 or 1
+			retcode_error = -1;
+			break;
+		}
+	}
+	do {
+		retcode = close(fd);
+	} while (retcode == -1 && errno == EINTR);
+	if (strategy == PROC) {
+		do {
+			retcode = fclose(proc_file);
+		} while (retcode == -1 && errno == EINTR);
+	}
+	if (retcode_error != 0) {
+		return retcode_error;
+	}
+	return retcode;
+}
+# endif /* defined(__linux__) */
+
+
+/*
+ * Fill buf[0..n) by reading /dev/urandom.
+ *
+ * On Linux the function first waits until the kernel reports enough entropy.
+ * Reads interrupted by EINTR/EAGAIN are retried, and short reads are
+ * continued. Returns 0 on success and -1 on failure; the descriptor is
+ * closed on every path once it has been opened.
+ */
+static int randombytes_linux_randombytes_urandom(void *buf, size_t n)
+{
+	int fd;
+	size_t offset = 0, count;
+	ssize_t tmp;
+	do {
+		fd = open("/dev/urandom", O_RDONLY);
+	} while (fd == -1 && errno == EINTR);
+	if (fd == -1) return -1;
+# if defined(__linux__)
+	if (randombytes_linux_wait_for_entropy(fd) == -1) {
+		close(fd); /* do not leak the descriptor on failure */
+		return -1;
+	}
+# endif
+
+	while (n > 0) {
+		/* read() cannot report more than SSIZE_MAX bytes per call */
+		count = n <= SSIZE_MAX ? n : SSIZE_MAX;
+		tmp = read(fd, (char *)buf + offset, count);
+		if (tmp == -1 && (errno == EAGAIN || errno == EINTR)) {
+			continue;
+		}
+		if (tmp == -1) {
+			/* Unrecoverable IO error */
+			close(fd);
+			return -1;
+		}
+		offset += tmp;
+		n -= tmp;
+	}
+	close(fd);
+	assert(n == 0);
+	return 0;
+}
+#endif /* (defined(__linux__) || defined(GNU_KFREEBSD)) && !defined(SYS_getrandom) */
+
+
+#if defined(BSD)
+/* BSD backend: fill buf[0..n) via arc4random_buf(). Always returns 0, since
+ * arc4random_buf() has no return value to report. */
+static int randombytes_bsd_randombytes(void *buf, size_t n)
+{
+	arc4random_buf(buf, n);
+	return 0;
+}
+#endif /* defined(BSD) */
+
+
+#if defined(__EMSCRIPTEN__)
+/*
+ * Emscripten backend: fill buf[0..n) using the Node.js `crypto` module.
+ *
+ * The inline JavaScript yields 0 on success, -1 if crypto.randomBytes (or the
+ * copy into memory) throws, and -2 if `require('crypto')` throws.
+ *
+ * Returns 0 on success. On failure returns -1 with errno set to EINVAL
+ * (generation failed) or ENOSYS (crypto module unavailable).
+ */
+static int randombytes_js_randombytes_nodejs(void *buf, size_t n) {
+	const int ret = EM_ASM_INT({
+		var crypto;
+		try {
+			crypto = require('crypto');
+		} catch (error) {
+			return -2;
+		}
+		try {
+			writeArrayToMemory(crypto.randomBytes($1), $0);
+			return 0;
+		} catch (error) {
+			return -1;
+		}
+	}, buf, n);
+	switch (ret) {
+	case 0:
+		return 0;
+	case -1:
+		errno = EINVAL;
+		return -1;
+	case -2:
+		errno = ENOSYS;
+		return -1;
+	}
+	assert(false); // Unreachable
+	/* With NDEBUG the assert vanishes; still return a defined failure value
+	 * rather than falling off the end of a non-void function. */
+	return -1;
+}
+#endif /* defined(__EMSCRIPTEN__) */
+
+
+/*
+ * Dispatch to the one entropy backend selected at compile time and return
+ * its status. Compilation fails with an #error on unsupported platforms.
+ */
+static int randombytes_select(void *buf, size_t n)
+{
+#if defined(__EMSCRIPTEN__)
+	/* Node.js crypto module via Emscripten */
+	return randombytes_js_randombytes_nodejs(buf, n);
+#elif defined(__linux__) || defined(__GNU__) || defined(GNU_KFREEBSD)
+# if defined(USE_GLIBC) || defined(SYS_getrandom)
+	/* getrandom(), either from glibc or through the raw system call */
+	return randombytes_linux_randombytes_getrandom(buf, n);
+# else
+	/* No getrandom available: wait for entropy, then read /dev/urandom */
+	return randombytes_linux_randombytes_urandom(buf, n);
+# endif
+#elif defined(BSD)
+	/* arc4random_buf() */
+	return randombytes_bsd_randombytes(buf, n);
+#elif defined(_WIN32)
+	/* Windows CryptoAPI */
+	return randombytes_win32_randombytes(buf, n);
+#elif defined(__wasi__)
+	/* WASI libc */
+	return randombytes_wasi_randombytes(buf, n);
+#else
+# error "randombytes(...) is not supported on this platform"
+#endif
+}
+
+/*
+ * Public entry point: fill x[0..xlen) from the platform entropy source and
+ * return the backend's status code.
+ *
+ * With ENABLE_CT_TESTING the whole output buffer is marked as "undefined"
+ * for Valgrind's memcheck.
+ */
+int randombytes(unsigned char *x, unsigned long long xlen) {
+    const int status = randombytes_select(x, (size_t) xlen);
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_UNDEFINED(x, xlen);
+#endif
+    return status;
+}
+
+/* Seeding is a no-op for the system RNG: all three arguments are ignored.
+ * The function has the same signature as the randombytes_init() in
+ * randombytes_ctrdrbg.c. */
+void randombytes_init(unsigned char *entropy_input,
+                      unsigned char *personalization_string,
+                      int security_strength) {
+    (void) entropy_input;
+    (void) personalization_string;
+    (void) security_strength;
+}
--- a/src/ec/CMakeLists.txt
+++ b/src/ec/CMakeLists.txt
@@ -0,0 +1 @@
+# Include the CMake script whose path is held in SELECT_IMPL_TYPE. The variable
+# is set outside this file.
+# NOTE(review): presumably this picks the ref/ or an optimised implementation
+# directory - confirm where it is defined.
+include(${SELECT_IMPL_TYPE})
--- a/src/ec/ref/CMakeLists.txt
+++ b/src/ec/ref/CMakeLists.txt
@@ -0,0 +1,3 @@
+# Directory holding the x-only elliptic-curve sources, for use by the script
+# included below.
+set(ECX_DIR ${CMAKE_CURRENT_SOURCE_DIR}/ecx)
+
+# Include the CMake script whose path is held in SELECT_SQISIGN_VARIANT. The
+# variable is set outside this file.
+include(${SELECT_SQISIGN_VARIANT})
--- a/src/ec/ref/ecx/basis.c
+++ b/src/ec/ref/ecx/basis.c
@@ -0,0 +1,508 @@
+#include "isog.h"
+
+
+// x-only point tripling: writes a projective (x:z) representation into Q,
+// computed from P and the curve constant A3.
+//   Q  - output point
+//   P  - input point; its coordinates are only read before Q is first
+//        written, so Q may alias P (callers in this file rely on that)
+//   A3 - curve coefficient pair; A3->x and A3->z are used as described below
+static void xTPL(ec_point_t* Q, const ec_point_t* P, const ec_point_t* A3)
+{
+    /* ----------------------------------------------------------------------------- *
+     * Differential point tripling given the montgomery coefficient A3 = (A+2C:A-2C)
+     * ----------------------------------------------------------------------------- */
+     
+    fp2_t t0, t1, t2, t3, t4;
+    fp2_sub(&t0, &P->x, &P->z);     // x - z
+    fp2_sqr(&t2, &t0);              // (x - z)^2
+    fp2_add(&t1, &P->x, &P->z);     // x + z
+    fp2_sqr(&t3, &t1);              // (x + z)^2
+    fp2_add(&t4, &t1, &t0);         // 2x
+    fp2_sub(&t0, &t1, &t0);         // 2z
+    fp2_sqr(&t1, &t4);              // 4x^2
+    fp2_sub(&t1, &t1, &t3);
+    fp2_sub(&t1, &t1, &t2);         // 4x^2 - (x+z)^2 - (x-z)^2
+    // From here on Q->x and Q->z are used as scratch space until the end
+    fp2_mul(&Q->x, &t3, &A3->x);
+    fp2_mul(&t3, &Q->x, &t3);
+    fp2_mul(&Q->z, &t2, &A3->z);
+    fp2_mul(&t2, &t2, &Q->z);
+    fp2_sub(&t3, &t2, &t3);
+    fp2_sub(&t2, &Q->x, &Q->z);
+    fp2_mul(&t1, &t2, &t1);
+    fp2_add(&t2, &t3, &t1);
+    fp2_sqr(&t2, &t2);
+    fp2_mul(&Q->x, &t2, &t4);       // final x-coordinate
+    fp2_sub(&t1, &t3, &t1);
+    fp2_sqr(&t1, &t1);
+    fp2_mul(&Q->z, &t1, &t0);       // final z-coordinate
+}
+
+// Tests whether the projective x-coordinate (x:z) of P belongs to a point on
+// `curve` with coefficients (A:C), by checking whether
+// x*z*(C^2*x^2 + A*C*x*z + C^2*z^2) is a square in Fp2.
+// Returns the result of fp2_is_square() on that value.
+int ec_is_on_curve(const ec_curve_t* curve, const ec_point_t* P){
+
+    fp2_t t0, t1, t2;
+
+    // Check if xz*(C^2x^2+zACx+z^2C^2) is a square
+    fp2_mul(&t0, &curve->C, &P->x);  // t0 = C*x
+    fp2_mul(&t1, &t0, &P->z);        // t1 = C*x*z
+    fp2_mul(&t1, &t1, &curve->A);    // t1 = A*C*x*z
+    fp2_mul(&t2, &curve->C, &P->z);  // t2 = C*z
+    fp2_sqr(&t0, &t0);               // t0 = C^2*x^2
+    fp2_sqr(&t2, &t2);               // t2 = C^2*z^2
+    fp2_add(&t0, &t0, &t1);
+    fp2_add(&t0, &t0, &t2);          // t0 = C^2*x^2 + A*C*x*z + C^2*z^2
+    fp2_mul(&t0, &t0, &P->x);
+    fp2_mul(&t0, &t0, &P->z);        // t0 *= x*z
+    return fp2_is_square(&t0);
+}
+
+// Computes a projective x-coordinate for P-Q from the affine x-coordinates of
+// P and Q. Only P->x, Q->x and curve->A are read; P->z, Q->z and curve->C are
+// never touched, which is why the caller must normalize them to 1 first.
+// The sign ambiguity is resolved by whichever root fp2_sqrt() returns.
+// PQ must not alias P or Q: PQ->z is written before P->x and Q->x are last read.
+static void difference_point(ec_point_t* PQ, const ec_point_t* P, const ec_point_t* Q, const ec_curve_t* curve){
+    // Given P,Q in affine x-only, computes a deterministic choice for (P-Q)
+    // The points must be normalized to z=1 and the curve to C=1
+
+    fp2_t t0, t1, t2, t3;
+    
+    fp2_sub(&PQ->z, &P->x, &Q->x);  // P - Q
+    fp2_mul(&t2, &P->x, &Q->x);     // P*Q
+    fp_mont_setone(t1.re);          // t1 = 1 (real part set by fp_mont_setone)
+    fp_set(t1.im, 0);
+    fp2_sub(&t3, &t2, &t1);         // P*Q-1
+    fp2_mul(&t0, &PQ->z, &t3);      // (P-Q)*(P*Q-1)
+    fp2_sqr(&PQ->z, &PQ->z);        // (P-Q)^2
+    fp2_sqr(&t0, &t0);              // (P-Q)^2*(P*Q-1)^2
+    fp2_add(&t1, &t2, &t1);         // P*Q+1
+    fp2_add(&t3, &P->x, &Q->x);     // P+Q
+    fp2_mul(&t1, &t1, &t3);         // (P+Q)*(P*Q+1)
+    fp2_mul(&t2, &t2, &curve->A);   // A*P*Q
+    fp2_add(&t2, &t2, &t2);         // 2*A*P*Q
+    fp2_add(&t1, &t1, &t2);         // (P+Q)*(P*Q+1) + 2*A*P*Q
+    fp2_sqr(&t2, &t1);              // ((P+Q)*(P*Q+1) + 2*A*P*Q)^2
+    fp2_sub(&t0, &t2, &t0);         // ((P+Q)*(P*Q+1) + 2*A*P*Q)^2 - (P-Q)^2*(P*Q-1)^2
+    fp2_sqrt(&t0);                  // in-place square root
+    fp2_add(&PQ->x, &t0, &t1);      // x-coordinate; the z-coordinate is (P-Q)^2 from above
+}
+
+// Deterministically builds a basis (P, Q, P-Q) of the 2^f-torsion of `curve`
+// (f = POWER_OF_2) and stores it in PQ2 with all z-coordinates equal to 1.
+//
+// Candidate x-coordinates are x = 1 + k*i for k = 1, 2, ...: x.re is fixed to
+// one and x.im grows by one on every attempt. The search for Q continues from
+// the value of x at which P was found.
+// NOTE(review): xMULv2 is presumably scalar multiplication by the given
+// cofactor and xDBLv2 point doubling - confirm in isog.h.
+void ec_curve_to_basis_2(ec_basis_t *PQ2, const ec_curve_t *curve){
+    fp2_t x, t0, t1, t2;
+    ec_point_t P, Q, Q2, P2, A24;
+
+    // Curve coefficient in the form A24 = (A+2C:4C)
+    fp2_add(&A24.z, &curve->C, &curve->C);
+    fp2_add(&A24.x, &curve->A, &A24.z);
+    fp2_add(&A24.z, &A24.z, &A24.z);
+
+    fp_mont_setone(x.re);
+    fp_set(x.im, 0);
+
+    // Find P
+    while(1){
+        fp_add(x.im, x.re, x.im);
+
+        // Check if point is rational
+        // t1 = x*(C^2*x^2 + A*C*x + C^2), evaluated Horner-style
+        fp2_sqr(&t0, &curve->C);
+        fp2_mul(&t1, &t0, &x);
+        fp2_mul(&t2, &curve->A, &curve->C);
+        fp2_add(&t1, &t1, &t2);
+        fp2_mul(&t1, &t1, &x);
+        fp2_add(&t1, &t1, &t0);
+        fp2_mul(&t1, &t1, &x);
+        if(fp2_is_square(&t1)){
+            fp2_copy(&P.x, &x);
+            fp_mont_setone(P.z.re);
+            fp_set(P.z.im, 0);
+        }
+        else
+            continue;
+
+        // Clear odd factors from the order
+        xMULv2(&P, &P, p_cofactor_for_2f, P_COFACTOR_FOR_2F_BITLENGTH, &A24);
+
+        // Check if point has order 2^f
+        // P2 = [2^(f-1)]P; it is kept for the independence test on Q below
+        copy_point(&P2, &P);
+        for(int i = 0; i < POWER_OF_2 - 1; i++)
+            xDBLv2(&P2, &P2, &A24);
+        if(ec_is_zero(&P2))
+            continue;
+        else
+            break;
+    }
+    
+    // Find Q
+    while(1){
+        fp_add(x.im, x.re, x.im);
+
+        // Check if point is rational
+        fp2_sqr(&t0, &curve->C);
+        fp2_mul(&t1, &t0, &x);
+        fp2_mul(&t2, &curve->A, &curve->C);
+        fp2_add(&t1, &t1, &t2);
+        fp2_mul(&t1, &t1, &x);
+        fp2_add(&t1, &t1, &t0);
+        fp2_mul(&t1, &t1, &x);
+        if(fp2_is_square(&t1)){
+            fp2_copy(&Q.x, &x);
+            fp_mont_setone(Q.z.re);
+            fp_set(Q.z.im, 0);
+        }
+        else
+            continue;
+
+        // Clear odd factors from the order
+        xMULv2(&Q, &Q, p_cofactor_for_2f, P_COFACTOR_FOR_2F_BITLENGTH, &A24);
+
+        // Check if point has order 2^f
+        copy_point(&Q2, &Q);
+        for(int i = 0; i < POWER_OF_2 - 1; i++)
+            xDBLv2(&Q2, &Q2, &A24);
+        if(ec_is_zero(&Q2))
+            continue;
+
+        // Check if point is orthogonal to P
+        // (i.e. reject Q when [2^(f-1)]Q equals [2^(f-1)]P)
+        if(is_point_equal(&P2, &Q2))
+            continue;
+        else
+            break;
+    }
+
+    // Normalize points
+    // One shared inversion: t1 = 1/(P.z*Q.z*C). From it:
+    //   P.x <- P.x/P.z,  Q.x <- Q.x/Q.z,  E.A <- A/C,  and all z and E.C set to 1
+    ec_curve_t E;
+    fp2_mul(&t0, &P.z, &Q.z);
+    fp2_mul(&t1, &t0, &curve->C);
+    fp2_inv(&t1);
+    fp2_mul(&P.x, &P.x, &t1);
+    fp2_mul(&Q.x, &Q.x, &t1);
+    fp2_mul(&E.A, &curve->A, &t1);
+    fp2_mul(&P.x, &P.x, &Q.z);
+    fp2_mul(&P.x, &P.x, &curve->C);
+    fp2_mul(&Q.x, &Q.x, &P.z);
+    fp2_mul(&Q.x, &Q.x, &curve->C);
+    fp2_mul(&E.A, &E.A, &t0);
+    fp_mont_setone(P.z.re);
+    fp_set(P.z.im, 0);
+    fp2_copy(&Q.z, &P.z);
+    fp2_copy(&E.C, &P.z);
+
+    // Compute P-Q
+    difference_point(&PQ2->PmQ, &P, &Q, &E);
+    copy_point(&PQ2->P, &P);
+    copy_point(&PQ2->Q, &Q);
+}
+
+
+// Completes a given point P to a basis (P, Q, P-Q) of the 2^f-torsion of
+// `curve` (f = POWER_OF_2) and stores it in PQ2 with all z-coordinates equal
+// to 1. P itself is not modified; a normalized copy is written to PQ2->P.
+//
+// Q is searched among x = 1 + k*i for k = 1, 2, ... exactly as in
+// ec_curve_to_basis_2, but starting from k = 1.
+// NOTE(review): P is assumed to already have order 2^f; this function does
+// not check it - confirm at the call sites.
+void ec_complete_basis_2(ec_basis_t* PQ2, const ec_curve_t* curve, const ec_point_t* P){
+
+    fp2_t x, t0, t1, t2;
+    ec_point_t Q, Q2, P2, A24;
+
+    // Curve coefficient in the form A24 = (A+2C:4C)
+    fp2_add(&A24.z, &curve->C, &curve->C);
+    fp2_add(&A24.x, &curve->A, &A24.z);
+    fp2_add(&A24.z, &A24.z, &A24.z);
+
+    // Point of order 2 generated by P
+    copy_point(&P2, P);
+    for(int i = 0; i < POWER_OF_2 - 1; i++)
+        xDBLv2(&P2, &P2, &A24);
+
+    // Find Q
+    fp_mont_setone(x.re);
+    fp_set(x.im, 0);
+    while(1){
+        fp_add(x.im, x.re, x.im);
+
+        // Check if point is rational
+        // t1 = x*(C^2*x^2 + A*C*x + C^2), evaluated Horner-style
+        fp2_sqr(&t0, &curve->C);
+        fp2_mul(&t1, &t0, &x);
+        fp2_mul(&t2, &curve->A, &curve->C);
+        fp2_add(&t1, &t1, &t2);
+        fp2_mul(&t1, &t1, &x);
+        fp2_add(&t1, &t1, &t0);
+        fp2_mul(&t1, &t1, &x);
+        if(fp2_is_square(&t1)){
+            fp2_copy(&Q.x, &x);
+            fp_mont_setone(Q.z.re);
+            fp_set(Q.z.im, 0);
+        }
+        else
+            continue;
+
+        // Clear odd factors from the order
+        xMULv2(&Q, &Q, p_cofactor_for_2f, (int)P_COFACTOR_FOR_2F_BITLENGTH, &A24);
+
+        // Check if point has order 2^f
+        copy_point(&Q2, &Q);
+        for(int i = 0; i < POWER_OF_2 - 1; i++)
+            xDBLv2(&Q2, &Q2, &A24);
+        if(ec_is_zero(&Q2))
+            continue;
+
+        // Check if point is orthogonal to P
+        // (i.e. reject Q when [2^(f-1)]Q equals [2^(f-1)]P)
+        if(is_point_equal(&P2, &Q2))
+            continue;
+        else
+            break;
+    }
+
+    // Normalize points
+    // One shared inversion: t1 = 1/(P.z*Q.z*C). From it:
+    //   PP.x <- P.x/P.z,  Q.x <- Q.x/Q.z,  E.A <- A/C,  and all z and E.C set to 1
+    ec_curve_t E;
+    ec_point_t PP;
+    fp2_mul(&t0, &P->z, &Q.z);
+    fp2_mul(&t1, &t0, &curve->C);
+    fp2_inv(&t1);
+    fp2_mul(&PP.x, &P->x, &t1);
+    fp2_mul(&Q.x, &Q.x, &t1);
+    fp2_mul(&E.A, &curve->A, &t1);
+    fp2_mul(&PP.x, &PP.x, &Q.z);
+    fp2_mul(&PP.x, &PP.x, &curve->C);
+    fp2_mul(&Q.x, &Q.x, &P->z);
+    fp2_mul(&Q.x, &Q.x, &curve->C);
+    fp2_mul(&E.A, &E.A, &t0);
+    fp_mont_setone(PP.z.re);
+    fp_set(PP.z.im, 0);
+    fp2_copy(&Q.z, &PP.z);
+    fp2_copy(&E.C, &PP.z);
+
+    // Compute P-Q
+    difference_point(&PQ2->PmQ, &PP, &Q, &E);
+    copy_point(&PQ2->P, &PP);
+    copy_point(&PQ2->Q, &Q);
+}
+
+// Deterministically builds a basis (P, Q, P-Q) of the 3^g-torsion of `curve`
+// (g = POWER_OF_3) and stores it in PQ3 with all z-coordinates equal to 1.
+//
+// Candidate x-coordinates are x = 1 + k*i for k = 1, 2, ...; the search for Q
+// continues from the value of x at which P was found. Tripling uses the local
+// xTPL with the coefficient pair A3.
+void ec_curve_to_basis_3(ec_basis_t* PQ3, const ec_curve_t* curve){
+
+    fp2_t x, t0, t1, t2;
+    ec_point_t P, Q, Q3, P3, A24, A3;
+
+    // Curve coefficient in the form A24 = (A+2C:4C)
+    fp2_add(&A24.z, &curve->C, &curve->C);
+    fp2_add(&A24.x, &curve->A, &A24.z);
+    fp2_add(&A24.z, &A24.z, &A24.z);
+
+    // Curve coefficient in the form A3 = (A+2C:A-2C)
+    fp2_sub(&A3.z, &A24.x, &A24.z);
+    fp2_copy(&A3.x, &A24.x);
+
+    fp_mont_setone(x.re);
+    fp_set(x.im, 0);
+
+    // Find P
+    while(1){
+        fp_add(x.im, x.re, x.im);
+
+        // Check if point is rational
+        // t1 = x*(C^2*x^2 + A*C*x + C^2), evaluated Horner-style
+        fp2_sqr(&t0, &curve->C);
+        fp2_mul(&t1, &t0, &x);
+        fp2_mul(&t2, &curve->A, &curve->C);
+        fp2_add(&t1, &t1, &t2);
+        fp2_mul(&t1, &t1, &x);
+        fp2_add(&t1, &t1, &t0);
+        fp2_mul(&t1, &t1, &x);
+        if(fp2_is_square(&t1)){
+            fp2_copy(&P.x, &x);
+            fp_mont_setone(P.z.re);
+            fp_set(P.z.im, 0);
+        }
+        else
+            continue;
+
+        // Clear non-3 factors from the order
+        xMULv2(&P, &P, p_cofactor_for_3g, (int)P_COFACTOR_FOR_3G_BITLENGTH, &A24);
+
+        // Check if point has order 3^g
+        // P3 = [3^(g-1)]P; it is kept for the independence test on Q below
+        copy_point(&P3, &P);
+        for(int i = 0; i < POWER_OF_3 - 1; i++)
+            xTPL(&P3, &P3, &A3);
+        if(ec_is_zero(&P3))
+            continue;
+        else
+            break;
+    }
+    
+    // Find Q
+    while(1){
+        fp_add(x.im, x.re, x.im);
+
+        // Check if point is rational
+        fp2_sqr(&t0, &curve->C);
+        fp2_mul(&t1, &t0, &x);
+        fp2_mul(&t2, &curve->A, &curve->C);
+        fp2_add(&t1, &t1, &t2);
+        fp2_mul(&t1, &t1, &x);
+        fp2_add(&t1, &t1, &t0);
+        fp2_mul(&t1, &t1, &x);
+        if(fp2_is_square(&t1)){
+            fp2_copy(&Q.x, &x);
+            fp_mont_setone(Q.z.re);
+            fp_set(Q.z.im, 0);
+        }
+        else
+            continue;
+
+        // Clear non-3 factors from the order
+        xMULv2(&Q, &Q, p_cofactor_for_3g, (int)P_COFACTOR_FOR_3G_BITLENGTH, &A24);
+
+        // Check if point has order 3^g
+        copy_point(&Q3, &Q);
+        for(int i = 0; i < POWER_OF_3 - 1; i++)
+            xTPL(&Q3, &Q3, &A3);
+        if(ec_is_zero(&Q3))
+            continue;
+
+        // Check if point is orthogonal to P
+        // Q3 is compared against P3 and against its double. Note that P3 is
+        // doubled in place, so it stays modified if this iteration is retried.
+        if(is_point_equal(&P3, &Q3))
+            continue;
+        xDBLv2(&P3, &P3, &A24);
+        if(is_point_equal(&P3, &Q3))
+            continue;
+        else
+            break;
+    }
+
+    // Normalize points
+    // One shared inversion: t1 = 1/(P.z*Q.z*C). From it:
+    //   P.x <- P.x/P.z,  Q.x <- Q.x/Q.z,  E.A <- A/C,  and all z and E.C set to 1
+    ec_curve_t E;
+    fp2_mul(&t0, &P.z, &Q.z);
+    fp2_mul(&t1, &t0, &curve->C);
+    fp2_inv(&t1);
+    fp2_mul(&P.x, &P.x, &t1);
+    fp2_mul(&Q.x, &Q.x, &t1);
+    fp2_mul(&E.A, &curve->A, &t1);
+    fp2_mul(&P.x, &P.x, &Q.z);
+    fp2_mul(&P.x, &P.x, &curve->C);
+    fp2_mul(&Q.x, &Q.x, &P.z);
+    fp2_mul(&Q.x, &Q.x, &curve->C);
+    fp2_mul(&E.A, &E.A, &t0);
+    fp_mont_setone(P.z.re);
+    fp_set(P.z.im, 0);
+    fp2_copy(&Q.z, &P.z);
+    fp2_copy(&E.C, &P.z);
+
+    // Compute P-Q
+    difference_point(&PQ3->PmQ, &P, &Q, &E);
+    copy_point(&PQ3->P, &P);
+    copy_point(&PQ3->Q, &Q);
+}
+
+// Deterministically builds a basis (P, Q, P-Q) of the (2^f * 3^g)-torsion of
+// `curve` (f = POWER_OF_2, g = POWER_OF_3) and stores it in PQ6 with all
+// z-coordinates equal to 1.
+//
+// Candidate x-coordinates are x = 1 + k*i for k = 1, 2, ...; the search for Q
+// continues from the value of x at which P was found.
+void ec_curve_to_basis_6(ec_basis_t* PQ6, const ec_curve_t* curve){
+
+    fp2_t x, t0, t1, t2;
+    ec_point_t P, Q, Q6, P6, R, T, A24, A3;
+
+    // Curve coefficient in the form A24 = (A+2C:4C)
+    fp2_add(&A24.z, &curve->C, &curve->C);
+    fp2_add(&A24.x, &curve->A, &A24.z);
+    fp2_add(&A24.z, &A24.z, &A24.z);
+
+    // Curve coefficient in the form A3 = (A+2C:A-2C)
+    fp2_sub(&A3.z, &A24.x, &A24.z);
+    fp2_copy(&A3.x, &A24.x);
+
+    fp_mont_setone(x.re);
+    fp_set(x.im, 0);
+
+    // Find P
+    while(1){
+        fp_add(x.im, x.re, x.im);
+
+        // Check if point is rational
+        // t1 = x*(C^2*x^2 + A*C*x + C^2), evaluated Horner-style
+        fp2_sqr(&t0, &curve->C);
+        fp2_mul(&t1, &t0, &x);
+        fp2_mul(&t2, &curve->A, &curve->C);
+        fp2_add(&t1, &t1, &t2);
+        fp2_mul(&t1, &t1, &x);
+        fp2_add(&t1, &t1, &t0);
+        fp2_mul(&t1, &t1, &x);
+        if(fp2_is_square(&t1)){
+            fp2_copy(&P.x, &x);
+            fp_mont_setone(P.z.re);
+            fp_set(P.z.im, 0);
+        }
+        else
+            continue;
+
+        // Clear non-2 factors and non-3 factors from the order
+        xMULv2(&P, &P, p_cofactor_for_6fg, (int)P_COFACTOR_FOR_6FG_BITLENGTH, &A24);
+
+        // Check if point has order 2^f*3^g
+        // P6 = [2^(f-1) * 3^(g-1)]P must be non-zero, and so must [2]P6 and
+        // [3]P6, i.e. P6 must have order exactly 6
+        copy_point(&P6, &P);
+        for(int i = 0; i < POWER_OF_2 - 1; i++)
+            xDBLv2(&P6, &P6, &A24);
+        for(int i = 0; i < POWER_OF_3 - 1; i++)
+            xTPL(&P6, &P6, &A3);
+        if(ec_is_zero(&P6))
+            continue;
+        xDBLv2(&T, &P6, &A24);
+        if (ec_is_zero(&T))
+            continue;
+        xTPL(&T, &P6, &A3);
+        if (ec_is_zero(&T))
+            continue;
+        break;
+    }
+
+    // Find Q
+    while(1){
+        fp_add(x.im, x.re, x.im);
+
+        // Check if point is rational
+        fp2_sqr(&t0, &curve->C);
+        fp2_mul(&t1, &t0, &x);
+        fp2_mul(&t2, &curve->A, &curve->C);
+        fp2_add(&t1, &t1, &t2);
+        fp2_mul(&t1, &t1, &x);
+        fp2_add(&t1, &t1, &t0);
+        fp2_mul(&t1, &t1, &x);
+        if(fp2_is_square(&t1)){
+            fp2_copy(&Q.x, &x);
+            fp_mont_setone(Q.z.re);
+            fp_set(Q.z.im, 0);
+        }
+        else
+            continue;
+
+        // Clear non-6 factors from the order
+        xMULv2(&Q, &Q, p_cofactor_for_6fg, (int)P_COFACTOR_FOR_6FG_BITLENGTH, &A24);
+
+        // Check first if point has order 2^f*3^g
+        copy_point(&Q6, &Q);
+        for(int i = 0; i < POWER_OF_2 - 1; i++)
+            xDBLv2(&Q6, &Q6, &A24);
+        for(int i = 0; i < POWER_OF_3 - 1; i++)
+            xTPL(&Q6, &Q6, &A3);
+        if(ec_is_zero(&Q6))
+            continue;
+        xDBLv2(&T, &Q6, &A24);
+        if (ec_is_zero(&T))
+            continue;
+        xTPL(&T, &Q6, &A3);
+        if (ec_is_zero(&T))
+            continue;
+
+        // Check if point P is independent from point Q
+        // Compare the order-2 parts ([3]P6 vs [3]Q6), then the order-3 parts
+        // ([2]P6 vs [2]Q6); Q is rejected if either pair coincides
+        xTPL(&R, &P6, &A3);
+        xTPL(&T, &Q6, &A3);
+        if(is_point_equal(&R, &T))
+            continue;
+        xDBLv2(&R, &P6, &A24);
+        xDBLv2(&T, &Q6, &A24);
+        if(is_point_equal(&R, &T))
+            continue;
+        break;
+    }
+
+    // Normalize points
+    // One shared inversion: t1 = 1/(P.z*Q.z*C). From it:
+    //   P.x <- P.x/P.z,  Q.x <- Q.x/Q.z,  E.A <- A/C,  and all z and E.C set to 1
+    ec_curve_t E;
+    fp2_mul(&t0, &P.z, &Q.z);
+    fp2_mul(&t1, &t0, &curve->C);
+    fp2_inv(&t1);
+    fp2_mul(&P.x, &P.x, &t1);
+    fp2_mul(&Q.x, &Q.x, &t1);
+    fp2_mul(&E.A, &curve->A, &t1);
+    fp2_mul(&P.x, &P.x, &Q.z);
+    fp2_mul(&P.x, &P.x, &curve->C);
+    fp2_mul(&Q.x, &Q.x, &P.z);
+    fp2_mul(&Q.x, &Q.x, &curve->C);
+    fp2_mul(&E.A, &E.A, &t0);
+    fp_mont_setone(P.z.re);
+    fp_set(P.z.im, 0);
+    fp2_copy(&Q.z, &P.z);
+    fp2_copy(&E.C, &P.z);
+
+    // Compute P-Q
+    difference_point(&PQ6->PmQ, &P, &Q, &E);
+    copy_point(&PQ6->P, &P);
+    copy_point(&PQ6->Q, &Q);
+}
--- a/src/ec/ref/ecx/ec.c
+++ b/src/ec/ref/ecx/ec.c
--- a/src/ec/ref/ecx/fp2-test.c
+++ b/src/ec/ref/ecx/fp2-test.c
@@ -0,0 +1,90 @@
+#include <assert.h>
+#include <time.h>
+#include <stdio.h>
+#include "../generic/include/fp2_tmp.h"
+
+// Self-test for the fp2_t arithmetic declared in fp2_tmp.h.
+// Runs 1024 rounds on freshly drawn random elements and checks, with assert(),
+// negation, c - c == 0, multiplication by 0 and by 1, inversion, and that a
+// square is reported as a square. Returns 0 once every round has completed.
+// NOTE(review): all checks go through assert(), so they presumably disappear
+// if this file is built with NDEBUG -- confirm the test target does not set it.
+int main()
+{
+	fp2_t fp2_0, fp2_1;
+	// ------------
+	fp2_set0(fp2_0);
+	fp2_set1(fp2_1);
+	// ------------
+
+	int i;
+	fp2_t a, b, c, d;
+	fp_t e;
+
+	for (i = 0; i < 1024; i++)
+	{
+		// Progress line: printed, flushed, then erased again with "\r" + ESC[K
+		printf("[%3d%%] Testing fp2_t arithmetic", 100 * i / (int)1024);
+		fflush(stdout);
+		printf("\r\x1b[K");
+                
+		// Random elements of fp2
+		fp2_random(a);
+		fp2_random(b);
+		// c and d are copies of a and b with the lowest limb of the real part
+		// bumped by +1 / -1
+		fp2_copy(c, a);
+		c.re[0] += 1;
+		fp2_copy(d, b);
+		d.re[0] -= 1;
+
+		assert(fp2_isequal(a,b) == 0);		// different values check --> (a != b)
+		assert(fp2_isequal(c,c) == 1);		// equal values check --> 1 (c == c)
+
+		// Testing neg: -a must equal 0 - a
+		fp2_set0(b);
+		fp2_copy(c, a);
+		fp2_neg(a, a);
+		fp2_sub(c, b, c);
+		assert(fp2_isequal(a,c) == 1);
+
+		fp2_set1(a);	// Now a == 1
+		fp2_set0(b);	// Now b == 0
+
+		assert(fp2_is_zero(a) == 0);
+		assert(fp2_is_zero(b) == 1);
+
+		// testing c - c
+		fp2_sub(d, c, c);
+		assert(fp2_is_zero(d) == 1);
+
+		// testing c * 0
+		fp2_mul(d, c, b);
+		assert(fp2_is_zero(d) == 1);
+
+		// testing c * 1 ... recall, in Montgomery domain R mod p plays the role of the 1
+		fp2_set1(a);
+		fp2_mul(d, c, a);
+		assert(fp2_isequal(d, c) == 1);
+
+		// fp_set(e, 1);	// Now e == 1
+		// fp2_pow(d, e, c);
+		// assert(fp2_isequal(d, c) == 1);
+		
+		// fp_set(e, 0);	// Now e == 0
+		// fp2_pow(d, e, c);
+		// assert(fp2_isone(d) == 1);
+
+		// fp2_set(a, 1);	// Now e == R mod p
+		// fp_random(e);
+		// fp2_pow(d, e, a);
+		// assert(fp2_isone(d) == 1);
+
+		// Testing 1/a by computing (1/a) x a
+		fp2_random(a);
+		fp2_copy(b, a);
+		fp2_inv(a);
+		fp2_mul(c, a, b);
+		assert(fp2_isone(c) == 1);
+
+		// Testing that a^2 is recognised as a square
+		fp2_random(a);
+		fp2_sqr(b, a);
+		assert( fp2_issquare(b) );
+
+	};
+
+	// i == 1024 here, so this prints 100%
+	printf("[%2d%%] Tested fp2_t arithmetic:\tNo errors!\n", 100 * i / (int)1024);
+	printf("-- All tests passed.\n");
+	return 0;
+}
--- a/src/ec/ref/ecx/isog_chains.c
+++ b/src/ec/ref/ecx/isog_chains.c
@@ -0,0 +1,298 @@
+#include "isog.h"
+#include <assert.h>
+
+static inline void AC_to_A24(ec_point_t *A24, ec_curve_t const *E)
+{
+    // Projective coefficient (A : C) -> (A + 2C : 4C), using additions only.
+    fp2_t *const num = &A24->x;
+    fp2_t *const den = &A24->z;
+
+    fp2_add(den, &E->C, &E->C);     // den = 2C
+    fp2_add(num, &E->A, den);       // num = A + 2C
+    fp2_add(den, den, den);         // den = 4C
+}
+
+static inline void A24_to_AC(ec_curve_t *E, ec_point_t const *A24)
+{
+    // Input (A+2C : 4C). 2*(A+2C) - 4C = 2A, which is doubled once more so
+    // that the result (4A : 4C) is projectively equal to (A : C).
+    fp2_t *const a = &E->A;
+
+    fp2_add(a, &A24->x, &A24->x);   // 2*(A+2C)
+    fp2_sub(a, a, &A24->z);         // 2A
+    fp2_add(a, a, a);               // 4A
+    fp2_copy(&E->C, &A24->z);       // 4C
+}
+
+void ec_eval_even(ec_curve_t* image, const ec_isog_even_t* phi,
+    ec_point_t* points, unsigned short length){
+    // Evaluates the isogeny described by phi (kernel generator phi->kernel on
+    // phi->curve) at the `length` entries of `points`, in place, and writes
+    // the codomain curve to `image`. The first 4-isogeny is handled here,
+    // the remaining phi->length-2 steps by ec_eval_even_strategy.
+    ec_point_t Q4, Q, A24;
+    int step;
+
+    AC_to_A24(&A24, &phi->curve);
+    copy_point(&Q4, &phi->kernel);
+
+    // Q4 = kernel doubled (phi->length - 2) times, Q = [2]Q4
+    for(step = 0; step < phi->length - 2; step++)
+        xDBLv2(&Q4, &Q4, &A24);
+    xDBLv2(&Q, &Q4, &A24);
+
+    if(!fp2_is_zero(&Q.x)){
+        // Regular first step
+        xisog_4(&A24, Q4);
+        xeval_4(points, points, length);
+        xeval_4(&Q, &phi->kernel, 1);
+    }
+    else{
+        // x([2]Q4) == 0: use the dedicated "singular" 4-isogeny formulas
+        xisog_4_singular(&A24, Q4, A24);
+        xeval_4_singular(points, points, length, Q4);
+        xeval_4_singular(&Q, &phi->kernel, 1, Q4);
+    }
+
+    // Q now holds the image of the kernel generator under the first step
+    ec_eval_even_strategy(image, points, length, &A24, &Q, phi->length-2);
+}
+
+void ec_eval_even_nonzero(ec_curve_t* image, const ec_isog_even_t* phi,
+    ec_point_t* points, unsigned short length){
+    // Same as ec_eval_even but without the test for the singular first step:
+    // the regular 4-isogeny formulas are always used.
+    ec_point_t Q4, A24;
+    int step;
+
+    AC_to_A24(&A24, &phi->curve);
+    copy_point(&Q4, &phi->kernel);
+
+    // Q4 = kernel doubled (phi->length - 2) times
+    for(step = 0; step < phi->length - 2; step++)
+        xDBLv2(&Q4, &Q4, &A24);
+
+    xisog_4(&A24, Q4);
+    xeval_4(points, points, length);
+    // Q4 is reused to hold the image of the kernel generator
+    xeval_4(&Q4, &phi->kernel, 1);
+
+    ec_eval_even_strategy(image, points, length, &A24, &Q4, phi->length-2);
+}
+
+// Evaluates a chain of isog_len 2-isogeny steps (as 4-isogenies, preceded by
+// one 2-isogeny when isog_len is odd) following the precomputed STRATEGY4.
+//   image      : receives the codomain curve in (A:C) form
+//   points     : points_len points, pushed through the chain in place
+//   A24        : domain curve as (A+2C : 4C); overwritten along the way
+//   kernel     : generator of the kernel
+//   isog_len   : must equal POWER_OF_2-2 (asserted), since STRATEGY4 is
+//                precomputed for that length
+static void ec_eval_even_strategy(ec_curve_t* image, ec_point_t* points, unsigned short points_len,
+    ec_point_t* A24, const ec_point_t *kernel, const int isog_len){
+
+    assert(isog_len == POWER_OF_2-2);
+
+    uint8_t log2_of_e;
+    digit_t e_half = (isog_len)>>1;
+    // tmp has the same width as e_half: a narrower counter would truncate
+    // e_half and under-size the arrays below for long chains.
+    digit_t tmp;
+    for(tmp = e_half, log2_of_e = 0; tmp > 0; tmp>>=1, ++log2_of_e);
+    log2_of_e *= 2; // In order to ensure each splits is at most size log2_of_e
+
+    ec_point_t SPLITTING_POINTS[log2_of_e];
+    copy_point(&SPLITTING_POINTS[0], kernel);
+
+    int strategy = 0,    // Current element of the strategy to be used
+    i, j;
+
+    int BLOCK = 0,       // Keeps track of point order
+    current = 0;         // Number of points being carried
+    int XDBLs[log2_of_e]; // Number of doubles performed
+
+    // If walk length is odd, we start with a 2-isogeny
+    if(isog_len & 1){
+        copy_point(&SPLITTING_POINTS[1], &SPLITTING_POINTS[0]);
+        for(i = 0; i < isog_len-1; i++)
+            xDBLv2(&SPLITTING_POINTS[1], &SPLITTING_POINTS[1], A24);
+        xisog_2(A24, SPLITTING_POINTS[1]);
+        xeval_2(SPLITTING_POINTS, SPLITTING_POINTS, 1);
+        xeval_2(points, points, points_len);
+    }
+
+    // Chain of 4-isogenies
+    for(j = 0; j < (e_half - 1); j++)
+    {
+        // Get the next point of order 4
+        while (BLOCK != (e_half -  1 - j) )
+        {
+            // A new split will be added
+            current += 1;
+            // We set the seed of the new split to be computed and saved
+            copy_point(&SPLITTING_POINTS[current], &SPLITTING_POINTS[current - 1]);
+            for(i = 0; i < 2*STRATEGY4[strategy]; i++)
+                xDBLv2(&SPLITTING_POINTS[current], &SPLITTING_POINTS[current], A24);
+            XDBLs[current] = STRATEGY4[strategy];  // The number of doublings performed is saved
+            BLOCK += STRATEGY4[strategy];          // BLOCK is increased by the number of doublings performed
+            strategy += 1;                  // Next, we move to the next element of the strategy
+        }
+
+        // Evaluate 4-isogeny on the carried points below `current` and on the inputs
+        xisog_4(A24, SPLITTING_POINTS[current]);
+        xeval_4(SPLITTING_POINTS, SPLITTING_POINTS, current);
+        xeval_4(points, points, points_len);
+
+        // Drop the point that was just used as kernel
+        BLOCK -= XDBLs[current];
+        XDBLs[current] = 0;
+        current -= 1;
+    }
+
+    // Final 4-isogeny
+    xisog_4(A24, SPLITTING_POINTS[current]);
+    xeval_4(points, points, points_len);
+
+    // Output curve in the form (A:C)
+    A24_to_AC(image, A24);
+}
+
+// Evaluates the odd-degree isogeny described by phi at the `length` entries of
+// `points` (in place) and writes the codomain curve to `image`.
+// The isogeny is processed one prime index at a time: indices [0, P_LEN) use
+// the kernel generator phi->ker_plus, indices [P_LEN, P_LEN+M_LEN) use
+// phi->ker_minus. For index i, phi->degree[i] steps of degree
+// TORSION_ODD_PRIMES[i] are performed.
+// Each step runs kps -> xisog -> xeval... -> kps_clear in that order; xisog and
+// xeval receive only the index i, so they presumably consume state prepared by
+// kps -- do not reorder these calls.
+void ec_eval_odd(ec_curve_t* image, const ec_isog_odd_t* phi,
+        ec_point_t* points, unsigned short length){
+        
+    ec_point_t ker_plus, ker_minus, P, K, A24, B24;
+    int i,j,k;
+
+    AC_to_A24(&A24, &phi->curve);
+
+    // Isogenies with kernel in E[p+1]
+    copy_point(&ker_plus, &phi->ker_plus);
+    copy_point(&ker_minus, &phi->ker_minus);
+    for(i = 0; i < P_LEN; i++){
+        // P = ker_plus multiplied by every remaining prime power with index > i,
+        // leaving only the TORSION_ODD_PRIMES[i]-power part
+        copy_point(&P, &ker_plus);
+        for(j = i+1; j < P_LEN; j++){
+            for(k = 0; k < phi->degree[j]; k++)
+                xMULv2(&P, &P, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A24);
+        }
+        for(k = 0; k < phi->degree[i]; k++){
+            // K = P multiplied down (degree[i]-k-1 times) to serve as kernel of this step
+            copy_point(&K, &P);
+            for(j = 0; j < phi->degree[i]-k-1; j++)
+                xMULv2(&K, &K, &(TORSION_ODD_PRIMES[i]), p_plus_minus_bitlength[i], &A24);
+            kps(i, K, A24);
+            xisog(&B24, i, A24);            // B24 = codomain of this step
+            // Push everything still needed through the step
+            xeval(&P, i, P, A24);
+            xeval(&ker_plus, i, ker_plus, A24);
+            xeval(&ker_minus, i, ker_minus, A24);
+            for(j = 0; j < length; j++)
+                xeval(&points[j], i, points[j], A24);
+            copy_point(&A24, &B24);         // continue on the codomain
+            kps_clear(i);
+        }
+    }
+
+    // Isogenies with kernel in E[p-1]
+    for(i = P_LEN; i < P_LEN+M_LEN; i++){
+        // Same procedure as above, now driven by ker_minus; ker_plus is no
+        // longer pushed through
+        copy_point(&P, &ker_minus);
+        for(j = i+1; j < P_LEN+M_LEN; j++){
+            for(k = 0; k < phi->degree[j]; k++)
+                xMULv2(&P, &P, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A24);
+        }
+        for(k = 0; k < phi->degree[i]; k++){
+            copy_point(&K, &P);
+            for(j = 0; j < phi->degree[i]-k-1; j++)
+                xMULv2(&K, &K, &(TORSION_ODD_PRIMES[i]), p_plus_minus_bitlength[i], &A24);
+            kps(i, K, A24);
+            xisog(&B24, i, A24);
+            xeval(&P, i, P, A24);
+            xeval(&ker_minus, i, ker_minus, A24);
+            for(j = 0; j < length; j++)
+                xeval(&points[j], i, points[j], A24);
+            copy_point(&A24, &B24);
+            kps_clear(i);
+        }
+    }
+
+    // Back from (A+2C : 4C) to (A : C) form
+    A24_to_AC(image, &A24);
+}
+
+// Picks a canonical Montgomery coefficient for the curve `old`.
+// Three candidate values of A'^2 are computed (the original A^2/C^2 and the
+// two "other solutions" of the formula below); the smallest with respect to
+// fp2_cmp is kept, its square root becomes new->A and new->C is set to 1.
+// `isom` receives the isomorphism from `old` to the normalized curve, as
+// computed by ec_isomorphism.
+// NOTE(review): when A^2 == 4C^2 the value passed to fp2_inv is zero; the
+// behaviour of fp2_inv on zero is not visible here -- confirm callers never
+// pass such a curve.
+void ec_curve_normalize(ec_curve_t *new, ec_isom_t *isom, const ec_curve_t *old){
+    fp2_t t0, t1, t2, t3, t4, t5;
+    // Compute the other solutions:
+    // A'^2 = [ sqrt(A^2-4C^2)*(9C^2-A^2) +- (A^3-3AC^2) ] / [ 2C^2*sqrt(A^2-4C^2) ]
+    fp2_sqr(&t0, &old->C);      //C^2
+    fp2_add(&t1, &t0, &t0);     //2C^2
+    fp2_add(&t2, &t1, &t1);     //4C^2
+    fp2_sqr(&t3, &old->A);      //A^2
+    fp2_sub(&t2, &t3, &t2);     //A^2-4C^2
+    fp2_sqrt(&t2);              //sqrt(A^2-4C^2)
+    fp2_add(&t0, &t0, &t1);     //3C^2
+    fp2_mul(&t1, &t2, &t1);     //2C^2*sqrt(A^2-4C^2)
+    fp2_sub(&t5, &t3, &t0);     //A^2-3C^2
+    fp2_mul(&t5, &t5, &old->A);     //A^3-3AC^2
+    fp2_add(&t4, &t0, &t0);     //6C^2
+    fp2_add(&t0, &t4, &t0);     //9C^2
+    fp2_sub(&t0, &t0, &t3);     //9C^2-A^2
+    fp2_add(&t3, &t3, &t3);     //2A^2
+    fp2_mul(&t3, &t3, &t2);     //2A^2*sqrt(A^2-4C^2)
+    fp2_mul(&t2, &t2, &t0);     //sqrt(A^2-4C^2)*(9C^2-A^2)
+    fp2_add(&t0, &t2, &t5);     //sqrt(A^2-4C^2)*(9C^2-A^2) + (A^3-3AC^2)
+    fp2_sub(&t2, &t2, &t5);     //sqrt(A^2-4C^2)*(9C^2-A^2) - (A^3-3AC^2)
+    fp2_inv(&t1);               //1/(2C^2*sqrt(A^2-4C^2))
+    fp2_mul(&t0, &t0, &t1);     // First solution
+    fp2_mul(&t2, &t2, &t1);     // Second solution
+    fp2_mul(&t1, &t3, &t1);     // Original solution (= A^2/C^2)
+
+    // Choose the lexicographically first solution
+    // (fp2_cmp(x, y) == 1 presumably means x > y -- confirm against fp2_cmp)
+    if(fp2_cmp(&t0, &t1)==1)
+        fp2_copy(&t0, &t1);
+    if(fp2_cmp(&t0, &t2)==1)
+        fp2_copy(&t0, &t2);
+
+    // Copy the solution: E = (sqrt(chosen A'^2) : 1)
+    fp2_sqrt(&t0);
+    ec_curve_t E;
+    fp2_copy(&E.A, &t0);
+    fp_mont_setone(E.C.re);
+    fp_set(E.C.im, 0);
+    // The isomorphism is computed against the local E before `new` is written
+    ec_isomorphism(isom, old, &E);
+    fp2_copy(&new->A, &E.A);
+    fp2_copy(&new->C, &E.C);
+}
+
+// Fills `isom` (fields Nx, Nz, D) with a map between the Montgomery curves
+// `from` and `to`, both given projectively as (A : C). ec_iso_eval applies the
+// result as (X : Z) -> (Nx*X - Nz*Z : D*Z).
+// The code first derives a scaling constant (called lambda^2 in the comments)
+// via a square root, then fixes its sign by comparing two cubic expressions in
+// the curve coefficients, and finally assembles Nx, Nz and D.
+// NOTE(review): the two curves are presumably required to be isomorphic; the
+// function performs no check of this -- confirm at the call sites.
+void ec_isomorphism(ec_isom_t* isom, const ec_curve_t* from, const ec_curve_t* to){
+    fp2_t t0, t1, t2, t3, t4;
+    fp2_mul(&t0, &from->A, &to->C);
+    fp2_sqr(&t0, &t0);                  //fromA^2toC^2
+    fp2_mul(&t1, &to->A, &from->C);
+    fp2_sqr(&t1, &t1);                  //toA^2fromC^2
+    fp2_mul(&t2, &to->C, &from->C);
+    fp2_sqr(&t2, &t2);                  //toC^2fromC^2
+    fp2_add(&t3, &t2, &t2);
+    fp2_add(&t2, &t3, &t2);             //3toC^2fromC^2
+    fp2_sub(&t3, &t2, &t0);             //3toC^2fromC^2-fromA^2toC^2
+    fp2_sub(&t4, &t2, &t1);             //3toC^2fromC^2-toA^2fromC^2
+    fp2_inv(&t3);
+    fp2_mul(&t4, &t4, &t3);
+    fp2_sqrt(&t4);                      //lambda^2 constant for SW isomorphism
+    fp2_sqr(&t3, &t4);
+    fp2_mul(&t3, &t3, &t4);             //lambda^6
+
+    // Check sign of lambda^2, such that lambda^6 has the right sign
+    // Left-hand side: lambda^6 * toC^3 * (2fromA^3 - 9fromC^2fromA), kept in t3
+    fp2_sqr(&t0, &from->C);
+    fp2_add(&t1, &t0, &t0);
+    fp2_add(&t1, &t1, &t1);
+    fp2_add(&t1, &t1, &t1);
+    fp2_add(&t0, &t0, &t1); // 9fromC^2
+    fp2_sqr(&t2, &from->A);
+    fp2_add(&t2, &t2, &t2); // 2fromA^2
+    fp2_sub(&t2, &t2, &t0);
+    fp2_mul(&t2, &t2, &from->A); // -9fromC^2fromA+2fromA^3
+    fp2_sqr(&t0, &to->C);
+    fp2_mul(&t0, &t0, &to->C);
+    fp2_mul(&t2, &t2, &t0);     //toC^3* [-9fromC^2fromA+2fromA^3]
+    fp2_mul(&t3, &t3, &t2);             //lambda^6*(-9fromA+2fromA^3)*toC^3
+    // Right-hand side: fromC^3 * (2toA^3 - 9toC^2toA), kept in t2
+    fp2_sqr(&t0, &to->C);
+    fp2_add(&t1, &t0, &t0);
+    fp2_add(&t1, &t1, &t1);
+    fp2_add(&t1, &t1, &t1);
+    fp2_add(&t0, &t0, &t1); // 9toC^2
+    fp2_sqr(&t2, &to->A);
+    fp2_add(&t2, &t2, &t2); // 2toA^2
+    fp2_sub(&t2, &t2, &t0);
+    fp2_mul(&t2, &t2, &to->A); // -9toC^2toA+2toA^3
+    fp2_sqr(&t0, &from->C);
+    fp2_mul(&t0, &t0, &from->C);
+    fp2_mul(&t2, &t2, &t0);     //fromC^3* [-9toC^2toA+2toA^3]
+    // If the two sides disagree, the other square root was needed: flip the sign
+    if(!fp2_is_equal(&t2, &t3))
+        fp2_neg(&t4, &t4);
+
+    // Mont -> SW -> SW -> Mont
+    // D  = 3*fromC*toC
+    // Nx = D*lambda^2
+    // Nz = toA*fromC - lambda^2*fromA*toC
+    fp_mont_setone(t0.re);
+    fp_set(t0.im, 0);
+    fp2_add(&isom->D, &t0, &t0);
+    fp2_add(&isom->D, &isom->D, &t0);
+    fp2_mul(&isom->D, &isom->D, &from->C);
+    fp2_mul(&isom->D, &isom->D, &to->C);
+    fp2_mul(&isom->Nx, &isom->D, &t4);
+    fp2_mul(&t4, &t4, &from->A);
+    fp2_mul(&t4, &t4, &to->C);
+    fp2_mul(&t0, &to->A, &from->C);
+    fp2_sub(&isom->Nz, &t0, &t4);
+}
+
+void ec_iso_inv(ec_isom_t* isom){
+    // Inverts the map in place: Nx and D trade places and Nz changes sign.
+    fp2_t old_Nx;
+    fp2_copy(&old_Nx, &isom->Nx);
+    fp2_copy(&isom->Nx, &isom->D);
+    fp2_copy(&isom->D, &old_Nx);
+    fp2_neg(&isom->Nz, &isom->Nz);
+}
+
+void ec_iso_eval(ec_point_t *P, ec_isom_t* isom){
+    // Applies the isomorphism in place: (X : Z) -> (Nx*X - Nz*Z : D*Z)
+    fp2_t nz_z;
+    fp2_mul(&nz_z, &P->z, &isom->Nz);       // Nz*Z (uses the old Z)
+    fp2_mul(&P->x, &P->x, &isom->Nx);       // Nx*X
+    fp2_sub(&P->x, &P->x, &nz_z);
+    fp2_mul(&P->z, &P->z, &isom->D);
+}
--- a/src/ec/ref/ecx/kps.c
+++ b/src/ec/ref/ecx/kps.c
@@ -0,0 +1,228 @@
+#include "isog.h"
+#include "curve_extras.h"
+#include <assert.h>
+
+// File-scope working state of the kernel-point computation. kps_s() below
+// fills sI/sJ/sK, I, J, K, XZJ4 and the trees (or R0/A0 in the scaled variant).
+// NOTE(review): because this state is global, a kps/xisog/xeval sequence
+// presumably must not be interleaved with another one -- confirm.
+int sI, sJ, sK;	// Current sizes of the sets I, J, and K
+
+fp2_t I[sI_max][2],		// I also serves as the linear factors of the polynomial h_I(X)
+			EJ_0[sJ_max][3], EJ_1[sJ_max][3];	// To be used in xisog and xeval
+
+ec_point_t J[sJ_max], K[sK_max];		// Finite subsets of the kernel
+fp2_t XZJ4[sJ_max],		// -4* (Xj * Zj) for each j in J, and x([j]P) = (Xj : Zj)
+    rtree_A[(1 << (ceil_log_sI_max+2)) - 1],		// constant multiple of the reciprocal tree computation
+    A0;			// constant multiple of the reciprocal R0
+
+poly ptree_hI[(1 << (ceil_log_sI_max+2)) - 1],		// product tree of h_I(X)
+     rtree_hI[(1 << (ceil_log_sI_max+2)) - 1],		// reciprocal tree of h_I(X)
+     ptree_EJ[(1 << (ceil_log_sJ_max+2)) - 1];		// product tree of E_J(X)
+     
+fp2_t R0[2*sJ_max + 1];		// Reciprocal of h_I(X) required in the scaled remainder tree approach
+
+int deg_ptree_hI[(1 << (ceil_log_sI_max+2)) - 1],	// degree of each node in the product tree of h_I(X)
+    deg_ptree_EJ[(1 << (ceil_log_sJ_max+2)) - 1];	// degree of each node in the product tree of E_J(X)
+
+fp2_t leaves[sI_max];		// leaves of the remainder tree, which are required in the Resultant computation
+
+// -----------------------------------------------------------
+// -----------------------------------------------------------
+// Traditional Kernel Point computation (KPs)
+
+// Kernel data required by the degree-4 isogeny evaluation.
+// For P = (X : Z) stores K[0] = (4Z^2 : 2Z^2), K[1].x = X - Z, K[2].x = X + Z.
+void kps_4(ec_point_t const P)
+{
+	fp2_t *const four_zz = &K[0].x;
+	fp2_t *const two_zz = &K[0].z;
+
+	fp2_sqr(four_zz, &P.z);				// Z^2 (intermediate)
+	fp2_add(two_zz, four_zz, four_zz);	// 2*Z^2
+	fp2_add(four_zz, two_zz, two_zz);	// 4*Z^2
+	fp2_add(&K[2].x, &P.x, &P.z);
+	fp2_sub(&K[1].x, &P.x, &P.z);
+}
+
+void eds2mont(ec_point_t* P)
+{
+	// In-place coordinate change (x : z) <- (z + x : z - x)
+	fp2_t diff;
+	fp2_sub(&diff, &(P->z), &(P->x));
+	fp2_add(&(P->x), &(P->z), &(P->x));
+	fp2_copy(&(P->z), &diff);
+}
+
+
+// Differential doubling in Twisted Edwards model: Q <- [2]P, where A carries
+// the curve constants (A->x, A->z). Q may be the same object as P, since the
+// outputs are written only after all inputs have been consumed.
+void ydbl(ec_point_t* Q, ec_point_t* const P, ec_point_t const* A)
+{
+	fp2_t xx, zz, diff, lhs, rhs;
+
+	fp2_sqr(&xx, &(P->x));				// x^2
+	fp2_sqr(&zz, &(P->z));				// z^2
+	fp2_sub(&diff, &zz, &xx);			// z^2 - x^2
+
+	fp2_mul(&rhs, &(A->z), &xx);		// A.z * x^2
+	fp2_mul(&lhs, &rhs, &zz);			// A.z * x^2 * z^2
+
+	fp2_mul(&xx, &(A->x), &diff);		// A.x * (z^2 - x^2)
+	fp2_add(&rhs, &rhs, &xx);
+	fp2_mul(&rhs, &rhs, &diff);
+
+	fp2_sub(&(Q->x), &lhs, &rhs);
+	fp2_add(&(Q->z), &lhs, &rhs);
+}
+
+// Differential addition in Twisted Edwards model: R <- P + Q, given PQ = P - Q.
+// R is written only at the very end.
+void yadd(ec_point_t* R, ec_point_t* const P, ec_point_t* const Q, ec_point_t* const PQ)
+{
+	fp2_t cross_zx, cross_xz, sum_sq, diff_sq, pq_sum, pq_diff, lhs, rhs;
+
+	fp2_mul(&cross_zx, &(P->z), &(Q->x));
+	fp2_mul(&cross_xz, &(P->x), &(Q->z));
+
+	fp2_add(&sum_sq, &cross_zx, &cross_xz);
+	fp2_sqr(&sum_sq, &sum_sq);
+	fp2_sub(&diff_sq, &cross_zx, &cross_xz);
+	fp2_sqr(&diff_sq, &diff_sq);
+
+	fp2_add(&pq_sum, &(PQ->z), &(PQ->x));
+	fp2_sub(&pq_diff, &(PQ->z), &(PQ->x));
+
+	fp2_mul(&lhs, &pq_diff, &sum_sq);
+	fp2_mul(&rhs, &pq_sum, &diff_sq);
+
+	fp2_sub(&(R->x), &lhs, &rhs);
+	fp2_add(&(R->z), &lhs, &rhs);
+}
+
+// tvelu formulae: fills K[0..d-1] with the Twisted Edwards y-coordinates of
+// P, [2]P, ..., [d]P where d = (l_i - 1) / 2 and l_i = TORSION_ODD_PRIMES[i].
+void kps_t(uint64_t const i, ec_point_t const P, ec_point_t const A)
+{
+	int const half = ((int)TORSION_ODD_PRIMES[i] - 1) / 2;
+	int idx = 2;
+
+	// Montgomery x(P) -> Twisted Edwards y(P)
+	fp2_sub(&K[0].x, &P.x, &P.z);
+	fp2_add(&K[0].z, &P.x, &P.z);
+
+	// y([2]P); written even when half < 2
+	ydbl(&K[1], &K[0], &A);
+
+	// y([idx+1]P) from the two previous multiples
+	while (idx < half)
+	{
+		yadd(&K[idx], &K[idx - 1], &K[0], &K[idx - 2]);
+		idx++;
+	}
+}
+
+// -----------------------------------------------------------
+// -----------------------------------------------------------
+// Kernel Point computation (KPs) used in velu SQRT
+// For the prime l_i = TORSION_ODD_PRIMES[i] and a kernel generator P on the
+// curve described by A, fills the file-scope tables:
+//   J[], XZJ4[] : odd multiples of P and the values -4*Xj*Zj
+//   I[][]       : linear factors built from odd multiples of Q = [2*sJ]P
+//   K[]         : the remaining even multiples of P
+//   ptree_hI    : product tree of h_I, plus either its reciprocal tree
+//                 (unscaled variant) or R0/A0 (scaled variant)
+// In the unscaled variant the trees are released later by kps_clear().
+void kps_s(uint64_t const i, ec_point_t const P, ec_point_t const A)
+{
+	// =================================================================================
+	assert(TORSION_ODD_PRIMES[i] > gap);	// Ensuring velusqrt is used for l_i > gap
+	// The optimal bounds must correspond to sI, sJ, and sK
+
+	sI = sizeI[i];	// Size of I
+	sJ = sizeJ[i];	// Size of J
+	sK = sizeK[i];	// Size of K
+	assert(sI >= sJ);	// Ensuring #I >= #J
+	assert(sK >= 0);	// Recall, it must be that #K >= 0
+	assert(sJ > 1);		// ensuring sI >= sJ > 1
+	// =================================================================================
+	
+	// Now, we can proceed by the general case
+
+	int j;
+
+	// --------------------------------------------------
+	// Computing [j]P for each j in {1, 3, ..., 2*sJ - 1}
+	ec_point_t P2, P4;
+	copy_point(&J[0], &P);				//    x(P)
+	// Next computations are required for allowing the use of the function get_A()
+	fp2_mul(&XZJ4[0], &J[0].x, &J[0].z);					//   Xj*Zj
+	fp2_add(&XZJ4[0], &XZJ4[0], &XZJ4[0]);					//  2Xj*Zj
+	fp2_add(&XZJ4[0], &XZJ4[0], &XZJ4[0]);					//  4Xj*Zj
+	fp2_neg(&XZJ4[0], &XZJ4[0]);					// -4Xj*Zj
+	xDBLv2(&P2, &P, &A);					// x([2]P)
+	xADD(&J[1], &P2, &J[0], &J[0]);			// x([3]P)
+	// Next computations are required for allowing the use of the function get_A()
+	fp2_mul(&XZJ4[1], &J[1].x, &J[1].z);					//   Xj*Zj
+	fp2_add(&XZJ4[1], &XZJ4[1], &XZJ4[1]);					//  2Xj*Zj
+	fp2_add(&XZJ4[1], &XZJ4[1], &XZJ4[1]);					//  4Xj*Zj
+	fp2_neg(&XZJ4[1], &XZJ4[1]);					// -4Xj*Zj
+	for (j = 2; j < sJ; j++)
+	{
+		xADD(&J[j], &J[j - 1], &P2, &J[j - 2]);	// x([2*j + 1]P)
+		// Next computations are required for allowing the use of the function get_A()
+		fp2_mul(&XZJ4[j], &J[j].x, &J[j].z);					//   Xj*Zj
+		fp2_add(&XZJ4[j], &XZJ4[j], &XZJ4[j]);					//  2Xj*Zj
+		fp2_add(&XZJ4[j], &XZJ4[j], &XZJ4[j]);					//  4Xj*Zj
+		fp2_neg(&XZJ4[j], &XZJ4[j]);					// -4Xj*Zj
+	};
+
+	// ----------------------------------------------------------
+	// Computing [i]P for i in { (2*sJ) * (2i + 1) : 0 <= i < sI}
+	// and the linear factors of h_I(W)
+	ec_point_t Q, Q2, tmp1, tmp2;
+	int bhalf_floor= sJ >> 1;
+	int bhalf_ceil = sJ - bhalf_floor;
+	xDBLv2(&P4, &P2, &A);								// x([4]P)
+	// The difference point needed by xADD below is [2]P when sJ is even and
+	// [4]P when sJ is odd; the mask-driven swap selects it without branching.
+	swap_points(&P2, &P4, -(uint64_t)(sJ % 2));								// x([4]P) <--- conditional swap ---> x([2]P)
+	xADD(&Q, &J[bhalf_ceil], &J[bhalf_floor - 1], &P2);	// Q := [2b]P
+	swap_points(&P2, &P4, -(uint64_t)(sJ % 2));								// x([4]P) <--- conditional swap ---> x([2]P) (undo)
+
+	// .............................................
+	xDBLv2(&Q2, &Q, &A);					// x([2]Q)
+	xADD(&tmp1, &Q2, &Q, &Q);	// x([3]Q)
+	// Each I[j] = (-X, Z) holds the coefficients of the linear factor Z*W - X
+	fp2_neg(&I[0][0], &Q.x);
+	fp2_copy(&I[0][1], &Q.z);
+	fp2_neg(&I[1][0], &tmp1.x);
+	fp2_copy(&I[1][1], &tmp1.z);
+	copy_point(&tmp2, &Q);
+	
+	for (j = 2; j < sI; j++){
+		xADD(&tmp2, &tmp1, &Q2, &tmp2);	// x([2*j + 1]Q)
+		fp2_neg(&I[j][0], &tmp2.x);
+		fp2_copy(&I[j][1], &tmp2.z);
+		// Unconditional swap (mask all ones): tmp1 becomes the newest multiple,
+		// tmp2 the previous one
+		swap_points(&tmp1, &tmp2, -(uint64_t)1);
+	}
+
+
+	// ----------------------------------------------------------------
+	// Computing [k]P for k in { 4*sJ*sI + 1, ..., l - 6, l - 4, l - 2}
+	// In order to avoid BRANCHES we always copy into K[0] and K[1]
+	// by assuming that these entries are only used when sK >= 1 and 
+	// sK >= 2, respectively.
+
+	//if (sK >= 1)
+	copy_point(&K[0], &P2);				//       x([l - 2]P) = x([2]P)
+	//if (sK >= 2)
+	copy_point(&K[1], &P4);				//       x([l - 4]P) = x([4]P)
+	
+	for (j = 2; j < sK; j++)
+		xADD(&K[j], &K[j - 1], &P2, &K[j - 2]);	// x([l - 2*(j+1)]P) = x([2 * (j+1)]P)
+
+	// ----------------------------------------------------------------
+	//                   ~~~~~~~~               ~~~~~~~~
+	//                    |    |                 |    |
+	// Computing h_I(W) = |    | (W - x([i]P)) = |    | (Zi * W - Xi) / Zi where x([i]P) = Xi/Zi
+	//                    i in I                 i in I
+	// In order to avoid costly inverse computations in fp, we work with projective coordinates
+
+	product_tree_LENFeq2(ptree_hI, deg_ptree_hI, 0, I, sI);				// Product tree of hI
+	if (!scaled)
+	{
+		// (unscaled) remainder tree approach
+		reciprocal_tree(rtree_hI, rtree_A, 2*sJ + 1, ptree_hI, deg_ptree_hI, 0, sI);	// Reciprocal tree of hI
+	}
+	else
+	{
+		// scaled remainder tree approach
+		// f_rev = coefficients of the root polynomial h_I in reverse order
+		fp2_t f_rev[sI_max + 1];
+		for (j = 0; j < (sI + 1); j++)
+			fp2_copy(&f_rev[j], &ptree_hI[0][sI - j]);
+
+		// Reciprocal of f_rev modulo x^max(sI, 2*sJ - sI + 1)
+		if (sI > (2*sJ - sI + 1))
+			reciprocal(R0, &A0, f_rev, sI + 1, sI);
+		else
+			reciprocal(R0, &A0, f_rev, sI + 1, 2*sJ - sI + 1);
+	};
+}
+
+void kps_clear(int i){
+	// Releases the polynomial trees built for prime index i. Primes not above
+	// `gap` are skipped. The reciprocal tree is cleared only in the unscaled
+	// variant.
+	if (!(TORSION_ODD_PRIMES[i] > gap))
+		return;
+
+	if (!scaled)
+		clear_tree(rtree_hI, 0, sizeI[i]);
+	clear_tree(ptree_hI, 0, sizeI[i]);
+}
--- a/src/ec/ref/ecx/poly-mul.c
+++ b/src/ec/ref/ecx/poly-mul.c
--- a/src/ec/ref/ecx/poly-redc.c
+++ b/src/ec/ref/ecx/poly-redc.c
@@ -0,0 +1,349 @@
+#define _POLY_MUL_REDC_H_
+#include "poly.h"
+#include <assert.h>
+
+// Projective power-series inversion.
+//   h    : output, n coefficients
+//   c    : output scalar
+//   f    : input polynomial with lenf coefficients (constant term first)
+//   n    : precision, i.e. the result satisfies f*h = c mod x^n
+// n = 0..4 are hard-coded; larger n recurse on ceil(n/2) (Newton-style
+// doubling of the precision). No field inversion is performed; the scalar c
+// absorbs the denominators.
+// reciprocal_tree() in this file calls this function with h and f pointing to
+// the same buffer, so every branch must finish reading f before it overwrites
+// the corresponding entries of h -- keep the statement order when editing.
+void reciprocal(poly h, fp2_t *c, const poly f, const int lenf, const int n){
+  
+  // Writes a polynomial to h and a field element to c such that f*h = c mod x^n
+  // REQUIRES h to have space for n terms
+  // NOT responsible for terms in h beyond h[n-1]
+
+  int i;
+
+  // Case when f needs to be padded with zeroes
+  if(n > lenf)
+  {
+    fp2_t fpad[n];
+    for(i = 0; i < lenf; i++)
+      fp2_copy(&fpad[i], &f[i]);
+    for(i = lenf; i < n; i++)
+      fp2_set(&fpad[i], 0);
+    reciprocal(h, c, fpad, n, n);
+    return;
+  }
+
+  // Trivial case (h is left untouched)
+  if(n == 0)
+  {
+    fp2_set(&*c, 0);
+    return;
+  }
+
+  // Case n = 1: h = 1, c = f0
+  if(n == 1)
+  {
+    fp2_copy(&*c, &f[0]);
+    fp_mont_setone(h[0].re);fp_set(h[0].im,0);
+    return;
+  }
+
+  // Case n = 2: h = f0 - f1*x, c = f0^2
+  if(n == 2)
+  {
+    fp2_sqr(&*c, &f[0]);
+    fp2_copy(&h[0], &f[0]);
+    fp2_neg(&h[1], &f[1]);
+    return;
+  }
+
+  // Case n = 3: extend the n = 2 result by one coefficient, c = f0^4
+  if(n == 3)
+  {
+    fp2_t t0, t1;
+
+    // t1 = f0*(f0*f2 - f1^2), computed before h is written
+    fp2_sqr(&t0, &f[1]);
+    fp2_mul(&t1, &f[0], &f[2]);
+    fp2_sub(&t1, &t1, &t0);
+    fp2_mul(&t1, &t1, &f[0]);
+
+    reciprocal(h, c, f, 2, 2);
+    fp2_mul(&h[0], &h[0], &*c);
+    fp2_mul(&h[1], &h[1], &*c);
+    fp2_neg(&h[2], &t1);
+    fp2_sqr(&*c, &*c);
+    return;
+  }
+
+  // Case n = 4: built from the n = 2 reciprocal g (scalar t3), c = t3^2
+  if(n == 4)
+  {
+    fp2_t t0, t1, t2, t3, g[2];
+
+    reciprocal(g, &t3, f, 2, 2);
+    fp2_sqr(&t0, &f[1]);
+    fp2_mul(&t1, &g[0], &f[2]);
+    fp2_mul(&t2, &g[0], &f[3]);
+    fp2_mul(&h[1], &g[1], &f[2]);   // h[1] is used as scratch; f is not read after this line
+    fp2_sub(&t0, &t1, &t0);
+    fp2_add(&t1, &t2, &h[1]);
+    fp2_mul(&t2, &t0, &g[0]);
+    fp2_mul(&h[1], &t0, &g[1]);
+    fp2_mul(&h[3], &t1, &g[0]);
+    fp2_add(&h[3], &h[1], &h[3]);
+    
+    fp2_mul(&h[0], &g[0], &t3);
+    fp2_mul(&h[1], &g[1], &t3);
+    fp2_neg(&h[2], &t2);
+    fp2_neg(&h[3], &h[3]);
+    fp2_sqr(&*c, &t3);
+    return;
+  }
+
+
+  // General case
+  // Compute the reciprocal g mod x^m for m = ceil(n/2)
+  // Then f*g-c is multiple of x^m so we only care about terms from m to n-1
+  const int m = n - (n>>1);
+  fp2_t g[m], t[m], t0;
+
+  reciprocal(g, &t0, f, lenf, m);
+  poly_mul_middle(t, g, m, f, n);
+  poly_mul_low(t, n-m, g, m, &(t[2*m-n]), n-m);
+  // Low half: g scaled by t0; high half: negated correction terms; c = t0^2
+  for(i = 0; i < m; i++)
+    fp2_mul(&h[i], &g[i], &t0);
+  for(i = m; i < n; i++)
+    fp2_neg(&h[i], &t[i-m]);
+  fp2_sqr(&*c, &t0);
+  return;
+}
+
+
+// Projective polynomial reduction: h = a * (g mod f) for a scalar a described
+// below.
+//   h          : output, lenf-1 coefficients
+//   g, leng    : polynomial to reduce and its number of coefficients
+//   f, lenf    : modulus and its number of coefficients
+//   f_rev_inv  : reciprocal of the reversed modulus (see REQUIRES), with scalar c
+// Short inputs are handled by closed formulas that do not use f_rev_inv or c.
+void poly_redc(poly h, const poly g, const int leng, const poly f, const int lenf,//
+	       const poly f_rev_inv, const fp2_t c)
+{
+  // Computes h(x) =  a * g(x) mod f(x) for some scalar a, writing lenf-1 terms to h.
+  // REQUIRES an inverse f_rev_inv such that f_rev*f_rev_inv = c mod x^(leng-lenf+1),
+  // where f_rev is the polynomial with the coefficients of f listed in reverse order.
+  // The scalar a is equal to c, except for special cases:
+  //    - If leng<lenf (no reduction needed) then a = 1
+  //    - If lenf = leng = 2, then a = f[1] 
+  //    - If lenf = leng = 3, then a = f[2] 
+  //    - If lenf=2, leng=3 then a = 2*f[1]^2
+  //
+  // REQUIRES h to have space for lenf-1 terms
+  // NOT responsible for terms in h beyond h[lenf-2]
+
+  int i;
+  
+  // Case without reduction: copy g and zero-pad up to lenf-1 terms
+  if(leng < lenf)
+  {
+    for(i = 0; i < leng; i++)
+      fp2_copy(&h[i], &g[i]);
+    for(i = leng; i < lenf-1; i++)
+      fp2_set(&h[i], 0);
+    return;
+  }
+
+  // Small cases for f linear
+  if(lenf == 2)
+  {
+    if(leng == 2)
+    {
+      // h0 = g0*f1 - g1*f0
+      fp2_t t0;
+      fp2_mul(&t0, &g[0], &f[1]);
+      fp2_mul(&h[0], &g[1], &f[0]);
+      fp2_sub(&h[0], &t0, &h[0]);
+      return;
+    }
+    
+    if(leng == 3)
+    {
+      // h0 = 2*f0^2*g2 + 2*f1^2*g0 - 2*f0*f1*g1,
+      // where -2*f0*f1 is obtained as (f0 - f1)^2 - f0^2 - f1^2
+      fp2_t f0f1, f02, f12;
+      fp2_sqr(&f02, &f[0]);
+      fp2_sqr(&f12, &f[1]);
+      fp2_sub(&f0f1, &f[0], &f[1]);
+      fp2_sqr(&f0f1, &f0f1);
+      fp2_sub(&f0f1, &f0f1, &f02);
+      fp2_sub(&f0f1, &f0f1, &f12);
+      fp2_add(&f02, &f02, &f02);
+      fp2_add(&f12, &f12, &f12);
+      fp2_mul(&f02, &f02, &g[2]);
+      fp2_mul(&f12, &f12, &g[0]);
+      fp2_mul(&f0f1, &f0f1, &g[1]);
+      fp2_add(&h[0], &f02, &f12);
+      fp2_add(&h[0], &h[0], &f0f1);
+      return;
+    }
+    // lenf == 2 with leng > 3 falls through to the general case
+  }
+
+  // Small case for f quadratic
+  if(lenf == 3 && leng == 3)
+  {
+    // h0 = f2*g0 - f0*g2, h1 = f2*g1 - f1*g2
+    fp2_t f2g1, f2g0, f1g2;
+    fp2_mul(&f2g1, &g[1], &f[2]);
+    fp2_mul(&f2g0, &g[0], &f[2]);
+    fp2_mul(&f1g2, &g[2], &f[1]);
+    fp2_mul(&h[0], &g[2], &f[0]);
+    fp2_sub(&h[0], &f2g0, &h[0]);
+    fp2_sub(&h[1], &f2g1, &f1g2);
+    return;
+  }
+
+  // General case
+  // Quotient via the reversed polynomials, then h = c*g - Q*f on the low lenf-1 terms
+  fp2_t g_reversed[leng], Q[leng - lenf + 1], Q_reversed[leng - lenf + 1];
+  
+  for(i = 0; i < leng; i++)
+    fp2_copy(&g_reversed[i], &g[leng-1-i]);
+
+  poly_mul_low(Q, leng-lenf+1, f_rev_inv, leng-lenf+1, g_reversed, leng-lenf+1);
+
+  for(i = 0; i < leng - lenf + 1; i++)
+    fp2_copy(&Q_reversed[i], &Q[leng - lenf - i]);
+
+  // g_reversed is reused as scratch for the low part of Q*f
+  poly_mul_low(g_reversed, lenf-1, Q_reversed, leng-lenf+1, f, lenf);
+
+  for(i = 0; i < lenf-1; i++)
+  {
+    fp2_mul(&h[i], &g[i], &c);
+    fp2_sub(&h[i], &h[i], &g_reversed[i]);
+  }
+  return;
+}
+
+
+// Recursively builds the reciprocal tree that accompanies a product tree.
+//   R, A  : outputs, one polynomial / scalar per tree node (indexed like H)
+//   leng  : length bound for this node (see the comment below)
+//   H,DEG : product tree and the degree of each of its nodes
+//   root  : index of the node being processed; children are 2*root+1, 2*root+2
+//   n     : number of leaves below this node
+// R[root] is allocated with malloc() for every visited node, leaves included.
+// NOTE(review): the malloc() result is not checked, and the buffers are
+// presumably released by clear_tree() (called from kps_clear) -- confirm.
+void reciprocal_tree(poly *R, fp2_t *A, const int leng, const poly H[], const int DEG[],//
+		     const int root, const int n)
+{
+  // Given a product tree H with degrees tree DEG rooted at root and generated 
+  // by n polynomials, writes the reverse-reciprocal polynomials to R and field elements 
+  // to A such that Rev(H[i])*R[i] = A[i] mod x^(N) for all nodes but the leaves.
+  // The mod is N = deg(parent)-deg(self) for inner nodes, or N = leng - deg(root) for the root.
+  //
+  // REQUIRES that leng >= DEG[0] and that R,A have enough space for the tree (see product_tree)
+
+  if(n == 0)
+    return;
+
+  // For root == 0 these two values are meaningless and are not used
+  const int parent = (root-1) >> 1;
+  const int brother = root - 1 + 2*(root & 1);
+  int lenr;
+
+  if(root > 0)
+    lenr = DEG[parent] - DEG[root];
+  else
+    lenr = leng - DEG[root];
+  
+  R[root] = malloc(sizeof(fp2_t)*lenr);
+  
+  // ----------------------------------
+  // base cases determined by poly_redc
+  // A leaf gets its (uninitialised) buffer but no reciprocal
+  if(n == 1)
+    return;
+
+
+  // case for computing  g mod f when len(f), len(g) = 3
+  // (poly_redc has a closed formula for it, so no reciprocal is stored here)
+  if (DEG[root] == 2 && lenr == 1)
+  {
+    reciprocal_tree(R, A, lenr-1, H, DEG, 2*root+1, n-(n>>1));
+    reciprocal_tree(R, A, lenr-1, H, DEG, 2*root+2, n>>1);
+    return;
+  }
+  
+  // ----------------------------------
+
+  int i;
+  
+  // When the parent's inverse was calculated to a smaller modulus, need to invert from scratch
+  if(root == 0 || leng < lenr)
+  {
+    // R[root] <- reversed H[root], truncated or zero-padded to lenr terms
+    for(i = 0; i < lenr && i < DEG[root]+1; i++)
+      fp2_copy(&R[root][i], &H[root][DEG[root]-i]);
+    for(i = DEG[root]+1; i < lenr; i++){
+      fp2_set(&R[root][i], 0);
+    }
+    // In-place call: input and output are the same buffer
+    reciprocal(R[root], &(A[root]), R[root], lenr, lenr);
+  }
+  else
+  {
+  // When parent's inverse was to a greater/equal modulus, this inverse can be obtained from it
+  // by multiplying the parent's reciprocal with the reversed brother polynomial
+    for(i = 0; i < lenr; i++)
+      fp2_copy(&R[root][i], &H[brother][DEG[brother]-i]);
+    poly_mul_low(R[root], lenr, R[parent], leng, R[root], lenr);
+    fp2_copy(&A[root], &A[parent]);
+  }
+
+  // Now move on to the children
+  reciprocal_tree(R, A, lenr-1, H, DEG, 2*root+1, n-(n>>1));
+  reciprocal_tree(R, A, lenr-1, H, DEG, 2*root+2, n>>1);
+  return;
+}
+
+
+void multieval_unscaled(fp2_t REM[], const poly g, const int leng, const poly R[], const fp2_t A[],//
+		const poly H[], const int DEG[], const int root, const int n)
+{
+  // Remainder-tree descent (unscaled variant).
+  // H is the product tree of f_0, ... , f_{n-1} with degrees DEG, and R,A its
+  // reciprocal tree, all rooted at root. Writes the constant term of
+  // c_i*g mod f_i to REM[i]. The constants c_i are unspecified but depend only
+  // on leng and the f_i, so they cancel in ratios of remainders of different
+  // g's of the same length.
+  //
+  // REQUIRES REM to have space for n terms
+
+  if(n == 0)
+    return;
+
+  const int deg = DEG[root];
+  const int n_right = n>>1;
+  const int n_left = n - n_right;
+  fp2_t g_mod[deg];
+
+  // Reduce g modulo this node's polynomial
+  poly_redc(g_mod, g, leng, H[root], deg+1, R[root], A[root]);
+
+  if(n != 1)
+  {
+    // Inner node: hand the reduced polynomial to both children
+    multieval_unscaled(REM, g_mod, deg, R, A, H, DEG, 2*root+1, n_left);
+    multieval_unscaled(&(REM[n_left]), g_mod, deg, R, A, H, DEG, 2*root+2, n_right);
+  }
+  else
+  {
+    // Leaf: the constant term is the result
+    fp2_copy(&REM[0], &g_mod[0]);
+  }
+}
+
+
+void multieval_scaled(fp2_t REM[], const poly G, const poly H[], //
+			   const int DEG[], const int root, const int n)
+{
+  // Remainder-tree descent (scaled variant).
+  // H is the product tree generated by LINEAR f_0,...,f_{n-1}, rooted at root,
+  // with degrees tree DEG. Writes the constant term of c_i * g mod f_i(x) to
+  // REM[i]. The constants c_i are unspecified but depend only on leng and the
+  // f_i, so they cancel in ratios of remainders of different g's of the same
+  // length.
+  //
+  // REQUIRES REM to have space for n terms and n > 1
+  // Also REQUIRES G = rev((rev(g mod F)) * F_rev_inv mod x^deg(F)-1) where F = H[root]
+  // and F_rev_inv is its reverse's reciprocal mod x^deg(F)
+
+  const int n_right = n>>1;
+  const int n_left = n - n_right;
+
+  if(root == 0)
+  {
+    // At the root G is used as given
+    if(n == 1)
+    {
+      fp2_copy(&REM[0], &G[DEG[root]-1]);
+      return;
+    }
+    multieval_scaled(REM, G, H, DEG, 2*root+1, n_left);
+    multieval_scaled(&(REM[n_left]), G, H, DEG, 2*root+2, n_right);
+    return;
+  }
+
+  const int parent = (root-1) >> 1;
+  const int brother = root - 1 + 2*(root & 1);
+  const int uncle = parent - 1 + 2*(parent & 1);
+  // Length of G: direct children of the root (root 1 or 2) receive the root's
+  // G, deeper nodes receive their parent's middle product
+  const int len_G = (root > 2) ? DEG[uncle]+1 : DEG[0];
+  fp2_t fg[DEG[brother]+1];
+
+  poly_mul_middle(fg, H[brother], DEG[brother]+1, G, len_G);
+
+  if(n == 1)
+  {
+    fp2_copy(&REM[0], &fg[DEG[brother]]);
+    return;
+  }
+
+  multieval_scaled(REM, fg, H, DEG, 2*root+1, n_left);
+  multieval_scaled(&(REM[n_left]), fg, H, DEG, 2*root+2, n_right);
+}
--- a/src/ec/ref/ecx/tedwards.c
+++ b/src/ec/ref/ecx/tedwards.c
@@ -0,0 +1,231 @@
+#include <tedwards.h>
+#include <assert.h>
+
+// a*x^2+y^2=1+d*x^2*y^2
+// a = A.x/A.z + 2, d = A.x/A.z - 2
+
+void ted_init(ted_point_t* P)
+{ // Initialize point as identity element (X:Y:Z:T) <- (0:1:1:0)
+    fp_t one = {0};
+
+    // Clear all four coordinates at once; sizeof covers the whole point
+    memset(P, 0, sizeof(*P));
+    one[0] = 1;
+    // Only Y and Z are non-zero for the identity. Both get the value 1 converted to
+    // Montgomery form, matching what mont_to_ted_point produces for the point at infinity.
+    fp_tomont(P->y.re, one);
+    fp_tomont(P->z.re, one);
+}
+
+void copy_ted_point(ted_point_t* P, ted_point_t const* Q)
+{ // P <- Q, copied one coordinate at a time (X, Y, Z, T)
+    fp2_copy(&P->x, &Q->x);
+    fp2_copy(&P->y, &Q->y);
+    fp2_copy(&P->z, &Q->z);
+    fp2_copy(&P->t, &Q->t);
+}
+
+void ted_dbl(ted_point_t *Q, ted_point_t const *P, ec_curve_t const* E) 
+{ // Q <- 2*P, with E->A used as the coefficient a of the formulas below
+    //   xx    = X1^2
+    //   yy    = Y1^2
+    //   zz2   = 2*Z1^2
+    //   axx   = a*xx
+    //   cross = (X1+Y1)^2 - xx - yy
+    //   sum   = axx + yy
+    //   f     = sum - zz2
+    //   diff  = axx - yy
+    //   X3 = cross*f,  Y3 = sum*diff,  T3 = cross*diff,  Z3 = f*sum
+    //
+    // Every read of P happens before the first write to Q, so Q may alias P.
+
+    // TODO: neutral element
+    fp2_t xx, yy, zz2, axx, cross, sum, f, diff;
+
+    // cross term (X1+Y1)^2 - X1^2 - Y1^2
+    fp2_add(&cross, &P->x, &P->y);
+    fp2_sqr(&cross, &cross);
+    fp2_sqr(&xx, &P->x);
+    fp2_sqr(&yy, &P->y);
+    fp2_sub(&cross, &cross, &xx);
+    fp2_sub(&cross, &cross, &yy);
+
+    // 2*Z1^2 and a*X1^2
+    fp2_sqr(&zz2, &P->z);
+    fp2_add(&zz2, &zz2, &zz2);
+    fp2_mul(&axx, &xx, &E->A);
+
+    fp2_add(&sum, &axx, &yy);
+    fp2_sub(&diff, &axx, &yy);
+    fp2_sub(&f, &sum, &zz2);
+
+    fp2_mul(&Q->x, &cross, &f);
+    fp2_mul(&Q->y, &sum, &diff);
+    fp2_mul(&Q->t, &cross, &diff);
+    fp2_mul(&Q->z, &f, &sum);
+}
+
+void ted_add(ted_point_t* S, ted_point_t const* P, ted_point_t const* Q, ec_curve_t const* E)
+{ // S <- P + Q; E->A is used as the coefficient a of the formulas below
+    // A = X1*X2
+    // B = Y1*Y2
+    // C = Z1*T2
+    // D = T1*Z2
+    // K = D+C
+    // F = (X1-Y1)*(X2+Y2)+B-A
+    // G = B+a*A
+    // H = D-C
+    // X3 = K*F
+    // Y3 = G*H
+    // T3 = K*H
+    // Z3 = F*G
+
+    // TODO: neutral element
+
+    ted_point_t res; // scratch: first holds -P, then the sum, which is copied to S only at the end
+
+    if (is_ted_equal(P, Q)) { // P == Q: use the doubling formulas instead
+      ted_dbl(S, P, E);
+      return;
+    }
+    //assert(!is_ted_equal(P, Q));
+    
+    ted_neg(&res, P); // res = -P
+    if (is_ted_equal(&res, Q)) { // Q == -P: the result is whatever ted_init writes
+       ted_init(S);
+       return;
+    }
+    // assert(!ted_equal(&res,Q));
+    fp2_t A, B, C, D, K, F, G, H, tmp;
+
+    fp2_mul(&A, &P->x, &Q->x);
+    fp2_mul(&B, &P->y, &Q->y);
+    fp2_mul(&C, &P->z, &Q->t);
+    fp2_mul(&D, &P->t, &Q->z);
+    fp2_add(&K, &D, &C);
+    fp2_add(&F, &Q->x, &Q->y); // F = X2+Y2
+    fp2_sub(&tmp, &P->x, &P->y); // tmp = X1-Y1
+    fp2_mul(&F, &F, &tmp);
+    fp2_add(&F, &F, &B);
+    fp2_sub(&F, &F, &A);
+    fp2_mul(&G, &A, &E->A); // a*A
+    fp2_add(&G, &G, &B);
+    fp2_sub(&H, &D, &C);
+    fp2_mul(&res.x, &K, &F);
+    fp2_mul(&res.y, &G, &H);
+    fp2_mul(&res.t, &K, &H);
+    fp2_mul(&res.z, &F, &G);
+
+    if (fp2_is_zero(&res.x) && fp2_is_zero(&res.y) && fp2_is_zero(&res.z)) { // X3 = Y3 = Z3 = 0: fall back to doubling P
+        ted_dbl(S, P, E);
+    } else {
+        copy_ted_point(S, &res);
+    }
+}
+
+void ted_neg(ted_point_t* Q, ted_point_t const* P)
+{ // Q <- -P: (X:Y:Z:T) -> (-X:Y:Z:-T)
+    // coordinates that change sign
+    fp2_neg(&Q->x, &P->x);
+    fp2_neg(&Q->t, &P->t);
+    // coordinates copied unchanged
+    fp2_copy(&Q->y, &P->y);
+    fp2_copy(&Q->z, &P->z);
+}
+
+static bool xLIFT(fp2_t* y, const ec_point_t* P, const ec_curve_t* curve)
+{ // Returns false if it is on the curve, true if it is on the twist
+  //
+  // With P = (X : Z), A = curve->A and C = curve->C, computes
+  //     rhs = X * (C*(X^2 + Z^2) + A*X*Z) / (C*Z)
+  // and stores in y the result of fp2_sqrt applied to rhs (curve case) or to
+  // rhs * fp2_non_residue() (twist case).
+  // If P->z is zero, returns false and leaves y untouched.
+    fp2_t x_sq, z_sq, numer, denom, rhs;
+
+    if (fp2_is_zero(&P->z)) {
+        return false;
+    }
+
+    // numer = C*(X^2 + Z^2)
+    fp2_sqr(&x_sq, &P->x);
+    fp2_sqr(&z_sq, &P->z);
+    fp2_add(&numer, &x_sq, &z_sq);
+    fp2_mul(&numer, &numer, &curve->C);
+
+    // numer += A*X*Z (denom is used as scratch here)
+    fp2_mul(&denom, &P->x, &P->z);
+    fp2_mul(&denom, &denom, &curve->A);
+    fp2_add(&numer, &numer, &denom);
+
+    // numer *= X
+    fp2_mul(&numer, &numer, &P->x);
+
+    // denom = (C*Z)^(-1)
+    fp2_mul(&denom, &curve->C, &P->z);
+    assert(!fp2_is_zero(&denom));
+    fp2_inv(&denom);
+
+    fp2_mul(&rhs, &numer, &denom);
+    fp2_copy(y, &rhs);
+
+    const bool on_twist = !fp2_is_square(&rhs);
+    if (on_twist) {
+        // not a square: multiply by the non-residue before taking the root
+        fp2_t nr = fp2_non_residue();
+        fp2_mul(y, y, &nr);
+    }
+    fp2_sqrt(y);
+    return on_twist;
+}
+
+//void mont_to_ted(ec_point_t* E, ec_point_t const* A, bool twist)
+void mont_to_ted(ec_curve_t* ted_curve, ec_curve_t const* curve)
+{ // For the Montgomery curve y^2 = x^3 + (a/c)x^2 + x with a = curve->A and c = curve->C,
+  // stores a/c + 2 in ted_curve->A and a/c - 2 in ted_curve->C.
+  // The twist case is not handled here.
+    fp2_t ratio, mont_two;
+
+    // constant 2, converted to Montgomery form
+    fp2_set(&mont_two, 2);
+    fp2_tomont(&mont_two, &mont_two);
+
+    // ratio = a/c
+    fp2_copy(&ratio, &curve->C);
+    fp2_inv(&ratio);
+    fp2_mul(&ratio, &ratio, &curve->A);
+
+    fp2_add(&ted_curve->A, &ratio, &mont_two);   // a/c + 2
+    fp2_sub(&ted_curve->C, &ratio, &mont_two);   // a/c - 2
+}
+
+void mont_to_ted_point(ted_point_t* Q, ec_point_t const* P, ec_curve_t const* curve)
+{ // Converts the x-only point P = (X : Z) into the four-coordinate point Q.
+  //   Z == 0   : Q <- (0 : 1 : 1 : 0), with the ones in Montgomery form
+  //   otherwise: with y obtained from xLIFT,
+  //              Q <- (X*(X+Z) : (X-Z)*y : (X+Z)*y : Qx*Qy/Qz)
+    fp2_t sum, y;
+
+    if (fp2_is_zero(&P->z)) {
+        fp2_set(&Q->x, 0);
+        fp2_set(&Q->t, 0);
+        fp2_set(&Q->y, 1);
+        fp_tomont(Q->y.re, Q->y.re);
+        fp2_set(&Q->z, 1);
+        fp_tomont(Q->z.re, Q->z.re);
+        return;
+    }
+
+    // The curve/twist flag returned by xLIFT is not used
+    xLIFT(&y, P, curve);
+
+    fp2_add(&sum, &P->x, &P->z);
+    fp2_mul(&Q->x, &P->x, &sum);
+    fp2_mul(&Q->z, &sum, &y);
+    fp2_sub(&Q->y, &P->x, &P->z);
+    fp2_mul(&Q->y, &Q->y, &y);
+
+    // T = X*Y/Z
+    fp2_copy(&Q->t, &Q->z);
+    fp2_inv(&Q->t);
+    fp2_mul(&Q->t, &Q->t, &Q->x);
+    fp2_mul(&Q->t, &Q->t, &Q->y);
+}
+
+void ted_to_mont_point(ec_point_t* Q, ted_point_t const* P)
+{ // Q <- (Z + Y : Z - Y), using only the Y and Z coordinates of P
+    fp2_sub(&Q->z, &P->z, &P->y);
+    fp2_add(&Q->x, &P->z, &P->y);
+}
+
+bool is_ted_equal(ted_point_t const* P1, ted_point_t const* P2)
+{ // Projective comparison by cross-multiplication. Returns true when
+  //   X1*Z2 == X2*Z1,  Y1*Z2 == Y2*Z1  and  T1*Y2 == T2*Y1
+    fp2_t lhs_x, rhs_x;
+    fp2_t lhs_y, rhs_y;
+    fp2_t lhs_t, rhs_t;
+
+    fp2_mul(&lhs_x, &P1->x, &P2->z);
+    fp2_mul(&rhs_x, &P2->x, &P1->z);
+
+    fp2_mul(&lhs_y, &P1->y, &P2->z);
+    fp2_mul(&rhs_y, &P2->y, &P1->z);
+
+    fp2_mul(&lhs_t, &P1->t, &P2->y);
+    fp2_mul(&rhs_t, &P2->t, &P1->y);
+
+    if (!fp2_is_equal(&lhs_x, &rhs_x)) {
+        return false;
+    }
+    if (!fp2_is_equal(&lhs_y, &rhs_y)) {
+        return false;
+    }
+    return fp2_is_equal(&lhs_t, &rhs_t);
+}
--- a/src/ec/ref/ecx/test/ec-test.c
+++ b/src/ec/ref/ecx/test/ec-test.c
@@ -0,0 +1,18 @@
+#include "ec-tests.h"
+
+int main(int argc, char* argv[])
+{
+    // Usage: <program> test|bench <reps>
+    // Exit status is 0 only when both routines of the selected mode return non-zero.
+    if (argc < 3) {
+        printf("Please enter an argument: 'test' or 'bench' and <reps>\n");
+        exit(1);
+    }
+
+    const char* mode = argv[1];
+
+    if (strcmp(mode, "test") == 0) {
+        TEST_LOOPS = atoi(argv[2]);
+        // bitwise & (not &&) so that both test suites always run
+        return !(ec_test() & dlog_test());
+    }
+
+    if (strcmp(mode, "bench") == 0) {
+        BENCH_LOOPS = atoi(argv[2]);
+        // bitwise & (not &&) so that both benchmarks always run
+        return !(ec_run() & dlog_run());
+    }
+
+    // unknown mode
+    exit(1);
+}
--- a/src/ec/ref/ecx/test/fp2-test.c
+++ b/src/ec/ref/ecx/test/fp2-test.c
@@ -0,0 +1,142 @@
+#include <assert.h>
+#include <time.h>
+#include <stdio.h>
+#include <fp2.h>
+#include <inttypes.h>
+
+static int BENCH_LOOPS = 1000;       // Number of iterations per bench (not referenced anywhere in this file)
+static int TEST_LOOPS  = 512;       // Number of iterations per test; main() overrides it from argv[1]
+
+bool fp2_isequal(fp2_t a, fp2_t b){
+    // Equal only when both the re and the im parts compare equal
+    if (!fp_is_equal(a.re, b.re)) {
+        return false;
+    }
+    return fp_is_equal(a.im, b.im);
+}
+
+bool fp2_isone(fp2_t a){
+    // True when a.re matches, word for word, the value written by fp_mont_setone
+    // and every word of a.im is zero
+    fp_t mont_one;
+    fp_mont_setone(mont_one);
+    for(int w = 0; w < NWORDS_FIELD; w++){
+        if(a.re[w] != mont_one[w] || a.im[w] != 0){
+            return false;
+        }
+    }
+    return true;
+}
+
+void fp2_print(char *name, fp2_t const a){
+    // Prints "<name> = 0x<re> + i*0x<im>", most significant word first.
+    // The value printed is a multiplied by fp2_set(.., 1); presumably this takes a
+    // out of Montgomery form - confirm against fp2_set/fp2_mul.
+    fp2_t plain;
+    int w;
+
+    fp2_set(&plain, 1);
+    fp2_mul(&plain, &plain, &a);
+
+    printf("%s = 0x", name);
+    w = NWORDS_FIELD;
+    while(w-- > 0){
+        printf("%016" PRIx64, plain.re[w]);
+    }
+    printf(" + i*0x");
+    w = NWORDS_FIELD;
+    while(w-- > 0){
+        printf("%016" PRIx64, plain.im[w]);
+    }
+    printf("\n");
+}
+
+// VERY NOT SECURE (testing only)
+// Fills every word of a->re and a->im from rand(), multiplies the result by the
+// Montgomery one, then reseeds rand() from the first word of the result.
+void fp2_random(fp2_t *a){
+    fp2_t mont_one;
+    int w = 0;
+
+    while(w < NWORDS_FIELD){
+        a->re[w] = rand();
+        a->im[w] = rand();
+        w++;
+    }
+
+    // Normalize
+    fp_mont_setone(mont_one.re);
+    fp_set(mont_one.im, 0);
+    fp2_mul(a, a, &mont_one);
+
+    // Update seed
+    srand((unsigned) a->re[0]);
+}
+
+int main(int argc, char* argv[])
+{ // Randomized self-test of the fp2_t arithmetic; argv[1], if given, overrides TEST_LOOPS
+	if (argc > 1) {
+		TEST_LOOPS = atoi(argv[1]);
+	}
+
+	fp2_t fp2_0, fp2_1; // initialized below but not read afterwards
+	// ------------
+	fp2_set(&fp2_0, 0);
+	fp_mont_setone(fp2_1.re);fp_set(fp2_1.im,0);
+	// ------------
+
+	int i;
+	fp2_t a, b, c, d;
+	fp_t e; // only referenced by the commented-out fp2_pow checks below
+
+	for (i = 0; i < TEST_LOOPS; i++)
+	{
+		printf("[%3d%%] Testing fp2_t arithmetic", 100 * i / (int)TEST_LOOPS);
+		fflush(stdout);
+		printf("\r\x1b[K"); // carriage return + ANSI erase-line: the progress text gets overwritten
+                
+		// Random elements of fp2
+		fp2_random(&a);
+		fp2_random(&b);
+		fp2_copy(&c, &a);
+		c.re[0] += 1;
+		fp2_copy(&d, &b);
+		d.re[0] -= 1;
+
+		assert(fp2_isequal(a,b) == 0);		// different values check --> (a != b)
+		assert(fp2_isequal(c,c) == 1);		// equal values check --> 1 (c == c)
+
+		// Testing neg: -a must equal 0 - a
+		fp2_set(&b, 0);
+		fp2_copy(&c, &a);
+		fp2_neg(&a, &a);
+		fp2_sub(&c, &b, &c);
+		assert(fp2_isequal(a,c) == 1);
+
+		fp_mont_setone(a.re);fp_set(a.im,0);	// Now a == 1
+		fp2_set(&b, 0);	// Now b == 0
+
+		assert(fp2_is_zero(&a) == 0);
+		assert(fp2_is_zero(&b) == 1);
+
+		// testing c - c
+		fp2_sub(&d, &c, &c);
+		assert(fp2_is_zero(&d) == 1);
+
+		// testing c * 0
+		fp2_mul(&d, &c, &b);
+		assert(fp2_is_zero(&d) == 1);
+
+		// testing c * 1 ... recall, in Montgomery domain R mod p plays the role of the 1
+		fp_mont_setone(a.re);fp_set(a.im,0);
+		fp2_mul(&d, &c, &a);
+		assert(fp2_isequal(d, c) == 1);
+
+		// fp_set(e, 1);	// Now e == 1
+		// fp2_pow(d, e, c);
+		// assert(fp2_isequal(d, c) == 1);
+		
+		// fp_set(e, 0);	// Now e == 0
+		// fp2_pow(d, e, c);
+		// assert(fp2_isone(d) == 1);
+
+		// fp2_set(a, 1);	// Now e == R mod p
+		// fp_random(e);
+		// fp2_pow(d, e, a);
+		// assert(fp2_isone(d) == 1);
+
+		// Testing 1/a by computing (1/a) x a
+		fp2_random(&a);
+		fp2_copy(&b, &a);
+		fp2_inv(&a);
+		fp2_mul(&c, &a, &b);
+		assert(fp2_isone(c) == 1);
+
+		// Testing that a square is reported as a square
+		fp2_random(&a);
+		fp2_sqr(&b, &a);
+		assert( fp2_is_square(&b) );
+
+	};
+
+	if(TEST_LOOPS){ // skip the percentage line when no iteration ran (avoids dividing by zero)
+		printf("[%2d%%] Tested fp2_t arithmetic:\tNo errors!\n", 100 * i /TEST_LOOPS);
+	}
+	printf("-- All tests passed.\n");
+	return 0;
+}
--- a/src/ec/ref/ecx/test/isog-test.c
+++ b/src/ec/ref/ecx/test/isog-test.c
--- a/src/ec/ref/ecx/test/mont-test.c
+++ b/src/ec/ref/ecx/test/mont-test.c
@@ -0,0 +1,386 @@
+#include <time.h>
+#include <assert.h>
+#include <stdio.h>
+
+#include "ec.h"
+#include "isog.h"
+#include "test-basis.h"
+#include <bench.h> 
+
+static int BENCH_LOOPS = 1000;       // Number of iterations per bench (not referenced in this file; the benchmarks in main() use TEST_LOOPS)
+static int TEST_LOOPS  = 128;       // Number of iterations per test; main() overrides it from argv[1]
+
+// void random_scalar(fp_t k, const uint8_t j)
+// {
+//         // To implement a better random function (We must use some of the SHAKE family functions)
+//         do
+//         {
+//                 randombytes((void *)k, keyspace_bytes[j]);
+//         } while (fp_issmaller((uint64_t *)k, keyspace_size[j]));
+// }
+
+// VERY NOT SECURE (testing only)
+// Fills every word of a->re and a->im from rand(), multiplies the result by the
+// Montgomery one, then reseeds rand() from the first word of the result.
+void fp2_random(fp2_t *a){
+    fp2_t mont_one;
+    int w = 0;
+
+    while(w < NWORDS_FIELD){
+        a->re[w] = rand();
+        a->im[w] = rand();
+        w++;
+    }
+
+    // Normalize
+    fp_mont_setone(mont_one.re);
+    fp_set(mont_one.im, 0);
+    fp2_mul(a, a, &mont_one);
+
+    // Update seed
+    srand((unsigned) a->re[0]);
+}
+
+// Affine Montgomery coefficient computation (A + 2C : 4C) --> A/C
+// With A24 = A.x and C24 = A.z, writes 2*(2*A24 - C24) / C24 to *B.
+void coeff(fp2_t *B, ec_point_t const A)
+{
+	fp2_t num;
+
+	// numerator: 2*(2*A24 - C24), which is 4*A
+	fp2_add(&num, &A.x, &A.x);
+	fp2_sub(&num, &num, &A.z);
+	fp2_add(&num, &num, &num);
+
+	// denominator: C24, inverted in place inside *B
+	fp2_copy(B, &A.z);
+	fp2_inv(B);
+
+	fp2_mul(B, &num, B);
+}
+
+// Determines if point is fp2-rational (if not, then it must be a zero trace point)
+// With x = T.x / T.z, returns fp2_is_square(x^3 + a*x^2 + x).
+uint8_t isrational(ec_point_t const T, fp2_t const a)
+{
+	fp2_t x, x_sq, x_cube, rhs;
+
+	// affine x-coordinate
+	fp2_copy(&x, &T.z);
+	fp2_inv(&x);
+	fp2_mul(&x, &x, &T.x);
+
+	fp2_sqr(&x_sq, &x);
+	fp2_mul(&x_cube, &x_sq, &x);
+	fp2_mul(&x_sq, &x_sq, &a);	// x_sq now holds a*x^2
+
+	fp2_add(&rhs, &x_sq, &x_cube);
+	fp2_add(&rhs, &rhs, &x);
+
+	return fp2_is_square(&rhs);
+}
+
+// ladder3pt computes x(P + [m]Q)
+// The scalar m is scanned from its least significant bit, 64 bits per word over
+// NWORDS_FIELD words. PQ is the third input point of the ladder.
+void ladder3pt(ec_point_t* R, fp_t const m, ec_point_t const* P, ec_point_t const* Q, ec_point_t const* PQ, ec_point_t const* A)
+{
+	ec_point_t X0, X1, X2;
+	int word, pos;
+
+	copy_point(&X0, Q);
+	copy_point(&X1, P);
+	copy_point(&X2, PQ);
+
+	for (word = 0; word < NWORDS_FIELD; word++)
+	{
+		for (pos = 0; pos < 64; pos++)
+		{
+			const uint64_t bit = ((uint64_t)m[word] >> pos) & 1;
+
+			// X1 and X2 are swapped around the step when the current bit is 0
+			swap_points(&X1, &X2, -(bit == 0));
+			xDBLADD(&X0, &X1, &X0, &X1, &X2, A);
+			swap_points(&X1, &X2, -(bit == 0));
+		}
+	}
+	copy_point(R, &X1);
+}
+
+// For computing [(p + 1) / l_i]P, i:=0, ..., (N - 1)
+// On entry P[lower] holds the starting point. The range [lower, upper) is split at mid:
+// P[mid] receives P[lower] multiplied by the primes of [lower, mid), P[lower] is multiplied
+// by the primes of [mid, upper), and both halves are processed recursively. On return each
+// P[i] is the starting point multiplied by every TORSION_ODD_PRIMES entry of the range but the i-th.
+void cofactor_multiples(ec_point_t P[], ec_point_t const* A, size_t lower, size_t upper)
+{
+	assert(lower < upper);
+	if (upper - lower == 1)
+		return ;
+
+	const size_t mid = lower + (upper - lower + 1) / 2;
+	size_t k;
+
+	copy_point(&P[mid], &P[lower]);
+
+	// right half starts from the point with the left half's primes applied
+	for (k = lower; k < mid; k++)
+		xMULv2(&P[mid], &P[mid], &TORSION_ODD_PRIMES[k], p_plus_minus_bitlength[k], A);
+
+	// left half keeps the point with the right half's primes applied
+	for (k = mid; k < upper; k++)
+		xMULv2(&P[lower], &P[lower], &TORSION_ODD_PRIMES[k], p_plus_minus_bitlength[k], A);
+
+	cofactor_multiples(P, A, lower, mid);
+	cofactor_multiples(P, A, mid, upper);
+}
+
+// The projective x-coordinate point (X : Z) at infinity is such that Z == 0
+static inline int isinfinity(ec_point_t const P)
+{
+	const int z_is_zero = fp2_is_zero(&P.z);
+	return z_is_zero;
+}
+
+int main(int argc, char* argv[])
+{
+	// Self-test of the x-only Montgomery arithmetic (xMULv2, xDBLADD via ladder3pt), followed
+	// by two cycle-count benchmarks. argv[1], if given, overrides TEST_LOOPS.
+	// Returns 0; failures are reported through assert().
+	if (argc > 1) {
+		TEST_LOOPS = atoi(argv[1]);
+	}
+
+	int i, j;
+
+	// Curve constants stored as (A' + 2C : 4C), built here from A' = 0 and C = 1
+	ec_point_t A;
+	fp2_set(&A.x, 0);
+	fp_mont_setone(A.z.re);fp_set(A.z.im,0);
+
+	fp2_add(&A.z, &A.z, &A.z);	// 2C
+	fp2_add(&A.x, &A.x, &A.z);	// A' + 2C
+	fp2_add(&A.z, &A.z, &A.z);	// 4C
+
+	// Just to ensure both projective curve coefficients are different from zero
+	assert( !fp2_is_zero(&A.x) && !fp2_is_zero(&A.z) );
+
+	fp2_t a;
+	coeff(&a, A);
+
+	ec_point_t PA, QA, PQA, PB, QB, PQB;
+
+	// Writing the public projective x-coordinate points into Montgomery domain
+	fp2_tomont(&(PA.x), &(xPA));
+	fp_mont_setone(PA.z.re);fp_set(PA.z.im,0);
+	fp2_tomont(&(QA.x), &(xQA));
+	fp_mont_setone(QA.z.re);fp_set(QA.z.im,0);
+	fp2_tomont(&(PQA.x), &(xPQA));
+	fp_mont_setone(PQA.z.re);fp_set(PQA.z.im,0);
+
+	assert( isrational(PA, a) );
+	assert( isrational(QA, a) );
+	assert( isrational(PQA, a) );
+
+	// ======================================================================================================
+	// Recall, PA, QA, and PQA are expected to be N-order points, but we still need to check it.
+	// First strip the extra prime powers: multiply by each prime (power - 1) times.
+	for (j = 0; j < P_LEN; j++)
+	{
+		for (i = 1; i < TORSION_ODD_POWERS[j]; i++)
+		{
+			xMULv2(&PA, &PA, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
+			xMULv2(&QA, &QA, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
+			xMULv2(&PQA, &PQA, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
+	
+			assert( isrational(PA, a) );
+			assert( isrational(QA, a) );
+			assert( isrational(PQA, a) );
+		};
+	};
+	assert( !isinfinity(PA) );
+	assert( !isinfinity(QA) );
+	assert( !isinfinity(PQA) );
+	
+	// Entries [0, P_LEN) are used for the A-side points, [P_LEN, P_LEN + M_LEN) for the B-side points
+	ec_point_t P[P_LEN + M_LEN], Q[P_LEN + M_LEN], PQ[P_LEN + M_LEN];
+	copy_point(&(P[0]), &PA);
+	cofactor_multiples(P, &A, 0, P_LEN);
+	copy_point(&(Q[0]), &QA);
+	cofactor_multiples(Q, &A, 0, P_LEN);
+	copy_point(&(PQ[0]), &PQA);
+	cofactor_multiples(PQ, &A, 0, P_LEN);
+	for (j = 0; j < P_LEN; j++)
+	{
+		// x(PA)
+		assert( !isinfinity(P[j]) );	// It must be different from the point at infinity
+		assert( isrational(P[j], a) );
+		xMULv2(&P[j], &P[j], &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
+		assert( isinfinity(P[j]) );		// It must be now the point at infinity
+		// x(QA)
+		assert( !isinfinity(Q[j]) );	// It must be different from the point at infinity
+		assert( isrational(Q[j], a) );
+		xMULv2(&Q[j], &Q[j], &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
+		assert( isinfinity(Q[j]) );		// It must be now the point at infinity
+		// x(PQA)
+		assert( !isinfinity(PQ[j]) );	// It must be different from the point at infinity
+		assert( isrational(PQ[j], a) );
+		xMULv2(&PQ[j], &PQ[j], &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
+		assert( isinfinity(PQ[j]) );	// It must be now the point at infinity
+	};
+	// Writing the public projective x-coordinate points into Montgomery domain
+	fp2_tomont(&(PB.x), &(xPB));
+	fp_mont_setone(PB.z.re);fp_set(PB.z.im,0);
+	fp2_tomont(&(QB.x), &(xQB));
+	fp_mont_setone(QB.z.re);fp_set(QB.z.im,0);
+	fp2_tomont(&(PQB.x), &(xPQB));
+	fp_mont_setone(PQB.z.re);fp_set(PQB.z.im,0);
+
+	assert( !isrational(PB, a) );
+	assert( !isrational(QB, a) );
+	assert( !isrational(PQB, a) );
+	// ======================================================================================================
+	// Recall, PB, QB, and PQB are expected to be M-order points, but we still need to check it
+	for (j = P_LEN; j < (P_LEN + M_LEN); j++)
+	{
+		for (i = 1; i < TORSION_ODD_POWERS[j]; i++)
+		{
+			xMULv2(&PB, &PB, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
+			xMULv2(&QB, &QB, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
+			xMULv2(&PQB, &PQB, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
+	
+			assert( !isrational(PB, a) );
+			assert( !isrational(QB, a) );
+			assert( !isrational(PQB, a) );
+		};
+	};
+	assert( !isinfinity(PB) );
+	assert( !isinfinity(QB) );
+	assert( !isinfinity(PQB) );
+
+	copy_point(&(P[P_LEN]), &PB);
+	cofactor_multiples(P, &A, P_LEN, P_LEN + M_LEN);
+	copy_point(&(Q[P_LEN]), &QB);
+	cofactor_multiples(Q, &A, P_LEN, P_LEN + M_LEN);
+	copy_point(&(PQ[P_LEN]), &PQB);
+	cofactor_multiples(PQ, &A, P_LEN, P_LEN + M_LEN);
+	for (j = P_LEN; j < (P_LEN+M_LEN); j++)
+	{
+		// x(PB)
+		assert( !isinfinity(P[j]) );	// It must be different from the point at infinity
+		assert( !isrational(P[j], a) );
+		xMULv2(&P[j], &P[j], &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
+		assert( isinfinity(P[j]) );		// It must be now the point at infinity
+		// x(QB)
+		assert( !isinfinity(Q[j]) );	// It must be different from the point at infinity
+		assert( !isrational(Q[j], a) );
+		xMULv2(&Q[j], &Q[j], &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
+		assert( isinfinity(Q[j]) );		// It must be now the point at infinity
+		// x(PQB)
+		assert( !isinfinity(PQ[j]) );	// It must be different from the point at infinity
+		assert( !isrational(PQ[j], a) );
+		xMULv2(&PQ[j], &PQ[j], &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
+		assert( isinfinity(PQ[j]) );	// It must be now the point at infinity
+	};
+
+	fp2_t m;
+
+	// Reload the public points: the copies above were modified by the order checks
+	fp2_tomont(&(PA.x), &(xPA));
+	fp_mont_setone(PA.z.re);fp_set(PA.z.im,0);
+	fp2_tomont(&(QA.x), &(xQA));
+	fp_mont_setone(QA.z.re);fp_set(QA.z.im,0);
+	fp2_tomont(&(PQA.x), &(xPQA));
+	fp_mont_setone(PQA.z.re);fp_set(PQA.z.im,0);
+
+	assert( isrational(PA, a) );
+	assert( isrational(QA, a) );
+	assert( isrational(PQA, a) );
+	
+	fp2_tomont(&(PB.x), &(xPB));
+	fp_mont_setone(PB.z.re);fp_set(PB.z.im,0);
+	fp2_tomont(&(QB.x), &(xQB));
+	fp_mont_setone(QB.z.re);fp_set(QB.z.im,0);
+	fp2_tomont(&(PQB.x), &(xPQB));
+	fp_mont_setone(PQB.z.re);fp_set(PQB.z.im,0);
+
+	assert( !isrational(PB, a) );
+	assert( !isrational(QB, a) );
+	assert( !isrational(PQB, a) );
+
+	// Randomized part: R = ladder3pt(random scalar) must pass the same order checks
+	ec_point_t R[P_LEN + M_LEN];
+	int k;
+	for (j = 0; j < TEST_LOOPS; j++)
+	{
+		printf("[%3d%%] Testing EC differential arithmetic", 100 * j / TEST_LOOPS);
+		fflush(stdout);
+		printf("\r\x1b[K");
+		fp2_random(&m);
+		ladder3pt(&(R[0]), m.re, &PA, &QA, &PQA, &A);
+		assert( isrational(R[0], a) );
+		for (k = 0; k < P_LEN; k++)
+		{
+			for (i = 1; i < TORSION_ODD_POWERS[k]; i++)
+			{
+				xMULv2(&R[0], &R[0], &(TORSION_ODD_PRIMES[k]), p_plus_minus_bitlength[k], &A);
+				assert( isrational(R[0], a) );
+			};
+		};
+		cofactor_multiples(R, &A, 0, P_LEN);
+		for (i = 0; i < P_LEN; i++)
+		{
+			assert( !isinfinity(R[i]) );	// It must be different from the point at infinity
+			assert( isrational(R[i], a) );
+			xMULv2(&R[i], &R[i], &(TORSION_ODD_PRIMES[i]), p_plus_minus_bitlength[i], &A);
+			assert( isinfinity(R[i]) );		// It must be now the point at infinity
+		};
+
+		fp2_random(&m);
+		ladder3pt(&(R[P_LEN]), m.re, &PB, &QB, &PQB, &A);
+		assert( !isrational(R[P_LEN], a) );
+		for (k = P_LEN; k < (P_LEN+M_LEN); k++)
+		{
+			for (i = 1; i < TORSION_ODD_POWERS[k]; i++)
+			{
+				xMULv2(&R[P_LEN], &R[P_LEN], &(TORSION_ODD_PRIMES[k]), p_plus_minus_bitlength[k], &A);
+				assert( !isrational(R[P_LEN], a) );
+			};
+		};
+		cofactor_multiples(R, &A, P_LEN, P_LEN + M_LEN);
+		for (i = P_LEN; i < (P_LEN+M_LEN); i++)
+		{
+			assert( !isinfinity(R[i]) );	// It must be different from the point at infinity
+			assert( !isrational(R[i], a) );
+			xMULv2(&R[i], &R[i], &(TORSION_ODD_PRIMES[i]), p_plus_minus_bitlength[i], &A);
+			assert( isinfinity(R[i]) );		// It must be now the point at infinity
+		};
+	};
+
+	if(TEST_LOOPS)
+		printf("[%3d%%] Tested EC differential arithmetic:\tNo errors!\n", 100 * j / TEST_LOOPS);
+	printf("-- All tests passed.\n");
+
+	// The benchmarks below size their arrays by TEST_LOOPS and average over it, so they
+	// cannot run with a non-positive count (zero-length array, division by zero).
+	if (TEST_LOOPS <= 0)
+		return 0;
+
+	// BENCHMARK xDBLv2
+	unsigned long long cycles, cycles1, cycles2;
+	cycles = 0;
+	ec_point_t PP[TEST_LOOPS], EE[TEST_LOOPS];
+	for(int i = 0; i < TEST_LOOPS; i++){
+		fp2_random(&PP[i].x);
+		fp2_random(&PP[i].z);
+		fp2_random(&EE[i].x);
+		fp2_random(&EE[i].z);
+	}
+	cycles1 = cpucycles(); 
+	for(int i = 0; i < TEST_LOOPS; i++){
+		xDBLv2(&PP[i], &PP[i], &EE[i]);
+	}
+	cycles2 = cpucycles();
+	cycles = cycles+(cycles2-cycles1);
+	
+	printf("xDBLv2 bench: %7llu cycles\n", cycles/TEST_LOOPS);
+
+	// BENCHMARK xIsog4
+	// The timed loop is an inlined sequence of fp2 operations on PP and KK*, written to EE
+	cycles = 0;
+	ec_point_t KK0[TEST_LOOPS], KK1[TEST_LOOPS], KK2[TEST_LOOPS];
+	for(int i = 0; i < TEST_LOOPS; i++){
+		fp2_random(&KK0[i].x);
+		fp2_random(&KK0[i].z);
+		fp2_random(&KK1[i].x);
+		fp2_random(&KK1[i].z);
+		fp2_random(&KK2[i].x);
+		fp2_random(&KK2[i].z);
+	}
+	cycles1 = cpucycles(); 
+	for(int i = 0; i < TEST_LOOPS; i++){
+		fp2_t t0, t1;
+		fp2_add(&t0, &PP[i].x, &PP[i].z);
+		fp2_sub(&t1, &PP[i].x, &PP[i].z);
+		fp2_mul(&(EE[i].x), &t0, &KK1[i].x);
+		fp2_mul(&(EE[i].z), &t1, &KK2[i].x);
+		fp2_mul(&t0, &t0, &t1);
+		fp2_mul(&t0, &t0, &KK0[i].x); 
+		fp2_add(&t1, &(EE[i].x), &(EE[i].z));
+		fp2_sub(&(EE[i].z), &(EE[i].x), &(EE[i].z));
+		fp2_sqr(&t1, &t1);
+		fp2_sqr(&(EE[i].z), &(EE[i].z));
+		fp2_add(&(EE[i].x), &t0, &t1);
+		fp2_sub(&t0, &(EE[i].z), &t0);
+		fp2_mul(&(EE[i].x), &(EE[i].x), &t1);
+		fp2_mul(&(EE[i].z), &(EE[i].z), &t0);
+	}
+	cycles2 = cpucycles();
+	cycles = cycles+(cycles2-cycles1);
+	printf("xeval_4 bench: %7llu cycles\n", cycles/TEST_LOOPS);
+
+	return 0;
+}
--- a/src/ec/ref/ecx/test/poly-mul-test.c
+++ b/src/ec/ref/ecx/test/poly-mul-test.c
@@ -0,0 +1,445 @@
+#include <poly.h>
+#include <assert.h>
+#include <stdio.h>
+
+bool fp2_isequal(fp2_t a, fp2_t b){
+    // Equal only when both the re and the im parts compare equal
+    if (!fp_is_equal(a.re, b.re)) {
+        return false;
+    }
+    return fp_is_equal(a.im, b.im);
+}
+
+// VERY NOT SECURE (testing only)
+// Fills every word of a->re and a->im from rand(), multiplies the result by the
+// Montgomery one, then reseeds rand() from the first word of the result.
+void fp2_random(fp2_t *a){
+    fp2_t mont_one;
+    int w = 0;
+
+    while(w < NWORDS_FIELD){
+        a->re[w] = rand();
+        a->im[w] = rand();
+        w++;
+    }
+
+    // Normalize
+    fp_mont_setone(mont_one.re);
+    fp_set(mont_one.im, 0);
+    fp2_mul(a, a, &mont_one);
+
+    // Update seed
+    srand((unsigned) a->re[0]);
+}
+
+void slow_mul(poly h, poly f, int lenf, poly g, int leng){
+  // Computes h = f*g by school method
+  //
+  // f has lenf coefficients and g has leng; h receives lenf+leng-1 coefficients.
+  // Nothing is written when lenf+leng-1 <= 0. The product is accumulated in a
+  // temporary and copied out at the end, so h may overlap f or g.
+
+  const int lenh = lenf + leng - 1;
+  int e, i, j;
+
+  if(lenh <= 0){
+    return;
+  }
+
+  // Keep the longer operand first
+  if(leng > lenf){
+    slow_mul(h, g, leng, f, lenf);
+    return;
+  }
+
+  fp2_t acc, term;
+  fp2_t prod[lenh];
+
+  for(e = 0; e < lenh; e++){
+    // Coefficient e is the sum of f[i]*g[j] over i+j == e, starting from the largest valid i
+    i = (lenf - 1 < e) ? lenf - 1 : e;
+    fp2_set(&acc, 0);
+    for(j = e - i; j < leng && i >= 0; j++, i--){
+      fp2_mul(&term, &f[i], &g[j]);
+      fp2_add(&acc, &acc, &term);
+    }
+    fp2_copy(&prod[e], &acc);
+  }
+
+  for(e = 0; e < lenh; e++){
+    fp2_copy(&h[e], &prod[e]);
+  }
+}
+
+
+
+int main(){ // Checks poly_mul, poly_mul_low, poly_mul_middle, poly_mul_selfreciprocal and the product trees against slow_mul / repeated poly_mul
+  fp2_t fp2_0, fp2_1; // initialized below but not read afterwards
+  #define nmax 16
+  int nf, ng, n, e;
+        fp2_set(&fp2_0, 0);
+        fp_mont_setone(fp2_1.re);fp_set(fp2_1.im,0); 
+  
+  //TEST MULTIPLICATION BY 0
+  
+  for(nf = 2; nf < nmax; nf++){
+    fp2_t f[nf], h[nf-1];
+
+    printf("[%3d%%] Testing multiplication by 0", 100 * nf / nmax);
+    fflush(stdout);
+    printf("\r\x1b[K"); // carriage return + ANSI erase-line: the progress text gets overwritten
+    
+    for(e = 0; e < nf; e++){
+      fp2_random(&f[e]);
+    }
+    poly_mul(h, f, nf, f, 0); // second operand has length 0
+    for(e = 0; e < nf-1; e++){
+      assert(fp2_is_zero(&h[e])==1);
+    }
+    poly_mul(h, f, 0, f, nf); // first operand has length 0
+    for(e = 0; e < nf-1; e++){
+      assert(fp2_is_zero(&h[e])==1);
+    }
+  }
+  printf("[%3d%%] Tested multiplication by 0:\t\tNo errors!\n", 100 * nf / nmax);
+
+  
+  
+  //TEST FOR f, g, h DISJOINT MEMORY SPACES
+  
+  for(nf = 1; nf < nmax; nf++){
+    
+    printf("[%3d%%] Testing multiplication", 100 * nf / nmax);
+    fflush(stdout);
+    printf("\r\x1b[K");
+    
+    for(ng = 1; ng < nmax; ng++){
+      
+      fp2_t f[nf];   //Random length nf poly
+      for(e = 0; e < nf; e++){
+	fp2_random(&f[e]);
+      }
+      
+      fp2_t g[ng];  // Random length ng poly
+      for(e = 0; e < ng; e++){
+	fp2_random(&g[e]);
+      }
+      
+      fp2_t h[nf+ng-1];// Compute product
+      poly_mul(h, f, nf, g, ng);
+
+      fp2_t fg[nf+ng-1]; // Compute the product by school method
+      slow_mul(fg, f, nf, g, ng);
+      
+      for(e = 0; e < nf + ng - 1; e++){   // Verify answer term by term
+	assert(fp2_isequal(h[e], fg[e])==1);
+      }
+    }
+  }
+  printf("[%3d%%] Tested multiplication:\t\t\tNo errors!\n", 100 * nf / nmax);
+
+  
+
+  // TEST FOR f, g CONTIGUOUS AND RESULT SAVED OVER THEM
+    
+  for(nf = 1; nf < nmax; nf++){
+          
+    printf("[%3d%%] Testing multiplication in place", 100 * nf / nmax);
+    fflush(stdout);
+    printf("\r\x1b[K");
+    
+    for(ng = 1; ng < nmax; ng++){
+      
+      fp2_t h[nf+ng]; // f occupies h[0..nf), g occupies h[nf..nf+ng)
+      
+      //Random length nf poly
+      for(e = 0; e < nf; e++){
+	fp2_random(&h[e]);
+      }
+      
+      // Random length ng poly
+      for(e = 0; e < ng; e++){
+	fp2_random(&h[e+nf]);
+      }
+
+      // Compute the product
+      fp2_t fg[nf+ng-1];
+      slow_mul(fg, h, nf, &(h[nf]), ng); // School method
+      poly_mul(h, h, nf, &(h[nf]), ng); // Karatsuba method
+
+
+      for(e = 0; e < nf + ng - 1; e++){   // Verify answer term by term
+	assert(fp2_isequal(h[e], fg[e])==1);
+      }
+    }
+  }
+    printf("[%3d%%] Tested multiplication in place:\t\tNo errors!\n", 100 * nf / nmax);
+
+    
+    
+  //TEST FOR MULTIPLICATION MOD X^N BY 0
+    
+  for(nf = 2; nf < nmax; nf++){
+    fp2_t f[nf];
+    
+    printf("[%3d%%] Testing mul mod x^n by 0", 100 * nf / nmax);
+    fflush(stdout);
+    printf("\r\x1b[K");
+    
+    for(e = 0; e < nf; e++){
+      fp2_random(&f[e]);
+    }
+    
+    for(n = 1; n < nmax; n++){
+      fp2_t h[n];
+      poly_mul_low(h, n, f, nf, f, 0);
+      for(e = 0; e < n; e++){
+	assert(fp2_is_zero(&h[e])==1);
+      }
+      poly_mul_low(h, n, f, 0, f, nf);
+      for(e = 0; e < n; e++){
+	assert(fp2_is_zero(&h[e])==1);
+      }
+    }
+  }
+  printf("[%3d%%] Tested mul mod x^n by 0:\t\t\tNo errors!\n", 100 * nf / nmax);
+
+  
+  
+  //TEST FOR MULTIPLICATION MOD X^N
+    
+    for(nf = 1; nf < nmax; nf++){
+    
+      printf("[%3d%%] Testing mul mod x^n", 100 * nf / nmax);
+      fflush(stdout);
+      printf("\r\x1b[K");
+      
+      for(ng = 1; ng < nmax; ng++){
+
+	fp2_t f[nf], g[ng], fg[nf+ng-1];
+	poly h;
+
+	//Get random polynomials
+	for(e = 0; e < nf; e++){
+	  fp2_random(&f[e]);
+	}
+	for(e = 0; e < ng; e++){
+	  fp2_random(&g[e]);
+	}
+	
+	//Save regular result to fg
+	slow_mul(fg, f, nf, g, ng);
+
+	//Compute result mod x^n
+	for(n = 1; n < 2*nmax; n++){
+	  h = malloc(sizeof(fp2_t)*n); // NOTE(review): malloc result is not checked
+	  poly_mul_low(h, n, f, nf, g, ng);
+
+	  //Compare with expected: the first min(n, nf+ng-1) terms match fg, the rest must be zero
+	  e = 0;
+	  while(e < nf+ng-1 && e < n){
+	    assert(fp2_isequal(h[e], fg[e]) == 1);
+	    e++;
+	  }
+	  while(e < n){
+	    assert(fp2_is_zero(&h[e]) == 1);
+	    e++;
+	  }
+	  free(h);
+	}
+      }
+    }
+    printf("[%3d%%] Tested mul mod x^n:\t\t\tNo errors!\n", 100 * nf / nmax);
+
+  
+     
+  //TEST FOR POLY_MUL_MIDDLE
+    
+    for(nf = 1; nf < 2*nmax; nf+=1){
+      fp2_t f[nf];
+      
+      printf("[%3d%%] Testing poly_mul_middle", 100 * nf / (2*nmax));
+      fflush(stdout);
+      printf("\r\x1b[K");
+      
+      for(ng = (nf+1)>>1; ng < (nf+1)-((nf+1)>>1); ng++){
+	// Intended to run from floor((nf+1)/2) to ceil((nf+1)/2).
+	// NOTE(review): with the strict '<' the ceil value is never reached, and the loop body
+	// does not run at all when nf is odd (floor == ceil) - confirm whether '<=' was meant.
+	fp2_t g[ng];
+	for(e = 0; e < nf; e++){
+	  fp2_random(&f[e]);
+	}
+	for(e = 0; e < ng; e++){
+	  fp2_random(&g[e]);
+	}
+	
+	fp2_t h[nf+ng-1];
+	slow_mul(h, g, ng, f, nf);
+	poly_mul_middle(g, g, ng, f, nf); // result overwrites g
+      
+	for(e = 0; e < ng; e++){
+	  assert(fp2_isequal(h[e+nf-ng], g[e])==1); // g must equal terms nf-ng .. nf-1 of the full product
+	}
+      }
+    }
+    printf("[%3d%%] Tested poly_mul_middle:\t\t\tNo errors!\n", 100 * nf / (2*nmax));
+
+  
+  // TEST FOR SELF RECIPROCAL MULTIPLICATION
+    for(nf = 1; nf < nmax; nf++){
+
+      printf("[%3d%%] Testing self reciprocal mul", 100 * nf / nmax);
+      fflush(stdout);
+      printf("\r\x1b[K");
+
+      for(ng = 1; ng < nmax; ng++){
+      
+	fp2_t f[nf], g[ng], h[nf+ng-1], fg[nf+ng-1];
+
+	// Get random palindromes
+	for(e = 0; e < (nf>>1); e++){
+	  fp2_random(&f[e]);
+	  fp2_copy(&f[nf-1-e], &f[e]);
+	}
+	if(nf & 1){
+	  fp2_random(&f[nf>>1]);
+	}
+
+	for(e = 0; e < (ng>>1); e++){
+	  fp2_random(&g[e]);
+	  fp2_copy(&g[ng-1-e], &g[e]);
+	}
+	if(ng & 1){
+	  fp2_random(&g[ng>>1]);
+	} 
+
+	// Compute products
+	poly_mul_selfreciprocal(h, g, ng, f, nf);
+	slow_mul(fg, g, ng, f, nf);
+
+	// Compare
+	for(e = 0; e < nf+ng-1; e++){
+	  assert(fp2_isequal(fg[e], h[e])==1);
+	}
+      }
+    }		 
+    printf("[%3d%%] Tested self reciprocal mul:\t\tNo errors!\n", 100 * nf / nmax);
+
+  // TEST FOR PRODUCT TREES
+    int tree_size, iteration, i;
+    int  len, *DEG, LENF;
+    poly *H, *F, h;
+    
+    for(tree_size = 1; tree_size < nmax; tree_size++){
+
+      printf("[%3d%%] Testing product tree:\t\t\tSize %d out of %d", 100 * tree_size / nmax, tree_size, nmax-1);
+      fflush(stdout);
+      printf("\r\x1b[K");
+
+      i = 0;
+      while((1<<i) < tree_size){ // smallest i with 2^i >= tree_size
+	i++;
+      }
+      DEG = malloc(sizeof(int)*((1<<(i+2))-1));
+      H = malloc(sizeof(poly)*((1<<(i+2))-1));
+      F = malloc(sizeof(poly)*tree_size);
+      h = malloc(sizeof(fp2_t)*(nmax+1)*tree_size);
+
+      for(iteration = 0; iteration < nmax + 1 - tree_size ; iteration++){
+
+	// Generate random list of polynomials
+	LENF = (rand() % nmax)+1; // common length of all leaves, in 1..nmax
+	for(i = 0; i < tree_size; i++){
+	  F[i] = malloc(sizeof(fp2_t)*LENF);
+	  for(e = 0; e < LENF; e++){
+	    fp2_random(&F[i][e]);
+	  }
+	}
+	product_tree(H, DEG, 0, F, LENF, tree_size);
+	
+	// Build product of all polynomials manually
+	len = LENF;
+	
+	//for(e = 0; e < LENF[0]; e++){
+	for(e = 0; e < LENF; e++){
+	  fp2_copy(&h[e], &F[0][e]);
+	}
+	for(i = 1; i < tree_size; i++){
+	  poly_mul(h, h, len, F[i], LENF);
+	  len += LENF-1;
+	}
+
+	// Compare to root
+	assert (len == DEG[0]+1);
+	for(e = 0; e < len; e++){
+	  assert(fp2_isequal(H[0][e], h[e])==1);
+	}
+      clear_tree(H, 0, tree_size);
+      for(i = 0; i < tree_size; i++){
+	free(F[i]);
+      }
+
+      }
+      free(DEG);
+      free(H);
+      free(F); 
+      free(h);
+    }
+    printf("[%3d%%] Tested product tree:\t\t\tNo errors!\n", 100 * tree_size / nmax);
+    
+  // TEST FOR SELF RECIPROCAL PRODUCT TREES
+    
+    for(tree_size = 1; tree_size < nmax; tree_size++){
+
+      printf("[%3d%%] Testing selfreciprocal product tree:\tSize %d out of %d", 100 * tree_size / nmax, tree_size, nmax-1);
+      fflush(stdout);
+      printf("\r\x1b[K");
+
+      i = 0;
+      while((1<<i) < tree_size){ // smallest i with 2^i >= tree_size
+	i++;
+      }
+      DEG = malloc(sizeof(int)*((1<<(i+2))-1));
+      H = malloc(sizeof(poly)*((1<<(i+2))-1));
+      F = malloc(sizeof(poly)*tree_size);
+      h = malloc(sizeof(fp2_t)*(nmax+1)*tree_size);
+
+      for(iteration = 0; iteration < nmax + 1 - tree_size ; iteration++){
+
+	// Generate random list of palindromic polynomials
+	LENF = (rand() % nmax)+1;;
+	for(i = 0; i < tree_size; i++){
+	  F[i] = malloc(sizeof(fp2_t)*LENF);
+	  for(e = 0; e < (LENF>>1); e++){
+	    fp2_random(&F[i][e]);
+	    fp2_copy(&F[i][LENF-1-e], &F[i][e]);
+	  }
+	  if(LENF & 1){
+	  	fp2_random(&F[i][(LENF>>1)]);
+	  }
+	}
+	product_tree_selfreciprocal(H, DEG, 0, F, LENF, tree_size);
+	
+	// Build product of all polynomials manually
+	len = LENF;
+	for(e = 0; e < LENF; e++){
+	  fp2_copy(&h[e], &F[0][e]);
+	}
+	for(i = 1; i < tree_size; i++){
+	  poly_mul(h, h, len, F[i], LENF);
+	  len += LENF-1;
+	}
+
+	// Compare to root
+	assert (len == DEG[0]+1);
+	for(e = 0; e < len; e++){
+	  assert(fp2_isequal(H[0][e], h[e])==1);
+	}
+      clear_tree(H, 0, tree_size);
+      for(i = 0; i < tree_size; i++){
+	free(F[i]);
+      }
+
+      }
+      free(DEG);
+      free(H);
+      free(F); 
+      free(h);
+    }
+    printf("[%3d%%] Tested selfreciprocal product tree:\tNo errors!\n", 100 * tree_size / nmax);
+    
+    printf("-- All tests passed.\n");
+    return 0;
+}
+  
--- a/src/ec/ref/ecx/test/poly-redc-test.c
+++ b/src/ec/ref/ecx/test/poly-redc-test.c
@@ -0,0 +1,461 @@
+#include "poly.h"
+#include <assert.h>
+#include <stdio.h>
+#define nmax 32
+
+// Test helper: nonzero iff the real parts and the imaginary parts of a and b
+// both compare equal under fp_is_equal.
+bool fp2_isequal(fp2_t a, fp2_t b)
+{
+    if (!fp_is_equal(a.re, b.re))
+        return 0;
+    return fp_is_equal(a.im, b.im);
+}
+
+// NOT SECURE - testing only.
+// Fills every word of a->re and a->im from rand(), multiplies the result by
+// the Montgomery one, then reseeds rand() from the first word of a->re.
+void fp2_random(fp2_t *a)
+{
+    int w = 0;
+    while (w < NWORDS_FIELD) {
+        a->re[w] = rand();
+        a->im[w] = rand();
+        w++;
+    }
+
+    // Normalize (presumably brings the raw words into the field's reduced form - confirm against fp2_mul)
+    fp2_t mont_one;
+    fp_mont_setone(mont_one.re);
+    fp_set(mont_one.im, 0);
+    fp2_mul(a, a, &mont_one);
+
+    // Update seed
+    srand((unsigned) a->re[0]);
+}
+
+/*
+ * Test driver for the polynomial routines. Exercises reciprocal(), poly_redc(),
+ * reciprocal_tree(), multieval_unscaled() and multieval_scaled() on random
+ * inputs and compares each result with a value recomputed by other means.
+ * Every check is an assert(), so the first mismatch aborts the program.
+ * Returns 0 once all sections have run.
+ */
+int main(){
+  int lenf, leng, n, e, iteration, array_size, tree_size, i, root, brother, *DEG, LENF;
+  poly f, g, h, f_rev, f_rev_inv, *F, *H, *R, g1, g2, REM1, REM2, G1, G2, G1_rev, G2_rev, R0;
+  fp2_t c, *A, ratio, A0;
+
+  f_rev_inv = NULL;
+
+  // TEST FOR RECIPROCAL
+  for(lenf = 1; lenf < nmax; lenf++)
+  {
+    printf("[%3d%%] Testing reciprocals", 100 * lenf / nmax);
+    fflush(stdout);
+    printf("\r\x1b[K");
+
+    // Get random poly
+    f = malloc(sizeof(fp2_t)*lenf);
+    for(e = 0; e < lenf; e++)
+      fp2_random(&f[e]);
+
+    for(n = 1; n < nmax; n++)
+    {
+      // Get the reciprocal and multiply them
+      h = malloc(sizeof(fp2_t)*n);
+      memset(h, 0, sizeof(fp2_t)*n);
+      reciprocal(h, &c, f, lenf, n);
+      poly_mul_low(h, n, f, lenf, h, n);
+
+      // Compare with expected: the low n coefficients of f*h must be (c, 0, ..., 0)
+      assert(fp2_isequal(h[0],c));
+      for(e = 1; e < n; e++)
+        assert(fp2_is_zero(&h[e]));
+      free(h);
+    }
+    free(f);
+  }
+  printf("[%3d%%] Tested reciprocals:\t\tNo errors!\n", 100 * lenf / nmax);
+
+  // TEST FOR REDUCTION
+  for(lenf = 2; lenf < nmax; lenf++)
+  {
+    printf("[%3d%%] Testing polynomial reduction", 100 * lenf / nmax);
+    fflush(stdout);
+    printf("\r\x1b[K");
+
+    // Get random poly for the mod, and its coefficient-reversed copy
+    f = malloc(sizeof(fp2_t)*lenf);
+    f_rev = malloc(sizeof(fp2_t)*lenf);
+    for(e = 0; e < lenf; e++)
+    {
+      fp2_random(&f[e]);
+      fp2_copy(&f_rev[lenf-1-e], &f[e]);
+    }
+
+    for(leng = 1; leng < nmax; leng++)
+    {
+      // Get random poly to reduce
+      g = malloc(sizeof(fp2_t)*leng);
+      for(e = 0; e < leng; e++){
+        fp2_random(&g[e]);
+      }
+
+      // Get reverse-inverse mod x^(leng-lenf+1)
+      if(leng >= lenf)
+      {
+        f_rev_inv = malloc(sizeof(fp2_t)*(leng-lenf+1));
+        reciprocal(f_rev_inv, &c, f_rev, lenf, leng-lenf+1);
+      }
+      else{
+        // g is already shorter than f: no reciprocal is computed, scale factor is one
+        fp_mont_setone(c.re);fp_set(c.im,0);
+      }
+
+      // Compute the reduction
+      h = malloc(sizeof(fp2_t)*(lenf-1));
+      poly_redc(h, g, leng, f, lenf, f_rev_inv, c);
+
+      // Reduce manually: cancel the leading coefficient of g with a multiple of f, one degree at a time
+      int leng_red = leng;
+      fp2_t scale, f_e;
+      while(leng_red >= lenf)
+      {
+        fp2_copy(&scale, &f[lenf-1]);
+        fp2_inv(&scale);
+        fp2_mul(&scale, &scale, &g[leng_red-1]);
+        for(e = 0; e < lenf; e++)
+        {
+          fp2_mul(&f_e, &f[e], &scale);
+          fp2_sub(&g[e+leng_red-lenf], &g[e+leng_red-lenf], &f_e);
+        }
+        leng_red--;
+      }
+
+      // Rescale manual result so it matches the scaling of poly_redc's output
+      if(leng < lenf){
+        fp_mont_setone(scale.re);fp_set(scale.im,0);
+      }
+      else if(lenf == 2 && leng == 3)
+      {
+        // special-cased scale factor 2*f[1]^2
+        fp2_sqr(&scale, &f[1]);
+        fp2_add(&scale, &scale, &scale);
+      }
+      else{
+        fp2_copy(&scale, &c);
+      }
+      for(e = 0; e < leng_red; e++)
+        fp2_mul(&g[e], &g[e], &scale);
+
+      // Compare results
+      for(e = leng_red-1; e >= 0; e--)
+        assert(fp2_isequal(h[e], g[e]));
+      for(e = leng_red; e < lenf-1; e++)
+        assert(fp2_is_zero(&h[e]));
+
+      free(g);
+      free(h);
+      if(leng >= lenf){
+        free(f_rev_inv);
+        // Reset so that later leng < lenf passes hand poly_redc NULL, not a freed pointer
+        f_rev_inv = NULL;
+      }
+    }
+    free(f);
+    free(f_rev);
+  }
+  printf("[%3d%%] Tested polynomial reduction:\tNo errors!\n", 100 * lenf / nmax);
+
+  // TEST FOR RECIPROCAL TREES
+  for(tree_size = 3; tree_size < nmax; tree_size++)
+  {
+    printf("[%3d%%] Testing reciprocal tree:\t\tTree size %d out of %d", 100 * tree_size / nmax, tree_size, nmax);
+    fflush(stdout);
+    printf("\r\x1b[K");
+
+    // Compute size of arrays: i = smallest exponent with 2^i >= tree_size
+    i = 0;
+    while((1<<i) < tree_size){
+      i++;
+    }
+    array_size = (1<<(i+2))-1;
+
+    DEG = malloc(sizeof(int)*array_size);
+    H = malloc(sizeof(poly)*array_size);
+    R = malloc(sizeof(poly)*array_size);
+    F = malloc(sizeof(poly)*tree_size);
+    A = malloc(sizeof(fp2_t)*array_size);
+
+    // Get random polys
+    LENF = 2;
+    for(i = 0; i < tree_size; i++)
+    {
+      F[i] = malloc(sizeof(fp2_t)*LENF);
+      for(e = 0; e < LENF; e++){
+        fp2_random(&F[i][e]);
+      }
+    }
+
+    // Get product tree then reciprocal tree
+    product_tree(H, DEG, 0, F, LENF, tree_size);
+    leng = DEG[0]+1+(rand() % nmax);
+    reciprocal_tree(R, A, leng, H, DEG, 0, tree_size);
+
+    // Check the root: reversed H[root] times R[root] must be (A[root], 0, ..., 0) on the low lenf coefficients
+    root = 0;
+    lenf = leng-DEG[root];
+    f = malloc(sizeof(fp2_t)*lenf);
+    for(e = 0; e < DEG[root]+1 && e < lenf; e++){
+      fp2_copy(&f[e], &H[root][DEG[root]-e]);
+    }
+    for(e = DEG[root]+1; e < lenf; e++){
+      fp2_set(&f[e], 0);
+    }
+    poly_mul_low(f, lenf, f, lenf, R[root], lenf);
+    assert(fp2_isequal(f[0], A[root]));
+    for(e = 1; e < lenf; e++){
+      assert(fp2_is_zero(&f[e]));
+    }
+    free(f);
+
+    // Perform random walks from the root down to a leaf
+    for(iteration = 0; iteration < nmax - tree_size; iteration++)
+    {
+      root = 0;
+      n = tree_size;
+      while(n > 1)
+      {
+        if(rand() & 1)
+        {
+          root = 2*root+1;
+          n = n - (n>>1);
+        }
+        else
+        {
+          root = 2*root+2;
+          n = n>>1;
+        }
+        // sibling index: root+1 for odd root, root-1 for even root
+        brother = root - 1 + 2*(root & 1);
+
+        // Check current node
+        if(DEG[root] > 2)
+        {
+          lenf = DEG[brother];
+          f = malloc(sizeof(fp2_t)*lenf);
+          for(e = 0; e < DEG[root]+1 && e < lenf; e++){
+            fp2_copy(&f[e], &H[root][DEG[root]-e]);
+          }
+          for(e = DEG[root]+1; e < lenf; e++){
+            fp2_set(&f[e], 0);
+          }
+          poly_mul_low(f, lenf, f, lenf, R[root], lenf);
+          assert(fp2_isequal(f[0], A[root]));
+          for(e = 1; e < lenf; e++){
+            assert(fp2_is_zero(&f[e]));
+          }
+          free(f);
+        }
+      }
+    }
+    // Clean up
+    for(i = 0; i < tree_size; i++)
+      free(F[i]);
+    clear_tree(H, 0, tree_size);
+    clear_tree(R, 0, tree_size);
+    free(F);
+    free(H);
+    free(R);
+    free(A);
+    free(DEG);
+  }
+  printf("[%3d%%] Tested reciprocal tree:\t\tNo errors!\n", 100 * tree_size / nmax);
+
+  // TEST FOR REMAINDERS
+  for(tree_size = 2; tree_size < nmax; tree_size++)
+  {
+    printf("[%3d%%] Testing batched remainders:\t\tTree size %d out of %d", 100 * tree_size / nmax, tree_size, nmax);
+    fflush(stdout);
+    printf("\r\x1b[K");
+
+    // Compute size of arrays
+    i = 0;
+    while((1<<i) < tree_size)
+      i++;
+    array_size = (1<<(i+2))-1;
+
+    DEG = malloc(sizeof(int)*array_size);
+    H = malloc(sizeof(poly)*array_size);
+    R = malloc(sizeof(poly)*array_size);
+    F = malloc(sizeof(poly)*tree_size);
+    A = malloc(sizeof(fp2_t)*array_size);
+    REM1 = malloc(sizeof(fp2_t)*array_size);
+    REM2 = malloc(sizeof(fp2_t)*array_size);
+
+    // Get random polys
+    LENF = 2;
+    for(i = 0; i < tree_size; i++)
+    {
+      F[i] = malloc(sizeof(fp2_t)*LENF);
+      for(e = 0; e < LENF; e++)
+        fp2_random(&F[i][e]);
+    }
+
+    // Get product tree, reciprocal tree, and remainders
+    product_tree(H, DEG, 0, F, LENF, tree_size);
+    leng = DEG[0]+1+(rand() % nmax);
+    g1 = malloc(sizeof(fp2_t)*leng);
+    g2 = malloc(sizeof(fp2_t)*leng);
+    for(e = 0; e < leng; e++)
+    {
+      fp2_random(&g1[e]);
+      fp2_random(&g2[e]);
+    }
+    reciprocal_tree(R, A, leng, H, DEG, 0, tree_size);
+    multieval_unscaled(REM1, g1, leng, R, (const fp2_t*)A, H, DEG, 0, tree_size);
+    multieval_unscaled(REM2, g2, leng, R, (const fp2_t*)A, H, DEG, 0, tree_size);
+
+    for(i = 0; i < tree_size; i++)
+    {
+      // Get ratio of the remainders; only the ratio REM2/REM1 is compared
+      fp2_inv(&REM1[i]);
+      fp2_mul(&ratio, &REM1[i], &REM2[i]);
+
+      // Compute remainders manually
+      f_rev = malloc(sizeof(fp2_t)*LENF);
+      f_rev_inv = malloc(sizeof(fp2_t)*(leng-LENF+1));
+      h = malloc(sizeof(fp2_t)*(LENF-1));
+      for(e = 0; e < LENF; e++)
+        fp2_copy(&f_rev[e], &F[i][LENF-1-e]);
+      reciprocal(f_rev_inv, &c, f_rev, LENF, leng-LENF+1);
+      poly_redc(h, g1, leng, F[i], LENF, f_rev_inv, c);
+      fp2_copy(&REM1[i], &h[0]);
+      poly_redc(h, g2, leng, F[i], LENF, f_rev_inv, c);
+      fp2_copy(&REM2[i], &h[0]);
+      free(f_rev);
+      free(f_rev_inv);
+      free(h);
+
+      // Compare results
+      fp2_inv(&REM1[i]);
+      fp2_mul(&REM1[i], &REM1[i], &REM2[i]);
+      assert(fp2_isequal(REM1[i], ratio));
+    }
+
+    // Clean up
+    for(i = 0; i < tree_size; i++)
+      free(F[i]);
+    free(g1);
+    free(g2);
+    clear_tree(H, 0, tree_size);
+    clear_tree(R, 0, tree_size);
+    free(F);
+    free(H);
+    free(R);
+    free(A);
+    free(DEG);
+    free(REM1);
+    free(REM2);
+  }
+  printf("[%3d%%] Tested batched remainders:\tNo errors!\n", 100 * tree_size / nmax);
+
+  // TEST FOR SCALED REMAINDER TREE
+  for(tree_size = 1; tree_size < nmax; tree_size++)
+  {
+    printf("[%3d%%] Testing scaled remainder tree:\tTree size %d out of %d", 100 * tree_size / nmax, tree_size, nmax);
+    fflush(stdout);
+    printf("\r\x1b[K");
+
+    // Compute size of arrays
+    i = 0;
+    while((1<<i) < tree_size)
+      i++;
+    array_size = (1<<(i+2))-1;
+
+    DEG = malloc(sizeof(int)*array_size);
+    H = malloc(sizeof(poly)*array_size);
+    F = malloc(sizeof(poly)*tree_size);
+    REM1 = malloc(sizeof(fp2_t)*array_size);
+    REM2 = malloc(sizeof(fp2_t)*array_size);
+
+    // Get random polys
+    LENF = 2;
+    for(i = 0; i < tree_size; i++)
+    {
+      F[i] = malloc(sizeof(fp2_t)*LENF);
+      for(e = 0; e < LENF; e++)
+        fp2_random(&F[i][e]);
+    }
+
+    // Get random polys to reduce
+    product_tree(H, DEG, 0, F, LENF, tree_size);
+    leng = DEG[0]+1+(rand() % nmax);
+    g1 = malloc(sizeof(fp2_t)*leng);
+    g2 = malloc(sizeof(fp2_t)*leng);
+    for(e = 0; e < leng; e++)
+    {
+      fp2_random(&g1[e]);
+      fp2_random(&g2[e]);
+    }
+
+    // Get the required initial nodes
+    G1 = malloc(sizeof(fp2_t)*DEG[0]);
+    G2 = malloc(sizeof(fp2_t)*DEG[0]);
+    G1_rev = malloc(sizeof(fp2_t)*DEG[0]);
+    G2_rev = malloc(sizeof(fp2_t)*DEG[0]);
+    R0 = malloc(sizeof(fp2_t)*(leng));
+    f_rev = malloc(sizeof(fp2_t)*(DEG[0]+1));
+    for(e = 0; e < DEG[0]+1; e++)
+      fp2_copy(&f_rev[e], &H[0][DEG[0]-e]);
+    // The reciprocal is taken to the larger of DEG[0] and leng-DEG[0] coefficients
+    if(DEG[0] > leng-DEG[0])
+      reciprocal(R0, &A0, f_rev, DEG[0]+1, DEG[0]);
+    else
+      reciprocal(R0, &A0, f_rev, DEG[0]+1, leng-DEG[0]);
+    poly_redc(G1, g1, leng, H[0], DEG[0]+1, R0, A0);
+    poly_redc(G2, g2, leng, H[0], DEG[0]+1, R0, A0);
+    for(e = 0; e < DEG[0]; e++)
+    {
+      fp2_copy(&G1_rev[e], &G1[DEG[0]-1-e]);
+      fp2_copy(&G2_rev[e], &G2[DEG[0]-1-e]);
+    }
+    poly_mul_middle(G1_rev, G1_rev, DEG[0], R0, DEG[0]);
+    poly_mul_middle(G2_rev, G2_rev, DEG[0], R0, DEG[0]);
+    for(e = 0; e < DEG[0]; e++)
+    {
+      fp2_copy(&G1[e], &G1_rev[DEG[0]-1-e]);
+      fp2_copy(&G2[e], &G2_rev[DEG[0]-1-e]);
+    }
+    free(G1_rev);free(G2_rev);free(R0);free(f_rev);
+
+    // Compute the scaled remainder trees
+    multieval_scaled(REM1, G1, H, DEG, 0, tree_size);
+    multieval_scaled(REM2, G2, H, DEG, 0, tree_size);
+
+    for(i = 0; i < tree_size; i++)
+    {
+      // Get ratio of the remainders
+      fp2_inv(&REM1[i]);
+      fp2_mul(&ratio, &REM1[i], &REM2[i]);
+
+      // Compute remainders manually
+      f_rev = malloc(sizeof(fp2_t)*LENF);
+      f_rev_inv = malloc(sizeof(fp2_t)*(leng-LENF+1));
+      h = malloc(sizeof(fp2_t)*(LENF-1));
+      for(e = 0; e < LENF; e++)
+        fp2_copy(&f_rev[e], &F[i][LENF-1-e]);
+      reciprocal(f_rev_inv, &c, f_rev, LENF, leng-LENF+1);
+      poly_redc(h, g1, leng, F[i], LENF, f_rev_inv, c);
+      fp2_copy(&REM1[i], &h[0]);
+      poly_redc(h, g2, leng, F[i], LENF, f_rev_inv, c);
+      fp2_copy(&REM2[i], &h[0]);
+      free(f_rev);free(f_rev_inv);free(h);
+
+      // Compare results
+      fp2_inv(&REM1[i]);
+      fp2_mul(&REM1[i], &REM1[i], &REM2[i]);
+      assert(fp2_isequal(REM1[i], ratio));
+    }
+
+    // Clean up
+    for(i = 0; i < tree_size; i++)
+      free(F[i]);
+    free(F);free(g1);free(g2);free(G1);free(G2);
+    clear_tree(H, 0, tree_size);free(H);free(DEG);
+    free(REM1);free(REM2);
+  }
+  printf("[%3d%%] Tested scaled remainder tree:\tNo errors!\n", 100 * tree_size / nmax);
+
+  printf("-- All tests passed.\n");
+  return 0;
+}
--- a/src/ec/ref/ecx/test/test_extras.c
+++ b/src/ec/ref/ecx/test/test_extras.c
@@ -0,0 +1,75 @@
+#include "test_extras.h"
+#include <bench.h>
+
+// Global constants
+extern const digit_t p[NWORDS_FIELD];
+extern const digit_t R2[NWORDS_FIELD];
+
+
+#if 0
+// Compiled out by the surrounding #if 0: cycle counter built on the x86 RDTSC instruction.
+// NOTE(review): presumably superseded by a cpucycles() reachable through <bench.h> - confirm.
+int64_t cpucycles(void)
+{ // Access system counter for benchmarking
+    unsigned int hi, lo;
+
+    asm volatile ("rdtsc\n\t" : "=a" (lo), "=d"(hi));
+    return ((int64_t)lo) | (((int64_t)hi) << 32);
+}
+#endif
+
+
+int compare_words(digit_t* a, digit_t* b, unsigned int nwords)
+{ // Three-way comparison of two nwords-long integers, scanned from word nwords-1 down to word 0.
+  // Returns 1 if a > b, 0 if a == b, -1 if a < b.
+  // SECURITY NOTE: exits at the first differing word, so it is not constant-time. TESTING ONLY.
+    int idx = nwords-1;
+
+    while (idx >= 0)
+    {
+        if (a[idx] != b[idx])
+            return (a[idx] > b[idx]) ? 1 : -1;
+        idx--;
+    }
+
+    return 0;
+}
+
+
+void sub_test(digit_t* out, digit_t* a, digit_t* b, unsigned int nwords)
+{ // Word-by-word subtraction out = a - b with borrow propagation; the final borrow is discarded,
+  // so callers are expected to pass a > b. out may be the same buffer as a.
+  // SECURITY NOTE: not constant-time. TESTING ONLY.
+    digit_t borrow = 0;
+
+    for (unsigned int w = 0; w < nwords; w++)
+    {
+        digit_t ai = a[w], bi = b[w];
+        digit_t diff = ai - bi;
+        digit_t underflow = (ai < bi);        // a[w] - b[w] wrapped around
+
+        out[w] = diff - borrow;
+        borrow = underflow || (diff < borrow);  // or removing the incoming borrow wrapped around
+    }
+}
+
+
+void fprandom_test(digit_t* a)
+{ // Fills a with rand() bytes, clears the top (256-254) bits of the last word, then subtracts p
+  // until a < p, leaving a pseudo-random field element in [0, p-1].
+  // NOTE(review): the 256/254 constants presumably assume a 254-bit p in 256 bits of storage - confirm.
+  // SECURITY NOTE: distribution is not fully uniform. TO BE USED FOR TESTING ONLY.
+    const unsigned int nwords = NWORDS_FIELD;
+    const unsigned int spare_bits = 256-254;
+    unsigned char* bytes = (unsigned char*)a;
+    unsigned int k;
+
+    for (k = 0; k < sizeof(digit_t)*nwords; k++)
+        bytes[k] = (unsigned char)rand();                   // Obtain 256-bit number
+
+    a[nwords-1] &= (((digit_t)(-1) << spare_bits) >> spare_bits);
+
+    // Force it to [0, modulus-1]: keep subtracting while p <= a
+    for (;;) {
+        if (compare_words((digit_t*)p, a, nwords) >= 1)
+            break;
+        sub_test(a, a, (digit_t*)p, nwords);
+    }
+}
+
+
+void fp2random_test(fp2_t* a)
+{ // Pseudo-random element of GF(p^2): draws the real part, then the imaginary part, with fprandom_test.
+  // SECURITY NOTE: distribution is not fully uniform. TO BE USED FOR TESTING ONLY.
+    digit_t* const parts[2] = { a->re, a->im };
+
+    for (int k = 0; k < 2; k++)
+        fprandom_test(parts[k]);
+}
--- a/src/ec/ref/ecx/test/test_extras.h
+++ b/src/ec/ref/ecx/test/test_extras.h
@@ -0,0 +1,29 @@
+
+#ifndef TEST_EXTRAS_H
+#define TEST_EXTRAS_H
+
+#include <time.h>
+#include <stdlib.h>
+#include <fp.h>
+#include <fp2.h>
+#include <curve_extras.h>
+
+#define PASSED    0
+#define FAILED    1
+    
+// Access system counter for benchmarking
+//int64_t cpucycles(void);
+
+// Comparing "nwords" elements: returns (1) if a>b, (0) if a=b, (-1) if a<b
+int compare_words(digit_t* a, digit_t* b, unsigned int nwords);
+
+// Multiprecision subtraction for testing, assumes a > b
+void sub_test(digit_t* out, digit_t* a, digit_t* b, unsigned int nwords);
+
+// Generating a pseudo-random field element in [0, p-1] 
+void fprandom_test(digit_t* a);
+
+// Generating a pseudo-random element in GF(p^2)
+void fp2random_test(fp2_t* a);
+
+#endif
--- a/src/ec/ref/ecx/test/velu-test.c
+++ b/src/ec/ref/ecx/test/velu-test.c
@@ -0,0 +1,298 @@
+#include<time.h>
+#include <stdio.h>
+#include <assert.h>
+#include <inttypes.h>
+
+#include "isog.h"
+#include "sdacs.h"
+#include "ec.h"
+#include "test-basis.h"
+
+// Fills all NWORDS_FIELD words of k with rand() output (testing only). The parameter j is unused.
+void random_scalar(fp_t k, const uint8_t j)
+{
+    int w = 0;
+    while (w < NWORDS_FIELD) {
+        k[w] = rand();
+        w++;
+    }
+}
+
+// Affine Montgomery coefficient: from A = (A24 : C24) = (A' + 2C : 4C) computes *B = A'/C
+void coeff(fp2_t *B, ec_point_t const A)
+{
+	fp2_t num, inv_c24;
+
+	// numerator 4*A' = 2 * [(2 * A24) - C24]
+	fp2_add(&num, &A.x, &A.x);		// 2 * A24
+	fp2_sub(&num, &num, &A.z);		// (2 * A24) - C24
+	fp2_add(&num, &num, &num);		// 2 * [(2 * A24) - C24]
+
+	// denominator inverse 1 / C24
+	fp2_copy(&inv_c24, &A.z);
+	fp2_inv(&inv_c24);
+
+	fp2_mul(B, &num, &inv_c24);		// A'/C = 2[(2 * A24) - C24] / C24
+}
+
+// Determines if point is fp2-rational (if not, then it must be a zero trace point).
+// Computes x = X/Z and returns fp2_is_square(x^3 + a*x^2 + x).
+uint8_t isrational(ec_point_t const T, fp2_t const a)
+{
+	fp2_t x, x_sq, x_cu, a_x_sq, rhs;
+
+	// affine x-coordinate X/Z
+	fp2_copy(&x, &T.z);
+	fp2_inv(&x);
+	fp2_mul(&x, &x, &T.x);
+
+	fp2_sqr(&x_sq, &x);			// x^2
+	fp2_mul(&x_cu, &x_sq, &x);		// x^3
+	fp2_mul(&a_x_sq, &x_sq, &a);		// a * x^2
+
+	fp2_add(&rhs, &a_x_sq, &x_cu);
+	fp2_add(&rhs, &rhs, &x);
+
+	return fp2_is_square(&rhs);
+}
+
+// ladder3pt computes x(P + [m]Q).
+// Walks the low 64 bits of each of the NWORDS_FIELD words of m, least significant bit first;
+// the two running points are swapped around every xDBLADD step whose scalar bit is zero.
+void ladder3pt(ec_point_t *R, fp_t const m, ec_point_t const *P, ec_point_t const *Q, ec_point_t const *PQ, ec_point_t const *A)
+{
+	ec_point_t run_q, run_p, run_pq;
+	copy_point(&run_q, Q);
+	copy_point(&run_p, P);
+	copy_point(&run_pq, PQ);
+
+	for (int word = 0; word < NWORDS_FIELD; word++)
+	{
+		for (int bit = 0; bit < 64; bit++)
+		{
+			uint64_t probe = (uint64_t)1 << bit;
+			// all-ones (-1) when the current bit is clear, 0 when it is set
+			int mask = -((probe & m[word]) == 0);
+
+			swap_points(&run_p, &run_pq, mask);
+			xDBLADD(&run_q, &run_p, &run_q, &run_p, &run_pq, A);
+			swap_points(&run_p, &run_pq, mask);
+		}
+	}
+	copy_point(R, &run_p);
+}
+
+// A projective x-only point (X : Z) is the point at infinity exactly when Z == 0
+static inline int isinfinity(ec_point_t const P)
+{
+	int z_is_zero = fp2_is_zero(&P.z);
+	return z_is_zero;
+}
+
+/*
+ * Velu isogeny test. Runs the same check twice on the curve A' = 0, C = 1:
+ * first over the primes TORSION_ODD_PRIMES[0 .. P_LEN-1] with the basis (PA, QA, PQA),
+ * then over TORSION_ODD_PRIMES[P_LEN .. P_LEN+M_LEN-1] with the basis (PB, QB, PQB).
+ * For each prime it builds a kernel point, computes the isogeny (kps/xisog), pushes
+ * points through it (xeval) and asserts the expected rationality and order of the images.
+ * Every check is an assert(); returns 0 when all of them hold.
+ */
+int main()
+{
+	int i, j;
+
+	// Curve coefficient in the form (A' + 2C : 4C), starting from A' = 0 and C = 1
+	ec_point_t A, B, T;
+	fp2_set(&A.x, 0);
+	fp_mont_setone(A.z.re);fp_set(A.z.im,0);
+
+	fp2_add(&A.z, &A.z, &A.z);	// 2C
+	fp2_add(&A.x, &A.x, &A.z);	// A' + 2C
+	fp2_add(&A.z, &A.z, &A.z);	// 4C
+
+	// Just to ensure both projective curve coefficients are different from zero
+	assert( !fp2_is_zero(&A.x) && !fp2_is_zero(&A.z) );
+
+	fp2_t a;
+	coeff(&a, A);
+
+	ec_point_t PA, QA, PQA, PB, QB, PQB, RA, RB;
+
+	// Writing the public projective x-coordinate points into Montgomery domain
+	fp2_tomont(&(PA.x), &(xPA));
+	fp_mont_setone(PA.z.re);fp_set(PA.z.im,0);
+	fp2_tomont(&(QA.x), &(xQA));
+	fp_mont_setone(QA.z.re);fp_set(QA.z.im,0);
+	fp2_tomont(&(PQA.x), &(xPQA));
+	fp_mont_setone(PQA.z.re);fp_set(PQA.z.im,0);
+
+	assert( isrational(PA, a) );
+	assert( isrational(QA, a) );
+	assert( isrational(PQA, a) );
+
+	fp2_tomont(&(PB.x), &(xPB));
+	fp_mont_setone(PB.z.re);fp_set(PB.z.im,0);
+	fp2_tomont(&(QB.x), &(xQB));
+	fp_mont_setone(QB.z.re);fp_set(QB.z.im,0);
+	fp2_tomont(&(PQB.x), &(xPQB));
+	fp_mont_setone(PQB.z.re);fp_set(PQB.z.im,0);
+
+	assert( !isrational(PB, a) );
+	assert( !isrational(QB, a) );
+	assert( !isrational(PQB, a) );
+	// ======================================================================================================
+	// Multiply PA, QA and PQA by each prime (TORSION_ODD_POWERS[j] - 1) times, so that
+	// (presumably) only one factor of every prime remains in their order
+	for (j = 0; j < P_LEN; j++)
+	{
+		for (i = 1; i < TORSION_ODD_POWERS[j]; i++)
+		{
+			xMULv2(&PA, &PA, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
+			xMULv2(&QA, &QA, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
+			xMULv2(&PQA, &PQA, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
+
+			assert( isrational(PA, a) );
+			assert( isrational(QA, a) );
+			assert( isrational(PQA, a) );
+		};
+	};
+
+	assert( !isinfinity(PA) );
+	assert( !isinfinity(QA) );
+	assert( !isinfinity(PQA) );
+
+	// --------------------------------------------------------------
+	// Random kernel generator RA = PA + [m]QA
+	fp_t m;
+	random_scalar(m, 0);
+	ladder3pt(&RA, m, &PA, &QA, &PQA, &A);
+	for (i = 0; i < P_LEN; i++)
+	{
+		printf("// Processing the %d-th prime:\t", i + 1);
+		printf("%2d%%", 100 * i / (int)P_LEN);
+		fflush(stdout);
+		printf("\r\x1b[K");
+
+		// Kernel point for the i-th prime: multiply RA by all remaining primes
+		copy_point(&T, &RA);
+		for (j = (i+1); j < P_LEN; j++)
+			xMULv2(&T, &T, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
+
+		assert( !isinfinity(T) );
+
+		kps(i, T, A);
+		if (TORSION_ODD_PRIMES[i] > gap)
+			printf("[\033[0;31m%7" PRId64 "\033[0m] (#I: %3d, #J: %3d, #K: %3d) \n", TORSION_ODD_PRIMES[i], sI, sJ, sK);
+		else
+			printf("[\033[0;31m%7" PRId64 "\033[0m] --------------------------- \n", TORSION_ODD_PRIMES[i]);
+
+		xisog(&B, i, A);
+
+		// PB is not in the kernel: its image must stay finite and non-rational
+		xeval(&PB, i, PB, A);
+		coeff(&a, B);
+		assert( !isinfinity(PB) );
+		assert( !isrational(PB, a) );
+
+		// The image of RA becomes infinity only after the last prime
+		xeval(&RA, i, RA, A);
+		assert( (!isinfinity(RA) && (i < (P_LEN - 1))) || (isinfinity(RA) && (i == (P_LEN - 1))) );
+		assert( (isrational(RA, a) && (i < (P_LEN - 1))) || (isinfinity(RA) && (i == (P_LEN - 1))) );
+
+		copy_point(&A, &B);
+		// Verifying the order of the image point of RA has been reduced
+		copy_point(&T, &RA);
+		for (j = (i+1); j < P_LEN; j++)
+			xMULv2(&T, &T, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
+
+		assert( isinfinity(T) );
+		kps_clear(i);
+	};
+
+	// Second pass: reset the curve to A' = 0, C = 1 in the form (A' + 2C : 4C)
+	fp2_set(&A.x, 0);
+	fp_mont_setone(A.z.re);fp_set(A.z.im,0);
+
+	fp2_add(&A.z, &A.z, &A.z);	// 2C
+	fp2_add(&A.x, &A.x, &A.z);	// A' + 2C
+	fp2_add(&A.z, &A.z, &A.z);	// 4C
+
+	// Just to ensure both projective curve coefficients are different from zero
+	assert( !fp2_is_zero(&A.x) && !fp2_is_zero(&A.z) );
+
+	coeff(&a, A);
+	// Writing the public projective x-coordinate points into Montgomery domain
+	fp2_tomont(&(PA.x), &(xPA));
+	fp_mont_setone(PA.z.re);fp_set(PA.z.im,0);
+	fp2_tomont(&(QA.x), &(xQA));
+	fp_mont_setone(QA.z.re);fp_set(QA.z.im,0);
+	fp2_tomont(&(PQA.x), &(xPQA));
+	fp_mont_setone(PQA.z.re);fp_set(PQA.z.im,0);
+
+	assert( isrational(PA, a) );
+	assert( isrational(QA, a) );
+	assert( isrational(PQA, a) );
+
+	fp2_tomont(&(PB.x), &(xPB));
+	fp_mont_setone(PB.z.re);fp_set(PB.z.im,0);
+	fp2_tomont(&(QB.x), &(xQB));
+	fp_mont_setone(QB.z.re);fp_set(QB.z.im,0);
+	fp2_tomont(&(PQB.x), &(xPQB));
+	fp_mont_setone(PQB.z.re);fp_set(PQB.z.im,0);
+
+	assert( !isrational(PB, a) );
+	assert( !isrational(QB, a) );
+	assert( !isrational(PQB, a) );
+
+	// ======================================================================================================
+	// Multiply PB, QB and PQB by each prime (TORSION_ODD_POWERS[j] - 1) times, so that
+	// (presumably) only one factor of every prime remains in their order
+	for (j = P_LEN; j < (P_LEN+M_LEN); j++)
+	{
+		for (i = 1; i < TORSION_ODD_POWERS[j]; i++)
+		{
+			xMULv2(&PB, &PB, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
+			xMULv2(&QB, &QB, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
+			xMULv2(&PQB, &PQB, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
+
+			assert( !isrational(PB, a) );
+			assert( !isrational(QB, a) );
+			assert( !isrational(PQB, a) );
+		};
+	};
+
+	assert( !isinfinity(PB) );
+	assert( !isinfinity(QB) );
+	assert( !isinfinity(PQB) );
+
+	// Random kernel generator RB = PB + [m]QB
+	random_scalar(m, 1);
+	ladder3pt(&RB, m, &PB, &QB, &PQB, &A);
+	for (i = P_LEN; i < (P_LEN+M_LEN); i++)
+	{
+		printf("// Processing the %d-th prime:\t", i + 1);
+		printf("%2d%%", 100 * i / (int)(P_LEN+M_LEN));
+		fflush(stdout);
+		printf("\r\x1b[K");
+
+		// Kernel point for the i-th prime: multiply RB by all remaining primes
+		copy_point(&T, &RB);
+		for (j = (i+1); j < (P_LEN+M_LEN); j++)
+			xMULv2(&T, &T, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
+
+		assert( !isinfinity(T) );
+
+		kps(i, T, A);
+		if (TORSION_ODD_PRIMES[i] > gap)
+			printf("[\033[0;31m%7" PRId64 "\033[0m] (#I: %3d, #J: %3d, #K: %3d) \n", TORSION_ODD_PRIMES[i], sI, sJ, sK);
+		else
+			printf("[\033[0;31m%7" PRId64 "\033[0m] --------------------------- \n", TORSION_ODD_PRIMES[i]);
+
+		xisog(&B, i, A);
+
+		// PA is not in the kernel: its image must stay finite and rational
+		xeval(&PA, i, PA, A);
+		coeff(&a, B);
+		assert( !isinfinity(PA) );
+		assert( isrational(PA, a) );
+
+		// The image of RB becomes infinity only after the last prime
+		xeval(&RB, i, RB, A);
+		assert( (!isinfinity(RB) && (i < (P_LEN + M_LEN - 1))) || (isinfinity(RB) && (i == (P_LEN + M_LEN - 1))) );
+		assert( (!isrational(RB, a) && (i < (P_LEN + M_LEN - 1))) || (isinfinity(RB) && (i == (P_LEN + M_LEN - 1))) );
+
+		copy_point(&A, &B);
+		// Verifying the order of the image point of RB has been reduced
+		copy_point(&T, &RB);
+		for (j = (i+1); j < (P_LEN+M_LEN); j++)
+			xMULv2(&T, &T, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
+
+		assert( isinfinity(T) );
+		kps_clear(i);
+	};
+
+	printf("-- All tests passed!\n");
+	return 0;
+}
--- a/src/ec/ref/ecx/xeval.c
+++ b/src/ec/ref/ecx/xeval.c
@@ -0,0 +1,299 @@
+#include "isog.h"
+#include "ec.h"
+#include <assert.h>
+
+// -----------------------------------------------------------------------------------------
+// -----------------------------------------------------------------------------------------
+// Traditional isogeny evaluation (xEVAL)
+
+// CrissCross procedure as described in the Hisil and Costello paper:
+//   *r0 = alpha*delta + beta*gamma,   *r1 = alpha*delta - beta*gamma
+void CrissCross(fp2_t *r0, fp2_t *r1, fp2_t const alpha, fp2_t const beta, fp2_t const gamma, fp2_t const delta)
+{
+	fp2_t ad, bg;
+
+	fp2_mul(&ad, &alpha, &delta);
+	fp2_mul(&bg, &beta, &gamma);
+
+	fp2_add(r0, &ad, &bg);
+	fp2_sub(r1, &ad, &bg);
+}
+
+// Degree-2 isogeny evaluation with kernel generated by P != (0, 0).
+// Maps each of the lenQ points Q[k] to R[k] using the constants left in K[0].
+void xeval_2(ec_point_t* R, ec_point_t* const Q, const int lenQ)
+{
+	fp2_t sum, dif, kx_dif, kz_sum, plus, minus;
+	int k = 0;
+
+	while (k < lenQ) {
+		fp2_add(&sum, &Q[k].x, &Q[k].z);		// X + Z
+		fp2_sub(&dif, &Q[k].x, &Q[k].z);		// X - Z
+		fp2_mul(&kx_dif, &K[0].x, &dif);
+		fp2_mul(&kz_sum, &K[0].z, &sum);
+		fp2_add(&plus, &kx_dif, &kz_sum);
+		fp2_sub(&minus, &kx_dif, &kz_sum);
+		fp2_mul(&R[k].x, &Q[k].x, &plus);
+		fp2_mul(&R[k].z, &Q[k].z, &minus);
+		k++;
+	}
+}
+
+// Degree-4 isogeny evaluation with kernel generated by P such that [2]P != (0, 0).
+// Maps each of the lenQ points Q[k] to R[k] using the constants K[0].x, K[1].x and K[2].x.
+void xeval_4(ec_point_t* R, const ec_point_t* Q, const int lenQ)
+{
+	fp2_t sum, dif, u, v, w, p, q;
+
+	for (int k = 0; k < lenQ; k++) {
+		fp2_add(&sum, &Q[k].x, &Q[k].z);	// X + Z
+		fp2_sub(&dif, &Q[k].x, &Q[k].z);	// X - Z
+
+		fp2_mul(&u, &sum, &K[1].x);
+		fp2_mul(&v, &dif, &K[2].x);
+
+		fp2_mul(&w, &sum, &dif);
+		fp2_mul(&w, &w, &K[0].x);		// (X + Z)(X - Z) * K[0].x
+
+		fp2_add(&p, &u, &v);
+		fp2_sqr(&p, &p);			// (u + v)^2
+		fp2_sub(&q, &u, &v);
+		fp2_sqr(&q, &q);			// (u - v)^2
+
+		fp2_add(&u, &w, &p);			// w + (u + v)^2
+		fp2_sub(&v, &w, &q);			// w - (u - v)^2
+
+		// Q[k] is no longer read from here on, so R may be the same array as Q
+		fp2_mul(&(R[k].x), &u, &p);
+		fp2_mul(&(R[k].z), &q, &v);
+	}
+}
+
+// Degree-4 isogeny evaluation with kernel generated by P such that [2]P = (0, 0)
+// Must call after xisog_4_singular (it reads the constants stored in K[0] and K[1]).
+void xeval_4_singular(ec_point_t* R, const ec_point_t* Q, const int lenQ, const ec_point_t P)
+{
+	fp2_t sq_sum, sq_dif, first;
+	const fp2_t *last;
+
+	for (int k = 0; k < lenQ; k++) {
+		fp2_add(&sq_sum, &Q[k].x, &Q[k].z);
+		fp2_sub(&sq_dif, &Q[k].x, &Q[k].z);
+		fp2_sqr(&sq_sum, &sq_sum);			// (X + Z)^2
+		fp2_sqr(&sq_dif, &sq_dif);			// (X - Z)^2
+		fp2_sub(&R[k].z, &sq_sum, &sq_dif);
+
+		// The two squares swap roles depending on the kernel point
+		if (fp2_is_equal(&P.x, &P.z)) {
+			// Branch for P = (+1,_)
+			fp2_copy(&first, &sq_dif);
+			last = &sq_sum;
+		}
+		else {
+			// Branch for P = (-1,_)
+			fp2_copy(&first, &sq_sum);
+			last = &sq_dif;
+		}
+
+		fp2_mul(&R[k].x, &R[k].z, &K[0].x);
+		fp2_mul(&R[k].z, &R[k].z, &K[1].x);
+		fp2_mul(&R[k].z, &R[k].z, &first);
+		fp2_mul(&first, &first, &K[0].z);
+		fp2_add(&R[k].x, &R[k].x, &first);
+		fp2_mul(&R[k].x, &R[k].x, last);
+	}
+}
+
+// Isogeny evaluation on Montgomery curves (traditional formulae) for the i-th odd prime l = 2d + 1.
+// Recall: K has been computed in Twisted Edwards model, so no extra additions are required.
+// Writes the image of P into *Q.
+void xeval_t(ec_point_t* Q, uint64_t const i, ec_point_t const P)
+{
+	const int d = ((int)TORSION_ODD_PRIMES[i] - 1) / 2;	// Here, l = 2d + 1
+	fp2_t acc0, acc1, sum, dif, step0, step1;
+	int k = 1;
+
+	fp2_add(&sum, &P.x, &P.z);	// X + Z
+	fp2_sub(&dif, &P.x, &P.z);	// X - Z
+
+	// Accumulate the product of the CrissCross outputs over K[0 .. d-1]
+	CrissCross(&acc0, &acc1, K[0].z, K[0].x, sum, dif);
+	while (k < d)
+	{
+		CrissCross(&step0, &step1, K[k].z, K[k].x, sum, dif);
+		fp2_mul(&acc0, &step0, &acc0);
+		fp2_mul(&acc1, &step1, &acc1);
+		k++;
+	}
+
+	fp2_sqr(&acc0, &acc0);
+	fp2_sqr(&acc1, &acc1);
+
+	fp2_mul(&(Q->x), &P.x, &acc0);
+	fp2_mul(&(Q->z), &P.z, &acc1);
+}
+
+// -----------------------------------------------------------------------------------------
+// -----------------------------------------------------------------------------------------
+// Isogeny evaluation (xEVAL) used in velu SQRT
+
+// Evaluates the polynomial currently stored at the root of ptree_EJ (2*sJ + 1 coefficients)
+// with the precomputed hI trees, writing sI values into `leaves`. Uses the unscaled
+// remainder tree when `scaled` is zero and the scaled one otherwise.
+static void xeval_s_remainders(void)
+{
+	if (!scaled)
+	{
+		// unscaled remainder tree approach
+		multieval_unscaled(leaves, ptree_EJ[0], 2*sJ + 1, rtree_hI, (const fp2_t*)rtree_A, ptree_hI, deg_ptree_hI, 0, sI);
+	}
+	else
+	{
+		// scaled remainder tree approach
+		fp2_t G[sI_max], G_rev[sI_max];
+		int j;
+
+		poly_redc(G, ptree_EJ[0], 2*sJ + 1, ptree_hI[0], sI + 1, R0, A0);
+		for (j = 0; j < sI; j++)
+			fp2_copy(&G_rev[j], &G[sI - 1 - j]);
+
+		poly_mul_middle(G_rev, G_rev, sI, R0, sI);
+		for (j = 0; j < sI; j++)
+			fp2_copy(&G[j], &G_rev[sI - 1 - j]);
+
+		multieval_scaled(leaves, G, ptree_hI, deg_ptree_hI, 0, sI);
+	}
+}
+
+// Isogeny evaluation (xEVAL) used in velu SQRT for the i-th odd prime.
+// Reads the point sets J and K and the hI trees that must already be in place for index i,
+// takes the curve as A = (A' + 2C : 4C), and writes the image of P into *Q.
+// Asserts that TORSION_ODD_PRIMES[i] > gap and that the set sizes satisfy sI >= sJ > 1, sK >= 0.
+void xeval_s(ec_point_t* Q, uint64_t const i, ec_point_t const P, ec_point_t const A)
+{
+	// =================================================================================
+	assert(TORSION_ODD_PRIMES[i] > gap);     // Ensuring velusqrt is used for l_i > gap
+	sI = sizeI[i];          // size of I
+	sJ = sizeJ[i];          // size of J
+	sK = sizeK[i];          // size of K
+
+	assert(sI >= sJ);       // Ensuring #I >= #J
+	assert(sK >= 0);        // Recall, it must be that #K >= 0
+	assert(sJ > 1);         // ensuring sI >= sJ > 1
+	// =================================================================================
+
+	// We require the curve coefficient A = A'/C ... well, a multiple of these ones
+	fp2_t Ap;
+	fp2_add(&Ap, &A.x, &A.x); // 2A' + 4C
+	fp2_sub(&Ap, &Ap, &A.z);   // 2A'
+	fp2_add(&Ap, &Ap, &Ap);     // 4A'
+
+	//  --------------------------------------------------------------------------------------------------
+	//                   ~~~~~~~~
+	//                    |    | 
+	// Computing E_J(W) = |    | [ F0(W, x([j]P)) * alpha^2 + F1(W, x([j]P)) * alpha + F2(W, x([j]P)) ]
+	//                    j in J 
+	// In order to avoid costly inverse computations in fp, we work with projective coordinates
+	// In particular, for a degree-l isogeny construction, we need alpha = X/Z and alpha = Z/X (i.e., 1/alpha)
+
+	fp2_t XZ_add, XZj_add,
+	   XZ_sub, XZj_sub,
+	   AXZ2,
+	   CXZ2,
+	   CX2Z2,
+	   t1, t2;
+
+	fp2_add(&XZ_add, &P.x, &P.z);	// X + Z
+	fp2_sub(&XZ_sub, &P.x, &P.z);	// X - Z
+
+	fp2_mul(&AXZ2, &P.x, &P.z);	// X * Z
+	fp2_sqr(&t1, &P.x);		// X ^ 2
+	fp2_sqr(&t2, &P.z);		// Z ^ 2
+
+	fp2_add(&CX2Z2, &t1, &t2);		//      X^2 + Z^2
+	fp2_mul(&CX2Z2, &CX2Z2, &A.z);	// C * (X^2 + Z^2)
+
+	fp2_add(&AXZ2, &AXZ2, &AXZ2);	//       2 * (X * Z)
+	fp2_mul(&CXZ2, &AXZ2, &A.z);	// C  * [2 * (X * Z)]
+	fp2_mul(&AXZ2, &AXZ2, &Ap);		// A' * [2 * (X * Z)]
+
+	int j;
+	for (j = 0; j < sJ; j++)
+	{
+		fp2_add(&XZj_add, &J[j].x, &J[j].z);		// Xj + Zj
+		fp2_sub(&XZj_sub, &J[j].x, &J[j].z);		// Xj - Zj
+
+		fp2_mul(&t1, &XZ_sub, &XZj_add);			// (X - Z) * (Xj + Zj)
+		fp2_mul(&t2, &XZ_add, &XZj_sub);			// (X + Z) * (Xj - Zj)
+
+		// ...................................
+		// Computing the quadratic coefficient
+		fp2_sub(&EJ_0[j][2], &t1, &t2);			//       2 * [(X*Zj) - (Z*Xj)]
+		fp2_sqr(&EJ_0[j][2], &EJ_0[j][2]);			//     ( 2 * [(X*Zj) - (Z*Xj)] )^2
+		fp2_mul(&EJ_0[j][2], &A.z, &EJ_0[j][2]);		// C * ( 2 * [(X*Zj) - (Z*Xj)] )^2
+
+		// ..................................
+		// Computing the constant coefficient
+		fp2_add(&EJ_0[j][0], &t1, &t2);			//       2 * [(X*Xj) - (Z*Zj)]
+		fp2_sqr(&EJ_0[j][0], &EJ_0[j][0]);			//     ( 2 * [(X*Xj) - (Z*Zj)] )^2
+		fp2_mul(&EJ_0[j][0], &A.z, &EJ_0[j][0]);		// C * ( 2 * [(X*Xj) - (Z*Zj)] )^2
+
+		// ................................
+		// Computing the linear coefficient
+
+		// C * [ (-2*Xj*Zj)*(alpha^2 + 1) + (-2*alpha)*(Xj^2 + Zj^2)] + [A' * (-2*Xj*Zj) * (2*X*Z)] where alpha = X/Z
+		// NOTE(review): the comments below treat XZJ4[j] as holding -4*Xj*Zj - confirm where XZJ4 is filled
+		fp2_add(&t1, &J[j].x, &J[j].z);			//      (Xj + Zj)
+		fp2_sqr(&t1, &t1);					//      (Xj + Zj)^2
+		fp2_add(&t1, &t1, &t1);				//  2 * (Xj + Zj)^2
+		fp2_add(&t1, &t1, &XZJ4[j]);			//  2 * (Xj + Zj)^2 - (4*Xj*Zj) := 2 * (Xj^2 + Zj^2)
+		fp2_mul(&t1, &t1, &CXZ2);				// [2 * (Xj^2 + Zj^2)] * (2 * [ C * (X * Z)])
+
+		fp2_mul(&t2, &CX2Z2, &XZJ4[j]);			// [C * (X^2 + Z^2)] * (-4 * Xj * Zj)
+		fp2_sub(&t1, &t2, &t1);				// [C * (X^2 + Z^2)] * (-4 * Xj * Zj) - [2 * (Xj^2 + Zj^2)] * (2 * [ C * (X * Z)])
+
+		fp2_mul(&t2, &AXZ2, &XZJ4[j]);			// (2 * [A' * (X * Z)]) * (-4 * Xj * Zj)
+		fp2_add(&EJ_0[j][1], &t1, &t2);			// This is our desired equation but multiplied by 2
+		fp2_add(&EJ_0[j][1], &EJ_0[j][1], &EJ_0[j][1]);	// This is our desired equation but multiplied by 4
+	};
+
+	// ---------------------------------------------------------------------
+	// The faster way for multiplying is using a divide-and-conquer approach
+
+	// product tree of EJ_0 (we only require the root)
+	product_tree_LENFeq3(ptree_EJ, deg_ptree_EJ, 0, EJ_0, sJ);
+	assert( deg_ptree_EJ[0] == (2*sJ) );
+
+	// First pass (alpha = X/Z): multiply together the leaves of the remainder tree
+	xeval_s_remainders();
+	fp2_t r0;
+	product(&r0, (const fp2_t*)leaves, sI);
+
+	// EJ_1 is just reverting the ordering in the coefficients of EJ_0
+	for (j = 0; j < sJ; j++){
+		fp2_copy(&t1, &ptree_EJ[0][j]);
+		fp2_copy(&ptree_EJ[0][j], &ptree_EJ[0][2*sJ - j]);
+		fp2_copy(&ptree_EJ[0][2*sJ - j], &t1);
+	}
+
+	// Second pass (1/alpha = Z/X) on the reversed polynomial
+	xeval_s_remainders();
+	clear_tree(ptree_EJ, 0, sJ);
+	fp2_t r1;
+	product(&r1, (const fp2_t*)leaves, sI);
+
+	// -------------------------------
+	// Sometimes the public value sK is equal to zero,
+	// thus to avoid a runtime error the arrays get one extra slot
+	fp2_t hK_0[sK_max + 1], hK_1[sK_max + 1], hk_0, hk_1;
+	for (j = 0; j < sK; j++)
+	{
+		fp2_add(&XZj_add, &K[j].x, &K[j].z);	// Xk + Zk
+		fp2_sub(&XZj_sub, &K[j].x, &K[j].z);	// Xk - Zk
+		fp2_mul(&t1, &XZ_sub, &XZj_add);		// (X - Z) * (Xk + Zk)
+		fp2_mul(&t2, &XZ_add, &XZj_sub);		// (X + Z) * (Xk - Zk)
+
+		// Case alpha = X/Z
+		fp2_sub(&hK_0[j], &t1, &t2);		// 2 * [(X*Zk) - (Z*Xk)]
+
+		// Case 1/alpha = Z/X
+		fp2_add(&hK_1[j], &t1, &t2);		// 2 * [(X*Xk) - (Z*Zk)]
+	};
+
+	// hk_0 <- use product to multiply all the elements in hK_0
+	product(&hk_0, (const fp2_t*)hK_0, sK);
+	// hk_1 <- use product to multiply all the elements in hK_1
+	product(&hk_1, (const fp2_t*)hK_1, sK);
+
+	// ---------------------------------------------------------------------------------
+	// Now, unifying all the computations
+	fp2_mul(&t1, &hk_1, &r1);				// output of algorithm 2 with 1/alpha = Z/X and without the denominator
+	fp2_sqr(&t1, &t1);
+	fp2_mul(&(Q->x), &t1, &P.x);
+
+	fp2_mul(&t2, &hk_0, &r0);				// output of algorithm 2 with alpha = X/Z and without the denominator
+	fp2_sqr(&t2, &t2);
+	fp2_mul(&(Q->z), &t2, &P.z);
+}
--- a/src/ec/ref/ecx/xisog.c
+++ b/src/ec/ref/ecx/xisog.c
@@ -0,0 +1,295 @@
+#include "isog.h"
+#include "ec.h"
+#include <assert.h>
+
+// -------------------------------------------------------------------------
+// -------------------------------------------------------------------------
+
+// Degree-2 isogeny whose kernel is generated by P, with P != (0, 0).
+// Writes the image curve coefficient to B in the form A24 = (A + 2C : 4C),
+// namely B = (Z^2 - X^2 : Z^2) for P = (X : Z).
+// As a side effect the global K[0] is set to (X + Z : X - Z).
+// NOTE(review): K[0] is presumably consumed by the matching 2-isogeny evaluation — confirm.
+void xisog_2(ec_point_t* B, ec_point_t const P)
+{
+	fp2_t* const bx = &B->x;
+	fp2_t* const bz = &B->z;
+
+	fp2_sqr(bx, &P.x);			// X^2
+	fp2_sqr(bz, &P.z);			// Z^2
+	fp2_sub(bx, bz, bx);			// Z^2 - X^2
+
+	// Sum and difference of the kernel point coordinates, kept in K[0]
+	fp2_add(&K[0].x, &P.x, &P.z);		// X + Z
+	fp2_sub(&K[0].z, &P.x, &P.z);		// X - Z
+}
+
+// Degree-4 isogeny whose kernel is generated by P, with [2]P != (0, 0).
+// Writes the image curve coefficient to B in the form A24 = (A + 2C : 4C),
+// namely B = (Z^4 - X^4 : Z^4) for P = (X : Z).
+// The global K[0], K[1] and K[2] are used as scratch space and, on return,
+// hold the constants for xeval_4: K[0].x = 4*Z^2, K[1].x = X - Z, K[2].x = X + Z.
+void xisog_4(ec_point_t* B, ec_point_t const P)
+{
+	fp2_t* const k0x = &K[0].x;	// X^2 first, 4*Z^2 at the end
+	fp2_t* const k0z = &K[0].z;	// Z^2
+	fp2_t* const k1x = &K[1].x;	// Z^2 + X^2 first, X - Z at the end
+	fp2_t* const k1z = &K[1].z;	// Z^2 - X^2
+
+	fp2_sqr(k0x, &P.x);
+	fp2_sqr(k0z, &P.z);
+	fp2_add(k1x, k0z, k0x);
+	fp2_sub(k1z, k0z, k0x);
+	fp2_mul(&B->x, k1x, k1z);		// (Z^2 + X^2) * (Z^2 - X^2)
+	fp2_sqr(&B->z, k0z);			// Z^4
+
+	// Constants for xeval_4
+	fp2_add(&K[2].x, &P.x, &P.z);		// X + Z
+	fp2_sub(k1x, &P.x, &P.z);		// X - Z
+	fp2_add(k0x, k0z, k0z);			// 2 * Z^2
+	fp2_add(k0x, k0x, k0x);			// 4 * Z^2
+}
+
+// Degree-4 isogeny with kernel generated by P such that [2]P = (0 ,0)
+//
+// A24 is the domain curve coefficient and B24 receives the image curve coefficient.
+// The branch is selected by comparing the two coordinates of P:
+//   - P.x == P.z (P = (1,_)):   B24 = (A24.z : A24.z - A24.x),
+//                               K[0] = (A24.x : A24.z), K[1].x = A24.x - A24.z
+//   - otherwise  (P = (-1,_)):  B24 = (A24.z : A24.x),
+//                               K[0] = (A24.x - A24.z : A24.z), K[1].x = A24.x
+// NOTE(review): K[0] and K[1].x are presumably the constants read by the matching
+// evaluation routine — confirm.
+void xisog_4_singular(ec_point_t* B24, ec_point_t const P, ec_point_t A24)
+{
+	fp2_copy(&K[0].z, &A24.z);
+	if(fp2_is_equal(&P.x, &P.z)){
+		// Case for P=(1,_)
+		fp2_copy(&K[0].x, &A24.x);
+		fp2_sub(&K[1].x, &A24.x, &A24.z);
+		fp2_neg(&B24->z, &K[1].x);
+	}
+	else{
+		// Case for P=(-1,_)
+		fp2_copy(&K[1].x, &A24.x);
+		fp2_sub(&K[0].x, &A24.x, &A24.z);
+		// B24->z is simply A24.x here; no negation is needed in this branch
+		fp2_copy(&B24->z, &K[1].x);
+	}
+	fp2_copy(&B24->x, &K[0].z);
+}
+
+// xISOG procedure, a hybrid between the Montgomery and Twisted Edwards models.
+// These traditional formulae correspond to the Twisted Edwards formulae, with the
+// output mapped back into Montgomery form.
+//
+// i selects the isogeny degree l = TORSION_ODD_PRIMES[i] = 2d + 1, A is the domain
+// curve coefficient and B receives the image curve coefficient.
+// Reads K[0..d-1] from the global kernel table.
+// NOTE(review): K[] is presumably filled by kps_t for the same index i — confirm.
+void xisog_t(ec_point_t* B, uint64_t const i, ec_point_t const A)
+{
+	int const ell   = (int)TORSION_ODD_PRIMES[i];		// l = 2d + 1
+	int const nbits = (int)p_plus_minus_bitlength[i];
+	int const half  = (ell - 1) / 2;			// d
+	int idx, bit, rep;
+
+	fp2_t prod_x, prod_z, edw_d, pow_a, pow_d;
+
+	// Products of the x- and z-coordinates of K[0], ..., K[d-1]
+	fp2_copy(&prod_x, &K[0].x);
+	fp2_copy(&prod_z, &K[0].z);
+	for (idx = 1; idx < half; idx++)
+	{
+		fp2_mul(&prod_x, &prod_x, &K[idx].x);
+		fp2_mul(&prod_z, &prod_z, &K[idx].z);
+	}
+
+	// Both products are raised to the 8-th power (three squarings each)
+	for (rep = 0; rep < 3; rep++)
+	{
+		fp2_sqr(&prod_x, &prod_x);
+		fp2_sqr(&prod_z, &prod_z);
+	}
+
+	// Mapping Montgomery curve coefficients into Twisted Edwards form:
+	// a = A.x and d = A.x - A.z
+	fp2_sub(&edw_d, &A.x, &A.z);
+	fp2_copy(&pow_a, &A.x);
+	fp2_copy(&pow_d, &edw_d);
+
+	// Left-to-right square-and-multiply for a^l and d^l; the top bit of l is
+	// already accounted for by the initial copies above
+	for (bit = nbits - 2; bit >= 0; bit--)
+	{
+		fp2_sqr(&pow_a, &pow_a);
+		fp2_sqr(&pow_d, &pow_d);
+		if (((ell >> bit) & 1) != 0)
+		{
+			fp2_mul(&pow_a, &pow_a, &A.x);
+			fp2_mul(&pow_d, &pow_d, &edw_d);
+		}
+	}
+
+	// Mapping Twisted Edwards curve coefficients into Montgomery form
+	fp2_mul(&(B->x), &pow_a, &prod_z);
+	fp2_mul(&(B->z), &pow_d, &prod_x);
+	fp2_sub(&(B->z), &(B->x), &(B->z));
+}
+
+// -------------------------------------------------------------------------
+// -------------------------------------------------------------------------
+//  Isogeny construction (xISOG) used in velu SQRT
+
+// Helper for xisog_s: one resultant-style pass over the quadratic factors in EJ.
+//
+// Builds the self-reciprocal product tree of the sJ quadratic factors EJ[0..sJ-1]
+// (only the root is required), reduces it against the product tree of h_I(X) using
+// either the unscaled or the scaled remainder tree (selected by the global `scaled`),
+// and stores the product of the sI resulting leaves in *r.
+// Works on the global buffers ptree_EJ, deg_ptree_EJ and leaves; ptree_EJ is cleared
+// before returning.
+static void xisog_s_resultant(fp2_t* r, fp2_t (*EJ)[3])
+{
+	int j;
+
+	// selfreciprocal product tree of EJ (we only require the root)
+	product_tree_selfreciprocal_LENFeq3(ptree_EJ, deg_ptree_EJ, 0, EJ, sJ);
+	assert( deg_ptree_EJ[0] == (2*sJ) );
+	if (!scaled)
+	{
+		// (unscaled) remainder tree approach
+		multieval_unscaled(leaves, ptree_EJ[0], 2*sJ + 1, rtree_hI, (const fp2_t*)rtree_A, ptree_hI, deg_ptree_hI, 0, sI);
+	}
+	else
+	{
+		// scaled remainder tree approach
+		fp2_t G[sI_max], G_rev[sI_max];
+		poly_redc(G, ptree_EJ[0], 2*sJ + 1, ptree_hI[0], sI + 1, R0, A0);
+		for (j = 0; j < sI; j++)
+			fp2_copy(&G_rev[j], &G[sI - 1 - j]);
+
+		poly_mul_middle(G_rev, G_rev, sI, R0, sI);
+		for (j = 0; j < sI; j++)
+			fp2_copy(&G[j], &G_rev[sI - 1 - j]);
+
+		multieval_scaled(leaves, G, ptree_hI, deg_ptree_hI, 0, sI);
+	}
+	clear_tree(ptree_EJ, 0, sJ);
+
+	// Finally, we must multiply the leaves of the output of remainders
+	product(r, (const fp2_t*)leaves, sI);
+}
+
+// Isogeny construction (xISOG) used in velu SQRT.
+//
+// i selects the isogeny degree l = TORSION_ODD_PRIMES[i] (must exceed `gap`).
+// A is the domain curve coefficient in the form (A' + 2C : 4C).
+// B receives the image curve coefficient in the form (B' + 2C : 4C).
+//
+// Sets the globals sI, sJ, sK from sizeI/sizeJ/sizeK and reads the global tables
+// J, K, XZJ4 and the trees of h_I(X).
+// NOTE(review): those tables are presumably prepared by kps_s for the same index i — confirm.
+void xisog_s(ec_point_t* B, uint64_t const i, ec_point_t const A)
+{
+	// =================================================================================
+	assert(TORSION_ODD_PRIMES[i] > gap);     // Ensuring velusqrt is used for l_i > gap
+	sI = sizeI[i];          // size of I
+	sJ = sizeJ[i];          // size of J
+	sK = sizeK[i];          // size of K
+
+	assert(sI >= sJ);       // Ensuring #I >= #J
+	assert(sK >= 0);        // #K is allowed to be zero (see the hK_0/hK_1 arrays below)
+	assert(sJ > 1);         // ensuring sI >= sJ > 1
+	// =================================================================================
+
+	// We require the curve coefficient A = A'/C ... well, a multiple of these ones
+	fp2_t Ap;
+	fp2_add(&Ap, &A.x, &A.x);	// 2A' + 4C
+	fp2_sub(&Ap, &Ap, &A.z);	// 2A'
+	fp2_add(&Ap, &Ap, &Ap);	// 4A'
+
+	fp2_t ADD_SQUARED[sJ_max],	// (Xj + Zj)^2
+	   SUB_SQUARED[sJ_max];	// (Xj - Zj)^2
+
+	int j;
+	// Next loop precomputes some variables to be used in the remainder of xisog
+	for (j = 0; j < sJ; j++)
+	{
+		fp2_sub(&SUB_SQUARED[j], &J[j].x, &J[j].z);		// (Xj - Zj)
+		fp2_sqr(&SUB_SQUARED[j], &SUB_SQUARED[j]);		// (Xj - Zj)^2
+		fp2_sub(&ADD_SQUARED[j], &SUB_SQUARED[j], &XZJ4[j]);	// (Xj + Zj)^2, since XZJ4[j] = -4*Xj*Zj
+	}
+
+	//  --------------------------------------------------------------------------------------------------
+	//                   ~~~~~~~~
+	//                    |    | 
+	// Computing E_J(W) = |    | [ F0(W, x([j]P)) * alpha^2 + F1(W, x([j]P)) * alpha + F2(W, x([j]P)) ]
+	//                    j in J 
+	// In order to avoid costly inverse computations in fp, we are going to work with projective coordinates
+	// In particular, for a degree-l isogeny construction, we need alpha = 1 and alpha = -1
+
+	// The quadratic factors are stored in the global arrays EJ_0 and EJ_1 (three
+	// coefficients per factor), each one feeding one resultant below.
+
+	// Next loop computes all the quadratic factors of EJ_0 and EJ_1
+	fp2_t t1;
+	for (j = 0; j < sJ; j++)
+	{
+		// Each SUB_SQUARED[j] and ADD_SQUARED[j] should be multiplied by C
+		fp2_mul(&EJ_1[j][0], &ADD_SQUARED[j], &A.z);
+		fp2_mul(&EJ_0[j][0], &SUB_SQUARED[j], &A.z);
+		// We require the double of tadd and tsub
+		fp2_add(&EJ_0[j][1], &EJ_1[j][0], &EJ_1[j][0]);
+		fp2_add(&EJ_1[j][1], &EJ_0[j][0], &EJ_0[j][0]);
+
+		fp2_mul(&t1, &XZJ4[j], &Ap);			// A' *(-4*Xj*Zj)
+
+		// Case alpha = 1
+		fp2_sub(&EJ_0[j][1], &t1, &EJ_0[j][1]);
+		fp2_copy(&EJ_0[j][2], &EJ_0[j][0]);		// E_{0,j} is a palindrome
+
+		// Case alpha = -1
+		fp2_sub(&EJ_1[j][1], &EJ_1[j][1], &t1);
+		fp2_copy(&EJ_1[j][2], &EJ_1[j][0]);		// E_{1,j} is a palindrome
+	}
+
+	// ---------------------------------------------------------------------
+	// The faster way for multiplying is using a divide-and-conquer approach.
+	// The same procedure is applied to EJ_0 and then to EJ_1; the order matters
+	// because both passes share the global buffers ptree_EJ and leaves.
+	fp2_t r0, r1;
+	xisog_s_resultant(&r0, EJ_0);
+	xisog_s_resultant(&r1, EJ_1);
+
+	// -------------------------------
+	// sK can be equal to zero, so the arrays get one extra slot
+	// in order to avoid a runtime error when sK = 0
+	fp2_t hK_0[sK_max + 1], hK_1[sK_max + 1], hk_0, hk_1;
+	for (j = 0; j < sK; j++)
+	{
+		fp2_sub(&hK_0[j], &K[j].z, &K[j].x);
+		fp2_add(&hK_1[j], &K[j].z, &K[j].x);
+	}
+
+	// hk_0 <- use product to multiply all the elements in hK_0
+	product(&hk_0, (const fp2_t*)hK_0, sK);
+	// hk_1 <- use product to multiply all the elements in hK_1
+	product(&hk_1, (const fp2_t*)hK_1, sK);
+
+	// --------------------------------------------------------------
+	// Now, we have all the ingredients for computing the image curve
+	// TODO (note addressed to Jorge): these t24/t24m variables can probably be omitted;
+	// they are only used once the values of the input A are no longer required, so the
+	// coordinates of B could be used in their place.
+	fp2_t A24, A24m,
+	   t24, t24m;
+
+	fp2_copy(&A24, &A.x);			// A' + 2C
+	fp2_sub(&A24m, &A.x, &A.z);		// A' - 2C
+	fp2_copy(&Ap, &A24m);			// Ap is reused as the base (A' - 2C) of the exponentiation
+
+	// left-to-right method for computing (A' + 2C)^l and (A' - 2C)^l
+	for (j = 1; j < (int)p_plus_minus_bitlength[i]; j++)
+	{
+		fp2_sqr(&A24, &A24);
+		fp2_sqr(&A24m, &A24m);
+		if( ( ((int)TORSION_ODD_PRIMES[i] >> ((int)p_plus_minus_bitlength[i] - j - 1)) & 1 ) != 0 )
+		{
+			fp2_mul(&A24, &A24, &A.x);
+			fp2_mul(&A24m, &A24m, &Ap);
+		}
+	}
+
+	fp2_mul(&t24m, &hk_1, &r1);			// output of algorithm 2 with alpha =-1 and without the denominator
+	fp2_sqr(&t24m, &t24m);			// raised at 2
+	fp2_sqr(&t24m, &t24m);			// raised at 4
+	fp2_sqr(&t24m, &t24m);			// raised at 8
+
+	fp2_mul(&t24, &hk_0, &r0);			// output of algorithm 2 with alpha = 1 and without the denominator
+	fp2_sqr(&t24, &t24);			// raised at 2
+	fp2_sqr(&t24, &t24);			// raised at 4
+	fp2_sqr(&t24, &t24);			// raised at 8
+
+	fp2_mul(&A24, &A24, &t24m);
+	fp2_mul(&A24m, &A24m, &t24);
+
+	// Now, we have d = (A24m / A24) where the image Montgomery curve coefficient is
+	//      B'   2*(1 + d)   2*(A24 + A24m)
+	// B = ---- = --------- = --------------
+	//      C      (1 - d)     (A24 - A24m)
+	// However, we required B' + 2C = 4*A24 and 4C = 4 * (A24 - A24m)
+
+	fp2_sub(&t24m, &A24, &A24m);		//   (A24 - A24m)
+	fp2_add(&t24m, &t24m, &t24m);		// 2*(A24 - A24m)
+	fp2_add(&t24m, &t24m, &t24m);		// 4*(A24 - A24m)
+
+	fp2_add(&t24, &A24, &A24);			// 2 * A24
+	fp2_add(&t24, &t24, &t24);			// 4 * A24
+
+	fp2_copy(&(B->x), &t24);
+	fp2_copy(&(B->z), &t24m);
+}
--- a/src/ec/ref/include/curve_extras.h
+++ b/src/ec/ref/include/curve_extras.h
@@ -0,0 +1,28 @@
+#ifndef CURVE_EXTRAS_H
+#define CURVE_EXTRAS_H
+
+#include "ec.h"
+#include "torsion_constants.h"
+
+// Projective point carrying three fp2 coordinates (x, y, z).
+// NOTE(review): the name suggests Jacobian coordinates — confirm against the users of this type.
+typedef struct jac_point_t {
+    fp2_t x;
+    fp2_t y;
+    fp2_t z;
+} jac_point_t;
+
+// Low-level helpers on x-only points (ec_point_t); each routine is declared exactly once.
+bool ec_is_zero(ec_point_t const* P);
+void copy_point(ec_point_t* P, ec_point_t const* Q);
+void swap_points(ec_point_t* P, ec_point_t* Q, const digit_t option);
+void ec_init(ec_point_t* P);
+void xDBLv2(ec_point_t* Q, ec_point_t const* P, ec_point_t const* A24);
+void xDBLADD(ec_point_t* R, ec_point_t* S, ec_point_t const* P, ec_point_t const* Q, ec_point_t const* PQ, ec_point_t const* A24);
+void xDBLMUL(ec_point_t* S, ec_point_t const* P, digit_t const* k, ec_point_t const* Q, digit_t const* l, ec_point_t const* PQ, ec_curve_t const* curve);
+void xDBL(ec_point_t* Q, ec_point_t const* P, ec_point_t const* AC);
+void xMUL(ec_point_t* Q, ec_point_t const* P, digit_t const* k, ec_curve_t const* curve);
+
+// Alternative names mapped onto the functions declared in ec.h
+#define is_point_equal ec_is_equal
+#define xADD ec_add
+
+#endif
+
--- a/src/ec/ref/include/ec.h
+++ b/src/ec/ref/include/ec.h
@@ -0,0 +1,776 @@
+/** @file
+ *
+ * @authors Luca De Feo, Francisco RH
+ *
+ * @brief Elliptic curve stuff
+*/
+
+#ifndef EC_H
+#define EC_H
+
+#include <fp2.h>
+#include <ec_params.h>
+
+
+/** @defgroup ec Elliptic curves
+ * @{
+*/
+
+/** @defgroup ec_t Data structures
+ * @{
+*/
+
+/** @brief Projective point
+ *
+ * @typedef ec_point_t
+ *
+ * @struct ec_point_t
+ *
+ * A projective point in (X:Z) coordinates: only the x and z coordinates
+ * are stored, there is no y coordinate.
+*/
+typedef struct ec_point_t {
+    fp2_t x;
+    fp2_t z;
+} ec_point_t;
+
+/** @brief A basis of a torsion subgroup
+ *
+ * @typedef ec_basis_t
+ *
+ * @struct ec_basis_t
+ *
+ * A triplet of points P, Q and PmQ describing a basis of a torsion subgroup.
+ * NOTE(review): PmQ presumably holds the difference P - Q required by
+ * x-only arithmetic (cf. the PQ argument of ec_add) — confirm.
+*/
+typedef struct ec_basis_t {
+    ec_point_t P;
+    ec_point_t Q;
+    ec_point_t PmQ;
+} ec_basis_t;
+
+/** @brief An elliptic curve
+ *
+ * @typedef ec_curve_t
+ *
+ * @struct ec_curve_t
+ *
+ * An elliptic curve in projective Montgomery form, stored as the pair of
+ * coefficients A and C.
+ * NOTE(review): presumably the affine Montgomery coefficient is A/C — confirm.
+*/
+typedef struct ec_curve_t {
+    fp2_t A;
+    fp2_t C; ///< cannot be 0
+} ec_curve_t;
+
+/** @brief An isogeny of degree a power of 2
+ *
+ * @typedef ec_isog_even_t
+ *
+ * @struct ec_isog_even_t
+ *
+ * Bundles the domain curve, a kernel generator and the length of the
+ * corresponding walk of 2-isogenies.
+*/
+typedef struct ec_isog_even_t {
+    ec_curve_t curve;      ///< The domain curve
+    ec_point_t kernel;     ///< A kernel generator
+    unsigned short length; ///< The length as a 2-isogeny walk
+} ec_isog_even_t;
+
+
+/** @brief An odd divisor of p² - 1
+ *
+ * @typedef ec_degree_odd_t
+ *
+ * Given that the list of divisors of p² - 1 is known, this is
+ * represented as a fixed-length vector of P_LEN + M_LEN integer exponents,
+ * each stored in a uint8_t.
+ * NOTE(review): presumably the first P_LEN entries refer to the odd prime
+ * factors of p + 1 and the remaining M_LEN entries to those of p - 1 — confirm.
+*/
+
+typedef uint8_t ec_degree_odd_t[P_LEN + M_LEN];
+
+/** @brief An isogeny of odd degree dividing p² - 1
+ *
+ * @typedef ec_isog_odd_t
+ *
+ * @struct ec_isog_odd_t
+ *
+ * The kernel is described by two generators, one in E[p+1] and one in E[p-1],
+ * together with the degree of the isogeny.
+*/
+typedef struct ec_isog_odd_t {
+    ec_curve_t curve;       // NOTE(review): presumably the domain curve, as in ec_isog_even_t — confirm
+    ec_point_t ker_plus;    ///< A generator of E[p+1] ∩ ker(φ)
+    ec_point_t ker_minus;   ///< A generator of E[p-1] ∩ ker(φ)
+    ec_degree_odd_t degree; ///< The degree of the isogeny
+} ec_isog_odd_t;
+
+/** @brief Isomorphism of Montgomery curves
+ *
+ * @typedef ec_isom_t
+ *
+ * @struct ec_isom_t
+ *
+ * The isomorphism is given by the map (X:Z) ↦ ( (Nx X - Nz Z) : (D Z) ),
+ * where Nx, Nz and D are the three fields of this structure.
+*/
+typedef struct ec_isom_t {
+    fp2_t Nx;
+    fp2_t Nz;
+    fp2_t D;
+} ec_isom_t;
+
+// end ec_t
+/** @}
+*/
+
+
+/** @defgroup ec_curve_t Curves and isomorphisms
+ * @{
+*/
+
+/**
+ * @brief j-invariant.
+ *
+ * @param j_inv computed j_invariant
+ * @param curve input curve
+ */
+void ec_j_inv(fp2_t* j_inv, const ec_curve_t* curve);
+
+/**
+ * @brief Isomorphism of elliptic curve
+ *
+ * @param isom computed isomorphism
+ * @param from domain curve
+ * @param to image curve
+ */
+void ec_isomorphism(ec_isom_t* isom, const ec_curve_t* from, const ec_curve_t* to);
+
+/**
+ * @brief In-place inversion of an isomorphism
+ *
+ * @param isom an isomorphism
+ */
+void ec_iso_inv(ec_isom_t* isom);
+
+/**
+ * @brief In-place evaluation of an isomorphism
+ *
+ * @param P a point
+ * @param isom an isomorphism
+ */
+void ec_iso_eval(ec_point_t* P, ec_isom_t* isom);
+
+/**
+ * @brief Given a Montgomery curve, computes a standard model for it and the isomorphism to it.
+ *
+ * @param new computed new curve
+ * @param isom computed isomorphism from `old` to `new`
+ * @param old A Montgomery curve
+ */
+void ec_curve_normalize(ec_curve_t *new, ec_isom_t *isom, const ec_curve_t *old);
+
+/** @}
+*/
+/** @defgroup ec_point_t Point operations
+ * @{
+*/
+
+/**
+ * @brief Point equality
+ *
+ * @param P a point
+ * @param Q a point
+ * @return 1 if equal
+ */
+bool ec_is_equal(const ec_point_t* P, const ec_point_t* Q);
+
+/**
+ * @brief Reduce Z-coordinate of point in place
+ *
+ * @param P a point
+ */
+void ec_normalize(ec_point_t* P);
+
+/**
+ * @brief Test whether a point is on a curve
+ *
+ * @param curve a curve
+ * @param P a point
+ * @return 1 if P is on the curve
+ */
+int ec_is_on_curve(const ec_curve_t* curve, const ec_point_t* P);
+
+/**
+ * @brief Point negation
+ *
+ * @param res computed opposite of P
+ * @param P a point
+ */
+void ec_neg(ec_point_t* res, const ec_point_t* P);
+
+/**
+ * @brief Point addition
+ *
+ * @param res computed sum of P and Q
+ * @param P a point
+ * @param Q a point
+ * @param PQ the difference P-Q
+ */
+void ec_add(ec_point_t* res, const ec_point_t* P, const ec_point_t* Q, const ec_point_t* PQ);
+
+/**
+ * @brief Point doubling
+ *
+ * @param res computed double of P
+ * @param curve the curve
+ * @param P a point
+ */
+void ec_dbl(ec_point_t* res, const ec_curve_t* curve, const ec_point_t* P);
+
+/**
+ * @brief Point multiplication
+ *
+ * @param res computed scalar * P
+ * @param curve the curve
+ * @param scalar an unsigned multi-precision integer
+ * @param P a point
+ */
+void ec_mul(ec_point_t* res, const ec_curve_t* curve, const digit_t* scalar, const ec_point_t* P);
+
+/**
+ * @brief Point multiplication by a scalar of limited length
+ *
+ * @param Q computed k * P
+ * @param P a point
+ * @param k the scalar, an unsigned multi-precision integer
+ * @param kbits the bit size of k
+ * @param A24 the curve, passed as a projective A24 coefficient
+ */
+void xMULv2(ec_point_t* Q, ec_point_t const* P, digit_t const* k, const int kbits, ec_point_t const* A24);
+
+/**
+ * @brief Combination P+m*Q
+ *
+ * @param R computed P + m * Q
+ * @param m an unsigned multi-precision integer
+ * @param P a point
+ * @param Q a point
+ * @param PQ the difference P-Q
+ * @param A the curve
+ */
+void ec_ladder3pt(ec_point_t *R, fp_t const m, ec_point_t const *P, ec_point_t const *Q, ec_point_t const *PQ, ec_curve_t const *A);
+
+/**
+ * @brief Linear combination of points of a basis
+ *
+ * @param res computed scalarP * P + scalarQ * Q
+ * @param curve the curve
+ * @param scalarP an unsigned multi-precision integer
+ * @param scalarQ an unsigned multi-precision integer
+ * @param PQ a torsion basis consisting of points P and Q
+ */
+void ec_biscalar_mul(ec_point_t* res, const ec_curve_t* curve,
+    const digit_t* scalarP, const digit_t* scalarQ,
+    const ec_basis_t* PQ);
+
+/** @}
+*/
+
+/** @defgroup ec_dlog_t Discrete logs and bases
+ * @{
+*/
+
+/**
+ * @brief Generate a 2^f-torsion basis of a given Montgomery curve
+ *
+ * The algorithm is deterministic
+ *
+ * @param PQ2 computed basis of the 2^f-torsion
+ * @param curve the curve (input, not modified)
+ */
+void ec_curve_to_basis_2(ec_basis_t *PQ2, const ec_curve_t *curve);
+
+/**
+ * @brief Complete a basis of the 2^f-torsion
+ *
+ * The algorithm is deterministic
+ *
+ * @param PQ2 a basis of the 2^f-torsion containing P as first generator
+ * @param curve the curve
+ * @param P a point of order 2^f
+ */
+void ec_complete_basis_2(ec_basis_t* PQ2, const ec_curve_t* curve, const ec_point_t* P);
+
+/**
+ * @brief Generate a 3^e-torsion basis
+ *
+ * The algorithm is deterministic
+ *
+ * @param PQ3 the computed 3^e-torsion basis
+ * @param curve a curve
+ */
+void ec_curve_to_basis_3(ec_basis_t* PQ3, const ec_curve_t* curve);
+
+/**
+ * @brief Generate a 6^e-torsion basis
+ *
+ * The algorithm is deterministic
+ *
+ * @param PQ6 the computed 2^f*3^g-torsion basis
+ * @param curve a curve
+ */
+void ec_curve_to_basis_6(ec_basis_t* PQ6, const ec_curve_t* curve);
+
+
+/**
+ * @brief Compute the generalized dlog of R wrt the 2^f-basis PQ2
+ *
+ * Ensure that R = scalarP * P + scalarQ * Q
+ *
+ * @param scalarP the computed dlog
+ * @param scalarQ the computed dlog
+ * @param PQ2 a 2^f-torsion basis
+ * @param R a point of order dividing 2^f
+ * @param curve the curve
+ */
+void ec_dlog_2(digit_t* scalarP, digit_t* scalarQ,
+    const ec_basis_t* PQ2, const ec_point_t* R, const ec_curve_t* curve);
+
+/**
+ * @brief Compute the generalized dlog of R wrt the 3^e-basis PQ3
+ *
+ * Ensure that R = scalarP * P + scalarQ * Q
+ *
+ * @param scalarP the computed dlog
+ * @param scalarQ the computed dlog
+ * @param PQ3 a 3^e-torsion basis
+ * @param R a point of order dividing 3^e
+ * @param curve the curve
+ */
+void ec_dlog_3(digit_t* scalarP, digit_t* scalarQ,
+    const ec_basis_t* PQ3, const ec_point_t* R, const ec_curve_t* curve);
+/** @}
+*/
+
+/** @defgroup ec_isog_t Isogenies
+ * @{
+*/
+
+/**
+ * @brief Evaluate isogeny of even degree on list of points
+ *
+ * @param image computed image curve
+ * @param phi isogeny
+ * @param points a list of points to evaluate the isogeny on, modified in place
+ * @param length of the list points
+ */
+void ec_eval_even(ec_curve_t* image, const ec_isog_even_t* phi,
+    ec_point_t* points, unsigned short length);
+
+/**
+ * @brief Evaluate isogeny of even degree on list of points, assuming the point (0,0) is not in the kernel
+ *
+ * @param image computed image curve
+ * @param phi isogeny
+ * @param points a list of points to evaluate the isogeny on, modified in place
+ * @param length of the list points
+ */
+void ec_eval_even_nonzero(ec_curve_t* image, const ec_isog_even_t* phi,
+    ec_point_t* points, unsigned short length);
+
+/**
+ * @brief Evaluate isogeny of even degree on list of torsion bases
+ *
+ * Thin wrapper around ec_eval_even(): the array of bases is handed over as a
+ * flat array of points, with the point count scaled by the number of
+ * ec_point_t members held in one ec_basis_t.
+ *
+ * @param image computed image curve
+ * @param phi isogeny
+ * @param points a list of bases to evaluate the isogeny on, modified in place
+ * @param length number of bases in the list
+ */
+static inline void ec_eval_even_basis(ec_curve_t* image, const ec_isog_even_t* phi,
+    ec_basis_t* points, unsigned short length) {
+    ec_point_t* const flat_points = (ec_point_t*)points;
+
+    ec_eval_even(image, phi, flat_points, sizeof(ec_basis_t) / sizeof(ec_point_t) * length);
+}
+
+/**
+ * @brief Evaluate isogeny of odd degree on list of points
+ *
+ * @param image computed image curve
+ * @param phi isogeny
+ * @param points a list of points to evaluate the isogeny on, modified in place
+ * @param length of the list points
+ */
+void ec_eval_odd(ec_curve_t* image, const ec_isog_odd_t* phi,
+    ec_point_t* points, unsigned short length);
+
+/**
+ * @brief Evaluate isogeny of odd degree on list of torsion bases
+ *
+ * Thin wrapper around ec_eval_odd(): the array of bases is handed over as a
+ * flat array of points, with the point count scaled by the number of
+ * ec_point_t members held in one ec_basis_t.
+ *
+ * @param image computed image curve
+ * @param phi isogeny
+ * @param points a list of bases to evaluate the isogeny on, modified in place
+ * @param length number of bases in the list
+ */
+static inline void ec_eval_odd_basis(ec_curve_t* image, const ec_isog_odd_t* phi,
+    ec_basis_t* points, unsigned short length) {
+    ec_point_t* const flat_points = (ec_point_t*)points;
+
+    ec_eval_odd(image, phi, flat_points, sizeof(ec_basis_t) / sizeof(ec_point_t) * length);
+}
+
+/** @}
+*/
+
+// end ec
+/** @}
+*/
+
+
+
+#endif
+
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////// ORIGINAL VERSION (kept for reference only; excluded from compilation by the #if 0 below)
+
+#if 0
+
+/** @file
+ *
+ * @authors Luca De Feo, Francisco RH
+ *
+ * @brief Elliptic curve stuff
+*/
+
+#ifndef EC_H
+#define EC_H
+
+#include <gf.h>
+
+/** @defgroup ec Elliptic curves
+ * @{
+*/
+
+/** @defgroup ec_t Data structures
+ * @{
+*/
+
+/** @brief Projective point
+ *
+ * @typedef ec_point_t
+ *
+ * @struct ec_point_t
+ *
+ * A projective point in (X:Z) or (X:Y:Z) coordinates (tbd).
+*/
+typedef struct ec_point_t {
+    fp2_t X;
+    //fp2_t Y;
+    fp2_t Z;
+} ec_point_t;
+
+/** @brief A basis of a torsion subgroup
+ *
+ * @typedef ec_basis_t
+ *
+ * @struct ec_basis_t
+ *
+ * A pair of points (or a triplet, tbd) forming a basis of a torsion subgroup.
+*/
+typedef struct ec_basis_t {
+    ec_point_t P;
+    ec_point_t Q;
+    ec_point_t PmQ;  // or maybe not
+} ec_basis_t;
+
+/** @brief An elliptic curve
+ *
+ * @typedef ec_curve_t
+ *
+ * @struct ec_curve_t
+ *
+ * An elliptic curve in projective Montgomery form
+*/
+typedef struct ec_curve_t {
+    fp2_t A;
+    fp2_t C; ///< cannot be 0
+} ec_curve_t;
+
+/** @brief An isogeny of degree a power of 2
+ *
+ * @typedef ec_isog_even_t
+ *
+ * @struct ec_isog_even_t
+*/
+typedef struct ec_isog_even_t {
+    ec_curve_t curve;      ///< The domain curve
+    ec_point_t kernel;     ///< A kernel generator
+    unsigned short length; ///< The length as a 2-isogeny walk
+} ec_isog_even_t;
+
+
+/** @brief An odd divisor of p² - 1
+ *
+ * @typedef ec_isog_odd_t
+ *
+ * @struct ec_isog_odd_t
+ *
+ * Given that the list of divisors of p² - 1 is known, this could be
+ * represented as a fixed-length vector of integer exponents, possibly
+ * distinguishing the divisors of p + 1 from those of p - 1.
+*/
+typedef struct ec_degree_odd_t {
+    // todo (basically a ushort vector)
+} ec_degree_odd_t;
+
+/** @brief An isogeny of odd degree dividing p² - 1
+ *
+ * @typedef ec_isog_odd_t
+ *
+ * @struct ec_isog_odd_t
+*/
+typedef struct ec_isog_odd_t {
+    ec_point_t ker_plus;    ///< A generator of E[p+1] ∩ ker(φ)
+    ec_point_t ker_minus;   ///< A generator of E[p-1] ∩ ker(φ)
+    ec_degree_odd_t degree; ///< The degree of the isogeny
+} ec_isog_odd_t;
+
+/** @brief Isomorphism of Montgomery curves
+ *
+ * @typedef ec_isom_t
+ *
+ * @struct ec_isom_t
+ *
+ * The isomorphism is given by the map maps (X:Z) ↦ ( (Nx X - Nz Z) : (D Z) )
+ * TODO: fix if (X:Y:Z) coordinates.
+*/
+typedef struct ec_isom_t {
+    fp2_t Nx;
+    fp2_t Nz;
+    fp2_t D;
+} ec_isom_t;
+
+// end ec_t
+/** @}
+*/
+
+
+/** @defgroup ec_curve_t Curves and isomorphisms
+ * @{
+*/
+
+/**
+ * @brief j-invariant.
+ *
+ * @param j_inv computed j_invariant
+ * @param curve input curve
+ */
+void ec_j_inv(fp2_t* j_inv, const ec_curve_t* curve);
+
+/**
+ * @brief Isomorphism of elliptic curve
+ *
+ * @param isom computed isomorphism
+ * @param from domain curve
+ * @param to image curve
+ */
+void ec_isomorphism(ec_isom_t* isom, const ec_curve_t* from, const ec_curve_t* to);
+
+/** @}
+*/
+/** @defgroup ec_point_t Point operations
+ * @{
+*/
+
+/**
+ * @brief Point equality
+ *
+ * @param P a point
+ * @param Q a point
+ * @return 1 if equal
+ */
+int ec_is_equal(const ec_point_t* P, const ec_point_t* Q);
+
+/**
+ * @brief Reduce Z-coordinate of point in place
+ *
+ * @param P a point
+ */
+void ec_normalize(ec_point_t* P);
+
+/**
+ * @brief Test whether a point is on a curve
+ *
+ * @param curve a curve
+ * @param P a point
+ * @return 1 if P is on the curve
+ */
+int ec_is_on_curve(const ec_curve_t* curve, const ec_point_t* P);
+
+/**
+ * @brief Point negation
+ *
+ * @param res computed opposite of P
+ * @param P a point
+ */
+void ec_neg(ec_point_t* res, const ec_point_t* P);
+
+/**
+ * @brief Point addition
+ *
+ * Needs to be adjusted if (X:Z) arithmetic.
+ *
+ * @param res computed sum of P and Q
+ * @param P a point
+ * @param Q a point
+ */
+void ec_add(ec_point_t* res, const ec_point_t* P, const ec_point_t* Q);
+
+/**
+ * @brief Point doubling
+ *
+ * @param res computed double of P
+ * @param P a point
+ */
+void ec_dbl(ec_point_t* res, const ec_curve_t* curve, const ec_point_t* P);
+
+/**
+ * @brief Point multiplication
+ *
+ * @param res computed scalar * P
+ * @param curve the curve
+ * @param scalar an unsigned multi-precision integer
+ * @param P a point
+ */
+void ec_mul(ec_point_t* res, const ec_curve_t* curve, const digit_t* scalar, const ec_point_t* P);
+
+/**
+ * @brief Linear combination of points of a basis
+ *
+ * @param res computed scalarP * P + scalarQ * Q
+ * @param curve the curve
+ * @param scalarP an unsigned multi-precision integer
+ * @param scalarQ an unsigned multi-precision integer
+ * @param PQ a torsion basis consisting of points P and Q
+ */
+void ec_biscalar_mul(ec_point_t* res, const ec_curve_t* curve,
+    const digit_t* scalarP, const digit_t* scalarQ,
+    const ec_basis_t* PQ);
+
+/** @}
+*/
+
+/** @defgroup ec_dlog_t Discrete logs and bases
+ * @{
+*/
+
+/**
+ * @brief Generate a Montgomery curve and a 2^f-torsion basis
+ *
+ * The algorithm is deterministc
+ *
+ * @param curve the computed curve
+ * @param PQ2 a basis of the 2^f-torsion
+ * @param j_inv a j-invariant
+ */
+void ec_j_to_basis_2(ec_curve_t* curve, ec_basis_t* PQ2, const fp2_t* j_inv);
+
+/**
+ * @brief Complete a basis of the 2^f-torsion
+ *
+ * The algorithm is deterministc
+ *
+ * @param PQ2 a basis of the 2^f-torsion containing P as first generator
+ * @param curve the curve
+ * @param P a point of order 2^f
+ */
+void ec_complete_basis_2(ec_basis_t* PQ2, const ec_curve_t* curve, const ec_point_t* P);
+
+/**
+ * @brief Generate a 3^e-torsion basis
+ *
+ * The algorithm is deterministc
+ *
+ * @param PQ3 the computed 3^e-torsion basis
+ * @param curve a curve
+ */
+void ec_curve_to_basis_3(ec_basis_t* PQ3, const ec_curve_t* curve);
+
+/**
+ * @brief Compute the generalized dlog of R wrt the 2^f-basis PQ2
+ *
+ * Ensure that R = scalarP * P + scalarQ * Q
+ *
+ * @param scalarP the computed dlog
+ * @param scalarQ the computed dlog
+ * @param PQ2 a 2^f-torsion basis
+ * @param R a point of order dividing 2^f
+ */
+void ec_dlog_2(digit_t* scalarP, digit_t* scalarQ,
+    const ec_basis_t* PQ2, const ec_point_t* R);
+
+/**
+ * @brief Compute the generalized dlog of R wrt the 3^e-basis PQ3
+ *
+ * Ensure that R = scalarP * P + scalarQ * Q
+ *
+ * @param scalarP the computed dlog
+ * @param scalarQ the computed dlog
+ * @param PQ3 a 3^e-torsion basis
+ * @param R a point of order dividing 3^e
+ */
+void ec_dlog_3(digit_t* scalarP, digit_t* scalarQ,
+    const ec_basis_t* PQ3, const ec_point_t* R);
+/** @}
+*/
+
+/** @defgroup ec_isog_t Isogenies
+ * @{
+*/
+
+/**
+ * @brief Evaluate isogeny of even degree on list of points
+ *
+ * @param image computed image curve
+ * @param phi isogeny
+ * @param points a list of points to evaluate the isogeny on, modified in place
+ * @param length of the list points
+ */
+void ec_eval_even(ec_curve_t* image, const ec_isog_even_t* phi,
+    ec_point_t* points, unsigned short length);
+
+/**
+ * @brief Evaluate isogeny of even degree on list of torsion bases
+ *
+ * @param image computed image curve
+ * @param phi isogeny
+ * @param points a list of bases to evaluate the isogeny on, modified in place
+ * @param length of the list bases
+ */
+static inline void ec_eval_even_basis(ec_curve_t* image, const ec_isog_even_t* phi,
+    ec_basis_t* points, unsigned short length) {
+    ec_eval_even(image, phi, points, sizeof(ec_basis_t) / sizeof(ec_point_t) * length);
+}
+
+/**
+ * @brief Evaluate isogeny of odd degree on list of points
+ *
+ * @param image computed image curve
+ * @param phi isogeny
+ * @param points a list of points to evaluate the isogeny on, modified in place
+ * @param length of the list points
+ */
+void ec_eval_odd(ec_curve_t* image, const ec_isog_odd_t* phi,
+    ec_point_t* points, unsigned short length);
+
+/**
+ * @brief Evaluate isogeny of odd degree on list of torsion bases
+ *
+ * @param image computed image curve
+ * @param phi isogeny
+ * @param points a list of bases to evaluate the isogeny on, modified in place
+ * @param length of the list bases
+ */
+static inline void ec_eval_odd_basis(ec_curve_t* image, const ec_isog_odd_t* phi,
+    ec_basis_t* points, unsigned short length) {
+    ec_eval_odd(image, phi, points, sizeof(ec_basis_t) / sizeof(ec_point_t) * length);
+}
+
+/** @}
+*/
+
+// end ec
+/** @}
+*/
+
+#endif
+
+
+#endif
--- a/src/ec/ref/include/isog.h
+++ b/src/ec/ref/include/isog.h
@@ -0,0 +1,84 @@
+#ifndef _ISOG_H_
+#define _ISOG_H_
+
+#include "curve_extras.h"
+#include "poly.h"
+
+// Global working state for the isogeny formulas; only declared here, the storage
+// itself lives in another translation unit.
+extern int sI, sJ, sK;	// Current sizes of the sets I, J, and K
+
+extern fp2_t I[sI_max][2],		// I also serves as the linear factors of the polynomial h_I(X)
+			EJ_0[sJ_max][3], EJ_1[sJ_max][3];	// To be used in xisog and xeval
+
+extern ec_point_t J[sJ_max], K[sK_max];		// Finite subsets of the kernel
+extern fp2_t XZJ4[sJ_max],		// -4* (Xj * Zj) for each j in J, and x([j]P) = (Xj : Zj)
+    rtree_A[(1 << (ceil_log_sI_max+2)) - 1],		// constant multiple of the reciprocal tree computation
+    A0;			// constant multiple of the reciprocal R0
+
+extern poly ptree_hI[(1 << (ceil_log_sI_max+2)) - 1],		// product tree of h_I(X)
+     rtree_hI[(1 << (ceil_log_sI_max+2)) - 1],		// reciprocal tree of h_I(X)
+     ptree_EJ[(1 << (ceil_log_sJ_max+2)) - 1];		// product tree of E_J(X)
+
+extern fp2_t R0[2*sJ_max + 1];		// Reciprocal of h_I(X) required in the scaled remainder tree approach
+
+extern int deg_ptree_hI[(1 << (ceil_log_sI_max+2)) - 1],	// degree of each node in the product tree of h_I(X)
+    deg_ptree_EJ[(1 << (ceil_log_sJ_max+2)) - 1];	// degree of each node in the product tree of E_J(X)
+
+extern fp2_t leaves[sI_max];		// leaves of the remainder tree, which are required in the Resultant computation
+
+
+void eds2mont(ec_point_t* P);						// mapping from twisted Edwards into Montgomery
+void yadd(ec_point_t* R, ec_point_t* const P, ec_point_t* const Q, ec_point_t* const PQ);	// differential addition on twisted Edwards model
+void CrissCross(fp2_t *r0, fp2_t *r1, fp2_t const alpha, fp2_t const beta, fp2_t const gamma, fp2_t const delta);
+
+// Per-prime precomputation; i is the same index that the inline wrappers in this
+// header use to look up TORSION_ODD_PRIMES
+void kps_t(uint64_t const i, ec_point_t const P, ec_point_t const A);	// tvelu formulae
+void kps_s(uint64_t const i, ec_point_t const P, ec_point_t const A);	// svelu formulae
+
+// Isogeny construction
+void xisog_4(ec_point_t* B, ec_point_t const P);			// degree-4 isogeny construction
+void xisog_4_singular(ec_point_t* B24, ec_point_t const P, ec_point_t A24);
+void xisog_2(ec_point_t* B, ec_point_t const P);			// degree-2 isogeny construction
+void xisog_t(ec_point_t* B, uint64_t const i, ec_point_t const A);	// tvelu formulae
+void xisog_s(ec_point_t* B, uint64_t const i, ec_point_t const A);	// svelu formulae
+
+// Isogeny evaluation
+void xeval_4(ec_point_t* R, const ec_point_t* Q, const int lenQ);					// degree-4 isogeny evaluation
+void xeval_4_singular(ec_point_t* R, const ec_point_t* Q, const int lenQ, const ec_point_t P);
+void xeval_2(ec_point_t* R, ec_point_t* const Q, const int lenQ);	// degree-2 isogeny evaluation
+void xeval_t(ec_point_t* Q, uint64_t const i, ec_point_t const P);			// tvelu formulae
+void xeval_s(ec_point_t* Q, uint64_t const i, ec_point_t const P, ec_point_t const A);	// svelu formulae
+
+// Strategy-based 4-isogeny chain
+// NOTE(review): this prototype has internal linkage (static) yet sits in a header, so
+// every translation unit that includes the header and calls it needs its own
+// definition - confirm that this is intended.
+static void ec_eval_even_strategy(ec_curve_t* image, ec_point_t* points, unsigned short points_len,
+    ec_point_t* A24, const ec_point_t *kernel, const int isog_len);
+
+void kps_clear(int i);	// Clear memory assigned by KPS
+
+
+// Hybrid velu formulae: dispatch to the tvelu (kps_t) or svelu (kps_s) variant
+static inline void kps(uint64_t const i, ec_point_t const P, ec_point_t const A)
+{
+	// The selection is made by comparing against the fixed public bound `gap`
+	if (TORSION_ODD_PRIMES[i] > gap) {
+		kps_s(i, P, A);
+		return;
+	}
+	kps_t(i, P, A);
+}
+
+// Hybrid velu formulae: dispatch to the tvelu (xisog_t) or svelu (xisog_s) variant
+static inline void xisog(ec_point_t* B, uint64_t const i, ec_point_t const A)
+{
+	// The selection is made by comparing against the fixed public bound `gap`
+	if (TORSION_ODD_PRIMES[i] > gap) {
+		xisog_s(B, i, A);
+		return;
+	}
+	xisog_t(B, i, A);
+}
+
+// Hybrid velu formulae: dispatch to the tvelu (xeval_t) or svelu (xeval_s) variant;
+// only the svelu variant receives A
+static inline void xeval(ec_point_t* Q, uint64_t const i, ec_point_t const P, ec_point_t const A)
+{
+	// The selection is made by comparing against the fixed public bound `gap`
+	if (TORSION_ODD_PRIMES[i] > gap) {
+		xeval_s(Q, i, P, A);
+		return;
+	}
+	xeval_t(Q, i, P);
+}
+
+
+#endif
--- a/src/ec/ref/include/poly.h
+++ b/src/ec/ref/include/poly.h
@@ -0,0 +1,28 @@
+#ifndef _POLY_H_
+#define _POLY_H_
+
+#include <fp2.h>
+
+typedef fp2_t *poly; // Polynomials are arrays of coeffs over Fq, lowest degree first
+
+// Polynomial multiplication variants (h receives the result; len* are the coefficient
+// counts of the inputs - presumably, confirm against the implementation)
+void poly_mul(poly h, const poly f, const int lenf, const poly g, const int leng);
+void poly_mul_low(poly h, const int n, const poly f, const int lenf, const poly g, const int leng);
+void poly_mul_middle(poly h, const poly g, const int leng, const poly f, const int lenf);
+void poly_mul_selfreciprocal(poly h, const poly g, const int leng, const poly f, const int lenf);
+
+// Product trees: H[] and DEG[] are filled starting at index root from the n inputs F[];
+// the _LENFeq2/_LENFeq3 variants take fixed-size rows of 2 resp. 3 coefficients
+void product_tree(poly H[], int DEG[], const int root, const poly F[], const int LENF, const int n);
+void product_tree_LENFeq2(poly H[], int DEG[], const int root, const fp2_t F[][2], const int n);
+void product_tree_LENFeq3(poly H[], int DEG[], const int root, const fp2_t F[][3], const int n);
+void product_tree_selfreciprocal(poly H[], int DEG[], const int root, const poly F[], const int LENF, const int n);
+void product_tree_selfreciprocal_LENFeq3(poly H[], int DEG[], const int root, const fp2_t F[][3], const int n);
+void clear_tree(poly H[], const int root, const int n);
+
+// Product of the n field elements F[] (by its name; result written to *c)
+void product(fp2_t *c, const fp2_t F[], const int n);
+
+// Reciprocals, reduction and multi-point evaluation on top of the trees above
+void reciprocal(poly h, fp2_t *c, const poly f, const int lenf, const int n);
+void poly_redc(poly h, const poly g, const int leng, const poly f, const int lenf,const poly f_inv, const fp2_t c);
+void reciprocal_tree(poly *R, fp2_t *A, const int leng, const poly H[], const int DEG[], const int root, const int n);
+void multieval_unscaled(fp2_t REM[], const poly g, const int leng, const poly R[], const fp2_t A[], const poly H[], const int DEG[], const int root, const int n);
+void multieval_scaled(fp2_t REM[], const poly G, const poly H[], const int DEG[], const int root, const int n);
+
+#endif /* _POLY_H_ */
--- a/src/ec/ref/include/sdacs.h
+++ b/src/ec/ref/include/sdacs.h
@@ -0,0 +1,50 @@
+#ifndef _SDACS_H_
+#define _SDACS_H_
+
+// Bit strings (characters '0'/'1'), one per table entry; they are gathered in SDACs[]
+// below. NOTE(review): presumably these encode differential addition chains, and the
+// P/M groups refer to two families of primes - confirm against the code that reads them.
+static char SDAC_P_0[] = "0";
+static char SDAC_P_1[] = "10";
+static char SDAC_P_2[] = "100";
+static char SDAC_P_3[] = "0100";
+static char SDAC_P_4[] = "10000";
+static char SDAC_P_5[] = "110000";
+static char SDAC_P_6[] = "100000";
+static char SDAC_P_7[] = "1100010001";
+static char SDAC_P_8[] = "1001010000";
+static char SDAC_P_9[] = "0101001000";
+static char SDAC_P_10[] = "110110010000";
+static char SDAC_P_11[] = "10000000000";
+static char SDAC_P_12[] = "1010100001001000";
+
+static char SDAC_M_0[] = "";
+static char SDAC_M_1[] = "000";
+static char SDAC_M_2[] = "1010";
+static char SDAC_M_3[] = "100010";
+static char SDAC_M_4[] = "0010000";
+static char SDAC_M_5[] = "110000000";
+static char SDAC_M_6[] = "1010101010";
+static char SDAC_M_7[] = "1010001000";
+static char SDAC_M_8[] = "1001000000";
+static char SDAC_M_9[] = "0100001000";
+static char SDAC_M_10[] ="101101010000"; 
+static char SDAC_M_11[] = "100100010010";
+static char SDAC_M_12[] = "010100011000";
+static char SDAC_M_13[] = "101010000001";
+static char SDAC_M_14[] = "010100001000";
+static char SDAC_M_15[] = "1101010010000";
+static char SDAC_M_16[] = "1001010001010";
+static char SDAC_M_17[] = "101001000000101";
+
+// All 31 strings in one table: the 13 SDAC_P_* entries first (indices 0..12),
+// followed by the 18 SDAC_M_* entries (indices 13..30)
+static char *SDACs[31] = {
+	SDAC_P_0, SDAC_P_1, SDAC_P_2, SDAC_P_3, SDAC_P_4, 
+	SDAC_P_5, SDAC_P_6, SDAC_P_7, SDAC_P_8, SDAC_P_9, 
+	SDAC_P_10, SDAC_P_11, SDAC_P_12, 
+	SDAC_M_0, SDAC_M_1, SDAC_M_2, SDAC_M_3, SDAC_M_4, 
+	SDAC_M_5, SDAC_M_6, SDAC_M_7, SDAC_M_8, SDAC_M_9, 
+	SDAC_M_10, SDAC_M_11, SDAC_M_12, SDAC_M_13, SDAC_M_14, 
+	SDAC_M_15, SDAC_M_16, SDAC_M_17
+	};
+
+// LENGTHS[i] is the number of characters of SDACs[i] (terminating NUL not counted);
+// it must be kept in sync by hand whenever a string above changes
+static int LENGTHS[] =	{
+1, 2, 3, 4, 5, 6, 6, 10, 10, 10, 12, 11, 16, 0, 3, 4, 6, 7, 9, 10, 10, 10, 10, 12, 12, 12, 12, 12, 13, 13, 15
+	};
+#endif
--- a/src/ec/ref/include/tedwards.h
+++ b/src/ec/ref/include/tedwards.h
@@ -0,0 +1,28 @@
+#ifndef TEDWARDS_H
+#define TEDWARDS_H
+
+#include <fp2.h>
+#include "ec.h"
+
+// a*x^2+y^2=1+d*x^2*y^2
+
+// Point on the twisted Edwards model, stored as four fp2 coordinates (x, y, z, t)
+// with the auxiliary coordinate t tied to the others by t = x*y/z
+typedef struct ted_point_t {
+    fp2_t x;
+    fp2_t y;
+    fp2_t z;
+    fp2_t t; // t = x*y/z
+} ted_point_t;
+
+// Initialisation, equality test and copy of twisted Edwards points
+void ted_init(ted_point_t* P);
+bool is_ted_equal(ted_point_t const* P1, ted_point_t const* P2);
+void copy_ted_point(ted_point_t* P, ted_point_t const* Q);
+
+// Group operations; the first argument receives the result, and doubling/addition
+// additionally take the curve E
+void ted_neg(ted_point_t* Q, ted_point_t const* P);
+void ted_dbl(ted_point_t* Q, ted_point_t const* P, ec_curve_t const* E);
+void ted_add(ted_point_t* S, ted_point_t const* P, ted_point_t const* Q, ec_curve_t const* E);
+
+// Conversions between the Montgomery and the twisted Edwards model
+// (curve constants and points)
+void mont_to_ted(ec_curve_t* E, ec_curve_t const* A);
+void mont_to_ted_point(ted_point_t* Q, ec_point_t const* P, ec_curve_t const* A);
+void ted_to_mont_point(ec_point_t* Q, ted_point_t const* P);
+
+#endif
--- a/src/ec/ref/lvl1/CMakeLists.txt
+++ b/src/ec/ref/lvl1/CMakeLists.txt
@@ -0,0 +1,17 @@
+# Reference implementation of the EC module for the current variant.
+# The sources are shared between variants and live in ${ECX_DIR}.
+set(SOURCE_FILES_EC_${SVARIANT_UPPER}_REF
+    ${ECX_DIR}/poly-mul.c
+    ${ECX_DIR}/poly-redc.c
+    ${ECX_DIR}/ec.c
+    ${ECX_DIR}/tedwards.c
+    ${ECX_DIR}/kps.c
+    ${ECX_DIR}/xisog.c
+    ${ECX_DIR}/xeval.c
+    ${ECX_DIR}/isog_chains.c
+    ${ECX_DIR}/basis.c
+)
+
+add_library(${LIB_EC_${SVARIANT_UPPER}}
+    ${SOURCE_FILES_EC_${SVARIANT_UPPER}_REF}
+)
+
+# Header search path, in lookup order
+target_include_directories(${LIB_EC_${SVARIANT_UPPER}}
+    PRIVATE
+        ${INC_INTBIG}
+        ${INC_PRECOMP_${SVARIANT_UPPER}}
+        ${INC_PUBLIC}
+        ${INC_GF_${SVARIANT_UPPER}}
+        ${INC_COMMON}
+        ${INC_EC}
+)
+
+target_compile_options(${LIB_EC_${SVARIANT_UPPER}}
+    PRIVATE
+        ${C_OPT_FLAGS}
+)
+
+# Unit tests and benchmarks of this variant
+add_subdirectory(test)
--- a/src/ec/ref/lvl1/test/CMakeLists.txt
+++ b/src/ec/ref/lvl1/test/CMakeLists.txt
@@ -0,0 +1,36 @@
+# Test executables of the EC module for the current variant.
+# All of them are built from the shared sources in ${ECX_DIR}/test. Include
+# directories and link dependencies are PRIVATE because nothing links against
+# an executable.
+
+add_executable(fp2.test_${SVARIANT_LOWER} ${ECX_DIR}/test/fp2-test.c)
+target_include_directories(fp2.test_${SVARIANT_LOWER} PRIVATE
+    ${INC_GF_${SVARIANT_UPPER}}
+    ${INC_PRECOMP_${SVARIANT_UPPER}}
+    ${PROJECT_SOURCE_DIR}/include
+    ../include
+    ${INC_EC}
+    ${INC_COMMON})
+target_link_libraries(fp2.test_${SVARIANT_LOWER} PRIVATE
+    ${LIB_GF_${SVARIANT_UPPER}})
+
+add_executable(poly-mul.test_${SVARIANT_LOWER} ${ECX_DIR}/test/poly-mul-test.c)
+target_include_directories(poly-mul.test_${SVARIANT_LOWER} PRIVATE
+    ${INC_GF_${SVARIANT_UPPER}}
+    ${INC_PRECOMP_${SVARIANT_UPPER}}
+    ${PROJECT_SOURCE_DIR}/include
+    ../include
+    ${INC_EC}
+    ${INC_COMMON})
+target_link_libraries(poly-mul.test_${SVARIANT_LOWER} PRIVATE
+    ${LIB_GF_${SVARIANT_UPPER}}
+    ${LIB_EC_${SVARIANT_UPPER}})
+
+add_executable(poly-redc.test_${SVARIANT_LOWER} ${ECX_DIR}/test/poly-redc-test.c)
+target_include_directories(poly-redc.test_${SVARIANT_LOWER} PRIVATE
+    ${INC_GF_${SVARIANT_UPPER}}
+    ${INC_PRECOMP_${SVARIANT_UPPER}}
+    ${PROJECT_SOURCE_DIR}/include
+    ../include
+    ${INC_EC}
+    ${INC_COMMON})
+target_link_libraries(poly-redc.test_${SVARIANT_LOWER} PRIVATE
+    ${LIB_GF_${SVARIANT_UPPER}}
+    ${LIB_EC_${SVARIANT_UPPER}})
+
+add_executable(mont.test_${SVARIANT_LOWER} ${ECX_DIR}/test/mont-test.c)
+target_include_directories(mont.test_${SVARIANT_LOWER} PRIVATE
+    ${INC_GF_${SVARIANT_UPPER}}
+    ${INC_INTBIG}
+    ${INC_PRECOMP_${SVARIANT_UPPER}}
+    ${PROJECT_SOURCE_DIR}/include
+    ../include
+    ${INC_EC}
+    ${INC_COMMON}
+    .)
+target_link_libraries(mont.test_${SVARIANT_LOWER} PRIVATE
+    ${LIB_PRECOMP_${SVARIANT_UPPER}}
+    ${LIB_INTBIG}
+    ${LIB_GF_${SVARIANT_UPPER}}
+    ${LIB_EC_${SVARIANT_UPPER}})
+
+add_executable(ec.test_${SVARIANT_LOWER} ${ECX_DIR}/test/ec-test.c ${ECX_DIR}/test/test_extras.c)
+target_include_directories(ec.test_${SVARIANT_LOWER} PRIVATE
+    ${ECX_DIR}/test
+    ${INC_GF_${SVARIANT_UPPER}}
+    ${INC_INTBIG}
+    ${INC_PRECOMP_${SVARIANT_UPPER}}
+    ${PROJECT_SOURCE_DIR}/include
+    ../include
+    ${INC_EC}
+    ${INC_COMMON}
+    .)
+target_link_libraries(ec.test_${SVARIANT_LOWER} PRIVATE
+    ${LIB_PRECOMP_${SVARIANT_UPPER}}
+    ${LIB_INTBIG}
+    ${LIB_GF_${SVARIANT_UPPER}}
+    ${LIB_EC_${SVARIANT_UPPER}})
+
+add_executable(velu.test_${SVARIANT_LOWER} ${ECX_DIR}/test/velu-test.c)
+target_include_directories(velu.test_${SVARIANT_LOWER} PRIVATE
+    ${INC_GF_${SVARIANT_UPPER}}
+    ${INC_INTBIG}
+    ${INC_PRECOMP_${SVARIANT_UPPER}}
+    ${PROJECT_SOURCE_DIR}/include
+    ../include
+    ${INC_EC}
+    ${INC_COMMON}
+    .)
+target_link_libraries(velu.test_${SVARIANT_LOWER} PRIVATE
+    ${LIB_PRECOMP_${SVARIANT_UPPER}}
+    ${LIB_INTBIG}
+    ${LIB_GF_${SVARIANT_UPPER}}
+    ${LIB_EC_${SVARIANT_UPPER}})
+
+add_executable(isog.test_${SVARIANT_LOWER} ${ECX_DIR}/test/isog-test.c)
+target_include_directories(isog.test_${SVARIANT_LOWER} PRIVATE
+    ${INC_GF_${SVARIANT_UPPER}}
+    ${INC_INTBIG}
+    ${INC_PRECOMP_${SVARIANT_UPPER}}
+    ${PROJECT_SOURCE_DIR}/include
+    ../include
+    ${INC_EC}
+    ${INC_COMMON}
+    .)
+target_link_libraries(isog.test_${SVARIANT_LOWER} PRIVATE
+    ${LIB_PRECOMP_${SVARIANT_UPPER}}
+    ${LIB_INTBIG}
+    ${LIB_GF_${SVARIANT_UPPER}}
+    ${LIB_EC_${SVARIANT_UPPER}})
+
+# CTest registration. The NAME/COMMAND signature is used so that the executable
+# target name is resolved to the built binary for every generator and
+# configuration. Test names are unchanged (ec_<executable name>).
+# ec.test takes the extra leading argument "test" ahead of the repetition count.
+add_test(NAME ec_fp2.test_${SVARIANT_LOWER} COMMAND fp2.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
+add_test(NAME ec_poly-mul.test_${SVARIANT_LOWER} COMMAND poly-mul.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
+add_test(NAME ec_poly-redc.test_${SVARIANT_LOWER} COMMAND poly-redc.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
+add_test(NAME ec_mont.test_${SVARIANT_LOWER} COMMAND mont.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
+add_test(NAME ec_ec.test_${SVARIANT_LOWER} COMMAND ec.test_${SVARIANT_LOWER} test ${SQISIGN_TEST_REPS})
+add_test(NAME ec_velu.test_${SVARIANT_LOWER} COMMAND velu.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
+add_test(NAME ec_isog.test_${SVARIANT_LOWER} COMMAND isog.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
--- a/src/ec/ref/lvl1/test/ec-tests.h
+++ b/src/ec/ref/lvl1/test/ec-tests.h
@@ -0,0 +1,400 @@
+#ifndef EC_TESTS_H
+#define EC_TESTS_H
+
+#include "test_extras.h"
+#include <stdio.h>
+#include <string.h>
+#include <bench.h>       //////// NOTE: enable later
+#include "test-basis.h"
+#include "ec_params.h"
+
+// Global constants
+extern const digit_t p[NWORDS_FIELD];
+
+// Benchmark and test parameters  
+static int BENCH_LOOPS = 1000;       // Number of iterations per bench
+static int TEST_LOOPS  = 512;       // Number of iterations per test
+
+
+// Known-answer tests for the x-only arithmetic: xDBL, xADD, xMUL (small and
+// multi-word scalar) and xDBLMUL. Every result is normalised to x/z, passed
+// through fp2_frommont and compared word by word with a hard-coded expected
+// value. Returns true when all checks match; prints FAILED and returns false
+// on the first mismatch.
+bool ec_test()
+{ // Tests for ecc arithmetic
+    bool OK = true;
+    int passed;
+    ec_point_t P = {0}, Q = {0}, R = {0}, S = {0}, SS = {0}, PQ = {0};
+    ec_point_t AC = {0};
+    digit_t k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
+
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
+    printf("Testing ecc functions: \n\n"); 
+
+    // Point doubling
+    passed = 1;
+    P.x.re[0] = 0xDFD70ED0861BD329; P.x.re[1] = 0x20ACD3758C7F5540; P.x.re[2] = 0x3DCCDC007277F80A; P.x.re[3] = 0x18D6D2A22981DCE1;
+    P.x.im[0] = 0x3C23730A3F08F38C; P.x.im[1] = 0x98BB973AFD3D954D; P.x.im[2] = 0x8D98ADFC2829AE8A; P.x.im[3] = 0x21A2464D6369AFBA;
+    P.z.re[0] = 0x01;
+
+    // Curve constants: AC.x stays zero, AC.z is set to one (AC starts zeroed, so
+    // writing the lowest word before the conversion is enough)
+    AC.z.re[0] = 0x01;
+    fp2_tomont(&AC.z, &AC.z);
+        
+    fp2_tomont(&R.x, &P.x);
+    fp2_tomont(&R.z, &P.z);
+    xDBL(&S, &R, &AC);
+    fp2_copy(&SS.x, &S.x);    // Copy of S = SS <- 2P 
+    fp2_copy(&SS.z, &S.z);
+    fp2_inv(&S.z);
+    fp2_mul(&S.x, &S.x, &S.z);
+    fp2_frommont(&S.x, &S.x);
+
+    // Expected x-coordinate of the doubling (R.x is reused as the reference value)
+    R.x.re[0] = 0x5950EE0A4AF90FC8; R.x.re[1] = 0x16488065A0A98B08; R.x.re[2] = 0xCE65322229DA0FD1; R.x.re[3] = 0x270A35FF781EE204;
+    R.x.im[0] = 0x564447FD9EC57F6B; R.x.im[1] = 0x2EE24E984294F729; R.x.im[2] = 0x53A6C7360E972C71; R.x.im[3] = 0x4FCF4B9928A7C7E;
+
+    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2)!=0) { passed=0; goto out0; }
+    
+    // Point addition: xADD on SS (= 2P from above) and Q, with PQ as fourth argument
+    Q.x.re[0] = 0xC46076A670C70053; Q.x.re[1] = 0x97517AFA3AB9ED13; Q.x.re[2] = 0x349644C942EDF993; Q.x.re[3] = 0xBB4A4DB6F29AF9E;
+    Q.x.im[0] = 0x8B47629FB5A15BB0; Q.x.im[1] = 0x4EC6E809953C1A10; Q.x.im[2] = 0x1F83F0EC6CBB84D6; Q.x.im[3] = 0x1D8417C1D33265D3;
+    Q.z.re[0] = 0x01;
+
+    PQ.x.re[0] = 0x853F66D11BE5534F; PQ.x.re[1] = 0x27C8FD4E52D03D4A; PQ.x.re[2] = 0xF88EA78D0A0C29D2; PQ.x.re[3] = 0x2F6DFB07D397A067;
+    PQ.x.im[0] = 0xE8DBC4AA34434BA1; PQ.x.im[1] = 0x7A73AE182636F8A0; PQ.x.im[2] = 0x419EC260137868EB; PQ.x.im[3] = 0x129B3E301703D43F;
+    PQ.z.re[0] = 0x01;
+
+    fp2_tomont(&S.x, &Q.x);
+    fp2_tomont(&S.z, &Q.z);
+    fp2_tomont(&PQ.x, &PQ.x);
+    fp2_tomont(&PQ.z, &PQ.z);
+    xADD(&S, &SS, &S, &PQ);
+    fp2_inv(&S.z);
+    fp2_mul(&S.x, &S.x, &S.z);
+    fp2_frommont(&S.x, &S.x);
+
+    R.x.re[0] = 0xED0BEB8F93AB4FF9; R.x.re[1] = 0x27CF508B80CD49BF; R.x.re[2] = 0x38A6134DFA04B2BA; R.x.re[3] = 0x27B4CB15E109EF1F;
+    R.x.im[0] = 0x6F731BA6FD227BDE; R.x.im[1] = 0x14C12335341167F8; R.x.im[2] = 0xECA7B60F7866E27A; R.x.im[3] = 0x2A7A79A152880457;
+
+    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
+    
+    // Scalar multiplication by the small scalar 126
+    fp2_tomont(&R.x, &P.x);
+    fp2_tomont(&R.z, &P.z);
+    k[0] = 126;
+    xMUL(&S, &R, k, (ec_curve_t*)&AC);
+    fp2_inv(&S.z);
+    fp2_mul(&S.x, &S.x, &S.z);
+    fp2_frommont(&S.x, &S.x);
+
+    R.x.re[0] = 0xDE80F87A1203A147; R.x.re[1] = 0xD59E1215928A3B2D; R.x.re[2] = 0xD5A67F83A5A8CE46; R.x.re[3] = 0xA11E162488C9CDF;
+    R.x.im[0] = 0x9417D0D79A26741B; R.x.im[1] = 0x8B1F47D6F0FE5EEC; R.x.im[2] = 0xE52188DCB054CE36; R.x.im[3] = 0x1A8075A6C3148AB3;
+
+    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
+    
+    // Scalar multiplication by a four-word scalar
+    fp2_tomont(&R.x, &P.x);
+    fp2_tomont(&R.z, &P.z);
+    k[0] = 0xE77AD6B6C6B2D8CD;
+    k[1] = 0xDE43A0B600F38D12;
+    k[2] = 0xA35F4A7897E17CE2;
+    k[3] = 0x10ACB62E614D1237;
+    xMUL(&S, &R, k, (ec_curve_t*)&AC);
+    fp2_inv(&S.z);
+    fp2_mul(&S.x, &S.x, &S.z);
+    fp2_frommont(&S.x, &S.x);
+
+    R.x.re[0] = 0xD3938B0A68A3E7C0; R.x.re[1] = 0xE0667113208A0595; R.x.re[2] = 0x258F314C84E9CB60; R.x.re[3] = 0x14984BA7CA59AB71;
+    R.x.im[0] = 0xFE728423EE3BFEF4; R.x.im[1] = 0xBF68C42FE21AE0E4; R.x.im[2] = 0xA8FAF9C9528609CA; R.x.im[3] = 0x1225EC77A1DC0285;
+
+    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
+    
+    // Double-scalar multiplication on R (loaded from Q) and SS with scalars k and l;
+    // SS is first rescaled to x/z with z replaced by R.z
+    fp2_tomont(&R.x, &Q.x);
+    fp2_tomont(&R.z, &Q.z);
+    k[0] = 0xE77AD6B6C6B2D8CD;
+    k[1] = 0xDE43A0B600F38D12;
+    k[2] = 0xA35F4A7897E17CE2;
+    k[3] = 0x10ACB62E614D1237;
+    l[0] = 0x34AB78B6C6B2D8C0;
+    l[1] = 0xDE6B2D8CD00F38D1;
+    l[2] = 0xA35F4A7897E17CE2;
+    l[3] = 0x20ACF4A789614D13;
+    fp2_inv(&SS.z);
+    fp2_mul(&SS.x, &SS.x, &SS.z);
+    fp2_copy(&SS.z, &R.z);
+    xDBLMUL(&S, &R, k, &SS, l, &PQ, (ec_curve_t*)&AC);
+    fp2_inv(&S.z);
+    fp2_mul(&S.x, &S.x, &S.z);
+    fp2_frommont(&S.x, &S.x);
+
+    R.x.re[0] = 0x554E1ADC609B992F; R.x.re[1] = 0xE407D961F8CC4C42; R.x.re[2] = 0x1CF626AFED5A68CE; R.x.re[3] = 0x6D02692EE110483;
+    R.x.im[0] = 0x16FB094E831C8997; R.x.im[1] = 0xFDE4ECF31DC5F702; R.x.im[2] = 0x89303D868DFAD7B4; R.x.im[3] = 0xC91ACE81346F22D;
+
+    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
+    
+    // Common exit: every failed comparison above jumps here with passed == 0
+out0:
+    if (passed==1) printf("  ECC arithmetic tests ............................................ PASSED");
+    else { printf("  ECC arithmetic tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+ 
+    return OK;
+}
+
+// Round-trip tests for ec_dlog_2 and ec_dlog_3: for TEST_LOOPS scalar pairs (k, l)
+// the point R = xDBLMUL(P, k, Q, l) is built from the hard-coded torsion basis and
+// the scalars returned by the dlog routine are compared with (k, l), after the
+// expected values have been mapped to the representative the routine reports.
+// Returns true when every iteration matches; prints FAILED and returns false otherwise.
+bool dlog_test()
+{ // Tests for dlog
+    bool OK = true;
+    int passed;
+    ec_point_t P = {0}, Q = {0}, R = {0}, PQ = {0};
+    ec_curve_t AC = {0};
+    ec_basis_t PQ2;
+    digit_t scalarP[NWORDS_ORDER], scalarQ[NWORDS_ORDER], k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
+    digit_t kt[NWORDS_ORDER], lt[NWORDS_ORDER], f1[NWORDS_ORDER] = {0}, f2[NWORDS_ORDER] = {0}, zero[NWORDS_ORDER] = {0}, tpFdiv2[NWORDS_ORDER] = {0}, tpF[NWORDS_ORDER] = {0};
+
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n");
+    printf("Testing dlog functions: \n\n");
+
+    // dlog2 testing
+    passed = 1;
+    
+    // Load the 2^f-torsion basis (x-coordinates from test-basis.h, z = 1)
+    fp2_tomont(&P.x, &xP2);
+    fp_mont_setone(P.z.re);
+    fp_set(P.z.im, 0);
+    
+    fp2_tomont(&Q.x, &xQ2);
+    fp_mont_setone(Q.z.re);
+    fp_set(Q.z.im, 0);
+    
+    fp2_tomont(&PQ.x, &xPQ2);
+    fp_mont_setone(PQ.z.re);
+    fp_set(PQ.z.im, 0);
+
+    // Curve constants: A stays zero, C = 1 (AC is still all-zero here, so writing
+    // the lowest word before the conversion is sufficient)
+    AC.C.re[0] = 0x01;
+    fp_copy(f1, TWOpFm1);
+    fp_copy(f2, TWOpF);
+    fp2_tomont(&AC.C, &AC.C);
+
+    copy_point(&PQ2.P, &P);
+    copy_point(&PQ2.Q, &Q);
+    copy_point(&PQ2.PmQ, &PQ);
+    k[0] = 0xFFFFFFFFFFFFFFFF;
+    k[1] = 0x00000000000007FF;
+    l[0] = 0xFFFFFFFFFFFFFFFE;
+    l[1] = 0x00000000000007FF;
+
+    for (int n = 0; n < TEST_LOOPS; n++)
+    {
+        k[0] -= 1;
+        l[0] -= 2;
+        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
+        ec_dlog_2(scalarP, scalarQ, &PQ2, &R, &AC);
+
+        // Expected output: (k, l) itself, or (f2 - k, f2 - l) when the pair lies in
+        // the upper range selected by the comparisons against f1 below
+        memcpy(kt, k, NWORDS_ORDER*RADIX/8);
+        memcpy(lt, l, NWORDS_ORDER*RADIX/8);
+        if (compare_words(k, f1, NWORDS_ORDER) == 1 ||
+           (compare_words(l, f1, NWORDS_ORDER) == 1 && (compare_words(k, zero, NWORDS_ORDER) == 0 || compare_words(k, f1, NWORDS_ORDER) == 0))) {
+            if (compare_words(k, zero, NWORDS_ORDER) != 0) {
+                sub_test(kt, f2, kt, NWORDS_ORDER);
+            }
+            if (compare_words(l, zero, NWORDS_ORDER) != 0) {
+                sub_test(lt, f2, lt, NWORDS_ORDER);
+            }
+        }
+        if (compare_words((digit_t*)scalarP, (digit_t*)kt, NWORDS_ORDER) != 0 || compare_words((digit_t*)scalarQ, (digit_t*)lt, NWORDS_ORDER) != 0) { passed = 0; break; }
+    }
+
+    if (passed == 1) printf("  dlog2 tests ..................................................... PASSED");
+    else { printf("  dlog2 tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // dlog3 testing
+    passed = 1;
+    
+    // Load the 3^g-torsion basis (x-coordinates from test-basis.h, z = 1)
+    fp2_tomont(&P.x, &xP3);
+    fp_mont_setone(P.z.re);
+    fp_set(P.z.im, 0);
+    
+    fp2_tomont(&Q.x, &xQ3);
+    fp_mont_setone(Q.z.re);
+    fp_set(Q.z.im, 0);
+    
+    fp2_tomont(&PQ.x, &xPQ3);
+    fp_mont_setone(PQ.z.re);
+    fp_set(PQ.z.im, 0);
+
+    // AC.C still holds the Montgomery-form value from the dlog2 part. Writing a raw 1
+    // into its lowest word and converting again would leave the upper words of the
+    // old value in place and corrupt C, so C is set to one directly instead.
+    fp_mont_setone(AC.C.re);
+    fp_set(AC.C.im, 0);
+    fp_copy(tpFdiv2, THREEpFdiv2);
+    fp_copy(tpF, THREEpF);
+
+    // PQ2 is reused to carry the 3^g-torsion basis
+    copy_point(&PQ2.P, &P);
+    copy_point(&PQ2.Q, &Q);
+    copy_point(&PQ2.PmQ, &PQ);
+    k[1] = 0;
+    l[1] = 0;
+    k[0] = 0x02153E468B91C6D1;
+    l[0] = 0x02153E468B91C6D0;
+
+    for (int n = 0; n < TEST_LOOPS; n++)
+    {
+        k[0] -= 1;
+        l[0] -= 2;
+        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
+        ec_dlog_3(scalarP, scalarQ, &PQ2, &R, &AC);
+
+        // Expected output: (k, l) itself, or (tpF - k, tpF - l) when the pair lies in
+        // the upper range selected by the comparisons against tpFdiv2 below
+        memcpy(kt, k, NWORDS_ORDER*RADIX/8);
+        memcpy(lt, l, NWORDS_ORDER*RADIX/8);
+        if (compare_words(k, tpFdiv2, NWORDS_ORDER) == 1 ||
+           (compare_words(l, tpFdiv2, NWORDS_ORDER) == 1 && compare_words(k, zero, NWORDS_ORDER) == 0)) {
+            if (compare_words(k, zero, NWORDS_ORDER) != 0) {
+                sub_test(kt, tpF, kt, NWORDS_ORDER);
+            }
+            if (compare_words(l, zero, NWORDS_ORDER) != 0) {
+                sub_test(lt, tpF, lt, NWORDS_ORDER);
+            }
+        }
+        if (compare_words((digit_t*)scalarP, (digit_t*)kt, NWORDS_ORDER) != 0 || compare_words((digit_t*)scalarQ, (digit_t*)lt, NWORDS_ORDER) != 0) { passed = 0; break; }
+    }
+
+    if (passed == 1) printf("  dlog3 tests ..................................................... PASSED");
+    else { printf("  dlog3 tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    return OK;
+}
+
+// Benchmarks xDBL, xADD, xMUL and xDBLMUL, printing the average cycle count over
+// BENCH_LOOPS calls of each. Always returns true.
+//
+// All inputs are given defined values before timing starts (2^f-torsion basis from
+// test-basis.h, curve constants with z = 1, random scalars), so no indeterminate
+// stack contents are read. xDBL and xMUL write their result to R so that the
+// triple (P, Q, PQ) is not overwritten before the later benchmarks use it.
+bool ec_run()
+{
+    bool OK = true;
+    int n;
+    unsigned long long cycles, cycles1, cycles2;
+    ec_point_t P = {0}, Q = {0}, R = {0}, PQ = {0}, AC = {0};
+    digit_t k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
+        
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
+    printf("Benchmarking ecc arithmetic: \n\n"); 
+
+    // Input points (z = 1)
+    fp2_tomont(&P.x, &xP2);
+    fp_mont_setone(P.z.re);
+    fp_set(P.z.im, 0);
+
+    fp2_tomont(&Q.x, &xQ2);
+    fp_mont_setone(Q.z.re);
+    fp_set(Q.z.im, 0);
+
+    fp2_tomont(&PQ.x, &xPQ2);
+    fp_mont_setone(PQ.z.re);
+    fp_set(PQ.z.im, 0);
+
+    // Curve constants, set up the same way as in ec_test(): x = 0, z = 1
+    AC.z.re[0] = 0x01;
+    fp2_tomont(&AC.z, &AC.z);
+
+    // Scalars for the (double-)scalar multiplication benchmarks
+    fprandom_test(k); fprandom_test(l);
+
+    // Point doubling
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        xDBL(&R, &P, &AC);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    printf("  Montgomery x-only doubling runs in .............................. %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // Point addition
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        xADD(&R, &Q, &P, &PQ);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  Montgomery x-only addition runs in .............................. %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // Point multiplication
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        xMUL(&R, &P, k, (ec_curve_t*)&AC);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  Montgomery x-only scalar multiplication runs in ................. %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // Double-scalar point multiplication
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        xDBLMUL(&R, &P, k, &Q, l, &PQ, (ec_curve_t*)&AC);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  Montgomery x-only double-scalar multiplication runs in .......... %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    return OK;
+}
+
+// Benchmarks ec_dlog_2 and ec_dlog_3: for each of BENCH_LOOPS iterations fresh
+// scalars are drawn with fprandom_test, the target point is built with xDBLMUL
+// outside the timed region, and only the dlog call itself is timed. Prints the
+// average cycle count per call and always returns true.
+bool dlog_run()
+{
+    bool OK = true;
+    int n;
+    unsigned long long cycles, cycles1, cycles2;
+    ec_point_t P = {0}, Q = {0}, R = {0}, PQ = {0};
+    ec_curve_t AC = {0};
+    ec_basis_t PQ2;
+    digit_t scalarP[NWORDS_ORDER], scalarQ[NWORDS_ORDER], k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
+
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n");
+    printf("Benchmarking dlog2: \n\n");
+
+    // dlog2 computation
+    
+    // Load the 2^f-torsion basis (x-coordinates from test-basis.h, z = 1)
+    fp2_tomont(&P.x, &xP2);
+    fp_mont_setone(P.z.re);
+    fp_set(P.z.im, 0);
+    
+    fp2_tomont(&Q.x, &xQ2);
+    fp_mont_setone(Q.z.re);
+    fp_set(Q.z.im, 0);
+    
+    fp2_tomont(&PQ.x, &xPQ2);
+    fp_mont_setone(PQ.z.re);
+    fp_set(PQ.z.im, 0);
+
+    // Curve constants: A stays zero, C = 1; set once and shared by both benchmarks
+    AC.C.re[0] = 0x01;
+    fp2_tomont(&AC.C, &AC.C);
+
+    copy_point(&PQ2.P, &P);
+    copy_point(&PQ2.Q, &Q);
+    copy_point(&PQ2.PmQ, &PQ);
+
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        fprandom_test(k); fprandom_test(l);
+        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
+        cycles1 = cpucycles();
+        ec_dlog_2(scalarP, scalarQ, &PQ2, &R, &AC);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  dlog2 runs in ................................................... %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // dlog3 computation
+
+    // Load the 3^g-torsion basis; PQ2 is reused to carry it
+    fp2_tomont(&P.x, &xP3);
+    fp_mont_setone(P.z.re);
+    fp_set(P.z.im, 0);
+    
+    fp2_tomont(&Q.x, &xQ3);
+    fp_mont_setone(Q.z.re);
+    fp_set(Q.z.im, 0);
+    
+    fp2_tomont(&PQ.x, &xPQ3);
+    fp_mont_setone(PQ.z.re);
+    fp_set(PQ.z.im, 0);
+
+    copy_point(&PQ2.P, &P);
+    copy_point(&PQ2.Q, &Q);
+    copy_point(&PQ2.PmQ, &PQ);
+
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        fprandom_test(k); fprandom_test(l);
+        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
+        cycles1 = cpucycles();
+        ec_dlog_3(scalarP, scalarQ, &PQ2, &R, &AC);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  dlog3 runs in ................................................... %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    return OK;
+}
+
+#endif
--- a/src/ec/ref/lvl1/test/test-basis.h
+++ b/src/ec/ref/lvl1/test/test-basis.h
@@ -0,0 +1,24 @@
+#ifndef TEST_BASIS_H
+#define TEST_BASIS_H
+
+#include "fp2.h"
+// Full-torsion basis for A=0 (excluding 2^f and huge prime factors)
+const fp2_t xPA = {{0x7505815fb30f099e,0x89e78dbb4294c8df,0x7db9b4b1f7716d7b,0x13fcd4c87af65308},{0x93533c1017088fd4,0x6df9e398a1bb4cb1,0xc928f082be2e2b4c,0x17aa7e2906bef0af}};
+const fp2_t xQA = {{0xe96336b75eb5a505,0x5640cecad0ad7b5a,0x1394f0771bc58ac1,0x18d92124656d68d9},{0xa54e8e24605754f0,0xe52de9790bbe4bb9,0x3bf9b7833f62e255,0x277a07644ec4f0e2}};
+const fp2_t xPQA = {{0xc8fcceb408e3444c,0x9f8ca4d2c05c3287,0x259e496f17c0f529,0x0eb18a51c2a3dd1a},{0x1014dbe2534b8310,0x6b035ee3c371ea12,0x8354ecb4c111db6d,0x178259b78fe08093}};
+
+const fp2_t xPB = {{0xbd0a2f0c9a5378ca,0x74af17405042203d,0x0ccdcb4b7f0b8c15,0x314c70951a92d8bf},{0xe889e6bc5f9842af,0xefb0edbb5e266ab3,0x7bfb9d05f1ba6962,0x0a5f3f4fe6f16514}};
+const fp2_t xQB = {{0x137e215438caaf3b,0xc4403ee1b69f1382,0x2b5783edcefa7246,0x3015572698262f66},{0x8e88e4293f84536e,0x8d6dbc277f85ff77,0xb3f17b53b01da916,0x08dd3f4976c5dad1}};
+const fp2_t xPQB = {{0xf0c2701a7050d9b9,0xc8fdb069c0234d3a,0x9ec25780f2b101a8,0x221a0565053e8ff4},{0xd8513bf6a05910ae,0x47ff2422258dfb3a,0xb98ccceae31ac407,0x21bcc8e659aaa1b3}};
+
+// 2^f-torsion basis for A=0
+const fp2_t xP2 = {{0xfc93bac7df77fd30,0xa8d37e10783215bd,0x4bd2ece4f148039b,0x2bd5b83f5f8c09fb},{0x444112970b59f12f,0x557b8b9beb55c276,0x633f97cd9464df6c,0x00a1b21b593a2dfd}};
+const fp2_t xQ2 = {{0x6b4289960273222c,0xa290d8eb8e343a04,0x0c0a333f80a0ed68,0x31a58910e276aff0},{0xb7ca615ad7473865,0xeb6f72f20794f050,0x2941c3fe3203b94f,0x32ad5cbe915e467b}};
+const fp2_t xPQ2 = {{0xac9f90005e47b095,0x47eafdafd5168836,0xb88aac8334acdad0,0x1a5cf52a20f665b4},{0x4baa70fb1f5fa99c,0xffb7ddb12c87f1a3,0xdd3a229d370a8484,0x1e992ad0a14baf03}};
+
+// 3^g-torsion basis for A=0
+const fp2_t xP3 = {{0x8cf496c2722f340d,0x3e329c5a507ad39c,0xa0c7caa3e4537e25,0x1371d43cf97de48e},{0xa4b94c97b8149e7d,0xd290853fa14704c7,0x158b854173c1b289,0x04c6dcda7872c23f}};
+const fp2_t xQ3 = {{0x0f6380fd4c963950,0x101a22a245c4f563,0x601d3e30b21a5f43,0x0becd5f73b067949},{0xd364123c6806057e,0x8ff24fca9e060260,0x3b52df5bfb817901,0x30950462489b838f}};
+const fp2_t xPQ3 = {{0xe04cab7169e64a82,0x56df573ea9295c19,0x06cbb6af8e341990,0x0f1046ca03017ca1},{0x2dac3457c35be728,0x2f59af21113f25f9,0xa0dc4f54eec2715d,0x102ecf9a7ff2f2ff}};
+
+#endif
--- a/src/ec/ref/lvl3/CMakeLists.txt
+++ b/src/ec/ref/lvl3/CMakeLists.txt
@@ -0,0 +1,17 @@
+# Reference implementation of the EC module for the current variant.
+# The sources are shared between variants and live in ${ECX_DIR}.
+set(SOURCE_FILES_EC_${SVARIANT_UPPER}_REF
+    ${ECX_DIR}/poly-mul.c
+    ${ECX_DIR}/poly-redc.c
+    ${ECX_DIR}/ec.c
+    ${ECX_DIR}/tedwards.c
+    ${ECX_DIR}/kps.c
+    ${ECX_DIR}/xisog.c
+    ${ECX_DIR}/xeval.c
+    ${ECX_DIR}/isog_chains.c
+    ${ECX_DIR}/basis.c
+)
+
+add_library(${LIB_EC_${SVARIANT_UPPER}}
+    ${SOURCE_FILES_EC_${SVARIANT_UPPER}_REF}
+)
+
+# Header search path, in lookup order
+target_include_directories(${LIB_EC_${SVARIANT_UPPER}}
+    PRIVATE
+        ${INC_INTBIG}
+        ${INC_PRECOMP_${SVARIANT_UPPER}}
+        ${INC_PUBLIC}
+        ${INC_GF_${SVARIANT_UPPER}}
+        ${INC_COMMON}
+        ${INC_EC}
+)
+
+target_compile_options(${LIB_EC_${SVARIANT_UPPER}}
+    PRIVATE
+        ${C_OPT_FLAGS}
+)
+
+# Unit tests and benchmarks of this variant
+add_subdirectory(test)
--- a/src/ec/ref/lvl3/test/CMakeLists.txt
+++ b/src/ec/ref/lvl3/test/CMakeLists.txt
@@ -0,0 +1,36 @@
+# Test executables for the EC module of the variant selected by
+# SVARIANT_LOWER / SVARIANT_UPPER, followed by their CTest registration.
+#
+# * Test sources are taken from ${ECX_DIR}/test.
+# * Include directories and link libraries are PRIVATE: nothing links against
+#   a test executable, so no usage requirement has to propagate.
+# * Relative include paths are spelled out against CMAKE_CURRENT_SOURCE_DIR
+#   (which is how target_include_directories() resolves them anyway).
+# * Tests are registered with add_test(NAME ... COMMAND ...), so that the
+#   executable *target* is resolved to its real output location instead of
+#   relying on ctest finding a binary of that name in the working directory.
+
+# --- fp2 arithmetic (only needs the GF library) ------------------------------
+add_executable(fp2.test_${SVARIANT_LOWER} ${ECX_DIR}/test/fp2-test.c)
+target_include_directories(fp2.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${INC_GF_${SVARIANT_UPPER}}
+        ${INC_PRECOMP_${SVARIANT_UPPER}}
+        ${PROJECT_SOURCE_DIR}/include
+        ${CMAKE_CURRENT_SOURCE_DIR}/../include
+        ${INC_EC}
+        ${INC_COMMON}
+)
+target_link_libraries(fp2.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${LIB_GF_${SVARIANT_UPPER}}
+)
+
+# --- polynomial multiplication -----------------------------------------------
+add_executable(poly-mul.test_${SVARIANT_LOWER} ${ECX_DIR}/test/poly-mul-test.c)
+target_include_directories(poly-mul.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${INC_GF_${SVARIANT_UPPER}}
+        ${INC_PRECOMP_${SVARIANT_UPPER}}
+        ${PROJECT_SOURCE_DIR}/include
+        ${CMAKE_CURRENT_SOURCE_DIR}/../include
+        ${INC_EC}
+        ${INC_COMMON}
+)
+target_link_libraries(poly-mul.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${LIB_GF_${SVARIANT_UPPER}}
+        ${LIB_EC_${SVARIANT_UPPER}}
+)
+
+# --- polynomial reduction ----------------------------------------------------
+add_executable(poly-redc.test_${SVARIANT_LOWER} ${ECX_DIR}/test/poly-redc-test.c)
+target_include_directories(poly-redc.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${INC_GF_${SVARIANT_UPPER}}
+        ${INC_PRECOMP_${SVARIANT_UPPER}}
+        ${PROJECT_SOURCE_DIR}/include
+        ${CMAKE_CURRENT_SOURCE_DIR}/../include
+        ${INC_EC}
+        ${INC_COMMON}
+)
+target_link_libraries(poly-redc.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${LIB_GF_${SVARIANT_UPPER}}
+        ${LIB_EC_${SVARIANT_UPPER}}
+)
+
+# --- Montgomery curve arithmetic ---------------------------------------------
+add_executable(mont.test_${SVARIANT_LOWER} ${ECX_DIR}/test/mont-test.c)
+target_include_directories(mont.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${INC_GF_${SVARIANT_UPPER}}
+        ${INC_INTBIG}
+        ${INC_PRECOMP_${SVARIANT_UPPER}}
+        ${PROJECT_SOURCE_DIR}/include
+        ${CMAKE_CURRENT_SOURCE_DIR}/../include
+        ${INC_EC}
+        ${INC_COMMON}
+        ${CMAKE_CURRENT_SOURCE_DIR}
+)
+target_link_libraries(mont.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${LIB_PRECOMP_${SVARIANT_UPPER}}
+        ${LIB_INTBIG}
+        ${LIB_GF_${SVARIANT_UPPER}}
+        ${LIB_EC_${SVARIANT_UPPER}}
+)
+
+# --- EC / dlog tests and benchmarks ------------------------------------------
+# This directory is on the include path so that the per-variant headers stored
+# here (ec-tests.h, test-basis.h) are found.
+add_executable(ec.test_${SVARIANT_LOWER}
+    ${ECX_DIR}/test/ec-test.c
+    ${ECX_DIR}/test/test_extras.c
+)
+target_include_directories(ec.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${ECX_DIR}/test
+        ${INC_GF_${SVARIANT_UPPER}}
+        ${INC_INTBIG}
+        ${INC_PRECOMP_${SVARIANT_UPPER}}
+        ${PROJECT_SOURCE_DIR}/include
+        ${CMAKE_CURRENT_SOURCE_DIR}/../include
+        ${INC_EC}
+        ${INC_COMMON}
+        ${CMAKE_CURRENT_SOURCE_DIR}
+)
+target_link_libraries(ec.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${LIB_PRECOMP_${SVARIANT_UPPER}}
+        ${LIB_INTBIG}
+        ${LIB_GF_${SVARIANT_UPPER}}
+        ${LIB_EC_${SVARIANT_UPPER}}
+)
+
+# --- Velu formulas -----------------------------------------------------------
+add_executable(velu.test_${SVARIANT_LOWER} ${ECX_DIR}/test/velu-test.c)
+target_include_directories(velu.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${INC_GF_${SVARIANT_UPPER}}
+        ${INC_INTBIG}
+        ${INC_PRECOMP_${SVARIANT_UPPER}}
+        ${PROJECT_SOURCE_DIR}/include
+        ${CMAKE_CURRENT_SOURCE_DIR}/../include
+        ${INC_EC}
+        ${INC_COMMON}
+        ${CMAKE_CURRENT_SOURCE_DIR}
+)
+target_link_libraries(velu.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${LIB_PRECOMP_${SVARIANT_UPPER}}
+        ${LIB_INTBIG}
+        ${LIB_GF_${SVARIANT_UPPER}}
+        ${LIB_EC_${SVARIANT_UPPER}}
+)
+
+# --- isogeny chains ----------------------------------------------------------
+add_executable(isog.test_${SVARIANT_LOWER} ${ECX_DIR}/test/isog-test.c)
+target_include_directories(isog.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${INC_GF_${SVARIANT_UPPER}}
+        ${INC_INTBIG}
+        ${INC_PRECOMP_${SVARIANT_UPPER}}
+        ${PROJECT_SOURCE_DIR}/include
+        ${CMAKE_CURRENT_SOURCE_DIR}/../include
+        ${INC_EC}
+        ${INC_COMMON}
+        ${CMAKE_CURRENT_SOURCE_DIR}
+)
+target_link_libraries(isog.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${LIB_PRECOMP_${SVARIANT_UPPER}}
+        ${LIB_INTBIG}
+        ${LIB_GF_${SVARIANT_UPPER}}
+        ${LIB_EC_${SVARIANT_UPPER}}
+)
+
+# --- CTest registration ------------------------------------------------------
+# Every test receives ${SQISIGN_TEST_REPS} on its command line; the ec test
+# additionally gets the literal argument "test" in front of it.
+add_test(NAME ec_fp2.test_${SVARIANT_LOWER}       COMMAND fp2.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
+add_test(NAME ec_poly-mul.test_${SVARIANT_LOWER}  COMMAND poly-mul.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
+add_test(NAME ec_poly-redc.test_${SVARIANT_LOWER} COMMAND poly-redc.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
+add_test(NAME ec_mont.test_${SVARIANT_LOWER}      COMMAND mont.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
+add_test(NAME ec_ec.test_${SVARIANT_LOWER}        COMMAND ec.test_${SVARIANT_LOWER} test ${SQISIGN_TEST_REPS})
+add_test(NAME ec_velu.test_${SVARIANT_LOWER}      COMMAND velu.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
+add_test(NAME ec_isog.test_${SVARIANT_LOWER}      COMMAND isog.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
--- a/src/ec/ref/lvl3/test/ec-tests.h
+++ b/src/ec/ref/lvl3/test/ec-tests.h
@@ -0,0 +1,400 @@
+#ifndef EC_TESTS_H
+#define EC_TESTS_H
+
+#include "test_extras.h"
+#include <stdio.h>
+#include <string.h>
+#include <bench.h>       //////// NOTE: enable later
+#include "test-basis.h"
+#include "ec_params.h"
+
+// Global constants
+// NOTE(review): `p` is only declared here; no active code in this header reads it.
+// Presumably the field modulus defined in another translation unit -- confirm.
+extern const digit_t p[NWORDS_FIELD];
+
+// Benchmark and test parameters
+// BENCH_LOOPS bounds every timed loop in ec_run() and dlog_run();
+// TEST_LOOPS bounds the two check loops in dlog_test().
+static int BENCH_LOOPS = 1000;       // Number of iterations per bench
+static int TEST_LOOPS  = 512;       // Number of iterations per test
+
+
+// ec_test - placeholder for the x-only ECC arithmetic known-answer tests.
+//
+// The checks for xDBL, xADD, xMUL and xDBLMUL are currently disabled (see the
+// comment block inside the function), so the function only prints a banner
+// flagged "(NOT IMPLEMENTED)".
+//
+// Returns: always true, because nothing is verified.
+bool ec_test(void)
+{ // Tests for ecc arithmetic
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
+    printf("Testing ecc functions: (NOT IMPLEMENTED) \n\n"); 
+
+    // Disabled known-answer tests, kept for reference.
+    // The local declarations live inside the disabled block so that the active
+    // stub has no unused variables and the block stays self-contained.
+    // NOTE(review): the vectors below fill only limbs 0..3 of each coordinate,
+    // whereas the fp2_t constants in this directory's test-basis.h carry six
+    // limbs per component. They look like they were produced for a 4-limb
+    // parameter set and most likely need regenerating before this block is
+    // re-enabled -- TODO confirm.
+/*
+    int passed;
+    ec_point_t P = {0}, Q = {0}, R = {0}, S = {0}, SS = {0}, PQ = {0};
+    ec_point_t AC = {0};
+    digit_t k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
+
+    // Point doubling
+    passed = 1;
+    P.x.re[0] = 0xDFD70ED0861BD329; P.x.re[1] = 0x20ACD3758C7F5540; P.x.re[2] = 0x3DCCDC007277F80A; P.x.re[3] = 0x18D6D2A22981DCE1;
+    P.x.im[0] = 0x3C23730A3F08F38C; P.x.im[1] = 0x98BB973AFD3D954D; P.x.im[2] = 0x8D98ADFC2829AE8A; P.x.im[3] = 0x21A2464D6369AFBA;
+    P.z.re[0] = 0x01;
+
+    AC.z.re[0] = 0x01;
+    fp2_tomont(&AC.z, &AC.z);
+        
+    fp2_tomont(&R.x, &P.x);
+    fp2_tomont(&R.z, &P.z);
+    xDBL(&S, &R, &AC);
+    fp2_copy(&SS.x, &S.x);    // Copy of S = SS <- 2P 
+    fp2_copy(&SS.z, &S.z);
+    fp2_inv(&S.z);
+    fp2_mul(&S.x, &S.x, &S.z);
+    fp2_frommont(&S.x, &S.x);
+
+    R.x.re[0] = 0x5950EE0A4AF90FC8; R.x.re[1] = 0x16488065A0A98B08; R.x.re[2] = 0xCE65322229DA0FD1; R.x.re[3] = 0x270A35FF781EE204;
+    R.x.im[0] = 0x564447FD9EC57F6B; R.x.im[1] = 0x2EE24E984294F729; R.x.im[2] = 0x53A6C7360E972C71; R.x.im[3] = 0x4FCF4B9928A7C7E;
+
+    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2)!=0) { passed=0; goto out0; }
+    
+    Q.x.re[0] = 0xC46076A670C70053; Q.x.re[1] = 0x97517AFA3AB9ED13; Q.x.re[2] = 0x349644C942EDF993; Q.x.re[3] = 0xBB4A4DB6F29AF9E;
+    Q.x.im[0] = 0x8B47629FB5A15BB0; Q.x.im[1] = 0x4EC6E809953C1A10; Q.x.im[2] = 0x1F83F0EC6CBB84D6; Q.x.im[3] = 0x1D8417C1D33265D3;
+    Q.z.re[0] = 0x01;
+
+    PQ.x.re[0] = 0x853F66D11BE5534F; PQ.x.re[1] = 0x27C8FD4E52D03D4A; PQ.x.re[2] = 0xF88EA78D0A0C29D2; PQ.x.re[3] = 0x2F6DFB07D397A067;
+    PQ.x.im[0] = 0xE8DBC4AA34434BA1; PQ.x.im[1] = 0x7A73AE182636F8A0; PQ.x.im[2] = 0x419EC260137868EB; PQ.x.im[3] = 0x129B3E301703D43F;
+    PQ.z.re[0] = 0x01;
+
+    fp2_tomont(&S.x, &Q.x);
+    fp2_tomont(&S.z, &Q.z);
+    fp2_tomont(&PQ.x, &PQ.x);
+    fp2_tomont(&PQ.z, &PQ.z);
+    xADD(&S, &SS, &S, &PQ);
+    fp2_inv(&S.z);
+    fp2_mul(&S.x, &S.x, &S.z);
+    fp2_frommont(&S.x, &S.x);
+
+    R.x.re[0] = 0xED0BEB8F93AB4FF9; R.x.re[1] = 0x27CF508B80CD49BF; R.x.re[2] = 0x38A6134DFA04B2BA; R.x.re[3] = 0x27B4CB15E109EF1F;
+    R.x.im[0] = 0x6F731BA6FD227BDE; R.x.im[1] = 0x14C12335341167F8; R.x.im[2] = 0xECA7B60F7866E27A; R.x.im[3] = 0x2A7A79A152880457;
+
+    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
+    
+    fp2_tomont(&R.x, &P.x);
+    fp2_tomont(&R.z, &P.z);
+    k[0] = 126;
+    xMUL(&S, &R, k, (ec_curve_t*)&AC);
+    fp2_inv(&S.z);
+    fp2_mul(&S.x, &S.x, &S.z);
+    fp2_frommont(&S.x, &S.x);
+
+    R.x.re[0] = 0xDE80F87A1203A147; R.x.re[1] = 0xD59E1215928A3B2D; R.x.re[2] = 0xD5A67F83A5A8CE46; R.x.re[3] = 0xA11E162488C9CDF;
+    R.x.im[0] = 0x9417D0D79A26741B; R.x.im[1] = 0x8B1F47D6F0FE5EEC; R.x.im[2] = 0xE52188DCB054CE36; R.x.im[3] = 0x1A8075A6C3148AB3;
+
+    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
+    
+    fp2_tomont(&R.x, &P.x);
+    fp2_tomont(&R.z, &P.z);
+    k[0] = 0xE77AD6B6C6B2D8CD;
+    k[1] = 0xDE43A0B600F38D12;
+    k[2] = 0xA35F4A7897E17CE2;
+    k[3] = 0x10ACB62E614D1237;
+    xMUL(&S, &R, k, (ec_curve_t*)&AC);
+    fp2_inv(&S.z);
+    fp2_mul(&S.x, &S.x, &S.z);
+    fp2_frommont(&S.x, &S.x);
+
+    R.x.re[0] = 0xD3938B0A68A3E7C0; R.x.re[1] = 0xE0667113208A0595; R.x.re[2] = 0x258F314C84E9CB60; R.x.re[3] = 0x14984BA7CA59AB71;
+    R.x.im[0] = 0xFE728423EE3BFEF4; R.x.im[1] = 0xBF68C42FE21AE0E4; R.x.im[2] = 0xA8FAF9C9528609CA; R.x.im[3] = 0x1225EC77A1DC0285;
+
+    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
+    
+    fp2_tomont(&R.x, &Q.x);
+    fp2_tomont(&R.z, &Q.z);
+    k[0] = 0xE77AD6B6C6B2D8CD;
+    k[1] = 0xDE43A0B600F38D12;
+    k[2] = 0xA35F4A7897E17CE2;
+    k[3] = 0x10ACB62E614D1237;
+    l[0] = 0x34AB78B6C6B2D8C0;
+    l[1] = 0xDE6B2D8CD00F38D1;
+    l[2] = 0xA35F4A7897E17CE2;
+    l[3] = 0x20ACF4A789614D13;
+    fp2_inv(&SS.z);
+    fp2_mul(&SS.x, &SS.x, &SS.z);
+    fp2_copy(&SS.z, &R.z);
+    xDBLMUL(&S, &R, k, &SS, l, &PQ, (ec_curve_t*)&AC);
+    fp2_inv(&S.z);
+    fp2_mul(&S.x, &S.x, &S.z);
+    fp2_frommont(&S.x, &S.x);
+
+    R.x.re[0] = 0x554E1ADC609B992F; R.x.re[1] = 0xE407D961F8CC4C42; R.x.re[2] = 0x1CF626AFED5A68CE; R.x.re[3] = 0x6D02692EE110483;
+    R.x.im[0] = 0x16FB094E831C8997; R.x.im[1] = 0xFDE4ECF31DC5F702; R.x.im[2] = 0x89303D868DFAD7B4; R.x.im[3] = 0xC91ACE81346F22D;
+
+    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
+    
+out0:
+    if (passed==1) printf("  ECC arithmetic tests ............................................ PASSED");
+    else { printf("  ECC arithmetic tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+ */
+    return true;
+}
+
+// dlog_test - round-trip test for ec_dlog_2 and ec_dlog_3.
+//
+// For each of the two torsion bases taken from test-basis.h (xP2/xQ2/xPQ2 and
+// xP3/xQ3/xPQ3) the test runs TEST_LOOPS iterations. Each iteration
+//   1. decrements the scalars (k by 1, l by 2),
+//   2. computes R with xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC),
+//   3. calls ec_dlog_2 / ec_dlog_3 on R, and
+//   4. compares the returned scalars with (k, l), or with the pair obtained
+//      by subtracting each non-zero scalar from the group-order constant
+//      (TWOpF / THREEpF) when the sign-normalisation condition below holds.
+//
+// Prints one PASSED/FAILED line per basis.
+// Returns: true if both loops succeed, false as soon as one of them fails.
+bool dlog_test(void)
+{ // Tests for dlog
+    int passed;
+    ec_point_t P = {0}, Q = {0}, R = {0}, PQ = {0};
+    ec_curve_t AC = {0};
+    ec_basis_t PQ2;
+    digit_t scalarP[NWORDS_ORDER], scalarQ[NWORDS_ORDER], k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
+    digit_t kt[NWORDS_ORDER], lt[NWORDS_ORDER], f1[NWORDS_ORDER] = {0}, f2[NWORDS_ORDER] = {0}, zero[NWORDS_ORDER] = {0}, tpFdiv2[NWORDS_ORDER] = {0}, tpF[NWORDS_ORDER] = {0};
+
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n");
+    printf("Testing dlog functions: \n\n");
+
+    // dlog2 testing
+    passed = 1;
+
+    // Basis points: x from test-basis.h converted with fp2_tomont, z set to one
+    fp2_tomont(&P.x, &xP2);
+    fp_mont_setone(P.z.re);
+    fp_set(P.z.im, 0);
+
+    fp2_tomont(&Q.x, &xQ2);
+    fp_mont_setone(Q.z.re);
+    fp_set(Q.z.im, 0);
+
+    fp2_tomont(&PQ.x, &xPQ2);
+    fp_mont_setone(PQ.z.re);
+    fp_set(PQ.z.im, 0);
+
+    // Curve: every other field of AC stays zero from its initialiser, C is set
+    // to one with the same helper that is used for the z-coordinates above.
+    fp_mont_setone(AC.C.re);
+    fp_set(AC.C.im, 0);
+
+    // NOTE(review): fp_copy presumably copies NWORDS_FIELD limbs; this assumes
+    // NWORDS_ORDER >= NWORDS_FIELD for f1/f2 -- confirm.
+    fp_copy(f1, TWOpFm1);
+    fp_copy(f2, TWOpF);
+
+    copy_point(&PQ2.P, &P);
+    copy_point(&PQ2.Q, &Q);
+    copy_point(&PQ2.PmQ, &PQ);
+    k[0] = 0xFFFFFFFFFFFFFFFF;
+    k[1] = 0x00000000000007FF;
+    l[0] = 0xFFFFFFFFFFFFFFFE;
+    l[1] = 0x00000000000007FF;
+
+    for (int n = 0; n < TEST_LOOPS; n++)
+    {
+        k[0] -= 1;
+        l[0] -= 2;
+        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
+        ec_dlog_2(scalarP, scalarQ, &PQ2, &R, &AC);
+
+        // Expected result starts as (k, l) ...
+        memcpy(kt, k, sizeof kt);
+        memcpy(lt, l, sizeof lt);
+        // ... and is replaced by (f2 - k, f2 - l) for non-zero entries when k
+        // exceeds f1, or when k is 0 or f1 and l exceeds f1. Presumably this
+        // mirrors the sign choice made by ec_dlog_2, since only x-coordinates
+        // are available and R cannot be told apart from -R.
+        if (compare_words(k, f1, NWORDS_ORDER) == 1 ||
+           (compare_words(l, f1, NWORDS_ORDER) == 1 && (compare_words(k, zero, NWORDS_ORDER) == 0 || compare_words(k, f1, NWORDS_ORDER) == 0))) {
+            if (compare_words(k, zero, NWORDS_ORDER) != 0) {
+                sub_test(kt, f2, kt, NWORDS_ORDER);
+            }
+            if (compare_words(l, zero, NWORDS_ORDER) != 0) {
+                sub_test(lt, f2, lt, NWORDS_ORDER);
+            }
+        }
+        if (compare_words((digit_t*)scalarP, (digit_t*)kt, NWORDS_ORDER) != 0 || compare_words((digit_t*)scalarQ, (digit_t*)lt, NWORDS_ORDER) != 0) { passed = 0; break; }
+    }
+
+    if (passed == 1) printf("  dlog2 tests ..................................................... PASSED");
+    else { printf("  dlog2 tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // dlog3 testing
+    passed = 1;
+
+    fp2_tomont(&P.x, &xP3);
+    fp_mont_setone(P.z.re);
+    fp_set(P.z.im, 0);
+
+    fp2_tomont(&Q.x, &xQ3);
+    fp_mont_setone(Q.z.re);
+    fp_set(Q.z.im, 0);
+
+    fp2_tomont(&PQ.x, &xPQ3);
+    fp_mont_setone(PQ.z.re);
+    fp_set(PQ.z.im, 0);
+
+    // Reset C to one as a whole element. Writing only limb 0 of the value that
+    // was already converted above and converting it a second time would leave
+    // C at an unintended value.
+    fp_mont_setone(AC.C.re);
+    fp_set(AC.C.im, 0);
+
+    fp_copy(tpFdiv2, THREEpFdiv2);
+    fp_copy(tpF, THREEpF);
+
+    copy_point(&PQ2.P, &P);
+    copy_point(&PQ2.Q, &Q);
+    copy_point(&PQ2.PmQ, &PQ);
+    k[1] = 0;
+    l[1] = 0;
+    k[0] = 0x02153E468B91C6D1;
+    l[0] = 0x02153E468B91C6D0;
+
+    for (int n = 0; n < TEST_LOOPS; n++)
+    {
+        k[0] -= 1;
+        l[0] -= 2;
+        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
+        ec_dlog_3(scalarP, scalarQ, &PQ2, &R, &AC);
+
+        // Same expected-value adjustment as for dlog2, with tpFdiv2 as the
+        // threshold and tpF as the value to subtract from.
+        memcpy(kt, k, sizeof kt);
+        memcpy(lt, l, sizeof lt);
+        if (compare_words(k, tpFdiv2, NWORDS_ORDER) == 1 ||
+           (compare_words(l, tpFdiv2, NWORDS_ORDER) == 1 && compare_words(k, zero, NWORDS_ORDER) == 0)) {
+            if (compare_words(k, zero, NWORDS_ORDER) != 0) {
+                sub_test(kt, tpF, kt, NWORDS_ORDER);
+            }
+            if (compare_words(l, zero, NWORDS_ORDER) != 0) {
+                sub_test(lt, tpF, lt, NWORDS_ORDER);
+            }
+        }
+        if (compare_words((digit_t*)scalarP, (digit_t*)kt, NWORDS_ORDER) != 0 || compare_words((digit_t*)scalarQ, (digit_t*)lt, NWORDS_ORDER) != 0) { passed = 0; break; }
+    }
+
+    if (passed == 1) printf("  dlog3 tests ..................................................... PASSED");
+    else { printf("  dlog3 tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    return true;
+}
+
+// ec_run - cycle-count benchmarks for xDBL, xADD, xMUL and xDBLMUL.
+//
+// Each primitive is called BENCH_LOOPS times; the cpucycles() difference
+// around every call is accumulated and the average is printed.
+//
+// Inputs are fully initialised before timing starts: the points come from the
+// xP2/xQ2/xPQ2 constants of test-basis.h (set up exactly as in dlog_run) and
+// the scalars are drawn once with fprandom_test. Timing uninitialised stack
+// contents, as an earlier revision did, is undefined behaviour.
+//
+// Returns: always true.
+bool ec_run(void)
+{
+    int n;
+    unsigned long long cycles, cycles1, cycles2;
+    ec_point_t P = {0}, Q = {0}, R = {0}, PQ = {0}, AC = {0};
+    digit_t k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
+        
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
+    printf("Benchmarking ecc arithmetic: \n\n"); 
+
+    // Input points: x converted with fp2_tomont, z set to one
+    fp2_tomont(&P.x, &xP2);
+    fp_mont_setone(P.z.re);
+    fp_set(P.z.im, 0);
+
+    fp2_tomont(&Q.x, &xQ2);
+    fp_mont_setone(Q.z.re);
+    fp_set(Q.z.im, 0);
+
+    fp2_tomont(&PQ.x, &xPQ2);
+    fp_mont_setone(PQ.z.re);
+    fp_set(PQ.z.im, 0);
+
+    // Curve coefficients held in a point-shaped variable: x stays zero, z = one.
+    // This mirrors the AC.z = 1 setup of the disabled ec_test code.
+    // NOTE(review): AC is later cast to ec_curve_t*; this assumes ec_point_t
+    // and ec_curve_t share their layout -- confirm against the ec headers.
+    fp_mont_setone(AC.z.re);
+    fp_set(AC.z.im, 0);
+
+    // Scalars, drawn once outside of the timed regions
+    fprandom_test(k); fprandom_test(l);
+
+    // Point doubling
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        xDBL(&Q, &P, &AC);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  Montgomery x-only doubling runs in .............................. %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // Point addition
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        xADD(&R, &Q, &P, &PQ);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  Montgomery x-only addition runs in .............................. %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // Point multiplication
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        xMUL(&Q, &P, k, (ec_curve_t*)&AC);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  Montgomery x-only scalar multiplication runs in ................. %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // Double-scalar point multiplication
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        xDBLMUL(&R, &P, k, &Q, l, &PQ, (ec_curve_t*)&AC);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  Montgomery x-only double-scalar multiplication runs in .......... %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    return true;
+}
+
+// dlog_run - cycle-count benchmarks for ec_dlog_2 and ec_dlog_3.
+//
+// For each of the two bases from test-basis.h (xP2/xQ2/xPQ2, then
+// xP3/xQ3/xPQ3) the benchmark runs BENCH_LOOPS iterations. Every iteration
+// draws fresh scalars with fprandom_test, builds R with xDBLMUL outside of the
+// timed region, and accumulates the cpucycles() difference around the dlog
+// call only. The average is printed per basis.
+//
+// Returns: always true.
+bool dlog_run(void)
+{
+    int n;
+    unsigned long long cycles, cycles1, cycles2;
+    ec_point_t P = {0}, Q = {0}, R = {0}, PQ = {0};
+    ec_curve_t AC = {0};
+    ec_basis_t PQ2;
+    digit_t scalarP[NWORDS_ORDER], scalarQ[NWORDS_ORDER], k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
+
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n");
+    printf("Benchmarking dlog2: \n\n");
+
+    // dlog2 computation
+
+    // Basis points: x converted with fp2_tomont, z set to one
+    fp2_tomont(&P.x, &xP2);
+    fp_mont_setone(P.z.re);
+    fp_set(P.z.im, 0);
+
+    fp2_tomont(&Q.x, &xQ2);
+    fp_mont_setone(Q.z.re);
+    fp_set(Q.z.im, 0);
+
+    fp2_tomont(&PQ.x, &xPQ2);
+    fp_mont_setone(PQ.z.re);
+    fp_set(PQ.z.im, 0);
+
+    // Curve: AC is zero-initialised, so setting limb 0 and converting yields
+    // C = 1. It is set up once and reused unchanged for the dlog3 benchmark.
+    AC.C.re[0] = 0x01;
+    fp2_tomont(&AC.C, &AC.C);
+
+    copy_point(&PQ2.P, &P);
+    copy_point(&PQ2.Q, &Q);
+    copy_point(&PQ2.PmQ, &PQ);
+
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        // NOTE(review): fprandom_test presumably fills a field-sized buffer;
+        // this assumes NWORDS_ORDER >= NWORDS_FIELD -- confirm.
+        fprandom_test(k); fprandom_test(l);
+        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
+        cycles1 = cpucycles();
+        ec_dlog_2(scalarP, scalarQ, &PQ2, &R, &AC);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  dlog2 runs in ................................................... %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // dlog3 computation
+
+    fp2_tomont(&P.x, &xP3);
+    fp_mont_setone(P.z.re);
+    fp_set(P.z.im, 0);
+
+    fp2_tomont(&Q.x, &xQ3);
+    fp_mont_setone(Q.z.re);
+    fp_set(Q.z.im, 0);
+
+    fp2_tomont(&PQ.x, &xPQ3);
+    fp_mont_setone(PQ.z.re);
+    fp_set(PQ.z.im, 0);
+
+    copy_point(&PQ2.P, &P);
+    copy_point(&PQ2.Q, &Q);
+    copy_point(&PQ2.PmQ, &PQ);
+
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        fprandom_test(k); fprandom_test(l);
+        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
+        cycles1 = cpucycles();
+        ec_dlog_3(scalarP, scalarQ, &PQ2, &R, &AC);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  dlog3 runs in ................................................... %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    return true;
+}
+
+#endif
--- a/src/ec/ref/lvl3/test/test-basis.h
+++ b/src/ec/ref/lvl3/test/test-basis.h
@@ -0,0 +1,24 @@
+#ifndef TEST_BASIS_H
+#define TEST_BASIS_H
+
+#include "fp2.h"
+// Full-torsion basis for A=0 (excluding 2^f and huge prime factors)
+const fp2_t xPA = {{0x35b53c72e7494775,0x5791b499bc29710d,0x2060f3aca68fa4ff,0x81150c19a14f523a,0x08af6c81a906d44a,0x00cca2a93efb536e},{0x14eaac356375af76,0x5655011e771be3b4,0x6273ccee274d7754,0x440d6b5b4496c183,0xa3d7f80e9f9111ba,0x0302e153bee01a18}};
+const fp2_t xQA = {{0x80c0767d1b7b5fd8,0x24e9039d430ca3b5,0x26485254625dc85a,0x612eaebc345b64d1,0x59669fbd946a4409,0x004c3a8564e16101},{0x0e1eac4e38449c54,0x752c042b4c6675cb,0x88ec0e75c8e9ea0e,0xbf7c4cdbfc4483f0,0xd594cb5474bbc264,0x02f5e2345a9b4654}};
+const fp2_t xPQA = {{0x1f5accaff9a7da90,0x91884964774d4cb2,0x0e938e13dd088e63,0x453c9af09879a724,0xb2bd09ec3740312b,0x0007a5837e23aaa1},{0x8e1ac4b319787bd4,0x7cb9fba402f67bfe,0x370b2951f9ec29cf,0x7a020172566f9d17,0x063e31753d703130,0x01551136265bade6}};
+
+const fp2_t xPB = {{0xb702a70a8ae132ad,0x56d8804c83a8e696,0x5ac3e12f4df1792e,0x0a89da435664746e,0xd8758765206844bd,0x01a92f6e9e0e9296},{0x8aaab711b76b0959,0x210e6695ca5e5fdd,0x593be0d75909ca12,0xfbc074d8ebdeb927,0xb61fcc328d3756bc,0x0198a5942855c8bf}};
+const fp2_t xQB = {{0x2b6b82b950b61fda,0x0ef2dd717daed334,0x99dee4db0b268ac9,0x3534eb384e1fcaf0,0xbaf112845a4f2d81,0x037f1492d8d815a1},{0x97e80590f9a0556b,0x7d9b4b87a22a7792,0xda4534fe75595b4b,0xbe1092a2733c03e1,0xbf5b1bd147b0d630,0x0125721476e5267f}};
+const fp2_t xPQB = {{0xb7d459a56d4aebec,0x6ac7f10ba20e1e71,0x9a95a8928507f7ef,0xc4c5aff6b97f3dfe,0x644beb3e86806b77,0x022319eb6eaf072a},{0x8ad0f6b18934790e,0xdad82b7b38e166bf,0xcb08f5a3ab53d9a9,0xd2ff39b401ba8aba,0xbff9b5e40ed9e5ce,0x03c1773791f554c0}};
+
+// 2^f-torsion basis for A=0
+const fp2_t xP2 = {{0x7a26fdb0e5844206,0x0752b2ba140f7dfd,0x1728013f8f5fe257,0xd05f129975ed6bba,0xe736dbce707ad5a8,0x01f861715896d0be},{0xdac046927a0c5352,0x5a42474ac156ff18,0xe887982ff4c5a9ea,0x3875be6432251f1c,0xdfae47315af877ee,0x005627f085582ecc}};
+const fp2_t xQ2 = {{0xc4f03ab3db57331b,0xf04261fc3b713778,0xa99b82430c7e40d1,0x5fe52b1324c2a091,0xfcaa2a7049d0f657,0x021f2caa09302141},{0x4a92a1d5ff9f6730,0x6dcd5f600f33783e,0xdb8b4e2e5149b45e,0x993458635c01d0c0,0x5f9bc7d3bb307f91,0x01fcc7eae4712b6a}};
+const fp2_t xPQ2 = {{0x7f4ee9c86c4341a2,0x0c867f482063bdfc,0xe46fb7b0fbd479c7,0xddaa716e091be9ad,0x29239eadddf5dc59,0x0231c09c660f0a89},{0xde64fa344dd64237,0xa89aaaed3dd84555,0xbb70924d8fb73f27,0x0869ec018b3366dc,0x47a0356ce742bcbc,0x00547dbda6dc094d}};
+
+// 3^g-torsion basis for A=0
+const fp2_t xP3 = {{0x7c878d0ceaa821f0,0xf94db4cab7186625,0x7cff6d5fb0ca7867,0x4e3f5bd19cbca9d6,0x05ec8273d0042548,0x0233a79cf87040b3},{0x060e9f3dcab8192c,0xa94e86d063a46398,0x0e5cc403bfb60867,0x3ea1277f98087283,0xaff1fd95bb094917,0x025041b12719d3b8}};
+const fp2_t xQ3 = {{0xb25aaa192bd351b7,0xc5db1962aed7e543,0x1f722ab174319947,0xd1c9bb4a0a5d8aa3,0x351415ec64f88921,0x0288ae044d62c930},{0xb41ede1724f8e06a,0xfb10ce5a83c66629,0x9846173e31a9d448,0x35c94966192f08db,0x72f7252946af3f9c,0x02ea05c971e7b34c}};
+const fp2_t xPQ3 = {{0x674703cc3134d90b,0x507e338e496b8f75,0x0c8cb1f138346e4c,0x54cb7ad5ba580da7,0x65750f0bcd0a9857,0x038b435f51669e87},{0xdcdc0116c67589a0,0x45ce94f4d345c827,0x0f2cbfb3c53b73ea,0x03e7951bc98efbb8,0x3335ad0991864858,0x01e151a64210f74f}};
+
+#endif
--- a/src/ec/ref/lvl5/CMakeLists.txt
+++ b/src/ec/ref/lvl5/CMakeLists.txt
@@ -0,0 +1,17 @@
+# Reference implementation of the elliptic-curve (EC) library for the
+# variant selected by SVARIANT_UPPER. Every source file is resolved relative
+# to ECX_DIR, which must already be set by an enclosing CMakeLists.txt.
+set(SOURCE_FILES_EC_${SVARIANT_UPPER}_REF
+    ${ECX_DIR}/poly-mul.c
+    ${ECX_DIR}/poly-redc.c
+    ${ECX_DIR}/ec.c
+    ${ECX_DIR}/tedwards.c
+    ${ECX_DIR}/kps.c
+    ${ECX_DIR}/xisog.c
+    ${ECX_DIR}/xeval.c
+    ${ECX_DIR}/isog_chains.c
+    ${ECX_DIR}/basis.c
+)
+
+# The target name is provided through LIB_EC_<VARIANT>.
+add_library(
+    ${LIB_EC_${SVARIANT_UPPER}}
+    ${SOURCE_FILES_EC_${SVARIANT_UPPER}_REF}
+)
+
+# Headers are only needed to build the library itself, hence PRIVATE.
+target_include_directories(
+    ${LIB_EC_${SVARIANT_UPPER}}
+    PRIVATE
+        ${INC_INTBIG}
+        ${INC_PRECOMP_${SVARIANT_UPPER}}
+        ${INC_PUBLIC}
+        ${INC_GF_${SVARIANT_UPPER}}
+        ${INC_COMMON}
+        ${INC_EC}
+)
+
+target_compile_options(
+    ${LIB_EC_${SVARIANT_UPPER}}
+    PRIVATE
+        ${C_OPT_FLAGS}
+)
+
+# Unit tests and benchmarks for this variant.
+add_subdirectory(test)
--- a/src/ec/ref/lvl5/test/CMakeLists.txt
+++ b/src/ec/ref/lvl5/test/CMakeLists.txt
@@ -0,0 +1,36 @@
+# Test executables for the EC module of the variant selected by
+# SVARIANT_LOWER / SVARIANT_UPPER, followed by their CTest registration.
+#
+# * Test sources are taken from ${ECX_DIR}/test.
+# * Include directories and link libraries are PRIVATE: nothing links against
+#   a test executable, so no usage requirement has to propagate.
+# * Relative include paths are spelled out against CMAKE_CURRENT_SOURCE_DIR
+#   (which is how target_include_directories() resolves them anyway).
+# * Tests are registered with add_test(NAME ... COMMAND ...), so that the
+#   executable *target* is resolved to its real output location instead of
+#   relying on ctest finding a binary of that name in the working directory.
+
+# --- fp2 arithmetic (only needs the GF library) ------------------------------
+add_executable(fp2.test_${SVARIANT_LOWER} ${ECX_DIR}/test/fp2-test.c)
+target_include_directories(fp2.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${INC_GF_${SVARIANT_UPPER}}
+        ${INC_PRECOMP_${SVARIANT_UPPER}}
+        ${PROJECT_SOURCE_DIR}/include
+        ${CMAKE_CURRENT_SOURCE_DIR}/../include
+        ${INC_EC}
+        ${INC_COMMON}
+)
+target_link_libraries(fp2.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${LIB_GF_${SVARIANT_UPPER}}
+)
+
+# --- polynomial multiplication -----------------------------------------------
+add_executable(poly-mul.test_${SVARIANT_LOWER} ${ECX_DIR}/test/poly-mul-test.c)
+target_include_directories(poly-mul.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${INC_GF_${SVARIANT_UPPER}}
+        ${INC_PRECOMP_${SVARIANT_UPPER}}
+        ${PROJECT_SOURCE_DIR}/include
+        ${CMAKE_CURRENT_SOURCE_DIR}/../include
+        ${INC_EC}
+        ${INC_COMMON}
+)
+target_link_libraries(poly-mul.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${LIB_GF_${SVARIANT_UPPER}}
+        ${LIB_EC_${SVARIANT_UPPER}}
+)
+
+# --- polynomial reduction ----------------------------------------------------
+add_executable(poly-redc.test_${SVARIANT_LOWER} ${ECX_DIR}/test/poly-redc-test.c)
+target_include_directories(poly-redc.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${INC_GF_${SVARIANT_UPPER}}
+        ${INC_PRECOMP_${SVARIANT_UPPER}}
+        ${PROJECT_SOURCE_DIR}/include
+        ${CMAKE_CURRENT_SOURCE_DIR}/../include
+        ${INC_EC}
+        ${INC_COMMON}
+)
+target_link_libraries(poly-redc.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${LIB_GF_${SVARIANT_UPPER}}
+        ${LIB_EC_${SVARIANT_UPPER}}
+)
+
+# --- Montgomery curve arithmetic ---------------------------------------------
+add_executable(mont.test_${SVARIANT_LOWER} ${ECX_DIR}/test/mont-test.c)
+target_include_directories(mont.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${INC_GF_${SVARIANT_UPPER}}
+        ${INC_INTBIG}
+        ${INC_PRECOMP_${SVARIANT_UPPER}}
+        ${PROJECT_SOURCE_DIR}/include
+        ${CMAKE_CURRENT_SOURCE_DIR}/../include
+        ${INC_EC}
+        ${INC_COMMON}
+        ${CMAKE_CURRENT_SOURCE_DIR}
+)
+target_link_libraries(mont.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${LIB_PRECOMP_${SVARIANT_UPPER}}
+        ${LIB_INTBIG}
+        ${LIB_GF_${SVARIANT_UPPER}}
+        ${LIB_EC_${SVARIANT_UPPER}}
+)
+
+# --- EC / dlog tests and benchmarks ------------------------------------------
+# This directory is on the include path so that the per-variant ec-tests.h
+# stored here is found.
+add_executable(ec.test_${SVARIANT_LOWER}
+    ${ECX_DIR}/test/ec-test.c
+    ${ECX_DIR}/test/test_extras.c
+)
+target_include_directories(ec.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${ECX_DIR}/test
+        ${INC_GF_${SVARIANT_UPPER}}
+        ${INC_INTBIG}
+        ${INC_PRECOMP_${SVARIANT_UPPER}}
+        ${PROJECT_SOURCE_DIR}/include
+        ${CMAKE_CURRENT_SOURCE_DIR}/../include
+        ${INC_EC}
+        ${INC_COMMON}
+        ${CMAKE_CURRENT_SOURCE_DIR}
+)
+target_link_libraries(ec.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${LIB_PRECOMP_${SVARIANT_UPPER}}
+        ${LIB_INTBIG}
+        ${LIB_GF_${SVARIANT_UPPER}}
+        ${LIB_EC_${SVARIANT_UPPER}}
+)
+
+# --- Velu formulas -----------------------------------------------------------
+add_executable(velu.test_${SVARIANT_LOWER} ${ECX_DIR}/test/velu-test.c)
+target_include_directories(velu.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${INC_GF_${SVARIANT_UPPER}}
+        ${INC_INTBIG}
+        ${INC_PRECOMP_${SVARIANT_UPPER}}
+        ${PROJECT_SOURCE_DIR}/include
+        ${CMAKE_CURRENT_SOURCE_DIR}/../include
+        ${INC_EC}
+        ${INC_COMMON}
+        ${CMAKE_CURRENT_SOURCE_DIR}
+)
+target_link_libraries(velu.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${LIB_PRECOMP_${SVARIANT_UPPER}}
+        ${LIB_INTBIG}
+        ${LIB_GF_${SVARIANT_UPPER}}
+        ${LIB_EC_${SVARIANT_UPPER}}
+)
+
+# --- isogeny chains ----------------------------------------------------------
+add_executable(isog.test_${SVARIANT_LOWER} ${ECX_DIR}/test/isog-test.c)
+target_include_directories(isog.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${INC_GF_${SVARIANT_UPPER}}
+        ${INC_INTBIG}
+        ${INC_PRECOMP_${SVARIANT_UPPER}}
+        ${PROJECT_SOURCE_DIR}/include
+        ${CMAKE_CURRENT_SOURCE_DIR}/../include
+        ${INC_EC}
+        ${INC_COMMON}
+        ${CMAKE_CURRENT_SOURCE_DIR}
+)
+target_link_libraries(isog.test_${SVARIANT_LOWER}
+    PRIVATE
+        ${LIB_PRECOMP_${SVARIANT_UPPER}}
+        ${LIB_INTBIG}
+        ${LIB_GF_${SVARIANT_UPPER}}
+        ${LIB_EC_${SVARIANT_UPPER}}
+)
+
+# --- CTest registration ------------------------------------------------------
+# Every test receives ${SQISIGN_TEST_REPS} on its command line; the ec test
+# additionally gets the literal argument "test" in front of it.
+add_test(NAME ec_fp2.test_${SVARIANT_LOWER}       COMMAND fp2.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
+add_test(NAME ec_poly-mul.test_${SVARIANT_LOWER}  COMMAND poly-mul.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
+add_test(NAME ec_poly-redc.test_${SVARIANT_LOWER} COMMAND poly-redc.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
+add_test(NAME ec_mont.test_${SVARIANT_LOWER}      COMMAND mont.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
+add_test(NAME ec_ec.test_${SVARIANT_LOWER}        COMMAND ec.test_${SVARIANT_LOWER} test ${SQISIGN_TEST_REPS})
+add_test(NAME ec_velu.test_${SVARIANT_LOWER}      COMMAND velu.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
+add_test(NAME ec_isog.test_${SVARIANT_LOWER}      COMMAND isog.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
--- a/src/ec/ref/lvl5/test/ec-tests.h
+++ b/src/ec/ref/lvl5/test/ec-tests.h
@@ -0,0 +1,400 @@
+#ifndef EC_TESTS_H
+#define EC_TESTS_H
+
+#include "test_extras.h"
+#include <stdio.h>
+#include <string.h>
+#include <bench.h>       //////// NOTE: enable later
+#include "test-basis.h"
+#include "ec_params.h"
+
+// Global constants
+extern const digit_t p[NWORDS_FIELD];
+
+// Benchmark and test parameters  
+static int BENCH_LOOPS = 1000;       // Number of iterations per bench
+static int TEST_LOOPS  = 512;       // Number of iterations per test
+
+
+// ec_test: placeholder for the x-only curve arithmetic self-test.
+// As written it only prints a banner announcing that the tests are not implemented and
+// returns true; every check (xDBL, xADD, xMUL, xDBLMUL against fixed vectors) sits inside
+// the disabled comment block below, so no arithmetic is exercised.
+// NOTE(review): the disabled vectors assign only limbs 0..3 of each coordinate, whereas the
+// constants in test-basis.h for this level carry 8 limbs per coordinate -- presumably the
+// vectors were carried over from a smaller parameter set; confirm and regenerate before
+// re-enabling.
+// Returns: always true.
+bool ec_test()
+{ // Tests for ecc arithmetic
+    bool OK = true;
+    int passed;                     // only referenced by the disabled block
+    ec_point_t P = {0}, Q = {0}, R = {0}, S = {0}, SS = {0}, PQ = {0};
+    ec_point_t AC = {0};            // the disabled block casts this to ec_curve_t* for xMUL/xDBLMUL
+    digit_t k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
+
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
+    printf("Testing ecc functions: (NOT IMPLEMENTED) \n\n"); 
+/*
+    // Point doubling
+    passed = 1;
+    P.x.re[0] = 0xDFD70ED0861BD329; P.x.re[1] = 0x20ACD3758C7F5540; P.x.re[2] = 0x3DCCDC007277F80A; P.x.re[3] = 0x18D6D2A22981DCE1;
+    P.x.im[0] = 0x3C23730A3F08F38C; P.x.im[1] = 0x98BB973AFD3D954D; P.x.im[2] = 0x8D98ADFC2829AE8A; P.x.im[3] = 0x21A2464D6369AFBA;
+    P.z.re[0] = 0x01;
+
+    AC.z.re[0] = 0x01;
+    fp2_tomont(&AC.z, &AC.z);
+        
+    fp2_tomont(&R.x, &P.x);
+    fp2_tomont(&R.z, &P.z);
+    xDBL(&S, &R, &AC);
+    fp2_copy(&SS.x, &S.x);    // Copy of S = SS <- 2P 
+    fp2_copy(&SS.z, &S.z);
+    fp2_inv(&S.z);
+    fp2_mul(&S.x, &S.x, &S.z);
+    fp2_frommont(&S.x, &S.x);
+
+    R.x.re[0] = 0x5950EE0A4AF90FC8; R.x.re[1] = 0x16488065A0A98B08; R.x.re[2] = 0xCE65322229DA0FD1; R.x.re[3] = 0x270A35FF781EE204;
+    R.x.im[0] = 0x564447FD9EC57F6B; R.x.im[1] = 0x2EE24E984294F729; R.x.im[2] = 0x53A6C7360E972C71; R.x.im[3] = 0x4FCF4B9928A7C7E;
+
+    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2)!=0) { passed=0; goto out0; }
+    
+    Q.x.re[0] = 0xC46076A670C70053; Q.x.re[1] = 0x97517AFA3AB9ED13; Q.x.re[2] = 0x349644C942EDF993; Q.x.re[3] = 0xBB4A4DB6F29AF9E;
+    Q.x.im[0] = 0x8B47629FB5A15BB0; Q.x.im[1] = 0x4EC6E809953C1A10; Q.x.im[2] = 0x1F83F0EC6CBB84D6; Q.x.im[3] = 0x1D8417C1D33265D3;
+    Q.z.re[0] = 0x01;
+
+    PQ.x.re[0] = 0x853F66D11BE5534F; PQ.x.re[1] = 0x27C8FD4E52D03D4A; PQ.x.re[2] = 0xF88EA78D0A0C29D2; PQ.x.re[3] = 0x2F6DFB07D397A067;
+    PQ.x.im[0] = 0xE8DBC4AA34434BA1; PQ.x.im[1] = 0x7A73AE182636F8A0; PQ.x.im[2] = 0x419EC260137868EB; PQ.x.im[3] = 0x129B3E301703D43F;
+    PQ.z.re[0] = 0x01;
+
+    fp2_tomont(&S.x, &Q.x);
+    fp2_tomont(&S.z, &Q.z);
+    fp2_tomont(&PQ.x, &PQ.x);
+    fp2_tomont(&PQ.z, &PQ.z);
+    xADD(&S, &SS, &S, &PQ);
+    fp2_inv(&S.z);
+    fp2_mul(&S.x, &S.x, &S.z);
+    fp2_frommont(&S.x, &S.x);
+
+    R.x.re[0] = 0xED0BEB8F93AB4FF9; R.x.re[1] = 0x27CF508B80CD49BF; R.x.re[2] = 0x38A6134DFA04B2BA; R.x.re[3] = 0x27B4CB15E109EF1F;
+    R.x.im[0] = 0x6F731BA6FD227BDE; R.x.im[1] = 0x14C12335341167F8; R.x.im[2] = 0xECA7B60F7866E27A; R.x.im[3] = 0x2A7A79A152880457;
+
+    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
+    
+    fp2_tomont(&R.x, &P.x);
+    fp2_tomont(&R.z, &P.z);
+    k[0] = 126;
+    xMUL(&S, &R, k, (ec_curve_t*)&AC);
+    fp2_inv(&S.z);
+    fp2_mul(&S.x, &S.x, &S.z);
+    fp2_frommont(&S.x, &S.x);
+
+    R.x.re[0] = 0xDE80F87A1203A147; R.x.re[1] = 0xD59E1215928A3B2D; R.x.re[2] = 0xD5A67F83A5A8CE46; R.x.re[3] = 0xA11E162488C9CDF;
+    R.x.im[0] = 0x9417D0D79A26741B; R.x.im[1] = 0x8B1F47D6F0FE5EEC; R.x.im[2] = 0xE52188DCB054CE36; R.x.im[3] = 0x1A8075A6C3148AB3;
+
+    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
+    
+    fp2_tomont(&R.x, &P.x);
+    fp2_tomont(&R.z, &P.z);
+    k[0] = 0xE77AD6B6C6B2D8CD;
+    k[1] = 0xDE43A0B600F38D12;
+    k[2] = 0xA35F4A7897E17CE2;
+    k[3] = 0x10ACB62E614D1237;
+    xMUL(&S, &R, k, (ec_curve_t*)&AC);
+    fp2_inv(&S.z);
+    fp2_mul(&S.x, &S.x, &S.z);
+    fp2_frommont(&S.x, &S.x);
+
+    R.x.re[0] = 0xD3938B0A68A3E7C0; R.x.re[1] = 0xE0667113208A0595; R.x.re[2] = 0x258F314C84E9CB60; R.x.re[3] = 0x14984BA7CA59AB71;
+    R.x.im[0] = 0xFE728423EE3BFEF4; R.x.im[1] = 0xBF68C42FE21AE0E4; R.x.im[2] = 0xA8FAF9C9528609CA; R.x.im[3] = 0x1225EC77A1DC0285;
+
+    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
+    
+    fp2_tomont(&R.x, &Q.x);
+    fp2_tomont(&R.z, &Q.z);
+    k[0] = 0xE77AD6B6C6B2D8CD;
+    k[1] = 0xDE43A0B600F38D12;
+    k[2] = 0xA35F4A7897E17CE2;
+    k[3] = 0x10ACB62E614D1237;
+    l[0] = 0x34AB78B6C6B2D8C0;
+    l[1] = 0xDE6B2D8CD00F38D1;
+    l[2] = 0xA35F4A7897E17CE2;
+    l[3] = 0x20ACF4A789614D13;
+    fp2_inv(&SS.z);
+    fp2_mul(&SS.x, &SS.x, &SS.z);
+    fp2_copy(&SS.z, &R.z);
+    xDBLMUL(&S, &R, k, &SS, l, &PQ, (ec_curve_t*)&AC);
+    fp2_inv(&S.z);
+    fp2_mul(&S.x, &S.x, &S.z);
+    fp2_frommont(&S.x, &S.x);
+
+    R.x.re[0] = 0x554E1ADC609B992F; R.x.re[1] = 0xE407D961F8CC4C42; R.x.re[2] = 0x1CF626AFED5A68CE; R.x.re[3] = 0x6D02692EE110483;
+    R.x.im[0] = 0x16FB094E831C8997; R.x.im[1] = 0xFDE4ECF31DC5F702; R.x.im[2] = 0x89303D868DFAD7B4; R.x.im[3] = 0xC91ACE81346F22D;
+
+    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
+    
+out0:
+    if (passed==1) printf("  ECC arithmetic tests ............................................ PASSED");
+    else { printf("  ECC arithmetic tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+ */
+    return OK;      // nothing was checked, so this is unconditionally true
+}
+
+// dlog_test: round-trip test for ec_dlog_2 and ec_dlog_3.
+// For TEST_LOOPS pairs of scalars (k, l) it forms R = [k]P + [l]Q with xDBLMUL on the
+// torsion basis taken from test-basis.h, asks the dlog routine for the coefficients of R
+// in that basis, and compares them with (k, l) after the normalisation explained below.
+// Prints one PASSED/FAILED line per routine.
+// Returns: true if both routines pass, false as soon as one of them fails.
+bool dlog_test()
+{ // Tests for dlog
+    bool OK = true;
+    int passed;
+    ec_point_t P = {0}, Q = {0}, R = {0}, PQ = {0};
+    ec_curve_t AC = {0};
+    ec_basis_t PQ2;
+    digit_t scalarP[NWORDS_ORDER], scalarQ[NWORDS_ORDER], k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
+    digit_t kt[NWORDS_ORDER], lt[NWORDS_ORDER], f1[NWORDS_ORDER] = {0}, f2[NWORDS_ORDER] = {0}, zero[NWORDS_ORDER] = {0}, tpFdiv2[NWORDS_ORDER] = {0}, tpF[NWORDS_ORDER] = {0};
+
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n");
+    printf("Testing dlog functions: \n\n");
+
+    // dlog2 testing
+    passed = 1;
+    
+    // Load the basis x-coordinates in Montgomery form with Z = 1.
+    fp2_tomont(&P.x, &xP2);
+    fp_mont_setone(P.z.re);
+    fp_set(P.z.im, 0);
+    
+    fp2_tomont(&Q.x, &xQ2);
+    fp_mont_setone(Q.z.re);
+    fp_set(Q.z.im, 0);
+    
+    fp2_tomont(&PQ.x, &xPQ2);
+    fp_mont_setone(PQ.z.re);
+    fp_set(PQ.z.im, 0);
+
+    // Curve constant C = 1 (AC starts zeroed, so A stays 0).
+    AC.C.re[0] = 0x01;
+    // NOTE(review): fp_copy moves NWORDS_FIELD limbs into NWORDS_ORDER-sized buffers;
+    // this is only safe if NWORDS_ORDER >= NWORDS_FIELD -- confirm for this level.
+    fp_copy(f1, TWOpFm1);
+    fp_copy(f2, TWOpF);
+    fp2_tomont(&AC.C, &AC.C);
+
+    copy_point(&PQ2.P, &P);
+    copy_point(&PQ2.Q, &Q);
+    copy_point(&PQ2.PmQ, &PQ);
+    k[0] = 0xFFFFFFFFFFFFFFFF;
+    k[1] = 0x00000000000007FF;
+    l[0] = 0xFFFFFFFFFFFFFFFE;
+    l[1] = 0x00000000000007FF;
+
+    for (int n = 0; n < TEST_LOOPS; n++)
+    {
+        k[0] -= 1;
+        l[0] -= 2;
+        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
+        ec_dlog_2(scalarP, scalarQ, &PQ2, &R, &AC);
+
+        // x-only points do not distinguish R from -R, so the expected pair is normalised:
+        // when k (or l, if k is 0 or equal to f1) exceeds f1, non-zero scalars are passed
+        // through sub_test together with f2 (presumably kt <- f2 - kt; confirm in test_extras).
+        memcpy(kt, k, NWORDS_ORDER*RADIX/8);
+        memcpy(lt, l, NWORDS_ORDER*RADIX/8);
+        if (compare_words(k, f1, NWORDS_ORDER) == 1 ||
+           (compare_words(l, f1, NWORDS_ORDER) == 1 && (compare_words(k, zero, NWORDS_ORDER) == 0 || compare_words(k, f1, NWORDS_ORDER) == 0))) {
+            if (compare_words(k, zero, NWORDS_ORDER) != 0) {
+                sub_test(kt, f2, kt, NWORDS_ORDER);
+            }
+            if (compare_words(l, zero, NWORDS_ORDER) != 0) {
+                sub_test(lt, f2, lt, NWORDS_ORDER);
+            }
+        }
+        if (compare_words((digit_t*)scalarP, (digit_t*)kt, NWORDS_ORDER) != 0 || compare_words((digit_t*)scalarQ, (digit_t*)lt, NWORDS_ORDER) != 0) { passed = 0; break; }
+    }
+
+    if (passed == 1) printf("  dlog2 tests ..................................................... PASSED");
+    else { printf("  dlog2 tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // dlog3 testing
+    passed = 1;
+    
+    fp2_tomont(&P.x, &xP3);
+    fp_mont_setone(P.z.re);
+    fp_set(P.z.im, 0);
+    
+    fp2_tomont(&Q.x, &xQ3);
+    fp_mont_setone(Q.z.re);
+    fp_set(Q.z.im, 0);
+    
+    fp2_tomont(&PQ.x, &xPQ3);
+    fp_mont_setone(PQ.z.re);
+    fp_set(PQ.z.im, 0);
+
+    // Reset C to 1 in Montgomery form. AC.C already holds a Montgomery-form value from the
+    // dlog2 section, so overwriting only limb 0 and converting again would not yield 1;
+    // set every limb explicitly instead.
+    fp_mont_setone(AC.C.re);
+    fp_set(AC.C.im, 0);
+    fp_copy(tpFdiv2, THREEpFdiv2);
+    fp_copy(tpF, THREEpF);
+
+    copy_point(&PQ2.P, &P);
+    copy_point(&PQ2.Q, &Q);
+    copy_point(&PQ2.PmQ, &PQ);
+    k[1] = 0;
+    l[1] = 0;
+    k[0] = 0x02153E468B91C6D1;
+    l[0] = 0x02153E468B91C6D0;
+
+    for (int n = 0; n < TEST_LOOPS; n++)
+    {
+        k[0] -= 1;
+        l[0] -= 2;
+        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
+        ec_dlog_3(scalarP, scalarQ, &PQ2, &R, &AC);
+
+        // Same sign normalisation as above, with tpFdiv2 as threshold and tpF as modulus.
+        memcpy(kt, k, NWORDS_ORDER*RADIX/8);
+        memcpy(lt, l, NWORDS_ORDER*RADIX/8);
+        if (compare_words(k, tpFdiv2, NWORDS_ORDER) == 1 ||
+           (compare_words(l, tpFdiv2, NWORDS_ORDER) == 1 && compare_words(k, zero, NWORDS_ORDER) == 0)) {
+            if (compare_words(k, zero, NWORDS_ORDER) != 0) {
+                sub_test(kt, tpF, kt, NWORDS_ORDER);
+            }
+            if (compare_words(l, zero, NWORDS_ORDER) != 0) {
+                sub_test(lt, tpF, lt, NWORDS_ORDER);
+            }
+        }
+        if (compare_words((digit_t*)scalarP, (digit_t*)kt, NWORDS_ORDER) != 0 || compare_words((digit_t*)scalarQ, (digit_t*)lt, NWORDS_ORDER) != 0) { passed = 0; break; }
+    }
+
+    if (passed == 1) printf("  dlog3 tests ..................................................... PASSED");
+    else { printf("  dlog3 tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    return OK;
+}
+
+// ec_run: cycle-count benchmarks for the x-only primitives xDBL, xADD, xMUL and xDBLMUL.
+// Each primitive is called BENCH_LOOPS times between two cpucycles() readings and the
+// average is printed. Results of the primitives are not checked.
+// All operands are zero-initialised so the benchmarked routines never read indeterminate
+// memory and the run is reproducible.
+// Returns: always true.
+bool ec_run()
+{
+    bool OK = true;
+    int n;
+    unsigned long long cycles, cycles1, cycles2;
+    // AC is an ec_point_t because xDBL takes it as such; it is cast to ec_curve_t* for
+    // xMUL/xDBLMUL below (NOTE(review): relies on the two types being layout-compatible).
+    ec_point_t P = {0}, Q = {0}, R = {0}, PQ = {0}, AC = {0};
+    digit_t k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
+        
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
+    printf("Benchmarking ecc arithmetic: \n\n"); 
+
+    // Point doubling
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        xDBL(&Q, &P, &AC);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    printf("  Montgomery x-only doubling runs in .............................. %7lld cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // Point addition
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        xADD(&R, &Q, &P, &PQ);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  Montgomery x-only addition runs in .............................. %7lld cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // Point multiplication
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        xMUL(&Q, &P, k, (ec_curve_t*)&AC);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  Montgomery x-only scalar multiplication runs in ................. %7lld cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // Double-scalar point multiplication
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        xDBLMUL(&R, &P, k, &Q, l, &PQ, (ec_curve_t*)&AC);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  Montgomery x-only double-scalar multiplication runs in .......... %7lld cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    return OK;
+}
+
+// dlog_run: cycle-count benchmarks for ec_dlog_2 and ec_dlog_3.
+// For each routine the matching torsion basis from test-basis.h is loaded, then
+// BENCH_LOOPS times a point R = [k]P + [l]Q is built from fresh fprandom_test scalars
+// (outside the timed region) and only the dlog call itself is timed with cpucycles().
+// The recovered scalars are not checked here.
+// Returns: always true.
+bool dlog_run()
+{
+    bool OK = true;
+    unsigned long long total, t_start, t_stop;
+    ec_point_t P = {0}, Q = {0}, R = {0}, PQ = {0};
+    ec_curve_t AC = {0};
+    ec_basis_t PQ2;
+    digit_t scalarP[NWORDS_ORDER], scalarQ[NWORDS_ORDER], k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
+
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n");
+    printf("Benchmarking dlog2: \n\n");
+
+    // ---- dlog2: basis x-coordinates in Montgomery form, Z = 1 ----
+    fp2_tomont(&P.x, &xP2);
+    fp2_tomont(&Q.x, &xQ2);
+    fp2_tomont(&PQ.x, &xPQ2);
+
+    fp_mont_setone(P.z.re);
+    fp_set(P.z.im, 0);
+    fp_mont_setone(Q.z.re);
+    fp_set(Q.z.im, 0);
+    fp_mont_setone(PQ.z.re);
+    fp_set(PQ.z.im, 0);
+
+    // Curve constant C = 1; set once and reused for the dlog3 run further down.
+    AC.C.re[0] = 0x01;
+    fp2_tomont(&AC.C, &AC.C);
+
+    copy_point(&PQ2.P, &P);
+    copy_point(&PQ2.Q, &Q);
+    copy_point(&PQ2.PmQ, &PQ);
+
+    total = 0;
+    for (int iter = 0; iter < BENCH_LOOPS; iter++)
+    {
+        // Untimed setup: random scalars and the target point.
+        fprandom_test(k);
+        fprandom_test(l);
+        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
+
+        t_start = cpucycles();
+        ec_dlog_2(scalarP, scalarQ, &PQ2, &R, &AC);
+        t_stop = cpucycles();
+        total += t_stop - t_start;
+    }
+    printf("  dlog2 runs in ................................................... %7lld cycles", total/BENCH_LOOPS);
+    printf("\n");
+
+    // ---- dlog3: same procedure with the second basis ----
+    fp2_tomont(&P.x, &xP3);
+    fp2_tomont(&Q.x, &xQ3);
+    fp2_tomont(&PQ.x, &xPQ3);
+
+    fp_mont_setone(P.z.re);
+    fp_set(P.z.im, 0);
+    fp_mont_setone(Q.z.re);
+    fp_set(Q.z.im, 0);
+    fp_mont_setone(PQ.z.re);
+    fp_set(PQ.z.im, 0);
+
+    copy_point(&PQ2.P, &P);
+    copy_point(&PQ2.Q, &Q);
+    copy_point(&PQ2.PmQ, &PQ);
+
+    total = 0;
+    for (int iter = 0; iter < BENCH_LOOPS; iter++)
+    {
+        fprandom_test(k);
+        fprandom_test(l);
+        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
+
+        t_start = cpucycles();
+        ec_dlog_3(scalarP, scalarQ, &PQ2, &R, &AC);
+        t_stop = cpucycles();
+        total += t_stop - t_start;
+    }
+    printf("  dlog3 runs in ................................................... %7lld cycles", total/BENCH_LOOPS);
+    printf("\n");
+
+    return OK;
+}
+
+#endif
--- a/src/ec/ref/lvl5/test/test-basis.h
+++ b/src/ec/ref/lvl5/test/test-basis.h
@@ -0,0 +1,24 @@
+#ifndef TEST_BASIS_H
+#define TEST_BASIS_H
+
+#include "fp2.h"
+// Full-torsion basis for A=0 (excluding 2^f and huge prime factors)
+const fp2_t xPA = {{0x3c780e636a5869dc,0xb8a1d106332efe8e,0x7dd946e490e6578e,0x71d1fadbea881f88,0xb94912baba3999f0,0x85343be0a74ca9e1,0x22ae01775a9f7fa4,0x001032ffab70a66e},{0x15908a4b85221a67,0x342f82e6a1db4e1d,0x3d7c806a0d47b041,0x693830fad798c598,0xcfa244134a61827a,0x7f723d6f5d9628cf,0x10da657833d4d027,0x000c48499df01216}};
+const fp2_t xQA = {{0x79a766df9c10c642,0x7677cb85097be8be,0x2a21c7f9b84b9deb,0xb263e837f57210ce,0x551d6636b7c7e061,0x78d332581bee10b2,0xce30a9926772e06c,0x00150b5009b1d6ed},{0xbb2f097dae470eb9,0x53940c6df1eb93a9,0x7786a4bab87320c1,0x89d32acc1c91db18,0x733ef7f139fb7f9b,0x7bc336ee25a3901b,0xf7dfe8f5559eeeb1,0x00210555ab63e7f3}};
+const fp2_t xPQA = {{0x315ead6fadc8b0d6,0x7da37e8b7e94de95,0xcc6a9e206f513651,0x84fa9fab584acf3d,0x293b25689ac50519,0xe3222bd1c8154964,0x8ad7f39d04a8274f,0x000898edca69c223},{0x3e6c3e1864851e7e,0x01807c724f75ad5e,0xe9cd50eff4e66fb7,0x6c7c19a88fed9707,0x3ab57d0499386a40,0x6b5fd53c6efdc0b5,0x092fe030da27bc43,0x00076f2f409c5f8e}};
+
+const fp2_t xPB = {{0x229e388475511856,0x2f6b17e9ec9258c0,0x0cb28c568697f9f4,0xca039e28512c9f9b,0xd52d823761b0daa2,0xa09c3800e22c5e3b,0x2971022668c3b76a,0x0006e91c4415afd1},{0xbd5059b7406e1dcd,0x9da456ed8c11f1a3,0x1fb30e9cf66f928e,0x867c348b2f488d26,0x9d4b03d8aa4229bc,0x1c01ca1088d145a8,0xc9d6a201d77644a1,0x000a0d45131bf5b0}};
+const fp2_t xQB = {{0x712f0e5d0e3b4dfa,0x52260082dda1a07e,0x5a7513dcfd273829,0xc686f0976cbb5dcf,0xf5fc3df004cc7efc,0x615d0c2da4f2fb9f,0x796efbb3f65aede8,0x00028176c42e1d9f},{0xb8779b5a7bd2436b,0x4067b7e09d0ca56c,0xfdbaee6ff27ebe38,0x69310e98174025de,0x71960a10fa15706e,0x08ffb4b3f6efafbf,0xb7116ca162211ea3,0x00253c0f60765f1f}};
+const fp2_t xPQB = {{0x0e90506c89b46e0c,0x24ec65d5deb4e5b9,0x8477f7e141db8725,0xf76957ec1940dbd3,0xc2857af32534e715,0x06820654c6bae5f4,0x5ac928ef3c90c1f8,0x0024f724366faeed},{0xf6d7d2fdb06b91c4,0xe603cf05ce3f7555,0x8a0876277637415c,0xa1ef891f00155f8f,0x159db3ac93d39d57,0x5a05683aeaa453ff,0x180c38da2402f6fc,0x000b69d01dcb9107}};
+
+// 2^f-torsion basis for A=0
+const fp2_t xP2 = {{0x5d453ee3e6de9bf6,0xb5e51a5e88d8bbf3,0xc91ce6ef41eda957,0x4e0ba74e86fd3385,0xeff87c1def35e01f,0xedcd6c20496988a5,0x91a2c14abdb955fe,0x000be92a3f4de175},{0xa8a13d8e0022a825,0xb26bb70885d42bef,0x2533c31e799596b4,0xc41d58b247fb5ac9,0x8d45fa188fd5cb65,0x1b0593f6e4af948d,0x0ede22e4fcbe17ca,0x0014f54c5d5e1308}};
+const fp2_t xQ2 = {{0x90414b2365f868cd,0x68af18688f73fe25,0x46ca4c4b4ca19114,0xadae5e2564f79c98,0xfe3e09af9d00eb08,0x6856810a298a57bf,0x170d41ba9327205d,0x001d588b6744b4ea},{0xfb94e978bcf29be5,0x136700c07b264bd6,0x62a3c89d8466b8f9,0x9f990ca7d3084bd8,0xaab6fb1040e242d0,0x9e9325c5a5c20740,0xa9a6ee97f376e198,0x0003c8eee3581511}};
+const fp2_t xPQ2 = {{0x873d426c501eafe6,0xdeb1e87769484669,0x57c38f42bd1fef4d,0x53ca12d14b2ded18,0xb72ef4a808fc9d70,0x59d9a54b1844cca1,0x6ca7ccb15b6a9e49,0x00132a12929654f7},{0xffc6b824b6603270,0xb4152cbd3b607298,0xbe97764acdcb16ce,0x5205b1ec222c3be9,0x0cf5ac18d1eb4984,0xf5233664fd72c328,0x492e775887a3367c,0x001ce6bdfc847b45}};
+
+// 3^g-torsion basis for A=0
+const fp2_t xP3 = {{0x807a6abcb56d1915,0x3ab8ff7df809ea8f,0x2bd4f1eba48b23ac,0xeb32542370dde5ff,0xe6c50551eaaf2329,0x545dceaf98f07f09,0x90bfb0e10f3e5b48,0x000cc0084da1b367},{0xbd6f9c82cd4acc13,0x9b39d0711267d8a2,0x0ff31ab9fd38bb36,0xccc169cd75c1a58b,0xd943ad3571e304b4,0xfc3cda0859595d00,0xabda66362732b019,0x00070c5abcf1f329}};
+const fp2_t xQ3 = {{0x2b46bbfa6e57a9db,0xa7a5881479d3aaff,0x5c8106d57698b7cb,0xde0ccd3c436cd1ad,0xed351e8fbc28fd8f,0xe18a9a18e4f5bf03,0x9a98961a81073911,0x001ed93f47abe8f2},{0x5dc96ddee6e9a9eb,0x5e8905d15b918006,0xe89cecdc3f9b48f1,0x9d1a98543001e35e,0x0795c7b134dadeba,0x8050c48376f36d87,0xe9f364f7c6fbee1f,0x00061cb05b384f81}};
+const fp2_t xPQ3 = {{0xd44970f662987227,0x4c8eda7256920e8d,0x857f42e972e25a0e,0xc66a5b62daa3644d,0x6ab4ded74a464c38,0x4157cc1048b85a3a,0x9916ab1ee4e2305a,0x000c6943137ffba1},{0x0c5118f818e5279d,0xacb0c4a011613c7a,0xb87b4a9cb16a7565,0xc997ccbe0159f318,0x6fc50720bce6f45f,0xbd1916a5ca7789d7,0x3f48f437fdeccc64,0x000674d925340bc4}};
+
+#endif
--- a/src/gf/CMakeLists.txt
+++ b/src/gf/CMakeLists.txt
@@ -0,0 +1 @@
+# Delegate to the CMake script whose path is stored in SELECT_IMPL_TYPE
+# (the variable is set outside this file).
+include(${SELECT_IMPL_TYPE})
--- a/src/gf/broadwell/CMakeLists.txt
+++ b/src/gf/broadwell/CMakeLists.txt
@@ -0,0 +1 @@
+# Delegate to the CMake script whose path is stored in SELECT_SQISIGN_VARIANT
+# (the variable is set outside this file).
+include(${SELECT_SQISIGN_VARIANT})
--- a/src/gf/broadwell/lvl1/CMakeLists.txt
+++ b/src/gf/broadwell/lvl1/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+# Sources of the Broadwell finite-field library for this variant:
+# the assembly kernels plus the C files for GF(p) and GF(p^2).
+set(SOURCE_FILES_GF_${SVARIANT_UPPER}_BROADWELL
+    fp_asm.S fp.c fp2.c
+)
+
+add_library(${LIB_GF_${SVARIANT_UPPER}} ${SOURCE_FILES_GF_${SVARIANT_UPPER}_BROADWELL})
+# Header search path used only while building the library itself.
+# ${INC_COMMON} is listed once; the second, redundant occurrence was dropped.
+target_include_directories(${LIB_GF_${SVARIANT_UPPER}} PRIVATE
+    common
+    ${INC_COMMON}
+    ${INC_PRECOMP_${SVARIANT_UPPER}}
+    include
+    ${PROJECT_SOURCE_DIR}/include
+)
+target_compile_options(${LIB_GF_${SVARIANT_UPPER}} PRIVATE ${C_OPT_FLAGS})
+
+add_subdirectory(test)
--- a/src/gf/broadwell/lvl1/Makefile
+++ b/src/gf/broadwell/lvl1/Makefile
@@ -0,0 +1,46 @@
+
+# Stand-alone build of the lvl1 Broadwell field arithmetic and its two test programs.
+CC=gcc
+CFLAGS= -O3 -std=gnu11 -Wall -march=native -Wno-missing-braces -Wno-logical-not-parentheses 
+LDFLAGS=-lm
+AR=ar rcs
+RANLIB=ranlib
+
+OBJECTS=objs/fp_p1913.o objs/fp.o objs/fp2.o objs/fp_asm.o objs/random.o
+
+all: lib tests
+	
+# Every object rule creates objs/ itself so that any single object can be built on its
+# own and parallel builds (make -j) do not depend on another rule having run first.
+objs/fp_p1913.o: fp_p1913.c
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) fp_p1913.c -o objs/fp_p1913.o
+	
+objs/fp.o: fp.c
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) fp.c -o objs/fp.o
+	
+objs/fp2.o: fp2.c
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) fp2.c -o objs/fp2.o
+
+objs/fp_asm.o: fp_asm.S
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) fp_asm.S -o objs/fp_asm.o
+
+objs/random.o: ../../../common/generic/randombytes_system.c
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) ../../../common/generic/randombytes_system.c -o objs/random.o
+
+# Static archive the test programs link against (recreated from scratch each time).
+lib: $(OBJECTS)
+	rm -rf lib
+	mkdir lib
+	$(AR) lib/libtest.a $^
+	$(RANLIB) lib/libtest.a
+
+# Test programs; both need GMP in addition to the archive above.
+tests: lib
+	$(CC) $(CFLAGS) -L./lib test/test_fp.c test/test_extras.c -ltest $(LDFLAGS) -o test_fp -lgmp
+	$(CC) $(CFLAGS) -L./lib test/test_fp2.c test/test_extras.c -ltest $(LDFLAGS) -o test_fp2 -lgmp
+
+check: tests
+
+# Targets that never produce a file of the same name.
+.PHONY: all tests check clean
+
+clean:
+	rm -rf *.req objs lib test_fp*
+
--- a/src/gf/broadwell/lvl1/fp.c
+++ b/src/gf/broadwell/lvl1/fp.c
@@ -0,0 +1,192 @@
+#include "include/fp.h"
+
+// Field prime p, stored as 64-bit limbs with the least-significant limb first
+// (fp_neg walks it from index 0 upward while propagating a borrow).
+const uint64_t p[NWORDS_FIELD] =  { 0xffffffffffffffff, 0x252C9E49355147FF, 0x33A6A86587407437, 0x34E29E286B95D98C };
+// Multiplier used by fp_tomont: per that function's contract this is R^2 mod p.
+const uint64_t R2[NWORDS_FIELD] = { 0x233625AE400674D4, 0x20AFD6C1025A1C2E, 0x30A841AB0920655D, 0x0D72E7D67C30CD3D };
+// NOTE(review): not referenced in this file; presumably the Montgomery reduction
+// constant -p^(-1) mod 2^64 (which is 1 because the low limb of p is 2^64-1) -- confirm.
+const uint64_t pp[NWORDS_FIELD] = { 0x01, 0x00, 0x00, 0x00 };
+
+
+void fp_set(digit_t* x, const digit_t val)
+{ // Load a single-word value into a field element: x = val.
+  // The value is stored as-is (no Montgomery conversion); all higher limbs are cleared.
+
+    for (unsigned int limb = NWORDS_FIELD - 1; limb > 0; limb--) {
+        x[limb] = 0;
+    }
+    x[0] = val;
+}
+
+void fp_mont_setone(digit_t* out1) {
+    // Writes the fixed four-limb constant below (least-significant limb first) to out1.
+    // Going by the function name this is the value 1 in Montgomery representation.
+    static const digit_t mont_one[4] = {
+        0x4,
+        UINT64_C(0x6b4d86db2abae000),
+        UINT64_C(0x31655e69e2fe2f23),
+        UINT64_C(0x2c75875e51a899cf)
+    };
+
+    for (unsigned int limb = 0; limb < 4; limb++) {
+        out1[limb] = mont_one[limb];
+    }
+}
+
+bool fp_is_equal(const digit_t* a, const digit_t* b)
+{ // Equality test of two field elements without data-dependent branches.
+  // Every limb pair is XORed and the differences are OR-accumulated, so all limbs are
+  // always visited. Returns 1 (true) if a=b, 0 (false) otherwise.
+    digit_t diff = 0;
+    unsigned int limb = NWORDS_FIELD;
+
+    while (limb-- > 0) {
+        diff |= a[limb] ^ b[limb];
+    }
+
+    return (bool)is_digit_zero_ct(diff);
+}
+
+bool fp_is_zero(const digit_t* a)
+{ // Zero test of a field element: all limbs are OR-ed together and the result is
+  // checked once. Returns 1 (true) if a=0, 0 (false) otherwise.
+    digit_t acc = 0;
+    unsigned int limb = NWORDS_FIELD;
+
+    while (limb-- > 0) {
+        acc |= a[limb];
+    }
+
+    return (bool)is_digit_zero_ct(acc);
+}
+
+void fp_copy(digit_t* out, const digit_t* a)
+{ // Copy one field element: NWORDS_FIELD*RADIX/8 bytes from a to out.
+    const size_t nbytes = NWORDS_FIELD*RADIX/8;
+
+    memcpy(out, a, nbytes);
+}
+
+void fp_neg(digit_t* out, const digit_t* a)
+{ // Modular negation, out = -a mod p
+  // Input: a in [0, p-1] 
+  // Output: out in [0, p-1] 
+    unsigned int i, borrow = 0;
+
+    // Limb-by-limb pass over p and a with a running borrow (intended result: p - a,
+    // which lies in [1, p] for the documented input range).
+    for (i = 0; i < NWORDS_FIELD; i++) {
+        SUBC(out[i], borrow, ((digit_t*)p)[i], a[i], borrow);
+    }
+    // Bring the a = 0 case (intermediate value p) back to 0: fp_sub subtracts p and adds
+    // p again only if that subtraction borrowed, so values in [1, p-1] are left unchanged.
+    fp_sub(out, out, (digit_t*)p);
+}
+
+void fp_tomont(digit_t* out, const digit_t* a)
+{ // Standard -> Montgomery representation.
+  // One Montgomery multiplication by the constant R2 (R^2 mod p):
+  // out = a*R^2*R^(-1) mod p = a*R mod p, for a in [0, p-1].
+    digit_t* r_squared = (digit_t*)R2;
+
+    fp_mul(out, a, r_squared);
+}
+
+void fp_frommont(digit_t* out, const digit_t* a)
+{ // Montgomery -> standard representation.
+  // One Montgomery multiplication by the plain integer 1:
+  // out = a*R^(-1) mod p, for a in [0, p-1].
+    digit_t unit[NWORDS_FIELD] = {1};    // limb 0 is 1, remaining limbs are zero
+
+    fp_mul(out, a, unit);
+}
+
+void MUL(digit_t* out, const digit_t a, const digit_t b)
+{ // Digit multiplication, digit*digit -> 2-digit result 
+  // Portable schoolbook product built from four half-word multiplications.
+  // Inputs: a, b in [0, 2^w-1], where w is the computer wordsize 
+  // Output: out[0] receives the low word and out[1] the high word of a*b,
+  //         i.e. 0 <= out <= (2^w-1)^2
+    register digit_t al, ah, bl, bh, temp;
+    digit_t albl, albh, ahbl, ahbh, res1, res2, res3, carry;
+    // sizeof(digit_t)*4 is half the word size in bits: the masks select the low/high half-word.
+    digit_t mask_low = (digit_t)(-1) >> (sizeof(digit_t)*4), mask_high = (digit_t)(-1) << (sizeof(digit_t)*4);
+
+    al = a & mask_low;                        // Low half of a
+    ah = a >> (sizeof(digit_t)*4);            // High half of a
+    bl = b & mask_low;                        // Low half of b
+    bh = b >> (sizeof(digit_t)*4);            // High half of b
+
+    // The four partial products; each fits in one word because its factors are half-words.
+    albl = al * bl;
+    albh = al * bh;
+    ahbl = ah * bl;
+    ahbh = ah * bh;
+    out[0] = albl & mask_low;                 // out00: lowest half-word of the result
+
+    // Second half-word: high half of al*bl plus the low halves of both cross products.
+    res1 = albl >> (sizeof(digit_t)*4);
+    res2 = ahbl & mask_low;
+    res3 = albh & mask_low;
+    temp = res1 + res2 + res3;
+    carry = temp >> (sizeof(digit_t)*4);      // overflow of this column, fed into the next one
+    out[0] ^= temp << (sizeof(digit_t)*4);    // out01   
+
+    // Third half-word: high halves of the cross products, low half of ah*bh, and the carry.
+    res1 = ahbl >> (sizeof(digit_t)*4);
+    res2 = albh >> (sizeof(digit_t)*4);
+    res3 = ahbh & mask_low;
+    temp = res1 + res2 + res3 + carry;
+    out[1] = temp & mask_low;                 // out10 
+    carry = temp & mask_high;                 // kept in place (upper half), so it can be added directly below
+    out[1] ^= (ahbh & mask_high) + carry;     // out11
+}
+
+digit_t mp_shiftr(digit_t* x, const unsigned int shift, const unsigned int nwords)
+{ // In-place multiprecision right shift of the nwords-limb integer x by `shift` bits.
+  // Returns the least-significant bit x had before the shift.
+    const digit_t lsb_before = x[0] & 1;
+    const unsigned int top = nwords-1;
+
+    // Walk from the low limb upwards; each limb is refilled from its upper neighbour.
+    for (unsigned int w = 0; w < top; w++) {
+        SHIFTR(x[w+1], x[w], shift, x[w], RADIX);
+    }
+    x[top] >>= shift;
+
+    return lsb_before;
+}
+
+void mp_shiftl(digit_t* x, const unsigned int shift, const unsigned int nwords)
+{ // In-place multiprecision left shift of the nwords-limb integer x by `shift` bits.
+  // Bits shifted out of the top limb are discarded.
+
+    // Walk from the top limb downwards; each limb is refilled from its lower neighbour.
+    for (unsigned int w = nwords; w > 1; w--) {
+        SHIFTL(x[w-1], x[w-2], shift, x[w-1], RADIX);
+    }
+    x[0] <<= shift;
+}
+
+static void fp_exp3div4(digit_t* out, const digit_t* a)
+{ // Fixed exponentiation out = a^((p-3)/4) mod p, right-to-left square-and-multiply.
+  // Input: a in [0, p-1] 
+  // Output: out in [0, p-1] 
+  // Requirement: p = 3(mod 4), so that dropping the two low bits of p gives (p-3)/4.
+  // The multiply is skipped on zero exponent bits; the exponent is derived from p only.
+    fp_t exponent, base;
+
+    fp_copy(exponent, (digit_t*)p);
+    fp_copy(base, a);
+
+    // exponent <- p >> 2
+    mp_shiftr(exponent, 1, NWORDS_FIELD);
+    mp_shiftr(exponent, 1, NWORDS_FIELD);
+
+    // out <- 1 in Montgomery form
+    fp_set(out, 1);
+    fp_tomont(out, out);
+
+    for (int remaining = NWORDS_FIELD*RADIX-2; remaining > 0; remaining--) {
+        // mp_shiftr hands back the bit it is about to shift out.
+        const digit_t lsb = mp_shiftr(exponent, 1, NWORDS_FIELD);
+
+        if (lsb == 1) {
+            fp_mul(out, out, base);
+        }
+        fp_sqr(base, base);
+    }
+}
+
+void fp_inv(digit_t* a)
+{ // In-place modular inversion: a <- a^(p-2) mod p.
+  // Input: a=xR in [0, p-1] (Montgomery form)
+  // Output: a = x^-1*R mod p in [0, p-1]; 0 if the input has no inverse
+  // Requirement: Ceiling(Log(p)) < w*nwords
+  // Exponent chain: (a^((p-3)/4))^4 = a^(p-3), times a gives a^(p-2).
+    fp_t acc;
+
+    fp_exp3div4(acc, a);        // a^((p-3)/4)
+    fp_sqr(acc, acc);           // a^((p-3)/2)
+    fp_sqr(acc, acc);           // a^(p-3)
+    fp_mul(a, acc, a);          // a^(p-2)
+}
+
+bool fp_is_square(const digit_t* a)
+{ // Quadratic-residue test: computes a^((p-1)/2), converts it out of Montgomery form
+  // and compares it with the integer 1.
+  // Output: 1 (true) if that power equals 1, 0 (false) otherwise
+    fp_t legendre, unit;
+
+    fp_set(unit, 1);
+    fp_exp3div4(legendre, a);               // a^((p-3)/4)
+    fp_sqr(legendre, legendre);             // a^((p-3)/2)
+    fp_mul(legendre, legendre, a);          // a^((p-1)/2)
+    fp_frommont(legendre, legendre);
+
+    return fp_is_equal(legendre, unit);
+}
+
+void fp_sqrt(digit_t* a)
+{ // In-place square-root candidate: a <- a^((p+1)/4) mod p.
+  // Obtained as a^((p-3)/4) * a, since (p-3)/4 + 1 = (p+1)/4.
+    fp_t root;
+
+    fp_exp3div4(root, a);
+    fp_mul(a, root, a);
+}
--- a/src/gf/broadwell/lvl1/fp2.c
+++ b/src/gf/broadwell/lvl1/fp2.c
@@ -0,0 +1,190 @@
+#include <fp2.h>
+
+extern const digit_t R[NWORDS_FIELD];
+
+extern void fp2_sq_c0(fp2_t *out, const fp2_t *in);
+extern void fp2_sq_c1(fp_t *out, const fp2_t *in);
+
+extern void fp2_mul_c0(fp_t *out, const fp2_t *in0, const fp2_t *in1);
+extern void fp2_mul_c1(fp_t *out, const fp2_t *in0, const fp2_t *in1);
+
+/* Arithmetic modulo X^2 + 1 */
+
+void fp2_set(fp2_t* x, const digit_t val)
+{ // x = val + 0*i: the imaginary part is cleared and the real part takes the word val.
+    fp_set(x->im, 0);
+    fp_set(x->re, val);
+}
+
+bool fp2_is_zero(const fp2_t* a)
+{ // Is a GF(p^2) element zero?
+  // Both components are always tested and combined with a bitwise AND (no short-circuit).
+  // Returns 1 (true) if a=0, 0 (false) otherwise
+    const bool re_zero = fp_is_zero(a->re);
+    const bool im_zero = fp_is_zero(a->im);
+
+    return re_zero & im_zero;
+}
+
+bool fp2_is_equal(const fp2_t* a, const fp2_t* b)
+{ // Compare two GF(p^2) elements component-wise.
+  // Both comparisons are always performed and combined with a bitwise AND (no short-circuit).
+  // Returns 1 (true) if a=b, 0 (false) otherwise
+    const bool re_same = fp_is_equal(a->re, b->re);
+    const bool im_same = fp_is_equal(a->im, b->im);
+
+    return re_same & im_same;
+}
+
+void fp2_copy(fp2_t* x, const fp2_t* y)
+{ // x = y, copied one component at a time.
+    fp_copy(x->im, y->im);
+    fp_copy(x->re, y->re);
+}
+
+fp2_t fp2_non_residue()
+{ // Returns the element 2 + i in Montgomery form
+  // (2 + i is a quadratic non-residue for p1913).
+    fp_t unit;
+    fp2_t nqr;
+
+    fp_set(unit, 1);
+    fp_tomont(unit, unit);              // unit = 1 in Montgomery form
+    fp_copy(nqr.im, unit);              // imaginary part: 1
+    fp_add(nqr.re, unit, unit);         // real part: 1 + 1
+    return nqr;
+}
+
+void fp2_add(fp2_t* x, const fp2_t* y, const fp2_t* z)
+{ // x = y + z in GF(p^2), component-wise.
+    fp_add(x->im, y->im, z->im);
+    fp_add(x->re, y->re, z->re);
+}
+
+void fp2_sub(fp2_t* x, const fp2_t* y, const fp2_t* z)
+{ // x = y - z in GF(p^2), component-wise.
+    fp_sub(x->im, y->im, z->im);
+    fp_sub(x->re, y->re, z->re);
+}
+
+void fp2_neg(fp2_t* x, const fp2_t* y)
+{ // x = -y in GF(p^2), component-wise.
+    fp_neg(x->im, y->im);
+    fp_neg(x->re, y->re);
+}
+
+void fp2_mul(fp2_t* x, const fp2_t* y, const fp2_t* z)
+{ // x = y * z in GF(p^2) = GF(p)[i]/(i^2 + 1).
+  // The real part goes through a temporary so that x may alias y or z: both external
+  // kernels still need the original inputs when the second one runs.
+    fp_t real_part;
+
+    fp2_mul_c0(&real_part, y, z);      // c0 = a0*b0 - a1*b1
+    fp2_mul_c1(&x->im, y, z);          // c1 = a0*b1 + a1*b0 
+
+    for (unsigned int limb = 0; limb < 4; limb++) {
+        x->re[limb] = real_part[limb];
+    }
+}
+
+void fp2_sqr(fp2_t* x, const fp2_t* y) {
+    // x = y^2 in GF(p^2). fp2_sq_c0 is handed a full fp2_t (as its prototype requires);
+    // only the real component of that scratch value is copied into the result, after the
+    // imaginary part has been computed, so x may alias y.
+    fp2_t scratch;
+
+    fp2_sq_c0(&scratch, y);         // c0 = (a0+a1)(a0-a1)
+    fp2_sq_c1(&x->im, y);           // c1 = 2a0*a1
+
+    for (unsigned int limb = 0; limb < 4; limb++) {
+        x->re[limb] = scratch.re[limb];
+    }
+}
+
+void fp2_inv(fp2_t* x)
+{ // In-place inversion in GF(p^2): 1/(a + b*i) = (a - b*i) / (a^2 + b^2).
+    fp_t norm, im_sq;
+
+    // norm = a^2 + b^2, then inverted in GF(p)
+    fp_sqr(norm, x->re);
+    fp_sqr(im_sq, x->im);
+    fp_add(norm, norm, im_sq);
+    fp_inv(norm);
+
+    // scale the conjugate by 1/norm
+    fp_mul(x->im, x->im, norm);
+    fp_neg(x->im, x->im);
+    fp_mul(x->re, x->re, norm);
+}
+
+bool fp2_is_square(const fp2_t* x)
+{ // Square test in GF(p^2): forms re^2 + im^2 and returns the GF(p) square test of it.
+    fp_t norm, im_sq;
+
+    fp_sqr(im_sq, x->im);
+    fp_sqr(norm, x->re);
+    fp_add(norm, norm, im_sq);
+
+    return fp_is_square(norm);
+}
+
+void fp2_frob(fp2_t* x, const fp2_t* y)
+{ // x = re(y) - im(y)*i: the real part is copied and the imaginary part negated.
+    fp_neg(x->im, y->im);
+    fp_copy(x->re, y->re);
+}
+
+void fp2_tomont(fp2_t* x, const fp2_t* y)
+{ // Convert both components of y to Montgomery representation; result in x.
+    fp_tomont(x->im, y->im);
+    fp_tomont(x->re, y->re);
+}
+
+void fp2_frommont(fp2_t* x, const fp2_t* y)
+{ // Convert both components of y back from Montgomery representation; result in x.
+    fp_frommont(x->im, y->im);
+    fp_frommont(x->re, y->re);
+}
+
+// NOTE: old, non-constant-time implementation. Could be optimized
+// In-place square root in GF(p^2) = GF(p)[i]/(i^2 + 1): x <- sqrt(x).
+// The code branches on the input value (zero imaginary part, quadratic character of
+// intermediate values), hence the non-constant-time warning above.
+// No check is made that x actually is a square.
+void fp2_sqrt(fp2_t* x)
+{
+    fp_t sdelta, re, tmp1, tmp2, inv2, im;
+
+    // Special case: x lies in GF(p).
+    if (fp_is_zero(x->im)) {
+        if (fp_is_square(x->re)) {
+            // re is a square in GF(p): the root is real.
+            fp_sqrt(x->re);
+            return;
+        } else {
+            // re is not a square: use sqrt(re) = i * sqrt(-re), a purely imaginary root.
+            fp_neg(x->im, x->re);
+            fp_sqrt(x->im);
+            fp_set(x->re, 0);
+            return;
+        }
+    }
+
+    // sdelta = sqrt(re^2 + im^2)
+    fp_sqr(sdelta, x->re);
+    fp_sqr(tmp1, x->im);
+    fp_add(sdelta, sdelta, tmp1);
+    fp_sqrt(sdelta);
+
+    fp_set(inv2, 2);
+    fp_tomont(inv2, inv2);     // inv2 <- 2
+    fp_inv(inv2);              // inv2 <- 1/2
+    // Candidate for the square of the real part of the root: (re + sdelta)/2
+    fp_add(re, x->re, sdelta);
+    fp_mul(re, re, inv2);
+    memcpy((digit_t*)tmp2, (digit_t*)re, NWORDS_FIELD*RADIX/8);
+
+    // If that candidate is not a square in GF(p), switch to (re - sdelta)/2.
+    if (!fp_is_square(tmp2)) {
+        fp_sub(re, x->re, sdelta);
+        fp_mul(re, re, inv2);
+    }
+
+    // Real part of the root.
+    fp_sqrt(re);
+    memcpy((digit_t*)im, (digit_t*)re, NWORDS_FIELD*RADIX/8);
+
+    // Imaginary part of the root: im(x) / (2 * re_root).
+    fp_inv(im);
+    fp_mul(im, im, inv2);
+    fp_mul(x->im, im, x->im);    
+    memcpy((digit_t*)x->re, (digit_t*)re, NWORDS_FIELD*RADIX/8);
+}
+
+// Lexicographic comparison of two GF(p^2) elements. Both are first converted out of
+// Montgomery form; the real parts are compared as multi-limb integers (most-significant
+// limb first) and the imaginary parts only break ties.
+// Returns +1 if x > y, -1 if x < y, 0 if x = y
+int fp2_cmp(fp2_t* x, fp2_t* y){
+    fp2_t lhs, rhs;
+    fp2_frommont(&lhs, x);
+    fp2_frommont(&rhs, y);
+
+    // Component pairs in order of significance: real first, then imaginary.
+    const digit_t* pairs[2][2] = {
+        { lhs.re, rhs.re },
+        { lhs.im, rhs.im }
+    };
+
+    for(int part = 0; part < 2; part++){
+        const digit_t* u = pairs[part][0];
+        const digit_t* v = pairs[part][1];
+        for(int limb = NWORDS_FIELD-1; limb >= 0; limb--){
+            if(u[limb] != v[limb])
+                return (u[limb] > v[limb]) ? 1 : -1;
+        }
+    }
+    return 0;
+}
--- a/src/gf/broadwell/lvl1/fp_asm.S
+++ b/src/gf/broadwell/lvl1/fp_asm.S
@@ -0,0 +1,555 @@
+// x86-64 field arithmetic written in GAS Intel syntax (no register prefixes).
+.intel_syntax noprefix
+
+// Size of a field element in bytes and in 64-bit limbs
+.set pbytes,32
+.set plimbs,4
+
+// Constant p + 1, least significant limb first. Its lowest limb is zero, so
+// the reduction steps below multiply by the three upper limbs only
+// (they address [rip+p_plus_1+8]).
+.global p_plus_1
+p_plus_1: .quad 0x0000000000000000, 0x252C9E4935514800, 0x33A6A86587407437, 0x34E29E286B95D98C
+
+// Declare that this object does not need an executable stack. Without this
+// note ELF linkers may mark the whole program's stack executable (or warn).
+// Emitted after p_plus_1 so the constant stays in its original section; the
+// .text directive below switches back to the code section.
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+.text
+.p2align 4,,15
+
+//***********************************************************************
+//  Field addition in GF(p)
+//  Operation: c [rdi] = a [rsi] + b [rdx], followed by one conditional
+//             subtraction of p
+//  Inputs: a stored in [rsi], b stored in [rdx]
+//  Output: c stored in [rdi]
+//  NOTE(review): the carry out of the 4-limb addition is discarded, so the
+//  inputs are presumably expected to be below p - confirm with callers.
+//***********************************************************************
+.global fp_add
+fp_add:
+  push   r12  
+  xor    rax, rax
+  // [r8:r11] <- a + b
+  mov    r8, [rsi]
+  mov    r9, [rsi+8]
+  mov    r10, [rsi+16]
+  mov    r11, [rsi+24]
+  add    r8, [rdx] 
+  adc    r9, [rdx+8] 
+  adc    r10, [rdx+16] 
+  adc    r11, [rdx+24] 
+  // [r8:r11] <- (a + b) - p; rax becomes an all-ones mask if this borrows
+  mov    r12, [rip+p]
+  sub    r8, r12
+  mov    rcx, [rip+p+8]
+  sbb    r9, rcx
+  mov    rsi, [rip+p+16]
+  sbb    r10, rsi
+  mov    rdx, [rip+p+24]
+  sbb    r11, rdx
+  sbb    rax, 0
+  
+  // Keep p only when the subtraction borrowed ...
+  and    r12, rax
+  and    rcx, rax
+  and    rsi, rax
+  and    rdx, rax
+  
+  // ... and add it back, without branching on the data
+  add    r8, r12  
+  adc    r9, rcx  
+  adc    r10, rsi  
+  adc    r11, rdx 
+  mov    [rdi], r8
+  mov    [rdi+8], r9 
+  mov    [rdi+16], r10 
+  mov    [rdi+24], r11
+  pop    r12
+  ret
+
+//***********************************************************************
+//  Field subtraction in GF(p)
+//  Operation: c [rdi] = a [rsi] - b [rdx], plus p if the difference borrows
+//  Inputs: a stored in [rsi], b stored in [rdx]
+//  Output: c stored in [rdi]
+//***********************************************************************
+.global fp_sub
+fp_sub:
+  push   r12  
+  xor    rax, rax
+  // [r8:r11] <- a - b; rax becomes an all-ones mask if this borrows
+  mov    r8, [rsi]
+  mov    r9, [rsi+8]
+  mov    r10, [rsi+16]
+  mov    r11, [rsi+24]
+  sub    r8, [rdx] 
+  sbb    r9, [rdx+8] 
+  sbb    r10, [rdx+16] 
+  sbb    r11, [rdx+24]
+  sbb    rax, 0
+  
+  // Add p masked by rax, i.e. only when the subtraction borrowed
+  mov    r12, [rip+p]
+  mov    rcx, [rip+p+8]
+  mov    rsi, [rip+p+16]
+  mov    rdx, [rip+p+24]
+  and    r12, rax
+  and    rcx, rax
+  and    rsi, rax
+  and    rdx, rax  
+  add    r8, r12  
+  adc    r9, rcx 
+  adc    r10, rsi  
+  adc    r11, rdx 
+  mov    [rdi], r8
+  mov    [rdi+8], r9 
+  mov    [rdi+16], r10 
+  mov    [rdi+24], r11 
+  pop    r12
+  ret
+  
+///////////////////////////////////////////////////////////////// MACROS
+// z = a x bi + z
+// Inputs: base memory pointer M1 (a),
+//         bi pre-stored in rdx,
+//         accumulator z in [Z0:Z4]
+// Output: [Z0:Z4]
+// Temps:  regs T0:T1
+// C:      register cleared by the xor below; the xor also clears CF and OF
+//         before the two carry chains (adox/adcx) start. Call sites pass
+//         either rax or the register used as Z4; in the latter case the top
+//         accumulator word starts from zero.
+/////////////////////////////////////////////////////////////////
+.macro MULADD64x256 M1, Z0, Z1, Z2, Z3, Z4, T0, T1, C
+    mulx   \T0, \T1, \M1     // A0*B0
+    xor    \C, \C
+    adox   \Z0, \T1
+    adox   \Z1, \T0  
+    mulx   \T0, \T1, 8\M1    // A0*B1
+    adcx   \Z1, \T1
+    adox   \Z2, \T0    
+    mulx   \T0, \T1, 16\M1   // A0*B2
+    adcx   \Z2, \T1
+    adox   \Z3, \T0
+    mulx   \T0, \T1, 24\M1   // A0*B3          
+    adcx   \Z3, \T1
+    adox   \Z4, \T0
+    adc    \Z4, 0   
+.endm
+
+// z = a x bi + z for a three-limb operand a
+// Inputs: base memory pointer M1 (a),
+//         bi pre-stored in rdx,
+//         accumulator z in [Z0:Z3]
+// Output: [Z0:Z3]
+// Temps:  regs T0:T1; rax is zeroed (the xor also clears CF and OF)
+// In this file it is always invoked with M1 = [rip+p_plus_1+8] and rdx = z0,
+// which is the reduction step "z = (z0 x p_plus_1 + z)/2^64" named at the
+// call sites.
+.macro MULADD64x192 M1, Z0, Z1, Z2, Z3, T0, T1
+    mulx   \T0, \T1, \M1     // A0*B0
+    xor    rax, rax
+    adox   \Z0, \T1
+    adox   \Z1, \T0  
+    mulx   \T0, \T1, 8\M1    // A0*B1
+    adcx   \Z1, \T1
+    adox   \Z2, \T0    
+    mulx   \T0, \T1, 16\M1   // A0*B2
+    adcx   \Z2, \T1
+    adox   \Z3, \T0
+    adc    \Z3, 0   
+.endm
+  
+//***********************************************************************
+//  Multiplication in GF(p^2), non-complex part
+//  Operation: c [rdi] = a0 x b0 - a1 x b1
+//  Inputs: a = [a1, a0] stored in [rsi] 
+//          b = [b1, b0] stored in [rdx] 
+//  Output: c stored in [rdi]
+//  Layout: a0 at [rsi], a1 at [rsi+32], b0 at [rdx], b1 at [rdx+32].
+//  Scratch: [rdi] first receives p - b1 and is read back limb by limb, so it
+//           is written before a and b0 have been fully read.
+//           NOTE(review): c therefore presumably must not overlap a or b -
+//           confirm with callers.
+//***********************************************************************
+.global fp2_mul_c0
+fp2_mul_c0:    
+    push   r12 
+    push   r13 
+    push   r14   
+    mov    rcx, rdx
+	
+	// [rdi0:3] <- p - b1
+	mov    r8, [rip+p]  
+	mov    r9, [rip+p+8]   
+	mov    r10, [rip+p+16]
+	mov    r11, [rip+p+24] 
+	mov    rax, [rcx+32]
+	mov    rdx, [rcx+40]        
+	sub    r8, rax
+	sbb    r9, rdx
+	mov    rax, [rcx+48]
+	mov    rdx, [rcx+56]
+	sbb    r10, rax
+	sbb    r11, rdx
+	mov    [rdi], r8
+	mov    [rdi+8], r9
+	mov    [rdi+16], r10
+	mov    [rdi+24], r11
+    
+    // [r8:r12] <- z = a0 x b00 - a1 x b10
+    // (the subtraction is carried out by adding a1 x (p - b1), limb by limb)
+    mov    rdx, [rcx]
+    mulx   r9, r8, [rsi]         
+    xor    rax, rax
+    mulx   r10, r11, [rsi+8]
+    adox   r9, r11        
+    mulx   r11, r12, [rsi+16] 
+    adox   r10, r12        
+    mulx   r12, r13, [rsi+24]
+    adox   r11, r13  
+    adox   r12, rax
+           
+    mov    rdx, [rdi]    
+    MULADD64x256 [rsi+32], r8, r9, r10, r11, r12, r13, r14, rax
+    // [r9:r12] <- z = (z0 x p_plus_1 + z)/2^64
+    mov    rdx, r8                 // rdx <- z0 
+    MULADD64x192 [rip+p_plus_1+8], r9, r10, r11, r12, r13, r14
+    
+    // [r9:r12, r8] <- z = a0 x b01 - a1 x b11 + z 
+    mov    rdx, [rcx+8]
+    MULADD64x256 [rsi], r9, r10, r11, r12, r8, r13, r14, r8
+    mov    rdx, [rdi+8]    
+    MULADD64x256 [rsi+32], r9, r10, r11, r12, r8, r13, r14, rax
+    // [r10:r12, r8] <- z = (z0 x p_plus_1 + z)/2^64
+    mov    rdx, r9                 // rdx <- z0 
+    MULADD64x192 [rip+p_plus_1+8], r10, r11, r12, r8, r13, r14
+    
+    // [r10:r12, r8:r9] <- z = a0 x b02 - a1 x b12 + z 
+    mov    rdx, [rcx+16]
+    MULADD64x256 [rsi], r10, r11, r12, r8, r9, r13, r14, r9
+    mov    rdx, [rdi+16]    
+    MULADD64x256 [rsi+32], r10, r11, r12, r8, r9, r13, r14, rax
+    // [r11:r12, r8:r9] <- z = (z0 x p_plus_1 + z)/2^64
+    mov    rdx, r10                // rdx <- z0 
+    MULADD64x192 [rip+p_plus_1+8], r11, r12, r8, r9, r13, r14
+    
+    // [r11:r12, r8:r10] <- z = a0 x b03 - a1 x b13 + z 
+    mov    rdx, [rcx+24]
+    MULADD64x256 [rsi], r11, r12, r8, r9, r10, r13, r14, r10
+    mov    rdx, [rdi+24]    
+    MULADD64x256 [rsi+32], r11, r12, r8, r9, r10, r13, r14, rax
+    // [r12, r8:r10] <- z = (z0 x p_plus_1 + z)/2^64
+    mov    rdx, r11                // rdx <- z0 
+    MULADD64x192 [rip+p_plus_1+8], r12, r8, r9, r10, r13, r14
+
+	// Final correction                        
+	// rax is zero here (cleared by the last MULADD64x192); after the
+	// subtraction it is an all-ones mask iff z - p borrowed, and p is then
+	// added back.
+	mov    rsi, [rip+p]
+	mov    rcx, [rip+p+8]
+	mov    rdx, [rip+p+16]
+	mov    r11, [rip+p+24]
+	sub    r12, rsi
+	sbb    r8, rcx
+	sbb    r9, rdx
+	sbb    r10, r11
+	sbb    rax, 0
+	and    rsi, rax
+	and    rcx, rax
+	and    rdx, rax
+	and    r11, rax
+	add    r12, rsi
+	adc    r8, rcx
+	adc    r9, rdx
+	adc    r10, r11
+    
+    mov    [rdi], r12          
+    mov    [rdi+8], r8         
+    mov    [rdi+16], r9         
+    mov    [rdi+24], r10 
+    pop    r14
+    pop    r13
+    pop    r12
+    ret
+  
+//***********************************************************************
+//  Multiplication in GF(p^2), complex part
+//  Operation: c [rdi] = a0 x b1 + a1 x b0
+//  Inputs: a = [a1, a0] stored in [rsi] 
+//          b = [b1, b0] stored in [rdx] 
+//  Output: c stored in [rdi]
+//  Layout: a0 at [rsi], a1 at [rsi+32], b0 at [rdx], b1 at [rdx+32].
+//  [rdi] is only written at the very end, after all inputs have been read.
+//***********************************************************************
+.global fp2_mul_c1
+fp2_mul_c1:    
+    push   r12 
+    push   r13 
+    push   r14   
+    mov    rcx, rdx
+    
+    // [r8:r12] <- z = a0 x b10 + a1 x b00
+    mov    rdx, [rcx+32]
+    mulx   r9, r8, [rsi]         
+    xor    rax, rax
+    mulx   r10, r11, [rsi+8]
+    adox   r9, r11        
+    mulx   r11, r12, [rsi+16] 
+    adox   r10, r12        
+    mulx   r12, r13, [rsi+24]
+    adox   r11, r13  
+    adox   r12, rax
+           
+    mov    rdx, [rcx]    
+    MULADD64x256 [rsi+32], r8, r9, r10, r11, r12, r13, r14, rax
+    // [r9:r12] <- z = (z0 x p_plus_1 + z)/2^64
+    mov    rdx, r8                 // rdx <- z0 
+    MULADD64x192 [rip+p_plus_1+8], r9, r10, r11, r12, r13, r14
+    
+    // [r9:r12, r8] <- z = a0 x b11 + a1 x b01 + z 
+    mov    rdx, [rcx+40]
+    MULADD64x256 [rsi], r9, r10, r11, r12, r8, r13, r14, r8
+    mov    rdx, [rcx+8]    
+    MULADD64x256 [rsi+32], r9, r10, r11, r12, r8, r13, r14, rax
+    // [r10:r12, r8] <- z = (z0 x p_plus_1 + z)/2^64
+    mov    rdx, r9                 // rdx <- z0 
+    MULADD64x192 [rip+p_plus_1+8], r10, r11, r12, r8, r13, r14
+    
+    // [r10:r12, r8:r9] <- z = a0 x b12 + a1 x b02 + z 
+    mov    rdx, [rcx+48]
+    MULADD64x256 [rsi], r10, r11, r12, r8, r9, r13, r14, r9
+    mov    rdx, [rcx+16]    
+    MULADD64x256 [rsi+32], r10, r11, r12, r8, r9, r13, r14, rax
+    // [r11:r12, r8:r9] <- z = (z0 x p_plus_1 + z)/2^64
+    mov    rdx, r10                // rdx <- z0 
+    MULADD64x192 [rip+p_plus_1+8], r11, r12, r8, r9, r13, r14
+    
+    // [r11:r12, r8:r10] <- z = a0 x b13 + a1 x b03 + z 
+    mov    rdx, [rcx+56]
+    MULADD64x256 [rsi], r11, r12, r8, r9, r10, r13, r14, r10
+    mov    rdx, [rcx+24]    
+    MULADD64x256 [rsi+32], r11, r12, r8, r9, r10, r13, r14, rax
+    // [r12, r8:r10] <- z = (z0 x p_plus_1 + z)/2^64
+    mov    rdx, r11                // rdx <- z0 
+    MULADD64x192 [rip+p_plus_1+8], r12, r8, r9, r10, r13, r14
+
+	// Final correction                        
+	// rax is zero here (cleared by the last MULADD64x192); after the
+	// subtraction it is an all-ones mask iff z - p borrowed, and p is then
+	// added back.
+	mov    rsi, [rip+p]
+	mov    rcx, [rip+p+8]
+	mov    rdx, [rip+p+16]
+	mov    r11, [rip+p+24]
+	sub    r12, rsi
+	sbb    r8, rcx
+	sbb    r9, rdx
+	sbb    r10, r11
+	sbb    rax, 0
+	and    rsi, rax
+	and    rcx, rax
+	and    rdx, rax
+	and    r11, rax
+	add    r12, rsi
+	adc    r8, rcx
+	adc    r9, rdx
+	adc    r10, r11
+    
+    mov    [rdi], r12          
+    mov    [rdi+8], r8         
+    mov    [rdi+16], r9         
+    mov    [rdi+24], r10 
+    pop    r14
+    pop    r13
+    pop    r12
+    ret
+ 
+///////////////////////////////////////////////////////////////// MACRO
+// z = a x b (mod p)
+// Inputs: base memory pointers M0 (a), M1 (b)
+//         accumulator z in [Z0:Z4], pre-stores a0 x b
+//         (i.e. the first limb of M0 times M1 is computed by the caller;
+//          rdx is loaded by the macro itself at every step)
+// Output: [Z4, Z0:Z2], not yet corrected by the final subtraction of p;
+//         rax is left at zero by the last MULADD64x192
+// Temps:  regs T0:T1, rax, rdx
+// Only the limbs at offsets 8, 16 and 24 of M0 are read here.
+/////////////////////////////////////////////////////////////////
+.macro FPMUL256x256 M0, M1, Z0, Z1, Z2, Z3, Z4, T0, T1           
+    // [Z1:Z4] <- z = (z0 x p_plus_1 + z)/2^64
+    mov    rdx, \Z0                 // rdx <- z0
+    MULADD64x192 [rip+p_plus_1+8], \Z1, \Z2, \Z3, \Z4, \T0, \T1
+    
+    // [Z1:Z4, Z0] <- z = M0[1] x M1 + z 
+    mov    rdx, 8\M0
+    MULADD64x256 \M1, \Z1, \Z2, \Z3, \Z4, \Z0, \T0, \T1, \Z0
+    // [Z2:Z4, Z0] <- z = (z0 x p_plus_1 + z)/2^64
+    mov    rdx, \Z1                 // rdx <- z0
+    MULADD64x192 [rip+p_plus_1+8], \Z2, \Z3, \Z4, \Z0, \T0, \T1
+    
+    // [Z2:Z4, Z0:Z1] <- z = M0[2] x M1 + z  
+    mov    rdx, 16\M0
+    MULADD64x256 \M1, \Z2, \Z3, \Z4, \Z0, \Z1, \T0, \T1, \Z1
+    // [Z3:Z4, Z0:Z1] <- z = (z0 x p_plus_1 + z)/2^64
+    mov    rdx, \Z2                // rdx <- z0
+    MULADD64x192 [rip+p_plus_1+8], \Z3, \Z4, \Z0, \Z1, \T0, \T1
+    
+    // [Z3:Z4, Z0:Z2] <- z = M0[3] x M1 + z
+    mov    rdx, 24\M0
+    MULADD64x256 \M1, \Z3, \Z4, \Z0, \Z1, \Z2, \T0, \T1, \Z2
+    // [Z4, Z0:Z2] <- z = (z0 x p_plus_1 + z)/2^64
+    mov    rdx, \Z3                // rdx <- z0
+    MULADD64x192 [rip+p_plus_1+8], \Z4, \Z0, \Z1, \Z2, \T0, \T1
+.endm
+
+//***********************************************************************
+//  Squaring in GF(p^2), non-complex part
+//  Operation: c [rdi] = (a0+a1) x (a0-a1)
+//  Inputs: a = [a1, a0] stored in [rsi] 
+//  Output: c stored in [rdi]
+//  Layout: a0 at [rsi], a1 at [rsi+32].
+//  Scratch: [rdi]..[rdi+31] holds a0+a1 and [rdi+32]..[rdi+63] holds
+//           a0-a1+p before the 32-byte result is written to [rdi], so 64
+//           bytes at [rdi] are written. [rsi] is read again after [rdi] has
+//           been written. NOTE(review): c therefore presumably must not
+//           overlap a - confirm with callers.
+//***********************************************************************
+.global fp2_sq_c0
+fp2_sq_c0:   
+    push   r12 
+    push   r13
+
+	// a0 + a1
+	mov    rdx, [rsi]
+	mov    r9, [rsi+8]
+	mov    r10, [rsi+16]
+	mov    r11, [rsi+24]
+	add    rdx, [rsi+32]
+	adc    r9, [rsi+40]
+	adc    r10, [rsi+48]
+	adc    r11, [rsi+56]
+	mov    [rdi], rdx
+	mov    [rdi+8], r9
+	mov    [rdi+16], r10
+	mov    [rdi+24], r11
+	
+	// a0 - a1 + p
+	mov    r8, [rsi]
+	mov    r10, [rsi+8]
+	mov    r12, [rsi+16]
+	mov    r13, [rsi+24]
+	sub    r8, [rsi+32]
+	sbb    r10, [rsi+40]
+	sbb    r12, [rsi+48] 
+	sbb    r13, [rsi+56]
+	add    r8, [rip+p]                    
+	adc    r10, [rip+p+8]
+	adc    r12, [rip+p+16]
+	adc    r13, [rip+p+24]
+	mov    [rdi+32], r8               
+	mov    [rdi+40], r10 
+	mov    [rdi+48], r12 
+	mov    [rdi+56], r13 
+    
+    // [r8:r12] <- z = (a0+a1)[0] x (a0-a1+p)
+    // rdx still holds the low limb of a0+a1; r8, r10, r12, r13 hold the
+    // limbs of a0-a1+p and are consumed as mulx sources.
+    mulx   r9, r8, r8   
+    xor    rax, rax
+    mulx   r10, r11, r10  
+    adox   r9, r11        
+    mulx   r11, r12, r12  
+    adox   r10, r12        
+    mulx   r12, r13, r13  
+    adox   r11, r13
+    adox   r12, rax 
+
+    FPMUL256x256 [rdi], [rdi+32], r8, r9, r10, r11, r12, r13, rcx
+
+	// Final correction                        
+	// rax is zero here (cleared inside FPMUL256x256); after the subtraction
+	// it is an all-ones mask iff z - p borrowed, and p is then added back.
+	mov    rsi, [rip+p]
+	mov    rcx, [rip+p+8]
+	mov    rdx, [rip+p+16]
+	mov    r11, [rip+p+24]
+	sub    r12, rsi
+	sbb    r8, rcx
+	sbb    r9, rdx
+	sbb    r10, r11
+	sbb    rax, 0
+	and    rsi, rax
+	and    rcx, rax
+	and    rdx, rax
+	and    r11, rax
+	add    r12, rsi
+	adc    r8, rcx
+	adc    r9, rdx
+	adc    r10, r11
+    
+    mov    [rdi], r12          
+    mov    [rdi+8], r8         
+    mov    [rdi+16], r9         
+    mov    [rdi+24], r10
+    pop    r13
+    pop    r12
+    ret
+
+//***********************************************************************
+//  Squaring in GF(p^2), complex part
+//  Operation: c [rdi] = 2a0 x a1
+//  Inputs: a = [a1, a0] stored in [rsi] 
+//  Output: c stored in [rdi]
+//  Layout: a0 at [rsi], a1 at [rsi+32].
+//  Scratch: 32 bytes of stack hold the upper three limbs of 2a0; the low
+//           limb stays in rdx and [rsp] itself is never written, because
+//           FPMUL256x256 only reads offsets 8, 16 and 24 of its first
+//           operand.
+//***********************************************************************
+.global fp2_sq_c1
+fp2_sq_c1:  
+    push   r12
+    push   r13 
+	
+	// [rdx, r9:r11] <- 2 x a0
+	mov    rdx, [rsi]
+	mov    r9, [rsi+8]
+	mov    r10, [rsi+16]
+	mov    r11, [rsi+24]
+	add    rdx, rdx
+	adc    r9, r9
+	adc    r10, r10
+	adc    r11, r11
+	sub    rsp, 32
+	mov    [rsp+8], r9
+	mov    [rsp+16], r10 
+	mov    [rsp+24], r11   
+    
+    // [r8:r12] <- z = (2a0)[0] x a1
+    mulx   r9, r8, [rsi+32]
+    xor    rax, rax 
+    mulx   r10, r11, [rsi+40]
+    adox   r9, r11        
+    mulx   r11, r12, [rsi+48]
+    adox   r10, r12        
+    mulx   r12, r13, [rsi+56]
+    adox   r11, r13  
+    adox   r12, rax 
+
+	FPMUL256x256 [rsp], [rsi+32], r8, r9, r10, r11, r12, r13, rcx
+	add    rsp, 32
+
+	// Final correction                        
+	// rax is zero here (cleared inside FPMUL256x256); after the subtraction
+	// it is an all-ones mask iff z - p borrowed, and p is then added back.
+	mov    rsi, [rip+p]
+	mov    rcx, [rip+p+8]
+	mov    rdx, [rip+p+16]
+	mov    r11, [rip+p+24]
+	sub    r12, rsi
+	sbb    r8, rcx
+	sbb    r9, rdx
+	sbb    r10, r11
+	sbb    rax, 0
+	and    rsi, rax
+	and    rcx, rax
+	and    rdx, rax
+	and    r11, rax
+	add    r12, rsi
+	adc    r8, rcx
+	adc    r9, rdx
+	adc    r10, r11
+    
+    mov    [rdi], r12          
+    mov    [rdi+8], r8         
+    mov    [rdi+16], r9         
+    mov    [rdi+24], r10 
+    pop    r13
+    pop    r12
+    ret
+
+//***********************************************************************
+//  Field multiplication in GF(p)
+//  Operation: c = a x b mod p
+//  Inputs: a stored in [rsi], b stored in [rdx] 
+//  Output: c stored in [rdi]
+//  Each 64-bit limb of b is multiplied into the accumulator and followed by
+//  one reduction step "z = (z0 x p_plus_1 + z)/2^64"; four such steps divide
+//  the product by 2^256 in total.
+//  [rdi] is only written at the very end, after all inputs have been read.
+//***********************************************************************
+.global fp_mul
+fp_mul: 
+    push   r12
+    push   r13 
+    push   r14 
+    mov    rcx, rdx 
+     
+    // [r8:r12] <- z = a x b0
+    mov    rdx, [rcx]
+    mulx   r9, r8, [rsi]
+    xor    rax, rax 
+    mulx   r10, r11, [rsi+8]
+    adox   r9, r11        
+    mulx   r11, r12, [rsi+16]
+    adox   r10, r12        
+    mulx   r12, r13, [rsi+24] 
+    adox   r11, r13
+    adox   r12, rax 
+
+	FPMUL256x256 [rcx], [rsi], r8, r9, r10, r11, r12, r13, r14
+
+	// Final correction                        
+	// rax is zero here (cleared inside FPMUL256x256); after the subtraction
+	// it is an all-ones mask iff z - p borrowed, and p is then added back.
+	mov    rsi, [rip+p]
+	mov    rcx, [rip+p+8]
+	mov    rdx, [rip+p+16]
+	mov    r11, [rip+p+24]
+	sub    r12, rsi
+	sbb    r8, rcx
+	sbb    r9, rdx
+	sbb    r10, r11
+	sbb    rax, 0
+	and    rsi, rax
+	and    rcx, rax
+	and    rdx, rax
+	and    r11, rax
+	add    r12, rsi
+	adc    r8, rcx
+	adc    r9, rdx
+	adc    r10, r11
+    
+    mov    [rdi], r12          
+    mov    [rdi+8], r8         
+    mov    [rdi+16], r9         
+    mov    [rdi+24], r10  
+    pop    r14
+    pop    r13
+    pop    r12
+    ret
+    
+//***********************************************************************
+//  Field squaring in GF(p)
+//  Operation: c [rdi] = a [rsi] x a [rsi], computed by fp_mul
+//  The input pointer is copied into the second-operand register and control
+//  is transferred to fp_mul with a tail jump, so fp_mul returns directly to
+//  the caller.
+//***********************************************************************
+.global fp_sqr
+fp_sqr:
+    mov rdx, rsi
+    jmp fp_mul
--- a/src/gf/broadwell/lvl1/include/fp.h
+++ b/src/gf/broadwell/lvl1/include/fp.h
@@ -0,0 +1,76 @@
+#ifndef FP_H
+#define FP_H
+
+//////////////////////////////////////////////// NOTE: this is placed here for now
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
+#include <tutil.h>
+#include <fp_constants.h>
+
+// A field element is an array of NWORDS_FIELD digits.
+typedef digit_t fp_t[NWORDS_FIELD];  // Datatype for representing field elements
+
+// Basic helpers: set from a single digit, comparisons, copy
+void fp_set(digit_t* x, const digit_t val);
+bool fp_is_equal(const digit_t* a, const digit_t* b);
+bool fp_is_zero(const digit_t* a);
+void fp_copy(digit_t* out, const digit_t* a);
+// Multiprecision shifts of an nwords-digit integer, performed in place
+digit_t mp_shiftr(digit_t* x, const unsigned int shift, const unsigned int nwords);
+void mp_shiftl(digit_t* x, const unsigned int shift, const unsigned int nwords);
+// Field arithmetic, out <- a op b. For this target fp_add, fp_sub, fp_mul
+// and fp_sqr are exported by fp_asm.S.
+void fp_add(digit_t* out, const digit_t* a, const digit_t* b);
+void fp_sub(digit_t* out, const digit_t* a, const digit_t* b);
+void fp_neg(digit_t* out, const digit_t* a);
+void fp_sqr(digit_t* out, const digit_t* a);
+void fp_mul(digit_t* out, const digit_t* a, const digit_t* b);
+void MUL(digit_t* out, const digit_t a, const digit_t b);
+// In-place operations: the argument is both input and output
+void fp_inv(digit_t* x);
+bool fp_is_square(const digit_t* a);
+void fp_sqrt(digit_t* a);
+// Conversions to and from the representation expected by fp_mul (the tests
+// convert operands with fp_tomont before multiplying and convert results
+// back with fp_frommont)
+void fp_tomont(digit_t* out, const digit_t* a);
+void fp_frommont(digit_t* out, const digit_t* a);
+void fp_mont_setone(digit_t* out);
+
+/********************** Constant-time unsigned comparisons ***********************/
+
+// The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise
+
+static inline unsigned int is_digit_nonzero_ct(digit_t x)
+{ // Is x != 0?
+  // The top bit of (x | -x) is set exactly when x has at least one bit set.
+    const digit_t negated = 0 - x;
+    const digit_t merged = x | negated;
+
+    return (unsigned int)(merged >> (RADIX - 1));
+}
+
+static inline unsigned int is_digit_zero_ct(digit_t x)
+{ // Is x = 0?
+  // Logical complement of is_digit_nonzero_ct, computed without branching.
+    const unsigned int nonzero = is_digit_nonzero_ct(x);
+
+    return nonzero ^ 1;
+}
+
+static inline unsigned int is_digit_lessthan_ct(digit_t x, digit_t y)
+{ // Is x < y?
+  // The answer is the top bit of x ^ ((x ^ y) | ((x - y) ^ y)).
+    const digit_t differs = x ^ y;
+    const digit_t wrapped = (x - y) ^ y;
+    const digit_t folded = x ^ (differs | wrapped);
+
+    return (unsigned int)(folded >> (RADIX - 1));
+}
+
+/********************** Platform-independent macros for digit-size operations **********************/
+
+// Digit addition with carry
+// sumOut <- addend1 + addend2 + carryIn; carryOut <- 1 if either of the two
+// partial additions wrapped around, 0 otherwise. The carry is derived with
+// is_digit_lessthan_ct, i.e. without branching.
+// The expansion is a braced block that declares a local named tempReg, and
+// sumOut is assigned before carryOut is computed from it.
+#define ADDC(sumOut, carryOut, addend1, addend2, carryIn)                                         \
+    { digit_t tempReg = (addend1) + (digit_t)(carryIn);                                           \
+    (sumOut) = (addend2) + tempReg;                                                               \
+    (carryOut) = (is_digit_lessthan_ct(tempReg, (digit_t)(carryIn)) | is_digit_lessthan_ct((sumOut), tempReg)); }
+
+// Digit subtraction with borrow
+// differenceOut <- minuend - subtrahend - borrowIn; borrowOut <- 1 if
+// minuend < subtrahend, or if the first difference is zero and borrowIn is
+// set. The borrow is held in a local (borrowReg) until differenceOut has
+// been written, so it is computed from the original operands.
+// The expansion is a braced block that declares locals tempReg and borrowReg.
+#define SUBC(differenceOut, borrowOut, minuend, subtrahend, borrowIn)                             \
+    { digit_t tempReg = (minuend) - (subtrahend);                                                 \
+    unsigned int borrowReg = (is_digit_lessthan_ct((minuend), (subtrahend)) | ((borrowIn) & is_digit_zero_ct(tempReg)));  \
+    (differenceOut) = tempReg - (digit_t)(borrowIn);                                              \
+    (borrowOut) = borrowReg; }
+
+// Shift right with flexible datatype
+// shiftOut <- (lowIn >> shift) combined with the low bits of highIn moved
+// into the vacated top positions. DigitSize is the width in bits of the
+// operands; shift must satisfy 0 < shift < DigitSize, because shift = 0
+// would shift highIn by the full width.
+// The expansion already ends with a semicolon.
+#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize)                                         \
+    (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << (DigitSize - (shift)));
+
+// Digit shift left
+// shiftOut <- (highIn << shift) combined with the top bits of lowIn moved
+// into the vacated low positions. DigitSize is the width in bits of the
+// operands; shift must satisfy 0 < shift < DigitSize, because shift = 0
+// would shift lowIn by the full width.
+// The complementary shift uses the DigitSize argument (as SHIFTR does)
+// instead of a hard-coded RADIX, so the macro is also correct for operand
+// widths other than RADIX. Callers that pass RADIX are unaffected.
+// The expansion already ends with a semicolon.
+#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize)                                         \
+    (shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> ((DigitSize) - (shift)));
+
+#endif
--- a/src/gf/broadwell/lvl1/include/fp2.h
+++ b/src/gf/broadwell/lvl1/include/fp2.h
@@ -0,0 +1,29 @@
+#ifndef FP2_H
+#define FP2_H
+
+#include "fp.h"
+
+// Structure for representing elements in GF(p^2)
+// An element is stored as two GF(p) coordinates, re followed by im. The
+// assembly routines rely on this order: they read the first coordinate at
+// byte offset 0 and the second one at byte offset 32.
+typedef struct fp2_t {
+    fp_t re, im;
+} fp2_t;
+
+// Basic helpers
+void fp2_set(fp2_t* x, const digit_t val);
+bool fp2_is_zero(const fp2_t* a);
+bool fp2_is_equal(const fp2_t* a, const fp2_t* b);
+void fp2_copy(fp2_t* x, const fp2_t* y);
+// Declared with (void) so that this is a real prototype: with an empty
+// parameter list, C compilers before C23 do not check the arguments of calls.
+fp2_t fp2_non_residue(void);
+// Arithmetic, x <- y op z
+void fp2_add(fp2_t* x, const fp2_t* y, const fp2_t* z);
+void fp2_sub(fp2_t* x, const fp2_t* y, const fp2_t* z);
+void fp2_neg(fp2_t* x, const fp2_t* y);
+void fp2_mul(fp2_t* x, const fp2_t* y, const fp2_t* z);
+void fp2_sqr(fp2_t* x, const fp2_t* y);
+// In-place inversion
+void fp2_inv(fp2_t* x);
+bool fp2_is_square(const fp2_t* x);
+void fp2_frob(fp2_t* x, const fp2_t* y);
+// In-place square root (not constant-time)
+void fp2_sqrt(fp2_t* x);
+// Coordinate-wise fp_tomont / fp_frommont
+void fp2_tomont(fp2_t* x, const fp2_t* y);
+void fp2_frommont(fp2_t* x, const fp2_t* y);
+// Lexicographic comparison (re first, then im): +1 if x > y, -1 if x < y, 0 if x = y
+int fp2_cmp(fp2_t* x, fp2_t* y);
+
+#endif
--- a/src/gf/broadwell/lvl1/test/CMakeLists.txt
+++ b/src/gf/broadwell/lvl1/test/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Unit tests and benchmarks for the GF(p) arithmetic of the current variant.
+# The executable takes a mode ("test" or "bench") and a repetition count.
+add_executable(sqisign_test_gf_${SVARIANT_LOWER}_fp test_fp.c test_extras.c)
+target_link_libraries(sqisign_test_gf_${SVARIANT_LOWER}_fp PRIVATE ${LIB_GF_${SVARIANT_UPPER}})
+target_include_directories(sqisign_test_gf_${SVARIANT_LOWER}_fp PRIVATE ../include ${INC_COMMON} ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_PUBLIC})
+# NAME/COMMAND signature: the target name is resolved to the built executable.
+add_test(
+  NAME sqisign_test_gf_${SVARIANT_LOWER}_fp
+  COMMAND sqisign_test_gf_${SVARIANT_LOWER}_fp test ${SQISIGN_TEST_REPS}
+)
+
+# Unit tests and benchmarks for the GF(p^2) arithmetic of the current variant.
+add_executable(sqisign_test_gf_${SVARIANT_LOWER}_fp2 test_fp2.c test_extras.c)
+target_link_libraries(sqisign_test_gf_${SVARIANT_LOWER}_fp2 PRIVATE ${LIB_GF_${SVARIANT_UPPER}})
+target_include_directories(sqisign_test_gf_${SVARIANT_LOWER}_fp2 PRIVATE ../include ${INC_COMMON} ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_PUBLIC})
+add_test(
+  NAME sqisign_test_gf_${SVARIANT_LOWER}_fp2
+  COMMAND sqisign_test_gf_${SVARIANT_LOWER}_fp2 test ${SQISIGN_TEST_REPS}
+)
--- a/src/gf/broadwell/lvl1/test/test_extras.c
+++ b/src/gf/broadwell/lvl1/test/test_extras.c
@@ -0,0 +1,74 @@
+#include "test_extras.h"
+#include <bench.h>
+
+// Global constants
+extern const digit_t p[NWORDS_FIELD];
+extern const digit_t R2[NWORDS_FIELD];
+
+#if 0
+int64_t cpucycles(void)
+{ // Access system counter for benchmarking
+    unsigned int hi, lo;
+
+    asm volatile ("rdtsc\n\t" : "=a" (lo), "=d"(hi));
+    return ((int64_t)lo) | (((int64_t)hi) << 32);
+}
+#endif
+
+
+int compare_words(digit_t* a, digit_t* b, unsigned int nwords)
+{ // Compare two "nwords"-digit integers, most significant digit first.
+  // Returns (1) a>b, (0) a=b, (-1) a<b
+  // SECURITY NOTE: this function does not have constant-time execution. TO BE USED FOR TESTING ONLY.
+    unsigned int k = nwords;
+
+    while (k-- > 0)
+    {
+        if (a[k] != b[k]) {
+            return (a[k] > b[k]) ? 1 : -1;
+        }
+    }
+
+    return 0; 
+}
+
+
+static void sub_test(digit_t* out, digit_t* a, digit_t* b, unsigned int nwords)
+{ // Multiprecision subtraction out = a-b; the final borrow is dropped, so a>=b is expected
+  // out may be the same array as a: each digit is read before it is written.
+  // SECURITY NOTE: this function does not have constant-time execution. It is for TESTING ONLY.     
+    unsigned int idx = 0;
+    digit_t borrow_in = 0;
+
+    while (idx < nwords)
+    {
+        const digit_t ai = a[idx];
+        const digit_t bi = b[idx];
+        const digit_t diff = ai - bi;
+        const digit_t borrow_out = (ai < bi) || (diff < borrow_in);
+
+        out[idx] = diff - borrow_in;
+        borrow_in = borrow_out;
+        idx++;
+    } 
+}
+
+
+void fprandom_test(digit_t* a)
+{ // Generating a pseudo-random field element in [0, p-1] 
+  // SECURITY NOTE: distribution is not fully uniform. TO BE USED FOR TESTING ONLY.
+    const unsigned int nwords = NWORDS_FIELD;
+    const unsigned int spare_bits = 256-254;             // Top bits cleared in the most significant digit
+    const unsigned int nbytes = sizeof(digit_t)*nwords;
+    unsigned char* bytes = (unsigned char*)a;
+    unsigned int k;
+
+    // Fill every byte of the element with one rand() output
+    for (k = 0; k < nbytes; k++) {
+        bytes[k] = (unsigned char)rand();
+    }
+
+    // Clear the spare top bits
+    a[nwords-1] &= (((digit_t)(-1) << spare_bits) >> spare_bits);
+
+    // Subtract p until the value is below p
+    while (compare_words((digit_t*)p, a, nwords) <= 0) {
+        sub_test(a, a, (digit_t*)p, nwords);
+    }
+}
+
+
+void fp2random_test(fp2_t* a)
+{ // Generating a pseudo-random element in GF(p^2): the real coordinate is drawn first, then the imaginary one
+  // SECURITY NOTE: distribution is not fully uniform. TO BE USED FOR TESTING ONLY.
+    digit_t* const re_part = a->re;
+    digit_t* const im_part = a->im;
+
+    fprandom_test(re_part);
+    fprandom_test(im_part);
+}
--- a/src/gf/broadwell/lvl1/test/test_extras.h
+++ b/src/gf/broadwell/lvl1/test/test_extras.h
@@ -0,0 +1,25 @@
+
+#ifndef TEST_EXTRAS_H
+#define TEST_EXTRAS_H
+
+#include <time.h>
+#include <stdlib.h>
+#include "../include/fp.h"
+#include "../include/fp2.h"
+
+#define PASSED    0
+#define FAILED    1
+    
+// Access system counter for benchmarking
+//int64_t cpucycles(void);
+
+// Comparing "nword" elements, a=b? : (1) a>b, (0) a=b, (-1) a<b
+int compare_words(digit_t* a, digit_t* b, unsigned int nwords);
+
+// Generating a pseudo-random field element in [0, p-1] 
+void fprandom_test(digit_t* a);
+
+// Generating a pseudo-random element in GF(p^2)
+void fp2random_test(fp2_t* a);
+
+#endif
--- a/src/gf/broadwell/lvl1/test/test_fp.c
+++ b/src/gf/broadwell/lvl1/test/test_fp.c
@@ -0,0 +1,295 @@
+#include "test_extras.h"
+#include <stdio.h>
+#include <string.h>
+#include <bench.h>
+
+// Global constants
+extern const digit_t p[NWORDS_FIELD];
+
+// Benchmark and test parameters  
+static int BENCH_LOOPS = 100000;       // Number of iterations per bench
+static int TEST_LOOPS  = 100000;       // Number of iterations per test
+
+
+bool fp_test()
+{ // Tests for the field arithmetic
+  // Each section runs TEST_LOOPS iterations on random inputs and checks algebraic
+  // identities. Returns true if every section passes; the first failing section
+  // prints FAILED and makes the function return false immediately.
+    bool OK = true;
+    int n, passed;
+    fp_t a, b, c, d, e, f, ma, mb, mc, md, me, mf;
+
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
+    printf("Testing field arithmetic over GF(p): \n\n"); 
+
+    // Field addition
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fprandom_test(a); fprandom_test(b); fprandom_test(c); fprandom_test(d); 
+
+        fp_add(d, a, b); fp_add(e, d, c);                 // e = (a+b)+c
+        fp_add(d, b, c); fp_add(f, d, a);                 // f = a+(b+c)
+        if (compare_words(e, f, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_add(d, a, b);                                  // d = a+b 
+        fp_add(e, b, a);                                  // e = b+a
+        if (compare_words(d, e, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_set(b, 0);
+        fp_add(d, a, b);                                  // d = a+0 
+        if (compare_words(a, d, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_set(b, 0);   
+        fp_neg(d, a);                      
+        fp_add(e, a, d);                                  // e = a+(-a)
+        if (compare_words(e, b, NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p) addition tests ............................................ PASSED");
+    else { printf("  GF(p) addition tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Field subtraction
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fprandom_test(a); fprandom_test(b); fprandom_test(c); fprandom_test(d);
+
+        fp_sub(d, a, b); fp_sub(e, d, c);                 // e = (a-b)-c
+        fp_add(d, b, c); fp_sub(f, a, d);                 // f = a-(b+c)
+        if (compare_words(e, f, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_sub(d, a, b);                                  // d = a-b 
+        fp_sub(e, b, a);
+        fp_neg(e, e);                                     // e = -(b-a)
+        if (compare_words(d, e, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_set(b, 0);
+        fp_sub(d, a, b);                                  // d = a-0 
+        if (compare_words(a, d, NWORDS_FIELD)!=0) { passed=0; break; }
+        
+        fp_set(b, 0);              
+        fp_sub(e, a, a);                                  // e = a-a
+        if (compare_words(e, b, NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p) subtraction tests ......................................... PASSED");
+    else { printf("  GF(p) subtraction tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Field multiplication
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {    
+        fprandom_test(a); fprandom_test(b); fprandom_test(c);
+        
+        // Round trip through fp_tomont/fp_frommont. The result goes to d so that
+        // the random value in c stays available for the identities below
+        // (writing it to c would make them run with c == a).
+        fp_tomont(ma, a);
+        fp_frommont(d, ma);
+        if (compare_words(a, d, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_tomont(ma, a); fp_tomont(mb, b); fp_tomont(mc, c);
+        fp_mul(md, ma, mb); fp_mul(me, md, mc);                          // e = (a*b)*c
+        fp_mul(md, mb, mc); fp_mul(mf, md, ma);                          // f = a*(b*c)
+        fp_frommont(e, me);
+        fp_frommont(f, mf);
+        if (compare_words(e, f, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_tomont(ma, a); fp_tomont(mb, b); fp_tomont(mc, c); 
+        fp_add(md, mb, mc); fp_mul(me, ma, md);                          // e = a*(b+c)
+        fp_mul(md, ma, mb); fp_mul(mf, ma, mc); fp_add(mf, md, mf);      // f = a*b+a*c
+        fp_frommont(e, me);
+        fp_frommont(f, mf);
+        if (compare_words(e, f, NWORDS_FIELD)!=0) { passed=0; break; }
+     
+        fp_tomont(ma, a); fp_tomont(mb, b);
+        fp_mul(md, ma, mb);                                              // d = a*b 
+        fp_mul(me, mb, ma);                                              // e = b*a 
+        fp_frommont(d, md);
+        fp_frommont(e, me);
+        if (compare_words(d, e, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_tomont(ma, a);
+        fp_set(b, 1); fp_tomont(mb, b);
+        fp_mul(md, ma, mb);                                              // d = a*1  
+        fp_frommont(a, ma);
+        fp_frommont(d, md);                
+        if (compare_words(a, d, NWORDS_FIELD)!=0) { passed=0; break; }
+       
+        fp_set(b, 0);
+        fp_tomont(mb, b);
+        fp_mul(md, ma, mb);                                              // d = a*0 
+        fp_frommont(d, md);                
+        if (compare_words(b, d, NWORDS_FIELD)!=0) { passed=0; break; } 
+    }
+    if (passed==1) printf("  GF(p) multiplication tests ...................................... PASSED");
+    else { printf("  GF(p) multiplication tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Field squaring
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fprandom_test(a);
+        
+        fp_tomont(ma, a);
+        fp_sqr(mb, ma);                                   // b = a^2
+        fp_mul(mc, ma, ma);                               // c = a*a 
+        fp_frommont(b, mb);
+        fp_frommont(c, mc);
+        if (compare_words(b, c, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_set(a, 0); fp_tomont(ma, a);
+        fp_sqr(md, ma);                                   // d = 0^2 
+        if (compare_words(ma, md, NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p) squaring tests............................................. PASSED");
+    else { printf("  GF(p) squaring tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Field inversion
+    passed = 1;
+    for (n = 0; n < TEST_LOOPS; n++)
+    {
+        fprandom_test(a);
+
+        fp_tomont(ma, a);
+        fp_set(d, 1);
+        memcpy(mb, ma, RADIX/8 * NWORDS_FIELD);
+        fp_inv(ma);
+        fp_mul(mc, ma, mb);                               // c = a*a^-1 
+        fp_frommont(c, mc);
+        if (compare_words(c, d, NWORDS_FIELD) != 0) { passed = 0; break; }
+
+        fp_set(a, 0);
+        fp_set(d, 0);
+        fp_inv(a);                                        // a = 0^-1, expected to stay 0
+        if (compare_words(a, d, NWORDS_FIELD) != 0) { passed = 0; break; }
+    }
+    if (passed == 1) printf("  GF(p) inversion tests............................................ PASSED");
+    else { printf("  GF(p) inversion tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Square root and square detection
+    passed = 1;
+    for (n = 0; n < TEST_LOOPS; n++)
+    {
+        fprandom_test(a);
+
+        fp_tomont(ma, a);
+        fp_sqr(mc, ma);
+        fp_frommont(c, mc);                               // c = a^2
+        if (fp_is_square(mc) != 1) { passed = 0; break; }
+
+        fp_sqrt(mc);                                      // c = sqrt(a^2), which must be a or -a
+        fp_neg(md, mc);
+        fp_frommont(c, mc);
+        fp_frommont(d, md);
+        if ((compare_words(a, c, NWORDS_FIELD) != 0) && (compare_words(a, d, NWORDS_FIELD) != 0)) { passed = 0; break; }
+    }
+    if (passed == 1) printf("  Square root, square tests........................................ PASSED");
+    else { printf("  Square root, square tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+ 
+    return OK;
+}
+
+bool fp_run()
+{
+    bool OK = true;
+    int n;
+    unsigned long long cycles, cycles1, cycles2;
+    fp_t a, b, c;
+        
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
+    printf("Benchmarking field arithmetic: \n\n"); 
+        
+    fprandom_test(a); fprandom_test(b); fprandom_test(c);
+
+    // GF(p) addition
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        fp_add(c, a, b);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    printf("  GF(p) addition runs in .......................................... %7lld cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // GF(p) subtraction
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        fp_sub(c, a, b);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    printf("  GF(p) subtraction runs in ....................................... %7lld cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // GF(p) multiplication
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        fp_mul(c, a, b);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    printf("  GF(p) multiplication runs in .................................... %7lld cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // GF(p) inversion
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp_inv(a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  GF(p) inversion runs in ......................................... %7lld cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // GF(p) square root
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp_sqrt(a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  GF(p) square root runs in ....................................... %7lld cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // Square checking
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp_is_square(a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  Square checking runs in ......................................... %7lld cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    return OK;
+}
+
+// Entry point: <program> test|bench <reps>
+// Runs the GF(p) tests or benchmarks with <reps> iterations per section.
+// Exit status is 0 on success and 1 on a failed test, a missing or unknown
+// mode, or a repetition count that is not a positive integer.
+int main(int argc, char* argv[])
+{
+    int reps;
+
+    if (argc < 3) {
+        printf("Please enter an argument: 'test' or 'bench' and <reps>\n");
+        exit(1);
+    }
+
+    // Reject zero, negative and non-numeric counts: zero iterations would
+    // report PASSED without testing anything and would make the benchmark
+    // averages divide by zero.
+    reps = atoi(argv[2]);
+    if (reps <= 0) {
+        printf("<reps> must be a positive integer\n");
+        exit(1);
+    }
+
+    if (!strcmp(argv[1], "test")) {
+        TEST_LOOPS = reps;
+        return !fp_test();
+    } else if (!strcmp(argv[1], "bench")) {
+        BENCH_LOOPS = reps;
+        return !fp_run();
+    } else {
+        printf("Please enter an argument: 'test' or 'bench' and <reps>\n");
+        exit(1);
+    }
+}
--- a/src/gf/broadwell/lvl1/test/test_fp2.c
+++ b/src/gf/broadwell/lvl1/test/test_fp2.c
@@ -0,0 +1,307 @@
+#include "test_extras.h"
+#include <stdio.h>
+#include <string.h>
+#include <bench.h>
+
+// Global constants
+extern const digit_t p[NWORDS_FIELD];
+
+// Benchmark and test parameters  
+static int BENCH_LOOPS = 100000;       // Number of iterations per bench
+static int TEST_LOOPS  = 100000;       // Number of iterations per test
+
+
+bool fp2_test()
+{ // Tests for the GF(p^2) arithmetic
+  // Runs TEST_LOOPS iterations of each group of algebraic identities on random elements.
+  // Prints one PASSED/FAILED line per group and returns false at the first failing group, true otherwise.
+    bool OK = true;
+    int n, passed;
+    fp2_t a, b, c, d, e, f, ma, mb, mc, md, me, mf;
+
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
+    printf("Testing arithmetic over GF(p^2): \n\n"); 
+
+    // Addition in GF(p^2)
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fp2random_test(&a); fp2random_test(&b); fp2random_test(&c); fp2random_test(&d); 
+
+        fp2_add(&d, &a, &b); fp2_add(&e, &d, &c);                 // e = (a+b)+c
+        fp2_add(&d, &b, &c); fp2_add(&f, &d, &a);                 // f = a+(b+c)
+        if (compare_words((digit_t*)&e, (digit_t*)&f, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_add(&d, &a, &b);                                      // d = a+b 
+        fp2_add(&e, &b, &a);                                      // e = b+a
+        if (compare_words((digit_t*)&d, (digit_t*)&e, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_set(&b, 0);
+        fp2_add(&d, &a, &b);                                      // d = a+0 
+        if (compare_words((digit_t*)&a, (digit_t*)&d, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_set(&b, 0);   
+        fp2_neg(&d, &a);                      
+        fp2_add(&e, &a, &d);                                      // e = a+(-a)
+        if (compare_words((digit_t*)&e, (digit_t*)&b, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p^2) addition tests ............................................ PASSED");
+    else { printf("  GF(p^2) addition tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+    
+    // Subtraction in GF(p^2)
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fp2random_test(&a); fp2random_test(&b); fp2random_test(&c); fp2random_test(&d);
+
+        fp2_sub(&d, &a, &b); fp2_sub(&e, &d, &c);                 // e = (a-b)-c
+        fp2_add(&d, &b, &c); fp2_sub(&f, &a, &d);                 // f = a-(b+c)
+        if (compare_words((digit_t*)&e, (digit_t*)&f, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_sub(&d, &a, &b);                                      // d = a-b 
+        fp2_sub(&e, &b, &a);
+        fp2_neg(&e, &e);                                          // e = -(b-a)
+        if (compare_words((digit_t*)&d, (digit_t*)&e, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_set(&b, 0);
+        fp2_sub(&d, &a, &b);                                      // d = a-0 
+        if (compare_words((digit_t*)&a, (digit_t*)&d, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+        
+        fp2_set(&b, 0);              
+        fp2_sub(&e, &a, &a);                                      // e = a-a
+        if (compare_words((digit_t*)&e, (digit_t*)&b, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p^2) subtraction tests ......................................... PASSED");
+    else { printf("  GF(p^2) subtraction tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Multiplication in GF(p^2)
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {    
+        fp2random_test(&a); fp2random_test(&b); fp2random_test(&c);
+        
+        // Montgomery round trip. The result goes into the scratch element d (not c), so that c
+        // stays an independent random operand for the associativity/distributivity checks below.
+        fp2_tomont(&ma, &a);
+        fp2_frommont(&d, &ma);
+        if (compare_words((digit_t*)&a, (digit_t*)&d, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_tomont(&ma, &a); fp2_tomont(&mb, &b); fp2_tomont(&mc, &c);
+        fp2_mul(&md, &ma, &mb); fp2_mul(&me, &md, &mc);                          // e = (a*b)*c
+        fp2_mul(&md, &mb, &mc); fp2_mul(&mf, &md, &ma);                          // f = a*(b*c)
+        fp2_frommont(&e, &me);
+        fp2_frommont(&f, &mf);
+        if (compare_words((digit_t*)&e, (digit_t*)&f, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_tomont(&ma, &a); fp2_tomont(&mb, &b); fp2_tomont(&mc, &c); 
+        fp2_add(&md, &mb, &mc); fp2_mul(&me, &ma, &md);                          // e = a*(b+c)
+        fp2_mul(&md, &ma, &mb); fp2_mul(&mf, &ma, &mc); fp2_add(&mf, &md, &mf);  // f = a*b+a*c
+        fp2_frommont(&e, &me);
+        fp2_frommont(&f, &mf);
+        if (compare_words((digit_t*)&e, (digit_t*)&f, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+     
+        fp2_tomont(&ma, &a); fp2_tomont(&mb, &b);
+        fp2_mul(&md, &ma, &mb);                                                  // d = a*b 
+        fp2_mul(&me, &mb, &ma);                                                  // e = b*a 
+        fp2_frommont(&d, &md);
+        fp2_frommont(&e, &me);
+        if (compare_words((digit_t*)&d, (digit_t*)&e, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_tomont(&ma, &a);
+        fp2_set(&b, 1); fp2_tomont(&mb, &b);
+        fp2_mul(&md, &ma, &mb);                                                  // d = a*1  
+        fp2_frommont(&a, &ma);
+        fp2_frommont(&d, &md);                
+        if (compare_words((digit_t*)&a, (digit_t*)&d, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+       
+        fp2_set(&b, 0);
+        fp2_tomont(&mb, &b);
+        fp2_mul(&md, &ma, &mb);                                                  // d = a*0 
+        fp2_frommont(&d, &md);                
+        if (compare_words((digit_t*)&b, (digit_t*)&d, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p^2) multiplication tests ...................................... PASSED");
+    else { printf("  GF(p^2) multiplication tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Squaring in GF(p^2)
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fp2random_test(&a);
+        
+        fp2_tomont(&ma, &a);
+        fp2_sqr(&mb, &ma);                                          // b = a^2
+        fp2_mul(&mc, &ma, &ma);                                     // c = a*a 
+        fp2_frommont(&b, &mb);
+        fp2_frommont(&c, &mc);
+        if (compare_words((digit_t*)&b, (digit_t*)&c, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_set(&a, 0); fp2_tomont(&ma, &a);
+        fp2_sqr(&md, &ma);                                          // d = 0^2 
+        if (compare_words((digit_t*)&ma, (digit_t*)&md, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p^2) squaring tests............................................. PASSED");
+    else { printf("  GF(p^2) squaring tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+    
+    // Inversion in GF(p^2)
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fp2random_test(&a);
+
+        fp2_tomont(&ma, &a);
+        fp2_set(&d, 1);
+        memcpy(&mb, &ma, RADIX/8 * 2*NWORDS_FIELD);                 // keep a copy: fp2_inv works in place
+        fp2_inv(&ma);
+        fp2_mul(&mc, &ma, &mb);                                     // c = a*a^-1 
+        fp2_frommont(&c, &mc);
+        if (compare_words((digit_t*)&c, (digit_t*)&d, 2*NWORDS_FIELD) != 0) { passed = 0; break; }
+
+        fp2_set(&a, 0);
+        fp2_set(&d, 0);
+        fp2_inv(&a);                                                // a = 0^-1, expected to stay 0
+        if (compare_words((digit_t*)&a, (digit_t*)&d, 2*NWORDS_FIELD) != 0) { passed = 0; break; }
+    }
+    if (passed == 1) printf("  GF(p^2) inversion tests............................................ PASSED");
+    else { printf("  GF(p^2) inversion tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+    
+    // Square root and square detection in GF(p^2)
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fp2random_test(&a);
+
+        fp2_tomont(&ma, &a);
+        fp2_sqr(&mc, &ma);
+        fp2_frommont(&c, &mc);                                      // c = a^2
+        if (fp2_is_square(&mc) != 1) { passed = 0; break; }        
+
+        fp2_sqrt(&mc);                                              // c = sqrt(a^2), i.e. a or -a
+        fp2_neg(&md, &mc);
+        fp2_frommont(&c, &mc);
+        fp2_frommont(&d, &md);
+        // Fail only when a matches neither root
+        if ((compare_words((digit_t*)&a, (digit_t*)&c, 2*NWORDS_FIELD) != 0) && (compare_words((digit_t*)&a, (digit_t*)&d, 2*NWORDS_FIELD) != 0)) { passed = 0; break; }
+    }
+    if (passed == 1) printf("  Square root, square tests.......................................... PASSED");
+    else { printf("  Square root, square tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    return OK;
+}
+
+bool fp2_run()
+{ // Benchmarks for the GF(p^2) arithmetic
+  // Times BENCH_LOOPS calls of each operation with cpucycles() and prints the average cycle count.
+  // Returns true on completion, false when BENCH_LOOPS is not positive (the average would divide by zero).
+    bool OK = true;
+    int n;
+    unsigned long long cycles, cycles1, cycles2;
+    fp2_t a, b, c;
+
+    if (BENCH_LOOPS <= 0) {
+        printf("Number of benchmark iterations must be positive\n");
+        return false;
+    }
+        
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
+    printf("Benchmarking arithmetic over GF(p^2): \n\n"); 
+        
+    fp2random_test(&a); fp2random_test(&b); fp2random_test(&c);
+
+    // GF(p^2) addition
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        fp2_add(&c, &a, &b);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    // cycles is unsigned long long, hence %llu
+    printf("  GF(p^2) addition runs in .......................................... %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // GF(p^2) subtraction
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        fp2_sub(&c, &a, &b);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    printf("  GF(p^2) subtraction runs in ....................................... %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // GF(p^2) squaring
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp2_sqr(&c, &a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  GF(p^2) squaring runs in .......................................... %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // GF(p^2) multiplication
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        fp2_mul(&c, &a, &b);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    printf("  GF(p^2) multiplication runs in .................................... %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // GF(p^2) inversion (in place: each iteration inverts the previous result)
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp2_inv(&a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  GF(p^2) inversion runs in ......................................... %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // GF(p^2) square root (in place as well)
+    cycles = 0;
+    for (n = 0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp2_sqrt(&a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  GF(p^2) square root runs in ....................................... %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // Square checking
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp2_is_square(&a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  Square checking runs in ........................................... %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    return OK;
+}
+
+int main(int argc, char* argv[])
+{ // Entry point: "<prog> test <reps>" runs the self-tests, "<prog> bench <reps>" runs the benchmarks.
+  // Returns 0 on success, non-zero on failure or on invalid arguments.
+    char* end = NULL;
+    long reps;
+
+    if (argc < 3) {
+        printf("Please enter an argument: 'test' or 'bench' and <reps>\n");
+        exit(1);
+    }
+
+    // <reps> must be a positive integer that fits in an int: the benchmark code divides
+    // by it, and atoi() would silently turn "0" or non-numeric input into 0.
+    reps = strtol(argv[2], &end, 10);
+    if (end == argv[2] || *end != '\0' || reps <= 0 || (long)(int)reps != reps) {
+        printf("Invalid <reps> '%s': expected a positive integer\n", argv[2]);
+        exit(1);
+    }
+
+    if (!strcmp(argv[1], "test")) {
+        TEST_LOOPS = (int)reps;
+        return !fp2_test();
+    } else if (!strcmp(argv[1], "bench")) {
+        BENCH_LOOPS = (int)reps;
+        return !fp2_run();
+    } else {
+        // Unknown mode: say so instead of exiting silently
+        printf("Please enter an argument: 'test' or 'bench' and <reps>\n");
+        exit(1);
+    }
+}
--- a/src/gf/ref/CMakeLists.txt
+++ b/src/gf/ref/CMakeLists.txt
@@ -0,0 +1 @@
+# Run the CMake script whose path is held in SELECT_SQISIGN_VARIANT
+# (presumably it adds the per-variant subdirectories such as lvl1 - confirm where the variable is set).
+include(${SELECT_SQISIGN_VARIANT})
--- a/src/gf/ref/lvl1/CMakeLists.txt
+++ b/src/gf/ref/lvl1/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+# Reference GF(p)/GF(p^2) arithmetic library for the current variant.
+set(SOURCE_FILES_GF_${SVARIANT_UPPER}_REF
+    fp_p1913.c fp.c fp2.c
+)
+
+add_library(${LIB_GF_${SVARIANT_UPPER}} ${SOURCE_FILES_GF_${SVARIANT_UPPER}_REF})
+# ${INC_COMMON} used to be listed twice; one entry is enough and the search order is unchanged.
+target_include_directories(${LIB_GF_${SVARIANT_UPPER}} PRIVATE common ${INC_COMMON} ${INC_PRECOMP_${SVARIANT_UPPER}} include ${PROJECT_SOURCE_DIR}/include)
+target_compile_options(${LIB_GF_${SVARIANT_UPPER}} PRIVATE ${C_OPT_FLAGS})
+
+add_subdirectory(test)
--- a/src/gf/ref/lvl1/Makefile
+++ b/src/gf/ref/lvl1/Makefile
@@ -0,0 +1,43 @@
+
+# Standalone build of the reference GF(p)/GF(p^2) library and its two test programs.
+CC=gcc
+CFLAGS= -O3 -std=gnu11 -Wall -march=native -Wno-missing-braces -Wno-logical-not-parentheses 
+LDFLAGS=-lm
+AR=ar rcs
+RANLIB=ranlib
+
+OBJECTS=objs/fp_p1913.o objs/fp.o objs/fp2.o objs/random.o
+
+# These targets are commands, not files; keep make from being confused by files of the same name.
+# ("lib" is left out on purpose: its rule really creates the lib/ directory.)
+.PHONY: all tests check clean
+
+all: lib tests
+
+objs/fp_p1913.o: fp_p1913.c
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) fp_p1913.c -o objs/fp_p1913.o
+
+objs/fp.o: fp.c
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) fp.c -o objs/fp.o
+
+objs/fp2.o: fp2.c
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) fp2.c -o objs/fp2.o
+
+# Create objs/ here too, so this rule also works when built first (make -j) or on its own.
+objs/random.o: ../../../common/generic/randombytes_system.c
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) ../../../common/generic/randombytes_system.c -o objs/random.o
+
+lib: $(OBJECTS)
+	rm -rf lib
+	mkdir lib
+	$(AR) lib/libtest.a $^
+	$(RANLIB) lib/libtest.a
+
+tests: lib
+	$(CC) $(CFLAGS) -L./lib test/test_fp.c test/test_extras.c -ltest $(LDFLAGS) -o test_fp -lgmp
+	$(CC) $(CFLAGS) -L./lib test/test_fp2.c test/test_extras.c -ltest $(LDFLAGS) -o test_fp2 -lgmp
+
+# Only builds the test programs; it does not run them.
+check: tests
+
+clean:
+	rm -rf *.req objs lib test_fp*
+
--- a/src/gf/ref/lvl1/fp.c
+++ b/src/gf/ref/lvl1/fp.c
@@ -0,0 +1,169 @@
+#include "include/fp.h"
+
+// Field constants as arrays of 64-bit words, least-significant word first.
+// p:  the field modulus (fp_neg subtracts from it, fp_exp3div4 derives its exponent from it).
+// R2: presumably R^2 mod p, the Montgomery conversion constant - TODO confirm against fp_tomont.
+// pp: presumably -p^-1 mod 2^64 for Montgomery reduction (p = -1 mod 2^64, so that value is 1) - TODO confirm.
+const uint64_t p[NWORDS_FIELD] =  { 0xffffffffffffffff, 0x252C9E49355147FF, 0x33A6A86587407437, 0x34E29E286B95D98C };
+const uint64_t R2[NWORDS_FIELD] = { 0x233625AE400674D4, 0x20AFD6C1025A1C2E, 0x30A841AB0920655D, 0x0D72E7D67C30CD3D };
+const uint64_t pp[NWORDS_FIELD] = { 0x01, 0x00, 0x00, 0x00 };
+
+
+void fp_set(digit_t* x, const digit_t val)
+{ // Load the single-word value val into x: the least-significant word receives val
+  // and every higher word is cleared. No Montgomery conversion happens here.
+    unsigned int idx = NWORDS_FIELD;
+
+    while (idx > 1) {
+        idx--;
+        x[idx] = 0;
+    }
+    x[0] = val;
+}
+
+bool fp_is_equal(const digit_t* a, const digit_t* b)
+{ // Equality test of two field elements without data-dependent branches.
+  // The XOR of every word pair is OR-ed into one accumulator, which ends up zero only if all words match.
+  // Returns 1 (true) if a=b, 0 (false) otherwise
+    digit_t diff_acc = 0;
+    unsigned int w = 0;
+
+    while (w < NWORDS_FIELD) {
+        diff_acc |= a[w] ^ b[w];
+        w++;
+    }
+
+    return (bool)is_digit_zero_ct(diff_acc);
+}
+
+bool fp_is_zero(const digit_t* a)
+{ // Zero test of a field element without data-dependent branches.
+  // All words are OR-ed together; the result is zero only if every word is zero.
+  // Returns 1 (true) if a=0, 0 (false) otherwise
+    digit_t any_bits = 0;
+    unsigned int w = 0;
+
+    while (w < NWORDS_FIELD) {
+        any_bits |= a[w];
+        w++;
+    }
+
+    return (bool)is_digit_zero_ct(any_bits);
+}
+
+void fp_copy(digit_t* out, const digit_t* a)
+{ // Copy the field element a into out (NWORDS_FIELD words, i.e. NWORDS_FIELD*RADIX/8 bytes).
+  // Implemented with memcpy, so out and a are expected not to partially overlap.
+    memcpy(out, a, NWORDS_FIELD*RADIX/8);
+}
+
+void fp_neg(digit_t* out, const digit_t* a)
+{ // Negation modulo p, out = -a mod p
+  // Input: a in [0, p-1] 
+  // Output: out in [0, p-1] 
+    unsigned int borrow = 0;
+    unsigned int k;
+
+    // First pass: out = p - a, word by word with borrow propagation
+    for (k = 0; k != NWORDS_FIELD; k++) {
+        SUBC(out[k], borrow, ((digit_t*)p)[k], a[k], borrow);
+    }
+    // Second pass: subtract p through fp_sub; this is what brings the a = 0 case
+    // (where p - a equals p itself) back to 0 - assumes fp_sub reduces modulo p.
+    fp_sub(out, out, (digit_t*)p);
+}
+
+void MUL(digit_t* out, const digit_t a, const digit_t b)
+{ // Digit multiplication, digit*digit -> 2-digit result 
+  // Schoolbook product on half-digits: a and b are each split into a low and a high half,
+  // the four partial products are formed, and they are recombined with explicit carries.
+  // Inputs: a, b in [0, 2^w-1], where w is the computer wordsize 
+  // Output: out[0] = low digit, out[1] = high digit of a*b, so 0 <= out <= (2^w-1)^2
+    register digit_t al, ah, bl, bh, temp;
+    digit_t albl, albh, ahbl, ahbh, res1, res2, res3, carry;
+    // sizeof(digit_t)*4 is half the digit width in bits (8 bits per byte / 2)
+    digit_t mask_low = (digit_t)(-1) >> (sizeof(digit_t)*4), mask_high = (digit_t)(-1) << (sizeof(digit_t)*4);
+
+    al = a & mask_low;                        // Low part
+    ah = a >> (sizeof(digit_t)*4);            // High part
+    bl = b & mask_low;
+    bh = b >> (sizeof(digit_t)*4);
+
+    // Four half-digit partial products; each fits in one digit
+    albl = al * bl;
+    albh = al * bh;
+    ahbl = ah * bl;
+    ahbh = ah * bh;
+    out[0] = albl & mask_low;                 // out00
+
+    // Second quarter: high half of al*bl plus the low halves of both cross products
+    res1 = albl >> (sizeof(digit_t)*4);
+    res2 = ahbl & mask_low;
+    res3 = albh & mask_low;
+    temp = res1 + res2 + res3;
+    carry = temp >> (sizeof(digit_t)*4);
+    out[0] ^= temp << (sizeof(digit_t)*4);    // out01   
+
+    // Third quarter: high halves of the cross products, low half of ah*bh, and the carry
+    res1 = ahbl >> (sizeof(digit_t)*4);
+    res2 = albh >> (sizeof(digit_t)*4);
+    res3 = ahbh & mask_low;
+    temp = res1 + res2 + res3 + carry;
+    out[1] = temp & mask_low;                 // out10 
+    // Here the carry is kept in place (upper half of temp) so it adds straight into the top quarter
+    carry = temp & mask_high;
+    out[1] ^= (ahbh & mask_high) + carry;     // out11
+}
+
+digit_t mp_shiftr(digit_t* x, const unsigned int shift, const unsigned int nwords)
+{ // Multiprecision right shift
+  // Shifts the nwords-word integer x (least-significant word first) right by shift bits, in place.
+  // Returns the least-significant bit x had before the shift, whatever the value of shift;
+  // that is the bit shifted out only when shift = 1 (the only value used in this file).
+  // NOTE(review): SHIFTR shifts by RADIX - shift, so this looks valid only for 0 < shift < RADIX,
+  // and nwords-1 wraps around for nwords = 0 - confirm callers never pass those values.
+    digit_t bit_out = x[0] & 1;
+
+    for (unsigned int i = 0; i < nwords-1; i++) {
+        SHIFTR(x[i+1], x[i], shift, x[i], RADIX);
+    }
+    x[nwords-1] >>= shift;
+    return bit_out;
+}
+
+void mp_shiftl(digit_t* x, const unsigned int shift, const unsigned int nwords)
+{ // Multiprecision left shift
+  // Shifts the nwords-word integer x (least-significant word first) left by shift bits, in place.
+  // Words are processed from the top down so each one still sees its unshifted lower neighbour.
+    unsigned int hi = nwords;
+
+    while (hi > 1) {
+        hi--;
+        SHIFTL(x[hi], x[hi-1], shift, x[hi], RADIX);
+    }
+    x[0] <<= shift;
+}
+
+static void fp_exp3div4(digit_t* out, const digit_t* a)
+{ // Fixed exponentiation out = a^((p-3)/4) mod p
+  // Right-to-left square-and-multiply over the bits of p >> 2, which equals (p-3)/4 when p = 3 (mod 4).
+  // out starts as fp_tomont(1), so a is expected in that same (Montgomery) representation.
+  // Input: a in [0, p-1] 
+  // Output: out in [0, p-1] 
+  // Requirement: p = 3(mod 4)
+    fp_t exponent, base;
+    int step;
+
+    fp_copy(exponent, (digit_t*)p);
+    fp_copy(base, a);
+    mp_shiftr(exponent, 1, NWORDS_FIELD);
+    mp_shiftr(exponent, 1, NWORDS_FIELD);      // exponent = p >> 2
+    fp_set(out, 1);
+    fp_tomont(out, out);
+
+    for (step = NWORDS_FIELD*RADIX-2; step > 0; step--) {
+        // mp_shiftr hands back the low bit it is about to drop, i.e. the current exponent bit
+        if (mp_shiftr(exponent, 1, NWORDS_FIELD) == 1) {
+            fp_mul(out, out, base);
+        }
+        fp_sqr(base, base);
+    }
+}
+
+void fp_inv(digit_t* a)
+{ // Modular inversion in place, a <- x^-1*R mod p, where R = 2^(w*nwords), w is the computer wordsize and nwords is the number of words to represent p
+  // Computed as a^(p-2) = (a^((p-3)/4))^4 * a.
+  // Input: a=xR in [0, p-1] 
+  // Output: a overwritten, in [0, p-1]. It outputs 0 if the input does not have an inverse  
+  // Requirement: Ceiling(Log(p)) < w*nwords
+    fp_t t;
+
+    fp_exp3div4(t, a);  // t = a^((p-3)/4)
+    fp_sqr(t, t);
+    fp_sqr(t, t);       // t = a^(p-3)
+    fp_mul(a, t, a);    // a^(p-2)
+}
+
+bool fp_is_square(const digit_t* a)
+{ // Is field element a square?
+  // Computes a^((p-1)/2) = (a^((p-3)/4))^2 * a and checks whether it equals 1.
+  // The power is converted out of Montgomery form first, so it is compared against plain 1.
+  // Output: out = 0 (false), 1 (true)
+  // NOTE(review): for a = 0 the power is presumably 0, so zero would be reported as a non-square - confirm callers expect that.
+    fp_t t, one;
+
+    fp_exp3div4(t, a);  // t = a^((p-3)/4)
+    fp_sqr(t, t);       // t = a^((p-3)/2)
+    fp_mul(t, t, a);    // a^((p-1)/2)
+    fp_frommont(t, t);
+    fp_set(one, 1);
+
+    return fp_is_equal(t, one);
+}
+
+void fp_sqrt(digit_t* a)
+{ // Square root computation in place, a <- a^((p+1)/4) mod p
+  // Obtained as a^((p-3)/4) * a. The function does not check that a is a square;
+  // callers in this file test with fp_is_square where it matters.
+    fp_t t;
+
+    fp_exp3div4(t, a);  // t = a^((p-3)/4)
+    fp_mul(a, t, a);    // a^((p+1)/4)
+}
--- a/src/gf/ref/lvl1/fp2.c
+++ b/src/gf/ref/lvl1/fp2.c
@@ -0,0 +1,192 @@
+#include <fp2.h>
+
+extern const digit_t R[NWORDS_FIELD];
+
+/* Arithmetic modulo X^2 + 1 */
+
+void fp2_set(fp2_t* x, const digit_t val)
+{ // Set x = val + 0*i: the real part receives the single-word value val, the imaginary part is cleared.
+  // Goes through fp_set, so no Montgomery conversion is applied.
+    fp_set(x->re, val);
+    fp_set(x->im, 0);
+}
+
+bool fp2_is_zero(const fp2_t* a)
+{ // Is a GF(p^2) element zero?
+  // Both coordinates are always examined and combined with a bitwise AND (no short-circuit).
+  // Returns 1 (true) if a=0, 0 (false) otherwise
+    const bool re_zero = fp_is_zero(a->re);
+    const bool im_zero = fp_is_zero(a->im);
+
+    return re_zero & im_zero;
+}
+
+bool fp2_is_equal(const fp2_t* a, const fp2_t* b)
+{ // Compare two GF(p^2) elements coordinate by coordinate.
+  // Both coordinates are always compared and the results combined with a bitwise AND (no short-circuit).
+  // Returns 1 (true) if a=b, 0 (false) otherwise
+    const bool re_same = fp_is_equal(a->re, b->re);
+    const bool im_same = fp_is_equal(a->im, b->im);
+
+    return re_same & im_same;
+}
+
+void fp2_copy(fp2_t* x, const fp2_t* y)
+{ // Copy y into x, one coordinate at a time (x <- y).
+    fp_copy(x->re, y->re);
+    fp_copy(x->im, y->im);
+}
+
+fp2_t fp2_non_residue()
+{ // 2 + i is a quadratic non-residue for p1913
+  // Returns that element by value, with both coordinates built from fp_tomont(1).
+    fp2_t nr;
+    fp_t mont_one;
+
+    fp_set(mont_one, 1);
+    fp_tomont(mont_one, mont_one);
+    fp_copy(nr.im, mont_one);              // imaginary part = 1
+    fp_add(nr.re, mont_one, mont_one);     // real part = 1 + 1 = 2
+    return nr;
+}
+
+void fp2_add(fp2_t* x, const fp2_t* y, const fp2_t* z)
+{ // x = y + z in GF(p^2), computed coordinate-wise with fp_add.
+    fp_add(x->re, y->re, z->re);
+    fp_add(x->im, y->im, z->im);
+}
+
+void fp2_sub(fp2_t* x, const fp2_t* y, const fp2_t* z)
+{ // x = y - z in GF(p^2), computed coordinate-wise with fp_sub.
+    fp_sub(x->re, y->re, z->re);
+    fp_sub(x->im, y->im, z->im);
+}
+
+void fp2_neg(fp2_t* x, const fp2_t* y)
+{ // x = -y in GF(p^2): both coordinates are negated with fp_neg.
+    fp_neg(x->re, y->re);
+    fp_neg(x->im, y->im);
+}
+
+void fp2_mul(fp2_t* x, const fp2_t* y, const fp2_t* z)
+{ // x = y * z in GF(p^2) = GF(p)[i]/(i^2 + 1), using three fp_mul calls instead of four:
+  //   re = y.re*z.re - y.im*z.im
+  //   im = (y.re + y.im)*(z.re + z.im) - y.im*z.im - y.re*z.re
+  // Statement order matters: each coordinate of x is first written only after the last read
+  // of the matching coordinate of y and z, which keeps the result right when x aliases an input.
+    fp_t t0, t1;
+
+    fp_add(t0, y->re, y->im);
+    fp_add(t1, z->re, z->im);
+    fp_mul(t0, t0, t1);             // t0 = (y.re + y.im)*(z.re + z.im)
+    fp_mul(t1, y->im, z->im);       // t1 = y.im*z.im
+    fp_mul(x->re, y->re, z->re);    // x.re = y.re*z.re (temporarily)
+    fp_sub(x->im, t0, t1);
+    fp_sub(x->im, x->im, x->re);    // x.im = t0 - t1 - y.re*z.re
+    fp_sub(x->re, x->re, t1);       // x.re = y.re*z.re - y.im*z.im
+}
+
+void fp2_sqr(fp2_t* x, const fp2_t* y)
+{ // x = y^2 in GF(p^2), using two fp_mul calls:
+  //   re = (y.re + y.im)*(y.re - y.im) = y.re^2 - y.im^2
+  //   im = 2*y.re*y.im
+  // sum and diff are taken before anything is written to x, and x.re is written last,
+  // so the result stays right when x and y are the same element.
+    fp_t sum, diff;
+
+    fp_add(sum, y->re, y->im);
+    fp_sub(diff, y->re, y->im);
+    fp_mul(x->im, y->re, y->im);
+    fp_add(x->im, x->im, x->im);    // x.im = 2*y.re*y.im
+    fp_mul(x->re, sum, diff);       // x.re = y.re^2 - y.im^2
+}
+
+void fp2_inv(fp2_t* x)
+{ // In-place inversion in GF(p^2): x <- conj(x) / norm(x), where
+  // norm(x) = x.re^2 + x.im^2 lies in GF(p) and conj(x) = x.re - x.im*i.
+    fp_t norm, im_sq;
+
+    fp_sqr(im_sq, x->im);
+    fp_sqr(norm, x->re);
+    fp_add(norm, norm, im_sq);      // norm = re^2 + im^2
+    fp_inv(norm);                   // norm <- 1/norm
+
+    fp_mul(x->im, x->im, norm);
+    fp_neg(x->im, x->im);           // im <- -im/norm
+    fp_mul(x->re, x->re, norm);     // re <-  re/norm
+}
+
+bool fp2_is_square(const fp2_t* x)
+{ // Square test in GF(p^2): reports whether the norm x.re^2 + x.im^2 is a square in GF(p),
+  // as decided by fp_is_square.
+    fp_t norm, im_sq;
+
+    fp_sqr(im_sq, x->im);
+    fp_sqr(norm, x->re);
+    fp_add(norm, norm, im_sq);
+
+    return fp_is_square(norm);
+}
+
+void fp2_frob(fp2_t* x, const fp2_t* y)
+{ // x = conjugate of y: the real part is copied unchanged and the imaginary part is negated.
+    fp_neg(x->im, y->im);
+    fp_copy(x->re, y->re);
+}
+
+void fp2_tomont(fp2_t* x, const fp2_t* y)
+{ // x = y converted to Montgomery form: fp_tomont is applied to each coordinate.
+    fp_tomont(x->re, y->re);
+    fp_tomont(x->im, y->im);
+}
+
+void fp2_frommont(fp2_t* x, const fp2_t* y)
+{ // x = y converted out of Montgomery form: fp_frommont is applied to each coordinate.
+    fp_frommont(x->re, y->re);
+    fp_frommont(x->im, y->im);
+}
+
+// NOTE: old, non-constant-time implementation. Could be optimized
+// In-place square root in GF(p^2): x is overwritten with one of its square roots.
+// The function does not check that x is a square.
+// For x = a + b*i with b != 0 it uses delta = sqrt(a^2 + b^2) and picks whichever of
+// (a + delta)/2, (a - delta)/2 is a square in GF(p); with r its root, the result is r + (b/(2r))*i.
+void fp2_sqrt(fp2_t* x)
+{
+    fp_t sdelta, re, tmp1, tmp2, inv2, im;
+
+    // Purely real input: the root is either real or purely imaginary
+    if (fp_is_zero(x->im)) {
+        if (fp_is_square(x->re)) {
+            fp_sqrt(x->re);
+            return;
+        } else {
+            // re is not a square, so take sqrt(-re) as imaginary part: (s*i)^2 = -s^2 = re
+            fp_neg(x->im, x->re);
+            fp_sqrt(x->im);
+            fp_set(x->re, 0);
+            return;
+        }
+    }
+
+    // sdelta = sqrt(re^2 + im^2)
+    fp_sqr(sdelta, x->re);
+    fp_sqr(tmp1, x->im);
+    fp_add(sdelta, sdelta, tmp1);
+    fp_sqrt(sdelta);
+
+    fp_set(inv2, 2);
+    fp_tomont(inv2, inv2);     // inv2 <- 2
+    fp_inv(inv2);              // inv2 <- 1/2
+    fp_add(re, x->re, sdelta);
+    fp_mul(re, re, inv2);      // first candidate: (re + sdelta)/2
+    memcpy((digit_t*)tmp2, (digit_t*)re, NWORDS_FIELD*RADIX/8);
+
+    // If the first candidate is not a square, switch to (re - sdelta)/2
+    if (!fp_is_square(tmp2)) {
+        fp_sub(re, x->re, sdelta);
+        fp_mul(re, re, inv2);
+    }
+
+    fp_sqrt(re);               // real part of the root
+    memcpy((digit_t*)im, (digit_t*)re, NWORDS_FIELD*RADIX/8);
+
+    // imaginary part of the root = x.im / (2*re)
+    fp_inv(im);
+    fp_mul(im, im, inv2);
+    fp_mul(x->im, im, x->im);    
+    memcpy((digit_t*)x->re, (digit_t*)re, NWORDS_FIELD*RADIX/8);
+}
+
+// Lexicographic comparison of two field elements. Returns +1 if x > y, -1 if x < y, 0 if x = y
+// Both inputs are first converted out of Montgomery form. Real parts are compared first
+// (most-significant word down); imaginary parts only break a tie.
+// Not constant time: returns at the first differing word.
+int fp2_cmp(fp2_t* x, fp2_t* y){
+    fp2_t xs, ys;
+    int w;
+
+    fp2_frommont(&xs, x);
+    fp2_frommont(&ys, y);
+
+    for (w = NWORDS_FIELD-1; w >= 0; w--) {
+        if (xs.re[w] != ys.re[w])
+            return (xs.re[w] > ys.re[w]) ? 1 : -1;
+    }
+    for (w = NWORDS_FIELD-1; w >= 0; w--) {
+        if (xs.im[w] != ys.im[w])
+            return (xs.im[w] > ys.im[w]) ? 1 : -1;
+    }
+    return 0;
+}
--- a/src/gf/ref/lvl1/fp_p1913.c
+++ b/src/gf/ref/lvl1/fp_p1913.c
--- a/src/gf/ref/lvl1/fp_p3923.c
+++ b/src/gf/ref/lvl1/fp_p3923.c
--- a/src/gf/ref/lvl1/include/fp.h
+++ b/src/gf/ref/lvl1/include/fp.h
@@ -0,0 +1,76 @@
+#ifndef FP_H
+#define FP_H
+
+//////////////////////////////////////////////// NOTE: this is placed here for now
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
+#include <tutil.h>
+#include <fp_constants.h>
+
+typedef digit_t fp_t[NWORDS_FIELD];  // Datatype for representing field elements
+
+// GF(p) interface. Elements are NWORDS_FIELD-word arrays, least-significant word first.
+// fp_inv and fp_sqrt work in place on their argument.
+// NOTE(review): fp_add, fp_sub, fp_sqr, fp_mul, fp_tomont, fp_frommont and fp_mont_setone are not
+// defined in fp.c; presumably they live in the prime-specific source file - confirm.
+void fp_set(digit_t* x, const digit_t val);                     // x = val (single word, no Montgomery conversion)
+bool fp_is_equal(const digit_t* a, const digit_t* b);           // 1 if a = b, 0 otherwise
+bool fp_is_zero(const digit_t* a);                              // 1 if a = 0, 0 otherwise
+void fp_copy(digit_t* out, const digit_t* a);                   // out = a
+digit_t mp_shiftr(digit_t* x, const unsigned int shift, const unsigned int nwords);   // in-place right shift; returns the old lowest bit
+void mp_shiftl(digit_t* x, const unsigned int shift, const unsigned int nwords);      // in-place left shift
+void fp_add(digit_t* out, const digit_t* a, const digit_t* b);
+void fp_sub(digit_t* out, const digit_t* a, const digit_t* b);
+void fp_neg(digit_t* out, const digit_t* a);                    // out = -a mod p
+void fp_sqr(digit_t* out, const digit_t* a);
+void fp_mul(digit_t* out, const digit_t* a, const digit_t* b);
+void MUL(digit_t* out, const digit_t a, const digit_t b);       // digit x digit -> two-digit product
+void fp_inv(digit_t* x);                                        // x = x^(p-2)
+bool fp_is_square(const digit_t* a);                            // 1 if a^((p-1)/2) = 1
+void fp_sqrt(digit_t* a);                                       // a = a^((p+1)/4)
+void fp_tomont(digit_t* out, const digit_t* a);
+void fp_frommont(digit_t* out, const digit_t* a);
+void fp_mont_setone(digit_t* out);
+
+/********************** Constant-time unsigned comparisons ***********************/
+
+// The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise
+
+static inline unsigned int is_digit_nonzero_ct(digit_t x)
+{ // Is x != 0?
+  // For any x != 0, either x or its two's-complement negation 0 - x has the top bit set,
+  // so the top bit of x | (0 - x) is 1 exactly when x is non-zero. No branches are used.
+    return (unsigned int)((x | (0 - x)) >> (RADIX - 1));
+}
+
+static inline unsigned int is_digit_zero_ct(digit_t x)
+{ // Is x = 0?
+  // Logical complement of is_digit_nonzero_ct: flips its 0/1 result with an XOR.
+    return (unsigned int)(1 ^ is_digit_nonzero_ct(x));
+}
+
+static inline unsigned int is_digit_lessthan_ct(digit_t x, digit_t y)
+{ // Is x < y?
+  // Branch-free unsigned comparison: the top bit of the expression below is the borrow
+  // produced by x - y, which is then moved down to bit 0.
+    return (unsigned int)((x ^ ((x ^ y) | ((x - y) ^ y))) >> (RADIX - 1));
+}
+
+/********************** Platform-independent macros for digit-size operations **********************/
+
+// Digit addition with carry
+// sumOut = addend1 + addend2 + carryIn (mod 2^RADIX); carryOut = 1 if that sum overflowed, else 0.
+// A carry is detected after each of the two partial additions and the two results are OR-ed.
+// Expands to a braced block, so it can only be used as a statement. Arguments are evaluated more than once.
+#define ADDC(sumOut, carryOut, addend1, addend2, carryIn)                                         \
+    { digit_t tempReg = (addend1) + (digit_t)(carryIn);                                           \
+    (sumOut) = (addend2) + tempReg;                                                               \
+    (carryOut) = (is_digit_lessthan_ct(tempReg, (digit_t)(carryIn)) | is_digit_lessthan_ct((sumOut), tempReg)); }
+
+// Digit subtraction with borrow
+// differenceOut = minuend - subtrahend - borrowIn (mod 2^RADIX); borrowOut = 1 if a borrow was needed, else 0.
+// A borrow occurs when minuend < subtrahend, or when the two are equal and borrowIn is set.
+// The borrow is computed into a temporary first, so borrowOut may be the same variable as borrowIn.
+// Expands to a braced block, so it can only be used as a statement. Arguments are evaluated more than once.
+#define SUBC(differenceOut, borrowOut, minuend, subtrahend, borrowIn)                             \
+    { digit_t tempReg = (minuend) - (subtrahend);                                                 \
+    unsigned int borrowReg = (is_digit_lessthan_ct((minuend), (subtrahend)) | ((borrowIn) & is_digit_zero_ct(tempReg)));  \
+    (differenceOut) = tempReg - (digit_t)(borrowIn);                                              \
+    (borrowOut) = borrowReg; }
+
+// Shift right with flexible datatype
+// shiftOut = low DigitSize bits of the double-width value (highIn:lowIn) >> shift.
+// Requires 0 < shift < DigitSize (shifting by DigitSize itself is undefined in C).
+// DigitSize is parenthesised so that an expression argument is not re-associated with the subtraction.
+// The expansion already ends in ';'.
+#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize)                                         \
+    (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << ((DigitSize) - (shift)));
+
+// Digit shift left
+// shiftOut = high DigitSize bits of the double-width value (highIn:lowIn) << shift.
+// Requires 0 < shift < DigitSize (shifting by DigitSize itself is undefined in C).
+// Uses the DigitSize argument, like SHIFTR, instead of silently hard-coding RADIX.
+// The expansion already ends in ';'.
+#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize)                                         \
+    (shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> ((DigitSize) - (shift)));
+
+#endif
--- a/src/gf/ref/lvl1/include/fp2.h
+++ b/src/gf/ref/lvl1/include/fp2.h
@@ -0,0 +1,29 @@
+#ifndef FP2_H
+#define FP2_H
+
+#include "fp.h"
+
+// Structure for representing elements in GF(p^2)
+// An element is re + im*i with i^2 = -1 (arithmetic modulo X^2 + 1).
+typedef struct fp2_t {
+    fp_t re, im;
+} fp2_t;
+
+// GF(p^2) interface. fp2_inv and fp2_sqrt work in place on their argument.
+void fp2_set(fp2_t* x, const digit_t val);                   // x = val + 0*i (no Montgomery conversion)
+bool fp2_is_zero(const fp2_t* a);                            // 1 if a = 0, 0 otherwise
+bool fp2_is_equal(const fp2_t* a, const fp2_t* b);           // 1 if a = b, 0 otherwise
+void fp2_copy(fp2_t* x, const fp2_t* y);                     // x = y
+fp2_t fp2_non_residue();                                     // returns 2 + i
+void fp2_add(fp2_t* x, const fp2_t* y, const fp2_t* z);      // x = y + z
+void fp2_sub(fp2_t* x, const fp2_t* y, const fp2_t* z);      // x = y - z
+void fp2_neg(fp2_t* x, const fp2_t* y);                      // x = -y
+void fp2_mul(fp2_t* x, const fp2_t* y, const fp2_t* z);      // x = y * z
+void fp2_sqr(fp2_t* x, const fp2_t* y);                      // x = y^2
+void fp2_inv(fp2_t* x);                                      // x = 1/x
+bool fp2_is_square(const fp2_t* x);                          // 1 if the norm re^2 + im^2 is a square in GF(p)
+void fp2_frob(fp2_t* x, const fp2_t* y);                     // x = re(y) - im(y)*i
+void fp2_sqrt(fp2_t* x);                                     // x = a square root of x (not constant time)
+void fp2_tomont(fp2_t* x, const fp2_t* y);                   // coordinate-wise fp_tomont
+void fp2_frommont(fp2_t* x, const fp2_t* y);                 // coordinate-wise fp_frommont
+int fp2_cmp(fp2_t* x, fp2_t* y);                             // +1 / 0 / -1, real part compared first
+
+#endif
--- a/src/gf/ref/lvl1/test/CMakeLists.txt
+++ b/src/gf/ref/lvl1/test/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Unit tests for the GF(p) and GF(p^2) arithmetic of the current variant.
+# Each executable takes "test <reps>"; add_test(NAME ... COMMAND ...) lets CMake resolve the
+# target name to the built binary, which the legacy add_test(<name> <command>) form does not do.
+add_executable(sqisign_test_gf_${SVARIANT_LOWER}_fp test_fp.c test_extras.c)
+target_link_libraries(sqisign_test_gf_${SVARIANT_LOWER}_fp PRIVATE ${LIB_GF_${SVARIANT_UPPER}})
+target_include_directories(sqisign_test_gf_${SVARIANT_LOWER}_fp PRIVATE ../include ${INC_COMMON} ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_PUBLIC})
+add_test(NAME sqisign_test_gf_${SVARIANT_LOWER}_fp COMMAND sqisign_test_gf_${SVARIANT_LOWER}_fp test ${SQISIGN_TEST_REPS})
+
+add_executable(sqisign_test_gf_${SVARIANT_LOWER}_fp2 test_fp2.c test_extras.c)
+target_link_libraries(sqisign_test_gf_${SVARIANT_LOWER}_fp2 PRIVATE ${LIB_GF_${SVARIANT_UPPER}})
+target_include_directories(sqisign_test_gf_${SVARIANT_LOWER}_fp2 PRIVATE ../include ${INC_COMMON} ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_PUBLIC})
+add_test(NAME sqisign_test_gf_${SVARIANT_LOWER}_fp2 COMMAND sqisign_test_gf_${SVARIANT_LOWER}_fp2 test ${SQISIGN_TEST_REPS})
--- a/src/gf/ref/lvl1/test/test_extras.c
+++ b/src/gf/ref/lvl1/test/test_extras.c
@@ -0,0 +1,74 @@
+#include "test_extras.h"
+#include <bench.h>
+
+// Global constants
+extern const digit_t p[NWORDS_FIELD];
+extern const digit_t R2[NWORDS_FIELD];
+
+#if 0
+int64_t cpucycles(void)
+{ // Access system counter for benchmarking
+    unsigned int hi, lo;
+
+    asm volatile ("rdtsc\n\t" : "=a" (lo), "=d"(hi));
+    return ((int64_t)lo) | (((int64_t)hi) << 32);
+}
+#endif
+
+
+int compare_words(digit_t* a, digit_t* b, unsigned int nwords)
+{ // Three-way comparison of two nwords-word integers, scanning from the most-significant word down.
+  // Returns 1 if a>b, 0 if a=b, -1 if a<b
+  // SECURITY NOTE: this function does not have constant-time execution. TO BE USED FOR TESTING ONLY.
+    unsigned int k = nwords;
+
+    while (k > 0) {
+        k--;
+        if (a[k] != b[k]) {
+            return (a[k] > b[k]) ? 1 : -1;
+        }
+    }
+
+    return 0; 
+}
+
+
+static void sub_test(digit_t* out, digit_t* a, digit_t* b, unsigned int nwords)
+{ // Multiprecision subtraction out = a-b, intended for a>b; the final borrow is discarded.
+  // Each input word is read before the matching output word is written, so out may alias a or b.
+  // SECURITY NOTE: this function does not have constant-time execution. It is for TESTING ONLY.     
+    digit_t borrow_in = 0;
+    unsigned int k;
+
+    for (k = 0; k < nwords; k++)
+    {
+        const digit_t ak = a[k];
+        const digit_t bk = b[k];
+        const digit_t diff = ak - bk;
+
+        out[k] = diff - borrow_in;
+        borrow_in = (ak < bk) || (diff < borrow_in);
+    } 
+}
+
+
+void fprandom_test(digit_t* a)
+{ // Generating a pseudo-random field element in [0, p-1] 
+  // Fills a with bytes from rand(), clears the top (256-254) bits of the last word,
+  // then subtracts p for as long as the value is not below p.
+  // SECURITY NOTE: distribution is not fully uniform. TO BE USED FOR TESTING ONLY.
+    const unsigned int nwords = NWORDS_FIELD;
+    const unsigned int spare_bits = 256-254;
+    unsigned char* bytes = (unsigned char*)a;
+    unsigned int k;
+
+    for (k = 0; k < sizeof(digit_t)*nwords; k++) {
+        bytes[k] = (unsigned char)rand();                   // one byte per rand() call
+    }
+    a[nwords-1] &= (((digit_t)(-1) << spare_bits) >> spare_bits);
+
+    while (compare_words((digit_t*)p, a, nwords) <= 0) {    // p <= a: bring it into [0, modulus-1]
+        sub_test(a, a, (digit_t*)p, nwords);
+    }
+}
+
+
+void fp2random_test(fp2_t* a)
+{ // Generating a pseudo-random element in GF(p^2) 
+  // Draws the real coordinate first and the imaginary coordinate second, each with fprandom_test.
+  // SECURITY NOTE: distribution is not fully uniform. TO BE USED FOR TESTING ONLY.
+
+    fprandom_test(a->re);
+    fprandom_test(a->im);
+}
--- a/src/gf/ref/lvl1/test/test_extras.h
+++ b/src/gf/ref/lvl1/test/test_extras.h
@@ -0,0 +1,25 @@
+
+#ifndef TEST_EXTRAS_H
+#define TEST_EXTRAS_H
+
+#include <time.h>
+#include <stdlib.h>
+#include "../include/fp.h"
+#include "../include/fp2.h"
+
+#define PASSED    0
+#define FAILED    1
+    
+// Access system counter for benchmarking
+//int64_t cpucycles(void);
+
+// Comparing "nwords" words of a and b, most-significant word first: returns 1 if a>b, 0 if a=b, -1 if a<b
+int compare_words(digit_t* a, digit_t* b, unsigned int nwords);
+
+// Generating a pseudo-random field element in [0, p-1] 
+void fprandom_test(digit_t* a);
+
+// Generating a pseudo-random element in GF(p^2)
+void fp2random_test(fp2_t* a);
+
+#endif
--- a/src/gf/ref/lvl1/test/test_fp.c
+++ b/src/gf/ref/lvl1/test/test_fp.c
@@ -0,0 +1,295 @@
+#include "test_extras.h"
+#include <stdio.h>
+#include <string.h>
+#include <bench.h>
+
+// Global constants
+// p is only declared here; its definition lives in another translation unit
+extern const digit_t p[NWORDS_FIELD];
+
+// Benchmark and test parameters  
+// Defaults only: main() overwrites the relevant counter with the <reps> command-line argument
+static int BENCH_LOOPS = 100000;       // Number of iterations per bench
+static int TEST_LOOPS  = 100000;       // Number of iterations per test
+
+
+bool fp_test()
+{ // Tests for the field arithmetic
+  // Runs TEST_LOOPS randomized iterations per operation and checks algebraic identities.
+  // Returns true if every group of tests passes; prints FAILED and returns false at the
+  // first group that fails.
+    bool OK = true;
+    int n, passed;
+    fp_t a, b, c, d, e, f, ma, mb, mc, md, me, mf;     // m* hold Montgomery-form values
+
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
+    printf("Testing field arithmetic over GF(p): \n\n"); 
+
+    // Field addition
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fprandom_test(a); fprandom_test(b); fprandom_test(c); fprandom_test(d); 
+
+        fp_add(d, a, b); fp_add(e, d, c);                 // e = (a+b)+c
+        fp_add(d, b, c); fp_add(f, d, a);                 // f = a+(b+c)
+        if (compare_words(e, f, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_add(d, a, b);                                  // d = a+b 
+        fp_add(e, b, a);                                  // e = b+a
+        if (compare_words(d, e, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_set(b, 0);
+        fp_add(d, a, b);                                  // d = a+0 
+        if (compare_words(a, d, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_set(b, 0);   
+        fp_neg(d, a);                      
+        fp_add(e, a, d);                                  // e = a+(-a)
+        if (compare_words(e, b, NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p) addition tests ............................................ PASSED");
+    else { printf("  GF(p) addition tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Field subtraction
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fprandom_test(a); fprandom_test(b); fprandom_test(c); fprandom_test(d);
+
+        fp_sub(d, a, b); fp_sub(e, d, c);                 // e = (a-b)-c
+        fp_add(d, b, c); fp_sub(f, a, d);                 // f = a-(b+c)
+        if (compare_words(e, f, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_sub(d, a, b);                                  // d = a-b 
+        fp_sub(e, b, a);
+        fp_neg(e, e);                                     // e = -(b-a)
+        if (compare_words(d, e, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_set(b, 0);
+        fp_sub(d, a, b);                                  // d = a-0 
+        if (compare_words(a, d, NWORDS_FIELD)!=0) { passed=0; break; }
+        
+        fp_set(b, 0);              
+        fp_sub(e, a, a);                                  // e = a-a
+        if (compare_words(e, b, NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p) subtraction tests ......................................... PASSED");
+    else { printf("  GF(p) subtraction tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Field multiplication
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {    
+        fprandom_test(a); fprandom_test(b); fprandom_test(c);
+        
+        // Montgomery round trip. The result goes into the scratch element d (not c) so that
+        // c keeps its independent random value for the associativity/distributivity checks below.
+        fp_tomont(ma, a);
+        fp_frommont(d, ma);
+        if (compare_words(a, d, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_tomont(ma, a); fp_tomont(mb, b); fp_tomont(mc, c);
+        fp_mul(md, ma, mb); fp_mul(me, md, mc);                          // e = (a*b)*c
+        fp_mul(md, mb, mc); fp_mul(mf, md, ma);                          // f = a*(b*c)
+        fp_frommont(e, me);
+        fp_frommont(f, mf);
+        if (compare_words(e, f, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_tomont(ma, a); fp_tomont(mb, b); fp_tomont(mc, c); 
+        fp_add(md, mb, mc); fp_mul(me, ma, md);                          // e = a*(b+c)
+        fp_mul(md, ma, mb); fp_mul(mf, ma, mc); fp_add(mf, md, mf);      // f = a*b+a*c
+        fp_frommont(e, me);
+        fp_frommont(f, mf);
+        if (compare_words(e, f, NWORDS_FIELD)!=0) { passed=0; break; }
+     
+        fp_tomont(ma, a); fp_tomont(mb, b);
+        fp_mul(md, ma, mb);                                              // d = a*b 
+        fp_mul(me, mb, ma);                                              // e = b*a 
+        fp_frommont(d, md);
+        fp_frommont(e, me);
+        if (compare_words(d, e, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_tomont(ma, a);
+        fp_set(b, 1); fp_tomont(mb, b);
+        fp_mul(md, ma, mb);                                              // d = a*1  
+        fp_frommont(a, ma);
+        fp_frommont(d, md);                
+        if (compare_words(a, d, NWORDS_FIELD)!=0) { passed=0; break; }
+       
+        fp_set(b, 0);
+        fp_tomont(mb, b);
+        fp_mul(md, ma, mb);                                              // d = a*0 
+        fp_frommont(d, md);                
+        if (compare_words(b, d, NWORDS_FIELD)!=0) { passed=0; break; } 
+    }
+    if (passed==1) printf("  GF(p) multiplication tests ...................................... PASSED");
+    else { printf("  GF(p) multiplication tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Field squaring
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fprandom_test(a);
+        
+        fp_tomont(ma, a);
+        fp_sqr(mb, ma);                                   // b = a^2
+        fp_mul(mc, ma, ma);                               // c = a*a 
+        fp_frommont(b, mb);
+        fp_frommont(c, mc);
+        if (compare_words(b, c, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_set(a, 0); fp_tomont(ma, a);
+        fp_sqr(md, ma);                                   // d = 0^2 
+        if (compare_words(ma, md, NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p) squaring tests............................................. PASSED");
+    else { printf("  GF(p) squaring tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Field inversion
+    passed = 1;
+    for (n = 0; n < TEST_LOOPS; n++)
+    {
+        fprandom_test(a);
+
+        fp_tomont(ma, a);
+        fp_set(d, 1);
+        memcpy(mb, ma, RADIX/8 * NWORDS_FIELD);           // keep a copy: fp_inv works in place
+        fp_inv(ma);
+        fp_mul(mc, ma, mb);                               // c = a*a^-1 
+        fp_frommont(c, mc);
+        if (compare_words(c, d, NWORDS_FIELD) != 0) { passed = 0; break; }
+
+        fp_set(a, 0);
+        fp_set(d, 0);
+        fp_inv(a);                                        // a = 0^-1, expected to stay 0
+        if (compare_words(a, d, NWORDS_FIELD) != 0) { passed = 0; break; }
+    }
+    if (passed == 1) printf("  GF(p) inversion tests............................................ PASSED");
+    else { printf("  GF(p) inversion tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Square root and square detection
+    passed = 1;
+    for (n = 0; n < TEST_LOOPS; n++)
+    {
+        fprandom_test(a);
+
+        fp_tomont(ma, a);
+        fp_sqr(mc, ma);
+        fp_frommont(c, mc);                               // c = a^2
+        if (fp_is_square(mc) != 1) { passed = 0; break; }
+
+        fp_sqrt(mc);                                      // c = sqrt(a^2), i.e. a or -a
+        fp_neg(md, mc);
+        fp_frommont(c, mc);
+        fp_frommont(d, md);
+        // Fail only if a matches neither root
+        if ((compare_words(a, c, NWORDS_FIELD) != 0) && (compare_words(a, d, NWORDS_FIELD) != 0)) { passed = 0; break; }
+    }
+    if (passed == 1) printf("  Square root, square tests........................................ PASSED");
+    else { printf("  Square root, square tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+ 
+    return OK;
+}
+
+bool fp_run()
+{ // Benchmarks for the field arithmetic
+  // Times BENCH_LOOPS calls of each operation with cpucycles() and prints the average.
+  // Returns true on completion, false if BENCH_LOOPS is not positive.
+    bool OK = true;
+    int n;
+    unsigned long long cycles, cycles1, cycles2, iters;
+    fp_t a, b, c;
+
+    if (BENCH_LOOPS <= 0) {             // the averages below divide by the iteration count
+        printf("Number of benchmark iterations must be positive\n");
+        return false;
+    }
+    iters = (unsigned long long)BENCH_LOOPS;
+        
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
+    printf("Benchmarking field arithmetic: \n\n"); 
+        
+    fprandom_test(a); fprandom_test(b); fprandom_test(c);
+
+    // GF(p) addition
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        fp_add(c, a, b);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    printf("  GF(p) addition runs in .......................................... %7llu cycles", cycles/iters);
+    printf("\n");
+
+    // GF(p) subtraction
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        fp_sub(c, a, b);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    printf("  GF(p) subtraction runs in ....................................... %7llu cycles", cycles/iters);
+    printf("\n");
+
+    // GF(p) multiplication
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        fp_mul(c, a, b);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    printf("  GF(p) multiplication runs in .................................... %7llu cycles", cycles/iters);
+    printf("\n");
+
+    // GF(p) inversion (in place: each iteration inverts the previous result)
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp_inv(a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  GF(p) inversion runs in ......................................... %7llu cycles", cycles/iters);
+    printf("\n");
+
+    // GF(p) square root (in place)
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp_sqrt(a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  GF(p) square root runs in ....................................... %7llu cycles", cycles/iters);
+    printf("\n");
+
+    // Square checking
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp_is_square(a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  Square checking runs in ......................................... %7llu cycles", cycles/iters);
+    printf("\n");
+
+    return OK;
+}
+
+int main(int argc, char* argv[])
+{ // Usage: <program> test <reps>   runs the GF(p) self-tests with <reps> iterations per test
+  //        <program> bench <reps>  runs the GF(p) benchmarks with <reps> iterations per bench
+  // Returns 0 on success, non-zero on test failure or invalid usage.
+    int reps;
+
+    if (argc < 3) {
+        printf("Please enter an argument: 'test' or 'bench' and <reps>\n");
+        exit(1);
+    }
+
+    // Reject zero, negative or non-numeric counts: zero iterations would make every test
+    // pass vacuously and would make the benchmark averages divide by zero.
+    reps = atoi(argv[2]);
+    if (reps <= 0) {
+        printf("<reps> must be a positive integer\n");
+        exit(1);
+    }
+
+    if (!strcmp(argv[1], "test")) {
+        TEST_LOOPS = reps;
+        return !fp_test();
+    } else if (!strcmp(argv[1], "bench")) {
+        BENCH_LOOPS = reps;
+        return !fp_run();
+    } else {
+        // Unknown mode: report the usage instead of exiting silently
+        printf("Please enter an argument: 'test' or 'bench' and <reps>\n");
+        exit(1);
+    }
+}
--- a/src/gf/ref/lvl1/test/test_fp2.c
+++ b/src/gf/ref/lvl1/test/test_fp2.c
@@ -0,0 +1,307 @@
+#include "test_extras.h"
+#include <stdio.h>
+#include <string.h>
+#include <bench.h>
+
+// Global constants
+// p is only declared here; its definition lives in another translation unit
+extern const digit_t p[NWORDS_FIELD];
+
+// Benchmark and test parameters  
+// Defaults only: main() overwrites the relevant counter with the <reps> command-line argument
+static int BENCH_LOOPS = 100000;       // Number of iterations per bench
+static int TEST_LOOPS  = 100000;       // Number of iterations per test
+
+
+bool fp2_test()
+{ // Tests for the GF(p^2) arithmetic
+  // Runs TEST_LOOPS randomized iterations per operation and checks algebraic identities.
+  // Returns true if every group of tests passes; prints FAILED and returns false at the
+  // first group that fails.
+    bool OK = true;
+    int n, passed;
+    fp2_t a, b, c, d, e, f, ma, mb, mc, md, me, mf;    // m* hold Montgomery-form values
+
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
+    printf("Testing arithmetic over GF(p^2): \n\n"); 
+
+    // Addition in GF(p^2)
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fp2random_test(&a); fp2random_test(&b); fp2random_test(&c); fp2random_test(&d); 
+
+        fp2_add(&d, &a, &b); fp2_add(&e, &d, &c);                 // e = (a+b)+c
+        fp2_add(&d, &b, &c); fp2_add(&f, &d, &a);                 // f = a+(b+c)
+        if (compare_words((digit_t*)&e, (digit_t*)&f, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_add(&d, &a, &b);                                      // d = a+b 
+        fp2_add(&e, &b, &a);                                      // e = b+a
+        if (compare_words((digit_t*)&d, (digit_t*)&e, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_set(&b, 0);
+        fp2_add(&d, &a, &b);                                      // d = a+0 
+        if (compare_words((digit_t*)&a, (digit_t*)&d, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_set(&b, 0);   
+        fp2_neg(&d, &a);                      
+        fp2_add(&e, &a, &d);                                      // e = a+(-a)
+        if (compare_words((digit_t*)&e, (digit_t*)&b, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p^2) addition tests ............................................ PASSED");
+    else { printf("  GF(p^2) addition tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+    
+    // Subtraction in GF(p^2)
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fp2random_test(&a); fp2random_test(&b); fp2random_test(&c); fp2random_test(&d);
+
+        fp2_sub(&d, &a, &b); fp2_sub(&e, &d, &c);                 // e = (a-b)-c
+        fp2_add(&d, &b, &c); fp2_sub(&f, &a, &d);                 // f = a-(b+c)
+        if (compare_words((digit_t*)&e, (digit_t*)&f, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_sub(&d, &a, &b);                                      // d = a-b 
+        fp2_sub(&e, &b, &a);
+        fp2_neg(&e, &e);                                          // e = -(b-a)
+        if (compare_words((digit_t*)&d, (digit_t*)&e, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_set(&b, 0);
+        fp2_sub(&d, &a, &b);                                      // d = a-0 
+        if (compare_words((digit_t*)&a, (digit_t*)&d, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+        
+        fp2_set(&b, 0);              
+        fp2_sub(&e, &a, &a);                                      // e = a-a
+        if (compare_words((digit_t*)&e, (digit_t*)&b, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p^2) subtraction tests ......................................... PASSED");
+    else { printf("  GF(p^2) subtraction tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Multiplication in GF(p^2)
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {    
+        fp2random_test(&a); fp2random_test(&b); fp2random_test(&c);
+        
+        // Montgomery round trip. The result goes into the scratch element d (not c) so that
+        // c keeps its independent random value for the associativity/distributivity checks below.
+        fp2_tomont(&ma, &a);
+        fp2_frommont(&d, &ma);
+        if (compare_words((digit_t*)&a, (digit_t*)&d, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_tomont(&ma, &a); fp2_tomont(&mb, &b); fp2_tomont(&mc, &c);
+        fp2_mul(&md, &ma, &mb); fp2_mul(&me, &md, &mc);                          // e = (a*b)*c
+        fp2_mul(&md, &mb, &mc); fp2_mul(&mf, &md, &ma);                          // f = a*(b*c)
+        fp2_frommont(&e, &me);
+        fp2_frommont(&f, &mf);
+        if (compare_words((digit_t*)&e, (digit_t*)&f, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_tomont(&ma, &a); fp2_tomont(&mb, &b); fp2_tomont(&mc, &c); 
+        fp2_add(&md, &mb, &mc); fp2_mul(&me, &ma, &md);                          // e = a*(b+c)
+        fp2_mul(&md, &ma, &mb); fp2_mul(&mf, &ma, &mc); fp2_add(&mf, &md, &mf);  // f = a*b+a*c
+        fp2_frommont(&e, &me);
+        fp2_frommont(&f, &mf);
+        if (compare_words((digit_t*)&e, (digit_t*)&f, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+     
+        fp2_tomont(&ma, &a); fp2_tomont(&mb, &b);
+        fp2_mul(&md, &ma, &mb);                                                  // d = a*b 
+        fp2_mul(&me, &mb, &ma);                                                  // e = b*a 
+        fp2_frommont(&d, &md);
+        fp2_frommont(&e, &me);
+        if (compare_words((digit_t*)&d, (digit_t*)&e, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_tomont(&ma, &a);
+        fp2_set(&b, 1); fp2_tomont(&mb, &b);
+        fp2_mul(&md, &ma, &mb);                                                  // d = a*1  
+        fp2_frommont(&a, &ma);
+        fp2_frommont(&d, &md);                
+        if (compare_words((digit_t*)&a, (digit_t*)&d, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+       
+        fp2_set(&b, 0);
+        fp2_tomont(&mb, &b);
+        fp2_mul(&md, &ma, &mb);                                                  // d = a*0 
+        fp2_frommont(&d, &md);                
+        if (compare_words((digit_t*)&b, (digit_t*)&d, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p^2) multiplication tests ...................................... PASSED");
+    else { printf("  GF(p^2) multiplication tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Squaring in GF(p^2)
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fp2random_test(&a);
+        
+        fp2_tomont(&ma, &a);
+        fp2_sqr(&mb, &ma);                                          // b = a^2
+        fp2_mul(&mc, &ma, &ma);                                     // c = a*a 
+        fp2_frommont(&b, &mb);
+        fp2_frommont(&c, &mc);
+        if (compare_words((digit_t*)&b, (digit_t*)&c, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_set(&a, 0); fp2_tomont(&ma, &a);
+        fp2_sqr(&md, &ma);                                          // d = 0^2 
+        if (compare_words((digit_t*)&ma, (digit_t*)&md, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p^2) squaring tests............................................. PASSED");
+    else { printf("  GF(p^2) squaring tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+    
+    // Inversion in GF(p^2)
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fp2random_test(&a);
+
+        fp2_tomont(&ma, &a);
+        fp2_set(&d, 1);
+        memcpy(&mb, &ma, RADIX/8 * 2*NWORDS_FIELD);                 // keep a copy: fp2_inv works in place
+        fp2_inv(&ma);
+        fp2_mul(&mc, &ma, &mb);                                     // c = a*a^-1 
+        fp2_frommont(&c, &mc);
+        if (compare_words((digit_t*)&c, (digit_t*)&d, 2*NWORDS_FIELD) != 0) { passed = 0; break; }
+
+        fp2_set(&a, 0);
+        fp2_set(&d, 0);
+        fp2_inv(&a);                                                // a = 0^-1, expected to stay 0
+        if (compare_words((digit_t*)&a, (digit_t*)&d, 2*NWORDS_FIELD) != 0) { passed = 0; break; }
+    }
+    if (passed == 1) printf("  GF(p^2) inversion tests............................................ PASSED");
+    else { printf("  GF(p^2) inversion tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+    
+    // Square root and square detection in GF(p^2)
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fp2random_test(&a);
+
+        fp2_tomont(&ma, &a);
+        fp2_sqr(&mc, &ma);
+        fp2_frommont(&c, &mc);                                      // c = a^2
+        if (fp2_is_square(&mc) != 1) { passed = 0; break; }        
+
+        fp2_sqrt(&mc);                                              // c = sqrt(a^2), i.e. a or -a
+        fp2_neg(&md, &mc);
+        fp2_frommont(&c, &mc);
+        fp2_frommont(&d, &md);
+        // Fail only if a matches neither root (logical AND, matching the GF(p) test)
+        if ((compare_words((digit_t*)&a, (digit_t*)&c, 2*NWORDS_FIELD) != 0) && (compare_words((digit_t*)&a, (digit_t*)&d, 2*NWORDS_FIELD) != 0)) { passed = 0; break; }
+    }
+    if (passed == 1) printf("  Square root, square tests.......................................... PASSED");
+    else { printf("  Square root, square tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    return OK;
+}
+
+bool fp2_run()
+{ // Benchmarks for the GF(p^2) arithmetic
+  // Times BENCH_LOOPS calls of each operation with cpucycles() and prints the average.
+  // Returns true on completion, false if BENCH_LOOPS is not positive.
+    bool OK = true;
+    int n;
+    unsigned long long cycles, cycles1, cycles2, iters;
+    fp2_t a, b, c;
+
+    if (BENCH_LOOPS <= 0) {             // the averages below divide by the iteration count
+        printf("Number of benchmark iterations must be positive\n");
+        return false;
+    }
+    iters = (unsigned long long)BENCH_LOOPS;
+        
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
+    printf("Benchmarking arithmetic over GF(p^2): \n\n"); 
+        
+    fp2random_test(&a); fp2random_test(&b); fp2random_test(&c);
+
+    // GF(p^2) addition
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        fp2_add(&c, &a, &b);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    printf("  GF(p^2) addition runs in .......................................... %7llu cycles", cycles/iters);
+    printf("\n");
+
+    // GF(p^2) subtraction
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        fp2_sub(&c, &a, &b);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    printf("  GF(p^2) subtraction runs in ....................................... %7llu cycles", cycles/iters);
+    printf("\n");
+
+    // GF(p^2) squaring
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp2_sqr(&c, &a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  GF(p^2) squaring runs in .......................................... %7llu cycles", cycles/iters);
+    printf("\n");
+
+    // GF(p^2) multiplication
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        fp2_mul(&c, &a, &b);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    printf("  GF(p^2) multiplication runs in .................................... %7llu cycles", cycles/iters);
+    printf("\n");
+
+    // GF(p^2) inversion (in place: each iteration inverts the previous result)
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp2_inv(&a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  GF(p^2) inversion runs in ......................................... %7llu cycles", cycles/iters);
+    printf("\n");
+
+    // GF(p^2) square root (in place)
+    cycles = 0;
+    for (n = 0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp2_sqrt(&a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  GF(p^2) square root runs in ....................................... %7llu cycles", cycles/iters);
+    printf("\n");
+
+    // Square checking
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp2_is_square(&a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  Square checking runs in ........................................... %7llu cycles", cycles/iters);
+    printf("\n");
+
+    return OK;
+}
+
+int main(int argc, char* argv[])
+{ // Usage: <program> test <reps>   runs the GF(p^2) self-tests with <reps> iterations per test
+  //        <program> bench <reps>  runs the GF(p^2) benchmarks with <reps> iterations per bench
+  // Returns 0 on success, non-zero on test failure or invalid usage.
+    int reps;
+
+    if (argc < 3) {
+        printf("Please enter an argument: 'test' or 'bench' and <reps>\n");
+        exit(1);
+    }
+
+    // Reject zero, negative or non-numeric counts: zero iterations would make every test
+    // pass vacuously and would make the benchmark averages divide by zero.
+    reps = atoi(argv[2]);
+    if (reps <= 0) {
+        printf("<reps> must be a positive integer\n");
+        exit(1);
+    }
+
+    if (!strcmp(argv[1], "test")) {
+        TEST_LOOPS = reps;
+        return !fp2_test();
+    } else if (!strcmp(argv[1], "bench")) {
+        BENCH_LOOPS = reps;
+        return !fp2_run();
+    } else {
+        // Unknown mode: report the usage instead of exiting silently
+        printf("Please enter an argument: 'test' or 'bench' and <reps>\n");
+        exit(1);
+    }
+}
--- a/src/gf/ref/lvl3/CMakeLists.txt
+++ b/src/gf/ref/lvl3/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+# Sources of the reference GF arithmetic library for this variant.
+set(SOURCE_FILES_GF_${SVARIANT_UPPER}_REF
+    fp_p47441.c fp.c fp2.c
+)
+
+# The library target name is supplied by the including scope via LIB_GF_<VARIANT>.
+add_library(${LIB_GF_${SVARIANT_UPPER}} ${SOURCE_FILES_GF_${SVARIANT_UPPER}_REF})
+
+# Each include directory is listed once (INC_COMMON previously appeared twice).
+target_include_directories(${LIB_GF_${SVARIANT_UPPER}}
+    PRIVATE
+        common
+        ${INC_COMMON}
+        ${INC_PRECOMP_${SVARIANT_UPPER}}
+        include
+        ${PROJECT_SOURCE_DIR}/include
+)
+target_compile_options(${LIB_GF_${SVARIANT_UPPER}} PRIVATE ${C_OPT_FLAGS})
+
+add_subdirectory(test)
--- a/src/gf/ref/lvl3/Makefile
+++ b/src/gf/ref/lvl3/Makefile
@@ -0,0 +1,43 @@
+
+# Standalone build of the GF arithmetic objects, a static test library and the two test programs.
+CC=gcc
+CFLAGS= -O3 -std=gnu11 -Wall -march=native -Wno-missing-braces -Wno-logical-not-parentheses
+LDFLAGS=-lm
+AR=ar rcs
+RANLIB=ranlib
+
+OBJECTS=objs/fp_p47441.o objs/fp.o objs/fp2.o objs/random.o
+
+all: lib tests
+
+# Every object rule creates objs/ itself so that it also works when built first or under make -j.
+objs/fp_p47441.o: fp_p47441.c
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) fp_p47441.c -o objs/fp_p47441.o
+
+objs/fp.o: fp.c
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) fp.c -o objs/fp.o
+
+objs/fp2.o: fp2.c
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) fp2.c -o objs/fp2.o
+
+objs/random.o: ../../../common/generic/randombytes_system.c
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) ../../../common/generic/randombytes_system.c -o objs/random.o
+
+# "lib" is deliberately not phony: the recipe creates a directory of that name.
+lib: $(OBJECTS)
+	rm -rf lib
+	mkdir lib
+	$(AR) lib/libtest.a $^
+	$(RANLIB) lib/libtest.a
+
+tests: lib
+	$(CC) $(CFLAGS) -L./lib test/test_fp.c test/test_extras.c -ltest $(LDFLAGS) -o test_fp -lgmp
+	$(CC) $(CFLAGS) -L./lib test/test_fp2.c test/test_extras.c -ltest $(LDFLAGS) -o test_fp2 -lgmp
+
+check: tests
+
+# These targets do not produce a file with their own name.
+.PHONY: all tests check clean
+
+clean:
+	rm -rf *.req objs lib test_fp*
+
--- a/src/gf/ref/lvl3/fp.c
+++ b/src/gf/ref/lvl3/fp.c
@@ -0,0 +1,171 @@
+#include "include/fp.h"
+
+
+// Field constants, stored as arrays of 64-bit words with word 0 first.
+// p is the modulus used by fp_neg and fp_exp3div4 in this file.
+// R2 and pp are not referenced by the functions visible here; presumably R2 = R^2 mod p and
+// pp = -p^-1 mod 2^64 for the Montgomery routines - TODO confirm against their users.
+// NOTE(review): pp lists only four initialisers; any remaining words are zero-initialised.
+const uint64_t p[NWORDS_FIELD] =  { 0xFFFFFFFFFFFFFFFF, 0x4C6174C1FFFFFFFF, 0xC722F669356EA468, 0x65BC2E0A90AEB751, 0xC6AE604A45D10AD6, 0x03DF6EEEAB0871A2 };
+const uint64_t R2[NWORDS_FIELD] = { 0x47B3E8268664617E, 0xDC10C645BFE4A1AC, 0x342C8B98F26F21ED, 0x328905E465CD7DB3, 0x0AFEA5EB6EF0DA10, 0x0389174E2D56216F };
+const uint64_t pp[NWORDS_FIELD] = { 0x01, 0x00, 0x00, 0x00 };
+
+
+
+void fp_set(digit_t* x, const digit_t val)
+{ // Load a single-word value into a field element: x = val
+    unsigned int k;
+
+    // Zero words NWORDS_FIELD-1 down to 1, then store val in word 0
+    for (k = NWORDS_FIELD-1; k > 0; k--) {
+        x[k] = 0;
+    }
+    x[0] = val;
+}
+
+bool fp_is_equal(const digit_t* a, const digit_t* b)
+{ // Equality test for two field elements; the loop always visits every word
+  // Returns 1 (true) if a=b, 0 (false) otherwise
+    digit_t acc = 0;
+    unsigned int k = 0;
+
+    // Accumulate the word-wise XOR differences; acc stays 0 only if all words match
+    while (k < NWORDS_FIELD) {
+        acc |= a[k] ^ b[k];
+        k++;
+    }
+
+    return (bool)is_digit_zero_ct(acc);
+}
+
+bool fp_is_zero(const digit_t* a)
+{ // Zero test for a field element; the loop always visits every word
+  // Returns 1 (true) if a=0, 0 (false) otherwise
+    digit_t acc = 0;
+    unsigned int k = 0;
+
+    // OR all words together; acc stays 0 only if every word is 0
+    while (k < NWORDS_FIELD) {
+        acc |= a[k];
+        k++;
+    }
+
+    return (bool)is_digit_zero_ct(acc);
+}
+
+void fp_copy(digit_t* out, const digit_t* a)
+{ // Copy field element a into out (NWORDS_FIELD words of RADIX bits each)
+    const size_t nbytes = NWORDS_FIELD*RADIX/8;
+
+    memcpy(out, a, nbytes);
+}
+
+void fp_neg(digit_t* out, const digit_t* a)
+{ // Modular negation, out = -a mod p
+  // Input: a in [0, p-1] 
+  // Output: out in [0, p-1] 
+    unsigned int i, borrow = 0;
+
+    // Word-by-word pass over p and a through the SUBC macro, threading "borrow" between words
+    // (presumably computes out = p - a; SUBC is defined elsewhere - TODO confirm argument order)
+    for (i = 0; i < NWORDS_FIELD; i++) {
+        SUBC(out[i], borrow, ((digit_t*)p)[i], a[i], borrow);
+    }
+    // Follow-up field subtraction of p; presumably maps the result p (obtained when a = 0)
+    // back to 0 and leaves other values unchanged modulo p
+    fp_sub(out, out, (digit_t*)p);
+}
+
+void MUL(digit_t* out, const digit_t a, const digit_t b)
+{ // Digit multiplication, digit*digit -> 2-digit result 
+  // Inputs: a, b in [0, 2^w-1], where w is the computer wordsize 
+  // Output: out[0] holds the low digit and out[1] the high digit of a*b, 0 <= a*b <= (2^w-1)^2
+  // Method: split each operand into half-digits and combine the four partial products
+    register digit_t al, ah, bl, bh, temp;
+    digit_t albl, albh, ahbl, ahbh, res1, res2, res3, carry;
+    // mask_low selects the lower half of a digit, mask_high the upper half
+    digit_t mask_low = (digit_t)(-1) >> (sizeof(digit_t)*4), mask_high = (digit_t)(-1) << (sizeof(digit_t)*4);
+
+    al = a & mask_low;                        // Low part
+    ah = a >> (sizeof(digit_t)*4);            // High part
+    bl = b & mask_low;
+    bh = b >> (sizeof(digit_t)*4);
+
+    // The four half-digit partial products; each fits in one digit
+    albl = al * bl;
+    albh = al * bh;
+    ahbl = ah * bl;
+    ahbh = ah * bh;
+    out[0] = albl & mask_low;                 // out00
+
+    // Second half-digit column: upper half of albl plus the lower halves of the cross terms
+    res1 = albl >> (sizeof(digit_t)*4);
+    res2 = ahbl & mask_low;
+    res3 = albh & mask_low;
+    temp = res1 + res2 + res3;
+    carry = temp >> (sizeof(digit_t)*4);
+    out[0] ^= temp << (sizeof(digit_t)*4);    // out01   
+
+    // Third half-digit column: upper halves of the cross terms, lower half of ahbh, incoming carry
+    res1 = ahbl >> (sizeof(digit_t)*4);
+    res2 = albh >> (sizeof(digit_t)*4);
+    res3 = ahbh & mask_low;
+    temp = res1 + res2 + res3 + carry;
+    out[1] = temp & mask_low;                 // out10 
+    // The overflow of this column stays in the upper half of temp, already aligned with out11
+    carry = temp & mask_high;
+    out[1] ^= (ahbh & mask_high) + carry;     // out11
+}
+
+digit_t mp_shiftr(digit_t* x, const unsigned int shift, const unsigned int nwords)
+{ // Multiprecision right shift of x by "shift" bits, in place
+  // Returns bit 0 of x as it was before the shift
+    const digit_t lsb = x[0] & 1;
+    unsigned int k = 0;
+
+    // Each word is combined with its upper neighbour through the SHIFTR macro
+    while (k < nwords-1) {
+        SHIFTR(x[k+1], x[k], shift, x[k], RADIX);
+        k++;
+    }
+    x[nwords-1] >>= shift;      // top word has no upper neighbour
+
+    return lsb;
+}
+
+void mp_shiftl(digit_t* x, const unsigned int shift, const unsigned int nwords)
+{ // Multiprecision left shift of x by "shift" bits, in place
+    int k = nwords-1;
+
+    // Walk from the top word down; each word is combined with its lower neighbour via SHIFTL
+    while (k > 0) {
+        SHIFTL(x[k], x[k-1], shift, x[k], RADIX);
+        k--;
+    }
+    x[0] <<= shift;             // bottom word has no lower neighbour
+}
+
+static void fp_exp3div4(digit_t* out, const digit_t* a)
+{ // Fixed exponentiation out = a^((p-3)/4) mod p
+  // Input: a in [0, p-1] 
+  // Output: out in [0, p-1] 
+  // Requirement: p = 3(mod 4)
+  // Method: right-to-left square-and-multiply over the bits of p >> 2
+    fp_t exponent, base;
+
+    fp_copy(exponent, (const digit_t*)p);
+    fp_copy(base, a);
+
+    // exponent = p >> 2, done as two one-bit shifts
+    mp_shiftr(exponent, 1, NWORDS_FIELD);
+    mp_shiftr(exponent, 1, NWORDS_FIELD);
+
+    // Start the running product at 1 converted with fp_tomont
+    fp_set(out, 1);
+    fp_tomont(out, out);
+
+    for (int k = NWORDS_FIELD*RADIX-2; k > 0; k--) {
+        // mp_shiftr returns the lowest bit as it was before shifting, i.e. the current exponent bit
+        if (mp_shiftr(exponent, 1, NWORDS_FIELD) == 1) {
+            fp_mul(out, out, base);
+        }
+        fp_sqr(base, base);
+    }
+}
+
+void fp_inv(digit_t* a)
+{ // In-place modular inversion, a <- a^(p-2) mod p
+  // Input: a=xR in [0, p-1], where R = 2^(w*nwords), w is the computer wordsize and nwords is the number of words to represent p
+  // Output: a = x^-1*R mod p in [0, p-1]. It outputs 0 if the input does not have an inverse
+  // Requirement: Ceiling(Log(p)) < w*nwords
+    fp_t pw;
+    int k;
+
+    fp_exp3div4(pw, a);             // pw = a^((p-3)/4)
+    for (k = 0; k < 2; k++) {       // two squarings: pw = a^(p-3)
+        fp_sqr(pw, pw);
+    }
+    fp_mul(a, pw, a);               // a = a^(p-3) * a = a^(p-2)
+}
+
+bool fp_is_square(const digit_t* a)
+{ // Square test: checks whether a^((p-1)/2) equals 1
+  // Output: out = 0 (false), 1 (true)
+    fp_t pw, unity;
+
+    fp_set(unity, 1);
+
+    fp_exp3div4(pw, a);             // pw = a^((p-3)/4)
+    fp_sqr(pw, pw);                 // pw = a^((p-3)/2)
+    fp_mul(pw, pw, a);              // pw = a^((p-1)/2)
+    fp_frommont(pw, pw);            // converted with fp_frommont before comparing against plain 1
+
+    return fp_is_equal(pw, unity);
+}
+
+void fp_sqrt(digit_t* a)
+{ // In-place square root computation, a <- a^((p+1)/4) mod p
+    fp_t pw;
+
+    fp_exp3div4(pw, a);             // pw = a^((p-3)/4)
+    fp_mul(a, pw, a);               // a = a^((p-3)/4) * a = a^((p+1)/4)
+}
--- a/src/gf/ref/lvl3/fp2.c
+++ b/src/gf/ref/lvl3/fp2.c
@@ -0,0 +1,194 @@
+#include <fp2.h>
+
+extern const digit_t R[NWORDS_FIELD];
+
+/* Arithmetic modulo X^2 + 1 */
+
+void fp2_set(fp2_t* x, const digit_t val)
+{ // Set x = val + 0*i: the real part is set to the single digit val, the imaginary part to 0
+    fp_set(x->im, 0);
+    fp_set(x->re, val);
+}
+
+bool fp2_is_zero(const fp2_t* a)
+{ // Is a GF(p^2) element zero?
+  // Returns 1 (true) if both coordinates are zero, 0 (false) otherwise.
+  // Both coordinates are always examined (no short-circuit).
+    const bool re_zero = fp_is_zero(a->re);
+    const bool im_zero = fp_is_zero(a->im);
+
+    return re_zero & im_zero;
+}
+
+bool fp2_is_equal(const fp2_t* a, const fp2_t* b)
+{ // Compare two GF(p^2) elements coordinate by coordinate
+  // Returns 1 (true) if a=b, 0 (false) otherwise.
+  // Both coordinates are always compared (no short-circuit).
+    const bool re_same = fp_is_equal(a->re, b->re);
+    const bool im_same = fp_is_equal(a->im, b->im);
+
+    return re_same & im_same;
+}
+
+void fp2_copy(fp2_t* x, const fp2_t* y)
+{ // x = y, copied one coordinate at a time
+    fp_copy(x->im, y->im);
+    fp_copy(x->re, y->re);
+}
+
+fp2_t fp2_non_residue()
+{ // Returns 6 + i in Montgomery form; 6 + i is a quadratic non-residue for p47441
+    fp_t acc = {0};
+    fp2_t nr;
+
+    acc[0] = 1;
+    fp_tomont(acc, acc);                // acc = 1
+    fp_copy(nr.im, acc);                // imaginary part = 1
+    fp_add(acc, acc, acc);              // acc = 2
+    fp_add(nr.re, acc, acc);            // real part = 4
+    fp_add(nr.re, nr.re, acc);          // real part = 4 + 2 = 6
+    return nr;
+}
+
+void fp2_add(fp2_t* x, const fp2_t* y, const fp2_t* z)
+{ // x = y + z, coordinate-wise
+    fp_add(x->im, y->im, z->im);
+    fp_add(x->re, y->re, z->re);
+}
+
+void fp2_sub(fp2_t* x, const fp2_t* y, const fp2_t* z)
+{ // x = y - z, coordinate-wise
+    fp_sub(x->im, y->im, z->im);
+    fp_sub(x->re, y->re, z->re);
+}
+
+void fp2_neg(fp2_t* x, const fp2_t* y)
+{ // x = -y, coordinate-wise
+    fp_neg(x->im, y->im);
+    fp_neg(x->re, y->re);
+}
+
+void fp2_mul(fp2_t* x, const fp2_t* y, const fp2_t* z)
+{ // x = y * z modulo X^2 + 1, using three base-field multiplications:
+  //   re = y.re*z.re - y.im*z.im
+  //   im = (y.re + y.im)*(z.re + z.im) - y.re*z.re - y.im*z.im
+  // The inputs are fully consumed into temporaries before x is written in a way that matters,
+  // so the statement order below must be kept for in-place use (x == y or x == z).
+    fp_t cross, im_prod;
+
+    fp_add(cross, y->re, y->im);
+    fp_add(im_prod, z->re, z->im);
+    fp_mul(cross, cross, im_prod);          // (y.re + y.im)*(z.re + z.im)
+    fp_mul(im_prod, y->im, z->im);          // y.im*z.im
+    fp_mul(x->re, y->re, z->re);            // y.re*z.re
+    fp_sub(x->im, cross, im_prod);
+    fp_sub(x->im, x->im, x->re);            // cross - y.im*z.im - y.re*z.re
+    fp_sub(x->re, x->re, im_prod);          // y.re*z.re - y.im*z.im
+}
+
+void fp2_sqr(fp2_t* x, const fp2_t* y)
+{ // x = y^2 modulo X^2 + 1:
+  //   re = (y.re + y.im)*(y.re - y.im)
+  //   im = 2*y.re*y.im
+  // The sum and difference are taken before x is written, so x may alias y.
+    fp_t plus, minus;
+
+    fp_add(plus, y->re, y->im);
+    fp_sub(minus, y->re, y->im);
+    fp_mul(x->im, y->re, y->im);
+    fp_add(x->im, x->im, x->im);            // double the cross term
+    fp_mul(x->re, plus, minus);
+}
+
+void fp2_inv(fp2_t* x)
+{ // In-place inversion: x <- conj(x) / (x.re^2 + x.im^2)
+    fp_t norm, im_sq;
+
+    fp_sqr(norm, x->re);
+    fp_sqr(im_sq, x->im);
+    fp_add(norm, norm, im_sq);              // norm = re^2 + im^2
+    fp_inv(norm);                           // norm <- 1/norm
+    fp_mul(x->re, x->re, norm);
+    fp_mul(x->im, x->im, norm);
+    fp_neg(x->im, x->im);                   // conjugate: flip the sign of the imaginary part
+}
+
+bool fp2_is_square(const fp2_t* x)
+{ // Square test in GF(p^2): reports whether the norm x.re^2 + x.im^2 passes fp_is_square
+    fp_t norm, im_sq;
+
+    fp_sqr(norm, x->re);
+    fp_sqr(im_sq, x->im);
+    fp_add(norm, norm, im_sq);
+
+    return fp_is_square(norm);
+}
+
+void fp2_frob(fp2_t* x, const fp2_t* y)
+{ // Conjugation x = y.re - y.im*i
+  // The real part is copied with fp_copy (as fp2_copy does) instead of memcpy, so that an
+  // in-place call (x == y) does not hand overlapping buffers to memcpy and no const
+  // qualifier has to be cast away.
+    fp_copy(x->re, y->re);
+    fp_neg(x->im, y->im);
+}
+
+void fp2_tomont(fp2_t* x, const fp2_t* y)
+{ // x = y with both coordinates converted to Montgomery form
+    fp_tomont(x->im, y->im);
+    fp_tomont(x->re, y->re);
+}
+
+void fp2_frommont(fp2_t* x, const fp2_t* y)
+{ // x = y with both coordinates converted out of Montgomery form
+    fp_frommont(x->im, y->im);
+    fp_frommont(x->re, y->re);
+}
+
+// NOTE: old, non-constant-time implementation (it branches on the input). Could be optimized
+// In-place square root in GF(p^2). For x = a + b*i the result r + s*i satisfies
+//   r^2 = (a +/- sqrt(a^2 + b^2)) / 2   and   s = b / (2*r),
+// where the sign is chosen so that r^2 is a square in GF(p).
+void fp2_sqrt(fp2_t* x)
+{
+    fp_t norm_root, real, im_sq, probe, half, imag;
+
+    // Purely real input: the root is either real or purely imaginary
+    if (fp_is_zero(x->im)) {
+        if (!fp_is_square(x->re)) {
+            fp_neg(x->im, x->re);           // sqrt(a) = i*sqrt(-a) when a is not a square
+            fp_sqrt(x->im);
+            fp_set(x->re, 0);
+        } else {
+            fp_sqrt(x->re);
+        }
+        return;
+    }
+
+    // norm_root = sqrt(re^2 + im^2)
+    fp_sqr(norm_root, x->re);
+    fp_sqr(im_sq, x->im);
+    fp_add(norm_root, norm_root, im_sq);
+    fp_sqrt(norm_root);
+
+    // half = 1/2
+    fp_set(half, 2);
+    fp_tomont(half, half);
+    fp_inv(half);
+
+    // first candidate for r^2: (re + norm_root)/2
+    fp_add(real, x->re, norm_root);
+    fp_mul(real, real, half);
+    memcpy((digit_t*)probe, (digit_t*)real, NWORDS_FIELD*RADIX/8);
+
+    if (!fp_is_square(probe)) {
+        // fall back to (re - norm_root)/2
+        fp_sub(real, x->re, norm_root);
+        fp_mul(real, real, half);
+    }
+
+    fp_sqrt(real);                          // r
+    memcpy((digit_t*)imag, (digit_t*)real, NWORDS_FIELD*RADIX/8);
+
+    fp_inv(imag);                           // 1/r
+    fp_mul(imag, imag, half);               // 1/(2r)
+    fp_mul(x->im, imag, x->im);             // s = im/(2r)
+    memcpy((digit_t*)x->re, (digit_t*)real, NWORDS_FIELD*RADIX/8);
+}
+
+// Lexicographic comparison of two GF(p^2) elements: the real parts are compared first, the
+// imaginary parts only if the real parts agree. Both inputs are taken out of Montgomery form
+// before comparing. Returns +1 if x > y, -1 if x < y, 0 if x = y. Returns early at the first
+// differing digit.
+int fp2_cmp(fp2_t* x, fp2_t* y){
+    fp2_t u, v;
+    fp2_frommont(&u, x);
+    fp2_frommont(&v, y);
+    const digit_t* lhs[2] = { u.re, u.im };
+    const digit_t* rhs[2] = { v.re, v.im };
+    for(int part = 0; part < 2; part++){
+        // scan from the most significant digit downwards
+        for(int i = NWORDS_FIELD-1; i >= 0; i--){
+            if(lhs[part][i] != rhs[part][i])
+                return (lhs[part][i] > rhs[part][i]) ? 1 : -1;
+        }
+    }
+    return 0;
+}
--- a/src/gf/ref/lvl3/fp_p47441.c
+++ b/src/gf/ref/lvl3/fp_p47441.c
--- a/src/gf/ref/lvl3/include/fp.h
+++ b/src/gf/ref/lvl3/include/fp.h
@@ -0,0 +1,76 @@
+#ifndef FP_H
+#define FP_H
+
+//////////////////////////////////////////////// NOTE: this is placed here for now
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
+#include <tutil.h>
+#include <fp_constants.h>
+
+typedef digit_t fp_t[NWORDS_FIELD];  // Datatype for representing field elements
+
+void fp_set(digit_t* x, const digit_t val);
+bool fp_is_equal(const digit_t* a, const digit_t* b);
+bool fp_is_zero(const digit_t* a);
+void fp_copy(digit_t* out, const digit_t* a);
+digit_t mp_shiftr(digit_t* x, const unsigned int shift, const unsigned int nwords);
+void mp_shiftl(digit_t* x, const unsigned int shift, const unsigned int nwords);
+void fp_add(digit_t* out, const digit_t* a, const digit_t* b);
+void fp_sub(digit_t* out, const digit_t* a, const digit_t* b);
+void fp_neg(digit_t* out, const digit_t* a);
+void fp_sqr(digit_t* out, const digit_t* a);
+void fp_mul(digit_t* out, const digit_t* a, const digit_t* b);
+void MUL(digit_t* out, const digit_t a, const digit_t b);
+void fp_inv(digit_t* x);
+bool fp_is_square(const digit_t* a);
+void fp_sqrt(digit_t* a);
+void fp_tomont(digit_t* out, const digit_t* a);
+void fp_frommont(digit_t* out, const digit_t* a);
+void fp_mont_setone(digit_t* out);
+
+/********************** Constant-time unsigned comparisons ***********************/
+
+// The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise
+
+static inline unsigned int is_digit_nonzero_ct(digit_t x)
+{ // Is x != 0?
+  // Branch-free: for any non-zero x, either x or its two's-complement negation (0 - x) has
+  // the top bit set, so the top bit of (x | -x) is 1 exactly when x != 0.
+    return (unsigned int)((x | (0 - x)) >> (RADIX - 1));
+}
+
+static inline unsigned int is_digit_zero_ct(digit_t x)
+{ // Is x = 0?
+  // Logical complement of is_digit_nonzero_ct, obtained by flipping its 0/1 result.
+    return (unsigned int)(1 ^ is_digit_nonzero_ct(x));
+}
+
+static inline unsigned int is_digit_lessthan_ct(digit_t x, digit_t y)
+{ // Is x < y?
+  // Branch-free unsigned comparison: the top bit of the expression below is the borrow
+  // produced by the subtraction x - y, which is 1 exactly when x < y.
+    return (unsigned int)((x ^ ((x ^ y) | ((x - y) ^ y))) >> (RADIX - 1));
+}
+
+/********************** Platform-independent macros for digit-size operations **********************/
+
+// Digit addition with carry: sumOut = addend1 + addend2 + carryIn (mod 2^digit size)
+// carryOut is 1 if either of the two partial additions wrapped around, 0 otherwise; the
+// wrap-around is detected with is_digit_lessthan_ct rather than with a branch.
+// sumOut is written before carryOut is computed and is read back, so it must be an lvalue.
+// NOTE(review): carryIn is presumably always 0 or 1 -- confirm against callers.
+#define ADDC(sumOut, carryOut, addend1, addend2, carryIn)                                         \
+    { digit_t tempReg = (addend1) + (digit_t)(carryIn);                                           \
+    (sumOut) = (addend2) + tempReg;                                                               \
+    (carryOut) = (is_digit_lessthan_ct(tempReg, (digit_t)(carryIn)) | is_digit_lessthan_ct((sumOut), tempReg)); }
+
+// Digit subtraction with borrow: differenceOut = minuend - subtrahend - borrowIn (mod 2^digit size)
+// borrowOut is 1 if minuend < subtrahend, or if the two are equal and borrowIn is set; 0 otherwise.
+// The borrow is computed into a temporary before differenceOut is written, so differenceOut
+// may be the same lvalue as minuend or subtrahend.
+// NOTE(review): borrowIn is presumably always 0 or 1 -- confirm against callers.
+#define SUBC(differenceOut, borrowOut, minuend, subtrahend, borrowIn)                             \
+    { digit_t tempReg = (minuend) - (subtrahend);                                                 \
+    unsigned int borrowReg = (is_digit_lessthan_ct((minuend), (subtrahend)) | ((borrowIn) & is_digit_zero_ct(tempReg)));  \
+    (differenceOut) = tempReg - (digit_t)(borrowIn);                                              \
+    (borrowOut) = borrowReg; }
+
+// Shift right with flexible datatype: shiftOut = (lowIn >> shift) combined with the low
+// "shift" bits of highIn moved to the top of the digit. DigitSize is the digit width in bits.
+// shift must satisfy 0 < shift < DigitSize: a zero shift would shift highIn by the full
+// digit width, which is undefined behaviour in C.
+// The expansion already ends in ';'.
+#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize)                                         \
+    (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << (DigitSize - (shift)));
+
+// Digit shift left: shiftOut = (highIn << shift) combined with the top "shift" bits of lowIn
+// moved to the bottom of the digit. DigitSize is the digit width in bits and is now actually
+// used by the expansion (it previously hard-coded RADIX and ignored this argument).
+// shift must satisfy 0 < shift < DigitSize: a zero shift would shift lowIn by the full
+// digit width, which is undefined behaviour in C.
+// The expansion already ends in ';'.
+#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize)                                         \
+    (shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> ((DigitSize) - (shift)));
+
+#endif
--- a/src/gf/ref/lvl3/include/fp2.h
+++ b/src/gf/ref/lvl3/include/fp2.h
@@ -0,0 +1,29 @@
+#ifndef FP2_H
+#define FP2_H
+
+#include "fp.h"
+
+// Structure for representing elements in GF(p^2)
+// An element is re + im*i, stored as two base-field elements; the arithmetic in fp2.c
+// works modulo X^2 + 1.
+typedef struct fp2_t {
+    fp_t re, im;    // real and imaginary coordinates
+} fp2_t;
+
+void fp2_set(fp2_t* x, const digit_t val);
+bool fp2_is_zero(const fp2_t* a);
+bool fp2_is_equal(const fp2_t* a, const fp2_t* b);
+void fp2_copy(fp2_t* x, const fp2_t* y);
+fp2_t fp2_non_residue();
+void fp2_add(fp2_t* x, const fp2_t* y, const fp2_t* z);
+void fp2_sub(fp2_t* x, const fp2_t* y, const fp2_t* z);
+void fp2_neg(fp2_t* x, const fp2_t* y);
+void fp2_mul(fp2_t* x, const fp2_t* y, const fp2_t* z);
+void fp2_sqr(fp2_t* x, const fp2_t* y);
+void fp2_inv(fp2_t* x);
+bool fp2_is_square(const fp2_t* x);
+void fp2_frob(fp2_t* x, const fp2_t* y);
+void fp2_sqrt(fp2_t* x);
+void fp2_tomont(fp2_t* x, const fp2_t* y);
+void fp2_frommont(fp2_t* x, const fp2_t* y);
+int fp2_cmp(fp2_t* x, fp2_t* y);
+
+#endif
--- a/src/gf/ref/lvl3/test/CMakeLists.txt
+++ b/src/gf/ref/lvl3/test/CMakeLists.txt
@@ -0,0 +1,9 @@
+# GF(p) unit test / benchmark driver for this parameter set.
+# The test is registered with the NAME/COMMAND signature so that CTest resolves the
+# executable from the target itself instead of searching for a file of that name.
+add_executable(sqisign_test_gf_${SVARIANT_LOWER}_fp test_fp.c test_extras.c)
+target_link_libraries(sqisign_test_gf_${SVARIANT_LOWER}_fp PRIVATE ${LIB_GF_${SVARIANT_UPPER}})
+target_include_directories(sqisign_test_gf_${SVARIANT_LOWER}_fp PRIVATE ../include ${INC_COMMON} ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_PUBLIC})
+add_test(NAME sqisign_test_gf_${SVARIANT_LOWER}_fp COMMAND sqisign_test_gf_${SVARIANT_LOWER}_fp test ${SQISIGN_TEST_REPS})
+
+# GF(p^2) unit test / benchmark driver for this parameter set.
+add_executable(sqisign_test_gf_${SVARIANT_LOWER}_fp2 test_fp2.c test_extras.c)
+target_link_libraries(sqisign_test_gf_${SVARIANT_LOWER}_fp2 PRIVATE ${LIB_GF_${SVARIANT_UPPER}})
+target_include_directories(sqisign_test_gf_${SVARIANT_LOWER}_fp2 PRIVATE ../include ${INC_COMMON} ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_PUBLIC})
+add_test(NAME sqisign_test_gf_${SVARIANT_LOWER}_fp2 COMMAND sqisign_test_gf_${SVARIANT_LOWER}_fp2 test ${SQISIGN_TEST_REPS})
--- a/src/gf/ref/lvl3/test/test_extras.c
+++ b/src/gf/ref/lvl3/test/test_extras.c
@@ -0,0 +1,74 @@
+#include "test_extras.h"
+#include <bench.h>
+
+// Global constants
+extern const digit_t p[NWORDS_FIELD];
+extern const digit_t R2[NWORDS_FIELD];
+
+#if 0
+int64_t cpucycles(void)
+{ // Access system counter for benchmarking
+    unsigned int hi, lo;
+
+    asm volatile ("rdtsc\n\t" : "=a" (lo), "=d"(hi));
+    return ((int64_t)lo) | (((int64_t)hi) << 32);
+}
+#endif
+
+
+int compare_words(digit_t* a, digit_t* b, unsigned int nwords)
+{ // Compares two "nwords"-digit numbers, most significant digit (highest index) first
+  // Returns (1) a>b, (0) a=b, (-1) a<b
+  // SECURITY NOTE: this function does not have constant-time execution. TO BE USED FOR TESTING ONLY.
+    unsigned int idx = nwords;
+
+    while (idx-- > 0)
+    {
+        if (a[idx] != b[idx]) return (a[idx] > b[idx]) ? 1 : -1;
+    }
+
+    return 0;
+}
+
+
+static void sub_test(digit_t* out, digit_t* a, digit_t* b, unsigned int nwords)
+{ // Multiprecision subtraction out = a-b, intended for a>b so that no final borrow remains
+  // Both input digits are read before out[i] is written, so out may alias a or b.
+  // SECURITY NOTE: this function does not have constant-time execution. It is for TESTING ONLY.
+    digit_t borrow = 0;
+
+    for (unsigned int i = 0; i < nwords; i++)
+    {
+        const digit_t lhs = a[i], rhs = b[i];
+        const digit_t diff = lhs - rhs;
+        const digit_t wrapped = (lhs < rhs);        // borrow produced by lhs - rhs itself
+
+        out[i] = diff - borrow;
+        borrow = wrapped || (diff < borrow);        // or produced by taking away the incoming borrow
+    }
+}
+
+
+void fprandom_test(digit_t* a)
+{ // Generating a pseudo-random field element in [0, p-1]
+  // The top digit is masked to the bit length of the top digit of p instead of the former
+  // hard-coded "256-254" (the sizes of a different, 254-bit prime). When the top digit of p
+  // is non-zero this leaves a < 2p, so at most one subtraction of p is needed.
+  // SECURITY NOTE: distribution is not fully uniform. TO BE USED FOR TESTING ONLY.
+    unsigned int i, s, nwords = NWORDS_FIELD;
+    unsigned char* string = (unsigned char*)a;
+    digit_t mask = p[nwords-1];
+
+    // Smear the highest set bit of p's top digit into every lower position
+    for (s = 1; s < RADIX; s <<= 1) {
+        mask |= mask >> s;
+    }
+
+    for (i = 0; i < sizeof(digit_t)*nwords; i++) {
+        *(string + i) = (unsigned char)rand();              // Fill every byte of the element
+    }
+    a[nwords-1] &= mask;
+
+    while (compare_words((digit_t*)p, a, nwords) < 1) {  // Force it to [0, modulus-1]
+        sub_test(a, a, (digit_t*)p, nwords);
+    }
+}
+
+
+void fp2random_test(fp2_t* a)
+{ // Generating a pseudo-random element in GF(p^2): the real coordinate is drawn first,
+  // then the imaginary one
+  // SECURITY NOTE: distribution is not fully uniform. TO BE USED FOR TESTING ONLY.
+    digit_t* const coords[2] = { a->re, a->im };
+
+    for (int k = 0; k < 2; k++) {
+        fprandom_test(coords[k]);
+    }
+}
--- a/src/gf/ref/lvl3/test/test_extras.h
+++ b/src/gf/ref/lvl3/test/test_extras.h
@@ -0,0 +1,25 @@
+
+#ifndef TEST_EXTRAS_H
+#define TEST_EXTRAS_H
+
+#include <time.h>
+#include <stdlib.h>
+#include "../include/fp.h"
+#include "../include/fp2.h"
+
+#define PASSED    0
+#define FAILED    1
+    
+// Access system counter for benchmarking
+//int64_t cpucycles(void);
+
+// Comparing "nword" elements, a=b? : (1) a>b, (0) a=b, (-1) a<b
+int compare_words(digit_t* a, digit_t* b, unsigned int nwords);
+
+// Generating a pseudo-random field element in [0, p-1] 
+void fprandom_test(digit_t* a);
+
+// Generating a pseudo-random element in GF(p^2)
+void fp2random_test(fp2_t* a);
+
+#endif
--- a/src/gf/ref/lvl3/test/test_fp.c
+++ b/src/gf/ref/lvl3/test/test_fp.c
@@ -0,0 +1,295 @@
+#include "test_extras.h"
+#include <stdio.h>
+#include <string.h>
+#include <bench.h>
+
+// Global constants
+extern const digit_t p[NWORDS_FIELD];
+
+// Benchmark and test parameters  
+static int BENCH_LOOPS = 100000;       // Number of iterations per bench
+static int TEST_LOOPS  = 100000;       // Number of iterations per test
+
+
+// Randomised self-test of the GF(p) arithmetic. Each group below runs TEST_LOOPS iterations
+// on fresh pseudo-random inputs and checks algebraic identities; the function prints one
+// PASSED line per group, and prints FAILED and returns false at the first group that fails.
+bool fp_test()
+{ // Tests for the field arithmetic
+    bool OK = true;
+    int n, passed;
+    fp_t a, b, c, d, e, f, ma, mb, mc, md, me, mf;      // m* hold Montgomery-form copies
+
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
+    printf("Testing field arithmetic over GF(p): \n\n"); 
+
+    // Field addition
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fprandom_test(a); fprandom_test(b); fprandom_test(c); fprandom_test(d); 
+
+        fp_add(d, a, b); fp_add(e, d, c);                 // e = (a+b)+c
+        fp_add(d, b, c); fp_add(f, d, a);                 // f = a+(b+c)
+        if (compare_words(e, f, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_add(d, a, b);                                  // d = a+b 
+        fp_add(e, b, a);                                  // e = b+a
+        if (compare_words(d, e, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_set(b, 0);
+        fp_add(d, a, b);                                  // d = a+0 
+        if (compare_words(a, d, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_set(b, 0);   
+        fp_neg(d, a);                      
+        fp_add(e, a, d);                                  // e = a+(-a)
+        if (compare_words(e, b, NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p) addition tests ............................................ PASSED");
+    else { printf("  GF(p) addition tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Field subtraction
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fprandom_test(a); fprandom_test(b); fprandom_test(c); fprandom_test(d);
+
+        fp_sub(d, a, b); fp_sub(e, d, c);                 // e = (a-b)-c
+        fp_add(d, b, c); fp_sub(f, a, d);                 // f = a-(b+c)
+        if (compare_words(e, f, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_sub(d, a, b);                                  // d = a-b 
+        fp_sub(e, b, a);
+        fp_neg(e, e);                                     // e = -(b-a)
+        if (compare_words(d, e, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_set(b, 0);
+        fp_sub(d, a, b);                                  // d = a-0 
+        if (compare_words(a, d, NWORDS_FIELD)!=0) { passed=0; break; }
+        
+        fp_set(b, 0);              
+        fp_sub(e, a, a);                                  // e = a-a
+        if (compare_words(e, b, NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p) subtraction tests ......................................... PASSED");
+    else { printf("  GF(p) subtraction tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Field multiplication
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {    
+        fprandom_test(a); fprandom_test(b); fprandom_test(c);
+        
+        // Montgomery conversion round trip
+        fp_tomont(ma, a);
+        fp_frommont(c, ma);
+        if (compare_words(a, c, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_tomont(ma, a); fp_tomont(mb, b); fp_tomont(mc, c);
+        fp_mul(md, ma, mb); fp_mul(me, md, mc);                          // e = (a*b)*c
+        fp_mul(md, mb, mc); fp_mul(mf, md, ma);                          // f = a*(b*c)
+        fp_frommont(e, me);
+        fp_frommont(f, mf);
+        if (compare_words(e, f, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_tomont(ma, a); fp_tomont(mb, b); fp_tomont(mc, c); 
+        fp_add(md, mb, mc); fp_mul(me, ma, md);                          // e = a*(b+c)
+        fp_mul(md, ma, mb); fp_mul(mf, ma, mc); fp_add(mf, md, mf);      // f = a*b+a*c
+        fp_frommont(e, me);
+        fp_frommont(f, mf);
+        if (compare_words(e, f, NWORDS_FIELD)!=0) { passed=0; break; }
+     
+        fp_tomont(ma, a); fp_tomont(mb, b);
+        fp_mul(md, ma, mb);                                              // d = a*b 
+        fp_mul(me, mb, ma);                                              // e = b*a 
+        fp_frommont(d, md);
+        fp_frommont(e, me);
+        if (compare_words(d, e, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_tomont(ma, a);
+        fp_set(b, 1); fp_tomont(mb, b);
+        fp_mul(md, ma, mb);                                              // d = a*1  
+        fp_frommont(a, ma);
+        fp_frommont(d, md);                
+        if (compare_words(a, d, NWORDS_FIELD)!=0) { passed=0; break; }
+       
+        fp_set(b, 0);
+        fp_tomont(mb, b);
+        fp_mul(md, ma, mb);                                              // d = a*0 
+        fp_frommont(d, md);                
+        if (compare_words(b, d, NWORDS_FIELD)!=0) { passed=0; break; } 
+    }
+    if (passed==1) printf("  GF(p) multiplication tests ...................................... PASSED");
+    else { printf("  GF(p) multiplication tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Field squaring
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fprandom_test(a);
+        
+        fp_tomont(ma, a);
+        fp_sqr(mb, ma);                                   // b = a^2
+        fp_mul(mc, ma, ma);                               // c = a*a 
+        fp_frommont(b, mb);
+        fp_frommont(c, mc);
+        if (compare_words(b, c, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_set(a, 0); fp_tomont(ma, a);
+        fp_sqr(md, ma);                                   // d = 0^2 
+        if (compare_words(ma, md, NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p) squaring tests............................................. PASSED");
+    else { printf("  GF(p) squaring tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Field inversion
+    passed = 1;
+    for (n = 0; n < TEST_LOOPS; n++)
+    {
+        fprandom_test(a);
+
+        fp_tomont(ma, a);
+        fp_set(d, 1);
+        memcpy(mb, ma, RADIX/8 * NWORDS_FIELD);           // keep a copy: fp_inv works in place
+        fp_inv(ma);
+        fp_mul(mc, ma, mb);                               // c = a*a^-1 
+        fp_frommont(c, mc);
+        if (compare_words(c, d, NWORDS_FIELD) != 0) { passed = 0; break; }
+
+        fp_set(a, 0);
+        fp_set(d, 0);
+        fp_inv(a);                                        // a = 0^-1, expected to stay 0
+        if (compare_words(a, d, NWORDS_FIELD) != 0) { passed = 0; break; }
+    }
+    if (passed == 1) printf("  GF(p) inversion tests............................................ PASSED");
+    else { printf("  GF(p) inversion tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Square root and square detection
+    passed = 1;
+    for (n = 0; n < TEST_LOOPS; n++)
+    {
+        fprandom_test(a);
+
+        fp_tomont(ma, a);
+        fp_sqr(mc, ma);
+        fp_frommont(c, mc);                               // c = a^2
+        if (fp_is_square(mc) != 1) { passed = 0; break; }
+
+        fp_sqrt(mc);                                      // c = a = sqrt(c) 
+        fp_neg(md, mc);
+        fp_frommont(c, mc);
+        fp_frommont(d, md);
+        // the computed root may be either a or -a
+        if ((compare_words(a, c, NWORDS_FIELD) != 0) && (compare_words(a, d, NWORDS_FIELD) != 0)) { passed = 0; break; }
+    }
+    if (passed == 1) printf("  Square root, square tests........................................ PASSED");
+    else { printf("  Square root, square tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+ 
+    return OK;
+}
+
+// Benchmarks the GF(p) operations: each operation is timed BENCH_LOOPS times with
+// cpucycles() and the average cycle count per call is printed. Always returns true.
+bool fp_run()
+{
+    bool OK = true;
+    int n;
+    unsigned long long cycles, cycles1, cycles2;
+    // Divisor for the averages; never 0, so a non-positive BENCH_LOOPS cannot divide by zero
+    const unsigned long long reps = (BENCH_LOOPS > 0) ? (unsigned long long)BENCH_LOOPS : 1;
+    fp_t a, b, c;
+        
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
+    printf("Benchmarking field arithmetic: \n\n"); 
+        
+    fprandom_test(a); fprandom_test(b); fprandom_test(c);
+
+    // GF(p) addition
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        fp_add(c, a, b);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    printf("  GF(p) addition runs in .......................................... %7llu cycles", cycles/reps);
+    printf("\n");
+
+    // GF(p) subtraction
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        fp_sub(c, a, b);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    printf("  GF(p) subtraction runs in ....................................... %7llu cycles", cycles/reps);
+    printf("\n");
+
+    // GF(p) multiplication
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        fp_mul(c, a, b);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    printf("  GF(p) multiplication runs in .................................... %7llu cycles", cycles/reps);
+    printf("\n");
+
+    // GF(p) inversion (in place: each iteration inverts the previous result)
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp_inv(a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  GF(p) inversion runs in ......................................... %7llu cycles", cycles/reps);
+    printf("\n");
+
+    // GF(p) square root (in place)
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp_sqrt(a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  GF(p) square root runs in ....................................... %7llu cycles", cycles/reps);
+    printf("\n");
+
+    // Square checking
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp_is_square(a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  Square checking runs in ......................................... %7llu cycles", cycles/reps);
+    printf("\n");
+
+    return OK;
+}
+
+// Usage: <program> test|bench <reps>
+// Exit status is 0 when the selected run succeeds and non-zero on a failed test or a usage error.
+int main(int argc, char* argv[])
+{
+    char* end = NULL;
+    long reps;
+
+    if (argc < 3) {
+        printf("Please enter an argument: 'test' or 'bench' and <reps>\n");
+        exit(1);
+    }
+    // Parse <reps> strictly: atoi() maps garbage to 0, which would let "test" pass without
+    // running a single iteration and make "bench" average over zero runs.
+    reps = strtol(argv[2], &end, 10);
+    if (end == argv[2] || *end != '\0' || reps <= 0 || reps != (long)(int)reps) {
+        printf("<reps> must be a positive integer\n");
+        exit(1);
+    }
+    if (!strcmp(argv[1], "test")) {
+        TEST_LOOPS = (int)reps;
+        return !fp_test();
+    } else if (!strcmp(argv[1], "bench")) {
+        BENCH_LOOPS = (int)reps;
+        return !fp_run();
+    } else {
+        printf("Please enter an argument: 'test' or 'bench' and <reps>\n");
+        exit(1);
+    }
+}
--- a/src/gf/ref/lvl3/test/test_fp2.c
+++ b/src/gf/ref/lvl3/test/test_fp2.c
@@ -0,0 +1,307 @@
+#include "test_extras.h"
+#include <stdio.h>
+#include <string.h>
+#include <bench.h>
+
+// Global constants
+extern const digit_t p[NWORDS_FIELD];
+
+// Benchmark and test parameters  
+static int BENCH_LOOPS = 100000;       // Number of iterations per bench
+static int TEST_LOOPS  = 100000;       // Number of iterations per test
+
+
+// Randomised self-test of the GF(p^2) arithmetic. Each group below runs TEST_LOOPS iterations
+// on fresh pseudo-random inputs and checks algebraic identities; the function prints one
+// PASSED line per group, and prints FAILED and returns false at the first group that fails.
+// Elements are compared as 2*NWORDS_FIELD raw digits (both coordinates of the struct at once).
+bool fp2_test()
+{ // Tests for the GF(p^2) arithmetic
+    bool OK = true;
+    int n, passed;
+    fp2_t a, b, c, d, e, f, ma, mb, mc, md, me, mf;     // m* hold Montgomery-form copies
+
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
+    printf("Testing arithmetic over GF(p^2): \n\n"); 
+
+    // Addition in GF(p^2)
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fp2random_test(&a); fp2random_test(&b); fp2random_test(&c); fp2random_test(&d); 
+
+        fp2_add(&d, &a, &b); fp2_add(&e, &d, &c);                 // e = (a+b)+c
+        fp2_add(&d, &b, &c); fp2_add(&f, &d, &a);                 // f = a+(b+c)
+        if (compare_words((digit_t*)&e, (digit_t*)&f, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_add(&d, &a, &b);                                      // d = a+b 
+        fp2_add(&e, &b, &a);                                      // e = b+a
+        if (compare_words((digit_t*)&d, (digit_t*)&e, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_set(&b, 0);
+        fp2_add(&d, &a, &b);                                      // d = a+0 
+        if (compare_words((digit_t*)&a, (digit_t*)&d, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_set(&b, 0);   
+        fp2_neg(&d, &a);                      
+        fp2_add(&e, &a, &d);                                      // e = a+(-a)
+        if (compare_words((digit_t*)&e, (digit_t*)&b, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p^2) addition tests ............................................ PASSED");
+    else { printf("  GF(p^2) addition tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+    
+    // Subtraction in GF(p^2)
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fp2random_test(&a); fp2random_test(&b); fp2random_test(&c); fp2random_test(&d);
+
+        fp2_sub(&d, &a, &b); fp2_sub(&e, &d, &c);                 // e = (a-b)-c
+        fp2_add(&d, &b, &c); fp2_sub(&f, &a, &d);                 // f = a-(b+c)
+        if (compare_words((digit_t*)&e, (digit_t*)&f, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_sub(&d, &a, &b);                                      // d = a-b 
+        fp2_sub(&e, &b, &a);
+        fp2_neg(&e, &e);                                          // e = -(b-a)
+        if (compare_words((digit_t*)&d, (digit_t*)&e, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_set(&b, 0);
+        fp2_sub(&d, &a, &b);                                      // d = a-0 
+        if (compare_words((digit_t*)&a, (digit_t*)&d, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+        
+        fp2_set(&b, 0);              
+        fp2_sub(&e, &a, &a);                                      // e = a-a
+        if (compare_words((digit_t*)&e, (digit_t*)&b, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p^2) subtraction tests ......................................... PASSED");
+    else { printf("  GF(p^2) subtraction tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Multiplication in GF(p^2)
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {    
+        fp2random_test(&a); fp2random_test(&b); fp2random_test(&c);
+        
+        // Montgomery conversion round trip
+        fp2_tomont(&ma, &a);
+        fp2_frommont(&c, &ma);
+        if (compare_words((digit_t*)&a, (digit_t*)&c, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_tomont(&ma, &a); fp2_tomont(&mb, &b); fp2_tomont(&mc, &c);
+        fp2_mul(&md, &ma, &mb); fp2_mul(&me, &md, &mc);                          // e = (a*b)*c
+        fp2_mul(&md, &mb, &mc); fp2_mul(&mf, &md, &ma);                          // f = a*(b*c)
+        fp2_frommont(&e, &me);
+        fp2_frommont(&f, &mf);
+        if (compare_words((digit_t*)&e, (digit_t*)&f, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_tomont(&ma, &a); fp2_tomont(&mb, &b); fp2_tomont(&mc, &c); 
+        fp2_add(&md, &mb, &mc); fp2_mul(&me, &ma, &md);                          // e = a*(b+c)
+        fp2_mul(&md, &ma, &mb); fp2_mul(&mf, &ma, &mc); fp2_add(&mf, &md, &mf);  // f = a*b+a*c
+        fp2_frommont(&e, &me);
+        fp2_frommont(&f, &mf);
+        if (compare_words((digit_t*)&e, (digit_t*)&f, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+     
+        fp2_tomont(&ma, &a); fp2_tomont(&mb, &b);
+        fp2_mul(&md, &ma, &mb);                                                  // d = a*b 
+        fp2_mul(&me, &mb, &ma);                                                  // e = b*a 
+        fp2_frommont(&d, &md);
+        fp2_frommont(&e, &me);
+        if (compare_words((digit_t*)&d, (digit_t*)&e, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_tomont(&ma, &a);
+        fp2_set(&b, 1); fp2_tomont(&mb, &b);
+        fp2_mul(&md, &ma, &mb);                                                  // d = a*1  
+        fp2_frommont(&a, &ma);
+        fp2_frommont(&d, &md);                
+        if (compare_words((digit_t*)&a, (digit_t*)&d, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+       
+        fp2_set(&b, 0);
+        fp2_tomont(&mb, &b);
+        fp2_mul(&md, &ma, &mb);                                                  // d = a*0 
+        fp2_frommont(&d, &md);                
+        if (compare_words((digit_t*)&b, (digit_t*)&d, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p^2) multiplication tests ...................................... PASSED");
+    else { printf("  GF(p^2) multiplication tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Squaring in GF(p^2)
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fp2random_test(&a);
+        
+        fp2_tomont(&ma, &a);
+        fp2_sqr(&mb, &ma);                                          // b = a^2
+        fp2_mul(&mc, &ma, &ma);                                     // c = a*a 
+        fp2_frommont(&b, &mb);
+        fp2_frommont(&c, &mc);
+        if (compare_words((digit_t*)&b, (digit_t*)&c, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_set(&a, 0); fp2_tomont(&ma, &a);
+        fp2_sqr(&md, &ma);                                          // d = 0^2 
+        if (compare_words((digit_t*)&ma, (digit_t*)&md, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p^2) squaring tests............................................. PASSED");
+    else { printf("  GF(p^2) squaring tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+    
+    // Inversion in GF(p^2)
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fp2random_test(&a);
+
+        fp2_tomont(&ma, &a);
+        fp2_set(&d, 1);
+        memcpy(&mb, &ma, RADIX/8 * 2*NWORDS_FIELD);                 // keep a copy: fp2_inv works in place
+        fp2_inv(&ma);
+        fp2_mul(&mc, &ma, &mb);                                     // c = a*a^-1 
+        fp2_frommont(&c, &mc);
+        if (compare_words((digit_t*)&c, (digit_t*)&d, 2*NWORDS_FIELD) != 0) { passed = 0; break; }
+
+        fp2_set(&a, 0);
+        fp2_set(&d, 0);
+        fp2_inv(&a);                                                // a = 0^-1, expected to stay 0
+        if (compare_words((digit_t*)&a, (digit_t*)&d, 2*NWORDS_FIELD) != 0) { passed = 0; break; }
+    }
+    if (passed == 1) printf("  GF(p^2) inversion tests............................................ PASSED");
+    else { printf("  GF(p^2) inversion tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+    
+    // Square root and square detection in GF(p^2)
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fp2random_test(&a);
+
+        fp2_tomont(&ma, &a);
+        fp2_sqr(&mc, &ma);
+        fp2_frommont(&c, &mc);                                      // c = a^2
+        if (fp2_is_square(&mc) != 1) { passed = 0; break; }        
+
+        fp2_sqrt(&mc);                                              // c = a = sqrt(c) 
+        fp2_neg(&md, &mc);
+        fp2_frommont(&c, &mc);
+        fp2_frommont(&d, &md);
+        // the computed root may be either a or -a
+        if ((compare_words((digit_t*)&a, (digit_t*)&c, 2*NWORDS_FIELD) != 0) & (compare_words((digit_t*)&a, (digit_t*)&d, 2*NWORDS_FIELD) != 0)) { passed = 0; break; }
+    }
+    if (passed == 1) printf("  Square root, square tests.......................................... PASSED");
+    else { printf("  Square root, square tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    return OK;
+}
+
+// Benchmarks the GF(p^2) operations: each operation is timed BENCH_LOOPS times with
+// cpucycles() and the average cycle count per call is printed. Always returns true.
+bool fp2_run()
+{
+    bool OK = true;
+    int n;
+    unsigned long long cycles, cycles1, cycles2;
+    // Divisor for the averages; never 0, so a non-positive BENCH_LOOPS cannot divide by zero
+    const unsigned long long reps = (BENCH_LOOPS > 0) ? (unsigned long long)BENCH_LOOPS : 1;
+    fp2_t a, b, c;
+        
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
+    printf("Benchmarking arithmetic over GF(p^2): \n\n"); 
+        
+    fp2random_test(&a); fp2random_test(&b); fp2random_test(&c);
+
+    // GF(p^2) addition
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        fp2_add(&c, &a, &b);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    printf("  GF(p^2) addition runs in .......................................... %7llu cycles", cycles/reps);
+    printf("\n");
+
+    // GF(p^2) subtraction
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        fp2_sub(&c, &a, &b);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    printf("  GF(p^2) subtraction runs in ....................................... %7llu cycles", cycles/reps);
+    printf("\n");
+
+    // GF(p^2) squaring
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp2_sqr(&c, &a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  GF(p^2) squaring runs in .......................................... %7llu cycles", cycles/reps);
+    printf("\n");
+
+    // GF(p^2) multiplication
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        fp2_mul(&c, &a, &b);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    printf("  GF(p^2) multiplication runs in .................................... %7llu cycles", cycles/reps);
+    printf("\n");
+
+    // GF(p^2) inversion (in place: each iteration inverts the previous result)
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp2_inv(&a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  GF(p^2) inversion runs in ......................................... %7llu cycles", cycles/reps);
+    printf("\n");
+
+    // GF(p^2) square root (in place)
+    cycles = 0;
+    for (n = 0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp2_sqrt(&a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  GF(p^2) square root runs in ....................................... %7llu cycles", cycles/reps);
+    printf("\n");
+
+    // Square checking
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp2_is_square(&a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  Square checking runs in ........................................... %7llu cycles", cycles/reps);
+    printf("\n");
+
+    return OK;
+}
+
+// Usage: <program> test|bench <reps>
+// Exit status is 0 when the selected run succeeds and non-zero on a failed test or a usage error.
+int main(int argc, char* argv[])
+{
+    char* end = NULL;
+    long reps;
+
+    if (argc < 3) {
+        printf("Please enter an argument: 'test' or 'bench' and <reps>\n");
+        exit(1);
+    }
+    // Parse <reps> strictly: atoi() maps garbage to 0, which would let "test" pass without
+    // running a single iteration and make "bench" average over zero runs.
+    reps = strtol(argv[2], &end, 10);
+    if (end == argv[2] || *end != '\0' || reps <= 0 || reps != (long)(int)reps) {
+        printf("<reps> must be a positive integer\n");
+        exit(1);
+    }
+    if (!strcmp(argv[1], "test")) {
+        TEST_LOOPS = (int)reps;
+        return !fp2_test();
+    } else if (!strcmp(argv[1], "bench")) {
+        BENCH_LOOPS = (int)reps;
+        return !fp2_run();
+    } else {
+        printf("Please enter an argument: 'test' or 'bench' and <reps>\n");
+        exit(1);
+    }
+}
--- a/src/gf/ref/lvl5/CMakeLists.txt
+++ b/src/gf/ref/lvl5/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+# Reference GF(p) / GF(p^2) arithmetic sources for this parameter set
+set(SOURCE_FILES_GF_${SVARIANT_UPPER}_REF
+    fp_p318233.c
+    fp.c
+    fp2.c
+)
+
+add_library(${LIB_GF_${SVARIANT_UPPER}} ${SOURCE_FILES_GF_${SVARIANT_UPPER}_REF})
+
+# Every include directory is listed exactly once; relative entries are resolved
+# against the current source directory
+target_include_directories(${LIB_GF_${SVARIANT_UPPER}}
+    PRIVATE
+        common
+        ${INC_COMMON}
+        ${INC_PRECOMP_${SVARIANT_UPPER}}
+        include
+        ${PROJECT_SOURCE_DIR}/include
+)
+target_compile_options(${LIB_GF_${SVARIANT_UPPER}} PRIVATE ${C_OPT_FLAGS})
+
+# Test and benchmark executables for the field arithmetic
+add_subdirectory(test)
--- a/src/gf/ref/lvl5/Makefile
+++ b/src/gf/ref/lvl5/Makefile
@@ -0,0 +1,43 @@
+
+# Standalone build of the GF(p)/GF(p^2) reference arithmetic and its test programs
+CC=gcc
+CFLAGS= -O3 -std=gnu11 -Wall -march=native -Wno-missing-braces -Wno-logical-not-parentheses
+LDFLAGS=-lm
+AR=ar rcs
+RANLIB=ranlib
+
+OBJECTS=objs/fp_p318233.o objs/fp.o objs/fp2.o objs/random.o
+
+all: lib tests
+
+# Each object rule creates objs/ itself so it also works alone or under "make -j"
+objs/fp_p318233.o: fp_p318233.c
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) fp_p318233.c -o objs/fp_p318233.o
+
+objs/fp.o: fp.c
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) fp.c -o objs/fp.o
+
+objs/fp2.o: fp2.c
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) fp2.c -o objs/fp2.o
+
+objs/random.o: ../../../common/generic/randombytes_system.c
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) ../../../common/generic/randombytes_system.c -o objs/random.o
+
+# Static library the test programs link against
+lib: $(OBJECTS)
+	rm -rf lib
+	mkdir lib
+	$(AR) lib/libtest.a $^
+	$(RANLIB) lib/libtest.a
+
+tests: lib
+	$(CC) $(CFLAGS) -L./lib test/test_fp.c test/test_extras.c -ltest $(LDFLAGS) -o test_fp -lgmp
+	$(CC) $(CFLAGS) -L./lib test/test_fp2.c test/test_extras.c -ltest $(LDFLAGS) -o test_fp2 -lgmp
+
+check: tests
+
+# These targets name actions, not files
+.PHONY: all tests check clean
+
+clean:
+	rm -rf *.req objs lib test_fp*
+
+
--- a/src/gf/ref/lvl5/fp.c
+++ b/src/gf/ref/lvl5/fp.c
@@ -0,0 +1,168 @@
+#include "include/fp.h"
+
+// Field constants for this parameter set, stored least-significant word first.
+// p  : the prime modulus; read by fp_neg and fp_exp3div4 in this file.
+// R2 : presumably R^2 mod p, the constant used to enter Montgomery form -- it is not
+//      referenced in this file; TODO confirm against fp_tomont.
+// pp : not referenced in this file; only the first four words are spelled out, the
+//      remaining ones are zero-initialized. NOTE(review): purpose unclear -- confirm.
+const uint64_t p[NWORDS_FIELD] =  { 0xffffffffffffffff, 0xFFFFFFFFFFFFFFFF, 0x994C68ADA6E1FFFF, 0xFAF0A29A781974CE, 0xFE3AC5904A0DEA65, 0x02BDBE6326507D01, 0x8C15B0036936E792, 0x255946A8869BC6 };
+const uint64_t R2[NWORDS_FIELD] = { 0x46E4E8A0C7549CBD, 0xCB993B5943E89EA5, 0x545AC09F2F1B55C8, 0x1ADB99DDACAA06EC, 0x87994B8955D8B8D4, 0x2CC2EA622F9E57C8, 0x2780B5F2DAF1003C, 0x1691676B8674B8 };
+const uint64_t pp[NWORDS_FIELD] = { 0x01, 0x00, 0x00, 0x00 };
+
+void fp_set(digit_t* x, const digit_t val)
+{ // Load a single-word value into a field element, x = val
+  // The least-significant word receives val, all higher words are cleared
+    unsigned int k;
+
+    for (k = NWORDS_FIELD - 1; k > 0; k--) {
+        x[k] = 0;
+    }
+    x[0] = val;
+}
+
+bool fp_is_equal(const digit_t* a, const digit_t* b)
+{ // Constant-time equality test of two field elements
+  // Returns 1 (true) when every word of a equals the matching word of b, 0 (false) otherwise
+    digit_t diff = 0;
+    unsigned int k = 0;
+
+    // All word differences are accumulated; there is no early exit
+    while (k < NWORDS_FIELD) {
+        diff |= a[k] ^ b[k];
+        k++;
+    }
+
+    return (bool)is_digit_zero_ct(diff);
+}
+
+bool fp_is_zero(const digit_t* a)
+{ // Test whether a field element is zero
+  // Returns 1 (true) when all words of a are zero, 0 (false) otherwise
+    digit_t acc = 0;
+    unsigned int k = 0;
+
+    // OR all words together; the result is zero only if every word is zero
+    while (k < NWORDS_FIELD) {
+        acc |= a[k];
+        k++;
+    }
+
+    return (bool)is_digit_zero_ct(acc);
+}
+
+void fp_copy(digit_t* out, const digit_t* a)
+{ // Copy a field element, out = a
+    const size_t nbytes = NWORDS_FIELD*RADIX/8;    // size of one field element in bytes
+
+    memcpy(out, a, nbytes);
+}
+
+void fp_neg(digit_t* out, const digit_t* a)
+{ // Modular negation, out = -a mod p
+  // Input: a in [0, p-1]
+  // Output: out in [0, p-1]
+    unsigned int borrow = 0;
+
+    // Plain multiprecision subtraction, out = p - a
+    for (unsigned int k = 0; k < NWORDS_FIELD; k++) {
+        SUBC(out[k], borrow, ((digit_t*)p)[k], a[k], borrow);
+    }
+    // A further modular subtraction of p; presumably this folds the value p
+    // (obtained for a = 0) back to 0 -- depends on fp_sub, defined elsewhere
+    fp_sub(out, out, (digit_t*)p);
+}
+
+void MUL(digit_t* out, const digit_t a, const digit_t b)
+{ // Digit multiplication, digit*digit -> 2-digit result
+  // Inputs: a, b in [0, 2^w-1], where w is the computer wordsize
+  // Output: out[0] receives the low digit and out[1] the high digit of a*b
+  // Method: both operands are split into half-digits and the four partial products are
+  //         recombined half-digit by half-digit, propagating the carries by hand
+    register digit_t al, ah, bl, bh, temp;
+    digit_t albl, albh, ahbl, ahbh, res1, res2, res3, carry;
+    digit_t mask_low = (digit_t)(-1) >> (sizeof(digit_t)*4), mask_high = (digit_t)(-1) << (sizeof(digit_t)*4);
+
+    al = a & mask_low;                        // Low half of a
+    ah = a >> (sizeof(digit_t)*4);            // High half of a
+    bl = b & mask_low;                        // Low half of b
+    bh = b >> (sizeof(digit_t)*4);            // High half of b
+
+    albl = al * bl;                           // The four half-digit partial products
+    albh = al * bh;
+    ahbl = ah * bl;
+    ahbh = ah * bh;
+    out[0] = albl & mask_low;                 // out00: lowest half-digit of the result
+
+    res1 = albl >> (sizeof(digit_t)*4);
+    res2 = ahbl & mask_low;
+    res3 = albh & mask_low;
+    temp = res1 + res2 + res3;
+    carry = temp >> (sizeof(digit_t)*4);      // Overflow of this column, added to the next one
+    out[0] ^= temp << (sizeof(digit_t)*4);    // out01: second half-digit, placed in the upper half of out[0]
+
+    res1 = ahbl >> (sizeof(digit_t)*4);
+    res2 = albh >> (sizeof(digit_t)*4);
+    res3 = ahbh & mask_low;
+    temp = res1 + res2 + res3 + carry;
+    out[1] = temp & mask_low;                 // out10: third half-digit
+    carry = temp & mask_high;                 // Column overflow, already positioned in the upper half
+    out[1] ^= (ahbh & mask_high) + carry;     // out11: top half-digit
+}
+
+digit_t mp_shiftr(digit_t* x, const unsigned int shift, const unsigned int nwords)
+{ // Multiprecision right shift, x = x >> shift, performed in place on nwords words
+  // Returns the least-significant bit x had before the shift
+    const digit_t lsb = x[0] & 1;
+    unsigned int k = 0;
+
+    // Every word but the last takes its new upper bits from the next higher word
+    while (k < nwords-1) {
+        SHIFTR(x[k+1], x[k], shift, x[k], RADIX);
+        k++;
+    }
+    x[nwords-1] >>= shift;
+    return lsb;
+}
+
+void mp_shiftl(digit_t* x, const unsigned int shift, const unsigned int nwords)
+{ // Multiprecision left shift, x = x << shift, performed in place on nwords words
+    int k = nwords-1;
+
+    // Walk from the top word down so each word still sees the unshifted word below it
+    while (k > 0) {
+        SHIFTL(x[k], x[k-1], shift, x[k], RADIX);
+        k--;
+    }
+    x[0] <<= shift;
+}
+
+static void fp_exp3div4(digit_t* out, const digit_t* a)
+{ // Fixed exponentiation out = a^((p-3)/4) mod p
+  // Input: a in [0, p-1]
+  // Output: out in [0, p-1]
+  // Requirement: p = 3(mod 4)
+  // Method: right-to-left square-and-multiply over the bits of p >> 2
+    fp_t exponent, base;
+
+    fp_copy(exponent, (const digit_t*)p);
+    fp_copy(base, a);
+    mp_shiftr(exponent, 2, NWORDS_FIELD);      // exponent = (p-3)/4, as p = 3 (mod 4)
+
+    // Accumulator starts at 1 converted with fp_tomont
+    fp_set(out, 1);
+    fp_tomont(out, out);
+
+    for (int remaining = NWORDS_FIELD*RADIX-2; remaining > 0; remaining--) {
+        // mp_shiftr hands back the bit it shifts out, i.e. the current exponent bit
+        if (mp_shiftr(exponent, 1, NWORDS_FIELD) == 1) {
+            fp_mul(out, out, base);
+        }
+        fp_sqr(base, base);
+    }
+}
+
+void fp_inv(digit_t* a)
+{ // In-place modular inversion, a = a^(p-2) mod p
+  // Input: a=xR in [0, p-1], where R = 2^(w*nwords), w is the computer wordsize and nwords is the number of words to represent p
+  // Output: a = x^-1*R in [0, p-1]. It outputs 0 if the input does not have an inverse
+  // Requirement: Ceiling(Log(p)) < w*nwords
+    fp_t pw;
+
+    fp_exp3div4(pw, a);                 // a^((p-3)/4)
+    for (int k = 0; k < 2; k++) {
+        fp_sqr(pw, pw);                 // after both squarings: a^(p-3)
+    }
+    fp_mul(a, pw, a);                   // a^(p-2)
+}
+
+bool fp_is_square(const digit_t* a)
+{ // Quadratic residuosity test: checks whether a^((p-1)/2) equals 1
+  // Output: 1 (true) if it does, 0 (false) otherwise
+    fp_t euler, unity;
+
+    fp_set(unity, 1);
+
+    fp_exp3div4(euler, a);              // a^((p-3)/4)
+    fp_sqr(euler, euler);               // a^((p-3)/2)
+    fp_mul(euler, euler, a);            // a^((p-1)/2)
+    fp_frommont(euler, euler);          // compare against the plain integer 1
+
+    return fp_is_equal(euler, unity);
+}
+
+void fp_sqrt(digit_t* a)
+{ // In-place square root computation, a = a^((p+1)/4) mod p
+    fp_t pw;
+
+    fp_exp3div4(pw, a);                 // a^((p-3)/4)
+    fp_mul(a, pw, a);                   // one more factor a gives a^((p+1)/4)
+}
--- a/src/gf/ref/lvl5/fp2.c
+++ b/src/gf/ref/lvl5/fp2.c
@@ -0,0 +1,193 @@
+#include <fp2.h>
+
+extern const digit_t R[NWORDS_FIELD];
+
+/* Arithmetic modulo X^2 + 1 */
+
+void fp2_set(fp2_t* x, const digit_t val)
+{ // Set a GF(p^2) element to the single-word value val: x = val + 0*i
+    fp_set(x->im, 0);
+    fp_set(x->re, val);
+}
+
+bool fp2_is_zero(const fp2_t* a)
+{ // Is a GF(p^2) element zero?
+  // Returns 1 (true) if both components are zero, 0 (false) otherwise
+    const bool re_zero = fp_is_zero(a->re);
+    const bool im_zero = fp_is_zero(a->im);
+
+    // Bitwise AND: both components are always inspected
+    return re_zero & im_zero;
+}
+
+bool fp2_is_equal(const fp2_t* a, const fp2_t* b)
+{ // Compare two GF(p^2) elements in constant time
+  // Returns 1 (true) if both components match, 0 (false) otherwise
+    const bool re_same = fp_is_equal(a->re, b->re);
+    const bool im_same = fp_is_equal(a->im, b->im);
+
+    // Bitwise AND: both components are always compared
+    return re_same & im_same;
+}
+
+void fp2_copy(fp2_t* x, const fp2_t* y)
+{ // Copy a GF(p^2) element component by component, x = y
+    fp_copy(x->im, y->im);
+    fp_copy(x->re, y->re);
+}
+
+fp2_t fp2_non_residue()
+{ // Returns the element 5 + 2i, a quadratic non-residue for p318233
+  // The small integers are built by repeated addition of 1 after fp_tomont
+    fp2_t nr;
+    fp_t unit;
+
+    fp_set(unit, 1);
+    fp_tomont(unit, unit);
+
+    fp_add(nr.im, unit, unit);          // im = 2
+    fp_add(nr.re, nr.im, nr.im);        // re = 4
+    fp_add(nr.re, nr.re, unit);         // re = 5
+    return nr;
+}
+
+void fp2_add(fp2_t* x, const fp2_t* y, const fp2_t* z)
+{ // Component-wise addition in GF(p^2), x = y + z
+    fp_add(x->im, y->im, z->im);
+    fp_add(x->re, y->re, z->re);
+}
+
+void fp2_sub(fp2_t* x, const fp2_t* y, const fp2_t* z)
+{ // Component-wise subtraction in GF(p^2), x = y - z
+    fp_sub(x->im, y->im, z->im);
+    fp_sub(x->re, y->re, z->re);
+}
+
+void fp2_neg(fp2_t* x, const fp2_t* y)
+{ // Component-wise negation in GF(p^2), x = -y
+    fp_neg(x->im, y->im);
+    fp_neg(x->re, y->re);
+}
+
+void fp2_mul(fp2_t* x, const fp2_t* y, const fp2_t* z)
+{ // Multiplication in GF(p^2) modulo X^2 + 1, x = y * z
+  // Uses three base-field multiplications:
+  //   re = y.re*z.re - y.im*z.im
+  //   im = (y.re + y.im)*(z.re + z.im) - y.re*z.re - y.im*z.im
+  // x is written only after all reads of y and z, so x may alias an input
+    fp_t ysum, zsum, cross, imim;
+
+    fp_add(ysum, y->re, y->im);
+    fp_add(zsum, z->re, z->im);
+    fp_mul(cross, ysum, zsum);          // (y.re + y.im)*(z.re + z.im)
+    fp_mul(imim, y->im, z->im);         // y.im*z.im
+
+    fp_mul(x->re, y->re, z->re);        // y.re*z.re
+    fp_sub(x->im, cross, imim);
+    fp_sub(x->im, x->im, x->re);
+    fp_sub(x->re, x->re, imim);
+}
+
+void fp2_sqr(fp2_t* x, const fp2_t* y)
+{ // Squaring in GF(p^2) modulo X^2 + 1, x = y^2
+  //   re = (y.re + y.im)*(y.re - y.im)
+  //   im = 2*y.re*y.im
+    fp_t plus, minus;
+
+    // Both combinations are formed before x is written, so x may alias y
+    fp_sub(minus, y->re, y->im);
+    fp_add(plus, y->re, y->im);
+
+    fp_mul(x->im, y->re, y->im);
+    fp_add(x->im, x->im, x->im);
+    fp_mul(x->re, plus, minus);
+}
+
+void fp2_inv(fp2_t* x)
+{ // In-place inversion in GF(p^2): x^-1 = (re - im*i) / (re^2 + im^2)
+    fp_t norm, im_sq;
+
+    // norm = re^2 + im^2
+    fp_sqr(norm, x->re);
+    fp_sqr(im_sq, x->im);
+    fp_add(norm, norm, im_sq);
+    fp_inv(norm);
+
+    // Scale both components by 1/norm and negate the imaginary one
+    fp_mul(x->re, x->re, norm);
+    fp_mul(x->im, x->im, norm);
+    fp_neg(x->im, x->im);
+}
+
+bool fp2_is_square(const fp2_t* x)
+{ // Square test in GF(p^2): reports whether re^2 + im^2 passes fp_is_square
+    fp_t norm, im_sq;
+
+    fp_sqr(im_sq, x->im);
+    fp_sqr(norm, x->re);
+    fp_add(norm, norm, im_sq);
+
+    return fp_is_square(norm);
+}
+
+void fp2_frob(fp2_t* x, const fp2_t* y)
+{ // x = y.re - y.im*i: the real part is copied, the imaginary part negated
+    fp_neg(x->im, y->im);
+    fp_copy(x->re, y->re);
+}
+
+void fp2_tomont(fp2_t* x, const fp2_t* y)
+{ // Apply fp_tomont to both components of y, storing the result in x
+    fp_tomont(x->im, y->im);
+    fp_tomont(x->re, y->re);
+}
+
+void fp2_frommont(fp2_t* x, const fp2_t* y)
+{ // Apply fp_frommont to both components of y, storing the result in x
+    fp_frommont(x->im, y->im);
+    fp_frommont(x->re, y->re);
+}
+
+// NOTE: old, non-constant-time implementation. Could be optimized
+void fp2_sqrt(fp2_t* x)
+{ // In-place square root in GF(p^2) of x = re + im*i
+    fp_t norm_root, half, root_re, root_im;
+
+    // Input with zero imaginary part: the root is either real or purely imaginary
+    if (fp_is_zero(x->im)) {
+        if (!fp_is_square(x->re)) {
+            // sqrt(re) = i*sqrt(-re), because i^2 = -1
+            fp_neg(x->im, x->re);
+            fp_sqrt(x->im);
+            fp_set(x->re, 0);
+        } else {
+            fp_sqrt(x->re);
+        }
+        return;
+    }
+
+    // norm_root = sqrt(re^2 + im^2); root_im serves as scratch space here
+    fp_sqr(norm_root, x->re);
+    fp_sqr(root_im, x->im);
+    fp_add(norm_root, norm_root, root_im);
+    fp_sqrt(norm_root);
+
+    // half = 1/2
+    fp_set(half, 2);
+    fp_tomont(half, half);
+    fp_inv(half);
+
+    // The real part of the root squares to (re + norm_root)/2 when that value is a
+    // square, and to (re - norm_root)/2 otherwise
+    fp_add(root_re, x->re, norm_root);
+    fp_mul(root_re, root_re, half);
+    if (!fp_is_square(root_re)) {
+        fp_sub(root_re, x->re, norm_root);
+        fp_mul(root_re, root_re, half);
+    }
+    fp_sqrt(root_re);
+
+    // Imaginary part of the root: im / (2*root_re)
+    fp_copy(root_im, root_re);
+    fp_inv(root_im);
+    fp_mul(root_im, root_im, half);
+    fp_mul(x->im, root_im, x->im);
+    fp_copy(x->re, root_re);
+}
+
+// Lexicographic comparison of two field elements. Returns +1 if x > y, -1 if x < y, 0 if x = y
+// Both inputs are first passed through fp2_frommont; the real parts are compared first,
+// the imaginary parts only break ties. Not constant time.
+int fp2_cmp(fp2_t* x, fp2_t* y){
+    fp2_t xs, ys;
+    const digit_t *lhs, *rhs;
+
+    fp2_frommont(&xs, x);
+    fp2_frommont(&ys, y);
+
+    // part 0 handles the real components, part 1 the imaginary ones
+    for(int part = 0; part < 2; part++){
+        lhs = part ? xs.im : xs.re;
+        rhs = part ? ys.im : ys.re;
+        // Scan from the most-significant word down; the first difference decides
+        for(int w = NWORDS_FIELD-1; w >= 0; w--){
+            if(lhs[w] != rhs[w])
+                return (lhs[w] > rhs[w]) ? 1 : -1;
+        }
+    }
+    return 0;
+}
--- a/src/gf/ref/lvl5/fp_p318233.c
+++ b/src/gf/ref/lvl5/fp_p318233.c
--- a/src/gf/ref/lvl5/include/fp.h
+++ b/src/gf/ref/lvl5/include/fp.h
@@ -0,0 +1,76 @@
+#ifndef FP_H
+#define FP_H
+
+//////////////////////////////////////////////// NOTE: this is placed here for now
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
+#include <tutil.h>
+#include <fp_constants.h>
+
+typedef digit_t fp_t[NWORDS_FIELD];  // Datatype for representing field elements (NWORDS_FIELD digits, least-significant first)
+
+// Element setup, comparison and copy
+void fp_set(digit_t* x, const digit_t val);                 // x = val, a single-digit value
+bool fp_is_equal(const digit_t* a, const digit_t* b);       // 1 if a = b, 0 otherwise
+bool fp_is_zero(const digit_t* a);                          // 1 if a = 0, 0 otherwise
+void fp_copy(digit_t* out, const digit_t* a);               // out = a
+// Multiprecision shifts on nwords digits, in place; mp_shiftr returns the low bit x had before shifting
+digit_t mp_shiftr(digit_t* x, const unsigned int shift, const unsigned int nwords);
+void mp_shiftl(digit_t* x, const unsigned int shift, const unsigned int nwords);
+// Modular arithmetic. NOTE(review): fp_add, fp_sub, fp_sqr and fp_mul are implemented outside
+// fp.c; presumably modular add/sub and Montgomery square/multiply -- confirm
+void fp_add(digit_t* out, const digit_t* a, const digit_t* b);
+void fp_sub(digit_t* out, const digit_t* a, const digit_t* b);
+void fp_neg(digit_t* out, const digit_t* a);                // out = -a mod p
+void fp_sqr(digit_t* out, const digit_t* a);
+void fp_mul(digit_t* out, const digit_t* a, const digit_t* b);
+void MUL(digit_t* out, const digit_t a, const digit_t b);   // digit x digit -> two-digit product
+// In-place operations built on a fixed exponentiation
+void fp_inv(digit_t* x);                                    // x = x^(p-2)
+bool fp_is_square(const digit_t* a);                        // 1 if a^((p-1)/2) = 1, 0 otherwise
+void fp_sqrt(digit_t* a);                                   // a = a^((p+1)/4)
+// Conversions. NOTE(review): implemented outside fp.c; presumably to/from Montgomery
+// representation and "set to Montgomery one" -- confirm
+void fp_tomont(digit_t* out, const digit_t* a);
+void fp_frommont(digit_t* out, const digit_t* a);
+void fp_mont_setone(digit_t* out);
+
+/********************** Constant-time unsigned comparisons ***********************/
+
+// The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise
+
+static inline unsigned int is_digit_nonzero_ct(digit_t x)
+{ // Is x != 0?
+  // For any non-zero x, either x or its two's-complement negation has the top bit set
+    const digit_t negated = 0 - x;
+    const digit_t top_bit = (x | negated) >> (RADIX - 1);
+
+    return (unsigned int)top_bit;
+}
+
+static inline unsigned int is_digit_zero_ct(digit_t x)
+{ // Is x = 0?
+  // Logical complement of the branch-free non-zero test
+    const unsigned int nonzero = is_digit_nonzero_ct(x);
+
+    return (unsigned int)(1 ^ nonzero);
+}
+
+static inline unsigned int is_digit_lessthan_ct(digit_t x, digit_t y)
+{ // Is x < y?
+  // Branch-free: the answer is read from the top bit of a combination of x, y and x - y
+    const digit_t differ = x ^ y;
+    const digit_t wrapped = (x - y) ^ y;
+    const digit_t mix = x ^ (differ | wrapped);
+
+    return (unsigned int)(mix >> (RADIX - 1));
+}
+
+/********************** Platform-independent macros for digit-size operations **********************/
+
+// Digit addition with carry: sumOut = addend1 + addend2 + carryIn (mod 2^RADIX)
+// carryOut is set to 1 if either of the two internal additions wrapped around, 0 otherwise.
+// Expands to a braced block; the arguments are evaluated more than once.
+#define ADDC(sumOut, carryOut, addend1, addend2, carryIn)                                         \
+    { digit_t tempReg = (addend1) + (digit_t)(carryIn);                                           \
+    (sumOut) = (addend2) + tempReg;                                                               \
+    (carryOut) = (is_digit_lessthan_ct(tempReg, (digit_t)(carryIn)) | is_digit_lessthan_ct((sumOut), tempReg)); }
+
+// Digit subtraction with borrow: differenceOut = minuend - subtrahend - borrowIn (mod 2^RADIX)
+// borrowOut is 1 if minuend < subtrahend, or if the two are equal and borrowIn is set.
+// The borrow is computed into a local before differenceOut is written, so the output
+// operands may be the same lvalues as the inputs.
+#define SUBC(differenceOut, borrowOut, minuend, subtrahend, borrowIn)                             \
+    { digit_t tempReg = (minuend) - (subtrahend);                                                 \
+    unsigned int borrowReg = (is_digit_lessthan_ct((minuend), (subtrahend)) | ((borrowIn) & is_digit_zero_ct(tempReg)));  \
+    (differenceOut) = tempReg - (digit_t)(borrowIn);                                              \
+    (borrowOut) = borrowReg; }
+
+// Shift right with flexible datatype: shiftOut = (lowIn >> shift) combined with the low
+// `shift` bits of highIn moved to the top. DigitSize is the digit width in bits.
+// The expansion already ends with a semicolon.
+#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize)                                         \
+    (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << (DigitSize - (shift)));
+
+// Digit shift left: shiftOut = (highIn << shift) combined with the top `shift` bits of
+// lowIn moved to the bottom. DigitSize is the digit width in bits and is now honoured
+// (it was previously ignored in favour of RADIX). Requires 0 < shift < DigitSize.
+// The expansion already ends with a semicolon.
+#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize)                                         \
+    (shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> ((DigitSize) - (shift)));
+
+#endif
--- a/src/gf/ref/lvl5/include/fp2.h
+++ b/src/gf/ref/lvl5/include/fp2.h
@@ -0,0 +1,29 @@
+#ifndef FP2_H
+#define FP2_H
+
+#include "fp.h"
+
+// Element of GF(p^2), stored as a pair of base-field elements re + im*i
+typedef struct fp2_t {
+    fp_t re;    // real component
+    fp_t im;    // imaginary component
+} fp2_t;
+
+// Setup, comparison and copy
+void fp2_set(fp2_t* x, const digit_t val);                  // x = val + 0*i
+bool fp2_is_zero(const fp2_t* a);                           // 1 if both components are zero
+bool fp2_is_equal(const fp2_t* a, const fp2_t* b);          // 1 if both components match
+void fp2_copy(fp2_t* x, const fp2_t* y);                    // x = y
+fp2_t fp2_non_residue();                                    // returns the fixed non-residue 5 + 2i
+// Arithmetic modulo X^2 + 1; the first argument receives the result
+void fp2_add(fp2_t* x, const fp2_t* y, const fp2_t* z);     // x = y + z
+void fp2_sub(fp2_t* x, const fp2_t* y, const fp2_t* z);     // x = y - z
+void fp2_neg(fp2_t* x, const fp2_t* y);                     // x = -y
+void fp2_mul(fp2_t* x, const fp2_t* y, const fp2_t* z);     // x = y * z
+void fp2_sqr(fp2_t* x, const fp2_t* y);                     // x = y^2
+void fp2_inv(fp2_t* x);                                     // x = 1/x, in place
+bool fp2_is_square(const fp2_t* x);                         // 1 if re^2 + im^2 passes fp_is_square
+void fp2_frob(fp2_t* x, const fp2_t* y);                    // x = y.re - y.im*i
+void fp2_sqrt(fp2_t* x);                                    // x = sqrt(x), in place; not constant time
+// Component-wise fp_tomont / fp_frommont
+void fp2_tomont(fp2_t* x, const fp2_t* y);
+void fp2_frommont(fp2_t* x, const fp2_t* y);
+// Lexicographic comparison (real part first): +1, -1 or 0
+int fp2_cmp(fp2_t* x, fp2_t* y);
+
+#endif
--- a/src/gf/ref/lvl5/test/CMakeLists.txt
+++ b/src/gf/ref/lvl5/test/CMakeLists.txt
@@ -0,0 +1,9 @@
+# GF(p) test/benchmark driver
+add_executable(sqisign_test_gf_${SVARIANT_LOWER}_fp test_fp.c test_extras.c)
+target_link_libraries(sqisign_test_gf_${SVARIANT_LOWER}_fp PRIVATE ${LIB_GF_${SVARIANT_UPPER}})
+target_include_directories(sqisign_test_gf_${SVARIANT_LOWER}_fp PRIVATE ../include ${INC_COMMON} ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_PUBLIC})
+# NAME/COMMAND form: the target name in COMMAND is resolved to the built executable
+add_test(
+    NAME sqisign_test_gf_${SVARIANT_LOWER}_fp
+    COMMAND sqisign_test_gf_${SVARIANT_LOWER}_fp test ${SQISIGN_TEST_REPS}
+)
+
+# GF(p^2) test/benchmark driver
+add_executable(sqisign_test_gf_${SVARIANT_LOWER}_fp2 test_fp2.c test_extras.c)
+target_link_libraries(sqisign_test_gf_${SVARIANT_LOWER}_fp2 PRIVATE ${LIB_GF_${SVARIANT_UPPER}})
+target_include_directories(sqisign_test_gf_${SVARIANT_LOWER}_fp2 PRIVATE ../include ${INC_COMMON} ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_PUBLIC})
+add_test(
+    NAME sqisign_test_gf_${SVARIANT_LOWER}_fp2
+    COMMAND sqisign_test_gf_${SVARIANT_LOWER}_fp2 test ${SQISIGN_TEST_REPS}
+)
--- a/src/gf/ref/lvl5/test/test_extras.c
+++ b/src/gf/ref/lvl5/test/test_extras.c
@@ -0,0 +1,74 @@
+#include "test_extras.h"
+#include <bench.h>
+
+// Global constants
+extern const digit_t p[NWORDS_FIELD];
+extern const digit_t R2[NWORDS_FIELD];
+
+// Disabled local cycle counter, kept for reference only: it reads the x86 time-stamp
+// counter with rdtsc and combines the two 32-bit halves into one 64-bit value.
+// NOTE(review): cpucycles() is presumably supplied through <bench.h> instead -- confirm.
+#if 0
+int64_t cpucycles(void)
+{ // Access system counter for benchmarking
+    unsigned int hi, lo;
+
+    asm volatile ("rdtsc\n\t" : "=a" (lo), "=d"(hi));
+    return ((int64_t)lo) | (((int64_t)hi) << 32);
+}
+#endif
+
+
+int compare_words(digit_t* a, digit_t* b, unsigned int nwords)
+{ // Compare two "nwords"-word values: returns (1) a>b, (0) a=b, (-1) a<b
+  // SECURITY NOTE: this function does not have constant-time execution. TO BE USED FOR TESTING ONLY.
+    unsigned int k = nwords;
+
+    // Scan from the most-significant word down; the first differing word decides
+    while (k-- > 0)
+    {
+        if (a[k] != b[k]) return (a[k] > b[k]) ? 1 : -1;
+    }
+
+    return 0;
+}
+
+
+static void sub_test(digit_t* out, digit_t* a, digit_t* b, unsigned int nwords)
+{ // Multiprecision subtraction out = a-b, intended for a>b (the final borrow is dropped)
+  // SECURITY NOTE: this function does not have constant-time execution. It is for TESTING ONLY.
+    digit_t borrow = 0;
+
+    for (unsigned int k = 0; k < nwords; k++)
+    {
+        // Inputs are read before out[k] is written, so out may alias a or b
+        const digit_t ak = a[k], bk = b[k];
+        const digit_t diff = ak - bk;
+
+        out[k] = diff - borrow;
+        borrow = (ak < bk) || (diff < borrow);
+    }
+}
+
+
+void fprandom_test(digit_t* a)
+{ // Generating a pseudo-random field element in [0, p-1]
+  // SECURITY NOTE: distribution is not fully uniform. TO BE USED FOR TESTING ONLY.
+    unsigned int i, nwords = NWORDS_FIELD;
+    unsigned char* string = (unsigned char*)a;
+    digit_t mask = p[nwords-1];
+
+    for (i = 0; i < sizeof(digit_t)*nwords; i++) {
+        *(string + i) = (unsigned char)rand();              // Fill every byte of the element
+    }
+
+    // Smear the top set bit of p's most-significant word downwards to obtain an all-ones
+    // mask of the same bit length. The candidate is then below 2^bitlen(p) <= 2p for any
+    // parameter set, instead of relying on a bit count hard-coded for another prime.
+    for (i = 1; i < 8*sizeof(digit_t); i <<= 1) {
+        mask |= mask >> i;
+    }
+    a[nwords-1] &= mask;
+
+    while (compare_words((digit_t*)p, a, nwords) < 1) {  // Force it to [0, modulus-1]
+        sub_test(a, a, (digit_t*)p, nwords);
+    }
+}
+
+
+void fp2random_test(fp2_t* a)
+{ // Generating a pseudo-random element in GF(p^2)
+  // SECURITY NOTE: distribution is not fully uniform. TO BE USED FOR TESTING ONLY.
+    digit_t* parts[2] = { a->re, a->im };
+
+    // Real component first, then imaginary, each drawn independently
+    for (int k = 0; k < 2; k++) {
+        fprandom_test(parts[k]);
+    }
+}
--- a/Show More
+++ b/Show More