second-round version of SQIsign

Co-authored-by: Marius A. Aardal <marius.andre.aardal@gmail.com> Co-authored-by: Gora Adj <gora.adj@tii.ae> Co-authored-by: Diego F. Aranha <dfaranha@cs.au.dk> Co-authored-by: Andrea Basso <sqisign@andreabasso.com> Co-authored-by: Isaac Andrés Canales Martínez <icanalesm0500@gmail.com> Co-authored-by: Jorge Chávez-Saab <jorgechavezsaab@gmail.com> Co-authored-by: Maria Corte-Real Santos <mariascrsantos98@gmail.com> Co-authored-by: Luca De Feo <github@defeo.lu> Co-authored-by: Max Duparc <max.duparc@epfl.ch> Co-authored-by: Jonathan Komada Eriksen <jonathan.eriksen97@gmail.com> Co-authored-by: Décio Luiz Gazzoni Filho <decio@decpp.net> Co-authored-by: Basil Hess <bhe@zurich.ibm.com> Co-authored-by: Antonin Leroux <antonin.leroux@polytechnique.org> Co-authored-by: Patrick Longa <plonga@microsoft.com> Co-authored-by: Luciano Maino <mainoluciano.96@gmail.com> Co-authored-by: Michael Meyer <michael@random-oracles.org> Co-authored-by: Hiroshi Onuki <onuki@mist.i.u-tokyo.ac.jp> Co-authored-by: Lorenz Panny <lorenz@yx7.cc> Co-authored-by: Giacomo Pope <giacomopope@gmail.com> Co-authored-by: Krijn Reijnders <reijnderskrijn@gmail.com> Co-authored-by: Damien Robert <damien.robert@inria.fr> Co-authored-by: Francisco Rodríguez-Henriquez <francisco.rodriguez@tii.ae> Co-authored-by: Sina Schaeffler <sschaeffle@student.ethz.ch> Co-authored-by: Benjamin Wesolowski <benjamin.wesolowski@ens-lyon.fr>
2025-02-06 00:00:00 +00:00
parent ff34a8cd18
commit 91e9e464fe
481 changed files with 80785 additions and 55963 deletions
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,92 +1,74 @@
-# There are the following dependencies
-#     ┌─┬──────┬─┐           ┌─┬────┬─┐            ┌─┬──────┬─┐
-#     │ ├──────┤ │           │ ├────┤ │            │ ├──────┤ │
-#     │ │Keygen│ │           │ │Sign│ │            │ │Verify│ │
-#     │ ├──────┤ │           │ ├────┤ │            │ ├──────┤ │
-#     └─┴───┬──┴─┘           └─┴─┬──┴─┘            └─┴───┬──┴─┘
-#           │                    │                       │
-#           │                    │                       │
-#           ├────────────────────┼─────────────────┐     │
-#           │                    │                 │     │
-#           │                    │                 │     │
-#       ┌───▼──┐          ┌──────▼────────┐   ┌────▼─────▼───────────┐
-#       │ PRNG ◄────┬─────┤ Iso <-> Ideal ├───►   Elliptic Curves,   │
-#       └───▲──┘    │     └──────┬────────┘   │ Pairings & Isogenies │
-#           │       │            │            └───▲──────┬───────────┘
-#           │       │            │                │      │
-#       ┌───┴──┐    │            │                │      │
-#       │ KLPT ◄────┘            │     ┌──────────┘      │
-#       └───┬──┘                 │     │                 │
-#           │                    │     │                 │
-# ┌─────────▼─────────┐          │     │                 │
-# │ Quaternion orders │          │     │            ┌────▼───┐
-# │     and ideals    │          │     │            │ GF(p²) │
-# └─────────┬─────────┘          │     │            └────┬───┘
-#           │           ┌─┬──────▼─────┴──┬─┐            │
-#     ┌─────▼─────┐     │ ├───────────────┤ │      ┌─────▼─────┐
-#     │ MP BigInt │     │ │Precomputations│ │      │ FP BigInt │
-#     └───────────┘     │ ├───────────────┤ │      └───────────┘
-#                       └─┴───────────────┴─┘                    
-
 add_subdirectory(common)
-add_subdirectory(intbig)
-add_subdirectory(quaternion)
-add_subdirectory(precomp)
-add_subdirectory(klpt)
-add_subdirectory(gf)
-add_subdirectory(ec)

-add_subdirectory(id2iso)
-add_subdirectory(protocols)
+if(ENABLE_SIGN)  # quaternion is only configured when signing support is enabled
+    add_subdirectory(quaternion)
+endif()
+
+add_subdirectory(mp)  # the directories from here to verification are always configured
+add_subdirectory(gf)
+add_subdirectory(precomp)
+add_subdirectory(ec)
+add_subdirectory(hd)
+add_subdirectory(verification)
+
+if(ENABLE_SIGN)  # id2iso and signature are likewise signing-only
+    add_subdirectory(id2iso)
+    add_subdirectory(signature)
+endif()

 FOREACH(SVARIANT ${SVARIANT_S})
    string(TOLOWER ${SVARIANT} SVARIANT_LOWER)
    string(TOUPPER ${SVARIANT} SVARIANT_UPPER)
    set(SOURCE_FILES_VARIANT sqisign.c)
+    # Signing-only libraries and include paths below are gated on ENABLE_SIGN.
    # Library for SQIsign variant
    add_library(sqisign_${SVARIANT_LOWER} ${SOURCE_FILES_VARIANT})
-    target_link_libraries(sqisign_${SVARIANT_LOWER} PUBLIC 
-        ${LIB_PROTOCOLS_${SVARIANT_UPPER}} 
-        ${LIB_ID2ISO_${SVARIANT_UPPER}} 
-        ${LIB_KLPT_${SVARIANT_UPPER}} 
-        ${LIB_QUATERNION} 
-        ${LIB_PRECOMP_${SVARIANT_UPPER}} 
-        ${LIB_INTBIG} 
-        ${LIB_GF_${SVARIANT_UPPER}} 
-        ${LIB_EC_${SVARIANT_UPPER}} 
-        ${GMP} 
+    target_link_libraries(sqisign_${SVARIANT_LOWER} PUBLIC
+        $<$<BOOL:${ENABLE_SIGN}>:${LIB_SIGNATURE_${SVARIANT_UPPER}}>
+        ${LIB_VERIFICATION_${SVARIANT_UPPER}}
+        $<$<BOOL:${ENABLE_SIGN}>:${LIB_ID2ISO_${SVARIANT_UPPER}}>
+        $<$<BOOL:${ENABLE_SIGN}>:${LIB_QUATERNION}>
+        ${LIB_MP}
+        ${LIB_GF_${SVARIANT_UPPER}}
+        ${LIB_EC_${SVARIANT_UPPER}}
+        ${LIB_HD_${SVARIANT_UPPER}}
+        ${LIB_PRECOMP_${SVARIANT_UPPER}}
+        $<$<BOOL:${ENABLE_SIGN}>:GMP>  # GMP is linked only when signing is enabled
        sqisign_common_sys
    )

-    target_include_directories(sqisign_${SVARIANT_LOWER} PUBLIC ${INC_PROTOCOLS} ${INC_INTBIG} ${INC_QUATERNION} ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_EC} ${INC_GF_${SVARIANT_UPPER}} ${INC_COMMON} ${INC_KLPT} ${INC_ID2ISO} ../include PRIVATE common/generic internal)
+    target_include_directories(sqisign_${SVARIANT_LOWER} PUBLIC $<$<BOOL:${ENABLE_SIGN}>:${INC_SIGNATURE}> ${INC_VERIFICATION} $<$<BOOL:${ENABLE_SIGN}>:${INC_QUATERNION}> ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_MP} ${INC_EC} ${INC_GF} ${INC_GF_${SVARIANT_UPPER}} ${INC_COMMON} ${INC_HD} $<$<BOOL:${ENABLE_SIGN}>:${INC_ID2ISO}> ../include PRIVATE common/generic internal)
    target_compile_definitions(sqisign_${SVARIANT_LOWER} PUBLIC SQISIGN_VARIANT=${SVARIANT})

    # Library for SQIsign variant (test)
    add_library(sqisign_${SVARIANT_LOWER}_test ${SOURCE_FILES_VARIANT})
-    target_link_libraries(sqisign_${SVARIANT_LOWER}_test PUBLIC 
-        ${LIB_PROTOCOLS_${SVARIANT_UPPER}} 
-        ${LIB_ID2ISO_${SVARIANT_UPPER}} 
-        ${LIB_KLPT_${SVARIANT_UPPER}} 
-        ${LIB_QUATERNION} 
-        ${LIB_PRECOMP_${SVARIANT_UPPER}} 
-        ${LIB_INTBIG} 
-        ${LIB_GF_${SVARIANT_UPPER}} 
-        ${LIB_EC_${SVARIANT_UPPER}} 
-        ${GMP} 
+    target_link_libraries(sqisign_${SVARIANT_LOWER}_test PUBLIC
+        $<$<BOOL:${ENABLE_SIGN}>:${LIB_SIGNATURE_${SVARIANT_UPPER}}>
+        ${LIB_VERIFICATION_${SVARIANT_UPPER}}
+        $<$<BOOL:${ENABLE_SIGN}>:${LIB_ID2ISO_${SVARIANT_UPPER}}>
+        $<$<BOOL:${ENABLE_SIGN}>:${LIB_QUATERNION}>
+        ${LIB_MP}
+        ${LIB_GF_${SVARIANT_UPPER}}
+        ${LIB_EC_${SVARIANT_UPPER}}
+        ${LIB_HD_${SVARIANT_UPPER}}
+        ${LIB_PRECOMP_${SVARIANT_UPPER}}
+        $<$<BOOL:${ENABLE_SIGN}>:GMP>  # same link list as above, but with sqisign_common_test
        sqisign_common_test
    )

-    target_include_directories(sqisign_${SVARIANT_LOWER}_test PUBLIC ${INC_PROTOCOLS} ${INC_INTBIG} ${INC_QUATERNION} ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_EC} ${INC_GF_${SVARIANT_UPPER}} ${INC_COMMON} ${INC_KLPT} ${INC_ID2ISO} ../include PRIVATE common/generic internal)
+    target_include_directories(sqisign_${SVARIANT_LOWER}_test PUBLIC $<$<BOOL:${ENABLE_SIGN}>:${INC_SIGNATURE}> ${INC_VERIFICATION} $<$<BOOL:${ENABLE_SIGN}>:${INC_QUATERNION}> ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_MP} ${INC_EC} ${INC_GF} ${INC_GF_${SVARIANT_UPPER}} ${INC_COMMON} ${INC_HD} $<$<BOOL:${ENABLE_SIGN}>:${INC_ID2ISO}> ../include PRIVATE common/generic internal)
    target_compile_definitions(sqisign_${SVARIANT_LOWER}_test PUBLIC SQISIGN_VARIANT=${SVARIANT})

    # Library with NIST API
    set(SOURCE_FILE_NISTAPI nistapi/${SVARIANT_LOWER}/api.c)
    add_library(sqisign_${SVARIANT_LOWER}_nistapi ${SOURCE_FILE_NISTAPI})
-    target_link_libraries(sqisign_${SVARIANT_LOWER}_nistapi PRIVATE sqisign_${SVARIANT_LOWER})
+    target_link_libraries(sqisign_${SVARIANT_LOWER}_nistapi PUBLIC sqisign_${SVARIANT_LOWER})  # PUBLIC: consumers also link the variant library
    target_include_directories(sqisign_${SVARIANT_LOWER}_nistapi PUBLIC nistapi/${SVARIANT_LOWER} PUBLIC ../include)
+    target_compile_definitions(sqisign_${SVARIANT_LOWER}_nistapi PUBLIC SQISIGN_VARIANT=${SVARIANT})

    # Library with NIST API (test)
    add_library(sqisign_${SVARIANT_LOWER}_test_nistapi ${SOURCE_FILE_NISTAPI})
-    target_link_libraries(sqisign_${SVARIANT_LOWER}_test_nistapi PRIVATE sqisign_${SVARIANT_LOWER}_test)
+    target_link_libraries(sqisign_${SVARIANT_LOWER}_test_nistapi PUBLIC sqisign_${SVARIANT_LOWER}_test)  # PUBLIC: consumers also link the test variant library
    target_include_directories(sqisign_${SVARIANT_LOWER}_test_nistapi PUBLIC nistapi/${SVARIANT_LOWER})
+    target_compile_definitions(sqisign_${SVARIANT_LOWER}_test_nistapi PUBLIC SQISIGN_VARIANT=${SVARIANT})
 ENDFOREACH()
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -1,3 +1,8 @@
+if (POLICY CMP0076)  # CMP0076 NEW: target_sources() converts relative source paths to absolute
+    cmake_policy(SET CMP0076 NEW)
+endif()
+
 get_filename_component(CCSD_NAME ${CMAKE_CURRENT_SOURCE_DIR} NAME)
 string(TOUPPER ${CCSD_NAME} CCSD_NAME_UPPER)
 include(${SELECT_SQISIGN_VARIANT})
+include(${SELECT_IMPL_TYPE})  # NOTE(review): presumably dispatches to the per-implementation subdirectory - confirm
--- a/src/common/arm64crypto/CMakeLists.txt
+++ b/src/common/arm64crypto/CMakeLists.txt
@@ -0,0 +1,40 @@
+# AES-CTR-DRBG for randombytes built with the ARMv8 crypto extensions (+crypto).
+# Clang builds the inline-assembly variant; other compilers build the
+# intrinsics variant, whose source file gets -fno-strict-aliasing.
+if(CMAKE_C_COMPILER_ID MATCHES "Clang")
+    set(SOURCE_FILES_COMMON_ARM64CRYPTO randombytes_ctrdrbg_inline_asm.c)
+else()
+    set(SOURCE_FILES_COMMON_ARM64CRYPTO randombytes_ctrdrbg.c)
+    set_source_files_properties(randombytes_ctrdrbg.c PROPERTIES COMPILE_FLAGS -fno-strict-aliasing)
+endif()
+
+# Add the DRBG to both common libraries; RANDOMBYTES_ARM64CRYPTO enables its randombytes() wrappers.
+foreach(SQISIGN_COMMON_TARGET sqisign_common_test sqisign_common_sys)
+    target_sources(${SQISIGN_COMMON_TARGET} PRIVATE ${SOURCE_FILES_COMMON_ARM64CRYPTO})
+    target_include_directories(${SQISIGN_COMMON_TARGET} PRIVATE include)
+    target_compile_definitions(${SQISIGN_COMMON_TARGET} PRIVATE RANDOMBYTES_ARM64CRYPTO)
+    target_compile_options(${SQISIGN_COMMON_TARGET} PRIVATE -march=armv8-a+crypto)
+endforeach()
+
+# Sources shared by the standalone DRBG test and benchmark executables.
+set(SOURCE_FILES_CTRDRBG_TEST_BENCHMARK
+    ${SOURCE_FILES_COMMON_ARM64CRYPTO}
+    ../ref/aes_c.c
+    ../ref/randombytes_ctrdrbg.c
+    ../generic/randombytes_system.c
+)
+
+# sqisign_test_ctrdrbg_arm64crypto and sqisign_bench_ctrdrbg_arm64crypto share one recipe.
+foreach(CTRDRBG_KIND test bench)
+    set(CTRDRBG_EXE sqisign_${CTRDRBG_KIND}_ctrdrbg_arm64crypto)
+    add_executable(${CTRDRBG_EXE} ${SOURCE_FILES_CTRDRBG_TEST_BENCHMARK} ../generic/test/${CTRDRBG_KIND}_ctrdrbg.c)
+    target_include_directories(${CTRDRBG_EXE} PRIVATE ${INC_PUBLIC} ${INC_COMMON} include ../ref/include)
+    target_compile_definitions(${CTRDRBG_EXE} PRIVATE
+        CTRDRBG_TEST_BENCH
+        RANDOMBYTES_INIT_PLATFORM=randombytes_init_arm64crypto
+        RANDOMBYTES_PLATFORM=randombytes_arm64crypto)
+    target_compile_options(${CTRDRBG_EXE} PRIVATE -march=armv8-a+crypto)
+endforeach()
+
+add_test(NAME sqisign_test_ctrdrbg_arm64crypto COMMAND sqisign_test_ctrdrbg_arm64crypto)
+set(BM_BINS ${BM_BINS} sqisign_bench_ctrdrbg_arm64crypto CACHE INTERNAL "List of benchmark executables")
--- a/src/common/arm64crypto/include/randombytes_arm64crypto.h
+++ b/src/common/arm64crypto/include/randombytes_arm64crypto.h
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef RANDOMBYTES_ARM64CRYPTO_H
+#define RANDOMBYTES_ARM64CRYPTO_H
+
+#include <stdio.h>
+
+#define RNG_SUCCESS      0   // returned by randombytes_arm64crypto()
+#define RNG_BAD_MAXLEN  -1   // NOTE(review): the three negative codes are not
+#define RNG_BAD_OUTBUF  -2   // referenced by the arm64crypto code in view;
+#define RNG_BAD_REQ_LEN -3   // presumably kept for parity with the reference RNG
+
+typedef struct {  // NOTE(review): not referenced by the arm64crypto sources in view
+    unsigned char   buffer[16];
+    int             buffer_pos;
+    unsigned long   length_remaining;
+    unsigned char   key[32];
+    unsigned char   ctr[16];
+} AES_XOF_struct;
+
+typedef struct {  // state of the AES-256 CTR_DRBG; each source file keeps one static instance
+    unsigned char   Key[32];         // AES-256 key, re-expanded at the start of every randombytes call
+    unsigned char   V[16];           // 128-bit counter block; arithmetic is done on its byte-reversed form
+    int             reseed_counter;  // set to 1 by init, incremented once per randombytes call
+} AES256_CTR_DRBG_struct;
+
+#endif /* RANDOMBYTES_ARM64CRYPTO_H */
--- a/src/common/arm64crypto/randombytes_ctrdrbg.c
+++ b/src/common/arm64crypto/randombytes_ctrdrbg.c
@@ -0,0 +1,276 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include "randombytes_arm64crypto.h"
+
+#include <arm_neon.h>
+#include <string.h>
+
+static AES256_CTR_DRBG_struct DRBG_ctx;  // single file-scope DRBG state used by every call (no locking here)
+
+// S-box on each byte of `in`: broadcast to all columns, AESE with a zero key.
+static inline uint32_t AES_sbox_x4(uint32_t in) {
+  const uint8x16_t zero_key = vdupq_n_u8(0);
+  const uint8x16_t cols = vreinterpretq_u8_u32(vdupq_n_u32(in));
+  return vgetq_lane_u32(vreinterpretq_u32_u8(vaeseq_u8(cols, zero_key)), 0);
+}
+
+#define ROTR32(x, n) (((x) << (32 - (n))) | ((x) >> (n)))  // rotate a 32-bit x right by n (0 < n < 32)
+
+typedef union {  // byte view and 32-bit-word view of the 15 AES-256 round keys
+  uint8_t u8[15][16];
+  uint32_t u32[15][4];
+} subkeys_t;
+
+static void AES256_key_schedule(uint8_t subkeys[15][16], const uint8_t *key) {
+  subkeys_t *sk = (subkeys_t *)subkeys;  // word view; NOTE(review): assumes subkeys is 4-byte aligned
+  uint8_t rcon = 1;  // round constant, doubled in GF(2^8) after each pair of round keys
+  uint32_t s;
+  int i, j;
+  // Expands the 32-byte key into the 15 round keys of AES-256, written to subkeys.
+  memcpy(&subkeys[0][0], key, 32 * sizeof(uint8_t));  // round keys 0 and 1 are the raw key
+
+  for (i = 2; i < 14; i += 2) {
+    s = AES_sbox_x4(sk->u32[i - 1][3]);  // even round key: S-box, rotate, add rcon
+    sk->u32[i][0] = ROTR32(s, 8) ^ rcon ^ sk->u32[i - 2][0];
+
+    for (j = 1; j < 4; j++) {
+      sk->u32[i][j] = sk->u32[i][j - 1] ^ sk->u32[i - 2][j];
+    }
+
+    s = AES_sbox_x4(sk->u32[i][3]);  // odd round key: S-box only, no rotate and no rcon
+    sk->u32[i + 1][0] = s ^ sk->u32[i - 1][0];
+
+    for (j = 1; j < 4; j++) {
+      sk->u32[i + 1][j] = sk->u32[i + 1][j - 1] ^ sk->u32[i - 1][j];
+    }
+
+    rcon = (rcon << 1) ^ ((rcon >> 7) * 0x11b);
+  }
+
+  s = AES_sbox_x4(sk->u32[13][3]);  // round key 14 (the last) follows the even-key rule
+  sk->u32[14][0] = ROTR32(s, 8) ^ rcon ^ sk->u32[12][0];
+
+  for (j = 1; j < 4; j++) {
+    sk->u32[14][j] = sk->u32[14][j - 1] ^ sk->u32[12][j];
+  }
+}
+
+#define AES256_ECB_XWAYS(ways, vsubkeys, ctr, out)                             \
+  do {                                                                         \
+    uint8x16_t state[(ways)];                                                  \
+    /* First round, using round key 0. */                                      \
+    for (int j = 0; j < (ways); j++) {                                         \
+      state[j] = vaeseq_u8((ctr)[j], (vsubkeys)[0]);                           \
+      state[j] = vaesmcq_u8(state[j]);                                         \
+    }                                                                          \
+    /* Middle rounds, using round keys 1..12. */                               \
+    for (int i = 1; i < 13; i++) {                                             \
+      for (int j = 0; j < (ways); j++) {                                       \
+        state[j] = vaeseq_u8(state[j], (vsubkeys)[i]);                         \
+        state[j] = vaesmcq_u8(state[j]);                                       \
+      }                                                                        \
+    }                                                                          \
+    /* Last round: no MixColumns; XOR in round key 14 and store. */            \
+    for (int j = 0; j < (ways); j++) {                                         \
+      state[j] = vaeseq_u8(state[j], (vsubkeys)[13]);                          \
+      state[j] = veorq_u8(state[j], (vsubkeys)[14]);                           \
+      vst1q_u8((out) + j * 16, state[j]);                                      \
+    }                                                                          \
+  } while (0)
+
+// vsubkeys - the 15 AES-256 round keys
+// ctr - a 128-bit plaintext block
+// buffer - receives the 128-bit ciphertext block (16 bytes)
+static void AES256_ECB(uint8x16_t vsubkeys[15], uint8x16_t ctr,
+                       unsigned char *buffer) {
+  AES256_ECB_XWAYS(1, vsubkeys, (&ctr), buffer);  // (&ctr): the macro indexes ctr as an array
+}
+
+// vsubkeys - the 15 AES-256 round keys
+// ctr - an array of 3 x 128-bit plaintext blocks
+// buffer - receives the 3 x 128-bit ciphertext blocks (48 bytes)
+static void AES256_ECB_x3(uint8x16_t vsubkeys[15], uint8x16_t ctr[3],
+                          unsigned char *buffer) {
+  AES256_ECB_XWAYS(3, vsubkeys, ctr, buffer);
+}
+
+// Reverse the byte order of the 128-bit value *x in place.
+static void bswap128(__uint128_t *x) {
+  uint64_t *x64 = (uint64_t *)x;
+  const uint64_t first = x64[0];
+  x64[0] = __builtin_bswap64(x64[1]);
+  x64[1] = __builtin_bswap64(first);
+}
+
+// Add incr to the 16-byte counter V, whose bytes are stored in reversed order.
+static void add_to_V(unsigned char V[], int incr) {
+  __uint128_t v;  // aligned copy: V is a byte array and may not be 16-byte aligned
+  memcpy(&v, V, sizeof(v));
+  bswap128(&v);
+  v += incr;
+  bswap128(&v);
+  memcpy(V, &v, sizeof(v));
+}
+
+// CTR_DRBG update: encrypts the next three counter values after V under
+// vsubkeys, XORs in provided_data (48 bytes, may be NULL), and stores the
+// result as the new Key (32 bytes) and V (16 bytes). V is then advanced by one.
+static void AES256_CTR_DRBG_Update(const unsigned char *provided_data,
+                                   uint8x16_t vsubkeys[15], unsigned char *Key,
+                                   unsigned char *V) {
+  unsigned char temp[48];
+  __uint128_t V128, t;
+  uint8x16_t vV[3];
+
+  memcpy(&V128, V, sizeof(V128));
+  bswap128(&V128);  // byte-reverse so that ++ steps the stored counter
+
+  for (int j = 0; j < 3; j++) {
+    t = ++V128;  // increment first, then encrypt
+    bswap128(&t);
+    vV[j] = vld1q_u8((const uint8_t *)&t);
+  }
+
+  AES256_ECB_x3(vsubkeys, vV, temp);
+
+  if (provided_data != NULL)
+    for (int i = 0; i < 48; i++)
+      temp[i] ^= provided_data[i];
+  memcpy(Key, temp, 32);
+  memcpy(V, temp + 32, 16);
+  add_to_V(V, 1);  // same buffer that was just written, not the global
+}
+
+// Seed the DRBG: 48-byte entropy, optional 48-byte personalization XORed in.
+void randombytes_init_arm64crypto(unsigned char *entropy_input,
+                                  unsigned char *personalization_string,
+                                  int security_strength) {
+  unsigned char seed_material[48];
+  uint8_t subkeys[15][16];
+  uint8x16_t vsubkeys[15];
+
+  (void)security_strength;  // unused
+  for (int i = 0; i < 48; i++) {
+    seed_material[i] = entropy_input[i];
+    if (personalization_string != NULL)
+      seed_material[i] ^= personalization_string[i];
+  }
+
+  // Start from the all-zero Key and V, then fold in the seed material.
+  memset(DRBG_ctx.V, 0x00, sizeof(DRBG_ctx.V));
+  memset(DRBG_ctx.Key, 0x00, sizeof(DRBG_ctx.Key));
+  AES256_key_schedule(subkeys, DRBG_ctx.Key);
+  for (int round = 0; round < 15; round++)
+    vsubkeys[round] = vld1q_u8(subkeys[round]);
+  AES256_CTR_DRBG_Update(seed_material, vsubkeys, DRBG_ctx.Key, DRBG_ctx.V);
+  DRBG_ctx.reseed_counter = 1;
+}
+
+#define WAYS 4  // AES blocks produced per fast-path iteration
+// Writes xlen pseudorandom bytes to x, then updates Key/V; always returns RNG_SUCCESS.
+int randombytes_arm64crypto(unsigned char *x, unsigned long long xlen) {
+  uint8_t subkeys[15][16];
+  unsigned char block[16];
+  __uint128_t V[WAYS], Vle[WAYS];  // V: counter blocks as fed to AES; Vle: byte-reversed copies used for arithmetic
+  uint8x16x4_t vV;
+  uint8x16_t vsubkeys[15];
+
+  AES256_key_schedule(subkeys, DRBG_ctx.Key);
+
+  for (int j = 0; j < 15; j++) {
+    vsubkeys[j] = vld1q_u8(subkeys[j]);
+  }
+  // Counter blocks for the first WAYS outputs: the stored V and the WAYS-1 values after it.
+  memcpy(&Vle[0], DRBG_ctx.V, sizeof(Vle[0]));
+  V[0] = Vle[0];
+  vV.val[0] = vld1q_u8((uint8_t *)&V[0]);
+  bswap128(&Vle[0]);
+  for (int j = 1; j < WAYS; j++) {
+    Vle[j] = Vle[j - 1] + 1;
+    V[j] = Vle[j];
+    bswap128(&V[j]);
+    vV.val[j] = vld1q_u8((uint8_t *)&V[j]);
+  }
+
+  int entered_fast_path = (xlen >= WAYS * 16) ? 1 : 0;
+
+  while (xlen >= WAYS * 16) {
+    for (int j = 0; j < WAYS; j++) {
+      Vle[j] += 4;  // step every lane to its next counter; NOTE(review): literal 4 equals WAYS
+    }
+    // Encrypt the WAYS counter blocks held in vV (same round structure as AES256_ECB_XWAYS).
+    for (int j = 0; j < WAYS; j++) {
+      vV.val[j] = vaeseq_u8(vV.val[j], vsubkeys[0]);
+      vV.val[j] = vaesmcq_u8(vV.val[j]);
+    }
+
+    for (int i = 1; i < 13; i++) {
+      for (int j = 0; j < WAYS; j++) {
+        vV.val[j] = vaeseq_u8(vV.val[j], vsubkeys[i]);
+        vV.val[j] = vaesmcq_u8(vV.val[j]);
+      }
+    }
+
+    for (int j = 0; j < WAYS; j++) {
+      vV.val[j] = vaeseq_u8(vV.val[j], vsubkeys[13]);
+      vV.val[j] = veorq_u8(vV.val[j], vsubkeys[14]);
+      vst1q_u8(x + j * 16, vV.val[j]);
+    }
+    // Prepare the next WAYS counter blocks for the following iteration or the tail loop.
+    for (int j = 0; j < WAYS; j++) {
+      V[j] = Vle[j];
+      bswap128(&V[j]);
+    }
+
+    vV = vld1q_u8_x4((uint8_t *)V);
+
+    x += WAYS * 16;
+    xlen -= WAYS * 16;
+  }
+  // Output ended exactly on a WAYS-block boundary: set V[0] back to the last counter used.
+  if (entered_fast_path && xlen == 0) {
+    asm volatile("" : "+r,m"(Vle[3]) : : "memory");  // empty asm acting as a barrier on Vle[3]; NOTE(review): reason not stated - confirm
+    V[0] = Vle[3] - 4;
+    bswap128(&V[0]);
+  }
+  // Tail: one block at a time; V[0] is the counter to encrypt and, on exit, the last one used.
+  while (xlen > 0) {
+    if (xlen > 16) {
+      AES256_ECB(vsubkeys, vld1q_u8((uint8_t *)&V[0]), x);
+      x += 16;
+      xlen -= 16;
+
+      Vle[0]++;
+      V[0] = Vle[0];
+      bswap128(&V[0]);
+    } else {  // final block (1..16 bytes): copy only what is needed, V[0] is not advanced
+      AES256_ECB(vsubkeys, vld1q_u8((uint8_t *)&V[0]), block);
+      memcpy(x, block, xlen);
+      xlen = 0;
+    }
+  }
+  // Store the counter; AES256_CTR_DRBG_Update increments it before encrypting.
+  memcpy(DRBG_ctx.V, &V[0], sizeof(V[0]));
+
+  AES256_CTR_DRBG_Update(NULL, vsubkeys, DRBG_ctx.Key, DRBG_ctx.V);
+  DRBG_ctx.reseed_counter++;
+
+  return RNG_SUCCESS;
+}
+
+#ifdef RANDOMBYTES_ARM64CRYPTO
+int randombytes(unsigned char *random_array, unsigned long long nbytes) {
+  int ret = randombytes_arm64crypto(random_array, nbytes);
+#ifdef ENABLE_CT_TESTING
+  VALGRIND_MAKE_MEM_UNDEFINED(random_array, ret);
+#endif
+  return ret;
+}
+
+void randombytes_init(unsigned char *entropy_input,
+                      unsigned char *personalization_string,
+                      int security_strength) {
+  randombytes_init_arm64crypto(entropy_input, personalization_string,
+                               security_strength);
+}
+#endif
--- a/src/common/arm64crypto/randombytes_ctrdrbg_inline_asm.c
+++ b/src/common/arm64crypto/randombytes_ctrdrbg_inline_asm.c
@@ -0,0 +1,422 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <arm_neon.h>
+#include <string.h>
+
+#include "randombytes_arm64crypto.h"
+
+typedef union {  // one 128-bit value viewed as bytes, as two 64-bit halves, or as a single integer
+  uint8_t u8[16];
+  uint64_t u64[2];
+  __uint128_t u128;
+} u128_t;
+
+static AES256_CTR_DRBG_struct DRBG_ctx;  // single file-scope DRBG state used by every call (no locking here)
+
+// S-box on each byte of `in`: broadcast to all columns, AESE with a zero key.
+static inline uint32_t AES_sbox_x4(uint32_t in) {
+  const uint8x16_t zero_key = vdupq_n_u8(0);
+  const uint8x16_t cols = vreinterpretq_u8_u32(vdupq_n_u32(in));
+  return vgetq_lane_u32(vreinterpretq_u32_u8(vaeseq_u8(cols, zero_key)), 0);
+}
+
+#define ROTR32(x, n) (((x) << (32 - (n))) | ((x) >> (n)))  // rotate a 32-bit x right by n (0 < n < 32)
+
+typedef union {  // 32-bit-word view of the 15 AES-256 round keys
+  uint32_t u32[15][4];
+} subkeys_t;
+
+static void AES256_key_schedule(uint8_t subkeys[15][16], const uint8_t *key) {
+  subkeys_t *sk = (subkeys_t *)subkeys;  // word view; NOTE(review): assumes subkeys is 4-byte aligned
+  uint8_t rcon = 1;  // round constant, doubled in GF(2^8) after each pair of round keys
+  uint32_t s;
+  int i, j;
+  // Expands the 32-byte key into the 15 round keys of AES-256, written to subkeys.
+  memcpy(&subkeys[0][0], key, 32 * sizeof(uint8_t));  // round keys 0 and 1 are the raw key
+
+  for (i = 2; i < 14; i += 2) {
+    s = AES_sbox_x4(sk->u32[i - 1][3]);  // even round key: S-box, rotate, add rcon
+    sk->u32[i][0] = ROTR32(s, 8) ^ rcon ^ sk->u32[i - 2][0];
+
+    for (j = 1; j < 4; j++) {
+      sk->u32[i][j] = sk->u32[i][j - 1] ^ sk->u32[i - 2][j];
+    }
+
+    s = AES_sbox_x4(sk->u32[i][3]);  // odd round key: S-box only, no rotate and no rcon
+    sk->u32[i + 1][0] = s ^ sk->u32[i - 1][0];
+
+    for (j = 1; j < 4; j++) {
+      sk->u32[i + 1][j] = sk->u32[i + 1][j - 1] ^ sk->u32[i - 1][j];
+    }
+
+    rcon = (rcon << 1) ^ ((rcon >> 7) * 0x11b);
+  }
+
+  s = AES_sbox_x4(sk->u32[13][3]);  // round key 14 (the last) follows the even-key rule
+  sk->u32[14][0] = ROTR32(s, 8) ^ rcon ^ sk->u32[12][0];
+
+  for (j = 1; j < 4; j++) {
+    sk->u32[14][j] = sk->u32[14][j - 1] ^ sk->u32[12][j];
+  }
+}
+
+#define AES256_ECB_XWAYS(ways, vsubkeys, ctr, out)                             \
+  do {                                                                         \
+    uint8x16_t state[(ways)];                                                  \
+    /* First round, using round key 0. */                                      \
+    for (int j = 0; j < (ways); j++) {                                         \
+      state[j] = vaeseq_u8((ctr)[j], (vsubkeys)[0]);                           \
+      state[j] = vaesmcq_u8(state[j]);                                         \
+    }                                                                          \
+    /* Middle rounds, using round keys 1..12. */                               \
+    for (int i = 1; i < 13; i++) {                                             \
+      for (int j = 0; j < (ways); j++) {                                       \
+        state[j] = vaeseq_u8(state[j], (vsubkeys)[i]);                         \
+        state[j] = vaesmcq_u8(state[j]);                                       \
+      }                                                                        \
+    }                                                                          \
+    /* Last round: no MixColumns; XOR in round key 14 and store. */            \
+    for (int j = 0; j < (ways); j++) {                                         \
+      state[j] = vaeseq_u8(state[j], (vsubkeys)[13]);                          \
+      state[j] = veorq_u8(state[j], (vsubkeys)[14]);                           \
+      vst1q_u8((out) + j * 16, state[j]);                                      \
+    }                                                                          \
+  } while (0)
+
+// vsubkeys - the 15 AES-256 round keys
+// ctr - a 128-bit plaintext block
+// buffer - receives the 128-bit ciphertext block (16 bytes)
+static void AES256_ECB(uint8x16_t vsubkeys[15], uint8x16_t ctr,
+                       unsigned char *buffer) {
+  AES256_ECB_XWAYS(1, vsubkeys, (&ctr), buffer);  // (&ctr): the macro indexes ctr as an array
+}
+
+// vsubkeys - the 15 AES-256 round keys
+// ctr - an array of 3 x 128-bit plaintext blocks
+// buffer - receives the 3 x 128-bit ciphertext blocks (48 bytes)
+static void AES256_ECB_x3(uint8x16_t vsubkeys[15], uint8x16_t ctr[3],
+                          unsigned char *buffer) {
+  AES256_ECB_XWAYS(3, vsubkeys, ctr, buffer);
+}
+
+// Reverse the byte order of the 128-bit value *x in place.
+static void bswap128(u128_t *x) {
+  const uint64_t first = x->u64[0];
+  const uint64_t second = x->u64[1];
+  x->u64[0] = __builtin_bswap64(second);
+  x->u64[1] = __builtin_bswap64(first);
+}
+
+// Add one to *V, treating its bytes as a counter stored in reversed order.
+static void incr_V(u128_t *V) {
+  bswap128(V);
+  V->u128 += 1;
+  bswap128(V);
+}
+
+static void AES256_CTR_DRBG_Update(const unsigned char *provided_data,
+                                   uint8x16_t vsubkeys[15], unsigned char *Key,
+                                   unsigned char *V) {
+  (void)V;  // unused: the counter is read from and written back to DRBG_ctx.V directly
+  // Derives a new Key and V by encrypting the next three counter values under vsubkeys.
+  unsigned char temp[48];
+  u128_t V128, t;
+  uint64x2_t vV[3];
+
+  memcpy(&V128, DRBG_ctx.V, sizeof(V128));
+
+  bswap128(&V128);  // byte-reverse so that u128++ steps the stored counter
+
+  for (int j = 0; j < 3; j++) {
+    V128.u128++;  // increment first, then encrypt
+    t = V128;
+    bswap128(&t);  // back to the stored byte order
+    vV[j] = vld1q_u64((uint64_t *)&t);
+  }
+
+  AES256_ECB_x3(vsubkeys, (uint8x16_t *)vV, temp);
+
+  if (provided_data != NULL)  // optional 48 bytes XORed into the cipher output
+    for (int i = 0; i < 48; i++)
+      temp[i] ^= provided_data[i];
+  memcpy(Key, temp, 32);  // first 32 bytes become the new key
+  memcpy(V128.u8, temp + 32, 16);  // last 16 bytes become the new counter
+
+  incr_V(&V128);  // the stored V is one past the derived value
+
+  memcpy(DRBG_ctx.V, V128.u8, 16);
+}
+
+// Seed the DRBG: 48-byte entropy, optional 48-byte personalization XORed in.
+void randombytes_init_arm64crypto(unsigned char *entropy_input,
+                                  unsigned char *personalization_string,
+                                  int security_strength) {
+  unsigned char seed_material[48];
+  uint8_t subkeys[15][16];
+  uint8x16_t vsubkeys[15];
+
+  (void)security_strength;  // unused
+  for (int i = 0; i < 48; i++) {
+    seed_material[i] = entropy_input[i];
+    if (personalization_string != NULL)
+      seed_material[i] ^= personalization_string[i];
+  }
+
+  // Start from the all-zero Key and V, then fold in the seed material.
+  memset(DRBG_ctx.V, 0x00, sizeof(DRBG_ctx.V));
+  memset(DRBG_ctx.Key, 0x00, sizeof(DRBG_ctx.Key));
+  AES256_key_schedule(subkeys, DRBG_ctx.Key);
+  for (int round = 0; round < 15; round++)
+    vsubkeys[round] = vld1q_u8(subkeys[round]);
+  AES256_CTR_DRBG_Update(seed_material, vsubkeys, DRBG_ctx.Key, DRBG_ctx.V);
+  DRBG_ctx.reseed_counter = 1;
+}
+
+#define WAYS 4
+
+int randombytes_arm64crypto(unsigned char *x, unsigned long long xlen) {
+  uint8_t subkeys[15][16];
+  unsigned char block[16];
+  u128_t V[WAYS], Vle[WAYS];
+  uint8x16x4_t vV;
+  uint8x16_t vsubkeys[15];
+
+  AES256_key_schedule(subkeys, DRBG_ctx.Key);
+
+  for (int j = 0; j < 15; j++) {
+    vsubkeys[j] = vld1q_u8(subkeys[j]);
+  }
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Woverlength-strings"
+  asm("ldp         %[V0l],     %[V0h],  %[DRBG_ctx_V]     \n\t"
+      "stp         %[V0l],     %[V0h],    [%[V]     ]     \n\t"
+      "rev       %[Vle0h],     %[V0l]                     \n\t"
+      "rev       %[Vle0l],     %[V0h]                     \n\t"
+      "adds      %[Vle1l],   %[Vle0l],             #1     \n\t"
+      "adc       %[Vle1h],   %[Vle0h],            xzr     \n\t"
+      "rev         %[V1h],   %[Vle1l]                     \n\t"
+      "rev         %[V1l],   %[Vle1h]                     \n\t"
+      "stp         %[V1l],     %[V1h],    [%[V], #16]     \n\t"
+      "adds      %[Vle2l],   %[Vle0l],             #2     \n\t"
+      "adc       %[Vle2h],   %[Vle0h],            xzr     \n\t"
+      "rev         %[V2h],   %[Vle2l]                     \n\t"
+      "rev         %[V2l],   %[Vle2h]                     \n\t"
+      "stp         %[V2l],     %[V2h],    [%[V], #32]     \n\t"
+      "adds      %[Vle3l],   %[Vle0l],             #3     \n\t"
+      "adc       %[Vle3h],   %[Vle0h],            xzr     \n\t"
+      "rev         %[V3h],   %[Vle3l]                     \n\t"
+      "rev         %[V3l],   %[Vle3h]                     \n\t"
+      "stp         %[V3l],     %[V3h],    [%[V], #48]     \n\t"
+      "ld1       { %[vV0].16b, %[vV1].16b, %[vV2].16b, %[vV3].16b }, [%[V]]\n\t"
+      "cmp        %[xlen],          #64                   \n\t"
+      "b.lo            2f                                 \n\t"
+      ".p2align         6                                 \n\t"
+      "1:                                                 \n\t"
+      "aese    %[vV0].16b,  %[vsk0].16b                   \n\t"
+      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
+      "aese    %[vV1].16b,  %[vsk0].16b                   \n\t"
+      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
+      "aese    %[vV2].16b,  %[vsk0].16b                   \n\t"
+      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
+      "aese    %[vV3].16b,  %[vsk0].16b                   \n\t"
+      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
+      "aese    %[vV0].16b,  %[vsk1].16b                   \n\t"
+      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
+      "aese    %[vV1].16b,  %[vsk1].16b                   \n\t"
+      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
+      "aese    %[vV2].16b,  %[vsk1].16b                   \n\t"
+      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
+      "aese    %[vV3].16b,  %[vsk1].16b                   \n\t"
+      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
+      "adds      %[Vle0l],     %[Vle0l],           #4     \n\t"
+      "adc       %[Vle0h],     %[Vle0h],          xzr     \n\t"
+      "adds      %[Vle1l],     %[Vle1l],           #4     \n\t"
+      "adc       %[Vle1h],     %[Vle1h],          xzr     \n\t"
+      "adds      %[Vle2l],     %[Vle2l],           #4     \n\t"
+      "adc       %[Vle2h],     %[Vle2h],          xzr     \n\t"
+      "adds      %[Vle3l],     %[Vle3l],           #4     \n\t"
+      "adc       %[Vle3h],     %[Vle3h],          xzr     \n\t"
+      "aese    %[vV0].16b,  %[vsk2].16b                   \n\t"
+      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
+      "aese    %[vV1].16b,  %[vsk2].16b                   \n\t"
+      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
+      "aese    %[vV2].16b,  %[vsk2].16b                   \n\t"
+      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
+      "aese    %[vV3].16b,  %[vsk2].16b                   \n\t"
+      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
+      "aese    %[vV0].16b,  %[vsk3].16b                   \n\t"
+      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
+      "aese    %[vV1].16b,  %[vsk3].16b                   \n\t"
+      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
+      "aese    %[vV2].16b,  %[vsk3].16b                   \n\t"
+      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
+      "aese    %[vV3].16b,  %[vsk3].16b                   \n\t"
+      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
+      "rev         %[V0h],     %[Vle0l]                   \n\t"
+      "rev         %[V0l],     %[Vle0h]                   \n\t"
+      "rev         %[V1h],     %[Vle1l]                   \n\t"
+      "rev         %[V1l],     %[Vle1h]                   \n\t"
+      "rev         %[V2h],     %[Vle2l]                   \n\t"
+      "rev         %[V2l],     %[Vle2h]                   \n\t"
+      "rev         %[V3h],     %[Vle3l]                   \n\t"
+      "rev         %[V3l],     %[Vle3h]                   \n\t"
+      "aese    %[vV0].16b,  %[vsk4].16b                   \n\t"
+      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
+      "aese    %[vV1].16b,  %[vsk4].16b                   \n\t"
+      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
+      "aese    %[vV2].16b,  %[vsk4].16b                   \n\t"
+      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
+      "aese    %[vV3].16b,  %[vsk4].16b                   \n\t"
+      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
+      "aese    %[vV0].16b,  %[vsk5].16b                   \n\t"
+      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
+      "aese    %[vV1].16b,  %[vsk5].16b                   \n\t"
+      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
+      "aese    %[vV2].16b,  %[vsk5].16b                   \n\t"
+      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
+      "aese    %[vV3].16b,  %[vsk5].16b                   \n\t"
+      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
+      "aese    %[vV0].16b,  %[vsk6].16b                   \n\t"
+      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
+      "aese    %[vV1].16b,  %[vsk6].16b                   \n\t"
+      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
+      "aese    %[vV2].16b,  %[vsk6].16b                   \n\t"
+      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
+      "aese    %[vV3].16b,  %[vsk6].16b                   \n\t"
+      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
+      "aese    %[vV0].16b,  %[vsk7].16b                   \n\t"
+      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
+      "aese    %[vV1].16b,  %[vsk7].16b                   \n\t"
+      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
+      "aese    %[vV2].16b,  %[vsk7].16b                   \n\t"
+      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
+      "aese    %[vV3].16b,  %[vsk7].16b                   \n\t"
+      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
+      "aese    %[vV0].16b,  %[vsk8].16b                   \n\t"
+      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
+      "aese    %[vV1].16b,  %[vsk8].16b                   \n\t"
+      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
+      "aese    %[vV2].16b,  %[vsk8].16b                   \n\t"
+      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
+      "aese    %[vV3].16b,  %[vsk8].16b                   \n\t"
+      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
+      "aese    %[vV0].16b,  %[vsk9].16b                   \n\t"
+      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
+      "aese    %[vV1].16b,  %[vsk9].16b                   \n\t"
+      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
+      "aese    %[vV2].16b,  %[vsk9].16b                   \n\t"
+      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
+      "aese    %[vV3].16b,  %[vsk9].16b                   \n\t"
+      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
+      "stp         %[V0l],       %[V0h],  [%[V]]          \n\t"
+      "stp         %[V1l],       %[V1h],  [%[V], #16]     \n\t"
+      "stp         %[V2l],       %[V2h],  [%[V], #32]     \n\t"
+      "stp         %[V3l],       %[V3h],  [%[V], #48]     \n\t"
+      "aese    %[vV0].16b, %[vsk10].16b                   \n\t"
+      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
+      "aese    %[vV1].16b, %[vsk10].16b                   \n\t"
+      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
+      "aese    %[vV2].16b, %[vsk10].16b                   \n\t"
+      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
+      "aese    %[vV3].16b, %[vsk10].16b                   \n\t"
+      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
+      "aese    %[vV0].16b, %[vsk11].16b                   \n\t"
+      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
+      "aese    %[vV1].16b, %[vsk11].16b                   \n\t"
+      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
+      "aese    %[vV2].16b, %[vsk11].16b                   \n\t"
+      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
+      "aese    %[vV3].16b, %[vsk11].16b                   \n\t"
+      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
+      "aese    %[vV0].16b, %[vsk12].16b                   \n\t"
+      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
+      "aese    %[vV1].16b, %[vsk12].16b                   \n\t"
+      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
+      "aese    %[vV2].16b, %[vsk12].16b                   \n\t"
+      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
+      "aese    %[vV3].16b, %[vsk12].16b                   \n\t"
+      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
+      "aese    %[vV0].16b, %[vsk13].16b                   \n\t"
+      "eor     %[vV0].16b,   %[vV0].16b, %[vsk14].16b     \n\t"
+      "aese    %[vV1].16b, %[vsk13].16b                   \n\t"
+      "eor     %[vV1].16b,   %[vV1].16b, %[vsk14].16b     \n\t"
+      "stp        %q[vV0],      %q[vV1],       [%[x]], #32\n\t"
+      "aese    %[vV2].16b, %[vsk13].16b                   \n\t"
+      "eor     %[vV2].16b,   %[vV2].16b, %[vsk14].16b     \n\t"
+      "aese    %[vV3].16b, %[vsk13].16b                   \n\t"
+      "eor     %[vV3].16b,   %[vV3].16b, %[vsk14].16b     \n\t"
+      "stp        %q[vV2],      %q[vV3],       [%[x]], #32\n\t"
+      "sub        %[xlen],      %[xlen],          #64     \n\t"
+      "ld1       { %[vV0].16b, %[vV1].16b, %[vV2].16b, %[vV3].16b }, [%[V]]\n\t"
+      "cmp        %[xlen],          #64                   \n\t"
+      "b.hs            1b                                 \n\t"
+      "cbnz       %[xlen],           2f                   \n\t"
+      "subs        %[V0h],     %[Vle3l],           #4     \n\t"
+      "sbc         %[V0l],     %[Vle3h],          xzr     \n\t"
+      "rev         %[V0h],       %[V0h]                   \n\t"
+      "rev         %[V0l],       %[V0l]                   \n\t"
+      "stp         %[V0l],       %[V0h],       [%[V]]     \n\t"
+      "2:                                                 \n\t"
+      : [vV0] "=&w"(vV.val[0]), [vV1] "=&w"(vV.val[1]), [vV2] "=&w"(vV.val[2]),
+        [vV3] "=&w"(vV.val[3]), [Vle0l] "=&r"(Vle[0].u64[0]),
+        [Vle0h] "=&r"(Vle[0].u64[1]), [Vle1l] "=&r"(Vle[1].u64[0]),
+        [Vle1h] "=&r"(Vle[1].u64[1]), [Vle2l] "=&r"(Vle[2].u64[0]),
+        [Vle2h] "=&r"(Vle[2].u64[1]), [Vle3l] "=&r"(Vle[3].u64[0]),
+        [Vle3h] "=&r"(Vle[3].u64[1]), [x] "+r"(x), [xlen] "+r"(xlen),
+        [V0l] "=&r"(V[0].u64[0]), [V0h] "=&r"(V[0].u64[1]),
+        [V1l] "=&r"(V[1].u64[0]), [V1h] "=&r"(V[1].u64[1]),
+        [V2l] "=&r"(V[2].u64[0]), [V2h] "=&r"(V[2].u64[1]),
+        [V3l] "=&r"(V[3].u64[0]), [V3h] "=&r"(V[3].u64[1]),
+        "=m"(*(unsigned char(*)[64])x), "=m"(*(unsigned char(*)[64])V)
+      :
+      [vsk0] "w"(vsubkeys[0]), [vsk1] "w"(vsubkeys[1]), [vsk2] "w"(vsubkeys[2]),
+      [vsk3] "w"(vsubkeys[3]), [vsk4] "w"(vsubkeys[4]), [vsk5] "w"(vsubkeys[5]),
+      [vsk6] "w"(vsubkeys[6]), [vsk7] "w"(vsubkeys[7]), [vsk8] "w"(vsubkeys[8]),
+      [vsk9] "w"(vsubkeys[9]), [vsk10] "w"(vsubkeys[10]),
+      [vsk11] "w"(vsubkeys[11]), [vsk12] "w"(vsubkeys[12]),
+      [vsk13] "w"(vsubkeys[13]), [vsk14] "w"(vsubkeys[14]), [V] "r"(V),
+      [DRBG_ctx_V] "m"(DRBG_ctx.V)
+      : "cc");
+#pragma GCC diagnostic pop
+
+  while (xlen > 0) {
+    if (xlen > 16) {
+      AES256_ECB(vsubkeys, vld1q_u8((uint8_t *)&V[0]), x);
+      x += 16;
+      xlen -= 16;
+
+      Vle[0].u128++;
+      V[0] = Vle[0];
+      bswap128(&V[0]);
+    } else {
+      AES256_ECB(vsubkeys, vld1q_u8((uint8_t *)&V[0]), block);
+      memcpy(x, block, xlen);
+      xlen = 0;
+    }
+  }
+
+  memcpy(DRBG_ctx.V, &V[0], sizeof(V[0]));
+
+  AES256_CTR_DRBG_Update(NULL, vsubkeys, DRBG_ctx.Key, DRBG_ctx.V);
+  DRBG_ctx.reseed_counter++;
+
+  return RNG_SUCCESS;
+}
+
+#ifdef RANDOMBYTES_ARM64CRYPTO
+int randombytes(unsigned char *random_array, unsigned long long nbytes) { // public entry point, backed by the ARM64-crypto DRBG
+  int ret = randombytes_arm64crypto(random_array, nbytes); // int status from the DRBG, not a byte count
+#ifdef ENABLE_CT_TESTING
+  VALGRIND_MAKE_MEM_UNDEFINED(random_array, nbytes); // mark every generated byte as undefined (secret) for Valgrind
+#endif
+  return ret;
+}
+
+// Seeds the ARM64-crypto DRBG; thin forwarder exported under the generic randombytes_init name.
+void randombytes_init(unsigned char *entropy_input, unsigned char *personalization_string,
+                      int security_strength) {
+  randombytes_init_arm64crypto(entropy_input, personalization_string,
+                               security_strength);
+}
+#endif
--- a/src/common/broadwell/CMakeLists.txt
+++ b/src/common/broadwell/CMakeLists.txt
@@ -0,0 +1,43 @@
+# AES-NI backed CTR-DRBG sources added to both common targets below.
+set(SOURCE_FILES_COMMON_AESNI aes_ni.c ctr_drbg.c randombytes_ctrdrbg_aesni.c vaes256_key_expansion.S)
+
+# Each common target gets the sources, the local include/ directory, the
+# RANDOMBYTES_AES_NI define and the -maes/-mavx2 code-generation flags.
+foreach(SQISIGN_COMMON_TARGET IN ITEMS sqisign_common_test sqisign_common_sys)
+    target_compile_options(${SQISIGN_COMMON_TARGET} PRIVATE -maes -mavx2)
+    target_compile_definitions(${SQISIGN_COMMON_TARGET} PRIVATE RANDOMBYTES_AES_NI)
+    target_include_directories(${SQISIGN_COMMON_TARGET} PRIVATE include)
+    target_sources(${SQISIGN_COMMON_TARGET}
+        PRIVATE ${SOURCE_FILES_COMMON_AESNI}
+    )
+endforeach()
+
+# Sources shared by the standalone CTR-DRBG test and benchmark executables:
+# the C implementation from ../ref, the AES-NI implementation from this
+# directory and the system randombytes from ../generic.
+set(SOURCE_FILES_CTRDRBG_TEST_BENCHMARK
+    ../ref/aes_c.c aes_ni.c ctr_drbg.c randombytes_ctrdrbg_aesni.c
+    ../ref/randombytes_ctrdrbg.c
+    ../generic/randombytes_system.c
+    vaes256_key_expansion.S
+)
+
+# CTR-DRBG test executable (driver: ../generic/test/test_ctrdrbg.c), built with the AES-NI platform hooks.
+add_executable(sqisign_test_ctrdrbg_intel ${SOURCE_FILES_CTRDRBG_TEST_BENCHMARK} ../generic/test/test_ctrdrbg.c)
+target_include_directories(sqisign_test_ctrdrbg_intel PRIVATE ${INC_PUBLIC} ${INC_COMMON} include ../ref/include)
+target_compile_definitions(sqisign_test_ctrdrbg_intel PRIVATE CTRDRBG_TEST_BENCH
+    RANDOMBYTES_INIT_PLATFORM=randombytes_init_aes_ni
+    RANDOMBYTES_PLATFORM=randombytes_aes_ni)
+target_compile_options(sqisign_test_ctrdrbg_intel PRIVATE -maes -mavx2)
+# NAME/COMMAND signature: CTest replaces the target name with the built executable's location.
+add_test(NAME sqisign_test_ctrdrbg_intel COMMAND sqisign_test_ctrdrbg_intel)
+
+# CTR-DRBG benchmark executable (driver: ../generic/test/bench_ctrdrbg.c), same sources and flags as the test.
+add_executable(sqisign_bench_ctrdrbg_intel ${SOURCE_FILES_CTRDRBG_TEST_BENCHMARK} ../generic/test/bench_ctrdrbg.c)
+target_include_directories(sqisign_bench_ctrdrbg_intel PRIVATE ${INC_PUBLIC} ${INC_COMMON} include ../ref/include)
+target_compile_options(sqisign_bench_ctrdrbg_intel PRIVATE -maes -mavx2)
+target_compile_definitions(sqisign_bench_ctrdrbg_intel PRIVATE CTRDRBG_TEST_BENCH
+    RANDOMBYTES_INIT_PLATFORM=randombytes_init_aes_ni
+    RANDOMBYTES_PLATFORM=randombytes_aes_ni)
+
+set(BM_BINS ${BM_BINS} sqisign_bench_ctrdrbg_intel CACHE INTERNAL "List of benchmark executables") # append to the cache-wide benchmark list
--- a/src/common/broadwell/aes_ni.c
+++ b/src/common/broadwell/aes_ni.c
@@ -0,0 +1,258 @@
+/***************************************************************************
+* This implementation is a modified version of the code,
+* written by Nir Drucker and Shay Gueron
+* AWS Cryptographic Algorithms Group
+* (ndrucker@amazon.com, gueron@amazon.com)
+*
+* Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+*  
+* Licensed under the Apache License, Version 2.0 (the "License").
+* You may not use this file except in compliance with the License.
+* A copy of the License is located at
+*  
+*     http://www.apache.org/licenses/LICENSE-2.0
+*  
+* or in the "license" file accompanying this file. This file is distributed 
+* on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 
+* express or implied. See the License for the specific language governing 
+* permissions and limitations under the License.
+* The license is detailed in the file LICENSE.txt, and applies to this file.
+* ***************************************************************************/
+
+#include "aes_ni.h"
+#include <string.h>
+
+#include <emmintrin.h>
+#include <immintrin.h>
+
+#define AESENC(m, key)         _mm_aesenc_si128(m, key)       // one full AES round
+#define AESENCLAST(m, key)     _mm_aesenclast_si128(m, key)   // final AES round
+#define XOR(a, b)              _mm_xor_si128(a, b)
+#define ADD32(a, b)            _mm_add_epi32(a, b)            // per-32-bit-lane add, no carry between lanes
+#define SHUF8(a, mask)         _mm_shuffle_epi8(a, mask)
+
+#define ZERO256                _mm256_zeroall                 // used as ZERO256() to clear the YMM registers
+
+#define BSWAP_MASK 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f // set_epi32 arguments of a shuffle mask that reverses the 16 bytes of a 128-bit lane
+
+#ifdef VAES256 // 256-bit helpers: two AES blocks per vector
+#define VAESENC(a, key)        _mm256_aesenc_epi128(a, key)
+#define VAESENCLAST(a, key)    _mm256_aesenclast_epi128(a, key)
+#define EXTRACT128(a, imm)     _mm256_extracti128_si256(a, imm)
+#define XOR256(a, b)           _mm256_xor_si256(a,b)
+#define ADD32_256(a, b)        _mm256_add_epi32(a,b)
+#define SHUF8_256(a, mask)     _mm256_shuffle_epi8(a, mask)
+#endif
+
+#ifdef VAES512 // 512-bit helpers: four AES blocks per vector
+#define VAESENC(a, key)        _mm512_aesenc_epi128(a, key)
+#define VAESENCLAST(a, key)    _mm512_aesenclast_epi128(a, key)
+#define EXTRACT128(a, imm)     _mm512_extracti64x2_epi64(a, imm)
+#define XOR512(a, b)           _mm512_xor_si512(a,b)
+#define ADD32_512(a, b)        _mm512_add_epi32(a,b)
+#define SHUF8_512(a, mask)     _mm512_shuffle_epi8(a, mask)
+#endif
+
+// Byte-reversing load: ctr[0] becomes the most significant byte of the
+// vector and ctr[15] the least significant one.
+_INLINE_ __m128i load_m128i(IN const uint8_t *ctr)
+{
+    return _mm_setr_epi8(ctr[15], ctr[14], ctr[13], ctr[12], ctr[11], ctr[10], ctr[9], ctr[8],
+                         ctr[7],  ctr[6],  ctr[5],  ctr[4],  ctr[3],  ctr[2],  ctr[1], ctr[0]);
+}
+
+// Loads 16 bytes in memory order (byte i of the result is ctr[i]); ctr does
+// not need to be aligned. A single unaligned load produces exactly the
+// vector that _mm_setr_epi8(ctr[0], ..., ctr[15]) builds byte by byte.
+_INLINE_ __m128i loadr_m128i(IN const uint8_t *ctr)
+{
+    return _mm_loadu_si128((const __m128i *)ctr);
+}
+
+// Encrypts the single 16-byte block at pt under the expanded key ks and
+// stores the 16-byte result at ct; neither pointer needs to be aligned.
+void aes256_enc(OUT uint8_t *ct, IN const uint8_t *pt, IN const aes256_ks_t *ks)
+{
+    const __m128i *round_keys = ks->keys;
+
+    // Initial whitening with round key 0.
+    __m128i state = XOR(loadr_m128i(pt), round_keys[0]);
+
+    // Middle rounds 1 .. AES256_ROUNDS-1, then the final round.
+    for (uint32_t round = 1; round < AES256_ROUNDS; round++) {
+        state = AESENC(state, round_keys[round]);
+    }
+    state = AESENCLAST(state, round_keys[AES256_ROUNDS]);
+    _mm_storeu_si128((void*)ct, state);
+    ZERO256(); // clear the vector registers so no key or state bits linger
+}
+
+// Writes num_blocks 16-byte AES-256 keystream blocks to ct. ctr points to
+// the first 16-byte counter block; between blocks only its last four bytes
+// are incremented, as a big-endian 32-bit value that wraps modulo 2^32.
+void aes256_ctr_enc(OUT uint8_t *ct,
+                    IN const uint8_t *ctr,
+                    IN const uint32_t num_blocks,
+                    IN const aes256_ks_t *ks)
+{
+    const __m128i byte_reverse = _mm_set_epi32(BSWAP_MASK);
+    const __m128i increment = _mm_set_epi32(0,0,0,1);
+
+    // Held byte-reversed so the big-endian tail sits in the lowest 32-bit lane.
+    __m128i counter = load_m128i(ctr);
+
+    for (uint32_t n = 0; n < num_blocks; n++)
+    {
+        // Back to memory byte order, then XOR in round key 0.
+        __m128i state = XOR(SHUF8(counter, byte_reverse), ks->keys[0]);
+        for (uint32_t round = 1; round < AES256_ROUNDS; round++) {
+            state = AESENC(state, ks->keys[round]);
+        }
+        state = AESENCLAST(state, ks->keys[AES256_ROUNDS]);
+
+        // Unaligned store: ct carries no alignment requirement.
+        _mm_storeu_si128((void*)&ct[16*n], state);
+        counter = ADD32(counter, increment);
+    }
+
+    ZERO256(); // clear the vector registers before returning
+}
+
+#ifdef VAES256
+// Copies every 128-bit round key of ks into both 128-bit lanes of the matching ks256 entry.
+_INLINE_ void load_ks(OUT __m256i ks256[AES256_ROUNDS + 1], IN const aes256_ks_t *ks)
+{
+    const __m128i *round_keys = ks->keys;
+    for (uint32_t r = 0; r <= AES256_ROUNDS; r++) {
+        ks256[r] = _mm256_broadcastsi128_si256(round_keys[r]);
+    }
+}
+
+// Writes num_blocks 16-byte AES-256-CTR keystream blocks to ct, two blocks per
+// iteration; an odd final block is delegated to aes256_ctr_enc.
+// NIST 800-90A Table 3, Section 10.2.1 (no derivation function) caps a request at
+// min((2^ctr_len - 4) x block_len, 2^19) <= 2^19 bits, i.e. 2^19/2^7 = 2^12 blocks < 2^32.
+// num_blocks is assumed to be less than 2^32; it is the caller's responsibility to ensure it.
+void aes256_ctr_enc256(OUT uint8_t *ct,
+                       IN const uint8_t *ctr,
+                       IN const uint32_t num_blocks,
+                       IN const aes256_ks_t *ks)
+{
+    const uint64_t num_par_blocks = num_blocks/2; // number of two-block iterations
+    const uint64_t blocks_rem = num_blocks - (2*(num_par_blocks)); // 0 or 1 leftover block
+
+    __m256i ks256[AES256_ROUNDS + 1];
+    load_ks(ks256, ks);
+
+    __m128i single_block = load_m128i(ctr); // byte-reversed counter block
+    __m256i ctr_blocks = _mm256_broadcastsi128_si256(single_block);
+
+    // Preparing the masks
+    const __m256i bswap_mask = _mm256_set_epi32(BSWAP_MASK, BSWAP_MASK);
+    const __m256i two = _mm256_set_epi32(0,0,0,2,0,0,0,2); // +2 on the low 32-bit word of each 128-bit lane
+    const __m256i init = _mm256_set_epi32(0,0,0,1,0,0,0,0); // +1 on the upper lane only
+
+    // Initialize two parallel counters (low lane: ctr, high lane: ctr + 1)
+    ctr_blocks = ADD32_256(ctr_blocks, init);
+    __m256i p = SHUF8_256(ctr_blocks, bswap_mask);
+
+    for (uint32_t block_idx = 0; block_idx < num_par_blocks; block_idx++) 
+    {
+        p = XOR256(p, ks256[0]);
+        for (uint32_t i = 1; i < AES256_ROUNDS; i++) 
+        {
+            p = VAESENC(p, ks256[i]);
+        }
+        p = VAESENCLAST(p, ks256[AES256_ROUNDS]);
+
+        // Unaligned store: ct carries no alignment requirement.
+        _mm256_storeu_si256((__m256i *)&ct[PAR_AES_BLOCK_SIZE * block_idx], p);
+
+        // Increase the two counters in parallel
+        ctr_blocks = ADD32_256(ctr_blocks, two);
+        p = SHUF8_256(ctr_blocks, bswap_mask);
+    }
+ 
+    if(0 != blocks_rem)
+    {
+        single_block = EXTRACT128(p, 0); // next unused counter block, in memory byte order
+        aes256_ctr_enc(&ct[PAR_AES_BLOCK_SIZE * num_par_blocks], 
+                       (const uint8_t*)&single_block, blocks_rem, ks);
+    }
+
+    // Delete secrets from registers if any. NOTE(review): the ks256 array itself is not cleared here.
+    ZERO256();
+}
+
+#endif //VAES256
+
+#ifdef VAES512
+
+// Copies every 128-bit round key of ks into all four 128-bit lanes of the matching ks512 entry.
+_INLINE_ void load_ks(OUT __m512i ks512[AES256_ROUNDS + 1], IN const aes256_ks_t *ks)
+{
+    const __m128i *round_keys = ks->keys;
+    for (uint32_t r = 0; r <= AES256_ROUNDS; r++) {
+        ks512[r] = _mm512_broadcast_i32x4(round_keys[r]);
+    }
+}
+
+// Writes num_blocks 16-byte AES-256-CTR keystream blocks to ct, four blocks per
+// iteration; up to three final blocks are delegated to aes256_ctr_enc.
+// NIST 800-90A Table 3, Section 10.2.1 (no derivation function) caps a request at
+// min((2^ctr_len - 4) x block_len, 2^19) <= 2^19 bits, i.e. 2^19/2^7 = 2^12 blocks < 2^32.
+// num_blocks is assumed to be less than 2^32; it is the caller's responsibility to ensure it.
+void aes256_ctr_enc512(OUT uint8_t *ct,
+                       IN const uint8_t *ctr,
+                       IN const uint32_t num_blocks,
+                       IN const aes256_ks_t *ks)
+{
+    const uint64_t num_par_blocks = num_blocks/4; // number of four-block iterations
+    const uint64_t blocks_rem = num_blocks - (4*(num_par_blocks)); // 0..3 leftover blocks
+
+    __m512i ks512[AES256_ROUNDS + 1];
+    load_ks(ks512, ks);
+
+    __m128i single_block = load_m128i(ctr); // byte-reversed counter block
+    __m512i ctr_blocks = _mm512_broadcast_i32x4(single_block);
+
+    // Preparing the masks
+    const __m512i bswap_mask = _mm512_set_epi32(BSWAP_MASK, BSWAP_MASK,
+                                                BSWAP_MASK, BSWAP_MASK);
+    const __m512i four = _mm512_set_epi32(0,0,0,4,0,0,0,4,0,0,0,4,0,0,0,4); // +4 on the low 32-bit word of each lane
+    const __m512i init = _mm512_set_epi32(0,0,0,3,0,0,0,2,0,0,0,1,0,0,0,0); // lane k gets +k (k = 0..3)
+
+    // Initialize four parallel counters (ctr, ctr + 1, ctr + 2, ctr + 3)
+    ctr_blocks = ADD32_512(ctr_blocks, init);
+    __m512i p = SHUF8_512(ctr_blocks, bswap_mask);
+
+    for (uint32_t block_idx = 0; block_idx < num_par_blocks; block_idx++) 
+    {
+        p = XOR512(p, ks512[0]);
+        for (uint32_t i = 1; i < AES256_ROUNDS; i++) 
+        {
+            p = VAESENC(p, ks512[i]);
+        }
+        p = VAESENCLAST(p, ks512[AES256_ROUNDS]);
+
+
+        // Unaligned store: ct carries no alignment requirement.
+        _mm512_storeu_si512(&ct[PAR_AES_BLOCK_SIZE * block_idx], p);
+
+        // Increase the four counters in parallel
+        ctr_blocks = ADD32_512(ctr_blocks, four);
+        p = SHUF8_512(ctr_blocks, bswap_mask);
+    }
+ 
+    if(0 != blocks_rem)
+    {
+        single_block = EXTRACT128(p, 0); // next unused counter block, in memory byte order
+        aes256_ctr_enc(&ct[PAR_AES_BLOCK_SIZE * num_par_blocks], 
+                       (const uint8_t*)&single_block, blocks_rem, ks);
+    }
+
+    // Delete secrets from registers if any. NOTE(review): _mm256_zeroall leaves ZMM16-31 untouched, and ks512 is not cleared - confirm.
+    ZERO256();
+}
+
+#endif //VAES512
--- a/src/common/broadwell/ctr_drbg.c
+++ b/src/common/broadwell/ctr_drbg.c
@@ -0,0 +1,201 @@
+/* Copyright (c) 2017, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+/***************************************************************************
+ * Small modification by Nir Drucker and Shay Gueron
+ * AWS Cryptographic Algorithms Group
+ * (ndrucker@amazon.com, gueron@amazon.com)
+ * include:
+ * 1) Use memcpy/memset instead of OPENSSL_memcpy/memset
+ * 2) Include aes.h as the underlying aes code
+ * 3) Modifying the drbg structure
+ * ***************************************************************************/
+
+#include "ctr_drbg.h"
+#include <string.h>
+
+
+// Section references in this file refer to SP 800-90Ar1:
+// http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-90Ar1.pdf
+
+// CTR_DRBG_init seeds |drbg| from |entropy| XORed with the optional
+// |personalization| string. Returns 1, or 0 if the string is too long.
+int CTR_DRBG_init(CTR_DRBG_STATE *drbg,
+                  const uint8_t entropy[CTR_DRBG_ENTROPY_LEN],
+                  const uint8_t *personalization, size_t personalization_len) {
+  // Section 10.2.1.3.1
+  if (personalization_len > CTR_DRBG_ENTROPY_LEN) {
+    return 0;
+  }
+
+  // Section 10.2.1.2
+  // kInitMask is the result of encrypting blocks with big-endian value 1, 2
+  // and 3 with the all-zero AES-256 key.
+  static const uint8_t kInitMask[CTR_DRBG_ENTROPY_LEN] = {
+      0x53, 0x0f, 0x8a, 0xfb, 0xc7, 0x45, 0x36, 0xb9, 0xa9, 0x63, 0xb4, 0xf1,
+      0xc4, 0xcb, 0x73, 0x8b, 0xce, 0xa7, 0x40, 0x3d, 0x4d, 0x60, 0x6b, 0x6e,
+      0x07, 0x4e, 0xc5, 0xd3, 0xba, 0xf3, 0x9d, 0x18, 0x72, 0x60, 0x03, 0xca,
+      0x37, 0xa6, 0x2a, 0x74, 0xd1, 0xa2, 0xf5, 0x8e, 0x75, 0x06, 0x35, 0x8e,
+  };
+
+  uint8_t seed_material[CTR_DRBG_ENTROPY_LEN];
+  for (size_t i = 0; i < CTR_DRBG_ENTROPY_LEN; i++) {
+    seed_material[i] = entropy[i] ^ kInitMask[i];
+  }
+  for (size_t i = 0; i < personalization_len; i++) {
+    seed_material[i] ^= personalization[i];
+  }
+
+  // Bytes 0..31 form the AES-256 key, bytes 32..47 the initial counter.
+  aes256_key_t key;
+  memcpy(key.raw, seed_material, 32);
+  memcpy(drbg->counter.bytes, seed_material + 32, 16);
+  aes256_key_expansion(&drbg->ks, &key);
+  drbg->reseed_counter = 1;
+  secure_clean(seed_material, sizeof(seed_material)); // wipe the stack copies of
+  secure_clean(key.raw, sizeof(key.raw));             // the seed and the raw key
+  return 1;
+}
+
+// ctr32_add adds |n| (modulo 2^32) to the last four bytes of |drbg->counter|,
+// treated as a big-endian number; the other twelve bytes are left unchanged.
+static void ctr32_add(CTR_DRBG_STATE *drbg, uint32_t n) {
+  const uint32_t swapped = CRYPTO_bswap4(drbg->counter.words[3]);
+  drbg->counter.words[3] = CRYPTO_bswap4(swapped + n);
+}
+
+// ctr_drbg_update derives a new key and counter from three keystream blocks XOR |data|.
+static int ctr_drbg_update(CTR_DRBG_STATE *drbg, const uint8_t *data,
+                           size_t data_len) {
+  // Per section 10.2.1.2, |data_len| must be |CTR_DRBG_ENTROPY_LEN|. Here, we
+  // allow shorter inputs and right-pad them with zeros. This is equivalent to
+  // the specified algorithm but saves a copy in |CTR_DRBG_generate|.
+  if (data_len > CTR_DRBG_ENTROPY_LEN) {
+    return 0; // longer inputs are rejected; every other call returns 1
+  }
+  uint8_t temp[CTR_DRBG_ENTROPY_LEN];
+  for (size_t i = 0; i < CTR_DRBG_ENTROPY_LEN; i += AES_BLOCK_SIZE) {
+    ctr32_add(drbg, 1);
+    aes256_enc(temp + i, drbg->counter.bytes, &drbg->ks);
+  }
+  for (size_t i = 0; i < data_len; i++) {
+    temp[i] ^= data[i];
+  }
+
+  aes256_key_t key;
+  memcpy(key.raw, temp, 32);                    // bytes 0..31: next AES-256 key
+  memcpy(drbg->counter.bytes, temp + 32, 16);   // bytes 32..47: next counter
+  aes256_key_expansion(&drbg->ks, &key);
+  secure_clean(temp, sizeof(temp));             // the next key/counter must not
+  secure_clean(key.raw, sizeof(key.raw));       // linger on the stack
+  return 1;
+}
+
+// CTR_DRBG_reseed mixes |entropy| XOR |additional_data| into |drbg| and resets
+// the reseed counter. Returns 1, or 0 if |additional_data_len| is too large.
+int CTR_DRBG_reseed(CTR_DRBG_STATE *drbg,
+                    const uint8_t entropy[CTR_DRBG_ENTROPY_LEN],
+                    const uint8_t *additional_data,
+                    size_t additional_data_len) {
+  // Section 10.2.1.4
+  if (additional_data_len > CTR_DRBG_ENTROPY_LEN) {
+    return 0;
+  }
+
+  // Work on a private copy so the caller's |entropy| is never modified.
+  uint8_t entropy_copy[CTR_DRBG_ENTROPY_LEN];
+  memcpy(entropy_copy, entropy, CTR_DRBG_ENTROPY_LEN);
+  for (size_t i = 0; i < additional_data_len; i++) {
+    entropy_copy[i] ^= additional_data[i];
+  }
+
+  const int updated = ctr_drbg_update(drbg, entropy_copy, CTR_DRBG_ENTROPY_LEN);
+  secure_clean(entropy_copy, sizeof(entropy_copy)); // wipe the stack copy
+  if (!updated) {
+    return 0;
+  }
+
+  drbg->reseed_counter = 1;
+
+  return 1;
+}
+
+// CTR_DRBG_generate writes |out_len| pseudorandom bytes to |out| and then
+// re-keys |drbg|. |additional_data| (at most CTR_DRBG_ENTROPY_LEN bytes, may be
+// empty) is mixed in both before and after the output is produced. Returns 1
+// on success and 0 if |additional_data_len| is too large.
+int CTR_DRBG_generate(CTR_DRBG_STATE *drbg, uint8_t *out, size_t out_len,
+                      const uint8_t *additional_data,
+                      size_t additional_data_len) {
+  if (additional_data_len != 0 &&
+      !ctr_drbg_update(drbg, additional_data, additional_data_len)) {
+    return 0;
+  }
+
+  // Whole blocks are produced in chunks of at most kChunkSize bytes. The AES-CTR
+  // routines store raw keystream directly into |out|, overwriting every byte
+  // of the chunk, so the buffer does not need to be zeroed beforehand.
+  static const size_t kChunkSize = 8 * 1024;
+
+  // Only whole blocks are handled in this loop; a tail shorter than one block
+  // is produced separately below.
+  while (out_len >= AES_BLOCK_SIZE) {
+    size_t todo = kChunkSize;
+    if (todo > out_len) {
+      todo = out_len;
+    }
+
+    // Round down to a whole number of blocks.
+    todo &= ~(AES_BLOCK_SIZE - 1);
+    const size_t num_blocks = todo / AES_BLOCK_SIZE;
+
+    // Advance to the first unused counter value, encrypt |num_blocks|
+    // consecutive counter values starting there, then move the stored
+    // counter on to the last value that was consumed.
+    ctr32_add(drbg, 1);
+#ifdef VAES512
+    aes256_ctr_enc512(out, drbg->counter.bytes, num_blocks, &drbg->ks);
+#elif defined(VAES256)
+    aes256_ctr_enc256(out, drbg->counter.bytes, num_blocks, &drbg->ks);
+#else
+    aes256_ctr_enc(out, drbg->counter.bytes, num_blocks, &drbg->ks);
+#endif
+    ctr32_add(drbg, num_blocks - 1);
+
+    out += todo;
+    out_len -= todo;
+  }
+
+  if (out_len > 0) {
+    // Partial final block: encrypt one more counter value, copy a prefix.
+    uint8_t block[AES_BLOCK_SIZE];
+    ctr32_add(drbg, 1);
+    aes256_enc(block, drbg->counter.bytes, &drbg->ks);
+    memcpy(out, block, out_len);
+    secure_clean(block, sizeof(block)); // unused keystream bytes must not linger
+  }
+
+  // Right-padding |additional_data| in step 2.2 is handled implicitly by
+  // |ctr_drbg_update|, to save a copy.
+  if (!ctr_drbg_update(drbg, additional_data, additional_data_len)) {
+    return 0;
+  }
+
+  drbg->reseed_counter++;
+  return 1;
+}
+
+void CTR_DRBG_clear(CTR_DRBG_STATE *drbg) { // zeroise the whole state object
+  secure_clean((uint8_t *)drbg, sizeof(*drbg));
+}
--- a/src/common/broadwell/include/aes_ni.h
+++ b/src/common/broadwell/include/aes_ni.h
@@ -0,0 +1,85 @@
+/***************************************************************************
+* Written by Nir Drucker and Shay Gueron
+* AWS Cryptographic Algorithms Group
+* (ndrucker@amazon.com, gueron@amazon.com)
+*
+* Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+*  
+* Licensed under the Apache License, Version 2.0 (the "License").
+* You may not use this file except in compliance with the License.
+* A copy of the License is located at
+*  
+*     http://www.apache.org/licenses/LICENSE-2.0
+*  
+* or in the "license" file accompanying this file. This file is distributed 
+* on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 
+* express or implied. See the License for the specific language governing 
+* permissions and limitations under the License.
+* The license is detailed in the file LICENSE.txt, and applies to this file.
+* ***************************************************************************/
+
+#pragma once
+
+#include <stdint.h>
+#include <wmmintrin.h>
+#include "defs.h"
+
+#define MAX_AES_INVOKATION (MASK(32)) // NOTE(review): MASK() is not defined in defs.h; this macro looks unused - confirm
+
+#define AES256_KEY_SIZE (32ULL) // bytes
+#define AES256_KEY_BITS (AES256_KEY_SIZE * 8)
+#define AES_BLOCK_SIZE (16ULL) // bytes
+#define AES256_ROUNDS (14ULL) // a key schedule holds AES256_ROUNDS + 1 round keys
+
+#ifdef VAES256
+#define PAR_AES_BLOCK_SIZE (AES_BLOCK_SIZE*2) // bytes written per 256-bit vector (two blocks)
+#elif defined(VAES512)
+#define PAR_AES_BLOCK_SIZE (AES_BLOCK_SIZE*4) // bytes written per 512-bit vector (four blocks)
+#endif
+
+typedef ALIGN(16) struct aes256_key_s {
+    uint8_t raw[AES256_KEY_SIZE]; // the raw AES-256 key bytes
+} aes256_key_t;
+
+typedef ALIGN(16) struct aes256_ks_s {
+    __m128i keys[AES256_ROUNDS + 1]; // round keys 0..AES256_ROUNDS, one 128-bit vector each
+} aes256_ks_t;
+
+// Expands key into the round keys stored in ks. The ks parameter must be 16-byte aligned!
+EXTERNC void aes256_key_expansion(OUT aes256_ks_t *ks,
+                                  IN const aes256_key_t *key);
+
+// Encrypt one 128-bit block: ct[15:0] = E(pt[15:0], ks)
+void aes256_enc(OUT uint8_t *ct,
+                IN const uint8_t *pt,
+                IN const aes256_ks_t *ks);
+
+// Encrypt num_blocks consecutive counter blocks (pt is the first counter; "+" acts on its last 4 bytes, big-endian)
+// ct[15:0] = E(pt[15:0],ks)
+// ct[31:16] = E(pt[15:0] + 1,ks)
+// ...
+// ct[16*num_blocks - 1:16*(num_blocks-1)] = E(pt[15:0] + num_blocks - 1,ks)
+void aes256_ctr_enc(OUT uint8_t *ct,
+                    IN const uint8_t *pt,
+                    IN const uint32_t num_blocks,
+                    IN const aes256_ks_t *ks);
+
+// Encrypt num_blocks consecutive counter blocks using VAES (AVX2), two blocks at a time
+// ct[15:0] = E(ctr[15:0],ks)
+// ct[31:16] = E(ctr[15:0] + 1,ks)
+// ...
+// ct[16*num_blocks - 1:16*(num_blocks-1)] = E(ctr[15:0] + num_blocks - 1,ks)
+void aes256_ctr_enc256(OUT uint8_t *ct,
+                       IN const uint8_t *ctr,
+                       IN const uint32_t num_blocks,
+                       IN const aes256_ks_t *ks);
+
+// Encrypt num_blocks consecutive counter blocks using VAES (AVX512), four blocks at a time
+// ct[15:0] = E(ctr[15:0],ks)
+// ct[31:16] = E(ctr[15:0] + 1,ks)
+// ...
+// ct[16*num_blocks - 1:16*(num_blocks-1)] = E(ctr[15:0] + num_blocks - 1,ks)
+void aes256_ctr_enc512(OUT uint8_t *ct,
+                       IN const uint8_t *ctr,
+                       IN const uint32_t num_blocks,
+                       IN const aes256_ks_t *ks);
--- a/src/common/broadwell/include/ctr_drbg.h
+++ b/src/common/broadwell/include/ctr_drbg.h
@@ -0,0 +1,78 @@
+/* Copyright (c) 2017, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+/***************************************************************************
+* Small modification by Nir Drucker and Shay Gueron
+* AWS Cryptographic Algorithms Group
+* (ndrucker@amazon.com, gueron@amazon.com)
+* include:
+* 1) Use memcpy/memset instead of OPENSSL_memcpy/memset
+* 2) Include aes.h as the underlying aes code
+* 3) Modifying the drbg structure
+* ***************************************************************************/
+
+#pragma once
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#include "aes_ni.h"
+
+// CTR_DRBG_STATE contains the state of a CTR_DRBG based on AES-256. See SP
+// 800-90Ar1.
+typedef struct {
+  aes256_ks_t ks; // expanded AES-256 round keys of the current DRBG key
+  union {
+    uint8_t bytes[16]; // counter block in the byte order handed to the AES routines
+    uint32_t words[4]; // same 16 bytes as 32-bit words; words[3] covers the last four bytes
+  } counter;
+  uint64_t reseed_counter; // set to 1 by init/reseed, incremented by each generate call
+} CTR_DRBG_STATE;
+
+// See SP 800-90Ar1, table 3.
+#define CTR_DRBG_ENTROPY_LEN 48 // bytes: 32 for the AES-256 key plus 16 for the counter block
+
+// CTR_DRBG_init initialises |*drbg| given |CTR_DRBG_ENTROPY_LEN| bytes of
+// entropy in |entropy| and, optionally, a personalization string up to
+// |CTR_DRBG_ENTROPY_LEN| bytes in length. It returns one on success and zero
+// on error.
+int CTR_DRBG_init(CTR_DRBG_STATE *drbg,
+                  const uint8_t entropy[CTR_DRBG_ENTROPY_LEN],
+                  const uint8_t *personalization,
+                  size_t personalization_len);
+
+// CTR_DRBG_reseed reseeds |drbg| given |CTR_DRBG_ENTROPY_LEN| bytes of entropy
+// in |entropy| and, optionally, up to |CTR_DRBG_ENTROPY_LEN| bytes of
+// additional data. It returns one on success or zero on error.
+int CTR_DRBG_reseed(CTR_DRBG_STATE *drbg,
+                    const uint8_t entropy[CTR_DRBG_ENTROPY_LEN],
+                    const uint8_t *additional_data,
+                    size_t additional_data_len);
+
+// CTR_DRBG_generate processes up to |CTR_DRBG_ENTROPY_LEN| bytes of additional
+// data (if any) and then writes |out_len| random bytes to |out|. It returns one on success or
+// zero on error.
+int CTR_DRBG_generate(CTR_DRBG_STATE *drbg, uint8_t *out,
+                      size_t out_len,
+                      const uint8_t *additional_data,
+                      size_t additional_data_len);
+
+// CTR_DRBG_clear zeroises the state of |drbg|.
+void CTR_DRBG_clear(CTR_DRBG_STATE *drbg);
+
+
+#if defined(__cplusplus)
+}  // extern C
+#endif
--- a/src/common/broadwell/include/defs.h
+++ b/src/common/broadwell/include/defs.h
@@ -0,0 +1,63 @@
+/***************************************************************************
+* Written by Nir Drucker and Shay Gueron
+* AWS Cryptographic Algorithms Group
+* (ndrucker@amazon.com, gueron@amazon.com)
+*
+* Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+*  
+* Licensed under the Apache License, Version 2.0 (the "License").
+* You may not use this file except in compliance with the License.
+* A copy of the License is located at
+*  
+*     http://www.apache.org/licenses/LICENSE-2.0
+*  
+* or in the "license" file accompanying this file. This file is distributed 
+* on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 
+* express or implied. See the License for the specific language governing 
+* permissions and limitations under the License.
+* The license is detailed in the file LICENSE.txt, and applies to this file.
+* ***************************************************************************/
+
+#pragma once
+
+#include <string.h>
+
+#ifdef __cplusplus
+  #define EXTERNC extern "C"
+#else
+  #define EXTERNC
+#endif
+
+// For code clarity.
+#define IN
+#define OUT
+
+#define ALIGN(n) __attribute__((aligned(n)))
+#define _INLINE_ static inline
+
+typedef enum
+{
+  SUCCESS=0,
+  ERROR=1
+} status_t;
+
+#define SUCCESS 0
+#define ERROR 1
+#define GUARD(func) {if(SUCCESS != func) {return ERROR;}}
+
+#if defined(__GNUC__) && __GNUC__ >= 2 // no fallback branch: other compilers get no CRYPTO_bswap4 from this header
+static inline uint32_t CRYPTO_bswap4(uint32_t x) { // reverses the byte order of a 32-bit word
+  return __builtin_bswap32(x);
+}
+#endif
+
+_INLINE_ void secure_clean(OUT uint8_t *p, IN const uint32_t len) // overwrites len bytes at p with zeros
+{
+#ifdef _WIN32
+    SecureZeroMemory(p, len); // NOTE(review): needs <windows.h>, which this header does not include itself
+#else
+    typedef void *(*memset_t)(void *, int, size_t);
+    static volatile memset_t memset_func = memset; // volatile pointer: the call target has to be re-read,
+    memset_func(p, 0, len);                        // so the compiler cannot discard this memset as a dead store
+#endif
+}
--- a/src/common/broadwell/randombytes_ctrdrbg_aesni.c
+++ b/src/common/broadwell/randombytes_ctrdrbg_aesni.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: Apache-2.0 and Unknown
+//
+/*
+NIST-developed software is provided by NIST as a public service. You may use,
+copy, and distribute copies of the software in any medium, provided that you
+keep intact this entire notice. You may improve, modify, and create derivative
+works of the software or any portion of the software, and you may copy and
+distribute such modifications or works. Modified works should carry a notice
+stating that you changed the software and should note the date and nature of any
+such change. Please explicitly acknowledge the National Institute of Standards
+and Technology as the source of the software.
+
+NIST-developed software is expressly provided "AS IS." NIST MAKES NO WARRANTY OF
+ANY KIND, EXPRESS, IMPLIED, IN FACT, OR ARISING BY OPERATION OF LAW, INCLUDING,
+WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE, NON-INFRINGEMENT, AND DATA ACCURACY. NIST NEITHER REPRESENTS
+NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE UNINTERRUPTED OR
+ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST DOES NOT WARRANT OR MAKE
+ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE OR THE RESULTS THEREOF,
+INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY, RELIABILITY, OR
+USEFULNESS OF THE SOFTWARE.
+
+You are solely responsible for determining the appropriateness of using and
+distributing the software and you assume all risks associated with its use,
+including but not limited to the risks and costs of program errors, compliance
+with applicable laws, damage to or loss of data, programs or equipment, and the
+unavailability or interruption of operation. This software is not intended to be
+used in any situation where a failure could cause risk of injury or damage to
+property. The software developed by NIST employees is not subject to copyright
+protection within the United States.
+*/
+
+#include <string.h>
+
+#include <rng.h>
+#include "ctr_drbg.h"
+
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+#define RNG_SUCCESS 0
+#define RNG_BAD_MAXLEN -1
+#define RNG_BAD_OUTBUF -2
+#define RNG_BAD_REQ_LEN -3
+
+CTR_DRBG_STATE drbg;
+
+#ifndef CTRDRBG_TEST_BENCH // internal linkage unless CTRDRBG_TEST_BENCH is defined
+static
+#endif
+void // initialises the file-level |drbg| state through CTR_DRBG_init
+randombytes_init_aes_ni(unsigned char *entropy_input, // forwarded as the |entropy| argument (CTR_DRBG_ENTROPY_LEN bytes)
+                        unsigned char *personalization_string, // may be NULL
+                        int security_strength) {
+  (void)security_strength; // fixed to 256
+  CTR_DRBG_init(&drbg, entropy_input, personalization_string, // the one/zero status it returns is discarded here
+                (personalization_string == NULL) ? 0 : CTR_DRBG_ENTROPY_LEN); // a non-NULL string is taken to be 48 bytes long
+}
+
+#ifndef CTRDRBG_TEST_BENCH
+static
+#endif
+int
+randombytes_aes_ni(unsigned char *x, size_t xlen) {
+  // CTR_DRBG_generate returns one on success and zero on error: report a failure instead of masking it.
+  return CTR_DRBG_generate(&drbg, x, xlen, NULL, 0) ? RNG_SUCCESS : RNG_BAD_REQ_LEN;
+}
+
+#ifdef RANDOMBYTES_AES_NI
+SQISIGN_API
+int randombytes(unsigned char *random_array, unsigned long long nbytes) {
+  int ret = randombytes_aes_ni(random_array, nbytes); // status code (RNG_SUCCESS is 0), not a byte count
+#ifdef ENABLE_CT_TESTING
+  VALGRIND_MAKE_MEM_UNDEFINED(random_array, nbytes); // taint the whole output; using ret as the length would taint 0 bytes
+#endif
+  return ret;
+}
+
+SQISIGN_API
+void randombytes_init(unsigned char *entropy_input,
+                      unsigned char *personalization_string,
+                      int security_strength) {
+  randombytes_init_aes_ni(entropy_input, personalization_string,
+                          security_strength);
+}
+#endif
--- a/src/common/broadwell/vaes256_key_expansion.S
+++ b/src/common/broadwell/vaes256_key_expansion.S
@@ -0,0 +1,122 @@
+#***************************************************************************
+# This implementation is a modified version of the code,
+# written by Nir Drucker and Shay Gueron
+# AWS Cryptographic Algorithms Group
+# (ndrucker@amazon.com, gueron@amazon.com)
+#
+# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#  
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#  
+#     http://www.apache.org/licenses/LICENSE-2.0
+#  
+# or in the "license" file accompanying this file. This file is distributed 
+# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 
+# express or implied. See the License for the specific language governing 
+# permissions and limitations under the License.
+# The license is detailed in the file LICENSE.txt, and applies to this file.
+#***************************************************************************
+
+.intel_syntax noprefix
+.data
+
+.p2align 4, 0x90
+MASK1:
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+CON1:
+.long 1,1,1,1
+
+.set k256_size, 32
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+.text
+
+################################################################################
+# void aes256_key_expansion(OUT aes256_ks_t* ks, IN const uint8_t* key);
+# The output parameter must be 16 bytes aligned!
+#
+#Linux ABI
+#define out rdi
+#define in  rsi
+
+#define CON      xmm0
+#define MASK_REG xmm1
+
+#define IN0      xmm2
+#define IN1      xmm3
+
+#define TMP1     xmm4
+#define TMP2     xmm5
+
+#define ZERO     xmm15
+
+.macro ROUND1 in0 in1
+    add         out,   k256_size           # move on to the next 32-byte slot of the schedule
+    vpshufb     TMP2,  \in1, MASK_REG      # every dword = top dword of the second operand, rotated by one byte
+    aesenclast  TMP2,  CON                 # SubBytes (ShiftRows is a no-op on four equal dwords), then xor CON
+    vpslld      CON,   CON,  1             # double the round constant for the next use
+    vpslldq     TMP1,  \in0, 4
+    vpxor       \in0,  \in0, TMP1
+    vpslldq     TMP1,  TMP1, 4
+    vpxor       \in0,  \in0, TMP1
+    vpslldq     TMP1,  TMP1, 4
+    vpxor       \in0,  \in0, TMP1          # dword i now holds the xor of the original dwords 0..i
+    vpxor       \in0,  \in0, TMP2
+    vmovdqa     [out], \in0                # aligned store into the first half of the slot
+
+.endm
+
+.macro ROUND2
+   vpshufd     TMP2,     IN0,  0xff       # broadcast the top dword of IN0 to all four dwords
+   aesenclast  TMP2,     ZERO             # SubBytes only: the round key is zero and ShiftRows is a no-op here
+   vpslldq     TMP1,     IN1,  4
+   vpxor       IN1,      IN1,  TMP1
+   vpslldq     TMP1,     TMP1, 4
+   vpxor       IN1,      IN1,  TMP1
+   vpslldq     TMP1,     TMP1, 4
+   vpxor       IN1,      IN1, TMP1        # dword i now holds the xor of the original dwords 0..i
+   vpxor       IN1,      IN1, TMP2
+   vmovdqa     [out+16], IN1              # aligned store into the second half of the current slot
+.endm
+
+#ifdef __APPLE__
+#define AES256_KEY_EXPANSION _aes256_key_expansion
+#else
+#define AES256_KEY_EXPANSION aes256_key_expansion
+#endif
+
+#ifndef __APPLE__
+.type   AES256_KEY_EXPANSION,@function
+.hidden AES256_KEY_EXPANSION
+#endif
+.globl  AES256_KEY_EXPANSION
+AES256_KEY_EXPANSION:
+   vmovdqu IN0,      [in]          # unaligned loads: only the output has an alignment requirement
+   vmovdqu IN1,      [in+16]
+   vmovdqa [out],    IN0           # the first two round keys are the raw key halves
+   vmovdqa [out+16], IN1           # aligned stores, hence the 16-byte alignment rule for the output
+
+   vmovdqa CON,      [rip+CON1]
+   vmovdqa MASK_REG, [rip+MASK1]
+
+   vpxor   ZERO, ZERO, ZERO
+
+   mov     ax, 6                   # six passes, each storing two 16-byte round keys
+.loop256:
+
+   ROUND1  IN0, IN1
+   dec     ax                      # sets ZF for the jne below; ROUND2 contains only SIMD instructions, which leave the flags alone
+   ROUND2
+   jne     .loop256
+
+   ROUND1  IN0, IN1                # one more store: 2 + 6*2 + 1 = 15 round keys in total
+
+   ret
+#ifndef __APPLE__
+.size AES256_KEY_EXPANSION, .-AES256_KEY_EXPANSION
+#endif
+
--- a/src/common/generic/CMakeLists.txt
+++ b/src/common/generic/CMakeLists.txt
@@ -1,26 +1,15 @@
-set(SOURCE_FILES_COMMON_SYS 
+set(SOURCE_FILES_COMMON_GENERIC
    randombytes_system.c 
-    aes_c.c 
    fips202.c 
    mem.c
+    tools.c
 )

-add_library(sqisign_common_sys ${SOURCE_FILES_COMMON_SYS})
-target_include_directories(sqisign_common_sys PRIVATE include ../../include)
-target_compile_options(sqisign_common_sys PUBLIC ${C_OPT_FLAGS})
-
-set(SOURCE_FILES_COMMON_TEST 
-    randombytes_ctrdrbg.c 
-    aes_c.c 
-    fips202.c 
-    mem.c
-)
-
-add_library(sqisign_common_test ${SOURCE_FILES_COMMON_TEST})
-target_include_directories(sqisign_common_test PRIVATE include ../include)
-target_compile_options(sqisign_common_test PUBLIC ${C_OPT_FLAGS})
-
-if (ENABLE_CT_TESTING)
-    target_compile_definitions(sqisign_common_sys PUBLIC ENABLE_CT_TESTING)
-    target_compile_definitions(sqisign_common_test PUBLIC ENABLE_CT_TESTING)
-endif()
+foreach(common_target IN ITEMS sqisign_common_test sqisign_common_sys) # both libraries are built from the same source list
+    add_library("${common_target}" STATIC ${SOURCE_FILES_COMMON_GENERIC})
+    target_include_directories("${common_target}" PRIVATE include ${INC_PUBLIC})
+    target_compile_options("${common_target}" PUBLIC ${C_OPT_FLAGS})
+    if(ENABLE_CT_TESTING)
+        target_compile_definitions("${common_target}" PUBLIC ENABLE_CT_TESTING)
+    endif()
+endforeach()
--- a/src/common/generic/fips202.c
+++ b/src/common/generic/fips202.c
@@ -13,167 +13,7 @@
 #include <stdlib.h>
 #include <string.h>

-#include <stddef.h>
-#include <stdint.h>
-
-#define SHAKE128_RATE 168
-#define SHAKE256_RATE 136
-#define SHA3_256_RATE 136
-#define SHA3_384_RATE 104
-#define SHA3_512_RATE 72
-
-#define PQC_SHAKEINCCTX_BYTES (sizeof(uint64_t)*26)
-#define PQC_SHAKECTX_BYTES (sizeof(uint64_t)*25)
-
-// Context for incremental API
-typedef struct {
-    uint64_t *ctx;
-} shake128incctx;
-
-// Context for non-incremental API
-typedef struct {
-    uint64_t *ctx;
-} shake128ctx;
-
-// Context for incremental API
-typedef struct {
-    uint64_t *ctx;
-} shake256incctx;
-
-// Context for non-incremental API
-typedef struct {
-    uint64_t *ctx;
-} shake256ctx;
-
-// Context for incremental API
-typedef struct {
-    uint64_t *ctx;
-} sha3_256incctx;
-
-// Context for incremental API
-typedef struct {
-    uint64_t *ctx;
-} sha3_384incctx;
-
-// Context for incremental API
-typedef struct {
-    uint64_t *ctx;
-} sha3_512incctx;
-
-/* Initialize the state and absorb the provided input.
- *
- * This function does not support being called multiple times
- * with the same state.
- */
-void shake128_absorb(shake128ctx *state, const uint8_t *input, size_t inlen);
-/* Squeeze output out of the sponge.
- *
- * Supports being called multiple times
- */
-void shake128_squeezeblocks(uint8_t *output, size_t nblocks, shake128ctx *state);
-/* Free the state */
-void shake128_ctx_release(shake128ctx *state);
-/* Copy the state. */
-void shake128_ctx_clone(shake128ctx *dest, const shake128ctx *src);
-
-/* Initialize incremental hashing API */
-void shake128_inc_init(shake128incctx *state);
-/* Absorb more information into the XOF.
- *
- * Can be called multiple times.
- */
-void shake128_inc_absorb(shake128incctx *state, const uint8_t *input, size_t inlen);
-/* Finalize the XOF for squeezing */
-void shake128_inc_finalize(shake128incctx *state);
-/* Squeeze output out of the sponge.
- *
- * Supports being called multiple times
- */
-void shake128_inc_squeeze(uint8_t *output, size_t outlen, shake128incctx *state);
-/* Copy the context of the SHAKE128 XOF */
-void shake128_inc_ctx_clone(shake128incctx *dest, const shake128incctx *src);
-/* Free the context of the SHAKE128 XOF */
-void shake128_inc_ctx_release(shake128incctx *state);
-
-/* Initialize the state and absorb the provided input.
- *
- * This function does not support being called multiple times
- * with the same state.
- */
-void shake256_absorb(shake256ctx *state, const uint8_t *input, size_t inlen);
-/* Squeeze output out of the sponge.
- *
- * Supports being called multiple times
- */
-void shake256_squeezeblocks(uint8_t *output, size_t nblocks, shake256ctx *state);
-/* Free the context held by this XOF */
-void shake256_ctx_release(shake256ctx *state);
-/* Copy the context held by this XOF */
-void shake256_ctx_clone(shake256ctx *dest, const shake256ctx *src);
-
-/* Initialize incremental hashing API */
-void shake256_inc_init(shake256incctx *state);
-void shake256_inc_absorb(shake256incctx *state, const uint8_t *input, size_t inlen);
-/* Prepares for squeeze phase */
-void shake256_inc_finalize(shake256incctx *state);
-/* Squeeze output out of the sponge.
- *
- * Supports being called multiple times
- */
-void shake256_inc_squeeze(uint8_t *output, size_t outlen, shake256incctx *state);
-/* Copy the state */
-void shake256_inc_ctx_clone(shake256incctx *dest, const shake256incctx *src);
-/* Free the state */
-void shake256_inc_ctx_release(shake256incctx *state);
-
-/* One-stop SHAKE128 call */
-void shake128(uint8_t *output, size_t outlen,
-              const uint8_t *input, size_t inlen);
-
-/* One-stop SHAKE256 call */
-void shake256(uint8_t *output, size_t outlen,
-              const uint8_t *input, size_t inlen);
-
-/* Initialize the incremental hashing state */
-void sha3_256_inc_init(sha3_256incctx *state);
-/* Absorb blocks into SHA3 */
-void sha3_256_inc_absorb(sha3_256incctx *state, const uint8_t *input, size_t inlen);
-/* Obtain the output of the function and free `state` */
-void sha3_256_inc_finalize(uint8_t *output, sha3_256incctx *state);
-/* Copy the context */
-void sha3_256_inc_ctx_clone(sha3_256incctx *dest, const sha3_256incctx *src);
-/* Release the state, don't use if `_finalize` has been used */
-void sha3_256_inc_ctx_release(sha3_256incctx *state);
-
-void sha3_256(uint8_t *output, const uint8_t *input, size_t inlen);
-
-/* Initialize the incremental hashing state */
-void sha3_384_inc_init(sha3_384incctx *state);
-/* Absorb blocks into SHA3 */
-void sha3_384_inc_absorb(sha3_384incctx *state, const uint8_t *input, size_t inlen);
-/* Obtain the output of the function and free `state` */
-void sha3_384_inc_finalize(uint8_t *output, sha3_384incctx *state);
-/* Copy the context */
-void sha3_384_inc_ctx_clone(sha3_384incctx *dest, const sha3_384incctx *src);
-/* Release the state, don't use if `_finalize` has been used */
-void sha3_384_inc_ctx_release(sha3_384incctx *state);
-
-/* One-stop SHA3-384 shop */
-void sha3_384(uint8_t *output, const uint8_t *input, size_t inlen);
-
-/* Initialize the incremental hashing state */
-void sha3_512_inc_init(sha3_512incctx *state);
-/* Absorb blocks into SHA3 */
-void sha3_512_inc_absorb(sha3_512incctx *state, const uint8_t *input, size_t inlen);
-/* Obtain the output of the function and free `state` */
-void sha3_512_inc_finalize(uint8_t *output, sha3_512incctx *state);
-/* Copy the context */
-void sha3_512_inc_ctx_clone(sha3_512incctx *dest, const sha3_512incctx *src);
-/* Release the state, don't use if `_finalize` has been used */
-void sha3_512_inc_ctx_release(sha3_512incctx *state);
-
-/* One-stop SHA3-512 shop */
-void sha3_512(uint8_t *output, const uint8_t *input, size_t inlen);
+#include "fips202.h"

 #define NROUNDS 24
 #define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64 - (offset))))
@@ -686,10 +526,6 @@ static void keccak_inc_squeeze(uint8_t *h, size_t outlen,
 }

 void shake128_inc_init(shake128incctx *state) {
-    state->ctx = malloc(PQC_SHAKEINCCTX_BYTES);
-    if (state->ctx == NULL) {
-        exit(111);
-    }
    keccak_inc_init(state->ctx);
 }

@@ -706,22 +542,14 @@ void shake128_inc_squeeze(uint8_t *output, size_t outlen, shake128incctx *state)
 }

 void shake128_inc_ctx_clone(shake128incctx *dest, const shake128incctx *src) {
-    dest->ctx = malloc(PQC_SHAKEINCCTX_BYTES);
-    if (dest->ctx == NULL) {
-        exit(111);
-    }
    memcpy(dest->ctx, src->ctx, PQC_SHAKEINCCTX_BYTES);
 }

 void shake128_inc_ctx_release(shake128incctx *state) {
-    free(state->ctx);
+    (void)state;
 }

 void shake256_inc_init(shake256incctx *state) {
-    state->ctx = malloc(PQC_SHAKEINCCTX_BYTES);
-    if (state->ctx == NULL) {
-        exit(111);
-    }
    keccak_inc_init(state->ctx);
 }

@@ -738,15 +566,11 @@ void shake256_inc_squeeze(uint8_t *output, size_t outlen, shake256incctx *state)
 }

 void shake256_inc_ctx_clone(shake256incctx *dest, const shake256incctx *src) {
-    dest->ctx = malloc(PQC_SHAKEINCCTX_BYTES);
-    if (dest->ctx == NULL) {
-        exit(111);
-    }
    memcpy(dest->ctx, src->ctx, PQC_SHAKEINCCTX_BYTES);
 }

 void shake256_inc_ctx_release(shake256incctx *state) {
-    free(state->ctx);
+    (void)state;
 }


@@ -762,10 +586,6 @@ void shake256_inc_ctx_release(shake256incctx *state) {
 *              - size_t inlen: length of input in bytes
 **************************************************/
 void shake128_absorb(shake128ctx *state, const uint8_t *input, size_t inlen) {
-    state->ctx = malloc(PQC_SHAKECTX_BYTES);
-    if (state->ctx == NULL) {
-        exit(111);
-    }
    keccak_absorb(state->ctx, SHAKE128_RATE, input, inlen, 0x1F);
 }

@@ -786,16 +606,12 @@ void shake128_squeezeblocks(uint8_t *output, size_t nblocks, shake128ctx *state)
 }

 void shake128_ctx_clone(shake128ctx *dest, const shake128ctx *src) {
-    dest->ctx = malloc(PQC_SHAKECTX_BYTES);
-    if (dest->ctx == NULL) {
-        exit(111);
-    }
    memcpy(dest->ctx, src->ctx, PQC_SHAKECTX_BYTES);
 }

 /** Release the allocated state. Call only once. */
 void shake128_ctx_release(shake128ctx *state) {
-    free(state->ctx);
+    (void)state;
 }

 /*************************************************
@@ -810,10 +626,6 @@ void shake128_ctx_release(shake128ctx *state) {
 *              - size_t inlen: length of input in bytes
 **************************************************/
 void shake256_absorb(shake256ctx *state, const uint8_t *input, size_t inlen) {
-    state->ctx = malloc(PQC_SHAKECTX_BYTES);
-    if (state->ctx == NULL) {
-        exit(111);
-    }
    keccak_absorb(state->ctx, SHAKE256_RATE, input, inlen, 0x1F);
 }

@@ -834,16 +646,12 @@ void shake256_squeezeblocks(uint8_t *output, size_t nblocks, shake256ctx *state)
 }

 void shake256_ctx_clone(shake256ctx *dest, const shake256ctx *src) {
-    dest->ctx = malloc(PQC_SHAKECTX_BYTES);
-    if (dest->ctx == NULL) {
-        exit(111);
-    }
    memcpy(dest->ctx, src->ctx, PQC_SHAKECTX_BYTES);
 }

 /** Release the allocated state. Call only once. */
 void shake256_ctx_release(shake256ctx *state) {
-    free(state->ctx);
+    (void)state;
 }

 /*************************************************
@@ -909,23 +717,15 @@ void shake256(uint8_t *output, size_t outlen,
 }

 void sha3_256_inc_init(sha3_256incctx *state) {
-    state->ctx = malloc(PQC_SHAKEINCCTX_BYTES);
-    if (state->ctx == NULL) {
-        exit(111);
-    }
    keccak_inc_init(state->ctx);
 }

 void sha3_256_inc_ctx_clone(sha3_256incctx *dest, const sha3_256incctx *src) {
-    dest->ctx = malloc(PQC_SHAKEINCCTX_BYTES);
-    if (dest->ctx == NULL) {
-        exit(111);
-    }
    memcpy(dest->ctx, src->ctx, PQC_SHAKEINCCTX_BYTES);
 }

 void sha3_256_inc_ctx_release(sha3_256incctx *state) {
-    free(state->ctx);
+    (void)state;
 }

 void sha3_256_inc_absorb(sha3_256incctx *state, const uint8_t *input, size_t inlen) {
@@ -970,18 +770,10 @@ void sha3_256(uint8_t *output, const uint8_t *input, size_t inlen) {
 }

 void sha3_384_inc_init(sha3_384incctx *state) {
-    state->ctx = malloc(PQC_SHAKEINCCTX_BYTES);
-    if (state->ctx == NULL) {
-        exit(111);
-    }
    keccak_inc_init(state->ctx);
 }

 void sha3_384_inc_ctx_clone(sha3_384incctx *dest, const sha3_384incctx *src) {
-    dest->ctx = malloc(PQC_SHAKEINCCTX_BYTES);
-    if (dest->ctx == NULL) {
-        exit(111);
-    }
    memcpy(dest->ctx, src->ctx, PQC_SHAKEINCCTX_BYTES);
 }

@@ -990,7 +782,7 @@ void sha3_384_inc_absorb(sha3_384incctx *state, const uint8_t *input, size_t inl
 }

 void sha3_384_inc_ctx_release(sha3_384incctx *state) {
-    free(state->ctx);
+    (void)state;
 }

 void sha3_384_inc_finalize(uint8_t *output, sha3_384incctx *state) {
@@ -1031,18 +823,10 @@ void sha3_384(uint8_t *output, const uint8_t *input, size_t inlen) {
 }

 void sha3_512_inc_init(sha3_512incctx *state) {
-    state->ctx = malloc(PQC_SHAKEINCCTX_BYTES);
-    if (state->ctx == NULL) {
-        exit(111);
-    }
    keccak_inc_init(state->ctx);
 }

 void sha3_512_inc_ctx_clone(sha3_512incctx *dest, const sha3_512incctx *src) {
-    dest->ctx = malloc(PQC_SHAKEINCCTX_BYTES);
-    if (dest->ctx == NULL) {
-        exit(111);
-    }
    memcpy(dest->ctx, src->ctx, PQC_SHAKEINCCTX_BYTES);
 }

@@ -1051,7 +835,7 @@ void sha3_512_inc_absorb(sha3_512incctx *state, const uint8_t *input, size_t inl
 }

 void sha3_512_inc_ctx_release(sha3_512incctx *state) {
-    free(state->ctx);
+    (void)state;
 }

 void sha3_512_inc_finalize(uint8_t *output, sha3_512incctx *state) {
@@ -1090,13 +874,3 @@ void sha3_512(uint8_t *output, const uint8_t *input, size_t inlen) {
        output[i] = t[i];
    }
 }
-
-int SHAKE128(unsigned char *output, size_t outputByteLen, const unsigned char *input, size_t inputByteLen) {
-    shake128(output, outputByteLen, input, inputByteLen);
-    return 0;
-}
-
-int SHAKE256(unsigned char *output, size_t outputByteLen, const unsigned char *input, size_t inputByteLen) {
-    shake256(output, outputByteLen, input, inputByteLen);
-    return 0;
-}
--- a/src/common/generic/include/aes.h
+++ b/src/common/generic/include/aes.h
@@ -1,23 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-
-#ifndef AES_H
-#define AES_H
-
-#include <stddef.h>
-#include <stdint.h>
-
-void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output);
-#define AES_ECB_encrypt AES_256_ECB
-
-#ifdef ENABLE_AESNI
-int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen,
-                   const unsigned char *input, size_t inputByteLen);
-int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen,
-                      const unsigned char *input, size_t inputByteLen);
-#define AES_128_CTR AES_128_CTR_NI
-#else
-int AES_128_CTR(unsigned char *output, size_t outputByteLen,
-                const unsigned char *input, size_t inputByteLen);
-#endif
-
-#endif
--- a/src/common/generic/include/bench.h
+++ b/src/common/generic/include/bench.h
@@ -1,63 +1,126 @@
 // SPDX-License-Identifier: Apache-2.0
+#ifndef BENCH_H__
+#define BENCH_H__

 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
 #include <inttypes.h>
-
-
-#if defined(TARGET_OS_UNIX) && (defined(TARGET_ARM) || defined(TARGET_ARM64) || defined(TARGET_OTHER))
 #include <time.h>
+#if defined(__APPLE__)
+#include "bench_macos.h"
 #endif
-#if (defined(TARGET_ARM) || defined(TARGET_ARM64) || defined(TARGET_S390X) || defined(TARGET_OTHER))
-#define print_bench_unit printf("nsec\n");
+
+#if defined(TARGET_ARM) || defined(TARGET_S390X) || defined(NO_CYCLE_COUNTER)
+#define BENCH_UNIT0 "nanoseconds"
+#define BENCH_UNIT3 "microseconds"
+#define BENCH_UNIT6 "milliseconds"
+#define BENCH_UNIT9 "seconds"
 #else
-#define print_bench_unit printf("cycles\n");
+#define BENCH_UNIT0 "cycles"
+#define BENCH_UNIT3 "kilocycles"
+#define BENCH_UNIT6 "megacycles"
+#define BENCH_UNIT9 "gigacycles"
 #endif

-#if (defined(TARGET_ARM) || defined(TARGET_ARM64) || defined(TARGET_S390X))
-#define BENCH_UNITS "nsec"
-#else
-#define BENCH_UNITS "cycles"
+static inline void
+cpucycles_init(void) {
+#if defined(__APPLE__) && defined(TARGET_ARM64)
+    macos_init_rdtsc();
 #endif
+}

-static inline int64_t cpucycles(void) {
-#if (defined(TARGET_AMD64) || defined(TARGET_X86))
-    unsigned int hi, lo;
+static inline uint64_t
+cpucycles(void)
+{
+#if defined(TARGET_AMD64) || defined(TARGET_X86)
+    uint32_t hi, lo;

-    asm volatile ("rdtsc" : "=a" (lo), "=d"(hi));
-    return ((int64_t) lo) | (((int64_t) hi) << 32);
-#elif (defined(TARGET_S390X))
+    asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
+    return ((uint64_t)lo) | ((uint64_t)hi << 32);
+#elif defined(TARGET_S390X)
    uint64_t tod;
-    asm volatile("stckf %0\n" : "=Q" (tod) : : "cc");
+    asm volatile("stckf %0\n" : "=Q"(tod) : : "cc");
    return (tod * 1000 / 4096);
+#elif defined(TARGET_ARM64) && !defined(NO_CYCLE_COUNTER)
+#if defined(__APPLE__)
+    return macos_rdtsc();
+#else
+    uint64_t cycles;
+    asm volatile("mrs %0, PMCCNTR_EL0" : "=r"(cycles));
+    return cycles;
+#endif // __APPLE__
 #else
    struct timespec time;
    clock_gettime(CLOCK_REALTIME, &time);
-    return (int64_t)(time.tv_sec * 1e9 + time.tv_nsec);
+    return (uint64_t)time.tv_sec * 1000000000 + time.tv_nsec;
 #endif
 }

-static inline int cmpfunc (const void *a, const void *b) {
-    return ( *(uint64_t *)a - * (uint64_t *)b );
+static inline int
+CMPFUNC(const void *a, const void *b)
+{
+    uint64_t aa = *(uint64_t *)a, bb = *(uint64_t *)b;
+
+    if (aa > bb)
+        return +1;
+    if (aa < bb)
+        return -1;
+    return 0;
 }

-#define BENCH_CODE_1(r) \
-    cycles = 0; \
-    for (i = 0; i < (r); ++i) { \
-        cycles1 = cpucycles();
-
-#define BENCH_CODE_2(name, csv) \
-        cycles2 = cpucycles(); \
-        if(i < LIST_SIZE) \
-          cycles_list[i] = (cycles2 - cycles1);\
-        cycles = cycles + (cycles2 - cycles1); \
-    } \
-    qsort(cycles_list, (runs < LIST_SIZE)? runs : LIST_SIZE, sizeof(uint64_t), cmpfunc);\
-    if (csv) \
-      printf("%2" PRId64 ",", cycles_list[(runs < LIST_SIZE)? runs/2 : LIST_SIZE/2]); \
-    else { \
-      printf("  %-20s-> median: %2" PRId64 ", average: %2" PRId64 " ", name, \
-      cycles_list[(runs < LIST_SIZE)? runs/2 : LIST_SIZE/2], (cycles / runs)); \
-      printf("%s\n", BENCH_UNITS); \
+static inline uint32_t
+ISQRT(uint64_t x) // returns floor(sqrt(x)), deciding one result bit per pass from bit 31 down
+{
+    uint32_t r = 0;
+    for (int i = 31; i >= 0; --i) { // plain int: ssize_t is POSIX-only and not declared by this header
+        uint32_t s = r + ((uint32_t)1 << i); // unsigned shift: 1 << 31 would overflow a signed int
+        if ((uint64_t)s * s <= x)
+            r = s;
    }
+    return r;
+}
+
+static inline double
+_TRUNC(uint64_t x)
+{
+    return x / 1000 / 1000.;
+}
+#define _FMT ".3lf"
+#define _UNIT BENCH_UNIT6
+
+#define BENCH_CODE_1(RUNS)                                                                         \
+    {                                                                                              \
+        const size_t count = (RUNS);                                                               \
+        if (!count)                                                                                \
+            abort();                                                                               \
+        uint64_t cycles, cycles1, cycles2;                                                         \
+        uint64_t cycles_list[count];                                                               \
+        cycles = 0;                                                                                \
+        for (size_t i = 0; i < count; ++i) {                                                       \
+            cycles1 = cpucycles();
+
+#define BENCH_CODE_2(name)                                                                         \
+    cycles2 = cpucycles();                                                                         \
+    cycles_list[i] = cycles2 - cycles1;                                                            \
+    cycles += cycles2 - cycles1;                                                                   \
+    }                                                                                              \
+    qsort(cycles_list, count, sizeof(uint64_t), CMPFUNC);                                          \
+    uint64_t variance = 0;                                                                         \
+    for (size_t i = 0; i < count; ++i) {                                                           \
+        int64_t off = cycles_list[i] - cycles / count;                                             \
+        variance += off * off;                                                                     \
+    }                                                                                              \
+    variance /= count;                                                                             \
+    printf("  %-10s", name);                                                                       \
+    printf(" | average %9" _FMT " | stddev %9" _FMT,                                               \
+           _TRUNC(cycles / count),                                                                 \
+           _TRUNC(ISQRT(variance)));                                                               \
+    printf(" | median %9" _FMT " | min %9" _FMT " | max %9" _FMT,                                  \
+           _TRUNC(cycles_list[count / 2]),                                                         \
+           _TRUNC(cycles_list[0]),                                                                 \
+           _TRUNC(cycles_list[count - 1]));                                                        \
+    printf("  (%s)\n", _UNIT);                                                                     \
+    }
+
+#endif
--- a/src/common/generic/include/bench_macos.h
+++ b/src/common/generic/include/bench_macos.h
@@ -0,0 +1,143 @@
+// WARNING: must be run as root on an M1 device
+// WARNING: fragile, uses private apple APIs
+// currently no command line interface, see variables at top of main
+
+/*
+no warranty; use at your own risk - i believe this code needs
+some minor changes to work on some later hardware and/or software revisions,
+which is unsurprising given the use of undocumented, private APIs.
+------------------------------------------------------------------------------
+This code is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2020 Dougall Johnson
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
+
+/*
+  Based on https://github.com/travisdowns/robsize
+  Henry Wong <henry@stuffedcow.net>
+  http://blog.stuffedcow.net/2013/05/measuring-rob-capacity/
+  2014-10-14
+*/
+
+#include <dlfcn.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define KPERF_LIST                                                                                 \
+    /* one F(ret, name, params...) row per kperf function used by this file */                     \
+    F(int, kpc_force_all_ctrs_set, int)                                                            \
+    F(int, kpc_set_counting, uint32_t)                                                             \
+    F(int, kpc_set_thread_counting, uint32_t)                                                      \
+    F(int, kpc_set_config, uint32_t, void *)                                                       \
+    F(int, kpc_get_thread_counters, int, unsigned int, void *)
+
+#define F(ret, name, ...)                                                                          \
+    typedef ret name##proc(__VA_ARGS__);                                                           \
+    static name##proc *name;
+KPERF_LIST // one typedef name##proc plus one static name##proc pointer per row
+#undef F
+
+#define CFGWORD_EL0A64EN_MASK (0x20000) // OR'ed into g_config[0] by macos_init_rdtsc
+
+#define CPMU_CORE_CYCLE 0x02 // event selector stored in g_config[0]; presumably core cycles
+
+#define KPC_CLASS_FIXED (0)
+#define KPC_CLASS_CONFIGURABLE (1)
+
+#define COUNTERS_COUNT 10 // capacity of g_config / g_counters
+#define KPC_MASK ((1u << KPC_CLASS_CONFIGURABLE) | (1u << KPC_CLASS_FIXED)) // both classes
+static uint64_t g_config[COUNTERS_COUNT];   // passed to kpc_set_config
+static uint64_t g_counters[COUNTERS_COUNT]; // filled by kpc_get_thread_counters
+
+static void
+macos_configure_rdtsc(void)
+{
+    // Apply g_config and start counting for the classes in KPC_MASK. A kperf
+    // call is treated as failed when it returns nonzero; the first failure is
+    // printed and the remaining steps are skipped.
+    if (kpc_force_all_ctrs_set(1)) {
+        printf("kpc_force_all_ctrs_set failed\n");
+        return;
+    }
+    if (kpc_set_config(KPC_MASK, g_config)) {
+        printf("kpc_set_config failed\n");
+        return;
+    }
+    if (kpc_set_counting(KPC_MASK)) {
+        printf("kpc_set_counting failed\n");
+        return;
+    }
+    if (kpc_set_thread_counting(KPC_MASK)) {
+        printf("kpc_set_thread_counting failed\n");
+        return;
+    }
+}
+
+static void
+macos_init_rdtsc(void)
+{
+    // Load kperf, resolve each KPERF_LIST symbol, then configure the counters; a failed step is
+    // printed and ends initialization early.
+    void *kperf =
+        dlopen("/System/Library/PrivateFrameworks/kperf.framework/Versions/A/kperf", RTLD_LAZY);
+    if (!kperf) {
+        printf("kperf = %p\n", kperf);
+        return;
+    }
+#define F(ret, name, ...)                                                                          \
+    name = (name##proc *)(intptr_t)(dlsym(kperf, #name));                                          \
+    if (!name) {                                                                                   \
+        printf("%s = %p\n", #name, (void *)(intptr_t)name);                                        \
+        return;                                                                                    \
+    }
+    KPERF_LIST
+#undef F
+    g_config[0] = CPMU_CORE_CYCLE | CFGWORD_EL0A64EN_MASK;
+    macos_configure_rdtsc();
+}
+
+static uint64_t
+macos_rdtsc(void)
+{
+    // Refresh g_counters and report slot 2; a nonzero kperf status yields the value 1.
+    if (kpc_get_thread_counters(0, COUNTERS_COUNT, g_counters) == 0)
+        return g_counters[2];
+    printf("kpc_get_thread_counters failed\n");
+    return 1;
+}
--- a/src/common/generic/include/bench_test_arguments.h
+++ b/src/common/generic/include/bench_test_arguments.h
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: Apache-2.0
+#ifndef BENCH_TEST_ARGUMENTS_H__
+#define BENCH_TEST_ARGUMENTS_H__
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdint.h>
+
+static int parse_seed(const char *arg, uint32_t *seed)
+{
+    if (sscanf(arg, "--seed=%" SCNu32, &seed[0]) == 1) // decimal form: writes seed[0] only
+        return 0;
+    // 12-word form, as printed by print_seed(). SCNx32 (not the printf-only PRIx32) is the
+    // <inttypes.h> scanf conversion for uint32_t. Returns 0 on success, 1 if neither form matches.
+    if (sscanf(arg, "--seed={ "
+        "0x%" SCNx32 ", 0x%" SCNx32 ", 0x%" SCNx32 ", 0x%" SCNx32 ", 0x%" SCNx32 ", 0x%" SCNx32 ", "
+        "0x%" SCNx32 ", 0x%" SCNx32 ", 0x%" SCNx32 ", 0x%" SCNx32 ", 0x%" SCNx32 ", 0x%" SCNx32 " }",
+        &seed[0], &seed[1], &seed[2], &seed[3], &seed[4], &seed[5],
+        &seed[6], &seed[7], &seed[8], &seed[9], &seed[10], &seed[11]) == 12)
+        return 0;
+    return 1;
+}
+
+// Print all 12 seed words in the "--seed={ ... }" form accepted by parse_seed().
+static void print_seed(const uint32_t *seed)
+{
+    printf("Random seed: \"--seed={ ");
+    for (int i = 0; i < 12; i++)
+        printf("0x%08" PRIx32 "%s", seed[i], (i < 11) ? ", " : " }\"\n");
+}
+
+#endif
--- a/src/common/generic/include/fips202.h
+++ b/src/common/generic/include/fips202.h
@@ -4,8 +4,168 @@
 #define FIPS202_H

 #include <stddef.h>
+#include <stdint.h>

-int SHAKE128(unsigned char *output, size_t outputByteLen, const unsigned char *input, size_t inputByteLen);
-int SHAKE256(unsigned char *output, size_t outputByteLen, const unsigned char *input, size_t inputByteLen);
+#define SHAKE128_RATE 168
+#define SHAKE256_RATE 136
+#define SHA3_256_RATE 136
+#define SHA3_384_RATE 104
+#define SHA3_512_RATE 72
+
+#define PQC_SHAKEINCCTX_U64WORDS 26
+#define PQC_SHAKECTX_U64WORDS 25
+// Context sizes in bytes, derived from the word counts above so the two cannot drift apart.
+#define PQC_SHAKEINCCTX_BYTES (sizeof(uint64_t) * PQC_SHAKEINCCTX_U64WORDS)
+#define PQC_SHAKECTX_BYTES (sizeof(uint64_t) * PQC_SHAKECTX_U64WORDS)
+
+// SHAKE128 context for the incremental (shake128_inc_*) API
+typedef struct {
+    uint64_t ctx[PQC_SHAKEINCCTX_U64WORDS];
+} shake128incctx;
+
+// SHAKE128 context for the non-incremental (shake128_absorb / shake128_squeezeblocks) API
+typedef struct {
+    uint64_t ctx[PQC_SHAKECTX_U64WORDS];
+} shake128ctx;
+
+// SHAKE256 context for the incremental (shake256_inc_*) API
+typedef struct {
+    uint64_t ctx[PQC_SHAKEINCCTX_U64WORDS];
+} shake256incctx;
+
+// SHAKE256 context for the non-incremental (shake256_absorb / shake256_squeezeblocks) API
+typedef struct {
+    uint64_t ctx[PQC_SHAKECTX_U64WORDS];
+} shake256ctx;
+
+// SHA3-256 context for the incremental (sha3_256_inc_*) API
+typedef struct {
+    uint64_t ctx[PQC_SHAKEINCCTX_U64WORDS];
+} sha3_256incctx;
+
+// SHA3-384 context for the incremental (sha3_384_inc_*) API
+typedef struct {
+    uint64_t ctx[PQC_SHAKEINCCTX_U64WORDS];
+} sha3_384incctx;
+
+// SHA3-512 context for the incremental (sha3_512_inc_*) API
+typedef struct {
+    uint64_t ctx[PQC_SHAKEINCCTX_U64WORDS];
+} sha3_512incctx;
+
+/* Initialize the state and absorb the provided input.
+ *
+ * This function does not support being called multiple times
+ * with the same state.
+ */
+void shake128_absorb(shake128ctx *state, const uint8_t *input, size_t inlen);
+/* Squeeze output out of the sponge.
+ *
+ * Supports being called multiple times
+ */
+void shake128_squeezeblocks(uint8_t *output, size_t nblocks, shake128ctx *state);
+/* Free the state */
+void shake128_ctx_release(shake128ctx *state);
+/* Copy the state. */
+void shake128_ctx_clone(shake128ctx *dest, const shake128ctx *src);
+
+/* Initialize incremental hashing API */
+void shake128_inc_init(shake128incctx *state);
+/* Absorb more information into the XOF.
+ *
+ * Can be called multiple times.
+ */
+void shake128_inc_absorb(shake128incctx *state, const uint8_t *input, size_t inlen);
+/* Finalize the XOF for squeezing */
+void shake128_inc_finalize(shake128incctx *state);
+/* Squeeze output out of the sponge.
+ *
+ * Supports being called multiple times
+ */
+void shake128_inc_squeeze(uint8_t *output, size_t outlen, shake128incctx *state);
+/* Copy the context of the SHAKE128 XOF */
+void shake128_inc_ctx_clone(shake128incctx *dest, const shake128incctx *src);
+/* Free the context of the SHAKE128 XOF */
+void shake128_inc_ctx_release(shake128incctx *state);
+
+/* Initialize the state and absorb the provided input.
+ *
+ * This function does not support being called multiple times
+ * with the same state.
+ */
+void shake256_absorb(shake256ctx *state, const uint8_t *input, size_t inlen);
+/* Squeeze output out of the sponge.
+ *
+ * Supports being called multiple times
+ */
+void shake256_squeezeblocks(uint8_t *output, size_t nblocks, shake256ctx *state);
+/* Free the context held by this XOF */
+void shake256_ctx_release(shake256ctx *state);
+/* Copy the context held by this XOF */
+void shake256_ctx_clone(shake256ctx *dest, const shake256ctx *src);
+
+/* Initialize incremental hashing API */
+void shake256_inc_init(shake256incctx *state);
+void shake256_inc_absorb(shake256incctx *state, const uint8_t *input, size_t inlen);
+/* Prepares for squeeze phase */
+void shake256_inc_finalize(shake256incctx *state);
+/* Squeeze output out of the sponge.
+ *
+ * Supports being called multiple times
+ */
+void shake256_inc_squeeze(uint8_t *output, size_t outlen, shake256incctx *state);
+/* Copy the state */
+void shake256_inc_ctx_clone(shake256incctx *dest, const shake256incctx *src);
+/* Free the state */
+void shake256_inc_ctx_release(shake256incctx *state);
+
+/* One-stop SHAKE128 call */
+void shake128(uint8_t *output, size_t outlen,
+              const uint8_t *input, size_t inlen);
+
+/* One-stop SHAKE256 call */
+void shake256(uint8_t *output, size_t outlen,
+              const uint8_t *input, size_t inlen);
+
+/* Initialize the incremental hashing state */
+void sha3_256_inc_init(sha3_256incctx *state);
+/* Absorb blocks into SHA3 */
+void sha3_256_inc_absorb(sha3_256incctx *state, const uint8_t *input, size_t inlen);
+/* Obtain the output of the function and free `state` */
+void sha3_256_inc_finalize(uint8_t *output, sha3_256incctx *state);
+/* Copy the context */
+void sha3_256_inc_ctx_clone(sha3_256incctx *dest, const sha3_256incctx *src);
+/* Release the state, don't use if `_finalize` has been used */
+void sha3_256_inc_ctx_release(sha3_256incctx *state);
+
+void sha3_256(uint8_t *output, const uint8_t *input, size_t inlen);
+
+/* Initialize the incremental hashing state */
+void sha3_384_inc_init(sha3_384incctx *state);
+/* Absorb blocks into SHA3 */
+void sha3_384_inc_absorb(sha3_384incctx *state, const uint8_t *input, size_t inlen);
+/* Obtain the output of the function and free `state` */
+void sha3_384_inc_finalize(uint8_t *output, sha3_384incctx *state);
+/* Copy the context */
+void sha3_384_inc_ctx_clone(sha3_384incctx *dest, const sha3_384incctx *src);
+/* Release the state, don't use if `_finalize` has been used */
+void sha3_384_inc_ctx_release(sha3_384incctx *state);
+
+/* One-stop SHA3-384 shop */
+void sha3_384(uint8_t *output, const uint8_t *input, size_t inlen);
+
+/* Initialize the incremental hashing state */
+void sha3_512_inc_init(sha3_512incctx *state);
+/* Absorb blocks into SHA3 */
+void sha3_512_inc_absorb(sha3_512incctx *state, const uint8_t *input, size_t inlen);
+/* Obtain the output of the function and free `state` */
+void sha3_512_inc_finalize(uint8_t *output, sha3_512incctx *state);
+/* Copy the context */
+void sha3_512_inc_ctx_clone(sha3_512incctx *dest, const sha3_512incctx *src);
+/* Release the state, don't use if `_finalize` has been used */
+void sha3_512_inc_ctx_release(sha3_512incctx *state);
+
+/* One-stop SHA3-512 shop */
+void sha3_512(uint8_t *output, const uint8_t *input, size_t inlen);

 #endif
--- a/src/common/generic/include/tools.h
+++ b/src/common/generic/include/tools.h
@@ -0,0 +1,49 @@
+
+#ifndef TOOLS_H
+#define TOOLS_H
+
+#include <time.h>
+
+// Debug printing:
+// https://stackoverflow.com/questions/1644868/define-macro-for-debug-printing-in-c
+#ifndef NDEBUG
+#define DEBUG_PRINT 1
+#else
+#define DEBUG_PRINT 0
+#endif
+
+// __FILE_NAME__ is a compiler extension; substitute a placeholder where the
+// compiler does not provide it.
+#ifndef __FILE_NAME__
+#define __FILE_NAME__ "NA"
+#endif
+
+// No fallbacks for __LINE__ and __func__: __LINE__ is a standard predefined
+// macro, and __func__ is a predefined identifier rather than a macro, so an
+// `#ifndef __func__` guard is always taken and a `#define __func__ "NA"` under
+// it would shadow the real name, making debug_print report "NA" instead of
+// the enclosing function.
+
+// Print `msg` as a warning tagged with file, line and function. The printf is
+// guarded by DEBUG_PRINT, which is 0 when NDEBUG is defined.
+#define debug_print(msg)                                                                           \
+    do {                                                                                           \
+        if (DEBUG_PRINT) {                                                                         \
+            printf("warning: %s, file %s, line %d, function %s().\n",                              \
+                   msg, __FILE_NAME__, __LINE__, __func__);                                        \
+        }                                                                                          \
+    } while (0)
+
+
+clock_t tic(void);
+float tac(void);                             /* time in ms since last tic */
+float TAC(const char *str);                  /* same, but prints it with label 'str' */
+float toc(const clock_t t);                  /* time in ms since t */
+float TOC(const clock_t t, const char *str); /* same, but prints it with label 'str' */
+float TOC_clock(const clock_t t, const char *str);
+
+clock_t dclock(const clock_t t); // return the clock cycle diff between now and t
+float clock_to_time(const clock_t t,
+                    const char *str); // convert the number of clock cycles t to time
+float clock_print(const clock_t t, const char *str);
+#endif
--- a/src/common/generic/include/tutil.h
+++ b/src/common/generic/include/tutil.h
@@ -5,24 +5,27 @@
 #include <stdint.h>

 #if defined(__GNUC__) || defined(__clang__)
+#define BSWAP16(i) __builtin_bswap16((i))
 #define BSWAP32(i) __builtin_bswap32((i))
 #define BSWAP64(i) __builtin_bswap64((i))
+#define UNUSED __attribute__((unused))
 #else
-#define BSWAP32(i) ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24))
+#define BSWAP16(i) ((((i) >> 8) & 0xff) | (((i) & 0xff) << 8)) // high byte down, low byte up
+#define BSWAP32(i)                                                                                 \
+    ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24))
-#define BSWAP64(i) ((BSWAP32((i) >> 32) & 0xffffffff) | (BSWAP32(i) << 32)
+#define BSWAP64(i) ((BSWAP32((i) >> 32) & 0xffffffff) | (BSWAP32(i) << 32))
+#define UNUSED
 #endif

 #if defined(RADIX_64)
 #define digit_t uint64_t
 #define sdigit_t int64_t
-#define DIGIT_LEN 8
 #define RADIX 64
 #define LOG2RADIX 6
 #define BSWAP_DIGIT(i) BSWAP64(i)
 #elif defined(RADIX_32)
 #define digit_t uint32_t
 #define sdigit_t int32_t
-#define DIGIT_LEN 4
 #define RADIX 32
 #define LOG2RADIX 5
 #define BSWAP_DIGIT(i) BSWAP32(i)
--- a/src/common/generic/mem.c
+++ b/src/common/generic/mem.c
@@ -1,9 +1,12 @@
 // SPDX-License-Identifier: Apache-2.0

+#include <mem.h>
 #include <string.h>
 #include <stdlib.h>

-void sqisign_secure_free(void *mem, size_t size) {
+void
+sqisign_secure_free(void *mem, size_t size)
+{
    if (mem) {
        typedef void *(*memset_t)(void *, int, size_t);
        static volatile memset_t memset_func = memset;
@@ -11,7 +14,9 @@ void sqisign_secure_free(void *mem, size_t size) {
        free(mem);
    }
 }
-void sqisign_secure_clear(void *mem, size_t size) {
+void
+sqisign_secure_clear(void *mem, size_t size)
+{
    typedef void *(*memset_t)(void *, int, size_t);
    static volatile memset_t memset_func = memset;
    memset_func(mem, 0, size);
--- a/src/common/generic/randombytes_ctrdrbg.c
+++ b/src/common/generic/randombytes_ctrdrbg.c
@@ -1,140 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 and Unknown
-//
-/*
-NIST-developed software is provided by NIST as a public service. You may use, copy, and distribute copies of the software in any medium, provided that you keep intact this entire notice. You may improve, modify, and create derivative works of the software or any portion of the software, and you may copy and distribute such modifications or works. Modified works should carry a notice stating that you changed the software and should note the date and nature of any such change. Please explicitly acknowledge the National Institute of Standards and Technology as the source of the software.
-
-NIST-developed software is expressly provided "AS IS." NIST MAKES NO WARRANTY OF ANY KIND, EXPRESS, IMPLIED, IN FACT, OR ARISING BY OPERATION OF LAW, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND DATA ACCURACY. NIST NEITHER REPRESENTS NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST DOES NOT WARRANT OR MAKE ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE OR THE RESULTS THEREOF, INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY, RELIABILITY, OR USEFULNESS OF THE SOFTWARE.
-
-You are solely responsible for determining the appropriateness of using and distributing the software and you assume all risks associated with its use, including but not limited to the risks and costs of program errors, compliance with applicable laws, damage to or loss of data, programs or equipment, and the unavailability or interruption of operation. This software is not intended to be used in any situation where a failure could cause risk of injury or damage to property. The software developed by NIST employees is not subject to copyright protection within the United States.
-*/
-
-#include <string.h>
-
-#include <aes.h>
-
-#ifdef ENABLE_CT_TESTING
-#include <valgrind/memcheck.h>
-#endif
-
-#define RNG_SUCCESS      0
-#define RNG_BAD_MAXLEN  -1
-#define RNG_BAD_OUTBUF  -2
-#define RNG_BAD_REQ_LEN -3
-
-static __inline void AES256_ECB(unsigned char *key, unsigned char *ctr, unsigned char *buffer) {
-    AES_ECB_encrypt(ctr, key, buffer);
-}
-
-typedef struct {
-    unsigned char   buffer[16];
-    int             buffer_pos;
-    unsigned long   length_remaining;
-    unsigned char   key[32];
-    unsigned char   ctr[16];
-} AES_XOF_struct;
-
-typedef struct {
-    unsigned char   Key[32];
-    unsigned char   V[16];
-    int             reseed_counter;
-} AES256_CTR_DRBG_struct;
-
-
-void
-AES256_CTR_DRBG_Update(unsigned char *provided_data,
-                       unsigned char *Key,
-                       unsigned char *V);
-
-AES256_CTR_DRBG_struct  DRBG_ctx;
-
-static void
-randombytes_init_nist(unsigned char *entropy_input,
-                      unsigned char *personalization_string,
-                      int security_strength) {
-    unsigned char   seed_material[48];
-
-    (void)security_strength;  // Unused parameter
-    memcpy(seed_material, entropy_input, 48);
-    if (personalization_string)
-        for (int i = 0; i < 48; i++) {
-            seed_material[i] ^= personalization_string[i];
-        }
-    memset(DRBG_ctx.Key, 0x00, 32);
-    memset(DRBG_ctx.V, 0x00, 16);
-    AES256_CTR_DRBG_Update(seed_material, DRBG_ctx.Key, DRBG_ctx.V);
-    DRBG_ctx.reseed_counter = 1;
-}
-
-static int
-randombytes_nist(unsigned char *x, size_t xlen) {
-    unsigned char   block[16];
-    size_t          i = 0;
-
-    while ( xlen > 0 ) {
-        //increment V
-        for (int j = 15; j >= 0; j--) {
-            if ( DRBG_ctx.V[j] == 0xff ) {
-                DRBG_ctx.V[j] = 0x00;
-            } else {
-                DRBG_ctx.V[j]++;
-                break;
-            }
-        }
-        AES256_ECB(DRBG_ctx.Key, DRBG_ctx.V, block);
-        if ( xlen > 15 ) {
-            memcpy(x + i, block, 16);
-            i += 16;
-            xlen -= 16;
-        } else {
-            memcpy(x + i, block, xlen);
-            i += xlen;
-            xlen = 0;
-        }
-    }
-    AES256_CTR_DRBG_Update(NULL, DRBG_ctx.Key, DRBG_ctx.V);
-    DRBG_ctx.reseed_counter++;
-
-    return 0;
-}
-
-void
-AES256_CTR_DRBG_Update(unsigned char *provided_data,
-                       unsigned char *Key,
-                       unsigned char *V) {
-    unsigned char   temp[48];
-
-    for (int i = 0; i < 3; i++) {
-        //increment V
-        for (int j = 15; j >= 0; j--) {
-            if ( V[j] == 0xff ) {
-                V[j] = 0x00;
-            } else {
-                V[j]++;
-                break;
-            }
-        }
-
-        AES256_ECB(Key, V, temp + 16 * i);
-    }
-    if ( provided_data != NULL )
-        for (int i = 0; i < 48; i++) {
-            temp[i] ^= provided_data[i];
-        }
-    memcpy(Key, temp, 32);
-    memcpy(V, temp + 32, 16);
-}
-
-int randombytes(unsigned char *random_array, unsigned long long nbytes) {
-    int ret = randombytes_nist(random_array, nbytes);
-#ifdef ENABLE_CT_TESTING
-    VALGRIND_MAKE_MEM_UNDEFINED(random_array, ret);
-#endif
-    return ret;
-}
-
-void
-randombytes_init(unsigned char *entropy_input,
-                 unsigned char *personalization_string,
-                 int security_strength) {
-    return randombytes_init_nist(entropy_input, personalization_string, security_strength);
-}
--- a/src/common/generic/randombytes_system.c
+++ b/src/common/generic/randombytes_system.c
@@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

+#include <rng.h>
+
 #ifdef ENABLE_CT_TESTING
 #include <valgrind/memcheck.h>
 #endif
@@ -28,14 +30,14 @@ THE SOFTWARE.
 // *before* randombytes.h is included. Otherwise SYS_getrandom will not be
 // declared.
 #if defined(__linux__) || defined(__GNU__)
-# define _GNU_SOURCE
+#define _GNU_SOURCE
 #endif /* defined(__linux__) || defined(__GNU__) */

 #if defined(_WIN32)
 /* Windows */
-# include <windows.h>
-# include <wincrypt.h> /* CryptAcquireContext, CryptGenRandom */
-#endif /* defined(_WIN32) */
+#include <windows.h>
+#include <wincrypt.h> /* CryptAcquireContext, CryptGenRandom */
+#endif                /* defined(_WIN32) */

 /* wasi */
 #if defined(__wasi__)
@@ -44,7 +46,7 @@ THE SOFTWARE.

 /* kFreeBSD */
 #if defined(__FreeBSD_kernel__) && defined(__GLIBC__)
-# define GNU_KFREEBSD
+#define GNU_KFREEBSD
 #endif

 #if defined(__linux__) || defined(__GNU__) || defined(GNU_KFREEBSD)
@@ -53,344 +55,377 @@ THE SOFTWARE.
 // to the linux headers. We only need RNDGETENTCNT, so we instead inline it.
 // RNDGETENTCNT is originally defined in `include/uapi/linux/random.h` in the
 // linux repo.
-# define RNDGETENTCNT 0x80045200
+#define RNDGETENTCNT 0x80045200

-# include <assert.h>
-# include <errno.h>
-# include <fcntl.h>
-# include <poll.h>
-# include <stdint.h>
-# include <stdio.h>
-# include <sys/ioctl.h>
-# if (defined(__linux__) || defined(__GNU__)) && defined(__GLIBC__) && ((__GLIBC__ > 2) || (__GLIBC_MINOR__ > 24))
-#  define USE_GLIBC
-#  include <sys/random.h>
-# endif /* (defined(__linux__) || defined(__GNU__)) && defined(__GLIBC__) && ((__GLIBC__ > 2) || (__GLIBC_MINOR__ > 24)) */
-# include <sys/stat.h>
-# include <sys/syscall.h>
-# include <sys/types.h>
-# include <unistd.h>
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+#if (defined(__linux__) || defined(__GNU__)) && defined(__GLIBC__) &&                              \
+    ((__GLIBC__ > 2) || (__GLIBC_MINOR__ > 24))
+#define USE_GLIBC
+#include <sys/random.h>
+#endif /* (defined(__linux__) || defined(__GNU__)) && defined(__GLIBC__) && ((__GLIBC__ > 2) ||    \
+          (__GLIBC_MINOR__ > 24)) */
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>

 // We need SSIZE_MAX as the maximum read len from /dev/urandom
-# if !defined(SSIZE_MAX)
-#  define SSIZE_MAX (SIZE_MAX / 2 - 1)
-# endif /* defined(SSIZE_MAX) */
+#if !defined(SSIZE_MAX)
+#define SSIZE_MAX (SIZE_MAX / 2 - 1)
+#endif /* defined(SSIZE_MAX) */

 #endif /* defined(__linux__) || defined(__GNU__) || defined(GNU_KFREEBSD) */

-
 #if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))
 /* Dragonfly, FreeBSD, NetBSD, OpenBSD (has arc4random) */
-# include <sys/param.h>
-# if defined(BSD)
-#  include <stdlib.h>
-# endif
+#include <sys/param.h>
+#if defined(BSD)
+#include <stdlib.h>
+#endif
 /* GNU/Hurd defines BSD in sys/param.h which causes problems later */
-# if defined(__GNU__)
-#  undef BSD
-# endif
+#if defined(__GNU__)
+#undef BSD
+#endif
 #endif

 #if defined(__EMSCRIPTEN__)
-# include <assert.h>
-# include <emscripten.h>
-# include <errno.h>
-# include <stdbool.h>
+#include <assert.h>
+#include <emscripten.h>
+#include <errno.h>
+#include <stdbool.h>
 #endif /* defined(__EMSCRIPTEN__) */

-
 #if defined(_WIN32)
-static int randombytes_win32_randombytes(void* buf, size_t n)
+static int
+randombytes_win32_randombytes(void *buf, size_t n) /* 0 on success, -1 on any CryptoAPI failure */
 {
-	HCRYPTPROV ctx;
-	BOOL tmp;
-	DWORD to_read = 0;
-	const size_t MAX_DWORD = 0xFFFFFFFF;
+    HCRYPTPROV ctx;
+    BOOL tmp;
+    DWORD to_read = 0;
+    const size_t MAX_DWORD = 0xFFFFFFFF;

-	tmp = CryptAcquireContext(&ctx, NULL, NULL, PROV_RSA_FULL,
-	                          CRYPT_VERIFYCONTEXT);
-	if (tmp == FALSE) return -1;
+    tmp = CryptAcquireContext(&ctx, NULL, NULL, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT);
+    if (tmp == FALSE)
+        return -1;

-	while (n > 0) {
-		to_read = (DWORD)(n < MAX_DWORD ? n : MAX_DWORD);
-		tmp = CryptGenRandom(ctx, to_read, (BYTE*) buf);
-		if (tmp == FALSE) return -1;
-		buf = ((char*)buf) + to_read;
-		n -= to_read;
-	}
+    while (n > 0) {
+        to_read = (DWORD)(n < MAX_DWORD ? n : MAX_DWORD);
+        tmp = CryptGenRandom(ctx, to_read, (BYTE *)buf);
+        if (tmp == FALSE)
+            break; /* fall through to the release below instead of leaking ctx */
+        buf = ((char *)buf) + to_read;
+        n -= to_read;
+    }

-	tmp = CryptReleaseContext(ctx, 0);
-	if (tmp == FALSE) return -1;
+    /* Release the provider on every path; fail if generation or the release itself failed. */
+    if (CryptReleaseContext(ctx, 0) == FALSE || tmp == FALSE)
+        return -1;

-	return 0;
+    return 0;
 }
 #endif /* defined(_WIN32) */

 #if defined(__wasi__)
-static int randombytes_wasi_randombytes(void *buf, size_t n) {
-	arc4random_buf(buf, n);
-	return 0;
+static int
+randombytes_wasi_randombytes(void *buf, size_t n)
+{
+    arc4random_buf(buf, n);
+    return 0;
 }
 #endif /* defined(__wasi__) */

 #if (defined(__linux__) || defined(__GNU__)) && (defined(USE_GLIBC) || defined(SYS_getrandom))
-# if defined(USE_GLIBC)
+#if defined(USE_GLIBC)
 // getrandom is declared in glibc.
-# elif defined(SYS_getrandom)
-static ssize_t getrandom(void *buf, size_t buflen, unsigned int flags) {
-	return syscall(SYS_getrandom, buf, buflen, flags);
-}
-# endif
-
-static int randombytes_linux_randombytes_getrandom(void *buf, size_t n)
+#elif defined(SYS_getrandom)
+static ssize_t
+getrandom(void *buf, size_t buflen, unsigned int flags)
 {
-	/* I have thought about using a separate PRF, seeded by getrandom, but
-	 * it turns out that the performance of getrandom is good enough
-	 * (250 MB/s on my laptop).
-	 */
-	size_t offset = 0, chunk;
-	int ret;
-	while (n > 0) {
-		/* getrandom does not allow chunks larger than 33554431 */
-		chunk = n <= 33554431 ? n : 33554431;
-		do {
-			ret = getrandom((char *)buf + offset, chunk, 0);
-		} while (ret == -1 && errno == EINTR);
-		if (ret < 0) return ret;
-		offset += ret;
-		n -= ret;
-	}
-	assert(n == 0);
-	return 0;
+    return syscall(SYS_getrandom, buf, buflen, flags);
 }
-#endif /* (defined(__linux__) || defined(__GNU__)) && (defined(USE_GLIBC) || defined(SYS_getrandom)) */
+#endif
+
+static int
+randombytes_linux_randombytes_getrandom(void *buf, size_t n)
+{
+    /* I have thought about using a separate PRF, seeded by getrandom, but
+     * it turns out that the performance of getrandom is good enough
+     * (250 MB/s on my laptop).
+     */
+    size_t offset = 0, chunk;
+    int ret;
+    while (n > 0) {
+        /* getrandom does not allow chunks larger than 33554431 */
+        chunk = n <= 33554431 ? n : 33554431;
+        do {
+            ret = getrandom((char *)buf + offset, chunk, 0);
+        } while (ret == -1 && errno == EINTR);
+        if (ret < 0)
+            return ret;
+        offset += ret;
+        n -= ret;
+    }
+    assert(n == 0);
+    return 0;
+}
+#endif /* (defined(__linux__) || defined(__GNU__)) && (defined(USE_GLIBC) ||                       \
+          defined(SYS_getrandom)) */

 #if (defined(__linux__) || defined(GNU_KFREEBSD)) && !defined(SYS_getrandom)

-# if defined(__linux__)
-static int randombytes_linux_read_entropy_ioctl(int device, int *entropy)
+#if defined(__linux__)
+static int
+randombytes_linux_read_entropy_ioctl(int device, int *entropy)
 {
-	return ioctl(device, RNDGETENTCNT, entropy);
+    return ioctl(device, RNDGETENTCNT, entropy);
 }

-static int randombytes_linux_read_entropy_proc(FILE *stream, int *entropy)
+static int
+randombytes_linux_read_entropy_proc(FILE *stream, int *entropy)
 {
-	int retcode;
-	do {
-		rewind(stream);
-		retcode = fscanf(stream, "%d", entropy);
-	} while (retcode != 1 && errno == EINTR);
-	if (retcode != 1) {
-		return -1;
-	}
-	return 0;
+    int retcode;
+    do {
+        rewind(stream);
+        retcode = fscanf(stream, "%d", entropy);
+    } while (retcode != 1 && errno == EINTR);
+    if (retcode != 1) {
+        return -1;
+    }
+    return 0;
 }

-static int randombytes_linux_wait_for_entropy(int device)
+static int
+randombytes_linux_wait_for_entropy(int device)
 {
-	/* We will block on /dev/random, because any increase in the OS' entropy
-	 * level will unblock the request. I use poll here (as does libsodium),
-	 * because we don't *actually* want to read from the device. */
-	enum { IOCTL, PROC } strategy = IOCTL;
-	const int bits = 128;
-	struct pollfd pfd;
-	int fd;
-	FILE *proc_file;
-	int retcode, retcode_error = 0; // Used as return codes throughout this function
-	int entropy = 0;
+    /* Waits until at least `bits` bits of entropy are reported for `device`.
+     * Returns 0 on success; error paths return -1. We poll /dev/random (as
+     * does libsodium) instead of reading it: it only has to unblock us. */
+    enum
+    {
+        IOCTL,
+        PROC
+    } strategy = IOCTL;
+    const int bits = 128;
+    struct pollfd pfd;
+    int fd;
+    FILE *proc_file = NULL; // Only opened when falling back to PROC
+    int retcode, retcode_error = 0; // Used as return codes throughout this function
+    int entropy = 0;
 
-	/* If the device has enough entropy already, we will want to return early */
-	retcode = randombytes_linux_read_entropy_ioctl(device, &entropy);
-	// printf("errno: %d (%s)\n", errno, strerror(errno));
-	if (retcode != 0 && (errno == ENOTTY || errno == ENOSYS)) {
-		// The ioctl call on /dev/urandom has failed due to a
-		//   - ENOTTY (unsupported action), or
-		//   - ENOSYS (invalid ioctl; this happens on MIPS, see #22).
-		//
-		// We will fall back to reading from
-		// `/proc/sys/kernel/random/entropy_avail`.  This less ideal,
-		// because it allocates a file descriptor, and it may not work
-		// in a chroot.  But at this point it seems we have no better
-		// options left.
-		strategy = PROC;
-		// Open the entropy count file
-		proc_file = fopen("/proc/sys/kernel/random/entropy_avail", "r");
-		if (proc_file == NULL) {
-			return -1;
-		}
-	} else if (retcode != 0) {
-		// Unrecoverable ioctl error
-		return -1;
-	}
-	if (entropy >= bits) {
-		return 0;
-	}
+    /* If the device has enough entropy already, we will want to return early */
+    retcode = randombytes_linux_read_entropy_ioctl(device, &entropy);
+    if (retcode != 0 && (errno == ENOTTY || errno == ENOSYS)) {
+        // The ioctl call on /dev/urandom has failed due to a
+        //   - ENOTTY (unsupported action), or
+        //   - ENOSYS (invalid ioctl; this happens on MIPS, see #22).
+        //
+        // We will fall back to reading from
+        // `/proc/sys/kernel/random/entropy_avail`.  This is less ideal,
+        // because it allocates a file descriptor, and it may not work
+        // in a chroot.  But at this point it seems we have no better
+        // options left.
+        strategy = PROC;
+        // Open the entropy count file
+        proc_file = fopen("/proc/sys/kernel/random/entropy_avail", "r");
+        if (proc_file == NULL) {
+            return -1;
+        }
+    } else if (retcode != 0) {
+        // Unrecoverable ioctl error
+        return -1;
+    }
+    if (entropy >= bits) {
+        return 0;
+    }
 
-	do {
-		fd = open("/dev/random", O_RDONLY);
-	} while (fd == -1 && errno == EINTR); /* EAGAIN will not occur */
-	if (fd == -1) {
-		/* Unrecoverable IO error */
-		return -1;
-	}
+    do {
+        fd = open("/dev/random", O_RDONLY);
+    } while (fd == -1 && errno == EINTR); /* EAGAIN will not occur */
+    if (fd == -1) {
+        /* Unrecoverable IO error; release the fallback stream if it was opened */
+        if (strategy == PROC)
+            fclose(proc_file);
+        return -1;
+    }
 
-	pfd.fd = fd;
-	pfd.events = POLLIN;
-	for (;;) {
-		retcode = poll(&pfd, 1, -1);
-		if (retcode == -1 && (errno == EINTR || errno == EAGAIN)) {
-			continue;
-		} else if (retcode == 1) {
-			if (strategy == IOCTL) {
-				retcode = randombytes_linux_read_entropy_ioctl(device, &entropy);
-			} else if (strategy == PROC) {
-				retcode = randombytes_linux_read_entropy_proc(proc_file, &entropy);
-			} else {
-				return -1; // Unreachable
-			}
+    pfd.fd = fd;
+    pfd.events = POLLIN;
+    for (;;) {
+        retcode = poll(&pfd, 1, -1);
+        if (retcode == -1 && (errno == EINTR || errno == EAGAIN)) {
+            continue;
+        } else if (retcode == 1) {
+            if (strategy == IOCTL) {
+                retcode = randombytes_linux_read_entropy_ioctl(device, &entropy);
+            } else if (strategy == PROC) {
+                retcode = randombytes_linux_read_entropy_proc(proc_file, &entropy);
+            } else {
+                retcode = -1; // Unreachable; handled by the error check below
+            }
 
-			if (retcode != 0) {
-				// Unrecoverable I/O error
-				retcode_error = retcode;
-				break;
-			}
-			if (entropy >= bits) {
-				break;
-			}
-		} else {
-			// Unreachable: poll() should only return -1 or 1
-			retcode_error = -1;
-			break;
-		}
-	}
-	do {
-		retcode = close(fd);
-	} while (retcode == -1 && errno == EINTR);
-	if (strategy == PROC) {
-		do {
-			retcode = fclose(proc_file);
-		} while (retcode == -1 && errno == EINTR);
-	}
-	if (retcode_error != 0) {
-		return retcode_error;
-	}
-	return retcode;
+            if (retcode != 0) {
+                // Unrecoverable I/O error
+                retcode_error = retcode;
+                break;
+            }
+            if (entropy >= bits) {
+                break;
+            }
+        } else {
+            // Unreachable: poll() should only return -1 or 1
+            retcode_error = -1;
+            break;
+        }
+    }
+    /* Neither close() nor fclose() is retried on EINTR: Linux releases the
+     * descriptor even if close() is interrupted, and a stream must not be
+     * touched again after fclose(), whatever it returned. */
+    retcode = close(fd);
+    if (strategy == PROC) {
+        retcode = fclose(proc_file);
+    }
+    if (retcode_error != 0) {
+        return retcode_error;
+    }
+    return retcode;
 }
-# endif /* defined(__linux__) */
+#endif /* defined(__linux__) */

-
-static int randombytes_linux_randombytes_urandom(void *buf, size_t n)
+static int
+randombytes_linux_randombytes_urandom(void *buf, size_t n)
 {
-	int fd;
-	size_t offset = 0, count;
-	ssize_t tmp;
-	do {
-		fd = open("/dev/urandom", O_RDONLY);
-	} while (fd == -1 && errno == EINTR);
-	if (fd == -1) return -1;
-# if defined(__linux__)
-	if (randombytes_linux_wait_for_entropy(fd) == -1) return -1;
-# endif
+    /* Reads n bytes from /dev/urandom into buf; 0 on success, -1 on error. */
+    int fd;
+    size_t offset = 0;
+    ssize_t tmp;
+    do {
+        fd = open("/dev/urandom", O_RDONLY);
+    } while (fd == -1 && errno == EINTR);
+    if (fd == -1)
+        return -1;
+#if defined(__linux__)
+    if (randombytes_linux_wait_for_entropy(fd) == -1) {
+        close(fd); /* do not leak the descriptor on failure */
+        return -1;
+    }
+#endif
 
-	while (n > 0) {
-		count = n <= SSIZE_MAX ? n : SSIZE_MAX;
-		tmp = read(fd, (char *)buf + offset, count);
-		if (tmp == -1 && (errno == EAGAIN || errno == EINTR)) {
-			continue;
-		}
-		if (tmp == -1) return -1; /* Unrecoverable IO error */
-		offset += tmp;
-		n -= tmp;
-	}
-	close(fd);
-	assert(n == 0);
-	return 0;
+    while (n > 0) {
+        tmp = read(fd, (char *)buf + offset, n <= SSIZE_MAX ? n : SSIZE_MAX);
+        if (tmp == -1 && (errno == EAGAIN || errno == EINTR))
+            continue;
+        if (tmp == -1)
+            break; /* Unrecoverable IO error; n stays non-zero */
+        offset += tmp;
+        n -= tmp;
+    }
+    close(fd); /* reached on success and on read failure alike */
+    return n == 0 ? 0 : -1;
 }
 #endif /* defined(__linux__) && !defined(SYS_getrandom) */

-
 #if defined(BSD)
-static int randombytes_bsd_randombytes(void *buf, size_t n)
+static int
+randombytes_bsd_randombytes(void *buf, size_t n)
 {
-	arc4random_buf(buf, n);
-	return 0;
+    arc4random_buf(buf, n);
+    return 0;
 }
 #endif /* defined(BSD) */

-
 #if defined(__EMSCRIPTEN__)
-static int randombytes_js_randombytes_nodejs(void *buf, size_t n) {
-	const int ret = EM_ASM_INT({
-		var crypto;
-		try {
-			crypto = require('crypto');
-		} catch (error) {
-			return -2;
-		}
-		try {
-			writeArrayToMemory(crypto.randomBytes($1), $0);
-			return 0;
-		} catch (error) {
-			return -1;
-		}
-	}, buf, n);
-	switch (ret) {
-	case 0:
-		return 0;
-	case -1:
-		errno = EINVAL;
-		return -1;
-	case -2:
-		errno = ENOSYS;
-		return -1;
-	}
-	assert(false); // Unreachable
+static int
+randombytes_js_randombytes_nodejs(void *buf, size_t n)
+{
+    /* Asks Node.js' crypto module for n random bytes and copies them to buf. */
+    const int ret = EM_ASM_INT(
+        {
+            var crypto;
+            try {
+                crypto = require('crypto');
+            } catch (error) {
+                return -2;
+            }
+            try {
+                writeArrayToMemory(crypto.randomBytes($1), $0);
+                return 0;
+            } catch (error) {
+                return -1;
+            }
+        },
+        buf, n);
+    /* Exhaustive switch: nothing falls off the end when assert() is compiled out. */
+    switch (ret) {
+        case 0:
+            return 0;
+        case -2: /* require('crypto') threw */
+            errno = ENOSYS;
+            return -1;
+        default: /* -1: crypto.randomBytes() or the copy into memory threw */
+            errno = EINVAL;
+            return -1;
+    }
 }
 #endif /* defined(__EMSCRIPTEN__) */

-
-static int randombytes_select(void *buf, size_t n)
+SQISIGN_API
+int
+randombytes_select(unsigned char *buf, unsigned long long n)
 {
 #if defined(__EMSCRIPTEN__)
-	return randombytes_js_randombytes_nodejs(buf, n);   
+    return randombytes_js_randombytes_nodejs(buf, n);
 #elif defined(__linux__) || defined(__GNU__) || defined(GNU_KFREEBSD)
-# if defined(USE_GLIBC)
-	/* Use getrandom system call */
-	return randombytes_linux_randombytes_getrandom(buf, n);
-# elif defined(SYS_getrandom)
-	/* Use getrandom system call */
-	return randombytes_linux_randombytes_getrandom(buf, n);
-# else
-	/* When we have enough entropy, we can read from /dev/urandom */
-	return randombytes_linux_randombytes_urandom(buf, n);
-# endif
-#elif defined(BSD)
-	/* Use arc4random system call */
-	return randombytes_bsd_randombytes(buf, n);
-#elif defined(_WIN32)
-	/* Use windows API */
-	return randombytes_win32_randombytes(buf, n);
-#elif defined(__wasi__)
-	/* Use WASI */
-	return randombytes_wasi_randombytes(buf, n);
+#if defined(USE_GLIBC)
+    /* Use getrandom system call */
+    return randombytes_linux_randombytes_getrandom(buf, n);
+#elif defined(SYS_getrandom)
+    /* Use getrandom system call */
+    return randombytes_linux_randombytes_getrandom(buf, n);
 #else
-# error "randombytes(...) is not supported on this platform"
+    /* When we have enough entropy, we can read from /dev/urandom */
+    return randombytes_linux_randombytes_urandom(buf, n);
+#endif
+#elif defined(BSD)
+    /* Use arc4random system call */
+    return randombytes_bsd_randombytes(buf, n);
+#elif defined(_WIN32)
+    /* Use windows API */
+    return randombytes_win32_randombytes(buf, n);
+#elif defined(__wasi__)
+    /* Use WASI */
+    return randombytes_wasi_randombytes(buf, n);
+#else
+#error "randombytes(...) is not supported on this platform"
 #endif
 }

-int randombytes(unsigned char *x, unsigned long long xlen) {
+#ifdef RANDOMBYTES_SYSTEM
+SQISIGN_API
+int
+randombytes(unsigned char *x, unsigned long long xlen)
+{

-    int ret = randombytes_select(x, (size_t) xlen);
+    int ret = randombytes_select(x, (size_t)xlen);
 #ifdef ENABLE_CT_TESTING
    VALGRIND_MAKE_MEM_UNDEFINED(x, xlen);
 #endif
    return ret;
 }

-void randombytes_init(unsigned char *entropy_input,
-                      unsigned char *personalization_string,
-                      int security_strength) {
-    (void) entropy_input;
-    (void) personalization_string;
-    (void) security_strength;
+SQISIGN_API
+void
+randombytes_init(unsigned char *entropy_input,
+                 unsigned char *personalization_string,
+                 int security_strength)
+{
+    (void)entropy_input;
+    (void)personalization_string;
+    (void)security_strength;
 }
+#endif
--- a/src/common/generic/test/bench_ctrdrbg.c
+++ b/src/common/generic/test/bench_ctrdrbg.c
@@ -0,0 +1,57 @@
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "bench.h"
+
+#define RANDOMBYTES_MAX_LENGTH 131072
+#define STRINGIFY2(x) #x
+#define STRINGIFY(x) STRINGIFY2(x)
+
+void
+randombytes_init_nist(unsigned char *entropy_input,
+                      unsigned char *personalization_string,
+                      int security_strength);
+
+int
+randombytes_nist(unsigned char *x, size_t xlen);
+
+void
+RANDOMBYTES_INIT_PLATFORM(unsigned char *entropy_input,
+                          unsigned char *personalization_string,
+                          int security_strength);
+
+int
+RANDOMBYTES_PLATFORM(unsigned char *x, size_t xlen);
+
+int
+randombytes_select(unsigned char *buf, unsigned long long n); // parameter types must match the definition
+
+// Benchmarks RANDOMBYTES_PLATFORM, randombytes_nist and randombytes_select.
+int main(int argc, char *argv[]) {
+  unsigned char x[RANDOMBYTES_MAX_LENGTH];
+
+#ifndef NDEBUG
+  fputs("\x1b[31mIt looks like SQIsign was compiled with assertions enabled.\n"
+        "This will severely impact performance measurements.\x1b[0m\n",
+        stderr);
+#endif
+
+  printf("Running AES-CTR-DRBG benchmarks\n");
+  cpucycles_init();
+
+  // BENCH_CODE_1/BENCH_CODE_2 (presumably from bench.h) bracket the timed call.
+  BENCH_CODE_1(1000 * SQISIGN_TEST_REPS);
+  RANDOMBYTES_PLATFORM(x, RANDOMBYTES_MAX_LENGTH);
+  BENCH_CODE_2(STRINGIFY(RANDOMBYTES_PLATFORM));
+
+  BENCH_CODE_1(SQISIGN_TEST_REPS);
+  randombytes_nist(x, RANDOMBYTES_MAX_LENGTH);
+  BENCH_CODE_2("randombytes_nist");
+
+  BENCH_CODE_1(1000 * SQISIGN_TEST_REPS);
+  randombytes_select(x, RANDOMBYTES_MAX_LENGTH);
+  BENCH_CODE_2("randombytes_system");
+
+  return 0;
+}
--- a/src/common/generic/test/test_ctrdrbg.c
+++ b/src/common/generic/test/test_ctrdrbg.c
@@ -0,0 +1,68 @@
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+
+#define RANDOMBYTES_MAX_LENGTH 131072
+#define STRINGIFY2(x) #x
+#define STRINGIFY(x) STRINGIFY2(x)
+
+void
+randombytes_init_nist(unsigned char *entropy_input,
+                      unsigned char *personalization_string,
+                      int security_strength);
+
+int
+randombytes_nist(unsigned char *x, size_t xlen);
+
+void
+RANDOMBYTES_INIT_PLATFORM(unsigned char *entropy_input,
+                          unsigned char *personalization_string,
+                          int security_strength);
+
+int
+RANDOMBYTES_PLATFORM(unsigned char *x, size_t xlen);
+
+int
+randombytes_select(unsigned char *buf, unsigned long long n); // parameter types must match the definition
+
+// Cross-checks RANDOMBYTES_PLATFORM against randombytes_nist for eight seeds.
+int main(int argc, char *argv[]) {
+  unsigned char seed[48];
+  unsigned char x_nist[RANDOMBYTES_MAX_LENGTH], x_platform[RANDOMBYTES_MAX_LENGTH];
+  int failed = 0;
+
+  printf("Running AES-CTR-DRBG unit tests\n");
+
+  for (int i = 0; i < 8; i++) {
+    // Every seed byte carries the single bit 1 << i
+    memset(seed, 1 << i, sizeof(seed));
+    // Seed both generators identically
+    RANDOMBYTES_INIT_PLATFORM(seed, NULL, 256);
+    randombytes_init_nist(seed, NULL, 256);
+
+    // Only RANDOMBYTES_MAX_LENGTH is exercised: doubling it ends the loop
+    for (int len = RANDOMBYTES_MAX_LENGTH; len <= RANDOMBYTES_MAX_LENGTH; len *= 2) {
+      RANDOMBYTES_PLATFORM(x_platform, len);
+      randombytes_nist(x_nist, len);
+
+      if (memcmp(x_platform, x_nist, len) == 0) {
+        continue;
+      }
+      failed = 1;
+      // memcmp found a difference, so this scan stops inside the buffers
+      int k = 0;
+      while (x_platform[k] == x_nist[k]) {
+        k++;
+      }
+      printf("Test failed for seed = %d, length = %d bytes: mismatch at index %d: %d != %d\n", i, len, k, x_platform[k], x_nist[k]);
+    }
+  }
+
+  if (failed) {
+    printf("\nSome tests failed!\n");
+  } else {
+    printf("\nAll tests passed!\n");
+  }
+
+  return failed;
+}
--- a/src/common/generic/tools.c
+++ b/src/common/generic/tools.c
@@ -0,0 +1,75 @@
+#include <stdio.h>
+#include <time.h>
+
+static clock_t global_timer;
+
+clock_t
+tic(void)
+{
+    /* Restart the file-level stopwatch and hand back its new start value. */
+    return global_timer = clock();
+}
+
+float
+tac(void)
+{
+    /* Milliseconds of processor time since global_timer was last set by tic(). */
+    return (float)(1000. * (float)(clock() - global_timer) / CLOCKS_PER_SEC);
+}
+
+float
+TAC(const char *str)
+{
+    const float elapsed_ms = (float)(1000. * (float)(clock() - global_timer) / CLOCKS_PER_SEC);
+#ifndef NDEBUG /* the labelled report is emitted by debug builds only */
+    printf("%s [%d ms]\n", str, (int)elapsed_ms);
+#endif
+    return elapsed_ms; /* milliseconds since global_timer was last set */
+}
+
+float
+toc(const clock_t t)
+{
+    /* Milliseconds of processor time elapsed since the clock() sample t. */
+    return (float)(1000. * (float)(clock() - t) / CLOCKS_PER_SEC);
+}
+
+float
+TOC(const clock_t t, const char *str)
+{
+    /* Prints "<str> [<whole ms> ms]" for the time since t; returns the ms. */
+    const clock_t ticks = clock() - t;
+    const float elapsed_ms = (float)(1000. * (float)ticks / CLOCKS_PER_SEC);
+    printf("%s [%d ms]\n", str, (int)elapsed_ms);
+    return elapsed_ms;
+}
+
+float
+TOC_clock(const clock_t t, const char *str)
+{
+    printf("%s [%ld]\n", str, (long)(clock() - t)); /* cast: clock_t need not be long */
+    return (float)(clock() - t); /* raw clock ticks since t, sampled again after printing */
+}
+
+clock_t
+dclock(const clock_t t)
+{
+    return clock() - t; /* clock ticks elapsed since the sample t */
+}
+
+float
+clock_to_time(const clock_t t, const char *str)
+{
+    /* Converts the tick count t to milliseconds, prints it as
+     * "<str> [<whole ms> ms]" and returns the unrounded value. */
+    const float elapsed_ms = (float)(1000. * (float)t / CLOCKS_PER_SEC);
+    printf("%s [%d ms]\n", str, (int)elapsed_ms);
+    return elapsed_ms;
+}
+
+float
+clock_print(const clock_t t, const char *str)
+{
+    printf("%s [%ld]\n", str, (long)t); /* cast: clock_t need not be long */
+    return (float)(t); /* the tick count itself, unconverted */
+}
--- a/src/common/ref/CMakeLists.txt
+++ b/src/common/ref/CMakeLists.txt
@@ -0,0 +1,10 @@
+# Sources this directory contributes to the sqisign_common_test target.
+set(SOURCE_FILES_COMMON_TEST_REF randombytes_ctrdrbg.c aes_c.c)
+
+target_sources(sqisign_common_test PRIVATE ${SOURCE_FILES_COMMON_TEST_REF})
+target_include_directories(sqisign_common_test PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include")
+target_compile_definitions(sqisign_common_test PRIVATE RANDOMBYTES_C)
+
+# NOTE(review): RANDOMBYTES_SYSTEM appears to enable the #ifdef-guarded
+# randombytes()/randombytes_init() definitions -- confirm against the C source.
+target_compile_definitions(sqisign_common_sys PRIVATE RANDOMBYTES_SYSTEM)
--- a/src/common/generic/aes_c.c
+++ b/src/common/generic/aes_c.c
@@ -39,23 +39,24 @@
 #define AESCTR_NONCEBYTES 12
 #define AES_BLOCKBYTES 16

-// We've put these states on the heap to make sure ctx_release is used.
 #define PQC_AES128_STATESIZE 88
-typedef struct {
-    uint64_t *sk_exp;
+typedef struct
+{
+    uint64_t sk_exp[PQC_AES128_STATESIZE];
 } aes128ctx;

 #define PQC_AES192_STATESIZE 104
-typedef struct {
-    uint64_t  *sk_exp;
+typedef struct
+{
+    uint64_t sk_exp[PQC_AES192_STATESIZE];
 } aes192ctx;

 #define PQC_AES256_STATESIZE 120
-typedef struct {
-    uint64_t *sk_exp;
+typedef struct
+{
+    uint64_t sk_exp[PQC_AES256_STATESIZE];
 } aes256ctx;

-
 /** Initializes the context **/
 void aes128_ecb_keyexp(aes128ctx *r, const unsigned char *key);

@@ -68,7 +69,6 @@ void aes128_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, cons
 /** Frees the context **/
 void aes128_ctx_release(aes128ctx *r);

-
 /** Initializes the context **/
 void aes192_ecb_keyexp(aes192ctx *r, const unsigned char *key);

@@ -80,7 +80,6 @@ void aes192_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, cons

 void aes192_ctx_release(aes192ctx *r);

-
 /** Initializes the context **/
 void aes256_ecb_keyexp(aes256ctx *r, const unsigned char *key);

@@ -93,46 +92,50 @@ void aes256_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, cons
 /** Frees the context **/
 void aes256_ctx_release(aes256ctx *r);

-static inline uint32_t br_dec32le(const unsigned char *src) {
-    return (uint32_t)src[0]
-           | ((uint32_t)src[1] << 8)
-           | ((uint32_t)src[2] << 16)
-           | ((uint32_t)src[3] << 24);
+static inline uint32_t
+br_dec32le(const unsigned char *src)
+{
+    return (uint32_t)src[0] | ((uint32_t)src[1] << 8) | ((uint32_t)src[2] << 16) |
+           ((uint32_t)src[3] << 24);
 }

-
-static void br_range_dec32le(uint32_t *v, size_t num, const unsigned char *src) {
+static void
+br_range_dec32le(uint32_t *v, size_t num, const unsigned char *src)
+{
    while (num-- > 0) {
-        *v ++ = br_dec32le(src);
+        *v++ = br_dec32le(src);
        src += 4;
    }
 }

-
-static inline uint32_t br_swap32(uint32_t x) {
-    x = ((x & (uint32_t)0x00FF00FF) << 8)
-        | ((x >> 8) & (uint32_t)0x00FF00FF);
+static inline uint32_t
+br_swap32(uint32_t x)
+{
+    x = ((x & (uint32_t)0x00FF00FF) << 8) | ((x >> 8) & (uint32_t)0x00FF00FF);
    return (x << 16) | (x >> 16);
 }

-
-static inline void br_enc32le(unsigned char *dst, uint32_t x) {
+static inline void
+br_enc32le(unsigned char *dst, uint32_t x)
+{
    dst[0] = (unsigned char)x;
    dst[1] = (unsigned char)(x >> 8);
    dst[2] = (unsigned char)(x >> 16);
    dst[3] = (unsigned char)(x >> 24);
 }

-
-static void br_range_enc32le(unsigned char *dst, const uint32_t *v, size_t num) {
+static void
+br_range_enc32le(unsigned char *dst, const uint32_t *v, size_t num)
+{
    while (num-- > 0) {
-        br_enc32le(dst, *v ++);
+        br_enc32le(dst, *v++);
        dst += 4;
    }
 }

-
-static void br_aes_ct64_bitslice_Sbox(uint64_t *q) {
+static void
+br_aes_ct64_bitslice_Sbox(uint64_t *q)
+{
    /*
     * This S-box implementation is a straightforward translation of
     * the circuit described by Boyar and Peralta in "A new
@@ -306,18 +309,21 @@ static void br_aes_ct64_bitslice_Sbox(uint64_t *q) {
    q[0] = s7;
 }

-static void br_aes_ct64_ortho(uint64_t *q) {
-#define SWAPN(cl, ch, s, x, y)   do { \
-        uint64_t a, b; \
-        a = (x); \
-        b = (y); \
-        (x) = (a & (uint64_t)(cl)) | ((b & (uint64_t)(cl)) << (s)); \
-        (y) = ((a & (uint64_t)(ch)) >> (s)) | (b & (uint64_t)(ch)); \
+static void
+br_aes_ct64_ortho(uint64_t *q)
+{
+#define SWAPN(cl, ch, s, x, y)                                                                     \
+    do {                                                                                           \
+        uint64_t a, b;                                                                             \
+        a = (x);                                                                                   \
+        b = (y);                                                                                   \
+        (x) = (a & (uint64_t)(cl)) | ((b & (uint64_t)(cl)) << (s));                                \
+        (y) = ((a & (uint64_t)(ch)) >> (s)) | (b & (uint64_t)(ch));                                \
    } while (0)

-#define SWAP2(x, y)    SWAPN(0x5555555555555555, 0xAAAAAAAAAAAAAAAA,  1, x, y)
-#define SWAP4(x, y)    SWAPN(0x3333333333333333, 0xCCCCCCCCCCCCCCCC,  2, x, y)
-#define SWAP8(x, y)    SWAPN(0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0,  4, x, y)
+#define SWAP2(x, y) SWAPN(0x5555555555555555, 0xAAAAAAAAAAAAAAAA, 1, x, y)
+#define SWAP4(x, y) SWAPN(0x3333333333333333, 0xCCCCCCCCCCCCCCCC, 2, x, y)
+#define SWAP8(x, y) SWAPN(0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 4, x, y)

    SWAP2(q[0], q[1]);
    SWAP2(q[2], q[3]);
@@ -335,8 +341,9 @@ static void br_aes_ct64_ortho(uint64_t *q) {
    SWAP8(q[3], q[7]);
 }

-
-static void br_aes_ct64_interleave_in(uint64_t *q0, uint64_t *q1, const uint32_t *w) {
+static void
+br_aes_ct64_interleave_in(uint64_t *q0, uint64_t *q1, const uint32_t *w)
+{
    uint64_t x0, x1, x2, x3;

    x0 = w[0];
@@ -363,8 +370,9 @@ static void br_aes_ct64_interleave_in(uint64_t *q0, uint64_t *q1, const uint32_t
    *q1 = x1 | (x3 << 8);
 }

-
-static void br_aes_ct64_interleave_out(uint32_t *w, uint64_t q0, uint64_t q1) {
+static void
+br_aes_ct64_interleave_out(uint32_t *w, uint64_t q0, uint64_t q1)
+{
    uint64_t x0, x1, x2, x3;

    x0 = q0 & (uint64_t)0x00FF00FF00FF00FF;
@@ -385,11 +393,11 @@ static void br_aes_ct64_interleave_out(uint32_t *w, uint64_t q0, uint64_t q1) {
    w[3] = (uint32_t)x3 | (uint32_t)(x3 >> 16);
 }

-static const unsigned char Rcon[] = {
-    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
-};
+static const unsigned char Rcon[] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36 };

-static uint32_t sub_word(uint32_t x) {
+static uint32_t
+sub_word(uint32_t x)
+{
    uint64_t q[8];

    memset(q, 0, sizeof q);
@@ -400,7 +408,9 @@ static uint32_t sub_word(uint32_t x) {
    return (uint32_t)q[0];
 }

-static void br_aes_ct64_keysched(uint64_t *comp_skey, const unsigned char *key, unsigned int key_len) {
+static void
+br_aes_ct64_keysched(uint64_t *comp_skey, const unsigned char *key, unsigned int key_len)
+{
    unsigned int i, j, k, nk, nkf;
    uint32_t tmp;
    uint32_t skey[60];
@@ -410,7 +420,7 @@ static void br_aes_ct64_keysched(uint64_t *comp_skey, const unsigned char *key,
    nkf = ((nrounds + 1) << 2);
    br_range_dec32le(skey, (key_len >> 2), key);
    tmp = skey[(key_len >> 2) - 1];
-    for (i = nk, j = 0, k = 0; i < nkf; i ++) {
+    for (i = nk, j = 0, k = 0; i < nkf; i++) {
        if (j == 0) {
            tmp = (tmp << 24) | (tmp >> 8);
            tmp = sub_word(tmp) ^ Rcon[k];
@@ -419,9 +429,9 @@ static void br_aes_ct64_keysched(uint64_t *comp_skey, const unsigned char *key,
        }
        tmp ^= skey[i - nk];
        skey[i] = tmp;
-        if (++ j == nk) {
+        if (++j == nk) {
            j = 0;
-            k ++;
+            k++;
        }
    }

@@ -437,23 +447,21 @@ static void br_aes_ct64_keysched(uint64_t *comp_skey, const unsigned char *key,
        q[7] = q[4];
        br_aes_ct64_ortho(q);
        comp_skey[j + 0] =
-            (q[0] & (uint64_t)0x1111111111111111)
-            | (q[1] & (uint64_t)0x2222222222222222)
-            | (q[2] & (uint64_t)0x4444444444444444)
-            | (q[3] & (uint64_t)0x8888888888888888);
+            (q[0] & (uint64_t)0x1111111111111111) | (q[1] & (uint64_t)0x2222222222222222) |
+            (q[2] & (uint64_t)0x4444444444444444) | (q[3] & (uint64_t)0x8888888888888888);
        comp_skey[j + 1] =
-            (q[4] & (uint64_t)0x1111111111111111)
-            | (q[5] & (uint64_t)0x2222222222222222)
-            | (q[6] & (uint64_t)0x4444444444444444)
-            | (q[7] & (uint64_t)0x8888888888888888);
+            (q[4] & (uint64_t)0x1111111111111111) | (q[5] & (uint64_t)0x2222222222222222) |
+            (q[6] & (uint64_t)0x4444444444444444) | (q[7] & (uint64_t)0x8888888888888888);
    }
 }

-static void br_aes_ct64_skey_expand(uint64_t *skey, const uint64_t *comp_skey, unsigned int nrounds) {
+static void
+br_aes_ct64_skey_expand(uint64_t *skey, const uint64_t *comp_skey, unsigned int nrounds)
+{
    unsigned u, v, n;

    n = (nrounds + 1) << 1;
-    for (u = 0, v = 0; u < n; u ++, v += 4) {
+    for (u = 0, v = 0; u < n; u++, v += 4) {
        uint64_t x0, x1, x2, x3;

        x0 = x1 = x2 = x3 = comp_skey[u];
@@ -471,8 +479,9 @@ static void br_aes_ct64_skey_expand(uint64_t *skey, const uint64_t *comp_skey, u
    }
 }

-
-static inline void add_round_key(uint64_t *q, const uint64_t *sk) {
+static inline void
+add_round_key(uint64_t *q, const uint64_t *sk)
+{
    q[0] ^= sk[0];
    q[1] ^= sk[1];
    q[2] ^= sk[2];
@@ -483,28 +492,32 @@ static inline void add_round_key(uint64_t *q, const uint64_t *sk) {
    q[7] ^= sk[7];
 }

-static inline void shift_rows(uint64_t *q) {
+static inline void
+shift_rows(uint64_t *q)
+{
    int i;

-    for (i = 0; i < 8; i ++) {
+    for (i = 0; i < 8; i++) {
        uint64_t x;

        x = q[i];
-        q[i] = (x & (uint64_t)0x000000000000FFFF)
-               | ((x & (uint64_t)0x00000000FFF00000) >> 4)
-               | ((x & (uint64_t)0x00000000000F0000) << 12)
-               | ((x & (uint64_t)0x0000FF0000000000) >> 8)
-               | ((x & (uint64_t)0x000000FF00000000) << 8)
-               | ((x & (uint64_t)0xF000000000000000) >> 12)
-               | ((x & (uint64_t)0x0FFF000000000000) << 4);
+        q[i] =
+            (x & (uint64_t)0x000000000000FFFF) | ((x & (uint64_t)0x00000000FFF00000) >> 4) |
+            ((x & (uint64_t)0x00000000000F0000) << 12) | ((x & (uint64_t)0x0000FF0000000000) >> 8) |
+            ((x & (uint64_t)0x000000FF00000000) << 8) | ((x & (uint64_t)0xF000000000000000) >> 12) |
+            ((x & (uint64_t)0x0FFF000000000000) << 4);
    }
 }

-static inline uint64_t rotr32(uint64_t x) {
+static inline uint64_t
+rotr32(uint64_t x)
+{
    return (x << 32) | (x >> 32);
 }

-static inline void mix_columns(uint64_t *q) {
+static inline void
+mix_columns(uint64_t *q)
+{
    uint64_t q0, q1, q2, q3, q4, q5, q6, q7;
    uint64_t r0, r1, r2, r3, r4, r5, r6, r7;

@@ -535,14 +548,19 @@ static inline void mix_columns(uint64_t *q) {
    q[7] = q6 ^ r6 ^ r7 ^ rotr32(q7 ^ r7);
 }

-
-static void inc4_be(uint32_t *x) {
+static void
+inc4_be(uint32_t *x)
+{
    uint32_t t = br_swap32(*x) + 4;
    *x = br_swap32(t);
 }

-
-static void aes_ecb4x(unsigned char out[64], const uint32_t ivw[16], const uint64_t *sk_exp, unsigned int nrounds) {
+static void
+aes_ecb4x(unsigned char out[64],
+          const uint32_t ivw[16],
+          const uint64_t *sk_exp,
+          unsigned int nrounds)
+{
    uint32_t w[16];
    uint64_t q[8];
    unsigned int i;
@@ -553,7 +571,6 @@ static void aes_ecb4x(unsigned char out[64], const uint32_t ivw[16], const uint6
    }
    br_aes_ct64_ortho(q);

-
    add_round_key(q, sk_exp);
    for (i = 1; i < nrounds; i++) {
        br_aes_ct64_bitslice_Sbox(q);
@@ -566,14 +583,15 @@ static void aes_ecb4x(unsigned char out[64], const uint32_t ivw[16], const uint6
    add_round_key(q, sk_exp + 8 * nrounds);

    br_aes_ct64_ortho(q);
-    for (i = 0; i < 4; i ++) {
+    for (i = 0; i < 4; i++) {
        br_aes_ct64_interleave_out(w + (i << 2), q[i], q[i + 4]);
    }
    br_range_enc32le(out, w, 16);
 }

-
-static void aes_ctr4x(unsigned char out[64], uint32_t ivw[16], const uint64_t *sk_exp, unsigned int nrounds) {
+static void
+aes_ctr4x(unsigned char out[64], uint32_t ivw[16], const uint64_t *sk_exp, unsigned int nrounds)
+{
    aes_ecb4x(out, ivw, sk_exp, nrounds);

    /* Increase counter for next 4 blocks */
@@ -583,8 +601,13 @@ static void aes_ctr4x(unsigned char out[64], uint32_t ivw[16], const uint64_t *s
    inc4_be(ivw + 15);
 }

-
-static void aes_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const uint64_t *rkeys, unsigned int nrounds) {
+static void
+aes_ecb(unsigned char *out,
+        const unsigned char *in,
+        size_t nblocks,
+        const uint64_t *rkeys,
+        unsigned int nrounds)
+{
    uint32_t blocks[16];
    unsigned char t[64];

@@ -603,18 +626,23 @@ static void aes_ecb(unsigned char *out, const unsigned char *in, size_t nblocks,
    }
 }

-
-static void aes_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const uint64_t *rkeys, unsigned int nrounds) {
+static void
+aes_ctr(unsigned char *out,
+        size_t outlen,
+        const unsigned char *iv,
+        const uint64_t *rkeys,
+        unsigned int nrounds)
+{
    uint32_t ivw[16];
    size_t i;
    uint32_t cc = 0;

    br_range_dec32le(ivw, 3, iv);
-    memcpy(ivw +  4, ivw, 3 * sizeof(uint32_t));
-    memcpy(ivw +  8, ivw, 3 * sizeof(uint32_t));
+    memcpy(ivw + 4, ivw, 3 * sizeof(uint32_t));
+    memcpy(ivw + 8, ivw, 3 * sizeof(uint32_t));
    memcpy(ivw + 12, ivw, 3 * sizeof(uint32_t));
-    ivw[ 3] = br_swap32(cc);
-    ivw[ 7] = br_swap32(cc + 1);
+    ivw[3] = br_swap32(cc);
+    ivw[7] = br_swap32(cc + 1);
    ivw[11] = br_swap32(cc + 2);
    ivw[15] = br_swap32(cc + 3);

@@ -632,97 +660,110 @@ static void aes_ctr(unsigned char *out, size_t outlen, const unsigned char *iv,
    }
 }

-void aes128_ecb_keyexp(aes128ctx *r, const unsigned char *key) {
+void
+aes128_ecb_keyexp(aes128ctx *r, const unsigned char *key)
+{
    uint64_t skey[22];

-    r->sk_exp = malloc(sizeof(uint64_t) * PQC_AES128_STATESIZE);
-    if (r->sk_exp == NULL) {
-        exit(111);
-    }
-
    br_aes_ct64_keysched(skey, key, 16);
    br_aes_ct64_skey_expand(r->sk_exp, skey, 10);
 }

-void aes128_ctr_keyexp(aes128ctx *r, const unsigned char *key) {
+void
+aes128_ctr_keyexp(aes128ctx *r, const unsigned char *key)
+{
    aes128_ecb_keyexp(r, key);
 }

-
-void aes192_ecb_keyexp(aes192ctx *r, const unsigned char *key) {
+void
+aes192_ecb_keyexp(aes192ctx *r, const unsigned char *key)
+{
    uint64_t skey[26];
-    r->sk_exp = malloc(sizeof(uint64_t) * PQC_AES192_STATESIZE);
-    if (r->sk_exp == NULL) {
-        exit(111);
-    }

    br_aes_ct64_keysched(skey, key, 24);
    br_aes_ct64_skey_expand(r->sk_exp, skey, 12);
 }

-
-void aes192_ctr_keyexp(aes192ctx *r, const unsigned char *key) {
+void
+aes192_ctr_keyexp(aes192ctx *r, const unsigned char *key)
+{
    aes192_ecb_keyexp(r, key);
 }

-
-void aes256_ecb_keyexp(aes256ctx *r, const unsigned char *key) {
+void
+aes256_ecb_keyexp(aes256ctx *r, const unsigned char *key)
+{
    uint64_t skey[30];
-    r->sk_exp = malloc(sizeof(uint64_t) * PQC_AES256_STATESIZE);
-    if (r->sk_exp == NULL) {
-        exit(111);
-    }

    br_aes_ct64_keysched(skey, key, 32);
    br_aes_ct64_skey_expand(r->sk_exp, skey, 14);
 }

-
-void aes256_ctr_keyexp(aes256ctx *r, const unsigned char *key) {
+void
+aes256_ctr_keyexp(aes256ctx *r, const unsigned char *key)
+{
    aes256_ecb_keyexp(r, key);
 }

-
-void aes128_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes128ctx *ctx) {
+void
+aes128_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes128ctx *ctx)
+{
    aes_ecb(out, in, nblocks, ctx->sk_exp, 10);
 }

-void aes128_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes128ctx *ctx) {
+void
+aes128_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes128ctx *ctx)
+{
    aes_ctr(out, outlen, iv, ctx->sk_exp, 10);
 }

-void aes192_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes192ctx *ctx) {
+void
+aes192_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes192ctx *ctx)
+{
    aes_ecb(out, in, nblocks, ctx->sk_exp, 12);
 }

-void aes192_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes192ctx *ctx) {
+void
+aes192_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes192ctx *ctx)
+{
    aes_ctr(out, outlen, iv, ctx->sk_exp, 12);
 }

-void aes256_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes256ctx *ctx) {
+void
+aes256_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes256ctx *ctx)
+{
    aes_ecb(out, in, nblocks, ctx->sk_exp, 14);
 }

-void aes256_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes256ctx *ctx) {
+void
+aes256_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes256ctx *ctx)
+{
    aes_ctr(out, outlen, iv, ctx->sk_exp, 14);
 }

-void aes128_ctx_release(aes128ctx *r) {
-    free(r->sk_exp);
+void
+aes128_ctx_release(aes128ctx *r)
+{
 }

-void aes192_ctx_release(aes192ctx *r) {
-    free(r->sk_exp);
+void
+aes192_ctx_release(aes192ctx *r)
+{
 }

-void aes256_ctx_release(aes256ctx *r) {
-    free(r->sk_exp);
+void
+aes256_ctx_release(aes256ctx *r)
+{
 }

-int AES_128_CTR(unsigned char *output, size_t outputByteLen,
-                const unsigned char *input, size_t inputByteLen) {
+int
+AES_128_CTR(unsigned char *output,
+            size_t outputByteLen,
+            const unsigned char *input,
+            size_t inputByteLen)
+{
    aes128ctx ctx;
-    unsigned char iv[16] = { 0 };
+    const unsigned char iv[16] = { 0 };

    aes128_ctr_keyexp(&ctx, input);
    aes128_ctr(output, outputByteLen, iv, &ctx);
@@ -731,7 +772,9 @@ int AES_128_CTR(unsigned char *output, size_t outputByteLen,
    return (int)outputByteLen;
 }

-void AES_256_ECB(const uint8_t *input, const unsigned char *key, unsigned char *output) {
+void
+AES_256_ECB(const uint8_t *input, const unsigned char *key, unsigned char *output)
+{
    aes256ctx ctx;

    aes256_ecb_keyexp(&ctx, key);
--- a/src/common/ref/include/aes.h
+++ b/src/common/ref/include/aes.h
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef AES_H
+#define AES_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output);
+#define AES_ECB_encrypt AES_256_ECB
+
+#ifdef ENABLE_AESNI
+int AES_128_CTR_NI(unsigned char *output,
+                   size_t outputByteLen,
+                   const unsigned char *input,
+                   size_t inputByteLen);
+int AES_128_CTR_4R_NI(unsigned char *output,
+                      size_t outputByteLen,
+                      const unsigned char *input,
+                      size_t inputByteLen);
+#define AES_128_CTR AES_128_CTR_NI
+#else
+int AES_128_CTR(unsigned char *output,
+                size_t outputByteLen,
+                const unsigned char *input,
+                size_t inputByteLen);
+#endif
+
+#endif
--- a/src/common/ref/randombytes_ctrdrbg.c
+++ b/src/common/ref/randombytes_ctrdrbg.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: Apache-2.0 and Unknown
+//
+/*
+NIST-developed software is provided by NIST as a public service. You may use,
+copy, and distribute copies of the software in any medium, provided that you
+keep intact this entire notice. You may improve, modify, and create derivative
+works of the software or any portion of the software, and you may copy and
+distribute such modifications or works. Modified works should carry a notice
+stating that you changed the software and should note the date and nature of any
+such change. Please explicitly acknowledge the National Institute of Standards
+and Technology as the source of the software.
+
+NIST-developed software is expressly provided "AS IS." NIST MAKES NO WARRANTY OF
+ANY KIND, EXPRESS, IMPLIED, IN FACT, OR ARISING BY OPERATION OF LAW, INCLUDING,
+WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE, NON-INFRINGEMENT, AND DATA ACCURACY. NIST NEITHER REPRESENTS
+NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE UNINTERRUPTED OR
+ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST DOES NOT WARRANT OR MAKE
+ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE OR THE RESULTS THEREOF,
+INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY, RELIABILITY, OR
+USEFULNESS OF THE SOFTWARE.
+
+You are solely responsible for determining the appropriateness of using and
+distributing the software and you assume all risks associated with its use,
+including but not limited to the risks and costs of program errors, compliance
+with applicable laws, damage to or loss of data, programs or equipment, and the
+unavailability or interruption of operation. This software is not intended to be
+used in any situation where a failure could cause risk of injury or damage to
+property. The software developed by NIST employees is not subject to copyright
+protection within the United States.
+*/
+
+#include <rng.h>
+#include <string.h>
+
+#include <aes.h>
+
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+#define RNG_SUCCESS 0
+#define RNG_BAD_MAXLEN -1
+#define RNG_BAD_OUTBUF -2
+#define RNG_BAD_REQ_LEN -3
+
+static inline void AES256_ECB(const unsigned char *key, // wrapper taking (key, input, output) order
+                              const unsigned char *ctr, unsigned char *buffer) {
+  AES_ECB_encrypt(ctr, key, buffer); // AES_ECB_encrypt is #defined in aes.h to AES_256_ECB(input, key, output)
+}
+
+typedef struct {
+  unsigned char Key[32];
+  unsigned char V[16];
+  int reseed_counter;
+} AES256_CTR_DRBG_struct;
+
+void AES256_CTR_DRBG_Update(const unsigned char *provided_data,
+                            unsigned char *Key, unsigned char *V);
+
+AES256_CTR_DRBG_struct DRBG_ctx;
+
+#ifndef CTRDRBG_TEST_BENCH
+static // internal linkage unless built with CTRDRBG_TEST_BENCH defined
+#endif
+    void
+    randombytes_init_nist(unsigned char *entropy_input, // exactly 48 bytes are read from this buffer
+                          unsigned char *personalization_string, // optional; when non-NULL, 48 bytes are read
+                          int security_strength) {
+  unsigned char seed_material[48];
+
+  (void)security_strength; // Unused parameter
+  memcpy(seed_material, entropy_input, 48);
+  if (personalization_string)
+    for (int i = 0; i < 48; i++) {
+      seed_material[i] ^= personalization_string[i]; // XOR the personalization string into the seed
+    }
+  memset(DRBG_ctx.Key, 0x00, 32); // reset the global state to all-zero Key and V
+  memset(DRBG_ctx.V, 0x00, 16);
+  AES256_CTR_DRBG_Update(seed_material, DRBG_ctx.Key, DRBG_ctx.V); // derive the initial Key/V from the seed
+  DRBG_ctx.reseed_counter = 1;
+}
+
+#ifndef CTRDRBG_TEST_BENCH
+static
+#endif
+    int
+    randombytes_nist(unsigned char *x, size_t xlen) { // writes xlen bytes to x using the global DRBG_ctx
+  unsigned char block[16];
+  size_t i = 0; // write offset into x
+
+  while (xlen > 0) {
+    // increment V
+    for (int j = 15; j >= 0; j--) { // carry propagates from the last byte of V towards the first
+      if (DRBG_ctx.V[j] == 0xff) {
+        DRBG_ctx.V[j] = 0x00;
+      } else {
+        DRBG_ctx.V[j]++;
+        break;
+      }
+    }
+    AES256_ECB(DRBG_ctx.Key, DRBG_ctx.V, block); // next 16 output bytes: V processed under Key
+    if (xlen > 15) {
+      memcpy(x + i, block, 16);
+      i += 16;
+      xlen -= 16;
+    } else {
+      memcpy(x + i, block, xlen); // final partial block: copy only the bytes still requested
+      i += xlen;
+      xlen = 0;
+    }
+  }
+  AES256_CTR_DRBG_Update(NULL, DRBG_ctx.Key, DRBG_ctx.V); // replace Key/V after every request
+  DRBG_ctx.reseed_counter++;
+
+  return 0; // always 0 (same value as RNG_SUCCESS); this is a status code, not a byte count
+}
+
+void AES256_CTR_DRBG_Update(const unsigned char *provided_data, // optional 48 bytes XORed into the new state; may be NULL
+                            unsigned char *Key, unsigned char *V) { // 32-byte Key and 16-byte V, both overwritten
+  unsigned char temp[48];
+
+  for (int i = 0; i < 3; i++) { // three 16-byte blocks fill the 48-byte temp buffer
+    // increment V
+    for (int j = 15; j >= 0; j--) {
+      if (V[j] == 0xff) {
+        V[j] = 0x00;
+      } else {
+        V[j]++;
+        break;
+      }
+    }
+
+    AES256_ECB(Key, V, temp + 16 * i);
+  }
+  if (provided_data != NULL)
+    for (int i = 0; i < 48; i++) {
+      temp[i] ^= provided_data[i];
+    }
+  memcpy(Key, temp, 32); // first 32 bytes of temp become the new Key
+  memcpy(V, temp + 32, 16); // remaining 16 bytes become the new V
+}
+
+#ifdef RANDOMBYTES_C
+SQISIGN_API
+int randombytes(unsigned char *random_array, unsigned long long nbytes) { // public entry point: fills random_array[0..nbytes)
+  int ret = randombytes_nist(random_array, nbytes); // status code; randombytes_nist returns 0 unconditionally
+#ifdef ENABLE_CT_TESTING
+  VALGRIND_MAKE_MEM_UNDEFINED(random_array, nbytes); // mark all nbytes of output; `ret` is a status (0), not a length
+#endif
+  return ret;
+}
+
+SQISIGN_API
+void randombytes_init(unsigned char *entropy_input,
+                      unsigned char *personalization_string,
+                      int security_strength) {
+  randombytes_init_nist(entropy_input, personalization_string, // public wrapper: forwards all arguments unchanged
+                        security_strength);
+}
+#endif
--- a/src/ec/ref/CMakeLists.txt
+++ b/src/ec/ref/CMakeLists.txt
@@ -1,3 +1,3 @@
-set(ECX_DIR ${CMAKE_CURRENT_SOURCE_DIR}/ecx)
+set(LVLX_DIR ${CMAKE_CURRENT_SOURCE_DIR}/lvlx)

 include(${SELECT_SQISIGN_VARIANT})
--- a/src/ec/ref/ecx/basis.c
+++ b/src/ec/ref/ecx/basis.c
@@ -1,508 +0,0 @@
-#include "isog.h"
-
-
-static void xTPL(ec_point_t* Q, const ec_point_t* P, const ec_point_t* A3)
-{
-    /* ----------------------------------------------------------------------------- *
-     * Differential point tripling given the montgomery coefficient A3 = (A+2C:A-2C)
-     * ----------------------------------------------------------------------------- */
-     
-    fp2_t t0, t1, t2, t3, t4;
-    fp2_sub(&t0, &P->x, &P->z);
-    fp2_sqr(&t2, &t0);
-    fp2_add(&t1, &P->x, &P->z);
-    fp2_sqr(&t3, &t1);
-    fp2_add(&t4, &t1, &t0);
-    fp2_sub(&t0, &t1, &t0);
-    fp2_sqr(&t1, &t4);
-    fp2_sub(&t1, &t1, &t3);
-    fp2_sub(&t1, &t1, &t2);
-    fp2_mul(&Q->x, &t3, &A3->x);
-    fp2_mul(&t3, &Q->x, &t3);
-    fp2_mul(&Q->z, &t2, &A3->z);
-    fp2_mul(&t2, &t2, &Q->z);
-    fp2_sub(&t3, &t2, &t3);
-    fp2_sub(&t2, &Q->x, &Q->z);
-    fp2_mul(&t1, &t2, &t1);
-    fp2_add(&t2, &t3, &t1);
-    fp2_sqr(&t2, &t2);
-    fp2_mul(&Q->x, &t2, &t4);
-    fp2_sub(&t1, &t3, &t1);
-    fp2_sqr(&t1, &t1);
-    fp2_mul(&Q->z, &t1, &t0);
-}
-
-int ec_is_on_curve(const ec_curve_t* curve, const ec_point_t* P){
-
-    fp2_t t0, t1, t2;
-
-    // Check if xz*(C^2x^2+zACx+z^2C^2) is a square
-    fp2_mul(&t0, &curve->C, &P->x); 
-    fp2_mul(&t1, &t0, &P->z);       
-    fp2_mul(&t1, &t1, &curve->A);   
-    fp2_mul(&t2, &curve->C, &P->z); 
-    fp2_sqr(&t0, &t0);              
-    fp2_sqr(&t2, &t2);              
-    fp2_add(&t0, &t0, &t1);
-    fp2_add(&t0, &t0, &t2);
-    fp2_mul(&t0, &t0, &P->x);
-    fp2_mul(&t0, &t0, &P->z);
-    return fp2_is_square(&t0);
-}
-
-static void difference_point(ec_point_t* PQ, const ec_point_t* P, const ec_point_t* Q, const ec_curve_t* curve){
-    // Given P,Q in affine x-only, computes a deterministic choice for (P-Q)
-    // The points must be normalized to z=1 and the curve to C=1
-
-    fp2_t t0, t1, t2, t3;
-    
-    fp2_sub(&PQ->z, &P->x, &Q->x);  // P - Q
-    fp2_mul(&t2, &P->x, &Q->x);     // P*Q
-    fp_mont_setone(t1.re);
-    fp_set(t1.im, 0);
-    fp2_sub(&t3, &t2, &t1);         // P*Q-1
-    fp2_mul(&t0, &PQ->z, &t3);      // (P-Q)*(P*Q-1)
-    fp2_sqr(&PQ->z, &PQ->z);        // (P-Q)^2
-    fp2_sqr(&t0, &t0);              // (P-Q)^2*(P*Q-1)^2
-    fp2_add(&t1, &t2, &t1);         // P*Q+1
-    fp2_add(&t3, &P->x, &Q->x);     // P+Q
-    fp2_mul(&t1, &t1, &t3);         // (P+Q)*(P*Q+1)
-    fp2_mul(&t2, &t2, &curve->A);   // A*P*Q
-    fp2_add(&t2, &t2, &t2);         // 2*A*P*Q
-    fp2_add(&t1, &t1, &t2);         // (P+Q)*(P*Q+1) + 2*A*P*Q
-    fp2_sqr(&t2, &t1);              // ((P+Q)*(P*Q+1) + 2*A*P*Q)^2
-    fp2_sub(&t0, &t2, &t0);         // ((P+Q)*(P*Q+1) + 2*A*P*Q)^2 - (P-Q)^2*(P*Q-1)^2
-    fp2_sqrt(&t0);
-    fp2_add(&PQ->x, &t0, &t1);
-}
-
-void ec_curve_to_basis_2(ec_basis_t *PQ2, const ec_curve_t *curve){
-    fp2_t x, t0, t1, t2;
-    ec_point_t P, Q, Q2, P2, A24;
-
-    // Curve coefficient in the form A24 = (A+2C:4C)
-    fp2_add(&A24.z, &curve->C, &curve->C);
-    fp2_add(&A24.x, &curve->A, &A24.z);
-    fp2_add(&A24.z, &A24.z, &A24.z);
-
-    fp_mont_setone(x.re);
-    fp_set(x.im, 0);
-
-    // Find P
-    while(1){
-        fp_add(x.im, x.re, x.im);
-
-        // Check if point is rational
-        fp2_sqr(&t0, &curve->C);
-        fp2_mul(&t1, &t0, &x);
-        fp2_mul(&t2, &curve->A, &curve->C);
-        fp2_add(&t1, &t1, &t2);
-        fp2_mul(&t1, &t1, &x);
-        fp2_add(&t1, &t1, &t0);
-        fp2_mul(&t1, &t1, &x);
-        if(fp2_is_square(&t1)){
-            fp2_copy(&P.x, &x);
-            fp_mont_setone(P.z.re);
-            fp_set(P.z.im, 0);
-        }
-        else
-            continue;
-
-        // Clear odd factors from the order
-        xMULv2(&P, &P, p_cofactor_for_2f, P_COFACTOR_FOR_2F_BITLENGTH, &A24);
-
-        // Check if point has order 2^f
-        copy_point(&P2, &P);
-        for(int i = 0; i < POWER_OF_2 - 1; i++)
-            xDBLv2(&P2, &P2, &A24);
-        if(ec_is_zero(&P2))
-            continue;
-        else
-            break;
-    }
-    
-    // Find Q
-    while(1){
-        fp_add(x.im, x.re, x.im);
-
-        // Check if point is rational
-        fp2_sqr(&t0, &curve->C);
-        fp2_mul(&t1, &t0, &x);
-        fp2_mul(&t2, &curve->A, &curve->C);
-        fp2_add(&t1, &t1, &t2);
-        fp2_mul(&t1, &t1, &x);
-        fp2_add(&t1, &t1, &t0);
-        fp2_mul(&t1, &t1, &x);
-        if(fp2_is_square(&t1)){
-            fp2_copy(&Q.x, &x);
-            fp_mont_setone(Q.z.re);
-            fp_set(Q.z.im, 0);
-        }
-        else
-            continue;
-
-        // Clear odd factors from the order
-        xMULv2(&Q, &Q, p_cofactor_for_2f, P_COFACTOR_FOR_2F_BITLENGTH, &A24);
-
-        // Check if point has order 2^f
-        copy_point(&Q2, &Q);
-        for(int i = 0; i < POWER_OF_2 - 1; i++)
-            xDBLv2(&Q2, &Q2, &A24);
-        if(ec_is_zero(&Q2))
-            continue;
-
-        // Check if point is orthogonal to P
-        if(is_point_equal(&P2, &Q2))
-            continue;
-        else
-            break;
-    }
-
-    // Normalize points
-    ec_curve_t E;
-    fp2_mul(&t0, &P.z, &Q.z);
-    fp2_mul(&t1, &t0, &curve->C);
-    fp2_inv(&t1);
-    fp2_mul(&P.x, &P.x, &t1);
-    fp2_mul(&Q.x, &Q.x, &t1);
-    fp2_mul(&E.A, &curve->A, &t1);
-    fp2_mul(&P.x, &P.x, &Q.z);
-    fp2_mul(&P.x, &P.x, &curve->C);
-    fp2_mul(&Q.x, &Q.x, &P.z);
-    fp2_mul(&Q.x, &Q.x, &curve->C);
-    fp2_mul(&E.A, &E.A, &t0);
-    fp_mont_setone(P.z.re);
-    fp_set(P.z.im, 0);
-    fp2_copy(&Q.z, &P.z);
-    fp2_copy(&E.C, &P.z);
-
-    // Compute P-Q
-    difference_point(&PQ2->PmQ, &P, &Q, &E);
-    copy_point(&PQ2->P, &P);
-    copy_point(&PQ2->Q, &Q);
-}
-
-
-void ec_complete_basis_2(ec_basis_t* PQ2, const ec_curve_t* curve, const ec_point_t* P){
-
-    fp2_t x, t0, t1, t2;
-    ec_point_t Q, Q2, P2, A24;
-
-    // Curve coefficient in the form A24 = (A+2C:4C)
-    fp2_add(&A24.z, &curve->C, &curve->C);
-    fp2_add(&A24.x, &curve->A, &A24.z);
-    fp2_add(&A24.z, &A24.z, &A24.z);
-
-    // Point of order 2 generated by P
-    copy_point(&P2, P);
-    for(int i = 0; i < POWER_OF_2 - 1; i++)
-        xDBLv2(&P2, &P2, &A24);
-
-    // Find Q
-    fp_mont_setone(x.re);
-    fp_set(x.im, 0);
-    while(1){
-        fp_add(x.im, x.re, x.im);
-
-        // Check if point is rational
-        fp2_sqr(&t0, &curve->C);
-        fp2_mul(&t1, &t0, &x);
-        fp2_mul(&t2, &curve->A, &curve->C);
-        fp2_add(&t1, &t1, &t2);
-        fp2_mul(&t1, &t1, &x);
-        fp2_add(&t1, &t1, &t0);
-        fp2_mul(&t1, &t1, &x);
-        if(fp2_is_square(&t1)){
-            fp2_copy(&Q.x, &x);
-            fp_mont_setone(Q.z.re);
-            fp_set(Q.z.im, 0);
-        }
-        else
-            continue;
-
-        // Clear odd factors from the order
-        xMULv2(&Q, &Q, p_cofactor_for_2f, (int)P_COFACTOR_FOR_2F_BITLENGTH, &A24);
-
-        // Check if point has order 2^f
-        copy_point(&Q2, &Q);
-        for(int i = 0; i < POWER_OF_2 - 1; i++)
-            xDBLv2(&Q2, &Q2, &A24);
-        if(ec_is_zero(&Q2))
-            continue;
-
-        // Check if point is orthogonal to P
-        if(is_point_equal(&P2, &Q2))
-            continue;
-        else
-            break;
-    }
-
-    // Normalize points
-    ec_curve_t E;
-    ec_point_t PP;
-    fp2_mul(&t0, &P->z, &Q.z);
-    fp2_mul(&t1, &t0, &curve->C);
-    fp2_inv(&t1);
-    fp2_mul(&PP.x, &P->x, &t1);
-    fp2_mul(&Q.x, &Q.x, &t1);
-    fp2_mul(&E.A, &curve->A, &t1);
-    fp2_mul(&PP.x, &PP.x, &Q.z);
-    fp2_mul(&PP.x, &PP.x, &curve->C);
-    fp2_mul(&Q.x, &Q.x, &P->z);
-    fp2_mul(&Q.x, &Q.x, &curve->C);
-    fp2_mul(&E.A, &E.A, &t0);
-    fp_mont_setone(PP.z.re);
-    fp_set(PP.z.im, 0);
-    fp2_copy(&Q.z, &PP.z);
-    fp2_copy(&E.C, &PP.z);
-
-    // Compute P-Q
-    difference_point(&PQ2->PmQ, &PP, &Q, &E);
-    copy_point(&PQ2->P, &PP);
-    copy_point(&PQ2->Q, &Q);
-}
-
-void ec_curve_to_basis_3(ec_basis_t* PQ3, const ec_curve_t* curve){
-
-    fp2_t x, t0, t1, t2;
-    ec_point_t P, Q, Q3, P3, A24, A3;
-
-    // Curve coefficient in the form A24 = (A+2C:4C)
-    fp2_add(&A24.z, &curve->C, &curve->C);
-    fp2_add(&A24.x, &curve->A, &A24.z);
-    fp2_add(&A24.z, &A24.z, &A24.z);
-
-    // Curve coefficient in the form A3 = (A+2C:A-2C)
-    fp2_sub(&A3.z, &A24.x, &A24.z);
-    fp2_copy(&A3.x, &A24.x);
-
-    fp_mont_setone(x.re);
-    fp_set(x.im, 0);
-
-    // Find P
-    while(1){
-        fp_add(x.im, x.re, x.im);
-
-        // Check if point is rational
-        fp2_sqr(&t0, &curve->C);
-        fp2_mul(&t1, &t0, &x);
-        fp2_mul(&t2, &curve->A, &curve->C);
-        fp2_add(&t1, &t1, &t2);
-        fp2_mul(&t1, &t1, &x);
-        fp2_add(&t1, &t1, &t0);
-        fp2_mul(&t1, &t1, &x);
-        if(fp2_is_square(&t1)){
-            fp2_copy(&P.x, &x);
-            fp_mont_setone(P.z.re);
-            fp_set(P.z.im, 0);
-        }
-        else
-            continue;
-
-        // Clear non-3 factors from the order
-        xMULv2(&P, &P, p_cofactor_for_3g, (int)P_COFACTOR_FOR_3G_BITLENGTH, &A24);
-
-        // Check if point has order 3^g
-        copy_point(&P3, &P);
-        for(int i = 0; i < POWER_OF_3 - 1; i++)
-            xTPL(&P3, &P3, &A3);
-        if(ec_is_zero(&P3))
-            continue;
-        else
-            break;
-    }
-    
-    // Find Q
-    while(1){
-        fp_add(x.im, x.re, x.im);
-
-        // Check if point is rational
-        fp2_sqr(&t0, &curve->C);
-        fp2_mul(&t1, &t0, &x);
-        fp2_mul(&t2, &curve->A, &curve->C);
-        fp2_add(&t1, &t1, &t2);
-        fp2_mul(&t1, &t1, &x);
-        fp2_add(&t1, &t1, &t0);
-        fp2_mul(&t1, &t1, &x);
-        if(fp2_is_square(&t1)){
-            fp2_copy(&Q.x, &x);
-            fp_mont_setone(Q.z.re);
-            fp_set(Q.z.im, 0);
-        }
-        else
-            continue;
-
-        // Clear non-3 factors from the order
-        xMULv2(&Q, &Q, p_cofactor_for_3g, (int)P_COFACTOR_FOR_3G_BITLENGTH, &A24);
-
-        // Check if point has order 3^g
-        copy_point(&Q3, &Q);
-        for(int i = 0; i < POWER_OF_3 - 1; i++)
-            xTPL(&Q3, &Q3, &A3);
-        if(ec_is_zero(&Q3))
-            continue;
-
-        // Check if point is orthogonal to P
-        if(is_point_equal(&P3, &Q3))
-            continue;
-        xDBLv2(&P3, &P3, &A24);
-        if(is_point_equal(&P3, &Q3))
-            continue;
-        else
-            break;
-    }
-
-    // Normalize points
-    ec_curve_t E;
-    fp2_mul(&t0, &P.z, &Q.z);
-    fp2_mul(&t1, &t0, &curve->C);
-    fp2_inv(&t1);
-    fp2_mul(&P.x, &P.x, &t1);
-    fp2_mul(&Q.x, &Q.x, &t1);
-    fp2_mul(&E.A, &curve->A, &t1);
-    fp2_mul(&P.x, &P.x, &Q.z);
-    fp2_mul(&P.x, &P.x, &curve->C);
-    fp2_mul(&Q.x, &Q.x, &P.z);
-    fp2_mul(&Q.x, &Q.x, &curve->C);
-    fp2_mul(&E.A, &E.A, &t0);
-    fp_mont_setone(P.z.re);
-    fp_set(P.z.im, 0);
-    fp2_copy(&Q.z, &P.z);
-    fp2_copy(&E.C, &P.z);
-
-    // Compute P-Q
-    difference_point(&PQ3->PmQ, &P, &Q, &E);
-    copy_point(&PQ3->P, &P);
-    copy_point(&PQ3->Q, &Q);
-}
-
-void ec_curve_to_basis_6(ec_basis_t* PQ6, const ec_curve_t* curve){
-
-    fp2_t x, t0, t1, t2;
-    ec_point_t P, Q, Q6, P6, R, T, A24, A3;
-
-    // Curve coefficient in the form A24 = (A+2C:4C)
-    fp2_add(&A24.z, &curve->C, &curve->C);
-    fp2_add(&A24.x, &curve->A, &A24.z);
-    fp2_add(&A24.z, &A24.z, &A24.z);
-
-    // Curve coefficient in the form A3 = (A+2C:A-2C)
-    fp2_sub(&A3.z, &A24.x, &A24.z);
-    fp2_copy(&A3.x, &A24.x);
-
-    fp_mont_setone(x.re);
-    fp_set(x.im, 0);
-
-    // Find P
-    while(1){
-        fp_add(x.im, x.re, x.im);
-
-        // Check if point is rational
-        fp2_sqr(&t0, &curve->C);
-        fp2_mul(&t1, &t0, &x);
-        fp2_mul(&t2, &curve->A, &curve->C);
-        fp2_add(&t1, &t1, &t2);
-        fp2_mul(&t1, &t1, &x);
-        fp2_add(&t1, &t1, &t0);
-        fp2_mul(&t1, &t1, &x);
-        if(fp2_is_square(&t1)){
-            fp2_copy(&P.x, &x);
-            fp_mont_setone(P.z.re);
-            fp_set(P.z.im, 0);
-        }
-        else
-            continue;
-
-        // Clear non-2 factors and non-3 factors from the order
-        xMULv2(&P, &P, p_cofactor_for_6fg, (int)P_COFACTOR_FOR_6FG_BITLENGTH, &A24);
-
-        // Check if point has order 2^f*3^g
-        copy_point(&P6, &P);
-        for(int i = 0; i < POWER_OF_2 - 1; i++)
-            xDBLv2(&P6, &P6, &A24);
-        for(int i = 0; i < POWER_OF_3 - 1; i++)
-            xTPL(&P6, &P6, &A3);
-        if(ec_is_zero(&P6))
-            continue;
-        xDBLv2(&T, &P6, &A24);
-        if (ec_is_zero(&T))
-            continue;
-        xTPL(&T, &P6, &A3);
-        if (ec_is_zero(&T))
-            continue;
-        break;
-    }
-
-    // Find Q
-    while(1){
-        fp_add(x.im, x.re, x.im);
-
-        // Check if point is rational
-        fp2_sqr(&t0, &curve->C);
-        fp2_mul(&t1, &t0, &x);
-        fp2_mul(&t2, &curve->A, &curve->C);
-        fp2_add(&t1, &t1, &t2);
-        fp2_mul(&t1, &t1, &x);
-        fp2_add(&t1, &t1, &t0);
-        fp2_mul(&t1, &t1, &x);
-        if(fp2_is_square(&t1)){
-            fp2_copy(&Q.x, &x);
-            fp_mont_setone(Q.z.re);
-            fp_set(Q.z.im, 0);
-        }
-        else
-            continue;
-
-        // Clear non-6 factors from the order
-        xMULv2(&Q, &Q, p_cofactor_for_6fg, (int)P_COFACTOR_FOR_6FG_BITLENGTH, &A24);
-
-        // Check first if point has order 2^f*3^g
-        copy_point(&Q6, &Q);
-        for(int i = 0; i < POWER_OF_2 - 1; i++)
-            xDBLv2(&Q6, &Q6, &A24);
-        for(int i = 0; i < POWER_OF_3 - 1; i++)
-            xTPL(&Q6, &Q6, &A3);
-        if(ec_is_zero(&Q6))
-            continue;
-        xDBLv2(&T, &Q6, &A24);
-        if (ec_is_zero(&T))
-            continue;
-        xTPL(&T, &Q6, &A3);
-        if (ec_is_zero(&T))
-            continue;
-
-        // Check if point P is independent from point Q
-        xTPL(&R, &P6, &A3);
-        xTPL(&T, &Q6, &A3);
-        if(is_point_equal(&R, &T))
-            continue;
-        xDBLv2(&R, &P6, &A24);
-        xDBLv2(&T, &Q6, &A24);
-        if(is_point_equal(&R, &T))
-            continue;
-        break;
-    }
-
-    // Normalize points
-    ec_curve_t E;
-    fp2_mul(&t0, &P.z, &Q.z);
-    fp2_mul(&t1, &t0, &curve->C);
-    fp2_inv(&t1);
-    fp2_mul(&P.x, &P.x, &t1);
-    fp2_mul(&Q.x, &Q.x, &t1);
-    fp2_mul(&E.A, &curve->A, &t1);
-    fp2_mul(&P.x, &P.x, &Q.z);
-    fp2_mul(&P.x, &P.x, &curve->C);
-    fp2_mul(&Q.x, &Q.x, &P.z);
-    fp2_mul(&Q.x, &Q.x, &curve->C);
-    fp2_mul(&E.A, &E.A, &t0);
-    fp_mont_setone(P.z.re);
-    fp_set(P.z.im, 0);
-    fp2_copy(&Q.z, &P.z);
-    fp2_copy(&E.C, &P.z);
-
-    // Compute P-Q
-    difference_point(&PQ6->PmQ, &P, &Q, &E);
-    copy_point(&PQ6->P, &P);
-    copy_point(&PQ6->Q, &Q);
-}
--- a/src/ec/ref/ecx/ec.c
+++ b/src/ec/ref/ecx/ec.c
--- a/src/ec/ref/ecx/fp2-test.c
+++ b/src/ec/ref/ecx/fp2-test.c
@@ -1,90 +0,0 @@
-#include <assert.h>
-#include <time.h>
-#include <stdio.h>
-#include "../generic/include/fp2_tmp.h"
-
-int main()
-{
-	fp2_t fp2_0, fp2_1;
-	// ------------
-	fp2_set0(fp2_0);
-	fp2_set1(fp2_1);
-	// ------------
-
-	int i;
-	fp2_t a, b, c, d;
-	fp_t e;
-
-	for (i = 0; i < 1024; i++)
-	{
-		printf("[%3d%%] Testing fp2_t arithmetic", 100 * i / (int)1024);
-		fflush(stdout);
-		printf("\r\x1b[K");
-                
-		// Random elements of fp
-		fp2_random(a);
-		fp2_random(b);
-		fp2_copy(c, a);
-		c.re[0] += 1;
-		fp2_copy(d, b);
-		d.re[0] -= 1;
-
-		assert(fp2_isequal(a,b) == 0);		// different values check --> (a != b)
-		assert(fp2_isequal(c,c) == 1);		// equal values check --> 1 (c == c)
-
-		// Testing neg
-		fp2_set0(b);
-		fp2_copy(c, a);
-		fp2_neg(a, a);
-		fp2_sub(c, b, c);
-		assert(fp2_isequal(a,c) == 1);
-
-		fp2_set1(a);	// Now a == 1
-		fp2_set0(b);	// Now b == 0
-
-		assert(fp2_is_zero(a) == 0);
-		assert(fp2_is_zero(b) == 1);
-
-		// testing c - c
-		fp2_sub(d, c, c);
-		assert(fp2_is_zero(d) == 1);
-
-		// tetsing c * 0
-		fp2_mul(d, c, b);
-		assert(fp2_is_zero(d) == 1);
-
-		// tetsing c * 1 ... recall, in Montgomery domain R mod p plays the role of the 1
-		fp2_set1(a);
-		fp2_mul(d, c, a);
-		assert(fp2_isequal(d, c) == 1);
-
-		// fp_set(e, 1);	// Now e == 1
-		// fp2_pow(d, e, c);
-		// assert(fp2_isequal(d, c) == 1);
-		
-		// fp_set(e, 0);	// Now e == 0
-		// fp2_pow(d, e, c);
-		// assert(fp2_isone(d) == 1);
-
-		// fp2_set(a, 1);	// Now e == R mod p
-		// fp_random(e);
-		// fp2_pow(d, e, a);
-		// assert(fp2_isone(d) == 1);
-
-		// Testing 1/a by computing (1/a) x a
-		fp2_random(a);
-		fp2_copy(b, a);
-		fp2_inv(a);
-		fp2_mul(c, a, b);
-		assert(fp2_isone(c) == 1);
-
-		fp2_random(a);
-		fp2_sqr(b, a);
-		assert( fp2_issquare(b) );
-
-	};
-
-	printf("[%2d%%] Tested fp2_t arithmetic:\tNo errors!\n", 100 * i / (int)1024);
-	printf("-- All tests passed.\n");
-	return 0;
-}
--- a/src/ec/ref/ecx/isog_chains.c
+++ b/src/ec/ref/ecx/isog_chains.c
@@ -1,298 +0,0 @@
-#include "isog.h"
-#include <assert.h>
-
-static inline void AC_to_A24(ec_point_t *A24, ec_curve_t const *E)
-{
-    // A24 = (A+2C : 4C)
-    fp2_add(&A24->z, &E->C, &E->C);
-    fp2_add(&A24->x, &E->A, &A24->z);
-    fp2_add(&A24->z, &A24->z, &A24->z);
-}
-
-static inline void A24_to_AC(ec_curve_t *E, ec_point_t const *A24)
-{
-    // (A:C) = ((A+2C)*2-4C : 4C)
-    fp2_add(&E->A, &A24->x, &A24->x);
-    fp2_sub(&E->A, &E->A, &A24->z);
-    fp2_add(&E->A, &E->A, &E->A);
-    fp2_copy(&E->C, &A24->z);
-}
-
-void ec_eval_even(ec_curve_t* image, const ec_isog_even_t* phi,
-    ec_point_t* points, unsigned short length){
-        ec_point_t Q4, Q, A24;
-        copy_point(&Q4, &phi->kernel);
-        AC_to_A24(&A24, &phi->curve);
-        for(int i = 0; i < phi->length - 2; i++)
-            xDBLv2(&Q4, &Q4, &A24);
-        xDBLv2(&Q, &Q4, &A24);
-        if(fp2_is_zero(&Q.x)){
-            xisog_4_singular(&A24, Q4, A24);
-            xeval_4_singular(points, points, length, Q4);
-            xeval_4_singular(&Q, &phi->kernel, 1, Q4);
-        }
-        else{
-            xisog_4(&A24, Q4);
-            xeval_4(points, points, length);
-            xeval_4(&Q, &phi->kernel, 1);
-        }
-        ec_eval_even_strategy(image, points, length, &A24, &Q, phi->length-2);
-    }
-
-void ec_eval_even_nonzero(ec_curve_t* image, const ec_isog_even_t* phi,
-    ec_point_t* points, unsigned short length){
-        ec_point_t Q4, A24;
-        copy_point(&Q4, &phi->kernel);
-        AC_to_A24(&A24, &phi->curve);
-        for(int i = 0; i < phi->length - 2; i++)
-            xDBLv2(&Q4, &Q4, &A24);
-        xisog_4(&A24, Q4);
-        xeval_4(points, points, length);
-        xeval_4(&Q4, &phi->kernel, 1);
-        ec_eval_even_strategy(image, points, length, &A24, &Q4, phi->length-2);
-    }
-
-static void ec_eval_even_strategy(ec_curve_t* image, ec_point_t* points, unsigned short points_len,
-    ec_point_t* A24, const ec_point_t *kernel, const int isog_len){
-    
-    assert(isog_len == POWER_OF_2-2);
-        
-    uint8_t log2_of_e, tmp;
-    fp2_t t0;
-    digit_t e_half = (isog_len)>>1;
-    for(tmp = e_half, log2_of_e = 0; tmp > 0; tmp>>=1, ++log2_of_e);
-    log2_of_e *= 2; // In order to ensure each splits is at most size log2_of_e
-
-    ec_point_t SPLITTING_POINTS[log2_of_e], K2;
-    copy_point(&SPLITTING_POINTS[0], kernel);
-
-    int strategy = 0,    // Current element of the strategy to be used
-    i, j;
-
-    int BLOCK = 0,       // Keeps track of point order
-    current = 0;         // Number of points being carried
-    int XDBLs[log2_of_e]; // Number of doubles performed
-
-    // If walk length is odd, we start with a 2-isogeny
-    if(isog_len & 1){
-        copy_point(&SPLITTING_POINTS[1], &SPLITTING_POINTS[0]);
-        for(i = 0; i < isog_len-1; i++)
-            xDBLv2(&SPLITTING_POINTS[1], &SPLITTING_POINTS[1], A24);
-        xisog_2(A24, SPLITTING_POINTS[1]);
-        xeval_2(SPLITTING_POINTS, SPLITTING_POINTS, 1);
-        xeval_2(points, points, points_len);
-    }
-    
-    // Chain of 4-isogenies
-    for(j = 0; j < (e_half - 1); j++)
-    {   
-        // Get the next point of order 4
-        while (BLOCK != (e_half -  1 - j) )
-        {
-            // A new split will be added
-            current += 1;
-            // We set the seed of the new split to be computed and saved
-            copy_point(&SPLITTING_POINTS[current], &SPLITTING_POINTS[current - 1]);
-            for(i = 0; i < 2*STRATEGY4[strategy]; i++)
-                xDBLv2(&SPLITTING_POINTS[current], &SPLITTING_POINTS[current], A24);
-            XDBLs[current] = STRATEGY4[strategy];  // The number of doublings performed is saved
-            BLOCK += STRATEGY4[strategy];          // BLOCK is increased by the number of doublings performed
-            strategy += 1;                  // Next, we move to the next element of the strategy
-        }
-
-        // Evaluate 4-isogeny
-        xisog_4(A24, SPLITTING_POINTS[current]);
-        xeval_4(SPLITTING_POINTS, SPLITTING_POINTS, current);
-        xeval_4(points, points, points_len);
-
-        BLOCK -= XDBLs[current];  
-        XDBLs[current] = 0;      
-        current -= 1;            
-    }
-
-    // Final 4-isogeny
-    xisog_4(A24, SPLITTING_POINTS[current]);
-    xeval_4(points, points, points_len);
-
-    // Output curve in the form (A:C)
-    A24_to_AC(image, A24);
-}
-
-void ec_eval_odd(ec_curve_t* image, const ec_isog_odd_t* phi,
-        ec_point_t* points, unsigned short length){
-        
-    ec_point_t ker_plus, ker_minus, P, K, A24, B24;
-    int i,j,k;
-
-    AC_to_A24(&A24, &phi->curve);
-
-    // Isogenies with kernel in E[p+1]
-    copy_point(&ker_plus, &phi->ker_plus);
-    copy_point(&ker_minus, &phi->ker_minus);
-    for(i = 0; i < P_LEN; i++){
-        copy_point(&P, &ker_plus);
-        for(j = i+1; j < P_LEN; j++){
-            for(k = 0; k < phi->degree[j]; k++)
-                xMULv2(&P, &P, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A24);
-        }
-        for(k = 0; k < phi->degree[i]; k++){
-            copy_point(&K, &P);
-            for(j = 0; j < phi->degree[i]-k-1; j++)
-                xMULv2(&K, &K, &(TORSION_ODD_PRIMES[i]), p_plus_minus_bitlength[i], &A24);
-            kps(i, K, A24);
-            xisog(&B24, i, A24);
-            xeval(&P, i, P, A24);
-            xeval(&ker_plus, i, ker_plus, A24);
-            xeval(&ker_minus, i, ker_minus, A24);
-            for(j = 0; j < length; j++)
-                xeval(&points[j], i, points[j], A24);
-            copy_point(&A24, &B24);
-            kps_clear(i);
-        }
-    }
-
-    // Isogenies with kernel in E[p-1]
-    for(i = P_LEN; i < P_LEN+M_LEN; i++){
-        copy_point(&P, &ker_minus);
-        for(j = i+1; j < P_LEN+M_LEN; j++){
-            for(k = 0; k < phi->degree[j]; k++)
-                xMULv2(&P, &P, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A24);
-        }
-        for(k = 0; k < phi->degree[i]; k++){
-            copy_point(&K, &P);
-            for(j = 0; j < phi->degree[i]-k-1; j++)
-                xMULv2(&K, &K, &(TORSION_ODD_PRIMES[i]), p_plus_minus_bitlength[i], &A24);
-            kps(i, K, A24);
-            xisog(&B24, i, A24);
-            xeval(&P, i, P, A24);
-            xeval(&ker_minus, i, ker_minus, A24);
-            for(j = 0; j < length; j++)
-                xeval(&points[j], i, points[j], A24);
-            copy_point(&A24, &B24);
-            kps_clear(i);
-        }
-    }
-
-    A24_to_AC(image, &A24);
-}
-
-void ec_curve_normalize(ec_curve_t *new, ec_isom_t *isom, const ec_curve_t *old){
-    fp2_t t0, t1, t2, t3, t4, t5;
-    // Compute the other solutions:
-    // A'^2 = [ sqrt(A^2-4C^2)*(9C^2-A^2) +- (A^3-3AC^2) ] / [ 2C^2*sqrt(A^2-4C^2) ]
-    fp2_sqr(&t0, &old->C);      //C^2
-    fp2_add(&t1, &t0, &t0);     //2C^2
-    fp2_add(&t2, &t1, &t1);     //4C^2
-    fp2_sqr(&t3, &old->A);      //A^2
-    fp2_sub(&t2, &t3, &t2);     //A^2-4C^2
-    fp2_sqrt(&t2);              //sqrt(A^2-4C^2)
-    fp2_add(&t0, &t0, &t1);     //3C^2
-    fp2_mul(&t1, &t2, &t1);     //2C^2*sqrt(A^2-4C^2)
-    fp2_sub(&t5, &t3, &t0);     //A^2-3C^2
-    fp2_mul(&t5, &t5, &old->A);     //A^3-3AC^2
-    fp2_add(&t4, &t0, &t0);     //6C^2
-    fp2_add(&t0, &t4, &t0);     //9C^2
-    fp2_sub(&t0, &t0, &t3);     //9C^2-A^2
-    fp2_add(&t3, &t3, &t3);     //2A^2
-    fp2_mul(&t3, &t3, &t2);     //2A^2*sqrt(A^2-4C^2)
-    fp2_mul(&t2, &t2, &t0);     //sqrt(A^2-4C^2)*(9C^2-A^2)
-    fp2_add(&t0, &t2, &t5);     //sqrt(A^2-4C^2)*(9C^2-A^2) + (A^3-3AC^2)
-    fp2_sub(&t2, &t2, &t5);     //sqrt(A^2-4C^2)*(9C^2-A^2) - (A^3-3AC^2)
-    fp2_inv(&t1);               //1/2C^2*sqrt(A^2-4C^2)
-    fp2_mul(&t0, &t0, &t1);     // First solution
-    fp2_mul(&t2, &t2, &t1);     // Second solution
-    fp2_mul(&t1, &t3, &t1);     // Original solution
-
-    // Chose the lexicographically first solution
-    if(fp2_cmp(&t0, &t1)==1)
-        fp2_copy(&t0, &t1);
-    if(fp2_cmp(&t0, &t2)==1)
-        fp2_copy(&t0, &t2);
-
-    // Copy the solution
-    fp2_sqrt(&t0);
-    ec_curve_t E;
-    fp2_copy(&E.A, &t0);
-    fp_mont_setone(E.C.re);
-    fp_set(E.C.im, 0);
-    ec_isomorphism(isom, old, &E);
-    fp2_copy(&new->A, &E.A);
-    fp2_copy(&new->C, &E.C);
-}
-
-void ec_isomorphism(ec_isom_t* isom, const ec_curve_t* from, const ec_curve_t* to){
-    fp2_t t0, t1, t2, t3, t4;
-    fp2_mul(&t0, &from->A, &to->C);
-    fp2_sqr(&t0, &t0);                  //fromA^2toC^2
-    fp2_mul(&t1, &to->A, &from->C);
-    fp2_sqr(&t1, &t1);                  //toA^2fromC^2
-    fp2_mul(&t2, &to->C, &from->C);
-    fp2_sqr(&t2, &t2);                  //toC^2fromC^2
-    fp2_add(&t3, &t2, &t2);
-    fp2_add(&t2, &t3, &t2);             //3toC^2fromC^2
-    fp2_sub(&t3, &t2, &t0);             //3toC^2fromC^2-fromA^2toC^2
-    fp2_sub(&t4, &t2, &t1);             //3toC^2fromC^2-toA^2fromC^2
-    fp2_inv(&t3);
-    fp2_mul(&t4, &t4, &t3);
-    fp2_sqrt(&t4);                      //lambda^2 constant for SW isomorphism
-    fp2_sqr(&t3, &t4);
-    fp2_mul(&t3, &t3, &t4);             //lambda^6
-
-    // Check sign of lambda^2, such that lambda^6 has the right sign
-    fp2_sqr(&t0, &from->C);
-    fp2_add(&t1, &t0, &t0);
-    fp2_add(&t1, &t1, &t1);
-    fp2_add(&t1, &t1, &t1);
-    fp2_add(&t0, &t0, &t1); // 9fromC^2
-    fp2_sqr(&t2, &from->A);
-    fp2_add(&t2, &t2, &t2); // 2fromA^2
-    fp2_sub(&t2, &t2, &t0);
-    fp2_mul(&t2, &t2, &from->A); // -9fromC^2fromA+2fromA^3
-    fp2_sqr(&t0, &to->C);
-    fp2_mul(&t0, &t0, &to->C);
-    fp2_mul(&t2, &t2, &t0);     //toC^3* [-9fromC^2fromA+2fromA^3]
-    fp2_mul(&t3, &t3, &t2);             //lambda^6*(-9fromA+2fromA^3)*toC^3
-    fp2_sqr(&t0, &to->C);
-    fp2_add(&t1, &t0, &t0);
-    fp2_add(&t1, &t1, &t1);
-    fp2_add(&t1, &t1, &t1);
-    fp2_add(&t0, &t0, &t1); // 9toC^2
-    fp2_sqr(&t2, &to->A);
-    fp2_add(&t2, &t2, &t2); // 2toA^2
-    fp2_sub(&t2, &t2, &t0);
-    fp2_mul(&t2, &t2, &to->A); // -9toC^2toA+2toA^3
-    fp2_sqr(&t0, &from->C);
-    fp2_mul(&t0, &t0, &from->C);
-    fp2_mul(&t2, &t2, &t0);     //fromC^3* [-9toC^2toA+2toA^3]
-    if(!fp2_is_equal(&t2, &t3))
-        fp2_neg(&t4, &t4);
-
-    // Mont -> SW -> SW -> Mont
-    fp_mont_setone(t0.re);
-    fp_set(t0.im, 0);
-    fp2_add(&isom->D, &t0, &t0);
-    fp2_add(&isom->D, &isom->D, &t0);
-    fp2_mul(&isom->D, &isom->D, &from->C);
-    fp2_mul(&isom->D, &isom->D, &to->C);
-    fp2_mul(&isom->Nx, &isom->D, &t4);
-    fp2_mul(&t4, &t4, &from->A);
-    fp2_mul(&t4, &t4, &to->C);
-    fp2_mul(&t0, &to->A, &from->C);
-    fp2_sub(&isom->Nz, &t0, &t4);
-}
-
-void ec_iso_inv(ec_isom_t* isom){
-    fp2_t tmp;
-    fp2_copy(&tmp, &isom->D);
-    fp2_copy(&isom->D, &isom->Nx);
-    fp2_copy(&isom->Nx, &tmp);
-    fp2_neg(&isom->Nz, &isom->Nz);
-}
-
-void ec_iso_eval(ec_point_t *P, ec_isom_t* isom){
-    fp2_t tmp;
-    fp2_mul(&P->x, &P->x, &isom->Nx);
-    fp2_mul(&tmp, &P->z, &isom->Nz);
-    fp2_sub(&P->x, &P->x, &tmp);
-    fp2_mul(&P->z, &P->z, &isom->D);
-}
--- a/src/ec/ref/ecx/kps.c
+++ b/src/ec/ref/ecx/kps.c
@@ -1,228 +0,0 @@
-#include "isog.h"
-#include "curve_extras.h"
-#include <assert.h>
-
-int sI, sJ, sK;	// Sizes of each current I, J, and K	
-
-fp2_t I[sI_max][2],		// I plays also as the linear factors of the polynomial h_I(X)
-			EJ_0[sJ_max][3], EJ_1[sJ_max][3];	// To be used in xisog y xeval
-
-ec_point_t J[sJ_max], K[sK_max];		// Finite subsets of the kernel
-fp2_t XZJ4[sJ_max],		// -4* (Xj * Zj) for each j in J, and x([j]P) = (Xj : Zj)
-    rtree_A[(1 << (ceil_log_sI_max+2)) - 1],		// constant multiple of the reciprocal tree computation
-    A0;			// constant multiple of the reciprocal R0
-
-poly ptree_hI[(1 << (ceil_log_sI_max+2)) - 1],		// product tree of h_I(X)
-     rtree_hI[(1 << (ceil_log_sI_max+2)) - 1],		// reciprocal tree of h_I(X)
-     ptree_EJ[(1 << (ceil_log_sJ_max+2)) - 1];		// product tree of E_J(X)
-     
-fp2_t R0[2*sJ_max + 1];		// Reciprocal of h_I(X) required in the scaled remainder tree approach
-
-int deg_ptree_hI[(1 << (ceil_log_sI_max+2)) - 1],	// degree of each noed in the product tree of h_I(X)
-    deg_ptree_EJ[(1 << (ceil_log_sJ_max+2)) - 1];	// degree of each node in the product tree of E_J(X)
-
-fp2_t leaves[sI_max];		// leaves of the remainder tree, which are required in the Resultant computation
-
-// -----------------------------------------------------------
-// -----------------------------------------------------------
-// Traditional Kernel Point computation (KPs)
-
-// Kernel computation required in tye degree-4 isogeny evaluation
-void kps_4(ec_point_t const P)
-{
-	fp2_sub(&K[1].x, &P.x, &P.z);
-	fp2_add(&K[2].x, &P.x, &P.z);
-	fp2_sqr(&K[0].x, &P.z);
-	fp2_add(&K[0].z, &K[0].x, &K[0].x);
-	fp2_add(&K[0].x, &K[0].z, &K[0].z);
-}
-
-void eds2mont(ec_point_t* P)
-{
-	fp2_t t;
-	fp2_add(&t, &(P->z), &(P->x));
-	fp2_sub(&(P->z), &(P->z), &(P->x));
-	fp2_copy(&(P->x), &t);
-}
-
-
-// Differential doubling in Twisted Edwards model
-void ydbl(ec_point_t* Q, ec_point_t* const P, ec_point_t const* A)
-{
-	fp2_t t_0, t_1, X, Z;
-
-	fp2_sqr(&t_0, &(P->x));
-	fp2_sqr(&t_1, &(P->z));
-	fp2_mul(&Z, &(A->z), &t_0);
-	fp2_mul(&X, &Z, &t_1);
-	fp2_sub(&t_1, &t_1, &t_0);
-	fp2_mul(&t_0, &(A->x), &t_1);
-	fp2_add(&Z, &Z, &t_0);
-	fp2_mul(&Z, &Z, &t_1);
-
-	fp2_sub(&(Q->x), &X, &Z);
-	fp2_add(&(Q->z), &X, &Z);
-}
-
-// Differential addition in Twisted Edwards model
-void yadd(ec_point_t* R, ec_point_t* const P, ec_point_t* const Q, ec_point_t* const PQ)
-{
-	fp2_t a, b, c, d, X, Z;
-
-	fp2_mul(&a, &(P->z), &(Q->x));
-	fp2_mul(&b, &(P->x), &(Q->z));
-	fp2_add(&c, &a, &b);
-	fp2_sub(&d, &a, &b);
-	fp2_sqr(&c, &c);
-	fp2_sqr(&d, &d);
-
-	fp2_add(&a, &(PQ->z), &(PQ->x));
-	fp2_sub(&b, &(PQ->z), &(PQ->x));
-	fp2_mul(&X, &b, &c);
-	fp2_mul(&Z, &a, &d);
-
-	fp2_sub(&(R->x), &X, &Z);
-	fp2_add(&(R->z), &X, &Z);
-}
-
-// tvelu formulae
-void kps_t(uint64_t const i, ec_point_t const P, ec_point_t const A)
-{
-	int j;
-	int d = ((int)TORSION_ODD_PRIMES[i] - 1) / 2;
-
-	// Mapping the input point x(P), which belongs to a 
-	// Montogmery curve model, into its Twisted Edwards 
-	// representation y(P)
-	fp2_sub(&K[0].x, &P.x, &P.z);
-	fp2_add(&K[0].z, &P.x, &P.z);
-	ydbl(&K[1], &K[0], &A);				// y([2]P)
-
-	for (j = 2; j < d; j++)
-		yadd(&K[j], &K[j - 1], &K[0], &K[j - 2]);	// y([j+1]P)
-}
-
-// -----------------------------------------------------------
-// -----------------------------------------------------------
-// Kernel Point computation (KPs) used in velu SQRT
-void kps_s(uint64_t const i, ec_point_t const P, ec_point_t const A)
-{
-	// =================================================================================
-	assert(TORSION_ODD_PRIMES[i] > gap);	// Ensuring velusqrt is used for l_i > gap
-	// The optimal bounds must corresponds to sI, sJ, and sK
-
-	sI = sizeI[i];	// Size of I
-	sJ = sizeJ[i];	// Size of J
-	sK = sizeK[i];	// Size of K
-	assert(sI >= sJ);	// Ensuring #I >= #J
-	assert(sK >= 0);	// Recall, it must be that #K >= 0
-	assert(sJ > 1);		// ensuring sI >= sJ > 1
-	// =================================================================================
-	
-	// Now, we can proceed by the general case
-
-	int j;
-
-	// --------------------------------------------------
-	// Computing [j]P for each j in {1, 3, ..., 2*sJ - 1}
-	ec_point_t P2, P4;
-	copy_point(&J[0], &P);				//    x(P)
-	// Next computations are required for allowing the use of the function get_A()
-	fp2_mul(&XZJ4[0], &J[0].x, &J[0].z);					//   Xj*Zj
-	fp2_add(&XZJ4[0], &XZJ4[0], &XZJ4[0]);					//  2Xj*Zj
-	fp2_add(&XZJ4[0], &XZJ4[0], &XZJ4[0]);					//  4Xj*Zj
-	fp2_neg(&XZJ4[0], &XZJ4[0]);					// -4Xj*Zj
-	xDBLv2(&P2, &P, &A);					// x([2]P)
-	xADD(&J[1], &P2, &J[0], &J[0]);			// x([3]P)
-	// Next computations are required for allowing the use of the function get_A()
-	fp2_mul(&XZJ4[1], &J[1].x, &J[1].z);					//   Xj*Zj
-	fp2_add(&XZJ4[1], &XZJ4[1], &XZJ4[1]);					//  2Xj*Zj
-	fp2_add(&XZJ4[1], &XZJ4[1], &XZJ4[1]);					//  4Xj*Zj
-	fp2_neg(&XZJ4[1], &XZJ4[1]);					// -4Xj*Zj
-	for (j = 2; j < sJ; j++)
-	{
-		xADD(&J[j], &J[j - 1], &P2, &J[j - 2]);	// x([2*j + 1]P)
-		// Next computations are required for allowing the use of the function get_A()
-		fp2_mul(&XZJ4[j], &J[j].x, &J[j].z);					//   Xj*Zj
-		fp2_add(&XZJ4[j], &XZJ4[j], &XZJ4[j]);					//  2Xj*Zj
-		fp2_add(&XZJ4[j], &XZJ4[j], &XZJ4[j]);					//  4Xj*Zj
-		fp2_neg(&XZJ4[j], &XZJ4[j]);					// -4Xj*Zj
-	};
-
-	// ----------------------------------------------------------
-	// Computing [i]P for i in { (2*sJ) * (2i + 1) : 0 <= i < sI}
-	// and the linear factors of h_I(W)
-	ec_point_t Q, Q2, tmp1, tmp2;
-	int bhalf_floor= sJ >> 1;
-	int bhalf_ceil = sJ - bhalf_floor;
-	xDBLv2(&P4, &P2, &A);								// x([4]P)
-	swap_points(&P2, &P4, -(uint64_t)(sJ % 2));								// x([4]P) <--- coditional swap ---> x([2]P)
-	xADD(&Q, &J[bhalf_ceil], &J[bhalf_floor - 1], &P2);	// Q := [2b]P
-	swap_points(&P2, &P4, -(uint64_t)(sJ % 2));								// x([4]P) <--- coditional swap ---> x([2]P)
-
-	// .............................................
-	xDBLv2(&Q2, &Q, &A);					// x([2]Q)
-	xADD(&tmp1, &Q2, &Q, &Q);	// x([3]Q)
-	fp2_neg(&I[0][0], &Q.x);
-	fp2_copy(&I[0][1], &Q.z);
-	fp2_neg(&I[1][0], &tmp1.x);
-	fp2_copy(&I[1][1], &tmp1.z);
-	copy_point(&tmp2, &Q);
-	
-	for (j = 2; j < sI; j++){
-		xADD(&tmp2, &tmp1, &Q2, &tmp2);	// x([2*j + 1]Q)
-		fp2_neg(&I[j][0], &tmp2.x);
-		fp2_copy(&I[j][1], &tmp2.z);
-		swap_points(&tmp1, &tmp2, -(uint64_t)1);
-	}
-
-
-	// ----------------------------------------------------------------
-	// Computing [k]P for k in { 4*sJ*sI + 1, ..., l - 6, l - 4, l - 2}
-	// In order to avoid BRANCHES we make allways copy in K[0] and K[1]
-	// by assuming that these entries are only used when sK >= 1 and 
-	// sK >= 2, respectively.
-
-	//if (sK >= 1)
-	copy_point(&K[0], &P2);				//       x([l - 2]P) = x([2]P)
-	//if (sK >= 2)
-	copy_point(&K[1], &P4);				//       x([l - 4]P) = x([4]P)
-	
-	for (j = 2; j < sK; j++)
-		xADD(&K[j], &K[j - 1], &P2, &K[j - 2]);	// x([l - 2*(j+1)]P) = x([2 * (j+1)]P)
-
-	// ----------------------------------------------------------------
-	//                   ~~~~~~~~               ~~~~~~~~
-	//                    |    |                 |    |
-	// Computing h_I(W) = |    | (W - x([i]P)) = |    | (Zi * W - Xi) / Zi where x([i]P) = Xi/Zi
-	//                    i in I                 i in I
-	// In order to avoid costly inverse computations in fp, we are gonna work with projective coordinates
-
-	product_tree_LENFeq2(ptree_hI, deg_ptree_hI, 0, I, sI);				// Product tree of hI
-	if (!scaled)
-	{
-		// (unscaled) remainder tree approach
-		reciprocal_tree(rtree_hI, rtree_A, 2*sJ + 1, ptree_hI, deg_ptree_hI, 0, sI);	// Reciprocal tree of hI
-	}
-	else
-	{
-		// scaled remainder tree approach
-		fp2_t f_rev[sI_max + 1];
-		for (j = 0; j < (sI + 1); j++)
-			fp2_copy(&f_rev[j], &ptree_hI[0][sI - j]);
-
-		if (sI > (2*sJ - sI + 1))
-			reciprocal(R0, &A0, f_rev, sI + 1, sI);
-		else
-			reciprocal(R0, &A0, f_rev, sI + 1, 2*sJ - sI + 1);
-	};
-}
-
-void kps_clear(int i){
-		if (TORSION_ODD_PRIMES[i] > gap)
-		{
-			if (!scaled)
-				clear_tree(rtree_hI, 0, sizeI[i]);
-			clear_tree(ptree_hI, 0, sizeI[i]);
-		}
-}
--- a/src/ec/ref/ecx/poly-mul.c
+++ b/src/ec/ref/ecx/poly-mul.c
--- a/src/ec/ref/ecx/poly-redc.c
+++ b/src/ec/ref/ecx/poly-redc.c
@@ -1,349 +0,0 @@
-#define _POLY_MUL_REDC_H_
-#include "poly.h"
-#include <assert.h>
-
-void reciprocal(poly h, fp2_t *c, const poly f, const int lenf, const int n){
-  
-  // Writes a polynomial to h and a field element to c such that f*h = c mod x^n
-  // REQUIRES h to have space for n terms
-  // NOT responsible for terms in h beyond h[n-1]
-
-  int i;
-
-  // Case when f needs to be padded with zeroes
-  if(n > lenf)
-  {
-    fp2_t fpad[n];
-    for(i = 0; i < lenf; i++)
-      fp2_copy(&fpad[i], &f[i]);
-    for(i = lenf; i < n; i++)
-      fp2_set(&fpad[i], 0);
-    reciprocal(h, c, fpad, n, n);
-    return;
-  }
-
-  // Trivial case
-  if(n == 0)
-  {
-    fp2_set(&*c, 0);
-    return;
-  }
-
-  // Case n = 1
-  if(n == 1)
-  {
-    fp2_copy(&*c, &f[0]);
-    fp_mont_setone(h[0].re);fp_set(h[0].im,0);
-    return;
-  }
-
-  // Case n = 2
-  if(n == 2)
-  {
-    fp2_sqr(&*c, &f[0]);
-    fp2_copy(&h[0], &f[0]);
-    fp2_neg(&h[1], &f[1]);
-    return;
-  }
-
-  // Case n = 3
-  if(n == 3)
-  {
-    fp2_t t0, t1;
-
-    fp2_sqr(&t0, &f[1]);
-    fp2_mul(&t1, &f[0], &f[2]);
-    fp2_sub(&t1, &t1, &t0);
-    fp2_mul(&t1, &t1, &f[0]);
-
-    reciprocal(h, c, f, 2, 2);
-    fp2_mul(&h[0], &h[0], &*c);
-    fp2_mul(&h[1], &h[1], &*c);
-    fp2_neg(&h[2], &t1);
-    fp2_sqr(&*c, &*c);
-    return;
-  }
-
-  // Case n = 4
-  if(n == 4)
-  {
-    fp2_t t0, t1, t2, t3, g[2];
-
-    reciprocal(g, &t3, f, 2, 2);
-    fp2_sqr(&t0, &f[1]);
-    fp2_mul(&t1, &g[0], &f[2]);
-    fp2_mul(&t2, &g[0], &f[3]);
-    fp2_mul(&h[1], &g[1], &f[2]);
-    fp2_sub(&t0, &t1, &t0);
-    fp2_add(&t1, &t2, &h[1]);
-    fp2_mul(&t2, &t0, &g[0]);
-    fp2_mul(&h[1], &t0, &g[1]);
-    fp2_mul(&h[3], &t1, &g[0]);
-    fp2_add(&h[3], &h[1], &h[3]);
-    
-    fp2_mul(&h[0], &g[0], &t3);
-    fp2_mul(&h[1], &g[1], &t3);
-    fp2_neg(&h[2], &t2);
-    fp2_neg(&h[3], &h[3]);
-    fp2_sqr(&*c, &t3);
-    return;
-  }
-
-
-  // General case
-  // Compute the reciprocal g mod x^m for m = ceil(n/2)
-  // Then f*g-c is multiple of x^m so we only care about terms from m to n-1
-  const int m = n - (n>>1);
-  fp2_t g[m], t[m], t0;
-
-  reciprocal(g, &t0, f, lenf, m);
-  poly_mul_middle(t, g, m, f, n);
-  poly_mul_low(t, n-m, g, m, &(t[2*m-n]), n-m);
-  for(i = 0; i < m; i++)
-    fp2_mul(&h[i], &g[i], &t0);
-  for(i = m; i < n; i++)
-    fp2_neg(&h[i], &t[i-m]);
-  fp2_sqr(&*c, &t0);
-  return;
-}
-
-
-void poly_redc(poly h, const poly g, const int leng, const poly f, const int lenf,//
-	       const poly f_rev_inv, const fp2_t c)
-{
-  // Computes h(x) =  a * g(x) mod f(x) for some scalar a, writting lenf-1 terms to h.
-  // REQUIRES an inverse f_rev_inv such that f_rev*f_rev_inv = c mod x^(leng-lenf+1),
-  // where f_rev is the polynomial with the coefficients of f listed in reverse order.
-  // The scalar a is equal to c, except for special cases:
-  //    - If leng<lenf (no reduction needed) then a = 1
-  //    - If lenf = leng = 2, then a = f[1] 
-  //    - If lenf = leng = 3, then a = f[2] 
-  //    - If lenf=2, leng=3 then a = 2*f[1]^2
-  //
-  // REQUIRES h to have space for lenf-1 terms
-  // NOT responsible for terms in h beyond h[lenf-2]
-
-  int i;
-  
-  // Case without reduction
-  if(leng < lenf)
-  {
-    for(i = 0; i < leng; i++)
-      fp2_copy(&h[i], &g[i]);
-    for(i = leng; i < lenf-1; i++)
-      fp2_set(&h[i], 0);
-    return;
-  }
-
-  // Small cases for f linear
-  if(lenf == 2)
-  {
-    if(leng == 2)
-    {
-      fp2_t t0;
-      fp2_mul(&t0, &g[0], &f[1]);
-      fp2_mul(&h[0], &g[1], &f[0]);
-      fp2_sub(&h[0], &t0, &h[0]);
-      return;
-    }
-    
-    if(leng == 3)
-    {
-      fp2_t f0f1, f02, f12;
-      fp2_sqr(&f02, &f[0]);
-      fp2_sqr(&f12, &f[1]);
-      fp2_sub(&f0f1, &f[0], &f[1]);
-      fp2_sqr(&f0f1, &f0f1);
-      fp2_sub(&f0f1, &f0f1, &f02);
-      fp2_sub(&f0f1, &f0f1, &f12);
-      fp2_add(&f02, &f02, &f02);
-      fp2_add(&f12, &f12, &f12);
-      fp2_mul(&f02, &f02, &g[2]);
-      fp2_mul(&f12, &f12, &g[0]);
-      fp2_mul(&f0f1, &f0f1, &g[1]);
-      fp2_add(&h[0], &f02, &f12);
-      fp2_add(&h[0], &h[0], &f0f1);
-      return;
-    }
-  }
-
-  // Small case for f cuadratic
-  if(lenf == 3 && leng == 3)
-  {
-    fp2_t f2g1, f2g0, f1g2;
-    fp2_mul(&f2g1, &g[1], &f[2]);
-    fp2_mul(&f2g0, &g[0], &f[2]);
-    fp2_mul(&f1g2, &g[2], &f[1]);
-    fp2_mul(&h[0], &g[2], &f[0]);
-    fp2_sub(&h[0], &f2g0, &h[0]);
-    fp2_sub(&h[1], &f2g1, &f1g2);
-    return;
-  }
-
-  // General case
-  fp2_t g_reversed[leng], Q[leng - lenf + 1], Q_reversed[leng - lenf + 1];
-  
-  for(i = 0; i < leng; i++)
-    fp2_copy(&g_reversed[i], &g[leng-1-i]);
-
-  poly_mul_low(Q, leng-lenf+1, f_rev_inv, leng-lenf+1, g_reversed, leng-lenf+1);
-
-  for(i = 0; i < leng - lenf + 1; i++)
-    fp2_copy(&Q_reversed[i], &Q[leng - lenf - i]);
-
-  poly_mul_low(g_reversed, lenf-1, Q_reversed, leng-lenf+1, f, lenf);
-
-  for(i = 0; i < lenf-1; i++)
-  {
-    fp2_mul(&h[i], &g[i], &c);
-    fp2_sub(&h[i], &h[i], &g_reversed[i]);
-  }
-  return;
-}
-
-
-void reciprocal_tree(poly *R, fp2_t *A, const int leng, const poly H[], const int DEG[],//
-		     const int root, const int n)
-{
-  // Given a product tree H with degrees tree DEG rooted at root and generated 
-  // by n polynomials, writes the reverse-reciprocal polynomials to R and field elements 
-  // to A such that Rev(H[i])*R[i] = A[i] mod x^(N) for all nodes but the leaves.
-  // The mod is N = deg(parent)-deg(self) for inner nodes, or N = leng - deg(root) for the root.
-  //
-  // REQUIRES that leng >= DEG[0] and that R,A have enough space for the tree (see product_tree)
-
-  if(n == 0)
-    return;
-
-  const int parent = (root-1) >> 1;
-  const int brother = root - 1 + 2*(root & 1);
-  int lenr;
-
-  if(root > 0)
-    lenr = DEG[parent] - DEG[root];
-  else
-    lenr = leng - DEG[root];
-  
-  R[root] = malloc(sizeof(fp2_t)*lenr);
-  
-  // ----------------------------------
-  // base cases determined by poly_redc
-  if(n == 1)
-    return;
-
-
-  // case for computing  g mod f when len(f), len(g) = 3
-  if (DEG[root] == 2 && lenr == 1)
-  {
-    reciprocal_tree(R, A, lenr-1, H, DEG, 2*root+1, n-(n>>1));
-    reciprocal_tree(R, A, lenr-1, H, DEG, 2*root+2, n>>1);
-    return;
-  }
-  
-  // ----------------------------------
-
-  int i;
-  
-  // When the parent's inverse was calculated to a smaller modulus, need to invert from scratch
-  if(root == 0 || leng < lenr)
-  {
-    for(i = 0; i < lenr && i < DEG[root]+1; i++)
-      fp2_copy(&R[root][i], &H[root][DEG[root]-i]);
-    for(i = DEG[root]+1; i < lenr; i++){
-      fp2_set(&R[root][i], 0);
-    }
-    reciprocal(R[root], &(A[root]), R[root], lenr, lenr);
-  }
-  else
-  {
-  // When parent's inverse was to a greater/equal modulus, this inverse can be obtained from it
-    for(i = 0; i < lenr; i++)
-      fp2_copy(&R[root][i], &H[brother][DEG[brother]-i]);
-    poly_mul_low(R[root], lenr, R[parent], leng, R[root], lenr);
-    fp2_copy(&A[root], &A[parent]);
-  }
-
-  // Now move on to the children
-  reciprocal_tree(R, A, lenr-1, H, DEG, 2*root+1, n-(n>>1));
-  reciprocal_tree(R, A, lenr-1, H, DEG, 2*root+2, n>>1);
-  return;
-}
-
-
-void multieval_unscaled(fp2_t REM[], const poly g, const int leng, const poly R[], const fp2_t A[],//
-		const poly H[], const int DEG[], const int root, const int n)
-{
-  // Given the product tree H and reciprocal tree R,A generated by f_0, ... , f_{n-1},
-  // with corresponding degrees tree DEG[] and rooted at root,  writes the constant term 
-  // of c_i*g mod f_i to REM[i]. The constants c_i are unspecified, but are a function
-  // only of leng and f_0,...,f_{n-1} so they cancel out when taking the ratios of
-  // remainders of different g's of the same length.
-  //
-  // REQUIRES REM to have space for n terms
-
-  if(n == 0)
-    return;
-  
-  fp2_t g_mod[DEG[root]];
-  poly_redc(g_mod, g, leng, H[root], DEG[root]+1, R[root], A[root]);
-
-  if(n == 1)
-  {
-    fp2_copy(&REM[0], &g_mod[0]);
-    return;
-  }
-  
-  multieval_unscaled(REM, g_mod, DEG[root], R, A, H, DEG, 2*root+1, n-(n>>1));
-  multieval_unscaled(&(REM[n-(n>>1)]), g_mod, DEG[root], R, A, H, DEG, 2*root+2, n>>1);
-  return;
-}
-
-
-void multieval_scaled(fp2_t REM[], const poly G, const poly H[], //
-			   const int DEG[], const int root, const int n)
-{
-  // Given the product tree H generated by LINEAR f_0,...,f_{n-1} rooted at root and with
-  // corresponding degrees tree DEG, writes the constant term of c_i * g mod f_i(x) to REM[i]
-  // The constants c_i are unspecified but are only a function of leng and f_0,...,f_{n-1},
-  // so they cancel out when taking the ratios of remainders of different g's of the same length.
-  //
-  // REQUIRES REM to have space for n terms and n > 1
-  // Also REQUIRES G = rev((rev(g mod F)) * F_rev_inv mod x^deg(F)-1) where F = H[root]
-  // and F_rev_inv is its reverse's reciprocal mod x^deg(F)
-
-  if(root == 0)
-  {
-    if(n == 1)
-    {
-      fp2_copy(&REM[0], &G[DEG[root]-1]);
-      return;
-    }
-    else
-    {
-      multieval_scaled(REM, G, H, DEG, 2*root+1, n-(n>>1));
-      multieval_scaled(&(REM[n-(n>>1)]), G, H, DEG, 2*root+2, n>>1);
-      return;
-    }
-  }
-    
-  const int parent = (root-1) >> 1;
-  const int brother = root - 1 + 2*(root & 1);
-  const int uncle = parent - 1 + 2*(parent & 1);
-  fp2_t fg[DEG[brother]+1];
-
-  if(root > 2)
-    poly_mul_middle(fg, H[brother], DEG[brother]+1, G, DEG[uncle]+1);
-  else
-    poly_mul_middle(fg, H[brother], DEG[brother]+1, G, DEG[0]);
-    
-  
-  if(n == 1)
-  {
-    fp2_copy(&REM[0], &fg[DEG[brother]]);
-    return;
-  }
-
-  multieval_scaled(REM, fg, H, DEG, 2*root+1, n-(n>>1));
-  multieval_scaled(&(REM[n-(n>>1)]), fg, H, DEG, 2*root+2, n>>1);
-  return;
-}
--- a/src/ec/ref/ecx/tedwards.c
+++ b/src/ec/ref/ecx/tedwards.c
@@ -1,231 +0,0 @@
-#include <tedwards.h>
-#include <assert.h>
-
-// a*x^2+y^2=1+d*x^2*y^2
-// a = A.x/A.z + 2, d = A.x/A.z - 2
-
-void ted_init(ted_point_t* P)
-{ // Initialize point as identity element (X:Y:Z:T) <- (0:1:1:0)
-    fp_t one = {0};
-
-    memset((digit_t*)P, 0, NWORDS_FIELD*RADIX*8/8);
-    one[0] = 1;
-    fp_tomont(P->x.re, one);
-}
-
-void copy_ted_point(ted_point_t* P, ted_point_t const* Q)
-{
-    fp2_copy(&(P->x), &(Q->x));
-    fp2_copy(&(P->y), &(Q->y));
-    fp2_copy(&(P->z), &(Q->z));
-    fp2_copy(&(P->t), &(Q->t));
-}
-
-void ted_dbl(ted_point_t *Q, ted_point_t const *P, ec_curve_t const* E) 
-{
-    // A = X1^2
-    // B = Y1^2
-    // C = 2*Z1^2
-    // D = a*A
-    // K = (X1+Y1)^2-A-B
-    // G = D+B
-    // F = G-C
-    // H = D-B
-    // X3 = K*F
-    // Y3 = G*H
-    // T3 = K*H
-    // Z3 = F*G
-
-    // TODO: neutral element
-    fp2_t A, B, C, D, K, G, F, H;
-
-    fp2_sqr(&A, &P->x);
-    fp2_sqr(&B, &P->y);
-    fp2_sqr(&C, &P->z);
-    fp2_add(&C, &C, &C);
-    fp2_mul(&D, &A, &E->A);
-    fp2_add(&K, &P->x, &P->y);
-    fp2_sqr(&K, &K);
-    fp2_sub(&K, &K, &A);
-    fp2_sub(&K, &K, &B);
-    fp2_add(&G, &D, &B);
-    fp2_sub(&F, &G, &C);
-    fp2_sub(&H, &D, &B);
-    fp2_mul(&Q->x, &K, &F);
-    fp2_mul(&Q->y, &G, &H);
-    fp2_mul(&Q->t, &K, &H);
-    fp2_mul(&Q->z, &F, &G);
-}
-
-void ted_add(ted_point_t* S, ted_point_t const* P, ted_point_t const* Q, ec_curve_t const* E)
-{
-    // A = X1*X2
-    // B = Y1*Y2
-    // C = Z1*T2
-    // D = T1*Z2
-    // K = D+C
-    // F = (X1-Y1)*(X2+Y2)+B-A
-    // G = B+a*A
-    // H = D-C
-    // X3 = K*F
-    // Y3 = G*H
-    // T3 = K*H
-    // Z3 = F*G
-
-    // TODO: neutral element
-
-    ted_point_t res;
-
-    if (is_ted_equal(P, Q)) {
-      ted_dbl(S, P, E);
-      return;
-    }
-    //assert(!is_ted_equal(P, Q));
-    
-    ted_neg(&res, P);
-    if (is_ted_equal(&res, Q)) {
-       ted_init(S);
-       return;
-    }
-    // assert(!ted_equal(&res,Q));
-    fp2_t A, B, C, D, K, F, G, H, tmp;
-
-    fp2_mul(&A, &P->x, &Q->x);
-    fp2_mul(&B, &P->y, &Q->y);
-    fp2_mul(&C, &P->z, &Q->t);
-    fp2_mul(&D, &P->t, &Q->z);
-    fp2_add(&K, &D, &C);
-    fp2_add(&F, &Q->x, &Q->y);
-    fp2_sub(&tmp, &P->x, &P->y);
-    fp2_mul(&F, &F, &tmp);
-    fp2_add(&F, &F, &B);
-    fp2_sub(&F, &F, &A);
-    fp2_mul(&G, &A, &E->A);
-    fp2_add(&G, &G, &B);
-    fp2_sub(&H, &D, &C);
-    fp2_mul(&res.x, &K, &F);
-    fp2_mul(&res.y, &G, &H);
-    fp2_mul(&res.t, &K, &H);
-    fp2_mul(&res.z, &F, &G);
-
-    if (fp2_is_zero(&res.x) && fp2_is_zero(&res.y) && fp2_is_zero(&res.z)) {
-        ted_dbl(S, P, E);
-    } else {
-        copy_ted_point(S, &res);
-    }
-}
-
-void ted_neg(ted_point_t* Q, ted_point_t const* P)
-{
-    fp2_neg(&Q->x, &P->x);
-    fp2_copy(&Q->y, &P->y);
-    fp2_copy(&Q->z, &P->z);
-    fp2_neg(&Q->t, &P->t);
-}
-
-static bool xLIFT(fp2_t* y, const ec_point_t* P, const ec_curve_t* curve)
-{ // Returns false if it is on the curve, true if it is on the twist
-    fp2_t z2, tmp1, tmp2, y2;
-
-    if (fp2_is_zero(&P->z)) return false;
-
-    // (X^2 + Z^2) C
-    fp2_sqr(&tmp1, &P->x);
-    fp2_sqr(&z2, &P->z);
-    fp2_add(&tmp1, &tmp1, &z2);
-    fp2_mul(&tmp1, &tmp1, &curve->C);
-
-    // X^2C + AXZ + Z^2C
-    fp2_mul(&tmp2, &P->x, &P->z);
-    fp2_mul(&tmp2, &tmp2, &curve->A);
-    fp2_add(&tmp1, &tmp1, &tmp2);
-
-    // X^3C + AX^2Z + XZ^2C = Z^3(Cx^3 + Ax^2 + Cx) = Z^3 C (B*y^2) = Z C (B*Y^2) // x = X/Z
-    fp2_mul(&tmp1, &tmp1, &P->x);
-    // (ZC)^(-1)
-    fp2_mul(&tmp2, &curve->C, &P->z);
-
-    assert(!fp2_is_zero(&tmp2));
-    
-    fp2_inv(&tmp2);    
-    fp2_mul(&y2, &tmp1, &tmp2);    // (B*Y^2)
-    fp2_copy(y, &y2);
-
-    if (fp2_is_square(&y2)) {  // on the curve
-        fp2_sqrt(y);
-        return false;
-    } else { // on the twist
-        fp2_t tmp = fp2_non_residue();
-        fp2_mul(y, y, &tmp);
-        fp2_sqrt(y);
-        return true;
-    }
-}
-
-//void mont_to_ted(ec_point_t* E, ec_point_t const* A, bool twist)
-void mont_to_ted(ec_curve_t* ted_curve, ec_curve_t const* curve)
-{
-    fp2_t tmp, two;
-
-    // A : y^2 = x^3 + (a/c)x^2 + x
-    fp2_copy(&tmp, &curve->C);         
-    fp2_inv(&tmp);                    // 1/c
-    fp2_mul(&tmp, &tmp, &curve->A);   // a/c
-    fp2_set(&two, 2);
-    fp2_tomont(&two, &two);
-    fp2_add(&ted_curve->A, &tmp, &two);       // a/c + 2
-    fp2_sub(&ted_curve->C, &tmp, &two);       // a/c - 2
-    //if (twist) {
-        // B = Fp2_inv(fp2_non_residue)
-    //    tmp = fp2_non_residue();
-    //    fp2_mul2(&E->x,&tmp);
-    //    fp2_mul2(&E->z,&tmp);
-    //}
-}
-
-void mont_to_ted_point(ted_point_t* Q, ec_point_t const* P, ec_curve_t const* curve)
-{
-    if (fp2_is_zero(&P->z)) {
-        fp2_set(&Q->x, 0);
-        fp2_set(&Q->y, 1);
-        fp2_set(&Q->z, 1);
-        fp2_set(&Q->t, 0);
-        fp_tomont(Q->y.re, Q->y.re);
-        fp_tomont(Q->z.re, Q->z.re);
-    } else {
-        fp2_t tmp, y;
-
-        xLIFT(&y, P, curve);
-        fp2_add(&tmp, &P->x, &P->z);
-        fp2_mul(&Q->x, &P->x, &tmp);
-        fp2_sub(&Q->y, &P->x, &P->z);
-        fp2_mul(&Q->y, &Q->y, &y);
-        fp2_mul(&Q->z, &tmp, &y);
-        fp2_copy(&Q->t, &Q->z);
-        fp2_inv(&Q->t);
-        fp2_mul(&Q->t, &Q->t, &Q->x);
-        fp2_mul(&Q->t, &Q->t, &Q->y);
-    }
-}
-
-void ted_to_mont_point(ec_point_t* Q, ted_point_t const* P)
-{
-    fp2_add(&Q->x, &P->z, &P->y);
-    fp2_sub(&Q->z, &P->z, &P->y);
-}
-
-bool is_ted_equal(ted_point_t const* P1, ted_point_t const* P2)
-{
-    fp2_t x1z2, y1z2;
-    fp2_t y2z1, x2z1;
-    fp2_t t1y2, t2y1;
-
-    fp2_mul(&x1z2, &P1->x, &P2->z);
-    fp2_mul(&y1z2, &P1->y, &P2->z);
-    fp2_mul(&y2z1, &P2->y, &P1->z);
-    fp2_mul(&x2z1, &P2->x, &P1->z);
-    fp2_mul(&t1y2, &P1->t, &P2->y);
-    fp2_mul(&t2y1, &P2->t, &P1->y);
-
-    return fp2_is_equal(&x1z2, &x2z1) && fp2_is_equal(&y1z2, &y2z1) && fp2_is_equal(&t1y2, &t2y1);
-}
--- a/src/ec/ref/ecx/test/ec-test.c
+++ b/src/ec/ref/ecx/test/ec-test.c
@@ -1,18 +0,0 @@
-#include "ec-tests.h"
-
-int main(int argc, char* argv[])
-{
-    if (argc < 3) {
-        printf("Please enter an argument: 'test' or 'bench' and <reps>\n");
-        exit(1);
-    }
-    if (!strcmp(argv[1], "test")) {
-        TEST_LOOPS = atoi(argv[2]);
-        return !(ec_test() & dlog_test());
-    } else if (!strcmp(argv[1], "bench")) {
-        BENCH_LOOPS = atoi(argv[2]);
-        return !(ec_run() & dlog_run());
-    } else {
-        exit(1);
-    }
-}
--- a/src/ec/ref/ecx/test/fp2-test.c
+++ b/src/ec/ref/ecx/test/fp2-test.c
@@ -1,142 +0,0 @@
-#include <assert.h>
-#include <time.h>
-#include <stdio.h>
-#include <fp2.h>
-#include <inttypes.h>
-
-static int BENCH_LOOPS = 1000;       // Number of iterations per bench
-static int TEST_LOOPS  = 512;       // Number of iterations per test
-
-bool fp2_isequal(fp2_t a, fp2_t b){
-    return fp_is_equal(a.re, b.re) && fp_is_equal(a.im, b.im);
-}
-
-bool fp2_isone(fp2_t a){
-    fp_t one;
-    bool res = 1;
-    fp_mont_setone(one);
-    for(int i = 0; i < NWORDS_FIELD; i++){
-        res = res && (a.re[i] == one[i]);
-        res = res && (a.im[i] == 0);
-    }
-    return res;
-}
-
-void fp2_print(char *name, fp2_t const a){
-    fp2_t b;
-    fp2_set(&b, 1);
-    fp2_mul(&b, &b, &a);
-    printf("%s = 0x", name);
-    for(int i = NWORDS_FIELD - 1; i >=0; i--)
-        printf("%016" PRIx64, b.re[i]);
-    printf(" + i*0x");
-    for(int i = NWORDS_FIELD - 1; i >=0; i--)
-        printf("%016" PRIx64, b.im[i]);
-    printf("\n");
-}
-
-// VERY NOT SECURE (testing only)
-void fp2_random(fp2_t *a){
-    for(int i = 0; i < NWORDS_FIELD; i++){
-        a->re[i] = rand();
-        a->im[i] = rand();
-    }
-    // Normalize
-    fp2_t one;
-    fp_mont_setone(one.re);fp_set(one.im,0);
-    fp2_mul(&*a, &*a, &one);
-    // Update seed
-    srand((unsigned) a->re[0]);
-}
-
-int main(int argc, char* argv[])
-{
-	if (argc > 1) {
-		TEST_LOOPS = atoi(argv[1]);
-	}
-
-	fp2_t fp2_0, fp2_1;
-	// ------------
-	fp2_set(&fp2_0, 0);
-	fp_mont_setone(fp2_1.re);fp_set(fp2_1.im,0);
-	// ------------
-
-	int i;
-	fp2_t a, b, c, d;
-	fp_t e;
-
-	for (i = 0; i < TEST_LOOPS; i++)
-	{
-		printf("[%3d%%] Testing fp2_t arithmetic", 100 * i / (int)TEST_LOOPS);
-		fflush(stdout);
-		printf("\r\x1b[K");
-                
-		// Random elements of fp
-		fp2_random(&a);
-		fp2_random(&b);
-		fp2_copy(&c, &a);
-		c.re[0] += 1;
-		fp2_copy(&d, &b);
-		d.re[0] -= 1;
-
-		assert(fp2_isequal(a,b) == 0);		// different values check --> (a != b)
-		assert(fp2_isequal(c,c) == 1);		// equal values check --> 1 (c == c)
-
-		// Testing neg
-		fp2_set(&b, 0);
-		fp2_copy(&c, &a);
-		fp2_neg(&a, &a);
-		fp2_sub(&c, &b, &c);
-		assert(fp2_isequal(a,c) == 1);
-
-		fp_mont_setone(a.re);fp_set(a.im,0);	// Now a == 1
-		fp2_set(&b, 0);	// Now b == 0
-
-		assert(fp2_is_zero(&a) == 0);
-		assert(fp2_is_zero(&b) == 1);
-
-		// testing c - c
-		fp2_sub(&d, &c, &c);
-		assert(fp2_is_zero(&d) == 1);
-
-		// tetsing c * 0
-		fp2_mul(&d, &c, &b);
-		assert(fp2_is_zero(&d) == 1);
-
-		// tetsing c * 1 ... recall, in Montgomery domain R mod p plays the role of the 1
-		fp_mont_setone(a.re);fp_set(a.im,0);
-		fp2_mul(&d, &c, &a);
-		assert(fp2_isequal(d, c) == 1);
-
-		// fp_set(e, 1);	// Now e == 1
-		// fp2_pow(d, e, c);
-		// assert(fp2_isequal(d, c) == 1);
-		
-		// fp_set(e, 0);	// Now e == 0
-		// fp2_pow(d, e, c);
-		// assert(fp2_isone(d) == 1);
-
-		// fp2_set(a, 1);	// Now e == R mod p
-		// fp_random(e);
-		// fp2_pow(d, e, a);
-		// assert(fp2_isone(d) == 1);
-
-		// Testing 1/a by computing (1/a) x a
-		fp2_random(&a);
-		fp2_copy(&b, &a);
-		fp2_inv(&a);
-		fp2_mul(&c, &a, &b);
-		assert(fp2_isone(c) == 1);
-
-		fp2_random(&a);
-		fp2_sqr(&b, &a);
-		assert( fp2_is_square(&b) );
-
-	};
-
-	if(TEST_LOOPS){
-		printf("[%2d%%] Tested fp2_t arithmetic:\tNo errors!\n", 100 * i /TEST_LOOPS);
-	}
-	printf("-- All tests passed.\n");
-	return 0;
-}
--- a/src/ec/ref/ecx/test/isog-test.c
+++ b/src/ec/ref/ecx/test/isog-test.c
--- a/src/ec/ref/ecx/test/mont-test.c
+++ b/src/ec/ref/ecx/test/mont-test.c
@@ -1,386 +0,0 @@
-#include <time.h>
-#include <assert.h>
-#include <stdio.h>
-
-#include "ec.h"
-#include "isog.h"
-#include "test-basis.h"
-#include <bench.h> 
-
-static int BENCH_LOOPS = 1000;       // Number of iterations per bench
-static int TEST_LOOPS  = 128;       // Number of iterations per test
-
-// void random_scalar(fp_t k, const uint8_t j)
-// {
-//         // To implement a better random function (We must use some of the SHAKE family functions)
-//         do
-//         {
-//                 randombytes((void *)k, keyspace_bytes[j]);
-//         } while (fp_issmaller((uint64_t *)k, keyspace_size[j]));
-// }
-
-// VERY NOT SECURE (testing only)
-void fp2_random(fp2_t *a){
-    for(int i = 0; i < NWORDS_FIELD; i++){
-        a->re[i] = rand();
-        a->im[i] = rand();
-    }
-    // Normalize
-    fp2_t one;
-    fp_mont_setone(one.re);fp_set(one.im,0);
-    fp2_mul(&*a, &*a, &one);
-    // Update seed
-    srand((unsigned) a->re[0]);
-}
-
-// Affine Montgomery coefficient computation (A + 2C : 4C) --> A/C
-void coeff(fp2_t *B, ec_point_t const A)
-{
-	fp2_t t;
-	fp2_add(&t, &A.x, &A.x);	// (2 * A24)
-	fp2_sub(&t, &t, &A.z);	// (2 * A24) - C24
-
-	fp2_copy(&*B, &A.z);
-	fp2_inv(&*B);		// 1 / (C24)
-	fp2_add(&t, &t, &t);	// 4*A = 2[(2 * A24) - C24]
-	fp2_mul(&*B, &t, &*B);	// A/C = 2[(2 * A24) - C24] / C24
-}
-
-// Determines if point is fp2-rational (if not, then it must be a zero trace point)
-uint8_t isrational(ec_point_t const T, fp2_t const a)
-{
-	fp2_t XT, tmp, aux, YT_squared;
-
-	fp2_copy(&XT, &T.z);
-	fp2_inv(&XT);
-
-	fp2_mul(&XT, &XT, &T.x);
-
-	fp2_sqr(&tmp, &XT);
-	fp2_mul(&aux, &tmp, &XT);
-	fp2_mul(&tmp, &tmp, &a);
-	fp2_add(&YT_squared, &tmp, &aux);
-	fp2_add(&YT_squared, &YT_squared, &XT);
-
-	return fp2_is_square(&YT_squared);
-}
-
-// ladder3pt computes x(P + [m]Q)
-void ladder3pt(ec_point_t* R, fp_t const m, ec_point_t const* P, ec_point_t const* Q, ec_point_t const* PQ, ec_point_t const* A)
-{
-	ec_point_t X0, X1, X2;
-	copy_point(&X0, Q);
-	copy_point(&X1, P);
-	copy_point(&X2, PQ);
-
-	int i,j;
-	uint64_t t;
-	for (i = 0; i < NWORDS_FIELD; i++)
-	{
-		t = 1;
-		for (j = 0 ; j < 64; j++)
-		{
-			swap_points(&X1, &X2, -((t & m[i]) == 0));
-			xDBLADD(&X0, &X1, &X0, &X1, &X2, A);
-			swap_points(&X1, &X2, -((t & m[i]) == 0));
-			t <<= 1;
-		};
-	};
-	copy_point(R, &X1);
-}
-
-// For computing [(p + 1) / l_i]P, i:=0, ..., (N - 1)
-void cofactor_multiples(ec_point_t P[], ec_point_t const* A, size_t lower, size_t upper)
-{
-	assert(lower < upper);
-	if (upper - lower == 1)
-		return ;
-
-	int i;
-	size_t mid = lower + (upper - lower + 1) / 2;
-	copy_point(&(P[mid]), &(P[lower]));
-	for (i = lower; i < (int)mid; i++)
-		xMULv2(&(P[mid]), &(P[mid]), &(TORSION_ODD_PRIMES[i]), p_plus_minus_bitlength[i], A);
-	for (i = (int)mid; i < (int)upper; i++)
-		xMULv2(&(P[lower]), &(P[lower]), &(TORSION_ODD_PRIMES[i]), p_plus_minus_bitlength[i], A);
-
-	cofactor_multiples(P, A, lower, mid);
-	cofactor_multiples(P, A, mid, upper);
-}
-
-// The projective x-coordinate point (X : Z) at infinity is such that Z == 0
-static inline int isinfinity(ec_point_t const P)
-{
-	return fp2_is_zero(&P.z);
-}
-
-int main(int argc, char* argv[])
-{
-	if (argc > 1) {
-		TEST_LOOPS = atoi(argv[1]);
-	}
-
-	fp2_t fp2_0, fp2_1;
-	fp2_set(&fp2_0, 0);
-	fp_mont_setone(fp2_1.re);fp_set(fp2_1.im,0);
-
-	int i, j;
-
-	ec_point_t A;
-	fp2_set(&A.x, 0);
-	fp_mont_setone(A.z.re);fp_set(A.z.im,0);
-
-	fp2_add(&A.z, &A.z, &A.z);	// 2C
-	fp2_add(&A.x, &A.x, &A.z);	// A' + 2C
-	fp2_add(&A.z, &A.z, &A.z);	// 4C
-
-	// Just to ensure the projective curve coeffientes are different from zero
-	assert( !fp2_is_zero(&A.x) & !fp2_is_zero(&A.x) );
-
-	fp2_t a;
-	coeff(&a, A);
-
-	ec_point_t PA, QA, PQA, PB, QB, PQB;
-
-	// Writing the public projective x-coordinate points into Montogmery domain
-	fp2_tomont(&(PA.x), &(xPA));
-	fp_mont_setone(PA.z.re);fp_set(PA.z.im,0);
-	fp2_tomont(&(QA.x), &(xQA));
-	fp_mont_setone(QA.z.re);fp_set(QA.z.im,0);
-	fp2_tomont(&(PQA.x), &(xPQA));
-	fp_mont_setone(PQA.z.re);fp_set(PQA.z.im,0);
-
-	assert( isrational(PA, a) );
-	assert( isrational(QA, a) );
-	assert( isrational(PQA, a) );
-
-	// ======================================================================================================
-	// Recall, PA, QA, and PQA are expeted to be N-order points, but we require to ensure they are of order N
-	for (j = 0; j < P_LEN; j++)
-	{
-		for (i = 1; i < TORSION_ODD_POWERS[j]; i++)
-		{
-			xMULv2(&PA, &PA, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
-			xMULv2(&QA, &QA, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
-			xMULv2(&PQA, &PQA, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
-	
-			assert( isrational(PA, a) );
-			assert( isrational(QA, a) );
-			assert( isrational(PQA, a) );
-		};
-	};
-	assert( !isinfinity(PA) );
-	assert( !isinfinity(QA) );
-	assert( !isinfinity(PQA) );
-	
-	ec_point_t P[P_LEN + M_LEN], Q[P_LEN + M_LEN], PQ[P_LEN + M_LEN];
-	copy_point(&(P[0]), &PA);
-	cofactor_multiples(P, &A, 0, P_LEN);
-	copy_point(&(Q[0]), &QA);
-	cofactor_multiples(Q, &A, 0, P_LEN);
-	copy_point(&(PQ[0]), &PQA);
-	cofactor_multiples(PQ, &A, 0, P_LEN);
-	for (j = 0; j < P_LEN; j++)
-	{
-		// x(PA)
-		assert( !isinfinity(P[j]) );	// It must be different from the point at infinity
-		assert( isrational(P[j], a) );
-		xMULv2(&P[j], &P[j], &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
-		assert( isinfinity(P[j]) );		// It must be now the point at infinity
-		// x(QA)
-		assert( !isinfinity(Q[j]) );	// It must be different from the point at infinity
-		assert( isrational(Q[j], a) );
-		xMULv2(&Q[j], &Q[j], &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
-		assert( isinfinity(Q[j]) );		// It must be now the point at infinity
-		// x(PQA)
-		assert( !isinfinity(PQ[j]) );	// It must be different from the point at infinity
-		assert( isrational(PQ[j], a) );
-		xMULv2(&PQ[j], &PQ[j], &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
-		assert( isinfinity(PQ[j]) );	// It must be now the point at infinity
-	};
-	// Writing the public projective x-coordinate points into Montogmery domain
-	fp2_tomont(&(PB.x), &(xPB));
-	fp_mont_setone(PB.z.re);fp_set(PB.z.im,0);
-	fp2_tomont(&(QB.x), &(xQB));
-	fp_mont_setone(QB.z.re);fp_set(QB.z.im,0);
-	fp2_tomont(&(PQB.x), &(xPQB));
-	fp_mont_setone(PQB.z.re);fp_set(PQB.z.im,0);
-
-	assert( !isrational(PB, a) );
-	assert( !isrational(QB, a) );
-	assert( !isrational(PQB, a) );
-	// ======================================================================================================
-	// Recall, PB, QB, and PQB are expeted to be M-order points, but we require to ensure they are of order M
-	for (j = P_LEN; j < (P_LEN + M_LEN); j++)
-	{
-		for (i = 1; i < TORSION_ODD_POWERS[j]; i++)
-		{
-			xMULv2(&PB, &PB, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
-			xMULv2(&QB, &QB, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
-			xMULv2(&PQB, &PQB, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
-	
-			assert( !isrational(PB, a) );
-			assert( !isrational(QB, a) );
-			assert( !isrational(PQB, a) );
-		};
-	};
-	assert( !isinfinity(PB) );
-	assert( !isinfinity(QB) );
-	assert( !isinfinity(PQB) );
-
-	copy_point(&(P[P_LEN]), &PB);
-	cofactor_multiples(P, &A, P_LEN, P_LEN + M_LEN);
-	copy_point(&(Q[P_LEN]), &QB);
-	cofactor_multiples(Q, &A, P_LEN, P_LEN + M_LEN);
-	copy_point(&(PQ[P_LEN]), &PQB);
-	cofactor_multiples(PQ, &A, P_LEN, P_LEN + M_LEN);
-	for (j = P_LEN; j < (P_LEN+M_LEN); j++)
-	{
-		// x(PB)
-		assert( !isinfinity(P[j]) );	// It must be different from the point at infinity
-		assert( !isrational(P[j], a) );
-		xMULv2(&P[j], &P[j], &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
-		assert( isinfinity(P[j]) );		// It must be now the point at infinity
-		// x(QB)
-		assert( !isinfinity(Q[j]) );	// It must be different from the point at infinity
-		assert( !isrational(Q[j], a) );
-		xMULv2(&Q[j], &Q[j], &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
-		assert( isinfinity(Q[j]) );		// It must be now the point at infinity
-		// x(PQB)
-		assert( !isinfinity(PQ[j]) );	// It must be different from the point at infinity
-		assert( !isrational(PQ[j], a) );
-		xMULv2(&PQ[j], &PQ[j], &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
-		assert( isinfinity(PQ[j]) );	// It must be now the point at infinity
-	};
-
-	fp2_t m;
-
-	// Writing the public projective x-coordinate points into Montogmery domain
-	fp2_tomont(&(PA.x), &(xPA));
-	fp_mont_setone(PA.z.re);fp_set(PA.z.im,0);
-	fp2_tomont(&(QA.x), &(xQA));
-	fp_mont_setone(QA.z.re);fp_set(QA.z.im,0);
-	fp2_tomont(&(PQA.x), &(xPQA));
-	fp_mont_setone(PQA.z.re);fp_set(PQA.z.im,0);
-
-	assert( isrational(PA, a) );
-	assert( isrational(QA, a) );
-	assert( isrational(PQA, a) );
-	
-	fp2_tomont(&(PB.x), &(xPB));
-	fp_mont_setone(PB.z.re);fp_set(PB.z.im,0);
-	fp2_tomont(&(QB.x), &(xQB));
-	fp_mont_setone(QB.z.re);fp_set(QB.z.im,0);
-	fp2_tomont(&(PQB.x), &(xPQB));
-	fp_mont_setone(PQB.z.re);fp_set(PQB.z.im,0);
-
-	assert( !isrational(PB, a) );
-	assert( !isrational(QB, a) );
-	assert( !isrational(PQB, a) );
-
-	ec_point_t R[P_LEN + M_LEN];
-	int k;
-	for (j = 0; j < TEST_LOOPS; j++)
-	{
-		printf("[%3d%%] Testing EC differential arithmetic", 100 * j / TEST_LOOPS);
-		fflush(stdout);
-		printf("\r\x1b[K");
-		fp2_random(&m);
-		ladder3pt(&(R[0]), m.re, &PA, &QA, &PQA, &A);
-		assert( isrational(R[0], a) );
-		for (k = 0; k < P_LEN; k++)
-		{
-			for (i = 1; i < TORSION_ODD_POWERS[k]; i++)
-			{
-				xMULv2(&R[0], &R[0], &(TORSION_ODD_PRIMES[k]), p_plus_minus_bitlength[k], &A);
-				assert( isrational(R[0], a) );
-			};
-		};
-		cofactor_multiples(R, &A, 0, P_LEN);
-		for (i = 0; i < P_LEN; i++)
-		{
-			assert( !isinfinity(R[i]) );	// It must be different from the point at infinity
-			assert( isrational(R[i], a) );
-			xMULv2(&R[i], &R[i], &(TORSION_ODD_PRIMES[i]), p_plus_minus_bitlength[i], &A);
-			assert( isinfinity(R[i]) );		// It must be now the point at infinity
-		};
-
-		fp2_random(&m);
-		ladder3pt(&(R[P_LEN]), m.re, &PB, &QB, &PQB, &A);
-		assert( !isrational(R[P_LEN], a) );
-		for (k = P_LEN; k < (P_LEN+M_LEN); k++)
-		{
-			for (i = 1; i < TORSION_ODD_POWERS[k]; i++)
-			{
-				xMULv2(&R[P_LEN], &R[P_LEN], &(TORSION_ODD_PRIMES[k]), p_plus_minus_bitlength[k], &A);
-				assert( !isrational(R[P_LEN], a) );
-			};
-		};
-		cofactor_multiples(R, &A, P_LEN, P_LEN + M_LEN);
-		for (i = P_LEN; i < (P_LEN+M_LEN); i++)
-		{
-			assert( !isinfinity(R[i]) );	// It must be different from the point at infinity
-			assert( !isrational(R[i], a) );
-			xMULv2(&R[i], &R[i], &(TORSION_ODD_PRIMES[i]), p_plus_minus_bitlength[i], &A);
-			assert( isinfinity(R[i]) );		// It must be now the point at infinity
-		};
-	};
-
-	if(TEST_LOOPS)
-		printf("[%3d%%] Tested EC differential arithmetic:\tNo errors!\n", 100 * j / TEST_LOOPS);
-	printf("-- All tests passed.\n");
-
-	// BENCHMARK xDBLv2
-    unsigned long long cycles, cycles1, cycles2;
-    cycles = 0;
-	ec_point_t PP[TEST_LOOPS], EE[TEST_LOOPS];
-	for(int i = 0; i < TEST_LOOPS; i++){
-		fp2_random(&PP[i].x);
-		fp2_random(&PP[i].z);
-		fp2_random(&EE[i].x);
-		fp2_random(&EE[i].z);
-	}
-    cycles1 = cpucycles(); 
-	for(int i = 0; i < TEST_LOOPS; i++){
-		xDBLv2(&PP[i], &PP[i], &EE[i]);
-	}
-    cycles2 = cpucycles();
-    cycles = cycles+(cycles2-cycles1);
-	
-	printf("xDBLv2 bench: %7lld cycles\n", cycles/TEST_LOOPS);
-
-	// BENCHMARK xIsog4
-    cycles = 0;
-	ec_point_t KK0[TEST_LOOPS], KK1[TEST_LOOPS], KK2[TEST_LOOPS];
-	for(int i = 0; i < TEST_LOOPS; i++){
-		fp2_random(&KK0[i].x);
-		fp2_random(&KK0[i].z);
-		fp2_random(&KK1[i].x);
-		fp2_random(&KK1[i].z);
-		fp2_random(&KK2[i].x);
-		fp2_random(&KK2[i].z);
-	}
-    cycles1 = cpucycles(); 
-	for(int i = 0; i < TEST_LOOPS; i++){
-	fp2_t t0, t1;
-	fp2_add(&t0, &PP[i].x, &PP[i].z);
-	fp2_sub(&t1, &PP[i].x, &PP[i].z);
-	fp2_mul(&(EE[i].x), &t0, &KK1[i].x);
-	fp2_mul(&(EE[i].z), &t1, &KK2[i].x);
-	fp2_mul(&t0, &t0, &t1);
-	fp2_mul(&t0, &t0, &KK0[i].x); 
-	fp2_add(&t1, &(EE[i].x), &(EE[i].z));
-	fp2_sub(&(EE[i].z), &(EE[i].x), &(EE[i].z));
-	fp2_sqr(&t1, &t1);
-	fp2_sqr(&(EE[i].z), &(EE[i].z));
-	fp2_add(&(EE[i].x), &t0, &t1);
-	fp2_sub(&t0, &(EE[i].z), &t0);
-	fp2_mul(&(EE[i].x), &(EE[i].x), &t1);
-	fp2_mul(&(EE[i].z), &(EE[i].z), &t0);
-	}
-    cycles2 = cpucycles();
-    cycles = cycles+(cycles2-cycles1);
-	printf("xeval_4 bench: %7lld cycles\n", cycles/TEST_LOOPS);
-
-	return 0;
-}
--- a/src/ec/ref/ecx/test/poly-mul-test.c
+++ b/src/ec/ref/ecx/test/poly-mul-test.c
@@ -1,445 +0,0 @@
-#include <poly.h>
-#include <assert.h>
-#include <stdio.h>
-
-bool fp2_isequal(fp2_t a, fp2_t b){
-    return fp_is_equal(a.re, b.re) && fp_is_equal(a.im, b.im);
-}
-
-// VERY NOT SECURE (testing only)
-void fp2_random(fp2_t *a){
-    for(int i = 0; i < NWORDS_FIELD; i++){
-        a->re[i] = rand();
-        a->im[i] = rand();
-    }
-    // Normalize
-    fp2_t one;
-    fp_mont_setone(one.re);fp_set(one.im,0);
-    fp2_mul(&*a, &*a, &one);
-    // Update seed
-    srand((unsigned) a->re[0]);
-}
-
-void slow_mul(poly h, poly f, int lenf, poly g, int leng){
-  // Computes h = f*g by school method
-
-  fp2_t a, b;
-  int nf, ng, e;
-  int lenh = lenf + leng - 1;
-  
-  if(lenh <= 0){
-    return;
-  }
-  
-  fp2_t fg[lenh];
-  
-  if (leng > lenf){
-    slow_mul(h, g, leng, f, lenf);
-    return;
-  }
-  
-  for(e = 0; e < lenh; e++){
-
-    if (lenf - 1 < e){
-      nf = lenf - 1;
-    }
-    else{
-      nf = e;
-    }
-
-    ng = e - nf;
-    fp2_set(&a, 0);
-    while( (ng < leng) & (nf >= 0) ){
-      fp2_mul(&b, &f[nf], &g[ng]);
-      fp2_add(&a, &a, &b);
-      nf--;
-      ng++;
-    }
-    fp2_copy(&fg[e], &a);
-  }
-  for(e = 0; e < lenh; e++){
-    fp2_copy(&h[e], &fg[e]);
-  }
-  return;
-}
-
-
-
-int main(){
-  fp2_t fp2_0, fp2_1;
-  #define nmax 16
-  int nf, ng, n, e;
-        fp2_set(&fp2_0, 0);
-        fp_mont_setone(fp2_1.re);fp_set(fp2_1.im,0); 
-  
-  //TEST MULTIPLICATION BY 0
-  
-  for(nf = 2; nf < nmax; nf++){
-    fp2_t f[nf], h[nf-1];
-
-    printf("[%3d%%] Testing multiplication by 0", 100 * nf / nmax);
-    fflush(stdout);
-    printf("\r\x1b[K");
-    
-    for(e = 0; e < nf; e++){
-      fp2_random(&f[e]);
-    }
-    poly_mul(h, f, nf, f, 0);
-    for(e = 0; e < nf-1; e++){
-      assert(fp2_is_zero(&h[e])==1);
-    }
-    poly_mul(h, f, 0, f, nf);
-    for(e = 0; e < nf-1; e++){
-      assert(fp2_is_zero(&h[e])==1);
-    }
-  }
-  printf("[%3d%%] Tested multiplication by 0:\t\tNo errors!\n", 100 * nf / nmax);
-
-  
-  
-  //TEST FOR f, g, h DISJOINT MEMORY SPACES
-  
-  for(nf = 1; nf < nmax; nf++){
-    
-    printf("[%3d%%] Testing multiplication", 100 * nf / nmax);
-    fflush(stdout);
-    printf("\r\x1b[K");
-    
-    for(ng = 1; ng < nmax; ng++){
-      
-      fp2_t f[nf];   //Random length nf poly
-      for(e = 0; e < nf; e++){
-	fp2_random(&f[e]);
-      }
-      
-      fp2_t g[ng];  // Random length ng poly
-      for(e = 0; e < ng; e++){
-	fp2_random(&g[e]);
-      }
-      
-      fp2_t h[nf+ng-1];// Compute product
-      poly_mul(h, f, nf, g, ng);
-
-      fp2_t fg[nf+ng-1]; // Compute the product by school method
-      slow_mul(fg, f, nf, g, ng);
-      
-      for(e = 0; e < nf + ng - 1; e++){   // Verify answer term by term
-	assert(fp2_isequal(h[e], fg[e])==1);
-      }
-    }
-  }
-  printf("[%3d%%] Tested multiplication:\t\t\tNo errors!\n", 100 * nf / nmax);
-
-  
-
-  // TEST FOR f, g CONTIGIOUS AND RESULT SAVED OVER THEM
-    
-  for(nf = 1; nf < nmax; nf++){
-          
-    printf("[%3d%%] Testing multiplication in place", 100 * nf / nmax);
-    fflush(stdout);
-    printf("\r\x1b[K");
-    
-    for(ng = 1; ng < nmax; ng++){
-      
-      fp2_t h[nf+ng];
-      
-      //Random length nf poly
-      for(e = 0; e < nf; e++){
-	fp2_random(&h[e]);
-      }
-      
-      // Random length ng poly
-      for(e = 0; e < ng; e++){
-	fp2_random(&h[e+nf]);
-      }
-
-      // Compute the product
-      fp2_t fg[nf+ng-1];
-      slow_mul(fg, h, nf, &(h[nf]), ng); // School method
-      poly_mul(h, h, nf, &(h[nf]), ng); // Karatsuba method
-
-
-      for(e = 0; e < nf + ng - 1; e++){   // Verify answer term by term
-	assert(fp2_isequal(h[e], fg[e])==1);
-      }
-    }
-  }
-    printf("[%3d%%] Tested multiplication in place:\t\tNo errors!\n", 100 * nf / nmax);
-
-    
-    
-  //TEST FOR MULTIPLICATION MOD X^N BY 0
-    
-  for(nf = 2; nf < nmax; nf++){
-    fp2_t f[nf];
-    
-    printf("[%3d%%] Testing mul mod x^n by 0", 100 * nf / nmax);
-    fflush(stdout);
-    printf("\r\x1b[K");
-    
-    for(e = 0; e < nf; e++){
-      fp2_random(&f[e]);
-    }
-    
-    for(n = 1; n < nmax; n++){
-      fp2_t h[n];
-      poly_mul_low(h, n, f, nf, f, 0);
-      for(e = 0; e < n; e++){
-	assert(fp2_is_zero(&h[e])==1);
-      }
-      poly_mul_low(h, n, f, 0, f, nf);
-      for(e = 0; e < n; e++){
-	assert(fp2_is_zero(&h[e])==1);
-      }
-    }
-  }
-  printf("[%3d%%] Tested mul mod x^n by 0:\t\t\tNo errors!\n", 100 * nf / nmax);
-
-  
-  
-  //TEST FOR MULTIPLICATION MOD X^N
-    
-    for(nf = 1; nf < nmax; nf++){
-    
-      printf("[%3d%%] Testing mul mod x^n", 100 * nf / nmax);
-      fflush(stdout);
-      printf("\r\x1b[K");
-      
-      for(ng = 1; ng < nmax; ng++){
-
-	fp2_t f[nf], g[ng], fg[nf+ng-1];
-	poly h;
-
-	//Get random polynomials
-	for(e = 0; e < nf; e++){
-	  fp2_random(&f[e]);
-	}
-	for(e = 0; e < ng; e++){
-	  fp2_random(&g[e]);
-	}
-	
-	//Save regular result to fg
-	slow_mul(fg, f, nf, g, ng);
-
-	//Compute result mod x^n
-	for(n = 1; n < 2*nmax; n++){
-	  h = malloc(sizeof(fp2_t)*n);
-	  poly_mul_low(h, n, f, nf, g, ng);
-
-	  //Compare with expected
-	  e = 0;
-	  while(e < nf+ng-1 && e < n){
-	    assert(fp2_isequal(h[e], fg[e]) == 1);
-	    e++;
-	  }
-	  while(e < n){
-	    assert(fp2_is_zero(&h[e]) == 1);
-	    e++;
-	  }
-	  free(h);
-	}
-      }
-    }
-    printf("[%3d%%] Tested mul mod x^n:\t\t\tNo errors!\n", 100 * nf / nmax);
-
-  
-     
-  //TEST FOR POLY_MUL_MIDDLE
-    
-    for(nf = 1; nf < 2*nmax; nf+=1){
-      fp2_t f[nf];
-      
-      printf("[%3d%%] Testing poly_mul_middle", 100 * nf / (2*nmax));
-      fflush(stdout);
-      printf("\r\x1b[K");
-      
-      for(ng = (nf+1)>>1; ng < (nf+1)-((nf+1)>>1); ng++){
-	// This runs from floor((nf+1)/2) to ceil((nf+1)/2)
-	fp2_t g[ng];
-	for(e = 0; e < nf; e++){
-	  fp2_random(&f[e]);
-	}
-	for(e = 0; e < ng; e++){
-	  fp2_random(&g[e]);
-	}
-	
-	fp2_t h[nf+ng-1];
-	slow_mul(h, g, ng, f, nf);
-	poly_mul_middle(g, g, ng, f, nf);
-      
-	for(e = 0; e < ng; e++){
-	  assert(fp2_isequal(h[e+nf-ng], g[e])==1);
-	}
-      }
-    }
-    printf("[%3d%%] Tested poly_mul_middle:\t\t\tNo errors!\n", 100 * nf / (2*nmax));
-
-  
-  // TEST FOR SELF RECIPROCAL MULTIPLICATION
-    for(nf = 1; nf < nmax; nf++){
-
-      printf("[%3d%%] Testing self reciprocal mul", 100 * nf / nmax);
-      fflush(stdout);
-      printf("\r\x1b[K");
-
-      for(ng = 1; ng < nmax; ng++){
-      
-	fp2_t f[nf], g[ng], h[nf+ng-1], fg[nf+ng-1];
-
-	// Get random palyndromes
-	for(e = 0; e < (nf>>1); e++){
-	  fp2_random(&f[e]);
-	  fp2_copy(&f[nf-1-e], &f[e]);
-	}
-	if(nf & 1){
-	  fp2_random(&f[nf>>1]);
-	}
-
-	for(e = 0; e < (ng>>1); e++){
-	  fp2_random(&g[e]);
-	  fp2_copy(&g[ng-1-e], &g[e]);
-	}
-	if(ng & 1){
-	  fp2_random(&g[ng>>1]);
-	} 
-
-	// Compute products
-	poly_mul_selfreciprocal(h, g, ng, f, nf);
-	slow_mul(fg, g, ng, f, nf);
-
-	// Compare
-	for(e = 0; e < nf+ng-1; e++){
-	  assert(fp2_isequal(fg[e], h[e])==1);
-	}
-      }
-    }		 
-    printf("[%3d%%] Tested self reciprocal mul:\t\tNo errors!\n", 100 * nf / nmax);
-
-  // TEST FOR PRODUCT TREES
-    int tree_size, iteration, i;
-    int  len, *DEG, LENF;
-    poly *H, *F, h;
-    
-    for(tree_size = 1; tree_size < nmax; tree_size++){
-
-      printf("[%3d%%] Testing product tree:\t\t\tSize %d out of %d", 100 * tree_size / nmax, tree_size, nmax-1);
-      fflush(stdout);
-      printf("\r\x1b[K");
-
-      i = 0;
-      while((1<<i) < tree_size){
-	i++;
-      }
-      DEG = malloc(sizeof(int)*((1<<(i+2))-1));
-      H = malloc(sizeof(poly)*((1<<(i+2))-1));
-      F = malloc(sizeof(poly)*tree_size);
-      h = malloc(sizeof(fp2_t)*(nmax+1)*tree_size);
-
-      for(iteration = 0; iteration < nmax + 1 - tree_size ; iteration++){
-
-	// Generate random list of polynomials
-	LENF = (rand() % nmax)+1;
-	for(i = 0; i < tree_size; i++){
-	  F[i] = malloc(sizeof(fp2_t)*LENF);
-	  for(e = 0; e < LENF; e++){
-	    fp2_random(&F[i][e]);
-	  }
-	}
-	product_tree(H, DEG, 0, F, LENF, tree_size);
-	
-	// Build product of all polynomials manually
-	len = LENF;
-	
-	//for(e = 0; e < LENF[0]; e++){
-	for(e = 0; e < LENF; e++){
-	  fp2_copy(&h[e], &F[0][e]);
-	}
-	for(i = 1; i < tree_size; i++){
-	  poly_mul(h, h, len, F[i], LENF);
-	  len += LENF-1;
-	}
-
-	// Compare to root
-	assert (len == DEG[0]+1);
-	for(e = 0; e < len; e++){
-	  assert(fp2_isequal(H[0][e], h[e])==1);
-	}
-      clear_tree(H, 0, tree_size);
-      for(i = 0; i < tree_size; i++){
-	free(F[i]);
-      }
-
-      }
-      free(DEG);
-      free(H);
-      free(F); 
-      free(h);
-    }
-    printf("[%3d%%] Tested product tree:\t\t\tNo errors!\n", 100 * tree_size / nmax);
-    
-  // TEST FOR SELF RECIPROCAL PRODUCT TREES
-    
-    for(tree_size = 1; tree_size < nmax; tree_size++){
-
-      printf("[%3d%%] Testing selfreciprocal product tree:\tSize %d out of %d", 100 * tree_size / nmax, tree_size, nmax-1);
-      fflush(stdout);
-      printf("\r\x1b[K");
-
-      i = 0;
-      while((1<<i) < tree_size){
-	i++;
-      }
-      DEG = malloc(sizeof(int)*((1<<(i+2))-1));
-      H = malloc(sizeof(poly)*((1<<(i+2))-1));
-      F = malloc(sizeof(poly)*tree_size);
-      h = malloc(sizeof(fp2_t)*(nmax+1)*tree_size);
-
-      for(iteration = 0; iteration < nmax + 1 - tree_size ; iteration++){
-
-	// Generate random list of polynomials
-	LENF = (rand() % nmax)+1;;
-	for(i = 0; i < tree_size; i++){
-	  F[i] = malloc(sizeof(fp2_t)*LENF);
-	  for(e = 0; e < (LENF>>1); e++){
-	    fp2_random(&F[i][e]);
-	    fp2_copy(&F[i][LENF-1-e], &F[i][e]);
-	  }
-	  if(LENF & 1){
-	  	fp2_random(&F[i][(LENF>>1)]);
-	  }
-	}
-	product_tree_selfreciprocal(H, DEG, 0, F, LENF, tree_size);
-	
-	// Build product of all polynomials manually
-	len = LENF;
-	for(e = 0; e < LENF; e++){
-	  fp2_copy(&h[e], &F[0][e]);
-	}
-	for(i = 1; i < tree_size; i++){
-	  poly_mul(h, h, len, F[i], LENF);
-	  len += LENF-1;
-	}
-
-	// Compare to root
-	assert (len == DEG[0]+1);
-	for(e = 0; e < len; e++){
-	  assert(fp2_isequal(H[0][e], h[e])==1);
-	}
-      clear_tree(H, 0, tree_size);
-      for(i = 0; i < tree_size; i++){
-	free(F[i]);
-      }
-
-      }
-      free(DEG);
-      free(H);
-      free(F); 
-      free(h);
-    }
-    printf("[%3d%%] Tested selfreciprocal product tree:\tNo errors!\n", 100 * tree_size / nmax);
-    
-    printf("-- All tests passed.\n");
-    return 0;
-}
-  
--- a/src/ec/ref/ecx/test/poly-redc-test.c
+++ b/src/ec/ref/ecx/test/poly-redc-test.c
@@ -1,461 +0,0 @@
-#include "poly.h"
-#include <assert.h>
-#include <stdio.h>
-#define nmax 32
-
-bool fp2_isequal(fp2_t a, fp2_t b){
-    return fp_is_equal(a.re, b.re) && fp_is_equal(a.im, b.im);
-}
-
-// VERY NOT SECURE (testing only)
-void fp2_random(fp2_t *a){
-    for(int i = 0; i < NWORDS_FIELD; i++){
-        a->re[i] = rand();
-        a->im[i] = rand();
-    }
-    // Normalize
-    fp2_t one;
-    fp_mont_setone(one.re);fp_set(one.im,0);
-    fp2_mul(&*a, &*a, &one);
-    // Update seed
-    srand((unsigned) a->re[0]);
-}
-
-int main(){
-  fp2_t fp2_0, fp2_1;
-  fp2_set(&fp2_0, 0);
-  fp_mont_setone(fp2_1.re);fp_set(fp2_1.im,0);
-
-  int lenf, leng, n, e, iteration, array_size, tree_size, i, root, brother, *DEG, LENF;
-  poly f, g, h, f_rev, f_rev_inv, *F, *H, *R, g1, g2, REM1, REM2, G1, G2, G1_rev, G2_rev, R0;
-  fp2_t c, *A, *C, ratio, A0;
-  
-  f_rev_inv = 0;
-  
-// TEST FOR RECIPROCAL
-  for(lenf = 1; lenf < nmax; lenf++)
-  {  
-    printf("[%3d%%] Testing reciprocals", 100 * lenf / nmax);
-    fflush(stdout);
-    printf("\r\x1b[K");
-
-    // Get random poly
-    f = malloc(sizeof(fp2_t)*lenf);
-    for(e = 0; e < lenf; e++)
-      fp2_random(&f[e]);
-
-    for(n = 1; n < nmax; n++)
-    {
-      // Get the reciprocal and multiply them
-      h = malloc(sizeof(fp2_t)*n);
-      memset(h, 0, sizeof(fp2_t)*n);
-      reciprocal(h, &c, f, lenf, n);
-      poly_mul_low(h, n, f, lenf, h, n);
-
-      // Compare with expected
-      assert(fp2_isequal(h[0],c));
-      for(e = 1;  e < n; e++)
-	assert(fp2_is_zero(&h[e]));
-      free(h);
-    }
-    free(f); 
-  }
-  printf("[%3d%%] Tested reciprocals:\t\tNo errors!\n", 100 * lenf / nmax);
-  
-  
-
-  // TEST FOR REDUCTION
-  for(lenf = 2; lenf < nmax; lenf++)
-  {
-    printf("[%3d%%] Testing polynomial reduction", 100 * lenf / nmax);
-    fflush(stdout);
-    printf("\r\x1b[K");
-
-    // Get random poly for the mod
-    f = malloc(sizeof(fp2_t)*lenf);
-    f_rev = malloc(sizeof(fp2_t)*lenf);
-    for(e = 0; e < lenf; e++)
-    {
-      fp2_random(&f[e]);
-      fp2_copy(&f_rev[lenf-1-e], &f[e]);
-    }
-
-    for(leng = 1; leng < nmax; leng++)
-    {
-      // Get random poly to reduce
-      g = malloc(sizeof(fp2_t)*leng);
-      for(e = 0; e < leng; e++){
-	fp2_random(&g[e]);
-      }
-
-      // Get reverse-inverse mod x^(leng-lenf+1)
-      if(leng >= lenf)
-      {
-	f_rev_inv = malloc(sizeof(fp2_t)*(leng-lenf+1));
-	reciprocal(f_rev_inv, &c, f_rev, lenf, leng-lenf+1);
-      }
-      else{
-	fp_mont_setone(c.re);fp_set(c.im,0);
-      }
-	
-      // Compute the reduction
-      h = malloc(sizeof(fp2_t)*(lenf-1));
-      poly_redc(h, g, leng, f, lenf, f_rev_inv, c);
-
-      // Reduce manually
-      int leng_red = leng;
-      fp2_t scale, f_e;
-      while(leng_red >= lenf)
-      {
-	fp2_copy(&scale, &f[lenf-1]);
-	fp2_inv(&scale);
-	fp2_mul(&scale, &scale, &g[leng_red-1]);
-	for(e = 0; e < lenf; e++)
-	  {
-	    fp2_mul(&f_e, &f[e], &scale);
-	    fp2_sub(&g[e+leng_red-lenf], &g[e+leng_red-lenf], &f_e);
-	  }
-	leng_red--;
-      }
-
-      // Rescale manual result
-      if( leng < lenf){
-	      fp_mont_setone(scale.re);fp_set(scale.im,0);
-      }
-      else
-	if(lenf == 2 && leng == 3)
-	{
-	  fp2_sqr(&scale, &f[1]);
-	  fp2_add(&scale, &scale, &scale);
-	}
-	else
-	  fp2_copy(&scale, &c);
-      for(e = 0; e < leng_red; e++)
-	fp2_mul(&g[e], &g[e], &scale);
-     
-
-      // Comapre results
-      for(e = leng_red-1; e >= 0; e--)
-	      assert(fp2_isequal(h[e], g[e]));
-      for(e = leng_red; e < lenf-1; e++)
-	      assert(fp2_is_zero(&h[e]));
-      
-      free(g);
-      free(h);
-      if(leng >= lenf)
-	free(f_rev_inv);
-    }
-    free(f);
-    free(f_rev);
-  }
-  printf("[%3d%%] Tested polynomial reduction:\tNo errors!\n", 100 * lenf / nmax);
-
-  
-
-// TEST FOR RECIPROCAL TREES
-  
-  for(tree_size = 3; tree_size < nmax; tree_size++)
-  {
-    printf("[%3d%%] Testing reciprocal tree:\t\tTree size %d out of %d", 100 * tree_size / nmax, tree_size, nmax);
-    fflush(stdout);
-    printf("\r\x1b[K");
-    
-    // Compute size of arrays
-    i = 0;
-    while((1<<i) < tree_size){
-      i++;
-    }
-    array_size = (1<<(i+2))-1;
-    
-    DEG = malloc(sizeof(int)*array_size);
-    H = malloc(sizeof(poly)*array_size);
-    R = malloc(sizeof(poly)*array_size);
-    F = malloc(sizeof(poly)*tree_size);
-    A = malloc(sizeof(fp2_t)*array_size);
-    
-    // Get random polys
-    LENF = 2;
-    for(i = 0; i < tree_size; i++)
-    {
-      F[i] = malloc(sizeof(fp2_t)*LENF);
-      for(e = 0; e < LENF; e++){
-	      fp2_random(&F[i][e]);
-      }
-    }
-    
-    // Get product tree then reciprocal tree
-    product_tree(H, DEG, 0, F, LENF, tree_size);
-    leng = DEG[0]+1+(rand() % nmax);
-    reciprocal_tree(R, A, leng, H, DEG, 0, tree_size);
-    
-    // Check the root
-    root = 0;
-    lenf = leng-DEG[root];
-    f = malloc(sizeof(fp2_t)*lenf);
-    for(e = 0; e < DEG[root]+1 && e < lenf; e++){
-      fp2_copy(&f[e], &H[root][DEG[root]-e]);
-    }
-    for(e = DEG[root]+1; e < lenf; e++){
-      fp2_set(&f[e], 0);
-    }
-    poly_mul_low(f, lenf, f, lenf, R[root], lenf);
-    assert(fp2_isequal(f[0], A[root]));
-    for(e = 1; e < lenf; e++){
-      assert(fp2_is_zero(&f[e]));
-    }
-    free(f);
-    
-    // Perform random walks
-    for(iteration = 0; iteration < nmax - tree_size; iteration++)
-    {
-      root = 0;
-      n = tree_size;
-      while(n > 1)
-      {
-	if(rand() & 1)
-	{
-	  root = 2*root+1;
-	  n = n - (n>>1);
-	}
-	else
-	{
-	  root = 2*root+2;
-	  n = n>>1;
-	}
-	brother = root - 1 + 2*(root & 1);
-	
-	// Check current node
-	if(DEG[root] > 2)
-	{
-	  lenf = DEG[brother];
-	  f = malloc(sizeof(fp2_t)*lenf);
-	  for(e = 0; e < DEG[root]+1 && e < lenf; e++){
-	    fp2_copy(&f[e], &H[root][DEG[root]-e]);
-    }
-	  for(e = DEG[root]+1; e < lenf; e++){
-	    fp2_set(&f[e], 0);
-    }
-	  poly_mul_low(f, lenf, f, lenf, R[root], lenf);
-	  assert(fp2_isequal(f[0], A[root]));
-	  for(e = 1; e < lenf; e++){
-	    assert(fp2_is_zero(&f[e]));
-    }
-	  free(f);
-	}
-      }
-    }
-    // Clean up
-    for(i = 0; i < tree_size; i++)
-      free(F[i]);
-    clear_tree(H, 0, tree_size);
-    clear_tree(R, 0, tree_size);
-    free(F);
-    free(H);
-    free(R);
-    free(A);
-    free(DEG);
-  }
-  printf("[%3d%%] Tested reciprocal tree:\t\tNo errors!\n", 100 * tree_size / nmax);
-  
-  
-
-  // TEST FOR REMAINDERS
-  for(tree_size = 2; tree_size < nmax; tree_size++)
-  {
-    printf("[%3d%%] Testing batched remainders:\t\tTree size %d out of %d", 100 * tree_size / nmax, tree_size, nmax);
-    fflush(stdout);
-    printf("\r\x1b[K");
-    
-    // Compute size of arrays
-    i = 0;
-    while((1<<i) < tree_size)
-      i++;
-    array_size = (1<<(i+2))-1;
-    
-    DEG = malloc(sizeof(int)*array_size);
-    H = malloc(sizeof(poly)*array_size);
-    R = malloc(sizeof(poly)*array_size);
-    F = malloc(sizeof(poly)*tree_size);
-    A = malloc(sizeof(fp2_t)*array_size);
-    REM1 = malloc(sizeof(fp2_t)*array_size);
-    REM2 = malloc(sizeof(fp2_t)*array_size);
-    C = malloc(sizeof(fp2_t)*tree_size);
-    
-    // Get random polys
-    LENF = 2;
-    for(i = 0; i < tree_size; i++)
-    {
-      F[i] = malloc(sizeof(fp2_t)*LENF);
-      for(e = 0; e < LENF; e++)
-	fp2_random(&F[i][e]);
-    }
-    
-    // Get product tree, reciprocal tree, and remainders
-    product_tree(H, DEG, 0, F, LENF, tree_size);
-    leng = DEG[0]+1+(rand() % nmax);
-    g1 = malloc(sizeof(fp2_t)*leng);
-    g2 = malloc(sizeof(fp2_t)*leng);
-    for(e = 0; e < leng; e++)
-    {
-      fp2_random(&g1[e]);
-      fp2_random(&g2[e]);
-    }
-    reciprocal_tree(R, A, leng, H, DEG, 0, tree_size);
-    multieval_unscaled(REM1, g1, leng, R, (const fp2_t*)A, H, DEG, 0, tree_size);
-    multieval_unscaled(REM2, g2, leng, R, (const fp2_t*)A, H, DEG, 0, tree_size);
-    
-    for(i = 0; i < tree_size; i++)
-    {
-      // Get ratio of the remainder
-      fp2_inv(&REM1[i]);
-      fp2_mul(&ratio, &REM1[i], &REM2[i]);
-      
-      // Compute remainders manually
-      f_rev = malloc(sizeof(fp2_t)*LENF);
-      f_rev_inv = malloc(sizeof(fp2_t)*(leng-LENF+1));
-      h = malloc(sizeof(fp2_t)*(LENF-1));
-      for(e = 0; e < LENF; e++)
-	fp2_copy(&f_rev[e], &F[i][LENF-1-e]);
-      reciprocal(f_rev_inv, &c, f_rev, LENF, leng-LENF+1);
-      poly_redc(h, g1, leng, F[i], LENF, f_rev_inv, c);
-      fp2_copy(&REM1[i], &h[0]);
-      poly_redc(h, g2, leng, F[i], LENF, f_rev_inv, c);
-      fp2_copy(&REM2[i], &h[0]);
-      free(f_rev);
-      free(f_rev_inv);
-      free(h);
-
-      // Compare results
-      fp2_inv(&REM1[i]);
-      fp2_mul(&REM1[i], &REM1[i], &REM2[i]);
-      assert(fp2_isequal(REM1[i], ratio));
-    }
-		 
-    // Clean up
-    for(i = 0; i < tree_size; i++)
-      free(F[i]);
-    free(g1);
-    free(g2);
-    clear_tree(H, 0, tree_size);
-    clear_tree(R, 0, tree_size);
-    free(F);
-    free(H);
-    free(R);
-    free(A);
-    free(DEG);
-    free(REM1);
-    free(REM2);
-    free(C);
-  } 
-  printf("[%3d%%] Tested batched remainders:\tNo errors!\n", 100 * tree_size / nmax);
-  
-
-
-// TEST FOR SCALED REMAINDER TREE
-  for(tree_size = 1; tree_size < nmax; tree_size++)
-  {
-    printf("[%3d%%] Testing scaled remainder tree:\tTree size %d out of %d", 100 * tree_size / nmax, tree_size, nmax);
-    fflush(stdout);
-    printf("\r\x1b[K");
-    
-    // Compute size of arrays
-    i = 0;
-    while((1<<i) < tree_size)
-      i++;
-    array_size = (1<<(i+2))-1;
-    
-    DEG = malloc(sizeof(int)*array_size);
-    H = malloc(sizeof(poly)*array_size);
-    F = malloc(sizeof(poly)*tree_size);
-    REM1 = malloc(sizeof(fp2_t)*array_size);
-    REM2 = malloc(sizeof(fp2_t)*array_size);
-    
-    // Get random polys
-    LENF = 2;
-    for(i = 0; i < tree_size; i++)
-    {
-      F[i] = malloc(sizeof(fp2_t)*LENF);
-      for(e = 0; e < LENF; e++)
-	fp2_random(&F[i][e]);
-    }
-    
-    // Get random polys to reduce
-    product_tree(H, DEG, 0, F, LENF, tree_size);
-    leng = DEG[0]+1+(rand() % nmax);
-    g1 = malloc(sizeof(fp2_t)*leng);
-    g2 = malloc(sizeof(fp2_t)*leng);
-    for(e = 0; e < leng; e++)
-    {
-      fp2_random(&g1[e]);
-      fp2_random(&g2[e]);
-    }
-
-    // Get the required initial nodes
-    G1 = malloc(sizeof(fp2_t)*DEG[0]);
-    G2 = malloc(sizeof(fp2_t)*DEG[0]);
-    G1_rev = malloc(sizeof(fp2_t)*DEG[0]);
-    G2_rev = malloc(sizeof(fp2_t)*DEG[0]);
-    R0 = malloc(sizeof(fp2_t)*(leng));
-    f_rev = malloc(sizeof(fp2_t)*(DEG[0]+1));
-    for(e = 0; e < DEG[0]+1; e++)
-      fp2_copy(&f_rev[e], &H[0][DEG[0]-e]);
-    if( DEG[0] > leng-DEG[0])
-      reciprocal(R0, &A0, f_rev, DEG[0]+1, DEG[0]);
-    else
-      reciprocal(R0, &A0, f_rev, DEG[0]+1, leng-DEG[0]);
-    poly_redc(G1, g1, leng, H[0], DEG[0]+1, R0, A0);
-    poly_redc(G2, g2, leng, H[0], DEG[0]+1, R0, A0);
-    for(e = 0; e < DEG[0]; e++)
-    {
-      fp2_copy(&G1_rev[e], &G1[DEG[0]-1-e]);
-      fp2_copy(&G2_rev[e], &G2[DEG[0]-1-e]);
-    }
-    poly_mul_middle(G1_rev, G1_rev, DEG[0], R0, DEG[0]);
-    poly_mul_middle(G2_rev, G2_rev, DEG[0], R0, DEG[0]);
-    for(e = 0; e < DEG[0]; e++)
-    {
-      fp2_copy(&G1[e], &G1_rev[DEG[0]-1-e]);
-      fp2_copy(&G2[e], &G2_rev[DEG[0]-1-e]);
-    }
-    free(G1_rev);free(G2_rev);free(R0);free(f_rev);
-
-    // Compute the scaled remainder trees
-    multieval_scaled(REM1, G1, H, DEG, 0, tree_size);
-    multieval_scaled(REM2, G2, H, DEG, 0, tree_size);
-    
-    for(i = 0; i < tree_size; i++)
-    {
-      // Get ratio of the remainder
-      fp2_inv(&REM1[i]);
-      fp2_mul(&ratio, &REM1[i], &REM2[i]);
-
-      // Compute remainders manually
-      f_rev = malloc(sizeof(fp2_t)*LENF);
-      f_rev_inv = malloc(sizeof(fp2_t)*(leng-LENF+1));
-      h = malloc(sizeof(fp2_t)*(LENF-1));
-      for(e = 0; e < LENF; e++)
-	fp2_copy(&f_rev[e], &F[i][LENF-1-e]);
-      reciprocal(f_rev_inv, &c, f_rev, LENF, leng-LENF+1);
-      poly_redc(h, g1, leng, F[i], LENF, f_rev_inv, c);
-      fp2_copy(&REM1[i], &h[0]);
-      poly_redc(h, g2, leng, F[i], LENF, f_rev_inv, c);
-      fp2_copy(&REM2[i], &h[0]);
-      free(f_rev);free(f_rev_inv);free(h);
-
-      // Compare results
-      fp2_inv(&REM1[i]);
-      fp2_mul(&REM1[i], &REM1[i], &REM2[i]);
-      assert(fp2_isequal(REM1[i], ratio));
-    }
-		 
-    // Clean up
-    for(i = 0; i < tree_size; i++)
-      free(F[i]);
-    free(F);free(g1);free(g2);free(G1);free(G2);
-    clear_tree(H, 0, tree_size);free(H);free(DEG);
-    free(REM1);free(REM2);
-  } 
-  printf("[%3d%%] Tested scaled remainder tree:\tNo errors!\n", 100 * tree_size / nmax);
-  
-  printf("-- All tests passed.\n");
-}
--- a/src/ec/ref/ecx/test/test_extras.c
+++ b/src/ec/ref/ecx/test/test_extras.c
@@ -1,75 +0,0 @@
-#include "test_extras.h"
-#include <bench.h>
-
-// Global constants
-extern const digit_t p[NWORDS_FIELD];
-extern const digit_t R2[NWORDS_FIELD];
-
-
-#if 0
-int64_t cpucycles(void)
-{ // Access system counter for benchmarking
-    unsigned int hi, lo;
-
-    asm volatile ("rdtsc\n\t" : "=a" (lo), "=d"(hi));
-    return ((int64_t)lo) | (((int64_t)hi) << 32);
-}
-#endif
-
-
-int compare_words(digit_t* a, digit_t* b, unsigned int nwords)
-{ // Comparing "nword" elements, a=b? : (1) a>b, (0) a=b, (-1) a<b
-  // SECURITY NOTE: this function does not have constant-time execution. TO BE USED FOR TESTING ONLY.
-    int i;
-
-    for (i = nwords-1; i >= 0; i--)
-    {
-        if (a[i] > b[i]) return 1;
-        else if (a[i] < b[i]) return -1;
-    }
-
-    return 0; 
-}
-
-
-void sub_test(digit_t* out, digit_t* a, digit_t* b, unsigned int nwords)
-{ // Subtraction without borrow, out = a-b where a>b
-  // SECURITY NOTE: this function does not have constant-time execution. It is for TESTING ONLY.     
-    unsigned int i;
-    digit_t res, carry, borrow = 0;
-  
-    for (i = 0; i < nwords; i++)
-    {
-        res = a[i] - b[i];
-        carry = (a[i] < b[i]);
-        out[i] = res - borrow;
-        borrow = carry || (res < borrow);
-    } 
-}
-
-
-void fprandom_test(digit_t* a)
-{ // Generating a pseudo-random field element in [0, p-1] 
-  // SECURITY NOTE: distribution is not fully uniform. TO BE USED FOR TESTING ONLY.
-    unsigned int i, diff = 256-254, nwords = NWORDS_FIELD;
-    unsigned char* string = NULL;
-
-    string = (unsigned char*)a;
-    for (i = 0; i < sizeof(digit_t)*nwords; i++) {
-        *(string + i) = (unsigned char)rand();              // Obtain 256-bit number
-    }
-    a[nwords-1] &= (((digit_t)(-1) << diff) >> diff);
-
-    while (compare_words((digit_t*)p, a, nwords) < 1) {  // Force it to [0, modulus-1]
-        sub_test(a, a, (digit_t*)p, nwords);
-    }
-}
-
-
-void fp2random_test(fp2_t* a)
-{ // Generating a pseudo-random element in GF(p^2) 
-  // SECURITY NOTE: distribution is not fully uniform. TO BE USED FOR TESTING ONLY.
-
-    fprandom_test(a->re);
-    fprandom_test(a->im);
-}
--- a/src/ec/ref/ecx/test/test_extras.h
+++ b/src/ec/ref/ecx/test/test_extras.h
@@ -1,29 +0,0 @@
-
-#ifndef TEST_EXTRAS_H
-#define TEST_EXTRAS_H
-
-#include <time.h>
-#include <stdlib.h>
-#include <fp.h>
-#include <fp2.h>
-#include <curve_extras.h>
-
-#define PASSED    0
-#define FAILED    1
-    
-// Access system counter for benchmarking
-//int64_t cpucycles(void);
-
-// Comparing "nword" elements, a=b? : (1) a!=b, (0) a=b
-int compare_words(digit_t* a, digit_t* b, unsigned int nwords);
-
-// Multiprecision subtraction for testing, assumes a > b
-void sub_test(digit_t* out, digit_t* a, digit_t* b, unsigned int nwords);
-
-// Generating a pseudo-random field element in [0, p-1] 
-void fprandom_test(digit_t* a);
-
-// Generating a pseudo-random element in GF(p^2)
-void fp2random_test(fp2_t* a);
-
-#endif
--- a/src/ec/ref/ecx/test/velu-test.c
+++ b/src/ec/ref/ecx/test/velu-test.c
@@ -1,298 +0,0 @@
-#include<time.h>
-#include <stdio.h>
-#include <assert.h>
-#include <inttypes.h>
-
-#include "isog.h"
-#include "sdacs.h"
-#include "ec.h"
-#include "test-basis.h"
-
-void random_scalar(fp_t k, const uint8_t j)
-{
-    for(int i = 0; i < NWORDS_FIELD; i++)
-        k[i] = rand();
-}
-
-// Affine Montgomery coefficient computation (A + 2C : 4C) --> A/C
-void coeff(fp2_t *B, ec_point_t const A)
-{
-	fp2_t t;
-	fp2_add(&t, &A.x, &A.x);	// (2 * A24)
-	fp2_sub(&t, &t, &A.z);	// (2 * A24) - C24
-
-	fp2_copy(&*B, &A.z);
-	fp2_inv(&*B);		// 1 / (C24)
-	fp2_add(&t, &t, &t);	// 4*A = 2[(2 * A24) - C24]
-	fp2_mul(&*B, &t, &*B);	// A/C = 2[(2 * A24) - C24] / C24
-}
-
-// Determines if point is fp2-rational (if not, then it must be a zero trace point)
-uint8_t isrational(ec_point_t const T, fp2_t const a)
-{
-	fp2_t XT, tmp, aux, YT_squared;
-
-	fp2_copy(&XT, &T.z);
-	fp2_inv(&XT);
-
-	fp2_mul(&XT, &XT, &T.x);
-
-	fp2_sqr(&tmp, &XT);
-	fp2_mul(&aux, &tmp, &XT);
-	fp2_mul(&tmp, &tmp, &a);
-	fp2_add(&YT_squared, &tmp, &aux);
-	fp2_add(&YT_squared, &YT_squared, &XT);
-
-	return fp2_is_square(&YT_squared);
-}
-
-// ladder3pt computes x(P + [m]Q)
-void ladder3pt(ec_point_t *R, fp_t const m, ec_point_t const *P, ec_point_t const *Q, ec_point_t const *PQ, ec_point_t const *A)
-{
-	ec_point_t X0, X1, X2;
-	copy_point(&X0, Q);
-	copy_point(&X1, P);
-	copy_point(&X2, PQ);
-
-	int i,j;
-	uint64_t t;
-	for (i = 0; i < NWORDS_FIELD; i++)
-	{
-		t = 1;
-		for (j = 0 ; j < 64; j++)
-		{
-			swap_points(&X1, &X2, -((t & m[i]) == 0));
-			xDBLADD(&X0, &X1, &X0, &X1, &X2, A);
-			swap_points(&X1, &X2, -((t & m[i]) == 0));
-			t <<= 1;
-		};
-	};
-	copy_point(R, &X1);
-}
-
-// The projective x-coordinate point (X : Z) at infinity is such that Z == 0
-static inline int isinfinity(ec_point_t const P)
-{
-	return fp2_is_zero(&P.z);
-}
-
-int main()
-{
-	
-	fp2_t fp2_0, fp2_1;
-	fp2_set(&fp2_0, 0);
-	fp_mont_setone(fp2_1.re);fp_set(fp2_1.im,0);
-
-	int i, j;
-
-	ec_point_t A, B, T;
-	fp2_set(&A.x, 0);
-	fp_mont_setone(A.z.re);fp_set(A.z.im,0);
-	
-	// fp2_add(&A.x, &A.z, &A.x);	// 1
-	// fp2_add(&A.x, &A.x, &A.x);	// 2
-	// fp2_add(&A.x, &A.z, &A.x);	// 3
-	// fp2_add(&A.x, &A.x, &A.x);	// 6
-
-	fp2_add(&A.z, &A.z, &A.z);	// 2C
-	fp2_add(&A.x, &A.x, &A.z);	// A' + 2C
-	fp2_add(&A.z, &A.z, &A.z);	// 4C
-
-	// Just to ensure the projective curve coeffientes are different from zero
-	assert( !fp2_is_zero(&A.x) & !fp2_is_zero(&A.x) );
-
-	fp2_t a;
-	coeff(&a, A);
-
-	ec_point_t PA, QA, PQA, PB, QB, PQB, RA, RB;
-
-	// Writing the public projective x-coordinate points into Montogmery domain
-	fp2_tomont(&(PA.x), &(xPA));
-	fp_mont_setone(PA.z.re);fp_set(PA.z.im,0);
-	fp2_tomont(&(QA.x), &(xQA));
-	fp_mont_setone(QA.z.re);fp_set(QA.z.im,0);
-	fp2_tomont(&(PQA.x), &(xPQA));
-	fp_mont_setone(PQA.z.re);fp_set(PQA.z.im,0);
-
-	assert( isrational(PA, a) );
-	assert( isrational(QA, a) );
-	assert( isrational(PQA, a) );
-
-	fp2_tomont(&(PB.x), &(xPB));
-	fp_mont_setone(PB.z.re);fp_set(PB.z.im,0);
-	fp2_tomont(&(QB.x), &(xQB));
-	fp_mont_setone(QB.z.re);fp_set(QB.z.im,0);
-	fp2_tomont(&(PQB.x), &(xPQB));
-	fp_mont_setone(PQB.z.re);fp_set(PQB.z.im,0);
-
-	assert( !isrational(PB, a) );
-	assert( !isrational(QB, a) );
-	assert( !isrational(PQB, a) );
-	// ======================================================================================================
-	// Recall, PA, QA, and PQA are expeted to be N-order points, but we require to ensure they are of order N
-	for (j = 0; j < P_LEN; j++)
-	{
-		for (i = 1; i < TORSION_ODD_POWERS[j]; i++)
-		{
-			xMULv2(&PA, &PA, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
-			xMULv2(&QA, &QA, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
-			xMULv2(&PQA, &PQA, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
-
-			assert( isrational(PA, a) );
-			assert( isrational(QA, a) );
-			assert( isrational(PQA, a) );
-		};
-	};
-
-	assert( !isinfinity(PA) );
-	assert( !isinfinity(QA) );
-	assert( !isinfinity(PQA) );
-
-	// --------------------------------------------------------------
-	fp_t m;
-	random_scalar(m, 0);
-	ladder3pt(&RA, m, &PA, &QA, &PQA, &A);
-	for (i = 0; i < P_LEN; i++)
-	{
-		printf("// Processing the %d-th prime:\t", i + 1);
-		printf("%2d%%", 100 * i / (int)P_LEN);
-		fflush(stdout);
-		printf("\r\x1b[K");
-
-		copy_point(&T, &RA);
-		for (j = (i+1); j < P_LEN; j++)
-			xMULv2(&T, &T, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
-
-		assert( !isinfinity(T) );
-
-		kps(i, T, A);
-		if (TORSION_ODD_PRIMES[i] > gap)
-			printf("[\033[0;31m%7" PRId64 "\033[0m] (#I: %3d, #J: %3d, #K: %3d) \n", TORSION_ODD_PRIMES[i], sI, sJ, sK);
-		else
-			printf("[\033[0;31m%7" PRId64 "\033[0m] --------------------------- \n", TORSION_ODD_PRIMES[i]);
-
-		xisog(&B, i, A);
-
-		xeval(&PB, i, PB, A);
-		coeff(&a, B);
-		assert( !isinfinity(PB) );
-		assert( !isrational(PB, a) );
-
-		xeval(&RA, i, RA, A);
-		assert( (!isinfinity(RA) && (i < (P_LEN - 1))) || (isinfinity(RA) && (i == (P_LEN - 1))) );
-		assert( (isrational(RA, a) && (i < (P_LEN - 1))) || (isinfinity(RA) && (i == (P_LEN - 1))) );
-		
-		copy_point(&A, &B);
-		// Verifying the order of the image point of  PA has been reduced 
-		copy_point(&T, &RA);
-		for (j = (i+1); j < P_LEN; j++)
-			xMULv2(&T, &T, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
-
-		assert( isinfinity(T) );
-		kps_clear(i);
-	};
-
-	fp2_set(&A.x, 0);
-	fp_mont_setone(A.z.re);fp_set(A.z.im,0);
-	
-	// fp2_add(&A.x, &A.z, &A.x);	// 1
-	// fp2_add(&A.x, &A.x, &A.x);	// 2
-	// fp2_add(&A.x, &A.z, &A.x);	// 3
-	// fp2_add(&A.x, &A.x, &A.x);	// 6
-
-	fp2_add(&A.z, &A.z, &A.z);	// 2C
-	fp2_add(&A.x, &A.x, &A.z);	// A' + 2C
-	fp2_add(&A.z, &A.z, &A.z);	// 4C
-
-	// Just to ensure the projective curve coeffientes are different from zero
-	assert( !fp2_is_zero(&A.x) & !fp2_is_zero(&A.x) );
-
-	coeff(&a, A);
-	// Writing the public projective x-coordinate points into Montogmery domain
-	fp2_tomont(&(PA.x), &(xPA));
-	fp_mont_setone(PA.z.re);fp_set(PA.z.im,0);
-	fp2_tomont(&(QA.x), &(xQA));
-	fp_mont_setone(QA.z.re);fp_set(QA.z.im,0);
-	fp2_tomont(&(PQA.x), &(xPQA));
-	fp_mont_setone(PQA.z.re);fp_set(PQA.z.im,0);
-
-	assert( isrational(PA, a) );
-	assert( isrational(QA, a) );
-	assert( isrational(PQA, a) );
-
-	fp2_tomont(&(PB.x), &(xPB));
-	fp_mont_setone(PB.z.re);fp_set(PB.z.im,0);
-	fp2_tomont(&(QB.x), &(xQB));
-	fp_mont_setone(QB.z.re);fp_set(QB.z.im,0);
-	fp2_tomont(&(PQB.x), &(xPQB));
-	fp_mont_setone(PQB.z.re);fp_set(PQB.z.im,0);
-
-	assert( !isrational(PB, a) );
-	assert( !isrational(QB, a) );
-	assert( !isrational(PQB, a) );
-
-	// ======================================================================================================
-	// Recall, PA, QA, and PQA are expeted to be N-order points, but we require to ensure they are of order N
-	for (j = P_LEN; j < (P_LEN+M_LEN); j++)
-	{
-		for (i = 1; i < TORSION_ODD_POWERS[j]; i++)
-		{
-			xMULv2(&PB, &PB, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
-			xMULv2(&QB, &QB, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
-			xMULv2(&PQB, &PQB, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
-
-			assert( !isrational(PB, a) );
-			assert( !isrational(QB, a) );
-			assert( !isrational(PQB, a) );
-		};
-	};
-
-	assert( !isinfinity(PB) );
-	assert( !isinfinity(QB) );
-	assert( !isinfinity(PQB) );
-
-	random_scalar(m, 1);
-	ladder3pt(&RB, m, &PB, &QB, &PQB, &A);
-	for (i = P_LEN; i < (P_LEN+M_LEN); i++)
-	{
-		printf("// Processing the %d-th prime:\t", i + 1);
-		printf("%2d%%", 100 * i / (int)(P_LEN+M_LEN));
-		fflush(stdout);
-		printf("\r\x1b[K");
-
-		copy_point(&T, &RB);
-		for (j = (i+1); j < (P_LEN+M_LEN); j++)
-			xMULv2(&T, &T, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
-
-		assert( !isinfinity(T) );
-
-		kps(i, T, A);
-		if (TORSION_ODD_PRIMES[i] > gap)
-			printf("[\033[0;31m%7" PRId64 "\033[0m] (#I: %3d, #J: %3d, #K: %3d) \n", TORSION_ODD_PRIMES[i], sI, sJ, sK);
-		else
-			printf("[\033[0;31m%7" PRId64 "\033[0m] --------------------------- \n", TORSION_ODD_PRIMES[i]);
-
-		xisog(&B, i, A);
-
-		xeval(&PA, i, PA, A);
-		coeff(&a, B);
-		assert( !isinfinity(PA) );
-		assert( isrational(PA, a) );
-
-		xeval(&RB, i, RB, A);
-		assert( (!isinfinity(RB) && (i < (P_LEN + M_LEN - 1))) || (isinfinity(RB) && (i == (P_LEN + M_LEN - 1))) );
-		assert( (!isrational(RB, a) && (i < (P_LEN + M_LEN - 1))) || (isinfinity(RB) && (i == (P_LEN + M_LEN - 1))) );
-	
-		copy_point(&A, &B);
-		// Verifying the order of the image point of  PB has been reduced 
-		copy_point(&T, &RB);
-		for (j = (i+1); j < (P_LEN+M_LEN); j++)
-			xMULv2(&T, &T, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
-
-		assert( isinfinity(T) );
-		kps_clear(i);
-	};
-
-	printf("-- All tests passed!\n");
-	return 0;
-}
--- a/src/ec/ref/ecx/xeval.c
+++ b/src/ec/ref/ecx/xeval.c
@@ -1,299 +0,0 @@
-#include "isog.h"
-#include "ec.h"
-#include <assert.h>
-
-// -----------------------------------------------------------------------------------------
-// -----------------------------------------------------------------------------------------
-// Traditional isogeny evaluation (xEVAL)
-
-// CrissCross procedure as described in Hisil and Costello paper
-void CrissCross(fp2_t *r0, fp2_t *r1, fp2_t const alpha, fp2_t const beta, fp2_t const gamma, fp2_t const delta)
-{
-	fp2_t t_1, t_2;
-
-	fp2_mul(&t_1, &alpha, &delta);
-    	fp2_mul(&t_2, &beta, &gamma);
-	fp2_add(&*r0, &t_1, &t_2);
-	fp2_sub(&*r1, &t_1, &t_2);
-}
-
-// Degree-2 isogeny evaluation with kenerl generated by P != (0, 0)
-void xeval_2(ec_point_t* R, ec_point_t* const Q, const int lenQ)
-{
-	fp2_t t0, t1, t2;
-	for(int j = 0; j < lenQ; j++){
-		fp2_add(&t0, &Q[j].x, &Q[j].z);
-		fp2_sub(&t1, &Q[j].x, &Q[j].z);
-		fp2_mul(&t2, &K[0].x, &t1);
-		fp2_mul(&t1, &K[0].z, &t0);
-		fp2_add(&t0, &t2, &t1);
-		fp2_sub(&t1, &t2, &t1);
-		fp2_mul(&R[j].x, &Q[j].x, &t0);
-		fp2_mul(&R[j].z, &Q[j].z, &t1);
-	}
-}
-
-// Degree-4 isogeny evaluation with kenerl generated by P such that [2]P != (0, 0)
-void xeval_4(ec_point_t* R, const ec_point_t* Q, const int lenQ)
-{
-	fp2_t t0, t1;
-
-	for(int i = 0; i < lenQ; i++){
-		fp2_add(&t0, &Q[i].x, &Q[i].z);
-		fp2_sub(&t1, &Q[i].x, &Q[i].z);
-		fp2_mul(&(R[i].x), &t0, &K[1].x);
-		fp2_mul(&(R[i].z), &t1, &K[2].x);
-		fp2_mul(&t0, &t0, &t1);
-		fp2_mul(&t0, &t0, &K[0].x); 
-		fp2_add(&t1, &(R[i].x), &(R[i].z));
-		fp2_sub(&(R[i].z), &(R[i].x), &(R[i].z));
-		fp2_sqr(&t1, &t1);
-		fp2_sqr(&(R[i].z), &(R[i].z));
-		fp2_add(&(R[i].x), &t0, &t1);
-		fp2_sub(&t0, &t0, &(R[i].z));
-		fp2_mul(&(R[i].x), &(R[i].x), &t1);
-		fp2_mul(&(R[i].z), &(R[i].z), &t0);
-	}
-}
-
-// Degree-4 isogeny evaluation with kenerl generated by P such that [2]P = (0, 0)
-// Must call after xisog_4_singular
-void xeval_4_singular(ec_point_t* R, const ec_point_t* Q, const int lenQ, const ec_point_t P)
-{
-	fp2_t t0, t1, t2;
-	for(int i = 0; i < lenQ; i++){
-		fp2_add(&t0, &Q[i].x, &Q[i].z);
-		fp2_sub(&t2, &Q[i].x, &Q[i].z);
-		fp2_sqr(&t0, &t0);
-		fp2_sqr(&t2, &t2);
-		fp2_sub(&R[i].z, &t0, &t2);
-		if(fp2_is_equal(&P.x, &P.z)){
-			// Branch for P = (+1,_)
-			fp2_copy(&t1, &t2);
-		}
-		else{
-			// Branch for P = (-1,_)
-			fp2_copy(&t1, &t0);
-			fp2_copy(&t0, &t2);
-		}
-		fp2_mul(&R[i].x, &R[i].z, &K[0].x);
-		fp2_mul(&R[i].z, &R[i].z, &K[1].x);
-		fp2_mul(&R[i].z, &R[i].z, &t1);
-		fp2_mul(&t1, &t1, &K[0].z);
-		fp2_add(&R[i].x, &R[i].x, &t1);
-		fp2_mul(&R[i].x, &R[i].x, &t0);
-	}
-}
-
-// Isogeny evaluation on Montgomery curves
-// Recall: K has been computed in Twisted Edwards model and none extra additions are required.
-void xeval_t(ec_point_t* Q, uint64_t const i, ec_point_t const P)
-{
-	int j;
-	int d = ((int)TORSION_ODD_PRIMES[i] - 1) / 2;	// Here, l = 2d + 1
-
-	fp2_t R0, R1, S0, S1, T0, T1;
-	fp2_add(&S0, &P.x, &P.z);
-	fp2_sub(&S1, &P.x, &P.z);
-
-	CrissCross(&R0, &R1, K[0].z, K[0].x, S0, S1);
-	for (j = 1; j < d; j++)
-	{
-		CrissCross(&T0, &T1, K[j].z, K[j].x, S0, S1);
-		fp2_mul(&R0, &T0, &R0);
-		fp2_mul(&R1, &T1, &R1);
-	};
-
-	fp2_sqr(&R0, &R0);
-	fp2_sqr(&R1, &R1);
-
-	fp2_mul(&(Q->x), &P.x, &R0);
-	fp2_mul(&(Q->z), &P.z, &R1);
-}
-
-// -----------------------------------------------------------------------------------------
-// -----------------------------------------------------------------------------------------
-// Isogeny evaluation (xEVAL) used in velu SQRT
-
-void xeval_s(ec_point_t* Q, uint64_t const i, ec_point_t const P, ec_point_t const A)
-{
-	// =================================================================================
-	assert(TORSION_ODD_PRIMES[i] > gap);     // Ensuring velusqrt is used for l_i > gap
-	sI = sizeI[i];          // size of I
-	sJ = sizeJ[i];          // size of J
-	sK = sizeK[i];          // size of K
-
-	assert(sI >= sJ);       // Ensuring #I >= #J
-	assert(sK >= 0);        // Recall, it must be that #K >= 0
-	assert(sJ > 1);         // ensuring sI >= sJ > 1
-	// =================================================================================
-
-	// We require the curve coefficient A = A'/C ... well, a multiple of these ones
-	fp2_t Ap;
-	fp2_add(&Ap, &A.x, &A.x); // 2A' + 4C
-	fp2_sub(&Ap, &Ap, &A.z);   // 2A'
-	fp2_add(&Ap, &Ap, &Ap);     // 4A'
-
-	//  --------------------------------------------------------------------------------------------------
-	//                   ~~~~~~~~
-	//                    |    | 
-	// Computing E_J(W) = |    | [ F0(W, x([j]P)) * alpha^2 + F1(W, x([j]P)) * alpha + F2(W, x([j]P)) ]
-	//                    j in J 
-	// In order to avoid costly inverse computations in fp, we are gonna work with projective coordinates
-	// In particular, for a degree-l isogeny construction, we need alpha = X/Z and alpha = Z/X (i.e., 1/alpha)
-
-	//fp2_t EJ_0[sJ][3]; // EJ_0[j][2] factors of one polynomial to be used in a resultant 
-
-	fp2_t XZ_add, XZj_add,
-	   XZ_sub, XZj_sub,
-	   AXZ2,
-	   CXZ2,
-	   CX2Z2,
-	   t1, t2;
-
-	fp2_add(&XZ_add, &P.x, &P.z);	// X + Z
-	fp2_sub(&XZ_sub, &P.x, &P.z);	// X - Z
-
-	fp2_mul(&AXZ2, &P.x, &P.z);	// X * Z
-	fp2_sqr(&t1, &P.x);		// X ^ 2
-	fp2_sqr(&t2, &P.z);		// Z ^ 2
-
-	fp2_add(&CX2Z2, &t1, &t2);		//      X^2 + Z^2
-	fp2_mul(&CX2Z2, &CX2Z2, &A.z);	// C * (X^2 + Z^2)
-
-	fp2_add(&AXZ2, &AXZ2, &AXZ2);	//       2 * (X * Z)
-	fp2_mul(&CXZ2, &AXZ2, &A.z);	// C  * [2 * (X * Z)]
-	fp2_mul(&AXZ2, &AXZ2, &Ap);		// A' * [2 * (X * Z)]
-
-	int j;
-	for (j = 0; j < sJ; j++)
-	{
-		fp2_add(&XZj_add, &J[j].x, &J[j].z);		// Xj + Zj
-		fp2_sub(&XZj_sub, &J[j].x, &J[j].z);		// Xj - Zj
-
-		fp2_mul(&t1, &XZ_sub, &XZj_add);			// (X - Z) * (Xj + Zj)
-		fp2_mul(&t2, &XZ_add, &XZj_sub);			// (X + Z) * (Xj - Zj)
-
-		// ...................................
-		// Computing the quadratic coefficient
-		fp2_sub(&EJ_0[j][2], &t1, &t2);			//       2 * [(X*Zj) - (Z*Xj)]
-		fp2_sqr(&EJ_0[j][2], &EJ_0[j][2]);			//     ( 2 * [(X*Zj) - (Z*Xj)] )^2
-		fp2_mul(&EJ_0[j][2], &A.z, &EJ_0[j][2]);		// C * ( 2 * [(X*Zj) - (Z*Xj)] )^2
-
-		// ..................................
-		// Computing the constant coefficient
-		fp2_add(&EJ_0[j][0], &t1, &t2);			//       2 * [(X*Xj) - (Z*Zj)]
-		fp2_sqr(&EJ_0[j][0], &EJ_0[j][0]);			//     ( 2 * [(X*Xj) - (Z*Zj)] )^2
-		fp2_mul(&EJ_0[j][0], &A.z, &EJ_0[j][0]);		// C * ( 2 * [(X*Xj) - (Z*Zj)] )^2
-
-		// ................................
-		// Computing the linear coefficient
-	
-		// C * [ (-2*Xj*Zj)*(alpha^2 + 1) + (-2*alpha)*(Xj^2 + Zj^2)] + [A' * (-2*Xj*Zj) * (2*X*Z)] where alpha = X/Z
-		fp2_add(&t1, &J[j].x, &J[j].z);			//      (Xj + Zj)
-		fp2_sqr(&t1, &t1);					//      (Xj + Zj)^2
-		fp2_add(&t1, &t1, &t1);				//  2 * (Xj + Zj)^2
-		fp2_add(&t1, &t1, &XZJ4[j]);			//  2 * (Xj + Zj)^2 - (4*Xj*Zj) := 2 * (Xj^2 + Zj^2)
-		fp2_mul(&t1, &t1, &CXZ2);				// [2 * (Xj^2 + Zj^2)] * (2 * [ C * (X * Z)])
-
-		fp2_mul(&t2, &CX2Z2, &XZJ4[j]);			// [C * (X^2 + Z^2)] * (-4 * Xj * Zj)
-		fp2_sub(&t1, &t2, &t1);				// [C * (X^2 + Z^2)] * (-4 * Xj * Zj) - [2 * (Xj^2 + Zj^2)] * (2 * [ C * (X * Z)])
-
-		fp2_mul(&t2, &AXZ2, &XZJ4[j]);			// (2 * [A' * (X * Z)]) * (-4 * Xj * Zj)
-		fp2_add(&EJ_0[j][1], &t1, &t2);			// This is our desired equation but multiplied by 2
-		fp2_add(&EJ_0[j][1], &EJ_0[j][1], &EJ_0[j][1]);	// This is our desired equation but multiplied by 4
-	};
-
-        // ---------------------------------------------------------------------
-        // The faster way for multiplying is using a divide-and-conquer approach
-
-	// product tree of EJ_0 (we only require the root)
-	product_tree_LENFeq3(ptree_EJ, deg_ptree_EJ, 0, EJ_0, sJ);
-	assert( deg_ptree_EJ[0] == (2*sJ) );
-	if (!scaled)
-	{
-		// unscaled remainder tree approach
-		multieval_unscaled(leaves, ptree_EJ[0], 2*sJ + 1, rtree_hI, (const fp2_t*)rtree_A, ptree_hI, deg_ptree_hI, 0, sI);
-	}
-	else
-	{
-		// scaled remainder tree approach
-		fp2_t G[sI_max], G_rev[sI_max];
-		poly_redc(G, ptree_EJ[0], 2*sJ + 1, ptree_hI[0], sI + 1, R0, A0);
-		for (j = 0; j < sI; j++)
-			fp2_copy(&G_rev[j], &G[sI - 1 - j]);
-
-		poly_mul_middle(G_rev, G_rev, sI, R0, sI);
-		for (j = 0; j < sI; j++)
-			fp2_copy(&G[j], &G_rev[sI - 1 - j]);
-
-		multieval_scaled(leaves, G, ptree_hI, deg_ptree_hI, 0, sI);
-        };
-
-	// Finally, we must multiply the leaves of the outpur of remainders
-	fp2_t r0;
-	product(&r0, (const fp2_t*)leaves, sI);
-	// EJ_1 is just reverting the ordering in the coefficients of EJ_0
-	for (j = 0; j < sJ; j++){
-		fp2_copy(&t1, &ptree_EJ[0][j]);
-		fp2_copy(&ptree_EJ[0][j], &ptree_EJ[0][2*sJ - j]);
-		fp2_copy(&ptree_EJ[0][2*sJ - j], &t1);
-	}
-
-	if (!scaled)
-	{
-		// unscaled remainder tree approach
-		multieval_unscaled(leaves, ptree_EJ[0], 2*sJ + 1, rtree_hI, (const fp2_t*)rtree_A, ptree_hI, deg_ptree_hI, 0, sI);
-	}
-	else
-	{
-		// scaled remainder tree approach
-		fp2_t G[sI_max], G_rev[sI_max];
-		poly_redc(G, ptree_EJ[0], 2*sJ + 1, ptree_hI[0], sI + 1, R0, A0);
-		for (j = 0; j < sI; j++)
-			fp2_copy(&G_rev[j], &G[sI - 1 - j]);
-
-		poly_mul_middle(G_rev, G_rev, sI, R0, sI);
-		for (j = 0; j < sI; j++)
-			fp2_copy(&G[j], &G_rev[sI - 1 - j]);
-
-		multieval_scaled(leaves, G, ptree_hI, deg_ptree_hI, 0, sI);
-        };
-	clear_tree(ptree_EJ, 0, sJ);
-	// Finally, we must multiply the leaves of the outpur of remainders
-	fp2_t r1;
-	product(&r1, (const fp2_t*)leaves, sI);
-
-	// -------------------------------
-	// Sometimes the public value sK is equal to zero,
-	// Thus for avoing runtime error we add one when sK =0
-	fp2_t hK_0[sK_max + 1], hK_1[sK_max + 1], hk_0, hk_1;
-	for (j = 0; j < sK; j++)
-	{
-		fp2_add(&XZj_add, &K[j].x, &K[j].z);	// Xk + Zk
-		fp2_sub(&XZj_sub, &K[j].x, &K[j].z);	// Xk - Zk
-		fp2_mul(&t1, &XZ_sub, &XZj_add);		// (X - Z) * (Xk + Zk)
-		fp2_mul(&t2, &XZ_add, &XZj_sub);		// (X + Z) * (Xk - Zk)
-
-		// Case alpha = X/Z
-		fp2_sub(&hK_0[j], &t1, &t2);		// 2 * [(X*Zk) - (Z*Xk)]
-
-		// Case 1/alpha = Z/X
-		fp2_add(&hK_1[j], &t1, &t2);		// 2 * [(X*Xk) - (Z*Zk)]
-	};
-
-	// hk_0 <- use product to mulitiply all the elements in hK_0
-	product(&hk_0, (const fp2_t*)hK_0, sK);
-	// hk_1 <- use product to mulitiply all the elements in hK_1
-	product(&hk_1, (const fp2_t*)hK_1, sK);
-
-	// ---------------------------------------------------------------------------------
-	// Now, unifying all the computations
-	fp2_mul(&t1, &hk_1, &r1);				// output of algorithm 2 with 1/alpha = Z/X and without the demoninator
-	fp2_sqr(&t1, &t1);
-	fp2_mul(&(Q->x), &t1, &P.x);
-
-	fp2_mul(&t2, &hk_0, &r0);				// output of algorithm 2 with alpha = X/Z and without the demoninator
-	fp2_sqr(&t2, &t2);
-	fp2_mul(&(Q->z), &t2, &P.z);
-}
--- a/src/ec/ref/ecx/xisog.c
+++ b/src/ec/ref/ecx/xisog.c
@@ -1,295 +0,0 @@
-#include "isog.h"
-#include "ec.h"
-#include <assert.h>
-
-// -------------------------------------------------------------------------
-// -------------------------------------------------------------------------
-
-// Degree-2 isogeny with kernel generated by P != (0 ,0)
-// Outputs the curve coefficient in the form A24=(A+2C:4C)
-void xisog_2(ec_point_t* B, ec_point_t const P)
-{
-        fp2_sqr(&B->x, &P.x);
-        fp2_sqr(&B->z, &P.z);
-        fp2_sub(&B->x, &B->z, &B->x);
-        fp2_add(&K[0].x, &P.x, &P.z);
-        fp2_sub(&K[0].z, &P.x, &P.z);
-}
-
-// Degree-4 isogeny with kernel generated by P such that [2]P != (0 ,0)
-// Outputs the curve coefficient in the form A24=(A+2C:4C)
-void xisog_4(ec_point_t* B, ec_point_t const P)
-{
-	fp2_sqr(&K[0].x, &P.x);
-	fp2_sqr(&K[0].z, &P.z);
-	fp2_add(&K[1].x, &K[0].z, &K[0].x);
-	fp2_sub(&K[1].z, &K[0].z, &K[0].x);
-	fp2_mul(&B->x, &K[1].x, &K[1].z);
-	fp2_sqr(&B->z, &K[0].z);
-
-	// Constants for xeval_4
-	fp2_add(&K[2].x, &P.x, &P.z);
-	fp2_sub(&K[1].x, &P.x, &P.z);
-	fp2_add(&K[0].x, &K[0].z, &K[0].z);
-	fp2_add(&K[0].x, &K[0].x, &K[0].x);
-}
-
-// Degree-4 isogeny with kernel generated by P such that [2]P = (0 ,0)
-void xisog_4_singular(ec_point_t* B24, ec_point_t const P, ec_point_t A24)
-{
-	fp2_copy(&K[0].z, &A24.z);
-	if(fp2_is_equal(&P.x, &P.z)){
-		// Case for P=(1,_)
-		fp2_copy(&K[0].x, &A24.x);
-		fp2_sub(&K[1].x, &A24.x, &A24.z);
-		fp2_neg(&B24->z, &K[1].x);
-	}
-	else{
-		// Case for P=(-1,_)
-		fp2_copy(&K[1].x, &A24.x);
-		fp2_sub(&K[0].x, &A24.x, &A24.z);
-		fp2_neg(&B24->z, &K[0].x);
-		fp2_copy(&B24->z, &K[1].x);
-	}
-	fp2_copy(&B24->x, &K[0].z);
-}
-
-// xISOG procedure, which is a hybrid between Montgomery and Twisted Edwards
-// This tradition fomulae corresponds with the Twisted Edwards formulae but 
-// mapping the output into Montgomery form
-void xisog_t(ec_point_t* B, uint64_t const i, ec_point_t const A)
-{
-	int j;
-	int d = ((int)TORSION_ODD_PRIMES[i] - 1) / 2;	// Here, l = 2d + 1
-
-	fp2_t By, Bz, constant_d_edwards, tmp_a, tmp_d;
-
-	fp2_copy(&By, &K[0].x);
-	fp2_copy(&Bz, &K[0].z);
-
-	for (j = 1; j < d; j++)
-	{
-		fp2_mul(&By, &By, &K[j].x);
-		fp2_mul(&Bz, &Bz, &K[j].z);
-	};
-
-	// Mapping Montgomery curve coefficients into Twisted Edwards form
-	fp2_sub(&constant_d_edwards, &A.x, &A.z);
-	fp2_copy(&tmp_a, &A.x);
-	fp2_copy(&tmp_d, &constant_d_edwards);
-
-	// left-to-right method for computing a^l and d^l
-	for (j = 1; j < (int)p_plus_minus_bitlength[i]; j++)
-	{
-		fp2_sqr(&tmp_a, &tmp_a);
-		fp2_sqr(&tmp_d, &tmp_d);
-		if( ( ((int)TORSION_ODD_PRIMES[i] >> ((int)p_plus_minus_bitlength[i] - j - 1)) & 1 ) != 0 )
-		{
-			fp2_mul(&tmp_a, &tmp_a, &A.x);
-			fp2_mul(&tmp_d, &tmp_d, &constant_d_edwards);
-		};
-	};
-
-	// raising to 8-th power
-	for (j = 0; j < 3; j++)
-	{
-		fp2_sqr(&By, &By);
-		fp2_sqr(&Bz, &Bz);
-	};
-
-	// Mapping Twisted Edwards curve coefficients into Montgomery form
-	fp2_mul(&(B->x), &tmp_a, &Bz);
-	fp2_mul(&(B->z), &tmp_d, &By);
-	fp2_sub(&(B->z), &(B->x), &(B->z));
-}
-
-// -------------------------------------------------------------------------
-// -------------------------------------------------------------------------
-//  Isogeny construction (xISOG) used in velu SQRT
-
-void xisog_s(ec_point_t* B, uint64_t const i, ec_point_t const A)
-{
-	// =================================================================================
-	assert(TORSION_ODD_PRIMES[i] > gap);     // Ensuring velusqrt is used for l_i > gap
-	sI = sizeI[i];          // size of I
-	sJ = sizeJ[i];          // size of J
-	sK = sizeK[i];          // size of K
-
-	assert(sI >= sJ);       // Ensuring #I >= #J
-	assert(sK >= 0);         // Recall, L is a prime and therefore it must be that #K > 0
-	assert(sJ > 1);         // ensuring sI >= sJ > 1
-	// =================================================================================
-	
-	// We require the curve coefficient A = A'/C ... well, a multiple of these ones
-	fp2_t Ap;
-	fp2_add(&Ap, &A.x, &A.x);	// 2A' + 4C
-	fp2_sub(&Ap, &Ap, &A.z);	// 2A'
-	fp2_add(&Ap, &Ap, &Ap);	// 4A'
-
-	fp2_t ADD_SQUARED[sJ_max],	// (Xj + Zj)^2
-	   SUB_SQUARED[sJ_max];	// (Xj - Zj)^2
-
-	int j;
-	// Next loop precompute some variables to be used in the reaminder of xisog
-	for (j = 0; j < sJ; j++)
-	{
-		fp2_sub(&SUB_SQUARED[j], &J[j].x, &J[j].z);		// (Xj - Zj)
-		fp2_sqr(&SUB_SQUARED[j], &SUB_SQUARED[j]);		// (Xj - Zj)^2
-		fp2_sub(&ADD_SQUARED[j], &SUB_SQUARED[j], &XZJ4[j]);	// (Xj + Zj)^2
-	};
-
-	//  --------------------------------------------------------------------------------------------------
-	//                   ~~~~~~~~
-	//                    |    | 
-	// Computing E_J(W) = |    | [ F0(W, x([j]P)) * alpha^2 + F1(W, x([j]P)) * alpha + F2(W, x([j]P)) ]
-	//                    j in J 
-	// In order to avoid costly inverse computations in fp, we are gonna work with projective coordinates
-	// In particular, for a degree-l isogeny construction, we need alpha = 1 and alpha = -1
-
-	//fp2_t EJ_0[sJ][3],	// quadratic factors of one polynomial to be used in a resultant 
-	//   EJ_1[sJ][3];	// quadratic factors of one polynomial to be used in a resultant
-
-	// Next loop computes all the quadratic factors of EJ_0 and EJ_1
-	fp2_t t1;
-	for (j = 0; j < sJ; j++)
-	{
-		// Each SUB_SQUARED[j] and ADD_SQUARED[j] should be multiplied by C
-		fp2_mul(&EJ_1[j][0], &ADD_SQUARED[j], &A.z);
-		fp2_mul(&EJ_0[j][0], &SUB_SQUARED[j], &A.z);
-		// We require the double of tadd and tsub
-		fp2_add(&EJ_0[j][1], &EJ_1[j][0], &EJ_1[j][0]);
-		fp2_add(&EJ_1[j][1], &EJ_0[j][0], &EJ_0[j][0]);
-
-		fp2_mul(&t1, &XZJ4[j], &Ap);			// A' *(-4*Xj*Zj)
-
-		// Case alpha = 1
-		fp2_sub(&EJ_0[j][1], &t1, &EJ_0[j][1]);
-		fp2_copy(&EJ_0[j][2], &EJ_0[j][0]);		// E_[0,j} is a palindrome
-		
-		// Case alpha = -1
-		fp2_sub(&EJ_1[j][1], &EJ_1[j][1], &t1);
-		fp2_copy(&EJ_1[j][2], &EJ_1[j][0]);		// E_{1,j} is a palindrome
-	};
-
-	// ---------------------------------------------------------------------
-	// The faster way for multiplying is using a divide-and-conquer approach
-	
-	// selfreciprocal product tree of EJ_0 (we only require the root)
-	product_tree_selfreciprocal_LENFeq3(ptree_EJ, deg_ptree_EJ, 0, EJ_0, sJ);
-	assert( deg_ptree_EJ[0] == (2*sJ) );
-	if (!scaled)
-	{
-		// (unscaled) remainder tree approach
-		multieval_unscaled(leaves, ptree_EJ[0], 2*sJ + 1, rtree_hI, (const fp2_t*)rtree_A, ptree_hI, deg_ptree_hI, 0, sI);
-	}
-	else
-	{
-		// scaled remainder tree approach
-		fp2_t G[sI_max], G_rev[sI_max];
-		poly_redc(G, ptree_EJ[0], 2*sJ + 1, ptree_hI[0], sI + 1, R0, A0);
-		for (j = 0; j < sI; j++)
-			fp2_copy(&G_rev[j], &G[sI - 1 - j]);
-
-		poly_mul_middle(G_rev, G_rev, sI, R0, sI);
-		for (j = 0; j < sI; j++)
-			fp2_copy(&G[j], &G_rev[sI - 1 - j]);
-
-		multieval_scaled(leaves, G, ptree_hI, deg_ptree_hI, 0, sI);
-	};
-	clear_tree(ptree_EJ, 0, sJ);
-	// Finally, we must multiply the leaves of the outpur of remainders
-	fp2_t r0;
-	product(&r0, (const fp2_t*)leaves, sI);
-
-	// selfreciprocal product tree of EJ_1 (we only require the root)
-	product_tree_selfreciprocal_LENFeq3(ptree_EJ, deg_ptree_EJ, 0, EJ_1, sJ);
-	assert( deg_ptree_EJ[0] == (2*sJ) );
-	if (!scaled)
-	{
-		// (unscaled) remainder tree approach
-		multieval_unscaled(leaves, ptree_EJ[0], 2*sJ + 1, rtree_hI, (const fp2_t*)rtree_A, ptree_hI, deg_ptree_hI, 0, sI);
-	}
-	else
-	{
-		// scaled remainder tree approach
-		fp2_t G[sI_max], G_rev[sI_max];
-		poly_redc(G, ptree_EJ[0], 2*sJ + 1, ptree_hI[0], sI + 1, R0, A0);
-		for (j = 0; j < sI; j++)
-			fp2_copy(&G_rev[j], &G[sI - 1 - j]);
-
-		poly_mul_middle(G_rev, G_rev, sI, R0, sI);
-		for (j = 0; j < sI; j++)
-			fp2_copy(&G[j], &G_rev[sI - 1 - j]);
-
-		multieval_scaled(leaves, G, ptree_hI, deg_ptree_hI, 0, sI);
-	};
-	clear_tree(ptree_EJ, 0, sJ);
-	// Finally, we must multiply the leaves of the outpur of remainders
-	fp2_t r1;
-	product(&r1, (const fp2_t*)leaves, sI);
-
-	// -------------------------------
-	// Sometimes the public value sK is equal to zero,
-	// Thus for avoing runtime error we add one when sK =0
-	fp2_t hK_0[sK_max + 1], hK_1[sK_max + 1], hk_0, hk_1;
-	for (j = 0; j < sK; j++)
-	{
-		fp2_sub(&hK_0[j], &K[j].z, &K[j].x);
-		fp2_add(&hK_1[j], &K[j].z, &K[j].x);
-	};
-
-	// hk_0 <- use product to mulitiply all the elements in hK_0
-	product(&hk_0, (const fp2_t*)hK_0, sK);
-	// hk_1 <- use product to mulitiply all the elements in hK_1
-	product(&hk_1, (const fp2_t*)hK_1, sK);
-	
-	// --------------------------------------------------------------
-	// Now, we have all the ingredients for computing the image curve
-	fp2_t A24, A24m,
-	   t24, t24m;	// <---- JORGE creo que podemos omitir estas variables, se usan cuando ya no se requiren los valores de la entrada A (podemos cambiar estos t's por B[0] y B[1]
-
-	fp2_copy(&A24, &A.x);			// A' + 2C
-	fp2_sub(&A24m, &A.x, &A.z);		// A' - 2C
-	fp2_copy(&Ap, &A24m);
-
-	// left-to-right method for computing (A' + 2C)^l and (A' - 2C)^l
-	for (j = 1; j < (int)p_plus_minus_bitlength[i]; j++)
-	{
-		fp2_sqr(&A24, &A24);
-		fp2_sqr(&A24m, &A24m);
-		if( ( ((int)TORSION_ODD_PRIMES[i] >> ((int)p_plus_minus_bitlength[i] - j - 1)) & 1 ) != 0 )
-		{
-			fp2_mul(&A24, &A24, &A.x);
-			fp2_mul(&A24m, &A24m, &Ap);
-		};
-	};
-
-	fp2_mul(&t24m, &hk_1, &r1);			// output of algorithm 2 with alpha =-1 and without the demoninator
-	fp2_sqr(&t24m, &t24m);			// raised at 2
-	fp2_sqr(&t24m, &t24m);			// raised at 4
-	fp2_sqr(&t24m, &t24m);			// raised at 8
-
-	fp2_mul(&t24, &hk_0, &r0);			// output of algorithm 2 with alpha = 1 and without the demoninator 
-	fp2_sqr(&t24, &t24);			// raised at 2
-	fp2_sqr(&t24, &t24);			// raised at 4
-	fp2_sqr(&t24, &t24);			// raised at 8
-
-	fp2_mul(&A24, &A24, &t24m);
-	fp2_mul(&A24m, &A24m, &t24);
-
-	// Now, we have d = (A24m / A24) where the image Montgomery cuve coefficient is
-	//      B'   2*(1 + d)   2*(A24 + A24m)
-	// B = ---- = --------- = --------------
-	//      C      (1 - d)     (A24 - A24m)
-	// However, we required B' + 2C = 4*A24 and 4C = 4 * (A24 - A24m)
-
-	fp2_sub(&t24m, &A24, &A24m);		//   (A24 - A24m)
-	fp2_add(&t24m, &t24m, &t24m);		// 2*(A24 - A24m)
-	fp2_add(&t24m, &t24m, &t24m);		// 4*(A24 - A24m)
-
-	fp2_add(&t24, &A24, &A24);			// 2 * A24
-	fp2_add(&t24, &t24, &t24);			// 4 * A24
-
-	fp2_copy(&(B->x), &t24);
-	fp2_copy(&(B->z), &t24m);
-}
--- a/src/ec/ref/include/biextension.h
+++ b/src/ec/ref/include/biextension.h
@@ -0,0 +1,82 @@
+#ifndef _BIEXT_H_
+#define _BIEXT_H_
+
+#include <sqisign_namespace.h>
+#include <ec.h>
+
+typedef struct pairing_params
+{
+    uint32_t e;     // Points have order 2^e
+    ec_point_t P;   // x(P)
+    ec_point_t Q;   // x(Q)
+    ec_point_t PQ;  // x(P-Q) = (PQX/PQZ : 1)
+    fp2_t ixP;      // PZ/PX
+    fp2_t ixQ;      // QZ/QX
+    ec_point_t A24; // ((A+2)/4 : 1)
+} pairing_params_t;
+
+// For two bases <P, Q> and <R, S> store:
+// x(P - R), x(P - S), x(R - Q), x(S - Q)
+typedef struct pairing_dlog_diff_points
+{
+    ec_point_t PmR; // x(P - R)
+    ec_point_t PmS; // x(P - S)
+    ec_point_t RmQ; // x(R - Q)
+    ec_point_t SmQ; // x(S - Q)
+} pairing_dlog_diff_points_t;
+
+typedef struct pairing_dlog_params
+{
+    uint32_t e;                      // Points have order 2^e
+    ec_basis_t PQ;                   // x(P), x(Q), x(P-Q)
+    ec_basis_t RS;                   // x(R), x(S), x(R-S)
+    pairing_dlog_diff_points_t diff; // x(P - R), x(P - S), x(R - Q), x(S - Q)
+    fp2_t ixP;                       // PZ/PX
+    fp2_t ixQ;                       // QZ/QX
+    fp2_t ixR;                       // RZ/RX
+    fp2_t ixS;                       // SZ/SX
+    ec_point_t A24;                  // ((A+2)/4 : 1)
+} pairing_dlog_params_t;
+
+// Computes the Weil pairing r = e_{2^e}(P, Q) using biextension ladder
+void weil(fp2_t *r, uint32_t e, const ec_point_t *P, const ec_point_t *Q, const ec_point_t *PQ, ec_curve_t *E);
+
+// Computes the reduced Tate pairing r = t_{2^e}(P, Q) using biextension ladder
+void reduced_tate(fp2_t *r, uint32_t e, const ec_point_t *P, const ec_point_t *Q, const ec_point_t *PQ, ec_curve_t *E);
+
+// Given two bases <P, Q> and <R, S>, computes scalars r1, r2, s1, s2
+// such that R = [r1]P + [r2]Q, S = [s1]P + [s2]Q
+void ec_dlog_2_weil(digit_t *r1,
+                    digit_t *r2,
+                    digit_t *s1,
+                    digit_t *s2,
+                    ec_basis_t *PQ,
+                    const ec_basis_t *RS,
+                    ec_curve_t *curve,
+                    int e);
+
+// Given two bases <P, Q> and <R, S>,
+// where <P, Q> is a basis for E[2^f],
+// the full 2-power torsion, and <R, S> a basis
+// for the smaller torsion E[2^e],
+// computes scalars r1, r2, s1, s2
+// such that R = [r1]P + [r2]Q, S = [s1]P + [s2]Q
+void ec_dlog_2_tate(digit_t *r1,
+                    digit_t *r2,
+                    digit_t *s1,
+                    digit_t *s2,
+                    const ec_basis_t *PQ,
+                    const ec_basis_t *RS,
+                    ec_curve_t *curve,
+                    int e);
+
+void ec_dlog_2_tate_to_full(digit_t *r1,
+                            digit_t *r2,
+                            digit_t *s1,
+                            digit_t *s2,
+                            ec_basis_t *PQ,
+                            ec_basis_t *RS,
+                            ec_curve_t *curve,
+                            int e);
+
+#endif
--- a/src/ec/ref/include/curve_extras.h
+++ b/src/ec/ref/include/curve_extras.h
@@ -1,28 +0,0 @@
-#ifndef CURVE_EXTRAS_H
-#define CURVE_EXTRAS_H
-
-#include "ec.h"
-#include "torsion_constants.h"
-
-typedef struct jac_point_t {
-    fp2_t x;
-    fp2_t y;
-    fp2_t z;
-} jac_point_t;
-
-bool ec_is_zero(ec_point_t const* P);
-void copy_point(ec_point_t* P, ec_point_t const* Q);
-void swap_points(ec_point_t* P, ec_point_t* Q, const digit_t option);
-void ec_init(ec_point_t* P);
-void xDBLv2(ec_point_t* Q, ec_point_t const* P, ec_point_t const* A24);
-void xDBLADD(ec_point_t* R, ec_point_t* S, ec_point_t const* P, ec_point_t const* Q, ec_point_t const* PQ, ec_point_t const* A24);
-void xDBLMUL(ec_point_t* S, ec_point_t const* P, digit_t const* k, ec_point_t const* Q, digit_t const* l, ec_point_t const* PQ, ec_curve_t const* curve);
-void xDBL(ec_point_t* Q, ec_point_t const* P, ec_point_t const* AC);
-void xMUL(ec_point_t* Q, ec_point_t const* P, digit_t const* k, ec_curve_t const* curve);
-void xDBLMUL(ec_point_t* S, ec_point_t const* P, digit_t const* k, ec_point_t const* Q, digit_t const* l, ec_point_t const* PQ, ec_curve_t const* curve);
-
-#define is_point_equal ec_is_equal
-#define xADD ec_add
-
-#endif
-
--- a/src/ec/ref/include/ec.h
+++ b/src/ec/ref/include/ec.h
--- a/src/ec/ref/include/isog.h
+++ b/src/ec/ref/include/isog.h
@@ -1,84 +1,28 @@
 #ifndef _ISOG_H_
 #define _ISOG_H_
+#include <sqisign_namespace.h>
+#include <ec.h>

-#include "curve_extras.h"
-#include "poly.h"
-
-extern int sI, sJ, sK;	// Sizes of each current I, J, and K	
-
-extern fp2_t I[sI_max][2],		// I plays also as the linear factors of the polynomial h_I(X)
-			EJ_0[sJ_max][3], EJ_1[sJ_max][3];	// To be used in xisog y xeval
-
-extern ec_point_t J[sJ_max], K[sK_max];		// Finite subsets of the kernel
-extern fp2_t XZJ4[sJ_max],		// -4* (Xj * Zj) for each j in J, and x([j]P) = (Xj : Zj)
-    rtree_A[(1 << (ceil_log_sI_max+2)) - 1],		// constant multiple of the reciprocal tree computation
-    A0;			// constant multiple of the reciprocal R0
-
-extern poly ptree_hI[(1 << (ceil_log_sI_max+2)) - 1],		// product tree of h_I(X)
-     rtree_hI[(1 << (ceil_log_sI_max+2)) - 1],		// reciprocal tree of h_I(X)
-     ptree_EJ[(1 << (ceil_log_sJ_max+2)) - 1];		// product tree of E_J(X)
-     
-extern fp2_t R0[2*sJ_max + 1];		// Reciprocal of h_I(X) required in the scaled remainder tree approach
-
-extern int deg_ptree_hI[(1 << (ceil_log_sI_max+2)) - 1],	// degree of each noed in the product tree of h_I(X)
-    deg_ptree_EJ[(1 << (ceil_log_sJ_max+2)) - 1];	// degree of each node in the product tree of E_J(X)
-
-extern fp2_t leaves[sI_max];		// leaves of the remainder tree, which are required in the Resultant computation
-
-
-void eds2mont(ec_point_t* P);						// mapping from Twisted edwards into Montogmery
-void yadd(ec_point_t* R, ec_point_t* const P, ec_point_t* const Q, ec_point_t* const PQ);	// differential addition on Twisted edwards model
-void CrissCross(fp2_t *r0, fp2_t *r1, fp2_t const alpha, fp2_t const beta, fp2_t const gamma, fp2_t const delta);
-
-void kps_t(uint64_t const i, ec_point_t const P, ec_point_t const A);	// tvelu formulae
-void kps_s(uint64_t const i, ec_point_t const P, ec_point_t const A);	// svelu formulae
-
-void xisog_4(ec_point_t* B, ec_point_t const P);			// degree-4 isogeny construction
-void xisog_4_singular(ec_point_t* B24, ec_point_t const P, ec_point_t A24);
-void xisog_2(ec_point_t* B, ec_point_t const P);			// degree-2 isogeny construction
-void xisog_t(ec_point_t* B, uint64_t const i, ec_point_t const A);	// tvelu formulae
-void xisog_s(ec_point_t* B, uint64_t const i, ec_point_t const A);	// svelu formulae
-
-void xeval_4(ec_point_t* R, const ec_point_t* Q, const int lenQ);					// degree-4 isogeny evaluation
-void xeval_4_singular(ec_point_t* R, const ec_point_t* Q, const int lenQ, const ec_point_t P);
-void xeval_2(ec_point_t* R, ec_point_t* const Q, const int lenQ);	// degree-2 isogeny evaluation
-void xeval_t(ec_point_t* Q, uint64_t const i, ec_point_t const P);			// tvelu formulae
-void xeval_s(ec_point_t* Q, uint64_t const i, ec_point_t const P, ec_point_t const A);	// svelu formulae
-
-// Strategy-based 4-isogeny chain
-static void ec_eval_even_strategy(ec_curve_t* image, ec_point_t* points, unsigned short points_len,
-    ec_point_t* A24, const ec_point_t *kernel, const int isog_len);
-
-void kps_clear(int i);	// Clear memory assigned by KPS
-
-
-// hybrid velu formulae
-static inline void kps(uint64_t const i, ec_point_t const P, ec_point_t const A)	
+/* KPS structure for isogenies of degree 2 or 4 */
+typedef struct
 {
-	// Next branch only depends on a fixed public bound (named gap)
-	if (TORSION_ODD_PRIMES[i] <= gap)
-		kps_t(i, P, A);
-	else
-		kps_s(i, P, A);
-}
-
-static inline void xisog(ec_point_t* B, uint64_t const i, ec_point_t const A)
+    ec_point_t K;
+} ec_kps2_t;
+typedef struct
 {
-	// Next branch only depends on a fixed public bound (named gap)
-	if (TORSION_ODD_PRIMES[i] <= gap)
-		xisog_t(B, i, A);
-	else
-		xisog_s(B, i, A);
-}
+    ec_point_t K[3];
+} ec_kps4_t;

-static inline void xeval(ec_point_t* Q, uint64_t const i, ec_point_t const P, ec_point_t const A)
-{
-	// Next branch only depends on a fixed public bound (named gap)
-	if (TORSION_ODD_PRIMES[i] <= gap)
-		xeval_t(Q, i, P);
-	else
-		xeval_s(Q, i, P, A);
-}
+void xisog_2(ec_kps2_t *kps, ec_point_t *B, const ec_point_t P); // degree-2 isogeny construction
+void xisog_2_singular(ec_kps2_t *kps, ec_point_t *B24, ec_point_t A24);

+void xisog_4(ec_kps4_t *kps, ec_point_t *B, const ec_point_t P); // degree-4 isogeny construction
+void xisog_4_singular(ec_kps4_t *kps, ec_point_t *B24, const ec_point_t P, ec_point_t A24);
+
+void xeval_2(ec_point_t *R, ec_point_t *const Q, const int lenQ, const ec_kps2_t *kps); // degree-2 isogeny evaluation
+void xeval_2_singular(ec_point_t *R, const ec_point_t *Q, const int lenQ, const ec_kps2_t *kps);
+
+void xeval_4(ec_point_t *R, const ec_point_t *Q, const int lenQ, const ec_kps4_t *kps); // degree-4 isogeny evaluation
+void xeval_4_singular(ec_point_t *R, const ec_point_t *Q, const int lenQ, const ec_point_t P, const ec_kps4_t *kps);

 #endif
--- a/src/ec/ref/include/poly.h
+++ b/src/ec/ref/include/poly.h
@@ -1,28 +0,0 @@
-#ifndef _POLY_H_
-#define _POLY_H_
-
-#include <fp2.h>
-
-typedef fp2_t *poly; // Polynomials are arrays of coeffs over Fq, lowest degree first
-
-void poly_mul(poly h, const poly f, const int lenf, const poly g, const int leng);
-void poly_mul_low(poly h, const int n, const poly f, const int lenf, const poly g, const int leng);
-void poly_mul_middle(poly h, const poly g, const int leng, const poly f, const int lenf);
-void poly_mul_selfreciprocal(poly h, const poly g, const int leng, const poly f, const int lenf);
-
-void product_tree(poly H[], int DEG[], const int root, const poly F[], const int LENF, const int n);
-void product_tree_LENFeq2(poly H[], int DEG[], const int root, const fp2_t F[][2], const int n);
-void product_tree_LENFeq3(poly H[], int DEG[], const int root, const fp2_t F[][3], const int n);
-void product_tree_selfreciprocal(poly H[], int DEG[], const int root, const poly F[], const int LENF, const int n);
-void product_tree_selfreciprocal_LENFeq3(poly H[], int DEG[], const int root, const fp2_t F[][3], const int n);
-void clear_tree(poly H[], const int root, const int n);
-
-void product(fp2_t *c, const fp2_t F[], const int n);
-
-void reciprocal(poly h, fp2_t *c, const poly f, const int lenf, const int n);
-void poly_redc(poly h, const poly g, const int leng, const poly f, const int lenf,const poly f_inv, const fp2_t c);
-void reciprocal_tree(poly *R, fp2_t *A, const int leng, const poly H[], const int DEG[], const int root, const int n);
-void multieval_unscaled(fp2_t REM[], const poly g, const int leng, const poly R[], const fp2_t A[], const poly H[], const int DEG[], const int root, const int n);
-void multieval_scaled(fp2_t REM[], const poly G, const poly H[], const int DEG[], const int root, const int n);
-
-#endif /* _POLY_H */
--- a/src/ec/ref/include/sdacs.h
+++ b/src/ec/ref/include/sdacs.h
@@ -1,50 +0,0 @@
-#ifndef _SDACS_H_
-#define _SDACS_H_
-
-static char SDAC_P_0[] = "0";
-static char SDAC_P_1[] = "10";
-static char SDAC_P_2[] = "100";
-static char SDAC_P_3[] = "0100";
-static char SDAC_P_4[] = "10000";
-static char SDAC_P_5[] = "110000";
-static char SDAC_P_6[] = "100000";
-static char SDAC_P_7[] = "1100010001";
-static char SDAC_P_8[] = "1001010000";
-static char SDAC_P_9[] = "0101001000";
-static char SDAC_P_10[] = "110110010000";
-static char SDAC_P_11[] = "10000000000";
-static char SDAC_P_12[] = "1010100001001000";
-
-static char SDAC_M_0[] = "";
-static char SDAC_M_1[] = "000";
-static char SDAC_M_2[] = "1010";
-static char SDAC_M_3[] = "100010";
-static char SDAC_M_4[] = "0010000";
-static char SDAC_M_5[] = "110000000";
-static char SDAC_M_6[] = "1010101010";
-static char SDAC_M_7[] = "1010001000";
-static char SDAC_M_8[] = "1001000000";
-static char SDAC_M_9[] = "0100001000";
-static char SDAC_M_10[] ="101101010000"; 
-static char SDAC_M_11[] = "100100010010";
-static char SDAC_M_12[] = "010100011000";
-static char SDAC_M_13[] = "101010000001";
-static char SDAC_M_14[] = "010100001000";
-static char SDAC_M_15[] = "1101010010000";
-static char SDAC_M_16[] = "1001010001010";
-static char SDAC_M_17[] = "101001000000101";
-
-static char *SDACs[31] = {
-	SDAC_P_0, SDAC_P_1, SDAC_P_2, SDAC_P_3, SDAC_P_4, 
-	SDAC_P_5, SDAC_P_6, SDAC_P_7, SDAC_P_8, SDAC_P_9, 
-	SDAC_P_10, SDAC_P_11, SDAC_P_12, 
-	SDAC_M_0, SDAC_M_1, SDAC_M_2, SDAC_M_3, SDAC_M_4, 
-	SDAC_M_5, SDAC_M_6, SDAC_M_7, SDAC_M_8, SDAC_M_9, 
-	SDAC_M_10, SDAC_M_11, SDAC_M_12, SDAC_M_13, SDAC_M_14, 
-	SDAC_M_15, SDAC_M_16, SDAC_M_17
-	};
-
-static int LENGTHS[] =	{
-1, 2, 3, 4, 5, 6, 6, 10, 10, 10, 12, 11, 16, 0, 3, 4, 6, 7, 9, 10, 10, 10, 10, 12, 12, 12, 12, 12, 13, 13, 15
-	};
-#endif
--- a/src/ec/ref/include/tedwards.h
+++ b/src/ec/ref/include/tedwards.h
@@ -1,28 +0,0 @@
-#ifndef TEDWARDS_H
-#define TEDWARDS_H
-
-#include <fp2.h>
-#include "ec.h"
-
-// a*x^2+y^2=1+d*x^2*y^2
-
-typedef struct ted_point_t {
-    fp2_t x;
-    fp2_t y;
-    fp2_t z;
-    fp2_t t; // t = x*y/z
-} ted_point_t;
-
-void ted_init(ted_point_t* P);
-bool is_ted_equal(ted_point_t const* P1, ted_point_t const* P2);
-void copy_ted_point(ted_point_t* P, ted_point_t const* Q);
-
-void ted_neg(ted_point_t* Q, ted_point_t const* P);
-void ted_dbl(ted_point_t* Q, ted_point_t const* P, ec_curve_t const* E);
-void ted_add(ted_point_t* S, ted_point_t const* P, ted_point_t const* Q, ec_curve_t const* E);
-
-void mont_to_ted(ec_curve_t* E, ec_curve_t const* A);
-void mont_to_ted_point(ted_point_t* Q, ec_point_t const* P, ec_curve_t const* A);
-void ted_to_mont_point(ec_point_t* Q, ted_point_t const* P);
-
-#endif
--- a/src/ec/ref/lvl1/CMakeLists.txt
+++ b/src/ec/ref/lvl1/CMakeLists.txt
@@ -1,17 +1 @@
-set(SOURCE_FILES_EC_${SVARIANT_UPPER}_REF
-    ${ECX_DIR}/poly-mul.c 
-    ${ECX_DIR}/poly-redc.c 
-    ${ECX_DIR}/ec.c 
-    ${ECX_DIR}/tedwards.c 
-    ${ECX_DIR}/kps.c 
-    ${ECX_DIR}/xisog.c 
-    ${ECX_DIR}/xeval.c 
-    ${ECX_DIR}/isog_chains.c 
-    ${ECX_DIR}/basis.c
-)
-
-add_library(${LIB_EC_${SVARIANT_UPPER}} ${SOURCE_FILES_EC_${SVARIANT_UPPER}_REF})
-target_include_directories(${LIB_EC_${SVARIANT_UPPER}} PRIVATE ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_PUBLIC} ${INC_GF_${SVARIANT_UPPER}} ${INC_COMMON} ${INC_EC})
-target_compile_options(${LIB_EC_${SVARIANT_UPPER}} PRIVATE ${C_OPT_FLAGS})
-
-add_subdirectory(test)
+include(../lvlx.cmake)
--- a/src/ec/ref/lvl1/test/CMakeLists.txt
+++ b/src/ec/ref/lvl1/test/CMakeLists.txt
@@ -1,36 +1 @@
-add_executable(fp2.test_${SVARIANT_LOWER} ${ECX_DIR}/test/fp2-test.c)
-	target_include_directories(fp2.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include ../include ${INC_GF_${SVARIANT_UPPER}} ${INC_EC} ${INC_COMMON})
-	target_link_libraries(fp2.test_${SVARIANT_LOWER} ${LIB_GF_${SVARIANT_UPPER}})
-		
-add_executable(poly-mul.test_${SVARIANT_LOWER} ${ECX_DIR}/test/poly-mul-test.c)
-	target_include_directories(poly-mul.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include ${INC_EC} ${INC_COMMON})
-	target_link_libraries(poly-mul.test_${SVARIANT_LOWER} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
-		
-add_executable(poly-redc.test_${SVARIANT_LOWER} ${ECX_DIR}/test/poly-redc-test.c)
-	target_include_directories(poly-redc.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include  ${INC_EC} ${INC_COMMON})
-	target_link_libraries(poly-redc.test_${SVARIANT_LOWER} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
-	
-add_executable(mont.test_${SVARIANT_LOWER} ${ECX_DIR}/test/mont-test.c)
-	target_include_directories(mont.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include ${INC_EC} ${INC_COMMON} .)
-	target_link_libraries(mont.test_${SVARIANT_LOWER} ${LIB_PRECOMP_${SVARIANT_UPPER}} ${LIB_INTBIG} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
-
-add_executable(ec.test_${SVARIANT_LOWER} ${ECX_DIR}/test/ec-test.c ${ECX_DIR}/test/test_extras.c)
-	target_include_directories(ec.test_${SVARIANT_LOWER} PUBLIC ${ECX_DIR}/test ${INC_GF_${SVARIANT_UPPER}} ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include ${INC_EC} ${INC_COMMON} .)
-	target_link_libraries(ec.test_${SVARIANT_LOWER} ${LIB_PRECOMP_${SVARIANT_UPPER}} ${LIB_INTBIG} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
-
-add_executable(velu.test_${SVARIANT_LOWER} ${ECX_DIR}/test/velu-test.c)
-	target_include_directories(velu.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include ${INC_EC} ${INC_COMMON} .)
-	target_link_libraries(velu.test_${SVARIANT_LOWER} ${LIB_PRECOMP_${SVARIANT_UPPER}} ${LIB_INTBIG} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
-
-add_executable(isog.test_${SVARIANT_LOWER} ${ECX_DIR}/test/isog-test.c)
-	target_include_directories(isog.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include ${INC_EC} ${INC_COMMON} .)
-	target_link_libraries(isog.test_${SVARIANT_LOWER} ${LIB_PRECOMP_${SVARIANT_UPPER}} ${LIB_INTBIG} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
-
-
-add_test(ec_fp2.test_${SVARIANT_LOWER} fp2.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
-add_test(ec_poly-mul.test_${SVARIANT_LOWER} poly-mul.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
-add_test(ec_poly-redc.test_${SVARIANT_LOWER} poly-redc.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
-add_test(ec_mont.test_${SVARIANT_LOWER} mont.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
-add_test(ec_ec.test_${SVARIANT_LOWER} ec.test_${SVARIANT_LOWER} test ${SQISIGN_TEST_REPS})
-add_test(ec_velu.test_${SVARIANT_LOWER} velu.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
-add_test(ec_isog.test_${SVARIANT_LOWER} isog.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
+include(../../lvlx_test.cmake)
--- a/src/ec/ref/lvl1/test/ec-tests.h
+++ b/src/ec/ref/lvl1/test/ec-tests.h
@@ -1,400 +0,0 @@
-#ifndef EC_TESTS_H
-#define EC_TESTS_H
-
-#include "test_extras.h"
-#include <stdio.h>
-#include <string.h>
-#include <bench.h>       //////// NOTE: enable later
-#include "test-basis.h"
-#include "ec_params.h"
-
-// Global constants
-extern const digit_t p[NWORDS_FIELD];
-
-// Benchmark and test parameters  
-static int BENCH_LOOPS = 1000;       // Number of iterations per bench
-static int TEST_LOOPS  = 512;       // Number of iterations per test
-
-
-bool ec_test()
-{ // Tests for ecc arithmetic
-    bool OK = true;
-    int passed;
-    ec_point_t P = {0}, Q = {0}, R = {0}, S = {0}, SS = {0}, PQ = {0};
-    ec_point_t AC = {0};
-    digit_t k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
-
-    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
-    printf("Testing ecc functions: \n\n"); 
-
-    // Point doubling
-    passed = 1;
-    P.x.re[0] = 0xDFD70ED0861BD329; P.x.re[1] = 0x20ACD3758C7F5540; P.x.re[2] = 0x3DCCDC007277F80A; P.x.re[3] = 0x18D6D2A22981DCE1;
-    P.x.im[0] = 0x3C23730A3F08F38C; P.x.im[1] = 0x98BB973AFD3D954D; P.x.im[2] = 0x8D98ADFC2829AE8A; P.x.im[3] = 0x21A2464D6369AFBA;
-    P.z.re[0] = 0x01;
-
-    AC.z.re[0] = 0x01;
-    fp2_tomont(&AC.z, &AC.z);
-        
-    fp2_tomont(&R.x, &P.x);
-    fp2_tomont(&R.z, &P.z);
-    xDBL(&S, &R, &AC);
-    fp2_copy(&SS.x, &S.x);    // Copy of S = SS <- 2P 
-    fp2_copy(&SS.z, &S.z);
-    fp2_inv(&S.z);
-    fp2_mul(&S.x, &S.x, &S.z);
-    fp2_frommont(&S.x, &S.x);
-
-    R.x.re[0] = 0x5950EE0A4AF90FC8; R.x.re[1] = 0x16488065A0A98B08; R.x.re[2] = 0xCE65322229DA0FD1; R.x.re[3] = 0x270A35FF781EE204;
-    R.x.im[0] = 0x564447FD9EC57F6B; R.x.im[1] = 0x2EE24E984294F729; R.x.im[2] = 0x53A6C7360E972C71; R.x.im[3] = 0x4FCF4B9928A7C7E;
-
-    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2)!=0) { passed=0; goto out0; }
-    
-    Q.x.re[0] = 0xC46076A670C70053; Q.x.re[1] = 0x97517AFA3AB9ED13; Q.x.re[2] = 0x349644C942EDF993; Q.x.re[3] = 0xBB4A4DB6F29AF9E;
-    Q.x.im[0] = 0x8B47629FB5A15BB0; Q.x.im[1] = 0x4EC6E809953C1A10; Q.x.im[2] = 0x1F83F0EC6CBB84D6; Q.x.im[3] = 0x1D8417C1D33265D3;
-    Q.z.re[0] = 0x01;
-
-    PQ.x.re[0] = 0x853F66D11BE5534F; PQ.x.re[1] = 0x27C8FD4E52D03D4A; PQ.x.re[2] = 0xF88EA78D0A0C29D2; PQ.x.re[3] = 0x2F6DFB07D397A067;
-    PQ.x.im[0] = 0xE8DBC4AA34434BA1; PQ.x.im[1] = 0x7A73AE182636F8A0; PQ.x.im[2] = 0x419EC260137868EB; PQ.x.im[3] = 0x129B3E301703D43F;
-    PQ.z.re[0] = 0x01;
-
-    fp2_tomont(&S.x, &Q.x);
-    fp2_tomont(&S.z, &Q.z);
-    fp2_tomont(&PQ.x, &PQ.x);
-    fp2_tomont(&PQ.z, &PQ.z);
-    xADD(&S, &SS, &S, &PQ);
-    fp2_inv(&S.z);
-    fp2_mul(&S.x, &S.x, &S.z);
-    fp2_frommont(&S.x, &S.x);
-
-    R.x.re[0] = 0xED0BEB8F93AB4FF9; R.x.re[1] = 0x27CF508B80CD49BF; R.x.re[2] = 0x38A6134DFA04B2BA; R.x.re[3] = 0x27B4CB15E109EF1F;
-    R.x.im[0] = 0x6F731BA6FD227BDE; R.x.im[1] = 0x14C12335341167F8; R.x.im[2] = 0xECA7B60F7866E27A; R.x.im[3] = 0x2A7A79A152880457;
-
-    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
-    
-    fp2_tomont(&R.x, &P.x);
-    fp2_tomont(&R.z, &P.z);
-    k[0] = 126;
-    xMUL(&S, &R, k, (ec_curve_t*)&AC);
-    fp2_inv(&S.z);
-    fp2_mul(&S.x, &S.x, &S.z);
-    fp2_frommont(&S.x, &S.x);
-
-    R.x.re[0] = 0xDE80F87A1203A147; R.x.re[1] = 0xD59E1215928A3B2D; R.x.re[2] = 0xD5A67F83A5A8CE46; R.x.re[3] = 0xA11E162488C9CDF;
-    R.x.im[0] = 0x9417D0D79A26741B; R.x.im[1] = 0x8B1F47D6F0FE5EEC; R.x.im[2] = 0xE52188DCB054CE36; R.x.im[3] = 0x1A8075A6C3148AB3;
-
-    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
-    
-    fp2_tomont(&R.x, &P.x);
-    fp2_tomont(&R.z, &P.z);
-    k[0] = 0xE77AD6B6C6B2D8CD;
-    k[1] = 0xDE43A0B600F38D12;
-    k[2] = 0xA35F4A7897E17CE2;
-    k[3] = 0x10ACB62E614D1237;
-    xMUL(&S, &R, k, (ec_curve_t*)&AC);
-    fp2_inv(&S.z);
-    fp2_mul(&S.x, &S.x, &S.z);
-    fp2_frommont(&S.x, &S.x);
-
-    R.x.re[0] = 0xD3938B0A68A3E7C0; R.x.re[1] = 0xE0667113208A0595; R.x.re[2] = 0x258F314C84E9CB60; R.x.re[3] = 0x14984BA7CA59AB71;
-    R.x.im[0] = 0xFE728423EE3BFEF4; R.x.im[1] = 0xBF68C42FE21AE0E4; R.x.im[2] = 0xA8FAF9C9528609CA; R.x.im[3] = 0x1225EC77A1DC0285;
-
-    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
-    
-    fp2_tomont(&R.x, &Q.x);
-    fp2_tomont(&R.z, &Q.z);
-    k[0] = 0xE77AD6B6C6B2D8CD;
-    k[1] = 0xDE43A0B600F38D12;
-    k[2] = 0xA35F4A7897E17CE2;
-    k[3] = 0x10ACB62E614D1237;
-    l[0] = 0x34AB78B6C6B2D8C0;
-    l[1] = 0xDE6B2D8CD00F38D1;
-    l[2] = 0xA35F4A7897E17CE2;
-    l[3] = 0x20ACF4A789614D13;
-    fp2_inv(&SS.z);
-    fp2_mul(&SS.x, &SS.x, &SS.z);
-    fp2_copy(&SS.z, &R.z);
-    xDBLMUL(&S, &R, k, &SS, l, &PQ, (ec_curve_t*)&AC);
-    fp2_inv(&S.z);
-    fp2_mul(&S.x, &S.x, &S.z);
-    fp2_frommont(&S.x, &S.x);
-
-    R.x.re[0] = 0x554E1ADC609B992F; R.x.re[1] = 0xE407D961F8CC4C42; R.x.re[2] = 0x1CF626AFED5A68CE; R.x.re[3] = 0x6D02692EE110483;
-    R.x.im[0] = 0x16FB094E831C8997; R.x.im[1] = 0xFDE4ECF31DC5F702; R.x.im[2] = 0x89303D868DFAD7B4; R.x.im[3] = 0xC91ACE81346F22D;
-
-    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
-    
-out0:
-    if (passed==1) printf("  ECC arithmetic tests ............................................ PASSED");
-    else { printf("  ECC arithmetic tests... FAILED"); printf("\n"); return false; }
-    printf("\n");
- 
-    return OK;
-}
-
-bool dlog_test()
-{ // Tests for dlog
-    bool OK = true;
-    int passed;
-    ec_point_t P = {0}, Q = {0}, R = {0}, S = {0}, SS = {0}, PQ = {0};
-    ec_curve_t AC = {0};
-    ec_basis_t PQ2;
-    digit_t scalarP[NWORDS_ORDER], scalarQ[NWORDS_ORDER], k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
-    digit_t kt[NWORDS_ORDER], lt[NWORDS_ORDER], f1[NWORDS_ORDER] = {0}, f2[NWORDS_ORDER] = {0}, zero[NWORDS_ORDER] = {0}, tpFdiv2[NWORDS_ORDER] = {0}, tpF[NWORDS_ORDER] = {0};
-
-    printf("\n--------------------------------------------------------------------------------------------------------\n\n");
-    printf("Testing dlog functions: \n\n");
-
-    // dlog2 testing
-    passed = 1;
-    
-    fp2_tomont(&P.x, &xP2);
-    fp_mont_setone(P.z.re);
-    fp_set(P.z.im, 0);
-    
-    fp2_tomont(&Q.x, &xQ2);
-    fp_mont_setone(Q.z.re);
-    fp_set(Q.z.im, 0);
-    
-    fp2_tomont(&PQ.x, &xPQ2);
-    fp_mont_setone(PQ.z.re);
-    fp_set(PQ.z.im, 0);
-
-    AC.C.re[0] = 0x01;
-    fp_copy(f1, TWOpFm1);
-    fp_copy(f2, TWOpF);
-    fp2_tomont(&AC.C, &AC.C);
-
-    copy_point(&PQ2.P, &P);
-    copy_point(&PQ2.Q, &Q);
-    copy_point(&PQ2.PmQ, &PQ);
-    k[0] = 0xFFFFFFFFFFFFFFFF;
-    k[1] = 0x00000000000007FF;
-    l[0] = 0xFFFFFFFFFFFFFFFE;
-    l[1] = 0x00000000000007FF;
-
-    for (int n = 0; n < TEST_LOOPS; n++)
-    {
-        k[0] -= 1;
-        l[0] -= 2;
-        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
-        ec_dlog_2(scalarP, scalarQ, &PQ2, &R, &AC);
-
-        memcpy(kt, k, NWORDS_ORDER*RADIX/8);
-        memcpy(lt, l, NWORDS_ORDER*RADIX/8);
-        if (compare_words(k, f1, NWORDS_ORDER) == 1 ||
-           (compare_words(l, f1, NWORDS_ORDER) == 1 && (compare_words(k, zero, NWORDS_ORDER) == 0 || compare_words(k, f1, NWORDS_ORDER) == 0))) {
-            if (compare_words(k, zero, NWORDS_ORDER) != 0) {
-                sub_test(kt, f2, kt, NWORDS_ORDER);
-            }
-            if (compare_words(l, zero, NWORDS_ORDER) != 0) {
-                sub_test(lt, f2, lt, NWORDS_ORDER);
-            }
-        }
-        if (compare_words((digit_t*)scalarP, (digit_t*)kt, NWORDS_ORDER) != 0 || compare_words((digit_t*)scalarQ, (digit_t*)lt, NWORDS_ORDER) != 0) { passed = 0; break; }
-    }
-
-    if (passed == 1) printf("  dlog2 tests ..................................................... PASSED");
-    else { printf("  dlog2 tests... FAILED"); printf("\n"); return false; }
-    printf("\n");
-
-    // dlog3 testing
-    passed = 1;
-    
-    fp2_tomont(&P.x, &xP3);
-    fp_mont_setone(P.z.re);
-    fp_set(P.z.im, 0);
-    
-    fp2_tomont(&Q.x, &xQ3);
-    fp_mont_setone(Q.z.re);
-    fp_set(Q.z.im, 0);
-    
-    fp2_tomont(&PQ.x, &xPQ3);
-    fp_mont_setone(PQ.z.re);
-    fp_set(PQ.z.im, 0);
-
-    AC.C.re[0] = 0x01;
-    fp_copy(tpFdiv2, THREEpFdiv2);
-    fp_copy(tpF, THREEpF);
-    fp2_tomont(&AC.C, &AC.C);
-
-    copy_point(&PQ2.P, &P);
-    copy_point(&PQ2.Q, &Q);
-    copy_point(&PQ2.PmQ, &PQ);
-    k[1] = 0;
-    l[1] = 0;
-    k[0] = 0x02153E468B91C6D1;
-    l[0] = 0x02153E468B91C6D0;
-
-    for (int n = 0; n < TEST_LOOPS; n++)
-    {
-        k[0] -= 1;
-        l[0] -= 2;
-        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
-        ec_dlog_3(scalarP, scalarQ, &PQ2, &R, &AC);
-
-        memcpy(kt, k, NWORDS_ORDER*RADIX/8);
-        memcpy(lt, l, NWORDS_ORDER*RADIX/8);
-        if (compare_words(k, tpFdiv2, NWORDS_ORDER) == 1 ||
-           (compare_words(l, tpFdiv2, NWORDS_ORDER) == 1 && compare_words(k, zero, NWORDS_ORDER) == 0)) {
-            if (compare_words(k, zero, NWORDS_ORDER) != 0) {
-                sub_test(kt, tpF, kt, NWORDS_ORDER);
-            }
-            if (compare_words(l, zero, NWORDS_ORDER) != 0) {
-                sub_test(lt, tpF, lt, NWORDS_ORDER);
-            }
-        }
-        if (compare_words((digit_t*)scalarP, (digit_t*)kt, NWORDS_ORDER) != 0 || compare_words((digit_t*)scalarQ, (digit_t*)lt, NWORDS_ORDER) != 0) { passed = 0; break; }
-    }
-
-    if (passed == 1) printf("  dlog3 tests ..................................................... PASSED");
-    else { printf("  dlog3 tests... FAILED"); printf("\n"); return false; }
-    printf("\n");
-
-    return OK;
-}
-
-bool ec_run()
-{
-    bool OK = true;
-    int n;
-    unsigned long long cycles, cycles1, cycles2;
-    ec_point_t P, Q, R, PQ, AC;
-    digit_t k[NWORDS_ORDER], l[NWORDS_ORDER];
-        
-    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
-    printf("Benchmarking ecc arithmetic: \n\n"); 
-
-    // Point doubling
-    cycles = 0;
-    for (n=0; n<BENCH_LOOPS; n++)
-    {
-        cycles1 = cpucycles(); 
-        xDBL(&Q, &P, &AC);
-        cycles2 = cpucycles();
-        cycles = cycles+(cycles2-cycles1);
-    }
-    printf("  Montgomery x-only doubling runs in .............................. %7lld cycles", cycles/BENCH_LOOPS);
-    printf("\n");
-
-    // Point addition
-    cycles = 0;
-    for (n = 0; n < BENCH_LOOPS; n++)
-    {
-        cycles1 = cpucycles();
-        xADD(&R, &Q, &P, &PQ);
-        cycles2 = cpucycles();
-        cycles = cycles + (cycles2 - cycles1);
-    }
-    printf("  Montgomery x-only addition runs in .............................. %7lld cycles", cycles/BENCH_LOOPS);
-    printf("\n");
-
-    // Point multiplication
-    cycles = 0;
-    for (n = 0; n < BENCH_LOOPS; n++)
-    {
-        cycles1 = cpucycles();
-        xMUL(&Q, &P, k, (ec_curve_t*)&AC);
-        cycles2 = cpucycles();
-        cycles = cycles + (cycles2 - cycles1);
-    }
-    printf("  Montgomery x-only scalar multiplication runs in ................. %7lld cycles", cycles/BENCH_LOOPS);
-    printf("\n");
-
-    // Point multiplication
-    cycles = 0;
-    for (n = 0; n < BENCH_LOOPS; n++)
-    {
-        cycles1 = cpucycles();
-        xDBLMUL(&R, &P, k, &Q, l, &PQ, (ec_curve_t*)&AC);
-        cycles2 = cpucycles();
-        cycles = cycles + (cycles2 - cycles1);
-    }
-    printf("  Montgomery x-only double-scalar multiplication runs in .......... %7lld cycles", cycles/BENCH_LOOPS);
-    printf("\n");
-
-    return OK;
-}
-
-bool dlog_run()
-{
-    bool OK = true;
-    int n;
-    unsigned long long cycles, cycles1, cycles2;
-    ec_point_t P = {0}, Q = {0}, R = {0}, S = {0}, SS = {0}, PQ = {0};
-    ec_curve_t AC = {0};
-    ec_basis_t PQ2;
-    digit_t scalarP[NWORDS_ORDER], scalarQ[NWORDS_ORDER], k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
-
-    printf("\n--------------------------------------------------------------------------------------------------------\n\n");
-    printf("Benchmarking dlog2: \n\n");
-
-    // dlog2 computation
-    
-    fp2_tomont(&P.x, &xP2);
-    fp_mont_setone(P.z.re);
-    fp_set(P.z.im, 0);
-    
-    fp2_tomont(&Q.x, &xQ2);
-    fp_mont_setone(Q.z.re);
-    fp_set(Q.z.im, 0);
-    
-    fp2_tomont(&PQ.x, &xPQ2);
-    fp_mont_setone(PQ.z.re);
-    fp_set(PQ.z.im, 0);
-
-    AC.C.re[0] = 0x01;
-    fp2_tomont(&AC.C, &AC.C);
-
-    copy_point(&PQ2.P, &P);
-    copy_point(&PQ2.Q, &Q);
-    copy_point(&PQ2.PmQ, &PQ);
-
-    cycles = 0;
-    for (n = 0; n < BENCH_LOOPS; n++)
-    {
-        fprandom_test(k); fprandom_test(l);
-        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
-        cycles1 = cpucycles();
-        ec_dlog_2(scalarP, scalarQ, &PQ2, &R, &AC);
-        cycles2 = cpucycles();
-        cycles = cycles + (cycles2 - cycles1);
-    }
-    printf("  dlog2 runs in ................................................... %7lld cycles", cycles/BENCH_LOOPS);
-    printf("\n");
-
-    // dlog3 computation
-
-    fp2_tomont(&P.x, &xP3);
-    fp_mont_setone(P.z.re);
-    fp_set(P.z.im, 0);
-    
-    fp2_tomont(&Q.x, &xQ3);
-    fp_mont_setone(Q.z.re);
-    fp_set(Q.z.im, 0);
-    
-    fp2_tomont(&PQ.x, &xPQ3);
-    fp_mont_setone(PQ.z.re);
-    fp_set(PQ.z.im, 0);
-
-    copy_point(&PQ2.P, &P);
-    copy_point(&PQ2.Q, &Q);
-    copy_point(&PQ2.PmQ, &PQ);
-
-    cycles = 0;
-    for (n = 0; n < BENCH_LOOPS; n++)
-    {
-        fprandom_test(k); fprandom_test(l);
-        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
-        cycles1 = cpucycles();
-        ec_dlog_3(scalarP, scalarQ, &PQ2, &R, &AC);
-        cycles2 = cpucycles();
-        cycles = cycles + (cycles2 - cycles1);
-    }
-    printf("  dlog3 runs in ................................................... %7lld cycles", cycles/BENCH_LOOPS);
-    printf("\n");
-
-    return OK;
-}
-
-#endif
--- a/src/ec/ref/lvl1/test/test-basis.h
+++ b/src/ec/ref/lvl1/test/test-basis.h
@@ -1,24 +0,0 @@
-#ifndef TEST_BASIS_H
-#define TEST_BASIS_H
-
-#include "fp2.h"
-// Full-torsion basis for A=0 (excluding 2^f and huge prime factors)
-const fp2_t xPA = {{0x7505815fb30f099e,0x89e78dbb4294c8df,0x7db9b4b1f7716d7b,0x13fcd4c87af65308},{0x93533c1017088fd4,0x6df9e398a1bb4cb1,0xc928f082be2e2b4c,0x17aa7e2906bef0af}};
-const fp2_t xQA = {{0xe96336b75eb5a505,0x5640cecad0ad7b5a,0x1394f0771bc58ac1,0x18d92124656d68d9},{0xa54e8e24605754f0,0xe52de9790bbe4bb9,0x3bf9b7833f62e255,0x277a07644ec4f0e2}};
-const fp2_t xPQA = {{0xc8fcceb408e3444c,0x9f8ca4d2c05c3287,0x259e496f17c0f529,0x0eb18a51c2a3dd1a},{0x1014dbe2534b8310,0x6b035ee3c371ea12,0x8354ecb4c111db6d,0x178259b78fe08093}};
-
-const fp2_t xPB = {{0xbd0a2f0c9a5378ca,0x74af17405042203d,0x0ccdcb4b7f0b8c15,0x314c70951a92d8bf},{0xe889e6bc5f9842af,0xefb0edbb5e266ab3,0x7bfb9d05f1ba6962,0x0a5f3f4fe6f16514}};
-const fp2_t xQB = {{0x137e215438caaf3b,0xc4403ee1b69f1382,0x2b5783edcefa7246,0x3015572698262f66},{0x8e88e4293f84536e,0x8d6dbc277f85ff77,0xb3f17b53b01da916,0x08dd3f4976c5dad1}};
-const fp2_t xPQB = {{0xf0c2701a7050d9b9,0xc8fdb069c0234d3a,0x9ec25780f2b101a8,0x221a0565053e8ff4},{0xd8513bf6a05910ae,0x47ff2422258dfb3a,0xb98ccceae31ac407,0x21bcc8e659aaa1b3}};
-
-// 2^f-torsion basis for A=0
-const fp2_t xP2 = {{0xfc93bac7df77fd30,0xa8d37e10783215bd,0x4bd2ece4f148039b,0x2bd5b83f5f8c09fb},{0x444112970b59f12f,0x557b8b9beb55c276,0x633f97cd9464df6c,0x00a1b21b593a2dfd}};
-const fp2_t xQ2 = {{0x6b4289960273222c,0xa290d8eb8e343a04,0x0c0a333f80a0ed68,0x31a58910e276aff0},{0xb7ca615ad7473865,0xeb6f72f20794f050,0x2941c3fe3203b94f,0x32ad5cbe915e467b}};
-const fp2_t xPQ2 = {{0xac9f90005e47b095,0x47eafdafd5168836,0xb88aac8334acdad0,0x1a5cf52a20f665b4},{0x4baa70fb1f5fa99c,0xffb7ddb12c87f1a3,0xdd3a229d370a8484,0x1e992ad0a14baf03}};
-
-// 3^g-torsion basis for A=0
-const fp2_t xP3 = {{0x8cf496c2722f340d,0x3e329c5a507ad39c,0xa0c7caa3e4537e25,0x1371d43cf97de48e},{0xa4b94c97b8149e7d,0xd290853fa14704c7,0x158b854173c1b289,0x04c6dcda7872c23f}};
-const fp2_t xQ3 = {{0x0f6380fd4c963950,0x101a22a245c4f563,0x601d3e30b21a5f43,0x0becd5f73b067949},{0xd364123c6806057e,0x8ff24fca9e060260,0x3b52df5bfb817901,0x30950462489b838f}};
-const fp2_t xPQ3 = {{0xe04cab7169e64a82,0x56df573ea9295c19,0x06cbb6af8e341990,0x0f1046ca03017ca1},{0x2dac3457c35be728,0x2f59af21113f25f9,0xa0dc4f54eec2715d,0x102ecf9a7ff2f2ff}};
-
-#endif
--- a/src/ec/ref/lvl3/CMakeLists.txt
+++ b/src/ec/ref/lvl3/CMakeLists.txt
@@ -1,17 +1 @@
-set(SOURCE_FILES_EC_${SVARIANT_UPPER}_REF
-    ${ECX_DIR}/poly-mul.c 
-    ${ECX_DIR}/poly-redc.c 
-    ${ECX_DIR}/ec.c 
-    ${ECX_DIR}/tedwards.c 
-    ${ECX_DIR}/kps.c 
-    ${ECX_DIR}/xisog.c 
-    ${ECX_DIR}/xeval.c 
-    ${ECX_DIR}/isog_chains.c 
-    ${ECX_DIR}/basis.c
-)
-
-add_library(${LIB_EC_${SVARIANT_UPPER}} ${SOURCE_FILES_EC_${SVARIANT_UPPER}_REF})
-target_include_directories(${LIB_EC_${SVARIANT_UPPER}} PRIVATE ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_PUBLIC} ${INC_GF_${SVARIANT_UPPER}} ${INC_COMMON} ${INC_EC})
-target_compile_options(${LIB_EC_${SVARIANT_UPPER}} PRIVATE ${C_OPT_FLAGS})
-
-add_subdirectory(test)
+include(../lvlx.cmake)
--- a/src/ec/ref/lvl3/test/CMakeLists.txt
+++ b/src/ec/ref/lvl3/test/CMakeLists.txt
@@ -1,36 +1 @@
-add_executable(fp2.test_${SVARIANT_LOWER} ${ECX_DIR}/test/fp2-test.c)
-	target_include_directories(fp2.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include ../include ${INC_GF_${SVARIANT_UPPER}} ${INC_EC} ${INC_COMMON})
-	target_link_libraries(fp2.test_${SVARIANT_LOWER} ${LIB_GF_${SVARIANT_UPPER}})
-		
-add_executable(poly-mul.test_${SVARIANT_LOWER} ${ECX_DIR}/test/poly-mul-test.c)
-	target_include_directories(poly-mul.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include ${INC_EC} ${INC_COMMON})
-	target_link_libraries(poly-mul.test_${SVARIANT_LOWER} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
-		
-add_executable(poly-redc.test_${SVARIANT_LOWER} ${ECX_DIR}/test/poly-redc-test.c)
-	target_include_directories(poly-redc.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include  ${INC_EC} ${INC_COMMON})
-	target_link_libraries(poly-redc.test_${SVARIANT_LOWER} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
-	
-add_executable(mont.test_${SVARIANT_LOWER} ${ECX_DIR}/test/mont-test.c)
-	target_include_directories(mont.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include ${INC_EC} ${INC_COMMON} .)
-	target_link_libraries(mont.test_${SVARIANT_LOWER} ${LIB_PRECOMP_${SVARIANT_UPPER}} ${LIB_INTBIG} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
-
-add_executable(ec.test_${SVARIANT_LOWER} ${ECX_DIR}/test/ec-test.c ${ECX_DIR}/test/test_extras.c)
-	target_include_directories(ec.test_${SVARIANT_LOWER} PUBLIC ${ECX_DIR}/test ${INC_GF_${SVARIANT_UPPER}} ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include ${INC_EC} ${INC_COMMON} .)
-	target_link_libraries(ec.test_${SVARIANT_LOWER} ${LIB_PRECOMP_${SVARIANT_UPPER}} ${LIB_INTBIG} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
-
-add_executable(velu.test_${SVARIANT_LOWER} ${ECX_DIR}/test/velu-test.c)
-	target_include_directories(velu.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include ${INC_EC} ${INC_COMMON} .)
-	target_link_libraries(velu.test_${SVARIANT_LOWER} ${LIB_PRECOMP_${SVARIANT_UPPER}} ${LIB_INTBIG} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
-
-add_executable(isog.test_${SVARIANT_LOWER} ${ECX_DIR}/test/isog-test.c)
-	target_include_directories(isog.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include ${INC_EC} ${INC_COMMON} .)
-	target_link_libraries(isog.test_${SVARIANT_LOWER} ${LIB_PRECOMP_${SVARIANT_UPPER}} ${LIB_INTBIG} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
-
-
-add_test(ec_fp2.test_${SVARIANT_LOWER} fp2.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
-add_test(ec_poly-mul.test_${SVARIANT_LOWER} poly-mul.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
-add_test(ec_poly-redc.test_${SVARIANT_LOWER} poly-redc.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
-add_test(ec_mont.test_${SVARIANT_LOWER} mont.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
-add_test(ec_ec.test_${SVARIANT_LOWER} ec.test_${SVARIANT_LOWER} test ${SQISIGN_TEST_REPS})
-add_test(ec_velu.test_${SVARIANT_LOWER} velu.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
-add_test(ec_isog.test_${SVARIANT_LOWER} isog.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
+include(../../lvlx_test.cmake)
--- a/src/ec/ref/lvl3/test/ec-tests.h
+++ b/src/ec/ref/lvl3/test/ec-tests.h
@@ -1,400 +0,0 @@
-#ifndef EC_TESTS_H
-#define EC_TESTS_H
-
-#include "test_extras.h"
-#include <stdio.h>
-#include <string.h>
-#include <bench.h>       //////// NOTE: enable later
-#include "test-basis.h"
-#include "ec_params.h"
-
-// Global constants
-extern const digit_t p[NWORDS_FIELD];
-
-// Benchmark and test parameters  
-static int BENCH_LOOPS = 1000;       // Number of iterations per bench
-static int TEST_LOOPS  = 512;       // Number of iterations per test
-
-
-bool ec_test()
-{ // Tests for ecc arithmetic
-    bool OK = true;
-    int passed;
-    ec_point_t P = {0}, Q = {0}, R = {0}, S = {0}, SS = {0}, PQ = {0};
-    ec_point_t AC = {0};
-    digit_t k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
-
-    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
-    printf("Testing ecc functions: (NOT IMPLEMENTED) \n\n"); 
-/*
-    // Point doubling
-    passed = 1;
-    P.x.re[0] = 0xDFD70ED0861BD329; P.x.re[1] = 0x20ACD3758C7F5540; P.x.re[2] = 0x3DCCDC007277F80A; P.x.re[3] = 0x18D6D2A22981DCE1;
-    P.x.im[0] = 0x3C23730A3F08F38C; P.x.im[1] = 0x98BB973AFD3D954D; P.x.im[2] = 0x8D98ADFC2829AE8A; P.x.im[3] = 0x21A2464D6369AFBA;
-    P.z.re[0] = 0x01;
-
-    AC.z.re[0] = 0x01;
-    fp2_tomont(&AC.z, &AC.z);
-        
-    fp2_tomont(&R.x, &P.x);
-    fp2_tomont(&R.z, &P.z);
-    xDBL(&S, &R, &AC);
-    fp2_copy(&SS.x, &S.x);    // Copy of S = SS <- 2P 
-    fp2_copy(&SS.z, &S.z);
-    fp2_inv(&S.z);
-    fp2_mul(&S.x, &S.x, &S.z);
-    fp2_frommont(&S.x, &S.x);
-
-    R.x.re[0] = 0x5950EE0A4AF90FC8; R.x.re[1] = 0x16488065A0A98B08; R.x.re[2] = 0xCE65322229DA0FD1; R.x.re[3] = 0x270A35FF781EE204;
-    R.x.im[0] = 0x564447FD9EC57F6B; R.x.im[1] = 0x2EE24E984294F729; R.x.im[2] = 0x53A6C7360E972C71; R.x.im[3] = 0x4FCF4B9928A7C7E;
-
-    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2)!=0) { passed=0; goto out0; }
-    
-    Q.x.re[0] = 0xC46076A670C70053; Q.x.re[1] = 0x97517AFA3AB9ED13; Q.x.re[2] = 0x349644C942EDF993; Q.x.re[3] = 0xBB4A4DB6F29AF9E;
-    Q.x.im[0] = 0x8B47629FB5A15BB0; Q.x.im[1] = 0x4EC6E809953C1A10; Q.x.im[2] = 0x1F83F0EC6CBB84D6; Q.x.im[3] = 0x1D8417C1D33265D3;
-    Q.z.re[0] = 0x01;
-
-    PQ.x.re[0] = 0x853F66D11BE5534F; PQ.x.re[1] = 0x27C8FD4E52D03D4A; PQ.x.re[2] = 0xF88EA78D0A0C29D2; PQ.x.re[3] = 0x2F6DFB07D397A067;
-    PQ.x.im[0] = 0xE8DBC4AA34434BA1; PQ.x.im[1] = 0x7A73AE182636F8A0; PQ.x.im[2] = 0x419EC260137868EB; PQ.x.im[3] = 0x129B3E301703D43F;
-    PQ.z.re[0] = 0x01;
-
-    fp2_tomont(&S.x, &Q.x);
-    fp2_tomont(&S.z, &Q.z);
-    fp2_tomont(&PQ.x, &PQ.x);
-    fp2_tomont(&PQ.z, &PQ.z);
-    xADD(&S, &SS, &S, &PQ);
-    fp2_inv(&S.z);
-    fp2_mul(&S.x, &S.x, &S.z);
-    fp2_frommont(&S.x, &S.x);
-
-    R.x.re[0] = 0xED0BEB8F93AB4FF9; R.x.re[1] = 0x27CF508B80CD49BF; R.x.re[2] = 0x38A6134DFA04B2BA; R.x.re[3] = 0x27B4CB15E109EF1F;
-    R.x.im[0] = 0x6F731BA6FD227BDE; R.x.im[1] = 0x14C12335341167F8; R.x.im[2] = 0xECA7B60F7866E27A; R.x.im[3] = 0x2A7A79A152880457;
-
-    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
-    
-    fp2_tomont(&R.x, &P.x);
-    fp2_tomont(&R.z, &P.z);
-    k[0] = 126;
-    xMUL(&S, &R, k, (ec_curve_t*)&AC);
-    fp2_inv(&S.z);
-    fp2_mul(&S.x, &S.x, &S.z);
-    fp2_frommont(&S.x, &S.x);
-
-    R.x.re[0] = 0xDE80F87A1203A147; R.x.re[1] = 0xD59E1215928A3B2D; R.x.re[2] = 0xD5A67F83A5A8CE46; R.x.re[3] = 0xA11E162488C9CDF;
-    R.x.im[0] = 0x9417D0D79A26741B; R.x.im[1] = 0x8B1F47D6F0FE5EEC; R.x.im[2] = 0xE52188DCB054CE36; R.x.im[3] = 0x1A8075A6C3148AB3;
-
-    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
-    
-    fp2_tomont(&R.x, &P.x);
-    fp2_tomont(&R.z, &P.z);
-    k[0] = 0xE77AD6B6C6B2D8CD;
-    k[1] = 0xDE43A0B600F38D12;
-    k[2] = 0xA35F4A7897E17CE2;
-    k[3] = 0x10ACB62E614D1237;
-    xMUL(&S, &R, k, (ec_curve_t*)&AC);
-    fp2_inv(&S.z);
-    fp2_mul(&S.x, &S.x, &S.z);
-    fp2_frommont(&S.x, &S.x);
-
-    R.x.re[0] = 0xD3938B0A68A3E7C0; R.x.re[1] = 0xE0667113208A0595; R.x.re[2] = 0x258F314C84E9CB60; R.x.re[3] = 0x14984BA7CA59AB71;
-    R.x.im[0] = 0xFE728423EE3BFEF4; R.x.im[1] = 0xBF68C42FE21AE0E4; R.x.im[2] = 0xA8FAF9C9528609CA; R.x.im[3] = 0x1225EC77A1DC0285;
-
-    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
-    
-    fp2_tomont(&R.x, &Q.x);
-    fp2_tomont(&R.z, &Q.z);
-    k[0] = 0xE77AD6B6C6B2D8CD;
-    k[1] = 0xDE43A0B600F38D12;
-    k[2] = 0xA35F4A7897E17CE2;
-    k[3] = 0x10ACB62E614D1237;
-    l[0] = 0x34AB78B6C6B2D8C0;
-    l[1] = 0xDE6B2D8CD00F38D1;
-    l[2] = 0xA35F4A7897E17CE2;
-    l[3] = 0x20ACF4A789614D13;
-    fp2_inv(&SS.z);
-    fp2_mul(&SS.x, &SS.x, &SS.z);
-    fp2_copy(&SS.z, &R.z);
-    xDBLMUL(&S, &R, k, &SS, l, &PQ, (ec_curve_t*)&AC);
-    fp2_inv(&S.z);
-    fp2_mul(&S.x, &S.x, &S.z);
-    fp2_frommont(&S.x, &S.x);
-
-    R.x.re[0] = 0x554E1ADC609B992F; R.x.re[1] = 0xE407D961F8CC4C42; R.x.re[2] = 0x1CF626AFED5A68CE; R.x.re[3] = 0x6D02692EE110483;
-    R.x.im[0] = 0x16FB094E831C8997; R.x.im[1] = 0xFDE4ECF31DC5F702; R.x.im[2] = 0x89303D868DFAD7B4; R.x.im[3] = 0xC91ACE81346F22D;
-
-    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
-    
-out0:
-    if (passed==1) printf("  ECC arithmetic tests ............................................ PASSED");
-    else { printf("  ECC arithmetic tests... FAILED"); printf("\n"); return false; }
-    printf("\n");
- */
-    return OK;
-}
-
-bool dlog_test()
-{ // Tests for dlog
-    bool OK = true;
-    int passed;
-    ec_point_t P = {0}, Q = {0}, R = {0}, S = {0}, SS = {0}, PQ = {0};
-    ec_curve_t AC = {0};
-    ec_basis_t PQ2;
-    digit_t scalarP[NWORDS_ORDER], scalarQ[NWORDS_ORDER], k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
-    digit_t kt[NWORDS_ORDER], lt[NWORDS_ORDER], f1[NWORDS_ORDER] = {0}, f2[NWORDS_ORDER] = {0}, zero[NWORDS_ORDER] = {0}, tpFdiv2[NWORDS_ORDER] = {0}, tpF[NWORDS_ORDER] = {0};
-
-    printf("\n--------------------------------------------------------------------------------------------------------\n\n");
-    printf("Testing dlog functions: \n\n");
-
-    // dlog2 testing
-    passed = 1;
-    
-    fp2_tomont(&P.x, &xP2);
-    fp_mont_setone(P.z.re);
-    fp_set(P.z.im, 0);
-    
-    fp2_tomont(&Q.x, &xQ2);
-    fp_mont_setone(Q.z.re);
-    fp_set(Q.z.im, 0);
-    
-    fp2_tomont(&PQ.x, &xPQ2);
-    fp_mont_setone(PQ.z.re);
-    fp_set(PQ.z.im, 0);
-
-    AC.C.re[0] = 0x01;
-    fp_copy(f1, TWOpFm1);
-    fp_copy(f2, TWOpF);
-    fp2_tomont(&AC.C, &AC.C);
-
-    copy_point(&PQ2.P, &P);
-    copy_point(&PQ2.Q, &Q);
-    copy_point(&PQ2.PmQ, &PQ);
-    k[0] = 0xFFFFFFFFFFFFFFFF;
-    k[1] = 0x00000000000007FF;
-    l[0] = 0xFFFFFFFFFFFFFFFE;
-    l[1] = 0x00000000000007FF;
-
-    for (int n = 0; n < TEST_LOOPS; n++)
-    {
-        k[0] -= 1;
-        l[0] -= 2;
-        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
-        ec_dlog_2(scalarP, scalarQ, &PQ2, &R, &AC);
-
-        memcpy(kt, k, NWORDS_ORDER*RADIX/8);
-        memcpy(lt, l, NWORDS_ORDER*RADIX/8);
-        if (compare_words(k, f1, NWORDS_ORDER) == 1 ||
-           (compare_words(l, f1, NWORDS_ORDER) == 1 && (compare_words(k, zero, NWORDS_ORDER) == 0 || compare_words(k, f1, NWORDS_ORDER) == 0))) {
-            if (compare_words(k, zero, NWORDS_ORDER) != 0) {
-                sub_test(kt, f2, kt, NWORDS_ORDER);
-            }
-            if (compare_words(l, zero, NWORDS_ORDER) != 0) {
-                sub_test(lt, f2, lt, NWORDS_ORDER);
-            }
-        }
-        if (compare_words((digit_t*)scalarP, (digit_t*)kt, NWORDS_ORDER) != 0 || compare_words((digit_t*)scalarQ, (digit_t*)lt, NWORDS_ORDER) != 0) { passed = 0; break; }
-    }
-
-    if (passed == 1) printf("  dlog2 tests ..................................................... PASSED");
-    else { printf("  dlog2 tests... FAILED"); printf("\n"); return false; }
-    printf("\n");
-
-    // dlog3 testing
-    passed = 1;
-    
-    fp2_tomont(&P.x, &xP3);
-    fp_mont_setone(P.z.re);
-    fp_set(P.z.im, 0);
-    
-    fp2_tomont(&Q.x, &xQ3);
-    fp_mont_setone(Q.z.re);
-    fp_set(Q.z.im, 0);
-    
-    fp2_tomont(&PQ.x, &xPQ3);
-    fp_mont_setone(PQ.z.re);
-    fp_set(PQ.z.im, 0);
-
-    AC.C.re[0] = 0x01;
-    fp_copy(tpFdiv2, THREEpFdiv2);
-    fp_copy(tpF, THREEpF);
-    fp2_tomont(&AC.C, &AC.C);
-
-    copy_point(&PQ2.P, &P);
-    copy_point(&PQ2.Q, &Q);
-    copy_point(&PQ2.PmQ, &PQ);
-    k[1] = 0;
-    l[1] = 0;
-    k[0] = 0x02153E468B91C6D1;
-    l[0] = 0x02153E468B91C6D0;
-
-    for (int n = 0; n < TEST_LOOPS; n++)
-    {
-        k[0] -= 1;
-        l[0] -= 2;
-        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
-        ec_dlog_3(scalarP, scalarQ, &PQ2, &R, &AC);
-
-        memcpy(kt, k, NWORDS_ORDER*RADIX/8);
-        memcpy(lt, l, NWORDS_ORDER*RADIX/8);
-        if (compare_words(k, tpFdiv2, NWORDS_ORDER) == 1 ||
-           (compare_words(l, tpFdiv2, NWORDS_ORDER) == 1 && compare_words(k, zero, NWORDS_ORDER) == 0)) {
-            if (compare_words(k, zero, NWORDS_ORDER) != 0) {
-                sub_test(kt, tpF, kt, NWORDS_ORDER);
-            }
-            if (compare_words(l, zero, NWORDS_ORDER) != 0) {
-                sub_test(lt, tpF, lt, NWORDS_ORDER);
-            }
-        }
-        if (compare_words((digit_t*)scalarP, (digit_t*)kt, NWORDS_ORDER) != 0 || compare_words((digit_t*)scalarQ, (digit_t*)lt, NWORDS_ORDER) != 0) { passed = 0; break; }
-    }
-
-    if (passed == 1) printf("  dlog3 tests ..................................................... PASSED");
-    else { printf("  dlog3 tests... FAILED"); printf("\n"); return false; }
-    printf("\n");
-
-    return OK;
-}
-
-bool ec_run()
-{
-    bool OK = true;
-    int n;
-    unsigned long long cycles, cycles1, cycles2;
-    ec_point_t P, Q, R, PQ, AC;
-    digit_t k[NWORDS_ORDER], l[NWORDS_ORDER];
-        
-    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
-    printf("Benchmarking ecc arithmetic: \n\n"); 
-
-    // Point doubling
-    cycles = 0;
-    for (n=0; n<BENCH_LOOPS; n++)
-    {
-        cycles1 = cpucycles(); 
-        xDBL(&Q, &P, &AC);
-        cycles2 = cpucycles();
-        cycles = cycles+(cycles2-cycles1);
-    }
-    printf("  Montgomery x-only doubling runs in .............................. %7lld cycles", cycles/BENCH_LOOPS);
-    printf("\n");
-
-    // Point addition
-    cycles = 0;
-    for (n = 0; n < BENCH_LOOPS; n++)
-    {
-        cycles1 = cpucycles();
-        xADD(&R, &Q, &P, &PQ);
-        cycles2 = cpucycles();
-        cycles = cycles + (cycles2 - cycles1);
-    }
-    printf("  Montgomery x-only addition runs in .............................. %7lld cycles", cycles/BENCH_LOOPS);
-    printf("\n");
-
-    // Point multiplication
-    cycles = 0;
-    for (n = 0; n < BENCH_LOOPS; n++)
-    {
-        cycles1 = cpucycles();
-        xMUL(&Q, &P, k, (ec_curve_t*)&AC);
-        cycles2 = cpucycles();
-        cycles = cycles + (cycles2 - cycles1);
-    }
-    printf("  Montgomery x-only scalar multiplication runs in ................. %7lld cycles", cycles/BENCH_LOOPS);
-    printf("\n");
-
-    // Point multiplication
-    cycles = 0;
-    for (n = 0; n < BENCH_LOOPS; n++)
-    {
-        cycles1 = cpucycles();
-        xDBLMUL(&R, &P, k, &Q, l, &PQ, (ec_curve_t*)&AC);
-        cycles2 = cpucycles();
-        cycles = cycles + (cycles2 - cycles1);
-    }
-    printf("  Montgomery x-only double-scalar multiplication runs in .......... %7lld cycles", cycles/BENCH_LOOPS);
-    printf("\n");
-
-    return OK;
-}
-
-bool dlog_run()
-{
-    bool OK = true;
-    int n;
-    unsigned long long cycles, cycles1, cycles2;
-    ec_point_t P = {0}, Q = {0}, R = {0}, S = {0}, SS = {0}, PQ = {0};
-    ec_curve_t AC = {0};
-    ec_basis_t PQ2;
-    digit_t scalarP[NWORDS_ORDER], scalarQ[NWORDS_ORDER], k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
-
-    printf("\n--------------------------------------------------------------------------------------------------------\n\n");
-    printf("Benchmarking dlog2: \n\n");
-
-    // dlog2 computation
-    
-    fp2_tomont(&P.x, &xP2);
-    fp_mont_setone(P.z.re);
-    fp_set(P.z.im, 0);
-    
-    fp2_tomont(&Q.x, &xQ2);
-    fp_mont_setone(Q.z.re);
-    fp_set(Q.z.im, 0);
-    
-    fp2_tomont(&PQ.x, &xPQ2);
-    fp_mont_setone(PQ.z.re);
-    fp_set(PQ.z.im, 0);
-
-    AC.C.re[0] = 0x01;
-    fp2_tomont(&AC.C, &AC.C);
-
-    copy_point(&PQ2.P, &P);
-    copy_point(&PQ2.Q, &Q);
-    copy_point(&PQ2.PmQ, &PQ);
-
-    cycles = 0;
-    for (n = 0; n < BENCH_LOOPS; n++)
-    {
-        fprandom_test(k); fprandom_test(l);
-        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
-        cycles1 = cpucycles();
-        ec_dlog_2(scalarP, scalarQ, &PQ2, &R, &AC);
-        cycles2 = cpucycles();
-        cycles = cycles + (cycles2 - cycles1);
-    }
-    printf("  dlog2 runs in ................................................... %7lld cycles", cycles/BENCH_LOOPS);
-    printf("\n");
-
-    // dlog3 computation
-
-    fp2_tomont(&P.x, &xP3);
-    fp_mont_setone(P.z.re);
-    fp_set(P.z.im, 0);
-    
-    fp2_tomont(&Q.x, &xQ3);
-    fp_mont_setone(Q.z.re);
-    fp_set(Q.z.im, 0);
-    
-    fp2_tomont(&PQ.x, &xPQ3);
-    fp_mont_setone(PQ.z.re);
-    fp_set(PQ.z.im, 0);
-
-    copy_point(&PQ2.P, &P);
-    copy_point(&PQ2.Q, &Q);
-    copy_point(&PQ2.PmQ, &PQ);
-
-    cycles = 0;
-    for (n = 0; n < BENCH_LOOPS; n++)
-    {
-        fprandom_test(k); fprandom_test(l);
-        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
-        cycles1 = cpucycles();
-        ec_dlog_3(scalarP, scalarQ, &PQ2, &R, &AC);
-        cycles2 = cpucycles();
-        cycles = cycles + (cycles2 - cycles1);
-    }
-    printf("  dlog3 runs in ................................................... %7lld cycles", cycles/BENCH_LOOPS);
-    printf("\n");
-
-    return OK;
-}
-
-#endif
--- a/src/ec/ref/lvl3/test/test-basis.h
+++ b/src/ec/ref/lvl3/test/test-basis.h
@@ -1,24 +0,0 @@
-#ifndef TEST_BASIS_H
-#define TEST_BASIS_H
-
-#include "fp2.h"
-// Full-torsion basis for A=0 (excluding 2^f and huge prime factors)
-const fp2_t xPA = {{0x35b53c72e7494775,0x5791b499bc29710d,0x2060f3aca68fa4ff,0x81150c19a14f523a,0x08af6c81a906d44a,0x00cca2a93efb536e},{0x14eaac356375af76,0x5655011e771be3b4,0x6273ccee274d7754,0x440d6b5b4496c183,0xa3d7f80e9f9111ba,0x0302e153bee01a18}};
-const fp2_t xQA = {{0x80c0767d1b7b5fd8,0x24e9039d430ca3b5,0x26485254625dc85a,0x612eaebc345b64d1,0x59669fbd946a4409,0x004c3a8564e16101},{0x0e1eac4e38449c54,0x752c042b4c6675cb,0x88ec0e75c8e9ea0e,0xbf7c4cdbfc4483f0,0xd594cb5474bbc264,0x02f5e2345a9b4654}};
-const fp2_t xPQA = {{0x1f5accaff9a7da90,0x91884964774d4cb2,0x0e938e13dd088e63,0x453c9af09879a724,0xb2bd09ec3740312b,0x0007a5837e23aaa1},{0x8e1ac4b319787bd4,0x7cb9fba402f67bfe,0x370b2951f9ec29cf,0x7a020172566f9d17,0x063e31753d703130,0x01551136265bade6}};
-
-const fp2_t xPB = {{0xb702a70a8ae132ad,0x56d8804c83a8e696,0x5ac3e12f4df1792e,0x0a89da435664746e,0xd8758765206844bd,0x01a92f6e9e0e9296},{0x8aaab711b76b0959,0x210e6695ca5e5fdd,0x593be0d75909ca12,0xfbc074d8ebdeb927,0xb61fcc328d3756bc,0x0198a5942855c8bf}};
-const fp2_t xQB = {{0x2b6b82b950b61fda,0x0ef2dd717daed334,0x99dee4db0b268ac9,0x3534eb384e1fcaf0,0xbaf112845a4f2d81,0x037f1492d8d815a1},{0x97e80590f9a0556b,0x7d9b4b87a22a7792,0xda4534fe75595b4b,0xbe1092a2733c03e1,0xbf5b1bd147b0d630,0x0125721476e5267f}};
-const fp2_t xPQB = {{0xb7d459a56d4aebec,0x6ac7f10ba20e1e71,0x9a95a8928507f7ef,0xc4c5aff6b97f3dfe,0x644beb3e86806b77,0x022319eb6eaf072a},{0x8ad0f6b18934790e,0xdad82b7b38e166bf,0xcb08f5a3ab53d9a9,0xd2ff39b401ba8aba,0xbff9b5e40ed9e5ce,0x03c1773791f554c0}};
-
-// 2^f-torsion basis for A=0
-const fp2_t xP2 = {{0x7a26fdb0e5844206,0x0752b2ba140f7dfd,0x1728013f8f5fe257,0xd05f129975ed6bba,0xe736dbce707ad5a8,0x01f861715896d0be},{0xdac046927a0c5352,0x5a42474ac156ff18,0xe887982ff4c5a9ea,0x3875be6432251f1c,0xdfae47315af877ee,0x005627f085582ecc}};
-const fp2_t xQ2 = {{0xc4f03ab3db57331b,0xf04261fc3b713778,0xa99b82430c7e40d1,0x5fe52b1324c2a091,0xfcaa2a7049d0f657,0x021f2caa09302141},{0x4a92a1d5ff9f6730,0x6dcd5f600f33783e,0xdb8b4e2e5149b45e,0x993458635c01d0c0,0x5f9bc7d3bb307f91,0x01fcc7eae4712b6a}};
-const fp2_t xPQ2 = {{0x7f4ee9c86c4341a2,0x0c867f482063bdfc,0xe46fb7b0fbd479c7,0xddaa716e091be9ad,0x29239eadddf5dc59,0x0231c09c660f0a89},{0xde64fa344dd64237,0xa89aaaed3dd84555,0xbb70924d8fb73f27,0x0869ec018b3366dc,0x47a0356ce742bcbc,0x00547dbda6dc094d}};
-
-// 3^g-torsion basis for A==0
-const fp2_t xP3 = {{0x7c878d0ceaa821f0,0xf94db4cab7186625,0x7cff6d5fb0ca7867,0x4e3f5bd19cbca9d6,0x05ec8273d0042548,0x0233a79cf87040b3},{0x060e9f3dcab8192c,0xa94e86d063a46398,0x0e5cc403bfb60867,0x3ea1277f98087283,0xaff1fd95bb094917,0x025041b12719d3b8}};
-const fp2_t xQ3 = {{0xb25aaa192bd351b7,0xc5db1962aed7e543,0x1f722ab174319947,0xd1c9bb4a0a5d8aa3,0x351415ec64f88921,0x0288ae044d62c930},{0xb41ede1724f8e06a,0xfb10ce5a83c66629,0x9846173e31a9d448,0x35c94966192f08db,0x72f7252946af3f9c,0x02ea05c971e7b34c}};
-const fp2_t xPQ3 = {{0x674703cc3134d90b,0x507e338e496b8f75,0x0c8cb1f138346e4c,0x54cb7ad5ba580da7,0x65750f0bcd0a9857,0x038b435f51669e87},{0xdcdc0116c67589a0,0x45ce94f4d345c827,0x0f2cbfb3c53b73ea,0x03e7951bc98efbb8,0x3335ad0991864858,0x01e151a64210f74f}};
-
-#endif
--- a/src/ec/ref/lvl5/CMakeLists.txt
+++ b/src/ec/ref/lvl5/CMakeLists.txt
@@ -1,17 +1 @@
-set(SOURCE_FILES_EC_${SVARIANT_UPPER}_REF
-    ${ECX_DIR}/poly-mul.c 
-    ${ECX_DIR}/poly-redc.c 
-    ${ECX_DIR}/ec.c 
-    ${ECX_DIR}/tedwards.c 
-    ${ECX_DIR}/kps.c 
-    ${ECX_DIR}/xisog.c 
-    ${ECX_DIR}/xeval.c 
-    ${ECX_DIR}/isog_chains.c 
-    ${ECX_DIR}/basis.c
-)
-
-add_library(${LIB_EC_${SVARIANT_UPPER}} ${SOURCE_FILES_EC_${SVARIANT_UPPER}_REF})
-target_include_directories(${LIB_EC_${SVARIANT_UPPER}} PRIVATE ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_PUBLIC} ${INC_GF_${SVARIANT_UPPER}} ${INC_COMMON} ${INC_EC})
-target_compile_options(${LIB_EC_${SVARIANT_UPPER}} PRIVATE ${C_OPT_FLAGS})
-
-add_subdirectory(test)
+include(../lvlx.cmake)
--- a/src/ec/ref/lvl5/test/CMakeLists.txt
+++ b/src/ec/ref/lvl5/test/CMakeLists.txt
@@ -1,36 +1 @@
-add_executable(fp2.test_${SVARIANT_LOWER} ${ECX_DIR}/test/fp2-test.c)
-	target_include_directories(fp2.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include ../include ${INC_GF_${SVARIANT_UPPER}} ${INC_EC} ${INC_COMMON})
-	target_link_libraries(fp2.test_${SVARIANT_LOWER} ${LIB_GF_${SVARIANT_UPPER}})
-		
-add_executable(poly-mul.test_${SVARIANT_LOWER} ${ECX_DIR}/test/poly-mul-test.c)
-	target_include_directories(poly-mul.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include ${INC_EC} ${INC_COMMON})
-	target_link_libraries(poly-mul.test_${SVARIANT_LOWER} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
-		
-add_executable(poly-redc.test_${SVARIANT_LOWER} ${ECX_DIR}/test/poly-redc-test.c)
-	target_include_directories(poly-redc.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include  ${INC_EC} ${INC_COMMON})
-	target_link_libraries(poly-redc.test_${SVARIANT_LOWER} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
-	
-add_executable(mont.test_${SVARIANT_LOWER} ${ECX_DIR}/test/mont-test.c)
-	target_include_directories(mont.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include ${INC_EC} ${INC_COMMON} .)
-	target_link_libraries(mont.test_${SVARIANT_LOWER} ${LIB_PRECOMP_${SVARIANT_UPPER}} ${LIB_INTBIG} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
-
-add_executable(ec.test_${SVARIANT_LOWER} ${ECX_DIR}/test/ec-test.c ${ECX_DIR}/test/test_extras.c)
-	target_include_directories(ec.test_${SVARIANT_LOWER} PUBLIC ${ECX_DIR}/test ${INC_GF_${SVARIANT_UPPER}} ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include ${INC_EC} ${INC_COMMON} .)
-	target_link_libraries(ec.test_${SVARIANT_LOWER} ${LIB_PRECOMP_${SVARIANT_UPPER}} ${LIB_INTBIG} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
-
-add_executable(velu.test_${SVARIANT_LOWER} ${ECX_DIR}/test/velu-test.c)
-	target_include_directories(velu.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include ${INC_EC} ${INC_COMMON} .)
-	target_link_libraries(velu.test_${SVARIANT_LOWER} ${LIB_PRECOMP_${SVARIANT_UPPER}} ${LIB_INTBIG} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
-
-add_executable(isog.test_${SVARIANT_LOWER} ${ECX_DIR}/test/isog-test.c)
-	target_include_directories(isog.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include ${INC_EC} ${INC_COMMON} .)
-	target_link_libraries(isog.test_${SVARIANT_LOWER} ${LIB_PRECOMP_${SVARIANT_UPPER}} ${LIB_INTBIG} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
-
-
-add_test(ec_fp2.test_${SVARIANT_LOWER} fp2.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
-add_test(ec_poly-mul.test_${SVARIANT_LOWER} poly-mul.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
-add_test(ec_poly-redc.test_${SVARIANT_LOWER} poly-redc.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
-add_test(ec_mont.test_${SVARIANT_LOWER} mont.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
-add_test(ec_ec.test_${SVARIANT_LOWER} ec.test_${SVARIANT_LOWER} test ${SQISIGN_TEST_REPS})
-add_test(ec_velu.test_${SVARIANT_LOWER} velu.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
-add_test(ec_isog.test_${SVARIANT_LOWER} isog.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
+include(../../lvlx_test.cmake)
--- a/src/ec/ref/lvl5/test/ec-tests.h
+++ b/src/ec/ref/lvl5/test/ec-tests.h
@@ -1,400 +0,0 @@
-#ifndef EC_TESTS_H
-#define EC_TESTS_H
-
-#include "test_extras.h"
-#include <stdio.h>
-#include <string.h>
-#include <bench.h>       //////// NOTE: enable later
-#include "test-basis.h"
-#include "ec_params.h"
-
-// Global constants
-extern const digit_t p[NWORDS_FIELD];
-
-// Benchmark and test parameters  
-static int BENCH_LOOPS = 1000;       // Number of iterations per bench
-static int TEST_LOOPS  = 512;       // Number of iterations per test
-
-
-bool ec_test()
-{ // Tests for ecc arithmetic
-    bool OK = true;
-    int passed;
-    ec_point_t P = {0}, Q = {0}, R = {0}, S = {0}, SS = {0}, PQ = {0};
-    ec_point_t AC = {0};
-    digit_t k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
-
-    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
-    printf("Testing ecc functions: (NOT IMPLEMENTED) \n\n"); 
-/*
-    // Point doubling
-    passed = 1;
-    P.x.re[0] = 0xDFD70ED0861BD329; P.x.re[1] = 0x20ACD3758C7F5540; P.x.re[2] = 0x3DCCDC007277F80A; P.x.re[3] = 0x18D6D2A22981DCE1;
-    P.x.im[0] = 0x3C23730A3F08F38C; P.x.im[1] = 0x98BB973AFD3D954D; P.x.im[2] = 0x8D98ADFC2829AE8A; P.x.im[3] = 0x21A2464D6369AFBA;
-    P.z.re[0] = 0x01;
-
-    AC.z.re[0] = 0x01;
-    fp2_tomont(&AC.z, &AC.z);
-        
-    fp2_tomont(&R.x, &P.x);
-    fp2_tomont(&R.z, &P.z);
-    xDBL(&S, &R, &AC);
-    fp2_copy(&SS.x, &S.x);    // Copy of S = SS <- 2P 
-    fp2_copy(&SS.z, &S.z);
-    fp2_inv(&S.z);
-    fp2_mul(&S.x, &S.x, &S.z);
-    fp2_frommont(&S.x, &S.x);
-
-    R.x.re[0] = 0x5950EE0A4AF90FC8; R.x.re[1] = 0x16488065A0A98B08; R.x.re[2] = 0xCE65322229DA0FD1; R.x.re[3] = 0x270A35FF781EE204;
-    R.x.im[0] = 0x564447FD9EC57F6B; R.x.im[1] = 0x2EE24E984294F729; R.x.im[2] = 0x53A6C7360E972C71; R.x.im[3] = 0x4FCF4B9928A7C7E;
-
-    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2)!=0) { passed=0; goto out0; }
-    
-    Q.x.re[0] = 0xC46076A670C70053; Q.x.re[1] = 0x97517AFA3AB9ED13; Q.x.re[2] = 0x349644C942EDF993; Q.x.re[3] = 0xBB4A4DB6F29AF9E;
-    Q.x.im[0] = 0x8B47629FB5A15BB0; Q.x.im[1] = 0x4EC6E809953C1A10; Q.x.im[2] = 0x1F83F0EC6CBB84D6; Q.x.im[3] = 0x1D8417C1D33265D3;
-    Q.z.re[0] = 0x01;
-
-    PQ.x.re[0] = 0x853F66D11BE5534F; PQ.x.re[1] = 0x27C8FD4E52D03D4A; PQ.x.re[2] = 0xF88EA78D0A0C29D2; PQ.x.re[3] = 0x2F6DFB07D397A067;
-    PQ.x.im[0] = 0xE8DBC4AA34434BA1; PQ.x.im[1] = 0x7A73AE182636F8A0; PQ.x.im[2] = 0x419EC260137868EB; PQ.x.im[3] = 0x129B3E301703D43F;
-    PQ.z.re[0] = 0x01;
-
-    fp2_tomont(&S.x, &Q.x);
-    fp2_tomont(&S.z, &Q.z);
-    fp2_tomont(&PQ.x, &PQ.x);
-    fp2_tomont(&PQ.z, &PQ.z);
-    xADD(&S, &SS, &S, &PQ);
-    fp2_inv(&S.z);
-    fp2_mul(&S.x, &S.x, &S.z);
-    fp2_frommont(&S.x, &S.x);
-
-    R.x.re[0] = 0xED0BEB8F93AB4FF9; R.x.re[1] = 0x27CF508B80CD49BF; R.x.re[2] = 0x38A6134DFA04B2BA; R.x.re[3] = 0x27B4CB15E109EF1F;
-    R.x.im[0] = 0x6F731BA6FD227BDE; R.x.im[1] = 0x14C12335341167F8; R.x.im[2] = 0xECA7B60F7866E27A; R.x.im[3] = 0x2A7A79A152880457;
-
-    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
-    
-    fp2_tomont(&R.x, &P.x);
-    fp2_tomont(&R.z, &P.z);
-    k[0] = 126;
-    xMUL(&S, &R, k, (ec_curve_t*)&AC);
-    fp2_inv(&S.z);
-    fp2_mul(&S.x, &S.x, &S.z);
-    fp2_frommont(&S.x, &S.x);
-
-    R.x.re[0] = 0xDE80F87A1203A147; R.x.re[1] = 0xD59E1215928A3B2D; R.x.re[2] = 0xD5A67F83A5A8CE46; R.x.re[3] = 0xA11E162488C9CDF;
-    R.x.im[0] = 0x9417D0D79A26741B; R.x.im[1] = 0x8B1F47D6F0FE5EEC; R.x.im[2] = 0xE52188DCB054CE36; R.x.im[3] = 0x1A8075A6C3148AB3;
-
-    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
-    
-    fp2_tomont(&R.x, &P.x);
-    fp2_tomont(&R.z, &P.z);
-    k[0] = 0xE77AD6B6C6B2D8CD;
-    k[1] = 0xDE43A0B600F38D12;
-    k[2] = 0xA35F4A7897E17CE2;
-    k[3] = 0x10ACB62E614D1237;
-    xMUL(&S, &R, k, (ec_curve_t*)&AC);
-    fp2_inv(&S.z);
-    fp2_mul(&S.x, &S.x, &S.z);
-    fp2_frommont(&S.x, &S.x);
-
-    R.x.re[0] = 0xD3938B0A68A3E7C0; R.x.re[1] = 0xE0667113208A0595; R.x.re[2] = 0x258F314C84E9CB60; R.x.re[3] = 0x14984BA7CA59AB71;
-    R.x.im[0] = 0xFE728423EE3BFEF4; R.x.im[1] = 0xBF68C42FE21AE0E4; R.x.im[2] = 0xA8FAF9C9528609CA; R.x.im[3] = 0x1225EC77A1DC0285;
-
-    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
-    
-    fp2_tomont(&R.x, &Q.x);
-    fp2_tomont(&R.z, &Q.z);
-    k[0] = 0xE77AD6B6C6B2D8CD;
-    k[1] = 0xDE43A0B600F38D12;
-    k[2] = 0xA35F4A7897E17CE2;
-    k[3] = 0x10ACB62E614D1237;
-    l[0] = 0x34AB78B6C6B2D8C0;
-    l[1] = 0xDE6B2D8CD00F38D1;
-    l[2] = 0xA35F4A7897E17CE2;
-    l[3] = 0x20ACF4A789614D13;
-    fp2_inv(&SS.z);
-    fp2_mul(&SS.x, &SS.x, &SS.z);
-    fp2_copy(&SS.z, &R.z);
-    xDBLMUL(&S, &R, k, &SS, l, &PQ, (ec_curve_t*)&AC);
-    fp2_inv(&S.z);
-    fp2_mul(&S.x, &S.x, &S.z);
-    fp2_frommont(&S.x, &S.x);
-
-    R.x.re[0] = 0x554E1ADC609B992F; R.x.re[1] = 0xE407D961F8CC4C42; R.x.re[2] = 0x1CF626AFED5A68CE; R.x.re[3] = 0x6D02692EE110483;
-    R.x.im[0] = 0x16FB094E831C8997; R.x.im[1] = 0xFDE4ECF31DC5F702; R.x.im[2] = 0x89303D868DFAD7B4; R.x.im[3] = 0xC91ACE81346F22D;
-
-    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
-    
-out0:
-    if (passed==1) printf("  ECC arithmetic tests ............................................ PASSED");
-    else { printf("  ECC arithmetic tests... FAILED"); printf("\n"); return false; }
-    printf("\n");
- */
-    return OK;
-}
-
-bool dlog_test()
-{ // Tests for dlog
-    bool OK = true;
-    int passed;
-    ec_point_t P = {0}, Q = {0}, R = {0}, S = {0}, SS = {0}, PQ = {0};
-    ec_curve_t AC = {0};
-    ec_basis_t PQ2;
-    digit_t scalarP[NWORDS_ORDER], scalarQ[NWORDS_ORDER], k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
-    digit_t kt[NWORDS_ORDER], lt[NWORDS_ORDER], f1[NWORDS_ORDER] = {0}, f2[NWORDS_ORDER] = {0}, zero[NWORDS_ORDER] = {0}, tpFdiv2[NWORDS_ORDER] = {0}, tpF[NWORDS_ORDER] = {0};
-
-    printf("\n--------------------------------------------------------------------------------------------------------\n\n");
-    printf("Testing dlog functions: \n\n");
-
-    // dlog2 testing
-    passed = 1;
-    
-    fp2_tomont(&P.x, &xP2);
-    fp_mont_setone(P.z.re);
-    fp_set(P.z.im, 0);
-    
-    fp2_tomont(&Q.x, &xQ2);
-    fp_mont_setone(Q.z.re);
-    fp_set(Q.z.im, 0);
-    
-    fp2_tomont(&PQ.x, &xPQ2);
-    fp_mont_setone(PQ.z.re);
-    fp_set(PQ.z.im, 0);
-
-    AC.C.re[0] = 0x01;
-    fp_copy(f1, TWOpFm1);
-    fp_copy(f2, TWOpF);
-    fp2_tomont(&AC.C, &AC.C);
-
-    copy_point(&PQ2.P, &P);
-    copy_point(&PQ2.Q, &Q);
-    copy_point(&PQ2.PmQ, &PQ);
-    k[0] = 0xFFFFFFFFFFFFFFFF;
-    k[1] = 0x00000000000007FF;
-    l[0] = 0xFFFFFFFFFFFFFFFE;
-    l[1] = 0x00000000000007FF;
-
-    for (int n = 0; n < TEST_LOOPS; n++)
-    {
-        k[0] -= 1;
-        l[0] -= 2;
-        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
-        ec_dlog_2(scalarP, scalarQ, &PQ2, &R, &AC);
-
-        memcpy(kt, k, NWORDS_ORDER*RADIX/8);
-        memcpy(lt, l, NWORDS_ORDER*RADIX/8);
-        if (compare_words(k, f1, NWORDS_ORDER) == 1 ||
-           (compare_words(l, f1, NWORDS_ORDER) == 1 && (compare_words(k, zero, NWORDS_ORDER) == 0 || compare_words(k, f1, NWORDS_ORDER) == 0))) {
-            if (compare_words(k, zero, NWORDS_ORDER) != 0) {
-                sub_test(kt, f2, kt, NWORDS_ORDER);
-            }
-            if (compare_words(l, zero, NWORDS_ORDER) != 0) {
-                sub_test(lt, f2, lt, NWORDS_ORDER);
-            }
-        }
-        if (compare_words((digit_t*)scalarP, (digit_t*)kt, NWORDS_ORDER) != 0 || compare_words((digit_t*)scalarQ, (digit_t*)lt, NWORDS_ORDER) != 0) { passed = 0; break; }
-    }
-
-    if (passed == 1) printf("  dlog2 tests ..................................................... PASSED");
-    else { printf("  dlog2 tests... FAILED"); printf("\n"); return false; }
-    printf("\n");
-
-    // dlog3 testing
-    passed = 1;
-    
-    fp2_tomont(&P.x, &xP3);
-    fp_mont_setone(P.z.re);
-    fp_set(P.z.im, 0);
-    
-    fp2_tomont(&Q.x, &xQ3);
-    fp_mont_setone(Q.z.re);
-    fp_set(Q.z.im, 0);
-    
-    fp2_tomont(&PQ.x, &xPQ3);
-    fp_mont_setone(PQ.z.re);
-    fp_set(PQ.z.im, 0);
-
-    AC.C.re[0] = 0x01;
-    fp_copy(tpFdiv2, THREEpFdiv2);
-    fp_copy(tpF, THREEpF);
-    fp2_tomont(&AC.C, &AC.C);
-
-    copy_point(&PQ2.P, &P);
-    copy_point(&PQ2.Q, &Q);
-    copy_point(&PQ2.PmQ, &PQ);
-    k[1] = 0;
-    l[1] = 0;
-    k[0] = 0x02153E468B91C6D1;
-    l[0] = 0x02153E468B91C6D0;
-
-    for (int n = 0; n < TEST_LOOPS; n++)
-    {
-        k[0] -= 1;
-        l[0] -= 2;
-        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
-        ec_dlog_3(scalarP, scalarQ, &PQ2, &R, &AC);
-
-        memcpy(kt, k, NWORDS_ORDER*RADIX/8);
-        memcpy(lt, l, NWORDS_ORDER*RADIX/8);
-        if (compare_words(k, tpFdiv2, NWORDS_ORDER) == 1 ||
-           (compare_words(l, tpFdiv2, NWORDS_ORDER) == 1 && compare_words(k, zero, NWORDS_ORDER) == 0)) {
-            if (compare_words(k, zero, NWORDS_ORDER) != 0) {
-                sub_test(kt, tpF, kt, NWORDS_ORDER);
-            }
-            if (compare_words(l, zero, NWORDS_ORDER) != 0) {
-                sub_test(lt, tpF, lt, NWORDS_ORDER);
-            }
-        }
-        if (compare_words((digit_t*)scalarP, (digit_t*)kt, NWORDS_ORDER) != 0 || compare_words((digit_t*)scalarQ, (digit_t*)lt, NWORDS_ORDER) != 0) { passed = 0; break; }
-    }
-
-    if (passed == 1) printf("  dlog3 tests ..................................................... PASSED");
-    else { printf("  dlog3 tests... FAILED"); printf("\n"); return false; }
-    printf("\n");
-
-    return OK;
-}
-
-bool ec_run()
-{
-    bool OK = true;
-    int n;
-    unsigned long long cycles, cycles1, cycles2;
-    ec_point_t P, Q, R, PQ, AC;
-    digit_t k[NWORDS_ORDER], l[NWORDS_ORDER];
-        
-    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
-    printf("Benchmarking ecc arithmetic: \n\n"); 
-
-    // Point doubling
-    cycles = 0;
-    for (n=0; n<BENCH_LOOPS; n++)
-    {
-        cycles1 = cpucycles(); 
-        xDBL(&Q, &P, &AC);
-        cycles2 = cpucycles();
-        cycles = cycles+(cycles2-cycles1);
-    }
-    printf("  Montgomery x-only doubling runs in .............................. %7lld cycles", cycles/BENCH_LOOPS);
-    printf("\n");
-
-    // Point addition
-    cycles = 0;
-    for (n = 0; n < BENCH_LOOPS; n++)
-    {
-        cycles1 = cpucycles();
-        xADD(&R, &Q, &P, &PQ);
-        cycles2 = cpucycles();
-        cycles = cycles + (cycles2 - cycles1);
-    }
-    printf("  Montgomery x-only addition runs in .............................. %7lld cycles", cycles/BENCH_LOOPS);
-    printf("\n");
-
-    // Point multiplication
-    cycles = 0;
-    for (n = 0; n < BENCH_LOOPS; n++)
-    {
-        cycles1 = cpucycles();
-        xMUL(&Q, &P, k, (ec_curve_t*)&AC);
-        cycles2 = cpucycles();
-        cycles = cycles + (cycles2 - cycles1);
-    }
-    printf("  Montgomery x-only scalar multiplication runs in ................. %7lld cycles", cycles/BENCH_LOOPS);
-    printf("\n");
-
-    // Point multiplication
-    cycles = 0;
-    for (n = 0; n < BENCH_LOOPS; n++)
-    {
-        cycles1 = cpucycles();
-        xDBLMUL(&R, &P, k, &Q, l, &PQ, (ec_curve_t*)&AC);
-        cycles2 = cpucycles();
-        cycles = cycles + (cycles2 - cycles1);
-    }
-    printf("  Montgomery x-only double-scalar multiplication runs in .......... %7lld cycles", cycles/BENCH_LOOPS);
-    printf("\n");
-
-    return OK;
-}
-
-bool dlog_run()
-{
-    bool OK = true;
-    int n;
-    unsigned long long cycles, cycles1, cycles2;
-    ec_point_t P = {0}, Q = {0}, R = {0}, S = {0}, SS = {0}, PQ = {0};
-    ec_curve_t AC = {0};
-    ec_basis_t PQ2;
-    digit_t scalarP[NWORDS_ORDER], scalarQ[NWORDS_ORDER], k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
-
-    printf("\n--------------------------------------------------------------------------------------------------------\n\n");
-    printf("Benchmarking dlog2: \n\n");
-
-    // dlog2 computation
-    
-    fp2_tomont(&P.x, &xP2);
-    fp_mont_setone(P.z.re);
-    fp_set(P.z.im, 0);
-    
-    fp2_tomont(&Q.x, &xQ2);
-    fp_mont_setone(Q.z.re);
-    fp_set(Q.z.im, 0);
-    
-    fp2_tomont(&PQ.x, &xPQ2);
-    fp_mont_setone(PQ.z.re);
-    fp_set(PQ.z.im, 0);
-
-    AC.C.re[0] = 0x01;
-    fp2_tomont(&AC.C, &AC.C);
-
-    copy_point(&PQ2.P, &P);
-    copy_point(&PQ2.Q, &Q);
-    copy_point(&PQ2.PmQ, &PQ);
-
-    cycles = 0;
-    for (n = 0; n < BENCH_LOOPS; n++)
-    {
-        fprandom_test(k); fprandom_test(l);
-        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
-        cycles1 = cpucycles();
-        ec_dlog_2(scalarP, scalarQ, &PQ2, &R, &AC);
-        cycles2 = cpucycles();
-        cycles = cycles + (cycles2 - cycles1);
-    }
-    printf("  dlog2 runs in ................................................... %7lld cycles", cycles/BENCH_LOOPS);
-    printf("\n");
-
-    // dlog3 computation
-
-    fp2_tomont(&P.x, &xP3);
-    fp_mont_setone(P.z.re);
-    fp_set(P.z.im, 0);
-    
-    fp2_tomont(&Q.x, &xQ3);
-    fp_mont_setone(Q.z.re);
-    fp_set(Q.z.im, 0);
-    
-    fp2_tomont(&PQ.x, &xPQ3);
-    fp_mont_setone(PQ.z.re);
-    fp_set(PQ.z.im, 0);
-
-    copy_point(&PQ2.P, &P);
-    copy_point(&PQ2.Q, &Q);
-    copy_point(&PQ2.PmQ, &PQ);
-
-    cycles = 0;
-    for (n = 0; n < BENCH_LOOPS; n++)
-    {
-        fprandom_test(k); fprandom_test(l);
-        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
-        cycles1 = cpucycles();
-        ec_dlog_3(scalarP, scalarQ, &PQ2, &R, &AC);
-        cycles2 = cpucycles();
-        cycles = cycles + (cycles2 - cycles1);
-    }
-    printf("  dlog3 runs in ................................................... %7lld cycles", cycles/BENCH_LOOPS);
-    printf("\n");
-
-    return OK;
-}
-
-#endif
--- a/src/ec/ref/lvl5/test/test-basis.h
+++ b/src/ec/ref/lvl5/test/test-basis.h
@@ -1,24 +0,0 @@
-#ifndef TEST_BASIS_H
-#define TEST_BASIS_H
-
-#include "fp2.h"
-// Full-torsion basis for A=0 (excluding 2^f and huge prime factors)
-const fp2_t xPA = {{0x3c780e636a5869dc,0xb8a1d106332efe8e,0x7dd946e490e6578e,0x71d1fadbea881f88,0xb94912baba3999f0,0x85343be0a74ca9e1,0x22ae01775a9f7fa4,0x001032ffab70a66e},{0x15908a4b85221a67,0x342f82e6a1db4e1d,0x3d7c806a0d47b041,0x693830fad798c598,0xcfa244134a61827a,0x7f723d6f5d9628cf,0x10da657833d4d027,0x000c48499df01216}};
-const fp2_t xQA = {{0x79a766df9c10c642,0x7677cb85097be8be,0x2a21c7f9b84b9deb,0xb263e837f57210ce,0x551d6636b7c7e061,0x78d332581bee10b2,0xce30a9926772e06c,0x00150b5009b1d6ed},{0xbb2f097dae470eb9,0x53940c6df1eb93a9,0x7786a4bab87320c1,0x89d32acc1c91db18,0x733ef7f139fb7f9b,0x7bc336ee25a3901b,0xf7dfe8f5559eeeb1,0x00210555ab63e7f3}};
-const fp2_t xPQA = {{0x315ead6fadc8b0d6,0x7da37e8b7e94de95,0xcc6a9e206f513651,0x84fa9fab584acf3d,0x293b25689ac50519,0xe3222bd1c8154964,0x8ad7f39d04a8274f,0x000898edca69c223},{0x3e6c3e1864851e7e,0x01807c724f75ad5e,0xe9cd50eff4e66fb7,0x6c7c19a88fed9707,0x3ab57d0499386a40,0x6b5fd53c6efdc0b5,0x092fe030da27bc43,0x00076f2f409c5f8e}};
-
-const fp2_t xPB = {{0x229e388475511856,0x2f6b17e9ec9258c0,0x0cb28c568697f9f4,0xca039e28512c9f9b,0xd52d823761b0daa2,0xa09c3800e22c5e3b,0x2971022668c3b76a,0x0006e91c4415afd1},{0xbd5059b7406e1dcd,0x9da456ed8c11f1a3,0x1fb30e9cf66f928e,0x867c348b2f488d26,0x9d4b03d8aa4229bc,0x1c01ca1088d145a8,0xc9d6a201d77644a1,0x000a0d45131bf5b0}};
-const fp2_t xQB = {{0x712f0e5d0e3b4dfa,0x52260082dda1a07e,0x5a7513dcfd273829,0xc686f0976cbb5dcf,0xf5fc3df004cc7efc,0x615d0c2da4f2fb9f,0x796efbb3f65aede8,0x00028176c42e1d9f},{0xb8779b5a7bd2436b,0x4067b7e09d0ca56c,0xfdbaee6ff27ebe38,0x69310e98174025de,0x71960a10fa15706e,0x08ffb4b3f6efafbf,0xb7116ca162211ea3,0x00253c0f60765f1f}};
-const fp2_t xPQB = {{0x0e90506c89b46e0c,0x24ec65d5deb4e5b9,0x8477f7e141db8725,0xf76957ec1940dbd3,0xc2857af32534e715,0x06820654c6bae5f4,0x5ac928ef3c90c1f8,0x0024f724366faeed},{0xf6d7d2fdb06b91c4,0xe603cf05ce3f7555,0x8a0876277637415c,0xa1ef891f00155f8f,0x159db3ac93d39d57,0x5a05683aeaa453ff,0x180c38da2402f6fc,0x000b69d01dcb9107}};
-
-// 2^f-torsion basis for A=0
-const fp2_t xP2 = {{0x5d453ee3e6de9bf6,0xb5e51a5e88d8bbf3,0xc91ce6ef41eda957,0x4e0ba74e86fd3385,0xeff87c1def35e01f,0xedcd6c20496988a5,0x91a2c14abdb955fe,0x000be92a3f4de175},{0xa8a13d8e0022a825,0xb26bb70885d42bef,0x2533c31e799596b4,0xc41d58b247fb5ac9,0x8d45fa188fd5cb65,0x1b0593f6e4af948d,0x0ede22e4fcbe17ca,0x0014f54c5d5e1308}};
-const fp2_t xQ2 = {{0x90414b2365f868cd,0x68af18688f73fe25,0x46ca4c4b4ca19114,0xadae5e2564f79c98,0xfe3e09af9d00eb08,0x6856810a298a57bf,0x170d41ba9327205d,0x001d588b6744b4ea},{0xfb94e978bcf29be5,0x136700c07b264bd6,0x62a3c89d8466b8f9,0x9f990ca7d3084bd8,0xaab6fb1040e242d0,0x9e9325c5a5c20740,0xa9a6ee97f376e198,0x0003c8eee3581511}};
-const fp2_t xPQ2 = {{0x873d426c501eafe6,0xdeb1e87769484669,0x57c38f42bd1fef4d,0x53ca12d14b2ded18,0xb72ef4a808fc9d70,0x59d9a54b1844cca1,0x6ca7ccb15b6a9e49,0x00132a12929654f7},{0xffc6b824b6603270,0xb4152cbd3b607298,0xbe97764acdcb16ce,0x5205b1ec222c3be9,0x0cf5ac18d1eb4984,0xf5233664fd72c328,0x492e775887a3367c,0x001ce6bdfc847b45}};
-
-// 3^g-torsion basis for A=0
-const fp2_t xP3 = {{0x807a6abcb56d1915,0x3ab8ff7df809ea8f,0x2bd4f1eba48b23ac,0xeb32542370dde5ff,0xe6c50551eaaf2329,0x545dceaf98f07f09,0x90bfb0e10f3e5b48,0x000cc0084da1b367},{0xbd6f9c82cd4acc13,0x9b39d0711267d8a2,0x0ff31ab9fd38bb36,0xccc169cd75c1a58b,0xd943ad3571e304b4,0xfc3cda0859595d00,0xabda66362732b019,0x00070c5abcf1f329}};
-const fp2_t xQ3 = {{0x2b46bbfa6e57a9db,0xa7a5881479d3aaff,0x5c8106d57698b7cb,0xde0ccd3c436cd1ad,0xed351e8fbc28fd8f,0xe18a9a18e4f5bf03,0x9a98961a81073911,0x001ed93f47abe8f2},{0x5dc96ddee6e9a9eb,0x5e8905d15b918006,0xe89cecdc3f9b48f1,0x9d1a98543001e35e,0x0795c7b134dadeba,0x8050c48376f36d87,0xe9f364f7c6fbee1f,0x00061cb05b384f81}};
-const fp2_t xPQ3 = {{0xd44970f662987227,0x4c8eda7256920e8d,0x857f42e972e25a0e,0xc66a5b62daa3644d,0x6ab4ded74a464c38,0x4157cc1048b85a3a,0x9916ab1ee4e2305a,0x000c6943137ffba1},{0x0c5118f818e5279d,0xacb0c4a011613c7a,0xb87b4a9cb16a7565,0xc997ccbe0159f318,0x6fc50720bce6f45f,0xbd1916a5ca7789d7,0x3f48f437fdeccc64,0x000674d925340bc4}};
-
-#endif
--- a/src/ec/ref/lvlx.cmake
+++ b/src/ec/ref/lvlx.cmake
@@ -0,0 +1,17 @@
+set(SOURCE_FILES_EC_${SVARIANT_UPPER}_REF # sources of the reference EC library for one variant
+    ${LVLX_DIR}/ec.c
+    ${LVLX_DIR}/ec_jac.c
+    ${LVLX_DIR}/xisog.c
+    ${LVLX_DIR}/xeval.c
+    ${LVLX_DIR}/isog_chains.c
+    ${LVLX_DIR}/basis.c
+    ${LVLX_DIR}/biextension.c
+)
+# Static EC library for this variant; include directories and optimisation flags stay private to it.
+add_library(${LIB_EC_${SVARIANT_UPPER}} STATIC ${SOURCE_FILES_EC_${SVARIANT_UPPER}_REF})
+target_include_directories(${LIB_EC_${SVARIANT_UPPER}} PRIVATE ${INC_COMMON} ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_PUBLIC} ${INC_MP} ${INC_GF} ${INC_GF_${SVARIANT_UPPER}} ${INC_EC})
+target_compile_options(${LIB_EC_${SVARIANT_UPPER}} PRIVATE ${C_OPT_FLAGS})
+target_link_libraries(${LIB_EC_${SVARIANT_UPPER}} ${LIB_PRECOMP_${SVARIANT_UPPER}} ${LIB_MP} ${LIB_GF_${SVARIANT_UPPER}})
+target_compile_definitions(${LIB_EC_${SVARIANT_UPPER}} PUBLIC SQISIGN_VARIANT=${SVARIANT_LOWER}) # PUBLIC: targets linking this library also get the definition
+# NOTE(review): presumably this file is include()d per level, so "test" resolves against the including directory -- confirm.
+add_subdirectory(test)
--- a/src/ec/ref/lvlx/basis.c
+++ b/src/ec/ref/lvlx/basis.c
@@ -0,0 +1,416 @@
+#include "ec.h"
+#include "fp2.h"
+#include "e0_basis.h"
+#include <assert.h>
+
+uint32_t
+ec_recover_y(fp2_t *y, const fp2_t *Px, const ec_curve_t *curve)
+{ // Recover y-coordinate of a point on the Montgomery curve y^2 = x^3 + Ax^2 + x
+    fp2_t t0;
+    // Only curve->A is read below, so the curve is treated as (A : 1); curve->C is not used.
+    fp2_sqr(&t0, Px); // x^2
+    fp2_mul(y, &t0, &curve->A); // Ax^2
+    fp2_add(y, y, Px);          // Ax^2 + x
+    fp2_mul(&t0, &t0, Px); // x^3
+    fp2_add(y, y, &t0); // x^3 + Ax^2 + x
+    // This is required, because we do not yet know that our curves are
+    // supersingular so our points live on the twist with B = 1.
+    return fp2_sqrt_verify(y); // NOTE(review): presumably takes the root in place and reports whether y was a square -- confirm in fp2.h
+}
+
+static void
+difference_point(ec_point_t *PQ, const ec_point_t *P, const ec_point_t *Q, const ec_curve_t *curve)
+{
+    // Given P,Q in projective x-only, computes a deterministic choice for (P-Q)
+    // Based on Proposition 3 of https://eprint.iacr.org/2017/518.pdf
+    // Output: PQ = (Bxz + sqrt(Bxz^2 - Bxx*Bzz) : Bzz), computed from curve->A and curve->C.
+    fp2_t Bxx, Bxz, Bzz, t0, t1;
+
+    fp2_mul(&t0, &P->x, &Q->x); // t0 = P.x*Q.x
+    fp2_mul(&t1, &P->z, &Q->z); // t1 = P.z*Q.z
+    fp2_sub(&Bxx, &t0, &t1);
+    fp2_sqr(&Bxx, &Bxx);
+    fp2_mul(&Bxx, &Bxx, &curve->C); // C*(P.x*Q.x-P.z*Q.z)^2
+    fp2_add(&Bxz, &t0, &t1);
+    fp2_mul(&t0, &P->x, &Q->z); // t0 = P.x*Q.z
+    fp2_mul(&t1, &P->z, &Q->x); // t1 = P.z*Q.x
+    fp2_add(&Bzz, &t0, &t1);
+    fp2_mul(&Bxz, &Bxz, &Bzz); // (P.x*Q.x+P.z*Q.z)(P.x*Q.z+P.z*Q.x)
+    fp2_sub(&Bzz, &t0, &t1);
+    fp2_sqr(&Bzz, &Bzz);
+    fp2_mul(&Bzz, &Bzz, &curve->C); // C*(P.x*Q.z-P.z*Q.x)^2
+    fp2_mul(&Bxz, &Bxz, &curve->C); // C*(P.x*Q.x+P.z*Q.z)(P.x*Q.z+P.z*Q.x)
+    fp2_mul(&t0, &t0, &t1); // t0 = P.x*Q.z*P.z*Q.x
+    fp2_mul(&t0, &t0, &curve->A);
+    fp2_add(&t0, &t0, &t0);
+    fp2_add(&Bxz, &Bxz, &t0); // C*(P.x*Q.x+P.z*Q.z)(P.x*Q.z+P.z*Q.x) + 2*A*P.x*Q.z*P.z*Q.x
+
+    // To ensure that the denominator is a fourth power in Fp, we normalize by
+    // C*C_bar^2*(P.z)_bar^2*(Q.z)_bar^2
+    fp_copy(&t0.re, &curve->C.re);
+    fp_neg(&t0.im, &curve->C.im); // t0 = C_bar (real part copied, imaginary part negated)
+    fp2_sqr(&t0, &t0);
+    fp2_mul(&t0, &t0, &curve->C); // t0 = C*C_bar^2
+    fp_copy(&t1.re, &P->z.re);
+    fp_neg(&t1.im, &P->z.im); // t1 = (P.z)_bar
+    fp2_sqr(&t1, &t1);
+    fp2_mul(&t0, &t0, &t1);
+    fp_copy(&t1.re, &Q->z.re);
+    fp_neg(&t1.im, &Q->z.im); // t1 = (Q.z)_bar
+    fp2_sqr(&t1, &t1);
+    fp2_mul(&t0, &t0, &t1); // t0 = full normalisation factor
+    fp2_mul(&Bxx, &Bxx, &t0);
+    fp2_mul(&Bxz, &Bxz, &t0);
+    fp2_mul(&Bzz, &Bzz, &t0);
+
+    // Solving quadratic equation
+    fp2_sqr(&t0, &Bxz);
+    fp2_mul(&t1, &Bxx, &Bzz);
+    fp2_sub(&t0, &t0, &t1); // t0 = Bxz^2 - Bxx*Bzz
+    // No need to check if t0 is square, as per the entangled basis algorithm.
+    fp2_sqrt(&t0);
+    fp2_add(&PQ->x, &Bxz, &t0);
+    fp2_copy(&PQ->z, &Bzz);
+}
+
+// Lifts an x-only basis x(P), x(Q), x(P-Q) to Jacobian points P, Q, assuming the curve is
+// (A/C : 1) and x(P) = (X/Z : 1). Returns the result of ec_recover_y(). Generic version: lift_basis()
+uint32_t
+lift_basis_normalized(jac_point_t *P, jac_point_t *Q, ec_basis_t *B, ec_curve_t *E)
+{
+    assert(fp2_is_one(&B->P.z));
+    assert(fp2_is_one(&E->C));
+
+    fp2_copy(&P->x, &B->P.x);
+    fp2_copy(&Q->x, &B->Q.x);
+    fp2_copy(&Q->z, &B->Q.z);
+    fp2_set_one(&P->z);
+    uint32_t ret = ec_recover_y(&P->y, &P->x, E); // P.y from x(P); ret is passed back to the caller unchanged
+
+    // Algorithm of Okeya-Sakurai to recover y.Q in the Montgomery model
+    fp2_t v1, v2, v3, v4;
+    fp2_mul(&v1, &P->x, &Q->z); // v1 = xP*zQ
+    fp2_add(&v2, &Q->x, &v1); // v2 = xQ + xP*zQ
+    fp2_sub(&v3, &Q->x, &v1); // v3 = xQ - xP*zQ
+    fp2_sqr(&v3, &v3);
+    fp2_mul(&v3, &v3, &B->PmQ.x);
+    fp2_add(&v1, &E->A, &E->A); // v1 = 2A
+    fp2_mul(&v1, &v1, &Q->z); // v1 = 2A*zQ
+    fp2_add(&v2, &v2, &v1);
+    fp2_mul(&v4, &P->x, &Q->x);
+    fp2_add(&v4, &v4, &Q->z); // v4 = xP*xQ + zQ
+    fp2_mul(&v2, &v2, &v4);
+    fp2_mul(&v1, &v1, &Q->z); // v1 = 2A*zQ^2
+    fp2_sub(&v2, &v2, &v1);
+    fp2_mul(&v2, &v2, &B->PmQ.z);
+    fp2_sub(&Q->y, &v3, &v2);
+    fp2_add(&v1, &P->y, &P->y); // v1 = 2*yP
+    fp2_mul(&v1, &v1, &Q->z);
+    fp2_mul(&v1, &v1, &B->PmQ.z); // v1 = 2*yP*zQ*z(P-Q)
+    fp2_mul(&Q->x, &Q->x, &v1); // rescale (x : z) of Q by v1
+    fp2_mul(&Q->z, &Q->z, &v1);
+
+    // Transforming to a Jacobian coordinate: (x : y : z) -> (x*z : y*z^2 : z)
+    fp2_sqr(&v1, &Q->z);
+    fp2_mul(&Q->y, &Q->y, &v1);
+    fp2_mul(&Q->x, &Q->x, &Q->z);
+    return ret;
+}
+
+uint32_t
+lift_basis(jac_point_t *P, jac_point_t *Q, ec_basis_t *B, ec_curve_t *E)
+{
+    // Normalise the curve E such that (A : C) is (A/C : 1)
+    // and the point x(P) = (X/Z : 1).
+    fp2_t inverses[2];
+    fp2_copy(&inverses[0], &B->P.z);
+    fp2_copy(&inverses[1], &E->C);
+    // NOTE(review): fp2_batched_inv presumably inverts both entries in place -- confirm in fp2.h
+    fp2_batched_inv(inverses, 2);
+    fp2_set_one(&B->P.z);
+    fp2_set_one(&E->C);
+    // Both inputs are modified in place: B->P becomes (X/Z : 1) and E becomes (A/C : 1).
+    fp2_mul(&B->P.x, &B->P.x, &inverses[0]);
+    fp2_mul(&E->A, &E->A, &inverses[1]);
+
+    // Lift the basis to Jacobian points P, Q
+    return lift_basis_normalized(P, Q, B, E);
+}
+
+// Given an x-coordinate, determines if this is a valid
+// point on the curve. Assumes C=1.
+static uint32_t
+is_on_curve(const fp2_t *x, const ec_curve_t *curve)
+{
+    assert(fp2_is_one(&curve->C));
+    fp2_t t0;
+    // Evaluate the right-hand side x^3 + A*x^2 + x in Horner form.
+    fp2_add(&t0, x, &curve->A); // x + (A/C)
+    fp2_mul(&t0, &t0, x);       // x^2 + (A/C)*x
+    fp2_add_one(&t0, &t0);      // x^2 + (A/C)*x + 1
+    fp2_mul(&t0, &t0, x);       // x^3 + (A/C)*x^2 + x
+
+    return fp2_is_square(&t0); // the result is fp2_is_square's verdict on the right-hand side
+}
+
+// Helper function which given a point of order k*2^n with n maximal
+// and k odd, computes a point of order 2^f (P is overwritten in place)
+static inline void
+clear_cofactor_for_maximal_even_order(ec_point_t *P, ec_curve_t *curve, int f)
+{
+    // clear out the odd cofactor to get a point of order 2^n
+    ec_mul(P, p_cofactor_for_2f, P_COFACTOR_FOR_2F_BITLENGTH, P, curve);
+
+    // clear the power of two to get a point of order 2^f
+    for (int i = 0; i < TORSION_EVEN_POWER - f; i++) { // TORSION_EVEN_POWER - f doublings; none if f >= TORSION_EVEN_POWER
+        xDBL_A24(P, P, &curve->A24, curve->is_A24_computed_and_normalized);
+    }
+}
+
+// Helper function which finds an NQR -1 / (1 + i*b) for entangled basis generation
+static uint8_t
+find_nqr_factor(fp2_t *x, ec_curve_t *curve, const uint8_t start)
+{
+    // factor = -1/(1 + i*b) for b in Fp will be NQR whenever 1 + b^2 is NQR
+    // in Fp, so we find one of these and then invert (1 + i*b). We store b
+    // as a u8 hint to save time in verification.
+    // On return *x = -A/(1 + i*b); the first candidate tested is b = start.
+    // The hint is returned as a u8, but the search counter n is a uint16_t so
+    // that it can run past the 7-bit hint range without wrapping; when the
+    // search ends with n > 128 the hint is reported as 0 (fallback needed).
+    uint8_t hint;
+    uint32_t found = 0;
+    uint16_t n = start;
+
+    bool qr_b = 1;
+    fp_t b, tmp;
+    fp2_t z, t0, t1;
+
+    do {
+        while (qr_b) {
+            // find b with 1 + b^2 a non-quadratic residue
+            fp_set_small(&tmp, (uint32_t)n * n + 1);
+            qr_b = fp_is_square(&tmp);
+            n++; // keeps track of b = n - 1
+        }
+
+        // for Px := -A/(1 + i*b) to be on the curve
+        // is equivalent to A^2*(z-1) - z^2 NQR for z = 1 + i*b
+        // thus prevents unnecessary inversion pre-check
+
+        // t0 = z - 1 = i*b
+        // t1 = z = 1 + i*b
+        fp_set_small(&b, (uint32_t)n - 1);
+        fp2_set_zero(&t0);
+        fp2_set_one(&z);
+        fp_copy(&z.im, &b);
+        fp_copy(&t0.im, &b);
+
+        // A^2*(z-1) - z^2
+        fp2_sqr(&t1, &curve->A);
+        fp2_mul(&t0, &t0, &t1); // A^2 * (z - 1)
+        fp2_sqr(&t1, &z);
+        fp2_sub(&t0, &t0, &t1); // A^2 * (z - 1) - z^2
+        found = !fp2_is_square(&t0);
+
+        qr_b = 1; // re-arm the inner search in case this b is rejected
+    } while (!found);
+
+    // set Px to -A/(1 + i*b)
+    fp2_copy(x, &z);
+    fp2_inv(x);
+    fp2_mul(x, x, &curve->A);
+    fp2_neg(x, x);
+
+    /*
+     * With very low probability n will not fit in 7 bits.
+     * We set hint = 0 which signals failure and the need
+     * to generate a value on the fly during verification
+     */
+    hint = n <= 128 ? n - 1 : 0; // n - 1 is the accepted b
+
+    return hint;
+}
+
+// Finds the first n >= start with x = n*A on the curve; writes x and returns n as hint (0 if n >= 128)
+static uint8_t
+find_nA_x_coord(fp2_t *x, ec_curve_t *curve, const uint8_t start)
+{
+    assert(!fp2_is_square(&curve->A)); // Only to be called when A is a NQR
+
+    // when A is NQR we allow x(P) to be a multiple n*A of A
+    uint16_t n = start; // 16-bit counter as in find_nqr_factor: an 8-bit one would wrap to 0 after 255
+    if (n == 1) {
+        fp2_copy(x, &curve->A);
+    } else {
+        fp2_mul_small(x, &curve->A, n);
+    }
+
+    while (!is_on_curve(x, curve)) {
+        fp2_add(x, x, &curve->A);
+        n++;
+    }
+
+    /*
+     * With very low probability (1/2^128), n will not fit in 7 bits.
+     * In this case, we set hint = 0 which signals failure and the need
+     * to generate a value on the fly during verification
+     */
+    uint8_t hint = n < 128 ? (uint8_t)n : 0;
+    return hint;
+}
+
+// The entangled basis generation does not allow A = 0
+// so we simply return the one we have already precomputed
+static void
+ec_basis_E0_2f(ec_basis_t *PQ2, ec_curve_t *curve, int f)
+{
+    assert(fp2_is_zero(&curve->A));
+    ec_point_t P, Q;
+
+    // Set P, Q to precomputed (X : 1) values
+    fp2_copy(&P.x, &BASIS_E0_PX);
+    fp2_copy(&Q.x, &BASIS_E0_QX);
+    fp2_set_one(&P.z);
+    fp2_set_one(&Q.z);
+
+    // clear the power of two to get a point of order 2^f
+    for (int i = 0; i < TORSION_EVEN_POWER - f; i++) { // same number of doublings for P and Q
+        xDBL_E0(&P, &P);
+        xDBL_E0(&Q, &Q);
+    }
+
+    // Set P, Q in the basis and compute x(P - Q)
+    copy_point(&PQ2->P, &P);
+    copy_point(&PQ2->Q, &Q);
+    difference_point(&PQ2->PmQ, &P, &Q, curve);
+}
+
+// Computes a basis E[2^f] = <P, Q> where the point Q is above (0 : 0)
+// and returns a one-byte hint for faster recomputation at a later point
+uint8_t
+ec_curve_to_basis_2f_to_hint(ec_basis_t *PQ2, ec_curve_t *curve, int f)
+{
+    // Normalise (A/C : 1) and ((A + 2)/4 : 1)
+    ec_normalize_curve_and_A24(curve);
+
+    if (fp2_is_zero(&curve->A)) {
+        ec_basis_E0_2f(PQ2, curve, f);
+        return 0; // for A = 0 the precomputed basis is used and the hint carries no information
+    }
+
+    uint8_t hint;
+    bool hint_A = fp2_is_square(&curve->A);
+
+    // Compute the points P, Q
+    ec_point_t P, Q;
+
+    if (!hint_A) {
+        // when A is NQR we allow x(P) to be a multiple n*A of A
+        hint = find_nA_x_coord(&P.x, curve, 1);
+    } else {
+        // when A is QR we instead have to find (1 + b^2) a NQR
+        // such that x(P) = -A / (1 + i*b)
+        hint = find_nqr_factor(&P.x, curve, 1);
+    }
+
+    fp2_set_one(&P.z);
+    fp2_add(&Q.x, &curve->A, &P.x);
+    fp2_neg(&Q.x, &Q.x); // x(Q) = -x(P) - A
+    fp2_set_one(&Q.z);
+
+    // clear out the odd cofactor to get a point of order 2^f
+    clear_cofactor_for_maximal_even_order(&P, curve, f);
+    clear_cofactor_for_maximal_even_order(&Q, curve, f);
+
+    // store P - Q as the basis point Q and the old Q as PmQ, to ensure Q above (0,0)
+    difference_point(&PQ2->Q, &P, &Q, curve);
+    copy_point(&PQ2->P, &P);
+    copy_point(&PQ2->PmQ, &Q);
+
+    // Finally, we compress hint_A and hint into a single byte.
+    // We choose to set the LSB of hint to hint_A
+    assert(hint < 128); // We expect hint to be 7-bits in size
+    return (hint << 1) | hint_A;
+}
+
+// Computes a basis E[2^f] = <P, Q> where the point Q is above (0 : 0) from the one-byte
+// hint of ec_curve_to_basis_2f_to_hint. Returns 1; returns 0 only if a debug-only check fails.
+int
+ec_curve_to_basis_2f_from_hint(ec_basis_t *PQ2, ec_curve_t *curve, int f, const uint8_t hint)
+{
+    // Normalise (A/C : 1) and ((A + 2)/4 : 1)
+    ec_normalize_curve_and_A24(curve);
+
+    if (fp2_is_zero(&curve->A)) {
+        ec_basis_E0_2f(PQ2, curve, f);
+        return 1; // the hint is not consulted when A = 0
+    }
+
+    // The LSB of hint encodes whether A is a QR
+    // The remaining 7-bits are used to find a valid x(P)
+    bool hint_A = hint & 1;
+    uint8_t hint_P = hint >> 1;
+
+    // Compute the points P, Q
+    ec_point_t P, Q;
+
+    if (!hint_P) {
+        // When hint_P = 0 it means we did not find a point in 128 attempts
+        // this is very rare and we almost never expect to need this fallback
+        // In either case, we can start with b = 128 to skip testing the known
+        // values which will not work
+        if (!hint_A) {
+            find_nA_x_coord(&P.x, curve, 128);
+        } else {
+            find_nqr_factor(&P.x, curve, 128);
+        }
+    } else {
+        // Otherwise we use the hint to directly find x(P) based on hint_A
+        if (!hint_A) {
+            // when A is NQR, we have found n such that x(P) = n*A
+            fp2_mul_small(&P.x, &curve->A, hint_P);
+        } else {
+            // when A is QR we have found b such that (1 + b^2) is a NQR in
+            // Fp, so we must compute x(P) = -A / (1 + i*b)
+            fp_set_one(&P.x.re);
+            fp_set_small(&P.x.im, hint_P);
+            fp2_inv(&P.x);
+            fp2_mul(&P.x, &P.x, &curve->A);
+            fp2_neg(&P.x, &P.x);
+        }
+    }
+    fp2_set_one(&P.z);
+
+#ifndef NDEBUG
+    int passed = 1; // debug builds only: with NDEBUG defined x(P) is not validated here
+    passed = is_on_curve(&P.x, curve);
+    passed &= !fp2_is_square(&P.x);
+
+    if (!passed)
+        return 0;
+#endif
+
+    // set xQ to -xP - A
+    fp2_add(&Q.x, &curve->A, &P.x);
+    fp2_neg(&Q.x, &Q.x);
+    fp2_set_one(&Q.z);
+
+    // clear out the odd cofactor to get a point of order 2^f
+    clear_cofactor_for_maximal_even_order(&P, curve, f);
+    clear_cofactor_for_maximal_even_order(&Q, curve, f);
+
+    // store P - Q as the basis point Q and the old Q as PmQ, to ensure Q above (0,0)
+    difference_point(&PQ2->Q, &P, &Q, curve);
+    copy_point(&PQ2->P, &P);
+    copy_point(&PQ2->PmQ, &Q);
+
+#ifndef NDEBUG
+    passed &= test_basis_order_twof(PQ2, curve, f); // debug builds only: check the order of the resulting basis
+
+    if (!passed)
+        return 0;
+#endif
+
+    return 1;
+}
--- a/src/ec/ref/lvlx/biextension.c
+++ b/src/ec/ref/lvlx/biextension.c
@@ -0,0 +1,770 @@
+#include <biextension.h>
+#include <assert.h>
+#include <inttypes.h>
+#include <mp.h>
+
+/*
+ * We implement the biextension arithmetic by using the cubical torsor
+ * representation. For now we only implement the 2^e-ladder.
+ *
+ * Warning: cubicalADD is off by a factor x4 with respect to the correct
+ * cubical arithmetic. This does not affect the Weil pairing or the Tate
+ * pairing over F_{p^2} (due to the final exponentiation), but would give
+ * the wrong result if we compute the Tate pairing over F_p.
+ */
+
+// Cubical differential addition: from representatives of P and Q and the
+// value ixPQ attached to their difference, compute a representative of P + Q.
+// This would be exactly like xADD if PQ was 'antinormalised' as (1,z)
+// Cost: 3M + 2S + 3a + 3s
+// Note: if needed, cubicalDBL is simply xDBL_A24 normalized and
+// costs 3M + 2S + 2a + 2s
+
+static void
+cubicalADD(ec_point_t *R, const ec_point_t *P, const ec_point_t *Q, const fp2_t *ixPQ)
+{
+    fp2_t cross_a, cross_b, total, delta;
+
+    // P and Q are fully consumed into temporaries before R is written,
+    // which is what lets callers pass R == P
+    fp2_add(&cross_a, &P->x, &P->z);
+    fp2_sub(&cross_b, &P->x, &P->z);
+    fp2_add(&total, &Q->x, &Q->z);
+    fp2_sub(&delta, &Q->x, &Q->z);
+
+    // cross_a = (XP + ZP)(XQ - ZQ), cross_b = (XP - ZP)(XQ + ZQ)
+    fp2_mul(&cross_a, &cross_a, &delta);
+    fp2_mul(&cross_b, &cross_b, &total);
+
+    fp2_add(&total, &cross_a, &cross_b);
+    fp2_sub(&delta, &cross_a, &cross_b);
+
+    // Z(R) = delta^2, X(R) = ixPQ * total^2
+    fp2_sqr(&R->z, &delta);
+    fp2_sqr(&total, &total);
+    fp2_mul(&R->x, ixPQ, &total);
+}
+
+// Given cubical reps of P, Q and x(P - Q) = (1 : ixPQ)
+// compute P + Q, [2]Q
+// Cost: 6M + 4S + 4a + 4s
+//
+// Outputs: PpQ receives P + Q and QQ receives [2]Q.
+// A24 must be normalised, i.e. have z = 1 (asserted below).
+// Every call site in this file passes PpQ == P and QQ == Q; the statement
+// order below reads each input coordinate before the aliased output
+// coordinate is overwritten, so it must not be rearranged.
+static void
+cubicalDBLADD(ec_point_t *PpQ,
+              ec_point_t *QQ,
+              const ec_point_t *P,
+              const ec_point_t *Q,
+              const fp2_t *ixPQ,
+              const ec_point_t *A24)
+{
+    // A24 = (A+2C/4C: 1)
+    assert(fp2_is_one(&A24->z));
+
+    fp2_t t0, t1, t2, t3;
+
+    // t0 = XP + ZP, t1 = XP - ZP; P is not read again after these two lines
+    fp2_add(&t0, &P->x, &P->z);
+    fp2_sub(&t1, &P->x, &P->z);
+    // PpQ->x is used as scratch for XQ + ZQ, t3 = XQ - ZQ
+    fp2_add(&PpQ->x, &Q->x, &Q->z);
+    fp2_sub(&t3, &Q->x, &Q->z);
+    // t2 = (XQ + ZQ)^2, QQ->z = (XQ - ZQ)^2; Q is not read again after this
+    fp2_sqr(&t2, &PpQ->x);
+    fp2_sqr(&QQ->z, &t3);
+    // Differential addition part: same cross products as cubicalADD
+    fp2_mul(&t0, &t0, &t3);
+    fp2_mul(&t1, &t1, &PpQ->x);
+    fp2_add(&PpQ->x, &t0, &t1);
+    fp2_sub(&t3, &t0, &t1);
+    fp2_sqr(&PpQ->z, &t3);
+    fp2_sqr(&PpQ->x, &PpQ->x);
+    fp2_mul(&PpQ->x, ixPQ, &PpQ->x);
+    // Doubling part: t3 = (XQ + ZQ)^2 - (XQ - ZQ)^2 = 4 XQ ZQ
+    fp2_sub(&t3, &t2, &QQ->z);
+    // X([2]Q) = (XQ + ZQ)^2 (XQ - ZQ)^2
+    fp2_mul(&QQ->x, &t2, &QQ->z);
+    // Z([2]Q) = 4 XQ ZQ * ((XQ - ZQ)^2 + A24 * 4 XQ ZQ)
+    fp2_mul(&t0, &t3, &A24->x);
+    fp2_add(&t0, &t0, &QQ->z);
+    fp2_mul(&QQ->z, &t0, &t3);
+}
+
+// Iterated biextension doubling: starting from the pair (PQ, Q), apply
+// cubicalDBLADD e times. On return nQ holds the e-fold doubling of Q and
+// PnQ the matching translate of PQ by it.
+static void
+biext_ladder_2e(uint32_t e,
+                ec_point_t *PnQ,
+                ec_point_t *nQ,
+                const ec_point_t *PQ,
+                const ec_point_t *Q,
+                const fp2_t *ixP,
+                const ec_point_t *A24)
+{
+    uint32_t remaining = e;
+
+    // Seed the running pair with the inputs
+    copy_point(PnQ, PQ);
+    copy_point(nQ, Q);
+
+    // One double-and-add step per requested doubling, updating in place
+    while (remaining > 0) {
+        cubicalDBLADD(PnQ, nQ, PnQ, nQ, ixP, A24);
+        remaining--;
+    }
+}
+
+// Compute the monodromy ratio X/Z above as a (X:Z) point to avoid a division
+// We implicitly use (1,0) as a cubical point above 0_E
+//
+// Inputs are expected to be the translated ladder outputs: nQ must be the
+// point at infinity and PnQ must equal P projectively (both are only checked
+// by assert, so not in NDEBUG builds).
+// Output: R = (nQ.x * P.x : PnQ.x); no inversion is performed here.
+static void
+point_ratio(ec_point_t *R, const ec_point_t *PnQ, const ec_point_t *nQ, const ec_point_t *P)
+{
+    // Sanity tests
+    assert(ec_is_zero(nQ));
+    assert(ec_is_equal(PnQ, P));
+
+    // Numerator and denominator are stored separately in R.x and R.z
+    fp2_mul(&R->x, &nQ->x, &P->x);
+    fp2_copy(&R->z, &PnQ->x);
+}
+
+// Compute the cubical translation of P by a point of 2-torsion T
+// P is updated in place. Callers in this file also pass P == T, which is
+// safe because every read of T and P happens before P is written at the end.
+static void
+translate(ec_point_t *P, const ec_point_t *T)
+{
+    // When we translate, the following three things can happen:
+    // T = (A : 0) then the translation of P should be P
+    // T = (0 : B) then the translation of P = (X : Z) should be (Z : X)
+    // Otherwise T = (A : B) and P translates to (AX - BZ : BX - AZ)
+    // We compute this in constant time by computing the generic case
+    // and then using constant time swaps.
+    fp2_t PX_new, PZ_new;
+
+    {
+        fp2_t t0, t1;
+
+        // PX_new = AX - BZ
+        fp2_mul(&t0, &T->x, &P->x);
+        fp2_mul(&t1, &T->z, &P->z);
+        fp2_sub(&PX_new, &t0, &t1);
+
+        // PZ_new = BX - AZ
+        fp2_mul(&t0, &T->z, &P->x);
+        fp2_mul(&t1, &T->x, &P->z);
+        fp2_sub(&PZ_new, &t0, &t1);
+    }
+
+    // When we have A zero we should return (Z : X)
+    // (the generic result is replaced by the swapped input coordinates)
+    uint32_t TA_is_zero = fp2_is_zero(&T->x);
+    fp2_select(&PX_new, &PX_new, &P->z, TA_is_zero);
+    fp2_select(&PZ_new, &PZ_new, &P->x, TA_is_zero);
+
+    // When we have B zero we should return (X : Z)
+    // (this selection runs last, so it takes precedence over the one above)
+    uint32_t TB_is_zero = fp2_is_zero(&T->z);
+    fp2_select(&PX_new, &PX_new, &P->x, TB_is_zero);
+    fp2_select(&PZ_new, &PZ_new, &P->z, TB_is_zero);
+
+    // Set the point to the desired result
+    fp2_copy(&P->x, &PX_new);
+    fp2_copy(&P->z, &PZ_new);
+}
+
+// Compute the biextension monodromy of the pair (P, Q) (in level 1) via the
+// cubical arithmetic of P + [2^e]Q.
+// The suffix _i means that the inverse x-coordinates (ixP, ixQ) are read
+// from pairing_data instead of being recomputed. Warning: to get a
+// meaningful result when using the monodromy to compute pairings, P, Q, PQ
+// and A24 need to be normalised (this is not strictly necessary, but care
+// needs to be taken when they are not; only the normalised case is handled
+// for now).
+static void
+monodromy_i(ec_point_t *R, const pairing_params_t *pairing_data, bool swap_PQ)
+{
+    // The Weil pairing needs both P + [2^e]Q and Q + [2^e]P. The flag decides
+    // whether the ladder runs on (Q, ixP) or (P, ixQ), and whether P or Q is
+    // the reference point for the final ratio.
+    const ec_point_t *fixed_src = swap_PQ ? &pairing_data->Q : &pairing_data->P;
+    const ec_point_t *ladder_src = swap_PQ ? &pairing_data->P : &pairing_data->Q;
+    const fp2_t *inv_src = swap_PQ ? &pairing_data->ixQ : &pairing_data->ixP;
+
+    ec_point_t P, Q;
+    fp2_t ixP;
+    copy_point(&P, fixed_src);
+    copy_point(&Q, ladder_src);
+    fp2_copy(&ixP, inv_src);
+
+    // Run the biextension ladder for e - 1 steps
+    ec_point_t PnQ, nQ;
+    biext_ladder_2e(pairing_data->e - 1, &PnQ, &nQ, &pairing_data->PQ, &Q, &ixP, &pairing_data->A24);
+
+    // Translate both ladder outputs by nQ, then form the ratio
+    translate(&PnQ, &nQ);
+    translate(&nQ, &nQ);
+    point_ratio(R, &PnQ, &nQ, &P);
+}
+
+// Normalise P and Q to (X/Z : 1) inside pairing_data and also store the
+// inverse x-coordinates Z/X of both points. A single batched inversion
+// covers all four coordinates.
+static void
+cubical_normalization(pairing_params_t *pairing_data, const ec_point_t *P, const ec_point_t *Q)
+{
+    // inv[] = { 1/PX, 1/PZ, 1/QX, 1/QZ } after the batched inversion
+    fp2_t inv[4];
+    fp2_copy(&inv[0], &P->x);
+    fp2_copy(&inv[1], &P->z);
+    fp2_copy(&inv[2], &Q->x);
+    fp2_copy(&inv[3], &Q->z);
+    fp2_batched_inv(inv, 4);
+
+    // P: store PZ / PX, then x(P) normalised to (PX / PZ : 1)
+    fp2_mul(&pairing_data->ixP, &P->z, &inv[0]);
+    fp2_mul(&pairing_data->P.x, &P->x, &inv[1]);
+    fp2_set_one(&pairing_data->P.z);
+
+    // Q: store QZ / QX, then x(Q) normalised to (QX / QZ : 1)
+    fp2_mul(&pairing_data->ixQ, &Q->z, &inv[2]);
+    fp2_mul(&pairing_data->Q.x, &Q->x, &inv[3]);
+    fp2_set_one(&pairing_data->Q.z);
+}
+
+// Weil pairing from prepared pairing data (PQ should be P+Q in (X:Z)
+// coordinates). The points are assumed to be normalised correctly.
+// The result is the quotient of the two monodromies, written to r.
+static void
+weil_n(fp2_t *r, const pairing_params_t *pairing_data)
+{
+    ec_point_t mono_swapped, mono_direct;
+    monodromy_i(&mono_swapped, pairing_data, true);
+    monodromy_i(&mono_direct, pairing_data, false);
+
+    // r = (swapped.z * direct.x) / (swapped.x * direct.z), one inversion
+    fp2_t quotient;
+    fp2_mul(&quotient, &mono_swapped.x, &mono_direct.z);
+    fp2_inv(&quotient);
+    fp2_mul(&quotient, &quotient, &mono_swapped.z);
+    fp2_mul(r, &quotient, &mono_direct.x);
+}
+
+// Weil pairing of P and Q, where PQ should be P+Q in (X:Z) coordinates
+// Normalises the inputs and delegates to weil_n
+// The code will crash (division by 0) if either P or Q is (0:1)
+// Side effect: E gets its A24 computed and normalised if it was not already
+void
+weil(fp2_t *r, uint32_t e, const ec_point_t *P, const ec_point_t *Q, const ec_point_t *PQ, ec_curve_t *E)
+{
+    pairing_params_t params;
+
+    // Make sure the input curve carries a normalised A24 and keep a copy
+    ec_curve_normalize_A24(E);
+    copy_point(&params.A24, &E->A24);
+
+    // Fill in (PX/PZ : 1), (QX/QZ : 1), PZ/PX and QZ/QX, then the sum
+    // point and the exponent
+    cubical_normalization(&params, P, Q);
+    copy_point(&params.PQ, PQ);
+    params.e = e;
+
+    // Compute the Weil pairing e_(2^e)(P, Q)
+    weil_n(r, &params);
+}
+
+// two helper functions for reducing the tate pairing
+// clear_cofac clears (p + 1) // 2^f for an Fp2 value, i.e. it computes
+// r = a^c where c is the (single-word) value *p_cofactor_for_2f.
+//
+// The exponentiation is done right-to-left (square the base, multiply it in
+// for every set bit), which is correct for every exponent. Scanning the bits
+// from the least significant end with a left-to-right square-and-multiply
+// rule would instead raise to the bit-reversal of c, which only agrees with
+// c when its binary expansion is a palindrome.
+// r may alias a. The exponent is a public constant, so branching on its
+// bits does not depend on secret data.
+void
+clear_cofac(fp2_t *r, const fp2_t *a)
+{
+    digit_t exp = *p_cofactor_for_2f;
+
+    fp2_t base, acc;
+    fp2_copy(&base, a); // private copy so that r == a is allowed
+    fp2_set_one(&acc);
+
+    // removes cofac
+    while (exp > 0) {
+        if (exp & 1) {
+            fp2_mul(&acc, &acc, &base);
+        }
+        exp >>= 1;
+        // skip the final squaring, its result would never be used
+        if (exp > 0) {
+            fp2_sqr(&base, &base);
+        }
+    }
+
+    fp2_copy(r, &acc);
+}
+
+// Frobenius on Fp2: maps a + ib to a - ib (real part copied, imaginary
+// part negated)
+void
+fp2_frob(fp2_t *out, const fp2_t *in)
+{
+    fp_copy(&out->re, &in->re);
+    fp_neg(&out->im, &in->im);
+}
+
+// reduced Tate pairing, normalizes the points, assumes PQ is P+Q in (X:Z)
+// coordinates. Computes 1/x(P) and 1/x(Q) for efficient cubical ladder
+// e is the 2-power order used for the ladder; the result is additionally
+// squared (TORSION_EVEN_POWER - e) times at the end.
+// Side effect: E gets its A24 computed and normalised if it was not already.
+// NOTE(review): e_diff is unsigned, so e > TORSION_EVEN_POWER would wrap
+// around - presumably callers guarantee e <= TORSION_EVEN_POWER; confirm.
+void
+reduced_tate(fp2_t *r, uint32_t e, const ec_point_t *P, const ec_point_t *Q, const ec_point_t *PQ, ec_curve_t *E)
+{
+    uint32_t e_full = TORSION_EVEN_POWER;
+    uint32_t e_diff = e_full - e;
+    ec_point_t R;
+    pairing_params_t pairing_data;
+
+    // Construct the structure for the Tate pairing
+    // Set (PX/PZ : 1), (QX/QZ : 1), PZ/PX and QZ/QX
+    pairing_data.e = e;
+    cubical_normalization(&pairing_data, P, Q);
+    copy_point(&pairing_data.PQ, PQ);
+
+    // Ensure the input curve has A24 normalised and store
+    // in a struct
+    ec_curve_normalize_A24(E);
+    copy_point(&pairing_data.A24, &E->A24);
+
+    // Only one monodromy is needed here (the swapped one)
+    monodromy_i(&R, &pairing_data, true);
+
+    // we get unreduced tate as R.X, R.Z
+    // reduced tate is -(R.Z/R.X)^((p^2 - 1) div 2^f)
+    //  we reuse R.X and R.Z to split reduction step ^(p-1) into frobenius and ^-1
+    // tmp keeps the original R.x because R.x is overwritten before it is
+    // needed for the new R.z
+    fp2_t frob, tmp;
+    fp2_copy(&tmp, &R.x);
+    fp2_frob(&frob, &R.x);
+    fp2_mul(&R.x, &R.z, &frob);
+    fp2_frob(&frob, &R.z);
+    fp2_mul(&R.z, &tmp, &frob);
+    fp2_inv(&R.x);
+    fp2_mul(r, &R.x, &R.z);
+
+    // remaining part of the final exponentiation: the cofactor ...
+    clear_cofac(r, r);
+    // clear remaining 2^e_diff
+    for (uint32_t j = 0; j < e_diff; j++) {
+        fp2_sqr(r, r);
+    }
+}
+
+// Functions to compute discrete logs by computing the Weil pairing of points
+// followed by computing the dlog in Fp^2
+// (If we work with full order points, it would be faster to use the Tate
+// pairings rather than the Weil pairings; this is not implemented yet)
+
+// recursive dlog function
+//
+// Divide-and-conquer discrete logarithm for an exponent of `len` bits.
+// pows_f / pows_g are stacks of length stacklen; the top entries
+// (index stacklen - 1) are the pair currently being solved. Whenever a bit
+// is resolved at a leaf, all lower stack entries are updated so that they
+// stay consistent with the bits found so far.
+// The len-bit result is written to a (NWORDS_ORDER words).
+// Returns false if a leaf value is neither 1 nor the expected generator
+// power, i.e. when no logarithm exists for the given inputs.
+static bool
+fp2_dlog_2e_rec(digit_t *a, long len, fp2_t *pows_f, fp2_t *pows_g, long stacklen)
+{
+    if (len == 0) {
+        // *a = 0;
+        for (int i = 0; i < NWORDS_ORDER; i++) {
+            a[i] = 0;
+        }
+        return true;
+    } else if (len == 1) {
+        if (fp2_is_one(&pows_f[stacklen - 1])) {
+            // a = 0;
+            for (int i = 0; i < NWORDS_ORDER; i++) {
+                a[i] = 0;
+            }
+            for (long i = 0; i < stacklen - 1; ++i) {
+                fp2_sqr(&pows_g[i], &pows_g[i]); // new_g = g^2
+            }
+            return true;
+        } else if (fp2_is_equal(&pows_f[stacklen - 1], &pows_g[stacklen - 1])) {
+            // a = 1;
+            a[0] = 1;
+            for (int i = 1; i < NWORDS_ORDER; i++) {
+                a[i] = 0;
+            }
+            for (long i = 0; i < stacklen - 1; ++i) {
+                fp2_mul(&pows_f[i], &pows_f[i], &pows_g[i]); // new_f = f*g
+                fp2_sqr(&pows_g[i], &pows_g[i]);             // new_g = g^2
+            }
+            return true;
+        } else {
+            return false;
+        }
+    } else {
+        // Split the exponent into a low part of `right` bits and a high
+        // part of `left` bits. Plain integer division is used so that no
+        // floating-point arithmetic is involved; it truncates exactly like
+        // the conversion of len * 0.5 back to an integer.
+        long right = len / 2;
+        long left = len - right;
+        // Push a new stack entry and raise it to the power 2^left, which
+        // leaves only the low `right` bits of the exponent in it
+        pows_f[stacklen] = pows_f[stacklen - 1];
+        pows_g[stacklen] = pows_g[stacklen - 1];
+        for (long i = 0; i < left; i++) {
+            fp2_sqr(&pows_f[stacklen], &pows_f[stacklen]);
+            fp2_sqr(&pows_g[stacklen], &pows_g[stacklen]);
+        }
+        // uint32_t dlp1 = 0, dlp2 = 0;
+        digit_t dlp1[NWORDS_ORDER], dlp2[NWORDS_ORDER];
+        bool ok;
+        // Low bits first: solving them also updates the entries below on
+        // the stack, which the second call then works on
+        ok = fp2_dlog_2e_rec(dlp1, right, pows_f, pows_g, stacklen + 1);
+        if (!ok)
+            return false;
+        ok = fp2_dlog_2e_rec(dlp2, left, pows_f, pows_g, stacklen);
+        if (!ok)
+            return false;
+        // a = dlp1 + 2^right * dlp2
+        multiple_mp_shiftl(dlp2, right, NWORDS_ORDER);
+        mp_add(a, dlp2, dlp1, NWORDS_ORDER);
+
+        return true;
+    }
+}
+
+// Solve a discrete logarithm with an e-bit exponent: compute scal such that
+// f = g^scal, taking f and 1/g as input.
+// Returns the success flag of the recursive solver (also asserted).
+static bool
+fp2_dlog_2e(digit_t *scal, const fp2_t *f, const fp2_t *g_inverse, int e)
+{
+    // Stack size for the recursion: one slot plus one more for every time
+    // e can be halved while staying above 1
+    long depth = 1;
+    for (long rem = e; rem > 1; rem >>= 1) {
+        depth++;
+    }
+
+    // Slot 0 of each stack holds the inputs; deeper slots are filled in by
+    // the recursion
+    fp2_t pows_f[depth], pows_g[depth];
+    pows_f[0] = *f;
+    pows_g[0] = *g_inverse;
+
+    // Start from a cleared result
+    for (int w = 0; w < NWORDS_ORDER; w++) {
+        scal[w] = 0;
+    }
+
+    bool solved = fp2_dlog_2e_rec(scal, e, pows_f, pows_g, 1);
+    assert(solved);
+
+    return solved;
+}
+
+// Normalize the bases (P, Q), (R, S) and store their inverse
+// and additionally normalise the curve to (A/C : 1)
+//
+// Works in place on pairing_dlog_data->PQ, pairing_dlog_data->RS and curve.
+// One batched inversion handles all eleven values. RS.PmQ is not touched.
+static void
+cubical_normalization_dlog(pairing_dlog_params_t *pairing_dlog_data, ec_curve_t *curve)
+{
+    fp2_t t[11];
+    ec_basis_t *PQ = &pairing_dlog_data->PQ;
+    ec_basis_t *RS = &pairing_dlog_data->RS;
+    // Gather every value to invert: x and z of P, Q, P-Q, R, S and curve C
+    fp2_copy(&t[0], &PQ->P.x);
+    fp2_copy(&t[1], &PQ->P.z);
+    fp2_copy(&t[2], &PQ->Q.x);
+    fp2_copy(&t[3], &PQ->Q.z);
+    fp2_copy(&t[4], &PQ->PmQ.x);
+    fp2_copy(&t[5], &PQ->PmQ.z);
+    fp2_copy(&t[6], &RS->P.x);
+    fp2_copy(&t[7], &RS->P.z);
+    fp2_copy(&t[8], &RS->Q.x);
+    fp2_copy(&t[9], &RS->Q.z);
+    fp2_copy(&t[10], &curve->C);
+
+    fp2_batched_inv(t, 11);
+
+    // For each point: store Z/X first (it still needs the old z), then
+    // rescale x by 1/Z and set z to one
+    fp2_mul(&pairing_dlog_data->ixP, &PQ->P.z, &t[0]);
+    fp2_mul(&PQ->P.x, &PQ->P.x, &t[1]);
+    fp2_set_one(&PQ->P.z);
+
+    fp2_mul(&pairing_dlog_data->ixQ, &PQ->Q.z, &t[2]);
+    fp2_mul(&PQ->Q.x, &PQ->Q.x, &t[3]);
+    fp2_set_one(&PQ->Q.z);
+
+    // P - Q is only normalised; its inverse x (t[4]) is not stored
+    fp2_mul(&PQ->PmQ.x, &PQ->PmQ.x, &t[5]);
+    fp2_set_one(&PQ->PmQ.z);
+
+    fp2_mul(&pairing_dlog_data->ixR, &RS->P.z, &t[6]);
+    fp2_mul(&RS->P.x, &RS->P.x, &t[7]);
+    fp2_set_one(&RS->P.z);
+
+    fp2_mul(&pairing_dlog_data->ixS, &RS->Q.z, &t[8]);
+    fp2_mul(&RS->Q.x, &RS->Q.x, &t[9]);
+    fp2_set_one(&RS->Q.z);
+
+    // Curve coefficient becomes (A/C : 1)
+    fp2_mul(&curve->A, &curve->A, &t[10]);
+    fp2_set_one(&curve->C);
+}
+
+// Given two bases <P, Q> and <R, S> (read from pairing_dlog_data) compute
+// x(P - R), x(P - S), x(R - Q), x(S - Q)
+// and store them in pairing_dlog_data->diff.
+static void
+compute_difference_points(pairing_dlog_params_t *pairing_dlog_data, ec_curve_t *curve)
+{
+    jac_point_t xyP, xyQ, xyR, xyS, temp;
+
+    // lifting the two basis points, assumes that x(P) and x(R)
+    // and the curve itself are normalised to (X : 1)
+    lift_basis_normalized(&xyP, &xyQ, &pairing_dlog_data->PQ, curve);
+    lift_basis_normalized(&xyR, &xyS, &pairing_dlog_data->RS, curve);
+
+    // computation of the differences
+    // each one is formed as (-B) + A on the lifted points and then
+    // projected back to (X : Z)
+    // x(P - R)
+    jac_neg(&temp, &xyR);
+    ADD(&temp, &temp, &xyP, curve);
+    jac_to_xz(&pairing_dlog_data->diff.PmR, &temp);
+
+    // x(P - S)
+    jac_neg(&temp, &xyS);
+    ADD(&temp, &temp, &xyP, curve);
+    jac_to_xz(&pairing_dlog_data->diff.PmS, &temp);
+
+    // x(R - Q)
+    jac_neg(&temp, &xyQ);
+    ADD(&temp, &temp, &xyR, curve);
+    jac_to_xz(&pairing_dlog_data->diff.RmQ, &temp);
+
+    // x(S - Q)
+    jac_neg(&temp, &xyQ);
+    ADD(&temp, &temp, &xyS, curve);
+    jac_to_xz(&pairing_dlog_data->diff.SmQ, &temp);
+}
+
+// Inline all the Weil pairing computations needed for ec_dlog_2_weil
+//
+// Runs the cubical ladders for the five pairings e(P,Q), e(P,R), e(R,Q),
+// e(P,S), e(S,Q) in one interleaved loop, normalises all five values with a
+// single batched inversion, and then solves four discrete logarithms with
+// respect to the first one. Outputs r1, r2, s1, s2 are multi-word scalars.
+// The return values of fp2_dlog_2e are not checked here.
+static void
+weil_dlog(digit_t *r1, digit_t *r2, digit_t *s1, digit_t *s2, pairing_dlog_params_t *pairing_dlog_data)
+{
+
+    ec_point_t nP, nQ, nR, nS, nPQ, PnQ, nPR, PnR, nPS, PnS, nRQ, RnQ, nSQ, SnQ;
+
+    // nX starts as X and is doubled once per loop iteration; the two-letter
+    // points start from the corresponding difference point
+    copy_point(&nP, &pairing_dlog_data->PQ.P);
+    copy_point(&nQ, &pairing_dlog_data->PQ.Q);
+    copy_point(&nR, &pairing_dlog_data->RS.P);
+    copy_point(&nS, &pairing_dlog_data->RS.Q);
+    copy_point(&nPQ, &pairing_dlog_data->PQ.PmQ);
+    copy_point(&PnQ, &pairing_dlog_data->PQ.PmQ);
+    copy_point(&nPR, &pairing_dlog_data->diff.PmR);
+    copy_point(&nPS, &pairing_dlog_data->diff.PmS);
+    copy_point(&PnR, &pairing_dlog_data->diff.PmR);
+    copy_point(&PnS, &pairing_dlog_data->diff.PmS);
+    copy_point(&nRQ, &pairing_dlog_data->diff.RmQ);
+    copy_point(&nSQ, &pairing_dlog_data->diff.SmQ);
+    copy_point(&RnQ, &pairing_dlog_data->diff.RmQ);
+    copy_point(&SnQ, &pairing_dlog_data->diff.SmQ);
+
+    // Within each group the cubicalADD calls must come before the
+    // cubicalDBLADD call: the latter is the one that doubles the shared
+    // point (nP, nQ, nR or nS) in place
+    for (uint32_t i = 0; i < pairing_dlog_data->e - 1; i++) {
+        cubicalADD(&nPQ, &nPQ, &nP, &pairing_dlog_data->ixQ);
+        cubicalADD(&nPR, &nPR, &nP, &pairing_dlog_data->ixR);
+        cubicalDBLADD(&nPS, &nP, &nPS, &nP, &pairing_dlog_data->ixS, &pairing_dlog_data->A24);
+
+        cubicalADD(&PnQ, &PnQ, &nQ, &pairing_dlog_data->ixP);
+        cubicalADD(&RnQ, &RnQ, &nQ, &pairing_dlog_data->ixR);
+        cubicalDBLADD(&SnQ, &nQ, &SnQ, &nQ, &pairing_dlog_data->ixS, &pairing_dlog_data->A24);
+
+        cubicalADD(&PnR, &PnR, &nR, &pairing_dlog_data->ixP);
+        cubicalDBLADD(&nRQ, &nR, &nRQ, &nR, &pairing_dlog_data->ixQ, &pairing_dlog_data->A24);
+
+        cubicalADD(&PnS, &PnS, &nS, &pairing_dlog_data->ixP);
+        cubicalDBLADD(&nSQ, &nS, &nSQ, &nS, &pairing_dlog_data->ixQ, &pairing_dlog_data->A24);
+    }
+
+    // weil(&w0,e,&PQ->P,&PQ->Q,&PQ->PmQ,&A24);
+    // Translate every ladder output by the doubled point it was built from;
+    // the doubled points themselves are translated last because they serve
+    // as the translation argument above
+    translate(&nPQ, &nP);
+    translate(&nPR, &nP);
+    translate(&nPS, &nP);
+    translate(&PnQ, &nQ);
+    translate(&RnQ, &nQ);
+    translate(&SnQ, &nQ);
+    translate(&PnR, &nR);
+    translate(&nRQ, &nR);
+    translate(&PnS, &nS);
+    translate(&nSQ, &nS);
+
+    translate(&nP, &nP);
+    translate(&nQ, &nQ);
+    translate(&nR, &nR);
+    translate(&nS, &nS);
+
+    // computation of the reference weil pairing
+    // w1[i] collects the denominators and w2[i] the numerators; they are
+    // combined after the batched inversion below
+    ec_point_t T0, T1;
+    fp2_t w1[5], w2[5];
+
+    // e(P, Q) = w0
+    point_ratio(&T0, &nPQ, &nP, &pairing_dlog_data->PQ.Q);
+    point_ratio(&T1, &PnQ, &nQ, &pairing_dlog_data->PQ.P);
+    // For the first element we need its inverse for
+    // fp2_dlog_2e so we swap w1 and w2 here to save inversions
+    fp2_mul(&w2[0], &T0.x, &T1.z);
+    fp2_mul(&w1[0], &T1.x, &T0.z);
+
+    // e(P,R) = w0^r2
+    point_ratio(&T0, &nPR, &nP, &pairing_dlog_data->RS.P);
+    point_ratio(&T1, &PnR, &nR, &pairing_dlog_data->PQ.P);
+    fp2_mul(&w1[1], &T0.x, &T1.z);
+    fp2_mul(&w2[1], &T1.x, &T0.z);
+
+    // e(R,Q) = w0^r1
+    point_ratio(&T0, &nRQ, &nR, &pairing_dlog_data->PQ.Q);
+    point_ratio(&T1, &RnQ, &nQ, &pairing_dlog_data->RS.P);
+    fp2_mul(&w1[2], &T0.x, &T1.z);
+    fp2_mul(&w2[2], &T1.x, &T0.z);
+
+    // e(P,S) = w0^s2
+    point_ratio(&T0, &nPS, &nP, &pairing_dlog_data->RS.Q);
+    point_ratio(&T1, &PnS, &nS, &pairing_dlog_data->PQ.P);
+    fp2_mul(&w1[3], &T0.x, &T1.z);
+    fp2_mul(&w2[3], &T1.x, &T0.z);
+
+    // e(S,Q) = w0^s1
+    point_ratio(&T0, &nSQ, &nS, &pairing_dlog_data->PQ.Q);
+    point_ratio(&T1, &SnQ, &nQ, &pairing_dlog_data->RS.Q);
+    fp2_mul(&w1[4], &T0.x, &T1.z);
+    fp2_mul(&w2[4], &T1.x, &T0.z);
+
+    // One inversion for all five values: w1[i] = w2[i] / w1[i]
+    fp2_batched_inv(w1, 5);
+    for (int i = 0; i < 5; i++) {
+        fp2_mul(&w1[i], &w1[i], &w2[i]);
+    }
+
+    // w1[0] is passed as the inverted base of each logarithm
+    fp2_dlog_2e(r2, &w1[1], &w1[0], pairing_dlog_data->e);
+    fp2_dlog_2e(r1, &w1[2], &w1[0], pairing_dlog_data->e);
+    fp2_dlog_2e(s2, &w1[3], &w1[0], pairing_dlog_data->e);
+    fp2_dlog_2e(s1, &w1[4], &w1[0], pairing_dlog_data->e);
+}
+
+// Express the basis RS in terms of the basis PQ using Weil pairings:
+// computes scalars with R = [r1]P + [r2]Q and S = [s1]P + [s2]Q (this is
+// what the debug-only check at the end verifies).
+// Q of the basis PQ is asserted to have order 2^e.
+// Side effects: curve gets its A24 computed/normalised and is rescaled to
+// (A/C : 1). PQ and RS are copied into a local struct before being
+// normalised, so the caller's bases are left untouched by this function.
+void
+ec_dlog_2_weil(digit_t *r1,
+               digit_t *r2,
+               digit_t *s1,
+               digit_t *s2,
+               ec_basis_t *PQ,
+               const ec_basis_t *RS,
+               ec_curve_t *curve,
+               int e)
+{
+    assert(test_point_order_twof(&PQ->Q, curve, e));
+
+    // precomputing the correct curve data
+    ec_curve_normalize_A24(curve);
+
+    // Work on private copies of both bases and of A24
+    pairing_dlog_params_t pairing_dlog_data;
+    pairing_dlog_data.e = e;
+    pairing_dlog_data.PQ = *PQ;
+    pairing_dlog_data.RS = *RS;
+    pairing_dlog_data.A24 = curve->A24;
+
+    cubical_normalization_dlog(&pairing_dlog_data, curve);
+    compute_difference_points(&pairing_dlog_data, curve);
+
+    weil_dlog(r1, r2, s1, s2, &pairing_dlog_data);
+
+#ifndef NDEBUG
+    // Debug builds recombine the scalars and compare against RS
+    ec_point_t test;
+    ec_biscalar_mul(&test, r1, r2, e, PQ, curve);
+    // R = [r1]P + [r2]Q
+    assert(ec_is_equal(&test, &RS->P));
+    ec_biscalar_mul(&test, s1, s2, e, PQ, curve);
+    // S = [s1]P + [s2]Q
+    assert(ec_is_equal(&test, &RS->Q));
+#endif
+}
+
+// Inline all the Tate pairing computations needed for ec_dlog_2_tate
+// including reduction, assumes a basis PQ of full E[2^e_full] torsion
+// and a basis RS of smaller E[2^e] torsion
+//
+// e_full is TORSION_EVEN_POWER and e is pairing_dlog_data->e. Outputs r1,
+// r2, s1, s2 are multi-word scalars. The return values of fp2_dlog_2e are
+// not checked here.
+static void
+tate_dlog_partial(digit_t *r1, digit_t *r2, digit_t *s1, digit_t *s2, pairing_dlog_params_t *pairing_dlog_data)
+{
+
+    uint32_t e_full = TORSION_EVEN_POWER;
+    uint32_t e_diff = e_full - pairing_dlog_data->e;
+
+    ec_point_t nP, nQ, nR, nS, nPQ, PnR, PnS, nRQ, nSQ;
+
+    copy_point(&nP, &pairing_dlog_data->PQ.P);
+    copy_point(&nQ, &pairing_dlog_data->PQ.Q);
+    copy_point(&nR, &pairing_dlog_data->RS.P);
+    copy_point(&nS, &pairing_dlog_data->RS.Q);
+    copy_point(&nPQ, &pairing_dlog_data->PQ.PmQ);
+    copy_point(&PnR, &pairing_dlog_data->diff.PmR);
+    copy_point(&PnS, &pairing_dlog_data->diff.PmS);
+    copy_point(&nRQ, &pairing_dlog_data->diff.RmQ);
+    copy_point(&nSQ, &pairing_dlog_data->diff.SmQ);
+
+    // Reference pairing on the full-order basis: e_full - 1 ladder steps
+    for (uint32_t i = 0; i < e_full - 1; i++) {
+        cubicalDBLADD(&nPQ, &nP, &nPQ, &nP, &pairing_dlog_data->ixQ, &pairing_dlog_data->A24);
+    }
+
+    // Pairings involving R and S: only e - 1 steps. In each pair the
+    // cubicalADD must precede the cubicalDBLADD, which doubles nR / nS
+    for (uint32_t i = 0; i < pairing_dlog_data->e - 1; i++) {
+        cubicalADD(&PnR, &PnR, &nR, &pairing_dlog_data->ixP);
+        cubicalDBLADD(&nRQ, &nR, &nRQ, &nR, &pairing_dlog_data->ixQ, &pairing_dlog_data->A24);
+
+        cubicalADD(&PnS, &PnS, &nS, &pairing_dlog_data->ixP);
+        cubicalDBLADD(&nSQ, &nS, &nSQ, &nS, &pairing_dlog_data->ixQ, &pairing_dlog_data->A24);
+    }
+
+    translate(&nPQ, &nP);
+    translate(&PnR, &nR);
+    translate(&nRQ, &nR);
+    translate(&PnS, &nS);
+    translate(&nSQ, &nS);
+
+    translate(&nP, &nP);
+    translate(&nQ, &nQ);
+    translate(&nR, &nR);
+    translate(&nS, &nS);
+
+    // computation of the reference Tate pairing
+    // Each value is kept as a pair (w1[i], w2[i]) until the batched
+    // inversion below
+    ec_point_t T0;
+    fp2_t w1[5], w2[5];
+
+    // t(P, Q)^(2^e_diff) = w0
+    point_ratio(&T0, &nPQ, &nP, &pairing_dlog_data->PQ.Q);
+    fp2_copy(&w1[0], &T0.x);
+    fp2_copy(&w2[0], &T0.z);
+
+    // t(R,P) = w0^r2
+    point_ratio(&T0, &PnR, &nR, &pairing_dlog_data->PQ.P);
+    fp2_copy(&w1[1], &T0.x);
+    fp2_copy(&w2[1], &T0.z);
+
+    // t(R,Q) = w0^r1
+    // NOTE(review): x and z go to w2 and w1 here, the opposite of the
+    // pairings with P - presumably this inverts the value on purpose; confirm
+    point_ratio(&T0, &nRQ, &nR, &pairing_dlog_data->PQ.Q);
+    fp2_copy(&w2[2], &T0.x);
+    fp2_copy(&w1[2], &T0.z);
+
+    // t(S,P) = w0^s2
+    point_ratio(&T0, &PnS, &nS, &pairing_dlog_data->PQ.P);
+    fp2_copy(&w1[3], &T0.x);
+    fp2_copy(&w2[3], &T0.z);
+
+    // t(S,Q) = w0^s1
+    // (same swapped roles of w1 / w2 as for t(R,Q))
+    point_ratio(&T0, &nSQ, &nS, &pairing_dlog_data->PQ.Q);
+    fp2_copy(&w2[4], &T0.x);
+    fp2_copy(&w1[4], &T0.z);
+
+    // batched reduction using projective representation
+    for (int i = 0; i < 5; i++) {
+        fp2_t frob, tmp;
+        // tmp keeps the original w1[i], which is overwritten below
+        fp2_copy(&tmp, &w1[i]);
+        // inline frobenius for ^p
+        // multiply by inverse to get ^(p-1)
+        fp2_frob(&frob, &w1[i]);
+        fp2_mul(&w1[i], &w2[i], &frob);
+
+        // repeat for denom
+        fp2_frob(&frob, &w2[i]);
+        fp2_mul(&w2[i], &tmp, &frob);
+    }
+
+    // batched normalization
+    fp2_batched_inv(w2, 5);
+    for (int i = 0; i < 5; i++) {
+        fp2_mul(&w1[i], &w1[i], &w2[i]);
+    }
+
+    for (int i = 0; i < 5; i++) {
+        clear_cofac(&w1[i], &w1[i]);
+
+        // removes 2^e_diff
+        for (uint32_t j = 0; j < e_diff; j++) {
+            fp2_sqr(&w1[i], &w1[i]);
+        }
+    }
+
+    // w1[0] is passed as the inverted base of each logarithm
+    fp2_dlog_2e(r2, &w1[1], &w1[0], pairing_dlog_data->e);
+    fp2_dlog_2e(r1, &w1[2], &w1[0], pairing_dlog_data->e);
+    fp2_dlog_2e(s2, &w1[3], &w1[0], pairing_dlog_data->e);
+    fp2_dlog_2e(s1, &w1[4], &w1[0], pairing_dlog_data->e);
+}
+
+// Express the basis RS of the 2^e-torsion in terms of a full-order basis PQ
+// using Tate pairings. As verified by the debug-only check at the end, the
+// outputs satisfy R = [2^e_diff]([r1]P + [r2]Q) and
+// S = [2^e_diff]([s1]P + [s2]Q) with e_diff = TORSION_EVEN_POWER - e.
+// Side effects: curve gets its A24 computed/normalised and is rescaled to
+// (A/C : 1).
+void
+ec_dlog_2_tate(digit_t *r1,
+               digit_t *r2,
+               digit_t *s1,
+               digit_t *s2,
+               const ec_basis_t *PQ,
+               const ec_basis_t *RS,
+               ec_curve_t *curve,
+               int e)
+{
+    // assume PQ is a full torsion basis
+    // returns a, b, c, d such that R = [a]P + [b]Q, S = [c]P + [d]Q
+
+    // e_full / e_diff are only needed by the assertions, hence the guard
+#ifndef NDEBUG
+    int e_full = TORSION_EVEN_POWER;
+    int e_diff = e_full - e;
+#endif
+    assert(test_basis_order_twof(PQ, curve, e_full));
+
+    // precomputing the correct curve data
+    ec_curve_normalize_A24(curve);
+
+    // Work on private copies of both bases and of A24
+    pairing_dlog_params_t pairing_dlog_data;
+    pairing_dlog_data.e = e;
+    pairing_dlog_data.PQ = *PQ;
+    pairing_dlog_data.RS = *RS;
+    pairing_dlog_data.A24 = curve->A24;
+
+    cubical_normalization_dlog(&pairing_dlog_data, curve);
+    compute_difference_points(&pairing_dlog_data, curve);
+    tate_dlog_partial(r1, r2, s1, s2, &pairing_dlog_data);
+
+#ifndef NDEBUG
+    // Debug builds recombine the scalars, scale by 2^e_diff and compare
+    ec_point_t test;
+    ec_biscalar_mul(&test, r1, r2, e, PQ, curve);
+    ec_dbl_iter(&test, e_diff, &test, curve);
+    // R = [r1]P + [r2]Q
+    assert(ec_is_equal(&test, &RS->P));
+
+    ec_biscalar_mul(&test, s1, s2, e, PQ, curve);
+    ec_dbl_iter(&test, e_diff, &test, curve);
+    // S = [s1]P + [s2]Q
+    assert(ec_is_equal(&test, &RS->Q));
+#endif
+}
--- a/src/ec/ref/lvlx/ec.c
+++ b/src/ec/ref/lvlx/ec.c
@@ -0,0 +1,665 @@
+#include <assert.h>
+#include <stdio.h>
+#include <mp.h>
+#include <ec.h>
+
+void
+ec_point_init(ec_point_t *P)
+{ // Set P to the identity element, represented as (X : Z) = (1 : 0)
+    fp2_set_zero(&P->z);
+    fp2_set_one(&P->x);
+}
+
+void
+ec_curve_init(ec_curve_t *E)
+{ // Reset the curve struct to its default state
+    // Nothing has been cached yet, so the A24 flag starts out false
+    E->is_A24_computed_and_normalized = false;
+
+    // Curve constants (A : C) = (0 : 1)
+    fp2_set_zero(&E->A);
+    fp2_set_one(&E->C);
+
+    // The A24 slot gets the default point value until it is computed
+    ec_point_init(&E->A24);
+}
+
+void
+select_point(ec_point_t *Q, const ec_point_t *P1, const ec_point_t *P2, const digit_t option)
+{ // Constant-time point selection, done coordinate by coordinate
+  // option = 0 gives Q <- P1, option = 0xFF...FF gives Q <- P2
+    fp2_select(&Q->z, &P1->z, &P2->z, option);
+    fp2_select(&Q->x, &P1->x, &P2->x, option);
+}
+
+void
+cswap_points(ec_point_t *P, ec_point_t *Q, const digit_t option)
+{ // Constant-time conditional swap, done coordinate by coordinate
+  // option = 0 leaves P and Q unchanged, option = 0xFF...FF exchanges them
+    fp2_cswap(&P->z, &Q->z, option);
+    fp2_cswap(&P->x, &Q->x, option);
+}
+
+void
+ec_normalize_point(ec_point_t *P)
+{ // Rescale P in place from (X : Z) to (X/Z : 1); costs one inversion
+    fp2_t z_inv;
+
+    fp2_copy(&z_inv, &P->z);
+    fp2_inv(&z_inv);
+    fp2_mul(&P->x, &P->x, &z_inv);
+    fp2_set_one(&P->z);
+}
+
+void
+ec_normalize_curve(ec_curve_t *E)
+{ // Rescale the curve constants in place from (A : C) to (A/C : 1)
+    fp2_t c_inv;
+
+    fp2_copy(&c_inv, &E->C);
+    fp2_inv(&c_inv);
+    fp2_mul(&E->A, &E->A, &c_inv);
+    fp2_set_one(&E->C);
+}
+
+void
+ec_curve_normalize_A24(ec_curve_t *E)
+{ // Make sure E->A24 holds the normalised A24 constant (z = 1).
+  // The work is done at most once per curve; the flag records that.
+    if (E->is_A24_computed_and_normalized) {
+        // Cached value: only re-check the invariant
+        assert(fp2_is_one(&E->A24.z));
+        return;
+    }
+
+    AC_to_A24(&E->A24, E);
+    ec_normalize_point(&E->A24);
+    E->is_A24_computed_and_normalized = true;
+
+    assert(fp2_is_one(&E->A24.z));
+}
+
+void
+ec_normalize_curve_and_A24(ec_curve_t *E)
+{ // Neither the curve nor A24 is guaranteed to be normalised on entry.
+  // Step 1: bring the curve to (A/C : 1) unless C is already one.
+  // Step 2: derive A24 = ((A + 2) / 4 : 1) unless it is already cached.
+    if (!fp2_is_one(&E->C)) {
+        ec_normalize_curve(E);
+    }
+
+    // Nothing more to do when a normalised A24 is already stored
+    if (E->is_A24_computed_and_normalized) {
+        return;
+    }
+
+    // A24.x = A + 2, built by adding one twice
+    fp2_add_one(&E->A24.x, &E->A);
+    fp2_add_one(&E->A24.x, &E->A24.x);
+    fp_copy(&E->A24.x.im, &E->A.im); // im(A24.x) = im(A)
+
+    // Two halvings give (A + 2) / 4
+    fp2_half(&E->A24.x, &E->A24.x);
+    fp2_half(&E->A24.x, &E->A24.x);
+    fp2_set_one(&E->A24.z);
+
+    E->is_A24_computed_and_normalized = true;
+}
+
+uint32_t
+ec_is_zero(const ec_point_t *P)
+{ // P is the point at infinity exactly when its Z coordinate is zero
+    const uint32_t z_is_zero = fp2_is_zero(&P->z);
+    return z_is_zero;
+}
+
+uint32_t
+ec_has_zero_coordinate(const ec_point_t *P)
+{ // Non-zero result if X or Z (or both) is zero
+    const uint32_t x_is_zero = fp2_is_zero(&P->x);
+    const uint32_t z_is_zero = fp2_is_zero(&P->z);
+    return x_is_zero | z_is_zero;
+}
+
+uint32_t
+ec_is_equal(const ec_point_t *P, const ec_point_t *Q)
+{ // Evaluate if two points in Montgomery coordinates (X:Z) are equal
+  // Returns 0xFFFFFFFF (true) if P=Q, 0 (false) otherwise
+  // The result is a full-width mask so that callers can negate it with ~
+  // or feed it to constant-time selects.
+  // (Assumes fp2_is_zero / fp2_is_equal themselves return 0 or 0xFFFFFFFF.)
+    fp2_t t0, t1;
+
+    // Check if P, Q are the points at infinity
+    uint32_t l_zero = ec_is_zero(P);
+    uint32_t r_zero = ec_is_zero(Q);
+
+    // Check if PX * QZ = QX * PZ
+    fp2_mul(&t0, &P->x, &Q->z);
+    fp2_mul(&t1, &P->z, &Q->x);
+    uint32_t lr_equal = fp2_is_equal(&t0, &t1);
+
+    // Points are equal if
+    // - Both are zero, or
+    // - neither are zero AND PX * QZ = QX * PZ
+    // All three conditions are combined with bitwise AND. A multiplication
+    // here would bind tighter than & and 0xFFFFFFFF * 0xFFFFFFFF wraps to 1,
+    // turning the "true" mask into the value 1.
+    return (l_zero & r_zero) | (~l_zero & ~r_zero & lr_equal);
+}
+
+// Test whether P is a point of exact order two on E, given (A : C).
+// Returns 0 for the point at infinity; otherwise the bitwise OR of the
+// "X is zero" flag and the "CX^2 + AXZ + CZ^2 is zero" flag.
+// Note: the check for the point at infinity is an early return (a branch).
+uint32_t
+ec_is_two_torsion(const ec_point_t *P, const ec_curve_t *E)
+{
+    if (ec_is_zero(P))
+        return 0;
+
+    uint32_t x_is_zero, tmp_is_zero;
+    fp2_t t0, t1, t2;
+    // t0 = (X + Z)^2, t1 = (X - Z)^2
+    fp2_add(&t0, &P->x, &P->z);
+    fp2_sqr(&t0, &t0);
+    fp2_sub(&t1, &P->x, &P->z);
+    fp2_sqr(&t1, &t1);
+    // t2 = 4XZ, t1 = 2(X^2 + Z^2)
+    fp2_sub(&t2, &t0, &t1);
+    fp2_add(&t1, &t0, &t1);
+    // t2 = 4AXZ, t1 = 4C(X^2 + Z^2)
+    fp2_mul(&t2, &t2, &E->A);
+    fp2_mul(&t1, &t1, &E->C);
+    fp2_add(&t1, &t1, &t1);
+    fp2_add(&t0, &t1, &t2); // 4 (CX^2+CZ^2+AXZ)
+
+    x_is_zero = fp2_is_zero(&P->x);
+    tmp_is_zero = fp2_is_zero(&t0);
+
+    // two torsion if x or x^2 + Ax + 1 is zero
+    return x_is_zero | tmp_is_zero;
+}
+
+uint32_t
+ec_is_four_torsion(const ec_point_t *P, const ec_curve_t *E)
+{ // P has order four exactly when [2]P has order two
+    ec_point_t doubled;
+
+    xDBL_A24(&doubled, P, &E->A24, E->is_A24_computed_and_normalized);
+    return ec_is_two_torsion(&doubled, E);
+}
+
+uint32_t
+ec_is_basis_four_torsion(const ec_basis_t *B, const ec_curve_t *E)
+{ // Check if basis points (P, Q) form a basis of the 4-torsion:
+  // [2]P and [2]Q must both have order two and must be different points
+    ec_point_t dbl_P, dbl_Q;
+    xDBL_A24(&dbl_P, &B->P, &E->A24, E->is_A24_computed_and_normalized);
+    xDBL_A24(&dbl_Q, &B->Q, &E->A24, E->is_A24_computed_and_normalized);
+
+    uint32_t P_has_order_two = ec_is_two_torsion(&dbl_P, E);
+    uint32_t Q_has_order_two = ec_is_two_torsion(&dbl_Q, E);
+    // NOTE(review): the negation below is only meaningful if ec_is_equal
+    // returns a full-width mask (0 or 0xFFFFFFFF) - confirm
+    uint32_t doubles_coincide = ec_is_equal(&dbl_P, &dbl_Q);
+
+    return (P_has_order_two & Q_has_order_two & ~doubles_coincide);
+}
+
+int
+ec_curve_verify_A(const fp2_t *A)
+{ // Verify the Montgomery coefficient A is valid (A^2-4 \ne 0),
+  // i.e. that A is neither 2 nor -2
+  // Return 1 if curve is valid, 0 otherwise
+    fp2_t two, minus_two;
+
+    // two = 2
+    fp2_set_one(&two);
+    fp_add(&two.re, &two.re, &two.re);
+
+    // minus_two = -2
+    fp2_copy(&minus_two, &two);
+    fp_neg(&minus_two.re, &minus_two.re);
+
+    uint32_t is_singular = fp2_is_equal(A, &two) | fp2_is_equal(A, &minus_two);
+    return is_singular ? 0 : 1;
+}
+
+int
+ec_curve_init_from_A(ec_curve_t *E, const fp2_t *A)
+{ // Reset E, install the coefficient A and report whether A is valid
+  // Return 1 if curve is valid, 0 otherwise
+  // Note that A is stored in E even when it turns out to be invalid
+    ec_curve_init(E);
+    fp2_copy(&E->A, A);
+
+    int is_valid = ec_curve_verify_A(A);
+    return is_valid;
+}
+
+void
+ec_j_inv(fp2_t *j_inv, const ec_curve_t *curve)
+{ // j-invariant computation for a Montgomery curve given by (A : C)
+  // Computes j = 256 (A^2 - 3C^2)^3 / (C^4 (A^2 - 4C^2)) with one inversion
+    fp2_t t0, t1;
+
+    fp2_sqr(&t1, &curve->C);      // t1 = C^2
+    fp2_sqr(j_inv, &curve->A);    // j = A^2
+    fp2_add(&t0, &t1, &t1);       // t0 = 2C^2
+    fp2_sub(&t0, j_inv, &t0);     // t0 = A^2 - 2C^2
+    fp2_sub(&t0, &t0, &t1);       // t0 = A^2 - 3C^2
+    fp2_sub(j_inv, &t0, &t1);     // j = A^2 - 4C^2
+    fp2_sqr(&t1, &t1);            // t1 = C^4
+    fp2_mul(j_inv, j_inv, &t1);   // j = C^4 (A^2 - 4C^2), the denominator
+    fp2_add(&t0, &t0, &t0);
+    fp2_add(&t0, &t0, &t0);       // t0 = 4 (A^2 - 3C^2)
+    fp2_sqr(&t1, &t0);
+    fp2_mul(&t0, &t0, &t1);       // t0 = 64 (A^2 - 3C^2)^3
+    fp2_add(&t0, &t0, &t0);
+    fp2_add(&t0, &t0, &t0);       // t0 = 256 (A^2 - 3C^2)^3, the numerator
+    fp2_inv(j_inv);
+    fp2_mul(j_inv, &t0, j_inv);   // j = numerator / denominator
+}
+
+void
+xDBL_E0(ec_point_t *Q, const ec_point_t *P)
+{ // Doubling of a Montgomery point in projective coordinates (X:Z) on the curve E0 with (A:C) = (0:1).
+  // Input: projective Montgomery x-coordinates P = (XP:ZP), where xP=XP/ZP, and Montgomery curve constants (A:C) = (0:1). 
+  // Output: projective Montgomery x-coordinates Q <- 2*P = (XQ:ZQ) such that x(2P)=XQ/ZQ.
+  // P is only read before Q is first written, so Q may be the same point as P.
+    fp2_t t0, t1, t2;
+
+    fp2_add(&t0, &P->x, &P->z);
+    fp2_sqr(&t0, &t0);           // t0 = (X + Z)^2
+    fp2_sub(&t1, &P->x, &P->z);
+    fp2_sqr(&t1, &t1);           // t1 = (X - Z)^2
+    fp2_sub(&t2, &t0, &t1);      // t2 = 4XZ
+    fp2_add(&t1, &t1, &t1);      // t1 = 2 (X - Z)^2
+    fp2_mul(&Q->x, &t0, &t1);    // XQ = 2 (X + Z)^2 (X - Z)^2
+    fp2_add(&Q->z, &t1, &t2);    // 2 (X - Z)^2 + 4XZ = 2 (X^2 + Z^2)
+    fp2_mul(&Q->z, &Q->z, &t2);  // ZQ = 8 XZ (X^2 + Z^2)
+}
+
+void
+xDBL(ec_point_t *Q, const ec_point_t *P, const ec_point_t *AC)
+{ // Doubling of a Montgomery point in projective coordinates (X:Z). Computation of coefficient values A+2C and 4C
+  // on-the-fly. 
+  // Input: projective Montgomery x-coordinates P = (XP:ZP), where xP=XP/ZP, and Montgomery curve constants (A:C). 
+  // Output: projective Montgomery x-coordinates Q <- 2*P = (XQ:ZQ) such that x(2P)=XQ/ZQ.
+  // The curve constants are passed as a point: AC->x holds A and AC->z holds C.
+  // P is only read before Q is first written, so Q may be the same point as P.
+    fp2_t t0, t1, t2, t3;
+
+    fp2_add(&t0, &P->x, &P->z);
+    fp2_sqr(&t0, &t0);              // t0 = (X + Z)^2
+    fp2_sub(&t1, &P->x, &P->z);
+    fp2_sqr(&t1, &t1);              // t1 = (X - Z)^2
+    fp2_sub(&t2, &t0, &t1);         // t2 = 4XZ
+    fp2_add(&t3, &AC->z, &AC->z);   // t3 = 2C
+    fp2_mul(&t1, &t1, &t3);
+    fp2_add(&t1, &t1, &t1);         // t1 = 4C (X - Z)^2
+    fp2_mul(&Q->x, &t0, &t1);       // XQ = 4C (X + Z)^2 (X - Z)^2
+    fp2_add(&t0, &t3, &AC->x);      // t0 = A + 2C
+    fp2_mul(&t0, &t0, &t2);
+    fp2_add(&t0, &t0, &t1);         // t0 = (A + 2C) 4XZ + 4C (X - Z)^2
+    fp2_mul(&Q->z, &t0, &t2);       // ZQ = 4XZ * t0
+}
+
+void
+xDBL_A24(ec_point_t *Q, const ec_point_t *P, const ec_point_t *A24, const bool A24_normalized)
+{ // Doubling of a Montgomery point in projective coordinates (X:Z).
+  // Input: projective Montgomery x-coordinates P = (XP:ZP), where xP=XP/ZP, and
+  //        the Montgomery curve constants A24 = (A+2C:4C) (or A24 = (A+2C/4C:1) if A24_normalized is set).
+  // Output: projective Montgomery x-coordinates Q <- 2*P = (XQ:ZQ) such that x(2P)=XQ/ZQ.
+    fp2_t t0, t1, t2;
+
+    fp2_add(&t0, &P->x, &P->z);
+    fp2_sqr(&t0, &t0);          // t0 = (XP+ZP)^2
+    fp2_sub(&t1, &P->x, &P->z);
+    fp2_sqr(&t1, &t1);          // t1 = (XP-ZP)^2
+    fp2_sub(&t2, &t0, &t1);     // t2 = 4*XP*ZP
+    if (!A24_normalized)
+        fp2_mul(&t1, &t1, &A24->z); // t1 = A24.z*(XP-ZP)^2; skipped when the caller states A24.z = 1
+    fp2_mul(&Q->x, &t0, &t1);   // XQ = (XP+ZP)^2 * t1
+    fp2_mul(&t0, &t2, &A24->x); // t0 = A24.x * 4*XP*ZP
+    fp2_add(&t0, &t0, &t1);
+    fp2_mul(&Q->z, &t0, &t2);   // ZQ = 4*XP*ZP * (A24.x*4*XP*ZP + t1)
+}
+
+void
+xADD(ec_point_t *R, const ec_point_t *P, const ec_point_t *Q, const ec_point_t *PQ)
+{ // Differential addition of Montgomery points in projective coordinates (X:Z).
+  // Input: projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ, and difference
+  //        PQ=P-Q=(XPQ:ZPQ).
+  // Output: projective Montgomery point R <- P+Q = (XR:ZR) such that x(P+Q)=XR/ZR.
+    fp2_t t0, t1, t2, t3;
+
+    fp2_add(&t0, &P->x, &P->z);
+    fp2_sub(&t1, &P->x, &P->z);
+    fp2_add(&t2, &Q->x, &Q->z);
+    fp2_sub(&t3, &Q->x, &Q->z);
+    fp2_mul(&t0, &t0, &t3);      // t0 = (XP+ZP)*(XQ-ZQ)
+    fp2_mul(&t1, &t1, &t2);      // t1 = (XP-ZP)*(XQ+ZQ)
+    fp2_add(&t2, &t0, &t1);
+    fp2_sub(&t3, &t0, &t1);
+    fp2_sqr(&t2, &t2);           // t2 = (t0+t1)^2
+    fp2_sqr(&t3, &t3);           // t3 = (t0-t1)^2
+    fp2_mul(&t2, &PQ->z, &t2);   // t2 = ZPQ*(t0+t1)^2
+    fp2_mul(&R->z, &PQ->x, &t3); // ZR = XPQ*(t0-t1)^2
+    fp2_copy(&R->x, &t2);        // XR is stored last, after PQ->x has been read, so R may alias an input
+}
+
+void
+xDBLADD(ec_point_t *R,
+        ec_point_t *S,
+        const ec_point_t *P,
+        const ec_point_t *Q,
+        const ec_point_t *PQ,
+        const ec_point_t *A24,
+        const bool A24_normalized)
+{ // Simultaneous doubling and differential addition.
+  // Input:  projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ, the difference
+  //         PQ=P-Q=(XPQ:ZPQ), and the Montgomery curve constants A24 = (A+2C:4C) (or A24 = (A+2C/4C:1) if normalized).
+  // Output: projective Montgomery points R <- 2*P = (XR:ZR) such that x(2P)=XR/ZR, and S <- P+Q = (XS:ZS) such that
+  //         x(Q+P)=XS/ZS. The output fields double as scratch space while the formulas are evaluated.
+    fp2_t t0, t1, t2;
+
+    fp2_add(&t0, &P->x, &P->z);      // t0 = XP+ZP
+    fp2_sub(&t1, &P->x, &P->z);      // t1 = XP-ZP
+    fp2_sqr(&R->x, &t0);             // XR = (XP+ZP)^2
+    fp2_sub(&t2, &Q->x, &Q->z);      // t2 = XQ-ZQ
+    fp2_add(&S->x, &Q->x, &Q->z);    // XS = XQ+ZQ
+    fp2_mul(&t0, &t0, &t2);          // t0 = (XP+ZP)*(XQ-ZQ)
+    fp2_sqr(&R->z, &t1);             // ZR = (XP-ZP)^2
+    fp2_mul(&t1, &t1, &S->x);        // t1 = (XP-ZP)*(XQ+ZQ)
+    fp2_sub(&t2, &R->x, &R->z);      // t2 = 4*XP*ZP
+    if (!A24_normalized)
+        fp2_mul(&R->z, &R->z, &A24->z); // scale by A24.z; skipped when the caller states A24.z = 1
+    fp2_mul(&R->x, &R->x, &R->z);    // XR of 2P
+    fp2_mul(&S->x, &A24->x, &t2);    // XS = A24.x * 4*XP*ZP (temporary)
+    fp2_sub(&S->z, &t0, &t1);        // ZS = t0 - t1
+    fp2_add(&R->z, &R->z, &S->x);
+    fp2_add(&S->x, &t0, &t1);        // XS = t0 + t1
+    fp2_mul(&R->z, &R->z, &t2);      // ZR of 2P
+    fp2_sqr(&S->z, &S->z);
+    fp2_sqr(&S->x, &S->x);
+    fp2_mul(&S->z, &S->z, &PQ->x);   // ZS = XPQ*(t0-t1)^2
+    fp2_mul(&S->x, &S->x, &PQ->z);   // XS = ZPQ*(t0+t1)^2
+}
+
+void
+xMUL(ec_point_t *Q, const ec_point_t *P, const digit_t *k, const int kbits, const ec_curve_t *curve)
+{ // The Montgomery ladder
+  // Input: projective Montgomery point P=(XP:ZP) such that xP=XP/ZP, a scalar k of bitlength kbits, and
+  //        the Montgomery curve constants (A:C); the cached curve->A24 is used instead when it is normalized.
+  // Output: projective Montgomery points Q <- k*P = (XQ:ZQ) such that x(k*P)=XQ/ZQ.
+    ec_point_t R0, R1, A24;
+    digit_t mask;
+    unsigned int bit, prevbit = 0, swap;
+
+    if (!curve->is_A24_computed_and_normalized) {
+        // Computation of A24=(A+2C:4C); its z-coordinate is 4C, so this local copy is NOT normalized
+        fp2_add(&A24.x, &curve->C, &curve->C);
+        fp2_add(&A24.z, &A24.x, &A24.x);
+        fp2_add(&A24.x, &A24.x, &curve->A);
+    } else {
+        fp2_copy(&A24.x, &curve->A24.x);
+        fp2_copy(&A24.z, &curve->A24.z);
+        assert(fp2_is_one(&A24.z)); // Assert A24 has been normalised
+    }
+
+    // R0 <- (1:0), R1 <- P
+    ec_point_init(&R0);
+    fp2_copy(&R1.x, &P->x);
+    fp2_copy(&R1.z, &P->z);
+
+    // Main loop: scan the scalar from the most significant bit, swapping lazily on bit changes
+    for (int i = kbits - 1; i >= 0; i--) {
+        bit = (k[i >> LOG2RADIX] >> (i & (RADIX - 1))) & 1;
+        swap = bit ^ prevbit;
+        prevbit = bit;
+        mask = 0 - (digit_t)swap;
+
+        cswap_points(&R0, &R1, mask);
+        // The multiplication by A24.z may only be skipped when A24 really has the form (a:1)
+        xDBLADD(&R0, &R1, &R0, &R1, P, &A24, curve->is_A24_computed_and_normalized);
+    }
+    swap = 0 ^ prevbit;
+    mask = 0 - (digit_t)swap;
+    cswap_points(&R0, &R1, mask);
+
+    fp2_copy(&Q->x, &R0.x);
+    fp2_copy(&Q->z, &R0.z);
+}
+
+int
+xDBLMUL(ec_point_t *S,
+        const ec_point_t *P,
+        const digit_t *k,
+        const ec_point_t *Q,
+        const digit_t *l,
+        const ec_point_t *PQ,
+        const int kbits,
+        const ec_curve_t *curve)
+{ // The Montgomery biladder
+  // Input:  projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ, scalars k and l of
+  //         bitlength kbits, the difference PQ=P-Q=(XPQ:ZPQ), and the curve (A = 0, or curve->A24 normalized).
+  // Output: projective Montgomery point S <- k*P + l*Q = (XS:ZS) such that x(k*P + l*Q)=XS/ZS.
+  // Returns 1 on success, 0 if P, Q, PQ or P+Q has a zero coordinate. NOTE(review): r[] has 2*BITS entries, so kbits <= BITS is assumed.
+    int i, A_is_zero;
+    digit_t evens, mevens, bitk0, bitl0, maskk, maskl, temp, bs1_ip1, bs2_ip1, bs1_i, bs2_i, h;
+    digit_t sigma[2] = { 0 }, pre_sigma = 0;
+    digit_t k_t[NWORDS_ORDER], l_t[NWORDS_ORDER], one[NWORDS_ORDER] = { 0 }, r[2 * BITS] = { 0 };
+    ec_point_t DIFF1a, DIFF1b, DIFF2a, DIFF2b, R[3] = { 0 }, T[3];
+
+    // differential additions formulas are invalid in this case
+    if (ec_has_zero_coordinate(P) | ec_has_zero_coordinate(Q) | ec_has_zero_coordinate(PQ))
+        return 0;
+
+    // Derive sigma according to parity
+    bitk0 = (k[0] & 1);
+    bitl0 = (l[0] & 1);
+    maskk = 0 - bitk0; // Parity masks: 0 if even, otherwise 1...1
+    maskl = 0 - bitl0;
+    sigma[0] = (bitk0 ^ 1);
+    sigma[1] = (bitl0 ^ 1);
+    evens = sigma[0] + sigma[1]; // Count number of even scalars
+    mevens = 0 - (evens & 1);    // Mask mevens <- 0 if # even of scalars = 0 or 2, otherwise mevens = 1...1
+
+    // If k and l are both even or both odd, pick sigma = (0,1)
+    sigma[0] = (sigma[0] & mevens);
+    sigma[1] = (sigma[1] & mevens) | (1 & ~mevens);
+
+    // Convert even scalars to odd: k_t <- k - 1 if k is even, k_t <- k otherwise (same for l_t)
+    one[0] = 1;
+    mp_sub(k_t, k, one, NWORDS_ORDER);
+    mp_sub(l_t, l, one, NWORDS_ORDER);
+    select_ct(k_t, k_t, k, maskk, NWORDS_ORDER);
+    select_ct(l_t, l_t, l, maskl, NWORDS_ORDER);
+
+    // Scalar recoding: two recoded digits r[2i], r[2i+1] are produced per scalar bit
+    for (i = 0; i < kbits; i++) {
+        // Swap k_t and l_t whenever sigma[0] changed since the previous iteration
+        maskk = 0 - (sigma[0] ^ pre_sigma);
+        swap_ct(k_t, l_t, maskk, NWORDS_ORDER);
+
+        if (i == kbits - 1) {
+            bs1_ip1 = 0;
+            bs2_ip1 = 0;
+        } else {
+            bs1_ip1 = mp_shiftr(k_t, 1, NWORDS_ORDER);
+            bs2_ip1 = mp_shiftr(l_t, 1, NWORDS_ORDER);
+        }
+        bs1_i = k_t[0] & 1;
+        bs2_i = l_t[0] & 1;
+
+        r[2 * i] = bs1_i ^ bs1_ip1;
+        r[2 * i + 1] = bs2_i ^ bs2_ip1;
+
+        // Revert sigma if second bit, r_(2i+1), is 1
+        pre_sigma = sigma[0];
+        maskk = 0 - r[2 * i + 1];
+        select_ct(&temp, &sigma[0], &sigma[1], maskk, 1);
+        select_ct(&sigma[1], &sigma[1], &sigma[0], maskk, 1);
+        sigma[0] = temp;
+    }
+
+    // Point initialization
+    ec_point_init(&R[0]);
+    maskk = 0 - sigma[0];
+    select_point(&R[1], P, Q, maskk);
+    select_point(&R[2], Q, P, maskk);
+
+    fp2_copy(&DIFF1a.x, &R[1].x);
+    fp2_copy(&DIFF1a.z, &R[1].z);
+    fp2_copy(&DIFF1b.x, &R[2].x);
+    fp2_copy(&DIFF1b.z, &R[2].z);
+
+    // Initialize DIFF2a <- P+Q, DIFF2b <- P-Q
+    xADD(&R[2], &R[1], &R[2], PQ);
+    if (ec_has_zero_coordinate(&R[2]))
+        return 0; // non valid formulas
+
+    fp2_copy(&DIFF2a.x, &R[2].x);
+    fp2_copy(&DIFF2a.z, &R[2].z);
+    fp2_copy(&DIFF2b.x, &PQ->x);
+    fp2_copy(&DIFF2b.z, &PQ->z);
+
+    A_is_zero = fp2_is_zero(&curve->A);
+
+    // Main loop
+    for (i = kbits - 1; i >= 0; i--) {
+        h = r[2 * i] + r[2 * i + 1]; // in {0, 1, 2}
+        maskk = 0 - (h & 1);
+        select_point(&T[0], &R[0], &R[1], maskk);
+        maskk = 0 - (h >> 1);
+        select_point(&T[0], &T[0], &R[2], maskk); // T[0] <- R[h]
+        if (A_is_zero) {
+            xDBL_E0(&T[0], &T[0]);
+        } else {
+            assert(fp2_is_one(&curve->A24.z));
+            xDBL_A24(&T[0], &T[0], &curve->A24, true);
+        }
+
+        maskk = 0 - r[2 * i + 1]; // in {0, 1}
+        select_point(&T[1], &R[0], &R[1], maskk);
+        select_point(&T[2], &R[1], &R[2], maskk);
+
+        cswap_points(&DIFF1a, &DIFF1b, maskk);
+        xADD(&T[1], &T[1], &T[2], &DIFF1a);
+        xADD(&T[2], &R[0], &R[2], &DIFF2a);
+
+        // If h (mod 2) = 1 then swap DIFF2a and DIFF2b
+        maskk = 0 - (h & 1);
+        cswap_points(&DIFF2a, &DIFF2b, maskk);
+
+        // R <- T
+        copy_point(&R[0], &T[0]);
+        copy_point(&R[1], &T[1]);
+        copy_point(&R[2], &T[2]);
+    }
+
+    // Output R[evens]: R[1] if exactly one scalar is even, R[0] otherwise ...
+    select_point(S, &R[0], &R[1], mevens);
+
+    // ... except that R[2] is returned when both scalars are odd
+    maskk = 0 - (bitk0 & bitl0);
+    select_point(S, S, &R[2], maskk);
+    return 1;
+}
+
+int
+ec_ladder3pt(ec_point_t *R,
+             const digit_t *m,
+             const ec_point_t *P,
+             const ec_point_t *Q,
+             const ec_point_t *PQ,
+             const ec_curve_t *E)
+{ // The 3-point Montgomery ladder
+  // Input:  projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ, a scalar m of
+  //         NWORDS_ORDER digits, the difference PQ=P-Q=(XPQ:ZPQ), and a curve E with normalized A24 = (A+2C/4C:1).
+  // Output: projective Montgomery point R <- P + m*Q = (XR:ZR) such that x(P + m*Q)=XR/ZR. Returns 1, or 0 on bad input.
+    assert(E->is_A24_computed_and_normalized);
+    if (!fp2_is_one(&E->A24.z)) {
+        return 0;
+    }
+    // Formulas are not valid in that case
+    if (ec_has_zero_coordinate(PQ)) {
+        return 0;
+    }
+
+    ec_point_t X0, X1, X2;
+    copy_point(&X0, Q);
+    copy_point(&X1, P);
+    copy_point(&X2, PQ);
+
+    int i, j;
+    digit_t t;
+    for (i = 0; i < NWORDS_ORDER; i++) { // every digit of m is scanned, least significant bit first
+        t = 1;
+        for (j = 0; j < RADIX; j++) {
+            cswap_points(&X1, &X2, -((t & m[i]) == 0)); // swap X1 and X2 when the current bit of m is 0
+            xDBLADD(&X0, &X1, &X0, &X1, &X2, &E->A24, true);
+            cswap_points(&X1, &X2, -((t & m[i]) == 0)); // undo the swap
+            t <<= 1;
+        };
+    };
+    copy_point(R, &X1);
+    return 1;
+}
+
+// WRAPPERS to export
+
+void
+ec_dbl(ec_point_t *res, const ec_point_t *P, const ec_curve_t *curve)
+{ // Exported wrapper: res <- 2*P on the given curve, using the cheaper doubling when A24 is cached.
+    if (!curve->is_A24_computed_and_normalized) {
+        // No cached constant: the cast lets xDBL read the curve as an (A:C) pair and derive A+2C, 4C itself
+        xDBL(res, P, (const ec_point_t *)curve);
+        return;
+    }
+    // Cached A24 = ((A+2C)/4C : 1): the normalized doubling skips one multiplication
+    assert(fp2_is_one(&curve->A24.z));
+    xDBL_A24(res, P, &curve->A24, true);
+}
+
+void
+ec_dbl_iter(ec_point_t *res, int n, const ec_point_t *P, ec_curve_t *curve)
+{ // Exported wrapper: res <- [2^n]*P. May normalize (and hence modify) the cached A24 of the curve.
+  // Note that any nonzero n, including a negative one, performs at least one doubling.
+    if (n == 0) {
+        copy_point(res, P);
+        return;
+    }
+
+    // A long doubling chain amortizes the cost of normalizing A24
+    if (n > 50)
+        ec_curve_normalize_A24(curve);
+
+    // The first doubling reads P; the remaining n - 1 doublings are performed in place on res
+    if (curve->is_A24_computed_and_normalized) {
+        // Normalized A24 saves one multiplication per doubling
+        assert(fp2_is_one(&curve->A24.z));
+        xDBL_A24(res, P, &curve->A24, true);
+        for (int left = n - 1; left > 0; left--) {
+            assert(fp2_is_one(&curve->A24.z));
+            xDBL_A24(res, res, &curve->A24, true);
+        }
+    } else {
+        // Generic doubling: the cast lets xDBL read the curve as an (A:C) pair
+        xDBL(res, P, (const ec_point_t *)curve);
+        for (int left = n - 1; left > 0; left--)
+            xDBL(res, res, (const ec_point_t *)curve);
+    }
+}
+
+void
+ec_dbl_iter_basis(ec_basis_t *res, int n, const ec_basis_t *B, ec_curve_t *curve)
+{ // res <- [2^n]*B: ec_dbl_iter is applied to each of the three points (P, Q, PmQ) of the basis in turn
+    ec_dbl_iter(&res->P, n, &B->P, curve);
+    ec_dbl_iter(&res->Q, n, &B->Q, curve);
+    ec_dbl_iter(&res->PmQ, n, &B->PmQ, curve);
+}
+
+void
+ec_mul(ec_point_t *res, const digit_t *scalar, const int kbits, const ec_point_t *P, ec_curve_t *curve)
+{ // Exported wrapper: res <- [scalar]*P via the Montgomery ladder xMUL, for a scalar of kbits bits.
+  // May normalize (and hence modify) the cached A24 of the curve, which is why curve is not const.
+    const bool worth_normalizing = (kbits > 50); // normalizing only pays off for long scalars
+
+    if (worth_normalizing)
+        ec_curve_normalize_A24(curve);
+
+    xMUL(res, P, scalar, kbits, curve);
+}
+
+int
+ec_biscalar_mul(ec_point_t *res,
+                const digit_t *scalarP,
+                const digit_t *scalarQ,
+                const int kbits,
+                const ec_basis_t *PQ,
+                const ec_curve_t *curve)
+{ // res <- scalarP*P + scalarQ*Q for the basis PQ = (P, Q, P-Q). Returns 1 on success, 0 on failure.
+    if (fp2_is_zero(&PQ->PmQ.z))
+        return 0;
+
+    /* Differential additions behave badly when PmQ = (0:1), so we need to
+     * treat this case specifically. Since we assume P, Q are a basis, this
+     * can happen only if kbits==1 */
+    if (kbits == 1) {
+        // Sanity check: our basis should be given by 2-torsion points
+        if (!ec_is_two_torsion(&PQ->P, curve) || !ec_is_two_torsion(&PQ->Q, curve) ||
+            !ec_is_two_torsion(&PQ->PmQ, curve))
+            return 0;
+        digit_t bP, bQ;
+        bP = (scalarP[0] & 1);
+        bQ = (scalarQ[0] & 1);
+        // Only the lowest bit of each scalar matters here; the result is picked by a plain (branching) lookup
+        if (bP == 0 && bQ == 0)
+            ec_point_init(res); //(1: 0)
+        else if (bP == 1 && bQ == 0)
+            copy_point(res, &PQ->P);
+        else if (bP == 0 && bQ == 1)
+            copy_point(res, &PQ->Q);
+        else if (bP == 1 && bQ == 1)
+            copy_point(res, &PQ->PmQ); // PmQ stands in for P+Q: the basis was checked to be 2-torsion above
+        else // should never happen
+            assert(0);
+        return 1;
+    } else {
+        ec_curve_t E; // local copy, so that normalizing A24 does not modify the caller's const curve
+        copy_curve(&E, curve);
+
+        if (!fp2_is_zero(&curve->A)) { // If A is not zero normalize
+            ec_curve_normalize_A24(&E);
+        }
+        return xDBLMUL(res, &PQ->P, scalarP, &PQ->Q, scalarQ, &PQ->PmQ, kbits, (const ec_curve_t *)&E);
+    }
+}
--- a/src/ec/ref/lvlx/ec_jac.c
+++ b/src/ec/ref/lvlx/ec_jac.c
@@ -0,0 +1,335 @@
+#include <assert.h>
+#include <ec.h>
+
+void
+jac_init(jac_point_t *P)
+{ // Set P to the identity element of a Montgomery curve in Jacobian coordinates: (X:Y:Z) = (0:1:0)
+    fp2_set_zero(&P->x);
+    fp2_set_one(&P->y);
+    fp2_set_zero(&P->z);
+}
+
+uint32_t
+jac_is_equal(const jac_point_t *P, const jac_point_t *Q)
+{ // Evaluate if two points in Jacobian coordinates (X:Y:Z) are equal, by cross-multiplying out the denominators
+  // Returns true if P=Q, 0 otherwise. NOTE(review): true is the AND of two fp2_is_zero results; confirm its exact value
+    fp2_t t0, t1, t2, t3;
+
+    fp2_sqr(&t0, &Q->z);
+    fp2_mul(&t2, &P->x, &t0); // x1*z2^2
+    fp2_sqr(&t1, &P->z);
+    fp2_mul(&t3, &Q->x, &t1); // x2*z1^2
+    fp2_sub(&t2, &t2, &t3);   // t2 = x1*z2^2 - x2*z1^2
+
+    fp2_mul(&t0, &t0, &Q->z);
+    fp2_mul(&t0, &P->y, &t0); // y1*z2^3
+    fp2_mul(&t1, &t1, &P->z);
+    fp2_mul(&t1, &Q->y, &t1); // y2*z1^3
+    fp2_sub(&t0, &t0, &t1);   // t0 = y1*z2^3 - y2*z1^3
+
+    return fp2_is_zero(&t0) & fp2_is_zero(&t2);
+}
+
+void
+jac_to_xz(ec_point_t *P, const jac_point_t *xyP)
+{ // Convert the Jacobian point xyP = (X:Y:Z) into x-only projective coordinates P = (X : Z^2)
+    fp2_copy(&P->x, &xyP->x);
+    fp2_copy(&P->z, &xyP->z);
+    fp2_sqr(&P->z, &P->z);
+
+    // If xyP = (0:1:0), we currently have P=(0 : 0) but we want to set P=(1:0)
+    uint32_t c1, c2;
+    fp2_t one;
+    fp2_set_one(&one);
+
+    c1 = fp2_is_zero(&P->x);
+    c2 = fp2_is_zero(&P->z);
+    fp2_select(&P->x, &P->x, &one, c1 & c2); // branch-free: x <- 1 only when both coordinates are zero
+}
+
+void
+jac_to_ws(jac_point_t *Q, fp2_t *t, fp2_t *ao3, const jac_point_t *P, const ec_curve_t *curve)
+{ // Map P from the Montgomery model to a short Weierstrass model in modified Jacobian coordinates (U:V:W:T)
+    // Cost of 3M + 2S when A != 0. Only curve->A is read; ao3 (= A/3) is written only when A != 0.
+    fp_t one;
+    fp2_t a;
+    /* a = 1 - A^2/3, U = X + (A*Z^2)/3, V = Y, W = Z, T = a*Z^4*/
+    fp_set_one(&one);
+    if (!fp2_is_zero(&(curve->A))) {
+        fp_div3(&(ao3->re), &(curve->A.re));
+        fp_div3(&(ao3->im), &(curve->A.im)); // ao3 = A/3, componentwise
+        fp2_sqr(t, &P->z);                   // t = Z^2
+        fp2_mul(&Q->x, ao3, t);
+        fp2_add(&Q->x, &Q->x, &P->x);        // U = X + (A/3)*Z^2
+        fp2_sqr(t, t);                       // t = Z^4
+        fp2_mul(&a, ao3, &(curve->A));       // a = A^2/3
+        fp_sub(&(a.re), &one, &(a.re));
+        fp_neg(&(a.im), &(a.im));            // a = 1 - A^2/3
+        fp2_mul(t, t, &a);                   // T = a*Z^4
+    } else {
+        fp2_copy(&Q->x, &P->x);              // A = 0: U = X and a = 1
+        fp2_sqr(t, &P->z);
+        fp2_sqr(t, t);                       // T = Z^4
+    }
+    fp2_copy(&Q->y, &P->y);
+    fp2_copy(&Q->z, &P->z);
+}
+
+void
+jac_from_ws(jac_point_t *Q, const jac_point_t *P, const fp2_t *ao3, const ec_curve_t *curve)
+{ // Weierstrass -> Montgomery (inverse of jac_to_ws): X = U - (A*W^2)/3, Y = V, Z = W.
+    fp2_t t; // Cost of 1M + 1S when A != 0; ao3 (= A/3) is only read in that case.
+    if (!fp2_is_zero(&(curve->A))) {
+        fp2_sqr(&t, &P->z);
+        fp2_mul(&t, &t, ao3);
+        fp2_sub(&Q->x, &P->x, &t);
+    } else {
+        fp2_copy(&Q->x, &P->x); // A == 0: X = U, which must still be stored when Q is not P itself
+    }
+    fp2_copy(&Q->y, &P->y);
+    fp2_copy(&Q->z, &P->z);
+}
+
+void
+copy_jac_point(jac_point_t *P, const jac_point_t *Q)
+{ // P <- Q: copy the three Jacobian coordinates (X:Y:Z) of Q into P
+    fp2_copy(&(P->x), &(Q->x));
+    fp2_copy(&(P->y), &(Q->y));
+    fp2_copy(&(P->z), &(Q->z));
+}
+
+void
+jac_neg(jac_point_t *Q, const jac_point_t *P)
+{ // Q <- -P in Jacobian coordinates: (X:Y:Z) -> (X:-Y:Z)
+    fp2_copy(&Q->x, &P->x);
+    fp2_neg(&Q->y, &P->y);
+    fp2_copy(&Q->z, &P->z);
+}
+
+void
+DBL(jac_point_t *Q, const jac_point_t *P, const ec_curve_t *AC)
+{ // Cost of 6M + 6S.
+  // Doubling on a Montgomery curve, representation in Jacobian coordinates (X:Y:Z) corresponding to
+  // (X/Z^2,Y/Z^3) This version receives the coefficient value A (only AC->A is read).
+    fp2_t t0, t1, t2, t3;
+
+    uint32_t flag = fp2_is_zero(&P->x) & fp2_is_zero(&P->z); // nonzero iff X = Z = 0 (point at infinity)
+
+    fp2_sqr(&t0, &P->x); // t0 = x1^2
+    fp2_add(&t1, &t0, &t0);
+    fp2_add(&t0, &t0, &t1); // t0 = 3x1^2
+    fp2_sqr(&t1, &P->z);    // t1 = z1^2
+    fp2_mul(&t2, &P->x, &AC->A);
+    fp2_add(&t2, &t2, &t2); // t2 = 2Ax1
+    fp2_add(&t2, &t1, &t2); // t2 = 2Ax1+z1^2
+    fp2_mul(&t2, &t1, &t2); // t2 = z1^2(2Ax1+z1^2)
+    fp2_add(&t2, &t0, &t2); // t2 = alpha = 3x1^2 + z1^2(2Ax1+z1^2)
+    fp2_mul(&Q->z, &P->y, &P->z);
+    fp2_add(&Q->z, &Q->z, &Q->z); // z2 = 2y1z1
+    fp2_sqr(&t0, &Q->z);
+    fp2_mul(&t0, &t0, &AC->A); // t0 = 4Ay1^2z1^2
+    fp2_sqr(&t1, &P->y);
+    fp2_add(&t1, &t1, &t1);     // t1 = 2y1^2
+    fp2_add(&t3, &P->x, &P->x); // t3 = 2x1
+    fp2_mul(&t3, &t1, &t3);     // t3 = 4x1y1^2
+    fp2_sqr(&Q->x, &t2);        // x2 = alpha^2
+    fp2_sub(&Q->x, &Q->x, &t0); // x2 = alpha^2 - 4Ay1^2z1^2
+    fp2_sub(&Q->x, &Q->x, &t3);
+    fp2_sub(&Q->x, &Q->x, &t3); // x2 = alpha^2 - 4Ay1^2z1^2 - 8x1y1^2
+    fp2_sub(&Q->y, &t3, &Q->x); // y2 = 4x1y1^2 - x2
+    fp2_mul(&Q->y, &Q->y, &t2); // y2 = alpha(4x1y1^2 - x2)
+    fp2_sqr(&t1, &t1);          // t1 = 4y1^4
+    fp2_sub(&Q->y, &Q->y, &t1);
+    fp2_sub(&Q->y, &Q->y, &t1); // y2 = alpha(4x1y1^2 - x2) - 8y1^4
+
+    // At infinity keep X and Z of the input; 0 - (flag & 1) is all-ones whether flag is 1 or 0xFF...FF
+    fp2_select(&Q->x, &Q->x, &P->x, 0 - (flag & 1));
+    fp2_select(&Q->z, &Q->z, &P->z, 0 - (flag & 1));
+}
+
+void
+DBLW(jac_point_t *Q, fp2_t *u, const jac_point_t *P, const fp2_t *t)
+{ // Cost of 3M + 5S.
+  // Doubling on a Weierstrass curve, representation in modified Jacobian coordinates
+  // (X:Y:Z:T=a*Z^4) corresponding to (X/Z^2,Y/Z^3), where a is the curve coefficient.
+  // Formula from https://hyperelliptic.org/EFD/g1p/auto-shortw-modified.html
+  // Input: P = (X:Y:Z) with its T value in t. Output: Q <- 2*P with its T value in u.
+    uint32_t flag = fp2_is_zero(&P->x) & fp2_is_zero(&P->z); // nonzero iff X = Z = 0 (point at infinity)
+
+    fp2_t xx, c, cc, r, s, m;
+    // XX = X^2
+    fp2_sqr(&xx, &P->x);
+    // A = 2*Y^2 (named c in the code)
+    fp2_sqr(&c, &P->y);
+    fp2_add(&c, &c, &c);
+    // AA = A^2 (named cc in the code)
+    fp2_sqr(&cc, &c);
+    // R = 2*AA
+    fp2_add(&r, &cc, &cc);
+    // S = (X+A)^2-XX-AA
+    fp2_add(&s, &P->x, &c);
+    fp2_sqr(&s, &s);
+    fp2_sub(&s, &s, &xx);
+    fp2_sub(&s, &s, &cc);
+    // M = 3*XX+T1
+    fp2_add(&m, &xx, &xx);
+    fp2_add(&m, &m, &xx);
+    fp2_add(&m, &m, t);
+    // X3 = M^2-2*S
+    fp2_sqr(&Q->x, &m);
+    fp2_sub(&Q->x, &Q->x, &s);
+    fp2_sub(&Q->x, &Q->x, &s);
+    // Z3 = 2*Y*Z
+    fp2_mul(&Q->z, &P->y, &P->z);
+    fp2_add(&Q->z, &Q->z, &Q->z);
+    // Y3 = M*(S-X3)-R
+    fp2_sub(&Q->y, &s, &Q->x);
+    fp2_mul(&Q->y, &Q->y, &m);
+    fp2_sub(&Q->y, &Q->y, &r);
+    // T3 = 2*R*T1
+    fp2_mul(u, t, &r);
+    fp2_add(u, u, u);
+    // At infinity keep X and Z of the input; 0 - (flag & 1) is all-ones whether flag is 1 or 0xFF...FF
+    fp2_select(&Q->x, &Q->x, &P->x, 0 - (flag & 1));
+    fp2_select(&Q->z, &Q->z, &P->z, 0 - (flag & 1));
+}
+
+void
+select_jac_point(jac_point_t *Q, const jac_point_t *P1, const jac_point_t *P2, const digit_t option)
+{ // Branch-free selection between two Jacobian points, applied coordinate by coordinate with fp2_select
+  // If option = 0 then Q <- P1, else if option = 0xFF...FF then Q <- P2
+    fp2_select(&(Q->x), &(P1->x), &(P2->x), option);
+    fp2_select(&(Q->y), &(P1->y), &(P2->y), option);
+    fp2_select(&(Q->z), &(P1->z), &(P2->z), option);
+}
+
+void
+ADD(jac_point_t *R, const jac_point_t *P, const jac_point_t *Q, const ec_curve_t *AC)
+{
+    // Addition on a Montgomery curve, representation in Jacobian coordinates (X:Y:Z) corresponding
+    // to (x,y) = (X/Z^2,Y/Z^3) This version receives the coefficient value A
+    //
+    // Complete routine, to handle all edge cases:
+    //   if ZP == 0:            # P == inf
+    //       return Q
+    //   if ZQ == 0:            # Q == inf
+    //       return P
+    //   dy <- YQ*ZP**3 - YP*ZQ**3
+    //   dx <- XQ*ZP**2 - XP*ZQ**2
+    //   if dx == 0:             # x1 == x2
+    //       if dy == 0:         # ... and y1 == y2: doubling case
+    //           dy <- ZP*ZQ * (3*XP^2 + ZP^2 * (2*A*XP + ZP^2))
+    //           dx <- 2*YP*ZP
+    //       else:              # ... but y1 != y2, thus P = -Q
+    //           return inf
+    //   XR <- dy**2 - dx**2 * (A*ZP^2*ZQ^2 + XP*ZQ^2 + XQ*ZP^2)
+    //   YR <- dy * (XP*ZQ^2 * dx^2 - XR) - YP*ZQ^3 * dx^3
+    //   ZR <- dx * ZP * ZQ
+
+    // Constant time processing:
+    // - The case for P == 0 or Q == 0 is handled at the end with conditional select
+    // - dy and dx are computed for both the normal and doubling cases, we switch when
+    //   dx == dy == 0 for the normal case.
+    // - If we have that P = -Q then dx = 0 and so ZR will be zero, giving us the point
+    //   at infinity for "free".
+    //
+    // NOTE: these formulas are not optimized for cost;
+    // cheaper ones may well exist.
+    // Cost 17M + 6S + 13a
+    fp2_t t0, t1, t2, t3, u1, u2, v1, dx, dy;
+
+    /* If P is zero or Q is zero we will conditionally select before returning. */
+    uint32_t ctl1 = fp2_is_zero(&P->z);
+    uint32_t ctl2 = fp2_is_zero(&Q->z);
+
+    /* Precompute some values */
+    fp2_sqr(&t0, &P->z); // t0 = z1^2
+    fp2_sqr(&t1, &Q->z); // t1 = z2^2
+
+    /* Compute dy and dx for ordinary case */
+    fp2_mul(&v1, &t1, &Q->z); // v1 = z2^3
+    fp2_mul(&t2, &t0, &P->z); // t2 = z1^3
+    fp2_mul(&v1, &v1, &P->y); // v1 = y1z2^3
+    fp2_mul(&t2, &t2, &Q->y); // t2 = y2z1^3
+    fp2_sub(&dy, &t2, &v1);   // dy = y2z1^3 - y1z2^3
+    fp2_mul(&u2, &t0, &Q->x); // u2 = x2z1^2
+    fp2_mul(&u1, &t1, &P->x); // u1 = x1z2^2
+    fp2_sub(&dx, &u2, &u1);   // dx = x2z1^2 - x1z2^2
+
+    /* Compute dy and dx for doubling case (the common factor ZP of the pseudocode above is dropped from both) */
+    fp2_add(&t1, &P->y, &P->y);   // dx_dbl = t1 = 2y1
+    fp2_add(&t2, &AC->A, &AC->A); // t2 = 2A
+    fp2_mul(&t2, &t2, &P->x);     // t2 = 2Ax1
+    fp2_add(&t2, &t2, &t0);       // t2 = 2Ax1 + z1^2
+    fp2_mul(&t2, &t2, &t0);       // t2 = z1^2 * (2Ax1 + z1^2)
+    fp2_sqr(&t0, &P->x);          // t0 = x1^2
+    fp2_add(&t2, &t2, &t0);       // t2 = x1^2 + z1^2 * (2Ax1 + z1^2)
+    fp2_add(&t2, &t2, &t0);       // t2 = 2*x1^2 + z1^2 * (2Ax1 + z1^2)
+    fp2_add(&t2, &t2, &t0);       // t2 = 3*x1^2 + z1^2 * (2Ax1 + z1^2)
+    fp2_mul(&t2, &t2, &Q->z);     // dy_dbl = t2 = z2 * (3*x1^2 + z1^2 * (2Ax1 + z1^2))
+
+    /* If dx is zero and dy is zero, switch to the doubling values */
+    uint32_t ctl = fp2_is_zero(&dx) & fp2_is_zero(&dy);
+    fp2_select(&dx, &dx, &t1, ctl);
+    fp2_select(&dy, &dy, &t2, ctl);
+
+    /* Some more precomputations */
+    fp2_mul(&t0, &P->z, &Q->z); // t0 = z1z2
+    fp2_sqr(&t1, &t0);          // t1 = (z1z2)^2
+    fp2_sqr(&t2, &dx);          // t2 = dx^2
+    fp2_sqr(&t3, &dy);          // t3 = dy^2
+
+    /* Compute x3 = dy**2 - dx**2 * (A*ZP^2*ZQ^2 + XP*ZQ^2 + XQ*ZP^2) */
+    fp2_mul(&R->x, &AC->A, &t1); // x3 = A*(z1z2)^2
+    fp2_add(&R->x, &R->x, &u1);  // x3 = A*(z1z2)^2 + u1
+    fp2_add(&R->x, &R->x, &u2);  // x3 = A*(z1z2)^2 + u1 + u2
+    fp2_mul(&R->x, &R->x, &t2);  // x3 = dx^2 * (A*(z1z2)^2 + u1 + u2)
+    fp2_sub(&R->x, &t3, &R->x);  // x3 = dy^2 - dx^2 * (A*(z1z2)^2 + u1 + u2)
+
+    /* Compute y3 = dy * (XP*ZQ^2 * dx^2 - XR) - YP*ZQ^3 * dx^3*/
+    fp2_mul(&R->y, &u1, &t2);     // y3 = u1 * dx^2
+    fp2_sub(&R->y, &R->y, &R->x); // y3 = u1 * dx^2 - x3
+    fp2_mul(&R->y, &R->y, &dy);   // y3 = dy * (u1 * dx^2 - x3)
+    fp2_mul(&t3, &t2, &dx);       // t3 = dx^3
+    fp2_mul(&t3, &t3, &v1);       // t3 = v1 * dx^3
+    fp2_sub(&R->y, &R->y, &t3);   // y3 = dy * (u1 * dx^2 - x3) - v1 * dx^3
+
+    /* Compute z3 = dx * z1 * z2 */
+    fp2_mul(&R->z, &dx, &t0);
+
+    /* Finally, we need to set R = P if Q.Z = 0 and R = Q if P.Z = 0 */
+    select_jac_point(R, R, Q, ctl1);
+    select_jac_point(R, R, P, ctl2);
+}
+
+void
+jac_to_xz_add_components(add_components_t *add_comp, const jac_point_t *P, const jac_point_t *Q, const ec_curve_t *AC)
+{
+    // Take two distinct points P and Q on E as jac_point_t and return three components u, v and w in Fp2
+    // such that the xz coordinates of P+Q are (u-v:w) and of P-Q are (u+v:w). Only AC->A is read from the curve.
+
+    fp2_t t0, t1, t2, t3, t4, t5, t6;
+
+    fp2_sqr(&t0, &P->z);             // t0 = z1^2
+    fp2_sqr(&t1, &Q->z);             // t1 = z2^2
+    fp2_mul(&t2, &P->x, &t1);        // t2 = x1z2^2
+    fp2_mul(&t3, &t0, &Q->x);        // t3 = z1^2x2
+    fp2_mul(&t4, &P->y, &Q->z);      // t4 = y1z2
+    fp2_mul(&t4, &t4, &t1);          // t4 = y1z2^3
+    fp2_mul(&t5, &P->z, &Q->y);      // t5 = z1y2
+    fp2_mul(&t5, &t5, &t0);          // t5 = z1^3y2
+    fp2_mul(&t0, &t0, &t1);          // t0 = (z1z2)^2
+    fp2_mul(&t6, &t4, &t5);          // t6 = (z1z2)^3y1y2
+    fp2_add(&add_comp->v, &t6, &t6); // v  = 2(z1z2)^3y1y2
+    fp2_sqr(&t4, &t4);               // t4 = y1^2z2^6
+    fp2_sqr(&t5, &t5);               // t5 = z1^6y2^2
+    fp2_add(&t4, &t4, &t5);          // t4 = z1^6y2^2 + y1^2z2^6
+    fp2_add(&t5, &t2, &t3);          // t5 = x1z2^2 + z1^2x2
+    fp2_add(&t6, &t3, &t3);          // t6 = 2z1^2x2
+    fp2_sub(&t6, &t5, &t6);          // t6 = lambda = x1z2^2 - z1^2x2
+    fp2_sqr(&t6, &t6);               // t6 = lambda^2 = (x1z2^2 - z1^2x2)^2
+    fp2_mul(&t1, &AC->A, &t0);       // t1 = A*(z1z2)^2
+    fp2_add(&t1, &t5, &t1);          // t1 = gamma = A*(z1z2)^2 + x1z2^2 + z1^2x2
+    fp2_mul(&t1, &t1, &t6);          // t1 = gamma*lambda^2
+    fp2_sub(&add_comp->u, &t4, &t1); // u  = z1^6y2^2 + y1^2z2^6 - gamma*lambda^2
+    fp2_mul(&add_comp->w, &t6, &t0); // w  = (z1z2)^2(lambda)^2
+}
--- a/src/ec/ref/lvlx/isog_chains.c
+++ b/src/ec/ref/lvlx/isog_chains.c
@@ -0,0 +1,241 @@
+#include "isog.h"
+#include <assert.h>
+
+// Evaluate a 2^isog_len-isogeny as a chain of 4-isogenies; an odd isog_len needs one final 2-isogeny, handled with care
+static uint32_t
+ec_eval_even_strategy(ec_curve_t *curve,
+                      ec_point_t *points,
+                      unsigned len_points,
+                      const ec_point_t *kernel,
+                      const int isog_len)
+{ // On success returns 0, maps the len_points points in place and replaces curve by the codomain; returns -1 on error
+    ec_curve_normalize_A24(curve);
+    ec_point_t A24;
+    copy_point(&A24, &curve->A24);
+
+    int space = 1; // stack depth: 1 plus the number of doublings of i needed to reach isog_len
+    for (int i = 1; i < isog_len; i *= 2)
+        ++space;
+
+    // Stack of remaining kernel points and their associated orders
+    ec_point_t splits[space];
+    uint16_t todo[space];
+    splits[0] = *kernel;
+    todo[0] = isog_len;
+
+    int current = 0; // Pointer to current top of stack
+
+    // Chain of 4-isogenies
+    for (int j = 0; j < isog_len / 2; ++j) {
+        assert(current >= 0);
+        assert(todo[current] >= 1);
+        // Get the next point of order 4
+        while (todo[current] != 2) {
+            assert(todo[current] >= 3);
+            // A new split will be added
+            ++current;
+            assert(current < space);
+            // We set the seed of the new split to be computed and saved
+            copy_point(&splits[current], &splits[current - 1]);
+            // an even number of doublings close to half the remaining length, plus one when that length is odd
+            unsigned num_dbls = todo[current - 1] / 4 * 2 + todo[current - 1] % 2;
+            todo[current] = todo[current - 1] - num_dbls;
+            while (num_dbls--)
+                xDBL_A24(&splits[current], &splits[current], &A24, false);
+        }
+
+        if (j == 0) {
+            // The order of the kernel is only verified on the first step
+            assert(fp2_is_one(&A24.z));
+            if (!ec_is_four_torsion(&splits[current], curve))
+                return -1;
+
+            ec_point_t T;
+            xDBL_A24(&T, &splits[current], &A24, false);
+            if (fp2_is_zero(&T.x))
+                return -1; // special isogenies not allowed
+        } else {
+            assert(todo[current] == 2);
+#ifndef NDEBUG
+            if (fp2_is_zero(&splits[current].z))
+                debug_print("splitting point z coordinate is unexpectedly zero");
+
+            ec_point_t test;
+            xDBL_A24(&test, &splits[current], &A24, false);
+            if (fp2_is_zero(&test.z))
+                debug_print("z coordinate is unexpectedly zero before doubling");
+            xDBL_A24(&test, &test, &A24, false);
+            if (!fp2_is_zero(&test.z))
+                debug_print("z coordinate is unexpectedly not zero after doubling");
+#endif
+        }
+
+        // Evaluate 4-isogeny on the stacked kernel points below the top and on the caller's points
+        ec_kps4_t kps4;
+        xisog_4(&kps4, &A24, splits[current]);
+        xeval_4(splits, splits, current, &kps4);
+        for (int i = 0; i < current; ++i)
+            todo[i] -= 2;
+        xeval_4(points, points, len_points, &kps4);
+
+        --current;
+    }
+    assert(isog_len % 2 ? !current : current == -1);
+
+    // Final 2-isogeny
+    if (isog_len % 2) {
+#ifndef NDEBUG
+        if (fp2_is_zero(&splits[0].z))
+            debug_print("splitting point z coordinate is unexpectedly zero");
+        ec_point_t test;
+        copy_point(&test, &splits[0]);
+        xDBL_A24(&test, &test, &A24, false);
+        if (!fp2_is_zero(&test.z))
+            debug_print("z coordinate is unexpectedly not zero after doubling");
+#endif
+
+        // We need to check the order of this point in case there were no 4-isogenies
+        if (isog_len == 1 && !ec_is_two_torsion(&splits[0], curve))
+            return -1;
+        if (fp2_is_zero(&splits[0].x)) {
+            // special isogenies not allowed
+            // this case can only happen if isog_len == 1; otherwise the
+            // previous 4-isogenies we computed ensure that $T=(0:1)$ is put
+            // as the kernel of the dual isogeny
+            return -1;
+        }
+
+        ec_kps2_t kps2;
+        xisog_2(&kps2, &A24, splits[0]);
+        xeval_2(points, points, len_points, &kps2);
+    }
+
+    // Output curve in the form (A:C)
+    A24_to_AC(curve, &A24);
+
+    curve->is_A24_computed_and_normalized = false;
+
+    return 0;
+}
+
+uint32_t
+ec_eval_even(ec_curve_t *image, ec_isog_even_t *phi, ec_point_t *points, unsigned len_points)
+{ // Evaluate phi (domain phi->curve, kernel phi->kernel, length phi->length) on points; image receives the codomain
+    copy_curve(image, &phi->curve); // the strategy works in place on image, leaving phi->curve untouched
+    return ec_eval_even_strategy(image, points, len_points, &phi->kernel, phi->length);
+}
+
+// naive implementation: a chain of len 2-isogenies, recomputing each kernel point by repeated doubling
+uint32_t
+ec_eval_small_chain(ec_curve_t *curve,
+                    const ec_point_t *kernel,
+                    int len,
+                    ec_point_t *points,
+                    unsigned len_points,
+                    bool special) // do we allow special isogenies?
+{ // Returns 0 on success (curve and points are updated in place) and (uint32_t)-1 on error
+
+    ec_point_t A24;
+    AC_to_A24(&A24, curve);
+
+    ec_kps2_t kps;
+    ec_point_t small_K, big_K;
+    copy_point(&big_K, kernel);
+
+    for (int i = 0; i < len; i++) {
+        copy_point(&small_K, &big_K);
+        // small_K <- [2^(len - i - 1)] big_K, the kernel of the next 2-isogeny
+        for (int j = 0; j < len - i - 1; j++) {
+            xDBL_A24(&small_K, &small_K, &A24, false);
+        }
+        // Check the order of the point before the first isogeny step
+        if (i == 0 && !ec_is_two_torsion(&small_K, curve))
+            return (uint32_t)-1;
+        // Perform isogeny step
+        if (fp2_is_zero(&small_K.x)) {
+            // kernel point (0:1): only allowed when the caller enabled special isogenies
+            if (special) {
+                ec_point_t B24;
+                xisog_2_singular(&kps, &B24, A24);
+                xeval_2_singular(&big_K, &big_K, 1, &kps);
+                xeval_2_singular(points, points, len_points, &kps);
+                copy_point(&A24, &B24);
+            } else {
+                return (uint32_t)-1;
+            }
+        } else {
+            xisog_2(&kps, &A24, small_K);
+            xeval_2(&big_K, &big_K, 1, &kps);
+            xeval_2(points, points, len_points, &kps);
+        }
+    }
+    A24_to_AC(curve, &A24);
+
+    curve->is_A24_computed_and_normalized = false;
+    return 0;
+}
+
+uint32_t
+ec_isomorphism(ec_isom_t *isom, const ec_curve_t *from, const ec_curve_t *to)
+{ // Fill isom = (Nx, Nz, D) from the (A:C) coefficients of both curves; returns nonzero if Nx or D is zero
+    fp2_t t0, t1, t2, t3, t4;
+
+    fp2_mul(&t0, &from->A, &from->C);
+    fp2_mul(&t1, &to->A, &to->C);
+
+    fp2_mul(&t2, &t1, &to->C); // toA*toC^2
+    fp2_add(&t3, &t2, &t2);
+    fp2_add(&t3, &t3, &t3);
+    fp2_add(&t3, &t3, &t3);
+    fp2_add(&t2, &t2, &t3); // 9*toA*toC^2
+    fp2_sqr(&t3, &to->A);
+    fp2_mul(&t3, &t3, &to->A); // toA^3
+    fp2_add(&t3, &t3, &t3);
+    fp2_sub(&isom->Nx, &t3, &t2); // 2*toA^3-9*toA*toC^2
+    fp2_mul(&t2, &t0, &from->A);  // fromA^2*fromC
+    fp2_sqr(&t3, &from->C);
+    fp2_mul(&t3, &t3, &from->C); // fromC^3
+    fp2_add(&t4, &t3, &t3);
+    fp2_add(&t3, &t4, &t3);             // 3*fromC^3
+    fp2_sub(&t3, &t3, &t2);             // 3*fromC^3-fromA^2*fromC
+    fp2_mul(&isom->Nx, &isom->Nx, &t3); // lambda_x = (2*toA^3-9*toA*toC^2)*(3*fromC^3-fromA^2*fromC)
+
+    fp2_mul(&t2, &t0, &from->C); // fromA*fromC^2
+    fp2_add(&t3, &t2, &t2);
+    fp2_add(&t3, &t3, &t3);
+    fp2_add(&t3, &t3, &t3);
+    fp2_add(&t2, &t2, &t3); // 9*fromA*fromC^2
+    fp2_sqr(&t3, &from->A);
+    fp2_mul(&t3, &t3, &from->A); // fromA^3
+    fp2_add(&t3, &t3, &t3);
+    fp2_sub(&isom->D, &t3, &t2); // 2*fromA^3-9*fromA*fromC^2
+    fp2_mul(&t2, &t1, &to->A);   // toA^2*toC
+    fp2_sqr(&t3, &to->C);
+    fp2_mul(&t3, &t3, &to->C); // toC^3
+    fp2_add(&t4, &t3, &t3);
+    fp2_add(&t3, &t4, &t3);           // 3*toC^3
+    fp2_sub(&t3, &t3, &t2);           // 3*toC^3-toA^2*toC
+    fp2_mul(&isom->D, &isom->D, &t3); // lambda_z = (2*fromA^3-9*fromA*fromC^2)*(3*toC^3-toA^2*toC)
+
+    // Mont -> SW -> SW -> Mont
+    fp2_mul(&t0, &to->C, &from->A);
+    fp2_mul(&t0, &t0, &isom->Nx); // lambda_x*toC*fromA
+    fp2_mul(&t1, &from->C, &to->A);
+    fp2_mul(&t1, &t1, &isom->D);  // lambda_z*fromC*toA
+    fp2_sub(&isom->Nz, &t0, &t1); // lambda_x*toC*fromA - lambda_z*fromC*toA
+    fp2_mul(&t0, &from->C, &to->C);
+    fp2_add(&t1, &t0, &t0);
+    fp2_add(&t0, &t0, &t1);             // 3*fromC*toC
+    fp2_mul(&isom->D, &isom->D, &t0);   // 3*lambda_z*fromC*toC
+    fp2_mul(&isom->Nx, &isom->Nx, &t0); // 3*lambda_x*fromC*toC
+
+    return (fp2_is_zero(&isom->Nx) | fp2_is_zero(&isom->D));
+}
+
+void
+ec_iso_eval(ec_point_t *P, ec_isom_t *isom)
+{ // Apply the isomorphism to P in place: (X:Z) <- (Nx*X + Nz*Z : D*Z)
+    fp2_t z_term;
+    fp2_mul(&z_term, &P->z, &isom->Nz); // Nz*Z, taken before Z is overwritten below
+    fp2_mul(&P->x, &P->x, &isom->Nx);   // Nx*X
+    fp2_add(&P->x, &P->x, &z_term);     // X <- Nx*X + Nz*Z
+    fp2_mul(&P->z, &P->z, &isom->D);    // Z <- D*Z
+}
--- a/src/ec/ref/lvlx/test/basis-gen-bench.c
+++ b/src/ec/ref/lvlx/test/basis-gen-bench.c
@@ -0,0 +1,143 @@
+#include <bench.h>
+#include <assert.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <ec.h>
+
+#define STRINGIFY2(x) #x
+#define STRINGIFY(x) STRINGIFY2(x)
+
+/******************************
+Util functions
+******************************/
+
+/* qsort comparator: orders two uint64_t values ascending.
+ * Returns -1, 0 or 1 from explicit comparisons rather than from a
+ * subtraction, so a 64-bit difference is never narrowed to int. */
+int
+cmp_u64(const void *v1, const void *v2)
+{
+    const uint64_t lhs = *(const uint64_t *)v1;
+    const uint64_t rhs = *(const uint64_t *)v2;
+    if (lhs == rhs) {
+        return 0;
+    }
+    return (lhs < rhs) ? -1 : 1;
+}
+
+// Benchmarks 2^n-torsion basis generation (no hint) on E : y^2 = x^3 + 6*x^2 + x.
+// Takes 20 timed batches of `iterations` calls, sorts the last 10 and prints
+// their median divided by `iterations`. `iterations` must be positive.
+void
+bench_basis_generation(unsigned int n, int iterations)
+{
+    int i, j;
+    uint64_t cycles1, cycles2;
+    uint64_t cycle_runs[20];
+
+    ec_basis_t basis;
+    ec_curve_t curve;
+    ec_curve_init(&curve);
+    fp2_set_small(&(curve.A), 6);
+    fp2_set_one(&(curve.C));
+    ec_curve_normalize_A24(&curve);
+
+    for (i = 0; i < 20; i++) {
+        cycles1 = cpucycles();
+        for (j = 0; j < iterations; j++) {
+            (void)ec_curve_to_basis_2f_to_hint(&basis, &curve, n);
+        }
+        cycles2 = cpucycles();
+        cycle_runs[i] = cycles2 - cycles1;
+    }
+    // Only runs 10..19 are sorted, so the median must be read from that half.
+    qsort(cycle_runs + 10, 10, sizeof cycle_runs[0], cmp_u64);
+    printf("  2^%u torsion generation takes .................................... %" PRIu64 " cycles\n",
+           n,
+           cycle_runs[10 + 4] / (iterations));
+}
+
+// Benchmarks rebuilding the 2^n-torsion basis from a hint (computed once,
+// untimed) on E : y^2 = x^3 + 6*x^2 + x: 20 timed batches of `iterations`
+// calls; the last 10 are sorted and their median per call is printed.
+void
+bench_basis_generation_from_hint(unsigned int n, int iterations)
+{
+    int i, j;
+    uint64_t cycles1, cycles2;
+    uint64_t cycle_runs[20];
+
+    ec_basis_t basis;
+    ec_curve_t curve;
+    ec_curve_init(&curve);
+    fp2_set_small(&(curve.A), 6);
+    fp2_set_one(&(curve.C));
+    ec_curve_normalize_A24(&curve);
+
+    uint8_t hint = ec_curve_to_basis_2f_to_hint(&basis, &curve, n);
+
+    for (i = 0; i < 20; i++) {
+        cycles1 = cpucycles();
+        for (j = 0; j < iterations; j++) {
+            ec_curve_to_basis_2f_from_hint(&basis, &curve, n, hint);
+        }
+        cycles2 = cpucycles();
+        cycle_runs[i] = cycles2 - cycles1;
+    }
+    // Only runs 10..19 are sorted, so the median must be read from that half.
+    qsort(cycle_runs + 10, 10, sizeof cycle_runs[0], cmp_u64);
+    printf("  2^%u torsion generation takes .................................... %" PRIu64 " cycles\n",
+           n,
+           cycle_runs[10 + 4] / (iterations));
+}
+
+// Runs both basis benchmarks (without a hint, then from a hint), each for
+// the TORSION_EVEN_POWER torsion and for the 2^128 torsion.
+void
+bench_basis(int iterations)
+{
+    printf("\n-------------------------------------------------------------------------------------" "-------------------\n\n");
+    printf("Benchmarking E[2^n] basis generation for " STRINGIFY(SQISIGN_VARIANT) ": \n\n");
+    bench_basis_generation(TORSION_EVEN_POWER, iterations);
+    bench_basis_generation(128, iterations);
+    printf("\nBenchmarking E[2^n] basis generation with hint for " STRINGIFY(SQISIGN_VARIANT) ": \n\n");
+    bench_basis_generation_from_hint(TORSION_EVEN_POWER, iterations);
+    bench_basis_generation_from_hint(128, iterations);
+}
+
+// Entry point: parses [--help] [--iterations=<n>] and runs the basis benchmarks.
+// Returns 1 after printing usage (on --help or a non-positive count), else 0.
+int
+main(int argc, char *argv[])
+{
+    const int default_iterations = 100 * SQISIGN_TEST_REPS;
+    int iterations = default_iterations;
+    int help = 0;
+
+#ifndef NDEBUG
+    fprintf(stderr,
+            "\x1b[31mIt looks like SQIsign was compiled with assertions enabled.\n"
+            "This will severely impact performance measurements.\x1b[0m\n");
+#endif
+
+    for (int i = 1; i < argc; i++) {
+        if (strcmp(argv[i], "--help") == 0) {
+            help = 1;
+        } else {
+            // Arguments that do not match leave `iterations` untouched
+            (void)sscanf(argv[i], "--iterations=%d", &iterations);
+        }
+    }
+
+    if (help || iterations <= 0) {
+        printf("Usage: %s [--iterations=<iterations>]\n", argv[0]);
+        printf("Where <iterations> is the number of iterations used for benchmarking; if not "
+               "present, uses the default: %d)\n",
+               default_iterations);
+        return 1;
+    }
+
+    cpucycles_init();
+    bench_basis(iterations);
+    return 0;
+}
--- a/src/ec/ref/lvlx/test/basis-gen-test.c
+++ b/src/ec/ref/lvlx/test/basis-gen-test.c
@@ -0,0 +1,195 @@
+#include <assert.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <ec.h>
+
+/******************************
+Test functions
+******************************/
+
+// Checks that basis->P and basis->Q have order exactly 2^n on `curve`, that
+// their order-2 multiples differ, and that Q's order-2 multiple has x = 0.
+// Prints a diagnostic per failed check; returns 1 if all pass, 0 otherwise.
+int
+inner_test_generated_basis(ec_basis_t *basis, ec_curve_t *curve, unsigned int n)
+{
+    int ok = 1;
+    ec_point_t Pm, Qm;
+    copy_point(&Pm, &basis->P);
+    copy_point(&Qm, &basis->Q);
+
+    // After n - 1 doublings both multiples should have order 2
+    for (unsigned int k = 0; k < n - 1; k++) {
+        xDBL_A24(&Pm, &Pm, &curve->A24, curve->is_A24_computed_and_normalized);
+        xDBL_A24(&Qm, &Qm, &curve->A24, curve->is_A24_computed_and_normalized);
+    }
+    if (ec_is_zero(&Pm)) {
+        printf("Point P generated does not have full order\n");
+        ok = 0;
+    }
+    if (ec_is_zero(&Qm)) {
+        printf("Point Q generated does not have full order\n");
+        ok = 0;
+    }
+    if (ec_is_equal(&Pm, &Qm)) {
+        printf("Points P, Q are linearly dependent\n");
+        ok = 0;
+    }
+
+    if (!fp2_is_zero(&Qm.x)) {
+        printf("Points Q is not above the Montgomery point\n");
+        ok = 0;
+    }
+
+    // One further doubling must reach the identity
+    xDBL_A24(&Pm, &Pm, &curve->A24, curve->is_A24_computed_and_normalized);
+    xDBL_A24(&Qm, &Qm, &curve->A24, curve->is_A24_computed_and_normalized);
+    if (!ec_is_zero(&Pm)) {
+        printf("Point P generated does not have order exactly 2^n\n");
+        ok = 0;
+    }
+    if (!ec_is_zero(&Qm)) {
+        printf("Point Q generated does not have order exactly 2^n\n");
+        ok = 0;
+    }
+
+    if (!ok) {
+        printf("Test failed with n = %u\n", n);
+    }
+    return ok;
+}
+
+// Compares the P, Q and PmQ points of two bases with ec_is_equal.
+// Prints one diagnostic per mismatching point; returns 1 when all three
+// match and 0 otherwise.
+int
+inner_test_hint_basis(ec_basis_t *basis, ec_basis_t *basis_hint)
+{
+    int ok = 1;
+
+    if (!ec_is_equal(&basis->P, &basis_hint->P)) {
+        printf("The points P do not match using the hint\n");
+        ok = 0;
+    }
+    if (!ec_is_equal(&basis->Q, &basis_hint->Q)) {
+        printf("The points Q do not match using the hint\n");
+        ok = 0;
+    }
+    if (!ec_is_equal(&basis->PmQ, &basis_hint->PmQ)) {
+        printf("The points PmQ do not match using the hint\n");
+        ok = 0;
+    }
+
+    if (!ok) {
+        printf("Test failed\n");
+    }
+    return ok;
+}
+
+/******************************
+Test wrapper functions
+******************************/
+
+// Generates a 2^n-torsion basis on the curve with A = 0, C = 1
+// (E0 : y^2 = x^3 + x) and validates it; returns 1 on success, 0 on failure.
+int
+test_basis_generation_E0(unsigned int n)
+{
+    ec_curve_t E0;
+    ec_basis_t gens;
+
+    // Montgomery coefficients (A : C) = (0 : 1)
+    ec_curve_init(&E0);
+    fp2_set_small(&E0.A, 0);
+    fp2_set_one(&E0.C);
+    ec_curve_normalize_A24(&E0);
+
+    // The returned hint is not needed here
+    (void)ec_curve_to_basis_2f_to_hint(&gens, &E0, n);
+
+    // Validate the generated basis
+    return inner_test_generated_basis(&gens, &E0, n);
+}
+
+// Generates a 2^n-torsion basis on E : y^2 = x^3 + 6*x^2 + x (A = 6, C = 1)
+// and validates it; returns 1 on success, 0 on failure.
+int
+test_basis_generation(unsigned int n)
+{
+    ec_curve_t E6;
+    ec_basis_t gens;
+
+    // Montgomery coefficients (A : C) = (6 : 1)
+    ec_curve_init(&E6);
+    fp2_set_small(&E6.A, 6);
+    fp2_set_one(&E6.C);
+    ec_curve_normalize_A24(&E6);
+
+    // The returned hint is not needed here
+    (void)ec_curve_to_basis_2f_to_hint(&gens, &E6, n);
+
+    // Validate the generated basis
+    return inner_test_generated_basis(&gens, &E6, n);
+}
+
+// Round-trips the hint on E : y^2 = x^3 + 6*x^2 + x: generates a basis and
+// its hint, validates the basis, regenerates it from the hint and compares.
+// Returns 1 if both the validation and the comparison succeed, 0 otherwise.
+int
+test_basis_generation_with_hints(unsigned int n)
+{
+    ec_curve_t curve;
+    ec_basis_t generated, from_hint;
+
+    // Curve coefficients (A : C) = (6 : 1)
+    ec_curve_init(&curve);
+    fp2_set_small(&curve.A, 6);
+    fp2_set_one(&curve.C);
+    ec_curve_normalize_A24(&curve);
+
+    // Reference basis and the hint returned alongside it
+    const uint8_t hint = ec_curve_to_basis_2f_to_hint(&generated, &curve, n);
+
+    // The reference basis itself has to be a valid 2^n-torsion basis
+    const int basis_ok = inner_test_generated_basis(&generated, &curve, n);
+
+    // Rebuilding from the hint must reproduce P, Q and PmQ
+    ec_curve_to_basis_2f_from_hint(&from_hint, &curve, n, hint);
+    const int match_ok = inner_test_hint_basis(&generated, &from_hint);
+
+    // Both checks have already run, so && cannot skip any diagnostics
+    return basis_ok && match_ok;
+}
+
+// Runs every basis-generation test for the full torsion (TORSION_EVEN_POWER)
+// and for 2^128; all tests always run. Returns 1 if all passed, else 0.
+int
+test_basis(void)
+{
+    const unsigned int orders[2] = { TORSION_EVEN_POWER, 128 };
+    int all_ok = 1;
+
+    // A = 6 curve: plain generation, then the hint round trip
+    for (int k = 0; k < 2; k++) {
+        all_ok &= test_basis_generation(orders[k]);
+        all_ok &= test_basis_generation_with_hints(orders[k]);
+    }
+    // Special case when we have A = 0
+    for (int k = 0; k < 2; k++) {
+        all_ok &= test_basis_generation_E0(orders[k]);
+    }
+    return all_ok;
+}
+
+// Exit status is 0 when every basis test passed and 1 otherwise.
+int
+main(void)
+{
+    const bool ok = test_basis();
+    if (ok) {
+        printf("All basis generation tests passed.\n");
+    } else {
+        printf("Tests failed!\n");
+    }
+    return !ok;
+}
--- a/src/ec/ref/lvlx/test/biextension-bench.c
+++ b/src/ec/ref/lvlx/test/biextension-bench.c
@@ -0,0 +1,113 @@
+#include <time.h>
+#include <assert.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <tools.h>
+#include <mp.h>
+#include "biextension.h"
+#include <rng.h>
+#include "bench.h"
+
+#define STRINGIFY2(x) #x
+#define STRINGIFY(x) STRINGIFY2(x)
+
+// Times `bench` repetitions each of iterated doubling, the Weil pairing, and
+// the Weil- and Tate-based discrete logs on E6 : y^2 = x^3 + 6*x^2 + x, and
+// prints the average cycle count per call. `bench` must be nonzero.
+void
+biextension_bench(uint64_t bench)
+{
+    const uint32_t e = TORSION_EVEN_POWER;
+    uint64_t start, stop;
+    fp2_t pairing;
+    ec_curve_t curve;
+    ec_point_t dbl_out;
+    ec_basis_t BPQ, BRS;
+    digit_t scal_r1[NWORDS_ORDER];
+    digit_t scal_r2[NWORDS_ORDER];
+    digit_t scal_s1[NWORDS_ORDER];
+    digit_t scal_s2[NWORDS_ORDER];
+
+    // Get constants from curve E6 : y^2 = x^3 + 6*x^2 + x
+    ec_curve_init(&curve);
+    fp2_set_small(&curve.A, 6);
+    fp2_set_one(&curve.C);
+    ec_curve_normalize_A24(&curve);
+
+    // Compute 2^e torsion on curve and copy to a second basis
+    (void)ec_curve_to_basis_2f_to_hint(&BPQ, &curve, e);
+    copy_basis(&BRS, &BPQ);
+
+    // Iterated doubling of the basis point P
+    printf("\n\nBenchmarking doublings\n");
+    start = cpucycles();
+    for (uint64_t rep = 0; rep < bench; ++rep) {
+        ec_dbl_iter(&dbl_out, e, &BPQ.P, &curve);
+    }
+    stop = cpucycles();
+    printf("\x1b[34mAvg doubling: %'" PRIu64 " cycles\x1b[0m\n", (stop - start) / bench);
+
+    printf("\n\nBenchmarking (Weil) pairings\n");
+    start = cpucycles();
+    for (uint64_t rep = 0; rep < bench; ++rep) {
+        weil(&pairing, e, &BPQ.P, &BPQ.Q, &BPQ.PmQ, &curve);
+    }
+    stop = cpucycles();
+    printf("\x1b[34mAvg pairing: %'" PRIu64 " cycles\x1b[0m\n", (stop - start) / bench);
+
+    printf("\n\nBenchmarking (Weil) dlogs\n");
+    start = cpucycles();
+    for (uint64_t rep = 0; rep < bench; ++rep) {
+        ec_dlog_2_weil(scal_r1, scal_r2, scal_s1, scal_s2, &BPQ, &BRS, &curve, e);
+    }
+    stop = cpucycles();
+    printf("\x1b[34mAvg pairing dlog: %'" PRIu64 " cycles\x1b[0m\n", (stop - start) / bench);
+
+    printf("\n\nBenchmarking (Tate) dlogs\n");
+    start = cpucycles();
+    for (uint64_t rep = 0; rep < bench; ++rep) {
+        ec_dlog_2_tate(scal_r1, scal_r2, scal_s1, scal_s2, &BPQ, &BRS, &curve, e);
+    }
+    stop = cpucycles();
+    printf("\x1b[34mAvg Tate dlog: %'" PRIu64 " cycles\x1b[0m\n", (stop - start) / bench);
+}
+
+// Entry point: parses [--help] [--iterations=<n>] and runs the biextension
+// benchmarks. Returns 1 after printing usage (--help or n <= 0), else 0.
+int
+main(int argc, char *argv[])
+{
+    const int default_iterations = 1000 * SQISIGN_TEST_REPS;
+    int iterations = default_iterations;
+    int help = 0;
+
+#ifndef NDEBUG
+    fprintf(stderr,
+            "\x1b[31mIt looks like SQIsign was compiled with assertions enabled.\n"
+            "This will severely impact performance measurements.\x1b[0m\n");
+#endif
+
+    for (int i = 1; i < argc; i++) {
+        if (strcmp(argv[i], "--help") == 0) {
+            help = 1;
+        } else {
+            // Arguments that do not match leave `iterations` untouched
+            (void)sscanf(argv[i], "--iterations=%d", &iterations);
+        }
+    }
+
+    if (help || iterations <= 0) {
+        printf("Usage: %s [--iterations=<iterations>]\n", argv[0]);
+        printf("Where <iterations> is the number of iterations used for benchmarking; if not "
+               "present, uses the default: %d)\n",
+               default_iterations);
+        return 1;
+    }
+
+    cpucycles_init();
+
+    printf("Running biextension benchmarks for " STRINGIFY(SQISIGN_VARIANT) ":\n\n");
+
+    biextension_bench((uint64_t)iterations); // positive here, so the cast is lossless
+    return 0;
+}
--- a/src/ec/ref/lvlx/test/biextension-test.c
+++ b/src/ec/ref/lvlx/test/biextension-test.c
@@ -0,0 +1,259 @@
+#include <time.h>
+#include <assert.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <tools.h>
+#include <mp.h>
+#include "biextension.h"
+#include <rng.h>
+#include <bench_test_arguments.h>
+
+void
+fp2_exp_2e(fp2_t *r, uint32_t e, const fp2_t *x)
+{
+    fp2_copy(r, x); // r = x; each pass below squares r, giving x^(2^e)
+    for (; e > 0; e--) {
+        fp2_sqr(r, r);
+    }
+}
+
+// Checks point orders, Weil/Tate pairing orders, Weil bilinearity and the
+// pairing-based discrete logs on E6. Every check is an assert (no-op under NDEBUG).
+void
+biextension_test()
+{
+    clock_t t;
+    ec_curve_t curve;
+    ec_basis_t even_torsion;
+    uint32_t e = TORSION_EVEN_POWER;
+    fp2_t one, r1, rr1, rrr1, r2, r3, tp;
+    ec_point_t P, Q, PmQ, A24;
+    ec_point_t tmp, tmp2, PQ, PP, QQ, PPQ, PQQ, PPP, QQQ, PPPQ, PQQQ;
+
+    // Get constants from curve E6 : y^2 = x^3 + 6*x^2 + x
+    ec_curve_init(&curve);
+    fp2_set_small(&(curve.A), 6);
+    fp2_set_one(&(curve.C));
+    ec_curve_normalize_A24(&curve);
+    copy_point(&A24, &curve.A24);
+
+    // Compute 2^e torsion on curve
+    (void)ec_curve_to_basis_2f_to_hint(&even_torsion, &curve, e);
+    copy_point(&P, &even_torsion.P);
+    copy_point(&Q, &even_torsion.Q);
+    copy_point(&PmQ, &even_torsion.PmQ);
+
+    printf("Testing order of points\n");
+    t = tic();
+    ec_dbl_iter(&tmp, e, &P, &curve);
+    TOC_clock(t, "Doublings");
+    assert(ec_is_zero(&tmp));
+    ec_dbl_iter(&tmp, e, &Q, &curve);
+    assert(ec_is_zero(&tmp));
+    ec_dbl_iter(&tmp, e, &PmQ, &curve);
+    assert(ec_is_zero(&tmp));
+
+    printf("Computing Weil pairing\n");
+    xADD(&PQ, &P, &Q, &PmQ); // PQ = P + Q, computed from the difference PmQ
+    t = tic();
+    weil(&r1, e, &P, &Q, &PQ, &curve);
+    TOC_clock(t, "Weil pairing");
+
+    printf("Computing Tate pairing\n");
+    t = tic();
+    reduced_tate(&tp, e, &P, &Q, &PQ, &curve);
+    TOC_clock(t, "Tate pairing");
+
+    printf("Testing order of Weil pairing\n");
+    fp2_set_one(&one);
+    fp2_exp_2e(&r2, e - 1, &r1);
+    assert(!fp2_is_equal(&r2, &one));
+    fp2_exp_2e(&r2, e, &r1);
+    assert(fp2_is_equal(&r2, &one));
+
+    printf("Testing order of Tate pairing\n");
+    fp2_set_one(&one);
+    fp2_exp_2e(&r2, e - 1, &tp);
+    assert(!fp2_is_equal(&r2, &one));
+    fp2_exp_2e(&r2, e, &tp);
+    assert(fp2_is_equal(&r2, &one));
+
+    printf("Bilinearity tests\n");
+    weil(&r2, e, &P, &Q, &PmQ, &curve); // same call as for r1, but given PmQ instead of PQ
+    fp2_inv(&r2);
+    assert(fp2_is_equal(&r1, &r2));
+
+    xDBL_A24(&PP, &P, &A24, false);
+    xDBL_A24(&QQ, &Q, &A24, false);
+    xADD(&PPQ, &PQ, &P, &Q);
+    xADD(&PQQ, &PQ, &Q, &P);
+
+    weil(&r2, e, &PP, &Q, &PPQ, &curve);
+    weil(&r3, e, &P, &QQ, &PQQ, &curve);
+    assert(fp2_is_equal(&r2, &r3));
+    fp2_sqr(&rr1, &r1);
+    assert(fp2_is_equal(&rr1, &r2));
+
+    xADD(&PPP, &PP, &P, &P);
+    xADD(&QQQ, &QQ, &Q, &Q);
+    xADD(&PPPQ, &PPQ, &P, &PQ);
+    xADD(&PQQQ, &PQQ, &Q, &PQ);
+    weil(&r2, e, &PPP, &Q, &PPPQ, &curve);
+    weil(&r3, e, &P, &QQQ, &PQQQ, &curve);
+    assert(fp2_is_equal(&r2, &r3));
+    fp2_mul(&rrr1, &rr1, &r1);
+    assert(fp2_is_equal(&rrr1, &r2));
+
+    printf("dlog tests\n");
+    ec_basis_t BPQ, BRS;
+    digit_t scal_r1[NWORDS_ORDER] = { 0 };
+    digit_t scal_r2[NWORDS_ORDER] = { 0 };
+    digit_t scal_s1[NWORDS_ORDER] = { 0 };
+    digit_t scal_s2[NWORDS_ORDER] = { 0 };
+    digit_t scal_d1[NWORDS_ORDER] = { 0 };
+    digit_t scal_d2[NWORDS_ORDER] = { 0 };
+
+    // original even torsion
+    BPQ = even_torsion;
+    BRS = even_torsion;
+
+    // alternative torsion, just mix the points up a little...
+    // not filling top word so the addition below can overflow into it
+    // so the scalars are "random enough" but we still keep the difference
+    // scal_d1 and scal_d2 required to compute the right multiple of RmS
+    randombytes((unsigned char *)scal_d1, (NWORDS_ORDER - 1) * sizeof(digit_t));
+    randombytes((unsigned char *)scal_d2, (NWORDS_ORDER - 1) * sizeof(digit_t));
+    randombytes((unsigned char *)scal_s1, (NWORDS_ORDER - 1) * sizeof(digit_t));
+    randombytes((unsigned char *)scal_s2, (NWORDS_ORDER - 1) * sizeof(digit_t));
+
+    // Ensure that r1*s2 - r2*s1 is odd such that the matrix
+    // [[r1, r2], [s1, s2]] is invertible
+    scal_s1[0] = (scal_s1[0] & ((digit_t)(-1) - 1)) + 1; // s1 needs to be odd
+    scal_d1[0] = (scal_d1[0] & ((digit_t)(-1) - 1));     // d1 needs to be even to make r1 odd
+    scal_s2[0] = (scal_s2[0] & ((digit_t)(-1) - 1)) + 1; // s2 needs to be odd
+    scal_d2[0] = (scal_d2[0] & ((digit_t)(-1) - 1)) + 1; // d2 needs to be odd to make r2 even
+
+    // Compute r1 and r2 from the difference di = ri - si
+    mp_add(scal_r1, scal_d1, scal_s1, NWORDS_ORDER);
+    mp_add(scal_r2, scal_d2, scal_s2, NWORDS_ORDER);
+
+    ec_biscalar_mul(&BRS.P, scal_r1, scal_r2, e, &BPQ, &curve);
+    ec_biscalar_mul(&BRS.Q, scal_s1, scal_s2, e, &BPQ, &curve);
+    ec_biscalar_mul(&BRS.PmQ, scal_d1, scal_d2, e, &BPQ, &curve);
+
+    printf("mixed\n");
+
+    // Now solve the discrete log
+    ec_dlog_2_weil(scal_r1, scal_r2, scal_s1, scal_s2, &BPQ, &BRS, &curve, e);
+
+    // assert everything matches
+    // R = [r1]P + [r2]Q
+    ec_biscalar_mul(&tmp, scal_r1, scal_r2, e, &BPQ, &curve);
+    assert(ec_is_equal(&tmp, &BRS.P));
+
+    // S = [s1]P + [s2]Q
+    ec_biscalar_mul(&tmp, scal_s1, scal_s2, e, &BPQ, &curve);
+    assert(ec_is_equal(&tmp, &BRS.Q));
+
+    printf("weil solved\n");
+
+    // now repeat using the tate pairing
+    ec_dlog_2_tate(scal_r1, scal_r2, scal_s1, scal_s2, &BPQ, &BRS, &curve, e);
+
+    // assert everything matches
+    // R = [r1]P + [r2]Q
+    ec_biscalar_mul(&tmp, scal_r1, scal_r2, e, &BPQ, &curve);
+    assert(ec_is_equal(&tmp, &BRS.P));
+
+    // S = [s1]P + [s2]Q
+    ec_biscalar_mul(&tmp, scal_s1, scal_s2, e, &BPQ, &curve);
+    assert(ec_is_equal(&tmp, &BRS.Q));
+
+    printf("tate solved\n");
+
+    // now we try with bases for partial torsion E[2^e] with e < e_full
+    int e_full = TORSION_EVEN_POWER;
+    int e_partial = 126; // NOTE(review): fixed value; presumably assumes TORSION_EVEN_POWER >= 126 - confirm
+
+    ec_dbl_iter(&BRS.P, e_full - e_partial, &BRS.P, &curve);
+    ec_dbl_iter(&BRS.Q, e_full - e_partial, &BRS.Q, &curve);
+    ec_dbl_iter(&BRS.PmQ, e_full - e_partial, &BRS.PmQ, &curve);
+
+    ec_dlog_2_tate(scal_r1, scal_r2, scal_s1, scal_s2, &BPQ, &BRS, &curve, e_partial);
+
+    ec_biscalar_mul(&tmp, scal_r1, scal_r2, e, &BPQ, &curve);
+    ec_dbl_iter(&tmp, e_full - e_partial, &tmp, &curve);
+    assert(ec_is_equal(&tmp, &BRS.P));
+
+    // S = [s1]P + [s2]Q
+    // then S = [2^e_diff] S
+    ec_biscalar_mul(&tmp, scal_s1, scal_s2, e, &BPQ, &curve);
+    ec_dbl_iter(&tmp, e_full - e_partial, &tmp, &curve);
+    assert(ec_is_equal(&tmp, &BRS.Q));
+
+    printf("tate from full basis solved\n");
+
+    ec_dlog_2_tate(scal_r1, scal_r2, scal_s1, scal_s2, &BPQ, &BRS, &curve, e_partial);
+    mp_invert_matrix(scal_r1, scal_r2, scal_s1, scal_s2, e_partial, NWORDS_ORDER);
+
+    // assert everything matches
+    ec_biscalar_mul(&tmp, scal_r1, scal_r2, e, &BRS, &curve);
+    ec_dbl_iter(&tmp2, e_full - e_partial, &BPQ.P, &curve);
+    assert(ec_is_equal(&tmp, &tmp2));
+
+    ec_biscalar_mul(&tmp, scal_s1, scal_s2, e, &BRS, &curve);
+    ec_dbl_iter(&tmp2, e_full - e_partial, &BPQ.Q, &curve);
+    assert(ec_is_equal(&tmp, &tmp2));
+
+    printf("tate to full basis solved\n");
+}
+
+// Test driver: parses [--help] [--seed=<seed>], seeds the RNG and runs
+// biextension_test(). Returns 1 after printing usage, 0 otherwise.
+int
+main(int argc, char *argv[])
+{
+    uint32_t seed[12] = { 0 };
+    int help = 0;
+    int seed_set = 0;
+
+    for (int i = 1; i < argc; i++) {
+        if (!help && strcmp(argv[i], "--help") == 0) {
+            help = 1;
+        } else if (!seed_set && !parse_seed(argv[i], seed)) {
+            seed_set = 1;
+        }
+    }
+
+    if (help) {
+        printf("Usage: %s [--seed=<seed>]\n", argv[0]);
+        printf("Where <seed> is the random seed to be used; if not present, a random seed is "
+               "generated\n");
+        return 1;
+    }
+
+    // No usable seed argument: let randombytes_select fill one in
+    if (seed_set == 0) {
+        randombytes_select((unsigned char *)seed, sizeof(seed));
+    }
+
+    print_seed(seed);
+
+    // The seed is printed before the byte swap applied on big-endian targets
+#if defined(TARGET_BIG_ENDIAN)
+    for (int i = 0; i < 12; i++) {
+        seed[i] = BSWAP32(seed[i]);
+    }
+#endif
+
+    randombytes_init((unsigned char *)seed, NULL, 256);
+
+    printf("Running biextension unit tests\n");
+
+    biextension_test();
+
+    // Reaching this point means no assert in biextension_test fired
+    printf("\nAll tests passed!\n");
+
+    return 0;
+}
--- a/src/ec/ref/lvlx/test/curve-arith-bench.c
+++ b/src/ec/ref/lvlx/test/curve-arith-bench.c
@@ -0,0 +1,163 @@
+#include <bench.h>
+#include <bench_test_arguments.h>
+#include <assert.h>
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "test_extras.h"
+#include <ec.h>
+#include <isog.h>
+#include <rng.h>
+
+#define STRINGIFY2(x) #x
+#define STRINGIFY(x) STRINGIFY2(x)
+
+// Times Nbench xDBL calls, each on its own random point and random A24 pair;
+// returns total elapsed cycles. NOTE(review): inputs are VLAs sized by Nbench.
+uint64_t
+bench_xDBL(unsigned int Nbench)
+{
+    ec_point_t P[Nbench], A24[Nbench];
+    for (unsigned int k = 0; k < Nbench; k++) {
+        fp2_random_test(&P[k].x);
+        fp2_random_test(&P[k].z);
+        fp2_random_test(&A24[k].x);
+        fp2_random_test(&A24[k].z);
+    }
+    const uint64_t start = cpucycles();
+    for (unsigned int k = 0; k < Nbench; k++) {
+        xDBL(&P[k], &P[k], &A24[k]);
+    }
+    const uint64_t stop = cpucycles();
+    return stop - start;
+}
+
+// Times Nbench calls to xeval_4 (one point each) on random points and random
+// kernel data; returns the total elapsed cycles, not the per-call average.
+uint64_t
+bench_xEVAL4(unsigned int Nbench)
+{
+    ec_point_t P[Nbench];
+    ec_kps4_t KPS[Nbench];
+    for (unsigned int k = 0; k < Nbench; k++) {
+        fp2_random_test(&P[k].x);
+        fp2_random_test(&P[k].z);
+        for (int j = 0; j < 3; j++) {
+            fp2_random_test(&KPS[k].K[j].x);
+            fp2_random_test(&KPS[k].K[j].z);
+        }
+    }
+    const uint64_t start = cpucycles();
+    for (unsigned int k = 0; k < Nbench; k++) {
+        xeval_4(&P[k], &P[k], 1, &KPS[k]);
+    }
+    const uint64_t stop = cpucycles();
+    return stop - start;
+}
+
+// Times ec_eval_even on Nbench isogenies of length TORSION_EVEN_POWER from the
+// curve A = 6, C = 1, with kernels P + Q, P + 2Q, P + 3Q, ... built by xADD.
+// Returns the total elapsed cycles over all Nbench evaluations, or 0 on failure.
+uint64_t
+bench_isog_strategy(unsigned int Nbench)
+{
+    ec_curve_t E0;
+    ec_isog_even_t phi[Nbench];
+    ec_basis_t basis2;
+    ec_curve_init(&E0);
+    fp2_set_small(&(E0.A), 6);
+    fp2_set_one(&(E0.C));
+    (void)ec_curve_to_basis_2f_to_hint(&basis2, &E0, TORSION_EVEN_POWER);
+    for (unsigned int i = 0; i < Nbench; i++) {
+        copy_curve(&phi[i].curve, &E0);
+        phi[i].length = TORSION_EVEN_POWER;
+        if (i == 0) {
+            xADD(&phi[i].kernel, &basis2.P, &basis2.Q, &basis2.PmQ);
+        } else if (i == 1) {
+            xADD(&phi[i].kernel, &phi[i - 1].kernel, &basis2.Q, &basis2.P);
+        } else {
+            xADD(&phi[i].kernel, &phi[i - 1].kernel, &basis2.Q, &phi[i - 2].kernel);
+        }
+    }
+    // Evaluate every prepared isogeny: the caller divides the result by Nbench
+    const uint64_t cycles0 = cpucycles();
+    for (unsigned int i = 0; i < Nbench; i++) {
+        if (ec_eval_even(&phi[i].curve, &phi[i], NULL, 0)) {
+            printf("Failed isogeny strategy\n");
+            return 0;
+        }
+    }
+    const uint64_t cycles1 = cpucycles();
+    return cycles1 - cycles0;
+}
+
+// Benchmark driver: parses [--help] [--iterations=<n>] [--seed=<seed>], seeds
+// the RNG and prints per-call cycle counts. Returns 1 after usage, else 0.
+int
+main(int argc, char *argv[])
+{
+    const int default_iterations = 100 * SQISIGN_TEST_REPS;
+    uint32_t seed[12] = { 0 };
+    int iterations = default_iterations;
+    int help = 0;
+    int seed_set = 0;
+
+#ifndef NDEBUG
+    fprintf(stderr,
+            "\x1b[31mIt looks like SQIsign was compiled with assertions enabled.\n"
+            "This will severely impact performance measurements.\x1b[0m\n");
+#endif
+
+    for (int i = 1; i < argc; i++) {
+        if (!help && strcmp(argv[i], "--help") == 0) {
+            help = 1;
+            continue;
+        }
+        if (!seed_set && !parse_seed(argv[i], seed)) {
+            seed_set = 1;
+            continue;
+        }
+        if (sscanf(argv[i], "--iterations=%d", &iterations) == 1) {
+            continue;
+        }
+    }
+
+    if (help || iterations <= 0) {
+        printf("Usage: %s [--iterations=<iterations>] [--seed=<seed>]\n", argv[0]);
+        printf("Where <iterations> is the number of iterations used for benchmarking; if not "
+               "present, uses the default: %d)\n",
+               default_iterations);
+        printf("Where <seed> is the random seed to be used; if not present, a random seed is "
+               "generated\n");
+        return 1;
+    }
+
+    if (!seed_set) {
+        randombytes_select((unsigned char *)seed, sizeof(seed));
+    }
+
+    print_seed(seed);
+
+#if defined(TARGET_BIG_ENDIAN)
+    for (int i = 0; i < 12; i++) {
+        seed[i] = BSWAP32(seed[i]);
+    }
+#endif
+
+    randombytes_init((unsigned char *)seed, NULL, 256);
+    cpucycles_init();
+
+    printf("Benchmarking elliptic curve arithmetic for " STRINGIFY(SQISIGN_VARIANT) ":\n\n");
+
+    // bench_xDBL runs 10x the iterations; its total is divided by the same count
+    uint64_t cycles = bench_xDBL(10 * iterations);
+    printf("Bench xDBL_A24:\t%" PRIu64 " cycles\n", cycles / (10 * iterations));
+
+    cycles = bench_xEVAL4(iterations);
+    printf("Bench xEVAL4:\t%" PRIu64 " cycles\n", cycles / iterations);
+
+    cycles = bench_isog_strategy(iterations);
+    printf("Bench isog strategy:\t%" PRIu64 " cycles\n", cycles / iterations);
+
+    return 0;
+}
--- a/src/ec/ref/lvlx/test/curve-arith-test.c
+++ b/src/ec/ref/lvlx/test/curve-arith-test.c
@@ -0,0 +1,404 @@
+#include <assert.h>
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "test_extras.h"
+#include <ec.h>
+#include <isog.h>
+#include <rng.h>
+#include <bench_test_arguments.h>
+
+/******************************
+Test functions
+******************************/
+
+// Checks two x-only identities on Ntest random point pairs of `curve`:
+// 2(P + Q) = 2P + 2Q and (P + Q) + (P - Q) = 2P. Prints which identity
+// failed and returns 1 at the first mismatch; returns 0 if all hold.
+int
+test_xDBL_xADD(const ec_curve_t *curve, unsigned int Ntest)
+{
+    ec_point_t P, Q, PQ, lhs, rhs;
+
+    for (unsigned int round = 0; round < Ntest; round++) {
+        ec_random_test(&P, curve);
+        ec_random_test(&Q, curve);
+        projective_difference_point(&PQ, &P, &Q, curve);
+
+        // 2(P + Q) = 2P + 2Q
+        xADD(&lhs, &P, &Q, &PQ);
+        ec_dbl(&lhs, &lhs, curve);
+        ec_dbl(&P, &P, curve);
+        ec_dbl(&Q, &Q, curve);
+        ec_dbl(&PQ, &PQ, curve);
+        xADD(&rhs, &P, &Q, &PQ);
+        if (!ec_is_equal(&lhs, &rhs)) {
+            printf("Failed 2(P + Q) = 2P + 2Q\n");
+            return 1;
+        }
+
+        // (P+Q) + (P-Q) = 2P, reusing the already-doubled P, Q and PQ from above
+        xADD(&lhs, &P, &Q, &PQ);
+        ec_dbl(&Q, &Q, curve);
+        xADD(&lhs, &lhs, &PQ, &Q);
+        ec_dbl(&P, &P, curve);
+        ec_dbl(&PQ, &PQ, curve);
+        if (!ec_is_equal(&lhs, &P)) {
+            printf("Failed (P+Q) + (P-Q) = 2P\n");
+            return 1;
+        }
+    }
+    return 0;
+}
+
+// Checks xDBLADD against separate xADD and ec_dbl calls on Ntest random
+// point pairs of `curve`. Prints which half disagreed and returns 1 at the
+// first mismatch; returns 0 if every round agrees.
+int
+test_xDBLADD(const ec_curve_t *curve, unsigned int Ntest)
+{
+    ec_point_t P, Q, PQ, twoP, sum;
+    ec_point_t A24;
+    AC_to_A24(&A24, curve);
+
+    for (unsigned int round = 0; round < Ntest; round++) {
+        ec_random_test(&P, curve);
+        ec_random_test(&Q, curve);
+        projective_difference_point(&PQ, &P, &Q, curve);
+
+        xDBLADD(&twoP, &sum, &P, &Q, &PQ, &A24, false);
+        xADD(&PQ, &P, &Q, &PQ);
+        if (!ec_is_equal(&sum, &PQ)) {
+            printf("Failed addition in xDBLADD\n");
+            return 1;
+        }
+        ec_dbl(&P, &P, curve);
+        if (!ec_is_equal(&twoP, &P)) {
+            printf("Failed doubling in xDBLADD\n");
+            return 1;
+        }
+    }
+    return 0;
+}
+
+// Compares xDBL with xDBL_A24 (rescaled and normalized A24) and xDBL_E0 on Ntest
+// random points; returns 1 at the first mismatch, else 0. NOTE(review): xDBL_E0 takes no curve.
+int
+test_xDBL_variants(ec_curve_t *curve, unsigned int Ntest)
+{
+    ec_curve_t E;
+    ec_point_t P, ref, via_proj, via_norm, via_E0;
+    ec_point_t A24, A24norm;
+    fp2_t z;
+
+    AC_to_A24(&A24, curve);
+    copy_point(&A24norm, &A24);
+    ec_normalize_point(&A24norm);
+    // Randomize projective representation
+    copy_curve(&E, curve);
+    fp2_random_test(&z);
+    fp2_mul(&E.A24.x, &A24.x, &z);
+    fp2_mul(&E.A24.z, &A24.z, &z);
+    E.is_A24_computed_and_normalized = false;
+
+    for (unsigned int round = 0; round < Ntest; round++) {
+        ec_random_test(&P, curve);
+        xDBL(&ref, &P, (const ec_point_t *)curve);
+        xDBL_A24(&via_proj, &P, &E.A24, false);
+        xDBL_A24(&via_norm, &P, &A24norm, true);
+        xDBL_E0(&via_E0, &P);
+        if (!ec_is_equal(&ref, &via_proj)) {
+            printf("xDBL and xDBL_A24 dont match\n");
+            return 1;
+        }
+        if (!ec_is_equal(&ref, &via_norm)) {
+            printf("xDBL and xDBL_A24 normalized dont match\n");
+            return 1;
+        }
+        if (!ec_is_equal(&ref, &via_E0)) {
+            printf("xDBL and xDBL_E0 dont match\n");
+            return 1;
+        }
+    }
+    return 0;
+}
+
+// Checks how xADD, ec_dbl and xDBLADD treat the point (1 : 0) on Ntest random
+// points of `curve`: 0 + 0, P - P, 2*0 and P + 0 in both argument orders.
+// Prints the failed identity and returns 1 at the first mismatch, else 0.
+int
+test_zero_identities(ec_curve_t *curve, unsigned int Ntest)
+{
+    ec_point_t P, sum, out, inf;
+    fp2_set_one(&P.x);
+    fp2_set_zero(&P.z);
+
+    fp2_set_one(&inf.x);
+    fp2_set_zero(&inf.z);
+
+    assert(ec_is_zero(&P));
+
+    for (unsigned int round = 0; round < Ntest; round++) {
+        ec_random_test(&P, curve);
+
+        xADD(&out, &inf, &inf, &inf);
+        if (!ec_is_zero(&out)) {
+            printf("Failed 0 + 0 = 0\n");
+            return 1;
+        }
+
+        ec_dbl(&out, &P, curve);
+        xADD(&out, &P, &P, &out);
+        if (!ec_is_zero(&out)) {
+            printf("Failed P - P = 0\n");
+            return 1;
+        }
+
+        ec_dbl(&out, &inf, curve);
+        if (!ec_is_zero(&out)) {
+            printf("Failed 2*0 = 0\n");
+            return 1;
+        }
+
+        xADD(&out, &P, &inf, &P);
+        if (!ec_is_equal(&out, &P)) {
+            printf("Failed P + 0 = P\n");
+            return 1;
+        }
+        xADD(&out, &inf, &P, &P);
+        if (!ec_is_equal(&out, &P)) {
+            printf("Failed P + 0 = P\n");
+            return 1;
+        }
+
+        xDBLADD(&out, &sum, &P, &inf, &P, &curve->A24, false);
+        if (!ec_is_equal(&sum, &P)) {
+            printf("Failed P + 0 = P in xDBLADD\n");
+            return 1;
+        }
+        xDBLADD(&out, &sum, &inf, &P, &P, &curve->A24, false);
+        if (!ec_is_equal(&sum, &P)) {
+            printf("Failed P + 0 = P in xDBLADD\n");
+            return 1;
+        }
+        if (!ec_is_zero(&out)) {
+            printf("Failed 2*0 = 0 in xDBLADD\n");
+            return 1;
+        }
+    }
+    return 0;
+}
+
+int
+test_jacobian(const ec_curve_t *curve, unsigned int Ntest)
+{ // Runs Ntest rounds of Jacobian / Weierstrass identity checks; returns 0 if all pass, 1 at the first failure
+    unsigned int i;
+
+    ec_point_t P, Q;
+    jac_point_t R, S, T, U, jac_zero;
+    fp2_t t0, t1;
+
+    jac_init(&jac_zero);
+
+    for (i = 0; i < Ntest; i++) {
+        ec_random_test(&P, curve);
+        ec_normalize_point(&P);
+        ec_random_test(&Q, curve);
+        ec_normalize_point(&Q);
+
+        /* Convert to Jacobian coordinates. */
+        fp2_copy(&(S.x), &(P.x));
+        ec_recover_y(&(S.y), &(S.x), curve);
+        fp2_set_one(&(S.z));
+        fp2_copy(&(T.x), &(Q.x));
+        ec_recover_y(&(T.y), &(T.x), curve);
+        fp2_set_one(&(T.z));
+
+        ADD(&R, &jac_zero, &jac_zero, curve);
+        if (!jac_is_equal(&R, &jac_zero)) {
+            printf("Failed 0 + 0 = 0 in jac\n");
+            return 1;
+        }
+
+        DBL(&R, &jac_zero, curve);
+        if (!jac_is_equal(&R, &jac_zero)) {
+            printf("Failed 2*0 = 0 in jac\n");
+            return 1;
+        }
+
+        jac_neg(&R, &S);
+        ADD(&R, &S, &R, curve); // output aliases the second operand
+        if (!jac_is_equal(&R, &jac_zero)) {
+            printf("Failed P - P = 0 in jac\n");
+            return 1;
+        }
+
+        ADD(&R, &S, &jac_zero, curve);
+        if (!jac_is_equal(&R, &S)) {
+            printf("Failed P + 0 = P in jac\n");
+            return 1;
+        }
+        ADD(&R, &jac_zero, &S, curve); // identity as the first operand
+        if (!jac_is_equal(&R, &S)) {
+            printf("Failed 0 + P = P in jac\n");
+            return 1;
+        }
+        ADD(&R, &S, &jac_zero, curve); // same operand order as the first P + 0 check above
+        if (!jac_is_equal(&R, &S)) {
+            printf("Failed P + 0 = P in jac\n");
+            return 1;
+        }
+
+        DBL(&R, &S, curve);
+        ADD(&U, &S, &S, curve);
+        if (!jac_is_equal(&R, &U)) {
+            printf("Failed P + P = 2*P in jac\n");
+            return 1;
+        }
+
+        ADD(&R, &T, &S, curve);
+        ADD(&T, &S, &T, curve); // in place: T now holds S + T for the checks below
+        if (!jac_is_equal(&R, &T)) {
+            printf("Failed P + Q = Q + P in jac\n");
+            return 1;
+        }
+        ADD(&R, &T, &S, curve);
+        ADD(&U, &S, &T, curve);
+        if (!jac_is_equal(&R, &U)) {
+            printf("Failed P + Q = Q + P in jac\n");
+            return 1;
+        }
+
+        // Double R to make it different than (T + S).
+        DBL(&R, &R, curve);
+        ADD(&U, &S, &T, curve);
+        ADD(&U, &U, &R, curve); // U = (S + T) + R
+        ADD(&R, &R, &T, curve);
+        ADD(&R, &R, &S, curve); // R = (R + T) + S
+        if (!jac_is_equal(&R, &U)) {
+            printf("Failed (P + Q) + R = P + (Q + R) in jac\n");
+            return 1;
+        }
+
+        jac_to_ws(&R, &t0, &t1, &jac_zero, curve);
+        jac_from_ws(&R, &R, &t1, curve);
+        if (!jac_is_equal(&R, &jac_zero)) {
+            printf("Failed converting to Weierstrass\n");
+            return 1;
+        }
+
+        jac_to_ws(&R, &t0, &t1, &S, curve);
+        jac_from_ws(&R, &R, &t1, curve);
+        if (!jac_is_equal(&S, &R)) {
+            printf("Failed converting to Weierstrass\n");
+            return 1;
+        }
+        DBL(&S, &S, curve);
+        jac_to_ws(&R, &t0, &t1, &S, curve);
+        jac_from_ws(&R, &R, &t1, curve);
+        if (!jac_is_equal(&S, &R)) {
+            printf("Failed converting to Weierstrass\n");
+            return 1;
+        }
+
+        jac_to_ws(&R, &t0, &t1, &jac_zero, curve);
+        DBLW(&R, &t0, &R, &t0);
+        jac_from_ws(&R, &R, &t1, curve);
+        if (!jac_is_equal(&R, &jac_zero)) {
+            printf("Failed 2*0 = 0 in Weierstrass\n");
+            return 1;
+        }
+        jac_to_ws(&R, &t0, &t1, &S, curve);
+        DBLW(&R, &t0, &R, &t0);
+        jac_from_ws(&R, &R, &t1, curve);
+        DBL(&S, &S, curve); // reference doubling in Jacobian coordinates
+        if (!jac_is_equal(&S, &R)) {
+            printf("Failed doubling in Weierstrass\n");
+            return 1;
+        }
+    }
+    return 0;
+}
+
+int
+main(int argc, char *argv[])
+{ // Test driver: returns 0 when every test passes, nonzero otherwise (1 after printing the usage text)
+    uint32_t seed[12] = { 0 };
+    int iterations = 100 * SQISIGN_TEST_REPS;
+    int help = 0;
+    int seed_set = 0;
+    int res = 0;
+
+    for (int i = 1; i < argc; i++) {
+        if (!help && strcmp(argv[i], "--help") == 0) {
+            help = 1;
+            continue;
+        }
+
+        if (!seed_set && !parse_seed(argv[i], seed)) {
+            seed_set = 1;
+            continue;
+        }
+
+        if (sscanf(argv[i], "--iterations=%d", &iterations) == 1) {
+            continue;
+        }
+    }
+
+    if (help || iterations <= 0) {
+        printf("Usage: %s [--iterations=<iterations>] [--seed=<seed>]\n", argv[0]);
+        printf("Where <iterations> is the number of iterations used for testing; if not "
+               "present, uses the default: %d)\n",
+               (int)(100 * SQISIGN_TEST_REPS)); // the real default, not a value parsed from argv
+        printf("Where <seed> is the random seed to be used; if not present, a random seed is "
+               "generated\n");
+        return 1;
+    }
+
+    if (!seed_set) {
+        randombytes_select((unsigned char *)seed, sizeof(seed));
+    }
+
+    print_seed(seed);
+
+#if defined(TARGET_BIG_ENDIAN)
+    for (int i = 0; i < 12; i++) {
+        seed[i] = BSWAP32(seed[i]);
+    }
+#endif
+
+    randombytes_init((unsigned char *)seed, NULL, 256);
+
+    // First pass: curve with A = 0 and C = 1
+    ec_curve_t curve;
+    ec_curve_init(&curve);
+    fp2_set_small(&(curve.A), 0);
+    fp2_set_small(&(curve.C), 1);
+    // C stays 1 here; a random projective C is exercised
+    // by the second pass below.
+    ec_curve_normalize_A24(&curve);
+
+    res |= test_xDBL_xADD(&curve, iterations);
+    res |= test_xDBLADD(&curve, iterations);
+    res |= test_xDBL_variants(&curve, iterations);
+    res |= test_zero_identities(&curve, iterations);
+    res |= test_jacobian(&curve, iterations);
+
+    fp2_random_test(&(curve.C)); // second pass: random projective C
+    fp2_mul(&(curve.A), &(curve.A), &(curve.C)); // A <- A*C, keeping the ratio A/C
+    ec_curve_normalize_A24(&curve);
+
+    res |= test_xDBL_xADD(&curve, iterations);
+    res |= test_xDBLADD(&curve, iterations);
+    res |= test_xDBL_variants(&curve, iterations);
+    res |= test_zero_identities(&curve, iterations);
+    res |= test_jacobian(&curve, iterations);
+
+    if (res) {
+        printf("Tests failed!\n");
+    } else {
+        printf("All ec arithmetic tests passed.\n");
+    }
+
+    return res;
+}
--- a/src/ec/ref/lvlx/test/test_extras.c
+++ b/src/ec/ref/lvlx/test/test_extras.c
@@ -0,0 +1,116 @@
+#include "test_extras.h"
+#include "rng.h"
+
+// Sample one random-ish field element (for tests only!).
+void
+fp_random_test(fp_t *a)
+{
+    // Fill an FP_ENCODED_BYTES-sized buffer with random bytes and
+    // hand it to fp_decode_reduce to obtain the field element.
+    uint8_t raw[FP_ENCODED_BYTES];
+    randombytes(raw, sizeof raw);
+    fp_decode_reduce(a, raw, sizeof raw);
+}
+
+void
+fp2_random_test(fp2_t *a)
+{ // Sample both components of a GF(p^2) element independently (tests only)
+    fp_random_test(&a->re);
+    fp_random_test(&a->im);
+}
+
+// Given an x-coordinate, determines if this is a valid
+// point on the curve. Uses the projective coefficients (A : C).
+static uint32_t
+projective_is_on_curve(const ec_point_t *P, const ec_curve_t *curve)
+{
+    // Returns 1 when the value tested below is a square or zero, 0 otherwise.
+    fp2_t t0, t1, t2;
+
+    // Check if xz*(C^2x^2+zACx+z^2C^2) is a square
+    fp2_mul(&t0, &curve->C, &P->x);  // t0 = C*x
+    fp2_mul(&t1, &t0, &P->z);        // t1 = C*x*z
+    fp2_mul(&t1, &t1, &curve->A);    // t1 = A*C*x*z
+    fp2_mul(&t2, &curve->C, &P->z);  // t2 = C*z
+    fp2_sqr(&t0, &t0);               // t0 = C^2*x^2
+    fp2_sqr(&t2, &t2);               // t2 = C^2*z^2
+    fp2_add(&t0, &t0, &t1);
+    fp2_add(&t0, &t0, &t2);          // t0 = C^2*x^2 + A*C*x*z + C^2*z^2
+    fp2_mul(&t0, &t0, &P->x);
+    fp2_mul(&t0, &t0, &P->z);        // t0 = x*z*(C^2*x^2 + A*C*x*z + C^2*z^2)
+    return fp2_is_square(&t0) || fp2_is_zero(&t0);
+}
+
+void
+ec_random_normalized_test(ec_point_t *P, const ec_curve_t *curve)
+{
+    // Rejection sampling (tests only): pin z to one, then keep
+    // drawing a fresh random x until projective_is_on_curve
+    // accepts the candidate.
+    fp2_set_one(&P->z);
+    do {
+        fp2_random_test(&P->x);
+    } while (!projective_is_on_curve(P, curve));
+}
+
+void
+ec_random_test(ec_point_t *P, const ec_curve_t *curve)
+{ // Random point in projective form: start from a z = 1 point, then scale x and z by one random factor
+    ec_random_normalized_test(P, curve); // P = (x : 1)
+    fp2_random_test(&P->z);              // z <- random element
+    fp2_mul(&P->x, &P->x, &P->z);        // x <- x*z, so the ratio x/z is unchanged
+}
+
+void
+projective_difference_point(ec_point_t *PQ, const ec_point_t *P, const ec_point_t *Q, const ec_curve_t *curve)
+{
+    // Given P,Q in projective x-only, computes a deterministic choice for (P-Q)
+    // Based on Proposition 3 of https://eprint.iacr.org/2017/518.pdf
+    // Output: PQ = (Bxz + sqrt(Bxz^2 - Bxx*Bzz) : Bzz), with the B's built below.
+    fp2_t Bxx, Bxz, Bzz, t0, t1;
+
+    fp2_mul(&t0, &P->x, &Q->x); // t0 = P.x*Q.x
+    fp2_mul(&t1, &P->z, &Q->z); // t1 = P.z*Q.z
+    fp2_sub(&Bxx, &t0, &t1);
+    fp2_sqr(&Bxx, &Bxx);
+    fp2_mul(&Bxx, &Bxx, &curve->C); // C*(P.x*Q.x-P.z*Q.z)^2
+    fp2_add(&Bxz, &t0, &t1);
+    fp2_mul(&t0, &P->x, &Q->z); // t0 = P.x*Q.z
+    fp2_mul(&t1, &P->z, &Q->x); // t1 = P.z*Q.x
+    fp2_add(&Bzz, &t0, &t1); // Bzz is used as scratch here
+    fp2_mul(&Bxz, &Bxz, &Bzz); // (P.x*Q.x+P.z*Q.z)(P.x*Q.z+P.z*Q.x)
+    fp2_sub(&Bzz, &t0, &t1);
+    fp2_sqr(&Bzz, &Bzz);
+    fp2_mul(&Bzz, &Bzz, &curve->C); // C*(P.x*Q.z-P.z*Q.x)^2
+    fp2_mul(&Bxz, &Bxz, &curve->C); // C*(P.x*Q.x+P.z*Q.z)(P.x*Q.z+P.z*Q.x)
+    fp2_mul(&t0, &t0, &t1);
+    fp2_mul(&t0, &t0, &curve->A);
+    fp2_add(&t0, &t0, &t0); // t0 = 2*A*P.x*Q.z*P.z*Q.x
+    fp2_add(&Bxz, &Bxz, &t0); // C*(P.x*Q.x+P.z*Q.z)(P.x*Q.z+P.z*Q.x) + 2*A*P.x*Q.z*P.z*Q.x
+
+    // Normalization: our square root always has the same sign as long as P.z, Q.z, and C
+    // are in Fp and C is a square, so the B's should be scaled by C*C_bar^2*P.z_bar^2*Q.z_bar^2
+    fp_copy(&t0.re, &curve->C.re);
+    fp_neg(&t0.im, &curve->C.im); // t0 = C_bar (imaginary part negated)
+    fp2_sqr(&t0, &t0);
+    fp2_mul(&t0, &t0, &curve->C); // t0 = C*C_bar^2
+    fp_copy(&t1.re, &P->z.re);
+    fp_neg(&t1.im, &P->z.im); // t1 = P.z_bar
+    fp2_sqr(&t1, &t1);
+    fp2_mul(&t0, &t0, &t1); // t0 = C*C_bar^2*P.z_bar^2
+    fp_copy(&t1.re, &Q->z.re);
+    fp_neg(&t1.im, &Q->z.im); // t1 = Q.z_bar
+    fp2_sqr(&t1, &t1);
+    fp2_mul(&t0, &t0, &t1); // t0 = C*C_bar^2*P.z_bar^2*Q.z_bar^2
+    fp2_mul(&Bxx, &Bxx, &t0);
+    fp2_mul(&Bxz, &Bxz, &t0);
+    fp2_mul(&Bzz, &Bzz, &t0);
+
+    // Solving quadratic equation
+    fp2_sqr(&t0, &Bxz);
+    fp2_mul(&t1, &Bxx, &Bzz);
+    fp2_sub(&t0, &t0, &t1); // t0 = Bxz^2 - Bxx*Bzz
+    fp2_sqrt(&t0);
+    fp2_add(&PQ->x, &Bxz, &t0);
+    fp2_copy(&PQ->z, &Bzz);
+}
--- a/src/ec/ref/lvlx/test/test_extras.h
+++ b/src/ec/ref/lvlx/test/test_extras.h
@@ -0,0 +1,43 @@
+
+#ifndef TEST_EXTRAS_H
+#define TEST_EXTRAS_H
+
+#include <assert.h>
+#include <time.h>
+#include <stdlib.h>
+#include <encoded_sizes.h>
+#include <ec.h>
+#include <fp.h>
+#include <fp2.h>
+
+#define PASSED 0
+#define FAILED 1
+
+// Generating a pseudo-random field element in [0, p-1]
+void fp_random_test(fp_t *a);
+
+// Generating a pseudo-random element in GF(p^2)
+void fp2_random_test(fp2_t *a);
+
+// Generating a random projective x-only point
+void ec_random_test(ec_point_t *P, const ec_curve_t *curve);
+
+// Generating a random projective x-only point and normalizing it
+void ec_random_normalized_test(ec_point_t *P, const ec_curve_t *curve);
+
+// Point difference
+void projective_difference_point(ec_point_t *PQ, const ec_point_t *P, const ec_point_t *Q, const ec_curve_t *curve);
+
+// xDBL
+void xDBL(ec_point_t *Q, const ec_point_t *P, const ec_point_t *AC);
+
+// Double-and-add
+extern void xDBLADD(ec_point_t *R,
+                    ec_point_t *S,
+                    const ec_point_t *P,
+                    const ec_point_t *Q,
+                    const ec_point_t *PQ,
+                    const ec_point_t *A24,
+                    const bool A24_normalized);
+
+#endif
--- a/src/ec/ref/lvlx/xeval.c
+++ b/src/ec/ref/lvlx/xeval.c
@@ -0,0 +1,64 @@
+#include "isog.h"
+#include "ec.h"
+#include <assert.h>
+
+// -----------------------------------------------------------------------------------------
+// -----------------------------------------------------------------------------------------
+
+// Degree-2 isogeny evaluation with kernel generated by P != (0, 0)
+void
+xeval_2(ec_point_t *R, ec_point_t *const Q, const int lenQ, const ec_kps2_t *kps)
+{ // For each of the lenQ points (x : z) in Q, writes the image to the same index of R
+    fp2_t t0, t1, t2;
+    for (int j = 0; j < lenQ; j++) {
+        fp2_add(&t0, &Q[j].x, &Q[j].z); // t0 = x + z
+        fp2_sub(&t1, &Q[j].x, &Q[j].z); // t1 = x - z
+        fp2_mul(&t2, &kps->K.x, &t1);   // t2 = K.x*(x - z)
+        fp2_mul(&t1, &kps->K.z, &t0);   // t1 = K.z*(x + z)
+        fp2_add(&t0, &t2, &t1);         // t0 = t2 + t1
+        fp2_sub(&t1, &t2, &t1);         // t1 = t2 - t1
+        fp2_mul(&R[j].x, &Q[j].x, &t0); // R.x = x*(t2 + t1)
+        fp2_mul(&R[j].z, &Q[j].z, &t1); // R.z = z*(t2 - t1)
+    }
+}
+
+void
+xeval_2_singular(ec_point_t *R, const ec_point_t *Q, const int lenQ, const ec_kps2_t *kps)
+{ // For each (x : z) in Q: R = (x^2 + K.x*x*z + z^2 : K.z*x*z); presumably pairs with xisog_2_singular
+    fp2_t t0, t1;
+    for (int i = 0; i < lenQ; i++) {
+        fp2_mul(&t0, &Q[i].x, &Q[i].z);   // t0 = x*z
+        fp2_mul(&t1, &kps->K.x, &Q[i].z); // t1 = K.x*z
+        fp2_add(&t1, &t1, &Q[i].x);       // t1 = K.x*z + x
+        fp2_mul(&t1, &t1, &Q[i].x);       // t1 = x*(K.x*z + x)
+        fp2_sqr(&R[i].x, &Q[i].z);        // R.x = z^2
+        fp2_add(&R[i].x, &R[i].x, &t1);   // R.x = x^2 + K.x*x*z + z^2
+        fp2_mul(&R[i].z, &t0, &kps->K.z); // R.z = K.z*x*z
+    }
+}
+
+// Degree-4 isogeny evaluation with kernel generated by P such that [2]P != (0, 0)
+void
+xeval_4(ec_point_t *R, const ec_point_t *Q, const int lenQ, const ec_kps4_t *kps)
+{
+    const ec_point_t *K = kps->K;
+
+    fp2_t t0, t1;
+
+    for (int i = 0; i < lenQ; i++) {
+        fp2_add(&t0, &Q[i].x, &Q[i].z);           // t0 = x + z
+        fp2_sub(&t1, &Q[i].x, &Q[i].z);           // t1 = x - z
+        fp2_mul(&(R[i].x), &t0, &K[1].x);         // R.x = u = (x + z)*K[1].x
+        fp2_mul(&(R[i].z), &t1, &K[2].x);         // R.z = v = (x - z)*K[2].x
+        fp2_mul(&t0, &t0, &t1);                   // t0 = x^2 - z^2
+        fp2_mul(&t0, &t0, &K[0].x);               // t0 = w = K[0].x*(x^2 - z^2)
+        fp2_add(&t1, &(R[i].x), &(R[i].z));       // t1 = u + v
+        fp2_sub(&(R[i].z), &(R[i].x), &(R[i].z)); // R.z = u - v
+        fp2_sqr(&t1, &t1);                        // t1 = (u + v)^2
+        fp2_sqr(&(R[i].z), &(R[i].z));            // R.z = (u - v)^2
+        fp2_add(&(R[i].x), &t0, &t1);             // R.x = w + (u + v)^2
+        fp2_sub(&t0, &t0, &(R[i].z));             // t0 = w - (u - v)^2
+        fp2_mul(&(R[i].x), &(R[i].x), &t1);       // R.x = (u + v)^2*(w + (u + v)^2)
+        fp2_mul(&(R[i].z), &(R[i].z), &t0);       // R.z = (u - v)^2*(w - (u - v)^2)
+    }
+}
--- a/src/ec/ref/lvlx/xisog.c
+++ b/src/ec/ref/lvlx/xisog.c
@@ -0,0 +1,61 @@
+#include "isog.h"
+#include "ec.h"
+#include <assert.h>
+
+// -------------------------------------------------------------------------
+// -------------------------------------------------------------------------
+
+// Degree-2 isogeny with kernel generated by P != (0 ,0)
+// Outputs the curve coefficient in the form A24=(A+2C:4C)
+void
+xisog_2(ec_kps2_t *kps, ec_point_t *B, const ec_point_t P)
+{ // B receives the codomain coefficient pair; kps->K receives the constants read by xeval_2
+    fp2_sqr(&B->x, &P.x);           // B.x = P.x^2
+    fp2_sqr(&B->z, &P.z);           // B.z = P.z^2
+    fp2_sub(&B->x, &B->z, &B->x);   // B.x = P.z^2 - P.x^2
+    fp2_add(&kps->K.x, &P.x, &P.z); // K.x = P.x + P.z
+    fp2_sub(&kps->K.z, &P.x, &P.z); // K.z = P.x - P.z
+}
+
+void
+xisog_2_singular(ec_kps2_t *kps, ec_point_t *B24, ec_point_t A24)
+{ // A24 is taken by value, so the in-place fp2_inv on A24.z below only touches this local copy
+    // No need to check the square root, only used for signing.
+    fp2_t t0, four;
+    fp2_set_small(&four, 4);
+    fp2_add(&t0, &A24.x, &A24.x);       // t0 = 2*A24.x
+    fp2_sub(&t0, &t0, &A24.z);          // t0 = 2*A24.x - A24.z
+    fp2_add(&t0, &t0, &t0);             // t0 = 4*A24.x - 2*A24.z
+    fp2_inv(&A24.z);                    // A24.z <- 1/A24.z (in place)
+    fp2_mul(&t0, &t0, &A24.z);          // t0 = (4*A24.x - 2*A24.z)/A24.z
+    fp2_copy(&kps->K.x, &t0);           // K.x = t0
+    fp2_add(&B24->x, &t0, &t0);         // B24.x = 2*K.x
+    fp2_sqr(&t0, &t0);
+    fp2_sub(&t0, &t0, &four);           // t0 = K.x^2 - 4
+    fp2_sqrt(&t0);                      // t0 <- square root of K.x^2 - 4 (in place)
+    fp2_neg(&kps->K.z, &t0);            // K.z = -t0
+    fp2_add(&B24->z, &t0, &t0);         // B24.z = 2*t0
+    fp2_add(&B24->x, &B24->x, &B24->z); // B24.x = 2*K.x + 2*t0
+    fp2_add(&B24->z, &B24->z, &B24->z); // B24.z = 4*t0
+}
+
+// Degree-4 isogeny with kernel generated by P such that [2]P != (0 ,0)
+// Outputs the curve coefficient in the form A24=(A+2C:4C)
+void
+xisog_4(ec_kps4_t *kps, ec_point_t *B, const ec_point_t P)
+{
+    ec_point_t *K = kps->K;
+
+    fp2_sqr(&K[0].x, &P.x);             // K[0].x = P.x^2
+    fp2_sqr(&K[0].z, &P.z);             // K[0].z = P.z^2
+    fp2_add(&K[1].x, &K[0].z, &K[0].x); // K[1].x = P.z^2 + P.x^2 (temporary)
+    fp2_sub(&K[1].z, &K[0].z, &K[0].x); // K[1].z = P.z^2 - P.x^2
+    fp2_mul(&B->x, &K[1].x, &K[1].z);   // B.x = P.z^4 - P.x^4
+    fp2_sqr(&B->z, &K[0].z);            // B.z = P.z^4
+
+    // Constants for xeval_4
+    fp2_add(&K[2].x, &P.x, &P.z);       // K[2].x = P.x + P.z
+    fp2_sub(&K[1].x, &P.x, &P.z);       // K[1].x = P.x - P.z (replaces the temporary above)
+    fp2_add(&K[0].x, &K[0].z, &K[0].z); // K[0].x = 2*P.z^2
+    fp2_add(&K[0].x, &K[0].x, &K[0].x); // K[0].x = 4*P.z^2
+}
--- a/src/ec/ref/lvlx_test.cmake
+++ b/src/ec/ref/lvlx_test.cmake
@@ -0,0 +1,32 @@
+add_executable(curve-arith.test_${SVARIANT_LOWER} ${LVLX_DIR}/test/curve-arith-test.c ${LVLX_DIR}/test/test_extras.c)
+target_include_directories(curve-arith.test_${SVARIANT_LOWER} PUBLIC ${INC_COMMON} ${INC_MP} ${INC_GF} ${INC_GF_${SVARIANT_UPPER}} ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_PUBLIC} ../include ${INC_EC} .)
+target_link_libraries(curve-arith.test_${SVARIANT_LOWER} PRIVATE ${LIB_EC_${SVARIANT_UPPER}} sqisign_common_test) # explicit scope keyword instead of the legacy plain signature
+
+add_executable(biextension.test_${SVARIANT_LOWER} ${LVLX_DIR}/test/biextension-test.c)
+target_include_directories(biextension.test_${SVARIANT_LOWER} PUBLIC ${INC_COMMON} ${INC_MP} ${INC_GF} ${INC_GF_${SVARIANT_UPPER}} ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_PUBLIC} ../include ${INC_EC} .)
+target_link_libraries(biextension.test_${SVARIANT_LOWER} PRIVATE ${LIB_EC_${SVARIANT_UPPER}} sqisign_common_test)
+
+add_executable(basis-gen.test_${SVARIANT_LOWER} ${LVLX_DIR}/test/basis-gen-test.c)
+target_include_directories(basis-gen.test_${SVARIANT_LOWER} PUBLIC ${INC_COMMON} ${INC_MP} ${LVLX_DIR}/test ${INC_GF} ${INC_GF_${SVARIANT_UPPER}} ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_PUBLIC} ../include ${INC_EC} .)
+target_link_libraries(basis-gen.test_${SVARIANT_LOWER} PRIVATE ${LIB_EC_${SVARIANT_UPPER}})
+
+add_test(NAME curve_arith.test_${SVARIANT_LOWER} COMMAND curve-arith.test_${SVARIANT_LOWER}) # NAME/COMMAND form: the target name is resolved to the built executable
+add_test(NAME ec_biextension.test_${SVARIANT_LOWER} COMMAND biextension.test_${SVARIANT_LOWER})
+add_test(NAME ec_basis_gen.test_${SVARIANT_LOWER} COMMAND basis-gen.test_${SVARIANT_LOWER})
+
+add_executable(curve-arith.bench_${SVARIANT_LOWER} ${LVLX_DIR}/test/curve-arith-bench.c ${LVLX_DIR}/test/test_extras.c)
+target_include_directories(curve-arith.bench_${SVARIANT_LOWER} PUBLIC ${INC_COMMON} ${INC_MP} ${INC_GF} ${INC_GF_${SVARIANT_UPPER}} ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_PUBLIC} ../include ${INC_EC} .)
+target_link_libraries(curve-arith.bench_${SVARIANT_LOWER} PRIVATE ${LIB_EC_${SVARIANT_UPPER}} sqisign_common_sys) # explicit scope keyword instead of the legacy plain signature
+
+add_executable(biextension.bench_${SVARIANT_LOWER} ${LVLX_DIR}/test/biextension-bench.c)
+target_include_directories(biextension.bench_${SVARIANT_LOWER} PUBLIC ${INC_COMMON} ${INC_MP} ${INC_GF} ${INC_GF_${SVARIANT_UPPER}} ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_PUBLIC} ../include ${INC_EC} .)
+target_link_libraries(biextension.bench_${SVARIANT_LOWER} PRIVATE ${LIB_EC_${SVARIANT_UPPER}} sqisign_common_sys)
+
+add_executable(basis-gen.bench_${SVARIANT_LOWER} ${LVLX_DIR}/test/basis-gen-bench.c)
+target_include_directories(basis-gen.bench_${SVARIANT_LOWER} PUBLIC ${INC_COMMON} ${INC_MP} ${LVLX_DIR}/test ${INC_GF} ${INC_GF_${SVARIANT_UPPER}} ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_PUBLIC} ../include ${INC_EC} .)
+target_link_libraries(basis-gen.bench_${SVARIANT_LOWER} PRIVATE ${LIB_EC_${SVARIANT_UPPER}})
+
+set(BM_BINS ${BM_BINS} # append this variant's benchmark executables to the accumulated list
+    curve-arith.bench_${SVARIANT_LOWER} basis-gen.bench_${SVARIANT_LOWER} biextension.bench_${SVARIANT_LOWER}
+    CACHE INTERNAL "List of benchmark executables") # INTERNAL cache entries imply FORCE, so the cached list is overwritten on every pass
+
--- a/src/gf/broadwell/CMakeLists.txt
+++ b/src/gf/broadwell/CMakeLists.txt
@@ -1 +1,3 @@
+set(LVLX_DIR ${CMAKE_CURRENT_SOURCE_DIR}/lvlx)
+
 include(${SELECT_SQISIGN_VARIANT})
--- a/src/gf/broadwell/include/asm_preamble.h
+++ b/src/gf/broadwell/include/asm_preamble.h
@@ -0,0 +1,22 @@
+#ifdef __APPLE__
+#define CAT(A, B) _CAT(A, B)
+#define _CAT(A, B) A##B
+#undef fp_add
+#undef fp_sub
+#undef fp_mul
+#undef fp_sqr
+#undef fp2_mul_c0
+#undef fp2_mul_c1
+#undef fp2_sq_c0
+#undef fp2_sq_c1
+#define p2 CAT(_, p2)
+#define p CAT(_, p)
+#define fp_add CAT(_, SQISIGN_NAMESPACE(fp_add))
+#define fp_sub CAT(_, SQISIGN_NAMESPACE(fp_sub))
+#define fp_mul CAT(_, SQISIGN_NAMESPACE(fp_mul))
+#define fp_sqr CAT(_, SQISIGN_NAMESPACE(fp_sqr))
+#define fp2_mul_c0 CAT(_, SQISIGN_NAMESPACE(fp2_mul_c0))
+#define fp2_mul_c1 CAT(_, SQISIGN_NAMESPACE(fp2_mul_c1))
+#define fp2_sq_c0 CAT(_, SQISIGN_NAMESPACE(fp2_sq_c0))
+#define fp2_sq_c1 CAT(_, SQISIGN_NAMESPACE(fp2_sq_c1))
+#endif
--- a/src/gf/broadwell/include/fp2x.h
+++ b/src/gf/broadwell/include/fp2x.h
@@ -0,0 +1,162 @@
+#ifndef FP2X_H
+#define FP2X_H
+
+#include <sqisign_namespace.h>
+#include "fp.h"
+#include <stdio.h>
+
+// Structure for representing elements in GF(p^2)
+typedef struct fp2_t
+{
+    fp_t re, im;
+} fp2_t;
+
+static inline void
+fp2_set_small(fp2_t *x, const uint32_t val)
+{
+    fp_set_small(&(x->re), val);
+    fp_set_zero(&(x->im));
+}
+
+static inline void
+fp2_mul_small(fp2_t *x, const fp2_t *y, uint32_t n)
+{
+    fp_mul_small(&x->re, &y->re, n);
+    fp_mul_small(&x->im, &y->im, n);
+}
+
+static inline void
+fp2_set_zero(fp2_t *x)
+{
+    fp_set_zero(&(x->re));
+    fp_set_zero(&(x->im));
+}
+
+static inline void
+fp2_set_one(fp2_t *x)
+{
+    fp_set_one(&(x->re));
+    fp_set_zero(&(x->im));
+}
+
+static inline uint32_t
+fp2_is_equal(const fp2_t *a, const fp2_t *b)
+{ // Compare two GF(p^2) elements in constant time
+  // Returns 1 (true) if a=b, 0 (false) otherwise
+
+    return fp_is_equal(&(a->re), &(b->re)) & fp_is_equal(&(a->im), &(b->im));
+}
+
+static inline uint32_t
+fp2_is_zero(const fp2_t *a)
+{ // Is a GF(p^2) element zero?
+  // Returns 1 (true) if a=0, 0 (false) otherwise
+
+    return fp_is_zero(&(a->re)) & fp_is_zero(&(a->im));
+}
+
+static inline uint32_t
+fp2_is_one(const fp2_t *a)
+{ // Is a GF(p^2) element one?
+  // Returns 1 (true) if a=1, 0 (false) otherwise
+    return fp_is_equal(&(a->re), &ONE) & fp_is_zero(&(a->im));
+}
+
+static inline void
+fp2_half(fp2_t *x, const fp2_t *y)
+{
+    fp_half(&(x->re), &(y->re));
+    fp_half(&(x->im), &(y->im));
+}
+
+static inline void
+fp2_add(fp2_t *x, const fp2_t *y, const fp2_t *z)
+{
+    fp_add(&(x->re), &(y->re), &(z->re));
+    fp_add(&(x->im), &(y->im), &(z->im));
+}
+
+static inline void
+fp2_add_one(fp2_t *x, const fp2_t *y)
+{
+    fp_add(&x->re, &y->re, &ONE);
+    fp_copy(&x->im, &y->im);
+}
+
+static inline void
+fp2_sub(fp2_t *x, const fp2_t *y, const fp2_t *z)
+{
+    fp_sub(&(x->re), &(y->re), &(z->re));
+    fp_sub(&(x->im), &(y->im), &(z->im));
+}
+
+static inline void
+fp2_neg(fp2_t *x, const fp2_t *y)
+{
+    fp_neg(&(x->re), &(y->re));
+    fp_neg(&(x->im), &(y->im));
+}
+
+#ifndef NO_FP2X_MUL
+static inline void
+fp2_mul(fp2_t *x, const fp2_t *y, const fp2_t *z)
+{ // x = y*z using three fp_mul calls: re = y.re*z.re - y.im*z.im, im = y.re*z.im + y.im*z.re
+    fp_t t0, t1;
+
+    fp_add(&t0, &(y->re), &(y->im));      // t0 = y.re + y.im
+    fp_add(&t1, &(z->re), &(z->im));      // t1 = z.re + z.im
+    fp_mul(&t0, &t0, &t1);                // t0 = (y.re + y.im)*(z.re + z.im)
+    fp_mul(&t1, &(y->im), &(z->im));      // t1 = y.im*z.im
+    fp_mul(&(x->re), &(y->re), &(z->re)); // x.re = y.re*z.re
+    fp_sub(&(x->im), &t0, &t1);
+    fp_sub(&(x->im), &(x->im), &(x->re)); // x.im = t0 - y.im*z.im - y.re*z.re
+    fp_sub(&(x->re), &(x->re), &t1);      // x.re = y.re*z.re - y.im*z.im
+}
+#endif
+
+#ifndef NO_FP2X_SQR
+static inline void
+fp2_sqr(fp2_t *x, const fp2_t *y)
+{ // x = y^2 using two fp_mul calls: re = (y.re + y.im)*(y.re - y.im), im = 2*y.re*y.im
+    fp_t sum, diff;
+
+    fp_add(&sum, &(y->re), &(y->im));     // sum = y.re + y.im
+    fp_sub(&diff, &(y->re), &(y->im));    // diff = y.re - y.im
+    fp_mul(&(x->im), &(y->re), &(y->im)); // x.im = y.re*y.im
+    fp_add(&(x->im), &(x->im), &(x->im)); // x.im = 2*y.re*y.im
+    fp_mul(&(x->re), &sum, &diff);        // x.re = (y.re + y.im)*(y.re - y.im)
+}
+#endif
+
+static inline void
+fp2_select(fp2_t *d, const fp2_t *a0, const fp2_t *a1, uint32_t ctl)
+{
+    fp_select(&(d->re), &(a0->re), &(a1->re), ctl);
+    fp_select(&(d->im), &(a0->im), &(a1->im), ctl);
+}
+
+static inline void
+fp2_cswap(fp2_t *a, fp2_t *b, uint32_t ctl)
+{
+    fp_cswap(&(a->re), &(b->re), ctl);
+    fp_cswap(&(a->im), &(b->im), ctl);
+}
+
+static inline void
+fp2_copy(fp2_t *x, const fp2_t *y)
+{
+    *x = *y;
+}
+
+// New functions
+void fp2_encode(void *dst, const fp2_t *a);
+uint32_t fp2_decode(fp2_t *d, const void *src);
+void fp2_inv(fp2_t *x);
+uint32_t fp2_is_square(const fp2_t *x);
+void fp2_sqrt(fp2_t *x);
+uint32_t fp2_sqrt_verify(fp2_t *a);
+void fp2_batched_inv(fp2_t *x, int len);
+void fp2_pow_vartime(fp2_t *out, const fp2_t *x, const uint64_t *exp, const int size);
+void fp2_print(const char *name, const fp2_t *a);
+
+#endif
--- a/src/gf/broadwell/lvl1/CMakeLists.txt
+++ b/src/gf/broadwell/lvl1/CMakeLists.txt
@@ -1,10 +1,6 @@
-
-set(SOURCE_FILES_GF_${SVARIANT_UPPER}_BROADWELL
-    fp_asm.S fp.c fp2.c
+set(SOURCE_FILES_GF_SPECIFIC
+    gf5248.c
+    fp_asm.S
 )

-add_library(${LIB_GF_${SVARIANT_UPPER}} ${SOURCE_FILES_GF_${SVARIANT_UPPER}_BROADWELL})
-target_include_directories(${LIB_GF_${SVARIANT_UPPER}} PRIVATE common ${INC_COMMON} ${INC_PRECOMP_${SVARIANT_UPPER}} include ${PROJECT_SOURCE_DIR}/include ${INC_COMMON})
-target_compile_options(${LIB_GF_${SVARIANT_UPPER}} PRIVATE ${C_OPT_FLAGS})
-
-add_subdirectory(test)
+include(../lvlx.cmake)
--- a/src/gf/broadwell/lvl1/Makefile
+++ b/src/gf/broadwell/lvl1/Makefile
@@ -1,46 +0,0 @@
-
-CC=gcc
-CFLAGS= -O3 -std=gnu11 -Wall -march=native -Wno-missing-braces -Wno-logical-not-parentheses 
-LDFLAGS=-lm
-AR=ar rcs
-RANLIB=ranlib
-
-OBJECTS=objs/fp_p1913.o objs/fp.o objs/fp2.o objs/fp_asm.o objs/random.o
-
-all: lib tests
-	
-objs/fp_p1913.o: fp_p1913.c
-	@mkdir -p $(@D)
-	$(CC) -c $(CFLAGS) fp_p1913.c -o objs/fp_p1913.o
-	
-objs/fp.o: fp.c
-	@mkdir -p $(@D)
-	$(CC) -c $(CFLAGS) fp.c -o objs/fp.o
-	
-objs/fp2.o: fp2.c
-	@mkdir -p $(@D)
-	$(CC) -c $(CFLAGS) fp2.c -o objs/fp2.o
-
-objs/fp_asm.o: fp_asm.S
-	$(CC) -c $(CFLAGS) fp_asm.S -o objs/fp_asm.o
-
-objs/random.o: ../../../common/generic/randombytes_system.c
-	$(CC) -c $(CFLAGS) ../../../common/generic/randombytes_system.c -o objs/random.o
-
-lib: $(OBJECTS)
-	rm -rf lib
-	mkdir lib
-	$(AR) lib/libtest.a $^
-	$(RANLIB) lib/libtest.a
-
-tests: lib
-	$(CC) $(CFLAGS) -L./lib test/test_fp.c test/test_extras.c -ltest $(LDFLAGS) -o test_fp -lgmp
-	$(CC) $(CFLAGS) -L./lib test/test_fp2.c test/test_extras.c -ltest $(LDFLAGS) -o test_fp2 -lgmp
-
-check: tests
-
-.PHONY: clean
-
-clean:
-	rm -rf *.req objs lib test_fp*
-
--- a/src/gf/broadwell/lvl1/fp.c
+++ b/src/gf/broadwell/lvl1/fp.c
@@ -1,192 +1,95 @@
-#include "include/fp.h"
+#include <assert.h>
+#include "fp.h"

-const uint64_t p[NWORDS_FIELD] =  { 0xffffffffffffffff, 0x252C9E49355147FF, 0x33A6A86587407437, 0x34E29E286B95D98C };
-const uint64_t R2[NWORDS_FIELD] = { 0x233625AE400674D4, 0x20AFD6C1025A1C2E, 0x30A841AB0920655D, 0x0D72E7D67C30CD3D };
-const uint64_t pp[NWORDS_FIELD] = { 0x01, 0x00, 0x00, 0x00 };
+const digit_t p[NWORDS_FIELD] = { 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x04ffffffffffffff };
+const digit_t p2[NWORDS_FIELD] = { 0xfffffffffffffffe, 0xffffffffffffffff, 0xffffffffffffffff, 0x09ffffffffffffff };

-
-void fp_set(digit_t* x, const digit_t val)
-{ // Set field element x = val, where val has wordsize
-
-    x[0] = val;
-    for (unsigned int i = 1; i < NWORDS_FIELD; i++) {
-        x[i] = 0;
-    }
-}
-
-void fp_mont_setone(digit_t* out1) {
-    out1[0] = 0x4;
-    out1[1] = UINT64_C(0x6b4d86db2abae000);
-    out1[2] = UINT64_C(0x31655e69e2fe2f23);
-    out1[3] = UINT64_C(0x2c75875e51a899cf);
-}
-
-bool fp_is_equal(const digit_t* a, const digit_t* b)
-{ // Compare two field elements in constant time
-  // Returns 1 (true) if a=b, 0 (false) otherwise
-    digit_t r = 0;
-
-    for (unsigned int i = 0; i < NWORDS_FIELD; i++)
-        r |= a[i] ^ b[i];
-
-    return (bool)is_digit_zero_ct(r);
-}
-
-bool fp_is_zero(const digit_t* a)
-{ // Is a field element zero?
-  // Returns 1 (true) if a=0, 0 (false) otherwise
-    digit_t r = 0;
-
-    for (unsigned int i = 0; i < NWORDS_FIELD; i++)
-        r |= a[i] ^ 0;
-
-    return (bool)is_digit_zero_ct(r);
-}
-
-void fp_copy(digit_t* out, const digit_t* a)
+void
+fp_sqrt(fp_t *x)
 {
-    memcpy(out, a, NWORDS_FIELD*RADIX/8);
+    (void)gf5248_sqrt(x, x);
 }

-void fp_neg(digit_t* out, const digit_t* a)
-{ // Modular negation, out = -a mod p
-  // Input: a in [0, p-1] 
-  // Output: out in [0, p-1] 
-    unsigned int i, borrow = 0;
-
-    for (i = 0; i < NWORDS_FIELD; i++) {
-        SUBC(out[i], borrow, ((digit_t*)p)[i], a[i], borrow);
-    }
-    fp_sub(out, out, (digit_t*)p);
+uint32_t
+fp_is_square(const fp_t *a)
+{
+    // ls is (0, 1, -1) and we want fp_is_square to return 0xFF..FF when ls is 1 or 0, 0x00..00 otherwise.
+    // NOTE(review): mapping ls = -1 to 0 relies on ls >> 1 being an arithmetic shift (implementation-defined in C).
+    int32_t ls = gf5248_legendre(a);
+    return ~(uint32_t)(ls >> 1);
 }

-void fp_tomont(digit_t* out, const digit_t* a)
-{ // Conversion to Montgomery representation
-  // out = a*R^2*R^(-1) mod p = a*R mod p, where a in [0, p-1].
-
-    fp_mul(out, a, (digit_t*)&R2);
+void
+fp_inv(fp_t *x)
+{
+    (void)gf5248_invert(x, x);
 }

-void fp_frommont(digit_t* out, const digit_t* a)
-{ // Conversion from Montgomery representation to standard representation
-  // out = a*R^(-1) mod p, where a in [0, p-1].
-    digit_t one[NWORDS_FIELD] = {0};
-
-    one[0] = 1;
-    fp_mul(out, a, one);
+void
+fp_exp3div4(fp_t *a)
+{
+    // Replaces a with a**((p-3)/4) in place, for p = 5*2^248 - 1.
+    // We optimise this by using the shape of the prime
+    // to avoid almost all multiplications:
+    //
+    // We write:
+    //     (p - 3) / 4 = (5*2^248 - 4) / 4
+    //                 = 5*2^246 - 1
+    //                 = 5*(2^246 - 1) + 4
+    // Then we first compute:
+    //     a246 = a**(2^246 - 1)
+    // Then from this we get the desired result as:
+    //     a**((p-3)/4) = a246**5 * a**4
+    // We can compute this with 12 multiplications and 247 squares.
+    fp_t z4, t3, t6, tmp;
+    // Compute a**3 and a**4
+    fp_sqr(&z4, a);
+    fp_mul(&tmp, a, &z4);
+    fp_sqr(&z4, &z4);
+    // Compute a**(2^3 - 1) = a**7
+    fp_mul(&t3, &tmp, &z4);
+    // Compute a**(2^6 - 1)
+    fp_sqr(&t6, &t3);
+    for (int i = 1; i < 3; i++)
+        fp_sqr(&t6, &t6);
+    fp_mul(&t6, &t6, &t3);
+    // Compute a**(2^12 - 1)
+    fp_sqr(a, &t6);
+    for (int i = 1; i < 6; i++)
+        fp_sqr(a, a);
+    fp_mul(a, a, &t6);
+    // Compute a**(2^15 - 1)
+    for (int i = 0; i < 3; i++)
+        fp_sqr(a, a);
+    fp_mul(a, a, &t3);
+    // Compute a**(2^30 - 1)
+    fp_sqr(&tmp, a);
+    for (int i = 1; i < 15; i++)
+        fp_sqr(&tmp, &tmp);
+    fp_mul(a, a, &tmp);
+    // Compute a**(2^60 - 1)
+    fp_sqr(&tmp, a);
+    for (int i = 1; i < 30; i++)
+        fp_sqr(&tmp, &tmp);
+    fp_mul(a, a, &tmp);
+    // Compute a**(2^120 - 1)
+    fp_sqr(&tmp, a);
+    for (int i = 1; i < 60; i++)
+        fp_sqr(&tmp, &tmp);
+    fp_mul(a, a, &tmp);
+    // Compute a**(2^123 - 1)
+    for (int i = 0; i < 3; i++)
+        fp_sqr(a, a);
+    fp_mul(a, a, &t3);
+    // Compute a**(2^246 - 1)
+    fp_sqr(&tmp, a);
+    for (int i = 1; i < 123; i++)
+        fp_sqr(&tmp, &tmp);
+    fp_mul(a, a, &tmp);
+    // Compute a**(5*(2^246 - 1))
+    fp_sqr(&tmp, a);
+    fp_sqr(&tmp, &tmp);
+    fp_mul(a, a, &tmp);
+    // Compute a**(5*(2^246 - 1) + 4)
+    fp_mul(a, a, &z4);
 }
-
-void MUL(digit_t* out, const digit_t a, const digit_t b)
-{ // Digit multiplication, digit*digit -> 2-digit result 
-  // Inputs: a, b in [0, 2^w-1], where w is the computer wordsize 
-  // Output: 0 < out < 2^(2w)-1    
-    register digit_t al, ah, bl, bh, temp;
-    digit_t albl, albh, ahbl, ahbh, res1, res2, res3, carry;
-    digit_t mask_low = (digit_t)(-1) >> (sizeof(digit_t)*4), mask_high = (digit_t)(-1) << (sizeof(digit_t)*4);
-
-    al = a & mask_low;                        // Low part
-    ah = a >> (sizeof(digit_t)*4);            // High part
-    bl = b & mask_low;
-    bh = b >> (sizeof(digit_t)*4);
-
-    albl = al * bl;
-    albh = al * bh;
-    ahbl = ah * bl;
-    ahbh = ah * bh;
-    out[0] = albl & mask_low;                 // out00
-
-    res1 = albl >> (sizeof(digit_t)*4);
-    res2 = ahbl & mask_low;
-    res3 = albh & mask_low;
-    temp = res1 + res2 + res3;
-    carry = temp >> (sizeof(digit_t)*4);
-    out[0] ^= temp << (sizeof(digit_t)*4);    // out01   
-
-    res1 = ahbl >> (sizeof(digit_t)*4);
-    res2 = albh >> (sizeof(digit_t)*4);
-    res3 = ahbh & mask_low;
-    temp = res1 + res2 + res3 + carry;
-    out[1] = temp & mask_low;                 // out10 
-    carry = temp & mask_high;
-    out[1] ^= (ahbh & mask_high) + carry;     // out11
-}
-
-digit_t mp_shiftr(digit_t* x, const unsigned int shift, const unsigned int nwords)
-{ // Multiprecision right shift
-    digit_t bit_out = x[0] & 1;
-
-    for (unsigned int i = 0; i < nwords-1; i++) {
-        SHIFTR(x[i+1], x[i], shift, x[i], RADIX);
-    }
-    x[nwords-1] >>= shift;
-    return bit_out;
-}
-
-void mp_shiftl(digit_t* x, const unsigned int shift, const unsigned int nwords)
-{ // Multiprecision left shift
-
-    for (int i = nwords-1; i > 0; i--) {
-        SHIFTL(x[i], x[i-1], shift, x[i], RADIX);
-    }
-    x[0] <<= shift;
-}
-
-static void fp_exp3div4(digit_t* out, const digit_t* a)
-{ // Fixed exponentiation out = a^((p-3)/4) mod p
-  // Input: a in [0, p-1] 
-  // Output: out in [0, p-1] 
-  // Requirement: p = 3(mod 4)
-    fp_t p_t, acc;
-    digit_t bit;
-
-    memcpy((digit_t*)p_t, (digit_t*)p, NWORDS_FIELD*RADIX/8);
-    memcpy((digit_t*)acc, (digit_t*)a, NWORDS_FIELD*RADIX/8);
-    mp_shiftr(p_t, 1, NWORDS_FIELD);
-    mp_shiftr(p_t, 1, NWORDS_FIELD);
-    fp_set(out, 1);
-    fp_tomont(out, out);
-
-    for (int i = 0; i < NWORDS_FIELD*RADIX-2; i++) {
-        bit = p_t[0] & 1;
-        mp_shiftr(p_t, 1, NWORDS_FIELD);
-        if (bit == 1) {
-            fp_mul(out, out, acc);
-        }
-        fp_sqr(acc, acc);
-    }
-}
-
-void fp_inv(digit_t* a)
-{ // Modular inversion, out = x^-1*R mod p, where R = 2^(w*nwords), w is the computer wordsize and nwords is the number of words to represent p
-  // Input: a=xR in [0, p-1] 
-  // Output: out in [0, p-1]. It outputs 0 if the input does not have an inverse  
-  // Requirement: Ceiling(Log(p)) < w*nwords
-    fp_t t;
-
-    fp_exp3div4(t, a);
-    fp_sqr(t, t);
-    fp_sqr(t, t);
-    fp_mul(a, t, a);    // a^(p-2)
-}
-
-bool fp_is_square(const digit_t* a)
-{ // Is field element a square?
-  // Output: out = 0 (false), 1 (true)
-    fp_t t, one;
-
-    fp_exp3div4(t, a);
-    fp_sqr(t, t);
-    fp_mul(t, t, a);    // a^((p-1)/2)
-    fp_frommont(t, t);
-    fp_set(one, 1);
-
-    return fp_is_equal(t, one);
-}
-
-void fp_sqrt(digit_t* a)
-{ // Square root computation, out = a^((p+1)/4) mod p
-    fp_t t;
-
-    fp_exp3div4(t, a);
-    fp_mul(a, t, a);    // a^((p+1)/4)
-}
--- a/src/gf/broadwell/lvl1/fp2.c
+++ b/src/gf/broadwell/lvl1/fp2.c
@@ -1,190 +0,0 @@
-#include <fp2.h>
-
-extern const digit_t R[NWORDS_FIELD];
-
-extern void fp2_sq_c0(fp2_t *out, const fp2_t *in);
-extern void fp2_sq_c1(fp_t *out, const fp2_t *in);
-
-extern void fp2_mul_c0(fp_t *out, const fp2_t *in0, const fp2_t *in1);
-extern void fp2_mul_c1(fp_t *out, const fp2_t *in0, const fp2_t *in1);
-
-/* Arithmetic modulo X^2 + 1 */
-
-void fp2_set(fp2_t* x, const digit_t val)
-{
-    fp_set(x->re, val);
-    fp_set(x->im, 0);
-}
-
-bool fp2_is_zero(const fp2_t* a)
-{ // Is a GF(p^2) element zero?
-  // Returns 1 (true) if a=0, 0 (false) otherwise
-
-    return fp_is_zero(a->re) & fp_is_zero(a->im);
-}
-
-bool fp2_is_equal(const fp2_t* a, const fp2_t* b)
-{ // Compare two GF(p^2) elements in constant time
-  // Returns 1 (true) if a=b, 0 (false) otherwise
-
-    return fp_is_equal(a->re, b->re) & fp_is_equal(a->im, b->im);
-}
-
-void fp2_copy(fp2_t* x, const fp2_t* y)
-{
-    fp_copy(x->re, y->re);
-    fp_copy(x->im, y->im);
-}
-
-fp2_t fp2_non_residue()
-{ // 2 + i is a quadratic non-residue for p1913
-    fp_t one = {0};
-    fp2_t res;
-
-    one[0] = 1;
-    fp_tomont(one, one);
-    fp_add(res.re, one, one);
-    fp_copy(res.im, one);
-    return res;
-}
-
-void fp2_add(fp2_t* x, const fp2_t* y, const fp2_t* z)
-{
-    fp_add(x->re, y->re, z->re);
-    fp_add(x->im, y->im, z->im);
-}
-
-void fp2_sub(fp2_t* x, const fp2_t* y, const fp2_t* z)
-{
-    fp_sub(x->re, y->re, z->re);
-    fp_sub(x->im, y->im, z->im);
-}
-
-void fp2_neg(fp2_t* x, const fp2_t* y)
-{
-    fp_neg(x->re, y->re);
-    fp_neg(x->im, y->im);
-}
-
-void fp2_mul(fp2_t* x, const fp2_t* y, const fp2_t* z)
-{
-    fp_t t;
-
-    fp2_mul_c0(&t, y, z);              // c0 = a0*b0 - a1*b1
-    fp2_mul_c1(&x->im, y, z);          // c1 = a0*b1 + a1*b0 
-    x->re[0] = t[0]; x->re[1] = t[1]; x->re[2] = t[2]; x->re[3] = t[3];
-}
-
-void fp2_sqr(fp2_t* x, const fp2_t* y) {
-    fp2_t t;
-
-    fp2_sq_c0(&t, y);               // c0 = (a0+a1)(a0-a1)
-    fp2_sq_c1(&x->im, y);           // c1 = 2a0*a1
-    x->re[0] = t.re[0]; x->re[1] = t.re[1]; x->re[2] = t.re[2]; x->re[3] = t.re[3];
-}
-
-void fp2_inv(fp2_t* x)
-{
-    fp_t t0, t1;
-
-    fp_sqr(t0, x->re);
-    fp_sqr(t1, x->im);
-    fp_add(t0, t0, t1);
-    fp_inv(t0);
-    fp_mul(x->re, x->re, t0);
-    fp_mul(x->im, x->im, t0);
-    fp_neg(x->im, x->im);
-}
-
-bool fp2_is_square(const fp2_t* x)
-{
-    fp_t t0, t1;
-
-    fp_sqr(t0, x->re);
-    fp_sqr(t1, x->im);
-    fp_add(t0, t0, t1);
-
-    return fp_is_square(t0);
-}
-
-void fp2_frob(fp2_t* x, const fp2_t* y)
-{
-    memcpy((digit_t*)x->re, (digit_t*)y->re, NWORDS_FIELD*RADIX/8);
-    fp_neg(x->im, y->im);
-}
-
-void fp2_tomont(fp2_t* x, const fp2_t* y)
-{ 
-    fp_tomont(x->re, y->re);
-    fp_tomont(x->im, y->im);
-}
-
-void fp2_frommont(fp2_t* x, const fp2_t* y)
-{
-    fp_frommont(x->re, y->re);
-    fp_frommont(x->im, y->im);
-}
-
-// NOTE: old, non-constant-time implementation. Could be optimized
-void fp2_sqrt(fp2_t* x)
-{
-    fp_t sdelta, re, tmp1, tmp2, inv2, im;
-
-    if (fp_is_zero(x->im)) {
-        if (fp_is_square(x->re)) {
-            fp_sqrt(x->re);
-            return;
-        } else {
-            fp_neg(x->im, x->re);
-            fp_sqrt(x->im);
-            fp_set(x->re, 0);
-            return;
-        }
-    }
-
-    // sdelta = sqrt(re^2 + im^2)
-    fp_sqr(sdelta, x->re);
-    fp_sqr(tmp1, x->im);
-    fp_add(sdelta, sdelta, tmp1);
-    fp_sqrt(sdelta);
-
-    fp_set(inv2, 2);
-    fp_tomont(inv2, inv2);     // inv2 <- 2
-    fp_inv(inv2);
-    fp_add(re, x->re, sdelta);
-    fp_mul(re, re, inv2);
-    memcpy((digit_t*)tmp2, (digit_t*)re, NWORDS_FIELD*RADIX/8);
-
-    if (!fp_is_square(tmp2)) {
-        fp_sub(re, x->re, sdelta);
-        fp_mul(re, re, inv2);
-    }
-
-    fp_sqrt(re);
-    memcpy((digit_t*)im, (digit_t*)re, NWORDS_FIELD*RADIX/8);
-
-    fp_inv(im);
-    fp_mul(im, im, inv2);
-    fp_mul(x->im, im, x->im);    
-    memcpy((digit_t*)x->re, (digit_t*)re, NWORDS_FIELD*RADIX/8);
-}
-
-// Lexicographic comparison of two field elements. Returns +1 if x > y, -1 if x < y, 0 if x = y
-int fp2_cmp(fp2_t* x, fp2_t* y){
-    fp2_t a, b;
-    fp2_frommont(&a, x);
-    fp2_frommont(&b, y);
-    for(int i = NWORDS_FIELD-1; i >= 0; i--){
-        if(a.re[i] > b.re[i])
-            return 1;
-        if(a.re[i] < b.re[i])
-            return -1;
-    }
-    for(int i = NWORDS_FIELD-1; i >= 0; i--){
-        if(a.im[i] > b.im[i])
-            return 1;
-        if(a.im[i] < b.im[i])
-            return -1;
-    }
-    return 0;
-}
--- a/src/gf/broadwell/lvl1/fp_asm.S
+++ b/src/gf/broadwell/lvl1/fp_asm.S
@@ -1,17 +1,27 @@
+#include <sqisign_namespace.h>
 .intel_syntax noprefix

 .set pbytes,32
 .set plimbs,4

-.global p_plus_1
-p_plus_1: .quad 0x0000000000000000, 0x252C9E4935514800, 0x33A6A86587407437, 0x34E29E286B95D98C
+#ifdef __APPLE__
+.section __TEXT,__const
+#else
+.section .rodata
+#endif
+p_plus_1: .quad 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0500000000000000
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+#include <asm_preamble.h>

 .text
 .p2align 4,,15

 .global fp_add
 fp_add:
-  push   r12  
  xor    rax, rax
  mov    r8, [rsi]
  mov    r9, [rsi+8]
@@ -20,36 +30,34 @@ fp_add:
  add    r8, [rdx] 
  adc    r9, [rdx+8] 
  adc    r10, [rdx+16] 
-  adc    r11, [rdx+24] 
-  mov    r12, [rip+p]
-  sub    r8, r12
-  mov    rcx, [rip+p+8]
-  sbb    r9, rcx
-  mov    rsi, [rip+p+16]
-  sbb    r10, rsi
+  adc    r11, [rdx+24]
+  mov    rax, r11
+  shr    rax, 59
+  neg    rax
  mov    rdx, [rip+p+24]
-  sbb    r11, rdx
-  sbb    rax, 0
-  
-  and    r12, rax
-  and    rcx, rax
-  and    rsi, rax
  and    rdx, rax
+  sub    r8, rax
+  sbb    r9, rax
+  sbb    r10, rax
+  sbb    r11, rdx
  
-  add    r8, r12  
-  adc    r9, rcx  
-  adc    r10, rsi  
-  adc    r11, rdx 
+  mov    rax, r11
+  shr    rax, 59
+  neg    rax
+  mov    rdx, [rip+p+24]
+  and    rdx, rax
+  sub    r8, rax
+  sbb    r9, rax
+  sbb    r10, rax
+  sbb    r11, rdx
  mov    [rdi], r8
  mov    [rdi+8], r9 
  mov    [rdi+16], r10 
  mov    [rdi+24], r11
-  pop    r12
  ret

 .global fp_sub
 fp_sub:
-  push   r12  
  xor    rax, rax
  mov    r8, [rsi]
  mov    r9, [rsi+8]
@@ -61,23 +69,26 @@ fp_sub:
  sbb    r11, [rdx+24]
  sbb    rax, 0
  
-  mov    r12, [rip+p]
-  mov    rcx, [rip+p+8]
-  mov    rsi, [rip+p+16]
  mov    rdx, [rip+p+24]
-  and    r12, rax
-  and    rcx, rax
-  and    rsi, rax
  and    rdx, rax  
-  add    r8, r12  
-  adc    r9, rcx 
-  adc    r10, rsi  
+  add    r8, rax  
+  adc    r9, rax
+  adc    r10, rax  
  adc    r11, rdx 
+  
+  mov    rax, r11
+  sar    rax, 59  
+  mov    rdx, [rip+p+24]
+  and    rdx, rax  
+  add    r8, rax  
+  adc    r9, rax
+  adc    r10, rax  
+  adc    r11, rdx
+
  mov    [rdi], r8
  mov    [rdi+8], r9 
  mov    [rdi+16], r10 
  mov    [rdi+24], r11 
-  pop    r12
  ret
  
 ///////////////////////////////////////////////////////////////// MACROS
@@ -105,18 +116,11 @@ fp_sub:
    adc    \Z4, 0   
 .endm

-.macro MULADD64x192 M1, Z0, Z1, Z2, Z3, T0, T1
+.macro MULADD64x64 M1, Z0, Z1, Z2, Z3, T0, T1
    mulx   \T0, \T1, \M1     // A0*B0
    xor    rax, rax
-    adox   \Z0, \T1
-    adox   \Z1, \T0  
-    mulx   \T0, \T1, 8\M1    // A0*B1
-    adcx   \Z1, \T1
-    adox   \Z2, \T0    
-    mulx   \T0, \T1, 16\M1   // A0*B2
-    adcx   \Z2, \T1
+    adox   \Z2, \T1
    adox   \Z3, \T0
-    adc    \Z3, 0   
 .endm
  
 //***********************************************************************
@@ -133,13 +137,13 @@ fp2_mul_c0:
    push   r14   
    mov    rcx, rdx
 	
-	// [rdi0:3] <- p - b1
-	mov    r8, [rip+p]  
-	mov    r9, [rip+p+8]   
-	mov    r10, [rip+p+16]
-	mov    r11, [rip+p+24] 
+	// [rdi0:3] <- 2p - b1
+	mov    r8, [rip+p2] 
+	mov    r9, [rip+p2+8] 
+	mov    r10, r9
+	mov    r11, [rip+p2+24] 
 	mov    rax, [rcx+32]
-	mov    rdx, [rcx+40]        
+	mov    rdx, [rcx+40]
 	sub    r8, rax
 	sbb    r9, rdx
 	mov    rax, [rcx+48]
@@ -167,7 +171,7 @@ fp2_mul_c0:
    MULADD64x256 [rsi+32], r8, r9, r10, r11, r12, r13, r14, rax
    // [r9:r12] <- z = (z0 x p_plus_1 + z)/2^64
    mov    rdx, r8                 // rdx <- z0 
-    MULADD64x192 [rip+p_plus_1+8], r9, r10, r11, r12, r13, r14
+    MULADD64x64 [rip+p_plus_1+24], r9, r10, r11, r12, r13, r14
    
    // [r9:r12, r8] <- z = a0 x b01 - a1 x b11 + z 
    mov    rdx, [rcx+8]
@@ -176,7 +180,7 @@ fp2_mul_c0:
    MULADD64x256 [rsi+32], r9, r10, r11, r12, r8, r13, r14, rax
    // [r10:r12, r8] <- z = (z0 x p_plus_1 + z)/2^64
    mov    rdx, r9                 // rdx <- z0 
-    MULADD64x192 [rip+p_plus_1+8], r10, r11, r12, r8, r13, r14
+    MULADD64x64 [rip+p_plus_1+24], r10, r11, r12, r8, r13, r14
    
    // [r10:r12, r8:r9] <- z = a0 x b02 - a1 x b12 + z 
    mov    rdx, [rcx+16]
@@ -185,7 +189,7 @@ fp2_mul_c0:
    MULADD64x256 [rsi+32], r10, r11, r12, r8, r9, r13, r14, rax
    // [r11:r12, r8:r9] <- z = (z0 x p_plus_1 + z)/2^64
    mov    rdx, r10                // rdx <- z0 
-    MULADD64x192 [rip+p_plus_1+8], r11, r12, r8, r9, r13, r14
+    MULADD64x64 [rip+p_plus_1+24], r11, r12, r8, r9, r13, r14
    
    // [r11:r12, r8:r10] <- z = a0 x b03 - a1 x b13 + z 
    mov    rdx, [rcx+24]
@@ -194,27 +198,8 @@ fp2_mul_c0:
    MULADD64x256 [rsi+32], r11, r12, r8, r9, r10, r13, r14, rax
    // [r12, r8:r10] <- z = (z0 x p_plus_1 + z)/2^64
    mov    rdx, r11                // rdx <- z0 
-    MULADD64x192 [rip+p_plus_1+8], r12, r8, r9, r10, r13, r14
+    MULADD64x64 [rip+p_plus_1+24], r12, r8, r9, r10, r13, r14

-	// Final correction                        
-	mov    rsi, [rip+p]
-	mov    rcx, [rip+p+8]
-	mov    rdx, [rip+p+16]
-	mov    r11, [rip+p+24]
-	sub    r12, rsi
-	sbb    r8, rcx
-	sbb    r9, rdx
-	sbb    r10, r11
-	sbb    rax, 0
-	and    rsi, rax
-	and    rcx, rax
-	and    rdx, rax
-	and    r11, rax
-	add    r12, rsi
-	adc    r8, rcx
-	adc    r9, rdx
-	adc    r10, r11
-    
    mov    [rdi], r12          
    mov    [rdi+8], r8         
    mov    [rdi+16], r9         
@@ -254,7 +239,7 @@ fp2_mul_c1:
    MULADD64x256 [rsi+32], r8, r9, r10, r11, r12, r13, r14, rax
    // [r9:r12] <- z = (z0 x p_plus_1 + z)/2^64
    mov    rdx, r8                 // rdx <- z0 
-    MULADD64x192 [rip+p_plus_1+8], r9, r10, r11, r12, r13, r14
+    MULADD64x64 [rip+p_plus_1+24], r9, r10, r11, r12, r13, r14
    
    // [r9:r12, r8] <- z = a0 x b01 - a1 x b11 + z 
    mov    rdx, [rcx+40]
@@ -263,7 +248,7 @@ fp2_mul_c1:
    MULADD64x256 [rsi+32], r9, r10, r11, r12, r8, r13, r14, rax
    // [r10:r12, r8] <- z = (z0 x p_plus_1 + z)/2^64
    mov    rdx, r9                 // rdx <- z0 
-    MULADD64x192 [rip+p_plus_1+8], r10, r11, r12, r8, r13, r14
+    MULADD64x64 [rip+p_plus_1+24], r10, r11, r12, r8, r13, r14
    
    // [r10:r12, r8:r9] <- z = a0 x b02 - a1 x b12 + z 
    mov    rdx, [rcx+48]
@@ -272,7 +257,7 @@ fp2_mul_c1:
    MULADD64x256 [rsi+32], r10, r11, r12, r8, r9, r13, r14, rax
    // [r11:r12, r8:r9] <- z = (z0 x p_plus_1 + z)/2^64
    mov    rdx, r10                // rdx <- z0 
-    MULADD64x192 [rip+p_plus_1+8], r11, r12, r8, r9, r13, r14
+    MULADD64x64 [rip+p_plus_1+24], r11, r12, r8, r9, r13, r14
    
    // [r11:r12, r8:r10] <- z = a0 x b03 - a1 x b13 + z 
    mov    rdx, [rcx+56]
@@ -281,27 +266,8 @@ fp2_mul_c1:
    MULADD64x256 [rsi+32], r11, r12, r8, r9, r10, r13, r14, rax
    // [r12, r8:r10] <- z = (z0 x p_plus_1 + z)/2^64
    mov    rdx, r11                // rdx <- z0 
-    MULADD64x192 [rip+p_plus_1+8], r12, r8, r9, r10, r13, r14
+    MULADD64x64 [rip+p_plus_1+24], r12, r8, r9, r10, r13, r14

-	// Final correction                        
-	mov    rsi, [rip+p]
-	mov    rcx, [rip+p+8]
-	mov    rdx, [rip+p+16]
-	mov    r11, [rip+p+24]
-	sub    r12, rsi
-	sbb    r8, rcx
-	sbb    r9, rdx
-	sbb    r10, r11
-	sbb    rax, 0
-	and    rsi, rax
-	and    rcx, rax
-	and    rdx, rax
-	and    r11, rax
-	add    r12, rsi
-	adc    r8, rcx
-	adc    r9, rdx
-	adc    r10, r11
-    
    mov    [rdi], r12          
    mov    [rdi+8], r8         
    mov    [rdi+16], r9         
@@ -322,28 +288,28 @@ fp2_mul_c1:
 .macro FPMUL256x256 M0, M1, Z0, Z1, Z2, Z3, Z4, T0, T1           
    // [Z1:Z4] <- z = (z0 x p_plus_1 + z)/2^64
    mov    rdx, \Z0                 // rdx <- z0
-    MULADD64x192 [rip+p_plus_1+8], \Z1, \Z2, \Z3, \Z4, \T0, \T1
+    MULADD64x64 [rip+p_plus_1+24], \Z1, \Z2, \Z3, \Z4, \T0, \T1
    
    // [Z1:Z4, Z0] <- z = a01 x a1 + z 
    mov    rdx, 8\M0
    MULADD64x256 \M1, \Z1, \Z2, \Z3, \Z4, \Z0, \T0, \T1, \Z0
    // [Z2:Z4, Z0] <- z = (z0 x p_plus_1 + z)/2^64
    mov    rdx, \Z1                 // rdx <- z0
-    MULADD64x192 [rip+p_plus_1+8], \Z2, \Z3, \Z4, \Z0, \T0, \T1
+    MULADD64x64 [rip+p_plus_1+24], \Z2, \Z3, \Z4, \Z0, \T0, \T1
    
    // [Z2:Z4, Z0:Z1] <- z = a02 x a1 + z  
    mov    rdx, 16\M0
    MULADD64x256 \M1, \Z2, \Z3, \Z4, \Z0, \Z1, \T0, \T1, \Z1
    // [Z3:Z4, Z0:Z1] <- z = (z0 x p_plus_1 + z)/2^64
    mov    rdx, \Z2                // rdx <- z0
-    MULADD64x192 [rip+p_plus_1+8], \Z3, \Z4, \Z0, \Z1, \T0, \T1
+    MULADD64x64 [rip+p_plus_1+24], \Z3, \Z4, \Z0, \Z1, \T0, \T1
    
    // [Z3:Z4, Z0:Z2] <- z = a03 x a1 + z
    mov    rdx, 24\M0
    MULADD64x256 \M1, \Z3, \Z4, \Z0, \Z1, \Z2, \T0, \T1, \Z2
    // [Z4, Z0:Z2] <- z = (z0 x p_plus_1 + z)/2^64
    mov    rdx, \Z3                // rdx <- z0
-    MULADD64x192 [rip+p_plus_1+8], \Z4, \Z0, \Z1, \Z2, \T0, \T1
+    MULADD64x64 [rip+p_plus_1+24], \Z4, \Z0, \Z1, \Z2, \T0, \T1
 .endm

 //***********************************************************************
@@ -371,19 +337,21 @@ fp2_sq_c0:
 	mov    [rdi+16], r10
 	mov    [rdi+24], r11
 	
-	// a0 - a1 + p
+	// a0 - a1 + 2p
 	mov    r8, [rsi]
 	mov    r10, [rsi+8]
 	mov    r12, [rsi+16]
 	mov    r13, [rsi+24]
 	sub    r8, [rsi+32]
 	sbb    r10, [rsi+40]
-	sbb    r12, [rsi+48] 
+	sbb    r12, [rsi+48]
 	sbb    r13, [rsi+56]
-	add    r8, [rip+p]                    
-	adc    r10, [rip+p+8]
-	adc    r12, [rip+p+16]
-	adc    r13, [rip+p+24]
+	mov    rax, [rip+p2]
+	add    r8, rax   
+	mov    rax, [rip+p2+8]                 
+	adc    r10, rax
+	adc    r12, rax
+	adc    r13, [rip+p2+24]
 	mov    [rdi+32], r8               
 	mov    [rdi+40], r10 
 	mov    [rdi+48], r12 
@@ -402,25 +370,6 @@ fp2_sq_c0:

    FPMUL256x256 [rdi], [rdi+32], r8, r9, r10, r11, r12, r13, rcx

-	// Final correction                        
-	mov    rsi, [rip+p]
-	mov    rcx, [rip+p+8]
-	mov    rdx, [rip+p+16]
-	mov    r11, [rip+p+24]
-	sub    r12, rsi
-	sbb    r8, rcx
-	sbb    r9, rdx
-	sbb    r10, r11
-	sbb    rax, 0
-	and    rsi, rax
-	and    rcx, rax
-	and    rdx, rax
-	and    r11, rax
-	add    r12, rsi
-	adc    r8, rcx
-	adc    r9, rdx
-	adc    r10, r11
-    
    mov    [rdi], r12          
    mov    [rdi+8], r8         
    mov    [rdi+16], r9         
@@ -465,27 +414,8 @@ fp2_sq_c1:
    adox   r12, rax 

 	FPMUL256x256 [rsp], [rsi+32], r8, r9, r10, r11, r12, r13, rcx
-	add    rsp, 32
+	add    rsp, 32 

-	// Final correction                        
-	mov    rsi, [rip+p]
-	mov    rcx, [rip+p+8]
-	mov    rdx, [rip+p+16]
-	mov    r11, [rip+p+24]
-	sub    r12, rsi
-	sbb    r8, rcx
-	sbb    r9, rdx
-	sbb    r10, r11
-	sbb    rax, 0
-	and    rsi, rax
-	and    rcx, rax
-	and    rdx, rax
-	and    r11, rax
-	add    r12, rsi
-	adc    r8, rcx
-	adc    r9, rdx
-	adc    r10, r11
-    
    mov    [rdi], r12          
    mov    [rdi+8], r8         
    mov    [rdi+16], r9         
@@ -521,26 +451,7 @@ fp_mul:

 	FPMUL256x256 [rcx], [rsi], r8, r9, r10, r11, r12, r13, r14

-	// Final correction                        
-	mov    rsi, [rip+p]
-	mov    rcx, [rip+p+8]
-	mov    rdx, [rip+p+16]
-	mov    r11, [rip+p+24]
-	sub    r12, rsi
-	sbb    r8, rcx
-	sbb    r9, rdx
-	sbb    r10, r11
-	sbb    rax, 0
-	and    rsi, rax
-	and    rcx, rax
-	and    rdx, rax
-	and    r11, rax
-	add    r12, rsi
-	adc    r8, rcx
-	adc    r9, rdx
-	adc    r10, r11
-    
-    mov    [rdi], r12          
+    mov    [rdi], r12        
    mov    [rdi+8], r8         
    mov    [rdi+16], r9         
    mov    [rdi+24], r10  
@@ -552,4 +463,4 @@ fp_mul:
 .global fp_sqr
 fp_sqr:
    mov rdx, rsi
-    jmp fp_mul
+    jmp fp_mul
--- a/src/gf/broadwell/lvl1/gf5248.c
+++ b/src/gf/broadwell/lvl1/gf5248.c
@@ -0,0 +1,767 @@
+/*
+ * This code is derived from discussions with Thomas Pornin
+ */
+
+#include "gf5248.h"
+
+// see gf5248.h
+const gf5248 gf5248_ZERO = { 0, 0, 0, 0 };
+
+// see gf5248.h
+const gf5248 gf5248_ONE = { 0x0000000000000033, 0x0000000000000000, 0x0000000000000000, 0x0100000000000000 };
+
+// see gf5248.h
+const gf5248 gf5248_MINUS_ONE = { 0xFFFFFFFFFFFFFFCC, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x03FFFFFFFFFFFFFF };
+
+// Montgomery representation of 2^256.
+static const gf5248 R2 = { 0x3333333333333d70, 0x3333333333333333, 0x3333333333333333, 0x0333333333333333 };
+
+// The modulus itself (this is also a valid representation of zero).
+static const gf5248 MODULUS = { 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x04FFFFFFFFFFFFFF };
+
+// 1/2^244 (in Montgomery representation).
+static const gf5248 INVT244 = { 0x0000000000001000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 };
+
+static const gf5248 PM1O3 = { 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa, 0x01aaaaaaaaaaaaaa };
+
+// Normalize value *a into *d: subtract the modulus q once, then add it back if that went negative.
+static inline void
+inner_gf5248_normalize(gf5248 *d, const gf5248 *a)
+{
+    uint64_t d0, d1, d2, d3, m;
+    unsigned char cc;
+
+    // Subtract q (same limb constants as MODULUS), chaining the borrow through cc.
+    cc = inner_gf5248_sbb(0, a->v0, 0xFFFFFFFFFFFFFFFF, &d0);
+    cc = inner_gf5248_sbb(cc, a->v1, 0xFFFFFFFFFFFFFFFF, &d1);
+    cc = inner_gf5248_sbb(cc, a->v2, 0xFFFFFFFFFFFFFFFF, &d2);
+    cc = inner_gf5248_sbb(cc, a->v3, 0x04FFFFFFFFFFFFFF, &d3);
+
+    // Add back q if the result is negative: m is built from the final borrow and used as a mask on q's limbs.
+    (void)inner_gf5248_sbb(cc, 0, 0, &m);
+    cc = inner_gf5248_adc(0, d0, m, &d0);
+    cc = inner_gf5248_adc(cc, d1, m, &d1);
+    cc = inner_gf5248_adc(cc, d2, m, &d2);
+    (void)inner_gf5248_adc(cc, d3, m & 0x04FFFFFFFFFFFFFF, &d3);
+
+    d->v0 = d0;
+    d->v1 = d1;
+    d->v2 = d2;
+    d->v3 = d3;
+}
+
+// Expand the most significant bit of x into a full-width 64-bit word
+// (0x0000000000000000 or 0xFFFFFFFFFFFFFFFF).
+static inline uint64_t
+sgnw(uint64_t x)
+{
+    return (uint64_t)(*(int64_t *)&x >> 63); // reinterpreted as signed so that >> replicates the top bit (relies on arithmetic right shift)
+}
+
+// d <- u*f + v*g  (in the field)
+// Coefficients f and g are provided as unsigned integers, but they
+// really are signed values which must be less than 2^62 (in absolute value).
+static void
+gf5248_lin(gf5248 *d, const gf5248 *u, const gf5248 *v, uint64_t f, uint64_t g)
+{
+    // f <- abs(f), keeping the sign in sf, and negating u accordingly
+    uint64_t sf = sgnw(f);
+    f = (f ^ sf) - sf;
+    gf5248 tu;
+    gf5248_neg(&tu, u);
+    gf5248_select(&tu, u, &tu, (uint32_t)sf);
+
+    // g <- abs(g), keeping the sign in sg, and negating v accordingly
+    uint64_t sg = sgnw(g);
+    g = (g ^ sg) - sg;
+    gf5248 tv;
+    gf5248_neg(&tv, v);
+    gf5248_select(&tv, v, &tv, (uint32_t)sg);
+
+    // Linear combination over plain integers.
+    uint64_t d0, d1, d2, d3, t;
+    inner_gf5248_umul_x2(d0, t, tu.v0, f, tv.v0, g);
+    inner_gf5248_umul_x2_add(d1, t, tu.v1, f, tv.v1, g, t);
+    inner_gf5248_umul_x2_add(d2, t, tu.v2, f, tv.v2, g, t);
+    inner_gf5248_umul_x2_add(d3, t, tu.v3, f, tv.v3, g, t);
+
+    // Reduction: split into low part (248 bits) and high part
+    // (71 bits, since t can be up to 63 bits). If the high
+    // part is h, then:
+    //    h*2^248 = (h mod 5)*2^248 + floor(h/5)  mod q
+    uint64_t h0 = (d3 >> 56) | (t << 8);
+    uint64_t h1 = t >> 56;
+    d3 &= 0x00FFFFFFFFFFFFFF;
+    uint64_t z0, z1, quo0, rem0, quo1, rem1;
+    inner_gf5248_umul(z0, z1, h0, 0xCCCCCCCCCCCCCCCD);
+    (void)z0;
+    quo0 = z1 >> 2;
+    rem0 = h0 - (5 * quo0);
+    quo1 = (h1 * 0xCD) >> 10;
+    rem1 = h1 - (5 * quo1);
+
+    // h = rem0 + 5*quo0 + (rem1 + 5*quo1)*2^64
+    //   = rem0 + rem1 + 5*(quo0 + quo1*2^64 + rem1*((2^64 - 1)/5))
+    // We add rem0 and rem1 modulo 5, with an extra carry that
+    // goes into the folded part (multiple of 5).
+    uint64_t e, f0, f1;
+    unsigned char cc;
+    cc = inner_gf5248_adc(0, rem0 + 0xFFFFFFFFFFFFFFFA, rem1, &e);
+    cc = inner_gf5248_adc(cc, quo0, rem1 * 0x3333333333333333, &f0);
+    (void)inner_gf5248_adc(cc, quo1, 0, &f1);
+    e -= 0xFFFFFFFFFFFFFFFA;
+
+    // Now we only have to add e*2^248 + f0:f1 to the low part.
+    cc = inner_gf5248_adc(0, d0, f0, &d0);
+    cc = inner_gf5248_adc(cc, d1, f1, &d1);
+    cc = inner_gf5248_adc(cc, d2, 0, &d2);
+    (void)inner_gf5248_adc(cc, d3, e << 56, &d3);
+
+    d->v0 = d0;
+    d->v1 = d1;
+    d->v2 = d2;
+    d->v3 = d3;
+}
+
+// d <- abs(floor((a*f + b*g) / 2^31))
+// Coefficients f and g are provided as unsigned integers, but they really
+// are signed values, which MUST be at most 2^31 in absolute value.
+// The computation is performed over the integers, not modulo q. The low
+// 31 bits are dropped (in practice, callers provide appropriate coefficients
+// f and g such that a*f + b*g is a multiple of 2^31).
+//
+// If a*f + b*g is negative, then the absolute value is computed, and the
+// function returns 0xFFFFFFFFFFFFFFFF; otherwise, the function returns
+// 0x0000000000000000.
+static uint64_t
+lindiv31abs(gf5248 *d, const gf5248 *a, const gf5248 *b, uint64_t f, uint64_t g)
+{
+    // f <- abs(f), keeping the sign in sf
+    uint64_t sf = sgnw(f);
+    f = (f ^ sf) - sf;
+
+    // g <- abs(g), keeping the sign in sg
+    uint64_t sg = sgnw(g);
+    g = (g ^ sg) - sg;
+
+    // Apply the signs of f and g to the source operands.
+    uint64_t a0, a1, a2, a3, a4;
+    uint64_t b0, b1, b2, b3, b4;
+    unsigned char cc;
+
+    cc = inner_gf5248_sbb(0, a->v0 ^ sf, sf, &a0);
+    cc = inner_gf5248_sbb(cc, a->v1 ^ sf, sf, &a1);
+    cc = inner_gf5248_sbb(cc, a->v2 ^ sf, sf, &a2);
+    cc = inner_gf5248_sbb(cc, a->v3 ^ sf, sf, &a3);
+    (void)inner_gf5248_sbb(cc, 0, 0, &a4);
+
+    cc = inner_gf5248_sbb(0, b->v0 ^ sg, sg, &b0);
+    cc = inner_gf5248_sbb(cc, b->v1 ^ sg, sg, &b1);
+    cc = inner_gf5248_sbb(cc, b->v2 ^ sg, sg, &b2);
+    cc = inner_gf5248_sbb(cc, b->v3 ^ sg, sg, &b3);
+    (void)inner_gf5248_sbb(cc, 0, 0, &b4);
+
+    // Compute a*f + b*g into d0:d1:d2:d3:d4. Since f and g are at
+    // most 2^31, we can add two 128-bit products with no overflow.
+    // Note: a4 and b4 are both in {0, -1}.
+    uint64_t d0, d1, d2, d3, d4, t;
+    inner_gf5248_umul_x2(d0, t, a0, f, b0, g);
+    inner_gf5248_umul_x2_add(d1, t, a1, f, b1, g, t);
+    inner_gf5248_umul_x2_add(d2, t, a2, f, b2, g, t);
+    inner_gf5248_umul_x2_add(d3, t, a3, f, b3, g, t);
+    d4 = t - (a4 & f) - (b4 & g);
+
+    // Right-shift the value by 31 bits.
+    d0 = (d0 >> 31) | (d1 << 33);
+    d1 = (d1 >> 31) | (d2 << 33);
+    d2 = (d2 >> 31) | (d3 << 33);
+    d3 = (d3 >> 31) | (d4 << 33);
+
+    // If the result is negative, negate it.
+    t = sgnw(d4);
+    cc = inner_gf5248_sbb(0, d0 ^ t, t, &d0);
+    cc = inner_gf5248_sbb(cc, d1 ^ t, t, &d1);
+    cc = inner_gf5248_sbb(cc, d2 ^ t, t, &d2);
+    (void)inner_gf5248_sbb(cc, d3 ^ t, t, &d3);
+
+    d->v0 = d0;
+    d->v1 = d1;
+    d->v2 = d2;
+    d->v3 = d3;
+    return t;
+}
+
+// lzcnt(x) returns the number of leading bits of value 0 in x. It supports
+// x == 0 (in which case the function returns 64).
+#if defined __LZCNT__
+static inline uint64_t
+lzcnt(uint64_t x)
+{
+    return _lzcnt_u64(x); // intrinsic path, selected when the compiler defines __LZCNT__
+}
+#else
+static inline uint64_t
+lzcnt(uint64_t x)
+{
+    uint64_t m, s;
+    m = sgnw((x >> 32) - 1); // all-ones iff the upper 32 bits of x are zero
+    s = m & 32; // in that case, account for 32 leading zeros
+    x = (x >> 32) ^ (m & (x ^ (x >> 32))); // keep the low half if m is set, else the high half
+    m = sgnw((x >> 16) - 1); // same branch-free step, repeated below on 16, 8, 4 and 2 bits
+    s |= m & 16;
+    x = (x >> 16) ^ (m & (x ^ (x >> 16)));
+    m = sgnw((x >> 8) - 1);
+    s |= m & 8;
+    x = (x >> 8) ^ (m & (x ^ (x >> 8)));
+    m = sgnw((x >> 4) - 1);
+    s |= m & 4;
+    x = (x >> 4) ^ (m & (x ^ (x >> 4)));
+    m = sgnw((x >> 2) - 1);
+    s |= m & 2;
+    x = (x >> 2) ^ (m & (x ^ (x >> 2)));
+
+    // At this point, x fits on 2 bits. Count of extra zeros:
+    //   x = 0  -> 2
+    //   x = 1  -> 1
+    //   x = 2  -> 0
+    //   x = 3  -> 0
+    s += (2 - x) & ((x - 3) >> 2); // (x - 3) >> 2 is zero only for x = 3; for x < 3 it wraps and leaves the low bits of (2 - x) intact
+    return s;
+}
+#endif
+
+// see gf5248.h
+uint32_t
+gf5248_div(gf5248 *d, const gf5248 *x, const gf5248 *y)
+{
+    // Extended binary GCD:
+    //
+    //   a <- y
+    //   b <- q (modulus)
+    //   u <- x (self)
+    //   v <- 0
+    //
+    // Value a is normalized (in the 0..q-1 range). Values a and b are
+    // then considered as (signed) integers. Values u and v are field
+    // elements.
+    //
+    // Invariants:
+    //    a*x = y*u mod q
+    //    b*x = y*v mod q
+    //    b is always odd
+    //
+    // At each step:
+    //    if a is even, then:
+    //        a <- a/2, u <- u/2 mod q
+    //    else:
+    //        if a < b:
+    //            (a, u, b, v) <- (b, v, a, u)
+    //        a <- (a-b)/2, u <- (u-v)/2 mod q
+    //
+    // What we implement below is the optimized version of this
+    // algorithm, as described in https://eprint.iacr.org/2020/972
+
+    gf5248 a, b, u, v;
+    uint64_t xa, xb, f0, g0, f1, g1;
+    uint32_t r;
+
+    r = ~gf5248_iszero(y);
+    inner_gf5248_normalize(&a, y);
+    b = MODULUS;
+    u = *x;
+    v = gf5248_ZERO;
+
+    // Generic loop does 15*31 = 465 inner iterations.
+    for (int i = 0; i < 15; i++) {
+        // Get approximations of a and b over 64 bits:
+        //  - If len(a) <= 64 and len(b) <= 64, then we just use
+        //    their values (low limbs).
+        //  - Otherwise, with n = max(len(a), len(b)), we use:
+        //       (a mod 2^31) + 2^31*floor(a / 2^(n - 33))
+        //       (b mod 2^31) + 2^31*floor(b / 2^(n - 33))
+        uint64_t m3 = a.v3 | b.v3;
+        uint64_t m2 = a.v2 | b.v2;
+        uint64_t m1 = a.v1 | b.v1;
+        uint64_t tnz3 = sgnw(m3 | -m3);
+        uint64_t tnz2 = sgnw(m2 | -m2) & ~tnz3;
+        uint64_t tnz1 = sgnw(m1 | -m1) & ~tnz3 & ~tnz2;
+        uint64_t tnzm = (m3 & tnz3) | (m2 & tnz2) | (m1 & tnz1);
+        uint64_t tnza = (a.v3 & tnz3) | (a.v2 & tnz2) | (a.v1 & tnz1);
+        uint64_t tnzb = (b.v3 & tnz3) | (b.v2 & tnz2) | (b.v1 & tnz1);
+        uint64_t snza = (a.v2 & tnz3) | (a.v1 & tnz2) | (a.v0 & tnz1);
+        uint64_t snzb = (b.v2 & tnz3) | (b.v1 & tnz2) | (b.v0 & tnz1);
+
+        // If both len(a) <= 64 and len(b) <= 64, then:
+        //    tnzm = 0
+        //    tnza = 0, snza = 0, tnzb = 0, snzb = 0
+        // Otherwise:
+        //    tnzm != 0
+        //    tnza contains the top non-zero limb of a
+        //    snza contains the limb right below tnza
+        //    tnzb contains the top non-zero limb of b
+        //    snzb contains the limb right below tnzb
+        //
+        // We count the number of leading zero bits in tnzm:
+        //  - If s <= 31, then the top 31 bits can be extracted from
+        //    tnza and tnzb alone.
+        //  - If 32 <= s <= 63, then we need some bits from snza and
+        //    snzb as well.
+        int64_t s = lzcnt(tnzm);
+        uint64_t sm = (uint64_t)((31 - s) >> 63);
+        tnza ^= sm & (tnza ^ ((tnza << 32) | (snza >> 32)));
+        tnzb ^= sm & (tnzb ^ ((tnzb << 32) | (snzb >> 32)));
+        s -= 32 & sm;
+        tnza <<= s;
+        tnzb <<= s;
+
+        // At this point:
+        //  - If len(a) <= 64 and len(b) <= 64, then:
+        //       tnza = 0
+        //       tnzb = 0
+        //       tnz1 = tnz2 = tnz3 = 0
+        //       we want to use the entire low words of a and b
+        //  - Otherwise, we want to use the top 33 bits of tnza and
+        //    tnzb, and the low 31 bits of the low words of a and b.
+        uint64_t tzx = ~(tnz1 | tnz2 | tnz3);
+        tnza |= a.v0 & tzx;
+        tnzb |= b.v0 & tzx;
+        xa = (a.v0 & 0x7FFFFFFF) | (tnza & 0xFFFFFFFF80000000);
+        xb = (b.v0 & 0x7FFFFFFF) | (tnzb & 0xFFFFFFFF80000000);
+
+        // Compute the 31 inner iterations on xa and xb.
+        uint64_t fg0 = (uint64_t)1;
+        uint64_t fg1 = (uint64_t)1 << 32;
+        for (int j = 0; j < 31; j++) {
+            uint64_t a_odd, swap, t0, t1, t2;
+            unsigned char cc;
+            a_odd = -(xa & 1);
+            cc = inner_gf5248_sbb(0, xa, xb, &t0);
+            (void)inner_gf5248_sbb(cc, 0, 0, &swap);
+            swap &= a_odd;
+            t1 = swap & (xa ^ xb);
+            xa ^= t1;
+            xb ^= t1;
+            t2 = swap & (fg0 ^ fg1);
+            fg0 ^= t2;
+            fg1 ^= t2;
+            xa -= a_odd & xb;
+            fg0 -= a_odd & fg1;
+            xa >>= 1;
+            fg1 <<= 1;
+        }
+        fg0 += 0x7FFFFFFF7FFFFFFF;
+        fg1 += 0x7FFFFFFF7FFFFFFF;
+        f0 = (fg0 & 0xFFFFFFFF) - (uint64_t)0x7FFFFFFF;
+        g0 = (fg0 >> 32) - (uint64_t)0x7FFFFFFF;
+        f1 = (fg1 & 0xFFFFFFFF) - (uint64_t)0x7FFFFFFF;
+        g1 = (fg1 >> 32) - (uint64_t)0x7FFFFFFF;
+
+        // Propagate updates to a, b, u and v.
+        gf5248 na, nb, nu, nv;
+        uint64_t nega = lindiv31abs(&na, &a, &b, f0, g0);
+        uint64_t negb = lindiv31abs(&nb, &a, &b, f1, g1);
+        f0 = (f0 ^ nega) - nega;
+        g0 = (g0 ^ nega) - nega;
+        f1 = (f1 ^ negb) - negb;
+        g1 = (g1 ^ negb) - negb;
+        gf5248_lin(&nu, &u, &v, f0, g0);
+        gf5248_lin(&nv, &u, &v, f1, g1);
+        a = na;
+        b = nb;
+        u = nu;
+        v = nv;
+    }
+
+    // If y is invertible, then the final GCD is 1, and
+    // len(a) + len(b) <= 37, so we can end the computation with
+    // the low words directly. We only need 35 iterations to reach
+    // the point where b = 1.
+    //
+    // If y is zero, then v is unchanged (hence zero) and none of
+    // the subsequent iterations will change it either, so we get
+    // 0 on output, which is what we want.
+    xa = a.v0;
+    xb = b.v0;
+    f0 = 1;
+    g0 = 0;
+    f1 = 0;
+    g1 = 1;
+    for (int j = 0; j < 35; j++) {
+        uint64_t a_odd, swap, t0, t1, t2, t3;
+        unsigned char cc;
+        a_odd = -(xa & 1);
+        cc = inner_gf5248_sbb(0, xa, xb, &t0);
+        (void)inner_gf5248_sbb(cc, 0, 0, &swap);
+        swap &= a_odd;
+        t1 = swap & (xa ^ xb);
+        xa ^= t1;
+        xb ^= t1;
+        t2 = swap & (f0 ^ f1);
+        f0 ^= t2;
+        f1 ^= t2;
+        t3 = swap & (g0 ^ g1);
+        g0 ^= t3;
+        g1 ^= t3;
+        xa -= a_odd & xb;
+        f0 -= a_odd & f1;
+        g0 -= a_odd & g1;
+        xa >>= 1;
+        f1 <<= 1;
+        g1 <<= 1;
+    }
+    gf5248_lin(d, &u, &v, f1, g1);
+
+    // At this point:
+    //  - Numerator and denominator were both in Montgomery representation,
+    //    but the two factors R canceled each other.
+    //  - We have injected 31*15+35 = 500 extra factors of 2, hence we
+    //    must divide the result by 2^500.
+    //  - However, we also want to obtain the result in Montgomery
+    //    representation, i.e. multiply by 2^256. We thus want to
+    //    divide the current result by 2^(500 - 256) = 2^244.
+    //  - We do this division by using a Montgomery multiplication with
+    //    the Montgomery representation of 1/2^244, i.e. the integer
+    //    2^256/2^244 = 4096.
+    gf5248_mul(d, d, &INVT244);
+    return r;
+}
+
+// see gf5248.h -- inversion is a division with gf5248_ONE as the numerator; the return value is gf5248_div's.
+uint32_t
+gf5248_invert(gf5248 *d, const gf5248 *a)
+{
+    return gf5248_div(d, &gf5248_ONE, a);
+}
+
+// see gf5248.h
+//
+// Legendre symbol of x: the value returned is +1 or -1 for a non-zero
+// input, and 0 when x is zero (see the final conversion at the end of
+// this function).
+int32_t
+gf5248_legendre(const gf5248 *x)
+{
+    // Same algorithm as the binary GCD in gf5248_div(), with
+    // a few differences:
+    //   - We do not keep track of the Bézout coefficients u and v.
+    //   - In each inner iteration we adjust the running symbol value,
+    //     which uses the low 3 bits of the values.
+    //   - Since we need two extra bits of look-ahead, we can only run
+    //     29 inner iterations, and then need an extra recomputation
+    //     for the last 2.
+
+    gf5248 a, b;
+    uint64_t xa, xb, f0, g0, f1, g1, ls;
+
+    inner_gf5248_normalize(&a, x);
+    b = MODULUS;
+    ls = 0; // running symbol information in bit 1.
+    // Only bit 1 of ls is meaningful; the other bits collect garbage from
+    // the XOR updates below and are masked away at the very end.
+
+    // Outer loop
+    // 15 rounds, each performing 29 + 2 = 31 inner iterations.
+    for (int i = 0; i < 15; i++) {
+        // Get approximations of a and b over 64 bits.
+        // m3/m2/m1 are non-zero when a or b has a non-zero limb at that
+        // index; (m | -m) has its top bit set exactly when m != 0.
+        // NOTE(review): sgnw() is defined elsewhere; it presumably expands
+        // the top bit into a full-word mask -- confirm.
+        // tnz3/tnz2/tnz1 then select the highest such limb (at most one of
+        // them is set), tnza/tnzb pick that limb and snza/snzb the limb
+        // just below it.
+        uint64_t m3 = a.v3 | b.v3;
+        uint64_t m2 = a.v2 | b.v2;
+        uint64_t m1 = a.v1 | b.v1;
+        uint64_t tnz3 = sgnw(m3 | -m3);
+        uint64_t tnz2 = sgnw(m2 | -m2) & ~tnz3;
+        uint64_t tnz1 = sgnw(m1 | -m1) & ~tnz3 & ~tnz2;
+        uint64_t tnzm = (m3 & tnz3) | (m2 & tnz2) | (m1 & tnz1);
+        uint64_t tnza = (a.v3 & tnz3) | (a.v2 & tnz2) | (a.v1 & tnz1);
+        uint64_t tnzb = (b.v3 & tnz3) | (b.v2 & tnz2) | (b.v1 & tnz1);
+        uint64_t snza = (a.v2 & tnz3) | (a.v1 & tnz2) | (a.v0 & tnz1);
+        uint64_t snzb = (b.v2 & tnz3) | (b.v1 & tnz2) | (b.v0 & tnz1);
+
+        // Left-align the selected words on the leading bit of tnzm.
+        // When the leading-zero count s exceeds 31 (sm is then all-ones),
+        // a 32-bit shift that pulls bits in from the next lower limb is
+        // done first, and the remaining shift count is reduced by 32.
+        int64_t s = lzcnt(tnzm);
+        uint64_t sm = (uint64_t)((31 - s) >> 63);
+        tnza ^= sm & (tnza ^ ((tnza << 32) | (snza >> 32)));
+        tnzb ^= sm & (tnzb ^ ((tnzb << 32) | (snzb >> 32)));
+        s -= 32 & sm;
+        tnza <<= s;
+        tnzb <<= s;
+
+        // tzx is all-ones when none of tnz1/tnz2/tnz3 is set, i.e. both
+        // values fit in limb 0; the low limbs are then used directly.
+        uint64_t tzx = ~(tnz1 | tnz2 | tnz3);
+        tnza |= a.v0 & tzx;
+        tnzb |= b.v0 & tzx;
+        // Approximation = exact low 31 bits combined with the upper 33
+        // bits taken from the aligned top words.
+        xa = (a.v0 & 0x7FFFFFFF) | (tnza & 0xFFFFFFFF80000000);
+        xb = (b.v0 & 0x7FFFFFFF) | (tnzb & 0xFFFFFFFF80000000);
+
+        // First 29 inner iterations.
+        // fg0 packs the update factors (f0, g0) as two 32-bit halves
+        // (f in the low half, g in the high half); fg1 packs (f1, g1).
+        // The initial values encode (f0, g0) = (1, 0), (f1, g1) = (0, 1).
+        uint64_t fg0 = (uint64_t)1;
+        uint64_t fg1 = (uint64_t)1 << 32;
+        for (int j = 0; j < 29; j++) {
+            uint64_t a_odd, swap, t0, t1, t2;
+            unsigned char cc;
+            // a_odd: all-ones mask when xa is odd.
+            a_odd = -(xa & 1);
+            // swap: all-ones mask when xa is odd and xa < xb (the borrow
+            // of xa - xb is expanded into a mask by the second sbb).
+            cc = inner_gf5248_sbb(0, xa, xb, &t0);
+            (void)inner_gf5248_sbb(cc, 0, 0, &swap);
+            swap &= a_odd;
+            // On a swap, bit 1 of ls flips when bit 1 of both xa and xb is
+            // set, i.e. when both are 3 mod 4 (quadratic reciprocity).
+            ls ^= swap & xa & xb;
+            t1 = swap & (xa ^ xb);
+            xa ^= t1;
+            xb ^= t1;
+            t2 = swap & (fg0 ^ fg1);
+            fg0 ^= t2;
+            fg1 ^= t2;
+            xa -= a_odd & xb;
+            fg0 -= a_odd & fg1;
+            xa >>= 1;
+            fg1 <<= 1;
+            // Halving step: bit 1 of (xb + 2) >> 1 is set exactly when the
+            // odd value xb is 3 or 5 mod 8, which flips bit 1 of ls.
+            ls ^= (xb + 2) >> 1;
+        }
+
+        // Compute the updated a and b (low words only) to get
+        // enough bits for the next two iterations.
+        // The constant 0x7FFFFFFF7FFFFFFF adds a bias of 0x7FFFFFFF to each
+        // 32-bit half before the halves are split; the same bias is then
+        // subtracted from each extracted half.
+        uint64_t fg0z = fg0 + 0x7FFFFFFF7FFFFFFF;
+        uint64_t fg1z = fg1 + 0x7FFFFFFF7FFFFFFF;
+        f0 = (fg0z & 0xFFFFFFFF) - (uint64_t)0x7FFFFFFF;
+        g0 = (fg0z >> 32) - (uint64_t)0x7FFFFFFF;
+        f1 = (fg1z & 0xFFFFFFFF) - (uint64_t)0x7FFFFFFF;
+        g1 = (fg1z >> 32) - (uint64_t)0x7FFFFFFF;
+        uint64_t a0 = (a.v0 * f0 + b.v0 * g0) >> 29;
+        uint64_t b0 = (a.v0 * f1 + b.v0 * g1) >> 29;
+        // Last 2 inner iterations of this round: the swap decision still
+        // comes from the approximations xa/xb, while the symbol updates
+        // read the freshly recomputed low words a0/b0.
+        for (int j = 0; j < 2; j++) {
+            uint64_t a_odd, swap, t0, t1, t2, t3;
+            unsigned char cc;
+            a_odd = -(xa & 1);
+            cc = inner_gf5248_sbb(0, xa, xb, &t0);
+            (void)inner_gf5248_sbb(cc, 0, 0, &swap);
+            swap &= a_odd;
+            ls ^= swap & a0 & b0;
+            t1 = swap & (xa ^ xb);
+            xa ^= t1;
+            xb ^= t1;
+            t2 = swap & (fg0 ^ fg1);
+            fg0 ^= t2;
+            fg1 ^= t2;
+            t3 = swap & (a0 ^ b0);
+            a0 ^= t3;
+            b0 ^= t3;
+            xa -= a_odd & xb;
+            fg0 -= a_odd & fg1;
+            a0 -= a_odd & b0;
+            xa >>= 1;
+            fg1 <<= 1;
+            a0 >>= 1;
+            ls ^= (b0 + 2) >> 1;
+        }
+
+        // Propagate updates to a and b.
+        // Same bias/split/unbias unpacking as above, now on the factors
+        // accumulated over all 31 iterations.
+        fg0 += 0x7FFFFFFF7FFFFFFF;
+        fg1 += 0x7FFFFFFF7FFFFFFF;
+        f0 = (fg0 & 0xFFFFFFFF) - (uint64_t)0x7FFFFFFF;
+        g0 = (fg0 >> 32) - (uint64_t)0x7FFFFFFF;
+        f1 = (fg1 & 0xFFFFFFFF) - (uint64_t)0x7FFFFFFF;
+        g1 = (fg1 >> 32) - (uint64_t)0x7FFFFFFF;
+        gf5248 na, nb;
+        // NOTE(review): lindiv31abs() is defined elsewhere; judging by its
+        // name and use it presumably computes |(a*f + b*g) / 2^31| and
+        // returns an all-ones mask when a negation was applied -- confirm.
+        uint64_t nega = lindiv31abs(&na, &a, &b, f0, g0);
+        (void)lindiv31abs(&nb, &a, &b, f1, g1);
+        // If the new a was negated, bit 1 of ls flips when bit 1 of the
+        // new b is set.
+        ls ^= nega & nb.v0;
+        a = na;
+        b = nb;
+    }
+
+    // Final iterations: values are at most 37 bits now. We do not
+    // need to keep track of update coefficients. Just like the GCD,
+    // we need only 35 iterations, because after 35 iterations,
+    // value a is 0 or 1, and b is 1, and no further modification to
+    // the Legendre symbol may happen.
+    xa = a.v0;
+    xb = b.v0;
+    for (int j = 0; j < 35; j++) {
+        uint64_t a_odd, swap, t0, t1;
+        unsigned char cc;
+        a_odd = -(xa & 1);
+        cc = inner_gf5248_sbb(0, xa, xb, &t0);
+        (void)inner_gf5248_sbb(cc, 0, 0, &swap);
+        swap &= a_odd;
+        ls ^= swap & xa & xb;
+        t1 = swap & (xa ^ xb);
+        xa ^= t1;
+        xb ^= t1;
+        xa -= a_odd & xb;
+        xa >>= 1;
+        ls ^= (xb + 2) >> 1;
+    }
+
+    // At this point, if the source value was not zero, then bit 1
+    // of ls contains the QR status (0 = square, 1 = non-square),
+    // which we need to convert to the expected value (+1 or -1).
+    // If x == 0, then we return 0, per the API.
+    // 1 - (ls & 2) is 1 when bit 1 is clear and 0xFFFFFFFF (-1) when set.
+    uint32_t r = 1 - ((uint32_t)ls & 2);
+    // NOTE(review): relies on gf5248_iszero() returning an all-ones mask
+    // for a zero input, so that r is cleared to 0 -- confirm in gf5248.h.
+    r &= ~gf5248_iszero(x);
+    return *(int32_t *)&r;
+}
+
+// see gf5248.h
+uint32_t
+gf5248_sqrt(gf5248 *d, const gf5248 *a)
+{
+    gf5248 root, aux;
+    uint32_t odd_mask, is_root;
+
+    // The candidate is a^((q+1)/4) where (q+1)/4 = 5*2^246: first a^5
+    // (two squarings followed by one multiplication by a), then 246
+    // further squarings.
+    gf5248_xsquare(&root, a, 2);
+    gf5248_mul(&root, &root, a);
+    gf5248_xsquare(&root, &root, 246);
+
+    // Keep whichever of root / -root has a least significant bit equal
+    // to 0. That bit has to be read from the value taken out of
+    // Montgomery representation, hence the reduction into aux first.
+    inner_gf5248_montgomery_reduce(&aux, &root);
+    odd_mask = -((uint32_t)aux.v0 & 1);
+    gf5248_neg(&aux, &root);
+    gf5248_select(&root, &root, &aux, odd_mask);
+
+    // Square the selected candidate and compare with the input to find
+    // out whether it really is a square root.
+    gf5248_square(&aux, &root);
+    is_root = gf5248_equals(&aux, a);
+
+    // The candidate is written to *d whatever the outcome of the check;
+    // this happens last, after a has been read for the comparison.
+    *d = root;
+    return is_root;
+}
+
+// Little-endian encoding of a 64-bit integer.
+// Stores the eight bytes of x at dst[0..7], least significant byte first.
+static inline void
+enc64le(void *dst, uint64_t x)
+{
+    uint8_t *out = dst;
+
+    for (int i = 0; i < 8; i++) {
+        out[i] = (uint8_t)(x >> (8 * i));
+    }
+}
+
+// Little-endian decoding of a 64-bit integer.
+// Reads src[0..7]; src[0] ends up in the least significant byte.
+static inline uint64_t
+dec64le(const void *src)
+{
+    const uint8_t *in = src;
+    uint64_t x = 0;
+
+    // Walk from the most significant byte (offset 7) down to offset 0,
+    // shifting the accumulator up by one byte at each step.
+    for (int i = 7; i >= 0; i--) {
+        x = (x << 8) | (uint64_t)in[i];
+    }
+    return x;
+}
+
+// see gf5248.h
+void
+gf5248_encode(void *dst, const gf5248 *a)
+{
+    gf5248 plain;
+    uint8_t *out = dst;
+
+    // Leave Montgomery representation before serializing.
+    inner_gf5248_montgomery_reduce(&plain, a);
+
+    // Emit the four 64-bit limbs in order v0..v3, each in little-endian
+    // convention: 32 bytes in total.
+    const uint64_t limbs[4] = { plain.v0, plain.v1, plain.v2, plain.v3 };
+    for (int i = 0; i < 4; i++) {
+        enc64le(out + 8 * i, limbs[i]);
+    }
+}
+
+// see gf5248.h
+uint32_t
+gf5248_decode(gf5248 *d, const void *src)
+{
+    const uint8_t *in = src;
+    uint64_t w0, w1, w2, w3;
+    uint64_t scratch, mask;
+    unsigned char borrow;
+
+    // Read the four little-endian 64-bit limbs (32 bytes).
+    w0 = dec64le(in);
+    w1 = dec64le(in + 8);
+    w2 = dec64le(in + 16);
+    w3 = dec64le(in + 24);
+
+    // Subtract the modulus limb by limb; only the final borrow matters,
+    // the difference itself is discarded.
+    borrow = inner_gf5248_sbb(0, w0, MODULUS.v0, &scratch);
+    borrow = inner_gf5248_sbb(borrow, w1, MODULUS.v1, &scratch);
+    borrow = inner_gf5248_sbb(borrow, w2, MODULUS.v2, &scratch);
+    borrow = inner_gf5248_sbb(borrow, w3, MODULUS.v3, &scratch);
+
+    // Expand the borrow into a mask (0 - 0 - borrow): all-ones when the
+    // input is below the modulus (canonical), zero otherwise.
+    (void)inner_gf5248_sbb(borrow, 0, 0, &mask);
+
+    // A non-canonical input is replaced with zero.
+    d->v0 = w0 & mask;
+    d->v1 = w1 & mask;
+    d->v2 = w2 & mask;
+    d->v3 = w3 & mask;
+
+    // Switch to Montgomery representation.
+    gf5248_mul(d, d, &R2);
+
+    return (uint32_t)mask;
+}
+
+// see gf5248.h
+void
+gf5248_decode_reduce(gf5248 *d, const void *src, size_t len)
+{
+    const uint8_t *in = src;
+    size_t rem; // number of leading bytes of the input not yet consumed
+
+    *d = gf5248_ZERO;
+    if (len == 0) {
+        return;
+    }
+
+    // Consume the block located at the highest addresses first.
+    if ((len & 31) == 0) {
+        // The length is a multiple of 32: the top block is a full
+        // 32-byte block, which has to go through a reduction.
+        rem = len - 32;
+        uint64_t w0 = dec64le(in + rem);
+        uint64_t w1 = dec64le(in + rem + 8);
+        uint64_t w2 = dec64le(in + rem + 16);
+        uint64_t w3 = dec64le(in + rem + 24);
+        inner_gf5248_partial_reduce(d, w0, w1, w2, w3);
+    } else {
+        // The top block is partial (1 to 31 bytes); zero-padded to 32
+        // bytes its value is already below 2^248 and is used as is.
+        uint8_t pad[32] = { 0 };
+
+        rem = len & ~(size_t)31;
+        memcpy(pad, in + rem, len - rem);
+        d->v0 = dec64le(pad);
+        d->v1 = dec64le(pad + 8);
+        d->v2 = dec64le(pad + 16);
+        d->v3 = dec64le(pad + 24);
+    }
+
+    // Fold in the remaining full blocks, from high addresses to low:
+    // multiply the accumulator by R2, then add the next reduced block.
+    while (rem != 0) {
+        gf5248 blk;
+
+        gf5248_mul(d, d, &R2);
+        rem -= 32;
+        uint64_t w0 = dec64le(in + rem);
+        uint64_t w1 = dec64le(in + rem + 8);
+        uint64_t w2 = dec64le(in + rem + 16);
+        uint64_t w3 = dec64le(in + rem + 24);
+        inner_gf5248_partial_reduce(&blk, w0, w1, w2, w3);
+        gf5248_add(d, d, &blk);
+    }
+
+    // Last step: bring the result into Montgomery representation.
+    gf5248_mul(d, d, &R2);
+}
+
+// Division by 3: sets *d to a/3 in the field.
+//
+// The four limbs a->arr[3..0] are first divided by 3 as one multi-limb
+// integer, processed from arr[3] down to arr[0], with c1 carrying the
+// running remainder from one limb to the next. The integer remainder
+// left at the end (0, 1 or 2) is then compensated by subtracting PM1O3
+// that many times.
+// NOTE(review): PM1O3 is defined outside this function; it is presumably
+// (p - 1)/3, so that -PM1O3 = 1/3 mod p -- confirm against its definition.
+void
+gf5248_div3(gf5248 *d, const gf5248 *a)
+{
+    // 3 * MAGIC = 2^65 + 1, so for a 64-bit x the upper product word of
+    // x * MAGIC, shifted right by one, is floor(x / 3).
+    const digit_t MAGIC = 0xAAAAAAAAAAAAAAAB; // 3^-1 mod 2^64
+    uint64_t c0, c1, f0, f1;
+    gf5248 t;
+
+    // NOTE(review): inner_gf5248_umul() is defined elsewhere; only f1 is
+    // consumed here, presumably the high 64 bits of the product -- confirm.
+    inner_gf5248_umul(f0, f1, a->arr[3], MAGIC);
+    t.arr[3] = f1 >> 1;
+    c1 = a->arr[3] - 3 * t.arr[3]; // remainder of the top limb: 0, 1 or 2
+
+    for (int32_t i = 2; i >= 0; i--) {
+        c0 = c1; // remainder carried down from the limb above
+        inner_gf5248_umul(f0, f1, a->arr[i], MAGIC);
+        t.arr[i] = f1 >> 1;
+        // Remainder of this limb plus the carried remainder: 0..4.
+        c1 = c0 + a->arr[i] - 3 * t.arr[i];
+        // (MAGIC - 1) >> 1 = 0x5555555555555555 = (2^64 - 1)/3: the
+        // quotient contributed by each carried unit of 2^64 (the leftover
+        // 1 per unit is what c0 added to c1 above).
+        t.arr[i] += c0 * ((MAGIC - 1) >> 1);
+        // f0 becomes 1 when c1 is 3 or 4, i.e. when one more multiple of
+        // 3 fits. The "c1 == 4" test must check that the two low bits
+        // are clear, hence the mask 0x3 (binary 11). The previous mask
+        // 0x11 (hexadecimal) tested bits 0 and 4 instead; both masks give
+        // the same result for c1 in 0..4, the only values reachable here.
+        f0 = ((c1 >> 1) & c1);          /* c1 == 3 */
+        f1 = ((c1 >> 2) & !(c1 & 0x3)); /* c1 == 4 */
+        f0 |= f1;
+        t.arr[i] += f0;
+        c1 = c1 - 3 * f0;
+    }
+    *d = t;
+    // c1 is now the remainder of the whole integer modulo 3. Subtract
+    // PM1O3 once when c1 >= 1 and a second time when c1 == 2; the
+    // subtraction is always computed and the result chosen by
+    // gf5248_select() under an all-ones / zero control word.
+    gf5248_sub(&t, d, &PM1O3);
+    gf5248_select(d, d, &t, -((c1 & 1) | (c1 >> 1))); // c1 >= 1
+    gf5248_sub(&t, d, &PM1O3);
+    gf5248_select(d, d, &t, -(c1 == 2));
+}
--- a/src/gf/broadwell/lvl1/include/fp.h
+++ b/src/gf/broadwell/lvl1/include/fp.h
@@ -1,7 +1,8 @@
 #ifndef FP_H
 #define FP_H

-//////////////////////////////////////////////// NOTE: this is placed here for now
+// Include statements
+#include <sqisign_namespace.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <stdbool.h>
@@ -10,67 +11,129 @@
 #include <tutil.h>
 #include <fp_constants.h>

-typedef digit_t fp_t[NWORDS_FIELD];  // Datatype for representing field elements
+#include "gf5248.h"

-void fp_set(digit_t* x, const digit_t val);
-bool fp_is_equal(const digit_t* a, const digit_t* b);
-bool fp_is_zero(const digit_t* a);
-void fp_copy(digit_t* out, const digit_t* a);
-digit_t mp_shiftr(digit_t* x, const unsigned int shift, const unsigned int nwords);
-void mp_shiftl(digit_t* x, const unsigned int shift, const unsigned int nwords);
-void fp_add(digit_t* out, const digit_t* a, const digit_t* b);
-void fp_sub(digit_t* out, const digit_t* a, const digit_t* b);
-void fp_neg(digit_t* out, const digit_t* a);
-void fp_sqr(digit_t* out, const digit_t* a);
-void fp_mul(digit_t* out, const digit_t* a, const digit_t* b);
-void MUL(digit_t* out, const digit_t a, const digit_t b);
-void fp_inv(digit_t* x);
-bool fp_is_square(const digit_t* a);
-void fp_sqrt(digit_t* a);
-void fp_tomont(digit_t* out, const digit_t* a);
-void fp_frommont(digit_t* out, const digit_t* a);
-void fp_mont_setone(digit_t* out);
+// Type for elements of GF(p)
+#define fp_t gf5248

-/********************** Constant-time unsigned comparisons ***********************/
+// Constants (Assumed to be in Montgomery form)
+#define ZERO gf5248_ZERO
+#define ONE gf5248_ONE

-// The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise
-
-static inline unsigned int is_digit_nonzero_ct(digit_t x)
-{ // Is x != 0?
-    return (unsigned int)((x | (0 - x)) >> (RADIX - 1));
+// Operations in fp
+// fp-level name for gf5248_neg(): forwards d and a unchanged.
+static inline void
+fp_neg(fp_t *d, const fp_t *a)
+{
+    gf5248_neg(d, a);
 }

-static inline unsigned int is_digit_zero_ct(digit_t x)
-{ // Is x = 0?
-    return (unsigned int)(1 ^ is_digit_nonzero_ct(x));
+void fp_add(fp_t *out, const fp_t *a, const fp_t *b); // implemented in fp_asm.S
+void fp_sub(fp_t *out, const fp_t *a, const fp_t *b); // implemented in fp_asm.S
+void fp_sqr(fp_t *out, const fp_t *a);                // implemented in fp_asm.S
+void fp_mul(fp_t *out, const fp_t *a, const fp_t *b); // implemented in fp_asm.S
+
+// fp-level name for gf5248_mul_small(): forwards d, a and the 32-bit
+// operand n unchanged.
+static inline void
+fp_mul_small(fp_t *d, const fp_t *a, uint32_t n)
+{
+    gf5248_mul_small(d, a, n);
 }

-static inline unsigned int is_digit_lessthan_ct(digit_t x, digit_t y)
-{ // Is x < y?
-    return (unsigned int)((x ^ ((x ^ y) | ((x - y) ^ y))) >> (RADIX - 1));
+// fp-level name for gf5248_half(): forwards d and a unchanged.
+static inline void
+fp_half(fp_t *d, const fp_t *a)
+{
+    gf5248_half(d, a);
+}
+// #define fp_half gf5248_half
+
+// fp-level name for gf5248_div3(): forwards d and a unchanged.
+static inline void
+fp_div3(fp_t *d, const fp_t *a)
+{
+    gf5248_div3(d, a);
+}
+// #define fp_div3 gf5248_div3
+
+// Constant time selection and swapping
+// fp-level name for gf5248_select(): forwards d, a0, a1 and the control
+// word ctl unchanged.
+// NOTE(review): ctl is presumably expected to be 0 or 0xFFFFFFFF, as in
+// the gf5248 code that calls gf5248_select() -- confirm in gf5248.h.
+static inline void
+fp_select(fp_t *d, const fp_t *a0, const fp_t *a1, uint32_t ctl)
+{
+    gf5248_select(d, a0, a1, ctl);
+}
+// #define fp_select gf5248_select
+// fp-level name for gf5248_cswap(): forwards a, b and the control word
+// ctl unchanged.
+static inline void
+fp_cswap(fp_t *a, fp_t *b, uint32_t ctl)
+{
+    gf5248_cswap(a, b, ctl);
+}
+// #define fp_cswap gf5248_cswap
+
+// Comparisons for fp elements
+// fp-level name for gf5248_iszero(): returns its result unchanged.
+// NOTE(review): the result is presumably a 0 / 0xFFFFFFFF mask rather
+// than a 0 / 1 boolean -- confirm in gf5248.h before using it in C
+// conditions or arithmetic.
+static inline uint32_t
+fp_is_zero(const fp_t *a)
+{
+    return gf5248_iszero(a);
+}
+// #define fp_is_zero gf5248_iszero
+
+// fp-level name for gf5248_equals(): returns its result unchanged.
+// NOTE(review): the result is presumably a 0 / 0xFFFFFFFF mask rather
+// than a 0 / 1 boolean -- confirm in gf5248.h.
+static inline uint32_t
+fp_is_equal(const fp_t *a, const fp_t *b)
+{
+    return gf5248_equals(a, b);
+}
+// #define fp_is_equal gf5248_equals
+
+// Set a uint32 to an Fp value
+// fp-level name for gf5248_set_small(): forwards d and the 32-bit
+// integer x unchanged.
+static inline void
+fp_set_small(fp_t *d, uint32_t x)
+{
+    gf5248_set_small(d, x);
+}
+// #define fp_set_small gf5248_set_small
+
+// Encoding and decoding of bytes
+// fp-level name for gf5248_encode(): forwards dst and a unchanged.
+// gf5248_encode() writes four 64-bit limbs, i.e. 32 bytes, to dst.
+static inline void
+fp_encode(void *dst, const fp_t *a)
+{
+    gf5248_encode(dst, a);
+}
+// #define fp_encode gf5248_encode
+// fp-level name for gf5248_decode(): forwards d and src, and returns
+// the status word of gf5248_decode() unchanged.
+// gf5248_decode() reads 32 bytes from src; its status is all-ones for a
+// canonical input and zero otherwise (in which case d is set to zero).
+static inline uint32_t
+fp_decode(fp_t *d, const void *src)
+{
+    return gf5248_decode(d, src);
+}
+// #define fp_decode gf5248_decode
+// fp-level name for gf5248_decode_reduce(): forwards d, src and the
+// byte length len unchanged. Any length is accepted, including 0, for
+// which gf5248_decode_reduce() sets d to zero.
+static inline void
+fp_decode_reduce(fp_t *d, const void *src, size_t len)
+{
+    gf5248_decode_reduce(d, src, len);
+}
+// #define fp_decode_reduce gf5248_decode_reduce
+
+// These functions are essentially useless because we can just
+// use = for the shallow copies we need, but they're here for
+// now until we do a larger refactoring
+// Copy the field element *a into *out.
+// Plain structure assignment is used rather than memcpy(): both copy the
+// whole object, but memcpy() has undefined behavior when source and
+// destination overlap, which happens for fp_copy(x, x); assignment
+// remains well-defined when out == a.
+static inline void
+fp_copy(fp_t *out, const fp_t *a)
+{
+    *out = *a;
 }

-/********************** Platform-independent macros for digit-size operations **********************/
+// Overwrite *a with the constant ZERO (gf5248_ZERO).
+static inline void
+fp_set_zero(fp_t *a)
+{
+    *a = ZERO;
+}

-// Digit addition with carry
-#define ADDC(sumOut, carryOut, addend1, addend2, carryIn)                                         \
-    { digit_t tempReg = (addend1) + (digit_t)(carryIn);                                           \
-    (sumOut) = (addend2) + tempReg;                                                               \
-    (carryOut) = (is_digit_lessthan_ct(tempReg, (digit_t)(carryIn)) | is_digit_lessthan_ct((sumOut), tempReg)); }
+// Overwrite *a with the constant ONE (gf5248_ONE, Montgomery form).
+static inline void
+fp_set_one(fp_t *a)
+{
+    *a = ONE;
+}

-// Digit subtraction with borrow
-#define SUBC(differenceOut, borrowOut, minuend, subtrahend, borrowIn)                             \
-    { digit_t tempReg = (minuend) - (subtrahend);                                                 \
-    unsigned int borrowReg = (is_digit_lessthan_ct((minuend), (subtrahend)) | ((borrowIn) & is_digit_zero_ct(tempReg)));  \
-    (differenceOut) = tempReg - (digit_t)(borrowIn);                                              \
-    (borrowOut) = borrowReg; }
+// Functions defined in low level code but with different API
+void fp_inv(fp_t *a);
+void fp_sqrt(fp_t *a);
+void fp_exp3div4(fp_t *a);
+uint32_t fp_is_square(const fp_t *a);

-// Shift right with flexible datatype
-#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize)                                         \
-    (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << (DigitSize - (shift)));
-
-// Digit shift left
-#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize)                                         \
-    (shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> (RADIX - (shift)));
-
-#endif
+#endif
--- a/src/gf/broadwell/lvl1/include/fp2.h
+++ b/src/gf/broadwell/lvl1/include/fp2.h
@@ -1,29 +1,41 @@
 #ifndef FP2_H
 #define FP2_H

-#include "fp.h"
+#define NO_FP2X_MUL
+#define NO_FP2X_SQR

-// Structure for representing elements in GF(p^2)
-typedef struct fp2_t {
-    fp_t re, im;
-} fp2_t;
+#include <fp2x.h>

-void fp2_set(fp2_t* x, const digit_t val);
-bool fp2_is_zero(const fp2_t* a);
-bool fp2_is_equal(const fp2_t* a, const fp2_t* b);
-void fp2_copy(fp2_t* x, const fp2_t* y);
-fp2_t fp2_non_residue();
-void fp2_add(fp2_t* x, const fp2_t* y, const fp2_t* z);
-void fp2_sub(fp2_t* x, const fp2_t* y, const fp2_t* z);
-void fp2_neg(fp2_t* x, const fp2_t* y);
-void fp2_mul(fp2_t* x, const fp2_t* y, const fp2_t* z);
-void fp2_sqr(fp2_t* x, const fp2_t* y);
-void fp2_inv(fp2_t* x);
-bool fp2_is_square(const fp2_t* x);
-void fp2_frob(fp2_t* x, const fp2_t* y);
-void fp2_sqrt(fp2_t* x);
-void fp2_tomont(fp2_t* x, const fp2_t* y);
-void fp2_frommont(fp2_t* x, const fp2_t* y);
-int fp2_cmp(fp2_t* x, fp2_t* y);
+extern void fp2_sq_c0(fp2_t *out, const fp2_t *in);
+extern void fp2_sq_c1(fp_t *out, const fp2_t *in);

-#endif
+extern void fp2_mul_c0(fp_t *out, const fp2_t *in0, const fp2_t *in1);
+extern void fp2_mul_c1(fp_t *out, const fp2_t *in0, const fp2_t *in1);
+
+// Product in GF(p^2): x = y * z, built from the two external routines
+// fp2_mul_c0 (real part) and fp2_mul_c1 (imaginary part).
+static inline void
+fp2_mul(fp2_t *x, const fp2_t *y, const fp2_t *z)
+{
+    fp_t re_part;
+
+    // Real part a0*b0 - a1*b1 goes to a temporary; imaginary part
+    // a0*b1 + a1*b0 is written straight into x->im.
+    // NOTE(review): staging the real part is presumably what keeps the
+    // inputs intact for fp2_mul_c1 when x aliases y or z -- confirm.
+    fp2_mul_c0(&re_part, y, z);
+    fp2_mul_c1(&x->im, y, z);
+
+    // Store the four limbs of the real part last.
+    for (int i = 0; i < 4; i++) {
+        x->re.arr[i] = re_part.arr[i];
+    }
+}
+
+// Square in GF(p^2): x = y^2, built from the two external routines
+// fp2_sq_c0 (real part) and fp2_sq_c1 (imaginary part).
+static inline void
+fp2_sqr(fp2_t *x, const fp2_t *y)
+{
+    fp2_t staged;
+
+    // fp2_sq_c0 takes an fp2_t output; only staged.re is read back below.
+    // Real part (a0+a1)(a0-a1) goes to the temporary; imaginary part
+    // 2*a0*a1 is written straight into x->im.
+    fp2_sq_c0(&staged, y);
+    fp2_sq_c1(&x->im, y);
+
+    // Store the four limbs of the real part last.
+    for (int i = 0; i < 4; i++) {
+        x->re.arr[i] = staged.re.arr[i];
+    }
+}
+
+#endif
--- a/Show More
+++ b/Show More