initial version of SQIsign

Co-authored-by: Jorge Chavez-Saab <jorgechavezsaab@gmail.com> Co-authored-by: Maria Corte-Real Santos <36373796+mariascrs@users.noreply.github.com> Co-authored-by: Luca De Feo <github@defeo.lu> Co-authored-by: Jonathan Komada Eriksen <jonathan.eriksen97@gmail.com> Co-authored-by: Basil Hess <bhe@zurich.ibm.com> Co-authored-by: Antonin Leroux <18654258+tonioecto@users.noreply.github.com> Co-authored-by: Patrick Longa <plonga@microsoft.com> Co-authored-by: Lorenz Panny <lorenz@yx7.cc> Co-authored-by: Francisco Rodríguez-Henríquez <francisco.rodriguez@tii.ae> Co-authored-by: Sina Schaeffler <108983332+syndrakon@users.noreply.github.com> Co-authored-by: Benjamin Wesolowski <19474926+Calodeon@users.noreply.github.com>
2023-06-01 00:00:00 +00:00
commit 28ff420dd0
285 changed files with 70301 additions and 0 deletions
--- a/.astylerc
+++ b/.astylerc
@@ -0,0 +1,16 @@
 # Artistic Style (astyle) options used to format the C sources.
 # Usage, from the repository root:
 # find include src test -name '*.[ch]' | xargs astyle --options=.astylerc
 # Base brace/indentation style.
 --style=google 
 # Indent with spaces rather than tabs.
 --indent=spaces
 # Preprocessor indentation options, currently disabled.
 #--indent-preproc-define
 #--indent-preproc-cond
 # Padding: spaces around operators, after commas, and after headers
 # such as 'if', 'for' and 'while'.
 --pad-oper 
 --pad-comma 
 --pad-header
 #--unpad-paren 
 # Attach '*' / '&' to the variable name rather than to the type.
 --align-pointer=name 
 # Add braces to unbraced single-statement conditionals and loops.
 --add-braces 
 # Convert tabs in the non-indentation part of a line to spaces.
 --convert-tabs
 # Treat all input files as C.
 --mode=c 
 # disable backup files
 --suffix=none
 # Always write LF line endings.
 --lineend=linux
--- a/.cmake/flags.cmake
+++ b/.cmake/flags.cmake
@@ -0,0 +1,42 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 # Assembles the project-wide compiler options and appends the C ones to
 # CMAKE_C_FLAGS. Options are gathered in three buckets (STRICT_OPTIONS_CPP,
 # STRICT_OPTIONS_C, STRICT_OPTIONS_CXX); within this file only STRICT_OPTIONS_C
 # ends up in a flags variable.
 #
 # Windows Store targets link executables with -nostdlib.
 if(CMAKE_SYSTEM_NAME STREQUAL "WindowsStore")
 	string(APPEND CMAKE_EXE_LINKER_FLAGS " -nostdlib")
 endif()
 # SOURCE_FINAL_PATH is SOURCE_PATH when that is set, <build dir>/src otherwise.
 if(NOT SOURCE_PATH)
 	set(SOURCE_FINAL_PATH ${PROJECT_BINARY_DIR}/src)
 else()
 	set(SOURCE_FINAL_PATH ${SOURCE_PATH})
 endif()
 include(GNUInstallDirs)
 include(CheckSymbolExists)
 include(CMakePushCheckState)
 # Start every bucket from an unset state.
 unset(STRICT_OPTIONS_CPP)
 unset(STRICT_OPTIONS_C)
 unset(STRICT_OPTIONS_CXX)
 if(NOT MSVC)
 	# GCC/Clang-style command lines.
 	string(APPEND STRICT_OPTIONS_CXX " -std=c++14 -O2")
 	string(APPEND STRICT_OPTIONS_CPP " -Wall -Wuninitialized -Wno-deprecated-declarations -Wno-missing-field-initializers")
 	# Every build type other than Debug is optimised with -O3.
 	if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
 		string(APPEND STRICT_OPTIONS_C " -O3")
 	endif()
 	string(APPEND STRICT_OPTIONS_C " -std=c99 -Wno-error=strict-prototypes -fvisibility=hidden -funroll-loops -Wno-error=implicit-function-declaration -Wno-error=attributes")
 	# Extra warning suppressions that only apply to Clang.
 	if(CMAKE_C_COMPILER_ID MATCHES "Clang")
 		string(APPEND STRICT_OPTIONS_CPP
 			" -Wno-error=unknown-warning-option -Qunused-arguments -Wno-tautological-compare"
 			" -Wno-unused-function -Wno-pass-failed")
 	endif()
 	# ENABLE_STRICT turns warnings into errors and widens the warning set.
 	if(ENABLE_STRICT)
 		string(APPEND STRICT_OPTIONS_C " -Werror -Wextra -Wno-unused-parameter -fno-strict-aliasing")
 	endif()
 elseif(ENABLE_STRICT)
 	# MSVC: strict mode only contributes to the CPP bucket.
 	string(APPEND STRICT_OPTIONS_CPP " /WX /Zc:__cplusplus")
 endif()
 # Only the C bucket is applied; the project enables no C++ language, so the
 # CPP/CXX buckets are collected but not used here.
 string(APPEND CMAKE_C_FLAGS " ${STRICT_OPTIONS_C}")
--- a/.cmake/gmpconfig.cmake
+++ b/.cmake/gmpconfig.cmake
@@ -0,0 +1,36 @@
 # Locates the GMP library used by the project and stores its path in GMP.
 #
 #  ENABLE_GMP_BUILD=ON   download and build a private libgmp inside the build
 #                        tree (target libgmp_external).
 #    ENABLE_GMP_STATIC     ON: use the static archive; OFF (default): use the
 #                          shared library.
 #    GMP_BUILD_CONFIG_ARGS extra arguments handed to GMP's configure script.
 #  ENABLE_GMP_BUILD=OFF  use the GMP installation found on the system.
 #
 # In both cases the GMP include directory is added to the directory-wide
 # include path.
 if (ENABLE_GMP_BUILD)
  # Download and build own libgmp version
  if (POLICY CMP0135)
    cmake_policy(SET CMP0135 NEW)
  endif()
  set(GMP_BUILD_CONFIG_ARGS "" CACHE STRING "Some user-specified gmp config options")
  option(ENABLE_GMP_STATIC "Option to statically link. Default is dynamic linking" OFF)
  # Pick the file name pieces that match the requested linkage. The prefix and
  # the suffix must both come from the same library kind.
  if (ENABLE_GMP_STATIC)
    set(GMP_LIB_PREFIX ${CMAKE_STATIC_LIBRARY_PREFIX})
    set(GMP_LIB_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX})
  else()
    set(GMP_LIB_PREFIX ${CMAKE_SHARED_LIBRARY_PREFIX})
    set(GMP_LIB_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX})
  endif()
  message("${GMP_BUILD_CONFIG_ARGS}")
  include(ExternalProject)
  find_program(MAKE_EXE NAMES make gmake nmake)
  set(libgmp_INSTALL_DIR "${CMAKE_BINARY_DIR}/libgmp")
  # Path of the library produced by the external build; computed up front so
  # it can be declared as a byproduct below.
  set(GMP ${libgmp_INSTALL_DIR}/lib/${GMP_LIB_PREFIX}gmp${GMP_LIB_SUFFIX})
  ExternalProject_Add(libgmp_external
    PREFIX ${libgmp_INSTALL_DIR}
    URL               https://gmplib.org/download/gmp/gmp-6.2.1.tar.xz
    URL_HASH          SHA256=fd4829912cddd12f84181c3451cc752be224643e87fac497b69edddadc49b4f2
    CONFIGURE_COMMAND ${libgmp_INSTALL_DIR}/src/libgmp_external/configure --prefix=${libgmp_INSTALL_DIR} ${GMP_BUILD_CONFIG_ARGS}
    BUILD_COMMAND     ${MAKE_EXE} -j8
    INSTALL_COMMAND   ${MAKE_EXE} install
    # Lets generators such as Ninja know which step creates the library file.
    BUILD_BYPRODUCTS  ${GMP}
  )
  include_directories(${libgmp_INSTALL_DIR}/include)
 else()
  # use system gmp version
  find_library(GMP gmp)
  find_path(GMP_INCLUDE gmp.h)
  # Fail with a clear message instead of a generic NOTFOUND error later on.
  if (NOT GMP OR NOT GMP_INCLUDE)
    message(FATAL_ERROR "GMP library or gmp.h not found; install GMP or configure with -DENABLE_GMP_BUILD=ON")
  endif()
  include_directories(${GMP_INCLUDE})
 endif()
--- a/.cmake/impl_type.cmake
+++ b/.cmake/impl_type.cmake
@@ -0,0 +1,11 @@
 # Selects the implementation sub-directory of the current source directory:
 # "broadwell" or "opt" when SQISIGN_BUILD_TYPE asks for it and the directory
 # exists, otherwise "ref" if present. Configuration stops with an error when
 # none of them applies.
 # CCSD_NAME / CCSD_NAME_UPPER receive the current source directory's name
 # (as-is and upper-cased).
 get_filename_component(CCSD_NAME "${CMAKE_CURRENT_SOURCE_DIR}" NAME)
 string(TOUPPER "${CCSD_NAME}" CCSD_NAME_UPPER)
 if(SQISIGN_BUILD_TYPE MATCHES "broadwell" AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/broadwell)
    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/broadwell)
 elseif(SQISIGN_BUILD_TYPE MATCHES "opt" AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/opt)
    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/opt)
 elseif(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ref)
    # Fallback used regardless of the requested build type.
    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ref)
 else()
    message(FATAL_ERROR "No matching implementation found")
 endif()
--- a/.cmake/sanitizers.cmake
+++ b/.cmake/sanitizers.cmake
@@ -0,0 +1,40 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 # C compiler flags for additional build types, selected with
 # -DCMAKE_BUILD_TYPE=<ASAN|LSAN|MSAN|UBSAN|COVERAGE|PERF|GPROF>.
 # The values are cache defaults only: they are not FORCEd, so a value passed
 # on the command line (e.g. -DCMAKE_C_FLAGS_ASAN=...) or edited in the cache
 # is kept instead of being overwritten on every configure run.
 #
 # AddressSanitizer
 set(CMAKE_C_FLAGS_ASAN
    "-fsanitize=address -fno-optimize-sibling-calls -fsanitize-address-use-after-scope -fno-omit-frame-pointer -g -O1"
    CACHE STRING "Flags used by the C compiler during AddressSanitizer builds.")
 # LeakSanitizer
 set(CMAKE_C_FLAGS_LSAN
    "-fsanitize=leak -fno-omit-frame-pointer -g -O1"
    CACHE STRING "Flags used by the C compiler during LeakSanitizer builds.")
 # MemorySanitizer
 set(CMAKE_C_FLAGS_MSAN
    "-fsanitize=memory -fno-optimize-sibling-calls -fsanitize-memory-track-origins=2 -fno-omit-frame-pointer -g -O1"
    CACHE STRING "Flags used by the C compiler during MemorySanitizer builds.")
 # UndefinedBehaviour
 set(CMAKE_C_FLAGS_UBSAN
    "-fsanitize=undefined"
    CACHE STRING "Flags used by the C compiler during UndefinedBehaviourSanitizer builds.")
 # Coverage instrumentation
 set(CMAKE_C_FLAGS_COVERAGE
    "-fprofile-arcs -ftest-coverage"
    CACHE STRING "Flags used by the C compiler during Coverage builds.")
 # Profiling with perf or pprof
 set(CMAKE_C_FLAGS_PERF
    "-ggdb"
    CACHE STRING "Flags used for profiling with perf or pprof.")
 # Profiling with gprof
 set(CMAKE_C_FLAGS_GPROF
    "-g -pg"
    CACHE STRING "Flags used for profiling with gprof.")
--- a/.cmake/sqisign_variant.cmake
+++ b/.cmake/sqisign_variant.cmake
@@ -0,0 +1,26 @@
 # Publishes, for the current module (CCSD_NAME / CCSD_NAME_UPPER) and every
 # variant listed in SVARIANT_S, the cache entries
 #   LIB_<MODULE>_<VARIANT>  library target name
 #   INC_<MODULE>_<VARIANT>  include directory
 # and adds the sub-directories that are expected to define those targets.
 if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/generic)
    # Parameter-dependent module: each variant needs its own sub-directory.
    if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/include)
        set(INC_${CCSD_NAME_UPPER} ${CMAKE_CURRENT_SOURCE_DIR}/include CACHE INTERNAL "LIB")
    endif()
    foreach(SVARIANT ${SVARIANT_S})
        string(TOUPPER ${SVARIANT} SVARIANT_UPPER)
        string(TOLOWER ${SVARIANT} SVARIANT_LOWER)
        set(LIB_${CCSD_NAME_UPPER}_${SVARIANT_UPPER} sqisign_${CCSD_NAME}_${SVARIANT} CACHE INTERNAL "LIB")
        if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${SVARIANT})
            message(FATAL_ERROR "No matching implementation found for variant ${SVARIANT}")
        else()
            set(INC_${CCSD_NAME_UPPER}_${SVARIANT_UPPER} ${CMAKE_CURRENT_SOURCE_DIR}/${SVARIANT}/include CACHE INTERNAL "INC")
            add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${SVARIANT})
        endif()
    endforeach()
 else()
    # Parameter-independent module: a single "generic" implementation serves
    # every variant, so the per-variant entries alias the generic ones.
    set(LIB_${CCSD_NAME_UPPER} sqisign_${CCSD_NAME}_generic CACHE INTERNAL "LIB")
    set(INC_${CCSD_NAME_UPPER} ${CMAKE_CURRENT_SOURCE_DIR}/generic/include CACHE INTERNAL "LIB")
    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/generic)
    foreach(SVARIANT ${SVARIANT_S})
        string(TOUPPER ${SVARIANT} SVARIANT_UPPER)
        string(TOLOWER ${SVARIANT} SVARIANT_LOWER)
        set(LIB_${CCSD_NAME_UPPER}_${SVARIANT_UPPER} ${LIB_${CCSD_NAME_UPPER}} CACHE INTERNAL "LIB")
        set(INC_${CCSD_NAME_UPPER}_${SVARIANT_UPPER} ${INC_${CCSD_NAME_UPPER}} CACHE INTERNAL "INC")
    endforeach()
 endif()
--- a/.cmake/target.cmake
+++ b/.cmake/target.cmake
@@ -0,0 +1,39 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 # Target detection: adds TARGET_* / RADIX_* / TARGET_OS_* compile definitions
 # based on CMAKE_SYSTEM_PROCESSOR and the host OS family, and provides
 # defaults for SQISIGN_BUILD_TYPE and SQISIGN_TEST_REPS.
 #
 # CMAKE_SYSTEM_PROCESSOR is quoted in every comparison so that an empty value
 # (possible with some toolchain files) falls through to the generic branch
 # instead of producing a malformed if() expression, and so that its value is
 # never re-interpreted as a variable name.
 if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "aarch64" OR "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "arm64")
    add_definitions(-DTARGET_ARM64)
    add_definitions(-DRADIX_64)
 elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "arm")
    # Checked after the 64-bit names above; handles the remaining ARM names.
    add_definitions(-DTARGET_ARM)
    add_definitions(-DRADIX_32)
 elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "x86_64")
    add_definitions(-DTARGET_AMD64)
    add_definitions(-DRADIX_64)
 elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "i386" OR "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "i686")
    add_definitions(-DTARGET_X86)
    add_definitions(-DRADIX_32)
 elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(s390x.*|S390X.*)")
    add_definitions(-DTARGET_S390X)
    add_definitions(-DTARGET_BIG_ENDIAN)
    add_definitions(-DRADIX_64)
 else()
    # Unknown (or empty) processor name: assume a generic 64-bit target.
    add_definitions(-DTARGET_OTHER)
    add_definitions(-DRADIX_64)
    message("Warning: system architecture not detected, defaulting to 64 bit")
 endif()
 if (UNIX)
    add_definitions(-DTARGET_OS_UNIX)
 else()
    add_definitions(-DTARGET_OS_OTHER)
 endif()
 set(C_OPT_FLAGS "")
 # Defaults that apply only when the including project has not defined them.
 if ((NOT DEFINED SQISIGN_BUILD_TYPE))
  set(SQISIGN_BUILD_TYPE opt)
 endif()
 if ((NOT DEFINED SQISIGN_TEST_REPS))
  set(SQISIGN_TEST_REPS 1000)
 endif()
--- a/.cmake/target_docs.cmake
+++ b/.cmake/target_docs.cmake
@@ -0,0 +1,17 @@
 # Defines the 'doc' target, which runs Doxygen on the project's Doxyfile.
 # Doxygen is mandatory once this file is included: configuration aborts
 # when it cannot be found.
 find_package(Doxygen)
 if(NOT DOXYGEN_FOUND)
    message(FATAL_ERROR "Doxygen not found - can't generate docs.")
 else()
  # Configuration file consumed by Doxygen
  set(DOXYGEN_CONF ${PROJECT_SOURCE_DIR}/Doxyfile)
  # 'make doc' runs Doxygen from the project root
  add_custom_target(doc
    COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_CONF}
    WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
    COMMENT "Generating API documentation with Doxygen"
    VERBATIM )
 endif()
--- a/.dir-locals.el
+++ b/.dir-locals.el
@@ -0,0 +1,16 @@
 ;; Emacs style file
 ;;
 ;; Sets spaces-only indentation, 4-spaces tab stops, linux kernel
 ;; coding style
 (
 ;; All modes: indent with spaces only, tab stops every 4 columns.
 (nil . ((indent-tabs-mode . nil)
         (tab-width . 4)
         )
      )
 ;; NOTE(review): this entry is keyed by a variable name (c-default-style)
 ;; where the neighbouring entries use a mode name or nil -- confirm that
 ;; it has the intended effect.
 (c-default-style . ((c-mode . "linux")
                     ))
 ;; C buffers: "linux" file style with a basic offset of 4.
 (c-mode . ((c-file-style . "linux")
            (c-basic-offset . 4)
            )
         )
 )
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -0,0 +1,121 @@
 # CI workflow: configures, builds and tests the library, builds the Doxygen
 # PDF and re-runs the test suite under the Clang sanitizers.
 name: CMake
 on:
  push:
    branches: [ '*' ]
  pull_request:
    branches: [ "main" ]
 env:
  # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
  BUILD_TYPE: Release
 jobs:
  build:
    # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac.
    # You can convert this to a matrix build if you need cross-platform coverage.
    # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
    runs-on: ubuntu-latest
    strategy:
      matrix:
        sqisign_build_type: [opt]
    steps:
    - uses: actions/checkout@v3
    - name: Set up Python 3.10
      uses: actions/setup-python@v3
      with:
        python-version: "3.10"
    - name: Install dependencies Valgrind, GMP, Doxygen, TeX
      run: |
          sudo apt --fix-missing install valgrind libgmp-dev doxygen texlive-xetex
          echo "Valgrind installed"
    - name: Install Valgrind dependencies
      run: |
        python -m pip install --upgrade pip
        pip install ValgrindCI
    - name: Configure CMake
      # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
      # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
      run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }}
    - name: Build
      # Build your program with the given configuration
      run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
    - name: Build documentation
      # Create html and latex documentation, TODO: do we need different docs for ref and opt?
      run: doxygen Doxyfile && cd latex && xelatex refman
    - name: Upload latex documentation
      # v3 of the artifact actions has been retired by GitHub; v4 is required
      # for the upload to succeed.
      uses: actions/upload-artifact@v4
      with:
        name: docs
        path: latex/refman.pdf
    - name: Test
      working-directory: ${{github.workspace}}/build
      # Execute tests defined by the CMake configuration.
      # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
      run: ctest -C ${{env.BUILD_TYPE}}
    - name: Examples
      # Currently disabled.
      if: false
      working-directory: ${{github.workspace}}/build/apps
      run: |
          ./PQCgenKAT_sign_lvl1
          ./PQCgenKAT_sign_lvl1_varp6983
          ./example_nistapi_lvl1
          ./example_nistapi_lvl1_varp6983
    - name: CT-Tests
      # TODO: re-enable for those tests that should be ct
      if: false
      run: |
          rm -rf build
          cmake -Bbuild -DENABLE_CT_TESTING=ON -DCMAKE_BUILD_TYPE=Debug -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }}
          cmake --build build
          # valgrind --track-origins=yes build/
          # valgrind --track-origins=yes build/
          # valgrind --track-origins=yes build/
          # valgrind --track-origins=yes build/
    - name: Memcheck
      run: |
          rm -rf build
          cmake -Bbuild -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} -DSQISIGN_TEST_REPS=10
          cmake --build build
          ctest -T memcheck --test-dir build
      # Currently disabled.
      if: false
    # Each sanitizer step below starts from a fresh build directory and
    # builds with clang using the matching custom CMAKE_BUILD_TYPE.
    - name: Address Sanitizer ASAN
      run: |
          rm -rf build
          cmake -Bbuild -DCMAKE_BUILD_TYPE=ASAN -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} -DCMAKE_C_COMPILER=clang
          cmake --build build
          ctest -v --test-dir build
    - name: Memory Sanitizer MSAN
      run: |
          rm -rf build
          cmake -Bbuild -DCMAKE_BUILD_TYPE=MSAN -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} -DCMAKE_C_COMPILER=clang
          cmake --build build
          ctest -v --test-dir build
    - name: Leak Sanitizer LSAN
      run: |
          rm -rf build
          cmake -Bbuild -DCMAKE_BUILD_TYPE=LSAN -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} -DCMAKE_C_COMPILER=clang
          cmake --build build
          ctest -v --test-dir build
    - name: Undefined Behavior Sanitizer UBSAN
      run: |
          rm -rf build
          cmake -Bbuild -DCMAKE_BUILD_TYPE=UBSAN -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} -DCMAKE_C_COMPILER=clang
          cmake --build build
          ctest -v --test-dir build
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,3 @@
 build/
 html/
 latex/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -0,0 +1,47 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 # Top-level build script: declares the project and its options, pulls in the
 # helper modules from .cmake/, then descends into src, apps and (optionally)
 # test.
 cmake_minimum_required(VERSION 3.5)
 project(SQIsign VERSION 1.0 LANGUAGES C ASM)
 set(SQISIGN_SO_VERSION 0)
 set(CMAKE_C_STANDARD 99)
 include(CTest)
 # User-selectable switches.
 option(ENABLE_STRICT "Build with strict compile options." OFF)
 option(ENABLE_TESTS "Enable compilation of tests." ON)
 option(ENABLE_CT_TESTING "Enable compilation for constant time testing." OFF)
 option(ENABLE_GMP_BUILD "Download and build external version of GMP" OFF)
 option(ENABLE_DOC_TARGET "Enable building API documentation using doxygen" OFF)
 # Implementation flavour: "ref" unless the caller defined something else.
 if(NOT DEFINED SQISIGN_BUILD_TYPE)
 	set(SQISIGN_BUILD_TYPE ref)
 endif()
 # Parameter sets to build: all three levels, restricted to lvl1 for the
 # broadwell build type.
 set(SVARIANT_S lvl1 lvl3 lvl5)
 if(SQISIGN_BUILD_TYPE STREQUAL "broadwell")
 	set(SVARIANT_S lvl1)
 endif()
 # Helper modules: compiler flags, sanitizer build types, target detection,
 # optional documentation target, GMP lookup.
 include(.cmake/flags.cmake)
 include(.cmake/sanitizers.cmake)
 include(.cmake/target.cmake)
 if(ENABLE_DOC_TARGET)
 	include(.cmake/target_docs.cmake)
 endif()
 include(.cmake/gmpconfig.cmake)
 # Paths made available to the sub-directories.
 set(SELECT_IMPL_TYPE ${PROJECT_SOURCE_DIR}/.cmake/impl_type.cmake)
 set(SELECT_SQISIGN_VARIANT ${PROJECT_SOURCE_DIR}/.cmake/sqisign_variant.cmake)
 set(INC_PUBLIC ${PROJECT_SOURCE_DIR}/include)
 add_subdirectory(src)
 add_subdirectory(apps)
 # The test tree is only configured when ENABLE_TESTS is on.
 if(ENABLE_TESTS)
 	enable_testing()
 	add_subdirectory(test)
 endif()
--- a/DEVELOPERS.md
+++ b/DEVELOPERS.md
@@ -0,0 +1,193 @@
 # Developer guidelines
 Please read carefully before contributing to this repo.
 ## Code structure
 The code is split into the modules below:
 - `common`: common code for AES, SHAKE, (P)RNG, memory handling. Every
  module that needs a hash function, seed expansion (e.g., KLPT), or
  deterministic randomness for tests should call into this module.
 - `uintbig`: multi-precision big integers.
 - `gf`: GF(p^2) and GF(p) arithmetic.
 - `ec`: elliptic curves, isogenies and pairings. Everything that is
  purely finite-fieldy.
 - `quaternion`: quaternion orders and ideals. This is, essentially,
  replacing PARI/GP.
 - `klpt`: implementation of KLPT.
 - `id2iso`: code for Iso <-> Ideal.
 - `util`: auxiliary code shared among libraries.
 The sources for the modules are in [`src/`](src).  Each module is
 structured as follows:
 ```
 SQIsign
 └── src
    └── <module_name>
        ├── <arch>
        │   ├── generic
        │   └── lvl1
        ├── opt
        │   ├── generic
        │   ├── lvl1
        │   ├── lvl1_var1
        │   ├── lvl3
        │   └── lvl5
        └── ref
            └── generic
 ```
 where:
 - `<module_name>` is the name of the module.
 - `<arch>` are optional architecture-specific implementations of the
  module (e.g., `broadwell` for code using assembly instructions
  specific to the Broadwell platform).
 - `opt` and `ref` are the portable *optimized* and *reference*
  implementations.
 - `lvl1`, `lvl3`, `lvl5` are parameter-dependent implementations of
  the module, corresponding to NIST levels 1, 3 and 5, respectively.
 - `lvl1_var1` is a variant of `lvl1`, e.g., using a different prime
  characteristic. The naming is free, and implementors are encouraged
  to choose more explicit naming, e.g., `lvl1_varp6983` for the
  variant using the `p6983` prime defined in the SQIsign AC20 variant.
 - `generic` is a parameter-independent implementation of the module.
  If no folder is named like the currently selected variant (see
  [Build](README.md#Build)), then this is compiled instead.
 Each of the folders above is allowed to be a symlink. E.g., if a
 module has no separate optimized and reference implementation, then
 `opt` can be a symlink to `ref`. Another example: if a module's code
 only depends on the field size, but not on the specific prime, then
 `lvl1_varp6983` could be a symlink to `lvl1`.
 ### Contents of a module
 The leaf folders described above should arrange code as described
 below.  We use the `generic` implementation of the `uintbig` module as
 an example.
 ```
 generic
 ├── bench
 │   ├── CMakeLists.txt
 │   ├── bench1.c
 │   └── bench2.c
 ├── include
 │   └── uintbig.h
 ├── test
 │   ├── CMakeLists.txt
 │   ├── test1.c
 │   └── test2.c
 ├── CMakeLists.txt
 ├── internal_header.h
 ├── source1.c
 └── source2.c
 ```
 where:
 - `include/` shall contain a **unique header file** named
  `<module_name>.h`, where `<module_name>` is the name of the module.
  This header contains the public API of the module, and is the only
  header that can be included by other modules (e.g., via `#include
  <uintbig.h>`). These files must contain extensive doxygen-formatted
  documentation describing the module, see
  [Documentation](#Documentation).
 - `bench` and `test` contain one executable per file, containing,
  well, benchmarks and unit tests. Refer to [Benchmarks](#Benchmarks)
  and [Tests](#Tests) for instructions on how to write these.
 - Internal headers for the private use of the module, such as
  `internal_header.h` go to the root. Include these using `#include
  "internal_header.h"`.
 - The implementation of the module also goes into the root.
 ## Tests
 It is important to have extensive test coverage of the whole software.
 Each module must have its own unit tests, as well as integration tests
 to ensure consistency across the modules.
 ### Unit tests
 These go into `src/<module_name>/<ref|opt|...>/<generic|lvl1|...>/test/`.
 Refer to ... for an example of how to write tests.
 ### Integration tests
 These go into `test/`.  Refer to
 [`test/test_sqisign.c`](test/test_sqisign.c) for an example.
 ### Known Answer Tests (KAT)
 KATs help validate consistency across implementations by ensuring
 that, e.g., the optimized and reference implementations produce the
 same signatures.
 See [Known Answer Tests in README.md](README.md#known-answer-tests-kat).
 ## Benchmarks
 Benchmarks for a module go into
 `src/<module_name>/<ref|opt|...>/<generic|lvl1|...>/bench/`.  Global
 benchmarks go...
 ## Documentation
 Use [Doxygen headers](https://www.doxygen.nl/manual/docblocks.html)
 for documentation.
 All code should be extensively documented.  The public module headers
 **MUST** be thoroughly documented.
 CI automatically builds a PDF of the doc every time code is pushed.
 To download the PDF, go to
 [Actions](https://github.com/SQIsign/sqisign-nist/actions), click on
 the workflow run you're interested in, then go to Artifacts -> docs
 (see figure).
 ![](https://user-images.githubusercontent.com/149199/231756751-0f2780f8-33fe-4db9-8800-b5f145423b65.png)
 ## Branches and pull requests
 Always work on topic branches, never push work in progress on the
 `main` branch.  Once a task / issue / work unit is completed, create a
 pull-request and ask your team leader for a review.
 ## Coding style
 - **C version**: All code must compile cleanly as *C99*, without
  emitting any warnings, using recent versions of GCC and clang.
 - **Names**: Externally visible functions and types should be prefixed
  with the name of the module they belong to.
 - **Aliases**: Do use `typedef` with descriptive names whenever it
  makes any nonzero amount of sense to do so.
  Avoid `typedef`s for things other than `struct`s (or elementary data
  types); they have a tendency to break for array and pointer types if
  programmers are not aware of the nature of the underlying type.
 - **Parameters**: Output arguments, if any, should always come first.
  Input arguments should generally be marked `const`. Objects of types
  which typically fit into registers should be passed and returned by
  value, larger objects by reference (i.e., as a pointer).
  If certain arguments often appear together, it may be an indication
  that they should be wrapped as a `struct`.
 - **Global variables**: Global *constants* are acceptable if needed,
  especially within modules whose code already implicitly relies on
  the same constants anyway (primary example: 𝔽ₚ). It is often a good
  idea to group global constants in a meaningful `struct` and write
  the code such that the struct could easily be replaced by a runtime
  variable at a later point.
  Global *state* (modifiable global variables), on the other hand, is
  strictly forbidden.
 - **Whitespace**: Try not to mix tabs and spaces. Line endings
  should be UNIX-style (i.e., `\n` rather than `\r\n`). Whitespace
  characters at the end of a line, or by themselves on an otherwise
  empty line, are to be avoided.
--- a/2443
+++ b/2443
--- a/2443
+++ b/2443
--- a/KAT/PQCsignKAT_1138_lvl3.req
+++ b/KAT/PQCsignKAT_1138_lvl3.req
--- a/KAT/PQCsignKAT_1138_lvl3.rsp
+++ b/KAT/PQCsignKAT_1138_lvl3.rsp
--- a/KAT/PQCsignKAT_1509_lvl5.req
+++ b/KAT/PQCsignKAT_1509_lvl5.req
--- a/KAT/PQCsignKAT_1509_lvl5.rsp
+++ b/KAT/PQCsignKAT_1509_lvl5.rsp
--- a/KAT/PQCsignKAT_782_lvl1.req
+++ b/KAT/PQCsignKAT_782_lvl1.req
--- a/KAT/PQCsignKAT_782_lvl1.rsp
+++ b/KAT/PQCsignKAT_782_lvl1.rsp
--- a/202
+++ b/202
@@ -0,0 +1,202 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/
   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
   1. Definitions.
      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.
      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.
      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.
      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.
      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.
      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.
      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).
      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.
      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."
      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.
   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.
   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.
   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:
      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and
      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and
      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and
      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.
      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.
   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.
   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.
   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.
   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.
   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.
   END OF TERMS AND CONDITIONS
   APPENDIX: How to apply the Apache License to your work.
      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "{}"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.
   Copyright {yyyy} {name of copyright owner}
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
--- a/13
+++ b/13
@@ -0,0 +1,13 @@
 Copyright 2023 the SQIsign team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
--- a/README.md
+++ b/README.md
@@ -0,0 +1,246 @@
 # SQIsign
 This library is a C implementation of SQIsign, short for Short Quaternion and Isogeny Signature (from isogeny graphs of supersingular elliptic curves).
 ## Requirements
 - CMake (version 3.5 or later)
 - C99-compatible compiler
 - Valgrind (for dynamic testing)
 - Clang static analyzer (version 10 or later, for static analysis)
 - GMP (version 6.1.2 or later)
 ## Build
 - `mkdir -p build`
 - `cd build`
 - `cmake -DSQISIGN_BUILD_TYPE=<ref/broadwell> ..`
 - `make`
 ## Build options
 CMake build options can be specified with `-D<BUILD_OPTION>=<VALUE>`.
 ### ENABLE_TESTS
 Builds a test harness for the library, the default value is `ON`.
 ### ENABLE_CT_TESTING
 Builds the library with instrumentation for constant-time behavior testing, the default value is `OFF`. Valgrind development files are used for this build option.
 ### ENABLE_GMP_BUILD
 If set to `OFF` (the default), the gmp library on the system is dynamically linked.
 If set to `ON`, a custom gmp library is linked, which is built as part of the overall build process. 
 In the latter case, the following further options are available:
 - `ENABLE_GMP_STATIC`: Does static linking against gmp. The default is `OFF`.
 - `GMP_BUILD_CONFIG_ARGS`: Provides additional config arguments for the gmp build (for example `--disable-assembly`). By default, no config arguments are provided.
 ### ENABLE_DOC_TARGET
 If set to `ON`, a doc target is available that builds API documentation. Note that a doxygen installation is required if set to `ON`.
 The default is `OFF`.
 ### SQISIGN_BUILD_TYPE
 Specifies the build type for which SQIsign is built. The currently supported flags are:
 - `ref`, which builds the plain C reference implementation.
 - `broadwell`, which builds an additional implementation with GF assembly optimized code for the Intel Broadwell architecture.
 ### SQISIGN_TEST_REPS
 Specifies and overrides the number of (self-)test repetitions to be run.
 ### CMAKE_BUILD_TYPE
 Can be used to specify special build types. The options are:
 - `Release`: Builds with optimizations enabled and assertions disabled.
 - `Debug`: Builds with debug symbols.
 - `ASAN`: Builds with AddressSanitizer memory error detector.
 - `MSAN`: Builds with MemorySanitizer detector for uninitialized reads.
 - `LSAN`: Builds with LeakSanitizer for run-time memory leak detection.
 - `UBSAN`: Builds with UndefinedBehaviorSanitizer for undefined behavior detection.
 The default build type uses the flags `-O3 -Wstrict-prototypes -Wno-error=strict-prototypes -fvisibility=hidden -Wno-error=implicit-function-declaration -Wno-error=attributes`. (Notice that assertions remain enabled in this configuration, which harms performance.)
 ## Build artifacts
 The following libraries are built:
 - `libsqisign_common_sys.a`: library with common crypto - AES, Keccak and system random number generator.
 - `libsqisign_common_test.a`: library with common crypto for deterministic tests - AES, Keccak and CTR-DRBG PRNG.
 - `libsqisign_<level>.a`: library for `SQIsign_<level>`.
 - `libsqisign_<level>_test.a`: library for `SQIsign_<level>`, only for test, using the deterministic CTR-DRBG as backend.
 - `libsqisign_<level>_nistapi.a`: library for `SQIsign_<level>` against the NIST API.
 - `libsqisign_<level>_nistapi_test.a`: library for `SQIsign_<level>` against the NIST API. Only for test, using the deterministic CTR-DRBG as backend.
 - `libsqisign_gf_<level>.a`: gf sub-library, generic or for `<level>`
 - `libsqisign_ec_<level>.a`: ec sub-library, generic or for `<level>`
 - `libsqisign_klpt_<level>.a`: klpt sub-library, generic or for `<level>`
 - `libsqisign_intbig_generic.a`: intbig sub-library, generic
 - `libsqisign_quaternion_generic.a`: quaternion sub-library, generic
 - `libsqisign_id2iso_<level>.a`: id2iso sub-library, generic or for `<level>`
 The following test apps are built:
 - `sqisign_bench_<level>`: Benchmarking suites.
 - `sqisign_test_kat_<level>`: KAT test suites.
 - `sqisign_test_scheme_<level>`: Self-test suites.
 - `sqisign_test_prof_<level>`: Profiling suites.
 More apps are built in folder `build/apps`:
 - `PQCgenKAT_sign_<param>`: App for generating NIST KAT.
 - `example_nistapi_<param>`: Example app using the NIST API.
 ## Test
 In the build directory, run: `make test` or `ctest`.
 The test harness consists of the following units:
 - KAT test: tests against the KAT files in the `KAT` folder - `SQIsign_<level>_KAT`
 - Self-tests: runs random self-tests (key-generation, signing and verifying) - `SQIsign_<level>_SELFTEST`
 - Sub-library specific unit-tests.
 ## Known Answer Tests (KAT)
 KAT are available in folder `KAT`. They can be generated by running the apps built in the `apps` folder:
 - `apps/PQCgenKAT_sign_<level>`
 A successful execution will generate the `.req` and `.rsp` files.
 KAT verification is done as part of the test harness (see previous section).
 ## Benchmarks
 A benchmarking suite is built and runs with the following command:
 - `test/sqisign_bench_<param> <runs>`, where `<param>` specifies the SQIsign parameter set and `<runs>` the number of benchmark runs.
 The benchmarks profile the `KeyGen`, `Sign` and `Verify` functions. The results are reported in CPU cycles if available on the host platform, and timing in nanoseconds otherwise.
 ## Examples
 Example code that demonstrates how to use SQIsign is available in the `apps` folder:
 - `apps/example_nistapi.c`: Example with the NIST API.
 ## Project Structure
 The SQIsign library consists of a number of sub-libraries used to implement the final SQIsign library.
 The dependencies are depicted below.
 ```
    ┌─┬──────┬─┐           ┌─┬────┬─┐            ┌─┬──────┬─┐
    │ ├──────┤ │           │ ├────┤ │            │ ├──────┤ │
    │ │Keygen│ │           │ │Sign│ │            │ │Verify│ │
    │ ├──────┤ │           │ ├────┤ │            │ ├──────┤ │
    └─┴───┬──┴─┘           └─┴─┬──┴─┘            └─┴───┬──┴─┘
          │                    │                       │
          │                    │                       │
          ├────────────────────┼─────────────────┐     │
          │                    │                 │     │
          │                    │                 │     │
      ┌───▼──┐          ┌──────▼────────┐   ┌────▼─────▼───────────┐
      │ PRNG ◄────┬─────┤ Iso <-> Ideal ├───►   Elliptic Curves,   │
      └───▲──┘    │     └──────┬────────┘   │ Pairings & Isogenies │
          │       │            │            └───▲──────┬───────────┘
          │       │            │                │      │
      ┌───┴──┐    │            │                │      │
      │ KLPT ◄────┘            │     ┌──────────┘      │
      └───┬──┘                 │     │                 │
          │                    │     │                 │
 ┌─────────▼─────────┐          │     │                 │
 │ Quaternion orders │          │     │            ┌────▼───┐
 │     and ideals    │          │     │            │ GF(p²) │
 └─────────┬─────────┘          │     │            └────┬───┘
          │           ┌─┬──────▼─────┴──┬─┐            │
    ┌─────▼─────┐     │ ├───────────────┤ │      ┌─────▼─────┐
    │ MP BigInt │     │ │Precomputations│ │      │ FP BigInt │
    └───────────┘     │ ├───────────────┤ │      └───────────┘
                      └─┴───────────────┴─┘                       
 ```
 There are the following sub-libraries:
 - `common`: common code for AES, SHAKE, (P)RNG, memory handling
 - `ec`: elliptic curves, isogenies and pairings
 - `gf`: GF(p^2) and GF(p) arithmetic, including FP BigInt
 - `id2iso`: code for Iso <-> Ideal
 - `klpt`: implementation of KLPT
 - `quaternion`: quaternion orders and ideals
 - `intbig`: multi-precision big integers
 - `precomp`: precomputed constants
 - `protocols`: protocol implementation
 ### Folder structure
 Folder levels after `src`:
 ```
 SQIsign
 └── src
    ├── lib_1
    │   ├── broadwell
    │   │   ├── generic
    │   │   └── lvl1
    │   ├── opt
    │   │   ├── generic
    │   │   └── lvl1
    │   └── ref
    │       └── generic
    ├── lib_2
    │   ├── broadwell
    │   │   └── generic
    │   ├── opt
    │   │   └── generic
    │   └── ref
    │       └── generic
    └── lib_n
        ├── broadwell
        │   └── generic
        ├── opt
        │   └── generic
        └── ref
            └── generic
 ```
 Level 1: Library (e.g. quaternion). A `CMakeLists.txt` file with entry `include(${SELECT_IMPL_TYPE})` takes care of including the implementation Level 2.
 Level 2: Implementation type: reference C (ref), optimized C (opt), ASM-optimized (e.g. broadwell, neon, m4). A `CMakeLists.txt` file entry with `include(${SELECT_SQISIGN_VARIANT})` takes care of including the SQIsign variant.
 Level 3: SQIsign variant -> generic code or code for a specific parameter set (e.g. lvl1). 
 Other folders:
 - `apps`: Applications: KAT generation application, examples
 - `include`: SQIsign public header files
 - `KAT`: Known Answer Test files
 - `test`: SQIsign test code
 ### Sub-library headers
 Sub-libraries can define their own headers, which may be different between the implementation types. These header files are used sub-library-internally and by other dependent sub-libraries. The convention is to put the headers in an `include` folder of the sub-library src directory. For example, `src/intbig/ref/generic/include/intbig.h`.
 ### Sub-library unit tests
 Sub-libraries can implement their own, self-contained unit tests. The convention is to put the unit tests in a `test` folder of the sub-library `src` directory. For example, `src/intbig/ref/generic/test/test_intbig.c`.
 ### Shared implementation types
 It is possible to share implementations between implementation types. For example, the broadwell optimized implementation might use the same code as the reference implementation except in the GF module.
 ## License
 SQIsign is licensed under Apache-2.0. See [LICENSE](LICENSE) and [NOTICE](NOTICE).
 Third party code is used in some test and common code files:
 - `src/common/aes_c.c`: MIT: "Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>"
 - `src/common/fips202.c`: Public Domain
 - `src/common/randombytes_system.c`: MIT: Copyright (c) 2017 Daan Sprenkels <hello@dsprenkels.com>
 - `apps/PQCgenKAT_sign.c`, `common/randombytes_ctrdrbg.c`, `test/test_kat.c`: by NIST (Public Domain)
--- a/apps/CMakeLists.txt
+++ b/apps/CMakeLists.txt
@@ -0,0 +1,15 @@
 # NIST KAT generation apps: one PQCgenKAT_sign_<variant> executable for every
 # entry of SVARIANT_S, linked against that variant's test NIST-API library.
 foreach(SVARIANT ${SVARIANT_S})
    string(TOLOWER "${SVARIANT}" SVARIANT_LOWER)
    add_executable(
        "PQCgenKAT_sign_${SVARIANT_LOWER}"
        PQCgenKAT_sign.c
    )
    target_link_libraries(
        "PQCgenKAT_sign_${SVARIANT_LOWER}"
        PRIVATE "sqisign_${SVARIANT_LOWER}_test_nistapi"
    )
    target_include_directories(
        "PQCgenKAT_sign_${SVARIANT_LOWER}"
        PRIVATE ../include
    )
 endforeach()
 # Examples with NIST API: one example_nistapi_<variant> executable for every
 # entry of SVARIANT_S, linked against that variant's NIST-API library.
 foreach(SVARIANT ${SVARIANT_S})
    string(TOLOWER "${SVARIANT}" SVARIANT_LOWER)
    add_executable(
        "example_nistapi_${SVARIANT_LOWER}"
        example_nistapi.c
    )
    target_link_libraries(
        "example_nistapi_${SVARIANT_LOWER}"
        PRIVATE "sqisign_${SVARIANT_LOWER}_nistapi"
    )
    # The variant-specific source directory is added next to the public headers.
    target_include_directories(
        "example_nistapi_${SVARIANT_LOWER}"
        PRIVATE
            ../include
            "../src/${SVARIANT_LOWER}"
    )
 endforeach()
--- a/apps/PQCgenKAT_sign.c
+++ b/apps/PQCgenKAT_sign.c
@@ -0,0 +1,280 @@
 // SPDX-License-Identifier: Apache-2.0 and Unknown
 /*
 NIST-developed software is provided by NIST as a public service. You may use,
 copy, and distribute copies of the software in any medium, provided that you
 keep intact this entire notice. You may improve, modify, and create derivative
 works of the software or any portion of the software, and you may copy and
 distribute such modifications or works. Modified works should carry a notice
 stating that you changed the software and should note the date and nature of any
 such change. Please explicitly acknowledge the National Institute of Standards
 and Technology as the source of the software.
 NIST-developed software is expressly provided "AS IS." NIST MAKES NO WARRANTY OF
 ANY KIND, EXPRESS, IMPLIED, IN FACT, OR ARISING BY OPERATION OF LAW, INCLUDING,
 WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A
 PARTICULAR PURPOSE, NON-INFRINGEMENT, AND DATA ACCURACY. NIST NEITHER REPRESENTS
 NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE UNINTERRUPTED OR
 ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST DOES NOT WARRANT OR MAKE
 ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE OR THE RESULTS THEREOF,
 INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY, RELIABILITY, OR
 USEFULNESS OF THE SOFTWARE.
 You are solely responsible for determining the appropriateness of using and
 distributing the software and you assume all risks associated with its use,
 including but not limited to the risks and costs of program errors, compliance
 with applicable laws, damage to or loss of data, programs or equipment, and the
 unavailability or interruption of operation. This software is not intended to be
 used in any situation where a failure could cause risk of injury or damage to
 property. The software developed by NIST employees is not subject to copyright
 protection within the United States.
 */
 #include "api.h"
 #include "rng.h"
 #include <ctype.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #define MAX_MARKER_LEN 50
 #define KAT_SUCCESS 0
 #define KAT_FILE_OPEN_ERROR -1
 #define KAT_DATA_ERROR -3
 #define KAT_CRYPTO_FAILURE -4
 int FindMarker(FILE *infile, const char *marker);
 int ReadHex(FILE *infile, unsigned char *A, int Length, char *str);
 void fprintBstr(FILE *fp, char *S, unsigned char *A, unsigned long long L);
 /*
  * Generates the NIST known-answer-test (KAT) request and response files.
  *
  * Step 1 writes PQCsignKAT_<sk bytes>_<alg>.req with 100 entries; the seed and
  * message of each entry are drawn via randombytes() after the generator has
  * been initialised with the fixed entropy input 0, 1, ..., 47.
  * Step 2 re-reads that file and, per entry, re-initialises the generator with
  * the entry's seed, generates a key pair, signs the message, checks that the
  * signed message opens to the original message and records all values in
  * PQCsignKAT_<sk bytes>_<alg>.rsp.
  *
  * Both files and all per-entry buffers are released on every exit path.
  *
  * Returns KAT_SUCCESS, or one of the negative KAT_* codes on failure.
  */
 int main(void) {
  char fn_req[32], fn_rsp[32];
  FILE *fp_req = NULL, *fp_rsp = NULL;
  unsigned char seed[48];
  unsigned char msg[3300];
  unsigned char entropy_input[48];
  unsigned char *m = NULL, *sm = NULL, *m1 = NULL;
  unsigned long long mlen, smlen, mlen1;
  int count;
  unsigned char pk[CRYPTO_PUBLICKEYBYTES], sk[CRYPTO_SECRETKEYBYTES];
  int ret_val;
  int name_len;
  int status = KAT_SUCCESS;
  // Build both file names with a bounded write; a long algorithm name must not
  // overrun the fixed-size buffers
  name_len = snprintf(fn_req, sizeof(fn_req), "PQCsignKAT_%d_%s.req",
                      CRYPTO_SECRETKEYBYTES, CRYPTO_ALGNAME);
  if (name_len < 0 || (size_t)name_len >= sizeof(fn_req)) {
    printf("ERROR: KAT file name too long\n");
    return KAT_FILE_OPEN_ERROR;
  }
  // Same length as fn_req, so it fits as well
  snprintf(fn_rsp, sizeof(fn_rsp), "PQCsignKAT_%d_%s.rsp",
           CRYPTO_SECRETKEYBYTES, CRYPTO_ALGNAME);
  // Create the REQUEST file
  if ((fp_req = fopen(fn_req, "w")) == NULL) {
    printf("Couldn't open <%s> for write\n", fn_req);
    return KAT_FILE_OPEN_ERROR;
  }
  if ((fp_rsp = fopen(fn_rsp, "w")) == NULL) {
    printf("Couldn't open <%s> for write\n", fn_rsp);
    status = KAT_FILE_OPEN_ERROR;
    goto cleanup;
  }
  for (int i = 0; i < 48; i++)
    entropy_input[i] = i;
  randombytes_init(entropy_input, NULL, 256);
  for (int i = 0; i < 100; i++) {
    fprintf(fp_req, "count = %d\n", i);
    randombytes(seed, 48);
    fprintBstr(fp_req, "seed = ", seed, 48);
    // Message lengths grow by 33 bytes per entry; the last one (3300) exactly
    // fills msg
    mlen = 33 * (i + 1);
    fprintf(fp_req, "mlen = %llu\n", mlen);
    randombytes(msg, mlen);
    fprintBstr(fp_req, "msg = ", msg, mlen);
    fprintf(fp_req, "pk =\n");
    fprintf(fp_req, "sk =\n");
    fprintf(fp_req, "smlen =\n");
    fprintf(fp_req, "sm =\n\n");
  }
  fclose(fp_req);
  // Create the RESPONSE file based on what's in the REQUEST file
  // (fp_req becomes NULL on failure, so cleanup will not close it twice)
  if ((fp_req = fopen(fn_req, "r")) == NULL) {
    printf("Couldn't open <%s> for read\n", fn_req);
    status = KAT_FILE_OPEN_ERROR;
    goto cleanup;
  }
  fprintf(fp_rsp, "# %s\n\n", CRYPTO_ALGNAME);
  for (;;) {
    // No further "count" marker means every entry has been processed
    if (!FindMarker(fp_req, "count = "))
      break;
    if (fscanf(fp_req, "%d", &count) != 1) {
      status = KAT_DATA_ERROR;
      goto cleanup;
    }
    fprintf(fp_rsp, "count = %d\n", count);
    if (!ReadHex(fp_req, seed, 48, "seed = ")) {
      printf("ERROR: unable to read 'seed' from <%s>\n", fn_req);
      status = KAT_DATA_ERROR;
      goto cleanup;
    }
    fprintBstr(fp_rsp, "seed = ", seed, 48);
    randombytes_init(seed, NULL, 256);
    if (!FindMarker(fp_req, "mlen = ")) {
      printf("ERROR: unable to read 'mlen' from <%s>\n", fn_req);
      status = KAT_DATA_ERROR;
      goto cleanup;
    }
    if (fscanf(fp_req, "%llu", &mlen) != 1) {
      status = KAT_DATA_ERROR;
      goto cleanup;
    }
    fprintf(fp_rsp, "mlen = %llu\n", mlen);
    // Allocate at least one byte: ReadHex stores a zero byte even for an empty
    // message, and calloc(0, ..) may legitimately return NULL
    m = (unsigned char *)calloc(mlen > 0 ? mlen : 1, sizeof(unsigned char));
    m1 = (unsigned char *)calloc(mlen + CRYPTO_BYTES, sizeof(unsigned char));
    sm = (unsigned char *)calloc(mlen + CRYPTO_BYTES, sizeof(unsigned char));
    if (m == NULL || m1 == NULL || sm == NULL) {
      printf("ERROR: unable to allocate KAT buffers\n");
      status = KAT_DATA_ERROR;
      goto cleanup;
    }
    if (!ReadHex(fp_req, m, (int)mlen, "msg = ")) {
      printf("ERROR: unable to read 'msg' from <%s>\n", fn_req);
      status = KAT_DATA_ERROR;
      goto cleanup;
    }
    fprintBstr(fp_rsp, "msg = ", m, mlen);
    // Generate the public/private keypair
    if ((ret_val = crypto_sign_keypair(pk, sk)) != 0) {
      printf("crypto_sign_keypair returned <%d>\n", ret_val);
      status = KAT_CRYPTO_FAILURE;
      goto cleanup;
    }
    fprintBstr(fp_rsp, "pk = ", pk, CRYPTO_PUBLICKEYBYTES);
    fprintBstr(fp_rsp, "sk = ", sk, CRYPTO_SECRETKEYBYTES);
    if ((ret_val = crypto_sign(sm, &smlen, m, mlen, sk)) != 0) {
      printf("crypto_sign returned <%d>\n", ret_val);
      status = KAT_CRYPTO_FAILURE;
      goto cleanup;
    }
    fprintf(fp_rsp, "smlen = %llu\n", smlen);
    fprintBstr(fp_rsp, "sm = ", sm, smlen);
    fprintf(fp_rsp, "\n");
    // Round-trip check: the signed message must open to the original message
    if ((ret_val = crypto_sign_open(m1, &mlen1, sm, smlen, pk)) != 0) {
      printf("crypto_sign_open returned <%d>\n", ret_val);
      status = KAT_CRYPTO_FAILURE;
      goto cleanup;
    }
    if (mlen != mlen1) {
      printf(
          "crypto_sign_open returned bad 'mlen': Got <%llu>, expected <%llu>\n",
          mlen1, mlen);
      status = KAT_CRYPTO_FAILURE;
      goto cleanup;
    }
    if (memcmp(m, m1, mlen)) {
      printf("crypto_sign_open returned bad 'm' value\n");
      status = KAT_CRYPTO_FAILURE;
      goto cleanup;
    }
    // Reset the pointers so the shared cleanup never frees a buffer twice
    free(m);
    free(m1);
    free(sm);
    m = m1 = sm = NULL;
  }
 cleanup:
  free(m);
  free(m1);
  free(sm);
  if (fp_req != NULL)
    fclose(fp_req);
  if (fp_rsp != NULL)
    fclose(fp_rsp);
  return status;
 }
 //
 // SCAN A STREAM FOR A TEXT MARKER
 //
 // Reads infile character by character until the text `marker` has just been
 // consumed. Returns 1 when the marker was found (the stream is then positioned
 // right behind it) and 0 when end-of-file is reached first. Only the first
 // MAX_MARKER_LEN - 1 characters of a longer marker are compared.
 //
 int FindMarker(FILE *infile, const char *marker) {
  char window[MAX_MARKER_LEN];
  int n, k;
  int c;
  n = (int)strlen(marker);
  if (n >= MAX_MARKER_LEN)
    n = MAX_MARKER_LEN - 1;
  // Prime the comparison window with the first n characters of the stream
  for (k = 0; k < n; k++) {
    c = fgetc(infile);
    if (c == EOF)
      return 0;
    window[k] = c;
  }
  window[n] = '\0';
  // Slide the window forward one character at a time until it equals the
  // marker; an empty marker matches immediately and never enters the loop
  while (strncmp(window, marker, n) != 0) {
    c = fgetc(infile);
    if (c == EOF)
      return 0;
    memmove(window, window + 1, (size_t)(n - 1));
    window[n - 1] = c;
  }
  return 1;
 }
 //
 // ALLOW TO READ HEXADECIMAL ENTRY (KEYS, DATA, TEXT, etc.)
 //
 // Locates the marker `str` in infile and parses the hexadecimal digits that
 // follow it into the Length-byte buffer A. Digits are shifted in from the
 // right: fewer than 2*Length digits leave the value right-aligned behind zero
 // bytes, more digits push the oldest ones out. Characters between the marker
 // and the first digit are skipped, except that a newline ends the field.
 // Returns 0 if the marker is not found and 1 otherwise (also when no digit
 // follows the marker). For Length == 0 a single zero byte is stored in A[0]
 // and the stream is not read at all.
 //
 int ReadHex(FILE *infile, unsigned char *A, int Length, char *str) {
  int pos, c;
  int seen_digit = 0;
  unsigned char nibble;
  if (Length == 0) {
    A[0] = 0x00;
    return 1;
  }
  memset(A, 0x00, Length);
  if (!FindMarker(infile, str))
    return 0;
  for (;;) {
    c = fgetc(infile);
    if (c == EOF)
      break;
    if (!isxdigit(c)) {
      // Once digits have started any other character ends the value; before
      // that only a newline does
      if (seen_digit || c == '\n')
        break;
      continue;
    }
    seen_digit = 1;
    if (c >= '0' && c <= '9')
      nibble = c - '0';
    else if (c >= 'A' && c <= 'F')
      nibble = c - 'A' + 10;
    else if (c >= 'a' && c <= 'f')
      nibble = c - 'a' + 10;
    else // shouldn't ever get here
      nibble = 0;
    // Shift the whole buffer left by one nibble and append the new digit
    for (pos = 0; pos + 1 < Length; pos++)
      A[pos] = (A[pos] << 4) | (A[pos + 1] >> 4);
    A[Length - 1] = (A[Length - 1] << 4) | nibble;
  }
  return 1;
 }
 //
 // Writes the label S, then the L bytes of A as upper-case two-digit hex, then
 // a newline to fp. An empty byte string (L == 0) is written as "00".
 //
 void fprintBstr(FILE *fp, char *S, unsigned char *A, unsigned long long L) {
  unsigned long long idx;
  fputs(S, fp);
  if (L == 0) {
    fputs("00", fp);
  }
  for (idx = 0; idx != L; ++idx) {
    fprintf(fp, "%02X", (unsigned int)A[idx]);
  }
  fputc('\n', fp);
 }
--- a/apps/example_nistapi.c
+++ b/apps/example_nistapi.c
@@ -0,0 +1,89 @@
 // SPDX-License-Identifier: Apache-2.0
 /**
 * An example to demonstrate how to use SQIsign with the NIST API.
 */
 #include <api.h>
 #include <mem.h>
 #include <string.h>
 #include <stdlib.h>
 #include <stdio.h>
 /**
 * Example for SQIsign variant:
 * - crypto_sign_keypair
 * - crypto_sign
 * - crypto_sign_open
 *
 * Generates a key pair, signs a fixed 32-byte message, checks that the signed
 * message opens to the same message and finally checks that a signed message
 * with a flipped first byte is rejected. The result of every step is printed.
 *
 * @return int 0 if all steps behaved as expected, -1 otherwise
 */
 static int example_sqisign(void) {
    unsigned char msg[32] = { 0xe };
    unsigned char msg2[32] = { 0 };
    unsigned long long msglen = sizeof(msg);
    unsigned long long smlen = CRYPTO_BYTES + sizeof(msg);
    unsigned char *pk  = calloc(CRYPTO_PUBLICKEYBYTES, 1);
    unsigned char *sk  = calloc(CRYPTO_SECRETKEYBYTES, 1);
    unsigned char *sig = calloc(smlen, 1);
    int res = -1;
    printf("Example with %s\n", CRYPTO_ALGNAME);
    // Never hand a failed allocation to the signature functions
    if (pk == NULL || sk == NULL || sig == NULL) {
        printf("memory allocation -> FAIL\n");
        goto err;
    }
    printf("crypto_sign_keypair -> ");
    res = crypto_sign_keypair(pk, sk);
    if (res) {
        printf("FAIL\n");
        res = -1;
        goto err;
    } else {
        printf("OK\n");
    }
    printf("crypto_sign -> ");
    res = crypto_sign(sig, &smlen, msg, sizeof(msg), sk);
    if (res) {
        printf("FAIL\n");
        res = -1;
        goto err;
    } else {
        printf("OK\n");
    }
    printf("crypto_sign_open (with correct signature) -> ");
    res = crypto_sign_open(msg2, &msglen, sig, smlen, pk);
    // The recovered length must match before the contents are compared
    if (res || msglen != sizeof(msg) || memcmp(msg, msg2, sizeof(msg))) {
        printf("FAIL\n");
        res = -1;
        goto err;
    } else {
        res = 0;
        printf("OK\n");
    }
    printf("crypto_sign_open (with altered signature) -> ");
    sig[0] = ~sig[0];
    memset(msg2, 0, sizeof(msg2));
    res = crypto_sign_open(msg2, &msglen, sig, smlen, pk);
    // msglen is not trusted after a call that is expected to fail, so the
    // comparison uses the known message size instead
    if (!res || !memcmp(msg, msg2, sizeof(msg))) {
        printf("FAIL\n");
        res = -1;
        goto err;
    } else {
        res = 0;
        printf("OK\n");
    }
 err:
    free(pk);
    if (sk != NULL) {
        sqisign_secure_free(sk, CRYPTO_SECRETKEYBYTES);
    }
    free(sig);
    return res;
 }
 /* Program entry point: the exit status is the result of the example run. */
 int main(void) {
    const int status = example_sqisign();
    return status;
 }
--- a/include/mem.h
+++ b/include/mem.h
@@ -0,0 +1,23 @@
 // SPDX-License-Identifier: Apache-2.0
 #ifndef MEM_H
 #define MEM_H
 #include <stddef.h>
 /**
 * Clears and frees allocated memory.
 * 
 * @param[out] mem Memory to be cleared and freed.
 * @param size Size of memory to be cleared and freed.
 */
 void sqisign_secure_free(void *mem, size_t size);
 /**
 * Clears memory.
 * 
 * @param[out] mem Memory to be cleared.
 * @param size Size of memory to be cleared.
 */
 void sqisign_secure_clear(void *mem, size_t size);
 #endif
--- a/include/rng.h
+++ b/include/rng.h
@@ -0,0 +1,28 @@
 // SPDX-License-Identifier: Apache-2.0
 #ifndef rng_h
 #define rng_h
 /**
 * Randombytes initialization.
 * Initialization may be needed for some random number generators (e.g. CTR-DRBG).
 *
 * @param[in] entropy_input 48 bytes entropy input
 * @param[in] personalization_string Personalization string
 * @param[in] security_strength Security strength
 */
 void randombytes_init(unsigned char *entropy_input,
                      unsigned char *personalization_string,
                      int security_strength);
 /**
 * Random byte generation.
 * The caller is responsible to allocate sufficient memory to hold x.
 *
 * @param[out] x Memory to hold the random bytes.
 * @param[in] xlen Number of random bytes to be generated
 * @return int 0 on success, -1 otherwise
 */
 int randombytes(unsigned char *x, unsigned long long xlen);
 #endif /* rng_h */
--- a/include/sig.h
+++ b/include/sig.h
@@ -0,0 +1,73 @@
 // SPDX-License-Identifier: Apache-2.0
 #ifndef SQISIGN_H
 #define SQISIGN_H
 #include <stdint.h>
 /**
 * SQIsign keypair generation.
 *
 * The implementation corresponds to SQIsign.CompactKeyGen() in the SQIsign spec.
 * The caller is responsible to allocate sufficient memory to hold pk and sk.
 *
 * @param[out] pk SQIsign public key
 * @param[out] sk SQIsign secret key
 * @return int status code
 */
 int sqisign_keypair(unsigned char *pk, unsigned char *sk);
 /**
 * SQIsign signature generation.
 *
 * The implementation performs SQIsign.expandSK() + SQIsign.sign() in the SQIsign spec.
 * The key provided is a compacted secret key.
 * The caller is responsible to allocate sufficient memory to hold sm.
 *
 * @param[out] sm Signature concatenated with message
 * @param[out] smlen Pointer to the length of sm
 * @param[in] m Message to be signed
 * @param[in] mlen Message length
 * @param[in] sk Compacted secret key
 * @return int status code
 */
 int sqisign_sign(unsigned char *sm,
              unsigned long long *smlen, const unsigned char *m,
              unsigned long long mlen, const unsigned char *sk);
 /**
 * SQIsign open signature.
 *
 * The implementation performs SQIsign.verify(). If the signature verification succeeded, the original message is stored in m.
 * The key provided is a compacted public key.
 * The caller is responsible to allocate sufficient memory to hold m.
 *
 * @param[out] m Message stored if verification succeeds
 * @param[out] mlen Pointer to the length of m
 * @param[in] sm Signature concatenated with message
 * @param[in] smlen Length of sm
 * @param[in] pk Compacted public key
 * @return int status code
 */
 int sqisign_open(unsigned char *m,
              unsigned long long *mlen, const unsigned char *sm,
              unsigned long long smlen, const unsigned char *pk);
 /**
 * SQIsign verify signature.
 *
 * If the signature verification succeeded, returns 0, otherwise 1.
 *
 * @param[in] m Message whose signature is verified
 * @param[in] mlen Length of m
 * @param[in] sig Signature
 * @param[in] siglen Length of sig
 * @param[in] pk Compacted public key
 * @return int 0 if verification succeeded, 1 otherwise.
 */
 int sqisign_verify(const unsigned char *m,
                unsigned long long mlen, const unsigned char *sig,
                unsigned long long siglen, const unsigned char *pk);
 #endif
--- a/scripts/.gitignore
+++ b/scripts/.gitignore
@@ -0,0 +1,2 @@
 __pycache__
 *.sage.py
--- a/scripts/IntBigTest.scala
+++ b/scripts/IntBigTest.scala
@@ -0,0 +1,107 @@
 /**
 * Code to analyze instrumented code from the SQIsign IntBig module.
 *
 * Features:
 * - verifies arithmetic
 * - aggregate number of errors / ok per function
 * - aggregate minimum / maximum values per function
 *
 * Input format (one record per line on stdin): the function name followed by
 * its comma-separated hexadecimal values, e.g. `ibz_add,5,2,3`. For most
 * functions the first value is the result and the remaining ones are the
 * operands. Lines that do not start with a known function name are ignored
 * (and echoed with -v).
 *
 * Prerequisite: enable debug output in intbig.x: #define DEBUG_VERBOSE
 * Usage: ./<test app> | scala IntBigTest.scala
 * Usage with unit test: sqisign_test_intbig <reps> <bits> | scala IntBigTest.scala
 *
 * Run option -v: verbose full output
 */
 object IntBigTest {
  // Test functions: each one re-computes an operation with BigInt and returns
  // (verification passed, expected value, raw input values)
  object IntBigTestFuns {
    def ibz_add(a: Array[BigInt]) = IntBigRes(a(0) == a(1) + a(2), a(1) + a(2), a)
    def ibz_sub(a: Array[BigInt]) = IntBigRes(a(0) == a(1) - a(2), a(1) - a(2), a)
    def ibz_mul(a: Array[BigInt]) = IntBigRes(a(0) == a(1) * a(2), a(1) * a(2), a)
    // a(0) = quotient, a(1) = remainder, a(2) / a(3) = dividend / divisor
    def ibz_div(a: Array[BigInt]) = IntBigRes(a(0) == a(2) / a(3) && a(1) == a(2) % a(3), a(2) / a(3), a)
    def ibz_pow_mod(a: Array[BigInt]) = IntBigRes(a(0) == a(1).modPow(a(2), a(3)),  a(1).modPow(a(2), a(3)), a)
    // Only the sign of the comparison result is verified; the expected value
    // is reported as 0 / -1 / 1 for equal / less / greater
    def ibz_cmp(a: Array[BigInt]) = IntBigRes(
      if (a(1) == a(2)) a(0) == 0 else if (a(1) < a(2)) a(0) < 0 else a(0) > 0,
      if (a(1) == a(2)) 0 else if (a(1) < a(2)) -1 else 1,
      a)
    def ibz_is_zero(a: Array[BigInt]) = IntBigRes(if (a(1) == 0) a(0) == 1 else a(0) == 0, if (a(1) == 0) 1 else 0, a)
    def ibz_is_one(a: Array[BigInt]) = IntBigRes(if (a(1) == 1) a(0) == 1 else a(0) == 0, if (a(1) == 1) 1 else 0, a)
    def ibz_probab_prime(a: Array[BigInt]) = IntBigRes(if (a(1).isProbablePrime(a(2).toInt)) a(0) > 0 else a(0) == 0, if (a(1).isProbablePrime(a(2).toInt)) 1 else 0, a)
    def ibz_gcd(a: Array[BigInt]) = IntBigRes(a(1).gcd(a(2)) == a(0), a(1).gcd(a(2)), a)
    // Accepts either square root: sqrt^2 or p - sqrt^2 must equal a mod p
    def ibz_sqrt_mod_p(in: Array[BigInt]): IntBigRes = {
      val sqrt = in(0)
      val p = in(2)
      val a = if (in(1).mod(p) < 0) p + in(1).mod(p) else in(1).mod(p)
      val exp0 = sqrt.modPow(2, p)
      IntBigRes(exp0 == a || (p - exp0) == a, sqrt.modPow(2, p), in)
    }
    def ibz_sqrt_mod_2p(a: Array[BigInt]): IntBigRes = IntBigRes(a(0).modPow(2, 2 * a(2)) == a(1), a(0), a)
  }
  // Dispatch table: function name as it appears in the input -> checker
  val funList = Map(
    "ibz_sqrt_mod_p" -> IntBigTestFuns.ibz_sqrt_mod_p _,
    "ibz_sqrt_mod_2p" -> IntBigTestFuns.ibz_sqrt_mod_2p _,
    "ibz_add" -> IntBigTestFuns.ibz_add _,
    "ibz_sub" -> IntBigTestFuns.ibz_sub _,
    "ibz_mul" -> IntBigTestFuns.ibz_mul _,
    "ibz_div" -> IntBigTestFuns.ibz_div _,
    "ibz_pow_mod" -> IntBigTestFuns.ibz_pow_mod _,
    "ibz_cmp" -> IntBigTestFuns.ibz_cmp _,
    "ibz_is_zero" -> IntBigTestFuns.ibz_is_zero _,
    "ibz_is_one" -> IntBigTestFuns.ibz_is_one _,
    "ibz_probab_prime" -> IntBigTestFuns.ibz_probab_prime _,
    "ibz_gcd" -> IntBigTestFuns.ibz_gcd _
  )
  // Per-function statistics; max / min are bit lengths of the values seen in
  // successfully verified records
  case class AggregateResults(funName: String, errors: Int, ok: Int, max: Option[Int], min: Option[Int]) {
    def errInc = AggregateResults(funName, errors + 1, ok, max, min)
    def okInc(operands: List[BigInt]) = {
      val operandsBitLen = operands.map(_.bitLength)
      val newMax = Some((max.getOrElse(0) :: operandsBitLen).max)
      val newMin = Some((min.getOrElse(Int.MaxValue) :: operandsBitLen).min)
      AggregateResults(funName, errors, ok + 1, newMax, newMin)
    }
    override def toString: String = s"$funName: $errors errors, $ok ok, max value: ${max.getOrElse(BigInt(0))} bits, min value: ${min.getOrElse(BigInt(0))} bits)"
  }
  // Reads records from stdin until end of input, then prints the per-function
  // statistics and the overall totals
  def main(args: Array[String]) = {
    val v = args.length >= 1 && args(0) == "-v"
    var err = Map() ++ funList.map(i => (i._1 -> AggregateResults(i._1, 0, 0, None, None)))
    var cont = true
    while (cont) {
      val l = scala.io.StdIn.readLine()
      if (l == null) {
        err.foreach(i => println(i._2))
        println(s"==========\n${err.values.map(_.errors).sum} errors found\n${err.values.map(_.ok).sum} checks ok")
        cont = false
      } else {
        err = check(l, v, err)
      }
    }
  }
  case class IntBigRes(verif: Boolean, expected: BigInt, got: Array[BigInt])
  // Verifies one input line and returns the updated statistics; a failed
  // verification is additionally reported on stdout
  def check(line: String, v: Boolean, agg: Map[String, AggregateResults]): Map[String, AggregateResults] = {
    line.split(",").toList match {
      case x :: xs if funList.contains(x) =>
        val funA = xs.map(i => BigInt(i, 16)).toArray
        funList(x)(funA) match {
          case IntBigRes(false, exp, got) =>
            println(s"function: $x\ngot:\n${got.map(_.toString(16)).mkString(",")}\nexpected:\n${exp.toString(16)}")
            agg.map(i => if (i._1 == x) i._1 -> i._2.errInc else i)
          case _ =>
            agg.map(i => if (i._1 == x) i._1 -> i._2.okInc(funA.toList) else i)
        }
      case _ =>
        if (v) println(line)
        agg
    }
  }
 }
--- a/scripts/cformat.py
+++ b/scripts/cformat.py
@@ -0,0 +1,92 @@
 #!/usr/bin/env python3
 import sys, itertools
 from math import floor, log
 import sage.all
 class Ibz:
    """Integer wrapper that renders itself as a C initializer with the
    fields ``_mp_alloc``, ``_mp_size`` and ``_mp_d``."""
    def __init__(self, v):
        self.v = int(v)
    def _literal(self, sz):
        """Return the initializer text, splitting the magnitude into limbs
        of ``sz`` bits each (least significant limb first)."""
        magnitude = abs(int(self.v))
        count = -(-magnitude.bit_length() // sz) if magnitude else 0
        mask = 2**sz - 1
        words = []
        rest = magnitude
        # Zero still gets one (zero) limb so the array literal is never empty.
        for _ in range(count or 1):
            words.append(hex(rest & mask))
            rest >>= sz
        # The sign is carried by the size field.
        size = -count if int(self.v) < 0 else count
        fields = (
            ('._mp_alloc', 0),
            ('._mp_size', size),
            ('._mp_d', '(mp_limb_t[]) {' + ','.join(words) + '}'),
        )
        return '{{' + ', '.join(f'{key} = {val}' for key, val in fields) + '}}'
 class Object:
    """A named C constant: element type, identifier and (possibly nested)
    Python value, with helpers that render its declaration and definition."""
    def __init__(self, ty, name, obj):
        base, bracket, _ = ty.partition('[')
        if bracket:
            # One array dimension is inferred per '[]' in the type string.
            def shape_of(items, remaining):
                assert remaining >= 0
                if remaining == 0:
                    return ()
                assert isinstance(items, (list, tuple))
                # Unpacking a one-element set enforces that all children
                # share the same shape.
                inner, = {shape_of(item, remaining - 1) for item in items}
                return (len(items),) + inner
            dims = shape_of(obj, ty.count('[]'))
            self.ty = base, ''.join(f'[{d}]' for d in dims)
        else:
            self.ty = ty, ''
        self.name = name
        self.obj = obj
    def _declaration(self):
        """Return the ``extern const`` declaration for a header file."""
        elem, suffix = self.ty
        return f'extern const {elem} {self.name}{suffix};'
    def _literal(self, mp_limb_t_bits):
        """Return the C initializer for the stored value."""
        def render(value):
            if isinstance(value, int):
                return hex(value)
            if isinstance(value, sage.all.Integer):
                return hex(value)
            if isinstance(value, Ibz):
                return value._literal(mp_limb_t_bits)
            if isinstance(value, (list, tuple)):
                return '{' + ', '.join(render(item) for item in value) + '}'
            raise NotImplementedError(f'unknown type {type(value)} in Formatter')
        return render(self.obj)
    def _definition(self, mp_limb_t_bits):
        """Return the full ``const`` definition for a source file."""
        elem, suffix = self.ty
        return f'const {elem} {self.name}{suffix} = ' + self._literal(mp_limb_t_bits) + ';'
 class ObjectFormatter:
    """Writes a collection of ``Object`` instances as C header declarations
    and as per-limb-size definitions."""
    def __init__(self, objs):
        self.objs = objs
    def header(self, file=None):
        """Print one declaration per object to ``file``."""
        for entry in self.objs:
            assert isinstance(entry, Object)
            print(entry._declaration(), file=file)
    def implementation(self, file=None):
        """Print the definitions once for each supported limb width, guarded
        by an ``#if 0 / #elif 8*DIGIT_LEN == N / #endif`` chain."""
        def emit(text):
            print(text, file=file)
        emit('#if 0')
        for width in (16, 32, 64):
            emit(f'#elif 8*DIGIT_LEN == {width}')
            for entry in self.objs:
                assert isinstance(entry, Object)
                emit(entry._definition(width))
        emit('#endif')
 def field(v, F=None):
    """Split every coefficient of a field element into 64-bit limbs.

    v -- the element (or something ``F`` can convert into one); iterating
         over it must yield its coefficients.
    F -- optional field to coerce ``v`` into. When omitted, ``v.parent()``
         is used; previously the default crashed on ``None.characteristic()``.

    Returns one list of limbs (least significant first) per coefficient.
    """
    if F is not None:
        v = F(v)
    else:
        F = v.parent()
    p = int(F.characteristic())
    # Exact integer limb count; a floating-point log can round across a
    # 64-bit boundary for large characteristics.
    l = (p.bit_length() + 63) // 64
    mask = 2**64 - 1
    # int() keeps the limbs plain integers, like field2limbs in the
    # precompute scripts does.
    vs = [[(int(c) >> 64*i) & mask for i in range(l)] for c in v]
    return vs
 def xonly(T, *args):
    """Return the limb representation (via ``field``) of the x-coordinate of
    point ``T``; a falsy ``T`` is rejected as the point at infinity."""
    if T:
        x_coord, _y_coord = T.xy()
        return field(x_coord, *args)
    raise NotImplementedError('is point at infinity')
--- a/scripts/parameters.py
+++ b/scripts/parameters.py
@@ -0,0 +1,31 @@
 #!/usr/bin/env python3
 from sage.all import *
 proof.all(False)  # faster
 import re
 # Read `lvl`, `p` and `B` from sqisign_parameters.txt. Each is expected on a
 # line of the form "<name> = <integer>"; if a name occurs several times the
 # last occurrence wins.
 _wanted = ('lvl', 'p', 'B')
 _found = {}
 with open('sqisign_parameters.txt') as _params_file:  # closed deterministically
    for _line in _params_file:
        for k in _wanted:
            m = re.search(rf'^\s*{k}\s*=\s*([x0-9a-f]+)', _line)
            if m:
                _found[k] = ZZ(m.groups()[0], 0)
 _missing = [k for k in _wanted if k not in _found]
 if _missing:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError(f'sqisign_parameters.txt does not define: {", ".join(_missing)}')
 globals().update(_found)
 # Prime factors of p^2 - 1 that are at most B (trial factoring stops at B+5).
 L = {l for l,_ in (p**2 - 1).factor(limit=B+5) if l <= B}
 assert 2 in L
 L.remove(2)
 # f is the exponent of 2 in p + 1.
 f = (p+1).valuation(2)
 if (p-1).valuation(2) > f:
    raise NotImplementedError('2-power torsion is on twist')
 # Assign each odd prime to whichever of p+1 / p-1 it divides to the higher
 # power (ties go to p+1).
 Lpls = {l for l in L if (p+1).valuation(l) >= (p-1).valuation(l)}
 Lmin = L - Lpls
 Lpls, Lmin = map(sorted, (Lpls, Lmin))
 # Matching exponents, in the same order as the sorted prime lists.
 Epls = [(p+1).valuation(l) for l in Lpls]
 Emin = [(p-1).valuation(l) for l in Lmin]
 # Odd smooth parts taken from p+1 and p-1 respectively.
 Tpls = prod(l**e for l,e in zip(Lpls,Epls))
 Tmin = prod(l**e for l,e in zip(Lmin,Emin))
 # Dcom: the part of Tpls*Tmin coprime to 6.
 Dcom = (Tpls*Tmin).prime_to_m_part(2*3)
 # Dchall: the full powers of 2 and 3 dividing p+1.
 Dchall = prod(l**(p+1).valuation(l) for l in (2,3))
 __all__ = ['lvl', 'p', 'B', 'f', 'Tpls', 'Tmin', 'Dcom', 'Dchall']
--- a/scripts/precompute_endomorphism_action.sage
+++ b/scripts/precompute_endomorphism_action.sage
@@ -0,0 +1,203 @@
 #!/usr/bin/env sage
 proof.all(False)  # faster
 from sage.misc.banner import require_version
 if not require_version(10, 0, print_message=True):
    exit('')
 ################################################################
 from parameters import p, B, f, Tpls, Tmin, Dcom, Dchall
 # Full odd torsion used below.
 T = Tpls * Tmin
 ################################################################
 if p % 4 != 3:
    raise NotImplementedError('requires p ≡ 3 (mod 4)')
 # Fp2 = Fp[i]/(i^2 + 1); Fp4 is a degree-2 extension of it.
 Fp2.<i> = GF((p,2), modulus=[1,0,1])
 Fp4 = Fp2.extension(2,'u')
 # y^2 = x^3 + x over Fp4; the assertions pin down its expected properties.
 E = EllipticCurve(Fp4, [1,0])
 assert E.j_invariant() == 1728
 assert E.is_supersingular()
 assert E.change_ring(Fp2).frobenius() == -p
 assert E.order() == (p^2-1)^2
 # Endomorphisms playing the roles of 1, i, j and k = i*j.
 endo_1 = E.scalar_multiplication(1)
 endo_i = E.automorphisms()[-1]
 endo_j = E.frobenius_isogeny()
 endo_k = endo_i * endo_j
 # The relations i^2 = -1, j^2 = -p, ji = -ij are only spot-checked on one
 # random point; the exact comparison in the first branch is disabled.
 if 0:  # skipped for speed, for now
    assert endo_i^2 == E.scalar_multiplication(-1)
    assert endo_j^2 == E.scalar_multiplication(-p)
    assert endo_j * endo_i == - endo_i * endo_j
 else:
    R = E.random_point()
    assert (endo_i^2)(R) == -1*R
    assert (endo_j^2)(R) == -p*R
    assert (endo_j*endo_i)(R) == -(endo_i*endo_j)(R)
 def half_endo(summands):
    """Return an evaluator for (sum of `summands`)/2.

    The evaluator takes a point P, picks a point Q with 2*Q == P, applies
    every summand to Q, adds the results and maps the sum back onto P's curve.
    """
    def _eval(P):
        curve = P.curve()
        assert P in curve
        halves = P.division_points(2)
        if halves:
            Q = halves[0]
        else:
            # NOTE(review): this fallback only moves P to the curve over a
            # quadratic extension; it does not divide by 2 there. Confirm
            # whether the branch is reachable and what was intended.
            bigger = curve.base_field().extension(2,'v')
            Q = curve.change_ring(bigger)(P)
        total = sum(endo._eval(Q) for endo in summands)
        return curve(total)
    return _eval
 # Point evaluators for the four elements 1, i, (i+j)/2, (1+k)/2.
 gen1 = endo_1._eval
 gen2 = endo_i._eval
 gen3 = half_endo([endo_i, endo_j])
 gen4 = half_endo([endo_1, endo_k])
 ################################################################
 from sage.groups.generic import order_from_multiple
 def _points_of_order(order):
    """Yield points of exact order `order` on E, scanning the x-coordinates
    Fp4.gen()+1, Fp4.gen()+2, ... and skipping those that do not lift."""
    x = Fp4.gen()
    while True:
        x += 1
        try:
            pt = E.lift_x(x)
        except ValueError:
            continue
        o = order_from_multiple(pt, p^2-1)
        if not order.divides(o):
            continue
        # Clear the cofactor so that exactly `order` remains.
        pt *= o // order
        pt.set_order(order)
        yield pt
 # P: the first suitable point.
 P = next(_points_of_order(T<<f))
 # Q: the first suitable point (same scan, restarted) whose Weil pairing
 # with P has full order T<<f.
 for Q in _points_of_order(T<<f):
    if order_from_multiple(P.weil_pairing(Q, T<<f), T<<f, operation='*') == T<<f:
        break
 def dlp(P, Q, R):
    """Return (a, b) with a*P + b*Q == R, obtained from discrete logarithms
    of Weil pairings in Fp2. P and Q must have equal order, and the order
    of R must divide it."""
    order = P.order()
    assert order == Q.order()
    assert R.order().divides(order)
    def pairing(S, U):
        return Fp2(S.weil_pairing(U, order))
    base = pairing(P, Q)
    a = pairing(R, Q).log(base)
    b = pairing(P, R).log(base)
    assert a*P + b*Q == R
    return a, b
 def matrix_of_isogeny(phi):
    """Return the 2x2 matrix over Z/(T<<f) of `phi` in the basis (P, Q):
    column 0 holds the coordinates of phi(P), column 1 those of phi(Q)."""
    imP = phi(P)
    imQ = phi(Q)
    rows = [dlp(P, Q, image) for image in (imP, imQ)]
    mat = matrix(Zmod(T<<f), rows).transpose()
    # Re-check both columns against the actual images.
    for col, image in enumerate((imP, imQ)):
        assert image == ZZ(mat[0][col])*P + ZZ(mat[1][col])*Q
    return mat
 # Matrices of i, j, k on the basis (P, Q).
 #mat1 = matrix_of_isogeny(endo_1)
 mati = matrix_of_isogeny(endo_i)
 matj = matrix_of_isogeny(endo_j)
 matk = matrix_of_isogeny(endo_k)
 #assert mat1 == 1    # identity; omit
 # Matrices of the generators gen2, gen3, gen4 on the same basis.
 #mat1 = matrix_of_isogeny(gen1)
 mat2 = matrix_of_isogeny(gen2)
 mat3 = matrix_of_isogeny(gen3)
 mat4 = matrix_of_isogeny(gen4)
 #assert mat1 == 1    # identity; omit
 ################################################################
 # Quaternion algebra (-1, -p) and the order spanned by 1, i, (i+j)/2, (1+k)/2.
 # Note: from here on `i` names the quaternion generator, not the Fp2 one.
 Quat.<i,j,k> = QuaternionAlgebra(-1, -p)
 O0 = Quat.quaternion_order([1, i, (i+j)/2, (1+k)/2])
 assert Dcom % 2 == 1  # odd
 # [I | mati | matj | matk] mod Dcom, keeping every second column, i.e. the
 # first column of each 2x2 block (the image of P under 1, i, j, k).
 mat = block_matrix(Zmod(Dcom), [[identity_matrix(2), mati, matj, matk]])[:,::2]
 # Coefficient vectors (over 1,i,j,k) in the right kernel, read as quaternions.
 ker = list(map(Quat, mat.right_kernel_matrix()))
 # Left O0-ideal generated by Dcom and the kernel elements.
 idealP = sum((O0*g for g in ker), O0*Dcom)
 assert idealP.norm() == Dcom
 for b in idealP.basis():
    assert sum(Mod(c,Dcom)*g for c,g in zip(b,(1,mati,matj,matk)))[:,0] == 0  # kills P
 # Enumerate integer combinations of the ideal basis until one has reduced
 # norm whose gcd with Dcom^2 is exactly Dcom; combinations with
 # denominator 2 are doubled first.
 for v in (ZZ^4):
    idealPgen = sum(c*g for c,g in zip(v, idealP.basis()))
    if vector(list(idealPgen)).denominator() == 2:
        idealPgen *= 2
    if gcd(idealPgen.reduced_norm(), Dcom^2) == Dcom:
        break
 assert idealP == O0*Dcom + O0*idealPgen
 # Solve for coefficients (over 1,i,j,k) whose combined action sends P to Q
 # modulo Dcom.
 mat = mat   # still
 rhs = vector(Zmod(Dcom), [0,1])
 cs = mat.solve_right(rhs)
 distorter = Quat(cs)
 assert sum(Mod(c,Dcom)*g for c,g in zip(distorter,(1,mati,matj,matk))).columns()[0] == vector((0,1))  # maps P->Q
 ################################################################
 from cformat import Ibz, Object, ObjectFormatter
 def field2limbs(el):
    """Coerce `el` into Fp2 and return, for each coefficient, its list of
    64-bit limbs (least significant first)."""
    nlimbs = 1 + floor(log(p, 2**64))
    limbs = []
    for coeff in Fp2(el):
        value = int(coeff)
        limbs.append([(value >> 64*i) % 2**64 for i in range(nlimbs)])
    return limbs
 def fmt_basis(name, P, Q):
    """Build the `ec_basis_t` object `name` from P, Q and P-Q, each stored as
    the limb form of its coordinates at index 0 and index 2."""
    def coords(pt):
        return [field2limbs(pt[0]), field2limbs(pt[2])]
    entries = [coords(pt) for pt in (P, Q, P-Q)]
    return Object('ec_basis_t', name, entries)
 # Name suffix -> order of the torsion subgroup for which a basis is exported.
 bases = {
        'EVEN': 1<<f,
        'ODD_PLUS': Tpls,
        'ODD_MINUS': Tmin,
        'COMMITMENT_PLUS': gcd(Tpls, Dcom),
        'COMMITMENT_MINUS': gcd(Tmin, Dcom),
        'CHALLENGE': Dchall,
    }
 assert P.order() == Q.order()
 # Each basis is (P, Q) scaled down to order v, followed by the fixed curve
 # constants, the action matrices and the two commitment quaternions.
 objs = ObjectFormatter([
        fmt_basis(f'BASIS_{k}', ZZ(P.order()/v)*P, ZZ(Q.order()/v)*Q)
        for k,v in bases.items()
    ] + [
        Object('ec_curve_t', 'CURVE_E0', [[[int(0)]], [[int(1)]]]),
        Object('ec_point_t', 'CURVE_E0_A24', [[[int(0)]], [[int(1)]]]),
        Object('ibz_mat_2x2_t', 'ACTION_I', [[Ibz(v) for v in vs] for vs in mati]),
        Object('ibz_mat_2x2_t', 'ACTION_J', [[Ibz(v) for v in vs] for vs in matj]),
        Object('ibz_mat_2x2_t', 'ACTION_K', [[Ibz(v) for v in vs] for vs in matk]),
        Object('ibz_mat_2x2_t', 'ACTION_GEN2', [[Ibz(v) for v in vs] for vs in mat2]),
        Object('ibz_mat_2x2_t', 'ACTION_GEN3', [[Ibz(v) for v in vs] for vs in mat3]),
        Object('ibz_mat_2x2_t', 'ACTION_GEN4', [[Ibz(v) for v in vs] for vs in mat4]),
        Object('quat_alg_elem_t', 'COMMITMENT_IDEAL_UNDISTORTED_GEN', [Ibz(1), [Ibz(ZZ(v)) for v in idealPgen]]),
        Object('quat_alg_elem_t', 'COMMITMENT_IDEAL_DISTORTION_ENDO', [Ibz(1), [Ibz(ZZ(v)) for v in distorter]]),
    ])
 # Emit the header (includes + declarations) and the source file
 # (includes + definitions) side by side.
 with open('include/endomorphism_action.h','w') as hfile, open('endomorphism_action.c','w') as cfile:
    for include_line in ('#include <intbig.h>', '#include <ec.h>', '#include <quaternion.h>'):
        print(include_line, file=hfile)
    for include_line in ('#include <stddef.h>', '#include <stdint.h>', '#include <endomorphism_action.h>'):
        print(include_line, file=cfile)
    objs.header(file=hfile)
    objs.implementation(file=cfile)
--- a/scripts/precompute_klpt_constants.sage
+++ b/scripts/precompute_klpt_constants.sage
@@ -0,0 +1,114 @@
 #!/usr/bin/env sage
 proof.all(False)  # faster
 from sage.misc.banner import require_version
 if not require_version(9, 8, print_message=True):
    exit('')
 ################################################################
 from parameters import f, p, Tpls, Tmin
 # Failure-probability target that the asserted bounds below are checked against.
 negl = 2**-64   #TODO optimize
 ################################################################
 # Bit sizes (rounded up) of p and of the odd torsion Tpls*Tmin.
 logp = ceil(log(p, 2))
 logT = ceil(log(Tpls*Tmin, 2))
 tors2val = (p+1).valuation(2)
 # Name -> integer value; the names become #define entries in the header.
 defs = dict()
 # lideal_equiv
 defs['KLPT_equiv_bound_coeff'] = ceil((log(negl, 1-2/logp) ** (1/4) - 1) / 2) + 2
 assert (1 - 2/logp) ** ((2 * defs['KLPT_equiv_bound_coeff'] + 1) ** 4) <= negl
 defs['KLPT_equiv_num_iter'] = (2 * defs['KLPT_equiv_bound_coeff'] + 1) ** 4
 defs['KLPT_primality_num_iter'] = ceil(-log(negl, 4))
 # signing KLPT
 # The length is rounded up to a multiple of f.
 defs['KLPT_signing_klpt_length'] = f * ceil (ceil((log(negl, 2) / -2) + 15/4*logp + 25)/f)
 assert 2**(-2 * (defs['KLPT_signing_klpt_length'] - 15/4*logp - 25)) <= negl
 defs['KLPT_signing_num_gamma_trial'] = ceil(log(negl, 2) / -1)
 assert 2 ** ( - defs['KLPT_signing_num_gamma_trial']) <= negl
 defs['KLPT_gamma_exponent_interval_size'] = 0
 defs['KLPT_gamma_exponent_center_shift'] = ceil(log(log(negl, 1-1/logp) + defs['KLPT_signing_num_gamma_trial'], 2) + defs['KLPT_gamma_exponent_interval_size'])
 assert (1 - 1/logp) ** (2**(defs['KLPT_gamma_exponent_center_shift'] - defs['KLPT_gamma_exponent_interval_size']) - defs['KLPT_signing_num_gamma_trial']) <= negl
 defs['KLPT_repres_num_gamma_trial'] = 2**(defs['KLPT_gamma_exponent_center_shift'] + defs['KLPT_gamma_exponent_interval_size'])
 # Unlike the bounds above, this one is checked against 1/64 rather than negl.
 defs['KLPT_signing_number_strong_approx'] = ceil(log(1/64, 1-4/13/logp))
 assert (1 - 4/13/logp) ** defs['KLPT_signing_number_strong_approx'] <= 1/64
 # keygen KLPT
 defs['KLPT_random_prime_attempts'] = 64
 defs['KLPT_secret_key_prime_size'] = ceil(logp / 4)
 # Also rounded up to a multiple of f.
 defs['KLPT_keygen_length'] =   f* ceil ( ceil(log(negl, 2) / -2 + 5/2*logp -25 ) / f)
 assert 2 ** (-2 * (defs['KLPT_keygen_length'] - 5/2*logp +25)) <= negl
 defs['KLPT_keygen_num_gamma_trial'] = ceil(log(negl, 2) / -1)
 defs['KLPT_eichler_smallnorm_bitsize'] = ceil(1/2*logp - 4/3*( logT - 5/4*logp))
 defs['KLPT_keygen_number_strong_approx'] = ceil(log(1/64, 1-2/5/logp))
 assert (1 - 2/5/logp) ** defs['KLPT_keygen_number_strong_approx'] <= 1/64
 # Eichler
 defs['KLPT_eichler_number_mu_norm'] = ceil((logT - 5/4*logp) / log(3,2))
 defs['KLPT_eichler_strong_approx_log_margin'] = 2
 defs['KLPT_eichler_num_equiv_ideal'] = ceil(logp / 10)
 defs['KLPT_eichler_number_strong_approx'] = ceil(10 * logp)
 # signature response
 defs['SQISIGN_response_attempts'] = 64
 # signature isogeny degrees
 defs['SQISIGN_random_length'] = 0
 defs['SQISIGN_signing_total_length'] = defs['KLPT_signing_klpt_length']
 # ZZ(...) raises unless the length is an exact multiple of tors2val.
 defs['SQISIGN_signing_length'] = ZZ(defs['SQISIGN_signing_total_length'] / tors2val)
 defs['SQISIGN_keygen_length'] = ZZ(defs['KLPT_keygen_length'] / tors2val)
 # prime data for Cornacchia
 # Primes below 100, split by residue mod 4 (2 lands in neither group).
 primes_1mod4 = [p for p in primes(100) if p%4==1]
 prod_primes_3mod4 = prod(p for p in primes(100) if p%4==3)
 ################################################################
 from cformat import Ibz, Object, ObjectFormatter
 # The two Cornacchia helper tables are exported as C objects; every entry
 # of `defs` becomes a #define in the header.
 objs = ObjectFormatter([
        Object('short[]', 'SMALL_PRIMES_1MOD4', [int(v) for v in primes_1mod4]),
        Object('ibz_t', 'PROD_SMALL_PRIMES_3MOD4', Ibz(prod_primes_3mod4)),
    ])
 ################################################################
 with open('include/klpt_constants.h','w') as hfile, open('klpt_constants.c','w') as cfile:
    print('#include <intbig.h>', file=hfile)
    for include_line in ('#include <stddef.h>', '#include <stdint.h>', '#include <klpt_constants.h>'):
        print(include_line, file=cfile)
    for name, value in defs.items():
        # ZZ() rejects any value that is not an exact integer.
        print(f'#define {name} {ZZ(value)}', file=hfile)
    objs.header(file=hfile)
    objs.implementation(file=cfile)
--- a/scripts/precompute_quaternion_data.sage
+++ b/scripts/precompute_quaternion_data.sage
@@ -0,0 +1,115 @@
 #!/usr/bin/env sage
 proof.all(False)  # faster
 from sage.misc.banner import require_version
 if not require_version(9, 8, print_message=True):
    exit('')
 ################################################################
 from parameters import p
 # Number of alternate maximal orders to collect in `orders`.
 num = 7  #TODO how many extra maximal orders to precompute?
 ################################################################
 # Underlying theory:
 # - Ibukiyama, On maximal orders of division quaternion algebras with certain optimal embeddings
 # - https://ia.cr/2023/106 Lemma 10
 from sage.algebras.quatalg.quaternion_algebra import basis_for_quaternion_lattice
 bfql = lambda els: basis_for_quaternion_lattice(els, reverse=True)
 Quat.<i,j,k> = QuaternionAlgebra(-1, -p)
 assert Quat.discriminant() == p         # ramifies correctly
 # Collected as (q, image of sqrt(-q) in Quat, basis matrix) triples.
 orders = []
 q = 1
 # Walk through the odd primes q, keeping those for which (-q, -p) has
 # discriminant p, until `num` orders have been found.
 while len(orders) < num:
    q = next_prime(q)
    if q == 2:
        continue
    Quat2.<ii,jj,kk> = QuaternionAlgebra(-q, -p)
    if Quat2.discriminant() != p:       # ramifies incorrectly
        continue
    # Find gamma = x + j*y of reduced norm q by solving x^2 + p*y^2 = q.
    x, y = QuadraticForm(QQ, 2, [1,0,p]).solve(q)
    gamma = x + j*y
    assert gamma.reduced_norm() == q
    # Images of 1, ii, jj, kk in Quat; the assertions check the relations.
    ims = [Quat(1), i*gamma, j, k*gamma]
    assert ims[1]^2 == -q
    assert ims[2]^2 == -p
    assert ims[1]*ims[2] == ims[3]
    assert ims[2]*ims[1] == -ims[3]
    # (1,ii,jj,kk)->ims is an isomorphism Quat2->Quat
    # r: smallest square root of -p modulo 4q, as an integer.
    r = min(map(ZZ, Mod(-p, 4*q).sqrt(all=True)))
    # The basis of the order in Quat2 depends on q mod 4.
    if q % 4 == 3:
        bas2 = [
                Quat2(1),
                (1 + ii) / 2,
                jj * (1 + ii) / 2,
                (r + jj) * ii / q,
            ]
    else:
        bas2 = [
                Quat2(1),
                ii,
                (1 + jj) / 2,
                (r + jj) * ii / 2 / q,
            ]
    O2 = Quat2.quaternion_order(bas2)
    assert O2.discriminant() == p       # is maximal
    # Transport the basis to Quat through `ims` and normalise it.
    bas = [sum(c*im for c,im in zip(el,ims)) for el in bas2]
    bas = bfql(bas)
    O = Quat.quaternion_order(bas)
    assert O.discriminant() == p        # is maximal
    assert j in O                       # p-extremal
    mat = matrix(map(list, bas))
 #    print(f'{q = }\nsqrt(-q) = {ims[1]}\n    {(chr(10)+"    ").join(map(str,bas))}', file=sys.stderr)
    assert mat[0] == vector((1,0,0,0))
    orders.append((q, ims[1], mat))
 ################################################################
 # Gram matrix of the basis of Quat: entry = (nrd(gi+gj) - nrd(gi) - nrd(gj)) / 2.
 gram = matrix(ZZ, [
    [((gi+gj).reduced_norm() - gi.reduced_norm() - gj.reduced_norm()) / 2
        for gi in Quat.basis()] for gj in Quat.basis()])
 # Coordinates of 1, i, (i+j)/2, (1+k)/2, one element per row.
 O0mat = matrix([list(g) for g in [Quat(1), i, (i+j)/2, (1+k)/2]])
 ################################################################
 from cformat import Ibz, Object, ObjectFormatter
 algobj = [Ibz(p), [[Ibz(v) for v in vs] for vs in gram]]
 # Orders are stored as [common denominator, transposed basis matrix scaled
 # by that denominator].
 O0ord = [Ibz(O0mat.denominator()), [[Ibz(v*O0mat.denominator()) for v in vs] for vs in O0mat.transpose()]]
 # [order, element (0,1,0,0), element (0,0,1,0), 1]; elements are written as
 # [denominator, coordinates].
 O0obj = [O0ord, [Ibz(1), [Ibz(c) for c in (0,1,0,0)]], [Ibz(1), [Ibz(c) for c in (0,0,1,0)]], 1]
 # Same layout for each alternate order, with the stored sqrt(-q) image as
 # the first element and q as the last field. The loop variable `ii` here
 # is that stored image, not the Quat2 generator.
 objs = [[[Ibz(mat.denominator()), [[Ibz(v*mat.denominator()) for v in vs] for vs in mat.transpose()]], [Ibz(mat.denominator()), [Ibz(c*mat.denominator()) for c in ii]], [Ibz(1), [Ibz(c) for c in (0,0,1,0)]], q] for q,ii,mat in orders]
 objs = ObjectFormatter([
        Object('quat_alg_t', 'QUATALG_PINFTY', algobj),
        Object('quat_order_t', 'MAXORD_O0', O0ord),
        Object('quat_p_extremal_maximal_order_t', 'STANDARD_EXTREMAL_ORDER', O0obj),
        Object('quat_p_extremal_maximal_order_t[]', 'ALTERNATE_EXTREMAL_ORDERS', objs),
    ])
 # Header: includes, the order count, then declarations.
 # Source: includes, then the definitions.
 with open('include/quaternion_data.h','w') as hfile, open('quaternion_data.c','w') as cfile:
    for include_line in ('#include <intbig.h>', '#include <quaternion.h>'):
        print(include_line, file=hfile)
    for include_line in ('#include <stddef.h>', '#include <stdint.h>', '#include <quaternion_data.h>'):
        print(include_line, file=cfile)
    order_count = len(orders)
    print(f'#define NUM_ALTERNATE_EXTREMAL_ORDERS {order_count}', file=hfile)
    objs.header(file=hfile)
    objs.implementation(file=cfile)
--- a/scripts/precompute_sizes.sage
+++ b/scripts/precompute_sizes.sage
@@ -0,0 +1,92 @@
 #!/usr/bin/env sage
 proof.all(False)  # faster
 from sage.misc.banner import require_version
 if not require_version(9, 8, print_message=True):
    exit('')
 ################################################################
 from parameters import lvl, f, p
 ################################################################
 # Bit size of p (rounded up) and the full powers of 2 and 3 dividing p+1.
 logp = ceil(log(p, 2))
 tors2part = (p+1).p_primary_part(2)
 tors3part = (p+1).p_primary_part(3)
 #XXX first load the constants from klpt_constants.h
 import re
 # Name -> value for every "#define NAME <number>" line of the generated header.
 klpt_consts = dict()
 with open('include/klpt_constants.h') as klpt_header:  # closed deterministically
    for l in klpt_header:
        m = re.search(r'#define *([^ ]+) *([x0-9]+)$', l)
        if m:
            k,v = m.groups()
            klpt_consts[k] = int(v, 0)
 # Name -> size; every entry becomes a #define in encoded_sizes.h.
 defs = dict()
 # Two coefficients, each logp bits rounded up to whole 64-bit words, in bytes.
 fp2sz = (logp + 63)//64*8 * 2
 defs['FP2_ENCODED_BYTES'] = fp2sz
 defs['EC_CURVE_ENCODED_BYTES'] = fp2sz  # just the A
 defs['EC_POINT_ENCODED_BYTES'] = fp2sz  # just the x
 defs['EC_BASIS_ENCODED_BYTES'] = 3 * defs['EC_POINT_ENCODED_BYTES']
 defs['CHAIN_LENGTH'] = klpt_consts['SQISIGN_keygen_length']
 defs['QUAT_ALG_ELEM_ENCODED_BITS'] = ceil(((logp/4) + klpt_consts['KLPT_keygen_length'])/2  +55)  #TODO FIXME figure this out XXX XXX
 defs['QUAT_ALG_ELEM_ENCODED_BYTES'] = (defs['QUAT_ALG_ELEM_ENCODED_BITS'] + 7)//8
 defs['ID2ISO_LONG_TWO_ISOG_ENCODED_BYTES'] = defs['CHAIN_LENGTH'] * (defs['EC_CURVE_ENCODED_BYTES'] + defs['EC_POINT_ENCODED_BYTES'] + 2)
 defs['ZIP_CHAIN_LEN'] = klpt_consts['SQISIGN_signing_length']
 # f bits per chain element, rounded up to whole bytes.
 defs['ID2ISO_COMPRESSED_LONG_TWO_ISOG_ZIP_CHAIN_BYTES'] = (f + 7) // 8
 defs['ID2ISO_COMPRESSED_LONG_TWO_ISOG_BYTES'] = defs['ZIP_CHAIN_LEN'] * defs['ID2ISO_COMPRESSED_LONG_TWO_ISOG_ZIP_CHAIN_BYTES'] + 1
 # Compressed isogeny + one value sized for 2^a*3^b + 1 byte + one value
 # sized for each of the 2-power and 3-power parts.
 defs['SIGNATURE_LEN'] = defs['ID2ISO_COMPRESSED_LONG_TWO_ISOG_BYTES'] + ((tors2part*tors3part).bit_length()+7)//8 + 1 + (tors2part.bit_length()+7)//8 + (tors3part.bit_length()+7)//8
 defs['PUBLICKEY_BYTES'] = defs['EC_CURVE_ENCODED_BYTES']
 # One curve, five quaternion elements, one point and two bases.
 defs['SECRETKEY_BYTES'] = defs['EC_CURVE_ENCODED_BYTES'] + 5*defs['QUAT_ALG_ELEM_ENCODED_BYTES'] + defs['EC_POINT_ENCODED_BYTES'] + defs['EC_BASIS_ENCODED_BYTES'] + defs['EC_BASIS_ENCODED_BYTES']
 # Values substituted into the api.h template.
 size_privkey = defs['SECRETKEY_BYTES']
 size_pubkey = defs['PUBLICKEY_BYTES']
 size_signature = defs['SIGNATURE_LEN']
 algname = f'lvl{lvl}'
 ################################################################
 # Write every size as a #define; ZZ() rejects non-integral values.
 with open('include/encoded_sizes.h','w') as hfile:
    for k,v in defs.items():
        v = ZZ(v)
        print(f'#define {k} {v}', file=hfile)
 # NIST API header for this security level, filled in with the sizes above.
 api = f'''
 // SPDX-License-Identifier: Apache-2.0
 #ifndef api_h
 #define api_h
 #define CRYPTO_SECRETKEYBYTES {size_privkey:4}
 #define CRYPTO_PUBLICKEYBYTES {size_pubkey:4}
 #define CRYPTO_BYTES          {size_signature:4}
 #define CRYPTO_ALGNAME "{algname}"
 int
 crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
 int
 crypto_sign(unsigned char *sm, unsigned long long *smlen,
            const unsigned char *m, unsigned long long mlen,
            const unsigned char *sk);
 int
 crypto_sign_open(unsigned char *m, unsigned long long *mlen,
                 const unsigned char *sm, unsigned long long smlen,
                 const unsigned char *pk);
 #endif /* api_h */
 '''.strip()
 # The handle is deliberately not called `f`: that name holds the parameter
 # imported from `parameters` and must not be rebound to a file object.
 with open(f'../../../nistapi/lvl{lvl}/api.h', 'w') as apifile:
    print(api, file=apifile)
--- a/scripts/precompute_torsion_constants.sage
+++ b/scripts/precompute_torsion_constants.sage
@@ -0,0 +1,67 @@
 #!/usr/bin/env sage
 proof.all(False)  # faster
 from sage.misc.banner import require_version
 if not require_version(9, 8, print_message=True):
    exit('')
 ################################################################
 from parameters import p, B, f, Tpls, Tmin, Dcom, Dchall
 ################################################################
 # Odd prime factors (sorted) of Tpls and Tmin, with their exponents in the
 # same order.
 Lpls = sorted(set(Tpls.prime_factors()) - {2})
 Epls = [Tpls.valuation(l) for l in Lpls]
 Lmin = sorted(set(Tmin.prime_factors()) - {2})
 Emin = [Tmin.valuation(l) for l in Lmin]
 # Full powers of 2 and 3 dividing p+1, and their product.
 tors2part = (p+1).p_primary_part(2)
 tors3part = (p+1).p_primary_part(3)
 tors23part = tors2part * tors3part
 # Byte lengths (bit length rounded up to whole bytes); emitted as #defines.
 defs = {
        'TORSION_2POWER_BYTES': (int(tors2part).bit_length() + 7) // 8,
        'TORSION_3POWER_BYTES': (int(tors3part).bit_length() + 7) // 8,
        'TORSION_23POWER_BYTES': (int(tors23part).bit_length() + 7) // 8,
    }
 from cformat import Ibz, Object, ObjectFormatter
 # C constants to export. The combined tables list the primes of Tpls first,
 # then those of Tmin.
 objs = ObjectFormatter([
        Object('uint64_t', 'TORSION_PLUS_EVEN_POWER', int(f)),
        Object('uint64_t[]', 'TORSION_ODD_PRIMES', Lpls + Lmin),
        Object('uint64_t[]', 'TORSION_ODD_POWERS', Epls + Emin),
        Object('uint64_t[]', 'TORSION_PLUS_ODD_PRIMES', Lpls),      # TODO deduplicate?
        Object('size_t[]', 'TORSION_PLUS_ODD_POWERS', Epls),        # TODO deduplicate?
        Object('uint64_t[]', 'TORSION_MINUS_ODD_PRIMES', Lmin),     # TODO deduplicate?
        Object('size_t[]', 'TORSION_MINUS_ODD_POWERS', Emin),       # TODO deduplicate?
        Object('size_t[]', 'DEGREE_COMMITMENT_POWERS', [Dcom.valuation(l) for l in Lpls+Lmin]), #FIXME should be ec_degree_odd_t
        Object('ibz_t', 'CHARACTERISTIC', Ibz(p)),
        Object('ibz_t', 'TORSION_ODD', Ibz(Tpls * Tmin)),
        # Prime powers in the order produced by factoring Tpls, then Tmin.
        Object('ibz_t[]', 'TORSION_ODD_PRIMEPOWERS', [Ibz(l^e) for Tpm in (Tpls,Tmin) for l,e in Tpm.factor()]),
        Object('ibz_t', 'TORSION_ODD_PLUS', Ibz(Tpls)),
        Object('ibz_t', 'TORSION_ODD_MINUS', Ibz(Tmin)),
        Object('ibz_t', 'TORSION_PLUS_2POWER', Ibz(tors2part)),
        Object('ibz_t', 'TORSION_PLUS_3POWER', Ibz(tors3part)),
        Object('ibz_t', 'TORSION_PLUS_23POWER', Ibz(tors23part)),
        Object('ibz_t', 'DEGREE_COMMITMENT', Ibz(Dcom)),
        Object('ibz_t', 'DEGREE_COMMITMENT_PLUS', Ibz(gcd(Dcom, Tpls))),
        Object('ibz_t', 'DEGREE_COMMITMENT_MINUS', Ibz(gcd(Dcom, Tmin))),
        Object('ibz_t', 'DEGREE_CHALLENGE', Ibz(Dchall)),
    ])
 # Header: include, byte-length #defines, declarations.
 # Source: includes, then the definitions.
 with open('include/torsion_constants.h','w') as hfile, open('torsion_constants.c','w') as cfile:
    print('#include <intbig.h>', file=hfile)
    for include_line in ('#include <stddef.h>', '#include <stdint.h>', '#include <torsion_constants.h>'):
        print(include_line, file=cfile)
    for name, value in defs.items():
        print(f'#define {name} {value}', file=hfile)
    objs.header(file=hfile)
    objs.implementation(file=cfile)
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -0,0 +1,92 @@
 # There are the following dependencies
 #     ┌─┬──────┬─┐           ┌─┬────┬─┐            ┌─┬──────┬─┐
 #     │ ├──────┤ │           │ ├────┤ │            │ ├──────┤ │
 #     │ │Keygen│ │           │ │Sign│ │            │ │Verify│ │
 #     │ ├──────┤ │           │ ├────┤ │            │ ├──────┤ │
 #     └─┴───┬──┴─┘           └─┴─┬──┴─┘            └─┴───┬──┴─┘
 #           │                    │                       │
 #           │                    │                       │
 #           ├────────────────────┼─────────────────┐     │
 #           │                    │                 │     │
 #           │                    │                 │     │
 #       ┌───▼──┐          ┌──────▼────────┐   ┌────▼─────▼───────────┐
 #       │ PRNG ◄────┬─────┤ Iso <-> Ideal ├───►   Elliptic Curves,   │
 #       └───▲──┘    │     └──────┬────────┘   │ Pairings & Isogenies │
 #           │       │            │            └───▲──────┬───────────┘
 #           │       │            │                │      │
 #       ┌───┴──┐    │            │                │      │
 #       │ KLPT ◄────┘            │     ┌──────────┘      │
 #       └───┬──┘                 │     │                 │
 #           │                    │     │                 │
 # ┌─────────▼─────────┐          │     │                 │
 # │ Quaternion orders │          │     │            ┌────▼───┐
 # │     and ideals    │          │     │            │ GF(p²) │
 # └─────────┬─────────┘          │     │            └────┬───┘
 #           │           ┌─┬──────▼─────┴──┬─┐            │
 #     ┌─────▼─────┐     │ ├───────────────┤ │      ┌─────▼─────┐
 #     │ MP BigInt │     │ │Precomputations│ │      │ FP BigInt │
 #     └───────────┘     │ ├───────────────┤ │      └───────────┘
 #                       └─┴───────────────┴─┘                    
 # Component subdirectories, added before the per-variant targets below.
 # The LIB_* / INC_* variables used further down are not set in this file;
 # presumably these subdirectories export them -- confirm before reordering.
 add_subdirectory(common)
 add_subdirectory(intbig)
 add_subdirectory(quaternion)
 add_subdirectory(precomp)
 add_subdirectory(klpt)
 add_subdirectory(gf)
 add_subdirectory(ec)
 add_subdirectory(id2iso)
 add_subdirectory(protocols)
 # For every configured variant build four libraries:
 #   sqisign_<v>               - the scheme, linked against sqisign_common_sys
 #   sqisign_<v>_test          - same sources, linked against sqisign_common_test
 #   sqisign_<v>_nistapi       - NIST API wrapper around sqisign_<v>
 #   sqisign_<v>_test_nistapi  - NIST API wrapper around sqisign_<v>_test
 foreach(SVARIANT ${SVARIANT_S})
    string(TOLOWER "${SVARIANT}" SVARIANT_LOWER)
    string(TOUPPER "${SVARIANT}" SVARIANT_UPPER)
    set(SOURCE_FILES_VARIANT sqisign.c)
    # Component libraries and include directories shared by the regular and
    # the test build, listed once so the two cannot drift apart.
    set(SQISIGN_VARIANT_LIBS
        ${LIB_PROTOCOLS_${SVARIANT_UPPER}}
        ${LIB_ID2ISO_${SVARIANT_UPPER}}
        ${LIB_KLPT_${SVARIANT_UPPER}}
        ${LIB_QUATERNION}
        ${LIB_PRECOMP_${SVARIANT_UPPER}}
        ${LIB_INTBIG}
        ${LIB_GF_${SVARIANT_UPPER}}
        ${LIB_EC_${SVARIANT_UPPER}}
        ${GMP}
    )
    set(SQISIGN_VARIANT_INCLUDES
        ${INC_PROTOCOLS}
        ${INC_INTBIG}
        ${INC_QUATERNION}
        ${INC_PRECOMP_${SVARIANT_UPPER}}
        ${INC_EC}
        ${INC_GF_${SVARIANT_UPPER}}
        ${INC_COMMON}
        ${INC_KLPT}
        ${INC_ID2ISO}
        ../include
    )
    # Library for SQIsign variant
    add_library(sqisign_${SVARIANT_LOWER} ${SOURCE_FILES_VARIANT})
    target_link_libraries(sqisign_${SVARIANT_LOWER} PUBLIC
        ${SQISIGN_VARIANT_LIBS}
        sqisign_common_sys
    )
    target_include_directories(sqisign_${SVARIANT_LOWER}
        PUBLIC ${SQISIGN_VARIANT_INCLUDES}
        PRIVATE common/generic internal
    )
    target_compile_definitions(sqisign_${SVARIANT_LOWER} PUBLIC SQISIGN_VARIANT=${SVARIANT})
    # Library for SQIsign variant (test)
    add_library(sqisign_${SVARIANT_LOWER}_test ${SOURCE_FILES_VARIANT})
    target_link_libraries(sqisign_${SVARIANT_LOWER}_test PUBLIC
        ${SQISIGN_VARIANT_LIBS}
        sqisign_common_test
    )
    target_include_directories(sqisign_${SVARIANT_LOWER}_test
        PUBLIC ${SQISIGN_VARIANT_INCLUDES}
        PRIVATE common/generic internal
    )
    target_compile_definitions(sqisign_${SVARIANT_LOWER}_test PUBLIC SQISIGN_VARIANT=${SVARIANT})
    # Library with NIST API
    set(SOURCE_FILE_NISTAPI nistapi/${SVARIANT_LOWER}/api.c)
    add_library(sqisign_${SVARIANT_LOWER}_nistapi ${SOURCE_FILE_NISTAPI})
    target_link_libraries(sqisign_${SVARIANT_LOWER}_nistapi PRIVATE sqisign_${SVARIANT_LOWER})
    target_include_directories(sqisign_${SVARIANT_LOWER}_nistapi PUBLIC nistapi/${SVARIANT_LOWER} ../include)
    # Library with NIST API (test)
    add_library(sqisign_${SVARIANT_LOWER}_test_nistapi ${SOURCE_FILE_NISTAPI})
    target_link_libraries(sqisign_${SVARIANT_LOWER}_test_nistapi PRIVATE sqisign_${SVARIANT_LOWER}_test)
    # Same usage requirements as the non-test wrapper: consumers also get
    # ../include, which the test wrapper previously did not propagate.
    target_include_directories(sqisign_${SVARIANT_LOWER}_test_nistapi PUBLIC nistapi/${SVARIANT_LOWER} ../include)
 endforeach()
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -0,0 +1,3 @@
 # Take the component name from this directory's name, upper-case it, and
 # hand over to the script named by SELECT_SQISIGN_VARIANT.
 # Expansions are quoted so a path containing ';' is not split into several
 # arguments.
 get_filename_component(CCSD_NAME "${CMAKE_CURRENT_SOURCE_DIR}" NAME)
 string(TOUPPER "${CCSD_NAME}" CCSD_NAME_UPPER)
 include("${SELECT_SQISIGN_VARIANT}")
--- a/src/common/generic/CMakeLists.txt
+++ b/src/common/generic/CMakeLists.txt
@@ -0,0 +1,26 @@
 # Two builds of the common support code. They share aes_c.c, fips202.c and
 # mem.c and differ only in the randombytes source they compile.
 set(SOURCE_FILES_COMMON_SYS
    randombytes_system.c
    aes_c.c
    fips202.c
    mem.c
 )
 set(SOURCE_FILES_COMMON_TEST
    randombytes_ctrdrbg.c
    aes_c.c
    fips202.c
    mem.c
 )
 add_library(sqisign_common_sys ${SOURCE_FILES_COMMON_SYS})
 add_library(sqisign_common_test ${SOURCE_FILES_COMMON_TEST})
 # NOTE(review): the two targets use different parent include paths
 # (../../include vs ../include). Confirm this is intended; both are kept
 # exactly as they were.
 target_include_directories(sqisign_common_sys PRIVATE include ../../include)
 target_include_directories(sqisign_common_test PRIVATE include ../include)
 # Settings applied identically to both libraries.
 foreach(COMMON_TARGET sqisign_common_sys sqisign_common_test)
    target_compile_options(${COMMON_TARGET} PUBLIC ${C_OPT_FLAGS})
    if (ENABLE_CT_TESTING)
        target_compile_definitions(${COMMON_TARGET} PUBLIC ENABLE_CT_TESTING)
    endif()
 endforeach()
--- a/src/common/generic/aes_c.c
+++ b/src/common/generic/aes_c.c
@@ -0,0 +1,740 @@
 // SPDX-License-Identifier: MIT and Apache-2.0
 /*
 * AES implementation based on code from PQClean,
 * which is in turn based on BearSSL (https://bearssl.org/)
 * by Thomas Pornin.
 *
 *
 * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
 #include <stdint.h>
 #include <string.h>
 #include <stdlib.h>
 // Key lengths in bytes for the three AES variants.
 #define AES128_KEYBYTES 16
 #define AES192_KEYBYTES 24
 #define AES256_KEYBYTES 32
 // Number of IV bytes read by the CTR routines (aes_ctr decodes three 32-bit words).
 #define AESCTR_NONCEBYTES 12
 // Size of one AES block in bytes.
 #define AES_BLOCKBYTES 16
 // We've put these states on the heap to make sure ctx_release is used.
 // The *_STATESIZE values count uint64_t words in the expanded key:
 // 8 words per round key and (rounds + 1) round keys (10, 12 and 14 rounds).
 #define PQC_AES128_STATESIZE 88
 typedef struct {
    // Heap-allocated expanded key; allocated by the *_keyexp functions and freed by aes128_ctx_release.
    uint64_t *sk_exp;
 } aes128ctx;
 #define PQC_AES192_STATESIZE 104
 typedef struct {
    // Heap-allocated expanded key; freed by aes192_ctx_release.
    uint64_t  *sk_exp;
 } aes192ctx;
 #define PQC_AES256_STATESIZE 120
 typedef struct {
    // Heap-allocated expanded key; freed by aes256_ctx_release.
    uint64_t *sk_exp;
 } aes256ctx;
 /** Initializes the context **/
 void aes128_ecb_keyexp(aes128ctx *r, const unsigned char *key);
 void aes128_ctr_keyexp(aes128ctx *r, const unsigned char *key);
 // Encrypts nblocks 16-byte blocks from in to out.
 void aes128_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes128ctx *ctx);
 // Writes outlen bytes of CTR-mode output derived from the 12-byte iv to out.
 void aes128_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes128ctx *ctx);
 /** Frees the context **/
 void aes128_ctx_release(aes128ctx *r);
 /** Initializes the context **/
 void aes192_ecb_keyexp(aes192ctx *r, const unsigned char *key);
 void aes192_ctr_keyexp(aes192ctx *r, const unsigned char *key);
 void aes192_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes192ctx *ctx);
 void aes192_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes192ctx *ctx);
 /** Frees the context **/
 void aes192_ctx_release(aes192ctx *r);
 /** Initializes the context **/
 void aes256_ecb_keyexp(aes256ctx *r, const unsigned char *key);
 void aes256_ctr_keyexp(aes256ctx *r, const unsigned char *key);
 void aes256_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes256ctx *ctx);
 void aes256_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes256ctx *ctx);
 /** Frees the context **/
 void aes256_ctx_release(aes256ctx *r);
 // Decodes the four bytes at src as a little-endian 32-bit unsigned value.
 static inline uint32_t br_dec32le(const unsigned char *src) {
    uint32_t value = 0;
    int idx;
    // Fold from the most significant byte (src[3]) down to src[0].
    for (idx = 3; idx >= 0; idx--) {
        value = (value << 8) | (uint32_t)src[idx];
    }
    return value;
 }
 // Decodes num consecutive little-endian 32-bit words from src into v.
 static void br_range_dec32le(uint32_t *v, size_t num, const unsigned char *src) {
    size_t idx;
    for (idx = 0; idx < num; idx++) {
        v[idx] = br_dec32le(src + 4 * idx);
    }
 }
 // Reverses the byte order of a 32-bit value.
 static inline uint32_t br_swap32(uint32_t x) {
    const uint32_t byte0 = x & (uint32_t)0xFF;
    const uint32_t byte1 = (x >> 8) & (uint32_t)0xFF;
    const uint32_t byte2 = (x >> 16) & (uint32_t)0xFF;
    const uint32_t byte3 = (x >> 24) & (uint32_t)0xFF;
    return (byte0 << 24) | (byte1 << 16) | (byte2 << 8) | byte3;
 }
 // Stores x at dst as four bytes in little-endian order.
 static inline void br_enc32le(unsigned char *dst, uint32_t x) {
    int idx;
    for (idx = 0; idx < 4; idx++) {
        dst[idx] = (unsigned char)(x >> (8 * idx));
    }
 }
 // Encodes num 32-bit words from v into dst, each in little-endian byte order.
 static void br_range_enc32le(unsigned char *dst, const uint32_t *v, size_t num) {
    size_t idx;
    for (idx = 0; idx < num; idx++) {
        br_enc32le(dst + 4 * idx, v[idx]);
    }
 }
 // Applies the AES S-box in place to a bitsliced state held in q[0..7].
 // Each q[i] carries one bit position of every byte being processed:
 // q[7] is read as x0 (high bit) and q[0] as x7 (low bit); results are
 // written back in the same layout. The function uses only XOR, AND and
 // NOT on whole words, with no data-dependent branches or table lookups.
 static void br_aes_ct64_bitslice_Sbox(uint64_t *q) {
    /*
     * This S-box implementation is a straightforward translation of
     * the circuit described by Boyar and Peralta in "A new
     * combinational logic minimization technique with applications
     * to cryptology" (https://eprint.iacr.org/2009/191.pdf).
     *
     * Note that variables x* (input) and s* (output) are numbered
     * in "reverse" order (x0 is the high bit, x7 is the low bit).
     */
    uint64_t x0, x1, x2, x3, x4, x5, x6, x7;
    uint64_t y1, y2, y3, y4, y5, y6, y7, y8, y9;
    uint64_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19;
    uint64_t y20, y21;
    uint64_t z0, z1, z2, z3, z4, z5, z6, z7, z8, z9;
    uint64_t z10, z11, z12, z13, z14, z15, z16, z17;
    uint64_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
    uint64_t t10, t11, t12, t13, t14, t15, t16, t17, t18, t19;
    uint64_t t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
    uint64_t t30, t31, t32, t33, t34, t35, t36, t37, t38, t39;
    uint64_t t40, t41, t42, t43, t44, t45, t46, t47, t48, t49;
    uint64_t t50, t51, t52, t53, t54, t55, t56, t57, t58, t59;
    uint64_t t60, t61, t62, t63, t64, t65, t66, t67;
    uint64_t s0, s1, s2, s3, s4, s5, s6, s7;
    // Load the inputs in reversed index order (see the note above).
    x0 = q[7];
    x1 = q[6];
    x2 = q[5];
    x3 = q[4];
    x4 = q[3];
    x5 = q[2];
    x6 = q[1];
    x7 = q[0];
    /*
     * Top linear transformation.
     */
    y14 = x3 ^ x5;
    y13 = x0 ^ x6;
    y9 = x0 ^ x3;
    y8 = x0 ^ x5;
    t0 = x1 ^ x2;
    y1 = t0 ^ x7;
    y4 = y1 ^ x3;
    y12 = y13 ^ y14;
    y2 = y1 ^ x0;
    y5 = y1 ^ x6;
    y3 = y5 ^ y8;
    t1 = x4 ^ y12;
    y15 = t1 ^ x5;
    y20 = t1 ^ x1;
    y6 = y15 ^ x7;
    y10 = y15 ^ t0;
    y11 = y20 ^ y9;
    y7 = x7 ^ y11;
    y17 = y10 ^ y11;
    y19 = y10 ^ y8;
    y16 = t0 ^ y11;
    y21 = y13 ^ y16;
    y18 = x0 ^ y16;
    /*
     * Non-linear section.
     */
    t2 = y12 & y15;
    t3 = y3 & y6;
    t4 = t3 ^ t2;
    t5 = y4 & x7;
    t6 = t5 ^ t2;
    t7 = y13 & y16;
    t8 = y5 & y1;
    t9 = t8 ^ t7;
    t10 = y2 & y7;
    t11 = t10 ^ t7;
    t12 = y9 & y11;
    t13 = y14 & y17;
    t14 = t13 ^ t12;
    t15 = y8 & y10;
    t16 = t15 ^ t12;
    t17 = t4 ^ t14;
    t18 = t6 ^ t16;
    t19 = t9 ^ t14;
    t20 = t11 ^ t16;
    t21 = t17 ^ y20;
    t22 = t18 ^ y19;
    t23 = t19 ^ y21;
    t24 = t20 ^ y18;
    t25 = t21 ^ t22;
    t26 = t21 & t23;
    t27 = t24 ^ t26;
    t28 = t25 & t27;
    t29 = t28 ^ t22;
    t30 = t23 ^ t24;
    t31 = t22 ^ t26;
    t32 = t31 & t30;
    t33 = t32 ^ t24;
    t34 = t23 ^ t33;
    t35 = t27 ^ t33;
    t36 = t24 & t35;
    t37 = t36 ^ t34;
    t38 = t27 ^ t36;
    t39 = t29 & t38;
    t40 = t25 ^ t39;
    t41 = t40 ^ t37;
    t42 = t29 ^ t33;
    t43 = t29 ^ t40;
    t44 = t33 ^ t37;
    t45 = t42 ^ t41;
    z0 = t44 & y15;
    z1 = t37 & y6;
    z2 = t33 & x7;
    z3 = t43 & y16;
    z4 = t40 & y1;
    z5 = t29 & y7;
    z6 = t42 & y11;
    z7 = t45 & y17;
    z8 = t41 & y10;
    z9 = t44 & y12;
    z10 = t37 & y3;
    z11 = t33 & y4;
    z12 = t43 & y13;
    z13 = t40 & y5;
    z14 = t29 & y2;
    z15 = t42 & y9;
    z16 = t45 & y14;
    z17 = t41 & y8;
    /*
     * Bottom linear transformation.
     */
    t46 = z15 ^ z16;
    t47 = z10 ^ z11;
    t48 = z5 ^ z13;
    t49 = z9 ^ z10;
    t50 = z2 ^ z12;
    t51 = z2 ^ z5;
    t52 = z7 ^ z8;
    t53 = z0 ^ z3;
    t54 = z6 ^ z7;
    t55 = z16 ^ z17;
    t56 = z12 ^ t48;
    t57 = t50 ^ t53;
    t58 = z4 ^ t46;
    t59 = z3 ^ t54;
    t60 = t46 ^ t57;
    t61 = z14 ^ t57;
    t62 = t52 ^ t58;
    t63 = t49 ^ t58;
    t64 = z4 ^ t59;
    t65 = t61 ^ t62;
    t66 = z1 ^ t63;
    s0 = t59 ^ t63;
    s6 = t56 ^ ~t62;
    s7 = t48 ^ ~t60;
    t67 = t64 ^ t65;
    s3 = t53 ^ t66;
    s4 = t51 ^ t66;
    s5 = t47 ^ t65;
    s1 = t64 ^ ~s3;
    s2 = t55 ^ ~t67;
    // Store the outputs using the same reversed index order as the inputs.
    q[7] = s0;
    q[6] = s1;
    q[5] = s2;
    q[4] = s3;
    q[3] = s4;
    q[2] = s5;
    q[1] = s6;
    q[0] = s7;
 }
 // In-place bit permutation over the eight words q[0..7], built from three
 // rounds of masked pair exchanges at shift distances 1, 2 and 4. Each single
 // exchange is its own inverse.
 static void br_aes_ct64_ortho(uint64_t *q) {
 #define SWAPN(cl, ch, s, x, y)   do { \
        uint64_t first_in, second_in; \
        first_in = (x); \
        second_in = (y); \
        (x) = (first_in & (uint64_t)(cl)) | ((second_in & (uint64_t)(cl)) << (s)); \
        (y) = ((first_in & (uint64_t)(ch)) >> (s)) | (second_in & (uint64_t)(ch)); \
    } while (0)
 #define SWAP2(x, y)    SWAPN(0x5555555555555555, 0xAAAAAAAAAAAAAAAA,  1, x, y)
 #define SWAP4(x, y)    SWAPN(0x3333333333333333, 0xCCCCCCCCCCCCCCCC,  2, x, y)
 #define SWAP8(x, y)    SWAPN(0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0,  4, x, y)
    unsigned int base, off;
    // Round 1: neighbouring words (0,1) (2,3) (4,5) (6,7).
    for (base = 0; base < 8; base += 2) {
        SWAP2(q[base], q[base + 1]);
    }
    // Round 2: words two apart inside each half (0,2) (1,3) (4,6) (5,7).
    for (base = 0; base < 8; base += 4) {
        for (off = 0; off < 2; off++) {
            SWAP4(q[base + off], q[base + off + 2]);
        }
    }
    // Round 3: words four apart (0,4) (1,5) (2,6) (3,7).
    for (off = 0; off < 4; off++) {
        SWAP8(q[off], q[off + 4]);
    }
 }
 // Spreads the four 32-bit words w[0..3] over two 64-bit words. Each byte of
 // a word is moved into its own 16-bit slot; w[0] and w[2] are then combined
 // into *q0 (w[2] shifted up by 8 bits), w[1] and w[3] into *q1.
 static void br_aes_ct64_interleave_in(uint64_t *q0, uint64_t *q1, const uint32_t *w) {
    uint64_t lane[4];
    int k;
    for (k = 0; k < 4; k++) {
        uint64_t t = w[k];
        t = (t | (t << 16)) & (uint64_t)0x0000FFFF0000FFFF;
        t = (t | (t << 8)) & (uint64_t)0x00FF00FF00FF00FF;
        lane[k] = t;
    }
    *q0 = lane[0] | (lane[2] << 8);
    *q1 = lane[1] | (lane[3] << 8);
 }
 // Splits q0 and q1 back into four 32-bit words: the even bytes of q0 and q1
 // produce w[0] and w[1], the odd bytes produce w[2] and w[3].
 static void br_aes_ct64_interleave_out(uint32_t *w, uint64_t q0, uint64_t q1) {
    uint64_t lane[4];
    int k;
    lane[0] = q0;
    lane[1] = q1;
    lane[2] = q0 >> 8;
    lane[3] = q1 >> 8;
    for (k = 0; k < 4; k++) {
        uint64_t t = lane[k] & (uint64_t)0x00FF00FF00FF00FF;
        t = (t | (t >> 8)) & (uint64_t)0x0000FFFF0000FFFF;
        w[k] = (uint32_t)t | (uint32_t)(t >> 16);
    }
 }
 // Round constants XORed into the key schedule by br_aes_ct64_keysched,
 // one entry per completed group of nk key words.
 static const unsigned char Rcon[] = {
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
 };
 // Runs one 32-bit word through the bitsliced S-box: the word is placed in
 // the first lane of an otherwise zero state, transformed, and read back
 // from that same lane.
 static uint32_t sub_word(uint32_t x) {
    uint64_t lanes[8] = { 0 };
    lanes[0] = x;
    br_aes_ct64_ortho(lanes);
    br_aes_ct64_bitslice_Sbox(lanes);
    br_aes_ct64_ortho(lanes);
    return (uint32_t)lanes[0];
 }
 // Expands key (key_len bytes; the callers in this file pass 16, 24 or 32)
 // into round keys and stores them in compressed bitsliced form in comp_skey:
 // two uint64_t words per round key, 2 * (nrounds + 1) words in total.
 static void br_aes_ct64_keysched(uint64_t *comp_skey, const unsigned char *key, unsigned int key_len) {
    unsigned int i, j, k, nk, nkf;
    uint32_t tmp;
    uint32_t skey[60];
    // 10, 12 or 14 rounds for key_len 16, 24 or 32.
    unsigned nrounds = 10 + ((key_len - 16) >> 2);
    // nk: key length in 32-bit words; nkf: total number of round-key words.
    nk = (key_len >> 2);
    nkf = ((nrounds + 1) << 2);
    br_range_dec32le(skey, (key_len >> 2), key);
    tmp = skey[(key_len >> 2) - 1];
    // j counts the position inside the current group of nk words, k indexes Rcon.
    for (i = nk, j = 0, k = 0; i < nkf; i ++) {
        if (j == 0) {
            // Start of a group: rotate the previous word by one byte, substitute, add the round constant.
            tmp = (tmp << 24) | (tmp >> 8);
            tmp = sub_word(tmp) ^ Rcon[k];
        } else if (nk > 6 && j == 4) {
            // Extra substitution applied only when nk > 6 (32-byte keys).
            tmp = sub_word(tmp);
        }
        tmp ^= skey[i - nk];
        skey[i] = tmp;
        if (++ j == nk) {
            j = 0;
            k ++;
        }
    }
    // Convert each round key (4 words) to bitsliced form and pack it into two words.
    for (i = 0, j = 0; i < nkf; i += 4, j += 2) {
        uint64_t q[8];
        br_aes_ct64_interleave_in(&q[0], &q[4], skey + i);
        // Replicate the round key into all four lanes of each half before the permutation.
        q[1] = q[0];
        q[2] = q[0];
        q[3] = q[0];
        q[5] = q[4];
        q[6] = q[4];
        q[7] = q[4];
        br_aes_ct64_ortho(q);
        // Keep one bit out of every four from each lane; br_aes_ct64_skey_expand undoes this packing.
        comp_skey[j + 0] =
            (q[0] & (uint64_t)0x1111111111111111)
            | (q[1] & (uint64_t)0x2222222222222222)
            | (q[2] & (uint64_t)0x4444444444444444)
            | (q[3] & (uint64_t)0x8888888888888888);
        comp_skey[j + 1] =
            (q[4] & (uint64_t)0x1111111111111111)
            | (q[5] & (uint64_t)0x2222222222222222)
            | (q[6] & (uint64_t)0x4444444444444444)
            | (q[7] & (uint64_t)0x8888888888888888);
    }
 }
 // Unpacks the compressed key schedule: each of the 2 * (nrounds + 1) words
 // of comp_skey yields four words of skey. For bit plane b (0..3) the bits
 // at positions b, b+4, b+8, ... are isolated, moved to the bottom of their
 // nibble and replicated over the whole nibble (v * 15 == (v << 4) - v).
 static void br_aes_ct64_skey_expand(uint64_t *skey, const uint64_t *comp_skey, unsigned int nrounds) {
    const unsigned int total = (nrounds + 1) << 1;
    unsigned int src, plane;
    for (src = 0; src < total; src++) {
        const uint64_t packed = comp_skey[src];
        for (plane = 0; plane < 4; plane++) {
            const uint64_t bits = (packed >> plane) & (uint64_t)0x1111111111111111;
            skey[4 * src + plane] = (bits << 4) - bits;
        }
    }
 }
 // XORs the eight round-key words sk[0..7] into the state words q[0..7].
 static inline void add_round_key(uint64_t *q, const uint64_t *sk) {
    int lane;
    for (lane = 0; lane < 8; lane++) {
        q[lane] ^= sk[lane];
    }
 }
 // ShiftRows on the bitsliced state. Every 64-bit word is treated as four
 // 16-bit rows; row r (counting from the least significant row) is rotated
 // right by 4 * r bits inside its own 16 bits, so row 0 is unchanged.
 static inline void shift_rows(uint64_t *q) {
    int word, row;
    for (word = 0; word < 8; word++) {
        const uint64_t in = q[word];
        uint64_t out = in & (uint64_t)0xFFFF;
        for (row = 1; row < 4; row++) {
            const unsigned int rot = 4u * (unsigned int)row;
            const uint64_t lane = (in >> (16 * row)) & (uint64_t)0xFFFF;
            const uint64_t turned = ((lane >> rot) | (lane << (16 - rot))) & (uint64_t)0xFFFF;
            out |= turned << (16 * row);
        }
        q[word] = out;
    }
 }
 // Swaps the upper and lower 32-bit halves of x (a 64-bit rotation by 32).
 static inline uint64_t rotr32(uint64_t x) {
    const uint64_t upper = x >> 32;
    const uint64_t lower = x & (uint64_t)0xFFFFFFFF;
    return (lower << 32) | upper;
 }
 // MixColumns on the bitsliced state q[0..7], computed in place.
 // With r[i] = q[i] rotated right by 16 bits and m[i] = q[i] ^ r[i]:
 //   q'[0] = m[7] ^ r[0] ^ rotr32(m[0])
 //   q'[i] = m[i-1] ^ r[i] ^ rotr32(m[i])   for i = 1..7
 // and m[7] is additionally XORed into outputs 1, 3 and 4.
 static inline void mix_columns(uint64_t *q) {
    uint64_t rot16[8];
    uint64_t mixed[8];
    uint64_t wrap;
    int i;
    // Snapshot everything derived from the old state before overwriting q.
    for (i = 0; i < 8; i++) {
        rot16[i] = (q[i] >> 16) | (q[i] << 48);
        mixed[i] = q[i] ^ rot16[i];
    }
    wrap = mixed[7];
    q[0] = wrap ^ rot16[0] ^ rotr32(mixed[0]);
    for (i = 1; i < 8; i++) {
        uint64_t acc = mixed[i - 1] ^ rot16[i] ^ rotr32(mixed[i]);
        // The index test depends only on the loop counter, never on data.
        if (i == 1 || i == 3 || i == 4) {
            acc ^= wrap;
        }
        q[i] = acc;
    }
 }
 // Adds 4 to the value stored at x, treating it as byte-swapped relative to
 // the in-memory word (swap, add, swap back).
 static void inc4_be(uint32_t *x) {
    uint32_t counter = br_swap32(*x);
    counter += 4;
    *x = br_swap32(counter);
 }
 // Encrypts four 16-byte blocks at once. ivw holds the four blocks as sixteen
 // 32-bit words (already decoded), sk_exp is the expanded key with 8 words
 // per round key, and the 64 output bytes are written to out.
 static void aes_ecb4x(unsigned char out[64], const uint32_t ivw[16], const uint64_t *sk_exp, unsigned int nrounds) {
    uint32_t words[16];
    uint64_t state[8];
    unsigned int blk, round;
    memcpy(words, ivw, sizeof(words));
    // Load the four blocks into bitsliced form.
    for (blk = 0; blk < 4; blk++) {
        br_aes_ct64_interleave_in(&state[blk], &state[blk + 4], &words[4 * blk]);
    }
    br_aes_ct64_ortho(state);
    // Initial key addition followed by nrounds - 1 full rounds.
    add_round_key(state, sk_exp);
    round = 1;
    while (round < nrounds) {
        br_aes_ct64_bitslice_Sbox(state);
        shift_rows(state);
        mix_columns(state);
        add_round_key(state, sk_exp + 8 * round);
        round++;
    }
    // Final round has no MixColumns step.
    br_aes_ct64_bitslice_Sbox(state);
    shift_rows(state);
    add_round_key(state, sk_exp + (nrounds << 3));
    // Convert back from bitsliced form and serialise.
    br_aes_ct64_ortho(state);
    for (blk = 0; blk < 4; blk++) {
        br_aes_ct64_interleave_out(&words[4 * blk], state[blk], state[blk + 4]);
    }
    br_range_enc32le(out, words, 16);
 }
 // Produces 64 bytes of output from the four counter blocks in ivw, then
 // advances each block's counter word (index 3 of every group of four) by 4.
 static void aes_ctr4x(unsigned char out[64], uint32_t ivw[16], const uint64_t *sk_exp, unsigned int nrounds) {
    unsigned int ctr_word;
    aes_ecb4x(out, ivw, sk_exp, nrounds);
    /* Increase counter for next 4 blocks */
    for (ctr_word = 3; ctr_word < 16; ctr_word += 4) {
        inc4_be(ivw + ctr_word);
    }
 }
 // Encrypts nblocks 16-byte blocks from in to out with the expanded key
 // rkeys. Blocks are processed four at a time; a final group of one to three
 // blocks is encrypted through a temporary buffer so that only
 // nblocks * 16 bytes are written to out.
 static void aes_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const uint64_t *rkeys, unsigned int nrounds) {
    uint32_t blocks[16];
    unsigned char t[64];
    while (nblocks >= 4) {
        br_range_dec32le(blocks, 16, in);
        aes_ecb4x(out, blocks, rkeys, nrounds);
        nblocks -= 4;
        in += 64;
        out += 64;
    }
    if (nblocks) {
        // Only nblocks * 4 words are decoded below, but aes_ecb4x reads all
        // sixteen. Zero the array first so the unused lanes never carry
        // indeterminate stack contents or words left from the previous group.
        memset(blocks, 0, sizeof(blocks));
        br_range_dec32le(blocks, nblocks * 4, in);
        aes_ecb4x(t, blocks, rkeys, nrounds);
        memcpy(out, t, nblocks * 16);
    }
 }
 // Writes outlen bytes of CTR-mode output to out. The first 12 bytes of iv
 // form the nonce shared by all four lanes; the fourth word of each lane is
 // the block counter, starting at 0, 1, 2 and 3 and stored byte-swapped.
 static void aes_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const uint64_t *rkeys, unsigned int nrounds) {
    uint32_t ivw[16];
    unsigned int lane;
    br_range_dec32le(ivw, 3, iv);
    for (lane = 1; lane < 4; lane++) {
        memcpy(ivw + 4 * lane, ivw, 3 * sizeof(uint32_t));
    }
    for (lane = 0; lane < 4; lane++) {
        ivw[4 * lane + 3] = br_swap32((uint32_t)lane);
    }
    // Full 64-byte groups go straight to the caller's buffer.
    for (; outlen > 64; outlen -= 64) {
        aes_ctr4x(out, ivw, rkeys, nrounds);
        out += 64;
    }
    // The last 1..64 bytes are generated into a scratch buffer and copied.
    if (outlen > 0) {
        unsigned char tmp[64];
        aes_ctr4x(tmp, ivw, rkeys, nrounds);
        memcpy(out, tmp, outlen);
    }
 }
 // Allocates r->sk_exp (PQC_AES128_STATESIZE words) and fills it with the
 // expanded schedule of the 16-byte key. Terminates the process with exit
 // status 111 when the allocation fails.
 void aes128_ecb_keyexp(aes128ctx *r, const unsigned char *key) {
    uint64_t comp_skey[22];
    uint64_t *expanded = malloc(sizeof(uint64_t) * PQC_AES128_STATESIZE);
    r->sk_exp = expanded;
    if (expanded == NULL) {
        exit(111);
    }
    br_aes_ct64_keysched(comp_skey, key, 16);
    br_aes_ct64_skey_expand(expanded, comp_skey, 10);
 }
 // CTR mode uses the same expanded key as ECB; forwards to aes128_ecb_keyexp.
 void aes128_ctr_keyexp(aes128ctx *r, const unsigned char *key) {
    aes128_ecb_keyexp(r, key);
 }
 // Allocates r->sk_exp (PQC_AES192_STATESIZE words) and fills it with the
 // expanded schedule of the 24-byte key. Terminates the process with exit
 // status 111 when the allocation fails.
 void aes192_ecb_keyexp(aes192ctx *r, const unsigned char *key) {
    uint64_t comp_skey[26];
    uint64_t *expanded = malloc(sizeof(uint64_t) * PQC_AES192_STATESIZE);
    r->sk_exp = expanded;
    if (expanded == NULL) {
        exit(111);
    }
    br_aes_ct64_keysched(comp_skey, key, 24);
    br_aes_ct64_skey_expand(expanded, comp_skey, 12);
 }
 // CTR mode uses the same expanded key as ECB; forwards to aes192_ecb_keyexp.
 void aes192_ctr_keyexp(aes192ctx *r, const unsigned char *key) {
    aes192_ecb_keyexp(r, key);
 }
 // Allocates r->sk_exp (PQC_AES256_STATESIZE words) and fills it with the
 // expanded schedule of the 32-byte key. Terminates the process with exit
 // status 111 when the allocation fails.
 void aes256_ecb_keyexp(aes256ctx *r, const unsigned char *key) {
    uint64_t comp_skey[30];
    uint64_t *expanded = malloc(sizeof(uint64_t) * PQC_AES256_STATESIZE);
    r->sk_exp = expanded;
    if (expanded == NULL) {
        exit(111);
    }
    br_aes_ct64_keysched(comp_skey, key, 32);
    br_aes_ct64_skey_expand(expanded, comp_skey, 14);
 }
 // CTR mode uses the same expanded key as ECB; forwards to aes256_ecb_keyexp.
 void aes256_ctr_keyexp(aes256ctx *r, const unsigned char *key) {
    aes256_ecb_keyexp(r, key);
 }
 // AES-128 ECB encryption of nblocks 16-byte blocks (10 rounds) using the key in ctx.
 void aes128_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes128ctx *ctx) {
    aes_ecb(out, in, nblocks, ctx->sk_exp, 10);
 }
 // AES-128 CTR output of outlen bytes (10 rounds); iv supplies the 12 nonce bytes.
 void aes128_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes128ctx *ctx) {
    aes_ctr(out, outlen, iv, ctx->sk_exp, 10);
 }
 // AES-192 ECB encryption of nblocks 16-byte blocks (12 rounds) using the key in ctx.
 void aes192_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes192ctx *ctx) {
    aes_ecb(out, in, nblocks, ctx->sk_exp, 12);
 }
 // AES-192 CTR output of outlen bytes (12 rounds); iv supplies the 12 nonce bytes.
 void aes192_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes192ctx *ctx) {
    aes_ctr(out, outlen, iv, ctx->sk_exp, 12);
 }
 // AES-256 ECB encryption of nblocks 16-byte blocks (14 rounds) using the key in ctx.
 void aes256_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes256ctx *ctx) {
    aes_ecb(out, in, nblocks, ctx->sk_exp, 14);
 }
 // AES-256 CTR output of outlen bytes (14 rounds); iv supplies the 12 nonce bytes.
 void aes256_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes256ctx *ctx) {
    aes_ctr(out, outlen, iv, ctx->sk_exp, 14);
 }
 // Frees the expanded key owned by r. The pointer is reset afterwards so
 // that releasing the same context twice calls free(NULL), which is a no-op,
 // instead of freeing the same allocation twice.
 void aes128_ctx_release(aes128ctx *r) {
    free(r->sk_exp);
    r->sk_exp = NULL;
 }
 // Frees the expanded key owned by r. The pointer is reset afterwards so
 // that releasing the same context twice calls free(NULL), which is a no-op,
 // instead of freeing the same allocation twice.
 void aes192_ctx_release(aes192ctx *r) {
    free(r->sk_exp);
    r->sk_exp = NULL;
 }
 // Frees the expanded key owned by r. The pointer is reset afterwards so
 // that releasing the same context twice calls free(NULL), which is a no-op,
 // instead of freeing the same allocation twice.
 void aes256_ctx_release(aes256ctx *r) {
    free(r->sk_exp);
    r->sk_exp = NULL;
 }
 // Expands a key into outputByteLen bytes of AES-128-CTR output.
 // The first 16 bytes of input are used as the AES-128 key and the IV is all
 // zero. inputByteLen is not consulted by this implementation, so the caller
 // must supply at least 16 readable bytes.
 // Returns outputByteLen converted to int.
 int AES_128_CTR(unsigned char *output, size_t outputByteLen,
                const unsigned char *input, size_t inputByteLen) {
    aes128ctx ctx;
    // aes_ctr reads only the first 12 bytes of this buffer.
    unsigned char iv[16] = { 0 };
    aes128_ctr_keyexp(&ctx, input);
    aes128_ctr(output, outputByteLen, iv, &ctx);
    aes128_ctx_release(&ctx);
    return (int)outputByteLen;
 }
 // Encrypts one 16-byte block from input under the 32-byte key and writes
 // the 16-byte result to output. A fresh key schedule is allocated, used and
 // released on every call.
 void AES_256_ECB(const uint8_t *input, const unsigned char *key, unsigned char *output) {
    aes256ctx ctx;
    aes256_ecb_keyexp(&ctx, key);
    aes256_ecb(output, input, 1, &ctx);
    aes256_ctx_release(&ctx);
 }
--- a/src/common/generic/fips202.c
+++ b/src/common/generic/fips202.c
--- a/src/common/generic/include/aes.h
+++ b/src/common/generic/include/aes.h
@@ -0,0 +1,23 @@
 // SPDX-License-Identifier: Apache-2.0
 #ifndef AES_H
 #define AES_H
 #include <stddef.h>
 #include <stdint.h>
 // AES-256 encryption of a single block: input, then key, then output.
 // Note the argument order differs from the AES256_ECB(key, ctr, buffer)
 // helper used in randombytes_ctrdrbg.c.
 void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output);
 #define AES_ECB_encrypt AES_256_ECB
 #ifdef ENABLE_AESNI
 // With ENABLE_AESNI, AES_128_CTR is redirected to AES_128_CTR_NI.
 // The *_NI functions are only declared here; their definitions are not in
 // this header.
 int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen,
                   const unsigned char *input, size_t inputByteLen);
 int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen,
                      const unsigned char *input, size_t inputByteLen);
 #define AES_128_CTR AES_128_CTR_NI
 #else
 // Writes outputByteLen bytes to output, derived from the key material in input.
 int AES_128_CTR(unsigned char *output, size_t outputByteLen,
                const unsigned char *input, size_t inputByteLen);
 #endif
 #endif
--- a/src/common/generic/include/bench.h
+++ b/src/common/generic/include/bench.h
@@ -0,0 +1,63 @@
 // SPDX-License-Identifier: Apache-2.0
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
 #include <inttypes.h>
 #if defined(TARGET_OS_UNIX) && (defined(TARGET_ARM) || defined(TARGET_ARM64) || defined(TARGET_OTHER))
 #include <time.h>
 #endif
 // Unit label for the values returned by cpucycles(). Only the
 // TARGET_AMD64 / TARGET_X86 branch of cpucycles() reads the cycle counter;
 // every other branch returns nanoseconds. Both macros use that same
 // condition so that they always agree with each other and with cpucycles().
 #if (defined(TARGET_AMD64) || defined(TARGET_X86))
 #define print_bench_unit printf("cycles\n");
 #define BENCH_UNITS "cycles"
 #else
 #define print_bench_unit printf("nsec\n");
 #define BENCH_UNITS "nsec"
 #endif
 // Returns a timestamp for benchmarking: the cycle counter on x86/AMD64,
 // otherwise a value in nanoseconds (from the TOD clock on s390x or from
 // clock_gettime elsewhere).
 static inline int64_t cpucycles(void) {
 #if (defined(TARGET_AMD64) || defined(TARGET_X86))
    unsigned int hi, lo;
    asm volatile ("rdtsc" : "=a" (lo), "=d"(hi));
    return ((int64_t) lo) | (((int64_t) hi) << 32);
 #elif (defined(TARGET_S390X))
    uint64_t tod;
    asm volatile("stckf %0\n" : "=Q" (tod) : : "cc");
    return (tod * 1000 / 4096);
 #else
    struct timespec time;
    clock_gettime(CLOCK_REALTIME, &time);
    // Pure integer arithmetic: going through double (tv_sec * 1e9) cannot
    // represent current epoch times to the nanosecond and would round away
    // the low bits of the result.
    return (int64_t)time.tv_sec * 1000000000 + (int64_t)time.tv_nsec;
 #endif
 }
 // qsort comparator for uint64_t elements in ascending order.
 // Returns -1, 0 or 1. The operands are compared directly; returning their
 // difference would be truncated to int and could report the wrong sign
 // (or 0) whenever the values differ by 2^31 or more.
 static inline int cmpfunc (const void *a, const void *b) {
    const uint64_t lhs = *(const uint64_t *)a;
    const uint64_t rhs = *(const uint64_t *)b;
    return (lhs > rhs) - (lhs < rhs);
 }
 // Benchmark bracket: place the code under test between BENCH_CODE_1(r) and
 // BENCH_CODE_2(name, csv). BENCH_CODE_1 opens a loop of r iterations and
 // takes the start timestamp; BENCH_CODE_2 takes the end timestamp, closes
 // the loop, sorts the recorded samples and prints the median (and, when csv
 // is false, also the average and the unit).
 // The macros rely on these names being declared by the caller: i, cycles,
 // cycles1, cycles2, cycles_list, runs and LIST_SIZE. Only the first
 // LIST_SIZE samples are stored in cycles_list.
 // NOTE(review): BENCH_CODE_2 uses `runs` for the sample count and average,
 // while BENCH_CODE_1 loops up to its own argument r; callers presumably
 // pass runs as r - confirm.
 #define BENCH_CODE_1(r) \
    cycles = 0; \
    for (i = 0; i < (r); ++i) { \
        cycles1 = cpucycles();
 #define BENCH_CODE_2(name, csv) \
        cycles2 = cpucycles(); \
        if(i < LIST_SIZE) \
          cycles_list[i] = (cycles2 - cycles1);\
        cycles = cycles + (cycles2 - cycles1); \
    } \
    qsort(cycles_list, (runs < LIST_SIZE)? runs : LIST_SIZE, sizeof(uint64_t), cmpfunc);\
    if (csv) \
      printf("%2" PRId64 ",", cycles_list[(runs < LIST_SIZE)? runs/2 : LIST_SIZE/2]); \
    else { \
      printf("  %-20s-> median: %2" PRId64 ", average: %2" PRId64 " ", name, \
      cycles_list[(runs < LIST_SIZE)? runs/2 : LIST_SIZE/2], (cycles / runs)); \
      printf("%s\n", BENCH_UNITS); \
    }
--- a/src/common/generic/include/fips202.h
+++ b/src/common/generic/include/fips202.h
@@ -0,0 +1,11 @@
 // SPDX-License-Identifier: Apache-2.0
 #ifndef FIPS202_H
 #define FIPS202_H
 #include <stddef.h>
 // SHAKE128 / SHAKE256 one-shot interfaces: read inputByteLen bytes from
 // input and write outputByteLen bytes to output. The meaning of the int
 // return value is defined by the implementation, which is not part of this
 // header.
 int SHAKE128(unsigned char *output, size_t outputByteLen, const unsigned char *input, size_t inputByteLen);
 int SHAKE256(unsigned char *output, size_t outputByteLen, const unsigned char *input, size_t inputByteLen);
 #endif
--- a/src/common/generic/include/tutil.h
+++ b/src/common/generic/include/tutil.h
@@ -0,0 +1,33 @@
 #ifndef TUTIL_H
 #define TUTIL_H
 #include <stddef.h>
 #include <stdint.h>
 // Byte-order reversal for 32- and 64-bit values. GCC and Clang provide
 // builtins; other compilers use the portable expressions below.
 #if defined(__GNUC__) || defined(__clang__)
 #define BSWAP32(i) __builtin_bswap32((i))
 #define BSWAP64(i) __builtin_bswap64((i))
 #else
 // The argument is narrowed to uint32_t first so that the macro yields a
 // clean 32-bit result whatever the width of the argument.
 #define BSWAP32(i) (((((uint32_t)(i)) >> 24) & 0xffu) | ((((uint32_t)(i)) >> 8) & 0xff00u) | ((((uint32_t)(i)) & 0xff00u) << 8) | (((uint32_t)(i)) << 24))
 // Swap each 32-bit half and exchange the halves. The operands are widened
 // to uint64_t before shifting by 32 so the shift is always well defined.
 #define BSWAP64(i) (((uint64_t)BSWAP32(((uint64_t)(i)) >> 32)) | (((uint64_t)BSWAP32((uint64_t)(i))) << 32))
 #endif
 // Digit configuration, selected by the build through RADIX_64 or RADIX_32:
 //   digit_t / sdigit_t - unsigned / signed digit type
 //   DIGIT_LEN          - digit size in bytes
 //   RADIX              - digit size in bits
 //   LOG2RADIX          - log2(RADIX)
 //   BSWAP_DIGIT        - byte swap matching the digit width
 // Compilation stops with an error when neither macro is defined.
 #if defined(RADIX_64)
 #define digit_t uint64_t
 #define sdigit_t int64_t
 #define DIGIT_LEN 8
 #define RADIX 64
 #define LOG2RADIX 6
 #define BSWAP_DIGIT(i) BSWAP64(i)
 #elif defined(RADIX_32)
 #define digit_t uint32_t
 #define sdigit_t int32_t
 #define DIGIT_LEN 4
 #define RADIX 32
 #define LOG2RADIX 5
 #define BSWAP_DIGIT(i) BSWAP32(i)
 #else
 #error "Radix must be 32bit or 64 bit"
 #endif
 #endif
--- a/src/common/generic/mem.c
+++ b/src/common/generic/mem.c
@@ -0,0 +1,18 @@
 // SPDX-License-Identifier: Apache-2.0
 #include <string.h>
 #include <stdlib.h>
 // Zeroes size bytes at mem and then frees it. A NULL mem is ignored.
 // memset is called through a volatile function pointer so the compiler
 // cannot prove which function runs and drop the wipe before free().
 void sqisign_secure_free(void *mem, size_t size) {
    typedef void *(*memset_t)(void *, int, size_t);
    static volatile memset_t wipe = memset;
    if (mem == NULL) {
        return;
    }
    wipe(mem, 0, size);
    free(mem);
 }
 // Zeroes size bytes at mem without freeing it. A NULL mem or a zero size is
 // ignored, matching the NULL handling of sqisign_secure_free and never
 // passing a null pointer to memset.
 // memset is called through a volatile function pointer so the compiler
 // cannot prove which function runs and drop the wipe.
 void sqisign_secure_clear(void *mem, size_t size) {
    typedef void *(*memset_t)(void *, int, size_t);
    static volatile memset_t memset_func = memset;
    if (mem == NULL || size == 0) {
        return;
    }
    memset_func(mem, 0, size);
 }
--- a/src/common/generic/randombytes_ctrdrbg.c
+++ b/src/common/generic/randombytes_ctrdrbg.c
@@ -0,0 +1,140 @@
 // SPDX-License-Identifier: Apache-2.0 and Unknown
 //
 /*
 NIST-developed software is provided by NIST as a public service. You may use, copy, and distribute copies of the software in any medium, provided that you keep intact this entire notice. You may improve, modify, and create derivative works of the software or any portion of the software, and you may copy and distribute such modifications or works. Modified works should carry a notice stating that you changed the software and should note the date and nature of any such change. Please explicitly acknowledge the National Institute of Standards and Technology as the source of the software.
 NIST-developed software is expressly provided "AS IS." NIST MAKES NO WARRANTY OF ANY KIND, EXPRESS, IMPLIED, IN FACT, OR ARISING BY OPERATION OF LAW, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND DATA ACCURACY. NIST NEITHER REPRESENTS NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST DOES NOT WARRANT OR MAKE ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE OR THE RESULTS THEREOF, INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY, RELIABILITY, OR USEFULNESS OF THE SOFTWARE.
 You are solely responsible for determining the appropriateness of using and distributing the software and you assume all risks associated with its use, including but not limited to the risks and costs of program errors, compliance with applicable laws, damage to or loss of data, programs or equipment, and the unavailability or interruption of operation. This software is not intended to be used in any situation where a failure could cause risk of injury or damage to property. The software developed by NIST employees is not subject to copyright protection within the United States.
 */
 #include <string.h>
 #include <aes.h>
 #ifdef ENABLE_CT_TESTING
 #include <valgrind/memcheck.h>
 #endif
 // Status codes of the NIST reference RNG interface. None of them is
 // referenced by the functions in this file; randombytes_nist returns a
 // literal 0.
 #define RNG_SUCCESS      0
 #define RNG_BAD_MAXLEN  -1
 #define RNG_BAD_OUTBUF  -2
 #define RNG_BAD_REQ_LEN -3
 // Adapter to the (key, counter block, output) argument order used by the
 // DRBG code; AES_ECB_encrypt takes the input block first and the key second.
 static __inline void AES256_ECB(unsigned char *key, unsigned char *ctr, unsigned char *buffer) {
    AES_ECB_encrypt(ctr, key, buffer);
 }
 // State layout for an AES-based XOF; not used by any function in this file.
 typedef struct {
    unsigned char   buffer[16];
    int             buffer_pos;
    unsigned long   length_remaining;
    unsigned char   key[32];
    unsigned char   ctr[16];
 } AES_XOF_struct;
 // State of the AES-256 CTR DRBG.
 typedef struct {
    // Current AES-256 key.
    unsigned char   Key[32];
    // Counter block, incremented as a big-endian integer before each AES call.
    unsigned char   V[16];
    // Set to 1 on initialisation and incremented after every randombytes_nist call.
    int             reseed_counter;
 } AES256_CTR_DRBG_struct;
 // Derives a new Key and V from the current ones, optionally mixing in
 // 48 bytes of provided_data (may be NULL).
 void
 AES256_CTR_DRBG_Update(unsigned char *provided_data,
                       unsigned char *Key,
                       unsigned char *V);
 // Single process-wide DRBG state shared by all functions in this file.
 AES256_CTR_DRBG_struct  DRBG_ctx;
 // Seeds the global DRBG from 48 bytes of entropy_input, XORed with 48 bytes
 // of personalization_string when that pointer is non-NULL.
 // security_strength is accepted for interface compatibility and ignored.
 static void
 randombytes_init_nist(unsigned char *entropy_input,
                      unsigned char *personalization_string,
                      int security_strength) {
    unsigned char   seed_material[48];
    int             idx;
    (void)security_strength;  // Unused parameter
    for (idx = 0; idx < 48; idx++) {
        seed_material[idx] = entropy_input[idx];
        if (personalization_string) {
            seed_material[idx] ^= personalization_string[idx];
        }
    }
    // Start from an all-zero key and counter, then absorb the seed.
    memset(DRBG_ctx.Key, 0x00, sizeof(DRBG_ctx.Key));
    memset(DRBG_ctx.V, 0x00, sizeof(DRBG_ctx.V));
    AES256_CTR_DRBG_Update(seed_material, DRBG_ctx.Key, DRBG_ctx.V);
    DRBG_ctx.reseed_counter = 1;
 }
 // Fills x with xlen bytes from the global DRBG. Each 16-byte output block is
 // AES-256(Key, V) after incrementing V; the state is then refreshed with
 // AES256_CTR_DRBG_Update. Always returns 0.
 static int
 randombytes_nist(unsigned char *x, size_t xlen) {
    unsigned char   block[16];
    size_t          done = 0;
    while ( done < xlen ) {
        size_t chunk = xlen - done;
        int    j = 15;
        if ( chunk > 16 ) {
            chunk = 16;
        }
        //increment V
        while ( j >= 0 && DRBG_ctx.V[j] == 0xff ) {
            DRBG_ctx.V[j] = 0x00;
            j--;
        }
        if ( j >= 0 ) {
            DRBG_ctx.V[j]++;
        }
        AES256_ECB(DRBG_ctx.Key, DRBG_ctx.V, block);
        memcpy(x + done, block, chunk);
        done += chunk;
    }
    AES256_CTR_DRBG_Update(NULL, DRBG_ctx.Key, DRBG_ctx.V);
    DRBG_ctx.reseed_counter++;
    return 0;
 }
 // Replaces Key (32 bytes) and V (16 bytes) with 48 bytes of fresh AES-256
 // output: V is incremented and encrypted three times, the result is XORed
 // with provided_data when it is non-NULL, and then split into the new Key
 // and the new V.
 void
 AES256_CTR_DRBG_Update(unsigned char *provided_data,
                       unsigned char *Key,
                       unsigned char *V) {
    unsigned char   temp[48];
    int             blk, idx;
    for (blk = 0; blk < 3; blk++) {
        int j = 15;
        //increment V
        while ( j >= 0 && V[j] == 0xff ) {
            V[j] = 0x00;
            j--;
        }
        if ( j >= 0 ) {
            V[j]++;
        }
        AES256_ECB(Key, V, &temp[16 * blk]);
    }
    if ( provided_data != NULL ) {
        for (idx = 0; idx < 48; idx++) {
            temp[idx] ^= provided_data[idx];
        }
    }
    memcpy(Key, temp, 32);
    memcpy(V, &temp[32], 16);
 }
 // Fills random_array with nbytes bytes from the deterministic DRBG and
 // returns the status from randombytes_nist (0).
 // With ENABLE_CT_TESTING the generated bytes are marked as undefined for
 // Valgrind. The length passed must be nbytes: the status code is 0, so
 // using it as the length would mark nothing.
 int randombytes(unsigned char *random_array, unsigned long long nbytes) {
    int ret = randombytes_nist(random_array, nbytes);
 #ifdef ENABLE_CT_TESTING
    VALGRIND_MAKE_MEM_UNDEFINED(random_array, nbytes);
 #endif
    return ret;
 }
 // Public entry point for seeding the DRBG; forwards to
 // randombytes_init_nist. The call is a plain statement because a void
 // function may not return an expression in standard C.
 void
 randombytes_init(unsigned char *entropy_input,
                 unsigned char *personalization_string,
                 int security_strength) {
    randombytes_init_nist(entropy_input, personalization_string, security_strength);
 }
--- a/src/common/generic/randombytes_system.c
+++ b/src/common/generic/randombytes_system.c
@@ -0,0 +1,396 @@
 // SPDX-License-Identifier: MIT
 /*
 The MIT License
 Copyright (c) 2017 Daan Sprenkels <hello@dsprenkels.com>
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 #ifdef ENABLE_CT_TESTING
 #include <valgrind/memcheck.h>
 #endif
 // In the case that we are compiling on linux, we need to define _GNU_SOURCE
 // *before* randombytes.h is included. Otherwise SYS_getrandom will not be
 // declared.
 #if defined(__linux__) || defined(__GNU__)
 # define _GNU_SOURCE
 #endif /* defined(__linux__) || defined(__GNU__) */
 #if defined(_WIN32)
 /* Windows */
 # include <windows.h>
 # include <wincrypt.h> /* CryptAcquireContext, CryptGenRandom */
 #endif /* defined(_WIN32) */
 /* wasi */
 #if defined(__wasi__)
 #include <stdlib.h>
 #endif
 /* kFreeBSD */
 #if defined(__FreeBSD_kernel__) && defined(__GLIBC__)
 # define GNU_KFREEBSD
 #endif
 #if defined(__linux__) || defined(__GNU__) || defined(GNU_KFREEBSD)
 /* Linux */
 // We would need to include <linux/random.h>, but not every target has access
 // to the linux headers. We only need RNDGETENTCNT, so we instead inline it.
 // RNDGETENTCNT is originally defined in `include/uapi/linux/random.h` in the
 // linux repo.
 # define RNDGETENTCNT 0x80045200
 # include <assert.h>
 # include <errno.h>
 # include <fcntl.h>
 # include <poll.h>
 # include <stdint.h>
 # include <stdio.h>
 # include <sys/ioctl.h>
 # if (defined(__linux__) || defined(__GNU__)) && defined(__GLIBC__) && ((__GLIBC__ > 2) || (__GLIBC_MINOR__ > 24))
 #  define USE_GLIBC
 #  include <sys/random.h>
 # endif /* (defined(__linux__) || defined(__GNU__)) && defined(__GLIBC__) && ((__GLIBC__ > 2) || (__GLIBC_MINOR__ > 24)) */
 # include <sys/stat.h>
 # include <sys/syscall.h>
 # include <sys/types.h>
 # include <unistd.h>
 // We need SSIZE_MAX as the maximum read len from /dev/urandom
 # if !defined(SSIZE_MAX)
 #  define SSIZE_MAX (SIZE_MAX / 2 - 1)
 # endif /* defined(SSIZE_MAX) */
 #endif /* defined(__linux__) || defined(__GNU__) || defined(GNU_KFREEBSD) */
 #if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))
 /* Dragonfly, FreeBSD, NetBSD, OpenBSD (has arc4random) */
 # include <sys/param.h>
 # if defined(BSD)
 #  include <stdlib.h>
 # endif
 /* GNU/Hurd defines BSD in sys/param.h which causes problems later */
 # if defined(__GNU__)
 #  undef BSD
 # endif
 #endif
 #if defined(__EMSCRIPTEN__)
 # include <assert.h>
 # include <emscripten.h>
 # include <errno.h>
 # include <stdbool.h>
 #endif /* defined(__EMSCRIPTEN__) */
 #if defined(_WIN32)
 /*
  * Fill buf[0 .. n) using the Windows CryptoAPI.
  *
  * A verification-only provider context is acquired, CryptGenRandom is called
  * in chunks of at most 0xFFFFFFFF bytes (its length parameter is a DWORD),
  * and the context is released again.
  *
  * Returns 0 on success, -1 if acquiring the context, generating bytes or
  * releasing the context fails. The context is released on every path once
  * it has been acquired.
  */
 static int randombytes_win32_randombytes(void* buf, size_t n)
 {
 	HCRYPTPROV ctx;
 	BOOL tmp;
 	DWORD to_read = 0;
 	const size_t MAX_DWORD = 0xFFFFFFFF;
 	tmp = CryptAcquireContext(&ctx, NULL, NULL, PROV_RSA_FULL,
 	                          CRYPT_VERIFYCONTEXT);
 	if (tmp == FALSE) return -1;
 	while (n > 0) {
 		to_read = (DWORD)(n < MAX_DWORD ? n : MAX_DWORD);
 		tmp = CryptGenRandom(ctx, to_read, (BYTE*) buf);
 		if (tmp == FALSE) {
 			/* Do not leak the provider handle on failure. */
 			CryptReleaseContext(ctx, 0);
 			return -1;
 		}
 		buf = ((char*)buf) + to_read;
 		n -= to_read;
 	}
 	tmp = CryptReleaseContext(ctx, 0);
 	if (tmp == FALSE) return -1;
 	return 0;
 }
 #endif /* defined(_WIN32) */
 #if defined(__wasi__)
 /* WASI backend: delegate to arc4random_buf(); always reports success (0). */
 static int randombytes_wasi_randombytes(void *buf, size_t n)
 {
 	const int status = 0;

 	arc4random_buf(buf, n);
 	return status;
 }
 #endif /* defined(__wasi__) */
 #if (defined(__linux__) || defined(__GNU__)) && (defined(USE_GLIBC) || defined(SYS_getrandom))
 # if defined(USE_GLIBC)
 // getrandom is declared in glibc.
 # elif defined(SYS_getrandom)
 /* Local getrandom() shim: issues the raw SYS_getrandom system call and
  * hands its result back to the caller unchanged. */
 static ssize_t getrandom(void *buf, size_t buflen, unsigned int flags)
 {
 	const long result = syscall(SYS_getrandom, buf, buflen, flags);

 	return result;
 }
 # endif
 /*
  * Fill buf[0 .. n) via getrandom().
  *
  * Requests are capped at 33554431 bytes each, calls interrupted by a signal
  * (EINTR) are retried, and short reads simply advance the cursor.
  *
  * Returns 0 on success, or the negative getrandom() result on failure.
  */
 static int randombytes_linux_randombytes_getrandom(void *buf, size_t n)
 {
 	const size_t max_request = 33554431; /* largest chunk getrandom accepts */
 	char *cursor = (char *)buf;
 	int ret;

 	while (n > 0) {
 		size_t want = n;
 		if (want > max_request) {
 			want = max_request;
 		}
 		for (;;) {
 			ret = getrandom(cursor, want, 0);
 			if (!(ret == -1 && errno == EINTR)) {
 				break;
 			}
 		}
 		if (ret < 0) {
 			return ret;
 		}
 		cursor += ret;
 		n -= ret;
 	}
 	assert(n == 0);
 	return 0;
 }
 #endif /* (defined(__linux__) || defined(__GNU__)) && (defined(USE_GLIBC) || defined(SYS_getrandom)) */
 #if (defined(__linux__) || defined(GNU_KFREEBSD)) && !defined(SYS_getrandom)
 # if defined(__linux__)
 /* Issue the RNDGETENTCNT ioctl on `device`, storing the answer in *entropy.
  * The ioctl() return value is passed through unchanged. */
 static int randombytes_linux_read_entropy_ioctl(int device, int *entropy)
 {
 	const int status = ioctl(device, RNDGETENTCNT, entropy);

 	return status;
 }
 /*
  * Read a decimal integer from the start of `stream` into *entropy.
  *
  * The stream is rewound before every attempt, and the read is retried only
  * when the attempt itself was interrupted (EINTR).
  *
  * Returns 0 when exactly one integer was parsed, -1 otherwise.
  */
 static int randombytes_linux_read_entropy_proc(FILE *stream, int *entropy)
 {
 	int retcode;
 	do {
 		rewind(stream);
 		/* Clear errno so that a stale EINTR left by an earlier call cannot
 		 * make a genuine parse failure loop forever. */
 		errno = 0;
 		retcode = fscanf(stream, "%d", entropy);
 	} while (retcode != 1 && errno == EINTR);
 	if (retcode != 1) {
 		return -1;
 	}
 	return 0;
 }
 /*
  * Block until the kernel reports at least 128 bits of entropy.
  *
  * device: descriptor on which the RNDGETENTCNT ioctl is issued.
  *
  * The entropy estimate is read through the ioctl; if that fails with ENOTTY
  * or ENOSYS the function falls back to parsing
  * /proc/sys/kernel/random/entropy_avail. While the estimate is too low it
  * polls /dev/random for readability and re-reads the estimate.
  *
  * Returns 0 on success and a non-zero value on failure. Every descriptor or
  * stream opened here is closed again on all paths.
  */
 static int randombytes_linux_wait_for_entropy(int device)
 {
 	/* We will block on /dev/random, because any increase in the OS' entropy
 	 * level will unblock the request. I use poll here (as does libsodium),
 	 * because we don't *actually* want to read from the device. */
 	enum { IOCTL, PROC } strategy = IOCTL;
 	const int bits = 128;
 	struct pollfd pfd;
 	int fd;
 	FILE *proc_file = NULL;
 	int retcode, retcode_error = 0; // Used as return codes throughout this function
 	int entropy = 0;
 	/* If the device has enough entropy already, we will want to return early */
 	retcode = randombytes_linux_read_entropy_ioctl(device, &entropy);
 	if (retcode != 0 && (errno == ENOTTY || errno == ENOSYS)) {
 		// The ioctl call on the device has failed due to a
 		//   - ENOTTY (unsupported action), or
 		//   - ENOSYS (invalid ioctl; this happens on MIPS, see #22).
 		//
 		// We will fall back to reading from
 		// `/proc/sys/kernel/random/entropy_avail`.  This is less ideal,
 		// because it allocates a file descriptor, and it may not work
 		// in a chroot.  But at this point it seems we have no better
 		// options left.
 		strategy = PROC;
 		// Open the entropy count file
 		proc_file = fopen("/proc/sys/kernel/random/entropy_avail", "r");
 		if (proc_file == NULL) {
 			return -1;
 		}
 	} else if (retcode != 0) {
 		// Unrecoverable ioctl error
 		return -1;
 	}
 	if (entropy >= bits) {
 		if (proc_file != NULL) {
 			fclose(proc_file);
 		}
 		return 0;
 	}
 	do {
 		fd = open("/dev/random", O_RDONLY);
 	} while (fd == -1 && errno == EINTR); /* EAGAIN will not occur */
 	if (fd == -1) {
 		/* Unrecoverable IO error; release the fallback stream if we own one */
 		if (proc_file != NULL) {
 			fclose(proc_file);
 		}
 		return -1;
 	}
 	pfd.fd = fd;
 	pfd.events = POLLIN;
 	for (;;) {
 		retcode = poll(&pfd, 1, -1);
 		if (retcode == -1 && (errno == EINTR || errno == EAGAIN)) {
 			continue;
 		} else if (retcode == 1) {
 			if (strategy == IOCTL) {
 				retcode = randombytes_linux_read_entropy_ioctl(device, &entropy);
 			} else {
 				/* strategy == PROC */
 				retcode = randombytes_linux_read_entropy_proc(proc_file, &entropy);
 			}
 			if (retcode != 0) {
 				// Unrecoverable I/O error
 				retcode_error = retcode;
 				break;
 			}
 			if (entropy >= bits) {
 				break;
 			}
 		} else {
 			// Unreachable: poll() should only return -1 or 1
 			retcode_error = -1;
 			break;
 		}
 	}
 	/* Common cleanup: runs for both the success and the error exits above. */
 	do {
 		retcode = close(fd);
 	} while (retcode == -1 && errno == EINTR);
 	if (strategy == PROC) {
 		do {
 			retcode = fclose(proc_file);
 		} while (retcode == -1 && errno == EINTR);
 	}
 	if (retcode_error != 0) {
 		return retcode_error;
 	}
 	return retcode;
 }
 # endif /* defined(__linux__) */
 /*
  * Fill buf[0 .. n) by reading /dev/urandom.
  *
  * On Linux the function first waits (randombytes_linux_wait_for_entropy)
  * until the kernel pool is sufficiently seeded. Reads are issued in chunks of
  * at most SSIZE_MAX bytes; EAGAIN/EINTR are retried and short reads advance
  * the offset.
  *
  * Returns 0 on success, -1 on any unrecoverable error. The descriptor is
  * closed on every path once it has been opened.
  */
 static int randombytes_linux_randombytes_urandom(void *buf, size_t n)
 {
 	int fd;
 	size_t offset = 0, count;
 	ssize_t tmp;
 	do {
 		fd = open("/dev/urandom", O_RDONLY);
 	} while (fd == -1 && errno == EINTR);
 	if (fd == -1) return -1;
 # if defined(__linux__)
 	if (randombytes_linux_wait_for_entropy(fd) == -1) {
 		close(fd);
 		return -1;
 	}
 # endif
 	while (n > 0) {
 		count = n <= SSIZE_MAX ? n : SSIZE_MAX;
 		tmp = read(fd, (char *)buf + offset, count);
 		if (tmp == -1 && (errno == EAGAIN || errno == EINTR)) {
 			continue;
 		}
 		if (tmp <= 0) {
 			/* -1: unrecoverable IO error; 0: unexpected end of file, which
 			 * would otherwise make this loop spin forever. */
 			close(fd);
 			return -1;
 		}
 		offset += tmp;
 		n -= tmp;
 	}
 	close(fd);
 	assert(n == 0);
 	return 0;
 }
 #endif /* defined(__linux__) && !defined(SYS_getrandom) */
 #if defined(BSD)
 /* BSD backend: delegate to arc4random_buf(); always reports success (0). */
 static int randombytes_bsd_randombytes(void *buf, size_t n)
 {
 	const int status = 0;

 	arc4random_buf(buf, n);
 	return status;
 }
 #endif /* defined(BSD) */
 #if defined(__EMSCRIPTEN__)
 /*
  * Emscripten backend: fill buf[0 .. n) through the JavaScript `crypto`
  * module.
  *
  * The inline JavaScript returns 0 on success, -1 when randomBytes() or the
  * copy into memory throws, and -2 when the `crypto` module cannot be loaded.
  *
  * Returns 0 on success; otherwise -1 with errno set to EINVAL (generation
  * failed) or ENOSYS (module unavailable).
  */
 static int randombytes_js_randombytes_nodejs(void *buf, size_t n) {
 	const int ret = EM_ASM_INT({
 		var crypto;
 		try {
 			crypto = require('crypto');
 		} catch (error) {
 			return -2;
 		}
 		try {
 			writeArrayToMemory(crypto.randomBytes($1), $0);
 			return 0;
 		} catch (error) {
 			return -1;
 		}
 	}, buf, n);
 	switch (ret) {
 	case 0:
 		return 0;
 	case -1:
 		errno = EINVAL;
 		return -1;
 	case -2:
 		errno = ENOSYS;
 		return -1;
 	}
 	assert(false); // Unreachable
 	/* With NDEBUG the assert compiles away; fail closed instead of falling
 	 * off the end of a non-void function. */
 	return -1;
 }
 #endif /* defined(__EMSCRIPTEN__) */
 /*
  * Compile-time dispatcher: forwards (buf, n) to the one backend selected by
  * the platform macros and returns that backend's result. Platforms matching
  * none of the branches fail to compile.
  */
 static int randombytes_select(void *buf, size_t n)
 {
 #if defined(__EMSCRIPTEN__)
 	/* JavaScript crypto module */
 	return randombytes_js_randombytes_nodejs(buf, n);
 #elif defined(__linux__) || defined(__GNU__) || defined(GNU_KFREEBSD)
 # if defined(USE_GLIBC) || defined(SYS_getrandom)
 	/* getrandom(), either from glibc or through the raw system call */
 	return randombytes_linux_randombytes_getrandom(buf, n);
 # else
 	/* No getrandom available: read from /dev/urandom */
 	return randombytes_linux_randombytes_urandom(buf, n);
 # endif
 #elif defined(BSD)
 	/* arc4random_buf() */
 	return randombytes_bsd_randombytes(buf, n);
 #elif defined(_WIN32)
 	/* Windows CryptoAPI */
 	return randombytes_win32_randombytes(buf, n);
 #elif defined(__wasi__)
 	/* WASI */
 	return randombytes_wasi_randombytes(buf, n);
 #else
 # error "randombytes(...) is not supported on this platform"
 #endif
 }
 /*
  * Public entry point: fill x[0 .. xlen) from the platform backend chosen by
  * randombytes_select() and return its status.
  *
  * Under ENABLE_CT_TESTING the whole output buffer is additionally marked as
  * undefined for Valgrind/Memcheck.
  */
 int randombytes(unsigned char *x, unsigned long long xlen) {
    const int status = randombytes_select(x, (size_t) xlen);

 #ifdef ENABLE_CT_TESTING
    VALGRIND_MAKE_MEM_UNDEFINED(x, xlen);
 #endif

    return status;
 }
 /*
  * No-op counterpart of the DRBG seeding routine: this implementation of
  * randombytes() never reads the arguments, so all three are deliberately
  * ignored (the casts only silence unused-parameter warnings).
  */
 void randombytes_init(unsigned char *entropy_input,
                      unsigned char *personalization_string,
                      int security_strength) {
    (void) security_strength;
    (void) personalization_string;
    (void) entropy_input;
 }
--- a/src/ec/CMakeLists.txt
+++ b/src/ec/CMakeLists.txt
@@ -0,0 +1 @@
 # Delegate to the CMake script whose path is held in SELECT_IMPL_TYPE.
 # The variable is not set in this file; it must be defined by an enclosing
 # scope before this directory is processed.
 include(${SELECT_IMPL_TYPE})
--- a/src/ec/ref/CMakeLists.txt
+++ b/src/ec/ref/CMakeLists.txt
@@ -0,0 +1,3 @@
 # Location of the ecx sources, relative to this directory.
 set(ECX_DIR ${CMAKE_CURRENT_SOURCE_DIR}/ecx)
 # Delegate to the CMake script whose path is held in SELECT_SQISIGN_VARIANT
 # (defined outside this file); ECX_DIR is set first so that script can use it.
 include(${SELECT_SQISIGN_VARIANT})
--- a/src/ec/ref/ecx/basis.c
+++ b/src/ec/ref/ecx/basis.c
@@ -0,0 +1,508 @@
 #include "isog.h"
 /*
  * x-only point tripling in projective (X:Z) coordinates.
  *
  * Q  : output point.
  * P  : input point (X:Z).
  * A3 : curve constant in the form (A+2C : A-2C), read as A3->x and A3->z.
  *
  * All reads of P happen before the first write to Q, so Q may alias P
  * (the callers in this file invoke it as xTPL(&T, &T, &A3)).
  */
 static void xTPL(ec_point_t* Q, const ec_point_t* P, const ec_point_t* A3)
 {
    /* ----------------------------------------------------------------------------- *
     * Differential point tripling given the montgomery coefficient A3 = (A+2C:A-2C)
     * ----------------------------------------------------------------------------- */
    fp2_t t0, t1, t2, t3, t4;
    fp2_sub(&t0, &P->x, &P->z);      // t0 = X - Z
    fp2_sqr(&t2, &t0);               // t2 = (X - Z)^2
    fp2_add(&t1, &P->x, &P->z);      // t1 = X + Z
    fp2_sqr(&t3, &t1);               // t3 = (X + Z)^2
    fp2_add(&t4, &t1, &t0);          // t4 = 2X
    fp2_sub(&t0, &t1, &t0);          // t0 = 2Z
    fp2_sqr(&t1, &t4);               // t1 = 4X^2
    fp2_sub(&t1, &t1, &t3);          // t1 = 4X^2 - (X+Z)^2
    fp2_sub(&t1, &t1, &t2);          // t1 = 2(X^2 - Z^2)
    fp2_mul(&Q->x, &t3, &A3->x);     // Q.x = A3.x * (X+Z)^2   (used as scratch)
    fp2_mul(&t3, &Q->x, &t3);        // t3 = A3.x * (X+Z)^4
    fp2_mul(&Q->z, &t2, &A3->z);     // Q.z = A3.z * (X-Z)^2   (used as scratch)
    fp2_mul(&t2, &t2, &Q->z);        // t2 = A3.z * (X-Z)^4
    fp2_sub(&t3, &t2, &t3);          // t3 = A3.z*(X-Z)^4 - A3.x*(X+Z)^4
    fp2_sub(&t2, &Q->x, &Q->z);      // t2 = A3.x*(X+Z)^2 - A3.z*(X-Z)^2
    fp2_mul(&t1, &t2, &t1);          // t1 = 2(X^2 - Z^2) * t2
    fp2_add(&t2, &t3, &t1);          // t2 = t3 + t1
    fp2_sqr(&t2, &t2);               // t2 = (t3 + t1)^2
    fp2_mul(&Q->x, &t2, &t4);        // Q.x = 2X * (t3 + t1)^2
    fp2_sub(&t1, &t3, &t1);          // t1 = t3 - t1
    fp2_sqr(&t1, &t1);               // t1 = (t3 - t1)^2
    fp2_mul(&Q->z, &t1, &t0);        // Q.z = 2Z * (t3 - t1)^2
 }
 /*
  * Decide whether the projective x-coordinate P = (x:z) lifts to a point on
  * the curve given by (A:C).
  *
  * Evaluates x*z*(C^2*x^2 + A*C*x*z + C^2*z^2) and returns the result of
  * fp2_is_square() on that value.
  */
 int ec_is_on_curve(const ec_curve_t* curve, const ec_point_t* P){
    fp2_t cx, cz, mid;

    fp2_mul(&cx, &curve->C, &P->x);     // C*x
    fp2_mul(&cz, &curve->C, &P->z);     // C*z
    fp2_mul(&mid, &cx, &P->z);          // C*x*z
    fp2_mul(&mid, &mid, &curve->A);     // A*C*x*z
    fp2_sqr(&cx, &cx);                  // C^2*x^2
    fp2_sqr(&cz, &cz);                  // C^2*z^2

    // Sum the three terms of the quadratic factor
    fp2_add(&cx, &cx, &mid);
    fp2_add(&cx, &cx, &cz);

    // Multiply by x*z
    fp2_mul(&cx, &cx, &P->x);
    fp2_mul(&cx, &cx, &P->z);

    return fp2_is_square(&cx);
 }
 /*
  * Compute a deterministic x-only representative of P - Q.
  *
  * PQ    : output; PQ->z receives (xP - xQ)^2 and PQ->x receives
  *         sqrt(B^2 - (xP - xQ)^2 * (xP*xQ - 1)^2) + B,
  *         where B = (xP + xQ)*(xP*xQ + 1) + 2*A*xP*xQ.
  * P, Q  : input points; only their x fields are read, so they must already
  *         be normalized to z = 1.
  * curve : only curve->A is read, so the curve must be normalized to C = 1.
  *
  * Which of the two square roots is used is whatever fp2_sqrt() returns.
  */
 static void difference_point(ec_point_t* PQ, const ec_point_t* P, const ec_point_t* Q, const ec_curve_t* curve){
    // Given P,Q in affine x-only, computes a deterministic choice for (P-Q)
    // The points must be normalized to z=1 and the curve to C=1
    fp2_t t0, t1, t2, t3;
    fp2_sub(&PQ->z, &P->x, &Q->x);  // P - Q
    fp2_mul(&t2, &P->x, &Q->x);     // P*Q
    // t1 = 1 (Montgomery form), used as the constant in P*Q -/+ 1 below
    fp_mont_setone(t1.re);
    fp_set(t1.im, 0);
    fp2_sub(&t3, &t2, &t1);         // P*Q-1
    fp2_mul(&t0, &PQ->z, &t3);      // (P-Q)*(P*Q-1)
    fp2_sqr(&PQ->z, &PQ->z);        // (P-Q)^2
    fp2_sqr(&t0, &t0);              // (P-Q)^2*(P*Q-1)^2
    fp2_add(&t1, &t2, &t1);         // P*Q+1
    fp2_add(&t3, &P->x, &Q->x);     // P+Q
    fp2_mul(&t1, &t1, &t3);         // (P+Q)*(P*Q+1)
    fp2_mul(&t2, &t2, &curve->A);   // A*P*Q
    fp2_add(&t2, &t2, &t2);         // 2*A*P*Q
    fp2_add(&t1, &t1, &t2);         // (P+Q)*(P*Q+1) + 2*A*P*Q
    fp2_sqr(&t2, &t1);              // ((P+Q)*(P*Q+1) + 2*A*P*Q)^2
    fp2_sub(&t0, &t2, &t0);         // ((P+Q)*(P*Q+1) + 2*A*P*Q)^2 - (P-Q)^2*(P*Q-1)^2
    fp2_sqrt(&t0);                  // in-place square root of the discriminant
    fp2_add(&PQ->x, &t0, &t1);      // numerator: sqrt(...) + ((P+Q)*(P*Q+1) + 2*A*P*Q)
 }
 /*
  * Deterministically build a pair (P, Q) of points of 2-power order on
  * `curve`, together with P - Q, and store them in PQ2.
  *
  * Candidates are the x-coordinates x = 1 + k*i for k = 1, 2, ...; the search
  * for Q continues from the candidate after the one that produced P. Each
  * candidate must lift to the curve, and after multiplication by
  * p_cofactor_for_2f it must still be non-zero after POWER_OF_2 - 1
  * doublings. Q is additionally rejected when that last multiple equals the
  * corresponding multiple of P.
  *
  * The output points are normalized to z = 1 (Montgomery one).
  */
 void ec_curve_to_basis_2(ec_basis_t *PQ2, const ec_curve_t *curve){
    fp2_t x, t0, t1, t2;
    ec_point_t P, Q, Q2, P2, A24;
    // Curve coefficient in the form A24 = (A+2C:4C)
    fp2_add(&A24.z, &curve->C, &curve->C);
    fp2_add(&A24.x, &curve->A, &A24.z);
    fp2_add(&A24.z, &A24.z, &A24.z);
    // Start the candidate x-coordinate at 1 + 0*i
    fp_mont_setone(x.re);
    fp_set(x.im, 0);
    // Find P
    while(1){
        // Next candidate: add x.re (= 1) to the imaginary part
        fp_add(x.im, x.re, x.im);
        // Check if point is rational: x*(C^2*x^2 + A*C*x + C^2) must be a square
        fp2_sqr(&t0, &curve->C);
        fp2_mul(&t1, &t0, &x);
        fp2_mul(&t2, &curve->A, &curve->C);
        fp2_add(&t1, &t1, &t2);
        fp2_mul(&t1, &t1, &x);
        fp2_add(&t1, &t1, &t0);
        fp2_mul(&t1, &t1, &x);
        if(fp2_is_square(&t1)){
            fp2_copy(&P.x, &x);
            fp_mont_setone(P.z.re);
            fp_set(P.z.im, 0);
        }
        else
            continue;
        // Clear odd factors from the order
        xMULv2(&P, &P, p_cofactor_for_2f, P_COFACTOR_FOR_2F_BITLENGTH, &A24);
        // Check if point has order 2^f: P2 = [2^(POWER_OF_2 - 1)]P must be non-zero
        copy_point(&P2, &P);
        for(int i = 0; i < POWER_OF_2 - 1; i++)
            xDBLv2(&P2, &P2, &A24);
        if(ec_is_zero(&P2))
            continue;
        else
            break;
    }
    // Find Q (the candidate x carries on from where the search for P stopped)
    while(1){
        fp_add(x.im, x.re, x.im);
        // Check if point is rational
        fp2_sqr(&t0, &curve->C);
        fp2_mul(&t1, &t0, &x);
        fp2_mul(&t2, &curve->A, &curve->C);
        fp2_add(&t1, &t1, &t2);
        fp2_mul(&t1, &t1, &x);
        fp2_add(&t1, &t1, &t0);
        fp2_mul(&t1, &t1, &x);
        if(fp2_is_square(&t1)){
            fp2_copy(&Q.x, &x);
            fp_mont_setone(Q.z.re);
            fp_set(Q.z.im, 0);
        }
        else
            continue;
        // Clear odd factors from the order
        xMULv2(&Q, &Q, p_cofactor_for_2f, P_COFACTOR_FOR_2F_BITLENGTH, &A24);
        // Check if point has order 2^f
        copy_point(&Q2, &Q);
        for(int i = 0; i < POWER_OF_2 - 1; i++)
            xDBLv2(&Q2, &Q2, &A24);
        if(ec_is_zero(&Q2))
            continue;
        // Check that Q is independent of P: the multiples P2 and Q2 must differ
        if(is_point_equal(&P2, &Q2))
            continue;
        else
            break;
    }
    // Normalize points
    // A single inversion of P.z*Q.z*C yields P.x/P.z, Q.x/Q.z and A/C
    ec_curve_t E;
    fp2_mul(&t0, &P.z, &Q.z);
    fp2_mul(&t1, &t0, &curve->C);
    fp2_inv(&t1);
    fp2_mul(&P.x, &P.x, &t1);
    fp2_mul(&Q.x, &Q.x, &t1);
    fp2_mul(&E.A, &curve->A, &t1);
    fp2_mul(&P.x, &P.x, &Q.z);
    fp2_mul(&P.x, &P.x, &curve->C);
    fp2_mul(&Q.x, &Q.x, &P.z);
    fp2_mul(&Q.x, &Q.x, &curve->C);
    fp2_mul(&E.A, &E.A, &t0);
    // z-coordinates and E.C all become 1
    fp_mont_setone(P.z.re);
    fp_set(P.z.im, 0);
    fp2_copy(&Q.z, &P.z);
    fp2_copy(&E.C, &P.z);
    // Compute P-Q
    difference_point(&PQ2->PmQ, &P, &Q, &E);
    copy_point(&PQ2->P, &P);
    copy_point(&PQ2->Q, &Q);
 }
 /*
  * Given a point P on `curve`, deterministically find a second point Q and
  * store (P, Q, P - Q) in PQ2.
  *
  * Q is searched among x = 1 + k*i, k = 1, 2, ..., exactly as in
  * ec_curve_to_basis_2: the candidate must lift to the curve, stay non-zero
  * after the cofactor multiplication and POWER_OF_2 - 1 doublings, and that
  * last multiple must differ from the same multiple of P.
  *
  * P itself is not modified; a z = 1 normalized copy is written to PQ2->P.
  * NOTE(review): P is presumably expected to have order 2^POWER_OF_2 — this
  * is not checked here; confirm against callers.
  */
 void ec_complete_basis_2(ec_basis_t* PQ2, const ec_curve_t* curve, const ec_point_t* P){
    fp2_t x, t0, t1, t2;
    ec_point_t Q, Q2, P2, A24;
    // Curve coefficient in the form A24 = (A+2C:4C)
    fp2_add(&A24.z, &curve->C, &curve->C);
    fp2_add(&A24.x, &curve->A, &A24.z);
    fp2_add(&A24.z, &A24.z, &A24.z);
    // P2 = [2^(POWER_OF_2 - 1)]P, used for the independence test below
    copy_point(&P2, P);
    for(int i = 0; i < POWER_OF_2 - 1; i++)
        xDBLv2(&P2, &P2, &A24);
    // Find Q
    fp_mont_setone(x.re);
    fp_set(x.im, 0);
    while(1){
        // Next candidate: add x.re (= 1) to the imaginary part
        fp_add(x.im, x.re, x.im);
        // Check if point is rational: x*(C^2*x^2 + A*C*x + C^2) must be a square
        fp2_sqr(&t0, &curve->C);
        fp2_mul(&t1, &t0, &x);
        fp2_mul(&t2, &curve->A, &curve->C);
        fp2_add(&t1, &t1, &t2);
        fp2_mul(&t1, &t1, &x);
        fp2_add(&t1, &t1, &t0);
        fp2_mul(&t1, &t1, &x);
        if(fp2_is_square(&t1)){
            fp2_copy(&Q.x, &x);
            fp_mont_setone(Q.z.re);
            fp_set(Q.z.im, 0);
        }
        else
            continue;
        // Clear odd factors from the order
        xMULv2(&Q, &Q, p_cofactor_for_2f, (int)P_COFACTOR_FOR_2F_BITLENGTH, &A24);
        // Check if point has order 2^f
        copy_point(&Q2, &Q);
        for(int i = 0; i < POWER_OF_2 - 1; i++)
            xDBLv2(&Q2, &Q2, &A24);
        if(ec_is_zero(&Q2))
            continue;
        // Check that Q is independent of P: the multiples P2 and Q2 must differ
        if(is_point_equal(&P2, &Q2))
            continue;
        else
            break;
    }
    // Normalize points
    // A single inversion of P.z*Q.z*C yields P.x/P.z, Q.x/Q.z and A/C;
    // the normalized P goes into the local copy PP because P is const
    ec_curve_t E;
    ec_point_t PP;
    fp2_mul(&t0, &P->z, &Q.z);
    fp2_mul(&t1, &t0, &curve->C);
    fp2_inv(&t1);
    fp2_mul(&PP.x, &P->x, &t1);
    fp2_mul(&Q.x, &Q.x, &t1);
    fp2_mul(&E.A, &curve->A, &t1);
    fp2_mul(&PP.x, &PP.x, &Q.z);
    fp2_mul(&PP.x, &PP.x, &curve->C);
    fp2_mul(&Q.x, &Q.x, &P->z);
    fp2_mul(&Q.x, &Q.x, &curve->C);
    fp2_mul(&E.A, &E.A, &t0);
    // z-coordinates and E.C all become 1
    fp_mont_setone(PP.z.re);
    fp_set(PP.z.im, 0);
    fp2_copy(&Q.z, &PP.z);
    fp2_copy(&E.C, &PP.z);
    // Compute P-Q
    difference_point(&PQ2->PmQ, &PP, &Q, &E);
    copy_point(&PQ2->P, &PP);
    copy_point(&PQ2->Q, &Q);
 }
 /*
  * Deterministically build a pair (P, Q) of points of 3-power order on
  * `curve`, together with P - Q, and store them in PQ3.
  *
  * Candidates are the x-coordinates x = 1 + k*i for k = 1, 2, ...; the search
  * for Q continues after the candidate that produced P. Each candidate must
  * lift to the curve, and after multiplication by p_cofactor_for_3g it must
  * still be non-zero after POWER_OF_3 - 1 triplings. Q is additionally
  * rejected when that last multiple matches the corresponding multiple of P
  * or its double.
  *
  * The output points are normalized to z = 1 (Montgomery one).
  */
 void ec_curve_to_basis_3(ec_basis_t* PQ3, const ec_curve_t* curve){
    fp2_t x, t0, t1, t2;
    ec_point_t P, Q, Q3, P3, A24, A3;
    // Curve coefficient in the form A24 = (A+2C:4C)
    fp2_add(&A24.z, &curve->C, &curve->C);
    fp2_add(&A24.x, &curve->A, &A24.z);
    fp2_add(&A24.z, &A24.z, &A24.z);
    // Curve coefficient in the form A3 = (A+2C:A-2C)
    fp2_sub(&A3.z, &A24.x, &A24.z);
    fp2_copy(&A3.x, &A24.x);
    // Start the candidate x-coordinate at 1 + 0*i
    fp_mont_setone(x.re);
    fp_set(x.im, 0);
    // Find P
    while(1){
        // Next candidate: add x.re (= 1) to the imaginary part
        fp_add(x.im, x.re, x.im);
        // Check if point is rational: x*(C^2*x^2 + A*C*x + C^2) must be a square
        fp2_sqr(&t0, &curve->C);
        fp2_mul(&t1, &t0, &x);
        fp2_mul(&t2, &curve->A, &curve->C);
        fp2_add(&t1, &t1, &t2);
        fp2_mul(&t1, &t1, &x);
        fp2_add(&t1, &t1, &t0);
        fp2_mul(&t1, &t1, &x);
        if(fp2_is_square(&t1)){
            fp2_copy(&P.x, &x);
            fp_mont_setone(P.z.re);
            fp_set(P.z.im, 0);
        }
        else
            continue;
        // Clear non-3 factors from the order
        xMULv2(&P, &P, p_cofactor_for_3g, (int)P_COFACTOR_FOR_3G_BITLENGTH, &A24);
        // Check if point has order 3^g: P3 = [3^(POWER_OF_3 - 1)]P must be non-zero
        copy_point(&P3, &P);
        for(int i = 0; i < POWER_OF_3 - 1; i++)
            xTPL(&P3, &P3, &A3);
        if(ec_is_zero(&P3))
            continue;
        else
            break;
    }
    // Find Q (the candidate x carries on from where the search for P stopped)
    while(1){
        fp_add(x.im, x.re, x.im);
        // Check if point is rational
        fp2_sqr(&t0, &curve->C);
        fp2_mul(&t1, &t0, &x);
        fp2_mul(&t2, &curve->A, &curve->C);
        fp2_add(&t1, &t1, &t2);
        fp2_mul(&t1, &t1, &x);
        fp2_add(&t1, &t1, &t0);
        fp2_mul(&t1, &t1, &x);
        if(fp2_is_square(&t1)){
            fp2_copy(&Q.x, &x);
            fp_mont_setone(Q.z.re);
            fp_set(Q.z.im, 0);
        }
        else
            continue;
        // Clear non-3 factors from the order
        xMULv2(&Q, &Q, p_cofactor_for_3g, (int)P_COFACTOR_FOR_3G_BITLENGTH, &A24);
        // Check if point has order 3^g
        copy_point(&Q3, &Q);
        for(int i = 0; i < POWER_OF_3 - 1; i++)
            xTPL(&Q3, &Q3, &A3);
        if(ec_is_zero(&Q3))
            continue;
        // Check that Q is independent of P: Q3 must differ from P3 ...
        if(is_point_equal(&P3, &Q3))
            continue;
        // ... and from [2]P3. P3 is doubled in place, so it stays doubled if
        // this candidate is rejected and the loop runs again.
        // NOTE(review): for an order-3 point [2]P3 = -P3 should share P3's
        // x-coordinate, which would make this second test redundant - confirm.
        xDBLv2(&P3, &P3, &A24);
        if(is_point_equal(&P3, &Q3))
            continue;
        else
            break;
    }
    // Normalize points
    // A single inversion of P.z*Q.z*C yields P.x/P.z, Q.x/Q.z and A/C
    ec_curve_t E;
    fp2_mul(&t0, &P.z, &Q.z);
    fp2_mul(&t1, &t0, &curve->C);
    fp2_inv(&t1);
    fp2_mul(&P.x, &P.x, &t1);
    fp2_mul(&Q.x, &Q.x, &t1);
    fp2_mul(&E.A, &curve->A, &t1);
    fp2_mul(&P.x, &P.x, &Q.z);
    fp2_mul(&P.x, &P.x, &curve->C);
    fp2_mul(&Q.x, &Q.x, &P.z);
    fp2_mul(&Q.x, &Q.x, &curve->C);
    fp2_mul(&E.A, &E.A, &t0);
    // z-coordinates and E.C all become 1
    fp_mont_setone(P.z.re);
    fp_set(P.z.im, 0);
    fp2_copy(&Q.z, &P.z);
    fp2_copy(&E.C, &P.z);
    // Compute P-Q
    difference_point(&PQ3->PmQ, &P, &Q, &E);
    copy_point(&PQ3->P, &P);
    copy_point(&PQ3->Q, &Q);
 }
 /*
  * Deterministically build a pair (P, Q) of points of order 2^f * 3^g on
  * `curve`, together with P - Q, and store them in PQ6.
  *
  * Candidates are the x-coordinates x = 1 + k*i for k = 1, 2, ...; the search
  * for Q continues after the candidate that produced P. Each candidate must
  * lift to the curve. After multiplication by p_cofactor_for_6fg, followed by
  * POWER_OF_2 - 1 doublings and POWER_OF_3 - 1 triplings, the resulting point
  * (P6 / Q6) and both its double and its triple must be non-zero. Q is
  * additionally rejected when [3]Q6 equals [3]P6 or [2]Q6 equals [2]P6.
  *
  * The output points are normalized to z = 1 (Montgomery one).
  */
 void ec_curve_to_basis_6(ec_basis_t* PQ6, const ec_curve_t* curve){
    fp2_t x, t0, t1, t2;
    ec_point_t P, Q, Q6, P6, R, T, A24, A3;
    // Curve coefficient in the form A24 = (A+2C:4C)
    fp2_add(&A24.z, &curve->C, &curve->C);
    fp2_add(&A24.x, &curve->A, &A24.z);
    fp2_add(&A24.z, &A24.z, &A24.z);
    // Curve coefficient in the form A3 = (A+2C:A-2C)
    fp2_sub(&A3.z, &A24.x, &A24.z);
    fp2_copy(&A3.x, &A24.x);
    // Start the candidate x-coordinate at 1 + 0*i
    fp_mont_setone(x.re);
    fp_set(x.im, 0);
    // Find P
    while(1){
        // Next candidate: add x.re (= 1) to the imaginary part
        fp_add(x.im, x.re, x.im);
        // Check if point is rational: x*(C^2*x^2 + A*C*x + C^2) must be a square
        fp2_sqr(&t0, &curve->C);
        fp2_mul(&t1, &t0, &x);
        fp2_mul(&t2, &curve->A, &curve->C);
        fp2_add(&t1, &t1, &t2);
        fp2_mul(&t1, &t1, &x);
        fp2_add(&t1, &t1, &t0);
        fp2_mul(&t1, &t1, &x);
        if(fp2_is_square(&t1)){
            fp2_copy(&P.x, &x);
            fp_mont_setone(P.z.re);
            fp_set(P.z.im, 0);
        }
        else
            continue;
        // Clear non-2 factors and non-3 factors from the order
        xMULv2(&P, &P, p_cofactor_for_6fg, (int)P_COFACTOR_FOR_6FG_BITLENGTH, &A24);
        // Check if point has order 2^f*3^g
        // P6 = [2^(POWER_OF_2 - 1) * 3^(POWER_OF_3 - 1)]P
        copy_point(&P6, &P);
        for(int i = 0; i < POWER_OF_2 - 1; i++)
            xDBLv2(&P6, &P6, &A24);
        for(int i = 0; i < POWER_OF_3 - 1; i++)
            xTPL(&P6, &P6, &A3);
        // P6, [2]P6 and [3]P6 must all be non-zero
        if(ec_is_zero(&P6))
            continue;
        xDBLv2(&T, &P6, &A24);
        if (ec_is_zero(&T))
            continue;
        xTPL(&T, &P6, &A3);
        if (ec_is_zero(&T))
            continue;
        break;
    }
    // Find Q (the candidate x carries on from where the search for P stopped)
    while(1){
        fp_add(x.im, x.re, x.im);
        // Check if point is rational
        fp2_sqr(&t0, &curve->C);
        fp2_mul(&t1, &t0, &x);
        fp2_mul(&t2, &curve->A, &curve->C);
        fp2_add(&t1, &t1, &t2);
        fp2_mul(&t1, &t1, &x);
        fp2_add(&t1, &t1, &t0);
        fp2_mul(&t1, &t1, &x);
        if(fp2_is_square(&t1)){
            fp2_copy(&Q.x, &x);
            fp_mont_setone(Q.z.re);
            fp_set(Q.z.im, 0);
        }
        else
            continue;
        // Clear non-6 factors from the order
        xMULv2(&Q, &Q, p_cofactor_for_6fg, (int)P_COFACTOR_FOR_6FG_BITLENGTH, &A24);
        // Check first if point has order 2^f*3^g
        copy_point(&Q6, &Q);
        for(int i = 0; i < POWER_OF_2 - 1; i++)
            xDBLv2(&Q6, &Q6, &A24);
        for(int i = 0; i < POWER_OF_3 - 1; i++)
            xTPL(&Q6, &Q6, &A3);
        // Q6, [2]Q6 and [3]Q6 must all be non-zero
        if(ec_is_zero(&Q6))
            continue;
        xDBLv2(&T, &Q6, &A24);
        if (ec_is_zero(&T))
            continue;
        xTPL(&T, &Q6, &A3);
        if (ec_is_zero(&T))
            continue;
        // Check if point P is independent from point Q
        // Compare the triples [3]P6 and [3]Q6 ...
        xTPL(&R, &P6, &A3);
        xTPL(&T, &Q6, &A3);
        if(is_point_equal(&R, &T))
            continue;
        // ... and the doubles [2]P6 and [2]Q6
        xDBLv2(&R, &P6, &A24);
        xDBLv2(&T, &Q6, &A24);
        if(is_point_equal(&R, &T))
            continue;
        break;
    }
    // Normalize points
    // A single inversion of P.z*Q.z*C yields P.x/P.z, Q.x/Q.z and A/C
    ec_curve_t E;
    fp2_mul(&t0, &P.z, &Q.z);
    fp2_mul(&t1, &t0, &curve->C);
    fp2_inv(&t1);
    fp2_mul(&P.x, &P.x, &t1);
    fp2_mul(&Q.x, &Q.x, &t1);
    fp2_mul(&E.A, &curve->A, &t1);
    fp2_mul(&P.x, &P.x, &Q.z);
    fp2_mul(&P.x, &P.x, &curve->C);
    fp2_mul(&Q.x, &Q.x, &P.z);
    fp2_mul(&Q.x, &Q.x, &curve->C);
    fp2_mul(&E.A, &E.A, &t0);
    // z-coordinates and E.C all become 1
    fp_mont_setone(P.z.re);
    fp_set(P.z.im, 0);
    fp2_copy(&Q.z, &P.z);
    fp2_copy(&E.C, &P.z);
    // Compute P-Q
    difference_point(&PQ6->PmQ, &P, &Q, &E);
    copy_point(&PQ6->P, &P);
    copy_point(&PQ6->Q, &Q);
 }
--- a/src/ec/ref/ecx/ec.c
+++ b/src/ec/ref/ecx/ec.c
--- a/src/ec/ref/ecx/fp2-test.c
+++ b/src/ec/ref/ecx/fp2-test.c
@@ -0,0 +1,90 @@
 #include <assert.h>
 #include <time.h>
 #include <stdio.h>
 #include "../generic/include/fp2_tmp.h"
 /*
  * Self-test for fp2_t arithmetic: runs 1024 rounds of assert-based checks
  * (equality, negation, zero test, multiplication by 0 and 1, inversion,
  * squares are squares) and prints a progress line per round.
  *
  * Returns 0 when every assertion holds; a failing assertion aborts.
  */
 int main()
 {
 	fp2_t fp2_0, fp2_1;
 	// ------------
 	fp2_set0(fp2_0);
 	fp2_set1(fp2_1);
 	// ------------
 	int i;
 	fp2_t a, b, c, d;
 	fp_t e;
 	for (i = 0; i < 1024; i++)
 	{
 		// Progress indicator; "\r\x1b[K" returns to column 0 and clears the line
 		printf("[%3d%%] Testing fp2_t arithmetic", 100 * i / (int)1024);
 		fflush(stdout);
 		printf("\r\x1b[K");
 		// Random elements of fp2
 		fp2_random(a);
 		fp2_random(b);
 		// c and d are perturbed copies of a and b
 		fp2_copy(c, a);
 		c.re[0] += 1;
 		fp2_copy(d, b);
 		d.re[0] -= 1;
 		assert(fp2_isequal(a,b) == 0);		// different values check --> (a != b)
 		assert(fp2_isequal(c,c) == 1);		// equal values check --> 1 (c == c)
 		// Testing neg: -a must equal 0 - a
 		fp2_set0(b);
 		fp2_copy(c, a);
 		fp2_neg(a, a);
 		fp2_sub(c, b, c);
 		assert(fp2_isequal(a,c) == 1);
 		fp2_set1(a);	// Now a == 1
 		fp2_set0(b);	// Now b == 0
 		assert(fp2_is_zero(a) == 0);
 		assert(fp2_is_zero(b) == 1);
 		// testing c - c
 		fp2_sub(d, c, c);
 		assert(fp2_is_zero(d) == 1);
 		// testing c * 0
 		fp2_mul(d, c, b);
 		assert(fp2_is_zero(d) == 1);
 		// testing c * 1 ... recall, in Montgomery domain R mod p plays the role of the 1
 		fp2_set1(a);
 		fp2_mul(d, c, a);
 		assert(fp2_isequal(d, c) == 1);
 		// fp_set(e, 1);	// Now e == 1
 		// fp2_pow(d, e, c);
 		// assert(fp2_isequal(d, c) == 1);
 		// fp_set(e, 0);	// Now e == 0
 		// fp2_pow(d, e, c);
 		// assert(fp2_isone(d) == 1);
 		// fp2_set(a, 1);	// Now e == R mod p
 		// fp_random(e);
 		// fp2_pow(d, e, a);
 		// assert(fp2_isone(d) == 1);
 		// Testing 1/a by computing (1/a) x a
 		fp2_random(a);
 		fp2_copy(b, a);
 		fp2_inv(a);
 		fp2_mul(c, a, b);
 		assert(fp2_isone(c) == 1);
 		// Testing that a^2 is recognised as a square
 		fp2_random(a);
 		fp2_sqr(b, a);
 		assert( fp2_issquare(b) );
 	};
 	printf("[%2d%%] Tested fp2_t arithmetic:\tNo errors!\n", 100 * i / (int)1024);
 	printf("-- All tests passed.\n");
 	return 0;
 }
--- a/src/ec/ref/ecx/isog_chains.c
+++ b/src/ec/ref/ecx/isog_chains.c
@@ -0,0 +1,298 @@
 #include "isog.h"
 #include <assert.h>
 static inline void AC_to_A24(ec_point_t *A24, ec_curve_t const *E)
 {
    /* Turn the projective coefficient pair (A : C) of E into the pair
       (A + 2C : 4C), stored in the x and z fields of A24. */
    fp2_t *num = &A24->x;
    fp2_t *den = &A24->z;
    fp2_add(den, &E->C, &E->C);     /* den = 2C     */
    fp2_add(num, &E->A, den);       /* num = A + 2C */
    fp2_add(den, den, den);         /* den = 4C     */
 }
 static inline void A24_to_AC(ec_curve_t *E, ec_point_t const *A24)
 {
    /* Inverse of AC_to_A24: from (A + 2C : 4C) recover the curve
       coefficients. The result is the scaled pair (4A : 4C), which is
       projectively the same as (A : C). */
    fp2_t *coeffA = &E->A;
    fp2_add(coeffA, &A24->x, &A24->x);      /* 2(A + 2C) = 2A + 4C */
    fp2_sub(coeffA, coeffA, &A24->z);       /* 2A                  */
    fp2_add(coeffA, coeffA, coeffA);        /* 4A                  */
    fp2_copy(&E->C, &A24->z);               /* 4C                  */
 }
 void ec_eval_even(ec_curve_t* image, const ec_isog_even_t* phi,
    ec_point_t* points, unsigned short length){
        // Pushes `length` points through the even-degree isogeny described by
        // phi and writes the codomain curve to `image`. The first degree-4
        // step is handled here, switching to the *_singular routines when
        // doubling the order-4 kernel point yields x == 0; the rest of the
        // chain is delegated to ec_eval_even_strategy.
        ec_point_t ker4, ker2, A24;
        AC_to_A24(&A24, &phi->curve);
        copy_point(&ker4, &phi->kernel);
        // phi->length - 2 doublings of the kernel generator
        for(int step = 0; step < phi->length - 2; step++)
            xDBLv2(&ker4, &ker4, &A24);
        // One more doubling, used to detect the singular case
        xDBLv2(&ker2, &ker4, &A24);
        if(!fp2_is_zero(&ker2.x)){
            xisog_4(&A24, ker4);
            xeval_4(points, points, length);
            xeval_4(&ker2, &phi->kernel, 1);
        }
        else{
            xisog_4_singular(&A24, ker4, A24);
            xeval_4_singular(points, points, length, ker4);
            xeval_4_singular(&ker2, &phi->kernel, 1, ker4);
        }
        // ker2 now holds the image of the original kernel generator
        ec_eval_even_strategy(image, points, length, &A24, &ker2, phi->length-2);
    }
 void ec_eval_even_nonzero(ec_curve_t* image, const ec_isog_even_t* phi,
    ec_point_t* points, unsigned short length){
        // Same as ec_eval_even but without the singular-case test: the first
        // degree-4 step always uses xisog_4 / xeval_4.
        ec_point_t ker, A24;
        AC_to_A24(&A24, &phi->curve);
        copy_point(&ker, &phi->kernel);
        // phi->length - 2 doublings of the kernel generator
        for(int step = 0; step < phi->length - 2; step++)
            xDBLv2(&ker, &ker, &A24);
        xisog_4(&A24, ker);
        xeval_4(points, points, length);
        // ker is reused to hold the image of the original kernel generator
        xeval_4(&ker, &phi->kernel, 1);
        ec_eval_even_strategy(image, points, length, &A24, &ker, phi->length-2);
    }
 // Evaluates a chain of 4-isogenies (preceded by a single 2-isogeny when
 // isog_len is odd) whose kernel is generated by `kernel`, walking the
 // STRATEGY4 table to decide how many doublings to perform between steps.
 //
 //   image      : output, codomain curve in (A:C) form (via A24_to_AC)
 //   points     : in/out, points_len points pushed through every step
 //   A24        : in/out, curve constant updated by each xisog_* call
 //   kernel     : kernel generator on the starting curve
 //   isog_len   : chain length; asserted to equal POWER_OF_2-2
 static void ec_eval_even_strategy(ec_curve_t* image, ec_point_t* points, unsigned short points_len,
    ec_point_t* A24, const ec_point_t *kernel, const int isog_len){
    assert(isog_len == POWER_OF_2-2);
    // NOTE(review): t0 and K2 below are declared but never used.
    uint8_t log2_of_e, tmp;
    fp2_t t0;
    digit_t e_half = (isog_len)>>1;
    // log2_of_e <- bit length of e_half (empty loop body, note the trailing ';')
    for(tmp = e_half, log2_of_e = 0; tmp > 0; tmp>>=1, ++log2_of_e);
    log2_of_e *= 2; // In order to ensure each splits is at most size log2_of_e
    ec_point_t SPLITTING_POINTS[log2_of_e], K2;
    copy_point(&SPLITTING_POINTS[0], kernel);
    int strategy = 0,    // Current element of the strategy to be used
    i, j;
    int BLOCK = 0,       // Keeps track of point order
    current = 0;         // Number of points being carried
    int XDBLs[log2_of_e]; // Number of doubles performed
    // If walk length is odd, we start with a 2-isogeny
    if(isog_len & 1){
        // isog_len-1 doublings of the kernel generator give the 2-isogeny kernel
        copy_point(&SPLITTING_POINTS[1], &SPLITTING_POINTS[0]);
        for(i = 0; i < isog_len-1; i++)
            xDBLv2(&SPLITTING_POINTS[1], &SPLITTING_POINTS[1], A24);
        xisog_2(A24, SPLITTING_POINTS[1]);
        xeval_2(SPLITTING_POINTS, SPLITTING_POINTS, 1);
        xeval_2(points, points, points_len);
    }
    // Chain of 4-isogenies
    for(j = 0; j < (e_half - 1); j++)
    {   
        // Get the next point of order 4
        while (BLOCK != (e_half -  1 - j) )
        {
            // A new split will be added
            current += 1;
            // We set the seed of the new split to be computed and saved
            copy_point(&SPLITTING_POINTS[current], &SPLITTING_POINTS[current - 1]);
            // Each strategy unit costs two doublings (one multiplication by 4)
            for(i = 0; i < 2*STRATEGY4[strategy]; i++)
                xDBLv2(&SPLITTING_POINTS[current], &SPLITTING_POINTS[current], A24);
            XDBLs[current] = STRATEGY4[strategy];  // The number of doublings performed is saved
            BLOCK += STRATEGY4[strategy];          // BLOCK is increased by the number of doublings performed
            strategy += 1;                  // Next, we move to the next element of the strategy
        }
        // Evaluate 4-isogeny
        xisog_4(A24, SPLITTING_POINTS[current]);
        // Push the first `current` carried points and the caller's points through it
        xeval_4(SPLITTING_POINTS, SPLITTING_POINTS, current);
        xeval_4(points, points, points_len);
        // Drop the point that was just used as kernel
        BLOCK -= XDBLs[current];  
        XDBLs[current] = 0;      
        current -= 1;            
    }
    // Final 4-isogeny
    xisog_4(A24, SPLITTING_POINTS[current]);
    xeval_4(points, points, points_len);
    // Output curve in the form (A:C)
    A24_to_AC(image, A24);
 }
 // Evaluates the odd-degree isogeny described by phi on `length` points and
 // writes the codomain curve to `image` in (A:C) form.
 //
 // The isogeny is processed one prime index at a time: indices
 // 0..P_LEN-1 use phi->ker_plus as kernel generator, indices
 // P_LEN..P_LEN+M_LEN-1 use phi->ker_minus. For index i, phi->degree[i]
 // successive steps are taken; each step runs kps / xisog / xeval for that
 // index and then kps_clear.
 void ec_eval_odd(ec_curve_t* image, const ec_isog_odd_t* phi,
        ec_point_t* points, unsigned short length){
    ec_point_t ker_plus, ker_minus, P, K, A24, B24;
    int i,j,k;
    AC_to_A24(&A24, &phi->curve);
    // Isogenies with kernel in E[p+1]
    copy_point(&ker_plus, &phi->ker_plus);
    copy_point(&ker_minus, &phi->ker_minus);
    for(i = 0; i < P_LEN; i++){
        // P <- ker_plus multiplied by every later prime power of this group
        copy_point(&P, &ker_plus);
        for(j = i+1; j < P_LEN; j++){
            for(k = 0; k < phi->degree[j]; k++)
                xMULv2(&P, &P, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A24);
        }
        for(k = 0; k < phi->degree[i]; k++){
            // K <- P multiplied by TORSION_ODD_PRIMES[i] (degree[i]-k-1) times:
            // the kernel point for this single step
            copy_point(&K, &P);
            for(j = 0; j < phi->degree[i]-k-1; j++)
                xMULv2(&K, &K, &(TORSION_ODD_PRIMES[i]), p_plus_minus_bitlength[i], &A24);
            kps(i, K, A24);
            xisog(&B24, i, A24);
            // Push the working point, both kernel generators and the caller's points
            xeval(&P, i, P, A24);
            xeval(&ker_plus, i, ker_plus, A24);
            xeval(&ker_minus, i, ker_minus, A24);
            for(j = 0; j < length; j++)
                xeval(&points[j], i, points[j], A24);
            // Move to the codomain curve and release per-step kps state
            copy_point(&A24, &B24);
            kps_clear(i);
        }
    }
    // Isogenies with kernel in E[p-1]
    for(i = P_LEN; i < P_LEN+M_LEN; i++){
        copy_point(&P, &ker_minus);
        for(j = i+1; j < P_LEN+M_LEN; j++){
            for(k = 0; k < phi->degree[j]; k++)
                xMULv2(&P, &P, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A24);
        }
        for(k = 0; k < phi->degree[i]; k++){
            copy_point(&K, &P);
            for(j = 0; j < phi->degree[i]-k-1; j++)
                xMULv2(&K, &K, &(TORSION_ODD_PRIMES[i]), p_plus_minus_bitlength[i], &A24);
            kps(i, K, A24);
            xisog(&B24, i, A24);
            // ker_plus is no longer needed in this phase, so only ker_minus is pushed
            xeval(&P, i, P, A24);
            xeval(&ker_minus, i, ker_minus, A24);
            for(j = 0; j < length; j++)
                xeval(&points[j], i, points[j], A24);
            copy_point(&A24, &B24);
            kps_clear(i);
        }
    }
    A24_to_AC(image, &A24);
 }
 // Computes a canonical representative `new` (with C = 1) for the curve `old`
 // together with the isomorphism `isom` from `old` to it (via ec_isomorphism).
 //
 // Three candidate values for A'^2 are formed: the original (A/C)^2 and the
 // two "other solutions" given by the formula below. The smallest one
 // according to fp2_cmp is kept and its square root becomes the new A.
 void ec_curve_normalize(ec_curve_t *new, ec_isom_t *isom, const ec_curve_t *old){
    fp2_t t0, t1, t2, t3, t4, t5;
    // Compute the other solutions:
    // A'^2 = [ sqrt(A^2-4C^2)*(9C^2-A^2) +- (A^3-3AC^2) ] / [ 2C^2*sqrt(A^2-4C^2) ]
    fp2_sqr(&t0, &old->C);      //C^2
    fp2_add(&t1, &t0, &t0);     //2C^2
    fp2_add(&t2, &t1, &t1);     //4C^2
    fp2_sqr(&t3, &old->A);      //A^2
    fp2_sub(&t2, &t3, &t2);     //A^2-4C^2
    fp2_sqrt(&t2);              //sqrt(A^2-4C^2)
    fp2_add(&t0, &t0, &t1);     //3C^2
    fp2_mul(&t1, &t2, &t1);     //2C^2*sqrt(A^2-4C^2)
    fp2_sub(&t5, &t3, &t0);     //A^2-3C^2
    fp2_mul(&t5, &t5, &old->A);     //A^3-3AC^2
    fp2_add(&t4, &t0, &t0);     //6C^2
    fp2_add(&t0, &t4, &t0);     //9C^2
    fp2_sub(&t0, &t0, &t3);     //9C^2-A^2
    fp2_add(&t3, &t3, &t3);     //2A^2
    fp2_mul(&t3, &t3, &t2);     //2A^2*sqrt(A^2-4C^2)
    fp2_mul(&t2, &t2, &t0);     //sqrt(A^2-4C^2)*(9C^2-A^2)
    fp2_add(&t0, &t2, &t5);     //sqrt(A^2-4C^2)*(9C^2-A^2) + (A^3-3AC^2)
    fp2_sub(&t2, &t2, &t5);     //sqrt(A^2-4C^2)*(9C^2-A^2) - (A^3-3AC^2)
    fp2_inv(&t1);               //1/(2C^2*sqrt(A^2-4C^2))
    fp2_mul(&t0, &t0, &t1);     // First solution
    fp2_mul(&t2, &t2, &t1);     // Second solution
    fp2_mul(&t1, &t3, &t1);     // Original solution, i.e. A^2/C^2
    // Choose the lexicographically first solution (t0 ends up as the minimum of the three)
    if(fp2_cmp(&t0, &t1)==1)
        fp2_copy(&t0, &t1);
    if(fp2_cmp(&t0, &t2)==1)
        fp2_copy(&t0, &t2);
    // Copy the solution: new A = sqrt(chosen A'^2), new C = 1 (Montgomery form)
    fp2_sqrt(&t0);
    ec_curve_t E;
    fp2_copy(&E.A, &t0);
    fp_mont_setone(E.C.re);
    fp_set(E.C.im, 0);
    // The isomorphism is computed against the local copy E before `new` is
    // written, so the inputs are still intact when it runs.
    ec_isomorphism(isom, old, &E);
    fp2_copy(&new->A, &E.A);
    fp2_copy(&new->C, &E.C);
 }
 // Computes the isomorphism `isom` between the curves `from` and `to`,
 // stored as three fp2 coefficients (Nx, Nz, D). As used by ec_iso_eval, the
 // map on x-only projective points is (X : Z) -> (Nx*X - Nz*Z : D*Z).
 // NOTE(review): the curves are presumably required to be isomorphic; no
 // check is performed here.
 void ec_isomorphism(ec_isom_t* isom, const ec_curve_t* from, const ec_curve_t* to){
    fp2_t t0, t1, t2, t3, t4;
    fp2_mul(&t0, &from->A, &to->C);
    fp2_sqr(&t0, &t0);                  //fromA^2toC^2
    fp2_mul(&t1, &to->A, &from->C);
    fp2_sqr(&t1, &t1);                  //toA^2fromC^2
    fp2_mul(&t2, &to->C, &from->C);
    fp2_sqr(&t2, &t2);                  //toC^2fromC^2
    fp2_add(&t3, &t2, &t2);
    fp2_add(&t2, &t3, &t2);             //3toC^2fromC^2
    fp2_sub(&t3, &t2, &t0);             //3toC^2fromC^2-fromA^2toC^2
    fp2_sub(&t4, &t2, &t1);             //3toC^2fromC^2-toA^2fromC^2
    fp2_inv(&t3);
    fp2_mul(&t4, &t4, &t3);
    fp2_sqrt(&t4);                      //lambda^2 constant for SW isomorphism
    fp2_sqr(&t3, &t4);
    fp2_mul(&t3, &t3, &t4);             //lambda^6
    // Check sign of lambda^2, such that lambda^6 has the right sign
    fp2_sqr(&t0, &from->C);
    fp2_add(&t1, &t0, &t0);
    fp2_add(&t1, &t1, &t1);
    fp2_add(&t1, &t1, &t1);             // 8fromC^2
    fp2_add(&t0, &t0, &t1); // 9fromC^2
    fp2_sqr(&t2, &from->A);
    fp2_add(&t2, &t2, &t2); // 2fromA^2
    fp2_sub(&t2, &t2, &t0);
    fp2_mul(&t2, &t2, &from->A); // -9fromC^2fromA+2fromA^3
    fp2_sqr(&t0, &to->C);
    fp2_mul(&t0, &t0, &to->C);
    fp2_mul(&t2, &t2, &t0);     //toC^3* [-9fromC^2fromA+2fromA^3]
    fp2_mul(&t3, &t3, &t2);             //lambda^6 * toC^3 * [-9fromC^2fromA+2fromA^3]
    fp2_sqr(&t0, &to->C);
    fp2_add(&t1, &t0, &t0);
    fp2_add(&t1, &t1, &t1);
    fp2_add(&t1, &t1, &t1);             // 8toC^2
    fp2_add(&t0, &t0, &t1); // 9toC^2
    fp2_sqr(&t2, &to->A);
    fp2_add(&t2, &t2, &t2); // 2toA^2
    fp2_sub(&t2, &t2, &t0);
    fp2_mul(&t2, &t2, &to->A); // -9toC^2toA+2toA^3
    fp2_sqr(&t0, &from->C);
    fp2_mul(&t0, &t0, &from->C);
    fp2_mul(&t2, &t2, &t0);     //fromC^3* [-9toC^2toA+2toA^3]
    // If the two sides disagree, the other square root is the right lambda^2
    if(!fp2_is_equal(&t2, &t3))
        fp2_neg(&t4, &t4);
    // Mont -> SW -> SW -> Mont
    fp_mont_setone(t0.re);
    fp_set(t0.im, 0);                   // t0 = 1
    fp2_add(&isom->D, &t0, &t0);
    fp2_add(&isom->D, &isom->D, &t0);   // D = 3
    fp2_mul(&isom->D, &isom->D, &from->C);
    fp2_mul(&isom->D, &isom->D, &to->C);    // D = 3*fromC*toC
    fp2_mul(&isom->Nx, &isom->D, &t4);      // Nx = D*lambda^2
    fp2_mul(&t4, &t4, &from->A);
    fp2_mul(&t4, &t4, &to->C);              // lambda^2*fromA*toC
    fp2_mul(&t0, &to->A, &from->C);         // toA*fromC
    fp2_sub(&isom->Nz, &t0, &t4);           // Nz = toA*fromC - lambda^2*fromA*toC
 }
 void ec_iso_inv(ec_isom_t* isom){
    // In-place inversion of an isomorphism: flip the sign of Nz and exchange
    // the Nx and D coefficients.
    fp2_t saved_nx;
    fp2_neg(&isom->Nz, &isom->Nz);
    fp2_copy(&saved_nx, &isom->Nx);
    fp2_copy(&isom->Nx, &isom->D);
    fp2_copy(&isom->D, &saved_nx);
 }
 void ec_iso_eval(ec_point_t *P, ec_isom_t* isom){
    // Apply the isomorphism to P in place:
    //   (X : Z) -> (Nx*X - Nz*Z : D*Z)
    fp2_t shift;
    fp2_mul(&shift, &P->z, &isom->Nz);      // Nz*Z, taken before Z is rescaled
    fp2_mul(&P->x, &P->x, &isom->Nx);       // Nx*X
    fp2_sub(&P->x, &P->x, &shift);          // Nx*X - Nz*Z
    fp2_mul(&P->z, &P->z, &isom->D);        // D*Z
 }
--- a/src/ec/ref/ecx/kps.c
+++ b/src/ec/ref/ecx/kps.c
@@ -0,0 +1,228 @@
 #include "isog.h"
 #include "curve_extras.h"
 #include <assert.h>
 // File-scope working state shared by the kps_* routines in this file
 // (and, per the comments below, by xisog and xeval).
 int sI, sJ, sK;	// Sizes of each current I, J, and K
 fp2_t I[sI_max][2],		// I also serves as the linear factors of the polynomial h_I(X)
 			EJ_0[sJ_max][3], EJ_1[sJ_max][3];	// To be used in xisog and xeval
 ec_point_t J[sJ_max], K[sK_max];		// Finite subsets of the kernel
 fp2_t XZJ4[sJ_max],		// -4* (Xj * Zj) for each j in J, and x([j]P) = (Xj : Zj)
    rtree_A[(1 << (ceil_log_sI_max+2)) - 1],		// constant multiple of the reciprocal tree computation
    A0;			// constant multiple of the reciprocal R0
 poly ptree_hI[(1 << (ceil_log_sI_max+2)) - 1],		// product tree of h_I(X)
     rtree_hI[(1 << (ceil_log_sI_max+2)) - 1],		// reciprocal tree of h_I(X)
     ptree_EJ[(1 << (ceil_log_sJ_max+2)) - 1];		// product tree of E_J(X)
 fp2_t R0[2*sJ_max + 1];		// Reciprocal of h_I(X) required in the scaled remainder tree approach
 int deg_ptree_hI[(1 << (ceil_log_sI_max+2)) - 1],	// degree of each node in the product tree of h_I(X)
    deg_ptree_EJ[(1 << (ceil_log_sJ_max+2)) - 1];	// degree of each node in the product tree of E_J(X)
 fp2_t leaves[sI_max];		// leaves of the remainder tree, which are required in the Resultant computation
 // -----------------------------------------------------------
 // -----------------------------------------------------------
 // Traditional Kernel Point computation (KPs)
 // Kernel data for the degree-4 isogeny evaluation, derived from P = (X : Z)
 // and stored in the file-scope array K:
 //   K[0] = (4Z^2 : 2Z^2),  K[1].x = X - Z,  K[2].x = X + Z
 void kps_4(ec_point_t const P)
 {
 	ec_point_t *k0 = &K[0];
 	fp2_sqr(&k0->x, &P.z);				// Z^2
 	fp2_add(&k0->z, &k0->x, &k0->x);	// 2Z^2
 	fp2_add(&k0->x, &k0->z, &k0->z);	// 4Z^2
 	fp2_sub(&K[1].x, &P.x, &P.z);		// X - Z
 	fp2_add(&K[2].x, &P.x, &P.z);		// X + Z
 }
 void eds2mont(ec_point_t* P)
 {
 	// In-place change of coordinates: (x : z) <- (z + x : z - x)
 	fp2_t diff;
 	fp2_sub(&diff, &(P->z), &(P->x));		// z - x, saved before x is overwritten
 	fp2_add(&(P->x), &(P->z), &(P->x));		// x <- z + x
 	fp2_copy(&(P->z), &diff);				// z <- z - x
 }
 // Differential doubling in Twisted Edwards model
 //   Q : output point, written only in the last two statements
 //   P : input point, read through its x and z fields
 //   A : curve constant, read through its x and z fields
 // Q may alias P: every read of P happens before Q is written.
 void ydbl(ec_point_t* Q, ec_point_t* const P, ec_point_t const* A)
 {
 	fp2_t t_0, t_1, X, Z;
 	fp2_sqr(&t_0, &(P->x));			// P.x^2
 	fp2_sqr(&t_1, &(P->z));			// P.z^2
 	fp2_mul(&Z, &(A->z), &t_0);		// A.z * P.x^2
 	fp2_mul(&X, &Z, &t_1);			// A.z * P.x^2 * P.z^2
 	fp2_sub(&t_1, &t_1, &t_0);		// P.z^2 - P.x^2
 	fp2_mul(&t_0, &(A->x), &t_1);	// A.x * (P.z^2 - P.x^2)
 	fp2_add(&Z, &Z, &t_0);
 	fp2_mul(&Z, &Z, &t_1);			// (A.z*P.x^2 + A.x*(P.z^2 - P.x^2)) * (P.z^2 - P.x^2)
 	fp2_sub(&(Q->x), &X, &Z);
 	fp2_add(&(Q->z), &X, &Z);
 }
 // Differential addition in Twisted Edwards model
 //   R  : output point
 //   P,Q: the two points being added
 //   PQ : the third input of the differential addition (callers in this file
 //        pass the previously computed multiple, i.e. the difference P - Q)
 // R may alias any input: all inputs are read before R is written.
 void yadd(ec_point_t* R, ec_point_t* const P, ec_point_t* const Q, ec_point_t* const PQ)
 {
 	fp2_t a, b, c, d, X, Z;
 	fp2_mul(&a, &(P->z), &(Q->x));		// P.z * Q.x
 	fp2_mul(&b, &(P->x), &(Q->z));		// P.x * Q.z
 	fp2_add(&c, &a, &b);
 	fp2_sub(&d, &a, &b);
 	fp2_sqr(&c, &c);					// (P.z*Q.x + P.x*Q.z)^2
 	fp2_sqr(&d, &d);					// (P.z*Q.x - P.x*Q.z)^2
 	fp2_add(&a, &(PQ->z), &(PQ->x));	// PQ.z + PQ.x
 	fp2_sub(&b, &(PQ->z), &(PQ->x));	// PQ.z - PQ.x
 	fp2_mul(&X, &b, &c);
 	fp2_mul(&Z, &a, &d);
 	fp2_sub(&(R->x), &X, &Z);
 	fp2_add(&(R->z), &X, &Z);
 }
 // Kernel point computation for the tvelu formulae.
 // Fills the file-scope array K with the multiples of P, in Twisted Edwards
 // y-coordinates, for the prime TORSION_ODD_PRIMES[i].
 void kps_t(uint64_t const i, ec_point_t const P, ec_point_t const A)
 {
 	const int half = ((int)TORSION_ODD_PRIMES[i] - 1) / 2;
 	int idx = 2;
 	// Map the Montgomery x(P) to its Twisted Edwards form y(P) = (X - Z : X + Z)
 	fp2_sub(&K[0].x, &P.x, &P.z);
 	fp2_add(&K[0].z, &P.x, &P.z);
 	// y([2]P)
 	ydbl(&K[1], &K[0], &A);
 	// y([idx+1]P) from the two previous multiples
 	while (idx < half)
 	{
 		yadd(&K[idx], &K[idx - 1], &K[0], &K[idx - 2]);
 		++idx;
 	}
 }
 // -----------------------------------------------------------
 // -----------------------------------------------------------
 // Kernel Point computation (KPs) used in velu SQRT
 // Fills the file-scope sets J, I and K (and XZJ4) with multiples of the
 // kernel point P for the prime TORSION_ODD_PRIMES[i], then builds the product
 // tree of h_I and either its reciprocal tree (unscaled approach) or the
 // reciprocal R0/A0 (scaled approach).
 //   i : index into TORSION_ODD_PRIMES / sizeI / sizeJ / sizeK
 //   P : kernel point, x-only projective coordinates
 //   A : curve constant passed through to xDBLv2
 void kps_s(uint64_t const i, ec_point_t const P, ec_point_t const A)
 {
 	// =================================================================================
 	assert(TORSION_ODD_PRIMES[i] > gap);	// Ensuring velusqrt is used for l_i > gap
 	// The optimal bounds must corresponds to sI, sJ, and sK
 	sI = sizeI[i];	// Size of I
 	sJ = sizeJ[i];	// Size of J
 	sK = sizeK[i];	// Size of K
 	assert(sI >= sJ);	// Ensuring #I >= #J
 	assert(sK >= 0);	// Recall, it must be that #K >= 0
 	assert(sJ > 1);		// ensuring sI >= sJ > 1
 	// =================================================================================
 	// Now, we can proceed by the general case
 	int j;
 	// --------------------------------------------------
 	// Computing [j]P for each j in {1, 3, ..., 2*sJ - 1}
 	ec_point_t P2, P4;
 	copy_point(&J[0], &P);				//    x(P)
 	// Next computations are required for allowing the use of the function get_A()
 	fp2_mul(&XZJ4[0], &J[0].x, &J[0].z);					//   Xj*Zj
 	fp2_add(&XZJ4[0], &XZJ4[0], &XZJ4[0]);					//  2Xj*Zj
 	fp2_add(&XZJ4[0], &XZJ4[0], &XZJ4[0]);					//  4Xj*Zj
 	fp2_neg(&XZJ4[0], &XZJ4[0]);					// -4Xj*Zj
 	xDBLv2(&P2, &P, &A);					// x([2]P)
 	xADD(&J[1], &P2, &J[0], &J[0]);			// x([3]P)
 	// Next computations are required for allowing the use of the function get_A()
 	fp2_mul(&XZJ4[1], &J[1].x, &J[1].z);					//   Xj*Zj
 	fp2_add(&XZJ4[1], &XZJ4[1], &XZJ4[1]);					//  2Xj*Zj
 	fp2_add(&XZJ4[1], &XZJ4[1], &XZJ4[1]);					//  4Xj*Zj
 	fp2_neg(&XZJ4[1], &XZJ4[1]);					// -4Xj*Zj
 	for (j = 2; j < sJ; j++)
 	{
 		xADD(&J[j], &J[j - 1], &P2, &J[j - 2]);	// x([2*j + 1]P)
 		// Next computations are required for allowing the use of the function get_A()
 		fp2_mul(&XZJ4[j], &J[j].x, &J[j].z);					//   Xj*Zj
 		fp2_add(&XZJ4[j], &XZJ4[j], &XZJ4[j]);					//  2Xj*Zj
 		fp2_add(&XZJ4[j], &XZJ4[j], &XZJ4[j]);					//  4Xj*Zj
 		fp2_neg(&XZJ4[j], &XZJ4[j]);					// -4Xj*Zj
 	};
 	// ----------------------------------------------------------
 	// Computing [i]P for i in { (2*sJ) * (2i + 1) : 0 <= i < sI}
 	// and the linear factors of h_I(W)
 	ec_point_t Q, Q2, tmp1, tmp2;
 	int bhalf_floor= sJ >> 1;
 	int bhalf_ceil = sJ - bhalf_floor;
 	xDBLv2(&P4, &P2, &A);								// x([4]P)
 	// The swap mask is all-ones when sJ is odd and zero otherwise, so P2
 	// temporarily holds x([4]P) only for odd sJ; the second swap undoes it.
 	swap_points(&P2, &P4, -(uint64_t)(sJ % 2));								// x([4]P) <--- conditional swap ---> x([2]P)
 	xADD(&Q, &J[bhalf_ceil], &J[bhalf_floor - 1], &P2);	// Q := [2b]P
 	swap_points(&P2, &P4, -(uint64_t)(sJ % 2));								// x([4]P) <--- conditional swap ---> x([2]P)
 	// .............................................
 	xDBLv2(&Q2, &Q, &A);					// x([2]Q)
 	xADD(&tmp1, &Q2, &Q, &Q);	// x([3]Q)
 	// Each I[j] stores the linear factor (Zj * W - Xj) as the pair (-Xj, Zj)
 	fp2_neg(&I[0][0], &Q.x);
 	fp2_copy(&I[0][1], &Q.z);
 	fp2_neg(&I[1][0], &tmp1.x);
 	fp2_copy(&I[1][1], &tmp1.z);
 	copy_point(&tmp2, &Q);
 	for (j = 2; j < sI; j++){
 		xADD(&tmp2, &tmp1, &Q2, &tmp2);	// x([2*j + 1]Q)
 		fp2_neg(&I[j][0], &tmp2.x);
 		fp2_copy(&I[j][1], &tmp2.z);
 		// Unconditional swap (all-ones mask): tmp1 becomes the newest multiple
 		swap_points(&tmp1, &tmp2, -(uint64_t)1);
 	}
 	// ----------------------------------------------------------------
 	// Computing [k]P for k in { 4*sJ*sI + 1, ..., l - 6, l - 4, l - 2}
 	// In order to avoid BRANCHES we always copy into K[0] and K[1]
 	// by assuming that these entries are only used when sK >= 1 and 
 	// sK >= 2, respectively.
 	//if (sK >= 1)
 	copy_point(&K[0], &P2);				//       x([l - 2]P) = x([2]P)
 	//if (sK >= 2)
 	copy_point(&K[1], &P4);				//       x([l - 4]P) = x([4]P)
 	for (j = 2; j < sK; j++)
 		xADD(&K[j], &K[j - 1], &P2, &K[j - 2]);	// x([l - 2*(j+1)]P) = x([2 * (j+1)]P)
 	// ----------------------------------------------------------------
 	//                   ~~~~~~~~               ~~~~~~~~
 	//                    |    |                 |    |
 	// Computing h_I(W) = |    | (W - x([i]P)) = |    | (Zi * W - Xi) / Zi where x([i]P) = Xi/Zi
 	//                    i in I                 i in I
 	// In order to avoid costly inverse computations in fp, we work with projective coordinates
 	product_tree_LENFeq2(ptree_hI, deg_ptree_hI, 0, I, sI);				// Product tree of hI
 	if (!scaled)
 	{
 		// (unscaled) remainder tree approach
 		reciprocal_tree(rtree_hI, rtree_A, 2*sJ + 1, ptree_hI, deg_ptree_hI, 0, sI);	// Reciprocal tree of hI
 	}
 	else
 	{
 		// scaled remainder tree approach
 		// f_rev holds the root polynomial of the product tree with its
 		// sI + 1 coefficients in reverse order
 		fp2_t f_rev[sI_max + 1];
 		for (j = 0; j < (sI + 1); j++)
 			fp2_copy(&f_rev[j], &ptree_hI[0][sI - j]);
 		// Reciprocal modulo x^max(sI, 2*sJ - sI + 1)
 		if (sI > (2*sJ - sI + 1))
 			reciprocal(R0, &A0, f_rev, sI + 1, sI);
 		else
 			reciprocal(R0, &A0, f_rev, sI + 1, 2*sJ - sI + 1);
 	};
 }
 void kps_clear(int i){
 	// Releases the trees built by kps_s for prime index i. Nothing is done
 	// for primes at or below `gap`; the reciprocal tree is only cleared in
 	// the unscaled approach.
 	if (TORSION_ODD_PRIMES[i] <= gap)
 		return;
 	if (!scaled)
 		clear_tree(rtree_hI, 0, sizeI[i]);
 	clear_tree(ptree_hI, 0, sizeI[i]);
 }
--- a/src/ec/ref/ecx/poly-mul.c
+++ b/src/ec/ref/ecx/poly-mul.c
--- a/src/ec/ref/ecx/poly-redc.c
+++ b/src/ec/ref/ecx/poly-redc.c
@@ -0,0 +1,349 @@
 #define _POLY_MUL_REDC_H_
 #include "poly.h"
 #include <assert.h>
 void reciprocal(poly h, fp2_t *c, const poly f, const int lenf, const int n){
  // Writes a polynomial to h and a field element to c such that f*h = c mod x^n
  // REQUIRES h to have space for n terms
  // NOT responsible for terms in h beyond h[n-1]
  //
  // h may alias f (reciprocal_tree calls it that way): in every case below
  // the needed coefficients of f are read before the matching h entries are
  // overwritten.
  int i;
  // Case when f needs to be padded with zeroes
  if(n > lenf)
  {
    fp2_t fpad[n];
    for(i = 0; i < lenf; i++)
      fp2_copy(&fpad[i], &f[i]);
    for(i = lenf; i < n; i++)
      fp2_set(&fpad[i], 0);
    reciprocal(h, c, fpad, n, n);
    return;
  }
  // Trivial case: only c is written (set to 0); h is left untouched
  if(n == 0)
  {
    fp2_set(&*c, 0);
    return;
  }
  // Case n = 1: h = 1, c = f[0]
  if(n == 1)
  {
    fp2_copy(&*c, &f[0]);
    fp_mont_setone(h[0].re);fp_set(h[0].im,0);
    return;
  }
  // Case n = 2: h = f[0] - f[1]*x, c = f[0]^2
  if(n == 2)
  {
    fp2_sqr(&*c, &f[0]);
    fp2_copy(&h[0], &f[0]);
    fp2_neg(&h[1], &f[1]);
    return;
  }
  // Case n = 3: extend the n = 2 solution by one coefficient
  if(n == 3)
  {
    fp2_t t0, t1;
    fp2_sqr(&t0, &f[1]);
    fp2_mul(&t1, &f[0], &f[2]);
    fp2_sub(&t1, &t1, &t0);
    fp2_mul(&t1, &t1, &f[0]);       // f[0]*(f[0]*f[2] - f[1]^2)
    reciprocal(h, c, f, 2, 2);
    fp2_mul(&h[0], &h[0], &*c);
    fp2_mul(&h[1], &h[1], &*c);
    fp2_neg(&h[2], &t1);
    fp2_sqr(&*c, &*c);
    return;
  }
  // Case n = 4: extend the n = 2 solution (g, t3) by two coefficients
  if(n == 4)
  {
    fp2_t t0, t1, t2, t3, g[2];
    reciprocal(g, &t3, f, 2, 2);
    fp2_sqr(&t0, &f[1]);
    fp2_mul(&t1, &g[0], &f[2]);
    fp2_mul(&t2, &g[0], &f[3]);
    fp2_mul(&h[1], &g[1], &f[2]);   // h[1] is used as scratch space here
    fp2_sub(&t0, &t1, &t0);
    fp2_add(&t1, &t2, &h[1]);
    fp2_mul(&t2, &t0, &g[0]);
    fp2_mul(&h[1], &t0, &g[1]);     // scratch again
    fp2_mul(&h[3], &t1, &g[0]);
    fp2_add(&h[3], &h[1], &h[3]);
    fp2_mul(&h[0], &g[0], &t3);
    fp2_mul(&h[1], &g[1], &t3);     // final value of h[1]
    fp2_neg(&h[2], &t2);
    fp2_neg(&h[3], &h[3]);
    fp2_sqr(&*c, &t3);
    return;
  }
  // General case
  // Compute the reciprocal g mod x^m for m = ceil(n/2)
  // Then f*g-c is multiple of x^m so we only care about terms from m to n-1
  const int m = n - (n>>1);
  fp2_t g[m], t[m], t0;
  reciprocal(g, &t0, f, lenf, m);
  poly_mul_middle(t, g, m, f, n);
  poly_mul_low(t, n-m, g, m, &(t[2*m-n]), n-m);
  // Low half: g scaled by t0; high half: negated correction terms; c = t0^2
  for(i = 0; i < m; i++)
    fp2_mul(&h[i], &g[i], &t0);
  for(i = m; i < n; i++)
    fp2_neg(&h[i], &t[i-m]);
  fp2_sqr(&*c, &t0);
  return;
 }
 void poly_redc(poly h, const poly g, const int leng, const poly f, const int lenf,//
 	       const poly f_rev_inv, const fp2_t c)
 {
  // Computes h(x) =  a * g(x) mod f(x) for some scalar a, writing lenf-1 terms to h.
  // REQUIRES an inverse f_rev_inv such that f_rev*f_rev_inv = c mod x^(leng-lenf+1),
  // where f_rev is the polynomial with the coefficients of f listed in reverse order.
  // The scalar a is equal to c, except for special cases:
  //    - If leng<lenf (no reduction needed) then a = 1
  //    - If lenf = leng = 2, then a = f[1] 
  //    - If lenf = leng = 3, then a = f[2] 
  //    - If lenf=2, leng=3 then a = 2*f[1]^2
  // In those special cases f_rev_inv and c are not read.
  //
  // REQUIRES h to have space for lenf-1 terms
  // NOT responsible for terms in h beyond h[lenf-2]
  int i;
  // Case without reduction: copy g and zero-pad up to lenf-1 terms
  if(leng < lenf)
  {
    for(i = 0; i < leng; i++)
      fp2_copy(&h[i], &g[i]);
    for(i = leng; i < lenf-1; i++)
      fp2_set(&h[i], 0);
    return;
  }
  // Small cases for f linear
  if(lenf == 2)
  {
    if(leng == 2)
    {
      // h[0] = g[0]*f[1] - g[1]*f[0]
      fp2_t t0;
      fp2_mul(&t0, &g[0], &f[1]);
      fp2_mul(&h[0], &g[1], &f[0]);
      fp2_sub(&h[0], &t0, &h[0]);
      return;
    }
    if(leng == 3)
    {
      // h[0] = 2*f[0]^2*g[2] + 2*f[1]^2*g[0] - 2*f[0]*f[1]*g[1]
      fp2_t f0f1, f02, f12;
      fp2_sqr(&f02, &f[0]);
      fp2_sqr(&f12, &f[1]);
      fp2_sub(&f0f1, &f[0], &f[1]);
      fp2_sqr(&f0f1, &f0f1);
      fp2_sub(&f0f1, &f0f1, &f02);
      fp2_sub(&f0f1, &f0f1, &f12);      // (f0 - f1)^2 - f0^2 - f1^2 = -2*f0*f1
      fp2_add(&f02, &f02, &f02);
      fp2_add(&f12, &f12, &f12);
      fp2_mul(&f02, &f02, &g[2]);
      fp2_mul(&f12, &f12, &g[0]);
      fp2_mul(&f0f1, &f0f1, &g[1]);
      fp2_add(&h[0], &f02, &f12);
      fp2_add(&h[0], &h[0], &f0f1);
      return;
    }
    // lenf == 2 with leng > 3 falls through to the general case
  }
  // Small case for f quadratic
  if(lenf == 3 && leng == 3)
  {
    // h = f[2]*g - g[2]*f, truncated to two terms
    fp2_t f2g1, f2g0, f1g2;
    fp2_mul(&f2g1, &g[1], &f[2]);
    fp2_mul(&f2g0, &g[0], &f[2]);
    fp2_mul(&f1g2, &g[2], &f[1]);
    fp2_mul(&h[0], &g[2], &f[0]);
    fp2_sub(&h[0], &f2g0, &h[0]);
    fp2_sub(&h[1], &f2g1, &f1g2);
    return;
  }
  // General case
  fp2_t g_reversed[leng], Q[leng - lenf + 1], Q_reversed[leng - lenf + 1];
  for(i = 0; i < leng; i++)
    fp2_copy(&g_reversed[i], &g[leng-1-i]);
  // Q = low (leng-lenf+1) terms of f_rev_inv * rev(g): the scaled, reversed quotient
  poly_mul_low(Q, leng-lenf+1, f_rev_inv, leng-lenf+1, g_reversed, leng-lenf+1);
  for(i = 0; i < leng - lenf + 1; i++)
    fp2_copy(&Q_reversed[i], &Q[leng - lenf - i]);
  // g_reversed is reused as scratch: low (lenf-1) terms of quotient * f
  poly_mul_low(g_reversed, lenf-1, Q_reversed, leng-lenf+1, f, lenf);
  // h = c*g - quotient*f, on the low lenf-1 terms
  for(i = 0; i < lenf-1; i++)
  {
    fp2_mul(&h[i], &g[i], &c);
    fp2_sub(&h[i], &h[i], &g_reversed[i]);
  }
  return;
 }
 void reciprocal_tree(poly *R, fp2_t *A, const int leng, const poly H[], const int DEG[],//
 		     const int root, const int n)
 {
  // Given a product tree H with degrees tree DEG rooted at root and generated 
  // by n polynomials, writes the reverse-reciprocal polynomials to R and field elements 
  // to A such that Rev(H[i])*R[i] = A[i] mod x^(N) for all nodes but the leaves.
  // The mod is N = deg(parent)-deg(self) for inner nodes, or N = leng - deg(root) for the root.
  //
  // REQUIRES that leng >= DEG[0] and that R,A have enough space for the tree (see product_tree)
  //
  // R[root] is heap-allocated here for every visited node, leaves included.
  // NOTE(review): the malloc result is not checked, and the buffers are
  // presumably released by clear_tree - confirm against its definition.
  if(n == 0)
    return;
  // Heap-style indexing: children of k are 2k+1 and 2k+2
  const int parent = (root-1) >> 1;
  const int brother = root - 1 + 2*(root & 1);
  int lenr;
  if(root > 0)
    lenr = DEG[parent] - DEG[root];
  else
    lenr = leng - DEG[root];
  R[root] = malloc(sizeof(fp2_t)*lenr);
  // ----------------------------------
  // base cases determined by poly_redc
  // Leaf: the buffer was allocated above but no reciprocal is computed
  if(n == 1)
    return;
  // case for computing  g mod f when len(f), len(g) = 3
  // (poly_redc has a closed form there, so no reciprocal is needed at this node)
  if (DEG[root] == 2 && lenr == 1)
  {
    reciprocal_tree(R, A, lenr-1, H, DEG, 2*root+1, n-(n>>1));
    reciprocal_tree(R, A, lenr-1, H, DEG, 2*root+2, n>>1);
    return;
  }
  // ----------------------------------
  int i;
  // When the parent's inverse was calculated to a smaller modulus, need to invert from scratch
  if(root == 0 || leng < lenr)
  {
    // Load the reversed coefficients of H[root] (zero-padded to lenr terms)
    // into R[root], then invert in place
    for(i = 0; i < lenr && i < DEG[root]+1; i++)
      fp2_copy(&R[root][i], &H[root][DEG[root]-i]);
    for(i = DEG[root]+1; i < lenr; i++){
      fp2_set(&R[root][i], 0);
    }
    reciprocal(R[root], &(A[root]), R[root], lenr, lenr);
  }
  else
  {
  // When parent's inverse was to a greater/equal modulus, this inverse can be obtained from it
    // by multiplying the parent's reciprocal with the reversed brother polynomial
    for(i = 0; i < lenr; i++)
      fp2_copy(&R[root][i], &H[brother][DEG[brother]-i]);
    poly_mul_low(R[root], lenr, R[parent], leng, R[root], lenr);
    fp2_copy(&A[root], &A[parent]);
  }
  // Now move on to the children
  reciprocal_tree(R, A, lenr-1, H, DEG, 2*root+1, n-(n>>1));
  reciprocal_tree(R, A, lenr-1, H, DEG, 2*root+2, n>>1);
  return;
 }
 void multieval_unscaled(fp2_t REM[], const poly g, const int leng, const poly R[], const fp2_t A[],//
 		const poly H[], const int DEG[], const int root, const int n)
 {
  // Remainder-tree descent (unscaled variant).
  //
  // H is the product tree built from f_0, ..., f_{n-1}, DEG its degrees and
  // (R, A) the matching reciprocal tree; `root` is the node to start from.
  // For each i the constant term of c_i*g mod f_i is stored in REM[i]. The
  // scalars c_i are unspecified, but depend only on leng and the f_i, so they
  // cancel when ratios of remainders of equal-length g's are taken.
  //
  // REQUIRES REM to have space for n terms
  if(n == 0)
    return;
  // Reduce g modulo this node's polynomial (DEG[root] output terms)
  const int deg = DEG[root];
  fp2_t reduced[deg];
  poly_redc(reduced, g, leng, H[root], deg+1, R[root], A[root]);
  if(n == 1)
  {
    // Leaf: keep the constant term only
    fp2_copy(&REM[0], &reduced[0]);
    return;
  }
  // Left subtree takes ceil(n/2) leaves, right subtree the remaining floor(n/2)
  const int n_left = n - (n>>1);
  const int n_right = n>>1;
  multieval_unscaled(REM, reduced, deg, R, A, H, DEG, 2*root+1, n_left);
  multieval_unscaled(&(REM[n_left]), reduced, deg, R, A, H, DEG, 2*root+2, n_right);
 }
 void multieval_scaled(fp2_t REM[], const poly G, const poly H[], //
 			   const int DEG[], const int root, const int n)
 {
  // Remainder-tree descent (scaled variant).
  //
  // H is the product tree generated by LINEAR f_0,...,f_{n-1}, DEG its degrees
  // and `root` the node to start from. The constant term of c_i * g mod f_i(x)
  // is written to REM[i]; the scalars c_i are unspecified but depend only on
  // leng and the f_i, so they cancel in ratios of remainders of different g's
  // of the same length.
  //
  // REQUIRES REM to have space for n terms and n > 1
  // Also REQUIRES G = rev((rev(g mod F)) * F_rev_inv mod x^deg(F)-1) where F = H[root]
  // and F_rev_inv is its reverse's reciprocal mod x^deg(F)
  const int n_left = n - (n>>1);
  const int n_right = n>>1;
  if(root == 0)
  {
    // At the root G is used as given: read the answer or hand it to the children
    if(n == 1)
    {
      fp2_copy(&REM[0], &G[DEG[root]-1]);
      return;
    }
    multieval_scaled(REM, G, H, DEG, 2*root+1, n_left);
    multieval_scaled(&(REM[n_left]), G, H, DEG, 2*root+2, n_right);
    return;
  }
  // Heap-style indexing of the relatives of this node
  const int up = (root-1) >> 1;
  const int sibling = root - 1 + 2*(root & 1);
  const int aunt = up - 1 + 2*(up & 1);
  fp2_t prod[DEG[sibling]+1];
  // Direct children of the root (root <= 2) receive a G of DEG[0] terms;
  // deeper nodes receive DEG[aunt]+1 terms
  const int lenG = (root > 2) ? DEG[aunt]+1 : DEG[0];
  poly_mul_middle(prod, H[sibling], DEG[sibling]+1, G, lenG);
  if(n == 1)
  {
    fp2_copy(&REM[0], &prod[DEG[sibling]]);
    return;
  }
  multieval_scaled(REM, prod, H, DEG, 2*root+1, n_left);
  multieval_scaled(&(REM[n_left]), prod, H, DEG, 2*root+2, n_right);
 }
--- a/src/ec/ref/ecx/tedwards.c
+++ b/src/ec/ref/ecx/tedwards.c
@@ -0,0 +1,231 @@
 #include <tedwards.h>
 #include <assert.h>
 // a*x^2+y^2=1+d*x^2*y^2
 // a = A.x/A.z + 2, d = A.x/A.z - 2
 void ted_init(ted_point_t* P)
 { // Initialize point as identity element (X:Y:Z:T) <- (0:1:1:0)
    //
    // Every coordinate is cleared first, then Y and Z are set to the
    // Montgomery representation of 1. This matches what mont_to_ted_point()
    // produces for the point at infinity, and ted_to_mont_point() maps it to
    // (Z+Y : Z-Y) with a zero z-coordinate.
    //
    // Fix: X used to be the coordinate set to 1 while Y and Z stayed 0,
    // giving (1:0:0:0), which contradicts the identity stated above.
    fp_t one = {0};
    // sizeof(*P) clears the whole point without relying on the word size
    memset(P, 0, sizeof(*P));
    one[0] = 1;
    fp_tomont(P->y.re, one);
    fp_tomont(P->z.re, one);
 }
 void copy_ted_point(ted_point_t* P, ted_point_t const* Q)
 {
    // P <- Q, one extended coordinate (X, Y, Z, T) at a time
    ted_point_t const* src = Q;
    ted_point_t* dst = P;
    fp2_copy(&dst->x, &src->x);
    fp2_copy(&dst->y, &src->y);
    fp2_copy(&dst->z, &src->z);
    fp2_copy(&dst->t, &src->t);
 }
 // Doubling in extended twisted Edwards coordinates: Q <- [2]P.
 //   E->A is used as the Edwards coefficient `a` (see D = a*A below);
 //   E->C is not read here.
 // Q may alias P: all reads of P happen before the first write to Q.
 void ted_dbl(ted_point_t *Q, ted_point_t const *P, ec_curve_t const* E) 
 {
    // A = X1^2
    // B = Y1^2
    // C = 2*Z1^2
    // D = a*A
    // K = (X1+Y1)^2-A-B
    // G = D+B
    // F = G-C
    // H = D-B
    // X3 = K*F
    // Y3 = G*H
    // T3 = K*H
    // Z3 = F*G
    // TODO: neutral element
    fp2_t A, B, C, D, K, G, F, H;
    fp2_sqr(&A, &P->x);
    fp2_sqr(&B, &P->y);
    fp2_sqr(&C, &P->z);
    fp2_add(&C, &C, &C);        // C = 2*Z1^2
    fp2_mul(&D, &A, &E->A);     // D = a*A
    fp2_add(&K, &P->x, &P->y);
    fp2_sqr(&K, &K);
    fp2_sub(&K, &K, &A);
    fp2_sub(&K, &K, &B);        // K = 2*X1*Y1
    fp2_add(&G, &D, &B);
    fp2_sub(&F, &G, &C);
    fp2_sub(&H, &D, &B);
    fp2_mul(&Q->x, &K, &F);
    fp2_mul(&Q->y, &G, &H);
    fp2_mul(&Q->t, &K, &H);
    fp2_mul(&Q->z, &F, &G);
 }
 // Addition in extended twisted Edwards coordinates: S <- P + Q.
 //   E->A is used as the Edwards coefficient `a`.
 // Special cases handled before the generic formula:
 //   - P == Q  (per is_ted_equal): falls back to ted_dbl
 //   - Q == -P (per is_ted_equal): S is set through ted_init
 // The generic result is built in a local point, so S may alias P or Q.
 void ted_add(ted_point_t* S, ted_point_t const* P, ted_point_t const* Q, ec_curve_t const* E)
 {
    // A = X1*X2
    // B = Y1*Y2
    // C = Z1*T2
    // D = T1*Z2
    // K = D+C
    // F = (X1-Y1)*(X2+Y2)+B-A
    // G = B+a*A
    // H = D-C
    // X3 = K*F
    // Y3 = G*H
    // T3 = K*H
    // Z3 = F*G
    // TODO: neutral element
    ted_point_t res;
    if (is_ted_equal(P, Q)) {
      ted_dbl(S, P, E);
      return;
    }
    //assert(!is_ted_equal(P, Q));
    // res temporarily holds -P for the "opposite points" test
    ted_neg(&res, P);
    if (is_ted_equal(&res, Q)) {
       ted_init(S);
       return;
    }
    // assert(!ted_equal(&res,Q));
    fp2_t A, B, C, D, K, F, G, H, tmp;
    fp2_mul(&A, &P->x, &Q->x);
    fp2_mul(&B, &P->y, &Q->y);
    fp2_mul(&C, &P->z, &Q->t);
    fp2_mul(&D, &P->t, &Q->z);
    fp2_add(&K, &D, &C);
    fp2_add(&F, &Q->x, &Q->y);
    fp2_sub(&tmp, &P->x, &P->y);
    fp2_mul(&F, &F, &tmp);
    fp2_add(&F, &F, &B);
    fp2_sub(&F, &F, &A);
    fp2_mul(&G, &A, &E->A);
    fp2_add(&G, &G, &B);
    fp2_sub(&H, &D, &C);
    fp2_mul(&res.x, &K, &F);
    fp2_mul(&res.y, &G, &H);
    fp2_mul(&res.t, &K, &H);
    fp2_mul(&res.z, &F, &G);
    // A result with X = Y = Z = 0 is not a valid point: fall back to doubling P
    if (fp2_is_zero(&res.x) && fp2_is_zero(&res.y) && fp2_is_zero(&res.z)) {
        ted_dbl(S, P, E);
    } else {
        copy_ted_point(S, &res);
    }
 }
 void ted_neg(ted_point_t* Q, ted_point_t const* P)
 {
    // Q <- -P: (X : Y : Z : T) -> (-X : Y : Z : -T)
    fp2_copy(&Q->y, &P->y);
    fp2_copy(&Q->z, &P->z);
    fp2_neg(&Q->x, &P->x);
    fp2_neg(&Q->t, &P->t);
 }
 // Lifts the x-only point P to a y-coordinate on `curve`.
 // Computes x^3 + (A/C) x^2 + x for x = X/Z. If that value is a square its
 // square root is written to *y and false is returned; otherwise it is first
 // multiplied by fp2_non_residue() and the square root of the product is
 // written instead, with true returned.
 // When P->z is zero the function returns false without writing *y.
 static bool xLIFT(fp2_t* y, const ec_point_t* P, const ec_curve_t* curve)
 { // Returns false if it is on the curve, true if it is on the twist
    fp2_t z2, tmp1, tmp2, y2;
    if (fp2_is_zero(&P->z)) return false;
    // (X^2 + Z^2) C
    fp2_sqr(&tmp1, &P->x);
    fp2_sqr(&z2, &P->z);
    fp2_add(&tmp1, &tmp1, &z2);
    fp2_mul(&tmp1, &tmp1, &curve->C);
    // X^2C + AXZ + Z^2C
    fp2_mul(&tmp2, &P->x, &P->z);
    fp2_mul(&tmp2, &tmp2, &curve->A);
    fp2_add(&tmp1, &tmp1, &tmp2);
    // X^3C + AX^2Z + XZ^2C = Z^3(Cx^3 + Ax^2 + Cx) = Z^3 C (B*y^2) = Z C (B*Y^2) // x = X/Z
    fp2_mul(&tmp1, &tmp1, &P->x);
    // (ZC)^(-1)
    fp2_mul(&tmp2, &curve->C, &P->z);
    assert(!fp2_is_zero(&tmp2));    // Z != 0 was checked above, so this guards C != 0
    fp2_inv(&tmp2);    
    fp2_mul(&y2, &tmp1, &tmp2);    // (B*Y^2)
    fp2_copy(y, &y2);
    if (fp2_is_square(&y2)) {  // on the curve
        fp2_sqrt(y);
        return false;
    } else { // on the twist
        // Multiply by the non-residue before taking the root
        fp2_t tmp = fp2_non_residue();
        fp2_mul(y, y, &tmp);
        fp2_sqrt(y);
        return true;
    }
 }
 //void mont_to_ted(ec_point_t* E, ec_point_t const* A, bool twist)
 // Derives the twisted Edwards coefficients from the Montgomery curve
 // `curve` given as (A : C):
 //   ted_curve->A = A/C + 2   (used as `a` by ted_dbl / ted_add)
 //   ted_curve->C = A/C - 2
 // The outputs are written only after curve->A and curve->C have been read.
 void mont_to_ted(ec_curve_t* ted_curve, ec_curve_t const* curve)
 {
    fp2_t tmp, two;
    // A : y^2 = x^3 + (a/c)x^2 + x
    fp2_copy(&tmp, &curve->C);         
    fp2_inv(&tmp);                    // 1/c
    fp2_mul(&tmp, &tmp, &curve->A);   // a/c
    // The constant 2 is set as a plain integer, then moved to Montgomery form
    fp2_set(&two, 2);
    fp2_tomont(&two, &two);
    fp2_add(&ted_curve->A, &tmp, &two);       // a/c + 2
    fp2_sub(&ted_curve->C, &tmp, &two);       // a/c - 2
    // Disabled twist handling, kept for reference:
    //if (twist) {
        // B = Fp2_inv(fp2_non_residue)
    //    tmp = fp2_non_residue();
    //    fp2_mul2(&E->x,&tmp);
    //    fp2_mul2(&E->z,&tmp);
    //}
 }
 // Converts the x-only Montgomery point P on `curve` into extended twisted
 // Edwards coordinates (X : Y : Z : T), choosing a y-coordinate via xLIFT.
 // The point at infinity (P->z == 0) maps to the identity (0 : 1 : 1 : 0).
 // The "on the twist" flag returned by xLIFT is ignored.
 // NOTE(review): in the general branch X = Px*(Px+Pz) is quadratic in
 // (Px, Pz) while Y and Z are linear, so this appears to assume a normalized
 // input (Pz == 1) - confirm against callers.
 void mont_to_ted_point(ted_point_t* Q, ec_point_t const* P, ec_curve_t const* curve)
 {
    if (fp2_is_zero(&P->z)) {
        fp2_set(&Q->x, 0);
        fp2_set(&Q->y, 1);
        fp2_set(&Q->z, 1);
        fp2_set(&Q->t, 0);
        // fp2_set stores plain integers: move the two ones to Montgomery form
        fp_tomont(Q->y.re, Q->y.re);
        fp_tomont(Q->z.re, Q->z.re);
    } else {
        fp2_t tmp, y;
        xLIFT(&y, P, curve);
        fp2_add(&tmp, &P->x, &P->z);        // Px + Pz
        fp2_mul(&Q->x, &P->x, &tmp);        // X = Px*(Px + Pz)
        fp2_sub(&Q->y, &P->x, &P->z);
        fp2_mul(&Q->y, &Q->y, &y);          // Y = (Px - Pz)*y
        fp2_mul(&Q->z, &tmp, &y);           // Z = (Px + Pz)*y
        // T = X*Y/Z
        fp2_copy(&Q->t, &Q->z);
        fp2_inv(&Q->t);
        fp2_mul(&Q->t, &Q->t, &Q->x);
        fp2_mul(&Q->t, &Q->t, &Q->y);
    }
 }
 void ted_to_mont_point(ec_point_t* Q, ted_point_t const* P)
 {
    // Back to x-only Montgomery coordinates: (X : Z) <- (Z + Y : Z - Y),
    // using only the Y and Z coordinates of the Edwards point
    fp2_t const* ed_y = &P->y;
    fp2_t const* ed_z = &P->z;
    fp2_add(&Q->x, ed_z, ed_y);
    fp2_sub(&Q->z, ed_z, ed_y);
 }
 bool is_ted_equal(ted_point_t const* P1, ted_point_t const* P2)
 {
    // Projective equality test by cross-multiplication:
    //   X1*Z2 == X2*Z1,  Y1*Z2 == Y2*Z1  and  T1*Y2 == T2*Y1
    // All six products are formed first, then compared pair by pair.
    fp2_t xa, xb;   // X1*Z2, X2*Z1
    fp2_t ya, yb;   // Y1*Z2, Y2*Z1
    fp2_t ta, tb;   // T1*Y2, T2*Y1
    fp2_mul(&xa, &P1->x, &P2->z);
    fp2_mul(&ya, &P1->y, &P2->z);
    fp2_mul(&yb, &P2->y, &P1->z);
    fp2_mul(&xb, &P2->x, &P1->z);
    fp2_mul(&ta, &P1->t, &P2->y);
    fp2_mul(&tb, &P2->t, &P1->y);
    if (!fp2_is_equal(&xa, &xb))
        return false;
    if (!fp2_is_equal(&ya, &yb))
        return false;
    if (!fp2_is_equal(&ta, &tb))
        return false;
    return true;
 }
--- a/src/ec/ref/ecx/test/ec-test.c
+++ b/src/ec/ref/ecx/test/ec-test.c
@@ -0,0 +1,18 @@
 #include "ec-tests.h"
 int main(int argc, char* argv[])
 {
     // Usage: <prog> test <reps>   or   <prog> bench <reps>
     // The exit status is 0 only when the bitwise AND of both suite results is nonzero.
     if (argc >= 3) {
         if (!strcmp(argv[1], "test")) {
             TEST_LOOPS = atoi(argv[2]);
             // Bitwise & rather than &&: both suites are always executed
             return !(ec_test() & dlog_test());
         }
         if (!strcmp(argv[1], "bench")) {
             BENCH_LOOPS = atoi(argv[2]);
             return !(ec_run() & dlog_run());
         }
     }
     // Too few arguments or an unknown mode: report the usage instead of exiting silently
     printf("Please enter an argument: 'test' or 'bench' and <reps>\n");
     return 1;
 }
--- a/src/ec/ref/ecx/test/fp2-test.c
+++ b/src/ec/ref/ecx/test/fp2-test.c
@@ -0,0 +1,142 @@
 #include <assert.h>
 #include <time.h>
 #include <stdio.h>
 #include <fp2.h>
 #include <inttypes.h>
 static int BENCH_LOOPS = 1000;       // Number of iterations per bench
 static int TEST_LOOPS  = 512;       // Number of iterations per test
 bool fp2_isequal(fp2_t a, fp2_t b){
     // Component-wise equality; the imaginary parts are compared only if the real parts match
     if (!fp_is_equal(a.re, b.re)) {
         return 0;
     }
     return fp_is_equal(a.im, b.im) != 0;
 }
 bool fp2_isone(fp2_t a){
     // True when a.re matches the value written by fp_mont_setone limb by limb and a.im is all zero
     fp_t mont_one;
     fp_mont_setone(mont_one);
     for (int k = 0; k < NWORDS_FIELD; k++) {
         if (a.re[k] != mont_one[k] || a.im[k] != 0) {
             return 0;
         }
     }
     return 1;
 }
 void fp2_print(char *name, fp2_t const a){
     // Prints "<name> = 0x<re> + i*0x<im>", most significant limb first.
     // The value is first multiplied by fp2_set(.., 1); presumably this takes it out of the
     // Montgomery domain -- TODO confirm against the fp2 implementation.
     fp2_t plain;
     int limb;
     fp2_set(&plain, 1);
     fp2_mul(&plain, &plain, &a);
     printf("%s = 0x", name);
     limb = NWORDS_FIELD;
     while (limb-- > 0) {
         printf("%016" PRIx64, plain.re[limb]);
     }
     printf(" + i*0x");
     limb = NWORDS_FIELD;
     while (limb-- > 0) {
         printf("%016" PRIx64, plain.im[limb]);
     }
     printf("\n");
 }
 // VERY NOT SECURE (testing only)
 void fp2_random(fp2_t *a){
     // Test-only generator built on rand(); not suitable for anything security related.
     fp2_t mont_one;
     fp_mont_setone(mont_one.re);
     fp_set(mont_one.im, 0);
     // Raw limbs: for each word the real limb is drawn before the imaginary one
     for (int w = 0; w < NWORDS_FIELD; w++) {
         a->re[w] = rand();
         a->im[w] = rand();
     }
     // Normalize by multiplying with one
     fp2_mul(a, a, &mont_one);
     // Re-seed rand() from the lowest real limb of the result
     srand((unsigned) a->re[0]);
 }
 int main(int argc, char* argv[])
 {
 	// Randomized self-test of the fp2_t arithmetic.
 	// An optional first argument overrides the number of iterations (TEST_LOOPS).
 	if (argc > 1) {
 		TEST_LOOPS = atoi(argv[1]);
 	}
 	int i;
 	fp2_t a, b, c, d;
 	for (i = 0; i < TEST_LOOPS; i++)
 	{
 		printf("[%3d%%] Testing fp2_t arithmetic", 100 * i / (int)TEST_LOOPS);
 		fflush(stdout);
 		printf("\r\x1b[K");
 		// Random elements of fp2
 		fp2_random(&a);
 		fp2_random(&b);
 		// c and d are copies whose lowest real limb is perturbed by one,
 		// so each must compare different from the element it was copied from
 		fp2_copy(&c, &a);
 		c.re[0] += 1;
 		fp2_copy(&d, &b);
 		d.re[0] -= 1;
 		assert(fp2_isequal(a,b) == 0);		// different values check --> (a != b)
 		assert(fp2_isequal(a,c) == 0);		// perturbed copy check --> (a != c)
 		assert(fp2_isequal(b,d) == 0);		// perturbed copy check --> (b != d)
 		assert(fp2_isequal(c,c) == 1);		// equal values check --> 1 (c == c)
 		// Testing neg: -a must equal 0 - a
 		fp2_set(&b, 0);
 		fp2_copy(&c, &a);
 		fp2_neg(&a, &a);
 		fp2_sub(&c, &b, &c);
 		assert(fp2_isequal(a,c) == 1);
 		fp_mont_setone(a.re);fp_set(a.im,0);	// Now a == 1
 		fp2_set(&b, 0);	// Now b == 0
 		assert(fp2_is_zero(&a) == 0);
 		assert(fp2_is_zero(&b) == 1);
 		// Testing c - c
 		fp2_sub(&d, &c, &c);
 		assert(fp2_is_zero(&d) == 1);
 		// Testing c * 0
 		fp2_mul(&d, &c, &b);
 		assert(fp2_is_zero(&d) == 1);
 		// Testing c * 1 ... recall, in Montgomery domain R mod p plays the role of the 1
 		fp_mont_setone(a.re);fp_set(a.im,0);
 		fp2_mul(&d, &c, &a);
 		assert(fp2_isequal(d, c) == 1);
 		// TODO: exponentiation (fp2_pow) is not covered by this test
 		// Testing 1/a by computing (1/a) x a
 		fp2_random(&a);
 		fp2_copy(&b, &a);
 		fp2_inv(&a);
 		fp2_mul(&c, &a, &b);
 		assert(fp2_isone(c) == 1);
 		// Testing that a^2 is reported as a square
 		fp2_random(&a);
 		fp2_sqr(&b, &a);
 		assert( fp2_is_square(&b) );
 	}
 	if(TEST_LOOPS){
 		printf("[%2d%%] Tested fp2_t arithmetic:\tNo errors!\n", 100 * i /TEST_LOOPS);
 	}
 	printf("-- All tests passed.\n");
 	return 0;
 }
--- a/src/ec/ref/ecx/test/isog-test.c
+++ b/src/ec/ref/ecx/test/isog-test.c
--- a/src/ec/ref/ecx/test/mont-test.c
+++ b/src/ec/ref/ecx/test/mont-test.c
@@ -0,0 +1,386 @@
 #include <time.h>
 #include <assert.h>
 #include <stdio.h>
 #include "ec.h"
 #include "isog.h"
 #include "test-basis.h"
 #include <bench.h> 
 static int BENCH_LOOPS = 1000;       // Number of iterations per bench
 static int TEST_LOOPS  = 128;       // Number of iterations per test
 // void random_scalar(fp_t k, const uint8_t j)
 // {
 //         // To implement a better random function (We must use some of the SHAKE family functions)
 //         do
 //         {
 //                 randombytes((void *)k, keyspace_bytes[j]);
 //         } while (fp_issmaller((uint64_t *)k, keyspace_size[j]));
 // }
 // VERY NOT SECURE (testing only)
 void fp2_random(fp2_t *a){
     // rand()-based filler for tests; must never be used where real randomness is needed.
     fp2_t unit;
     int word = 0;
     // One real limb, then one imaginary limb, per field word
     while (word < NWORDS_FIELD) {
         a->re[word] = rand();
         a->im[word] = rand();
         word++;
     }
     // Normalize through a multiplication by one
     fp_mont_setone(unit.re);
     fp_set(unit.im, 0);
     fp2_mul(a, a, &unit);
     // The lowest real limb of the result becomes the next rand() seed
     srand((unsigned) a->re[0]);
 }
 // Affine Montgomery coefficient computation (A + 2C : 4C) --> A/C
 void coeff(fp2_t *B, ec_point_t const A)
 {
 	// Recovers the affine coefficient A/C from the pair (A.x : A.z) = (A + 2C : 4C):
 	//   2 * (2*A.x - A.z) = 4A, and 4A / A.z = A/C.
 	fp2_t numerator;
 	// *B = 1 / A.z
 	fp2_copy(B, &A.z);
 	fp2_inv(B);
 	// numerator = 2 * (2*A.x - A.z)
 	fp2_add(&numerator, &A.x, &A.x);
 	fp2_sub(&numerator, &numerator, &A.z);
 	fp2_add(&numerator, &numerator, &numerator);
 	fp2_mul(B, &numerator, B);
 }
 // Determines if point is fp2-rational (if not, then it must be a zero trace point)
 uint8_t isrational(ec_point_t const T, fp2_t const a)
 {
 	// With x = T.x / T.z, returns fp2_is_square(x^3 + a*x^2 + x), i.e. whether the
 	// right-hand side of the curve equation is a square at x.
 	fp2_t x, x_sq, x_cu, rhs;
 	// x = T.x * (1 / T.z)
 	fp2_copy(&x, &T.z);
 	fp2_inv(&x);
 	fp2_mul(&x, &x, &T.x);
 	fp2_sqr(&x_sq, &x);
 	fp2_mul(&x_cu, &x_sq, &x);
 	fp2_mul(&x_sq, &x_sq, &a);	// a * x^2
 	fp2_add(&rhs, &x_sq, &x_cu);
 	fp2_add(&rhs, &rhs, &x);
 	return fp2_is_square(&rhs);
 }
 // ladder3pt computes x(P + [m]Q)
 void ladder3pt(ec_point_t* R, fp_t const m, ec_point_t const* P, ec_point_t const* Q, ec_point_t const* PQ, ec_point_t const* A)
 {
 	// Three-point ladder for x(P + [m]Q). All NWORDS_FIELD * 64 bits of m are scanned,
 	// least significant bit first; the result is copied into *R at the end.
 	ec_point_t acc_q, acc_p, acc_diff;
 	copy_point(&acc_q, Q);
 	copy_point(&acc_p, P);
 	copy_point(&acc_diff, PQ);
 	for (int word = 0; word < NWORDS_FIELD; word++)
 	{
 		uint64_t bit = 1;
 		for (int k = 0; k < 64; k++, bit <<= 1)
 		{
 			// -1 when the current bit of m is clear, 0 when it is set
 			const int swap_mask = -((bit & m[word]) == 0);
 			swap_points(&acc_p, &acc_diff, swap_mask);
 			xDBLADD(&acc_q, &acc_p, &acc_q, &acc_p, &acc_diff, A);
 			swap_points(&acc_p, &acc_diff, swap_mask);
 		}
 	}
 	copy_point(R, &acc_p);
 }
 // For computing [(p + 1) / l_i]P, i:=0, ..., (N - 1)
 void cofactor_multiples(ec_point_t P[], ec_point_t const* A, size_t lower, size_t upper)
 {
 	// Divide-and-conquer over the index range [lower, upper): P[lower] is the input and,
 	// on return, each P[i] has been multiplied by TORSION_ODD_PRIMES[j] for every j != i
 	// of the range (used to compute the points [(p + 1) / l_i]P).
 	assert(lower < upper);
 	if (upper - lower != 1)
 	{
 		const size_t mid = lower + (upper - lower + 1) / 2;
 		int idx;
 		// The right half starts as a copy of P[lower] times the primes of the left half
 		copy_point(&(P[mid]), &(P[lower]));
 		for (idx = lower; idx < (int)mid; idx++)
 		{
 			xMULv2(&(P[mid]), &(P[mid]), &(TORSION_ODD_PRIMES[idx]), p_plus_minus_bitlength[idx], A);
 		}
 		// The left half keeps P[lower] times the primes of the right half
 		for (idx = (int)mid; idx < (int)upper; idx++)
 		{
 			xMULv2(&(P[lower]), &(P[lower]), &(TORSION_ODD_PRIMES[idx]), p_plus_minus_bitlength[idx], A);
 		}
 		cofactor_multiples(P, A, lower, mid);
 		cofactor_multiples(P, A, mid, upper);
 	}
 }
 // The projective x-coordinate point (X : Z) at infinity is such that Z == 0
 static inline int isinfinity(ec_point_t const P)
 {
 	// A projective x-only point (X : Z) is the point at infinity exactly when Z == 0
 	fp2_t const* z_coord = &P.z;
 	return fp2_is_zero(z_coord);
 }
 // Loads the affine x-coordinate *x as the projective point (x : 1); x is passed through fp2_tomont
 static void load_affine_x(ec_point_t* P, fp2_t const* x)
 {
 	fp2_tomont(&(P->x), x);
 	fp_mont_setone(P->z.re);fp_set(P->z.im,0);
 }
 // Asserts that isrational(*P, *a) is nonzero when `rational` is nonzero, and zero otherwise
 static void check_rationality(ec_point_t const* P, fp2_t const* a, int rational)
 {
 	if (rational)
 		assert( isrational(*P, *a) );
 	else
 		assert( !isrational(*P, *a) );
 }
 // For every j in [first, last): multiplies *P by TORSION_ODD_PRIMES[j] for i = 1 .. TORSION_ODD_POWERS[j] - 1,
 // checking the rationality of *P after each multiplication
 static void strip_prime_powers(ec_point_t* P, int first, int last, ec_point_t const* A, fp2_t const* a, int rational)
 {
 	int i, j;
 	for (j = first; j < last; j++)
 	{
 		for (i = 1; i < TORSION_ODD_POWERS[j]; i++)
 		{
 			xMULv2(P, P, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], A);
 			check_rationality(P, a, rational);
 		}
 	}
 }
 // For every j in [first, last): P[j] must not be the point at infinity and must have the expected
 // rationality, and it must become the point at infinity after one multiplication by TORSION_ODD_PRIMES[j].
 // The entries P[first .. last-1] are overwritten.
 static void check_prime_order(ec_point_t P[], int first, int last, ec_point_t const* A, fp2_t const* a, int rational)
 {
 	int j;
 	for (j = first; j < last; j++)
 	{
 		assert( !isinfinity(P[j]) );	// It must be different from the point at infinity
 		check_rationality(&P[j], a, rational);
 		xMULv2(&P[j], &P[j], &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], A);
 		assert( isinfinity(P[j]) );		// It must be now the point at infinity
 	}
 }
 // Test driver for the x-only Montgomery arithmetic, followed by two small benchmarks.
 // An optional first argument overrides TEST_LOOPS (number of random iterations and benchmark size).
 int main(int argc, char* argv[])
 {
 	if (argc > 1) {
 		TEST_LOOPS = atoi(argv[1]);
 	}
 	int j;
 	// Curve coefficient in the form (A' + 2C : 4C) with A' = 0 and C = 1
 	ec_point_t A;
 	fp2_set(&A.x, 0);
 	fp_mont_setone(A.z.re);fp_set(A.z.im,0);
 	fp2_add(&A.z, &A.z, &A.z);	// 2C
 	fp2_add(&A.x, &A.x, &A.z);	// A' + 2C
 	fp2_add(&A.z, &A.z, &A.z);	// 4C
 	// Just to ensure both projective curve coefficients are different from zero
 	assert( !fp2_is_zero(&A.x) && !fp2_is_zero(&A.z) );
 	fp2_t a;
 	coeff(&a, A);
 	ec_point_t PA, QA, PQA, PB, QB, PQB;
 	// Writing the public projective x-coordinate points into Montgomery domain
 	load_affine_x(&PA, &xPA);
 	load_affine_x(&QA, &xQA);
 	load_affine_x(&PQA, &xPQA);
 	check_rationality(&PA, &a, 1);
 	check_rationality(&QA, &a, 1);
 	check_rationality(&PQA, &a, 1);
 	// ======================================================================================================
 	// Recall, PA, QA, and PQA are expected to be N-order points, but we require to ensure they are of order N
 	strip_prime_powers(&PA, 0, P_LEN, &A, &a, 1);
 	strip_prime_powers(&QA, 0, P_LEN, &A, &a, 1);
 	strip_prime_powers(&PQA, 0, P_LEN, &A, &a, 1);
 	assert( !isinfinity(PA) );
 	assert( !isinfinity(QA) );
 	assert( !isinfinity(PQA) );
 	ec_point_t P[P_LEN + M_LEN], Q[P_LEN + M_LEN], PQ[P_LEN + M_LEN];
 	copy_point(&(P[0]), &PA);
 	cofactor_multiples(P, &A, 0, P_LEN);
 	copy_point(&(Q[0]), &QA);
 	cofactor_multiples(Q, &A, 0, P_LEN);
 	copy_point(&(PQ[0]), &PQA);
 	cofactor_multiples(PQ, &A, 0, P_LEN);
 	check_prime_order(P, 0, P_LEN, &A, &a, 1);
 	check_prime_order(Q, 0, P_LEN, &A, &a, 1);
 	check_prime_order(PQ, 0, P_LEN, &A, &a, 1);
 	// Writing the public projective x-coordinate points into Montgomery domain
 	load_affine_x(&PB, &xPB);
 	load_affine_x(&QB, &xQB);
 	load_affine_x(&PQB, &xPQB);
 	check_rationality(&PB, &a, 0);
 	check_rationality(&QB, &a, 0);
 	check_rationality(&PQB, &a, 0);
 	// ======================================================================================================
 	// Recall, PB, QB, and PQB are expected to be M-order points, but we require to ensure they are of order M
 	strip_prime_powers(&PB, P_LEN, P_LEN + M_LEN, &A, &a, 0);
 	strip_prime_powers(&QB, P_LEN, P_LEN + M_LEN, &A, &a, 0);
 	strip_prime_powers(&PQB, P_LEN, P_LEN + M_LEN, &A, &a, 0);
 	assert( !isinfinity(PB) );
 	assert( !isinfinity(QB) );
 	assert( !isinfinity(PQB) );
 	copy_point(&(P[P_LEN]), &PB);
 	cofactor_multiples(P, &A, P_LEN, P_LEN + M_LEN);
 	copy_point(&(Q[P_LEN]), &QB);
 	cofactor_multiples(Q, &A, P_LEN, P_LEN + M_LEN);
 	copy_point(&(PQ[P_LEN]), &PQB);
 	cofactor_multiples(PQ, &A, P_LEN, P_LEN + M_LEN);
 	check_prime_order(P, P_LEN, P_LEN + M_LEN, &A, &a, 0);
 	check_prime_order(Q, P_LEN, P_LEN + M_LEN, &A, &a, 0);
 	check_prime_order(PQ, P_LEN, P_LEN + M_LEN, &A, &a, 0);
 	// Reload the original (unmultiplied) public points for the random ladder tests
 	load_affine_x(&PA, &xPA);
 	load_affine_x(&QA, &xQA);
 	load_affine_x(&PQA, &xPQA);
 	check_rationality(&PA, &a, 1);
 	check_rationality(&QA, &a, 1);
 	check_rationality(&PQA, &a, 1);
 	load_affine_x(&PB, &xPB);
 	load_affine_x(&QB, &xQB);
 	load_affine_x(&PQB, &xPQB);
 	check_rationality(&PB, &a, 0);
 	check_rationality(&QB, &a, 0);
 	check_rationality(&PQB, &a, 0);
 	fp2_t m;
 	ec_point_t R[P_LEN + M_LEN];
 	for (j = 0; j < TEST_LOOPS; j++)
 	{
 		printf("[%3d%%] Testing EC differential arithmetic", 100 * j / TEST_LOOPS);
 		fflush(stdout);
 		printf("\r\x1b[K");
 		// Random combination PA + [m]QA: same checks as for the public points
 		fp2_random(&m);
 		ladder3pt(&(R[0]), m.re, &PA, &QA, &PQA, &A);
 		check_rationality(&R[0], &a, 1);
 		strip_prime_powers(&R[0], 0, P_LEN, &A, &a, 1);
 		cofactor_multiples(R, &A, 0, P_LEN);
 		check_prime_order(R, 0, P_LEN, &A, &a, 1);
 		// Random combination PB + [m]QB
 		fp2_random(&m);
 		ladder3pt(&(R[P_LEN]), m.re, &PB, &QB, &PQB, &A);
 		check_rationality(&R[P_LEN], &a, 0);
 		strip_prime_powers(&R[P_LEN], P_LEN, P_LEN + M_LEN, &A, &a, 0);
 		cofactor_multiples(R, &A, P_LEN, P_LEN + M_LEN);
 		check_prime_order(R, P_LEN, P_LEN + M_LEN, &A, &a, 0);
 	}
 	if(TEST_LOOPS)
 		printf("[%3d%%] Tested EC differential arithmetic:\tNo errors!\n", 100 * j / TEST_LOOPS);
 	printf("-- All tests passed.\n");
 	// The benchmarks size their arrays by TEST_LOOPS and divide by it, so they need TEST_LOOPS > 0
 	if (TEST_LOOPS <= 0)
 		return 0;
 	// BENCHMARK xDBLv2
 	unsigned long long cycles, cycles1, cycles2;
 	ec_point_t PP[TEST_LOOPS], EE[TEST_LOOPS];
 	for(int i = 0; i < TEST_LOOPS; i++){
 		fp2_random(&PP[i].x);
 		fp2_random(&PP[i].z);
 		fp2_random(&EE[i].x);
 		fp2_random(&EE[i].z);
 	}
 	cycles1 = cpucycles();
 	for(int i = 0; i < TEST_LOOPS; i++){
 		xDBLv2(&PP[i], &PP[i], &EE[i]);
 	}
 	cycles2 = cpucycles();
 	cycles = cycles2-cycles1;
 	printf("xDBLv2 bench: %7llu cycles\n", cycles/TEST_LOOPS);
 	// BENCHMARK xIsog4 (inlined point evaluation, reusing PP and overwriting EE)
 	ec_point_t KK0[TEST_LOOPS], KK1[TEST_LOOPS], KK2[TEST_LOOPS];
 	for(int i = 0; i < TEST_LOOPS; i++){
 		fp2_random(&KK0[i].x);
 		fp2_random(&KK0[i].z);
 		fp2_random(&KK1[i].x);
 		fp2_random(&KK1[i].z);
 		fp2_random(&KK2[i].x);
 		fp2_random(&KK2[i].z);
 	}
 	cycles1 = cpucycles();
 	for(int i = 0; i < TEST_LOOPS; i++){
 		fp2_t t0, t1;
 		fp2_add(&t0, &PP[i].x, &PP[i].z);
 		fp2_sub(&t1, &PP[i].x, &PP[i].z);
 		fp2_mul(&(EE[i].x), &t0, &KK1[i].x);
 		fp2_mul(&(EE[i].z), &t1, &KK2[i].x);
 		fp2_mul(&t0, &t0, &t1);
 		fp2_mul(&t0, &t0, &KK0[i].x);
 		fp2_add(&t1, &(EE[i].x), &(EE[i].z));
 		fp2_sub(&(EE[i].z), &(EE[i].x), &(EE[i].z));
 		fp2_sqr(&t1, &t1);
 		fp2_sqr(&(EE[i].z), &(EE[i].z));
 		fp2_add(&(EE[i].x), &t0, &t1);
 		fp2_sub(&t0, &(EE[i].z), &t0);
 		fp2_mul(&(EE[i].x), &(EE[i].x), &t1);
 		fp2_mul(&(EE[i].z), &(EE[i].z), &t0);
 	}
 	cycles2 = cpucycles();
 	cycles = cycles2-cycles1;
 	printf("xeval_4 bench: %7llu cycles\n", cycles/TEST_LOOPS);
 	return 0;
 }
--- a/src/ec/ref/ecx/test/poly-mul-test.c
+++ b/src/ec/ref/ecx/test/poly-mul-test.c
@@ -0,0 +1,445 @@
 #include <poly.h>
 #include <assert.h>
 #include <stdio.h>
 bool fp2_isequal(fp2_t a, fp2_t b){
     // Equal when both the real and the imaginary components agree
     // (the imaginary comparison is skipped once the real parts differ)
     if (fp_is_equal(a.re, b.re)) {
         return fp_is_equal(a.im, b.im) != 0;
     }
     return 0;
 }
 // VERY NOT SECURE (testing only)
 void fp2_random(fp2_t *a){
     // Pseudo-random element for tests only (plain rand(), no security whatsoever).
     fp2_t mont_one;
     fp_mont_setone(mont_one.re);
     fp_set(mont_one.im, 0);
     // Fill every limb; within a word the real limb is drawn first
     for (int w = 0; w < NWORDS_FIELD; w++) {
         a->re[w] = rand();
         a->im[w] = rand();
     }
     // Normalize with a multiplication by one
     fp2_mul(a, a, &mont_one);
     // Chain the generator: the next seed is the lowest real limb
     srand((unsigned) a->re[0]);
 }
 void slow_mul(poly h, poly f, int lenf, poly g, int leng){
   // Schoolbook reference product h = f*g with lenf + leng - 1 output coefficients.
   // Nothing is written when that length is not positive. The result is accumulated
   // in a scratch array first and copied into h at the end.
   const int lenh = lenf + leng - 1;
   if(lenh <= 0){
     return;
   }
   // Arrange for the first operand to be the longer one
   if(leng > lenf){
     slow_mul(h, g, leng, f, lenf);
     return;
   }
   fp2_t scratch[lenh];
   for(int deg = 0; deg < lenh; deg++){
     fp2_t acc, term;
     // Walk every index pair (i, j) with i + j == deg, starting from the largest valid i
     int i = (deg > lenf - 1) ? lenf - 1 : deg;
     int j = deg - i;
     fp2_set(&acc, 0);
     for(; j < leng && i >= 0; i--, j++){
       fp2_mul(&term, &f[i], &g[j]);
       fp2_add(&acc, &acc, &term);
     }
     fp2_copy(&scratch[deg], &acc);
   }
   for(int deg = 0; deg < lenh; deg++){
     fp2_copy(&h[deg], &scratch[deg]);
   }
 }
 int main(){
  fp2_t fp2_0, fp2_1;
  #define nmax 16
  int nf, ng, n, e;
        fp2_set(&fp2_0, 0);
        fp_mont_setone(fp2_1.re);fp_set(fp2_1.im,0); 
  //TEST MULTIPLICATION BY 0
  for(nf = 2; nf < nmax; nf++){
    fp2_t f[nf], h[nf-1];
    printf("[%3d%%] Testing multiplication by 0", 100 * nf / nmax);
    fflush(stdout);
    printf("\r\x1b[K");
    for(e = 0; e < nf; e++){
      fp2_random(&f[e]);
    }
    poly_mul(h, f, nf, f, 0);
    for(e = 0; e < nf-1; e++){
      assert(fp2_is_zero(&h[e])==1);
    }
    poly_mul(h, f, 0, f, nf);
    for(e = 0; e < nf-1; e++){
      assert(fp2_is_zero(&h[e])==1);
    }
  }
  printf("[%3d%%] Tested multiplication by 0:\t\tNo errors!\n", 100 * nf / nmax);
  //TEST FOR f, g, h DISJOINT MEMORY SPACES
  for(nf = 1; nf < nmax; nf++){
    printf("[%3d%%] Testing multiplication", 100 * nf / nmax);
    fflush(stdout);
    printf("\r\x1b[K");
    for(ng = 1; ng < nmax; ng++){
      fp2_t f[nf];   //Random length nf poly
      for(e = 0; e < nf; e++){
 	fp2_random(&f[e]);
      }
      fp2_t g[ng];  // Random length ng poly
      for(e = 0; e < ng; e++){
 	fp2_random(&g[e]);
      }
      fp2_t h[nf+ng-1];// Compute product
      poly_mul(h, f, nf, g, ng);
      fp2_t fg[nf+ng-1]; // Compute the product by school method
      slow_mul(fg, f, nf, g, ng);
      for(e = 0; e < nf + ng - 1; e++){   // Verify answer term by term
 	assert(fp2_isequal(h[e], fg[e])==1);
      }
    }
  }
  printf("[%3d%%] Tested multiplication:\t\t\tNo errors!\n", 100 * nf / nmax);
  // TEST FOR f, g CONTIGIOUS AND RESULT SAVED OVER THEM
  for(nf = 1; nf < nmax; nf++){
    printf("[%3d%%] Testing multiplication in place", 100 * nf / nmax);
    fflush(stdout);
    printf("\r\x1b[K");
    for(ng = 1; ng < nmax; ng++){
      fp2_t h[nf+ng];
      //Random length nf poly
      for(e = 0; e < nf; e++){
 	fp2_random(&h[e]);
      }
      // Random length ng poly
      for(e = 0; e < ng; e++){
 	fp2_random(&h[e+nf]);
      }
      // Compute the product
      fp2_t fg[nf+ng-1];
      slow_mul(fg, h, nf, &(h[nf]), ng); // School method
      poly_mul(h, h, nf, &(h[nf]), ng); // Karatsuba method
      for(e = 0; e < nf + ng - 1; e++){   // Verify answer term by term
 	assert(fp2_isequal(h[e], fg[e])==1);
      }
    }
  }
    printf("[%3d%%] Tested multiplication in place:\t\tNo errors!\n", 100 * nf / nmax);
  //TEST FOR MULTIPLICATION MOD X^N BY 0
  for(nf = 2; nf < nmax; nf++){
    fp2_t f[nf];
    printf("[%3d%%] Testing mul mod x^n by 0", 100 * nf / nmax);
    fflush(stdout);
    printf("\r\x1b[K");
    for(e = 0; e < nf; e++){
      fp2_random(&f[e]);
    }
    for(n = 1; n < nmax; n++){
      fp2_t h[n];
      poly_mul_low(h, n, f, nf, f, 0);
      for(e = 0; e < n; e++){
 	assert(fp2_is_zero(&h[e])==1);
      }
      poly_mul_low(h, n, f, 0, f, nf);
      for(e = 0; e < n; e++){
 	assert(fp2_is_zero(&h[e])==1);
      }
    }
  }
  printf("[%3d%%] Tested mul mod x^n by 0:\t\t\tNo errors!\n", 100 * nf / nmax);
  //TEST FOR MULTIPLICATION MOD X^N
    for(nf = 1; nf < nmax; nf++){
      printf("[%3d%%] Testing mul mod x^n", 100 * nf / nmax);
      fflush(stdout);
      printf("\r\x1b[K");
      for(ng = 1; ng < nmax; ng++){
 	fp2_t f[nf], g[ng], fg[nf+ng-1];
 	poly h;
 	//Get random polynomials
 	for(e = 0; e < nf; e++){
 	  fp2_random(&f[e]);
 	}
 	for(e = 0; e < ng; e++){
 	  fp2_random(&g[e]);
 	}
 	//Save regular result to fg
 	slow_mul(fg, f, nf, g, ng);
 	//Compute result mod x^n
 	for(n = 1; n < 2*nmax; n++){
 	  h = malloc(sizeof(fp2_t)*n);
 	  poly_mul_low(h, n, f, nf, g, ng);
 	  //Compare with expected
 	  e = 0;
 	  while(e < nf+ng-1 && e < n){
 	    assert(fp2_isequal(h[e], fg[e]) == 1);
 	    e++;
 	  }
 	  while(e < n){
 	    assert(fp2_is_zero(&h[e]) == 1);
 	    e++;
 	  }
 	  free(h);
 	}
      }
    }
    printf("[%3d%%] Tested mul mod x^n:\t\t\tNo errors!\n", 100 * nf / nmax);
  //TEST FOR POLY_MUL_MIDDLE
    for(nf = 1; nf < 2*nmax; nf+=1){
      fp2_t f[nf];
      printf("[%3d%%] Testing poly_mul_middle", 100 * nf / (2*nmax));
      fflush(stdout);
      printf("\r\x1b[K");
      for(ng = (nf+1)>>1; ng < (nf+1)-((nf+1)>>1); ng++){
 	// This runs from floor((nf+1)/2) to ceil((nf+1)/2)
 	fp2_t g[ng];
 	for(e = 0; e < nf; e++){
 	  fp2_random(&f[e]);
 	}
 	for(e = 0; e < ng; e++){
 	  fp2_random(&g[e]);
 	}
 	fp2_t h[nf+ng-1];
 	slow_mul(h, g, ng, f, nf);
 	poly_mul_middle(g, g, ng, f, nf);
 	for(e = 0; e < ng; e++){
 	  assert(fp2_isequal(h[e+nf-ng], g[e])==1);
 	}
      }
    }
    printf("[%3d%%] Tested poly_mul_middle:\t\t\tNo errors!\n", 100 * nf / (2*nmax));
  // TEST FOR SELF RECIPROCAL MULTIPLICATION
    for(nf = 1; nf < nmax; nf++){
      printf("[%3d%%] Testing self reciprocal mul", 100 * nf / nmax);
      fflush(stdout);
      printf("\r\x1b[K");
      for(ng = 1; ng < nmax; ng++){
 	fp2_t f[nf], g[ng], h[nf+ng-1], fg[nf+ng-1];
 	// Get random palyndromes
 	for(e = 0; e < (nf>>1); e++){
 	  fp2_random(&f[e]);
 	  fp2_copy(&f[nf-1-e], &f[e]);
 	}
 	if(nf & 1){
 	  fp2_random(&f[nf>>1]);
 	}
 	for(e = 0; e < (ng>>1); e++){
 	  fp2_random(&g[e]);
 	  fp2_copy(&g[ng-1-e], &g[e]);
 	}
 	if(ng & 1){
 	  fp2_random(&g[ng>>1]);
 	} 
 	// Compute products
 	poly_mul_selfreciprocal(h, g, ng, f, nf);
 	slow_mul(fg, g, ng, f, nf);
 	// Compare
 	for(e = 0; e < nf+ng-1; e++){
 	  assert(fp2_isequal(fg[e], h[e])==1);
 	}
      }
    }		 
    printf("[%3d%%] Tested self reciprocal mul:\t\tNo errors!\n", 100 * nf / nmax);
  // TEST FOR PRODUCT TREES
    int tree_size, iteration, i;
    int  len, *DEG, LENF;
    poly *H, *F, h;
    for(tree_size = 1; tree_size < nmax; tree_size++){
      printf("[%3d%%] Testing product tree:\t\t\tSize %d out of %d", 100 * tree_size / nmax, tree_size, nmax-1);
      fflush(stdout);
      printf("\r\x1b[K");
      i = 0;
      while((1<<i) < tree_size){
 	i++;
      }
      DEG = malloc(sizeof(int)*((1<<(i+2))-1));
      H = malloc(sizeof(poly)*((1<<(i+2))-1));
      F = malloc(sizeof(poly)*tree_size);
      h = malloc(sizeof(fp2_t)*(nmax+1)*tree_size);
      for(iteration = 0; iteration < nmax + 1 - tree_size ; iteration++){
 	// Generate random list of polynomials
 	LENF = (rand() % nmax)+1;
 	for(i = 0; i < tree_size; i++){
 	  F[i] = malloc(sizeof(fp2_t)*LENF);
 	  for(e = 0; e < LENF; e++){
 	    fp2_random(&F[i][e]);
 	  }
 	}
 	product_tree(H, DEG, 0, F, LENF, tree_size);
 	// Build product of all polynomials manually
 	len = LENF;
 	//for(e = 0; e < LENF[0]; e++){
 	for(e = 0; e < LENF; e++){
 	  fp2_copy(&h[e], &F[0][e]);
 	}
 	for(i = 1; i < tree_size; i++){
 	  poly_mul(h, h, len, F[i], LENF);
 	  len += LENF-1;
 	}
 	// Compare to root
 	assert (len == DEG[0]+1);
 	for(e = 0; e < len; e++){
 	  assert(fp2_isequal(H[0][e], h[e])==1);
 	}
      clear_tree(H, 0, tree_size);
      for(i = 0; i < tree_size; i++){
 	free(F[i]);
      }
      }
      free(DEG);
      free(H);
      free(F); 
      free(h);
    }
    printf("[%3d%%] Tested product tree:\t\t\tNo errors!\n", 100 * tree_size / nmax);
  // TEST FOR SELF RECIPROCAL PRODUCT TREES
    for(tree_size = 1; tree_size < nmax; tree_size++){
      printf("[%3d%%] Testing selfreciprocal product tree:\tSize %d out of %d", 100 * tree_size / nmax, tree_size, nmax-1);
      fflush(stdout);
      printf("\r\x1b[K");
      i = 0;
      while((1<<i) < tree_size){
 	i++;
      }
      DEG = malloc(sizeof(int)*((1<<(i+2))-1));
      H = malloc(sizeof(poly)*((1<<(i+2))-1));
      F = malloc(sizeof(poly)*tree_size);
      h = malloc(sizeof(fp2_t)*(nmax+1)*tree_size);
      for(iteration = 0; iteration < nmax + 1 - tree_size ; iteration++){
 	// Generate random list of polynomials
 	LENF = (rand() % nmax)+1;;
 	for(i = 0; i < tree_size; i++){
 	  F[i] = malloc(sizeof(fp2_t)*LENF);
 	  for(e = 0; e < (LENF>>1); e++){
 	    fp2_random(&F[i][e]);
 	    fp2_copy(&F[i][LENF-1-e], &F[i][e]);
 	  }
 	  if(LENF & 1){
 	  	fp2_random(&F[i][(LENF>>1)]);
 	  }
 	}
 	product_tree_selfreciprocal(H, DEG, 0, F, LENF, tree_size);
 	// Build product of all polynomials manually
 	len = LENF;
 	for(e = 0; e < LENF; e++){
 	  fp2_copy(&h[e], &F[0][e]);
 	}
 	for(i = 1; i < tree_size; i++){
 	  poly_mul(h, h, len, F[i], LENF);
 	  len += LENF-1;
 	}
 	// Compare to root
 	assert (len == DEG[0]+1);
 	for(e = 0; e < len; e++){
 	  assert(fp2_isequal(H[0][e], h[e])==1);
 	}
      clear_tree(H, 0, tree_size);
      for(i = 0; i < tree_size; i++){
 	free(F[i]);
      }
      }
      free(DEG);
      free(H);
      free(F); 
      free(h);
    }
    printf("[%3d%%] Tested selfreciprocal product tree:\tNo errors!\n", 100 * tree_size / nmax);
    printf("-- All tests passed.\n");
    return 0;
 }
--- a/src/ec/ref/ecx/test/poly-redc-test.c
+++ b/src/ec/ref/ecx/test/poly-redc-test.c
@@ -0,0 +1,461 @@
 #include "poly.h"
 #include <assert.h>
 #include <stdio.h>
 #define nmax 32
 bool fp2_isequal(fp2_t a, fp2_t b){
    // Real parts first; the imaginary parts are only compared when the real parts agree
    const bool same_re = fp_is_equal(a.re, b.re);
    if (!same_re) {
        return 0;
    }
    return fp_is_equal(a.im, b.im) != 0;
 }
 // VERY NOT SECURE (testing only)
 void fp2_random(fp2_t *a){
    // Insecure rand()-driven sampler, intended for the tests in this file only.
    fp2_t one_elem;
    int limb;
    // Draw the raw limbs: real first, imaginary second, word by word
    for (limb = 0; limb != NWORDS_FIELD; ++limb) {
        a->re[limb] = rand();
        a->im[limb] = rand();
    }
    // Normalize by multiplying with one
    fp_mont_setone(one_elem.re);
    fp_set(one_elem.im, 0);
    fp2_mul(a, a, &one_elem);
    // Use the lowest real limb of the result as the new seed
    srand((unsigned) a->re[0]);
 }
 int main(){
  fp2_t fp2_0, fp2_1;
  fp2_set(&fp2_0, 0);
  fp_mont_setone(fp2_1.re);fp_set(fp2_1.im,0);
  int lenf, leng, n, e, iteration, array_size, tree_size, i, root, brother, *DEG, LENF;
  poly f, g, h, f_rev, f_rev_inv, *F, *H, *R, g1, g2, REM1, REM2, G1, G2, G1_rev, G2_rev, R0;
  fp2_t c, *A, *C, ratio, A0;
  f_rev_inv = 0;
 // TEST FOR RECIPROCAL
  for(lenf = 1; lenf < nmax; lenf++)
  {  
    printf("[%3d%%] Testing reciprocals", 100 * lenf / nmax);
    fflush(stdout);
    printf("\r\x1b[K");
    // Get random poly
    f = malloc(sizeof(fp2_t)*lenf);
    for(e = 0; e < lenf; e++)
      fp2_random(&f[e]);
    for(n = 1; n < nmax; n++)
    {
      // Get the reciprocal and multiply them
      h = malloc(sizeof(fp2_t)*n);
      memset(h, 0, sizeof(fp2_t)*n);
      reciprocal(h, &c, f, lenf, n);
      poly_mul_low(h, n, f, lenf, h, n);
      // Compare with expected
      assert(fp2_isequal(h[0],c));
      for(e = 1;  e < n; e++)
 	assert(fp2_is_zero(&h[e]));
      free(h);
    }
    free(f); 
  }
  printf("[%3d%%] Tested reciprocals:\t\tNo errors!\n", 100 * lenf / nmax);
  // TEST FOR REDUCTION
  for(lenf = 2; lenf < nmax; lenf++)
  {
    printf("[%3d%%] Testing polynomial reduction", 100 * lenf / nmax);
    fflush(stdout);
    printf("\r\x1b[K");
    // Get random poly for the mod
    f = malloc(sizeof(fp2_t)*lenf);
    f_rev = malloc(sizeof(fp2_t)*lenf);
    for(e = 0; e < lenf; e++)
    {
      fp2_random(&f[e]);
      fp2_copy(&f_rev[lenf-1-e], &f[e]);
    }
    for(leng = 1; leng < nmax; leng++)
    {
      // Get random poly to reduce
      g = malloc(sizeof(fp2_t)*leng);
      for(e = 0; e < leng; e++){
 	fp2_random(&g[e]);
      }
      // Get reverse-inverse mod x^(leng-lenf+1)
      if(leng >= lenf)
      {
 	f_rev_inv = malloc(sizeof(fp2_t)*(leng-lenf+1));
 	reciprocal(f_rev_inv, &c, f_rev, lenf, leng-lenf+1);
      }
      else{
 	fp_mont_setone(c.re);fp_set(c.im,0);
      }
      // Compute the reduction
      h = malloc(sizeof(fp2_t)*(lenf-1));
      poly_redc(h, g, leng, f, lenf, f_rev_inv, c);
      // Reduce manually
      int leng_red = leng;
      fp2_t scale, f_e;
      while(leng_red >= lenf)
      {
 	fp2_copy(&scale, &f[lenf-1]);
 	fp2_inv(&scale);
 	fp2_mul(&scale, &scale, &g[leng_red-1]);
 	for(e = 0; e < lenf; e++)
 	  {
 	    fp2_mul(&f_e, &f[e], &scale);
 	    fp2_sub(&g[e+leng_red-lenf], &g[e+leng_red-lenf], &f_e);
 	  }
 	leng_red--;
      }
      // Rescale manual result
      if( leng < lenf){
 	      fp_mont_setone(scale.re);fp_set(scale.im,0);
      }
      else
 	if(lenf == 2 && leng == 3)
 	{
 	  fp2_sqr(&scale, &f[1]);
 	  fp2_add(&scale, &scale, &scale);
 	}
 	else
 	  fp2_copy(&scale, &c);
      for(e = 0; e < leng_red; e++)
 	fp2_mul(&g[e], &g[e], &scale);
      // Comapre results
      for(e = leng_red-1; e >= 0; e--)
 	      assert(fp2_isequal(h[e], g[e]));
      for(e = leng_red; e < lenf-1; e++)
 	      assert(fp2_is_zero(&h[e]));
      free(g);
      free(h);
      if(leng >= lenf)
 	free(f_rev_inv);
    }
    free(f);
    free(f_rev);
  }
  printf("[%3d%%] Tested polynomial reduction:\tNo errors!\n", 100 * lenf / nmax);
 // TEST FOR RECIPROCAL TREES
  for(tree_size = 3; tree_size < nmax; tree_size++)
  {
    printf("[%3d%%] Testing reciprocal tree:\t\tTree size %d out of %d", 100 * tree_size / nmax, tree_size, nmax);
    fflush(stdout);
    printf("\r\x1b[K");
    // Compute size of arrays
    i = 0;
    while((1<<i) < tree_size){
      i++;
    }
    array_size = (1<<(i+2))-1;
    DEG = malloc(sizeof(int)*array_size);
    H = malloc(sizeof(poly)*array_size);
    R = malloc(sizeof(poly)*array_size);
    F = malloc(sizeof(poly)*tree_size);
    A = malloc(sizeof(fp2_t)*array_size);
    // Get random polys
    LENF = 2;
    for(i = 0; i < tree_size; i++)
    {
      F[i] = malloc(sizeof(fp2_t)*LENF);
      for(e = 0; e < LENF; e++){
 	      fp2_random(&F[i][e]);
      }
    }
    // Get product tree then reciprocal tree
    product_tree(H, DEG, 0, F, LENF, tree_size);
    leng = DEG[0]+1+(rand() % nmax);
    reciprocal_tree(R, A, leng, H, DEG, 0, tree_size);
    // Check the root
    root = 0;
    lenf = leng-DEG[root];
    f = malloc(sizeof(fp2_t)*lenf);
    for(e = 0; e < DEG[root]+1 && e < lenf; e++){
      fp2_copy(&f[e], &H[root][DEG[root]-e]);
    }
    for(e = DEG[root]+1; e < lenf; e++){
      fp2_set(&f[e], 0);
    }
    poly_mul_low(f, lenf, f, lenf, R[root], lenf);
    assert(fp2_isequal(f[0], A[root]));
    for(e = 1; e < lenf; e++){
      assert(fp2_is_zero(&f[e]));
    }
    free(f);
    // Perform random walks
    for(iteration = 0; iteration < nmax - tree_size; iteration++)
    {
      root = 0;
      n = tree_size;
      while(n > 1)
      {
 	if(rand() & 1)
 	{
 	  root = 2*root+1;
 	  n = n - (n>>1);
 	}
 	else
 	{
 	  root = 2*root+2;
 	  n = n>>1;
 	}
 	brother = root - 1 + 2*(root & 1);
 	// Check current node
 	if(DEG[root] > 2)
 	{
 	  lenf = DEG[brother];
 	  f = malloc(sizeof(fp2_t)*lenf);
 	  for(e = 0; e < DEG[root]+1 && e < lenf; e++){
 	    fp2_copy(&f[e], &H[root][DEG[root]-e]);
    }
 	  for(e = DEG[root]+1; e < lenf; e++){
 	    fp2_set(&f[e], 0);
    }
 	  poly_mul_low(f, lenf, f, lenf, R[root], lenf);
 	  assert(fp2_isequal(f[0], A[root]));
 	  for(e = 1; e < lenf; e++){
 	    assert(fp2_is_zero(&f[e]));
    }
 	  free(f);
 	}
      }
    }
    // Clean up
    for(i = 0; i < tree_size; i++)
      free(F[i]);
    clear_tree(H, 0, tree_size);
    clear_tree(R, 0, tree_size);
    free(F);
    free(H);
    free(R);
    free(A);
    free(DEG);
  }
  printf("[%3d%%] Tested reciprocal tree:\t\tNo errors!\n", 100 * tree_size / nmax);
  // TEST FOR REMAINDERS
  for(tree_size = 2; tree_size < nmax; tree_size++)
  {
    printf("[%3d%%] Testing batched remainders:\t\tTree size %d out of %d", 100 * tree_size / nmax, tree_size, nmax);
    fflush(stdout);
    printf("\r\x1b[K");
    // Compute size of arrays
    i = 0;
    while((1<<i) < tree_size)
      i++;
    array_size = (1<<(i+2))-1;
    DEG = malloc(sizeof(int)*array_size);
    H = malloc(sizeof(poly)*array_size);
    R = malloc(sizeof(poly)*array_size);
    F = malloc(sizeof(poly)*tree_size);
    A = malloc(sizeof(fp2_t)*array_size);
    REM1 = malloc(sizeof(fp2_t)*array_size);
    REM2 = malloc(sizeof(fp2_t)*array_size);
    C = malloc(sizeof(fp2_t)*tree_size);
    // Get random polys
    LENF = 2;
    for(i = 0; i < tree_size; i++)
    {
      F[i] = malloc(sizeof(fp2_t)*LENF);
      for(e = 0; e < LENF; e++)
 	fp2_random(&F[i][e]);
    }
    // Get product tree, reciprocal tree, and remainders
    product_tree(H, DEG, 0, F, LENF, tree_size);
    leng = DEG[0]+1+(rand() % nmax);
    g1 = malloc(sizeof(fp2_t)*leng);
    g2 = malloc(sizeof(fp2_t)*leng);
    for(e = 0; e < leng; e++)
    {
      fp2_random(&g1[e]);
      fp2_random(&g2[e]);
    }
    reciprocal_tree(R, A, leng, H, DEG, 0, tree_size);
    multieval_unscaled(REM1, g1, leng, R, (const fp2_t*)A, H, DEG, 0, tree_size);
    multieval_unscaled(REM2, g2, leng, R, (const fp2_t*)A, H, DEG, 0, tree_size);
    for(i = 0; i < tree_size; i++)
    {
      // Get ratio of the remainder
      fp2_inv(&REM1[i]);
      fp2_mul(&ratio, &REM1[i], &REM2[i]);
      // Compute remainders manually
      f_rev = malloc(sizeof(fp2_t)*LENF);
      f_rev_inv = malloc(sizeof(fp2_t)*(leng-LENF+1));
      h = malloc(sizeof(fp2_t)*(LENF-1));
      for(e = 0; e < LENF; e++)
 	fp2_copy(&f_rev[e], &F[i][LENF-1-e]);
      reciprocal(f_rev_inv, &c, f_rev, LENF, leng-LENF+1);
      poly_redc(h, g1, leng, F[i], LENF, f_rev_inv, c);
      fp2_copy(&REM1[i], &h[0]);
      poly_redc(h, g2, leng, F[i], LENF, f_rev_inv, c);
      fp2_copy(&REM2[i], &h[0]);
      free(f_rev);
      free(f_rev_inv);
      free(h);
      // Compare results
      fp2_inv(&REM1[i]);
      fp2_mul(&REM1[i], &REM1[i], &REM2[i]);
      assert(fp2_isequal(REM1[i], ratio));
    }
    // Clean up
    for(i = 0; i < tree_size; i++)
      free(F[i]);
    free(g1);
    free(g2);
    clear_tree(H, 0, tree_size);
    clear_tree(R, 0, tree_size);
    free(F);
    free(H);
    free(R);
    free(A);
    free(DEG);
    free(REM1);
    free(REM2);
    free(C);
  } 
  printf("[%3d%%] Tested batched remainders:\tNo errors!\n", 100 * tree_size / nmax);
 // TEST FOR SCALED REMAINDER TREE
  for(tree_size = 1; tree_size < nmax; tree_size++)
  {
    printf("[%3d%%] Testing scaled remainder tree:\tTree size %d out of %d", 100 * tree_size / nmax, tree_size, nmax);
    fflush(stdout);
    printf("\r\x1b[K");
    // Compute size of arrays
    i = 0;
    while((1<<i) < tree_size)
      i++;
    array_size = (1<<(i+2))-1;
    DEG = malloc(sizeof(int)*array_size);
    H = malloc(sizeof(poly)*array_size);
    F = malloc(sizeof(poly)*tree_size);
    REM1 = malloc(sizeof(fp2_t)*array_size);
    REM2 = malloc(sizeof(fp2_t)*array_size);
    // Get random polys
    LENF = 2;
    for(i = 0; i < tree_size; i++)
    {
      F[i] = malloc(sizeof(fp2_t)*LENF);
      for(e = 0; e < LENF; e++)
 	fp2_random(&F[i][e]);
    }
    // Get random polys to reduce
    product_tree(H, DEG, 0, F, LENF, tree_size);
    leng = DEG[0]+1+(rand() % nmax);
    g1 = malloc(sizeof(fp2_t)*leng);
    g2 = malloc(sizeof(fp2_t)*leng);
    for(e = 0; e < leng; e++)
    {
      fp2_random(&g1[e]);
      fp2_random(&g2[e]);
    }
    // Get the required initial nodes
    G1 = malloc(sizeof(fp2_t)*DEG[0]);
    G2 = malloc(sizeof(fp2_t)*DEG[0]);
    G1_rev = malloc(sizeof(fp2_t)*DEG[0]);
    G2_rev = malloc(sizeof(fp2_t)*DEG[0]);
    R0 = malloc(sizeof(fp2_t)*(leng));
    f_rev = malloc(sizeof(fp2_t)*(DEG[0]+1));
    for(e = 0; e < DEG[0]+1; e++)
      fp2_copy(&f_rev[e], &H[0][DEG[0]-e]);
    if( DEG[0] > leng-DEG[0])
      reciprocal(R0, &A0, f_rev, DEG[0]+1, DEG[0]);
    else
      reciprocal(R0, &A0, f_rev, DEG[0]+1, leng-DEG[0]);
    poly_redc(G1, g1, leng, H[0], DEG[0]+1, R0, A0);
    poly_redc(G2, g2, leng, H[0], DEG[0]+1, R0, A0);
    for(e = 0; e < DEG[0]; e++)
    {
      fp2_copy(&G1_rev[e], &G1[DEG[0]-1-e]);
      fp2_copy(&G2_rev[e], &G2[DEG[0]-1-e]);
    }
    poly_mul_middle(G1_rev, G1_rev, DEG[0], R0, DEG[0]);
    poly_mul_middle(G2_rev, G2_rev, DEG[0], R0, DEG[0]);
    for(e = 0; e < DEG[0]; e++)
    {
      fp2_copy(&G1[e], &G1_rev[DEG[0]-1-e]);
      fp2_copy(&G2[e], &G2_rev[DEG[0]-1-e]);
    }
    free(G1_rev);free(G2_rev);free(R0);free(f_rev);
    // Compute the scaled remainder trees
    multieval_scaled(REM1, G1, H, DEG, 0, tree_size);
    multieval_scaled(REM2, G2, H, DEG, 0, tree_size);
    for(i = 0; i < tree_size; i++)
    {
      // Get ratio of the remainder
      fp2_inv(&REM1[i]);
      fp2_mul(&ratio, &REM1[i], &REM2[i]);
      // Compute remainders manually
      f_rev = malloc(sizeof(fp2_t)*LENF);
      f_rev_inv = malloc(sizeof(fp2_t)*(leng-LENF+1));
      h = malloc(sizeof(fp2_t)*(LENF-1));
      for(e = 0; e < LENF; e++)
 	fp2_copy(&f_rev[e], &F[i][LENF-1-e]);
      reciprocal(f_rev_inv, &c, f_rev, LENF, leng-LENF+1);
      poly_redc(h, g1, leng, F[i], LENF, f_rev_inv, c);
      fp2_copy(&REM1[i], &h[0]);
      poly_redc(h, g2, leng, F[i], LENF, f_rev_inv, c);
      fp2_copy(&REM2[i], &h[0]);
      free(f_rev);free(f_rev_inv);free(h);
      // Compare results
      fp2_inv(&REM1[i]);
      fp2_mul(&REM1[i], &REM1[i], &REM2[i]);
      assert(fp2_isequal(REM1[i], ratio));
    }
    // Clean up
    for(i = 0; i < tree_size; i++)
      free(F[i]);
    free(F);free(g1);free(g2);free(G1);free(G2);
    clear_tree(H, 0, tree_size);free(H);free(DEG);
    free(REM1);free(REM2);
  } 
  printf("[%3d%%] Tested scaled remainder tree:\tNo errors!\n", 100 * tree_size / nmax);
  printf("-- All tests passed.\n");
 }
--- a/src/ec/ref/ecx/test/test_extras.c
+++ b/src/ec/ref/ecx/test/test_extras.c
@@ -0,0 +1,75 @@
 #include "test_extras.h"
 #include <bench.h>
 // Global constants
 extern const digit_t p[NWORDS_FIELD];
 extern const digit_t R2[NWORDS_FIELD];
 #if 0
 // NOTE: this block is compiled out (#if 0); it is kept for reference only.
 int64_t cpucycles(void)
 { // Access system counter for benchmarking
    unsigned int hi, lo;
    // rdtsc output: "=a" binds the low half to lo, "=d" binds the high half to hi
    asm volatile ("rdtsc\n\t" : "=a" (lo), "=d"(hi));
    // Merge the two 32-bit halves into one 64-bit value
    return ((int64_t)lo) | (((int64_t)hi) << 32);
 }
 #endif
 int compare_words(digit_t* a, digit_t* b, unsigned int nwords)
 { // Compares two "nwords"-word values, scanning from the highest index down to index 0.
   // The first differing word decides the result: (1) a>b, (-1) a<b; (0) if all words match.
   // SECURITY NOTE: this function does not have constant-time execution. TO BE USED FOR TESTING ONLY.
    int idx = nwords-1;

    while (idx >= 0) {
        if (a[idx] != b[idx]) {
            return (a[idx] > b[idx]) ? 1 : -1;
        }
        idx--;
    }
    return 0;
 }
 void sub_test(digit_t* out, digit_t* a, digit_t* b, unsigned int nwords)
 { // Word-by-word subtraction out = a-b, intended for a>b; the borrow out of the last word is discarded.
   // Both inputs are read before out[k] is written, so out may alias a (or b).
   // SECURITY NOTE: this function does not have constant-time execution. It is for TESTING ONLY.
    digit_t borrow_in = 0;

    for (unsigned int k = 0; k < nwords; k++) {
        const digit_t x = a[k];
        const digit_t y = b[k];
        const digit_t diff = x - y;

        out[k] = diff - borrow_in;
        // A borrow leaves this word if x < y, or if removing the incoming borrow wrapped around
        borrow_in = (x < y) || (diff < borrow_in);
    }
 }
 void fprandom_test(digit_t* a)
 { // Fills "a" with a pseudo-random field element in [0, p-1], using rand().
   // SECURITY NOTE: distribution is not fully uniform. TO BE USED FOR TESTING ONLY.
    const unsigned int nwords = NWORDS_FIELD;
    const unsigned int diff = 256-254;                    // Number of top bits cleared in the last word
    unsigned char* bytes = (unsigned char*)a;

    // Fill every byte of the NWORDS_FIELD-word buffer with rand() output
    for (unsigned int k = 0; k < sizeof(digit_t)*nwords; k++) {
        bytes[k] = (unsigned char)rand();
    }

    // Clear the "diff" most significant bits of the top word
    a[nwords-1] &= (((digit_t)(-1) << diff) >> diff);

    // Subtract p for as long as p <= a, leaving a in [0, p-1]
    while (compare_words((digit_t*)p, a, nwords) < 1) {
        sub_test(a, a, (digit_t*)p, nwords);
    }
 }
 void fp2random_test(fp2_t* a)
 { // Fills both components of a GF(p^2) element with pseudo-random field elements (re first, then im).
   // SECURITY NOTE: distribution is not fully uniform. TO BE USED FOR TESTING ONLY.
    digit_t* real_part = a->re;
    digit_t* imag_part = a->im;

    fprandom_test(real_part);
    fprandom_test(imag_part);
 }
--- a/src/ec/ref/ecx/test/test_extras.h
+++ b/src/ec/ref/ecx/test/test_extras.h
@@ -0,0 +1,29 @@
 #ifndef TEST_EXTRAS_H
 #define TEST_EXTRAS_H
 #include <time.h>
 #include <stdlib.h>
 #include <fp.h>
 #include <fp2.h>
 #include <curve_extras.h>
 #define PASSED    0
 #define FAILED    1
 // Access system counter for benchmarking
 //int64_t cpucycles(void);
 // Comparing "nwords" elements, a=b? : (1) a>b, (0) a=b, (-1) a<b
 int compare_words(digit_t* a, digit_t* b, unsigned int nwords);
 // Multiprecision subtraction for testing, assumes a > b
 void sub_test(digit_t* out, digit_t* a, digit_t* b, unsigned int nwords);
 // Generating a pseudo-random field element in [0, p-1] 
 void fprandom_test(digit_t* a);
 // Generating a pseudo-random element in GF(p^2)
 void fp2random_test(fp2_t* a);
 #endif
--- a/src/ec/ref/ecx/test/velu-test.c
+++ b/src/ec/ref/ecx/test/velu-test.c
@@ -0,0 +1,298 @@
 #include<time.h>
 #include <stdio.h>
 #include <assert.h>
 #include <inttypes.h>
 #include "isog.h"
 #include "sdacs.h"
 #include "ec.h"
 #include "test-basis.h"
 // Fills every word of k with a value returned by rand(). The parameter j is not used.
 void random_scalar(fp_t k, const uint8_t j)
 {
    (void)j;
    int word = 0;
    while (word < NWORDS_FIELD) {
        k[word] = rand();
        word++;
    }
 }
 // Affine Montgomery coefficient computation (A + 2C : 4C) --> A/C
 // Input A is read as (A.x : A.z) = (A + 2C : 4C); the affine value A/C is written to *B.
 void coeff(fp2_t *B, ec_point_t const A)
 {
 	fp2_t numer;

 	// numer = 2*(A + 2C) - 4C = 2A, then doubled to 4A
 	fp2_add(&numer, &A.x, &A.x);
 	fp2_sub(&numer, &numer, &A.z);
 	fp2_add(&numer, &numer, &numer);

 	// *B = 1 / (4C)
 	fp2_copy(B, &A.z);
 	fp2_inv(B);

 	// *B = 4A / (4C) = A/C
 	fp2_mul(B, &numer, B);
 }
 // Determines if point is fp2-rational (if not, then it must be a zero trace point)
 // Evaluates x^3 + a*x^2 + x at the affine x = T.x / T.z and returns whether it is a square.
 uint8_t isrational(ec_point_t const T, fp2_t const a)
 {
 	fp2_t x_aff, x_sqr, x_cub, rhs;

 	// x_aff = T.x / T.z
 	fp2_copy(&x_aff, &T.z);
 	fp2_inv(&x_aff);
 	fp2_mul(&x_aff, &x_aff, &T.x);

 	fp2_sqr(&x_sqr, &x_aff);		// x^2
 	fp2_mul(&x_cub, &x_sqr, &x_aff);	// x^3
 	fp2_mul(&x_sqr, &x_sqr, &a);	// a * x^2

 	// rhs = a*x^2 + x^3 + x
 	fp2_add(&rhs, &x_sqr, &x_cub);
 	fp2_add(&rhs, &rhs, &x_aff);

 	return fp2_is_square(&rhs);
 }
 // ladder3pt computes x(P + [m]Q)
 // Walks all 64 bits of each of the NWORDS_FIELD words of m, starting with the lowest bit of m[0].
 // The accumulators start as X0 = Q, X1 = P, X2 = PQ; the result copied into R is X1.
 void ladder3pt(ec_point_t *R, fp_t const m, ec_point_t const *P, ec_point_t const *Q, ec_point_t const *PQ, ec_point_t const *A)
 {
 	ec_point_t X0, X1, X2;

 	copy_point(&X0, Q);
 	copy_point(&X1, P);
 	copy_point(&X2, PQ);

 	for (int word = 0; word < NWORDS_FIELD; word++)
 	{
 		uint64_t bit = 1;
 		for (int pos = 0; pos < 64; pos++, bit <<= 1)
 		{
 			// Mask is -1 when the current bit of m is 0, and 0 when the bit is 1
 			const int swap_mask = -((bit & m[word]) == 0);

 			swap_points(&X1, &X2, swap_mask);
 			xDBLADD(&X0, &X1, &X0, &X1, &X2, A);
 			swap_points(&X1, &X2, swap_mask);
 		}
 	}

 	copy_point(R, &X1);
 }
 // The projective x-coordinate point (X : Z) at infinity is such that Z == 0
 // Returns the result of fp2_is_zero on the Z-coordinate of P.
 static inline int isinfinity(ec_point_t const P)
 {
 	const fp2_t *z_coord = &P.z;
 	return fp2_is_zero(z_coord);
 }
 // Test driver for the isogeny routines kps / xisog / xeval.
 //
 // Two passes are run from the same starting curve (A' = 0, C = 1, stored as (A' + 2C : 4C)):
 //   1. indices 0 .. P_LEN-1, with a kernel point derived from (PA, QA, PQA); PB is pushed along;
 //   2. indices P_LEN .. P_LEN+M_LEN-1, with a kernel point derived from (PB, QB, PQB); PA is pushed along.
 // At each step the kernel point T is obtained by multiplying the current point by all remaining
 // primes, the isogeny is computed and evaluated, and asserts check the expected
 // rationality / point-at-infinity properties of the images.
 // Returns 0 when every assertion holds (failed assertions abort the program).
 int main()
 {
 	fp2_t fp2_0, fp2_1;
 	fp2_set(&fp2_0, 0);
 	fp_mont_setone(fp2_1.re);fp_set(fp2_1.im,0);

 	int i, j;

 	ec_point_t A, B, T;
 	fp2_set(&A.x, 0);
 	fp_mont_setone(A.z.re);fp_set(A.z.im,0);

 	// fp2_add(&A.x, &A.z, &A.x);	// 1
 	// fp2_add(&A.x, &A.x, &A.x);	// 2
 	// fp2_add(&A.x, &A.z, &A.x);	// 3
 	// fp2_add(&A.x, &A.x, &A.x);	// 6

 	fp2_add(&A.z, &A.z, &A.z);	// 2C
 	fp2_add(&A.x, &A.x, &A.z);	// A' + 2C
 	fp2_add(&A.z, &A.z, &A.z);	// 4C

 	// Just to ensure the projective curve coefficients are different from zero
 	// (both coordinates are checked: A.x = A' + 2C and A.z = 4C)
 	assert( !fp2_is_zero(&A.x) && !fp2_is_zero(&A.z) );

 	fp2_t a;
 	coeff(&a, A);

 	ec_point_t PA, QA, PQA, PB, QB, PQB, RA, RB;

 	// Writing the public projective x-coordinate points into Montgomery domain
 	fp2_tomont(&(PA.x), &(xPA));
 	fp_mont_setone(PA.z.re);fp_set(PA.z.im,0);
 	fp2_tomont(&(QA.x), &(xQA));
 	fp_mont_setone(QA.z.re);fp_set(QA.z.im,0);
 	fp2_tomont(&(PQA.x), &(xPQA));
 	fp_mont_setone(PQA.z.re);fp_set(PQA.z.im,0);

 	assert( isrational(PA, a) );
 	assert( isrational(QA, a) );
 	assert( isrational(PQA, a) );

 	fp2_tomont(&(PB.x), &(xPB));
 	fp_mont_setone(PB.z.re);fp_set(PB.z.im,0);
 	fp2_tomont(&(QB.x), &(xQB));
 	fp_mont_setone(QB.z.re);fp_set(QB.z.im,0);
 	fp2_tomont(&(PQB.x), &(xPQB));
 	fp_mont_setone(PQB.z.re);fp_set(PQB.z.im,0);

 	assert( !isrational(PB, a) );
 	assert( !isrational(QB, a) );
 	assert( !isrational(PQB, a) );

 	// ======================================================================================================
 	// Recall, PA, QA, and PQA are expected to be N-order points, but we require to ensure they are of order N
 	// Each prime with power e > 1 is multiplied in (e - 1) times, so a single factor of it remains in the order.
 	for (j = 0; j < P_LEN; j++)
 	{
 		for (i = 1; i < TORSION_ODD_POWERS[j]; i++)
 		{
 			xMULv2(&PA, &PA, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
 			xMULv2(&QA, &QA, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
 			xMULv2(&PQA, &PQA, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);

 			assert( isrational(PA, a) );
 			assert( isrational(QA, a) );
 			assert( isrational(PQA, a) );
 		}
 	}

 	assert( !isinfinity(PA) );
 	assert( !isinfinity(QA) );
 	assert( !isinfinity(PQA) );

 	// --------------------------------------------------------------
 	// Random kernel generator RA built from (PA, QA, PQA) with a random scalar m
 	fp_t m;
 	random_scalar(m, 0);
 	ladder3pt(&RA, m, &PA, &QA, &PQA, &A);

 	for (i = 0; i < P_LEN; i++)
 	{
 		printf("// Processing the %d-th prime:\t", i + 1);
 		printf("%2d%%", 100 * i / (int)P_LEN);
 		fflush(stdout);
 		printf("\r\x1b[K");

 		// T = RA multiplied by every remaining prime: the kernel point for the i-th isogeny
 		copy_point(&T, &RA);
 		for (j = (i+1); j < P_LEN; j++)
 			xMULv2(&T, &T, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);

 		assert( !isinfinity(T) );
 		kps(i, T, A);
 		if (TORSION_ODD_PRIMES[i] > gap)
 			printf("[\033[0;31m%7" PRId64 "\033[0m] (#I: %3d, #J: %3d, #K: %3d) \n", TORSION_ODD_PRIMES[i], sI, sJ, sK);
 		else
 			printf("[\033[0;31m%7" PRId64 "\033[0m] --------------------------- \n", TORSION_ODD_PRIMES[i]);

 		// B is the codomain curve; PB and RA are pushed through the isogeny
 		xisog(&B, i, A);
 		xeval(&PB, i, PB, A);
 		coeff(&a, B);
 		assert( !isinfinity(PB) );
 		assert( !isrational(PB, a) );

 		xeval(&RA, i, RA, A);
 		// The image of RA must be the point at infinity exactly at the last step
 		assert( (!isinfinity(RA) && (i < (P_LEN - 1))) || (isinfinity(RA) && (i == (P_LEN - 1))) );
 		assert( (isrational(RA, a) && (i < (P_LEN - 1))) || (isinfinity(RA) && (i == (P_LEN - 1))) );

 		copy_point(&A, &B);
 		// Verifying the order of the image point of PA has been reduced
 		copy_point(&T, &RA);
 		for (j = (i+1); j < P_LEN; j++)
 			xMULv2(&T, &T, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
 		assert( isinfinity(T) );
 		kps_clear(i);
 	}

 	// Second pass: reset the curve to the starting one and reload all public points
 	fp2_set(&A.x, 0);
 	fp_mont_setone(A.z.re);fp_set(A.z.im,0);

 	// fp2_add(&A.x, &A.z, &A.x);	// 1
 	// fp2_add(&A.x, &A.x, &A.x);	// 2
 	// fp2_add(&A.x, &A.z, &A.x);	// 3
 	// fp2_add(&A.x, &A.x, &A.x);	// 6

 	fp2_add(&A.z, &A.z, &A.z);	// 2C
 	fp2_add(&A.x, &A.x, &A.z);	// A' + 2C
 	fp2_add(&A.z, &A.z, &A.z);	// 4C

 	// Just to ensure the projective curve coefficients are different from zero
 	// (both coordinates are checked: A.x = A' + 2C and A.z = 4C)
 	assert( !fp2_is_zero(&A.x) && !fp2_is_zero(&A.z) );

 	coeff(&a, A);

 	// Writing the public projective x-coordinate points into Montgomery domain
 	fp2_tomont(&(PA.x), &(xPA));
 	fp_mont_setone(PA.z.re);fp_set(PA.z.im,0);
 	fp2_tomont(&(QA.x), &(xQA));
 	fp_mont_setone(QA.z.re);fp_set(QA.z.im,0);
 	fp2_tomont(&(PQA.x), &(xPQA));
 	fp_mont_setone(PQA.z.re);fp_set(PQA.z.im,0);

 	assert( isrational(PA, a) );
 	assert( isrational(QA, a) );
 	assert( isrational(PQA, a) );

 	fp2_tomont(&(PB.x), &(xPB));
 	fp_mont_setone(PB.z.re);fp_set(PB.z.im,0);
 	fp2_tomont(&(QB.x), &(xQB));
 	fp_mont_setone(QB.z.re);fp_set(QB.z.im,0);
 	fp2_tomont(&(PQB.x), &(xPQB));
 	fp_mont_setone(PQB.z.re);fp_set(PQB.z.im,0);

 	assert( !isrational(PB, a) );
 	assert( !isrational(QB, a) );
 	assert( !isrational(PQB, a) );

 	// ======================================================================================================
 	// Recall, PB, QB, and PQB are expected to be N-order points, but we require to ensure they are of order N
 	// Each prime with power e > 1 is multiplied in (e - 1) times, so a single factor of it remains in the order.
 	for (j = P_LEN; j < (P_LEN+M_LEN); j++)
 	{
 		for (i = 1; i < TORSION_ODD_POWERS[j]; i++)
 		{
 			xMULv2(&PB, &PB, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
 			xMULv2(&QB, &QB, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
 			xMULv2(&PQB, &PQB, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);

 			assert( !isrational(PB, a) );
 			assert( !isrational(QB, a) );
 			assert( !isrational(PQB, a) );
 		}
 	}

 	assert( !isinfinity(PB) );
 	assert( !isinfinity(QB) );
 	assert( !isinfinity(PQB) );

 	// Random kernel generator RB built from (PB, QB, PQB) with a fresh random scalar m
 	random_scalar(m, 1);
 	ladder3pt(&RB, m, &PB, &QB, &PQB, &A);

 	for (i = P_LEN; i < (P_LEN+M_LEN); i++)
 	{
 		printf("// Processing the %d-th prime:\t", i + 1);
 		printf("%2d%%", 100 * i / (int)(P_LEN+M_LEN));
 		fflush(stdout);
 		printf("\r\x1b[K");

 		// T = RB multiplied by every remaining prime: the kernel point for the i-th isogeny
 		copy_point(&T, &RB);
 		for (j = (i+1); j < (P_LEN+M_LEN); j++)
 			xMULv2(&T, &T, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);

 		assert( !isinfinity(T) );
 		kps(i, T, A);
 		if (TORSION_ODD_PRIMES[i] > gap)
 			printf("[\033[0;31m%7" PRId64 "\033[0m] (#I: %3d, #J: %3d, #K: %3d) \n", TORSION_ODD_PRIMES[i], sI, sJ, sK);
 		else
 			printf("[\033[0;31m%7" PRId64 "\033[0m] --------------------------- \n", TORSION_ODD_PRIMES[i]);

 		// B is the codomain curve; PA and RB are pushed through the isogeny
 		xisog(&B, i, A);
 		xeval(&PA, i, PA, A);
 		coeff(&a, B);
 		assert( !isinfinity(PA) );
 		assert( isrational(PA, a) );

 		xeval(&RB, i, RB, A);
 		// The image of RB must be the point at infinity exactly at the last step
 		assert( (!isinfinity(RB) && (i < (P_LEN + M_LEN - 1))) || (isinfinity(RB) && (i == (P_LEN + M_LEN - 1))) );
 		assert( (!isrational(RB, a) && (i < (P_LEN + M_LEN - 1))) || (isinfinity(RB) && (i == (P_LEN + M_LEN - 1))) );

 		copy_point(&A, &B);
 		// Verifying the order of the image point of PB has been reduced
 		copy_point(&T, &RB);
 		for (j = (i+1); j < (P_LEN+M_LEN); j++)
 			xMULv2(&T, &T, &(TORSION_ODD_PRIMES[j]), p_plus_minus_bitlength[j], &A);
 		assert( isinfinity(T) );
 		kps_clear(i);
 	}

 	printf("-- All tests passed!\n");
 	return 0;
 }
--- a/src/ec/ref/ecx/xeval.c
+++ b/src/ec/ref/ecx/xeval.c
@@ -0,0 +1,299 @@
 #include "isog.h"
 #include "ec.h"
 #include <assert.h>
 // -----------------------------------------------------------------------------------------
 // -----------------------------------------------------------------------------------------
 // Traditional isogeny evaluation (xEVAL)
 // CrissCross procedure as described in Hisil and Costello paper
 // Outputs *r0 = alpha*delta + beta*gamma and *r1 = alpha*delta - beta*gamma.
 void CrissCross(fp2_t *r0, fp2_t *r1, fp2_t const alpha, fp2_t const beta, fp2_t const gamma, fp2_t const delta)
 {
 	fp2_t ad, bg;

 	fp2_mul(&ad, &alpha, &delta);	// alpha * delta
 	fp2_mul(&bg, &beta, &gamma);	// beta * gamma

 	fp2_add(r0, &ad, &bg);
 	fp2_sub(r1, &ad, &bg);
 }
 // Degree-2 isogeny evaluation with kernel generated by P != (0, 0)
 // Maps each of the lenQ points in Q to the matching entry of R, using the constants
 // stored in K[0] (set by xisog_2 as K[0].x = X_P + Z_P, K[0].z = X_P - Z_P).
 void xeval_2(ec_point_t* R, ec_point_t* const Q, const int lenQ)
 {
 	fp2_t plus, minus, cross;
 	int idx = 0;

 	while (idx < lenQ) {
 		ec_point_t *src = &Q[idx];
 		ec_point_t *dst = &R[idx];

 		fp2_add(&plus, &src->x, &src->z);		// X + Z
 		fp2_sub(&minus, &src->x, &src->z);		// X - Z
 		fp2_mul(&cross, &K[0].x, &minus);		// K[0].x * (X - Z)
 		fp2_mul(&minus, &K[0].z, &plus);		// K[0].z * (X + Z)
 		fp2_add(&plus, &cross, &minus);		// sum of the two products
 		fp2_sub(&minus, &cross, &minus);		// difference of the two products
 		fp2_mul(&dst->x, &src->x, &plus);
 		fp2_mul(&dst->z, &src->z, &minus);

 		idx++;
 	}
 }
 // Degree-4 isogeny evaluation with kernel generated by P such that [2]P != (0, 0)
 // Maps each of the lenQ points in Q to the matching entry of R, using the constants
 // K[0].x, K[1].x and K[2].x (the ones xisog_4 marks as "Constants for xeval_4").
 void xeval_4(ec_point_t* R, const ec_point_t* Q, const int lenQ)
 {
 	fp2_t u, v;
 	int idx = 0;

 	while (idx < lenQ) {
 		const ec_point_t *src = &Q[idx];
 		ec_point_t *dst = &R[idx];

 		fp2_add(&u, &src->x, &src->z);		// X + Z
 		fp2_sub(&v, &src->x, &src->z);		// X - Z
 		fp2_mul(&dst->x, &u, &K[1].x);
 		fp2_mul(&dst->z, &v, &K[2].x);
 		fp2_mul(&u, &u, &v);			// (X + Z) * (X - Z)
 		fp2_mul(&u, &u, &K[0].x);
 		fp2_add(&v, &dst->x, &dst->z);
 		fp2_sub(&dst->z, &dst->x, &dst->z);
 		fp2_sqr(&v, &v);
 		fp2_sqr(&dst->z, &dst->z);
 		fp2_add(&dst->x, &u, &v);
 		fp2_sub(&u, &u, &dst->z);
 		fp2_mul(&dst->x, &dst->x, &v);
 		fp2_mul(&dst->z, &dst->z, &u);

 		idx++;
 	}
 }
 // Degree-4 isogeny evaluation with kernel generated by P such that [2]P = (0, 0)
 // Must call after xisog_4_singular
 // Maps each of the lenQ points in Q to the matching entry of R, using K[0].x, K[0].z and K[1].x.
 // The kernel point P picks the branch: P.x == P.z is the P = (+1,_) case, otherwise P = (-1,_).
 void xeval_4_singular(ec_point_t* R, const ec_point_t* Q, const int lenQ, const ec_point_t P)
 {
 	fp2_t outer, inner, sq_diff;

 	for (int idx = 0; idx < lenQ; idx++) {
 		const ec_point_t *src = &Q[idx];
 		ec_point_t *dst = &R[idx];

 		fp2_add(&outer, &src->x, &src->z);
 		fp2_sub(&sq_diff, &src->x, &src->z);
 		fp2_sqr(&outer, &outer);			// (X + Z)^2
 		fp2_sqr(&sq_diff, &sq_diff);		// (X - Z)^2
 		fp2_sub(&dst->z, &outer, &sq_diff);	// (X + Z)^2 - (X - Z)^2

 		if (!fp2_is_equal(&P.x, &P.z)) {
 			// Branch for P = (-1,_): the two squares swap roles
 			fp2_copy(&inner, &outer);
 			fp2_copy(&outer, &sq_diff);
 		}
 		else {
 			// Branch for P = (+1,_)
 			fp2_copy(&inner, &sq_diff);
 		}

 		fp2_mul(&dst->x, &dst->z, &K[0].x);
 		fp2_mul(&dst->z, &dst->z, &K[1].x);
 		fp2_mul(&dst->z, &dst->z, &inner);
 		fp2_mul(&inner, &inner, &K[0].z);
 		fp2_add(&dst->x, &dst->x, &inner);
 		fp2_mul(&dst->x, &dst->x, &outer);
 	}
 }
 // Isogeny evaluation on Montgomery curves
 // Recall: K has been computed in Twisted Edwards model and no extra additions are required.
 // With l = TORSION_ODD_PRIMES[i] = 2d + 1, the first d entries of K are combined with P
 // through CrissCross; the squared products scale P.x and P.z to give Q.
 void xeval_t(ec_point_t* Q, uint64_t const i, ec_point_t const P)
 {
 	const int half = ((int)TORSION_ODD_PRIMES[i] - 1) / 2;	// d, where l = 2d + 1
 	fp2_t sum, diff, acc_x, acc_z, fac_x, fac_z;

 	fp2_add(&sum, &P.x, &P.z);	// X + Z
 	fp2_sub(&diff, &P.x, &P.z);	// X - Z

 	// Start the running products with the factor from K[0]
 	CrissCross(&acc_x, &acc_z, K[0].z, K[0].x, sum, diff);

 	int k = 1;
 	while (k < half) {
 		CrissCross(&fac_x, &fac_z, K[k].z, K[k].x, sum, diff);
 		fp2_mul(&acc_x, &fac_x, &acc_x);
 		fp2_mul(&acc_z, &fac_z, &acc_z);
 		k++;
 	}

 	fp2_sqr(&acc_x, &acc_x);
 	fp2_sqr(&acc_z, &acc_z);

 	fp2_mul(&(Q->x), &P.x, &acc_x);
 	fp2_mul(&(Q->z), &P.z, &acc_z);
 }
 // -----------------------------------------------------------------------------------------
 // -----------------------------------------------------------------------------------------
 // Isogeny evaluation (xEVAL) used in velu SQRT
 //
 // Evaluates the isogeny of index i at the point P on the curve A = (A' + 2C : 4C) and
 // writes the image to Q.
 //
 // Relies on file-external state not defined in this function: the point sets J and K, the
 // values XZJ4, the trees ptree_hI / deg_ptree_hI / rtree_hI / rtree_A, the values R0 / A0,
 // and the flag "scaled" (presumably all prepared beforehand by kps -- TODO confirm).
 // It overwrites sI, sJ, sK, EJ_0, ptree_EJ, deg_ptree_EJ and leaves, and releases ptree_EJ
 // through clear_tree before returning.
 void xeval_s(ec_point_t* Q, uint64_t const i, ec_point_t const P, ec_point_t const A)
 {
 	// =================================================================================
 	assert(TORSION_ODD_PRIMES[i] > gap);     // Ensuring velusqrt is used for l_i > gap
 	sI = sizeI[i];          // size of I
 	sJ = sizeJ[i];          // size of J
 	sK = sizeK[i];          // size of K
 	assert(sI >= sJ);       // Ensuring #I >= #J
 	assert(sK >= 0);        // Recall, it must be that #K >= 0
 	assert(sJ > 1);         // ensuring sI >= sJ > 1
 	// =================================================================================
 	// We require the curve coefficient A = A'/C ... well, a multiple of these ones
 	fp2_t Ap;
 	fp2_add(&Ap, &A.x, &A.x); // 2A' + 4C
 	fp2_sub(&Ap, &Ap, &A.z);   // 2A'
 	fp2_add(&Ap, &Ap, &Ap);     // 4A'
 	//  --------------------------------------------------------------------------------------------------
 	//                   ~~~~~~~~
 	//                    |    | 
 	// Computing E_J(W) = |    | [ F0(W, x([j]P)) * alpha^2 + F1(W, x([j]P)) * alpha + F2(W, x([j]P)) ]
 	//                    j in J 
 	// In order to avoid costly inverse computations in fp, we work with projective coordinates
 	// In particular, for a degree-l isogeny construction, we need alpha = X/Z and alpha = Z/X (i.e., 1/alpha)
 	//fp2_t EJ_0[sJ][3]; // EJ_0[j][2] factors of one polynomial to be used in a resultant 
 	fp2_t XZ_add, XZj_add,
 	   XZ_sub, XZj_sub,
 	   AXZ2,
 	   CXZ2,
 	   CX2Z2,
 	   t1, t2;
 	fp2_add(&XZ_add, &P.x, &P.z);	// X + Z
 	fp2_sub(&XZ_sub, &P.x, &P.z);	// X - Z
 	fp2_mul(&AXZ2, &P.x, &P.z);	// X * Z
 	fp2_sqr(&t1, &P.x);		// X ^ 2
 	fp2_sqr(&t2, &P.z);		// Z ^ 2
 	fp2_add(&CX2Z2, &t1, &t2);		//      X^2 + Z^2
 	fp2_mul(&CX2Z2, &CX2Z2, &A.z);	// C * (X^2 + Z^2)
 	fp2_add(&AXZ2, &AXZ2, &AXZ2);	//       2 * (X * Z)
 	fp2_mul(&CXZ2, &AXZ2, &A.z);	// C  * [2 * (X * Z)]
 	fp2_mul(&AXZ2, &AXZ2, &Ap);		// A' * [2 * (X * Z)]
 	// Build one quadratic (3 coefficients, stored in EJ_0[j][0..2]) per element of J
 	int j;
 	for (j = 0; j < sJ; j++)
 	{
 		fp2_add(&XZj_add, &J[j].x, &J[j].z);		// Xj + Zj
 		fp2_sub(&XZj_sub, &J[j].x, &J[j].z);		// Xj - Zj
 		fp2_mul(&t1, &XZ_sub, &XZj_add);			// (X - Z) * (Xj + Zj)
 		fp2_mul(&t2, &XZ_add, &XZj_sub);			// (X + Z) * (Xj - Zj)
 		// ...................................
 		// Computing the quadratic coefficient
 		fp2_sub(&EJ_0[j][2], &t1, &t2);			//       2 * [(X*Zj) - (Z*Xj)]
 		fp2_sqr(&EJ_0[j][2], &EJ_0[j][2]);			//     ( 2 * [(X*Zj) - (Z*Xj)] )^2
 		fp2_mul(&EJ_0[j][2], &A.z, &EJ_0[j][2]);		// C * ( 2 * [(X*Zj) - (Z*Xj)] )^2
 		// ..................................
 		// Computing the constant coefficient
 		fp2_add(&EJ_0[j][0], &t1, &t2);			//       2 * [(X*Xj) - (Z*Zj)]
 		fp2_sqr(&EJ_0[j][0], &EJ_0[j][0]);			//     ( 2 * [(X*Xj) - (Z*Zj)] )^2
 		fp2_mul(&EJ_0[j][0], &A.z, &EJ_0[j][0]);		// C * ( 2 * [(X*Xj) - (Z*Zj)] )^2
 		// ................................
 		// Computing the linear coefficient
 		// C * [ (-2*Xj*Zj)*(alpha^2 + 1) + (-2*alpha)*(Xj^2 + Zj^2)] + [A' * (-2*Xj*Zj) * (2*X*Z)] where alpha = X/Z
 		fp2_add(&t1, &J[j].x, &J[j].z);			//      (Xj + Zj)
 		fp2_sqr(&t1, &t1);					//      (Xj + Zj)^2
 		fp2_add(&t1, &t1, &t1);				//  2 * (Xj + Zj)^2
 		fp2_add(&t1, &t1, &XZJ4[j]);			//  2 * (Xj + Zj)^2 - (4*Xj*Zj) := 2 * (Xj^2 + Zj^2)
 		fp2_mul(&t1, &t1, &CXZ2);				// [2 * (Xj^2 + Zj^2)] * (2 * [ C * (X * Z)])
 		fp2_mul(&t2, &CX2Z2, &XZJ4[j]);			// [C * (X^2 + Z^2)] * (-4 * Xj * Zj)
 		fp2_sub(&t1, &t2, &t1);				// [C * (X^2 + Z^2)] * (-4 * Xj * Zj) - [2 * (Xj^2 + Zj^2)] * (2 * [ C * (X * Z)])
 		fp2_mul(&t2, &AXZ2, &XZJ4[j]);			// (2 * [A' * (X * Z)]) * (-4 * Xj * Zj)
 		fp2_add(&EJ_0[j][1], &t1, &t2);			// This is our desired equation but multiplied by 2
 		fp2_add(&EJ_0[j][1], &EJ_0[j][1], &EJ_0[j][1]);	// This is our desired equation but multiplied by 4
 	};
        // ---------------------------------------------------------------------
        // The faster way for multiplying is using a divide-and-conquer approach
 	// product tree of EJ_0 (we only require the root)
 	product_tree_LENFeq3(ptree_EJ, deg_ptree_EJ, 0, EJ_0, sJ);
 	assert( deg_ptree_EJ[0] == (2*sJ) );
 	// First multi-evaluation: the root polynomial ptree_EJ[0] with its coefficients as built
 	if (!scaled)
 	{
 		// unscaled remainder tree approach
 		multieval_unscaled(leaves, ptree_EJ[0], 2*sJ + 1, rtree_hI, (const fp2_t*)rtree_A, ptree_hI, deg_ptree_hI, 0, sI);
 	}
 	else
 	{
 		// scaled remainder tree approach
 		fp2_t G[sI_max], G_rev[sI_max];
 		poly_redc(G, ptree_EJ[0], 2*sJ + 1, ptree_hI[0], sI + 1, R0, A0);
 		// G_rev holds the sI coefficients of G in reversed order
 		for (j = 0; j < sI; j++)
 			fp2_copy(&G_rev[j], &G[sI - 1 - j]);
 		poly_mul_middle(G_rev, G_rev, sI, R0, sI);
 		// Reverse back into G before feeding the scaled remainder tree
 		for (j = 0; j < sI; j++)
 			fp2_copy(&G[j], &G_rev[sI - 1 - j]);
 		multieval_scaled(leaves, G, ptree_hI, deg_ptree_hI, 0, sI);
        };
 	// Finally, we must multiply the leaves of the output of remainders
 	fp2_t r0;
 	product(&r0, (const fp2_t*)leaves, sI);
 	// EJ_1 is just reverting the ordering in the coefficients of EJ_0
 	// (in-place swap of coefficient j with coefficient 2*sJ - j; the middle one stays put)
 	for (j = 0; j < sJ; j++){
 		fp2_copy(&t1, &ptree_EJ[0][j]);
 		fp2_copy(&ptree_EJ[0][j], &ptree_EJ[0][2*sJ - j]);
 		fp2_copy(&ptree_EJ[0][2*sJ - j], &t1);
 	}
 	// Second multi-evaluation: same steps as above, now on the reversed polynomial
 	if (!scaled)
 	{
 		// unscaled remainder tree approach
 		multieval_unscaled(leaves, ptree_EJ[0], 2*sJ + 1, rtree_hI, (const fp2_t*)rtree_A, ptree_hI, deg_ptree_hI, 0, sI);
 	}
 	else
 	{
 		// scaled remainder tree approach
 		fp2_t G[sI_max], G_rev[sI_max];
 		poly_redc(G, ptree_EJ[0], 2*sJ + 1, ptree_hI[0], sI + 1, R0, A0);
 		for (j = 0; j < sI; j++)
 			fp2_copy(&G_rev[j], &G[sI - 1 - j]);
 		poly_mul_middle(G_rev, G_rev, sI, R0, sI);
 		for (j = 0; j < sI; j++)
 			fp2_copy(&G[j], &G_rev[sI - 1 - j]);
 		multieval_scaled(leaves, G, ptree_hI, deg_ptree_hI, 0, sI);
        };
 	clear_tree(ptree_EJ, 0, sJ);
 	// Finally, we must multiply the leaves of the output of remainders
 	fp2_t r1;
 	product(&r1, (const fp2_t*)leaves, sI);
 	// -------------------------------
 	// Sometimes the public value sK is equal to zero,
 	// Thus for avoiding runtime error we add one when sK =0
 	fp2_t hK_0[sK_max + 1], hK_1[sK_max + 1], hk_0, hk_1;
 	for (j = 0; j < sK; j++)
 	{
 		fp2_add(&XZj_add, &K[j].x, &K[j].z);	// Xk + Zk
 		fp2_sub(&XZj_sub, &K[j].x, &K[j].z);	// Xk - Zk
 		fp2_mul(&t1, &XZ_sub, &XZj_add);		// (X - Z) * (Xk + Zk)
 		fp2_mul(&t2, &XZ_add, &XZj_sub);		// (X + Z) * (Xk - Zk)
 		// Case alpha = X/Z
 		fp2_sub(&hK_0[j], &t1, &t2);		// 2 * [(X*Zk) - (Z*Xk)]
 		// Case 1/alpha = Z/X
 		fp2_add(&hK_1[j], &t1, &t2);		// 2 * [(X*Xk) - (Z*Zk)]
 	};
 	// hk_0 <- use product to multiply all the elements in hK_0
 	product(&hk_0, (const fp2_t*)hK_0, sK);
 	// hk_1 <- use product to multiply all the elements in hK_1
 	product(&hk_1, (const fp2_t*)hK_1, sK);
 	// ---------------------------------------------------------------------------------
 	// Now, unifying all the computations
 	// Q->x = P.x * (hk_1 * r1)^2  and  Q->z = P.z * (hk_0 * r0)^2
 	fp2_mul(&t1, &hk_1, &r1);				// output of algorithm 2 with 1/alpha = Z/X and without the denominator
 	fp2_sqr(&t1, &t1);
 	fp2_mul(&(Q->x), &t1, &P.x);
 	fp2_mul(&t2, &hk_0, &r0);				// output of algorithm 2 with alpha = X/Z and without the denominator
 	fp2_sqr(&t2, &t2);
 	fp2_mul(&(Q->z), &t2, &P.z);
 }
--- a/src/ec/ref/ecx/xisog.c
+++ b/src/ec/ref/ecx/xisog.c
@@ -0,0 +1,295 @@
 #include "isog.h"
 #include "ec.h"
 #include <assert.h>
 // -------------------------------------------------------------------------
 // -------------------------------------------------------------------------
 // Degree-2 isogeny with kernel generated by P != (0 ,0)
 // Outputs the curve coefficient in the form A24=(A+2C:4C).
 // With P = (X : Z) the function writes B->x = Z^2 - X^2 and B->z = Z^2.
 // Side effect: stores X + Z and X - Z in the shared scratch point K[0]
 // (presumably consumed later by the degree-2 evaluation routine -- confirm).
 // The statement order is kept as is: B->z must hold Z^2 before the subtraction.
 void xisog_2(ec_point_t* B, ec_point_t const P)
 {
        fp2_sqr(&B->x, &P.x);              // B->x = X^2
        fp2_sqr(&B->z, &P.z);              // B->z = Z^2
        fp2_sub(&B->x, &B->z, &B->x);      // B->x = Z^2 - X^2
        fp2_add(&K[0].x, &P.x, &P.z);      // K[0].x = X + Z
        fp2_sub(&K[0].z, &P.x, &P.z);      // K[0].z = X - Z
 }
 // Degree-4 isogeny with kernel generated by P such that [2]P != (0 ,0)
 // Outputs the curve coefficient in the form A24=(A+2C:4C).
 // With P = (X : Z) the function writes B->x = Z^4 - X^4 and B->z = Z^4.
 // Side effect: the shared scratch points K[0], K[1], K[2] are first used as
 // temporaries and then left holding the constants described below for xeval_4.
 void xisog_4(ec_point_t* B, ec_point_t const P)
 {
 	fp2_sqr(&K[0].x, &P.x);			// K[0].x = X^2
 	fp2_sqr(&K[0].z, &P.z);			// K[0].z = Z^2
 	fp2_add(&K[1].x, &K[0].z, &K[0].x);	// K[1].x = Z^2 + X^2
 	fp2_sub(&K[1].z, &K[0].z, &K[0].x);	// K[1].z = Z^2 - X^2
 	fp2_mul(&B->x, &K[1].x, &K[1].z);	// B->x = Z^4 - X^4
 	fp2_sqr(&B->z, &K[0].z);		// B->z = Z^4
 	// Constants for xeval_4 (the temporaries in K[0].x and K[1].x are overwritten here)
 	fp2_add(&K[2].x, &P.x, &P.z);		// K[2].x = X + Z
 	fp2_sub(&K[1].x, &P.x, &P.z);		// K[1].x = X - Z
 	fp2_add(&K[0].x, &K[0].z, &K[0].z);	// K[0].x = 2 * Z^2
 	fp2_add(&K[0].x, &K[0].x, &K[0].x);	// K[0].x = 4 * Z^2
 }
 // Degree-4 isogeny with kernel generated by P such that [2]P = (0 ,0)
 //
 // B24 receives the codomain coefficient, A24 is the domain coefficient and
 // P = (X : Z) is the kernel generator. The two branches are selected by
 // testing X == Z:
 //   X == Z  (P = (1,_)):   B24 = (A24.z : A24.z - A24.x)
 //   otherwise (P = (-1,_)): B24 = (A24.z : A24.x)
 // Side effect: K[0].x, K[0].z and K[1].x of the shared scratch array are
 // overwritten (presumably read afterwards by the matching evaluation
 // routine -- confirm).
 void xisog_4_singular(ec_point_t* B24, ec_point_t const P, ec_point_t A24)
 {
 	fp2_copy(&K[0].z, &A24.z);
 	if(fp2_is_equal(&P.x, &P.z)){
 		// Case for P=(1,_)
 		fp2_copy(&K[0].x, &A24.x);
 		fp2_sub(&K[1].x, &A24.x, &A24.z);
 		fp2_neg(&B24->z, &K[1].x);		// B24->z = A24.z - A24.x
 	}
 	else{
 		// Case for P=(-1,_)
 		fp2_copy(&K[1].x, &A24.x);
 		fp2_sub(&K[0].x, &A24.x, &A24.z);
 		// A former fp2_neg(&B24->z, &K[0].x) was removed here: its result was
 		// unconditionally overwritten by the copy below (dead store).
 		fp2_copy(&B24->z, &K[1].x);		// B24->z = A24.x
 	}
 	fp2_copy(&B24->x, &K[0].z);			// B24->x = A24.z
 }
 // xISOG procedure, a hybrid between Montgomery and Twisted Edwards arithmetic.
 // These traditional formulae follow the Twisted Edwards ones, but the result is
 // mapped back into Montgomery form.
 //
 // B : output, image curve coefficient
 // i : index into TORSION_ODD_PRIMES selecting the isogeny degree l = 2d + 1
 // A : input curve coefficient; A.x and A.x - A.z act as the Edwards constants a and d
 // Reads K[0..d-1] from the shared scratch array (presumably filled by kps_t -- confirm).
 void xisog_t(ec_point_t* B, uint64_t const i, ec_point_t const A)
 {
 	int const ell = (int)TORSION_ODD_PRIMES[i];		// isogeny degree l
 	int const half = (ell - 1) / 2;				// d, with l = 2d + 1
 	int const nbits = (int)p_plus_minus_bitlength[i];	// bit length used to scan l
 	int k;
 	fp2_t prod_x, prod_z, edwards_d, pow_a, pow_d;

 	// Products of the x- and z-coordinates of the kernel points K[0..d-1]
 	fp2_copy(&prod_x, &K[0].x);
 	fp2_copy(&prod_z, &K[0].z);
 	k = 1;
 	while (k < half)
 	{
 		fp2_mul(&prod_x, &prod_x, &K[k].x);
 		fp2_mul(&prod_z, &prod_z, &K[k].z);
 		k++;
 	}

 	// Mapping Montgomery curve coefficients into Twisted Edwards form
 	fp2_sub(&edwards_d, &A.x, &A.z);
 	fp2_copy(&pow_a, &A.x);
 	fp2_copy(&pow_d, &edwards_d);

 	// Left-to-right square-and-multiply for a^l and d^l: the leading bit of l is
 	// covered by the initial copies, the remaining bits are scanned downwards.
 	for (k = nbits - 2; k >= 0; k--)
 	{
 		fp2_sqr(&pow_a, &pow_a);
 		fp2_sqr(&pow_d, &pow_d);
 		if (((ell >> k) & 1) != 0)
 		{
 			fp2_mul(&pow_a, &pow_a, &A.x);
 			fp2_mul(&pow_d, &pow_d, &edwards_d);
 		}
 	}

 	// Raise both products to the 8-th power (three squarings each)
 	for (k = 3; k > 0; k--)
 	{
 		fp2_sqr(&prod_x, &prod_x);
 		fp2_sqr(&prod_z, &prod_z);
 	}

 	// Mapping Twisted Edwards curve coefficients into Montgomery form
 	fp2_mul(&(B->x), &pow_a, &prod_z);
 	fp2_mul(&(B->z), &pow_d, &prod_x);
 	fp2_sub(&(B->z), &(B->x), &(B->z));
 }
 // -------------------------------------------------------------------------
 // -------------------------------------------------------------------------
 //  Isogeny construction (xISOG) used in velu SQRT.
 //  Computes the coefficient of the image curve of the isogeny of degree TORSION_ODD_PRIMES[i]
 //  and stores it in B as (B' + 2C : 4C). The input A is read as (A' + 2C : 4C).
 //  Reads the shared tables J, K, XZJ4, ptree_hI, deg_ptree_hI, rtree_hI, rtree_A, R0 and A0
 //  (presumably prepared by kps_s for the same index i -- confirm against the caller).
 //  Overwrites the shared sI, sJ, sK, EJ_0, EJ_1, ptree_EJ and deg_ptree_EJ, and uses the
 //  shared array leaves as the buffer handed to multieval_* and then to product().
 void xisog_s(ec_point_t* B, uint64_t const i, ec_point_t const A)
 {
 	// =================================================================================
 	assert(TORSION_ODD_PRIMES[i] > gap);     // Ensuring velusqrt is used for l_i > gap
 	sI = sizeI[i];          // size of I
 	sJ = sizeJ[i];          // size of J
 	sK = sizeK[i];          // size of K
 	assert(sI >= sJ);       // Ensuring #I >= #J
 	assert(sK >= 0);         // #K may be zero (see the sizing of hK_0 / hK_1 below); only a negative size is rejected
 	assert(sJ > 1);         // ensuring sI >= sJ > 1
 	// =================================================================================
 	// We require the curve coefficient A = A'/C ... well, a multiple of these ones
 	fp2_t Ap;
 	fp2_add(&Ap, &A.x, &A.x);	// 2A' + 4C
 	fp2_sub(&Ap, &Ap, &A.z);	// 2A'
 	fp2_add(&Ap, &Ap, &Ap);	// 4A'
 	fp2_t ADD_SQUARED[sJ_max],	// (Xj + Zj)^2
 	   SUB_SQUARED[sJ_max];	// (Xj - Zj)^2
 	int j;
 	// Next loop precomputes some variables to be used in the remainder of xisog
 	for (j = 0; j < sJ; j++)
 	{
 		fp2_sub(&SUB_SQUARED[j], &J[j].x, &J[j].z);		// (Xj - Zj)
 		fp2_sqr(&SUB_SQUARED[j], &SUB_SQUARED[j]);		// (Xj - Zj)^2
 		fp2_sub(&ADD_SQUARED[j], &SUB_SQUARED[j], &XZJ4[j]);	// (Xj + Zj)^2 = (Xj - Zj)^2 - (-4*Xj*Zj)
 	};
 	//  --------------------------------------------------------------------------------------------------
 	//                   ~~~~~~~~
 	//                    |    | 
 	// Computing E_J(W) = |    | [ F0(W, x([j]P)) * alpha^2 + F1(W, x([j]P)) * alpha + F2(W, x([j]P)) ]
 	//                    j in J 
 	// In order to avoid costly inverse computations in fp, we work with projective coordinates.
 	// In particular, for a degree-l isogeny construction, we need alpha = 1 and alpha = -1
 	// The quadratic factors are stored in the shared arrays EJ_0 and EJ_1 (three coefficients per
 	// factor); each array describes one polynomial to be used in a resultant.
 	// Next loop computes all the quadratic factors of EJ_0 and EJ_1
 	fp2_t t1;
 	for (j = 0; j < sJ; j++)
 	{
 		// Each SUB_SQUARED[j] and ADD_SQUARED[j] should be multiplied by C
 		fp2_mul(&EJ_1[j][0], &ADD_SQUARED[j], &A.z);
 		fp2_mul(&EJ_0[j][0], &SUB_SQUARED[j], &A.z);
 		// We require the double of tadd and tsub
 		fp2_add(&EJ_0[j][1], &EJ_1[j][0], &EJ_1[j][0]);
 		fp2_add(&EJ_1[j][1], &EJ_0[j][0], &EJ_0[j][0]);
 		fp2_mul(&t1, &XZJ4[j], &Ap);			// A' *(-4*Xj*Zj)
 		// Case alpha = 1
 		fp2_sub(&EJ_0[j][1], &t1, &EJ_0[j][1]);
 		fp2_copy(&EJ_0[j][2], &EJ_0[j][0]);		// E_{0,j} is a palindrome
 		// Case alpha = -1
 		fp2_sub(&EJ_1[j][1], &EJ_1[j][1], &t1);
 		fp2_copy(&EJ_1[j][2], &EJ_1[j][0]);		// E_{1,j} is a palindrome
 	};
 	// ---------------------------------------------------------------------
 	// The faster way for multiplying is using a divide-and-conquer approach
 	// selfreciprocal product tree of EJ_0 (we only require the root)
 	product_tree_selfreciprocal_LENFeq3(ptree_EJ, deg_ptree_EJ, 0, EJ_0, sJ);
 	assert( deg_ptree_EJ[0] == (2*sJ) );
 	if (!scaled)
 	{
 		// (unscaled) remainder tree approach
 		multieval_unscaled(leaves, ptree_EJ[0], 2*sJ + 1, rtree_hI, (const fp2_t*)rtree_A, ptree_hI, deg_ptree_hI, 0, sI);
 	}
 	else
 	{
 		// scaled remainder tree approach
 		fp2_t G[sI_max], G_rev[sI_max];
 		poly_redc(G, ptree_EJ[0], 2*sJ + 1, ptree_hI[0], sI + 1, R0, A0);
 		// G_rev <- coefficients of G in reversed order
 		for (j = 0; j < sI; j++)
 			fp2_copy(&G_rev[j], &G[sI - 1 - j]);
 		poly_mul_middle(G_rev, G_rev, sI, R0, sI);
 		// G <- coefficients of the updated G_rev in reversed order
 		for (j = 0; j < sI; j++)
 			fp2_copy(&G[j], &G_rev[sI - 1 - j]);
 		multieval_scaled(leaves, G, ptree_hI, deg_ptree_hI, 0, sI);
 	};
 	clear_tree(ptree_EJ, 0, sJ);
 	// Finally, we must multiply the leaves of the output of remainders
 	fp2_t r0;
 	product(&r0, (const fp2_t*)leaves, sI);
 	// selfreciprocal product tree of EJ_1 (we only require the root)
 	// (same sequence of steps as for EJ_0 above; ptree_EJ and leaves are reused)
 	product_tree_selfreciprocal_LENFeq3(ptree_EJ, deg_ptree_EJ, 0, EJ_1, sJ);
 	assert( deg_ptree_EJ[0] == (2*sJ) );
 	if (!scaled)
 	{
 		// (unscaled) remainder tree approach
 		multieval_unscaled(leaves, ptree_EJ[0], 2*sJ + 1, rtree_hI, (const fp2_t*)rtree_A, ptree_hI, deg_ptree_hI, 0, sI);
 	}
 	else
 	{
 		// scaled remainder tree approach
 		fp2_t G[sI_max], G_rev[sI_max];
 		poly_redc(G, ptree_EJ[0], 2*sJ + 1, ptree_hI[0], sI + 1, R0, A0);
 		for (j = 0; j < sI; j++)
 			fp2_copy(&G_rev[j], &G[sI - 1 - j]);
 		poly_mul_middle(G_rev, G_rev, sI, R0, sI);
 		for (j = 0; j < sI; j++)
 			fp2_copy(&G[j], &G_rev[sI - 1 - j]);
 		multieval_scaled(leaves, G, ptree_hI, deg_ptree_hI, 0, sI);
 	};
 	clear_tree(ptree_EJ, 0, sJ);
 	// Finally, we must multiply the leaves of the output of remainders
 	fp2_t r1;
 	product(&r1, (const fp2_t*)leaves, sI);
 	// -------------------------------
 	// Sometimes the public value sK is equal to zero,
 	// thus, to avoid a runtime error (zero-length array), the arrays get one extra slot
 	fp2_t hK_0[sK_max + 1], hK_1[sK_max + 1], hk_0, hk_1;
 	for (j = 0; j < sK; j++)
 	{
 		fp2_sub(&hK_0[j], &K[j].z, &K[j].x);		// Zk - Xk
 		fp2_add(&hK_1[j], &K[j].z, &K[j].x);		// Zk + Xk
 	};
 	// hk_0 <- use product to multiply all the elements in hK_0
 	product(&hk_0, (const fp2_t*)hK_0, sK);
 	// hk_1 <- use product to multiply all the elements in hK_1
 	product(&hk_1, (const fp2_t*)hK_1, sK);
 	// --------------------------------------------------------------
 	// Now, we have all the ingredients for computing the image curve
 	fp2_t A24, A24m,
 	   t24, t24m;	// NOTE (for Jorge): these variables could probably be dropped; they are only used once the input A is no longer needed (the t's could be replaced by B[0] and B[1])
 	fp2_copy(&A24, &A.x);			// A' + 2C
 	fp2_sub(&A24m, &A.x, &A.z);		// A' - 2C
 	fp2_copy(&Ap, &A24m);			// Ap is reused to keep the base A' - 2C for the exponentiation
 	// left-to-right method for computing (A' + 2C)^l and (A' - 2C)^l
 	for (j = 1; j < (int)p_plus_minus_bitlength[i]; j++)
 	{
 		fp2_sqr(&A24, &A24);
 		fp2_sqr(&A24m, &A24m);
 		if( ( ((int)TORSION_ODD_PRIMES[i] >> ((int)p_plus_minus_bitlength[i] - j - 1)) & 1 ) != 0 )
 		{
 			fp2_mul(&A24, &A24, &A.x);
 			fp2_mul(&A24m, &A24m, &Ap);
 		};
 	};
 	fp2_mul(&t24m, &hk_1, &r1);			// output of algorithm 2 with alpha =-1 and without the denominator
 	fp2_sqr(&t24m, &t24m);			// raised to the power 2
 	fp2_sqr(&t24m, &t24m);			// raised to the power 4
 	fp2_sqr(&t24m, &t24m);			// raised to the power 8
 	fp2_mul(&t24, &hk_0, &r0);			// output of algorithm 2 with alpha = 1 and without the denominator 
 	fp2_sqr(&t24, &t24);			// raised to the power 2
 	fp2_sqr(&t24, &t24);			// raised to the power 4
 	fp2_sqr(&t24, &t24);			// raised to the power 8
 	fp2_mul(&A24, &A24, &t24m);
 	fp2_mul(&A24m, &A24m, &t24);
 	// Now, we have d = (A24m / A24) where the image Montgomery curve coefficient is
 	//      B'   2*(1 + d)   2*(A24 + A24m)
 	// B = ---- = --------- = --------------
 	//      C      (1 - d)     (A24 - A24m)
 	// However, we require B' + 2C = 4*A24 and 4C = 4 * (A24 - A24m)
 	fp2_sub(&t24m, &A24, &A24m);		//   (A24 - A24m)
 	fp2_add(&t24m, &t24m, &t24m);		// 2*(A24 - A24m)
 	fp2_add(&t24m, &t24m, &t24m);		// 4*(A24 - A24m)
 	fp2_add(&t24, &A24, &A24);			// 2 * A24
 	fp2_add(&t24, &t24, &t24);			// 4 * A24
 	fp2_copy(&(B->x), &t24);
 	fp2_copy(&(B->z), &t24m);
 }
--- a/src/ec/ref/include/curve_extras.h
+++ b/src/ec/ref/include/curve_extras.h
@@ -0,0 +1,28 @@
 #ifndef CURVE_EXTRAS_H
 #define CURVE_EXTRAS_H
 // Extra curve/point helpers declared on top of the public interface in ec.h.
 #include "ec.h"
 #include "torsion_constants.h"
 // Point with three fp2_t coordinates x, y, z
 // (the jac_ prefix suggests Jacobian coordinates -- TODO confirm).
 typedef struct jac_point_t {
    fp2_t x;
    fp2_t y;
    fp2_t z;
 } jac_point_t;
 // Basic point utilities. The semantics are not visible from this header; the
 // notes below are inferred from the names only -- confirm against the definitions.
 bool ec_is_zero(ec_point_t const* P);                                         // presumably: tests whether P is the identity
 void copy_point(ec_point_t* P, ec_point_t const* Q);                          // presumably: P <- Q
 void swap_points(ec_point_t* P, ec_point_t* Q, const digit_t option);        // presumably: swap of P and Q controlled by option
 void ec_init(ec_point_t* P);                                                  // presumably: initializes P
 // x-only arithmetic helpers. Each xDBLMUL, xDBL and xMUL prototype is declared
 // exactly once (a redundant second xDBLMUL declaration was dropped).
 void xDBLv2(ec_point_t* Q, ec_point_t const* P, ec_point_t const* A24);
 void xDBLADD(ec_point_t* R, ec_point_t* S, ec_point_t const* P, ec_point_t const* Q, ec_point_t const* PQ, ec_point_t const* A24);
 void xDBLMUL(ec_point_t* S, ec_point_t const* P, digit_t const* k, ec_point_t const* Q, digit_t const* l, ec_point_t const* PQ, ec_curve_t const* curve);
 void xDBL(ec_point_t* Q, ec_point_t const* P, ec_point_t const* AC);
 void xMUL(ec_point_t* Q, ec_point_t const* P, digit_t const* k, ec_curve_t const* curve);
 // Aliases mapping the short names used in this module onto the ec.h functions
 #define is_point_equal ec_is_equal
 #define xADD ec_add
 #endif
--- a/src/ec/ref/include/ec.h
+++ b/src/ec/ref/include/ec.h
@@ -0,0 +1,776 @@
 /** @file
 *
 * @authors Luca De Feo, Francisco RH
 *
 * @brief Elliptic curve stuff
 */
 #ifndef EC_H
 #define EC_H
 #include <fp2.h>
 #include <ec_params.h>
 /** @defgroup ec Elliptic curves
 * @{
 */
 /** @defgroup ec_t Data structures
 * @{
 */
 /** @brief Projective point
 *
 * @typedef ec_point_t
 *
 * @struct ec_point_t
 *
 * A projective point in (X:Z) coordinates; no Y-coordinate is stored.
 */
 typedef struct ec_point_t {
    fp2_t x; ///< X-coordinate
    fp2_t z; ///< Z-coordinate
 } ec_point_t;
 /** @brief A basis of a torsion subgroup
 *
 * @typedef ec_basis_t
 *
 * @struct ec_basis_t
 *
 * Two points P and Q forming a basis of a torsion subgroup, stored together
 * with a third point PmQ.
 */
 typedef struct ec_basis_t {
    ec_point_t P;   ///< First generator
    ec_point_t Q;   ///< Second generator
    ec_point_t PmQ; ///< Presumably the difference P - Q (inferred from the name; confirm)
 } ec_basis_t;
 /** @brief An elliptic curve
 *
 * @typedef ec_curve_t
 *
 * @struct ec_curve_t
 *
 * An elliptic curve in projective Montgomery form, stored as the pair (A, C).
 */
 typedef struct ec_curve_t {
    fp2_t A; ///< projective coefficient (presumably the affine Montgomery coefficient is A/C; confirm)
    fp2_t C; ///< cannot be 0
 } ec_curve_t;
 /** @brief An isogeny of degree a power of 2
 *
 * @typedef ec_isog_even_t
 *
 * @struct ec_isog_even_t
 *
 * The isogeny is described by its domain curve, a generator of its kernel
 * and its length as a walk of 2-isogenies.
 */
 typedef struct ec_isog_even_t {
    ec_curve_t curve;      ///< The domain curve
    ec_point_t kernel;     ///< A kernel generator
    unsigned short length; ///< The length as a 2-isogeny walk
 } ec_isog_even_t;
 /** @brief An odd divisor of p² - 1
 *
 * @typedef ec_degree_odd_t
 *
 * Given that the list of divisors of p² - 1 is known, this is
 * represented as a fixed-length vector of P_LEN + M_LEN integer exponents,
 * one uint8_t per entry.
 */
 typedef uint8_t ec_degree_odd_t[P_LEN + M_LEN];
 /** @brief An isogeny of odd degree dividing p² - 1
 *
 * @typedef ec_isog_odd_t
 *
 * @struct ec_isog_odd_t
 */
 typedef struct ec_isog_odd_t {
    ec_curve_t curve;       ///< Presumably the domain curve, as in ec_isog_even_t (confirm)
    ec_point_t ker_plus;    ///< A generator of E[p+1] ∩ ker(φ)
    ec_point_t ker_minus;   ///< A generator of E[p-1] ∩ ker(φ)
    ec_degree_odd_t degree; ///< The degree of the isogeny
 } ec_isog_odd_t;
 /** @brief Isomorphism of Montgomery curves
 *
 * @typedef ec_isom_t
 *
 * @struct ec_isom_t
 *
 * The isomorphism is given by the map (X:Z) ↦ ( (Nx X - Nz Z) : (D Z) )
 */
 typedef struct ec_isom_t {
    fp2_t Nx; ///< coefficient of X in the numerator
    fp2_t Nz; ///< coefficient of Z subtracted in the numerator
    fp2_t D;  ///< coefficient of Z in the denominator
 } ec_isom_t;
 // end ec_t
 /** @}
 */
 /** @defgroup ec_curve_t Curves and isomorphisms
 * @{
 */
 /**
 * @brief j-invariant.
 *
 * @param j_inv computed j_invariant
 * @param curve input curve
 */
 void ec_j_inv(fp2_t* j_inv, const ec_curve_t* curve);
 /**
 * @brief Isomorphism of elliptic curve
 *
 * @param isom computed isomorphism
 * @param from domain curve
 * @param to image curve
 */
 void ec_isomorphism(ec_isom_t* isom, const ec_curve_t* from, const ec_curve_t* to);
 /**
 * @brief In-place inversion of an isomorphism
 *
 * @param isom an isomorphism
 */
 void ec_iso_inv(ec_isom_t* isom);
 /**
 * @brief In-place evaluation of an isomorphism
 *
 * @param P a point
 * @param isom an isomorphism
 */
 void ec_iso_eval(ec_point_t* P, ec_isom_t* isom);
 /**
 * @brief Given a Montgomery curve, computes a standard model for it and the isomorphism to it.
 *
 * @param new computed new curve
 * @param isom computed isomorphism from `old` to `new`
 * @param old A Montgomery curve
 */
 void ec_curve_normalize(ec_curve_t *new, ec_isom_t *isom, const ec_curve_t *old);
 /** @}
 */
 /** @defgroup ec_point_t Point operations
 * @{
 */
 /**
 * @brief Point equality
 *
 * @param P a point
 * @param Q a point
 * @return 1 if equal
 */
 bool ec_is_equal(const ec_point_t* P, const ec_point_t* Q);
 /**
 * @brief Reduce Z-coordinate of point in place
 *
 * @param P a point
 */
 void ec_normalize(ec_point_t* P);
 /**
 * @brief Test whether a point is on a curve
 *
 * @param curve a curve
 * @param P a point
 * @return 1 if P is on the curve
 */
 int ec_is_on_curve(const ec_curve_t* curve, const ec_point_t* P);
 /**
 * @brief Point negation
 *
 * @param res computed opposite of P
 * @param P a point
 */
 void ec_neg(ec_point_t* res, const ec_point_t* P);
 /**
 * @brief Point addition
 *
 * @param res computed sum of P and Q
 * @param P a point
 * @param Q a point
 * @param PQ the difference P-Q
 */
 void ec_add(ec_point_t* res, const ec_point_t* P, const ec_point_t* Q, const ec_point_t* PQ);
 /**
 * @brief Point doubling
 *
 * @param res computed double of P
 * @param curve the curve
 * @param P a point
 */
 void ec_dbl(ec_point_t* res, const ec_curve_t* curve, const ec_point_t* P);
 /**
 * @brief Point multiplication
 *
 * @param res computed scalar * P
 * @param curve the curve
 * @param scalar an unsigned multi-precision integer
 * @param P a point
 */
 void ec_mul(ec_point_t* res, const ec_curve_t* curve, const digit_t* scalar, const ec_point_t* P);
 /**
 * @brief Point multiplication by a scalar of limited length
 *
 * @param Q computed k * P
 * @param P a point
 * @param k the scalar, an unsigned multi-precision integer
 * @param kbits the bit size of k
 * @param A24 the curve, given as a point-shaped constant (presumably the
 *            coefficient in the form (A+2C : 4C); TODO confirm)
 */
 void xMULv2(ec_point_t* Q, ec_point_t const* P, digit_t const* k, const int kbits, ec_point_t const* A24);
 /**
 * @brief Combination P+m*Q
 *
 * @param R computed P + m * Q
 * @param m an unsigned multi-precision integer (passed as an fp_t)
 * @param P a point
 * @param Q a point
 * @param PQ the difference P-Q
 * @param A the curve
 */
 void ec_ladder3pt(ec_point_t *R, fp_t const m, ec_point_t const *P, ec_point_t const *Q, ec_point_t const *PQ, ec_curve_t const *A);
 /**
 * @brief Linear combination of points of a basis
 *
 * @param res computed scalarP * P + scalarQ * Q
 * @param curve the curve
 * @param scalarP an unsigned multi-precision integer
 * @param scalarQ an unsigned multi-precision integer
 * @param PQ a torsion basis consisting of points P and Q
 */
 void ec_biscalar_mul(ec_point_t* res, const ec_curve_t* curve,
    const digit_t* scalarP, const digit_t* scalarQ,
    const ec_basis_t* PQ);
 /** @}
 */
 /** @defgroup ec_dlog_t Discrete logs and bases
 * @{
 */
 /**
 * @brief Generate a 2^f-torsion basis on a given Montgomery curve
 *
 * The algorithm is deterministic
 *
 * @param PQ2 computed basis of the 2^f-torsion
 * @param curve the curve (input; const-qualified, so not modified)
 */
 void ec_curve_to_basis_2(ec_basis_t *PQ2, const ec_curve_t *curve);
 /**
 * @brief Complete a basis of the 2^f-torsion
 *
 * The algorithm is deterministic
 *
 * @param PQ2 a basis of the 2^f-torsion containing P as first generator
 * @param curve the curve
 * @param P a point of order 2^f
 */
 void ec_complete_basis_2(ec_basis_t* PQ2, const ec_curve_t* curve, const ec_point_t* P);
 /**
 * @brief Generate a 3^e-torsion basis
 *
 * The algorithm is deterministic
 *
 * @param PQ3 the computed 3^e-torsion basis
 * @param curve a curve
 */
 void ec_curve_to_basis_3(ec_basis_t* PQ3, const ec_curve_t* curve);
 /**
 * @brief Generate a 6^e-torsion basis
 *
 * The algorithm is deterministic
 *
 * @param PQ6 the computed 2^f*3^g-torsion basis
 * @param curve a curve
 */
 void ec_curve_to_basis_6(ec_basis_t* PQ6, const ec_curve_t* curve);
 /**
 * @brief Compute the generalized dlog of R wrt the 2^f-basis PQ2
 *
 * Ensure that R = scalarP * P + scalarQ * Q
 *
 * @param scalarP the computed dlog
 * @param scalarQ the computed dlog
 * @param PQ2 a 2^f-torsion basis
 * @param R a point of order dividing 2^f
 * @param curve the curve (presumably the one PQ2 and R lie on; confirm)
 */
 void ec_dlog_2(digit_t* scalarP, digit_t* scalarQ,
    const ec_basis_t* PQ2, const ec_point_t* R, const ec_curve_t* curve);
 /**
 * @brief Compute the generalized dlog of R wrt the 3^e-basis PQ3
 *
 * Ensure that R = scalarP * P + scalarQ * Q
 *
 * @param scalarP the computed dlog
 * @param scalarQ the computed dlog
 * @param PQ3 a 3^e-torsion basis
 * @param R a point of order dividing 3^e
 * @param curve the curve (presumably the one PQ3 and R lie on; confirm)
 */
 void ec_dlog_3(digit_t* scalarP, digit_t* scalarQ,
    const ec_basis_t* PQ3, const ec_point_t* R, const ec_curve_t* curve);
 /** @}
 */
 /** @defgroup ec_isog_t Isogenies
 * @{
 */
 /**
 * @brief Evaluate isogeny of even degree on list of points
 *
 * @param image computed image curve
 * @param phi isogeny
 * @param points a list of points to evaluate the isogeny on, modified in place
 * @param length of the list points
 */
 void ec_eval_even(ec_curve_t* image, const ec_isog_even_t* phi,
    ec_point_t* points, unsigned short length);
 /**
 * @brief Evaluate isogeny of even degree on list of points, assuming the point (0,0) is not in the kernel
 *
 * @param image computed image curve
 * @param phi isogeny
 * @param points a list of points to evaluate the isogeny on, modified in place
 * @param length of the list points
 */
 void ec_eval_even_nonzero(ec_curve_t* image, const ec_isog_even_t* phi,
    ec_point_t* points, unsigned short length);
 /**
 * @brief Evaluate isogeny of even degree on list of torsion bases
 *
 * The array of bases is reinterpreted as a flat array of points, with
 * sizeof(ec_basis_t) / sizeof(ec_point_t) points per basis, and handed to
 * ec_eval_even(). The resulting point count is narrowed to unsigned short.
 *
 * @param image computed image curve
 * @param phi isogeny
 * @param points a list of bases to evaluate the isogeny on, modified in place
 * @param length of the list bases
 */
 static inline void ec_eval_even_basis(ec_curve_t* image, const ec_isog_even_t* phi,
    ec_basis_t* points, unsigned short length) {
    ec_point_t* flat_points = (ec_point_t*)points;
    unsigned short n_points = (unsigned short)(sizeof(ec_basis_t) / sizeof(ec_point_t) * length);
    ec_eval_even(image, phi, flat_points, n_points);
 }
 /**
 * @brief Evaluate isogeny of odd degree on list of points
 *
 * @param image computed image curve
 * @param phi isogeny
 * @param points a list of points to evaluate the isogeny on, modified in place
 * @param length of the list points
 */
 void ec_eval_odd(ec_curve_t* image, const ec_isog_odd_t* phi,
    ec_point_t* points, unsigned short length);
 /**
 * @brief Evaluate isogeny of odd degree on list of torsion bases
 *
 * The array of bases is reinterpreted as a flat array of points, with
 * sizeof(ec_basis_t) / sizeof(ec_point_t) points per basis, and handed to
 * ec_eval_odd(). The resulting point count is narrowed to unsigned short.
 *
 * @param image computed image curve
 * @param phi isogeny
 * @param points a list of bases to evaluate the isogeny on, modified in place
 * @param length of the list bases
 */
 static inline void ec_eval_odd_basis(ec_curve_t* image, const ec_isog_odd_t* phi,
    ec_basis_t* points, unsigned short length) {
    ec_point_t* flat_points = (ec_point_t*)points;
    unsigned short n_points = (unsigned short)(sizeof(ec_basis_t) / sizeof(ec_point_t) * length);
    ec_eval_odd(image, phi, flat_points, n_points);
 }
 /** @}
 */
 // end ec
 /** @}
 */
 #endif
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 /////////////////// ORIGINAL VERSION (kept for reference only: everything below is disabled by `#if 0` and is never compiled)
 #if 0
 /** @file
 *
 * @authors Luca De Feo, Francisco RH
 *
 * @brief Elliptic curve stuff
 */
 #ifndef EC_H
 #define EC_H
 #include <gf.h>
 /** @defgroup ec Elliptic curves
 * @{
 */
 /** @defgroup ec_t Data structures
 * @{
 */
 /** @brief Projective point
 *
 * @typedef ec_point_t
 *
 * @struct ec_point_t
 *
 * A projective point in (X:Z) or (X:Y:Z) coordinates (tbd).
 */
 typedef struct ec_point_t {
    fp2_t X;
    //fp2_t Y;
    fp2_t Z;
 } ec_point_t;
 /** @brief A basis of a torsion subgroup
 *
 * @typedef ec_basis_t
 *
 * @struct ec_basis_t
 *
 * A pair of points (or a triplet, tbd) forming a basis of a torsion subgroup.
 */
 typedef struct ec_basis_t {
    ec_point_t P;
    ec_point_t Q;
    ec_point_t PmQ;  // or maybe not
 } ec_basis_t;
 /** @brief An elliptic curve
 *
 * @typedef ec_curve_t
 *
 * @struct ec_curve_t
 *
 * An elliptic curve in projective Montgomery form
 */
 typedef struct ec_curve_t {
    fp2_t A;
    fp2_t C; ///< cannot be 0
 } ec_curve_t;
 /** @brief An isogeny of degree a power of 2
 *
 * @typedef ec_isog_even_t
 *
 * @struct ec_isog_even_t
 */
 typedef struct ec_isog_even_t {
    ec_curve_t curve;      ///< The domain curve
    ec_point_t kernel;     ///< A kernel generator
    unsigned short length; ///< The length as a 2-isogeny walk
 } ec_isog_even_t;
 /** @brief An odd divisor of p² - 1
 *
 * @typedef ec_isog_odd_t
 *
 * @struct ec_isog_odd_t
 *
 * Given that the list of divisors of p² - 1 is known, this could be
 * represented as a fixed-length vector of integer exponents, possibly
 * distinguishing the divisors of p + 1 from those of p - 1.
 */
 typedef struct ec_degree_odd_t {
    // todo (basically a ushort vector)
 } ec_degree_odd_t;
 /** @brief An isogeny of odd degree dividing p² - 1
 *
 * @typedef ec_isog_odd_t
 *
 * @struct ec_isog_odd_t
 */
 typedef struct ec_isog_odd_t {
    ec_point_t ker_plus;    ///< A generator of E[p+1] ∩ ker(φ)
    ec_point_t ker_minus;   ///< A generator of E[p-1] ∩ ker(φ)
    ec_degree_odd_t degree; ///< The degree of the isogeny
 } ec_isog_odd_t;
 /** @brief Isomorphism of Montgomery curves
 *
 * @typedef ec_isom_t
 *
 * @struct ec_isom_t
 *
 * The isomorphism is given by the map maps (X:Z) ↦ ( (Nx X - Nz Z) : (D Z) )
 * TODO: fix if (X:Y:Z) coordinates.
 */
 typedef struct ec_isom_t {
    fp2_t Nx;
    fp2_t Nz;
    fp2_t D;
 } ec_isom_t;
 // end ec_t
 /** @}
 */
 /** @defgroup ec_curve_t Curves and isomorphisms
 * @{
 */
 /**
 * @brief j-invariant.
 *
 * @param j_inv computed j_invariant
 * @param curve input curve
 */
 void ec_j_inv(fp2_t* j_inv, const ec_curve_t* curve);
 /**
 * @brief Isomorphism of elliptic curve
 *
 * @param isom computed isomorphism
 * @param from domain curve
 * @param to image curve
 */
 void ec_isomorphism(ec_isom_t* isom, const ec_curve_t* from, const ec_curve_t* to);
 /** @}
 */
 /** @defgroup ec_point_t Point operations
 * @{
 */
 /**
 * @brief Point equality
 *
 * @param P a point
 * @param Q a point
 * @return 1 if equal
 */
 int ec_is_equal(const ec_point_t* P, const ec_point_t* Q);
 /**
 * @brief Reduce Z-coordinate of point in place
 *
 * @param P a point
 */
 void ec_normalize(ec_point_t* P);
 /**
 * @brief Test whether a point is on a curve
 *
 * @param curve a curve
 * @param P a point
 * @return 1 if P is on the curve
 */
 int ec_is_on_curve(const ec_curve_t* curve, const ec_point_t* P);
 /**
 * @brief Point negation
 *
 * @param res computed opposite of P
 * @param P a point
 */
 void ec_neg(ec_point_t* res, const ec_point_t* P);
 /**
 * @brief Point addition
 *
 * Needs to be adjusted if (X:Z) arithmetic.
 *
 * @param res computed sum of P and Q
 * @param P a point
 * @param Q a point
 */
 void ec_add(ec_point_t* res, const ec_point_t* P, const ec_point_t* Q);
 /**
 * @brief Point doubling
 *
 * @param res computed double of P
 * @param P a point
 */
 void ec_dbl(ec_point_t* res, const ec_curve_t* curve, const ec_point_t* P);
 /**
 * @brief Point multiplication
 *
 * @param res computed scalar * P
 * @param curve the curve
 * @param scalar an unsigned multi-precision integer
 * @param P a point
 */
 void ec_mul(ec_point_t* res, const ec_curve_t* curve, const digit_t* scalar, const ec_point_t* P);
 /**
 * @brief Linear combination of points of a basis
 *
 * @param res computed scalarP * P + scalarQ * Q
 * @param curve the curve
 * @param scalarP an unsigned multi-precision integer
 * @param scalarQ an unsigned multi-precision integer
 * @param PQ a torsion basis consisting of points P and Q
 */
 void ec_biscalar_mul(ec_point_t* res, const ec_curve_t* curve,
    const digit_t* scalarP, const digit_t* scalarQ,
    const ec_basis_t* PQ);
 /** @}
 */
 /** @defgroup ec_dlog_t Discrete logs and bases
 * @{
 */
 /**
 * @brief Generate a Montgomery curve and a 2^f-torsion basis
 *
 * The algorithm is deterministc
 *
 * @param curve the computed curve
 * @param PQ2 a basis of the 2^f-torsion
 * @param j_inv a j-invariant
 */
 void ec_j_to_basis_2(ec_curve_t* curve, ec_basis_t* PQ2, const fp2_t* j_inv);
 /**
 * @brief Complete a basis of the 2^f-torsion
 *
 * The algorithm is deterministc
 *
 * @param PQ2 a basis of the 2^f-torsion containing P as first generator
 * @param curve the curve
 * @param P a point of order 2^f
 */
 void ec_complete_basis_2(ec_basis_t* PQ2, const ec_curve_t* curve, const ec_point_t* P);
 /**
 * @brief Generate a 3^e-torsion basis
 *
 * The algorithm is deterministc
 *
 * @param PQ3 the computed 3^e-torsion basis
 * @param curve a curve
 */
 void ec_curve_to_basis_3(ec_basis_t* PQ3, const ec_curve_t* curve);
 /**
 * @brief Compute the generalized dlog of R wrt the 2^f-basis PQ2
 *
 * Ensure that R = scalarP * P + scalarQ * Q
 *
 * @param scalarP the computed dlog
 * @param scalarQ the computed dlog
 * @param PQ2 a 2^f-torsion basis
 * @param R a point of order dividing 2^f
 */
 void ec_dlog_2(digit_t* scalarP, digit_t* scalarQ,
    const ec_basis_t* PQ2, const ec_point_t* R);
 /**
 * @brief Compute the generalized dlog of R wrt the 3^e-basis PQ3
 *
 * Ensure that R = scalarP * P + scalarQ * Q
 *
 * @param scalarP the computed dlog
 * @param scalarQ the computed dlog
 * @param PQ3 a 3^e-torsion basis
 * @param R a point of order dividing 3^e
 */
 void ec_dlog_3(digit_t* scalarP, digit_t* scalarQ,
    const ec_basis_t* PQ3, const ec_point_t* R);
 /** @}
 */
 /** @defgroup ec_isog_t Isogenies
 * @{
 */
 /**
 * @brief Evaluate isogeny of even degree on list of points
 *
 * @param image computed image curve
 * @param phi isogeny
 * @param points a list of points to evaluate the isogeny on, modified in place
 * @param length of the list points
 */
 void ec_eval_even(ec_curve_t* image, const ec_isog_even_t* phi,
    ec_point_t* points, unsigned short length);
 /**
 * @brief Evaluate isogeny of even degree on list of torsion bases
 *
 * @param image computed image curve
 * @param phi isogeny
 * @param points a list of bases to evaluate the isogeny on, modified in place
 * @param length of the list bases
 */
 static inline void ec_eval_even_basis(ec_curve_t* image, const ec_isog_even_t* phi,
    ec_basis_t* points, unsigned short length) {
    ec_eval_even(image, phi, points, sizeof(ec_basis_t) / sizeof(ec_point_t) * length);
 }
 /**
 * @brief Evaluate isogeny of odd degree on list of points
 *
 * @param image computed image curve
 * @param phi isogeny
 * @param points a list of points to evaluate the isogeny on, modified in place
 * @param length of the list points
 */
 void ec_eval_odd(ec_curve_t* image, const ec_isog_odd_t* phi,
    ec_point_t* points, unsigned short length);
 /**
 * @brief Evaluate isogeny of odd degree on list of torsion bases
 *
 * @param image computed image curve
 * @param phi isogeny
 * @param points a list of bases to evaluate the isogeny on, modified in place
 * @param length of the list bases
 */
 static inline void ec_eval_odd_basis(ec_curve_t* image, const ec_isog_odd_t* phi,
    ec_basis_t* points, unsigned short length) {
    ec_eval_odd(image, phi, points, sizeof(ec_basis_t) / sizeof(ec_point_t) * length);
 }
 /** @}
 */
 // end ec
 /** @}
 */
 #endif
 #endif
--- a/src/ec/ref/include/isog.h
+++ b/src/ec/ref/include/isog.h
@@ -0,0 +1,84 @@
 #ifndef _ISOG_H_
 #define _ISOG_H_
 #include "curve_extras.h"
 #include "poly.h"
 // Global working buffers declared here and defined elsewhere.
 // NOTE(review): presumably filled by the kps_* routines and consumed by xisog_*/xeval_* — confirm.
 extern int sI, sJ, sK;	// Current sizes of I, J, and K
 extern fp2_t I[sI_max][2],		// I also serves as the linear factors of the polynomial h_I(X)
 			EJ_0[sJ_max][3], EJ_1[sJ_max][3];	// To be used in xisog and xeval
 extern ec_point_t J[sJ_max], K[sK_max];		// Finite subsets of the kernel
 extern fp2_t XZJ4[sJ_max],		// -4* (Xj * Zj) for each j in J, and x([j]P) = (Xj : Zj)
    rtree_A[(1 << (ceil_log_sI_max+2)) - 1],		// constant multiple of the reciprocal tree computation
    A0;			// constant multiple of the reciprocal R0
 extern poly ptree_hI[(1 << (ceil_log_sI_max+2)) - 1],		// product tree of h_I(X)
     rtree_hI[(1 << (ceil_log_sI_max+2)) - 1],		// reciprocal tree of h_I(X)
     ptree_EJ[(1 << (ceil_log_sJ_max+2)) - 1];		// product tree of E_J(X)
 extern fp2_t R0[2*sJ_max + 1];		// Reciprocal of h_I(X) required in the scaled remainder tree approach
 extern int deg_ptree_hI[(1 << (ceil_log_sI_max+2)) - 1],	// degree of each node in the product tree of h_I(X)
    deg_ptree_EJ[(1 << (ceil_log_sJ_max+2)) - 1];	// degree of each node in the product tree of E_J(X)
 extern fp2_t leaves[sI_max];		// leaves of the remainder tree, which are required in the Resultant computation
 void eds2mont(ec_point_t* P);						// mapping from twisted Edwards into Montgomery
 void yadd(ec_point_t* R, ec_point_t* const P, ec_point_t* const Q, ec_point_t* const PQ);	// differential addition on twisted Edwards model
 void CrissCross(fp2_t *r0, fp2_t *r1, fp2_t const alpha, fp2_t const beta, fp2_t const gamma, fp2_t const delta);
 // In the functions below, i is used as an index into TORSION_ODD_PRIMES by the inline wrappers of this header
 void kps_t(uint64_t const i, ec_point_t const P, ec_point_t const A);	// tvelu formulae
 void kps_s(uint64_t const i, ec_point_t const P, ec_point_t const A);	// svelu formulae
 void xisog_4(ec_point_t* B, ec_point_t const P);			// degree-4 isogeny construction
 void xisog_4_singular(ec_point_t* B24, ec_point_t const P, ec_point_t A24);
 void xisog_2(ec_point_t* B, ec_point_t const P);			// degree-2 isogeny construction
 void xisog_t(ec_point_t* B, uint64_t const i, ec_point_t const A);	// tvelu formulae
 void xisog_s(ec_point_t* B, uint64_t const i, ec_point_t const A);	// svelu formulae
 void xeval_4(ec_point_t* R, const ec_point_t* Q, const int lenQ);					// degree-4 isogeny evaluation
 void xeval_4_singular(ec_point_t* R, const ec_point_t* Q, const int lenQ, const ec_point_t P);
 void xeval_2(ec_point_t* R, ec_point_t* const Q, const int lenQ);	// degree-2 isogeny evaluation
 void xeval_t(ec_point_t* Q, uint64_t const i, ec_point_t const P);			// tvelu formulae
 void xeval_s(ec_point_t* Q, uint64_t const i, ec_point_t const P, ec_point_t const A);	// svelu formulae
 // Strategy-based 4-isogeny chain
 // NOTE(review): this prototype is `static` inside a header, so every translation unit that
 // includes it gets its own internal-linkage declaration; units that never define the function
 // may warn "declared static but never defined" — confirm this is intended.
 static void ec_eval_even_strategy(ec_curve_t* image, ec_point_t* points, unsigned short points_len,
    ec_point_t* A24, const ec_point_t *kernel, const int isog_len);
 void kps_clear(int i);	// Clear memory assigned by KPS
 // hybrid velu formulae: forwards to the tvelu or the svelu variant of kps
 static inline void kps(uint64_t const i, ec_point_t const P, ec_point_t const A)
 {
 	// The selection compares a table entry with the fixed public bound `gap`
 	if (TORSION_ODD_PRIMES[i] > gap) {
 		kps_s(i, P, A);	// above the bound: svelu formulae
 		return;
 	}
 	kps_t(i, P, A);	// at or below the bound: tvelu formulae
 }
 // Isogeny construction wrapper: forwards to the tvelu or the svelu variant of xisog
 static inline void xisog(ec_point_t* B, uint64_t const i, ec_point_t const A)
 {
 	// The selection compares a table entry with the fixed public bound `gap`
 	if (TORSION_ODD_PRIMES[i] > gap) {
 		xisog_s(B, i, A);	// above the bound: svelu formulae
 		return;
 	}
 	xisog_t(B, i, A);	// at or below the bound: tvelu formulae
 }
 // Isogeny evaluation wrapper: forwards to the tvelu or the svelu variant of xeval
 static inline void xeval(ec_point_t* Q, uint64_t const i, ec_point_t const P, ec_point_t const A)
 {
 	// The selection compares a table entry with the fixed public bound `gap`
 	if (TORSION_ODD_PRIMES[i] > gap) {
 		xeval_s(Q, i, P, A);	// above the bound: svelu formulae (also needs A)
 		return;
 	}
 	xeval_t(Q, i, P);	// at or below the bound: tvelu formulae (A is not used)
 }
 #endif
--- a/src/ec/ref/include/poly.h
+++ b/src/ec/ref/include/poly.h
@@ -0,0 +1,28 @@
 #ifndef _POLY_H_
 #define _POLY_H_
 #include <fp2.h>
 typedef fp2_t *poly; // Polynomials are arrays of coeffs over Fq, lowest degree first
 // Polynomial products. h is the only non-const polynomial argument (presumably the output);
 // lenf/leng are presumably the coefficient counts of f and g — TODO confirm against the implementation.
 void poly_mul(poly h, const poly f, const int lenf, const poly g, const int leng);
 void poly_mul_low(poly h, const int n, const poly f, const int lenf, const poly g, const int leng);
 void poly_mul_middle(poly h, const poly g, const int leng, const poly f, const int lenf);
 void poly_mul_selfreciprocal(poly h, const poly g, const int leng, const poly f, const int lenf);
 // Product trees over n input polynomials F. H[] and DEG[] are the non-const arrays
 // (presumably the tree nodes and their degrees, addressed from `root` — TODO confirm).
 // The LENFeq2 / LENFeq3 variants take the inputs as rows of exactly 2 or 3 coefficients.
 void product_tree(poly H[], int DEG[], const int root, const poly F[], const int LENF, const int n);
 void product_tree_LENFeq2(poly H[], int DEG[], const int root, const fp2_t F[][2], const int n);
 void product_tree_LENFeq3(poly H[], int DEG[], const int root, const fp2_t F[][3], const int n);
 void product_tree_selfreciprocal(poly H[], int DEG[], const int root, const poly F[], const int LENF, const int n);
 void product_tree_selfreciprocal_LENFeq3(poly H[], int DEG[], const int root, const fp2_t F[][3], const int n);
 void clear_tree(poly H[], const int root, const int n);
 void product(fp2_t *c, const fp2_t F[], const int n);
 // Reciprocals, reduction and multi-point evaluation (see the implementation for exact contracts)
 void reciprocal(poly h, fp2_t *c, const poly f, const int lenf, const int n);
 void poly_redc(poly h, const poly g, const int leng, const poly f, const int lenf,const poly f_inv, const fp2_t c);
 void reciprocal_tree(poly *R, fp2_t *A, const int leng, const poly H[], const int DEG[], const int root, const int n);
 void multieval_unscaled(fp2_t REM[], const poly g, const int leng, const poly R[], const fp2_t A[], const poly H[], const int DEG[], const int root, const int n);
 void multieval_scaled(fp2_t REM[], const poly G, const poly H[], const int DEG[], const int root, const int n);
 #endif /* _POLY_H_ */
--- a/src/ec/ref/include/sdacs.h
+++ b/src/ec/ref/include/sdacs.h
@@ -0,0 +1,50 @@
 #ifndef _SDACS_H_
 #define _SDACS_H_
 // Chain strings, one per entry, written as sequences of the characters '0' and '1'.
 // NOTE(review): "SDAC" presumably stands for shortest differential addition chain, and the
 // _P_ / _M_ families presumably correspond to two sets of primes — confirm against the users of SDACs.
 static char SDAC_P_0[] = "0";
 static char SDAC_P_1[] = "10";
 static char SDAC_P_2[] = "100";
 static char SDAC_P_3[] = "0100";
 static char SDAC_P_4[] = "10000";
 static char SDAC_P_5[] = "110000";
 static char SDAC_P_6[] = "100000";
 static char SDAC_P_7[] = "1100010001";
 static char SDAC_P_8[] = "1001010000";
 static char SDAC_P_9[] = "0101001000";
 static char SDAC_P_10[] = "110110010000";
 static char SDAC_P_11[] = "10000000000";
 static char SDAC_P_12[] = "1010100001001000";
 // SDAC_M_0 is the empty string
 static char SDAC_M_0[] = "";
 static char SDAC_M_1[] = "000";
 static char SDAC_M_2[] = "1010";
 static char SDAC_M_3[] = "100010";
 static char SDAC_M_4[] = "0010000";
 static char SDAC_M_5[] = "110000000";
 static char SDAC_M_6[] = "1010101010";
 static char SDAC_M_7[] = "1010001000";
 static char SDAC_M_8[] = "1001000000";
 static char SDAC_M_9[] = "0100001000";
 static char SDAC_M_10[] ="101101010000"; 
 static char SDAC_M_11[] = "100100010010";
 static char SDAC_M_12[] = "010100011000";
 static char SDAC_M_13[] = "101010000001";
 static char SDAC_M_14[] = "010100001000";
 static char SDAC_M_15[] = "1101010010000";
 static char SDAC_M_16[] = "1001010001010";
 static char SDAC_M_17[] = "101001000000101";
 // Lookup table of all 31 chain strings:
 // indices 0..12 hold SDAC_P_0..SDAC_P_12, indices 13..30 hold SDAC_M_0..SDAC_M_17.
 static char *SDACs[31] = {
 	SDAC_P_0, SDAC_P_1, SDAC_P_2, SDAC_P_3, SDAC_P_4, 
 	SDAC_P_5, SDAC_P_6, SDAC_P_7, SDAC_P_8, SDAC_P_9, 
 	SDAC_P_10, SDAC_P_11, SDAC_P_12, 
 	SDAC_M_0, SDAC_M_1, SDAC_M_2, SDAC_M_3, SDAC_M_4, 
 	SDAC_M_5, SDAC_M_6, SDAC_M_7, SDAC_M_8, SDAC_M_9, 
 	SDAC_M_10, SDAC_M_11, SDAC_M_12, SDAC_M_13, SDAC_M_14, 
 	SDAC_M_15, SDAC_M_16, SDAC_M_17
 	};
 // LENGTHS[i] is the number of characters of SDACs[i] (terminating NUL excluded);
 // the 31 entries follow the same order as the SDACs table.
 static int LENGTHS[] =	{
 1, 2, 3, 4, 5, 6, 6, 10, 10, 10, 12, 11, 16, 0, 3, 4, 6, 7, 9, 10, 10, 10, 10, 12, 12, 12, 12, 12, 13, 13, 15
 	};
 #endif
--- a/src/ec/ref/include/tedwards.h
+++ b/src/ec/ref/include/tedwards.h
@@ -0,0 +1,28 @@
 #ifndef TEDWARDS_H
 #define TEDWARDS_H
 #include <fp2.h>
 #include "ec.h"
 // a*x^2+y^2=1+d*x^2*y^2
 // Point on the curve above, stored as four fp2_t coordinates.
 // NOTE(review): together with t = x*y/z this looks like extended projective
 // coordinates (affine point (x/z, y/z)) — confirm against tedwards.c.
 typedef struct ted_point_t {
    fp2_t x;
    fp2_t y;
    fp2_t z;
    fp2_t t; // t = x*y/z
 } ted_point_t;
 // Point operations. In every prototype the first argument is the only non-const
 // pointer (presumably the output — TODO confirm); the remaining arguments are read-only.
 void ted_init(ted_point_t* P);
 bool is_ted_equal(ted_point_t const* P1, ted_point_t const* P2);
 void copy_ted_point(ted_point_t* P, ted_point_t const* Q);
 void ted_neg(ted_point_t* Q, ted_point_t const* P);
 void ted_dbl(ted_point_t* Q, ted_point_t const* P, ec_curve_t const* E);
 void ted_add(ted_point_t* S, ted_point_t const* P, ted_point_t const* Q, ec_curve_t const* E);
 // Conversions between the Montgomery types (ec_curve_t, ec_point_t) and ted_point_t
 void mont_to_ted(ec_curve_t* E, ec_curve_t const* A);
 void mont_to_ted_point(ted_point_t* Q, ec_point_t const* P, ec_curve_t const* A);
 void ted_to_mont_point(ec_point_t* Q, ted_point_t const* P);
 #endif
--- a/src/ec/ref/lvl1/CMakeLists.txt
+++ b/src/ec/ref/lvl1/CMakeLists.txt
@@ -0,0 +1,17 @@
 # EC library for the current variant, built from the shared sources in ${ECX_DIR}.
 # The source list stays available as SOURCE_FILES_EC_<VARIANT>_REF.
 set(SOURCE_FILES_EC_${SVARIANT_UPPER}_REF
   ${ECX_DIR}/poly-mul.c
   ${ECX_DIR}/poly-redc.c
   ${ECX_DIR}/ec.c
   ${ECX_DIR}/tedwards.c
   ${ECX_DIR}/kps.c
   ${ECX_DIR}/xisog.c
   ${ECX_DIR}/xeval.c
   ${ECX_DIR}/isog_chains.c
   ${ECX_DIR}/basis.c
 )
 add_library(
   ${LIB_EC_${SVARIANT_UPPER}}
   ${SOURCE_FILES_EC_${SVARIANT_UPPER}_REF}
 )
 # Include paths and optimisation flags are private to this target.
 target_include_directories(
   ${LIB_EC_${SVARIANT_UPPER}}
   PRIVATE
     ${INC_INTBIG}
     ${INC_PRECOMP_${SVARIANT_UPPER}}
     ${INC_PUBLIC}
     ${INC_GF_${SVARIANT_UPPER}}
     ${INC_COMMON}
     ${INC_EC}
 )
 target_compile_options(
   ${LIB_EC_${SVARIANT_UPPER}}
   PRIVATE
     ${C_OPT_FLAGS}
 )
 # Test drivers for this variant live in the test/ subdirectory.
 add_subdirectory(test)
--- a/src/ec/ref/lvl1/test/CMakeLists.txt
+++ b/src/ec/ref/lvl1/test/CMakeLists.txt
@@ -0,0 +1,36 @@
 # Test drivers for the EC module of this variant, built from ${ECX_DIR}/test.
 # Libraries are linked with an explicit PRIVATE keyword (executables have no
 # consumers, and the keyword-less signature has legacy semantics).
 add_executable(fp2.test_${SVARIANT_LOWER} ${ECX_DIR}/test/fp2-test.c)
 	target_include_directories(fp2.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include ../include ${INC_EC} ${INC_COMMON})
 	target_link_libraries(fp2.test_${SVARIANT_LOWER} PRIVATE ${LIB_GF_${SVARIANT_UPPER}})
 add_executable(poly-mul.test_${SVARIANT_LOWER} ${ECX_DIR}/test/poly-mul-test.c)
 	target_include_directories(poly-mul.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include ${INC_EC} ${INC_COMMON})
 	target_link_libraries(poly-mul.test_${SVARIANT_LOWER} PRIVATE ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
 add_executable(poly-redc.test_${SVARIANT_LOWER} ${ECX_DIR}/test/poly-redc-test.c)
 	target_include_directories(poly-redc.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include  ${INC_EC} ${INC_COMMON})
 	target_link_libraries(poly-redc.test_${SVARIANT_LOWER} PRIVATE ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
 add_executable(mont.test_${SVARIANT_LOWER} ${ECX_DIR}/test/mont-test.c)
 	target_include_directories(mont.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include ${INC_EC} ${INC_COMMON} .)
 	target_link_libraries(mont.test_${SVARIANT_LOWER} PRIVATE ${LIB_PRECOMP_${SVARIANT_UPPER}} ${LIB_INTBIG} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
 add_executable(ec.test_${SVARIANT_LOWER} ${ECX_DIR}/test/ec-test.c ${ECX_DIR}/test/test_extras.c)
 	target_include_directories(ec.test_${SVARIANT_LOWER} PUBLIC ${ECX_DIR}/test ${INC_GF_${SVARIANT_UPPER}} ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include ${INC_EC} ${INC_COMMON} .)
 	target_link_libraries(ec.test_${SVARIANT_LOWER} PRIVATE ${LIB_PRECOMP_${SVARIANT_UPPER}} ${LIB_INTBIG} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
 add_executable(velu.test_${SVARIANT_LOWER} ${ECX_DIR}/test/velu-test.c)
 	target_include_directories(velu.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include ${INC_EC} ${INC_COMMON} .)
 	target_link_libraries(velu.test_${SVARIANT_LOWER} PRIVATE ${LIB_PRECOMP_${SVARIANT_UPPER}} ${LIB_INTBIG} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
 add_executable(isog.test_${SVARIANT_LOWER} ${ECX_DIR}/test/isog-test.c)
 	target_include_directories(isog.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include ${INC_EC} ${INC_COMMON} .)
 	target_link_libraries(isog.test_${SVARIANT_LOWER} PRIVATE ${LIB_PRECOMP_${SVARIANT_UPPER}} ${LIB_INTBIG} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
 # CTest registration. The NAME/COMMAND signature resolves the executable target to
 # its built location; the legacy positional form does not support target names.
 # Every test receives ${SQISIGN_TEST_REPS}; ec.test additionally gets the literal argument `test`.
 add_test(NAME ec_fp2.test_${SVARIANT_LOWER} COMMAND fp2.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
 add_test(NAME ec_poly-mul.test_${SVARIANT_LOWER} COMMAND poly-mul.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
 add_test(NAME ec_poly-redc.test_${SVARIANT_LOWER} COMMAND poly-redc.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
 add_test(NAME ec_mont.test_${SVARIANT_LOWER} COMMAND mont.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
 add_test(NAME ec_ec.test_${SVARIANT_LOWER} COMMAND ec.test_${SVARIANT_LOWER} test ${SQISIGN_TEST_REPS})
 add_test(NAME ec_velu.test_${SVARIANT_LOWER} COMMAND velu.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
 add_test(NAME ec_isog.test_${SVARIANT_LOWER} COMMAND isog.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
--- a/src/ec/ref/lvl1/test/ec-tests.h
+++ b/src/ec/ref/lvl1/test/ec-tests.h
@@ -0,0 +1,400 @@
 #ifndef EC_TESTS_H
 #define EC_TESTS_H
 #include "test_extras.h"
 #include <stdio.h>
 #include <string.h>
 #include <bench.h>       //////// NOTE: enable later
 #include "test-basis.h"
 #include "ec_params.h"
 // Global constants
 extern const digit_t p[NWORDS_FIELD];
 // Benchmark and test parameters  
 static int BENCH_LOOPS = 1000;       // Number of iterations per bench
 static int TEST_LOOPS  = 512;       // Number of iterations per test
 // Known-answer tests for the x-only arithmetic primitives xDBL, xADD, xMUL and xDBLMUL.
 // Each step converts hard-coded inputs to Montgomery form, runs one primitive, normalises
 // the result (x/z, converted back from Montgomery form) and compares it word-by-word with
 // a hard-coded expected value. The first mismatch jumps to out0.
 // Returns true when every comparison matches, false otherwise.
 bool ec_test()
 { // Tests for ecc arithmetic
    bool OK = true;
    int passed;
    ec_point_t P = {0}, Q = {0}, R = {0}, S = {0}, SS = {0}, PQ = {0};
    ec_point_t AC = {0};
    digit_t k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
    printf("Testing ecc functions: \n\n"); 
    // Point doubling
    passed = 1;
    P.x.re[0] = 0xDFD70ED0861BD329; P.x.re[1] = 0x20ACD3758C7F5540; P.x.re[2] = 0x3DCCDC007277F80A; P.x.re[3] = 0x18D6D2A22981DCE1;
    P.x.im[0] = 0x3C23730A3F08F38C; P.x.im[1] = 0x98BB973AFD3D954D; P.x.im[2] = 0x8D98ADFC2829AE8A; P.x.im[3] = 0x21A2464D6369AFBA;
    P.z.re[0] = 0x01;
    // AC is declared as a point but carries the curve data: x stays zero, z is set to one
    AC.z.re[0] = 0x01;
    fp2_tomont(&AC.z, &AC.z);
    fp2_tomont(&R.x, &P.x);
    fp2_tomont(&R.z, &P.z);
    xDBL(&S, &R, &AC);
    fp2_copy(&SS.x, &S.x);    // Copy of S = SS <- 2P 
    fp2_copy(&SS.z, &S.z);
    // Normalise S to x/z and leave Montgomery form before comparing
    fp2_inv(&S.z);
    fp2_mul(&S.x, &S.x, &S.z);
    fp2_frommont(&S.x, &S.x);
    // Expected value for the doubling
    R.x.re[0] = 0x5950EE0A4AF90FC8; R.x.re[1] = 0x16488065A0A98B08; R.x.re[2] = 0xCE65322229DA0FD1; R.x.re[3] = 0x270A35FF781EE204;
    R.x.im[0] = 0x564447FD9EC57F6B; R.x.im[1] = 0x2EE24E984294F729; R.x.im[2] = 0x53A6C7360E972C71; R.x.im[3] = 0x4FCF4B9928A7C7E;
    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2)!=0) { passed=0; goto out0; }
    // Point addition: xADD on SS (the doubled point), Q and PQ
    Q.x.re[0] = 0xC46076A670C70053; Q.x.re[1] = 0x97517AFA3AB9ED13; Q.x.re[2] = 0x349644C942EDF993; Q.x.re[3] = 0xBB4A4DB6F29AF9E;
    Q.x.im[0] = 0x8B47629FB5A15BB0; Q.x.im[1] = 0x4EC6E809953C1A10; Q.x.im[2] = 0x1F83F0EC6CBB84D6; Q.x.im[3] = 0x1D8417C1D33265D3;
    Q.z.re[0] = 0x01;
    PQ.x.re[0] = 0x853F66D11BE5534F; PQ.x.re[1] = 0x27C8FD4E52D03D4A; PQ.x.re[2] = 0xF88EA78D0A0C29D2; PQ.x.re[3] = 0x2F6DFB07D397A067;
    PQ.x.im[0] = 0xE8DBC4AA34434BA1; PQ.x.im[1] = 0x7A73AE182636F8A0; PQ.x.im[2] = 0x419EC260137868EB; PQ.x.im[3] = 0x129B3E301703D43F;
    PQ.z.re[0] = 0x01;
    fp2_tomont(&S.x, &Q.x);
    fp2_tomont(&S.z, &Q.z);
    // PQ is converted in place and stays in Montgomery form for the xDBLMUL step below
    fp2_tomont(&PQ.x, &PQ.x);
    fp2_tomont(&PQ.z, &PQ.z);
    xADD(&S, &SS, &S, &PQ);
    fp2_inv(&S.z);
    fp2_mul(&S.x, &S.x, &S.z);
    fp2_frommont(&S.x, &S.x);
    R.x.re[0] = 0xED0BEB8F93AB4FF9; R.x.re[1] = 0x27CF508B80CD49BF; R.x.re[2] = 0x38A6134DFA04B2BA; R.x.re[3] = 0x27B4CB15E109EF1F;
    R.x.im[0] = 0x6F731BA6FD227BDE; R.x.im[1] = 0x14C12335341167F8; R.x.im[2] = 0xECA7B60F7866E27A; R.x.im[3] = 0x2A7A79A152880457;
    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
    // Scalar multiplication of P by the single-word scalar 126
    fp2_tomont(&R.x, &P.x);
    fp2_tomont(&R.z, &P.z);
    k[0] = 126;
    xMUL(&S, &R, k, (ec_curve_t*)&AC);
    fp2_inv(&S.z);
    fp2_mul(&S.x, &S.x, &S.z);
    fp2_frommont(&S.x, &S.x);
    R.x.re[0] = 0xDE80F87A1203A147; R.x.re[1] = 0xD59E1215928A3B2D; R.x.re[2] = 0xD5A67F83A5A8CE46; R.x.re[3] = 0xA11E162488C9CDF;
    R.x.im[0] = 0x9417D0D79A26741B; R.x.im[1] = 0x8B1F47D6F0FE5EEC; R.x.im[2] = 0xE52188DCB054CE36; R.x.im[3] = 0x1A8075A6C3148AB3;
    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
    // Scalar multiplication of P by a four-word scalar
    fp2_tomont(&R.x, &P.x);
    fp2_tomont(&R.z, &P.z);
    k[0] = 0xE77AD6B6C6B2D8CD;
    k[1] = 0xDE43A0B600F38D12;
    k[2] = 0xA35F4A7897E17CE2;
    k[3] = 0x10ACB62E614D1237;
    xMUL(&S, &R, k, (ec_curve_t*)&AC);
    fp2_inv(&S.z);
    fp2_mul(&S.x, &S.x, &S.z);
    fp2_frommont(&S.x, &S.x);
    R.x.re[0] = 0xD3938B0A68A3E7C0; R.x.re[1] = 0xE0667113208A0595; R.x.re[2] = 0x258F314C84E9CB60; R.x.re[3] = 0x14984BA7CA59AB71;
    R.x.im[0] = 0xFE728423EE3BFEF4; R.x.im[1] = 0xBF68C42FE21AE0E4; R.x.im[2] = 0xA8FAF9C9528609CA; R.x.im[3] = 0x1225EC77A1DC0285;
    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
    // Double-scalar multiplication: xDBLMUL on R (= Q), SS (= 2P, normalised below) and PQ
    fp2_tomont(&R.x, &Q.x);
    fp2_tomont(&R.z, &Q.z);
    k[0] = 0xE77AD6B6C6B2D8CD;
    k[1] = 0xDE43A0B600F38D12;
    k[2] = 0xA35F4A7897E17CE2;
    k[3] = 0x10ACB62E614D1237;
    l[0] = 0x34AB78B6C6B2D8C0;
    l[1] = 0xDE6B2D8CD00F38D1;
    l[2] = 0xA35F4A7897E17CE2;
    l[3] = 0x20ACF4A789614D13;
    // Normalise SS: x <- x/z, then overwrite z with R.z (one in Montgomery form)
    fp2_inv(&SS.z);
    fp2_mul(&SS.x, &SS.x, &SS.z);
    fp2_copy(&SS.z, &R.z);
    xDBLMUL(&S, &R, k, &SS, l, &PQ, (ec_curve_t*)&AC);
    fp2_inv(&S.z);
    fp2_mul(&S.x, &S.x, &S.z);
    fp2_frommont(&S.x, &S.x);
    R.x.re[0] = 0x554E1ADC609B992F; R.x.re[1] = 0xE407D961F8CC4C42; R.x.re[2] = 0x1CF626AFED5A68CE; R.x.re[3] = 0x6D02692EE110483;
    R.x.im[0] = 0x16FB094E831C8997; R.x.im[1] = 0xFDE4ECF31DC5F702; R.x.im[2] = 0x89303D868DFAD7B4; R.x.im[3] = 0xC91ACE81346F22D;
    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
 out0:
    // Single report point for all the checks above
    if (passed==1) printf("  ECC arithmetic tests ............................................ PASSED");
    else { printf("  ECC arithmetic tests... FAILED"); printf("\n"); return false; }
    printf("\n");
    return OK;
 }
 // Round-trip tests for ec_dlog_2 and ec_dlog_3: for TEST_LOOPS pairs (k, l) the point
 // R = xDBLMUL(P, k, Q, l) is built from the hard-coded basis of test-basis.h, the dlog
 // routine is asked for the scalars, and the answer is compared with the expected pair.
 // Returns true when both loops match every time, false at the first mismatch.
 bool dlog_test()
 { // Tests for dlog
    bool OK = true;
    int passed;
    ec_point_t P = {0}, Q = {0}, R = {0}, S = {0}, SS = {0}, PQ = {0};
    ec_curve_t AC = {0};
    ec_basis_t PQ2;
    digit_t scalarP[NWORDS_ORDER], scalarQ[NWORDS_ORDER], k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
    digit_t kt[NWORDS_ORDER], lt[NWORDS_ORDER], f1[NWORDS_ORDER] = {0}, f2[NWORDS_ORDER] = {0}, zero[NWORDS_ORDER] = {0}, tpFdiv2[NWORDS_ORDER] = {0}, tpF[NWORDS_ORDER] = {0};
    printf("\n--------------------------------------------------------------------------------------------------------\n\n");
    printf("Testing dlog functions: \n\n");
    // dlog2 testing
    passed = 1;
    // Load the basis (xP2, xQ2, xPQ2) with z = 1, everything in Montgomery form
    fp2_tomont(&P.x, &xP2);
    fp_mont_setone(P.z.re);
    fp_set(P.z.im, 0);
    fp2_tomont(&Q.x, &xQ2);
    fp_mont_setone(Q.z.re);
    fp_set(Q.z.im, 0);
    fp2_tomont(&PQ.x, &xPQ2);
    fp_mont_setone(PQ.z.re);
    fp_set(PQ.z.im, 0);
    // Curve with A = 0 (zero-initialised) and C = 1
    AC.C.re[0] = 0x01;
    // f1 and f2 are the bounds used below to derive the expected scalars
    fp_copy(f1, TWOpFm1);
    fp_copy(f2, TWOpF);
    fp2_tomont(&AC.C, &AC.C);
    copy_point(&PQ2.P, &P);
    copy_point(&PQ2.Q, &Q);
    copy_point(&PQ2.PmQ, &PQ);
    // Starting scalars; each iteration decrements k by 1 and l by 2
    k[0] = 0xFFFFFFFFFFFFFFFF;
    k[1] = 0x00000000000007FF;
    l[0] = 0xFFFFFFFFFFFFFFFE;
    l[1] = 0x00000000000007FF;
    for (int n = 0; n < TEST_LOOPS; n++)
    {
        k[0] -= 1;
        l[0] -= 2;
        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
        ec_dlog_2(scalarP, scalarQ, &PQ2, &R, &AC);
        memcpy(kt, k, NWORDS_ORDER*RADIX/8);
        memcpy(lt, l, NWORDS_ORDER*RADIX/8);
        // Expected pair: (k, l) itself, or - when k exceeds f1, or l exceeds f1 with k equal
        // to zero or f1 - the pair with every non-zero scalar replaced through sub_test with f2.
        // NOTE(review): presumably this is (f2 - k, f2 - l), i.e. the negated pair, which
        // x-only arithmetic cannot tell apart from (k, l) - confirm sub_test's operand order
        // and that compare_words() == 1 means "greater than".
        if (compare_words(k, f1, NWORDS_ORDER) == 1 ||
           (compare_words(l, f1, NWORDS_ORDER) == 1 && (compare_words(k, zero, NWORDS_ORDER) == 0 || compare_words(k, f1, NWORDS_ORDER) == 0))) {
            if (compare_words(k, zero, NWORDS_ORDER) != 0) {
                sub_test(kt, f2, kt, NWORDS_ORDER);
            }
            if (compare_words(l, zero, NWORDS_ORDER) != 0) {
                sub_test(lt, f2, lt, NWORDS_ORDER);
            }
        }
        if (compare_words((digit_t*)scalarP, (digit_t*)kt, NWORDS_ORDER) != 0 || compare_words((digit_t*)scalarQ, (digit_t*)lt, NWORDS_ORDER) != 0) { passed = 0; break; }
    }
    if (passed == 1) printf("  dlog2 tests ..................................................... PASSED");
    else { printf("  dlog2 tests... FAILED"); printf("\n"); return false; }
    printf("\n");
    // dlog3 testing
    passed = 1;
    // Same setup with the basis (xP3, xQ3, xPQ3); PQ2 is reused as the basis container
    fp2_tomont(&P.x, &xP3);
    fp_mont_setone(P.z.re);
    fp_set(P.z.im, 0);
    fp2_tomont(&Q.x, &xQ3);
    fp_mont_setone(Q.z.re);
    fp_set(Q.z.im, 0);
    fp2_tomont(&PQ.x, &xPQ3);
    fp_mont_setone(PQ.z.re);
    fp_set(PQ.z.im, 0);
    // AC.C was converted to Montgomery form above, so it is reset to plain 1 before converting again
    AC.C.re[0] = 0x01;
    fp_copy(tpFdiv2, THREEpFdiv2);
    fp_copy(tpF, THREEpF);
    fp2_tomont(&AC.C, &AC.C);
    copy_point(&PQ2.P, &P);
    copy_point(&PQ2.Q, &Q);
    copy_point(&PQ2.PmQ, &PQ);
    // Single-word starting scalars (the second word left over from the dlog2 loop is cleared)
    k[1] = 0;
    l[1] = 0;
    k[0] = 0x02153E468B91C6D1;
    l[0] = 0x02153E468B91C6D0;
    for (int n = 0; n < TEST_LOOPS; n++)
    {
        k[0] -= 1;
        l[0] -= 2;
        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
        // NOTE(review): this call passes five arguments (the last one being &AC); confirm it
        // matches the ec_dlog_3 prototype this test is compiled against.
        ec_dlog_3(scalarP, scalarQ, &PQ2, &R, &AC);
        memcpy(kt, k, NWORDS_ORDER*RADIX/8);
        memcpy(lt, l, NWORDS_ORDER*RADIX/8);
        // Same normalisation as in the dlog2 loop, with bounds tpFdiv2 / tpF; here the second
        // alternative only covers k equal to zero.
        if (compare_words(k, tpFdiv2, NWORDS_ORDER) == 1 ||
           (compare_words(l, tpFdiv2, NWORDS_ORDER) == 1 && compare_words(k, zero, NWORDS_ORDER) == 0)) {
            if (compare_words(k, zero, NWORDS_ORDER) != 0) {
                sub_test(kt, tpF, kt, NWORDS_ORDER);
            }
            if (compare_words(l, zero, NWORDS_ORDER) != 0) {
                sub_test(lt, tpF, lt, NWORDS_ORDER);
            }
        }
        if (compare_words((digit_t*)scalarP, (digit_t*)kt, NWORDS_ORDER) != 0 || compare_words((digit_t*)scalarQ, (digit_t*)lt, NWORDS_ORDER) != 0) { passed = 0; break; }
    }
    if (passed == 1) printf("  dlog3 tests ..................................................... PASSED");
    else { printf("  dlog3 tests... FAILED"); printf("\n"); return false; }
    printf("\n");
    return OK;
 }
 // Benchmarks for the x-only primitives xDBL, xADD, xMUL and xDBLMUL.
 // Each primitive is called BENCH_LOOPS times between two cpucycles() readings and the
 // average cycle count is printed. Always returns true.
 bool ec_run()
 {
    bool OK = true;
    int n;
    unsigned long long cycles, cycles1, cycles2;
    // All operands are zero-initialised: timing the primitives on indeterminate stack
    // contents (as before) reads uninitialised objects, which is undefined behaviour.
    ec_point_t P = {0}, Q = {0}, R = {0}, PQ = {0}, AC = {0};
    digit_t k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
    printf("\n--------------------------------------------------------------------------------------------------------\n\n");
    printf("Benchmarking ecc arithmetic: \n\n");
    // Point doubling
    cycles = 0;
    for (n = 0; n < BENCH_LOOPS; n++)
    {
        cycles1 = cpucycles();
        xDBL(&Q, &P, &AC);
        cycles2 = cpucycles();
        cycles = cycles + (cycles2 - cycles1);
    }
    // %llu: the counters are unsigned long long
    printf("  Montgomery x-only doubling runs in .............................. %7llu cycles", cycles/BENCH_LOOPS);
    printf("\n");
    // Point addition
    cycles = 0;
    for (n = 0; n < BENCH_LOOPS; n++)
    {
        cycles1 = cpucycles();
        xADD(&R, &Q, &P, &PQ);
        cycles2 = cpucycles();
        cycles = cycles + (cycles2 - cycles1);
    }
    printf("  Montgomery x-only addition runs in .............................. %7llu cycles", cycles/BENCH_LOOPS);
    printf("\n");
    // Point multiplication (AC is declared as a point and reinterpreted as the curve)
    cycles = 0;
    for (n = 0; n < BENCH_LOOPS; n++)
    {
        cycles1 = cpucycles();
        xMUL(&Q, &P, k, (ec_curve_t*)&AC);
        cycles2 = cpucycles();
        cycles = cycles + (cycles2 - cycles1);
    }
    printf("  Montgomery x-only scalar multiplication runs in ................. %7llu cycles", cycles/BENCH_LOOPS);
    printf("\n");
    // Double-scalar point multiplication
    cycles = 0;
    for (n = 0; n < BENCH_LOOPS; n++)
    {
        cycles1 = cpucycles();
        xDBLMUL(&R, &P, k, &Q, l, &PQ, (ec_curve_t*)&AC);
        cycles2 = cpucycles();
        cycles = cycles + (cycles2 - cycles1);
    }
    printf("  Montgomery x-only double-scalar multiplication runs in .......... %7llu cycles", cycles/BENCH_LOOPS);
    printf("\n");
    return OK;
 }
 // Benchmarks for ec_dlog_2 and ec_dlog_3.
 // For each of BENCH_LOOPS iterations a point R = xDBLMUL(P, k, Q, l) is built from
 // scalars filled by fprandom_test(); only the dlog call itself is timed. The average
 // cycle count is printed. Always returns true.
 bool dlog_run()
 {
    bool OK = true;
    int n;
    unsigned long long cycles, cycles1, cycles2;
    ec_point_t P = {0}, Q = {0}, R = {0}, PQ = {0};
    ec_curve_t AC = {0};
    ec_basis_t PQ2;
    digit_t scalarP[NWORDS_ORDER], scalarQ[NWORDS_ORDER], k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
    printf("\n--------------------------------------------------------------------------------------------------------\n\n");
    printf("Benchmarking dlog2: \n\n");
    // dlog2 computation: load the basis (xP2, xQ2, xPQ2) with z = 1 in Montgomery form
    fp2_tomont(&P.x, &xP2);
    fp_mont_setone(P.z.re);
    fp_set(P.z.im, 0);
    fp2_tomont(&Q.x, &xQ2);
    fp_mont_setone(Q.z.re);
    fp_set(Q.z.im, 0);
    fp2_tomont(&PQ.x, &xPQ2);
    fp_mont_setone(PQ.z.re);
    fp_set(PQ.z.im, 0);
    // Curve with A = 0 (zero-initialised) and C = 1
    AC.C.re[0] = 0x01;
    fp2_tomont(&AC.C, &AC.C);
    copy_point(&PQ2.P, &P);
    copy_point(&PQ2.Q, &Q);
    copy_point(&PQ2.PmQ, &PQ);
    cycles = 0;
    for (n = 0; n < BENCH_LOOPS; n++)
    {
        // Building the input point is kept outside the timed region
        fprandom_test(k); fprandom_test(l);
        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
        cycles1 = cpucycles();
        ec_dlog_2(scalarP, scalarQ, &PQ2, &R, &AC);
        cycles2 = cpucycles();
        cycles = cycles + (cycles2 - cycles1);
    }
    // %llu: the counters are unsigned long long
    printf("  dlog2 runs in ................................................... %7llu cycles", cycles/BENCH_LOOPS);
    printf("\n");
    // dlog3 computation: same setup with the basis (xP3, xQ3, xPQ3); AC is reused unchanged
    fp2_tomont(&P.x, &xP3);
    fp_mont_setone(P.z.re);
    fp_set(P.z.im, 0);
    fp2_tomont(&Q.x, &xQ3);
    fp_mont_setone(Q.z.re);
    fp_set(Q.z.im, 0);
    fp2_tomont(&PQ.x, &xPQ3);
    fp_mont_setone(PQ.z.re);
    fp_set(PQ.z.im, 0);
    copy_point(&PQ2.P, &P);
    copy_point(&PQ2.Q, &Q);
    copy_point(&PQ2.PmQ, &PQ);
    cycles = 0;
    for (n = 0; n < BENCH_LOOPS; n++)
    {
        fprandom_test(k); fprandom_test(l);
        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
        cycles1 = cpucycles();
        ec_dlog_3(scalarP, scalarQ, &PQ2, &R, &AC);
        cycles2 = cpucycles();
        cycles = cycles + (cycles2 - cycles1);
    }
    printf("  dlog3 runs in ................................................... %7llu cycles", cycles/BENCH_LOOPS);
    printf("\n");
    return OK;
 }
 #endif
--- a/src/ec/ref/lvl1/test/test-basis.h
+++ b/src/ec/ref/lvl1/test/test-basis.h
@@ -0,0 +1,24 @@
 #ifndef TEST_BASIS_H
 #define TEST_BASIS_H
 #include "fp2.h"
 // Hard-coded field elements used by the tests, each given as {re limbs},{im limbs}.
 // The tests in ec-tests.h load xP*, xQ* and xPQ* as the x-coordinates of the P, Q and PmQ
 // entries of a basis, converting them to Montgomery form first.
 // Full-torsion basis for A=0 (excluding 2^f and huge prime factors)
 const fp2_t xPA = {{0x7505815fb30f099e,0x89e78dbb4294c8df,0x7db9b4b1f7716d7b,0x13fcd4c87af65308},{0x93533c1017088fd4,0x6df9e398a1bb4cb1,0xc928f082be2e2b4c,0x17aa7e2906bef0af}};
 const fp2_t xQA = {{0xe96336b75eb5a505,0x5640cecad0ad7b5a,0x1394f0771bc58ac1,0x18d92124656d68d9},{0xa54e8e24605754f0,0xe52de9790bbe4bb9,0x3bf9b7833f62e255,0x277a07644ec4f0e2}};
 const fp2_t xPQA = {{0xc8fcceb408e3444c,0x9f8ca4d2c05c3287,0x259e496f17c0f529,0x0eb18a51c2a3dd1a},{0x1014dbe2534b8310,0x6b035ee3c371ea12,0x8354ecb4c111db6d,0x178259b78fe08093}};
 // NOTE(review): the B basis carries no comment of its own; presumably a second full-torsion basis — confirm
 const fp2_t xPB = {{0xbd0a2f0c9a5378ca,0x74af17405042203d,0x0ccdcb4b7f0b8c15,0x314c70951a92d8bf},{0xe889e6bc5f9842af,0xefb0edbb5e266ab3,0x7bfb9d05f1ba6962,0x0a5f3f4fe6f16514}};
 const fp2_t xQB = {{0x137e215438caaf3b,0xc4403ee1b69f1382,0x2b5783edcefa7246,0x3015572698262f66},{0x8e88e4293f84536e,0x8d6dbc277f85ff77,0xb3f17b53b01da916,0x08dd3f4976c5dad1}};
 const fp2_t xPQB = {{0xf0c2701a7050d9b9,0xc8fdb069c0234d3a,0x9ec25780f2b101a8,0x221a0565053e8ff4},{0xd8513bf6a05910ae,0x47ff2422258dfb3a,0xb98ccceae31ac407,0x21bcc8e659aaa1b3}};
 // 2^f-torsion basis for A=0
 const fp2_t xP2 = {{0xfc93bac7df77fd30,0xa8d37e10783215bd,0x4bd2ece4f148039b,0x2bd5b83f5f8c09fb},{0x444112970b59f12f,0x557b8b9beb55c276,0x633f97cd9464df6c,0x00a1b21b593a2dfd}};
 const fp2_t xQ2 = {{0x6b4289960273222c,0xa290d8eb8e343a04,0x0c0a333f80a0ed68,0x31a58910e276aff0},{0xb7ca615ad7473865,0xeb6f72f20794f050,0x2941c3fe3203b94f,0x32ad5cbe915e467b}};
 const fp2_t xPQ2 = {{0xac9f90005e47b095,0x47eafdafd5168836,0xb88aac8334acdad0,0x1a5cf52a20f665b4},{0x4baa70fb1f5fa99c,0xffb7ddb12c87f1a3,0xdd3a229d370a8484,0x1e992ad0a14baf03}};
 // 3^g-torsion basis for A=0
 const fp2_t xP3 = {{0x8cf496c2722f340d,0x3e329c5a507ad39c,0xa0c7caa3e4537e25,0x1371d43cf97de48e},{0xa4b94c97b8149e7d,0xd290853fa14704c7,0x158b854173c1b289,0x04c6dcda7872c23f}};
 const fp2_t xQ3 = {{0x0f6380fd4c963950,0x101a22a245c4f563,0x601d3e30b21a5f43,0x0becd5f73b067949},{0xd364123c6806057e,0x8ff24fca9e060260,0x3b52df5bfb817901,0x30950462489b838f}};
 const fp2_t xPQ3 = {{0xe04cab7169e64a82,0x56df573ea9295c19,0x06cbb6af8e341990,0x0f1046ca03017ca1},{0x2dac3457c35be728,0x2f59af21113f25f9,0xa0dc4f54eec2715d,0x102ecf9a7ff2f2ff}};
 #endif
--- a/src/ec/ref/lvl3/CMakeLists.txt
+++ b/src/ec/ref/lvl3/CMakeLists.txt
@@ -0,0 +1,17 @@
 # EC library for the current variant, built from the shared sources in ${ECX_DIR}.
 # The source list stays available as SOURCE_FILES_EC_<VARIANT>_REF.
 set(SOURCE_FILES_EC_${SVARIANT_UPPER}_REF
   ${ECX_DIR}/poly-mul.c
   ${ECX_DIR}/poly-redc.c
   ${ECX_DIR}/ec.c
   ${ECX_DIR}/tedwards.c
   ${ECX_DIR}/kps.c
   ${ECX_DIR}/xisog.c
   ${ECX_DIR}/xeval.c
   ${ECX_DIR}/isog_chains.c
   ${ECX_DIR}/basis.c
 )
 add_library(
   ${LIB_EC_${SVARIANT_UPPER}}
   ${SOURCE_FILES_EC_${SVARIANT_UPPER}_REF}
 )
 # Include paths and optimisation flags are private to this target.
 target_include_directories(
   ${LIB_EC_${SVARIANT_UPPER}}
   PRIVATE
     ${INC_INTBIG}
     ${INC_PRECOMP_${SVARIANT_UPPER}}
     ${INC_PUBLIC}
     ${INC_GF_${SVARIANT_UPPER}}
     ${INC_COMMON}
     ${INC_EC}
 )
 target_compile_options(
   ${LIB_EC_${SVARIANT_UPPER}}
   PRIVATE
     ${C_OPT_FLAGS}
 )
 # Test drivers for this variant live in the test/ subdirectory.
 add_subdirectory(test)
--- a/src/ec/ref/lvl3/test/CMakeLists.txt
+++ b/src/ec/ref/lvl3/test/CMakeLists.txt
@@ -0,0 +1,36 @@
 # Test drivers for the EC module of this variant, built from ${ECX_DIR}/test.
 # Libraries are linked with an explicit PRIVATE keyword (executables have no
 # consumers, and the keyword-less signature has legacy semantics).
 add_executable(fp2.test_${SVARIANT_LOWER} ${ECX_DIR}/test/fp2-test.c)
 	target_include_directories(fp2.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include ../include ${INC_EC} ${INC_COMMON})
 	target_link_libraries(fp2.test_${SVARIANT_LOWER} PRIVATE ${LIB_GF_${SVARIANT_UPPER}})
 add_executable(poly-mul.test_${SVARIANT_LOWER} ${ECX_DIR}/test/poly-mul-test.c)
 	target_include_directories(poly-mul.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include ${INC_EC} ${INC_COMMON})
 	target_link_libraries(poly-mul.test_${SVARIANT_LOWER} PRIVATE ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
 add_executable(poly-redc.test_${SVARIANT_LOWER} ${ECX_DIR}/test/poly-redc-test.c)
 	target_include_directories(poly-redc.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include  ${INC_EC} ${INC_COMMON})
 	target_link_libraries(poly-redc.test_${SVARIANT_LOWER} PRIVATE ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
 add_executable(mont.test_${SVARIANT_LOWER} ${ECX_DIR}/test/mont-test.c)
 	target_include_directories(mont.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include ${INC_EC} ${INC_COMMON} .)
 	target_link_libraries(mont.test_${SVARIANT_LOWER} PRIVATE ${LIB_PRECOMP_${SVARIANT_UPPER}} ${LIB_INTBIG} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
 add_executable(ec.test_${SVARIANT_LOWER} ${ECX_DIR}/test/ec-test.c ${ECX_DIR}/test/test_extras.c)
 	target_include_directories(ec.test_${SVARIANT_LOWER} PUBLIC ${ECX_DIR}/test ${INC_GF_${SVARIANT_UPPER}} ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include ${INC_EC} ${INC_COMMON} .)
 	target_link_libraries(ec.test_${SVARIANT_LOWER} PRIVATE ${LIB_PRECOMP_${SVARIANT_UPPER}} ${LIB_INTBIG} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
 add_executable(velu.test_${SVARIANT_LOWER} ${ECX_DIR}/test/velu-test.c)
 	target_include_directories(velu.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include ${INC_EC} ${INC_COMMON} .)
 	target_link_libraries(velu.test_${SVARIANT_LOWER} PRIVATE ${LIB_PRECOMP_${SVARIANT_UPPER}} ${LIB_INTBIG} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
 add_executable(isog.test_${SVARIANT_LOWER} ${ECX_DIR}/test/isog-test.c)
 	target_include_directories(isog.test_${SVARIANT_LOWER} PUBLIC ${INC_GF_${SVARIANT_UPPER}} ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}} ${PROJECT_SOURCE_DIR}/include  ../include ${INC_EC} ${INC_COMMON} .)
 	target_link_libraries(isog.test_${SVARIANT_LOWER} PRIVATE ${LIB_PRECOMP_${SVARIANT_UPPER}} ${LIB_INTBIG} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
 # CTest registration. The NAME/COMMAND signature resolves the executable target to
 # its built location; the legacy positional form does not support target names.
 # Every test receives ${SQISIGN_TEST_REPS}; ec.test additionally gets the literal argument `test`.
 add_test(NAME ec_fp2.test_${SVARIANT_LOWER} COMMAND fp2.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
 add_test(NAME ec_poly-mul.test_${SVARIANT_LOWER} COMMAND poly-mul.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
 add_test(NAME ec_poly-redc.test_${SVARIANT_LOWER} COMMAND poly-redc.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
 add_test(NAME ec_mont.test_${SVARIANT_LOWER} COMMAND mont.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
 add_test(NAME ec_ec.test_${SVARIANT_LOWER} COMMAND ec.test_${SVARIANT_LOWER} test ${SQISIGN_TEST_REPS})
 add_test(NAME ec_velu.test_${SVARIANT_LOWER} COMMAND velu.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
 add_test(NAME ec_isog.test_${SVARIANT_LOWER} COMMAND isog.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
--- a/src/ec/ref/lvl3/test/ec-tests.h
+++ b/src/ec/ref/lvl3/test/ec-tests.h
@@ -0,0 +1,400 @@
 #ifndef EC_TESTS_H
 #define EC_TESTS_H
 #include "test_extras.h"
 #include <stdio.h>
 #include <string.h>
 #include <bench.h>       //////// NOTE: enable later
 #include "test-basis.h"
 #include "ec_params.h"
 // Global constants
 // `p` is defined in another translation unit (presumably the field prime -- confirm);
 // none of the functions in this header reference it directly.
 extern const digit_t p[NWORDS_FIELD];
 // Benchmark and test parameters
 // Both are file-local, so every translation unit including this header gets its own copy.
 static int BENCH_LOOPS = 1000;       // Iterations timed per benchmark in ec_run() and dlog_run()
 static int TEST_LOOPS  = 512;       // Scalar pairs checked per part of dlog_test()
 // Placeholder self-test for the x-only curve arithmetic (xDBL, xADD, xMUL, xDBLMUL).
 // The function currently only prints a banner announcing "NOT IMPLEMENTED" and returns
 // true; every check lives in the block comment below, so all locals declared here are
 // unused while it stays disabled.
 // NOTE(review): the disabled vectors assign only limbs [0..3] of each coordinate, whereas
 // the constants in this level's test-basis.h carry six words per component. They look
 // like vectors for a four-word field and most likely have to be regenerated before the
 // block can be enabled at this level -- confirm.
 bool ec_test()
 { // Tests for ecc arithmetic
    bool OK = true;          // returned unchanged: nothing below can clear it at the moment
    int passed;              // only used by the disabled checks
    ec_point_t P = {0}, Q = {0}, R = {0}, S = {0}, SS = {0}, PQ = {0};
    ec_point_t AC = {0};     // the disabled code stores the curve here (AC.z = 1) and casts it to ec_curve_t* for xMUL/xDBLMUL
    digit_t k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
    printf("Testing ecc functions: (NOT IMPLEMENTED) \n\n"); 
 /*
    // Disabled known-answer tests. Each step converts its inputs with fp2_tomont, runs one
    // primitive, normalises x/z with fp2_inv + fp2_mul, converts back with fp2_frommont and
    // compares against a hard-coded expected x-coordinate placed in R.x.
    // Point doubling
    passed = 1;
    P.x.re[0] = 0xDFD70ED0861BD329; P.x.re[1] = 0x20ACD3758C7F5540; P.x.re[2] = 0x3DCCDC007277F80A; P.x.re[3] = 0x18D6D2A22981DCE1;
    P.x.im[0] = 0x3C23730A3F08F38C; P.x.im[1] = 0x98BB973AFD3D954D; P.x.im[2] = 0x8D98ADFC2829AE8A; P.x.im[3] = 0x21A2464D6369AFBA;
    P.z.re[0] = 0x01;
    AC.z.re[0] = 0x01;
    fp2_tomont(&AC.z, &AC.z);
    fp2_tomont(&R.x, &P.x);
    fp2_tomont(&R.z, &P.z);
    xDBL(&S, &R, &AC);
    fp2_copy(&SS.x, &S.x);    // Copy of S = SS <- 2P 
    fp2_copy(&SS.z, &S.z);
    fp2_inv(&S.z);
    fp2_mul(&S.x, &S.x, &S.z);
    fp2_frommont(&S.x, &S.x);
    R.x.re[0] = 0x5950EE0A4AF90FC8; R.x.re[1] = 0x16488065A0A98B08; R.x.re[2] = 0xCE65322229DA0FD1; R.x.re[3] = 0x270A35FF781EE204;
    R.x.im[0] = 0x564447FD9EC57F6B; R.x.im[1] = 0x2EE24E984294F729; R.x.im[2] = 0x53A6C7360E972C71; R.x.im[3] = 0x4FCF4B9928A7C7E;
    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2)!=0) { passed=0; goto out0; }
    Q.x.re[0] = 0xC46076A670C70053; Q.x.re[1] = 0x97517AFA3AB9ED13; Q.x.re[2] = 0x349644C942EDF993; Q.x.re[3] = 0xBB4A4DB6F29AF9E;
    Q.x.im[0] = 0x8B47629FB5A15BB0; Q.x.im[1] = 0x4EC6E809953C1A10; Q.x.im[2] = 0x1F83F0EC6CBB84D6; Q.x.im[3] = 0x1D8417C1D33265D3;
    Q.z.re[0] = 0x01;
    PQ.x.re[0] = 0x853F66D11BE5534F; PQ.x.re[1] = 0x27C8FD4E52D03D4A; PQ.x.re[2] = 0xF88EA78D0A0C29D2; PQ.x.re[3] = 0x2F6DFB07D397A067;
    PQ.x.im[0] = 0xE8DBC4AA34434BA1; PQ.x.im[1] = 0x7A73AE182636F8A0; PQ.x.im[2] = 0x419EC260137868EB; PQ.x.im[3] = 0x129B3E301703D43F;
    PQ.z.re[0] = 0x01;
    fp2_tomont(&S.x, &Q.x);
    fp2_tomont(&S.z, &Q.z);
    fp2_tomont(&PQ.x, &PQ.x);
    fp2_tomont(&PQ.z, &PQ.z);
    xADD(&S, &SS, &S, &PQ);
    fp2_inv(&S.z);
    fp2_mul(&S.x, &S.x, &S.z);
    fp2_frommont(&S.x, &S.x);
    R.x.re[0] = 0xED0BEB8F93AB4FF9; R.x.re[1] = 0x27CF508B80CD49BF; R.x.re[2] = 0x38A6134DFA04B2BA; R.x.re[3] = 0x27B4CB15E109EF1F;
    R.x.im[0] = 0x6F731BA6FD227BDE; R.x.im[1] = 0x14C12335341167F8; R.x.im[2] = 0xECA7B60F7866E27A; R.x.im[3] = 0x2A7A79A152880457;
    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
    fp2_tomont(&R.x, &P.x);
    fp2_tomont(&R.z, &P.z);
    k[0] = 126;
    xMUL(&S, &R, k, (ec_curve_t*)&AC);
    fp2_inv(&S.z);
    fp2_mul(&S.x, &S.x, &S.z);
    fp2_frommont(&S.x, &S.x);
    R.x.re[0] = 0xDE80F87A1203A147; R.x.re[1] = 0xD59E1215928A3B2D; R.x.re[2] = 0xD5A67F83A5A8CE46; R.x.re[3] = 0xA11E162488C9CDF;
    R.x.im[0] = 0x9417D0D79A26741B; R.x.im[1] = 0x8B1F47D6F0FE5EEC; R.x.im[2] = 0xE52188DCB054CE36; R.x.im[3] = 0x1A8075A6C3148AB3;
    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
    fp2_tomont(&R.x, &P.x);
    fp2_tomont(&R.z, &P.z);
    k[0] = 0xE77AD6B6C6B2D8CD;
    k[1] = 0xDE43A0B600F38D12;
    k[2] = 0xA35F4A7897E17CE2;
    k[3] = 0x10ACB62E614D1237;
    xMUL(&S, &R, k, (ec_curve_t*)&AC);
    fp2_inv(&S.z);
    fp2_mul(&S.x, &S.x, &S.z);
    fp2_frommont(&S.x, &S.x);
    R.x.re[0] = 0xD3938B0A68A3E7C0; R.x.re[1] = 0xE0667113208A0595; R.x.re[2] = 0x258F314C84E9CB60; R.x.re[3] = 0x14984BA7CA59AB71;
    R.x.im[0] = 0xFE728423EE3BFEF4; R.x.im[1] = 0xBF68C42FE21AE0E4; R.x.im[2] = 0xA8FAF9C9528609CA; R.x.im[3] = 0x1225EC77A1DC0285;
    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
    fp2_tomont(&R.x, &Q.x);
    fp2_tomont(&R.z, &Q.z);
    k[0] = 0xE77AD6B6C6B2D8CD;
    k[1] = 0xDE43A0B600F38D12;
    k[2] = 0xA35F4A7897E17CE2;
    k[3] = 0x10ACB62E614D1237;
    l[0] = 0x34AB78B6C6B2D8C0;
    l[1] = 0xDE6B2D8CD00F38D1;
    l[2] = 0xA35F4A7897E17CE2;
    l[3] = 0x20ACF4A789614D13;
    fp2_inv(&SS.z);
    fp2_mul(&SS.x, &SS.x, &SS.z);
    fp2_copy(&SS.z, &R.z);
    xDBLMUL(&S, &R, k, &SS, l, &PQ, (ec_curve_t*)&AC);
    fp2_inv(&S.z);
    fp2_mul(&S.x, &S.x, &S.z);
    fp2_frommont(&S.x, &S.x);
    R.x.re[0] = 0x554E1ADC609B992F; R.x.re[1] = 0xE407D961F8CC4C42; R.x.re[2] = 0x1CF626AFED5A68CE; R.x.re[3] = 0x6D02692EE110483;
    R.x.im[0] = 0x16FB094E831C8997; R.x.im[1] = 0xFDE4ECF31DC5F702; R.x.im[2] = 0x89303D868DFAD7B4; R.x.im[3] = 0xC91ACE81346F22D;
    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
 out0:
    if (passed==1) printf("  ECC arithmetic tests ............................................ PASSED");
    else { printf("  ECC arithmetic tests... FAILED"); printf("\n"); return false; }
    printf("\n");
 */
    return OK;
 }
 // Self-test for ec_dlog_2 and ec_dlog_3.
 // Each part loads a basis (P, Q, P-Q) from test-basis.h, then for TEST_LOOPS scalar pairs
 // (k, l) builds R with xDBLMUL and checks that the dlog routine returns the expected pair.
 // The expected pair is (k, l) itself, or -- when the pair lies above the half-order bound
 // (f1 resp. tpFdiv2) -- the pair subtracted from f2 resp. tpF (presumably because x-only
 // points do not distinguish R from -R; confirm against ec_dlog_*).
 // Returns false as soon as one part fails, true otherwise.
 bool dlog_test()
 { // Tests for dlog
    bool OK = true;
    int passed;
    ec_point_t P = {0}, Q = {0}, R = {0}, PQ = {0};
    ec_curve_t AC = {0};
    ec_basis_t PQ2;
    digit_t scalarP[NWORDS_ORDER], scalarQ[NWORDS_ORDER], k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
    digit_t kt[NWORDS_ORDER], lt[NWORDS_ORDER], f1[NWORDS_ORDER] = {0}, f2[NWORDS_ORDER] = {0}, zero[NWORDS_ORDER] = {0}, tpFdiv2[NWORDS_ORDER] = {0}, tpF[NWORDS_ORDER] = {0};
    printf("\n--------------------------------------------------------------------------------------------------------\n\n");
    printf("Testing dlog functions: \n\n");
    // dlog2 testing
    passed = 1;
    // Basis points: x from the table (converted with fp2_tomont), z = 1.
    fp2_tomont(&P.x, &xP2);
    fp_mont_setone(P.z.re);
    fp_set(P.z.im, 0);
    fp2_tomont(&Q.x, &xQ2);
    fp_mont_setone(Q.z.re);
    fp_set(Q.z.im, 0);
    fp2_tomont(&PQ.x, &xPQ2);
    fp_mont_setone(PQ.z.re);
    fp_set(PQ.z.im, 0);
    // Curve (A : C) = (0 : 1). AC is still all-zero here, so setting limb 0 and converting is exact.
    AC.C.re[0] = 0x01;
    fp_copy(f1, TWOpFm1);
    fp_copy(f2, TWOpF);
    fp2_tomont(&AC.C, &AC.C);
    copy_point(&PQ2.P, &P);
    copy_point(&PQ2.Q, &Q);
    copy_point(&PQ2.PmQ, &PQ);
    // Starting scalars; both are decremented before first use, k by 1 and l by 2 per iteration.
    k[0] = 0xFFFFFFFFFFFFFFFF;
    k[1] = 0x00000000000007FF;
    l[0] = 0xFFFFFFFFFFFFFFFE;
    l[1] = 0x00000000000007FF;
    for (int n = 0; n < TEST_LOOPS; n++)
    {
        k[0] -= 1;
        l[0] -= 2;
        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
        ec_dlog_2(scalarP, scalarQ, &PQ2, &R, &AC);
        // Expected result starts as (k, l) and is adjusted below when the pair is in the upper half.
        memcpy(kt, k, NWORDS_ORDER*RADIX/8);
        memcpy(lt, l, NWORDS_ORDER*RADIX/8);
        if (compare_words(k, f1, NWORDS_ORDER) == 1 ||
           (compare_words(l, f1, NWORDS_ORDER) == 1 && (compare_words(k, zero, NWORDS_ORDER) == 0 || compare_words(k, f1, NWORDS_ORDER) == 0))) {
            // Zero scalars are left alone; non-zero ones are replaced via sub_test with f2 (presumably f2 - scalar).
            if (compare_words(k, zero, NWORDS_ORDER) != 0) {
                sub_test(kt, f2, kt, NWORDS_ORDER);
            }
            if (compare_words(l, zero, NWORDS_ORDER) != 0) {
                sub_test(lt, f2, lt, NWORDS_ORDER);
            }
        }
        if (compare_words((digit_t*)scalarP, (digit_t*)kt, NWORDS_ORDER) != 0 || compare_words((digit_t*)scalarQ, (digit_t*)lt, NWORDS_ORDER) != 0) { passed = 0; break; }
    }
    if (passed == 1) printf("  dlog2 tests ..................................................... PASSED");
    else { printf("  dlog2 tests... FAILED"); printf("\n"); return false; }
    printf("\n");
    // dlog3 testing
    passed = 1;
    fp2_tomont(&P.x, &xP3);
    fp_mont_setone(P.z.re);
    fp_set(P.z.im, 0);
    fp2_tomont(&Q.x, &xQ3);
    fp_mont_setone(Q.z.re);
    fp_set(Q.z.im, 0);
    fp2_tomont(&PQ.x, &xPQ3);
    fp_mont_setone(PQ.z.re);
    fp_set(PQ.z.im, 0);
    // Reset C to one. AC.C already holds a Montgomery-form value from the dlog2 part, so
    // patching only limb 0 and converting again (as in the first setup) would keep the
    // stale upper limbs and turn C into an arbitrary element; set it directly instead.
    fp_mont_setone(AC.C.re);
    fp_set(AC.C.im, 0);
    fp_copy(tpFdiv2, THREEpFdiv2);
    fp_copy(tpF, THREEpF);
    copy_point(&PQ2.P, &P);
    copy_point(&PQ2.Q, &Q);
    copy_point(&PQ2.PmQ, &PQ);
    // Fresh single-word starting scalars; limb 1 still holds the value from the dlog2 part and is cleared.
    k[1] = 0;
    l[1] = 0;
    k[0] = 0x02153E468B91C6D1;
    l[0] = 0x02153E468B91C6D0;
    for (int n = 0; n < TEST_LOOPS; n++)
    {
        k[0] -= 1;
        l[0] -= 2;
        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
        ec_dlog_3(scalarP, scalarQ, &PQ2, &R, &AC);
        memcpy(kt, k, NWORDS_ORDER*RADIX/8);
        memcpy(lt, l, NWORDS_ORDER*RADIX/8);
        if (compare_words(k, tpFdiv2, NWORDS_ORDER) == 1 ||
           (compare_words(l, tpFdiv2, NWORDS_ORDER) == 1 && compare_words(k, zero, NWORDS_ORDER) == 0)) {
            if (compare_words(k, zero, NWORDS_ORDER) != 0) {
                sub_test(kt, tpF, kt, NWORDS_ORDER);
            }
            if (compare_words(l, zero, NWORDS_ORDER) != 0) {
                sub_test(lt, tpF, lt, NWORDS_ORDER);
            }
        }
        if (compare_words((digit_t*)scalarP, (digit_t*)kt, NWORDS_ORDER) != 0 || compare_words((digit_t*)scalarQ, (digit_t*)lt, NWORDS_ORDER) != 0) { passed = 0; break; }
    }
    if (passed == 1) printf("  dlog3 tests ..................................................... PASSED");
    else { printf("  dlog3 tests... FAILED"); printf("\n"); return false; }
    printf("\n");
    return OK;
 }
 bool ec_run()
 {
    bool OK = true;
    int n;
    unsigned long long cycles, cycles1, cycles2;
    ec_point_t P, Q, R, PQ, AC;
    digit_t k[NWORDS_ORDER], l[NWORDS_ORDER];
    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
    printf("Benchmarking ecc arithmetic: \n\n"); 
    // Point doubling
    cycles = 0;
    for (n=0; n<BENCH_LOOPS; n++)
    {
        cycles1 = cpucycles(); 
        xDBL(&Q, &P, &AC);
        cycles2 = cpucycles();
        cycles = cycles+(cycles2-cycles1);
    }
    printf("  Montgomery x-only doubling runs in .............................. %7lld cycles", cycles/BENCH_LOOPS);
    printf("\n");
    // Point addition
    cycles = 0;
    for (n = 0; n < BENCH_LOOPS; n++)
    {
        cycles1 = cpucycles();
        xADD(&R, &Q, &P, &PQ);
        cycles2 = cpucycles();
        cycles = cycles + (cycles2 - cycles1);
    }
    printf("  Montgomery x-only addition runs in .............................. %7lld cycles", cycles/BENCH_LOOPS);
    printf("\n");
    // Point multiplication
    cycles = 0;
    for (n = 0; n < BENCH_LOOPS; n++)
    {
        cycles1 = cpucycles();
        xMUL(&Q, &P, k, (ec_curve_t*)&AC);
        cycles2 = cpucycles();
        cycles = cycles + (cycles2 - cycles1);
    }
    printf("  Montgomery x-only scalar multiplication runs in ................. %7lld cycles", cycles/BENCH_LOOPS);
    printf("\n");
    // Point multiplication
    cycles = 0;
    for (n = 0; n < BENCH_LOOPS; n++)
    {
        cycles1 = cpucycles();
        xDBLMUL(&R, &P, k, &Q, l, &PQ, (ec_curve_t*)&AC);
        cycles2 = cpucycles();
        cycles = cycles + (cycles2 - cycles1);
    }
    printf("  Montgomery x-only double-scalar multiplication runs in .......... %7lld cycles", cycles/BENCH_LOOPS);
    printf("\n");
    return OK;
 }
 // Benchmarks ec_dlog_2 and ec_dlog_3.
 // For each of BENCH_LOOPS iterations two scalars are drawn with fprandom_test, the point
 // R is built from them with xDBLMUL (outside the timed region) and only the dlog call is
 // measured with cpucycles(). The per-call average is printed. Always returns true.
 bool dlog_run()
 {
    bool OK = true;
    int n;
    unsigned long long cycles, cycles1, cycles2;
    ec_point_t P = {0}, Q = {0}, R = {0}, PQ = {0};
    ec_curve_t AC = {0};
    ec_basis_t PQ2;
    digit_t scalarP[NWORDS_ORDER], scalarQ[NWORDS_ORDER], k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
    printf("\n--------------------------------------------------------------------------------------------------------\n\n");
    printf("Benchmarking dlog2: \n\n");
    // dlog2 computation
    // Basis points: x from the 2^f-torsion table (converted with fp2_tomont), z = 1.
    fp2_tomont(&P.x, &xP2);
    fp_mont_setone(P.z.re);
    fp_set(P.z.im, 0);
    fp2_tomont(&Q.x, &xQ2);
    fp_mont_setone(Q.z.re);
    fp_set(Q.z.im, 0);
    fp2_tomont(&PQ.x, &xPQ2);
    fp_mont_setone(PQ.z.re);
    fp_set(PQ.z.im, 0);
    // Curve (A : C) = (0 : 1); set once here and reused unchanged for the dlog3 part.
    AC.C.re[0] = 0x01;
    fp2_tomont(&AC.C, &AC.C);
    copy_point(&PQ2.P, &P);
    copy_point(&PQ2.Q, &Q);
    copy_point(&PQ2.PmQ, &PQ);
    cycles = 0;
    for (n = 0; n < BENCH_LOOPS; n++)
    {
        fprandom_test(k); fprandom_test(l);
        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
        cycles1 = cpucycles();
        ec_dlog_2(scalarP, scalarQ, &PQ2, &R, &AC);
        cycles2 = cpucycles();
        cycles = cycles + (cycles2 - cycles1);
    }
    // cycles is unsigned long long, hence %llu.
    printf("  dlog2 runs in ................................................... %7llu cycles", cycles/BENCH_LOOPS);
    printf("\n");
    // dlog3 computation
    // Same setup with the 3^g-torsion table.
    fp2_tomont(&P.x, &xP3);
    fp_mont_setone(P.z.re);
    fp_set(P.z.im, 0);
    fp2_tomont(&Q.x, &xQ3);
    fp_mont_setone(Q.z.re);
    fp_set(Q.z.im, 0);
    fp2_tomont(&PQ.x, &xPQ3);
    fp_mont_setone(PQ.z.re);
    fp_set(PQ.z.im, 0);
    copy_point(&PQ2.P, &P);
    copy_point(&PQ2.Q, &Q);
    copy_point(&PQ2.PmQ, &PQ);
    cycles = 0;
    for (n = 0; n < BENCH_LOOPS; n++)
    {
        fprandom_test(k); fprandom_test(l);
        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
        cycles1 = cpucycles();
        ec_dlog_3(scalarP, scalarQ, &PQ2, &R, &AC);
        cycles2 = cpucycles();
        cycles = cycles + (cycles2 - cycles1);
    }
    printf("  dlog3 runs in ................................................... %7llu cycles", cycles/BENCH_LOOPS);
    printf("\n");
    return OK;
 }
 #endif
--- a/src/ec/ref/lvl3/test/test-basis.h
+++ b/src/ec/ref/lvl3/test/test-basis.h
@@ -0,0 +1,24 @@
 #ifndef TEST_BASIS_H
 #define TEST_BASIS_H
 #include "fp2.h"
 // Hard-coded fp2_t constants used by the EC tests: for each basis there are three values
 // named xP*, xQ* and xPQ*. ec-tests.h loads them as the x-coordinates of P, Q and of the
 // point it stores in the PmQ slot of an ec_basis_t.
 // Every constant consists of two components of six 64-bit words each (presumably {re, im}
 // in the declaration order of fp2_t -- confirm in fp2.h).
 // NOTE(review): the tests pass these values through fp2_tomont before use, so they are
 // presumably stored in plain (non-Montgomery) representation.
 // The constants are defined (not just declared) here, so this header is meant to be
 // included from a single translation unit.
 // Full-torsion basis for A=0 (excluding 2^f and huge prime factors)
 const fp2_t xPA = {{0x35b53c72e7494775,0x5791b499bc29710d,0x2060f3aca68fa4ff,0x81150c19a14f523a,0x08af6c81a906d44a,0x00cca2a93efb536e},{0x14eaac356375af76,0x5655011e771be3b4,0x6273ccee274d7754,0x440d6b5b4496c183,0xa3d7f80e9f9111ba,0x0302e153bee01a18}};
 const fp2_t xQA = {{0x80c0767d1b7b5fd8,0x24e9039d430ca3b5,0x26485254625dc85a,0x612eaebc345b64d1,0x59669fbd946a4409,0x004c3a8564e16101},{0x0e1eac4e38449c54,0x752c042b4c6675cb,0x88ec0e75c8e9ea0e,0xbf7c4cdbfc4483f0,0xd594cb5474bbc264,0x02f5e2345a9b4654}};
 const fp2_t xPQA = {{0x1f5accaff9a7da90,0x91884964774d4cb2,0x0e938e13dd088e63,0x453c9af09879a724,0xb2bd09ec3740312b,0x0007a5837e23aaa1},{0x8e1ac4b319787bd4,0x7cb9fba402f67bfe,0x370b2951f9ec29cf,0x7a020172566f9d17,0x063e31753d703130,0x01551136265bade6}};
 const fp2_t xPB = {{0xb702a70a8ae132ad,0x56d8804c83a8e696,0x5ac3e12f4df1792e,0x0a89da435664746e,0xd8758765206844bd,0x01a92f6e9e0e9296},{0x8aaab711b76b0959,0x210e6695ca5e5fdd,0x593be0d75909ca12,0xfbc074d8ebdeb927,0xb61fcc328d3756bc,0x0198a5942855c8bf}};
 const fp2_t xQB = {{0x2b6b82b950b61fda,0x0ef2dd717daed334,0x99dee4db0b268ac9,0x3534eb384e1fcaf0,0xbaf112845a4f2d81,0x037f1492d8d815a1},{0x97e80590f9a0556b,0x7d9b4b87a22a7792,0xda4534fe75595b4b,0xbe1092a2733c03e1,0xbf5b1bd147b0d630,0x0125721476e5267f}};
 const fp2_t xPQB = {{0xb7d459a56d4aebec,0x6ac7f10ba20e1e71,0x9a95a8928507f7ef,0xc4c5aff6b97f3dfe,0x644beb3e86806b77,0x022319eb6eaf072a},{0x8ad0f6b18934790e,0xdad82b7b38e166bf,0xcb08f5a3ab53d9a9,0xd2ff39b401ba8aba,0xbff9b5e40ed9e5ce,0x03c1773791f554c0}};
 // 2^f-torsion basis for A=0 (read by the dlog2 test and benchmark in ec-tests.h)
 const fp2_t xP2 = {{0x7a26fdb0e5844206,0x0752b2ba140f7dfd,0x1728013f8f5fe257,0xd05f129975ed6bba,0xe736dbce707ad5a8,0x01f861715896d0be},{0xdac046927a0c5352,0x5a42474ac156ff18,0xe887982ff4c5a9ea,0x3875be6432251f1c,0xdfae47315af877ee,0x005627f085582ecc}};
 const fp2_t xQ2 = {{0xc4f03ab3db57331b,0xf04261fc3b713778,0xa99b82430c7e40d1,0x5fe52b1324c2a091,0xfcaa2a7049d0f657,0x021f2caa09302141},{0x4a92a1d5ff9f6730,0x6dcd5f600f33783e,0xdb8b4e2e5149b45e,0x993458635c01d0c0,0x5f9bc7d3bb307f91,0x01fcc7eae4712b6a}};
 const fp2_t xPQ2 = {{0x7f4ee9c86c4341a2,0x0c867f482063bdfc,0xe46fb7b0fbd479c7,0xddaa716e091be9ad,0x29239eadddf5dc59,0x0231c09c660f0a89},{0xde64fa344dd64237,0xa89aaaed3dd84555,0xbb70924d8fb73f27,0x0869ec018b3366dc,0x47a0356ce742bcbc,0x00547dbda6dc094d}};
 // 3^g-torsion basis for A=0 (read by the dlog3 test and benchmark in ec-tests.h)
 const fp2_t xP3 = {{0x7c878d0ceaa821f0,0xf94db4cab7186625,0x7cff6d5fb0ca7867,0x4e3f5bd19cbca9d6,0x05ec8273d0042548,0x0233a79cf87040b3},{0x060e9f3dcab8192c,0xa94e86d063a46398,0x0e5cc403bfb60867,0x3ea1277f98087283,0xaff1fd95bb094917,0x025041b12719d3b8}};
 const fp2_t xQ3 = {{0xb25aaa192bd351b7,0xc5db1962aed7e543,0x1f722ab174319947,0xd1c9bb4a0a5d8aa3,0x351415ec64f88921,0x0288ae044d62c930},{0xb41ede1724f8e06a,0xfb10ce5a83c66629,0x9846173e31a9d448,0x35c94966192f08db,0x72f7252946af3f9c,0x02ea05c971e7b34c}};
 const fp2_t xPQ3 = {{0x674703cc3134d90b,0x507e338e496b8f75,0x0c8cb1f138346e4c,0x54cb7ad5ba580da7,0x65750f0bcd0a9857,0x038b435f51669e87},{0xdcdc0116c67589a0,0x45ce94f4d345c827,0x0f2cbfb3c53b73ea,0x03e7951bc98efbb8,0x3335ad0991864858,0x01e151a64210f74f}};
 #endif
--- a/src/ec/ref/lvl5/CMakeLists.txt
+++ b/src/ec/ref/lvl5/CMakeLists.txt
@@ -0,0 +1,17 @@
 # Reference EC library for the current variant: the sources all come from ${ECX_DIR};
 # the list variable and the library target are both named after ${SVARIANT_UPPER}.
 set(SOURCE_FILES_EC_${SVARIANT_UPPER}_REF
   ${ECX_DIR}/poly-mul.c
   ${ECX_DIR}/poly-redc.c
   ${ECX_DIR}/ec.c
   ${ECX_DIR}/tedwards.c
   ${ECX_DIR}/kps.c
   ${ECX_DIR}/xisog.c
   ${ECX_DIR}/xeval.c
   ${ECX_DIR}/isog_chains.c
   ${ECX_DIR}/basis.c
 )
 add_library(${LIB_EC_${SVARIANT_UPPER}}
   ${SOURCE_FILES_EC_${SVARIANT_UPPER}_REF}
 )
 # Include directories are PRIVATE, so they are not propagated to targets linking this
 # library; the test executables list the directories they need themselves.
 target_include_directories(${LIB_EC_${SVARIANT_UPPER}}
   PRIVATE
     ${INC_INTBIG}
     ${INC_PRECOMP_${SVARIANT_UPPER}}
     ${INC_PUBLIC}
     ${INC_GF_${SVARIANT_UPPER}}
     ${INC_COMMON}
     ${INC_EC}
 )
 target_compile_options(${LIB_EC_${SVARIANT_UPPER}}
   PRIVATE
     ${C_OPT_FLAGS}
 )
 # Test executables for this library live in the test/ subdirectory.
 add_subdirectory(test)
--- a/src/ec/ref/lvl5/test/CMakeLists.txt
+++ b/src/ec/ref/lvl5/test/CMakeLists.txt
@@ -0,0 +1,36 @@
 # Test executables of the EC module for the current SQIsign variant.
 # They are executables, so nothing consumes their usage requirements: include
 # directories and link dependencies are declared PRIVATE (the keyword-less
 # target_link_libraries signature is legacy). The include search order is unchanged;
 # the relative entries `../include` and `.` resolve against this directory.
 #
 # fp2.test: links the GF library only (a duplicated ${INC_GF_*} include entry was dropped).
 add_executable(fp2.test_${SVARIANT_LOWER} ${ECX_DIR}/test/fp2-test.c)
 target_include_directories(fp2.test_${SVARIANT_LOWER} PRIVATE
   ${INC_GF_${SVARIANT_UPPER}} ${INC_PRECOMP_${SVARIANT_UPPER}}
   ${PROJECT_SOURCE_DIR}/include ../include ${INC_EC} ${INC_COMMON})
 target_link_libraries(fp2.test_${SVARIANT_LOWER} PRIVATE
   ${LIB_GF_${SVARIANT_UPPER}})
 # poly-mul.test / poly-redc.test: GF and EC libraries.
 add_executable(poly-mul.test_${SVARIANT_LOWER} ${ECX_DIR}/test/poly-mul-test.c)
 target_include_directories(poly-mul.test_${SVARIANT_LOWER} PRIVATE
   ${INC_GF_${SVARIANT_UPPER}} ${INC_PRECOMP_${SVARIANT_UPPER}}
   ${PROJECT_SOURCE_DIR}/include ../include ${INC_EC} ${INC_COMMON})
 target_link_libraries(poly-mul.test_${SVARIANT_LOWER} PRIVATE
   ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
 add_executable(poly-redc.test_${SVARIANT_LOWER} ${ECX_DIR}/test/poly-redc-test.c)
 target_include_directories(poly-redc.test_${SVARIANT_LOWER} PRIVATE
   ${INC_GF_${SVARIANT_UPPER}} ${INC_PRECOMP_${SVARIANT_UPPER}}
   ${PROJECT_SOURCE_DIR}/include ../include ${INC_EC} ${INC_COMMON})
 target_link_libraries(poly-redc.test_${SVARIANT_LOWER} PRIVATE
   ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
 # mont.test, ec.test, velu.test, isog.test: additionally use the intbig and precomp
 # libraries and search this directory (`.`) for headers.
 add_executable(mont.test_${SVARIANT_LOWER} ${ECX_DIR}/test/mont-test.c)
 target_include_directories(mont.test_${SVARIANT_LOWER} PRIVATE
   ${INC_GF_${SVARIANT_UPPER}} ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}}
   ${PROJECT_SOURCE_DIR}/include ../include ${INC_EC} ${INC_COMMON} .)
 target_link_libraries(mont.test_${SVARIANT_LOWER} PRIVATE
   ${LIB_PRECOMP_${SVARIANT_UPPER}} ${LIB_INTBIG} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
 # ec.test is built from ec-test.c together with the test_extras.c helpers and also
 # searches ${ECX_DIR}/test for headers.
 add_executable(ec.test_${SVARIANT_LOWER}
   ${ECX_DIR}/test/ec-test.c
   ${ECX_DIR}/test/test_extras.c)
 target_include_directories(ec.test_${SVARIANT_LOWER} PRIVATE
   ${ECX_DIR}/test
   ${INC_GF_${SVARIANT_UPPER}} ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}}
   ${PROJECT_SOURCE_DIR}/include ../include ${INC_EC} ${INC_COMMON} .)
 target_link_libraries(ec.test_${SVARIANT_LOWER} PRIVATE
   ${LIB_PRECOMP_${SVARIANT_UPPER}} ${LIB_INTBIG} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
 add_executable(velu.test_${SVARIANT_LOWER} ${ECX_DIR}/test/velu-test.c)
 target_include_directories(velu.test_${SVARIANT_LOWER} PRIVATE
   ${INC_GF_${SVARIANT_UPPER}} ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}}
   ${PROJECT_SOURCE_DIR}/include ../include ${INC_EC} ${INC_COMMON} .)
 target_link_libraries(velu.test_${SVARIANT_LOWER} PRIVATE
   ${LIB_PRECOMP_${SVARIANT_UPPER}} ${LIB_INTBIG} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
 add_executable(isog.test_${SVARIANT_LOWER} ${ECX_DIR}/test/isog-test.c)
 target_include_directories(isog.test_${SVARIANT_LOWER} PRIVATE
   ${INC_GF_${SVARIANT_UPPER}} ${INC_INTBIG} ${INC_PRECOMP_${SVARIANT_UPPER}}
   ${PROJECT_SOURCE_DIR}/include ../include ${INC_EC} ${INC_COMMON} .)
 target_link_libraries(isog.test_${SVARIANT_LOWER} PRIVATE
   ${LIB_PRECOMP_${SVARIANT_UPPER}} ${LIB_INTBIG} ${LIB_GF_${SVARIANT_UPPER}} ${LIB_EC_${SVARIANT_UPPER}})
 # Register the EC test drivers with CTest. The NAME/COMMAND signature is used because
 # only that form resolves an executable *target* name to the location of the built
 # binary; the legacy add_test(<name> <command>) form passes the name through as a plain
 # command line. Test names and arguments are unchanged: each driver receives
 # ${SQISIGN_TEST_REPS}.
 add_test(NAME ec_fp2.test_${SVARIANT_LOWER} COMMAND fp2.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
 add_test(NAME ec_poly-mul.test_${SVARIANT_LOWER} COMMAND poly-mul.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
 add_test(NAME ec_poly-redc.test_${SVARIANT_LOWER} COMMAND poly-redc.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
 add_test(NAME ec_mont.test_${SVARIANT_LOWER} COMMAND mont.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
 # ec.test additionally gets the literal argument `test` ahead of the repetition count
 # (presumably a mode selector understood by ec-test.c -- confirm there).
 add_test(NAME ec_ec.test_${SVARIANT_LOWER} COMMAND ec.test_${SVARIANT_LOWER} test ${SQISIGN_TEST_REPS})
 add_test(NAME ec_velu.test_${SVARIANT_LOWER} COMMAND velu.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
 add_test(NAME ec_isog.test_${SVARIANT_LOWER} COMMAND isog.test_${SVARIANT_LOWER} ${SQISIGN_TEST_REPS})
--- a/src/ec/ref/lvl5/test/ec-tests.h
+++ b/src/ec/ref/lvl5/test/ec-tests.h
@@ -0,0 +1,400 @@
 #ifndef EC_TESTS_H
 #define EC_TESTS_H
 #include "test_extras.h"
 #include <stdio.h>
 #include <string.h>
 #include <bench.h>       //////// NOTE: enable later
 #include "test-basis.h"
 #include "ec_params.h"
 // Global constants
 // `p` is defined in another translation unit (presumably the field prime -- confirm);
 // none of the functions in this header reference it directly.
 extern const digit_t p[NWORDS_FIELD];
 // Benchmark and test parameters
 // Both are file-local, so every translation unit including this header gets its own copy.
 static int BENCH_LOOPS = 1000;       // Iterations timed per benchmark in ec_run() and dlog_run()
 static int TEST_LOOPS  = 512;       // Scalar pairs checked per part of dlog_test()
 // Placeholder self-test for the x-only curve arithmetic (xDBL, xADD, xMUL, xDBLMUL).
 // The function currently only prints a banner announcing "NOT IMPLEMENTED" and returns
 // true; every check lives in the block comment below, so all locals declared here are
 // unused while it stays disabled.
 // NOTE(review): the disabled vectors assign only limbs [0..3] of each coordinate, whereas
 // the constants in this level's test-basis.h carry eight words per component. They look
 // like vectors for a four-word field and most likely have to be regenerated before the
 // block can be enabled at this level -- confirm.
 bool ec_test()
 { // Tests for ecc arithmetic
    bool OK = true;          // returned unchanged: nothing below can clear it at the moment
    int passed;              // only used by the disabled checks
    ec_point_t P = {0}, Q = {0}, R = {0}, S = {0}, SS = {0}, PQ = {0};
    ec_point_t AC = {0};     // the disabled code stores the curve here (AC.z = 1) and casts it to ec_curve_t* for xMUL/xDBLMUL
    digit_t k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
    printf("Testing ecc functions: (NOT IMPLEMENTED) \n\n"); 
 /*
    // Disabled known-answer tests. Each step converts its inputs with fp2_tomont, runs one
    // primitive, normalises x/z with fp2_inv + fp2_mul, converts back with fp2_frommont and
    // compares against a hard-coded expected x-coordinate placed in R.x.
    // Point doubling
    passed = 1;
    P.x.re[0] = 0xDFD70ED0861BD329; P.x.re[1] = 0x20ACD3758C7F5540; P.x.re[2] = 0x3DCCDC007277F80A; P.x.re[3] = 0x18D6D2A22981DCE1;
    P.x.im[0] = 0x3C23730A3F08F38C; P.x.im[1] = 0x98BB973AFD3D954D; P.x.im[2] = 0x8D98ADFC2829AE8A; P.x.im[3] = 0x21A2464D6369AFBA;
    P.z.re[0] = 0x01;
    AC.z.re[0] = 0x01;
    fp2_tomont(&AC.z, &AC.z);
    fp2_tomont(&R.x, &P.x);
    fp2_tomont(&R.z, &P.z);
    xDBL(&S, &R, &AC);
    fp2_copy(&SS.x, &S.x);    // Copy of S = SS <- 2P 
    fp2_copy(&SS.z, &S.z);
    fp2_inv(&S.z);
    fp2_mul(&S.x, &S.x, &S.z);
    fp2_frommont(&S.x, &S.x);
    R.x.re[0] = 0x5950EE0A4AF90FC8; R.x.re[1] = 0x16488065A0A98B08; R.x.re[2] = 0xCE65322229DA0FD1; R.x.re[3] = 0x270A35FF781EE204;
    R.x.im[0] = 0x564447FD9EC57F6B; R.x.im[1] = 0x2EE24E984294F729; R.x.im[2] = 0x53A6C7360E972C71; R.x.im[3] = 0x4FCF4B9928A7C7E;
    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2)!=0) { passed=0; goto out0; }
    Q.x.re[0] = 0xC46076A670C70053; Q.x.re[1] = 0x97517AFA3AB9ED13; Q.x.re[2] = 0x349644C942EDF993; Q.x.re[3] = 0xBB4A4DB6F29AF9E;
    Q.x.im[0] = 0x8B47629FB5A15BB0; Q.x.im[1] = 0x4EC6E809953C1A10; Q.x.im[2] = 0x1F83F0EC6CBB84D6; Q.x.im[3] = 0x1D8417C1D33265D3;
    Q.z.re[0] = 0x01;
    PQ.x.re[0] = 0x853F66D11BE5534F; PQ.x.re[1] = 0x27C8FD4E52D03D4A; PQ.x.re[2] = 0xF88EA78D0A0C29D2; PQ.x.re[3] = 0x2F6DFB07D397A067;
    PQ.x.im[0] = 0xE8DBC4AA34434BA1; PQ.x.im[1] = 0x7A73AE182636F8A0; PQ.x.im[2] = 0x419EC260137868EB; PQ.x.im[3] = 0x129B3E301703D43F;
    PQ.z.re[0] = 0x01;
    fp2_tomont(&S.x, &Q.x);
    fp2_tomont(&S.z, &Q.z);
    fp2_tomont(&PQ.x, &PQ.x);
    fp2_tomont(&PQ.z, &PQ.z);
    xADD(&S, &SS, &S, &PQ);
    fp2_inv(&S.z);
    fp2_mul(&S.x, &S.x, &S.z);
    fp2_frommont(&S.x, &S.x);
    R.x.re[0] = 0xED0BEB8F93AB4FF9; R.x.re[1] = 0x27CF508B80CD49BF; R.x.re[2] = 0x38A6134DFA04B2BA; R.x.re[3] = 0x27B4CB15E109EF1F;
    R.x.im[0] = 0x6F731BA6FD227BDE; R.x.im[1] = 0x14C12335341167F8; R.x.im[2] = 0xECA7B60F7866E27A; R.x.im[3] = 0x2A7A79A152880457;
    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
    fp2_tomont(&R.x, &P.x);
    fp2_tomont(&R.z, &P.z);
    k[0] = 126;
    xMUL(&S, &R, k, (ec_curve_t*)&AC);
    fp2_inv(&S.z);
    fp2_mul(&S.x, &S.x, &S.z);
    fp2_frommont(&S.x, &S.x);
    R.x.re[0] = 0xDE80F87A1203A147; R.x.re[1] = 0xD59E1215928A3B2D; R.x.re[2] = 0xD5A67F83A5A8CE46; R.x.re[3] = 0xA11E162488C9CDF;
    R.x.im[0] = 0x9417D0D79A26741B; R.x.im[1] = 0x8B1F47D6F0FE5EEC; R.x.im[2] = 0xE52188DCB054CE36; R.x.im[3] = 0x1A8075A6C3148AB3;
    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
    fp2_tomont(&R.x, &P.x);
    fp2_tomont(&R.z, &P.z);
    k[0] = 0xE77AD6B6C6B2D8CD;
    k[1] = 0xDE43A0B600F38D12;
    k[2] = 0xA35F4A7897E17CE2;
    k[3] = 0x10ACB62E614D1237;
    xMUL(&S, &R, k, (ec_curve_t*)&AC);
    fp2_inv(&S.z);
    fp2_mul(&S.x, &S.x, &S.z);
    fp2_frommont(&S.x, &S.x);
    R.x.re[0] = 0xD3938B0A68A3E7C0; R.x.re[1] = 0xE0667113208A0595; R.x.re[2] = 0x258F314C84E9CB60; R.x.re[3] = 0x14984BA7CA59AB71;
    R.x.im[0] = 0xFE728423EE3BFEF4; R.x.im[1] = 0xBF68C42FE21AE0E4; R.x.im[2] = 0xA8FAF9C9528609CA; R.x.im[3] = 0x1225EC77A1DC0285;
    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
    fp2_tomont(&R.x, &Q.x);
    fp2_tomont(&R.z, &Q.z);
    k[0] = 0xE77AD6B6C6B2D8CD;
    k[1] = 0xDE43A0B600F38D12;
    k[2] = 0xA35F4A7897E17CE2;
    k[3] = 0x10ACB62E614D1237;
    l[0] = 0x34AB78B6C6B2D8C0;
    l[1] = 0xDE6B2D8CD00F38D1;
    l[2] = 0xA35F4A7897E17CE2;
    l[3] = 0x20ACF4A789614D13;
    fp2_inv(&SS.z);
    fp2_mul(&SS.x, &SS.x, &SS.z);
    fp2_copy(&SS.z, &R.z);
    xDBLMUL(&S, &R, k, &SS, l, &PQ, (ec_curve_t*)&AC);
    fp2_inv(&S.z);
    fp2_mul(&S.x, &S.x, &S.z);
    fp2_frommont(&S.x, &S.x);
    R.x.re[0] = 0x554E1ADC609B992F; R.x.re[1] = 0xE407D961F8CC4C42; R.x.re[2] = 0x1CF626AFED5A68CE; R.x.re[3] = 0x6D02692EE110483;
    R.x.im[0] = 0x16FB094E831C8997; R.x.im[1] = 0xFDE4ECF31DC5F702; R.x.im[2] = 0x89303D868DFAD7B4; R.x.im[3] = 0xC91ACE81346F22D;
    if (compare_words((digit_t*)&R.x, (digit_t*)&S.x, NWORDS_FIELD*2) != 0) { passed = 0; goto out0; }
 out0:
    if (passed==1) printf("  ECC arithmetic tests ............................................ PASSED");
    else { printf("  ECC arithmetic tests... FAILED"); printf("\n"); return false; }
    printf("\n");
 */
    return OK;
 }
 // Self-test for ec_dlog_2 and ec_dlog_3.
 // Each part loads a basis (P, Q, P-Q) from test-basis.h, then for TEST_LOOPS scalar pairs
 // (k, l) builds R with xDBLMUL and checks that the dlog routine returns the expected pair.
 // The expected pair is (k, l) itself, or -- when the pair lies above the half-order bound
 // (f1 resp. tpFdiv2) -- the pair subtracted from f2 resp. tpF (presumably because x-only
 // points do not distinguish R from -R; confirm against ec_dlog_*).
 // Returns false as soon as one part fails, true otherwise.
 bool dlog_test()
 { // Tests for dlog
    bool OK = true;
    int passed;
    ec_point_t P = {0}, Q = {0}, R = {0}, PQ = {0};
    ec_curve_t AC = {0};
    ec_basis_t PQ2;
    digit_t scalarP[NWORDS_ORDER], scalarQ[NWORDS_ORDER], k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
    digit_t kt[NWORDS_ORDER], lt[NWORDS_ORDER], f1[NWORDS_ORDER] = {0}, f2[NWORDS_ORDER] = {0}, zero[NWORDS_ORDER] = {0}, tpFdiv2[NWORDS_ORDER] = {0}, tpF[NWORDS_ORDER] = {0};
    printf("\n--------------------------------------------------------------------------------------------------------\n\n");
    printf("Testing dlog functions: \n\n");
    // dlog2 testing
    passed = 1;
    // Basis points: x from the table (converted with fp2_tomont), z = 1.
    fp2_tomont(&P.x, &xP2);
    fp_mont_setone(P.z.re);
    fp_set(P.z.im, 0);
    fp2_tomont(&Q.x, &xQ2);
    fp_mont_setone(Q.z.re);
    fp_set(Q.z.im, 0);
    fp2_tomont(&PQ.x, &xPQ2);
    fp_mont_setone(PQ.z.re);
    fp_set(PQ.z.im, 0);
    // Curve (A : C) = (0 : 1). AC is still all-zero here, so setting limb 0 and converting is exact.
    AC.C.re[0] = 0x01;
    fp_copy(f1, TWOpFm1);
    fp_copy(f2, TWOpF);
    fp2_tomont(&AC.C, &AC.C);
    copy_point(&PQ2.P, &P);
    copy_point(&PQ2.Q, &Q);
    copy_point(&PQ2.PmQ, &PQ);
    // Starting scalars; both are decremented before first use, k by 1 and l by 2 per iteration.
    k[0] = 0xFFFFFFFFFFFFFFFF;
    k[1] = 0x00000000000007FF;
    l[0] = 0xFFFFFFFFFFFFFFFE;
    l[1] = 0x00000000000007FF;
    for (int n = 0; n < TEST_LOOPS; n++)
    {
        k[0] -= 1;
        l[0] -= 2;
        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
        ec_dlog_2(scalarP, scalarQ, &PQ2, &R, &AC);
        // Expected result starts as (k, l) and is adjusted below when the pair is in the upper half.
        memcpy(kt, k, NWORDS_ORDER*RADIX/8);
        memcpy(lt, l, NWORDS_ORDER*RADIX/8);
        if (compare_words(k, f1, NWORDS_ORDER) == 1 ||
           (compare_words(l, f1, NWORDS_ORDER) == 1 && (compare_words(k, zero, NWORDS_ORDER) == 0 || compare_words(k, f1, NWORDS_ORDER) == 0))) {
            // Zero scalars are left alone; non-zero ones are replaced via sub_test with f2 (presumably f2 - scalar).
            if (compare_words(k, zero, NWORDS_ORDER) != 0) {
                sub_test(kt, f2, kt, NWORDS_ORDER);
            }
            if (compare_words(l, zero, NWORDS_ORDER) != 0) {
                sub_test(lt, f2, lt, NWORDS_ORDER);
            }
        }
        if (compare_words((digit_t*)scalarP, (digit_t*)kt, NWORDS_ORDER) != 0 || compare_words((digit_t*)scalarQ, (digit_t*)lt, NWORDS_ORDER) != 0) { passed = 0; break; }
    }
    if (passed == 1) printf("  dlog2 tests ..................................................... PASSED");
    else { printf("  dlog2 tests... FAILED"); printf("\n"); return false; }
    printf("\n");
    // dlog3 testing
    passed = 1;
    fp2_tomont(&P.x, &xP3);
    fp_mont_setone(P.z.re);
    fp_set(P.z.im, 0);
    fp2_tomont(&Q.x, &xQ3);
    fp_mont_setone(Q.z.re);
    fp_set(Q.z.im, 0);
    fp2_tomont(&PQ.x, &xPQ3);
    fp_mont_setone(PQ.z.re);
    fp_set(PQ.z.im, 0);
    // Reset C to one. AC.C already holds a Montgomery-form value from the dlog2 part, so
    // patching only limb 0 and converting again (as in the first setup) would keep the
    // stale upper limbs and turn C into an arbitrary element; set it directly instead.
    fp_mont_setone(AC.C.re);
    fp_set(AC.C.im, 0);
    fp_copy(tpFdiv2, THREEpFdiv2);
    fp_copy(tpF, THREEpF);
    copy_point(&PQ2.P, &P);
    copy_point(&PQ2.Q, &Q);
    copy_point(&PQ2.PmQ, &PQ);
    // Fresh single-word starting scalars; limb 1 still holds the value from the dlog2 part and is cleared.
    k[1] = 0;
    l[1] = 0;
    k[0] = 0x02153E468B91C6D1;
    l[0] = 0x02153E468B91C6D0;
    for (int n = 0; n < TEST_LOOPS; n++)
    {
        k[0] -= 1;
        l[0] -= 2;
        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
        ec_dlog_3(scalarP, scalarQ, &PQ2, &R, &AC);
        memcpy(kt, k, NWORDS_ORDER*RADIX/8);
        memcpy(lt, l, NWORDS_ORDER*RADIX/8);
        if (compare_words(k, tpFdiv2, NWORDS_ORDER) == 1 ||
           (compare_words(l, tpFdiv2, NWORDS_ORDER) == 1 && compare_words(k, zero, NWORDS_ORDER) == 0)) {
            if (compare_words(k, zero, NWORDS_ORDER) != 0) {
                sub_test(kt, tpF, kt, NWORDS_ORDER);
            }
            if (compare_words(l, zero, NWORDS_ORDER) != 0) {
                sub_test(lt, tpF, lt, NWORDS_ORDER);
            }
        }
        if (compare_words((digit_t*)scalarP, (digit_t*)kt, NWORDS_ORDER) != 0 || compare_words((digit_t*)scalarQ, (digit_t*)lt, NWORDS_ORDER) != 0) { passed = 0; break; }
    }
    if (passed == 1) printf("  dlog3 tests ..................................................... PASSED");
    else { printf("  dlog3 tests... FAILED"); printf("\n"); return false; }
    printf("\n");
    return OK;
 }
 bool ec_run()
 {
    bool OK = true;
    int n;
    unsigned long long cycles, cycles1, cycles2;
    ec_point_t P, Q, R, PQ, AC;
    digit_t k[NWORDS_ORDER], l[NWORDS_ORDER];
    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
    printf("Benchmarking ecc arithmetic: \n\n"); 
    // Point doubling
    cycles = 0;
    for (n=0; n<BENCH_LOOPS; n++)
    {
        cycles1 = cpucycles(); 
        xDBL(&Q, &P, &AC);
        cycles2 = cpucycles();
        cycles = cycles+(cycles2-cycles1);
    }
    printf("  Montgomery x-only doubling runs in .............................. %7lld cycles", cycles/BENCH_LOOPS);
    printf("\n");
    // Point addition
    cycles = 0;
    for (n = 0; n < BENCH_LOOPS; n++)
    {
        cycles1 = cpucycles();
        xADD(&R, &Q, &P, &PQ);
        cycles2 = cpucycles();
        cycles = cycles + (cycles2 - cycles1);
    }
    printf("  Montgomery x-only addition runs in .............................. %7lld cycles", cycles/BENCH_LOOPS);
    printf("\n");
    // Point multiplication
    cycles = 0;
    for (n = 0; n < BENCH_LOOPS; n++)
    {
        cycles1 = cpucycles();
        xMUL(&Q, &P, k, (ec_curve_t*)&AC);
        cycles2 = cpucycles();
        cycles = cycles + (cycles2 - cycles1);
    }
    printf("  Montgomery x-only scalar multiplication runs in ................. %7lld cycles", cycles/BENCH_LOOPS);
    printf("\n");
    // Point multiplication
    cycles = 0;
    for (n = 0; n < BENCH_LOOPS; n++)
    {
        cycles1 = cpucycles();
        xDBLMUL(&R, &P, k, &Q, l, &PQ, (ec_curve_t*)&AC);
        cycles2 = cpucycles();
        cycles = cycles + (cycles2 - cycles1);
    }
    printf("  Montgomery x-only double-scalar multiplication runs in .......... %7lld cycles", cycles/BENCH_LOOPS);
    printf("\n");
    return OK;
 }
 // Benchmarks ec_dlog_2 and ec_dlog_3.
 // For each of BENCH_LOOPS iterations two scalars are drawn with fprandom_test, the point
 // R is built from them with xDBLMUL (outside the timed region) and only the dlog call is
 // measured with cpucycles(). The per-call average is printed. Always returns true.
 bool dlog_run()
 {
    bool OK = true;
    int n;
    unsigned long long cycles, cycles1, cycles2;
    ec_point_t P = {0}, Q = {0}, R = {0}, PQ = {0};
    ec_curve_t AC = {0};
    ec_basis_t PQ2;
    digit_t scalarP[NWORDS_ORDER], scalarQ[NWORDS_ORDER], k[NWORDS_ORDER] = {0}, l[NWORDS_ORDER] = {0};
    printf("\n--------------------------------------------------------------------------------------------------------\n\n");
    printf("Benchmarking dlog2: \n\n");
    // dlog2 computation
    // Basis points: x from the 2^f-torsion table (converted with fp2_tomont), z = 1.
    fp2_tomont(&P.x, &xP2);
    fp_mont_setone(P.z.re);
    fp_set(P.z.im, 0);
    fp2_tomont(&Q.x, &xQ2);
    fp_mont_setone(Q.z.re);
    fp_set(Q.z.im, 0);
    fp2_tomont(&PQ.x, &xPQ2);
    fp_mont_setone(PQ.z.re);
    fp_set(PQ.z.im, 0);
    // Curve (A : C) = (0 : 1); set once here and reused unchanged for the dlog3 part.
    AC.C.re[0] = 0x01;
    fp2_tomont(&AC.C, &AC.C);
    copy_point(&PQ2.P, &P);
    copy_point(&PQ2.Q, &Q);
    copy_point(&PQ2.PmQ, &PQ);
    cycles = 0;
    for (n = 0; n < BENCH_LOOPS; n++)
    {
        fprandom_test(k); fprandom_test(l);
        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
        cycles1 = cpucycles();
        ec_dlog_2(scalarP, scalarQ, &PQ2, &R, &AC);
        cycles2 = cpucycles();
        cycles = cycles + (cycles2 - cycles1);
    }
    // cycles is unsigned long long, hence %llu.
    printf("  dlog2 runs in ................................................... %7llu cycles", cycles/BENCH_LOOPS);
    printf("\n");
    // dlog3 computation
    // Same setup with the 3^g-torsion table.
    fp2_tomont(&P.x, &xP3);
    fp_mont_setone(P.z.re);
    fp_set(P.z.im, 0);
    fp2_tomont(&Q.x, &xQ3);
    fp_mont_setone(Q.z.re);
    fp_set(Q.z.im, 0);
    fp2_tomont(&PQ.x, &xPQ3);
    fp_mont_setone(PQ.z.re);
    fp_set(PQ.z.im, 0);
    copy_point(&PQ2.P, &P);
    copy_point(&PQ2.Q, &Q);
    copy_point(&PQ2.PmQ, &PQ);
    cycles = 0;
    for (n = 0; n < BENCH_LOOPS; n++)
    {
        fprandom_test(k); fprandom_test(l);
        xDBLMUL(&R, &P, k, &Q, l, &PQ, &AC);
        cycles1 = cpucycles();
        ec_dlog_3(scalarP, scalarQ, &PQ2, &R, &AC);
        cycles2 = cpucycles();
        cycles = cycles + (cycles2 - cycles1);
    }
    printf("  dlog3 runs in ................................................... %7llu cycles", cycles/BENCH_LOOPS);
    printf("\n");
    return OK;
 }
 #endif
--- a/src/ec/ref/lvl5/test/test-basis.h
+++ b/src/ec/ref/lvl5/test/test-basis.h
@@ -0,0 +1,24 @@
 #ifndef TEST_BASIS_H
 #define TEST_BASIS_H
 #include "fp2.h"
 // Full-torsion basis for A=0 (excluding 2^f and huge prime factors)
 const fp2_t xPA = {{0x3c780e636a5869dc,0xb8a1d106332efe8e,0x7dd946e490e6578e,0x71d1fadbea881f88,0xb94912baba3999f0,0x85343be0a74ca9e1,0x22ae01775a9f7fa4,0x001032ffab70a66e},{0x15908a4b85221a67,0x342f82e6a1db4e1d,0x3d7c806a0d47b041,0x693830fad798c598,0xcfa244134a61827a,0x7f723d6f5d9628cf,0x10da657833d4d027,0x000c48499df01216}};
 const fp2_t xQA = {{0x79a766df9c10c642,0x7677cb85097be8be,0x2a21c7f9b84b9deb,0xb263e837f57210ce,0x551d6636b7c7e061,0x78d332581bee10b2,0xce30a9926772e06c,0x00150b5009b1d6ed},{0xbb2f097dae470eb9,0x53940c6df1eb93a9,0x7786a4bab87320c1,0x89d32acc1c91db18,0x733ef7f139fb7f9b,0x7bc336ee25a3901b,0xf7dfe8f5559eeeb1,0x00210555ab63e7f3}};
 const fp2_t xPQA = {{0x315ead6fadc8b0d6,0x7da37e8b7e94de95,0xcc6a9e206f513651,0x84fa9fab584acf3d,0x293b25689ac50519,0xe3222bd1c8154964,0x8ad7f39d04a8274f,0x000898edca69c223},{0x3e6c3e1864851e7e,0x01807c724f75ad5e,0xe9cd50eff4e66fb7,0x6c7c19a88fed9707,0x3ab57d0499386a40,0x6b5fd53c6efdc0b5,0x092fe030da27bc43,0x00076f2f409c5f8e}};
 const fp2_t xPB = {{0x229e388475511856,0x2f6b17e9ec9258c0,0x0cb28c568697f9f4,0xca039e28512c9f9b,0xd52d823761b0daa2,0xa09c3800e22c5e3b,0x2971022668c3b76a,0x0006e91c4415afd1},{0xbd5059b7406e1dcd,0x9da456ed8c11f1a3,0x1fb30e9cf66f928e,0x867c348b2f488d26,0x9d4b03d8aa4229bc,0x1c01ca1088d145a8,0xc9d6a201d77644a1,0x000a0d45131bf5b0}};
 const fp2_t xQB = {{0x712f0e5d0e3b4dfa,0x52260082dda1a07e,0x5a7513dcfd273829,0xc686f0976cbb5dcf,0xf5fc3df004cc7efc,0x615d0c2da4f2fb9f,0x796efbb3f65aede8,0x00028176c42e1d9f},{0xb8779b5a7bd2436b,0x4067b7e09d0ca56c,0xfdbaee6ff27ebe38,0x69310e98174025de,0x71960a10fa15706e,0x08ffb4b3f6efafbf,0xb7116ca162211ea3,0x00253c0f60765f1f}};
 const fp2_t xPQB = {{0x0e90506c89b46e0c,0x24ec65d5deb4e5b9,0x8477f7e141db8725,0xf76957ec1940dbd3,0xc2857af32534e715,0x06820654c6bae5f4,0x5ac928ef3c90c1f8,0x0024f724366faeed},{0xf6d7d2fdb06b91c4,0xe603cf05ce3f7555,0x8a0876277637415c,0xa1ef891f00155f8f,0x159db3ac93d39d57,0x5a05683aeaa453ff,0x180c38da2402f6fc,0x000b69d01dcb9107}};
 // 2^f-torsion basis for A=0
 const fp2_t xP2 = {{0x5d453ee3e6de9bf6,0xb5e51a5e88d8bbf3,0xc91ce6ef41eda957,0x4e0ba74e86fd3385,0xeff87c1def35e01f,0xedcd6c20496988a5,0x91a2c14abdb955fe,0x000be92a3f4de175},{0xa8a13d8e0022a825,0xb26bb70885d42bef,0x2533c31e799596b4,0xc41d58b247fb5ac9,0x8d45fa188fd5cb65,0x1b0593f6e4af948d,0x0ede22e4fcbe17ca,0x0014f54c5d5e1308}};
 const fp2_t xQ2 = {{0x90414b2365f868cd,0x68af18688f73fe25,0x46ca4c4b4ca19114,0xadae5e2564f79c98,0xfe3e09af9d00eb08,0x6856810a298a57bf,0x170d41ba9327205d,0x001d588b6744b4ea},{0xfb94e978bcf29be5,0x136700c07b264bd6,0x62a3c89d8466b8f9,0x9f990ca7d3084bd8,0xaab6fb1040e242d0,0x9e9325c5a5c20740,0xa9a6ee97f376e198,0x0003c8eee3581511}};
 const fp2_t xPQ2 = {{0x873d426c501eafe6,0xdeb1e87769484669,0x57c38f42bd1fef4d,0x53ca12d14b2ded18,0xb72ef4a808fc9d70,0x59d9a54b1844cca1,0x6ca7ccb15b6a9e49,0x00132a12929654f7},{0xffc6b824b6603270,0xb4152cbd3b607298,0xbe97764acdcb16ce,0x5205b1ec222c3be9,0x0cf5ac18d1eb4984,0xf5233664fd72c328,0x492e775887a3367c,0x001ce6bdfc847b45}};
 // 3^g-torsion basis for A=0
 const fp2_t xP3 = {{0x807a6abcb56d1915,0x3ab8ff7df809ea8f,0x2bd4f1eba48b23ac,0xeb32542370dde5ff,0xe6c50551eaaf2329,0x545dceaf98f07f09,0x90bfb0e10f3e5b48,0x000cc0084da1b367},{0xbd6f9c82cd4acc13,0x9b39d0711267d8a2,0x0ff31ab9fd38bb36,0xccc169cd75c1a58b,0xd943ad3571e304b4,0xfc3cda0859595d00,0xabda66362732b019,0x00070c5abcf1f329}};
 const fp2_t xQ3 = {{0x2b46bbfa6e57a9db,0xa7a5881479d3aaff,0x5c8106d57698b7cb,0xde0ccd3c436cd1ad,0xed351e8fbc28fd8f,0xe18a9a18e4f5bf03,0x9a98961a81073911,0x001ed93f47abe8f2},{0x5dc96ddee6e9a9eb,0x5e8905d15b918006,0xe89cecdc3f9b48f1,0x9d1a98543001e35e,0x0795c7b134dadeba,0x8050c48376f36d87,0xe9f364f7c6fbee1f,0x00061cb05b384f81}};
 const fp2_t xPQ3 = {{0xd44970f662987227,0x4c8eda7256920e8d,0x857f42e972e25a0e,0xc66a5b62daa3644d,0x6ab4ded74a464c38,0x4157cc1048b85a3a,0x9916ab1ee4e2305a,0x000c6943137ffba1},{0x0c5118f818e5279d,0xacb0c4a011613c7a,0xb87b4a9cb16a7565,0xc997ccbe0159f318,0x6fc50720bce6f45f,0xbd1916a5ca7789d7,0x3f48f437fdeccc64,0x000674d925340bc4}};
 #endif
--- a/src/gf/CMakeLists.txt
+++ b/src/gf/CMakeLists.txt
@@ -0,0 +1 @@
 # Run the CMake script whose path is held in SELECT_IMPL_TYPE (set outside this file).
 # NOTE(review): presumably this picks the implementation sub-directory (e.g. broadwell) - confirm.
 include(${SELECT_IMPL_TYPE})
--- a/src/gf/broadwell/CMakeLists.txt
+++ b/src/gf/broadwell/CMakeLists.txt
@@ -0,0 +1 @@
 # Run the CMake script whose path is held in SELECT_SQISIGN_VARIANT (set outside this file).
 # NOTE(review): presumably this picks the parameter-set sub-directory (e.g. lvl1) - confirm.
 include(${SELECT_SQISIGN_VARIANT})
--- a/src/gf/broadwell/lvl1/CMakeLists.txt
+++ b/src/gf/broadwell/lvl1/CMakeLists.txt
@@ -0,0 +1,10 @@
 # GF(p) / GF(p^2) arithmetic library for the current variant, Broadwell assembly build.
 # SVARIANT_UPPER, LIB_GF_<variant>, INC_COMMON, INC_PRECOMP_<variant> and C_OPT_FLAGS
 # are provided by the including scope.
 set(SOURCE_FILES_GF_${SVARIANT_UPPER}_BROADWELL
     fp_asm.S
     fp.c
     fp2.c
 )
 add_library(${LIB_GF_${SVARIANT_UPPER}}
     ${SOURCE_FILES_GF_${SVARIANT_UPPER}_BROADWELL}
 )
 # Relative entries (common, include) are resolved against this source directory.
 target_include_directories(${LIB_GF_${SVARIANT_UPPER}}
     PRIVATE
         common
         ${INC_COMMON}
         ${INC_PRECOMP_${SVARIANT_UPPER}}
         include
         ${PROJECT_SOURCE_DIR}/include
 )
 target_compile_options(${LIB_GF_${SVARIANT_UPPER}}
     PRIVATE
         ${C_OPT_FLAGS}
 )
 add_subdirectory(test)
--- a/src/gf/broadwell/lvl1/Makefile
+++ b/src/gf/broadwell/lvl1/Makefile
@@ -0,0 +1,46 @@
 # Stand-alone build of the lvl1 Broadwell GF library (lib/libtest.a) and its test programs.
 CC=gcc
 CFLAGS= -O3 -std=gnu11 -Wall -march=native -Wno-missing-braces -Wno-logical-not-parentheses 
 LDFLAGS=-lm
 AR=ar rcs
 RANLIB=ranlib
 OBJECTS=objs/fp_p1913.o objs/fp.o objs/fp2.o objs/fp_asm.o objs/random.o
 # These targets do not name files; declare them phony so a stray file cannot mask them.
 .PHONY: all tests check clean
 all: lib tests
 # Every object rule creates objs/ itself, so rules may run in any order (make -j) or alone.
 objs/fp_p1913.o: fp_p1913.c
 	@mkdir -p $(@D)
 	$(CC) -c $(CFLAGS) fp_p1913.c -o objs/fp_p1913.o
 objs/fp.o: fp.c
 	@mkdir -p $(@D)
 	$(CC) -c $(CFLAGS) fp.c -o objs/fp.o
 objs/fp2.o: fp2.c
 	@mkdir -p $(@D)
 	$(CC) -c $(CFLAGS) fp2.c -o objs/fp2.o
 objs/fp_asm.o: fp_asm.S
 	@mkdir -p $(@D)
 	$(CC) -c $(CFLAGS) fp_asm.S -o objs/fp_asm.o
 objs/random.o: ../../../common/generic/randombytes_system.c
 	@mkdir -p $(@D)
 	$(CC) -c $(CFLAGS) ../../../common/generic/randombytes_system.c -o objs/random.o
 # Rebuild the static archive from scratch in a fresh lib/ directory.
 lib: $(OBJECTS)
 	rm -rf lib
 	mkdir lib
 	$(AR) lib/libtest.a $^
 	$(RANLIB) lib/libtest.a
 # The test programs link against libtest.a and GMP.
 tests: lib
 	$(CC) $(CFLAGS) -L./lib test/test_fp.c test/test_extras.c -ltest $(LDFLAGS) -o test_fp -lgmp
 	$(CC) $(CFLAGS) -L./lib test/test_fp2.c test/test_extras.c -ltest $(LDFLAGS) -o test_fp2 -lgmp
 # check only builds the tests; it does not run them.
 check: tests
 clean:
 	rm -rf *.req objs lib test_fp*
--- a/src/gf/broadwell/lvl1/fp.c
+++ b/src/gf/broadwell/lvl1/fp.c
@@ -0,0 +1,192 @@
 #include "include/fp.h"
 // Field modulus p, least-significant 64-bit word first (used by fp_neg and by the assembly routines)
 const uint64_t p[NWORDS_FIELD] =  { 0xffffffffffffffff, 0x252C9E49355147FF, 0x33A6A86587407437, 0x34E29E286B95D98C };
 // Factor that fp_tomont multiplies by to enter Montgomery form (R^2 mod p according to fp_tomont's comment)
 const uint64_t R2[NWORDS_FIELD] = { 0x233625AE400674D4, 0x20AFD6C1025A1C2E, 0x30A841AB0920655D, 0x0D72E7D67C30CD3D };
 // NOTE(review): not referenced by the code visible in this file; presumably a Montgomery reduction constant - confirm
 const uint64_t pp[NWORDS_FIELD] = { 0x01, 0x00, 0x00, 0x00 };
 void fp_set(digit_t* x, const digit_t val)
 { // Load a single-word value into a field element: x = val
   // No Montgomery conversion is performed here
     unsigned int idx = NWORDS_FIELD;
     // Clear every word above the lowest one, walking down from the top
     while (idx > 1) {
         idx--;
         x[idx] = 0;
     }
     x[0] = val;
 }
 void fp_mont_setone(digit_t* out1) {
     // Words of 2^256 mod p (the value 1 in Montgomery form), lowest word first
     static const uint64_t mont_one[4] = {
         UINT64_C(0x4),
         UINT64_C(0x6b4d86db2abae000),
         UINT64_C(0x31655e69e2fe2f23),
         UINT64_C(0x2c75875e51a899cf)
     };
     for (unsigned int w = 0; w < 4; w++) {
         out1[w] = mont_one[w];
     }
 }
 bool fp_is_equal(const digit_t* a, const digit_t* b)
 { // Equality test for two field elements without data-dependent branches
   // Returns 1 (true) when every word of a matches b, 0 (false) otherwise
     digit_t diff = 0;
     unsigned int idx = 0;
     // Accumulate the XOR of all word pairs; diff stays 0 only if all words agree
     while (idx < NWORDS_FIELD) {
         diff |= (a[idx] ^ b[idx]);
         idx++;
     }
     return (bool)is_digit_zero_ct(diff);
 }
 bool fp_is_zero(const digit_t* a)
 { // Zero test for a field element without data-dependent branches
   // Returns 1 (true) when all words of a are 0, 0 (false) otherwise
     digit_t any_bits = 0;
     unsigned int idx = 0;
     // OR all words together; the result is 0 only for the all-zero element
     while (idx < NWORDS_FIELD) {
         any_bits |= a[idx];
         idx++;
     }
     return (bool)is_digit_zero_ct(any_bits);
 }
 void fp_copy(digit_t* out, const digit_t* a)
 { // Copy one field element: out = a (NWORDS_FIELD words)
     for (unsigned int idx = 0; idx < NWORDS_FIELD; idx++) {
         out[idx] = a[idx];
     }
 }
 void fp_neg(digit_t* out, const digit_t* a)
 { // Modular negation, out = -a mod p
   // Input: a in [0, p-1] 
   // Output: out in [0, p-1] 
   // Step 1 forms p - a word by word (which equals p itself when a = 0).
   // Step 2 subtracts p with fp_sub, which adds p back on a borrow, so a = 0 yields 0 rather than p.
     unsigned int i, borrow = 0;
     for (i = 0; i < NWORDS_FIELD; i++) {
         // SUBC is a project macro; presumably out[i] = p[i] - a[i] - borrow with the new borrow written back - confirm
         SUBC(out[i], borrow, ((digit_t*)p)[i], a[i], borrow);
     }
     fp_sub(out, out, (digit_t*)p);
 }
 void fp_tomont(digit_t* out, const digit_t* a)
 { // Enter Montgomery representation
   // A Montgomery multiplication by R^2 gives a*R^2*R^(-1) = a*R mod p, for a in [0, p-1]
     const digit_t* r_squared = (const digit_t*)R2;
     fp_mul(out, a, r_squared);
 }
 void fp_frommont(digit_t* out, const digit_t* a)
 { // Leave Montgomery representation
   // A Montgomery multiplication by the plain integer 1 gives a*R^(-1) mod p, for a in [0, p-1]
     const digit_t unit[NWORDS_FIELD] = {1};
     fp_mul(out, a, unit);
 }
 void MUL(digit_t* out, const digit_t a, const digit_t b)
 { // Digit multiplication, digit*digit -> 2-digit result 
   // Inputs: a, b in [0, 2^w-1], where w is the computer wordsize 
   // Output: out[0] = low digit, out[1] = high digit of a*b, so 0 <= out <= (2^w-1)^2
   // Schoolbook method on half-digits: each operand is split into a low and a high half
   // (sizeof(digit_t)*4 bits each) and the four partial products are recombined.
     register digit_t al, ah, bl, bh, temp;
     digit_t albl, albh, ahbl, ahbh, res1, res2, res3, carry;
     digit_t mask_low = (digit_t)(-1) >> (sizeof(digit_t)*4), mask_high = (digit_t)(-1) << (sizeof(digit_t)*4);
     al = a & mask_low;                        // Low part
     ah = a >> (sizeof(digit_t)*4);            // High part
     bl = b & mask_low;
     bh = b >> (sizeof(digit_t)*4);
     albl = al * bl;
     albh = al * bh;
     ahbl = ah * bl;
     ahbh = ah * bh;
     out[0] = albl & mask_low;                 // out00
     // Second half-digit: high half of al*bl plus the low halves of both cross products
     res1 = albl >> (sizeof(digit_t)*4);
     res2 = ahbl & mask_low;
     res3 = albh & mask_low;
     temp = res1 + res2 + res3;
     carry = temp >> (sizeof(digit_t)*4);
     out[0] ^= temp << (sizeof(digit_t)*4);    // out01   
     // Third half-digit: high halves of the cross products, low half of ah*bh, and the carry
     res1 = ahbl >> (sizeof(digit_t)*4);
     res2 = albh >> (sizeof(digit_t)*4);
     res3 = ahbh & mask_low;
     temp = res1 + res2 + res3 + carry;
     out[1] = temp & mask_low;                 // out10 
     // The carry is kept in the high half here, already aligned with the top half-digit
     carry = temp & mask_high;
     out[1] ^= (ahbh & mask_high) + carry;     // out11
 }
 digit_t mp_shiftr(digit_t* x, const unsigned int shift, const unsigned int nwords)
 { // Multiprecision right shift of the nwords-word integer x by shift bits, in place
   // Returns the least-significant bit x had before shifting (the bit shifted out when shift = 1)
   // Requires nwords >= 1: nwords-1 is unsigned and would wrap around for nwords = 0
   // NOTE(review): SHIFTR is a project macro; presumably it needs 0 < shift < RADIX - confirm
     digit_t bit_out = x[0] & 1;
     for (unsigned int i = 0; i < nwords-1; i++) {
         // Word i is rebuilt from itself and the low bits of the word above it
         SHIFTR(x[i+1], x[i], shift, x[i], RADIX);
     }
     x[nwords-1] >>= shift;
     return bit_out;
 }
 void mp_shiftl(digit_t* x, const unsigned int shift, const unsigned int nwords)
 { // Multiprecision left shift of the nwords-word integer x by shift bits, in place
   // Bits shifted out of the top word are discarded
   // NOTE(review): SHIFTL is a project macro; presumably it needs 0 < shift < RADIX - confirm
     for (int i = nwords-1; i > 0; i--) {
         // Walk from the top word down so each word still sees the unshifted word below it
         SHIFTL(x[i], x[i-1], shift, x[i], RADIX);
     }
     x[0] <<= shift;
 }
 static void fp_exp3div4(digit_t* out, const digit_t* a)
 { // Fixed exponentiation out = a^((p-3)/4) mod p, right-to-left square-and-multiply
   // Input: a in [0, p-1], Montgomery form
   // Output: out in [0, p-1], Montgomery form
   // Requirement: p = 3 (mod 4)
   // The exponent is the public modulus, so branching on its bits is not secret-dependent
     fp_t exponent, base;
     unsigned int remaining;
     fp_copy(exponent, (const digit_t*)p);
     fp_copy(base, a);                          // taken before out is written, so out may alias a
     // Two single-bit shifts give floor(p/4), which equals (p-3)/4 when p = 3 (mod 4)
     mp_shiftr(exponent, 1, NWORDS_FIELD);
     mp_shiftr(exponent, 1, NWORDS_FIELD);
     // out <- 1 in Montgomery form
     fp_set(out, 1);
     fp_tomont(out, out);
     for (remaining = NWORDS_FIELD*RADIX-2; remaining > 0; remaining--) {
         // mp_shiftr hands back the low bit the exponent had before this shift
         if (mp_shiftr(exponent, 1, NWORDS_FIELD) == 1) {
             fp_mul(out, out, base);
         }
         fp_sqr(base, base);
     }
 }
 void fp_inv(digit_t* a)
 { // In-place modular inversion via Fermat: a <- a^(p-2) mod p
   // Input: a = x*R in [0, p-1] (Montgomery form)
   // Output: a = x^-1*R in [0, p-1]; the result is 0 when the input is 0 (no inverse)
     fp_t pw;
     fp_exp3div4(pw, a);                 // a^((p-3)/4)
     // Squaring twice raises the exponent to p-3
     for (int k = 0; k < 2; k++) {
         fp_sqr(pw, pw);
     }
     fp_mul(a, pw, a);                   // a^(p-3) * a = a^(p-2)
 }
 bool fp_is_square(const digit_t* a)
 { // Euler criterion: a is reported as a square when a^((p-1)/2) = 1
   // Output: 1 (true) or 0 (false); note that a = 0 gives 0^((p-1)/2) = 0 and so returns false
     fp_t unit, euler;
     fp_set(unit, 1);                    // plain (non-Montgomery) 1 to compare against
     fp_exp3div4(euler, a);              // a^((p-3)/4)
     fp_sqr(euler, euler);               // a^((p-3)/2)
     fp_mul(euler, euler, a);            // a^((p-1)/2)
     fp_frommont(euler, euler);
     return fp_is_equal(euler, unit);
 }
 void fp_sqrt(digit_t* a)
 { // In-place square-root candidate: a <- a^((p+1)/4) mod p
   // No check is made here that the input is actually a square
     fp_t root_part;
     fp_exp3div4(root_part, a);          // a^((p-3)/4)
     fp_mul(a, root_part, a);            // a^((p-3)/4) * a = a^((p+1)/4)
 }
--- a/src/gf/broadwell/lvl1/fp2.c
+++ b/src/gf/broadwell/lvl1/fp2.c
@@ -0,0 +1,190 @@
 #include <fp2.h>
 extern const digit_t R[NWORDS_FIELD];
 extern void fp2_sq_c0(fp2_t *out, const fp2_t *in);
 extern void fp2_sq_c1(fp_t *out, const fp2_t *in);
 extern void fp2_mul_c0(fp_t *out, const fp2_t *in0, const fp2_t *in1);
 extern void fp2_mul_c1(fp_t *out, const fp2_t *in0, const fp2_t *in1);
 /* Arithmetic modulo X^2 + 1 */
 void fp2_set(fp2_t* x, const digit_t val)
 { // x = val + 0*i, where val is a single word (no Montgomery conversion)
     fp_set(x->im, 0);
     fp_set(x->re, val);
 }
 bool fp2_is_zero(const fp2_t* a)
 { // Zero test for a GF(p^2) element
   // Returns 1 (true) if both components are 0, 0 (false) otherwise
     const bool re_zero = fp_is_zero(a->re);
     const bool im_zero = fp_is_zero(a->im);
     // Bitwise AND on purpose: both components are always examined
     return re_zero & im_zero;
 }
 bool fp2_is_equal(const fp2_t* a, const fp2_t* b)
 { // Equality test for two GF(p^2) elements in constant time
   // Returns 1 (true) if both components match, 0 (false) otherwise
     const bool re_same = fp_is_equal(a->re, b->re);
     const bool im_same = fp_is_equal(a->im, b->im);
     // Bitwise AND on purpose: both components are always compared
     return re_same & im_same;
 }
 void fp2_copy(fp2_t* x, const fp2_t* y)
 { // x = y, component by component
     fp_copy(x->im, y->im);
     fp_copy(x->re, y->re);
 }
 fp2_t fp2_non_residue()
 { // Returns 2 + i in Montgomery form; 2 + i is a quadratic non-residue for p1913
     fp2_t nr;
     // Imaginary part: 1 converted to Montgomery form
     fp_set(nr.im, 1);
     fp_tomont(nr.im, nr.im);
     // Real part: 1 + 1 = 2
     fp_add(nr.re, nr.im, nr.im);
     return nr;
 }
 void fp2_add(fp2_t* x, const fp2_t* y, const fp2_t* z)
 { // x = y + z in GF(p^2): the two components are added independently
     fp_add(x->im, y->im, z->im);
     fp_add(x->re, y->re, z->re);
 }
 void fp2_sub(fp2_t* x, const fp2_t* y, const fp2_t* z)
 { // x = y - z in GF(p^2): the two components are subtracted independently
     fp_sub(x->im, y->im, z->im);
     fp_sub(x->re, y->re, z->re);
 }
 void fp2_neg(fp2_t* x, const fp2_t* y)
 { // x = -y in GF(p^2): both components are negated
     fp_neg(x->im, y->im);
     fp_neg(x->re, y->re);
 }
 void fp2_mul(fp2_t* x, const fp2_t* y, const fp2_t* z)
 { // x = y * z in GF(p^2) = GF(p)[i]/(i^2 + 1), operands in Montgomery form
   // x may alias y and/or z: the real part is produced in a temporary and only
   // stored once both assembly routines have finished reading the inputs.
     fp_t t;
     fp2_mul_c0(&t, y, z);              // c0 = a0*b0 - a1*b1
     fp2_mul_c1(&x->im, y, z);          // c1 = a0*b1 + a1*b0 
     // Use fp_copy instead of a hand-written 4-limb copy so the limb count lives in one place
     fp_copy(x->re, t);
 }
 void fp2_sqr(fp2_t* x, const fp2_t* y) {
     // x = y^2 in GF(p^2), operands in Montgomery form; x may alias y.
     // fp2_sq_c0 uses both halves of its output as scratch space, hence the full fp2_t temporary.
     fp2_t t;
     fp2_sq_c0(&t, y);               // c0 = (a0+a1)(a0-a1)
     fp2_sq_c1(&x->im, y);           // c1 = 2a0*a1
     // Use fp_copy instead of a hand-written 4-limb copy so the limb count lives in one place
     fp_copy(x->re, t.re);
 }
 void fp2_inv(fp2_t* x)
 { // In-place inversion in GF(p^2): 1/(a + b*i) = (a - b*i) / (a^2 + b^2)
     fp_t norm, im_sq;
     // norm = a^2 + b^2
     fp_sqr(norm, x->re);
     fp_sqr(im_sq, x->im);
     fp_add(norm, norm, im_sq);
     // norm <- 1/norm
     fp_inv(norm);
     // Scale both components, then conjugate
     fp_mul(x->re, x->re, norm);
     fp_mul(x->im, x->im, norm);
     fp_neg(x->im, x->im);
 }
 bool fp2_is_square(const fp2_t* x)
 { // Tests whether the norm re^2 + im^2 of x is a square in GF(p), as decided by fp_is_square
     fp_t norm, im_sq;
     fp_sqr(im_sq, x->im);
     fp_sqr(norm, x->re);
     fp_add(norm, norm, im_sq);
     return fp_is_square(norm);
 }
 void fp2_frob(fp2_t* x, const fp2_t* y)
 { // x = conjugate of y: the real part is copied, the imaginary part negated
     fp_neg(x->im, y->im);
     fp_copy(x->re, y->re);
 }
 void fp2_tomont(fp2_t* x, const fp2_t* y)
 { // Convert both components of y to Montgomery form and store them in x
     fp_tomont(x->im, y->im);
     fp_tomont(x->re, y->re);
 }
 void fp2_frommont(fp2_t* x, const fp2_t* y)
 { // Convert both components of y out of Montgomery form and store them in x
     fp_frommont(x->im, y->im);
     fp_frommont(x->re, y->re);
 }
 // NOTE: old implementation that branches on its input (not constant time). Could be optimized
 // In-place square root in GF(p^2); no check is made that x is actually a square.
 void fp2_sqrt(fp2_t* x)
 {
     fp_t norm_root, half, re_root, im_sq, im_root;
     if (fp_is_zero(x->im)) {
         // x lies in GF(p)
         if (!fp_is_square(x->re)) {
             // re is not a square: the root is purely imaginary, sqrt(-re)*i
             fp_neg(x->im, x->re);
             fp_sqrt(x->im);
             fp_set(x->re, 0);
         } else {
             fp_sqrt(x->re);
         }
         return;
     }
     // norm_root = sqrt(re^2 + im^2)
     fp_sqr(norm_root, x->re);
     fp_sqr(im_sq, x->im);
     fp_add(norm_root, norm_root, im_sq);
     fp_sqrt(norm_root);
     // half = 1/2 in Montgomery form
     fp_set(half, 2);
     fp_tomont(half, half);
     fp_inv(half);
     // Candidate for the squared real part of the root: (re + norm_root)/2
     fp_add(re_root, x->re, norm_root);
     fp_mul(re_root, re_root, half);
     if (!fp_is_square(re_root)) {
         // Fall back to (re - norm_root)/2
         fp_sub(re_root, x->re, norm_root);
         fp_mul(re_root, re_root, half);
     }
     fp_sqrt(re_root);
     // Imaginary part of the root: im / (2 * re_root)
     fp_copy(im_root, re_root);
     fp_inv(im_root);
     fp_mul(im_root, im_root, half);
     fp_mul(x->im, im_root, x->im);
     fp_copy(x->re, re_root);
 }
 // Lexicographic comparison of two GF(p^2) elements after conversion out of Montgomery form.
 // Real parts are compared first (most-significant word first), then imaginary parts.
 // Returns +1 if x > y, -1 if x < y, 0 if x = y. Exits at the first differing word.
 int fp2_cmp(fp2_t* x, fp2_t* y){
     fp2_t xs, ys;
     fp2_frommont(&xs, x);
     fp2_frommont(&ys, y);
     const digit_t* lhs[2] = { xs.re, xs.im };
     const digit_t* rhs[2] = { ys.re, ys.im };
     for (int part = 0; part < 2; part++) {
         for (int w = NWORDS_FIELD-1; w >= 0; w--) {
             if (lhs[part][w] != rhs[part][w])
                 return (lhs[part][w] > rhs[part][w]) ? 1 : -1;
         }
     }
     return 0;
 }
--- a/src/gf/broadwell/lvl1/fp_asm.S
+++ b/src/gf/broadwell/lvl1/fp_asm.S
@@ -0,0 +1,555 @@
 .intel_syntax noprefix
 .set pbytes,32
 .set plimbs,4
 .global p_plus_1
 p_plus_1: .quad 0x0000000000000000, 0x252C9E4935514800, 0x33A6A86587407437, 0x34E29E286B95D98C
 .text
 .p2align 4,,15
 // fp_add: [rdi] <- [rsi] + [rdx] mod p, operands are 4 x 64-bit words
 // p is subtracted from the raw sum; if that borrows, p is added back through a mask,
 // so no branch depends on the data. Intended for inputs already below p.
 .global fp_add
 fp_add:
   push   r12  
   xor    rax, rax
   // [r8:r11] <- a + b
   mov    r8, [rsi]
   mov    r9, [rsi+8]
   mov    r10, [rsi+16]
   mov    r11, [rsi+24]
   add    r8, [rdx] 
   adc    r9, [rdx+8] 
   adc    r10, [rdx+16] 
   adc    r11, [rdx+24] 
   // [r8:r11] <- a + b - p, keeping the words of p in r12, rcx, rsi, rdx
   mov    r12, [rip+p]
   sub    r8, r12
   mov    rcx, [rip+p+8]
   sbb    r9, rcx
   mov    rsi, [rip+p+16]
   sbb    r10, rsi
   mov    rdx, [rip+p+24]
   sbb    r11, rdx
   // rax <- all ones if the subtraction borrowed, 0 otherwise
   sbb    rax, 0
   and    r12, rax
   and    rcx, rax
   and    rsi, rax
   and    rdx, rax
   // add back p (or 0)
   add    r8, r12  
   adc    r9, rcx  
   adc    r10, rsi  
   adc    r11, rdx 
   mov    [rdi], r8
   mov    [rdi+8], r9 
   mov    [rdi+16], r10 
   mov    [rdi+24], r11
   pop    r12
   ret
 // fp_sub: [rdi] <- [rsi] - [rdx] mod p, operands are 4 x 64-bit words
 // If the raw subtraction borrows, p is added back through a mask (no data-dependent branch).
 .global fp_sub
 fp_sub:
   push   r12  
   xor    rax, rax
   // [r8:r11] <- a - b
   mov    r8, [rsi]
   mov    r9, [rsi+8]
   mov    r10, [rsi+16]
   mov    r11, [rsi+24]
   sub    r8, [rdx] 
   sbb    r9, [rdx+8] 
   sbb    r10, [rdx+16] 
   sbb    r11, [rdx+24]
   // rax <- all ones if the subtraction borrowed, 0 otherwise
   sbb    rax, 0
   mov    r12, [rip+p]
   mov    rcx, [rip+p+8]
   mov    rsi, [rip+p+16]
   mov    rdx, [rip+p+24]
   and    r12, rax
   and    rcx, rax
   and    rsi, rax
   and    rdx, rax  
   // add back p (or 0)
   add    r8, r12  
   adc    r9, rcx 
   adc    r10, rsi  
   adc    r11, rdx 
   mov    [rdi], r8
   mov    [rdi+8], r9 
   mov    [rdi+16], r10 
   mov    [rdi+24], r11 
   pop    r12
   ret
 ///////////////////////////////////////////////////////////////// MACROS
 // z = a x bi + z   (64-bit word times 256-bit operand, accumulated)
 // Inputs: base memory pointer M1 (a, 4 words),
 //         bi pre-stored in rdx,
 //         accumulator z in [Z0:Z4]
 // Output: [Z0:Z4]
 // Temps:  regs T0:T1
 // C:      register that is zeroed to clear CF and OF before the two carry chains start.
 //         Callers in this file pass rax, or the same register as Z4 when the top
 //         accumulator word has to start from 0.
 /////////////////////////////////////////////////////////////////
 .macro MULADD64x256 M1, Z0, Z1, Z2, Z3, Z4, T0, T1, C
     mulx   \T0, \T1, \M1     // bi * a[0]
     xor    \C, \C
     // adox chain adds into the accumulator using OF, adcx chain uses CF
     adox   \Z0, \T1
     adox   \Z1, \T0  
     mulx   \T0, \T1, 8\M1    // bi * a[1]
     adcx   \Z1, \T1
     adox   \Z2, \T0    
     mulx   \T0, \T1, 16\M1   // bi * a[2]
     adcx   \Z2, \T1
     adox   \Z3, \T0
     mulx   \T0, \T1, 24\M1   // bi * a[3]
     adcx   \Z3, \T1
     adox   \Z4, \T0
     adc    \Z4, 0            // fold the remaining CF carry into the top word
 .endm
 // z = a x bi + z   (64-bit word times 192-bit operand, accumulated)
 // Inputs: base memory pointer M1 (a, 3 words), bi pre-stored in rdx, accumulator z in [Z0:Z3]
 // Output: [Z0:Z3]
 // Temps:  regs T0:T1; rax is zeroed (this also clears CF and OF for the carry chains)
 .macro MULADD64x192 M1, Z0, Z1, Z2, Z3, T0, T1
     mulx   \T0, \T1, \M1     // bi * a[0]
     xor    rax, rax
     adox   \Z0, \T1
     adox   \Z1, \T0  
     mulx   \T0, \T1, 8\M1    // bi * a[1]
     adcx   \Z1, \T1
     adox   \Z2, \T0    
     mulx   \T0, \T1, 16\M1   // bi * a[2]
     adcx   \Z2, \T1
     adox   \Z3, \T0
     adc    \Z3, 0            // fold the remaining CF carry into the top word
 .endm
 //***********************************************************************
 //  Multiplication in GF(p^2), non-complex part
 //  Operation: c [rdi] = a0 x b0 - a1 x b1   (Montgomery-reduced)
 //  Inputs: a = [a1, a0] stored in [rsi]     (a0 at [rsi], a1 at [rsi+32])
 //          b = [b1, b0] stored in [rdx]     (b0 at [rdx], b1 at [rdx+32])
 //  Output: c stored in [rdi]
 //  The subtraction is done as a0 x b0 + a1 x (p - b1).
 //  Note: [rdi] is used as scratch for p - b1 before a and b are fully read,
 //        so the output buffer must not overlap either input.
 //***********************************************************************
 .global fp2_mul_c0
 fp2_mul_c0:    
     push   r12 
     push   r13 
     push   r14   
     mov    rcx, rdx                // rcx <- b; rdx is needed as the implicit mulx operand
 	// [rdi0:3] <- p - b1
 	mov    r8, [rip+p]  
 	mov    r9, [rip+p+8]   
 	mov    r10, [rip+p+16]
 	mov    r11, [rip+p+24] 
 	mov    rax, [rcx+32]
 	mov    rdx, [rcx+40]        
 	sub    r8, rax
 	sbb    r9, rdx
 	mov    rax, [rcx+48]
 	mov    rdx, [rcx+56]
 	sbb    r10, rax
 	sbb    r11, rdx
 	mov    [rdi], r8
 	mov    [rdi+8], r9
 	mov    [rdi+16], r10
 	mov    [rdi+24], r11
     // [r8:r12] <- z = a0 x b00 - a1 x b10
     mov    rdx, [rcx]
     mulx   r9, r8, [rsi]         
     xor    rax, rax
     mulx   r10, r11, [rsi+8]
     adox   r9, r11        
     mulx   r11, r12, [rsi+16] 
     adox   r10, r12        
     mulx   r12, r13, [rsi+24]
     adox   r11, r13  
     adox   r12, rax
     mov    rdx, [rdi]    
     MULADD64x256 [rsi+32], r8, r9, r10, r11, r12, r13, r14, rax
     // [r9:r12] <- z = (z0 x p_plus_1 + z)/2^64
     // The low word of p+1 is 0, so only its upper three words are multiplied and z0 drops out
     mov    rdx, r8                 // rdx <- z0 
     MULADD64x192 [rip+p_plus_1+8], r9, r10, r11, r12, r13, r14
     // [r9:r12, r8] <- z = a0 x b01 - a1 x b11 + z 
     mov    rdx, [rcx+8]
     MULADD64x256 [rsi], r9, r10, r11, r12, r8, r13, r14, r8
     mov    rdx, [rdi+8]    
     MULADD64x256 [rsi+32], r9, r10, r11, r12, r8, r13, r14, rax
     // [r10:r12, r8] <- z = (z0 x p_plus_1 + z)/2^64
     mov    rdx, r9                 // rdx <- z0 
     MULADD64x192 [rip+p_plus_1+8], r10, r11, r12, r8, r13, r14
     // [r10:r12, r8:r9] <- z = a0 x b02 - a1 x b12 + z 
     mov    rdx, [rcx+16]
     MULADD64x256 [rsi], r10, r11, r12, r8, r9, r13, r14, r9
     mov    rdx, [rdi+16]    
     MULADD64x256 [rsi+32], r10, r11, r12, r8, r9, r13, r14, rax
     // [r11:r12, r8:r9] <- z = (z0 x p_plus_1 + z)/2^64
     mov    rdx, r10                // rdx <- z0 
     MULADD64x192 [rip+p_plus_1+8], r11, r12, r8, r9, r13, r14
     // [r11:r12, r8:r10] <- z = a0 x b03 - a1 x b13 + z 
     mov    rdx, [rcx+24]
     MULADD64x256 [rsi], r11, r12, r8, r9, r10, r13, r14, r10
     mov    rdx, [rdi+24]    
     MULADD64x256 [rsi+32], r11, r12, r8, r9, r10, r13, r14, rax
     // [r12, r8:r10] <- z = (z0 x p_plus_1 + z)/2^64
     mov    rdx, r11                // rdx <- z0 
     MULADD64x192 [rip+p_plus_1+8], r12, r8, r9, r10, r13, r14
 	// Final correction: subtract p, then add it back (masked) if that borrowed
 	// rax is 0 here (zeroed by the last MULADD64x192)
 	mov    rsi, [rip+p]
 	mov    rcx, [rip+p+8]
 	mov    rdx, [rip+p+16]
 	mov    r11, [rip+p+24]
 	sub    r12, rsi
 	sbb    r8, rcx
 	sbb    r9, rdx
 	sbb    r10, r11
 	sbb    rax, 0
 	and    rsi, rax
 	and    rcx, rax
 	and    rdx, rax
 	and    r11, rax
 	add    r12, rsi
 	adc    r8, rcx
 	adc    r9, rdx
 	adc    r10, r11
     mov    [rdi], r12          
     mov    [rdi+8], r8         
     mov    [rdi+16], r9         
     mov    [rdi+24], r10 
     pop    r14
     pop    r13
     pop    r12
     ret
 //***********************************************************************
 //  Multiplication in GF(p^2), complex part
 //  Operation: c [rdi] = a0 x b1 + a1 x b0   (Montgomery-reduced)
 //  Inputs: a = [a1, a0] stored in [rsi]     (a0 at [rsi], a1 at [rsi+32])
 //          b = [b1, b0] stored in [rdx]     (b0 at [rdx], b1 at [rdx+32])
 //  Output: c stored in [rdi]
 //  [rdi] is written only after all input words have been read.
 //***********************************************************************
 .global fp2_mul_c1
 fp2_mul_c1:    
     push   r12 
     push   r13 
     push   r14   
     mov    rcx, rdx                // rcx <- b; rdx is needed as the implicit mulx operand
     // [r8:r12] <- z = a0 x b10 + a1 x b00
     mov    rdx, [rcx+32]
     mulx   r9, r8, [rsi]         
     xor    rax, rax
     mulx   r10, r11, [rsi+8]
     adox   r9, r11        
     mulx   r11, r12, [rsi+16] 
     adox   r10, r12        
     mulx   r12, r13, [rsi+24]
     adox   r11, r13  
     adox   r12, rax
     mov    rdx, [rcx]    
     MULADD64x256 [rsi+32], r8, r9, r10, r11, r12, r13, r14, rax
     // [r9:r12] <- z = (z0 x p_plus_1 + z)/2^64
     // The low word of p+1 is 0, so only its upper three words are multiplied and z0 drops out
     mov    rdx, r8                 // rdx <- z0 
     MULADD64x192 [rip+p_plus_1+8], r9, r10, r11, r12, r13, r14
     // [r9:r12, r8] <- z = a0 x b11 + a1 x b01 + z 
     mov    rdx, [rcx+40]
     MULADD64x256 [rsi], r9, r10, r11, r12, r8, r13, r14, r8
     mov    rdx, [rcx+8]    
     MULADD64x256 [rsi+32], r9, r10, r11, r12, r8, r13, r14, rax
     // [r10:r12, r8] <- z = (z0 x p_plus_1 + z)/2^64
     mov    rdx, r9                 // rdx <- z0 
     MULADD64x192 [rip+p_plus_1+8], r10, r11, r12, r8, r13, r14
     // [r10:r12, r8:r9] <- z = a0 x b12 + a1 x b02 + z 
     mov    rdx, [rcx+48]
     MULADD64x256 [rsi], r10, r11, r12, r8, r9, r13, r14, r9
     mov    rdx, [rcx+16]    
     MULADD64x256 [rsi+32], r10, r11, r12, r8, r9, r13, r14, rax
     // [r11:r12, r8:r9] <- z = (z0 x p_plus_1 + z)/2^64
     mov    rdx, r10                // rdx <- z0 
     MULADD64x192 [rip+p_plus_1+8], r11, r12, r8, r9, r13, r14
     // [r11:r12, r8:r10] <- z = a0 x b13 + a1 x b03 + z 
     mov    rdx, [rcx+56]
     MULADD64x256 [rsi], r11, r12, r8, r9, r10, r13, r14, r10
     mov    rdx, [rcx+24]    
     MULADD64x256 [rsi+32], r11, r12, r8, r9, r10, r13, r14, rax
     // [r12, r8:r10] <- z = (z0 x p_plus_1 + z)/2^64
     mov    rdx, r11                // rdx <- z0 
     MULADD64x192 [rip+p_plus_1+8], r12, r8, r9, r10, r13, r14
 	// Final correction: subtract p, then add it back (masked) if that borrowed
 	// rax is 0 here (zeroed by the last MULADD64x192)
 	mov    rsi, [rip+p]
 	mov    rcx, [rip+p+8]
 	mov    rdx, [rip+p+16]
 	mov    r11, [rip+p+24]
 	sub    r12, rsi
 	sbb    r8, rcx
 	sbb    r9, rdx
 	sbb    r10, r11
 	sbb    rax, 0
 	and    rsi, rax
 	and    rcx, rax
 	and    rdx, rax
 	and    r11, rax
 	add    r12, rsi
 	adc    r8, rcx
 	adc    r9, rdx
 	adc    r10, r11
     mov    [rdi], r12          
     mov    [rdi+8], r8         
     mov    [rdi+16], r9         
     mov    [rdi+24], r10 
     pop    r14
     pop    r13
     pop    r12
     ret
 ///////////////////////////////////////////////////////////////// MACRO
 // z = a x b (mod p), interleaved Montgomery multiplication (result not yet fully reduced;
 // callers apply the final conditional subtraction of p)
 // Inputs: base memory pointers M0 and M1 (the two 4-word operands);
 //         words 1..3 of M0 are loaded into rdx one at a time as multipliers of M1,
 //         accumulator z in [Z0:Z4], pre-loaded by the caller with (word 0 of M0) x M1
 // Output: 4-word result in [Z4, Z0:Z2] (lowest word in Z4); rax = 0, rdx clobbered
 // Temps:  regs T0:T1
 /////////////////////////////////////////////////////////////////
 .macro FPMUL256x256 M0, M1, Z0, Z1, Z2, Z3, Z4, T0, T1           
     // [Z1:Z4] <- z = (z0 x p_plus_1 + z)/2^64
     // The low word of p+1 is 0, so only its upper three words are multiplied and z0 drops out
     mov    rdx, \Z0                 // rdx <- z0
     MULADD64x192 [rip+p_plus_1+8], \Z1, \Z2, \Z3, \Z4, \T0, \T1
     // [Z1:Z4, Z0] <- z = M0[1] x M1 + z 
     mov    rdx, 8\M0
     MULADD64x256 \M1, \Z1, \Z2, \Z3, \Z4, \Z0, \T0, \T1, \Z0
     // [Z2:Z4, Z0] <- z = (z0 x p_plus_1 + z)/2^64
     mov    rdx, \Z1                 // rdx <- z0
     MULADD64x192 [rip+p_plus_1+8], \Z2, \Z3, \Z4, \Z0, \T0, \T1
     // [Z2:Z4, Z0:Z1] <- z = M0[2] x M1 + z  
     mov    rdx, 16\M0
     MULADD64x256 \M1, \Z2, \Z3, \Z4, \Z0, \Z1, \T0, \T1, \Z1
     // [Z3:Z4, Z0:Z1] <- z = (z0 x p_plus_1 + z)/2^64
     mov    rdx, \Z2                // rdx <- z0
     MULADD64x192 [rip+p_plus_1+8], \Z3, \Z4, \Z0, \Z1, \T0, \T1
     // [Z3:Z4, Z0:Z2] <- z = M0[3] x M1 + z
     mov    rdx, 24\M0
     MULADD64x256 \M1, \Z3, \Z4, \Z0, \Z1, \Z2, \T0, \T1, \Z2
     // [Z4, Z0:Z2] <- z = (z0 x p_plus_1 + z)/2^64
     mov    rdx, \Z3                // rdx <- z0
     MULADD64x192 [rip+p_plus_1+8], \Z4, \Z0, \Z1, \Z2, \T0, \T1
 .endm
 //***********************************************************************
 //  Squaring in GF(p^2), non-complex part
 //  Operation: c [rdi] = (a0+a1) x (a0-a1)   (Montgomery-reduced)
 //  Inputs: a = [a1, a0] stored in [rsi]     (a0 at [rsi], a1 at [rsi+32])
 //  Output: c stored in [rdi]
 //  Note: [rdi]..[rdi+63] (64 bytes) is used as scratch for a0+a1 and a0-a1+p,
 //        and is written before a has been fully read, so the output buffer
 //        must be 64 bytes long and must not overlap the input.
 //***********************************************************************
 .global fp2_sq_c0
 fp2_sq_c0:   
     push   r12 
     push   r13
 	// a0 + a1 (no modular reduction); word 0 stays in rdx as the first multiplier
 	mov    rdx, [rsi]
 	mov    r9, [rsi+8]
 	mov    r10, [rsi+16]
 	mov    r11, [rsi+24]
 	add    rdx, [rsi+32]
 	adc    r9, [rsi+40]
 	adc    r10, [rsi+48]
 	adc    r11, [rsi+56]
 	mov    [rdi], rdx
 	mov    [rdi+8], r9
 	mov    [rdi+16], r10
 	mov    [rdi+24], r11
 	// a0 - a1 + p
 	mov    r8, [rsi]
 	mov    r10, [rsi+8]
 	mov    r12, [rsi+16]
 	mov    r13, [rsi+24]
 	sub    r8, [rsi+32]
 	sbb    r10, [rsi+40]
 	sbb    r12, [rsi+48] 
 	sbb    r13, [rsi+56]
 	add    r8, [rip+p]                    
 	adc    r10, [rip+p+8]
 	adc    r12, [rip+p+16]
 	adc    r13, [rip+p+24]
 	mov    [rdi+32], r8               
 	mov    [rdi+40], r10 
 	mov    [rdi+48], r12 
 	mov    [rdi+56], r13 
     // [r8:r12] <- z = (a0+a1)[0] x (a0-a1+p), whose words are still in r8, r10, r12, r13
     mulx   r9, r8, r8   
     xor    rax, rax
     mulx   r10, r11, r10  
     adox   r9, r11        
     mulx   r11, r12, r12  
     adox   r10, r12        
     mulx   r12, r13, r13  
     adox   r11, r13
     adox   r12, rax 
     FPMUL256x256 [rdi], [rdi+32], r8, r9, r10, r11, r12, r13, rcx
 	// Final correction: subtract p, then add it back (masked) if that borrowed
 	// rax is 0 here (zeroed inside FPMUL256x256)
 	mov    rsi, [rip+p]
 	mov    rcx, [rip+p+8]
 	mov    rdx, [rip+p+16]
 	mov    r11, [rip+p+24]
 	sub    r12, rsi
 	sbb    r8, rcx
 	sbb    r9, rdx
 	sbb    r10, r11
 	sbb    rax, 0
 	and    rsi, rax
 	and    rcx, rax
 	and    rdx, rax
 	and    r11, rax
 	add    r12, rsi
 	adc    r8, rcx
 	adc    r9, rdx
 	adc    r10, r11
     mov    [rdi], r12          
     mov    [rdi+8], r8         
     mov    [rdi+16], r9         
     mov    [rdi+24], r10
     pop    r13
     pop    r12
     ret
 //***********************************************************************
 //  Squaring in GF(p^2), complex part
 //  Operation: c [rdi] = 2a0 x a1   (Montgomery-reduced)
 //  Inputs: a = [a1, a0] stored in [rsi]     (a0 at [rsi], a1 at [rsi+32])
 //  Output: c stored in [rdi]
 //  [rdi] is written only after all input words have been read.
 //***********************************************************************
 .global fp2_sq_c1
 fp2_sq_c1:  
     push   r12
     push   r13 
 	// [rdx, r9:r11] <- 2*a0 (no modular reduction)
 	mov    rdx, [rsi]
 	mov    r9, [rsi+8]
 	mov    r10, [rsi+16]
 	mov    r11, [rsi+24]
 	add    rdx, rdx
 	adc    r9, r9
 	adc    r10, r10
 	adc    r11, r11
 	// Spill words 1..3 of 2*a0 to a 32-byte stack buffer; word 0 is used straight
 	// from rdx, so [rsp] itself is never written or read
 	sub    rsp, 32
 	mov    [rsp+8], r9
 	mov    [rsp+16], r10 
 	mov    [rsp+24], r11   
     // [r8:r12] <- z = (2*a0)[0] x a1
     mulx   r9, r8, [rsi+32]
     xor    rax, rax 
     mulx   r10, r11, [rsi+40]
     adox   r9, r11        
     mulx   r11, r12, [rsi+48]
     adox   r10, r12        
     mulx   r12, r13, [rsi+56]
     adox   r11, r13  
     adox   r12, rax 
 	FPMUL256x256 [rsp], [rsi+32], r8, r9, r10, r11, r12, r13, rcx
 	add    rsp, 32
 	// Final correction: subtract p, then add it back (masked) if that borrowed
 	// rax is 0 here (zeroed inside FPMUL256x256)
 	mov    rsi, [rip+p]
 	mov    rcx, [rip+p+8]
 	mov    rdx, [rip+p+16]
 	mov    r11, [rip+p+24]
 	sub    r12, rsi
 	sbb    r8, rcx
 	sbb    r9, rdx
 	sbb    r10, r11
 	sbb    rax, 0
 	and    rsi, rax
 	and    rcx, rax
 	and    rdx, rax
 	and    r11, rax
 	add    r12, rsi
 	adc    r8, rcx
 	adc    r9, rdx
 	adc    r10, r11
     mov    [rdi], r12          
     mov    [rdi+8], r8         
     mov    [rdi+16], r9         
     mov    [rdi+24], r10 
     pop    r13
     pop    r12
     ret
 //***********************************************************************
 //  Field multiplication in GF(p)
 //  Operation: c = a x b mod p   (Montgomery multiplication: includes a division by 2^256)
 //  Inputs: a stored in [rsi], b stored in [rdx] 
 //  Output: c stored in [rdi]
 //  [rdi] is written only after all input words have been read, so c may alias a or b.
 //***********************************************************************
 .global fp_mul
 fp_mul: 
     push   r12
     push   r13 
     push   r14 
     mov    rcx, rdx                // rcx <- b; rdx is needed as the implicit mulx operand
     // [r8:r12] <- z = a x b0
     mov    rdx, [rcx]
     mulx   r9, r8, [rsi]
     xor    rax, rax 
     mulx   r10, r11, [rsi+8]
     adox   r9, r11        
     mulx   r11, r12, [rsi+16]
     adox   r10, r12        
     mulx   r12, r13, [rsi+24] 
     adox   r11, r13
     adox   r12, rax 
 	FPMUL256x256 [rcx], [rsi], r8, r9, r10, r11, r12, r13, r14
 	// Final correction: subtract p, then add it back (masked) if that borrowed
 	// rax is 0 here (zeroed inside FPMUL256x256)
 	mov    rsi, [rip+p]
 	mov    rcx, [rip+p+8]
 	mov    rdx, [rip+p+16]
 	mov    r11, [rip+p+24]
 	sub    r12, rsi
 	sbb    r8, rcx
 	sbb    r9, rdx
 	sbb    r10, r11
 	sbb    rax, 0
 	and    rsi, rax
 	and    rcx, rax
 	and    rdx, rax
 	and    r11, rax
 	add    r12, rsi
 	adc    r8, rcx
 	adc    r9, rdx
 	adc    r10, r11
     mov    [rdi], r12          
     mov    [rdi+8], r8         
     mov    [rdi+16], r9         
     mov    [rdi+24], r10  
     pop    r14
     pop    r13
     pop    r12
     ret
 // fp_sqr: [rdi] <- [rsi] x [rsi] mod p (Montgomery)
 // Implemented as a tail call to fp_mul with the second operand set to the first.
 .global fp_sqr
 fp_sqr:
     mov rdx, rsi
     jmp fp_mul
--- a/src/gf/broadwell/lvl1/include/fp.h
+++ b/src/gf/broadwell/lvl1/include/fp.h
@@ -0,0 +1,76 @@
 #ifndef FP_H
 #define FP_H
 //////////////////////////////////////////////// NOTE: this is placed here for now
 #include <stdint.h>
 #include <stdlib.h>
 #include <stdbool.h>
 #include <stddef.h>
 #include <string.h>
 #include <tutil.h>
 #include <fp_constants.h>
 // A field element is an array of NWORDS_FIELD digits. digit_t and
 // NWORDS_FIELD are not defined in this header; they must come from the
 // headers included above. The functions below take digit_t*, which is what
 // an fp_t decays to when passed as an argument.
 typedef digit_t fp_t[NWORDS_FIELD];  // Datatype for representing field elements
 // --- Assignment and comparison ---
 // NOTE(review): semantics are inferred from the names only; the definitions
 // are outside this header.
 void fp_set(digit_t* x, const digit_t val);
 bool fp_is_equal(const digit_t* a, const digit_t* b);
 bool fp_is_zero(const digit_t* a);
 void fp_copy(digit_t* out, const digit_t* a);
 // --- Multiprecision shifts of an nwords-digit array by `shift` bits ---
 // NOTE(review): mp_shiftr returns a digit_t, presumably the bits shifted
 // out - confirm against the definition.
 digit_t mp_shiftr(digit_t* x, const unsigned int shift, const unsigned int nwords);
 void mp_shiftl(digit_t* x, const unsigned int shift, const unsigned int nwords);
 // --- Field arithmetic; results are written through `out` ---
 void fp_add(digit_t* out, const digit_t* a, const digit_t* b);
 void fp_sub(digit_t* out, const digit_t* a, const digit_t* b);
 void fp_neg(digit_t* out, const digit_t* a);
 void fp_sqr(digit_t* out, const digit_t* a);
 void fp_mul(digit_t* out, const digit_t* a, const digit_t* b);
 // Takes two single digits (not field elements) and writes through `out`.
 // NOTE(review): presumably a full digit x digit product stored as two
 // digits - confirm the size of `out` against the definition.
 void MUL(digit_t* out, const digit_t a, const digit_t b);
 // fp_inv and fp_sqrt take a single non-const pointer.
 // NOTE(review): presumably they overwrite their argument in place - confirm.
 void fp_inv(digit_t* x);
 bool fp_is_square(const digit_t* a);
 void fp_sqrt(digit_t* a);
 // --- Conversion helpers ---
 // NOTE(review): names suggest conversion to/from Montgomery representation
 // and setting `out` to the Montgomery form of one - confirm.
 void fp_tomont(digit_t* out, const digit_t* a);
 void fp_frommont(digit_t* out, const digit_t* a);
 void fp_mont_setone(digit_t* out);
 /********************** Constant-time unsigned comparisons ***********************/
 // The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise
 static inline unsigned int is_digit_nonzero_ct(digit_t x)
 {
    // Is x != 0?  Branch-free: x OR-ed with its negation is moved down so
    // that only bit (RADIX - 1) remains (RADIX is assumed to be the bit
    // width of digit_t).
    const digit_t x_or_neg = x | (0 - x);
    return (unsigned int)(x_or_neg >> (RADIX - 1));
 }
 static inline unsigned int is_digit_zero_ct(digit_t x)
 {
    // Is x = 0?  Flip the low bit of the "nonzero" answer.
    const unsigned int nonzero = is_digit_nonzero_ct(x);
    return (unsigned int)(1 ^ nonzero);
 }
 static inline unsigned int is_digit_lessthan_ct(digit_t x, digit_t y)
 {
    // Is x < y?  Branch-free: the answer is bit (RADIX - 1) of the mixed
    // value below, moved down to bit 0.
    const digit_t bits_differ = x ^ y;
    const digit_t wrapped_diff = x - y;
    const digit_t mixed = x ^ (bits_differ | (wrapped_diff ^ y));
    return (unsigned int)(mixed >> (RADIX - 1));
 }
 /********************** Platform-independent macros for digit-size operations **********************/
 // Digit addition with carry
 //   tempReg  = addend1 + carryIn            (held in a digit_t)
 //   sumOut   = addend2 + tempReg
 //   carryOut = 1 if either partial addition wrapped around, else 0
 // A wrap is detected with is_digit_lessthan_ct (result smaller than one of
 // its operands), so the expansion contains no branch.
 // carryIn and sumOut are expanded more than once: pass side-effect-free
 // expressions. tempReg is computed before sumOut is written, and carryIn is
 // last read before carryOut is written, so the statement order matters when
 // outputs alias inputs.
 // The expansion is a bare { ... } block, not a do { } while (0) statement.
 #define ADDC(sumOut, carryOut, addend1, addend2, carryIn)                                         \
    { digit_t tempReg = (addend1) + (digit_t)(carryIn);                                           \
    (sumOut) = (addend2) + tempReg;                                                               \
    (carryOut) = (is_digit_lessthan_ct(tempReg, (digit_t)(carryIn)) | is_digit_lessthan_ct((sumOut), tempReg)); }
 // Digit subtraction with borrow
 //   tempReg       = minuend - subtrahend    (held in a digit_t)
 //   differenceOut = tempReg - borrowIn
 //   borrowOut     = (minuend < subtrahend) | (borrowIn & (tempReg == 0))
 // Both tests use the constant-time helpers, so the expansion contains no
 // branch. borrowReg is computed before differenceOut is written, so the
 // inputs are read before any output is stored.
 // NOTE(review): the borrow formula presumably expects borrowIn to be 0 or 1
 // - confirm against callers.
 // minuend, subtrahend and borrowIn are expanded more than once: pass
 // side-effect-free expressions.
 // The expansion is a bare { ... } block, not a do { } while (0) statement.
 #define SUBC(differenceOut, borrowOut, minuend, subtrahend, borrowIn)                             \
    { digit_t tempReg = (minuend) - (subtrahend);                                                 \
    unsigned int borrowReg = (is_digit_lessthan_ct((minuend), (subtrahend)) | ((borrowIn) & is_digit_zero_ct(tempReg)));  \
    (differenceOut) = tempReg - (digit_t)(borrowIn);                                              \
    (borrowOut) = borrowReg; }
 // Shift right with flexible datatype
 //   shiftOut = (lowIn >> shift) ^ (highIn << (DigitSize - shift))
 // i.e. lowIn shifted right, with the low `shift` bits of highIn moved in at
 // the top. DigitSize is the operand width in bits.
 // Use with 0 < shift < DigitSize: with shift == 0 the left shift count
 // equals DigitSize, which is undefined in C when DigitSize is the operand
 // width.
 // DigitSize is parenthesised so that an expression argument (for example a
 // conditional) is subtracted from as a whole.
 // The expansion already ends with ';'.
 #define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize)                                         \
    (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << ((DigitSize) - (shift)));
 // Digit shift left
 //   shiftOut = (highIn << shift) ^ (lowIn >> (DigitSize - shift))
 // i.e. highIn shifted left, with the top `shift` bits of lowIn moved in at
 // the bottom. DigitSize is the operand width in bits; it is now honoured
 // (the right-shift count was previously hard-wired to RADIX, so the
 // parameter was ignored), which matches SHIFTR.
 // Use with 0 < shift < DigitSize: with shift == 0 the right shift count
 // equals DigitSize, which is undefined in C when DigitSize is the operand
 // width.
 // The expansion already ends with ';'.
 #define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize)                                         \
    (shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> ((DigitSize) - (shift)));
 #endif
--- a/src/gf/broadwell/lvl1/include/fp2.h
+++ b/src/gf/broadwell/lvl1/include/fp2.h
@@ -0,0 +1,29 @@
 #ifndef FP2_H
 #define FP2_H
 #include "fp.h"
 // Element of GF(p^2), stored as two GF(p) coordinates
 typedef struct fp2_t {
    fp_t re;  // first coordinate (presumably the real part)
    fp_t im;  // second coordinate (presumably the imaginary part)
 } fp2_t;
 // GF(p^2) interface. The definitions are outside this header.
 // NOTE(review): the per-function notes below are inferred from names and
 // signatures only - confirm against the implementation.
 // --- Assignment and comparison ---
 void fp2_set(fp2_t* x, const digit_t val);
 bool fp2_is_zero(const fp2_t* a);
 bool fp2_is_equal(const fp2_t* a, const fp2_t* b);
 void fp2_copy(fp2_t* x, const fp2_t* y);
 // Returns an fp2_t by value. Declared with (void) so that the declaration is
 // a real prototype: an empty parameter list leaves the parameters
 // unspecified before C23 and lets calls with stray arguments compile.
 fp2_t fp2_non_residue(void);
 // --- Arithmetic; the first argument is the only non-const pointer ---
 void fp2_add(fp2_t* x, const fp2_t* y, const fp2_t* z);
 void fp2_sub(fp2_t* x, const fp2_t* y, const fp2_t* z);
 void fp2_neg(fp2_t* x, const fp2_t* y);
 void fp2_mul(fp2_t* x, const fp2_t* y, const fp2_t* z);
 void fp2_sqr(fp2_t* x, const fp2_t* y);
 // fp2_inv and fp2_sqrt take a single non-const pointer (presumably in place).
 void fp2_inv(fp2_t* x);
 bool fp2_is_square(const fp2_t* x);
 void fp2_frob(fp2_t* x, const fp2_t* y);
 void fp2_sqrt(fp2_t* x);
 // --- Conversion helpers ---
 void fp2_tomont(fp2_t* x, const fp2_t* y);
 void fp2_frommont(fp2_t* x, const fp2_t* y);
 // Takes non-const pointers; left as declared so it keeps matching the
 // definition elsewhere.
 int fp2_cmp(fp2_t* x, fp2_t* y);
 #endif
--- a/src/gf/broadwell/lvl1/test/CMakeLists.txt
+++ b/src/gf/broadwell/lvl1/test/CMakeLists.txt
@@ -0,0 +1,9 @@
 # Unit tests for the GF(p) and GF(p^2) arithmetic of the selected variant.
 # Each test binary is invoked as: <exe> test ${SQISIGN_TEST_REPS}
 # SVARIANT_LOWER / SVARIANT_UPPER, LIB_GF_*, INC_* and SQISIGN_TEST_REPS are
 # expected to be set by the including scope.
 #
 # target_link_libraries uses the PRIVATE keyword signature (nothing consumes
 # these executables), and add_test uses the NAME/COMMAND signature so that
 # the target name is resolved to the built executable's actual location.

 # GF(p) tests
 add_executable(sqisign_test_gf_${SVARIANT_LOWER}_fp test_fp.c test_extras.c)
 target_link_libraries(sqisign_test_gf_${SVARIANT_LOWER}_fp PRIVATE ${LIB_GF_${SVARIANT_UPPER}})
 target_include_directories(sqisign_test_gf_${SVARIANT_LOWER}_fp PRIVATE ../include ${INC_COMMON} ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_PUBLIC})
 add_test(NAME sqisign_test_gf_${SVARIANT_LOWER}_fp COMMAND sqisign_test_gf_${SVARIANT_LOWER}_fp test ${SQISIGN_TEST_REPS})

 # GF(p^2) tests
 add_executable(sqisign_test_gf_${SVARIANT_LOWER}_fp2 test_fp2.c test_extras.c)
 target_link_libraries(sqisign_test_gf_${SVARIANT_LOWER}_fp2 PRIVATE ${LIB_GF_${SVARIANT_UPPER}})
 target_include_directories(sqisign_test_gf_${SVARIANT_LOWER}_fp2 PRIVATE ../include ${INC_COMMON} ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_PUBLIC})
 add_test(NAME sqisign_test_gf_${SVARIANT_LOWER}_fp2 COMMAND sqisign_test_gf_${SVARIANT_LOWER}_fp2 test ${SQISIGN_TEST_REPS})
--- a/Show More
+++ b/Show More
@@ -0,0 +1,3 @@
 set(ECX_DIR ${CMAKE_CURRENT_SOURCE_DIR}/ecx)
 
 include(${SELECT_SQISIGN_VARIANT})