From 9f0409b6b4f72e39dfdffd3ca1784b1aa08f5855 Mon Sep 17 00:00:00 2001
From: StarsAC <starsac@starsac.cn>
Date: Mon, 20 Oct 2025 01:07:36 +0000
Subject: [PATCH] add pqm4

---
 .gitignore                                    |    2 +
 src/pqm4/sqisign_lvl1/ref/api.h               |   31 +
 src/pqm4/sqisign_lvl1/ref/basis.c             |  416 +++++
 src/pqm4/sqisign_lvl1/ref/common.c            |   88 +
 src/pqm4/sqisign_lvl1/ref/config.mk           |    2 +
 src/pqm4/sqisign_lvl1/ref/e0_basis.c          |   55 +
 src/pqm4/sqisign_lvl1/ref/e0_basis.h          |    3 +
 src/pqm4/sqisign_lvl1/ref/ec.c                |  665 ++++++++
 src/pqm4/sqisign_lvl1/ref/ec.h                |  668 ++++++++
 src/pqm4/sqisign_lvl1/ref/ec_jac.c            |  335 ++++
 src/pqm4/sqisign_lvl1/ref/ec_params.c         |    4 +
 src/pqm4/sqisign_lvl1/ref/ec_params.h         |   12 +
 .../sqisign_lvl1/ref/encode_verification.c    |  220 +++
 src/pqm4/sqisign_lvl1/ref/encoded_sizes.h     |   11 +
 src/pqm4/sqisign_lvl1/ref/fp.c                |   15 +
 src/pqm4/sqisign_lvl1/ref/fp.h                |   48 +
 src/pqm4/sqisign_lvl1/ref/fp2.c               |  328 ++++
 src/pqm4/sqisign_lvl1/ref/fp2.h               |   41 +
 src/pqm4/sqisign_lvl1/ref/fp_constants.h      |   17 +
 src/pqm4/sqisign_lvl1/ref/fp_p5248_32.c       |  942 ++++++++++
 src/pqm4/sqisign_lvl1/ref/hd.c                |   93 +
 src/pqm4/sqisign_lvl1/ref/hd.h                |  435 +++++
 .../ref/hd_splitting_transforms.c             |  143 ++
 .../ref/hd_splitting_transforms.h             |   18 +
 src/pqm4/sqisign_lvl1/ref/isog.h              |   28 +
 src/pqm4/sqisign_lvl1/ref/isog_chains.c       |  241 +++
 src/pqm4/sqisign_lvl1/ref/mp.c                |  357 ++++
 src/pqm4/sqisign_lvl1/ref/mp.h                |   88 +
 src/pqm4/sqisign_lvl1/ref/pqm4_api.c          |   60 +
 src/pqm4/sqisign_lvl1/ref/rng.h               |    8 +
 src/pqm4/sqisign_lvl1/ref/sig.h               |   85 +
 src/pqm4/sqisign_lvl1/ref/sqisign.c           |  106 ++
 src/pqm4/sqisign_lvl1/ref/sqisign_namespace.h | 1022 +++++++++++
 src/pqm4/sqisign_lvl1/ref/theta_isogenies.c   | 1283 ++++++++++++++
 src/pqm4/sqisign_lvl1/ref/theta_isogenies.h   |   18 +
 src/pqm4/sqisign_lvl1/ref/theta_structure.c   |   78 +
 src/pqm4/sqisign_lvl1/ref/theta_structure.h   |  135 ++
 src/pqm4/sqisign_lvl1/ref/tools.h             |   49 +
 src/pqm4/sqisign_lvl1/ref/tutil.h             |   36 +
 src/pqm4/sqisign_lvl1/ref/verification.h      |  123 ++
 src/pqm4/sqisign_lvl1/ref/verify.c            |  309 ++++
 src/pqm4/sqisign_lvl1/ref/xeval.c             |   64 +
 src/pqm4/sqisign_lvl1/ref/xisog.c             |   61 +
 src/pqm4/sqisign_lvl3/ref/api.h               |   31 +
 src/pqm4/sqisign_lvl3/ref/basis.c             |  416 +++++
 src/pqm4/sqisign_lvl3/ref/common.c            |   88 +
 src/pqm4/sqisign_lvl3/ref/config.mk           |    2 +
 src/pqm4/sqisign_lvl3/ref/e0_basis.c          |   55 +
 src/pqm4/sqisign_lvl3/ref/e0_basis.h          |    3 +
 src/pqm4/sqisign_lvl3/ref/ec.c                |  665 ++++++++
 src/pqm4/sqisign_lvl3/ref/ec.h                |  668 ++++++++
 src/pqm4/sqisign_lvl3/ref/ec_jac.c            |  335 ++++
 src/pqm4/sqisign_lvl3/ref/ec_params.c         |    4 +
 src/pqm4/sqisign_lvl3/ref/ec_params.h         |   12 +
 .../sqisign_lvl3/ref/encode_verification.c    |  220 +++
 src/pqm4/sqisign_lvl3/ref/encoded_sizes.h     |   11 +
 src/pqm4/sqisign_lvl3/ref/fp.c                |   15 +
 src/pqm4/sqisign_lvl3/ref/fp.h                |   48 +
 src/pqm4/sqisign_lvl3/ref/fp2.c               |  328 ++++
 src/pqm4/sqisign_lvl3/ref/fp2.h               |   41 +
 src/pqm4/sqisign_lvl3/ref/fp_constants.h      |   17 +
 src/pqm4/sqisign_lvl3/ref/fp_p65376_32.c      | 1231 ++++++++++++++
 src/pqm4/sqisign_lvl3/ref/hd.c                |   93 +
 src/pqm4/sqisign_lvl3/ref/hd.h                |  435 +++++
 .../ref/hd_splitting_transforms.c             |  143 ++
 .../ref/hd_splitting_transforms.h             |   18 +
 src/pqm4/sqisign_lvl3/ref/isog.h              |   28 +
 src/pqm4/sqisign_lvl3/ref/isog_chains.c       |  241 +++
 src/pqm4/sqisign_lvl3/ref/mp.c                |  357 ++++
 src/pqm4/sqisign_lvl3/ref/mp.h                |   88 +
 src/pqm4/sqisign_lvl3/ref/pqm4_api.c          |   60 +
 src/pqm4/sqisign_lvl3/ref/rng.h               |    8 +
 src/pqm4/sqisign_lvl3/ref/sig.h               |   85 +
 src/pqm4/sqisign_lvl3/ref/sqisign.c           |  106 ++
 src/pqm4/sqisign_lvl3/ref/sqisign_namespace.h | 1022 +++++++++++
 src/pqm4/sqisign_lvl3/ref/theta_isogenies.c   | 1283 ++++++++++++++
 src/pqm4/sqisign_lvl3/ref/theta_isogenies.h   |   18 +
 src/pqm4/sqisign_lvl3/ref/theta_structure.c   |   78 +
 src/pqm4/sqisign_lvl3/ref/theta_structure.h   |  135 ++
 src/pqm4/sqisign_lvl3/ref/tools.h             |   49 +
 src/pqm4/sqisign_lvl3/ref/tutil.h             |   36 +
 src/pqm4/sqisign_lvl3/ref/verification.h      |  123 ++
 src/pqm4/sqisign_lvl3/ref/verify.c            |  309 ++++
 src/pqm4/sqisign_lvl3/ref/xeval.c             |   64 +
 src/pqm4/sqisign_lvl3/ref/xisog.c             |   61 +
 src/pqm4/sqisign_lvl5/ref/api.h               |   31 +
 src/pqm4/sqisign_lvl5/ref/basis.c             |  416 +++++
 src/pqm4/sqisign_lvl5/ref/common.c            |   88 +
 src/pqm4/sqisign_lvl5/ref/config.mk           |    2 +
 src/pqm4/sqisign_lvl5/ref/e0_basis.c          |   55 +
 src/pqm4/sqisign_lvl5/ref/e0_basis.h          |    3 +
 src/pqm4/sqisign_lvl5/ref/ec.c                |  665 ++++++++
 src/pqm4/sqisign_lvl5/ref/ec.h                |  668 ++++++++
 src/pqm4/sqisign_lvl5/ref/ec_jac.c            |  335 ++++
 src/pqm4/sqisign_lvl5/ref/ec_params.c         |    4 +
 src/pqm4/sqisign_lvl5/ref/ec_params.h         |   12 +
 .../sqisign_lvl5/ref/encode_verification.c    |  220 +++
 src/pqm4/sqisign_lvl5/ref/encoded_sizes.h     |   11 +
 src/pqm4/sqisign_lvl5/ref/fp.c                |   15 +
 src/pqm4/sqisign_lvl5/ref/fp.h                |   48 +
 src/pqm4/sqisign_lvl5/ref/fp2.c               |  328 ++++
 src/pqm4/sqisign_lvl5/ref/fp2.h               |   41 +
 src/pqm4/sqisign_lvl5/ref/fp_constants.h      |   17 +
 src/pqm4/sqisign_lvl5/ref/fp_p27500_32.c      | 1514 +++++++++++++++++
 src/pqm4/sqisign_lvl5/ref/hd.c                |   93 +
 src/pqm4/sqisign_lvl5/ref/hd.h                |  435 +++++
 .../ref/hd_splitting_transforms.c             |  143 ++
 .../ref/hd_splitting_transforms.h             |   18 +
 src/pqm4/sqisign_lvl5/ref/isog.h              |   28 +
 src/pqm4/sqisign_lvl5/ref/isog_chains.c       |  241 +++
 src/pqm4/sqisign_lvl5/ref/mp.c                |  357 ++++
 src/pqm4/sqisign_lvl5/ref/mp.h                |   88 +
 src/pqm4/sqisign_lvl5/ref/pqm4_api.c          |   60 +
 src/pqm4/sqisign_lvl5/ref/rng.h               |    8 +
 src/pqm4/sqisign_lvl5/ref/sig.h               |   85 +
 src/pqm4/sqisign_lvl5/ref/sqisign.c           |  106 ++
 src/pqm4/sqisign_lvl5/ref/sqisign_namespace.h | 1022 +++++++++++
 src/pqm4/sqisign_lvl5/ref/theta_isogenies.c   | 1283 ++++++++++++++
 src/pqm4/sqisign_lvl5/ref/theta_isogenies.h   |   18 +
 src/pqm4/sqisign_lvl5/ref/theta_structure.c   |   78 +
 src/pqm4/sqisign_lvl5/ref/theta_structure.h   |  135 ++
 src/pqm4/sqisign_lvl5/ref/tools.h             |   49 +
 src/pqm4/sqisign_lvl5/ref/tutil.h             |   36 +
 src/pqm4/sqisign_lvl5/ref/verification.h      |  123 ++
 src/pqm4/sqisign_lvl5/ref/verify.c            |  309 ++++
 src/pqm4/sqisign_lvl5/ref/xeval.c             |   64 +
 src/pqm4/sqisign_lvl5/ref/xisog.c             |   61 +
 127 files changed, 27086 insertions(+)
 create mode 100644 src/pqm4/sqisign_lvl1/ref/api.h
 create mode 100644 src/pqm4/sqisign_lvl1/ref/basis.c
 create mode 100644 src/pqm4/sqisign_lvl1/ref/common.c
 create mode 100644 src/pqm4/sqisign_lvl1/ref/config.mk
 create mode 100644 src/pqm4/sqisign_lvl1/ref/e0_basis.c
 create mode 100644 src/pqm4/sqisign_lvl1/ref/e0_basis.h
 create mode 100644 src/pqm4/sqisign_lvl1/ref/ec.c
 create mode 100644 src/pqm4/sqisign_lvl1/ref/ec.h
 create mode 100644 src/pqm4/sqisign_lvl1/ref/ec_jac.c
 create mode 100644 src/pqm4/sqisign_lvl1/ref/ec_params.c
 create mode 100644 src/pqm4/sqisign_lvl1/ref/ec_params.h
 create mode 100644 src/pqm4/sqisign_lvl1/ref/encode_verification.c
 create mode 100644 src/pqm4/sqisign_lvl1/ref/encoded_sizes.h
 create mode 100644 src/pqm4/sqisign_lvl1/ref/fp.c
 create mode 100644 src/pqm4/sqisign_lvl1/ref/fp.h
 create mode 100644 src/pqm4/sqisign_lvl1/ref/fp2.c
 create mode 100644 src/pqm4/sqisign_lvl1/ref/fp2.h
 create mode 100644 src/pqm4/sqisign_lvl1/ref/fp_constants.h
 create mode 100644 src/pqm4/sqisign_lvl1/ref/fp_p5248_32.c
 create mode 100644 src/pqm4/sqisign_lvl1/ref/hd.c
 create mode 100644 src/pqm4/sqisign_lvl1/ref/hd.h
 create mode 100644 src/pqm4/sqisign_lvl1/ref/hd_splitting_transforms.c
 create mode 100644 src/pqm4/sqisign_lvl1/ref/hd_splitting_transforms.h
 create mode 100644 src/pqm4/sqisign_lvl1/ref/isog.h
 create mode 100644 src/pqm4/sqisign_lvl1/ref/isog_chains.c
 create mode 100644 src/pqm4/sqisign_lvl1/ref/mp.c
 create mode 100644 src/pqm4/sqisign_lvl1/ref/mp.h
 create mode 100644 src/pqm4/sqisign_lvl1/ref/pqm4_api.c
 create mode 100644 src/pqm4/sqisign_lvl1/ref/rng.h
 create mode 100644 src/pqm4/sqisign_lvl1/ref/sig.h
 create mode 100644 src/pqm4/sqisign_lvl1/ref/sqisign.c
 create mode 100644 src/pqm4/sqisign_lvl1/ref/sqisign_namespace.h
 create mode 100644 src/pqm4/sqisign_lvl1/ref/theta_isogenies.c
 create mode 100644 src/pqm4/sqisign_lvl1/ref/theta_isogenies.h
 create mode 100644 src/pqm4/sqisign_lvl1/ref/theta_structure.c
 create mode 100644 src/pqm4/sqisign_lvl1/ref/theta_structure.h
 create mode 100644 src/pqm4/sqisign_lvl1/ref/tools.h
 create mode 100644 src/pqm4/sqisign_lvl1/ref/tutil.h
 create mode 100644 src/pqm4/sqisign_lvl1/ref/verification.h
 create mode 100644 src/pqm4/sqisign_lvl1/ref/verify.c
 create mode 100644 src/pqm4/sqisign_lvl1/ref/xeval.c
 create mode 100644 src/pqm4/sqisign_lvl1/ref/xisog.c
 create mode 100644 src/pqm4/sqisign_lvl3/ref/api.h
 create mode 100644 src/pqm4/sqisign_lvl3/ref/basis.c
 create mode 100644 src/pqm4/sqisign_lvl3/ref/common.c
 create mode 100644 src/pqm4/sqisign_lvl3/ref/config.mk
 create mode 100644 src/pqm4/sqisign_lvl3/ref/e0_basis.c
 create mode 100644 src/pqm4/sqisign_lvl3/ref/e0_basis.h
 create mode 100644 src/pqm4/sqisign_lvl3/ref/ec.c
 create mode 100644 src/pqm4/sqisign_lvl3/ref/ec.h
 create mode 100644 src/pqm4/sqisign_lvl3/ref/ec_jac.c
 create mode 100644 src/pqm4/sqisign_lvl3/ref/ec_params.c
 create mode 100644 src/pqm4/sqisign_lvl3/ref/ec_params.h
 create mode 100644 src/pqm4/sqisign_lvl3/ref/encode_verification.c
 create mode 100644 src/pqm4/sqisign_lvl3/ref/encoded_sizes.h
 create mode 100644 src/pqm4/sqisign_lvl3/ref/fp.c
 create mode 100644 src/pqm4/sqisign_lvl3/ref/fp.h
 create mode 100644 src/pqm4/sqisign_lvl3/ref/fp2.c
 create mode 100644 src/pqm4/sqisign_lvl3/ref/fp2.h
 create mode 100644 src/pqm4/sqisign_lvl3/ref/fp_constants.h
 create mode 100644 src/pqm4/sqisign_lvl3/ref/fp_p65376_32.c
 create mode 100644 src/pqm4/sqisign_lvl3/ref/hd.c
 create mode 100644 src/pqm4/sqisign_lvl3/ref/hd.h
 create mode 100644 src/pqm4/sqisign_lvl3/ref/hd_splitting_transforms.c
 create mode 100644 src/pqm4/sqisign_lvl3/ref/hd_splitting_transforms.h
 create mode 100644 src/pqm4/sqisign_lvl3/ref/isog.h
 create mode 100644 src/pqm4/sqisign_lvl3/ref/isog_chains.c
 create mode 100644 src/pqm4/sqisign_lvl3/ref/mp.c
 create mode 100644 src/pqm4/sqisign_lvl3/ref/mp.h
 create mode 100644 src/pqm4/sqisign_lvl3/ref/pqm4_api.c
 create mode 100644 src/pqm4/sqisign_lvl3/ref/rng.h
 create mode 100644 src/pqm4/sqisign_lvl3/ref/sig.h
 create mode 100644 src/pqm4/sqisign_lvl3/ref/sqisign.c
 create mode 100644 src/pqm4/sqisign_lvl3/ref/sqisign_namespace.h
 create mode 100644 src/pqm4/sqisign_lvl3/ref/theta_isogenies.c
 create mode 100644 src/pqm4/sqisign_lvl3/ref/theta_isogenies.h
 create mode 100644 src/pqm4/sqisign_lvl3/ref/theta_structure.c
 create mode 100644 src/pqm4/sqisign_lvl3/ref/theta_structure.h
 create mode 100644 src/pqm4/sqisign_lvl3/ref/tools.h
 create mode 100644 src/pqm4/sqisign_lvl3/ref/tutil.h
 create mode 100644 src/pqm4/sqisign_lvl3/ref/verification.h
 create mode 100644 src/pqm4/sqisign_lvl3/ref/verify.c
 create mode 100644 src/pqm4/sqisign_lvl3/ref/xeval.c
 create mode 100644 src/pqm4/sqisign_lvl3/ref/xisog.c
 create mode 100644 src/pqm4/sqisign_lvl5/ref/api.h
 create mode 100644 src/pqm4/sqisign_lvl5/ref/basis.c
 create mode 100644 src/pqm4/sqisign_lvl5/ref/common.c
 create mode 100644 src/pqm4/sqisign_lvl5/ref/config.mk
 create mode 100644 src/pqm4/sqisign_lvl5/ref/e0_basis.c
 create mode 100644 src/pqm4/sqisign_lvl5/ref/e0_basis.h
 create mode 100644 src/pqm4/sqisign_lvl5/ref/ec.c
 create mode 100644 src/pqm4/sqisign_lvl5/ref/ec.h
 create mode 100644 src/pqm4/sqisign_lvl5/ref/ec_jac.c
 create mode 100644 src/pqm4/sqisign_lvl5/ref/ec_params.c
 create mode 100644 src/pqm4/sqisign_lvl5/ref/ec_params.h
 create mode 100644 src/pqm4/sqisign_lvl5/ref/encode_verification.c
 create mode 100644 src/pqm4/sqisign_lvl5/ref/encoded_sizes.h
 create mode 100644 src/pqm4/sqisign_lvl5/ref/fp.c
 create mode 100644 src/pqm4/sqisign_lvl5/ref/fp.h
 create mode 100644 src/pqm4/sqisign_lvl5/ref/fp2.c
 create mode 100644 src/pqm4/sqisign_lvl5/ref/fp2.h
 create mode 100644 src/pqm4/sqisign_lvl5/ref/fp_constants.h
 create mode 100644 src/pqm4/sqisign_lvl5/ref/fp_p27500_32.c
 create mode 100644 src/pqm4/sqisign_lvl5/ref/hd.c
 create mode 100644 src/pqm4/sqisign_lvl5/ref/hd.h
 create mode 100644 src/pqm4/sqisign_lvl5/ref/hd_splitting_transforms.c
 create mode 100644 src/pqm4/sqisign_lvl5/ref/hd_splitting_transforms.h
 create mode 100644 src/pqm4/sqisign_lvl5/ref/isog.h
 create mode 100644 src/pqm4/sqisign_lvl5/ref/isog_chains.c
 create mode 100644 src/pqm4/sqisign_lvl5/ref/mp.c
 create mode 100644 src/pqm4/sqisign_lvl5/ref/mp.h
 create mode 100644 src/pqm4/sqisign_lvl5/ref/pqm4_api.c
 create mode 100644 src/pqm4/sqisign_lvl5/ref/rng.h
 create mode 100644 src/pqm4/sqisign_lvl5/ref/sig.h
 create mode 100644 src/pqm4/sqisign_lvl5/ref/sqisign.c
 create mode 100644 src/pqm4/sqisign_lvl5/ref/sqisign_namespace.h
 create mode 100644 src/pqm4/sqisign_lvl5/ref/theta_isogenies.c
 create mode 100644 src/pqm4/sqisign_lvl5/ref/theta_isogenies.h
 create mode 100644 src/pqm4/sqisign_lvl5/ref/theta_structure.c
 create mode 100644 src/pqm4/sqisign_lvl5/ref/theta_structure.h
 create mode 100644 src/pqm4/sqisign_lvl5/ref/tools.h
 create mode 100644 src/pqm4/sqisign_lvl5/ref/tutil.h
 create mode 100644 src/pqm4/sqisign_lvl5/ref/verification.h
 create mode 100644 src/pqm4/sqisign_lvl5/ref/verify.c
 create mode 100644 src/pqm4/sqisign_lvl5/ref/xeval.c
 create mode 100644 src/pqm4/sqisign_lvl5/ref/xisog.c

diff --git a/.gitignore b/.gitignore
index ef8e9b6..c0c967e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,8 @@
+bulid
 build*/
 html/
 latex/
 .vscode
 
 *.DS_Store
+
diff --git a/src/pqm4/sqisign_lvl1/ref/api.h b/src/pqm4/sqisign_lvl1/ref/api.h
new file mode 100644
index 0000000..652f39f
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/api.h
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef api_h
+#define api_h
+
+#include <stddef.h>
+#include <sqisign_namespace.h>
+
+#define CRYPTO_SECRETKEYBYTES 353
+#define CRYPTO_PUBLICKEYBYTES 65
+#define CRYPTO_BYTES 148
+
+#define CRYPTO_ALGNAME "SQIsign_lvl1"
+
+SQISIGN_API
+int
+crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
+
+SQISIGN_API
+int
+crypto_sign(unsigned char *sm, size_t *smlen,
+            const unsigned char *m, size_t mlen,
+            const unsigned char *sk);
+
+SQISIGN_API
+int
+crypto_sign_open(unsigned char *m, size_t *mlen,
+                 const unsigned char *sm, size_t smlen,
+                 const unsigned char *pk);
+
+#endif /* api_h */
diff --git a/src/pqm4/sqisign_lvl1/ref/basis.c b/src/pqm4/sqisign_lvl1/ref/basis.c
new file mode 100644
index 0000000..94cb7fc
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/basis.c
@@ -0,0 +1,416 @@
+#include "ec.h"
+#include "fp2.h"
+#include "e0_basis.h"
+#include <assert.h>
+
+uint32_t
+ec_recover_y(fp2_t *y, const fp2_t *Px, const ec_curve_t *curve)
+{ // Recover a y-coordinate for the affine x-coordinate Px on the Montgomery curve y^2 = x^3 + Ax^2 + x (only curve->A is read)
+    fp2_t t0;
+
+    fp2_sqr(&t0, Px); // x^2
+    fp2_mul(y, &t0, &curve->A); // Ax^2
+    fp2_add(y, y, Px);          // Ax^2 + x
+    fp2_mul(&t0, &t0, Px); // x^3
+    fp2_add(y, y, &t0); // x^3 + Ax^2 + x
+    // The check done by fp2_sqrt_verify() is required, because we do not yet know that our curves
+    // are supersingular so our points live on the twist with B = 1. Its result is returned as-is.
+    return fp2_sqrt_verify(y);
+}
+
+static void
+difference_point(ec_point_t *PQ, const ec_point_t *P, const ec_point_t *Q, const ec_curve_t *curve)
+{
+    // Given P,Q in projective x-only, computes a deterministic choice for x(P-Q) and writes it to PQ.
+    // Based on Proposition 3 of https://eprint.iacr.org/2017/518.pdf
+
+    fp2_t Bxx, Bxz, Bzz, t0, t1;
+
+    fp2_mul(&t0, &P->x, &Q->x); // P.x*Q.x
+    fp2_mul(&t1, &P->z, &Q->z); // P.z*Q.z
+    fp2_sub(&Bxx, &t0, &t1);
+    fp2_sqr(&Bxx, &Bxx);
+    fp2_mul(&Bxx, &Bxx, &curve->C); // C*(P.x*Q.x-P.z*Q.z)^2
+    fp2_add(&Bxz, &t0, &t1);
+    fp2_mul(&t0, &P->x, &Q->z); // P.x*Q.z
+    fp2_mul(&t1, &P->z, &Q->x); // P.z*Q.x
+    fp2_add(&Bzz, &t0, &t1);
+    fp2_mul(&Bxz, &Bxz, &Bzz); // (P.x*Q.x+P.z*Q.z)(P.x*Q.z+P.z*Q.x)
+    fp2_sub(&Bzz, &t0, &t1);
+    fp2_sqr(&Bzz, &Bzz);
+    fp2_mul(&Bzz, &Bzz, &curve->C); // C*(P.x*Q.z-P.z*Q.x)^2
+    fp2_mul(&Bxz, &Bxz, &curve->C); // C*(P.x*Q.x+P.z*Q.z)(P.x*Q.z+P.z*Q.x)
+    fp2_mul(&t0, &t0, &t1); // P.x*Q.z*P.z*Q.x
+    fp2_mul(&t0, &t0, &curve->A); // A*P.x*Q.z*P.z*Q.x
+    fp2_add(&t0, &t0, &t0); // 2*A*P.x*Q.z*P.z*Q.x
+    fp2_add(&Bxz, &Bxz, &t0); // C*(P.x*Q.x+P.z*Q.z)(P.x*Q.z+P.z*Q.x) + 2*A*P.x*Q.z*P.z*Q.x
+
+    // To ensure that the denominator is a fourth power in Fp, we normalize by
+    // C*C_bar^2*(P.z)_bar^2*(Q.z)_bar^2 (x_bar = conjugate: same real part, negated imaginary part)
+    fp_copy(&t0.re, &curve->C.re);
+    fp_neg(&t0.im, &curve->C.im); // t0 = C_bar
+    fp2_sqr(&t0, &t0); // C_bar^2
+    fp2_mul(&t0, &t0, &curve->C); // C*C_bar^2
+    fp_copy(&t1.re, &P->z.re);
+    fp_neg(&t1.im, &P->z.im); // t1 = (P.z)_bar
+    fp2_sqr(&t1, &t1);
+    fp2_mul(&t0, &t0, &t1); // C*C_bar^2*(P.z)_bar^2
+    fp_copy(&t1.re, &Q->z.re);
+    fp_neg(&t1.im, &Q->z.im); // t1 = (Q.z)_bar
+    fp2_sqr(&t1, &t1);
+    fp2_mul(&t0, &t0, &t1); // full normalization factor
+    fp2_mul(&Bxx, &Bxx, &t0);
+    fp2_mul(&Bxz, &Bxz, &t0);
+    fp2_mul(&Bzz, &Bzz, &t0);
+
+    // Solving the quadratic equation Bzz*X^2 - 2*Bxz*X*Z + Bxx*Z^2 = 0 for (X : Z)
+    fp2_sqr(&t0, &Bxz);
+    fp2_mul(&t1, &Bxx, &Bzz);
+    fp2_sub(&t0, &t0, &t1); // Bxz^2 - Bxx*Bzz
+    // No need to check if t0 is square, as per the entangled basis algorithm.
+    fp2_sqrt(&t0);
+    fp2_add(&PQ->x, &Bxz, &t0); // X = Bxz + sqrt(Bxz^2 - Bxx*Bzz)
+    fp2_copy(&PQ->z, &Bzz); // Z = Bzz
+}
+
+// Lifts a basis x(P), x(Q), x(P-Q) to Jacobian points P, Q assuming the curve has (A/C : 1) and the point
+// P = (X/Z : 1). Returns the status reported by ec_recover_y(). For generic implementation see lift_basis()
+uint32_t
+lift_basis_normalized(jac_point_t *P, jac_point_t *Q, ec_basis_t *B, ec_curve_t *E)
+{
+    assert(fp2_is_one(&B->P.z));
+    assert(fp2_is_one(&E->C));
+
+    fp2_copy(&P->x, &B->P.x);
+    fp2_copy(&Q->x, &B->Q.x);
+    fp2_copy(&Q->z, &B->Q.z);
+    fp2_set_one(&P->z);
+    uint32_t ret = ec_recover_y(&P->y, &P->x, E); // y(P); ret is what fp2_sqrt_verify() reported
+
+    // Algorithm of Okeya-Sakurai to recover y.Q in the montgomery model
+    fp2_t v1, v2, v3, v4;
+    fp2_mul(&v1, &P->x, &Q->z);
+    fp2_add(&v2, &Q->x, &v1);
+    fp2_sub(&v3, &Q->x, &v1);
+    fp2_sqr(&v3, &v3);
+    fp2_mul(&v3, &v3, &B->PmQ.x);
+    fp2_add(&v1, &E->A, &E->A); // 2*A
+    fp2_mul(&v1, &v1, &Q->z); // 2*A*Q.z
+    fp2_add(&v2, &v2, &v1);
+    fp2_mul(&v4, &P->x, &Q->x);
+    fp2_add(&v4, &v4, &Q->z);
+    fp2_mul(&v2, &v2, &v4);
+    fp2_mul(&v1, &v1, &Q->z); // 2*A*Q.z^2
+    fp2_sub(&v2, &v2, &v1);
+    fp2_mul(&v2, &v2, &B->PmQ.z);
+    fp2_sub(&Q->y, &v3, &v2);
+    fp2_add(&v1, &P->y, &P->y); // 2*y(P)
+    fp2_mul(&v1, &v1, &Q->z);
+    fp2_mul(&v1, &v1, &B->PmQ.z); // 2*y(P)*Q.z*PmQ.z
+    fp2_mul(&Q->x, &Q->x, &v1); // scale Q.x and Q.z by the same factor
+    fp2_mul(&Q->z, &Q->z, &v1);
+
+    // Transforming to a jacobian coordinate: (X : Y : Z) -> (X*Z : Y*Z^2 : Z)
+    fp2_sqr(&v1, &Q->z);
+    fp2_mul(&Q->y, &Q->y, &v1);
+    fp2_mul(&Q->x, &Q->x, &Q->z);
+    return ret;
+}
+
+uint32_t
+lift_basis(jac_point_t *P, jac_point_t *Q, ec_basis_t *B, ec_curve_t *E)
+{
+    // Normalise the curve E such that (A : C) is (A/C : 1)
+    // and the point x(P) = (X/Z : 1). NOTE: both B->P and E are modified in place.
+    fp2_t inverses[2];
+    fp2_copy(&inverses[0], &B->P.z);
+    fp2_copy(&inverses[1], &E->C);
+
+    fp2_batched_inv(inverses, 2); // inverses[0] = 1/P.z, inverses[1] = 1/C as used below
+    fp2_set_one(&B->P.z);
+    fp2_set_one(&E->C);
+
+    fp2_mul(&B->P.x, &B->P.x, &inverses[0]); // X/Z
+    fp2_mul(&E->A, &E->A, &inverses[1]); // A/C
+
+    // Lift the basis to Jacobian points P, Q; the status of that call is returned unchanged
+    return lift_basis_normalized(P, Q, B, E);
+}
+
+// Given an affine x-coordinate, determines if this is a valid point on the curve by returning
+// fp2_is_square(x^3 + A*x^2 + x). Assumes C=1 (asserted below).
+static uint32_t
+is_on_curve(const fp2_t *x, const ec_curve_t *curve)
+{
+    assert(fp2_is_one(&curve->C));
+    fp2_t t0;
+
+    fp2_add(&t0, x, &curve->A); // x + (A/C)
+    fp2_mul(&t0, &t0, x);       // x^2 + (A/C)*x
+    fp2_add_one(&t0, &t0);      // x^2 + (A/C)*x + 1
+    fp2_mul(&t0, &t0, x);       // x^3 + (A/C)*x^2 + x
+
+    return fp2_is_square(&t0);
+}
+
+// Helper function which, given a point P of order k*2^n with n maximal
+// and k odd, computes a point of order 2^f. P is overwritten with the result.
+static inline void
+clear_cofactor_for_maximal_even_order(ec_point_t *P, ec_curve_t *curve, int f)
+{
+    // clear out the odd cofactor to get a point of order 2^n
+    ec_mul(P, p_cofactor_for_2f, P_COFACTOR_FOR_2F_BITLENGTH, P, curve);
+
+    // clear the power of two to get a point of order 2^f (TORSION_EVEN_POWER - f doublings)
+    for (int i = 0; i < TORSION_EVEN_POWER - f; i++) {
+        xDBL_A24(P, P, &curve->A24, curve->is_A24_computed_and_normalized);
+    }
+}
+
+// Helper function which finds an NQR -1 / (1 + i*b) for entangled basis generation; sets x = -A/(1 + i*b), returns the hint for b
+static uint8_t
+find_nqr_factor(fp2_t *x, ec_curve_t *curve, const uint8_t start)
+{
+    // factor = -1/(1 + i*b) for b in Fp will be NQR whenever 1 + b^2 is NQR
+    // in Fp, so we find one of these and then invert (1 + i*b). We store b
+    // as a u8 hint to save time in verification.
+
+    // We return the hint as a u8, but count with a uint16_t n, which allows up to
+    // 2^16 - 1 candidates and makes failure cryptographically negligible, with a
+    // fallback (hint = 0) when n > 128 is required.
+    uint8_t hint;
+    uint32_t found = 0;
+    uint16_t n = start; // the first candidate tested is b = start
+
+    bool qr_b = 1;
+    fp_t b, tmp;
+    fp2_t z, t0, t1;
+
+    do {
+        while (qr_b) {
+            // find b with 1 + b^2 a non-quadratic residue
+            fp_set_small(&tmp, (uint32_t)n * n + 1);
+            qr_b = fp_is_square(&tmp);
+            n++; // keeps track of b = n - 1
+        }
+
+        // for Px := -A/(1 + i*b) to be on the curve
+        // is equivalent to A^2*(z-1) - z^2 NQR for z = 1 + i*b
+        // thus prevents unnecessary inversion pre-check
+
+        // t0 = z - 1 = i*b
+        // z = 1 + i*b (t1 is only scratch space for A^2 and z^2)
+        fp_set_small(&b, (uint32_t)n - 1);
+        fp2_set_zero(&t0);
+        fp2_set_one(&z);
+        fp_copy(&z.im, &b);
+        fp_copy(&t0.im, &b);
+
+        // A^2*(z-1) - z^2
+        fp2_sqr(&t1, &curve->A);
+        fp2_mul(&t0, &t0, &t1); // A^2 * (z - 1)
+        fp2_sqr(&t1, &z);
+        fp2_sub(&t0, &t0, &t1); // A^2 * (z - 1) - z^2
+        found = !fp2_is_square(&t0);
+
+        qr_b = 1; // re-arm the inner search in case this b is rejected
+    } while (!found);
+
+    // set Px to -A/(1 + i*b)
+    fp2_copy(x, &z);
+    fp2_inv(x);
+    fp2_mul(x, x, &curve->A);
+    fp2_neg(x, x);
+
+    /*
+     * With very low probability b = n - 1 will not fit in 7 bits.
+     * We set hint = 0 which signals failure and the need
+     * to generate a value on the fly during verification
+     */
+    hint = n <= 128 ? n - 1 : 0;
+
+    return hint;
+}
+
+// Helper function which finds a point x(P) = n * A, trying n = start, start + 1, ...; returns n as the hint (or 0)
+static uint8_t
+find_nA_x_coord(fp2_t *x, ec_curve_t *curve, const uint8_t start)
+{
+    assert(!fp2_is_square(&curve->A)); // Only to be called when A is a NQR
+
+    // when A is NQR we allow x(P) to be a multiple n*A of A
+    uint8_t n = start;
+    if (n == 1) {
+        fp2_copy(x, &curve->A);
+    } else {
+        fp2_mul_small(x, &curve->A, n);
+    }
+
+    while (!is_on_curve(x, curve)) {
+        fp2_add(x, x, &curve->A); // x = (n + 1) * A
+        n++;
+    }
+
+    /*
+     * With very low probability (1/2^128), n will not fit in 7 bits.
+     * In this case, we set hint = 0 which signals failure and the need
+     * to generate a value on the fly during verification
+     */
+    uint8_t hint = n < 128 ? n : 0;
+    return hint;
+}
+
+// The entangled basis generation does not allow A = 0, so we simply return
+// the precomputed basis (BASIS_E0_PX, BASIS_E0_QX), doubled down to order 2^f
+static void
+ec_basis_E0_2f(ec_basis_t *PQ2, ec_curve_t *curve, int f)
+{
+    assert(fp2_is_zero(&curve->A));
+    ec_point_t P, Q;
+
+    // Set P, Q to precomputed (X : 1) values
+    fp2_copy(&P.x, &BASIS_E0_PX);
+    fp2_copy(&Q.x, &BASIS_E0_QX);
+    fp2_set_one(&P.z);
+    fp2_set_one(&Q.z);
+
+    // clear the power of two to get a point of order 2^f (TORSION_EVEN_POWER - f doublings each)
+    for (int i = 0; i < TORSION_EVEN_POWER - f; i++) {
+        xDBL_E0(&P, &P);
+        xDBL_E0(&Q, &Q);
+    }
+
+    // Set P, Q in the basis and compute x(P - Q)
+    copy_point(&PQ2->P, &P);
+    copy_point(&PQ2->Q, &Q);
+    difference_point(&PQ2->PmQ, &P, &Q, curve);
+}
+
+// Computes a basis E[2^f] = <P, Q> where the point Q is above (0 : 0)
+// and returns the hints packed into a single byte for faster recomputation at a later point (returns 0 when A = 0)
+uint8_t
+ec_curve_to_basis_2f_to_hint(ec_basis_t *PQ2, ec_curve_t *curve, int f)
+{
+    // Normalise (A/C : 1) and ((A + 2)/4 : 1)
+    ec_normalize_curve_and_A24(curve);
+
+    if (fp2_is_zero(&curve->A)) {
+        ec_basis_E0_2f(PQ2, curve, f);
+        return 0;
+    }
+
+    uint8_t hint;
+    bool hint_A = fp2_is_square(&curve->A);
+
+    // Compute the points P, Q
+    ec_point_t P, Q;
+
+    if (!hint_A) {
+        // when A is NQR we allow x(P) to be a multiple n*A of A
+        hint = find_nA_x_coord(&P.x, curve, 1);
+    } else {
+        // when A is QR we instead have to find (1 + b^2) a NQR
+        // such that x(P) = -A / (1 + i*b)
+        hint = find_nqr_factor(&P.x, curve, 1);
+    }
+
+    fp2_set_one(&P.z);
+    fp2_add(&Q.x, &curve->A, &P.x);
+    fp2_neg(&Q.x, &Q.x); // x(Q) = -x(P) - A
+    fp2_set_one(&Q.z);
+
+    // clear out the odd cofactor to get a point of order 2^f
+    clear_cofactor_for_maximal_even_order(&P, curve, f);
+    clear_cofactor_for_maximal_even_order(&Q, curve, f);
+
+    // compute x(P - Q) and store it as the basis Q, store the old Q as PmQ, to ensure Q above (0,0)
+    difference_point(&PQ2->Q, &P, &Q, curve);
+    copy_point(&PQ2->P, &P);
+    copy_point(&PQ2->PmQ, &Q);
+
+    // Finally, we compress hint_A and hint into a single byte.
+    // We choose to set the LSB of the result to hint_A and the upper 7 bits to hint
+    assert(hint < 128); // We expect hint to be 7-bits in size
+    return (hint << 1) | hint_A;
+}
+
+// Computes a basis E[2^f] = <P, Q> where the point Q is above (0 : 0), given the hint byte (LSB: A is a QR,
+// upper 7 bits: hint for x(P)) for faster basis computation. Returns 1, or 0 if a debug-only (!NDEBUG) check fails
+int
+ec_curve_to_basis_2f_from_hint(ec_basis_t *PQ2, ec_curve_t *curve, int f, const uint8_t hint)
+{
+    // Normalise (A/C : 1) and ((A + 2)/4 : 1)
+    ec_normalize_curve_and_A24(curve);
+
+    if (fp2_is_zero(&curve->A)) {
+        ec_basis_E0_2f(PQ2, curve, f);
+        return 1;
+    }
+
+    // The LSB of hint encodes whether A is a QR
+    // The remaining 7-bits are used to find a valid x(P)
+    bool hint_A = hint & 1;
+    uint8_t hint_P = hint >> 1;
+
+    // Compute the points P, Q
+    ec_point_t P, Q;
+
+    if (!hint_P) {
+        // When hint_P = 0 it means we did not find a point in 128 attempts
+        // this is very rare and we almost never expect to need this fallback
+        // In either case, we can start with b = 128 to skip testing the known
+        // values which will not work (the hints returned by the helpers are ignored here)
+        if (!hint_A) {
+            find_nA_x_coord(&P.x, curve, 128);
+        } else {
+            find_nqr_factor(&P.x, curve, 128);
+        }
+    } else {
+        // Otherwise we use the hint to directly find x(P) based on hint_A
+        if (!hint_A) {
+            // when A is NQR, we have found n such that x(P) = n*A
+            fp2_mul_small(&P.x, &curve->A, hint_P);
+        } else {
+            // when A is QR we have found b such that (1 + b^2) is a NQR in
+            // Fp, so we must compute x(P) = -A / (1 + i*b)
+            fp_set_one(&P.x.re);
+            fp_set_small(&P.x.im, hint_P);
+            fp2_inv(&P.x);
+            fp2_mul(&P.x, &P.x, &curve->A);
+            fp2_neg(&P.x, &P.x);
+        }
+    }
+    fp2_set_one(&P.z);
+
+#ifndef NDEBUG
+    int passed = 1;
+    passed = is_on_curve(&P.x, curve);
+    passed &= !fp2_is_square(&P.x);
+
+    if (!passed)
+        return 0;
+#endif
+
+    // set xQ to -xP - A
+    fp2_add(&Q.x, &curve->A, &P.x);
+    fp2_neg(&Q.x, &Q.x);
+    fp2_set_one(&Q.z);
+
+    // clear out the odd cofactor to get a point of order 2^f
+    clear_cofactor_for_maximal_even_order(&P, curve, f);
+    clear_cofactor_for_maximal_even_order(&Q, curve, f);
+
+    // compute x(P - Q) and store it as the basis Q, store the old Q as PmQ, to ensure Q above (0,0)
+    difference_point(&PQ2->Q, &P, &Q, curve);
+    copy_point(&PQ2->P, &P);
+    copy_point(&PQ2->PmQ, &Q);
+
+#ifndef NDEBUG
+    passed &= test_basis_order_twof(PQ2, curve, f);
+
+    if (!passed)
+        return 0;
+#endif
+
+    return 1;
+}
diff --git a/src/pqm4/sqisign_lvl1/ref/common.c b/src/pqm4/sqisign_lvl1/ref/common.c
new file mode 100644
index 0000000..d393e9c
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/common.c
@@ -0,0 +1,88 @@
+#include <fips202.h>
+#include <tutil.h>
+#include <mp.h>
+#include <encoded_sizes.h>
+#include <ec_params.h>
+#include <verification.h>
+
+void
+public_key_init(public_key_t *pk)
+{ // Initialise the curve stored in the public key
+    ec_curve_init(&pk->curve);
+}
+
+void
+public_key_finalize(public_key_t *pk)
+{ // Intentionally a no-op in this implementation; pk is not touched
+}
+
+// compute the challenge (written to *scalar) as the hash of the message and the j-invariants of the commitment curve and public key
+void
+hash_to_challenge(scalar_t *scalar,
+                  const public_key_t *pk,
+                  const ec_curve_t *com_curve,
+                  const unsigned char *message,
+                  size_t length)
+{
+    unsigned char buf[2 * FP2_ENCODED_BYTES]; // encode(j(pk->curve)) || encode(j(com_curve))
+    {
+        fp2_t j1, j2;
+        ec_j_inv(&j1, &pk->curve);
+        ec_j_inv(&j2, com_curve);
+        fp2_encode(buf, &j1);
+        fp2_encode(buf + FP2_ENCODED_BYTES, &j2);
+    }
+
+    {
+        // The type scalar_t represents an element of GF(p), which is about
+        // 2*lambda bits, where lambda = 128, 192 or 256, according to the
+        // security level. Thus, the variable scalar should have enough memory
+        // for the values produced by SHAKE256 in the intermediate iterations.
+
+        shake256incctx ctx;
+
+        size_t hash_bytes = ((2 * SECURITY_BITS) + 7) / 8; // 2*SECURITY_BITS bits, rounded up to bytes
+        size_t limbs = (hash_bytes + sizeof(digit_t) - 1) / sizeof(digit_t); // rounded up to whole digits
+        size_t bits = (2 * SECURITY_BITS) % RADIX;
+        digit_t mask = ((digit_t)-1) >> ((RADIX - bits) % RADIX); // keeps the low `bits` bits of the top limb (all bits if bits == 0)
+#ifdef TARGET_BIG_ENDIAN
+        mask = BSWAP_DIGIT(mask);
+#endif
+
+        shake256_inc_init(&ctx);
+        shake256_inc_absorb(&ctx, buf, 2 * FP2_ENCODED_BYTES);
+        shake256_inc_absorb(&ctx, message, length);
+        shake256_inc_finalize(&ctx);
+        shake256_inc_squeeze((void *)(*scalar), hash_bytes, &ctx); // first hash: SHAKE256(buf || message)
+        (*scalar)[limbs - 1] &= mask;
+        for (int i = 2; i < HASH_ITERATIONS; i++) { // intermediate iterations: re-hash the previous output
+            shake256_inc_init(&ctx);
+            shake256_inc_absorb(&ctx, (void *)(*scalar), hash_bytes);
+            shake256_inc_finalize(&ctx);
+            shake256_inc_squeeze((void *)(*scalar), hash_bytes, &ctx);
+            (*scalar)[limbs - 1] &= mask;
+        }
+        shake256_inc_init(&ctx); // last iteration: absorb here, squeeze below with the final output length
+        shake256_inc_absorb(&ctx, (void *)(*scalar), hash_bytes);
+        shake256_inc_finalize(&ctx);
+
+        hash_bytes = ((TORSION_EVEN_POWER - SQIsign_response_length) + 7) / 8; // final output length in bytes
+        limbs = (hash_bytes + sizeof(digit_t) - 1) / sizeof(digit_t);
+        bits = (TORSION_EVEN_POWER - SQIsign_response_length) % RADIX;
+        mask = ((digit_t)-1) >> ((RADIX - bits) % RADIX);
+#ifdef TARGET_BIG_ENDIAN
+        mask = BSWAP_DIGIT(mask);
+#endif
+
+        memset(*scalar, 0, NWORDS_ORDER * sizeof(digit_t)); // clear all digits before writing the shorter final output
+        shake256_inc_squeeze((void *)(*scalar), hash_bytes, &ctx);
+        (*scalar)[limbs - 1] &= mask;
+
+#ifdef TARGET_BIG_ENDIAN
+        for (int i = 0; i < NWORDS_ORDER; i++)
+            (*scalar)[i] = BSWAP_DIGIT((*scalar)[i]);
+#endif
+
+        mp_mod_2exp(*scalar, SECURITY_BITS, NWORDS_ORDER); // presumably reduces modulo 2^SECURITY_BITS; confirm in mp.c
+    }
+}
diff --git a/src/pqm4/sqisign_lvl1/ref/config.mk b/src/pqm4/sqisign_lvl1/ref/config.mk
new file mode 100644
index 0000000..212eb4a
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/config.mk
@@ -0,0 +1,2 @@
+elf/crypto_sign_sqisign_lvl1_ref_%.elf: CPPFLAGS+=-DRADIX_32 -DSQISIGN_BUILD_TYPE_REF -DSQISIGN_GF_IMPL_REF -DSQISIGN_VARIANT=lvl1 -DTARGET_ARM -DTARGET_OS_OTHER -DNDEBUG -DDISABLE_NAMESPACING -DBIG_PUBLIC_KEY_TESTS
+obj/libcrypto_sign_sqisign_lvl1_ref.a: CPPFLAGS+=-DRADIX_32 -DSQISIGN_BUILD_TYPE_REF -DSQISIGN_GF_IMPL_REF -DSQISIGN_VARIANT=lvl1 -DTARGET_ARM -DTARGET_OS_OTHER -DNDEBUG -DDISABLE_NAMESPACING -DBIG_PUBLIC_KEY_TESTS
diff --git a/src/pqm4/sqisign_lvl1/ref/e0_basis.c b/src/pqm4/sqisign_lvl1/ref/e0_basis.c
new file mode 100644
index 0000000..5be2b8e
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/e0_basis.c
@@ -0,0 +1,55 @@
+#include <e0_basis.h>
+const fp2_t BASIS_E0_PX = { // presumably the x-coordinate of basis point P on E0 (name-based; TODO confirm)
+#if 0 // first fp2_t component; limb layout picked by RADIX (and SQISIGN_GF_IMPL_BROADWELL when RADIX == 64)
+#elif RADIX == 16
+{0x107, 0xc, 0x1890, 0xf2a, 0x52b, 0xb68, 0x152d, 0xa4c, 0x1054, 0x642, 0x36a, 0x6f8, 0x7ad, 0x146c, 0x1d66, 0x1b67, 0x236, 0x10d, 0x1933, 0x3}
+#elif RADIX == 32
+{0x3020e, 0xb795624, 0x5ab6829, 0x1514995, 0x1b5190a, 0x187ad37c, 0x19facd46, 0x8688db6, 0x3c998}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x52b795624001810, 0x8c8505452654b56d, 0xf59a8d87ad37c0da, 0x24e4cc21a236db3}
+#else
+{0x5bcab12000c08, 0x452654b56d052, 0x26f81b5190a0a, 0x36cfd66a361eb, 0x12726610d11b}
+#endif
+#endif
+, // second fp2_t component follows, with the same layout selection
+#if 0
+#elif RADIX == 16
+{0x1f87, 0x83e, 0x32e, 0xe58, 0xd9d, 0x1416, 0x752, 0x13b4, 0x1efa, 0xe62, 0x12f5, 0x1907, 0x1814, 0x1ddd, 0x1aa6, 0x1420, 0x2cd, 0x1431, 0x1be2, 0x7}
+#elif RADIX == 32
+{0x120fbf0f, 0x1d72c0cb, 0xa54166c, 0x1bea7687, 0x197ab98b, 0x1b814c83, 0x8354ddd, 0x188b368, 0x2df15}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0xcd9d72c0cb907df8, 0x5cc5efa9da1d4a82, 0x6a9bbbb814c83cbd, 0x26ef8a8622cda10}
+#else
+{0x6b96065c83efc, 0x29da1d4a82cd9, 0x190797ab98bdf, 0x6841aa6eeee05, 0x1377c5431166}
+#endif
+#endif
+};
+const fp2_t BASIS_E0_QX = { // presumably the x-coordinate of basis point Q on E0 (name-based; TODO confirm)
+#if 0 // first fp2_t component; limb layout picked by RADIX (and SQISIGN_GF_IMPL_BROADWELL when RADIX == 64)
+#elif RADIX == 16
+{0x5ff, 0x1783, 0xadc, 0x775, 0xad4, 0x593, 0xb4c, 0x21e, 0x1cb2, 0x13d8, 0x179f, 0x680, 0x1a9c, 0x1824, 0x118e, 0x13d9, 0x24, 0x1956, 0x1dd2, 0x9}
+#elif RADIX == 32
+{0x5e0cbff, 0x143baab7, 0x9859356, 0x12c843cb, 0xbcfcf63, 0x9a9c340, 0x16631d82, 0xab00927, 0x4ee96}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x6ad43baab72f065f, 0xe7b1cb210f2d30b2, 0xc63b049a9c3405e7, 0x4ff74b2ac0249ec}
+#else
+{0x21dd55b97832f, 0x210f2d30b26ad, 0x680bcfcf6396, 0x27b318ec126a7, 0x4ffba5956012}
+#endif
+#endif
+, // second fp2_t component follows, with the same layout selection
+#if 0
+#elif RADIX == 16
+{0x1c7f, 0x1117, 0xa4, 0x1164, 0x6e, 0x1e63, 0x1b7b, 0x1305, 0x424, 0x131a, 0x1b61, 0xae3, 0x17b1, 0xe5e, 0x1848, 0x1e81, 0x14a5, 0x1cb5, 0x1d87, 0x8}
+#elif RADIX == 32
+{0x445f8ff, 0xe8b2029, 0xf7e6303, 0x109260bb, 0x1db0cc68, 0x1d7b1571, 0x7090e5, 0x5ad297d, 0x3ec3f}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x606e8b2029222fc7, 0x6634424982edefcc, 0xe121cbd7b1571ed8, 0x4f761f96b4a5f40}
+#else
+{0x74590149117e3, 0x4982edefcc606, 0x2ae3db0cc6884, 0x7d0384872f5ec, 0x4fbb0fcb5a52}
+#endif
+#endif
+};
diff --git a/src/pqm4/sqisign_lvl1/ref/e0_basis.h b/src/pqm4/sqisign_lvl1/ref/e0_basis.h
new file mode 100644
index 0000000..05cafb8
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/e0_basis.h
@@ -0,0 +1,3 @@
+#include <fp2.h>
+extern const fp2_t BASIS_E0_PX; // defined in e0_basis.c; value depends on the RADIX build setting
+extern const fp2_t BASIS_E0_QX; // defined in e0_basis.c; value depends on the RADIX build setting
diff --git a/src/pqm4/sqisign_lvl1/ref/ec.c b/src/pqm4/sqisign_lvl1/ref/ec.c
new file mode 100644
index 0000000..be4e4e5
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/ec.c
@@ -0,0 +1,665 @@
+#include <assert.h>
+#include <stdio.h>
+#include <mp.h>
+#include <ec.h>
+
+void
+ec_point_init(ec_point_t *P)
+{ // Set P to the identity element, written projectively as (X:Z) = (1:0)
+    fp2_set_zero(&P->z);
+    fp2_set_one(&P->x);
+}
+
+void
+ec_curve_init(ec_curve_t *E)
+{ // Reset E to its default state: (A:C) = (0:1), with nothing cached for A24
+    // No A24 has been computed yet, so the flag starts out false
+    E->is_A24_computed_and_normalized = false;
+
+    // The A24 slot is filled with the placeholder point (1:0)
+    ec_point_init(&E->A24);
+
+    // Curve constants: C = 1 and A = 0
+    fp2_set_one(&E->C);
+    fp2_set_zero(&E->A);
+}
+
+void
+select_point(ec_point_t *Q, const ec_point_t *P1, const ec_point_t *P2, const digit_t option)
+{ // Constant-time point selection, delegated coordinate by coordinate to fp2_select
+  // option = 0 yields Q <- P1, option = 0xFF...FF yields Q <- P2
+    fp2_select(&Q->z, &P1->z, &P2->z, option);
+    fp2_select(&Q->x, &P1->x, &P2->x, option);
+}
+
+void
+cswap_points(ec_point_t *P, ec_point_t *Q, const digit_t option)
+{ // Constant-time conditional swap, delegated coordinate by coordinate to fp2_cswap
+  // option = 0 leaves P and Q untouched, option = 0xFF...FF exchanges them
+    fp2_cswap(&P->z, &Q->z, option);
+    fp2_cswap(&P->x, &Q->x, option);
+}
+
+void
+ec_normalize_point(ec_point_t *P)
+{ // Rescale (X:Z) to (X/Z:1); Z is inverted in place before being multiplied into X
+    fp2_inv(&(P->z));
+    fp2_mul(&(P->x), &(P->x), &(P->z));
+    fp2_set_one(&P->z);
+}
+
+void
+ec_normalize_curve(ec_curve_t *E)
+{ // Rescale (A:C) to (A/C:1); C is inverted in place before being multiplied into A
+    fp2_inv(&(E->C));
+    fp2_mul(&(E->A), &(E->A), &(E->C));
+    fp2_set_one(&(E->C));
+}
+
+void
+ec_curve_normalize_A24(ec_curve_t *E)
+{ // Ensure E->A24 holds ((A+2C)/4C : 1); the computation is skipped when the flag is already set
+    if (E->is_A24_computed_and_normalized == false) {
+        AC_to_A24(&(E->A24), E);
+        ec_normalize_point(&(E->A24));
+        E->is_A24_computed_and_normalized = true;
+    }
+    assert(fp2_is_one(&(E->A24.z)));
+}
+
+void
+ec_normalize_curve_and_A24(ec_curve_t *E)
+{ // Neither the curve nor A24 is guaranteed to be normalized on entry.
+  // First normalize (A : C) to (A/C : 1); then compute A24 only if the flag says it is not available yet.
+    if (!fp2_is_one(&E->C)) {
+        ec_normalize_curve(E);
+    }
+
+    if (!E->is_A24_computed_and_normalized) {
+        // Now compute A24 = ((A + 2) / 4 : 1), valid because C = 1 at this point
+        fp2_add_one(&E->A24.x, &E->A);     // re(A24.x) = re(A) + 1
+        fp2_add_one(&E->A24.x, &E->A24.x); // re(A24.x) = re(A) + 2
+        fp_copy(&E->A24.x.im, &E->A.im);   // im(A24.x) = im(A)
+
+        fp2_half(&E->A24.x, &E->A24.x); // (A + 2) / 2
+        fp2_half(&E->A24.x, &E->A24.x); // (A + 2) / 4
+        fp2_set_one(&E->A24.z);
+
+        E->is_A24_computed_and_normalized = true;
+    }
+}
+
+uint32_t
+ec_is_zero(const ec_point_t *P)
+{ // Test for the point at infinity, i.e. Z = 0; the result of fp2_is_zero is passed through
+    return fp2_is_zero(&(P->z));
+}
+
+uint32_t
+ec_has_zero_coordinate(const ec_point_t *P)
+{ // Non-zero when X = 0 or Z = 0 (bitwise OR of the two fp2_is_zero results)
+    return fp2_is_zero(&P->z) | fp2_is_zero(&P->x);
+}
+
+uint32_t
+ec_is_equal(const ec_point_t *P, const ec_point_t *Q)
+{ // Evaluate if two points in Montgomery coordinates (X:Z) are equal
+  // Returns 0xFFFFFFFF (true) if P=Q, 0 (false) otherwise
+    fp2_t t0, t1;
+
+    // Masks telling whether P, Q are the point at infinity
+    uint32_t l_zero = ec_is_zero(P);
+    uint32_t r_zero = ec_is_zero(Q);
+
+    // Check if PX * QZ = QX * PZ (cross-multiplication avoids any inversion)
+    fp2_mul(&t0, &P->x, &Q->z);
+    fp2_mul(&t1, &P->z, &Q->x);
+    uint32_t lr_equal = fp2_is_equal(&t0, &t1);
+
+    // Points are equal if both are zero, or neither is zero AND PX * QZ = QX * PZ.
+    // All three terms are masks and must be combined with '&': a '*' here binds tighter than '&'
+    // and collapses 0xFFFFFFFF * 0xFFFFFFFF to 1, which breaks callers that negate the result with '~'.
+    return (l_zero & r_zero) | (~l_zero & ~r_zero & lr_equal);
+}
+
+uint32_t
+ec_is_two_torsion(const ec_point_t *P, const ec_curve_t *E)
+{ // Test whether P is a 2-torsion point other than the point at infinity
+    fp2_t sum_sq, diff_sq, cross;
+
+    // The point at infinity is excluded up front
+    if (ec_is_zero(P))
+        return 0;
+
+    fp2_add(&sum_sq, &P->x, &P->z);
+    fp2_sqr(&sum_sq, &sum_sq);             // (X+Z)^2
+    fp2_sub(&diff_sq, &P->x, &P->z);
+    fp2_sqr(&diff_sq, &diff_sq);           // (X-Z)^2
+    fp2_sub(&cross, &sum_sq, &diff_sq);    // 4XZ
+    fp2_add(&diff_sq, &sum_sq, &diff_sq);  // 2(X^2+Z^2)
+    fp2_mul(&cross, &cross, &E->A);        // 4AXZ
+    fp2_mul(&diff_sq, &diff_sq, &E->C);    // 2C(X^2+Z^2)
+    fp2_add(&diff_sq, &diff_sq, &diff_sq); // 4C(X^2+Z^2)
+    fp2_add(&sum_sq, &diff_sq, &cross);    // 4(CX^2+CZ^2+AXZ)
+
+    // 2-torsion exactly when X = 0 or CX^2 + AXZ + CZ^2 = 0
+    uint32_t x_vanishes = fp2_is_zero(&P->x);
+    uint32_t quad_vanishes = fp2_is_zero(&sum_sq);
+    return x_vanishes | quad_vanishes;
+}
+
+uint32_t
+ec_is_four_torsion(const ec_point_t *P, const ec_curve_t *E)
+{ // P passes when its double [2]P passes the non-trivial 2-torsion test
+    ec_point_t doubled;
+    xDBL_A24(&doubled, P, &(E->A24), E->is_A24_computed_and_normalized);
+    return ec_is_two_torsion(&doubled, E);
+}
+
+uint32_t
+ec_is_basis_four_torsion(const ec_basis_t *B, const ec_curve_t *E)
+{ // Double P and Q, then AND together: [2]P is 2-torsion, [2]Q is 2-torsion, and [2]P differs from [2]Q
+    ec_point_t dblP, dblQ;
+    xDBL_A24(&dblQ, &B->Q, &E->A24, E->is_A24_computed_and_normalized);
+    xDBL_A24(&dblP, &B->P, &E->A24, E->is_A24_computed_and_normalized);
+    return (~ec_is_equal(&dblP, &dblQ) & ec_is_two_torsion(&dblP, E) & ec_is_two_torsion(&dblQ, E));
+}
+
+int
+ec_curve_verify_A(const fp2_t *A)
+{ // A Montgomery coefficient is rejected when A^2 - 4 = 0, i.e. when A = 2 or A = -2
+  // Return 1 if curve is valid, 0 otherwise
+    fp2_t two;
+
+    fp2_set_one(&two);
+    fp_add(&two.re, &two.re, &two.re); // two = 1 + 1
+    if (fp2_is_equal(A, &two)) {
+        return 0;
+    }
+    fp_neg(&two.re, &two.re); // two now holds -2
+    return fp2_is_equal(A, &two) ? 0 : 1;
+}
+
+int
+ec_curve_init_from_A(ec_curve_t *E, const fp2_t *A)
+{ // Reset E to defaults and store A as its coefficient; returns 1 if A is valid, 0 otherwise
+    ec_curve_init(E);
+    fp2_copy(&(E->A), A);
+    const int is_valid = ec_curve_verify_A(A);
+    return is_valid;
+}
+
+void
+ec_j_inv(fp2_t *j_inv, const ec_curve_t *curve)
+{ // j-invariant from the Montgomery coefficient (A:C): j = 256*(A^2-3C^2)^3 / (C^4*(A^2-4C^2))
+    fp2_t t0, t1;
+
+    fp2_sqr(&t1, &curve->C);    // t1 = C^2
+    fp2_sqr(j_inv, &curve->A);  // j  = A^2
+    fp2_add(&t0, &t1, &t1);     // t0 = 2C^2
+    fp2_sub(&t0, j_inv, &t0);   // t0 = A^2 - 2C^2
+    fp2_sub(&t0, &t0, &t1);     // t0 = A^2 - 3C^2
+    fp2_sub(j_inv, &t0, &t1);   // j  = A^2 - 4C^2
+    fp2_sqr(&t1, &t1);          // t1 = C^4
+    fp2_mul(j_inv, j_inv, &t1); // j  = C^4 * (A^2 - 4C^2), the denominator
+    fp2_add(&t0, &t0, &t0);
+    fp2_add(&t0, &t0, &t0);     // t0 = 4(A^2 - 3C^2)
+    fp2_sqr(&t1, &t0);
+    fp2_mul(&t0, &t0, &t1);     // t0 = 64(A^2 - 3C^2)^3
+    fp2_add(&t0, &t0, &t0);
+    fp2_add(&t0, &t0, &t0);     // t0 = 256(A^2 - 3C^2)^3, the numerator
+    fp2_inv(j_inv);             // in-place inversion of the denominator
+    fp2_mul(j_inv, &t0, j_inv);
+}
+
+void
+xDBL_E0(ec_point_t *Q, const ec_point_t *P)
+{ // x-only doubling specialised to the curve E0, whose constants are (A:C) = (0:1).
+  // Input: P = (XP:ZP) with xP = XP/ZP.
+  // Output: Q <- 2*P = (XQ:ZQ) with x(2P) = XQ/ZQ. P is fully read before Q is first written.
+    fp2_t plus_sq, minus_sq, cross;
+
+    fp2_add(&plus_sq, &P->x, &P->z);
+    fp2_sqr(&plus_sq, &plus_sq);              // (X+Z)^2
+    fp2_sub(&minus_sq, &P->x, &P->z);
+    fp2_sqr(&minus_sq, &minus_sq);            // (X-Z)^2
+    fp2_sub(&cross, &plus_sq, &minus_sq);     // 4XZ
+    fp2_add(&minus_sq, &minus_sq, &minus_sq); // 2(X-Z)^2
+    fp2_mul(&Q->x, &plus_sq, &minus_sq);
+    fp2_add(&Q->z, &minus_sq, &cross);        // 2(X^2+Z^2)
+    fp2_mul(&Q->z, &Q->z, &cross);
+}
+
+void
+xDBL(ec_point_t *Q, const ec_point_t *P, const ec_point_t *AC)
+{ // Doubling of a Montgomery point in projective coordinates (X:Z). The coefficient values A+2C and 4C
+  // are computed on the fly from AC, whose x field holds A and whose z field holds C.
+  // Input: projective Montgomery x-coordinates P = (XP:ZP), where xP=XP/ZP, and Montgomery curve constants (A:C). 
+  // Output: projective Montgomery x-coordinates Q <- 2*P = (XQ:ZQ) such that x(2P)=XQ/ZQ.
+    fp2_t t0, t1, t2, t3;
+
+    fp2_add(&t0, &P->x, &P->z);
+    fp2_sqr(&t0, &t0);            // t0 = (X+Z)^2
+    fp2_sub(&t1, &P->x, &P->z);
+    fp2_sqr(&t1, &t1);            // t1 = (X-Z)^2
+    fp2_sub(&t2, &t0, &t1);       // t2 = 4XZ
+    fp2_add(&t3, &AC->z, &AC->z); // t3 = 2C
+    fp2_mul(&t1, &t1, &t3);
+    fp2_add(&t1, &t1, &t1);       // t1 = 4C(X-Z)^2
+    fp2_mul(&Q->x, &t0, &t1);     // XQ = 4C(X+Z)^2(X-Z)^2
+    fp2_add(&t0, &t3, &AC->x);    // t0 = A+2C
+    fp2_mul(&t0, &t0, &t2);       // t0 = (A+2C)*4XZ
+    fp2_add(&t0, &t0, &t1);
+    fp2_mul(&Q->z, &t0, &t2);     // ZQ = 4XZ*(4C(X-Z)^2 + (A+2C)*4XZ)
+}
+
+void
+xDBL_A24(ec_point_t *Q, const ec_point_t *P, const ec_point_t *A24, const bool A24_normalized)
+{ // x-only doubling driven by the precomputed constant A24.
+  // Input: P = (XP:ZP) and A24 = (A+2C:4C); with A24_normalized set, A24 = ((A+2C)/4C:1) is assumed.
+  // Output: Q <- 2*P = (XQ:ZQ) with x(2P) = XQ/ZQ.
+    fp2_t plus_sq, minus_sq, cross;
+
+    fp2_add(&plus_sq, &P->x, &P->z);
+    fp2_sqr(&plus_sq, &plus_sq);          // (X+Z)^2
+    fp2_sub(&minus_sq, &P->x, &P->z);
+    fp2_sqr(&minus_sq, &minus_sq);        // (X-Z)^2
+    fp2_sub(&cross, &plus_sq, &minus_sq); // 4XZ
+    if (!A24_normalized) {
+        fp2_mul(&minus_sq, &minus_sq, &A24->z); // only needed while the z of A24 is not 1
+    }
+    fp2_mul(&Q->x, &plus_sq, &minus_sq);
+    fp2_mul(&plus_sq, &cross, &A24->x);
+    fp2_add(&plus_sq, &plus_sq, &minus_sq);
+    fp2_mul(&Q->z, &plus_sq, &cross);
+}
+
+void
+xADD(ec_point_t *R, const ec_point_t *P, const ec_point_t *Q, const ec_point_t *PQ)
+{ // x-only differential addition on a Montgomery curve.
+  // Input: P = (XP:ZP), Q = (XQ:ZQ) and their difference PQ = P-Q = (XPQ:ZPQ).
+  // Output: R <- P+Q = (XR:ZR) with x(P+Q) = XR/ZR.
+  // The x result is staged in a temporary so that R->x is the very last thing written.
+    fp2_t sumP, difP, sumQ, difQ;
+
+    fp2_add(&sumP, &P->x, &P->z);
+    fp2_sub(&difP, &P->x, &P->z);
+    fp2_add(&sumQ, &Q->x, &Q->z);
+    fp2_sub(&difQ, &Q->x, &Q->z);
+    fp2_mul(&sumP, &sumP, &difQ);  // (XP+ZP)(XQ-ZQ)
+    fp2_mul(&difP, &difP, &sumQ);  // (XP-ZP)(XQ+ZQ)
+    fp2_add(&sumQ, &sumP, &difP);
+    fp2_sub(&difQ, &sumP, &difP);
+    fp2_sqr(&sumQ, &sumQ);
+    fp2_sqr(&difQ, &difQ);
+    fp2_mul(&sumQ, &PQ->z, &sumQ); // XR, still held in the temporary
+    fp2_mul(&R->z, &PQ->x, &difQ);
+    fp2_copy(&R->x, &sumQ);
+}
+
+void
+xDBLADD(ec_point_t *R,
+        ec_point_t *S,
+        const ec_point_t *P,
+        const ec_point_t *Q,
+        const ec_point_t *PQ,
+        const ec_point_t *A24,
+        const bool A24_normalized)
+{ // Simultaneous doubling and differential addition.
+  // Input:  projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ, the difference
+  //         PQ=P-Q=(XPQ:ZPQ), and the Montgomery curve constants A24 = (A+2C:4C) (or A24 = (A+2C/4C:1) if normalized).
+  // Output: projective Montgomery points R <- 2*P = (XR:ZR) such that x(2P)=XR/ZR, and S <- P+Q = (XS:ZS) such that
+  //         x(Q+P)=XS/ZS.
+    fp2_t t0, t1, t2;
+
+    fp2_add(&t0, &P->x, &P->z);     // t0 = XP+ZP
+    fp2_sub(&t1, &P->x, &P->z);     // t1 = XP-ZP; P is not read again after this line
+    fp2_sqr(&R->x, &t0);            // (XP+ZP)^2
+    fp2_sub(&t2, &Q->x, &Q->z);     // t2 = XQ-ZQ
+    fp2_add(&S->x, &Q->x, &Q->z);   // XQ+ZQ; Q is not read again after this line
+    fp2_mul(&t0, &t0, &t2);         // t0 = (XP+ZP)(XQ-ZQ)
+    fp2_sqr(&R->z, &t1);            // (XP-ZP)^2
+    fp2_mul(&t1, &t1, &S->x);       // t1 = (XP-ZP)(XQ+ZQ)
+    fp2_sub(&t2, &R->x, &R->z);     // t2 = 4*XP*ZP
+    if (!A24_normalized)
+        fp2_mul(&R->z, &R->z, &A24->z); // skipped when the caller states that A24->z is 1
+    fp2_mul(&R->x, &R->x, &R->z);   // XR
+    fp2_mul(&S->x, &A24->x, &t2);
+    fp2_sub(&S->z, &t0, &t1);
+    fp2_add(&R->z, &R->z, &S->x);
+    fp2_add(&S->x, &t0, &t1);
+    fp2_mul(&R->z, &R->z, &t2);     // ZR
+    fp2_sqr(&S->z, &S->z);
+    fp2_sqr(&S->x, &S->x);
+    fp2_mul(&S->z, &S->z, &PQ->x);  // ZS
+    fp2_mul(&S->x, &S->x, &PQ->z);  // XS
+}
+
+void
+xMUL(ec_point_t *Q, const ec_point_t *P, const digit_t *k, const int kbits, const ec_curve_t *curve)
+{ // The Montgomery ladder
+  // Input: projective Montgomery point P=(XP:ZP) such that xP=XP/ZP, a scalar k of bitlength kbits, and
+  //        the Montgomery curve constants (A:C) (or A24 = (A+2C/4C:1) if normalized).
+  // Output: projective Montgomery points Q <- k*P = (XQ:ZQ) such that x(k*P)=XQ/ZQ.
+    ec_point_t R0, R1, A24;
+    digit_t mask;
+    unsigned int bit, prevbit = 0, swap;
+
+    if (!curve->is_A24_computed_and_normalized) {
+        // Computation of A24=(A+2C:4C); its z-coordinate is 4C, i.e. NOT normalized
+        fp2_add(&A24.x, &curve->C, &curve->C);
+        fp2_add(&A24.z, &A24.x, &A24.x);
+        fp2_add(&A24.x, &A24.x, &curve->A);
+    } else {
+        fp2_copy(&A24.x, &curve->A24.x);
+        fp2_copy(&A24.z, &curve->A24.z);
+        // Assert A24 has been normalised
+        assert(fp2_is_one(&A24.z));
+    }
+
+    // R0 <- (1:0), R1 <- P
+    ec_point_init(&R0);
+    fp2_copy(&R1.x, &P->x);
+    fp2_copy(&R1.z, &P->z);
+
+    // Main loop: scan the scalar from bit kbits-1 down to bit 0, swapping lazily on bit changes
+    for (int i = kbits - 1; i >= 0; i--) {
+        bit = (k[i >> LOG2RADIX] >> (i & (RADIX - 1))) & 1;
+        swap = bit ^ prevbit;
+        prevbit = bit;
+        mask = 0 - (digit_t)swap;
+        // Pass the real normalization state: claiming "normalized" for the on-the-fly A24 drops the factor 4C
+        cswap_points(&R0, &R1, mask);
+        xDBLADD(&R0, &R1, &R0, &R1, P, &A24, curve->is_A24_computed_and_normalized);
+    }
+    swap = 0 ^ prevbit;
+    mask = 0 - (digit_t)swap;
+    cswap_points(&R0, &R1, mask);
+
+    fp2_copy(&Q->x, &R0.x);
+    fp2_copy(&Q->z, &R0.z);
+}
+
+int
+xDBLMUL(ec_point_t *S,
+        const ec_point_t *P,
+        const digit_t *k,
+        const ec_point_t *Q,
+        const digit_t *l,
+        const ec_point_t *PQ,
+        const int kbits,
+        const ec_curve_t *curve)
+{ // The Montgomery biladder
+  // Input:  projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ, scalars k and l of
+  //         bitlength kbits, the difference PQ=P-Q=(XPQ:ZPQ), and the Montgomery curve constants (A:C).
+  // Output: projective Montgomery point S <- k*P + l*Q = (XS:ZS) such that x(k*P + l*Q)=XS/ZS.
+  // Returns 1 on success, 0 when an input (or P+Q) has a zero coordinate and the formulas do not apply.
+    int i, A_is_zero;
+    digit_t evens, mevens, bitk0, bitl0, maskk, maskl, temp, bs1_ip1, bs2_ip1, bs1_i, bs2_i, h;
+    digit_t sigma[2] = { 0 }, pre_sigma = 0;
+    digit_t k_t[NWORDS_ORDER], l_t[NWORDS_ORDER], one[NWORDS_ORDER] = { 0 }, r[2 * BITS] = { 0 }; // NOTE(review): r is indexed up to 2*kbits-1, so kbits <= BITS is required
+    ec_point_t DIFF1a, DIFF1b, DIFF2a, DIFF2b, R[3] = { 0 }, T[3];
+
+    // differential additions formulas are invalid in this case
+    if (ec_has_zero_coordinate(P) | ec_has_zero_coordinate(Q) | ec_has_zero_coordinate(PQ))
+        return 0;
+
+    // Derive sigma according to parity
+    bitk0 = (k[0] & 1);
+    bitl0 = (l[0] & 1);
+    maskk = 0 - bitk0; // Parity masks: 0 if even, otherwise 1...1
+    maskl = 0 - bitl0;
+    sigma[0] = (bitk0 ^ 1);
+    sigma[1] = (bitl0 ^ 1);
+    evens = sigma[0] + sigma[1]; // Count number of even scalars
+    mevens = 0 - (evens & 1);    // Mask mevens <- 0 if # even of scalars = 0 or 2, otherwise mevens = 1...1
+
+    // If k and l are both even or both odd, pick sigma = (0,1)
+    sigma[0] = (sigma[0] & mevens);
+    sigma[1] = (sigma[1] & mevens) | (1 & ~mevens);
+
+    // Convert even scalars to odd: k-1 / l-1 are computed, then the original is restored when it was already odd
+    one[0] = 1;
+    mp_sub(k_t, k, one, NWORDS_ORDER);
+    mp_sub(l_t, l, one, NWORDS_ORDER);
+    select_ct(k_t, k_t, k, maskk, NWORDS_ORDER);
+    select_ct(l_t, l_t, l, maskl, NWORDS_ORDER);
+
+    // Scalar recoding: two recoded digits r[2i], r[2i+1] are produced per scalar bit
+    for (i = 0; i < kbits; i++) {
+        // If sigma[0] = 1 swap k_t and l_t
+        maskk = 0 - (sigma[0] ^ pre_sigma);
+        swap_ct(k_t, l_t, maskk, NWORDS_ORDER);
+
+        if (i == kbits - 1) {
+            bs1_ip1 = 0;
+            bs2_ip1 = 0;
+        } else {
+            bs1_ip1 = mp_shiftr(k_t, 1, NWORDS_ORDER);
+            bs2_ip1 = mp_shiftr(l_t, 1, NWORDS_ORDER);
+        }
+        bs1_i = k_t[0] & 1;
+        bs2_i = l_t[0] & 1;
+
+        r[2 * i] = bs1_i ^ bs1_ip1;
+        r[2 * i + 1] = bs2_i ^ bs2_ip1;
+
+        // Revert sigma if second bit, r_(2i+1), is 1
+        pre_sigma = sigma[0];
+        maskk = 0 - r[2 * i + 1];
+        select_ct(&temp, &sigma[0], &sigma[1], maskk, 1);
+        select_ct(&sigma[1], &sigma[1], &sigma[0], maskk, 1);
+        sigma[0] = temp;
+    }
+
+    // Point initialization: R[0] = (1:0), R[1] and R[2] are P and Q in the order chosen by sigma[0]
+    ec_point_init(&R[0]);
+    maskk = 0 - sigma[0];
+    select_point(&R[1], P, Q, maskk);
+    select_point(&R[2], Q, P, maskk);
+
+    fp2_copy(&DIFF1a.x, &R[1].x);
+    fp2_copy(&DIFF1a.z, &R[1].z);
+    fp2_copy(&DIFF1b.x, &R[2].x);
+    fp2_copy(&DIFF1b.z, &R[2].z);
+
+    // Initialize DIFF2a <- P+Q, DIFF2b <- P-Q
+    xADD(&R[2], &R[1], &R[2], PQ);
+    if (ec_has_zero_coordinate(&R[2]))
+        return 0; // non valid formulas
+
+    fp2_copy(&DIFF2a.x, &R[2].x);
+    fp2_copy(&DIFF2a.z, &R[2].z);
+    fp2_copy(&DIFF2b.x, &PQ->x);
+    fp2_copy(&DIFF2b.z, &PQ->z);
+
+    A_is_zero = fp2_is_zero(&curve->A);
+
+    // Main loop
+    for (i = kbits - 1; i >= 0; i--) {
+        h = r[2 * i] + r[2 * i + 1]; // in {0, 1, 2}
+        maskk = 0 - (h & 1);
+        select_point(&T[0], &R[0], &R[1], maskk);
+        maskk = 0 - (h >> 1);
+        select_point(&T[0], &T[0], &R[2], maskk); // T[0] = R[h]
+        if (A_is_zero) {
+            xDBL_E0(&T[0], &T[0]);
+        } else {
+            assert(fp2_is_one(&curve->A24.z)); // the caller must supply a normalized A24 when A != 0
+            xDBL_A24(&T[0], &T[0], &curve->A24, true);
+        }
+
+        maskk = 0 - r[2 * i + 1]; // in {0, 1}
+        select_point(&T[1], &R[0], &R[1], maskk);
+        select_point(&T[2], &R[1], &R[2], maskk);
+
+        cswap_points(&DIFF1a, &DIFF1b, maskk);
+        xADD(&T[1], &T[1], &T[2], &DIFF1a);
+        xADD(&T[2], &R[0], &R[2], &DIFF2a);
+
+        // If hw (mod 2) = 1 then swap DIFF2a and DIFF2b
+        maskk = 0 - (h & 1);
+        cswap_points(&DIFF2a, &DIFF2b, maskk);
+
+        // R <- T
+        copy_point(&R[0], &T[0]);
+        copy_point(&R[1], &T[1]);
+        copy_point(&R[2], &T[2]);
+    }
+
+    // Output R[evens]: R[1] when exactly one scalar is even, otherwise R[0] ...
+    select_point(S, &R[0], &R[1], mevens);
+
+    maskk = 0 - (bitk0 & bitl0); // ... overridden by R[2] when both scalars are odd
+    select_point(S, S, &R[2], maskk);
+    return 1;
+}
+
+int
+ec_ladder3pt(ec_point_t *R,
+             const digit_t *m,
+             const ec_point_t *P,
+             const ec_point_t *Q,
+             const ec_point_t *PQ,
+             const ec_curve_t *E)
+{ // Three-point Montgomery ladder, scanning all NWORDS_ORDER * RADIX bits of m from the lowest upwards
+  // Input:  points P = (XP:ZP) and Q = (XQ:ZQ), the scalar m, the difference PQ = P-Q = (XPQ:ZPQ),
+  //         and a curve E whose A24 is already normalized to ((A+2C)/4C : 1).
+  // Output: R <- P + m*Q = (XR:ZR); returns 1 on success, 0 when the inputs are rejected.
+    assert(E->is_A24_computed_and_normalized);
+    if (!fp2_is_one(&E->A24.z))
+        return 0;
+    // The differential formulas break down when PQ has a zero coordinate
+    if (ec_has_zero_coordinate(PQ))
+        return 0;
+
+    // acc_q is doubled on every step; acc_p and acc_d are the pair that gets conditionally swapped
+    ec_point_t acc_q, acc_p, acc_d;
+    copy_point(&acc_q, Q);
+    copy_point(&acc_p, P);
+    copy_point(&acc_d, PQ);
+
+    for (int word = 0; word < NWORDS_ORDER; word++) {
+        digit_t probe = 1;
+        for (int pos = 0; pos < RADIX; pos++) {
+            // All-ones when the current bit of m is clear, zero when it is set
+            const digit_t clear_mask = (digit_t)0 - (digit_t)((probe & m[word]) == 0);
+            cswap_points(&acc_p, &acc_d, clear_mask);
+            xDBLADD(&acc_q, &acc_p, &acc_q, &acc_p, &acc_d, &E->A24, true);
+            cswap_points(&acc_p, &acc_d, clear_mask);
+            probe <<= 1;
+        }
+    }
+
+    copy_point(R, &acc_p);
+    return 1;
+}
+
+// WRAPPERS to export
+
+void
+ec_dbl(ec_point_t *res, const ec_point_t *P, const ec_curve_t *curve)
+{ // res <- [2]P, using the cheaper formula when a normalized A24 is available
+    if (!curve->is_A24_computed_and_normalized) {
+        // No cached A24: xDBL derives A+2C and 4C itself, reading (A:C) through the cast
+        xDBL(res, P, (const ec_point_t *)curve);
+        return;
+    }
+    // A24 = ((A+2)/4 : 1) is cached, which saves multiplications
+    assert(fp2_is_one(&curve->A24.z));
+    xDBL_A24(res, P, &curve->A24, true);
+}
+
+void
+ec_dbl_iter(ec_point_t *res, int n, const ec_point_t *P, ec_curve_t *curve)
+{ // res <- [2^n]P; for n > 50 the curve's A24 is normalized first, which may modify *curve
+    if (n == 0) {
+        copy_point(res, P);
+        return;
+    }
+
+    // Long chains are worth the one-off cost of normalising A24
+    if (n > 50)
+        ec_curve_normalize_A24(curve);
+
+    // The first doubling reads P, every later one updates res in place
+    if (!curve->is_A24_computed_and_normalized) {
+        // Generic doubling straight from (A:C)
+        xDBL(res, P, (const ec_point_t *)curve);
+        for (int done = 1; done < n; done++)
+            xDBL(res, res, (const ec_point_t *)curve);
+        return;
+    }
+
+    // A normalized A24 saves multiplications in every doubling
+    assert(fp2_is_one(&curve->A24.z));
+    xDBL_A24(res, P, &curve->A24, true);
+    for (int done = 1; done < n; done++) {
+        assert(fp2_is_one(&curve->A24.z));
+        xDBL_A24(res, res, &curve->A24, true);
+    }
+}
+
+void
+ec_dbl_iter_basis(ec_basis_t *res, int n, const ec_basis_t *B, ec_curve_t *curve)
+{ // Apply n doublings to each of the three stored points P, Q and PmQ of the basis
+    ec_dbl_iter(&(res->P), n, &(B->P), curve);
+    ec_dbl_iter(&(res->Q), n, &(B->Q), curve);
+    ec_dbl_iter(&(res->PmQ), n, &(B->PmQ), curve);
+}
+
+void
+ec_mul(ec_point_t *res, const digit_t *scalar, const int kbits, const ec_point_t *P, ec_curve_t *curve)
+{ // res <- [scalar]P through xMUL, looking at kbits bits of scalar
+    const bool long_scalar = kbits > 50;
+
+    // Above 50 bits the one-off normalisation of A24 is considered worthwhile (it modifies *curve)
+    if (long_scalar)
+        ec_curve_normalize_A24(curve);
+
+    xMUL(res, P, scalar, kbits, curve);
+}
+
+int
+ec_biscalar_mul(ec_point_t *res,
+                const digit_t *scalarP,
+                const digit_t *scalarQ,
+                const int kbits,
+                const ec_basis_t *PQ,
+                const ec_curve_t *curve)
+{ // res <- scalarP * P + scalarQ * Q for the basis PQ; returns 1 on success, 0 on failure
+    if (fp2_is_zero(&PQ->PmQ.z))
+        return 0;
+
+    if (kbits != 1) {
+        // General case: work on a local copy so the caller's (const) curve is left untouched
+        ec_curve_t E;
+        copy_curve(&E, curve);
+
+        // A24 is normalized unless A is zero
+        if (!fp2_is_zero(&curve->A))
+            ec_curve_normalize_A24(&E);
+        return xDBLMUL(res, &PQ->P, scalarP, &PQ->Q, scalarQ, &PQ->PmQ, kbits, (const ec_curve_t *)&E);
+    }
+
+    /* Differential additions behave badly when PmQ = (0:1), so the case
+     * kbits == 1 is answered by direct lookup instead. Since P, Q are assumed
+     * to be a basis, this is the only case where that can happen. */
+    // Sanity check: our basis should be given by 2-torsion points
+    if (!ec_is_two_torsion(&PQ->P, curve) || !ec_is_two_torsion(&PQ->Q, curve) ||
+        !ec_is_two_torsion(&PQ->PmQ, curve))
+        return 0;
+
+    // Only the lowest bit of each scalar matters; they are packed as (bQ << 1) | bP
+    const digit_t bP = (scalarP[0] & 1);
+    const digit_t bQ = (scalarQ[0] & 1);
+    switch ((bQ << 1) | bP) {
+        case 0: ec_point_init(res); break; //(1: 0)
+        case 1: copy_point(res, &PQ->P); break;
+        case 2: copy_point(res, &PQ->Q); break;
+        case 3: copy_point(res, &PQ->PmQ); break;
+        default: assert(0); // should never happen
+    }
+
+    return 1;
+}
diff --git a/src/pqm4/sqisign_lvl1/ref/ec.h b/src/pqm4/sqisign_lvl1/ref/ec.h
new file mode 100644
index 0000000..ee2be38
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/ec.h
@@ -0,0 +1,668 @@
+﻿/** @file
+ *
+ * @authors Luca De Feo, Francisco RH
+ *
+ * @brief Elliptic curve stuff
+ */
+
+#ifndef EC_H
+#define EC_H
+#include <sqisign_namespace.h>
+#include <ec_params.h>
+#include <fp2.h>
+#include <tools.h>
+#include <stdio.h>
+
+/** @defgroup ec Elliptic curves
+ * @{
+ */
+
+/** @defgroup ec_t Data structures
+ * @{
+ */
+
+/** @brief Projective point on the Kummer line E/pm 1 in Montgomery coordinates
+ *
+ * @typedef ec_point_t
+ *
+ * @struct ec_point_t
+ *
+ * A projective point in (X:Z) or (X:Y:Z) coordinates (tbd).
+ */
+typedef struct ec_point_t
+{
+    fp2_t x; ///< X coordinate of (X:Z)
+    fp2_t z; ///< Z coordinate of (X:Z); ec_is_zero() treats Z = 0 as the point at infinity
+} ec_point_t;
+
+/** @brief Projective point in Montgomery coordinates
+ *
+ * @typedef jac_point_t
+ *
+ * @struct jac_point_t
+ *
+ * A projective point in (X:Y:Z) coordinates
+ */
+typedef struct jac_point_t
+{
+    fp2_t x;
+    fp2_t y;
+    fp2_t z;
+} jac_point_t;
+
+/** @brief Addition components
+ *
+ * @typedef add_components_t
+ *
+ * @struct add_components_t
+ *
+ * 3 components u,v,w that define the (X:Z) coordinates of both
+ * addition and subtraction of two distinct points with
+ * P+Q = (u-v:w) and P-Q = (u+v:w)
+ */
+typedef struct add_components_t
+{
+    fp2_t u;
+    fp2_t v;
+    fp2_t w;
+} add_components_t;
+
+/** @brief A basis of a torsion subgroup
+ *
+ * @typedef ec_basis_t
+ *
+ * @struct ec_basis_t
+ *
+ * A pair of points (or a triplet, tbd) forming a basis of a torsion subgroup.
+ */
+typedef struct ec_basis_t
+{
+    ec_point_t P;   ///< first basis point
+    ec_point_t Q;   ///< second basis point
+    ec_point_t PmQ; ///< the difference P - Q, passed as the difference point to the ladders in ec.c
+} ec_basis_t;
+
+/** @brief An elliptic curve
+ *
+ * @typedef ec_curve_t
+ *
+ * @struct ec_curve_t
+ *
+ * An elliptic curve in projective Montgomery form
+ */
+typedef struct ec_curve_t
+{
+    fp2_t A;                             ///< A and C must stay the first two members: ec.c reads (A:C) via an (ec_point_t *) cast
+    fp2_t C;                             ///< cannot be 0
+    ec_point_t A24;                      // the point (A+2C : 4C), stored as ((A+2C)/4C : 1) once normalized
+    bool is_A24_computed_and_normalized; // says if A24 has been computed and normalized
+} ec_curve_t;
+
+/** @brief An isogeny of degree a power of 2
+ *
+ * @typedef ec_isog_even_t
+ *
+ * @struct ec_isog_even_t
+ */
+typedef struct ec_isog_even_t
+{
+    ec_curve_t curve;  ///< The domain curve
+    ec_point_t kernel; ///< A kernel generator
+    unsigned length;   ///< The length as a 2-isogeny walk
+} ec_isog_even_t;
+
+/** @brief Isomorphism of Montgomery curves
+ *
+ * @typedef ec_isom_t
+ *
+ * @struct ec_isom_t
+ *
+ * The isomorphism is given by the map (X:Z) ↦ ( (Nx X + Nz Z) : (D Z) )
+ */
+typedef struct ec_isom_t
+{
+    fp2_t Nx; ///< coefficient of X in the numerator Nx X + Nz Z
+    fp2_t Nz; ///< coefficient of Z in the numerator Nx X + Nz Z
+    fp2_t D;  ///< coefficient of Z in the denominator D Z
+} ec_isom_t;
+
+// end ec_t
+/** @}
+ */
+
+/** @defgroup ec_curve_t Curves and isomorphisms
+ * @{
+ */
+
+// Initialisation for curves and points
+void ec_curve_init(ec_curve_t *E);
+void ec_point_init(ec_point_t *P);
+
+/**
+ * @brief Verify that a Montgomery coefficient is valid
+ *
+ * @param A an fp2_t
+ *
+ * @return 0  if curve is invalid, 1 otherwise
+ */
+int ec_curve_verify_A(const fp2_t *A);
+
+/**
+ * @brief Initialize an elliptic curve from a coefficient
+ *
+ * @param A an fp2_t
+ * @param E the elliptic curve to initialize
+ *
+ * @return 0  if curve is invalid, 1 otherwise
+ */
+int ec_curve_init_from_A(ec_curve_t *E, const fp2_t *A);
+
+// Copying points, bases and curves
+static inline void
+copy_point(ec_point_t *P, const ec_point_t *Q)
+{ // P <- Q, one coordinate at a time
+    fp2_copy(&(P->z), &(Q->z));
+    fp2_copy(&(P->x), &(Q->x));
+}
+
+static inline void
+copy_basis(ec_basis_t *B1, const ec_basis_t *B0)
+{ // B1 <- B0: all three stored points (P, Q and PmQ) are copied
+    copy_point(&(B1->PmQ), &(B0->PmQ));
+    copy_point(&(B1->Q), &(B0->Q));
+    copy_point(&(B1->P), &(B0->P));
+}
+
+static inline void
+copy_curve(ec_curve_t *E1, const ec_curve_t *E2)
+{ // E1 <- E2: coefficients, cached A24 point and its status flag
+    copy_point(&(E1->A24), &(E2->A24));
+    E1->is_A24_computed_and_normalized = E2->is_A24_computed_and_normalized;
+    fp2_copy(&E1->C, &E2->C);
+    fp2_copy(&E1->A, &E2->A);
+}
+
+// Functions for working with the A24 point and normalisation
+
+/**
+ * @brief Reduce (A : C) to (A/C : 1) in place
+ *
+ * @param E a curve
+ */
+void ec_normalize_curve(ec_curve_t *E);
+
+/**
+ * @brief Reduce (A + 2C : 4C) to ((A+2C)/4C : 1) in place
+ *
+ * @param E a curve
+ */
+void ec_curve_normalize_A24(ec_curve_t *E);
+
+/**
+ * @brief Normalise both (A : C) and (A + 2C : 4C) as above, in place
+ *
+ * @param E a curve
+ */
+void ec_normalize_curve_and_A24(ec_curve_t *E);
+
+/**
+ * @brief Given a curve E, compute (A+2C : 4C)
+ *
+ * @param A24 the value (A+2C : 4C) to return into
+ * @param E a curve
+ */
+static inline void
+AC_to_A24(ec_point_t *A24, const ec_curve_t *E)
+{ // Write (A+2C : 4C) into A24.
+  // When the curve is flagged as having A24 computed and normalized, the stored point is copied instead.
+    if (!E->is_A24_computed_and_normalized) {
+        // Build it from (A : C): z = 2C, x = A + 2C, then z = 4C
+        fp2_add(&(A24->z), &(E->C), &(E->C));
+        fp2_add(&(A24->x), &(E->A), &(A24->z));
+        fp2_add(&(A24->z), &(A24->z), &(A24->z));
+    } else {
+        // Maybe we already have this computed
+        copy_point(A24, &(E->A24));
+    }
+}
+
+/**
+ * @brief Given the point (A+2C : 4C), compute the curve coefficients (A : C)
+ *
+ * @param E a curve to compute
+ * @param A24 the value (A+2 : 4C)
+ */
+static inline void
+A24_to_AC(ec_curve_t *E, const ec_point_t *A24)
+{
+    // With A24 = (A+2C : 4C): 2*(2*(A+2C) - 4C) = 4A, so (A:C) is produced as (4A : 4C); E->A24 and its flag are not touched
+    fp2_add(&E->A, &A24->x, &A24->x); // 2*A24.x
+    fp2_sub(&E->A, &E->A, &A24->z);   // 2*A24.x - A24.z
+    fp2_add(&E->A, &E->A, &E->A);     // 4*A24.x - 2*A24.z
+    fp2_copy(&E->C, &A24->z);
+}
+
+/**
+ * @brief j-invariant.
+ *
+ * @param j_inv computed j_invariant
+ * @param curve input curve
+ */
+void ec_j_inv(fp2_t *j_inv, const ec_curve_t *curve);
+
+/**
+ * @brief Isomorphism of elliptic curve
+ * Takes as input two isomorphic Kummer lines in Montgomery form, and output an isomorphism between
+ * them
+ *
+ * @param isom computed isomorphism
+ * @param from domain curve
+ * @param to image curve
+ * @return 0xFFFFFFFF if there was an error during the computation, zero otherwise
+ */
+uint32_t ec_isomorphism(ec_isom_t *isom, const ec_curve_t *from, const ec_curve_t *to);
+
+/**
+ * @brief In-place evaluation of an isomorphism
+ *
+ * @param P a point
+ * @param isom an isomorphism
+ */
+void ec_iso_eval(ec_point_t *P, ec_isom_t *isom);
+
+/** @}
+ */
+/** @defgroup ec_point_t Point operations
+ * @{
+ */
+
+/**
+ * @brief Point equality
+ *
+ * @param P a point
+ * @param Q a point
+ * @return 0xFFFFFFFF if equal, zero otherwise
+ */
+uint32_t ec_is_equal(const ec_point_t *P, const ec_point_t *Q);
+
+/**
+ * @brief Test whether a point is the point at infinity
+ *
+ * @param P a point
+ * @return 0xFFFFFFFF if point at infinity, zero otherwise
+ */
+uint32_t ec_is_zero(const ec_point_t *P);
+
+/**
+ * @brief Two torsion test
+ *
+ * @param P a point
+ * @param E the elliptic curve
+ * @return 0xFFFFFFFF if P is 2-torsion but not zero, zero otherwise
+ */
+uint32_t ec_is_two_torsion(const ec_point_t *P, const ec_curve_t *E);
+
+/**
+ * @brief Four torsion test
+ *
+ * @param P a point
+ * @param E the elliptic curve
+ * @return 0xFFFFFFFF if [2]P is 2-torsion but not zero (i.e. P has order exactly 4), zero otherwise
+ */
+uint32_t ec_is_four_torsion(const ec_point_t *P, const ec_curve_t *E);
+
+/**
+ * @brief Reduce Z-coordinate of point in place
+ *
+ * @param P a point
+ */
+void ec_normalize_point(ec_point_t *P);
+
+void xDBL_E0(ec_point_t *Q, const ec_point_t *P);
+void xADD(ec_point_t *R, const ec_point_t *P, const ec_point_t *Q, const ec_point_t *PQ);
+void xDBL_A24(ec_point_t *Q, const ec_point_t *P, const ec_point_t *A24, const bool A24_normalized);
+
+/**
+ * @brief Point doubling
+ *
+ * @param res computed double of P
+ * @param P a point
+ * @param curve an elliptic curve
+ */
+void ec_dbl(ec_point_t *res, const ec_point_t *P, const ec_curve_t *curve);
+
+/**
+ * @brief Point iterated doubling
+ *
+ * @param res computed [2^n]P, the n-fold double of P
+ * @param P a point
+ * @param n the number of double
+ * @param curve the curve on which P lays
+ */
+void ec_dbl_iter(ec_point_t *res, int n, const ec_point_t *P, ec_curve_t *curve);
+
+/**
+ * @brief Iterated doubling for a basis P, Q, PmQ
+ *
+ * @param res the computed iterated double of basis B
+ * @param n the number of doubles
+ * @param B the basis to double
+ * @param curve the parent curve of the basis
+ */
+void ec_dbl_iter_basis(ec_basis_t *res, int n, const ec_basis_t *B, ec_curve_t *curve);
+
+/**
+ * @brief Point multiplication
+ *
+ * @param res computed scalar * P
+ * @param curve the curve
+ * @param scalar an unsigned multi-precision integer
+ * @param P a point
+ * @param kbits number of bits of the scalar
+ */
+void ec_mul(ec_point_t *res, const digit_t *scalar, const int kbits, const ec_point_t *P, ec_curve_t *curve);
+
+/**
+ * @brief Combination P+m*Q
+ *
+ * @param R computed P + m * Q
+ * @param curve the curve
+ * @param m an unsigned multi-precision integer
+ * @param P a point
+ * @param Q a point
+ * @param PQ the difference P-Q
+ * @return 0 if there was an error, 1 otherwise
+ */
+int ec_ladder3pt(ec_point_t *R,
+                 const digit_t *m,
+                 const ec_point_t *P,
+                 const ec_point_t *Q,
+                 const ec_point_t *PQ,
+                 const ec_curve_t *curve);
+
+/**
+ * @brief Linear combination of points of a basis
+ *
+ * @param res computed scalarP * P + scalarQ * Q
+ * @param scalarP an unsigned multi-precision integer
+ * @param scalarQ an unsigned multi-precision integer
+ * @param kbits number of bits of the scalars, or n for points of order 2^n
+ * @param PQ a torsion basis consisting of points P and Q
+ * @param curve the curve
+ *
+ * @return 0 if there was an error, 1 otherwise
+ */
+int ec_biscalar_mul(ec_point_t *res,
+                    const digit_t *scalarP,
+                    const digit_t *scalarQ,
+                    const int kbits,
+                    const ec_basis_t *PQ,
+                    const ec_curve_t *curve);
+
+// end point computations
+/**
+ * @}
+ */
+
+/** @defgroup ec_dlog_t Torsion basis computations
+ * @{
+ */
+
+/**
+ * @brief Generate a 2^f-torsion basis from a Montgomery curve along with a hint
+ *
+ * @param PQ2 an ec_basis_t
+ * @param curve an ec_curve_t
+ * @param f an integer
+ *
+ * @return A hint
+ *
+ * The algorithm is deterministic
+ */
+uint8_t ec_curve_to_basis_2f_to_hint(ec_basis_t *PQ2, ec_curve_t *curve, int f);
+
+/**
+ * @brief Generate a 2^f-torsion basis from a Montgomery curve and a given hint
+ *
+ * @param PQ2 an ec_basis_t
+ * @param curve an ec_curve_t
+ * @param f an integer
+ * @param hint the hint
+ *
+ * @return 1 if the basis is valid, 0 otherwise
+ *
+ * The algorithm is deterministic
+ */
+int ec_curve_to_basis_2f_from_hint(ec_basis_t *PQ2, ec_curve_t *curve, int f, const uint8_t hint);
+/** // end basis computations
+ * @}
+ */
+
+/** @defgroup ec_isog_t Isogenies
+ * @{
+ */
+
+/**
+ * @brief Evaluate isogeny of even degree on list of points.
+ * Returns 0 if successful and 0xFFFFFFFF if kernel has the wrong order or includes (0:1).
+ *
+ * @param image computed image curve
+ * @param phi isogeny
+ * @param points a list of points to evaluate the isogeny on, modified in place
+ * @param len_points length of the list points
+ *
+ * @return 0 if there was no error, 0xFFFFFFFF otherwise
+ */
+uint32_t ec_eval_even(ec_curve_t *image, ec_isog_even_t *phi, ec_point_t *points, unsigned len_points);
+
+/**
+ * @brief Multiplicative strategy for a short isogeny chain. Returns 0 if successful and 0xFFFFFFFF
+ * if kernel has the wrong order or includes (0:1) when special=false.
+ *
+ * @param curve domain curve, to be overwritten by the codomain curve.
+ * @param kernel a kernel generator of order 2^len
+ * @param len the length of the 2-isogeny chain
+ * @param points a list of points to evaluate the isogeny on, modified in place
+ * @param len_points length of the list points
+ * @param special if true, allow isogenies with (0:1) in the kernel
+ *
+ * @return 0 if there was no error, 0xFFFFFFFF otherwise
+ */
+uint32_t ec_eval_small_chain(ec_curve_t *curve,
+                             const ec_point_t *kernel,
+                             int len,
+                             ec_point_t *points,
+                             unsigned len_points,
+                             bool special);
+
+/**
+ * @brief Recover Y-coordinate from X-coordinate and curve coefficients.
+ *
+ * @param y: a y-coordinate
+ * @param Px: a x-coordinate
+ * @param curve: the elliptic curve
+ *
+ * @return 0xFFFFFFFF if the point was on the curve, 0 otherwise
+ */
+uint32_t ec_recover_y(fp2_t *y, const fp2_t *Px, const ec_curve_t *curve);
+
+// Jacobian point init and copying
+void jac_init(jac_point_t *P);
+void copy_jac_point(jac_point_t *P, const jac_point_t *Q);
+
+/**
+ * @brief Test if two Jacobian points are equal
+ *
+ * @param P: a point
+ * @param Q: a point
+ *
+ * @return 0xFFFFFFFF if they are equal, 0 otherwise
+ */
+uint32_t jac_is_equal(const jac_point_t *P, const jac_point_t *Q);
+
+// Convert from Jacobian to x-only (just drop the Y-coordinate)
+void jac_to_xz(ec_point_t *P, const jac_point_t *xyP);
+// Convert from Jacobian coordinates in Montgomery model to Weierstrass
+void jac_to_ws(jac_point_t *P, fp2_t *t, fp2_t *ao3, const jac_point_t *Q, const ec_curve_t *curve);
+void jac_from_ws(jac_point_t *Q, const jac_point_t *P, const fp2_t *ao3, const ec_curve_t *curve);
+
+// Jacobian arithmetic
+void jac_neg(jac_point_t *Q, const jac_point_t *P);
+void ADD(jac_point_t *R, const jac_point_t *P, const jac_point_t *Q, const ec_curve_t *AC);
+void DBL(jac_point_t *Q, const jac_point_t *P, const ec_curve_t *AC);
+void DBLW(jac_point_t *Q, fp2_t *u, const jac_point_t *P, const fp2_t *t);
+void jac_to_xz_add_components(add_components_t *uvw, const jac_point_t *P, const jac_point_t *Q, const ec_curve_t *AC);
+
+/**
+ * @brief Given a basis in x-only, lift to a pair of Jacobian points
+ *
+ * @param P: a point
+ * @param Q: a point
+ * @param B: a basis
+ * @param E: an elliptic curve
+ *
+ * @return 0xFFFFFFFF if there was no error, 0 otherwise
+ *
+ *
+ * Lifts a basis x(P), x(Q), x(P-Q) assuming the curve has (A/C : 1) and
+ * the point P = (X/Z : 1). For generic implementation see lift_basis()
+ */
+uint32_t lift_basis_normalized(jac_point_t *P, jac_point_t *Q, ec_basis_t *B, ec_curve_t *E);
+
+/**
+ * @brief Given a basis in x-only, lift to a pair of Jacobian points
+ *
+ * @param P: a point
+ * @param Q: a point
+ * @param B: a basis
+ * @param E: an elliptic curve
+ *
+ * @return 0xFFFFFFFF if there was no error, 0 otherwise
+ */
+uint32_t lift_basis(jac_point_t *P, jac_point_t *Q, ec_basis_t *B, ec_curve_t *E);
+
+/**
+ * @brief Check if basis points (P, Q) form a full 4-basis
+ *
+ * @param B: a basis
+ * @param E: an elliptic curve
+ *
+ * @return 0xFFFFFFFF if they form a basis, 0 otherwise
+ */
+uint32_t ec_is_basis_four_torsion(const ec_basis_t *B, const ec_curve_t *E);
+
+/*
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Test functions for printing and order checking, only used in debug mode
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ */
+
+/**
+ * @brief Test whether an x-only point (X : Z) has order exactly 2^t
+ *
+ * @param P: the point to test
+ * @param E: the curve P lies on (copied locally, never modified)
+ * @param t: exponent of the expected order
+ *
+ * @return nonzero (the ec_is_zero mask converted to int) if the order is 2^t, 0 otherwise
+ */
+static int
+test_point_order_twof(const ec_point_t *P, const ec_curve_t *E, int t)
+{
+    ec_point_t R = *P;
+    ec_curve_t work;
+
+    // ec_dbl_iter takes a non-const curve, so operate on a private copy
+    copy_curve(&work, E);
+    if (ec_is_zero(&R))
+        return 0;
+    // R <- [2^(t-1)] P
+    ec_dbl_iter(&R, t - 1, &R, &work);
+    // Already zero here means the order divides 2^(t-1)
+    if (ec_is_zero(&R))
+        return 0;
+    // One more doubling must reach the point at infinity: [2^t] P = 0
+    ec_dbl(&R, &R, &work);
+    return ec_is_zero(&R);
+}
+
+/**
+ * @brief Test whether P, Q and P-Q of a basis all have order exactly 2^t
+ *
+ * @param B: the basis (P, Q, PmQ)
+ * @param E: the curve the basis lies on
+ * @param t: exponent of the expected order
+ *
+ * @return nonzero if all three points pass test_point_order_twof, 0 otherwise
+ */
+static int
+test_basis_order_twof(const ec_basis_t *B, const ec_curve_t *E, int t)
+{
+    // Evaluate every point (no short-circuit), then AND the three results together
+    int ok = test_point_order_twof(&B->P, E, t);
+    ok &= test_point_order_twof(&B->Q, E, t);
+    ok &= test_point_order_twof(&B->PmQ, E, t);
+    return ok;
+}
+
+/**
+ * @brief Test whether a Jacobian point (X : Y : Z) has order exactly 2^t
+ *
+ * @param P: the point to test
+ * @param E: the curve P lies on
+ * @param t: exponent of the expected order
+ *
+ * @return nonzero (the fp2_is_zero mask converted to int) if the order is 2^t, 0 otherwise
+ */
+static int
+test_jac_order_twof(const jac_point_t *P, const ec_curve_t *E, int t)
+{
+    jac_point_t R = *P;
+    // Z = 0 is the point at infinity: rejected outright
+    if (fp2_is_zero(&R.z))
+        return 0;
+    // R <- [2^(t-1)] P; it must still be a finite point afterwards
+    for (int i = 1; i < t; i++)
+        DBL(&R, &R, E);
+    if (fp2_is_zero(&R.z))
+        return 0;
+    DBL(&R, &R, E);
+    return (fp2_is_zero(&R.z));
+}
+
+// Prints the affine x-coordinate X/Z of P under the label `name`, or "INF" when Z = 0
+static void
+ec_point_print(const char *name, ec_point_t P)
+{
+    if (fp2_is_zero(&P.z)) {
+        printf("%s = INF\n", name);
+        return;
+    }
+    fp2_t x_affine;
+    fp2_copy(&x_affine, &P.z);
+    fp2_inv(&x_affine);                  // x_affine = 1/Z
+    fp2_mul(&x_affine, &x_affine, &P.x); // x_affine = X/Z
+    fp2_print(name, &x_affine);
+}
+
+// Prints the affine Montgomery coefficient A/C of E under the label `name`
+static void
+ec_curve_print(const char *name, ec_curve_t E)
+{
+    // No guard against C = 0: the inversion is applied to E.C as given
+    fp2_t coeff = E.C;
+    fp2_inv(&coeff);               // coeff = 1/C
+    fp2_mul(&coeff, &coeff, &E.A); // coeff = A/C
+    fp2_print(name, &coeff);
+}
+
+#endif
+// end isogeny computations
+/**
+ * @}
+ */
+
+// end ec
+/**
+ * @}
+ */
diff --git a/src/pqm4/sqisign_lvl1/ref/ec_jac.c b/src/pqm4/sqisign_lvl1/ref/ec_jac.c
new file mode 100644
index 0000000..20ca68c
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/ec_jac.c
@@ -0,0 +1,335 @@
+#include <assert.h>
+#include <ec.h>
+
+void
+jac_init(jac_point_t *P)
+{ // Set P to the identity element (0:1:0) of the Montgomery curve in Jacobian coordinates
+    fp2_set_zero(&P->z);
+    fp2_set_one(&P->y);
+    fp2_set_zero(&P->x);
+}
+
+uint32_t
+jac_is_equal(const jac_point_t *P, const jac_point_t *Q)
+{ // Compare two Jacobian points (X:Y:Z) by cross-multiplication, so no inversion is needed
+  // Returns 0xFFFFFFFF (true) if P=Q, 0 (false) otherwise
+    fp2_t zq, zp, dx, dy, rhs;
+
+    fp2_sqr(&zq, &Q->z);       // z2^2
+    fp2_sqr(&zp, &P->z);       // z1^2
+    fp2_mul(&dx, &P->x, &zq);  // x1*z2^2
+    fp2_mul(&rhs, &Q->x, &zp); // x2*z1^2
+    fp2_sub(&dx, &dx, &rhs);   // dx = x1*z2^2 - x2*z1^2
+
+    fp2_mul(&zq, &zq, &Q->z);  // z2^3
+    fp2_mul(&zp, &zp, &P->z);  // z1^3
+    fp2_mul(&dy, &P->y, &zq);  // y1*z2^3
+    fp2_mul(&rhs, &Q->y, &zp); // y2*z1^3
+    fp2_sub(&dy, &dy, &rhs);   // dy = y1*z2^3 - y2*z1^3
+
+    return fp2_is_zero(&dx) & fp2_is_zero(&dy);
+}
+
+void
+jac_to_xz(ec_point_t *P, const jac_point_t *xyP)
+{ // Drop Y: the Jacobian point (X:Y:Z) has x = X/Z^2, which in x-only coordinates
+  // is the projective pair (X : Z^2)
+    fp2_t one;
+    uint32_t is_degenerate;
+
+    fp2_set_one(&one);
+    fp2_copy(&P->x, &xyP->x);
+    fp2_sqr(&P->z, &xyP->z);
+
+    // The identity (0:1:0) would come out as (0 : 0); replace it by (1 : 0),
+    // using fp2_select rather than a branch
+    is_degenerate = fp2_is_zero(&P->x) & fp2_is_zero(&P->z);
+    fp2_select(&P->x, &P->x, &one, is_degenerate);
+}
+
+void
+jac_to_ws(jac_point_t *Q, fp2_t *t, fp2_t *ao3, const jac_point_t *P, const ec_curve_t *curve)
+{ // Montgomery (X:Y:Z) -> Weierstrass (U:V:W) plus T = a*Z^4 in *t. Cost 3M + 2S when A != 0.
+  // a = 1 - A^2/3, U = X + (A*Z^2)/3, V = Y, W = Z; *ao3 receives A/3 for jac_from_ws.
+    fp_t one;
+    fp2_t a;
+    fp_set_one(&one);
+    if (!fp2_is_zero(&(curve->A))) {
+        fp_div3(&(ao3->re), &(curve->A.re));
+        fp_div3(&(ao3->im), &(curve->A.im));
+        fp2_sqr(t, &P->z);
+        fp2_mul(&a, ao3, t);       // kept in a temporary so that Q may alias P
+        fp2_add(&Q->x, &P->x, &a); // U = X + (A/3)*Z^2
+        fp2_sqr(t, t);
+        fp2_mul(&a, ao3, &(curve->A));
+        fp_sub(&(a.re), &one, &(a.re));
+        fp_neg(&(a.im), &(a.im));  // a = 1 - A^2/3
+        fp2_mul(t, t, &a);
+    } else {
+        fp2_set_zero(ao3); // A/3 = 0, so that *ao3 is defined on this path as well
+        fp2_copy(&Q->x, &P->x);
+        fp2_sqr(t, &P->z);
+        fp2_sqr(t, t); // a = 1, hence T = Z^4
+    }
+    fp2_copy(&Q->y, &P->y);
+    fp2_copy(&Q->z, &P->z);
+}
+
+void
+jac_from_ws(jac_point_t *Q, const jac_point_t *P, const fp2_t *ao3, const ec_curve_t *curve)
+{ // Weierstrass (U:V:W) -> Montgomery: X = U - (A*W^2)/3, Y = V, Z = W; 1M + 1S when A != 0
+    fp2_t t; // ao3 is A/3 as produced by jac_to_ws; it is only read when A != 0
+    if (!fp2_is_zero(&(curve->A))) {
+        fp2_sqr(&t, &P->z);
+        fp2_mul(&t, &t, ao3);
+        fp2_sub(&Q->x, &P->x, &t);
+    } else {
+        fp2_copy(&Q->x, &P->x); // A = 0: X = U, must still be written when Q != P
+    }
+    fp2_copy(&Q->y, &P->y);
+    fp2_copy(&Q->z, &P->z);
+}
+
+void
+copy_jac_point(jac_point_t *P, const jac_point_t *Q)
+{ // P <- Q, coordinate by coordinate (destination is the first argument)
+    fp2_copy(&P->z, &Q->z);
+    fp2_copy(&P->y, &Q->y);
+    fp2_copy(&P->x, &Q->x);
+}
+
+void
+jac_neg(jac_point_t *Q, const jac_point_t *P)
+{ // Q <- -P: only the Y coordinate changes sign, X and Z are copied
+    fp2_neg(&Q->y, &P->y);
+    fp2_copy(&Q->z, &P->z);
+    fp2_copy(&Q->x, &P->x);
+}
+
+void
+DBL(jac_point_t *Q, const jac_point_t *P, const ec_curve_t *AC)
+{ // Cost of 6M + 6S. Doubling on a Montgomery curve in Jacobian coordinates (X:Y:Z), which
+  // correspond to (X/Z^2,Y/Z^3). Only AC->A is read, as the coefficient value A.
+  // If P has X = Z = 0 (e.g. the identity (0:1:0)), X and Z of the result are taken from P.
+    fp2_t t0, t1, t2, t3;
+
+    uint32_t flag = fp2_is_zero(&P->x) & fp2_is_zero(&P->z);
+
+    fp2_sqr(&t0, &P->x); // t0 = x1^2
+    fp2_add(&t1, &t0, &t0);
+    fp2_add(&t0, &t0, &t1); // t0 = 3x1^2
+    fp2_sqr(&t1, &P->z);    // t1 = z1^2
+    fp2_mul(&t2, &P->x, &AC->A);
+    fp2_add(&t2, &t2, &t2); // t2 = 2Ax1
+    fp2_add(&t2, &t1, &t2); // t2 = 2Ax1+z1^2
+    fp2_mul(&t2, &t1, &t2); // t2 = z1^2(2Ax1+z1^2)
+    fp2_add(&t2, &t0, &t2); // t2 = alpha = 3x1^2 + z1^2(2Ax1+z1^2)
+    fp2_mul(&Q->z, &P->y, &P->z);
+    fp2_add(&Q->z, &Q->z, &Q->z); // z2 = 2y1z1
+    fp2_sqr(&t0, &Q->z);
+    fp2_mul(&t0, &t0, &AC->A); // t0 = 4Ay1^2z1^2
+    fp2_sqr(&t1, &P->y);
+    fp2_add(&t1, &t1, &t1);     // t1 = 2y1^2
+    fp2_add(&t3, &P->x, &P->x); // t3 = 2x1
+    fp2_mul(&t3, &t1, &t3);     // t3 = 4x1y1^2
+    fp2_sqr(&Q->x, &t2);        // x2 = alpha^2
+    fp2_sub(&Q->x, &Q->x, &t0); // x2 = alpha^2 - 4Ay1^2z1^2
+    fp2_sub(&Q->x, &Q->x, &t3);
+    fp2_sub(&Q->x, &Q->x, &t3); // x2 = alpha^2 - 4Ay1^2z1^2 - 8x1y1^2
+    fp2_sub(&Q->y, &t3, &Q->x); // y2 = 4x1y1^2 - x2
+    fp2_mul(&Q->y, &Q->y, &t2); // y2 = alpha(4x1y1^2 - x2)
+    fp2_sqr(&t1, &t1);          // t1 = 4y1^4
+    fp2_sub(&Q->y, &Q->y, &t1);
+    fp2_sub(&Q->y, &Q->y, &t1); // y2 = alpha(4x1y1^2 - x2) - 8y1^4
+    // flag is a full 0x00000000/0xFFFFFFFF mask: pass it unchanged (-flag would be 0 or 1)
+    fp2_select(&Q->x, &Q->x, &P->x, flag);
+    fp2_select(&Q->z, &Q->z, &P->z, flag);
+}
+
+void
+DBLW(jac_point_t *Q, fp2_t *u, const jac_point_t *P, const fp2_t *t)
+{ // Cost of 3M + 5S. Doubling on a Weierstrass curve in modified Jacobian coordinates
+  // (X:Y:Z:T=a*Z^4) corresponding to (X/Z^2,Y/Z^3), where a is the curve coefficient.
+  // Inputs: P and *t = T1; outputs: Q = 2P and *u = T3.
+  // Formula from https://hyperelliptic.org/EFD/g1p/auto-shortw-modified.html
+
+    uint32_t flag = fp2_is_zero(&P->x) & fp2_is_zero(&P->z);
+
+    fp2_t xx, c, cc, r, s, m;
+    // XX = X^2
+    fp2_sqr(&xx, &P->x);
+    // A = 2*Y^2
+    fp2_sqr(&c, &P->y);
+    fp2_add(&c, &c, &c);
+    // AA = A^2
+    fp2_sqr(&cc, &c);
+    // R = 2*AA
+    fp2_add(&r, &cc, &cc);
+    // S = (X+A)^2-XX-AA
+    fp2_add(&s, &P->x, &c);
+    fp2_sqr(&s, &s);
+    fp2_sub(&s, &s, &xx);
+    fp2_sub(&s, &s, &cc);
+    // M = 3*XX+T1
+    fp2_add(&m, &xx, &xx);
+    fp2_add(&m, &m, &xx);
+    fp2_add(&m, &m, t);
+    // X3 = M^2-2*S
+    fp2_sqr(&Q->x, &m);
+    fp2_sub(&Q->x, &Q->x, &s);
+    fp2_sub(&Q->x, &Q->x, &s);
+    // Z3 = 2*Y*Z
+    fp2_mul(&Q->z, &P->y, &P->z);
+    fp2_add(&Q->z, &Q->z, &Q->z);
+    // Y3 = M*(S-X3)-R
+    fp2_sub(&Q->y, &s, &Q->x);
+    fp2_mul(&Q->y, &Q->y, &m);
+    fp2_sub(&Q->y, &Q->y, &r);
+    // T3 = 2*R*T1
+    fp2_mul(u, t, &r);
+    fp2_add(u, u, u);
+    // flag is a full 0x00000000/0xFFFFFFFF mask: pass it unchanged (-flag would be 0 or 1)
+    fp2_select(&Q->x, &Q->x, &P->x, flag);
+    fp2_select(&Q->z, &Q->z, &P->z, flag);
+}
+
+void
+select_jac_point(jac_point_t *Q, const jac_point_t *P1, const jac_point_t *P2, const digit_t option)
+{ // Choice between two points, applied to each coordinate with fp2_select:
+  // Q <- P1 when option = 0, Q <- P2 when option = 0xFF...FF
+    fp2_select(&Q->z, &P1->z, &P2->z, option);
+    fp2_select(&Q->y, &P1->y, &P2->y, option);
+    fp2_select(&Q->x, &P1->x, &P2->x, option);
+}
+
+void
+ADD(jac_point_t *R, const jac_point_t *P, const jac_point_t *Q, const ec_curve_t *AC)
+{
+    // Addition on a Montgomery curve, representation in Jacobian coordinates (X:Y:Z) corresponding
+    // to (x,y) = (X/Z^2,Y/Z^3). Only AC->A is read, as the coefficient value A.
+    //
+    // Complete routine, to handle all edge cases:
+    //   if ZP == 0:            # P == inf
+    //       return Q
+    //   if ZQ == 0:            # Q == inf
+    //       return P
+    //   dy <- YQ*ZP**3 - YP*ZQ**3
+    //   dx <- XQ*ZP**2 - XP*ZQ**2
+    //   if dx == 0:             # x1 == x2
+    //       if dy == 0:         # ... and y1 == y2: doubling case
+    //           dy <- ZQ * (3*XP^2 + ZP^2 * (2*A*XP + ZP^2))
+    //           dx <- 2*YP
+    //       else:              # ... but y1 != y2, thus P = -Q
+    //           return inf
+    //   XR <- dy**2 - dx**2 * (A*ZP^2*ZQ^2 + XP*ZQ^2 + XQ*ZP^2)
+    //   YR <- dy * (XP*ZQ^2 * dx^2 - XR) - YP*ZQ^3 * dx^3
+    //   ZR <- dx * ZP * ZQ
+
+    // Constant time processing:
+    // - The case for P == 0 or Q == 0 is handled at the end with conditional select
+    // - dy and dx are computed for both the normal and doubling cases, we switch when
+    //   dx == dy == 0 for the normal case.
+    // - If we have that P = -Q then dx = 0 and so ZR will be zero, giving us the point
+    //   at infinity for "free".
+    //
+    // NOTE(review): P and Q are read again by the final selects after R has been written, so R
+    // presumably must not alias an input that can be combined with the identity - confirm.
+    // Cost 17M + 6S + 13a
+    fp2_t t0, t1, t2, t3, u1, u2, v1, dx, dy;
+
+    /* If P is zero or Q is zero we will conditionally select the other point before returning. */
+    uint32_t ctl1 = fp2_is_zero(&P->z);
+    uint32_t ctl2 = fp2_is_zero(&Q->z);
+
+    /* Precompute some values */
+    fp2_sqr(&t0, &P->z); // t0 = z1^2
+    fp2_sqr(&t1, &Q->z); // t1 = z2^2
+
+    /* Compute dy and dx for ordinary case */
+    fp2_mul(&v1, &t1, &Q->z); // v1 = z2^3
+    fp2_mul(&t2, &t0, &P->z); // t2 = z1^3
+    fp2_mul(&v1, &v1, &P->y); // v1 = y1z2^3
+    fp2_mul(&t2, &t2, &Q->y); // t2 = y2z1^3
+    fp2_sub(&dy, &t2, &v1);   // dy = y2z1^3 - y1z2^3
+    fp2_mul(&u2, &t0, &Q->x); // u2 = x2z1^2
+    fp2_mul(&u1, &t1, &P->x); // u1 = x1z2^2
+    fp2_sub(&dx, &u2, &u1);   // dx = x2z1^2 - x1z2^2
+
+    /* Compute dy and dx for doubling case */
+    fp2_add(&t1, &P->y, &P->y);   // dx_dbl = t1 = 2y1
+    fp2_add(&t2, &AC->A, &AC->A); // t2 = 2A
+    fp2_mul(&t2, &t2, &P->x);     // t2 = 2Ax1
+    fp2_add(&t2, &t2, &t0);       // t2 = 2Ax1 + z1^2
+    fp2_mul(&t2, &t2, &t0);       // t2 = z1^2 * (2Ax1 + z1^2)
+    fp2_sqr(&t0, &P->x);          // t0 = x1^2
+    fp2_add(&t2, &t2, &t0);       // t2 = x1^2 + z1^2 * (2Ax1 + z1^2)
+    fp2_add(&t2, &t2, &t0);       // t2 = 2*x1^2 + z1^2 * (2Ax1 + z1^2)
+    fp2_add(&t2, &t2, &t0);       // t2 = 3*x1^2 + z1^2 * (2Ax1 + z1^2)
+    fp2_mul(&t2, &t2, &Q->z);     // dy_dbl = t2 = z2 * (3*x1^2 + z1^2 * (2Ax1 + z1^2))
+
+    /* If dx is zero and dy is zero swap with double variables */
+    uint32_t ctl = fp2_is_zero(&dx) & fp2_is_zero(&dy);
+    fp2_select(&dx, &dx, &t1, ctl);
+    fp2_select(&dy, &dy, &t2, ctl);
+
+    /* Some more precomputations */
+    fp2_mul(&t0, &P->z, &Q->z); // t0 = z1z2
+    fp2_sqr(&t1, &t0);          // t1 = (z1z2)^2
+    fp2_sqr(&t2, &dx);          // t2 = dx^2
+    fp2_sqr(&t3, &dy);          // t3 = dy^2
+
+    /* Compute x3 = dy**2 - dx**2 * (A*ZP^2*ZQ^2 + XP*ZQ^2 + XQ*ZP^2) */
+    fp2_mul(&R->x, &AC->A, &t1); // x3 = A*(z1z2)^2
+    fp2_add(&R->x, &R->x, &u1);  // x3 = A*(z1z2)^2 + u1
+    fp2_add(&R->x, &R->x, &u2);  // x3 = A*(z1z2)^2 + u1 + u2
+    fp2_mul(&R->x, &R->x, &t2);  // x3 = dx^2 * (A*(z1z2)^2 + u1 + u2)
+    fp2_sub(&R->x, &t3, &R->x);  // x3 = dy^2 - dx^2 * (A*(z1z2)^2 + u1 + u2)
+
+    /* Compute y3 = dy * (XP*ZQ^2 * dx^2 - XR) - YP*ZQ^3 * dx^3*/
+    fp2_mul(&R->y, &u1, &t2);     // y3 = u1 * dx^2
+    fp2_sub(&R->y, &R->y, &R->x); // y3 = u1 * dx^2 - x3
+    fp2_mul(&R->y, &R->y, &dy);   // y3 = dy * (u1 * dx^2 - x3)
+    fp2_mul(&t3, &t2, &dx);       // t3 = dx^3
+    fp2_mul(&t3, &t3, &v1);       // t3 = v1 * dx^3
+    fp2_sub(&R->y, &R->y, &t3);   // y3 = dy * (u1 * dx^2 - x3) - v1 * dx^3
+
+    /* Compute z3 = dx * z1 * z2 */
+    fp2_mul(&R->z, &dx, &t0);
+
+    /* Finally, we need to set R = P if Q.Z = 0 and R = Q if P.Z = 0 */
+    select_jac_point(R, R, Q, ctl1);
+    select_jac_point(R, R, P, ctl2);
+}
+
+void
+jac_to_xz_add_components(add_components_t *add_comp, const jac_point_t *P, const jac_point_t *Q, const ec_curve_t *AC)
+{
+    // Given two distinct Jacobian points P and Q on E, write three components u, v and w in Fp2
+    // to add_comp such that the xz coordinates of P+Q are (u-v:w) and of P-Q are (u+v:w)
+
+    fp2_t t0, t1, t2, t3, t4, t5, t6;
+
+    fp2_sqr(&t0, &P->z);             // t0 = z1^2
+    fp2_sqr(&t1, &Q->z);             // t1 = z2^2
+    fp2_mul(&t2, &P->x, &t1);        // t2 = x1z2^2
+    fp2_mul(&t3, &t0, &Q->x);        // t3 = z1^2x2
+    fp2_mul(&t4, &P->y, &Q->z);      // t4 = y1z2
+    fp2_mul(&t4, &t4, &t1);          // t4 = y1z2^3
+    fp2_mul(&t5, &P->z, &Q->y);      // t5 = z1y2
+    fp2_mul(&t5, &t5, &t0);          // t5 = z1^3y2
+    fp2_mul(&t0, &t0, &t1);          // t0 = (z1z2)^2
+    fp2_mul(&t6, &t4, &t5);          // t6 = (z1z2)^3y1y2
+    fp2_add(&add_comp->v, &t6, &t6); // v  = 2(z1z2)^3y1y2
+    fp2_sqr(&t4, &t4);               // t4 = y1^2z2^6
+    fp2_sqr(&t5, &t5);               // t5 = z1^6y2^2
+    fp2_add(&t4, &t4, &t5);          // t4 = z1^6y2^2 + y1^2z2^6
+    fp2_add(&t5, &t2, &t3);          // t5 = x1z2^2 + z1^2x2
+    fp2_add(&t6, &t3, &t3);          // t6 = 2z1^2x2
+    fp2_sub(&t6, &t5, &t6);          // t6 = lambda = x1z2^2 - z1^2x2
+    fp2_sqr(&t6, &t6);               // t6 = lambda^2 = (x1z2^2 - z1^2x2)^2
+    fp2_mul(&t1, &AC->A, &t0);       // t1 = A*(z1z2)^2
+    fp2_add(&t1, &t5, &t1);          // t1 = gamma = A*(z1z2)^2 + x1z2^2 + z1^2x2
+    fp2_mul(&t1, &t1, &t6);          // t1 = gamma*lambda^2
+    fp2_sub(&add_comp->u, &t4, &t1); // u  = z1^6y2^2 + y1^2z2^6 - gamma*lambda^2
+    fp2_mul(&add_comp->w, &t6, &t0); // w  = (z1z2)^2(lambda)^2
+}
diff --git a/src/pqm4/sqisign_lvl1/ref/ec_params.c b/src/pqm4/sqisign_lvl1/ref/ec_params.c
new file mode 100644
index 0000000..5011f10
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/ec_params.c
@@ -0,0 +1,4 @@
+#include <ec_params.h>
+// p+1 divided by the power of 2
+const digit_t p_cofactor_for_2f[1] = {5};
+
diff --git a/src/pqm4/sqisign_lvl1/ref/ec_params.h b/src/pqm4/sqisign_lvl1/ref/ec_params.h
new file mode 100644
index 0000000..e02ac1d
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/ec_params.h
@@ -0,0 +1,12 @@
+#ifndef EC_PARAMS_H
+#define EC_PARAMS_H
+
+#include <fp.h>
+
+#define TORSION_EVEN_POWER 248
+
+// p+1 divided by the power of 2
+extern const digit_t p_cofactor_for_2f[1];
+#define P_COFACTOR_FOR_2F_BITLENGTH 3
+
+#endif
diff --git a/src/pqm4/sqisign_lvl1/ref/encode_verification.c b/src/pqm4/sqisign_lvl1/ref/encode_verification.c
new file mode 100644
index 0000000..fecdb9c
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/encode_verification.c
@@ -0,0 +1,220 @@
+#include <verification.h>
+#include <string.h>
+#include <tutil.h>
+#include <fp2.h>
+#include <encoded_sizes.h>
+#include <assert.h>
+
+typedef unsigned char byte_t;
+
+// digits
+
+// Write the first nbytes bytes of the digit array x to enc, least significant byte first.
+// nbytes need not be a multiple of sizeof(digit_t); a trailing partial digit is truncated
+// to its low-order bytes. enc may have any alignment.
+static void
+encode_digits(byte_t *enc, const digit_t *x, size_t nbytes)
+{
+#ifdef TARGET_BIG_ENDIAN
+    // Emit each digit byte-swapped; memcpy avoids a misaligned, aliasing store through enc
+    for (size_t off = 0; off < nbytes; off += sizeof(digit_t)) {
+        const digit_t ld = BSWAP_DIGIT(x[off / sizeof(digit_t)]);
+        const size_t left = nbytes - off;
+        memcpy(enc + off, &ld, left < sizeof(digit_t) ? left : sizeof(digit_t));
+    }
+#else
+    memcpy(enc, x, nbytes);
+#endif
+}
+
+static void
+decode_digits(digit_t *x, const byte_t *enc, size_t nbytes, size_t ndigits)
+{ // Inverse of encode_digits: load nbytes bytes into x and zero the rest of its ndigits digits
+    const size_t capacity = ndigits * sizeof(digit_t);
+    assert(nbytes <= capacity);
+    memcpy((byte_t *)x, enc, nbytes);
+    memset((byte_t *)x + nbytes, 0, capacity - nbytes);
+#ifdef TARGET_BIG_ENDIAN
+    for (size_t i = 0; i < ndigits; i++)
+        x[i] = BSWAP_DIGIT(x[i]);
+#endif
+}
+
+// fp2_t
+
+static byte_t *
+fp2_to_bytes(byte_t *enc, const fp2_t *x)
+{ // Serialize x at enc with fp2_encode; returns the write cursor FP2_ENCODED_BYTES further on
+    fp2_encode(enc, x);
+    return &enc[FP2_ENCODED_BYTES];
+}
+
+static const byte_t *
+fp2_from_bytes(fp2_t *x, const byte_t *enc)
+{ // Decode x from enc with fp2_decode (any status it reports is not checked); returns the cursor
+    fp2_decode(x, enc);
+    return &enc[FP2_ENCODED_BYTES];
+}
+
+// curves and points
+
+static byte_t *
+proj_to_bytes(byte_t *enc, const fp2_t *x, const fp2_t *z)
+{ // Serialize the projective value (x : z) as the single affine element x/z; z must be nonzero
+    fp2_t z_inv, affine;
+    assert(!fp2_is_zero(z));
+    fp2_copy(&z_inv, z);
+    fp2_inv(&z_inv);
+#ifndef NDEBUG
+    {
+        // Debug-only sanity check that the inversion really produced 1/z
+        fp2_t one, prod;
+        fp2_set_one(&one);
+        fp2_mul(&prod, z, &z_inv);
+        assert(fp2_is_equal(&prod, &one));
+    }
+#endif
+    fp2_mul(&affine, x, &z_inv);
+    return fp2_to_bytes(enc, &affine);
+}
+
+static const byte_t *
+proj_from_bytes(fp2_t *x, fp2_t *z, const byte_t *enc)
+{ // Inverse of proj_to_bytes: read the affine element into x and fix z = 1
+    fp2_set_one(z);
+    // The returned cursor points just past the bytes consumed for x
+    return fp2_from_bytes(x, enc);
+}
+
+static byte_t *
+ec_curve_to_bytes(byte_t *enc, const ec_curve_t *curve)
+{ // A curve is encoded as the affine coefficient A/C; returns the advanced write cursor
+    return proj_to_bytes(enc, &curve->A, &curve->C);
+}
+
+static const byte_t *
+ec_curve_from_bytes(ec_curve_t *curve, const byte_t *enc)
+{ // Clear every field of *curve first, then decode A and set C = 1; returns the read cursor
+    memset(curve, 0, sizeof(*curve));
+    return proj_from_bytes(&curve->A, &curve->C, enc);
+}
+
+static byte_t *
+ec_point_to_bytes(byte_t *enc, const ec_point_t *point)
+{ // A point (X : Z) is encoded as the affine x-coordinate X/Z; Z must be nonzero
+    return proj_to_bytes(enc, &point->x, &point->z);
+}
+
+static const byte_t *
+ec_point_from_bytes(ec_point_t *point, const byte_t *enc)
+{ // Decode an affine x-coordinate into (x : 1); returns the advanced read cursor
+    return proj_from_bytes(&point->x, &point->z, enc);
+}
+
+static byte_t *
+ec_basis_to_bytes(byte_t *enc, const ec_basis_t *basis)
+{ // Serialize the three basis points back to back, in the order P, Q, PmQ
+    byte_t *cursor = ec_point_to_bytes(enc, &basis->P);
+    cursor = ec_point_to_bytes(cursor, &basis->Q);
+    cursor = ec_point_to_bytes(cursor, &basis->PmQ);
+    return cursor;
+}
+
+static const byte_t *
+ec_basis_from_bytes(ec_basis_t *basis, const byte_t *enc)
+{ // Inverse of ec_basis_to_bytes: decode P, Q and PmQ in that order
+    const byte_t *cursor = ec_point_from_bytes(&basis->P, enc);
+    cursor = ec_point_from_bytes(&basis->Q, cursor);
+    cursor = ec_point_from_bytes(&basis->PmQ, cursor);
+    return cursor;
+}
+
+// public API
+
+byte_t *
+public_key_to_bytes(byte_t *enc, const public_key_t *pk)
+{ // Layout: affine curve coefficient, then the one-byte hint; returns the end of the encoding
+#ifndef NDEBUG
+    const byte_t *const start = enc;
+#endif
+    byte_t *out = ec_curve_to_bytes(enc, &pk->curve);
+    *out++ = pk->hint_pk;
+    assert(out - start == PUBLICKEY_BYTES);
+    return out;
+}
+
+const byte_t *
+public_key_from_bytes(public_key_t *pk, const byte_t *enc)
+{ // Inverse of public_key_to_bytes; returns the read cursor advanced by PUBLICKEY_BYTES
+#ifndef NDEBUG
+    const byte_t *const start = enc;
+#endif
+    const byte_t *in = ec_curve_from_bytes(&pk->curve, enc);
+    pk->hint_pk = *in++;
+    assert(in - start == PUBLICKEY_BYTES);
+    return in;
+}
+
+void
+signature_to_bytes(byte_t *enc, const signature_t *sig)
+{ // Layout: E_aux_A | backtracking | two_resp_length | 2x2 matrix (row-major) | chall_coeff |
+  // hint_aux | hint_chall; the total length is SIGNATURE_BYTES (asserted below)
+#ifndef NDEBUG
+    byte_t *const start = enc;
+#endif
+    const size_t mat_nbytes = (SQIsign_response_length + 9) / 8;
+    const size_t chall_nbytes = SECURITY_BITS / 8;
+
+    // Auxiliary curve coefficient followed by two single-byte fields
+    enc = fp2_to_bytes(enc, &sig->E_aux_A);
+    *enc++ = sig->backtracking;
+    *enc++ = sig->two_resp_length;
+
+    // The four matrix entries, each truncated to mat_nbytes bytes
+    for (int row = 0; row < 2; row++) {
+        for (int col = 0; col < 2; col++) {
+            encode_digits(enc, sig->mat_Bchall_can_to_B_chall[row][col], mat_nbytes);
+            enc += mat_nbytes;
+        }
+    }
+
+    // Challenge coefficient, then the two trailing hint bytes
+    encode_digits(enc, sig->chall_coeff, chall_nbytes);
+    enc += chall_nbytes;
+    *enc++ = sig->hint_aux;
+    *enc++ = sig->hint_chall;
+
+    assert(enc - start == SIGNATURE_BYTES);
+}
+
+void
+signature_from_bytes(signature_t *sig, const byte_t *enc)
+{ // Inverse of signature_to_bytes: reads exactly SIGNATURE_BYTES bytes from enc into *sig.
+  // Field values are stored as decoded; no range or validity checks are made here.
+#ifndef NDEBUG
+    const byte_t *const start = enc;
+#endif
+    const size_t mat_nbytes = (SQIsign_response_length + 9) / 8;
+    const size_t chall_nbytes = SECURITY_BITS / 8;
+
+    // Auxiliary curve coefficient followed by two single-byte fields
+    enc = fp2_from_bytes(&sig->E_aux_A, enc);
+    sig->backtracking = *enc++;
+    sig->two_resp_length = *enc++;
+
+    // The four matrix entries, row-major; each is zero-extended to NWORDS_ORDER digits
+    for (int row = 0; row < 2; row++) {
+        for (int col = 0; col < 2; col++) {
+            decode_digits(sig->mat_Bchall_can_to_B_chall[row][col], enc, mat_nbytes, NWORDS_ORDER);
+            enc += mat_nbytes;
+        }
+    }
+
+    // Challenge coefficient, then the two trailing hint bytes
+    decode_digits(sig->chall_coeff, enc, chall_nbytes, NWORDS_ORDER);
+    enc += chall_nbytes;
+    sig->hint_aux = *enc++;
+    sig->hint_chall = *enc++;
+
+    assert(enc - start == SIGNATURE_BYTES);
+}
diff --git a/src/pqm4/sqisign_lvl1/ref/encoded_sizes.h b/src/pqm4/sqisign_lvl1/ref/encoded_sizes.h
new file mode 100644
index 0000000..02f8642
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/encoded_sizes.h
@@ -0,0 +1,11 @@
+#define SECURITY_BITS 128
+#define SQIsign_response_length 126
+#define HASH_ITERATIONS 64
+#define FP_ENCODED_BYTES 32
+#define FP2_ENCODED_BYTES 64
+#define EC_CURVE_ENCODED_BYTES 64
+#define EC_POINT_ENCODED_BYTES 64
+#define EC_BASIS_ENCODED_BYTES 192
+#define PUBLICKEY_BYTES 65
+#define SECRETKEY_BYTES 353
+#define SIGNATURE_BYTES 148
diff --git a/src/pqm4/sqisign_lvl1/ref/fp.c b/src/pqm4/sqisign_lvl1/ref/fp.c
new file mode 100644
index 0000000..48e2937
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/fp.c
@@ -0,0 +1,15 @@
+#include <fp.h>
+
+/*
+ * Branch-free selection between two field elements, word by word:
+ *   ctl == 0x00000000  ->  *d is set to a0
+ *   ctl == 0xFFFFFFFF  ->  *d is set to a1
+ * ctl MUST be one of these two values; anything else mixes bits of both inputs.
+ */
+void
+fp_select(fp_t *d, const fp_t *a0, const fp_t *a1, uint32_t ctl)
+{
+    const digit_t mask = (int32_t)ctl;
+    for (unsigned int i = 0; i < NWORDS_FIELD; i++) {
+        (*d)[i] = ((*a0)[i] & ~mask) | ((*a1)[i] & mask);
+    }
+}
diff --git a/src/pqm4/sqisign_lvl1/ref/fp.h b/src/pqm4/sqisign_lvl1/ref/fp.h
new file mode 100644
index 0000000..1241d58
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/fp.h
@@ -0,0 +1,48 @@
+#ifndef FP_H
+#define FP_H
+
+//////////////////////////////////////////////// NOTE: this is placed here for now
+#include <sqisign_namespace.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
+#include <tutil.h>
+#include <fp_constants.h>
+
+typedef digit_t fp_t[NWORDS_FIELD]; // Datatype for representing field elements
+
+extern const digit_t ONE[NWORDS_FIELD];
+extern const digit_t ZERO[NWORDS_FIELD];
+// extern const digit_t PM1O3[NWORDS_FIELD];
+
+void fp_set_small(fp_t *x, const digit_t val);
+void fp_mul_small(fp_t *x, const fp_t *a, const uint32_t val);
+void fp_set_zero(fp_t *x);
+void fp_set_one(fp_t *x);
+uint32_t fp_is_equal(const fp_t *a, const fp_t *b);
+uint32_t fp_is_zero(const fp_t *a);
+void fp_copy(fp_t *out, const fp_t *a);
+
+void fp_encode(void *dst, const fp_t *a);
+void fp_decode_reduce(fp_t *d, const void *src, size_t len);
+uint32_t fp_decode(fp_t *d, const void *src);
+
+void fp_select(fp_t *d, const fp_t *a0, const fp_t *a1, uint32_t ctl);
+void fp_cswap(fp_t *a, fp_t *b, uint32_t ctl);
+
+void fp_add(fp_t *out, const fp_t *a, const fp_t *b);
+void fp_sub(fp_t *out, const fp_t *a, const fp_t *b);
+void fp_neg(fp_t *out, const fp_t *a);
+void fp_sqr(fp_t *out, const fp_t *a);
+void fp_mul(fp_t *out, const fp_t *a, const fp_t *b);
+
+void fp_inv(fp_t *x);
+uint32_t fp_is_square(const fp_t *a);
+void fp_sqrt(fp_t *a);
+void fp_half(fp_t *out, const fp_t *a);
+void fp_exp3div4(fp_t *out, const fp_t *a);
+void fp_div3(fp_t *out, const fp_t *a);
+
+#endif
diff --git a/src/pqm4/sqisign_lvl1/ref/fp2.c b/src/pqm4/sqisign_lvl1/ref/fp2.c
new file mode 100644
index 0000000..a258952
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/fp2.c
@@ -0,0 +1,328 @@
+#include <inttypes.h>
+#include <encoded_sizes.h>
+#include <fp2.h>
+
+/* Arithmetic modulo X^2 + 1 */
+
+void
+fp2_set_small(fp2_t *x, const digit_t val) // x = val + 0*i
+{
+    fp_set_zero(&x->im);
+    fp_set_small(&x->re, val);
+}
+
+void
+fp2_mul_small(fp2_t *x, const fp2_t *y, uint32_t n) // x = n * y, coordinate by coordinate
+{
+    fp_mul_small(&(x->im), &(y->im), n);
+    fp_mul_small(&(x->re), &(y->re), n);
+}
+
+void
+fp2_set_one(fp2_t *x) // x = 1 + 0*i
+{
+    fp_set_zero(&x->im);
+    fp_set_one(&x->re);
+}
+
+void
+fp2_set_zero(fp2_t *x) // x = 0 + 0*i
+{
+    fp_set_zero(&x->im);
+    fp_set_zero(&x->re);
+}
+
+// Is a GF(p^2) element zero? Both coordinates must be zero; their masks are ANDed.
+// Returns 0xFF...FF (true) if a=0, 0 (false) otherwise
+uint32_t
+fp2_is_zero(const fp2_t *a)
+{
+    return fp_is_zero(&a->im) & fp_is_zero(&a->re);
+}
+
+// Compare two GF(p^2) elements in constant time (both coordinates, masks ANDed)
+// Returns 0xFF...FF (true) if a=b, 0 (false) otherwise
+uint32_t
+fp2_is_equal(const fp2_t *a, const fp2_t *b)
+{
+    return fp_is_equal(&a->im, &b->im) & fp_is_equal(&a->re, &b->re);
+}
+
+// Is a GF(p^2) element one? Real part must equal ONE and imaginary part must be zero.
+// Returns 0xFF...FF (true) if a=1, 0 (false) otherwise
+uint32_t
+fp2_is_one(const fp2_t *a)
+{
+    return fp_is_zero(&a->im) & fp_is_equal(&a->re, &ONE);
+}
+
+void
+fp2_copy(fp2_t *x, const fp2_t *y) // x = y
+{
+    fp_copy(&x->im, &y->im);
+    fp_copy(&x->re, &y->re);
+}
+
+void
+fp2_add(fp2_t *x, const fp2_t *y, const fp2_t *z) // x = y + z
+{
+    fp_add(&x->im, &y->im, &z->im);
+    fp_add(&x->re, &y->re, &z->re);
+}
+
+void
+fp2_add_one(fp2_t *x, const fp2_t *y) // x = y + 1: only the real part changes
+{
+    fp_copy(&(x->im), &(y->im));
+    fp_add(&(x->re), &(y->re), &ONE);
+}
+
+void
+fp2_sub(fp2_t *x, const fp2_t *y, const fp2_t *z) // x = y - z
+{
+    fp_sub(&x->im, &y->im, &z->im);
+    fp_sub(&x->re, &y->re, &z->re);
+}
+
+void
+fp2_neg(fp2_t *x, const fp2_t *y) // x = -y
+{
+    fp_neg(&x->im, &y->im);
+    fp_neg(&x->re, &y->re);
+}
+
+void
+fp2_mul(fp2_t *x, const fp2_t *y, const fp2_t *z) // x = y * z using three fp_mul calls
+{
+    fp_t cross, im_prod;
+
+    fp_add(&cross, &y->re, &y->im);      // cross = y.re + y.im
+    fp_add(&im_prod, &z->re, &z->im);    // im_prod = z.re + z.im (temporarily)
+    fp_mul(&cross, &cross, &im_prod);    // cross = (y.re + y.im) * (z.re + z.im)
+    fp_mul(&im_prod, &y->im, &z->im);    // im_prod = y.im * z.im
+    fp_mul(&x->re, &y->re, &z->re);      // first write to x: y and z are not read after this line
+    fp_sub(&x->im, &cross, &im_prod);
+    fp_sub(&x->im, &x->im, &x->re);      // x.im = y.re*z.im + y.im*z.re
+    fp_sub(&x->re, &x->re, &im_prod);    // x.re = y.re*z.re - y.im*z.im
+}
+
+void
+fp2_sqr(fp2_t *x, const fp2_t *y) // x = y^2
+{
+    fp_t plus, minus;
+
+    fp_sub(&minus, &y->re, &y->im);  // minus = re - im
+    fp_add(&plus, &y->re, &y->im);   // plus  = re + im
+    fp_mul(&x->im, &y->re, &y->im);  // x is written only after both temporaries are saved
+    fp_add(&x->im, &x->im, &x->im);  // x.im = 2 * re * im
+    fp_mul(&x->re, &plus, &minus);   // x.re = (re + im) * (re - im)
+}
+
+void
+fp2_inv(fp2_t *x) // in place: x = (re - im*i) / (re^2 + im^2)
+{
+    fp_t norm, im_sq;
+
+    fp_sqr(&im_sq, &x->im);
+    fp_sqr(&norm, &x->re);
+    fp_add(&norm, &norm, &im_sq);   // norm = re^2 + im^2
+    fp_inv(&norm);
+    fp_mul(&x->im, &x->im, &norm);
+    fp_neg(&x->im, &x->im);         // sign flip of the conjugate
+    fp_mul(&x->re, &x->re, &norm);
+}
+
+uint32_t
+fp2_is_square(const fp2_t *x) // returns fp_is_square(re^2 + im^2)
+{
+    fp_t norm, im_sq;
+
+    fp_sqr(&im_sq, &x->im);
+    fp_sqr(&norm, &x->re);
+    fp_add(&norm, &norm, &im_sq);
+
+    return fp_is_square(&norm);
+}
+
+void
+fp2_sqrt(fp2_t *a) // in place; the input is not checked for being a square (see fp2_sqrt_verify)
+{
+    fp_t x0, x1, t0, t1;
+
+    /* From "Optimized One-Dimensional SQIsign Verification on Intel and
+     * Cortex-M4" by Aardal et al: https://eprint.iacr.org/2024/1563 */
+
+    // x0 = \delta = sqrt(a0^2 + a1^2).
+    fp_sqr(&x0, &(a->re));
+    fp_sqr(&x1, &(a->im));
+    fp_add(&x0, &x0, &x1);
+    fp_sqrt(&x0);
+    // If a1 = 0, there is a risk of \delta = -a0, which makes x0 = 0 below.
+    // In that case, we restore the value \delta = a0.
+    fp_select(&x0, &x0, &(a->re), fp_is_zero(&(a->im)));
+    // x0 = \delta + a0, t0 = 2 * x0.
+    fp_add(&x0, &x0, &(a->re));
+    fp_add(&t0, &x0, &x0);
+
+    // x1 = t0^((p-3)/4)
+    fp_exp3div4(&x1, &t0);
+
+    // x0 = x0 * x1, x1 = x1 * a1, t1 = (2x0)^2.
+    fp_mul(&x0, &x0, &x1);
+    fp_mul(&x1, &x1, &(a->im));
+    fp_add(&t1, &x0, &x0);
+    fp_sqr(&t1, &t1);
+    // If t1 = t0, return x0 + x1*i, otherwise x1 - x0*i.
+    fp_sub(&t0, &t0, &t1);
+    uint32_t f = fp_is_zero(&t0);
+    fp_neg(&t1, &x0);
+    fp_copy(&t0, &x1);
+    fp_select(&t0, &t0, &x0, f); // from here on t0 is the real part of the candidate root
+    fp_select(&t1, &t1, &x1, f); // and t1 its imaginary part
+
+    // Check if t0 is zero
+    uint32_t t0_is_zero = fp_is_zero(&t0);
+
+    // Check whether t0, t1 are odd
+    // Note: we encode to ensure canonical representation
+    uint8_t tmp_bytes[FP_ENCODED_BYTES];
+    fp_encode(tmp_bytes, &t0);
+    uint32_t t0_is_odd = -((uint32_t)tmp_bytes[0] & 1); // byte 0 is the least-significant byte
+    fp_encode(tmp_bytes, &t1);
+    uint32_t t1_is_odd = -((uint32_t)tmp_bytes[0] & 1);
+
+    // We negate the output if:
+    // t0 is odd, or
+    // t0 is zero and t1 is odd
+    uint32_t negate_output = t0_is_odd | (t0_is_zero & t1_is_odd);
+    fp_neg(&x0, &t0);
+    fp_select(&(a->re), &t0, &x0, negate_output);
+    fp_neg(&x0, &t1);
+    fp_select(&(a->im), &t1, &x0, negate_output);
+}
+
+uint32_t
+fp2_sqrt_verify(fp2_t *a) // a = fp2_sqrt(a); returns the mask of (new a)^2 == old a
+{
+    fp2_t input, resquared;
+
+    fp2_copy(&input, a);
+    fp2_sqrt(a);
+    fp2_sqr(&resquared, a);
+
+    return fp2_is_equal(&input, &resquared);
+}
+
+void
+fp2_half(fp2_t *x, const fp2_t *y) // x = y / 2
+{
+    fp_half(&x->im, &y->im);
+    fp_half(&x->re, &y->re);
+}
+
+void
+fp2_batched_inv(fp2_t *x, int len) // in place: x[i] = 1/x[i] for all i, with a single fp2_inv
+{
+    // Empty batch: nothing to do. Checked before the VLAs, whose size must be > 0.
+    if (len <= 0) {
+        return;
+    }
+    fp2_t t1[len], t2[len];
+    fp2_t inverse;
+
+    // x = x0,...,xn; t1 = x0, x0*x1, ... ,x0 * x1 * ... * xn
+    fp2_copy(&t1[0], &x[0]);
+    for (int i = 1; i < len; i++) {
+        fp2_mul(&t1[i], &t1[i - 1], &x[i]);
+    }
+    // inverse = 1/ (x0 * x1 * ... * xn)
+    fp2_copy(&inverse, &t1[len - 1]);
+    fp2_inv(&inverse);
+    // t2 = 1/ (x0 * x1 * ... * xn), 1/ (x0 * x1 * ... * x(n-1)) , ... , 1/x0
+    fp2_copy(&t2[0], &inverse);
+    for (int i = 1; i < len; i++) {
+        fp2_mul(&t2[i], &t2[i - 1], &x[len - i]);
+    }
+    // x0 = 1/x0, then xi = (x0 * ... * x(i-1)) * 1/(x0 * ... * xi) = 1/xi
+    fp2_copy(&x[0], &t2[len - 1]);
+    for (int i = 1; i < len; i++) {
+        fp2_mul(&x[i], &t1[i - 1], &t2[len - i - 1]);
+    }
+}
+
+// exponentiation using square and multiply: out = x^exp, with exp given as `size`
+// digits consumed from exp[0] upwards, low bit first. Warning!! Not constant time!
+void
+fp2_pow_vartime(fp2_t *out, const fp2_t *x, const digit_t *exp, const int size)
+{
+    fp2_t power; // x^(2^k), k being the number of exponent bits consumed so far
+
+    fp2_copy(&power, x); // x is copied before out is written
+    fp2_set_one(out);
+
+    // Iterate over each word of exp
+    for (int word = 0; word < size; word++) {
+
+        // Iterate over each bit of the word
+        for (int pos = 0; pos < RADIX; pos++) {
+            // Branching on an exponent bit is what makes this variable-time.
+            if (((exp[word] >> pos) & 1) == 1) {
+                fp2_mul(out, out, &power);
+            }
+            fp2_sqr(&power, &power);
+        }
+    }
+}
+
+void
+fp2_print(const char *name, const fp2_t *a) // prints "<name>0x<re> + i*0x<im>\n" in hex
+{
+    uint8_t bytes[FP_ENCODED_BYTES];
+
+    // Encoding ensures canonical rep; the last encoded byte is printed first.
+    printf("%s0x", name);
+    fp_encode(bytes, &a->re);
+    for (int k = FP_ENCODED_BYTES - 1; k >= 0; k--) {
+        printf("%02x", bytes[k]);
+    }
+
+    printf(" + i*0x");
+    fp_encode(bytes, &a->im);
+    for (int k = FP_ENCODED_BYTES - 1; k >= 0; k--) {
+        printf("%02x", bytes[k]);
+    }
+    printf("\n");
+}
+
+void
+fp2_encode(void *dst, const fp2_t *a) // dst = enc(re) followed by enc(im), FP_ENCODED_BYTES each
+{
+    uint8_t *out = dst;
+    fp_encode(out + FP_ENCODED_BYTES, &a->im);
+    fp_encode(out, &a->re);
+}
+
+uint32_t
+fp2_decode(fp2_t *d, const void *src) // returns the AND of the two fp_decode results
+{
+    const uint8_t *in = src;
+
+    const uint32_t ok_re = fp_decode(&d->re, in);
+    const uint32_t ok_im = fp_decode(&d->im, in + FP_ENCODED_BYTES);
+    // Both halves are always decoded before the results are combined.
+    return ok_re & ok_im;
+}
+
+void
+fp2_select(fp2_t *d, const fp2_t *a0, const fp2_t *a1, uint32_t ctl) // fp_select on each coordinate
+{
+    fp_select(&d->im, &a0->im, &a1->im, ctl);
+    fp_select(&d->re, &a0->re, &a1->re, ctl);
+}
+
+void
+fp2_cswap(fp2_t *a, fp2_t *b, uint32_t ctl) // fp_cswap on each coordinate, same ctl for both
+{
+    fp_cswap(&a->im, &b->im, ctl);
+    fp_cswap(&a->re, &b->re, ctl);
+}
diff --git a/src/pqm4/sqisign_lvl1/ref/fp2.h b/src/pqm4/sqisign_lvl1/ref/fp2.h
new file mode 100644
index 0000000..00e673b
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/fp2.h
@@ -0,0 +1,41 @@
+#ifndef FP2_H
+#define FP2_H
+
+#include <sqisign_namespace.h>
+#include "fp.h"
+#include <stdio.h>
+
+// Structure for representing elements in GF(p^2)
+typedef struct fp2_t
+{
+    fp_t re, im; // the element is re + im*i; fp2.c does arithmetic modulo X^2 + 1
+} fp2_t;
+
+void fp2_set_small(fp2_t *x, const digit_t val);          // x = val + 0*i
+void fp2_mul_small(fp2_t *x, const fp2_t *y, uint32_t n); // x = n * y
+void fp2_set_one(fp2_t *x);
+void fp2_set_zero(fp2_t *x);
+uint32_t fp2_is_zero(const fp2_t *a);                   // mask: 0xFF...FF if true, 0 otherwise
+uint32_t fp2_is_equal(const fp2_t *a, const fp2_t *b);  // mask: 0xFF...FF if true, 0 otherwise
+uint32_t fp2_is_one(const fp2_t *a);                    // mask: 0xFF...FF if true, 0 otherwise
+void fp2_copy(fp2_t *x, const fp2_t *y);
+void fp2_add(fp2_t *x, const fp2_t *y, const fp2_t *z); // x = y + z
+void fp2_add_one(fp2_t *x, const fp2_t *y);             // x = y + 1
+void fp2_sub(fp2_t *x, const fp2_t *y, const fp2_t *z); // x = y - z
+void fp2_neg(fp2_t *x, const fp2_t *y);                 // x = -y
+void fp2_mul(fp2_t *x, const fp2_t *y, const fp2_t *z); // x = y * z
+void fp2_sqr(fp2_t *x, const fp2_t *y);                 // x = y^2
+void fp2_inv(fp2_t *x);                                 // in place
+uint32_t fp2_is_square(const fp2_t *x);
+void fp2_sqrt(fp2_t *x);                                // in place, input not checked
+uint32_t fp2_sqrt_verify(fp2_t *a);                     // in place; mask tells whether the root squares back
+void fp2_half(fp2_t *x, const fp2_t *y);                // x = y / 2
+void fp2_batched_inv(fp2_t *x, int len);                // inverts x[0..len-1] in place
+void fp2_pow_vartime(fp2_t *out, const fp2_t *x, const digit_t *exp, const int size); // not constant time
+void fp2_print(const char *name, const fp2_t *a);
+void fp2_encode(void *dst, const fp2_t *a);             // re first, then im
+uint32_t fp2_decode(fp2_t *d, const void *src);         // AND of the per-coordinate fp_decode results
+void fp2_select(fp2_t *d, const fp2_t *a0, const fp2_t *a1, uint32_t ctl);
+void fp2_cswap(fp2_t *a, fp2_t *b, uint32_t ctl);
+
+#endif
diff --git a/src/pqm4/sqisign_lvl1/ref/fp_constants.h b/src/pqm4/sqisign_lvl1/ref/fp_constants.h
new file mode 100644
index 0000000..c770b78
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/fp_constants.h
@@ -0,0 +1,17 @@
+#if RADIX == 32
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+#define NWORDS_FIELD 8
+#else
+#define NWORDS_FIELD 9 // matches the 9 limbs of 29 bits used by fp_p5248_32.c
+#endif
+#define NWORDS_ORDER 8
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+#define NWORDS_FIELD 4
+#else
+#define NWORDS_FIELD 5
+#endif
+#define NWORDS_ORDER 4
+#endif // neither branch is taken for other RADIX values, leaving NWORDS_* undefined
+#define BITS 256
+#define LOG2P 8
diff --git a/src/pqm4/sqisign_lvl1/ref/fp_p5248_32.c b/src/pqm4/sqisign_lvl1/ref/fp_p5248_32.c
new file mode 100644
index 0000000..a52add3
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/fp_p5248_32.c
@@ -0,0 +1,942 @@
+// clang-format off
+// Command line : python monty.py 32
+// 0x4ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+
+#include <stdint.h>
+#include <stdio.h>
+
+#define sspint int32_t  // signed view of a limb, used for carry propagation
+#define spint uint32_t  // one limb
+#define udpint uint64_t // double-width accumulator
+#define dpint uint64_t  // double-width accumulator
+
+#define Wordlength 32
+#define Nlimbs 9 // limbs per field element
+#define Radix 29 // bits held per limb (unsaturated representation)
+#define Nbits 251
+#define Nbytes 32 // bytes in the encoded form
+
+#define MONTGOMERY
+// propagate carries: limbs 0..7 are cut back to 29 bits, the excess accumulates in n[8]
+inline static spint prop(spint *n) {
+  int i;
+  spint mask = ((spint)1 << 29u) - (spint)1;
+  sspint carry = (sspint)n[0];
+  carry >>= 29u; // signed shift: relies on arithmetic right shift so a borrow stays negative
+  n[0] &= mask;
+  for (i = 1; i < 8; i++) {
+    carry += (sspint)n[i];
+    n[i] = (spint)carry & mask;
+    carry >>= 29u;
+  }
+  n[8] += (spint)carry;
+  return -((n[8] >> 1) >> 30u); // all-ones if bit 31 of n[8] is set (negative value), else 0
+}
+
+// propagate carries and add p if negative, propagate carries again
+inline static int flatten(spint *n) {
+  spint carry = prop(n);
+  n[0] -= (spint)1u & carry;
+  n[8] += ((spint)0x50000u) & carry; // with the line above: n += p, p = 0x50000 * 2^232 - 1
+  (void)prop(n);
+  return (int)(carry & 1); // 1 if p had to be added back, 0 otherwise
+}
+
+// Montgomery final subtract: n -= p, undone by flatten() when the result is negative
+static int modfsb(spint *n) {
+  n[0] += (spint)1u;
+  n[8] -= (spint)0x50000u;
+  return flatten(n); // 1 if the subtraction was undone, i.e. n was already below p
+}
+
+// Modular addition - reduce less than 2p
+static void modadd(const spint *a, const spint *b, spint *n) {
+  spint carry;
+  n[0] = a[0] + b[0];
+  n[1] = a[1] + b[1];
+  n[2] = a[2] + b[2];
+  n[3] = a[3] + b[3];
+  n[4] = a[4] + b[4];
+  n[5] = a[5] + b[5];
+  n[6] = a[6] + b[6];
+  n[7] = a[7] + b[7];
+  n[8] = a[8] + b[8];
+  n[0] += (spint)2u;
+  n[8] -= (spint)0xa0000u; // with the line above: n = a + b - 2p
+  carry = prop(n);         // all-ones if that went negative
+  n[0] -= (spint)2u & carry;
+  n[8] += ((spint)0xa0000u) & carry; // add 2p back only in the negative case
+  (void)prop(n);
+}
+
+// Modular subtraction - reduce less than 2p
+static void modsub(const spint *a, const spint *b, spint *n) {
+  spint carry;
+  n[0] = a[0] - b[0]; // limb-wise difference; borrows are resolved by prop() below
+  n[1] = a[1] - b[1];
+  n[2] = a[2] - b[2];
+  n[3] = a[3] - b[3];
+  n[4] = a[4] - b[4];
+  n[5] = a[5] - b[5];
+  n[6] = a[6] - b[6];
+  n[7] = a[7] - b[7];
+  n[8] = a[8] - b[8];
+  carry = prop(n); // all-ones if a - b is negative
+  n[0] -= (spint)2u & carry;
+  n[8] += ((spint)0xa0000u) & carry; // add 2p in the negative case
+  (void)prop(n);
+}
+
+// Modular negation: n = 0 - b, with 2p added when the result is negative
+static void modneg(const spint *b, spint *n) {
+  spint carry;
+  n[0] = (spint)0 - b[0];
+  n[1] = (spint)0 - b[1];
+  n[2] = (spint)0 - b[2];
+  n[3] = (spint)0 - b[3];
+  n[4] = (spint)0 - b[4];
+  n[5] = (spint)0 - b[5];
+  n[6] = (spint)0 - b[6];
+  n[7] = (spint)0 - b[7];
+  n[8] = (spint)0 - b[8];
+  carry = prop(n); // all-ones if -b is negative
+  n[0] -= (spint)2u & carry;
+  n[8] += ((spint)0xa0000u) & carry;
+  (void)prop(n);
+}
+
+// Overflow limit   = 18446744073709551616
+// maximum possible = 2594249331921584137
+// Modular (Montgomery) multiplication of a and b; the result c is kept below 2p, not fully reduced
+static void modmul(const spint *a, const spint *b, spint *c) {
+  dpint t = 0;
+  spint p8 = 0x50000u;
+  spint q = ((spint)1 << 29u); // q is unsaturated radix
+  spint mask = (spint)(q - (spint)1);
+  t += (dpint)a[0] * b[0];
+  spint v0 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[0] * b[1];
+  t += (dpint)a[1] * b[0];
+  spint v1 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[0] * b[2];
+  t += (dpint)a[1] * b[1];
+  t += (dpint)a[2] * b[0];
+  spint v2 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[0] * b[3];
+  t += (dpint)a[1] * b[2];
+  t += (dpint)a[2] * b[1];
+  t += (dpint)a[3] * b[0];
+  spint v3 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[0] * b[4];
+  t += (dpint)a[1] * b[3];
+  t += (dpint)a[2] * b[2];
+  t += (dpint)a[3] * b[1];
+  t += (dpint)a[4] * b[0];
+  spint v4 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[0] * b[5];
+  t += (dpint)a[1] * b[4];
+  t += (dpint)a[2] * b[3];
+  t += (dpint)a[3] * b[2];
+  t += (dpint)a[4] * b[1];
+  t += (dpint)a[5] * b[0];
+  spint v5 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[0] * b[6];
+  t += (dpint)a[1] * b[5];
+  t += (dpint)a[2] * b[4];
+  t += (dpint)a[3] * b[3];
+  t += (dpint)a[4] * b[2];
+  t += (dpint)a[5] * b[1];
+  t += (dpint)a[6] * b[0];
+  spint v6 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[0] * b[7];
+  t += (dpint)a[1] * b[6];
+  t += (dpint)a[2] * b[5];
+  t += (dpint)a[3] * b[4];
+  t += (dpint)a[4] * b[3];
+  t += (dpint)a[5] * b[2];
+  t += (dpint)a[6] * b[1];
+  t += (dpint)a[7] * b[0];
+  spint v7 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[0] * b[8];
+  t += (dpint)a[1] * b[7];
+  t += (dpint)a[2] * b[6];
+  t += (dpint)a[3] * b[5];
+  t += (dpint)a[4] * b[4];
+  t += (dpint)a[5] * b[3];
+  t += (dpint)a[6] * b[2];
+  t += (dpint)a[7] * b[1];
+  t += (dpint)a[8] * b[0];
+  t += (dpint)v0 * (dpint)p8;
+  spint v8 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[1] * b[8];
+  t += (dpint)a[2] * b[7];
+  t += (dpint)a[3] * b[6];
+  t += (dpint)a[4] * b[5];
+  t += (dpint)a[5] * b[4];
+  t += (dpint)a[6] * b[3];
+  t += (dpint)a[7] * b[2];
+  t += (dpint)a[8] * b[1];
+  t += (dpint)v1 * (dpint)p8;
+  c[0] = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[2] * b[8];
+  t += (dpint)a[3] * b[7];
+  t += (dpint)a[4] * b[6];
+  t += (dpint)a[5] * b[5];
+  t += (dpint)a[6] * b[4];
+  t += (dpint)a[7] * b[3];
+  t += (dpint)a[8] * b[2];
+  t += (dpint)v2 * (dpint)p8;
+  c[1] = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[3] * b[8];
+  t += (dpint)a[4] * b[7];
+  t += (dpint)a[5] * b[6];
+  t += (dpint)a[6] * b[5];
+  t += (dpint)a[7] * b[4];
+  t += (dpint)a[8] * b[3];
+  t += (dpint)v3 * (dpint)p8;
+  c[2] = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[4] * b[8];
+  t += (dpint)a[5] * b[7];
+  t += (dpint)a[6] * b[6];
+  t += (dpint)a[7] * b[5];
+  t += (dpint)a[8] * b[4];
+  t += (dpint)v4 * (dpint)p8;
+  c[3] = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[5] * b[8];
+  t += (dpint)a[6] * b[7];
+  t += (dpint)a[7] * b[6];
+  t += (dpint)a[8] * b[5];
+  t += (dpint)v5 * (dpint)p8;
+  c[4] = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[6] * b[8];
+  t += (dpint)a[7] * b[7];
+  t += (dpint)a[8] * b[6];
+  t += (dpint)v6 * (dpint)p8;
+  c[5] = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[7] * b[8];
+  t += (dpint)a[8] * b[7];
+  t += (dpint)v7 * (dpint)p8;
+  c[6] = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[8] * b[8];
+  t += (dpint)v8 * (dpint)p8;
+  c[7] = ((spint)t & mask);
+  t >>= 29;
+  c[8] = (spint)t;
+}
+
+// Modular (Montgomery) squaring of a; the result c is kept below 2p, not fully reduced
+static void modsqr(const spint *a, spint *c) {
+  udpint tot;
+  udpint t = 0;
+  spint p8 = 0x50000u;
+  spint q = ((spint)1 << 29u); // q is unsaturated radix
+  spint mask = (spint)(q - (spint)1);
+  tot = (udpint)a[0] * a[0];
+  t = tot;
+  spint v0 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[0] * a[1];
+  tot *= 2;
+  t += tot;
+  spint v1 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[0] * a[2];
+  tot *= 2;
+  tot += (udpint)a[1] * a[1];
+  t += tot;
+  spint v2 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[0] * a[3];
+  tot += (udpint)a[1] * a[2];
+  tot *= 2;
+  t += tot;
+  spint v3 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[0] * a[4];
+  tot += (udpint)a[1] * a[3];
+  tot *= 2;
+  tot += (udpint)a[2] * a[2];
+  t += tot;
+  spint v4 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[0] * a[5];
+  tot += (udpint)a[1] * a[4];
+  tot += (udpint)a[2] * a[3];
+  tot *= 2;
+  t += tot;
+  spint v5 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[0] * a[6];
+  tot += (udpint)a[1] * a[5];
+  tot += (udpint)a[2] * a[4];
+  tot *= 2;
+  tot += (udpint)a[3] * a[3];
+  t += tot;
+  spint v6 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[0] * a[7];
+  tot += (udpint)a[1] * a[6];
+  tot += (udpint)a[2] * a[5];
+  tot += (udpint)a[3] * a[4];
+  tot *= 2;
+  t += tot;
+  spint v7 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[0] * a[8];
+  tot += (udpint)a[1] * a[7];
+  tot += (udpint)a[2] * a[6];
+  tot += (udpint)a[3] * a[5];
+  tot *= 2;
+  tot += (udpint)a[4] * a[4];
+  t += tot;
+  t += (udpint)v0 * p8;
+  spint v8 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[1] * a[8];
+  tot += (udpint)a[2] * a[7];
+  tot += (udpint)a[3] * a[6];
+  tot += (udpint)a[4] * a[5];
+  tot *= 2;
+  t += tot;
+  t += (udpint)v1 * p8;
+  c[0] = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[2] * a[8];
+  tot += (udpint)a[3] * a[7];
+  tot += (udpint)a[4] * a[6];
+  tot *= 2;
+  tot += (udpint)a[5] * a[5];
+  t += tot;
+  t += (udpint)v2 * p8;
+  c[1] = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[3] * a[8];
+  tot += (udpint)a[4] * a[7];
+  tot += (udpint)a[5] * a[6];
+  tot *= 2;
+  t += tot;
+  t += (udpint)v3 * p8;
+  c[2] = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[4] * a[8];
+  tot += (udpint)a[5] * a[7];
+  tot *= 2;
+  tot += (udpint)a[6] * a[6];
+  t += tot;
+  t += (udpint)v4 * p8;
+  c[3] = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[5] * a[8];
+  tot += (udpint)a[6] * a[7];
+  tot *= 2;
+  t += tot;
+  t += (udpint)v5 * p8;
+  c[4] = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[6] * a[8];
+  tot *= 2;
+  tot += (udpint)a[7] * a[7];
+  t += tot;
+  t += (udpint)v6 * p8;
+  c[5] = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[7] * a[8];
+  tot *= 2;
+  t += tot;
+  t += (udpint)v7 * p8;
+  c[6] = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[8] * a[8];
+  t += tot;
+  t += (udpint)v8 * p8;
+  c[7] = ((spint)t & mask);
+  t >>= 29;
+  c[8] = (spint)t;
+}
+
+// copy the 9 limbs of a into c
+static void modcpy(const spint *a, spint *c) {
+  int i;
+  for (i = 0; i < 9; i++) {
+    c[i] = a[i];
+  }
+}
+
+// square n times, in place: a = a^(2^n)
+static void modnsqr(spint *a, int n) {
+  int i;
+  for (i = 0; i < n; i++) {
+    modsqr(a, a);
+  }
+}
+
+// Calculate progenitor z from w with a fixed chain of squarings and multiplications
+static void modpro(const spint *w, spint *z) {
+  spint x[9];
+  spint t0[9];
+  spint t1[9];
+  spint t2[9];
+  spint t3[9];
+  spint t4[9];
+  modcpy(w, x); // work on a copy, so z may be the same array as w
+  modsqr(x, z);
+  modmul(x, z, t0);
+  modsqr(t0, z);
+  modmul(x, z, z);
+  modsqr(z, t1);
+  modsqr(t1, t3);
+  modsqr(t3, t2);
+  modcpy(t2, t4);
+  modnsqr(t4, 3);
+  modmul(t2, t4, t2);
+  modcpy(t2, t4);
+  modnsqr(t4, 6);
+  modmul(t2, t4, t2);
+  modcpy(t2, t4);
+  modnsqr(t4, 2);
+  modmul(t3, t4, t3);
+  modnsqr(t3, 13);
+  modmul(t2, t3, t2);
+  modcpy(t2, t3);
+  modnsqr(t3, 27);
+  modmul(t2, t3, t2);
+  modmul(z, t2, z);
+  modcpy(z, t2);
+  modnsqr(t2, 4);
+  modmul(t1, t2, t1);
+  modmul(t0, t1, t0);
+  modmul(t1, t0, t1);
+  modmul(t0, t1, t0);
+  modmul(t1, t0, t2);
+  modmul(t0, t2, t0);
+  modmul(t1, t0, t1);
+  modnsqr(t1, 63);
+  modmul(t0, t1, t1);
+  modnsqr(t1, 64);
+  modmul(t0, t1, t0);
+  modnsqr(t0, 57);
+  modmul(z, t0, z);
+}
+
+// calculate inverse, provide progenitor h if available
+static void modinv(const spint *x, const spint *h, spint *z) {
+  spint s[9];
+  spint t[9];
+  if (h == NULL) {
+    modpro(x, t); // no progenitor supplied: compute it from x
+  } else {
+    modcpy(h, t);
+  }
+  modcpy(x, s); // copy of x, so z may be the same array as x
+  modnsqr(t, 2); // t = progenitor^4
+  modmul(s, t, z); // z = x * progenitor^4
+}
+
+// Convert m to n-residue form, n=nres(m)
+static void nres(const spint *m, spint *n) {
+  const spint c[9] = {0xcf5c28fu,  0x6666666u,  0x13333333u,
+                      0x19999999u, 0xcccccccu,  0x6666666u,
+                      0x13333333u, 0x19999999u, 0x1ccccu};
+  modmul(m, c, n); // conversion is a single modmul by the fixed constant c
+}
+
+// Convert n back to normal form, m=redc(n)
+static void redc(const spint *n, spint *m) {
+  int i;
+  spint c[9];
+  c[0] = 1; // c is the plain integer 1
+  for (i = 1; i < 9; i++) {
+    c[i] = 0;
+  }
+  modmul(n, c, m);
+  (void)modfsb(m); // final subtraction of p on the modmul output
+}
+
+// is unity? returns 1 if a converts (via redc) to the integer 1, else 0; no branches
+static int modis1(const spint *a) {
+  int i;
+  spint c[9];
+  spint c0;
+  spint d = 0;
+  redc(a, c);
+  for (i = 1; i < 9; i++) {
+    d |= c[i]; // d == 0 iff limbs 1..8 are all zero
+  }
+  c0 = (spint)c[0];
+  return ((spint)1 & ((d - (spint)1) >> 29u) &
+          (((c0 ^ (spint)1) - (spint)1) >> 29u)); // v - 1 has bit 29 set only when v == 0
+}
+
+// is zero? returns 1 if a converts (via redc) to the integer 0, else 0; no branches
+static int modis0(const spint *a) {
+  int i;
+  spint c[9];
+  spint d = 0;
+  redc(a, c);
+  for (i = 0; i < 9; i++) {
+    d |= c[i]; // d == 0 iff every limb is zero
+  }
+  return ((spint)1 & ((d - (spint)1) >> 29u)); // d - 1 has bit 29 set only when d == 0
+}
+
+// set to zero: clears all 9 limbs
+static void modzer(spint *a) {
+  int i;
+  for (i = 0; i < 9; i++) {
+    a[i] = 0;
+  }
+}
+
+// set to one (in n-residue form)
+static void modone(spint *a) {
+  int i;
+  a[0] = 1;
+  for (i = 1; i < 9; i++) {
+    a[i] = 0;
+  }
+  nres(a, a); // convert the plain integer 1 to n-residue form
+}
+
+// set to integer (in n-residue form)
+static void modint(int x, spint *a) {
+  int i;
+  a[0] = (spint)x; // x is placed in limb 0 as-is, without splitting across limbs
+  for (i = 1; i < 9; i++) {
+    a[i] = 0;
+  }
+  nres(a, a);
+}
+
+// Modular multiplication by an integer, c=a*b mod 2p
+static void modmli(const spint *a, int b, spint *c) {
+  spint t[9];
+  modint(b, t); // b converted to n-residue form first
+  modmul(a, t, c);
+}
+
+// Test for quadratic residue; h is the progenitor of x, or NULL to have it computed
+static int modqr(const spint *h, const spint *x) {
+  spint r[9];
+  if (h == NULL) {
+    modpro(x, r);
+    modsqr(r, r);
+  } else {
+    modsqr(h, r);
+  }
+  modmul(r, x, r); // r = progenitor^2 * x
+  return modis1(r) | modis0(x); // 1 if r is one, or if x itself is zero
+}
+
+// conditional move g to f if b=1 (f is left unchanged if b=0)
+// strongly recommend inlining be disabled using compiler specific syntax
+static void modcmv(int b, const spint *g, volatile spint *f) {
+  int i;
+  spint c0, c1, s, t;
+  spint r = 0x5aa5a55au; // fixed mask added to both selectors and subtracted out again below
+  c0 = (1 - b) + r;
+  c1 = b + r;
+  for (i = 0; i < 9; i++) {
+    s = g[i];
+    t = f[i];
+    f[i] = c0 * t + c1 * s;
+    f[i] -= r * (t + s); // leaves (1 - b) * t + b * s
+  }
+}
+
+// conditional swap g and f if b=1 (both are left unchanged if b=0)
+// strongly recommend inlining be disabled using compiler specific syntax
+static void modcsw(int b, volatile spint *g, volatile spint *f) {
+  int i;
+  spint c0, c1, s, t, w;
+  spint r = 0x5aa5a55au; // fixed mask added to both selectors and subtracted out again below
+  c0 = (1 - b) + r;
+  c1 = b + r;
+  for (i = 0; i < 9; i++) {
+    s = g[i];
+    t = f[i];
+    w = r * (t + s);
+    f[i] = c0 * t + c1 * s;
+    f[i] -= w; // f[i] = (1 - b) * t + b * s
+    g[i] = c0 * s + c1 * t;
+    g[i] -= w; // g[i] = (1 - b) * s + b * t
+  }
+}
+
+// Modular square root, provide progenitor h if available, NULL if not
+static void modsqrt(const spint *x, const spint *h, spint *r) {
+  spint s[9];
+  spint y[9];
+  if (h == NULL) {
+    modpro(x, y);
+  } else {
+    modcpy(h, y);
+  }
+  modmul(y, x, s); // s = progenitor * x; whether x is a square is not checked here
+  modcpy(s, r);
+}
+
+// shift left by less than a word (n bits across the 29-bit limbs, in place)
+static void modshl(unsigned int n, spint *a) {
+  int i;
+  a[8] = ((a[8] << n)) | (a[7] >> (29u - n)); // top limb is not masked to 29 bits
+  for (i = 7; i > 0; i--) {
+    a[i] = ((a[i] << n) & (spint)0x1fffffff) | (a[i - 1] >> (29u - n));
+  }
+  a[0] = (a[0] << n) & (spint)0x1fffffff;
+}
+
+// shift right by less than a word. Return shifted out part
+static int modshr(unsigned int n, spint *a) {
+  int i;
+  spint r = a[0] & (((spint)1 << n) - (spint)1); // the n low bits about to be dropped
+  for (i = 0; i < 8; i++) {
+    a[i] = (a[i] >> n) | ((a[i + 1] << (29u - n)) & (spint)0x1fffffff);
+  }
+  a[8] = a[8] >> n;
+  return r;
+}
+
+// set a= 2^r (in n-residue form); a is left as zero when r >= 256
+static void mod2r(unsigned int r, spint *a) {
+  unsigned int n = r / 29u; // limb holding bit r
+  unsigned int m = r % 29u; // position of bit r inside that limb
+  modzer(a);
+  if (r >= 32 * 8)
+    return;
+  a[n] = 1;
+  a[n] <<= m;
+  nres(a, a);
+}
+
+// export to byte array: 32 bytes, b[31] receives the least-significant byte
+static void modexp(const spint *a, char *b) {
+  int i;
+  spint c[9];
+  redc(a, c); // leave n-residue form before serializing
+  for (i = 31; i >= 0; i--) {
+    b[i] = c[0] & (spint)0xff;
+    (void)modshr(8, c);
+  }
+}
+
+// import from byte array: 32 bytes, b[0] is the most-significant byte
+// returns 1 if in range, else 0
+static int modimp(const char *b, spint *a) {
+  int i, res;
+  for (i = 0; i < 9; i++) {
+    a[i] = 0;
+  }
+  for (i = 0; i < 32; i++) {
+    modshl(8, a);
+    a[0] += (spint)(unsigned char)b[i]; // cast through unsigned char so bytes >= 0x80 stay positive
+  }
+  res = modfsb(a); // 1 if the imported value was already below p
+  nres(a, a);
+  return res;
+}
+
+// determine sign: returns the lowest bit of the value after redc
+static int modsign(const spint *a) {
+  spint c[9];
+  redc(a, c);
+  return c[0] % 2;
+}
+
+// return true if equal (1 or 0); both inputs go through redc before being compared
+static int modcmp(const spint *a, const spint *b) {
+  spint c[9], d[9];
+  int i, eq = 1;
+  redc(a, c);
+  redc(b, d);
+  for (i = 0; i < 9; i++) {
+    eq &= (((c[i] ^ d[i]) - 1) >> 29) & 1; // no early exit: every limb is always inspected
+  }
+  return eq;
+}
+
+// clang-format on
+/******************************************************************************
+ API functions calling generated code above
+ ******************************************************************************/
+
+#include <fp.h>
+
+const digit_t ZERO[NWORDS_FIELD] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 };
+const digit_t ONE[NWORDS_FIELD] = { 0x00000666, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                    0x00000000, 0x00000000, 0x00000000, 0x00020000 }; // Montgomery representation of 1
+// Montgomery representation of 2^-1 (multiplier used by fp_half)
+static const digit_t TWO_INV[NWORDS_FIELD] = { 0x00000333, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                               0x00000000, 0x00000000, 0x00000000, 0x00010000 };
+// Montgomery representation of 3^-1 (multiplier used by fp_div3)
+static const digit_t THREE_INV[NWORDS_FIELD] = {
+    0x15555777, 0x0aaaaaaa, 0x15555555, 0x0aaaaaaa, 0x15555555, 0x0aaaaaaa, 0x15555555, 0x0aaaaaaa, 0x00025555,
+};
+// Montgomery representation of 2^256 (fp_decode_reduce multiplies by it once per 32-byte block)
+static const digit_t R2[NWORDS_FIELD] = { 0x0667ae14, 0x13333333, 0x19999999, 0x0ccccccc, 0x06666666,
+                                          0x13333333, 0x19999999, 0x0ccccccc, 0x00026666 };
+
+void
+fp_set_small(fp_t *x, const digit_t val)
+{
+    modint((int)val, *x); // val is narrowed to int before modint() converts it to n-residue form
+}
+
+void
+fp_mul_small(fp_t *x, const fp_t *a, const uint32_t val)
+{
+    modmli(*a, (int)val, *x); // x = a * val; val is narrowed to int on the way in
+}
+
+void
+fp_set_zero(fp_t *x)
+{
+    modzer(*x); // clears every limb
+}
+
+void
+fp_set_one(fp_t *x)
+{
+    modone(*x); // 1 in n-residue form, computed through nres()
+}
+
+uint32_t
+fp_is_equal(const fp_t *a, const fp_t *b)
+{
+    return -(uint32_t)modcmp(*a, *b); // modcmp's 1/0 is widened to an all-ones/zero mask
+}
+
+uint32_t
+fp_is_zero(const fp_t *a)
+{
+    return -(uint32_t)modis0(*a); // modis0's 1/0 is widened to an all-ones/zero mask
+}
+
+void
+fp_copy(fp_t *out, const fp_t *a)
+{
+    modcpy(*a, *out); // note the argument order: modcpy takes (source, destination)
+}
+
+void
+fp_cswap(fp_t *a, fp_t *b, uint32_t ctl)
+{
+    modcsw((int)(ctl & 0x1), *a, *b); // only bit 0 of ctl is consulted
+}
+
+void
+fp_add(fp_t *out, const fp_t *a, const fp_t *b)
+{
+    modadd(*a, *b, *out); // out = a + b
+}
+
+void
+fp_sub(fp_t *out, const fp_t *a, const fp_t *b)
+{
+    modsub(*a, *b, *out); // out = a - b
+}
+
+void
+fp_neg(fp_t *out, const fp_t *a)
+{
+    modneg(*a, *out); // out = -a
+}
+
+void
+fp_sqr(fp_t *out, const fp_t *a)
+{
+    modsqr(*a, *out); // out = a^2
+}
+
+void
+fp_mul(fp_t *out, const fp_t *a, const fp_t *b)
+{
+    modmul(*a, *b, *out); // out = a * b
+}
+
+void
+fp_inv(fp_t *x)
+{
+    modinv(*x, NULL, *x); // in place; NULL makes modinv compute the progenitor itself
+}
+
+uint32_t
+fp_is_square(const fp_t *a)
+{
+    return -(uint32_t)modqr(NULL, *a); // modqr's 1/0 (1 also for a == 0) widened to a mask
+}
+
+void
+fp_sqrt(fp_t *a)
+{
+    modsqrt(*a, NULL, *a); // in place; does not check that a is a square
+}
+
+void
+fp_half(fp_t *out, const fp_t *a)
+{
+    modmul(TWO_INV, *a, *out); // halving is a multiplication by the constant 2^-1
+}
+
+void
+fp_exp3div4(fp_t *out, const fp_t *a)
+{
+    modpro(*a, *out); // the exponentiation is exactly the generated "progenitor" routine
+}
+
+void
+fp_div3(fp_t *out, const fp_t *a)
+{
+    modmul(THREE_INV, *a, *out); // division by 3 is a multiplication by the constant 3^-1
+}
+
+void
+fp_encode(void *dst, const fp_t *a)
+{
+    // Modified version of modexp(): writes 32 bytes to dst, least-significant byte first.
+    unsigned char *out = dst; // unsigned char, as in fp_decode(): plain char may be signed
+    spint c[9];
+    redc(*a, c);
+    for (int i = 0; i < 32; i++) {
+        out[i] = (unsigned char)(c[0] & (spint)0xff);
+        (void)modshr(8, c);
+    }
+}
+
+uint32_t
+fp_decode(fp_t *d, const void *src)
+{
+    // Modified version of modimp(): reads 32 bytes, src[0] being the least-significant one
+    int i;
+    spint res;
+    const unsigned char *b = src;
+    for (i = 0; i < 9; i++) {
+        (*d)[i] = 0;
+    }
+    for (i = 31; i >= 0; i--) {
+        modshl(8, *d);
+        (*d)[0] += (spint)b[i];
+    }
+    res = (spint)-modfsb(*d); // modfsb returns 1/0; negation turns it into an all-ones/zero mask
+    nres(*d, *d);
+    // If the value was canonical then res = -1; otherwise, res = 0
+    for (i = 0; i < 9; i++) {
+        (*d)[i] &= res; // a non-canonical input leaves d zeroed, without branching
+    }
+    return (uint32_t)res;
+}
+
+static inline unsigned char
+add_carry(unsigned char cc, spint a, spint b, spint *d) // *d = low word of a + b + cc; returns the carry out
+{
+    const udpint sum = (udpint)cc + (udpint)a + (udpint)b;
+    *d = (spint)sum;
+    return (unsigned char)(sum >> Wordlength);
+}
+
+static void
+partial_reduce(spint *out, const spint *src) // 8 saturated 32-bit words in, 8 words out
+{
+    spint h, l, quo, rem;
+    unsigned char cc;
+
+    // Split value in high (8 bits) and low (248 bits) parts.
+    h = src[7] >> 24; // src[7] is read before any write, so out may be the same array as src
+    l = src[7] & 0x00FFFFFF;
+
+    // 5*2^248 = 1 mod p (p = 5*2^248 - 1); hence, we add floor(h/5) + (h mod 5)*2^248
+    // to the low part.
+    quo = (h * 0xCD) >> 10; // floor(h / 5) for the 8-bit h, computed without a division
+    rem = h - (5 * quo);
+    cc = add_carry(0, src[0], quo, &out[0]);
+    cc = add_carry(cc, src[1], 0, &out[1]);
+    cc = add_carry(cc, src[2], 0, &out[2]);
+    cc = add_carry(cc, src[3], 0, &out[3]);
+    cc = add_carry(cc, src[4], 0, &out[4]);
+    cc = add_carry(cc, src[5], 0, &out[5]);
+    cc = add_carry(cc, src[6], 0, &out[6]);
+    (void)add_carry(cc, l, rem << 24, &out[7]); // rem * 2^248 sits at bit 24 of the top word
+}
+
+// Store x at dst as 4 bytes, least-significant byte first.
+static inline void
+enc32le(void *dst, uint32_t x)
+{
+    uint8_t *out = dst;
+
+    for (unsigned k = 0; k < 4; k++) {
+        out[k] = (uint8_t)(x >> (8 * k));
+    }
+}
+
+// Load the 4 bytes at src as a 32-bit integer, least-significant byte first.
+static inline uint32_t
+dec32le(const void *src)
+{
+    const uint8_t *in = src;
+    return ((spint)in[3] << 24) | ((spint)in[2] << 16) | ((spint)in[1] << 8) | (spint)in[0];
+}
+
+void
+fp_decode_reduce(fp_t *d, const void *src, size_t len)
+{
+    // Reads src as one little-endian integer of len bytes and accumulates it 32 bytes
+    // at a time, starting from the highest-address block: before each further block
+    // is added, the accumulator is multiplied by R2 (see its definition above).
+    const uint8_t *in = src;
+    uint32_t words[8]; // Stores Nbytes * 8 bits
+    uint8_t block[32]; // Nbytes
+
+    fp_set_zero(d);
+    if (len == 0) {
+        return;
+    }
+
+    const size_t tail = len % 32;
+    if (tail != 0) {
+        // Input size is not a multiple of 32, we decode a partial
+        // block, which is already less than 2^248.
+        len -= tail;
+        memset(block, 0, sizeof block);
+        memcpy(block, in + len, tail);
+        fp_decode(d, block);
+    }
+
+    // Process all remaining blocks, in descending address order.
+    while (len > 0) {
+        fp_t term;
+
+        fp_mul(d, d, &R2);
+        len -= 32;
+
+        // Load the block as eight 32-bit words.
+        for (size_t w = 0; w < 8; w++) {
+            words[w] = dec32le(in + len + 4 * w);
+        }
+
+        // Fold the top byte into the low 248 bits, then re-serialize: fp_decode()
+        // zeroes its output for values that are not below the modulus.
+        partial_reduce(words, words);
+        for (size_t w = 0; w < 8; w++) {
+            enc32le(block + 4 * w, words[w]);
+        }
+
+        fp_decode(&term, block);
+        fp_add(d, d, &term);
+    }
+}
diff --git a/src/pqm4/sqisign_lvl1/ref/hd.c b/src/pqm4/sqisign_lvl1/ref/hd.c
new file mode 100644
index 0000000..0424108
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/hd.c
@@ -0,0 +1,93 @@
+#include <hd.h>
+#include <assert.h>
+
+void
+double_couple_point(theta_couple_point_t *out, const theta_couple_point_t *in, const theta_couple_curve_t *E1E2)
+{ // out = [2] in: each component is doubled independently on its own curve
+    ec_dbl(&out->P1, &in->P1, &E1E2->E1); // first component, on E1
+    ec_dbl(&out->P2, &in->P2, &E1E2->E2); // second component, on E2
+}
+
+void
+double_couple_point_iter(theta_couple_point_t *out,
+                         unsigned n,
+                         const theta_couple_point_t *in,
+                         const theta_couple_curve_t *E1E2)
+{
+    if (n == 0) { // nothing to double: plain copy (memmove tolerates out == in)
+        memmove(out, in, sizeof(theta_couple_point_t));
+        return;
+    }
+    double_couple_point(out, in, E1E2); // the first doubling reads `in`
+    for (unsigned left = n - 1; left > 0; left--) { // remaining n - 1 doublings, in place
+        double_couple_point(out, out, E1E2);
+    }
+}
+
+void
+add_couple_jac_points(theta_couple_jac_point_t *out,
+                      const theta_couple_jac_point_t *T1,
+                      const theta_couple_jac_point_t *T2,
+                      const theta_couple_curve_t *E1E2)
+{ // component-wise addition: out = (T1.P1 + T2.P1, T1.P2 + T2.P2)
+    ADD(&out->P1, &T1->P1, &T2->P1, &E1E2->E1); // first components, on E1
+    ADD(&out->P2, &T1->P2, &T2->P2, &E1E2->E2); // second components, on E2
+}
+
+void
+double_couple_jac_point(theta_couple_jac_point_t *out,
+                        const theta_couple_jac_point_t *in,
+                        const theta_couple_curve_t *E1E2)
+{ // out = [2] in, on points carrying a Y coordinate (jac_point_t components)
+    DBL(&out->P1, &in->P1, &E1E2->E1); // first component, on E1
+    DBL(&out->P2, &in->P2, &E1E2->E2); // second component, on E2
+}
+
+void
+double_couple_jac_point_iter(theta_couple_jac_point_t *out,
+                             unsigned n,
+                             const theta_couple_jac_point_t *in,
+                             const theta_couple_curve_t *E1E2)
+{
+    if (n == 0) {
+        *out = *in;
+        return;
+    }
+    if (n == 1) {
+        double_couple_jac_point(out, in, E1E2);
+        return;
+    }
+
+    // n >= 2: convert both components with jac_to_ws, apply DBLW n times
+    // in place, then convert back with jac_from_ws.
+    fp2_t a1, a2, t1, t2;
+    jac_to_ws(&out->P1, &t1, &a1, &in->P1, &E1E2->E1);
+    jac_to_ws(&out->P2, &t2, &a2, &in->P2, &E1E2->E2);
+    for (unsigned round = 0; round < n; round++) {
+        DBLW(&out->P1, &t1, &out->P1, &t1);
+        DBLW(&out->P2, &t2, &out->P2, &t2);
+    }
+    jac_from_ws(&out->P1, &out->P1, &a1, &E1E2->E1);
+    jac_from_ws(&out->P2, &out->P2, &a2, &E1E2->E2);
+}
+
+void
+couple_jac_to_xz(theta_couple_point_t *P, const theta_couple_jac_point_t *xyP)
+{ // converts each (X : Y : Z) component into an (X : Z) point via jac_to_xz
+    jac_to_xz(&P->P1, &xyP->P1); // first component
+    jac_to_xz(&P->P2, &xyP->P2); // second component
+}
+
+void
+copy_bases_to_kernel(theta_kernel_couple_points_t *ker, const ec_basis_t *B1, const ec_basis_t *B2)
+{
+    // T1 = (B1->P, B2->P)
+    copy_point(&ker->T1.P1, &B1->P);
+    copy_point(&ker->T1.P2, &B2->P);
+    // T2 = (B1->Q, B2->Q)
+    copy_point(&ker->T2.P1, &B1->Q);
+    copy_point(&ker->T2.P2, &B2->Q);
+    // T1 - T2 = (B1->PmQ, B2->PmQ)
+    copy_point(&ker->T1m2.P1, &B1->PmQ);
+    copy_point(&ker->T1m2.P2, &B2->PmQ);
+}
diff --git a/src/pqm4/sqisign_lvl1/ref/hd.h b/src/pqm4/sqisign_lvl1/ref/hd.h
new file mode 100644
index 0000000..2b16e23
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/hd.h
@@ -0,0 +1,435 @@
+/** @file
+ *
+ * @authors Antonin Leroux
+ *
+ * @brief The HD-isogenies algorithm required by the signature
+ *
+ */
+
+#ifndef HD_H
+#define HD_H
+
+#include <sqisign_namespace.h>
+#include <ec.h>
+#include <stdio.h>
+
+/** @defgroup hd_module Abelian surfaces and their isogenies
+ * @{
+ */
+
+#define HD_extra_torsion 2
+
+/** @defgroup hd_struct Data structures for dimension 2
+ * @{
+ */
+
+/** @brief Type for couple point with XZ coordinates
+ * @typedef theta_couple_point_t
+ *
+ * @struct theta_couple_point
+ *
+ * Structure for the couple point on an elliptic product
+ * using XZ coordinates
+ */
+typedef struct theta_couple_point
+{
+    ec_point_t P1;
+    ec_point_t P2;
+} theta_couple_point_t;
+
+/** @brief Type for three couple points T1, T2, T1-T2 with XZ coordinates
+ * @typedef theta_kernel_couple_points_t
+ *
+ * @struct theta_kernel_couple_points
+ *
+ * Structure for a triple of theta couple points T1, T2 and T1 - T2
+ */
+typedef struct theta_kernel_couple_points
+{
+    theta_couple_point_t T1;
+    theta_couple_point_t T2;
+    theta_couple_point_t T1m2;
+} theta_kernel_couple_points_t;
+
+/** @brief Type for couple point with XYZ coordinates
+ * @typedef theta_couple_jac_point_t
+ *
+ * @struct theta_couple_jac_point
+ *
+ * Structure for the couple point on an elliptic product
+ * using XYZ coordinates
+ */
+typedef struct theta_couple_jac_point
+{
+    jac_point_t P1;
+    jac_point_t P2;
+} theta_couple_jac_point_t;
+
+/** @brief Type for couple curve *
+ * @typedef theta_couple_curve_t
+ *
+ * @struct theta_couple_curve
+ *
+ * Structure holding the two curves E1 and E2 of an elliptic product E1 x E2
+ */
+typedef struct theta_couple_curve
+{
+    ec_curve_t E1;
+    ec_curve_t E2;
+} theta_couple_curve_t;
+
+/** @brief Type for a product E1 x E2 with corresponding bases
+ * @typedef theta_couple_curve_with_basis_t
+ *
+ * @struct theta_couple_curve_with_basis
+ *
+ * Type for a product E1 x E2 with corresponding bases of Ei[2^n]
+ */
+typedef struct theta_couple_curve_with_basis
+{
+    ec_curve_t E1;
+    ec_curve_t E2;
+    ec_basis_t B1;
+    ec_basis_t B2;
+} theta_couple_curve_with_basis_t;
+
+/** @brief Type for theta point *
+ * @typedef theta_point_t
+ *
+ * @struct theta_point
+ *
+ * the  theta_point structure used
+ */
+typedef struct theta_point
+{
+    fp2_t x;
+    fp2_t y;
+    fp2_t z;
+    fp2_t t;
+} theta_point_t;
+
+/** @brief Type for theta point with repeating components
+ * @typedef theta_point_compact_t
+ *
+ * @struct theta_point_compact
+ *
+ * the  theta_point structure used for points with repeated components
+ */
+typedef struct theta_point_compact
+{
+    fp2_t x;
+    fp2_t y;
+} theta_point_compact_t;
+
+/** @brief Type for theta structure *
+ * @typedef theta_structure_t
+ *
+ * @struct theta_structure
+ *
+ * the  theta_structure structure used
+ */
+typedef struct theta_structure
+{
+    theta_point_t null_point;
+    bool precomputation;
+
+    // Eight precomputed values used for doubling and
+    // (2,2)-isogenies.
+    fp2_t XYZ0;
+    fp2_t YZT0;
+    fp2_t XZT0;
+    fp2_t XYT0;
+
+    fp2_t xyz0;
+    fp2_t yzt0;
+    fp2_t xzt0;
+    fp2_t xyt0;
+} theta_structure_t;
+
+/** @brief A 2x2 matrix used for action by translation
+ * @typedef translation_matrix_t
+ *
+ * @struct translation_matrix
+ *
+ * Structure to hold 4 fp2_t elements representing a 2x2 matrix used when computing
+ * a compatible theta structure during gluing.
+ */
+typedef struct translation_matrix
+{
+    fp2_t g00;
+    fp2_t g01;
+    fp2_t g10;
+    fp2_t g11;
+} translation_matrix_t;
+
+/** @brief A 4x4 matrix used for basis changes
+ * @typedef basis_change_matrix_t
+ *
+ * @struct basis_change_matrix
+ *
+ * Structure to hold 16 elements representing a 4x4 matrix used for changing
+ * the basis of a theta point.
+ */
+typedef struct basis_change_matrix
+{
+    fp2_t m[4][4];
+} basis_change_matrix_t;
+
+/** @brief Type for gluing (2,2) theta isogeny *
+ * @typedef theta_gluing_t
+ *
+ * @struct theta_gluing
+ *
+ * the  theta_gluing structure
+ */
+typedef struct theta_gluing
+{
+
+    theta_couple_curve_t domain;
+    theta_couple_jac_point_t xyK1_8;
+    theta_point_compact_t imageK1_8;
+    basis_change_matrix_t M;
+    theta_point_t precomputation;
+    theta_point_t codomain;
+
+} theta_gluing_t;
+
+/** @brief Type for standard (2,2) theta isogeny *
+ * @typedef theta_isogeny_t
+ *
+ * @struct theta_isogeny
+ *
+ * the  theta_isogeny structure
+ */
+typedef struct theta_isogeny
+{
+    theta_point_t T1_8;
+    theta_point_t T2_8;
+    bool hadamard_bool_1;
+    bool hadamard_bool_2;
+    theta_structure_t domain;
+    theta_point_t precomputation;
+    theta_structure_t codomain;
+} theta_isogeny_t;
+
+/** @brief Type for splitting isomorphism *
+ * @typedef theta_splitting_t
+ *
+ * @struct theta_splitting
+ *
+ * the theta_splitting structure
+ */
+typedef struct theta_splitting
+{
+    basis_change_matrix_t M;
+    theta_structure_t B;
+
+} theta_splitting_t;
+
+// end of hd_struct
+/**
+ * @}
+ */
+
+/** @defgroup hd_functions Functions for dimension 2
+ * @{
+ */
+
+/**
+ * @brief Compute the double of the theta couple point in on the elliptic product E12
+ *
+ * @param out Output: the theta_couple_point
+ * @param in the theta couple point in the elliptic product
+ * @param E1E2 an elliptic product
+ * in = (P1,P2)
+ * out = [2] (P1,P2)
+ *
+ */
+void double_couple_point(theta_couple_point_t *out, const theta_couple_point_t *in, const theta_couple_curve_t *E1E2);
+
+/**
+ * @brief Compute the iterated double of the theta couple point in on the elliptic product E12
+ *
+ * @param out Output: the theta_couple_point
+ * @param n : the number of iteration
+ * @param E1E2 an elliptic product
+ * @param in the theta couple point in the elliptic product
+ * in = (P1,P2)
+ * out = [2^n] (P1,P2)
+ *
+ */
+void double_couple_point_iter(theta_couple_point_t *out,
+                              unsigned n,
+                              const theta_couple_point_t *in,
+                              const theta_couple_curve_t *E1E2);
+
+/**
+ * @brief Compute the addition of two points in (X : Y : Z) coordinates on the elliptic product E12
+ *
+ * @param out Output: the theta_couple_jac_point
+ * @param T1 the theta couple jac point in the elliptic product
+ * @param T2 the theta couple jac point in the elliptic product
+ * @param E1E2 an elliptic product
+ * in  = (P1, P2), (Q1, Q2)
+ * out = (P1 + Q1, P2 + Q2)
+ *
+ **/
+void add_couple_jac_points(theta_couple_jac_point_t *out,
+                           const theta_couple_jac_point_t *T1,
+                           const theta_couple_jac_point_t *T2,
+                           const theta_couple_curve_t *E1E2);
+
+/**
+ * @brief Compute the double of the theta couple jac point in on the elliptic product E12
+ *
+ * @param out Output: the theta_couple_jac_point
+ * @param in the theta couple jac point in the elliptic product
+ * @param E1E2 an elliptic product
+ * in = (P1,P2)
+ * out = [2] (P1,P2)
+ *
+ */
+void double_couple_jac_point(theta_couple_jac_point_t *out,
+                             const theta_couple_jac_point_t *in,
+                             const theta_couple_curve_t *E1E2);
+
+/**
+ * @brief Compute the iterated double of the theta couple jac point in on the elliptic product E12
+ *
+ * @param out Output: the theta_couple_jac_point
+ * @param n : the number of iteration
+ * @param in the theta couple jac point in the elliptic product
+ * @param E1E2 an elliptic product
+ * in  = (P1,P2)
+ * out = [2^n] (P1,P2)
+ *
+ */
+void double_couple_jac_point_iter(theta_couple_jac_point_t *out,
+                                  unsigned n,
+                                  const theta_couple_jac_point_t *in,
+                                  const theta_couple_curve_t *E1E2);
+
+/**
+ * @brief A forgetful function which returns (X : Z) points given a pair of (X : Y : Z) points
+ *
+ * @param P Output: the theta_couple_point
+ * @param xyP : the theta_couple_jac_point
+ **/
+void couple_jac_to_xz(theta_couple_point_t *P, const theta_couple_jac_point_t *xyP);
+
+/**
+ * @brief Compute a (2,2) isogeny chain in dimension 2 between elliptic
+ * products in the theta_model and evaluate at a list of points of the form
+ * (P1,0) or (0,P2). Returns 0 if the codomain fails to split (or there is
+ * an error during the computation) and 1 otherwise.
+ *
+ * @param n : the length of the isogeny chain
+ * @param E12 an elliptic curve product
+ * @param ker T1, T2 and T1-T2. couple points on E12[2^(n+2)]
+ * @param extra_torsion boolean indicating if we give the points in E12[2^n] or
+ * E12[2^(n+HD_extra_torsion)]
+ * @param E34 Output: the codomain curve
+ * @param P12 Input/Output: pointer to points to be pushed through the isogeny (in-place)
+ * @param numP: length of the list of points given in P12 (can be zero)
+ * @returns 1 on success 0 on failure
+ *
+ */
+int theta_chain_compute_and_eval(unsigned n,
+                                 /*const*/ theta_couple_curve_t *E12,
+                                 const theta_kernel_couple_points_t *ker,
+                                 bool extra_torsion,
+                                 theta_couple_curve_t *E34,
+                                 theta_couple_point_t *P12,
+                                 size_t numP);
+
+/**
+ * @brief Compute a (2,2) isogeny chain in dimension 2 between elliptic
+ * products in the theta_model and evaluate at a list of points of the form
+ * (P1,0) or (0,P2). Returns 0 if the codomain fails to split (or there is
+ * an error during the computation) and 1 otherwise.
+ * Compared to theta_chain_compute_and_eval, it does extra isotropy
+ * checks on the kernel.
+ *
+ * @param n : the length of the isogeny chain
+ * @param E12 an elliptic curve product
+ * @param ker T1, T2 and T1-T2. couple points on E12[2^(n+2)]
+ * @param extra_torsion boolean indicating if we give the points in E12[2^n] or
+ * E12[2^(n+HD_extra_torsion)]
+ * @param E34 Output: the codomain curve
+ * @param P12 Input/Output: pointer to points to be pushed through the isogeny (in-place)
+ * @param numP: length of the list of points given in P12 (can be zero)
+ * @returns 1 on success 0 on failure
+ *
+ */
+int theta_chain_compute_and_eval_verify(unsigned n,
+                                        /*const*/ theta_couple_curve_t *E12,
+                                        const theta_kernel_couple_points_t *ker,
+                                        bool extra_torsion,
+                                        theta_couple_curve_t *E34,
+                                        theta_couple_point_t *P12,
+                                        size_t numP);
+
+/**
+ * @brief Compute a (2,2) isogeny chain in dimension 2 between elliptic
+ * products in the theta_model and evaluate at a list of points of the form
+ * (P1,0) or (0,P2). Returns 0 if the codomain fails to split (or there is
+ * an error during the computation) and 1 otherwise.
+ * Compared to theta_chain_compute_and_eval, it selects a random Montgomery
+ * model of the codomain.
+ *
+ * @param n : the length of the isogeny chain
+ * @param E12 an elliptic curve product
+ * @param ker T1, T2 and T1-T2. couple points on E12[2^(n+2)]
+ * @param extra_torsion boolean indicating if we give the points in E12[2^n] or
+ * E12[2^(n+HD_extra_torsion)]
+ * @param E34 Output: the codomain curve
+ * @param P12 Input/Output: pointer to points to be pushed through the isogeny (in-place)
+ * @param numP: length of the list of points given in P12 (can be zero)
+ * @returns 1 on success, 0 on failure
+ *
+ */
+int theta_chain_compute_and_eval_randomized(unsigned n,
+                                            /*const*/ theta_couple_curve_t *E12,
+                                            const theta_kernel_couple_points_t *ker,
+                                            bool extra_torsion,
+                                            theta_couple_curve_t *E34,
+                                            theta_couple_point_t *P12,
+                                            size_t numP);
+
+/**
+ * @brief Given bases B1 on E1 and B2 on E2, copies them to create a kernel
+ *         on E1 x E2 as couple points T1, T2 and T1 - T2
+ *
+ * @param ker Output: a kernel for dim_two_isogenies (T1, T2, T1-T2)
+ * @param B1 Input basis on E1
+ * @param B2 Input basis on E2
+ **/
+void copy_bases_to_kernel(theta_kernel_couple_points_t *ker, const ec_basis_t *B1, const ec_basis_t *B2);
+
+/**
+ * @brief Given a couple of points (P1, P2) on a couple of curves (E1, E2)
+ * this function tests if both points are of order exactly 2^t
+ *
+ * @param T: couple point (P1, P2)
+ * @param E: a couple of curves (E1, E2)
+ * @param t: an integer
+ * @returns 0xFFFFFFFF on success, 0 on failure
+ */
+static inline int
+test_couple_point_order_twof(const theta_couple_point_t *T, const theta_couple_curve_t *E, int t)
+{
+    int check_P1 = test_point_order_twof(&T->P1, &E->E1, t); // P1 is tested on E1
+    int check_P2 = test_point_order_twof(&T->P2, &E->E2, t); // P2 is tested on E2
+
+    return check_P1 & check_P2; // bitwise AND of the two results: both checks must succeed
+}
+
+// end of hd_functions
+/**
+ * @}
+ */
+// end of hd_module
+/**
+ * @}
+ */
+#endif
diff --git a/src/pqm4/sqisign_lvl1/ref/hd_splitting_transforms.c b/src/pqm4/sqisign_lvl1/ref/hd_splitting_transforms.c
new file mode 100644
index 0000000..6332d21
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/hd_splitting_transforms.c
@@ -0,0 +1,143 @@
+#include <hd_splitting_transforms.h>
+
+#define FP2_ZERO 0      // one-byte entry codes used by the transform tables in this file;
+#define FP2_ONE 1       // presumably each code is an index into FP2_CONSTANTS - TODO confirm
+#define FP2_I 2
+#define FP2_MINUS_ONE 3
+#define FP2_MINUS_I 4
+
+const int EVEN_INDEX[10][2] = {{0, 0}, {0, 1}, {0, 2}, {0, 3}, {1, 0}, {1, 2}, {2, 0}, {2, 1}, {3, 0}, {3, 3}}; // ten index pairs, each index in 0..3
+const int CHI_EVAL[4][4] = {{1, 1, 1, 1}, {1, -1, 1, -1}, {1, 1, -1, -1}, {1, -1, -1, 1}}; // 4x4 table of +1/-1 signs
+const fp2_t FP2_CONSTANTS[5] = {{
+#if 0
+#elif RADIX == 16
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 32
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x0, 0x0, 0x0, 0x0}
+#else
+{0x0, 0x0, 0x0, 0x0, 0x0}
+#endif
+#endif
+, 
+#if 0
+#elif RADIX == 16
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 32
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x0, 0x0, 0x0, 0x0}
+#else
+{0x0, 0x0, 0x0, 0x0, 0x0}
+#endif
+#endif
+}, {
+#if 0
+#elif RADIX == 16
+{0x333, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2}
+#elif RADIX == 32
+{0x666, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x20000}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x33, 0x0, 0x0, 0x100000000000000}
+#else
+{0x19, 0x0, 0x0, 0x0, 0x300000000000}
+#endif
+#endif
+, 
+#if 0
+#elif RADIX == 16
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 32
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x0, 0x0, 0x0, 0x0}
+#else
+{0x0, 0x0, 0x0, 0x0, 0x0}
+#endif
+#endif
+}, {
+#if 0
+#elif RADIX == 16
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 32
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x0, 0x0, 0x0, 0x0}
+#else
+{0x0, 0x0, 0x0, 0x0, 0x0}
+#endif
+#endif
+, 
+#if 0
+#elif RADIX == 16
+{0x333, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2}
+#elif RADIX == 32
+{0x666, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x20000}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x33, 0x0, 0x0, 0x100000000000000}
+#else
+{0x19, 0x0, 0x0, 0x0, 0x300000000000}
+#endif
+#endif
+}, {
+#if 0
+#elif RADIX == 16
+{0x1ccc, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x7}
+#elif RADIX == 32
+{0x1ffff999, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x2ffff}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0xffffffffffffffcc, 0xffffffffffffffff, 0xffffffffffffffff, 0x3ffffffffffffff}
+#else
+{0x7ffffffffffe6, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff, 0x1fffffffffff}
+#endif
+#endif
+, 
+#if 0
+#elif RADIX == 16
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 32
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x0, 0x0, 0x0, 0x0}
+#else
+{0x0, 0x0, 0x0, 0x0, 0x0}
+#endif
+#endif
+}, {
+#if 0
+#elif RADIX == 16
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 32
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x0, 0x0, 0x0, 0x0}
+#else
+{0x0, 0x0, 0x0, 0x0, 0x0}
+#endif
+#endif
+, 
+#if 0
+#elif RADIX == 16
+{0x1ccc, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x7}
+#elif RADIX == 32
+{0x1ffff999, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x2ffff}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0xffffffffffffffcc, 0xffffffffffffffff, 0xffffffffffffffff, 0x3ffffffffffffff}
+#else
+{0x7ffffffffffe6, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff, 0x1fffffffffff}
+#endif
+#endif
+}};
+const precomp_basis_change_matrix_t SPLITTING_TRANSFORMS[10] = {{{{FP2_ONE, FP2_I, FP2_ONE, FP2_I}, {FP2_ONE, FP2_MINUS_I, FP2_MINUS_ONE, FP2_I}, {FP2_ONE, FP2_I, FP2_MINUS_ONE, FP2_MINUS_I}, {FP2_MINUS_ONE, FP2_I, FP2_MINUS_ONE, FP2_I}}}, {{{FP2_ONE, FP2_ZERO, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ZERO, FP2_ONE}, {FP2_ZERO, FP2_ZERO, FP2_ONE, FP2_ZERO}, {FP2_ZERO, FP2_MINUS_ONE, FP2_ZERO, FP2_ZERO}}}, {{{FP2_ONE, FP2_ZERO, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ONE, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ZERO, FP2_ONE}, {FP2_ZERO, FP2_ZERO, FP2_MINUS_ONE, FP2_ZERO}}}, {{{FP2_ONE, FP2_ZERO, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ONE, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ONE, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ZERO, FP2_MINUS_ONE}}}, {{{FP2_ONE, FP2_ONE, FP2_ONE, FP2_ONE}, {FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE, FP2_ONE}, {FP2_ONE, FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE}, {FP2_MINUS_ONE, FP2_ONE, FP2_MINUS_ONE, FP2_ONE}}}, {{{FP2_ONE, FP2_ZERO, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ONE, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ZERO, FP2_ONE}, {FP2_ZERO, FP2_ZERO, FP2_ONE, FP2_ZERO}}}, {{{FP2_ONE, FP2_ONE, FP2_ONE, FP2_ONE}, {FP2_ONE, FP2_MINUS_ONE, FP2_ONE, FP2_MINUS_ONE}, {FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE, FP2_ONE}, {FP2_MINUS_ONE, FP2_MINUS_ONE, FP2_ONE, FP2_ONE}}}, {{{FP2_ONE, FP2_ONE, FP2_ONE, FP2_ONE}, {FP2_ONE, FP2_MINUS_ONE, FP2_ONE, FP2_MINUS_ONE}, {FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE, FP2_ONE}, {FP2_ONE, FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE}}}, {{{FP2_ONE, FP2_ONE, FP2_ONE, FP2_ONE}, {FP2_ONE, FP2_MINUS_ONE, FP2_ONE, FP2_MINUS_ONE}, {FP2_ONE, FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE}, {FP2_MINUS_ONE, FP2_ONE, FP2_ONE, FP2_MINUS_ONE}}}, {{{FP2_ONE, FP2_ZERO, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ONE, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ONE, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ZERO, FP2_ONE}}}};
+const precomp_basis_change_matrix_t NORMALIZATION_TRANSFORMS[6] = {{{{FP2_ONE, FP2_ZERO, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ONE, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ONE, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ZERO, FP2_ONE}}}, {{{FP2_ZERO, FP2_ZERO, FP2_ZERO, FP2_ONE}, {FP2_ZERO, FP2_ZERO, FP2_ONE, FP2_ZERO}, {FP2_ZERO, FP2_ONE, FP2_ZERO, FP2_ZERO}, {FP2_ONE, FP2_ZERO, FP2_ZERO, FP2_ZERO}}}, {{{FP2_ONE, FP2_ONE, FP2_ONE, FP2_ONE}, {FP2_ONE, FP2_MINUS_ONE, FP2_ONE, FP2_MINUS_ONE}, {FP2_ONE, FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE}, {FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE, FP2_ONE}}}, {{{FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE, FP2_ONE}, {FP2_MINUS_ONE, FP2_MINUS_ONE, FP2_ONE, FP2_ONE}, {FP2_MINUS_ONE, FP2_ONE, FP2_MINUS_ONE, FP2_ONE}, {FP2_ONE, FP2_ONE, FP2_ONE, FP2_ONE}}}, {{{FP2_MINUS_ONE, FP2_I, FP2_I, FP2_ONE}, {FP2_I, FP2_MINUS_ONE, FP2_ONE, FP2_I}, {FP2_I, FP2_ONE, FP2_MINUS_ONE, FP2_I}, {FP2_ONE, FP2_I, FP2_I, FP2_MINUS_ONE}}}, {{{FP2_ONE, FP2_I, FP2_I, FP2_MINUS_ONE}, {FP2_I, FP2_ONE, FP2_MINUS_ONE, FP2_I}, {FP2_I, FP2_MINUS_ONE, FP2_ONE, FP2_I}, {FP2_MINUS_ONE, FP2_I, FP2_I, FP2_ONE}}}};
diff --git a/src/pqm4/sqisign_lvl1/ref/hd_splitting_transforms.h b/src/pqm4/sqisign_lvl1/ref/hd_splitting_transforms.h
new file mode 100644
index 0000000..b3147a4
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/hd_splitting_transforms.h
@@ -0,0 +1,18 @@
+#ifndef HD_SPLITTING_H
+#define HD_SPLITTING_H
+
+#include <hd.h>
+#include <stdint.h>
+
+typedef struct precomp_basis_change_matrix {
+    uint8_t m[4][4]; // 4x4 matrix stored as one-byte entry codes rather than fp2_t values
+} precomp_basis_change_matrix_t;
+
+extern const int EVEN_INDEX[10][2];  // ten pairs of indices
+extern const int CHI_EVAL[4][4];     // 4x4 table of signs
+extern const fp2_t FP2_CONSTANTS[5]; // five fp2_t constants
+extern const precomp_basis_change_matrix_t SPLITTING_TRANSFORMS[10];    // ten precomputed 4x4 code matrices
+extern const precomp_basis_change_matrix_t NORMALIZATION_TRANSFORMS[6]; // six precomputed 4x4 code matrices
+
+#endif
+
diff --git a/src/pqm4/sqisign_lvl1/ref/isog.h b/src/pqm4/sqisign_lvl1/ref/isog.h
new file mode 100644
index 0000000..b251ca3
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/isog.h
@@ -0,0 +1,28 @@
+#ifndef _ISOG_H_
+#define _ISOG_H_
+#include <sqisign_namespace.h>
+#include <ec.h>
+
+/* KPS structure for isogenies of degree 2 or 4 */
+typedef struct
+{
+    ec_point_t K;
+} ec_kps2_t;
+typedef struct
+{
+    ec_point_t K[3];
+} ec_kps4_t;
+
+void xisog_2(ec_kps2_t *kps, ec_point_t *B, const ec_point_t P); // degree-2 isogeny construction
+void xisog_2_singular(ec_kps2_t *kps, ec_point_t *B24, ec_point_t A24); // variant taking A24 instead of a kernel point
+
+void xisog_4(ec_kps4_t *kps, ec_point_t *B, const ec_point_t P); // degree-4 isogeny construction
+void xisog_4_singular(ec_kps4_t *kps, ec_point_t *B24, const ec_point_t P, ec_point_t A24); // variant taking both P and A24
+
+void xeval_2(ec_point_t *R, ec_point_t *const Q, const int lenQ, const ec_kps2_t *kps); // evaluation at lenQ points; callers pass R == Q
+void xeval_2_singular(ec_point_t *R, const ec_point_t *Q, const int lenQ, const ec_kps2_t *kps); // pairs with xisog_2_singular
+
+void xeval_4(ec_point_t *R, const ec_point_t *Q, const int lenQ, const ec_kps4_t *kps); // evaluation at lenQ points; callers pass R == Q
+void xeval_4_singular(ec_point_t *R, const ec_point_t *Q, const int lenQ, const ec_point_t P, const ec_kps4_t *kps); // presumably pairs with xisog_4_singular - TODO confirm
+
+#endif
diff --git a/src/pqm4/sqisign_lvl1/ref/isog_chains.c b/src/pqm4/sqisign_lvl1/ref/isog_chains.c
new file mode 100644
index 0000000..abc9808
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/isog_chains.c
@@ -0,0 +1,241 @@
+#include "isog.h"
+#include <assert.h>
+
+// since we use degree 4 isogeny steps, we need to handle the odd case with care
+static uint32_t
+ec_eval_even_strategy(ec_curve_t *curve,
+                      ec_point_t *points,
+                      unsigned len_points,
+                      const ec_point_t *kernel,
+                      const int isog_len)
+{ // returns 0 on success and (uint32_t)-1 on failure; curve and points are updated in place
+    ec_curve_normalize_A24(curve);
+    ec_point_t A24;
+    copy_point(&A24, &curve->A24);
+
+    int space = 1; // stack depth: 1 + number of times 1 must be doubled to reach isog_len
+    for (int i = 1; i < isog_len; i *= 2)
+        ++space;
+
+    // Stack of remaining kernel points and their associated orders
+    ec_point_t splits[space];
+    uint16_t todo[space];
+    splits[0] = *kernel;
+    todo[0] = isog_len;
+
+    int current = 0; // Pointer to current top of stack
+
+    // Chain of 4-isogenies
+    for (int j = 0; j < isog_len / 2; ++j) {
+        assert(current >= 0);
+        assert(todo[current] >= 1);
+        // Get the next point of order 4
+        while (todo[current] != 2) {
+            assert(todo[current] >= 3);
+            // A new split will be added
+            ++current;
+            assert(current < space);
+            // We set the seed of the new split to be computed and saved
+            copy_point(&splits[current], &splits[current - 1]);
+            // about half the doublings, plus one extra when the count is odd, so todo[current] is always even
+            unsigned num_dbls = todo[current - 1] / 4 * 2 + todo[current - 1] % 2;
+            todo[current] = todo[current - 1] - num_dbls;
+            while (num_dbls--)
+                xDBL_A24(&splits[current], &splits[current], &A24, false);
+        }
+
+        if (j == 0) {
+            assert(fp2_is_one(&A24.z));
+            if (!ec_is_four_torsion(&splits[current], curve))
+                return (uint32_t)-1;
+
+            ec_point_t T;
+            xDBL_A24(&T, &splits[current], &A24, false);
+            if (fp2_is_zero(&T.x))
+                return (uint32_t)-1; // special isogenies not allowed
+        } else {
+            assert(todo[current] == 2);
+#ifndef NDEBUG
+            if (fp2_is_zero(&splits[current].z))
+                debug_print("splitting point z coordinate is unexpectedly zero");
+
+            ec_point_t test;
+            xDBL_A24(&test, &splits[current], &A24, false);
+            if (fp2_is_zero(&test.z))
+                debug_print("z coordinate is unexpectedly zero before doubling");
+            xDBL_A24(&test, &test, &A24, false);
+            if (!fp2_is_zero(&test.z))
+                debug_print("z coordinate is unexpectedly not zero after doubling");
+#endif
+        }
+
+        // Evaluate 4-isogeny
+        ec_kps4_t kps4;
+        xisog_4(&kps4, &A24, splits[current]);
+        xeval_4(splits, splits, current, &kps4); // only the entries below the top of the stack
+        for (int i = 0; i < current; ++i)
+            todo[i] -= 2;
+        xeval_4(points, points, len_points, &kps4);
+
+        --current;
+    }
+    assert(isog_len % 2 ? !current : current == -1);
+
+    // Final 2-isogeny
+    if (isog_len % 2) {
+#ifndef NDEBUG
+        if (fp2_is_zero(&splits[0].z))
+            debug_print("splitting point z coordinate is unexpectedly zero");
+        ec_point_t test;
+        copy_point(&test, &splits[0]);
+        xDBL_A24(&test, &test, &A24, false);
+        if (!fp2_is_zero(&test.z))
+            debug_print("z coordinate is unexpectedly not zero after doubling");
+#endif
+
+        // We need to check the order of this point in case there were no 4-isogenies
+        if (isog_len == 1 && !ec_is_two_torsion(&splits[0], curve))
+            return (uint32_t)-1;
+        if (fp2_is_zero(&splits[0].x)) {
+            // special isogenies not allowed
+            // this case can only happen if isog_len == 1; otherwise the
+            // previous 4-isogenies we computed ensure that $T=(0:1)$ is put
+            // as the kernel of the dual isogeny
+            return (uint32_t)-1;
+        }
+
+        ec_kps2_t kps2;
+        xisog_2(&kps2, &A24, splits[0]);
+        xeval_2(points, points, len_points, &kps2);
+    }
+
+    // Output curve in the form (A:C)
+    A24_to_AC(curve, &A24);
+
+    curve->is_A24_computed_and_normalized = false;
+
+    return 0;
+}
+
+uint32_t
+ec_eval_even(ec_curve_t *image, ec_isog_even_t *phi, ec_point_t *points, unsigned len_points)
+{ // returns the status of ec_eval_even_strategy: 0 on success, (uint32_t)-1 on failure
+    copy_curve(image, &phi->curve); // start from phi's curve; the strategy then updates `image` in place
+    return ec_eval_even_strategy(image, points, len_points, &phi->kernel, phi->length);
+}
+
+// naive implementation: `len` successive degree-2 steps, no strategy
+uint32_t
+ec_eval_small_chain(ec_curve_t *curve,
+                    const ec_point_t *kernel,
+                    int len,
+                    ec_point_t *points,
+                    unsigned len_points,
+                    bool special) // do we allow special isogenies?
+{
+    ec_point_t A24;
+    ec_point_t remaining; // kernel point carried through the chain
+    ec_point_t step_ker;  // `remaining` doubled down for the current step
+    ec_kps2_t kps;
+
+    AC_to_A24(&A24, curve);
+    copy_point(&remaining, kernel);
+
+    for (int step = 0; step < len; step++) {
+        // Double the carried point len - step - 1 times.
+        copy_point(&step_ker, &remaining);
+        for (int dbl = len - step - 1; dbl > 0; dbl--) {
+            xDBL_A24(&step_ker, &step_ker, &A24, false);
+        }
+
+        // Check the order of the point before the first isogeny step
+        if (step == 0 && !ec_is_two_torsion(&step_ker, curve)) {
+            return (uint32_t)-1;
+        }
+
+        if (!fp2_is_zero(&step_ker.x)) { // regular step
+            xisog_2(&kps, &A24, step_ker);
+            xeval_2(&remaining, &remaining, 1, &kps);
+            xeval_2(points, points, len_points, &kps);
+            continue;
+        }
+        if (!special) {
+            return (uint32_t)-1; // x == 0 and special isogenies are not allowed
+        }
+        ec_point_t B24;
+        xisog_2_singular(&kps, &B24, A24);
+        xeval_2_singular(&remaining, &remaining, 1, &kps);
+        xeval_2_singular(points, points, len_points, &kps);
+        copy_point(&A24, &B24);
+    }
+    A24_to_AC(curve, &A24);
+    curve->is_A24_computed_and_normalized = false;
+    return 0;
+}
+
+uint32_t
+ec_isomorphism(ec_isom_t *isom, const ec_curve_t *from, const ec_curve_t *to)
+{ // fills isom->Nx, isom->Nz and isom->D from the (A : C) coefficients of both curves
+    fp2_t t0, t1, t2, t3, t4;
+
+    fp2_mul(&t0, &from->A, &from->C); // fromA*fromC
+    fp2_mul(&t1, &to->A, &to->C);     // toA*toC
+
+    fp2_mul(&t2, &t1, &to->C); // toA*toC^2
+    fp2_add(&t3, &t2, &t2);    // 2*toA*toC^2
+    fp2_add(&t3, &t3, &t3);    // 4*toA*toC^2
+    fp2_add(&t3, &t3, &t3);    // 8*toA*toC^2
+    fp2_add(&t2, &t2, &t3); // 9*toA*toC^2
+    fp2_sqr(&t3, &to->A);      // toA^2
+    fp2_mul(&t3, &t3, &to->A); // toA^3
+    fp2_add(&t3, &t3, &t3);    // 2*toA^3
+    fp2_sub(&isom->Nx, &t3, &t2); // 2*toA^3-9*toA*toC^2
+    fp2_mul(&t2, &t0, &from->A);  // fromA^2*fromC
+    fp2_sqr(&t3, &from->C);       // fromC^2
+    fp2_mul(&t3, &t3, &from->C); // fromC^3
+    fp2_add(&t4, &t3, &t3);      // 2*fromC^3
+    fp2_add(&t3, &t4, &t3);             // 3*fromC^3
+    fp2_sub(&t3, &t3, &t2);             // 3*fromC^3-fromA^2*fromC
+    fp2_mul(&isom->Nx, &isom->Nx, &t3); // lambda_x = (2*toA^3-9*toA*toC^2)*(3*fromC^3-fromA^2*fromC)
+
+    fp2_mul(&t2, &t0, &from->C); // fromA*fromC^2
+    fp2_add(&t3, &t2, &t2);      // 2*fromA*fromC^2
+    fp2_add(&t3, &t3, &t3);      // 4*fromA*fromC^2
+    fp2_add(&t3, &t3, &t3);      // 8*fromA*fromC^2
+    fp2_add(&t2, &t2, &t3); // 9*fromA*fromC^2
+    fp2_sqr(&t3, &from->A);      // fromA^2
+    fp2_mul(&t3, &t3, &from->A); // fromA^3
+    fp2_add(&t3, &t3, &t3);      // 2*fromA^3
+    fp2_sub(&isom->D, &t3, &t2); // 2*fromA^3-9*fromA*fromC^2
+    fp2_mul(&t2, &t1, &to->A);   // toA^2*toC
+    fp2_sqr(&t3, &to->C);        // toC^2
+    fp2_mul(&t3, &t3, &to->C); // toC^3
+    fp2_add(&t4, &t3, &t3);    // 2*toC^3
+    fp2_add(&t3, &t4, &t3);           // 3*toC^3
+    fp2_sub(&t3, &t3, &t2);           // 3*toC^3-toA^2*toC
+    fp2_mul(&isom->D, &isom->D, &t3); // lambda_z = (2*fromA^3-9*fromA*fromC^2)*(3*toC^3-toA^2*toC)
+
+    // Mont -> SW -> SW -> Mont
+    fp2_mul(&t0, &to->C, &from->A); // toC*fromA
+    fp2_mul(&t0, &t0, &isom->Nx); // lambda_x*toC*fromA
+    fp2_mul(&t1, &from->C, &to->A); // fromC*toA
+    fp2_mul(&t1, &t1, &isom->D);  // lambda_z*fromC*toA
+    fp2_sub(&isom->Nz, &t0, &t1); // lambda_x*toC*fromA - lambda_z*fromC*toA
+    fp2_mul(&t0, &from->C, &to->C); // fromC*toC
+    fp2_add(&t1, &t0, &t0);         // 2*fromC*toC
+    fp2_add(&t0, &t0, &t1);             // 3*fromC*toC
+    fp2_mul(&isom->D, &isom->D, &t0);   // 3*lambda_z*fromC*toC
+    fp2_mul(&isom->Nx, &isom->Nx, &t0); // 3*lambda_x*fromC*toC
+
+    return (fp2_is_zero(&isom->Nx) | fp2_is_zero(&isom->D)); // OR of the two zero tests on the final Nx and D
+}
+
+void
+ec_iso_eval(ec_point_t *P, ec_isom_t *isom)
+{ // in place: (x : z) <- (Nx*x + Nz*z : D*z)
+    fp2_t tmp;
+    fp2_mul(&P->x, &P->x, &isom->Nx); // x*Nx
+    fp2_mul(&tmp, &P->z, &isom->Nz);  // z*Nz, using the old z
+    fp2_add(&P->x, &P->x, &tmp);      // x <- x*Nx + z*Nz
+    fp2_mul(&P->z, &P->z, &isom->D);  // z <- z*D
+}
diff --git a/src/pqm4/sqisign_lvl1/ref/mp.c b/src/pqm4/sqisign_lvl1/ref/mp.c
new file mode 100644
index 0000000..27f4a96
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/mp.c
@@ -0,0 +1,357 @@
+#include <mp.h>
+#include <assert.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+// Double-wide multiplication: out[0] receives the low digit of a * b and out[1] the high digit
+void
+MUL(digit_t *out, const digit_t a, const digit_t b)
+{
+#ifdef RADIX_32
+    uint64_t r = (uint64_t)a * b;
+    out[0] = r & 0xFFFFFFFFUL;
+    out[1] = r >> 32;
+
+#elif defined(RADIX_64) && defined(_MSC_VER)
+    uint64_t umul_hi;
+    out[0] = _umul128(a, b, &umul_hi);
+    out[1] = umul_hi;
+
+#elif defined(RADIX_64) && defined(HAVE_UINT128)
+    unsigned __int128 umul_tmp;
+    umul_tmp = (unsigned __int128)(a) * (unsigned __int128)(b);
+    out[0] = (uint64_t)umul_tmp;
+    out[1] = (uint64_t)(umul_tmp >> 64);
+
+#else // portable fallback: schoolbook product of the half-digit pieces of a and b
+    register digit_t al, ah, bl, bh, temp;
+    digit_t albl, albh, ahbl, ahbh, res1, res2, res3, carry;
+    digit_t mask_low = (digit_t)(-1) >> (sizeof(digit_t) * 4), mask_high = (digit_t)(-1) << (sizeof(digit_t) * 4);
+    al = a & mask_low;               // Low part
+    ah = a >> (sizeof(digit_t) * 4); // High part
+    bl = b & mask_low;
+    bh = b >> (sizeof(digit_t) * 4);
+
+    albl = al * bl; // the four half-by-half partial products
+    albh = al * bh;
+    ahbl = ah * bl;
+    ahbh = ah * bh;
+    out[0] = albl & mask_low; // out00
+
+    res1 = albl >> (sizeof(digit_t) * 4);
+    res2 = ahbl & mask_low;
+    res3 = albh & mask_low;
+    temp = res1 + res2 + res3;
+    carry = temp >> (sizeof(digit_t) * 4); // overflow of the second half-digit column
+    out[0] ^= temp << (sizeof(digit_t) * 4); // out01
+
+    res1 = ahbl >> (sizeof(digit_t) * 4);
+    res2 = albh >> (sizeof(digit_t) * 4);
+    res3 = ahbh & mask_low;
+    temp = res1 + res2 + res3 + carry;
+    out[1] = temp & mask_low; // out10
+    carry = temp & mask_high; // already positioned in the upper half, so it is added unshifted
+    out[1] ^= (ahbh & mask_high) + carry; // out11
+
+#endif
+}
+
+void
+mp_add(digit_t *c, const digit_t *a, const digit_t *b, const unsigned int nwords)
+{ // c = a + b over nwords digits; the carry out of the top digit is dropped
+    unsigned int cy = 0;
+
+    for (unsigned int k = 0; k != nwords; k++) {
+        ADDC(c[k], cy, a[k], b[k], cy);
+    }
+}
+
+digit_t
+mp_shiftr(digit_t *x, const unsigned int shift, const unsigned int nwords)
+{ // In-place right shift by 1...RADIX-1 bits; returns bit 0 of the value before shifting
+    const unsigned int top = nwords - 1;
+    const digit_t low_bit = x[0] & 1;
+    for (unsigned int k = 0; k < top; k++) {
+        SHIFTR(x[k + 1], x[k], shift, x[k], RADIX);
+    }
+    x[top] >>= shift;
+    return low_bit;
+}
+
+void
+mp_shiftl(digit_t *x, const unsigned int shift, const unsigned int nwords)
+{ // In-place left shift by 1...RADIX-1 bits; bits pushed out of the top digit are lost
+    // Walk from the top digit down so that x[k - 1] is still unshifted when it is read
+    for (int k = nwords - 1; k >= 1; k--) {
+        SHIFTL(x[k], x[k - 1], shift, x[k], RADIX);
+    }
+    x[0] <<= shift;
+}
+
+void
+multiple_mp_shiftl(digit_t *x, const unsigned int shift, const unsigned int nwords)
+{ // In-place left shift by any number of bits, applied in steps of at most RADIX-1 bits
+    unsigned int t = shift;
+    for (; t > RADIX - 1; t -= RADIX - 1) {
+        mp_shiftl(x, RADIX - 1, nwords);
+    }
+    if (t != 0) // a zero step would make SHIFTL shift right by RADIX, which is undefined
+        mp_shiftl(x, t, nwords);
+}
+
+// The below functions were taken from the EC module
+
+void
+mp_sub(digit_t *c, const digit_t *a, const digit_t *b, const unsigned int nwords)
+{ // c = a - b over nwords digits; the borrow out of the top digit is dropped
+    unsigned int bw = 0;
+
+    for (unsigned int k = 0; k != nwords; k++) {
+        SUBC(c[k], bw, a[k], b[k], bw);
+    }
+}
+
+void
+select_ct(digit_t *c, const digit_t *a, const digit_t *b, const digit_t mask, const int nwords)
+{ // Branch-free select: c <- a when mask is all zeros, c <- b when mask is all ones
+    for (int k = 0; k < nwords; k++) {
+        const digit_t diff = a[k] ^ b[k];
+        c[k] = a[k] ^ (diff & mask);
+    }
+}
+
+void
+swap_ct(digit_t *a, digit_t *b, const digit_t option, const int nwords)
+{ // Branch-free conditional swap of two nwords-digit values
+  // option = 0 leaves a and b unchanged; option = 0xFF...FF exchanges them
+
+    for (int k = 0; k < nwords; k++) {
+        const digit_t flip = (a[k] ^ b[k]) & option;
+
+        a[k] ^= flip;
+        b[k] ^= flip;
+    }
+}
+
+int
+mp_compare(const digit_t *a, const digit_t *b, unsigned int nwords)
+{ // Compares from the most significant digit down: (1) a>b, (0) a=b, (-1) a<b
+  // Returns at the first differing digit, so the running time depends on the data
+    for (int k = nwords - 1; k >= 0; k--) {
+        if (a[k] != b[k]) {
+            return (a[k] > b[k]) ? 1 : -1;
+        }
+    }
+
+    return 0;
+}
+
+bool
+mp_is_zero(const digit_t *a, unsigned int nwords)
+{ // True exactly when every one of the nwords digits of a is zero
+  // All digits are OR-ed together first, so the loop has no data-dependent exit
+    digit_t acc = 0;
+    for (unsigned int k = 0; k != nwords; k++) {
+        acc |= a[k];
+    }
+    const unsigned int zero = is_digit_zero_ct(acc);
+    return (bool)zero;
+}
+
+void
+mp_mul2(digit_t *c, const digit_t *a, const digit_t *b)
+{ // Multiprecision multiplication fixed to two-digit operands; writes four digits to c
+    unsigned int carry = 0;
+    digit_t t0[2], t1[2], t2[2];
+
+    MUL(t0, a[0], b[0]);
+    MUL(t1, a[0], b[1]);
+    ADDC(t0[1], carry, t0[1], t1[0], carry); // fold the low digit of a[0]*b[1] into digit 1
+    ADDC(t1[1], carry, 0, t1[1], carry);     // push the carry into the high digit of a[0]*b[1]
+    MUL(t2, a[1], b[1]); // NOTE(review): a[1]*b[0] is never formed - confirm callers expect this
+    ADDC(t2[0], carry, t2[0], t1[1], carry);
+    ADDC(t2[1], carry, 0, t2[1], carry);
+    c[0] = t0[0];
+    c[1] = t0[1];
+    c[2] = t2[0];
+    c[3] = t2[1];
+}
+
+void
+mp_print(const digit_t *a, size_t nwords)
+{ // Prints a as one hexadecimal number, most significant digit first, without a newline
+    printf("0x");
+    for (size_t k = nwords; k > 0; k--) {
+#ifdef RADIX_32
+        printf("%08" PRIx32, a[k - 1]); // Print each word with 8 hex digits
+#elif defined(RADIX_64)
+        printf("%016" PRIx64, a[k - 1]); // Print each word with 16 hex digits
+#endif
+    }
+}
+
+void
+mp_copy(digit_t *b, const digit_t *a, size_t nwords)
+{ // b <- a, copied digit by digit from the lowest index upwards
+    for (const digit_t *end = a + nwords; a != end; a++, b++) {
+        *b = *a;
+    }
+}
+
+void
+mp_mul(digit_t *c, const digit_t *a, const digit_t *b, size_t nwords)
+{
+    // Multiprecision multiplication, c = a*b, for nwords-digit inputs, with nwords-digit output
+    // explicitly does not use the higher half of c, as we do not need in our applications
+    // c may alias a or b: the product is accumulated in cc and only copied out at the end
+    if (nwords == 0) {
+        return; // nothing to compute, and a zero-length array may not be declared
+    }
+    digit_t carry, high, UV[2], cc[nwords];
+
+    for (size_t i = 0; i < nwords; i++) {
+        cc[i] = 0;
+    }
+    for (size_t i = 0; i < nwords; i++) {
+        // Row i adds a[i]*b[j] into digit i+j. Digits at index nwords and above are dropped,
+        // so the row stops at j = nwords-1-i; nwords == 1 therefore needs no special case and
+        // no scratch buffer is ever indexed past its end.
+        high = 0;
+        for (size_t j = 0; i + j < nwords; j++) {
+            MUL(UV, a[i], b[j]);
+            ADDC(UV[0], carry, UV[0], high, 0);
+            high = UV[1] + carry; // cannot wrap: a[i]*b[j] + high < 2^(2*RADIX)
+            ADDC(cc[i + j], carry, cc[i + j], UV[0], 0);
+            high += carry; // cannot wrap either: a[i]*b[j] + high + cc[i+j] < 2^(2*RADIX)
+        }
+    }
+
+    mp_copy(c, cc, nwords);
+}
+
+void
+mp_mod_2exp(digit_t *a, unsigned int e, unsigned int nwords)
+{ // Reduces a modulo 2^e in place, so that 0 <= a < 2^e afterwards
+    const unsigned int word = e >> LOG2RADIX, bit = e & (RADIX - 1);
+
+    if (word >= nwords) {
+        return; // bit e lies beyond the top digit, so a is left untouched
+    }
+    a[word] &= ((digit_t)1 << bit) - 1;
+    for (unsigned int k = word + 1; k < nwords; k++) {
+        a[k] = 0;
+    }
+}
+
+void
+mp_neg(digit_t *a, unsigned int nwords)
+{ // Negates a in place modulo 2^(nwords*RADIX): complement every digit, then add 1
+    unsigned int carry = 1; // the +1, carried upwards for as long as digits wrap to zero
+    for (size_t i = 0; i < nwords; i++) {
+        a[i] = (digit_t)~a[i] + (digit_t)carry;
+        carry &= is_digit_zero_ct(a[i]); // a carry leaves digit i only if it wrapped to 0
+    }
+}
+
+bool
+mp_is_one(const digit_t *x, unsigned int nwords)
+{ // True exactly when x[0] is 1 and every higher digit is 0
+  // x[0] is read unconditionally, so x must hold at least one digit
+    if (x[0] != 1) {
+        return false;
+    }
+
+    size_t k = 1;
+    while (k < nwords && x[k] == 0) {
+        k++;
+    }
+    return k >= nwords; // the scan reached the end, so no nonzero high digit exists
+}
+
+void
+mp_inv_2e(digit_t *b, const digit_t *a, int e, unsigned int nwords)
+{ // Inversion modulo 2^e, using Newton's method and Hensel lifting
+    // works modulo 2^w, where w is the first power of 2 that is >= e (see below)
+    // requires a to be odd, of course
+    // returns b such that a*b = 1 mod 2^w, hence also mod 2^e; b may alias a
+    assert((a[0] & 1) == 1);
+
+    digit_t x[nwords], y[nwords], aa[nwords], mp_one[nwords], tmp[nwords];
+    mp_copy(aa, a, nwords);
+
+    mp_one[0] = 1;
+    for (unsigned int i = 1; i < nwords; i++) {
+        mp_one[i] = 0;
+    }
+
+    int p = 1;
+    while ((1 << p) < e) {
+        p++;
+    }
+    p -= 2; // using k = 4 for initial inverse
+    int w = (1 << (p + 2)); // 2^(p found by the loop): the bit precision the result is reduced to
+
+    mp_mod_2exp(aa, w, nwords);
+    mp_add(x, aa, aa, nwords);
+    mp_add(x, x, aa, nwords);  // should be 3a
+    x[0] ^= (1 << 1);          // so that x equals (3a) xor 2
+    mp_mod_2exp(x, w, nwords); // now x*a = 1 mod 2^4, which we lift
+
+    mp_mul(tmp, aa, x, nwords);
+    mp_neg(tmp, nwords);
+    mp_add(y, mp_one, tmp, nwords); // y = 1 - a*x
+
+    // Hensel lifting for p rounds
+    for (int i = 0; i < p; i++) {
+        mp_add(tmp, mp_one, y, nwords);
+        mp_mul(x, x, tmp, nwords); // x <- x*(1 + y)
+        mp_mul(y, y, y, nwords);   // y <- y^2
+    }
+
+    mp_mod_2exp(x, w, nwords);
+    mp_copy(b, x, nwords);
+
+    //  verify results (the product is computed even when assert is compiled out)
+    mp_mul(x, x, aa, nwords);
+    mp_mod_2exp(x, w, nwords);
+    assert(mp_is_one(x, nwords));
+}
+
+void
+mp_invert_matrix(digit_t *r1, digit_t *r2, digit_t *s1, digit_t *s2, int e, unsigned int nwords)
+{
+    // given a matrix ( ( a, b ), (c,  d) ) of values mod 2^e, passed as (r1, r2, s1, s2)
+    // overwrites it with the inverse matrix gamma ( (d, -b), (-c, a) )
+    // where gamma is the inverse of the determinant a*d - b*c
+    // assumes the matrix is invertible, otherwise, inversion of determinant fails
+
+    int p = 1;
+    while ((1 << p) < e) {
+        p++;
+    }
+    int w = (1 << (p)); // same power-of-two precision that mp_inv_2e derives from e
+
+    digit_t det[nwords], tmp[nwords], resa[nwords], resb[nwords], resc[nwords], resd[nwords];
+    mp_mul(tmp, r1, s2, nwords);
+    mp_mul(det, r2, s1, nwords);
+    mp_sub(det, tmp, det, nwords);  // det = a*d - b*c
+    mp_inv_2e(det, det, e, nwords); // det now holds gamma
+
+    mp_mul(resa, det, s2, nwords); // gamma*d
+    mp_mul(resb, det, r2, nwords); // gamma*b
+    mp_mul(resc, det, s1, nwords); // gamma*c
+    mp_mul(resd, det, r1, nwords); // gamma*a
+
+    mp_neg(resb, nwords);
+    mp_neg(resc, nwords);
+
+    mp_mod_2exp(resa, w, nwords);
+    mp_mod_2exp(resb, w, nwords);
+    mp_mod_2exp(resc, w, nwords);
+    mp_mod_2exp(resd, w, nwords);
+
+    mp_copy(r1, resa, nwords);
+    mp_copy(r2, resb, nwords);
+    mp_copy(s1, resc, nwords);
+    mp_copy(s2, resd, nwords);
+}
diff --git a/src/pqm4/sqisign_lvl1/ref/mp.h b/src/pqm4/sqisign_lvl1/ref/mp.h
new file mode 100644
index 0000000..b3733b5
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/mp.h
@@ -0,0 +1,88 @@
+#ifndef MP_H
+#define MP_H
+
+#include <sqisign_namespace.h>
+#include <stdbool.h>
+#include <tutil.h>
+
+// Functions taken from the GF module
+
+void mp_add(digit_t *c, const digit_t *a, const digit_t *b, const unsigned int nwords);
+digit_t mp_shiftr(digit_t *x, const unsigned int shift, const unsigned int nwords);
+void multiple_mp_shiftl(digit_t *x, const unsigned int shift, const unsigned int nwords);
+void mp_shiftl(digit_t *x, const unsigned int shift, const unsigned int nwords);
+void MUL(digit_t *out, const digit_t a, const digit_t b);
+
+// Functions taken from the EC module
+
+void mp_sub(digit_t *c, const digit_t *a, const digit_t *b, const unsigned int nwords);
+void select_ct(digit_t *c, const digit_t *a, const digit_t *b, const digit_t mask, const int nwords);
+void swap_ct(digit_t *a, digit_t *b, const digit_t option, const int nwords);
+int mp_compare(const digit_t *a, const digit_t *b, unsigned int nwords);
+bool mp_is_zero(const digit_t *a, unsigned int nwords);
+void mp_mul2(digit_t *c, const digit_t *a, const digit_t *b);
+
+// Further functions for multiprecision arithmetic
+void mp_print(const digit_t *a, size_t nwords);
+void mp_copy(digit_t *b, const digit_t *a, size_t nwords);
+void mp_neg(digit_t *a, unsigned int nwords);
+bool mp_is_one(const digit_t *x, unsigned int nwords);
+void mp_mul(digit_t *c, const digit_t *a, const digit_t *b, size_t nwords);
+void mp_mod_2exp(digit_t *a, unsigned int e, unsigned int nwords);
+void mp_inv_2e(digit_t *b, const digit_t *a, int e, unsigned int nwords);
+void mp_invert_matrix(digit_t *r1, digit_t *r2, digit_t *s1, digit_t *s2, int e, unsigned int nwords);
+
+#define mp_is_odd(x, nwords) (((nwords) != 0) & (int)(x)[0])
+#define mp_is_even(x, nwords) (!mp_is_odd(x, nwords))
+
+/********************** Constant-time unsigned comparisons ***********************/
+
+// The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise
+static inline unsigned int
+is_digit_nonzero_ct(digit_t x)
+{ // Is x != 0? Branch-free: x | (0 - x) has its top bit set exactly when x is nonzero (assuming digit_t is unsigned, RADIX bits wide)
+    return (unsigned int)((x | (0 - x)) >> (RADIX - 1));
+}
+
+static inline unsigned int
+is_digit_zero_ct(digit_t x)
+{ // Is x = 0? Computed as the complement of the 0/1 result of is_digit_nonzero_ct
+    return (unsigned int)(1 ^ is_digit_nonzero_ct(x));
+}
+
+static inline unsigned int
+is_digit_lessthan_ct(digit_t x, digit_t y)
+{ // Is x < y? Branch-free: the top bit of the expression is the borrow out of x - y (assuming digit_t is unsigned, RADIX bits wide)
+    return (unsigned int)((x ^ ((x ^ y) | ((x - y) ^ y))) >> (RADIX - 1));
+}
+
+/********************** Platform-independent macros for digit-size operations
+ * **********************/
+
+// Digit addition with carry
+#define ADDC(sumOut, carryOut, addend1, addend2, carryIn)                                                              \
+    {                                                                                                                  \
+        digit_t tempReg = (addend1) + (digit_t)(carryIn);                                                              \
+        (sumOut) = (addend2) + tempReg;                                                                                \
+        (carryOut) = (is_digit_lessthan_ct(tempReg, (digit_t)(carryIn)) | is_digit_lessthan_ct((sumOut), tempReg));    \
+    }
+
+// Digit subtraction with borrow
+#define SUBC(differenceOut, borrowOut, minuend, subtrahend, borrowIn)                                                  \
+    {                                                                                                                  \
+        digit_t tempReg = (minuend) - (subtrahend);                                                                    \
+        unsigned int borrowReg =                                                                                       \
+            (is_digit_lessthan_ct((minuend), (subtrahend)) | ((borrowIn) & is_digit_zero_ct(tempReg)));                \
+        (differenceOut) = tempReg - (digit_t)(borrowIn);                                                               \
+        (borrowOut) = borrowReg;                                                                                       \
+    }
+
+// Shift right with flexible datatype
+#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize)                                                              \
+    (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << (DigitSize - (shift)));
+
+// Digit shift left; shift must be in 1...DigitSize-1 so that neither shift count reaches the digit width
+#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize)                                                              \
+    (shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> ((DigitSize) - (shift)));
+
+#endif
diff --git a/src/pqm4/sqisign_lvl1/ref/pqm4_api.c b/src/pqm4/sqisign_lvl1/ref/pqm4_api.c
new file mode 100644
index 0000000..998fbd2
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/pqm4_api.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <api.h>
+#include <sig.h>
+#include <string.h>
+
+typedef struct {
+  size_t mlen;                         // message length in bytes
+  unsigned char msg[59];               // message; raw bytes, so an unsigned type
+  size_t smlen;                        // number of meaningful bytes in sm
+  unsigned char sm[59 + CRYPTO_BYTES]; // signed message; holds byte values above 0x7F
+} SQISign_KAT_t;
+
+const char kat_lvl1_pk[CRYPTO_PUBLICKEYBYTES] = {
+  0x9F, 0x5F, 0x7F, 0xF0, 0x79, 0x3F, 0x17, 0x1C, 0x9B, 0x5D, 0x1B, 0x05, 0x99, 0xA8, 0x17, 0x68, 0x95, 0x14, 0x35, 0xFE, 0x8B, 0x18, 0x6D, 0xE0, 0xA1, 0x8B, 0xA0, 0xAB, 0x58, 0x39, 0x8C, 0x03, 0x7F, 0x40, 0xCC, 0x35, 0x7B, 0x0F, 0x4C, 0xAE, 0x9A, 0x93, 0x23, 0xCB, 0x31, 0xF2, 0x4C, 0x24, 0x47, 0xCA, 0x47, 0x17, 0x38, 0xD6, 0x00, 0x09, 0x34, 0xC3, 0x16, 0x54, 0x10, 0x8B, 0x42, 0x01, 0x0B, 
+};
+
+const SQISign_KAT_t kat_lvl1[2] = {
+  {
+    .mlen = 32,
+    .msg = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, },
+    .smlen = 32 + CRYPTO_BYTES,
+    .sm = { 0x7A, 0x34, 0xF2, 0xA7, 0xA8, 0x4F, 0xC0, 0x27, 0x0A, 0x4C, 0xEF, 0x98, 0x59, 0x0A, 0x18, 0x15, 0x6D, 0xBC, 0xC0, 0x22, 0xB5, 0x63, 0x1D, 0x20, 0xED, 0xB7, 0x37, 0x01, 0xC1, 0xF6, 0x02, 0x01, 0xF8, 0x51, 0x62, 0xA6, 0xA4, 0xF9, 0x6D, 0x92, 0xEA, 0x96, 0xE3, 0x11, 0x8B, 0x1A, 0x8C, 0xC9, 0x4A, 0x22, 0xF2, 0xD9, 0x36, 0x9A, 0xF6, 0xBD, 0x29, 0x84, 0x5A, 0xC8, 0x17, 0x2E, 0x73, 0x02, 0x00, 0x01, 0x36, 0x4C, 0x4B, 0x39, 0xFD, 0xF0, 0x1A, 0x6A, 0x89, 0xA4, 0xAB, 0x69, 0x67, 0x9D, 0xA0, 0x84, 0x5B, 0x2A, 0x9D, 0x1A, 0x89, 0x69, 0xAB, 0x7E, 0x6B, 0x44, 0xE5, 0xC9, 0x26, 0xEA, 0x3F, 0x16, 0x5A, 0x19, 0xFB, 0x24, 0x13, 0x4E, 0x69, 0x2B, 0x76, 0xBB, 0x41, 0x58, 0x90, 0x30, 0x2A, 0x37, 0x14, 0x84, 0xC9, 0x25, 0x92, 0x8D, 0xAB, 0x3C, 0x8E, 0x79, 0x08, 0x5C, 0xA6, 0x7F, 0x2F, 0x85, 0x10, 0x03, 0xB4, 0xE6, 0xCC, 0xB2, 0x09, 0xF2, 0xE2, 0x98, 0xC3, 0x8A, 0x47, 0xEF, 0x83, 0x00, 0x04, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, },
+  },
+  {
+    .mlen = 59,
+    .msg = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, },
+    .smlen = 59 + CRYPTO_BYTES,
+    .sm = { 0xB5, 0xCC, 0x8D, 0xAA, 0xC9, 0xFD, 0x34, 0x72, 0xB7, 0xF5, 0xC8, 0x92, 0x9E, 0x4B, 0x3B, 0x6E, 0xF7, 0x32, 0x50, 0x9F, 0xC0, 0xFD, 0xA4, 0xDD, 0x54, 0xFB, 0xFB, 0x28, 0x60, 0xAB, 0x40, 0x00, 0xD8, 0x60, 0xE0, 0xDD, 0x7E, 0xBD, 0xB1, 0x0F, 0xB6, 0x0A, 0xDC, 0x5E, 0xCC, 0x47, 0xC7, 0xDE, 0x50, 0x39, 0x87, 0x04, 0x4D, 0xF3, 0xC1, 0xBB, 0xDE, 0xAC, 0x9D, 0x55, 0x01, 0x61, 0x75, 0x03, 0x02, 0x01, 0xEE, 0xC8, 0x45, 0x75, 0xBD, 0xAC, 0x80, 0xC8, 0x06, 0x0F, 0xB0, 0x64, 0x34, 0x38, 0x8F, 0x39, 0x45, 0x75, 0xFF, 0x58, 0x0D, 0x78, 0xB2, 0xB5, 0x90, 0x17, 0x51, 0x39, 0x42, 0xAC, 0x21, 0x1E, 0x78, 0x90, 0xD3, 0xFA, 0x8D, 0xDC, 0x02, 0xC7, 0xB8, 0x31, 0x8B, 0x8E, 0x31, 0xD2, 0xF1, 0x25, 0xE9, 0xA3, 0xAC, 0x1E, 0x16, 0x9B, 0xD2, 0xA4, 0x6B, 0xC9, 0x27, 0xF1, 0xE0, 0x13, 0x50, 0x28, 0x7B, 0x23, 0x10, 0xCB, 0x69, 0x7D, 0x67, 0x8C, 0xB2, 0xB7, 0x07, 0x7F, 0xD4, 0xF5, 0x48, 0x01, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, },
+  },
+};
+
+int crypto_sign_keypair(unsigned char *pk, unsigned char *sk) {
+  memcpy(pk, kat_lvl1_pk, CRYPTO_PUBLICKEYBYTES); // fixed known-answer public key
+  memset(sk, 0, CRYPTO_SECRETKEYBYTES);           // We don't need the secret key
+  return 0; // declared int: falling off the end leaves the caller with an undefined status
+}
+
+int crypto_sign(unsigned char *sm, size_t *smlen, const unsigned char *m,
+                size_t mlen, const unsigned char *sk) {
+  const SQISign_KAT_t *const end = kat_lvl1 + sizeof(kat_lvl1) / sizeof(kat_lvl1[0]);
+  for (const SQISign_KAT_t *kat = kat_lvl1; kat != end; kat++) {
+    if (kat->mlen != mlen)
+      continue;
+    *smlen = kat->smlen; // replay the stored answer; m and sk are never read
+    memcpy(sm, kat->sm, kat->smlen);
+    return 0;
+  }
+  return 1; // no stored answer has this message length
+}
+
+int crypto_sign_open(unsigned char *m, size_t *mlen, const unsigned char *sm,
+                     size_t smlen, const unsigned char *pk) {
+  unsigned long long mlen_ull = 0; // output only: *mlen is never read, since mlen may be NULL
+  int ret = sqisign_open(m, &mlen_ull, sm, smlen, pk);
+  if (mlen) {
+    *mlen = mlen_ull;
+  }
+  return ret;
+}
diff --git a/src/pqm4/sqisign_lvl1/ref/rng.h b/src/pqm4/sqisign_lvl1/ref/rng.h
new file mode 100644
index 0000000..3c24d07
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/rng.h
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef rng_h
+#define rng_h
+
+#include "randombytes.h"
+
+#endif /* rng_h */
diff --git a/src/pqm4/sqisign_lvl1/ref/sig.h b/src/pqm4/sqisign_lvl1/ref/sig.h
new file mode 100644
index 0000000..4c33510
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/sig.h
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SQISIGN_H
+#define SQISIGN_H
+
+#include <stdint.h>
+#include <sqisign_namespace.h>
+
+#if defined(ENABLE_SIGN)
+/**
+ * SQIsign keypair generation.
+ *
+ * The implementation corresponds to SQIsign.CompactKeyGen() in the SQIsign spec.
+ * The caller is responsible to allocate sufficient memory to hold pk and sk.
+ *
+ * @param[out] pk SQIsign public key
+ * @param[out] sk SQIsign secret key
+ * @return int status code
+ */
+SQISIGN_API 
+int sqisign_keypair(unsigned char *pk, unsigned char *sk);
+
+/**
+ * SQIsign signature generation.
+ *
+ * The implementation performs SQIsign.expandSK() + SQIsign.sign() in the SQIsign spec.
+ * The key provided is a compacted secret key.
+ * The caller is responsible to allocate sufficient memory to hold sm.
+ *
+ * @param[out] sm Signature concatenated with message
+ * @param[out] smlen Pointer to the length of sm
+ * @param[in] m Message to be signed
+ * @param[in] mlen Message length
+ * @param[in] sk Compacted secret key
+ * @return int status code
+ */
+SQISIGN_API 
+int sqisign_sign(unsigned char *sm,
+                 unsigned long long *smlen,
+                 const unsigned char *m,
+                 unsigned long long mlen,
+                 const unsigned char *sk);
+#endif
+
+/**
+ * SQIsign open signature.
+ *
+ * The implementation performs SQIsign.verify(). If the signature verification succeeded, the
+ * original message is stored in m. The key provided is a compact public key. The caller is
+ * responsible to allocate sufficient memory to hold m.
+ *
+ * @param[out] m Message stored if verification succeeds
+ * @param[out] mlen Pointer to the length of m
+ * @param[in] sm Signature concatenated with message
+ * @param[in] smlen Length of sm
+ * @param[in] pk Compacted public key
+ * @return int status code
+ */
+SQISIGN_API
+int sqisign_open(unsigned char *m,
+                 unsigned long long *mlen,
+                 const unsigned char *sm,
+                 unsigned long long smlen,
+                 const unsigned char *pk);
+
+/**
+ * SQIsign verify signature.
+ *
+ * If the signature verification succeeded, returns 0, otherwise 1.
+ *
+ * @param[in] m Message whose signature is checked
+ * @param[in] mlen Length of m
+ * @param[in] sig Signature
+ * @param[in] siglen Length of sig
+ * @param[in] pk Compacted public key
+ * @return int 0 if verification succeeded, 1 otherwise.
+ */
+SQISIGN_API 
+int sqisign_verify(const unsigned char *m,
+                   unsigned long long mlen,
+                   const unsigned char *sig,
+                   unsigned long long siglen,
+                   const unsigned char *pk);
+
+#endif
diff --git a/src/pqm4/sqisign_lvl1/ref/sqisign.c b/src/pqm4/sqisign_lvl1/ref/sqisign.c
new file mode 100644
index 0000000..57fd75d
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/sqisign.c
@@ -0,0 +1,106 @@
+#include <sig.h>
+#include <string.h>
+#include <encoded_sizes.h>
+#include <verification.h>
+#if defined(ENABLE_SIGN)
+#include <signature.h>
+#endif
+
+#if defined(ENABLE_SIGN)
+SQISIGN_API
+int
+sqisign_keypair(unsigned char *pk, unsigned char *sk)
+{ // Generates a key pair and serializes it into the caller's pk and sk buffers
+    int ret = 0;
+    secret_key_t skt;
+    public_key_t pkt = { 0 };
+    secret_key_init(&skt);
+
+    ret = !protocols_keygen(&pkt, &skt); // status is the logical negation of protocols_keygen's result
+
+    secret_key_to_bytes(sk, &skt, &pkt); // both keys are serialized whatever ret is
+    public_key_to_bytes(pk, &pkt);
+    secret_key_finalize(&skt);
+    return ret;
+}
+
+SQISIGN_API
+int
+sqisign_sign(unsigned char *sm,
+             unsigned long long *smlen,
+             const unsigned char *m,
+             unsigned long long mlen,
+             const unsigned char *sk)
+{ // Writes signature || message into sm; returns 0 on success, nonzero (with *smlen = 0) on failure
+    int ret = 0;
+    secret_key_t skt;
+    public_key_t pkt = { 0 };
+    signature_t sigt;
+    secret_key_init(&skt);
+    secret_key_from_bytes(&skt, &pkt, sk);
+
+    memmove(sm + SIGNATURE_BYTES, m, mlen); // message goes after the signature slot; memmove tolerates m overlapping sm
+
+    ret = !protocols_sign(&sigt, &pkt, &skt, sm + SIGNATURE_BYTES, mlen); // signs the copy held in sm
+    if (ret != 0) {
+        *smlen = 0;
+        goto err;
+    }
+
+    signature_to_bytes(sm, &sigt); // fills the slot at the start of sm
+    *smlen = SIGNATURE_BYTES + mlen;
+
+err: // shared exit: the secret key is finalized on success and on failure
+    secret_key_finalize(&skt);
+    return ret;
+}
+#endif
+
+SQISIGN_API
+int
+sqisign_open(unsigned char *m,
+             unsigned long long *mlen,
+             const unsigned char *sm,
+             unsigned long long smlen,
+             const unsigned char *pk)
+{ // Verifies sm = signature || message; returns 0 and copies the message to m when it is valid
+    public_key_t pkt = { 0 };
+    signature_t sigt;
+    // Too short to hold a signature: reject before smlen - SIGNATURE_BYTES wraps around
+    if (smlen < SIGNATURE_BYTES) {
+        *mlen = 0;
+        return 1;
+    }
+    const unsigned long long msglen = smlen - SIGNATURE_BYTES;
+    public_key_from_bytes(&pkt, pk);
+    signature_from_bytes(&sigt, sm);
+    int ret = !protocols_verify(&sigt, &pkt, sm + SIGNATURE_BYTES, msglen);
+    if (!ret) {
+        memmove(m, sm + SIGNATURE_BYTES, msglen); // memmove: m may overlap sm
+    } else {
+        memset(m, 0, msglen); // never hand back an unverified message
+    }
+    *mlen = ret ? 0 : msglen;
+    return ret;
+}
+
+SQISIGN_API
+int
+sqisign_verify(const unsigned char *m,
+               unsigned long long mlen,
+               const unsigned char *sig,
+               unsigned long long siglen,
+               const unsigned char *pk)
+{ // Verifies a detached signature over m; returns 0 if it is valid, 1 otherwise
+    public_key_t pkt = { 0 };
+    signature_t sigt;
+
+    // signature_from_bytes is given no length, so a buffer shorter than a signature is rejected here
+    if (siglen < SIGNATURE_BYTES) {
+        return 1;
+    }
+
+    public_key_from_bytes(&pkt, pk);
+    signature_from_bytes(&sigt, sig);
+    return !protocols_verify(&sigt, &pkt, m, mlen);
+}
diff --git a/src/pqm4/sqisign_lvl1/ref/sqisign_namespace.h b/src/pqm4/sqisign_lvl1/ref/sqisign_namespace.h
new file mode 100644
index 0000000..14fd51d
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/sqisign_namespace.h
@@ -0,0 +1,1022 @@
+
+#ifndef SQISIGN_NAMESPACE_H
+#define SQISIGN_NAMESPACE_H
+
+//#define DISABLE_NAMESPACING  // uncomment to leave every symbol below un-prefixed
+// SQISIGN_API marks exported symbols: dllexport on Windows, default visibility elsewhere.
+#if defined(_WIN32)
+#define SQISIGN_API __declspec(dllexport)
+#else
+#define SQISIGN_API __attribute__((visibility("default")))
+#endif
+// PARAM_NAME3(end, s) -> sqisign_<SQISIGN_VARIANT>_<end>_<s>; the JOIN3/JOIN3_ split lets arguments expand before ## pastes.
+#define PARAM_JOIN3_(a, b, c) sqisign_##a##_##b##_##c
+#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c)
+#define PARAM_NAME3(end, s) PARAM_JOIN3(SQISIGN_VARIANT, end, s)
+// PARAM_NAME2(end, s) -> sqisign_<end>_<s> (no variant component).
+#define PARAM_JOIN2_(a, b) sqisign_##a##_##b
+#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b)
+#define PARAM_NAME2(end, s) PARAM_JOIN2(end, s)
+// SQISIGN_NAMESPACE_GENERIC(s): sqisign_gen_<s>, or plain s when namespacing is disabled.
+#ifndef DISABLE_NAMESPACING
+#define SQISIGN_NAMESPACE_GENERIC(s) PARAM_NAME2(gen, s)
+#else
+#define SQISIGN_NAMESPACE_GENERIC(s) s
+#endif
+// SQISIGN_NAMESPACE(s): sqisign_<SQISIGN_VARIANT>_<build>_<s>, where <build> follows the first defined SQISIGN_BUILD_TYPE_*.
+#if defined(SQISIGN_VARIANT) && !defined(DISABLE_NAMESPACING)
+#if defined(SQISIGN_BUILD_TYPE_REF)
+#define SQISIGN_NAMESPACE(s) PARAM_NAME3(ref, s)
+#elif defined(SQISIGN_BUILD_TYPE_OPT)
+#define SQISIGN_NAMESPACE(s) PARAM_NAME3(opt, s)
+#elif defined(SQISIGN_BUILD_TYPE_BROADWELL)
+#define SQISIGN_NAMESPACE(s) PARAM_NAME3(broadwell, s)
+#elif defined(SQISIGN_BUILD_TYPE_ARM64CRYPTO)
+#define SQISIGN_NAMESPACE(s) PARAM_NAME3(arm64crypto, s)
+#else
+#error "Build type not known"
+#endif
+// Fallback when SQISIGN_VARIANT is undefined or namespacing is disabled: names pass through unchanged.
+#else
+#define SQISIGN_NAMESPACE(s) s
+#endif
+
+// Namespacing symbols exported from algebra.c:
+#undef quat_alg_add
+#undef quat_alg_conj
+#undef quat_alg_coord_mul
+#undef quat_alg_elem_copy
+#undef quat_alg_elem_copy_ibz
+#undef quat_alg_elem_equal
+#undef quat_alg_elem_is_zero
+#undef quat_alg_elem_mul_by_scalar
+#undef quat_alg_elem_set
+#undef quat_alg_equal_denom
+#undef quat_alg_init_set_ui
+#undef quat_alg_make_primitive
+#undef quat_alg_mul
+#undef quat_alg_norm
+#undef quat_alg_normalize
+#undef quat_alg_scalar
+#undef quat_alg_sub
+
+#define quat_alg_add                                    SQISIGN_NAMESPACE_GENERIC(quat_alg_add)
+#define quat_alg_conj                                   SQISIGN_NAMESPACE_GENERIC(quat_alg_conj)
+#define quat_alg_coord_mul                              SQISIGN_NAMESPACE_GENERIC(quat_alg_coord_mul)
+#define quat_alg_elem_copy                              SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_copy)
+#define quat_alg_elem_copy_ibz                          SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_copy_ibz)
+#define quat_alg_elem_equal                             SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_equal)
+#define quat_alg_elem_is_zero                           SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_is_zero)
+#define quat_alg_elem_mul_by_scalar                     SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_mul_by_scalar)
+#define quat_alg_elem_set                               SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_set)
+#define quat_alg_equal_denom                            SQISIGN_NAMESPACE_GENERIC(quat_alg_equal_denom)
+#define quat_alg_init_set_ui                            SQISIGN_NAMESPACE_GENERIC(quat_alg_init_set_ui)
+#define quat_alg_make_primitive                         SQISIGN_NAMESPACE_GENERIC(quat_alg_make_primitive)
+#define quat_alg_mul                                    SQISIGN_NAMESPACE_GENERIC(quat_alg_mul)
+#define quat_alg_norm                                   SQISIGN_NAMESPACE_GENERIC(quat_alg_norm)
+#define quat_alg_normalize                              SQISIGN_NAMESPACE_GENERIC(quat_alg_normalize)
+#define quat_alg_scalar                                 SQISIGN_NAMESPACE_GENERIC(quat_alg_scalar)
+#define quat_alg_sub                                    SQISIGN_NAMESPACE_GENERIC(quat_alg_sub)
+
+// Namespacing symbols exported from api.c:
+#undef crypto_sign
+#undef crypto_sign_keypair
+#undef crypto_sign_open
+
+#define crypto_sign                                     SQISIGN_NAMESPACE(crypto_sign)
+#define crypto_sign_keypair                             SQISIGN_NAMESPACE(crypto_sign_keypair)
+#define crypto_sign_open                                SQISIGN_NAMESPACE(crypto_sign_open)
+
+// Namespacing symbols exported from basis.c:
+#undef ec_curve_to_basis_2f_from_hint
+#undef ec_curve_to_basis_2f_to_hint
+#undef ec_recover_y
+#undef lift_basis
+#undef lift_basis_normalized
+
+#define ec_curve_to_basis_2f_from_hint                  SQISIGN_NAMESPACE(ec_curve_to_basis_2f_from_hint)
+#define ec_curve_to_basis_2f_to_hint                    SQISIGN_NAMESPACE(ec_curve_to_basis_2f_to_hint)
+#define ec_recover_y                                    SQISIGN_NAMESPACE(ec_recover_y)
+#define lift_basis                                      SQISIGN_NAMESPACE(lift_basis)
+#define lift_basis_normalized                           SQISIGN_NAMESPACE(lift_basis_normalized)
+
+// Namespacing symbols exported from biextension.c:
+#undef clear_cofac
+#undef ec_dlog_2_tate
+#undef ec_dlog_2_weil
+#undef fp2_frob
+#undef reduced_tate
+#undef weil
+
+#define clear_cofac                                     SQISIGN_NAMESPACE(clear_cofac)
+#define ec_dlog_2_tate                                  SQISIGN_NAMESPACE(ec_dlog_2_tate)
+#define ec_dlog_2_weil                                  SQISIGN_NAMESPACE(ec_dlog_2_weil)
+#define fp2_frob                                        SQISIGN_NAMESPACE(fp2_frob)
+#define reduced_tate                                    SQISIGN_NAMESPACE(reduced_tate)
+#define weil                                            SQISIGN_NAMESPACE(weil)
+
+// Namespacing symbols exported from common.c:
+#undef hash_to_challenge
+#undef public_key_finalize
+#undef public_key_init
+
+#define hash_to_challenge                               SQISIGN_NAMESPACE(hash_to_challenge)
+#define public_key_finalize                             SQISIGN_NAMESPACE(public_key_finalize)
+#define public_key_init                                 SQISIGN_NAMESPACE(public_key_init)
+
+// Namespacing symbols exported from dim2.c:
+#undef ibz_2x2_mul_mod
+#undef ibz_mat_2x2_add
+#undef ibz_mat_2x2_copy
+#undef ibz_mat_2x2_det_from_ibz
+#undef ibz_mat_2x2_eval
+#undef ibz_mat_2x2_inv_mod
+#undef ibz_mat_2x2_set
+#undef ibz_vec_2_set
+
+#define ibz_2x2_mul_mod                                 SQISIGN_NAMESPACE_GENERIC(ibz_2x2_mul_mod)
+#define ibz_mat_2x2_add                                 SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_add)
+#define ibz_mat_2x2_copy                                SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_copy)
+#define ibz_mat_2x2_det_from_ibz                        SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_det_from_ibz)
+#define ibz_mat_2x2_eval                                SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_eval)
+#define ibz_mat_2x2_inv_mod                             SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_inv_mod)
+#define ibz_mat_2x2_set                                 SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_set)
+#define ibz_vec_2_set                                   SQISIGN_NAMESPACE_GENERIC(ibz_vec_2_set)
+
+// Namespacing symbols exported from dim2id2iso.c:
+#undef dim2id2iso_arbitrary_isogeny_evaluation
+#undef dim2id2iso_ideal_to_isogeny_clapotis
+#undef find_uv
+#undef fixed_degree_isogeny_and_eval
+
+#define dim2id2iso_arbitrary_isogeny_evaluation         SQISIGN_NAMESPACE(dim2id2iso_arbitrary_isogeny_evaluation)
+#define dim2id2iso_ideal_to_isogeny_clapotis            SQISIGN_NAMESPACE(dim2id2iso_ideal_to_isogeny_clapotis)
+#define find_uv                                         SQISIGN_NAMESPACE(find_uv)
+#define fixed_degree_isogeny_and_eval                   SQISIGN_NAMESPACE(fixed_degree_isogeny_and_eval)
+
+// Namespacing symbols exported from dim4.c:
+#undef ibz_inv_dim4_make_coeff_mpm
+#undef ibz_inv_dim4_make_coeff_pmp
+#undef ibz_mat_4x4_copy
+#undef ibz_mat_4x4_equal
+#undef ibz_mat_4x4_eval
+#undef ibz_mat_4x4_eval_t
+#undef ibz_mat_4x4_gcd
+#undef ibz_mat_4x4_identity
+#undef ibz_mat_4x4_inv_with_det_as_denom
+#undef ibz_mat_4x4_is_identity
+#undef ibz_mat_4x4_mul
+#undef ibz_mat_4x4_negate
+#undef ibz_mat_4x4_scalar_div
+#undef ibz_mat_4x4_scalar_mul
+#undef ibz_mat_4x4_transpose
+#undef ibz_mat_4x4_zero
+#undef ibz_vec_4_add
+#undef ibz_vec_4_content
+#undef ibz_vec_4_copy
+#undef ibz_vec_4_copy_ibz
+#undef ibz_vec_4_is_zero
+#undef ibz_vec_4_linear_combination
+#undef ibz_vec_4_negate
+#undef ibz_vec_4_scalar_div
+#undef ibz_vec_4_scalar_mul
+#undef ibz_vec_4_set
+#undef ibz_vec_4_sub
+#undef quat_qf_eval
+
+#define ibz_inv_dim4_make_coeff_mpm                     SQISIGN_NAMESPACE_GENERIC(ibz_inv_dim4_make_coeff_mpm)
+#define ibz_inv_dim4_make_coeff_pmp                     SQISIGN_NAMESPACE_GENERIC(ibz_inv_dim4_make_coeff_pmp)
+#define ibz_mat_4x4_copy                                SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_copy)
+#define ibz_mat_4x4_equal                               SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_equal)
+#define ibz_mat_4x4_eval                                SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_eval)
+#define ibz_mat_4x4_eval_t                              SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_eval_t)
+#define ibz_mat_4x4_gcd                                 SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_gcd)
+#define ibz_mat_4x4_identity                            SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_identity)
+#define ibz_mat_4x4_inv_with_det_as_denom               SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_inv_with_det_as_denom)
+#define ibz_mat_4x4_is_identity                         SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_is_identity)
+#define ibz_mat_4x4_mul                                 SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_mul)
+#define ibz_mat_4x4_negate                              SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_negate)
+#define ibz_mat_4x4_scalar_div                          SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_scalar_div)
+#define ibz_mat_4x4_scalar_mul                          SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_scalar_mul)
+#define ibz_mat_4x4_transpose                           SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_transpose)
+#define ibz_mat_4x4_zero                                SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_zero)
+#define ibz_vec_4_add                                   SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_add)
+#define ibz_vec_4_content                               SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_content)
+#define ibz_vec_4_copy                                  SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_copy)
+#define ibz_vec_4_copy_ibz                              SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_copy_ibz)
+#define ibz_vec_4_is_zero                               SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_is_zero)
+#define ibz_vec_4_linear_combination                    SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_linear_combination)
+#define ibz_vec_4_negate                                SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_negate)
+#define ibz_vec_4_scalar_div                            SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_scalar_div)
+#define ibz_vec_4_scalar_mul                            SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_scalar_mul)
+#define ibz_vec_4_set                                   SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_set)
+#define ibz_vec_4_sub                                   SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_sub)
+#define quat_qf_eval                                    SQISIGN_NAMESPACE_GENERIC(quat_qf_eval)
+
+// Namespacing symbols exported from ec.c:
+#undef cswap_points
+#undef ec_biscalar_mul
+#undef ec_curve_init
+#undef ec_curve_init_from_A
+#undef ec_curve_normalize_A24
+#undef ec_curve_verify_A
+#undef ec_dbl
+#undef ec_dbl_iter
+#undef ec_dbl_iter_basis
+#undef ec_has_zero_coordinate
+#undef ec_is_basis_four_torsion
+#undef ec_is_equal
+#undef ec_is_four_torsion
+#undef ec_is_two_torsion
+#undef ec_is_zero
+#undef ec_j_inv
+#undef ec_ladder3pt
+#undef ec_mul
+#undef ec_normalize_curve
+#undef ec_normalize_curve_and_A24
+#undef ec_normalize_point
+#undef ec_point_init
+#undef select_point
+#undef xADD
+#undef xDBL
+#undef xDBLADD
+#undef xDBLMUL
+#undef xDBL_A24
+#undef xDBL_E0
+#undef xMUL
+
+#define cswap_points                                    SQISIGN_NAMESPACE(cswap_points)
+#define ec_biscalar_mul                                 SQISIGN_NAMESPACE(ec_biscalar_mul)
+#define ec_curve_init                                   SQISIGN_NAMESPACE(ec_curve_init)
+#define ec_curve_init_from_A                            SQISIGN_NAMESPACE(ec_curve_init_from_A)
+#define ec_curve_normalize_A24                          SQISIGN_NAMESPACE(ec_curve_normalize_A24)
+#define ec_curve_verify_A                               SQISIGN_NAMESPACE(ec_curve_verify_A)
+#define ec_dbl                                          SQISIGN_NAMESPACE(ec_dbl)
+#define ec_dbl_iter                                     SQISIGN_NAMESPACE(ec_dbl_iter)
+#define ec_dbl_iter_basis                               SQISIGN_NAMESPACE(ec_dbl_iter_basis)
+#define ec_has_zero_coordinate                          SQISIGN_NAMESPACE(ec_has_zero_coordinate)
+#define ec_is_basis_four_torsion                        SQISIGN_NAMESPACE(ec_is_basis_four_torsion)
+#define ec_is_equal                                     SQISIGN_NAMESPACE(ec_is_equal)
+#define ec_is_four_torsion                              SQISIGN_NAMESPACE(ec_is_four_torsion)
+#define ec_is_two_torsion                               SQISIGN_NAMESPACE(ec_is_two_torsion)
+#define ec_is_zero                                      SQISIGN_NAMESPACE(ec_is_zero)
+#define ec_j_inv                                        SQISIGN_NAMESPACE(ec_j_inv)
+#define ec_ladder3pt                                    SQISIGN_NAMESPACE(ec_ladder3pt)
+#define ec_mul                                          SQISIGN_NAMESPACE(ec_mul)
+#define ec_normalize_curve                              SQISIGN_NAMESPACE(ec_normalize_curve)
+#define ec_normalize_curve_and_A24                      SQISIGN_NAMESPACE(ec_normalize_curve_and_A24)
+#define ec_normalize_point                              SQISIGN_NAMESPACE(ec_normalize_point)
+#define ec_point_init                                   SQISIGN_NAMESPACE(ec_point_init)
+#define select_point                                    SQISIGN_NAMESPACE(select_point)
+#define xADD                                            SQISIGN_NAMESPACE(xADD)
+#define xDBL                                            SQISIGN_NAMESPACE(xDBL)
+#define xDBLADD                                         SQISIGN_NAMESPACE(xDBLADD)
+#define xDBLMUL                                         SQISIGN_NAMESPACE(xDBLMUL)
+#define xDBL_A24                                        SQISIGN_NAMESPACE(xDBL_A24)
+#define xDBL_E0                                         SQISIGN_NAMESPACE(xDBL_E0)
+#define xMUL                                            SQISIGN_NAMESPACE(xMUL)
+
+// Namespacing symbols exported from ec_jac.c:
+#undef ADD
+#undef DBL
+#undef DBLW
+#undef copy_jac_point
+#undef jac_from_ws
+#undef jac_init
+#undef jac_is_equal
+#undef jac_neg
+#undef jac_to_ws
+#undef jac_to_xz
+#undef jac_to_xz_add_components
+#undef select_jac_point
+
+#define ADD                                             SQISIGN_NAMESPACE(ADD)
+#define DBL                                             SQISIGN_NAMESPACE(DBL)
+#define DBLW                                            SQISIGN_NAMESPACE(DBLW)
+#define copy_jac_point                                  SQISIGN_NAMESPACE(copy_jac_point)
+#define jac_from_ws                                     SQISIGN_NAMESPACE(jac_from_ws)
+#define jac_init                                        SQISIGN_NAMESPACE(jac_init)
+#define jac_is_equal                                    SQISIGN_NAMESPACE(jac_is_equal)
+#define jac_neg                                         SQISIGN_NAMESPACE(jac_neg)
+#define jac_to_ws                                       SQISIGN_NAMESPACE(jac_to_ws)
+#define jac_to_xz                                       SQISIGN_NAMESPACE(jac_to_xz)
+#define jac_to_xz_add_components                        SQISIGN_NAMESPACE(jac_to_xz_add_components)
+#define select_jac_point                                SQISIGN_NAMESPACE(select_jac_point)
+
+// Namespacing symbols exported from encode_signature.c:
+#undef secret_key_from_bytes
+#undef secret_key_to_bytes
+
+#define secret_key_from_bytes                           SQISIGN_NAMESPACE(secret_key_from_bytes)
+#define secret_key_to_bytes                             SQISIGN_NAMESPACE(secret_key_to_bytes)
+
+// Namespacing symbols exported from encode_verification.c:
+#undef public_key_from_bytes
+#undef public_key_to_bytes
+#undef signature_from_bytes
+#undef signature_to_bytes
+
+#define public_key_from_bytes                           SQISIGN_NAMESPACE(public_key_from_bytes)
+#define public_key_to_bytes                             SQISIGN_NAMESPACE(public_key_to_bytes)
+#define signature_from_bytes                            SQISIGN_NAMESPACE(signature_from_bytes)
+#define signature_to_bytes                              SQISIGN_NAMESPACE(signature_to_bytes)
+
+// Namespacing symbols exported from finit.c:
+#undef ibz_mat_2x2_finalize
+#undef ibz_mat_2x2_init
+#undef ibz_mat_4x4_finalize
+#undef ibz_mat_4x4_init
+#undef ibz_vec_2_finalize
+#undef ibz_vec_2_init
+#undef ibz_vec_4_finalize
+#undef ibz_vec_4_init
+#undef quat_alg_elem_finalize
+#undef quat_alg_elem_init
+#undef quat_alg_finalize
+#undef quat_alg_init_set
+#undef quat_lattice_finalize
+#undef quat_lattice_init
+#undef quat_left_ideal_finalize
+#undef quat_left_ideal_init
+
+#define ibz_mat_2x2_finalize                            SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_finalize)
+#define ibz_mat_2x2_init                                SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_init)
+#define ibz_mat_4x4_finalize                            SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_finalize)
+#define ibz_mat_4x4_init                                SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_init)
+#define ibz_vec_2_finalize                              SQISIGN_NAMESPACE_GENERIC(ibz_vec_2_finalize)
+#define ibz_vec_2_init                                  SQISIGN_NAMESPACE_GENERIC(ibz_vec_2_init)
+#define ibz_vec_4_finalize                              SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_finalize)
+#define ibz_vec_4_init                                  SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_init)
+#define quat_alg_elem_finalize                          SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_finalize)
+#define quat_alg_elem_init                              SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_init)
+#define quat_alg_finalize                               SQISIGN_NAMESPACE_GENERIC(quat_alg_finalize)
+#define quat_alg_init_set                               SQISIGN_NAMESPACE_GENERIC(quat_alg_init_set)
+#define quat_lattice_finalize                           SQISIGN_NAMESPACE_GENERIC(quat_lattice_finalize)
+#define quat_lattice_init                               SQISIGN_NAMESPACE_GENERIC(quat_lattice_init)
+#define quat_left_ideal_finalize                        SQISIGN_NAMESPACE_GENERIC(quat_left_ideal_finalize)
+#define quat_left_ideal_init                            SQISIGN_NAMESPACE_GENERIC(quat_left_ideal_init)
+
+// Namespacing symbols exported from fp.c:
+#undef fp_select
+
+#define fp_select                                       SQISIGN_NAMESPACE(fp_select)
+
+// Namespacing symbols exported from fp.c, fp_p27500_64.c, fp_p5248_64.c, fp_p65376_64.c (NOTE(review): this patch adds only fp_p5248_32.c under sqisign_lvl1/ref):
+#undef fp_exp3div4
+#undef fp_inv
+#undef fp_is_square
+#undef fp_sqrt
+
+#define fp_exp3div4                                     SQISIGN_NAMESPACE(fp_exp3div4)
+#define fp_inv                                          SQISIGN_NAMESPACE(fp_inv)
+#define fp_is_square                                    SQISIGN_NAMESPACE(fp_is_square)
+#define fp_sqrt                                         SQISIGN_NAMESPACE(fp_sqrt)
+
+// Namespacing symbols exported from fp2.c:
+#undef fp2_add
+#undef fp2_add_one
+#undef fp2_batched_inv
+#undef fp2_copy
+#undef fp2_cswap
+#undef fp2_decode
+#undef fp2_encode
+#undef fp2_half
+#undef fp2_inv
+#undef fp2_is_equal
+#undef fp2_is_one
+#undef fp2_is_square
+#undef fp2_is_zero
+#undef fp2_mul
+#undef fp2_mul_small
+#undef fp2_neg
+#undef fp2_pow_vartime
+#undef fp2_print
+#undef fp2_select
+#undef fp2_set_one
+#undef fp2_set_small
+#undef fp2_set_zero
+#undef fp2_sqr
+#undef fp2_sqrt
+#undef fp2_sqrt_verify
+#undef fp2_sub
+
+#define fp2_add                                         SQISIGN_NAMESPACE(fp2_add)
+#define fp2_add_one                                     SQISIGN_NAMESPACE(fp2_add_one)
+#define fp2_batched_inv                                 SQISIGN_NAMESPACE(fp2_batched_inv)
+#define fp2_copy                                        SQISIGN_NAMESPACE(fp2_copy)
+#define fp2_cswap                                       SQISIGN_NAMESPACE(fp2_cswap)
+#define fp2_decode                                      SQISIGN_NAMESPACE(fp2_decode)
+#define fp2_encode                                      SQISIGN_NAMESPACE(fp2_encode)
+#define fp2_half                                        SQISIGN_NAMESPACE(fp2_half)
+#define fp2_inv                                         SQISIGN_NAMESPACE(fp2_inv)
+#define fp2_is_equal                                    SQISIGN_NAMESPACE(fp2_is_equal)
+#define fp2_is_one                                      SQISIGN_NAMESPACE(fp2_is_one)
+#define fp2_is_square                                   SQISIGN_NAMESPACE(fp2_is_square)
+#define fp2_is_zero                                     SQISIGN_NAMESPACE(fp2_is_zero)
+#define fp2_mul                                         SQISIGN_NAMESPACE(fp2_mul)
+#define fp2_mul_small                                   SQISIGN_NAMESPACE(fp2_mul_small)
+#define fp2_neg                                         SQISIGN_NAMESPACE(fp2_neg)
+#define fp2_pow_vartime                                 SQISIGN_NAMESPACE(fp2_pow_vartime)
+#define fp2_print                                       SQISIGN_NAMESPACE(fp2_print)
+#define fp2_select                                      SQISIGN_NAMESPACE(fp2_select)
+#define fp2_set_one                                     SQISIGN_NAMESPACE(fp2_set_one)
+#define fp2_set_small                                   SQISIGN_NAMESPACE(fp2_set_small)
+#define fp2_set_zero                                    SQISIGN_NAMESPACE(fp2_set_zero)
+#define fp2_sqr                                         SQISIGN_NAMESPACE(fp2_sqr)
+#define fp2_sqrt                                        SQISIGN_NAMESPACE(fp2_sqrt)
+#define fp2_sqrt_verify                                 SQISIGN_NAMESPACE(fp2_sqrt_verify)
+#define fp2_sub                                         SQISIGN_NAMESPACE(fp2_sub)
+
+// Namespacing symbols exported from fp_p27500_64.c, fp_p5248_64.c, fp_p65376_64.c (NOTE(review): this patch adds only fp_p5248_32.c under sqisign_lvl1/ref):
+#undef fp_copy
+#undef fp_cswap
+#undef fp_decode
+#undef fp_decode_reduce
+#undef fp_div3
+#undef fp_encode
+#undef fp_half
+#undef fp_is_equal
+#undef fp_is_zero
+#undef fp_mul_small
+#undef fp_neg
+#undef fp_set_one
+#undef fp_set_small
+#undef fp_set_zero
+
+#define fp_copy                                         SQISIGN_NAMESPACE(fp_copy)
+#define fp_cswap                                        SQISIGN_NAMESPACE(fp_cswap)
+#define fp_decode                                       SQISIGN_NAMESPACE(fp_decode)
+#define fp_decode_reduce                                SQISIGN_NAMESPACE(fp_decode_reduce)
+#define fp_div3                                         SQISIGN_NAMESPACE(fp_div3)
+#define fp_encode                                       SQISIGN_NAMESPACE(fp_encode)
+#define fp_half                                         SQISIGN_NAMESPACE(fp_half)
+#define fp_is_equal                                     SQISIGN_NAMESPACE(fp_is_equal)
+#define fp_is_zero                                      SQISIGN_NAMESPACE(fp_is_zero)
+#define fp_mul_small                                    SQISIGN_NAMESPACE(fp_mul_small)
+#define fp_neg                                          SQISIGN_NAMESPACE(fp_neg)
+#define fp_set_one                                      SQISIGN_NAMESPACE(fp_set_one)
+#define fp_set_small                                    SQISIGN_NAMESPACE(fp_set_small)
+#define fp_set_zero                                     SQISIGN_NAMESPACE(fp_set_zero)
+
+// Namespacing symbols exported from fp_p27500_64.c, fp_p5248_64.c, fp_p65376_64.c, gf27500.c, gf5248.c, gf65376.c (NOTE(review): this patch adds only fp_p5248_32.c under sqisign_lvl1/ref):
+#undef fp_add
+#undef fp_mul
+#undef fp_sqr
+#undef fp_sub
+
+#define fp_add                                          SQISIGN_NAMESPACE(fp_add)
+#define fp_mul                                          SQISIGN_NAMESPACE(fp_mul)
+#define fp_sqr                                          SQISIGN_NAMESPACE(fp_sqr)
+#define fp_sub                                          SQISIGN_NAMESPACE(fp_sub)
+
+// Namespacing symbols exported from gf27500.c:
+#undef gf27500_decode
+#undef gf27500_decode_reduce
+#undef gf27500_div
+#undef gf27500_div3
+#undef gf27500_encode
+#undef gf27500_invert
+#undef gf27500_legendre
+#undef gf27500_sqrt
+
+#define gf27500_decode                                  SQISIGN_NAMESPACE(gf27500_decode)
+#define gf27500_decode_reduce                           SQISIGN_NAMESPACE(gf27500_decode_reduce)
+#define gf27500_div                                     SQISIGN_NAMESPACE(gf27500_div)
+#define gf27500_div3                                    SQISIGN_NAMESPACE(gf27500_div3)
+#define gf27500_encode                                  SQISIGN_NAMESPACE(gf27500_encode)
+#define gf27500_invert                                  SQISIGN_NAMESPACE(gf27500_invert)
+#define gf27500_legendre                                SQISIGN_NAMESPACE(gf27500_legendre)
+#define gf27500_sqrt                                    SQISIGN_NAMESPACE(gf27500_sqrt)
+
+// Namespacing symbols exported from gf27500.c, gf5248.c, gf65376.c:
+#undef fp2_mul_c0
+#undef fp2_mul_c1
+#undef fp2_sq_c0
+#undef fp2_sq_c1
+
+#define fp2_mul_c0                                      SQISIGN_NAMESPACE(fp2_mul_c0)
+#define fp2_mul_c1                                      SQISIGN_NAMESPACE(fp2_mul_c1)
+#define fp2_sq_c0                                       SQISIGN_NAMESPACE(fp2_sq_c0)
+#define fp2_sq_c1                                       SQISIGN_NAMESPACE(fp2_sq_c1)
+
+// Namespacing symbols exported from gf5248.c:
+#undef gf5248_decode
+#undef gf5248_decode_reduce
+#undef gf5248_div
+#undef gf5248_div3
+#undef gf5248_encode
+#undef gf5248_invert
+#undef gf5248_legendre
+#undef gf5248_sqrt
+
+#define gf5248_decode                                   SQISIGN_NAMESPACE(gf5248_decode)
+#define gf5248_decode_reduce                            SQISIGN_NAMESPACE(gf5248_decode_reduce)
+#define gf5248_div                                      SQISIGN_NAMESPACE(gf5248_div)
+#define gf5248_div3                                     SQISIGN_NAMESPACE(gf5248_div3)
+#define gf5248_encode                                   SQISIGN_NAMESPACE(gf5248_encode)
+#define gf5248_invert                                   SQISIGN_NAMESPACE(gf5248_invert)
+#define gf5248_legendre                                 SQISIGN_NAMESPACE(gf5248_legendre)
+#define gf5248_sqrt                                     SQISIGN_NAMESPACE(gf5248_sqrt)
+
+// Namespacing symbols exported from gf65376.c:
+#undef gf65376_decode
+#undef gf65376_decode_reduce
+#undef gf65376_div
+#undef gf65376_div3
+#undef gf65376_encode
+#undef gf65376_invert
+#undef gf65376_legendre
+#undef gf65376_sqrt
+
+#define gf65376_decode                                  SQISIGN_NAMESPACE(gf65376_decode)
+#define gf65376_decode_reduce                           SQISIGN_NAMESPACE(gf65376_decode_reduce)
+#define gf65376_div                                     SQISIGN_NAMESPACE(gf65376_div)
+#define gf65376_div3                                    SQISIGN_NAMESPACE(gf65376_div3)
+#define gf65376_encode                                  SQISIGN_NAMESPACE(gf65376_encode)
+#define gf65376_invert                                  SQISIGN_NAMESPACE(gf65376_invert)
+#define gf65376_legendre                                SQISIGN_NAMESPACE(gf65376_legendre)
+#define gf65376_sqrt                                    SQISIGN_NAMESPACE(gf65376_sqrt)
+
+// Namespacing symbols exported from hd.c:
+#undef add_couple_jac_points
+#undef copy_bases_to_kernel
+#undef couple_jac_to_xz
+#undef double_couple_jac_point
+#undef double_couple_jac_point_iter
+#undef double_couple_point
+#undef double_couple_point_iter
+
+#define add_couple_jac_points                           SQISIGN_NAMESPACE(add_couple_jac_points)
+#define copy_bases_to_kernel                            SQISIGN_NAMESPACE(copy_bases_to_kernel)
+#define couple_jac_to_xz                                SQISIGN_NAMESPACE(couple_jac_to_xz)
+#define double_couple_jac_point                         SQISIGN_NAMESPACE(double_couple_jac_point)
+#define double_couple_jac_point_iter                    SQISIGN_NAMESPACE(double_couple_jac_point_iter)
+#define double_couple_point                             SQISIGN_NAMESPACE(double_couple_point)
+#define double_couple_point_iter                        SQISIGN_NAMESPACE(double_couple_point_iter)
+
+// Namespacing symbols exported from hnf.c:
+#undef ibz_mat_4x4_is_hnf
+#undef ibz_mat_4xn_hnf_mod_core
+#undef ibz_vec_4_copy_mod
+#undef ibz_vec_4_linear_combination_mod
+#undef ibz_vec_4_scalar_mul_mod
+
+#define ibz_mat_4x4_is_hnf                              SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_is_hnf)
+#define ibz_mat_4xn_hnf_mod_core                        SQISIGN_NAMESPACE_GENERIC(ibz_mat_4xn_hnf_mod_core)
+#define ibz_vec_4_copy_mod                              SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_copy_mod)
+#define ibz_vec_4_linear_combination_mod                SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_linear_combination_mod)
+#define ibz_vec_4_scalar_mul_mod                        SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_scalar_mul_mod)
+
+// Namespacing symbols exported from hnf_internal.c:
+#undef ibz_centered_mod
+#undef ibz_conditional_assign
+#undef ibz_mod_not_zero
+#undef ibz_xgcd_with_u_not_0
+
+#define ibz_centered_mod                                SQISIGN_NAMESPACE_GENERIC(ibz_centered_mod)
+#define ibz_conditional_assign                          SQISIGN_NAMESPACE_GENERIC(ibz_conditional_assign)
+#define ibz_mod_not_zero                                SQISIGN_NAMESPACE_GENERIC(ibz_mod_not_zero)
+#define ibz_xgcd_with_u_not_0                           SQISIGN_NAMESPACE_GENERIC(ibz_xgcd_with_u_not_0)
+
+// Namespacing symbols exported from ibz_division.c:
+#undef ibz_xgcd
+
+#define ibz_xgcd                                        SQISIGN_NAMESPACE_GENERIC(ibz_xgcd)
+
+// Namespacing symbols exported from id2iso.c:
+#undef change_of_basis_matrix_tate
+#undef change_of_basis_matrix_tate_invert
+#undef ec_biscalar_mul_ibz_vec
+#undef endomorphism_application_even_basis
+#undef id2iso_ideal_to_kernel_dlogs_even
+#undef id2iso_kernel_dlogs_to_ideal_even
+#undef matrix_application_even_basis
+
+#define change_of_basis_matrix_tate                     SQISIGN_NAMESPACE(change_of_basis_matrix_tate)
+#define change_of_basis_matrix_tate_invert              SQISIGN_NAMESPACE(change_of_basis_matrix_tate_invert)
+#define ec_biscalar_mul_ibz_vec                         SQISIGN_NAMESPACE(ec_biscalar_mul_ibz_vec)
+#define endomorphism_application_even_basis             SQISIGN_NAMESPACE(endomorphism_application_even_basis)
+#define id2iso_ideal_to_kernel_dlogs_even               SQISIGN_NAMESPACE(id2iso_ideal_to_kernel_dlogs_even)
+#define id2iso_kernel_dlogs_to_ideal_even               SQISIGN_NAMESPACE(id2iso_kernel_dlogs_to_ideal_even)
+#define matrix_application_even_basis                   SQISIGN_NAMESPACE(matrix_application_even_basis)
+
+// Namespacing symbols exported from ideal.c:
+#undef quat_lideal_add
+#undef quat_lideal_class_gram
+#undef quat_lideal_conjugate_without_hnf
+#undef quat_lideal_copy
+#undef quat_lideal_create
+#undef quat_lideal_create_principal
+#undef quat_lideal_equals
+#undef quat_lideal_generator
+#undef quat_lideal_inter
+#undef quat_lideal_inverse_lattice_without_hnf
+#undef quat_lideal_mul
+#undef quat_lideal_norm
+#undef quat_lideal_right_order
+#undef quat_lideal_right_transporter
+#undef quat_order_discriminant
+#undef quat_order_is_maximal
+
+#define quat_lideal_add                                 SQISIGN_NAMESPACE_GENERIC(quat_lideal_add)
+#define quat_lideal_class_gram                          SQISIGN_NAMESPACE_GENERIC(quat_lideal_class_gram)
+#define quat_lideal_conjugate_without_hnf               SQISIGN_NAMESPACE_GENERIC(quat_lideal_conjugate_without_hnf)
+#define quat_lideal_copy                                SQISIGN_NAMESPACE_GENERIC(quat_lideal_copy)
+#define quat_lideal_create                              SQISIGN_NAMESPACE_GENERIC(quat_lideal_create)
+#define quat_lideal_create_principal                    SQISIGN_NAMESPACE_GENERIC(quat_lideal_create_principal)
+#define quat_lideal_equals                              SQISIGN_NAMESPACE_GENERIC(quat_lideal_equals)
+#define quat_lideal_generator                           SQISIGN_NAMESPACE_GENERIC(quat_lideal_generator)
+#define quat_lideal_inter                               SQISIGN_NAMESPACE_GENERIC(quat_lideal_inter)
+#define quat_lideal_inverse_lattice_without_hnf         SQISIGN_NAMESPACE_GENERIC(quat_lideal_inverse_lattice_without_hnf)
+#define quat_lideal_mul                                 SQISIGN_NAMESPACE_GENERIC(quat_lideal_mul)
+#define quat_lideal_norm                                SQISIGN_NAMESPACE_GENERIC(quat_lideal_norm)
+#define quat_lideal_right_order                         SQISIGN_NAMESPACE_GENERIC(quat_lideal_right_order)
+#define quat_lideal_right_transporter                   SQISIGN_NAMESPACE_GENERIC(quat_lideal_right_transporter)
+#define quat_order_discriminant                         SQISIGN_NAMESPACE_GENERIC(quat_order_discriminant)
+#define quat_order_is_maximal                           SQISIGN_NAMESPACE_GENERIC(quat_order_is_maximal)
+
+// Namespacing symbols exported from intbig.c:
+#undef ibz_abs
+#undef ibz_add
+#undef ibz_bitsize
+#undef ibz_cmp
+#undef ibz_cmp_int32
+#undef ibz_convert_to_str
+#undef ibz_copy
+#undef ibz_copy_digits
+#undef ibz_div
+#undef ibz_div_2exp
+#undef ibz_div_floor
+#undef ibz_divides
+#undef ibz_finalize
+#undef ibz_gcd
+#undef ibz_get
+#undef ibz_init
+#undef ibz_invmod
+#undef ibz_is_even
+#undef ibz_is_odd
+#undef ibz_is_one
+#undef ibz_is_zero
+#undef ibz_legendre
+#undef ibz_mod
+#undef ibz_mod_ui
+#undef ibz_mul
+#undef ibz_neg
+#undef ibz_pow
+#undef ibz_pow_mod
+#undef ibz_print
+#undef ibz_probab_prime
+#undef ibz_rand_interval
+#undef ibz_rand_interval_bits
+#undef ibz_rand_interval_i
+#undef ibz_rand_interval_minm_m
+#undef ibz_set
+#undef ibz_set_from_str
+#undef ibz_size_in_base
+#undef ibz_sqrt
+#undef ibz_sqrt_floor
+#undef ibz_sqrt_mod_p
+#undef ibz_sub
+#undef ibz_swap
+#undef ibz_to_digits
+#undef ibz_two_adic
+
+#define ibz_abs                                         SQISIGN_NAMESPACE_GENERIC(ibz_abs)
+#define ibz_add                                         SQISIGN_NAMESPACE_GENERIC(ibz_add)
+#define ibz_bitsize                                     SQISIGN_NAMESPACE_GENERIC(ibz_bitsize)
+#define ibz_cmp                                         SQISIGN_NAMESPACE_GENERIC(ibz_cmp)
+#define ibz_cmp_int32                                   SQISIGN_NAMESPACE_GENERIC(ibz_cmp_int32)
+#define ibz_convert_to_str                              SQISIGN_NAMESPACE_GENERIC(ibz_convert_to_str)
+#define ibz_copy                                        SQISIGN_NAMESPACE_GENERIC(ibz_copy)
+#define ibz_copy_digits                                 SQISIGN_NAMESPACE_GENERIC(ibz_copy_digits)
+#define ibz_div                                         SQISIGN_NAMESPACE_GENERIC(ibz_div)
+#define ibz_div_2exp                                    SQISIGN_NAMESPACE_GENERIC(ibz_div_2exp)
+#define ibz_div_floor                                   SQISIGN_NAMESPACE_GENERIC(ibz_div_floor)
+#define ibz_divides                                     SQISIGN_NAMESPACE_GENERIC(ibz_divides)
+#define ibz_finalize                                    SQISIGN_NAMESPACE_GENERIC(ibz_finalize)
+#define ibz_gcd                                         SQISIGN_NAMESPACE_GENERIC(ibz_gcd)
+#define ibz_get                                         SQISIGN_NAMESPACE_GENERIC(ibz_get)
+#define ibz_init                                        SQISIGN_NAMESPACE_GENERIC(ibz_init)
+#define ibz_invmod                                      SQISIGN_NAMESPACE_GENERIC(ibz_invmod)
+#define ibz_is_even                                     SQISIGN_NAMESPACE_GENERIC(ibz_is_even)
+#define ibz_is_odd                                      SQISIGN_NAMESPACE_GENERIC(ibz_is_odd)
+#define ibz_is_one                                      SQISIGN_NAMESPACE_GENERIC(ibz_is_one)
+#define ibz_is_zero                                     SQISIGN_NAMESPACE_GENERIC(ibz_is_zero)
+#define ibz_legendre                                    SQISIGN_NAMESPACE_GENERIC(ibz_legendre)
+#define ibz_mod                                         SQISIGN_NAMESPACE_GENERIC(ibz_mod)
+#define ibz_mod_ui                                      SQISIGN_NAMESPACE_GENERIC(ibz_mod_ui)
+#define ibz_mul                                         SQISIGN_NAMESPACE_GENERIC(ibz_mul)
+#define ibz_neg                                         SQISIGN_NAMESPACE_GENERIC(ibz_neg)
+#define ibz_pow                                         SQISIGN_NAMESPACE_GENERIC(ibz_pow)
+#define ibz_pow_mod                                     SQISIGN_NAMESPACE_GENERIC(ibz_pow_mod)
+#define ibz_print                                       SQISIGN_NAMESPACE_GENERIC(ibz_print)
+#define ibz_probab_prime                                SQISIGN_NAMESPACE_GENERIC(ibz_probab_prime)
+#define ibz_rand_interval                               SQISIGN_NAMESPACE_GENERIC(ibz_rand_interval)
+#define ibz_rand_interval_bits                          SQISIGN_NAMESPACE_GENERIC(ibz_rand_interval_bits)
+#define ibz_rand_interval_i                             SQISIGN_NAMESPACE_GENERIC(ibz_rand_interval_i)
+#define ibz_rand_interval_minm_m                        SQISIGN_NAMESPACE_GENERIC(ibz_rand_interval_minm_m)
+#define ibz_set                                         SQISIGN_NAMESPACE_GENERIC(ibz_set)
+#define ibz_set_from_str                                SQISIGN_NAMESPACE_GENERIC(ibz_set_from_str)
+#define ibz_size_in_base                                SQISIGN_NAMESPACE_GENERIC(ibz_size_in_base)
+#define ibz_sqrt                                        SQISIGN_NAMESPACE_GENERIC(ibz_sqrt)
+#define ibz_sqrt_floor                                  SQISIGN_NAMESPACE_GENERIC(ibz_sqrt_floor)
+#define ibz_sqrt_mod_p                                  SQISIGN_NAMESPACE_GENERIC(ibz_sqrt_mod_p)
+#define ibz_sub                                         SQISIGN_NAMESPACE_GENERIC(ibz_sub)
+#define ibz_swap                                        SQISIGN_NAMESPACE_GENERIC(ibz_swap)
+#define ibz_to_digits                                   SQISIGN_NAMESPACE_GENERIC(ibz_to_digits)
+#define ibz_two_adic                                    SQISIGN_NAMESPACE_GENERIC(ibz_two_adic)
+
+// Namespacing symbols exported from integers.c:
+#undef ibz_cornacchia_prime
+#undef ibz_generate_random_prime
+
+#define ibz_cornacchia_prime                            SQISIGN_NAMESPACE_GENERIC(ibz_cornacchia_prime)
+#define ibz_generate_random_prime                       SQISIGN_NAMESPACE_GENERIC(ibz_generate_random_prime)
+
+// Namespacing symbols exported from isog_chains.c:
+#undef ec_eval_even
+#undef ec_eval_small_chain
+#undef ec_iso_eval
+#undef ec_isomorphism
+
+#define ec_eval_even                                    SQISIGN_NAMESPACE(ec_eval_even)
+#define ec_eval_small_chain                             SQISIGN_NAMESPACE(ec_eval_small_chain)
+#define ec_iso_eval                                     SQISIGN_NAMESPACE(ec_iso_eval)
+#define ec_isomorphism                                  SQISIGN_NAMESPACE(ec_isomorphism)
+
+// Namespacing symbols exported from keygen.c:
+#undef protocols_keygen
+#undef secret_key_finalize
+#undef secret_key_init
+
+#define protocols_keygen                                SQISIGN_NAMESPACE(protocols_keygen)
+#define secret_key_finalize                             SQISIGN_NAMESPACE(secret_key_finalize)
+#define secret_key_init                                 SQISIGN_NAMESPACE(secret_key_init)
+
+// Namespacing symbols exported from l2.c:
+#undef quat_lattice_lll
+#undef quat_lll_core
+
+#define quat_lattice_lll                                SQISIGN_NAMESPACE_GENERIC(quat_lattice_lll)
+#define quat_lll_core                                   SQISIGN_NAMESPACE_GENERIC(quat_lll_core)
+
+// Namespacing symbols exported from lat_ball.c:
+#undef quat_lattice_bound_parallelogram
+#undef quat_lattice_sample_from_ball
+
+#define quat_lattice_bound_parallelogram                SQISIGN_NAMESPACE_GENERIC(quat_lattice_bound_parallelogram)
+#define quat_lattice_sample_from_ball                   SQISIGN_NAMESPACE_GENERIC(quat_lattice_sample_from_ball)
+
+// Namespacing symbols exported from lattice.c:
+#undef quat_lattice_add
+#undef quat_lattice_alg_elem_mul
+#undef quat_lattice_conjugate_without_hnf
+#undef quat_lattice_contains
+#undef quat_lattice_dual_without_hnf
+#undef quat_lattice_equal
+#undef quat_lattice_gram
+#undef quat_lattice_hnf
+#undef quat_lattice_inclusion
+#undef quat_lattice_index
+#undef quat_lattice_intersect
+#undef quat_lattice_mat_alg_coord_mul_without_hnf
+#undef quat_lattice_mul
+#undef quat_lattice_reduce_denom
+
+#define quat_lattice_add                                SQISIGN_NAMESPACE_GENERIC(quat_lattice_add)
+#define quat_lattice_alg_elem_mul                       SQISIGN_NAMESPACE_GENERIC(quat_lattice_alg_elem_mul)
+#define quat_lattice_conjugate_without_hnf              SQISIGN_NAMESPACE_GENERIC(quat_lattice_conjugate_without_hnf)
+#define quat_lattice_contains                           SQISIGN_NAMESPACE_GENERIC(quat_lattice_contains)
+#define quat_lattice_dual_without_hnf                   SQISIGN_NAMESPACE_GENERIC(quat_lattice_dual_without_hnf)
+#define quat_lattice_equal                              SQISIGN_NAMESPACE_GENERIC(quat_lattice_equal)
+#define quat_lattice_gram                               SQISIGN_NAMESPACE_GENERIC(quat_lattice_gram)
+#define quat_lattice_hnf                                SQISIGN_NAMESPACE_GENERIC(quat_lattice_hnf)
+#define quat_lattice_inclusion                          SQISIGN_NAMESPACE_GENERIC(quat_lattice_inclusion)
+#define quat_lattice_index                              SQISIGN_NAMESPACE_GENERIC(quat_lattice_index)
+#define quat_lattice_intersect                          SQISIGN_NAMESPACE_GENERIC(quat_lattice_intersect)
+#define quat_lattice_mat_alg_coord_mul_without_hnf      SQISIGN_NAMESPACE_GENERIC(quat_lattice_mat_alg_coord_mul_without_hnf)
+#define quat_lattice_mul                                SQISIGN_NAMESPACE_GENERIC(quat_lattice_mul)
+#define quat_lattice_reduce_denom                       SQISIGN_NAMESPACE_GENERIC(quat_lattice_reduce_denom)
+
+// Namespacing symbols exported from lll_applications.c:
+#undef quat_lideal_lideal_mul_reduced
+#undef quat_lideal_prime_norm_reduced_equivalent
+#undef quat_lideal_reduce_basis
+
+#define quat_lideal_lideal_mul_reduced                  SQISIGN_NAMESPACE_GENERIC(quat_lideal_lideal_mul_reduced)
+#define quat_lideal_prime_norm_reduced_equivalent       SQISIGN_NAMESPACE_GENERIC(quat_lideal_prime_norm_reduced_equivalent)
+#define quat_lideal_reduce_basis                        SQISIGN_NAMESPACE_GENERIC(quat_lideal_reduce_basis)
+
+// Namespacing symbols exported from lll_verification.c:
+#undef ibq_vec_4_copy_ibz
+#undef quat_lll_bilinear
+#undef quat_lll_gram_schmidt_transposed_with_ibq
+#undef quat_lll_set_ibq_parameters
+#undef quat_lll_verify
+
+#define ibq_vec_4_copy_ibz                              SQISIGN_NAMESPACE_GENERIC(ibq_vec_4_copy_ibz)
+#define quat_lll_bilinear                               SQISIGN_NAMESPACE_GENERIC(quat_lll_bilinear)
+#define quat_lll_gram_schmidt_transposed_with_ibq       SQISIGN_NAMESPACE_GENERIC(quat_lll_gram_schmidt_transposed_with_ibq)
+#define quat_lll_set_ibq_parameters                     SQISIGN_NAMESPACE_GENERIC(quat_lll_set_ibq_parameters)
+#define quat_lll_verify                                 SQISIGN_NAMESPACE_GENERIC(quat_lll_verify)
+
+// Namespacing symbols exported from mem.c:
+#undef sqisign_secure_clear
+#undef sqisign_secure_free
+
+#define sqisign_secure_clear                            SQISIGN_NAMESPACE_GENERIC(sqisign_secure_clear)
+#define sqisign_secure_free                             SQISIGN_NAMESPACE_GENERIC(sqisign_secure_free)
+
+// Namespacing symbols exported from mp.c:
+#undef MUL
+#undef mp_add
+#undef mp_compare
+#undef mp_copy
+#undef mp_inv_2e
+#undef mp_invert_matrix
+#undef mp_is_one
+#undef mp_is_zero
+#undef mp_mod_2exp
+#undef mp_mul
+#undef mp_mul2
+#undef mp_neg
+#undef mp_print
+#undef mp_shiftl
+#undef mp_shiftr
+#undef mp_sub
+#undef multiple_mp_shiftl
+#undef select_ct
+#undef swap_ct
+
+#define MUL                                             SQISIGN_NAMESPACE_GENERIC(MUL)
+#define mp_add                                          SQISIGN_NAMESPACE_GENERIC(mp_add)
+#define mp_compare                                      SQISIGN_NAMESPACE_GENERIC(mp_compare)
+#define mp_copy                                         SQISIGN_NAMESPACE_GENERIC(mp_copy)
+#define mp_inv_2e                                       SQISIGN_NAMESPACE_GENERIC(mp_inv_2e)
+#define mp_invert_matrix                                SQISIGN_NAMESPACE_GENERIC(mp_invert_matrix)
+#define mp_is_one                                       SQISIGN_NAMESPACE_GENERIC(mp_is_one)
+#define mp_is_zero                                      SQISIGN_NAMESPACE_GENERIC(mp_is_zero)
+#define mp_mod_2exp                                     SQISIGN_NAMESPACE_GENERIC(mp_mod_2exp)
+#define mp_mul                                          SQISIGN_NAMESPACE_GENERIC(mp_mul)
+#define mp_mul2                                         SQISIGN_NAMESPACE_GENERIC(mp_mul2)
+#define mp_neg                                          SQISIGN_NAMESPACE_GENERIC(mp_neg)
+#define mp_print                                        SQISIGN_NAMESPACE_GENERIC(mp_print)
+#define mp_shiftl                                       SQISIGN_NAMESPACE_GENERIC(mp_shiftl)
+#define mp_shiftr                                       SQISIGN_NAMESPACE_GENERIC(mp_shiftr)
+#define mp_sub                                          SQISIGN_NAMESPACE_GENERIC(mp_sub)
+#define multiple_mp_shiftl                              SQISIGN_NAMESPACE_GENERIC(multiple_mp_shiftl)
+#define select_ct                                       SQISIGN_NAMESPACE_GENERIC(select_ct)
+#define swap_ct                                         SQISIGN_NAMESPACE_GENERIC(swap_ct)
+
+// Namespacing symbols exported from normeq.c:
+#undef quat_change_to_O0_basis
+#undef quat_lattice_O0_set
+#undef quat_lattice_O0_set_extremal
+#undef quat_order_elem_create
+#undef quat_represent_integer
+#undef quat_sampling_random_ideal_O0_given_norm
+
+#define quat_change_to_O0_basis                         SQISIGN_NAMESPACE_GENERIC(quat_change_to_O0_basis)
+#define quat_lattice_O0_set                             SQISIGN_NAMESPACE_GENERIC(quat_lattice_O0_set)
+#define quat_lattice_O0_set_extremal                    SQISIGN_NAMESPACE_GENERIC(quat_lattice_O0_set_extremal)
+#define quat_order_elem_create                          SQISIGN_NAMESPACE_GENERIC(quat_order_elem_create)
+#define quat_represent_integer                          SQISIGN_NAMESPACE_GENERIC(quat_represent_integer)
+#define quat_sampling_random_ideal_O0_given_norm        SQISIGN_NAMESPACE_GENERIC(quat_sampling_random_ideal_O0_given_norm)
+
+// Namespacing symbols exported from printer.c:
+#undef ibz_mat_2x2_print
+#undef ibz_mat_4x4_print
+#undef ibz_vec_2_print
+#undef ibz_vec_4_print
+#undef quat_alg_elem_print
+#undef quat_alg_print
+#undef quat_lattice_print
+#undef quat_left_ideal_print
+
+#define ibz_mat_2x2_print                               SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_print)
+#define ibz_mat_4x4_print                               SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_print)
+#define ibz_vec_2_print                                 SQISIGN_NAMESPACE_GENERIC(ibz_vec_2_print)
+#define ibz_vec_4_print                                 SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_print)
+#define quat_alg_elem_print                             SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_print)
+#define quat_alg_print                                  SQISIGN_NAMESPACE_GENERIC(quat_alg_print)
+#define quat_lattice_print                              SQISIGN_NAMESPACE_GENERIC(quat_lattice_print)
+#define quat_left_ideal_print                           SQISIGN_NAMESPACE_GENERIC(quat_left_ideal_print)
+
+// Namespacing symbols exported from random_input_generation.c:
+#undef quat_test_input_random_ideal_generation
+#undef quat_test_input_random_ideal_lattice_generation
+#undef quat_test_input_random_lattice_generation
+
+#define quat_test_input_random_ideal_generation         SQISIGN_NAMESPACE_GENERIC(quat_test_input_random_ideal_generation)
+#define quat_test_input_random_ideal_lattice_generation SQISIGN_NAMESPACE_GENERIC(quat_test_input_random_ideal_lattice_generation)
+#define quat_test_input_random_lattice_generation       SQISIGN_NAMESPACE_GENERIC(quat_test_input_random_lattice_generation)
+
+// Namespacing symbols exported from rationals.c:
+#undef ibq_abs
+#undef ibq_add
+#undef ibq_cmp
+#undef ibq_copy
+#undef ibq_finalize
+#undef ibq_init
+#undef ibq_inv
+#undef ibq_is_ibz
+#undef ibq_is_one
+#undef ibq_is_zero
+#undef ibq_mat_4x4_finalize
+#undef ibq_mat_4x4_init
+#undef ibq_mat_4x4_print
+#undef ibq_mul
+#undef ibq_neg
+#undef ibq_reduce
+#undef ibq_set
+#undef ibq_sub
+#undef ibq_to_ibz
+#undef ibq_vec_4_finalize
+#undef ibq_vec_4_init
+#undef ibq_vec_4_print
+
+#define ibq_abs                                         SQISIGN_NAMESPACE_GENERIC(ibq_abs)
+#define ibq_add                                         SQISIGN_NAMESPACE_GENERIC(ibq_add)
+#define ibq_cmp                                         SQISIGN_NAMESPACE_GENERIC(ibq_cmp)
+#define ibq_copy                                        SQISIGN_NAMESPACE_GENERIC(ibq_copy)
+#define ibq_finalize                                    SQISIGN_NAMESPACE_GENERIC(ibq_finalize)
+#define ibq_init                                        SQISIGN_NAMESPACE_GENERIC(ibq_init)
+#define ibq_inv                                         SQISIGN_NAMESPACE_GENERIC(ibq_inv)
+#define ibq_is_ibz                                      SQISIGN_NAMESPACE_GENERIC(ibq_is_ibz)
+#define ibq_is_one                                      SQISIGN_NAMESPACE_GENERIC(ibq_is_one)
+#define ibq_is_zero                                     SQISIGN_NAMESPACE_GENERIC(ibq_is_zero)
+#define ibq_mat_4x4_finalize                            SQISIGN_NAMESPACE_GENERIC(ibq_mat_4x4_finalize)
+#define ibq_mat_4x4_init                                SQISIGN_NAMESPACE_GENERIC(ibq_mat_4x4_init)
+#define ibq_mat_4x4_print                               SQISIGN_NAMESPACE_GENERIC(ibq_mat_4x4_print)
+#define ibq_mul                                         SQISIGN_NAMESPACE_GENERIC(ibq_mul)
+#define ibq_neg                                         SQISIGN_NAMESPACE_GENERIC(ibq_neg)
+#define ibq_reduce                                      SQISIGN_NAMESPACE_GENERIC(ibq_reduce)
+#define ibq_set                                         SQISIGN_NAMESPACE_GENERIC(ibq_set)
+#define ibq_sub                                         SQISIGN_NAMESPACE_GENERIC(ibq_sub)
+#define ibq_to_ibz                                      SQISIGN_NAMESPACE_GENERIC(ibq_to_ibz)
+#define ibq_vec_4_finalize                              SQISIGN_NAMESPACE_GENERIC(ibq_vec_4_finalize)
+#define ibq_vec_4_init                                  SQISIGN_NAMESPACE_GENERIC(ibq_vec_4_init)
+#define ibq_vec_4_print                                 SQISIGN_NAMESPACE_GENERIC(ibq_vec_4_print)
+
+// Namespacing symbols exported from sign.c:
+#undef protocols_sign
+
+#define protocols_sign                                  SQISIGN_NAMESPACE(protocols_sign)
+
+// Namespacing symbols exported from sqisign.c:
+#undef sqisign_keypair
+#undef sqisign_open
+#undef sqisign_sign
+#undef sqisign_verify
+
+#define sqisign_keypair                                 SQISIGN_NAMESPACE(sqisign_keypair)
+#define sqisign_open                                    SQISIGN_NAMESPACE(sqisign_open)
+#define sqisign_sign                                    SQISIGN_NAMESPACE(sqisign_sign)
+#define sqisign_verify                                  SQISIGN_NAMESPACE(sqisign_verify)
+
+// Namespacing symbols exported from theta_isogenies.c:
+#undef theta_chain_compute_and_eval
+#undef theta_chain_compute_and_eval_randomized
+#undef theta_chain_compute_and_eval_verify
+
+#define theta_chain_compute_and_eval                    SQISIGN_NAMESPACE(theta_chain_compute_and_eval)
+#define theta_chain_compute_and_eval_randomized         SQISIGN_NAMESPACE(theta_chain_compute_and_eval_randomized)
+#define theta_chain_compute_and_eval_verify             SQISIGN_NAMESPACE(theta_chain_compute_and_eval_verify)
+
+// Namespacing symbols exported from theta_structure.c:
+#undef double_iter
+#undef double_point
+#undef is_product_theta_point
+#undef theta_precomputation
+
+#define double_iter                                     SQISIGN_NAMESPACE(double_iter)
+#define double_point                                    SQISIGN_NAMESPACE(double_point)
+#define is_product_theta_point                          SQISIGN_NAMESPACE(is_product_theta_point)
+#define theta_precomputation                            SQISIGN_NAMESPACE(theta_precomputation)
+
+// Namespacing symbols exported from verify.c:
+#undef protocols_verify
+
+#define protocols_verify                                SQISIGN_NAMESPACE(protocols_verify)
+
+// Namespacing symbols exported from xeval.c:
+#undef xeval_2
+#undef xeval_2_singular
+#undef xeval_4
+
+#define xeval_2                                         SQISIGN_NAMESPACE(xeval_2)
+#define xeval_2_singular                                SQISIGN_NAMESPACE(xeval_2_singular)
+#define xeval_4                                         SQISIGN_NAMESPACE(xeval_4)
+
+// Namespacing symbols exported from xisog.c:
+#undef xisog_2
+#undef xisog_2_singular
+#undef xisog_4
+
+#define xisog_2                                         SQISIGN_NAMESPACE(xisog_2)
+#define xisog_2_singular                                SQISIGN_NAMESPACE(xisog_2_singular)
+#define xisog_4                                         SQISIGN_NAMESPACE(xisog_4)
+
+
+#endif
+
diff --git a/src/pqm4/sqisign_lvl1/ref/theta_isogenies.c b/src/pqm4/sqisign_lvl1/ref/theta_isogenies.c
new file mode 100644
index 0000000..478a9ab
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/theta_isogenies.c
@@ -0,0 +1,1283 @@
+#include "theta_isogenies.h"
+#include <stdio.h>
+#include <inttypes.h>
+#include <assert.h>
+#include <tools.h>
+#include <rng.h>
+
+// Constant-time choice between a regular base change matrix M1 and a precomputed one M2
+// (its entries index FP2_CONSTANTS): option = 0 gives M <- M1, option = 0xFF...FF gives M <- M2
+static inline void
+select_base_change_matrix(basis_change_matrix_t *M,
+                          const basis_change_matrix_t *M1,
+                          const precomp_basis_change_matrix_t *M2,
+                          const uint32_t option)
+{
+    for (int idx = 0; idx < 16; idx++) {
+        const int row = idx / 4, col = idx % 4;
+        fp2_select(&M->m[row][col], &M1->m[row][col], &FP2_CONSTANTS[M2->m[row][col]], option);
+    }
+}
+
+// Expand a precomputed base change matrix into a regular one via FP2_CONSTANTS lookups
+static inline void
+set_base_change_matrix_from_precomp(basis_change_matrix_t *res, const precomp_basis_change_matrix_t *M)
+{
+    for (int idx = 0; idx < 16; idx++) {
+        res->m[idx / 4][idx % 4] = FP2_CONSTANTS[M->m[idx / 4][idx % 4]];
+    }
+}
+
+// Copy into res the coordinate of T selected by ind modulo 4: 0 -> x, 1 -> y, 2 -> z, 3 -> t.
+// A negative ind leaves a negative remainder in C and matches no coordinate, so the
+// assert fires; if asserts are compiled out, fp2_copy is then called with a NULL source.
+static inline void
+choose_index_theta_point(fp2_t *res, int ind, const theta_point_t *T)
+{
+    const int slot = ind % 4;
+    const fp2_t *chosen = NULL;
+
+    if (slot == 0) {
+        chosen = &T->x;
+    } else if (slot == 1) {
+        chosen = &T->y;
+    } else if (slot == 2) {
+        chosen = &T->z;
+    } else if (slot == 3) {
+        chosen = &T->t;
+    } else {
+        assert(0);
+    }
+    fp2_copy(res, chosen);
+}
+
+// Compute res = M * P with P read as the column vector (x, y, z, t); same as apply_isomorphism,
+// but the four products with P->t are skipped when Pt_not_zero is false (valid only if P->t is zero).
+static void
+apply_isomorphism_general(theta_point_t *res,
+                          const basis_change_matrix_t *M,
+                          const theta_point_t *P,
+                          const bool Pt_not_zero)
+{
+    theta_point_t image;
+    fp2_t term;
+
+    // First column of M times P->x starts each coordinate of the result
+    fp2_mul(&image.x, &P->x, &M->m[0][0]);
+    fp2_mul(&image.y, &P->x, &M->m[1][0]);
+    fp2_mul(&image.z, &P->x, &M->m[2][0]);
+    fp2_mul(&image.t, &P->x, &M->m[3][0]);
+
+    // Then add the second column times P->y and the third column times P->z
+    fp2_mul(&term, &P->y, &M->m[0][1]);
+    fp2_add(&image.x, &image.x, &term);
+    fp2_mul(&term, &P->y, &M->m[1][1]);
+    fp2_add(&image.y, &image.y, &term);
+    fp2_mul(&term, &P->y, &M->m[2][1]);
+    fp2_add(&image.z, &image.z, &term);
+    fp2_mul(&term, &P->y, &M->m[3][1]);
+    fp2_add(&image.t, &image.t, &term);
+
+    fp2_mul(&term, &P->z, &M->m[0][2]);
+    fp2_add(&image.x, &image.x, &term);
+    fp2_mul(&term, &P->z, &M->m[1][2]);
+    fp2_add(&image.y, &image.y, &term);
+    fp2_mul(&term, &P->z, &M->m[2][2]);
+    fp2_add(&image.z, &image.z, &term);
+    fp2_mul(&term, &P->z, &M->m[3][2]);
+    fp2_add(&image.t, &image.t, &term);
+
+    // The fourth column only contributes when the caller says P->t may be non-zero
+    if (Pt_not_zero) {
+        fp2_mul(&term, &P->t, &M->m[0][3]);
+        fp2_add(&image.x, &image.x, &term);
+        fp2_mul(&term, &P->t, &M->m[1][3]);
+        fp2_add(&image.y, &image.y, &term);
+        fp2_mul(&term, &P->t, &M->m[2][3]);
+        fp2_add(&image.z, &image.z, &term);
+        fp2_mul(&term, &P->t, &M->m[3][3]);
+        fp2_add(&image.t, &image.t, &term);
+    }
+    // res is written only now: the result was built in a local point, so P is read in full first
+    fp2_copy(&res->x, &image.x);
+    fp2_copy(&res->y, &image.y);
+    fp2_copy(&res->z, &image.z);
+    fp2_copy(&res->t, &image.t);
+}
+
+static void
+apply_isomorphism(theta_point_t *res, const basis_change_matrix_t *M, const theta_point_t *P)
+{
+    apply_isomorphism_general(res, M, P, true); // true: include the products with P->t, i.e. the full res = M * P
+}
+
+// Matrix product res = M1 * M2. The product is assembled in a local matrix and
+// copied out at the end, so M1 and M2 are read in full before res is written.
+static void
+base_change_matrix_multiplication(basis_change_matrix_t *res,
+                                  const basis_change_matrix_t *M1,
+                                  const basis_change_matrix_t *M2)
+{
+    basis_change_matrix_t product;
+    fp2_t term;
+
+    for (int row = 0; row < 4; row++) {
+        for (int col = 0; col < 4; col++) {
+            fp2_t *entry = &product.m[row][col];
+            fp2_set_zero(entry);
+            for (int k = 0; k < 4; k++) {
+                fp2_mul(&term, &M1->m[row][k], &M2->m[k][col]);
+                fp2_add(entry, entry, &term);
+            }
+        }
+    }
+    *res = product;
+}
+
+// Map the couple point T = (P1, P2) on an elliptic product to a theta point: form the
+// four products of the (x, z) coordinates of P1 and P2, then apply the matrix phi->M
+static void
+base_change(theta_point_t *out, const theta_gluing_t *phi, const theta_couple_point_t *T)
+{
+    const ec_point_t *P1 = &T->P1, *P2 = &T->P2;
+    theta_point_t prod;
+
+    // prod = (P1.x P2.x : P1.x P2.z : P1.z P2.x : P1.z P2.z)
+    fp2_mul(&prod.x, &P1->x, &P2->x);
+    fp2_mul(&prod.y, &P1->x, &P2->z);
+    fp2_mul(&prod.z, &P2->x, &P1->z);
+    fp2_mul(&prod.t, &P1->z, &P2->z);
+
+    apply_isomorphism(out, &phi->M, &prod);
+}
+
+static void
+action_by_translation_z_and_det(fp2_t *z_inv, fp2_t *det_inv, const ec_point_t *P4, const ec_point_t *P2)
+{
+    fp2_t cross;
+
+    // z_inv <- Z-coordinate of P4, not inverted here (presumably the caller inverts it)
+    fp2_copy(z_inv, &P4->z);
+    // det_inv <- detij = xij wij - uij zij, with (xij : zij) = P4 and (uij : wij) = P2
+    fp2_mul(det_inv, &P4->x, &P2->z);
+    fp2_mul(&cross, &P4->z, &P2->x);
+    fp2_sub(det_inv, det_inv, &cross);
+}
+
+static void
+action_by_translation_compute_matrix(translation_matrix_t *G,
+                                     const ec_point_t *P4,
+                                     const ec_point_t *P2,
+                                     const fp2_t *z_inv,
+                                     const fp2_t *det_inv)
+{
+    fp2_t tmp;
+    // Notation: (xij : zij) = P4, (uij : wij) = P2; z_inv and det_inv are expected to hold 1/zij and 1/detij
+    // Gi.g10 = uij xij / detij - xij / zij
+    fp2_mul(&tmp, &P4->x, z_inv);
+    fp2_mul(&G->g10, &P4->x, &P2->x);
+    fp2_mul(&G->g10, &G->g10, det_inv);
+    fp2_sub(&G->g10, &G->g10, &tmp);
+
+    // Gi.g11 = uij zij / detij (multiplication by det_inv)
+    fp2_mul(&G->g11, &P2->x, det_inv);
+    fp2_mul(&G->g11, &G->g11, &P4->z);
+
+    // Gi.g00 = -Gi.g11
+    fp2_neg(&G->g00, &G->g11);
+
+    // Gi.g01 = - wij zij / detij (multiplication by det_inv)
+    fp2_mul(&G->g01, &P2->z, det_inv);
+    fp2_mul(&G->g01, &G->g01, &P4->z);
+    fp2_neg(&G->g01, &G->g01);
+}
+
+// Returns 1 if K1_2 and K2_2 pass all checks below (no zero component, distinct components
+// on each curve, every component killed by doubling) and 0 otherwise. We only expect this to
+// fail for malformed signatures, so do not require this to run in constant time.
+static int
+verify_two_torsion(const theta_couple_point_t *K1_2, const theta_couple_point_t *K2_2, const theta_couple_curve_t *E12)
+{
+    // First check if any point in K1_2 or K2_2 is zero, if they are then the points did not have
+    // order 8 when we started gluing
+    if (ec_is_zero(&K1_2->P1) | ec_is_zero(&K1_2->P2) | ec_is_zero(&K2_2->P1) | ec_is_zero(&K2_2->P2)) {
+        return 0;
+    }
+
+    // Now ensure that P1, Q1 and P2, Q2 are independent. For points of order two this means
+    // that they're not the same
+    if (ec_is_equal(&K1_2->P1, &K2_2->P1) | ec_is_equal(&K1_2->P2, &K2_2->P2)) {
+        return 0;
+    }
+
+    // Finally, double points to ensure all points have order exactly 2
+    theta_couple_point_t O1, O2;
+    double_couple_point(&O1, K1_2, E12);
+    double_couple_point(&O2, K2_2, E12);
+    // If this check fails then the points had order 2*f for some f, and the kernel is malformed.
+    if (!(ec_is_zero(&O1.P1) & ec_is_zero(&O1.P2) & ec_is_zero(&O2.P1) & ec_is_zero(&O2.P2))) {
+        return 0;
+    }
+
+    return 1;
+}
+
+// Computes the four 2x2 translation matrices Gi[0..3] attached to the components
+// K1_4->P1, K1_4->P2, K2_4->P1 and K2_4->P2 (in this order), sharing a single
+// batched inversion between them to save on inversions.
+// Returns 0 if the doubles of K1_4 and K2_4 are rejected by verify_two_torsion
+// or if the first inverted value is zero, and 1 otherwise
+static int
+action_by_translation(translation_matrix_t *Gi,
+                      const theta_couple_point_t *K1_4,
+                      const theta_couple_point_t *K2_4,
+                      const theta_couple_curve_t *E12)
+{
+    // Double K1_4 and K2_4; the results are expected to be points of order 2
+    theta_couple_point_t K1_2, K2_2;
+    double_couple_point(&K1_2, K1_4, E12);
+    double_couple_point(&K2_2, K2_4, E12);
+    if (!verify_two_torsion(&K1_2, &K2_2, E12)) {
+        return 0;
+    }
+
+    // The four (input point, doubled point) pairs, in the order used for Gi
+    const ec_point_t *const pts4[4] = { &K1_4->P1, &K1_4->P2, &K2_4->P1, &K2_4->P2 };
+    const ec_point_t *const pts2[4] = { &K1_2.P1, &K1_2.P2, &K2_2.P1, &K2_2.P2 };
+
+    // inverses[i] takes the Z coordinate and inverses[i + 4] the determinant of
+    // pair i; all eight values are then inverted together
+    fp2_t inverses[8];
+    for (int i = 0; i < 4; i++) {
+        action_by_translation_z_and_det(&inverses[i], &inverses[i + 4], pts4[i], pts2[i]);
+    }
+    fp2_batched_inv(inverses, 8);
+
+    // something was wrong with our input (which somehow was not caught by verify_two_torsion)
+    if (fp2_is_zero(&inverses[0])) {
+        return 0;
+    }
+
+    for (int i = 0; i < 4; i++) {
+        action_by_translation_compute_matrix(&Gi[i], pts4[i], pts2[i], &inverses[i], &inverses[i + 4]);
+    }
+    return 1;
+}
+
+// Given the appropriate four torsion, computes the
+// change of basis to compute the correct theta null
+// point.
+// Returns 0 if action_by_translation rejects K1_4 or K2_4 (e.g. wrong order), 1 otherwise
+static int
+gluing_change_of_basis(basis_change_matrix_t *M,
+                       const theta_couple_point_t *K1_4,
+                       const theta_couple_point_t *K2_4,
+                       const theta_couple_curve_t *E12)
+{
+    // Compute the four 2x2 matrices for the action by translation
+    // on the four points:
+    translation_matrix_t Gi[4];
+    if (!action_by_translation(Gi, K1_4, K2_4, E12))
+        return 0;
+
+    // Computation of the 4x4 matrix M from the 2x2 matrices Gi
+    // t001, t101 (resp t002, t102) first column of Gi[0] * Gi[2] (resp Gi[1] * Gi[3])
+    fp2_t t001, t101, t002, t102, tmp;
+
+    fp2_mul(&t001, &Gi[0].g00, &Gi[2].g00);
+    fp2_mul(&tmp, &Gi[0].g01, &Gi[2].g10);
+    fp2_add(&t001, &t001, &tmp);
+
+    fp2_mul(&t101, &Gi[0].g10, &Gi[2].g00);
+    fp2_mul(&tmp, &Gi[0].g11, &Gi[2].g10);
+    fp2_add(&t101, &t101, &tmp);
+
+    fp2_mul(&t002, &Gi[1].g00, &Gi[3].g00);
+    fp2_mul(&tmp, &Gi[1].g01, &Gi[3].g10);
+    fp2_add(&t002, &t002, &tmp);
+
+    fp2_mul(&t102, &Gi[1].g10, &Gi[3].g00);
+    fp2_mul(&tmp, &Gi[1].g11, &Gi[3].g10);
+    fp2_add(&t102, &t102, &tmp);
+
+    // trace for the first row
+    fp2_set_one(&M->m[0][0]);
+    fp2_mul(&tmp, &t001, &t002);
+    fp2_add(&M->m[0][0], &M->m[0][0], &tmp);
+    fp2_mul(&tmp, &Gi[2].g00, &Gi[3].g00);
+    fp2_add(&M->m[0][0], &M->m[0][0], &tmp);
+    fp2_mul(&tmp, &Gi[0].g00, &Gi[1].g00);
+    fp2_add(&M->m[0][0], &M->m[0][0], &tmp);
+
+    fp2_mul(&M->m[0][1], &t001, &t102);
+    fp2_mul(&tmp, &Gi[2].g00, &Gi[3].g10);
+    fp2_add(&M->m[0][1], &M->m[0][1], &tmp);
+    fp2_mul(&tmp, &Gi[0].g00, &Gi[1].g10);
+    fp2_add(&M->m[0][1], &M->m[0][1], &tmp);
+
+    fp2_mul(&M->m[0][2], &t101, &t002);
+    fp2_mul(&tmp, &Gi[2].g10, &Gi[3].g00);
+    fp2_add(&M->m[0][2], &M->m[0][2], &tmp);
+    fp2_mul(&tmp, &Gi[0].g10, &Gi[1].g00);
+    fp2_add(&M->m[0][2], &M->m[0][2], &tmp);
+
+    fp2_mul(&M->m[0][3], &t101, &t102);
+    fp2_mul(&tmp, &Gi[2].g10, &Gi[3].g10);
+    fp2_add(&M->m[0][3], &M->m[0][3], &tmp);
+    fp2_mul(&tmp, &Gi[0].g10, &Gi[1].g10);
+    fp2_add(&M->m[0][3], &M->m[0][3], &tmp);
+
+    // Compute the action of (0,K2_4.P2), i.e. Gi[3], on the first row for the second row
+    fp2_mul(&tmp, &Gi[3].g01, &M->m[0][1]);
+    fp2_mul(&M->m[1][0], &Gi[3].g00, &M->m[0][0]);
+    fp2_add(&M->m[1][0], &M->m[1][0], &tmp);
+
+    fp2_mul(&tmp, &Gi[3].g11, &M->m[0][1]);
+    fp2_mul(&M->m[1][1], &Gi[3].g10, &M->m[0][0]);
+    fp2_add(&M->m[1][1], &M->m[1][1], &tmp);
+
+    fp2_mul(&tmp, &Gi[3].g01, &M->m[0][3]);
+    fp2_mul(&M->m[1][2], &Gi[3].g00, &M->m[0][2]);
+    fp2_add(&M->m[1][2], &M->m[1][2], &tmp);
+
+    fp2_mul(&tmp, &Gi[3].g11, &M->m[0][3]);
+    fp2_mul(&M->m[1][3], &Gi[3].g10, &M->m[0][2]);
+    fp2_add(&M->m[1][3], &M->m[1][3], &tmp);
+
+    // compute the action of (K1_4.P1,0), i.e. Gi[0], on the first row for the third row
+    fp2_mul(&tmp, &Gi[0].g01, &M->m[0][2]);
+    fp2_mul(&M->m[2][0], &Gi[0].g00, &M->m[0][0]);
+    fp2_add(&M->m[2][0], &M->m[2][0], &tmp);
+
+    fp2_mul(&tmp, &Gi[0].g01, &M->m[0][3]);
+    fp2_mul(&M->m[2][1], &Gi[0].g00, &M->m[0][1]);
+    fp2_add(&M->m[2][1], &M->m[2][1], &tmp);
+
+    fp2_mul(&tmp, &Gi[0].g11, &M->m[0][2]);
+    fp2_mul(&M->m[2][2], &Gi[0].g10, &M->m[0][0]);
+    fp2_add(&M->m[2][2], &M->m[2][2], &tmp);
+
+    fp2_mul(&tmp, &Gi[0].g11, &M->m[0][3]);
+    fp2_mul(&M->m[2][3], &Gi[0].g10, &M->m[0][1]);
+    fp2_add(&M->m[2][3], &M->m[2][3], &tmp);
+
+    // compute the action of (K1_4.P1,K2_4.P2) for the final row: Gi[0] applied to the second row
+    fp2_mul(&tmp, &Gi[0].g01, &M->m[1][2]);
+    fp2_mul(&M->m[3][0], &Gi[0].g00, &M->m[1][0]);
+    fp2_add(&M->m[3][0], &M->m[3][0], &tmp);
+
+    fp2_mul(&tmp, &Gi[0].g01, &M->m[1][3]);
+    fp2_mul(&M->m[3][1], &Gi[0].g00, &M->m[1][1]);
+    fp2_add(&M->m[3][1], &M->m[3][1], &tmp);
+
+    fp2_mul(&tmp, &Gi[0].g11, &M->m[1][2]);
+    fp2_mul(&M->m[3][2], &Gi[0].g10, &M->m[1][0]);
+    fp2_add(&M->m[3][2], &M->m[3][2], &tmp);
+
+    fp2_mul(&tmp, &Gi[0].g11, &M->m[1][3]);
+    fp2_mul(&M->m[3][3], &Gi[0].g10, &M->m[1][1]);
+    fp2_add(&M->m[3][3], &M->m[3][3], &tmp);
+
+    return 1;
+}
+
+/**
+ * @brief Compute the gluing isogeny from an elliptic product
+ *
+ * @param out Output: the theta_gluing
+ * @param E12 an elliptic curve product E1 x E2
+ * @param xyK1_8 a couple point in (X : Y : Z) coordinates, expected to have order 8
+ * @param xyK2_8 a couple point in (X : Y : Z) coordinates, expected to have order 8
+ * @param verify if true, also check that the 4-torsion points [2]K1_8 and [2]K2_8 are isotropic
+ * out : E1xE2 -> A of kernel [4](K1_8,K2_8)
+ * if the kernel supplied has the incorrect order, or gluing seems malformed,
+ * returns 0, otherwise returns 1.
+ */
+static int
+gluing_compute(theta_gluing_t *out,
+               const theta_couple_curve_t *E12,
+               const theta_couple_jac_point_t *xyK1_8,
+               const theta_couple_jac_point_t *xyK2_8,
+               bool verify)
+{
+    // Ensure that we have been given the eight torsion (debug builds only, diagnostics only)
+#ifndef NDEBUG
+    {
+        int check = test_jac_order_twof(&xyK1_8->P1, &E12->E1, 3);
+        if (!check)
+            debug_print("xyK1_8->P1 does not have order 8");
+        check = test_jac_order_twof(&xyK2_8->P1, &E12->E1, 3);
+        if (!check)
+            debug_print("xyK2_8->P1 does not have order 8");
+        check = test_jac_order_twof(&xyK1_8->P2, &E12->E2, 3);
+        if (!check)
+            debug_print("xyK1_8->P2 does not have order 8");
+        check = test_jac_order_twof(&xyK2_8->P2, &E12->E2, 3);
+        if (!check)
+            debug_print("xyK2_8->P2 does not have order 8");
+    }
+#endif
+
+    out->xyK1_8 = *xyK1_8;
+    out->domain = *E12;
+
+    // Given points in E[8] x E[8] we need the four torsion below
+    theta_couple_jac_point_t xyK1_4, xyK2_4;
+
+    double_couple_jac_point(&xyK1_4, xyK1_8, E12);
+    double_couple_jac_point(&xyK2_4, xyK2_8, E12);
+
+    // Convert from (X:Y:Z) coordinates to (X:Z)
+    theta_couple_point_t K1_8, K2_8;
+    theta_couple_point_t K1_4, K2_4;
+
+    couple_jac_to_xz(&K1_8, xyK1_8);
+    couple_jac_to_xz(&K2_8, xyK2_8);
+    couple_jac_to_xz(&K1_4, &xyK1_4);
+    couple_jac_to_xz(&K2_4, &xyK2_4);
+
+    // Set the basis change matrix, if we have not been given a valid K[8] for this computation
+    // gluing_change_of_basis will detect this and return 0
+    if (!gluing_change_of_basis(&out->M, &K1_4, &K2_4, E12)) {
+        debug_print("gluing failed as kernel does not have correct order");
+        return 0;
+    }
+
+    // apply the base change to the kernel
+    theta_point_t TT1, TT2;
+
+    base_change(&TT1, out, &K1_8);
+    base_change(&TT2, out, &K2_8);
+
+    // compute the codomain
+    to_squared_theta(&TT1, &TT1);
+    to_squared_theta(&TT2, &TT2);
+
+    // If the kernel is well formed then TT1.t and TT2.t are zero
+    // if they are not, we exit early as the signature we are validating
+    // is probably malformed
+    if (!(fp2_is_zero(&TT1.t) & fp2_is_zero(&TT2.t))) {
+        debug_print("gluing failed TT1.t or TT2.t is not zero");
+        return 0;
+    }
+    // Test our projective factors are non zero
+    if (fp2_is_zero(&TT1.x) | fp2_is_zero(&TT2.x) | fp2_is_zero(&TT1.y) | fp2_is_zero(&TT2.z) | fp2_is_zero(&TT1.z))
+        return 0; // invalid input
+
+    // Projective factor: Ax
+    fp2_mul(&out->codomain.x, &TT1.x, &TT2.x);
+    fp2_mul(&out->codomain.y, &TT1.y, &TT2.x);
+    fp2_mul(&out->codomain.z, &TT1.x, &TT2.z);
+    fp2_set_zero(&out->codomain.t);
+    // Projective factor: ABCxz
+    fp2_mul(&out->precomputation.x, &TT1.y, &TT2.z);
+    fp2_copy(&out->precomputation.y, &out->codomain.z);
+    fp2_copy(&out->precomputation.z, &out->codomain.y);
+    fp2_set_zero(&out->precomputation.t);
+
+    // Compute the two components of phi(K1_8) = (x:x:y:y).
+    fp2_mul(&out->imageK1_8.x, &TT1.x, &out->precomputation.x);
+    fp2_mul(&out->imageK1_8.y, &TT1.z, &out->precomputation.z);
+
+    // If K1_8 and K2_8 are our 8-torsion points, this ensures that the
+    // 4-torsion points [2]K1_8 and [2]K2_8 are isotropic.
+    if (verify) {
+        fp2_t t1, t2;
+        fp2_mul(&t1, &TT1.y, &out->precomputation.y);
+        if (!fp2_is_equal(&out->imageK1_8.x, &t1))
+            return 0;
+        fp2_mul(&t1, &TT2.x, &out->precomputation.x);
+        fp2_mul(&t2, &TT2.z, &out->precomputation.z);
+        if (!fp2_is_equal(&t2, &t1))
+            return 0;
+    }
+
+    // compute the final codomain (hadamard applied in place to the null point computed above)
+    hadamard(&out->codomain, &out->codomain);
+    return 1;
+}
+
+// Subroutine of gluing_eval_basis: evaluates the gluing isogeny phi at the couple point P
+static void
+gluing_eval_point(theta_point_t *image, const theta_couple_jac_point_t *P, const theta_gluing_t *phi)
+{
+    theta_point_t T1, T2;
+    add_components_t add_comp1, add_comp2;
+
+    // Compute the cross addition components of P1+Q1 and P2+Q2, where Q = phi->xyK1_8
+    jac_to_xz_add_components(&add_comp1, &P->P1, &phi->xyK1_8.P1, &phi->domain.E1);
+    jac_to_xz_add_components(&add_comp2, &P->P2, &phi->xyK1_8.P2, &phi->domain.E2);
+
+    // Compute T1 and T2 derived from the cross addition components.
+    fp2_mul(&T1.x, &add_comp1.u, &add_comp2.u); // T1x = u1u2
+    fp2_mul(&T2.t, &add_comp1.v, &add_comp2.v); // T2t = v1v2
+    fp2_add(&T1.x, &T1.x, &T2.t);               // T1x = u1u2 + v1v2
+    fp2_mul(&T1.y, &add_comp1.u, &add_comp2.w); // T1y = u1w2
+    fp2_mul(&T1.z, &add_comp1.w, &add_comp2.u); // T1z = w1u2
+    fp2_mul(&T1.t, &add_comp1.w, &add_comp2.w); // T1t = w1w2
+    fp2_add(&T2.x, &add_comp1.u, &add_comp1.v); // T2x = (u1+v1)
+    fp2_add(&T2.y, &add_comp2.u, &add_comp2.v); // T2y = (u2+v2)
+    fp2_mul(&T2.x, &T2.x, &T2.y);               // T2x = (u1+v1)(u2+v2)
+    fp2_sub(&T2.x, &T2.x, &T1.x);               // T2x = v1u2 + u1v2
+    fp2_mul(&T2.y, &add_comp1.v, &add_comp2.w); // T2y = v1w2
+    fp2_mul(&T2.z, &add_comp1.w, &add_comp2.v); // T2z = w1v2
+    fp2_set_zero(&T2.t);                        // T2t = 0
+
+    // Apply the basis change and compute their respective square
+    // theta(P+Q) = M.T1 - M.T2 and theta(P-Q) = M.T1 + M.T2
+    apply_isomorphism_general(&T1, &phi->M, &T1, true);
+    apply_isomorphism_general(&T2, &phi->M, &T2, false);
+    pointwise_square(&T1, &T1);
+    pointwise_square(&T2, &T2);
+
+    // the difference between the two is therefore theta(P+Q)theta(P-Q)
+    // whose hadamard transform is then the product of the dual
+    // theta_points of phi(P) and phi(Q).
+    fp2_sub(&T1.x, &T1.x, &T2.x);
+    fp2_sub(&T1.y, &T1.y, &T2.y);
+    fp2_sub(&T1.z, &T1.z, &T2.z);
+    fp2_sub(&T1.t, &T1.t, &T2.t);
+    hadamard(&T1, &T1);
+
+    // Compute (x, y, z, t)
+    // As imageK1_8 = (x:x:y:y), its inverse is (y:y:x:x).
+    fp2_mul(&image->x, &T1.x, &phi->imageK1_8.y);
+    fp2_mul(&image->y, &T1.y, &phi->imageK1_8.y);
+    fp2_mul(&image->z, &T1.z, &phi->imageK1_8.x);
+    fp2_mul(&image->t, &T1.t, &phi->imageK1_8.x);
+
+    hadamard(image, image);
+}
+
+// Same as gluing_eval_point but in the very special case where we already know that the point will
+// have a zero coordinate at the place where the zero coordinate of the dual_theta_nullpoint would
+// have made the computation difficult. Returns 1 on success and 0 if that coordinate is not zero.
+static int
+gluing_eval_point_special_case(theta_point_t *image, const theta_couple_point_t *P, const theta_gluing_t *phi)
+{
+    theta_point_t T;
+
+    // Apply the basis change
+    base_change(&T, phi, P);
+
+    // Apply the to_squared_theta transform
+    to_squared_theta(&T, &T);
+
+    // This coordinate should always be 0 in a gluing because D=0.
+    // If this is not the case, something went very wrong, so reject
+    if (!fp2_is_zero(&T.t))
+        return 0;
+
+    // Compute (x, y, z, t): scale by the gluing precomputation, t stays 0
+    fp2_mul(&image->x, &T.x, &phi->precomputation.x);
+    fp2_mul(&image->y, &T.y, &phi->precomputation.y);
+    fp2_mul(&image->z, &T.z, &phi->precomputation.z);
+    fp2_set_zero(&image->t);
+
+    hadamard(image, image);
+    return 1;
+}
+
+/**
+ * @brief Evaluate a gluing isogeny from an elliptic product on a basis
+ *
+ * @param image1 Output: the theta_point of the image of the first couple of points
+ * @param image2 Output: the theta_point of the image of the second couple of points
+ * @param xyT1 A pair of points (X : Y : Z) on E1E2 to glue using phi
+ * @param xyT2 A pair of points (X : Y : Z) on E1E2 to glue using phi
+ * @param phi a gluing isogeny E1 x E2 -> A
+ *
+ **/
+static void
+gluing_eval_basis(theta_point_t *image1,
+                  theta_point_t *image2,
+                  const theta_couple_jac_point_t *xyT1,
+                  const theta_couple_jac_point_t *xyT2,
+                  const theta_gluing_t *phi)
+{
+    gluing_eval_point(image1, xyT1, phi); // each couple point is evaluated independently
+    gluing_eval_point(image2, xyT2, phi);
+}
+
+/**
+ * @brief Compute a (2,2) isogeny in dimension 2 in the theta_model
+ *
+ * @param out Output: the theta_isogeny
+ * @param A a theta null point for the domain
+ * @param T1_8 a point in A[8]
+ * @param T2_8 a point in A[8]
+ * @param hadamard_bool_1 a boolean used for the last two steps of the chain
+ * @param hadamard_bool_2 a boolean used for the last two steps of the chain
+ * @param verify add extra sanity checks that the 8-torsion points are coherent with the isogeny
+ * out : A -> B of kernel [4](T1_8,T2_8)
+ * hadamard_bool_1 controls if the domain is in standard or dual coordinates
+ * hadamard_bool_2 controls if the codomain is in standard or dual coordinates
+ * Returns 0 if a coordinate that must be non-zero vanishes or a verify check fails, 1 otherwise
+ *
+ */
+static int
+theta_isogeny_compute(theta_isogeny_t *out,
+                      const theta_structure_t *A,
+                      const theta_point_t *T1_8,
+                      const theta_point_t *T2_8,
+                      bool hadamard_bool_1,
+                      bool hadamard_bool_2,
+                      bool verify)
+{
+    out->hadamard_bool_1 = hadamard_bool_1;
+    out->hadamard_bool_2 = hadamard_bool_2;
+    out->domain = *A;
+    out->T1_8 = *T1_8;
+    out->T2_8 = *T2_8;
+    out->codomain.precomputation = false;
+
+    theta_point_t TT1, TT2;
+
+    if (hadamard_bool_1) {
+        hadamard(&TT1, T1_8);
+        to_squared_theta(&TT1, &TT1);
+        hadamard(&TT2, T2_8);
+        to_squared_theta(&TT2, &TT2);
+    } else {
+        to_squared_theta(&TT1, T1_8);
+        to_squared_theta(&TT2, T2_8);
+    }
+
+    fp2_t t1, t2;
+
+    // Test that our projective factor ABCDxzw is non zero, where
+    // TT1=(Ax, Bx, Cy, Dy), TT2=(Az, Bw, Cz, Dw)
+    // But ABCDxzw=0 can only happen if we had an unexpected splitting in
+    // the isogeny chain.
+    // In either case reject
+    // (this is not strictly necessary, we could just return (0:0:0:0))
+    if (fp2_is_zero(&TT2.x) | fp2_is_zero(&TT2.y) | fp2_is_zero(&TT2.z) | fp2_is_zero(&TT2.t) | fp2_is_zero(&TT1.x) |
+        fp2_is_zero(&TT1.y))
+        return 0;
+
+    fp2_mul(&t1, &TT1.x, &TT2.y);
+    fp2_mul(&t2, &TT1.y, &TT2.x);
+    fp2_mul(&out->codomain.null_point.x, &TT2.x, &t1);
+    fp2_mul(&out->codomain.null_point.y, &TT2.y, &t2);
+    fp2_mul(&out->codomain.null_point.z, &TT2.z, &t1);
+    fp2_mul(&out->codomain.null_point.t, &TT2.t, &t2);
+    fp2_t t3;
+    fp2_mul(&t3, &TT2.z, &TT2.t);
+    fp2_mul(&out->precomputation.x, &t3, &TT1.y);
+    fp2_mul(&out->precomputation.y, &t3, &TT1.x);
+    fp2_copy(&out->precomputation.z, &out->codomain.null_point.t);
+    fp2_copy(&out->precomputation.t, &out->codomain.null_point.z);
+
+    // If T1_8 and T2_8 are our 8-torsion points, this ensures that the
+    // 4-torsion points 2T1_8 and 2T2_8 are isotropic.
+    if (verify) {
+        fp2_mul(&t1, &TT1.x, &out->precomputation.x);
+        fp2_mul(&t2, &TT1.y, &out->precomputation.y);
+        if (!fp2_is_equal(&t1, &t2))
+            return 0;
+        fp2_mul(&t1, &TT1.z, &out->precomputation.z);
+        fp2_mul(&t2, &TT1.t, &out->precomputation.t);
+        if (!fp2_is_equal(&t1, &t2))
+            return 0;
+        fp2_mul(&t1, &TT2.x, &out->precomputation.x);
+        fp2_mul(&t2, &TT2.z, &out->precomputation.z);
+        if (!fp2_is_equal(&t1, &t2))
+            return 0;
+        fp2_mul(&t1, &TT2.y, &out->precomputation.y);
+        fp2_mul(&t2, &TT2.t, &out->precomputation.t);
+        if (!fp2_is_equal(&t1, &t2))
+            return 0;
+    }
+
+    if (hadamard_bool_2) {
+        hadamard(&out->codomain.null_point, &out->codomain.null_point);
+    }
+    return 1;
+}
+
+/**
+ * @brief Compute a (2,2) isogeny when only the 4 torsion above the kernel is known and not the 8
+ * torsion
+ *
+ * @param out Output: the theta_isogeny
+ * @param A a theta null point for the domain
+ * @param T1_4 a point in A[4]
+ * @param T2_4 a point in A[4]
+ * @param hadamard_bool_1 a boolean
+ * @param hadamard_bool_2 a boolean
+ *
+ * out : A -> B of kernel [2](T1_4,T2_4)
+ * hadamard_bool_1 controls if the domain is in standard or dual coordinates
+ * hadamard_bool_2 controls if the codomain is in standard or dual coordinates
+ *
+ */
+static void
+theta_isogeny_compute_4(theta_isogeny_t *out,
+                        const theta_structure_t *A,
+                        const theta_point_t *T1_4,
+                        const theta_point_t *T2_4,
+                        bool hadamard_bool_1,
+                        bool hadamard_bool_2)
+{
+    out->hadamard_bool_1 = hadamard_bool_1;
+    out->hadamard_bool_2 = hadamard_bool_2;
+    out->domain = *A;
+    out->T1_8 = *T1_4;
+    out->T2_8 = *T2_4;
+    out->codomain.precomputation = false;
+
+    theta_point_t TT1, TT2;
+    // we will compute:
+    // TT1 = (xAB, _ , xCD, _)
+    // TT2 = (AA,BB,CC,DD)
+
+    // only TT1.x and TT1.z are read below; TT2 comes from the domain null point, not from T2_4
+
+    if (hadamard_bool_1) {
+        hadamard(&TT1, T1_4);
+        to_squared_theta(&TT1, &TT1);
+
+        hadamard(&TT2, &A->null_point);
+        to_squared_theta(&TT2, &TT2);
+    } else {
+        to_squared_theta(&TT1, T1_4);
+        to_squared_theta(&TT2, &A->null_point);
+    }
+
+    fp2_t sqaabb, sqaacc;
+    fp2_mul(&sqaabb, &TT2.x, &TT2.y);
+    fp2_mul(&sqaacc, &TT2.x, &TT2.z);
+    // No need to check the square roots, only used for signing.
+    // sqaabb = sqrt(AA*BB)
+    fp2_sqrt(&sqaabb);
+    // sqaacc = sqrt(AA*CC)
+    fp2_sqrt(&sqaacc);
+
+    // we compute out->codomain.null_point = (xAB * sqaacc * AA, xAB * sqaabb * sqaacc,
+    // xAB * AA * CC, xCD * sqaabb * AA) and out->precomputation = (xAB * BB * CC * DD,
+    // sqaabb * CC * DD * xAB, sqaacc * BB * DD * xAB, xCD * sqaabb * sqaacc * BB)
+
+    fp2_mul(&out->codomain.null_point.y, &sqaabb, &sqaacc);
+    fp2_mul(&out->precomputation.t, &out->codomain.null_point.y, &TT1.z);
+    fp2_mul(&out->codomain.null_point.y, &out->codomain.null_point.y,
+            &TT1.x); // done for out->codomain.null_point.y
+
+    fp2_mul(&out->codomain.null_point.t, &TT1.z, &sqaabb);
+    fp2_mul(&out->codomain.null_point.t, &out->codomain.null_point.t,
+            &TT2.x); // done for out->codomain.null_point.t
+
+    fp2_mul(&out->codomain.null_point.x, &TT1.x, &TT2.x);
+    fp2_mul(&out->codomain.null_point.z, &out->codomain.null_point.x,
+            &TT2.z); // done for out->codomain.null_point.z
+    fp2_mul(&out->codomain.null_point.x, &out->codomain.null_point.x,
+            &sqaacc); // done for out->codomain.null_point.x
+
+    fp2_mul(&out->precomputation.x, &TT1.x, &TT2.t);
+    fp2_mul(&out->precomputation.z, &out->precomputation.x, &TT2.y);
+    fp2_mul(&out->precomputation.x, &out->precomputation.x, &TT2.z);
+    fp2_mul(&out->precomputation.y, &out->precomputation.x, &sqaabb); // done for out->precomputation.y
+    fp2_mul(&out->precomputation.x, &out->precomputation.x, &TT2.y);  // done for out->precomputation.x
+    fp2_mul(&out->precomputation.z, &out->precomputation.z, &sqaacc); // done for out->precomputation.z
+    fp2_mul(&out->precomputation.t, &out->precomputation.t, &TT2.y);  // done for out->precomputation.t
+
+    if (hadamard_bool_2) {
+        hadamard(&out->codomain.null_point, &out->codomain.null_point);
+    }
+}
+
+/**
+ * @brief Compute a (2,2) isogeny when only the kernel is known and not the 8 or 4 torsion above
+ *
+ * @param out Output: the theta_isogeny
+ * @param A a theta null point for the domain
+ * @param T1_2 a point in A[2]
+ * @param T2_2 a point in A[2]
+ * @param hadamard_bool_1 a boolean
+ * @param hadamard_bool_2 a boolean
+ *
+ * out : A -> B of kernel (T1_2,T2_2)
+ * hadamard_bool_1 controls if the domain is in standard or dual coordinates
+ * hadamard_bool_2 controls if the codomain is in standard or dual coordinates
+ *
+ */
+static void
+theta_isogeny_compute_2(theta_isogeny_t *out,
+                        const theta_structure_t *A,
+                        const theta_point_t *T1_2,
+                        const theta_point_t *T2_2,
+                        bool hadamard_bool_1,
+                        bool hadamard_bool_2)
+{
+    out->hadamard_bool_1 = hadamard_bool_1;
+    out->hadamard_bool_2 = hadamard_bool_2;
+    out->domain = *A;
+    out->T1_8 = *T1_2;
+    out->T2_8 = *T2_2;
+    out->codomain.precomputation = false;
+
+    theta_point_t TT2;
+    // we will compute (from the domain null point only; T1_2 and T2_2 are just recorded in out):
+    // TT2 = (AA,BB,CC,DD)
+
+    if (hadamard_bool_1) {
+        hadamard(&TT2, &A->null_point);
+        to_squared_theta(&TT2, &TT2);
+    } else {
+        to_squared_theta(&TT2, &A->null_point);
+    }
+
+    // we compute out->codomain.null_point = (AA,sqaabb, sqaacc, sqaadd)
+    // out->precomputation = (  BB * CC *DD , sqaabb * CC * DD , sqaacc * BB* DD , sqaadd * BB * CC)
+    fp2_copy(&out->codomain.null_point.x, &TT2.x);
+    fp2_mul(&out->codomain.null_point.y, &TT2.x, &TT2.y);
+    fp2_mul(&out->codomain.null_point.z, &TT2.x, &TT2.z);
+    fp2_mul(&out->codomain.null_point.t, &TT2.x, &TT2.t);
+    // No need to check the square roots, only used for signing.
+    fp2_sqrt(&out->codomain.null_point.y);
+    fp2_sqrt(&out->codomain.null_point.z);
+    fp2_sqrt(&out->codomain.null_point.t);
+
+    fp2_mul(&out->precomputation.x, &TT2.z, &TT2.t);
+    fp2_mul(&out->precomputation.y,
+            &out->precomputation.x,
+            &out->codomain.null_point.y);                            // done for out->precomputation.y
+    fp2_mul(&out->precomputation.x, &out->precomputation.x, &TT2.y); // done for out->precomputation.x
+    fp2_mul(&out->precomputation.z, &TT2.t, &out->codomain.null_point.z);
+    fp2_mul(&out->precomputation.z, &out->precomputation.z, &TT2.y); // done for out->precomputation.z
+    fp2_mul(&out->precomputation.t, &TT2.z, &out->codomain.null_point.t);
+    fp2_mul(&out->precomputation.t, &out->precomputation.t, &TT2.y); // done for out->precomputation.t
+
+    if (hadamard_bool_2) {
+        hadamard(&out->codomain.null_point, &out->codomain.null_point);
+    }
+}
+
+static void
+theta_isogeny_eval(theta_point_t *out, const theta_isogeny_t *phi, const theta_point_t *P)
+{
+    if (phi->hadamard_bool_1) { // same hadamard-then-square step as in theta_isogeny_compute
+        hadamard(out, P);
+        to_squared_theta(out, out);
+    } else {
+        to_squared_theta(out, P);
+    }
+    fp2_mul(&out->x, &out->x, &phi->precomputation.x); // scale coordinate-wise by the constants
+    fp2_mul(&out->y, &out->y, &phi->precomputation.y); // stored in phi->precomputation
+    fp2_mul(&out->z, &out->z, &phi->precomputation.z);
+    fp2_mul(&out->t, &out->t, &phi->precomputation.t);
+
+    if (phi->hadamard_bool_2) { // optional final hadamard, selected when phi was computed
+        hadamard(out, out);
+    }
+}
+
+#if defined(ENABLE_SIGN)
+// Sample a random secret index in [0, 5] to select one of the 6 normalisation
+// matrices for the normalisation of the output of the (2,2)-chain during
+// splitting. Draws 4 bytes from randombytes() per attempt and rejects until unbiased.
+static unsigned char
+sample_random_index(void)
+{
+    // To avoid bias in reduction we should only consider integers smaller
+    // than 2^32 which are a multiple of 6, so we only reduce bytes with a
+    // value in [0, 4294967292-1].
+    // We have 4294967292/2^32 = ~99.9999999% chance that the first try is "good".
+    unsigned char seed_arr[4];
+    uint32_t seed;
+
+    do {
+        randombytes(seed_arr, 4);
+        seed = ((uint32_t)seed_arr[0] | ((uint32_t)seed_arr[1] << 8) | ((uint32_t)seed_arr[2] << 16) | ((uint32_t)seed_arr[3] << 24));
+    } while (seed >= 4294967292U);
+
+    uint32_t secret_index = seed - (((uint64_t)seed * 2863311531U) >> 34) * 6;
+    assert(secret_index == seed % 6); // ensure the constant time trick above works
+    return (unsigned char)secret_index;
+}
+#endif
+
+static bool
+splitting_compute(theta_splitting_t *out, const theta_structure_t *A, int zero_index, bool randomize)
+
+{
+    // ctl is used as a selection mask; count tallies the indices i for which U_cst vanishes
+    uint32_t ctl;
+    uint32_t count = 0;
+    fp2_t U_cst, t1, t2;
+
+    memset(&out->M, 0, sizeof(basis_change_matrix_t));
+
+    // enumerate through the 10 entries of EVEN_INDEX
+    for (int i = 0; i < 10; i++) {
+        fp2_set_zero(&U_cst);
+        for (int t = 0; t < 4; t++) {
+            // Iterate through the null point
+            choose_index_theta_point(&t2, t, &A->null_point);
+            choose_index_theta_point(&t1, t ^ EVEN_INDEX[i][1], &A->null_point);
+
+            // Compute t1 * t2
+            fp2_mul(&t1, &t1, &t2);
+            // If CHI_EVAL(i,t) is +1 we want ctl to be 0 and
+            // If CHI_EVAL(i,t) is -1 we want ctl to be 0xFF..FF
+            ctl = (uint32_t)(CHI_EVAL[EVEN_INDEX[i][0]][t] >> 1);
+            assert(ctl == 0 || ctl == 0xffffffff);
+
+            fp2_neg(&t2, &t1);
+            fp2_select(&t1, &t1, &t2, ctl);
+
+            // Then we compute U_cst ± (t1 * t2)
+            fp2_add(&U_cst, &U_cst, &t1);
+        }
+
+        // If U_cst is 0 then update the splitting matrix
+        ctl = fp2_is_zero(&U_cst);
+        count -= ctl; // presumably fp2_is_zero yields 0 or 0xFF..FF, so this adds 1 per zero found
+        select_base_change_matrix(&out->M, &out->M, &SPLITTING_TRANSFORMS[i], ctl);
+        if (zero_index != -1 && i == zero_index &&
+            !ctl) { // extra checks if we know exactly where the 0 index should be
+            return 0;
+        }
+    }
+
+#if defined(ENABLE_SIGN)
+    // Pick a random normalization matrix
+    if (randomize) {
+        unsigned char secret_index = sample_random_index();
+        basis_change_matrix_t Mrandom;
+
+        set_base_change_matrix_from_precomp(&Mrandom, &NORMALIZATION_TRANSFORMS[0]);
+
+        // Use a constant time selection to pick the index we want
+        for (unsigned char i = 1; i < 6; i++) {
+            // When i == secret_index, mask == 0 and 0xFF..FF otherwise
+            int32_t mask = i - secret_index;
+            mask = (mask | -mask) >> 31;
+            select_base_change_matrix(&Mrandom, &Mrandom, &NORMALIZATION_TRANSFORMS[i], ~mask);
+        }
+        base_change_matrix_multiplication(&out->M, &Mrandom, &out->M);
+    }
+#else
+    assert(!randomize);
+#endif
+
+    // apply the isomorphism to ensure the null point is compatible with splitting
+    apply_isomorphism(&out->B.null_point, &out->M, &A->null_point);
+
+    // splitting was successful only if exactly one zero was identified
+    return count == 1;
+}
+
+static int
+theta_product_structure_to_elliptic_product(theta_couple_curve_t *E12, theta_structure_t *A)
+{
+    fp2_t xx, yy;
+
+    // Returns 1 after filling in E12, or 0 if the null point of A is rejected.
+    // The product shape should hold after splitting_compute, but check it for sanity
+    if (!is_product_theta_point(&A->null_point))
+        return 0;
+
+    ec_curve_init(&(E12->E1));
+    ec_curve_init(&(E12->E2));
+
+    // A valid elliptic theta null point has no zero coordinate (x, y and z are tested here)
+    if (fp2_is_zero(&A->null_point.x) | fp2_is_zero(&A->null_point.y) | fp2_is_zero(&A->null_point.z))
+        return 0;
+
+    // xx = x², yy = y²
+    fp2_sqr(&xx, &A->null_point.x);
+    fp2_sqr(&yy, &A->null_point.y);
+    // xx = x^4, yy = y^4
+    fp2_sqr(&xx, &xx);
+    fp2_sqr(&yy, &yy);
+
+    // A2 = -2(x^4+y^4)/(x^4-y^4), numerator stored in E2.A and denominator in E2.C
+    fp2_add(&E12->E2.A, &xx, &yy);
+    fp2_sub(&E12->E2.C, &xx, &yy);
+    fp2_add(&E12->E2.A, &E12->E2.A, &E12->E2.A);
+    fp2_neg(&E12->E2.A, &E12->E2.A);
+
+    // same with x,z
+    fp2_sqr(&xx, &A->null_point.x);
+    fp2_sqr(&yy, &A->null_point.z);
+    fp2_sqr(&xx, &xx);
+    fp2_sqr(&yy, &yy);
+
+    // A1 = -2(x^4+z^4)/(x^4-z^4), numerator stored in E1.A and denominator in E1.C
+    fp2_add(&E12->E1.A, &xx, &yy);
+    fp2_sub(&E12->E1.C, &xx, &yy);
+    fp2_add(&E12->E1.A, &E12->E1.A, &E12->E1.A);
+    fp2_neg(&E12->E1.A, &E12->E1.A);
+
+    if (fp2_is_zero(&E12->E1.C) | fp2_is_zero(&E12->E2.C)) // a zero denominator is rejected
+        return 0;
+
+    return 1;
+}
+
+static int
+theta_point_to_montgomery_point(theta_couple_point_t *P12, const theta_point_t *P, const theta_structure_t *A)
+{
+    fp2_t temp;
+    const fp2_t *x, *z;
+
+    if (!is_product_theta_point(P)) // returns 0 on rejection, 1 once P12 has been written
+        return 0;
+
+    x = &P->x;
+    z = &P->y;
+    if (fp2_is_zero(x) & fp2_is_zero(z)) { // fall back to (P.z, P.t) when P.x and P.y both vanish
+        x = &P->z;
+        z = &P->t;
+    }
+    if (fp2_is_zero(x) & fp2_is_zero(z)) {
+        return 0; // at this point P=(0:0:0:0) so is invalid
+    }
+    // P2.X = A.null_point.y * x + A.null_point.x * z, with (x, z) = (P.x, P.y) or the fallback
+    // P2.Z = - A.null_point.y * x + A.null_point.x * z
+    fp2_mul(&P12->P2.x, &A->null_point.y, x);
+    fp2_mul(&temp, &A->null_point.x, z);
+    fp2_sub(&P12->P2.z, &temp, &P12->P2.x);
+    fp2_add(&P12->P2.x, &P12->P2.x, &temp);
+
+    x = &P->x;
+    z = &P->z;
+    if (fp2_is_zero(x) & fp2_is_zero(z)) { // fall back to (P.y, P.t) when P.x and P.z both vanish
+        x = &P->y;
+        z = &P->t;
+    }
+    // P1.X = A.null_point.z * x + A.null_point.x * z, with (x, z) = (P.x, P.z) or the fallback
+    // P1.Z = -A.null_point.z * x + A.null_point.x * z
+    fp2_mul(&P12->P1.x, &A->null_point.z, x);
+    fp2_mul(&temp, &A->null_point.x, z);
+    fp2_sub(&P12->P1.z, &temp, &P12->P1.x);
+    fp2_add(&P12->P1.x, &P12->P1.x, &temp);
+    return 1;
+}
+
+static int
+_theta_chain_compute_impl(unsigned n,
+                          theta_couple_curve_t *E12,
+                          const theta_kernel_couple_points_t *ker,
+                          bool extra_torsion,
+                          theta_couple_curve_t *E34,
+                          theta_couple_point_t *P12,
+                          size_t numP,
+                          bool verify,
+                          bool randomize)
+{
+    theta_structure_t theta;
+
+    // lift the basis
+    theta_couple_jac_point_t xyT1, xyT2;
+
+    ec_basis_t bas1 = { .P = ker->T1.P1, .Q = ker->T2.P1, .PmQ = ker->T1m2.P1 };
+    ec_basis_t bas2 = { .P = ker->T1.P2, .Q = ker->T2.P2, .PmQ = ker->T1m2.P2 };
+    if (!lift_basis(&xyT1.P1, &xyT2.P1, &bas1, &E12->E1))
+        return 0;
+    if (!lift_basis(&xyT1.P2, &xyT2.P2, &bas2, &E12->E2))
+        return 0;
+
+    const unsigned extra = HD_extra_torsion * extra_torsion;
+
+#ifndef NDEBUG
+    assert(extra == 0 || extra == 2); // only cases implemented
+    if (!test_point_order_twof(&bas2.P, &E12->E2, n + extra))
+        debug_print("bas2.P does not have correct order");
+
+    if (!test_jac_order_twof(&xyT2.P2, &E12->E2, n + extra))
+        debug_print("xyT2.P2 does not have correct order");
+#endif
+
+    theta_point_t pts[numP ? numP : 1];
+
+    int space = 1;
+    for (unsigned i = 1; i < n; i *= 2)
+        ++space;
+
+    uint16_t todo[space];
+    todo[0] = n - 2 + extra;
+
+    int current = 0;
+
+    // kernel points for the gluing isogeny
+    theta_couple_jac_point_t jacQ1[space], jacQ2[space];
+    jacQ1[0] = xyT1;
+    jacQ2[0] = xyT2;
+    while (todo[current] != 1) {
+        assert(todo[current] >= 2);
+        ++current;
+        assert(current < space);
+        // the gluing isogeny is quite a bit more expensive than the others,
+        // so we adjust the usual splitting rule here a little bit: towards
+        // the end of the doubling chain it will be cheaper to recompute the
+        // doublings after evaluation than to push the intermediate points.
+        const unsigned num_dbls = todo[current - 1] >= 16 ? todo[current - 1] / 2 : todo[current - 1] - 1;
+        assert(num_dbls && num_dbls < todo[current - 1]);
+        double_couple_jac_point_iter(&jacQ1[current], num_dbls, &jacQ1[current - 1], E12);
+        double_couple_jac_point_iter(&jacQ2[current], num_dbls, &jacQ2[current - 1], E12);
+        todo[current] = todo[current - 1] - num_dbls;
+    }
+
+    // kernel points for the remaining isogeny steps
+    theta_point_t thetaQ1[space], thetaQ2[space];
+
+    // the gluing step
+    theta_gluing_t first_step;
+    {
+        assert(todo[current] == 1);
+
+        // compute the gluing isogeny
+        if (!gluing_compute(&first_step, E12, &jacQ1[current], &jacQ2[current], verify))
+            return 0;
+
+        // evaluate
+        for (unsigned j = 0; j < numP; ++j) {
+            assert(ec_is_zero(&P12[j].P1) || ec_is_zero(&P12[j].P2));
+            if (!gluing_eval_point_special_case(&pts[j], &P12[j], &first_step))
+                return 0;
+        }
+
+        // push kernel points through gluing isogeny
+        for (int j = 0; j < current; ++j) {
+            gluing_eval_basis(&thetaQ1[j], &thetaQ2[j], &jacQ1[j], &jacQ2[j], &first_step);
+            --todo[j];
+        }
+
+        --current;
+    }
+
+    // set-up the theta_structure for the first codomain
+    theta.null_point = first_step.codomain;
+    theta.precomputation = 0;
+    theta_precomputation(&theta);
+
+    theta_isogeny_t step;
+
+    // and now we do the remaining steps
+    for (unsigned i = 1; current >= 0 && todo[current]; ++i) {
+        assert(current < space);
+        while (todo[current] != 1) {
+            assert(todo[current] >= 2);
+            ++current;
+            assert(current < space);
+            const unsigned num_dbls = todo[current - 1] / 2;
+            assert(num_dbls && num_dbls < todo[current - 1]);
+            double_iter(&thetaQ1[current], &theta, &thetaQ1[current - 1], num_dbls);
+            double_iter(&thetaQ2[current], &theta, &thetaQ2[current - 1], num_dbls);
+            todo[current] = todo[current - 1] - num_dbls;
+        }
+
+        // computing the next step
+        int ret;
+        if (i == n - 2) // penultimate step
+            ret = theta_isogeny_compute(&step, &theta, &thetaQ1[current], &thetaQ2[current], 0, 0, verify);
+        else if (i == n - 1) // ultimate step
+            ret = theta_isogeny_compute(&step, &theta, &thetaQ1[current], &thetaQ2[current], 1, 0, false);
+        else
+            ret = theta_isogeny_compute(&step, &theta, &thetaQ1[current], &thetaQ2[current], 0, 1, verify);
+        if (!ret)
+            return 0;
+
+        for (unsigned j = 0; j < numP; ++j)
+            theta_isogeny_eval(&pts[j], &step, &pts[j]);
+
+        // updating the codomain
+        theta = step.codomain;
+
+        // pushing the kernel
+        assert(todo[current] == 1);
+        for (int j = 0; j < current; ++j) {
+            theta_isogeny_eval(&thetaQ1[j], &step, &thetaQ1[j]);
+            theta_isogeny_eval(&thetaQ2[j], &step, &thetaQ2[j]);
+            assert(todo[j]);
+            --todo[j];
+        }
+
+        --current;
+    }
+
+    assert(current == -1);
+
+    if (!extra_torsion) {
+        if (n >= 3) {
+            // in the last step we've skipped pushing the kernel since current was == 0, let's do it now
+            theta_isogeny_eval(&thetaQ1[0], &step, &thetaQ1[0]);
+            theta_isogeny_eval(&thetaQ2[0], &step, &thetaQ2[0]);
+        }
+
+        // penultimate step
+        theta_isogeny_compute_4(&step, &theta, &thetaQ1[0], &thetaQ2[0], 0, 0);
+        for (unsigned j = 0; j < numP; ++j)
+            theta_isogeny_eval(&pts[j], &step, &pts[j]);
+        theta = step.codomain;
+        theta_isogeny_eval(&thetaQ1[0], &step, &thetaQ1[0]);
+        theta_isogeny_eval(&thetaQ2[0], &step, &thetaQ2[0]);
+
+        // ultimate step
+        theta_isogeny_compute_2(&step, &theta, &thetaQ1[0], &thetaQ2[0], 1, 0);
+        for (unsigned j = 0; j < numP; ++j)
+            theta_isogeny_eval(&pts[j], &step, &pts[j]);
+        theta = step.codomain;
+    }
+
+    // final splitting step
+    theta_splitting_t last_step;
+
+    bool is_split = splitting_compute(&last_step, &theta, extra_torsion ? 8 : -1, randomize);
+
+    if (!is_split) {
+        debug_print("kernel did not generate an isogeny between elliptic products");
+        return 0;
+    }
+
+    if (!theta_product_structure_to_elliptic_product(E34, &last_step.B))
+        return 0;
+
+    // evaluate
+    for (size_t j = 0; j < numP; ++j) {
+        apply_isomorphism(&pts[j], &last_step.M, &pts[j]);
+        if (!theta_point_to_montgomery_point(&P12[j], &pts[j], &last_step.B))
+            return 0;
+    }
+
+    return 1;
+}
+
+int
+theta_chain_compute_and_eval(unsigned n,
+                             /*const*/ theta_couple_curve_t *E12,
+                             const theta_kernel_couple_points_t *ker,
+                             bool extra_torsion,
+                             theta_couple_curve_t *E34,
+                             theta_couple_point_t *P12, // numP entries, forwarded unchanged
+                             size_t numP)
+{
+    return _theta_chain_compute_impl(n, E12, ker, extra_torsion, E34, P12, numP, false, false); // both trailing flags off
+}
+
+// Like theta_chain_compute_and_eval, but sets the first trailing flag of the implementation
+// (presumably `verify`) to add extra verification checks; used in the signature verification
+int
+theta_chain_compute_and_eval_verify(unsigned n,
+                                    /*const*/ theta_couple_curve_t *E12,
+                                    const theta_kernel_couple_points_t *ker,
+                                    bool extra_torsion,
+                                    theta_couple_curve_t *E34,
+                                    theta_couple_point_t *P12,
+                                    size_t numP)
+{
+    return _theta_chain_compute_impl(n, E12, ker, extra_torsion, E34, P12, numP, true, false);
+}
+
+int
+theta_chain_compute_and_eval_randomized(unsigned n,
+                                        /*const*/ theta_couple_curve_t *E12,
+                                        const theta_kernel_couple_points_t *ker,
+                                        bool extra_torsion,
+                                        theta_couple_curve_t *E34,
+                                        theta_couple_point_t *P12, // numP entries, forwarded unchanged
+                                        size_t numP)
+{
+    return _theta_chain_compute_impl(n, E12, ker, extra_torsion, E34, P12, numP, false, true); // second flag on (presumably `randomize`)
+}
diff --git a/src/pqm4/sqisign_lvl1/ref/theta_isogenies.h b/src/pqm4/sqisign_lvl1/ref/theta_isogenies.h
new file mode 100644
index 0000000..d151811
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/theta_isogenies.h
@@ -0,0 +1,18 @@
+/** @file
+ *
+ * @authors Antonin Leroux
+ *
+ * @brief the theta isogeny header
+ */
+
+#ifndef THETA_ISOGENY_H
+#define THETA_ISOGENY_H
+
+#include <sqisign_namespace.h>
+#include <ec.h>
+#include <fp2.h>
+#include "theta_structure.h"
+#include <hd.h>
+#include <hd_splitting_transforms.h>
+
+#endif
diff --git a/src/pqm4/sqisign_lvl1/ref/theta_structure.c b/src/pqm4/sqisign_lvl1/ref/theta_structure.c
new file mode 100644
index 0000000..ce97ac6
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/theta_structure.c
@@ -0,0 +1,78 @@
+#include "theta_structure.h"
+#include <assert.h>
+
+void
+theta_precomputation(theta_structure_t *A)
+{
+    // Caches the triple products that double_point() reads; does nothing when already flagged as done.
+    if (A->precomputation) {
+        return;
+    }
+
+    theta_point_t A_dual;
+    to_squared_theta(&A_dual, &A->null_point); // squares of the null point, then Hadamard
+
+    fp2_t t1, t2;
+    fp2_mul(&t1, &A_dual.x, &A_dual.y); // shared factor X*Y of the transformed point
+    fp2_mul(&t2, &A_dual.z, &A_dual.t); // shared factor Z*T of the transformed point
+    fp2_mul(&A->XYZ0, &t1, &A_dual.z);
+    fp2_mul(&A->XYT0, &t1, &A_dual.t);
+    fp2_mul(&A->YZT0, &t2, &A_dual.y);
+    fp2_mul(&A->XZT0, &t2, &A_dual.x);
+    // the same four triple products, this time taken on the null point itself
+    fp2_mul(&t1, &A->null_point.x, &A->null_point.y);
+    fp2_mul(&t2, &A->null_point.z, &A->null_point.t);
+    fp2_mul(&A->xyz0, &t1, &A->null_point.z);
+    fp2_mul(&A->xyt0, &t1, &A->null_point.t);
+    fp2_mul(&A->yzt0, &t2, &A->null_point.y);
+    fp2_mul(&A->xzt0, &t2, &A->null_point.x);
+
+    A->precomputation = true;
+}
+
+void
+double_point(theta_point_t *out, theta_structure_t *A, const theta_point_t *in)
+{
+    to_squared_theta(out, in); // square the coordinates of `in`, then Hadamard
+    fp2_sqr(&out->x, &out->x);
+    fp2_sqr(&out->y, &out->y);
+    fp2_sqr(&out->z, &out->z);
+    fp2_sqr(&out->t, &out->t);
+    // the cached products of A are read below, so fill them in if that has not happened yet
+    if (!A->precomputation) {
+        theta_precomputation(A);
+    }
+    fp2_mul(&out->x, &out->x, &A->YZT0); // each coordinate is scaled by the cached product of the other three
+    fp2_mul(&out->y, &out->y, &A->XZT0);
+    fp2_mul(&out->z, &out->z, &A->XYT0);
+    fp2_mul(&out->t, &out->t, &A->XYZ0);
+
+    hadamard(out, out);
+
+    fp2_mul(&out->x, &out->x, &A->yzt0); // same scaling pattern, with the null-point products
+    fp2_mul(&out->y, &out->y, &A->xzt0);
+    fp2_mul(&out->z, &out->z, &A->xyt0);
+    fp2_mul(&out->t, &out->t, &A->xyz0);
+}
+
+void
+double_iter(theta_point_t *out, theta_structure_t *A, const theta_point_t *in, int exp)
+{
+    if (exp == 0) {
+        *out = *in; // no doubling requested: plain copy
+        return;
+    }
+    // the first doubling reads `in`; every further one runs in place on `out`
+    double_point(out, A, in);
+    for (int done = 1; done < exp; ++done)
+        double_point(out, A, out);
+}
+
+uint32_t
+is_product_theta_point(const theta_point_t *P)
+{
+    fp2_t prod_yz, prod_xt;
+    fp2_mul(&prod_yz, &P->y, &P->z);
+    fp2_mul(&prod_xt, &P->x, &P->t);
+    return fp2_is_equal(&prod_xt, &prod_yz); // result of comparing x*t against y*z
+}
diff --git a/src/pqm4/sqisign_lvl1/ref/theta_structure.h b/src/pqm4/sqisign_lvl1/ref/theta_structure.h
new file mode 100644
index 0000000..fc630b7
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/theta_structure.h
@@ -0,0 +1,135 @@
+/** @file
+ *
+ * @authors Antonin Leroux
+ *
+ * @brief the theta structure header
+ */
+
+#ifndef THETA_STRUCTURE_H
+#define THETA_STRUCTURE_H
+
+#include <ec.h>
+#include <fp2.h>
+#include <hd.h>
+
+/** @internal
+ * @ingroup hd_module
+ * @defgroup hd_theta Functions for theta structures
+ * @{
+ */
+
+/**
+ * @brief Perform the hadamard transform on a theta point
+ *
+ * @param out Output: the theta_point
+ * @param in a theta point*
+ * in = (x,y,z,t)
+ * out = (x+y+z+t, x-y+z-t, x+y-z-t, x-y-z+t)
+ *
+ */
+static inline void
+hadamard(theta_point_t *out, const theta_point_t *in)
+{
+    fp2_t sum_xy, dif_xy, sum_zt, dif_zt;
+
+    // Every partial result is read from `in` before `out` is touched, so
+    // the transform may run in place (out == in).
+    fp2_add(&sum_zt, &in->z, &in->t);
+    fp2_sub(&dif_zt, &in->z, &in->t);
+    fp2_add(&sum_xy, &in->x, &in->y);
+    fp2_sub(&dif_xy, &in->x, &in->y);
+
+    // Recombine the partial sums and differences:
+    // out = (x+y+z+t, x-y+z-t, x+y-z-t, x-y-z+t)
+    fp2_add(&out->x, &sum_xy, &sum_zt);
+    fp2_add(&out->y, &dif_xy, &dif_zt);
+    fp2_sub(&out->z, &sum_xy, &sum_zt);
+    fp2_sub(&out->t, &dif_xy, &dif_zt);
+}
+
+/**
+ * @brief Square the coordinates of a theta point
+ * @param out Output: the theta_point
+ * @param in a theta point*
+ * in = (x,y,z,t)
+ * out = (x^2, y^2, z^2, t^2)
+ *
+ */
+static inline void
+pointwise_square(theta_point_t *out, const theta_point_t *in)
+{
+    fp2_sqr(&out->t, &in->t); // each output coordinate depends only on the matching input coordinate
+    fp2_sqr(&out->z, &in->z);
+    fp2_sqr(&out->y, &in->y);
+    fp2_sqr(&out->x, &in->x);
+}
+
+/**
+ * @brief Square the coordinates and then perform the hadamard transform
+ *
+ * @param out Output: the theta_point
+ * @param in a theta point*
+ * in = (x,y,z,t)
+ * out = (x^2+y^2+z^2+t^2, x^2-y^2+z^2-t^2, x^2+y^2-z^2-t^2, x^2-y^2-z^2+t^2)
+ *
+ */
+static inline void
+to_squared_theta(theta_point_t *out, const theta_point_t *in)
+{
+    pointwise_square(out, in); // out = (x^2, y^2, z^2, t^2)
+    hadamard(out, out);        // Hadamard transform of those squares, done in place
+}
+
+/**
+ * @brief Perform the theta structure precomputation
+ *
+ * @param A Output: the theta_structure
+ *
+ * if A.null_point = (x,y,z,t)
+ * if (xx,yy,zz,tt) = to_squared_theta(A.null_point)
+ * Stores xyz0,xyt0,yzt0,xzt0 = x*y*z,x*y*t,y*z*t,x*z*t and XYZ0,XYT0,YZT0,XZT0 likewise from (xx,yy,zz,tt)
+ * Returns immediately when A.precomputation is already set
+ */
+void theta_precomputation(theta_structure_t *A);
+
+/**
+ * @brief Compute the double of the theta point `in` on the theta structure A
+ *
+ * @param out Output: the theta_point
+ * @param A a theta structure; its precomputation is run on demand if not yet done
+ * @param in a theta point in the theta structure A
+ * in = (x,y,z,t)
+ * out = [2] (x,y,z,t)
+ * /!\ assumes that no coordinates is zero
+ * out may be the same object as in (double_iter calls it that way)
+ */
+void double_point(theta_point_t *out, theta_structure_t *A, const theta_point_t *in);
+
+/**
+ * @brief Compute the iterated double of the theta point `in` on the theta structure A
+ *
+ * @param out Output: the theta_point
+ * @param A a theta structure
+ * @param in a theta point in the theta structure A
+ * @param exp the exponent, i.e. the number of doublings (exp == 0 copies in to out)
+ * in = (x,y,z,t)
+ * out = [2^exp] (x,y,z,t)
+ * /!\ assumes that no coordinates is zero
+ *
+ */
+void double_iter(theta_point_t *out, theta_structure_t *A, const theta_point_t *in, int exp);
+
+/**
+ * @brief Check if a theta point is a product theta point, i.e. whether x*t equals y*z
+ *
+ * @param P a theta point
+ * @return 0xFFFFFFFF if true, zero otherwise
+ */
+uint32_t is_product_theta_point(const theta_point_t *P);
+
+// end hd_theta
+/**
+ * @}
+ */
+
+#endif
diff --git a/src/pqm4/sqisign_lvl1/ref/tools.h b/src/pqm4/sqisign_lvl1/ref/tools.h
new file mode 100644
index 0000000..5a6a505
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/tools.h
@@ -0,0 +1,49 @@
+
+#ifndef TOOLS_H
+#define TOOLS_H
+
+#include <time.h>
+
+// Debug printing (active unless NDEBUG is defined); expands to printf, so <stdio.h> must be in scope at the use site:
+// https://stackoverflow.com/questions/1644868/define-macro-for-debug-printing-in-c
+#ifndef NDEBUG
+#define DEBUG_PRINT 1
+#else
+#define DEBUG_PRINT 0
+#endif
+
+#ifndef __FILE_NAME__
+#define __FILE_NAME__ "NA"
+#endif
+
+#ifndef __LINE__
+#define __LINE__ 0
+#endif
+
+// __func__ is a predefined identifier (C99 6.4.2.2), not a macro: an #ifndef
+// test on it always succeeds, and a fallback #define would replace every
+// function name with "NA". It is therefore used as provided by the compiler.
+
+#define debug_print(fmt)                                                                           \
+    do {                                                                                           \
+        if (DEBUG_PRINT)                                                                           \
+            printf("warning: %s, file %s, line %d, function %s().\n",                              \
+                   fmt,                                                                            \
+                   __FILE_NAME__,                                                                  \
+                   __LINE__,                                                                       \
+                   __func__);                                                                      \
+    } while (0)
+
+
+clock_t tic(void);
+float tac(void);                             /* time in ms since last tic */
+float TAC(const char *str);                  /* same, but prints it with label 'str' */
+float toc(const clock_t t);                  /* time in ms since t */
+float TOC(const clock_t t, const char *str); /* same, but prints it with label 'str' */
+float TOC_clock(const clock_t t, const char *str);
+
+clock_t dclock(const clock_t t); // return the clock cycle diff between now and t
+float clock_to_time(const clock_t t,
+                    const char *str); // convert the number of clock cycles t to time
+float clock_print(const clock_t t, const char *str);
+#endif
diff --git a/src/pqm4/sqisign_lvl1/ref/tutil.h b/src/pqm4/sqisign_lvl1/ref/tutil.h
new file mode 100644
index 0000000..59f1620
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/tutil.h
@@ -0,0 +1,36 @@
+#ifndef TUTIL_H
+#define TUTIL_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__GNUC__) || defined(__clang__) // compiler builtins and attributes available
+#define BSWAP16(i) __builtin_bswap16((i))
+#define BSWAP32(i) __builtin_bswap32((i))
+#define BSWAP64(i) __builtin_bswap64((i))
+#define UNUSED __attribute__((unused))
+#else // portable shift-and-mask fallbacks; note that they evaluate their argument more than once
+#define BSWAP16(i) ((((i) >> 8) & 0xff) | (((i) & 0xff00) << 8))
+#define BSWAP32(i)                                                                                 \
+    ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24))
+#define BSWAP64(i) ((BSWAP32((i) >> 32) & 0xffffffff) | (BSWAP32(i) << 32))
+#define UNUSED
+#endif
+
+#if defined(RADIX_64) // 64-bit digits
+#define digit_t uint64_t
+#define sdigit_t int64_t
+#define RADIX 64
+#define LOG2RADIX 6 // log2(RADIX)
+#define BSWAP_DIGIT(i) BSWAP64(i)
+#elif defined(RADIX_32) // 32-bit digits
+#define digit_t uint32_t
+#define sdigit_t int32_t
+#define RADIX 32
+#define LOG2RADIX 5 // log2(RADIX)
+#define BSWAP_DIGIT(i) BSWAP32(i)
+#else // exactly one of RADIX_64 / RADIX_32 has to be defined by the build
+#error "Radix must be 32bit or 64 bit"
+#endif
+
+#endif
diff --git a/src/pqm4/sqisign_lvl1/ref/verification.h b/src/pqm4/sqisign_lvl1/ref/verification.h
new file mode 100644
index 0000000..af67469
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/verification.h
@@ -0,0 +1,123 @@
+/** @file
+ *
+ * @brief The verification protocol
+ */
+
+#ifndef VERIFICATION_H
+#define VERIFICATION_H
+
+#include <sqisign_namespace.h>
+#include <ec.h>
+
+/** @defgroup verification SQIsignHD verification protocol
+ * @{
+ */
+
+/** @defgroup verification_t Types for SQIsignHD verification protocol
+ * @{
+ */
+
+typedef digit_t scalar_t[NWORDS_ORDER];
+typedef scalar_t scalar_mtx_2x2_t[2][2];
+
+/** @brief Type for the signature
+ *
+ * @typedef signature_t
+ *
+ * @struct signature
+ *
+ */
+typedef struct signature
+{
+    fp2_t E_aux_A; // the Montgomery A-coefficient for the auxiliary curve
+    uint8_t backtracking; // verification shortens the challenge isogeny by this many steps
+    uint8_t two_resp_length; // length of the short 2^n-isogeny applied to the challenge curve (0 = none)
+    scalar_mtx_2x2_t mat_Bchall_can_to_B_chall; // the matrix of the desired basis
+    scalar_t chall_coeff; // scalar s of the challenge kernel P + [s]Q; compared with the recomputed hash
+    uint8_t hint_aux; // hint used to rebuild the basis on the auxiliary curve
+    uint8_t hint_chall; // hint used to rebuild the basis on the challenge curve
+} signature_t;
+
+/** @brief Type for the public keys
+ *
+ * @typedef public_key_t
+ *
+ * @struct public_key
+ *
+ */
+typedef struct public_key
+{
+    ec_curve_t curve; // the normalized A-coefficient of the Montgomery curve
+    uint8_t hint_pk; // hint used to rebuild the basis on the public curve during verification
+} public_key_t;
+
+/** @}
+ */
+
+/*************************** Functions *****************************/
+
+void public_key_init(public_key_t *pk);
+void public_key_finalize(public_key_t *pk);
+
+void hash_to_challenge(scalar_t *scalar,
+                       const public_key_t *pk,
+                       const ec_curve_t *com_curve,
+                       const unsigned char *message,
+                       size_t length);
+
+/**
+ * @brief Verification
+ *
+ * @param sig signature
+ * @param pk public key
+ * @param m message
+ * @param l size
+ * @returns 1 if the signature verifies, 0 otherwise
+ */
+int protocols_verify(signature_t *sig, const public_key_t *pk, const unsigned char *m, size_t l);
+
+/*************************** Encoding *****************************/
+
+/** @defgroup encoding Encoding and decoding functions
+ * @{
+ */
+
+/**
+ * @brief Encodes a signature as a byte array
+ *
+ * @param enc : Byte array to encode the signature in
+ * @param sig : Signature to encode
+ */
+void signature_to_bytes(unsigned char *enc, const signature_t *sig);
+
+/**
+ * @brief Decodes a signature from a byte array
+ *
+ * @param sig : Structure to decode the signature in
+ * @param enc : Byte array to decode
+ */
+void signature_from_bytes(signature_t *sig, const unsigned char *enc);
+
+/**
+ * @brief Encodes a public key as a byte array
+ *
+ * @param enc : Byte array to encode the public key in
+ * @param pk : Public key to encode
+ */
+unsigned char *public_key_to_bytes(unsigned char *enc, const public_key_t *pk);
+
+/**
+ * @brief Decodes a public key from a byte array
+ *
+ * @param pk : Structure to decode the public key in
+ * @param enc : Byte array to decode
+ */
+const unsigned char *public_key_from_bytes(public_key_t *pk, const unsigned char *enc);
+
+/** @}
+ */
+
+/** @}
+ */
+
+#endif
diff --git a/src/pqm4/sqisign_lvl1/ref/verify.c b/src/pqm4/sqisign_lvl1/ref/verify.c
new file mode 100644
index 0000000..b5f78ad
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/verify.c
@@ -0,0 +1,309 @@
+#include <verification.h>
+#include <mp.h>
+#include <hd.h>
+#include <encoded_sizes.h>
+#include <assert.h>
+
+// Returns 1 when no entry of the basis change matrix reaches the bound
+// 2^(SQIsign_response_length + HD_extra_torsion - backtracking), 0 otherwise.
+static int
+check_canonical_basis_change_matrix(const signature_t *sig)
+{
+    // Only an upper bound is tested, which is enough as long as all values in
+    // sig->mat_Bchall_can_to_B_chall are positive integers.
+    const int bound_bits = SQIsign_response_length + HD_extra_torsion - (int)sig->backtracking;
+    scalar_t bound;
+    int all_below = 1;
+
+    memset(bound, 0, NWORDS_ORDER * sizeof(digit_t));
+    bound[0] = 0x1;
+    multiple_mp_shiftl(bound, bound_bits, NWORDS_ORDER); // bound = 1 shifted left by bound_bits
+
+    // Row-major walk over the four entries; no early exit, every entry is compared.
+    for (int idx = 0; idx < 4; idx++) {
+        const int row = idx / 2, col = idx % 2;
+        if (mp_compare(bound, sig->mat_Bchall_can_to_B_chall[row][col], NWORDS_ORDER) <= 0)
+            all_below = 0;
+    }
+
+    return all_below;
+}
+
+// Compute the 2^n isogeny from the signature with kernel P + [chall_coeff]Q
+// and store the codomain in E_chall. Returns 1 on success, 0 on any failure.
+static int
+compute_challenge_verify(ec_curve_t *E_chall, const signature_t *sig, const ec_curve_t *Epk, const uint8_t hint_pk)
+{
+    ec_basis_t bas_EA;
+    ec_isog_even_t phi_chall;
+
+    // Set domain and length of 2^n isogeny
+    copy_curve(&phi_chall.curve, Epk);
+    phi_chall.length = TORSION_EVEN_POWER - sig->backtracking;
+
+    // Compute the basis from the supplied hint
+    if (!ec_curve_to_basis_2f_from_hint(&bas_EA, &phi_chall.curve, TORSION_EVEN_POWER, hint_pk)) // canonical
+        return 0;
+
+    // recovering the exact challenge
+    {
+        if (!ec_ladder3pt(&phi_chall.kernel, sig->chall_coeff, &bas_EA.P, &bas_EA.Q, &bas_EA.PmQ, &phi_chall.curve)) {
+            return 0;
+        };
+    }
+
+    // Double the kernel until it has the correct order (one doubling per backtracking step)
+    ec_dbl_iter(&phi_chall.kernel, sig->backtracking, &phi_chall.kernel, &phi_chall.curve);
+
+    // Compute the codomain
+    copy_curve(E_chall, &phi_chall.curve);
+    if (ec_eval_even(E_chall, &phi_chall, NULL, 0)) // a non-zero result is treated as failure here
+        return 0;
+    return 1;
+}
+
+// same as matrix_application_even_basis() in id2iso.c, with some modifications:
+// - this version works with a matrix of scalars (not ibz_t).
+// - reduction modulo 2^f of matrix elements is removed here, because it is
+//   assumed that the elements are already canonical representatives modulo
+//   2^f; this is ensured by calling check_canonical_basis_change_matrix() at
+//   the beginning of protocols_verify().
+static int
+matrix_scalar_application_even_basis(ec_basis_t *bas, const ec_curve_t *E, scalar_mtx_2x2_t *mat, int f)
+{
+    scalar_t scalar0, scalar1;
+    memset(scalar0, 0, NWORDS_ORDER * sizeof(digit_t));
+    memset(scalar1, 0, NWORDS_ORDER * sizeof(digit_t));
+
+    ec_basis_t tmp_bas; // untouched copy of the input basis, since bas is overwritten piece by piece
+    copy_basis(&tmp_bas, bas);
+
+    // For a matrix [[a, c], [b, d]] we compute:
+    //
+    // first basis element R = [a]P + [b]Q
+    if (!ec_biscalar_mul(&bas->P, (*mat)[0][0], (*mat)[1][0], f, &tmp_bas, E))
+        return 0;
+    // second basis element S = [c]P + [d]Q
+    if (!ec_biscalar_mul(&bas->Q, (*mat)[0][1], (*mat)[1][1], f, &tmp_bas, E))
+        return 0;
+    // Their difference R - S = [a - c]P + [b - d]Q
+    mp_sub(scalar0, (*mat)[0][0], (*mat)[0][1], NWORDS_ORDER);
+    mp_mod_2exp(scalar0, f, NWORDS_ORDER); // the differences are reduced modulo 2^f before use
+    mp_sub(scalar1, (*mat)[1][0], (*mat)[1][1], NWORDS_ORDER);
+    mp_mod_2exp(scalar1, f, NWORDS_ORDER);
+    return ec_biscalar_mul(&bas->PmQ, scalar0, scalar1, f, &tmp_bas, E);
+}
+
+// Compute the bases for the challenge and auxiliary curve from
+// the canonical bases. Challenge basis is reconstructed from the
+// compressed scalars within the challenge. Returns 0 on failure.
+static int
+challenge_and_aux_basis_verify(ec_basis_t *B_chall_can,
+                               ec_basis_t *B_aux_can,
+                               ec_curve_t *E_chall,
+                               ec_curve_t *E_aux,
+                               signature_t *sig,
+                               const int pow_dim2_deg_resp)
+{
+
+    // recovering the canonical basis as TORSION_EVEN_POWER for consistency with signing
+    if (!ec_curve_to_basis_2f_from_hint(B_chall_can, E_chall, TORSION_EVEN_POWER, sig->hint_chall))
+        return 0;
+
+    // setting to the right order
+    ec_dbl_iter_basis(B_chall_can,
+                      TORSION_EVEN_POWER - pow_dim2_deg_resp - HD_extra_torsion - sig->two_resp_length,
+                      B_chall_can,
+                      E_chall);
+
+    if (!ec_curve_to_basis_2f_from_hint(B_aux_can, E_aux, TORSION_EVEN_POWER, sig->hint_aux))
+        return 0;
+
+    // setting to the right order (no two_resp_length term for the auxiliary basis)
+    ec_dbl_iter_basis(B_aux_can, TORSION_EVEN_POWER - pow_dim2_deg_resp - HD_extra_torsion, B_aux_can, E_aux);
+
+#ifndef NDEBUG
+    if (!test_basis_order_twof(B_chall_can, E_chall, HD_extra_torsion + pow_dim2_deg_resp + sig->two_resp_length))
+        debug_print("canonical basis has wrong order, expect something to fail");
+#endif
+
+    // applying the change matrix on the basis of E_chall
+    return matrix_scalar_application_even_basis(B_chall_can,
+                                                E_chall,
+                                                &sig->mat_Bchall_can_to_B_chall,
+                                                pow_dim2_deg_resp + HD_extra_torsion + sig->two_resp_length);
+}
+
+// When two_resp_length is non-zero, we must compute a small 2^n-isogeny
+// updating E_chall as the codomain as well as push the basis on E_chall
+// through this isogeny. Returns 1 on success, 0 if the small chain fails.
+static int
+two_response_isogeny_verify(ec_curve_t *E_chall, ec_basis_t *B_chall_can, const signature_t *sig, int pow_dim2_deg_resp)
+{
+    ec_point_t ker, points[3];
+
+    // choosing the right point for the small two_isogenies: Q when both tested matrix entries are even, else P
+    if (mp_is_even(sig->mat_Bchall_can_to_B_chall[0][0], NWORDS_ORDER) &&
+        mp_is_even(sig->mat_Bchall_can_to_B_chall[1][0], NWORDS_ORDER)) {
+        copy_point(&ker, &B_chall_can->Q);
+    } else {
+        copy_point(&ker, &B_chall_can->P);
+    }
+
+    copy_point(&points[0], &B_chall_can->P); // the three basis points are pushed through the isogeny
+    copy_point(&points[1], &B_chall_can->Q);
+    copy_point(&points[2], &B_chall_can->PmQ);
+
+    ec_dbl_iter(&ker, pow_dim2_deg_resp + HD_extra_torsion, &ker, E_chall);
+
+#ifndef NDEBUG
+    if (!test_point_order_twof(&ker, E_chall, sig->two_resp_length))
+        debug_print("kernel does not have order 2^(two_resp_length");
+#endif
+
+    if (ec_eval_small_chain(E_chall, &ker, sig->two_resp_length, points, 3, false)) { // non-zero means failure
+        return 0;
+    }
+
+#ifndef NDEBUG
+    if (!test_point_order_twof(&points[0], E_chall, HD_extra_torsion + pow_dim2_deg_resp))
+        debug_print("points[0] does not have order 2^(HD_extra_torsion + pow_dim2_deg_resp");
+    if (!test_point_order_twof(&points[1], E_chall, HD_extra_torsion + pow_dim2_deg_resp))
+        debug_print("points[1] does not have order 2^(HD_extra_torsion + pow_dim2_deg_resp");
+    if (!test_point_order_twof(&points[2], E_chall, HD_extra_torsion + pow_dim2_deg_resp))
+        debug_print("points[2] does not have order 2^(HD_extra_torsion + pow_dim2_deg_resp");
+#endif
+
+    copy_point(&B_chall_can->P, &points[0]); // write the pushed points back into the caller's basis
+    copy_point(&B_chall_can->Q, &points[1]);
+    copy_point(&B_chall_can->PmQ, &points[2]);
+    return 1;
+}
+
+// The commitment curve can be recovered from the codomain of the 2D isogeny built from the
+// bases computed during verification. Returns non-zero on success, 0 otherwise.
+static int
+compute_commitment_curve_verify(ec_curve_t *E_com,
+                                const ec_basis_t *B_chall_can,
+                                const ec_basis_t *B_aux_can,
+                                const ec_curve_t *E_chall,
+                                const ec_curve_t *E_aux,
+                                int pow_dim2_deg_resp)
+
+{
+#ifndef NDEBUG
+    // Check all the points are the correct order
+    if (!test_basis_order_twof(B_chall_can, E_chall, HD_extra_torsion + pow_dim2_deg_resp))
+        debug_print("B_chall_can does not have order 2^(HD_extra_torsion + pow_dim2_deg_resp");
+
+    if (!test_basis_order_twof(B_aux_can, E_aux, HD_extra_torsion + pow_dim2_deg_resp))
+        debug_print("B_aux_can does not have order 2^(HD_extra_torsion + pow_dim2_deg_resp");
+#endif
+
+    // now compute the dim2 isogeny from Echall x E_aux -> E_com x E_aux'
+    // of kernel B_chall_can x B_aux_can
+
+    // first we set-up the kernel
+    theta_couple_curve_t EchallxEaux;
+    copy_curve(&EchallxEaux.E1, E_chall);
+    copy_curve(&EchallxEaux.E2, E_aux);
+
+    theta_kernel_couple_points_t dim_two_ker;
+    copy_bases_to_kernel(&dim_two_ker, B_chall_can, B_aux_can);
+
+    // computing the isogeny
+    theta_couple_curve_t codomain;
+    int codomain_splits;
+    ec_curve_init(&codomain.E1);
+    ec_curve_init(&codomain.E2);
+    // handling the special case where we don't need to perform any dim2 computation
+    if (pow_dim2_deg_resp == 0) {
+        codomain_splits = 1;
+        copy_curve(&codomain.E1, &EchallxEaux.E1); // the domain curves are used as the codomain
+        copy_curve(&codomain.E2, &EchallxEaux.E2);
+        // We still need to check that E_chall is supersingular
+        // This assumes that HD_extra_torsion == 2
+        if (!ec_is_basis_four_torsion(B_chall_can, E_chall)) {
+            return 0;
+        }
+    } else {
+        codomain_splits = theta_chain_compute_and_eval_verify( // NULL, 0: no points are passed for evaluation
+            pow_dim2_deg_resp, &EchallxEaux, &dim_two_ker, true, &codomain, NULL, 0);
+    }
+
+    // computing the commitment curve
+    // its always the first one because of our (2^n,2^n)-isogeny formulae
+    copy_curve(E_com, &codomain.E1);
+
+    return codomain_splits;
+}
+
+// SQIsign verification: returns 1 when sig verifies for the message m (l bytes) under pk, 0 otherwise
+int
+protocols_verify(signature_t *sig, const public_key_t *pk, const unsigned char *m, size_t l)
+{
+    int verify;
+
+    // Computation of the length of the dim 2 2^n isogeny
+    int pow_dim2_deg_resp = SQIsign_response_length - (int)sig->two_resp_length - (int)sig->backtracking;
+
+    // basic sanity test: checking that the response is not too long
+    if (pow_dim2_deg_resp < 0)
+        return 0;
+    // The dim 2 isogeny embeds a dim 1 isogeny of odd degree, so it can
+    // never be of length 2.
+    if (pow_dim2_deg_resp == 1)
+        return 0;
+    // Runs only after the length test: backtracking <= SQIsign_response_length, so the bound's shift count is not negative
+    if (!check_canonical_basis_change_matrix(sig))
+        return 0;
+
+    // check the public curve is valid
+    if (!ec_curve_verify_A(&(pk->curve).A))
+        return 0;
+
+    // Set auxiliary curve from the A-coefficient within the signature
+    ec_curve_t E_aux;
+    if (!ec_curve_init_from_A(&E_aux, &sig->E_aux_A))
+        return 0; // invalid curve
+
+    // checking that we are given A-coefficients and no precomputation
+    assert(fp2_is_one(&pk->curve.C) == 0xFFFFFFFF && !pk->curve.is_A24_computed_and_normalized);
+
+    // computation of the challenge
+    ec_curve_t E_chall;
+    if (!compute_challenge_verify(&E_chall, sig, &pk->curve, pk->hint_pk)) {
+        return 0;
+    }
+
+    // Computation of the canonical bases for the challenge and aux curve
+    ec_basis_t B_chall_can, B_aux_can;
+
+    if (!challenge_and_aux_basis_verify(&B_chall_can, &B_aux_can, &E_chall, &E_aux, sig, pow_dim2_deg_resp)) {
+        return 0;
+    }
+
+    // When two_resp_length != 0 we need to compute a second, short 2^r-isogeny
+    if (sig->two_resp_length > 0) {
+        if (!two_response_isogeny_verify(&E_chall, &B_chall_can, sig, pow_dim2_deg_resp)) {
+            return 0;
+        }
+    }
+
+    // We can recover the commitment curve with a 2D isogeny.
+    // On failure the supplied signature did not yield an isogeny between
+    // elliptic products, and so it definitely is an invalid signature.
+    ec_curve_t E_com;
+    if (!compute_commitment_curve_verify(&E_com, &B_chall_can, &B_aux_can, &E_chall, &E_aux, pow_dim2_deg_resp))
+        return 0;
+
+    scalar_t chk_chall;
+
+    // recomputing the challenge vector
+    hash_to_challenge(&chk_chall, pk, &E_com, m, l);
+
+    // performing the final check
+    verify = mp_compare(sig->chall_coeff, chk_chall, NWORDS_ORDER) == 0;
+
+    return verify;
+}
diff --git a/src/pqm4/sqisign_lvl1/ref/xeval.c b/src/pqm4/sqisign_lvl1/ref/xeval.c
new file mode 100644
index 0000000..7fc7170
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/xeval.c
@@ -0,0 +1,64 @@
+#include "isog.h"
+#include "ec.h"
+#include <assert.h>
+
+// -----------------------------------------------------------------------------------------
+// -----------------------------------------------------------------------------------------
+
+// Degree-2 isogeny evaluation with kernel generated by P != (0, 0)
+void
+xeval_2(ec_point_t *R, ec_point_t *const Q, const int lenQ, const ec_kps2_t *kps)
+{
+    fp2_t t0, t1, t2;
+    for (int j = 0; j < lenQ; j++) { // R[j] is computed from Q[j] and the kernel data in kps
+        fp2_add(&t0, &Q[j].x, &Q[j].z); // t0 = X + Z
+        fp2_sub(&t1, &Q[j].x, &Q[j].z); // t1 = X - Z
+        fp2_mul(&t2, &kps->K.x, &t1);   // t2 = K.x * (X - Z)
+        fp2_mul(&t1, &kps->K.z, &t0);   // t1 = K.z * (X + Z)
+        fp2_add(&t0, &t2, &t1);
+        fp2_sub(&t1, &t2, &t1);
+        fp2_mul(&R[j].x, &Q[j].x, &t0); // output X = X * (t2 + t1)
+        fp2_mul(&R[j].z, &Q[j].z, &t1); // output Z = Z * (t2 - t1)
+    }
+}
+
+void
+xeval_2_singular(ec_point_t *R, const ec_point_t *Q, const int lenQ, const ec_kps2_t *kps)
+{
+    fp2_t t0, t1;
+    for (int i = 0; i < lenQ; i++) { // R[i] is computed from Q[i] and the kernel data in kps
+        fp2_mul(&t0, &Q[i].x, &Q[i].z);   // t0 = X * Z
+        fp2_mul(&t1, &kps->K.x, &Q[i].z); // t1 = K.x * Z
+        fp2_add(&t1, &t1, &Q[i].x);
+        fp2_mul(&t1, &t1, &Q[i].x);       // t1 = X * (X + K.x * Z)
+        fp2_sqr(&R[i].x, &Q[i].z);
+        fp2_add(&R[i].x, &R[i].x, &t1);   // output X = Z^2 + t1
+        fp2_mul(&R[i].z, &t0, &kps->K.z); // output Z = X * Z * K.z
+    }
+}
+
+// Degree-4 isogeny evaluation with kernel generated by P such that [2]P != (0, 0)
+void
+xeval_4(ec_point_t *R, const ec_point_t *Q, const int lenQ, const ec_kps4_t *kps)
+{
+    const ec_point_t *K = kps->K; // only K[0].x, K[1].x and K[2].x are read below
+
+    fp2_t t0, t1;
+
+    for (int i = 0; i < lenQ; i++) {
+        fp2_add(&t0, &Q[i].x, &Q[i].z); // t0 = X + Z
+        fp2_sub(&t1, &Q[i].x, &Q[i].z); // t1 = X - Z
+        fp2_mul(&(R[i].x), &t0, &K[1].x);
+        fp2_mul(&(R[i].z), &t1, &K[2].x);
+        fp2_mul(&t0, &t0, &t1);
+        fp2_mul(&t0, &t0, &K[0].x); // t0 = (X + Z) * (X - Z) * K[0].x
+        fp2_add(&t1, &(R[i].x), &(R[i].z));
+        fp2_sub(&(R[i].z), &(R[i].x), &(R[i].z));
+        fp2_sqr(&t1, &t1);
+        fp2_sqr(&(R[i].z), &(R[i].z));
+        fp2_add(&(R[i].x), &t0, &t1);
+        fp2_sub(&t0, &t0, &(R[i].z));
+        fp2_mul(&(R[i].x), &(R[i].x), &t1); // final X
+        fp2_mul(&(R[i].z), &(R[i].z), &t0); // final Z
+    }
+}
+}
diff --git a/src/pqm4/sqisign_lvl1/ref/xisog.c b/src/pqm4/sqisign_lvl1/ref/xisog.c
new file mode 100644
index 0000000..7242d29
--- /dev/null
+++ b/src/pqm4/sqisign_lvl1/ref/xisog.c
@@ -0,0 +1,61 @@
+#include "isog.h"
+#include "ec.h"
+#include <assert.h>
+
+// -------------------------------------------------------------------------
+// -------------------------------------------------------------------------
+
+// Degree-2 isogeny with kernel generated by P != (0 ,0)
+// Outputs the curve coefficient in the form A24=(A+2C:4C)
+void
+xisog_2(ec_kps2_t *kps, ec_point_t *B, const ec_point_t P)
+{
+    fp2_sqr(&B->x, &P.x); // B.x = X^2
+    fp2_sqr(&B->z, &P.z); // B.z = Z^2
+    fp2_sub(&B->x, &B->z, &B->x); // B.x = Z^2 - X^2, so B = (Z^2 - X^2 : Z^2)
+    fp2_add(&kps->K.x, &P.x, &P.z); // K.x = X + Z, read back by xeval_2
+    fp2_sub(&kps->K.z, &P.x, &P.z); // K.z = X - Z, read back by xeval_2
+}
+
+void
+xisog_2_singular(ec_kps2_t *kps, ec_point_t *B24, ec_point_t A24)
+{
+    // No need to check the square root, only used for signing.
+    fp2_t t0, four;
+    fp2_set_small(&four, 4);
+    fp2_add(&t0, &A24.x, &A24.x); // t0 = 2*A24.x
+    fp2_sub(&t0, &t0, &A24.z); // t0 = 2*A24.x - A24.z
+    fp2_add(&t0, &t0, &t0); // t0 = 4*A24.x - 2*A24.z
+    fp2_inv(&A24.z); // A24 is passed by value, so only the local copy is modified
+    fp2_mul(&t0, &t0, &A24.z); // t0 = (4*A24.x - 2*A24.z) / A24.z, which is A/C when A24 = (A+2C : 4C)
+    fp2_copy(&kps->K.x, &t0); // K.x = t0
+    fp2_add(&B24->x, &t0, &t0); // B24.x = 2*t0
+    fp2_sqr(&t0, &t0);
+    fp2_sub(&t0, &t0, &four); // t0 = K.x^2 - 4
+    fp2_sqrt(&t0); // t0 = s, a square root of K.x^2 - 4 (not verified, see note above)
+    fp2_neg(&kps->K.z, &t0); // K.z = -s
+    fp2_add(&B24->z, &t0, &t0); // B24.z = 2*s
+    fp2_add(&B24->x, &B24->x, &B24->z); // B24.x = 2*K.x + 2*s
+    fp2_add(&B24->z, &B24->z, &B24->z); // B24.z = 4*s
+}
+
+// Degree-4 isogeny with kernel generated by P such that [2]P != (0 ,0)
+// Outputs the curve coefficient in the form A24=(A+2C:4C)
+void
+xisog_4(ec_kps4_t *kps, ec_point_t *B, const ec_point_t P)
+{
+    ec_point_t *K = kps->K;
+
+    fp2_sqr(&K[0].x, &P.x); // K[0].x = X^2 (scratch)
+    fp2_sqr(&K[0].z, &P.z); // K[0].z = Z^2
+    fp2_add(&K[1].x, &K[0].z, &K[0].x); // K[1].x = Z^2 + X^2 (scratch)
+    fp2_sub(&K[1].z, &K[0].z, &K[0].x); // K[1].z = Z^2 - X^2
+    fp2_mul(&B->x, &K[1].x, &K[1].z); // B.x = Z^4 - X^4
+    fp2_sqr(&B->z, &K[0].z); // B.z = Z^4
+
+    // Constants for xeval_4
+    fp2_add(&K[2].x, &P.x, &P.z); // K[2].x = X + Z
+    fp2_sub(&K[1].x, &P.x, &P.z); // K[1].x = X - Z (overwrites the scratch value above)
+    fp2_add(&K[0].x, &K[0].z, &K[0].z); // K[0].x = 2*Z^2
+    fp2_add(&K[0].x, &K[0].x, &K[0].x); // K[0].x = 4*Z^2
+}
diff --git a/src/pqm4/sqisign_lvl3/ref/api.h b/src/pqm4/sqisign_lvl3/ref/api.h
new file mode 100644
index 0000000..1670ea6
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/api.h
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef api_h
+#define api_h
+
+#include <stddef.h>
+#include <sqisign_namespace.h>
+
+#define CRYPTO_SECRETKEYBYTES 529
+#define CRYPTO_PUBLICKEYBYTES 97
+#define CRYPTO_BYTES 224
+
+#define CRYPTO_ALGNAME "SQIsign_lvl3"
+
+SQISIGN_API
+int
+crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
+
+SQISIGN_API
+int
+crypto_sign(unsigned char *sm, size_t *smlen,
+            const unsigned char *m, size_t mlen,
+            const unsigned char *sk);
+
+SQISIGN_API
+int
+crypto_sign_open(unsigned char *m, size_t *mlen,
+                 const unsigned char *sm, size_t smlen,
+                 const unsigned char *pk);
+
+#endif /* api_h */
diff --git a/src/pqm4/sqisign_lvl3/ref/basis.c b/src/pqm4/sqisign_lvl3/ref/basis.c
new file mode 100644
index 0000000..94cb7fc
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/basis.c
@@ -0,0 +1,416 @@
+#include "ec.h"
+#include "fp2.h"
+#include "e0_basis.h"
+#include <assert.h>
+
+uint32_t
+ec_recover_y(fp2_t *y, const fp2_t *Px, const ec_curve_t *curve)
+{ // Recover y-coordinate of a point on the Montgomery curve y^2 = x^3 + Ax^2 + x
+    fp2_t t0;
+
+    fp2_sqr(&t0, Px);
+    fp2_mul(y, &t0, &curve->A); // Ax^2
+    fp2_add(y, y, Px);          // Ax^2 + x
+    fp2_mul(&t0, &t0, Px);
+    fp2_add(y, y, &t0); // x^3 + Ax^2 + x
+    // This is required, because we do not yet know that our curves are
+    // supersingular so our points live on the twist with B = 1.
+    return fp2_sqrt_verify(y);
+}
+
+static void
+difference_point(ec_point_t *PQ, const ec_point_t *P, const ec_point_t *Q, const ec_curve_t *curve)
+{
+    // Given P,Q in projective x-only, computes a deterministic choice for (P-Q)
+    // Based on Proposition 3 of https://eprint.iacr.org/2017/518.pdf
+
+    fp2_t Bxx, Bxz, Bzz, t0, t1;
+
+    fp2_mul(&t0, &P->x, &Q->x);
+    fp2_mul(&t1, &P->z, &Q->z);
+    fp2_sub(&Bxx, &t0, &t1);
+    fp2_sqr(&Bxx, &Bxx);
+    fp2_mul(&Bxx, &Bxx, &curve->C); // C*(P.x*Q.x-P.z*Q.z)^2
+    fp2_add(&Bxz, &t0, &t1);
+    fp2_mul(&t0, &P->x, &Q->z);
+    fp2_mul(&t1, &P->z, &Q->x);
+    fp2_add(&Bzz, &t0, &t1);
+    fp2_mul(&Bxz, &Bxz, &Bzz); // (P.x*Q.x+P.z*Q.z)(P.x*Q.z+P.z*Q.x)
+    fp2_sub(&Bzz, &t0, &t1);
+    fp2_sqr(&Bzz, &Bzz);
+    fp2_mul(&Bzz, &Bzz, &curve->C); // C*(P.x*Q.z-P.z*Q.x)^2
+    fp2_mul(&Bxz, &Bxz, &curve->C); // C*(P.x*Q.x+P.z*Q.z)(P.x*Q.z+P.z*Q.x)
+    fp2_mul(&t0, &t0, &t1);
+    fp2_mul(&t0, &t0, &curve->A);
+    fp2_add(&t0, &t0, &t0);
+    fp2_add(&Bxz, &Bxz, &t0); // C*(P.x*Q.x+P.z*Q.z)(P.x*Q.z+P.z*Q.x) + 2*A*P.x*Q.z*P.z*Q.x
+
+    // To ensure that the denominator is a fourth power in Fp, we normalize by
+    // C*C_bar^2*(P.z)_bar^2*(Q.z)_bar^2
+    fp_copy(&t0.re, &curve->C.re);
+    fp_neg(&t0.im, &curve->C.im);
+    fp2_sqr(&t0, &t0);
+    fp2_mul(&t0, &t0, &curve->C);
+    fp_copy(&t1.re, &P->z.re);
+    fp_neg(&t1.im, &P->z.im);
+    fp2_sqr(&t1, &t1);
+    fp2_mul(&t0, &t0, &t1);
+    fp_copy(&t1.re, &Q->z.re);
+    fp_neg(&t1.im, &Q->z.im);
+    fp2_sqr(&t1, &t1);
+    fp2_mul(&t0, &t0, &t1);
+    fp2_mul(&Bxx, &Bxx, &t0);
+    fp2_mul(&Bxz, &Bxz, &t0);
+    fp2_mul(&Bzz, &Bzz, &t0);
+
+    // Solving quadratic equation
+    fp2_sqr(&t0, &Bxz);
+    fp2_mul(&t1, &Bxx, &Bzz);
+    fp2_sub(&t0, &t0, &t1);
+    // No need to check if t0 is square, as per the entangled basis algorithm.
+    fp2_sqrt(&t0);
+    fp2_add(&PQ->x, &Bxz, &t0);
+    fp2_copy(&PQ->z, &Bzz);
+}
+
+// Lifts a basis x(P), x(Q), x(P-Q) assuming the curve has (A/C : 1) and the point
+// P = (X/Z : 1). For generic implementation see lift_basis()
+uint32_t
+lift_basis_normalized(jac_point_t *P, jac_point_t *Q, ec_basis_t *B, ec_curve_t *E)
+{
+    assert(fp2_is_one(&B->P.z));
+    assert(fp2_is_one(&E->C));
+
+    fp2_copy(&P->x, &B->P.x);
+    fp2_copy(&Q->x, &B->Q.x);
+    fp2_copy(&Q->z, &B->Q.z);
+    fp2_set_one(&P->z);
+    uint32_t ret = ec_recover_y(&P->y, &P->x, E);
+
+    // Algorithm of Okeya-Sakurai to recover y.Q in the montgomery model
+    fp2_t v1, v2, v3, v4;
+    fp2_mul(&v1, &P->x, &Q->z);
+    fp2_add(&v2, &Q->x, &v1);
+    fp2_sub(&v3, &Q->x, &v1);
+    fp2_sqr(&v3, &v3);
+    fp2_mul(&v3, &v3, &B->PmQ.x);
+    fp2_add(&v1, &E->A, &E->A);
+    fp2_mul(&v1, &v1, &Q->z);
+    fp2_add(&v2, &v2, &v1);
+    fp2_mul(&v4, &P->x, &Q->x);
+    fp2_add(&v4, &v4, &Q->z);
+    fp2_mul(&v2, &v2, &v4);
+    fp2_mul(&v1, &v1, &Q->z);
+    fp2_sub(&v2, &v2, &v1);
+    fp2_mul(&v2, &v2, &B->PmQ.z);
+    fp2_sub(&Q->y, &v3, &v2);
+    fp2_add(&v1, &P->y, &P->y);
+    fp2_mul(&v1, &v1, &Q->z);
+    fp2_mul(&v1, &v1, &B->PmQ.z);
+    fp2_mul(&Q->x, &Q->x, &v1);
+    fp2_mul(&Q->z, &Q->z, &v1);
+
+    // Transforming to a jacobian coordinate
+    fp2_sqr(&v1, &Q->z);
+    fp2_mul(&Q->y, &Q->y, &v1);
+    fp2_mul(&Q->x, &Q->x, &Q->z);
+    return ret;
+}
+
+uint32_t
+lift_basis(jac_point_t *P, jac_point_t *Q, ec_basis_t *B, ec_curve_t *E)
+{
+    // Normalise the curve E such that (A : C) is (A/C : 1)
+    // and the point x(P) = (X/Z : 1).
+    fp2_t inverses[2];
+    fp2_copy(&inverses[0], &B->P.z);
+    fp2_copy(&inverses[1], &E->C);
+
+    fp2_batched_inv(inverses, 2);
+    fp2_set_one(&B->P.z);
+    fp2_set_one(&E->C);
+
+    fp2_mul(&B->P.x, &B->P.x, &inverses[0]);
+    fp2_mul(&E->A, &E->A, &inverses[1]);
+
+    // Lift the basis to Jacobian points P, Q
+    return lift_basis_normalized(P, Q, B, E);
+}
+
+// Given an x-coordinate, determines if this is a valid
+// point on the curve. Assumes C=1.
+static uint32_t
+is_on_curve(const fp2_t *x, const ec_curve_t *curve)
+{
+    assert(fp2_is_one(&curve->C));
+    fp2_t t0;
+
+    fp2_add(&t0, x, &curve->A); // x + (A/C)
+    fp2_mul(&t0, &t0, x);       // x^2 + (A/C)*x
+    fp2_add_one(&t0, &t0);      // x^2 + (A/C)*x + 1
+    fp2_mul(&t0, &t0, x);       // x^3 + (A/C)*x^2 + x
+
+    return fp2_is_square(&t0);
+}
+
+// Helper function which given a point of order k*2^n with n maximal
+// and k odd, computes a point of order 2^f
+static inline void
+clear_cofactor_for_maximal_even_order(ec_point_t *P, ec_curve_t *curve, int f)
+{
+    // clear out the odd cofactor to get a point of order 2^n
+    ec_mul(P, p_cofactor_for_2f, P_COFACTOR_FOR_2F_BITLENGTH, P, curve);
+
+    // clear the power of two to get a point of order 2^f
+    for (int i = 0; i < TORSION_EVEN_POWER - f; i++) {
+        xDBL_A24(P, P, &curve->A24, curve->is_A24_computed_and_normalized);
+    }
+}
+
+// Helper function which finds an NQR -1 / (1 + i*b) for entangled basis generation
+static uint8_t
+find_nqr_factor(fp2_t *x, ec_curve_t *curve, const uint8_t start)
+{
+    // factor = -1/(1 + i*b) for b in Fp will be NQR whenever 1 + b^2 is NQR
+    // in Fp, so we find one of these and then invert (1 + i*b). We store b
+    // as a u8 hint to save time in verification.
+
+    // We return the hint as a u8, but use (uint16_t)n to give 2^16 - 1
+    // to make failure cryptographically negligible, with a fallback when
+    // n > 128 is required.
+    uint8_t hint;
+    uint32_t found = 0;
+    uint16_t n = start;
+
+    bool qr_b = 1;
+    fp_t b, tmp;
+    fp2_t z, t0, t1;
+
+    do {
+        while (qr_b) {
+            // find b with 1 + b^2 a non-quadratic residue
+            fp_set_small(&tmp, (uint32_t)n * n + 1);
+            qr_b = fp_is_square(&tmp);
+            n++; // keeps track of b = n - 1
+        }
+
+        // for Px := -A/(1 + i*b) to be on the curve
+        // is equivalent to A^2*(z-1) - z^2 NQR for z = 1 + i*b
+        // thus prevents unnecessary inversion pre-check
+
+        // t0 = z - 1 = i*b
+        // t1 = z = 1 + i*b
+        fp_set_small(&b, (uint32_t)n - 1);
+        fp2_set_zero(&t0);
+        fp2_set_one(&z);
+        fp_copy(&z.im, &b);
+        fp_copy(&t0.im, &b);
+
+        // A^2*(z-1) - z^2
+        fp2_sqr(&t1, &curve->A);
+        fp2_mul(&t0, &t0, &t1); // A^2 * (z - 1)
+        fp2_sqr(&t1, &z);
+        fp2_sub(&t0, &t0, &t1); // A^2 * (z - 1) - z^2
+        found = !fp2_is_square(&t0);
+
+        qr_b = 1;
+    } while (!found);
+
+    // set Px to -A/(1 + i*b)
+    fp2_copy(x, &z);
+    fp2_inv(x);
+    fp2_mul(x, x, &curve->A);
+    fp2_neg(x, x);
+
+    /*
+     * With very low probability n will not fit in 7 bits.
+     * We set hint = 0 which signals failure and the need
+     * to generate a value on the fly during verification
+     */
+    hint = n <= 128 ? n - 1 : 0;
+
+    return hint;
+}
+
+// Helper: searches n = start, start+1, ... until x = n*A passes is_on_curve(); writes x and returns n as a hint (0 if n >= 128)
+static uint8_t
+find_nA_x_coord(fp2_t *x, ec_curve_t *curve, const uint8_t start)
+{
+    assert(!fp2_is_square(&curve->A)); // Only to be called when A is a NQR
+
+    // when A is NQR we allow x(P) to be a multiple n*A of A (n is 16 bits wide so a long search cannot wrap into the hint range)
+    uint16_t n = start;
+    if (n == 1) {
+        fp2_copy(x, &curve->A);
+    } else {
+        fp2_mul_small(x, &curve->A, n);
+    }
+
+    while (!is_on_curve(x, curve)) {
+        fp2_add(x, x, &curve->A); // x = (n + 1) * A
+        n++;
+    }
+
+    /*
+     * With very low probability (1/2^128), n will not fit in 7 bits.
+     * In this case, we set hint = 0 which signals failure and the need
+     * to generate a value on the fly during verification
+     */
+    uint8_t hint = n < 128 ? (uint8_t)n : 0;
+    return hint;
+}
+
+// The entangled basis generation does not allow A = 0
+// so we simply return the one we have already precomputed
+static void
+ec_basis_E0_2f(ec_basis_t *PQ2, ec_curve_t *curve, int f)
+{
+    assert(fp2_is_zero(&curve->A));
+    ec_point_t P, Q;
+
+    // Set P, Q to precomputed (X : 1) values
+    fp2_copy(&P.x, &BASIS_E0_PX);
+    fp2_copy(&Q.x, &BASIS_E0_QX);
+    fp2_set_one(&P.z);
+    fp2_set_one(&Q.z);
+
+    // clear the power of two to get a point of order 2^f
+    for (int i = 0; i < TORSION_EVEN_POWER - f; i++) {
+        xDBL_E0(&P, &P);
+        xDBL_E0(&Q, &Q);
+    }
+
+    // Set P, Q in the basis and compute x(P - Q)
+    copy_point(&PQ2->P, &P);
+    copy_point(&PQ2->Q, &Q);
+    difference_point(&PQ2->PmQ, &P, &Q, curve);
+}
+
+// Computes a basis E[2^f] = <P, Q> where the point Q is above (0 : 0)
+// and stores hints as an array for faster recomputation at a later point
+uint8_t
+ec_curve_to_basis_2f_to_hint(ec_basis_t *PQ2, ec_curve_t *curve, int f)
+{
+    // Normalise (A/C : 1) and ((A + 2)/4 : 1)
+    ec_normalize_curve_and_A24(curve);
+
+    if (fp2_is_zero(&curve->A)) {
+        ec_basis_E0_2f(PQ2, curve, f);
+        return 0;
+    }
+
+    uint8_t hint;
+    bool hint_A = fp2_is_square(&curve->A);
+
+    // Compute the points P, Q
+    ec_point_t P, Q;
+
+    if (!hint_A) {
+        // when A is NQR we allow x(P) to be a multiple n*A of A
+        hint = find_nA_x_coord(&P.x, curve, 1);
+    } else {
+        // when A is QR we instead have to find (1 + b^2) a NQR
+        // such that x(P) = -A / (1 + i*b)
+        hint = find_nqr_factor(&P.x, curve, 1);
+    }
+
+    fp2_set_one(&P.z);
+    fp2_add(&Q.x, &curve->A, &P.x);
+    fp2_neg(&Q.x, &Q.x);
+    fp2_set_one(&Q.z);
+
+    // clear out the odd cofactor to get a point of order 2^f
+    clear_cofactor_for_maximal_even_order(&P, curve, f);
+    clear_cofactor_for_maximal_even_order(&Q, curve, f);
+
+    // compute PmQ, set PmQ to Q to ensure Q above (0,0)
+    difference_point(&PQ2->Q, &P, &Q, curve);
+    copy_point(&PQ2->P, &P);
+    copy_point(&PQ2->PmQ, &Q);
+
+    // Finally, we compress hint_A and hint into a single bytes.
+    // We choose to set the LSB of hint to hint_A
+    assert(hint < 128); // We expect hint to be 7-bits in size
+    return (hint << 1) | hint_A;
+}
+
+// Computes a basis E[2^f] = <P, Q> where the point Q is above (0 : 0)
+// given the hints as an array for faster basis computation
+int
+ec_curve_to_basis_2f_from_hint(ec_basis_t *PQ2, ec_curve_t *curve, int f, const uint8_t hint)
+{
+    // Normalise (A/C : 1) and ((A + 2)/4 : 1)
+    ec_normalize_curve_and_A24(curve);
+
+    if (fp2_is_zero(&curve->A)) {
+        ec_basis_E0_2f(PQ2, curve, f);
+        return 1;
+    }
+
+    // The LSB of hint encodes whether A is a QR
+    // The remaining 7-bits are used to find a valid x(P)
+    bool hint_A = hint & 1;
+    uint8_t hint_P = hint >> 1;
+
+    // Compute the points P, Q
+    ec_point_t P, Q;
+
+    if (!hint_P) {
+        // When hint_P = 0 it means we did not find a point in 128 attempts
+        // this is very rare and we almost never expect to need this fallback
+        // In either case, we can start with b = 128 to skip testing the known
+        // values which will not work
+        if (!hint_A) {
+            find_nA_x_coord(&P.x, curve, 128);
+        } else {
+            find_nqr_factor(&P.x, curve, 128);
+        }
+    } else {
+        // Otherwise we use the hint to directly find x(P) based on hint_A
+        if (!hint_A) {
+            // when A is NQR, we have found n such that x(P) = n*A
+            fp2_mul_small(&P.x, &curve->A, hint_P);
+        } else {
+            // when A is QR we have found b such that (1 + b^2) is a NQR in
+            // Fp, so we must compute x(P) = -A / (1 + i*b)
+            fp_set_one(&P.x.re);
+            fp_set_small(&P.x.im, hint_P);
+            fp2_inv(&P.x);
+            fp2_mul(&P.x, &P.x, &curve->A);
+            fp2_neg(&P.x, &P.x);
+        }
+    }
+    fp2_set_one(&P.z);
+
+#ifndef NDEBUG
+    int passed = 1;
+    passed = is_on_curve(&P.x, curve);
+    passed &= !fp2_is_square(&P.x);
+
+    if (!passed)
+        return 0;
+#endif
+
+    // set xQ to -xP - A
+    fp2_add(&Q.x, &curve->A, &P.x);
+    fp2_neg(&Q.x, &Q.x);
+    fp2_set_one(&Q.z);
+
+    // clear out the odd cofactor to get a point of order 2^f
+    clear_cofactor_for_maximal_even_order(&P, curve, f);
+    clear_cofactor_for_maximal_even_order(&Q, curve, f);
+
+    // compute PmQ, set PmQ to Q to ensure Q above (0,0)
+    difference_point(&PQ2->Q, &P, &Q, curve);
+    copy_point(&PQ2->P, &P);
+    copy_point(&PQ2->PmQ, &Q);
+
+#ifndef NDEBUG
+    passed &= test_basis_order_twof(PQ2, curve, f);
+
+    if (!passed)
+        return 0;
+#endif
+
+    return 1;
+}
diff --git a/src/pqm4/sqisign_lvl3/ref/common.c b/src/pqm4/sqisign_lvl3/ref/common.c
new file mode 100644
index 0000000..d393e9c
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/common.c
@@ -0,0 +1,88 @@
+#include <fips202.h>
+#include <tutil.h>
+#include <mp.h>
+#include <encoded_sizes.h>
+#include <ec_params.h>
+#include <verification.h>
+
+void
+public_key_init(public_key_t *pk)
+{
+    ec_curve_init(&pk->curve);
+}
+
+void
+public_key_finalize(public_key_t *pk)
+{
+}
+
+// compute the challenge as the hash of the message and the commitment curve and public key
+void
+hash_to_challenge(scalar_t *scalar,
+                  const public_key_t *pk,
+                  const ec_curve_t *com_curve,
+                  const unsigned char *message,
+                  size_t length)
+{
+    unsigned char buf[2 * FP2_ENCODED_BYTES];
+    {
+        fp2_t j1, j2;
+        ec_j_inv(&j1, &pk->curve);
+        ec_j_inv(&j2, com_curve);
+        fp2_encode(buf, &j1);
+        fp2_encode(buf + FP2_ENCODED_BYTES, &j2);
+    }
+
+    {
+        // The type scalar_t represents an element of GF(p), which is about
+        // 2*lambda bits, where lambda = 128, 192 or 256, according to the
+        // security level. Thus, the variable scalar should have enough memory
+        // for the values produced by SHAKE256 in the intermediate iterations.
+
+        shake256incctx ctx;
+
+        size_t hash_bytes = ((2 * SECURITY_BITS) + 7) / 8;
+        size_t limbs = (hash_bytes + sizeof(digit_t) - 1) / sizeof(digit_t);
+        size_t bits = (2 * SECURITY_BITS) % RADIX;
+        digit_t mask = ((digit_t)-1) >> ((RADIX - bits) % RADIX);
+#ifdef TARGET_BIG_ENDIAN
+        mask = BSWAP_DIGIT(mask);
+#endif
+
+        shake256_inc_init(&ctx);
+        shake256_inc_absorb(&ctx, buf, 2 * FP2_ENCODED_BYTES);
+        shake256_inc_absorb(&ctx, message, length);
+        shake256_inc_finalize(&ctx);
+        shake256_inc_squeeze((void *)(*scalar), hash_bytes, &ctx);
+        (*scalar)[limbs - 1] &= mask;
+        for (int i = 2; i < HASH_ITERATIONS; i++) {
+            shake256_inc_init(&ctx);
+            shake256_inc_absorb(&ctx, (void *)(*scalar), hash_bytes);
+            shake256_inc_finalize(&ctx);
+            shake256_inc_squeeze((void *)(*scalar), hash_bytes, &ctx);
+            (*scalar)[limbs - 1] &= mask;
+        }
+        shake256_inc_init(&ctx);
+        shake256_inc_absorb(&ctx, (void *)(*scalar), hash_bytes);
+        shake256_inc_finalize(&ctx);
+
+        hash_bytes = ((TORSION_EVEN_POWER - SQIsign_response_length) + 7) / 8;
+        limbs = (hash_bytes + sizeof(digit_t) - 1) / sizeof(digit_t);
+        bits = (TORSION_EVEN_POWER - SQIsign_response_length) % RADIX;
+        mask = ((digit_t)-1) >> ((RADIX - bits) % RADIX);
+#ifdef TARGET_BIG_ENDIAN
+        mask = BSWAP_DIGIT(mask);
+#endif
+
+        memset(*scalar, 0, NWORDS_ORDER * sizeof(digit_t));
+        shake256_inc_squeeze((void *)(*scalar), hash_bytes, &ctx);
+        (*scalar)[limbs - 1] &= mask;
+
+#ifdef TARGET_BIG_ENDIAN
+        for (int i = 0; i < NWORDS_ORDER; i++)
+            (*scalar)[i] = BSWAP_DIGIT((*scalar)[i]);
+#endif
+
+        mp_mod_2exp(*scalar, SECURITY_BITS, NWORDS_ORDER);
+    }
+}
diff --git a/src/pqm4/sqisign_lvl3/ref/config.mk b/src/pqm4/sqisign_lvl3/ref/config.mk
new file mode 100644
index 0000000..cd822b5
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/config.mk
@@ -0,0 +1,2 @@
+elf/crypto_sign_sqisign_lvl3_ref_%.elf: CPPFLAGS+=-DRADIX_32 -DSQISIGN_BUILD_TYPE_REF -DSQISIGN_GF_IMPL_REF -DSQISIGN_VARIANT=lvl3 -DTARGET_ARM -DTARGET_OS_OTHER -DNDEBUG -DDISABLE_NAMESPACING -DBIG_PUBLIC_KEY_TESTS
+obj/libcrypto_sign_sqisign_lvl3_ref.a: CPPFLAGS+=-DRADIX_32 -DSQISIGN_BUILD_TYPE_REF -DSQISIGN_GF_IMPL_REF -DSQISIGN_VARIANT=lvl3 -DTARGET_ARM -DTARGET_OS_OTHER -DNDEBUG -DDISABLE_NAMESPACING -DBIG_PUBLIC_KEY_TESTS
diff --git a/src/pqm4/sqisign_lvl3/ref/e0_basis.c b/src/pqm4/sqisign_lvl3/ref/e0_basis.c
new file mode 100644
index 0000000..1b12a83
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/e0_basis.c
@@ -0,0 +1,55 @@
+#include <e0_basis.h>
+const fp2_t BASIS_E0_PX = {
+#if 0
+#elif RADIX == 16
+{0x1196, 0x134b, 0xdbd, 0x118d, 0x712, 0x1646, 0x5d7, 0x8eb, 0x431, 0xf5b, 0x161e, 0x13b6, 0x1c07, 0x42, 0x8ba, 0xeec, 0x1a43, 0x545, 0x1cdb, 0x1659, 0x1614, 0xde, 0x72d, 0x1b80, 0x1706, 0x15a3, 0x894, 0xd4a, 0x1b2f, 0x12}
+#elif RADIX == 32
+{0x9a5c65a, 0xa31adbd, 0x7b231c4, 0xc51d65d, 0x1e7ad90, 0x1e76d6, 0x8ba0217, 0xe90ddd8, 0x3cdb2a2, 0xf5852cb, 0x72d06, 0xd1dc1b7, 0xa94894a, 0x14cbd}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x31c4a31adbd9a5c6, 0xe7ad90c51d65d7b2, 0x88ba021701e76d61, 0x2cb3cdb2a2e90ddd, 0xdc1b70072d06f585, 0x16eecbda94894ad1}
+#else
+{0x94635b7b34b8c, 0x431475975ec8c7, 0x380f3b6b0f3d6c, 0x2e90ddd88ba021, 0x5eb0a59679b654, 0x347706dc01cb41, 0xb7765ed4a44a5}
+#endif
+#endif
+, 
+#if 0
+#elif RADIX == 16
+{0xa85, 0x10cc, 0x1ef, 0xb0b, 0x1082, 0x5be, 0xd14, 0x1100, 0x1a33, 0x174b, 0x181c, 0x83e, 0x1034, 0x18ba, 0x205, 0x1f39, 0x1e9, 0x1998, 0x130e, 0x801, 0xfeb, 0x698, 0xdf9, 0x6a5, 0x5b6, 0x2c8, 0x1283, 0xad9, 0x960, 0x1e}
+#elif RADIX == 32
+{0x8662a17, 0x96161ef, 0x42df420, 0xce200d1, 0x1cba5e8, 0xd107d8, 0x205c5d4, 0x7a7e72, 0x330eccc, 0xc3fad00, 0x4adf934, 0x6416d8d, 0x5b32831, 0x2f581}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0xf42096161ef8662a, 0xcba5e8ce200d142d, 0x2205c5d40d107d81, 0xd00330eccc07a7e7, 0x16d8d4adf934c3fa, 0x6065815b3283164}
+#else
+{0x412c2c3df0cc54, 0x2338803450b7d0, 0x206883ec0e5d2f, 0x407a7e72205c5d, 0x187f5a00661d99, 0x5905b6352b7e4d, 0x3032c0ad99418}
+#endif
+#endif
+};
+const fp2_t BASIS_E0_QX = {
+#if 0
+#elif RADIX == 16
+{0x16ed, 0x818, 0x127a, 0xcfb, 0x1be6, 0x1b40, 0x1bf1, 0xe75, 0x129c, 0x151, 0x425, 0x142e, 0x1edb, 0x254, 0x5cc, 0x1a5b, 0x1e1d, 0x1e27, 0x1a12, 0x8a8, 0x59e, 0x933, 0x1647, 0x686, 0x19e, 0x1e51, 0x151f, 0x1b6e, 0x1efe, 0xd}
+#elif RADIX == 32
+{0x40c5bb5, 0x99f727a, 0x1da06f9, 0x71cebbf, 0x250a8ca, 0xb6e85c4, 0x5cc12a7, 0xf8774b6, 0x1a12f13, 0x9967915, 0xd64749, 0x288678d, 0x6dd51ff, 0x2ebfb}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x6f999f727a40c5b, 0x50a8ca71cebbf1da, 0x65cc12a7b6e85c42, 0x9151a12f13f8774b, 0x8678d0d647499967, 0x2e23bfb6dd51ff28}
+#else
+{0x7333ee4f4818b7, 0x29c73aefc7681b, 0x3db742e2128546, 0x3f8774b65cc12a, 0x332cf22a3425e2, 0x4a219e343591d2, 0x6d1dfdb6ea8ff}
+#endif
+#endif
+, 
+#if 0
+#elif RADIX == 16
+{0x18a9, 0x1838, 0x1588, 0x1720, 0xf3f, 0x1fcd, 0x44d, 0x1e6b, 0x681, 0x1249, 0x1f8a, 0x5af, 0x1f58, 0x1c12, 0xf21, 0x1887, 0x278, 0x156a, 0xbfe, 0x765, 0x12f7, 0x4da, 0x16ce, 0x7c1, 0x1c04, 0x1773, 0x853, 0xab7, 0xe1d, 0x1a}
+#elif RADIX == 32
+{0xc1c62a7, 0xee41588, 0xdfe6bcf, 0x7cd644, 0x8a9249a, 0xd60b5ff, 0xf21e097, 0x9e310e, 0xabfeab5, 0xd4bdcec, 0x836ce26, 0xb9f010f, 0x56e853b, 0x10875}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x6bcfee41588c1c62, 0xa9249a07cd644dfe, 0xef21e097d60b5ff8, 0xcecabfeab509e310, 0xf010f836ce26d4bd, 0x2a7787556e853bb9}
+#else
+{0x1fdc82b11838c5, 0x681f359137f9af, 0x3eb05affc54924, 0x509e310ef21e09, 0x5a97b9d957fd56, 0x6e7c043e0db389, 0x4fbc3aab7429d}
+#endif
+#endif
+};
diff --git a/src/pqm4/sqisign_lvl3/ref/e0_basis.h b/src/pqm4/sqisign_lvl3/ref/e0_basis.h
new file mode 100644
index 0000000..05cafb8
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/e0_basis.h
@@ -0,0 +1,3 @@
+#include <fp2.h>
+extern const fp2_t BASIS_E0_PX;
+extern const fp2_t BASIS_E0_QX;
diff --git a/src/pqm4/sqisign_lvl3/ref/ec.c b/src/pqm4/sqisign_lvl3/ref/ec.c
new file mode 100644
index 0000000..be4e4e5
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/ec.c
@@ -0,0 +1,665 @@
+#include <assert.h>
+#include <stdio.h>
+#include <mp.h>
+#include <ec.h>
+
+void
+ec_point_init(ec_point_t *P)
+{ // Initialize point as identity element (1:0)
+    fp2_set_one(&(P->x));
+    fp2_set_zero(&(P->z));
+}
+
+void
+ec_curve_init(ec_curve_t *E)
+{ // Initialize the curve struct
+    // Initialize the constants
+    fp2_set_zero(&(E->A));
+    fp2_set_one(&(E->C));
+
+    // Initialize the point (A+2 : 4C)
+    ec_point_init(&(E->A24));
+
+    // Set the bool to be false by default
+    E->is_A24_computed_and_normalized = false;
+}
+
+void
+select_point(ec_point_t *Q, const ec_point_t *P1, const ec_point_t *P2, const digit_t option)
+{ // Select points in constant time
+  // If option = 0 then Q <- P1, else if option = 0xFF...FF then Q <- P2
+    fp2_select(&(Q->x), &(P1->x), &(P2->x), option);
+    fp2_select(&(Q->z), &(P1->z), &(P2->z), option);
+}
+
+void
+cswap_points(ec_point_t *P, ec_point_t *Q, const digit_t option)
+{ // Swap points in constant time
+  // If option = 0 then P <- P and Q <- Q, else if option = 0xFF...FF then P <- Q and Q <- P
+    fp2_cswap(&(P->x), &(Q->x), option);
+    fp2_cswap(&(P->z), &(Q->z), option);
+}
+
+void
+ec_normalize_point(ec_point_t *P)
+{
+    fp2_inv(&P->z);
+    fp2_mul(&P->x, &P->x, &P->z);
+    fp2_set_one(&(P->z));
+}
+
+void
+ec_normalize_curve(ec_curve_t *E)
+{
+    fp2_inv(&E->C);
+    fp2_mul(&E->A, &E->A, &E->C);
+    fp2_set_one(&E->C);
+}
+
+void
+ec_curve_normalize_A24(ec_curve_t *E)
+{
+    if (!E->is_A24_computed_and_normalized) {
+        AC_to_A24(&E->A24, E);
+        ec_normalize_point(&E->A24);
+        E->is_A24_computed_and_normalized = true;
+    }
+    assert(fp2_is_one(&E->A24.z));
+}
+
+void
+ec_normalize_curve_and_A24(ec_curve_t *E)
+{ // Neither the curve nor A24 is guaranteed to be normalized on entry.
+  // First we normalize the curve to (A/C : 1), then compute A24 unless it is already flagged as computed.
+    if (!fp2_is_one(&E->C)) {
+        ec_normalize_curve(E);
+    }
+
+    if (!E->is_A24_computed_and_normalized) {
+        // Now compute A24 = ((A + 2) / 4 : 1)
+        fp2_add_one(&E->A24.x, &E->A);     // re(A24.x) = re(A) + 1
+        fp2_add_one(&E->A24.x, &E->A24.x); // re(A24.x) = re(A) + 2
+        fp_copy(&E->A24.x.im, &E->A.im);   // im(A24.x) = im(A)
+
+        fp2_half(&E->A24.x, &E->A24.x); // (A + 2) / 2
+        fp2_half(&E->A24.x, &E->A24.x); // (A + 2) / 4
+        fp2_set_one(&E->A24.z);
+
+        E->is_A24_computed_and_normalized = true;
+    }
+}
+
+uint32_t
+ec_is_zero(const ec_point_t *P)
+{
+    return fp2_is_zero(&P->z);
+}
+
+uint32_t
+ec_has_zero_coordinate(const ec_point_t *P)
+{
+    return fp2_is_zero(&P->x) | fp2_is_zero(&P->z);
+}
+
+uint32_t
+ec_is_equal(const ec_point_t *P, const ec_point_t *Q)
+{ // Evaluate if two points in Montgomery coordinates (X:Z) are equal
+  // Returns 0xFFFFFFFF (true) if P=Q, 0 (false) otherwise
+    fp2_t t0, t1;
+
+    // Check if P, Q are the points at infinity (the results are combined as all-ones/zero masks below)
+    uint32_t l_zero = ec_is_zero(P);
+    uint32_t r_zero = ec_is_zero(Q);
+
+    // Check if PX * QZ = QX * PZ; bit 0 is stretched so that a "true" of either 1 or all-ones becomes a full mask
+    fp2_mul(&t0, &P->x, &Q->z);
+    fp2_mul(&t1, &P->z, &Q->x);
+    uint32_t lr_equal = (uint32_t)0 - (fp2_is_equal(&t0, &t1) & 1);
+
+    // Points are equal if
+    // - Both are zero, or
+    // - neither are zero AND PX * QZ = QX * PZ (all terms joined with &; a `*` here binds tighter than & and can yield 1)
+    return (l_zero & r_zero) | (~l_zero & ~r_zero & lr_equal);
+}
+
+uint32_t
+ec_is_two_torsion(const ec_point_t *P, const ec_curve_t *E)
+{
+    if (ec_is_zero(P))
+        return 0;
+
+    uint32_t x_is_zero, tmp_is_zero;
+    fp2_t t0, t1, t2;
+    fp2_add(&t0, &P->x, &P->z);
+    fp2_sqr(&t0, &t0);
+    fp2_sub(&t1, &P->x, &P->z);
+    fp2_sqr(&t1, &t1);
+    fp2_sub(&t2, &t0, &t1);
+    fp2_add(&t1, &t0, &t1);
+    fp2_mul(&t2, &t2, &E->A);
+    fp2_mul(&t1, &t1, &E->C);
+    fp2_add(&t1, &t1, &t1);
+    fp2_add(&t0, &t1, &t2); // 4 (CX^2+CZ^2+AXZ)
+
+    x_is_zero = fp2_is_zero(&P->x);
+    tmp_is_zero = fp2_is_zero(&t0);
+
+    // two torsion if x or x^2 + Ax + 1 is zero
+    return x_is_zero | tmp_is_zero;
+}
+
+uint32_t
+ec_is_four_torsion(const ec_point_t *P, const ec_curve_t *E)
+{
+    ec_point_t test;
+    xDBL_A24(&test, P, &E->A24, E->is_A24_computed_and_normalized);
+    return ec_is_two_torsion(&test, E);
+}
+
+uint32_t
+ec_is_basis_four_torsion(const ec_basis_t *B, const ec_curve_t *E)
+{ // Nonzero iff [2]P and [2]Q are both nonzero 2-torsion points and [2]P != [2]Q; 0 otherwise (PmQ is not inspected)
+    ec_point_t P2, Q2;
+    xDBL_A24(&P2, &B->P, &E->A24, E->is_A24_computed_and_normalized);
+    xDBL_A24(&Q2, &B->Q, &E->A24, E->is_A24_computed_and_normalized);
+    uint32_t same = (uint32_t)0 - (ec_is_equal(&P2, &Q2) & 1); // full mask even if "equal" comes back as 1, so ~same is 0
+    return (ec_is_two_torsion(&P2, E) & ec_is_two_torsion(&Q2, E) & ~same);
+}
+
+int
+ec_curve_verify_A(const fp2_t *A)
+{ // A is a usable Montgomery coefficient exactly when A^2-4 \ne 0, i.e. when A is neither 2 nor -2
+  // Return 1 if curve is valid, 0 otherwise
+    fp2_t two, minus_two;
+
+    fp2_set_one(&two);
+    fp_add(&two.re, &two.re, &two.re); // two = 1 + 1
+    fp2_copy(&minus_two, &two);
+    fp_neg(&minus_two.re, &minus_two.re); // minus_two = -2 (only the real part is negated)
+
+    const int is_singular = fp2_is_equal(A, &two) || fp2_is_equal(A, &minus_two);
+    return is_singular ? 0 : 1;
+}
+
+int
+ec_curve_init_from_A(ec_curve_t *E, const fp2_t *A)
+{ // Initialize the curve from the A coefficient and check it is valid
+  // Return 1 if curve is valid, 0 otherwise
+    ec_curve_init(E);
+    fp2_copy(&E->A, A); // Set A
+    return ec_curve_verify_A(A);
+}
+
+void
+ec_j_inv(fp2_t *j_inv, const ec_curve_t *curve)
+{ // j-invariant from the projective Montgomery coefficient (A:C): j = 256*(A^2-3C^2)^3 / (C^4*(A^2-4C^2))
+    fp2_t t0, t1;
+
+    fp2_sqr(&t1, &curve->C); // t1 = C^2
+    fp2_sqr(j_inv, &curve->A); // j = A^2
+    fp2_add(&t0, &t1, &t1); // t0 = 2C^2
+    fp2_sub(&t0, j_inv, &t0); // t0 = A^2 - 2C^2
+    fp2_sub(&t0, &t0, &t1); // t0 = A^2 - 3C^2
+    fp2_sub(j_inv, &t0, &t1); // j = A^2 - 4C^2
+    fp2_sqr(&t1, &t1); // t1 = C^4
+    fp2_mul(j_inv, j_inv, &t1); // j = C^4 * (A^2 - 4C^2), the denominator
+    fp2_add(&t0, &t0, &t0);
+    fp2_add(&t0, &t0, &t0); // t0 = 4 * (A^2 - 3C^2)
+    fp2_sqr(&t1, &t0);
+    fp2_mul(&t0, &t0, &t1); // t0 = 64 * (A^2 - 3C^2)^3
+    fp2_add(&t0, &t0, &t0);
+    fp2_add(&t0, &t0, &t0); // t0 = 256 * (A^2 - 3C^2)^3, the numerator
+    fp2_inv(j_inv); // j = 1 / denominator
+    fp2_mul(j_inv, &t0, j_inv); // j = numerator / denominator
+}
+
+void
+xDBL_E0(ec_point_t *Q, const ec_point_t *P)
+{ // Doubling of a Montgomery point in projective coordinates (X:Z) on the curve E0 with (A:C) = (0:1).
+  // Input: projective Montgomery x-coordinates P = (XP:ZP), where xP=XP/ZP, and Montgomery curve constants (A:C) = (0:1). 
+  // Output: projective Montgomery x-coordinates Q <- 2*P = (XQ:ZQ) such that x(2P)=XQ/ZQ.
+    fp2_t t0, t1, t2;
+
+    fp2_add(&t0, &P->x, &P->z);
+    fp2_sqr(&t0, &t0);
+    fp2_sub(&t1, &P->x, &P->z);
+    fp2_sqr(&t1, &t1);
+    fp2_sub(&t2, &t0, &t1);
+    fp2_add(&t1, &t1, &t1);
+    fp2_mul(&Q->x, &t0, &t1);
+    fp2_add(&Q->z, &t1, &t2);
+    fp2_mul(&Q->z, &Q->z, &t2);
+}
+
+void
+xDBL(ec_point_t *Q, const ec_point_t *P, const ec_point_t *AC)
+{ // Doubling of a Montgomery point in projective coordinates (X:Z). Computation of coefficient values A+2C and 4C
+  // on-the-fly. 
+  // Input: projective Montgomery x-coordinates P = (XP:ZP), where xP=XP/ZP, and Montgomery curve constants (A:C). 
+  // Output: projective Montgomery x-coordinates Q <- 2*P = (XQ:ZQ) such that x(2P)=XQ/ZQ.
+    fp2_t t0, t1, t2, t3;
+
+    fp2_add(&t0, &P->x, &P->z);
+    fp2_sqr(&t0, &t0);
+    fp2_sub(&t1, &P->x, &P->z);
+    fp2_sqr(&t1, &t1);
+    fp2_sub(&t2, &t0, &t1);
+    fp2_add(&t3, &AC->z, &AC->z);
+    fp2_mul(&t1, &t1, &t3);
+    fp2_add(&t1, &t1, &t1);
+    fp2_mul(&Q->x, &t0, &t1);
+    fp2_add(&t0, &t3, &AC->x);
+    fp2_mul(&t0, &t0, &t2);
+    fp2_add(&t0, &t0, &t1);
+    fp2_mul(&Q->z, &t0, &t2);
+}
+
+void
+xDBL_A24(ec_point_t *Q, const ec_point_t *P, const ec_point_t *A24, const bool A24_normalized)
+{ // Doubling of a Montgomery point in projective coordinates (X:Z).
+  // Input: projective Montgomery x-coordinates P = (XP:ZP), where xP=XP/ZP, and
+  //        the Montgomery curve constants A24 = (A+2C:4C) (or A24 = (A+2C/4C:1) if normalized).
+  // Output: projective Montgomery x-coordinates Q <- 2*P = (XQ:ZQ) such that x(2P)=XQ/ZQ.
+    fp2_t t0, t1, t2;
+
+    fp2_add(&t0, &P->x, &P->z);
+    fp2_sqr(&t0, &t0);
+    fp2_sub(&t1, &P->x, &P->z);
+    fp2_sqr(&t1, &t1);
+    fp2_sub(&t2, &t0, &t1);
+    if (!A24_normalized)
+        fp2_mul(&t1, &t1, &A24->z);
+    fp2_mul(&Q->x, &t0, &t1);
+    fp2_mul(&t0, &t2, &A24->x);
+    fp2_add(&t0, &t0, &t1);
+    fp2_mul(&Q->z, &t0, &t2);
+}
+
+void
+xADD(ec_point_t *R, const ec_point_t *P, const ec_point_t *Q, const ec_point_t *PQ)
+{ // Differential addition of Montgomery points in projective coordinates (X:Z).
+  // Input: projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ, and difference
+  //        PQ=P-Q=(XPQ:ZPQ). R is written only after all reads of P and Q, so it may alias either of them.
+  // Output: projective Montgomery point R <- P+Q = (XR:ZR) such that x(P+Q)=XR/ZR.
+    fp2_t t0, t1, t2, t3;
+
+    fp2_add(&t0, &P->x, &P->z);
+    fp2_sub(&t1, &P->x, &P->z);
+    fp2_add(&t2, &Q->x, &Q->z);
+    fp2_sub(&t3, &Q->x, &Q->z);
+    fp2_mul(&t0, &t0, &t3);       // t0 = (XP+ZP)*(XQ-ZQ)
+    fp2_mul(&t1, &t1, &t2);       // t1 = (XP-ZP)*(XQ+ZQ)
+    fp2_add(&t2, &t0, &t1);
+    fp2_sub(&t3, &t0, &t1);
+    fp2_sqr(&t2, &t2);            // t2 = (t0+t1)^2
+    fp2_sqr(&t3, &t3);            // t3 = (t0-t1)^2
+    fp2_mul(&t2, &PQ->z, &t2);    // t2 = ZPQ*(t0+t1)^2, kept in a temporary until ZR has been written
+    fp2_mul(&R->z, &PQ->x, &t3);  // ZR = XPQ*(t0-t1)^2
+    fp2_copy(&R->x, &t2);         // XR = ZPQ*(t0+t1)^2
+}
+
+void
+xDBLADD(ec_point_t *R,
+        ec_point_t *S,
+        const ec_point_t *P,
+        const ec_point_t *Q,
+        const ec_point_t *PQ,
+        const ec_point_t *A24,
+        const bool A24_normalized)
+{ // Simultaneous doubling and differential addition.
+  // Input:  projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ, the difference
+  //         PQ=P-Q=(XPQ:ZPQ), and the Montgomery curve constants A24 = (A+2C:4C) (or A24 = ((A+2C)/4C:1) if normalized).
+  // Output: projective Montgomery points R <- 2*P = (XR:ZR) such that x(2P)=XR/ZR, and S <- P+Q = (XS:ZS) such that
+  //         x(Q+P)=XS/ZS. The ladders in this file call it in place, with R aliasing P and S aliasing Q.
+    fp2_t t0, t1, t2;
+
+    fp2_add(&t0, &P->x, &P->z);
+    fp2_sub(&t1, &P->x, &P->z);       // last read of P: R may be overwritten from here on
+    fp2_sqr(&R->x, &t0);              // R->x = (XP+ZP)^2
+    fp2_sub(&t2, &Q->x, &Q->z);
+    fp2_add(&S->x, &Q->x, &Q->z);     // last read of Q: S->x = XQ+ZQ
+    fp2_mul(&t0, &t0, &t2);           // t0 = (XP+ZP)*(XQ-ZQ)
+    fp2_sqr(&R->z, &t1);              // R->z = (XP-ZP)^2
+    fp2_mul(&t1, &t1, &S->x);         // t1 = (XP-ZP)*(XQ+ZQ)
+    fp2_sub(&t2, &R->x, &R->z);       // t2 = 4*XP*ZP
+    if (!A24_normalized)              // A24->z is never read when the flag is set, i.e. it is taken to be 1
+        fp2_mul(&R->z, &R->z, &A24->z);
+    fp2_mul(&R->x, &R->x, &R->z);     // XR = (XP+ZP)^2 * A24->z*(XP-ZP)^2
+    fp2_mul(&S->x, &A24->x, &t2);     // S->x used as scratch: A24->x*4*XP*ZP
+    fp2_sub(&S->z, &t0, &t1);
+    fp2_add(&R->z, &R->z, &S->x);
+    fp2_add(&S->x, &t0, &t1);
+    fp2_mul(&R->z, &R->z, &t2);       // ZR = 4*XP*ZP * (A24->z*(XP-ZP)^2 + A24->x*4*XP*ZP)
+    fp2_sqr(&S->z, &S->z);            // S->z = (t0-t1)^2
+    fp2_sqr(&S->x, &S->x);            // S->x = (t0+t1)^2
+    fp2_mul(&S->z, &S->z, &PQ->x);    // ZS = XPQ*(t0-t1)^2
+    fp2_mul(&S->x, &S->x, &PQ->z);    // XS = ZPQ*(t0+t1)^2
+}
+
+void
+xMUL(ec_point_t *Q, const ec_point_t *P, const digit_t *k, const int kbits, const ec_curve_t *curve)
+{ // The Montgomery ladder, processing the kbits low bits of k from the most significant one down.
+  // Input: projective Montgomery point P=(XP:ZP) such that xP=XP/ZP, a scalar k of bitlength kbits, and
+  //        the Montgomery curve constants (A:C) (or the cached A24 = ((A+2C)/4C:1) if the curve flags it as normalized).
+  // Output: projective Montgomery points Q <- k*P = (XQ:ZQ) such that x(k*P)=XQ/ZQ.
+    ec_point_t R0, R1, A24;
+    digit_t mask;
+    unsigned int bit, prevbit = 0, swap;
+
+    if (!curve->is_A24_computed_and_normalized) {
+        // Computation of A24=(A+2C:4C); its z-coordinate is 4C, not 1, so this A24 is NOT normalized
+        fp2_add(&A24.x, &curve->C, &curve->C);
+        fp2_add(&A24.z, &A24.x, &A24.x);
+        fp2_add(&A24.x, &A24.x, &curve->A);
+    } else {
+        fp2_copy(&A24.x, &curve->A24.x);
+        fp2_copy(&A24.z, &curve->A24.z);
+        // Assert A24 has been normalised
+        assert(fp2_is_one(&A24.z));
+    }
+
+    // R0 <- (1:0), R1 <- P
+    ec_point_init(&R0);
+    fp2_copy(&R1.x, &P->x);
+    fp2_copy(&R1.z, &P->z);
+
+    // Main loop. The normalization flag is forwarded so that xDBLADD multiplies by A24.z = 4C when A24 was built above
+    for (int i = kbits - 1; i >= 0; i--) {
+        bit = (k[i >> LOG2RADIX] >> (i & (RADIX - 1))) & 1;
+        swap = bit ^ prevbit; // swap only when the current bit differs from the previous one
+        prevbit = bit;
+        mask = 0 - (digit_t)swap; // all ones when a swap is due, zero otherwise
+
+        cswap_points(&R0, &R1, mask);
+        xDBLADD(&R0, &R1, &R0, &R1, P, &A24, curve->is_A24_computed_and_normalized);
+    }
+    // Undo the swap left pending by the last processed bit
+    swap = 0 ^ prevbit;
+    mask = 0 - (digit_t)swap;
+    cswap_points(&R0, &R1, mask);
+
+    fp2_copy(&Q->x, &R0.x);
+    fp2_copy(&Q->z, &R0.z);
+}
+
+int
+xDBLMUL(ec_point_t *S,
+        const ec_point_t *P,
+        const digit_t *k,
+        const ec_point_t *Q,
+        const digit_t *l,
+        const ec_point_t *PQ,
+        const int kbits,
+        const ec_curve_t *curve)
+{ // The Montgomery biladder
+  // Input:  projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ, scalars k and l of
+  //         bitlength kbits, the difference PQ=P-Q=(XPQ:ZPQ), and the curve: unless A = 0, its A24 must be normalized.
+  // Output: projective Montgomery point S <- k*P + l*Q = (XS:ZS) such that x(k*P + l*Q)=XS/ZS.
+  // Returns 1 on success and 0 when P, Q, PQ or the initial sum has a zero coordinate (S is left untouched then).
+    int i, A_is_zero;
+    digit_t evens, mevens, bitk0, bitl0, maskk, maskl, temp, bs1_ip1, bs2_ip1, bs1_i, bs2_i, h;
+    digit_t sigma[2] = { 0 }, pre_sigma = 0;
+    digit_t k_t[NWORDS_ORDER], l_t[NWORDS_ORDER], one[NWORDS_ORDER] = { 0 }, r[2 * BITS] = { 0 }; // r: 2 entries per bit, so kbits <= BITS
+    ec_point_t DIFF1a, DIFF1b, DIFF2a, DIFF2b, R[3] = { 0 }, T[3];
+
+    // differential additions formulas are invalid in this case
+    if (ec_has_zero_coordinate(P) | ec_has_zero_coordinate(Q) | ec_has_zero_coordinate(PQ))
+        return 0;
+
+    // Derive sigma according to parity
+    bitk0 = (k[0] & 1);
+    bitl0 = (l[0] & 1);
+    maskk = 0 - bitk0; // Parity masks: 0 if even, otherwise 1...1
+    maskl = 0 - bitl0;
+    sigma[0] = (bitk0 ^ 1);
+    sigma[1] = (bitl0 ^ 1);
+    evens = sigma[0] + sigma[1]; // Count number of even scalars
+    mevens = 0 - (evens & 1);    // Mask mevens <- 0 if # even of scalars = 0 or 2, otherwise mevens = 1...1
+
+    // If k and l are both even or both odd, pick sigma = (0,1)
+    sigma[0] = (sigma[0] & mevens);
+    sigma[1] = (sigma[1] & mevens) | (1 & ~mevens);
+
+    // Convert even scalars to odd: k_t, l_t start as k-1, l-1 and the parity masks pick between them and k, l
+    one[0] = 1;
+    mp_sub(k_t, k, one, NWORDS_ORDER);
+    mp_sub(l_t, l, one, NWORDS_ORDER);
+    select_ct(k_t, k_t, k, maskk, NWORDS_ORDER);
+    select_ct(l_t, l_t, l, maskl, NWORDS_ORDER);
+
+    // Scalar recoding: fills r[2*i] and r[2*i+1] for every bit position i < kbits
+    for (i = 0; i < kbits; i++) {
+        // Swap k_t and l_t whenever sigma[0] differs from the value it had in the previous iteration
+        maskk = 0 - (sigma[0] ^ pre_sigma);
+        swap_ct(k_t, l_t, maskk, NWORDS_ORDER);
+
+        if (i == kbits - 1) {
+            bs1_ip1 = 0;
+            bs2_ip1 = 0;
+        } else {
+            bs1_ip1 = mp_shiftr(k_t, 1, NWORDS_ORDER);
+            bs2_ip1 = mp_shiftr(l_t, 1, NWORDS_ORDER);
+        }
+        bs1_i = k_t[0] & 1;
+        bs2_i = l_t[0] & 1;
+
+        r[2 * i] = bs1_i ^ bs1_ip1;
+        r[2 * i + 1] = bs2_i ^ bs2_ip1;
+
+        // Revert sigma if second bit, r_(2i+1), is 1
+        pre_sigma = sigma[0];
+        maskk = 0 - r[2 * i + 1];
+        select_ct(&temp, &sigma[0], &sigma[1], maskk, 1);
+        select_ct(&sigma[1], &sigma[1], &sigma[0], maskk, 1);
+        sigma[0] = temp;
+    }
+
+    // Point initialization: R[0] gets ec_point_init, R[1] and R[2] get P and Q in the order chosen by sigma[0]
+    ec_point_init(&R[0]);
+    maskk = 0 - sigma[0];
+    select_point(&R[1], P, Q, maskk);
+    select_point(&R[2], Q, P, maskk);
+
+    fp2_copy(&DIFF1a.x, &R[1].x);
+    fp2_copy(&DIFF1a.z, &R[1].z);
+    fp2_copy(&DIFF1b.x, &R[2].x);
+    fp2_copy(&DIFF1b.z, &R[2].z);
+
+    // Initialize DIFF2a <- P+Q, DIFF2b <- P-Q (R[2] is overwritten with the sum, computed in place)
+    xADD(&R[2], &R[1], &R[2], PQ);
+    if (ec_has_zero_coordinate(&R[2]))
+        return 0; // non valid formulas
+
+    fp2_copy(&DIFF2a.x, &R[2].x);
+    fp2_copy(&DIFF2a.z, &R[2].z);
+    fp2_copy(&DIFF2b.x, &PQ->x);
+    fp2_copy(&DIFF2b.z, &PQ->z);
+
+    A_is_zero = fp2_is_zero(&curve->A);
+
+    // Main loop: one doubling (T[0]) and two differential additions (T[1], T[2]) per recoded bit pair
+    for (i = kbits - 1; i >= 0; i--) {
+        h = r[2 * i] + r[2 * i + 1]; // in {0, 1, 2}
+        maskk = 0 - (h & 1);
+        select_point(&T[0], &R[0], &R[1], maskk);
+        maskk = 0 - (h >> 1);
+        select_point(&T[0], &T[0], &R[2], maskk);
+        if (A_is_zero) {
+            xDBL_E0(&T[0], &T[0]);
+        } else {
+            assert(fp2_is_one(&curve->A24.z));
+            xDBL_A24(&T[0], &T[0], &curve->A24, true);
+        }
+
+        maskk = 0 - r[2 * i + 1]; // in {0, 1...1}
+        select_point(&T[1], &R[0], &R[1], maskk);
+        select_point(&T[2], &R[1], &R[2], maskk);
+
+        cswap_points(&DIFF1a, &DIFF1b, maskk);
+        xADD(&T[1], &T[1], &T[2], &DIFF1a);
+        xADD(&T[2], &R[0], &R[2], &DIFF2a);
+
+        // If h is odd then swap DIFF2a and DIFF2b
+        maskk = 0 - (h & 1);
+        cswap_points(&DIFF2a, &DIFF2b, maskk);
+
+        // R <- T
+        copy_point(&R[0], &T[0]);
+        copy_point(&R[1], &T[1]);
+        copy_point(&R[2], &T[2]);
+    }
+
+    // Output R[evens]: mevens picks between R[0] and R[1], then R[2] takes over when k and l are both odd
+    select_point(S, &R[0], &R[1], mevens);
+
+    maskk = 0 - (bitk0 & bitl0);
+    select_point(S, S, &R[2], maskk);
+    return 1;
+}
+
+int
+ec_ladder3pt(ec_point_t *R,
+             const digit_t *m,
+             const ec_point_t *P,
+             const ec_point_t *Q,
+             const ec_point_t *PQ,
+             const ec_curve_t *E)
+{ // The 3-point Montgomery ladder, scanning all NWORDS_ORDER digits of m from the least significant bit up.
+  // Input:  projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ), their difference PQ=P-Q=(XPQ:ZPQ), the scalar m
+  //         (NWORDS_ORDER digits), and a curve E whose A24 = ((A+2C)/4C:1) has been computed and normalized.
+  // Output: R <- P + m*Q = (XR:ZR); returns 1 on success, 0 if A24.z is not one or PQ has a zero coordinate.
+    assert(E->is_A24_computed_and_normalized);
+    if (!fp2_is_one(&E->A24.z)) {
+        return 0;
+    }
+    if (ec_has_zero_coordinate(PQ)) { // formulas are not valid in that case
+        return 0;
+    }
+
+    // Ladder state: acc_q starts at Q, acc_p at P and acc_d at P-Q
+    ec_point_t acc_q, acc_p, acc_d;
+    copy_point(&acc_q, Q);
+    copy_point(&acc_p, P);
+    copy_point(&acc_d, PQ);
+
+    for (int word = 0; word < NWORDS_ORDER; word++) {
+        digit_t probe = 1;
+        for (int pos = 0; pos < RADIX; pos++, probe <<= 1) {
+            // All-ones mask when the current bit of m is clear, zero when it is set
+            const digit_t bit_clear = 0 - (digit_t)((probe & m[word]) == 0);
+            cswap_points(&acc_p, &acc_d, bit_clear);
+            xDBLADD(&acc_q, &acc_p, &acc_q, &acc_p, &acc_d, &E->A24, true);
+            cswap_points(&acc_p, &acc_d, bit_clear);
+        }
+    }
+
+    copy_point(R, &acc_p);
+    return 1;
+}
+
+// WRAPPERS to export
+
+void
+ec_dbl(ec_point_t *res, const ec_point_t *P, const ec_curve_t *curve)
+{ // res <- [2]P, using whichever doubling formula matches the state of the curve
+    if (!curve->is_A24_computed_and_normalized) {
+        // No normalized A24 cached: xDBL derives A+2C and 4C from (A:C), read through the leading fields of curve
+        xDBL(res, P, (const ec_point_t *)curve);
+        return;
+    }
+    // A24 = ((A+2C)/4C : 1) is cached, which saves multiplications
+    assert(fp2_is_one(&curve->A24.z));
+    xDBL_A24(res, P, &curve->A24, true);
+}
+
+void
+ec_dbl_iter(ec_point_t *res, int n, const ec_point_t *P, ec_curve_t *curve)
+{ // res <- [2^n]P by repeated doubling; n == 0 just copies P. Calls ec_curve_normalize_A24 on curve when n > 50.
+    if (n == 0) {
+        copy_point(res, P);
+        return;
+    }
+
+    // When the chain is long enough, paying for the normalisation of A24 is worth it
+    if (n > 50) {
+        ec_curve_normalize_A24(curve);
+    }
+
+    if (!curve->is_A24_computed_and_normalized) {
+        // Generic doubling: the first step reads P, the remaining n - 1 steps update res in place
+        xDBL(res, P, (const ec_point_t *)curve);
+        for (int left = n - 1; left > 0; left--) {
+            xDBL(res, res, (const ec_point_t *)curve);
+        }
+        return;
+    }
+    // Normalized A24: same chain with the cheaper formula
+    assert(fp2_is_one(&curve->A24.z));
+    xDBL_A24(res, P, &curve->A24, true);
+    for (int left = n - 1; left > 0; left--) {
+        assert(fp2_is_one(&curve->A24.z));
+        xDBL_A24(res, res, &curve->A24, true);
+    }
+}
+
+void
+ec_dbl_iter_basis(ec_basis_t *res, int n, const ec_basis_t *B, ec_curve_t *curve)
+{ // Apply ec_dbl_iter with the same n to each of the three points of B (P, Q, then PmQ), storing into res
+    const ec_point_t *src[3] = { &B->P, &B->Q, &B->PmQ };
+    ec_point_t *dst[3] = { &res->P, &res->Q, &res->PmQ };
+    for (int i = 0; i < 3; i++) { ec_dbl_iter(dst[i], n, src[i], curve); }
+}
+
+void
+ec_mul(ec_point_t *res, const digit_t *scalar, const int kbits, const ec_point_t *P, ec_curve_t *curve)
+{ // res <- scalar*P through the Montgomery ladder xMUL, which reads the kbits low bits of scalar
+    const bool long_scalar = kbits > 50;
+
+    // For large scalars it's worth normalising A24 first (done in place on the caller's curve)
+    if (long_scalar) {
+        ec_curve_normalize_A24(curve);
+    }
+    xMUL(res, P, scalar, kbits, curve);
+}
+
+int
+ec_biscalar_mul(ec_point_t *res,
+                const digit_t *scalarP,
+                const digit_t *scalarQ,
+                const int kbits,
+                const ec_basis_t *PQ,
+                const ec_curve_t *curve)
+{ // res <- scalarP*P + scalarQ*Q for the basis PQ = (P, Q, P-Q). Returns 1 on success, 0 on failure.
+    if (fp2_is_zero(&PQ->PmQ.z)) {
+        return 0;
+    }
+
+    if (kbits != 1) {
+        // Generic case: biladder on a private copy of the curve, with A24 normalized unless A is zero
+        ec_curve_t E;
+        copy_curve(&E, curve);
+        if (!fp2_is_zero(&curve->A)) {
+            ec_curve_normalize_A24(&E);
+        }
+        return xDBLMUL(res, &PQ->P, scalarP, &PQ->Q, scalarQ, &PQ->PmQ, kbits, (const ec_curve_t *)&E);
+    }
+
+    /* Differential additions behave badly when PmQ = (0:1). Since we assume P, Q are a basis, this can
+     * happen only if kbits==1, which is handled here; the basis must then consist of 2-torsion points */
+    if (!ec_is_two_torsion(&PQ->P, curve) || !ec_is_two_torsion(&PQ->Q, curve) ||
+        !ec_is_two_torsion(&PQ->PmQ, curve)) {
+        return 0;
+    }
+
+    // Only the parity of each scalar matters here; pick the matching point directly
+    const digit_t odd_P = scalarP[0] & 1;
+    const digit_t odd_Q = scalarQ[0] & 1;
+    if (odd_P && odd_Q) {
+        copy_point(res, &PQ->PmQ);
+    } else if (odd_P) {
+        copy_point(res, &PQ->P);
+    } else if (odd_Q) {
+        copy_point(res, &PQ->Q);
+    } else {
+        ec_point_init(res); //(1: 0)
+    }
+    return 1;
+}
diff --git a/src/pqm4/sqisign_lvl3/ref/ec.h b/src/pqm4/sqisign_lvl3/ref/ec.h
new file mode 100644
index 0000000..ee2be38
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/ec.h
@@ -0,0 +1,668 @@
+﻿/** @file
+ *
+ * @authors Luca De Feo, Francisco RH
+ *
+ * @brief Elliptic curve stuff
+ */
+
+#ifndef EC_H
+#define EC_H
+#include <sqisign_namespace.h>
+#include <ec_params.h>
+#include <fp2.h>
+#include <tools.h>
+#include <stdio.h>
+
+/** @defgroup ec Elliptic curves
+ * @{
+ */
+
+/** @defgroup ec_t Data structures
+ * @{
+ */
+
+/** @brief Projective point on the Kummer line E/pm 1 in Montgomery coordinates
+ *
+ * @typedef ec_point_t
+ *
+ * @struct ec_point_t
+ *
+ * A projective point in x-only (X:Z) coordinates; no Y coordinate is stored.
+ */
+typedef struct ec_point_t
+{
+    fp2_t x; ///< X, the affine x-coordinate being X/Z
+    fp2_t z; ///< Z
+} ec_point_t;
+
+/** @brief Point of a Montgomery curve in Jacobian coordinates
+ *
+ * @typedef jac_point_t
+ *
+ * @struct jac_point_t
+ *
+ * A point in (X:Y:Z) coordinates, compared by jac_is_equal through X*Z'^2 and Y*Z'^3
+ */
+typedef struct jac_point_t
+{
+    fp2_t x;
+    fp2_t y;
+    fp2_t z;
+} jac_point_t;
+
+/** @brief Addition components
+ *
+ * @typedef add_components_t
+ *
+ * @struct add_components_t
+ *
+ * 3 components u,v,w that define the (X:Z) coordinates of both
+ * addition and subtraction of two distinct points with
+ * P+Q = (u-v:w) and P-Q = (u+v:w)
+ */
+typedef struct add_components_t
+{
+    fp2_t u;
+    fp2_t v;
+    fp2_t w;
+} add_components_t;
+
+/** @brief A basis of a torsion subgroup
+ *
+ * @typedef ec_basis_t
+ *
+ * @struct ec_basis_t
+ *
+ * A triplet of x-only points: the two basis points P, Q and their difference P-Q.
+ */
+typedef struct ec_basis_t
+{
+    ec_point_t P;
+    ec_point_t Q;
+    ec_point_t PmQ; ///< P - Q, the difference needed by the differential-addition ladders
+} ec_basis_t;
+
+/** @brief An elliptic curve
+ *
+ * @typedef ec_curve_t
+ *
+ * @struct ec_curve_t
+ *
+ * An elliptic curve in projective Montgomery form. A and C must stay the first two fields, in this order:
+ * ec_dbl and ec_dbl_iter read the curve through a (const ec_point_t *) cast to hand (A:C) to xDBL.
+ */
+typedef struct ec_curve_t
+{
+    fp2_t A;
+    fp2_t C;                             ///< cannot be 0
+    ec_point_t A24;                      // the point (A+2C : 4C)
+    bool is_A24_computed_and_normalized; // says if A24 has been computed and normalized
+} ec_curve_t;
+
+/** @brief An isogeny of degree a power of 2, described by its domain and a kernel generator
+ *
+ * @typedef ec_isog_even_t
+ *
+ * @struct ec_isog_even_t
+ */
+typedef struct ec_isog_even_t
+{
+    ec_curve_t curve;  ///< The domain curve
+    ec_point_t kernel; ///< A kernel generator
+    unsigned length;   ///< The length as a 2-isogeny walk
+} ec_isog_even_t;
+
+/** @brief Isomorphism of Montgomery curves
+ *
+ * @typedef ec_isom_t
+ *
+ * @struct ec_isom_t
+ *
+ * The isomorphism is given by the map (X:Z) ↦ ( (Nx X + Nz Z) : (D Z) )
+ */
+typedef struct ec_isom_t
+{
+    fp2_t Nx;
+    fp2_t Nz;
+    fp2_t D;
+} ec_isom_t;
+
+// end ec_t
+/** @}
+ */
+
+/** @defgroup ec_curve_t Curves and isomorphisms
+ * @{
+ */
+
+// Initialisation for curves and points
+void ec_curve_init(ec_curve_t *E);
+void ec_point_init(ec_point_t *P);
+
+/**
+ * @brief Verify that a Montgomery coefficient is valid
+ *
+ * @param A an fp2_t
+ *
+ * @return 0  if curve is invalid, 1 otherwise
+ */
+int ec_curve_verify_A(const fp2_t *A);
+
+/**
+ * @brief Initialize an elliptic curve from a coefficient
+ *
+ * @param A an fp2_t
+ * @param E the elliptic curve to initialize
+ *
+ * @return 0  if curve is invalid, 1 otherwise
+ */
+int ec_curve_init_from_A(ec_curve_t *E, const fp2_t *A);
+
+// Copying points, bases and curves
+static inline void
+copy_point(ec_point_t *P, const ec_point_t *Q)
+{ // P <- Q (destination first), one coordinate at a time
+    fp2_copy(&P->z, &Q->z);
+    fp2_copy(&P->x, &Q->x);
+}
+
+static inline void
+copy_basis(ec_basis_t *B1, const ec_basis_t *B0)
+{ // B1 <- B0 (destination first): all three points P, Q and P-Q are copied
+    copy_point(&B1->PmQ, &B0->PmQ);
+    copy_point(&B1->Q, &B0->Q);
+    copy_point(&B1->P, &B0->P);
+}
+
+static inline void
+copy_curve(ec_curve_t *E1, const ec_curve_t *E2)
+{ // E1 <- E2 (destination first): coefficients, cached A24 and its normalization flag
+    copy_point(&E1->A24, &E2->A24);
+    E1->is_A24_computed_and_normalized = E2->is_A24_computed_and_normalized;
+    fp2_copy(&E1->A, &E2->A);
+    fp2_copy(&E1->C, &E2->C);
+}
+
+// Functions for working with the A24 point and normalisation
+
+/**
+ * @brief Reduce (A : C) to (A/C : 1) in place
+ *
+ * @param E a curve
+ */
+void ec_normalize_curve(ec_curve_t *E);
+
+/**
+ * @brief Reduce (A + 2 : 4C) to ((A+2)/4C : 1) in place
+ *
+ * @param E a curve
+ */
+void ec_curve_normalize_A24(ec_curve_t *E);
+
+/**
+ * @brief Normalise both (A : C) and (A + 2 : 4C) as above, in place
+ *
+ * @param E a curve
+ */
+void ec_normalize_curve_and_A24(ec_curve_t *E);
+
+/**
+ * @brief Given a curve E, compute (A+2C : 4C)
+ *
+ * @param A24 the value (A+2C : 4C) to return into; the cached normalized value when E already has one
+ * @param E a curve
+ */
+static inline void
+AC_to_A24(ec_point_t *A24, const ec_curve_t *E)
+{
+    if (!E->is_A24_computed_and_normalized) {
+        // A24 = (A+2C : 4C); z carries 2C until the final doubling
+        fp2_add(&A24->z, &E->C, &E->C);
+        fp2_add(&A24->x, &E->A, &A24->z);
+        fp2_add(&A24->z, &A24->z, &A24->z);
+        return;
+    }
+
+    // Maybe we already have this computed: hand out the cached copy
+    copy_point(A24, &E->A24);
+}
+
+/**
+ * @brief Given the point (A+2C : 4C), compute the curve coefficients (A : C)
+ *
+ * @param E a curve to compute; only E->A and E->C are written
+ * @param A24 the value (A+2C : 4C)
+ */
+static inline void
+A24_to_AC(ec_curve_t *E, const ec_point_t *A24)
+{
+    // (A:C) = (4*(A+2C) - 2*(4C) : 4C) = (4A : 4C), built as 2*(2*x - z)
+    fp2_add(&E->A, &A24->x, &A24->x);
+    fp2_sub(&E->A, &E->A, &A24->z);
+    fp2_add(&E->A, &E->A, &E->A);
+    fp2_copy(&E->C, &A24->z);
+}
+
+/**
+ * @brief j-invariant.
+ *
+ * @param j_inv computed j_invariant
+ * @param curve input curve
+ */
+void ec_j_inv(fp2_t *j_inv, const ec_curve_t *curve);
+
+/**
+ * @brief Isomorphism of elliptic curve
+ * Takes as input two isomorphic Kummer lines in Montgomery form, and output an isomorphism between
+ * them
+ *
+ * @param isom computed isomorphism
+ * @param from domain curve
+ * @param to image curve
+ * @return 0xFFFFFFFF if there was an error during the computation, zero otherwise
+ */
+uint32_t ec_isomorphism(ec_isom_t *isom, const ec_curve_t *from, const ec_curve_t *to);
+
+/**
+ * @brief In-place evaluation of an isomorphism
+ *
+ * @param P a point
+ * @param isom an isomorphism
+ */
+void ec_iso_eval(ec_point_t *P, ec_isom_t *isom);
+
+/** @}
+ */
+/** @defgroup ec_point_t Point operations
+ * @{
+ */
+
+/**
+ * @brief Point equality
+ *
+ * @param P a point
+ * @param Q a point
+ * @return 0xFFFFFFFF if equal, zero otherwise
+ */
+uint32_t ec_is_equal(const ec_point_t *P, const ec_point_t *Q);
+
+/**
+ * @brief Test for the point at infinity
+ *
+ * @param P a point
+ * @return 0xFFFFFFFF if point at infinity, zero otherwise
+ */
+uint32_t ec_is_zero(const ec_point_t *P);
+
+/**
+ * @brief Two torsion test
+ *
+ * @param P a point
+ * @param E the elliptic curve
+ * @return 0xFFFFFFFF if P is 2-torsion but not zero, zero otherwise
+ */
+uint32_t ec_is_two_torsion(const ec_point_t *P, const ec_curve_t *E);
+
+/**
+ * @brief Four torsion test
+ *
+ * @param P a point
+ * @param E the elliptic curve
+ * @return 0xFFFFFFFF if P passes the four-torsion test (presumably: order exactly 4 - confirm in ec.c), zero otherwise
+ */
+uint32_t ec_is_four_torsion(const ec_point_t *P, const ec_curve_t *E);
+
+/**
+ * @brief Reduce Z-coordinate of point in place
+ *
+ * @param P a point
+ */
+void ec_normalize_point(ec_point_t *P);
+
+void xDBL_E0(ec_point_t *Q, const ec_point_t *P);
+void xADD(ec_point_t *R, const ec_point_t *P, const ec_point_t *Q, const ec_point_t *PQ);
+void xDBL_A24(ec_point_t *Q, const ec_point_t *P, const ec_point_t *A24, const bool A24_normalized);
+
+/**
+ * @brief Point doubling
+ *
+ * @param res computed double of P
+ * @param P a point
+ * @param curve an elliptic curve
+ */
+void ec_dbl(ec_point_t *res, const ec_point_t *P, const ec_curve_t *curve);
+
+/**
+ * @brief Point iterated doubling
+ *
+ * @param res computed iterated double [2^n]P
+ * @param P a point
+ * @param n the number of doublings
+ * @param curve the curve on which P lies
+ */
+void ec_dbl_iter(ec_point_t *res, int n, const ec_point_t *P, ec_curve_t *curve);
+
+/**
+ * @brief Iterated doubling for a basis P, Q, PmQ
+ *
+ * @param res the computed iterated double of basis B
+ * @param n the number of doubles
+ * @param B the basis to double
+ * @param curve the parent curve of the basis
+ */
+void ec_dbl_iter_basis(ec_basis_t *res, int n, const ec_basis_t *B, ec_curve_t *curve);
+
+/**
+ * @brief Point multiplication
+ *
+ * @param res computed scalar * P
+ * @param curve the curve
+ * @param scalar an unsigned multi-precision integer
+ * @param P a point
+ * @param kbits number of bits of the scalar
+ */
+void ec_mul(ec_point_t *res, const digit_t *scalar, const int kbits, const ec_point_t *P, ec_curve_t *curve);
+
+/**
+ * @brief Combination P+m*Q
+ *
+ * @param R computed P + m * Q
+ * @param curve the curve
+ * @param m an unsigned multi-precision integer
+ * @param P a point
+ * @param Q a point
+ * @param PQ the difference P-Q
+ * @return 0 if there was an error, 1 otherwise
+ */
+int ec_ladder3pt(ec_point_t *R,
+                 const digit_t *m,
+                 const ec_point_t *P,
+                 const ec_point_t *Q,
+                 const ec_point_t *PQ,
+                 const ec_curve_t *curve);
+
+/**
+ * @brief Linear combination of points of a basis
+ *
+ * @param res computed scalarP * P + scalarQ * Q
+ * @param scalarP an unsigned multi-precision integer
+ * @param scalarQ an unsigned multi-precision integer
+ * @param kbits number of bits of the scalars, or n for points of order 2^n
+ * @param PQ a torsion basis consisting of points P and Q
+ * @param curve the curve
+ *
+ * @return 0 if there was an error, 1 otherwise
+ */
+int ec_biscalar_mul(ec_point_t *res,
+                    const digit_t *scalarP,
+                    const digit_t *scalarQ,
+                    const int kbits,
+                    const ec_basis_t *PQ,
+                    const ec_curve_t *curve);
+
+// end point computations
+/**
+ * @}
+ */
+
+/** @defgroup ec_dlog_t Torsion basis computations
+ * @{
+ */
+
+/**
+ * @brief Generate a 2^f-torsion basis from a Montgomery curve along with a hint
+ *
+ * @param PQ2 an ec_basis_t
+ * @param curve an ec_curve_t
+ * @param f an integer
+ *
+ * @return A hint
+ *
+ * The algorithm is deterministic
+ */
+uint8_t ec_curve_to_basis_2f_to_hint(ec_basis_t *PQ2, ec_curve_t *curve, int f);
+
+/**
+ * @brief Generate a 2^f-torsion basis from a Montgomery curve and a given hint
+ *
+ * @param PQ2 an ec_basis_t
+ * @param curve an ec_curve_t
+ * @param f an integer
+ * @param hint the hint
+ *
+ * @return 1 if the basis is valid, 0 otherwise
+ *
+ * The algorithm is deterministic
+ */
+int ec_curve_to_basis_2f_from_hint(ec_basis_t *PQ2, ec_curve_t *curve, int f, const uint8_t hint);
+/** // end basis computations
+ * @}
+ */
+
+/** @defgroup ec_isog_t Isogenies
+ * @{
+ */
+
+/**
+ * @brief Evaluate isogeny of even degree on list of points.
+ * Returns 0 if successful and -1 (0xFFFFFFFF as a uint32_t) if kernel has the wrong order or includes (0:1).
+ *
+ * @param image computed image curve
+ * @param phi isogeny
+ * @param points a list of points to evaluate the isogeny on, modified in place
+ * @param len_points length of the list points
+ *
+ * @return 0 if there was no error, 0xFFFFFFFF otherwise
+ */
+uint32_t ec_eval_even(ec_curve_t *image, ec_isog_even_t *phi, ec_point_t *points, unsigned len_points);
+
+/**
+ * @brief Multiplicative strategy for a short isogeny chain. Returns 1 if successful and -1
+ * if kernel has the wrong order or includes (0:1) when special=false.
+ *
+ * @param curve domain curve, to be overwritten by the codomain curve.
+ * @param kernel a kernel generator of order 2^len
+ * @param len the length of the 2-isogeny chain
+ * @param points a list of points to evaluate the isogeny on, modified in place
+ * @param len_points length of the list points
+ * @param special if true, allow isogenies with (0:1) in the kernel
+ *
+ * @return 0 if there was no error, 0xFFFFFFFF otherwise
+ */
+uint32_t ec_eval_small_chain(ec_curve_t *curve,
+                             const ec_point_t *kernel,
+                             int len,
+                             ec_point_t *points,
+                             unsigned len_points,
+                             bool special);
+
+/**
+ * @brief Recover Y-coordinate from X-coordinate and curve coefficients.
+ *
+ * @param y: a y-coordinate
+ * @param Px: a x-coordinate
+ * @param curve: the elliptic curve
+ *
+ * @return 0xFFFFFFFF if the point was on the curve, 0 otherwise
+ */
+uint32_t ec_recover_y(fp2_t *y, const fp2_t *Px, const ec_curve_t *curve);
+
+// Jacobian point init and copying
+void jac_init(jac_point_t *P);
+void copy_jac_point(jac_point_t *P, const jac_point_t *Q);
+
+/**
+ * @brief Test if two Jacobian points are equal
+ *
+ * @param P: a point
+ * @param Q: a point
+ *
+ * @return 0xFFFFFFFF if they are equal, 0 otherwise
+ */
+uint32_t jac_is_equal(const jac_point_t *P, const jac_point_t *Q);
+
+// Convert from Jacobian to x-only (just drop the Y-coordinate)
+void jac_to_xz(ec_point_t *P, const jac_point_t *xyP);
+// Convert from Jacobian coordinates in Montgomery model to Weierstrass
+void jac_to_ws(jac_point_t *P, fp2_t *t, fp2_t *ao3, const jac_point_t *Q, const ec_curve_t *curve);
+void jac_from_ws(jac_point_t *Q, const jac_point_t *P, const fp2_t *ao3, const ec_curve_t *curve);
+
+// Jacobian arithmetic
+void jac_neg(jac_point_t *Q, const jac_point_t *P);
+void ADD(jac_point_t *R, const jac_point_t *P, const jac_point_t *Q, const ec_curve_t *AC);
+void DBL(jac_point_t *Q, const jac_point_t *P, const ec_curve_t *AC);
+void DBLW(jac_point_t *Q, fp2_t *u, const jac_point_t *P, const fp2_t *t);
+void jac_to_xz_add_components(add_components_t *uvw, const jac_point_t *P, const jac_point_t *Q, const ec_curve_t *AC);
+
+/**
+ * @brief Given a basis in x-only, lift to a pair of Jacobian points
+ *
+ * @param P: a point
+ * @param Q: a point
+ * @param B: a basis
+ * @param E: an elliptic curve
+ *
+ * @return 0xFFFFFFFF if there was no error, 0 otherwise
+ *
+ *
+ * Lifts a basis x(P), x(Q), x(P-Q) assuming the curve has (A/C : 1) and
+ * the point P = (X/Z : 1). For generic implementation see lift_basis()
+ */
+uint32_t lift_basis_normalized(jac_point_t *P, jac_point_t *Q, ec_basis_t *B, ec_curve_t *E);
+
+/**
+ * @brief Given a basis in x-only, lift to a pair of Jacobian points
+ *
+ * @param P: a point
+ * @param Q: a point
+ * @param B: a basis
+ * @param E: an elliptic curve
+ *
+ * @return 0xFFFFFFFF if there was no error, 0 otherwise
+ */
+uint32_t lift_basis(jac_point_t *P, jac_point_t *Q, ec_basis_t *B, ec_curve_t *E);
+
+/**
+ * @brief Check if basis points (P, Q) form a full 4-basis
+ *
+ * @param B: a basis
+ * @param E: an elliptic curve
+ *
+ * @return 0xFFFFFFFF if they form a basis, 0 otherwise
+ */
+uint32_t ec_is_basis_four_torsion(const ec_basis_t *B, const ec_curve_t *E);
+
+/*
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Test functions for printing and order checking, only used in debug mode
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ */
+
+/**
+ * @brief Check if a point (X : Z) has order exactly 2^t
+ *
+ * @param P: a point
+ * @param E: an elliptic curve
+ * @param t: an integer
+ *
+ * @return nonzero (the ec_is_zero result converted to int) if the order is correct, 0 otherwise
+ */
+static int
+test_point_order_twof(const ec_point_t *P, const ec_curve_t *E, int t)
+{
+    // Work on private copies: ec_dbl_iter takes a non-const curve and the point is doubled in place
+    ec_point_t multiple = *P;
+    ec_curve_t scratch;
+    copy_curve(&scratch, E);
+
+    if (ec_is_zero(&multiple)) {
+        return 0; // P is already the point at infinity
+    }
+    ec_dbl_iter(&multiple, t - 1, &multiple, &scratch); // multiple = [2^(t-1)]P
+    if (ec_is_zero(&multiple)) {
+        return 0; // vanished too early, so P doesn't have order 2^t
+    }
+    ec_dbl(&multiple, &multiple, &scratch); // multiple = [2^t]P
+    // The order is exactly 2^t iff this last doubling reaches the point at infinity
+    return ec_is_zero(&multiple);
+}
+
+/**
+ * @brief Check if basis points (P, Q, PmQ) all have order exactly 2^t
+ *
+ * @param B: a basis
+ * @param E: an elliptic curve
+ * @param t: an integer
+ *
+ * @return bitwise AND of the three test_point_order_twof results: nonzero if every order is correct, 0 otherwise
+ */
+static int
+test_basis_order_twof(const ec_basis_t *B, const ec_curve_t *E, int t)
+{
+    int all_ok = test_point_order_twof(&B->P, E, t);
+    all_ok &= test_point_order_twof(&B->Q, E, t);
+    all_ok &= test_point_order_twof(&B->PmQ, E, t);
+
+    return all_ok;
+}
+
+/**
+ * @brief Check if a Jacobian point (X : Y : Z) has order exactly 2^t
+ *
+ * @param P: a point
+ * @param E: an elliptic curve
+ * @param t: an integer
+ *
+ * @return nonzero (the fp2_is_zero result converted to int) if the order is correct, 0 otherwise
+ */
+static int
+test_jac_order_twof(const jac_point_t *P, const ec_curve_t *E, int t)
+{
+    jac_point_t multiple = *P; // private copy, doubled in place below
+
+    if (fp2_is_zero(&multiple.z))
+        return 0; // Z = 0: P is already the identity
+    for (int left = t - 1; left > 0; left--) {
+        DBL(&multiple, &multiple, E);
+    }
+    if (fp2_is_zero(&multiple.z))
+        return 0; // [2^(t-1)]P vanished, so the order is too small
+    DBL(&multiple, &multiple, E);
+    return fp2_is_zero(&multiple.z);
+}
+
+// Prints the affine x-coordinate X/Z of P under the given name, or "INF" when Z is zero
+static void
+ec_point_print(const char *name, ec_point_t P)
+{
+    if (fp2_is_zero(&P.z)) {
+        printf("%s = INF\n", name);
+        return;
+    }
+    fp2_t x_affine;
+    fp2_copy(&x_affine, &P.z);
+    fp2_inv(&x_affine);
+    fp2_mul(&x_affine, &x_affine, &P.x);
+    fp2_print(name, &x_affine);
+}
+
+// Prints the affine Montgomery coefficient A/C of E under the given name
+static void
+ec_curve_print(const char *name, ec_curve_t E)
+{
+    fp2_t A_affine;
+    fp2_copy(&A_affine, &E.C);
+    fp2_inv(&A_affine); // 1/C
+    fp2_mul(&A_affine, &A_affine, &E.A);
+    fp2_print(name, &A_affine);
+}
+
+#endif
+// end isogeny computations
+/**
+ * @}
+ */
+
+// end ec
+/**
+ * @}
+ */
diff --git a/src/pqm4/sqisign_lvl3/ref/ec_jac.c b/src/pqm4/sqisign_lvl3/ref/ec_jac.c
new file mode 100644
index 0000000..20ca68c
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/ec_jac.c
@@ -0,0 +1,335 @@
+#include <assert.h>
+#include <ec.h>
+
+void
+jac_init(jac_point_t *P)
+{ // Set P to the identity element, written (0:1:0) in Jacobian coordinates
+    fp2_set_one(&P->y);
+    fp2_set_zero(&P->z);
+    fp2_set_zero(&P->x);
+}
+
+uint32_t
+jac_is_equal(const jac_point_t *P, const jac_point_t *Q)
+{ // Evaluate if two points in Jacobian coordinates (X:Y:Z) are equal, by cross-multiplying the Z powers
+  // Returns 0xFFFFFFFF (true) if P=Q, 0 (false) otherwise: the result is the AND of two fp2_is_zero masks
+    fp2_t t0, t1, t2, t3;
+
+    fp2_sqr(&t0, &Q->z);
+    fp2_mul(&t2, &P->x, &t0); // x1*z2^2
+    fp2_sqr(&t1, &P->z);
+    fp2_mul(&t3, &Q->x, &t1); // x2*z1^2
+    fp2_sub(&t2, &t2, &t3);   // t2 = x1*z2^2 - x2*z1^2
+
+    fp2_mul(&t0, &t0, &Q->z);
+    fp2_mul(&t0, &P->y, &t0); // y1*z2^3
+    fp2_mul(&t1, &t1, &P->z);
+    fp2_mul(&t1, &Q->y, &t1); // y2*z1^3
+    fp2_sub(&t0, &t0, &t1);   // t0 = y1*z2^3 - y2*z1^3
+
+    return fp2_is_zero(&t0) & fp2_is_zero(&t2);
+}
+
+void
+jac_to_xz(ec_point_t *P, const jac_point_t *xyP)
+{
+    // Jacobian (X:Y:Z) stands for x = X/Z^2, so the x-only projective form is (X : Z^2)
+    fp2_t unit;
+    fp2_set_one(&unit);
+    fp2_copy(&P->x, &xyP->x);
+    fp2_sqr(&P->z, &xyP->z);
+
+    // The identity (0:1:0) maps to (0 : 0) above; replace X by 1 so that P = (1 : 0)
+    uint32_t x_is_zero = fp2_is_zero(&P->x);
+    uint32_t z_is_zero = fp2_is_zero(&P->z);
+    uint32_t is_identity = x_is_zero & z_is_zero;
+
+    fp2_select(&P->x, &P->x, &unit, is_identity);
+}
+
+void
+jac_to_ws(jac_point_t *Q, fp2_t *t, fp2_t *ao3, const jac_point_t *P, const ec_curve_t *curve)
+{
+    // Outputs Q = (U:V:W), *t = T and *ao3 = A/3 as defined below. Cost of 3M + 2S when A != 0.
+    fp_t one;
+    fp2_t a;
+    /* a = 1 - A^2/3, U = X + (A*Z^2)/3, V = Y, W = Z, T = a*Z^4*/
+    fp_set_one(&one);
+    if (!fp2_is_zero(&(curve->A))) {
+        fp_div3(&(ao3->re), &(curve->A.re));
+        fp_div3(&(ao3->im), &(curve->A.im)); // ao3 = A/3, coordinate by coordinate
+        fp2_sqr(t, &P->z);                   // t = Z^2
+        fp2_mul(&Q->x, ao3, t);
+        fp2_add(&Q->x, &Q->x, &P->x);   // U = X + (A/3)*Z^2
+        fp2_sqr(t, t);                  // t = Z^4
+        fp2_mul(&a, ao3, &(curve->A));  // a = A^2/3
+        fp_sub(&(a.re), &one, &(a.re)); // re(a) = 1 - re(A^2/3)
+        fp_neg(&(a.im), &(a.im));       // im(a) = -im(A^2/3), so a = 1 - A^2/3
+        fp2_mul(t, t, &a);              // T = a*Z^4
+    } else {
+        fp2_copy(&Q->x, &P->x); // A == 0: U = X; note that *ao3 is not written on this path
+        fp2_sqr(t, &P->z);
+        fp2_sqr(t, t); // T = Z^4, i.e. a = 1
+    }
+    fp2_copy(&Q->y, &P->y);
+    fp2_copy(&Q->z, &P->z);
+}
+
+void
+jac_from_ws(jac_point_t *Q, const jac_point_t *P, const fp2_t *ao3, const ec_curve_t *curve)
+{
+    // Inverse of jac_to_ws: X = U - (A*W^2)/3, Y = V, Z = W. Cost of 1M + 1S when A != 0.
+    fp2_t t;
+    fp2_copy(&Q->x, &P->x); // X = U when A == 0; without this Q->x stayed unset for Q != P
+    if (!fp2_is_zero(&(curve->A))) {
+        fp2_sqr(&t, &P->z);         // t = W^2
+        fp2_mul(&t, &t, ao3);       // t = (A/3)*W^2, ao3 as produced by jac_to_ws
+        fp2_sub(&Q->x, &Q->x, &t);  // X = U - (A/3)*W^2
+    }
+    fp2_copy(&Q->y, &P->y);
+    fp2_copy(&Q->z, &P->z);
+}
+
+void
+copy_jac_point(jac_point_t *P, const jac_point_t *Q)
+{ // P <- Q, one coordinate at a time
+    fp2_copy(&P->z, &Q->z);
+    fp2_copy(&P->y, &Q->y);
+    fp2_copy(&P->x, &Q->x);
+}
+
+void
+jac_neg(jac_point_t *Q, const jac_point_t *P)
+{ // Q <- -P: only the Y coordinate changes sign
+    fp2_neg(&Q->y, &P->y);
+    fp2_copy(&Q->z, &P->z);
+    fp2_copy(&Q->x, &P->x);
+}
+
+void
+DBL(jac_point_t *Q, const jac_point_t *P, const ec_curve_t *AC)
+{ // Cost of 6M + 6S.
+  // Doubling on a Montgomery curve, representation in Jacobian coordinates (X:Y:Z) corresponding to
+  // (X/Z^2,Y/Z^3) This version receives the coefficient value A
+    fp2_t t0, t1, t2, t3;
+
+    uint32_t flag = fp2_is_zero(&P->x) & fp2_is_zero(&P->z); // all-ones mask when X = Z = 0
+
+    fp2_sqr(&t0, &P->x); // t0 = x1^2
+    fp2_add(&t1, &t0, &t0);
+    fp2_add(&t0, &t0, &t1); // t0 = 3x1^2
+    fp2_sqr(&t1, &P->z);    // t1 = z1^2
+    fp2_mul(&t2, &P->x, &AC->A);
+    fp2_add(&t2, &t2, &t2); // t2 = 2Ax1
+    fp2_add(&t2, &t1, &t2); // t2 = 2Ax1+z1^2
+    fp2_mul(&t2, &t1, &t2); // t2 = z1^2(2Ax1+z1^2)
+    fp2_add(&t2, &t0, &t2); // t2 = alpha = 3x1^2 + z1^2(2Ax1+z1^2)
+    fp2_mul(&Q->z, &P->y, &P->z);
+    fp2_add(&Q->z, &Q->z, &Q->z); // z2 = 2y1z1
+    fp2_sqr(&t0, &Q->z);
+    fp2_mul(&t0, &t0, &AC->A); // t0 = 4Ay1^2z1^2
+    fp2_sqr(&t1, &P->y);
+    fp2_add(&t1, &t1, &t1);     // t1 = 2y1^2
+    fp2_add(&t3, &P->x, &P->x); // t3 = 2x1
+    fp2_mul(&t3, &t1, &t3);     // t3 = 4x1y1^2
+    fp2_sqr(&Q->x, &t2);        // x2 = alpha^2
+    fp2_sub(&Q->x, &Q->x, &t0); // x2 = alpha^2 - 4Ay1^2z1^2
+    fp2_sub(&Q->x, &Q->x, &t3);
+    fp2_sub(&Q->x, &Q->x, &t3); // x2 = alpha^2 - 4Ay1^2z1^2 - 8x1y1^2
+    fp2_sub(&Q->y, &t3, &Q->x); // y2 = 4x1y1^2 - x2
+    fp2_mul(&Q->y, &Q->y, &t2); // y2 = alpha(4x1y1^2 - x2)
+    fp2_sqr(&t1, &t1);          // t1 = 4y1^4
+    fp2_sub(&Q->y, &Q->y, &t1);
+    fp2_sub(&Q->y, &Q->y, &t1); // y2 = alpha(4x1y1^2 - x2) - 8y1^4
+    // flag is already the 0 / 0xFFFFFFFF mask fp2_select requires; negating it would pass 1
+    fp2_select(&Q->x, &Q->x, &P->x, flag);
+    fp2_select(&Q->z, &Q->z, &P->z, flag);
+}
+
+void
+DBLW(jac_point_t *Q, fp2_t *u, const jac_point_t *P, const fp2_t *t)
+{ // Cost of 3M + 5S.
+  // Doubling on a Weierstrass curve, representation in modified Jacobian coordinates
+  // (X:Y:Z:T=a*Z^4) corresponding to (X/Z^2,Y/Z^3), where a is the curve coefficient.
+  // Formula from https://hyperelliptic.org/EFD/g1p/auto-shortw-modified.html
+
+    uint32_t flag = fp2_is_zero(&P->x) & fp2_is_zero(&P->z); // all-ones mask when X = Z = 0
+
+    fp2_t xx, c, cc, r, s, m;
+    // XX = X^2
+    fp2_sqr(&xx, &P->x);
+    // A = 2*Y^2
+    fp2_sqr(&c, &P->y);
+    fp2_add(&c, &c, &c);
+    // AA = A^2
+    fp2_sqr(&cc, &c);
+    // R = 2*AA
+    fp2_add(&r, &cc, &cc);
+    // S = (X+A)^2-XX-AA
+    fp2_add(&s, &P->x, &c);
+    fp2_sqr(&s, &s);
+    fp2_sub(&s, &s, &xx);
+    fp2_sub(&s, &s, &cc);
+    // M = 3*XX+T1
+    fp2_add(&m, &xx, &xx);
+    fp2_add(&m, &m, &xx);
+    fp2_add(&m, &m, t);
+    // X3 = M^2-2*S
+    fp2_sqr(&Q->x, &m);
+    fp2_sub(&Q->x, &Q->x, &s);
+    fp2_sub(&Q->x, &Q->x, &s);
+    // Z3 = 2*Y*Z
+    fp2_mul(&Q->z, &P->y, &P->z);
+    fp2_add(&Q->z, &Q->z, &Q->z);
+    // Y3 = M*(S-X3)-R
+    fp2_sub(&Q->y, &s, &Q->x);
+    fp2_mul(&Q->y, &Q->y, &m);
+    fp2_sub(&Q->y, &Q->y, &r);
+    // T3 = 2*R*T1
+    fp2_mul(u, t, &r);
+    fp2_add(u, u, u);
+    // flag is already the 0 / 0xFFFFFFFF mask fp2_select requires; negating it would pass 1
+    fp2_select(&Q->x, &Q->x, &P->x, flag);
+    fp2_select(&Q->z, &Q->z, &P->z, flag);
+}
+
+void
+select_jac_point(jac_point_t *Q, const jac_point_t *P1, const jac_point_t *P2, const digit_t option)
+{ // Coordinate-wise masked selection between two points
+  // option = 0 gives Q <- P1; option = 0xFF...FF gives Q <- P2
+    fp2_select(&Q->z, &P1->z, &P2->z, option);
+    fp2_select(&Q->y, &P1->y, &P2->y, option);
+    fp2_select(&Q->x, &P1->x, &P2->x, option);
+}
+
+void
+ADD(jac_point_t *R, const jac_point_t *P, const jac_point_t *Q, const ec_curve_t *AC)
+{
+    // Addition on a Montgomery curve, representation in Jacobian coordinates (X:Y:Z) corresponding
+    // to (x,y) = (X/Z^2,Y/Z^3) This version receives the coefficient value A
+    //
+    // Complete routine, to handle all edge cases:
+    //   if ZP == 0:            # P == inf
+    //       return Q
+    //   if ZQ == 0:            # Q == inf
+    //       return P
+    //   dy <- YQ*ZP**3 - YP*ZQ**3
+    //   dx <- XQ*ZP**2 - XP*ZQ**2
+    //   if dx == 0:             # x1 == x2
+    //       if dy == 0:         # ... and y1 == y2: doubling case
+    //           dy <- ZP*ZQ * (3*XP^2 + ZP^2 * (2*A*XP + ZP^2))
+    //           dx <- 2*YP*ZP
+    //       else:              # ... but y1 != y2, thus P = -Q
+    //           return inf
+    //   XR <- dy**2 - dx**2 * (A*ZP^2*ZQ^2 + XP*ZQ^2 + XQ*ZP^2)
+    //   YR <- dy * (XP*ZQ^2 * dx^2 - XR) - YP*ZQ^3 * dx^3
+    //   ZR <- dx * ZP * ZQ
+
+    // Constant time processing:
+    // - The case for P == 0 or Q == 0 is handled at the end with conditional select
+    // - dy and dx are computed for both the normal and doubling cases, we switch when
+    //   dx == dy == 0 for the normal case.
+    // - If we have that P = -Q then dx = 0 and so ZR will be zero, giving us the point
+    //   at infinity for "free".
+    //
+    // These current formulas are expensive and can probably be improved.
+    // NOTE(review): R is written before the final selects read P and Q - confirm no caller aliases R.
+    // Cost 17M + 6S + 13a
+    fp2_t t0, t1, t2, t3, u1, u2, v1, dx, dy;
+
+    /* If P is zero or Q is zero we will conditionally select the other input before returning. */
+    uint32_t ctl1 = fp2_is_zero(&P->z);
+    uint32_t ctl2 = fp2_is_zero(&Q->z);
+
+    /* Precompute some values */
+    fp2_sqr(&t0, &P->z); // t0 = z1^2
+    fp2_sqr(&t1, &Q->z); // t1 = z2^2
+
+    /* Compute dy and dx for ordinary case */
+    fp2_mul(&v1, &t1, &Q->z); // v1 = z2^3
+    fp2_mul(&t2, &t0, &P->z); // t2 = z1^3
+    fp2_mul(&v1, &v1, &P->y); // v1 = y1z2^3
+    fp2_mul(&t2, &t2, &Q->y); // t2 = y2z1^3
+    fp2_sub(&dy, &t2, &v1);   // dy = y2z1^3 - y1z2^3
+    fp2_mul(&u2, &t0, &Q->x); // u2 = x2z1^2
+    fp2_mul(&u1, &t1, &P->x); // u1 = x1z2^2
+    fp2_sub(&dx, &u2, &u1);   // dx = x2z1^2 - x1z2^2
+
+    /* Compute dy and dx for doubling case */
+    fp2_add(&t1, &P->y, &P->y);   // dx_dbl = t1 = 2y1
+    fp2_add(&t2, &AC->A, &AC->A); // t2 = 2A
+    fp2_mul(&t2, &t2, &P->x);     // t2 = 2Ax1
+    fp2_add(&t2, &t2, &t0);       // t2 = 2Ax1 + z1^2
+    fp2_mul(&t2, &t2, &t0);       // t2 = z1^2 * (2Ax1 + z1^2)
+    fp2_sqr(&t0, &P->x);          // t0 = x1^2
+    fp2_add(&t2, &t2, &t0);       // t2 = x1^2 + z1^2 * (2Ax1 + z1^2)
+    fp2_add(&t2, &t2, &t0);       // t2 = 2*x1^2 + z1^2 * (2Ax1 + z1^2)
+    fp2_add(&t2, &t2, &t0);       // t2 = 3*x1^2 + z1^2 * (2Ax1 + z1^2)
+    fp2_mul(&t2, &t2, &Q->z);     // dy_dbl = t2 = z2 * (3*x1^2 + z1^2 * (2Ax1 + z1^2))
+
+    /* If dx is zero and dy is zero, select the doubling variables instead */
+    uint32_t ctl = fp2_is_zero(&dx) & fp2_is_zero(&dy);
+    fp2_select(&dx, &dx, &t1, ctl);
+    fp2_select(&dy, &dy, &t2, ctl);
+
+    /* Some more precomputations */
+    fp2_mul(&t0, &P->z, &Q->z); // t0 = z1z2
+    fp2_sqr(&t1, &t0);          // t1 = (z1z2)^2
+    fp2_sqr(&t2, &dx);          // t2 = dx^2
+    fp2_sqr(&t3, &dy);          // t3 = dy^2
+
+    /* Compute x3 = dy**2 - dx**2 * (A*ZP^2*ZQ^2 + XP*ZQ^2 + XQ*ZP^2) */
+    fp2_mul(&R->x, &AC->A, &t1); // x3 = A*(z1z2)^2
+    fp2_add(&R->x, &R->x, &u1);  // x3 = A*(z1z2)^2 + u1
+    fp2_add(&R->x, &R->x, &u2);  // x3 = A*(z1z2)^2 + u1 + u2
+    fp2_mul(&R->x, &R->x, &t2);  // x3 = dx^2 * (A*(z1z2)^2 + u1 + u2)
+    fp2_sub(&R->x, &t3, &R->x);  // x3 = dy^2 - dx^2 * (A*(z1z2)^2 + u1 + u2)
+
+    /* Compute y3 = dy * (XP*ZQ^2 * dx^2 - XR) - YP*ZQ^3 * dx^3*/
+    fp2_mul(&R->y, &u1, &t2);     // y3 = u1 * dx^2
+    fp2_sub(&R->y, &R->y, &R->x); // y3 = u1 * dx^2 - x3
+    fp2_mul(&R->y, &R->y, &dy);   // y3 = dy * (u1 * dx^2 - x3)
+    fp2_mul(&t3, &t2, &dx);       // t3 = dx^3
+    fp2_mul(&t3, &t3, &v1);       // t3 = v1 * dx^3
+    fp2_sub(&R->y, &R->y, &t3);   // y3 = dy * (u1 * dx^2 - x3) - v1 * dx^3
+
+    /* Compute z3 = dx * z1 * z2 */
+    fp2_mul(&R->z, &dx, &t0);
+
+    /* Finally, we need to set R = P if Q.Z = 0 and R = Q if P.Z = 0 */
+    select_jac_point(R, R, Q, ctl1);
+    select_jac_point(R, R, P, ctl2);
+}
+
+void
+jac_to_xz_add_components(add_components_t *add_comp, const jac_point_t *P, const jac_point_t *Q, const ec_curve_t *AC)
+{
+    // Given two distinct points P and Q on E in Jacobian coordinates, output three components
+    // u, v and w in Fp2 such that the xz coordinates of P+Q are (u-v:w) and of P-Q are (u+v:w)
+
+    fp2_t t0, t1, t2, t3, t4, t5, t6;
+
+    fp2_sqr(&t0, &P->z);             // t0 = z1^2
+    fp2_sqr(&t1, &Q->z);             // t1 = z2^2
+    fp2_mul(&t2, &P->x, &t1);        // t2 = x1z2^2
+    fp2_mul(&t3, &t0, &Q->x);        // t3 = z1^2x2
+    fp2_mul(&t4, &P->y, &Q->z);      // t4 = y1z2
+    fp2_mul(&t4, &t4, &t1);          // t4 = y1z2^3
+    fp2_mul(&t5, &P->z, &Q->y);      // t5 = z1y2
+    fp2_mul(&t5, &t5, &t0);          // t5 = z1^3y2
+    fp2_mul(&t0, &t0, &t1);          // t0 = (z1z2)^2
+    fp2_mul(&t6, &t4, &t5);          // t6 = (z1z2)^3y1y2
+    fp2_add(&add_comp->v, &t6, &t6); // v  = 2(z1z2)^3y1y2
+    fp2_sqr(&t4, &t4);               // t4 = y1^2z2^6
+    fp2_sqr(&t5, &t5);               // t5 = z1^6y2^2
+    fp2_add(&t4, &t4, &t5);          // t4 = z1^6y2^2 + y1^2z2^6
+    fp2_add(&t5, &t2, &t3);          // t5 = x1z2^2 + z1^2x2
+    fp2_add(&t6, &t3, &t3);          // t6 = 2z1^2x2
+    fp2_sub(&t6, &t5, &t6);          // t6 = lambda = x1z2^2 - z1^2x2
+    fp2_sqr(&t6, &t6);               // t6 = lambda^2 = (x1z2^2 - z1^2x2)^2
+    fp2_mul(&t1, &AC->A, &t0);       // t1 = A*(z1z2)^2
+    fp2_add(&t1, &t5, &t1);          // t1 = gamma = A*(z1z2)^2 + x1z2^2 + z1^2x2
+    fp2_mul(&t1, &t1, &t6);          // t1 = gamma*lambda^2
+    fp2_sub(&add_comp->u, &t4, &t1); // u  = z1^6y2^2 + y1^2z2^6 - gamma*lambda^2
+    fp2_mul(&add_comp->w, &t6, &t0); // w  = (z1z2)^2(lambda)^2
+}
diff --git a/src/pqm4/sqisign_lvl3/ref/ec_params.c b/src/pqm4/sqisign_lvl3/ref/ec_params.c
new file mode 100644
index 0000000..ae214aa
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/ec_params.c
@@ -0,0 +1,4 @@
+#include <ec_params.h>
+// Cofactor of p+1 after removing the power of 2: (p + 1) / 2^376 = 65
+const digit_t p_cofactor_for_2f[1] = {65};
+
diff --git a/src/pqm4/sqisign_lvl3/ref/ec_params.h b/src/pqm4/sqisign_lvl3/ref/ec_params.h
new file mode 100644
index 0000000..941abd5
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/ec_params.h
@@ -0,0 +1,12 @@
+#ifndef EC_PARAMS_H
+#define EC_PARAMS_H
+
+#include <fp.h>
+
+#define TORSION_EVEN_POWER 376
+
+// Cofactor of p+1 after removing the power of 2: (p + 1) / 2^TORSION_EVEN_POWER
+extern const digit_t p_cofactor_for_2f[1];
+#define P_COFACTOR_FOR_2F_BITLENGTH 7
+
+#endif
diff --git a/src/pqm4/sqisign_lvl3/ref/encode_verification.c b/src/pqm4/sqisign_lvl3/ref/encode_verification.c
new file mode 100644
index 0000000..fecdb9c
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/encode_verification.c
@@ -0,0 +1,220 @@
+#include <verification.h>
+#include <string.h>
+#include <tutil.h>
+#include <fp2.h>
+#include <encoded_sizes.h>
+#include <assert.h>
+
+typedef unsigned char byte_t;
+
+// digits
+
+static void
+encode_digits(byte_t *enc, const digit_t *x, size_t nbytes)
+{
+#ifdef TARGET_BIG_ENDIAN
+    // Byte-swap each digit and emit it with memcpy: enc is a plain byte pointer with no
+    // alignment guarantee, so storing a digit_t through a cast of it is undefined behaviour.
+    // The last chunk may be partial; only its first `len` swapped bytes are written.
+    for (size_t i = 0; i * sizeof(digit_t) < nbytes; i++) {
+        const size_t off = i * sizeof(digit_t);
+        const size_t len = nbytes - off < sizeof(digit_t) ? nbytes - off : sizeof(digit_t);
+        const digit_t swapped = BSWAP_DIGIT(x[i]);
+        memcpy(enc + off, &swapped, len);
+    }
+#else
+    memcpy(enc, (const byte_t *)x, nbytes);
+#endif
+}
+
+static void
+decode_digits(digit_t *x, const byte_t *enc, size_t nbytes, size_t ndigits)
+{
+    assert(nbytes <= ndigits * sizeof(digit_t)); // otherwise the memset length below would wrap around
+    memcpy((byte_t *)x, enc, nbytes);
+    memset((byte_t *)x + nbytes, 0, ndigits * sizeof(digit_t) - nbytes); // zero the bytes not covered by enc
+
+#ifdef TARGET_BIG_ENDIAN
+    for (size_t i = 0; i < ndigits; i++)
+        x[i] = BSWAP_DIGIT(x[i]);
+#endif
+}
+
+// fp2_t
+
+static byte_t *
+fp2_to_bytes(byte_t *enc, const fp2_t *x)
+{ // Write x at enc and hand back the position just past it
+    fp2_encode(enc, x);
+    return &enc[FP2_ENCODED_BYTES];
+}
+
+static const byte_t *
+fp2_from_bytes(fp2_t *x, const byte_t *enc)
+{ // Reads one fp2 element from enc into x and returns the position just past it
+    fp2_decode(x, enc); // NOTE(review): fp2_decode's uint32_t result is discarded here - confirm this is intended
+    return enc + FP2_ENCODED_BYTES;
+}
+
+// curves and points
+
+static byte_t *
+proj_to_bytes(byte_t *enc, const fp2_t *x, const fp2_t *z)
+{ // Serialise the projective value (x : z) as the single affine element x/z
+    assert(!fp2_is_zero(z));
+    fp2_t z_inv = *z;
+    fp2_inv(&z_inv);
+#ifndef NDEBUG
+    { // Debug-only sanity check that z * z_inv == 1
+        fp2_t unit, product;
+        fp2_set_one(&unit);
+        fp2_mul(&product, z, &z_inv);
+        assert(fp2_is_equal(&product, &unit));
+    }
+#endif
+
+    fp2_t affine;
+    fp2_mul(&affine, x, &z_inv);
+    return fp2_to_bytes(enc, &affine);
+}
+
+static const byte_t *
+proj_from_bytes(fp2_t *x, fp2_t *z, const byte_t *enc)
+{ // Read one element into x and set the denominator z to 1
+    const byte_t *next = fp2_from_bytes(x, enc);
+    fp2_set_one(z);
+    return next;
+}
+
+static byte_t *
+ec_curve_to_bytes(byte_t *enc, const ec_curve_t *curve)
+{ // Encodes the curve by its coefficient pair (A : C) via proj_to_bytes; returns the advanced pointer
+    return proj_to_bytes(enc, &curve->A, &curve->C);
+}
+
+static const byte_t *
+ec_curve_from_bytes(ec_curve_t *curve, const byte_t *enc)
+{ // Zeroes the whole curve struct, then fills A from enc and sets C to 1 (see proj_from_bytes)
+    memset(curve, 0, sizeof(*curve)); // every other field of *curve starts out as zero bytes
+    return proj_from_bytes(&curve->A, &curve->C, enc);
+}
+
+static byte_t *
+ec_point_to_bytes(byte_t *enc, const ec_point_t *point)
+{ // Encodes the point (x : z) via proj_to_bytes, which asserts z != 0; returns the advanced pointer
+    return proj_to_bytes(enc, &point->x, &point->z);
+}
+
+static const byte_t *
+ec_point_from_bytes(ec_point_t *point, const byte_t *enc)
+{ // Decodes x from enc and sets z to 1 (see proj_from_bytes); returns the advanced pointer
+    return proj_from_bytes(&point->x, &point->z, enc);
+}
+
+static byte_t *
+ec_basis_to_bytes(byte_t *enc, const ec_basis_t *basis)
+{ // Encodes the three basis points back to back, in the order P, Q, PmQ
+    enc = ec_point_to_bytes(enc, &basis->P);
+    enc = ec_point_to_bytes(enc, &basis->Q);
+    enc = ec_point_to_bytes(enc, &basis->PmQ);
+    return enc; // position just past the last encoded point
+}
+
+static const byte_t *
+ec_basis_from_bytes(ec_basis_t *basis, const byte_t *enc)
+{ // Decodes three points back to back, in the order P, Q, PmQ
+    enc = ec_point_from_bytes(&basis->P, enc);
+    enc = ec_point_from_bytes(&basis->Q, enc);
+    enc = ec_point_from_bytes(&basis->PmQ, enc);
+    return enc; // position just past the last decoded point
+}
+
+// public API
+
+byte_t *
+public_key_to_bytes(byte_t *enc, const public_key_t *pk)
+{
+#ifndef NDEBUG
+    const byte_t *const start = enc; // only needed by the length assertion below
+#endif
+    enc = ec_curve_to_bytes(enc, &pk->curve); // encoded curve first
+    *enc++ = pk->hint_pk;                     // followed by one hint byte
+    assert(enc - start == PUBLICKEY_BYTES);
+    return enc; // position just past the encoded public key
+}
+
+const byte_t *
+public_key_from_bytes(public_key_t *pk, const byte_t *enc)
+{
+#ifndef NDEBUG
+    const byte_t *const start = enc; // only needed by the length assertion below
+#endif
+    enc = ec_curve_from_bytes(&pk->curve, enc); // encoded curve first
+    pk->hint_pk = *enc++;                       // followed by one hint byte
+    assert(enc - start == PUBLICKEY_BYTES);
+    return enc; // position just past the consumed public key bytes
+}
+
+void
+signature_to_bytes(byte_t *enc, const signature_t *sig)
+{
+    // Layout: E_aux_A | backtracking | two_resp_length | 2x2 matrix (row-major) |
+    //         chall_coeff | hint_aux | hint_chall
+#ifndef NDEBUG
+    byte_t *const start = enc;
+#endif
+    const size_t mat_entry_bytes = (SQIsign_response_length + 9) / 8;
+    const size_t chall_bytes = SECURITY_BITS / 8;
+
+    enc = fp2_to_bytes(enc, &sig->E_aux_A);
+    *enc++ = sig->backtracking;
+    *enc++ = sig->two_resp_length;
+
+    // Matrix entries are written in the order [0][0], [0][1], [1][0], [1][1]
+    for (int row = 0; row < 2; row++) {
+        for (int col = 0; col < 2; col++) {
+            encode_digits(enc, sig->mat_Bchall_can_to_B_chall[row][col], mat_entry_bytes);
+            enc += mat_entry_bytes;
+        }
+    }
+
+    encode_digits(enc, sig->chall_coeff, chall_bytes);
+    enc += chall_bytes;
+
+    *enc++ = sig->hint_aux;
+    *enc++ = sig->hint_chall;
+
+    assert(enc - start == SIGNATURE_BYTES);
+}
+
+void
+signature_from_bytes(signature_t *sig, const byte_t *enc)
+{
+    // Field order mirrors signature_to_bytes: E_aux_A | backtracking | two_resp_length |
+    //         2x2 matrix (row-major) | chall_coeff | hint_aux | hint_chall
+#ifndef NDEBUG
+    const byte_t *const start = enc;
+#endif
+    const size_t mat_entry_bytes = (SQIsign_response_length + 9) / 8;
+    const size_t chall_bytes = SECURITY_BITS / 8;
+
+    enc = fp2_from_bytes(&sig->E_aux_A, enc);
+    sig->backtracking = *enc++;
+    sig->two_resp_length = *enc++;
+
+    // Matrix entries are read in the order [0][0], [0][1], [1][0], [1][1]
+    for (int row = 0; row < 2; row++) {
+        for (int col = 0; col < 2; col++) {
+            decode_digits(sig->mat_Bchall_can_to_B_chall[row][col], enc, mat_entry_bytes, NWORDS_ORDER);
+            enc += mat_entry_bytes;
+        }
+    }
+
+    decode_digits(sig->chall_coeff, enc, chall_bytes, NWORDS_ORDER);
+    enc += chall_bytes;
+
+    sig->hint_aux = *enc++;
+    sig->hint_chall = *enc++;
+
+    assert(enc - start == SIGNATURE_BYTES);
+}
diff --git a/src/pqm4/sqisign_lvl3/ref/encoded_sizes.h b/src/pqm4/sqisign_lvl3/ref/encoded_sizes.h
new file mode 100644
index 0000000..50a8781
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/encoded_sizes.h
@@ -0,0 +1,11 @@
+#define SECURITY_BITS 192
+#define SQIsign_response_length 192
+#define HASH_ITERATIONS 256
+#define FP_ENCODED_BYTES 48
+#define FP2_ENCODED_BYTES 96
+#define EC_CURVE_ENCODED_BYTES 96
+#define EC_POINT_ENCODED_BYTES 96
+#define EC_BASIS_ENCODED_BYTES 288
+#define PUBLICKEY_BYTES 97
+#define SECRETKEY_BYTES 529
+#define SIGNATURE_BYTES 224
diff --git a/src/pqm4/sqisign_lvl3/ref/fp.c b/src/pqm4/sqisign_lvl3/ref/fp.c
new file mode 100644
index 0000000..48e2937
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/fp.c
@@ -0,0 +1,15 @@
+#include <fp.h>
+
+/*
+ * Masked choice written to *d: a0 when ctl == 0x00000000, a1 when
+ * ctl == 0xFFFFFFFF. ctl MUST be one of those two values.
+ */
+void
+fp_select(fp_t *d, const fp_t *a0, const fp_t *a1, uint32_t ctl)
+{
+    const digit_t mask = (int32_t)ctl; // via int32_t so the mask is sign-extended if digit_t is wider
+    for (unsigned int k = 0; k < NWORDS_FIELD; k++) {
+        const digit_t diff = (*a0)[k] ^ (*a1)[k];
+        (*d)[k] = (*a0)[k] ^ (mask & diff);
+    }
+}
diff --git a/src/pqm4/sqisign_lvl3/ref/fp.h b/src/pqm4/sqisign_lvl3/ref/fp.h
new file mode 100644
index 0000000..1241d58
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/fp.h
@@ -0,0 +1,48 @@
+#ifndef FP_H
+#define FP_H
+
+//////////////////////////////////////////////// NOTE: this is placed here for now
+#include <sqisign_namespace.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
+#include <tutil.h>
+#include <fp_constants.h>
+
+typedef digit_t fp_t[NWORDS_FIELD]; // Datatype for representing field elements
+
+extern const digit_t ONE[NWORDS_FIELD];
+extern const digit_t ZERO[NWORDS_FIELD];
+// extern const digit_t PM1O3[NWORDS_FIELD];
+
+void fp_set_small(fp_t *x, const digit_t val);
+void fp_mul_small(fp_t *x, const fp_t *a, const uint32_t val);
+void fp_set_zero(fp_t *x);
+void fp_set_one(fp_t *x);
+uint32_t fp_is_equal(const fp_t *a, const fp_t *b);
+uint32_t fp_is_zero(const fp_t *a);
+void fp_copy(fp_t *out, const fp_t *a);
+
+void fp_encode(void *dst, const fp_t *a);
+void fp_decode_reduce(fp_t *d, const void *src, size_t len);
+uint32_t fp_decode(fp_t *d, const void *src);
+
+void fp_select(fp_t *d, const fp_t *a0, const fp_t *a1, uint32_t ctl);
+void fp_cswap(fp_t *a, fp_t *b, uint32_t ctl);
+
+void fp_add(fp_t *out, const fp_t *a, const fp_t *b);
+void fp_sub(fp_t *out, const fp_t *a, const fp_t *b);
+void fp_neg(fp_t *out, const fp_t *a);
+void fp_sqr(fp_t *out, const fp_t *a);
+void fp_mul(fp_t *out, const fp_t *a, const fp_t *b);
+
+void fp_inv(fp_t *x);
+uint32_t fp_is_square(const fp_t *a);
+void fp_sqrt(fp_t *a);
+void fp_half(fp_t *out, const fp_t *a);
+void fp_exp3div4(fp_t *out, const fp_t *a);
+void fp_div3(fp_t *out, const fp_t *a);
+
+#endif
diff --git a/src/pqm4/sqisign_lvl3/ref/fp2.c b/src/pqm4/sqisign_lvl3/ref/fp2.c
new file mode 100644
index 0000000..a258952
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/fp2.c
@@ -0,0 +1,328 @@
+#include <inttypes.h>
+#include <encoded_sizes.h>
+#include <fp2.h>
+
+/* Arithmetic modulo X^2 + 1 */
+
+void
+fp2_set_small(fp2_t *x, const digit_t val)
+{ // x <- val + 0*i
+    fp_set_zero(&x->im);
+    fp_set_small(&x->re, val);
+}
+
+void
+fp2_mul_small(fp2_t *x, const fp2_t *y, uint32_t n)
+{ // x <- n*y, scaling each coordinate separately
+    fp_mul_small(&(x->im), &(y->im), n);
+    fp_mul_small(&(x->re), &(y->re), n);
+}
+
+void
+fp2_set_one(fp2_t *x)
+{ // x <- 1 + 0*i
+    fp_set_zero(&x->im);
+    fp_set_one(&x->re);
+}
+
+void
+fp2_set_zero(fp2_t *x)
+{ // x <- 0 + 0*i
+    fp_set_zero(&x->im);
+    fp_set_zero(&x->re);
+}
+
+// Zero test in GF(p^2): both coordinates have to be zero
+// Returns 0xFF...FF (true) if a=0, 0 (false) otherwise
+uint32_t
+fp2_is_zero(const fp2_t *a)
+{
+    return fp_is_zero(&a->im) & fp_is_zero(&a->re);
+}
+
+// Constant-time equality test in GF(p^2): both coordinates have to match
+// Returns 0xFF...FF (true) if a=b, 0 (false) otherwise
+uint32_t
+fp2_is_equal(const fp2_t *a, const fp2_t *b)
+{
+    return fp_is_equal(&a->im, &b->im) & fp_is_equal(&a->re, &b->re);
+}
+
+// Test for the element 1 of GF(p^2): real part equal to ONE, imaginary part zero
+// Returns 0xFF...FF (true) if a=1, 0 (false) otherwise
+uint32_t
+fp2_is_one(const fp2_t *a)
+{
+    return fp_is_zero(&a->im) & fp_is_equal(&a->re, &ONE);
+}
+
+void
+fp2_copy(fp2_t *x, const fp2_t *y)
+{ // x <- y
+    fp_copy(&x->im, &y->im);
+    fp_copy(&x->re, &y->re);
+}
+
+void
+fp2_add(fp2_t *x, const fp2_t *y, const fp2_t *z)
+{ // x <- y + z, coordinate by coordinate
+    fp_add(&x->im, &y->im, &z->im);
+    fp_add(&x->re, &y->re, &z->re);
+}
+
+void
+fp2_add_one(fp2_t *x, const fp2_t *y)
+{ // x <- y + 1: only the real part changes
+    fp_copy(&(x->im), &(y->im));
+    fp_add(&(x->re), &(y->re), &ONE);
+}
+
+void
+fp2_sub(fp2_t *x, const fp2_t *y, const fp2_t *z)
+{ // x <- y - z, coordinate by coordinate
+    fp_sub(&x->im, &y->im, &z->im);
+    fp_sub(&x->re, &y->re, &z->re);
+}
+
+void
+fp2_neg(fp2_t *x, const fp2_t *y)
+{ // x <- -y, coordinate by coordinate
+    fp_neg(&x->im, &y->im);
+    fp_neg(&x->re, &y->re);
+}
+
+void
+fp2_mul(fp2_t *x, const fp2_t *y, const fp2_t *z)
+{ // x <- y*z modulo X^2 + 1, using three fp_mul calls
+    fp_t t0, t1;
+
+    fp_add(&t0, &(y->re), &(y->im));
+    fp_add(&t1, &(z->re), &(z->im));
+    fp_mul(&t0, &t0, &t1);                // t0 = (y.re + y.im)*(z.re + z.im)
+    fp_mul(&t1, &(y->im), &(z->im));      // t1 = y.im*z.im
+    fp_mul(&(x->re), &(y->re), &(z->re)); // x.re = y.re*z.re (x is first written here)
+    fp_sub(&(x->im), &t0, &t1);
+    fp_sub(&(x->im), &(x->im), &(x->re)); // x.im = y.re*z.im + y.im*z.re
+    fp_sub(&(x->re), &(x->re), &t1);      // x.re = y.re*z.re - y.im*z.im
+}
+
+void
+fp2_sqr(fp2_t *x, const fp2_t *y)
+{ // x <- y^2 modulo X^2 + 1, using two fp_mul calls
+    fp_t sum, diff;
+
+    fp_add(&sum, &(y->re), &(y->im));     // sum = y.re + y.im
+    fp_sub(&diff, &(y->re), &(y->im));    // diff = y.re - y.im
+    fp_mul(&(x->im), &(y->re), &(y->im));
+    fp_add(&(x->im), &(x->im), &(x->im)); // x.im = 2*y.re*y.im
+    fp_mul(&(x->re), &sum, &diff);        // x.re = (y.re + y.im)*(y.re - y.im) = y.re^2 - y.im^2
+}
+
+void
+fp2_inv(fp2_t *x)
+{ // In-place inversion: 1/(re + i*im) = (re - i*im) / (re^2 + im^2)
+    fp_t t0, t1;
+
+    fp_sqr(&t0, &(x->re));
+    fp_sqr(&t1, &(x->im));
+    fp_add(&t0, &t0, &t1); // t0 = re^2 + im^2
+    fp_inv(&t0);           // t0 <- fp_inv(t0), a single base-field inversion
+    fp_mul(&(x->re), &(x->re), &t0);
+    fp_mul(&(x->im), &(x->im), &t0);
+    fp_neg(&(x->im), &(x->im)); // the imaginary part changes sign
+}
+
+uint32_t
+fp2_is_square(const fp2_t *x)
+{ // Returns fp_is_square applied to re^2 + im^2
+    fp_t t0, t1;
+
+    fp_sqr(&t0, &(x->re));
+    fp_sqr(&t1, &(x->im));
+    fp_add(&t0, &t0, &t1); // t0 = re^2 + im^2
+
+    return fp_is_square(&t0);
+}
+
+void
+fp2_sqrt(fp2_t *a)
+{
+    fp_t x0, x1, t0, t1;
+
+    /* From "Optimized One-Dimensional SQIsign Verification on Intel and
+     * Cortex-M4" by Aardal et al: https://eprint.iacr.org/2024/1563 */
+
+    // x0 = \delta = sqrt(a0^2 + a1^2).
+    fp_sqr(&x0, &(a->re));
+    fp_sqr(&x1, &(a->im));
+    fp_add(&x0, &x0, &x1);
+    fp_sqrt(&x0);
+    // If a1 = 0, there is a risk of \delta = -a0, which makes x0 = 0 below.
+    // In that case, we restore the value \delta = a0.
+    fp_select(&x0, &x0, &(a->re), fp_is_zero(&(a->im)));
+    // x0 = \delta + a0, t0 = 2 * x0.
+    fp_add(&x0, &x0, &(a->re));
+    fp_add(&t0, &x0, &x0);
+
+    // x1 = t0^((p-3)/4)
+    fp_exp3div4(&x1, &t0);
+
+    // x0 = x0 * x1, x1 = x1 * a1, t1 = (2x0)^2.
+    fp_mul(&x0, &x0, &x1);
+    fp_mul(&x1, &x1, &(a->im));
+    fp_add(&t1, &x0, &x0);
+    fp_sqr(&t1, &t1);
+    // If t1 = t0, return x0 + x1*i, otherwise x1 - x0*i.
+    fp_sub(&t0, &t0, &t1);
+    uint32_t f = fp_is_zero(&t0);
+    fp_neg(&t1, &x0);
+    fp_copy(&t0, &x1);
+    fp_select(&t0, &t0, &x0, f);
+    fp_select(&t1, &t1, &x1, f);
+
+    // Check if t0 is zero
+    uint32_t t0_is_zero = fp_is_zero(&t0);
+
+    // Check whether t0, t1 are odd
+    // Note: we encode to ensure canonical representation
+    uint8_t tmp_bytes[FP_ENCODED_BYTES];
+    fp_encode(tmp_bytes, &t0);
+    uint32_t t0_is_odd = -((uint32_t)tmp_bytes[0] & 1);
+    fp_encode(tmp_bytes, &t1);
+    uint32_t t1_is_odd = -((uint32_t)tmp_bytes[0] & 1);
+
+    // We negate the output if:
+    // t0 is odd, or
+    // t0 is zero and t1 is odd
+    uint32_t negate_output = t0_is_odd | (t0_is_zero & t1_is_odd);
+    fp_neg(&x0, &t0);
+    fp_select(&(a->re), &t0, &x0, negate_output);
+    fp_neg(&x0, &t1);
+    fp_select(&(a->im), &t1, &x0, negate_output);
+}
+
+uint32_t
+fp2_sqrt_verify(fp2_t *a)
+{ // Replace a by fp2_sqrt(a), then report whether squaring the result gives back the input
+    fp2_t input, resquared;
+
+    fp2_copy(&input, a);
+    fp2_sqrt(a);
+    fp2_sqr(&resquared, a);
+    uint32_t matches = fp2_is_equal(&input, &resquared);
+    return matches;
+}
+
+void
+fp2_half(fp2_t *x, const fp2_t *y)
+{ // x <- y/2, halving each coordinate separately
+    fp_half(&x->im, &y->im);
+    fp_half(&x->re, &y->re);
+}
+
+void
+fp2_batched_inv(fp2_t *x, int len)
+{ // In-place inversion of x[0..len-1] (written x0,...,xn below) with a single fp2_inv call
+    if (len <= 0) // nothing to invert; also keeps the arrays below from getting a non-positive size
+        return;
+    fp2_t t1[len], t2[len], inverse;
+
+    // t1 = x0, x0*x1, ... ,x0 * x1 * ... * xn
+    fp2_copy(&t1[0], &x[0]);
+    for (int i = 1; i < len; i++) {
+        fp2_mul(&t1[i], &t1[i - 1], &x[i]);
+    }
+
+    // inverse = 1/ (x0 * x1 * ... * xn)
+    fp2_copy(&inverse, &t1[len - 1]);
+    fp2_inv(&inverse);
+
+    fp2_copy(&t2[0], &inverse);
+    // t2 = 1/ (x0 * x1 * ... * xn), 1/ (x0 * x1 * ... * x(n-1)) , ... , 1/x0
+    for (int i = 1; i < len; i++) {
+        fp2_mul(&t2[i], &t2[i - 1], &x[len - i]);
+    }
+
+    fp2_copy(&x[0], &t2[len - 1]);
+    // x[i] = (x0 * ... * x(i-1)) * 1/(x0 * ... * xi) = 1/xi
+    for (int i = 1; i < len; i++) {
+        fp2_mul(&x[i], &t1[i - 1], &t2[len - i - 1]);
+    }
+}
+
+// Square-and-multiply exponentiation, scanning exp from its least significant bit
+// Warning!! Not constant time!
+void
+fp2_pow_vartime(fp2_t *out, const fp2_t *x, const digit_t *exp, const int size)
+{
+    fp2_t base_pow; // x raised to the power of two matching the current bit position
+
+    fp2_copy(&base_pow, x);
+    fp2_set_one(out);
+
+    // Walk the exponent word by word, starting from exp[0]
+    for (int w = 0; w < size; w++) {
+        digit_t word = exp[w];
+        // Consume RADIX bits of the current word, lowest bit first
+        for (int b = 0; b < RADIX; b++) {
+            if ((word & 1) == 1) {
+                fp2_mul(out, out, &base_pow);
+            }
+            fp2_sqr(&base_pow, &base_pow);
+            word >>= 1;
+        }
+    }
+}
+
+void
+fp2_print(const char *name, const fp2_t *a)
+{
+    uint8_t bytes[FP_ENCODED_BYTES];
+
+    // Real part: encode first (canonical representation), then print from the last byte down
+    printf("%s0x", name);
+    fp_encode(bytes, &a->re);
+    for (int k = FP_ENCODED_BYTES - 1; k >= 0; k--) {
+        printf("%02x", bytes[k]);
+    }
+
+    // Imaginary part, printed the same way
+    printf(" + i*0x");
+    fp_encode(bytes, &a->im);
+    for (int k = FP_ENCODED_BYTES - 1; k >= 0; k--) {
+        printf("%02x", bytes[k]);
+    }
+    printf("\n");
+}
+
+void
+fp2_encode(void *dst, const fp2_t *a)
+{ // Layout: re at offset 0, im at offset FP_ENCODED_BYTES
+    uint8_t *out = dst;
+    fp_encode(&out[0], &a->re);
+    fp_encode(&out[FP_ENCODED_BYTES], &a->im);
+}
+
+uint32_t
+fp2_decode(fp2_t *d, const void *src)
+{ // Decode re from offset 0 and im from offset FP_ENCODED_BYTES; AND the two fp_decode results
+    const uint8_t *bytes = src;
+
+    uint32_t ok_re = fp_decode(&d->re, &bytes[0]);
+    uint32_t ok_im = fp_decode(&d->im, &bytes[FP_ENCODED_BYTES]);
+
+    return ok_re & ok_im;
+}
+
+void
+fp2_select(fp2_t *d, const fp2_t *a0, const fp2_t *a1, uint32_t ctl)
+{ // d <- a0 if ctl == 0, a1 if ctl == 0xFFFFFFFF; applied to each coordinate with fp_select
+    fp_select(&d->im, &a0->im, &a1->im, ctl);
+    fp_select(&d->re, &a0->re, &a1->re, ctl);
+}
+
+void
+fp2_cswap(fp2_t *a, fp2_t *b, uint32_t ctl)
+{ // Conditional swap of a and b, applied to each coordinate with fp_cswap
+    fp_cswap(&a->im, &b->im, ctl);
+    fp_cswap(&a->re, &b->re, ctl);
+}
diff --git a/src/pqm4/sqisign_lvl3/ref/fp2.h b/src/pqm4/sqisign_lvl3/ref/fp2.h
new file mode 100644
index 0000000..00e673b
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/fp2.h
@@ -0,0 +1,41 @@
+#ifndef FP2_H
+#define FP2_H
+
+#include <sqisign_namespace.h>
+#include "fp.h"
+#include <stdio.h>
+
+// Structure for representing elements in GF(p^2)
+typedef struct fp2_t
+{
+    fp_t re, im;
+} fp2_t;
+
+void fp2_set_small(fp2_t *x, const digit_t val);
+void fp2_mul_small(fp2_t *x, const fp2_t *y, uint32_t n);
+void fp2_set_one(fp2_t *x);
+void fp2_set_zero(fp2_t *x);
+uint32_t fp2_is_zero(const fp2_t *a);
+uint32_t fp2_is_equal(const fp2_t *a, const fp2_t *b);
+uint32_t fp2_is_one(const fp2_t *a);
+void fp2_copy(fp2_t *x, const fp2_t *y);
+void fp2_add(fp2_t *x, const fp2_t *y, const fp2_t *z);
+void fp2_add_one(fp2_t *x, const fp2_t *y);
+void fp2_sub(fp2_t *x, const fp2_t *y, const fp2_t *z);
+void fp2_neg(fp2_t *x, const fp2_t *y);
+void fp2_mul(fp2_t *x, const fp2_t *y, const fp2_t *z);
+void fp2_sqr(fp2_t *x, const fp2_t *y);
+void fp2_inv(fp2_t *x);
+uint32_t fp2_is_square(const fp2_t *x);
+void fp2_sqrt(fp2_t *x);
+uint32_t fp2_sqrt_verify(fp2_t *a);
+void fp2_half(fp2_t *x, const fp2_t *y);
+void fp2_batched_inv(fp2_t *x, int len);
+void fp2_pow_vartime(fp2_t *out, const fp2_t *x, const digit_t *exp, const int size);
+void fp2_print(const char *name, const fp2_t *a);
+void fp2_encode(void *dst, const fp2_t *a);
+uint32_t fp2_decode(fp2_t *d, const void *src);
+void fp2_select(fp2_t *d, const fp2_t *a0, const fp2_t *a1, uint32_t ctl);
+void fp2_cswap(fp2_t *a, fp2_t *b, uint32_t ctl);
+
+#endif
diff --git a/src/pqm4/sqisign_lvl3/ref/fp_constants.h b/src/pqm4/sqisign_lvl3/ref/fp_constants.h
new file mode 100644
index 0000000..063579a
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/fp_constants.h
@@ -0,0 +1,17 @@
+#if RADIX == 32
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+#define NWORDS_FIELD 12
+#else
+#define NWORDS_FIELD 14
+#endif
+#define NWORDS_ORDER 12
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+#define NWORDS_FIELD 6
+#else
+#define NWORDS_FIELD 7
+#endif
+#define NWORDS_ORDER 6
+#endif
+#define BITS 384
+#define LOG2P 9
diff --git a/src/pqm4/sqisign_lvl3/ref/fp_p65376_32.c b/src/pqm4/sqisign_lvl3/ref/fp_p65376_32.c
new file mode 100644
index 0000000..1483461
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/fp_p65376_32.c
@@ -0,0 +1,1231 @@
+// clang-format off
+// Command line : python monty.py 32
+// 0x40ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+
+#include <stdint.h>
+#include <stdio.h>
+
+#define sspint int32_t
+#define spint uint32_t
+#define udpint uint64_t
+#define dpint uint64_t
+
+#define Wordlength 32
+#define Nlimbs 14
+#define Radix 28
+#define Nbits 383
+#define Nbytes 48
+
+#define MONTGOMERY
+// propagate carries
+inline static spint prop(spint *n) {
+  int i;
+  spint mask = ((spint)1 << 28u) - (spint)1;
+  sspint carry = (sspint)n[0];
+  carry >>= 28u;
+  n[0] &= mask;
+  for (i = 1; i < 13; i++) {
+    carry += (sspint)n[i];
+    n[i] = (spint)carry & mask;
+    carry >>= 28u;
+  }
+  n[13] += (spint)carry;
+  return -((n[13] >> 1) >> 30u);
+}
+
+// propagate carries and add p if negative, propagate carries again
+inline static int flatten(spint *n) {
+  spint carry = prop(n);
+  n[0] -= (spint)1u & carry;
+  n[13] += ((spint)0x41000u) & carry;
+  (void)prop(n);
+  return (int)(carry & 1);
+}
+
+// Montgomery final subtract
+static int modfsb(spint *n) {
+  n[0] += (spint)1u;
+  n[13] -= (spint)0x41000u;
+  return flatten(n);
+}
+
+// Modular addition - reduce less than 2p
+static void modadd(const spint *a, const spint *b, spint *n) {
+  spint carry;
+  n[0] = a[0] + b[0];
+  n[1] = a[1] + b[1];
+  n[2] = a[2] + b[2];
+  n[3] = a[3] + b[3];
+  n[4] = a[4] + b[4];
+  n[5] = a[5] + b[5];
+  n[6] = a[6] + b[6];
+  n[7] = a[7] + b[7];
+  n[8] = a[8] + b[8];
+  n[9] = a[9] + b[9];
+  n[10] = a[10] + b[10];
+  n[11] = a[11] + b[11];
+  n[12] = a[12] + b[12];
+  n[13] = a[13] + b[13];
+  n[0] += (spint)2u;
+  n[13] -= (spint)0x82000u;
+  carry = prop(n);
+  n[0] -= (spint)2u & carry;
+  n[13] += ((spint)0x82000u) & carry;
+  (void)prop(n);
+}
+
+// Modular subtraction - reduce less than 2p
+static void modsub(const spint *a, const spint *b, spint *n) {
+  spint carry;
+  n[0] = a[0] - b[0];
+  n[1] = a[1] - b[1];
+  n[2] = a[2] - b[2];
+  n[3] = a[3] - b[3];
+  n[4] = a[4] - b[4];
+  n[5] = a[5] - b[5];
+  n[6] = a[6] - b[6];
+  n[7] = a[7] - b[7];
+  n[8] = a[8] - b[8];
+  n[9] = a[9] - b[9];
+  n[10] = a[10] - b[10];
+  n[11] = a[11] - b[11];
+  n[12] = a[12] - b[12];
+  n[13] = a[13] - b[13];
+  carry = prop(n);
+  n[0] -= (spint)2u & carry;
+  n[13] += ((spint)0x82000u) & carry;
+  (void)prop(n);
+}
+
+// Modular negation
+static void modneg(const spint *b, spint *n) {
+  spint carry;
+  n[0] = (spint)0 - b[0];
+  n[1] = (spint)0 - b[1];
+  n[2] = (spint)0 - b[2];
+  n[3] = (spint)0 - b[3];
+  n[4] = (spint)0 - b[4];
+  n[5] = (spint)0 - b[5];
+  n[6] = (spint)0 - b[6];
+  n[7] = (spint)0 - b[7];
+  n[8] = (spint)0 - b[8];
+  n[9] = (spint)0 - b[9];
+  n[10] = (spint)0 - b[10];
+  n[11] = (spint)0 - b[11];
+  n[12] = (spint)0 - b[12];
+  n[13] = (spint)0 - b[13];
+  carry = prop(n);
+  n[0] -= (spint)2u & carry;
+  n[13] += ((spint)0x82000u) & carry;
+  (void)prop(n);
+}
+
+// Overflow limit   = 18446744073709551616
+// maximum possible = 1008877845989814286
+// Modular multiplication, c=a*b mod 2p
+static void modmul(const spint *a, const spint *b, spint *c) {
+  dpint t = 0;
+  spint p13 = 0x41000u;
+  spint q = ((spint)1 << 28u); // q is unsaturated radix
+  spint mask = (spint)(q - (spint)1);
+  t += (dpint)a[0] * b[0];
+  spint v0 = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[0] * b[1];
+  t += (dpint)a[1] * b[0];
+  spint v1 = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[0] * b[2];
+  t += (dpint)a[1] * b[1];
+  t += (dpint)a[2] * b[0];
+  spint v2 = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[0] * b[3];
+  t += (dpint)a[1] * b[2];
+  t += (dpint)a[2] * b[1];
+  t += (dpint)a[3] * b[0];
+  spint v3 = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[0] * b[4];
+  t += (dpint)a[1] * b[3];
+  t += (dpint)a[2] * b[2];
+  t += (dpint)a[3] * b[1];
+  t += (dpint)a[4] * b[0];
+  spint v4 = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[0] * b[5];
+  t += (dpint)a[1] * b[4];
+  t += (dpint)a[2] * b[3];
+  t += (dpint)a[3] * b[2];
+  t += (dpint)a[4] * b[1];
+  t += (dpint)a[5] * b[0];
+  spint v5 = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[0] * b[6];
+  t += (dpint)a[1] * b[5];
+  t += (dpint)a[2] * b[4];
+  t += (dpint)a[3] * b[3];
+  t += (dpint)a[4] * b[2];
+  t += (dpint)a[5] * b[1];
+  t += (dpint)a[6] * b[0];
+  spint v6 = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[0] * b[7];
+  t += (dpint)a[1] * b[6];
+  t += (dpint)a[2] * b[5];
+  t += (dpint)a[3] * b[4];
+  t += (dpint)a[4] * b[3];
+  t += (dpint)a[5] * b[2];
+  t += (dpint)a[6] * b[1];
+  t += (dpint)a[7] * b[0];
+  spint v7 = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[0] * b[8];
+  t += (dpint)a[1] * b[7];
+  t += (dpint)a[2] * b[6];
+  t += (dpint)a[3] * b[5];
+  t += (dpint)a[4] * b[4];
+  t += (dpint)a[5] * b[3];
+  t += (dpint)a[6] * b[2];
+  t += (dpint)a[7] * b[1];
+  t += (dpint)a[8] * b[0];
+  spint v8 = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[0] * b[9];
+  t += (dpint)a[1] * b[8];
+  t += (dpint)a[2] * b[7];
+  t += (dpint)a[3] * b[6];
+  t += (dpint)a[4] * b[5];
+  t += (dpint)a[5] * b[4];
+  t += (dpint)a[6] * b[3];
+  t += (dpint)a[7] * b[2];
+  t += (dpint)a[8] * b[1];
+  t += (dpint)a[9] * b[0];
+  spint v9 = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[0] * b[10];
+  t += (dpint)a[1] * b[9];
+  t += (dpint)a[2] * b[8];
+  t += (dpint)a[3] * b[7];
+  t += (dpint)a[4] * b[6];
+  t += (dpint)a[5] * b[5];
+  t += (dpint)a[6] * b[4];
+  t += (dpint)a[7] * b[3];
+  t += (dpint)a[8] * b[2];
+  t += (dpint)a[9] * b[1];
+  t += (dpint)a[10] * b[0];
+  spint v10 = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[0] * b[11];
+  t += (dpint)a[1] * b[10];
+  t += (dpint)a[2] * b[9];
+  t += (dpint)a[3] * b[8];
+  t += (dpint)a[4] * b[7];
+  t += (dpint)a[5] * b[6];
+  t += (dpint)a[6] * b[5];
+  t += (dpint)a[7] * b[4];
+  t += (dpint)a[8] * b[3];
+  t += (dpint)a[9] * b[2];
+  t += (dpint)a[10] * b[1];
+  t += (dpint)a[11] * b[0];
+  spint v11 = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[0] * b[12];
+  t += (dpint)a[1] * b[11];
+  t += (dpint)a[2] * b[10];
+  t += (dpint)a[3] * b[9];
+  t += (dpint)a[4] * b[8];
+  t += (dpint)a[5] * b[7];
+  t += (dpint)a[6] * b[6];
+  t += (dpint)a[7] * b[5];
+  t += (dpint)a[8] * b[4];
+  t += (dpint)a[9] * b[3];
+  t += (dpint)a[10] * b[2];
+  t += (dpint)a[11] * b[1];
+  t += (dpint)a[12] * b[0];
+  spint v12 = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[0] * b[13];
+  t += (dpint)a[1] * b[12];
+  t += (dpint)a[2] * b[11];
+  t += (dpint)a[3] * b[10];
+  t += (dpint)a[4] * b[9];
+  t += (dpint)a[5] * b[8];
+  t += (dpint)a[6] * b[7];
+  t += (dpint)a[7] * b[6];
+  t += (dpint)a[8] * b[5];
+  t += (dpint)a[9] * b[4];
+  t += (dpint)a[10] * b[3];
+  t += (dpint)a[11] * b[2];
+  t += (dpint)a[12] * b[1];
+  t += (dpint)a[13] * b[0];
+  t += (dpint)v0 * (dpint)p13;
+  spint v13 = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[1] * b[13];
+  t += (dpint)a[2] * b[12];
+  t += (dpint)a[3] * b[11];
+  t += (dpint)a[4] * b[10];
+  t += (dpint)a[5] * b[9];
+  t += (dpint)a[6] * b[8];
+  t += (dpint)a[7] * b[7];
+  t += (dpint)a[8] * b[6];
+  t += (dpint)a[9] * b[5];
+  t += (dpint)a[10] * b[4];
+  t += (dpint)a[11] * b[3];
+  t += (dpint)a[12] * b[2];
+  t += (dpint)a[13] * b[1];
+  t += (dpint)v1 * (dpint)p13;
+  c[0] = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[2] * b[13];
+  t += (dpint)a[3] * b[12];
+  t += (dpint)a[4] * b[11];
+  t += (dpint)a[5] * b[10];
+  t += (dpint)a[6] * b[9];
+  t += (dpint)a[7] * b[8];
+  t += (dpint)a[8] * b[7];
+  t += (dpint)a[9] * b[6];
+  t += (dpint)a[10] * b[5];
+  t += (dpint)a[11] * b[4];
+  t += (dpint)a[12] * b[3];
+  t += (dpint)a[13] * b[2];
+  t += (dpint)v2 * (dpint)p13;
+  c[1] = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[3] * b[13];
+  t += (dpint)a[4] * b[12];
+  t += (dpint)a[5] * b[11];
+  t += (dpint)a[6] * b[10];
+  t += (dpint)a[7] * b[9];
+  t += (dpint)a[8] * b[8];
+  t += (dpint)a[9] * b[7];
+  t += (dpint)a[10] * b[6];
+  t += (dpint)a[11] * b[5];
+  t += (dpint)a[12] * b[4];
+  t += (dpint)a[13] * b[3];
+  t += (dpint)v3 * (dpint)p13;
+  c[2] = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[4] * b[13];
+  t += (dpint)a[5] * b[12];
+  t += (dpint)a[6] * b[11];
+  t += (dpint)a[7] * b[10];
+  t += (dpint)a[8] * b[9];
+  t += (dpint)a[9] * b[8];
+  t += (dpint)a[10] * b[7];
+  t += (dpint)a[11] * b[6];
+  t += (dpint)a[12] * b[5];
+  t += (dpint)a[13] * b[4];
+  t += (dpint)v4 * (dpint)p13;
+  c[3] = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[5] * b[13];
+  t += (dpint)a[6] * b[12];
+  t += (dpint)a[7] * b[11];
+  t += (dpint)a[8] * b[10];
+  t += (dpint)a[9] * b[9];
+  t += (dpint)a[10] * b[8];
+  t += (dpint)a[11] * b[7];
+  t += (dpint)a[12] * b[6];
+  t += (dpint)a[13] * b[5];
+  t += (dpint)v5 * (dpint)p13;
+  c[4] = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[6] * b[13];
+  t += (dpint)a[7] * b[12];
+  t += (dpint)a[8] * b[11];
+  t += (dpint)a[9] * b[10];
+  t += (dpint)a[10] * b[9];
+  t += (dpint)a[11] * b[8];
+  t += (dpint)a[12] * b[7];
+  t += (dpint)a[13] * b[6];
+  t += (dpint)v6 * (dpint)p13;
+  c[5] = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[7] * b[13];
+  t += (dpint)a[8] * b[12];
+  t += (dpint)a[9] * b[11];
+  t += (dpint)a[10] * b[10];
+  t += (dpint)a[11] * b[9];
+  t += (dpint)a[12] * b[8];
+  t += (dpint)a[13] * b[7];
+  t += (dpint)v7 * (dpint)p13;
+  c[6] = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[8] * b[13];
+  t += (dpint)a[9] * b[12];
+  t += (dpint)a[10] * b[11];
+  t += (dpint)a[11] * b[10];
+  t += (dpint)a[12] * b[9];
+  t += (dpint)a[13] * b[8];
+  t += (dpint)v8 * (dpint)p13;
+  c[7] = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[9] * b[13];
+  t += (dpint)a[10] * b[12];
+  t += (dpint)a[11] * b[11];
+  t += (dpint)a[12] * b[10];
+  t += (dpint)a[13] * b[9];
+  t += (dpint)v9 * (dpint)p13;
+  c[8] = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[10] * b[13];
+  t += (dpint)a[11] * b[12];
+  t += (dpint)a[12] * b[11];
+  t += (dpint)a[13] * b[10];
+  t += (dpint)v10 * (dpint)p13;
+  c[9] = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[11] * b[13];
+  t += (dpint)a[12] * b[12];
+  t += (dpint)a[13] * b[11];
+  t += (dpint)v11 * (dpint)p13;
+  c[10] = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[12] * b[13];
+  t += (dpint)a[13] * b[12];
+  t += (dpint)v12 * (dpint)p13;
+  c[11] = ((spint)t & mask);
+  t >>= 28;
+  t += (dpint)a[13] * b[13];
+  t += (dpint)v13 * (dpint)p13;
+  c[12] = ((spint)t & mask);
+  t >>= 28;
+  c[13] = (spint)t;
+}
+
+// Modular squaring, c=a*a  mod 2p
+static void modsqr(const spint *a, spint *c) {
+  udpint tot;
+  udpint t = 0;
+  spint p13 = 0x41000u;
+  spint q = ((spint)1 << 28u); // q is unsaturated radix
+  spint mask = (spint)(q - (spint)1);
+  tot = (udpint)a[0] * a[0];
+  t = tot;
+  spint v0 = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[0] * a[1];
+  tot *= 2;
+  t += tot;
+  spint v1 = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[0] * a[2];
+  tot *= 2;
+  tot += (udpint)a[1] * a[1];
+  t += tot;
+  spint v2 = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[0] * a[3];
+  tot += (udpint)a[1] * a[2];
+  tot *= 2;
+  t += tot;
+  spint v3 = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[0] * a[4];
+  tot += (udpint)a[1] * a[3];
+  tot *= 2;
+  tot += (udpint)a[2] * a[2];
+  t += tot;
+  spint v4 = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[0] * a[5];
+  tot += (udpint)a[1] * a[4];
+  tot += (udpint)a[2] * a[3];
+  tot *= 2;
+  t += tot;
+  spint v5 = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[0] * a[6];
+  tot += (udpint)a[1] * a[5];
+  tot += (udpint)a[2] * a[4];
+  tot *= 2;
+  tot += (udpint)a[3] * a[3];
+  t += tot;
+  spint v6 = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[0] * a[7];
+  tot += (udpint)a[1] * a[6];
+  tot += (udpint)a[2] * a[5];
+  tot += (udpint)a[3] * a[4];
+  tot *= 2;
+  t += tot;
+  spint v7 = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[0] * a[8];
+  tot += (udpint)a[1] * a[7];
+  tot += (udpint)a[2] * a[6];
+  tot += (udpint)a[3] * a[5];
+  tot *= 2;
+  tot += (udpint)a[4] * a[4];
+  t += tot;
+  spint v8 = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[0] * a[9];
+  tot += (udpint)a[1] * a[8];
+  tot += (udpint)a[2] * a[7];
+  tot += (udpint)a[3] * a[6];
+  tot += (udpint)a[4] * a[5];
+  tot *= 2;
+  t += tot;
+  spint v9 = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[0] * a[10];
+  tot += (udpint)a[1] * a[9];
+  tot += (udpint)a[2] * a[8];
+  tot += (udpint)a[3] * a[7];
+  tot += (udpint)a[4] * a[6];
+  tot *= 2;
+  tot += (udpint)a[5] * a[5];
+  t += tot;
+  spint v10 = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[0] * a[11];
+  tot += (udpint)a[1] * a[10];
+  tot += (udpint)a[2] * a[9];
+  tot += (udpint)a[3] * a[8];
+  tot += (udpint)a[4] * a[7];
+  tot += (udpint)a[5] * a[6];
+  tot *= 2;
+  t += tot;
+  spint v11 = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[0] * a[12];
+  tot += (udpint)a[1] * a[11];
+  tot += (udpint)a[2] * a[10];
+  tot += (udpint)a[3] * a[9];
+  tot += (udpint)a[4] * a[8];
+  tot += (udpint)a[5] * a[7];
+  tot *= 2;
+  tot += (udpint)a[6] * a[6];
+  t += tot;
+  spint v12 = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[0] * a[13];
+  tot += (udpint)a[1] * a[12];
+  tot += (udpint)a[2] * a[11];
+  tot += (udpint)a[3] * a[10];
+  tot += (udpint)a[4] * a[9];
+  tot += (udpint)a[5] * a[8];
+  tot += (udpint)a[6] * a[7];
+  tot *= 2;
+  t += tot;
+  t += (udpint)v0 * p13;
+  spint v13 = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[1] * a[13];
+  tot += (udpint)a[2] * a[12];
+  tot += (udpint)a[3] * a[11];
+  tot += (udpint)a[4] * a[10];
+  tot += (udpint)a[5] * a[9];
+  tot += (udpint)a[6] * a[8];
+  tot *= 2;
+  tot += (udpint)a[7] * a[7];
+  t += tot;
+  t += (udpint)v1 * p13;
+  c[0] = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[2] * a[13];
+  tot += (udpint)a[3] * a[12];
+  tot += (udpint)a[4] * a[11];
+  tot += (udpint)a[5] * a[10];
+  tot += (udpint)a[6] * a[9];
+  tot += (udpint)a[7] * a[8];
+  tot *= 2;
+  t += tot;
+  t += (udpint)v2 * p13;
+  c[1] = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[3] * a[13];
+  tot += (udpint)a[4] * a[12];
+  tot += (udpint)a[5] * a[11];
+  tot += (udpint)a[6] * a[10];
+  tot += (udpint)a[7] * a[9];
+  tot *= 2;
+  tot += (udpint)a[8] * a[8];
+  t += tot;
+  t += (udpint)v3 * p13;
+  c[2] = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[4] * a[13];
+  tot += (udpint)a[5] * a[12];
+  tot += (udpint)a[6] * a[11];
+  tot += (udpint)a[7] * a[10];
+  tot += (udpint)a[8] * a[9];
+  tot *= 2;
+  t += tot;
+  t += (udpint)v4 * p13;
+  c[3] = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[5] * a[13];
+  tot += (udpint)a[6] * a[12];
+  tot += (udpint)a[7] * a[11];
+  tot += (udpint)a[8] * a[10];
+  tot *= 2;
+  tot += (udpint)a[9] * a[9];
+  t += tot;
+  t += (udpint)v5 * p13;
+  c[4] = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[6] * a[13];
+  tot += (udpint)a[7] * a[12];
+  tot += (udpint)a[8] * a[11];
+  tot += (udpint)a[9] * a[10];
+  tot *= 2;
+  t += tot;
+  t += (udpint)v6 * p13;
+  c[5] = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[7] * a[13];
+  tot += (udpint)a[8] * a[12];
+  tot += (udpint)a[9] * a[11];
+  tot *= 2;
+  tot += (udpint)a[10] * a[10];
+  t += tot;
+  t += (udpint)v7 * p13;
+  c[6] = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[8] * a[13];
+  tot += (udpint)a[9] * a[12];
+  tot += (udpint)a[10] * a[11];
+  tot *= 2;
+  t += tot;
+  t += (udpint)v8 * p13;
+  c[7] = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[9] * a[13];
+  tot += (udpint)a[10] * a[12];
+  tot *= 2;
+  tot += (udpint)a[11] * a[11];
+  t += tot;
+  t += (udpint)v9 * p13;
+  c[8] = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[10] * a[13];
+  tot += (udpint)a[11] * a[12];
+  tot *= 2;
+  t += tot;
+  t += (udpint)v10 * p13;
+  c[9] = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[11] * a[13];
+  tot *= 2;
+  tot += (udpint)a[12] * a[12];
+  t += tot;
+  t += (udpint)v11 * p13;
+  c[10] = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[12] * a[13];
+  tot *= 2;
+  t += tot;
+  t += (udpint)v12 * p13;
+  c[11] = ((spint)t & mask);
+  t >>= 28;
+  tot = (udpint)a[13] * a[13];
+  t += tot;
+  t += (udpint)v13 * p13;
+  c[12] = ((spint)t & mask);
+  t >>= 28;
+  c[13] = (spint)t;
+}
+
+// copy
+static void modcpy(const spint *a, spint *c) {
+  int i;
+  for (i = 0; i < 14; i++) {
+    c[i] = a[i];
+  }
+}
+
+// square n times
+static void modnsqr(spint *a, int n) {
+  int i;
+  for (i = 0; i < n; i++) {
+    modsqr(a, a);
+  }
+}
+
+// Calculate progenitor
+static void modpro(const spint *w, spint *z) {
+  spint x[14];
+  spint t0[14];
+  spint t1[14];
+  spint t2[14];
+  spint t3[14];
+  spint t4[14];
+  spint t5[14];
+  modcpy(w, x);
+  modsqr(x, z);
+  modsqr(z, t0);
+  modmul(x, t0, t1);
+  modmul(z, t1, z);
+  modsqr(z, t0);
+  modsqr(t0, t3);
+  modsqr(t3, t4);
+  modsqr(t4, t2);
+  modcpy(t2, t5);
+  modnsqr(t5, 3);
+  modmul(t2, t5, t2);
+  modcpy(t2, t5);
+  modnsqr(t5, 6);
+  modmul(t2, t5, t2);
+  modcpy(t2, t5);
+  modnsqr(t5, 2);
+  modmul(t4, t5, t5);
+  modnsqr(t5, 13);
+  modmul(t2, t5, t2);
+  modcpy(t2, t5);
+  modnsqr(t5, 2);
+  modmul(t4, t5, t4);
+  modnsqr(t4, 28);
+  modmul(t2, t4, t2);
+  modsqr(t2, t4);
+  modmul(t3, t4, t3);
+  modnsqr(t3, 59);
+  modmul(t2, t3, t2);
+  modmul(t1, t2, t1);
+  modmul(z, t1, z);
+  modmul(t0, z, t0);
+  modmul(t1, t0, t1);
+  modsqr(t1, t2);
+  modmul(t1, t2, t2);
+  modsqr(t2, t2);
+  modmul(t1, t2, t2);
+  modmul(t0, t2, t0);
+  modmul(z, t0, z);
+  modsqr(z, t2);
+  modmul(z, t2, t2);
+  modmul(t0, t2, t0);
+  modmul(t1, t0, t1);
+  modcpy(t1, t2);
+  modnsqr(t2, 128);
+  modmul(t1, t2, t1);
+  modmul(t0, t1, t0);
+  modnsqr(t0, 125);
+  modmul(z, t0, z);
+}
+
+// calculate inverse z = x * h^4, where h is the progenitor of x; pass h = NULL to have it computed here. z may alias x.
+static void modinv(const spint *x, const spint *h, spint *z) {
+  spint s[14];
+  spint t[14];
+  if (h == NULL) {
+    modpro(x, t);
+  } else {
+    modcpy(h, t);
+  }
+  modcpy(x, s); // private copy of x so that z may alias x
+  modnsqr(t, 2); // t = h^4
+  modmul(s, t, z);
+}
+
+// Convert m to n-residue form, n=nres(m), by one modmul with the constant c (presumably R^2 mod p with R = 2^392 - not re-derived in review)
+static void nres(const spint *m, spint *n) {
+  const spint c[14] = {0xf13732fu, 0x3f03f03u, 0x3f03f0u,  0xf03f03fu,
+                       0x3f03f03u, 0x3f03f0u,  0xf03f03fu, 0x3f03f03u,
+                       0x3f03f0u,  0xf03f03fu, 0x3f03f03u, 0x3f03f0u,
+                       0xf03f03fu, 0x14f03u};
+  modmul(m, c, n);
+}
+
+// Convert n back to normal form, m=redc(n): Montgomery-multiply by the plain integer 1, then apply the final subtraction
+static void redc(const spint *n, spint *m) {
+  int i;
+  spint c[14];
+  c[0] = 1; // c is the integer 1, not its Montgomery form
+  for (i = 1; i < 14; i++) {
+    c[i] = 0;
+  }
+  modmul(n, c, m);
+  (void)modfsb(m); // brings the result below p
+}
+
+// is unity? Returns 1 if a represents 1, else 0; the test is branch-free on the value converted out of Montgomery form
+static int modis1(const spint *a) {
+  int i;
+  spint c[14];
+  spint c0;
+  spint d = 0;
+  redc(a, c);
+  for (i = 1; i < 14; i++) {
+    d |= c[i]; // d == 0 iff limbs 1..13 are all zero
+  }
+  c0 = (spint)c[0];
+  return ((spint)1 & ((d - (spint)1) >> 28u) & // (d - 1) >> 28 has bit 0 set only when d == 0 (wrap-around)
+          (((c0 ^ (spint)1) - (spint)1) >> 28u)); // same trick: bit 0 set only when c0 == 1
+}
+
+// is zero? Returns 1 if a represents 0, else 0; the test is branch-free on the value converted out of Montgomery form
+static int modis0(const spint *a) {
+  int i;
+  spint c[14];
+  spint d = 0;
+  redc(a, c);
+  for (i = 0; i < 14; i++) {
+    d |= c[i]; // d == 0 iff every limb is zero
+  }
+  return ((spint)1 & ((d - (spint)1) >> 28u)); // (d - 1) >> 28 has bit 0 set only when d == 0 (wrap-around)
+}
+
+// set all 14 limbs of a to zero
+static void modzer(spint *a) {
+  int i;
+  for (i = 0; i < 14; i++) {
+    a[i] = 0;
+  }
+}
+
+// set a to one, in n-residue (Montgomery) form
+static void modone(spint *a) {
+  int i;
+  a[0] = 1;
+  for (i = 1; i < 14; i++) {
+    a[i] = 0;
+  }
+  nres(a, a); // convert the plain integer 1 to n-residue form in place
+}
+
+// set a to the integer x, in n-residue (Montgomery) form; x is stored as a single unsigned limb, so negative x is not mapped to p - |x|
+static void modint(int x, spint *a) {
+  int i;
+  a[0] = (spint)x;
+  for (i = 1; i < 14; i++) {
+    a[i] = 0;
+  }
+  nres(a, a); // convert to n-residue form in place
+}
+
+// Modular multiplication by an integer, c=a*b mod 2p; b is first converted with modint, then multiplied with modmul
+static void modmli(const spint *a, int b, spint *c) {
+  spint t[14];
+  modint(b, t);
+  modmul(a, t, c);
+}
+
+// Test for quadratic residue: returns 1 if x * h^2 is one or if x is zero, else 0. h is the progenitor of x (NULL to compute it here).
+static int modqr(const spint *h, const spint *x) {
+  spint r[14];
+  if (h == NULL) {
+    modpro(x, r);
+    modsqr(r, r);
+  } else {
+    modsqr(h, r);
+  }
+  modmul(r, x, r); // r = x * h^2
+  return modis1(r) | modis0(x); // zero is reported as a square
+}
+
+// conditional move g to f if b=1 (f keeps its value if b=0); b must be 0 or 1
+// strongly recommend inlining be disabled using compiler specific syntax
+static void modcmv(int b, const spint *g, volatile spint *f) {
+  int i;
+  spint c0, c1, s, t;
+  spint r = 0x5aa5a55au; // fixed offset added to both multipliers; its contribution is subtracted again below
+  c0 = (1 - b) + r;
+  c1 = b + r;
+  for (i = 0; i < 14; i++) {
+    s = g[i];
+    t = f[i];
+    f[i] = c0 * t + c1 * s;
+    f[i] -= r * (t + s); // net effect: f[i] = (1 - b) * t + b * s (mod 2^32)
+  }
+}
+
+// conditional swap g and f if b=1 (both keep their values if b=0); b must be 0 or 1
+// strongly recommend inlining be disabled using compiler specific syntax
+static void modcsw(int b, volatile spint *g, volatile spint *f) {
+  int i;
+  spint c0, c1, s, t, w;
+  spint r = 0x5aa5a55au; // fixed offset added to both multipliers; its contribution w is subtracted again below
+  c0 = (1 - b) + r;
+  c1 = b + r;
+  for (i = 0; i < 14; i++) {
+    s = g[i];
+    t = f[i];
+    w = r * (t + s);
+    f[i] = c0 * t + c1 * s;
+    f[i] -= w; // net effect: f[i] = (1 - b) * t + b * s (mod 2^32)
+    g[i] = c0 * s + c1 * t;
+    g[i] -= w; // net effect: g[i] = (1 - b) * s + b * t (mod 2^32)
+  }
+}
+
+// Modular square root candidate r = x * h, where h is the progenitor of x (NULL to compute it here). The result is not checked here; r may alias x.
+static void modsqrt(const spint *x, const spint *h, spint *r) {
+  spint s[14];
+  spint y[14];
+  if (h == NULL) {
+    modpro(x, y);
+  } else {
+    modcpy(h, y);
+  }
+  modmul(y, x, s);
+  modcpy(s, r);
+}
+
+// shift left by less than a word: n must be below 28 because the code shifts by 28 - n; bits shifted out of the top limb stay in its upper bits
+static void modshl(unsigned int n, spint *a) {
+  int i;
+  a[13] = ((a[13] << n)) | (a[12] >> (28u - n)); // top limb is not masked to 28 bits
+  for (i = 12; i > 0; i--) {
+    a[i] = ((a[i] << n) & (spint)0xfffffff) | (a[i - 1] >> (28u - n));
+  }
+  a[0] = (a[0] << n) & (spint)0xfffffff;
+}
+
+// shift right by less than a word (n must be below 28 because the code shifts by 28 - n). Return shifted out part, i.e. the low n bits of a[0]
+static int modshr(unsigned int n, spint *a) {
+  int i;
+  spint r = a[0] & (((spint)1 << n) - (spint)1);
+  for (i = 0; i < 13; i++) {
+    a[i] = (a[i] >> n) | ((a[i + 1] << (28u - n)) & (spint)0xfffffff);
+  }
+  a[13] = a[13] >> n;
+  return r;
+}
+
+// set a= 2^r in n-residue form; for r >= 384 (48 * 8) a is left as zero
+static void mod2r(unsigned int r, spint *a) {
+  unsigned int n = r / 28u; // limb index of bit r
+  unsigned int m = r % 28u; // bit position inside that limb
+  modzer(a);
+  if (r >= 48 * 8)
+    return;
+  a[n] = 1;
+  a[n] <<= m;
+  nres(a, a);
+}
+
+// export to byte array: writes the value (converted out of Montgomery form) as 48 bytes, big-endian (b[47] is the least significant byte)
+static void modexp(const spint *a, char *b) {
+  int i;
+  spint c[14];
+  redc(a, c);
+  for (i = 47; i >= 0; i--) {
+    b[i] = c[0] & (spint)0xff; // emit the current low byte, then drop it
+    (void)modshr(8, c);
+  }
+}
+
+// import from byte array: reads 48 bytes, big-endian (b[0] is the most significant byte), and converts to n-residue form
+// returns 1 if in range, else 0
+static int modimp(const char *b, spint *a) {
+  int i, res;
+  for (i = 0; i < 14; i++) {
+    a[i] = 0;
+  }
+  for (i = 0; i < 48; i++) {
+    modshl(8, a);
+    a[0] += (spint)(unsigned char)b[i]; // cast through unsigned char so bytes >= 0x80 are not sign-extended
+  }
+  res = modfsb(a); // 1 if the decoded integer was below p
+  nres(a, a);
+  return res;
+}
+
+// determine sign: returns the least significant bit of the value after conversion out of Montgomery form
+static int modsign(const spint *a) {
+  spint c[14];
+  redc(a, c);
+  return c[0] % 2;
+}
+
+// return true (1) if equal, else 0; both operands are converted out of Montgomery form and compared limb by limb without branching
+static int modcmp(const spint *a, const spint *b) {
+  spint c[14], d[14];
+  int i, eq = 1;
+  redc(a, c);
+  redc(b, d);
+  for (i = 0; i < 14; i++) {
+    eq &= (((c[i] ^ d[i]) - 1) >> 28) & 1; // bit 0 is set only when the limbs are identical (0 - 1 wraps around)
+  }
+  return eq;
+}
+
+// clang-format on
+/******************************************************************************
+ API functions calling generated code above
+ ******************************************************************************/
+
+#include <fp.h>
+
+const digit_t ZERO[NWORDS_FIELD] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; // 14 initializers: requires NWORDS_FIELD == 14
+const digit_t ONE[NWORDS_FIELD] = { // Montgomery representation of 1: 2^392 mod p = 0x3f0 + 2^380 (28-bit limbs)
+    0x000003f0, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00010000
+};
+// Montgomery representation of 2^-1 (each limb is half of the corresponding limb of ONE)
+static const digit_t TWO_INV[NWORDS_FIELD] = { 0x000001f8, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                               0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                               0x00000000, 0x00000000, 0x00000000, 0x00008000 };
+// Montgomery representation of 3^-1
+static const digit_t THREE_INV[NWORDS_FIELD] = { 0x0aaaabfa, 0x0aaaaaaa, 0x0aaaaaaa, 0x0aaaaaaa, 0x0aaaaaaa,
+                                                 0x0aaaaaaa, 0x0aaaaaaa, 0x0aaaaaaa, 0x0aaaaaaa, 0x0aaaaaaa,
+                                                 0x0aaaaaaa, 0x0aaaaaaa, 0x0aaaaaaa, 0x00030aaa };
+// Montgomery representation of 2^384; fp_decode_reduce multiplies by it to shift its accumulator by one 48-byte block
+static const digit_t R2[NWORDS_FIELD] = { 0x003f1373, 0x0f03f03f, 0x03f03f03, 0x003f03f0, 0x0f03f03f,
+                                          0x03f03f03, 0x003f03f0, 0x0f03f03f, 0x03f03f03, 0x003f03f0,
+                                          0x0f03f03f, 0x03f03f03, 0x003f03f0, 0x0000c03f };
+
+void
+fp_set_small(fp_t *x, const digit_t val) // x = val in Montgomery form; val is narrowed to int before the call
+{
+    modint((int)val, *x);
+}
+
+void
+fp_mul_small(fp_t *x, const fp_t *a, const uint32_t val) // x = a * val; val is narrowed to int before the call
+{
+    modmli(*a, (int)val, *x);
+}
+
+void
+fp_set_zero(fp_t *x) // x = 0 (all limbs cleared)
+{
+    modzer(*x);
+}
+
+void
+fp_set_one(fp_t *x) // x = 1 in Montgomery form
+{
+    modone(*x);
+}
+
+uint32_t
+fp_is_equal(const fp_t *a, const fp_t *b) // returns 0xFFFFFFFF if a == b, else 0
+{
+    return -(uint32_t)modcmp(*a, *b); // negating the 0/1 result turns it into a full-width mask
+}
+
+uint32_t
+fp_is_zero(const fp_t *a) // returns 0xFFFFFFFF if a == 0, else 0
+{
+    return -(uint32_t)modis0(*a); // negating the 0/1 result turns it into a full-width mask
+}
+
+void
+fp_copy(fp_t *out, const fp_t *a) // out = a; note modcpy takes (source, destination)
+{
+    modcpy(*a, *out);
+}
+
+void
+fp_cswap(fp_t *a, fp_t *b, uint32_t ctl) // swaps a and b when bit 0 of ctl is set; the other bits of ctl are ignored
+{
+    modcsw((int)(ctl & 0x1), *a, *b);
+}
+
+void
+fp_add(fp_t *out, const fp_t *a, const fp_t *b) // out = a + b (modadd keeps the result below 2p)
+{
+    modadd(*a, *b, *out);
+}
+
+void
+fp_sub(fp_t *out, const fp_t *a, const fp_t *b) // out = a - b (modsub keeps the result below 2p)
+{
+    modsub(*a, *b, *out);
+}
+
+void
+fp_neg(fp_t *out, const fp_t *a) // out = -a
+{
+    modneg(*a, *out);
+}
+
+void
+fp_sqr(fp_t *out, const fp_t *a)
+{
+    modsqr(*a, *out);
+}
+
+void
+fp_mul(fp_t *out, const fp_t *a, const fp_t *b)
+{
+    modmul(*a, *b, *out);
+}
+
+void
+fp_inv(fp_t *x)
+{
+    modinv(*x, NULL, *x);
+}
+
+uint32_t
+fp_is_square(const fp_t *a)
+{
+    return -(uint32_t)modqr(NULL, *a);
+}
+
+void
+fp_sqrt(fp_t *a)
+{
+    modsqrt(*a, NULL, *a);
+}
+
+void
+fp_half(fp_t *out, const fp_t *a)
+{
+    modmul(TWO_INV, *a, *out);
+}
+
+void
+fp_exp3div4(fp_t *out, const fp_t *a)
+{
+    modpro(*a, *out);
+}
+
+void
+fp_div3(fp_t *out, const fp_t *a)
+{
+    modmul(THREE_INV, *a, *out);
+}
+
+void
+fp_encode(void *dst, const fp_t *a)
+{
+    // Modified version of modexp()
+    int i;
+    spint c[14];
+    redc(*a, c);
+    for (i = 0; i < 48; i++) {
+        ((char *)dst)[i] = c[0] & (spint)0xff;
+        (void)modshr(8, c);
+    }
+}
+
+uint32_t
+fp_decode(fp_t *d, const void *src)
+{
+    // Modified version of modimp()
+    int i;
+    spint res;
+    const unsigned char *b = src;
+    for (i = 0; i < 14; i++) {
+        (*d)[i] = 0;
+    }
+    for (i = 47; i >= 0; i--) {
+        modshl(8, *d);
+        (*d)[0] += (spint)b[i];
+    }
+    res = (spint)-modfsb(*d);
+    nres(*d, *d);
+    // If the value was canonical then res = -1; otherwise, res = 0
+    for (i = 0; i < 14; i++) {
+        (*d)[i] &= res;
+    }
+    return (uint32_t)res;
+}
+
+static inline unsigned char
+add_carry(unsigned char cc, spint a, spint b, spint *d)
+{
+    udpint t = (udpint)a + (udpint)b + cc;
+    *d = (spint)t;
+    return (unsigned char)(t >> Wordlength);
+}
+
+static void
+partial_reduce(spint *out, const spint *src)
+{
+    spint h, l, quo, rem;
+    unsigned char cc;
+
+    // Split value in high (8 bits) and low (376 bits) parts.
+    h = src[11] >> 24;
+    l = src[11] & 0x00FFFFFF;
+
+    // 65*2^376 = 1 mod q; hence, we add floor(h/65) + (h mod 65)*2^376
+    // to the low part.
+    quo = (h * 0xFC1) >> 18;
+    rem = h - (65 * quo);
+    cc = add_carry(0, src[0], quo, &out[0]);
+    cc = add_carry(cc, src[1], 0, &out[1]);
+    cc = add_carry(cc, src[2], 0, &out[2]);
+    cc = add_carry(cc, src[3], 0, &out[3]);
+    cc = add_carry(cc, src[4], 0, &out[4]);
+    cc = add_carry(cc, src[5], 0, &out[5]);
+    cc = add_carry(cc, src[6], 0, &out[6]);
+    cc = add_carry(cc, src[7], 0, &out[7]);
+    cc = add_carry(cc, src[8], 0, &out[8]);
+    cc = add_carry(cc, src[9], 0, &out[9]);
+    cc = add_carry(cc, src[10], 0, &out[10]);
+    (void)add_carry(cc, l, rem << 24, &out[11]);
+}
+
+// Little-endian encoding of a 32-bit integer.
+static inline void
+enc32le(void *dst, uint32_t x)
+{
+    uint8_t *buf = dst;
+    buf[0] = (uint8_t)x;
+    buf[1] = (uint8_t)(x >> 8);
+    buf[2] = (uint8_t)(x >> 16);
+    buf[3] = (uint8_t)(x >> 24);
+}
+
+// Little-endian decoding of a 32-bit integer.
+static inline uint32_t
+dec32le(const void *src)
+{
+    const uint8_t *buf = src;
+    return (spint)buf[0] | ((spint)buf[1] << 8) | ((spint)buf[2] << 16) | ((spint)buf[3] << 24);
+}
+
+void
+fp_decode_reduce(fp_t *d, const void *src, size_t len)
+{
+    uint32_t t[12];  // Stores Nbytes * 8 bits
+    uint8_t tmp[48]; // Nbytes
+    const uint8_t *b = src;
+
+    fp_set_zero(d);
+    if (len == 0) {
+        return;
+    }
+
+    size_t rem = len % 48;
+    if (rem != 0) {
+        // Input size is not a multiple of 48, we decode a partial
+        // block, which is already less than 2^376.
+        size_t k = len - rem;
+        memcpy(tmp, b + k, len - k);
+        memset(tmp + len - k, 0, (sizeof tmp) - (len - k));
+        fp_decode(d, tmp);
+        len = k;
+    }
+    // Process all remaining blocks, in descending address order.
+    while (len > 0) {
+        fp_mul(d, d, &R2);
+        len -= 48;
+        t[0] = dec32le(b + len);
+        t[1] = dec32le(b + len + 4);
+        t[2] = dec32le(b + len + 8);
+        t[3] = dec32le(b + len + 12);
+        t[4] = dec32le(b + len + 16);
+        t[5] = dec32le(b + len + 20);
+        t[6] = dec32le(b + len + 24);
+        t[7] = dec32le(b + len + 28);
+        t[8] = dec32le(b + len + 32);
+        t[9] = dec32le(b + len + 36);
+        t[10] = dec32le(b + len + 40);
+        t[11] = dec32le(b + len + 44);
+        partial_reduce(t, t);
+        enc32le(tmp, t[0]);
+        enc32le(tmp + 4, t[1]);
+        enc32le(tmp + 8, t[2]);
+        enc32le(tmp + 12, t[3]);
+        enc32le(tmp + 16, t[4]);
+        enc32le(tmp + 20, t[5]);
+        enc32le(tmp + 24, t[6]);
+        enc32le(tmp + 28, t[7]);
+        enc32le(tmp + 32, t[8]);
+        enc32le(tmp + 36, t[9]);
+        enc32le(tmp + 40, t[10]);
+        enc32le(tmp + 44, t[11]);
+        fp_t a;
+        fp_decode(&a, tmp);
+        fp_add(d, d, &a);
+    }
+}
diff --git a/src/pqm4/sqisign_lvl3/ref/hd.c b/src/pqm4/sqisign_lvl3/ref/hd.c
new file mode 100644
index 0000000..0424108
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/hd.c
@@ -0,0 +1,93 @@
+#include <hd.h>
+#include <assert.h>
+
+void
+double_couple_point(theta_couple_point_t *out, const theta_couple_point_t *in, const theta_couple_curve_t *E1E2)
+{
+    ec_dbl(&out->P1, &in->P1, &E1E2->E1);
+    ec_dbl(&out->P2, &in->P2, &E1E2->E2);
+}
+
+void
+double_couple_point_iter(theta_couple_point_t *out,
+                         unsigned n,
+                         const theta_couple_point_t *in,
+                         const theta_couple_curve_t *E1E2)
+{
+    if (n == 0) {
+        memmove(out, in, sizeof(theta_couple_point_t));
+    } else {
+        double_couple_point(out, in, E1E2);
+        for (unsigned i = 0; i < n - 1; i++) {
+            double_couple_point(out, out, E1E2);
+        }
+    }
+}
+
+void
+add_couple_jac_points(theta_couple_jac_point_t *out,
+                      const theta_couple_jac_point_t *T1,
+                      const theta_couple_jac_point_t *T2,
+                      const theta_couple_curve_t *E1E2)
+{
+    ADD(&out->P1, &T1->P1, &T2->P1, &E1E2->E1);
+    ADD(&out->P2, &T1->P2, &T2->P2, &E1E2->E2);
+}
+
+void
+double_couple_jac_point(theta_couple_jac_point_t *out,
+                        const theta_couple_jac_point_t *in,
+                        const theta_couple_curve_t *E1E2)
+{
+    DBL(&out->P1, &in->P1, &E1E2->E1);
+    DBL(&out->P2, &in->P2, &E1E2->E2);
+}
+
+void
+double_couple_jac_point_iter(theta_couple_jac_point_t *out,
+                             unsigned n,
+                             const theta_couple_jac_point_t *in,
+                             const theta_couple_curve_t *E1E2)
+{
+    if (n == 0) {
+        *out = *in;
+    } else if (n == 1) {
+        double_couple_jac_point(out, in, E1E2);
+    } else {
+        fp2_t a1, a2, t1, t2;
+
+        jac_to_ws(&out->P1, &t1, &a1, &in->P1, &E1E2->E1);
+        jac_to_ws(&out->P2, &t2, &a2, &in->P2, &E1E2->E2);
+
+        DBLW(&out->P1, &t1, &out->P1, &t1);
+        DBLW(&out->P2, &t2, &out->P2, &t2);
+        for (unsigned i = 0; i < n - 1; i++) {
+            DBLW(&out->P1, &t1, &out->P1, &t1);
+            DBLW(&out->P2, &t2, &out->P2, &t2);
+        }
+
+        jac_from_ws(&out->P1, &out->P1, &a1, &E1E2->E1);
+        jac_from_ws(&out->P2, &out->P2, &a2, &E1E2->E2);
+    }
+}
+
+void
+couple_jac_to_xz(theta_couple_point_t *P, const theta_couple_jac_point_t *xyP)
+{
+    jac_to_xz(&P->P1, &xyP->P1);
+    jac_to_xz(&P->P2, &xyP->P2);
+}
+
+void
+copy_bases_to_kernel(theta_kernel_couple_points_t *ker, const ec_basis_t *B1, const ec_basis_t *B2)
+{
+    // Copy the basis on E1 to (P, _) on T1, T2 and T1 - T2
+    copy_point(&ker->T1.P1, &B1->P);
+    copy_point(&ker->T2.P1, &B1->Q);
+    copy_point(&ker->T1m2.P1, &B1->PmQ);
+
+    // Copy the basis on E2 to (_, P) on T1, T2 and T1 - T2
+    copy_point(&ker->T1.P2, &B2->P);
+    copy_point(&ker->T2.P2, &B2->Q);
+    copy_point(&ker->T1m2.P2, &B2->PmQ);
+}
diff --git a/src/pqm4/sqisign_lvl3/ref/hd.h b/src/pqm4/sqisign_lvl3/ref/hd.h
new file mode 100644
index 0000000..2b16e23
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/hd.h
@@ -0,0 +1,435 @@
+/** @file
+ *
+ * @authors Antonin Leroux
+ *
+ * @brief The HD-isogenies algorithm required by the signature
+ *
+ */
+
+#ifndef HD_H
+#define HD_H
+
+#include <sqisign_namespace.h>
+#include <ec.h>
+#include <stdio.h>
+
+/** @defgroup hd_module Abelian surfaces and their isogenies
+ * @{
+ */
+
+#define HD_extra_torsion 2
+
+/** @defgroup hd_struct Data structures for dimension 2
+ * @{
+ */
+
+/** @brief Type for couple point with XZ coordinates
+ * @typedef theta_couple_point_t
+ *
+ * @struct theta_couple_point
+ *
+ * Structure for the couple point on an elliptic product
+ * using XZ coordinates
+ */
+typedef struct theta_couple_point
+{
+    ec_point_t P1; // component on the first curve E1
+    ec_point_t P2; // component on the second curve E2
+} theta_couple_point_t;
+
+/** @brief Type for three couple points T1, T2, T1-T2 with XZ coordinates
+ * @typedef theta_kernel_couple_points_t
+ *
+ * @struct theta_kernel_couple_points
+ *
+ * Structure for a triple of theta couple points T1, T2 and T1 - T2
+ */
+typedef struct theta_kernel_couple_points
+{
+    theta_couple_point_t T1;   // first couple point
+    theta_couple_point_t T2;   // second couple point
+    theta_couple_point_t T1m2; // the difference T1 - T2
+} theta_kernel_couple_points_t;
+
+/** @brief Type for couple point with XYZ coordinates
+ * @typedef theta_couple_jac_point_t
+ *
+ * @struct theta_couple_jac_point
+ *
+ * Structure for the couple point on an elliptic product
+ * using XYZ coordinates
+ */
+typedef struct theta_couple_jac_point
+{
+    jac_point_t P1; // component on the first curve E1
+    jac_point_t P2; // component on the second curve E2
+} theta_couple_jac_point_t;
+
+/** @brief Type for couple curve
+ * @typedef theta_couple_curve_t
+ *
+ * @struct theta_couple_curve
+ *
+ * Pair of elliptic curves (E1, E2) representing the elliptic product E1 x E2
+ */
+typedef struct theta_couple_curve
+{
+    ec_curve_t E1; // first factor of the product
+    ec_curve_t E2; // second factor of the product
+} theta_couple_curve_t;
+
+/** @brief Type for a product E1 x E2 with corresponding bases
+ * @typedef theta_couple_curve_with_basis_t
+ *
+ * @struct theta_couple_curve_with_basis
+ *
+ * Type for a product E1 x E2 with corresponding bases Ei[2^n]
+ */
+typedef struct theta_couple_curve_with_basis
+{
+    ec_curve_t E1; // first factor of the product
+    ec_curve_t E2; // second factor of the product
+    ec_basis_t B1; // basis on E1
+    ec_basis_t B2; // basis on E2
+} theta_couple_curve_with_basis_t;
+
+/** @brief Type for theta point
+ * @typedef theta_point_t
+ *
+ * @struct theta_point
+ *
+ * Theta point given by four fp2_t coordinates (x, y, z, t)
+ */
+typedef struct theta_point
+{
+    fp2_t x;
+    fp2_t y;
+    fp2_t z;
+    fp2_t t;
+} theta_point_t;
+
+/** @brief Type for theta point with repeating components
+ * @typedef theta_point_compact_t
+ *
+ * @struct theta_point_compact
+ *
+ * Theta point with repeated components, stored with only two coordinates (x, y)
+ */
+typedef struct theta_point_compact
+{
+    fp2_t x;
+    fp2_t y;
+} theta_point_compact_t;
+
+/** @brief Type for theta structure
+ * @typedef theta_structure_t
+ *
+ * @struct theta_structure
+ *
+ * Theta structure: a theta null point together with optional precomputed values
+ */
+typedef struct theta_structure
+{
+    theta_point_t null_point;
+    bool precomputation; // presumably set once the eight values below have been filled in -- TODO confirm
+
+    // Eight precomputed values used for doubling and
+    // (2,2)-isogenies.
+    fp2_t XYZ0;
+    fp2_t YZT0;
+    fp2_t XZT0;
+    fp2_t XYT0;
+
+    fp2_t xyz0;
+    fp2_t yzt0;
+    fp2_t xzt0;
+    fp2_t xyt0;
+} theta_structure_t;
+
+/** @brief A 2x2 matrix used for action by translation
+ * @typedef translation_matrix_t
+ *
+ * @struct translation_matrix
+ *
+ * Structure to hold 4 fp2_t elements representing a 2x2 matrix used when computing
+ * a compatible theta structure during gluing.
+ */
+typedef struct translation_matrix
+{
+    fp2_t g00; // presumably g<row><col>, i.e. entry (0, 0) -- TODO confirm the index order
+    fp2_t g01;
+    fp2_t g10;
+    fp2_t g11;
+} translation_matrix_t;
+
+/** @brief A 4x4 matrix used for basis changes
+ * @typedef basis_change_matrix_t
+ *
+ * @struct basis_change_matrix
+ *
+ * Structure to hold 16 elements representing a 4x4 matrix used for changing
+ * the basis of a theta point.
+ */
+typedef struct basis_change_matrix
+{
+    fp2_t m[4][4]; // the 16 matrix entries
+} basis_change_matrix_t;
+
+/** @brief Type for gluing (2,2) theta isogeny
+ * @typedef theta_gluing_t
+ *
+ * @struct theta_gluing
+ *
+ * Data attached to a gluing (2,2) isogeny whose domain is an elliptic product
+ */
+typedef struct theta_gluing
+{
+
+    theta_couple_curve_t domain;     // elliptic product the isogeny starts from
+    theta_couple_jac_point_t xyK1_8; // NOTE(review): name suggests an 8-torsion point in XYZ coordinates -- confirm
+    theta_point_compact_t imageK1_8; // NOTE(review): presumably the image of xyK1_8 -- confirm
+    basis_change_matrix_t M;         // 4x4 basis change matrix
+    theta_point_t precomputation;    // precomputed values stored as a theta point
+    theta_point_t codomain;          // presumably the theta null point of the codomain -- TODO confirm
+
+} theta_gluing_t;
+
+/** @brief Type for standard (2,2) theta isogeny
+ * @typedef theta_isogeny_t
+ *
+ * @struct theta_isogeny
+ *
+ * Data attached to a (2,2) isogeny between two theta structures
+ */
+typedef struct theta_isogeny
+{
+    theta_point_t T1_8; // NOTE(review): name suggests an 8-torsion point -- confirm
+    theta_point_t T2_8; // NOTE(review): name suggests an 8-torsion point -- confirm
+    bool hadamard_bool_1; // presumably selects whether a Hadamard transform is applied -- TODO confirm
+    bool hadamard_bool_2; // presumably selects whether a Hadamard transform is applied -- TODO confirm
+    theta_structure_t domain;     // theta structure the isogeny starts from
+    theta_point_t precomputation; // precomputed values stored as a theta point
+    theta_structure_t codomain;   // theta structure the isogeny maps to
+} theta_isogeny_t;
+
+/** @brief Type for splitting isomorphism
+ * @typedef theta_splitting_t
+ *
+ * @struct theta_splitting
+ *
+ * Splitting isomorphism: a 4x4 basis change matrix together with a theta structure
+ */
+typedef struct theta_splitting
+{
+    basis_change_matrix_t M; // basis change matrix of the isomorphism
+    theta_structure_t B;     // presumably the theta structure obtained after applying M -- TODO confirm
+
+} theta_splitting_t;
+
+// end of hd_struct
+/**
+ * @}
+ */
+
+/** @defgroup hd_functions Functions for dimension 2
+ * @{
+ */
+
+/**
+ * @brief Compute the double of the theta couple point in on the elliptic product E12
+ *
+ * @param out Output: the theta_couple_point
+ * @param in the theta couple point in the elliptic product
+ * @param E1E2 an elliptic product
+ * in = (P1,P2)
+ * out = [2] (P1,P2)
+ *
+ */
+void double_couple_point(theta_couple_point_t *out, const theta_couple_point_t *in, const theta_couple_curve_t *E1E2);
+
+/**
+ * @brief Compute the iterated double of the theta couple point in on the elliptic product E12
+ *
+ * @param out Output: the theta_couple_point
+ * @param n : the number of iteration
+ * @param E1E2 an elliptic product
+ * @param in the theta couple point in the elliptic product
+ * in = (P1,P2)
+ * out = [2^n] (P1,P2)
+ *
+ */
+void double_couple_point_iter(theta_couple_point_t *out,
+                              unsigned n,
+                              const theta_couple_point_t *in,
+                              const theta_couple_curve_t *E1E2);
+
+/**
+ * @brief Compute the addition of two points in (X : Y : Z) coordinates on the elliptic product E12
+ *
+ * @param out Output: the theta_couple_jac_point
+ * @param T1 the theta couple jac point in the elliptic product
+ * @param T2 the theta couple jac point in the elliptic product
+ * @param E1E2 an elliptic product
+ * in  = (P1, P2), (Q1, Q2)
+ * out = (P1 + Q1, P2 + Q2)
+ *
+ **/
+void add_couple_jac_points(theta_couple_jac_point_t *out,
+                           const theta_couple_jac_point_t *T1,
+                           const theta_couple_jac_point_t *T2,
+                           const theta_couple_curve_t *E1E2);
+
+/**
+ * @brief Compute the double of the theta couple jac point in on the elliptic product E12
+ *
+ * @param out Output: the theta_couple_jac_point
+ * @param in the theta couple jac point in the elliptic product
+ * @param E1E2 an elliptic product
+ * in = (P1,P2)
+ * out = [2] (P1,P2)
+ *
+ */
+void double_couple_jac_point(theta_couple_jac_point_t *out,
+                             const theta_couple_jac_point_t *in,
+                             const theta_couple_curve_t *E1E2);
+
+/**
+ * @brief Compute the iterated double of the theta couple jac point in on the elliptic product E12
+ *
+ * @param out Output: the theta_couple_jac_point
+ * @param n : the number of iteration
+ * @param in the theta couple jac point in the elliptic product
+ * @param E1E2 an elliptic product
+ * in  = (P1,P2)
+ * out = [2^n] (P1,P2)
+ *
+ */
+void double_couple_jac_point_iter(theta_couple_jac_point_t *out,
+                                  unsigned n,
+                                  const theta_couple_jac_point_t *in,
+                                  const theta_couple_curve_t *E1E2);
+
+/**
+ * @brief A forgetful function which returns (X : Z) points given a pair of (X : Y : Z) points
+ *
+ * @param P Output: the theta_couple_point
+ * @param xyP : the theta_couple_jac_point
+ **/
+void couple_jac_to_xz(theta_couple_point_t *P, const theta_couple_jac_point_t *xyP);
+
+/**
+ * @brief Compute a (2,2) isogeny chain in dimension 2 between elliptic
+ * products in the theta_model and evaluate at a list of points of the form
+ * (P1,0) or (0,P2). Returns 0 if the codomain fails to split (or there is
+ * an error during the computation) and 1 otherwise.
+ *
+ * @param n : the length of the isogeny chain
+ * @param E12 an elliptic curve product
+ * @param ker T1, T2 and T1-T2. couple points on E12[2^(n+2)]
+ * @param extra_torsion boolean indicating if we give the points in E12[2^n] or
+ * E12[2^(n+HD_extra_torsion)]
+ * @param E34 Output: the codomain curve
+ * @param P12 Input/Output: pointer to points to be pushed through the isogeny (in-place)
+ * @param numP: length of the list of points given in P12 (can be zero)
+ * @returns 1 on success 0 on failure
+ *
+ */
+int theta_chain_compute_and_eval(unsigned n,
+                                 /*const*/ theta_couple_curve_t *E12,
+                                 const theta_kernel_couple_points_t *ker,
+                                 bool extra_torsion,
+                                 theta_couple_curve_t *E34,
+                                 theta_couple_point_t *P12,
+                                 size_t numP);
+
+/**
+ * @brief Compute a (2,2) isogeny chain in dimension 2 between elliptic
+ * products in the theta_model and evaluate at a list of points of the form
+ * (P1,0) or (0,P2). Returns 0 if the codomain fails to split (or there is
+ * an error during the computation) and 1 otherwise.
+ * Compared to theta_chain_compute_and_eval, it does extra isotropy
+ * checks on the kernel.
+ *
+ * @param n : the length of the isogeny chain
+ * @param E12 an elliptic curve product
+ * @param ker T1, T2 and T1-T2. couple points on E12[2^(n+2)]
+ * @param extra_torsion boolean indicating if we give the points in E12[2^n] or
+ * E12[2^(n+HD_extra_torsion)]
+ * @param E34 Output: the codomain curve
+ * @param P12 Input/Output: pointer to points to be pushed through the isogeny (in-place)
+ * @param numP: length of the list of points given in P12 (can be zero)
+ * @returns 1 on success 0 on failure
+ *
+ */
+int theta_chain_compute_and_eval_verify(unsigned n,
+                                        /*const*/ theta_couple_curve_t *E12,
+                                        const theta_kernel_couple_points_t *ker,
+                                        bool extra_torsion,
+                                        theta_couple_curve_t *E34,
+                                        theta_couple_point_t *P12,
+                                        size_t numP);
+
+/**
+ * @brief Compute a (2,2) isogeny chain in dimension 2 between elliptic
+ * products in the theta_model and evaluate at a list of points of the form
+ * (P1,0) or (0,P2). Returns 0 if the codomain fails to split (or there is
+ * an error during the computation) and 1 otherwise.
+ * Compared to theta_chain_compute_and_eval, it selects a random Montgomery
+ * model of the codomain.
+ *
+ * @param n : the length of the isogeny chain
+ * @param E12 an elliptic curve product
+ * @param ker T1, T2 and T1-T2. couple points on E12[2^(n+2)]
+ * @param extra_torsion boolean indicating if we give the points in E12[2^n] or
+ * E12[2^(n+HD_extra_torsion)]
+ * @param E34 Output: the codomain curve
+ * @param P12 Input/Output: pointer to points to be pushed through the isogeny (in-place)
+ * @param numP: length of the list of points given in P12 (can be zero)
+ * @returns 1 on success, 0 on failure
+ *
+ */
+int theta_chain_compute_and_eval_randomized(unsigned n,
+                                            /*const*/ theta_couple_curve_t *E12,
+                                            const theta_kernel_couple_points_t *ker,
+                                            bool extra_torsion,
+                                            theta_couple_curve_t *E34,
+                                            theta_couple_point_t *P12,
+                                            size_t numP);
+
+/**
+ * @brief Given bases B1 on E1 and B2 on E2, copies them to create a kernel
+ *         on E1 x E2 as couple points T1, T2 and T1 - T2
+ *
+ * @param ker Output: a kernel for dim_two_isogenies (T1, T2, T1-T2)
+ * @param B1 Input basis on E1
+ * @param B2 Input basis on E2
+ **/
+void copy_bases_to_kernel(theta_kernel_couple_points_t *ker, const ec_basis_t *B1, const ec_basis_t *B2);
+
+/**
+ * @brief Given a couple of points (P1, P2) on a couple of curves (E1, E2)
+ * this function tests if both points are of order exactly 2^t
+ * (inline, so translation units that include this header without calling it do not warn)
+ * @param T: couple point (P1, P2)
+ * @param E: a couple of curves (E1, E2)
+ * @param t: exponent of the expected order 2^t
+ * @returns bitwise AND of the two per-curve checks: 0xFFFFFFFF on success, 0 on failure
+ */
+static inline int
+test_couple_point_order_twof(const theta_couple_point_t *T, const theta_couple_curve_t *E, int t)
+{
+    int check_P1 = test_point_order_twof(&T->P1, &E->E1, t);
+    int check_P2 = test_point_order_twof(&T->P2, &E->E2, t);
+
+    return check_P1 & check_P2;
+}
+
+// end of hd_functions
+/**
+ * @}
+ */
+// end of hd_module
+/**
+ * @}
+ */
+#endif
diff --git a/src/pqm4/sqisign_lvl3/ref/hd_splitting_transforms.c b/src/pqm4/sqisign_lvl3/ref/hd_splitting_transforms.c
new file mode 100644
index 0000000..d980d12
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/hd_splitting_transforms.c
@@ -0,0 +1,143 @@
+#include <hd_splitting_transforms.h>
+
+#define FP2_ZERO 0      // entry 0 of FP2_CONSTANTS: both components are zero
+#define FP2_ONE 1       // entry 1 of FP2_CONSTANTS: first component only (presumably the encoding of 1)
+#define FP2_I 2         // entry 2 of FP2_CONSTANTS: the same value in the second component (presumably i)
+#define FP2_MINUS_ONE 3 // entry 3 of FP2_CONSTANTS: first component only (presumably the encoding of -1)
+#define FP2_MINUS_I 4   // entry 4 of FP2_CONSTANTS: the same value in the second component (presumably -i)
+
+const int EVEN_INDEX[10][2] = {{0, 0}, {0, 1}, {0, 2}, {0, 3}, {1, 0}, {1, 2}, {2, 0}, {2, 1}, {3, 0}, {3, 3}}; // exactly the ten (i, j) with CHI_EVAL[i][j] == 1
+const int CHI_EVAL[4][4] = {{1, 1, 1, 1}, {1, -1, 1, -1}, {1, 1, -1, -1}, {1, -1, -1, 1}}; // CHI_EVAL[i][j] = (-1)^popcount(i & j)
+const fp2_t FP2_CONSTANTS[5] = {{
+#if 0
+#elif RADIX == 16
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 32
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#else
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#endif
+#endif
+, 
+#if 0
+#elif RADIX == 16
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 32
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#else
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#endif
+#endif
+}, {
+#if 0
+#elif RADIX == 16
+{0xfc, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2}
+#elif RADIX == 32
+{0x3f0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x10000}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x3, 0x0, 0x0, 0x0, 0x0, 0x3d00000000000000}
+#else
+{0x7, 0x0, 0x0, 0x0, 0x0, 0x0, 0xe400000000000}
+#endif
+#endif
+, 
+#if 0
+#elif RADIX == 16
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 32
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#else
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#endif
+#endif
+}, {
+#if 0
+#elif RADIX == 16
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 32
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#else
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#endif
+#endif
+, 
+#if 0
+#elif RADIX == 16
+{0xfc, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2}
+#elif RADIX == 32
+{0x3f0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x10000}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x3, 0x0, 0x0, 0x0, 0x0, 0x3d00000000000000}
+#else
+{0x7, 0x0, 0x0, 0x0, 0x0, 0x0, 0xe400000000000}
+#endif
+#endif
+}, {
+#if 0
+#elif RADIX == 16
+{0x1f03, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0xfff, 0x1e}
+#elif RADIX == 32
+{0xffffc0f, 0xfffffff, 0xfffffff, 0xfffffff, 0xfffffff, 0xfffffff, 0xfffffff, 0xfffffff, 0xfffffff, 0xfffffff, 0xfffffff, 0xfffffff, 0xfffffff, 0x30fff}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0xfffffffffffffffc, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x3ffffffffffffff}
+#else
+{0x7ffffffffffff8, 0x7fffffffffffff, 0x7fffffffffffff, 0x7fffffffffffff, 0x7fffffffffffff, 0x7fffffffffffff, 0x1ffffffffffff}
+#endif
+#endif
+, 
+#if 0
+#elif RADIX == 16
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 32
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#else
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#endif
+#endif
+}, {
+#if 0
+#elif RADIX == 16
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 32
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#else
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#endif
+#endif
+, 
+#if 0
+#elif RADIX == 16
+{0x1f03, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0xfff, 0x1e}
+#elif RADIX == 32
+{0xffffc0f, 0xfffffff, 0xfffffff, 0xfffffff, 0xfffffff, 0xfffffff, 0xfffffff, 0xfffffff, 0xfffffff, 0xfffffff, 0xfffffff, 0xfffffff, 0xfffffff, 0x30fff}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0xfffffffffffffffc, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x3ffffffffffffff}
+#else
+{0x7ffffffffffff8, 0x7fffffffffffff, 0x7fffffffffffff, 0x7fffffffffffff, 0x7fffffffffffff, 0x7fffffffffffff, 0x1ffffffffffff}
+#endif
+#endif
+}};
+const precomp_basis_change_matrix_t SPLITTING_TRANSFORMS[10] = {{{{FP2_ONE, FP2_I, FP2_ONE, FP2_I}, {FP2_ONE, FP2_MINUS_I, FP2_MINUS_ONE, FP2_I}, {FP2_ONE, FP2_I, FP2_MINUS_ONE, FP2_MINUS_I}, {FP2_MINUS_ONE, FP2_I, FP2_MINUS_ONE, FP2_I}}}, {{{FP2_ONE, FP2_ZERO, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ZERO, FP2_ONE}, {FP2_ZERO, FP2_ZERO, FP2_ONE, FP2_ZERO}, {FP2_ZERO, FP2_MINUS_ONE, FP2_ZERO, FP2_ZERO}}}, {{{FP2_ONE, FP2_ZERO, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ONE, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ZERO, FP2_ONE}, {FP2_ZERO, FP2_ZERO, FP2_MINUS_ONE, FP2_ZERO}}}, {{{FP2_ONE, FP2_ZERO, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ONE, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ONE, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ZERO, FP2_MINUS_ONE}}}, {{{FP2_ONE, FP2_ONE, FP2_ONE, FP2_ONE}, {FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE, FP2_ONE}, {FP2_ONE, FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE}, {FP2_MINUS_ONE, FP2_ONE, FP2_MINUS_ONE, FP2_ONE}}}, {{{FP2_ONE, FP2_ZERO, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ONE, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ZERO, FP2_ONE}, {FP2_ZERO, FP2_ZERO, FP2_ONE, FP2_ZERO}}}, {{{FP2_ONE, FP2_ONE, FP2_ONE, FP2_ONE}, {FP2_ONE, FP2_MINUS_ONE, FP2_ONE, FP2_MINUS_ONE}, {FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE, FP2_ONE}, {FP2_MINUS_ONE, FP2_MINUS_ONE, FP2_ONE, FP2_ONE}}}, {{{FP2_ONE, FP2_ONE, FP2_ONE, FP2_ONE}, {FP2_ONE, FP2_MINUS_ONE, FP2_ONE, FP2_MINUS_ONE}, {FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE, FP2_ONE}, {FP2_ONE, FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE}}}, {{{FP2_ONE, FP2_ONE, FP2_ONE, FP2_ONE}, {FP2_ONE, FP2_MINUS_ONE, FP2_ONE, FP2_MINUS_ONE}, {FP2_ONE, FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE}, {FP2_MINUS_ONE, FP2_ONE, FP2_ONE, FP2_MINUS_ONE}}}, {{{FP2_ONE, FP2_ZERO, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ONE, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ONE, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ZERO, FP2_ONE}}}};
+const precomp_basis_change_matrix_t NORMALIZATION_TRANSFORMS[6] = {{{{FP2_ONE, FP2_ZERO, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ONE, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ONE, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ZERO, FP2_ONE}}}, {{{FP2_ZERO, FP2_ZERO, FP2_ZERO, FP2_ONE}, {FP2_ZERO, FP2_ZERO, FP2_ONE, FP2_ZERO}, {FP2_ZERO, FP2_ONE, FP2_ZERO, FP2_ZERO}, {FP2_ONE, FP2_ZERO, FP2_ZERO, FP2_ZERO}}}, {{{FP2_ONE, FP2_ONE, FP2_ONE, FP2_ONE}, {FP2_ONE, FP2_MINUS_ONE, FP2_ONE, FP2_MINUS_ONE}, {FP2_ONE, FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE}, {FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE, FP2_ONE}}}, {{{FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE, FP2_ONE}, {FP2_MINUS_ONE, FP2_MINUS_ONE, FP2_ONE, FP2_ONE}, {FP2_MINUS_ONE, FP2_ONE, FP2_MINUS_ONE, FP2_ONE}, {FP2_ONE, FP2_ONE, FP2_ONE, FP2_ONE}}}, {{{FP2_MINUS_ONE, FP2_I, FP2_I, FP2_ONE}, {FP2_I, FP2_MINUS_ONE, FP2_ONE, FP2_I}, {FP2_I, FP2_ONE, FP2_MINUS_ONE, FP2_I}, {FP2_ONE, FP2_I, FP2_I, FP2_MINUS_ONE}}}, {{{FP2_ONE, FP2_I, FP2_I, FP2_MINUS_ONE}, {FP2_I, FP2_ONE, FP2_MINUS_ONE, FP2_I}, {FP2_I, FP2_MINUS_ONE, FP2_ONE, FP2_I}, {FP2_MINUS_ONE, FP2_I, FP2_I, FP2_ONE}}}};
diff --git a/src/pqm4/sqisign_lvl3/ref/hd_splitting_transforms.h b/src/pqm4/sqisign_lvl3/ref/hd_splitting_transforms.h
new file mode 100644
index 0000000..b3147a4
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/hd_splitting_transforms.h
@@ -0,0 +1,18 @@
+#ifndef HD_SPLITTING_H
+#define HD_SPLITTING_H
+
+#include <hd.h>
+#include <stdint.h>
+
+typedef struct precomp_basis_change_matrix {
+    uint8_t m[4][4]; // 4x4 matrix of small codes; the tables in hd_splitting_transforms.c fill it with FP2_* values
+} precomp_basis_change_matrix_t;
+
+extern const int EVEN_INDEX[10][2];
+extern const int CHI_EVAL[4][4];
+extern const fp2_t FP2_CONSTANTS[5];
+extern const precomp_basis_change_matrix_t SPLITTING_TRANSFORMS[10];
+extern const precomp_basis_change_matrix_t NORMALIZATION_TRANSFORMS[6];
+
+#endif
+
diff --git a/src/pqm4/sqisign_lvl3/ref/isog.h b/src/pqm4/sqisign_lvl3/ref/isog.h
new file mode 100644
index 0000000..b251ca3
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/isog.h
@@ -0,0 +1,28 @@
+#ifndef _ISOG_H_
+#define _ISOG_H_
+#include <sqisign_namespace.h>
+#include <ec.h>
+
+/* KPS structure for isogenies of degree 2 or 4 */
+typedef struct
+{
+    ec_point_t K; // the single point kept for a degree-2 step
+} ec_kps2_t;
+typedef struct
+{
+    ec_point_t K[3]; // the three points kept for a degree-4 step
+} ec_kps4_t;
+
+void xisog_2(ec_kps2_t *kps, ec_point_t *B, const ec_point_t P); // degree-2 isogeny construction
+void xisog_2_singular(ec_kps2_t *kps, ec_point_t *B24, ec_point_t A24);
+
+void xisog_4(ec_kps4_t *kps, ec_point_t *B, const ec_point_t P); // degree-4 isogeny construction
+void xisog_4_singular(ec_kps4_t *kps, ec_point_t *B24, const ec_point_t P, ec_point_t A24);
+
+void xeval_2(ec_point_t *R, ec_point_t *const Q, const int lenQ, const ec_kps2_t *kps);
+void xeval_2_singular(ec_point_t *R, const ec_point_t *Q, const int lenQ, const ec_kps2_t *kps);
+
+void xeval_4(ec_point_t *R, const ec_point_t *Q, const int lenQ, const ec_kps4_t *kps);
+void xeval_4_singular(ec_point_t *R, const ec_point_t *Q, const int lenQ, const ec_point_t P, const ec_kps4_t *kps);
+
+#endif
diff --git a/src/pqm4/sqisign_lvl3/ref/isog_chains.c b/src/pqm4/sqisign_lvl3/ref/isog_chains.c
new file mode 100644
index 0000000..abc9808
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/isog_chains.c
@@ -0,0 +1,241 @@
+#include "isog.h"
+#include <assert.h>
+
+// Chain of 4-isogeny steps, plus one final 2-isogeny when isog_len is odd; returns 0 on success, (uint32_t)-1 on error
+static uint32_t
+ec_eval_even_strategy(ec_curve_t *curve,        // in: domain curve, out: image curve written back as (A:C)
+                      ec_point_t *points,       // points pushed through every step, in place
+                      unsigned len_points,      // number of entries in points
+                      const ec_point_t *kernel, // kernel point; its order is only checked before the first step
+                      const int isog_len)       // number of degree-2 steps the chain is worth
+{
+    ec_curve_normalize_A24(curve);
+    ec_point_t A24;
+    copy_point(&A24, &curve->A24);
+
+    int space = 1; // stack depth: 1 + ceil(log2(isog_len)) for isog_len >= 1
+    for (int i = 1; i < isog_len; i *= 2)
+        ++space;
+
+    // Stack of remaining kernel points and their associated orders
+    ec_point_t splits[space];
+    uint16_t todo[space];
+    splits[0] = *kernel;
+    todo[0] = isog_len;
+
+    int current = 0; // Pointer to current top of stack
+
+    // Chain of 4-isogenies
+    for (int j = 0; j < isog_len / 2; ++j) {
+        assert(current >= 0);
+        assert(todo[current] >= 1);
+        // Get the next point of order 4
+        while (todo[current] != 2) {
+            assert(todo[current] >= 3);
+            // A new split will be added
+            ++current;
+            assert(current < space);
+            // We set the seed of the new split to be computed and saved
+            copy_point(&splits[current], &splits[current - 1]);
+            // if we copied from the very first element, then we perform one additional doubling
+            unsigned num_dbls = todo[current - 1] / 4 * 2 + todo[current - 1] % 2;
+            todo[current] = todo[current - 1] - num_dbls;
+            while (num_dbls--)
+                xDBL_A24(&splits[current], &splits[current], &A24, false);
+        }
+
+        if (j == 0) {
+            assert(fp2_is_one(&A24.z));
+            if (!ec_is_four_torsion(&splits[current], curve))
+                return (uint32_t)-1; // explicit cast, spelled as in ec_eval_small_chain
+
+            ec_point_t T;
+            xDBL_A24(&T, &splits[current], &A24, false);
+            if (fp2_is_zero(&T.x))
+                return (uint32_t)-1; // special isogenies not allowed
+        } else {
+            assert(todo[current] == 2);
+#ifndef NDEBUG
+            if (fp2_is_zero(&splits[current].z))
+                debug_print("splitting point z coordinate is unexpectedly zero");
+
+            ec_point_t test;
+            xDBL_A24(&test, &splits[current], &A24, false);
+            if (fp2_is_zero(&test.z))
+                debug_print("z coordinate is unexpectedly zero before doubling");
+            xDBL_A24(&test, &test, &A24, false);
+            if (!fp2_is_zero(&test.z))
+                debug_print("z coordinate is unexpectedly not zero after doubling");
+#endif
+        }
+
+        // Evaluate 4-isogeny
+        ec_kps4_t kps4;
+        xisog_4(&kps4, &A24, splits[current]);
+        xeval_4(splits, splits, current, &kps4);
+        for (int i = 0; i < current; ++i)
+            todo[i] -= 2;
+        xeval_4(points, points, len_points, &kps4);
+
+        --current;
+    }
+    assert(isog_len % 2 ? !current : current == -1);
+
+    // Final 2-isogeny
+    if (isog_len % 2) {
+#ifndef NDEBUG
+        if (fp2_is_zero(&splits[0].z))
+            debug_print("splitting point z coordinate is unexpectedly zero");
+        ec_point_t test;
+        copy_point(&test, &splits[0]);
+        xDBL_A24(&test, &test, &A24, false);
+        if (!fp2_is_zero(&test.z))
+            debug_print("z coordinate is unexpectedly not zero after doubling");
+#endif
+
+        // We need to check the order of this point in case there were no 4-isogenies
+        if (isog_len == 1 && !ec_is_two_torsion(&splits[0], curve))
+            return (uint32_t)-1;
+        if (fp2_is_zero(&splits[0].x)) {
+            // special isogenies not allowed
+            // this case can only happen if isog_len == 1; otherwise the
+            // previous 4-isogenies we computed ensure that $T=(0:1)$ is put
+            // as the kernel of the dual isogeny
+            return (uint32_t)-1;
+        }
+
+        ec_kps2_t kps2;
+        xisog_2(&kps2, &A24, splits[0]);
+        xeval_2(points, points, len_points, &kps2);
+    }
+
+    // Output curve in the form (A:C)
+    A24_to_AC(curve, &A24);
+
+    curve->is_A24_computed_and_normalized = false;
+
+    return 0;
+}
+
+uint32_t
+ec_eval_even(ec_curve_t *image, ec_isog_even_t *phi, ec_point_t *points, unsigned len_points)
+{ // thin wrapper: run ec_eval_even_strategy on a copy of phi's curve, with phi's kernel and length
+    copy_curve(image, &phi->curve); // image starts as phi's curve; the strategy then rewrites it in place
+    return ec_eval_even_strategy(image, points, len_points, &phi->kernel, phi->length);
+}
+
+// Naive chain: one degree-2 step per iteration, recomputing the step kernel by repeated doubling each time
+uint32_t
+ec_eval_small_chain(ec_curve_t *curve,
+                    const ec_point_t *kernel,
+                    int len,
+                    ec_point_t *points,
+                    unsigned len_points,
+                    bool special) // do we allow special isogenies?
+{
+    ec_kps2_t kps;
+    ec_point_t A24, step_ker, chain_ker;
+
+    AC_to_A24(&A24, curve);
+    copy_point(&chain_ker, kernel);
+
+    for (int step = 0; step < len; step++) {
+        // Double a copy of the running kernel point (len - step - 1) times to get this step's kernel
+        copy_point(&step_ker, &chain_ker);
+        for (int dbls = len - step - 1; dbls > 0; dbls--) {
+            xDBL_A24(&step_ker, &step_ker, &A24, false);
+        }
+        // Check the order of the point before the first isogeny step
+        if (step == 0 && !ec_is_two_torsion(&step_ker, curve))
+            return (uint32_t)-1;
+
+        if (!fp2_is_zero(&step_ker.x)) {
+            // Generic step
+            xisog_2(&kps, &A24, step_ker);
+            xeval_2(&chain_ker, &chain_ker, 1, &kps);
+            xeval_2(points, points, len_points, &kps);
+            continue;
+        }
+
+        // Step kernel has x == 0: special isogeny, refused unless the caller opted in
+        if (!special)
+            return (uint32_t)-1;
+        ec_point_t B24;
+        xisog_2_singular(&kps, &B24, A24);
+        xeval_2_singular(&chain_ker, &chain_ker, 1, &kps);
+        xeval_2_singular(points, points, len_points, &kps);
+        copy_point(&A24, &B24);
+    }
+
+    // Output curve in the form (A:C); flag its A24 field as not computed
+    A24_to_AC(curve, &A24);
+    curve->is_A24_computed_and_normalized = false;
+    return 0;
+}
+
+uint32_t
+ec_isomorphism(ec_isom_t *isom, const ec_curve_t *from, const ec_curve_t *to)
+{ // fills isom->Nx, isom->Nz, isom->D from the (A : C) coefficients of both curves
+    fp2_t t0, t1, t2, t3, t4;
+
+    fp2_mul(&t0, &from->A, &from->C); // fromA*fromC
+    fp2_mul(&t1, &to->A, &to->C);     // toA*toC
+
+    fp2_mul(&t2, &t1, &to->C); // toA*toC^2
+    fp2_add(&t3, &t2, &t2);    // 2*toA*toC^2
+    fp2_add(&t3, &t3, &t3);    // 4*toA*toC^2
+    fp2_add(&t3, &t3, &t3);    // 8*toA*toC^2
+    fp2_add(&t2, &t2, &t3); // 9*toA*toC^2
+    fp2_sqr(&t3, &to->A);
+    fp2_mul(&t3, &t3, &to->A); // toA^3
+    fp2_add(&t3, &t3, &t3);    // 2*toA^3
+    fp2_sub(&isom->Nx, &t3, &t2); // 2*toA^3-9*toA*toC^2
+    fp2_mul(&t2, &t0, &from->A);  // fromA^2*fromC
+    fp2_sqr(&t3, &from->C);
+    fp2_mul(&t3, &t3, &from->C); // fromC^3
+    fp2_add(&t4, &t3, &t3);      // 2*fromC^3
+    fp2_add(&t3, &t4, &t3);             // 3*fromC^3
+    fp2_sub(&t3, &t3, &t2);             // 3*fromC^3-fromA^2*fromC
+    fp2_mul(&isom->Nx, &isom->Nx, &t3); // lambda_x = (2*toA^3-9*toA*toC^2)*(3*fromC^3-fromA^2*fromC)
+
+    fp2_mul(&t2, &t0, &from->C); // fromA*fromC^2
+    fp2_add(&t3, &t2, &t2);      // 2*fromA*fromC^2
+    fp2_add(&t3, &t3, &t3);      // 4*fromA*fromC^2
+    fp2_add(&t3, &t3, &t3);      // 8*fromA*fromC^2
+    fp2_add(&t2, &t2, &t3); // 9*fromA*fromC^2
+    fp2_sqr(&t3, &from->A);
+    fp2_mul(&t3, &t3, &from->A); // fromA^3
+    fp2_add(&t3, &t3, &t3);      // 2*fromA^3
+    fp2_sub(&isom->D, &t3, &t2); // 2*fromA^3-9*fromA*fromC^2
+    fp2_mul(&t2, &t1, &to->A);   // toA^2*toC
+    fp2_sqr(&t3, &to->C);
+    fp2_mul(&t3, &t3, &to->C); // toC^3
+    fp2_add(&t4, &t3, &t3);    // 2*toC^3
+    fp2_add(&t3, &t4, &t3);           // 3*toC^3
+    fp2_sub(&t3, &t3, &t2);           // 3*toC^3-toA^2*toC
+    fp2_mul(&isom->D, &isom->D, &t3); // lambda_z = (2*fromA^3-9*fromA*fromC^2)*(3*toC^3-toA^2*toC)
+
+    // Mont -> SW -> SW -> Mont
+    fp2_mul(&t0, &to->C, &from->A);
+    fp2_mul(&t0, &t0, &isom->Nx); // lambda_x*toC*fromA
+    fp2_mul(&t1, &from->C, &to->A);
+    fp2_mul(&t1, &t1, &isom->D);  // lambda_z*fromC*toA
+    fp2_sub(&isom->Nz, &t0, &t1); // lambda_x*toC*fromA - lambda_z*fromC*toA
+    fp2_mul(&t0, &from->C, &to->C);
+    fp2_add(&t1, &t0, &t0);             // 2*fromC*toC
+    fp2_add(&t0, &t0, &t1);             // 3*fromC*toC
+    fp2_mul(&isom->D, &isom->D, &t0);   // 3*lambda_z*fromC*toC
+    fp2_mul(&isom->Nx, &isom->Nx, &t0); // 3*lambda_x*fromC*toC
+
+    return (fp2_is_zero(&isom->Nx) | fp2_is_zero(&isom->D)); // OR of the two zero tests on Nx and D
+}
+
+void
+ec_iso_eval(ec_point_t *P, ec_isom_t *isom)
+{ // in place: (x : z) -> (Nx*x + Nz*z : D*z)
+    fp2_t z_term;
+    fp2_mul(&z_term, &P->z, &isom->Nz);
+    fp2_mul(&P->x, &P->x, &isom->Nx);
+    fp2_add(&P->x, &P->x, &z_term);
+    fp2_mul(&P->z, &P->z, &isom->D);
+}
diff --git a/src/pqm4/sqisign_lvl3/ref/mp.c b/src/pqm4/sqisign_lvl3/ref/mp.c
new file mode 100644
index 0000000..27f4a96
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/mp.c
@@ -0,0 +1,357 @@
+#include <mp.h>
+#include <assert.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+// double-wide multiplication: out[0] receives the low digit and out[1] the high digit of a*b
+void
+MUL(digit_t *out, const digit_t a, const digit_t b)
+{
+#ifdef RADIX_32
+    uint64_t r = (uint64_t)a * b; // widen one operand first so the full 64-bit product is kept
+    out[0] = r & 0xFFFFFFFFUL;
+    out[1] = r >> 32;
+
+#elif defined(RADIX_64) && defined(_MSC_VER)
+    uint64_t umul_hi;
+    out[0] = _umul128(a, b, &umul_hi);
+    out[1] = umul_hi;
+
+#elif defined(RADIX_64) && defined(HAVE_UINT128)
+    unsigned __int128 umul_tmp;
+    umul_tmp = (unsigned __int128)(a) * (unsigned __int128)(b);
+    out[0] = (uint64_t)umul_tmp;
+    out[1] = (uint64_t)(umul_tmp >> 64);
+
+#else // fallback used when none of the configurations above applies: schoolbook product of half-digits
+    register digit_t al, ah, bl, bh, temp;
+    digit_t albl, albh, ahbl, ahbh, res1, res2, res3, carry;
+    digit_t mask_low = (digit_t)(-1) >> (sizeof(digit_t) * 4), mask_high = (digit_t)(-1) << (sizeof(digit_t) * 4);
+    al = a & mask_low;               // Low part
+    ah = a >> (sizeof(digit_t) * 4); // High part
+    bl = b & mask_low;
+    bh = b >> (sizeof(digit_t) * 4);
+
+    albl = al * bl;
+    albh = al * bh;
+    ahbl = ah * bl;
+    ahbh = ah * bh;
+    out[0] = albl & mask_low; // out00
+
+    res1 = albl >> (sizeof(digit_t) * 4);
+    res2 = ahbl & mask_low;
+    res3 = albh & mask_low;
+    temp = res1 + res2 + res3; // three half-digit values, so the sum cannot overflow a digit
+    carry = temp >> (sizeof(digit_t) * 4);
+    out[0] ^= temp << (sizeof(digit_t) * 4); // out01
+
+    res1 = ahbl >> (sizeof(digit_t) * 4);
+    res2 = albh >> (sizeof(digit_t) * 4);
+    res3 = ahbh & mask_low;
+    temp = res1 + res2 + res3 + carry;
+    out[1] = temp & mask_low; // out10
+    carry = temp & mask_high; // kept in the upper half so it adds straight into out11
+    out[1] ^= (ahbh & mask_high) + carry; // out11
+
+#endif
+}
+
+void
+mp_add(digit_t *c, const digit_t *a, const digit_t *b, const unsigned int nwords)
+{ // c <- a + b over nwords digits; the carry out of the top digit is discarded
+    unsigned int cy = 0;
+
+    for (unsigned int k = 0; k != nwords; k++) {
+        ADDC(c[k], cy, a[k], b[k], cy);
+    }
+}
+
+digit_t
+mp_shiftr(digit_t *x, const unsigned int shift, const unsigned int nwords)
+{ // In-place right shift by 1...RADIX-1 bits; returns bit 0 of x as it was before the shift
+    const unsigned int top = nwords - 1;
+    const digit_t lsb = x[0] & 1;
+    for (unsigned int k = 0; k < top; k++) {
+        SHIFTR(x[k + 1], x[k], shift, x[k], RADIX);
+    }
+    x[top] >>= shift;
+    return lsb;
+}
+
+void
+mp_shiftl(digit_t *x, const unsigned int shift, const unsigned int nwords)
+{ // In-place left shift by 1...RADIX-1 bits; bits pushed out of the top digit are lost
+    int k = nwords - 1; // start at the top digit so each x[k - 1] is read before it is itself shifted
+    for (; k > 0; k--) {
+        SHIFTL(x[k], x[k - 1], shift, x[k], RADIX);
+    }
+    x[0] <<= shift;
+}
+
+void
+multiple_mp_shiftl(digit_t *x, const unsigned int shift, const unsigned int nwords)
+{ // In-place left shift by any number of bits, done in steps of at most RADIX-1; shift = 0 is a no-op
+    unsigned int remaining = shift;
+    while (remaining > 0) {
+        const unsigned int step = (remaining > RADIX - 1) ? (RADIX - 1) : remaining; // mp_shiftl takes 1...RADIX-1
+        mp_shiftl(x, step, nwords);
+        remaining -= step;
+    }
+}
+
+// The below functions were taken from the EC module
+
+void
+mp_sub(digit_t *c, const digit_t *a, const digit_t *b, const unsigned int nwords)
+{ // c <- a - b over nwords digits, assuming a > b; the borrow out of the top digit is discarded
+    unsigned int bw = 0;
+
+    for (unsigned int k = 0; k != nwords; k++) {
+        SUBC(c[k], bw, a[k], b[k], bw);
+    }
+}
+
+void
+select_ct(digit_t *c, const digit_t *a, const digit_t *b, const digit_t mask, const int nwords)
+{ // Branch-free select: c <- a if mask = 0, c <- b if mask = 1...1
+    int k = 0;
+    for (; k < nwords; k++) {
+        c[k] = a[k] ^ (mask & (a[k] ^ b[k]));
+    }
+}
+
+void
+swap_ct(digit_t *a, digit_t *b, const digit_t option, const int nwords)
+{ // Masked swap of two nwords-digit arrays, with no branch on option
+  // option = 0 leaves a and b unchanged; option = 0xFF...FF exchanges them
+    int k = 0;
+    while (k < nwords) {
+        const digit_t diff = (a[k] ^ b[k]) & option; // a^b where the mask is set, 0 elsewhere
+        a[k] ^= diff;
+        b[k] ^= diff;
+        k++;
+    }
+}
+
+int
+mp_compare(const digit_t *a, const digit_t *b, unsigned int nwords)
+{ // Three-way comparison from the most significant digit down: 1 if a>b, 0 if a=b, -1 if a<b
+    int k = nwords - 1;
+    while (k >= 0) {
+        if (a[k] != b[k]) {
+            return (a[k] > b[k]) ? 1 : -1;
+        }
+        k--;
+    }
+    return 0;
+}
+
+bool
+mp_is_zero(const digit_t *a, unsigned int nwords)
+{ // Zero test without an early exit: OR every digit together, then test the accumulator once
+  // Returns 1 (true) if a=0, 0 (false) otherwise
+    digit_t acc = 0;
+
+    for (unsigned int k = 0; k < nwords; k++) {
+        acc |= a[k];
+    }
+    return (bool)is_digit_zero_ct(acc);
+}
+
+void
+mp_mul2(digit_t *c, const digit_t *a, const digit_t *b)
+{ // Multiprecision multiplication fixed to two-digit operands; writes four digits to c
+    unsigned int carry = 0;
+    digit_t t0[2], t1[2], t2[2];
+    // NOTE(review): only a[0]*b[0], a[0]*b[1] and a[1]*b[1] are formed; a[1]*b[0] never is - confirm this is intended
+    MUL(t0, a[0], b[0]);
+    MUL(t1, a[0], b[1]);
+    ADDC(t0[1], carry, t0[1], t1[0], carry); // fold the low digit of a[0]*b[1] into result digit 1
+    ADDC(t1[1], carry, 0, t1[1], carry);     // push the carry into the high digit of a[0]*b[1]
+    MUL(t2, a[1], b[1]);
+    ADDC(t2[0], carry, t2[0], t1[1], carry); // fold that high digit into result digit 2
+    ADDC(t2[1], carry, 0, t2[1], carry);     // final carry goes into result digit 3
+    c[0] = t0[0];
+    c[1] = t0[1];
+    c[2] = t2[0];
+    c[3] = t2[1];
+}
+
+void
+mp_print(const digit_t *a, size_t nwords)
+{ // Prints a as a single hexadecimal literal, most significant digit first, with no trailing newline
+    printf("0x");
+    for (size_t k = nwords; k-- > 0;) {
+#ifdef RADIX_32
+        printf("%08" PRIx32, a[k]); // Print each word with 8 hex digits
+#elif defined(RADIX_64)
+        printf("%016" PRIx64, a[k]); // Print each word with 16 hex digits
+#endif
+    }
+}
+
+void
+mp_copy(digit_t *b, const digit_t *a, size_t nwords)
+{ // b <- a, copying nwords digits in ascending index order
+    const digit_t *const end = a + nwords;
+    while (a != end) {
+        *b++ = *a++;
+    }
+}
+
+void
+mp_mul(digit_t *c, const digit_t *a, const digit_t *b, size_t nwords)
+{
+    // Multiprecision multiplication, c = a*b, for nwords-digit inputs, truncated to an nwords-digit output
+    // the higher half of the product is never computed, as it is not needed in our applications
+    digit_t carry, UV[2], t[nwords], cc[nwords]; // the result is built in cc and copied out last, so c may alias a or b
+    // NOTE(review): MUL below stores two digits into t, which needs nwords >= 2 - confirm no caller passes nwords == 1
+    for (size_t i = 0; i < nwords; i++) {
+        cc[i] = 0;
+    }
+
+    for (size_t i = 0; i < nwords; i++) {
+        // t <- low nwords digits of a[i] * b
+        MUL(t, a[i], b[0]);
+
+        for (size_t j = 1; j < nwords - 1; j++) {
+            MUL(UV, a[i], b[j]);
+            ADDC(t[j], carry, t[j], UV[0], 0);
+            t[j + 1] = UV[1] + carry; // high digit of this partial product seeds the next position
+        }
+
+        int j = nwords - 1;
+        MUL(UV, a[i], b[j]);
+        ADDC(t[j], carry, t[j], UV[0], 0); // top position: the high digit UV[1] and the carry are dropped
+        // accumulate t at digit offset i; only the nwords - i digits that still fit are added
+        mp_add(&cc[i], &cc[i], t, nwords - i);
+    }
+
+    mp_copy(c, cc, nwords);
+}
+
+void
+mp_mod_2exp(digit_t *a, unsigned int e, unsigned int nwords)
+{ // Reduces a modulo 2^e in place by clearing every bit from position e upwards
+    const unsigned int word = e >> LOG2RADIX; // index of the digit that holds bit e
+    const unsigned int bit = e & (RADIX - 1); // offset of bit e inside that digit
+    if (word >= nwords) {
+        return; // bit e lies beyond the top digit, so there is nothing to clear
+    }
+    a[word] &= ((digit_t)1 << bit) - 1;
+    for (unsigned int k = word + 1; k < nwords; k++) {
+        a[k] = 0;
+    }
+}
+
+void
+mp_neg(digit_t *a, unsigned int nwords)
+{ // negates a in place (two's complement over nwords digits): a <- ~a + 1, with the +1 carried across digits
+    digit_t carry = 1;
+    for (size_t i = 0; i < nwords; i++) {
+        a[i] = (digit_t)(~a[i] + carry);
+        carry &= is_digit_zero_ct(a[i]); // the +1 keeps rippling only while the digit it reached wrapped to 0
+    }
+}
+
+bool
+mp_is_one(const digit_t *x, unsigned int nwords)
+{ // True exactly when the lowest digit is 1 and every higher digit is 0
+    size_t k = 1;
+
+    if (x[0] != 1) {
+        return false;
+    }
+    while (k < nwords && x[k] == 0) {
+        k++;
+    }
+    // the scan gets to nwords only if no higher digit was non-zero
+    return k >= nwords;
+}
+
+void
+mp_inv_2e(digit_t *b, const digit_t *a, int e, unsigned int nwords)
+{ // Inversion modulo 2^e, using Newton's method and Hensel lifting
+    // the computation runs modulo 2^w, with w the first power of two (at least 2) that is >= e
+    // requires a to be odd, of course
+    // returns b such that a*b = 1 mod 2^e; b may alias a because a is copied into aa first
+    assert((a[0] & 1) == 1);
+
+    digit_t x[nwords], y[nwords], aa[nwords], mp_one[nwords], tmp[nwords];
+    mp_copy(aa, a, nwords);
+
+    mp_one[0] = 1;
+    for (unsigned int i = 1; i < nwords; i++) {
+        mp_one[i] = 0;
+    }
+
+    int p = 1;
+    while ((1 << p) < e) {
+        p++;
+    }
+    p -= 2; // using k = 4 for initial inverse
+    int w = (1 << (p + 2));
+
+    mp_mod_2exp(aa, w, nwords);
+    mp_add(x, aa, aa, nwords);
+    mp_add(x, x, aa, nwords);  // should be 3a
+    x[0] ^= (1 << 1);          // so that x equals (3a) xor 2
+    mp_mod_2exp(x, w, nwords); // now x*a = 1 mod 2^4, which we lift
+
+    mp_mul(tmp, aa, x, nwords);
+    mp_neg(tmp, nwords);
+    mp_add(y, mp_one, tmp, nwords); // y = 1 - a*x
+
+    // Hensel lifting for p rounds
+    for (int i = 0; i < p; i++) {
+        mp_add(tmp, mp_one, y, nwords); // tmp = 1 + y
+        mp_mul(x, x, tmp, nwords);      // x <- x*(1 + y)
+        mp_mul(y, y, y, nwords);        // y <- y^2
+    }
+
+    mp_mod_2exp(x, w, nwords);
+    mp_copy(b, x, nwords);
+
+    // verify results: the multiplication and reduction run even with NDEBUG, only the assert itself is compiled out
+    mp_mul(x, x, aa, nwords);
+    mp_mod_2exp(x, w, nwords);
+    assert(mp_is_one(x, nwords));
+}
+
+void
+mp_invert_matrix(digit_t *r1, digit_t *r2, digit_t *s1, digit_t *s2, int e, unsigned int nwords)
+{
+    // given a matrix ( ( a, b ), (c,  d) ) = ( ( r1, r2 ), ( s1, s2 ) ) of values mod 2^e
+    // overwrites it in place with the inverse matrix gamma ( (d, -b), (-c, a) )
+    // where gamma is the inverse of the determinant a*d - b*c
+    // assumes the matrix is invertible, otherwise the inversion of the determinant fails
+    // outputs are reduced modulo 2^w, with w the first power of two (at least 2) that is >= e
+    int p = 1;
+    while ((1 << p) < e) {
+        p++;
+    }
+    int w = (1 << (p));
+
+    digit_t det[nwords], tmp[nwords], resa[nwords], resb[nwords], resc[nwords], resd[nwords];
+    mp_mul(tmp, r1, s2, nwords);
+    mp_mul(det, r2, s1, nwords);
+    mp_sub(det, tmp, det, nwords); // det = r1*s2 - r2*s1
+    mp_inv_2e(det, det, e, nwords); // det now holds gamma
+
+    mp_mul(resa, det, s2, nwords);
+    mp_mul(resb, det, r2, nwords);
+    mp_mul(resc, det, s1, nwords);
+    mp_mul(resd, det, r1, nwords);
+
+    mp_neg(resb, nwords);
+    mp_neg(resc, nwords);
+
+    mp_mod_2exp(resa, w, nwords);
+    mp_mod_2exp(resb, w, nwords);
+    mp_mod_2exp(resc, w, nwords);
+    mp_mod_2exp(resd, w, nwords);
+
+    mp_copy(r1, resa, nwords); // results are written back only after every input has been consumed
+    mp_copy(r2, resb, nwords);
+    mp_copy(s1, resc, nwords);
+    mp_copy(s2, resd, nwords);
+}
diff --git a/src/pqm4/sqisign_lvl3/ref/mp.h b/src/pqm4/sqisign_lvl3/ref/mp.h
new file mode 100644
index 0000000..b3733b5
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/mp.h
@@ -0,0 +1,88 @@
+#ifndef MP_H
+#define MP_H
+
+#include <sqisign_namespace.h>
+#include <stdbool.h>
+#include <tutil.h>
+
+// Functions taken from the GF module
+
+void mp_add(digit_t *c, const digit_t *a, const digit_t *b, const unsigned int nwords);
+digit_t mp_shiftr(digit_t *x, const unsigned int shift, const unsigned int nwords);
+void multiple_mp_shiftl(digit_t *x, const unsigned int shift, const unsigned int nwords);
+void mp_shiftl(digit_t *x, const unsigned int shift, const unsigned int nwords);
+void MUL(digit_t *out, const digit_t a, const digit_t b);
+
+// Functions taken from the EC module
+
+void mp_sub(digit_t *c, const digit_t *a, const digit_t *b, const unsigned int nwords);
+void select_ct(digit_t *c, const digit_t *a, const digit_t *b, const digit_t mask, const int nwords);
+void swap_ct(digit_t *a, digit_t *b, const digit_t option, const int nwords);
+int mp_compare(const digit_t *a, const digit_t *b, unsigned int nwords);
+bool mp_is_zero(const digit_t *a, unsigned int nwords);
+void mp_mul2(digit_t *c, const digit_t *a, const digit_t *b);
+
+// Further functions for multiprecision arithmetic
+void mp_print(const digit_t *a, size_t nwords);
+void mp_copy(digit_t *b, const digit_t *a, size_t nwords);
+void mp_neg(digit_t *a, unsigned int nwords);
+bool mp_is_one(const digit_t *x, unsigned int nwords);
+void mp_mul(digit_t *c, const digit_t *a, const digit_t *b, size_t nwords);
+void mp_mod_2exp(digit_t *a, unsigned int e, unsigned int nwords);
+void mp_inv_2e(digit_t *b, const digit_t *a, int e, unsigned int nwords);
+void mp_invert_matrix(digit_t *r1, digit_t *r2, digit_t *s1, digit_t *s2, int e, unsigned int nwords);
+
+#define mp_is_odd(x, nwords) (((nwords) != 0) & (int)(x)[0]) // bit 0 of x[0]; evaluates to 0 when nwords == 0
+#define mp_is_even(x, nwords) (!mp_is_odd(x, nwords)) // logical negation of mp_is_odd
+
+/********************** Constant-time unsigned comparisons ***********************/
+
+// The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise
+static inline unsigned int
+is_digit_nonzero_ct(digit_t x)
+{ // Is x != 0? Branch-free: the top bit of x | (0 - x) is set exactly when x is non-zero
+    return (unsigned int)((x | (0 - x)) >> (RADIX - 1));
+}
+
+static inline unsigned int
+is_digit_zero_ct(digit_t x)
+{ // Is x = 0? Computed as the complement of is_digit_nonzero_ct(x)
+    return (unsigned int)(1 ^ is_digit_nonzero_ct(x));
+}
+
+static inline unsigned int
+is_digit_lessthan_ct(digit_t x, digit_t y)
+{ // Is x < y? Branch-free: the answer is read from the top bit of the expression below
+    return (unsigned int)((x ^ ((x ^ y) | ((x - y) ^ y))) >> (RADIX - 1));
+}
+
+/********************** Platform-independent macros for digit-size operations
+ * **********************/
+
+// Digit addition with carry: sumOut = addend1 + addend2 + carryIn (wrapping), carryOut = 1 if either addition wrapped
+#define ADDC(sumOut, carryOut, addend1, addend2, carryIn)                                                              \
+    {                                                                                                                  \
+        digit_t tempReg = (addend1) + (digit_t)(carryIn);                                                              \
+        (sumOut) = (addend2) + tempReg;                                                                                \
+        (carryOut) = (is_digit_lessthan_ct(tempReg, (digit_t)(carryIn)) | is_digit_lessthan_ct((sumOut), tempReg));    \
+    }
+
+// Digit subtraction with borrow: differenceOut = minuend - subtrahend - borrowIn (wrapping), borrowOut = 1 on underflow
+#define SUBC(differenceOut, borrowOut, minuend, subtrahend, borrowIn)                                                  \
+    {                                                                                                                  \
+        digit_t tempReg = (minuend) - (subtrahend);                                                                    \
+        unsigned int borrowReg =                                                                                       \
+            (is_digit_lessthan_ct((minuend), (subtrahend)) | ((borrowIn) & is_digit_zero_ct(tempReg)));                \
+        (differenceOut) = tempReg - (digit_t)(borrowIn);                                                               \
+        (borrowOut) = borrowReg;                                                                                       \
+    }
+
+// Shift right with flexible datatype: shiftOut = low digit of (highIn:lowIn) >> shift, for shift in 1...DigitSize-1
+#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize)                                                              \
+    (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << ((DigitSize) - (shift)));
+
+// Digit shift left: shiftOut = (highIn << shift) with the top bits of lowIn shifted in, for shift in 1...DigitSize-1
+#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize)                                                              \
+    (shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> ((DigitSize) - (shift)));
+
+#endif
diff --git a/src/pqm4/sqisign_lvl3/ref/pqm4_api.c b/src/pqm4/sqisign_lvl3/ref/pqm4_api.c
new file mode 100644
index 0000000..5f1b121
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/pqm4_api.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <api.h>
+#include <sig.h>
+#include <string.h>
+
+typedef struct { // one stored test vector: a message and the signed message that crypto_sign hands out for it
+  size_t mlen; // number of bytes of msg in use; crypto_sign selects an entry by this value
+  char msg[59]; // message bytes (capacity 59)
+  size_t smlen; // number of bytes of sm in use
+  char sm[59 + CRYPTO_BYTES]; // signed message, copied verbatim to the caller by crypto_sign
+} SQISign_KAT_t;
+
+const char kat_lvl3_pk[CRYPTO_PUBLICKEYBYTES] = {
+  0xBE, 0xAA, 0x01, 0xC6, 0x82, 0x45, 0xC6, 0x4B, 0x6C, 0x96, 0xED, 0xE0, 0x90, 0x89, 0xD0, 0x89, 0xBA, 0x1C, 0x42, 0x0B, 0xD3, 0xA9, 0xA0, 0xDC, 0x96, 0xBA, 0x81, 0x3F, 0x62, 0x7B, 0xA9, 0xAF, 0xB5, 0xE4, 0xEC, 0x6B, 0x70, 0x25, 0x27, 0x99, 0x2D, 0x3D, 0xFE, 0x07, 0x85, 0x0F, 0x20, 0x30, 0xFB, 0xC3, 0x17, 0x73, 0x53, 0x94, 0xBA, 0xFB, 0xA5, 0x44, 0xFB, 0x30, 0x4F, 0x29, 0xC0, 0x4D, 0x6F, 0x97, 0x85, 0x2D, 0x4D, 0x14, 0x13, 0x4E, 0x4F, 0x7C, 0x03, 0x43, 0x84, 0x48, 0x23, 0x01, 0x1B, 0x77, 0x2C, 0xD9, 0xD3, 0x09, 0x00, 0x66, 0x1D, 0x26, 0xB8, 0x54, 0x44, 0xF2, 0x31, 0x2B, 0x02, 
+};
+
+const SQISign_KAT_t kat_lvl3[2] = {
+  {
+    .mlen = 32,
+    .msg = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, },
+    .smlen = 32 + CRYPTO_BYTES,
+    .sm = { 0x24, 0x9A, 0xA4, 0x39, 0x1C, 0x30, 0x7F, 0xA3, 0xE4, 0x0A, 0x69, 0xAE, 0x23, 0xF2, 0x53, 0x15, 0x56, 0x4A, 0x53, 0xC4, 0x3F, 0x2E, 0xF8, 0x0B, 0x10, 0xE2, 0x26, 0x5E, 0x65, 0xA8, 0x1C, 0xDC, 0x67, 0xB6, 0x86, 0x11, 0x7C, 0x56, 0xC0, 0x63, 0x47, 0x20, 0x84, 0x67, 0x60, 0x8D, 0xCC, 0x02, 0x99, 0x66, 0xD0, 0x68, 0x64, 0x8F, 0xFE, 0xC5, 0x6A, 0xA8, 0x53, 0x71, 0x52, 0x57, 0x82, 0xF0, 0x98, 0x6A, 0xA0, 0xAB, 0x29, 0x53, 0x7A, 0x0B, 0xE8, 0x73, 0xE4, 0x1A, 0x99, 0x44, 0x1A, 0x5C, 0x73, 0x0C, 0x1E, 0x03, 0xEB, 0x28, 0x82, 0x93, 0x42, 0xAE, 0x14, 0xB0, 0x8D, 0x26, 0xDB, 0x03, 0x00, 0x02, 0x51, 0x9B, 0x4F, 0xF9, 0x41, 0xE2, 0xDF, 0x67, 0x34, 0xDF, 0x76, 0x3D, 0x2D, 0xD9, 0xA9, 0x48, 0xE5, 0x74, 0x08, 0x8E, 0xB7, 0x88, 0xD0, 0x5B, 0x03, 0x3E, 0x2E, 0x75, 0x78, 0x60, 0xD7, 0x88, 0x15, 0x7E, 0x6A, 0x68, 0x4F, 0x47, 0x25, 0x68, 0x80, 0x33, 0x16, 0x7C, 0x90, 0xDC, 0x91, 0x58, 0xC9, 0x00, 0x7A, 0xA2, 0x5F, 0x15, 0x91, 0xA8, 0x9C, 0x0A, 0xEA, 0x02, 0x6F, 0x23, 0xCE, 0xDE, 0xB6, 0xF9, 0x35, 0x00, 0x37, 0x2A, 0x84, 0x07, 0x34, 0xA8, 0x03, 0x48, 0x0C, 0xDC, 0x60, 0x19, 0x14, 0xE5, 0x32, 0x67, 0x28, 0xA4, 0x6F, 0x8F, 0xB8, 0x21, 0x64, 0x96, 0x11, 0x9D, 0xDE, 0xF8, 0x59, 0x59, 0x4E, 0x02, 0x5E, 0x06, 0xD7, 0x9C, 0x30, 0x6D, 0x92, 0x10, 0x85, 0x19, 0x9A, 0x90, 0xFD, 0x1B, 0xDE, 0xF9, 0x49, 0xA6, 0x72, 0xA0, 0x17, 0x56, 0x5E, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, },
+  },
+  {
+    .mlen = 59,
+    .msg = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, },
+    .smlen = 59 + CRYPTO_BYTES,
+    .sm = { 0xFB, 0x52, 0xE6, 0x3A, 0x1B, 0xD2, 0x16, 0x0F, 0xFC, 0x99, 0x99, 0x7D, 0xF4, 0x83, 0xEE, 0x99, 0xC0, 0xE6, 0x92, 0x5C, 0x16, 0xBB, 0x3E, 0x6F, 0xDB, 0x86, 0x2C, 0x3A, 0xAC, 0x86, 0x37, 0x46, 0x1C, 0x5B, 0x68, 0xB4, 0xE0, 0xD1, 0x6C, 0x0B, 0x55, 0x6B, 0x8E, 0xD4, 0x57, 0xB0, 0xD0, 0x33, 0x74, 0xA4, 0xD8, 0x55, 0x82, 0x0F, 0xED, 0xE2, 0x5C, 0xF6, 0x09, 0x20, 0x53, 0x88, 0xF2, 0xB2, 0xBA, 0xEB, 0x9A, 0x8F, 0x9C, 0x6C, 0x37, 0x69, 0x52, 0xF1, 0x0E, 0xAB, 0xA7, 0x10, 0xBC, 0x26, 0x1B, 0x72, 0x78, 0xC4, 0xB9, 0x51, 0x9E, 0x0F, 0xB0, 0x1C, 0x53, 0x34, 0x4D, 0x4B, 0x2A, 0x16, 0x01, 0x01, 0x8F, 0xB7, 0x44, 0x4B, 0x16, 0x8A, 0xCC, 0x0E, 0xDE, 0x32, 0x0B, 0x0F, 0x83, 0x89, 0x01, 0xBA, 0xC6, 0x46, 0x7E, 0x70, 0x81, 0xC0, 0xC1, 0x58, 0x01, 0xBF, 0x55, 0x63, 0xA8, 0x3D, 0xFB, 0x89, 0x43, 0x83, 0x30, 0xE8, 0x14, 0x1C, 0xAB, 0xB4, 0x05, 0x80, 0x88, 0x60, 0x36, 0xFA, 0xA2, 0x3E, 0x74, 0x00, 0xEC, 0xE1, 0xEE, 0x7B, 0x38, 0x9C, 0x58, 0xE9, 0xD0, 0xA2, 0x4C, 0x04, 0xB6, 0xD2, 0x36, 0x1D, 0x20, 0xE8, 0x8E, 0x3D, 0xB8, 0x79, 0xEC, 0xF9, 0x00, 0xE2, 0x6E, 0x5D, 0xFE, 0xB0, 0x75, 0x22, 0x43, 0x37, 0xC7, 0x5C, 0x00, 0x4B, 0xE5, 0xE5, 0xDC, 0x54, 0xDA, 0xAD, 0x54, 0x7F, 0xEC, 0xB8, 0xCA, 0x00, 0xE3, 0x1D, 0x49, 0xA2, 0x67, 0xA7, 0x4B, 0x3F, 0xFC, 0x1E, 0xB3, 0x40, 0x08, 0xEA, 0x48, 0xBD, 0x81, 0x0E, 0xAE, 0xE1, 0x54, 0x68, 0xB2, 0x00, 0x02, 0x0D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, },
+  },
+};
+
+int crypto_sign_keypair(unsigned char *pk, unsigned char *sk) { // hands out the fixed KAT public key; always returns 0
+  memcpy(pk, kat_lvl3_pk, CRYPTO_PUBLICKEYBYTES);
+  memset(sk, 0, CRYPTO_SECRETKEYBYTES); // We don't need the secret key
+  return 0;
+}
+
+int crypto_sign(unsigned char *sm, size_t *smlen, const unsigned char *m,
+                size_t mlen, const unsigned char *sk) { // replays a stored signed message; m and sk are never read
+  const size_t kat_count = sizeof(kat_lvl3) / sizeof(kat_lvl3[0]);
+  for (const SQISign_KAT_t *kat = kat_lvl3; kat != kat_lvl3 + kat_count; kat++) {
+    if (kat->mlen == mlen) {
+      memcpy(sm, kat->sm, kat->smlen);
+      *smlen = kat->smlen;
+      return 0;
+    }
+  }
+  return 1; // no stored entry for this message length
+}
+
+int crypto_sign_open(unsigned char *m, size_t *mlen, const unsigned char *sm,
+                     size_t smlen, const unsigned char *pk) {
+  unsigned long long mlen_ull = 0; // filled in by sqisign_open; *mlen is an output and may be unset or NULL
+  int ret = sqisign_open(m, &mlen_ull, sm, smlen, pk);
+  if (mlen) {
+    *mlen = mlen_ull;
+  }
+  return ret;
+}
diff --git a/src/pqm4/sqisign_lvl3/ref/rng.h b/src/pqm4/sqisign_lvl3/ref/rng.h
new file mode 100644
index 0000000..3c24d07
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/rng.h
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef rng_h
+#define rng_h
+
+#include "randombytes.h"
+
+#endif /* rng_h */
diff --git a/src/pqm4/sqisign_lvl3/ref/sig.h b/src/pqm4/sqisign_lvl3/ref/sig.h
new file mode 100644
index 0000000..4c33510
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/sig.h
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SQISIGN_H
+#define SQISIGN_H
+
+#include <stdint.h>
+#include <sqisign_namespace.h>
+
+#if defined(ENABLE_SIGN)
+/**
+ * SQIsign keypair generation.
+ *
+ * The implementation corresponds to SQIsign.CompactKeyGen() in the SQIsign spec.
+ * The caller is responsible to allocate sufficient memory to hold pk and sk.
+ *
+ * @param[out] pk SQIsign public key
+ * @param[out] sk SQIsign secret key
+ * @return int status code
+ */
+SQISIGN_API 
+int sqisign_keypair(unsigned char *pk, unsigned char *sk);
+
+/**
+ * SQIsign signature generation.
+ *
+ * The implementation performs SQIsign.expandSK() + SQIsign.sign() in the SQIsign spec.
+ * The key provided is a compacted secret key.
+ * The caller is responsible to allocate sufficient memory to hold sm.
+ *
+ * @param[out] sm Signature concatenated with message
+ * @param[out] smlen Pointer to the length of sm
+ * @param[in] m Message to be signed
+ * @param[in] mlen Message length
+ * @param[in] sk Compacted secret key
+ * @return int status code
+ */
+SQISIGN_API 
+int sqisign_sign(unsigned char *sm,
+                 unsigned long long *smlen,
+                 const unsigned char *m,
+                 unsigned long long mlen,
+                 const unsigned char *sk);
+#endif
+
+/**
+ * SQIsign open signature.
+ *
+ * The implementation performs SQIsign.verify(). If the signature verification succeeded, the
+ * original message is stored in m. The key provided is a compact public key. The caller is responsible
+ * to allocate sufficient memory to hold m.
+ *
+ * @param[out] m Message stored if verification succeeds
+ * @param[out] mlen Pointer to the length of m
+ * @param[in] sm Signature concatenated with message
+ * @param[in] smlen Length of sm
+ * @param[in] pk Compacted public key
+ * @return int status code
+ */
+SQISIGN_API
+int sqisign_open(unsigned char *m,
+                 unsigned long long *mlen,
+                 const unsigned char *sm,
+                 unsigned long long smlen,
+                 const unsigned char *pk);
+
+/**
+ * SQIsign verify signature.
+ *
+ * If the signature verification succeeded, returns 0, otherwise 1.
+ *
+ * @param[in] m Message whose signature is checked
+ * @param[in] mlen Length of m
+ * @param[in] sig Signature
+ * @param[in] siglen Length of sig
+ * @param[in] pk Compacted public key
+ * @return int 0 if verification succeeded, 1 otherwise.
+ */
+SQISIGN_API 
+int sqisign_verify(const unsigned char *m,
+                   unsigned long long mlen,
+                   const unsigned char *sig,
+                   unsigned long long siglen,
+                   const unsigned char *pk);
+
+#endif
diff --git a/src/pqm4/sqisign_lvl3/ref/sqisign.c b/src/pqm4/sqisign_lvl3/ref/sqisign.c
new file mode 100644
index 0000000..57fd75d
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/sqisign.c
@@ -0,0 +1,106 @@
+#include <sig.h>
+#include <string.h>
+#include <encoded_sizes.h>
+#include <verification.h>
+#if defined(ENABLE_SIGN)
+#include <signature.h>
+#endif
+
+#if defined(ENABLE_SIGN)
+SQISIGN_API
+int
+sqisign_keypair(unsigned char *pk, unsigned char *sk)
+{ // Generates a key pair and serializes it into pk and sk; returns 0 exactly when protocols_keygen reports non-zero
+    public_key_t public_key = { 0 };
+    secret_key_t secret_key;
+
+    secret_key_init(&secret_key);
+    const int failed = !protocols_keygen(&public_key, &secret_key);
+
+    // Both keys are serialized whatever keygen reported, so callers must check the return value.
+    secret_key_to_bytes(sk, &secret_key, &public_key);
+    public_key_to_bytes(pk, &public_key);
+    secret_key_finalize(&secret_key);
+    return failed;
+}
+
+SQISIGN_API
+int
+sqisign_sign(unsigned char *sm,
+             unsigned long long *smlen,
+             const unsigned char *m,
+             unsigned long long mlen,
+             const unsigned char *sk)
+{ // Writes signature || message to sm and its length to *smlen; returns 0 exactly when protocols_sign reports non-zero
+    int ret = 0;
+    secret_key_t skt;
+    public_key_t pkt = { 0 };
+    signature_t sigt;
+    secret_key_init(&skt);
+    secret_key_from_bytes(&skt, &pkt, sk);
+    // place the message after the space reserved for the signature; memmove tolerates m overlapping sm
+    memmove(sm + SIGNATURE_BYTES, m, mlen);
+    // the copy inside sm, not m itself, is what gets signed
+    ret = !protocols_sign(&sigt, &pkt, &skt, sm + SIGNATURE_BYTES, mlen);
+    if (ret != 0) {
+        *smlen = 0;
+        goto err;
+    }
+
+    signature_to_bytes(sm, &sigt);
+    *smlen = SIGNATURE_BYTES + mlen;
+
+err: // reached on success and on failure alike, so the secret key is always finalized
+    secret_key_finalize(&skt);
+    return ret;
+}
+#endif
+
+SQISIGN_API
+int
+sqisign_open(unsigned char *m,
+             unsigned long long *mlen,
+             const unsigned char *sm,
+             unsigned long long smlen,
+             const unsigned char *pk)
+{ // Opens sm = signature || message: returns 0 and copies the message into m only if it verifies under pk
+    int ret = 1;
+    public_key_t pkt = { 0 };
+    signature_t sigt;
+
+    *mlen = 0;
+    if (smlen < SIGNATURE_BYTES) {
+        return ret; // too short to contain a signature; smlen - SIGNATURE_BYTES below would wrap around
+    }
+    public_key_from_bytes(&pkt, pk);
+    signature_from_bytes(&sigt, sm);
+    ret = !protocols_verify(&sigt, &pkt, sm + SIGNATURE_BYTES, smlen - SIGNATURE_BYTES);
+    if (!ret) {
+        *mlen = smlen - SIGNATURE_BYTES;
+        memmove(m, sm + SIGNATURE_BYTES, *mlen);
+    } else {
+        memset(m, 0, smlen - SIGNATURE_BYTES); // clear the output buffer when verification fails
+    }
+    return ret;
+}
+
+SQISIGN_API
+int
+sqisign_verify(const unsigned char *m,
+               unsigned long long mlen,
+               const unsigned char *sig,
+               unsigned long long siglen,
+               const unsigned char *pk)
+{
+    // Detached verification: decodes pk and sig, then checks sig over the mlen bytes at m.
+    // siglen is not read by this function.
+    signature_t signature;
+    public_key_t public_key = { 0 };
+
+    public_key_from_bytes(&public_key, pk);
+    signature_from_bytes(&signature, sig);
+
+    // protocols_verify's result is negated, so 0 is returned exactly when it reports non-zero
+    const int rejected = !protocols_verify(&signature, &public_key, m, mlen);
+    return rejected;
+}
diff --git a/src/pqm4/sqisign_lvl3/ref/sqisign_namespace.h b/src/pqm4/sqisign_lvl3/ref/sqisign_namespace.h
new file mode 100644
index 0000000..14fd51d
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/sqisign_namespace.h
@@ -0,0 +1,1022 @@
+
+#ifndef SQISIGN_NAMESPACE_H
+#define SQISIGN_NAMESPACE_H
+
+// #define DISABLE_NAMESPACING   (uncomment to make both SQISIGN_NAMESPACE macros expand to the bare symbol name)
+// SQISIGN_API is the export marker: dllexport on _WIN32, default symbol visibility elsewhere.
+#if defined(_WIN32)
+#define SQISIGN_API __declspec(dllexport)
+#else
+#define SQISIGN_API __attribute__((visibility("default")))
+#endif
+// PARAM_NAME3(end, s) pastes sqisign_<SQISIGN_VARIANT>_<end>_<s>; the extra JOIN level lets arguments expand before ##.
+#define PARAM_JOIN3_(a, b, c) sqisign_##a##_##b##_##c
+#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c)
+#define PARAM_NAME3(end, s) PARAM_JOIN3(SQISIGN_VARIANT, end, s)
+// PARAM_NAME2(end, s) pastes sqisign_<end>_<s>, with the same two-level expansion.
+#define PARAM_JOIN2_(a, b) sqisign_##a##_##b
+#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b)
+#define PARAM_NAME2(end, s) PARAM_JOIN2(end, s)
+// SQISIGN_NAMESPACE_GENERIC(s) does not involve SQISIGN_VARIANT: it yields sqisign_gen_<s>, or s when namespacing is disabled.
+#ifndef DISABLE_NAMESPACING
+#define SQISIGN_NAMESPACE_GENERIC(s) PARAM_NAME2(gen, s)
+#else
+#define SQISIGN_NAMESPACE_GENERIC(s) s
+#endif
+// SQISIGN_NAMESPACE(s) yields sqisign_<SQISIGN_VARIANT>_<build type>_<s>; an unrecognised build type is a compile error.
+#if defined(SQISIGN_VARIANT) && !defined(DISABLE_NAMESPACING)
+#if defined(SQISIGN_BUILD_TYPE_REF)
+#define SQISIGN_NAMESPACE(s) PARAM_NAME3(ref, s)
+#elif defined(SQISIGN_BUILD_TYPE_OPT)
+#define SQISIGN_NAMESPACE(s) PARAM_NAME3(opt, s)
+#elif defined(SQISIGN_BUILD_TYPE_BROADWELL)
+#define SQISIGN_NAMESPACE(s) PARAM_NAME3(broadwell, s)
+#elif defined(SQISIGN_BUILD_TYPE_ARM64CRYPTO)
+#define SQISIGN_NAMESPACE(s) PARAM_NAME3(arm64crypto, s)
+#else
+#error "Build type not known"
+#endif
+// Fallback when SQISIGN_VARIANT is undefined or DISABLE_NAMESPACING is defined: names are left untouched.
+#else
+#define SQISIGN_NAMESPACE(s) s
+#endif
+
+// Symbols exported from algebra.c, aliased to sqisign_gen_<name> via SQISIGN_NAMESPACE_GENERIC (bare name if DISABLE_NAMESPACING):
+#undef quat_alg_add
+#undef quat_alg_conj
+#undef quat_alg_coord_mul
+#undef quat_alg_elem_copy
+#undef quat_alg_elem_copy_ibz
+#undef quat_alg_elem_equal
+#undef quat_alg_elem_is_zero
+#undef quat_alg_elem_mul_by_scalar
+#undef quat_alg_elem_set
+#undef quat_alg_equal_denom
+#undef quat_alg_init_set_ui
+#undef quat_alg_make_primitive
+#undef quat_alg_mul
+#undef quat_alg_norm
+#undef quat_alg_normalize
+#undef quat_alg_scalar
+#undef quat_alg_sub
+
+#define quat_alg_add                                    SQISIGN_NAMESPACE_GENERIC(quat_alg_add)
+#define quat_alg_conj                                   SQISIGN_NAMESPACE_GENERIC(quat_alg_conj)
+#define quat_alg_coord_mul                              SQISIGN_NAMESPACE_GENERIC(quat_alg_coord_mul)
+#define quat_alg_elem_copy                              SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_copy)
+#define quat_alg_elem_copy_ibz                          SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_copy_ibz)
+#define quat_alg_elem_equal                             SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_equal)
+#define quat_alg_elem_is_zero                           SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_is_zero)
+#define quat_alg_elem_mul_by_scalar                     SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_mul_by_scalar)
+#define quat_alg_elem_set                               SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_set)
+#define quat_alg_equal_denom                            SQISIGN_NAMESPACE_GENERIC(quat_alg_equal_denom)
+#define quat_alg_init_set_ui                            SQISIGN_NAMESPACE_GENERIC(quat_alg_init_set_ui)
+#define quat_alg_make_primitive                         SQISIGN_NAMESPACE_GENERIC(quat_alg_make_primitive)
+#define quat_alg_mul                                    SQISIGN_NAMESPACE_GENERIC(quat_alg_mul)
+#define quat_alg_norm                                   SQISIGN_NAMESPACE_GENERIC(quat_alg_norm)
+#define quat_alg_normalize                              SQISIGN_NAMESPACE_GENERIC(quat_alg_normalize)
+#define quat_alg_scalar                                 SQISIGN_NAMESPACE_GENERIC(quat_alg_scalar)
+#define quat_alg_sub                                    SQISIGN_NAMESPACE_GENERIC(quat_alg_sub)
+
+// Symbols exported from api.c, aliased via SQISIGN_NAMESPACE to the variant/build-type-prefixed name (bare name if namespacing is off):
+#undef crypto_sign
+#undef crypto_sign_keypair
+#undef crypto_sign_open
+
+#define crypto_sign                                     SQISIGN_NAMESPACE(crypto_sign)
+#define crypto_sign_keypair                             SQISIGN_NAMESPACE(crypto_sign_keypair)
+#define crypto_sign_open                                SQISIGN_NAMESPACE(crypto_sign_open)
+
+// Symbols exported from basis.c, aliased via SQISIGN_NAMESPACE to the variant/build-type-prefixed name (bare name if namespacing is off):
+#undef ec_curve_to_basis_2f_from_hint
+#undef ec_curve_to_basis_2f_to_hint
+#undef ec_recover_y
+#undef lift_basis
+#undef lift_basis_normalized
+
+#define ec_curve_to_basis_2f_from_hint                  SQISIGN_NAMESPACE(ec_curve_to_basis_2f_from_hint)
+#define ec_curve_to_basis_2f_to_hint                    SQISIGN_NAMESPACE(ec_curve_to_basis_2f_to_hint)
+#define ec_recover_y                                    SQISIGN_NAMESPACE(ec_recover_y)
+#define lift_basis                                      SQISIGN_NAMESPACE(lift_basis)
+#define lift_basis_normalized                           SQISIGN_NAMESPACE(lift_basis_normalized)
+
+// Symbols exported from biextension.c, aliased via SQISIGN_NAMESPACE to the variant/build-type-prefixed name (bare name if namespacing is off):
+#undef clear_cofac
+#undef ec_dlog_2_tate
+#undef ec_dlog_2_weil
+#undef fp2_frob
+#undef reduced_tate
+#undef weil
+
+#define clear_cofac                                     SQISIGN_NAMESPACE(clear_cofac)
+#define ec_dlog_2_tate                                  SQISIGN_NAMESPACE(ec_dlog_2_tate)
+#define ec_dlog_2_weil                                  SQISIGN_NAMESPACE(ec_dlog_2_weil)
+#define fp2_frob                                        SQISIGN_NAMESPACE(fp2_frob)
+#define reduced_tate                                    SQISIGN_NAMESPACE(reduced_tate)
+#define weil                                            SQISIGN_NAMESPACE(weil)
+
+// Symbols exported from common.c, aliased via SQISIGN_NAMESPACE to the variant/build-type-prefixed name (bare name if namespacing is off):
+#undef hash_to_challenge
+#undef public_key_finalize
+#undef public_key_init
+
+#define hash_to_challenge                               SQISIGN_NAMESPACE(hash_to_challenge)
+#define public_key_finalize                             SQISIGN_NAMESPACE(public_key_finalize)
+#define public_key_init                                 SQISIGN_NAMESPACE(public_key_init)
+
+// Symbols exported from dim2.c, aliased to sqisign_gen_<name> via SQISIGN_NAMESPACE_GENERIC (bare name if DISABLE_NAMESPACING):
+#undef ibz_2x2_mul_mod
+#undef ibz_mat_2x2_add
+#undef ibz_mat_2x2_copy
+#undef ibz_mat_2x2_det_from_ibz
+#undef ibz_mat_2x2_eval
+#undef ibz_mat_2x2_inv_mod
+#undef ibz_mat_2x2_set
+#undef ibz_vec_2_set
+
+#define ibz_2x2_mul_mod                                 SQISIGN_NAMESPACE_GENERIC(ibz_2x2_mul_mod)
+#define ibz_mat_2x2_add                                 SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_add)
+#define ibz_mat_2x2_copy                                SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_copy)
+#define ibz_mat_2x2_det_from_ibz                        SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_det_from_ibz)
+#define ibz_mat_2x2_eval                                SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_eval)
+#define ibz_mat_2x2_inv_mod                             SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_inv_mod)
+#define ibz_mat_2x2_set                                 SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_set)
+#define ibz_vec_2_set                                   SQISIGN_NAMESPACE_GENERIC(ibz_vec_2_set)
+
+// Symbols exported from dim2id2iso.c, aliased via SQISIGN_NAMESPACE to the variant/build-type-prefixed name (bare name if namespacing is off):
+#undef dim2id2iso_arbitrary_isogeny_evaluation
+#undef dim2id2iso_ideal_to_isogeny_clapotis
+#undef find_uv
+#undef fixed_degree_isogeny_and_eval
+
+#define dim2id2iso_arbitrary_isogeny_evaluation         SQISIGN_NAMESPACE(dim2id2iso_arbitrary_isogeny_evaluation)
+#define dim2id2iso_ideal_to_isogeny_clapotis            SQISIGN_NAMESPACE(dim2id2iso_ideal_to_isogeny_clapotis)
+#define find_uv                                         SQISIGN_NAMESPACE(find_uv)
+#define fixed_degree_isogeny_and_eval                   SQISIGN_NAMESPACE(fixed_degree_isogeny_and_eval)
+
+// Symbols exported from dim4.c, aliased to sqisign_gen_<name> via SQISIGN_NAMESPACE_GENERIC (bare name if DISABLE_NAMESPACING):
+#undef ibz_inv_dim4_make_coeff_mpm
+#undef ibz_inv_dim4_make_coeff_pmp
+#undef ibz_mat_4x4_copy
+#undef ibz_mat_4x4_equal
+#undef ibz_mat_4x4_eval
+#undef ibz_mat_4x4_eval_t
+#undef ibz_mat_4x4_gcd
+#undef ibz_mat_4x4_identity
+#undef ibz_mat_4x4_inv_with_det_as_denom
+#undef ibz_mat_4x4_is_identity
+#undef ibz_mat_4x4_mul
+#undef ibz_mat_4x4_negate
+#undef ibz_mat_4x4_scalar_div
+#undef ibz_mat_4x4_scalar_mul
+#undef ibz_mat_4x4_transpose
+#undef ibz_mat_4x4_zero
+#undef ibz_vec_4_add
+#undef ibz_vec_4_content
+#undef ibz_vec_4_copy
+#undef ibz_vec_4_copy_ibz
+#undef ibz_vec_4_is_zero
+#undef ibz_vec_4_linear_combination
+#undef ibz_vec_4_negate
+#undef ibz_vec_4_scalar_div
+#undef ibz_vec_4_scalar_mul
+#undef ibz_vec_4_set
+#undef ibz_vec_4_sub
+#undef quat_qf_eval
+
+#define ibz_inv_dim4_make_coeff_mpm                     SQISIGN_NAMESPACE_GENERIC(ibz_inv_dim4_make_coeff_mpm)
+#define ibz_inv_dim4_make_coeff_pmp                     SQISIGN_NAMESPACE_GENERIC(ibz_inv_dim4_make_coeff_pmp)
+#define ibz_mat_4x4_copy                                SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_copy)
+#define ibz_mat_4x4_equal                               SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_equal)
+#define ibz_mat_4x4_eval                                SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_eval)
+#define ibz_mat_4x4_eval_t                              SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_eval_t)
+#define ibz_mat_4x4_gcd                                 SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_gcd)
+#define ibz_mat_4x4_identity                            SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_identity)
+#define ibz_mat_4x4_inv_with_det_as_denom               SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_inv_with_det_as_denom)
+#define ibz_mat_4x4_is_identity                         SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_is_identity)
+#define ibz_mat_4x4_mul                                 SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_mul)
+#define ibz_mat_4x4_negate                              SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_negate)
+#define ibz_mat_4x4_scalar_div                          SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_scalar_div)
+#define ibz_mat_4x4_scalar_mul                          SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_scalar_mul)
+#define ibz_mat_4x4_transpose                           SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_transpose)
+#define ibz_mat_4x4_zero                                SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_zero)
+#define ibz_vec_4_add                                   SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_add)
+#define ibz_vec_4_content                               SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_content)
+#define ibz_vec_4_copy                                  SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_copy)
+#define ibz_vec_4_copy_ibz                              SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_copy_ibz)
+#define ibz_vec_4_is_zero                               SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_is_zero)
+#define ibz_vec_4_linear_combination                    SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_linear_combination)
+#define ibz_vec_4_negate                                SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_negate)
+#define ibz_vec_4_scalar_div                            SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_scalar_div)
+#define ibz_vec_4_scalar_mul                            SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_scalar_mul)
+#define ibz_vec_4_set                                   SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_set)
+#define ibz_vec_4_sub                                   SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_sub)
+#define quat_qf_eval                                    SQISIGN_NAMESPACE_GENERIC(quat_qf_eval)
+
+// Symbols exported from ec.c, aliased via SQISIGN_NAMESPACE to the variant/build-type-prefixed name (bare name if namespacing is off):
+#undef cswap_points
+#undef ec_biscalar_mul
+#undef ec_curve_init
+#undef ec_curve_init_from_A
+#undef ec_curve_normalize_A24
+#undef ec_curve_verify_A
+#undef ec_dbl
+#undef ec_dbl_iter
+#undef ec_dbl_iter_basis
+#undef ec_has_zero_coordinate
+#undef ec_is_basis_four_torsion
+#undef ec_is_equal
+#undef ec_is_four_torsion
+#undef ec_is_two_torsion
+#undef ec_is_zero
+#undef ec_j_inv
+#undef ec_ladder3pt
+#undef ec_mul
+#undef ec_normalize_curve
+#undef ec_normalize_curve_and_A24
+#undef ec_normalize_point
+#undef ec_point_init
+#undef select_point
+#undef xADD
+#undef xDBL
+#undef xDBLADD
+#undef xDBLMUL
+#undef xDBL_A24
+#undef xDBL_E0
+#undef xMUL
+
+#define cswap_points                                    SQISIGN_NAMESPACE(cswap_points)
+#define ec_biscalar_mul                                 SQISIGN_NAMESPACE(ec_biscalar_mul)
+#define ec_curve_init                                   SQISIGN_NAMESPACE(ec_curve_init)
+#define ec_curve_init_from_A                            SQISIGN_NAMESPACE(ec_curve_init_from_A)
+#define ec_curve_normalize_A24                          SQISIGN_NAMESPACE(ec_curve_normalize_A24)
+#define ec_curve_verify_A                               SQISIGN_NAMESPACE(ec_curve_verify_A)
+#define ec_dbl                                          SQISIGN_NAMESPACE(ec_dbl)
+#define ec_dbl_iter                                     SQISIGN_NAMESPACE(ec_dbl_iter)
+#define ec_dbl_iter_basis                               SQISIGN_NAMESPACE(ec_dbl_iter_basis)
+#define ec_has_zero_coordinate                          SQISIGN_NAMESPACE(ec_has_zero_coordinate)
+#define ec_is_basis_four_torsion                        SQISIGN_NAMESPACE(ec_is_basis_four_torsion)
+#define ec_is_equal                                     SQISIGN_NAMESPACE(ec_is_equal)
+#define ec_is_four_torsion                              SQISIGN_NAMESPACE(ec_is_four_torsion)
+#define ec_is_two_torsion                               SQISIGN_NAMESPACE(ec_is_two_torsion)
+#define ec_is_zero                                      SQISIGN_NAMESPACE(ec_is_zero)
+#define ec_j_inv                                        SQISIGN_NAMESPACE(ec_j_inv)
+#define ec_ladder3pt                                    SQISIGN_NAMESPACE(ec_ladder3pt)
+#define ec_mul                                          SQISIGN_NAMESPACE(ec_mul)
+#define ec_normalize_curve                              SQISIGN_NAMESPACE(ec_normalize_curve)
+#define ec_normalize_curve_and_A24                      SQISIGN_NAMESPACE(ec_normalize_curve_and_A24)
+#define ec_normalize_point                              SQISIGN_NAMESPACE(ec_normalize_point)
+#define ec_point_init                                   SQISIGN_NAMESPACE(ec_point_init)
+#define select_point                                    SQISIGN_NAMESPACE(select_point)
+#define xADD                                            SQISIGN_NAMESPACE(xADD)
+#define xDBL                                            SQISIGN_NAMESPACE(xDBL)
+#define xDBLADD                                         SQISIGN_NAMESPACE(xDBLADD)
+#define xDBLMUL                                         SQISIGN_NAMESPACE(xDBLMUL)
+#define xDBL_A24                                        SQISIGN_NAMESPACE(xDBL_A24)
+#define xDBL_E0                                         SQISIGN_NAMESPACE(xDBL_E0)
+#define xMUL                                            SQISIGN_NAMESPACE(xMUL)
+
+// Symbols exported from ec_jac.c, aliased via SQISIGN_NAMESPACE to the variant/build-type-prefixed name (bare name if namespacing is off):
+#undef ADD
+#undef DBL
+#undef DBLW
+#undef copy_jac_point
+#undef jac_from_ws
+#undef jac_init
+#undef jac_is_equal
+#undef jac_neg
+#undef jac_to_ws
+#undef jac_to_xz
+#undef jac_to_xz_add_components
+#undef select_jac_point
+
+#define ADD                                             SQISIGN_NAMESPACE(ADD)
+#define DBL                                             SQISIGN_NAMESPACE(DBL)
+#define DBLW                                            SQISIGN_NAMESPACE(DBLW)
+#define copy_jac_point                                  SQISIGN_NAMESPACE(copy_jac_point)
+#define jac_from_ws                                     SQISIGN_NAMESPACE(jac_from_ws)
+#define jac_init                                        SQISIGN_NAMESPACE(jac_init)
+#define jac_is_equal                                    SQISIGN_NAMESPACE(jac_is_equal)
+#define jac_neg                                         SQISIGN_NAMESPACE(jac_neg)
+#define jac_to_ws                                       SQISIGN_NAMESPACE(jac_to_ws)
+#define jac_to_xz                                       SQISIGN_NAMESPACE(jac_to_xz)
+#define jac_to_xz_add_components                        SQISIGN_NAMESPACE(jac_to_xz_add_components)
+#define select_jac_point                                SQISIGN_NAMESPACE(select_jac_point)
+
+// Symbols exported from encode_signature.c, aliased via SQISIGN_NAMESPACE to the variant/build-type-prefixed name (bare name if namespacing is off):
+#undef secret_key_from_bytes
+#undef secret_key_to_bytes
+
+#define secret_key_from_bytes                           SQISIGN_NAMESPACE(secret_key_from_bytes)
+#define secret_key_to_bytes                             SQISIGN_NAMESPACE(secret_key_to_bytes)
+
+// Symbols exported from encode_verification.c, aliased via SQISIGN_NAMESPACE to the variant/build-type-prefixed name (bare name if namespacing is off):
+#undef public_key_from_bytes
+#undef public_key_to_bytes
+#undef signature_from_bytes
+#undef signature_to_bytes
+
+#define public_key_from_bytes                           SQISIGN_NAMESPACE(public_key_from_bytes)
+#define public_key_to_bytes                             SQISIGN_NAMESPACE(public_key_to_bytes)
+#define signature_from_bytes                            SQISIGN_NAMESPACE(signature_from_bytes)
+#define signature_to_bytes                              SQISIGN_NAMESPACE(signature_to_bytes)
+
+// Symbols exported from finit.c, aliased to sqisign_gen_<name> via SQISIGN_NAMESPACE_GENERIC (bare name if DISABLE_NAMESPACING):
+#undef ibz_mat_2x2_finalize
+#undef ibz_mat_2x2_init
+#undef ibz_mat_4x4_finalize
+#undef ibz_mat_4x4_init
+#undef ibz_vec_2_finalize
+#undef ibz_vec_2_init
+#undef ibz_vec_4_finalize
+#undef ibz_vec_4_init
+#undef quat_alg_elem_finalize
+#undef quat_alg_elem_init
+#undef quat_alg_finalize
+#undef quat_alg_init_set
+#undef quat_lattice_finalize
+#undef quat_lattice_init
+#undef quat_left_ideal_finalize
+#undef quat_left_ideal_init
+
+#define ibz_mat_2x2_finalize                            SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_finalize)
+#define ibz_mat_2x2_init                                SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_init)
+#define ibz_mat_4x4_finalize                            SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_finalize)
+#define ibz_mat_4x4_init                                SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_init)
+#define ibz_vec_2_finalize                              SQISIGN_NAMESPACE_GENERIC(ibz_vec_2_finalize)
+#define ibz_vec_2_init                                  SQISIGN_NAMESPACE_GENERIC(ibz_vec_2_init)
+#define ibz_vec_4_finalize                              SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_finalize)
+#define ibz_vec_4_init                                  SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_init)
+#define quat_alg_elem_finalize                          SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_finalize)
+#define quat_alg_elem_init                              SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_init)
+#define quat_alg_finalize                               SQISIGN_NAMESPACE_GENERIC(quat_alg_finalize)
+#define quat_alg_init_set                               SQISIGN_NAMESPACE_GENERIC(quat_alg_init_set)
+#define quat_lattice_finalize                           SQISIGN_NAMESPACE_GENERIC(quat_lattice_finalize)
+#define quat_lattice_init                               SQISIGN_NAMESPACE_GENERIC(quat_lattice_init)
+#define quat_left_ideal_finalize                        SQISIGN_NAMESPACE_GENERIC(quat_left_ideal_finalize)
+#define quat_left_ideal_init                            SQISIGN_NAMESPACE_GENERIC(quat_left_ideal_init)
+
+// Symbol exported from fp.c, aliased via SQISIGN_NAMESPACE to the variant/build-type-prefixed name (bare name if namespacing is off):
+#undef fp_select
+
+#define fp_select                                       SQISIGN_NAMESPACE(fp_select)
+
+// Symbols exported from fp.c, fp_p27500_64.c, fp_p5248_64.c, fp_p65376_64.c; aliased via SQISIGN_NAMESPACE (bare name if namespacing is off):
+#undef fp_exp3div4
+#undef fp_inv
+#undef fp_is_square
+#undef fp_sqrt
+
+#define fp_exp3div4                                     SQISIGN_NAMESPACE(fp_exp3div4)
+#define fp_inv                                          SQISIGN_NAMESPACE(fp_inv)
+#define fp_is_square                                    SQISIGN_NAMESPACE(fp_is_square)
+#define fp_sqrt                                         SQISIGN_NAMESPACE(fp_sqrt)
+
+// Symbols exported from fp2.c, aliased via SQISIGN_NAMESPACE to the variant/build-type-prefixed name (bare name if namespacing is off):
+#undef fp2_add
+#undef fp2_add_one
+#undef fp2_batched_inv
+#undef fp2_copy
+#undef fp2_cswap
+#undef fp2_decode
+#undef fp2_encode
+#undef fp2_half
+#undef fp2_inv
+#undef fp2_is_equal
+#undef fp2_is_one
+#undef fp2_is_square
+#undef fp2_is_zero
+#undef fp2_mul
+#undef fp2_mul_small
+#undef fp2_neg
+#undef fp2_pow_vartime
+#undef fp2_print
+#undef fp2_select
+#undef fp2_set_one
+#undef fp2_set_small
+#undef fp2_set_zero
+#undef fp2_sqr
+#undef fp2_sqrt
+#undef fp2_sqrt_verify
+#undef fp2_sub
+
+#define fp2_add                                         SQISIGN_NAMESPACE(fp2_add)
+#define fp2_add_one                                     SQISIGN_NAMESPACE(fp2_add_one)
+#define fp2_batched_inv                                 SQISIGN_NAMESPACE(fp2_batched_inv)
+#define fp2_copy                                        SQISIGN_NAMESPACE(fp2_copy)
+#define fp2_cswap                                       SQISIGN_NAMESPACE(fp2_cswap)
+#define fp2_decode                                      SQISIGN_NAMESPACE(fp2_decode)
+#define fp2_encode                                      SQISIGN_NAMESPACE(fp2_encode)
+#define fp2_half                                        SQISIGN_NAMESPACE(fp2_half)
+#define fp2_inv                                         SQISIGN_NAMESPACE(fp2_inv)
+#define fp2_is_equal                                    SQISIGN_NAMESPACE(fp2_is_equal)
+#define fp2_is_one                                      SQISIGN_NAMESPACE(fp2_is_one)
+#define fp2_is_square                                   SQISIGN_NAMESPACE(fp2_is_square)
+#define fp2_is_zero                                     SQISIGN_NAMESPACE(fp2_is_zero)
+#define fp2_mul                                         SQISIGN_NAMESPACE(fp2_mul)
+#define fp2_mul_small                                   SQISIGN_NAMESPACE(fp2_mul_small)
+#define fp2_neg                                         SQISIGN_NAMESPACE(fp2_neg)
+#define fp2_pow_vartime                                 SQISIGN_NAMESPACE(fp2_pow_vartime)
+#define fp2_print                                       SQISIGN_NAMESPACE(fp2_print)
+#define fp2_select                                      SQISIGN_NAMESPACE(fp2_select)
+#define fp2_set_one                                     SQISIGN_NAMESPACE(fp2_set_one)
+#define fp2_set_small                                   SQISIGN_NAMESPACE(fp2_set_small)
+#define fp2_set_zero                                    SQISIGN_NAMESPACE(fp2_set_zero)
+#define fp2_sqr                                         SQISIGN_NAMESPACE(fp2_sqr)
+#define fp2_sqrt                                        SQISIGN_NAMESPACE(fp2_sqrt)
+#define fp2_sqrt_verify                                 SQISIGN_NAMESPACE(fp2_sqrt_verify)
+#define fp2_sub                                         SQISIGN_NAMESPACE(fp2_sub)
+
+// Symbols exported from fp_p27500_64.c, fp_p5248_64.c, fp_p65376_64.c; aliased via SQISIGN_NAMESPACE (bare name if namespacing is off):
+#undef fp_copy
+#undef fp_cswap
+#undef fp_decode
+#undef fp_decode_reduce
+#undef fp_div3
+#undef fp_encode
+#undef fp_half
+#undef fp_is_equal
+#undef fp_is_zero
+#undef fp_mul_small
+#undef fp_neg
+#undef fp_set_one
+#undef fp_set_small
+#undef fp_set_zero
+
+#define fp_copy                                         SQISIGN_NAMESPACE(fp_copy)
+#define fp_cswap                                        SQISIGN_NAMESPACE(fp_cswap)
+#define fp_decode                                       SQISIGN_NAMESPACE(fp_decode)
+#define fp_decode_reduce                                SQISIGN_NAMESPACE(fp_decode_reduce)
+#define fp_div3                                         SQISIGN_NAMESPACE(fp_div3)
+#define fp_encode                                       SQISIGN_NAMESPACE(fp_encode)
+#define fp_half                                         SQISIGN_NAMESPACE(fp_half)
+#define fp_is_equal                                     SQISIGN_NAMESPACE(fp_is_equal)
+#define fp_is_zero                                      SQISIGN_NAMESPACE(fp_is_zero)
+#define fp_mul_small                                    SQISIGN_NAMESPACE(fp_mul_small)
+#define fp_neg                                          SQISIGN_NAMESPACE(fp_neg)
+#define fp_set_one                                      SQISIGN_NAMESPACE(fp_set_one)
+#define fp_set_small                                    SQISIGN_NAMESPACE(fp_set_small)
+#define fp_set_zero                                     SQISIGN_NAMESPACE(fp_set_zero)
+
+// Symbols exported from fp_p27500_64.c, fp_p5248_64.c, fp_p65376_64.c, gf27500.c, gf5248.c, gf65376.c; aliased via SQISIGN_NAMESPACE:
+#undef fp_add
+#undef fp_mul
+#undef fp_sqr
+#undef fp_sub
+
+#define fp_add                                          SQISIGN_NAMESPACE(fp_add)
+#define fp_mul                                          SQISIGN_NAMESPACE(fp_mul)
+#define fp_sqr                                          SQISIGN_NAMESPACE(fp_sqr)
+#define fp_sub                                          SQISIGN_NAMESPACE(fp_sub)
+
+// Symbols exported from gf27500.c, aliased via SQISIGN_NAMESPACE to the variant/build-type-prefixed name (bare name if namespacing is off):
+#undef gf27500_decode
+#undef gf27500_decode_reduce
+#undef gf27500_div
+#undef gf27500_div3
+#undef gf27500_encode
+#undef gf27500_invert
+#undef gf27500_legendre
+#undef gf27500_sqrt
+
+#define gf27500_decode                                  SQISIGN_NAMESPACE(gf27500_decode)
+#define gf27500_decode_reduce                           SQISIGN_NAMESPACE(gf27500_decode_reduce)
+#define gf27500_div                                     SQISIGN_NAMESPACE(gf27500_div)
+#define gf27500_div3                                    SQISIGN_NAMESPACE(gf27500_div3)
+#define gf27500_encode                                  SQISIGN_NAMESPACE(gf27500_encode)
+#define gf27500_invert                                  SQISIGN_NAMESPACE(gf27500_invert)
+#define gf27500_legendre                                SQISIGN_NAMESPACE(gf27500_legendre)
+#define gf27500_sqrt                                    SQISIGN_NAMESPACE(gf27500_sqrt)
+
+// Symbols exported from gf27500.c, gf5248.c, gf65376.c; aliased via SQISIGN_NAMESPACE (bare name if namespacing is off):
+#undef fp2_mul_c0
+#undef fp2_mul_c1
+#undef fp2_sq_c0
+#undef fp2_sq_c1
+
+#define fp2_mul_c0                                      SQISIGN_NAMESPACE(fp2_mul_c0)
+#define fp2_mul_c1                                      SQISIGN_NAMESPACE(fp2_mul_c1)
+#define fp2_sq_c0                                       SQISIGN_NAMESPACE(fp2_sq_c0)
+#define fp2_sq_c1                                       SQISIGN_NAMESPACE(fp2_sq_c1)
+
+// Symbols exported from gf5248.c, aliased via SQISIGN_NAMESPACE to the variant/build-type-prefixed name (bare name if namespacing is off):
+#undef gf5248_decode
+#undef gf5248_decode_reduce
+#undef gf5248_div
+#undef gf5248_div3
+#undef gf5248_encode
+#undef gf5248_invert
+#undef gf5248_legendre
+#undef gf5248_sqrt
+
+#define gf5248_decode                                   SQISIGN_NAMESPACE(gf5248_decode)
+#define gf5248_decode_reduce                            SQISIGN_NAMESPACE(gf5248_decode_reduce)
+#define gf5248_div                                      SQISIGN_NAMESPACE(gf5248_div)
+#define gf5248_div3                                     SQISIGN_NAMESPACE(gf5248_div3)
+#define gf5248_encode                                   SQISIGN_NAMESPACE(gf5248_encode)
+#define gf5248_invert                                   SQISIGN_NAMESPACE(gf5248_invert)
+#define gf5248_legendre                                 SQISIGN_NAMESPACE(gf5248_legendre)
+#define gf5248_sqrt                                     SQISIGN_NAMESPACE(gf5248_sqrt)
+
+// Symbols exported from gf65376.c, aliased via SQISIGN_NAMESPACE to the variant/build-type-prefixed name (bare name if namespacing is off):
+#undef gf65376_decode
+#undef gf65376_decode_reduce
+#undef gf65376_div
+#undef gf65376_div3
+#undef gf65376_encode
+#undef gf65376_invert
+#undef gf65376_legendre
+#undef gf65376_sqrt
+
+#define gf65376_decode                                  SQISIGN_NAMESPACE(gf65376_decode)
+#define gf65376_decode_reduce                           SQISIGN_NAMESPACE(gf65376_decode_reduce)
+#define gf65376_div                                     SQISIGN_NAMESPACE(gf65376_div)
+#define gf65376_div3                                    SQISIGN_NAMESPACE(gf65376_div3)
+#define gf65376_encode                                  SQISIGN_NAMESPACE(gf65376_encode)
+#define gf65376_invert                                  SQISIGN_NAMESPACE(gf65376_invert)
+#define gf65376_legendre                                SQISIGN_NAMESPACE(gf65376_legendre)
+#define gf65376_sqrt                                    SQISIGN_NAMESPACE(gf65376_sqrt)
+
+// Symbols exported from hd.c, aliased via SQISIGN_NAMESPACE to the variant/build-type-prefixed name (bare name if namespacing is off):
+#undef add_couple_jac_points
+#undef copy_bases_to_kernel
+#undef couple_jac_to_xz
+#undef double_couple_jac_point
+#undef double_couple_jac_point_iter
+#undef double_couple_point
+#undef double_couple_point_iter
+
+#define add_couple_jac_points                           SQISIGN_NAMESPACE(add_couple_jac_points)
+#define copy_bases_to_kernel                            SQISIGN_NAMESPACE(copy_bases_to_kernel)
+#define couple_jac_to_xz                                SQISIGN_NAMESPACE(couple_jac_to_xz)
+#define double_couple_jac_point                         SQISIGN_NAMESPACE(double_couple_jac_point)
+#define double_couple_jac_point_iter                    SQISIGN_NAMESPACE(double_couple_jac_point_iter)
+#define double_couple_point                             SQISIGN_NAMESPACE(double_couple_point)
+#define double_couple_point_iter                        SQISIGN_NAMESPACE(double_couple_point_iter)
+
+// Symbols exported from hnf.c, aliased to sqisign_gen_<name> via SQISIGN_NAMESPACE_GENERIC (bare name if DISABLE_NAMESPACING):
+#undef ibz_mat_4x4_is_hnf
+#undef ibz_mat_4xn_hnf_mod_core
+#undef ibz_vec_4_copy_mod
+#undef ibz_vec_4_linear_combination_mod
+#undef ibz_vec_4_scalar_mul_mod
+
+#define ibz_mat_4x4_is_hnf                              SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_is_hnf)
+#define ibz_mat_4xn_hnf_mod_core                        SQISIGN_NAMESPACE_GENERIC(ibz_mat_4xn_hnf_mod_core)
+#define ibz_vec_4_copy_mod                              SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_copy_mod)
+#define ibz_vec_4_linear_combination_mod                SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_linear_combination_mod)
+#define ibz_vec_4_scalar_mul_mod                        SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_scalar_mul_mod)
+
+// Symbols exported from hnf_internal.c, aliased to sqisign_gen_<name> via SQISIGN_NAMESPACE_GENERIC (bare name if DISABLE_NAMESPACING):
+#undef ibz_centered_mod
+#undef ibz_conditional_assign
+#undef ibz_mod_not_zero
+#undef ibz_xgcd_with_u_not_0
+
+#define ibz_centered_mod                                SQISIGN_NAMESPACE_GENERIC(ibz_centered_mod)
+#define ibz_conditional_assign                          SQISIGN_NAMESPACE_GENERIC(ibz_conditional_assign)
+#define ibz_mod_not_zero                                SQISIGN_NAMESPACE_GENERIC(ibz_mod_not_zero)
+#define ibz_xgcd_with_u_not_0                           SQISIGN_NAMESPACE_GENERIC(ibz_xgcd_with_u_not_0)
+
+// Symbol exported from ibz_division.c, aliased to sqisign_gen_<name> via SQISIGN_NAMESPACE_GENERIC (bare name if DISABLE_NAMESPACING):
+#undef ibz_xgcd
+
+#define ibz_xgcd                                        SQISIGN_NAMESPACE_GENERIC(ibz_xgcd)
+
+// Symbols exported from id2iso.c, aliased via SQISIGN_NAMESPACE to the variant/build-type-prefixed name (bare name if namespacing is off):
+#undef change_of_basis_matrix_tate
+#undef change_of_basis_matrix_tate_invert
+#undef ec_biscalar_mul_ibz_vec
+#undef endomorphism_application_even_basis
+#undef id2iso_ideal_to_kernel_dlogs_even
+#undef id2iso_kernel_dlogs_to_ideal_even
+#undef matrix_application_even_basis
+
+#define change_of_basis_matrix_tate                     SQISIGN_NAMESPACE(change_of_basis_matrix_tate)
+#define change_of_basis_matrix_tate_invert              SQISIGN_NAMESPACE(change_of_basis_matrix_tate_invert)
+#define ec_biscalar_mul_ibz_vec                         SQISIGN_NAMESPACE(ec_biscalar_mul_ibz_vec)
+#define endomorphism_application_even_basis             SQISIGN_NAMESPACE(endomorphism_application_even_basis)
+#define id2iso_ideal_to_kernel_dlogs_even               SQISIGN_NAMESPACE(id2iso_ideal_to_kernel_dlogs_even)
+#define id2iso_kernel_dlogs_to_ideal_even               SQISIGN_NAMESPACE(id2iso_kernel_dlogs_to_ideal_even)
+#define matrix_application_even_basis                   SQISIGN_NAMESPACE(matrix_application_even_basis)
+
+// Symbols exported from ideal.c, aliased to sqisign_gen_<name> via SQISIGN_NAMESPACE_GENERIC (bare name if DISABLE_NAMESPACING):
+#undef quat_lideal_add
+#undef quat_lideal_class_gram
+#undef quat_lideal_conjugate_without_hnf
+#undef quat_lideal_copy
+#undef quat_lideal_create
+#undef quat_lideal_create_principal
+#undef quat_lideal_equals
+#undef quat_lideal_generator
+#undef quat_lideal_inter
+#undef quat_lideal_inverse_lattice_without_hnf
+#undef quat_lideal_mul
+#undef quat_lideal_norm
+#undef quat_lideal_right_order
+#undef quat_lideal_right_transporter
+#undef quat_order_discriminant
+#undef quat_order_is_maximal
+
+#define quat_lideal_add                                 SQISIGN_NAMESPACE_GENERIC(quat_lideal_add)
+#define quat_lideal_class_gram                          SQISIGN_NAMESPACE_GENERIC(quat_lideal_class_gram)
+#define quat_lideal_conjugate_without_hnf               SQISIGN_NAMESPACE_GENERIC(quat_lideal_conjugate_without_hnf)
+#define quat_lideal_copy                                SQISIGN_NAMESPACE_GENERIC(quat_lideal_copy)
+#define quat_lideal_create                              SQISIGN_NAMESPACE_GENERIC(quat_lideal_create)
+#define quat_lideal_create_principal                    SQISIGN_NAMESPACE_GENERIC(quat_lideal_create_principal)
+#define quat_lideal_equals                              SQISIGN_NAMESPACE_GENERIC(quat_lideal_equals)
+#define quat_lideal_generator                           SQISIGN_NAMESPACE_GENERIC(quat_lideal_generator)
+#define quat_lideal_inter                               SQISIGN_NAMESPACE_GENERIC(quat_lideal_inter)
+#define quat_lideal_inverse_lattice_without_hnf         SQISIGN_NAMESPACE_GENERIC(quat_lideal_inverse_lattice_without_hnf)
+#define quat_lideal_mul                                 SQISIGN_NAMESPACE_GENERIC(quat_lideal_mul)
+#define quat_lideal_norm                                SQISIGN_NAMESPACE_GENERIC(quat_lideal_norm)
+#define quat_lideal_right_order                         SQISIGN_NAMESPACE_GENERIC(quat_lideal_right_order)
+#define quat_lideal_right_transporter                   SQISIGN_NAMESPACE_GENERIC(quat_lideal_right_transporter)
+#define quat_order_discriminant                         SQISIGN_NAMESPACE_GENERIC(quat_order_discriminant)
+#define quat_order_is_maximal                           SQISIGN_NAMESPACE_GENERIC(quat_order_is_maximal)
+
+// Namespacing symbols exported from intbig.c:
+#undef ibz_abs
+#undef ibz_add
+#undef ibz_bitsize
+#undef ibz_cmp
+#undef ibz_cmp_int32
+#undef ibz_convert_to_str
+#undef ibz_copy
+#undef ibz_copy_digits
+#undef ibz_div
+#undef ibz_div_2exp
+#undef ibz_div_floor
+#undef ibz_divides
+#undef ibz_finalize
+#undef ibz_gcd
+#undef ibz_get
+#undef ibz_init
+#undef ibz_invmod
+#undef ibz_is_even
+#undef ibz_is_odd
+#undef ibz_is_one
+#undef ibz_is_zero
+#undef ibz_legendre
+#undef ibz_mod
+#undef ibz_mod_ui
+#undef ibz_mul
+#undef ibz_neg
+#undef ibz_pow
+#undef ibz_pow_mod
+#undef ibz_print
+#undef ibz_probab_prime
+#undef ibz_rand_interval
+#undef ibz_rand_interval_bits
+#undef ibz_rand_interval_i
+#undef ibz_rand_interval_minm_m
+#undef ibz_set
+#undef ibz_set_from_str
+#undef ibz_size_in_base
+#undef ibz_sqrt
+#undef ibz_sqrt_floor
+#undef ibz_sqrt_mod_p
+#undef ibz_sub
+#undef ibz_swap
+#undef ibz_to_digits
+#undef ibz_two_adic
+
+#define ibz_abs                                         SQISIGN_NAMESPACE_GENERIC(ibz_abs)
+#define ibz_add                                         SQISIGN_NAMESPACE_GENERIC(ibz_add)
+#define ibz_bitsize                                     SQISIGN_NAMESPACE_GENERIC(ibz_bitsize)
+#define ibz_cmp                                         SQISIGN_NAMESPACE_GENERIC(ibz_cmp)
+#define ibz_cmp_int32                                   SQISIGN_NAMESPACE_GENERIC(ibz_cmp_int32)
+#define ibz_convert_to_str                              SQISIGN_NAMESPACE_GENERIC(ibz_convert_to_str)
+#define ibz_copy                                        SQISIGN_NAMESPACE_GENERIC(ibz_copy)
+#define ibz_copy_digits                                 SQISIGN_NAMESPACE_GENERIC(ibz_copy_digits)
+#define ibz_div                                         SQISIGN_NAMESPACE_GENERIC(ibz_div)
+#define ibz_div_2exp                                    SQISIGN_NAMESPACE_GENERIC(ibz_div_2exp)
+#define ibz_div_floor                                   SQISIGN_NAMESPACE_GENERIC(ibz_div_floor)
+#define ibz_divides                                     SQISIGN_NAMESPACE_GENERIC(ibz_divides)
+#define ibz_finalize                                    SQISIGN_NAMESPACE_GENERIC(ibz_finalize)
+#define ibz_gcd                                         SQISIGN_NAMESPACE_GENERIC(ibz_gcd)
+#define ibz_get                                         SQISIGN_NAMESPACE_GENERIC(ibz_get)
+#define ibz_init                                        SQISIGN_NAMESPACE_GENERIC(ibz_init)
+#define ibz_invmod                                      SQISIGN_NAMESPACE_GENERIC(ibz_invmod)
+#define ibz_is_even                                     SQISIGN_NAMESPACE_GENERIC(ibz_is_even)
+#define ibz_is_odd                                      SQISIGN_NAMESPACE_GENERIC(ibz_is_odd)
+#define ibz_is_one                                      SQISIGN_NAMESPACE_GENERIC(ibz_is_one)
+#define ibz_is_zero                                     SQISIGN_NAMESPACE_GENERIC(ibz_is_zero)
+#define ibz_legendre                                    SQISIGN_NAMESPACE_GENERIC(ibz_legendre)
+#define ibz_mod                                         SQISIGN_NAMESPACE_GENERIC(ibz_mod)
+#define ibz_mod_ui                                      SQISIGN_NAMESPACE_GENERIC(ibz_mod_ui)
+#define ibz_mul                                         SQISIGN_NAMESPACE_GENERIC(ibz_mul)
+#define ibz_neg                                         SQISIGN_NAMESPACE_GENERIC(ibz_neg)
+#define ibz_pow                                         SQISIGN_NAMESPACE_GENERIC(ibz_pow)
+#define ibz_pow_mod                                     SQISIGN_NAMESPACE_GENERIC(ibz_pow_mod)
+#define ibz_print                                       SQISIGN_NAMESPACE_GENERIC(ibz_print)
+#define ibz_probab_prime                                SQISIGN_NAMESPACE_GENERIC(ibz_probab_prime)
+#define ibz_rand_interval                               SQISIGN_NAMESPACE_GENERIC(ibz_rand_interval)
+#define ibz_rand_interval_bits                          SQISIGN_NAMESPACE_GENERIC(ibz_rand_interval_bits)
+#define ibz_rand_interval_i                             SQISIGN_NAMESPACE_GENERIC(ibz_rand_interval_i)
+#define ibz_rand_interval_minm_m                        SQISIGN_NAMESPACE_GENERIC(ibz_rand_interval_minm_m)
+#define ibz_set                                         SQISIGN_NAMESPACE_GENERIC(ibz_set)
+#define ibz_set_from_str                                SQISIGN_NAMESPACE_GENERIC(ibz_set_from_str)
+#define ibz_size_in_base                                SQISIGN_NAMESPACE_GENERIC(ibz_size_in_base)
+#define ibz_sqrt                                        SQISIGN_NAMESPACE_GENERIC(ibz_sqrt)
+#define ibz_sqrt_floor                                  SQISIGN_NAMESPACE_GENERIC(ibz_sqrt_floor)
+#define ibz_sqrt_mod_p                                  SQISIGN_NAMESPACE_GENERIC(ibz_sqrt_mod_p)
+#define ibz_sub                                         SQISIGN_NAMESPACE_GENERIC(ibz_sub)
+#define ibz_swap                                        SQISIGN_NAMESPACE_GENERIC(ibz_swap)
+#define ibz_to_digits                                   SQISIGN_NAMESPACE_GENERIC(ibz_to_digits)
+#define ibz_two_adic                                    SQISIGN_NAMESPACE_GENERIC(ibz_two_adic)
+
+// Namespacing symbols exported from integers.c:
+#undef ibz_cornacchia_prime
+#undef ibz_generate_random_prime
+
+#define ibz_cornacchia_prime                            SQISIGN_NAMESPACE_GENERIC(ibz_cornacchia_prime)
+#define ibz_generate_random_prime                       SQISIGN_NAMESPACE_GENERIC(ibz_generate_random_prime)
+
+// Namespacing symbols exported from isog_chains.c:
+#undef ec_eval_even
+#undef ec_eval_small_chain
+#undef ec_iso_eval
+#undef ec_isomorphism
+
+#define ec_eval_even                                    SQISIGN_NAMESPACE(ec_eval_even)
+#define ec_eval_small_chain                             SQISIGN_NAMESPACE(ec_eval_small_chain)
+#define ec_iso_eval                                     SQISIGN_NAMESPACE(ec_iso_eval)
+#define ec_isomorphism                                  SQISIGN_NAMESPACE(ec_isomorphism)
+
+// Namespacing symbols exported from keygen.c:
+#undef protocols_keygen
+#undef secret_key_finalize
+#undef secret_key_init
+
+#define protocols_keygen                                SQISIGN_NAMESPACE(protocols_keygen)
+#define secret_key_finalize                             SQISIGN_NAMESPACE(secret_key_finalize)
+#define secret_key_init                                 SQISIGN_NAMESPACE(secret_key_init)
+
+// Namespacing symbols exported from l2.c:
+#undef quat_lattice_lll
+#undef quat_lll_core
+
+#define quat_lattice_lll                                SQISIGN_NAMESPACE_GENERIC(quat_lattice_lll)
+#define quat_lll_core                                   SQISIGN_NAMESPACE_GENERIC(quat_lll_core)
+
+// Namespacing symbols exported from lat_ball.c:
+#undef quat_lattice_bound_parallelogram
+#undef quat_lattice_sample_from_ball
+
+#define quat_lattice_bound_parallelogram                SQISIGN_NAMESPACE_GENERIC(quat_lattice_bound_parallelogram)
+#define quat_lattice_sample_from_ball                   SQISIGN_NAMESPACE_GENERIC(quat_lattice_sample_from_ball)
+
+// Namespacing symbols exported from lattice.c:
+#undef quat_lattice_add
+#undef quat_lattice_alg_elem_mul
+#undef quat_lattice_conjugate_without_hnf
+#undef quat_lattice_contains
+#undef quat_lattice_dual_without_hnf
+#undef quat_lattice_equal
+#undef quat_lattice_gram
+#undef quat_lattice_hnf
+#undef quat_lattice_inclusion
+#undef quat_lattice_index
+#undef quat_lattice_intersect
+#undef quat_lattice_mat_alg_coord_mul_without_hnf
+#undef quat_lattice_mul
+#undef quat_lattice_reduce_denom
+
+#define quat_lattice_add                                SQISIGN_NAMESPACE_GENERIC(quat_lattice_add)
+#define quat_lattice_alg_elem_mul                       SQISIGN_NAMESPACE_GENERIC(quat_lattice_alg_elem_mul)
+#define quat_lattice_conjugate_without_hnf              SQISIGN_NAMESPACE_GENERIC(quat_lattice_conjugate_without_hnf)
+#define quat_lattice_contains                           SQISIGN_NAMESPACE_GENERIC(quat_lattice_contains)
+#define quat_lattice_dual_without_hnf                   SQISIGN_NAMESPACE_GENERIC(quat_lattice_dual_without_hnf)
+#define quat_lattice_equal                              SQISIGN_NAMESPACE_GENERIC(quat_lattice_equal)
+#define quat_lattice_gram                               SQISIGN_NAMESPACE_GENERIC(quat_lattice_gram)
+#define quat_lattice_hnf                                SQISIGN_NAMESPACE_GENERIC(quat_lattice_hnf)
+#define quat_lattice_inclusion                          SQISIGN_NAMESPACE_GENERIC(quat_lattice_inclusion)
+#define quat_lattice_index                              SQISIGN_NAMESPACE_GENERIC(quat_lattice_index)
+#define quat_lattice_intersect                          SQISIGN_NAMESPACE_GENERIC(quat_lattice_intersect)
+#define quat_lattice_mat_alg_coord_mul_without_hnf      SQISIGN_NAMESPACE_GENERIC(quat_lattice_mat_alg_coord_mul_without_hnf)
+#define quat_lattice_mul                                SQISIGN_NAMESPACE_GENERIC(quat_lattice_mul)
+#define quat_lattice_reduce_denom                       SQISIGN_NAMESPACE_GENERIC(quat_lattice_reduce_denom)
+
+// Namespacing symbols exported from lll_applications.c:
+#undef quat_lideal_lideal_mul_reduced
+#undef quat_lideal_prime_norm_reduced_equivalent
+#undef quat_lideal_reduce_basis
+
+#define quat_lideal_lideal_mul_reduced                  SQISIGN_NAMESPACE_GENERIC(quat_lideal_lideal_mul_reduced)
+#define quat_lideal_prime_norm_reduced_equivalent       SQISIGN_NAMESPACE_GENERIC(quat_lideal_prime_norm_reduced_equivalent)
+#define quat_lideal_reduce_basis                        SQISIGN_NAMESPACE_GENERIC(quat_lideal_reduce_basis)
+
+// Namespacing symbols exported from lll_verification.c:
+#undef ibq_vec_4_copy_ibz
+#undef quat_lll_bilinear
+#undef quat_lll_gram_schmidt_transposed_with_ibq
+#undef quat_lll_set_ibq_parameters
+#undef quat_lll_verify
+
+#define ibq_vec_4_copy_ibz                              SQISIGN_NAMESPACE_GENERIC(ibq_vec_4_copy_ibz)
+#define quat_lll_bilinear                               SQISIGN_NAMESPACE_GENERIC(quat_lll_bilinear)
+#define quat_lll_gram_schmidt_transposed_with_ibq       SQISIGN_NAMESPACE_GENERIC(quat_lll_gram_schmidt_transposed_with_ibq)
+#define quat_lll_set_ibq_parameters                     SQISIGN_NAMESPACE_GENERIC(quat_lll_set_ibq_parameters)
+#define quat_lll_verify                                 SQISIGN_NAMESPACE_GENERIC(quat_lll_verify)
+
+// Namespacing symbols exported from mem.c:
+#undef sqisign_secure_clear
+#undef sqisign_secure_free
+
+#define sqisign_secure_clear                            SQISIGN_NAMESPACE_GENERIC(sqisign_secure_clear)
+#define sqisign_secure_free                             SQISIGN_NAMESPACE_GENERIC(sqisign_secure_free)
+
+// Namespacing symbols exported from mp.c:
+#undef MUL
+#undef mp_add
+#undef mp_compare
+#undef mp_copy
+#undef mp_inv_2e
+#undef mp_invert_matrix
+#undef mp_is_one
+#undef mp_is_zero
+#undef mp_mod_2exp
+#undef mp_mul
+#undef mp_mul2
+#undef mp_neg
+#undef mp_print
+#undef mp_shiftl
+#undef mp_shiftr
+#undef mp_sub
+#undef multiple_mp_shiftl
+#undef select_ct
+#undef swap_ct
+
+#define MUL                                             SQISIGN_NAMESPACE_GENERIC(MUL)
+#define mp_add                                          SQISIGN_NAMESPACE_GENERIC(mp_add)
+#define mp_compare                                      SQISIGN_NAMESPACE_GENERIC(mp_compare)
+#define mp_copy                                         SQISIGN_NAMESPACE_GENERIC(mp_copy)
+#define mp_inv_2e                                       SQISIGN_NAMESPACE_GENERIC(mp_inv_2e)
+#define mp_invert_matrix                                SQISIGN_NAMESPACE_GENERIC(mp_invert_matrix)
+#define mp_is_one                                       SQISIGN_NAMESPACE_GENERIC(mp_is_one)
+#define mp_is_zero                                      SQISIGN_NAMESPACE_GENERIC(mp_is_zero)
+#define mp_mod_2exp                                     SQISIGN_NAMESPACE_GENERIC(mp_mod_2exp)
+#define mp_mul                                          SQISIGN_NAMESPACE_GENERIC(mp_mul)
+#define mp_mul2                                         SQISIGN_NAMESPACE_GENERIC(mp_mul2)
+#define mp_neg                                          SQISIGN_NAMESPACE_GENERIC(mp_neg)
+#define mp_print                                        SQISIGN_NAMESPACE_GENERIC(mp_print)
+#define mp_shiftl                                       SQISIGN_NAMESPACE_GENERIC(mp_shiftl)
+#define mp_shiftr                                       SQISIGN_NAMESPACE_GENERIC(mp_shiftr)
+#define mp_sub                                          SQISIGN_NAMESPACE_GENERIC(mp_sub)
+#define multiple_mp_shiftl                              SQISIGN_NAMESPACE_GENERIC(multiple_mp_shiftl)
+#define select_ct                                       SQISIGN_NAMESPACE_GENERIC(select_ct)
+#define swap_ct                                         SQISIGN_NAMESPACE_GENERIC(swap_ct)
+
+// Namespacing symbols exported from normeq.c:
+#undef quat_change_to_O0_basis
+#undef quat_lattice_O0_set
+#undef quat_lattice_O0_set_extremal
+#undef quat_order_elem_create
+#undef quat_represent_integer
+#undef quat_sampling_random_ideal_O0_given_norm
+
+#define quat_change_to_O0_basis                         SQISIGN_NAMESPACE_GENERIC(quat_change_to_O0_basis)
+#define quat_lattice_O0_set                             SQISIGN_NAMESPACE_GENERIC(quat_lattice_O0_set)
+#define quat_lattice_O0_set_extremal                    SQISIGN_NAMESPACE_GENERIC(quat_lattice_O0_set_extremal)
+#define quat_order_elem_create                          SQISIGN_NAMESPACE_GENERIC(quat_order_elem_create)
+#define quat_represent_integer                          SQISIGN_NAMESPACE_GENERIC(quat_represent_integer)
+#define quat_sampling_random_ideal_O0_given_norm        SQISIGN_NAMESPACE_GENERIC(quat_sampling_random_ideal_O0_given_norm)
+
+// Namespacing symbols exported from printer.c:
+#undef ibz_mat_2x2_print
+#undef ibz_mat_4x4_print
+#undef ibz_vec_2_print
+#undef ibz_vec_4_print
+#undef quat_alg_elem_print
+#undef quat_alg_print
+#undef quat_lattice_print
+#undef quat_left_ideal_print
+
+#define ibz_mat_2x2_print                               SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_print)
+#define ibz_mat_4x4_print                               SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_print)
+#define ibz_vec_2_print                                 SQISIGN_NAMESPACE_GENERIC(ibz_vec_2_print)
+#define ibz_vec_4_print                                 SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_print)
+#define quat_alg_elem_print                             SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_print)
+#define quat_alg_print                                  SQISIGN_NAMESPACE_GENERIC(quat_alg_print)
+#define quat_lattice_print                              SQISIGN_NAMESPACE_GENERIC(quat_lattice_print)
+#define quat_left_ideal_print                           SQISIGN_NAMESPACE_GENERIC(quat_left_ideal_print)
+
+// Namespacing symbols exported from random_input_generation.c:
+#undef quat_test_input_random_ideal_generation
+#undef quat_test_input_random_ideal_lattice_generation
+#undef quat_test_input_random_lattice_generation
+
+#define quat_test_input_random_ideal_generation         SQISIGN_NAMESPACE_GENERIC(quat_test_input_random_ideal_generation)
+#define quat_test_input_random_ideal_lattice_generation SQISIGN_NAMESPACE_GENERIC(quat_test_input_random_ideal_lattice_generation)
+#define quat_test_input_random_lattice_generation       SQISIGN_NAMESPACE_GENERIC(quat_test_input_random_lattice_generation)
+
+// Namespacing symbols exported from rationals.c:
+#undef ibq_abs
+#undef ibq_add
+#undef ibq_cmp
+#undef ibq_copy
+#undef ibq_finalize
+#undef ibq_init
+#undef ibq_inv
+#undef ibq_is_ibz
+#undef ibq_is_one
+#undef ibq_is_zero
+#undef ibq_mat_4x4_finalize
+#undef ibq_mat_4x4_init
+#undef ibq_mat_4x4_print
+#undef ibq_mul
+#undef ibq_neg
+#undef ibq_reduce
+#undef ibq_set
+#undef ibq_sub
+#undef ibq_to_ibz
+#undef ibq_vec_4_finalize
+#undef ibq_vec_4_init
+#undef ibq_vec_4_print
+
+#define ibq_abs                                         SQISIGN_NAMESPACE_GENERIC(ibq_abs)
+#define ibq_add                                         SQISIGN_NAMESPACE_GENERIC(ibq_add)
+#define ibq_cmp                                         SQISIGN_NAMESPACE_GENERIC(ibq_cmp)
+#define ibq_copy                                        SQISIGN_NAMESPACE_GENERIC(ibq_copy)
+#define ibq_finalize                                    SQISIGN_NAMESPACE_GENERIC(ibq_finalize)
+#define ibq_init                                        SQISIGN_NAMESPACE_GENERIC(ibq_init)
+#define ibq_inv                                         SQISIGN_NAMESPACE_GENERIC(ibq_inv)
+#define ibq_is_ibz                                      SQISIGN_NAMESPACE_GENERIC(ibq_is_ibz)
+#define ibq_is_one                                      SQISIGN_NAMESPACE_GENERIC(ibq_is_one)
+#define ibq_is_zero                                     SQISIGN_NAMESPACE_GENERIC(ibq_is_zero)
+#define ibq_mat_4x4_finalize                            SQISIGN_NAMESPACE_GENERIC(ibq_mat_4x4_finalize)
+#define ibq_mat_4x4_init                                SQISIGN_NAMESPACE_GENERIC(ibq_mat_4x4_init)
+#define ibq_mat_4x4_print                               SQISIGN_NAMESPACE_GENERIC(ibq_mat_4x4_print)
+#define ibq_mul                                         SQISIGN_NAMESPACE_GENERIC(ibq_mul)
+#define ibq_neg                                         SQISIGN_NAMESPACE_GENERIC(ibq_neg)
+#define ibq_reduce                                      SQISIGN_NAMESPACE_GENERIC(ibq_reduce)
+#define ibq_set                                         SQISIGN_NAMESPACE_GENERIC(ibq_set)
+#define ibq_sub                                         SQISIGN_NAMESPACE_GENERIC(ibq_sub)
+#define ibq_to_ibz                                      SQISIGN_NAMESPACE_GENERIC(ibq_to_ibz)
+#define ibq_vec_4_finalize                              SQISIGN_NAMESPACE_GENERIC(ibq_vec_4_finalize)
+#define ibq_vec_4_init                                  SQISIGN_NAMESPACE_GENERIC(ibq_vec_4_init)
+#define ibq_vec_4_print                                 SQISIGN_NAMESPACE_GENERIC(ibq_vec_4_print)
+
+// Namespacing symbols exported from sign.c:
+#undef protocols_sign
+
+#define protocols_sign                                  SQISIGN_NAMESPACE(protocols_sign)
+
+// Namespacing symbols exported from sqisign.c:
+#undef sqisign_keypair
+#undef sqisign_open
+#undef sqisign_sign
+#undef sqisign_verify
+
+#define sqisign_keypair                                 SQISIGN_NAMESPACE(sqisign_keypair)
+#define sqisign_open                                    SQISIGN_NAMESPACE(sqisign_open)
+#define sqisign_sign                                    SQISIGN_NAMESPACE(sqisign_sign)
+#define sqisign_verify                                  SQISIGN_NAMESPACE(sqisign_verify)
+
+// Namespacing symbols exported from theta_isogenies.c:
+#undef theta_chain_compute_and_eval
+#undef theta_chain_compute_and_eval_randomized
+#undef theta_chain_compute_and_eval_verify
+
+#define theta_chain_compute_and_eval                    SQISIGN_NAMESPACE(theta_chain_compute_and_eval)
+#define theta_chain_compute_and_eval_randomized         SQISIGN_NAMESPACE(theta_chain_compute_and_eval_randomized)
+#define theta_chain_compute_and_eval_verify             SQISIGN_NAMESPACE(theta_chain_compute_and_eval_verify)
+
+// Namespacing symbols exported from theta_structure.c:
+#undef double_iter
+#undef double_point
+#undef is_product_theta_point
+#undef theta_precomputation
+
+#define double_iter                                     SQISIGN_NAMESPACE(double_iter)
+#define double_point                                    SQISIGN_NAMESPACE(double_point)
+#define is_product_theta_point                          SQISIGN_NAMESPACE(is_product_theta_point)
+#define theta_precomputation                            SQISIGN_NAMESPACE(theta_precomputation)
+
+// Namespacing symbols exported from verify.c:
+#undef protocols_verify
+
+#define protocols_verify                                SQISIGN_NAMESPACE(protocols_verify)
+
+// Namespacing symbols exported from xeval.c:
+#undef xeval_2
+#undef xeval_2_singular
+#undef xeval_4
+
+#define xeval_2                                         SQISIGN_NAMESPACE(xeval_2)
+#define xeval_2_singular                                SQISIGN_NAMESPACE(xeval_2_singular)
+#define xeval_4                                         SQISIGN_NAMESPACE(xeval_4)
+
+// Namespacing symbols exported from xisog.c:
+#undef xisog_2
+#undef xisog_2_singular
+#undef xisog_4
+
+#define xisog_2                                         SQISIGN_NAMESPACE(xisog_2)
+#define xisog_2_singular                                SQISIGN_NAMESPACE(xisog_2_singular)
+#define xisog_4                                         SQISIGN_NAMESPACE(xisog_4)
+
+
+#endif
+
diff --git a/src/pqm4/sqisign_lvl3/ref/theta_isogenies.c b/src/pqm4/sqisign_lvl3/ref/theta_isogenies.c
new file mode 100644
index 0000000..478a9ab
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/theta_isogenies.c
@@ -0,0 +1,1283 @@
+#include "theta_isogenies.h"
+#include <stdio.h>
+#include <inttypes.h>
+#include <assert.h>
+#include <tools.h>
+#include <rng.h>
+
+// Constant-time choice between a regular matrix M1 and a precomputed matrix M2 (indices into FP2_CONSTANTS):
+// option = 0 gives M <- M1, option = 0xFF...FF gives M <- M2
+static inline void
+select_base_change_matrix(basis_change_matrix_t *M,
+                          const basis_change_matrix_t *M1,
+                          const precomp_basis_change_matrix_t *M2,
+                          const uint32_t option)
+{
+    for (int idx = 0; idx < 16; idx++) {
+        const int r = idx / 4, c = idx % 4; // row-major walk over the 4x4 entries
+        fp2_select(&M->m[r][c], &M1->m[r][c], &FP2_CONSTANTS[M2->m[r][c]], option);
+    }
+}
+
+// Expand a precomputed base change matrix into a regular one:
+// each entry of M is used as an index into FP2_CONSTANTS
+static inline void
+set_base_change_matrix_from_precomp(basis_change_matrix_t *res, const precomp_basis_change_matrix_t *M)
+{
+    for (int idx = 0; idx < 16; idx++)
+        res->m[idx / 4][idx % 4] = FP2_CONSTANTS[M->m[idx / 4][idx % 4]];
+}
+
+static inline void
+choose_index_theta_point(fp2_t *res, int ind, const theta_point_t *T)
+{
+    // Coordinate selected by ind modulo 4: 0 -> x, 1 -> y, 2 -> z, 3 -> t.
+    const int slot = ind % 4;
+    const fp2_t *picked = NULL;
+
+    if (slot == 0) {
+        picked = &T->x;
+    } else if (slot == 1) {
+        picked = &T->y;
+    } else if (slot == 2) {
+        picked = &T->z;
+    } else if (slot == 3) {
+        picked = &T->t;
+    } else {
+        // Only reachable for negative ind, where C's % yields a negative remainder.
+        assert(0);
+    }
+
+    fp2_copy(res, picked);
+}
+
+// Matrix-vector product res = M * P; when Pt_not_zero is false the P->t terms are skipped (cheaper if P->t is zero).
+static void
+apply_isomorphism_general(theta_point_t *res,
+                          const basis_change_matrix_t *M,
+                          const theta_point_t *P,
+                          const bool Pt_not_zero)
+{
+    fp2_t x1;
+    theta_point_t temp;
+    // Rows 0..3 of M times (P->x, P->y, P->z), accumulated in temp so that res may alias P; row 0 -> temp.x
+    fp2_mul(&temp.x, &P->x, &M->m[0][0]);
+    fp2_mul(&x1, &P->y, &M->m[0][1]);
+    fp2_add(&temp.x, &temp.x, &x1);
+    fp2_mul(&x1, &P->z, &M->m[0][2]);
+    fp2_add(&temp.x, &temp.x, &x1);
+    // row 1 -> temp.y
+    fp2_mul(&temp.y, &P->x, &M->m[1][0]);
+    fp2_mul(&x1, &P->y, &M->m[1][1]);
+    fp2_add(&temp.y, &temp.y, &x1);
+    fp2_mul(&x1, &P->z, &M->m[1][2]);
+    fp2_add(&temp.y, &temp.y, &x1);
+    // row 2 -> temp.z
+    fp2_mul(&temp.z, &P->x, &M->m[2][0]);
+    fp2_mul(&x1, &P->y, &M->m[2][1]);
+    fp2_add(&temp.z, &temp.z, &x1);
+    fp2_mul(&x1, &P->z, &M->m[2][2]);
+    fp2_add(&temp.z, &temp.z, &x1);
+    // row 3 -> temp.t
+    fp2_mul(&temp.t, &P->x, &M->m[3][0]);
+    fp2_mul(&x1, &P->y, &M->m[3][1]);
+    fp2_add(&temp.t, &temp.t, &x1);
+    fp2_mul(&x1, &P->z, &M->m[3][2]);
+    fp2_add(&temp.t, &temp.t, &x1);
+    // Add the fourth column (the P->t terms) only when the caller flags P->t as non-zero
+    if (Pt_not_zero) {
+        fp2_mul(&x1, &P->t, &M->m[0][3]);
+        fp2_add(&temp.x, &temp.x, &x1);
+
+        fp2_mul(&x1, &P->t, &M->m[1][3]);
+        fp2_add(&temp.y, &temp.y, &x1);
+
+        fp2_mul(&x1, &P->t, &M->m[2][3]);
+        fp2_add(&temp.z, &temp.z, &x1);
+
+        fp2_mul(&x1, &P->t, &M->m[3][3]);
+        fp2_add(&temp.t, &temp.t, &x1);
+    }
+    // Write the result out only now that every read of P is done
+    fp2_copy(&res->x, &temp.x);
+    fp2_copy(&res->y, &temp.y);
+    fp2_copy(&res->z, &temp.z);
+    fp2_copy(&res->t, &temp.t);
+}
+
+static void
+apply_isomorphism(theta_point_t *res, const basis_change_matrix_t *M, const theta_point_t *P)
+{
+    apply_isomorphism_general(res, M, P, true); // Pt_not_zero = true: include the P->t column
+}
+
+// res = M1 * M2 (4x4 matrix product); res may alias M1 or M2
+static void
+base_change_matrix_multiplication(basis_change_matrix_t *res,
+                                  const basis_change_matrix_t *M1,
+                                  const basis_change_matrix_t *M2)
+{
+    // Accumulate into a local matrix so the inputs stay intact until the end
+    basis_change_matrix_t product;
+    fp2_t term;
+    for (int row = 0; row < 4; row++) {
+        for (int col = 0; col < 4; col++) {
+            fp2_t *acc = &product.m[row][col];
+            fp2_set_zero(acc);
+            for (int k = 0; k < 4; k++) {
+                // term = M1[row][k] * M2[k][col]
+                fp2_mul(&term, &M1->m[row][k], &M2->m[k][col]);
+                fp2_add(acc, acc, &term);
+            }
+        }
+    }
+    *res = product;
+}
+
+// Map the couple point T = (P1, P2) on the elliptic product to a theta point via phi->M
+static void
+base_change(theta_point_t *out, const theta_gluing_t *phi, const theta_couple_point_t *T)
+{
+    const ec_point_t *P1 = &T->P1, *P2 = &T->P2;
+    theta_point_t prod;
+    // prod = (P1.x P2.x : P1.x P2.z : P1.z P2.x : P1.z P2.z)
+    fp2_mul(&prod.x, &P1->x, &P2->x);
+    fp2_mul(&prod.y, &P1->x, &P2->z);
+    fp2_mul(&prod.z, &P2->x, &P1->z);
+    fp2_mul(&prod.t, &P1->z, &P2->z);
+
+    // Apply the 4x4 basis change stored in the gluing structure;
+    // apply_isomorphism uses all four coordinates of prod.
+    apply_isomorphism(out, &phi->M, &prod);
+}
+
+static void
+action_by_translation_z_and_det(fp2_t *z_inv, fp2_t *det_inv, const ec_point_t *P4, const ec_point_t *P2)
+{
+    // Store the Z-coordinate to invert (despite its name, z_inv receives P4->z itself here)
+    fp2_copy(z_inv, &P4->z);
+    // Likewise det_inv receives the determinant itself; no inversion happens in this function
+    // Then collect detij = xij wij - uij zij, with P4 = (xij : zij) and P2 = (uij : wij)
+    fp2_t tmp;
+    fp2_mul(det_inv, &P4->x, &P2->z);
+    fp2_mul(&tmp, &P4->z, &P2->x);
+    fp2_sub(det_inv, det_inv, &tmp);
+}
+
+static void
+action_by_translation_compute_matrix(translation_matrix_t *G,
+                                     const ec_point_t *P4,
+                                     const ec_point_t *P2,
+                                     const fp2_t *z_inv,
+                                     const fp2_t *det_inv)
+{
+    fp2_t tmp;
+    // Notation: P4 = (xij : zij), P2 = (uij : wij); z_inv and det_inv are used as 1/zij and 1/detij
+    // Gi.g10 = uij xij /detij - xij/zij
+    fp2_mul(&tmp, &P4->x, z_inv);
+    fp2_mul(&G->g10, &P4->x, &P2->x);
+    fp2_mul(&G->g10, &G->g10, det_inv);
+    fp2_sub(&G->g10, &G->g10, &tmp);
+
+    // Gi.g11 = uij zij / detij
+    fp2_mul(&G->g11, &P2->x, det_inv);
+    fp2_mul(&G->g11, &G->g11, &P4->z);
+
+    // Gi.g00 = -Gi.g11
+    fp2_neg(&G->g00, &G->g11);
+
+    // Gi.g01 = - wij zij / detij
+    fp2_mul(&G->g01, &P2->z, det_inv);
+    fp2_mul(&G->g01, &G->g01, &P4->z);
+    fp2_neg(&G->g01, &G->g01);
+}
+
+// Returns 1 if K1_2 and K2_2 pass the three checks below (non-zero, distinct on each curve,
+// killed by one doubling) and 0 otherwise. We only expect this to fail for malformed
+// signatures, so do not require this to run in constant time.
+static int
+verify_two_torsion(const theta_couple_point_t *K1_2, const theta_couple_point_t *K2_2, const theta_couple_curve_t *E12)
+{
+    // First check if any point in K1_2 or K2_2 is zero, if they are then the points did not have
+    // order 8 when we started gluing
+    if (ec_is_zero(&K1_2->P1) | ec_is_zero(&K1_2->P2) | ec_is_zero(&K2_2->P1) | ec_is_zero(&K2_2->P2)) {
+        return 0;
+    }
+
+    // Now ensure that P1, Q1 and P2, Q2 are independent. For points of order two this means
+    // that they're not the same
+    if (ec_is_equal(&K1_2->P1, &K2_2->P1) | ec_is_equal(&K1_2->P2, &K2_2->P2)) {
+        return 0;
+    }
+
+    // Finally, double points to ensure all points have order exactly 2
+    theta_couple_point_t O1, O2;
+    double_couple_point(&O1, K1_2, E12);
+    double_couple_point(&O2, K2_2, E12);
+    // If this check fails then the points had order 2*f for some f, and the kernel is malformed.
+    if (!(ec_is_zero(&O1.P1) & ec_is_zero(&O1.P2) & ec_is_zero(&O2.P1) & ec_is_zero(&O2.P2))) {
+        return 0;
+    }
+
+    return 1;
+}
+
+// Computes the action by translation for the four points
+// K1_4 = (P1, P2) and K2_4 = (Q1, Q2) on E1 x E2 simultaneously to
+// save on inversions. Writes the four matrices to Gi[0..3].
+// Returns 0 if the doubles of the inputs fail verify_two_torsion or
+// if inverses[0] is zero after the batched inversion, and 1 otherwise
+static int
+action_by_translation(translation_matrix_t *Gi,
+                      const theta_couple_point_t *K1_4,
+                      const theta_couple_point_t *K2_4,
+                      const theta_couple_curve_t *E12)
+{
+    // Compute points of order 2 from Ki_4
+    theta_couple_point_t K1_2, K2_2;
+    double_couple_point(&K1_2, K1_4, E12);
+    double_couple_point(&K2_2, K2_4, E12);
+
+    if (!verify_two_torsion(&K1_2, &K2_2, E12)) {
+        return 0;
+    }
+
+    // We need to invert four Z coordinates (slots 0..3) and
+    // four determinants (slots 4..7) which we do with batched
+    // inversion
+    fp2_t inverses[8];
+    action_by_translation_z_and_det(&inverses[0], &inverses[4], &K1_4->P1, &K1_2.P1);
+    action_by_translation_z_and_det(&inverses[1], &inverses[5], &K1_4->P2, &K1_2.P2);
+    action_by_translation_z_and_det(&inverses[2], &inverses[6], &K2_4->P1, &K2_2.P1);
+    action_by_translation_z_and_det(&inverses[3], &inverses[7], &K2_4->P2, &K2_2.P2);
+    // NOTE(review): only slot 0 is tested below; presumably fp2_batched_inv zeroes every slot if any input is zero -- confirm
+    fp2_batched_inv(inverses, 8);
+    if (fp2_is_zero(&inverses[0]))
+        return 0; // something was wrong with our input (which somehow was not caught by
+                  // verify_two_torsion)
+
+    action_by_translation_compute_matrix(&Gi[0], &K1_4->P1, &K1_2.P1, &inverses[0], &inverses[4]);
+    action_by_translation_compute_matrix(&Gi[1], &K1_4->P2, &K1_2.P2, &inverses[1], &inverses[5]);
+    action_by_translation_compute_matrix(&Gi[2], &K2_4->P1, &K2_2.P1, &inverses[2], &inverses[6]);
+    action_by_translation_compute_matrix(&Gi[3], &K2_4->P2, &K2_2.P2, &inverses[3], &inverses[7]);
+
+    return 1;
+}
+
+// Builds the 4x4 basis change matrix M for the gluing from the 4-torsion couple
+// points K1_4 and K2_4 on E12, using the four 2x2 translation matrices Gi
+// produced by action_by_translation.
+// Returns 1 on success, 0 if action_by_translation rejects K1_4 or K2_4
+static int
+gluing_change_of_basis(basis_change_matrix_t *M,
+                       const theta_couple_point_t *K1_4,
+                       const theta_couple_point_t *K2_4,
+                       const theta_couple_curve_t *E12)
+{
+    // Compute the four 2x2 matrices for the action by translation
+    // on the four points:
+    translation_matrix_t Gi[4];
+    if (!action_by_translation(Gi, K1_4, K2_4, E12))
+        return 0;
+
+    // Computation of the 4x4 matrix from the Gi (reading gRC as row R, column C):
+    // t001, t101 (resp t002, t102) first column of Gi[0] * Gi[2] (resp Gi[1] * Gi[3])
+    fp2_t t001, t101, t002, t102, tmp;
+
+    fp2_mul(&t001, &Gi[0].g00, &Gi[2].g00);
+    fp2_mul(&tmp, &Gi[0].g01, &Gi[2].g10);
+    fp2_add(&t001, &t001, &tmp);
+
+    fp2_mul(&t101, &Gi[0].g10, &Gi[2].g00);
+    fp2_mul(&tmp, &Gi[0].g11, &Gi[2].g10);
+    fp2_add(&t101, &t101, &tmp);
+
+    fp2_mul(&t002, &Gi[1].g00, &Gi[3].g00);
+    fp2_mul(&tmp, &Gi[1].g01, &Gi[3].g10);
+    fp2_add(&t002, &t002, &tmp);
+
+    fp2_mul(&t102, &Gi[1].g10, &Gi[3].g00);
+    fp2_mul(&tmp, &Gi[1].g11, &Gi[3].g10);
+    fp2_add(&t102, &t102, &tmp);
+
+    // trace for the first row (m[0][0] additionally carries the constant term 1)
+    fp2_set_one(&M->m[0][0]);
+    fp2_mul(&tmp, &t001, &t002);
+    fp2_add(&M->m[0][0], &M->m[0][0], &tmp);
+    fp2_mul(&tmp, &Gi[2].g00, &Gi[3].g00);
+    fp2_add(&M->m[0][0], &M->m[0][0], &tmp);
+    fp2_mul(&tmp, &Gi[0].g00, &Gi[1].g00);
+    fp2_add(&M->m[0][0], &M->m[0][0], &tmp);
+
+    fp2_mul(&M->m[0][1], &t001, &t102);
+    fp2_mul(&tmp, &Gi[2].g00, &Gi[3].g10);
+    fp2_add(&M->m[0][1], &M->m[0][1], &tmp);
+    fp2_mul(&tmp, &Gi[0].g00, &Gi[1].g10);
+    fp2_add(&M->m[0][1], &M->m[0][1], &tmp);
+
+    fp2_mul(&M->m[0][2], &t101, &t002);
+    fp2_mul(&tmp, &Gi[2].g10, &Gi[3].g00);
+    fp2_add(&M->m[0][2], &M->m[0][2], &tmp);
+    fp2_mul(&tmp, &Gi[0].g10, &Gi[1].g00);
+    fp2_add(&M->m[0][2], &M->m[0][2], &tmp);
+
+    fp2_mul(&M->m[0][3], &t101, &t102);
+    fp2_mul(&tmp, &Gi[2].g10, &Gi[3].g10);
+    fp2_add(&M->m[0][3], &M->m[0][3], &tmp);
+    fp2_mul(&tmp, &Gi[0].g10, &Gi[1].g10);
+    fp2_add(&M->m[0][3], &M->m[0][3], &tmp);
+
+    // Second row: apply Gi[3], the action of (0, K2_4.P2), to the first row
+    fp2_mul(&tmp, &Gi[3].g01, &M->m[0][1]);
+    fp2_mul(&M->m[1][0], &Gi[3].g00, &M->m[0][0]);
+    fp2_add(&M->m[1][0], &M->m[1][0], &tmp);
+
+    fp2_mul(&tmp, &Gi[3].g11, &M->m[0][1]);
+    fp2_mul(&M->m[1][1], &Gi[3].g10, &M->m[0][0]);
+    fp2_add(&M->m[1][1], &M->m[1][1], &tmp);
+
+    fp2_mul(&tmp, &Gi[3].g01, &M->m[0][3]);
+    fp2_mul(&M->m[1][2], &Gi[3].g00, &M->m[0][2]);
+    fp2_add(&M->m[1][2], &M->m[1][2], &tmp);
+
+    fp2_mul(&tmp, &Gi[3].g11, &M->m[0][3]);
+    fp2_mul(&M->m[1][3], &Gi[3].g10, &M->m[0][2]);
+    fp2_add(&M->m[1][3], &M->m[1][3], &tmp);
+
+    // Third row: apply Gi[0], the action of (K1_4.P1, 0), to the first row
+    fp2_mul(&tmp, &Gi[0].g01, &M->m[0][2]);
+    fp2_mul(&M->m[2][0], &Gi[0].g00, &M->m[0][0]);
+    fp2_add(&M->m[2][0], &M->m[2][0], &tmp);
+
+    fp2_mul(&tmp, &Gi[0].g01, &M->m[0][3]);
+    fp2_mul(&M->m[2][1], &Gi[0].g00, &M->m[0][1]);
+    fp2_add(&M->m[2][1], &M->m[2][1], &tmp);
+
+    fp2_mul(&tmp, &Gi[0].g11, &M->m[0][2]);
+    fp2_mul(&M->m[2][2], &Gi[0].g10, &M->m[0][0]);
+    fp2_add(&M->m[2][2], &M->m[2][2], &tmp);
+
+    fp2_mul(&tmp, &Gi[0].g11, &M->m[0][3]);
+    fp2_mul(&M->m[2][3], &Gi[0].g10, &M->m[0][1]);
+    fp2_add(&M->m[2][3], &M->m[2][3], &tmp);
+
+    // Final row: apply Gi[0] to the second row, i.e. the action of (K1_4.P1, K2_4.P2)
+    fp2_mul(&tmp, &Gi[0].g01, &M->m[1][2]);
+    fp2_mul(&M->m[3][0], &Gi[0].g00, &M->m[1][0]);
+    fp2_add(&M->m[3][0], &M->m[3][0], &tmp);
+
+    fp2_mul(&tmp, &Gi[0].g01, &M->m[1][3]);
+    fp2_mul(&M->m[3][1], &Gi[0].g00, &M->m[1][1]);
+    fp2_add(&M->m[3][1], &M->m[3][1], &tmp);
+
+    fp2_mul(&tmp, &Gi[0].g11, &M->m[1][2]);
+    fp2_mul(&M->m[3][2], &Gi[0].g10, &M->m[1][0]);
+    fp2_add(&M->m[3][2], &M->m[3][2], &tmp);
+
+    fp2_mul(&tmp, &Gi[0].g11, &M->m[1][3]);
+    fp2_mul(&M->m[3][3], &Gi[0].g10, &M->m[1][1]);
+    fp2_add(&M->m[3][3], &M->m[3][3], &tmp);
+
+    return 1;
+}
+
+/**
+ * @brief Compute the gluing isogeny from an elliptic product
+ *
+ * @param out Output: the theta_gluing
+ * @param E12 an elliptic curve product
+ * @param xyK1_8 a couple point in (X : Y : Z) coordinates, expected in E1[8] x E2[8]
+ * @param xyK2_8 a couple point in (X : Y : Z) coordinates, expected in E1[8] x E2[8]
+ * @param verify if true, also check that the images of the 8-torsion are coherent
+ *
+ * out : E1xE2 -> A of kernel [4](K1_8,K2_8)
+ * Returns 0 if the kernel has the incorrect order or the gluing seems malformed, 1 otherwise.
+ */
+static int
+gluing_compute(theta_gluing_t *out,
+               const theta_couple_curve_t *E12,
+               const theta_couple_jac_point_t *xyK1_8,
+               const theta_couple_jac_point_t *xyK2_8,
+               bool verify)
+{
+    // Ensure that we have been given the eight torsion (debug builds only, non fatal)
+#ifndef NDEBUG
+    {
+        int check = test_jac_order_twof(&xyK1_8->P1, &E12->E1, 3);
+        if (!check)
+            debug_print("xyK1_8->P1 does not have order 8");
+        check = test_jac_order_twof(&xyK2_8->P1, &E12->E1, 3);
+        if (!check)
+            debug_print("xyK2_8->P1 does not have order 8");
+        check = test_jac_order_twof(&xyK1_8->P2, &E12->E2, 3);
+        if (!check)
+            debug_print("xyK1_8->P2 does not have order 8");
+        check = test_jac_order_twof(&xyK2_8->P2, &E12->E2, 3);
+        if (!check)
+            debug_print("xyK2_8->P2 does not have order 8");
+    }
+#endif
+
+    out->xyK1_8 = *xyK1_8;
+    out->domain = *E12;
+
+    // Given points in E[8] x E[8] we need the four torsion below
+    theta_couple_jac_point_t xyK1_4, xyK2_4;
+
+    double_couple_jac_point(&xyK1_4, xyK1_8, E12);
+    double_couple_jac_point(&xyK2_4, xyK2_8, E12);
+
+    // Convert from (X:Y:Z) coordinates to (X:Z)
+    theta_couple_point_t K1_8, K2_8;
+    theta_couple_point_t K1_4, K2_4;
+
+    couple_jac_to_xz(&K1_8, xyK1_8);
+    couple_jac_to_xz(&K2_8, xyK2_8);
+    couple_jac_to_xz(&K1_4, &xyK1_4);
+    couple_jac_to_xz(&K2_4, &xyK2_4);
+
+    // Set the basis change matrix, if we have not been given a valid K[8] for this computation
+    // gluing_change_of_basis will detect this and return 0
+    if (!gluing_change_of_basis(&out->M, &K1_4, &K2_4, E12)) {
+        debug_print("gluing failed as kernel does not have correct order");
+        return 0;
+    }
+
+    // apply the base change to the kernel
+    theta_point_t TT1, TT2;
+
+    base_change(&TT1, out, &K1_8);
+    base_change(&TT2, out, &K2_8);
+
+    // compute the codomain
+    to_squared_theta(&TT1, &TT1);
+    to_squared_theta(&TT2, &TT2);
+
+    // If the kernel is well formed then TT1.t and TT2.t are zero
+    // if they are not, we exit early as the signature we are validating
+    // is probably malformed
+    if (!(fp2_is_zero(&TT1.t) & fp2_is_zero(&TT2.t))) {
+        debug_print("gluing failed TT1.t or TT2.t is not zero");
+        return 0;
+    }
+    // Test our projective factors are non zero
+    if (fp2_is_zero(&TT1.x) | fp2_is_zero(&TT2.x) | fp2_is_zero(&TT1.y) | fp2_is_zero(&TT2.z) | fp2_is_zero(&TT1.z))
+        return 0; // invalid input
+
+    // Projective factor: Ax
+    fp2_mul(&out->codomain.x, &TT1.x, &TT2.x);
+    fp2_mul(&out->codomain.y, &TT1.y, &TT2.x);
+    fp2_mul(&out->codomain.z, &TT1.x, &TT2.z);
+    fp2_set_zero(&out->codomain.t);
+    // Projective factor: ABCxz
+    fp2_mul(&out->precomputation.x, &TT1.y, &TT2.z);
+    fp2_copy(&out->precomputation.y, &out->codomain.z);
+    fp2_copy(&out->precomputation.z, &out->codomain.y);
+    fp2_set_zero(&out->precomputation.t);
+
+    // Compute the two components of phi(K1_8) = (x:x:y:y).
+    fp2_mul(&out->imageK1_8.x, &TT1.x, &out->precomputation.x);
+    fp2_mul(&out->imageK1_8.y, &TT1.z, &out->precomputation.z);
+
+    // If K1_8 and K2_8 are our 8-torsion points, this ensures that the
+    // 4-torsion points [2]K1_8 and [2]K2_8 are isotropic.
+    if (verify) {
+        fp2_t t1, t2;
+        fp2_mul(&t1, &TT1.y, &out->precomputation.y);
+        if (!fp2_is_equal(&out->imageK1_8.x, &t1))
+            return 0;
+        fp2_mul(&t1, &TT2.x, &out->precomputation.x);
+        fp2_mul(&t2, &TT2.z, &out->precomputation.z);
+        if (!fp2_is_equal(&t2, &t1))
+            return 0;
+    }
+
+    // compute the final codomain
+    hadamard(&out->codomain, &out->codomain);
+    return 1;
+}
+
+// Evaluate the gluing phi at the couple point P in (X:Y:Z) coordinates; sub routine of gluing_eval_basis
+static void
+gluing_eval_point(theta_point_t *image, const theta_couple_jac_point_t *P, const theta_gluing_t *phi)
+{
+    theta_point_t T1, T2;
+    add_components_t add_comp1, add_comp2;
+
+    // Compute the cross addition components of P1+Q1 and P2+Q2, where Q = phi->xyK1_8
+    jac_to_xz_add_components(&add_comp1, &P->P1, &phi->xyK1_8.P1, &phi->domain.E1);
+    jac_to_xz_add_components(&add_comp2, &P->P2, &phi->xyK1_8.P2, &phi->domain.E2);
+
+    // Compute T1 and T2 derived from the cross addition components.
+    fp2_mul(&T1.x, &add_comp1.u, &add_comp2.u); // T1x = u1u2
+    fp2_mul(&T2.t, &add_comp1.v, &add_comp2.v); // T2t = v1v2
+    fp2_add(&T1.x, &T1.x, &T2.t);               // T1x = u1u2 + v1v2
+    fp2_mul(&T1.y, &add_comp1.u, &add_comp2.w); // T1y = u1w2
+    fp2_mul(&T1.z, &add_comp1.w, &add_comp2.u); // T1z = w1u2
+    fp2_mul(&T1.t, &add_comp1.w, &add_comp2.w); // T1t = w1w2
+    fp2_add(&T2.x, &add_comp1.u, &add_comp1.v); // T2x = (u1+v1)
+    fp2_add(&T2.y, &add_comp2.u, &add_comp2.v); // T2y = (u2+v2)
+    fp2_mul(&T2.x, &T2.x, &T2.y);               // T2x = (u1+v1)(u2+v2)
+    fp2_sub(&T2.x, &T2.x, &T1.x);               // T2x = v1u2 + u1v2
+    fp2_mul(&T2.y, &add_comp1.v, &add_comp2.w); // T2y = v1w2
+    fp2_mul(&T2.z, &add_comp1.w, &add_comp2.v); // T2z = w1v2
+    fp2_set_zero(&T2.t);                        // T2t = 0
+
+    // Apply the basis change and compute their respective square
+    // theta(P+Q) = M.T1 - M.T2 and theta(P-Q) = M.T1 + M.T2
+    apply_isomorphism_general(&T1, &phi->M, &T1, true);
+    apply_isomorphism_general(&T2, &phi->M, &T2, false);
+    pointwise_square(&T1, &T1);
+    pointwise_square(&T2, &T2);
+
+    // the difference between the two is therefore theta(P+Q)theta(P-Q)
+    // whose hadamard transform is then the product of the dual
+    // theta_points of phi(P) and phi(Q).
+    fp2_sub(&T1.x, &T1.x, &T2.x);
+    fp2_sub(&T1.y, &T1.y, &T2.y);
+    fp2_sub(&T1.z, &T1.z, &T2.z);
+    fp2_sub(&T1.t, &T1.t, &T2.t);
+    hadamard(&T1, &T1);
+
+    // Compute (x, y, z, t)
+    // As imageK1_8 = (x:x:y:y), its inverse is (y:y:x:x).
+    fp2_mul(&image->x, &T1.x, &phi->imageK1_8.y);
+    fp2_mul(&image->y, &T1.y, &phi->imageK1_8.y);
+    fp2_mul(&image->z, &T1.z, &phi->imageK1_8.x);
+    fp2_mul(&image->t, &T1.t, &phi->imageK1_8.x);
+
+    hadamard(image, image);
+}
+
+// Evaluate the gluing isogeny phi at the couple point P given in (X:Z) coordinates.
+// Variant of gluing_eval_point for points whose fourth squared-theta coordinate
+// vanishes. Returns 1 and writes *image on success, 0 (image untouched) otherwise.
+static int
+gluing_eval_point_special_case(theta_point_t *image, const theta_couple_point_t *P, const theta_gluing_t *phi)
+{
+    theta_point_t sq;
+
+    // Basis change followed by the to_squared_theta transform
+    base_change(&sq, phi, P);
+    to_squared_theta(&sq, &sq);
+
+    // In a gluing this coordinate must always be 0 because D=0; anything
+    // else means something went very wrong, so only the zero case proceeds
+    if (fp2_is_zero(&sq.t)) {
+        // Scale the three remaining coordinates by the gluing constants,
+        // the fourth coordinate of the result is zero
+        fp2_mul(&image->x, &sq.x, &phi->precomputation.x);
+        fp2_mul(&image->y, &sq.y, &phi->precomputation.y);
+        fp2_mul(&image->z, &sq.z, &phi->precomputation.z);
+        fp2_set_zero(&image->t);
+
+        hadamard(image, image);
+        return 1;
+    }
+
+    return 0;
+}
+
+/**
+ * @brief Push two couple points through a gluing isogeny from an elliptic product
+ * @param image1 Output: theta point of the image of xyT1
+ * @param image2 Output: theta point of the image of xyT2
+ * @param xyT1 first couple point (X : Y : Z) on E1 x E2
+ * @param xyT2 second couple point (X : Y : Z) on E1 x E2
+ * @param phi a gluing isogeny E1 x E2 -> A
+ **/
+static void
+gluing_eval_basis(theta_point_t *image1,
+                  theta_point_t *image2,
+                  const theta_couple_jac_point_t *xyT1,
+                  const theta_couple_jac_point_t *xyT2,
+                  const theta_gluing_t *phi)
+{
+    theta_point_t *const dst[2] = { image1, image2 };
+    const theta_couple_jac_point_t *const src[2] = { xyT1, xyT2 };
+    for (int k = 0; k < 2; ++k)
+        gluing_eval_point(dst[k], src[k], phi);
+}
+
+/**
+ * @brief Compute a (2,2) isogeny in dimension 2 in the theta_model
+ *
+ * @param out Output: the theta_isogeny
+ * @param A the theta structure of the domain
+ * @param T1_8 a point in A[8]
+ * @param T2_8 a point in A[8]
+ * @param hadamard_bool_1 if true, a Hadamard transform is applied to T1_8 and T2_8 before squaring
+ * @param hadamard_bool_2 if true, a Hadamard transform is applied to the codomain null point
+ * @param verify add extra sanity checks to ensure our 8-torsion points are coherent with the isogeny
+ *
+ * out : A -> B of kernel [4](T1_8,T2_8)
+ * hadamard_bool_1 controls if the domain is in standard or dual coordinates
+ * hadamard_bool_2 controls if the codomain is in standard or dual coordinates
+ * Returns 0 if a projective factor vanishes or a verify check fails, 1 otherwise.
+ */
+static int
+theta_isogeny_compute(theta_isogeny_t *out,
+                      const theta_structure_t *A,
+                      const theta_point_t *T1_8,
+                      const theta_point_t *T2_8,
+                      bool hadamard_bool_1,
+                      bool hadamard_bool_2,
+                      bool verify)
+{
+    out->hadamard_bool_1 = hadamard_bool_1;
+    out->hadamard_bool_2 = hadamard_bool_2;
+    out->domain = *A;
+    out->T1_8 = *T1_8;
+    out->T2_8 = *T2_8;
+    out->codomain.precomputation = false;
+
+    theta_point_t TT1, TT2;
+
+    if (hadamard_bool_1) {
+        hadamard(&TT1, T1_8);
+        to_squared_theta(&TT1, &TT1);
+        hadamard(&TT2, T2_8);
+        to_squared_theta(&TT2, &TT2);
+    } else {
+        to_squared_theta(&TT1, T1_8);
+        to_squared_theta(&TT2, T2_8);
+    }
+
+    fp2_t t1, t2;
+
+    // Test that our projective factor ABCDxzw is non zero, where
+    // TT1=(Ax, Bx, Cy, Dy), TT2=(Az, Bw, Cz, Dw)
+    // But ABCDxzw=0 can only happen if we had an unexpected splitting in
+    // the isogeny chain.
+    // In either case reject
+    // (this is not strictly necessary, we could just return (0:0:0:0))
+    if (fp2_is_zero(&TT2.x) | fp2_is_zero(&TT2.y) | fp2_is_zero(&TT2.z) | fp2_is_zero(&TT2.t) | fp2_is_zero(&TT1.x) |
+        fp2_is_zero(&TT1.y))
+        return 0;
+
+    fp2_mul(&t1, &TT1.x, &TT2.y);
+    fp2_mul(&t2, &TT1.y, &TT2.x);
+    fp2_mul(&out->codomain.null_point.x, &TT2.x, &t1);
+    fp2_mul(&out->codomain.null_point.y, &TT2.y, &t2);
+    fp2_mul(&out->codomain.null_point.z, &TT2.z, &t1);
+    fp2_mul(&out->codomain.null_point.t, &TT2.t, &t2);
+    fp2_t t3;
+    fp2_mul(&t3, &TT2.z, &TT2.t);
+    fp2_mul(&out->precomputation.x, &t3, &TT1.y);
+    fp2_mul(&out->precomputation.y, &t3, &TT1.x);
+    fp2_copy(&out->precomputation.z, &out->codomain.null_point.t);
+    fp2_copy(&out->precomputation.t, &out->codomain.null_point.z);
+
+    // If T1_8 and T2_8 are our 8-torsion points, this ensures that the
+    // 4-torsion points 2T1_8 and 2T2_8 are isotropic.
+    if (verify) {
+        fp2_mul(&t1, &TT1.x, &out->precomputation.x);
+        fp2_mul(&t2, &TT1.y, &out->precomputation.y);
+        if (!fp2_is_equal(&t1, &t2))
+            return 0;
+        fp2_mul(&t1, &TT1.z, &out->precomputation.z);
+        fp2_mul(&t2, &TT1.t, &out->precomputation.t);
+        if (!fp2_is_equal(&t1, &t2))
+            return 0;
+        fp2_mul(&t1, &TT2.x, &out->precomputation.x);
+        fp2_mul(&t2, &TT2.z, &out->precomputation.z);
+        if (!fp2_is_equal(&t1, &t2))
+            return 0;
+        fp2_mul(&t1, &TT2.y, &out->precomputation.y);
+        fp2_mul(&t2, &TT2.t, &out->precomputation.t);
+        if (!fp2_is_equal(&t1, &t2))
+            return 0;
+    }
+
+    if (hadamard_bool_2) {
+        hadamard(&out->codomain.null_point, &out->codomain.null_point);
+    }
+    return 1;
+}
+
+/**
+ * @brief Compute a (2,2) isogeny when only the 4 torsion above the kernel is known and not the 8
+ * torsion
+ *
+ * @param out Output: the theta_isogeny
+ * @param A the theta structure of the domain
+ * @param T1_4 a point in A[4]
+ * @param T2_4 a point in A[4] (only stored in out, not used in the computation below)
+ * @param hadamard_bool_1 a boolean
+ * @param hadamard_bool_2 a boolean
+ *
+ * out : A -> B of kernel [2](T1_4,T2_4)
+ * hadamard_bool_1 controls if the domain is in standard or dual coordinates
+ * hadamard_bool_2 controls if the codomain is in standard or dual coordinates
+ *
+ */
+static void
+theta_isogeny_compute_4(theta_isogeny_t *out,
+                        const theta_structure_t *A,
+                        const theta_point_t *T1_4,
+                        const theta_point_t *T2_4,
+                        bool hadamard_bool_1,
+                        bool hadamard_bool_2)
+{
+    out->hadamard_bool_1 = hadamard_bool_1;
+    out->hadamard_bool_2 = hadamard_bool_2;
+    out->domain = *A;
+    out->T1_8 = *T1_4;
+    out->T2_8 = *T2_4;
+    out->codomain.precomputation = false;
+
+    theta_point_t TT1, TT2;
+    // we will compute:
+    // TT1 = (xAB, _ , xCD, _)
+    // TT2 = (AA,BB,CC,DD)
+
+    // Note: the 4-torsion points were stored in the T1_8/T2_8 fields of out above
+
+    if (hadamard_bool_1) {
+        hadamard(&TT1, T1_4);
+        to_squared_theta(&TT1, &TT1);
+
+        hadamard(&TT2, &A->null_point);
+        to_squared_theta(&TT2, &TT2);
+    } else {
+        to_squared_theta(&TT1, T1_4);
+        to_squared_theta(&TT2, &A->null_point);
+    }
+
+    fp2_t sqaabb, sqaacc;
+    fp2_mul(&sqaabb, &TT2.x, &TT2.y);
+    fp2_mul(&sqaacc, &TT2.x, &TT2.z);
+    // No need to check the square roots, only used for signing.
+    // sqaabb = sqrt(AA*BB)
+    fp2_sqrt(&sqaabb);
+    // sqaacc = sqrt(AA*CC)
+    fp2_sqrt(&sqaacc);
+
+    // out->codomain.null_point = (xAB*AA*sqaacc, xAB*sqaabb*sqaacc, xAB*AA*CC, xCD*sqaabb*AA)
+    // out->precomputation = (xAB*BB*CC*DD, xAB*sqaabb*CC*DD, xAB*sqaacc*BB*DD,
+    //                        xCD*sqaabb*sqaacc*BB)
+
+    fp2_mul(&out->codomain.null_point.y, &sqaabb, &sqaacc);
+    fp2_mul(&out->precomputation.t, &out->codomain.null_point.y, &TT1.z);
+    fp2_mul(&out->codomain.null_point.y, &out->codomain.null_point.y,
+            &TT1.x); // done for out->codomain.null_point.y
+
+    fp2_mul(&out->codomain.null_point.t, &TT1.z, &sqaabb);
+    fp2_mul(&out->codomain.null_point.t, &out->codomain.null_point.t,
+            &TT2.x); // done for out->codomain.null_point.t
+
+    fp2_mul(&out->codomain.null_point.x, &TT1.x, &TT2.x);
+    fp2_mul(&out->codomain.null_point.z, &out->codomain.null_point.x,
+            &TT2.z); // done for out->codomain.null_point.z
+    fp2_mul(&out->codomain.null_point.x, &out->codomain.null_point.x,
+            &sqaacc); // done for out->codomain.null_point.x
+
+    fp2_mul(&out->precomputation.x, &TT1.x, &TT2.t);
+    fp2_mul(&out->precomputation.z, &out->precomputation.x, &TT2.y);
+    fp2_mul(&out->precomputation.x, &out->precomputation.x, &TT2.z);
+    fp2_mul(&out->precomputation.y, &out->precomputation.x, &sqaabb); // done for out->precomputation.y
+    fp2_mul(&out->precomputation.x, &out->precomputation.x, &TT2.y);  // done for out->precomputation.x
+    fp2_mul(&out->precomputation.z, &out->precomputation.z, &sqaacc); // done for out->precomputation.z
+    fp2_mul(&out->precomputation.t, &out->precomputation.t, &TT2.y);  // done for out->precomputation.t
+
+    if (hadamard_bool_2) {
+        hadamard(&out->codomain.null_point, &out->codomain.null_point);
+    }
+}
+
+/**
+ * @brief Compute a (2,2) isogeny when only the kernel is known and not the 8 or 4 torsion above
+ *
+ * @param out Output: the theta_isogeny
+ * @param A the theta structure of the domain
+ * @param T1_2 a point in A[2] (only stored in out, not used in the computation below)
+ * @param T2_2 a point in A[2] (only stored in out, not used in the computation below)
+ * @param hadamard_bool_1 a boolean
+ * @param hadamard_bool_2 a boolean
+ *
+ * out : A -> B of kernel (T1_2,T2_2)
+ * hadamard_bool_1 controls if the domain is in standard or dual coordinates
+ * hadamard_bool_2 controls if the codomain is in standard or dual coordinates
+ *
+ */
+static void
+theta_isogeny_compute_2(theta_isogeny_t *out,
+                        const theta_structure_t *A,
+                        const theta_point_t *T1_2,
+                        const theta_point_t *T2_2,
+                        bool hadamard_bool_1,
+                        bool hadamard_bool_2)
+{
+    out->hadamard_bool_1 = hadamard_bool_1;
+    out->hadamard_bool_2 = hadamard_bool_2;
+    out->domain = *A;
+    out->T1_8 = *T1_2;
+    out->T2_8 = *T2_2;
+    out->codomain.precomputation = false;
+
+    theta_point_t TT2;
+    // we will compute:
+    // TT2 = (AA,BB,CC,DD)
+
+    if (hadamard_bool_1) {
+        hadamard(&TT2, &A->null_point);
+        to_squared_theta(&TT2, &TT2);
+    } else {
+        to_squared_theta(&TT2, &A->null_point);
+    }
+
+    // we compute out->codomain.null_point = (AA,sqaabb, sqaacc, sqaadd), sqaaxx = sqrt(AA*XX)
+    // out->precomputation = (  BB * CC *DD , sqaabb * CC * DD , sqaacc * BB* DD , sqaadd * BB * CC)
+    fp2_copy(&out->codomain.null_point.x, &TT2.x);
+    fp2_mul(&out->codomain.null_point.y, &TT2.x, &TT2.y);
+    fp2_mul(&out->codomain.null_point.z, &TT2.x, &TT2.z);
+    fp2_mul(&out->codomain.null_point.t, &TT2.x, &TT2.t);
+    // No need to check the square roots, only used for signing.
+    fp2_sqrt(&out->codomain.null_point.y);
+    fp2_sqrt(&out->codomain.null_point.z);
+    fp2_sqrt(&out->codomain.null_point.t);
+
+    fp2_mul(&out->precomputation.x, &TT2.z, &TT2.t);
+    fp2_mul(&out->precomputation.y,
+            &out->precomputation.x,
+            &out->codomain.null_point.y);                            // done for out->precomputation.y
+    fp2_mul(&out->precomputation.x, &out->precomputation.x, &TT2.y); // done for out->precomputation.x
+    fp2_mul(&out->precomputation.z, &TT2.t, &out->codomain.null_point.z);
+    fp2_mul(&out->precomputation.z, &out->precomputation.z, &TT2.y); // done for out->precomputation.z
+    fp2_mul(&out->precomputation.t, &TT2.z, &out->codomain.null_point.t);
+    fp2_mul(&out->precomputation.t, &out->precomputation.t, &TT2.y); // done for out->precomputation.t
+
+    if (hadamard_bool_2) {
+        hadamard(&out->codomain.null_point, &out->codomain.null_point);
+    }
+}
+
+// Evaluate the (2,2)-isogeny phi at the theta point P and store the image in out.
+static void
+theta_isogeny_eval(theta_point_t *out, const theta_isogeny_t *phi, const theta_point_t *P)
+{
+    const theta_point_t *src = P;
+    if (phi->hadamard_bool_1) {
+        hadamard(out, P);
+        src = out;
+    }
+    to_squared_theta(out, src);
+    // scale each coordinate by the matching constant stored with the isogeny
+    fp2_mul(&out->x, &out->x, &phi->precomputation.x);
+    fp2_mul(&out->y, &out->y, &phi->precomputation.y);
+    fp2_mul(&out->z, &out->z, &phi->precomputation.z);
+    fp2_mul(&out->t, &out->t, &phi->precomputation.t);
+    if (phi->hadamard_bool_2)
+        hadamard(out, out);
+}
+
+#if defined(ENABLE_SIGN)
+// Sample a random secret index in [0, 5] to select one of the 6 normalisation
+// matrices for the normalisation of the output of the (2,2)-chain during
+// splitting
+static unsigned char
+sample_random_index(void)
+{
+    // Rejection sampling: 4294967292 is the largest multiple of 6 below 2^32,
+    // so reducing a seed in [0, 4294967292-1] modulo 6 is unbiased.
+    // We have 4294967292/2^32 = ~99.9999999% chance that the first try is "good".
+    // Bytes are widened to uint32_t before shifting: as int, (byte << 24) may overflow.
+    unsigned char seed_arr[4];
+    uint32_t seed;
+
+    do {
+        randombytes(seed_arr, 4);
+        seed = (uint32_t)seed_arr[0] | ((uint32_t)seed_arr[1] << 8) | ((uint32_t)seed_arr[2] << 16) | ((uint32_t)seed_arr[3] << 24);
+    } while (seed >= 4294967292U);
+
+    uint32_t secret_index = seed - (((uint64_t)seed * 2863311531U) >> 34) * 6; // 2863311531 = ceil(2^33/3)
+    assert(secret_index == seed % 6); // ensure the constant time trick above works
+    return (unsigned char)secret_index;
+}
+#endif
+
+// Select the splitting basis change out->M for the theta structure A and write the
+// transformed null point to out->B. Returns true iff exactly one even index vanishes;
+// returns false early, without writing out->B, if zero_index != -1 is not a zero.
+static bool
+splitting_compute(theta_splitting_t *out, const theta_structure_t *A, int zero_index, bool randomize)
+{
+    uint32_t ctl;
+    uint32_t count = 0;
+    fp2_t U_cst, t1, t2;
+
+    memset(&out->M, 0, sizeof(basis_change_matrix_t));
+
+    // enumerate through all indices
+    for (int i = 0; i < 10; i++) {
+        fp2_set_zero(&U_cst);
+        for (int t = 0; t < 4; t++) {
+            // Product t1 * t2 of the null point coordinates at t and t ^ EVEN_INDEX[i][1]
+            choose_index_theta_point(&t2, t, &A->null_point);
+            choose_index_theta_point(&t1, t ^ EVEN_INDEX[i][1], &A->null_point);
+            fp2_mul(&t1, &t1, &t2);
+
+            // If CHI_EVAL(i,t) is +1 we want ctl to be 0 and
+            // If CHI_EVAL(i,t) is -1 we want ctl to be 0xFF..FF
+            ctl = (uint32_t)(CHI_EVAL[EVEN_INDEX[i][0]][t] >> 1);
+            assert(ctl == 0 || ctl == 0xffffffff);
+
+            fp2_neg(&t2, &t1);
+            fp2_select(&t1, &t1, &t2, ctl);
+
+            // Then we compute U_cst ± (t1 * t2)
+            fp2_add(&U_cst, &U_cst, &t1);
+        }
+
+        // If U_cst is 0 then update the splitting matrix (ctl is an all-ones mask in that case)
+        ctl = fp2_is_zero(&U_cst);
+        count -= ctl;
+        select_base_change_matrix(&out->M, &out->M, &SPLITTING_TRANSFORMS[i], ctl);
+        // extra checks if we know exactly where the 0 index should be
+        if (zero_index != -1 && i == zero_index && !ctl)
+            return false;
+    }
+
+#if defined(ENABLE_SIGN)
+    // Pick a random normalization matrix
+    if (randomize) {
+        unsigned char secret_index = sample_random_index();
+        basis_change_matrix_t Mrandom;
+
+        set_base_change_matrix_from_precomp(&Mrandom, &NORMALIZATION_TRANSFORMS[0]);
+
+        // Use a constant time selection to pick the index we want
+        for (unsigned char i = 1; i < 6; i++) {
+            // When i == secret_index, mask == 0 and 0xFF..FF otherwise (unsigned arithmetic only)
+            const uint32_t diff = (uint32_t)i ^ (uint32_t)secret_index;
+            const uint32_t mask = (uint32_t)0 - ((diff | ((uint32_t)0 - diff)) >> 31);
+            select_base_change_matrix(&Mrandom, &Mrandom, &NORMALIZATION_TRANSFORMS[i], ~mask);
+        }
+        base_change_matrix_multiplication(&out->M, &Mrandom, &out->M);
+    }
+#else
+    (void)randomize; // otherwise unused when the assert below is compiled out
+    assert(!randomize);
+#endif
+
+    // apply the isomorphism to ensure the null point is compatible with splitting
+    apply_isomorphism(&out->B.null_point, &out->M, &A->null_point);
+
+    // splitting was successful only if exactly one zero was identified
+    return count == 1;
+}
+
+// Recover the Montgomery coefficients (A : C) of the two factors E1 and E2 of an
+// elliptic product from the theta null point of A.
+// Returns 1 on success, 0 if the null point is not that of a valid product.
+static int
+theta_product_structure_to_elliptic_product(theta_couple_curve_t *E12, theta_structure_t *A)
+{
+    const theta_point_t *null_pt = &A->null_point;
+    fp2_t x4, y4, z4;
+
+    // splitting_compute should already guarantee a product null point,
+    // it is checked once more here for sanity
+    if (!is_product_theta_point(null_pt))
+        return 0;
+
+    ec_curve_init(&(E12->E1));
+    ec_curve_init(&(E12->E2));
+
+    // A valid elliptic theta null point has no zero coordinate
+    if (fp2_is_zero(&null_pt->x) | fp2_is_zero(&null_pt->y) | fp2_is_zero(&null_pt->z))
+        return 0;
+
+    // x4 = x^4, y4 = y^4, z4 = z^4
+    fp2_sqr(&x4, &null_pt->x);
+    fp2_sqr(&x4, &x4);
+    fp2_sqr(&y4, &null_pt->y);
+    fp2_sqr(&y4, &y4);
+    fp2_sqr(&z4, &null_pt->z);
+    fp2_sqr(&z4, &z4);
+
+    // A2 = -2(x^4+y^4)/(x^4-y^4), kept projectively as (E2.A : E2.C)
+    fp2_sub(&E12->E2.C, &x4, &y4);
+    fp2_add(&E12->E2.A, &x4, &y4);
+    fp2_add(&E12->E2.A, &E12->E2.A, &E12->E2.A);
+    fp2_neg(&E12->E2.A, &E12->E2.A);
+
+    // A1 = -2(x^4+z^4)/(x^4-z^4), kept projectively as (E1.A : E1.C)
+    fp2_sub(&E12->E1.C, &x4, &z4);
+    fp2_add(&E12->E1.A, &x4, &z4);
+    fp2_add(&E12->E1.A, &E12->E1.A, &E12->E1.A);
+    fp2_neg(&E12->E1.A, &E12->E1.A);
+
+    // a vanishing denominator is rejected
+    if (fp2_is_zero(&E12->E1.C) | fp2_is_zero(&E12->E2.C))
+        return 0;
+
+    return 1;
+}
+
+// Map a theta point P on the product structure A to a couple point (P1, P2)
+// in (X : Z) coordinates.
+// Returns 0 if P is not a product theta point or is (0:0:0:0), 1 otherwise.
+static int
+theta_point_to_montgomery_point(theta_couple_point_t *P12, const theta_point_t *P, const theta_structure_t *A)
+{
+    if (!is_product_theta_point(P))
+        return 0;
+
+    // Second factor: use the pair (x, y), or (z, t) when both x and y vanish
+    const uint32_t xy_zero = fp2_is_zero(&P->x) & fp2_is_zero(&P->y);
+    const fp2_t *num = xy_zero ? &P->z : &P->x;
+    const fp2_t *den = xy_zero ? &P->t : &P->y;
+    if (fp2_is_zero(num) & fp2_is_zero(den))
+        return 0; // at this point P=(0:0:0:0) so is invalid
+
+    fp2_t cross;
+
+    // P2.X = A.null_point.y * num + A.null_point.x * den
+    // P2.Z = - A.null_point.y * num + A.null_point.x * den
+    fp2_mul(&P12->P2.x, &A->null_point.y, num);
+    fp2_mul(&cross, &A->null_point.x, den);
+    fp2_sub(&P12->P2.z, &cross, &P12->P2.x);
+    fp2_add(&P12->P2.x, &P12->P2.x, &cross);
+
+    // First factor: use the pair (x, z), or (y, t) when both x and z vanish
+    const uint32_t xz_zero = fp2_is_zero(&P->x) & fp2_is_zero(&P->z);
+    num = xz_zero ? &P->y : &P->x;
+    den = xz_zero ? &P->t : &P->z;
+
+    // P1.X = A.null_point.z * num + A.null_point.x * den
+    // P1.Z = -A.null_point.z * num + A.null_point.x * den
+    fp2_mul(&P12->P1.x, &A->null_point.z, num);
+    fp2_mul(&cross, &A->null_point.x, den);
+    fp2_sub(&P12->P1.z, &cross, &P12->P1.x);
+    fp2_add(&P12->P1.x, &P12->P1.x, &cross);
+
+    return 1;
+}
+
+static int
+_theta_chain_compute_impl(unsigned n,
+                          theta_couple_curve_t *E12,
+                          const theta_kernel_couple_points_t *ker,
+                          bool extra_torsion,
+                          theta_couple_curve_t *E34,
+                          theta_couple_point_t *P12,
+                          size_t numP,
+                          bool verify,
+                          bool randomize)
+{
+    theta_structure_t theta;
+
+    // lift the basis
+    theta_couple_jac_point_t xyT1, xyT2;
+
+    ec_basis_t bas1 = { .P = ker->T1.P1, .Q = ker->T2.P1, .PmQ = ker->T1m2.P1 };
+    ec_basis_t bas2 = { .P = ker->T1.P2, .Q = ker->T2.P2, .PmQ = ker->T1m2.P2 };
+    if (!lift_basis(&xyT1.P1, &xyT2.P1, &bas1, &E12->E1))
+        return 0;
+    if (!lift_basis(&xyT1.P2, &xyT2.P2, &bas2, &E12->E2))
+        return 0;
+
+    const unsigned extra = HD_extra_torsion * extra_torsion;
+
+#ifndef NDEBUG
+    assert(extra == 0 || extra == 2); // only cases implemented
+    if (!test_point_order_twof(&bas2.P, &E12->E2, n + extra))
+        debug_print("bas2.P does not have correct order");
+
+    if (!test_jac_order_twof(&xyT2.P2, &E12->E2, n + extra))
+        debug_print("xyT2.P2 does not have correct order");
+#endif
+
+    theta_point_t pts[numP ? numP : 1];
+
+    int space = 1;
+    for (unsigned i = 1; i < n; i *= 2)
+        ++space;
+
+    uint16_t todo[space];
+    todo[0] = n - 2 + extra;
+
+    int current = 0;
+
+    // kernel points for the gluing isogeny
+    theta_couple_jac_point_t jacQ1[space], jacQ2[space];
+    jacQ1[0] = xyT1;
+    jacQ2[0] = xyT2;
+    while (todo[current] != 1) {
+        assert(todo[current] >= 2);
+        ++current;
+        assert(current < space);
+        // the gluing isogeny is quite a bit more expensive than the others,
+        // so we adjust the usual splitting rule here a little bit: towards
+        // the end of the doubling chain it will be cheaper to recompute the
+        // doublings after evaluation than to push the intermediate points.
+        const unsigned num_dbls = todo[current - 1] >= 16 ? todo[current - 1] / 2 : todo[current - 1] - 1;
+        assert(num_dbls && num_dbls < todo[current - 1]);
+        double_couple_jac_point_iter(&jacQ1[current], num_dbls, &jacQ1[current - 1], E12);
+        double_couple_jac_point_iter(&jacQ2[current], num_dbls, &jacQ2[current - 1], E12);
+        todo[current] = todo[current - 1] - num_dbls;
+    }
+
+    // kernel points for the remaining isogeny steps
+    theta_point_t thetaQ1[space], thetaQ2[space];
+
+    // the gluing step
+    theta_gluing_t first_step;
+    {
+        assert(todo[current] == 1);
+
+        // compute the gluing isogeny
+        if (!gluing_compute(&first_step, E12, &jacQ1[current], &jacQ2[current], verify))
+            return 0;
+
+        // evaluate
+        for (unsigned j = 0; j < numP; ++j) {
+            assert(ec_is_zero(&P12[j].P1) || ec_is_zero(&P12[j].P2));
+            if (!gluing_eval_point_special_case(&pts[j], &P12[j], &first_step))
+                return 0;
+        }
+
+        // push kernel points through gluing isogeny
+        for (int j = 0; j < current; ++j) {
+            gluing_eval_basis(&thetaQ1[j], &thetaQ2[j], &jacQ1[j], &jacQ2[j], &first_step);
+            --todo[j];
+        }
+
+        --current;
+    }
+
+    // set-up the theta_structure for the first codomain
+    theta.null_point = first_step.codomain;
+    theta.precomputation = 0;
+    theta_precomputation(&theta);
+
+    theta_isogeny_t step;
+
+    // and now we do the remaining steps
+    for (unsigned i = 1; current >= 0 && todo[current]; ++i) {
+        assert(current < space);
+        while (todo[current] != 1) {
+            assert(todo[current] >= 2);
+            ++current;
+            assert(current < space);
+            const unsigned num_dbls = todo[current - 1] / 2;
+            assert(num_dbls && num_dbls < todo[current - 1]);
+            double_iter(&thetaQ1[current], &theta, &thetaQ1[current - 1], num_dbls);
+            double_iter(&thetaQ2[current], &theta, &thetaQ2[current - 1], num_dbls);
+            todo[current] = todo[current - 1] - num_dbls;
+        }
+
+        // computing the next step
+        int ret;
+        if (i == n - 2) // penultimate step
+            ret = theta_isogeny_compute(&step, &theta, &thetaQ1[current], &thetaQ2[current], 0, 0, verify);
+        else if (i == n - 1) // ultimate step
+            ret = theta_isogeny_compute(&step, &theta, &thetaQ1[current], &thetaQ2[current], 1, 0, false);
+        else
+            ret = theta_isogeny_compute(&step, &theta, &thetaQ1[current], &thetaQ2[current], 0, 1, verify);
+        if (!ret)
+            return 0;
+
+        for (unsigned j = 0; j < numP; ++j)
+            theta_isogeny_eval(&pts[j], &step, &pts[j]);
+
+        // updating the codomain
+        theta = step.codomain;
+
+        // pushing the kernel
+        assert(todo[current] == 1);
+        for (int j = 0; j < current; ++j) {
+            theta_isogeny_eval(&thetaQ1[j], &step, &thetaQ1[j]);
+            theta_isogeny_eval(&thetaQ2[j], &step, &thetaQ2[j]);
+            assert(todo[j]);
+            --todo[j];
+        }
+
+        --current;
+    }
+
+    assert(current == -1);
+
+    if (!extra_torsion) {
+        if (n >= 3) {
+            // in the last step we've skipped pushing the kernel since current was == 0, let's do it now
+            theta_isogeny_eval(&thetaQ1[0], &step, &thetaQ1[0]);
+            theta_isogeny_eval(&thetaQ2[0], &step, &thetaQ2[0]);
+        }
+
+        // penultimate step
+        theta_isogeny_compute_4(&step, &theta, &thetaQ1[0], &thetaQ2[0], 0, 0);
+        for (unsigned j = 0; j < numP; ++j)
+            theta_isogeny_eval(&pts[j], &step, &pts[j]);
+        theta = step.codomain;
+        theta_isogeny_eval(&thetaQ1[0], &step, &thetaQ1[0]);
+        theta_isogeny_eval(&thetaQ2[0], &step, &thetaQ2[0]);
+
+        // ultimate step
+        theta_isogeny_compute_2(&step, &theta, &thetaQ1[0], &thetaQ2[0], 1, 0);
+        for (unsigned j = 0; j < numP; ++j)
+            theta_isogeny_eval(&pts[j], &step, &pts[j]);
+        theta = step.codomain;
+    }
+
+    // final splitting step
+    theta_splitting_t last_step;
+
+    bool is_split = splitting_compute(&last_step, &theta, extra_torsion ? 8 : -1, randomize);
+
+    if (!is_split) {
+        debug_print("kernel did not generate an isogeny between elliptic products");
+        return 0;
+    }
+
+    if (!theta_product_structure_to_elliptic_product(E34, &last_step.B))
+        return 0;
+
+    // evaluate
+    for (size_t j = 0; j < numP; ++j) {
+        apply_isomorphism(&pts[j], &last_step.M, &pts[j]);
+        if (!theta_point_to_montgomery_point(&P12[j], &pts[j], &last_step.B))
+            return 0;
+    }
+
+    return 1;
+}
+
+// Plain variant of the chain computation: both trailing flags of the shared implementation are off
+int
+theta_chain_compute_and_eval(unsigned n, /*const*/ theta_couple_curve_t *E12,
+                             const theta_kernel_couple_points_t *ker, bool extra_torsion,
+                             theta_couple_curve_t *E34, theta_couple_point_t *P12, size_t numP)
+{
+    const bool verify = false;
+    const bool randomize = false;
+
+    return _theta_chain_compute_impl(n, E12, ker, extra_torsion, E34, P12, numP, verify, randomize);
+}
+
+// Like theta_chain_compute_and_eval, adding extra verification checks;
+// used in the signature verification
+int
+theta_chain_compute_and_eval_verify(unsigned n, /*const*/ theta_couple_curve_t *E12,
+                                    const theta_kernel_couple_points_t *ker, bool extra_torsion,
+                                    theta_couple_curve_t *E34, theta_couple_point_t *P12,
+                                    size_t numP)
+{
+    const bool verify = true; // the only difference to theta_chain_compute_and_eval
+    const bool randomize = false;
+
+    return _theta_chain_compute_impl(n, E12, ker, extra_torsion, E34, P12, numP, verify, randomize);
+}
+
+// Like theta_chain_compute_and_eval, but with the randomize flag of the shared implementation set
+int
+theta_chain_compute_and_eval_randomized(unsigned n, /*const*/ theta_couple_curve_t *E12,
+                                        const theta_kernel_couple_points_t *ker, bool extra_torsion,
+                                        theta_couple_curve_t *E34, theta_couple_point_t *P12, size_t numP)
+{
+    const bool verify = false;
+    const bool randomize = true;
+
+    return _theta_chain_compute_impl(n, E12, ker, extra_torsion, E34, P12, numP, verify, randomize);
+}
diff --git a/src/pqm4/sqisign_lvl3/ref/theta_isogenies.h b/src/pqm4/sqisign_lvl3/ref/theta_isogenies.h
new file mode 100644
index 0000000..d151811
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/theta_isogenies.h
@@ -0,0 +1,18 @@
+/** @file
+ *
+ * @authors Antonin Leroux
+ *
+ * @brief the theta isogeny header
+ */
+
+#ifndef THETA_ISOGENY_H
+#define THETA_ISOGENY_H
+
+#include <sqisign_namespace.h>
+#include <ec.h>
+#include <fp2.h>
+#include "theta_structure.h"
+#include <hd.h>
+#include <hd_splitting_transforms.h>
+
+#endif
diff --git a/src/pqm4/sqisign_lvl3/ref/theta_structure.c b/src/pqm4/sqisign_lvl3/ref/theta_structure.c
new file mode 100644
index 0000000..ce97ac6
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/theta_structure.c
@@ -0,0 +1,78 @@
+#include "theta_structure.h"
+#include <assert.h>
+
+void
+theta_precomputation(theta_structure_t *A)
+{
+    // nothing to do when the products are already cached
+    if (A->precomputation)
+        return;
+
+    fp2_t pair_xy, pair_zt;
+    theta_point_t dual_null;
+    // triple products of the coordinates of to_squared_theta(null_point)
+    to_squared_theta(&dual_null, &A->null_point);
+    fp2_mul(&pair_xy, &dual_null.x, &dual_null.y);
+    fp2_mul(&pair_zt, &dual_null.z, &dual_null.t);
+    fp2_mul(&A->XYZ0, &pair_xy, &dual_null.z);
+    fp2_mul(&A->XYT0, &pair_xy, &dual_null.t);
+    fp2_mul(&A->YZT0, &pair_zt, &dual_null.y);
+    fp2_mul(&A->XZT0, &pair_zt, &dual_null.x);
+
+    // the same triple products for the coordinates of the null point itself
+    fp2_mul(&pair_xy, &A->null_point.x, &A->null_point.y);
+    fp2_mul(&pair_zt, &A->null_point.z, &A->null_point.t);
+    fp2_mul(&A->xyz0, &pair_xy, &A->null_point.z);
+    fp2_mul(&A->xyt0, &pair_xy, &A->null_point.t);
+    fp2_mul(&A->yzt0, &pair_zt, &A->null_point.y);
+    fp2_mul(&A->xzt0, &pair_zt, &A->null_point.x);
+
+    A->precomputation = true;
+}
+
+void
+double_point(theta_point_t *out, theta_structure_t *A, const theta_point_t *in)
+{
+    // squared-theta transform of the input, followed by a coordinate-wise square
+    to_squared_theta(out, in);
+    pointwise_square(out, out);
+
+    // the cached products of A are computed on demand
+    if (!A->precomputation)
+        theta_precomputation(A);
+
+    // scale by the products cached from to_squared_theta(null_point)
+    fp2_mul(&out->x, &out->x, &A->YZT0);
+    fp2_mul(&out->y, &out->y, &A->XZT0);
+    fp2_mul(&out->z, &out->z, &A->XYT0);
+    fp2_mul(&out->t, &out->t, &A->XYZ0);
+
+    hadamard(out, out);
+    // scale by the products cached from the null point itself
+    fp2_mul(&out->x, &out->x, &A->yzt0);
+    fp2_mul(&out->y, &out->y, &A->xzt0);
+    fp2_mul(&out->z, &out->z, &A->xyt0);
+    fp2_mul(&out->t, &out->t, &A->xyz0);
+}
+
+void
+double_iter(theta_point_t *out, theta_structure_t *A, const theta_point_t *in, int exp)
+{
+    if (exp == 0) {
+        *out = *in; // no doubling requested: plain copy
+        return;
+    }
+    // the first doubling reads in, all later ones update out in place
+    double_point(out, A, in);
+    for (int done = 1; done < exp; ++done)
+        double_point(out, A, out);
+}
+
+uint32_t
+is_product_theta_point(const theta_point_t *P)
+{
+    fp2_t prod_xt, prod_yz;
+    fp2_mul(&prod_xt, &P->x, &P->t);
+    fp2_mul(&prod_yz, &P->y, &P->z);
+    return fp2_is_equal(&prod_xt, &prod_yz); // product criterion: x*t == y*z
+}
diff --git a/src/pqm4/sqisign_lvl3/ref/theta_structure.h b/src/pqm4/sqisign_lvl3/ref/theta_structure.h
new file mode 100644
index 0000000..fc630b7
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/theta_structure.h
@@ -0,0 +1,135 @@
+/** @file
+ *
+ * @authors Antonin Leroux
+ *
+ * @brief the theta structure header
+ */
+
+#ifndef THETA_STRUCTURE_H
+#define THETA_STRUCTURE_H
+
+#include <ec.h>
+#include <fp2.h>
+#include <hd.h>
+
+/** @internal
+ * @ingroup hd_module
+ * @defgroup hd_theta Functions for theta structures
+ * @{
+ */
+
+/**
+ * @brief Perform the hadamard transform on a theta point
+ *
+ * @param out Output: the theta_point
+ * @param in a theta point*
+ * in = (x,y,z,t)
+ * out = (x+y+z+t, x-y+z-t, x+y-z-t, x-y-z+t)
+ *
+ */
+static inline void
+hadamard(theta_point_t *out, const theta_point_t *in)
+{
+    fp2_t sum_xy, diff_xy, sum_zt, diff_zt;
+
+    // First stage: sums and differences of the pairs (x, y) and (z, t).
+    // Every read of in happens here, before out is written, so the
+    // transform also works when out and in are the same point.
+    fp2_add(&sum_xy, &in->x, &in->y);
+    fp2_sub(&diff_xy, &in->x, &in->y);
+    fp2_add(&sum_zt, &in->z, &in->t);
+    fp2_sub(&diff_zt, &in->z, &in->t);
+
+    // Second stage: combine the partial results
+    fp2_add(&out->x, &sum_xy, &sum_zt);   // x + y + z + t
+    fp2_add(&out->y, &diff_xy, &diff_zt); // x - y + z - t
+    fp2_sub(&out->z, &sum_xy, &sum_zt);   // x + y - z - t
+    fp2_sub(&out->t, &diff_xy, &diff_zt); // x - y - z + t
+}
+
+/**
+ * @brief Square the coordinates of a theta point
+ * @param out Output: the theta_point
+ * @param in a theta point*
+ * in = (x,y,z,t)
+ * out = (x^2, y^2, z^2, t^2)
+ *
+ */
+static inline void
+pointwise_square(theta_point_t *out, const theta_point_t *in)
+{
+    const fp2_t *src[4] = { &in->x, &in->y, &in->z, &in->t };
+    fp2_t *dst[4] = { &out->x, &out->y, &out->z, &out->t };
+    for (int k = 0; k < 4; ++k)
+        fp2_sqr(dst[k], src[k]); // coordinate-wise square, in the order x, y, z, t
+}
+
+/**
+ * @brief Square the coordinates and then perform the hadamard transform
+ *
+ * @param out Output: the theta_point
+ * @param in a theta point*
+ * in = (x,y,z,t)
+ * out = (x^2+y^2+z^2+t^2, x^2-y^2+z^2-t^2, x^2+y^2-z^2-t^2, x^2-y^2-z^2+t^2)
+ *
+ */
+static inline void
+to_squared_theta(theta_point_t *out, const theta_point_t *in)
+{
+    pointwise_square(out, in); // out = (x^2, y^2, z^2, t^2)
+    hadamard(out, out);        // Hadamard transform of the squares, applied in place on out
+}
+
+/**
+ * @brief Perform the theta structure precomputation
+ *
+ * @param A Output: the theta_structure
+ *
+ * if A.null_point = (x,y,z,t)
+ * if (xx,yy,zz,tt) = to_squared_theta(A.null_point)
+ * Computes XYZ0,XYT0,YZT0,XZT0 = xx*yy*zz, xx*yy*tt, yy*zz*tt, xx*zz*tt and xyz0,xyt0,yzt0,xzt0 = x*y*z, x*y*t, y*z*t, x*z*t
+ *
+ */
+void theta_precomputation(theta_structure_t *A);
+
+/**
+ * @brief Compute the double of the theta point in on the theta struc A
+ *
+ * @param out Output: the theta_point
+ * @param A a theta structure
+ * @param in a theta point in the theta structure A
+ * in = (x,y,z,t)
+ * out = [2] (x,y,z,t)
+ * /!\ assumes that no coordinate is zero; the precomputation of A is run first if it has not been done yet
+ *
+ */
+void double_point(theta_point_t *out, theta_structure_t *A, const theta_point_t *in);
+
+/**
+ * @brief Compute the iterated double of the theta point in on the theta struc A
+ *
+ * @param out Output: the theta_point
+ * @param A a theta structure
+ * @param in a theta point in the theta structure A
+ * @param exp the exponent
+ * in = (x,y,z,t)
+ * out = [2^exp] (x,y,z,t)
+ * /!\ assumes that no coordinate is zero; the precomputation of A is run on demand by double_point
+ *
+ */
+void double_iter(theta_point_t *out, theta_structure_t *A, const theta_point_t *in, int exp);
+
+/**
+ * @brief Check if a theta point is a product theta point
+ *
+ * @param P a theta point
+ * @return 0xFFFFFFFF if true, zero otherwise
+ */
+uint32_t is_product_theta_point(const theta_point_t *P);
+
+// end hd_theta
+/**
+ * @}
+ */
+
+#endif
diff --git a/src/pqm4/sqisign_lvl3/ref/tools.h b/src/pqm4/sqisign_lvl3/ref/tools.h
new file mode 100644
index 0000000..5a6a505
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/tools.h
@@ -0,0 +1,49 @@
+
+#ifndef TOOLS_H
+#define TOOLS_H
+
+#include <time.h>
+
+// Debug printing (active unless NDEBUG is defined; the user must provide printf via <stdio.h>):
+// https://stackoverflow.com/questions/1644868/define-macro-for-debug-printing-in-c
+#ifndef NDEBUG
+#define DEBUG_PRINT 1
+#else
+#define DEBUG_PRINT 0
+#endif
+
+#ifndef __FILE_NAME__
+#define __FILE_NAME__ "NA"
+#endif
+
+#ifndef __LINE__
+#define __LINE__ 0
+#endif
+
+#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L // since C99 __func__ is a predefined identifier, not a macro
+#define __func__ "NA"
+#endif
+
+#define debug_print(fmt)                                                                           \
+    do {                                                                                           \
+        if (DEBUG_PRINT)                                                                           \
+            printf("warning: %s, file %s, line %d, function %s().\n",                              \
+                   fmt,                                                                            \
+                   __FILE_NAME__,                                                                  \
+                   __LINE__,                                                                       \
+                   __func__);                                                                      \
+    } while (0)
+
+
+clock_t tic(void);
+float tac(void);                             /* time in ms since last tic */
+float TAC(const char *str);                  /* same, but prints it with label 'str' */
+float toc(const clock_t t);                  /* time in ms since t */
+float TOC(const clock_t t, const char *str); /* same, but prints it with label 'str' */
+float TOC_clock(const clock_t t, const char *str);
+
+clock_t dclock(const clock_t t); // return the clock cycle diff between now and t
+float clock_to_time(const clock_t t,
+                    const char *str); // convert the number of clock cycles t to time
+float clock_print(const clock_t t, const char *str);
+#endif
diff --git a/src/pqm4/sqisign_lvl3/ref/tutil.h b/src/pqm4/sqisign_lvl3/ref/tutil.h
new file mode 100644
index 0000000..59f1620
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/tutil.h
@@ -0,0 +1,36 @@
+#ifndef TUTIL_H
+#define TUTIL_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__GNUC__) || defined(__clang__) // compiler builtins where available
+#define BSWAP16(i) __builtin_bswap16((i))
+#define BSWAP32(i) __builtin_bswap32((i))
+#define BSWAP64(i) __builtin_bswap64((i))
+#define UNUSED __attribute__((unused))
+#else // portable fallbacks with explicit operand widths; the argument is evaluated more than once
+#define BSWAP16(i) ((uint16_t)((((i) >> 8) & 0xff) | (((i) & 0xff) << 8)))
+#define BSWAP32(i)                                                                                 \
+    ((((uint32_t)(i) >> 24) & 0xff) | (((uint32_t)(i) >> 8) & 0xff00) | (((uint32_t)(i) & 0xff00) << 8) | ((uint32_t)(i) << 24))
+#define BSWAP64(i) (((uint64_t)BSWAP32((uint64_t)(i) >> 32)) | ((uint64_t)BSWAP32(i) << 32))
+#define UNUSED
+#endif
+
+#if defined(RADIX_64)
+#define digit_t uint64_t
+#define sdigit_t int64_t
+#define RADIX 64
+#define LOG2RADIX 6
+#define BSWAP_DIGIT(i) BSWAP64(i)
+#elif defined(RADIX_32)
+#define digit_t uint32_t
+#define sdigit_t int32_t
+#define RADIX 32
+#define LOG2RADIX 5
+#define BSWAP_DIGIT(i) BSWAP32(i)
+#else
+#error "Radix must be 32bit or 64 bit"
+#endif
+
+#endif
diff --git a/src/pqm4/sqisign_lvl3/ref/verification.h b/src/pqm4/sqisign_lvl3/ref/verification.h
new file mode 100644
index 0000000..af67469
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/verification.h
@@ -0,0 +1,123 @@
+/** @file
+ *
+ * @brief The verification protocol
+ */
+
+#ifndef VERIFICATION_H
+#define VERIFICATION_H
+
+#include <sqisign_namespace.h>
+#include <ec.h>
+
+/** @defgroup verification SQIsignHD verification protocol
+ * @{
+ */
+
+/** @defgroup verification_t Types for SQIsignHD verification protocol
+ * @{
+ */
+
+typedef digit_t scalar_t[NWORDS_ORDER];
+typedef scalar_t scalar_mtx_2x2_t[2][2];
+
+/** @brief Type for the signature
+ *
+ * @typedef signature_t
+ *
+ * @struct signature
+ *
+ */
+typedef struct signature
+{
+    fp2_t E_aux_A; // the Montgomery A-coefficient for the auxiliary curve
+    uint8_t backtracking;    // subtracted from TORSION_EVEN_POWER to get the challenge isogeny length in verification
+    uint8_t two_resp_length; // length of the short 2^n-isogeny computed from E_chall in verification (0: none)
+    scalar_mtx_2x2_t mat_Bchall_can_to_B_chall; // the matrix of the desired basis
+    scalar_t chall_coeff;    // challenge scalar: the challenge kernel is P + [chall_coeff]Q
+    uint8_t hint_aux;        // hint used to recompute the canonical basis of the auxiliary curve
+    uint8_t hint_chall;      // hint used to recompute the canonical basis of the challenge curve
+} signature_t;
+
+/** @brief Type for the public keys
+ *
+ * @typedef public_key_t
+ *
+ * @struct public_key
+ *
+ */
+typedef struct public_key
+{
+    ec_curve_t curve; // the normalized A-coefficient of the Montgomery curve
+    uint8_t hint_pk;  // hint used to recompute the canonical basis of curve during verification
+} public_key_t;
+
+/** @}
+ */
+
+/*************************** Functions *****************************/
+
+void public_key_init(public_key_t *pk);
+void public_key_finalize(public_key_t *pk);
+
+void hash_to_challenge(scalar_t *scalar,
+                       const public_key_t *pk,
+                       const ec_curve_t *com_curve,
+                       const unsigned char *message,
+                       size_t length);
+
+/**
+ * @brief Verification
+ *
+ * @param sig signature
+ * @param pk public key
+ * @param m message
+ * @param l size
+ * @returns 1 if the signature verifies, 0 otherwise
+ */
+int protocols_verify(signature_t *sig, const public_key_t *pk, const unsigned char *m, size_t l);
+
+/*************************** Encoding *****************************/
+
+/** @defgroup encoding Encoding and decoding functions
+ * @{
+ */
+
+/**
+ * @brief Encodes a signature as a byte array
+ *
+ * @param enc : Byte array to encode the signature in
+ * @param sig : Signature to encode
+ */
+void signature_to_bytes(unsigned char *enc, const signature_t *sig);
+
+/**
+ * @brief Decodes a signature from a byte array
+ *
+ * @param sig : Structure to decode the signature in
+ * @param enc : Byte array to decode
+ */
+void signature_from_bytes(signature_t *sig, const unsigned char *enc);
+
+/**
+ * @brief Encodes a public key as a byte array
+ *
+ * @param enc : Byte array to encode the public key in
+ * @param pk : Public key to encode
+ */
+unsigned char *public_key_to_bytes(unsigned char *enc, const public_key_t *pk);
+
+/**
+ * @brief Decodes a public key from a byte array
+ *
+ * @param pk : Structure to decode the public key in
+ * @param enc : Byte array to decode
+ */
+const unsigned char *public_key_from_bytes(public_key_t *pk, const unsigned char *enc);
+
+/** @}
+ */
+
+/** @}
+ */
+
+#endif
diff --git a/src/pqm4/sqisign_lvl3/ref/verify.c b/src/pqm4/sqisign_lvl3/ref/verify.c
new file mode 100644
index 0000000..b5f78ad
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/verify.c
@@ -0,0 +1,309 @@
+#include <verification.h>
+#include <mp.h>
+#include <hd.h>
+#include <encoded_sizes.h>
+#include <assert.h>
+
+// Check that the basis change matrix elements are canonical representatives
+// modulo 2^(SQIsign_response_length + HD_extra_torsion - sig->backtracking).
+static int
+check_canonical_basis_change_matrix(const signature_t *sig)
+{
+    // This works as long as all values in sig->mat_Bchall_can_to_B_chall are
+    // positive integers.
+    scalar_t bound = { 1 };
+    int all_canonical = 1;
+
+    // bound = 2^(SQIsign_response_length + HD_extra_torsion - backtracking)
+    multiple_mp_shiftl(bound, SQIsign_response_length + HD_extra_torsion - (int)sig->backtracking, NWORDS_ORDER);
+
+    // every entry has to be strictly smaller than bound; no early exit, all four are compared
+    for (int row = 0; row < 2; row++) {
+        for (int col = 0; col < 2; col++) {
+            if (mp_compare(bound, sig->mat_Bchall_can_to_B_chall[row][col], NWORDS_ORDER) <= 0) {
+                all_canonical = 0;
+            }
+        }
+    }
+
+    return all_canonical;
+}
+
+// Compute the 2^n isogeny from the signature with kernel
+// P + [chall_coeff]Q and store the codomain in E_chall
+static int
+compute_challenge_verify(ec_curve_t *E_chall, const signature_t *sig, const ec_curve_t *Epk, const uint8_t hint_pk)
+{
+    ec_isog_even_t chall_isog;
+    ec_basis_t pk_basis;
+
+    // Set domain and length of the 2^n isogeny: it starts on the public key
+    // curve and its length is TORSION_EVEN_POWER reduced by the
+    // backtracking amount
+    copy_curve(&chall_isog.curve, Epk);
+    chall_isog.length = TORSION_EVEN_POWER - sig->backtracking;
+
+    // Compute the canonical basis of the domain from the supplied hint;
+    // a zero result is treated as failure
+    if (!ec_curve_to_basis_2f_from_hint(&pk_basis, &chall_isog.curve, TORSION_EVEN_POWER, hint_pk))
+        return 0;
+
+    // Recover the exact challenge kernel P + [chall_coeff]Q
+    if (!ec_ladder3pt(
+            &chall_isog.kernel, sig->chall_coeff, &pk_basis.P, &pk_basis.Q, &pk_basis.PmQ, &chall_isog.curve))
+        return 0;
+
+    // Double the kernel until it has the correct order
+    ec_dbl_iter(&chall_isog.kernel, sig->backtracking, &chall_isog.kernel, &chall_isog.curve);
+
+    // Compute the codomain, starting from a copy of the domain curve;
+    // here a non-zero result of ec_eval_even is what signals failure
+    copy_curve(E_chall, &chall_isog.curve);
+    return ec_eval_even(E_chall, &chall_isog, NULL, 0) ? 0 : 1;
+}
+
+// same as matrix_application_even_basis() in id2iso.c, with some modifications:
+// - this version works with a matrix of scalars (not ibz_t).
+// - reduction modulo 2^f of matrix elements is removed here, because it is
+//   assumed that the elements are already canonical representatives modulo
+//   2^f; this is ensured by calling check_canonical_basis_change_matrix() at
+//   the beginning of protocols_verify().
+static int
+matrix_scalar_application_even_basis(ec_basis_t *bas, const ec_curve_t *E, scalar_mtx_2x2_t *mat, int f)
+{
+    // bas is overwritten below, so every product is taken from a snapshot of it
+    ec_basis_t src;
+    copy_basis(&src, bas);
+
+    // For a matrix [[a, c], [b, d]] we compute:
+    //
+    // first basis element R = [a]P + [b]Q
+    if (!ec_biscalar_mul(&bas->P, (*mat)[0][0], (*mat)[1][0], f, &src, E))
+        return 0;
+    // second basis element S = [c]P + [d]Q
+    if (!ec_biscalar_mul(&bas->Q, (*mat)[0][1], (*mat)[1][1], f, &src, E))
+        return 0;
+
+    // Their difference R - S = [a - c]P + [b - d]Q, where both coefficients
+    // go through mp_mod_2exp with exponent f after the subtraction
+    scalar_t coeff_P = { 0 }, coeff_Q = { 0 };
+    mp_sub(coeff_P, (*mat)[0][0], (*mat)[0][1], NWORDS_ORDER);
+    mp_mod_2exp(coeff_P, f, NWORDS_ORDER);
+    mp_sub(coeff_Q, (*mat)[1][0], (*mat)[1][1], NWORDS_ORDER);
+    mp_mod_2exp(coeff_Q, f, NWORDS_ORDER);
+    return ec_biscalar_mul(&bas->PmQ, coeff_P, coeff_Q, f, &src, E);
+}
+
+// Compute the bases for the challenge and auxiliary curve from
+// the canonical bases. Challenge basis is reconstructed from the
+// compressed scalars within the challenge.
+static int
+challenge_and_aux_basis_verify(ec_basis_t *B_chall_can,
+                               ec_basis_t *B_aux_can,
+                               ec_curve_t *E_chall,
+                               ec_curve_t *E_aux,
+                               signature_t *sig,
+                               const int pow_dim2_deg_resp)
+{
+    // exponents of the 2-power orders the two bases are brought down to
+    const int aux_order = pow_dim2_deg_resp + HD_extra_torsion;
+    const int chall_order = aux_order + sig->two_resp_length;
+
+    // recovering the canonical basis as TORSION_EVEN_POWER for consistency with signing
+    if (!ec_curve_to_basis_2f_from_hint(B_chall_can, E_chall, TORSION_EVEN_POWER, sig->hint_chall))
+        return 0;
+
+    // setting to the right order
+    ec_dbl_iter_basis(B_chall_can, TORSION_EVEN_POWER - chall_order, B_chall_can, E_chall);
+
+    if (!ec_curve_to_basis_2f_from_hint(B_aux_can, E_aux, TORSION_EVEN_POWER, sig->hint_aux))
+        return 0;
+
+    // setting to the right order
+    ec_dbl_iter_basis(B_aux_can, TORSION_EVEN_POWER - aux_order, B_aux_can, E_aux);
+
+#ifndef NDEBUG
+    if (!test_basis_order_twof(B_chall_can, E_chall, chall_order))
+        debug_print("canonical basis has wrong order, expect something to fail");
+#endif
+
+    // applying the change matrix on the basis of E_chall
+    return matrix_scalar_application_even_basis(B_chall_can,
+                                                E_chall,
+                                                &sig->mat_Bchall_can_to_B_chall,
+                                                chall_order);
+}
+
+// When two_resp_length is non-zero, we must compute a small 2^n-isogeny
+// updating E_chall as the codomain as well as push the basis on E_chall
+// through this isogeny
+static int
+two_response_isogeny_verify(ec_curve_t *E_chall, ec_basis_t *B_chall_can, const signature_t *sig, int pow_dim2_deg_resp)
+{
+    const int dim2_order = pow_dim2_deg_resp + HD_extra_torsion;
+    ec_point_t ker, points[3];
+
+    // choosing the right point for the small two_isogenies: Q when the matrix
+    // entries [0][0] and [1][0] are both even, P otherwise
+    const int first_col_even = mp_is_even(sig->mat_Bchall_can_to_B_chall[0][0], NWORDS_ORDER) &&
+                               mp_is_even(sig->mat_Bchall_can_to_B_chall[1][0], NWORDS_ORDER);
+    copy_point(&ker, first_col_even ? &B_chall_can->Q : &B_chall_can->P);
+
+    // the three basis points are pushed through the isogeny below
+    copy_point(&points[0], &B_chall_can->P);
+    copy_point(&points[1], &B_chall_can->Q);
+    copy_point(&points[2], &B_chall_can->PmQ);
+
+    ec_dbl_iter(&ker, dim2_order, &ker, E_chall);
+
+#ifndef NDEBUG
+    if (!test_point_order_twof(&ker, E_chall, sig->two_resp_length))
+        debug_print("kernel does not have order 2^(two_resp_length");
+#endif
+
+    if (ec_eval_small_chain(E_chall, &ker, sig->two_resp_length, points, 3, false)) {
+        return 0; // a non-zero result is treated as failure
+    }
+
+#ifndef NDEBUG
+    if (!test_point_order_twof(&points[0], E_chall, dim2_order))
+        debug_print("points[0] does not have order 2^(HD_extra_torsion + pow_dim2_deg_resp");
+    if (!test_point_order_twof(&points[1], E_chall, dim2_order))
+        debug_print("points[1] does not have order 2^(HD_extra_torsion + pow_dim2_deg_resp");
+    if (!test_point_order_twof(&points[2], E_chall, dim2_order))
+        debug_print("points[2] does not have order 2^(HD_extra_torsion + pow_dim2_deg_resp");
+#endif
+
+    copy_point(&B_chall_can->P, &points[0]);
+    copy_point(&B_chall_can->Q, &points[1]);
+    copy_point(&B_chall_can->PmQ, &points[2]);
+    return 1;
+}
+
+// The commitment curve can be recovered from the codomain of the 2D
+// isogeny built from the bases computed during verification.
+static int
+compute_commitment_curve_verify(ec_curve_t *E_com,
+                                const ec_basis_t *B_chall_can,
+                                const ec_basis_t *B_aux_can,
+                                const ec_curve_t *E_chall,
+                                const ec_curve_t *E_aux,
+                                int pow_dim2_deg_resp)
+
+{
+#ifndef NDEBUG
+    // Check all the points are the correct order
+    if (!test_basis_order_twof(B_chall_can, E_chall, HD_extra_torsion + pow_dim2_deg_resp))
+        debug_print("B_chall_can does not have order 2^(HD_extra_torsion + pow_dim2_deg_resp");
+
+    if (!test_basis_order_twof(B_aux_can, E_aux, HD_extra_torsion + pow_dim2_deg_resp))
+        debug_print("B_aux_can does not have order 2^(HD_extra_torsion + pow_dim2_deg_resp");
+#endif
+
+    // Domain of the dim2 isogeny Echall x E_aux -> E_com x E_aux'
+    theta_couple_curve_t EchallxEaux;
+    copy_curve(&EchallxEaux.E1, E_chall);
+    copy_curve(&EchallxEaux.E2, E_aux);
+
+    // Its kernel is B_chall_can x B_aux_can
+    theta_kernel_couple_points_t dim_two_ker;
+    copy_bases_to_kernel(&dim_two_ker, B_chall_can, B_aux_can);
+
+    // Codomain, filled in by one of the two branches below
+    theta_couple_curve_t codomain;
+    ec_curve_init(&codomain.E1);
+    ec_curve_init(&codomain.E2);
+
+    int codomain_splits;
+    if (pow_dim2_deg_resp != 0) {
+        // generic case: compute the dim2 isogeny with the extra verification checks
+        codomain_splits = theta_chain_compute_and_eval_verify(
+            pow_dim2_deg_resp, &EchallxEaux, &dim_two_ker, true, &codomain, NULL, 0);
+    } else {
+        // special case where we don't need to perform any dim2 computation:
+        // the codomain is the domain itself.
+        // We still need to check that E_chall is supersingular
+        // This assumes that HD_extra_torsion == 2
+        if (!ec_is_basis_four_torsion(B_chall_can, E_chall))
+            return 0;
+        copy_curve(&codomain.E1, &EchallxEaux.E1);
+        copy_curve(&codomain.E2, &EchallxEaux.E2);
+        codomain_splits = 1;
+    }
+
+    // computing the commitment curve
+    // its always the first one because of our (2^n,2^n)-isogeny formulae
+    copy_curve(E_com, &codomain.E1);
+
+    return codomain_splits;
+}
+
+// SQIsign verification
+int
+protocols_verify(signature_t *sig, const public_key_t *pk, const unsigned char *m, size_t l)
+{
+    // Computation of the length of the dim 2 2^n isogeny
+    int pow_dim2_deg_resp = SQIsign_response_length - (int)sig->two_resp_length - (int)sig->backtracking;
+
+    // basic sanity test: checking that the response is not too long.
+    // This has to come first: it guarantees backtracking <= SQIsign_response_length,
+    // so the shift amount used by check_canonical_basis_change_matrix() is not negative.
+    if (pow_dim2_deg_resp < 0)
+        return 0;
+    // The dim 2 isogeny embeds a dim 1 isogeny of odd degree, so it can
+    // never be of length 2.
+    if (pow_dim2_deg_resp == 1)
+        return 0;
+
+    if (!check_canonical_basis_change_matrix(sig))
+        return 0;
+
+    // check the public curve is valid
+    if (!ec_curve_verify_A(&(pk->curve).A))
+        return 0;
+
+    // Set auxiliary curve from the A-coefficient within the signature
+    ec_curve_t E_aux;
+    if (!ec_curve_init_from_A(&E_aux, &sig->E_aux_A))
+        return 0; // invalid curve
+
+    // checking that we are given A-coefficients and no precomputation
+    assert(fp2_is_one(&pk->curve.C) == 0xFFFFFFFF && !pk->curve.is_A24_computed_and_normalized);
+
+    // computation of the challenge
+    ec_curve_t E_chall;
+    if (!compute_challenge_verify(&E_chall, sig, &pk->curve, pk->hint_pk)) {
+        return 0;
+    }
+
+    // Computation of the canonical bases for the challenge and aux curve
+    ec_basis_t B_chall_can, B_aux_can;
+
+    if (!challenge_and_aux_basis_verify(&B_chall_can, &B_aux_can, &E_chall, &E_aux, sig, pow_dim2_deg_resp)) {
+        return 0;
+    }
+
+    // When two_resp_length != 0 we need to compute a second, short 2^r-isogeny
+    if (sig->two_resp_length > 0) {
+        if (!two_response_isogeny_verify(&E_chall, &B_chall_can, sig, pow_dim2_deg_resp)) {
+            return 0;
+        }
+    }
+
+    // We can recover the commitment curve with a 2D isogeny.
+    // If the supplied signature does not yield an isogeny between elliptic
+    // products, it is definitely an invalid signature.
+    ec_curve_t E_com;
+    if (!compute_commitment_curve_verify(&E_com, &B_chall_can, &B_aux_can, &E_chall, &E_aux, pow_dim2_deg_resp))
+        return 0;
+
+    scalar_t chk_chall;
+
+    // recomputing the challenge vector
+    hash_to_challenge(&chk_chall, pk, &E_com, m, l);
+
+    // performing the final check: accept iff the recomputed challenge matches the signed one
+    const int verify = mp_compare(sig->chall_coeff, chk_chall, NWORDS_ORDER) == 0;
+
+    return verify;
+}
diff --git a/src/pqm4/sqisign_lvl3/ref/xeval.c b/src/pqm4/sqisign_lvl3/ref/xeval.c
new file mode 100644
index 0000000..7fc7170
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/xeval.c
@@ -0,0 +1,64 @@
+#include "isog.h"
+#include "ec.h"
+#include <assert.h>
+
+// -----------------------------------------------------------------------------------------
+// -----------------------------------------------------------------------------------------
+
+// Degree-2 isogeny evaluation with kernel generated by P != (0, 0)
+void
+xeval_2(ec_point_t *R, ec_point_t *const Q, const int lenQ, const ec_kps2_t *kps)
+{
+    fp2_t t0, t1, t2;
+    for (int j = 0; j < lenQ; j++) { // each Q[j] = (X : Z) is mapped to R[j]
+        fp2_add(&t0, &Q[j].x, &Q[j].z); // t0 = X + Z
+        fp2_sub(&t1, &Q[j].x, &Q[j].z); // t1 = X - Z
+        fp2_mul(&t2, &kps->K.x, &t1);   // t2 = K.x * (X - Z)
+        fp2_mul(&t1, &kps->K.z, &t0);   // t1 = K.z * (X + Z)
+        fp2_add(&t0, &t2, &t1);         // t0 = t2 + t1
+        fp2_sub(&t1, &t2, &t1);         // t1 = t2 - t1
+        fp2_mul(&R[j].x, &Q[j].x, &t0); // R.x = X * t0
+        fp2_mul(&R[j].z, &Q[j].z, &t1); // R.z = Z * t1
+    }
+}
+
+void
+xeval_2_singular(ec_point_t *R, const ec_point_t *Q, const int lenQ, const ec_kps2_t *kps)
+{
+    fp2_t t0, t1;
+    for (int i = 0; i < lenQ; i++) { // each Q[i] = (X : Z) is mapped to R[i]
+        fp2_mul(&t0, &Q[i].x, &Q[i].z);   // t0 = X * Z
+        fp2_mul(&t1, &kps->K.x, &Q[i].z); // t1 = K.x * Z
+        fp2_add(&t1, &t1, &Q[i].x);       // t1 = K.x * Z + X
+        fp2_mul(&t1, &t1, &Q[i].x);       // t1 = X * (K.x * Z + X)
+        fp2_sqr(&R[i].x, &Q[i].z);        // R.x = Z^2
+        fp2_add(&R[i].x, &R[i].x, &t1);   // R.x = Z^2 + X * (K.x * Z + X)
+        fp2_mul(&R[i].z, &t0, &kps->K.z); // R.z = K.z * X * Z
+    }
+}
+
+// Degree-4 isogeny evaluation with kernel generated by P such that [2]P != (0, 0)
+void
+xeval_4(ec_point_t *R, const ec_point_t *Q, const int lenQ, const ec_kps4_t *kps)
+{
+    const ec_point_t *K = kps->K;
+
+    fp2_t t0, t1;
+
+    for (int i = 0; i < lenQ; i++) { // each Q[i] = (X : Z) is mapped to R[i]
+        fp2_add(&t0, &Q[i].x, &Q[i].z);           // t0 = X + Z
+        fp2_sub(&t1, &Q[i].x, &Q[i].z);           // t1 = X - Z
+        fp2_mul(&(R[i].x), &t0, &K[1].x);         // R.x = (X + Z) * K[1].x
+        fp2_mul(&(R[i].z), &t1, &K[2].x);         // R.z = (X - Z) * K[2].x
+        fp2_mul(&t0, &t0, &t1);                   // t0 = (X + Z) * (X - Z)
+        fp2_mul(&t0, &t0, &K[0].x);               // t0 = K[0].x * (X + Z) * (X - Z)
+        fp2_add(&t1, &(R[i].x), &(R[i].z));       // t1 = R.x + R.z
+        fp2_sub(&(R[i].z), &(R[i].x), &(R[i].z)); // R.z = R.x - R.z
+        fp2_sqr(&t1, &t1);                        // t1 = t1^2
+        fp2_sqr(&(R[i].z), &(R[i].z));            // R.z = R.z^2
+        fp2_add(&(R[i].x), &t0, &t1);             // R.x = t0 + t1
+        fp2_sub(&t0, &t0, &(R[i].z));             // t0 = t0 - R.z
+        fp2_mul(&(R[i].x), &(R[i].x), &t1);       // R.x = (t0 + t1) * t1, with t0 as before the previous line
+        fp2_mul(&(R[i].z), &(R[i].z), &t0);       // R.z = R.z * t0
+    }
+}
diff --git a/src/pqm4/sqisign_lvl3/ref/xisog.c b/src/pqm4/sqisign_lvl3/ref/xisog.c
new file mode 100644
index 0000000..7242d29
--- /dev/null
+++ b/src/pqm4/sqisign_lvl3/ref/xisog.c
@@ -0,0 +1,61 @@
+#include "isog.h"
+#include "ec.h"
+#include <assert.h>
+
+// -------------------------------------------------------------------------
+// -------------------------------------------------------------------------
+
+// Degree-2 isogeny with kernel generated by P != (0 ,0)
+// Outputs the curve coefficient in the form A24=(A+2C:4C)
+void
+xisog_2(ec_kps2_t *kps, ec_point_t *B, const ec_point_t P)
+{ // Writes the codomain coefficient pair to B and the two constants kps->K.x, kps->K.z
+    fp2_sqr(&B->x, &P.x);            // B.x = PX^2
+    fp2_sqr(&B->z, &P.z);            // B.z = PZ^2
+    fp2_sub(&B->x, &B->z, &B->x);    // B.x = PZ^2 - PX^2
+    fp2_add(&kps->K.x, &P.x, &P.z);  // K.x = PX + PZ
+    fp2_sub(&kps->K.z, &P.x, &P.z);  // K.z = PX - PZ
+}
+
+void
+xisog_2_singular(ec_kps2_t *kps, ec_point_t *B24, ec_point_t A24)
+{ // Writes kps->K and B24 from A24 = (x : z); A24 is passed by value, so the caller's copy is untouched
+    // No need to check the square root, only used for signing.
+    fp2_t t0, four;
+    fp2_set_small(&four, 4);            // four = 4
+    fp2_add(&t0, &A24.x, &A24.x);       // t0 = 2x
+    fp2_sub(&t0, &t0, &A24.z);          // t0 = 2x - z
+    fp2_add(&t0, &t0, &t0);             // t0 = 4x - 2z
+    fp2_inv(&A24.z);                    // local z <- 1/z
+    fp2_mul(&t0, &t0, &A24.z);          // a = (4x - 2z) / z
+    fp2_copy(&kps->K.x, &t0);           // K.x = a
+    fp2_add(&B24->x, &t0, &t0);         // B24.x = 2a
+    fp2_sqr(&t0, &t0);                  // t0 = a^2
+    fp2_sub(&t0, &t0, &four);           // t0 = a^2 - 4
+    fp2_sqrt(&t0);                      // s = sqrt(a^2 - 4), computed in place and not verified
+    fp2_neg(&kps->K.z, &t0);            // K.z = -s
+    fp2_add(&B24->z, &t0, &t0);         // B24.z = 2s
+    fp2_add(&B24->x, &B24->x, &B24->z); // B24.x = 2a + 2s
+    fp2_add(&B24->z, &B24->z, &B24->z); // B24.z = 4s
+}
+
+// Degree-4 isogeny with kernel generated by P such that [2]P != (0 ,0)
+// Outputs the curve coefficient in the form A24=(A+2C:4C)
+void
+xisog_4(ec_kps4_t *kps, ec_point_t *B, const ec_point_t P)
+{ // Writes the codomain coefficient pair to B and the constants kps->K[0..2] (K[1].x is overwritten below)
+    ec_point_t *K = kps->K;
+
+    fp2_sqr(&K[0].x, &P.x);              // K[0].x = PX^2
+    fp2_sqr(&K[0].z, &P.z);              // K[0].z = PZ^2
+    fp2_add(&K[1].x, &K[0].z, &K[0].x);  // K[1].x = PZ^2 + PX^2
+    fp2_sub(&K[1].z, &K[0].z, &K[0].x);  // K[1].z = PZ^2 - PX^2
+    fp2_mul(&B->x, &K[1].x, &K[1].z);    // B.x = (PZ^2 + PX^2) * (PZ^2 - PX^2)
+    fp2_sqr(&B->z, &K[0].z);             // B.z = PZ^4
+
+    // Constants for xeval_4
+    fp2_add(&K[2].x, &P.x, &P.z);        // K[2].x = PX + PZ
+    fp2_sub(&K[1].x, &P.x, &P.z);        // K[1].x = PX - PZ (replaces the sum stored above)
+    fp2_add(&K[0].x, &K[0].z, &K[0].z);  // K[0].x = 2 * PZ^2
+    fp2_add(&K[0].x, &K[0].x, &K[0].x);  // K[0].x = 4 * PZ^2
+}
diff --git a/src/pqm4/sqisign_lvl5/ref/api.h b/src/pqm4/sqisign_lvl5/ref/api.h
new file mode 100644
index 0000000..cf96baf
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/api.h
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef api_h
+#define api_h
+
+#include <stddef.h>
+#include <sqisign_namespace.h>
+
+#define CRYPTO_SECRETKEYBYTES 701 // presumably the encoded secret-key length in bytes -- confirm against the encoder
+#define CRYPTO_PUBLICKEYBYTES 129 // presumably the encoded public-key length in bytes -- confirm against the encoder
+#define CRYPTO_BYTES 292          // presumably the signature length in bytes -- confirm against the encoder
+
+#define CRYPTO_ALGNAME "SQIsign_lvl5"
+
+SQISIGN_API // NOTE(review): the three entry points use NIST/SUPERCOP-style names; buffer-size contracts are not stated in this header
+int
+crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
+
+SQISIGN_API
+int
+crypto_sign(unsigned char *sm, size_t *smlen,
+            const unsigned char *m, size_t mlen,
+            const unsigned char *sk);
+
+SQISIGN_API
+int
+crypto_sign_open(unsigned char *m, size_t *mlen,
+                 const unsigned char *sm, size_t smlen,
+                 const unsigned char *pk);
+
+#endif /* api_h */
diff --git a/src/pqm4/sqisign_lvl5/ref/basis.c b/src/pqm4/sqisign_lvl5/ref/basis.c
new file mode 100644
index 0000000..94cb7fc
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/basis.c
@@ -0,0 +1,416 @@
+#include "ec.h"
+#include "fp2.h"
+#include "e0_basis.h"
+#include <assert.h>
+
+uint32_t
+ec_recover_y(fp2_t *y, const fp2_t *Px, const ec_curve_t *curve)
+{ // Evaluates x^3 + A*x^2 + x at x = Px into *y (only curve->A is read), then hands *y to fp2_sqrt_verify
+    fp2_t pw; // holds x^2 first, x^3 afterwards
+
+    fp2_sqr(&pw, Px);               // pw = x^2
+    fp2_mul(y, &pw, &(curve->A));   // y  = A*x^2
+    fp2_add(y, y, Px);              // y  = A*x^2 + x
+    fp2_mul(&pw, &pw, Px);          // pw = x^3
+    fp2_add(y, y, &pw);             // y  = x^3 + A*x^2 + x
+    // The root has to be verified: the curve is not yet known to be
+    // supersingular, so the point may live on the twist with B = 1.
+    return fp2_sqrt_verify(y);
+}
+
+static void
+difference_point(ec_point_t *PQ, const ec_point_t *P, const ec_point_t *Q, const ec_curve_t *curve)
+{
+    // Given P,Q in projective x-only, computes a deterministic choice for (P-Q)
+    // Based on Proposition 3 of https://eprint.iacr.org/2017/518.pdf
+    // Output (X:Z) = (Bxz + sqrt(Bxz^2 - Bxx*Bzz) : Bzz), a root of Bzz*x^2 - 2*Bxz*x + Bxx
+    fp2_t Bxx, Bxz, Bzz, t0, t1;
+
+    fp2_mul(&t0, &P->x, &Q->x);     // t0 = P.x*Q.x
+    fp2_mul(&t1, &P->z, &Q->z);     // t1 = P.z*Q.z
+    fp2_sub(&Bxx, &t0, &t1);        // P.x*Q.x - P.z*Q.z
+    fp2_sqr(&Bxx, &Bxx);            // (P.x*Q.x - P.z*Q.z)^2
+    fp2_mul(&Bxx, &Bxx, &curve->C); // C*(P.x*Q.x-P.z*Q.z)^2
+    fp2_add(&Bxz, &t0, &t1);        // P.x*Q.x + P.z*Q.z
+    fp2_mul(&t0, &P->x, &Q->z);     // t0 = P.x*Q.z
+    fp2_mul(&t1, &P->z, &Q->x);     // t1 = P.z*Q.x
+    fp2_add(&Bzz, &t0, &t1);        // P.x*Q.z + P.z*Q.x (temporary use of Bzz)
+    fp2_mul(&Bxz, &Bxz, &Bzz); // (P.x*Q.x+P.z*Q.z)(P.x*Q.z+P.z*Q.x)
+    fp2_sub(&Bzz, &t0, &t1);        // P.x*Q.z - P.z*Q.x
+    fp2_sqr(&Bzz, &Bzz);            // (P.x*Q.z - P.z*Q.x)^2
+    fp2_mul(&Bzz, &Bzz, &curve->C); // C*(P.x*Q.z-P.z*Q.x)^2
+    fp2_mul(&Bxz, &Bxz, &curve->C); // C*(P.x*Q.x+P.z*Q.z)(P.x*Q.z+P.z*Q.x)
+    fp2_mul(&t0, &t0, &t1);         // P.x*Q.z*P.z*Q.x
+    fp2_mul(&t0, &t0, &curve->A);   // A*P.x*Q.z*P.z*Q.x
+    fp2_add(&t0, &t0, &t0);         // 2*A*P.x*Q.z*P.z*Q.x
+    fp2_add(&Bxz, &Bxz, &t0); // C*(P.x*Q.x+P.z*Q.z)(P.x*Q.z+P.z*Q.x) + 2*A*P.x*Q.z*P.z*Q.x
+
+    // To ensure that the denominator is a fourth power in Fp, we normalize by
+    // C*C_bar^2*(P.z)_bar^2*(Q.z)_bar^2
+    fp_copy(&t0.re, &curve->C.re);
+    fp_neg(&t0.im, &curve->C.im);   // t0 = C_bar (re copied, im negated)
+    fp2_sqr(&t0, &t0);              // C_bar^2
+    fp2_mul(&t0, &t0, &curve->C);   // C*C_bar^2
+    fp_copy(&t1.re, &P->z.re);
+    fp_neg(&t1.im, &P->z.im);       // t1 = (P.z)_bar
+    fp2_sqr(&t1, &t1);              // (P.z)_bar^2
+    fp2_mul(&t0, &t0, &t1);         // C*C_bar^2*(P.z)_bar^2
+    fp_copy(&t1.re, &Q->z.re);
+    fp_neg(&t1.im, &Q->z.im);       // t1 = (Q.z)_bar
+    fp2_sqr(&t1, &t1);              // (Q.z)_bar^2
+    fp2_mul(&t0, &t0, &t1);         // full normalization factor
+    fp2_mul(&Bxx, &Bxx, &t0);       // all three coefficients are scaled by the same factor
+    fp2_mul(&Bxz, &Bxz, &t0);
+    fp2_mul(&Bzz, &Bzz, &t0);
+
+    // Solving quadratic equation
+    fp2_sqr(&t0, &Bxz);             // Bxz^2
+    fp2_mul(&t1, &Bxx, &Bzz);       // Bxx*Bzz
+    fp2_sub(&t0, &t0, &t1);         // discriminant Bxz^2 - Bxx*Bzz
+    // No need to check if t0 is square, as per the entangled basis algorithm.
+    fp2_sqrt(&t0);
+    fp2_add(&PQ->x, &Bxz, &t0);     // X = Bxz + sqrt(discriminant)
+    fp2_copy(&PQ->z, &Bzz);         // Z = Bzz
+}
+
+// Lifts a basis x(P), x(Q), x(P-Q) assuming the curve has (A/C : 1) and the point
+// P = (X/Z : 1). For generic implementation see lift_basis()
+uint32_t
+lift_basis_normalized(jac_point_t *P, jac_point_t *Q, ec_basis_t *B, ec_curve_t *E)
+{ // Fills P and Q (x, y, z each); the return value is whatever ec_recover_y reports for y(P)
+    assert(fp2_is_one(&B->P.z));
+    assert(fp2_is_one(&E->C));
+
+    fp2_copy(&P->x, &B->P.x);
+    fp2_copy(&Q->x, &B->Q.x);
+    fp2_copy(&Q->z, &B->Q.z);
+    fp2_set_one(&P->z);
+    uint32_t ret = ec_recover_y(&P->y, &P->x, E); // y(P) from x(P); status kept for the caller
+
+    // Algorithm of Okeya-Sakurai to recover y.Q in the montgomery model
+    fp2_t v1, v2, v3, v4;
+    fp2_mul(&v1, &P->x, &Q->z);    // v1 = xP*ZQ
+    fp2_add(&v2, &Q->x, &v1);      // v2 = XQ + xP*ZQ
+    fp2_sub(&v3, &Q->x, &v1);      // v3 = XQ - xP*ZQ
+    fp2_sqr(&v3, &v3);             // v3 = (XQ - xP*ZQ)^2
+    fp2_mul(&v3, &v3, &B->PmQ.x);  // v3 *= X(P-Q)
+    fp2_add(&v1, &E->A, &E->A);    // v1 = 2A
+    fp2_mul(&v1, &v1, &Q->z);      // v1 = 2A*ZQ
+    fp2_add(&v2, &v2, &v1);        // v2 = XQ + xP*ZQ + 2A*ZQ
+    fp2_mul(&v4, &P->x, &Q->x);    // v4 = xP*XQ
+    fp2_add(&v4, &v4, &Q->z);      // v4 = xP*XQ + ZQ
+    fp2_mul(&v2, &v2, &v4);        // v2 *= v4
+    fp2_mul(&v1, &v1, &Q->z);      // v1 = 2A*ZQ^2
+    fp2_sub(&v2, &v2, &v1);        // v2 -= 2A*ZQ^2
+    fp2_mul(&v2, &v2, &B->PmQ.z);  // v2 *= Z(P-Q)
+    fp2_sub(&Q->y, &v3, &v2);      // y(Q) numerator = v3 - v2
+    fp2_add(&v1, &P->y, &P->y);    // v1 = 2*yP
+    fp2_mul(&v1, &v1, &Q->z);      // v1 = 2*yP*ZQ
+    fp2_mul(&v1, &v1, &B->PmQ.z);  // v1 = 2*yP*ZQ*Z(P-Q)
+    fp2_mul(&Q->x, &Q->x, &v1);    // XQ and ZQ are scaled by the same factor v1
+    fp2_mul(&Q->z, &Q->z, &v1);
+
+    // Transforming to a jacobian coordinate
+    fp2_sqr(&v1, &Q->z);           // v1 = ZQ^2
+    fp2_mul(&Q->y, &Q->y, &v1);    // y(Q) *= ZQ^2
+    fp2_mul(&Q->x, &Q->x, &Q->z);  // x(Q) *= ZQ
+    return ret;
+}
+
+uint32_t
+lift_basis(jac_point_t *P, jac_point_t *Q, ec_basis_t *B, ec_curve_t *E)
+{ // Generic entry point: rescales B->P and E in place, then defers to lift_basis_normalized()
+    // Slot 0 holds Z(P) and slot 1 holds C, so a single batched inversion serves both
+    fp2_t denom[2];
+    fp2_copy(denom, &B->P.z);
+    fp2_copy(denom + 1, &E->C);
+    fp2_batched_inv(denom, 2);
+
+    // Scale the numerators: x(P) becomes X/Z and the curve coefficient becomes A/C
+    fp2_mul(&B->P.x, &B->P.x, denom);
+    fp2_mul(&E->A, &E->A, denom + 1);
+
+    // Both denominators are now exactly one
+    fp2_set_one(&B->P.z);
+    fp2_set_one(&E->C);
+
+    return lift_basis_normalized(P, Q, B, E);
+}
+
+// Tests whether x can be the x-coordinate of a point on the curve by checking
+// that x^3 + A*x^2 + x is a square. Requires a normalized curve (C = 1).
+static uint32_t
+is_on_curve(const fp2_t *x, const ec_curve_t *curve)
+{
+    fp2_t rhs; // right-hand side of the curve equation, built up by Horner's rule
+    assert(fp2_is_one(&curve->C));
+
+    fp2_add(&rhs, x, &(curve->A)); // x + A
+    fp2_mul(&rhs, &rhs, x);        // x^2 + A*x
+    fp2_add_one(&rhs, &rhs);       // x^2 + A*x + 1
+    fp2_mul(&rhs, &rhs, x);        // x^3 + A*x^2 + x
+
+    return fp2_is_square(&rhs);
+}
+
+// Turns a point of order k*2^n (n maximal, k odd) into a point of order 2^f:
+// the odd part k is removed first, then the surplus factors of two.
+static inline void
+clear_cofactor_for_maximal_even_order(ec_point_t *P, ec_curve_t *curve, int f)
+{
+    // Multiply by the odd cofactor, leaving a point of order 2^n
+    ec_mul(P, p_cofactor_for_2f, P_COFACTOR_FOR_2F_BITLENGTH, P, curve);
+
+    // One doubling for every power of two above 2^f (TORSION_EVEN_POWER - f in total)
+    for (int remaining = TORSION_EVEN_POWER - f; remaining > 0; remaining--) {
+        xDBL_A24(P, P, &(curve->A24), curve->is_A24_computed_and_normalized);
+    }
+}
+
+// Helper function which finds an NQR -1 / (1 + i*b) for entangled basis generation
+static uint8_t
+find_nqr_factor(fp2_t *x, ec_curve_t *curve, const uint8_t start)
+{
+    // factor = -1/(1 + i*b) for b in Fp will be NQR whenever 1 + b^2 is NQR
+    // in Fp, so we find one of these and then invert (1 + i*b). We store b
+    // as a u8 hint to save time in verification.
+
+    // The search counter n is 16 bits wide even though the returned hint is a
+    // u8: the search may run past the 7-bit hint range (making failure
+    // cryptographically negligible), in which case the hint 0 is returned.
+    uint8_t hint;
+    uint32_t found = 0;
+    uint16_t n = start; // next candidate for b; after the inner loop b = n - 1
+
+    bool qr_b = 1; // starts true so the inner search runs at least once
+    fp_t b, tmp;
+    fp2_t z, t0, t1;
+
+    do {
+        while (qr_b) {
+            // find b with 1 + b^2 a non-quadratic residue
+            fp_set_small(&tmp, (uint32_t)n * n + 1);
+            qr_b = fp_is_square(&tmp);
+            n++; // keeps track of b = n - 1
+        }
+
+        // for Px := -A/(1 + i*b) to be on the curve
+        // is equivalent to A^2*(z-1) - z^2 NQR for z = 1 + i*b
+        // thus prevents unnecessary inversion pre-check
+
+        // t0 = z - 1 = i*b
+        // z  = 1 + i*b (t1 is only scratch space below)
+        fp_set_small(&b, (uint32_t)n - 1);
+        fp2_set_zero(&t0);
+        fp2_set_one(&z);
+        fp_copy(&z.im, &b);
+        fp_copy(&t0.im, &b);
+
+        // A^2*(z-1) - z^2
+        fp2_sqr(&t1, &curve->A); // t1 = A^2
+        fp2_mul(&t0, &t0, &t1); // A^2 * (z - 1)
+        fp2_sqr(&t1, &z);       // t1 = z^2
+        fp2_sub(&t0, &t0, &t1); // A^2 * (z - 1) - z^2
+        found = !fp2_is_square(&t0);
+
+        qr_b = 1; // re-arm the inner search in case this b was rejected
+    } while (!found);
+
+    // set Px to -A/(1 + i*b)
+    fp2_copy(x, &z);
+    fp2_inv(x);
+    fp2_mul(x, x, &curve->A);
+    fp2_neg(x, x);
+
+    /*
+     * With very low probability b = n - 1 will not fit in 7 bits.
+     * We set hint = 0 which signals failure and the need
+     * to generate a value on the fly during verification
+     */
+    hint = n <= 128 ? n - 1 : 0;
+
+    return hint;
+}
+
+// Helper function which finds a point x(P) = n * A, returning n as a 7-bit hint (0 if n >= 128)
+static uint8_t
+find_nA_x_coord(fp2_t *x, ec_curve_t *curve, const uint8_t start)
+{
+    assert(!fp2_is_square(&curve->A)); // Only to be called when A is a NQR
+
+    // when A is NQR we allow x(P) to be a multiple n*A of A
+    uint16_t n = start; // wider than the hint so n cannot wrap back into the 7-bit range
+    if (n == 1) {
+        fp2_copy(x, &curve->A);
+    } else {
+        fp2_mul_small(x, &curve->A, n);
+    }
+
+    while (!is_on_curve(x, curve)) {
+        fp2_add(x, x, &curve->A); // x = (n + 1)*A
+        n++;
+    }
+
+    /*
+     * With very low probability (1/2^128), n will not fit in 7 bits.
+     * In this case, we set hint = 0 which signals failure and the need
+     * to generate a value on the fly during verification
+     */
+    uint8_t hint = n < 128 ? (uint8_t)n : 0;
+    return hint;
+}
+
+// A = 0 is excluded by the entangled basis generation, so for this curve the
+// basis is derived from the precomputed x-coordinates BASIS_E0_PX and BASIS_E0_QX
+static void
+ec_basis_E0_2f(ec_basis_t *PQ2, ec_curve_t *curve, int f)
+{
+    ec_point_t gen_p, gen_q;
+    assert(fp2_is_zero(&curve->A));
+
+    // Load the stored generators as normalized points (X : 1)
+    fp2_set_one(&gen_p.z);
+    fp2_set_one(&gen_q.z);
+    fp2_copy(&gen_p.x, &BASIS_E0_PX);
+    fp2_copy(&gen_q.x, &BASIS_E0_QX);
+
+    // Double both generators TORSION_EVEN_POWER - f times to get points of order 2^f
+    for (int left = TORSION_EVEN_POWER - f; left > 0; left--) {
+        xDBL_E0(&gen_p, &gen_p);
+        xDBL_E0(&gen_q, &gen_q);
+    }
+
+    // Store P and Q in the basis, then derive x(P - Q) from them
+    copy_point(&PQ2->P, &gen_p);
+    copy_point(&PQ2->Q, &gen_q);
+    difference_point(&PQ2->PmQ, &gen_p, &gen_q, curve);
+}
+
+// Builds a basis E[2^f] = <P, Q> with Q above (0 : 0) and returns a one-byte
+// hint (7-bit search result plus a QR flag) for faster recomputation later
+uint8_t
+ec_curve_to_basis_2f_to_hint(ec_basis_t *PQ2, ec_curve_t *curve, int f)
+{
+    ec_point_t P, Q;
+    uint8_t hint;
+
+    // Work with (A/C : 1) and ((A + 2)/4 : 1)
+    ec_normalize_curve_and_A24(curve);
+
+    // A = 0 is not handled by the search below: use the stored basis, hint 0
+    if (fp2_is_zero(&curve->A)) {
+        ec_basis_E0_2f(PQ2, curve, f);
+        return 0;
+    }
+
+    // The way x(P) is searched for depends on whether A is a square:
+    //  - A is QR:  x(P) = -A / (1 + i*b) with (1 + b^2) a NQR
+    //  - A is NQR: x(P) = n*A
+    const bool hint_A = fp2_is_square(&curve->A);
+    if (hint_A) {
+        hint = find_nqr_factor(&P.x, curve, 1);
+    } else {
+        hint = find_nA_x_coord(&P.x, curve, 1);
+    }
+    fp2_set_one(&P.z);
+
+    // x(Q) = -(A + x(P)), again as a normalized point
+    fp2_add(&Q.x, &curve->A, &P.x);
+    fp2_neg(&Q.x, &Q.x);
+    fp2_set_one(&Q.z);
+
+    // clear out the odd cofactor to get points of order 2^f
+    clear_cofactor_for_maximal_even_order(&P, curve, f);
+    clear_cofactor_for_maximal_even_order(&Q, curve, f);
+
+    // The computed difference is stored as the basis point Q and the raw Q as
+    // PmQ; this swap is what ensures the basis point Q is above (0,0)
+    difference_point(&PQ2->Q, &P, &Q, curve);
+    copy_point(&PQ2->P, &P);
+    copy_point(&PQ2->PmQ, &Q);
+
+    // Pack the result into one byte: bits 7..1 carry hint, bit 0 carries hint_A
+    assert(hint < 128); // hint is expected to fit in 7 bits
+    return (hint << 1) | hint_A;
+}
+
+// Rebuilds the basis E[2^f] = <P, Q> with Q above (0 : 0) from a one-byte hint
+// laid out as (search result << 1) | QR flag. Returns 1 on success; the
+// returns of 0 exist only in builds where NDEBUG is not defined.
+int
+ec_curve_to_basis_2f_from_hint(ec_basis_t *PQ2, ec_curve_t *curve, int f, const uint8_t hint)
+{
+    ec_point_t P, Q;
+
+    // Work with (A/C : 1) and ((A + 2)/4 : 1)
+    ec_normalize_curve_and_A24(curve);
+
+    // A = 0 has no hint-based construction: use the stored basis
+    if (fp2_is_zero(&curve->A)) {
+        ec_basis_E0_2f(PQ2, curve, f);
+        return 1;
+    }
+
+    // Unpack the hint: bit 0 says whether A is a QR, the remaining
+    // 7 bits are used to find a valid x(P)
+    const bool hint_A = hint & 1;
+    const uint8_t hint_P = hint >> 1;
+
+    // hint_P == 0 means no point was found in 128 attempts; this is very
+    // rare and the fallback is almost never expected to run. The search is
+    // then repeated here, starting at 128 to skip the values that are
+    // already known not to work.
+    if (hint_P == 0 && hint_A) {
+        find_nqr_factor(&P.x, curve, 128);
+    } else if (hint_P == 0) {
+        find_nA_x_coord(&P.x, curve, 128);
+    } else if (hint_A) {
+        // A is QR: hint_P is b such that (1 + b^2) is a NQR in Fp,
+        // so x(P) = -A / (1 + i*b)
+        fp_set_one(&P.x.re);
+        fp_set_small(&P.x.im, hint_P);
+        fp2_inv(&P.x);
+        fp2_mul(&P.x, &P.x, &curve->A);
+        fp2_neg(&P.x, &P.x);
+    } else {
+        // A is NQR: hint_P is n such that x(P) = n*A
+        fp2_mul_small(&P.x, &curve->A, hint_P);
+    }
+    fp2_set_one(&P.z);
+
+#ifndef NDEBUG
+    // Debug builds only: x(P) must be on the curve and must not be a square
+    int passed = 1;
+    passed = is_on_curve(&P.x, curve);
+    passed &= !fp2_is_square(&P.x);
+
+    if (!passed)
+        return 0;
+#endif
+
+    // set xQ to -xP - A
+    fp2_add(&Q.x, &curve->A, &P.x);
+    fp2_neg(&Q.x, &Q.x);
+    fp2_set_one(&Q.z);
+
+    // clear out the odd cofactor to get points of order 2^f
+    clear_cofactor_for_maximal_even_order(&P, curve, f);
+    clear_cofactor_for_maximal_even_order(&Q, curve, f);
+
+    // The computed difference is stored as the basis point Q and the raw Q as
+    // PmQ; this swap is what ensures the basis point Q is above (0,0)
+    difference_point(&PQ2->Q, &P, &Q, curve);
+    copy_point(&PQ2->P, &P);
+    copy_point(&PQ2->PmQ, &Q);
+
+#ifndef NDEBUG
+    // Debug builds only: also confirm the order of the finished basis
+    passed &= test_basis_order_twof(PQ2, curve, f);
+
+    if (!passed)
+        return 0;
+#endif
+
+    return 1;
+}
diff --git a/src/pqm4/sqisign_lvl5/ref/common.c b/src/pqm4/sqisign_lvl5/ref/common.c
new file mode 100644
index 0000000..d393e9c
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/common.c
@@ -0,0 +1,88 @@
+#include <fips202.h>
+#include <tutil.h>
+#include <mp.h>
+#include <encoded_sizes.h>
+#include <ec_params.h>
+#include <verification.h>
+
+void
+public_key_init(public_key_t *pk)
+{ // Initialising a public key consists solely of initialising its curve member
+    ec_curve_init(&(pk->curve));
+}
+
+void
+public_key_finalize(public_key_t *pk)
+{ // Intentionally empty: nothing is done with pk here
+}
+
+// compute the challenge as the hash of the message and the commitment curve and public key
+void
+hash_to_challenge(scalar_t *scalar,
+                  const public_key_t *pk,
+                  const ec_curve_t *com_curve,
+                  const unsigned char *message,
+                  size_t length)
+{
+    unsigned char buf[2 * FP2_ENCODED_BYTES]; // encoded j(pk->curve) followed by encoded j(com_curve)
+    {
+        fp2_t j1, j2;
+        ec_j_inv(&j1, &pk->curve);
+        ec_j_inv(&j2, com_curve);
+        fp2_encode(buf, &j1);
+        fp2_encode(buf + FP2_ENCODED_BYTES, &j2);
+    }
+
+    {
+        // The type scalar_t represents an element of GF(p), which is about
+        // 2*lambda bits, where lambda = 128, 192 or 256, according to the
+        // security level. Thus, the variable scalar should have enough memory
+        // for the values produced by SHAKE256 in the intermediate iterations.
+
+        shake256incctx ctx;
+
+        size_t hash_bytes = ((2 * SECURITY_BITS) + 7) / 8;                   // intermediate digest length in bytes
+        size_t limbs = (hash_bytes + sizeof(digit_t) - 1) / sizeof(digit_t); // limbs touched by that digest
+        size_t bits = (2 * SECURITY_BITS) % RADIX;                           // used bits in the top limb (0 = all)
+        digit_t mask = ((digit_t)-1) >> ((RADIX - bits) % RADIX);            // keeps only those bits
+#ifdef TARGET_BIG_ENDIAN
+        mask = BSWAP_DIGIT(mask);
+#endif
+
+        shake256_inc_init(&ctx); // first round: SHAKE256(buf || message)
+        shake256_inc_absorb(&ctx, buf, 2 * FP2_ENCODED_BYTES);
+        shake256_inc_absorb(&ctx, message, length);
+        shake256_inc_finalize(&ctx);
+        shake256_inc_squeeze((void *)(*scalar), hash_bytes, &ctx);
+        (*scalar)[limbs - 1] &= mask;
+        for (int i = 2; i < HASH_ITERATIONS; i++) { // middle rounds: re-hash the previous masked digest
+            shake256_inc_init(&ctx);
+            shake256_inc_absorb(&ctx, (void *)(*scalar), hash_bytes);
+            shake256_inc_finalize(&ctx);
+            shake256_inc_squeeze((void *)(*scalar), hash_bytes, &ctx);
+            (*scalar)[limbs - 1] &= mask;
+        }
+        shake256_inc_init(&ctx); // last round: absorb now, squeeze below with the final output length
+        shake256_inc_absorb(&ctx, (void *)(*scalar), hash_bytes);
+        shake256_inc_finalize(&ctx);
+
+        hash_bytes = ((TORSION_EVEN_POWER - SQIsign_response_length) + 7) / 8; // final output length in bytes
+        limbs = (hash_bytes + sizeof(digit_t) - 1) / sizeof(digit_t);
+        bits = (TORSION_EVEN_POWER - SQIsign_response_length) % RADIX;
+        mask = ((digit_t)-1) >> ((RADIX - bits) % RADIX);
+#ifdef TARGET_BIG_ENDIAN
+        mask = BSWAP_DIGIT(mask);
+#endif
+
+        memset(*scalar, 0, NWORDS_ORDER * sizeof(digit_t)); // clear every limb before the shorter final squeeze
+        shake256_inc_squeeze((void *)(*scalar), hash_bytes, &ctx);
+        (*scalar)[limbs - 1] &= mask;
+
+#ifdef TARGET_BIG_ENDIAN
+        for (int i = 0; i < NWORDS_ORDER; i++)
+            (*scalar)[i] = BSWAP_DIGIT((*scalar)[i]);
+#endif
+
+        mp_mod_2exp(*scalar, SECURITY_BITS, NWORDS_ORDER); // final reduction via mp_mod_2exp with exponent SECURITY_BITS
+    }
+}
diff --git a/src/pqm4/sqisign_lvl5/ref/config.mk b/src/pqm4/sqisign_lvl5/ref/config.mk
new file mode 100644
index 0000000..7eead5b
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/config.mk
@@ -0,0 +1,2 @@
+elf/crypto_sign_sqisign_lvl5_ref_%.elf: CPPFLAGS+=-DRADIX_32 -DSQISIGN_BUILD_TYPE_REF -DSQISIGN_GF_IMPL_REF -DSQISIGN_VARIANT=lvl5 -DTARGET_ARM -DTARGET_OS_OTHER -DNDEBUG -DDISABLE_NAMESPACING -DBIG_PUBLIC_KEY_TESTS
+obj/libcrypto_sign_sqisign_lvl5_ref.a: CPPFLAGS+=-DRADIX_32 -DSQISIGN_BUILD_TYPE_REF -DSQISIGN_GF_IMPL_REF -DSQISIGN_VARIANT=lvl5 -DTARGET_ARM -DTARGET_OS_OTHER -DNDEBUG -DDISABLE_NAMESPACING -DBIG_PUBLIC_KEY_TESTS
diff --git a/src/pqm4/sqisign_lvl5/ref/e0_basis.c b/src/pqm4/sqisign_lvl5/ref/e0_basis.c
new file mode 100644
index 0000000..a7148e4
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/e0_basis.c
@@ -0,0 +1,55 @@
+#include <e0_basis.h>
+const fp2_t BASIS_E0_PX = {
+#if 0
+#elif RADIX == 16
+{0x1099, 0xa9f, 0x14f8, 0x1537, 0x1a13, 0x97e, 0x1095, 0xc8b, 0xdd2, 0x1c5f, 0xbdf, 0x1344, 0x1330, 0x1733, 0x185d, 0x1b08, 0x464, 0x76f, 0xe44, 0x3fc, 0x1dc0, 0x1c62, 0x88, 0x972, 0x13f4, 0x18c8, 0x6bd, 0x804, 0x1269, 0x19e0, 0x14bd, 0x10a1, 0xe5e, 0x1af2, 0x156c, 0x3f7, 0x16a1, 0x47d, 0x314}
+#elif RADIX == 32
+{0x184cba61, 0xf4f854f, 0x1fb42753, 0x45c2552, 0x1c5f6e93, 0x2688bdf, 0xedcce66, 0x64d8461, 0x1c8876f2, 0x177007f8, 0x12044718, 0x1913f44b, 0x10d7b8, 0x1cf049a5, 0x1d0a1a5e, 0x1b35e4e5, 0x1508fdea, 0x66d}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x27537a7c2a7e132e, 0x7dba4c8b84aa5fb4, 0xedcce6613445eff1, 0xc7221dbc8c9b08c2, 0x8972044718bb803f, 0x24d280435ee3227e, 0x6bc9cbd0a1a5ee78, 0x1011f6d423f7ab6}
+#else
+{0xa6f4f854fc265d, 0x4c8b84aa5fb427, 0x1309a22f7f8bedd, 0x12326c230bb7339, 0x1177007f8e443b7, 0x3227e897204471, 0x173c12694021af7, 0xd9af272f428697, 0x523eda847ef5}
+#endif
+#endif
+, 
+#if 0
+#elif RADIX == 16
+{0x4b1, 0x178f, 0x107b, 0x6f6, 0x75e, 0x1b27, 0x4db, 0x1e1b, 0xd78, 0x15b6, 0x1130, 0x8cc, 0x1ac0, 0x9b7, 0x692, 0x1e07, 0x1f4, 0xfd7, 0x2ab, 0x7b5, 0x1040, 0xa43, 0xb6d, 0x13a1, 0x1422, 0x10c9, 0x10b0, 0x1540, 0x827, 0xa69, 0x1761, 0x1f25, 0x1d16, 0x16f2, 0x1fcb, 0x92, 0xcba, 0x1c03, 0x3c7}
+#elif RADIX == 32
+{0x1258c7b1, 0xd07bbc7, 0x9cebc6f, 0x10d936f6, 0x15b66bc7, 0x1199130, 0x926df58, 0x1f4f039a, 0x556fd70, 0x1c100f6a, 0x15b6a90, 0x1934229d, 0x15021610, 0x1534a09e, 0xdf25bb0, 0x12ede5d1, 0x5d024bf, 0xa9b}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0xbc6f683dde3c9631, 0xd9af1e1b26dec9ce, 0x926df5808cc89856, 0x5155bf5c3e9e0734, 0x53a15b6a90e0807b, 0x504f540858432684, 0xdbcba2df25bb0a9a, 0x18f00d974092fe5}
+#else
+{0xded07bbc792c63, 0x11e1b26dec9cebc, 0xc046644c2b6cd7, 0x10fa781cd249b7d, 0x1c100f6a2ab7eb, 0x3268453a15b6a9, 0x54d2827aa042c2, 0x1976f2e8b7c96ec, 0x16e01b2e8125f}
+#endif
+#endif
+};
+const fp2_t BASIS_E0_QX = {
+#if 0
+#elif RADIX == 16
+{0x15c, 0x865, 0x1af6, 0x17b9, 0x6a2, 0x1c22, 0x17c5, 0x1149, 0xa7, 0x151e, 0xe57, 0x4c2, 0x18cd, 0xbd2, 0x7a4, 0x7c6, 0x74a, 0xd2, 0x902, 0x68c, 0x21e, 0x1e44, 0x1f5a, 0x1d4c, 0x115b, 0x1777, 0x16d4, 0x503, 0x3af, 0x7e4, 0x1aa7, 0x3dd, 0x827, 0x186b, 0x765, 0x1fc5, 0xc78, 0x9bd, 0xfe}
+#elif RADIX == 32
+{0x10ae12d6, 0x13af6432, 0x88d457b, 0xa4df178, 0x151e053c, 0x14984e57, 0x122f4b19, 0x14a3e31e, 0x12040d23, 0x878d18, 0xcfad791, 0xef15bea, 0x140eda97, 0x13f20ebc, 0xe3ddd53, 0x1970d682, 0x3c7f14e, 0x4eb}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x457b9d7b21942b84, 0x7814f149be2f088d, 0x22f4b19a4c272bd4, 0xc4810348e947c63d, 0x7d4cfad791043c68, 0x75e503b6a5dde2b, 0xe1ad04e3ddd539f9, 0x1326f58f1fc53b2}
+#else
+{0xf73af643285709, 0xf149be2f088d45, 0xcd261395ea3c0a, 0x3a51f18f48bd2c, 0x20878d18902069, 0x1dde2b7d4cfad79, 0x1cfc83af281db52, 0xcb86b4138f7754, 0xb4deb1e3f8a7}
+#endif
+#endif
+, 
+#if 0
+#elif RADIX == 16
+{0x6ac, 0x25e, 0xc7a, 0x1492, 0xd01, 0xbc0, 0x118, 0x376, 0x3e0, 0x7ae, 0x573, 0x171f, 0x35a, 0x1725, 0x48f, 0xc94, 0x133c, 0x16a4, 0x10a8, 0x178d, 0xdd7, 0x798, 0x1d05, 0x39f, 0xc2a, 0x179c, 0x407, 0xd3, 0x118a, 0x1c9f, 0xeac, 0x145b, 0xc35, 0x11a2, 0x58b, 0xe4, 0x5e3, 0xae7, 0x330}
+#elif RADIX == 32
+{0x3563c78, 0x4c7a12f, 0x101a0349, 0x1bb04617, 0x7ae1f00, 0xae3e573, 0x7dc946b, 0x13c64a12, 0x1516a49, 0x375ef1b, 0x1fe829e6, 0x138c2a1c, 0x34c80f7, 0xe4fc628, 0xb45b756, 0x2e344c3, 0xf18390b, 0x339}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x349263d0978d58f, 0xb87c037608c2f01a, 0x7dc946b571f2b99e, 0xd8545a92678c9424, 0x439fe829e61baf78, 0xe3140d3203de7185, 0xc68986b45b756727, 0x32b9cbc60e42c5}
+#else
+{0x924c7a12f1ab1e, 0x37608c2f01a03, 0x15ab8f95ccf5c3e, 0x99e325091f7251, 0xc375ef1b0a8b52, 0x1e7185439fe829e, 0x1393f18a069901e, 0x1171a261ad16dd5, 0x6573978c1c85}
+#endif
+#endif
+};
diff --git a/src/pqm4/sqisign_lvl5/ref/e0_basis.h b/src/pqm4/sqisign_lvl5/ref/e0_basis.h
new file mode 100644
index 0000000..05cafb8
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/e0_basis.h
@@ -0,0 +1,3 @@
+#include <fp2.h>
+extern const fp2_t BASIS_E0_PX;
+extern const fp2_t BASIS_E0_QX;
diff --git a/src/pqm4/sqisign_lvl5/ref/ec.c b/src/pqm4/sqisign_lvl5/ref/ec.c
new file mode 100644
index 0000000..be4e4e5
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/ec.c
@@ -0,0 +1,665 @@
+#include <assert.h>
+#include <stdio.h>
+#include <mp.h>
+#include <ec.h>
+
+void
+ec_point_init(ec_point_t *P)
+{ // Set P to the identity element, represented as (X : Z) = (1 : 0)
+    fp2_set_zero(&P->z);
+    fp2_set_one(&P->x);
+}
+
+void
+ec_curve_init(ec_curve_t *E)
+{ // Reset E to (A : C) = (0 : 1) with no cached A24 value
+    // Mark the cached A24 as not computed
+    E->is_A24_computed_and_normalized = false;
+
+    // Curve constants (A : C) = (0 : 1)
+    fp2_set_one(&E->C);
+    fp2_set_zero(&E->A);
+
+    // A24 only receives the placeholder (1 : 0); the flag above marks it as not computed
+    ec_point_init(&E->A24);
+}
+
+void
+select_point(ec_point_t *Q, const ec_point_t *P1, const ec_point_t *P2, const digit_t option)
+{ // Constant-time selection, applied to each coordinate through fp2_select:
+  // option = 0 gives Q <- P1, option = 0xFF...FF gives Q <- P2
+    fp2_select(&Q->z, &P1->z, &P2->z, option);
+    fp2_select(&Q->x, &P1->x, &P2->x, option);
+}
+
+void
+cswap_points(ec_point_t *P, ec_point_t *Q, const digit_t option)
+{ // Constant-time conditional swap, applied to each coordinate through fp2_cswap:
+  // option = 0 leaves P and Q as they are, option = 0xFF...FF exchanges them
+    fp2_cswap(&P->z, &Q->z, option);
+    fp2_cswap(&P->x, &Q->x, option);
+}
+
+void
+ec_normalize_point(ec_point_t *P)
+{ // Rescale P in place from (X : Z) to (X/Z : 1)
+    fp2_inv(&P->z);               // Z <- 1/Z
+    fp2_mul(&P->x, &P->x, &P->z); // X <- X * (1/Z)
+    fp2_set_one(&(P->z));         // Z <- 1
+}
+
+void
+ec_normalize_curve(ec_curve_t *E)
+{ // Rescale the coefficients in place from (A : C) to (A/C : 1); E->A24 and its flag are not touched
+    fp2_inv(&E->C);               // C <- 1/C
+    fp2_mul(&E->A, &E->A, &E->C); // A <- A * (1/C)
+    fp2_set_one(&E->C);           // C <- 1
+}
+
+void
+ec_curve_normalize_A24(ec_curve_t *E)
+{ // Make sure E->A24 is computed and normalized, doing the work only when the flag says it is missing
+    if (E->is_A24_computed_and_normalized == false) {
+        AC_to_A24(&(E->A24), E);
+        ec_normalize_point(&(E->A24));
+        E->is_A24_computed_and_normalized = true;
+    }
+    assert(fp2_is_one(&(E->A24.z)));
+}
+
+void
+ec_normalize_curve_and_A24(ec_curve_t *E)
+{ // Bring E to the form (A/C : 1) and make sure A24 = ((A + 2)/4 : 1) is cached.
+  // Neither the curve nor A24 is assumed to be normalized on entry.
+    if (!fp2_is_one(&E->C)) {
+        ec_normalize_curve(E);
+    }
+
+    // A24 is already flagged as computed and normalized: nothing more to do
+    if (E->is_A24_computed_and_normalized) {
+        return;
+    }
+
+    fp2_add_one(&E->A24.x, &E->A);     // re(A24.x) = re(A) + 1
+    fp2_add_one(&E->A24.x, &E->A24.x); // re(A24.x) = re(A) + 2
+    fp_copy(&E->A24.x.im, &E->A.im);   // im(A24.x) = im(A)
+    fp2_half(&E->A24.x, &E->A24.x);    // (A + 2) / 2
+    fp2_half(&E->A24.x, &E->A24.x);    // (A + 2) / 4
+    fp2_set_one(&E->A24.z);
+    E->is_A24_computed_and_normalized = true;
+}
+
+uint32_t
+ec_is_zero(const ec_point_t *P)
+{ // P is treated as the point at infinity when Z = 0; the fp2_is_zero result is passed through unchanged
+    return fp2_is_zero(&(P->z));
+}
+
+uint32_t
+ec_has_zero_coordinate(const ec_point_t *P)
+{ // Bitwise OR of the two fp2_is_zero results: non-zero when X = 0 or Z = 0
+    return fp2_is_zero(&(P->z)) | fp2_is_zero(&(P->x));
+}
+
+uint32_t
+ec_is_equal(const ec_point_t *P, const ec_point_t *Q)
+{ // Branch-free equality test for two points in Montgomery coordinates (X:Z)
+  // Returns 0xFFFFFFFF (true) if P=Q, 0 (false) otherwise
+    fp2_t t0, t1;
+
+    // Cross-multiply so that no inversion is needed: PX * QZ versus QX * PZ
+    fp2_mul(&t0, &P->x, &Q->z);
+    fp2_mul(&t1, &P->z, &Q->x);
+
+    // Stretch each helper result (any non-zero value counts as true) to a
+    // full-width mask without branching, so the result is exactly 0 or 0xFFFFFFFF
+    uint32_t l_zero = ec_is_zero(P), r_zero = ec_is_zero(Q), lr_equal = fp2_is_equal(&t0, &t1);
+    l_zero = (uint32_t)0 - ((l_zero | ((uint32_t)0 - l_zero)) >> 31);
+    r_zero = (uint32_t)0 - ((r_zero | ((uint32_t)0 - r_zero)) >> 31);
+    lr_equal = (uint32_t)0 - ((lr_equal | ((uint32_t)0 - lr_equal)) >> 31);
+
+    // Equal iff both are zero, or neither is zero AND the cross products match (AND, not multiply)
+    return (l_zero & r_zero) | (~l_zero & ~r_zero & lr_equal);
+}
+
+uint32_t
+ec_is_two_torsion(const ec_point_t *P, const ec_curve_t *E)
+{ // Non-zero when P = (X:Z) has X = 0 or C*X^2 + A*X*Z + C*Z^2 = 0; the point at infinity gives 0
+    fp2_t acc, aux, cross; // acc starts as (X + Z)^2 and ends as the quadratic form
+
+    if (ec_is_zero(P)) {
+        return 0;
+    }
+
+    fp2_add(&acc, &P->x, &P->z);
+    fp2_sqr(&acc, &acc);             // (X + Z)^2
+    fp2_sub(&aux, &P->x, &P->z);
+    fp2_sqr(&aux, &aux);             // (X - Z)^2
+    fp2_sub(&cross, &acc, &aux);     // 4*X*Z
+    fp2_add(&aux, &acc, &aux);       // 2*(X^2 + Z^2)
+    fp2_mul(&cross, &cross, &E->A);  // 4*A*X*Z
+    fp2_mul(&aux, &aux, &E->C);      // 2*C*(X^2 + Z^2)
+    fp2_add(&aux, &aux, &aux);       // 4*C*(X^2 + Z^2)
+    fp2_add(&acc, &aux, &cross);     // 4 (CX^2+CZ^2+AXZ)
+
+    // two torsion if x or x^2 + Ax + 1 is zero
+    const uint32_t x_is_zero = fp2_is_zero(&P->x);
+    const uint32_t tmp_is_zero = fp2_is_zero(&acc);
+    return x_is_zero | tmp_is_zero;
+}
+
+uint32_t
+ec_is_four_torsion(const ec_point_t *P, const ec_curve_t *E)
+{ // Doubles P once and returns what ec_is_two_torsion reports for the result
+    ec_point_t doubled;
+    xDBL_A24(&doubled, P, &(E->A24), E->is_A24_computed_and_normalized);
+    return ec_is_two_torsion(&doubled, E);
+}
+
+uint32_t
+ec_is_basis_four_torsion(const ec_basis_t *B, const ec_curve_t *E)
+{ // Non-zero when [2]P and [2]Q both pass ec_is_two_torsion and differ; the ~ relies on ec_is_equal giving an all-ones mask
+    ec_point_t dblP, dblQ;
+    xDBL_A24(&dblQ, &B->Q, &E->A24, E->is_A24_computed_and_normalized);
+    xDBL_A24(&dblP, &B->P, &E->A24, E->is_A24_computed_and_normalized);
+    return (~ec_is_equal(&dblP, &dblQ) & ec_is_two_torsion(&dblP, E) & ec_is_two_torsion(&dblQ, E));
+}
+
+int
+ec_curve_verify_A(const fp2_t *A)
+{ // Verify the Montgomery coefficient A is valid (A^2-4 \ne 0), i.e. A is neither 2 nor -2
+  // Return 1 if curve is valid, 0 otherwise
+    fp2_t plus_two, minus_two;
+
+    fp2_set_one(&plus_two);
+    fp_add(&plus_two.re, &plus_two.re, &plus_two.re); // real part 1 + 1 = 2
+    fp2_copy(&minus_two, &plus_two);
+    fp_neg(&minus_two.re, &minus_two.re);             // real part -2
+
+    // The second comparison is skipped when the first one already matches
+    return (fp2_is_equal(A, &plus_two) || fp2_is_equal(A, &minus_two)) ? 0 : 1;
+}
+
+int
+ec_curve_init_from_A(ec_curve_t *E, const fp2_t *A)
+{ // Reset E, install the coefficient A, then report whether A passes ec_curve_verify_A
+  // Return 1 if curve is valid, 0 otherwise (E is written in both cases)
+    ec_curve_init(E);
+    fp2_copy(&(E->A), A);
+    return ec_curve_verify_A(A);
+}
+
+void
+ec_j_inv(fp2_t *j_inv, const ec_curve_t *curve)
+{ // j-invariant from the projective Montgomery coefficient (A : C): j = 256*(A^2 - 3C^2)^3 / (C^4*(A^2 - 4C^2))
+    fp2_t t0, t1;
+
+    fp2_sqr(&t1, &curve->C);     // t1 = C^2
+    fp2_sqr(j_inv, &curve->A);   // j  = A^2
+    fp2_add(&t0, &t1, &t1);      // t0 = 2C^2
+    fp2_sub(&t0, j_inv, &t0);    // t0 = A^2 - 2C^2
+    fp2_sub(&t0, &t0, &t1);      // t0 = A^2 - 3C^2
+    fp2_sub(j_inv, &t0, &t1);    // j  = A^2 - 4C^2
+    fp2_sqr(&t1, &t1);           // t1 = C^4
+    fp2_mul(j_inv, j_inv, &t1);  // j  = C^4*(A^2 - 4C^2), the denominator
+    fp2_add(&t0, &t0, &t0);      // t0 = 2*(A^2 - 3C^2)
+    fp2_add(&t0, &t0, &t0);      // t0 = 4*(A^2 - 3C^2)
+    fp2_sqr(&t1, &t0);           // t1 = 16*(A^2 - 3C^2)^2
+    fp2_mul(&t0, &t0, &t1);      // t0 = 64*(A^2 - 3C^2)^3
+    fp2_add(&t0, &t0, &t0);      // t0 = 128*(A^2 - 3C^2)^3
+    fp2_add(&t0, &t0, &t0);      // t0 = 256*(A^2 - 3C^2)^3, the numerator
+    fp2_inv(j_inv);              // j  = 1 / denominator
+    fp2_mul(j_inv, &t0, j_inv);  // j  = numerator / denominator
+}
+
+void
+xDBL_E0(ec_point_t *Q, const ec_point_t *P)
+{ // Doubling of a Montgomery point in projective coordinates (X:Z) on the curve E0 with (A:C) = (0:1).
+  // Input: projective Montgomery x-coordinates P = (XP:ZP), where xP=XP/ZP, and Montgomery curve constants (A:C) = (0:1).
+  // Output: projective Montgomery x-coordinates Q <- 2*P = (XQ:ZQ) such that x(2P)=XQ/ZQ.
+    fp2_t t0, t1, t2;
+
+    fp2_add(&t0, &P->x, &P->z);
+    fp2_sqr(&t0, &t0);          // t0 = (XP + ZP)^2
+    fp2_sub(&t1, &P->x, &P->z);
+    fp2_sqr(&t1, &t1);          // t1 = (XP - ZP)^2
+    fp2_sub(&t2, &t0, &t1);     // t2 = 4*XP*ZP
+    fp2_add(&t1, &t1, &t1);     // t1 = 2*(XP - ZP)^2
+    fp2_mul(&Q->x, &t0, &t1);   // XQ = 2*(XP + ZP)^2*(XP - ZP)^2
+    fp2_add(&Q->z, &t1, &t2);   // ZQ = 2*(XP - ZP)^2 + 4*XP*ZP
+    fp2_mul(&Q->z, &Q->z, &t2); // ZQ = 4*XP*ZP*(2*(XP - ZP)^2 + 4*XP*ZP)
+}
+
+void
+xDBL(ec_point_t *Q, const ec_point_t *P, const ec_point_t *AC)
+{ // Doubling of a Montgomery point in projective coordinates (X:Z). Computation of coefficient values A+2C and 4C
+  // on-the-fly.
+  // Input: projective Montgomery x-coordinates P = (XP:ZP), where xP=XP/ZP, and Montgomery curve constants (A:C).
+  // Output: projective Montgomery x-coordinates Q <- 2*P = (XQ:ZQ) such that x(2P)=XQ/ZQ.
+    fp2_t t0, t1, t2, t3;
+
+    fp2_add(&t0, &P->x, &P->z);
+    fp2_sqr(&t0, &t0);            // t0 = (XP + ZP)^2
+    fp2_sub(&t1, &P->x, &P->z);
+    fp2_sqr(&t1, &t1);            // t1 = (XP - ZP)^2
+    fp2_sub(&t2, &t0, &t1);       // t2 = 4*XP*ZP
+    fp2_add(&t3, &AC->z, &AC->z); // t3 = 2C (AC->x holds A, AC->z holds C)
+    fp2_mul(&t1, &t1, &t3);       // t1 = 2C*(XP - ZP)^2
+    fp2_add(&t1, &t1, &t1);       // t1 = 4C*(XP - ZP)^2
+    fp2_mul(&Q->x, &t0, &t1);     // XQ = 4C*(XP + ZP)^2*(XP - ZP)^2
+    fp2_add(&t0, &t3, &AC->x);    // t0 = A + 2C
+    fp2_mul(&t0, &t0, &t2);       // t0 = (A + 2C)*4*XP*ZP
+    fp2_add(&t0, &t0, &t1);       // t0 = (A + 2C)*4*XP*ZP + 4C*(XP - ZP)^2
+    fp2_mul(&Q->z, &t0, &t2);     // ZQ = 4*XP*ZP*t0
+}
+
+void
+xDBL_A24(ec_point_t *Q, const ec_point_t *P, const ec_point_t *A24, const bool A24_normalized)
+{ // Doubling of a Montgomery point in projective coordinates (X:Z).
+  // Input: projective Montgomery x-coordinates P = (XP:ZP), where xP=XP/ZP, and
+  //        the Montgomery curve constants A24 = (A+2C:4C) (or A24 = (A+2C/4C:1) if normalized).
+  // Output: projective Montgomery x-coordinates Q <- 2*P = (XQ:ZQ) such that x(2P)=XQ/ZQ.
+    fp2_t t0, t1, t2;
+
+    fp2_add(&t0, &P->x, &P->z);
+    fp2_sqr(&t0, &t0);              // t0 = (XP + ZP)^2
+    fp2_sub(&t1, &P->x, &P->z);
+    fp2_sqr(&t1, &t1);              // t1 = (XP - ZP)^2
+    fp2_sub(&t2, &t0, &t1);         // t2 = 4*XP*ZP
+    if (!A24_normalized)
+        fp2_mul(&t1, &t1, &A24->z); // multiplication by A24.z is skipped when the caller says it is 1
+    fp2_mul(&Q->x, &t0, &t1);       // XQ = (XP + ZP)^2 * t1
+    fp2_mul(&t0, &t2, &A24->x);     // t0 = A24.x * 4*XP*ZP
+    fp2_add(&t0, &t0, &t1);         // t0 = A24.x * 4*XP*ZP + t1
+    fp2_mul(&Q->z, &t0, &t2);       // ZQ = 4*XP*ZP * t0
+}
+
+void
+xADD(ec_point_t *R, const ec_point_t *P, const ec_point_t *Q, const ec_point_t *PQ)
+{ // x-only differential addition of Montgomery points in projective coordinates (X:Z).
+  // Input: P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ, and their difference
+  //        PQ=P-Q=(XPQ:ZPQ).
+  // Output: R <- P+Q = (XR:ZR). R is only written by the last two statements, after every input was read.
+    fp2_t u, v, s, d;
+
+    fp2_add(&s, &Q->x, &Q->z);
+    fp2_sub(&d, &Q->x, &Q->z);
+    fp2_add(&u, &P->x, &P->z);
+    fp2_sub(&v, &P->x, &P->z);
+    fp2_mul(&u, &u, &d); // (XP + ZP)(XQ - ZQ)
+    fp2_mul(&v, &v, &s); // (XP - ZP)(XQ + ZQ)
+    fp2_add(&s, &u, &v);
+    fp2_sub(&d, &u, &v);
+    fp2_sqr(&s, &s);
+    fp2_sqr(&d, &d);
+    fp2_mul(&s, &PQ->z, &s); // XR, parked in a temporary until PQ->x has been consumed
+    fp2_mul(&R->z, &PQ->x, &d);
+    fp2_copy(&R->x, &s);
+}
+
+void
+xDBLADD(ec_point_t *R,
+        ec_point_t *S,
+        const ec_point_t *P,
+        const ec_point_t *Q,
+        const ec_point_t *PQ,
+        const ec_point_t *A24,
+        const bool A24_normalized)
+{ // Simultaneous doubling and differential addition.
+  // Input:  projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ, the difference
+  //         PQ=P-Q=(XPQ:ZPQ), and the Montgomery curve constants A24 = (A+2C:4C) (or A24 = (A+2C/4C:1) if normalized).
+  // Output: projective Montgomery points R <- 2*P = (XR:ZR) such that x(2P)=XR/ZR, and S <- P+Q = (XS:ZS) such that
+  //         x(Q+P)=XS/ZS. The statement order lets R alias P and S alias Q, which is how xMUL calls this function.
+    fp2_t t0, t1, t2;
+
+    fp2_add(&t0, &P->x, &P->z); // t0 = XP + ZP
+    fp2_sub(&t1, &P->x, &P->z); // t1 = XP - ZP (last read of P)
+    fp2_sqr(&R->x, &t0); // R.x = (XP + ZP)^2
+    fp2_sub(&t2, &Q->x, &Q->z); // t2 = XQ - ZQ
+    fp2_add(&S->x, &Q->x, &Q->z); // S.x = XQ + ZQ (last read of Q)
+    fp2_mul(&t0, &t0, &t2); // t0 = (XP + ZP)(XQ - ZQ)
+    fp2_sqr(&R->z, &t1); // R.z = (XP - ZP)^2
+    fp2_mul(&t1, &t1, &S->x); // t1 = (XP - ZP)(XQ + ZQ)
+    fp2_sub(&t2, &R->x, &R->z); // t2 = 4*XP*ZP
+    if (!A24_normalized)
+        fp2_mul(&R->z, &R->z, &A24->z); // apply the 4C factor; skipped when A24.z is known to be 1
+    fp2_mul(&R->x, &R->x, &R->z); // XR is final here
+    fp2_mul(&S->x, &A24->x, &t2); // S.x is borrowed as scratch space
+    fp2_sub(&S->z, &t0, &t1);
+    fp2_add(&R->z, &R->z, &S->x);
+    fp2_add(&S->x, &t0, &t1);
+    fp2_mul(&R->z, &R->z, &t2); // ZR is final here
+    fp2_sqr(&S->z, &S->z);
+    fp2_sqr(&S->x, &S->x);
+    fp2_mul(&S->z, &S->z, &PQ->x); // ZS = XPQ * (t0 - t1)^2
+    fp2_mul(&S->x, &S->x, &PQ->z); // XS = ZPQ * (t0 + t1)^2
+}
+
+void
+xMUL(ec_point_t *Q, const ec_point_t *P, const digit_t *k, const int kbits, const ec_curve_t *curve)
+{ // The Montgomery ladder
+  // Input: projective Montgomery point P=(XP:ZP) such that xP=XP/ZP, a scalar k of bitlength kbits, and the
+  //        curve: its cached A24 = (A+2C/4C:1) is used when flagged as normalized, else (A+2C:4C) is derived here.
+  // Output: projective Montgomery points Q <- k*P = (XQ:ZQ) such that x(k*P)=XQ/ZQ.
+    ec_point_t R0, R1, A24;
+    digit_t mask;
+    unsigned int bit, prevbit = 0, swap;
+
+    if (!curve->is_A24_computed_and_normalized) {
+        // Computation of A24=(A+2C:4C); note that its z-coordinate is 4C, not 1
+        fp2_add(&A24.x, &curve->C, &curve->C);
+        fp2_add(&A24.z, &A24.x, &A24.x);
+        fp2_add(&A24.x, &A24.x, &curve->A);
+    } else {
+        fp2_copy(&A24.x, &curve->A24.x);
+        fp2_copy(&A24.z, &curve->A24.z);
+        // Assert A24 has been normalised
+        assert(fp2_is_one(&A24.z));
+    }
+
+    // R0 <- (1:0), R1 <- P
+    ec_point_init(&R0);
+    fp2_copy(&R1.x, &P->x);
+    fp2_copy(&R1.z, &P->z);
+
+    // Main loop; xDBLADD may only skip the multiplication by A24.z when A24 really is normalized
+    for (int i = kbits - 1; i >= 0; i--) {
+        bit = (k[i >> LOG2RADIX] >> (i & (RADIX - 1))) & 1;
+        swap = bit ^ prevbit;
+        prevbit = bit;
+        mask = 0 - (digit_t)swap;
+
+        cswap_points(&R0, &R1, mask);
+        xDBLADD(&R0, &R1, &R0, &R1, P, &A24, curve->is_A24_computed_and_normalized);
+    }
+    swap = 0 ^ prevbit; // undo the last pending swap so that R0 holds the result
+    mask = 0 - (digit_t)swap;
+    cswap_points(&R0, &R1, mask);
+
+    fp2_copy(&Q->x, &R0.x);
+    fp2_copy(&Q->z, &R0.z);
+}
+
+int
+xDBLMUL(ec_point_t *S,
+        const ec_point_t *P,
+        const digit_t *k,
+        const ec_point_t *Q,
+        const digit_t *l,
+        const ec_point_t *PQ,
+        const int kbits,
+        const ec_curve_t *curve)
+{ // The Montgomery biladder
+  // Input:  projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ, scalars k and l of
+  //         bitlength kbits (NWORDS_ORDER digits each), the difference PQ=P-Q=(XPQ:ZPQ), and the curve constants.
+  // Output: projective Montgomery point S <- k*P + l*Q = (XS:ZS) such that x(k*P + l*Q)=XS/ZS.
+  // Returns 1 on success, 0 if an input or P+Q has a zero coordinate. kbits must not exceed BITS (not checked here).
+    int i, A_is_zero;
+    digit_t evens, mevens, bitk0, bitl0, maskk, maskl, temp, bs1_ip1, bs2_ip1, bs1_i, bs2_i, h;
+    digit_t sigma[2] = { 0 }, pre_sigma = 0;
+    digit_t k_t[NWORDS_ORDER], l_t[NWORDS_ORDER], one[NWORDS_ORDER] = { 0 }, r[2 * BITS] = { 0 }; // r: two recoded digits per bit
+    ec_point_t DIFF1a, DIFF1b, DIFF2a, DIFF2b, R[3] = { 0 }, T[3];
+
+    // differential additions formulas are invalid in this case
+    if (ec_has_zero_coordinate(P) | ec_has_zero_coordinate(Q) | ec_has_zero_coordinate(PQ))
+        return 0;
+
+    // Derive sigma according to parity
+    bitk0 = (k[0] & 1);
+    bitl0 = (l[0] & 1);
+    maskk = 0 - bitk0; // Parity masks: 0 if even, otherwise 1...1
+    maskl = 0 - bitl0;
+    sigma[0] = (bitk0 ^ 1);
+    sigma[1] = (bitl0 ^ 1);
+    evens = sigma[0] + sigma[1]; // Count number of even scalars
+    mevens = 0 - (evens & 1);    // Mask mevens <- 0 if # even of scalars = 0 or 2, otherwise mevens = 1...1
+
+    // If k and l are both even or both odd, pick sigma = (0,1)
+    sigma[0] = (sigma[0] & mevens);
+    sigma[1] = (sigma[1] & mevens) | (1 & ~mevens);
+
+    // Convert even scalars to odd: k-1 and l-1 are always computed, then the original is selected by the parity mask
+    one[0] = 1;
+    mp_sub(k_t, k, one, NWORDS_ORDER);
+    mp_sub(l_t, l, one, NWORDS_ORDER);
+    select_ct(k_t, k_t, k, maskk, NWORDS_ORDER);
+    select_ct(l_t, l_t, l, maskl, NWORDS_ORDER);
+
+    // Scalar recoding: fills r[2*i] and r[2*i + 1] for i = 0 .. kbits-1
+    for (i = 0; i < kbits; i++) {
+        // Swap k_t and l_t whenever sigma[0] differs from its value in the previous iteration
+        maskk = 0 - (sigma[0] ^ pre_sigma);
+        swap_ct(k_t, l_t, maskk, NWORDS_ORDER);
+
+        if (i == kbits - 1) {
+            bs1_ip1 = 0;
+            bs2_ip1 = 0;
+        } else {
+            bs1_ip1 = mp_shiftr(k_t, 1, NWORDS_ORDER);
+            bs2_ip1 = mp_shiftr(l_t, 1, NWORDS_ORDER);
+        }
+        bs1_i = k_t[0] & 1;
+        bs2_i = l_t[0] & 1;
+
+        r[2 * i] = bs1_i ^ bs1_ip1;
+        r[2 * i + 1] = bs2_i ^ bs2_ip1;
+
+        // Revert sigma if second bit, r_(2i+1), is 1
+        pre_sigma = sigma[0];
+        maskk = 0 - r[2 * i + 1];
+        select_ct(&temp, &sigma[0], &sigma[1], maskk, 1);
+        select_ct(&sigma[1], &sigma[1], &sigma[0], maskk, 1);
+        sigma[0] = temp;
+    }
+
+    // Point initialization: R[0] <- identity, R[1] and R[2] <- P and Q in the order dictated by sigma[0]
+    ec_point_init(&R[0]);
+    maskk = 0 - sigma[0];
+    select_point(&R[1], P, Q, maskk);
+    select_point(&R[2], Q, P, maskk);
+
+    fp2_copy(&DIFF1a.x, &R[1].x);
+    fp2_copy(&DIFF1a.z, &R[1].z);
+    fp2_copy(&DIFF1b.x, &R[2].x);
+    fp2_copy(&DIFF1b.z, &R[2].z);
+
+    // Initialize DIFF2a <- P+Q, DIFF2b <- P-Q
+    xADD(&R[2], &R[1], &R[2], PQ);
+    if (ec_has_zero_coordinate(&R[2]))
+        return 0; // non valid formulas
+
+    fp2_copy(&DIFF2a.x, &R[2].x);
+    fp2_copy(&DIFF2a.z, &R[2].z);
+    fp2_copy(&DIFF2b.x, &PQ->x);
+    fp2_copy(&DIFF2b.z, &PQ->z);
+
+    A_is_zero = fp2_is_zero(&curve->A); // decides once which doubling formula the main loop uses
+
+    // Main loop: one doubling and two differential additions per recoded digit pair, most significant first
+    for (i = kbits - 1; i >= 0; i--) {
+        h = r[2 * i] + r[2 * i + 1]; // in {0, 1, 2}
+        maskk = 0 - (h & 1);
+        select_point(&T[0], &R[0], &R[1], maskk);
+        maskk = 0 - (h >> 1);
+        select_point(&T[0], &T[0], &R[2], maskk);
+        if (A_is_zero) {
+            xDBL_E0(&T[0], &T[0]);
+        } else {
+            assert(fp2_is_one(&curve->A24.z)); // the caller must supply a normalized A24 when A is not zero
+            xDBL_A24(&T[0], &T[0], &curve->A24, true);
+        }
+
+        maskk = 0 - r[2 * i + 1]; // in {0, 1}
+        select_point(&T[1], &R[0], &R[1], maskk);
+        select_point(&T[2], &R[1], &R[2], maskk);
+
+        cswap_points(&DIFF1a, &DIFF1b, maskk);
+        xADD(&T[1], &T[1], &T[2], &DIFF1a);
+        xADD(&T[2], &R[0], &R[2], &DIFF2a);
+
+        // If hw (mod 2) = 1 then swap DIFF2a and DIFF2b
+        maskk = 0 - (h & 1);
+        cswap_points(&DIFF2a, &DIFF2b, maskk);
+
+        // R <- T
+        copy_point(&R[0], &T[0]);
+        copy_point(&R[1], &T[1]);
+        copy_point(&R[2], &T[2]);
+    }
+
+    // Output R[evens]
+    select_point(S, &R[0], &R[1], mevens);
+
+    maskk = 0 - (bitk0 & bitl0); // all ones only when both input scalars are odd
+    select_point(S, S, &R[2], maskk);
+    return 1;
+}
+
+int
+ec_ladder3pt(ec_point_t *R,
+             const digit_t *m,
+             const ec_point_t *P,
+             const ec_point_t *Q,
+             const ec_point_t *PQ,
+             const ec_curve_t *E)
+{ // The 3-point Montgomery ladder
+  // Input:  P=(XP:ZP), Q=(XQ:ZQ), their difference PQ=P-Q=(XPQ:ZPQ), a scalar m of NWORDS_ORDER digits
+  //         (all NWORDS_ORDER*RADIX bits are processed), and a curve E carrying A24 = (A+2C/4C:1).
+  // Output: R <- P + m*Q = (XR:ZR) such that x(P + m*Q)=XR/ZR. Returns 1 on success, 0 on rejected input.
+    ec_point_t X0, X1, X2;
+
+    assert(E->is_A24_computed_and_normalized);
+    // Reject a non-normalized A24, and a difference with a zero coordinate (formulas are not valid then)
+    if (!fp2_is_one(&E->A24.z) || ec_has_zero_coordinate(PQ)) {
+        return 0;
+    }
+
+    copy_point(&X0, Q);
+    copy_point(&X1, P);
+    copy_point(&X2, PQ);
+
+    // Walk through m from its least significant bit upwards, one digit at a time
+    for (int word = 0; word < NWORDS_ORDER; word++) {
+        digit_t bit_sel = 1;
+        for (int b = 0; b < RADIX; b++, bit_sel <<= 1) {
+            // -1 when the current bit of m is clear, 0 when it is set
+            const int bit_clear = -((bit_sel & m[word]) == 0);
+
+            cswap_points(&X1, &X2, bit_clear);
+            xDBLADD(&X0, &X1, &X0, &X1, &X2, &E->A24, true);
+            cswap_points(&X1, &X2, bit_clear);
+        }
+    }
+
+    copy_point(R, &X1);
+    return 1;
+}
+
+// WRAPPERS to export
+
+void
+ec_dbl(ec_point_t *res, const ec_point_t *P, const ec_curve_t *curve)
+{ // res <- 2*P on curve, using the cheaper formula whenever a normalized A24 is available.
+    if (!curve->is_A24_computed_and_normalized) {
+        // No normalized A24: xDBL derives A+2C and 4C itself, reading (A:C) through the head of *curve
+        xDBL(res, P, (const ec_point_t *)curve);
+        return;
+    }
+    // If A24 = ((A+2)/4 : 1) we save multiplications
+    assert(fp2_is_one(&curve->A24.z));
+    xDBL_A24(res, P, &curve->A24, true);
+}
+
+void
+ec_dbl_iter(ec_point_t *res, int n, const ec_point_t *P, ec_curve_t *curve)
+{ // res <- [2^n]P. n == 0 copies P; a negative n still performs a single doubling.
+  // For n > 50 this calls ec_curve_normalize_A24(curve), so the caller's curve may be modified.
+    const ec_point_t *src = P; // input of the next doubling: P first, res afterwards
+    int done = 0;
+
+    if (n == 0) {
+        copy_point(res, P);
+        return;
+    }
+
+    // When the chain is long enough, we should normalise A24
+    if (n > 50) {
+        ec_curve_normalize_A24(curve);
+    }
+    if (curve->is_A24_computed_and_normalized) {
+        do { // a normalized A24 saves some multiplications per doubling
+            assert(fp2_is_one(&curve->A24.z));
+            xDBL_A24(res, src, &curve->A24, true);
+            src = res;
+        } while (++done < n);
+    } else {
+        do { // otherwise plain doubling straight from (A:C)
+            xDBL(res, src, (const ec_point_t *)curve);
+            src = res;
+        } while (++done < n);
+    }
+}
+
+void
+ec_dbl_iter_basis(ec_basis_t *res, int n, const ec_basis_t *B, ec_curve_t *curve)
+{ // Apply ec_dbl_iter with the same n to P, Q and P-Q of B in turn, writing the results into res
+    const ec_point_t *src[3] = { &B->P, &B->Q, &B->PmQ };
+    ec_point_t *dst[3] = { &res->P, &res->Q, &res->PmQ };
+    for (int i = 0; i < 3; i++) { ec_dbl_iter(dst[i], n, src[i], curve); }
+}
+
+void
+ec_mul(ec_point_t *res, const digit_t *scalar, const int kbits, const ec_point_t *P, ec_curve_t *curve)
+{ // res <- scalar*P for a scalar of kbits bits; a thin wrapper around the Montgomery ladder xMUL.
+    const bool long_scalar = (kbits > 50);
+
+    // For scalars longer than 50 bits it is worth normalising A24 first (this modifies *curve)
+    if (long_scalar) {
+        ec_curve_normalize_A24(curve);
+    }
+    xMUL(res, P, scalar, kbits, curve);
+}
+
+int
+ec_biscalar_mul(ec_point_t *res,
+                const digit_t *scalarP,
+                const digit_t *scalarQ,
+                const int kbits,
+                const ec_basis_t *PQ,
+                const ec_curve_t *curve)
+{ // res <- scalarP*P + scalarQ*Q for the basis PQ = (P, Q, P-Q). Returns 1 on success and 0 on error.
+    ec_curve_t E;
+
+    // A difference point with Z = 0 is rejected outright
+    if (fp2_is_zero(&PQ->PmQ.z)) {
+        return 0;
+    }
+
+    if (kbits != 1) {
+        // General case: normalize A24 on a private copy so that the caller's const curve stays untouched
+        copy_curve(&E, curve);
+        if (!fp2_is_zero(&curve->A)) { // If A is not zero normalize
+            ec_curve_normalize_A24(&E);
+        }
+        return xDBLMUL(res, &PQ->P, scalarP, &PQ->Q, scalarQ, &PQ->PmQ, kbits, (const ec_curve_t *)&E);
+    }
+
+    /* Differential additions behave badly when PmQ = (0:1). Since P, Q are assumed
+     * to form a basis, this can happen only if kbits == 1, so that case is answered
+     * directly from the lowest bit of each scalar. */
+    // Sanity check: our basis should be given by 2-torsion points
+    if (!(ec_is_two_torsion(&PQ->P, curve) && ec_is_two_torsion(&PQ->Q, curve) &&
+          ec_is_two_torsion(&PQ->PmQ, curve))) {
+        return 0;
+    }
+
+    switch ((scalarP[0] & 1) | ((scalarQ[0] & 1) << 1)) {
+        case 0: ec_point_init(res); break;        // neither bit set: (1: 0)
+        case 1: copy_point(res, &PQ->P); break;   // P only
+        case 2: copy_point(res, &PQ->Q); break;   // Q only
+        case 3: copy_point(res, &PQ->PmQ); break; // both bits set: the stored third point
+        default: // should never happen
+            assert(0);
+    }
+    return 1;
+}
diff --git a/src/pqm4/sqisign_lvl5/ref/ec.h b/src/pqm4/sqisign_lvl5/ref/ec.h
new file mode 100644
index 0000000..ee2be38
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/ec.h
@@ -0,0 +1,668 @@
+﻿/** @file
+ *
+ * @authors Luca De Feo, Francisco RH
+ *
+ * @brief Elliptic curve stuff
+ */
+
+#ifndef EC_H
+#define EC_H
+#include <sqisign_namespace.h>
+#include <ec_params.h>
+#include <fp2.h>
+#include <tools.h>
+#include <stdio.h>
+
+/** @defgroup ec Elliptic curves
+ * @{
+ */
+
+/** @defgroup ec_t Data structures
+ * @{
+ */
+
+/** @brief Projective point on the Kummer line E/pm 1 in Montgomery coordinates
+ *
+ * @typedef ec_point_t
+ *
+ * @struct ec_point_t
+ *
+ * An x-only projective point (X:Z): the affine x-coordinate is X/Z, and Z = 0 denotes the point at infinity.
+ */
+typedef struct ec_point_t
+{
+    fp2_t x; ///< X coordinate (numerator)
+    fp2_t z; ///< Z coordinate (denominator)
+} ec_point_t;
+
+/** @brief Point in Jacobian coordinates on a Montgomery curve
+ *
+ * @typedef jac_point_t
+ *
+ * @struct jac_point_t
+ *
+ * A point (X:Y:Z); jac_is_equal() compares X/Z^2 and Y/Z^3, and jac_init() sets the identity (0:1:0)
+ */
+typedef struct jac_point_t
+{
+    fp2_t x; ///< X coordinate
+    fp2_t y; ///< Y coordinate
+    fp2_t z; ///< Z coordinate; zero for the identity
+} jac_point_t;
+
+/** @brief Addition components
+ *
+ * @typedef add_components_t
+ *
+ * @struct add_components_t
+ *
+ * 3 components u,v,w that define the (X:Z) coordinates of both
+ * addition and subtraction of two distinct points with
+ * P+Q = (u-v : w) and P-Q = (u+v : w)
+ */
+typedef struct add_components_t
+{
+    fp2_t u; ///< common part of both X-coordinates
+    fp2_t v; ///< part subtracted for P+Q and added for P-Q
+    fp2_t w; ///< shared Z-coordinate
+} add_components_t;
+
+/** @brief A basis of a torsion subgroup
+ *
+ * @typedef ec_basis_t
+ *
+ * @struct ec_basis_t
+ *
+ * A triple of x-only points P, Q and P-Q describing a basis (P, Q) of a torsion subgroup.
+ */
+typedef struct ec_basis_t
+{
+    ec_point_t P;   ///< first basis point
+    ec_point_t Q;   ///< second basis point
+    ec_point_t PmQ; ///< the difference P - Q, needed by the differential addition formulas
+} ec_basis_t;
+
+/** @brief An elliptic curve
+ *
+ * @typedef ec_curve_t
+ *
+ * @struct ec_curve_t
+ *
+ * An elliptic curve in projective Montgomery form (A : C). NOTE(review): ec.c passes `(const ec_point_t *)curve`
+ * to xDBL, which appears to rely on A and C being the first two members -- confirm before reordering fields.
+ */
+typedef struct ec_curve_t
+{
+    fp2_t A;
+    fp2_t C;                             ///< cannot be 0
+    ec_point_t A24;                      // the point (A+2C : 4C), stored as ((A+2C)/4C : 1) once normalized
+    bool is_A24_computed_and_normalized; // says if A24 has been computed and normalized
+} ec_curve_t;
+
+/** @brief An isogeny whose degree is a power of 2, described by its domain curve and a kernel generator
+ *
+ * @typedef ec_isog_even_t
+ *
+ * @struct ec_isog_even_t
+ */
+typedef struct ec_isog_even_t
+{
+    ec_curve_t curve;  ///< The domain curve
+    ec_point_t kernel; ///< A kernel generator
+    unsigned length;   ///< The length as a 2-isogeny walk
+} ec_isog_even_t;
+
+/** @brief Isomorphism of Montgomery curves
+ *
+ * @typedef ec_isom_t
+ *
+ * @struct ec_isom_t
+ *
+ * The isomorphism is given by the map (X:Z) ↦ ( (Nx X + Nz Z) : (D Z) )
+ */
+typedef struct ec_isom_t
+{
+    fp2_t Nx; ///< coefficient of X in the image numerator
+    fp2_t Nz; ///< coefficient of Z in the image numerator
+    fp2_t D;  ///< coefficient of Z in the image denominator
+} ec_isom_t;
+
+// end ec_t
+/** @}
+ */
+
+/** @defgroup ec_curve_t Curves and isomorphisms
+ * @{
+ */
+
+// Initialisation for curves and points
+void ec_curve_init(ec_curve_t *E);
+void ec_point_init(ec_point_t *P);
+
+/**
+ * @brief Verify that a Montgomery coefficient is valid
+ *
+ * @param A an fp2_t
+ *
+ * @return 0  if curve is invalid, 1 otherwise
+ */
+int ec_curve_verify_A(const fp2_t *A);
+
+/**
+ * @brief Initialize an elliptic curve from a coefficient
+ *
+ * @param A an fp2_t
+ * @param E the elliptic curve to initialize
+ *
+ * @return 0  if curve is invalid, 1 otherwise
+ */
+int ec_curve_init_from_A(ec_curve_t *E, const fp2_t *A);
+
+// Copying points, bases and curves
+static inline void
+copy_point(ec_point_t *P, const ec_point_t *Q)
+{ // P <- Q, one coordinate at a time (destination first, source second)
+    fp2_copy(&P->z, &Q->z);
+    fp2_copy(&P->x, &Q->x);
+}
+
+static inline void
+copy_basis(ec_basis_t *B1, const ec_basis_t *B0)
+{ // B1 <- B0: copies all three stored points P, Q and P-Q
+    copy_point(&B1->PmQ, &B0->PmQ);
+    copy_point(&B1->Q, &B0->Q);
+    copy_point(&B1->P, &B0->P);
+}
+
+static inline void
+copy_curve(ec_curve_t *E1, const ec_curve_t *E2)
+{ // E1 <- E2: the coefficients (A : C), the stored A24 and the flag saying whether A24 is usable
+    copy_point(&E1->A24, &E2->A24);
+    E1->is_A24_computed_and_normalized = E2->is_A24_computed_and_normalized;
+    fp2_copy(&E1->C, &E2->C);
+    fp2_copy(&E1->A, &E2->A);
+}
+
+// Functions for working with the A24 point and normalisation
+
+/**
+ * @brief Reduce (A : C) to (A/C : 1) in place
+ *
+ * @param E a curve
+ */
+void ec_normalize_curve(ec_curve_t *E);
+
+/**
+ * @brief Reduce A24 = (A + 2C : 4C) to ((A + 2C)/4C : 1) in place
+ *
+ * @param E a curve
+ */
+void ec_curve_normalize_A24(ec_curve_t *E);
+
+/**
+ * @brief Normalise both (A : C) and (A + 2C : 4C) as above, in place
+ *
+ * @param E a curve
+ */
+void ec_normalize_curve_and_A24(ec_curve_t *E);
+
+/**
+ * @brief Given a curve E, compute A24 = (A+2C : 4C)
+ *        (the stored value is copied as is when E flags it as computed and normalized)
+ *
+ * @param A24 the value (A+2C : 4C) to return into
+ * @param E a curve
+ */
+static inline void
+AC_to_A24(ec_point_t *A24, const ec_curve_t *E)
+{
+    if (!E->is_A24_computed_and_normalized) {
+        // Nothing cached: derive (A+2C : 4C) from (A : C) with three additions
+        fp2_add(&A24->z, &E->C, &E->C);     // 2C
+        fp2_add(&A24->x, &E->A, &A24->z);   // A + 2C
+        fp2_add(&A24->z, &A24->z, &A24->z); // 4C
+    } else {
+        // Reuse the value stored in the curve
+        copy_point(A24, &E->A24);
+    }
+}
+
+/**
+ * @brief Given the point A24 = (A+2C : 4C), compute the curve coefficients (A : C)
+ *
+ * @param E a curve to compute; only E->A and E->C are written here
+ * @param A24 the value (A+2C : 4C)
+ */
+static inline void
+A24_to_AC(ec_curve_t *E, const ec_point_t *A24)
+{
+    // (A:C) = (2*((A+2C)*2-4C) : 4C) = (4A : 4C)
+    fp2_add(&E->A, &A24->x, &A24->x); // 2*(A+2C)
+    fp2_sub(&E->A, &E->A, &A24->z); // 2A
+    fp2_add(&E->A, &E->A, &E->A); // 4A
+    fp2_copy(&E->C, &A24->z); // 4C
+}
+
+/**
+ * @brief j-invariant.
+ *
+ * @param j_inv computed j_invariant
+ * @param curve input curve
+ */
+void ec_j_inv(fp2_t *j_inv, const ec_curve_t *curve);
+
+/**
+ * @brief Isomorphism of elliptic curve
+ * Takes as input two isomorphic Kummer lines in Montgomery form, and outputs an isomorphism between
+ * them.
+ *
+ * @param isom computed isomorphism
+ * @param from domain curve
+ * @param to image curve
+ * @return 0xFFFFFFFF if there was an error during the computation, zero otherwise
+ */
+uint32_t ec_isomorphism(ec_isom_t *isom, const ec_curve_t *from, const ec_curve_t *to);
+
+/**
+ * @brief In-place evaluation of an isomorphism
+ *
+ * @param P a point
+ * @param isom an isomorphism
+ */
+void ec_iso_eval(ec_point_t *P, ec_isom_t *isom);
+
+/** @}
+ */
+/** @defgroup ec_point_t Point operations
+ * @{
+ */
+
+/**
+ * @brief Point equality
+ *
+ * @param P a point
+ * @param Q a point
+ * @return 0xFFFFFFFF if equal, zero otherwise
+ */
+uint32_t ec_is_equal(const ec_point_t *P, const ec_point_t *Q);
+
+/**
+ * @brief Test whether a point is the point at infinity
+ *
+ * @param P a point
+ * @return 0xFFFFFFFF if point at infinity, zero otherwise
+ */
+uint32_t ec_is_zero(const ec_point_t *P);
+
+/**
+ * @brief Two torsion test
+ *
+ * @param P a point
+ * @param E the elliptic curve
+ * @return 0xFFFFFFFF if P is 2-torsion but not zero, zero otherwise
+ */
+uint32_t ec_is_two_torsion(const ec_point_t *P, const ec_curve_t *E);
+
+/**
+ * @brief Four torsion test
+ *
+ * @param P a point
+ * @param E the elliptic curve
+ * @return 0xFFFFFFFF if P has order exactly 4, zero otherwise
+ */
+uint32_t ec_is_four_torsion(const ec_point_t *P, const ec_curve_t *E);
+
+/**
+ * @brief Reduce Z-coordinate of point in place
+ *
+ * @param P a point
+ */
+void ec_normalize_point(ec_point_t *P);
+
+void xDBL_E0(ec_point_t *Q, const ec_point_t *P);
+void xADD(ec_point_t *R, const ec_point_t *P, const ec_point_t *Q, const ec_point_t *PQ);
+void xDBL_A24(ec_point_t *Q, const ec_point_t *P, const ec_point_t *A24, const bool A24_normalized);
+
+/**
+ * @brief Point doubling
+ *
+ * @param res computed double of P
+ * @param P a point
+ * @param curve an elliptic curve
+ */
+void ec_dbl(ec_point_t *res, const ec_point_t *P, const ec_curve_t *curve);
+
+/**
+ * @brief Point iterated doubling
+ *
+ * @param res computed [2^n]P
+ * @param P a point
+ * @param n the number of doublings
+ * @param curve the curve on which P lies (its A24 may be normalized in place)
+ */
+void ec_dbl_iter(ec_point_t *res, int n, const ec_point_t *P, ec_curve_t *curve);
+
+/**
+ * @brief Iterated doubling for a basis P, Q, PmQ
+ *
+ * @param res the computed iterated double of basis B
+ * @param n the number of doubles
+ * @param B the basis to double
+ * @param curve the parent curve of the basis
+ */
+void ec_dbl_iter_basis(ec_basis_t *res, int n, const ec_basis_t *B, ec_curve_t *curve);
+
+/**
+ * @brief Point multiplication
+ *
+ * @param res computed scalar * P
+ * @param curve the curve
+ * @param scalar an unsigned multi-precision integer
+ * @param P a point
+ * @param kbits number of bits of the scalar
+ */
+void ec_mul(ec_point_t *res, const digit_t *scalar, const int kbits, const ec_point_t *P, ec_curve_t *curve);
+
+/**
+ * @brief Combination P+m*Q
+ *
+ * @param R computed P + m * Q
+ * @param curve the curve
+ * @param m an unsigned multi-precision integer
+ * @param P a point
+ * @param Q a point
+ * @param PQ the difference P-Q
+ * @return 0 if there was an error, 1 otherwise
+ */
+int ec_ladder3pt(ec_point_t *R,
+                 const digit_t *m,
+                 const ec_point_t *P,
+                 const ec_point_t *Q,
+                 const ec_point_t *PQ,
+                 const ec_curve_t *curve);
+
+/**
+ * @brief Linear combination of points of a basis
+ *
+ * @param res computed scalarP * P + scalarQ * Q
+ * @param scalarP an unsigned multi-precision integer
+ * @param scalarQ an unsigned multi-precision integer
+ * @param kbits number of bits of the scalars, or n for points of order 2^n
+ * @param PQ a torsion basis consisting of points P and Q
+ * @param curve the curve
+ *
+ * @return 0 if there was an error, 1 otherwise
+ */
+int ec_biscalar_mul(ec_point_t *res,
+                    const digit_t *scalarP,
+                    const digit_t *scalarQ,
+                    const int kbits,
+                    const ec_basis_t *PQ,
+                    const ec_curve_t *curve);
+
+// end point computations
+/**
+ * @}
+ */
+
+/** @defgroup ec_dlog_t Torsion basis computations
+ * @{
+ */
+
+/**
+ * @brief Generate a 2^f-torsion basis from a Montgomery curve along with a hint
+ *
+ * @param PQ2 an ec_basis_t
+ * @param curve an ec_curve_t
+ * @param f an integer
+ *
+ * @return A hint
+ *
+ * The algorithm is deterministic
+ */
+uint8_t ec_curve_to_basis_2f_to_hint(ec_basis_t *PQ2, ec_curve_t *curve, int f);
+
+/**
+ * @brief Generate a 2^f-torsion basis from a Montgomery curve and a given hint
+ *
+ * @param PQ2 an ec_basis_t
+ * @param curve an ec_curve_t
+ * @param f an integer
+ * @param hint the hint
+ *
+ * @return 1 if the basis is valid, 0 otherwise
+ *
+ * The algorithm is deterministic
+ */
+int ec_curve_to_basis_2f_from_hint(ec_basis_t *PQ2, ec_curve_t *curve, int f, const uint8_t hint);
+/** // end basis computations
+ * @}
+ */
+
+/** @defgroup ec_isog_t Isogenies
+ * @{
+ */
+
+/**
+ * @brief Evaluate isogeny of even degree on list of points.
+ * Returns 0 if successful and 0xFFFFFFFF if the kernel has the wrong order or includes (0:1).
+ *
+ * @param image computed image curve
+ * @param phi isogeny
+ * @param points a list of points to evaluate the isogeny on, modified in place
+ * @param len_points length of the list points
+ *
+ * @return 0 if there was no error, 0xFFFFFFFF otherwise
+ */
+uint32_t ec_eval_even(ec_curve_t *image, ec_isog_even_t *phi, ec_point_t *points, unsigned len_points);
+
+/**
+ * @brief Multiplicative strategy for a short isogeny chain. Returns 0 if successful and 0xFFFFFFFF
+ * if the kernel has the wrong order or includes (0:1) when special=false.
+ *
+ * @param curve domain curve, to be overwritten by the codomain curve.
+ * @param kernel a kernel generator of order 2^len
+ * @param len the length of the 2-isogeny chain
+ * @param points a list of points to evaluate the isogeny on, modified in place
+ * @param len_points length of the list points
+ * @param special if true, allow isogenies with (0:1) in the kernel
+ *
+ * @return 0 if there was no error, 0xFFFFFFFF otherwise
+ */
+uint32_t ec_eval_small_chain(ec_curve_t *curve,
+                             const ec_point_t *kernel,
+                             int len,
+                             ec_point_t *points,
+                             unsigned len_points,
+                             bool special);
+
+/**
+ * @brief Recover Y-coordinate from X-coordinate and curve coefficients.
+ *
+ * @param y: a y-coordinate
+ * @param Px: a x-coordinate
+ * @param curve: the elliptic curve
+ *
+ * @return 0xFFFFFFFF if the point was on the curve, 0 otherwise
+ */
+uint32_t ec_recover_y(fp2_t *y, const fp2_t *Px, const ec_curve_t *curve);
+
+// Jacobian point init and copying
+void jac_init(jac_point_t *P);
+void copy_jac_point(jac_point_t *P, const jac_point_t *Q);
+
+/**
+ * @brief Test if two Jacobian points are equal
+ *
+ * @param P: a point
+ * @param Q: a point
+ *
+ * @return 0xFFFFFFFF if they are equal, 0 otherwise
+ */
+uint32_t jac_is_equal(const jac_point_t *P, const jac_point_t *Q);
+
+// Convert from Jacobian to x-only (just drop the Y-coordinate)
+void jac_to_xz(ec_point_t *P, const jac_point_t *xyP);
+// Convert from Jacobian coordinates in Montgomery model to Weierstrass
+void jac_to_ws(jac_point_t *P, fp2_t *t, fp2_t *ao3, const jac_point_t *Q, const ec_curve_t *curve);
+void jac_from_ws(jac_point_t *Q, const jac_point_t *P, const fp2_t *ao3, const ec_curve_t *curve);
+
+// Jacobian arithmetic
+void jac_neg(jac_point_t *Q, const jac_point_t *P);
+void ADD(jac_point_t *R, const jac_point_t *P, const jac_point_t *Q, const ec_curve_t *AC);
+void DBL(jac_point_t *Q, const jac_point_t *P, const ec_curve_t *AC);
+void DBLW(jac_point_t *Q, fp2_t *u, const jac_point_t *P, const fp2_t *t);
+void jac_to_xz_add_components(add_components_t *uvw, const jac_point_t *P, const jac_point_t *Q, const ec_curve_t *AC);
+
+/**
+ * @brief Given a basis in x-only, lift to a pair of Jacobian points
+ *
+ * @param P: a point
+ * @param Q: a point
+ * @param B: a basis
+ * @param E: an elliptic curve
+ *
+ * @return 0xFFFFFFFF if there was no error, 0 otherwise
+ *
+ *
+ * Lifts a basis x(P), x(Q), x(P-Q) assuming the curve has (A/C : 1) and
+ * the point P = (X/Z : 1). For generic implementation see lift_basis()
+ */
+uint32_t lift_basis_normalized(jac_point_t *P, jac_point_t *Q, ec_basis_t *B, ec_curve_t *E);
+
+/**
+ * @brief Given a basis in x-only, lift to a pair of Jacobian points
+ *
+ * @param P: a point
+ * @param Q: a point
+ * @param B: a basis
+ * @param E: an elliptic curve
+ *
+ * @return 0xFFFFFFFF if there was no error, 0 otherwise
+ */
+uint32_t lift_basis(jac_point_t *P, jac_point_t *Q, ec_basis_t *B, ec_curve_t *E);
+
+/**
+ * @brief Check if basis points (P, Q) form a full 4-basis
+ *
+ * @param B: a basis
+ * @param E: an elliptic curve
+ *
+ * @return 0xFFFFFFFF if they form a basis, 0 otherwise
+ */
+uint32_t ec_is_basis_four_torsion(const ec_basis_t *B, const ec_curve_t *E);
+
+/*
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Test functions for printing and order checking, only used in debug mode
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ */
+
+/**
+ * @brief Check if a point (X : Z) has order exactly 2^t
+ *
+ * @param P: a point
+ * @param E: an elliptic curve
+ * @param t: an integer
+ *
+ * @return non-zero (the ec_is_zero() mask converted to int) if the order is correct, 0 otherwise
+ */
+static int
+test_point_order_twof(const ec_point_t *P, const ec_curve_t *E, int t)
+{
+    ec_point_t multiple = *P; // running multiple of P
+    ec_curve_t E_local;       // private copy, because ec_dbl_iter takes a mutable curve
+
+    copy_curve(&E_local, E);
+    if (ec_is_zero(&multiple)) // the point at infinity is rejected
+        return 0;
+
+    // [2^(t-1)]P must still be non-zero ...
+    ec_dbl_iter(&multiple, t - 1, &multiple, &E_local);
+    if (ec_is_zero(&multiple))
+        return 0;
+
+    // ... and one more doubling must reach zero, i.e. [2^t]P = 0
+    ec_dbl(&multiple, &multiple, &E_local);
+    return ec_is_zero(&multiple);
+}
+
+/**
+ * @brief Check if basis points (P, Q, PmQ) all have order exactly 2^t
+ *
+ * @param B: a basis
+ * @param E: an elliptic curve
+ * @param t: an integer
+ *
+ * @return the bitwise AND of the three test_point_order_twof() results: non-zero if all pass, 0 otherwise
+ */
+static int
+test_basis_order_twof(const ec_basis_t *B, const ec_curve_t *E, int t)
+{
+    // All three checks always run; their results are combined bitwise, not short-circuited
+    int ok = test_point_order_twof(&B->P, E, t);
+    ok &= test_point_order_twof(&B->Q, E, t);
+    ok &= test_point_order_twof(&B->PmQ, E, t);
+    return ok;
+}
+
+/**
+ * @brief Check if a Jacobian point (X : Y : Z) has order exactly 2^t
+ *
+ * @param P: a point
+ * @param E: an elliptic curve
+ * @param t: an integer
+ *
+ * @return non-zero (the fp2_is_zero() result converted to int) if the order is correct, 0 otherwise
+ */
+static int
+test_jac_order_twof(const jac_point_t *P, const ec_curve_t *E, int t)
+{
+    jac_point_t multiple = *P;
+    // Z == 0 marks the identity, which is rejected
+    if (fp2_is_zero(&multiple.z))
+        return 0;
+    for (int left = t - 1; left > 0; left--)
+        DBL(&multiple, &multiple, E);
+    // [2^(t-1)]P must not be the identity yet, but its double must be
+    if (fp2_is_zero(&multiple.z))
+        return 0;
+    DBL(&multiple, &multiple, E);
+    return (fp2_is_zero(&multiple.z));
+}
+
+// Prints "<name> = INF" when Z is zero, otherwise the affine x-coordinate X/Z through fp2_print
+static void
+ec_point_print(const char *name, ec_point_t P)
+{
+    fp2_t x_affine;
+    if (fp2_is_zero(&P.z)) {
+        printf("%s = INF\n", name);
+        return;
+    }
+    fp2_copy(&x_affine, &P.z);
+    fp2_inv(&x_affine); // 1/Z
+    fp2_mul(&x_affine, &x_affine, &P.x);
+    fp2_print(name, &x_affine);
+}
+
+// Prints the affine Montgomery coefficient A/C of the curve through fp2_print
+static void
+ec_curve_print(const char *name, ec_curve_t E)
+{
+    fp2_t A_over_C;
+    fp2_copy(&A_over_C, &E.C);
+    fp2_inv(&A_over_C); // 1/C
+    fp2_mul(&A_over_C, &A_over_C, &E.A);
+    fp2_print(name, &A_over_C);
+}
+
+#endif
+// end isogeny computations
+/**
+ * @}
+ */
+
+// end ec
+/**
+ * @}
+ */
diff --git a/src/pqm4/sqisign_lvl5/ref/ec_jac.c b/src/pqm4/sqisign_lvl5/ref/ec_jac.c
new file mode 100644
index 0000000..20ca68c
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/ec_jac.c
@@ -0,0 +1,335 @@
+#include <assert.h>
+#include <ec.h>
+
+void
+jac_init(jac_point_t *P)
+{ // Set P to the identity element (0:1:0) in Jacobian coordinates
+    fp2_set_one(&P->y);
+    fp2_set_zero(&P->z);
+    fp2_set_zero(&P->x);
+}
+
+uint32_t
+jac_is_equal(const jac_point_t *P, const jac_point_t *Q)
+{ // Compare two points in Jacobian coordinates (X:Y:Z) by cross-multiplying out the powers of Z
+  // Returns the AND of two fp2_is_zero results: true if P=Q, 0 (false) otherwise
+    fp2_t zq2, zp2, dx, dy, rhs;
+
+    fp2_sqr(&zq2, &Q->z);
+    fp2_sqr(&zp2, &P->z);
+    fp2_mul(&dx, &P->x, &zq2);  // x1*z2^2
+    fp2_mul(&rhs, &Q->x, &zp2); // x2*z1^2
+    fp2_sub(&dx, &dx, &rhs);
+
+    fp2_mul(&dy, &zq2, &Q->z);
+    fp2_mul(&dy, &P->y, &dy); // y1*z2^3
+    fp2_mul(&rhs, &zp2, &P->z);
+    fp2_mul(&rhs, &Q->y, &rhs); // y2*z1^3
+    fp2_sub(&dy, &dy, &rhs);
+
+    return fp2_is_zero(&dy) & fp2_is_zero(&dx);
+}
+
+void
+jac_to_xz(ec_point_t *P, const jac_point_t *xyP)
+{ // Jacobian (X:Y:Z) -> x-only projective (X : Z^2)
+    fp2_t unit;
+    fp2_set_one(&unit);
+
+    fp2_sqr(&P->z, &xyP->z);
+    fp2_copy(&P->x, &xyP->x);
+
+    // The identity (0:1:0) gives (0 : 0) above; replace it by (1 : 0)
+    uint32_t x_zero = fp2_is_zero(&P->x);
+    uint32_t z_zero = fp2_is_zero(&P->z);
+    uint32_t is_identity = x_zero & z_zero;
+
+    fp2_select(&P->x, &P->x, &unit, is_identity);
+}
+
+void
+jac_to_ws(jac_point_t *Q, fp2_t *t, fp2_t *ao3, const jac_point_t *P, const ec_curve_t *curve)
+{ // Montgomery (X:Y:Z) -> Weierstrass (U:V:W), also outputs T and A/3. Cost of 3M + 2S when A != 0.
+    fp_t one;
+    fp2_t a;
+    /* a = 1 - A^2/3, U = X + (A*Z^2)/3, V = Y, W = Z, T = a*Z^4*/
+    fp_set_one(&one);
+    if (!fp2_is_zero(&(curve->A))) {
+        fp_div3(&(ao3->re), &(curve->A.re));
+        fp_div3(&(ao3->im), &(curve->A.im));
+        fp2_sqr(t, &P->z);
+        fp2_mul(&a, ao3, t);       // a is scratch here: (A/3)*Z^2
+        fp2_add(&Q->x, &a, &P->x); // P->x is read before Q->x is written, so Q may alias P
+        fp2_sqr(t, t);
+        fp2_mul(&a, ao3, &(curve->A));
+        fp_sub(&(a.re), &one, &(a.re));
+        fp_neg(&(a.im), &(a.im));
+        fp2_mul(t, t, &a);
+    } else {
+        fp2_set_zero(ao3); // A/3 = 0: define the output instead of leaving it unwritten
+        fp2_copy(&Q->x, &P->x);
+        fp2_sqr(t, &P->z);
+        fp2_sqr(t, t);
+    }
+    fp2_copy(&Q->y, &P->y);
+    fp2_copy(&Q->z, &P->z);
+}
+
+void
+jac_from_ws(jac_point_t *Q, const jac_point_t *P, const fp2_t *ao3, const ec_curve_t *curve)
+{ // Weierstrass -> Montgomery: X = U - (A*W^2)/3, Y = V, Z = W. Cost of 1M + 1S when A != 0.
+    fp2_t t;
+    if (!fp2_is_zero(&(curve->A))) {
+        fp2_sqr(&t, &P->z);
+        fp2_mul(&t, &t, ao3);
+        fp2_sub(&Q->x, &P->x, &t);
+    } else {
+        fp2_copy(&Q->x, &P->x); // A == 0: X = U; previously Q->x was left unwritten when Q != P
+    }
+    fp2_copy(&Q->y, &P->y);
+    fp2_copy(&Q->z, &P->z);
+}
+
+void
+copy_jac_point(jac_point_t *P, const jac_point_t *Q)
+{ // P <- Q, one coordinate at a time
+    fp2_copy(&P->z, &Q->z);
+    fp2_copy(&P->y, &Q->y);
+    fp2_copy(&P->x, &Q->x);
+}
+
+void
+jac_neg(jac_point_t *Q, const jac_point_t *P)
+{ // Q <- (X : -Y : Z), the negation of P
+    fp2_neg(&Q->y, &P->y);
+    fp2_copy(&Q->z, &P->z);
+    fp2_copy(&Q->x, &P->x);
+}
+
+void
+DBL(jac_point_t *Q, const jac_point_t *P, const ec_curve_t *AC)
+{ // Cost of 6M + 6S.
+  // Doubling on a Montgomery curve, representation in Jacobian coordinates (X:Y:Z) corresponding to
+  // (X/Z^2,Y/Z^3). This version receives the coefficient value A (AC->C is not used).
+    fp2_t t0, t1, t2, t3;
+
+    uint32_t flag = fp2_is_zero(&P->x) & fp2_is_zero(&P->z); // mask, set when X = Z = 0
+
+    fp2_sqr(&t0, &P->x); // t0 = x1^2
+    fp2_add(&t1, &t0, &t0);
+    fp2_add(&t0, &t0, &t1); // t0 = 3x1^2
+    fp2_sqr(&t1, &P->z);    // t1 = z1^2
+    fp2_mul(&t2, &P->x, &AC->A);
+    fp2_add(&t2, &t2, &t2); // t2 = 2Ax1
+    fp2_add(&t2, &t1, &t2); // t2 = 2Ax1+z1^2
+    fp2_mul(&t2, &t1, &t2); // t2 = z1^2(2Ax1+z1^2)
+    fp2_add(&t2, &t0, &t2); // t2 = alpha = 3x1^2 + z1^2(2Ax1+z1^2)
+    fp2_mul(&Q->z, &P->y, &P->z);
+    fp2_add(&Q->z, &Q->z, &Q->z); // z2 = 2y1z1
+    fp2_sqr(&t0, &Q->z);
+    fp2_mul(&t0, &t0, &AC->A); // t0 = 4Ay1^2z1^2
+    fp2_sqr(&t1, &P->y);
+    fp2_add(&t1, &t1, &t1);     // t1 = 2y1^2
+    fp2_add(&t3, &P->x, &P->x); // t3 = 2x1
+    fp2_mul(&t3, &t1, &t3);     // t3 = 4x1y1^2
+    fp2_sqr(&Q->x, &t2);        // x2 = alpha^2
+    fp2_sub(&Q->x, &Q->x, &t0); // x2 = alpha^2 - 4Ay1^2z1^2
+    fp2_sub(&Q->x, &Q->x, &t3);
+    fp2_sub(&Q->x, &Q->x, &t3); // x2 = alpha^2 - 4Ay1^2z1^2 - 8x1y1^2
+    fp2_sub(&Q->y, &t3, &Q->x); // y2 = 4x1y1^2 - x2
+    fp2_mul(&Q->y, &Q->y, &t2); // y2 = alpha(4x1y1^2 - x2)
+    fp2_sqr(&t1, &t1);          // t1 = 4y1^4
+    fp2_sub(&Q->y, &Q->y, &t1);
+    fp2_sub(&Q->y, &Q->y, &t1); // y2 = alpha(4x1y1^2 - x2) - 8y1^4
+
+    fp2_select(&Q->x, &Q->x, &P->x, flag); // flag is already a 0 / 0xFFFFFFFF mask; -flag would pass 1
+    fp2_select(&Q->z, &Q->z, &P->z, flag); // when X = Z = 0, keep P's X and Z
+}
+
+void
+DBLW(jac_point_t *Q, fp2_t *u, const jac_point_t *P, const fp2_t *t)
+{ // Cost of 3M + 5S.
+  // Doubling on a Weierstrass curve, representation in modified Jacobian coordinates
+  // (X:Y:Z:T=a*Z^4) corresponding to (X/Z^2,Y/Z^3), where a is the curve coefficient.
+  // Formula from https://hyperelliptic.org/EFD/g1p/auto-shortw-modified.html; t is T1, u receives T3.
+
+    uint32_t flag = fp2_is_zero(&P->x) & fp2_is_zero(&P->z); // mask, set when X = Z = 0
+
+    fp2_t xx, c, cc, r, s, m;
+    // XX = X^2
+    fp2_sqr(&xx, &P->x);
+    // A = 2*Y^2
+    fp2_sqr(&c, &P->y);
+    fp2_add(&c, &c, &c);
+    // AA = A^2
+    fp2_sqr(&cc, &c);
+    // R = 2*AA
+    fp2_add(&r, &cc, &cc);
+    // S = (X+A)^2-XX-AA
+    fp2_add(&s, &P->x, &c);
+    fp2_sqr(&s, &s);
+    fp2_sub(&s, &s, &xx);
+    fp2_sub(&s, &s, &cc);
+    // M = 3*XX+T1
+    fp2_add(&m, &xx, &xx);
+    fp2_add(&m, &m, &xx);
+    fp2_add(&m, &m, t);
+    // X3 = M^2-2*S
+    fp2_sqr(&Q->x, &m);
+    fp2_sub(&Q->x, &Q->x, &s);
+    fp2_sub(&Q->x, &Q->x, &s);
+    // Z3 = 2*Y*Z
+    fp2_mul(&Q->z, &P->y, &P->z);
+    fp2_add(&Q->z, &Q->z, &Q->z);
+    // Y3 = M*(S-X3)-R
+    fp2_sub(&Q->y, &s, &Q->x);
+    fp2_mul(&Q->y, &Q->y, &m);
+    fp2_sub(&Q->y, &Q->y, &r);
+    // T3 = 2*R*T1
+    fp2_mul(u, t, &r);
+    fp2_add(u, u, u);
+
+    fp2_select(&Q->x, &Q->x, &P->x, flag); // flag is already a 0 / 0xFFFFFFFF mask; -flag would pass 1
+    fp2_select(&Q->z, &Q->z, &P->z, flag); // when X = Z = 0, keep P's X and Z
+}
+
+void
+select_jac_point(jac_point_t *Q, const jac_point_t *P1, const jac_point_t *P2, const digit_t option)
+{ // Mask-based choice between two points, applied to each coordinate
+  // option = 0 gives Q <- P1, option = 0xFF...FF gives Q <- P2
+    fp2_select(&Q->z, &P1->z, &P2->z, option);
+    fp2_select(&Q->y, &P1->y, &P2->y, option);
+    fp2_select(&Q->x, &P1->x, &P2->x, option);
+}
+
+void
+ADD(jac_point_t *R, const jac_point_t *P, const jac_point_t *Q, const ec_curve_t *AC)
+{
+    // Addition on a Montgomery curve, representation in Jacobian coordinates (X:Y:Z) corresponding
+    // to (x,y) = (X/Z^2,Y/Z^3). This version receives the coefficient value A
+    //
+    // Complete routine, to handle all edge cases:
+    //   if ZP == 0:            # P == inf
+    //       return Q
+    //   if ZQ == 0:            # Q == inf
+    //       return P
+    //   dy <- YQ*ZP**3 - YP*ZQ**3
+    //   dx <- XQ*ZP**2 - XP*ZQ**2
+    //   if dx == 0:             # x1 == x2
+    //       if dy == 0:         # ... and y1 == y2: doubling case
+    //           dy <- ZP*ZQ * (3*XP^2 + ZP^2 * (2*A*XP + ZP^2))
+    //           dx <- 2*YP*ZP
+    //       else:              # ... but y1 != y2, thus P = -Q
+    //           return inf
+    //   XR <- dy**2 - dx**2 * (A*ZP^2*ZQ^2 + XP*ZQ^2 + XQ*ZP^2)
+    //   YR <- dy * (XP*ZQ^2 * dx^2 - XR) - YP*ZQ^3 * dx^3
+    //   ZR <- dx * ZP * ZQ
+
+    // Constant time processing:
+    // - The case for P == 0 or Q == 0 is handled at the end with conditional select
+    // - dy and dx are computed for both the normal and doubling cases, we switch when
+    //   dx == dy == 0 for the normal case.
+    // - If we have that P = -Q then dx = 0 and so ZR will be zero, giving us the point
+    //   at infinity for "free".
+    //
+    // The formulas below are not optimised; cheaper ones may exist.
+    // NOTE(review): the final selects read P and Q after R is written - presumably R must not alias them.
+    // Cost 17M + 6S + 13a
+    fp2_t t0, t1, t2, t3, u1, u2, v1, dx, dy;
+
+    /* If P is zero or Q is zero we will conditionally select the other point before returning. */
+    uint32_t ctl1 = fp2_is_zero(&P->z);
+    uint32_t ctl2 = fp2_is_zero(&Q->z);
+
+    /* Precompute some values */
+    fp2_sqr(&t0, &P->z); // t0 = z1^2
+    fp2_sqr(&t1, &Q->z); // t1 = z2^2
+
+    /* Compute dy and dx for ordinary case */
+    fp2_mul(&v1, &t1, &Q->z); // v1 = z2^3
+    fp2_mul(&t2, &t0, &P->z); // t2 = z1^3
+    fp2_mul(&v1, &v1, &P->y); // v1 = y1z2^3
+    fp2_mul(&t2, &t2, &Q->y); // t2 = y2z1^3
+    fp2_sub(&dy, &t2, &v1);   // dy = y2z1^3 - y1z2^3
+    fp2_mul(&u2, &t0, &Q->x); // u2 = x2z1^2
+    fp2_mul(&u1, &t1, &P->x); // u1 = x1z2^2
+    fp2_sub(&dx, &u2, &u1);   // dx = x2z1^2 - x1z2^2
+
+    /* Compute dy and dx for doubling case (both omit the factor z1 that the pseudocode above carries) */
+    fp2_add(&t1, &P->y, &P->y);   // dx_dbl = t1 = 2y1
+    fp2_add(&t2, &AC->A, &AC->A); // t2 = 2A
+    fp2_mul(&t2, &t2, &P->x);     // t2 = 2Ax1
+    fp2_add(&t2, &t2, &t0);       // t2 = 2Ax1 + z1^2
+    fp2_mul(&t2, &t2, &t0);       // t2 = z1^2 * (2Ax1 + z1^2)
+    fp2_sqr(&t0, &P->x);          // t0 = x1^2
+    fp2_add(&t2, &t2, &t0);       // t2 = x1^2 + z1^2 * (2Ax1 + z1^2)
+    fp2_add(&t2, &t2, &t0);       // t2 = 2*x1^2 + z1^2 * (2Ax1 + z1^2)
+    fp2_add(&t2, &t2, &t0);       // t2 = 3*x1^2 + z1^2 * (2Ax1 + z1^2)
+    fp2_mul(&t2, &t2, &Q->z);     // dy_dbl = t2 = z2 * (3*x1^2 + z1^2 * (2Ax1 + z1^2))
+
+    /* If dx is zero and dy is zero, replace them with the doubling variables */
+    uint32_t ctl = fp2_is_zero(&dx) & fp2_is_zero(&dy);
+    fp2_select(&dx, &dx, &t1, ctl);
+    fp2_select(&dy, &dy, &t2, ctl);
+
+    /* Some more precomputations */
+    fp2_mul(&t0, &P->z, &Q->z); // t0 = z1z2
+    fp2_sqr(&t1, &t0);          // t1 = (z1z2)^2
+    fp2_sqr(&t2, &dx);          // t2 = dx^2
+    fp2_sqr(&t3, &dy);          // t3 = dy^2
+
+    /* Compute x3 = dy**2 - dx**2 * (A*ZP^2*ZQ^2 + XP*ZQ^2 + XQ*ZP^2) */
+    fp2_mul(&R->x, &AC->A, &t1); // x3 = A*(z1z2)^2
+    fp2_add(&R->x, &R->x, &u1);  // x3 = A*(z1z2)^2 + u1
+    fp2_add(&R->x, &R->x, &u2);  // x3 = A*(z1z2)^2 + u1 + u2
+    fp2_mul(&R->x, &R->x, &t2);  // x3 = dx^2 * (A*(z1z2)^2 + u1 + u2)
+    fp2_sub(&R->x, &t3, &R->x);  // x3 = dy^2 - dx^2 * (A*(z1z2)^2 + u1 + u2)
+
+    /* Compute y3 = dy * (XP*ZQ^2 * dx^2 - XR) - YP*ZQ^3 * dx^3*/
+    fp2_mul(&R->y, &u1, &t2);     // y3 = u1 * dx^2
+    fp2_sub(&R->y, &R->y, &R->x); // y3 = u1 * dx^2 - x3
+    fp2_mul(&R->y, &R->y, &dy);   // y3 = dy * (u1 * dx^2 - x3)
+    fp2_mul(&t3, &t2, &dx);       // t3 = dx^3
+    fp2_mul(&t3, &t3, &v1);       // t3 = v1 * dx^3
+    fp2_sub(&R->y, &R->y, &t3);   // y3 = dy * (u1 * dx^2 - x3) - v1 * dx^3
+
+    /* Compute z3 = dx * z1 * z2 */
+    fp2_mul(&R->z, &dx, &t0);
+
+    /* Finally, we need to set R = P if Q.Z = 0 and R = Q if P.Z = 0 */
+    select_jac_point(R, R, Q, ctl1);
+    select_jac_point(R, R, P, ctl2);
+}
+
+void
+jac_to_xz_add_components(add_components_t *add_comp, const jac_point_t *P, const jac_point_t *Q, const ec_curve_t *AC)
+{
+    // Take two distinct points P and Q on E as jac_point_t and return three components u, v and w
+    // in Fp2 such that the xz coordinates of P+Q are (u-v:w) and of P-Q are (u+v:w)
+
+    fp2_t t0, t1, t2, t3, t4, t5, t6;
+
+    fp2_sqr(&t0, &P->z);             // t0 = z1^2
+    fp2_sqr(&t1, &Q->z);             // t1 = z2^2
+    fp2_mul(&t2, &P->x, &t1);        // t2 = x1z2^2
+    fp2_mul(&t3, &t0, &Q->x);        // t3 = z1^2x2
+    fp2_mul(&t4, &P->y, &Q->z);      // t4 = y1z2
+    fp2_mul(&t4, &t4, &t1);          // t4 = y1z2^3
+    fp2_mul(&t5, &P->z, &Q->y);      // t5 = z1y2
+    fp2_mul(&t5, &t5, &t0);          // t5 = z1^3y2
+    fp2_mul(&t0, &t0, &t1);          // t0 = (z1z2)^2
+    fp2_mul(&t6, &t4, &t5);          // t6 = (z1z2)^3y1y2
+    fp2_add(&add_comp->v, &t6, &t6); // v  = 2(z1z2)^3y1y2
+    fp2_sqr(&t4, &t4);               // t4 = y1^2z2^6
+    fp2_sqr(&t5, &t5);               // t5 = z1^6y2^2
+    fp2_add(&t4, &t4, &t5);          // t4 = z1^6y2^2 + y1^2z2^6
+    fp2_add(&t5, &t2, &t3);          // t5 = x1z2^2 + z1^2x2
+    fp2_add(&t6, &t3, &t3);          // t6 = 2z1^2x2
+    fp2_sub(&t6, &t5, &t6);          // t6 = lambda = x1z2^2 - z1^2x2
+    fp2_sqr(&t6, &t6);               // t6 = lambda^2 = (x1z2^2 - z1^2x2)^2
+    fp2_mul(&t1, &AC->A, &t0);       // t1 = A*(z1z2)^2
+    fp2_add(&t1, &t5, &t1);          // t1 = gamma = A*(z1z2)^2 + x1z2^2 + z1^2x2
+    fp2_mul(&t1, &t1, &t6);          // t1 = gamma*lambda^2
+    fp2_sub(&add_comp->u, &t4, &t1); // u  = z1^6y2^2 + y1^2z2^6 - gamma*lambda^2
+    fp2_mul(&add_comp->w, &t6, &t0); // w  = (z1z2)^2(lambda)^2
+}
diff --git a/src/pqm4/sqisign_lvl5/ref/ec_params.c b/src/pqm4/sqisign_lvl5/ref/ec_params.c
new file mode 100644
index 0000000..d2aa074
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/ec_params.c
@@ -0,0 +1,4 @@
+#include <ec_params.h>
+// p+1 divided by the power of 2
+const digit_t p_cofactor_for_2f[1] = {27};
+
diff --git a/src/pqm4/sqisign_lvl5/ref/ec_params.h b/src/pqm4/sqisign_lvl5/ref/ec_params.h
new file mode 100644
index 0000000..9f2aca3
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/ec_params.h
@@ -0,0 +1,12 @@
+#ifndef EC_PARAMS_H
+#define EC_PARAMS_H
+
+#include <fp.h>
+
+#define TORSION_EVEN_POWER 500
+
+// p+1 divided by the power of 2
+extern const digit_t p_cofactor_for_2f[1];
+#define P_COFACTOR_FOR_2F_BITLENGTH 5
+
+#endif
diff --git a/src/pqm4/sqisign_lvl5/ref/encode_verification.c b/src/pqm4/sqisign_lvl5/ref/encode_verification.c
new file mode 100644
index 0000000..fecdb9c
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/encode_verification.c
@@ -0,0 +1,220 @@
+#include <verification.h>
+#include <string.h>
+#include <tutil.h>
+#include <fp2.h>
+#include <encoded_sizes.h>
+#include <assert.h>
+
+typedef unsigned char byte_t;
+
+// digits
+
+static void
+encode_digits(byte_t *enc, const digit_t *x, size_t nbytes)
+{ // Write the first nbytes bytes of the digit array x to enc
+#ifdef TARGET_BIG_ENDIAN
+    const size_t ndigits = nbytes / sizeof(digit_t); // whole digits covered by nbytes
+    const size_t rem = nbytes % sizeof(digit_t);     // bytes taken from one further, partial digit
+
+    for (size_t i = 0; i < ndigits; i++) // NOTE(review): stores through a digit_t* cast of enc; assumes enc is suitably aligned
+        ((digit_t *)enc)[i] = BSWAP_DIGIT(x[i]);
+    if (rem) {
+        digit_t ld = BSWAP_DIGIT(x[ndigits]);
+        memcpy(enc + ndigits * sizeof(digit_t), (byte_t *)&ld, rem);
+    }
+#else
+    memcpy(enc, (const byte_t *)x, nbytes); // native digit layout is copied unchanged
+#endif
+}
+
+static void
+decode_digits(digit_t *x, const byte_t *enc, size_t nbytes, size_t ndigits)
+{ // Load nbytes bytes from enc into x and zero the rest of its ndigits digits
+    const size_t total = ndigits * sizeof(digit_t);
+    assert(nbytes <= total);
+    memset((byte_t *)x + nbytes, 0, total - nbytes);
+    memcpy((byte_t *)x, enc, nbytes);
+#ifdef TARGET_BIG_ENDIAN
+    for (size_t k = 0; k < ndigits; k++)
+        x[k] = BSWAP_DIGIT(x[k]);
+#endif
+}
+
+// fp2_t
+
+static byte_t *
+fp2_to_bytes(byte_t *enc, const fp2_t *x)
+{ // Write the encoding of x at enc and return the position just past it
+    byte_t *const next = enc + FP2_ENCODED_BYTES;
+    fp2_encode(enc, x);
+    return next;
+}
+
+static const byte_t *
+fp2_from_bytes(fp2_t *x, const byte_t *enc)
+{ // Read x from enc and return the position just past it; the fp2_decode result is not checked
+    const byte_t *const next = enc + FP2_ENCODED_BYTES;
+    fp2_decode(x, enc);
+    return next;
+}
+
+// curves and points
+
+static byte_t *
+proj_to_bytes(byte_t *enc, const fp2_t *x, const fp2_t *z)
+{ // Encode the affine value x/z; z is asserted to be non-zero
+    assert(!fp2_is_zero(z));
+    fp2_t z_inv = *z;
+    fp2_inv(&z_inv);
+#ifndef NDEBUG
+    {
+        // Sanity check: z * (1/z) == 1
+        fp2_t unit, prod;
+        fp2_set_one(&unit);
+        fp2_mul(&prod, z, &z_inv);
+        assert(fp2_is_equal(&prod, &unit));
+    }
+#endif
+    fp2_t affine;
+    fp2_mul(&affine, x, &z_inv);
+    return fp2_to_bytes(enc, &affine);
+}
+
+static const byte_t *
+proj_from_bytes(fp2_t *x, fp2_t *z, const byte_t *enc)
+{ // Decode an affine value into projective form (x : 1)
+    const byte_t *next = fp2_from_bytes(x, enc);
+    fp2_set_one(z);
+    return next;
+}
+
+static byte_t *
+ec_curve_to_bytes(byte_t *enc, const ec_curve_t *curve)
+{ // A curve is serialised as the single value A/C; returns the position after the written bytes
+    return proj_to_bytes(enc, &curve->A, &curve->C);
+}
+
+static const byte_t *
+ec_curve_from_bytes(ec_curve_t *curve, const byte_t *enc)
+{ // Rebuild a curve from its encoded coefficient: A is decoded, C is set to one
+    memset(curve, 0, sizeof(*curve)); // every other field of the struct starts out as zero bytes
+    return proj_from_bytes(&curve->A, &curve->C, enc);
+}
+
+static byte_t *
+ec_point_to_bytes(byte_t *enc, const ec_point_t *point)
+{ // A point is serialised as the single value x/z; returns the position after the written bytes
+    return proj_to_bytes(enc, &point->x, &point->z);
+}
+
+static const byte_t *
+ec_point_from_bytes(ec_point_t *point, const byte_t *enc)
+{ // Decode a point as (x : 1); returns the position after the consumed bytes
+    return proj_from_bytes(&point->x, &point->z, enc);
+}
+
+static byte_t *
+ec_basis_to_bytes(byte_t *enc, const ec_basis_t *basis)
+{ // The three points are written back to back in the order P, Q, PmQ
+    byte_t *cursor = ec_point_to_bytes(enc, &basis->P);
+    cursor = ec_point_to_bytes(cursor, &basis->Q);
+    cursor = ec_point_to_bytes(cursor, &basis->PmQ);
+    return cursor;
+}
+
+static const byte_t *
+ec_basis_from_bytes(ec_basis_t *basis, const byte_t *enc)
+{ // The three points are read back to back in the order P, Q, PmQ
+    const byte_t *cursor = ec_point_from_bytes(&basis->P, enc);
+    cursor = ec_point_from_bytes(&basis->Q, cursor);
+    cursor = ec_point_from_bytes(&basis->PmQ, cursor);
+    return cursor;
+}
+
+// public API
+
+byte_t *
+public_key_to_bytes(byte_t *enc, const public_key_t *pk)
+{
+    // Layout: encoded curve coefficient first, then the one-byte hint.
+    // enc itself is never advanced, so it still marks the start of the buffer below.
+    byte_t *cursor = ec_curve_to_bytes(enc, &pk->curve);
+
+    *cursor++ = pk->hint_pk;
+    assert(cursor - enc == PUBLICKEY_BYTES);
+    return cursor;
+}
+
+const byte_t *
+public_key_from_bytes(public_key_t *pk, const byte_t *enc)
+{
+    // Layout: encoded curve coefficient first, then the one-byte hint.
+    // enc itself is never advanced, so it still marks the start of the buffer below.
+    const byte_t *cursor = ec_curve_from_bytes(&pk->curve, enc);
+
+    pk->hint_pk = *cursor++;
+    assert(cursor - enc == PUBLICKEY_BYTES);
+    return cursor;
+}
+
+void
+signature_to_bytes(byte_t *enc, const signature_t *sig)
+{
+    // Layout: E_aux_A | backtracking | two_resp_length | 2x2 matrix (row-major) |
+    //         chall_coeff | hint_aux | hint_chall
+    byte_t *cursor = fp2_to_bytes(enc, &sig->E_aux_A);
+
+    *cursor++ = sig->backtracking;
+    *cursor++ = sig->two_resp_length;
+
+    // Four matrix entries, each written with the same byte length
+    const size_t entry_bytes = (SQIsign_response_length + 9) / 8;
+    for (int row = 0; row < 2; row++) {
+        for (int col = 0; col < 2; col++) {
+            encode_digits(cursor, sig->mat_Bchall_can_to_B_chall[row][col], entry_bytes);
+            cursor += entry_bytes;
+        }
+    }
+
+    // Challenge coefficient
+    const size_t chall_bytes = SECURITY_BITS / 8;
+    encode_digits(cursor, sig->chall_coeff, chall_bytes);
+    cursor += chall_bytes;
+
+    // Trailing one-byte hints
+    *cursor++ = sig->hint_aux;
+    *cursor++ = sig->hint_chall;
+
+    // enc was never advanced, so it still marks the start of the buffer
+    assert(cursor - enc == SIGNATURE_BYTES);
+}
+
+void
+signature_from_bytes(signature_t *sig, const byte_t *enc)
+{
+    // Layout: E_aux_A | backtracking | two_resp_length | 2x2 matrix (row-major) |
+    //         chall_coeff | hint_aux | hint_chall
+    const byte_t *cursor = fp2_from_bytes(&sig->E_aux_A, enc);
+
+    sig->backtracking = *cursor++;
+    sig->two_resp_length = *cursor++;
+
+    // Four matrix entries, each zero-extended to NWORDS_ORDER digits
+    const size_t entry_bytes = (SQIsign_response_length + 9) / 8;
+    for (int row = 0; row < 2; row++) {
+        for (int col = 0; col < 2; col++) {
+            decode_digits(sig->mat_Bchall_can_to_B_chall[row][col], cursor, entry_bytes, NWORDS_ORDER);
+            cursor += entry_bytes;
+        }
+    }
+
+    // Challenge coefficient
+    const size_t chall_bytes = SECURITY_BITS / 8;
+    decode_digits(sig->chall_coeff, cursor, chall_bytes, NWORDS_ORDER);
+    cursor += chall_bytes;
+
+    // Trailing one-byte hints
+    sig->hint_aux = *cursor++;
+    sig->hint_chall = *cursor++;
+
+    // enc was never advanced, so it still marks the start of the buffer
+    assert(cursor - enc == SIGNATURE_BYTES);
+}
diff --git a/src/pqm4/sqisign_lvl5/ref/encoded_sizes.h b/src/pqm4/sqisign_lvl5/ref/encoded_sizes.h
new file mode 100644
index 0000000..3aafb0d
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/encoded_sizes.h
@@ -0,0 +1,11 @@
+#define SECURITY_BITS 256
+#define SQIsign_response_length 253
+#define HASH_ITERATIONS 512
+#define FP_ENCODED_BYTES 64
+#define FP2_ENCODED_BYTES 128
+#define EC_CURVE_ENCODED_BYTES 128
+#define EC_POINT_ENCODED_BYTES 128
+#define EC_BASIS_ENCODED_BYTES 384
+#define PUBLICKEY_BYTES 129
+#define SECRETKEY_BYTES 701
+#define SIGNATURE_BYTES 292
diff --git a/src/pqm4/sqisign_lvl5/ref/fp.c b/src/pqm4/sqisign_lvl5/ref/fp.c
new file mode 100644
index 0000000..48e2937
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/fp.c
@@ -0,0 +1,15 @@
+#include <fp.h>
+
+/*
+ * Mask-based select: ctl == 0x00000000 sets *d to a0, ctl == 0xFFFFFFFF sets *d to a1.
+ * ctl MUST be either 0x00000000 or 0xFFFFFFFF.
+ */
+void
+fp_select(fp_t *d, const fp_t *a0, const fp_t *a1, uint32_t ctl)
+{
+    const digit_t mask = (int32_t)ctl; // going through int32_t widens an all-ones ctl to a full digit
+    for (unsigned int k = 0; k < NWORDS_FIELD; k++) {
+        const digit_t lo = (*a0)[k];
+        (*d)[k] = lo ^ (mask & (lo ^ (*a1)[k]));
+    }
+}
diff --git a/src/pqm4/sqisign_lvl5/ref/fp.h b/src/pqm4/sqisign_lvl5/ref/fp.h
new file mode 100644
index 0000000..1241d58
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/fp.h
@@ -0,0 +1,48 @@
+#ifndef FP_H
+#define FP_H
+
+//////////////////////////////////////////////// NOTE: this is placed here for now
+#include <sqisign_namespace.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
+#include <tutil.h>
+#include <fp_constants.h>
+
+typedef digit_t fp_t[NWORDS_FIELD]; // Datatype for representing field elements
+
+extern const digit_t ONE[NWORDS_FIELD];
+extern const digit_t ZERO[NWORDS_FIELD];
+// extern const digit_t PM1O3[NWORDS_FIELD];
+
+void fp_set_small(fp_t *x, const digit_t val);
+void fp_mul_small(fp_t *x, const fp_t *a, const uint32_t val);
+void fp_set_zero(fp_t *x);
+void fp_set_one(fp_t *x);
+uint32_t fp_is_equal(const fp_t *a, const fp_t *b);
+uint32_t fp_is_zero(const fp_t *a);
+void fp_copy(fp_t *out, const fp_t *a);
+
+void fp_encode(void *dst, const fp_t *a);
+void fp_decode_reduce(fp_t *d, const void *src, size_t len);
+uint32_t fp_decode(fp_t *d, const void *src);
+
+void fp_select(fp_t *d, const fp_t *a0, const fp_t *a1, uint32_t ctl);
+void fp_cswap(fp_t *a, fp_t *b, uint32_t ctl);
+
+void fp_add(fp_t *out, const fp_t *a, const fp_t *b);
+void fp_sub(fp_t *out, const fp_t *a, const fp_t *b);
+void fp_neg(fp_t *out, const fp_t *a);
+void fp_sqr(fp_t *out, const fp_t *a);
+void fp_mul(fp_t *out, const fp_t *a, const fp_t *b);
+
+void fp_inv(fp_t *x);
+uint32_t fp_is_square(const fp_t *a);
+void fp_sqrt(fp_t *a);
+void fp_half(fp_t *out, const fp_t *a);
+void fp_exp3div4(fp_t *out, const fp_t *a);
+void fp_div3(fp_t *out, const fp_t *a);
+
+#endif
diff --git a/src/pqm4/sqisign_lvl5/ref/fp2.c b/src/pqm4/sqisign_lvl5/ref/fp2.c
new file mode 100644
index 0000000..a258952
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/fp2.c
@@ -0,0 +1,328 @@
+#include <inttypes.h>
+#include <encoded_sizes.h>
+#include <fp2.h>
+
+/* Arithmetic modulo X^2 + 1 */
+
+void
+fp2_set_small(fp2_t *x, const digit_t val)
+{ // x <- val + 0*i
+    fp_set_zero(&x->im);
+    fp_set_small(&x->re, val);
+}
+
+void
+fp2_mul_small(fp2_t *x, const fp2_t *y, uint32_t n)
+{ // x <- n * y, applied to each component
+    fp_mul_small(&(x->im), &(y->im), n);
+    fp_mul_small(&(x->re), &(y->re), n);
+}
+
+void
+fp2_set_one(fp2_t *x)
+{ // x <- 1 + 0*i
+    fp_set_zero(&x->im);
+    fp_set_one(&x->re);
+}
+
+void
+fp2_set_zero(fp2_t *x)
+{ // x <- 0 + 0*i
+    fp_set_zero(&x->im);
+    fp_set_zero(&x->re);
+}
+
+// Is a GF(p^2) element zero? Returns 0xFF...FF (true) if a=0, 0 (false) otherwise
+uint32_t
+fp2_is_zero(const fp2_t *a)
+{
+    const uint32_t re_zero = fp_is_zero(&a->re);
+    return re_zero & fp_is_zero(&a->im);
+}
+
+// Compare two GF(p^2) elements component by component; 0xFF...FF (true) if a=b, 0 (false) otherwise
+uint32_t
+fp2_is_equal(const fp2_t *a, const fp2_t *b)
+{
+    const uint32_t re_same = fp_is_equal(&a->re, &b->re);
+    return re_same & fp_is_equal(&a->im, &b->im);
+}
+
+// Is a GF(p^2) element one? Returns 0xFF...FF (true) if a=1, 0 (false) otherwise
+uint32_t
+fp2_is_one(const fp2_t *a)
+{
+    const uint32_t re_is_one = fp_is_equal(&a->re, &ONE);
+    return re_is_one & fp_is_zero(&a->im);
+}
+
+void
+fp2_copy(fp2_t *x, const fp2_t *y)
+{ // x <- y
+    fp_copy(&x->im, &y->im);
+    fp_copy(&x->re, &y->re);
+}
+
+void
+fp2_add(fp2_t *x, const fp2_t *y, const fp2_t *z)
+{ // x <- y + z, component by component
+    fp_add(&x->im, &y->im, &z->im);
+    fp_add(&x->re, &y->re, &z->re);
+}
+
+void
+fp2_add_one(fp2_t *x, const fp2_t *y)
+{ // x <- y + 1: only the real part changes
+    fp_copy(&(x->im), &(y->im));
+    fp_add(&(x->re), &(y->re), &ONE);
+}
+
+void
+fp2_sub(fp2_t *x, const fp2_t *y, const fp2_t *z)
+{ // x <- y - z, component by component
+    fp_sub(&x->im, &y->im, &z->im);
+    fp_sub(&x->re, &y->re, &z->re);
+}
+
+void
+fp2_neg(fp2_t *x, const fp2_t *y)
+{ // x <- -y, component by component
+    fp_neg(&x->im, &y->im);
+    fp_neg(&x->re, &y->re);
+}
+
+void
+fp2_mul(fp2_t *x, const fp2_t *y, const fp2_t *z)
+{ // x <- y * z modulo X^2 + 1, using three base-field multiplications
+    fp_t t0, t1;
+
+    fp_add(&t0, &(y->re), &(y->im));
+    fp_add(&t1, &(z->re), &(z->im));
+    fp_mul(&t0, &t0, &t1);                // t0 = (y.re + y.im) * (z.re + z.im)
+    fp_mul(&t1, &(y->im), &(z->im));      // t1 = y.im * z.im
+    fp_mul(&(x->re), &(y->re), &(z->re)); // first write to x; y and z are not read after this line
+    fp_sub(&(x->im), &t0, &t1);
+    fp_sub(&(x->im), &(x->im), &(x->re)); // x.im = t0 - y.im*z.im - y.re*z.re
+    fp_sub(&(x->re), &(x->re), &t1);      // x.re = y.re*z.re - y.im*z.im
+}
+
+void
+fp2_sqr(fp2_t *x, const fp2_t *y)
+{ // x <- y^2 modulo X^2 + 1, using two base-field multiplications
+    fp_t sum, diff;
+
+    fp_add(&sum, &(y->re), &(y->im));     // sum and diff are taken before x is written
+    fp_sub(&diff, &(y->re), &(y->im));
+    fp_mul(&(x->im), &(y->re), &(y->im));
+    fp_add(&(x->im), &(x->im), &(x->im)); // x.im = 2 * y.re * y.im
+    fp_mul(&(x->re), &sum, &diff);        // x.re = (y.re + y.im) * (y.re - y.im)
+}
+
+void
+fp2_inv(fp2_t *x)
+{ // In-place inverse: x <- (re - i*im) * (re^2 + im^2)^-1
+    fp_t t0, t1;
+
+    fp_sqr(&t0, &(x->re));
+    fp_sqr(&t1, &(x->im));
+    fp_add(&t0, &t0, &t1); // t0 = re^2 + im^2
+    fp_inv(&t0);
+    fp_mul(&(x->re), &(x->re), &t0);
+    fp_mul(&(x->im), &(x->im), &t0);
+    fp_neg(&(x->im), &(x->im)); // negate the imaginary part
+}
+
+uint32_t
+fp2_is_square(const fp2_t *x)
+{ // Returns fp_is_square applied to re^2 + im^2
+    fp_t norm, im_sq;
+
+    fp_sqr(&im_sq, &x->im);
+    fp_sqr(&norm, &x->re);
+    fp_add(&norm, &norm, &im_sq);
+
+    return fp_is_square(&norm);
+}
+
+void
+fp2_sqrt(fp2_t *a)
+{ // In-place square root; the sign of the result is fixed by the parity rule at the end
+    fp_t x0, x1, t0, t1;
+
+    /* From "Optimized One-Dimensional SQIsign Verification on Intel and
+     * Cortex-M4" by Aardal et al: https://eprint.iacr.org/2024/1563 */
+
+    // x0 = \delta = sqrt(a0^2 + a1^2).
+    fp_sqr(&x0, &(a->re));
+    fp_sqr(&x1, &(a->im));
+    fp_add(&x0, &x0, &x1);
+    fp_sqrt(&x0);
+    // If a1 = 0, there is a risk of \delta = -a0, which makes x0 = 0 below.
+    // In that case, we restore the value \delta = a0.
+    fp_select(&x0, &x0, &(a->re), fp_is_zero(&(a->im)));
+    // x0 = \delta + a0, t0 = 2 * x0.
+    fp_add(&x0, &x0, &(a->re));
+    fp_add(&t0, &x0, &x0);
+
+    // x1 = t0^((p-3)/4)
+    fp_exp3div4(&x1, &t0);
+
+    // x0 = x0 * x1, x1 = x1 * a1, t1 = (2x0)^2.
+    fp_mul(&x0, &x0, &x1);
+    fp_mul(&x1, &x1, &(a->im));
+    fp_add(&t1, &x0, &x0);
+    fp_sqr(&t1, &t1);
+    // If t1 = t0, return x0 + x1*i, otherwise x1 - x0*i.
+    fp_sub(&t0, &t0, &t1);
+    uint32_t f = fp_is_zero(&t0);
+    fp_neg(&t1, &x0);
+    fp_copy(&t0, &x1);
+    fp_select(&t0, &t0, &x0, f); // from here on t0 holds the real part of the candidate root
+    fp_select(&t1, &t1, &x1, f); // and t1 holds its imaginary part
+
+    // Check if t0 is zero
+    uint32_t t0_is_zero = fp_is_zero(&t0);
+
+    // Check whether t0, t1 are odd
+    // Note: we encode to ensure canonical representation
+    uint8_t tmp_bytes[FP_ENCODED_BYTES];
+    fp_encode(tmp_bytes, &t0);
+    uint32_t t0_is_odd = -((uint32_t)tmp_bytes[0] & 1); // all-ones when bit 0 of the first encoded byte is set
+    fp_encode(tmp_bytes, &t1);
+    uint32_t t1_is_odd = -((uint32_t)tmp_bytes[0] & 1);
+
+    // We negate the output if:
+    // t0 is odd, or
+    // t0 is zero and t1 is odd
+    uint32_t negate_output = t0_is_odd | (t0_is_zero & t1_is_odd);
+    fp_neg(&x0, &t0);
+    fp_select(&(a->re), &t0, &x0, negate_output);
+    fp_neg(&x0, &t1);
+    fp_select(&(a->im), &t1, &x0, negate_output);
+}
+
+uint32_t
+fp2_sqrt_verify(fp2_t *a)
+{ // Replaces a by fp2_sqrt(a) and reports whether squaring the result gives back the input
+    fp2_t input, check;
+
+    fp2_copy(&input, a);
+    fp2_sqrt(a);
+    fp2_sqr(&check, a);
+
+    return (fp2_is_equal(&input, &check));
+}
+
+void
+fp2_half(fp2_t *x, const fp2_t *y)
+{ // x <- y / 2, component by component
+    fp_half(&x->im, &y->im);
+    fp_half(&x->re, &y->re);
+}
+
+void
+fp2_batched_inv(fp2_t *x, int len)
+{ // Inverts x[0..len-1] in place with a single fp2_inv call
+    if (len <= 0)
+        return; // nothing to invert; also keeps the VLAs below from getting a non-positive size
+    fp2_t t1[len], t2[len];
+    fp2_t inverse;
+
+    // x = x0,...,xn
+    // t1 = x0, x0*x1, ... ,x0 * x1 * ... * xn
+    fp2_copy(&t1[0], &x[0]);
+    for (int i = 1; i < len; i++) {
+        fp2_mul(&t1[i], &t1[i - 1], &x[i]);
+    }
+
+    // inverse = 1/ (x0 * x1 * ... * xn)
+    fp2_copy(&inverse, &t1[len - 1]);
+    fp2_inv(&inverse);
+    fp2_copy(&t2[0], &inverse);
+    // t2 = 1/ (x0 * x1 * ... * xn), 1/ (x0 * x1 * ... * x(n-1)) , ... , 1/x0
+    for (int i = 1; i < len; i++) {
+        fp2_mul(&t2[i], &t2[i - 1], &x[len - i]);
+    }
+
+    fp2_copy(&x[0], &t2[len - 1]); // 1/x0
+    for (int i = 1; i < len; i++) {
+        fp2_mul(&x[i], &t1[i - 1], &t2[len - i - 1]); // (x0*...*x(i-1)) / (x0*...*xi) = 1/xi
+    }
+}
+
+// exponentiation using square and multiply
+// Warning!! Not constant time!
+void
+fp2_pow_vartime(fp2_t *out, const fp2_t *x, const digit_t *exp, const int size)
+{
+    // base_pow starts at x and is squared once per scanned bit
+    fp2_t base_pow;
+
+    fp2_copy(&base_pow, x);
+    fp2_set_one(out);
+
+    // Scan the size words of exp from bit 0 of word 0 upwards
+    for (int word = 0; word < size; word++) {
+        const digit_t w = exp[word];
+
+        for (int pos = 0; pos < RADIX; pos++) {
+            if ((w >> pos) & 1) {
+                fp2_mul(out, out, &base_pow);
+            }
+            fp2_sqr(&base_pow, &base_pow);
+        }
+    }
+}
+
+void
+fp2_print(const char *name, const fp2_t *a)
+{
+    // Prints name followed by both components in hex, as re + i*im
+    const fp_t *parts[2] = { &a->re, &a->im };
+    uint8_t bytes[FP_ENCODED_BYTES];
+
+    printf("%s0x", name);
+    for (int k = 0; k < 2; k++) {
+        if (k == 1)
+            printf(" + i*0x");
+        fp_encode(&bytes, parts[k]); // Encoding ensures canonical rep
+        // the encoded bytes are printed from the last one down to the first
+        for (int i = FP_ENCODED_BYTES - 1; i >= 0; i--) {
+            printf("%02x", bytes[i]);
+        }
+    }
+    printf("\n");
+}
+
+void
+fp2_encode(void *dst, const fp2_t *a)
+{ // Output layout: encoded real part, then encoded imaginary part
+    uint8_t *out = dst;
+    fp_encode(out + FP_ENCODED_BYTES, &a->im);
+    fp_encode(out, &a->re);
+}
+
+uint32_t
+fp2_decode(fp2_t *d, const void *src)
+{ // Decodes the real part, then the imaginary part; returns the AND of both fp_decode results
+    const uint8_t *in = src;
+
+    const uint32_t re_ok = fp_decode(&d->re, in);
+    const uint32_t im_ok = fp_decode(&d->im, in + FP_ENCODED_BYTES);
+
+    return re_ok & im_ok;
+}
+
+void
+fp2_select(fp2_t *d, const fp2_t *a0, const fp2_t *a1, uint32_t ctl)
+{ // d <- a0 when ctl == 0, d <- a1 when ctl == 0xFFFFFFFF; applied to each component
+    fp_select(&d->im, &a0->im, &a1->im, ctl);
+    fp_select(&d->re, &a0->re, &a1->re, ctl);
+}
+
+void
+fp2_cswap(fp2_t *a, fp2_t *b, uint32_t ctl)
+{ // Applies fp_cswap with the same ctl to both components of a and b
+    fp_cswap(&a->im, &b->im, ctl);
+    fp_cswap(&a->re, &b->re, ctl);
+}
diff --git a/src/pqm4/sqisign_lvl5/ref/fp2.h b/src/pqm4/sqisign_lvl5/ref/fp2.h
new file mode 100644
index 0000000..00e673b
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/fp2.h
@@ -0,0 +1,41 @@
+#ifndef FP2_H
+#define FP2_H
+
+#include <sqisign_namespace.h>
+#include "fp.h"
+#include <stdio.h>
+
+// Structure for representing elements in GF(p^2)
+typedef struct fp2_t
+{
+    fp_t re, im;
+} fp2_t;
+
+void fp2_set_small(fp2_t *x, const digit_t val);
+void fp2_mul_small(fp2_t *x, const fp2_t *y, uint32_t n);
+void fp2_set_one(fp2_t *x);
+void fp2_set_zero(fp2_t *x);
+uint32_t fp2_is_zero(const fp2_t *a);
+uint32_t fp2_is_equal(const fp2_t *a, const fp2_t *b);
+uint32_t fp2_is_one(const fp2_t *a);
+void fp2_copy(fp2_t *x, const fp2_t *y);
+void fp2_add(fp2_t *x, const fp2_t *y, const fp2_t *z);
+void fp2_add_one(fp2_t *x, const fp2_t *y);
+void fp2_sub(fp2_t *x, const fp2_t *y, const fp2_t *z);
+void fp2_neg(fp2_t *x, const fp2_t *y);
+void fp2_mul(fp2_t *x, const fp2_t *y, const fp2_t *z);
+void fp2_sqr(fp2_t *x, const fp2_t *y);
+void fp2_inv(fp2_t *x);
+uint32_t fp2_is_square(const fp2_t *x);
+void fp2_sqrt(fp2_t *x);
+uint32_t fp2_sqrt_verify(fp2_t *a);
+void fp2_half(fp2_t *x, const fp2_t *y);
+void fp2_batched_inv(fp2_t *x, int len);
+void fp2_pow_vartime(fp2_t *out, const fp2_t *x, const digit_t *exp, const int size);
+void fp2_print(const char *name, const fp2_t *a);
+void fp2_encode(void *dst, const fp2_t *a);
+uint32_t fp2_decode(fp2_t *d, const void *src);
+void fp2_select(fp2_t *d, const fp2_t *a0, const fp2_t *a1, uint32_t ctl);
+void fp2_cswap(fp2_t *a, fp2_t *b, uint32_t ctl);
+
+#endif
diff --git a/src/pqm4/sqisign_lvl5/ref/fp_constants.h b/src/pqm4/sqisign_lvl5/ref/fp_constants.h
new file mode 100644
index 0000000..094cb4d
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/fp_constants.h
@@ -0,0 +1,17 @@
+#if RADIX == 32
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+#define NWORDS_FIELD 16
+#else
+#define NWORDS_FIELD 18
+#endif
+#define NWORDS_ORDER 16
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+#define NWORDS_FIELD 8
+#else
+#define NWORDS_FIELD 9
+#endif
+#define NWORDS_ORDER 8
+#endif
+#define BITS 512
+#define LOG2P 9
diff --git a/src/pqm4/sqisign_lvl5/ref/fp_p27500_32.c b/src/pqm4/sqisign_lvl5/ref/fp_p27500_32.c
new file mode 100644
index 0000000..ecf5ea7
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/fp_p27500_32.c
@@ -0,0 +1,1514 @@
+// clang-format off
+// Command line : python monty.py 32
+// 0x1afffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+
+#include <stdint.h>
+#include <stdio.h>
+
+#define sspint int32_t // signed limb type; prop() uses it for the running carry
+#define spint uint32_t // unsigned limb type; a field element is an array of 18 of these
+#define udpint uint64_t // double-width accumulator type used by modsqr
+#define dpint uint64_t // double-width accumulator type used by modmul
+
+#define Wordlength 32
+#define Nlimbs 18 // limbs per field element (the loops below hard-code 18)
+#define Radix 29 // bits kept per limb (the mask in prop is 2^29 - 1)
+#define Nbits 505
+#define Nbytes 64 // matches the 64-byte loops in modexp/modimp
+
+#define MONTGOMERY
+// propagate carries through limbs 0..16 into n[17]; returns an all-ones mask if bit 31 of n[17] is set, else 0
+inline static spint prop(spint *n) {
+  int i;
+  spint mask = ((spint)1 << 29u) - (spint)1; // low 29 bits of a limb
+  sspint carry = (sspint)n[0];
+  carry >>= 29u; // signed shift: relies on the compiler using an arithmetic shift for negative values
+  n[0] &= mask;
+  for (i = 1; i < 17; i++) {
+    carry += (sspint)n[i];
+    n[i] = (spint)carry & mask;
+    carry >>= 29u;
+  }
+  n[17] += (spint)carry; // the top limb absorbs the last carry and is not masked
+  return -((n[17] >> 1) >> 30u); // (n[17] >> 31) is 0 or 1; negating gives 0 or 0xffffffff
+}
+
+// propagate carries and add p if negative, propagate carries again; returns 1 if p was added, else 0
+inline static int flatten(spint *n) {
+  spint carry = prop(n); // all-ones when the value is negative, else 0
+  n[0] -= (spint)1u & carry; // p has -1 at limb 0 ...
+  n[17] += ((spint)0xd80u) & carry; // ... and 0xd80 at limb 17, i.e. p = 0xd80 * 2^(29*17) - 1
+  (void)prop(n);
+  return (int)(carry & 1);
+}
+
+// Montgomery final subtract: n -= p, then flatten() adds p back if that went negative; returns flatten()'s flag (1 if p was added back)
+static int modfsb(spint *n) {
+  n[0] += (spint)1u; // subtracting p = 0xd80 * 2^(29*17) - 1 means +1 at limb 0 ...
+  n[17] -= (spint)0xd80u; // ... and -0xd80 at limb 17
+  return flatten(n);
+}
+
+// Modular addition - reduce less than 2p: n = a + b - 2p, with 2p added back if that is negative
+static void modadd(const spint *a, const spint *b, spint *n) {
+  spint carry;
+  n[0] = a[0] + b[0];
+  n[1] = a[1] + b[1];
+  n[2] = a[2] + b[2];
+  n[3] = a[3] + b[3];
+  n[4] = a[4] + b[4];
+  n[5] = a[5] + b[5];
+  n[6] = a[6] + b[6];
+  n[7] = a[7] + b[7];
+  n[8] = a[8] + b[8];
+  n[9] = a[9] + b[9];
+  n[10] = a[10] + b[10];
+  n[11] = a[11] + b[11];
+  n[12] = a[12] + b[12];
+  n[13] = a[13] + b[13];
+  n[14] = a[14] + b[14];
+  n[15] = a[15] + b[15];
+  n[16] = a[16] + b[16];
+  n[17] = a[17] + b[17];
+  n[0] += (spint)2u; // subtract 2p: +2 at limb 0 ...
+  n[17] -= (spint)0x1b00u; // ... and -0x1b00 (= 2 * 0xd80) at limb 17
+  carry = prop(n); // all-ones mask when the difference is negative
+  n[0] -= (spint)2u & carry; // masked add-back of 2p
+  n[17] += ((spint)0x1b00u) & carry;
+  (void)prop(n);
+}
+
+// Modular subtraction - reduce less than 2p: n = a - b, with 2p added if that is negative
+static void modsub(const spint *a, const spint *b, spint *n) {
+  spint carry;
+  n[0] = a[0] - b[0];
+  n[1] = a[1] - b[1];
+  n[2] = a[2] - b[2];
+  n[3] = a[3] - b[3];
+  n[4] = a[4] - b[4];
+  n[5] = a[5] - b[5];
+  n[6] = a[6] - b[6];
+  n[7] = a[7] - b[7];
+  n[8] = a[8] - b[8];
+  n[9] = a[9] - b[9];
+  n[10] = a[10] - b[10];
+  n[11] = a[11] - b[11];
+  n[12] = a[12] - b[12];
+  n[13] = a[13] - b[13];
+  n[14] = a[14] - b[14];
+  n[15] = a[15] - b[15];
+  n[16] = a[16] - b[16];
+  n[17] = a[17] - b[17];
+  carry = prop(n); // all-ones mask when a - b is negative
+  n[0] -= (spint)2u & carry; // masked add of 2p: -2 at limb 0 ...
+  n[17] += ((spint)0x1b00u) & carry; // ... and +0x1b00 (= 2 * 0xd80) at limb 17
+  (void)prop(n);
+}
+
+// Modular negation: n = 0 - b, with 2p added if that is negative
+static void modneg(const spint *b, spint *n) {
+  spint carry;
+  n[0] = (spint)0 - b[0];
+  n[1] = (spint)0 - b[1];
+  n[2] = (spint)0 - b[2];
+  n[3] = (spint)0 - b[3];
+  n[4] = (spint)0 - b[4];
+  n[5] = (spint)0 - b[5];
+  n[6] = (spint)0 - b[6];
+  n[7] = (spint)0 - b[7];
+  n[8] = (spint)0 - b[8];
+  n[9] = (spint)0 - b[9];
+  n[10] = (spint)0 - b[10];
+  n[11] = (spint)0 - b[11];
+  n[12] = (spint)0 - b[12];
+  n[13] = (spint)0 - b[13];
+  n[14] = (spint)0 - b[14];
+  n[15] = (spint)0 - b[15];
+  n[16] = (spint)0 - b[16];
+  n[17] = (spint)0 - b[17];
+  carry = prop(n); // all-ones mask when -b is negative
+  n[0] -= (spint)2u & carry; // masked add of 2p: -2 at limb 0 ...
+  n[17] += ((spint)0x1b00u) & carry; // ... and +0x1b00 (= 2 * 0xd80) at limb 17
+  (void)prop(n);
+}
+
+// Overflow limit   = 18446744073709551616 (2^64, the range of the dpint accumulator t)
+// maximum possible = 5188148641189065362 (bound stated by the generator; presumably the largest value t can reach - confirm)
+// Modular multiplication, c=a*b mod 2p; from column 17 on v[k-17]*p17 is added, from column 18 on the masked column is stored to c[k-18]
+static void modmul(const spint *a, const spint *b, spint *c) {
+  dpint t = 0;
+  spint p17 = 0xd80u; // same constant flatten() uses for limb 17 of p
+  spint q = ((spint)1 << 29u); // q is unsaturated radix
+  spint mask = (spint)(q - (spint)1);
+  t += (dpint)a[0] * b[0];
+  spint v0 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[0] * b[1];
+  t += (dpint)a[1] * b[0];
+  spint v1 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[0] * b[2];
+  t += (dpint)a[1] * b[1];
+  t += (dpint)a[2] * b[0];
+  spint v2 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[0] * b[3];
+  t += (dpint)a[1] * b[2];
+  t += (dpint)a[2] * b[1];
+  t += (dpint)a[3] * b[0];
+  spint v3 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[0] * b[4];
+  t += (dpint)a[1] * b[3];
+  t += (dpint)a[2] * b[2];
+  t += (dpint)a[3] * b[1];
+  t += (dpint)a[4] * b[0];
+  spint v4 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[0] * b[5];
+  t += (dpint)a[1] * b[4];
+  t += (dpint)a[2] * b[3];
+  t += (dpint)a[3] * b[2];
+  t += (dpint)a[4] * b[1];
+  t += (dpint)a[5] * b[0];
+  spint v5 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[0] * b[6];
+  t += (dpint)a[1] * b[5];
+  t += (dpint)a[2] * b[4];
+  t += (dpint)a[3] * b[3];
+  t += (dpint)a[4] * b[2];
+  t += (dpint)a[5] * b[1];
+  t += (dpint)a[6] * b[0];
+  spint v6 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[0] * b[7];
+  t += (dpint)a[1] * b[6];
+  t += (dpint)a[2] * b[5];
+  t += (dpint)a[3] * b[4];
+  t += (dpint)a[4] * b[3];
+  t += (dpint)a[5] * b[2];
+  t += (dpint)a[6] * b[1];
+  t += (dpint)a[7] * b[0];
+  spint v7 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[0] * b[8];
+  t += (dpint)a[1] * b[7];
+  t += (dpint)a[2] * b[6];
+  t += (dpint)a[3] * b[5];
+  t += (dpint)a[4] * b[4];
+  t += (dpint)a[5] * b[3];
+  t += (dpint)a[6] * b[2];
+  t += (dpint)a[7] * b[1];
+  t += (dpint)a[8] * b[0];
+  spint v8 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[0] * b[9];
+  t += (dpint)a[1] * b[8];
+  t += (dpint)a[2] * b[7];
+  t += (dpint)a[3] * b[6];
+  t += (dpint)a[4] * b[5];
+  t += (dpint)a[5] * b[4];
+  t += (dpint)a[6] * b[3];
+  t += (dpint)a[7] * b[2];
+  t += (dpint)a[8] * b[1];
+  t += (dpint)a[9] * b[0];
+  spint v9 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[0] * b[10];
+  t += (dpint)a[1] * b[9];
+  t += (dpint)a[2] * b[8];
+  t += (dpint)a[3] * b[7];
+  t += (dpint)a[4] * b[6];
+  t += (dpint)a[5] * b[5];
+  t += (dpint)a[6] * b[4];
+  t += (dpint)a[7] * b[3];
+  t += (dpint)a[8] * b[2];
+  t += (dpint)a[9] * b[1];
+  t += (dpint)a[10] * b[0];
+  spint v10 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[0] * b[11];
+  t += (dpint)a[1] * b[10];
+  t += (dpint)a[2] * b[9];
+  t += (dpint)a[3] * b[8];
+  t += (dpint)a[4] * b[7];
+  t += (dpint)a[5] * b[6];
+  t += (dpint)a[6] * b[5];
+  t += (dpint)a[7] * b[4];
+  t += (dpint)a[8] * b[3];
+  t += (dpint)a[9] * b[2];
+  t += (dpint)a[10] * b[1];
+  t += (dpint)a[11] * b[0];
+  spint v11 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[0] * b[12];
+  t += (dpint)a[1] * b[11];
+  t += (dpint)a[2] * b[10];
+  t += (dpint)a[3] * b[9];
+  t += (dpint)a[4] * b[8];
+  t += (dpint)a[5] * b[7];
+  t += (dpint)a[6] * b[6];
+  t += (dpint)a[7] * b[5];
+  t += (dpint)a[8] * b[4];
+  t += (dpint)a[9] * b[3];
+  t += (dpint)a[10] * b[2];
+  t += (dpint)a[11] * b[1];
+  t += (dpint)a[12] * b[0];
+  spint v12 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[0] * b[13];
+  t += (dpint)a[1] * b[12];
+  t += (dpint)a[2] * b[11];
+  t += (dpint)a[3] * b[10];
+  t += (dpint)a[4] * b[9];
+  t += (dpint)a[5] * b[8];
+  t += (dpint)a[6] * b[7];
+  t += (dpint)a[7] * b[6];
+  t += (dpint)a[8] * b[5];
+  t += (dpint)a[9] * b[4];
+  t += (dpint)a[10] * b[3];
+  t += (dpint)a[11] * b[2];
+  t += (dpint)a[12] * b[1];
+  t += (dpint)a[13] * b[0];
+  spint v13 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[0] * b[14];
+  t += (dpint)a[1] * b[13];
+  t += (dpint)a[2] * b[12];
+  t += (dpint)a[3] * b[11];
+  t += (dpint)a[4] * b[10];
+  t += (dpint)a[5] * b[9];
+  t += (dpint)a[6] * b[8];
+  t += (dpint)a[7] * b[7];
+  t += (dpint)a[8] * b[6];
+  t += (dpint)a[9] * b[5];
+  t += (dpint)a[10] * b[4];
+  t += (dpint)a[11] * b[3];
+  t += (dpint)a[12] * b[2];
+  t += (dpint)a[13] * b[1];
+  t += (dpint)a[14] * b[0];
+  spint v14 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[0] * b[15];
+  t += (dpint)a[1] * b[14];
+  t += (dpint)a[2] * b[13];
+  t += (dpint)a[3] * b[12];
+  t += (dpint)a[4] * b[11];
+  t += (dpint)a[5] * b[10];
+  t += (dpint)a[6] * b[9];
+  t += (dpint)a[7] * b[8];
+  t += (dpint)a[8] * b[7];
+  t += (dpint)a[9] * b[6];
+  t += (dpint)a[10] * b[5];
+  t += (dpint)a[11] * b[4];
+  t += (dpint)a[12] * b[3];
+  t += (dpint)a[13] * b[2];
+  t += (dpint)a[14] * b[1];
+  t += (dpint)a[15] * b[0];
+  spint v15 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[0] * b[16];
+  t += (dpint)a[1] * b[15];
+  t += (dpint)a[2] * b[14];
+  t += (dpint)a[3] * b[13];
+  t += (dpint)a[4] * b[12];
+  t += (dpint)a[5] * b[11];
+  t += (dpint)a[6] * b[10];
+  t += (dpint)a[7] * b[9];
+  t += (dpint)a[8] * b[8];
+  t += (dpint)a[9] * b[7];
+  t += (dpint)a[10] * b[6];
+  t += (dpint)a[11] * b[5];
+  t += (dpint)a[12] * b[4];
+  t += (dpint)a[13] * b[3];
+  t += (dpint)a[14] * b[2];
+  t += (dpint)a[15] * b[1];
+  t += (dpint)a[16] * b[0];
+  spint v16 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[0] * b[17];
+  t += (dpint)a[1] * b[16];
+  t += (dpint)a[2] * b[15];
+  t += (dpint)a[3] * b[14];
+  t += (dpint)a[4] * b[13];
+  t += (dpint)a[5] * b[12];
+  t += (dpint)a[6] * b[11];
+  t += (dpint)a[7] * b[10];
+  t += (dpint)a[8] * b[9];
+  t += (dpint)a[9] * b[8];
+  t += (dpint)a[10] * b[7];
+  t += (dpint)a[11] * b[6];
+  t += (dpint)a[12] * b[5];
+  t += (dpint)a[13] * b[4];
+  t += (dpint)a[14] * b[3];
+  t += (dpint)a[15] * b[2];
+  t += (dpint)a[16] * b[1];
+  t += (dpint)a[17] * b[0];
+  t += (dpint)v0 * (dpint)p17; // first reduction term: v0 * p17 enters at column 17
+  spint v17 = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[1] * b[17];
+  t += (dpint)a[2] * b[16];
+  t += (dpint)a[3] * b[15];
+  t += (dpint)a[4] * b[14];
+  t += (dpint)a[5] * b[13];
+  t += (dpint)a[6] * b[12];
+  t += (dpint)a[7] * b[11];
+  t += (dpint)a[8] * b[10];
+  t += (dpint)a[9] * b[9];
+  t += (dpint)a[10] * b[8];
+  t += (dpint)a[11] * b[7];
+  t += (dpint)a[12] * b[6];
+  t += (dpint)a[13] * b[5];
+  t += (dpint)a[14] * b[4];
+  t += (dpint)a[15] * b[3];
+  t += (dpint)a[16] * b[2];
+  t += (dpint)a[17] * b[1];
+  t += (dpint)v1 * (dpint)p17;
+  c[0] = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[2] * b[17];
+  t += (dpint)a[3] * b[16];
+  t += (dpint)a[4] * b[15];
+  t += (dpint)a[5] * b[14];
+  t += (dpint)a[6] * b[13];
+  t += (dpint)a[7] * b[12];
+  t += (dpint)a[8] * b[11];
+  t += (dpint)a[9] * b[10];
+  t += (dpint)a[10] * b[9];
+  t += (dpint)a[11] * b[8];
+  t += (dpint)a[12] * b[7];
+  t += (dpint)a[13] * b[6];
+  t += (dpint)a[14] * b[5];
+  t += (dpint)a[15] * b[4];
+  t += (dpint)a[16] * b[3];
+  t += (dpint)a[17] * b[2];
+  t += (dpint)v2 * (dpint)p17;
+  c[1] = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[3] * b[17];
+  t += (dpint)a[4] * b[16];
+  t += (dpint)a[5] * b[15];
+  t += (dpint)a[6] * b[14];
+  t += (dpint)a[7] * b[13];
+  t += (dpint)a[8] * b[12];
+  t += (dpint)a[9] * b[11];
+  t += (dpint)a[10] * b[10];
+  t += (dpint)a[11] * b[9];
+  t += (dpint)a[12] * b[8];
+  t += (dpint)a[13] * b[7];
+  t += (dpint)a[14] * b[6];
+  t += (dpint)a[15] * b[5];
+  t += (dpint)a[16] * b[4];
+  t += (dpint)a[17] * b[3];
+  t += (dpint)v3 * (dpint)p17;
+  c[2] = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[4] * b[17];
+  t += (dpint)a[5] * b[16];
+  t += (dpint)a[6] * b[15];
+  t += (dpint)a[7] * b[14];
+  t += (dpint)a[8] * b[13];
+  t += (dpint)a[9] * b[12];
+  t += (dpint)a[10] * b[11];
+  t += (dpint)a[11] * b[10];
+  t += (dpint)a[12] * b[9];
+  t += (dpint)a[13] * b[8];
+  t += (dpint)a[14] * b[7];
+  t += (dpint)a[15] * b[6];
+  t += (dpint)a[16] * b[5];
+  t += (dpint)a[17] * b[4];
+  t += (dpint)v4 * (dpint)p17;
+  c[3] = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[5] * b[17];
+  t += (dpint)a[6] * b[16];
+  t += (dpint)a[7] * b[15];
+  t += (dpint)a[8] * b[14];
+  t += (dpint)a[9] * b[13];
+  t += (dpint)a[10] * b[12];
+  t += (dpint)a[11] * b[11];
+  t += (dpint)a[12] * b[10];
+  t += (dpint)a[13] * b[9];
+  t += (dpint)a[14] * b[8];
+  t += (dpint)a[15] * b[7];
+  t += (dpint)a[16] * b[6];
+  t += (dpint)a[17] * b[5];
+  t += (dpint)v5 * (dpint)p17;
+  c[4] = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[6] * b[17];
+  t += (dpint)a[7] * b[16];
+  t += (dpint)a[8] * b[15];
+  t += (dpint)a[9] * b[14];
+  t += (dpint)a[10] * b[13];
+  t += (dpint)a[11] * b[12];
+  t += (dpint)a[12] * b[11];
+  t += (dpint)a[13] * b[10];
+  t += (dpint)a[14] * b[9];
+  t += (dpint)a[15] * b[8];
+  t += (dpint)a[16] * b[7];
+  t += (dpint)a[17] * b[6];
+  t += (dpint)v6 * (dpint)p17;
+  c[5] = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[7] * b[17];
+  t += (dpint)a[8] * b[16];
+  t += (dpint)a[9] * b[15];
+  t += (dpint)a[10] * b[14];
+  t += (dpint)a[11] * b[13];
+  t += (dpint)a[12] * b[12];
+  t += (dpint)a[13] * b[11];
+  t += (dpint)a[14] * b[10];
+  t += (dpint)a[15] * b[9];
+  t += (dpint)a[16] * b[8];
+  t += (dpint)a[17] * b[7];
+  t += (dpint)v7 * (dpint)p17;
+  c[6] = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[8] * b[17];
+  t += (dpint)a[9] * b[16];
+  t += (dpint)a[10] * b[15];
+  t += (dpint)a[11] * b[14];
+  t += (dpint)a[12] * b[13];
+  t += (dpint)a[13] * b[12];
+  t += (dpint)a[14] * b[11];
+  t += (dpint)a[15] * b[10];
+  t += (dpint)a[16] * b[9];
+  t += (dpint)a[17] * b[8];
+  t += (dpint)v8 * (dpint)p17;
+  c[7] = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[9] * b[17];
+  t += (dpint)a[10] * b[16];
+  t += (dpint)a[11] * b[15];
+  t += (dpint)a[12] * b[14];
+  t += (dpint)a[13] * b[13];
+  t += (dpint)a[14] * b[12];
+  t += (dpint)a[15] * b[11];
+  t += (dpint)a[16] * b[10];
+  t += (dpint)a[17] * b[9];
+  t += (dpint)v9 * (dpint)p17;
+  c[8] = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[10] * b[17];
+  t += (dpint)a[11] * b[16];
+  t += (dpint)a[12] * b[15];
+  t += (dpint)a[13] * b[14];
+  t += (dpint)a[14] * b[13];
+  t += (dpint)a[15] * b[12];
+  t += (dpint)a[16] * b[11];
+  t += (dpint)a[17] * b[10];
+  t += (dpint)v10 * (dpint)p17;
+  c[9] = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[11] * b[17];
+  t += (dpint)a[12] * b[16];
+  t += (dpint)a[13] * b[15];
+  t += (dpint)a[14] * b[14];
+  t += (dpint)a[15] * b[13];
+  t += (dpint)a[16] * b[12];
+  t += (dpint)a[17] * b[11];
+  t += (dpint)v11 * (dpint)p17;
+  c[10] = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[12] * b[17];
+  t += (dpint)a[13] * b[16];
+  t += (dpint)a[14] * b[15];
+  t += (dpint)a[15] * b[14];
+  t += (dpint)a[16] * b[13];
+  t += (dpint)a[17] * b[12];
+  t += (dpint)v12 * (dpint)p17;
+  c[11] = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[13] * b[17];
+  t += (dpint)a[14] * b[16];
+  t += (dpint)a[15] * b[15];
+  t += (dpint)a[16] * b[14];
+  t += (dpint)a[17] * b[13];
+  t += (dpint)v13 * (dpint)p17;
+  c[12] = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[14] * b[17];
+  t += (dpint)a[15] * b[16];
+  t += (dpint)a[16] * b[15];
+  t += (dpint)a[17] * b[14];
+  t += (dpint)v14 * (dpint)p17;
+  c[13] = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[15] * b[17];
+  t += (dpint)a[16] * b[16];
+  t += (dpint)a[17] * b[15];
+  t += (dpint)v15 * (dpint)p17;
+  c[14] = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[16] * b[17];
+  t += (dpint)a[17] * b[16];
+  t += (dpint)v16 * (dpint)p17;
+  c[15] = ((spint)t & mask);
+  t >>= 29;
+  t += (dpint)a[17] * b[17];
+  t += (dpint)v17 * (dpint)p17;
+  c[16] = ((spint)t & mask);
+  t >>= 29;
+  c[17] = (spint)t; // whatever remains of the accumulator becomes the unmasked top limb
+}
+
+// Modular squaring, c=a*a  mod 2p; per column the cross products a[i]*a[j] (i<j) are summed once and doubled, then any a[k]*a[k] term is added
+static void modsqr(const spint *a, spint *c) {
+  udpint tot;
+  udpint t = 0;
+  spint p17 = 0xd80u; // same constant flatten() uses for limb 17 of p
+  spint q = ((spint)1 << 29u); // q is unsaturated radix
+  spint mask = (spint)(q - (spint)1);
+  tot = (udpint)a[0] * a[0];
+  t = tot;
+  spint v0 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[0] * a[1];
+  tot *= 2;
+  t += tot;
+  spint v1 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[0] * a[2];
+  tot *= 2;
+  tot += (udpint)a[1] * a[1];
+  t += tot;
+  spint v2 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[0] * a[3];
+  tot += (udpint)a[1] * a[2];
+  tot *= 2;
+  t += tot;
+  spint v3 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[0] * a[4];
+  tot += (udpint)a[1] * a[3];
+  tot *= 2;
+  tot += (udpint)a[2] * a[2];
+  t += tot;
+  spint v4 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[0] * a[5];
+  tot += (udpint)a[1] * a[4];
+  tot += (udpint)a[2] * a[3];
+  tot *= 2;
+  t += tot;
+  spint v5 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[0] * a[6];
+  tot += (udpint)a[1] * a[5];
+  tot += (udpint)a[2] * a[4];
+  tot *= 2;
+  tot += (udpint)a[3] * a[3];
+  t += tot;
+  spint v6 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[0] * a[7];
+  tot += (udpint)a[1] * a[6];
+  tot += (udpint)a[2] * a[5];
+  tot += (udpint)a[3] * a[4];
+  tot *= 2;
+  t += tot;
+  spint v7 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[0] * a[8];
+  tot += (udpint)a[1] * a[7];
+  tot += (udpint)a[2] * a[6];
+  tot += (udpint)a[3] * a[5];
+  tot *= 2;
+  tot += (udpint)a[4] * a[4];
+  t += tot;
+  spint v8 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[0] * a[9];
+  tot += (udpint)a[1] * a[8];
+  tot += (udpint)a[2] * a[7];
+  tot += (udpint)a[3] * a[6];
+  tot += (udpint)a[4] * a[5];
+  tot *= 2;
+  t += tot;
+  spint v9 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[0] * a[10];
+  tot += (udpint)a[1] * a[9];
+  tot += (udpint)a[2] * a[8];
+  tot += (udpint)a[3] * a[7];
+  tot += (udpint)a[4] * a[6];
+  tot *= 2;
+  tot += (udpint)a[5] * a[5];
+  t += tot;
+  spint v10 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[0] * a[11];
+  tot += (udpint)a[1] * a[10];
+  tot += (udpint)a[2] * a[9];
+  tot += (udpint)a[3] * a[8];
+  tot += (udpint)a[4] * a[7];
+  tot += (udpint)a[5] * a[6];
+  tot *= 2;
+  t += tot;
+  spint v11 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[0] * a[12];
+  tot += (udpint)a[1] * a[11];
+  tot += (udpint)a[2] * a[10];
+  tot += (udpint)a[3] * a[9];
+  tot += (udpint)a[4] * a[8];
+  tot += (udpint)a[5] * a[7];
+  tot *= 2;
+  tot += (udpint)a[6] * a[6];
+  t += tot;
+  spint v12 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[0] * a[13];
+  tot += (udpint)a[1] * a[12];
+  tot += (udpint)a[2] * a[11];
+  tot += (udpint)a[3] * a[10];
+  tot += (udpint)a[4] * a[9];
+  tot += (udpint)a[5] * a[8];
+  tot += (udpint)a[6] * a[7];
+  tot *= 2;
+  t += tot;
+  spint v13 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[0] * a[14];
+  tot += (udpint)a[1] * a[13];
+  tot += (udpint)a[2] * a[12];
+  tot += (udpint)a[3] * a[11];
+  tot += (udpint)a[4] * a[10];
+  tot += (udpint)a[5] * a[9];
+  tot += (udpint)a[6] * a[8];
+  tot *= 2;
+  tot += (udpint)a[7] * a[7];
+  t += tot;
+  spint v14 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[0] * a[15];
+  tot += (udpint)a[1] * a[14];
+  tot += (udpint)a[2] * a[13];
+  tot += (udpint)a[3] * a[12];
+  tot += (udpint)a[4] * a[11];
+  tot += (udpint)a[5] * a[10];
+  tot += (udpint)a[6] * a[9];
+  tot += (udpint)a[7] * a[8];
+  tot *= 2;
+  t += tot;
+  spint v15 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[0] * a[16];
+  tot += (udpint)a[1] * a[15];
+  tot += (udpint)a[2] * a[14];
+  tot += (udpint)a[3] * a[13];
+  tot += (udpint)a[4] * a[12];
+  tot += (udpint)a[5] * a[11];
+  tot += (udpint)a[6] * a[10];
+  tot += (udpint)a[7] * a[9];
+  tot *= 2;
+  tot += (udpint)a[8] * a[8];
+  t += tot;
+  spint v16 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[0] * a[17];
+  tot += (udpint)a[1] * a[16];
+  tot += (udpint)a[2] * a[15];
+  tot += (udpint)a[3] * a[14];
+  tot += (udpint)a[4] * a[13];
+  tot += (udpint)a[5] * a[12];
+  tot += (udpint)a[6] * a[11];
+  tot += (udpint)a[7] * a[10];
+  tot += (udpint)a[8] * a[9];
+  tot *= 2;
+  t += tot;
+  t += (udpint)v0 * p17; // first reduction term: v0 * p17 enters at column 17
+  spint v17 = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[1] * a[17];
+  tot += (udpint)a[2] * a[16];
+  tot += (udpint)a[3] * a[15];
+  tot += (udpint)a[4] * a[14];
+  tot += (udpint)a[5] * a[13];
+  tot += (udpint)a[6] * a[12];
+  tot += (udpint)a[7] * a[11];
+  tot += (udpint)a[8] * a[10];
+  tot *= 2;
+  tot += (udpint)a[9] * a[9];
+  t += tot;
+  t += (udpint)v1 * p17;
+  c[0] = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[2] * a[17];
+  tot += (udpint)a[3] * a[16];
+  tot += (udpint)a[4] * a[15];
+  tot += (udpint)a[5] * a[14];
+  tot += (udpint)a[6] * a[13];
+  tot += (udpint)a[7] * a[12];
+  tot += (udpint)a[8] * a[11];
+  tot += (udpint)a[9] * a[10];
+  tot *= 2;
+  t += tot;
+  t += (udpint)v2 * p17;
+  c[1] = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[3] * a[17];
+  tot += (udpint)a[4] * a[16];
+  tot += (udpint)a[5] * a[15];
+  tot += (udpint)a[6] * a[14];
+  tot += (udpint)a[7] * a[13];
+  tot += (udpint)a[8] * a[12];
+  tot += (udpint)a[9] * a[11];
+  tot *= 2;
+  tot += (udpint)a[10] * a[10];
+  t += tot;
+  t += (udpint)v3 * p17;
+  c[2] = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[4] * a[17];
+  tot += (udpint)a[5] * a[16];
+  tot += (udpint)a[6] * a[15];
+  tot += (udpint)a[7] * a[14];
+  tot += (udpint)a[8] * a[13];
+  tot += (udpint)a[9] * a[12];
+  tot += (udpint)a[10] * a[11];
+  tot *= 2;
+  t += tot;
+  t += (udpint)v4 * p17;
+  c[3] = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[5] * a[17];
+  tot += (udpint)a[6] * a[16];
+  tot += (udpint)a[7] * a[15];
+  tot += (udpint)a[8] * a[14];
+  tot += (udpint)a[9] * a[13];
+  tot += (udpint)a[10] * a[12];
+  tot *= 2;
+  tot += (udpint)a[11] * a[11];
+  t += tot;
+  t += (udpint)v5 * p17;
+  c[4] = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[6] * a[17];
+  tot += (udpint)a[7] * a[16];
+  tot += (udpint)a[8] * a[15];
+  tot += (udpint)a[9] * a[14];
+  tot += (udpint)a[10] * a[13];
+  tot += (udpint)a[11] * a[12];
+  tot *= 2;
+  t += tot;
+  t += (udpint)v6 * p17;
+  c[5] = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[7] * a[17];
+  tot += (udpint)a[8] * a[16];
+  tot += (udpint)a[9] * a[15];
+  tot += (udpint)a[10] * a[14];
+  tot += (udpint)a[11] * a[13];
+  tot *= 2;
+  tot += (udpint)a[12] * a[12];
+  t += tot;
+  t += (udpint)v7 * p17;
+  c[6] = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[8] * a[17];
+  tot += (udpint)a[9] * a[16];
+  tot += (udpint)a[10] * a[15];
+  tot += (udpint)a[11] * a[14];
+  tot += (udpint)a[12] * a[13];
+  tot *= 2;
+  t += tot;
+  t += (udpint)v8 * p17;
+  c[7] = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[9] * a[17];
+  tot += (udpint)a[10] * a[16];
+  tot += (udpint)a[11] * a[15];
+  tot += (udpint)a[12] * a[14];
+  tot *= 2;
+  tot += (udpint)a[13] * a[13];
+  t += tot;
+  t += (udpint)v9 * p17;
+  c[8] = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[10] * a[17];
+  tot += (udpint)a[11] * a[16];
+  tot += (udpint)a[12] * a[15];
+  tot += (udpint)a[13] * a[14];
+  tot *= 2;
+  t += tot;
+  t += (udpint)v10 * p17;
+  c[9] = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[11] * a[17];
+  tot += (udpint)a[12] * a[16];
+  tot += (udpint)a[13] * a[15];
+  tot *= 2;
+  tot += (udpint)a[14] * a[14];
+  t += tot;
+  t += (udpint)v11 * p17;
+  c[10] = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[12] * a[17];
+  tot += (udpint)a[13] * a[16];
+  tot += (udpint)a[14] * a[15];
+  tot *= 2;
+  t += tot;
+  t += (udpint)v12 * p17;
+  c[11] = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[13] * a[17];
+  tot += (udpint)a[14] * a[16];
+  tot *= 2;
+  tot += (udpint)a[15] * a[15];
+  t += tot;
+  t += (udpint)v13 * p17;
+  c[12] = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[14] * a[17];
+  tot += (udpint)a[15] * a[16];
+  tot *= 2;
+  t += tot;
+  t += (udpint)v14 * p17;
+  c[13] = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[15] * a[17];
+  tot *= 2;
+  tot += (udpint)a[16] * a[16];
+  t += tot;
+  t += (udpint)v15 * p17;
+  c[14] = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[16] * a[17];
+  tot *= 2;
+  t += tot;
+  t += (udpint)v16 * p17;
+  c[15] = ((spint)t & mask);
+  t >>= 29;
+  tot = (udpint)a[17] * a[17];
+  t += tot;
+  t += (udpint)v17 * p17;
+  c[16] = ((spint)t & mask);
+  t >>= 29;
+  c[17] = (spint)t; // whatever remains of the accumulator becomes the unmasked top limb
+}
+
+// copy the 18 limbs of a into c
+static void modcpy(const spint *a, spint *c) {
+  const spint *stop = a + 18;
+  while (a != stop) {
+    *c++ = *a++;
+  }
+}
+
+// square a in place n times via modsqr (does nothing when n <= 0)
+static void modnsqr(spint *a, int n) {
+  int left;
+  for (left = n; left > 0; left--) {
+    modsqr(a, a);
+  }
+}
+
+// Calculate progenitor z from w with a fixed chain of modsqr/modmul steps; NOTE(review): presumably w^((p-3)/4), as modinv uses x*z^4, modsqrt x*z and modqr x*z^2 - confirm
+static void modpro(const spint *w, spint *z) {
+  spint x[18];
+  spint t0[18];
+  spint t1[18];
+  spint t2[18];
+  spint t3[18];
+  spint t4[18];
+  spint t5[18];
+  spint t6[18];
+  modcpy(w, x); // work on a private copy of w
+  modcpy(x, z);
+  modnsqr(z, 2);
+  modmul(x, z, t0);
+  modmul(x, t0, z);
+  modsqr(z, t1);
+  modmul(x, t1, t1);
+  modsqr(t1, t3);
+  modsqr(t3, t2);
+  modmul(t3, t2, t4);
+  modsqr(t4, t5);
+  modcpy(t5, t2);
+  modnsqr(t2, 2);
+  modsqr(t2, t6);
+  modmul(t2, t6, t6);
+  modmul(t5, t6, t5);
+  modnsqr(t5, 5);
+  modmul(t2, t5, t2);
+  modcpy(t2, t5);
+  modnsqr(t5, 12);
+  modmul(t2, t5, t2);
+  modcpy(t2, t5);
+  modnsqr(t5, 2);
+  modmul(t2, t5, t5);
+  modmul(t4, t5, t4);
+  modsqr(t4, t5);
+  modmul(t2, t5, t2);
+  modmul(t4, t2, t4);
+  modnsqr(t4, 27);
+  modmul(t2, t4, t2);
+  modmul(t1, t2, t2);
+  modcpy(t2, t4);
+  modnsqr(t4, 2);
+  modmul(t3, t4, t3);
+  modnsqr(t3, 58);
+  modmul(t2, t3, t2);
+  modmul(z, t2, z);
+  modcpy(z, t2);
+  modnsqr(t2, 4);
+  modmul(t1, t2, t1);
+  modmul(t0, t1, t0);
+  modmul(t1, t0, t1);
+  modsqr(t1, t2);
+  modmul(t0, t2, t0);
+  modcpy(t0, t2);
+  modnsqr(t2, 2);
+  modmul(t0, t2, t2);
+  modmul(t1, t2, t1);
+  modmul(t0, t1, t0);
+  modnsqr(t1, 128);
+  modmul(t0, t1, t1);
+  modnsqr(t1, 128);
+  modmul(t0, t1, t0);
+  modnsqr(t0, 119);
+  modmul(z, t0, z);
+}
+
+// calculate inverse as z = x * pro^4, where pro is the supplied progenitor h, or modpro(x) when h is NULL
+static void modinv(const spint *x, const spint *h, spint *z) {
+  spint base[18];
+  spint pro[18];
+  modcpy(x, base);
+  if (h != NULL) {
+    modcpy(h, pro);
+  } else {
+    modpro(x, pro);
+  }
+  modnsqr(pro, 2);
+  modmul(base, pro, z);
+}
+
+// Convert m to n-residue form, n=nres(m): one modmul of m by the fixed constant c below
+static void nres(const spint *m, spint *n) {
+  const spint c[18] = {0x19a29700u, 0x12f6878u,  0x17b425edu, 0x1a12f684u,
+                       0x97b425eu,  0x1da12f68u, 0x1097b425u, 0xbda12f6u,
+                       0xd097b42u,  0x4bda12fu,  0x1ed097b4u, 0x84bda12u,
+                       0x5ed097bu,  0x1684bda1u, 0x25ed097u,  0xf684bdau,
+                       0x1425ed09u, 0x4bdu}; // NOTE(review): presumably R^2 mod p for the Montgomery radix R - confirm against the generator
+  modmul(m, c, n);
+}
+
+// Convert n back to normal form, m=redc(n): modmul by the plain integer 1, then a final subtract
+static void redc(const spint *n, spint *m) {
+  spint unit[18];
+  int k;
+  for (k = 17; k > 0; k--) {
+    unit[k] = 0;
+  }
+  unit[0] = 1;
+  modmul(n, unit, m);
+  (void)modfsb(m);
+}
+
+// is unity? returns 1 when redc(a) has limb 0 equal to 1 and limbs 1..17 all zero, else 0 (the test assumes limbs stay below 2^29)
+static int modis1(const spint *a) {
+  int i;
+  spint c[18];
+  spint c0;
+  spint d = 0; // OR of limbs 1..17
+  redc(a, c);
+  for (i = 1; i < 18; i++) {
+    d |= c[i];
+  }
+  c0 = (spint)c[0];
+  return ((spint)1 & ((d - (spint)1) >> 29u) & // d - 1 wraps to all-ones only when d == 0, which sets bit 29
+          (((c0 ^ (spint)1) - (spint)1) >> 29u)); // same trick: c0 ^ 1 is zero only when c0 == 1
+}
+
+// is zero? returns 1 when the OR of all limbs of redc(a) is 0; the (acc - 1) >> 29 test assumes limbs stay below 2^29
+static int modis0(const spint *a) {
+  spint plain[18];
+  spint acc = 0;
+  int k;
+  redc(a, plain);
+  for (k = 17; k >= 0; k--) {
+    acc |= plain[k];
+  }
+  return ((spint)1 & ((acc - (spint)1) >> 29u));
+}
+
+// clear all 18 limbs of a
+static void modzer(spint *a) {
+  int limb;
+  for (limb = 17; limb >= 0; limb--) {
+    a[limb] = (spint)0;
+  }
+}
+
+// set a to one: load the plain integer 1, then convert it in place with nres()
+static void modone(spint *a) {
+  int limb;
+  for (limb = 17; limb > 0; limb--) {
+    a[limb] = (spint)0;
+  }
+  a[0] = (spint)1;
+  nres(a, a);
+}
+
+// set a to the integer x: x is cast to spint into limb 0, the other limbs are cleared, then nres() converts in place
+static void modint(int x, spint *a) {
+  int limb;
+  for (limb = 17; limb > 0; limb--) {
+    a[limb] = (spint)0;
+  }
+  a[0] = (spint)x;
+  nres(a, a);
+}
+
+// Modular multiplication by an integer, c=a*b mod 2p: b is first converted with modint(), then multiplied with modmul()
+static void modmli(const spint *a, int b, spint *c) {
+  spint factor[18];
+  modint(b, factor);
+  modmul(a, factor, c);
+}
+
+// Test for quadratic residue: returns 1 if x * pro^2 is unity or x is zero (pro is h, or modpro(x) when h is NULL)
+static int modqr(const spint *h, const spint *x) {
+  spint chi[18];
+  if (h != NULL) {
+    modsqr(h, chi);
+  } else {
+    modpro(x, chi);
+    modsqr(chi, chi);
+  }
+  modmul(chi, x, chi);
+  return modis1(chi) | modis0(x);
+}
+
+// conditional move g to f if b=1 (f keeps its value if b=0); b is expected to be 0 or 1
+// strongly recommend inlining be disabled using compiler specific syntax
+static void modcmv(int b, const spint *g, volatile spint *f) {
+  int i;
+  spint c0, c1, s, t;
+  spint r = 0x5aa5a55au; // fixed constant mixed into both coefficients and removed again below
+  c0 = (1 - b) + r;
+  c1 = b + r;
+  for (i = 0; i < 18; i++) {
+    s = g[i];
+    t = f[i];
+    f[i] = c0 * t + c1 * s; // = (1-b)*t + b*s + r*(t+s)  (mod 2^32)
+    f[i] -= r * (t + s); // leaves (1-b)*t + b*s: t when b=0, s when b=1
+  }
+}
+
+// conditional swap g and f if b=1 (both keep their values if b=0); b is expected to be 0 or 1
+// strongly recommend inlining be disabled using compiler specific syntax
+static void modcsw(int b, volatile spint *g, volatile spint *f) {
+  int i;
+  spint c0, c1, s, t, w;
+  spint r = 0x5aa5a55au; // fixed constant mixed into both coefficients and removed again via w
+  c0 = (1 - b) + r;
+  c1 = b + r;
+  for (i = 0; i < 18; i++) {
+    s = g[i];
+    t = f[i];
+    w = r * (t + s); // the r-dependent part shared by both results
+    f[i] = c0 * t + c1 * s;
+    f[i] -= w; // f[i] = (1-b)*t + b*s
+    g[i] = c0 * s + c1 * t;
+    g[i] -= w; // g[i] = (1-b)*s + b*t
+  }
+}
+
+// Modular square root candidate r = x * pro, with pro = h when supplied, or modpro(x) when h is NULL
+static void modsqrt(const spint *x, const spint *h, spint *r) {
+  spint root[18];
+  spint pro[18];
+  if (h != NULL) {
+    modcpy(h, pro);
+  } else {
+    modpro(x, pro);
+  }
+  modmul(pro, x, root);
+  modcpy(root, r);
+}
+
+// shift left by less than a word: moves the 18-limb value up by n bits; n must not exceed 29 because the code shifts by (29u - n)
+static void modshl(unsigned int n, spint *a) {
+  int i;
+  a[17] = ((a[17] << n)) | (a[16] >> (29u - n)); // top limb is not masked to 29 bits
+  for (i = 16; i > 0; i--) {
+    a[i] = ((a[i] << n) & (spint)0x1fffffff) | (a[i - 1] >> (29u - n));
+  }
+  a[0] = (a[0] << n) & (spint)0x1fffffff;
+}
+
+// shift right by less than a word. Return shifted out part (the low n bits of a[0]); n must not exceed 29 because the code shifts by (29u - n)
+static int modshr(unsigned int n, spint *a) {
+  int i;
+  spint r = a[0] & (((spint)1 << n) - (spint)1); // bits about to fall off the bottom
+  for (i = 0; i < 17; i++) {
+    a[i] = (a[i] >> n) | ((a[i + 1] << (29u - n)) & (spint)0x1fffffff);
+  }
+  a[17] = a[17] >> n;
+  return r;
+}
+
+// set a= 2^r, converted with nres(); a is left as all-zero limbs when r >= 512
+static void mod2r(unsigned int r, spint *a) {
+  unsigned int n = r / 29u; // limb that holds bit r
+  unsigned int m = r % 29u; // bit position inside that limb
+  modzer(a);
+  if (r >= 64 * 8)
+    return;
+  a[n] = 1;
+  a[n] <<= m;
+  nres(a, a);
+}
+
+// export to byte array: writes redc(a) as 64 bytes into b, least significant byte at b[63]
+static void modexp(const spint *a, char *b) {
+  int i;
+  spint c[18];
+  redc(a, c);
+  for (i = 63; i >= 0; i--) {
+    b[i] = c[0] & (spint)0xff; // take the current low byte ...
+    (void)modshr(8, c); // ... then drop it from c
+  }
+}
+
+// import from byte array: reads 64 bytes from b (b[0] first, so it ends up most significant) and converts with nres()
+// returns 1 if in range, else 0 (the value returned by modfsb)
+static int modimp(const char *b, spint *a) {
+  int i, res;
+  for (i = 0; i < 18; i++) {
+    a[i] = 0;
+  }
+  for (i = 0; i < 64; i++) {
+    modshl(8, a); // make room for the next byte
+    a[0] += (spint)(unsigned char)b[i];
+  }
+  res = modfsb(a); // one subtraction of p; p is added back if the result is negative
+  nres(a, a);
+  return res;
+}
+
+// determine sign: the lowest bit of redc(a)
+static int modsign(const spint *a) {
+  spint plain[18];
+  redc(a, plain);
+  return (int)(plain[0] & (spint)1);
+}
+
+// return 1 if redc(a) and redc(b) agree in every limb, else 0; no early exit on the first difference
+static int modcmp(const spint *a, const spint *b) {
+  spint lhs[18], rhs[18];
+  int k, same = 1;
+  redc(a, lhs);
+  redc(b, rhs);
+  for (k = 17; k >= 0; k--) {
+    same &= (((lhs[k] ^ rhs[k]) - 1) >> 29) & 1;
+  }
+  return same;
+}
+
+// clang-format on
+/******************************************************************************
+ API functions calling generated code above
+ ******************************************************************************/
+
+#include <fp.h>
+
+const digit_t ZERO[NWORDS_FIELD] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, // all limbs zero
+                                     0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 };
+const digit_t ONE[NWORDS_FIELD] = { 0x00025ed0, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, // Montgomery representation of 1
+                                    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000800 }; // exactly twice TWO_INV, limb by limb
+// Montgomery representation of 2^-1
+static const digit_t TWO_INV[NWORDS_FIELD] = { 0x00012f68, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                               0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                               0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000400 };
+// Montgomery representation of 3^-1
+static const digit_t THREE_INV[NWORDS_FIELD] = {
+    0x15561f9a, 0x0aaaaaaa, 0x15555555, 0x0aaaaaaa, 0x15555555, 0x0aaaaaaa, 0x15555555, 0x0aaaaaaa, 0x15555555,
+    0x0aaaaaaa, 0x15555555, 0x0aaaaaaa, 0x15555555, 0x0aaaaaaa, 0x15555555, 0x0aaaaaaa, 0x15555555, 0x00000baa
+};
+// Montgomery representation of 2^512
+static const digit_t R2[NWORDS_FIELD] = { 0x03c668a5, 0x0f684bda, 0x1425ed09, 0x12f684bd, 0x1b425ed0, 0x012f684b,
+                                          0x17b425ed, 0x1a12f684, 0x097b425e, 0x1da12f68, 0x1097b425, 0x0bda12f6,
+                                          0x0d097b42, 0x04bda12f, 0x1ed097b4, 0x084bda12, 0x05ed097b, 0x00000a21 };
+
+void
+fp_set_small(fp_t *x, const digit_t val)
+{
+    modint((int)val, *x); // val is narrowed to int before reaching the generated modint(); result goes to *x
+}
+
+void
+fp_mul_small(fp_t *x, const fp_t *a, const uint32_t val)
+{
+    modmli(*a, (int)val, *x); // multiplier is narrowed to int for the generated modmli(); destination is last
+}
+
+void
+fp_set_zero(fp_t *x)
+{
+    modzer(*x); // thin wrapper: delegates to the generated modzer()
+}
+
+void
+fp_set_one(fp_t *x)
+{
+    modone(*x); // thin wrapper: delegates to the generated modone()
+}
+
+uint32_t
+fp_is_equal(const fp_t *a, const fp_t *b)
+{
+    return -(uint32_t)modcmp(*a, *b); // modcmp() gives 1/0; unsigned negation turns that into 0xFFFFFFFF / 0
+}
+
+uint32_t
+fp_is_zero(const fp_t *a)
+{
+    return -(uint32_t)modis0(*a); // 0xFFFFFFFF when modis0() returns 1, 0 when it returns 0
+}
+
+void
+fp_copy(fp_t *out, const fp_t *a)
+{
+    modcpy(*a, *out); // note the swapped order: the generated modcpy() takes source first, destination last
+}
+
+void
+fp_cswap(fp_t *a, fp_t *b, uint32_t ctl)
+{
+    modcsw((int)(ctl & 0x1), *a, *b); // only the lowest bit of ctl is forwarded as the swap flag
+}
+
+void
+fp_add(fp_t *out, const fp_t *a, const fp_t *b)
+{
+    modadd(*a, *b, *out); // generated modadd(): operands first, destination last
+}
+
+void
+fp_sub(fp_t *out, const fp_t *a, const fp_t *b)
+{
+    modsub(*a, *b, *out); // generated modsub(): *a then *b, destination last
+}
+
+void
+fp_neg(fp_t *out, const fp_t *a)
+{
+    modneg(*a, *out); // generated modneg(): source first, destination last
+}
+
+void
+fp_sqr(fp_t *out, const fp_t *a)
+{
+    modsqr(*a, *out); // generated modsqr(): source first, destination last
+}
+
+void
+fp_mul(fp_t *out, const fp_t *a, const fp_t *b)
+{
+    modmul(*a, *b, *out); // generated modmul(): operands first, destination last
+}
+
+void
+fp_inv(fp_t *x)
+{
+    modinv(*x, NULL, *x); // in place: *x is both input and output; NULL is passed as modinv()'s middle argument
+}
+
+uint32_t
+fp_is_square(const fp_t *a)
+{
+    return -(uint32_t)modqr(NULL, *a); // 0xFFFFFFFF when modqr() returns 1, 0 when it returns 0; NULL is its first argument
+}
+
+void
+fp_sqrt(fp_t *a)
+{
+    modsqrt(*a, NULL, *a); // in place: *a is both input and output; NULL is passed as modsqrt()'s middle argument
+}
+
+void
+fp_half(fp_t *out, const fp_t *a)
+{
+    modmul(TWO_INV, *a, *out); // halving is done as a multiplication by the constant 2^-1
+}
+
+void
+fp_exp3div4(fp_t *out, const fp_t *a)
+{
+    modpro(*a, *out); // delegates to the generated modpro(); presumably the (p-3)/4 power, going by this wrapper's name (confirm)
+}
+
+void
+fp_div3(fp_t *out, const fp_t *a)
+{
+    modmul(THREE_INV, *a, *out); // division by 3 is done as a multiplication by the constant 3^-1
+}
+
+void
+fp_encode(void *dst, const fp_t *a)
+{
+    // Modified version of modexp(): writes 64 bytes to dst, one per modshr(8, c) step on redc(*a)
+    int i;
+    spint c[18];
+    redc(*a, c);
+    for (i = 0; i < 64; i++) {
+        ((unsigned char *)dst)[i] = (unsigned char)(c[0] & (spint)0xff); // unsigned store: 0x80..0xff never pass through a signed char
+        (void)modshr(8, c);
+    }
+}
+
+uint32_t
+fp_decode(fp_t *d, const void *src)
+{
+    // Modified version of modimp()
+    int k;
+    spint mask;
+    const unsigned char *bytes = src;
+    for (k = 0; k < 18; k++) {
+        (*d)[k] = 0;
+    }
+    for (k = 64; k-- > 0;) {
+        modshl(8, *d);
+        (*d)[0] += (spint)bytes[k];
+    }
+    mask = (spint)-modfsb(*d);
+    nres(*d, *d);
+    // A canonical value gives mask = -1 and is kept; otherwise mask = 0 and *d is cleared
+    for (k = 0; k < 18; k++) {
+        (*d)[k] &= mask;
+    }
+    return (uint32_t)mask;
+}
+
+static inline unsigned char
+add_carry(unsigned char cc, spint a, spint b, spint *d)
+{
+    const udpint sum = (udpint)cc + (udpint)a + (udpint)b; // widened so the carry-out is not lost
+    *d = (spint)sum;
+    return (unsigned char)(sum >> Wordlength);
+}
+
+static void
+partial_reduce(spint *out, const spint *src)
+{
+    spint h, l, quo, rem;
+    unsigned char cc;
+
+    // Fold a 16-word value: split it in high (12 bits) and low (500 bits) parts.
+    h = src[15] >> 20;
+    l = src[15] & 0x000FFFFF;
+
+    // 27*2^500 = 1 mod q; hence, we add floor(h/27) + (h mod 27)*2^500
+    // to the low part.
+    quo = (h * 0x12F7) >> 17; // floor(h / 27): 0x12F7 / 2^17 sits just above 1/27, exact for a 12-bit h
+    rem = h - (27 * quo); // h mod 27
+    cc = add_carry(0, src[0], quo, &out[0]); // quo enters at the lowest word; the carry ripples upward
+    cc = add_carry(cc, src[1], 0, &out[1]);
+    cc = add_carry(cc, src[2], 0, &out[2]);
+    cc = add_carry(cc, src[3], 0, &out[3]);
+    cc = add_carry(cc, src[4], 0, &out[4]);
+    cc = add_carry(cc, src[5], 0, &out[5]);
+    cc = add_carry(cc, src[6], 0, &out[6]);
+    cc = add_carry(cc, src[7], 0, &out[7]);
+    cc = add_carry(cc, src[8], 0, &out[8]);
+    cc = add_carry(cc, src[9], 0, &out[9]);
+    cc = add_carry(cc, src[10], 0, &out[10]);
+    cc = add_carry(cc, src[11], 0, &out[11]);
+    cc = add_carry(cc, src[12], 0, &out[12]);
+    cc = add_carry(cc, src[13], 0, &out[13]);
+    cc = add_carry(cc, src[14], 0, &out[14]);
+    (void)add_carry(cc, l, rem << 20, &out[15]); // top word: low 20 bits plus (h mod 27) placed at bit 20; carry-out discarded
+}
+
+// Store x at dst as four bytes, least-significant byte first.
+static inline void
+enc32le(void *dst, uint32_t x)
+{
+    uint8_t *out = dst;
+    unsigned shift;
+    for (shift = 0; shift < 32; shift += 8) {
+        *out++ = (uint8_t)(x >> shift);
+    }
+}
+
+// Read the four bytes at src as a 32-bit integer, least-significant byte first.
+static inline uint32_t
+dec32le(const void *src)
+{
+    const uint8_t *in = src;
+    return ((spint)in[3] << 24) | ((spint)in[2] << 16) | ((spint)in[1] << 8) | (spint)in[0];
+}
+
+void
+fp_decode_reduce(fp_t *d, const void *src, size_t len)
+{
+    uint32_t t[16];  // Stores Nbytes * 8 bits
+    uint8_t tmp[64]; // Nbytes
+    const uint8_t *b = src;
+
+    fp_set_zero(d);
+    if (len == 0) {
+        return;
+    }
+
+    size_t rem = len % 64;
+    if (rem != 0) {
+        // Input size is not a multiple of 64, we decode a partial block of at
+        // most 63 bytes, zero-padded: it is below 2^504 < q, hence canonical.
+        size_t k = len - rem;
+        memcpy(tmp, b + k, len - k);
+        memset(tmp + len - k, 0, (sizeof tmp) - (len - k));
+        fp_decode(d, tmp);
+        len = k;
+    }
+    // Process all remaining blocks, in descending address order.
+    while (len > 0) {
+        fp_mul(d, d, &R2); // shift the accumulator up by one 64-byte block (R2 stands for 2^512)
+        len -= 64;
+        t[0] = dec32le(b + len);
+        t[1] = dec32le(b + len + 4);
+        t[2] = dec32le(b + len + 8);
+        t[3] = dec32le(b + len + 12);
+        t[4] = dec32le(b + len + 16);
+        t[5] = dec32le(b + len + 20);
+        t[6] = dec32le(b + len + 24);
+        t[7] = dec32le(b + len + 28);
+        t[8] = dec32le(b + len + 32);
+        t[9] = dec32le(b + len + 36);
+        t[10] = dec32le(b + len + 40);
+        t[11] = dec32le(b + len + 44);
+        t[12] = dec32le(b + len + 48);
+        t[13] = dec32le(b + len + 52);
+        t[14] = dec32le(b + len + 56);
+        t[15] = dec32le(b + len + 60);
+        partial_reduce(t, t); // in place: brings the 512-bit block down before it is decoded
+        enc32le(tmp, t[0]);
+        enc32le(tmp + 4, t[1]);
+        enc32le(tmp + 8, t[2]);
+        enc32le(tmp + 12, t[3]);
+        enc32le(tmp + 16, t[4]);
+        enc32le(tmp + 20, t[5]);
+        enc32le(tmp + 24, t[6]);
+        enc32le(tmp + 28, t[7]);
+        enc32le(tmp + 32, t[8]);
+        enc32le(tmp + 36, t[9]);
+        enc32le(tmp + 40, t[10]);
+        enc32le(tmp + 44, t[11]);
+        enc32le(tmp + 48, t[12]);
+        enc32le(tmp + 52, t[13]);
+        enc32le(tmp + 56, t[14]);
+        enc32le(tmp + 60, t[15]);
+        fp_t a;
+        fp_decode(&a, tmp); // NOTE(review): fp_decode clears non-canonical input; partial_reduce output looks able to reach q..q+150 in a rare corner case, confirm
+        fp_add(d, d, &a);
+    }
+}
diff --git a/src/pqm4/sqisign_lvl5/ref/hd.c b/src/pqm4/sqisign_lvl5/ref/hd.c
new file mode 100644
index 0000000..0424108
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/hd.c
@@ -0,0 +1,93 @@
+#include <hd.h>
+#include <assert.h>
+
+void
+double_couple_point(theta_couple_point_t *out, const theta_couple_point_t *in, const theta_couple_curve_t *E1E2)
+{
+    ec_dbl(&out->P1, &in->P1, &E1E2->E1); // first component is doubled on E1
+    ec_dbl(&out->P2, &in->P2, &E1E2->E2); // second component is doubled on E2
+}
+
+void
+double_couple_point_iter(theta_couple_point_t *out,
+                         unsigned n,
+                         const theta_couple_point_t *in,
+                         const theta_couple_curve_t *E1E2)
+{
+    if (n == 0) {
+        memmove(out, in, sizeof(theta_couple_point_t)); // no doubling requested: plain copy
+        return;
+    }
+    double_couple_point(out, in, E1E2); // first doubling reads in, the rest work on out in place
+    for (unsigned left = n - 1; left > 0; left--) {
+        double_couple_point(out, out, E1E2);
+    }
+}
+
+void
+add_couple_jac_points(theta_couple_jac_point_t *out,
+                      const theta_couple_jac_point_t *T1,
+                      const theta_couple_jac_point_t *T2,
+                      const theta_couple_curve_t *E1E2)
+{
+    ADD(&out->P1, &T1->P1, &T2->P1, &E1E2->E1); // first components are added on E1
+    ADD(&out->P2, &T1->P2, &T2->P2, &E1E2->E2); // second components are added on E2
+}
+
+void
+double_couple_jac_point(theta_couple_jac_point_t *out,
+                        const theta_couple_jac_point_t *in,
+                        const theta_couple_curve_t *E1E2)
+{
+    DBL(&out->P1, &in->P1, &E1E2->E1); // first component is doubled on E1
+    DBL(&out->P2, &in->P2, &E1E2->E2); // second component is doubled on E2
+}
+
+void
+double_couple_jac_point_iter(theta_couple_jac_point_t *out,
+                             unsigned n,
+                             const theta_couple_jac_point_t *in,
+                             const theta_couple_curve_t *E1E2)
+{
+    if (n == 0) {
+        *out = *in;
+        return;
+    }
+    if (n == 1) {
+        double_couple_jac_point(out, in, E1E2);
+        return;
+    }
+    fp2_t a1, a2, t1, t2;
+
+    jac_to_ws(&out->P1, &t1, &a1, &in->P1, &E1E2->E1);
+    jac_to_ws(&out->P2, &t2, &a2, &in->P2, &E1E2->E2);
+    // n >= 2: run all n doublings with DBLW, then convert back with jac_from_ws
+    for (unsigned step = 0; step < n; step++) {
+        DBLW(&out->P1, &t1, &out->P1, &t1);
+        DBLW(&out->P2, &t2, &out->P2, &t2);
+    }
+
+    jac_from_ws(&out->P1, &out->P1, &a1, &E1E2->E1);
+    jac_from_ws(&out->P2, &out->P2, &a2, &E1E2->E2);
+}
+
+void
+couple_jac_to_xz(theta_couple_point_t *P, const theta_couple_jac_point_t *xyP)
+{
+    jac_to_xz(&P->P1, &xyP->P1); // each component is converted independently
+    jac_to_xz(&P->P2, &xyP->P2);
+}
+
+void
+copy_bases_to_kernel(theta_kernel_couple_points_t *ker, const ec_basis_t *B1, const ec_basis_t *B2)
+{
+    // T1 pairs the P points of the two bases
+    copy_point(&ker->T1.P1, &B1->P);
+    copy_point(&ker->T1.P2, &B2->P);
+    // T2 pairs the Q points
+    copy_point(&ker->T2.P1, &B1->Q);
+    copy_point(&ker->T2.P2, &B2->Q);
+    // T1 - T2 pairs the PmQ points
+    copy_point(&ker->T1m2.P1, &B1->PmQ);
+    copy_point(&ker->T1m2.P2, &B2->PmQ);
+}
diff --git a/src/pqm4/sqisign_lvl5/ref/hd.h b/src/pqm4/sqisign_lvl5/ref/hd.h
new file mode 100644
index 0000000..2b16e23
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/hd.h
@@ -0,0 +1,435 @@
+/** @file
+ *
+ * @authors Antonin Leroux
+ *
+ * @brief The HD-isogenies algorithm required by the signature
+ *
+ */
+
+#ifndef HD_H
+#define HD_H
+
+#include <sqisign_namespace.h>
+#include <ec.h>
+#include <stdio.h>
+
+/** @defgroup hd_module Abelian surfaces and their isogenies
+ * @{
+ */
+
+#define HD_extra_torsion 2
+
+/** @defgroup hd_struct Data structures for dimension 2
+ * @{
+ */
+
+/** @brief Type for couple point with XZ coordinates
+ * @typedef theta_couple_point_t
+ *
+ * @struct theta_couple_point
+ *
+ * A pair (P1, P2) of points in XZ coordinates, one for each
+ * factor of an elliptic product E1 x E2
+ */
+typedef struct theta_couple_point
+{
+    ec_point_t P1; // component used together with E1
+    ec_point_t P2; // component used together with E2
+} theta_couple_point_t;
+
+/** @brief Type for three couple points T1, T2, T1-T2 with XZ coordinates
+ * @typedef theta_kernel_couple_points_t
+ *
+ * @struct theta_kernel_couple_points
+ *
+ * Structure for a triple of theta couple points T1, T2 and T1 - T2
+ */
+typedef struct theta_kernel_couple_points
+{
+    theta_couple_point_t T1;
+    theta_couple_point_t T2;
+    theta_couple_point_t T1m2; // the difference T1 - T2
+} theta_kernel_couple_points_t;
+
+/** @brief Type for couple point with XYZ coordinates
+ * @typedef theta_couple_jac_point_t
+ *
+ * @struct theta_couple_jac_point
+ *
+ * A pair (P1, P2) of points in XYZ coordinates, one for each
+ * factor of an elliptic product E1 x E2
+ */
+typedef struct theta_couple_jac_point
+{
+    jac_point_t P1; // component used together with E1
+    jac_point_t P2; // component used together with E2
+} theta_couple_jac_point_t;
+
+/** @brief Type for couple curve
+ * @typedef theta_couple_curve_t
+ *
+ * @struct theta_couple_curve
+ *
+ * The two curves E1 and E2 making up an elliptic product E1 x E2
+ */
+typedef struct theta_couple_curve
+{
+    ec_curve_t E1;
+    ec_curve_t E2;
+} theta_couple_curve_t;
+
+/** @brief Type for a product E1 x E2 with corresponding bases
+ * @typedef theta_couple_curve_with_basis_t
+ *
+ * @struct theta_couple_curve_with_basis
+ *
+ * Type for a product E1 x E2 with corresponding bases Ei[2^n]
+ */
+typedef struct theta_couple_curve_with_basis
+{
+    ec_curve_t E1;
+    ec_curve_t E2;
+    ec_basis_t B1; // basis attached to E1
+    ec_basis_t B2; // basis attached to E2
+} theta_couple_curve_with_basis_t;
+
+/** @brief Type for theta point
+ * @typedef theta_point_t
+ *
+ * @struct theta_point
+ *
+ * A theta point given by four fp2_t coordinates (x, y, z, t)
+ */
+typedef struct theta_point
+{
+    fp2_t x;
+    fp2_t y;
+    fp2_t z;
+    fp2_t t;
+} theta_point_t;
+
+/** @brief Type for theta point with repeating components
+ * @typedef theta_point_compact_t
+ *
+ * @struct theta_point_compact
+ *
+ * Two-coordinate (x, y) form used for theta points with repeated components
+ */
+typedef struct theta_point_compact
+{
+    fp2_t x;
+    fp2_t y;
+} theta_point_compact_t;
+
+/** @brief Type for theta structure
+ * @typedef theta_structure_t
+ *
+ * @struct theta_structure
+ *
+ * A theta null point together with eight cached values derived for it
+ */
+typedef struct theta_structure
+{
+    theta_point_t null_point;
+    bool precomputation; // presumably records whether the eight values below are filled in (confirm)
+
+    // Eight precomputed values used for doubling and
+    // (2,2)-isogenies.
+    fp2_t XYZ0;
+    fp2_t YZT0;
+    fp2_t XZT0;
+    fp2_t XYT0;
+
+    fp2_t xyz0;
+    fp2_t yzt0;
+    fp2_t xzt0;
+    fp2_t xyt0;
+} theta_structure_t;
+
+/** @brief A 2x2 matrix used for action by translation
+ * @typedef translation_matrix_t
+ *
+ * @struct translation_matrix
+ *
+ * Holds the four fp2_t entries g00, g01, g10, g11 of a 2x2 matrix used when
+ * computing a compatible theta structure during gluing.
+ */
+typedef struct translation_matrix
+{
+    fp2_t g00;
+    fp2_t g01;
+    fp2_t g10;
+    fp2_t g11;
+} translation_matrix_t;
+
+/** @brief A 4x4 matrix used for basis changes
+ * @typedef basis_change_matrix_t
+ *
+ * @struct basis_change_matrix
+ *
+ * Structure to hold 16 fp2_t elements representing a 4x4 matrix used for
+ * changing the basis of a theta point.
+ */
+typedef struct basis_change_matrix
+{
+    fp2_t m[4][4];
+} basis_change_matrix_t;
+
+/** @brief Type for gluing (2,2) theta isogeny
+ * @typedef theta_gluing_t
+ *
+ * @struct theta_gluing
+ *
+ * Data kept for a gluing isogeny: domain product, two K1_8 points, matrix M, precomputation, codomain
+ */
+typedef struct theta_gluing
+{
+
+    theta_couple_curve_t domain; // the elliptic product the isogeny starts from
+    theta_couple_jac_point_t xyK1_8; // couple point in XYZ coordinates
+    theta_point_compact_t imageK1_8; // compact theta point; presumably the image of xyK1_8 (confirm)
+    basis_change_matrix_t M;
+    theta_point_t precomputation;
+    theta_point_t codomain;
+
+} theta_gluing_t;
+
+/** @brief Type for standard (2,2) theta isogeny
+ * @typedef theta_isogeny_t
+ *
+ * @struct theta_isogeny
+ *
+ * Data kept for one isogeny step: two theta points, two flags, domain, precomputation, codomain
+ */
+typedef struct theta_isogeny
+{
+    theta_point_t T1_8;
+    theta_point_t T2_8;
+    bool hadamard_bool_1; // presumably selects a Hadamard transform on one side of the step (confirm)
+    bool hadamard_bool_2;
+    theta_structure_t domain;
+    theta_point_t precomputation;
+    theta_structure_t codomain;
+} theta_isogeny_t;
+
+/** @brief Type for splitting isomorphism
+ * @typedef theta_splitting_t
+ *
+ * @struct theta_splitting
+ *
+ * A basis change matrix M paired with a theta structure B
+ */
+typedef struct theta_splitting
+{
+    basis_change_matrix_t M;
+    theta_structure_t B;
+
+} theta_splitting_t;
+
+// end of hd_struct
+/**
+ * @}
+ */
+
+/** @defgroup hd_functions Functions for dimension 2
+ * @{
+ */
+
+/**
+ * @brief Compute the double of the theta couple point `in` on the elliptic product E1E2
+ *
+ * @param out Output: the theta_couple_point
+ * @param in the theta couple point in the elliptic product
+ * @param E1E2 an elliptic product
+ * in = (P1,P2)
+ * out = [2] (P1,P2)
+ *
+ */
+void double_couple_point(theta_couple_point_t *out, const theta_couple_point_t *in, const theta_couple_curve_t *E1E2);
+
+/**
+ * @brief Compute the iterated double of the theta couple point `in` on the elliptic product E1E2
+ *
+ * @param out Output: the theta_couple_point
+ * @param n the number of doublings (0 copies in to out)
+ * @param in the theta couple point in the elliptic product
+ * @param E1E2 an elliptic product
+ * in = (P1,P2)
+ * out = [2^n] (P1,P2)
+ *
+ */
+void double_couple_point_iter(theta_couple_point_t *out,
+                              unsigned n,
+                              const theta_couple_point_t *in,
+                              const theta_couple_curve_t *E1E2);
+
+/**
+ * @brief Compute the addition of two points in (X : Y : Z) coordinates on the elliptic product E12
+ *
+ * @param out Output: the theta_couple_jac_point
+ * @param T1 the theta couple jac point in the elliptic product
+ * @param T2 the theta couple jac point in the elliptic product
+ * @param E1E2 an elliptic product
+ * in  = (P1, P2), (Q1, Q2)
+ * out = (P1 + Q1, P2 + Q2)
+ *
+ **/
+void add_couple_jac_points(theta_couple_jac_point_t *out,
+                           const theta_couple_jac_point_t *T1,
+                           const theta_couple_jac_point_t *T2,
+                           const theta_couple_curve_t *E1E2);
+
+/**
+ * @brief Compute the double of the theta couple jac point `in` on the elliptic product E1E2
+ *
+ * @param out Output: the theta_couple_jac_point
+ * @param in the theta couple jac point in the elliptic product
+ * @param E1E2 an elliptic product
+ * in = (P1,P2)
+ * out = [2] (P1,P2)
+ *
+ */
+void double_couple_jac_point(theta_couple_jac_point_t *out,
+                             const theta_couple_jac_point_t *in,
+                             const theta_couple_curve_t *E1E2);
+
+/**
+ * @brief Compute the iterated double of the theta couple jac point `in` on the elliptic product E1E2
+ *
+ * @param out Output: the theta_couple_jac_point
+ * @param n the number of doublings (0 copies in to out)
+ * @param in the theta couple jac point in the elliptic product
+ * @param E1E2 an elliptic product
+ * in  = (P1,P2)
+ * out = [2^n] (P1,P2)
+ *
+ */
+void double_couple_jac_point_iter(theta_couple_jac_point_t *out,
+                                  unsigned n,
+                                  const theta_couple_jac_point_t *in,
+                                  const theta_couple_curve_t *E1E2);
+
+/**
+ * @brief A forgetful function which returns (X : Z) points given a pair of (X : Y : Z) points
+ *
+ * @param P Output: the theta_couple_point
+ * @param xyP : the theta_couple_jac_point
+ **/
+void couple_jac_to_xz(theta_couple_point_t *P, const theta_couple_jac_point_t *xyP);
+
+/**
+ * @brief Compute a (2,2) isogeny chain in dimension 2 between elliptic
+ * products in the theta_model and evaluate at a list of points of the form
+ * (P1,0) or (0,P2). Returns 0 if the codomain fails to split (or there is
+ * an error during the computation) and 1 otherwise.
+ *
+ * @param n : the length of the isogeny chain
+ * @param E12 an elliptic curve product
+ * @param ker T1, T2 and T1-T2. couple points on E12[2^(n+2)]
+ * @param extra_torsion boolean indicating if we give the points in E12[2^n] or
+ * E12[2^(n+HD_extra_torsion)]
+ * @param E34 Output: the codomain curve
+ * @param P12 Input/Output: pointer to points to be pushed through the isogeny (in-place)
+ * @param numP: length of the list of points given in P12 (can be zero)
+ * @returns 1 on success 0 on failure
+ *
+ */
+int theta_chain_compute_and_eval(unsigned n,
+                                 /*const*/ theta_couple_curve_t *E12,
+                                 const theta_kernel_couple_points_t *ker,
+                                 bool extra_torsion,
+                                 theta_couple_curve_t *E34,
+                                 theta_couple_point_t *P12,
+                                 size_t numP);
+
+/**
+ * @brief Compute a (2,2) isogeny chain in dimension 2 between elliptic
+ * products in the theta_model and evaluate at a list of points of the form
+ * (P1,0) or (0,P2). Returns 0 if the codomain fails to split (or there is
+ * an error during the computation) and 1 otherwise.
+ * Compared to theta_chain_compute_and_eval, it does extra isotropy
+ * checks on the kernel.
+ *
+ * @param n : the length of the isogeny chain
+ * @param E12 an elliptic curve product
+ * @param ker T1, T2 and T1-T2. couple points on E12[2^(n+2)]
+ * @param extra_torsion boolean indicating if we give the points in E12[2^n] or
+ * E12[2^(n+HD_extra_torsion)]
+ * @param E34 Output: the codomain curve
+ * @param P12 Input/Output: pointer to points to be pushed through the isogeny (in-place)
+ * @param numP: length of the list of points given in P12 (can be zero)
+ * @returns 1 on success 0 on failure
+ *
+ */
+int theta_chain_compute_and_eval_verify(unsigned n,
+                                        /*const*/ theta_couple_curve_t *E12,
+                                        const theta_kernel_couple_points_t *ker,
+                                        bool extra_torsion,
+                                        theta_couple_curve_t *E34,
+                                        theta_couple_point_t *P12,
+                                        size_t numP);
+
+/**
+ * @brief Compute a (2,2) isogeny chain in dimension 2 between elliptic
+ * products in the theta_model and evaluate at a list of points of the form
+ * (P1,0) or (0,P2). Returns 0 if the codomain fails to split (or there is
+ * an error during the computation) and 1 otherwise.
+ * Compared to theta_chain_compute_and_eval, it selects a random Montgomery
+ * model of the codomain.
+ *
+ * @param n : the length of the isogeny chain
+ * @param E12 an elliptic curve product
+ * @param ker T1, T2 and T1-T2. couple points on E12[2^(n+2)]
+ * @param extra_torsion boolean indicating if we give the points in E12[2^n] or
+ * E12[2^(n+HD_extra_torsion)]
+ * @param E34 Output: the codomain curve
+ * @param P12 Input/Output: pointer to points to be pushed through the isogeny (in-place)
+ * @param numP: length of the list of points given in P12 (can be zero)
+ * @returns 1 on success, 0 on failure
+ *
+ */
+int theta_chain_compute_and_eval_randomized(unsigned n,
+                                            /*const*/ theta_couple_curve_t *E12,
+                                            const theta_kernel_couple_points_t *ker,
+                                            bool extra_torsion,
+                                            theta_couple_curve_t *E34,
+                                            theta_couple_point_t *P12,
+                                            size_t numP);
+
+/**
+ * @brief Given bases B1 on E1 and B2 on E2, copies them to create a kernel
+ *         on E1 x E2 as couple points T1, T2 and T1 - T2
+ *
+ * @param ker Output: a kernel for dim_two_isogenies (T1, T2, T1-T2)
+ * @param B1 Input basis on E1
+ * @param B2 Input basis on E2
+ **/
+void copy_bases_to_kernel(theta_kernel_couple_points_t *ker, const ec_basis_t *B1, const ec_basis_t *B2);
+
+/**
+ * @brief Given a couple of points (P1, P2) on a couple of curves (E1, E2)
+ * this function tests if both points are of order exactly 2^t
+ *
+ * @param T: couple point (P1, P2)
+ * @param E: a couple of curves (E1, E2)
+ * @param t: an integer
+ * @returns bitwise AND of the two test_point_order_twof results (0xFFFFFFFF on success, 0 on failure)
+ */
+static inline int
+test_couple_point_order_twof(const theta_couple_point_t *T, const theta_couple_curve_t *E, int t)
+{
+    int check_P1 = test_point_order_twof(&T->P1, &E->E1, t); // inline: header-defined, so includers that never call it stay warning-free
+    int check_P2 = test_point_order_twof(&T->P2, &E->E2, t);
+
+    return check_P1 & check_P2;
+}
+
+// end of hd_functions
+/**
+ * @}
+ */
+// end of hd_module
+/**
+ * @}
+ */
+#endif
diff --git a/src/pqm4/sqisign_lvl5/ref/hd_splitting_transforms.c b/src/pqm4/sqisign_lvl5/ref/hd_splitting_transforms.c
new file mode 100644
index 0000000..a697ac7
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/hd_splitting_transforms.c
@@ -0,0 +1,143 @@
+#include <hd_splitting_transforms.h>
+
+#define FP2_ZERO 0 // FP2_CONSTANTS[0]: both components zero
+#define FP2_ONE 1 // FP2_CONSTANTS[1]: first component one, second zero
+#define FP2_I 2 // FP2_CONSTANTS[2]: first component zero, second one
+#define FP2_MINUS_ONE 3 // FP2_CONSTANTS[3]: first component minus one, second zero
+#define FP2_MINUS_I 4 // FP2_CONSTANTS[4]: first component zero, second minus one
+
+const int EVEN_INDEX[10][2] = {{0, 0}, {0, 1}, {0, 2}, {0, 3}, {1, 0}, {1, 2}, {2, 0}, {2, 1}, {3, 0}, {3, 3}}; // exactly the (i, j) pairs for which CHI_EVAL[i][j] == 1
+const int CHI_EVAL[4][4] = {{1, 1, 1, 1}, {1, -1, 1, -1}, {1, 1, -1, -1}, {1, -1, -1, 1}}; // entry [i][j] is -1 when i & j has an odd number of set bits, else 1
+const fp2_t FP2_CONSTANTS[5] = {{
+#if 0
+#elif RADIX == 16
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 32
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#else
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#endif
+#endif
+, 
+#if 0
+#elif RADIX == 16
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 32
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#else
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#endif
+#endif
+}, {
+#if 0
+#elif RADIX == 16
+{0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x500}
+#elif RADIX == 32
+{0x25ed0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x800}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x97, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x130000000000000}
+#else
+{0x12f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xb00000000000}
+#endif
+#endif
+, 
+#if 0
+#elif RADIX == 16
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 32
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#else
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#endif
+#endif
+}, {
+#if 0
+#elif RADIX == 16
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 32
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#else
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#endif
+#endif
+, 
+#if 0
+#elif RADIX == 16
+{0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x500}
+#elif RADIX == 32
+{0x25ed0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x800}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x97, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x130000000000000}
+#else
+{0x12f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xb00000000000}
+#endif
+#endif
+}, {
+#if 0
+#elif RADIX == 16
+{0x1ffb, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1bf}
+#elif RADIX == 32
+{0x1ffda12f, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x57f}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0xffffffffffffff68, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x7fffffffffffff}
+#else
+{0x1fffffffffffed0, 0x1ffffffffffffff, 0x1ffffffffffffff, 0x1ffffffffffffff, 0x1ffffffffffffff, 0x1ffffffffffffff, 0x1ffffffffffffff, 0x1ffffffffffffff, 0xffffffffffff}
+#endif
+#endif
+, 
+#if 0
+#elif RADIX == 16
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 32
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#else
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#endif
+#endif
+}, {
+#if 0
+#elif RADIX == 16
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 32
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#else
+{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
+#endif
+#endif
+, 
+#if 0
+#elif RADIX == 16
+{0x1ffb, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1fff, 0x1bf}
+#elif RADIX == 32
+{0x1ffda12f, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x57f}
+#elif RADIX == 64
+#if defined(SQISIGN_GF_IMPL_BROADWELL)
+{0xffffffffffffff68, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x7fffffffffffff}
+#else
+{0x1fffffffffffed0, 0x1ffffffffffffff, 0x1ffffffffffffff, 0x1ffffffffffffff, 0x1ffffffffffffff, 0x1ffffffffffffff, 0x1ffffffffffffff, 0x1ffffffffffffff, 0xffffffffffff}
+#endif
+#endif
+}};
+const precomp_basis_change_matrix_t SPLITTING_TRANSFORMS[10] = {{{{FP2_ONE, FP2_I, FP2_ONE, FP2_I}, {FP2_ONE, FP2_MINUS_I, FP2_MINUS_ONE, FP2_I}, {FP2_ONE, FP2_I, FP2_MINUS_ONE, FP2_MINUS_I}, {FP2_MINUS_ONE, FP2_I, FP2_MINUS_ONE, FP2_I}}}, {{{FP2_ONE, FP2_ZERO, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ZERO, FP2_ONE}, {FP2_ZERO, FP2_ZERO, FP2_ONE, FP2_ZERO}, {FP2_ZERO, FP2_MINUS_ONE, FP2_ZERO, FP2_ZERO}}}, {{{FP2_ONE, FP2_ZERO, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ONE, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ZERO, FP2_ONE}, {FP2_ZERO, FP2_ZERO, FP2_MINUS_ONE, FP2_ZERO}}}, {{{FP2_ONE, FP2_ZERO, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ONE, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ONE, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ZERO, FP2_MINUS_ONE}}}, {{{FP2_ONE, FP2_ONE, FP2_ONE, FP2_ONE}, {FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE, FP2_ONE}, {FP2_ONE, FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE}, {FP2_MINUS_ONE, FP2_ONE, FP2_MINUS_ONE, FP2_ONE}}}, {{{FP2_ONE, FP2_ZERO, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ONE, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ZERO, FP2_ONE}, {FP2_ZERO, FP2_ZERO, FP2_ONE, FP2_ZERO}}}, {{{FP2_ONE, FP2_ONE, FP2_ONE, FP2_ONE}, {FP2_ONE, FP2_MINUS_ONE, FP2_ONE, FP2_MINUS_ONE}, {FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE, FP2_ONE}, {FP2_MINUS_ONE, FP2_MINUS_ONE, FP2_ONE, FP2_ONE}}}, {{{FP2_ONE, FP2_ONE, FP2_ONE, FP2_ONE}, {FP2_ONE, FP2_MINUS_ONE, FP2_ONE, FP2_MINUS_ONE}, {FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE, FP2_ONE}, {FP2_ONE, FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE}}}, {{{FP2_ONE, FP2_ONE, FP2_ONE, FP2_ONE}, {FP2_ONE, FP2_MINUS_ONE, FP2_ONE, FP2_MINUS_ONE}, {FP2_ONE, FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE}, {FP2_MINUS_ONE, FP2_ONE, FP2_ONE, FP2_MINUS_ONE}}}, {{{FP2_ONE, FP2_ZERO, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ONE, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ONE, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ZERO, FP2_ONE}}}};
+const precomp_basis_change_matrix_t NORMALIZATION_TRANSFORMS[6] = {{{{FP2_ONE, FP2_ZERO, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ONE, FP2_ZERO, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ONE, FP2_ZERO}, {FP2_ZERO, FP2_ZERO, FP2_ZERO, FP2_ONE}}}, {{{FP2_ZERO, FP2_ZERO, FP2_ZERO, FP2_ONE}, {FP2_ZERO, FP2_ZERO, FP2_ONE, FP2_ZERO}, {FP2_ZERO, FP2_ONE, FP2_ZERO, FP2_ZERO}, {FP2_ONE, FP2_ZERO, FP2_ZERO, FP2_ZERO}}}, {{{FP2_ONE, FP2_ONE, FP2_ONE, FP2_ONE}, {FP2_ONE, FP2_MINUS_ONE, FP2_ONE, FP2_MINUS_ONE}, {FP2_ONE, FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE}, {FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE, FP2_ONE}}}, {{{FP2_ONE, FP2_MINUS_ONE, FP2_MINUS_ONE, FP2_ONE}, {FP2_MINUS_ONE, FP2_MINUS_ONE, FP2_ONE, FP2_ONE}, {FP2_MINUS_ONE, FP2_ONE, FP2_MINUS_ONE, FP2_ONE}, {FP2_ONE, FP2_ONE, FP2_ONE, FP2_ONE}}}, {{{FP2_MINUS_ONE, FP2_I, FP2_I, FP2_ONE}, {FP2_I, FP2_MINUS_ONE, FP2_ONE, FP2_I}, {FP2_I, FP2_ONE, FP2_MINUS_ONE, FP2_I}, {FP2_ONE, FP2_I, FP2_I, FP2_MINUS_ONE}}}, {{{FP2_ONE, FP2_I, FP2_I, FP2_MINUS_ONE}, {FP2_I, FP2_ONE, FP2_MINUS_ONE, FP2_I}, {FP2_I, FP2_MINUS_ONE, FP2_ONE, FP2_I}, {FP2_MINUS_ONE, FP2_I, FP2_I, FP2_ONE}}}};
diff --git a/src/pqm4/sqisign_lvl5/ref/hd_splitting_transforms.h b/src/pqm4/sqisign_lvl5/ref/hd_splitting_transforms.h
new file mode 100644
index 0000000..b3147a4
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/hd_splitting_transforms.h
@@ -0,0 +1,18 @@
+#ifndef HD_SPLITTING_H
+#define HD_SPLITTING_H
+
+#include <hd.h>
+#include <stdint.h>
+
+typedef struct precomp_basis_change_matrix {
+    uint8_t m[4][4]; // 4x4 grid of one-byte codes; presumably indices into FP2_CONSTANTS -- TODO confirm against users
+} precomp_basis_change_matrix_t;
+
+extern const int EVEN_INDEX[10][2];
+extern const int CHI_EVAL[4][4];
+extern const fp2_t FP2_CONSTANTS[5];
+extern const precomp_basis_change_matrix_t SPLITTING_TRANSFORMS[10];
+extern const precomp_basis_change_matrix_t NORMALIZATION_TRANSFORMS[6];
+
+#endif
+
diff --git a/src/pqm4/sqisign_lvl5/ref/isog.h b/src/pqm4/sqisign_lvl5/ref/isog.h
new file mode 100644
index 0000000..b251ca3
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/isog.h
@@ -0,0 +1,28 @@
+#ifndef _ISOG_H_
+#define _ISOG_H_
+#include <sqisign_namespace.h>
+#include <ec.h>
+
+/* KPS structure for isogenies of degree 2 or 4 */
+typedef struct
+{
+    ec_point_t K; // single stored point; passed writable (kps) to xisog_2*, and const to xeval_2*
+} ec_kps2_t;
+typedef struct
+{
+    ec_point_t K[3]; // three stored points; passed writable (kps) to xisog_4*, and const to xeval_4*
+} ec_kps4_t;
+
+void xisog_2(ec_kps2_t *kps, ec_point_t *B, const ec_point_t P); // degree-2 isogeny construction
+void xisog_2_singular(ec_kps2_t *kps, ec_point_t *B24, ec_point_t A24);
+
+void xisog_4(ec_kps4_t *kps, ec_point_t *B, const ec_point_t P); // degree-4 isogeny construction
+void xisog_4_singular(ec_kps4_t *kps, ec_point_t *B24, const ec_point_t P, ec_point_t A24);
+
+void xeval_2(ec_point_t *R, ec_point_t *const Q, const int lenQ, const ec_kps2_t *kps);
+void xeval_2_singular(ec_point_t *R, const ec_point_t *Q, const int lenQ, const ec_kps2_t *kps);
+
+void xeval_4(ec_point_t *R, const ec_point_t *Q, const int lenQ, const ec_kps4_t *kps);
+void xeval_4_singular(ec_point_t *R, const ec_point_t *Q, const int lenQ, const ec_point_t P, const ec_kps4_t *kps);
+
+#endif
diff --git a/src/pqm4/sqisign_lvl5/ref/isog_chains.c b/src/pqm4/sqisign_lvl5/ref/isog_chains.c
new file mode 100644
index 0000000..abc9808
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/isog_chains.c
@@ -0,0 +1,241 @@
+#include "isog.h"
+#include <assert.h>
+
+// Walks the chain in degree-4 steps; since we use degree 4 isogeny steps, we need to handle the odd case with care
+static uint32_t
+ec_eval_even_strategy(ec_curve_t *curve,
+                      ec_point_t *points,
+                      unsigned len_points,
+                      const ec_point_t *kernel,
+                      const int isog_len)
+{ // Updates curve and points in place; returns 0 on success, (uint32_t)-1 when one of the kernel checks below fails
+    ec_curve_normalize_A24(curve);
+    ec_point_t A24;
+    copy_point(&A24, &curve->A24);
+
+    int space = 1; // stack capacity: 1 + the number of times i doubles before reaching isog_len
+    for (int i = 1; i < isog_len; i *= 2)
+        ++space;
+
+    // Stack of remaining kernel points and their associated orders
+    ec_point_t splits[space];
+    uint16_t todo[space];
+    splits[0] = *kernel;
+    todo[0] = isog_len;
+
+    int current = 0; // Pointer to current top of stack
+
+    // Chain of 4-isogenies
+    for (int j = 0; j < isog_len / 2; ++j) {
+        assert(current >= 0);
+        assert(todo[current] >= 1);
+        // Get the next point of order 4
+        while (todo[current] != 2) {
+            assert(todo[current] >= 3);
+            // A new split will be added
+            ++current;
+            assert(current < space);
+            // We set the seed of the new split to be computed and saved
+            copy_point(&splits[current], &splits[current - 1]);
+            // if we copied from the very first element, then we perform one additional doubling
+            unsigned num_dbls = todo[current - 1] / 4 * 2 + todo[current - 1] % 2;
+            todo[current] = todo[current - 1] - num_dbls;
+            while (num_dbls--)
+                xDBL_A24(&splits[current], &splits[current], &A24, false);
+        }
+
+        if (j == 0) {
+            assert(fp2_is_one(&A24.z));
+            if (!ec_is_four_torsion(&splits[current], curve))
+                return -1; // converts to UINT32_MAX in the uint32_t return type
+
+            ec_point_t T;
+            xDBL_A24(&T, &splits[current], &A24, false);
+            if (fp2_is_zero(&T.x))
+                return -1; // special isogenies not allowed
+        } else {
+            assert(todo[current] == 2);
+#ifndef NDEBUG
+            if (fp2_is_zero(&splits[current].z))
+                debug_print("splitting point z coordinate is unexpectedly zero");
+
+            ec_point_t test;
+            xDBL_A24(&test, &splits[current], &A24, false);
+            if (fp2_is_zero(&test.z))
+                debug_print("z coordinate is unexpectedly zero before doubling");
+            xDBL_A24(&test, &test, &A24, false);
+            if (!fp2_is_zero(&test.z))
+                debug_print("z coordinate is unexpectedly not zero after doubling");
+#endif
+        }
+
+        // Evaluate 4-isogeny
+        ec_kps4_t kps4;
+        xisog_4(&kps4, &A24, splits[current]);
+        xeval_4(splits, splits, current, &kps4); // only the `current` entries below the top of the stack are mapped
+        for (int i = 0; i < current; ++i)
+            todo[i] -= 2; // bookkeeping for the step just taken
+        xeval_4(points, points, len_points, &kps4);
+
+        --current;
+    }
+    assert(isog_len % 2 ? !current : current == -1);
+
+    // Final 2-isogeny
+    if (isog_len % 2) {
+#ifndef NDEBUG
+        if (fp2_is_zero(&splits[0].z))
+            debug_print("splitting point z coordinate is unexpectedly zero");
+        ec_point_t test;
+        copy_point(&test, &splits[0]);
+        xDBL_A24(&test, &test, &A24, false);
+        if (!fp2_is_zero(&test.z))
+            debug_print("z coordinate is unexpectedly not zero after doubling");
+#endif
+
+        // We need to check the order of this point in case there were no 4-isogenies
+        if (isog_len == 1 && !ec_is_two_torsion(&splits[0], curve))
+            return -1;
+        if (fp2_is_zero(&splits[0].x)) {
+            // special isogenies not allowed
+            // this case can only happen if isog_len == 1; otherwise the
+            // previous 4-isogenies we computed ensure that $T=(0:1)$ is put
+            // as the kernel of the dual isogeny
+            return -1;
+        }
+
+        ec_kps2_t kps2;
+        xisog_2(&kps2, &A24, splits[0]);
+        xeval_2(points, points, len_points, &kps2);
+    }
+
+    // Output curve in the form (A:C)
+    A24_to_AC(curve, &A24);
+
+    curve->is_A24_computed_and_normalized = false; // the A24 cached inside curve is flagged as no longer valid
+
+    return 0;
+}
+
+uint32_t
+ec_eval_even(ec_curve_t *image, ec_isog_even_t *phi, ec_point_t *points, unsigned len_points)
+{ // image starts as a copy of phi->curve and is then transformed in place; the status of the strategy routine is returned
+    copy_curve(image, &phi->curve);
+    return ec_eval_even_strategy(image, points, len_points, &phi->kernel, phi->length);
+}
+
+// naive implementation: one degree-2 step per loop iteration, with the step's kernel point recomputed from big_K each time
+uint32_t
+ec_eval_small_chain(ec_curve_t *curve,
+                    const ec_point_t *kernel,
+                    int len,
+                    ec_point_t *points,
+                    unsigned len_points,
+                    bool special) // do we allow special isogenies?
+{ // Updates curve and points in place; returns 0 on success and (uint32_t)-1 when the kernel is rejected
+
+    ec_point_t A24;
+    AC_to_A24(&A24, curve);
+
+    ec_kps2_t kps;
+    ec_point_t small_K, big_K;
+    copy_point(&big_K, kernel);
+
+    for (int i = 0; i < len; i++) {
+        copy_point(&small_K, &big_K);
+        // len - i - 1 doublings turn the copy of big_K into the kernel point used for step i
+        for (int j = 0; j < len - i - 1; j++) {
+            xDBL_A24(&small_K, &small_K, &A24, false);
+        }
+        // Check the order of the point before the first isogeny step
+        if (i == 0 && !ec_is_two_torsion(&small_K, curve))
+            return (uint32_t)-1;
+        // Perform isogeny step
+        if (fp2_is_zero(&small_K.x)) {
+            if (special) {
+                ec_point_t B24;
+                xisog_2_singular(&kps, &B24, A24);
+                xeval_2_singular(&big_K, &big_K, 1, &kps);
+                xeval_2_singular(points, points, len_points, &kps);
+                copy_point(&A24, &B24);
+            } else {
+                return (uint32_t)-1;
+            }
+        } else {
+            xisog_2(&kps, &A24, small_K);
+            xeval_2(&big_K, &big_K, 1, &kps);
+            xeval_2(points, points, len_points, &kps);
+        }
+    }
+    A24_to_AC(curve, &A24);
+
+    curve->is_A24_computed_and_normalized = false; // the A24 cached inside curve is flagged as no longer valid
+    return 0;
+}
+
+uint32_t
+ec_isomorphism(ec_isom_t *isom, const ec_curve_t *from, const ec_curve_t *to)
+{ // Fills isom->Nx, isom->Nz and isom->D from the (A, C) coefficients of both curves; ec_iso_eval consumes these fields
+    fp2_t t0, t1, t2, t3, t4;
+
+    fp2_mul(&t0, &from->A, &from->C);
+    fp2_mul(&t1, &to->A, &to->C);
+
+    fp2_mul(&t2, &t1, &to->C); // toA*toC^2
+    fp2_add(&t3, &t2, &t2);
+    fp2_add(&t3, &t3, &t3);
+    fp2_add(&t3, &t3, &t3);
+    fp2_add(&t2, &t2, &t3); // 9*toA*toC^2
+    fp2_sqr(&t3, &to->A);
+    fp2_mul(&t3, &t3, &to->A); // toA^3
+    fp2_add(&t3, &t3, &t3);
+    fp2_sub(&isom->Nx, &t3, &t2); // 2*toA^3-9*toA*toC^2
+    fp2_mul(&t2, &t0, &from->A);  // fromA^2*fromC
+    fp2_sqr(&t3, &from->C);
+    fp2_mul(&t3, &t3, &from->C); // fromC^3
+    fp2_add(&t4, &t3, &t3);
+    fp2_add(&t3, &t4, &t3);             // 3*fromC^3
+    fp2_sub(&t3, &t3, &t2);             // 3*fromC^3-fromA^2*fromC
+    fp2_mul(&isom->Nx, &isom->Nx, &t3); // lambda_x = (2*toA^3-9*toA*toC^2)*(3*fromC^3-fromA^2*fromC)
+
+    fp2_mul(&t2, &t0, &from->C); // fromA*fromC^2
+    fp2_add(&t3, &t2, &t2);
+    fp2_add(&t3, &t3, &t3);
+    fp2_add(&t3, &t3, &t3);
+    fp2_add(&t2, &t2, &t3); // 9*fromA*fromC^2
+    fp2_sqr(&t3, &from->A);
+    fp2_mul(&t3, &t3, &from->A); // fromA^3
+    fp2_add(&t3, &t3, &t3);
+    fp2_sub(&isom->D, &t3, &t2); // 2*fromA^3-9*fromA*fromC^2
+    fp2_mul(&t2, &t1, &to->A);   // toA^2*toC
+    fp2_sqr(&t3, &to->C);
+    fp2_mul(&t3, &t3, &to->C); // toC^3
+    fp2_add(&t4, &t3, &t3);
+    fp2_add(&t3, &t4, &t3);           // 3*toC^3
+    fp2_sub(&t3, &t3, &t2);           // 3*toC^3-toA^2*toC
+    fp2_mul(&isom->D, &isom->D, &t3); // lambda_z = (2*fromA^3-9*fromA*fromC^2)*(3*toC^3-toA^2*toC)
+
+    // Mont -> SW -> SW -> Mont
+    fp2_mul(&t0, &to->C, &from->A);
+    fp2_mul(&t0, &t0, &isom->Nx); // lambda_x*toC*fromA
+    fp2_mul(&t1, &from->C, &to->A);
+    fp2_mul(&t1, &t1, &isom->D);  // lambda_z*fromC*toA
+    fp2_sub(&isom->Nz, &t0, &t1); // lambda_x*toC*fromA - lambda_z*fromC*toA
+    fp2_mul(&t0, &from->C, &to->C);
+    fp2_add(&t1, &t0, &t0);
+    fp2_add(&t0, &t0, &t1);             // 3*fromC*toC
+    fp2_mul(&isom->D, &isom->D, &t0);   // 3*lambda_z*fromC*toC
+    fp2_mul(&isom->Nx, &isom->Nx, &t0); // 3*lambda_x*fromC*toC
+
+    return (fp2_is_zero(&isom->Nx) | fp2_is_zero(&isom->D)); // nonzero when Nx or D vanished; presumably a failure flag -- confirm with callers
+}
+
+void
+ec_iso_eval(ec_point_t *P, ec_isom_t *isom)
+{ // In-place map (x : z) -> (Nx*x + Nz*z : D*z) using the three factors stored in isom
+    fp2_t z_term;
+    fp2_mul(&z_term, &P->z, &isom->Nz); // Nz*z, taken while z still holds its input value
+    fp2_mul(&P->z, &P->z, &isom->D);    // z <- D*z
+    fp2_mul(&P->x, &P->x, &isom->Nx);   // x <- Nx*x
+    fp2_add(&P->x, &P->x, &z_term);     // x <- Nx*x + Nz*z
+}
diff --git a/src/pqm4/sqisign_lvl5/ref/mp.c b/src/pqm4/sqisign_lvl5/ref/mp.c
new file mode 100644
index 0000000..27f4a96
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/mp.c
@@ -0,0 +1,357 @@
+#include <mp.h>
+#include <assert.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+// double-wide multiplication: out[0] receives the low digit and out[1] the high digit of a * b
+void
+MUL(digit_t *out, const digit_t a, const digit_t b)
+{ // one implementation is selected at compile time from RADIX_32 / RADIX_64 and the compiler features available
+#ifdef RADIX_32
+    uint64_t r = (uint64_t)a * b;
+    out[0] = r & 0xFFFFFFFFUL;
+    out[1] = r >> 32;
+
+#elif defined(RADIX_64) && defined(_MSC_VER)
+    uint64_t umul_hi;
+    out[0] = _umul128(a, b, &umul_hi);
+    out[1] = umul_hi;
+
+#elif defined(RADIX_64) && defined(HAVE_UINT128)
+    unsigned __int128 umul_tmp;
+    umul_tmp = (unsigned __int128)(a) * (unsigned __int128)(b);
+    out[0] = (uint64_t)umul_tmp;
+    out[1] = (uint64_t)(umul_tmp >> 64);
+
+#else
+    register digit_t al, ah, bl, bh, temp;
+    digit_t albl, albh, ahbl, ahbh, res1, res2, res3, carry;
+    digit_t mask_low = (digit_t)(-1) >> (sizeof(digit_t) * 4), mask_high = (digit_t)(-1) << (sizeof(digit_t) * 4);
+    al = a & mask_low;               // Low part
+    ah = a >> (sizeof(digit_t) * 4); // High part
+    bl = b & mask_low;
+    bh = b >> (sizeof(digit_t) * 4);
+
+    albl = al * bl; // four half-digit partial products, each fits in one digit
+    albh = al * bh;
+    ahbl = ah * bl;
+    ahbh = ah * bh;
+    out[0] = albl & mask_low; // out00
+
+    res1 = albl >> (sizeof(digit_t) * 4);
+    res2 = ahbl & mask_low;
+    res3 = albh & mask_low;
+    temp = res1 + res2 + res3;
+    carry = temp >> (sizeof(digit_t) * 4);
+    out[0] ^= temp << (sizeof(digit_t) * 4); // out01
+
+    res1 = ahbl >> (sizeof(digit_t) * 4);
+    res2 = albh >> (sizeof(digit_t) * 4);
+    res3 = ahbh & mask_low;
+    temp = res1 + res2 + res3 + carry;
+    out[1] = temp & mask_low; // out10
+    carry = temp & mask_high;
+    out[1] ^= (ahbh & mask_high) + carry; // out11
+
+#endif
+}
+
+void
+mp_add(digit_t *c, const digit_t *a, const digit_t *b, const unsigned int nwords)
+{ // c <- a + b, digit by digit starting from index 0
+    unsigned int cy = 0;
+    // the carry leaving the last digit is dropped, so the sum wraps at nwords digits
+    for (unsigned int k = 0; k != nwords; ++k) {
+        ADDC(c[k], cy, a[k], b[k], cy);
+    }
+}
+
+digit_t
+mp_shiftr(digit_t *x, const unsigned int shift, const unsigned int nwords)
+{ // In-place right shift of x by `shift` bits, 1 <= shift <= RADIX-1; returns bit 0 of the input
+    const unsigned int top = nwords - 1; // index of the last digit
+    const digit_t lsb = x[0] & 1;        // sampled before x is modified
+    for (unsigned int k = 0; k < top; k++) {
+        SHIFTR(x[k + 1], x[k], shift, x[k], RADIX);
+    }
+    x[top] >>= shift;
+    return lsb;
+}
+
+void
+mp_shiftl(digit_t *x, const unsigned int shift, const unsigned int nwords)
+{ // In-place left shift of x by `shift` bits, 1 <= shift <= RADIX-1
+    // walk downward from the last digit so that every x[k - 1] is read before it is overwritten
+    for (int k = nwords - 1; k > 0; --k) {
+        SHIFTL(x[k], x[k - 1], shift, x[k], RADIX);
+    }
+    x[0] <<= shift;
+}
+
+void
+multiple_mp_shiftl(digit_t *x, const unsigned int shift, const unsigned int nwords)
+{ // In-place left shift of x by any number of bits, done in chunks of at most RADIX-1 bits
+    unsigned int remaining = shift;
+    for (; remaining > RADIX - 1; remaining -= RADIX - 1)
+        mp_shiftl(x, RADIX - 1, nwords);
+    // skip a zero-bit tail: mp_shiftl(x, 0, ...) would make SHIFTL shift by RADIX, which is undefined
+    if (remaining != 0)
+        mp_shiftl(x, remaining, nwords);
+}
+
+// The below functions were taken from the EC module
+
+void
+mp_sub(digit_t *c, const digit_t *a, const digit_t *b, const unsigned int nwords)
+{ // c <- a - b, digit by digit starting from index 0 (written for a > b)
+    unsigned int bw = 0;
+    // the borrow leaving the last digit is dropped, so the difference wraps at nwords digits
+    for (unsigned int k = 0; k != nwords; ++k) {
+        SUBC(c[k], bw, a[k], b[k], bw);
+    }
+}
+
+void
+select_ct(digit_t *c, const digit_t *a, const digit_t *b, const digit_t mask, const int nwords)
+{ // Branch-free choice: mask = 0 copies a into c, mask = 1...1 copies b into c
+    for (int k = 0; k < nwords; ++k) {
+        const digit_t delta = (a[k] ^ b[k]) & mask; // a^b where b is chosen, 0 where a is kept
+        c[k] = delta ^ a[k];
+    }
+}
+
+void
+swap_ct(digit_t *a, digit_t *b, const digit_t option, const int nwords)
+{ // Branch-free conditional swap of two nwords-digit arrays
+  // option = 0 leaves a and b untouched; option = 0xFF...FF exchanges their contents
+    int k = 0;
+    while (k < nwords) {
+        const digit_t flip = option & (a[k] ^ b[k]); // a^b when swapping, 0 otherwise
+        a[k] ^= flip;
+        b[k] ^= flip;
+        k++;
+    }
+}
+
+int
+mp_compare(const digit_t *a, const digit_t *b, unsigned int nwords)
+{ // Three-way comparison: returns 1 if a > b, 0 if a = b, -1 if a < b
+  // Scans from the highest index down and returns at the first differing digit
+    int k = nwords - 1;
+    while (k >= 0) {
+        if (a[k] != b[k])
+            return (a[k] > b[k]) ? 1 : -1;
+        k--;
+    }
+    return 0;
+}
+
+bool
+mp_is_zero(const digit_t *a, unsigned int nwords)
+{ // Tests whether every digit of a is zero, without branching on the digit values
+  // Returns 1 (true) if a=0, 0 (false) otherwise
+    digit_t acc = 0; // OR of all digits: zero exactly when every digit is zero
+    unsigned int k = 0;
+    while (k < nwords)
+        acc |= a[k++];
+    const unsigned int zero = is_digit_zero_ct(acc);
+    return (bool)zero;
+}
+
+void
+mp_mul2(digit_t *c, const digit_t *a, const digit_t *b)
+{ // Multiprecision multiplication fixed to two-digit operands; writes the four digits c[0..3]
+    unsigned int carry = 0;
+    digit_t t0[2], t1[2], t2[2];
+
+    MUL(t0, a[0], b[0]);
+    MUL(t1, a[0], b[1]); // NOTE(review): the a[1] * b[0] partial product is never formed -- confirm callers do not need it
+    ADDC(t0[1], carry, t0[1], t1[0], carry);
+    ADDC(t1[1], carry, 0, t1[1], carry);
+    MUL(t2, a[1], b[1]);
+    ADDC(t2[0], carry, t2[0], t1[1], carry);
+    ADDC(t2[1], carry, 0, t2[1], carry);
+    c[0] = t0[0]; // c is only written here, after every read of a and b
+    c[1] = t0[1];
+    c[2] = t2[0];
+    c[3] = t2[1];
+}
+
+void
+mp_print(const digit_t *a, size_t nwords)
+{ // Prints "0x" followed by the digits of a, highest index first, with no trailing newline
+    printf("0x");
+    for (size_t k = nwords; k > 0; k--) {
+#ifdef RADIX_32
+        printf("%08" PRIx32, a[k - 1]); // 8 zero-padded hex characters per digit
+#elif defined(RADIX_64)
+        printf("%016" PRIx64, a[k - 1]); // 16 zero-padded hex characters per digit
+#endif
+    }
+}
+
+void
+mp_copy(digit_t *b, const digit_t *a, size_t nwords)
+{ // b <- a, copying nwords digits in ascending index order
+    const digit_t *const end = a + nwords;
+    while (a != end)
+        *b++ = *a++;
+}
+
+void
+mp_mul(digit_t *c, const digit_t *a, const digit_t *b, size_t nwords)
+{
+    // Multiprecision multiplication, c = a*b, for nwords-digit inputs, with nwords-digit output
+    // explicitly does not use the higher half of the product, as we do not need in our applications
+    // c may alias a or b: the result is built in a scratch buffer and copied out at the very end
+    if (nwords == 0) {
+        return; // nothing to compute, and the zero-length arrays below would be undefined
+    }
+    digit_t carry, high, UV[2], t[nwords], cc[nwords];
+
+    for (size_t i = 0; i < nwords; i++) {
+        cc[i] = 0;
+    }
+
+    for (size_t i = 0; i < nwords; i++) {
+        // Row i is added at digit offset i, so only its low (nwords - i) digits can reach the output.
+        // Producing exactly those digits keeps every write inside t[], including when nwords == 1.
+        high = 0;
+        for (size_t j = 0; j < nwords - i; j++) {
+            MUL(UV, a[i], b[j]);
+            ADDC(t[j], carry, UV[0], high, 0);
+            high = UV[1] + carry; // cannot wrap: the high digit of a digit product is at most 2^RADIX - 2
+        }
+
+        mp_add(&cc[i], &cc[i], t, nwords - i);
+    }
+
+    mp_copy(c, cc, nwords);
+}
+
+void
+mp_mod_2exp(digit_t *a, unsigned int e, unsigned int nwords)
+{ // In-place reduction a <- a mod 2^e: clears every bit of a at position e and above
+    const unsigned int word = e >> LOG2RADIX; // index of the digit holding bit e
+    const unsigned int bit = e & (RADIX - 1); // position of bit e inside that digit
+    if (word >= nwords) {
+        return; // bit e lies beyond the nwords digits, so there is nothing to clear
+    }
+    a[word] &= ((digit_t)1 << bit) - 1; // keep only the low `bit` bits of the boundary digit
+    for (unsigned int k = word + 1; k < nwords; k++) {
+        a[k] = 0;
+    }
+}
+
+void
+mp_neg(digit_t *a, unsigned int nwords)
+{ // negates a in two's complement over nwords digits: a <- ~a + 1
+    digit_t carry = 1; // the "+1", rippled upward for as long as digits wrap around
+    for (size_t i = 0; i < nwords; i++) {
+        a[i] = (digit_t)(~a[i]) + carry;
+        carry &= is_digit_zero_ct(a[i]); // with a carry added, a zero result means this digit wrapped
+    }
+}
+
+bool
+mp_is_one(const digit_t *x, unsigned int nwords)
+{ // returns true exactly when x[0] equals 1 and every other digit equals 0
+  // stops at the first digit that rules this out, so the running time depends on x
+    bool still_one = (x[0] == 1);
+
+    size_t k = 1;
+    while (still_one && k < nwords) {
+        // any nonzero digit above index 0 ends the scan
+        still_one = (x[k] == 0);
+        k++;
+    }
+    return still_one;
+}
+
+void
+mp_inv_2e(digit_t *b, const digit_t *a, int e, unsigned int nwords)
+{ // Inversion modulo 2^e, using Newton's method and Hensel lifting
+    // we work modulo 2^w, where w is the smallest power of 2 with w >= e (and w >= 2)
+    // requires a to be odd, of course
+    // returns b such that a*b = 1 mod 2^e
+    assert((a[0] & 1) == 1);
+
+    digit_t x[nwords], y[nwords], aa[nwords], mp_one[nwords], tmp[nwords];
+    mp_copy(aa, a, nwords);
+
+    mp_one[0] = 1;
+    for (unsigned int i = 1; i < nwords; i++) {
+        mp_one[i] = 0;
+    }
+
+    int p = 1;
+    while ((1 << p) < e) {
+        p++;
+    }
+    p -= 2; // using k = 4 for initial inverse
+    int w = (1 << (p + 2)); // 2 to the power of p as it was before the subtraction above
+
+    mp_mod_2exp(aa, w, nwords);
+    mp_add(x, aa, aa, nwords);
+    mp_add(x, x, aa, nwords);  // should be 3a
+    x[0] ^= (1 << 1);          // so that x equals (3a) xor 2
+    mp_mod_2exp(x, w, nwords); // now x*a = 1 mod 2^4, which we lift
+
+    mp_mul(tmp, aa, x, nwords);
+    mp_neg(tmp, nwords);
+    mp_add(y, mp_one, tmp, nwords); // y = 1 - a*x
+
+    // Hensel lifting for p rounds
+    for (int i = 0; i < p; i++) {
+        mp_add(tmp, mp_one, y, nwords); // tmp = 1 + y
+        mp_mul(x, x, tmp, nwords);      // x <- x * (1 + y)
+        mp_mul(y, y, y, nwords);        // y <- y^2
+    }
+
+    mp_mod_2exp(x, w, nwords);
+    mp_copy(b, x, nwords);
+
+    //  verify results (the check itself is the assert, so it only fires when assertions are enabled)
+    mp_mul(x, x, aa, nwords);
+    mp_mod_2exp(x, w, nwords);
+    assert(mp_is_one(x, nwords));
+}
+
+void
+mp_invert_matrix(digit_t *r1, digit_t *r2, digit_t *s1, digit_t *s2, int e, unsigned int nwords)
+{
+    // given a matrix ( ( a, b ), (c,  d) ) = ( (r1, r2), (s1, s2) ) of values mod 2^e
+    // overwrites it in place with the inverse matrix gamma ( (d, -b), (-c, a) )
+    // where gamma is the inverse of the determinant a*d - b*c
+    // assumes the matrix is invertible; otherwise, inversion of the determinant fails
+
+    int p = 1;
+    while ((1 << p) < e) {
+        p++;
+    }
+    int w = (1 << (p)); // smallest power of two >= e (at least 2); the outputs are reduced mod 2^w
+
+    digit_t det[nwords], tmp[nwords], resa[nwords], resb[nwords], resc[nwords], resd[nwords];
+    mp_mul(tmp, r1, s2, nwords);
+    mp_mul(det, r2, s1, nwords);
+    mp_sub(det, tmp, det, nwords); // det = r1*s2 - r2*s1
+    mp_inv_2e(det, det, e, nwords); // det now holds gamma
+
+    mp_mul(resa, det, s2, nwords);
+    mp_mul(resb, det, r2, nwords);
+    mp_mul(resc, det, s1, nwords);
+    mp_mul(resd, det, r1, nwords);
+
+    mp_neg(resb, nwords);
+    mp_neg(resc, nwords);
+
+    mp_mod_2exp(resa, w, nwords);
+    mp_mod_2exp(resb, w, nwords);
+    mp_mod_2exp(resc, w, nwords);
+    mp_mod_2exp(resd, w, nwords);
+
+    mp_copy(r1, resa, nwords); // inputs are overwritten only after all four results are ready
+    mp_copy(r2, resb, nwords);
+    mp_copy(s1, resc, nwords);
+    mp_copy(s2, resd, nwords);
+}
diff --git a/src/pqm4/sqisign_lvl5/ref/mp.h b/src/pqm4/sqisign_lvl5/ref/mp.h
new file mode 100644
index 0000000..b3733b5
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/mp.h
@@ -0,0 +1,88 @@
+#ifndef MP_H
+#define MP_H
+
+#include <sqisign_namespace.h>
+#include <stdbool.h>
+#include <tutil.h>
+
+// Functions taken from the GF module
+
+void mp_add(digit_t *c, const digit_t *a, const digit_t *b, const unsigned int nwords);
+digit_t mp_shiftr(digit_t *x, const unsigned int shift, const unsigned int nwords);
+void multiple_mp_shiftl(digit_t *x, const unsigned int shift, const unsigned int nwords);
+void mp_shiftl(digit_t *x, const unsigned int shift, const unsigned int nwords);
+void MUL(digit_t *out, const digit_t a, const digit_t b);
+
+// Functions taken from the EC module
+
+void mp_sub(digit_t *c, const digit_t *a, const digit_t *b, const unsigned int nwords);
+void select_ct(digit_t *c, const digit_t *a, const digit_t *b, const digit_t mask, const int nwords);
+void swap_ct(digit_t *a, digit_t *b, const digit_t option, const int nwords);
+int mp_compare(const digit_t *a, const digit_t *b, unsigned int nwords);
+bool mp_is_zero(const digit_t *a, unsigned int nwords);
+void mp_mul2(digit_t *c, const digit_t *a, const digit_t *b);
+
+// Further functions for multiprecision arithmetic
+void mp_print(const digit_t *a, size_t nwords);
+void mp_copy(digit_t *b, const digit_t *a, size_t nwords);
+void mp_neg(digit_t *a, unsigned int nwords);
+bool mp_is_one(const digit_t *x, unsigned int nwords);
+void mp_mul(digit_t *c, const digit_t *a, const digit_t *b, size_t nwords);
+void mp_mod_2exp(digit_t *a, unsigned int e, unsigned int nwords);
+void mp_inv_2e(digit_t *b, const digit_t *a, int e, unsigned int nwords);
+void mp_invert_matrix(digit_t *r1, digit_t *r2, digit_t *s1, digit_t *s2, int e, unsigned int nwords);
+
+#define mp_is_odd(x, nwords) (((nwords) != 0) & (int)(x)[0]) // bit 0 of x[0], forced to 0 when nwords == 0
+#define mp_is_even(x, nwords) (!mp_is_odd(x, nwords)) // logical negation of mp_is_odd
+
+/********************** Constant-time unsigned comparisons ***********************/
+
+// The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise
+static inline unsigned int
+is_digit_nonzero_ct(digit_t x)
+{ // Is x != 0? For nonzero x, x or (0 - x) has bit RADIX-1 set; that bit is shifted down to position 0
+    return (unsigned int)((x | (0 - x)) >> (RADIX - 1));
+}
+
+static inline unsigned int
+is_digit_zero_ct(digit_t x)
+{ // Is x = 0? Flips the low bit of is_digit_nonzero_ct(x)
+    return (unsigned int)(1 ^ is_digit_nonzero_ct(x));
+}
+
+static inline unsigned int
+is_digit_lessthan_ct(digit_t x, digit_t y)
+{ // Is x < y? The answer is read from bit RADIX-1 of a branch-free expression in x, y and x - y
+    return (unsigned int)((x ^ ((x ^ y) | ((x - y) ^ y))) >> (RADIX - 1));
+}
+
+/********************** Platform-independent macros for digit-size operations
+ * **********************/
+
+// Digit addition with carry: sumOut = addend1 + addend2 + carryIn (wrapping); carryOut = 1 if either addition wrapped, else 0
+#define ADDC(sumOut, carryOut, addend1, addend2, carryIn)                                                              \
+    {                                                                                                                  \
+        digit_t tempReg = (addend1) + (digit_t)(carryIn);                                                              \
+        (sumOut) = (addend2) + tempReg;                                                                                \
+        (carryOut) = (is_digit_lessthan_ct(tempReg, (digit_t)(carryIn)) | is_digit_lessthan_ct((sumOut), tempReg));    \
+    }
+
+// Digit subtraction with borrow: differenceOut = minuend - subtrahend - borrowIn (wrapping); borrowOut = 1 if it wrapped, else 0
+#define SUBC(differenceOut, borrowOut, minuend, subtrahend, borrowIn)                                                  \
+    {                                                                                                                  \
+        digit_t tempReg = (minuend) - (subtrahend);                                                                    \
+        unsigned int borrowReg =                                                                                       \
+            (is_digit_lessthan_ct((minuend), (subtrahend)) | ((borrowIn) & is_digit_zero_ct(tempReg)));                \
+        (differenceOut) = tempReg - (digit_t)(borrowIn);                                                               \
+        (borrowOut) = borrowReg;                                                                                       \
+    }
+
+// Shift right with flexible datatype: lowIn moves right by shift, its top bits come from highIn; needs 0 < shift < DigitSize
+#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize)                                                              \
+    (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << ((DigitSize) - (shift)));
+
+// Digit shift left: highIn moves left by shift, its low bits come from the top of lowIn; needs 0 < shift < DigitSize
+#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize)                                                              \
+    (shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> ((DigitSize) - (shift)));
+
+#endif
diff --git a/src/pqm4/sqisign_lvl5/ref/pqm4_api.c b/src/pqm4/sqisign_lvl5/ref/pqm4_api.c
new file mode 100644
index 0000000..49cba9d
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/pqm4_api.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <api.h>
+#include <sig.h>
+#include <string.h>
+
+typedef struct {
+  size_t mlen;                         // number of bytes of msg in use
+  unsigned char msg[59];               // raw message bytes; unsigned because entries are arbitrary octets
+  size_t smlen;                        // number of bytes of sm in use
+  unsigned char sm[59 + CRYPTO_BYTES]; // raw signed-message bytes (values above 0x7F do not fit a signed char)
+} SQISign_KAT_t;
+
+const unsigned char kat_lvl5_pk[CRYPTO_PUBLICKEYBYTES] = { // raw octets: unsigned so that values above 0x7F are representable
+  0x4B, 0xA3, 0x81, 0xDA, 0xF9, 0x17, 0x40, 0x97, 0x4C, 0xB3, 0x61, 0xE6, 0x5A, 0x1B, 0x82, 0xFD, 0x17, 0x4A, 0x1F, 0x58, 0x18, 0x7A, 0xD8, 0x2C, 0xD8, 0xBC, 0x06, 0xCC, 0x3E, 0xC4, 0x29, 0x56, 0xC2, 0x4E, 0x7F, 0xA7, 0x54, 0x6D, 0xBE, 0x63, 0x50, 0x30, 0x73, 0xCB, 0x42, 0x57, 0x7F, 0x57, 0xD5, 0xCF, 0x36, 0xE2, 0xF0, 0x6E, 0xBD, 0xFB, 0x6E, 0x02, 0x7F, 0xCD, 0xD6, 0x52, 0x57, 0x01, 0x04, 0x70, 0x5B, 0xF7, 0x83, 0x55, 0xD9, 0x61, 0x24, 0xA2, 0xBF, 0x6B, 0x49, 0x2E, 0xFC, 0x43, 0x49, 0xF4, 0xD3, 0xAF, 0x47, 0x55, 0xCB, 0x3D, 0x4C, 0xAB, 0xD5, 0x05, 0x8A, 0x6B, 0x2B, 0x22, 0x8E, 0x47, 0x26, 0xE5, 0x15, 0xD2, 0x4B, 0x4C, 0x33, 0x12, 0x44, 0x49, 0x1B, 0x0A, 0x59, 0xEC, 0x94, 0x1D, 0xDD, 0xE7, 0xDA, 0x12, 0xEE, 0xE7, 0x3C, 0x84, 0x75, 0x40, 0xCE, 0xEA, 0x70, 0x00, 0x0B, 
+};
+
+const SQISign_KAT_t kat_lvl5[2] = {
+  {
+    .mlen = 32,
+    .msg = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, },
+    .smlen = 32 + CRYPTO_BYTES,
+    .sm = { 0x8D, 0x7F, 0xCC, 0x36, 0xA4, 0x56, 0x85, 0x22, 0x11, 0x50, 0x98, 0x9C, 0x4F, 0x23, 0xCD, 0x24, 0x6D, 0xB1, 0x3A, 0x6A, 0x8C, 0xA5, 0x6D, 0x81, 0x13, 0x6D, 0x95, 0x6B, 0xD6, 0x15, 0x5A, 0x37, 0x8E, 0x96, 0x98, 0x14, 0x4A, 0x0E, 0x27, 0x2A, 0x0B, 0x8C, 0x80, 0x64, 0xC5, 0x8C, 0x62, 0x5C, 0xB4, 0x5C, 0xB0, 0x64, 0x99, 0x19, 0x9C, 0x6B, 0x0D, 0x34, 0x60, 0x9F, 0x75, 0x01, 0xA0, 0x01, 0x3A, 0x3D, 0xDF, 0x64, 0xB4, 0x74, 0x38, 0x39, 0x6D, 0x3A, 0x41, 0xE3, 0xA5, 0xAF, 0xDF, 0xEB, 0x1E, 0xAC, 0x87, 0x8A, 0x1E, 0xDF, 0x76, 0x1B, 0x81, 0x33, 0xAF, 0x43, 0x53, 0xB8, 0x2C, 0x2C, 0x7A, 0x29, 0x89, 0x9C, 0x2F, 0xD7, 0x22, 0x78, 0x88, 0x98, 0xA4, 0x24, 0xF6, 0x0B, 0xFE, 0x4A, 0xA1, 0x5F, 0xC0, 0x71, 0xD1, 0x31, 0xB4, 0xE5, 0x3A, 0xDB, 0xB7, 0x8B, 0xCD, 0xF0, 0x8C, 0x00, 0x00, 0x01, 0x2C, 0x89, 0x64, 0x6E, 0x35, 0xD3, 0xF8, 0x25, 0x76, 0x2E, 0x84, 0xF2, 0x68, 0x37, 0x8E, 0xAB, 0x35, 0x7D, 0x02, 0xE9, 0x01, 0x29, 0x22, 0xE5, 0x78, 0x3E, 0x49, 0xB0, 0x21, 0xCB, 0xD4, 0x07, 0xEB, 0x60, 0x32, 0x18, 0x0E, 0x17, 0xCB, 0x7D, 0x58, 0xCF, 0x54, 0xC1, 0xFF, 0x5E, 0x96, 0x25, 0x82, 0x68, 0x1F, 0x4B, 0x6C, 0xD5, 0x0E, 0xD3, 0x11, 0x8C, 0x36, 0xBA, 0x05, 0xC1, 0xDC, 0x34, 0xE2, 0xBD, 0x31, 0x86, 0x8C, 0x51, 0xC9, 0xE2, 0x7D, 0xEB, 0xA1, 0x7F, 0x15, 0xA0, 0xDF, 0xEC, 0xF7, 0xA2, 0x2C, 0x30, 0x7D, 0x1E, 0x20, 0x91, 0xCF, 0x97, 0xF0, 0x6B, 0x9A, 0x21, 0x58, 0x26, 0x86, 0x9F, 0x7E, 0xC5, 0x40, 0xA9, 0xF6, 0xB0, 0x9C, 0x89, 0xDD, 0xC3, 0xE5, 0xF5, 0xEC, 0xAE, 0xBD, 0x0E, 0xB1, 0xDD, 0xDA, 0xCC, 0x0C, 0xAE, 0x0E, 0xC9, 0xCD, 0xDE, 0xEF, 0xEE, 0x46, 0x61, 0xC2, 0xEB, 0xB0, 0x8B, 0xD3, 0x8F, 0xB7, 0x86, 0x16, 0xAC, 0x84, 0x3A, 0x47, 0xA3, 0x48, 0x8B, 0xF6, 0xDB, 0x2B, 0x4E, 0xA2, 0x60, 0x1F, 0xA5, 0x47, 0xD1, 0xFA, 0x34, 0x65, 0xD2, 0x63, 0x00, 0x06, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, },
+  },
+  {
+    .mlen = 59,
+    .msg = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, },
+    .smlen = 59 + CRYPTO_BYTES,
+    .sm = { 0xD5, 0x7B, 0x91, 0xF4, 0x45, 0xB3, 0xE1, 0x8E, 0xAF, 0x36, 0xAF, 0xC9, 0xDE, 0xB4, 0x78, 0xCC, 0x8A, 0x8C, 0xEA, 0x72, 0x1E, 0xF3, 0x8F, 0xEC, 0xF3, 0xDC, 0xA6, 0x92, 0x22, 0xC4, 0xB8, 0x16, 0xD4, 0x20, 0x3C, 0x91, 0x0E, 0x0E, 0x92, 0xA0, 0xE6, 0xC0, 0x0E, 0xB1, 0x22, 0x50, 0xA3, 0x9B, 0x8D, 0x5F, 0x18, 0x3B, 0xAF, 0xDE, 0xBE, 0xD2, 0x1A, 0x74, 0x88, 0x6F, 0x8E, 0xD9, 0xA8, 0x01, 0xF0, 0x41, 0x22, 0x82, 0x9F, 0xC5, 0xDF, 0x2D, 0xD3, 0x4F, 0x84, 0xAC, 0x15, 0xA1, 0xEF, 0xC0, 0x14, 0x16, 0xB7, 0x12, 0xDC, 0xE9, 0xB8, 0x94, 0x48, 0x13, 0x63, 0xD2, 0x3C, 0x6F, 0x21, 0xD1, 0x70, 0xDA, 0xC6, 0x7A, 0xC8, 0xF5, 0x03, 0xC9, 0x8A, 0xAA, 0x52, 0x48, 0x50, 0x79, 0x8D, 0xA5, 0x1B, 0x19, 0xCE, 0x98, 0x3D, 0xBE, 0xB2, 0xB4, 0x7E, 0x92, 0x23, 0xDD, 0x98, 0x9B, 0x7D, 0x01, 0x01, 0x04, 0xAE, 0xFA, 0xF9, 0xEF, 0x9E, 0xE9, 0xE9, 0xE0, 0x5B, 0x2B, 0x03, 0x41, 0xB6, 0x5F, 0xAA, 0x38, 0x5E, 0xA3, 0x3C, 0x8D, 0x25, 0x7C, 0x51, 0x5E, 0x74, 0x55, 0x09, 0xE1, 0xB4, 0x0C, 0xF9, 0x31, 0x4F, 0x20, 0xC1, 0x23, 0x0D, 0x16, 0x1D, 0x6A, 0x79, 0x98, 0x81, 0x52, 0x77, 0x4A, 0x7F, 0xF7, 0x01, 0x61, 0xE8, 0x6B, 0xF7, 0xE6, 0x75, 0x63, 0x8F, 0xB4, 0x73, 0xF3, 0x83, 0x44, 0x43, 0x01, 0x86, 0x12, 0x6D, 0xE6, 0x63, 0xF3, 0x10, 0x64, 0xDB, 0xCB, 0x6A, 0x84, 0xA2, 0x46, 0x7B, 0x80, 0x18, 0x7D, 0x66, 0x15, 0x82, 0xFF, 0xDD, 0x45, 0x41, 0xDA, 0x94, 0x5E, 0x46, 0xD0, 0x18, 0x1D, 0x63, 0x75, 0xB2, 0xE1, 0xAF, 0x9B, 0xF7, 0xDD, 0x72, 0x99, 0x15, 0xE4, 0x48, 0xD9, 0x72, 0x45, 0x51, 0x17, 0x2A, 0x71, 0xB4, 0x88, 0x19, 0xCD, 0xC6, 0x4C, 0x2B, 0xAF, 0x34, 0xB0, 0x89, 0x3B, 0x36, 0x62, 0x96, 0x48, 0x19, 0x87, 0x65, 0x44, 0x9A, 0x88, 0x66, 0x0E, 0x5A, 0xA9, 0x2C, 0xA7, 0x67, 0x54, 0xC3, 0x53, 0xCF, 0x62, 0x59, 0x6A, 0x60, 0xFF, 0x17, 0x8A, 0xA1, 0x69, 0x65, 0x00, 0x04, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, },
+  },
+};
+
+int crypto_sign_keypair(unsigned char *pk, unsigned char *sk) {
+  memcpy(pk, kat_lvl5_pk, CRYPTO_PUBLICKEYBYTES); // hands out the fixed stored public key
+  memset(sk, 0, CRYPTO_SECRETKEYBYTES); // We don't need the secret key
+  return 0; // the function is declared int: report success instead of falling off the end
+}
+
+int crypto_sign(unsigned char *sm, size_t *smlen, const unsigned char *m,
+                size_t mlen, const unsigned char *sk) { // a stored entry is picked by mlen alone; m and sk are not read
+  const SQISign_KAT_t *const end = kat_lvl5 + sizeof(kat_lvl5) / sizeof(kat_lvl5[0]);
+  for (const SQISign_KAT_t *kat = kat_lvl5; kat != end; ++kat) {
+    if (mlen != kat->mlen)
+      continue;
+    memcpy(sm, kat->sm, kat->smlen);
+    *smlen = kat->smlen;
+    return 0;
+  }
+  return 1; // no stored entry has this message length
+}
+
+int crypto_sign_open(unsigned char *m, size_t *mlen, const unsigned char *sm,
+                     size_t smlen, const unsigned char *pk) {
+  unsigned long long mlen_ull = mlen ? *mlen : 0; // mlen may be NULL (see the check below): read it only when present
+  int ret = sqisign_open(m, &mlen_ull, sm, smlen, pk);
+  if (mlen) {
+    *mlen = mlen_ull; // narrow the unsigned long long length back to the caller's size_t
+  }
+  return ret;
+}
diff --git a/src/pqm4/sqisign_lvl5/ref/rng.h b/src/pqm4/sqisign_lvl5/ref/rng.h
new file mode 100644
index 0000000..3c24d07
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/rng.h
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef rng_h
+#define rng_h
+
+#include "randombytes.h"
+
+#endif /* rng_h */
diff --git a/src/pqm4/sqisign_lvl5/ref/sig.h b/src/pqm4/sqisign_lvl5/ref/sig.h
new file mode 100644
index 0000000..4c33510
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/sig.h
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SQISIGN_H
+#define SQISIGN_H
+
+#include <stdint.h>
+#include <sqisign_namespace.h>
+
+#if defined(ENABLE_SIGN)
+/**
+ * SQIsign keypair generation.
+ *
+ * The implementation corresponds to SQIsign.CompactKeyGen() in the SQIsign spec.
+ * The caller is responsible for allocating sufficient memory to hold pk and sk.
+ *
+ * @param[out] pk SQIsign public key
+ * @param[out] sk SQIsign secret key
+ * @return int status code (0 or 1; presumably 0 means success, as documented for sqisign_verify)
+ */
+SQISIGN_API 
+int sqisign_keypair(unsigned char *pk, unsigned char *sk);
+
+/**
+ * SQIsign signature generation.
+ *
+ * The implementation performs SQIsign.expandSK() + SQIsign.sign() in the SQIsign spec.
+ * The key provided is a compact secret key.
+ * The caller is responsible for allocating sufficient memory to hold sm (signature plus mlen bytes).
+ *
+ * @param[out] sm Signature concatenated with message
+ * @param[out] smlen Pointer to the length of sm (set to 0 when the call returns nonzero)
+ * @param[in] m Message to be signed
+ * @param[in] mlen Message length
+ * @param[in] sk Compacted secret key
+ * @return int status code (0 or 1; presumably 0 means success)
+ */
+SQISIGN_API 
+int sqisign_sign(unsigned char *sm,
+                 unsigned long long *smlen,
+                 const unsigned char *m,
+                 unsigned long long mlen,
+                 const unsigned char *sk);
+#endif
+
+/**
+ * SQIsign open signature.
+ *
+ * The implementation performs SQIsign.verify(). If the signature verification succeeds, the
+ * original message is stored in m. The key provided is a compact public key. The caller is
+ * responsible for allocating sufficient memory to hold m.
+ *
+ * @param[out] m Message stored if verification succeeds
+ * @param[out] mlen Pointer to the length of m (set to 0 if verification fails)
+ * @param[in] sm Signature concatenated with message
+ * @param[in] smlen Length of sm
+ * @param[in] pk Compacted public key
+ * @return int status code (0 if verification succeeded, 1 otherwise)
+ */
+SQISIGN_API
+int sqisign_open(unsigned char *m,
+                 unsigned long long *mlen,
+                 const unsigned char *sm,
+                 unsigned long long smlen,
+                 const unsigned char *pk);
+
+/**
+ * SQIsign verify signature.
+ *
+ * If the signature verification succeeded, returns 0, otherwise 1.
+ *
+ * @param[in] m Message whose signature is checked
+ * @param[in] mlen Length of m
+ * @param[in] sig Signature
+ * @param[in] siglen Length of sig
+ * @param[in] pk Compacted public key
+ * @return int 0 if verification succeeded, 1 otherwise.
+ */
+SQISIGN_API 
+int sqisign_verify(const unsigned char *m,
+                   unsigned long long mlen,
+                   const unsigned char *sig,
+                   unsigned long long siglen,
+                   const unsigned char *pk);
+
+#endif
diff --git a/src/pqm4/sqisign_lvl5/ref/sqisign.c b/src/pqm4/sqisign_lvl5/ref/sqisign.c
new file mode 100644
index 0000000..57fd75d
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/sqisign.c
@@ -0,0 +1,106 @@
+#include <sig.h>
+#include <string.h>
+#include <encoded_sizes.h>
+#include <verification.h>
+#if defined(ENABLE_SIGN)
+#include <signature.h>
+#endif
+
+#if defined(ENABLE_SIGN)
+// Generate a key pair with protocols_keygen() and serialize it into the caller's pk/sk buffers.
+SQISIGN_API
+int
+sqisign_keypair(unsigned char *pk, unsigned char *sk)
+{
+    public_key_t pub_key = { 0 };
+    secret_key_t sec_key;
+    secret_key_init(&sec_key);
+    // The result is negated: this wrapper returns 0 when protocols_keygen() returns nonzero.
+    const int status = !protocols_keygen(&pub_key, &sec_key);
+    // Both keys are encoded whatever the status; the secret key is finalized on every path.
+    secret_key_to_bytes(sk, &sec_key, &pub_key);
+    public_key_to_bytes(pk, &pub_key);
+    secret_key_finalize(&sec_key);
+    return status;
+}
+
+SQISIGN_API
+int
+sqisign_sign(unsigned char *sm,
+             unsigned long long *smlen,
+             const unsigned char *m,
+             unsigned long long mlen,
+             const unsigned char *sk)
+{
+    public_key_t pub_key = { 0 };
+    secret_key_t sec_key;
+    signature_t sig_data;
+    unsigned char *const msg_slot = sm + SIGNATURE_BYTES;
+    // Decode the caller's secret key; pub_key is handed to the decoder as well.
+    secret_key_init(&sec_key);
+    secret_key_from_bytes(&sec_key, &pub_key, sk);
+
+    // Stage the message behind the signature slot first; memmove tolerates m overlapping sm.
+    memmove(msg_slot, m, mlen);
+
+    // The result is negated: 0 is returned when protocols_sign() returns nonzero.
+    const int status = !protocols_sign(&sig_data, &pub_key, &sec_key, msg_slot, mlen);
+    if (status == 0) {
+        signature_to_bytes(sm, &sig_data);
+        *smlen = SIGNATURE_BYTES + mlen;
+    } else {
+        *smlen = 0;
+    }
+    secret_key_finalize(&sec_key);
+    return status;
+}
+#endif
+
+SQISIGN_API
+int
+sqisign_open(unsigned char *m, unsigned long long *mlen, const unsigned char *sm,
+             unsigned long long smlen, const unsigned char *pk)
+{
+    public_key_t pkt = { 0 };
+    signature_t sigt;
+    // sm is untrusted input: if it is shorter than one signature, smlen - SIGNATURE_BYTES wraps
+    // around and the decode/verify/memset below run far past the buffer. Reject it up front.
+    if (smlen < SIGNATURE_BYTES) {
+        *mlen = 0;
+        return 1;
+    }
+    const unsigned long long msg_len = smlen - SIGNATURE_BYTES;
+    public_key_from_bytes(&pkt, pk);
+    signature_from_bytes(&sigt, sm);
+    // The result is negated: ret is 0 when protocols_verify() returns nonzero.
+    const int ret = !protocols_verify(&sigt, &pkt, sm + SIGNATURE_BYTES, msg_len);
+    if (!ret) {
+        *mlen = msg_len;
+        memmove(m, sm + SIGNATURE_BYTES, msg_len); // memmove: m may overlap sm
+    } else {
+        *mlen = 0;
+        memset(m, 0, msg_len); // never hand back an unverified message
+    }
+    return ret;
+}
+
+SQISIGN_API
+int
+sqisign_verify(const unsigned char *m,
+               unsigned long long mlen,
+               const unsigned char *sig,
+               unsigned long long siglen,
+               const unsigned char *pk)
+{
+    public_key_t pkt = { 0 };
+    signature_t sigt;
+    // siglen used to be ignored, yet signature_from_bytes() is not told how long sig is
+    // (it presumably consumes SIGNATURE_BYTES bytes), so refuse a truncated signature first.
+    if (siglen < SIGNATURE_BYTES) {
+        return 1;
+    }
+    public_key_from_bytes(&pkt, pk);
+    signature_from_bytes(&sigt, sig);
+    // The result is negated: 0 is returned when protocols_verify() returns nonzero.
+    return !protocols_verify(&sigt, &pkt, m, mlen);
+}
diff --git a/src/pqm4/sqisign_lvl5/ref/sqisign_namespace.h b/src/pqm4/sqisign_lvl5/ref/sqisign_namespace.h
new file mode 100644
index 0000000..14fd51d
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/sqisign_namespace.h
@@ -0,0 +1,1022 @@
+
+#ifndef SQISIGN_NAMESPACE_H
+#define SQISIGN_NAMESPACE_H
+
+//#define DISABLE_NAMESPACING
+
+#if defined(_WIN32)
+#define SQISIGN_API __declspec(dllexport) // Windows: export the symbol from the DLL
+#else
+#define SQISIGN_API __attribute__((visibility("default"))) // elsewhere: default symbol visibility
+#endif
+
+#define PARAM_JOIN3_(a, b, c) sqisign_##a##_##b##_##c // raw token paste: sqisign_<a>_<b>_<c>
+#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c) // extra level so arguments are macro-expanded before pasting
+#define PARAM_NAME3(end, s) PARAM_JOIN3(SQISIGN_VARIANT, end, s) // sqisign_<SQISIGN_VARIANT>_<end>_<s>
+
+#define PARAM_JOIN2_(a, b) sqisign_##a##_##b // raw token paste: sqisign_<a>_<b>
+#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b) // extra level so arguments are macro-expanded before pasting
+#define PARAM_NAME2(end, s) PARAM_JOIN2(end, s) // sqisign_<end>_<s>, no variant component
+
+#ifndef DISABLE_NAMESPACING
+#define SQISIGN_NAMESPACE_GENERIC(s) PARAM_NAME2(gen, s) // prefix without the variant: sqisign_gen_<s>
+#else
+#define SQISIGN_NAMESPACE_GENERIC(s) s // namespacing disabled: keep the plain name
+#endif
+
+#if defined(SQISIGN_VARIANT) && !defined(DISABLE_NAMESPACING) // names become sqisign_<variant>_<build>_<s>
+#if defined(SQISIGN_BUILD_TYPE_REF)
+#define SQISIGN_NAMESPACE(s) PARAM_NAME3(ref, s)
+#elif defined(SQISIGN_BUILD_TYPE_OPT)
+#define SQISIGN_NAMESPACE(s) PARAM_NAME3(opt, s)
+#elif defined(SQISIGN_BUILD_TYPE_BROADWELL)
+#define SQISIGN_NAMESPACE(s) PARAM_NAME3(broadwell, s)
+#elif defined(SQISIGN_BUILD_TYPE_ARM64CRYPTO)
+#define SQISIGN_NAMESPACE(s) PARAM_NAME3(arm64crypto, s)
+#else
+#error "Build type not known"
+#endif
+
+#else // no variant selected, or namespacing disabled: keep the plain name
+#define SQISIGN_NAMESPACE(s) s
+#endif
+
+// Namespacing symbols exported from algebra.c:
+#undef quat_alg_add
+#undef quat_alg_conj
+#undef quat_alg_coord_mul
+#undef quat_alg_elem_copy
+#undef quat_alg_elem_copy_ibz
+#undef quat_alg_elem_equal
+#undef quat_alg_elem_is_zero
+#undef quat_alg_elem_mul_by_scalar
+#undef quat_alg_elem_set
+#undef quat_alg_equal_denom
+#undef quat_alg_init_set_ui
+#undef quat_alg_make_primitive
+#undef quat_alg_mul
+#undef quat_alg_norm
+#undef quat_alg_normalize
+#undef quat_alg_scalar
+#undef quat_alg_sub
+
+#define quat_alg_add                                    SQISIGN_NAMESPACE_GENERIC(quat_alg_add)
+#define quat_alg_conj                                   SQISIGN_NAMESPACE_GENERIC(quat_alg_conj)
+#define quat_alg_coord_mul                              SQISIGN_NAMESPACE_GENERIC(quat_alg_coord_mul)
+#define quat_alg_elem_copy                              SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_copy)
+#define quat_alg_elem_copy_ibz                          SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_copy_ibz)
+#define quat_alg_elem_equal                             SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_equal)
+#define quat_alg_elem_is_zero                           SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_is_zero)
+#define quat_alg_elem_mul_by_scalar                     SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_mul_by_scalar)
+#define quat_alg_elem_set                               SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_set)
+#define quat_alg_equal_denom                            SQISIGN_NAMESPACE_GENERIC(quat_alg_equal_denom)
+#define quat_alg_init_set_ui                            SQISIGN_NAMESPACE_GENERIC(quat_alg_init_set_ui)
+#define quat_alg_make_primitive                         SQISIGN_NAMESPACE_GENERIC(quat_alg_make_primitive)
+#define quat_alg_mul                                    SQISIGN_NAMESPACE_GENERIC(quat_alg_mul)
+#define quat_alg_norm                                   SQISIGN_NAMESPACE_GENERIC(quat_alg_norm)
+#define quat_alg_normalize                              SQISIGN_NAMESPACE_GENERIC(quat_alg_normalize)
+#define quat_alg_scalar                                 SQISIGN_NAMESPACE_GENERIC(quat_alg_scalar)
+#define quat_alg_sub                                    SQISIGN_NAMESPACE_GENERIC(quat_alg_sub)
+
+// Namespacing symbols exported from api.c:
+#undef crypto_sign
+#undef crypto_sign_keypair
+#undef crypto_sign_open
+
+#define crypto_sign                                     SQISIGN_NAMESPACE(crypto_sign)
+#define crypto_sign_keypair                             SQISIGN_NAMESPACE(crypto_sign_keypair)
+#define crypto_sign_open                                SQISIGN_NAMESPACE(crypto_sign_open)
+
+// Namespacing symbols exported from basis.c:
+#undef ec_curve_to_basis_2f_from_hint
+#undef ec_curve_to_basis_2f_to_hint
+#undef ec_recover_y
+#undef lift_basis
+#undef lift_basis_normalized
+
+#define ec_curve_to_basis_2f_from_hint                  SQISIGN_NAMESPACE(ec_curve_to_basis_2f_from_hint)
+#define ec_curve_to_basis_2f_to_hint                    SQISIGN_NAMESPACE(ec_curve_to_basis_2f_to_hint)
+#define ec_recover_y                                    SQISIGN_NAMESPACE(ec_recover_y)
+#define lift_basis                                      SQISIGN_NAMESPACE(lift_basis)
+#define lift_basis_normalized                           SQISIGN_NAMESPACE(lift_basis_normalized)
+
+// Namespacing symbols exported from biextension.c:
+#undef clear_cofac
+#undef ec_dlog_2_tate
+#undef ec_dlog_2_weil
+#undef fp2_frob
+#undef reduced_tate
+#undef weil
+
+#define clear_cofac                                     SQISIGN_NAMESPACE(clear_cofac)
+#define ec_dlog_2_tate                                  SQISIGN_NAMESPACE(ec_dlog_2_tate)
+#define ec_dlog_2_weil                                  SQISIGN_NAMESPACE(ec_dlog_2_weil)
+#define fp2_frob                                        SQISIGN_NAMESPACE(fp2_frob)
+#define reduced_tate                                    SQISIGN_NAMESPACE(reduced_tate)
+#define weil                                            SQISIGN_NAMESPACE(weil)
+
+// Namespacing symbols exported from common.c:
+#undef hash_to_challenge
+#undef public_key_finalize
+#undef public_key_init
+
+#define hash_to_challenge                               SQISIGN_NAMESPACE(hash_to_challenge)
+#define public_key_finalize                             SQISIGN_NAMESPACE(public_key_finalize)
+#define public_key_init                                 SQISIGN_NAMESPACE(public_key_init)
+
+// Namespacing symbols exported from dim2.c:
+#undef ibz_2x2_mul_mod
+#undef ibz_mat_2x2_add
+#undef ibz_mat_2x2_copy
+#undef ibz_mat_2x2_det_from_ibz
+#undef ibz_mat_2x2_eval
+#undef ibz_mat_2x2_inv_mod
+#undef ibz_mat_2x2_set
+#undef ibz_vec_2_set
+
+#define ibz_2x2_mul_mod                                 SQISIGN_NAMESPACE_GENERIC(ibz_2x2_mul_mod)
+#define ibz_mat_2x2_add                                 SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_add)
+#define ibz_mat_2x2_copy                                SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_copy)
+#define ibz_mat_2x2_det_from_ibz                        SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_det_from_ibz)
+#define ibz_mat_2x2_eval                                SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_eval)
+#define ibz_mat_2x2_inv_mod                             SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_inv_mod)
+#define ibz_mat_2x2_set                                 SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_set)
+#define ibz_vec_2_set                                   SQISIGN_NAMESPACE_GENERIC(ibz_vec_2_set)
+
+// Namespacing symbols exported from dim2id2iso.c:
+#undef dim2id2iso_arbitrary_isogeny_evaluation
+#undef dim2id2iso_ideal_to_isogeny_clapotis
+#undef find_uv
+#undef fixed_degree_isogeny_and_eval
+
+#define dim2id2iso_arbitrary_isogeny_evaluation         SQISIGN_NAMESPACE(dim2id2iso_arbitrary_isogeny_evaluation)
+#define dim2id2iso_ideal_to_isogeny_clapotis            SQISIGN_NAMESPACE(dim2id2iso_ideal_to_isogeny_clapotis)
+#define find_uv                                         SQISIGN_NAMESPACE(find_uv)
+#define fixed_degree_isogeny_and_eval                   SQISIGN_NAMESPACE(fixed_degree_isogeny_and_eval)
+
+// Namespacing symbols exported from dim4.c:
+#undef ibz_inv_dim4_make_coeff_mpm
+#undef ibz_inv_dim4_make_coeff_pmp
+#undef ibz_mat_4x4_copy
+#undef ibz_mat_4x4_equal
+#undef ibz_mat_4x4_eval
+#undef ibz_mat_4x4_eval_t
+#undef ibz_mat_4x4_gcd
+#undef ibz_mat_4x4_identity
+#undef ibz_mat_4x4_inv_with_det_as_denom
+#undef ibz_mat_4x4_is_identity
+#undef ibz_mat_4x4_mul
+#undef ibz_mat_4x4_negate
+#undef ibz_mat_4x4_scalar_div
+#undef ibz_mat_4x4_scalar_mul
+#undef ibz_mat_4x4_transpose
+#undef ibz_mat_4x4_zero
+#undef ibz_vec_4_add
+#undef ibz_vec_4_content
+#undef ibz_vec_4_copy
+#undef ibz_vec_4_copy_ibz
+#undef ibz_vec_4_is_zero
+#undef ibz_vec_4_linear_combination
+#undef ibz_vec_4_negate
+#undef ibz_vec_4_scalar_div
+#undef ibz_vec_4_scalar_mul
+#undef ibz_vec_4_set
+#undef ibz_vec_4_sub
+#undef quat_qf_eval
+
+#define ibz_inv_dim4_make_coeff_mpm                     SQISIGN_NAMESPACE_GENERIC(ibz_inv_dim4_make_coeff_mpm)
+#define ibz_inv_dim4_make_coeff_pmp                     SQISIGN_NAMESPACE_GENERIC(ibz_inv_dim4_make_coeff_pmp)
+#define ibz_mat_4x4_copy                                SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_copy)
+#define ibz_mat_4x4_equal                               SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_equal)
+#define ibz_mat_4x4_eval                                SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_eval)
+#define ibz_mat_4x4_eval_t                              SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_eval_t)
+#define ibz_mat_4x4_gcd                                 SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_gcd)
+#define ibz_mat_4x4_identity                            SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_identity)
+#define ibz_mat_4x4_inv_with_det_as_denom               SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_inv_with_det_as_denom)
+#define ibz_mat_4x4_is_identity                         SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_is_identity)
+#define ibz_mat_4x4_mul                                 SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_mul)
+#define ibz_mat_4x4_negate                              SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_negate)
+#define ibz_mat_4x4_scalar_div                          SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_scalar_div)
+#define ibz_mat_4x4_scalar_mul                          SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_scalar_mul)
+#define ibz_mat_4x4_transpose                           SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_transpose)
+#define ibz_mat_4x4_zero                                SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_zero)
+#define ibz_vec_4_add                                   SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_add)
+#define ibz_vec_4_content                               SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_content)
+#define ibz_vec_4_copy                                  SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_copy)
+#define ibz_vec_4_copy_ibz                              SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_copy_ibz)
+#define ibz_vec_4_is_zero                               SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_is_zero)
+#define ibz_vec_4_linear_combination                    SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_linear_combination)
+#define ibz_vec_4_negate                                SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_negate)
+#define ibz_vec_4_scalar_div                            SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_scalar_div)
+#define ibz_vec_4_scalar_mul                            SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_scalar_mul)
+#define ibz_vec_4_set                                   SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_set)
+#define ibz_vec_4_sub                                   SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_sub)
+#define quat_qf_eval                                    SQISIGN_NAMESPACE_GENERIC(quat_qf_eval)
+
+// Namespacing symbols exported from ec.c:
+#undef cswap_points
+#undef ec_biscalar_mul
+#undef ec_curve_init
+#undef ec_curve_init_from_A
+#undef ec_curve_normalize_A24
+#undef ec_curve_verify_A
+#undef ec_dbl
+#undef ec_dbl_iter
+#undef ec_dbl_iter_basis
+#undef ec_has_zero_coordinate
+#undef ec_is_basis_four_torsion
+#undef ec_is_equal
+#undef ec_is_four_torsion
+#undef ec_is_two_torsion
+#undef ec_is_zero
+#undef ec_j_inv
+#undef ec_ladder3pt
+#undef ec_mul
+#undef ec_normalize_curve
+#undef ec_normalize_curve_and_A24
+#undef ec_normalize_point
+#undef ec_point_init
+#undef select_point
+#undef xADD
+#undef xDBL
+#undef xDBLADD
+#undef xDBLMUL
+#undef xDBL_A24
+#undef xDBL_E0
+#undef xMUL
+
+#define cswap_points                                    SQISIGN_NAMESPACE(cswap_points)
+#define ec_biscalar_mul                                 SQISIGN_NAMESPACE(ec_biscalar_mul)
+#define ec_curve_init                                   SQISIGN_NAMESPACE(ec_curve_init)
+#define ec_curve_init_from_A                            SQISIGN_NAMESPACE(ec_curve_init_from_A)
+#define ec_curve_normalize_A24                          SQISIGN_NAMESPACE(ec_curve_normalize_A24)
+#define ec_curve_verify_A                               SQISIGN_NAMESPACE(ec_curve_verify_A)
+#define ec_dbl                                          SQISIGN_NAMESPACE(ec_dbl)
+#define ec_dbl_iter                                     SQISIGN_NAMESPACE(ec_dbl_iter)
+#define ec_dbl_iter_basis                               SQISIGN_NAMESPACE(ec_dbl_iter_basis)
+#define ec_has_zero_coordinate                          SQISIGN_NAMESPACE(ec_has_zero_coordinate)
+#define ec_is_basis_four_torsion                        SQISIGN_NAMESPACE(ec_is_basis_four_torsion)
+#define ec_is_equal                                     SQISIGN_NAMESPACE(ec_is_equal)
+#define ec_is_four_torsion                              SQISIGN_NAMESPACE(ec_is_four_torsion)
+#define ec_is_two_torsion                               SQISIGN_NAMESPACE(ec_is_two_torsion)
+#define ec_is_zero                                      SQISIGN_NAMESPACE(ec_is_zero)
+#define ec_j_inv                                        SQISIGN_NAMESPACE(ec_j_inv)
+#define ec_ladder3pt                                    SQISIGN_NAMESPACE(ec_ladder3pt)
+#define ec_mul                                          SQISIGN_NAMESPACE(ec_mul)
+#define ec_normalize_curve                              SQISIGN_NAMESPACE(ec_normalize_curve)
+#define ec_normalize_curve_and_A24                      SQISIGN_NAMESPACE(ec_normalize_curve_and_A24)
+#define ec_normalize_point                              SQISIGN_NAMESPACE(ec_normalize_point)
+#define ec_point_init                                   SQISIGN_NAMESPACE(ec_point_init)
+#define select_point                                    SQISIGN_NAMESPACE(select_point)
+#define xADD                                            SQISIGN_NAMESPACE(xADD)
+#define xDBL                                            SQISIGN_NAMESPACE(xDBL)
+#define xDBLADD                                         SQISIGN_NAMESPACE(xDBLADD)
+#define xDBLMUL                                         SQISIGN_NAMESPACE(xDBLMUL)
+#define xDBL_A24                                        SQISIGN_NAMESPACE(xDBL_A24)
+#define xDBL_E0                                         SQISIGN_NAMESPACE(xDBL_E0)
+#define xMUL                                            SQISIGN_NAMESPACE(xMUL)
+
+// Namespacing symbols exported from ec_jac.c:
+#undef ADD
+#undef DBL
+#undef DBLW
+#undef copy_jac_point
+#undef jac_from_ws
+#undef jac_init
+#undef jac_is_equal
+#undef jac_neg
+#undef jac_to_ws
+#undef jac_to_xz
+#undef jac_to_xz_add_components
+#undef select_jac_point
+
+#define ADD                                             SQISIGN_NAMESPACE(ADD)
+#define DBL                                             SQISIGN_NAMESPACE(DBL)
+#define DBLW                                            SQISIGN_NAMESPACE(DBLW)
+#define copy_jac_point                                  SQISIGN_NAMESPACE(copy_jac_point)
+#define jac_from_ws                                     SQISIGN_NAMESPACE(jac_from_ws)
+#define jac_init                                        SQISIGN_NAMESPACE(jac_init)
+#define jac_is_equal                                    SQISIGN_NAMESPACE(jac_is_equal)
+#define jac_neg                                         SQISIGN_NAMESPACE(jac_neg)
+#define jac_to_ws                                       SQISIGN_NAMESPACE(jac_to_ws)
+#define jac_to_xz                                       SQISIGN_NAMESPACE(jac_to_xz)
+#define jac_to_xz_add_components                        SQISIGN_NAMESPACE(jac_to_xz_add_components)
+#define select_jac_point                                SQISIGN_NAMESPACE(select_jac_point)
+
+// Namespacing symbols exported from encode_signature.c:
+#undef secret_key_from_bytes
+#undef secret_key_to_bytes
+
+#define secret_key_from_bytes                           SQISIGN_NAMESPACE(secret_key_from_bytes)
+#define secret_key_to_bytes                             SQISIGN_NAMESPACE(secret_key_to_bytes)
+
+// Namespacing symbols exported from encode_verification.c:
+#undef public_key_from_bytes
+#undef public_key_to_bytes
+#undef signature_from_bytes
+#undef signature_to_bytes
+
+#define public_key_from_bytes                           SQISIGN_NAMESPACE(public_key_from_bytes)
+#define public_key_to_bytes                             SQISIGN_NAMESPACE(public_key_to_bytes)
+#define signature_from_bytes                            SQISIGN_NAMESPACE(signature_from_bytes)
+#define signature_to_bytes                              SQISIGN_NAMESPACE(signature_to_bytes)
+
+// Namespacing symbols exported from finit.c:
+#undef ibz_mat_2x2_finalize
+#undef ibz_mat_2x2_init
+#undef ibz_mat_4x4_finalize
+#undef ibz_mat_4x4_init
+#undef ibz_vec_2_finalize
+#undef ibz_vec_2_init
+#undef ibz_vec_4_finalize
+#undef ibz_vec_4_init
+#undef quat_alg_elem_finalize
+#undef quat_alg_elem_init
+#undef quat_alg_finalize
+#undef quat_alg_init_set
+#undef quat_lattice_finalize
+#undef quat_lattice_init
+#undef quat_left_ideal_finalize
+#undef quat_left_ideal_init
+
+#define ibz_mat_2x2_finalize                            SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_finalize)
+#define ibz_mat_2x2_init                                SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_init)
+#define ibz_mat_4x4_finalize                            SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_finalize)
+#define ibz_mat_4x4_init                                SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_init)
+#define ibz_vec_2_finalize                              SQISIGN_NAMESPACE_GENERIC(ibz_vec_2_finalize)
+#define ibz_vec_2_init                                  SQISIGN_NAMESPACE_GENERIC(ibz_vec_2_init)
+#define ibz_vec_4_finalize                              SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_finalize)
+#define ibz_vec_4_init                                  SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_init)
+#define quat_alg_elem_finalize                          SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_finalize)
+#define quat_alg_elem_init                              SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_init)
+#define quat_alg_finalize                               SQISIGN_NAMESPACE_GENERIC(quat_alg_finalize)
+#define quat_alg_init_set                               SQISIGN_NAMESPACE_GENERIC(quat_alg_init_set)
+#define quat_lattice_finalize                           SQISIGN_NAMESPACE_GENERIC(quat_lattice_finalize)
+#define quat_lattice_init                               SQISIGN_NAMESPACE_GENERIC(quat_lattice_init)
+#define quat_left_ideal_finalize                        SQISIGN_NAMESPACE_GENERIC(quat_left_ideal_finalize)
+#define quat_left_ideal_init                            SQISIGN_NAMESPACE_GENERIC(quat_left_ideal_init)
+
+// Namespacing symbols exported from fp.c:
+#undef fp_select
+
+#define fp_select                                       SQISIGN_NAMESPACE(fp_select)
+
+// Namespacing symbols exported from fp.c, fp_p27500_64.c, fp_p5248_64.c, fp_p65376_64.c:
+#undef fp_exp3div4
+#undef fp_inv
+#undef fp_is_square
+#undef fp_sqrt
+
+#define fp_exp3div4                                     SQISIGN_NAMESPACE(fp_exp3div4)
+#define fp_inv                                          SQISIGN_NAMESPACE(fp_inv)
+#define fp_is_square                                    SQISIGN_NAMESPACE(fp_is_square)
+#define fp_sqrt                                         SQISIGN_NAMESPACE(fp_sqrt)
+
+// Namespacing symbols exported from fp2.c:
+#undef fp2_add
+#undef fp2_add_one
+#undef fp2_batched_inv
+#undef fp2_copy
+#undef fp2_cswap
+#undef fp2_decode
+#undef fp2_encode
+#undef fp2_half
+#undef fp2_inv
+#undef fp2_is_equal
+#undef fp2_is_one
+#undef fp2_is_square
+#undef fp2_is_zero
+#undef fp2_mul
+#undef fp2_mul_small
+#undef fp2_neg
+#undef fp2_pow_vartime
+#undef fp2_print
+#undef fp2_select
+#undef fp2_set_one
+#undef fp2_set_small
+#undef fp2_set_zero
+#undef fp2_sqr
+#undef fp2_sqrt
+#undef fp2_sqrt_verify
+#undef fp2_sub
+
+#define fp2_add                                         SQISIGN_NAMESPACE(fp2_add)
+#define fp2_add_one                                     SQISIGN_NAMESPACE(fp2_add_one)
+#define fp2_batched_inv                                 SQISIGN_NAMESPACE(fp2_batched_inv)
+#define fp2_copy                                        SQISIGN_NAMESPACE(fp2_copy)
+#define fp2_cswap                                       SQISIGN_NAMESPACE(fp2_cswap)
+#define fp2_decode                                      SQISIGN_NAMESPACE(fp2_decode)
+#define fp2_encode                                      SQISIGN_NAMESPACE(fp2_encode)
+#define fp2_half                                        SQISIGN_NAMESPACE(fp2_half)
+#define fp2_inv                                         SQISIGN_NAMESPACE(fp2_inv)
+#define fp2_is_equal                                    SQISIGN_NAMESPACE(fp2_is_equal)
+#define fp2_is_one                                      SQISIGN_NAMESPACE(fp2_is_one)
+#define fp2_is_square                                   SQISIGN_NAMESPACE(fp2_is_square)
+#define fp2_is_zero                                     SQISIGN_NAMESPACE(fp2_is_zero)
+#define fp2_mul                                         SQISIGN_NAMESPACE(fp2_mul)
+#define fp2_mul_small                                   SQISIGN_NAMESPACE(fp2_mul_small)
+#define fp2_neg                                         SQISIGN_NAMESPACE(fp2_neg)
+#define fp2_pow_vartime                                 SQISIGN_NAMESPACE(fp2_pow_vartime)
+#define fp2_print                                       SQISIGN_NAMESPACE(fp2_print)
+#define fp2_select                                      SQISIGN_NAMESPACE(fp2_select)
+#define fp2_set_one                                     SQISIGN_NAMESPACE(fp2_set_one)
+#define fp2_set_small                                   SQISIGN_NAMESPACE(fp2_set_small)
+#define fp2_set_zero                                    SQISIGN_NAMESPACE(fp2_set_zero)
+#define fp2_sqr                                         SQISIGN_NAMESPACE(fp2_sqr)
+#define fp2_sqrt                                        SQISIGN_NAMESPACE(fp2_sqrt)
+#define fp2_sqrt_verify                                 SQISIGN_NAMESPACE(fp2_sqrt_verify)
+#define fp2_sub                                         SQISIGN_NAMESPACE(fp2_sub)
+
+// Namespacing symbols exported from fp_p27500_64.c, fp_p5248_64.c, fp_p65376_64.c:
+#undef fp_copy
+#undef fp_cswap
+#undef fp_decode
+#undef fp_decode_reduce
+#undef fp_div3
+#undef fp_encode
+#undef fp_half
+#undef fp_is_equal
+#undef fp_is_zero
+#undef fp_mul_small
+#undef fp_neg
+#undef fp_set_one
+#undef fp_set_small
+#undef fp_set_zero
+
+#define fp_copy                                         SQISIGN_NAMESPACE(fp_copy)
+#define fp_cswap                                        SQISIGN_NAMESPACE(fp_cswap)
+#define fp_decode                                       SQISIGN_NAMESPACE(fp_decode)
+#define fp_decode_reduce                                SQISIGN_NAMESPACE(fp_decode_reduce)
+#define fp_div3                                         SQISIGN_NAMESPACE(fp_div3)
+#define fp_encode                                       SQISIGN_NAMESPACE(fp_encode)
+#define fp_half                                         SQISIGN_NAMESPACE(fp_half)
+#define fp_is_equal                                     SQISIGN_NAMESPACE(fp_is_equal)
+#define fp_is_zero                                      SQISIGN_NAMESPACE(fp_is_zero)
+#define fp_mul_small                                    SQISIGN_NAMESPACE(fp_mul_small)
+#define fp_neg                                          SQISIGN_NAMESPACE(fp_neg)
+#define fp_set_one                                      SQISIGN_NAMESPACE(fp_set_one)
+#define fp_set_small                                    SQISIGN_NAMESPACE(fp_set_small)
+#define fp_set_zero                                     SQISIGN_NAMESPACE(fp_set_zero)
+
+// Namespacing symbols exported from fp_p27500_64.c, fp_p5248_64.c, fp_p65376_64.c, gf27500.c, gf5248.c, gf65376.c:
+#undef fp_add
+#undef fp_mul
+#undef fp_sqr
+#undef fp_sub
+
+#define fp_add                                          SQISIGN_NAMESPACE(fp_add)
+#define fp_mul                                          SQISIGN_NAMESPACE(fp_mul)
+#define fp_sqr                                          SQISIGN_NAMESPACE(fp_sqr)
+#define fp_sub                                          SQISIGN_NAMESPACE(fp_sub)
+
+// Namespacing symbols exported from gf27500.c:
+#undef gf27500_decode
+#undef gf27500_decode_reduce
+#undef gf27500_div
+#undef gf27500_div3
+#undef gf27500_encode
+#undef gf27500_invert
+#undef gf27500_legendre
+#undef gf27500_sqrt
+
+#define gf27500_decode                                  SQISIGN_NAMESPACE(gf27500_decode)
+#define gf27500_decode_reduce                           SQISIGN_NAMESPACE(gf27500_decode_reduce)
+#define gf27500_div                                     SQISIGN_NAMESPACE(gf27500_div)
+#define gf27500_div3                                    SQISIGN_NAMESPACE(gf27500_div3)
+#define gf27500_encode                                  SQISIGN_NAMESPACE(gf27500_encode)
+#define gf27500_invert                                  SQISIGN_NAMESPACE(gf27500_invert)
+#define gf27500_legendre                                SQISIGN_NAMESPACE(gf27500_legendre)
+#define gf27500_sqrt                                    SQISIGN_NAMESPACE(gf27500_sqrt)
+
+// Namespacing symbols exported from gf27500.c, gf5248.c, gf65376.c:
+#undef fp2_mul_c0
+#undef fp2_mul_c1
+#undef fp2_sq_c0
+#undef fp2_sq_c1
+
+#define fp2_mul_c0                                      SQISIGN_NAMESPACE(fp2_mul_c0)
+#define fp2_mul_c1                                      SQISIGN_NAMESPACE(fp2_mul_c1)
+#define fp2_sq_c0                                       SQISIGN_NAMESPACE(fp2_sq_c0)
+#define fp2_sq_c1                                       SQISIGN_NAMESPACE(fp2_sq_c1)
+
+// Namespacing symbols exported from gf5248.c:
+#undef gf5248_decode
+#undef gf5248_decode_reduce
+#undef gf5248_div
+#undef gf5248_div3
+#undef gf5248_encode
+#undef gf5248_invert
+#undef gf5248_legendre
+#undef gf5248_sqrt
+
+#define gf5248_decode                                   SQISIGN_NAMESPACE(gf5248_decode)
+#define gf5248_decode_reduce                            SQISIGN_NAMESPACE(gf5248_decode_reduce)
+#define gf5248_div                                      SQISIGN_NAMESPACE(gf5248_div)
+#define gf5248_div3                                     SQISIGN_NAMESPACE(gf5248_div3)
+#define gf5248_encode                                   SQISIGN_NAMESPACE(gf5248_encode)
+#define gf5248_invert                                   SQISIGN_NAMESPACE(gf5248_invert)
+#define gf5248_legendre                                 SQISIGN_NAMESPACE(gf5248_legendre)
+#define gf5248_sqrt                                     SQISIGN_NAMESPACE(gf5248_sqrt)
+
+// Namespacing symbols exported from gf65376.c:
+#undef gf65376_decode
+#undef gf65376_decode_reduce
+#undef gf65376_div
+#undef gf65376_div3
+#undef gf65376_encode
+#undef gf65376_invert
+#undef gf65376_legendre
+#undef gf65376_sqrt
+
+#define gf65376_decode                                  SQISIGN_NAMESPACE(gf65376_decode)
+#define gf65376_decode_reduce                           SQISIGN_NAMESPACE(gf65376_decode_reduce)
+#define gf65376_div                                     SQISIGN_NAMESPACE(gf65376_div)
+#define gf65376_div3                                    SQISIGN_NAMESPACE(gf65376_div3)
+#define gf65376_encode                                  SQISIGN_NAMESPACE(gf65376_encode)
+#define gf65376_invert                                  SQISIGN_NAMESPACE(gf65376_invert)
+#define gf65376_legendre                                SQISIGN_NAMESPACE(gf65376_legendre)
+#define gf65376_sqrt                                    SQISIGN_NAMESPACE(gf65376_sqrt)
+
+// Namespacing symbols exported from hd.c:
+#undef add_couple_jac_points
+#undef copy_bases_to_kernel
+#undef couple_jac_to_xz
+#undef double_couple_jac_point
+#undef double_couple_jac_point_iter
+#undef double_couple_point
+#undef double_couple_point_iter
+
+#define add_couple_jac_points                           SQISIGN_NAMESPACE(add_couple_jac_points)
+#define copy_bases_to_kernel                            SQISIGN_NAMESPACE(copy_bases_to_kernel)
+#define couple_jac_to_xz                                SQISIGN_NAMESPACE(couple_jac_to_xz)
+#define double_couple_jac_point                         SQISIGN_NAMESPACE(double_couple_jac_point)
+#define double_couple_jac_point_iter                    SQISIGN_NAMESPACE(double_couple_jac_point_iter)
+#define double_couple_point                             SQISIGN_NAMESPACE(double_couple_point)
+#define double_couple_point_iter                        SQISIGN_NAMESPACE(double_couple_point_iter)
+
+// Namespacing symbols exported from hnf.c:
+#undef ibz_mat_4x4_is_hnf
+#undef ibz_mat_4xn_hnf_mod_core
+#undef ibz_vec_4_copy_mod
+#undef ibz_vec_4_linear_combination_mod
+#undef ibz_vec_4_scalar_mul_mod
+
+#define ibz_mat_4x4_is_hnf                              SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_is_hnf)
+#define ibz_mat_4xn_hnf_mod_core                        SQISIGN_NAMESPACE_GENERIC(ibz_mat_4xn_hnf_mod_core)
+#define ibz_vec_4_copy_mod                              SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_copy_mod)
+#define ibz_vec_4_linear_combination_mod                SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_linear_combination_mod)
+#define ibz_vec_4_scalar_mul_mod                        SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_scalar_mul_mod)
+
+// Namespacing symbols exported from hnf_internal.c:
+#undef ibz_centered_mod
+#undef ibz_conditional_assign
+#undef ibz_mod_not_zero
+#undef ibz_xgcd_with_u_not_0
+
+#define ibz_centered_mod                                SQISIGN_NAMESPACE_GENERIC(ibz_centered_mod)
+#define ibz_conditional_assign                          SQISIGN_NAMESPACE_GENERIC(ibz_conditional_assign)
+#define ibz_mod_not_zero                                SQISIGN_NAMESPACE_GENERIC(ibz_mod_not_zero)
+#define ibz_xgcd_with_u_not_0                           SQISIGN_NAMESPACE_GENERIC(ibz_xgcd_with_u_not_0)
+
+// Namespacing symbols exported from ibz_division.c:
+#undef ibz_xgcd
+
+#define ibz_xgcd                                        SQISIGN_NAMESPACE_GENERIC(ibz_xgcd)
+
+// Namespacing symbols exported from id2iso.c:
+#undef change_of_basis_matrix_tate
+#undef change_of_basis_matrix_tate_invert
+#undef ec_biscalar_mul_ibz_vec
+#undef endomorphism_application_even_basis
+#undef id2iso_ideal_to_kernel_dlogs_even
+#undef id2iso_kernel_dlogs_to_ideal_even
+#undef matrix_application_even_basis
+
+#define change_of_basis_matrix_tate                     SQISIGN_NAMESPACE(change_of_basis_matrix_tate)
+#define change_of_basis_matrix_tate_invert              SQISIGN_NAMESPACE(change_of_basis_matrix_tate_invert)
+#define ec_biscalar_mul_ibz_vec                         SQISIGN_NAMESPACE(ec_biscalar_mul_ibz_vec)
+#define endomorphism_application_even_basis             SQISIGN_NAMESPACE(endomorphism_application_even_basis)
+#define id2iso_ideal_to_kernel_dlogs_even               SQISIGN_NAMESPACE(id2iso_ideal_to_kernel_dlogs_even)
+#define id2iso_kernel_dlogs_to_ideal_even               SQISIGN_NAMESPACE(id2iso_kernel_dlogs_to_ideal_even)
+#define matrix_application_even_basis                   SQISIGN_NAMESPACE(matrix_application_even_basis)
+
+// Namespacing symbols exported from ideal.c:
+#undef quat_lideal_add
+#undef quat_lideal_class_gram
+#undef quat_lideal_conjugate_without_hnf
+#undef quat_lideal_copy
+#undef quat_lideal_create
+#undef quat_lideal_create_principal
+#undef quat_lideal_equals
+#undef quat_lideal_generator
+#undef quat_lideal_inter
+#undef quat_lideal_inverse_lattice_without_hnf
+#undef quat_lideal_mul
+#undef quat_lideal_norm
+#undef quat_lideal_right_order
+#undef quat_lideal_right_transporter
+#undef quat_order_discriminant
+#undef quat_order_is_maximal
+
+#define quat_lideal_add                                 SQISIGN_NAMESPACE_GENERIC(quat_lideal_add)
+#define quat_lideal_class_gram                          SQISIGN_NAMESPACE_GENERIC(quat_lideal_class_gram)
+#define quat_lideal_conjugate_without_hnf               SQISIGN_NAMESPACE_GENERIC(quat_lideal_conjugate_without_hnf)
+#define quat_lideal_copy                                SQISIGN_NAMESPACE_GENERIC(quat_lideal_copy)
+#define quat_lideal_create                              SQISIGN_NAMESPACE_GENERIC(quat_lideal_create)
+#define quat_lideal_create_principal                    SQISIGN_NAMESPACE_GENERIC(quat_lideal_create_principal)
+#define quat_lideal_equals                              SQISIGN_NAMESPACE_GENERIC(quat_lideal_equals)
+#define quat_lideal_generator                           SQISIGN_NAMESPACE_GENERIC(quat_lideal_generator)
+#define quat_lideal_inter                               SQISIGN_NAMESPACE_GENERIC(quat_lideal_inter)
+#define quat_lideal_inverse_lattice_without_hnf         SQISIGN_NAMESPACE_GENERIC(quat_lideal_inverse_lattice_without_hnf)
+#define quat_lideal_mul                                 SQISIGN_NAMESPACE_GENERIC(quat_lideal_mul)
+#define quat_lideal_norm                                SQISIGN_NAMESPACE_GENERIC(quat_lideal_norm)
+#define quat_lideal_right_order                         SQISIGN_NAMESPACE_GENERIC(quat_lideal_right_order)
+#define quat_lideal_right_transporter                   SQISIGN_NAMESPACE_GENERIC(quat_lideal_right_transporter)
+#define quat_order_discriminant                         SQISIGN_NAMESPACE_GENERIC(quat_order_discriminant)
+#define quat_order_is_maximal                           SQISIGN_NAMESPACE_GENERIC(quat_order_is_maximal)
+
+// Namespacing symbols exported from intbig.c:
+#undef ibz_abs
+#undef ibz_add
+#undef ibz_bitsize
+#undef ibz_cmp
+#undef ibz_cmp_int32
+#undef ibz_convert_to_str
+#undef ibz_copy
+#undef ibz_copy_digits
+#undef ibz_div
+#undef ibz_div_2exp
+#undef ibz_div_floor
+#undef ibz_divides
+#undef ibz_finalize
+#undef ibz_gcd
+#undef ibz_get
+#undef ibz_init
+#undef ibz_invmod
+#undef ibz_is_even
+#undef ibz_is_odd
+#undef ibz_is_one
+#undef ibz_is_zero
+#undef ibz_legendre
+#undef ibz_mod
+#undef ibz_mod_ui
+#undef ibz_mul
+#undef ibz_neg
+#undef ibz_pow
+#undef ibz_pow_mod
+#undef ibz_print
+#undef ibz_probab_prime
+#undef ibz_rand_interval
+#undef ibz_rand_interval_bits
+#undef ibz_rand_interval_i
+#undef ibz_rand_interval_minm_m
+#undef ibz_set
+#undef ibz_set_from_str
+#undef ibz_size_in_base
+#undef ibz_sqrt
+#undef ibz_sqrt_floor
+#undef ibz_sqrt_mod_p
+#undef ibz_sub
+#undef ibz_swap
+#undef ibz_to_digits
+#undef ibz_two_adic
+
+#define ibz_abs                                         SQISIGN_NAMESPACE_GENERIC(ibz_abs)
+#define ibz_add                                         SQISIGN_NAMESPACE_GENERIC(ibz_add)
+#define ibz_bitsize                                     SQISIGN_NAMESPACE_GENERIC(ibz_bitsize)
+#define ibz_cmp                                         SQISIGN_NAMESPACE_GENERIC(ibz_cmp)
+#define ibz_cmp_int32                                   SQISIGN_NAMESPACE_GENERIC(ibz_cmp_int32)
+#define ibz_convert_to_str                              SQISIGN_NAMESPACE_GENERIC(ibz_convert_to_str)
+#define ibz_copy                                        SQISIGN_NAMESPACE_GENERIC(ibz_copy)
+#define ibz_copy_digits                                 SQISIGN_NAMESPACE_GENERIC(ibz_copy_digits)
+#define ibz_div                                         SQISIGN_NAMESPACE_GENERIC(ibz_div)
+#define ibz_div_2exp                                    SQISIGN_NAMESPACE_GENERIC(ibz_div_2exp)
+#define ibz_div_floor                                   SQISIGN_NAMESPACE_GENERIC(ibz_div_floor)
+#define ibz_divides                                     SQISIGN_NAMESPACE_GENERIC(ibz_divides)
+#define ibz_finalize                                    SQISIGN_NAMESPACE_GENERIC(ibz_finalize)
+#define ibz_gcd                                         SQISIGN_NAMESPACE_GENERIC(ibz_gcd)
+#define ibz_get                                         SQISIGN_NAMESPACE_GENERIC(ibz_get)
+#define ibz_init                                        SQISIGN_NAMESPACE_GENERIC(ibz_init)
+#define ibz_invmod                                      SQISIGN_NAMESPACE_GENERIC(ibz_invmod)
+#define ibz_is_even                                     SQISIGN_NAMESPACE_GENERIC(ibz_is_even)
+#define ibz_is_odd                                      SQISIGN_NAMESPACE_GENERIC(ibz_is_odd)
+#define ibz_is_one                                      SQISIGN_NAMESPACE_GENERIC(ibz_is_one)
+#define ibz_is_zero                                     SQISIGN_NAMESPACE_GENERIC(ibz_is_zero)
+#define ibz_legendre                                    SQISIGN_NAMESPACE_GENERIC(ibz_legendre)
+#define ibz_mod                                         SQISIGN_NAMESPACE_GENERIC(ibz_mod)
+#define ibz_mod_ui                                      SQISIGN_NAMESPACE_GENERIC(ibz_mod_ui)
+#define ibz_mul                                         SQISIGN_NAMESPACE_GENERIC(ibz_mul)
+#define ibz_neg                                         SQISIGN_NAMESPACE_GENERIC(ibz_neg)
+#define ibz_pow                                         SQISIGN_NAMESPACE_GENERIC(ibz_pow)
+#define ibz_pow_mod                                     SQISIGN_NAMESPACE_GENERIC(ibz_pow_mod)
+#define ibz_print                                       SQISIGN_NAMESPACE_GENERIC(ibz_print)
+#define ibz_probab_prime                                SQISIGN_NAMESPACE_GENERIC(ibz_probab_prime)
+#define ibz_rand_interval                               SQISIGN_NAMESPACE_GENERIC(ibz_rand_interval)
+#define ibz_rand_interval_bits                          SQISIGN_NAMESPACE_GENERIC(ibz_rand_interval_bits)
+#define ibz_rand_interval_i                             SQISIGN_NAMESPACE_GENERIC(ibz_rand_interval_i)
+#define ibz_rand_interval_minm_m                        SQISIGN_NAMESPACE_GENERIC(ibz_rand_interval_minm_m)
+#define ibz_set                                         SQISIGN_NAMESPACE_GENERIC(ibz_set)
+#define ibz_set_from_str                                SQISIGN_NAMESPACE_GENERIC(ibz_set_from_str)
+#define ibz_size_in_base                                SQISIGN_NAMESPACE_GENERIC(ibz_size_in_base)
+#define ibz_sqrt                                        SQISIGN_NAMESPACE_GENERIC(ibz_sqrt)
+#define ibz_sqrt_floor                                  SQISIGN_NAMESPACE_GENERIC(ibz_sqrt_floor)
+#define ibz_sqrt_mod_p                                  SQISIGN_NAMESPACE_GENERIC(ibz_sqrt_mod_p)
+#define ibz_sub                                         SQISIGN_NAMESPACE_GENERIC(ibz_sub)
+#define ibz_swap                                        SQISIGN_NAMESPACE_GENERIC(ibz_swap)
+#define ibz_to_digits                                   SQISIGN_NAMESPACE_GENERIC(ibz_to_digits)
+#define ibz_two_adic                                    SQISIGN_NAMESPACE_GENERIC(ibz_two_adic)
+
+// Namespacing symbols exported from integers.c:
+#undef ibz_cornacchia_prime
+#undef ibz_generate_random_prime
+
+#define ibz_cornacchia_prime                            SQISIGN_NAMESPACE_GENERIC(ibz_cornacchia_prime)
+#define ibz_generate_random_prime                       SQISIGN_NAMESPACE_GENERIC(ibz_generate_random_prime)
+
+// Namespacing symbols exported from isog_chains.c:
+#undef ec_eval_even
+#undef ec_eval_small_chain
+#undef ec_iso_eval
+#undef ec_isomorphism
+
+#define ec_eval_even                                    SQISIGN_NAMESPACE(ec_eval_even)
+#define ec_eval_small_chain                             SQISIGN_NAMESPACE(ec_eval_small_chain)
+#define ec_iso_eval                                     SQISIGN_NAMESPACE(ec_iso_eval)
+#define ec_isomorphism                                  SQISIGN_NAMESPACE(ec_isomorphism)
+
+// Namespacing symbols exported from keygen.c:
+#undef protocols_keygen
+#undef secret_key_finalize
+#undef secret_key_init
+
+#define protocols_keygen                                SQISIGN_NAMESPACE(protocols_keygen)
+#define secret_key_finalize                             SQISIGN_NAMESPACE(secret_key_finalize)
+#define secret_key_init                                 SQISIGN_NAMESPACE(secret_key_init)
+
+// Namespacing symbols exported from l2.c:
+#undef quat_lattice_lll
+#undef quat_lll_core
+
+#define quat_lattice_lll                                SQISIGN_NAMESPACE_GENERIC(quat_lattice_lll)
+#define quat_lll_core                                   SQISIGN_NAMESPACE_GENERIC(quat_lll_core)
+
+// Namespacing symbols exported from lat_ball.c:
+#undef quat_lattice_bound_parallelogram
+#undef quat_lattice_sample_from_ball
+
+#define quat_lattice_bound_parallelogram                SQISIGN_NAMESPACE_GENERIC(quat_lattice_bound_parallelogram)
+#define quat_lattice_sample_from_ball                   SQISIGN_NAMESPACE_GENERIC(quat_lattice_sample_from_ball)
+
+// Namespacing symbols exported from lattice.c:
+#undef quat_lattice_add
+#undef quat_lattice_alg_elem_mul
+#undef quat_lattice_conjugate_without_hnf
+#undef quat_lattice_contains
+#undef quat_lattice_dual_without_hnf
+#undef quat_lattice_equal
+#undef quat_lattice_gram
+#undef quat_lattice_hnf
+#undef quat_lattice_inclusion
+#undef quat_lattice_index
+#undef quat_lattice_intersect
+#undef quat_lattice_mat_alg_coord_mul_without_hnf
+#undef quat_lattice_mul
+#undef quat_lattice_reduce_denom
+
+#define quat_lattice_add                                SQISIGN_NAMESPACE_GENERIC(quat_lattice_add)
+#define quat_lattice_alg_elem_mul                       SQISIGN_NAMESPACE_GENERIC(quat_lattice_alg_elem_mul)
+#define quat_lattice_conjugate_without_hnf              SQISIGN_NAMESPACE_GENERIC(quat_lattice_conjugate_without_hnf)
+#define quat_lattice_contains                           SQISIGN_NAMESPACE_GENERIC(quat_lattice_contains)
+#define quat_lattice_dual_without_hnf                   SQISIGN_NAMESPACE_GENERIC(quat_lattice_dual_without_hnf)
+#define quat_lattice_equal                              SQISIGN_NAMESPACE_GENERIC(quat_lattice_equal)
+#define quat_lattice_gram                               SQISIGN_NAMESPACE_GENERIC(quat_lattice_gram)
+#define quat_lattice_hnf                                SQISIGN_NAMESPACE_GENERIC(quat_lattice_hnf)
+#define quat_lattice_inclusion                          SQISIGN_NAMESPACE_GENERIC(quat_lattice_inclusion)
+#define quat_lattice_index                              SQISIGN_NAMESPACE_GENERIC(quat_lattice_index)
+#define quat_lattice_intersect                          SQISIGN_NAMESPACE_GENERIC(quat_lattice_intersect)
+#define quat_lattice_mat_alg_coord_mul_without_hnf      SQISIGN_NAMESPACE_GENERIC(quat_lattice_mat_alg_coord_mul_without_hnf)
+#define quat_lattice_mul                                SQISIGN_NAMESPACE_GENERIC(quat_lattice_mul)
+#define quat_lattice_reduce_denom                       SQISIGN_NAMESPACE_GENERIC(quat_lattice_reduce_denom)
+
+// Namespacing symbols exported from lll_applications.c:
+#undef quat_lideal_lideal_mul_reduced
+#undef quat_lideal_prime_norm_reduced_equivalent
+#undef quat_lideal_reduce_basis
+
+#define quat_lideal_lideal_mul_reduced                  SQISIGN_NAMESPACE_GENERIC(quat_lideal_lideal_mul_reduced)
+#define quat_lideal_prime_norm_reduced_equivalent       SQISIGN_NAMESPACE_GENERIC(quat_lideal_prime_norm_reduced_equivalent)
+#define quat_lideal_reduce_basis                        SQISIGN_NAMESPACE_GENERIC(quat_lideal_reduce_basis)
+
+// Namespacing symbols exported from lll_verification.c:
+#undef ibq_vec_4_copy_ibz
+#undef quat_lll_bilinear
+#undef quat_lll_gram_schmidt_transposed_with_ibq
+#undef quat_lll_set_ibq_parameters
+#undef quat_lll_verify
+
+#define ibq_vec_4_copy_ibz                              SQISIGN_NAMESPACE_GENERIC(ibq_vec_4_copy_ibz)
+#define quat_lll_bilinear                               SQISIGN_NAMESPACE_GENERIC(quat_lll_bilinear)
+#define quat_lll_gram_schmidt_transposed_with_ibq       SQISIGN_NAMESPACE_GENERIC(quat_lll_gram_schmidt_transposed_with_ibq)
+#define quat_lll_set_ibq_parameters                     SQISIGN_NAMESPACE_GENERIC(quat_lll_set_ibq_parameters)
+#define quat_lll_verify                                 SQISIGN_NAMESPACE_GENERIC(quat_lll_verify)
+
+// Namespacing symbols exported from mem.c:
+#undef sqisign_secure_clear
+#undef sqisign_secure_free
+
+#define sqisign_secure_clear                            SQISIGN_NAMESPACE_GENERIC(sqisign_secure_clear)
+#define sqisign_secure_free                             SQISIGN_NAMESPACE_GENERIC(sqisign_secure_free)
+
+// Namespacing symbols exported from mp.c:
+#undef MUL
+#undef mp_add
+#undef mp_compare
+#undef mp_copy
+#undef mp_inv_2e
+#undef mp_invert_matrix
+#undef mp_is_one
+#undef mp_is_zero
+#undef mp_mod_2exp
+#undef mp_mul
+#undef mp_mul2
+#undef mp_neg
+#undef mp_print
+#undef mp_shiftl
+#undef mp_shiftr
+#undef mp_sub
+#undef multiple_mp_shiftl
+#undef select_ct
+#undef swap_ct
+
+#define MUL                                             SQISIGN_NAMESPACE_GENERIC(MUL)
+#define mp_add                                          SQISIGN_NAMESPACE_GENERIC(mp_add)
+#define mp_compare                                      SQISIGN_NAMESPACE_GENERIC(mp_compare)
+#define mp_copy                                         SQISIGN_NAMESPACE_GENERIC(mp_copy)
+#define mp_inv_2e                                       SQISIGN_NAMESPACE_GENERIC(mp_inv_2e)
+#define mp_invert_matrix                                SQISIGN_NAMESPACE_GENERIC(mp_invert_matrix)
+#define mp_is_one                                       SQISIGN_NAMESPACE_GENERIC(mp_is_one)
+#define mp_is_zero                                      SQISIGN_NAMESPACE_GENERIC(mp_is_zero)
+#define mp_mod_2exp                                     SQISIGN_NAMESPACE_GENERIC(mp_mod_2exp)
+#define mp_mul                                          SQISIGN_NAMESPACE_GENERIC(mp_mul)
+#define mp_mul2                                         SQISIGN_NAMESPACE_GENERIC(mp_mul2)
+#define mp_neg                                          SQISIGN_NAMESPACE_GENERIC(mp_neg)
+#define mp_print                                        SQISIGN_NAMESPACE_GENERIC(mp_print)
+#define mp_shiftl                                       SQISIGN_NAMESPACE_GENERIC(mp_shiftl)
+#define mp_shiftr                                       SQISIGN_NAMESPACE_GENERIC(mp_shiftr)
+#define mp_sub                                          SQISIGN_NAMESPACE_GENERIC(mp_sub)
+#define multiple_mp_shiftl                              SQISIGN_NAMESPACE_GENERIC(multiple_mp_shiftl)
+#define select_ct                                       SQISIGN_NAMESPACE_GENERIC(select_ct)
+#define swap_ct                                         SQISIGN_NAMESPACE_GENERIC(swap_ct)
+
+// Namespacing symbols exported from normeq.c:
+#undef quat_change_to_O0_basis
+#undef quat_lattice_O0_set
+#undef quat_lattice_O0_set_extremal
+#undef quat_order_elem_create
+#undef quat_represent_integer
+#undef quat_sampling_random_ideal_O0_given_norm
+
+#define quat_change_to_O0_basis                         SQISIGN_NAMESPACE_GENERIC(quat_change_to_O0_basis)
+#define quat_lattice_O0_set                             SQISIGN_NAMESPACE_GENERIC(quat_lattice_O0_set)
+#define quat_lattice_O0_set_extremal                    SQISIGN_NAMESPACE_GENERIC(quat_lattice_O0_set_extremal)
+#define quat_order_elem_create                          SQISIGN_NAMESPACE_GENERIC(quat_order_elem_create)
+#define quat_represent_integer                          SQISIGN_NAMESPACE_GENERIC(quat_represent_integer)
+#define quat_sampling_random_ideal_O0_given_norm        SQISIGN_NAMESPACE_GENERIC(quat_sampling_random_ideal_O0_given_norm)
+
+// Namespacing symbols exported from printer.c:
+#undef ibz_mat_2x2_print
+#undef ibz_mat_4x4_print
+#undef ibz_vec_2_print
+#undef ibz_vec_4_print
+#undef quat_alg_elem_print
+#undef quat_alg_print
+#undef quat_lattice_print
+#undef quat_left_ideal_print
+
+#define ibz_mat_2x2_print                               SQISIGN_NAMESPACE_GENERIC(ibz_mat_2x2_print)
+#define ibz_mat_4x4_print                               SQISIGN_NAMESPACE_GENERIC(ibz_mat_4x4_print)
+#define ibz_vec_2_print                                 SQISIGN_NAMESPACE_GENERIC(ibz_vec_2_print)
+#define ibz_vec_4_print                                 SQISIGN_NAMESPACE_GENERIC(ibz_vec_4_print)
+#define quat_alg_elem_print                             SQISIGN_NAMESPACE_GENERIC(quat_alg_elem_print)
+#define quat_alg_print                                  SQISIGN_NAMESPACE_GENERIC(quat_alg_print)
+#define quat_lattice_print                              SQISIGN_NAMESPACE_GENERIC(quat_lattice_print)
+#define quat_left_ideal_print                           SQISIGN_NAMESPACE_GENERIC(quat_left_ideal_print)
+
+// Namespacing symbols exported from random_input_generation.c:
+#undef quat_test_input_random_ideal_generation
+#undef quat_test_input_random_ideal_lattice_generation
+#undef quat_test_input_random_lattice_generation
+
+#define quat_test_input_random_ideal_generation         SQISIGN_NAMESPACE_GENERIC(quat_test_input_random_ideal_generation)
+#define quat_test_input_random_ideal_lattice_generation SQISIGN_NAMESPACE_GENERIC(quat_test_input_random_ideal_lattice_generation)
+#define quat_test_input_random_lattice_generation       SQISIGN_NAMESPACE_GENERIC(quat_test_input_random_lattice_generation)
+
+// Namespacing symbols exported from rationals.c:
+#undef ibq_abs
+#undef ibq_add
+#undef ibq_cmp
+#undef ibq_copy
+#undef ibq_finalize
+#undef ibq_init
+#undef ibq_inv
+#undef ibq_is_ibz
+#undef ibq_is_one
+#undef ibq_is_zero
+#undef ibq_mat_4x4_finalize
+#undef ibq_mat_4x4_init
+#undef ibq_mat_4x4_print
+#undef ibq_mul
+#undef ibq_neg
+#undef ibq_reduce
+#undef ibq_set
+#undef ibq_sub
+#undef ibq_to_ibz
+#undef ibq_vec_4_finalize
+#undef ibq_vec_4_init
+#undef ibq_vec_4_print
+
+#define ibq_abs                                         SQISIGN_NAMESPACE_GENERIC(ibq_abs)
+#define ibq_add                                         SQISIGN_NAMESPACE_GENERIC(ibq_add)
+#define ibq_cmp                                         SQISIGN_NAMESPACE_GENERIC(ibq_cmp)
+#define ibq_copy                                        SQISIGN_NAMESPACE_GENERIC(ibq_copy)
+#define ibq_finalize                                    SQISIGN_NAMESPACE_GENERIC(ibq_finalize)
+#define ibq_init                                        SQISIGN_NAMESPACE_GENERIC(ibq_init)
+#define ibq_inv                                         SQISIGN_NAMESPACE_GENERIC(ibq_inv)
+#define ibq_is_ibz                                      SQISIGN_NAMESPACE_GENERIC(ibq_is_ibz)
+#define ibq_is_one                                      SQISIGN_NAMESPACE_GENERIC(ibq_is_one)
+#define ibq_is_zero                                     SQISIGN_NAMESPACE_GENERIC(ibq_is_zero)
+#define ibq_mat_4x4_finalize                            SQISIGN_NAMESPACE_GENERIC(ibq_mat_4x4_finalize)
+#define ibq_mat_4x4_init                                SQISIGN_NAMESPACE_GENERIC(ibq_mat_4x4_init)
+#define ibq_mat_4x4_print                               SQISIGN_NAMESPACE_GENERIC(ibq_mat_4x4_print)
+#define ibq_mul                                         SQISIGN_NAMESPACE_GENERIC(ibq_mul)
+#define ibq_neg                                         SQISIGN_NAMESPACE_GENERIC(ibq_neg)
+#define ibq_reduce                                      SQISIGN_NAMESPACE_GENERIC(ibq_reduce)
+#define ibq_set                                         SQISIGN_NAMESPACE_GENERIC(ibq_set)
+#define ibq_sub                                         SQISIGN_NAMESPACE_GENERIC(ibq_sub)
+#define ibq_to_ibz                                      SQISIGN_NAMESPACE_GENERIC(ibq_to_ibz)
+#define ibq_vec_4_finalize                              SQISIGN_NAMESPACE_GENERIC(ibq_vec_4_finalize)
+#define ibq_vec_4_init                                  SQISIGN_NAMESPACE_GENERIC(ibq_vec_4_init)
+#define ibq_vec_4_print                                 SQISIGN_NAMESPACE_GENERIC(ibq_vec_4_print)
+
+// Namespacing symbols exported from sign.c:
+#undef protocols_sign
+
+#define protocols_sign                                  SQISIGN_NAMESPACE(protocols_sign)
+
+// Namespacing symbols exported from sqisign.c:
+#undef sqisign_keypair
+#undef sqisign_open
+#undef sqisign_sign
+#undef sqisign_verify
+
+#define sqisign_keypair                                 SQISIGN_NAMESPACE(sqisign_keypair)
+#define sqisign_open                                    SQISIGN_NAMESPACE(sqisign_open)
+#define sqisign_sign                                    SQISIGN_NAMESPACE(sqisign_sign)
+#define sqisign_verify                                  SQISIGN_NAMESPACE(sqisign_verify)
+
+// Namespacing symbols exported from theta_isogenies.c:
+#undef theta_chain_compute_and_eval
+#undef theta_chain_compute_and_eval_randomized
+#undef theta_chain_compute_and_eval_verify
+
+#define theta_chain_compute_and_eval                    SQISIGN_NAMESPACE(theta_chain_compute_and_eval)
+#define theta_chain_compute_and_eval_randomized         SQISIGN_NAMESPACE(theta_chain_compute_and_eval_randomized)
+#define theta_chain_compute_and_eval_verify             SQISIGN_NAMESPACE(theta_chain_compute_and_eval_verify)
+
+// Namespacing symbols exported from theta_structure.c:
+#undef double_iter
+#undef double_point
+#undef is_product_theta_point
+#undef theta_precomputation
+
+#define double_iter                                     SQISIGN_NAMESPACE(double_iter)
+#define double_point                                    SQISIGN_NAMESPACE(double_point)
+#define is_product_theta_point                          SQISIGN_NAMESPACE(is_product_theta_point)
+#define theta_precomputation                            SQISIGN_NAMESPACE(theta_precomputation)
+
+// Namespacing symbols exported from verify.c:
+#undef protocols_verify
+
+#define protocols_verify                                SQISIGN_NAMESPACE(protocols_verify)
+
+// Namespacing symbols exported from xeval.c:
+#undef xeval_2
+#undef xeval_2_singular
+#undef xeval_4
+
+#define xeval_2                                         SQISIGN_NAMESPACE(xeval_2)
+#define xeval_2_singular                                SQISIGN_NAMESPACE(xeval_2_singular)
+#define xeval_4                                         SQISIGN_NAMESPACE(xeval_4)
+
+// Namespacing symbols exported from xisog.c:
+#undef xisog_2
+#undef xisog_2_singular
+#undef xisog_4
+
+#define xisog_2                                         SQISIGN_NAMESPACE(xisog_2)
+#define xisog_2_singular                                SQISIGN_NAMESPACE(xisog_2_singular)
+#define xisog_4                                         SQISIGN_NAMESPACE(xisog_4)
+
+
+#endif
+
diff --git a/src/pqm4/sqisign_lvl5/ref/theta_isogenies.c b/src/pqm4/sqisign_lvl5/ref/theta_isogenies.c
new file mode 100644
index 0000000..478a9ab
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/theta_isogenies.c
@@ -0,0 +1,1283 @@
+#include "theta_isogenies.h"
+#include <stdio.h>
+#include <inttypes.h>
+#include <assert.h>
+#include <tools.h>
+#include <rng.h>
+
+// Constant-time select: M <- M1 if option = 0, M <- M2 if option = 0xFF...FF.
+// M2 is a precomputed base change matrix whose entries index into FP2_CONSTANTS.
+static inline void
+select_base_change_matrix(basis_change_matrix_t *M,
+                          const basis_change_matrix_t *M1,
+                          const precomp_basis_change_matrix_t *M2,
+                          const uint32_t option)
+{
+    for (int idx = 0; idx < 16; idx++) {
+        const int row = idx / 4, col = idx % 4;
+        fp2_select(&M->m[row][col], &M1->m[row][col], &FP2_CONSTANTS[M2->m[row][col]], option);
+    }
+}
+
+// Expand a precomputed base change matrix (indices into FP2_CONSTANTS) into a regular one
+static inline void
+set_base_change_matrix_from_precomp(basis_change_matrix_t *res, const precomp_basis_change_matrix_t *M)
+{
+    for (int row = 0; row < 4; row++)
+        for (int col = 0; col < 4; col++)
+            res->m[row][col] = FP2_CONSTANTS[M->m[row][col]];
+}
+
+static inline void
+choose_index_theta_point(fp2_t *res, int ind, const theta_point_t *T)
+{
+    // Copy the coordinate of T selected by ind mod 4 into res:
+    // 0 -> x, 1 -> y, 2 -> z, 3 -> t.
+    const fp2_t *coord = NULL;
+    const int slot = ind % 4;
+
+    if (slot == 0) {
+        coord = &T->x;
+    } else if (slot == 1) {
+        coord = &T->y;
+    } else if (slot == 2) {
+        coord = &T->z;
+    } else if (slot == 3) {
+        coord = &T->t;
+    } else {
+        // Only reachable for a negative ind, where % yields a negative value.
+        assert(0);
+    }
+    fp2_copy(res, coord);
+}
+
+// res = M * P as in apply_isomorphism, but skips the M[.][3] * P->t terms when Pt_not_zero is false.
+static void
+apply_isomorphism_general(theta_point_t *res,
+                          const basis_change_matrix_t *M,
+                          const theta_point_t *P,
+                          const bool Pt_not_zero)
+{
+    fp2_t x1;
+    theta_point_t temp; // scratch result, copied out at the end (callers in this file pass res == P)
+
+    fp2_mul(&temp.x, &P->x, &M->m[0][0]); // row 0 of M times (x, y, z)
+    fp2_mul(&x1, &P->y, &M->m[0][1]);
+    fp2_add(&temp.x, &temp.x, &x1);
+    fp2_mul(&x1, &P->z, &M->m[0][2]);
+    fp2_add(&temp.x, &temp.x, &x1);
+
+    fp2_mul(&temp.y, &P->x, &M->m[1][0]); // row 1 of M times (x, y, z)
+    fp2_mul(&x1, &P->y, &M->m[1][1]);
+    fp2_add(&temp.y, &temp.y, &x1);
+    fp2_mul(&x1, &P->z, &M->m[1][2]);
+    fp2_add(&temp.y, &temp.y, &x1);
+
+    fp2_mul(&temp.z, &P->x, &M->m[2][0]); // row 2 of M times (x, y, z)
+    fp2_mul(&x1, &P->y, &M->m[2][1]);
+    fp2_add(&temp.z, &temp.z, &x1);
+    fp2_mul(&x1, &P->z, &M->m[2][2]);
+    fp2_add(&temp.z, &temp.z, &x1);
+
+    fp2_mul(&temp.t, &P->x, &M->m[3][0]); // row 3 of M times (x, y, z)
+    fp2_mul(&x1, &P->y, &M->m[3][1]);
+    fp2_add(&temp.t, &temp.t, &x1);
+    fp2_mul(&x1, &P->z, &M->m[3][2]);
+    fp2_add(&temp.t, &temp.t, &x1);
+
+    if (Pt_not_zero) { // add the last column of M times P->t to every row
+        fp2_mul(&x1, &P->t, &M->m[0][3]);
+        fp2_add(&temp.x, &temp.x, &x1);
+
+        fp2_mul(&x1, &P->t, &M->m[1][3]);
+        fp2_add(&temp.y, &temp.y, &x1);
+
+        fp2_mul(&x1, &P->t, &M->m[2][3]);
+        fp2_add(&temp.z, &temp.z, &x1);
+
+        fp2_mul(&x1, &P->t, &M->m[3][3]);
+        fp2_add(&temp.t, &temp.t, &x1);
+    }
+
+    fp2_copy(&res->x, &temp.x);
+    fp2_copy(&res->y, &temp.y);
+    fp2_copy(&res->z, &temp.z);
+    fp2_copy(&res->t, &temp.t);
+}
+
+static void
+apply_isomorphism(theta_point_t *res, const basis_change_matrix_t *M, const theta_point_t *P)
+{
+    apply_isomorphism_general(res, M, P, true); // full product: the t column is always included
+}
+
+// res <- M1 * M2 (4x4 matrix product)
+static void
+base_change_matrix_multiplication(basis_change_matrix_t *res,
+                                  const basis_change_matrix_t *M1,
+                                  const basis_change_matrix_t *M2)
+{
+    // Accumulate into a scratch matrix and copy it out once at the end.
+    basis_change_matrix_t product;
+    fp2_t acc, term;
+    for (int row = 0; row < 4; row++) {
+        for (int col = 0; col < 4; col++) {
+            // acc = sum over k of M1[row][k] * M2[k][col]
+            fp2_set_zero(&acc);
+            for (int k = 0; k < 4; k++) {
+                fp2_mul(&term, &M1->m[row][k], &M2->m[k][col]);
+                fp2_add(&acc, &acc, &term);
+            }
+            product.m[row][col] = acc;
+        }
+    }
+    *res = product;
+}
+
+// Map the couple of points T on an elliptic product to a theta point using phi's basis change
+static void
+base_change(theta_point_t *out, const theta_gluing_t *phi, const theta_couple_point_t *T)
+{
+    theta_point_t prod;
+
+    // prod = (P1.x P2.x : P1.x P2.z : P1.z P2.x : P1.z P2.z),
+    // i.e. the four pairwise products of the (X : Z) coordinates of T
+    fp2_mul(&prod.x, &T->P1.x, &T->P2.x);
+    fp2_mul(&prod.y, &T->P1.x, &T->P2.z);
+    fp2_mul(&prod.z, &T->P2.x, &T->P1.z);
+    fp2_mul(&prod.t, &T->P1.z, &T->P2.z);
+
+    // Multiply by the gluing's basis change matrix
+    apply_isomorphism(out, &phi->M, &prod);
+}
+
+static void
+action_by_translation_z_and_det(fp2_t *z_inv, fp2_t *det_inv, const ec_point_t *P4, const ec_point_t *P2)
+{
+    // Stash the Z-coordinate of P4, which is the first value to invert
+    fp2_copy(z_inv, &P4->z);
+
+    // Second value to invert: det = P4.x * P2.z - P4.z * P2.x
+    fp2_t cross;
+    fp2_mul(det_inv, &P4->x, &P2->z);
+    fp2_mul(&cross, &P4->z, &P2->x);
+    fp2_sub(det_inv, det_inv, &cross);
+}
+
+static void
+action_by_translation_compute_matrix(translation_matrix_t *G,
+                                     const ec_point_t *P4,
+                                     const ec_point_t *P2,
+                                     const fp2_t *z_inv,
+                                     const fp2_t *det_inv)
+{
+    fp2_t tmp; // notation below: (xij : zij) = P4, (uij : wij) = P2
+
+    // Gi.g10 = uij xij / detij - xij / zij  (z_inv, det_inv are expected to hold 1/zij, 1/detij)
+    fp2_mul(&tmp, &P4->x, z_inv);
+    fp2_mul(&G->g10, &P4->x, &P2->x);
+    fp2_mul(&G->g10, &G->g10, det_inv);
+    fp2_sub(&G->g10, &G->g10, &tmp);
+
+    // Gi.g11 = uij zij / detij
+    fp2_mul(&G->g11, &P2->x, det_inv);
+    fp2_mul(&G->g11, &G->g11, &P4->z);
+
+    // Gi.g00 = -Gi.g11
+    fp2_neg(&G->g00, &G->g11);
+
+    // Gi.g01 = - wij zij / detij
+    fp2_mul(&G->g01, &P2->z, det_inv);
+    fp2_mul(&G->g01, &G->g01, &P4->z);
+    fp2_neg(&G->g01, &G->g01);
+}
+
+// Sanity check on the candidate two-torsion: returns 1 when it is well formed, 0 otherwise.
+// Failure is only expected for malformed signatures, so early exits are
+// acceptable here and constant-time execution is not required.
+static int
+verify_two_torsion(const theta_couple_point_t *K1_2, const theta_couple_point_t *K2_2, const theta_couple_curve_t *E12)
+{
+    // A zero component means the points did not have order 8 when gluing
+    // started, so reject immediately
+    if (ec_is_zero(&K1_2->P1) | ec_is_zero(&K1_2->P2) | ec_is_zero(&K2_2->P1) | ec_is_zero(&K2_2->P2)) {
+        return 0;
+    }
+
+    // The pairs P1, Q1 and P2, Q2 must be independent; for points of order two
+    // this simply means that they differ
+    if (ec_is_equal(&K1_2->P1, &K2_2->P1) | ec_is_equal(&K1_2->P2, &K2_2->P2)) {
+        return 0;
+    }
+
+    // Finally, double every point: all doubles must be zero for the order to be exactly 2
+    theta_couple_point_t twice_K1, twice_K2;
+    double_couple_point(&twice_K1, K1_2, E12);
+    double_couple_point(&twice_K2, K2_2, E12);
+    // A non-zero double means the order was larger than 2, so the kernel is malformed.
+    if (!(ec_is_zero(&twice_K1.P1) & ec_is_zero(&twice_K1.P2) & ec_is_zero(&twice_K2.P1) & ec_is_zero(&twice_K2.P2))) {
+        return 0;
+    }
+
+    return 1;
+}
+
+// Computes the action by translation for the four points
+// K1_4 = (P1, P2) and K2_4 = (Q1, Q2) on E1 x E2 simultaneously,
+// sharing a single batched inversion. Fills Gi[0..3].
+// Returns 0 if the doubles of the Pi, Qi fail verify_two_torsion
+// (or the first inverted value is zero) and 1 otherwise
+static int
+action_by_translation(translation_matrix_t *Gi,
+                      const theta_couple_point_t *K1_4,
+                      const theta_couple_point_t *K2_4,
+                      const theta_couple_curve_t *E12)
+{
+    // Compute points of order 2 from Ki_4
+    theta_couple_point_t K1_2, K2_2;
+    double_couple_point(&K1_2, K1_4, E12);
+    double_couple_point(&K2_2, K2_4, E12);
+
+    if (!verify_two_torsion(&K1_2, &K2_2, E12)) {
+        return 0;
+    }
+
+    // We need to invert four Z coordinates (slots 0..3) and
+    // four determinants (slots 4..7) which we do with batched
+    // inversion
+    fp2_t inverses[8];
+    action_by_translation_z_and_det(&inverses[0], &inverses[4], &K1_4->P1, &K1_2.P1);
+    action_by_translation_z_and_det(&inverses[1], &inverses[5], &K1_4->P2, &K1_2.P2);
+    action_by_translation_z_and_det(&inverses[2], &inverses[6], &K2_4->P1, &K2_2.P1);
+    action_by_translation_z_and_det(&inverses[3], &inverses[7], &K2_4->P2, &K2_2.P2);
+
+    fp2_batched_inv(inverses, 8);
+    if (fp2_is_zero(&inverses[0]))
+        return 0; // something was wrong with our input (which somehow was not caught by
+                  // verify_two_torsion)
+
+    action_by_translation_compute_matrix(&Gi[0], &K1_4->P1, &K1_2.P1, &inverses[0], &inverses[4]);
+    action_by_translation_compute_matrix(&Gi[1], &K1_4->P2, &K1_2.P2, &inverses[1], &inverses[5]);
+    action_by_translation_compute_matrix(&Gi[2], &K2_4->P1, &K2_2.P1, &inverses[2], &inverses[6]);
+    action_by_translation_compute_matrix(&Gi[3], &K2_4->P2, &K2_2.P2, &inverses[3], &inverses[7]);
+
+    return 1;
+}
+
+// Given the appropriate four torsion K1_4, K2_4 on E12, computes
+// the 4x4 change of basis M used to obtain the correct theta
+// null point.
+// Returns 0 if action_by_translation rejects K1_4 or K2_4 (e.g. wrong order), 1 otherwise
+static int
+gluing_change_of_basis(basis_change_matrix_t *M,
+                       const theta_couple_point_t *K1_4,
+                       const theta_couple_point_t *K2_4,
+                       const theta_couple_curve_t *E12)
+{
+    // Compute the four 2x2 matrices for the action by translation
+    // on the four points:
+    translation_matrix_t Gi[4];
+    if (!action_by_translation(Gi, K1_4, K2_4, E12))
+        return 0;
+
+    // Computation of the 4x4 matrix from the Gi
+    // t001, t101 (resp t002, t102) first column of Gi[0] * Gi[2] (resp Gi[1] * Gi[3])
+    fp2_t t001, t101, t002, t102, tmp;
+
+    fp2_mul(&t001, &Gi[0].g00, &Gi[2].g00);
+    fp2_mul(&tmp, &Gi[0].g01, &Gi[2].g10);
+    fp2_add(&t001, &t001, &tmp);
+
+    fp2_mul(&t101, &Gi[0].g10, &Gi[2].g00);
+    fp2_mul(&tmp, &Gi[0].g11, &Gi[2].g10);
+    fp2_add(&t101, &t101, &tmp);
+
+    fp2_mul(&t002, &Gi[1].g00, &Gi[3].g00);
+    fp2_mul(&tmp, &Gi[1].g01, &Gi[3].g10);
+    fp2_add(&t002, &t002, &tmp);
+
+    fp2_mul(&t102, &Gi[1].g10, &Gi[3].g00);
+    fp2_mul(&tmp, &Gi[1].g11, &Gi[3].g10);
+    fp2_add(&t102, &t102, &tmp);
+
+    // trace for the first row
+    fp2_set_one(&M->m[0][0]);
+    fp2_mul(&tmp, &t001, &t002);
+    fp2_add(&M->m[0][0], &M->m[0][0], &tmp);
+    fp2_mul(&tmp, &Gi[2].g00, &Gi[3].g00);
+    fp2_add(&M->m[0][0], &M->m[0][0], &tmp);
+    fp2_mul(&tmp, &Gi[0].g00, &Gi[1].g00);
+    fp2_add(&M->m[0][0], &M->m[0][0], &tmp);
+
+    fp2_mul(&M->m[0][1], &t001, &t102);
+    fp2_mul(&tmp, &Gi[2].g00, &Gi[3].g10);
+    fp2_add(&M->m[0][1], &M->m[0][1], &tmp);
+    fp2_mul(&tmp, &Gi[0].g00, &Gi[1].g10);
+    fp2_add(&M->m[0][1], &M->m[0][1], &tmp);
+
+    fp2_mul(&M->m[0][2], &t101, &t002);
+    fp2_mul(&tmp, &Gi[2].g10, &Gi[3].g00);
+    fp2_add(&M->m[0][2], &M->m[0][2], &tmp);
+    fp2_mul(&tmp, &Gi[0].g10, &Gi[1].g00);
+    fp2_add(&M->m[0][2], &M->m[0][2], &tmp);
+
+    fp2_mul(&M->m[0][3], &t101, &t102);
+    fp2_mul(&tmp, &Gi[2].g10, &Gi[3].g10);
+    fp2_add(&M->m[0][3], &M->m[0][3], &tmp);
+    fp2_mul(&tmp, &Gi[0].g10, &Gi[1].g10);
+    fp2_add(&M->m[0][3], &M->m[0][3], &tmp);
+
+    // Compute the action of (0,K2_4.P2), i.e. Gi[3], on the first row for the second row
+    fp2_mul(&tmp, &Gi[3].g01, &M->m[0][1]);
+    fp2_mul(&M->m[1][0], &Gi[3].g00, &M->m[0][0]);
+    fp2_add(&M->m[1][0], &M->m[1][0], &tmp);
+
+    fp2_mul(&tmp, &Gi[3].g11, &M->m[0][1]);
+    fp2_mul(&M->m[1][1], &Gi[3].g10, &M->m[0][0]);
+    fp2_add(&M->m[1][1], &M->m[1][1], &tmp);
+
+    fp2_mul(&tmp, &Gi[3].g01, &M->m[0][3]);
+    fp2_mul(&M->m[1][2], &Gi[3].g00, &M->m[0][2]);
+    fp2_add(&M->m[1][2], &M->m[1][2], &tmp);
+
+    fp2_mul(&tmp, &Gi[3].g11, &M->m[0][3]);
+    fp2_mul(&M->m[1][3], &Gi[3].g10, &M->m[0][2]);
+    fp2_add(&M->m[1][3], &M->m[1][3], &tmp);
+
+    // compute the action of (K1_4.P1,0), i.e. Gi[0], on the first row for the third row
+    fp2_mul(&tmp, &Gi[0].g01, &M->m[0][2]);
+    fp2_mul(&M->m[2][0], &Gi[0].g00, &M->m[0][0]);
+    fp2_add(&M->m[2][0], &M->m[2][0], &tmp);
+
+    fp2_mul(&tmp, &Gi[0].g01, &M->m[0][3]);
+    fp2_mul(&M->m[2][1], &Gi[0].g00, &M->m[0][1]);
+    fp2_add(&M->m[2][1], &M->m[2][1], &tmp);
+
+    fp2_mul(&tmp, &Gi[0].g11, &M->m[0][2]);
+    fp2_mul(&M->m[2][2], &Gi[0].g10, &M->m[0][0]);
+    fp2_add(&M->m[2][2], &M->m[2][2], &tmp);
+
+    fp2_mul(&tmp, &Gi[0].g11, &M->m[0][3]);
+    fp2_mul(&M->m[2][3], &Gi[0].g10, &M->m[0][1]);
+    fp2_add(&M->m[2][3], &M->m[2][3], &tmp);
+
+    // compute the action of (K1_4.P1,K2_4.P2) for the final row: Gi[0] applied to the second row
+    fp2_mul(&tmp, &Gi[0].g01, &M->m[1][2]);
+    fp2_mul(&M->m[3][0], &Gi[0].g00, &M->m[1][0]);
+    fp2_add(&M->m[3][0], &M->m[3][0], &tmp);
+
+    fp2_mul(&tmp, &Gi[0].g01, &M->m[1][3]);
+    fp2_mul(&M->m[3][1], &Gi[0].g00, &M->m[1][1]);
+    fp2_add(&M->m[3][1], &M->m[3][1], &tmp);
+
+    fp2_mul(&tmp, &Gi[0].g11, &M->m[1][2]);
+    fp2_mul(&M->m[3][2], &Gi[0].g10, &M->m[1][0]);
+    fp2_add(&M->m[3][2], &M->m[3][2], &tmp);
+
+    fp2_mul(&tmp, &Gi[0].g11, &M->m[1][3]);
+    fp2_mul(&M->m[3][3], &Gi[0].g10, &M->m[1][1]);
+    fp2_add(&M->m[3][3], &M->m[3][3], &tmp);
+
+    return 1;
+}
+
+/**
+ * @brief Compute the gluing isogeny from an elliptic product
+ *
+ * @param out Output: the theta_gluing
+ * @param E12 an elliptic curve product E1 x E2
+ * @param xyK1_8 a couple of points (X : Y : Z) on E1 x E2, expected to have order 8
+ * @param xyK2_8 a couple of points (X : Y : Z) on E1 x E2, expected to have order 8
+ * @param verify if true, additionally check that [2]K1_8 and [2]K2_8 are isotropic
+ *
+ * out : E1xE2 -> A of kernel [4](K1_8,K2_8)
+ * Returns 0 if the kernel has the incorrect order or the gluing seems malformed, 1 otherwise.
+ */
+static int
+gluing_compute(theta_gluing_t *out,
+               const theta_couple_curve_t *E12,
+               const theta_couple_jac_point_t *xyK1_8,
+               const theta_couple_jac_point_t *xyK2_8,
+               bool verify)
+{
+    // Ensure that we have been given the eight torsion
+#ifndef NDEBUG
+    {
+        int check = test_jac_order_twof(&xyK1_8->P1, &E12->E1, 3);
+        if (!check)
+            debug_print("xyK1_8->P1 does not have order 8");
+        check = test_jac_order_twof(&xyK2_8->P1, &E12->E1, 3);
+        if (!check)
+            debug_print("xyK2_8->P1 does not have order 8");
+        check = test_jac_order_twof(&xyK1_8->P2, &E12->E2, 3);
+        if (!check)
+            debug_print("xyK1_8->P2 does not have order 8");
+        check = test_jac_order_twof(&xyK2_8->P2, &E12->E2, 3);
+        if (!check)
+            debug_print("xyK2_8->P2 does not have order 8");
+    }
+#endif
+
+    out->xyK1_8 = *xyK1_8;
+    out->domain = *E12;
+
+    // Given points in E[8] x E[8] we need the four torsion below
+    theta_couple_jac_point_t xyK1_4, xyK2_4;
+
+    double_couple_jac_point(&xyK1_4, xyK1_8, E12);
+    double_couple_jac_point(&xyK2_4, xyK2_8, E12);
+
+    // Convert from (X:Y:Z) coordinates to (X:Z)
+    theta_couple_point_t K1_8, K2_8;
+    theta_couple_point_t K1_4, K2_4;
+
+    couple_jac_to_xz(&K1_8, xyK1_8);
+    couple_jac_to_xz(&K2_8, xyK2_8);
+    couple_jac_to_xz(&K1_4, &xyK1_4);
+    couple_jac_to_xz(&K2_4, &xyK2_4);
+
+    // Set the basis change matrix, if we have not been given a valid K[8] for this computation
+    // gluing_change_of_basis will detect this and return 0
+    if (!gluing_change_of_basis(&out->M, &K1_4, &K2_4, E12)) {
+        debug_print("gluing failed as kernel does not have correct order");
+        return 0;
+    }
+
+    // apply the base change to the kernel
+    theta_point_t TT1, TT2;
+
+    base_change(&TT1, out, &K1_8);
+    base_change(&TT2, out, &K2_8);
+
+    // compute the codomain
+    to_squared_theta(&TT1, &TT1);
+    to_squared_theta(&TT2, &TT2);
+
+    // If the kernel is well formed then TT1.t and TT2.t are zero
+    // if they are not, we exit early as the signature we are validating
+    // is probably malformed
+    if (!(fp2_is_zero(&TT1.t) & fp2_is_zero(&TT2.t))) {
+        debug_print("gluing failed TT1.t or TT2.t is not zero");
+        return 0;
+    }
+    // Test our projective factors are non zero
+    if (fp2_is_zero(&TT1.x) | fp2_is_zero(&TT2.x) | fp2_is_zero(&TT1.y) | fp2_is_zero(&TT2.z) | fp2_is_zero(&TT1.z))
+        return 0; // invalid input
+
+    // Projective factor: Ax
+    fp2_mul(&out->codomain.x, &TT1.x, &TT2.x);
+    fp2_mul(&out->codomain.y, &TT1.y, &TT2.x);
+    fp2_mul(&out->codomain.z, &TT1.x, &TT2.z);
+    fp2_set_zero(&out->codomain.t);
+    // Projective factor: ABCxz
+    fp2_mul(&out->precomputation.x, &TT1.y, &TT2.z);
+    fp2_copy(&out->precomputation.y, &out->codomain.z);
+    fp2_copy(&out->precomputation.z, &out->codomain.y);
+    fp2_set_zero(&out->precomputation.t);
+
+    // Compute the two components of phi(K1_8) = (x:x:y:y).
+    fp2_mul(&out->imageK1_8.x, &TT1.x, &out->precomputation.x);
+    fp2_mul(&out->imageK1_8.y, &TT1.z, &out->precomputation.z);
+
+    // If K1_8 and K2_8 are our 8-torsion points, this ensures that the
+    // 4-torsion points [2]K1_8 and [2]K2_8 are isotropic.
+    if (verify) {
+        fp2_t t1, t2;
+        fp2_mul(&t1, &TT1.y, &out->precomputation.y);
+        if (!fp2_is_equal(&out->imageK1_8.x, &t1))
+            return 0;
+        fp2_mul(&t1, &TT2.x, &out->precomputation.x);
+        fp2_mul(&t2, &TT2.z, &out->precomputation.z);
+        if (!fp2_is_equal(&t2, &t1))
+            return 0;
+    }
+
+    // compute the final codomain
+    hadamard(&out->codomain, &out->codomain);
+    return 1;
+}
+
+// Subroutine of the gluing evaluation: evaluates phi at the couple point P (X : Y : Z), writing to image
+static void
+gluing_eval_point(theta_point_t *image, const theta_couple_jac_point_t *P, const theta_gluing_t *phi)
+{
+    theta_point_t T1, T2;
+    add_components_t add_comp1, add_comp2;
+
+    // Compute the cross addition components of P1+Q1 and P2+Q2
+    jac_to_xz_add_components(&add_comp1, &P->P1, &phi->xyK1_8.P1, &phi->domain.E1);
+    jac_to_xz_add_components(&add_comp2, &P->P2, &phi->xyK1_8.P2, &phi->domain.E2);
+
+    // Compute T1 and T2 derived from the cross addition components.
+    fp2_mul(&T1.x, &add_comp1.u, &add_comp2.u); // T1x = u1u2
+    fp2_mul(&T2.t, &add_comp1.v, &add_comp2.v); // T2t = v1v2 (scratch, reset to 0 below)
+    fp2_add(&T1.x, &T1.x, &T2.t);               // T1x = u1u2 + v1v2
+    fp2_mul(&T1.y, &add_comp1.u, &add_comp2.w); // T1y = u1w2
+    fp2_mul(&T1.z, &add_comp1.w, &add_comp2.u); // T1z = w1u2
+    fp2_mul(&T1.t, &add_comp1.w, &add_comp2.w); // T1t = w1w2
+    fp2_add(&T2.x, &add_comp1.u, &add_comp1.v); // T2x = (u1+v1)
+    fp2_add(&T2.y, &add_comp2.u, &add_comp2.v); // T2y = (u2+v2)
+    fp2_mul(&T2.x, &T2.x, &T2.y);               // T2x = (u1+v1)(u2+v2)
+    fp2_sub(&T2.x, &T2.x, &T1.x);               // T2x = v1u2 + u1v2
+    fp2_mul(&T2.y, &add_comp1.v, &add_comp2.w); // T2y = v1w2
+    fp2_mul(&T2.z, &add_comp1.w, &add_comp2.v); // T2z = w1v2
+    fp2_set_zero(&T2.t);                        // T2t = 0
+
+    // Apply the basis change and compute their respective square
+    // theta(P+Q) = M.T1 - M.T2 and theta(P-Q) = M.T1 + M.T2
+    apply_isomorphism_general(&T1, &phi->M, &T1, true);
+    apply_isomorphism_general(&T2, &phi->M, &T2, false);
+    pointwise_square(&T1, &T1);
+    pointwise_square(&T2, &T2);
+
+    // the difference between the two is therefore theta(P+Q)theta(P-Q)
+    // whose hadamard transform is then the product of the dual
+    // theta_points of phi(P) and phi(Q).
+    fp2_sub(&T1.x, &T1.x, &T2.x);
+    fp2_sub(&T1.y, &T1.y, &T2.y);
+    fp2_sub(&T1.z, &T1.z, &T2.z);
+    fp2_sub(&T1.t, &T1.t, &T2.t);
+    hadamard(&T1, &T1);
+
+    // Compute (x, y, z, t)
+    // As imageK1_8 = (x:x:y:y), its inverse is (y:y:x:x).
+    fp2_mul(&image->x, &T1.x, &phi->imageK1_8.y);
+    fp2_mul(&image->y, &T1.y, &phi->imageK1_8.y);
+    fp2_mul(&image->z, &T1.z, &phi->imageK1_8.x);
+    fp2_mul(&image->t, &T1.t, &phi->imageK1_8.x);
+
+    hadamard(image, image);
+}
+
+// Variant of gluing_eval_point for the special case where the image is already known to have a
+// zero coordinate exactly where the zero coordinate of the dual_theta_nullpoint would otherwise
+// have made the computation difficult. Returns 1 on success and 0 if the input is rejected.
+static int
+gluing_eval_point_special_case(theta_point_t *image, const theta_couple_point_t *P, const theta_gluing_t *phi)
+{
+    theta_point_t sq;
+
+    // Basis change followed by the to_squared_theta transform
+    base_change(&sq, phi, P);
+    to_squared_theta(&sq, &sq);
+
+    // In a gluing this coordinate should always be 0 because D=0.
+    // Anything else means something went very wrong, so reject
+    if (!fp2_is_zero(&sq.t)) {
+        return 0;
+    }
+
+    // Scale (x, y, z) by the precomputed factors; t is set to zero
+    fp2_mul(&image->x, &sq.x, &phi->precomputation.x);
+    fp2_mul(&image->y, &sq.y, &phi->precomputation.y);
+    fp2_mul(&image->z, &sq.z, &phi->precomputation.z);
+    fp2_set_zero(&image->t);
+
+    // Finish with an in-place Hadamard transform of the image
+    hadamard(image, image);
+    return 1;
+}
+
+/**
+ * @brief Evaluate a gluing isogeny from an elliptic product on a basis
+ *
+ * @param image1 Output: the theta_point of the image of the first couple of points
+ * @param image2 Output: the theta_point of the image of the second couple of points
+ * @param xyT1 A pair of points (X : Y : Z) on E1E2 to glue using phi
+ * @param xyT2 A pair of points (X : Y : Z) on E1E2 to glue using phi
+ * @param phi a gluing isogeny E1 x E2 -> A
+ *
+ **/
+static void
+gluing_eval_basis(theta_point_t *image1,
+                  theta_point_t *image2,
+                  const theta_couple_jac_point_t *xyT1,
+                  const theta_couple_jac_point_t *xyT2,
+                  const theta_gluing_t *phi)
+{
+    gluing_eval_point(image1, xyT1, phi);
+    gluing_eval_point(image2, xyT2, phi);
+}
+
+/**
+ * @brief Compute a (2,2) isogeny in dimension 2 in the theta_model
+ *
+ * @param out Output: the theta_isogeny
+ * @param A a theta null point for the domain
+ * @param T1_8 a point in A[8]
+ * @param T2_8 a point in A[8]
+ * @param hadamard_bool_1 controls if the domain is in standard or dual coordinates
+ * @param hadamard_bool_2 controls if the codomain is in standard or dual coordinates
+ * @param verify add extra sanity checks to ensure our 8-torsion points are coherent with the isogeny
+ * @return 1 on success, 0 if a projective factor is zero or one of the verify checks fails
+ *
+ * out : A -> B of kernel [4](T1_8,T2_8)
+ * The hadamard booleans are used for the last two steps of the chain.
+ *
+ */
+static int
+theta_isogeny_compute(theta_isogeny_t *out,
+                      const theta_structure_t *A,
+                      const theta_point_t *T1_8,
+                      const theta_point_t *T2_8,
+                      bool hadamard_bool_1,
+                      bool hadamard_bool_2,
+                      bool verify)
+{
+    out->hadamard_bool_1 = hadamard_bool_1;
+    out->hadamard_bool_2 = hadamard_bool_2;
+    out->domain = *A;
+    out->T1_8 = *T1_8;
+    out->T2_8 = *T2_8;
+    out->codomain.precomputation = false;
+
+    theta_point_t TT1, TT2;
+
+    if (hadamard_bool_1) {
+        hadamard(&TT1, T1_8);
+        to_squared_theta(&TT1, &TT1);
+        hadamard(&TT2, T2_8);
+        to_squared_theta(&TT2, &TT2);
+    } else {
+        to_squared_theta(&TT1, T1_8);
+        to_squared_theta(&TT2, T2_8);
+    }
+
+    fp2_t t1, t2;
+
+    // Test that our projective factor ABCDxzw is non zero, where
+    // TT1=(Ax, Bx, Cy, Dy), TT2=(Az, Bw, Cz, Dw)
+    // But ABCDxzw=0 can only happen if we had an unexpected splitting in
+    // the isogeny chain.
+    // In either case reject
+    // (this is not strictly necessary, we could just return (0:0:0:0))
+    if (fp2_is_zero(&TT2.x) | fp2_is_zero(&TT2.y) | fp2_is_zero(&TT2.z) | fp2_is_zero(&TT2.t) | fp2_is_zero(&TT1.x) |
+        fp2_is_zero(&TT1.y))
+        return 0;
+
+    fp2_mul(&t1, &TT1.x, &TT2.y);
+    fp2_mul(&t2, &TT1.y, &TT2.x);
+    fp2_mul(&out->codomain.null_point.x, &TT2.x, &t1);
+    fp2_mul(&out->codomain.null_point.y, &TT2.y, &t2);
+    fp2_mul(&out->codomain.null_point.z, &TT2.z, &t1);
+    fp2_mul(&out->codomain.null_point.t, &TT2.t, &t2);
+    fp2_t t3;
+    fp2_mul(&t3, &TT2.z, &TT2.t);
+    fp2_mul(&out->precomputation.x, &t3, &TT1.y);
+    fp2_mul(&out->precomputation.y, &t3, &TT1.x);
+    fp2_copy(&out->precomputation.z, &out->codomain.null_point.t);
+    fp2_copy(&out->precomputation.t, &out->codomain.null_point.z);
+
+    // If T1_8 and T2_8 are our 8-torsion points, this ensures that the
+    // 4-torsion points 2T1_8 and 2T2_8 are isotropic.
+    if (verify) {
+        fp2_mul(&t1, &TT1.x, &out->precomputation.x);
+        fp2_mul(&t2, &TT1.y, &out->precomputation.y);
+        if (!fp2_is_equal(&t1, &t2))
+            return 0;
+        fp2_mul(&t1, &TT1.z, &out->precomputation.z);
+        fp2_mul(&t2, &TT1.t, &out->precomputation.t);
+        if (!fp2_is_equal(&t1, &t2))
+            return 0;
+        fp2_mul(&t1, &TT2.x, &out->precomputation.x);
+        fp2_mul(&t2, &TT2.z, &out->precomputation.z);
+        if (!fp2_is_equal(&t1, &t2))
+            return 0;
+        fp2_mul(&t1, &TT2.y, &out->precomputation.y);
+        fp2_mul(&t2, &TT2.t, &out->precomputation.t);
+        if (!fp2_is_equal(&t1, &t2))
+            return 0;
+    }
+
+    if (hadamard_bool_2) {
+        hadamard(&out->codomain.null_point, &out->codomain.null_point);
+    }
+    return 1;
+}
+
+/**
+ * @brief Compute a (2,2) isogeny when only the 4 torsion above the kernel is known and not the 8
+ * torsion
+ *
+ * @param out Output: the theta_isogeny
+ * @param A a theta null point for the domain
+ * @param T1_4 a point in A[4]
+ * @param T2_4 a point in A[4] (only stored in out, not read by the computation below)
+ * @param hadamard_bool_1 a boolean
+ * @param hadamard_bool_2 a boolean
+ *
+ * out : A -> B of kernel [2](T1_4,T2_4)
+ * hadamard_bool_1 controls if the domain is in standard or dual coordinates
+ * hadamard_bool_2 controls if the codomain is in standard or dual coordinates
+ *
+ */
+static void
+theta_isogeny_compute_4(theta_isogeny_t *out,
+                        const theta_structure_t *A,
+                        const theta_point_t *T1_4,
+                        const theta_point_t *T2_4,
+                        bool hadamard_bool_1,
+                        bool hadamard_bool_2)
+{
+    out->hadamard_bool_1 = hadamard_bool_1;
+    out->hadamard_bool_2 = hadamard_bool_2;
+    out->domain = *A;
+    out->T1_8 = *T1_4;
+    out->T2_8 = *T2_4;
+    out->codomain.precomputation = false;
+
+    theta_point_t TT1, TT2;
+    // we will compute:
+    // TT1 = (xAB, _ , xCD, _)
+    // TT2 = (AA,BB,CC,DD)
+
+    // (only TT1.x and TT1.z are read below)
+
+    if (hadamard_bool_1) {
+        hadamard(&TT1, T1_4);
+        to_squared_theta(&TT1, &TT1);
+
+        hadamard(&TT2, &A->null_point);
+        to_squared_theta(&TT2, &TT2);
+    } else {
+        to_squared_theta(&TT1, T1_4);
+        to_squared_theta(&TT2, &A->null_point);
+    }
+
+    fp2_t sqaabb, sqaacc;
+    fp2_mul(&sqaabb, &TT2.x, &TT2.y);
+    fp2_mul(&sqaacc, &TT2.x, &TT2.z);
+    // No need to check the square roots, only used for signing.
+    // sqaabb = sqrt(AA*BB)
+    fp2_sqrt(&sqaabb);
+    // sqaacc = sqrt(AA*CC)
+    fp2_sqrt(&sqaacc);
+
+    // we compute out->codomain.null_point = (xAB * sqaacc * AA, xAB * sqaabb * sqaacc, xAB * AA * CC,
+    // xCD * sqaabb * AA) and out->precomputation = (xAB * BB * CC * DD, sqaabb * CC * DD * xAB,
+    // sqaacc * BB * DD * xAB, xCD * sqaabb * sqaacc * BB)
+
+    fp2_mul(&out->codomain.null_point.y, &sqaabb, &sqaacc);
+    fp2_mul(&out->precomputation.t, &out->codomain.null_point.y, &TT1.z);
+    fp2_mul(&out->codomain.null_point.y, &out->codomain.null_point.y,
+            &TT1.x); // done for out->codomain.null_point.y
+
+    fp2_mul(&out->codomain.null_point.t, &TT1.z, &sqaabb);
+    fp2_mul(&out->codomain.null_point.t, &out->codomain.null_point.t,
+            &TT2.x); // done for out->codomain.null_point.t
+
+    fp2_mul(&out->codomain.null_point.x, &TT1.x, &TT2.x);
+    fp2_mul(&out->codomain.null_point.z, &out->codomain.null_point.x,
+            &TT2.z); // done for out->codomain.null_point.z
+    fp2_mul(&out->codomain.null_point.x, &out->codomain.null_point.x,
+            &sqaacc); // done for out->codomain.null_point.x
+
+    fp2_mul(&out->precomputation.x, &TT1.x, &TT2.t);
+    fp2_mul(&out->precomputation.z, &out->precomputation.x, &TT2.y);
+    fp2_mul(&out->precomputation.x, &out->precomputation.x, &TT2.z);
+    fp2_mul(&out->precomputation.y, &out->precomputation.x, &sqaabb); // done for out->precomputation.y
+    fp2_mul(&out->precomputation.x, &out->precomputation.x, &TT2.y);  // done for out->precomputation.x
+    fp2_mul(&out->precomputation.z, &out->precomputation.z, &sqaacc); // done for out->precomputation.z
+    fp2_mul(&out->precomputation.t, &out->precomputation.t, &TT2.y);  // done for out->precomputation.t
+
+    if (hadamard_bool_2) {
+        hadamard(&out->codomain.null_point, &out->codomain.null_point);
+    }
+}
+
+/**
+ * @brief Compute a (2,2) isogeny when only the kernel is known and not the 8 or 4 torsion above
+ *
+ * @param out Output: the theta_isogeny
+ * @param A a theta null point for the domain
+ * @param T1_2 a point in A[2] (only stored in out)
+ * @param T2_2 a point in A[2] (only stored in out)
+ * @param hadamard_bool_1 a boolean
+ * @param hadamard_bool_2 a boolean
+ *
+ * out : A -> B of kernel (T1_2,T2_2)
+ * hadamard_bool_1 controls if the domain is in standard or dual coordinates
+ * hadamard_bool_2 controls if the codomain is in standard or dual coordinates
+ *
+ */
+static void
+theta_isogeny_compute_2(theta_isogeny_t *out,
+                        const theta_structure_t *A,
+                        const theta_point_t *T1_2,
+                        const theta_point_t *T2_2,
+                        bool hadamard_bool_1,
+                        bool hadamard_bool_2)
+{
+    out->hadamard_bool_1 = hadamard_bool_1;
+    out->hadamard_bool_2 = hadamard_bool_2;
+    out->domain = *A;
+    out->T1_8 = *T1_2;
+    out->T2_8 = *T2_2;
+    out->codomain.precomputation = false;
+
+    theta_point_t TT2;
+    // we will compute:
+    // TT2 = (AA,BB,CC,DD)
+
+    if (hadamard_bool_1) {
+        hadamard(&TT2, &A->null_point);
+        to_squared_theta(&TT2, &TT2);
+    } else {
+        to_squared_theta(&TT2, &A->null_point);
+    }
+
+    // we compute out->codomain.null_point = (AA, sqaabb, sqaacc, sqaadd) with sqaaxx = sqrt(AA * XX)
+    // out->precomputation = (BB * CC * DD, sqaabb * CC * DD, sqaacc * BB * DD, sqaadd * BB * CC)
+    fp2_copy(&out->codomain.null_point.x, &TT2.x);
+    fp2_mul(&out->codomain.null_point.y, &TT2.x, &TT2.y);
+    fp2_mul(&out->codomain.null_point.z, &TT2.x, &TT2.z);
+    fp2_mul(&out->codomain.null_point.t, &TT2.x, &TT2.t);
+    // No need to check the square roots, only used for signing.
+    fp2_sqrt(&out->codomain.null_point.y);
+    fp2_sqrt(&out->codomain.null_point.z);
+    fp2_sqrt(&out->codomain.null_point.t);
+
+    fp2_mul(&out->precomputation.x, &TT2.z, &TT2.t);
+    fp2_mul(&out->precomputation.y,
+            &out->precomputation.x,
+            &out->codomain.null_point.y);                            // done for out->precomputation.y
+    fp2_mul(&out->precomputation.x, &out->precomputation.x, &TT2.y); // done for out->precomputation.x
+    fp2_mul(&out->precomputation.z, &TT2.t, &out->codomain.null_point.z);
+    fp2_mul(&out->precomputation.z, &out->precomputation.z, &TT2.y); // done for out->precomputation.z
+    fp2_mul(&out->precomputation.t, &TT2.z, &out->codomain.null_point.t);
+    fp2_mul(&out->precomputation.t, &out->precomputation.t, &TT2.y); // done for out->precomputation.t
+
+    if (hadamard_bool_2) {
+        hadamard(&out->codomain.null_point, &out->codomain.null_point);
+    }
+}
+
+static void
+theta_isogeny_eval(theta_point_t *out, const theta_isogeny_t *phi, const theta_point_t *P)
+{
+    // Optionally apply the Hadamard transform first, then to_squared_theta
+    const theta_point_t *to_square = P;
+    if (phi->hadamard_bool_1) {
+        hadamard(out, P);
+        to_square = out;
+    }
+    to_squared_theta(out, to_square);
+    // Scale each coordinate by the factor stored in phi->precomputation
+    fp2_mul(&out->x, &out->x, &phi->precomputation.x);
+    fp2_mul(&out->y, &out->y, &phi->precomputation.y);
+    fp2_mul(&out->z, &out->z, &phi->precomputation.z);
+    fp2_mul(&out->t, &out->t, &phi->precomputation.t);
+    if (phi->hadamard_bool_2)
+        hadamard(out, out);
+}
+
+#if defined(ENABLE_SIGN)
+// Sample a random secret index in [0, 5]; splitting_compute uses it to pick one of
+// the 6 normalisation matrices for the output of the (2,2)-chain during splitting
+static unsigned char
+sample_random_index(void)
+{
+    // Rejection sampling keeps the reduction mod 6 unbiased: a seed is accepted
+    // only below 4294967292 = 6 * 715827882, the largest multiple of 6 that fits
+    // under 2^32, which happens on the first try with probability ~99.9999999%.
+    unsigned char seed_arr[4];
+    uint32_t seed;
+
+    do {
+        randombytes(seed_arr, 4);
+        // widen before shifting: a byte >= 0x80 promoted to int overflows under << 24
+        seed = ((uint32_t)seed_arr[0] | ((uint32_t)seed_arr[1] << 8) | ((uint32_t)seed_arr[2] << 16) |
+                ((uint32_t)seed_arr[3] << 24));
+    } while (seed >= 4294967292U);
+
+    uint32_t secret_index = seed - (((uint64_t)seed * 2863311531U) >> 34) * 6;
+    assert(secret_index == seed % 6); // ensure the constant time trick above works
+    return (unsigned char)secret_index;
+}
+#endif
+// Compute the splitting basis change out->M and the transformed null point out->B; true iff exactly one zero is found.
+static bool
+splitting_compute(theta_splitting_t *out, const theta_structure_t *A, int zero_index, bool randomize)
+
+{
+    // ctl: selection mask (0 or 0xFF..FF); count: number of vanishing U_cst seen so far
+    uint32_t ctl;
+    uint32_t count = 0;
+    fp2_t U_cst, t1, t2;
+
+    memset(&out->M, 0, sizeof(basis_change_matrix_t));
+
+    // enumerate through all 10 even indices (one row of EVEN_INDEX each)
+    for (int i = 0; i < 10; i++) {
+        fp2_set_zero(&U_cst);
+        for (int t = 0; t < 4; t++) {
+            // Iterate through the null point
+            choose_index_theta_point(&t2, t, &A->null_point);
+            choose_index_theta_point(&t1, t ^ EVEN_INDEX[i][1], &A->null_point);
+
+            // Compute t1 * t2
+            fp2_mul(&t1, &t1, &t2);
+            // If CHI_EVAL(i,t) is +1 we want ctl to be 0 and
+            // If CHI_EVAL(i,t) is -1 we want ctl to be 0xFF..FF
+            ctl = (uint32_t)(CHI_EVAL[EVEN_INDEX[i][0]][t] >> 1);
+            assert(ctl == 0 || ctl == 0xffffffff);
+
+            fp2_neg(&t2, &t1); // t2 = -(t1 * t2), the candidate picked when ctl is all-ones
+            fp2_select(&t1, &t1, &t2, ctl);
+
+            // Then we compute U_cst ± (t1 * t2)
+            fp2_add(&U_cst, &U_cst, &t1);
+        }
+
+        // If U_cst is 0 then update the splitting matrix
+        ctl = fp2_is_zero(&U_cst);
+        count -= ctl; // presumably ctl is 0 or 0xFF..FF here, so a zero U_cst adds 1 to count
+        select_base_change_matrix(&out->M, &out->M, &SPLITTING_TRANSFORMS[i], ctl);
+        if (zero_index != -1 && i == zero_index &&
+            !ctl) { // extra checks if we know exactly where the 0 index should be
+            return 0;
+        }
+    }
+
+#if defined(ENABLE_SIGN)
+    // Pick a random normalization matrix
+    if (randomize) {
+        unsigned char secret_index = sample_random_index();
+        basis_change_matrix_t Mrandom;
+
+        set_base_change_matrix_from_precomp(&Mrandom, &NORMALIZATION_TRANSFORMS[0]);
+
+        // Use a constant time selection to pick the index we want
+        for (unsigned char i = 1; i < 6; i++) {
+            // When i == secret_index, mask == 0 and 0xFF..FF otherwise
+            int32_t mask = i - secret_index;
+            mask = (mask | -mask) >> 31; // smears the sign bit; relies on arithmetic >> of negative values
+            select_base_change_matrix(&Mrandom, &Mrandom, &NORMALIZATION_TRANSFORMS[i], ~mask);
+        }
+        base_change_matrix_multiplication(&out->M, &Mrandom, &out->M);
+    }
+#else
+    assert(!randomize); // the randomised normalisation only exists in ENABLE_SIGN builds
+#endif
+
+    // apply the isomorphism to ensure the null point is compatible with splitting
+    apply_isomorphism(&out->B.null_point, &out->M, &A->null_point);
+
+    // splitting was successful only if exactly one zero was identified
+    return count == 1;
+}
+
+static int
+theta_product_structure_to_elliptic_product(theta_couple_curve_t *E12, theta_structure_t *A)
+{
+    const theta_point_t *np = &A->null_point;
+    fp2_t x4, w4;
+
+    // splitting_compute is expected to hand us a product null point already;
+    // verify it anyway as a sanity check
+    if (!is_product_theta_point(np)) {
+        return 0;
+    }
+
+    ec_curve_init(&(E12->E1));
+    ec_curve_init(&(E12->E2));
+
+    // A valid elliptic theta null point has no zero coordinate
+    if (fp2_is_zero(&np->x) | fp2_is_zero(&np->y) | fp2_is_zero(&np->z)) {
+        return 0;
+    }
+
+    // x4 = x^4 is shared by both curve coefficients
+    fp2_sqr(&x4, &np->x);
+    fp2_sqr(&x4, &x4);
+
+    // E2: (A : C) = (-2(x^4 + y^4) : x^4 - y^4), with w4 = y^4
+    fp2_sqr(&w4, &np->y);
+    fp2_sqr(&w4, &w4);
+    fp2_add(&E12->E2.A, &x4, &w4);
+    fp2_sub(&E12->E2.C, &x4, &w4);
+    fp2_add(&E12->E2.A, &E12->E2.A, &E12->E2.A);
+    fp2_neg(&E12->E2.A, &E12->E2.A);
+
+    // E1: (A : C) = (-2(x^4 + z^4) : x^4 - z^4), with w4 = z^4
+    fp2_sqr(&w4, &np->z);
+    fp2_sqr(&w4, &w4);
+    fp2_add(&E12->E1.A, &x4, &w4);
+    fp2_sub(&E12->E1.C, &x4, &w4);
+    fp2_add(&E12->E1.A, &E12->E1.A, &E12->E1.A);
+    fp2_neg(&E12->E1.A, &E12->E1.A);
+
+    // Both C values act as denominators: a zero one means failure
+    if (fp2_is_zero(&E12->E1.C) | fp2_is_zero(&E12->E2.C)) {
+        return 0;
+    }
+
+    return 1;
+}
+
+static int
+theta_point_to_montgomery_point(theta_couple_point_t *P12, const theta_point_t *P, const theta_structure_t *A)
+{
+    const fp2_t *c0, *c1;
+    fp2_t cross;
+
+    if (!is_product_theta_point(P)) {
+        return 0;
+    }
+
+    // P12->P2: read it from (x, y), or from (z, t) when x = y = 0
+    const uint32_t xy_zero = fp2_is_zero(&P->x) & fp2_is_zero(&P->y);
+    c0 = xy_zero ? &P->z : &P->x;
+    c1 = xy_zero ? &P->t : &P->y;
+    if (fp2_is_zero(c0) & fp2_is_zero(c1)) {
+        return 0; // at this point P=(0:0:0:0) so is invalid
+    }
+
+    // P2.X = A.null_point.y * c0 + A.null_point.x * c1
+    // P2.Z = - A.null_point.y * c0 + A.null_point.x * c1
+    fp2_mul(&P12->P2.x, &A->null_point.y, c0);
+    fp2_mul(&cross, &A->null_point.x, c1);
+    fp2_sub(&P12->P2.z, &cross, &P12->P2.x);
+    fp2_add(&P12->P2.x, &P12->P2.x, &cross);
+
+    // P12->P1: read it from (x, z), or from (y, t) when x = z = 0
+    const uint32_t xz_zero = fp2_is_zero(&P->x) & fp2_is_zero(&P->z);
+    c0 = xz_zero ? &P->y : &P->x;
+    c1 = xz_zero ? &P->t : &P->z;
+
+    // P1.X = A.null_point.z * c0 + A.null_point.x * c1
+    // P1.Z = -A.null_point.z * c0 + A.null_point.x * c1
+    fp2_mul(&P12->P1.x, &A->null_point.z, c0);
+    fp2_mul(&cross, &A->null_point.x, c1);
+    fp2_sub(&P12->P1.z, &cross, &P12->P1.x);
+    fp2_add(&P12->P1.x, &P12->P1.x, &cross);
+
+    return 1;
+}
+// Shared worker behind the theta_chain_compute_and_eval* entry points; returns 1 on success and 0 on any failure.
+static int
+_theta_chain_compute_impl(unsigned n,
+                          theta_couple_curve_t *E12,
+                          const theta_kernel_couple_points_t *ker,
+                          bool extra_torsion,
+                          theta_couple_curve_t *E34,
+                          theta_couple_point_t *P12,
+                          size_t numP,
+                          bool verify,
+                          bool randomize)
+{
+    theta_structure_t theta;
+
+    // lift the basis
+    theta_couple_jac_point_t xyT1, xyT2;
+
+    ec_basis_t bas1 = { .P = ker->T1.P1, .Q = ker->T2.P1, .PmQ = ker->T1m2.P1 };
+    ec_basis_t bas2 = { .P = ker->T1.P2, .Q = ker->T2.P2, .PmQ = ker->T1m2.P2 };
+    if (!lift_basis(&xyT1.P1, &xyT2.P1, &bas1, &E12->E1))
+        return 0;
+    if (!lift_basis(&xyT1.P2, &xyT2.P2, &bas2, &E12->E2))
+        return 0;
+
+    const unsigned extra = HD_extra_torsion * extra_torsion; // HD_extra_torsion when extra_torsion is set, else 0
+
+#ifndef NDEBUG
+    assert(extra == 0 || extra == 2); // only cases implemented
+    if (!test_point_order_twof(&bas2.P, &E12->E2, n + extra))
+        debug_print("bas2.P does not have correct order");
+
+    if (!test_jac_order_twof(&xyT2.P2, &E12->E2, n + extra))
+        debug_print("xyT2.P2 does not have correct order");
+#endif
+
+    theta_point_t pts[numP ? numP : 1]; // VLA; at least one element so numP == 0 still gives a valid length
+
+    int space = 1; // ends up as 1 + ceil(log2(n)) for n >= 1: capacity of todo[] and of the point stacks
+    for (unsigned i = 1; i < n; i *= 2)
+        ++space;
+
+    uint16_t todo[space];
+    todo[0] = n - 2 + extra;
+
+    int current = 0;
+
+    // kernel points for the gluing isogeny
+    theta_couple_jac_point_t jacQ1[space], jacQ2[space];
+    jacQ1[0] = xyT1;
+    jacQ2[0] = xyT2;
+    while (todo[current] != 1) {
+        assert(todo[current] >= 2);
+        ++current;
+        assert(current < space);
+        // the gluing isogeny is quite a bit more expensive than the others,
+        // so we adjust the usual splitting rule here a little bit: towards
+        // the end of the doubling chain it will be cheaper to recompute the
+        // doublings after evaluation than to push the intermediate points.
+        const unsigned num_dbls = todo[current - 1] >= 16 ? todo[current - 1] / 2 : todo[current - 1] - 1;
+        assert(num_dbls && num_dbls < todo[current - 1]);
+        double_couple_jac_point_iter(&jacQ1[current], num_dbls, &jacQ1[current - 1], E12);
+        double_couple_jac_point_iter(&jacQ2[current], num_dbls, &jacQ2[current - 1], E12);
+        todo[current] = todo[current - 1] - num_dbls;
+    }
+
+    // kernel points for the remaining isogeny steps
+    theta_point_t thetaQ1[space], thetaQ2[space];
+
+    // the gluing step
+    theta_gluing_t first_step;
+    {
+        assert(todo[current] == 1);
+
+        // compute the gluing isogeny
+        if (!gluing_compute(&first_step, E12, &jacQ1[current], &jacQ2[current], verify))
+            return 0;
+
+        // evaluate
+        for (unsigned j = 0; j < numP; ++j) {
+            assert(ec_is_zero(&P12[j].P1) || ec_is_zero(&P12[j].P2));
+            if (!gluing_eval_point_special_case(&pts[j], &P12[j], &first_step))
+                return 0;
+        }
+
+        // push kernel points through gluing isogeny
+        for (int j = 0; j < current; ++j) {
+            gluing_eval_basis(&thetaQ1[j], &thetaQ2[j], &jacQ1[j], &jacQ2[j], &first_step);
+            --todo[j];
+        }
+
+        --current;
+    }
+
+    // set-up the theta_structure for the first codomain
+    theta.null_point = first_step.codomain;
+    theta.precomputation = 0; // cleared so that theta_precomputation does not return early
+    theta_precomputation(&theta);
+
+    theta_isogeny_t step;
+
+    // and now we do the remaining steps
+    for (unsigned i = 1; current >= 0 && todo[current]; ++i) {
+        assert(current < space);
+        while (todo[current] != 1) {
+            assert(todo[current] >= 2);
+            ++current;
+            assert(current < space);
+            const unsigned num_dbls = todo[current - 1] / 2;
+            assert(num_dbls && num_dbls < todo[current - 1]);
+            double_iter(&thetaQ1[current], &theta, &thetaQ1[current - 1], num_dbls);
+            double_iter(&thetaQ2[current], &theta, &thetaQ2[current - 1], num_dbls);
+            todo[current] = todo[current - 1] - num_dbls;
+        }
+
+        // computing the next step
+        int ret;
+        if (i == n - 2) // penultimate step
+            ret = theta_isogeny_compute(&step, &theta, &thetaQ1[current], &thetaQ2[current], 0, 0, verify);
+        else if (i == n - 1) // ultimate step
+            ret = theta_isogeny_compute(&step, &theta, &thetaQ1[current], &thetaQ2[current], 1, 0, false);
+        else
+            ret = theta_isogeny_compute(&step, &theta, &thetaQ1[current], &thetaQ2[current], 0, 1, verify);
+        if (!ret)
+            return 0;
+
+        for (unsigned j = 0; j < numP; ++j)
+            theta_isogeny_eval(&pts[j], &step, &pts[j]);
+
+        // updating the codomain
+        theta = step.codomain;
+
+        // pushing the kernel
+        assert(todo[current] == 1);
+        for (int j = 0; j < current; ++j) {
+            theta_isogeny_eval(&thetaQ1[j], &step, &thetaQ1[j]);
+            theta_isogeny_eval(&thetaQ2[j], &step, &thetaQ2[j]);
+            assert(todo[j]);
+            --todo[j];
+        }
+
+        --current;
+    }
+
+    assert(current == -1);
+
+    if (!extra_torsion) {
+        if (n >= 3) {
+            // in the last step we've skipped pushing the kernel since current was == 0, let's do it now
+            theta_isogeny_eval(&thetaQ1[0], &step, &thetaQ1[0]);
+            theta_isogeny_eval(&thetaQ2[0], &step, &thetaQ2[0]);
+        }
+
+        // penultimate step
+        theta_isogeny_compute_4(&step, &theta, &thetaQ1[0], &thetaQ2[0], 0, 0);
+        for (unsigned j = 0; j < numP; ++j)
+            theta_isogeny_eval(&pts[j], &step, &pts[j]);
+        theta = step.codomain;
+        theta_isogeny_eval(&thetaQ1[0], &step, &thetaQ1[0]);
+        theta_isogeny_eval(&thetaQ2[0], &step, &thetaQ2[0]);
+
+        // ultimate step
+        theta_isogeny_compute_2(&step, &theta, &thetaQ1[0], &thetaQ2[0], 1, 0);
+        for (unsigned j = 0; j < numP; ++j)
+            theta_isogeny_eval(&pts[j], &step, &pts[j]);
+        theta = step.codomain;
+    }
+
+    // final splitting step
+    theta_splitting_t last_step;
+
+    bool is_split = splitting_compute(&last_step, &theta, extra_torsion ? 8 : -1, randomize); // -1 disables the zero-index check
+
+    if (!is_split) {
+        debug_print("kernel did not generate an isogeny between elliptic products");
+        return 0;
+    }
+
+    if (!theta_product_structure_to_elliptic_product(E34, &last_step.B))
+        return 0;
+
+    // evaluate
+    for (size_t j = 0; j < numP; ++j) {
+        apply_isomorphism(&pts[j], &last_step.M, &pts[j]);
+        if (!theta_point_to_montgomery_point(&P12[j], &pts[j], &last_step.B))
+            return 0;
+    }
+
+    return 1;
+}
+
+// Compute the chain and push the numP points of P12 through it, with neither
+// the extra verification checks nor the randomised normalisation enabled.
+int
+theta_chain_compute_and_eval(unsigned n, /*const*/ theta_couple_curve_t *E12,
+                             const theta_kernel_couple_points_t *ker, bool extra_torsion,
+                             theta_couple_curve_t *E34, theta_couple_point_t *P12, size_t numP)
+{
+    const bool verify = false;
+    const bool randomize = false;
+    return _theta_chain_compute_impl(n, E12, ker, extra_torsion, E34, P12, numP, verify, randomize);
+}
+
+// Variant of theta_chain_compute_and_eval that switches the extra
+// verification checks on (randomisation stays off); this is the entry point
+// used in the signature verification
+int
+theta_chain_compute_and_eval_verify(unsigned n,
+                                    /*const*/ theta_couple_curve_t *E12,
+                                    const theta_kernel_couple_points_t *ker, bool extra_torsion,
+                                    theta_couple_curve_t *E34, theta_couple_point_t *P12, size_t numP)
+{
+    const bool verify = true;
+    const bool randomize = false;
+    return _theta_chain_compute_impl(n, E12, ker, extra_torsion, E34, P12, numP, verify, randomize);
+}
+
+// Variant of theta_chain_compute_and_eval that asks the final splitting step
+// for a random normalisation matrix (no extra verification checks).
+int
+theta_chain_compute_and_eval_randomized(unsigned n, /*const*/ theta_couple_curve_t *E12,
+                                        const theta_kernel_couple_points_t *ker, bool extra_torsion,
+                                        theta_couple_curve_t *E34, theta_couple_point_t *P12, size_t numP)
+{
+    const bool verify = false;
+    const bool randomize = true;
+    return _theta_chain_compute_impl(n, E12, ker, extra_torsion, E34, P12, numP, verify, randomize);
+}
diff --git a/src/pqm4/sqisign_lvl5/ref/theta_isogenies.h b/src/pqm4/sqisign_lvl5/ref/theta_isogenies.h
new file mode 100644
index 0000000..d151811
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/theta_isogenies.h
@@ -0,0 +1,18 @@
+/** @file
+ *
+ * @authors Antonin Leroux
+ *
+ * @brief the theta isogeny header
+ */
+
+#ifndef THETA_ISOGENY_H
+#define THETA_ISOGENY_H
+
+#include <sqisign_namespace.h>
+#include <ec.h>
+#include <fp2.h>
+#include "theta_structure.h"
+#include <hd.h>
+#include <hd_splitting_transforms.h>
+
+#endif
diff --git a/src/pqm4/sqisign_lvl5/ref/theta_structure.c b/src/pqm4/sqisign_lvl5/ref/theta_structure.c
new file mode 100644
index 0000000..ce97ac6
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/theta_structure.c
@@ -0,0 +1,78 @@
+#include "theta_structure.h"
+#include <assert.h>
+
+void
+theta_precomputation(theta_structure_t *A)
+{
+    if (A->precomputation) { // constants already cached: nothing to do
+        return;
+    }
+
+    fp2_t xy, zt;
+    theta_point_t dual;
+
+    // Triple products of the coordinates of to_squared_theta(null_point)
+    to_squared_theta(&dual, &A->null_point);
+    fp2_mul(&xy, &dual.x, &dual.y);
+    fp2_mul(&zt, &dual.z, &dual.t);
+    fp2_mul(&A->XYZ0, &xy, &dual.z);
+    fp2_mul(&A->XYT0, &xy, &dual.t);
+    fp2_mul(&A->YZT0, &zt, &dual.y);
+    fp2_mul(&A->XZT0, &zt, &dual.x);
+
+    // The same triple products for the null point itself
+    fp2_mul(&xy, &A->null_point.x, &A->null_point.y);
+    fp2_mul(&zt, &A->null_point.z, &A->null_point.t);
+    fp2_mul(&A->xyz0, &xy, &A->null_point.z);
+    fp2_mul(&A->xyt0, &xy, &A->null_point.t);
+    fp2_mul(&A->yzt0, &zt, &A->null_point.y);
+    fp2_mul(&A->xzt0, &zt, &A->null_point.x);
+    A->precomputation = true;
+}
+
+void
+double_point(theta_point_t *out, theta_structure_t *A, const theta_point_t *in)
+{
+    // Hadamard of the squares, followed by a second coordinate-wise squaring
+    to_squared_theta(out, in);
+    pointwise_square(out, out);
+
+    // Returns immediately when the constants of A are already available
+    theta_precomputation(A);
+
+    // Scale by the constants built from the squared-theta null point ...
+    fp2_mul(&out->x, &out->x, &A->YZT0);
+    fp2_mul(&out->y, &out->y, &A->XZT0);
+    fp2_mul(&out->z, &out->z, &A->XYT0);
+    fp2_mul(&out->t, &out->t, &A->XYZ0);
+
+    hadamard(out, out);
+
+    // ... and, after the Hadamard transform, by those of the null point itself
+    fp2_mul(&out->x, &out->x, &A->yzt0);
+    fp2_mul(&out->y, &out->y, &A->xzt0);
+    fp2_mul(&out->z, &out->z, &A->xyt0);
+    fp2_mul(&out->t, &out->t, &A->xyz0);
+}
+
+void
+double_iter(theta_point_t *out, theta_structure_t *A, const theta_point_t *in, int exp)
+{
+    if (exp == 0) {
+        *out = *in; // no doubling requested: plain copy
+        return;
+    }
+    double_point(out, A, in); // the first doubling reads in ...
+    for (int done = 1; done < exp; done++) {
+        double_point(out, A, out); // ... the remaining ones work in place on out
+    }
+}
+
+uint32_t
+is_product_theta_point(const theta_point_t *P)
+{
+    fp2_t xt, yz; // the test performed is x*t == y*z
+    fp2_mul(&yz, &P->y, &P->z);
+    fp2_mul(&xt, &P->x, &P->t);
+    return fp2_is_equal(&xt, &yz);
+}
diff --git a/src/pqm4/sqisign_lvl5/ref/theta_structure.h b/src/pqm4/sqisign_lvl5/ref/theta_structure.h
new file mode 100644
index 0000000..fc630b7
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/theta_structure.h
@@ -0,0 +1,135 @@
+/** @file
+ *
+ * @authors Antonin Leroux
+ *
+ * @brief the theta structure header
+ */
+
+#ifndef THETA_STRUCTURE_H
+#define THETA_STRUCTURE_H
+
+#include <ec.h>
+#include <fp2.h>
+#include <hd.h>
+
+/** @internal
+ * @ingroup hd_module
+ * @defgroup hd_theta Functions for theta structures
+ * @{
+ */
+
+/**
+ * @brief Hadamard transform of a theta point
+ *
+ * @param out Output: the transformed theta point; may be the same object as in
+ * @param in the theta point to transform
+ *
+ * For in = (x,y,z,t) the result is
+ * out = (x+y+z+t, x-y+z-t, x+y-z-t, x-y-z+t)
+ *
+ */
+static inline void
+hadamard(theta_point_t *out, const theta_point_t *in)
+{
+    fp2_t xy_sum, xy_diff, zt_sum, zt_diff;
+
+    // Every read of in happens before the first write to out, which is what
+    // makes the in-place call hadamard(P, P) valid
+    fp2_add(&xy_sum, &in->x, &in->y);
+    fp2_sub(&xy_diff, &in->x, &in->y);
+    fp2_add(&zt_sum, &in->z, &in->t);
+    fp2_sub(&zt_diff, &in->z, &in->t);
+
+    // Recombine the partial sums into the four output coordinates
+    fp2_add(&out->x, &xy_sum, &zt_sum);   // (x+y) + (z+t)
+    fp2_add(&out->y, &xy_diff, &zt_diff); // (x-y) + (z-t)
+    fp2_sub(&out->z, &xy_sum, &zt_sum);   // (x+y) - (z+t)
+    fp2_sub(&out->t, &xy_diff, &zt_diff); // (x-y) - (z-t)
+}
+
+/**
+ * @brief Square each coordinate of a theta point independently
+ * @param out Output: the theta_point of squares
+ * @param in a theta point; each coordinate is read once and written to the same slot of out
+ * in = (x,y,z,t)
+ * out = (x^2, y^2, z^2, t^2)
+ *
+ */
+static inline void
+pointwise_square(theta_point_t *out, const theta_point_t *in)
+{
+    fp2_sqr(&out->x, &in->x);
+    fp2_sqr(&out->y, &in->y);
+    fp2_sqr(&out->z, &in->z);
+    fp2_sqr(&out->t, &in->t);
+}
+
+/**
+ * @brief Square the coordinates, then apply the Hadamard transform
+ * @param out Output: the theta_point
+ * @param in a theta point (x,y,z,t)
+ * out = (x^2+y^2+z^2+t^2, x^2-y^2+z^2-t^2, x^2+y^2-z^2-t^2, x^2-y^2-z^2+t^2)
+ */
+static inline void
+to_squared_theta(theta_point_t *out, const theta_point_t *in)
+{
+    fp2_sqr(&out->x, &in->x);
+    fp2_sqr(&out->y, &in->y);
+    fp2_sqr(&out->z, &in->z);
+    fp2_sqr(&out->t, &in->t);
+    hadamard(out, out);
+}
+
+/**
+ * @brief Perform the theta structure precomputation (returns at once if already done)
+ *
+ * @param A Input/Output: the theta_structure; A.null_point is read, the constants are written
+ *
+ * With (x,y,z,t) = A.null_point and (X,Y,Z,T) = to_squared_theta(A.null_point),
+ * fills xyz0,xyt0,yzt0,xzt0 = xyz,xyt,yzt,xzt and XYZ0,XYT0,YZT0,XZT0 = XYZ,XYT,YZT,XZT,
+ * then sets A.precomputation
+ *
+ */
+void theta_precomputation(theta_structure_t *A);
+
+/**
+ * @brief Compute the double of the theta point in on the theta structure A
+ *
+ * @param out Output: the theta_point
+ * @param A a theta structure; non-const because its precomputation is run on demand
+ * @param in a theta point in the theta structure A
+ * in = (x,y,z,t)
+ * out = [2] (x,y,z,t)
+ * /!\ assumes that no coordinates is zero; the precomputation of A is performed here if missing
+ *
+ */
+void double_point(theta_point_t *out, theta_structure_t *A, const theta_point_t *in);
+
+/**
+ * @brief Compute the iterated double of the theta point in on the theta structure A
+ *
+ * @param out Output: the theta_point
+ * @param A a theta structure
+ * @param in a theta point in the theta structure A
+ * @param exp the number of doublings; exp == 0 copies in to out
+ * in = (x,y,z,t)
+ * out = [2^exp] (x,y,z,t)
+ * /!\ assumes that no coordinates is zero; the precomputation of A is performed on demand
+ *
+ */
+void double_iter(theta_point_t *out, theta_structure_t *A, const theta_point_t *in, int exp);
+
+/**
+ * @brief Check if a theta point is a product theta point, i.e. whether x*t == y*z
+ *
+ * @param P a theta point (x,y,z,t)
+ * @return the value of fp2_is_equal on x*t and y*z: 0xFFFFFFFF if true, zero otherwise
+ */
+uint32_t is_product_theta_point(const theta_point_t *P);
+
+// end hd_theta
+/**
+ * @}
+ */
+
+#endif
diff --git a/src/pqm4/sqisign_lvl5/ref/tools.h b/src/pqm4/sqisign_lvl5/ref/tools.h
new file mode 100644
index 0000000..5a6a505
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/tools.h
@@ -0,0 +1,49 @@
+
+#ifndef TOOLS_H
+#define TOOLS_H
+
+#include <time.h>
+
+// Debug printing:
+// https://stackoverflow.com/questions/1644868/define-macro-for-debug-printing-in-c
+// debug_print(msg) prints a warning with file, line and function unless NDEBUG
+// is defined, in which case the call is compiled but never executed. It expands
+// to printf, so the including translation unit must provide <stdio.h>.
+#ifndef NDEBUG
+#define DEBUG_PRINT 1
+#else
+#define DEBUG_PRINT 0
+#endif
+
+// __FILE_NAME__ is a compiler extension; fall back to a placeholder without it
+#ifndef __FILE_NAME__
+#define __FILE_NAME__ "NA"
+#endif
+
+// __LINE__ is always predefined and __func__ is a predefined identifier, not a
+// macro: an "#ifndef __func__" fallback would always fire and replace the real
+// function name with a placeholder, so neither gets a fallback definition.
+
+#define debug_print(fmt)                                                                           \
+    do {                                                                                           \
+        if (DEBUG_PRINT)                                                                           \
+            printf("warning: %s, file %s, line %d, function %s().\n",                              \
+                   (fmt),                                                                          \
+                   __FILE_NAME__,                                                                  \
+                   __LINE__,                                                                       \
+                   __func__);                                                                      \
+    } while (0)
+
+
+clock_t tic(void); /* NOTE(review): presumably starts the timer that tac/TAC read - definition not in this chunk */
+float tac(void);                             /* time in ms since last tic */
+float TAC(const char *str);                  /* same, but prints it with label 'str' */
+float toc(const clock_t t);                  /* time in ms since t */
+float TOC(const clock_t t, const char *str); /* same, but prints it with label 'str' */
+float TOC_clock(const clock_t t, const char *str); /* NOTE(review): undocumented; presumably TOC reported in clock cycles - confirm */
+
+clock_t dclock(const clock_t t); // return the clock cycle diff between now and t
+float clock_to_time(const clock_t t,
+                    const char *str); // convert the number of clock cycles t to time
+float clock_print(const clock_t t, const char *str); // NOTE(review): undocumented; presumably prints t with label 'str' - confirm
+#endif
diff --git a/src/pqm4/sqisign_lvl5/ref/tutil.h b/src/pqm4/sqisign_lvl5/ref/tutil.h
new file mode 100644
index 0000000..59f1620
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/tutil.h
@@ -0,0 +1,36 @@
+#ifndef TUTIL_H
+#define TUTIL_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+#define BSWAP16(i) __builtin_bswap16((i))
+#define BSWAP32(i) __builtin_bswap32((i))
+#define BSWAP64(i) __builtin_bswap64((i))
+#define UNUSED __attribute__((unused))
+#else // portable fallbacks: the argument is evaluated several times, keep it free of side effects
+#define BSWAP16(i) ((uint16_t)((((uint16_t)(i) >> 8) & 0xff) | (((uint16_t)(i) & 0xff) << 8)))
+#define BSWAP32(i)                                                                                 \
+    ((((uint32_t)(i) >> 24) & 0xff) | (((uint32_t)(i) >> 8) & 0xff00) | (((uint32_t)(i) & 0xff00) << 8) | ((uint32_t)(i) << 24))
+#define BSWAP64(i) (((uint64_t)BSWAP32((uint64_t)(i) >> 32)) | ((uint64_t)BSWAP32(i) << 32))
+#define UNUSED
+#endif
+
+#if defined(RADIX_64)
+#define digit_t uint64_t // unsigned word holding one multi-precision digit
+#define sdigit_t int64_t // signed type of the same width as digit_t
+#define RADIX 64 // bit width of digit_t
+#define LOG2RADIX 6 // log2(RADIX)
+#define BSWAP_DIGIT(i) BSWAP64(i) // byte swap of one digit
+#elif defined(RADIX_32)
+#define digit_t uint32_t // unsigned word holding one multi-precision digit
+#define sdigit_t int32_t // signed type of the same width as digit_t
+#define RADIX 32 // bit width of digit_t
+#define LOG2RADIX 5 // log2(RADIX)
+#define BSWAP_DIGIT(i) BSWAP32(i) // byte swap of one digit
+#else
+#error "Radix must be 32bit or 64 bit"
+#endif
+
+#endif
diff --git a/src/pqm4/sqisign_lvl5/ref/verification.h b/src/pqm4/sqisign_lvl5/ref/verification.h
new file mode 100644
index 0000000..af67469
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/verification.h
@@ -0,0 +1,123 @@
+/** @file
+ *
+ * @brief The verification protocol
+ */
+
+#ifndef VERIFICATION_H
+#define VERIFICATION_H
+
+#include <sqisign_namespace.h>
+#include <ec.h>
+
+/** @defgroup verification SQIsignHD verification protocol
+ * @{
+ */
+
+/** @defgroup verification_t Types for SQIsignHD verification protocol
+ * @{
+ */
+
+typedef digit_t scalar_t[NWORDS_ORDER]; // multi-precision scalar stored as NWORDS_ORDER digits
+typedef scalar_t scalar_mtx_2x2_t[2][2]; // 2x2 matrix whose entries are scalar_t values
+
+/** @brief Type for the signature
+ *
+ * @typedef signature_t
+ *
+ * @struct signature
+ * Decoded form of a signature, as taken by protocols_verify
+ */
+typedef struct signature
+{
+    fp2_t E_aux_A; // the Montgomery A-coefficient for the auxiliary curve
+    uint8_t backtracking; // verify.c subtracts it from TORSION_EVEN_POWER (challenge length) and from the canonical bound exponent
+    uint8_t two_resp_length; // NOTE(review): not used in the visible part of verify.c - meaning to be confirmed
+    scalar_mtx_2x2_t mat_Bchall_can_to_B_chall; // the matrix of the desired basis
+    scalar_t chall_coeff; // scalar c of the challenge kernel P + [c]Q
+    uint8_t hint_aux; // NOTE(review): presumably a basis hint like public_key_t.hint_pk - confirm
+    uint8_t hint_chall; // NOTE(review): presumably a basis hint like public_key_t.hint_pk - confirm
+} signature_t;
+
+/** @brief Type for the public keys
+ *
+ * @typedef public_key_t
+ *
+ * @struct public_key
+ * Decoded form of a public key, as taken by protocols_verify
+ */
+typedef struct public_key
+{
+    ec_curve_t curve; // the normalized A-coefficient of the Montgomery curve
+    uint8_t hint_pk; // NOTE(review): presumably the hint verify.c hands to ec_curve_to_basis_2f_from_hint - confirm
+} public_key_t;
+
+/** @}
+ */
+
+/*************************** Functions *****************************/
+
+void public_key_init(public_key_t *pk); // NOTE(review): definition outside this chunk; presumably prepares pk for use
+void public_key_finalize(public_key_t *pk); // NOTE(review): presumably releases whatever public_key_init set up
+// NOTE(review): presumably derives the challenge scalar from pk, com_curve and the message - confirm against the definition
+void hash_to_challenge(scalar_t *scalar, // the only non-const argument, so presumably the output
+                       const public_key_t *pk,
+                       const ec_curve_t *com_curve,
+                       const unsigned char *message,
+                       size_t length); // presumably the length of message - confirm
+
+/**
+ * @brief Verification
+ *
+ * @param sig signature (passed non-const; NOTE(review): check whether verification modifies it)
+ * @param pk public key
+ * @param m message
+ * @param l size of the message m (NOTE(review): presumably in bytes - confirm)
+ * @returns 1 if the signature verifies, 0 otherwise
+ */
+int protocols_verify(signature_t *sig, const public_key_t *pk, const unsigned char *m, size_t l);
+
+/*************************** Encoding *****************************/
+
+/** @defgroup encoding Encoding and decoding functions
+ * @{
+ */
+
+/**
+ * @brief Encodes a signature as a byte array
+ *
+ * @param enc : Byte array to encode the signature in (NOTE(review): required size is not stated here - confirm)
+ * @param sig : Signature to encode
+ */
+void signature_to_bytes(unsigned char *enc, const signature_t *sig);
+
+/**
+ * @brief Decodes a signature from a byte array
+ *
+ * @param sig : Structure to decode the signature in
+ * @param enc : Byte array to decode (NOTE(review): expected length is not stated here - confirm)
+ */
+void signature_from_bytes(signature_t *sig, const unsigned char *enc);
+
+/**
+ * @brief Encodes a public key as a byte array
+ *
+ * @param enc : Byte array to encode the public key in
+ * @param pk : Public key to encode
+ * @return NOTE(review): undocumented; presumably a pointer just past the bytes written - confirm */
+unsigned char *public_key_to_bytes(unsigned char *enc, const public_key_t *pk);
+
+/**
+ * @brief Decodes a public key from a byte array
+ *
+ * @param pk : Structure to decode the public key in
+ * @param enc : Byte array to decode
+ * @return NOTE(review): undocumented; presumably a pointer just past the bytes consumed - confirm */
+const unsigned char *public_key_from_bytes(public_key_t *pk, const unsigned char *enc);
+
+/** @}
+ */
+
+/** @}
+ */
+
+#endif
diff --git a/src/pqm4/sqisign_lvl5/ref/verify.c b/src/pqm4/sqisign_lvl5/ref/verify.c
new file mode 100644
index 0000000..b5f78ad
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/verify.c
@@ -0,0 +1,309 @@
+#include <verification.h>
+#include <mp.h>
+#include <hd.h>
+#include <encoded_sizes.h>
+#include <assert.h>
+
+// Check that the basis change matrix elements are canonical representatives modulo 2^e,
+// with e = SQIsign_response_length + HD_extra_torsion - sig->backtracking. Returns 1 if so, else 0.
+static int
+check_canonical_basis_change_matrix(const signature_t *sig)
+{
+    // This works as long as all values in sig->mat_Bchall_can_to_B_chall are
+    // positive integers.
+    int ret = 1;
+    scalar_t aux;
+
+    memset(aux, 0, NWORDS_ORDER * sizeof(digit_t));
+    aux[0] = 0x1;  // aux = 1; the call below presumably shifts it left by e bits to form the bound 2^e
+    multiple_mp_shiftl(aux, SQIsign_response_length + HD_extra_torsion - (int)sig->backtracking, NWORDS_ORDER);
+
+    for (int i = 0; i < 2; i++) {
+        for (int j = 0; j < 2; j++) {
+            if (mp_compare(aux, sig->mat_Bchall_can_to_B_chall[i][j], NWORDS_ORDER) <= 0) {
+                ret = 0;  // no early exit: all four entries are always compared
+            }
+        }
+    }
+
+    return ret;
+}
+
+// Compute the 2^n isogeny from the signature with kernel P + [chall_coeff]Q
+// and store the codomain in E_chall. Returns 1 on success, 0 if any step fails.
+static int
+compute_challenge_verify(ec_curve_t *E_chall, const signature_t *sig, const ec_curve_t *Epk, const uint8_t hint_pk)
+{
+    ec_basis_t bas_EA;
+    ec_isog_even_t phi_chall;
+
+    // Set domain and length of 2^n isogeny
+    copy_curve(&phi_chall.curve, Epk);
+    phi_chall.length = TORSION_EVEN_POWER - sig->backtracking;
+
+    // Compute the basis from the supplied hint
+    if (!ec_curve_to_basis_2f_from_hint(&bas_EA, &phi_chall.curve, TORSION_EVEN_POWER, hint_pk)) // canonical
+        return 0;
+
+    // recovering the exact challenge
+    {
+        if (!ec_ladder3pt(&phi_chall.kernel, sig->chall_coeff, &bas_EA.P, &bas_EA.Q, &bas_EA.PmQ, &phi_chall.curve)) {
+            return 0;
+        };
+    }
+
+    // Double the kernel until it has the correct order
+    ec_dbl_iter(&phi_chall.kernel, sig->backtracking, &phi_chall.kernel, &phi_chall.curve);
+
+    // Compute the codomain (a non-zero return from ec_eval_even is treated as failure)
+    copy_curve(E_chall, &phi_chall.curve);
+    if (ec_eval_even(E_chall, &phi_chall, NULL, 0))
+        return 0;
+    return 1;
+}
+
+// same as matrix_application_even_basis() in id2iso.c, with some modifications:
+// - this version works with a matrix of scalars (not ibz_t).
+// - reduction modulo 2^f of matrix elements is removed here, because it is
+//   assumed that the elements are already canonical representatives modulo
+//   2^f; this is ensured by calling check_canonical_basis_change_matrix() at
+//   the beginning of protocols_verify().
+static int
+matrix_scalar_application_even_basis(ec_basis_t *bas, const ec_curve_t *E, scalar_mtx_2x2_t *mat, int f)
+{
+    scalar_t scalar0, scalar1;
+    memset(scalar0, 0, NWORDS_ORDER * sizeof(digit_t));
+    memset(scalar1, 0, NWORDS_ORDER * sizeof(digit_t));
+
+    ec_basis_t tmp_bas;
+    copy_basis(&tmp_bas, bas);  // keep the input basis: bas is overwritten in place below
+
+    // For a matrix [[a, c], [b, d]] we compute:
+    //
+    // first basis element R = [a]P + [b]Q
+    if (!ec_biscalar_mul(&bas->P, (*mat)[0][0], (*mat)[1][0], f, &tmp_bas, E))
+        return 0;
+    // second basis element S = [c]P + [d]Q
+    if (!ec_biscalar_mul(&bas->Q, (*mat)[0][1], (*mat)[1][1], f, &tmp_bas, E))
+        return 0;
+    // Their difference R - S = [a - c]P + [b - d]Q, with both coefficients reduced modulo 2^f
+    mp_sub(scalar0, (*mat)[0][0], (*mat)[0][1], NWORDS_ORDER);
+    mp_mod_2exp(scalar0, f, NWORDS_ORDER);
+    mp_sub(scalar1, (*mat)[1][0], (*mat)[1][1], NWORDS_ORDER);
+    mp_mod_2exp(scalar1, f, NWORDS_ORDER);
+    return ec_biscalar_mul(&bas->PmQ, scalar0, scalar1, f, &tmp_bas, E);  // status of the last multiplication
+}
+
+// Compute the bases for the challenge and auxiliary curve from
+// the canonical bases. The challenge basis is then transformed by sig->mat_Bchall_can_to_B_chall.
+// Returns 0 if a hint-based basis computation fails, otherwise the status of the matrix application.
+static int
+challenge_and_aux_basis_verify(ec_basis_t *B_chall_can,
+                               ec_basis_t *B_aux_can,
+                               ec_curve_t *E_chall,
+                               ec_curve_t *E_aux,
+                               signature_t *sig,
+                               const int pow_dim2_deg_resp)
+{
+
+    // recovering the canonical basis as TORSION_EVEN_POWER for consistency with signing
+    if (!ec_curve_to_basis_2f_from_hint(B_chall_can, E_chall, TORSION_EVEN_POWER, sig->hint_chall))
+        return 0;
+
+    // setting to the right order
+    ec_dbl_iter_basis(B_chall_can,
+                      TORSION_EVEN_POWER - pow_dim2_deg_resp - HD_extra_torsion - sig->two_resp_length,
+                      B_chall_can,
+                      E_chall);
+
+    if (!ec_curve_to_basis_2f_from_hint(B_aux_can, E_aux, TORSION_EVEN_POWER, sig->hint_aux))
+        return 0;
+
+    // setting to the right order
+    ec_dbl_iter_basis(B_aux_can, TORSION_EVEN_POWER - pow_dim2_deg_resp - HD_extra_torsion, B_aux_can, E_aux);
+
+#ifndef NDEBUG
+    if (!test_basis_order_twof(B_chall_can, E_chall, HD_extra_torsion + pow_dim2_deg_resp + sig->two_resp_length))
+        debug_print("canonical basis has wrong order, expect something to fail");
+#endif
+
+    // applying the change matrix on the basis of E_chall
+    return matrix_scalar_application_even_basis(B_chall_can,
+                                                E_chall,
+                                                &sig->mat_Bchall_can_to_B_chall,
+                                                pow_dim2_deg_resp + HD_extra_torsion + sig->two_resp_length);
+}
+
+// When two_resp_length is non-zero, we must compute a small 2^n-isogeny,
+// updating E_chall to its codomain and pushing the basis on E_chall
+// through this isogeny. Returns 1 on success, 0 if the chain evaluation fails.
+static int
+two_response_isogeny_verify(ec_curve_t *E_chall, ec_basis_t *B_chall_can, const signature_t *sig, int pow_dim2_deg_resp)
+{
+    ec_point_t ker, points[3];
+
+    // choosing the right point for the small two_isogenies:
+    // Q when both entries of the first matrix column are even, P otherwise
+    if (mp_is_even(sig->mat_Bchall_can_to_B_chall[0][0], NWORDS_ORDER) &&
+        mp_is_even(sig->mat_Bchall_can_to_B_chall[1][0], NWORDS_ORDER)) {
+        copy_point(&ker, &B_chall_can->Q);
+    } else {
+        copy_point(&ker, &B_chall_can->P);
+    }
+
+    copy_point(&points[0], &B_chall_can->P);
+    copy_point(&points[1], &B_chall_can->Q);
+    copy_point(&points[2], &B_chall_can->PmQ);
+
+    ec_dbl_iter(&ker, pow_dim2_deg_resp + HD_extra_torsion, &ker, E_chall);
+
+#ifndef NDEBUG
+    if (!test_point_order_twof(&ker, E_chall, sig->two_resp_length))
+        debug_print("kernel does not have order 2^(two_resp_length");
+#endif
+
+    if (ec_eval_small_chain(E_chall, &ker, sig->two_resp_length, points, 3, false)) {
+        return 0;  // a non-zero return from ec_eval_small_chain is treated as failure
+    }
+
+#ifndef NDEBUG
+    if (!test_point_order_twof(&points[0], E_chall, HD_extra_torsion + pow_dim2_deg_resp))
+        debug_print("points[0] does not have order 2^(HD_extra_torsion + pow_dim2_deg_resp");
+    if (!test_point_order_twof(&points[1], E_chall, HD_extra_torsion + pow_dim2_deg_resp))
+        debug_print("points[1] does not have order 2^(HD_extra_torsion + pow_dim2_deg_resp");
+    if (!test_point_order_twof(&points[2], E_chall, HD_extra_torsion + pow_dim2_deg_resp))
+        debug_print("points[2] does not have order 2^(HD_extra_torsion + pow_dim2_deg_resp");
+#endif
+
+    copy_point(&B_chall_can->P, &points[0]);
+    copy_point(&B_chall_can->Q, &points[1]);
+    copy_point(&B_chall_can->PmQ, &points[2]);
+    return 1;
+}
+
+// The commitment curve E_com is recovered from the codomain of the 2D isogeny built from the
+// bases computed during verification. Returns codomain_splits; the caller treats 0 as failure.
+static int
+compute_commitment_curve_verify(ec_curve_t *E_com,
+                                const ec_basis_t *B_chall_can,
+                                const ec_basis_t *B_aux_can,
+                                const ec_curve_t *E_chall,
+                                const ec_curve_t *E_aux,
+                                int pow_dim2_deg_resp)
+
+{
+#ifndef NDEBUG
+    // Check all the points are the correct order
+    if (!test_basis_order_twof(B_chall_can, E_chall, HD_extra_torsion + pow_dim2_deg_resp))
+        debug_print("B_chall_can does not have order 2^(HD_extra_torsion + pow_dim2_deg_resp");
+
+    if (!test_basis_order_twof(B_aux_can, E_aux, HD_extra_torsion + pow_dim2_deg_resp))
+        debug_print("B_aux_can does not have order 2^(HD_extra_torsion + pow_dim2_deg_resp");
+#endif
+
+    // now compute the dim2 isogeny from E_chall x E_aux -> E_com x E_aux'
+    // of kernel B_chall_can x B_aux_can
+
+    // first we set-up the kernel
+    theta_couple_curve_t EchallxEaux;
+    copy_curve(&EchallxEaux.E1, E_chall);
+    copy_curve(&EchallxEaux.E2, E_aux);
+
+    theta_kernel_couple_points_t dim_two_ker;
+    copy_bases_to_kernel(&dim_two_ker, B_chall_can, B_aux_can);
+
+    // computing the isogeny
+    theta_couple_curve_t codomain;
+    int codomain_splits;
+    ec_curve_init(&codomain.E1);
+    ec_curve_init(&codomain.E2);
+    // handling the special case where we don't need to perform any dim2 computation
+    if (pow_dim2_deg_resp == 0) {
+        codomain_splits = 1;
+        copy_curve(&codomain.E1, &EchallxEaux.E1);
+        copy_curve(&codomain.E2, &EchallxEaux.E2);
+        // We still need to check that E_chall is supersingular
+        // This assumes that HD_extra_torsion == 2
+        if (!ec_is_basis_four_torsion(B_chall_can, E_chall)) {
+            return 0;
+        }
+    } else {
+        codomain_splits = theta_chain_compute_and_eval_verify(
+            pow_dim2_deg_resp, &EchallxEaux, &dim_two_ker, true, &codomain, NULL, 0);
+    }
+
+    // computing the commitment curve
+    // it is always the first one because of our (2^n,2^n)-isogeny formulae
+    copy_curve(E_com, &codomain.E1);
+
+    return codomain_splits;
+}
+
+// SQIsign verification: returns 1 if sig verifies for the message m of length l under pk, 0 otherwise
+int
+protocols_verify(signature_t *sig, const public_key_t *pk, const unsigned char *m, size_t l)
+{
+    int verify;
+
+    if (!check_canonical_basis_change_matrix(sig))
+        return 0;
+
+    // Computation of the length of the dim 2 2^n isogeny
+    int pow_dim2_deg_resp = SQIsign_response_length - (int)sig->two_resp_length - (int)sig->backtracking;
+
+    // basic sanity test: checking that the response is not too long
+    if (pow_dim2_deg_resp < 0)
+        return 0;
+    // The dim 2 isogeny embeds a dim 1 isogeny of odd degree, so it can
+    // never be of length 2.
+    if (pow_dim2_deg_resp == 1)
+        return 0;
+
+    // check the public curve is valid
+    if (!ec_curve_verify_A(&(pk->curve).A))
+        return 0;
+
+    // Set auxiliary curve from the A-coefficient within the signature
+    ec_curve_t E_aux;
+    if (!ec_curve_init_from_A(&E_aux, &sig->E_aux_A))
+        return 0; // invalid curve
+
+    // checking that we are given A-coefficients and no precomputation (assert: inactive when NDEBUG is defined)
+    assert(fp2_is_one(&pk->curve.C) == 0xFFFFFFFF && !pk->curve.is_A24_computed_and_normalized);
+
+    // computation of the challenge
+    ec_curve_t E_chall;
+    if (!compute_challenge_verify(&E_chall, sig, &pk->curve, pk->hint_pk)) {
+        return 0;
+    }
+
+    // Computation of the canonical bases for the challenge and aux curve
+    ec_basis_t B_chall_can, B_aux_can;
+
+    if (!challenge_and_aux_basis_verify(&B_chall_can, &B_aux_can, &E_chall, &E_aux, sig, pow_dim2_deg_resp)) {
+        return 0;
+    }
+
+    // When two_resp_length != 0 we need to compute a second, short 2^r-isogeny
+    if (sig->two_resp_length > 0) {
+        if (!two_response_isogeny_verify(&E_chall, &B_chall_can, sig, pow_dim2_deg_resp)) {
+            return 0;
+        }
+    }
+
+    // We can recover the commitment curve with a 2D isogeny.
+    // If the supplied signature does not yield an isogeny between elliptic products,
+    // it is definitely an invalid signature.
+    ec_curve_t E_com;
+    if (!compute_commitment_curve_verify(&E_com, &B_chall_can, &B_aux_can, &E_chall, &E_aux, pow_dim2_deg_resp))
+        return 0;
+
+    scalar_t chk_chall;
+
+    // recomputing the challenge vector
+    hash_to_challenge(&chk_chall, pk, &E_com, m, l);
+
+    // performing the final check: the recomputed challenge must equal the one carried by the signature
+    verify = mp_compare(sig->chall_coeff, chk_chall, NWORDS_ORDER) == 0;
+
+    return verify;
+}
diff --git a/src/pqm4/sqisign_lvl5/ref/xeval.c b/src/pqm4/sqisign_lvl5/ref/xeval.c
new file mode 100644
index 0000000..7fc7170
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/xeval.c
@@ -0,0 +1,64 @@
+#include "isog.h"
+#include "ec.h"
+#include <assert.h>
+
+// -----------------------------------------------------------------------------------------
+// -----------------------------------------------------------------------------------------
+
+// Degree-2 isogeny evaluation with kernel generated by P != (0, 0)
+void
+xeval_2(ec_point_t *R, ec_point_t *const Q, const int lenQ, const ec_kps2_t *kps)
+{
+    fp2_t t0, t1, t2;  // formulas below assume fp2_add/sub/mul(out, a, b) compute out = a op b
+    for (int j = 0; j < lenQ; j++) {  // R[j] is computed from Q[j] = (X : Z) and kps->K
+        fp2_add(&t0, &Q[j].x, &Q[j].z);  // t0 = X + Z
+        fp2_sub(&t1, &Q[j].x, &Q[j].z);  // t1 = X - Z
+        fp2_mul(&t2, &kps->K.x, &t1);  // t2 = K.x * (X - Z)
+        fp2_mul(&t1, &kps->K.z, &t0);  // t1 = K.z * (X + Z)
+        fp2_add(&t0, &t2, &t1);
+        fp2_sub(&t1, &t2, &t1);
+        fp2_mul(&R[j].x, &Q[j].x, &t0);  // R[j].x = X * (t2 + t1)
+        fp2_mul(&R[j].z, &Q[j].z, &t1);  // R[j].z = Z * (t2 - t1)
+    }
+}
+
+void
+xeval_2_singular(ec_point_t *R, const ec_point_t *Q, const int lenQ, const ec_kps2_t *kps)
+{
+    fp2_t t0, t1;  // formulas below assume fp2_add/mul(out, a, b) compute out = a op b and fp2_sqr(out, a) squares a
+    for (int i = 0; i < lenQ; i++) {  // R[i] is computed from Q[i] = (X : Z) and kps->K
+        fp2_mul(&t0, &Q[i].x, &Q[i].z);  // t0 = X * Z
+        fp2_mul(&t1, &kps->K.x, &Q[i].z);
+        fp2_add(&t1, &t1, &Q[i].x);
+        fp2_mul(&t1, &t1, &Q[i].x);  // t1 = (K.x * Z + X) * X
+        fp2_sqr(&R[i].x, &Q[i].z);
+        fp2_add(&R[i].x, &R[i].x, &t1);  // R[i].x = Z^2 + K.x * X * Z + X^2
+        fp2_mul(&R[i].z, &t0, &kps->K.z);  // R[i].z = K.z * X * Z
+    }
+}
+
+// Degree-4 isogeny evaluation with kernel generated by P such that [2]P != (0, 0)
+void
+xeval_4(ec_point_t *R, const ec_point_t *Q, const int lenQ, const ec_kps4_t *kps)
+{
+    const ec_point_t *K = kps->K;  // only K[0].x, K[1].x and K[2].x are read below
+
+    fp2_t t0, t1;
+
+    for (int i = 0; i < lenQ; i++) {  // R[i] is computed from Q[i]
+        fp2_add(&t0, &Q[i].x, &Q[i].z);
+        fp2_sub(&t1, &Q[i].x, &Q[i].z);
+        fp2_mul(&(R[i].x), &t0, &K[1].x);
+        fp2_mul(&(R[i].z), &t1, &K[2].x);
+        fp2_mul(&t0, &t0, &t1);
+        fp2_mul(&t0, &t0, &K[0].x);
+        fp2_add(&t1, &(R[i].x), &(R[i].z));
+        fp2_sub(&(R[i].z), &(R[i].x), &(R[i].z));
+        fp2_sqr(&t1, &t1);
+        fp2_sqr(&(R[i].z), &(R[i].z));
+        fp2_add(&(R[i].x), &t0, &t1);
+        fp2_sub(&t0, &t0, &(R[i].z));
+        fp2_mul(&(R[i].x), &(R[i].x), &t1);
+        fp2_mul(&(R[i].z), &(R[i].z), &t0);
+    }
+}
diff --git a/src/pqm4/sqisign_lvl5/ref/xisog.c b/src/pqm4/sqisign_lvl5/ref/xisog.c
new file mode 100644
index 0000000..7242d29
--- /dev/null
+++ b/src/pqm4/sqisign_lvl5/ref/xisog.c
@@ -0,0 +1,61 @@
+#include "isog.h"
+#include "ec.h"
+#include <assert.h>
+
+// -------------------------------------------------------------------------
+// -------------------------------------------------------------------------
+
+// Degree-2 isogeny with kernel generated by P != (0 ,0)
+// Outputs the curve coefficient in the form A24=(A+2C:4C)
+void
+xisog_2(ec_kps2_t *kps, ec_point_t *B, const ec_point_t P)
+{
+    fp2_sqr(&B->x, &P.x);
+    fp2_sqr(&B->z, &P.z);  // B->z = P.z^2
+    fp2_sub(&B->x, &B->z, &B->x);  // B->x = P.z^2 - P.x^2 (assuming fp2_sub(out, a, b) computes a - b)
+    fp2_add(&kps->K.x, &P.x, &P.z);  // kps->K.x = P.x + P.z
+    fp2_sub(&kps->K.z, &P.x, &P.z);  // kps->K.z = P.x - P.z
+}
+
+void
+xisog_2_singular(ec_kps2_t *kps, ec_point_t *B24, ec_point_t A24)
+{
+    // No need to check the square root, only used for signing.
+    fp2_t t0, four;
+    fp2_set_small(&four, 4);
+    fp2_add(&t0, &A24.x, &A24.x);
+    fp2_sub(&t0, &t0, &A24.z);
+    fp2_add(&t0, &t0, &t0);  // t0 = 4*A24.x - 2*A24.z (assuming fp2_add/sub(out, a, b) compute out = a op b)
+    fp2_inv(&A24.z);  // A24 is passed by value, so only the local copy is modified
+    fp2_mul(&t0, &t0, &A24.z);
+    fp2_copy(&kps->K.x, &t0);
+    fp2_add(&B24->x, &t0, &t0);
+    fp2_sqr(&t0, &t0);
+    fp2_sub(&t0, &t0, &four);
+    fp2_sqrt(&t0);  // return value (if any) is deliberately ignored, see the note at the top
+    fp2_neg(&kps->K.z, &t0);
+    fp2_add(&B24->z, &t0, &t0);
+    fp2_add(&B24->x, &B24->x, &B24->z);
+    fp2_add(&B24->z, &B24->z, &B24->z);
+}
+
+// Degree-4 isogeny with kernel generated by P such that [2]P != (0 ,0)
+// Outputs the curve coefficient in the form A24=(A+2C:4C)
+void
+xisog_4(ec_kps4_t *kps, ec_point_t *B, const ec_point_t P)
+{
+    ec_point_t *K = kps->K;
+
+    fp2_sqr(&K[0].x, &P.x);
+    fp2_sqr(&K[0].z, &P.z);
+    fp2_add(&K[1].x, &K[0].z, &K[0].x);  // K[1].x is used as scratch here and overwritten below
+    fp2_sub(&K[1].z, &K[0].z, &K[0].x);
+    fp2_mul(&B->x, &K[1].x, &K[1].z);
+    fp2_sqr(&B->z, &K[0].z);
+
+    // Constants for xeval_4 (assuming fp2_add/sub(out, a, b) compute out = a op b)
+    fp2_add(&K[2].x, &P.x, &P.z);  // K[2].x = P.x + P.z
+    fp2_sub(&K[1].x, &P.x, &P.z);  // K[1].x = P.x - P.z
+    fp2_add(&K[0].x, &K[0].z, &K[0].z);
+    fp2_add(&K[0].x, &K[0].x, &K[0].x);  // K[0].x = 4 * P.z^2
+}