initial version of SQIsign

Co-authored-by: Jorge Chavez-Saab <jorgechavezsaab@gmail.com> Co-authored-by: Maria Corte-Real Santos <36373796+mariascrs@users.noreply.github.com> Co-authored-by: Luca De Feo <github@defeo.lu> Co-authored-by: Jonathan Komada Eriksen <jonathan.eriksen97@gmail.com> Co-authored-by: Basil Hess <bhe@zurich.ibm.com> Co-authored-by: Antonin Leroux <18654258+tonioecto@users.noreply.github.com> Co-authored-by: Patrick Longa <plonga@microsoft.com> Co-authored-by: Lorenz Panny <lorenz@yx7.cc> Co-authored-by: Francisco Rodríguez-Henríquez <francisco.rodriguez@tii.ae> Co-authored-by: Sina Schaeffler <108983332+syndrakon@users.noreply.github.com> Co-authored-by: Benjamin Wesolowski <19474926+Calodeon@users.noreply.github.com>
2023-06-01 00:00:00 +00:00
commit 28ff420dd0
285 changed files with 70301 additions and 0 deletions
--- a/src/gf/broadwell/lvl1/CMakeLists.txt
+++ b/src/gf/broadwell/lvl1/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+# Sources of the GF(p)/GF(p^2) arithmetic library for this variant (Broadwell directory).
+set(SOURCE_FILES_GF_${SVARIANT_UPPER}_BROADWELL
+    fp_asm.S fp.c fp2.c
+)
+
+add_library(${LIB_GF_${SVARIANT_UPPER}} ${SOURCE_FILES_GF_${SVARIANT_UPPER}_BROADWELL})
+# ${INC_COMMON} used to be listed twice; a single entry keeps the same search order.
+target_include_directories(${LIB_GF_${SVARIANT_UPPER}} PRIVATE common ${INC_COMMON} ${INC_PRECOMP_${SVARIANT_UPPER}} include ${PROJECT_SOURCE_DIR}/include)
+target_compile_options(${LIB_GF_${SVARIANT_UPPER}} PRIVATE ${C_OPT_FLAGS})
+
+# Test executables for this library are defined in test/.
+add_subdirectory(test)
--- a/src/gf/broadwell/lvl1/Makefile
+++ b/src/gf/broadwell/lvl1/Makefile
@@ -0,0 +1,46 @@
+
+# Stand-alone build: compiles the field arithmetic into lib/libtest.a and links
+# the two test programs (test_fp, test_fp2) against it.
+CC=gcc
+CFLAGS= -O3 -std=gnu11 -Wall -march=native -Wno-missing-braces -Wno-logical-not-parentheses 
+LDFLAGS=-lm
+AR=ar rcs
+RANLIB=ranlib
+
+OBJECTS=objs/fp_p1913.o objs/fp.o objs/fp2.o objs/fp_asm.o objs/random.o
+
+# These targets never name a file; declare them phony so a stray file with the
+# same name cannot shadow them. (lib is left out on purpose: it is a directory.)
+.PHONY: all tests check clean
+
+all: lib tests
+
+objs/fp_p1913.o: fp_p1913.c
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) fp_p1913.c -o objs/fp_p1913.o
+
+objs/fp.o: fp.c
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) fp.c -o objs/fp.o
+
+objs/fp2.o: fp2.c
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) fp2.c -o objs/fp2.o
+
+# Every object rule creates objs/ itself, so the rules also work with `make -j`
+# or when a single object is requested directly.
+objs/fp_asm.o: fp_asm.S
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) fp_asm.S -o objs/fp_asm.o
+
+objs/random.o: ../../../common/generic/randombytes_system.c
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) ../../../common/generic/randombytes_system.c -o objs/random.o
+
+# Rebuild the static archive from scratch out of all objects.
+lib: $(OBJECTS)
+	rm -rf lib
+	mkdir lib
+	$(AR) lib/libtest.a $^
+	$(RANLIB) lib/libtest.a
+
+tests: lib
+	$(CC) $(CFLAGS) -L./lib test/test_fp.c test/test_extras.c -ltest $(LDFLAGS) -o test_fp -lgmp
+	$(CC) $(CFLAGS) -L./lib test/test_fp2.c test/test_extras.c -ltest $(LDFLAGS) -o test_fp2 -lgmp
+
+check: tests
+
+clean:
+	rm -rf *.req objs lib test_fp*
+
--- a/src/gf/broadwell/lvl1/fp.c
+++ b/src/gf/broadwell/lvl1/fp.c
@@ -0,0 +1,192 @@
+#include "include/fp.h"
+
+// Field modulus p, least-significant 64-bit word first. Read by fp_neg below and by the
+// assembly routines in fp_asm.S (via [rip+p]), so it must keep external linkage.
+const uint64_t p[NWORDS_FIELD] =  { 0xffffffffffffffff, 0x252C9E49355147FF, 0x33A6A86587407437, 0x34E29E286B95D98C };
+// Multiplier used by fp_tomont to enter Montgomery form (R^2 mod p, per the fp_tomont comment).
+const uint64_t R2[NWORDS_FIELD] = { 0x233625AE400674D4, 0x20AFD6C1025A1C2E, 0x30A841AB0920655D, 0x0D72E7D67C30CD3D };
+// NOTE(review): not referenced in the code shown here; presumably the Montgomery
+// constant -p^(-1) mod 2^64 -- confirm against its users.
+const uint64_t pp[NWORDS_FIELD] = { 0x01, 0x00, 0x00, 0x00 };
+
+
+void fp_set(digit_t* x, const digit_t val)
+{ // Load the single-word value val into the field element x:
+  // word 0 receives val, every higher word is cleared.
+    unsigned int idx = NWORDS_FIELD;
+
+    while (idx > 1) {
+        idx--;
+        x[idx] = 0;
+    }
+    x[0] = val;
+}
+
+void fp_mont_setone(digit_t* out1) {
+    // Writes a fixed four-word constant into out1, least-significant word first.
+    // NOTE(review): going by the function name this is 1 in Montgomery form (R mod p) -- confirm.
+    static const digit_t one_words[4] = {
+        0x4,
+        UINT64_C(0x6b4d86db2abae000),
+        UINT64_C(0x31655e69e2fe2f23),
+        UINT64_C(0x2c75875e51a899cf)
+    };
+
+    for (unsigned int i = 0; i < 4; i++) {
+        out1[i] = one_words[i];
+    }
+}
+
+bool fp_is_equal(const digit_t* a, const digit_t* b)
+{ // Word-wise equality test of two field elements.
+  // All NWORDS_FIELD words are always inspected (no early exit); the XOR differences
+  // are OR-ed together and the result is tested once.
+  // Returns 1 (true) if a=b, 0 (false) otherwise
+    digit_t diff = 0;
+    unsigned int i = 0;
+
+    while (i < NWORDS_FIELD) {
+        diff |= a[i] ^ b[i];
+        i++;
+    }
+
+    return (bool)is_digit_zero_ct(diff);
+}
+
+bool fp_is_zero(const digit_t* a)
+{ // Tests whether every word of the field element a is zero.
+  // All NWORDS_FIELD words are OR-ed together without an early exit.
+  // Returns 1 (true) if a=0, 0 (false) otherwise
+    digit_t acc = 0;
+    unsigned int i = 0;
+
+    while (i < NWORDS_FIELD) {
+        acc |= a[i];
+        i++;
+    }
+
+    return (bool)is_digit_zero_ct(acc);
+}
+
+void fp_copy(digit_t* out, const digit_t* a)
+{ // Copy the field element a into out (NWORDS_FIELD words of RADIX bits each).
+    const size_t nbytes = NWORDS_FIELD*RADIX/8;
+
+    memcpy(out, a, nbytes);
+}
+
+void fp_neg(digit_t* out, const digit_t* a)
+{ // Modular negation, out = -a mod p
+  // Input: a in [0, p-1]
+  // Output: out in [0, p-1]
+  // First forms p - a word by word, then calls fp_sub to subtract p once more,
+  // which maps the a = 0 case (p - 0 = p) back to 0.
+    const digit_t* modulus = (const digit_t*)p;
+    unsigned int borrow = 0;
+
+    for (unsigned int i = 0; i < NWORDS_FIELD; i++) {
+        SUBC(out[i], borrow, modulus[i], a[i], borrow);
+    }
+    fp_sub(out, out, modulus);
+}
+
+void fp_tomont(digit_t* out, const digit_t* a)
+{ // Conversion to Montgomery representation:
+  // multiplies a by the constant R2, i.e. out = a*R^2*R^(-1) mod p = a*R mod p, where a in [0, p-1].
+    const digit_t* r_squared = (const digit_t*)R2;
+
+    fp_mul(out, a, r_squared);
+}
+
+void fp_frommont(digit_t* out, const digit_t* a)
+{ // Conversion from Montgomery representation to standard representation:
+  // multiplies a by the plain integer 1, i.e. out = a*R^(-1) mod p, where a in [0, p-1].
+    const digit_t unit[NWORDS_FIELD] = {1};    // word 0 is 1, remaining words are zero
+
+    fp_mul(out, a, unit);
+}
+
+void MUL(digit_t* out, const digit_t a, const digit_t b)
+{ // Digit multiplication, digit*digit -> 2-digit result, built from four half-digit products.
+  // Inputs: a, b in [0, 2^w-1], where w is the computer wordsize
+  // Output: out[0] holds the low digit and out[1] the high digit of a*b
+    const digit_t lo_mask = (digit_t)(-1) >> (sizeof(digit_t)*4);
+    const digit_t hi_mask = (digit_t)(-1) << (sizeof(digit_t)*4);
+
+    // Split both operands into half-digits.
+    const digit_t a_lo = a & lo_mask;
+    const digit_t a_hi = a >> (sizeof(digit_t)*4);
+    const digit_t b_lo = b & lo_mask;
+    const digit_t b_hi = b >> (sizeof(digit_t)*4);
+
+    // The four partial products.
+    const digit_t lo_lo = a_lo * b_lo;
+    const digit_t lo_hi = a_lo * b_hi;
+    const digit_t hi_lo = a_hi * b_lo;
+    const digit_t hi_hi = a_hi * b_hi;
+
+    // Second quarter: upper half of lo_lo plus the lower halves of both cross terms.
+    const digit_t mid = (lo_lo >> (sizeof(digit_t)*4)) + (hi_lo & lo_mask) + (lo_hi & lo_mask);
+    out[0] = (lo_lo & lo_mask) ^ (mid << (sizeof(digit_t)*4));
+
+    // Third quarter: upper halves of the cross terms, lower half of hi_hi, and the carry out of mid.
+    const digit_t top = (hi_lo >> (sizeof(digit_t)*4)) + (lo_hi >> (sizeof(digit_t)*4))
+                      + (hi_hi & lo_mask) + (mid >> (sizeof(digit_t)*4));
+
+    // Fourth quarter: upper half of hi_hi plus whatever overflowed out of the third quarter.
+    out[1] = (top & lo_mask) ^ ((hi_hi & hi_mask) + (top & hi_mask));
+}
+
+digit_t mp_shiftr(digit_t* x, const unsigned int shift, const unsigned int nwords)
+{ // Multiprecision right shift of the nwords-word integer x by shift bits, in place.
+  // Returns the least-significant bit x had before shifting.
+  // SHIFTR also shifts left by RADIX - shift, so shift is expected to lie in [1, RADIX-1].
+    const digit_t lsb = x[0] & 1;
+    const unsigned int top = nwords-1;
+    unsigned int i = 0;
+
+    while (i < top) {
+        SHIFTR(x[i+1], x[i], shift, x[i], RADIX);
+        i++;
+    }
+    x[top] >>= shift;
+    return lsb;
+}
+
+void mp_shiftl(digit_t* x, const unsigned int shift, const unsigned int nwords)
+{ // Multiprecision left shift of the nwords-word integer x by shift bits, in place.
+  // Words are processed from the most significant one downwards so that each word
+  // still sees the unshifted value of its lower neighbour.
+    unsigned int i = nwords;
+
+    while (i > 1) {
+        i--;
+        SHIFTL(x[i], x[i-1], shift, x[i], RADIX);
+    }
+    x[0] <<= shift;
+}
+
+static void fp_exp3div4(digit_t* out, const digit_t* a)
+{ // Fixed exponentiation out = a^((p-3)/4) mod p
+  // Input: a in [0, p-1]
+  // Output: out in [0, p-1]
+  // Requirement: p = 3(mod 4)
+  // Right-to-left square-and-multiply: the exponent is p shifted right by two bits
+  // and is consumed one bit at a time from the least-significant end.
+    fp_t exponent, base;
+
+    fp_copy(exponent, (const digit_t*)p);
+    fp_copy(base, a);
+    mp_shiftr(exponent, 1, NWORDS_FIELD);
+    mp_shiftr(exponent, 1, NWORDS_FIELD);
+
+    // Accumulator starts at 1 in Montgomery form.
+    fp_set(out, 1);
+    fp_tomont(out, out);
+
+    for (int step = 0; step < NWORDS_FIELD*RADIX-2; step++) {
+        // mp_shiftr hands back the bit it is about to shift out.
+        const digit_t low_bit = mp_shiftr(exponent, 1, NWORDS_FIELD);
+
+        if (low_bit == 1) {
+            fp_mul(out, out, base);
+        }
+        fp_sqr(base, base);
+    }
+}
+
+void fp_inv(digit_t* a)
+{ // In-place modular inversion computed as a^(p-2).
+  // Input: a=xR in [0, p-1] (Montgomery form)
+  // Output: a = x^-1*R mod p in [0, p-1], where R = 2^(w*nwords), w is the computer wordsize
+  //         and nwords is the number of words to represent p. The result is 0 if the input has no inverse.
+  // Requirement: Ceiling(Log(p)) < w*nwords
+    fp_t pw;
+
+    fp_exp3div4(pw, a);     // a^((p-3)/4)
+    fp_sqr(pw, pw);         // a^((p-3)/2)
+    fp_sqr(pw, pw);         // a^(p-3)
+    fp_mul(a, pw, a);       // a^(p-2)
+}
+
+bool fp_is_square(const digit_t* a)
+{ // Is field element a square?
+  // Computes a^((p-1)/2), converts it out of Montgomery form and compares it with the plain integer 1.
+  // Output: 1 (true) if that power equals 1, 0 (false) otherwise
+    fp_t euler, plain_one;
+
+    fp_set(plain_one, 1);
+
+    fp_exp3div4(euler, a);          // a^((p-3)/4)
+    fp_sqr(euler, euler);           // a^((p-3)/2)
+    fp_mul(euler, euler, a);        // a^((p-1)/2)
+    fp_frommont(euler, euler);
+
+    return fp_is_equal(euler, plain_one);
+}
+
+void fp_sqrt(digit_t* a)
+{ // In-place square root candidate, a <- a^((p+1)/4) mod p.
+  // No check is made here that the input actually is a square.
+    fp_t root;
+
+    fp_exp3div4(root, a);       // a^((p-3)/4)
+    fp_mul(a, root, a);         // a^((p+1)/4)
+}
--- a/src/gf/broadwell/lvl1/fp2.c
+++ b/src/gf/broadwell/lvl1/fp2.c
@@ -0,0 +1,190 @@
+#include <fp2.h>
+
+// NOTE(review): declared here but not referenced in the part of this file shown -- confirm it is still needed.
+extern const digit_t R[NWORDS_FIELD];
+
+// Assembly kernels from fp_asm.S computing one component of a GF(p^2) square.
+// fp2_sq_c0 takes a full fp2_t as output because the routine also stores its two
+// intermediate operands there (it writes 64 bytes at out); only the first fp_t is the result.
+extern void fp2_sq_c0(fp2_t *out, const fp2_t *in);
+extern void fp2_sq_c1(fp_t *out, const fp2_t *in);
+
+// Assembly kernels from fp_asm.S computing one component of a GF(p^2) product.
+extern void fp2_mul_c0(fp_t *out, const fp2_t *in0, const fp2_t *in1);
+extern void fp2_mul_c1(fp_t *out, const fp2_t *in0, const fp2_t *in1);
+
+/* Arithmetic modulo X^2 + 1 */
+
+void fp2_set(fp2_t* x, const digit_t val)
+{ // x <- val + 0*i: the imaginary part is cleared, the real part holds the single-word value val.
+    fp_set(x->im, 0);
+    fp_set(x->re, val);
+}
+
+bool fp2_is_zero(const fp2_t* a)
+{ // Is a GF(p^2) element zero?
+  // Both components are always tested; the results are combined with a bitwise AND.
+  // Returns 1 (true) if a=0, 0 (false) otherwise
+    const bool re_zero = fp_is_zero(a->re);
+    const bool im_zero = fp_is_zero(a->im);
+
+    return re_zero & im_zero;
+}
+
+bool fp2_is_equal(const fp2_t* a, const fp2_t* b)
+{ // Compare two GF(p^2) elements component by component.
+  // Both comparisons are always carried out and combined with a bitwise AND.
+  // Returns 1 (true) if a=b, 0 (false) otherwise
+    const bool re_same = fp_is_equal(a->re, b->re);
+    const bool im_same = fp_is_equal(a->im, b->im);
+
+    return re_same & im_same;
+}
+
+void fp2_copy(fp2_t* x, const fp2_t* y)
+{ // x <- y, copying the real and the imaginary component separately.
+    fp_copy(x->im, y->im);
+    fp_copy(x->re, y->re);
+}
+
+fp2_t fp2_non_residue(void)
+{ // Returns the element 2 + i in Montgomery form.
+  // 2 + i is a quadratic non-residue for p1913
+  // The parameter list is spelled (void) so the definition is a real prototype.
+    fp_t one = {0};
+    fp2_t res;
+
+    one[0] = 1;
+    fp_tomont(one, one);            // one <- 1 in Montgomery form
+    fp_add(res.re, one, one);       // real part: 1 + 1 = 2
+    fp_copy(res.im, one);           // imaginary part: 1
+    return res;
+}
+
+void fp2_add(fp2_t* x, const fp2_t* y, const fp2_t* z)
+{ // x <- y + z, component-wise addition in GF(p).
+    fp_add(x->im, y->im, z->im);
+    fp_add(x->re, y->re, z->re);
+}
+
+void fp2_sub(fp2_t* x, const fp2_t* y, const fp2_t* z)
+{ // x <- y - z, component-wise subtraction in GF(p).
+    fp_sub(x->im, y->im, z->im);
+    fp_sub(x->re, y->re, z->re);
+}
+
+void fp2_neg(fp2_t* x, const fp2_t* y)
+{ // x <- -y, negating both components in GF(p).
+    fp_neg(x->im, y->im);
+    fp_neg(x->re, y->re);
+}
+
+void fp2_mul(fp2_t* x, const fp2_t* y, const fp2_t* z)
+{ // x <- y * z in GF(p^2), arithmetic modulo X^2 + 1.
+  // The real part is first computed into a temporary and stored into x only at the end,
+  // after the imaginary part has been computed from the unmodified inputs; the assembly
+  // routine for the real part also uses its output buffer as scratch space.
+    fp_t t;
+
+    fp2_mul_c0(&t, y, z);              // c0 = a0*b0 - a1*b1
+    fp2_mul_c1(&x->im, y, z);          // c1 = a0*b1 + a1*b0
+    fp_copy(x->re, t);                 // replaces the hand-unrolled four-word copy
+}
+
+void fp2_sqr(fp2_t* x, const fp2_t* y) {
+    // x <- y^2 in GF(p^2), arithmetic modulo X^2 + 1.
+    // fp2_sq_c0 writes a whole fp2_t (it keeps intermediate operands in the output buffer),
+    // so it gets a temporary; only t.re is the result. x->re is stored last, after the
+    // imaginary part has been computed from the unmodified input.
+    fp2_t t;
+
+    fp2_sq_c0(&t, y);               // c0 = (a0+a1)(a0-a1)
+    fp2_sq_c1(&x->im, y);           // c1 = 2a0*a1
+    fp_copy(x->re, t.re);           // replaces the hand-unrolled four-word copy
+}
+
+void fp2_inv(fp2_t* x)
+{ // In-place inversion in GF(p^2): x^-1 = (re - im*i) / (re^2 + im^2).
+    fp_t norm, im_sq;
+
+    // norm <- 1 / (re^2 + im^2)
+    fp_sqr(norm, x->re);
+    fp_sqr(im_sq, x->im);
+    fp_add(norm, norm, im_sq);
+    fp_inv(norm);
+
+    // Scale the conjugate by the inverted norm.
+    fp_mul(x->re, x->re, norm);
+    fp_mul(x->im, x->im, norm);
+    fp_neg(x->im, x->im);
+}
+
+bool fp2_is_square(const fp2_t* x)
+{ // Squareness test in GF(p^2): forms re^2 + im^2 and returns fp_is_square of that GF(p) value.
+    fp_t norm, im_sq;
+
+    fp_sqr(norm, x->re);
+    fp_sqr(im_sq, x->im);
+    fp_add(norm, norm, im_sq);
+
+    return fp_is_square(norm);
+}
+
+void fp2_frob(fp2_t* x, const fp2_t* y)
+{ // x <- conjugate of y: the real part is copied and the imaginary part negated.
+    fp_copy(x->re, y->re);
+    fp_neg(x->im, y->im);
+}
+
+void fp2_tomont(fp2_t* x, const fp2_t* y)
+{ // x <- y converted to Montgomery representation, component by component.
+    fp_tomont(x->im, y->im);
+    fp_tomont(x->re, y->re);
+}
+
+void fp2_frommont(fp2_t* x, const fp2_t* y)
+{ // x <- y converted out of Montgomery representation, component by component.
+    fp_frommont(x->im, y->im);
+    fp_frommont(x->re, y->re);
+}
+
+// NOTE: old implementation that branches on the input value (not constant time). Could be optimized
+void fp2_sqrt(fp2_t* x)
+{ // In-place square root of x = re + im*i.
+  // No check is made here that x actually is a square.
+    fp_t norm_root, half, root_re, root_im, im_sq;
+
+    if (fp_is_zero(x->im)) {
+        // Purely real input: the root is either real (re is a square in GF(p))
+        // or purely imaginary (sqrt(-re) * i).
+        if (fp_is_square(x->re)) {
+            fp_sqrt(x->re);
+        } else {
+            fp_neg(x->im, x->re);
+            fp_sqrt(x->im);
+            fp_set(x->re, 0);
+        }
+        return;
+    }
+
+    // norm_root = sqrt(re^2 + im^2)
+    fp_sqr(norm_root, x->re);
+    fp_sqr(im_sq, x->im);
+    fp_add(norm_root, norm_root, im_sq);
+    fp_sqrt(norm_root);
+
+    // half = 1/2: load 2, convert to Montgomery form, invert
+    fp_set(half, 2);
+    fp_tomont(half, half);
+    fp_inv(half);
+
+    // Square of the real part of the root: (re + norm_root)/2 if that is a square,
+    // otherwise (re - norm_root)/2.
+    fp_add(root_re, x->re, norm_root);
+    fp_mul(root_re, root_re, half);
+    if (!fp_is_square(root_re)) {
+        fp_sub(root_re, x->re, norm_root);
+        fp_mul(root_re, root_re, half);
+    }
+    fp_sqrt(root_re);
+
+    // Imaginary part of the root: im / (2 * root_re)
+    fp_copy(root_im, root_re);
+    fp_inv(root_im);
+    fp_mul(root_im, root_im, half);
+    fp_mul(x->im, root_im, x->im);
+    fp_copy(x->re, root_re);
+}
+
+// Lexicographic comparison of two GF(p^2) elements. Returns +1 if x > y, -1 if x < y, 0 if x = y.
+// Both inputs are first converted out of Montgomery form; real parts are compared first,
+// then imaginary parts, each from the most significant word down.
+int fp2_cmp(fp2_t* x, fp2_t* y){
+    fp2_t xs, ys;
+
+    fp2_frommont(&xs, x);
+    fp2_frommont(&ys, y);
+
+    const digit_t* lhs[2] = { xs.re, xs.im };
+    const digit_t* rhs[2] = { ys.re, ys.im };
+
+    for(int part = 0; part < 2; part++){
+        for(int w = NWORDS_FIELD-1; w >= 0; w--){
+            if(lhs[part][w] != rhs[part][w])
+                return (lhs[part][w] > rhs[part][w]) ? 1 : -1;
+        }
+    }
+    return 0;
+}
--- a/src/gf/broadwell/lvl1/fp_asm.S
+++ b/src/gf/broadwell/lvl1/fp_asm.S
@@ -0,0 +1,555 @@
+.intel_syntax noprefix
+
+// Size of a field element: 32 bytes = 4 limbs of 64 bits.
+.set pbytes,32
+.set plimbs,4
+
+// p + 1, least-significant limb first. The lowest limb is zero, which is why the
+// reduction steps below only multiply by the three limbs starting at p_plus_1+8.
+.global p_plus_1
+p_plus_1: .quad 0x0000000000000000, 0x252C9E4935514800, 0x33A6A86587407437, 0x34E29E286B95D98C
+
+.text
+.p2align 4,,15
+
+//***********************************************************************
+//  Field addition in GF(p)
+//  Operation: c [rdi] = a [rsi] + b [rdx] mod p
+//  Computes a + b - p, then adds p back under a mask if the subtraction borrowed.
+//***********************************************************************
+.global fp_add
+fp_add:
+  push   r12  
+  xor    rax, rax
+  mov    r8, [rsi]
+  mov    r9, [rsi+8]
+  mov    r10, [rsi+16]
+  mov    r11, [rsi+24]
+  add    r8, [rdx] 
+  adc    r9, [rdx+8] 
+  adc    r10, [rdx+16] 
+  adc    r11, [rdx+24] 
+  mov    r12, [rip+p]
+  sub    r8, r12
+  mov    rcx, [rip+p+8]
+  sbb    r9, rcx
+  mov    rsi, [rip+p+16]
+  sbb    r10, rsi
+  mov    rdx, [rip+p+24]
+  sbb    r11, rdx
+  sbb    rax, 0                  // rax <- all ones if a + b - p borrowed, else 0
+  
+  and    r12, rax
+  and    rcx, rax
+  and    rsi, rax
+  and    rdx, rax
+  
+  add    r8, r12  
+  adc    r9, rcx  
+  adc    r10, rsi  
+  adc    r11, rdx 
+  mov    [rdi], r8
+  mov    [rdi+8], r9 
+  mov    [rdi+16], r10 
+  mov    [rdi+24], r11
+  pop    r12
+  ret
+
+//***********************************************************************
+//  Field subtraction in GF(p)
+//  Operation: c [rdi] = a [rsi] - b [rdx] mod p
+//  Computes a - b, then adds p under a mask if the subtraction borrowed.
+//***********************************************************************
+.global fp_sub
+fp_sub:
+  push   r12  
+  xor    rax, rax
+  mov    r8, [rsi]
+  mov    r9, [rsi+8]
+  mov    r10, [rsi+16]
+  mov    r11, [rsi+24]
+  sub    r8, [rdx] 
+  sbb    r9, [rdx+8] 
+  sbb    r10, [rdx+16] 
+  sbb    r11, [rdx+24]
+  sbb    rax, 0                  // rax <- all ones if a - b borrowed, else 0
+  
+  mov    r12, [rip+p]
+  mov    rcx, [rip+p+8]
+  mov    rsi, [rip+p+16]
+  mov    rdx, [rip+p+24]
+  and    r12, rax
+  and    rcx, rax
+  and    rsi, rax
+  and    rdx, rax  
+  add    r8, r12  
+  adc    r9, rcx 
+  adc    r10, rsi  
+  adc    r11, rdx 
+  mov    [rdi], r8
+  mov    [rdi+8], r9 
+  mov    [rdi+16], r10 
+  mov    [rdi+24], r11 
+  pop    r12
+  ret
+  
+///////////////////////////////////////////////////////////////// MACROS
+// z = a x bi + z
+// Inputs: base memory pointer M1 (a),
+//         bi pre-stored in rdx,
+//         accumulator z in [Z0:Z4]
+//         C: register that is zeroed by the xor below (the xor also clears CF and OF
+//            for the adcx/adox chains). Callers pass either rax, or the same register
+//            as Z4 when the top limb has to start from zero.
+// Output: [Z0:Z4]
+// Temps:  regs T0:T1
+/////////////////////////////////////////////////////////////////
+.macro MULADD64x256 M1, Z0, Z1, Z2, Z3, Z4, T0, T1, C
+    mulx   \T0, \T1, \M1     // A0*B0
+    xor    \C, \C
+    adox   \Z0, \T1
+    adox   \Z1, \T0  
+    mulx   \T0, \T1, 8\M1    // A0*B1
+    adcx   \Z1, \T1
+    adox   \Z2, \T0    
+    mulx   \T0, \T1, 16\M1   // A0*B2
+    adcx   \Z2, \T1
+    adox   \Z3, \T0
+    mulx   \T0, \T1, 24\M1   // A0*B3          
+    adcx   \Z3, \T1
+    adox   \Z4, \T0
+    adc    \Z4, 0   
+.endm
+
+// z = a x bi + z, with a three-limb operand a
+// Inputs: base memory pointer M1 (a, three limbs),
+//         bi pre-stored in rdx,
+//         accumulator z in [Z0:Z3]
+// Output: [Z0:Z3]
+// Temps:  regs T0:T1
+// Side effect: rax is always zeroed (the callers' final corrections rely on rax being 0).
+.macro MULADD64x192 M1, Z0, Z1, Z2, Z3, T0, T1
+    mulx   \T0, \T1, \M1     // A0*B0
+    xor    rax, rax
+    adox   \Z0, \T1
+    adox   \Z1, \T0  
+    mulx   \T0, \T1, 8\M1    // A0*B1
+    adcx   \Z1, \T1
+    adox   \Z2, \T0    
+    mulx   \T0, \T1, 16\M1   // A0*B2
+    adcx   \Z2, \T1
+    adox   \Z3, \T0
+    adc    \Z3, 0   
+.endm
+  
+//***********************************************************************
+//  Multiplication in GF(p^2), non-complex part
+//  Operation: c [rdi] = a0 x b0 - a1 x b1
+//  Inputs: a = [a1, a0] stored in [rsi] 
+//          b = [b1, b0] stored in [rdx] 
+//  Output: c stored in [rdi]
+//  Note: [rdi] is first overwritten with p - b1 and read back while a and b are
+//        still being consumed, so the output buffer must not overlap a or b.
+//***********************************************************************
+.global fp2_mul_c0
+fp2_mul_c0:    
+    push   r12 
+    push   r13 
+    push   r14   
+    mov    rcx, rdx
+	
+	// [rdi0:3] <- p - b1
+	mov    r8, [rip+p]  
+	mov    r9, [rip+p+8]   
+	mov    r10, [rip+p+16]
+	mov    r11, [rip+p+24] 
+	mov    rax, [rcx+32]
+	mov    rdx, [rcx+40]        
+	sub    r8, rax
+	sbb    r9, rdx
+	mov    rax, [rcx+48]
+	mov    rdx, [rcx+56]
+	sbb    r10, rax
+	sbb    r11, rdx
+	mov    [rdi], r8
+	mov    [rdi+8], r9
+	mov    [rdi+16], r10
+	mov    [rdi+24], r11
+    
+    // [r8:r12] <- z = a0 x b00 - a1 x b10
+    mov    rdx, [rcx]
+    mulx   r9, r8, [rsi]         
+    xor    rax, rax
+    mulx   r10, r11, [rsi+8]
+    adox   r9, r11        
+    mulx   r11, r12, [rsi+16] 
+    adox   r10, r12        
+    mulx   r12, r13, [rsi+24]
+    adox   r11, r13  
+    adox   r12, rax
+           
+    mov    rdx, [rdi]    
+    MULADD64x256 [rsi+32], r8, r9, r10, r11, r12, r13, r14, rax
+    // [r9:r12] <- z = (z0 x p_plus_1 + z)/2^64
+    mov    rdx, r8                 // rdx <- z0 
+    MULADD64x192 [rip+p_plus_1+8], r9, r10, r11, r12, r13, r14
+    
+    // [r9:r12, r8] <- z = a0 x b01 - a1 x b11 + z 
+    mov    rdx, [rcx+8]
+    MULADD64x256 [rsi], r9, r10, r11, r12, r8, r13, r14, r8
+    mov    rdx, [rdi+8]    
+    MULADD64x256 [rsi+32], r9, r10, r11, r12, r8, r13, r14, rax
+    // [r10:r12, r8] <- z = (z0 x p_plus_1 + z)/2^64
+    mov    rdx, r9                 // rdx <- z0 
+    MULADD64x192 [rip+p_plus_1+8], r10, r11, r12, r8, r13, r14
+    
+    // [r10:r12, r8:r9] <- z = a0 x b02 - a1 x b12 + z 
+    mov    rdx, [rcx+16]
+    MULADD64x256 [rsi], r10, r11, r12, r8, r9, r13, r14, r9
+    mov    rdx, [rdi+16]    
+    MULADD64x256 [rsi+32], r10, r11, r12, r8, r9, r13, r14, rax
+    // [r11:r12, r8:r9] <- z = (z0 x p_plus_1 + z)/2^64
+    mov    rdx, r10                // rdx <- z0 
+    MULADD64x192 [rip+p_plus_1+8], r11, r12, r8, r9, r13, r14
+    
+    // [r11:r12, r8:r10] <- z = a0 x b03 - a1 x b13 + z 
+    mov    rdx, [rcx+24]
+    MULADD64x256 [rsi], r11, r12, r8, r9, r10, r13, r14, r10
+    mov    rdx, [rdi+24]    
+    MULADD64x256 [rsi+32], r11, r12, r8, r9, r10, r13, r14, rax
+    // [r12, r8:r10] <- z = (z0 x p_plus_1 + z)/2^64
+    mov    rdx, r11                // rdx <- z0 
+    MULADD64x192 [rip+p_plus_1+8], r12, r8, r9, r10, r13, r14
+
+	// Final correction: subtract p, then add it back under a mask if that borrowed                        
+	mov    rsi, [rip+p]
+	mov    rcx, [rip+p+8]
+	mov    rdx, [rip+p+16]
+	mov    r11, [rip+p+24]
+	sub    r12, rsi
+	sbb    r8, rcx
+	sbb    r9, rdx
+	sbb    r10, r11
+	sbb    rax, 0
+	and    rsi, rax
+	and    rcx, rax
+	and    rdx, rax
+	and    r11, rax
+	add    r12, rsi
+	adc    r8, rcx
+	adc    r9, rdx
+	adc    r10, r11
+    
+    mov    [rdi], r12          
+    mov    [rdi+8], r8         
+    mov    [rdi+16], r9         
+    mov    [rdi+24], r10 
+    pop    r14
+    pop    r13
+    pop    r12
+    ret
+  
+//***********************************************************************
+//  Multiplication in GF(p^2), complex part
+//  Operation: c [rdi] = a0 x b1 + a1 x b0
+//  Inputs: a = [a1, a0] stored in [rsi] 
+//          b = [b1, b0] stored in [rdx] 
+//  Output: c stored in [rdi]
+//  Note: [rdi] is written only after all inputs have been read.
+//***********************************************************************
+.global fp2_mul_c1
+fp2_mul_c1:    
+    push   r12 
+    push   r13 
+    push   r14   
+    mov    rcx, rdx
+    
+    // [r8:r12] <- z = a0 x b10 + a1 x b00
+    mov    rdx, [rcx+32]
+    mulx   r9, r8, [rsi]         
+    xor    rax, rax
+    mulx   r10, r11, [rsi+8]
+    adox   r9, r11        
+    mulx   r11, r12, [rsi+16] 
+    adox   r10, r12        
+    mulx   r12, r13, [rsi+24]
+    adox   r11, r13  
+    adox   r12, rax
+           
+    mov    rdx, [rcx]    
+    MULADD64x256 [rsi+32], r8, r9, r10, r11, r12, r13, r14, rax
+    // [r9:r12] <- z = (z0 x p_plus_1 + z)/2^64
+    mov    rdx, r8                 // rdx <- z0 
+    MULADD64x192 [rip+p_plus_1+8], r9, r10, r11, r12, r13, r14
+    
+    // [r9:r12, r8] <- z = a0 x b11 + a1 x b01 + z 
+    mov    rdx, [rcx+40]
+    MULADD64x256 [rsi], r9, r10, r11, r12, r8, r13, r14, r8
+    mov    rdx, [rcx+8]    
+    MULADD64x256 [rsi+32], r9, r10, r11, r12, r8, r13, r14, rax
+    // [r10:r12, r8] <- z = (z0 x p_plus_1 + z)/2^64
+    mov    rdx, r9                 // rdx <- z0 
+    MULADD64x192 [rip+p_plus_1+8], r10, r11, r12, r8, r13, r14
+    
+    // [r10:r12, r8:r9] <- z = a0 x b12 + a1 x b02 + z 
+    mov    rdx, [rcx+48]
+    MULADD64x256 [rsi], r10, r11, r12, r8, r9, r13, r14, r9
+    mov    rdx, [rcx+16]    
+    MULADD64x256 [rsi+32], r10, r11, r12, r8, r9, r13, r14, rax
+    // [r11:r12, r8:r9] <- z = (z0 x p_plus_1 + z)/2^64
+    mov    rdx, r10                // rdx <- z0 
+    MULADD64x192 [rip+p_plus_1+8], r11, r12, r8, r9, r13, r14
+    
+    // [r11:r12, r8:r10] <- z = a0 x b13 + a1 x b03 + z 
+    mov    rdx, [rcx+56]
+    MULADD64x256 [rsi], r11, r12, r8, r9, r10, r13, r14, r10
+    mov    rdx, [rcx+24]    
+    MULADD64x256 [rsi+32], r11, r12, r8, r9, r10, r13, r14, rax
+    // [r12, r8:r10] <- z = (z0 x p_plus_1 + z)/2^64
+    mov    rdx, r11                // rdx <- z0 
+    MULADD64x192 [rip+p_plus_1+8], r12, r8, r9, r10, r13, r14
+
+	// Final correction: subtract p, then add it back under a mask if that borrowed                        
+	mov    rsi, [rip+p]
+	mov    rcx, [rip+p+8]
+	mov    rdx, [rip+p+16]
+	mov    r11, [rip+p+24]
+	sub    r12, rsi
+	sbb    r8, rcx
+	sbb    r9, rdx
+	sbb    r10, r11
+	sbb    rax, 0
+	and    rsi, rax
+	and    rcx, rax
+	and    rdx, rax
+	and    r11, rax
+	add    r12, rsi
+	adc    r8, rcx
+	adc    r9, rdx
+	adc    r10, r11
+    
+    mov    [rdi], r12          
+    mov    [rdi+8], r8         
+    mov    [rdi+16], r9         
+    mov    [rdi+24], r10 
+    pop    r14
+    pop    r13
+    pop    r12
+    ret
+ 
+///////////////////////////////////////////////////////////////// MACRO
+// z = a x b (mod p)
+// Alternates one reduction step (add z0 x p_plus_1, drop the lowest limb) with one
+// multiply-accumulate step for each of the remaining three limbs of the M0 operand.
+// Inputs: base memory pointers M0 (a), M1 (b)
+//         accumulator z in [Z0:Z4], pre-loaded by the caller with (limb 0 of M0) x M1
+//         Limb 0 of M0 is never read here; only 8\M0, 16\M0 and 24\M0 are.
+// Output: [Z4, Z0:Z2] (rotated register order), before the final correction
+// Temps:  regs T0:T1, rdx; rax is zeroed by MULADD64x192
+/////////////////////////////////////////////////////////////////
+.macro FPMUL256x256 M0, M1, Z0, Z1, Z2, Z3, Z4, T0, T1           
+    // [Z1:Z4] <- z = (z0 x p_plus_1 + z)/2^64
+    mov    rdx, \Z0                 // rdx <- z0
+    MULADD64x192 [rip+p_plus_1+8], \Z1, \Z2, \Z3, \Z4, \T0, \T1
+    
+    // [Z1:Z4, Z0] <- z = a01 x a1 + z 
+    mov    rdx, 8\M0
+    MULADD64x256 \M1, \Z1, \Z2, \Z3, \Z4, \Z0, \T0, \T1, \Z0
+    // [Z2:Z4, Z0] <- z = (z0 x p_plus_1 + z)/2^64
+    mov    rdx, \Z1                 // rdx <- z0
+    MULADD64x192 [rip+p_plus_1+8], \Z2, \Z3, \Z4, \Z0, \T0, \T1
+    
+    // [Z2:Z4, Z0:Z1] <- z = a02 x a1 + z  
+    mov    rdx, 16\M0
+    MULADD64x256 \M1, \Z2, \Z3, \Z4, \Z0, \Z1, \T0, \T1, \Z1
+    // [Z3:Z4, Z0:Z1] <- z = (z0 x p_plus_1 + z)/2^64
+    mov    rdx, \Z2                // rdx <- z0
+    MULADD64x192 [rip+p_plus_1+8], \Z3, \Z4, \Z0, \Z1, \T0, \T1
+    
+    // [Z3:Z4, Z0:Z2] <- z = a03 x a1 + z
+    mov    rdx, 24\M0
+    MULADD64x256 \M1, \Z3, \Z4, \Z0, \Z1, \Z2, \T0, \T1, \Z2
+    // [Z4, Z0:Z2] <- z = (z0 x p_plus_1 + z)/2^64
+    mov    rdx, \Z3                // rdx <- z0
+    MULADD64x192 [rip+p_plus_1+8], \Z4, \Z0, \Z1, \Z2, \T0, \T1
+.endm
+
+//***********************************************************************
+//  Squaring in GF(p^2), non-complex part
+//  Operation: c [rdi] = (a0+a1) x (a0-a1)
+//  Inputs: a = [a1, a0] stored in [rsi] 
+//  Output: c stored in [rdi]
+//  Note: the output buffer must be 64 bytes long. a0+a1 is stored at [rdi..rdi+31]
+//        and a0-a1+p at [rdi+32..rdi+63] as operands of the multiplication; the
+//        32-byte result finally overwrites [rdi..rdi+31].
+//***********************************************************************
+.global fp2_sq_c0
+fp2_sq_c0:   
+    push   r12 
+    push   r13
+
+	// a0 + a1
+	mov    rdx, [rsi]
+	mov    r9, [rsi+8]
+	mov    r10, [rsi+16]
+	mov    r11, [rsi+24]
+	add    rdx, [rsi+32]
+	adc    r9, [rsi+40]
+	adc    r10, [rsi+48]
+	adc    r11, [rsi+56]
+	mov    [rdi], rdx
+	mov    [rdi+8], r9
+	mov    [rdi+16], r10
+	mov    [rdi+24], r11
+	
+	// a0 - a1 + p
+	mov    r8, [rsi]
+	mov    r10, [rsi+8]
+	mov    r12, [rsi+16]
+	mov    r13, [rsi+24]
+	sub    r8, [rsi+32]
+	sbb    r10, [rsi+40]
+	sbb    r12, [rsi+48] 
+	sbb    r13, [rsi+56]
+	add    r8, [rip+p]                    
+	adc    r10, [rip+p+8]
+	adc    r12, [rip+p+16]
+	adc    r13, [rip+p+24]
+	mov    [rdi+32], r8               
+	mov    [rdi+40], r10 
+	mov    [rdi+48], r12 
+	mov    [rdi+56], r13 
+    
+    // [r8:r12] <- z = (limb 0 of a0+a1, still in rdx) x (a0-a1+p, still in r8,r10,r12,r13)
+    mulx   r9, r8, r8   
+    xor    rax, rax
+    mulx   r10, r11, r10  
+    adox   r9, r11        
+    mulx   r11, r12, r12  
+    adox   r10, r12        
+    mulx   r12, r13, r13  
+    adox   r11, r13
+    adox   r12, rax 
+
+    FPMUL256x256 [rdi], [rdi+32], r8, r9, r10, r11, r12, r13, rcx
+
+	// Final correction: subtract p, then add it back under a mask if that borrowed                        
+	mov    rsi, [rip+p]
+	mov    rcx, [rip+p+8]
+	mov    rdx, [rip+p+16]
+	mov    r11, [rip+p+24]
+	sub    r12, rsi
+	sbb    r8, rcx
+	sbb    r9, rdx
+	sbb    r10, r11
+	sbb    rax, 0
+	and    rsi, rax
+	and    rcx, rax
+	and    rdx, rax
+	and    r11, rax
+	add    r12, rsi
+	adc    r8, rcx
+	adc    r9, rdx
+	adc    r10, r11
+    
+    mov    [rdi], r12          
+    mov    [rdi+8], r8         
+    mov    [rdi+16], r9         
+    mov    [rdi+24], r10
+    pop    r13
+    pop    r12
+    ret
+
+//***********************************************************************
+//  Squaring in GF(p^2), complex part
+//  Operation: c [rdi] = 2a0 x a1
+//  Inputs: a = [a1, a0] stored in [rsi] 
+//  Output: c stored in [rdi]
+//  Note: 2*a0 is kept in a 32-byte stack buffer; its lowest limb stays in rdx and is
+//        never stored, because FPMUL256x256 only reads limbs 1..3 of its first operand.
+//***********************************************************************
+.global fp2_sq_c1
+fp2_sq_c1:  
+    push   r12
+    push   r13 
+	
+	mov    rdx, [rsi]
+	mov    r9, [rsi+8]
+	mov    r10, [rsi+16]
+	mov    r11, [rsi+24]
+	add    rdx, rdx
+	adc    r9, r9
+	adc    r10, r10
+	adc    r11, r11
+	sub    rsp, 32
+	mov    [rsp+8], r9
+	mov    [rsp+16], r10 
+	mov    [rsp+24], r11   
+    
+    // [r8:r12] <- z = (limb 0 of 2*a0, in rdx) x a1
+    mulx   r9, r8, [rsi+32]
+    xor    rax, rax 
+    mulx   r10, r11, [rsi+40]
+    adox   r9, r11        
+    mulx   r11, r12, [rsi+48]
+    adox   r10, r12        
+    mulx   r12, r13, [rsi+56]
+    adox   r11, r13  
+    adox   r12, rax 
+
+	FPMUL256x256 [rsp], [rsi+32], r8, r9, r10, r11, r12, r13, rcx
+	add    rsp, 32
+
+	// Final correction: subtract p, then add it back under a mask if that borrowed                        
+	mov    rsi, [rip+p]
+	mov    rcx, [rip+p+8]
+	mov    rdx, [rip+p+16]
+	mov    r11, [rip+p+24]
+	sub    r12, rsi
+	sbb    r8, rcx
+	sbb    r9, rdx
+	sbb    r10, r11
+	sbb    rax, 0
+	and    rsi, rax
+	and    rcx, rax
+	and    rdx, rax
+	and    r11, rax
+	add    r12, rsi
+	adc    r8, rcx
+	adc    r9, rdx
+	adc    r10, r11
+    
+    mov    [rdi], r12          
+    mov    [rdi+8], r8         
+    mov    [rdi+16], r9         
+    mov    [rdi+24], r10 
+    pop    r13
+    pop    r12
+    ret
+
+//***********************************************************************
+//  Field multiplication in GF(p)
+//  Operation: c = a x b mod p
+//  Inputs: a stored in [rsi], b stored in [rdx] 
+//  Output: c stored in [rdi]
+//  Note: [rdi] is written only after all reads of a and b, so c may be the same
+//        buffer as a and/or b (fp_sqr below passes the same pointer for both inputs).
+//***********************************************************************
+.global fp_mul
+fp_mul: 
+    push   r12
+    push   r13 
+    push   r14 
+    mov    rcx, rdx 
+     
+    // [r8:r12] <- z = a x b0
+    mov    rdx, [rcx]
+    mulx   r9, r8, [rsi]
+    xor    rax, rax 
+    mulx   r10, r11, [rsi+8]
+    adox   r9, r11        
+    mulx   r11, r12, [rsi+16]
+    adox   r10, r12        
+    mulx   r12, r13, [rsi+24] 
+    adox   r11, r13
+    adox   r12, rax 
+
+	FPMUL256x256 [rcx], [rsi], r8, r9, r10, r11, r12, r13, r14
+
+	// Final correction: subtract p, then add it back under a mask if that borrowed                        
+	mov    rsi, [rip+p]
+	mov    rcx, [rip+p+8]
+	mov    rdx, [rip+p+16]
+	mov    r11, [rip+p+24]
+	sub    r12, rsi
+	sbb    r8, rcx
+	sbb    r9, rdx
+	sbb    r10, r11
+	sbb    rax, 0
+	and    rsi, rax
+	and    rcx, rax
+	and    rdx, rax
+	and    r11, rax
+	add    r12, rsi
+	adc    r8, rcx
+	adc    r9, rdx
+	adc    r10, r11
+    
+    mov    [rdi], r12          
+    mov    [rdi+8], r8         
+    mov    [rdi+16], r9         
+    mov    [rdi+24], r10  
+    pop    r14
+    pop    r13
+    pop    r12
+    ret
+    
+//***********************************************************************
+//  Field squaring in GF(p)
+//  Operation: c [rdi] = a [rsi] x a [rsi] mod p
+//  Implemented as a tail call to fp_mul with the second operand set to a.
+//***********************************************************************
+.global fp_sqr
+fp_sqr:
+    mov rdx, rsi
+    jmp fp_mul
--- a/src/gf/broadwell/lvl1/include/fp.h
+++ b/src/gf/broadwell/lvl1/include/fp.h
@@ -0,0 +1,76 @@
+#ifndef FP_H
+#define FP_H
+
+//////////////////////////////////////////////// NOTE: this is placed here for now
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
+#include <tutil.h>
+#include <fp_constants.h>
+
+typedef digit_t fp_t[NWORDS_FIELD];  // Datatype for representing field elements
+
+// Basic helpers: set from a single word, compare, test for zero, copy.
+void fp_set(digit_t* x, const digit_t val);
+bool fp_is_equal(const digit_t* a, const digit_t* b);
+bool fp_is_zero(const digit_t* a);
+void fp_copy(digit_t* out, const digit_t* a);
+// In-place multiprecision shifts of an nwords-word integer; mp_shiftr returns the
+// least-significant bit the input had before shifting.
+digit_t mp_shiftr(digit_t* x, const unsigned int shift, const unsigned int nwords);
+void mp_shiftl(digit_t* x, const unsigned int shift, const unsigned int nwords);
+// Modular arithmetic. fp_add, fp_sub, fp_sqr and fp_mul are implemented in fp_asm.S.
+void fp_add(digit_t* out, const digit_t* a, const digit_t* b);
+void fp_sub(digit_t* out, const digit_t* a, const digit_t* b);
+void fp_neg(digit_t* out, const digit_t* a);
+void fp_sqr(digit_t* out, const digit_t* a);
+void fp_mul(digit_t* out, const digit_t* a, const digit_t* b);
+// Digit x digit -> two-digit product (out[0] low, out[1] high).
+void MUL(digit_t* out, const digit_t a, const digit_t b);
+// In-place inversion, squareness test and in-place square root.
+void fp_inv(digit_t* x);
+bool fp_is_square(const digit_t* a);
+void fp_sqrt(digit_t* a);
+// Conversions to and from Montgomery representation.
+void fp_tomont(digit_t* out, const digit_t* a);
+void fp_frommont(digit_t* out, const digit_t* a);
+void fp_mont_setone(digit_t* out);
+
+/********************** Constant-time unsigned comparisons ***********************/
+
+// The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise
+
+static inline unsigned int is_digit_nonzero_ct(digit_t x)
+{ // Is x != 0?
+  // For any nonzero x, either x or its two's-complement negation has the top bit set;
+  // for x = 0 neither does. That top bit is the result.
+    const digit_t folded = x | (0 - x);
+
+    return (unsigned int)(folded >> (RADIX - 1));
+}
+
+static inline unsigned int is_digit_zero_ct(digit_t x)
+{ // Is x = 0?
+  // Logical complement of is_digit_nonzero_ct.
+    const unsigned int nonzero = is_digit_nonzero_ct(x);
+
+    return (unsigned int)(1 ^ nonzero);
+}
+
+static inline unsigned int is_digit_lessthan_ct(digit_t x, digit_t y)
+{ // Is x < y?
+  // Branch-free: the answer is the top bit of the combined expression below.
+    const digit_t differing = x ^ y;
+    const digit_t wrapped = (x - y) ^ y;
+
+    return (unsigned int)((x ^ (differing | wrapped)) >> (RADIX - 1));
+}
+
+/********************** Platform-independent macros for digit-size operations **********************/
+
+// Digit addition with carry
+// sumOut = addend1 + addend2 + carryIn (mod 2^RADIX); carryOut = 1 if the sum wrapped, else 0.
+// Expands to a braced block that declares a local tempReg; the arguments are evaluated more
+// than once, so they must be free of side effects.
+#define ADDC(sumOut, carryOut, addend1, addend2, carryIn)                                         \
+    { digit_t tempReg = (addend1) + (digit_t)(carryIn);                                           \
+    (sumOut) = (addend2) + tempReg;                                                               \
+    (carryOut) = (is_digit_lessthan_ct(tempReg, (digit_t)(carryIn)) | is_digit_lessthan_ct((sumOut), tempReg)); }
+
+// Digit subtraction with borrow
+// differenceOut = minuend - subtrahend - borrowIn (mod 2^RADIX); borrowOut = 1 if that wrapped, else 0.
+// The borrow is computed into a local before differenceOut is written, so differenceOut may
+// name the same object as one of the inputs. Arguments are evaluated more than once.
+#define SUBC(differenceOut, borrowOut, minuend, subtrahend, borrowIn)                             \
+    { digit_t tempReg = (minuend) - (subtrahend);                                                 \
+    unsigned int borrowReg = (is_digit_lessthan_ct((minuend), (subtrahend)) | ((borrowIn) & is_digit_zero_ct(tempReg)));  \
+    (differenceOut) = tempReg - (digit_t)(borrowIn);                                              \
+    (borrowOut) = borrowReg; }
+
+// Shift right with flexible datatype
+// shiftOut = (lowIn >> shift) combined with the low bits of highIn moved to the top.
+// Requires 0 < shift < DigitSize (the complementary shift would otherwise be out of range).
+#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize)                                         \
+    (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << ((DigitSize) - (shift)));
+
+// Digit shift left
+// shiftOut = (highIn << shift) combined with the high bits of lowIn moved to the bottom.
+// Requires 0 < shift < DigitSize. The DigitSize argument is now honoured instead of a
+// hard-coded RADIX, matching SHIFTR.
+#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize)                                         \
+    (shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> ((DigitSize) - (shift)));
+
+#endif
--- a/src/gf/broadwell/lvl1/include/fp2.h
+++ b/src/gf/broadwell/lvl1/include/fp2.h
@@ -0,0 +1,29 @@
+#ifndef FP2_H
+#define FP2_H
+
+#include "fp.h"
+
+typedef struct fp2_t { // Element of GF(p^2), stored as two GF(p) elements
+    fp_t re;           // presumably the real coordinate
+    fp_t im;           // presumably the imaginary coordinate
+} fp2_t;
+
+void fp2_set(fp2_t* x, const digit_t val);                  // Set x from a single digit (the tests use val = 0 and val = 1)
+bool fp2_is_zero(const fp2_t* a);                           // NOTE(review): presumably a == 0 -- confirm in fp2.c
+bool fp2_is_equal(const fp2_t* a, const fp2_t* b);          // NOTE(review): presumably a == b -- confirm in fp2.c
+void fp2_copy(fp2_t* x, const fp2_t* y);                    // NOTE(review): presumably x = y -- confirm in fp2.c
+fp2_t fp2_non_residue(void);                                // (void): an empty list would declare unspecified parameters in C
+void fp2_add(fp2_t* x, const fp2_t* y, const fp2_t* z);     // x = y + z
+void fp2_sub(fp2_t* x, const fp2_t* y, const fp2_t* z);     // x = y - z
+void fp2_neg(fp2_t* x, const fp2_t* y);                     // x = -y
+void fp2_mul(fp2_t* x, const fp2_t* y, const fp2_t* z);     // x = y * z (the tests feed Montgomery-form operands)
+void fp2_sqr(fp2_t* x, const fp2_t* y);                     // x = y^2
+void fp2_inv(fp2_t* x);                                     // x = x^-1 in place; the tests expect 0 to stay 0
+bool fp2_is_square(const fp2_t* x);                         // The tests expect true for any x that is a square
+void fp2_frob(fp2_t* x, const fp2_t* y);                    // NOTE(review): presumably the Frobenius map of y -- confirm in fp2.c
+void fp2_sqrt(fp2_t* x);                                    // x = a square root of x, in place (the tests accept either sign)
+void fp2_tomont(fp2_t* x, const fp2_t* y);                  // x = y converted to Montgomery representation
+void fp2_frommont(fp2_t* x, const fp2_t* y);                // x = y converted back from Montgomery representation
+int fp2_cmp(fp2_t* x, fp2_t* y);                            // NOTE(review): return convention not visible here -- see fp2.c
+
+#endif
--- a/src/gf/broadwell/lvl1/test/CMakeLists.txt
+++ b/src/gf/broadwell/lvl1/test/CMakeLists.txt
@@ -0,0 +1,9 @@
+# One test executable per field layer (fp, fp2): built from test_<layer>.c plus the shared
+# test_extras.c helpers, linked against the GF library and registered in "test" mode.
+foreach(GF_TEST_LAYER fp fp2)
+    add_executable(sqisign_test_gf_${SVARIANT_LOWER}_${GF_TEST_LAYER} test_${GF_TEST_LAYER}.c test_extras.c)
+    target_link_libraries(sqisign_test_gf_${SVARIANT_LOWER}_${GF_TEST_LAYER} ${LIB_GF_${SVARIANT_UPPER}})
+    target_include_directories(sqisign_test_gf_${SVARIANT_LOWER}_${GF_TEST_LAYER} PRIVATE ../include ${INC_COMMON} ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_PUBLIC})
+
+    add_test(sqisign_test_gf_${SVARIANT_LOWER}_${GF_TEST_LAYER} sqisign_test_gf_${SVARIANT_LOWER}_${GF_TEST_LAYER} test ${SQISIGN_TEST_REPS})
+endforeach()
--- a/src/gf/broadwell/lvl1/test/test_extras.c
+++ b/src/gf/broadwell/lvl1/test/test_extras.c
@@ -0,0 +1,74 @@
+#include "test_extras.h"
+#include <bench.h>
+
+// Global constants
+extern const digit_t p[NWORDS_FIELD];
+extern const digit_t R2[NWORDS_FIELD];
+
+#if 0
+int64_t cpucycles(void)
+{ // Read the x86 time-stamp counter (rdtsc) for benchmarking; compiled out by the surrounding #if 0, presumably in favour of <bench.h>
+    unsigned int hi, lo;
+
+    asm volatile ("rdtsc\n\t" : "=a" (lo), "=d"(hi));  // "=a" receives the low half, "=d" the high half
+    return ((int64_t)lo) | (((int64_t)hi) << 32);      // Recombine both halves into one 64-bit count
+}
+#endif
+
+
+int compare_words(digit_t* a, digit_t* b, unsigned int nwords)
+{ // Compare two "nwords"-digit numbers from the highest index down: returns 1 if a>b, 0 if a=b, -1 if a<b
+  // SECURITY NOTE: returns at the first differing digit, so it is not constant-time. TO BE USED FOR TESTING ONLY.
+    int idx = nwords-1;
+
+    while (idx >= 0) {
+        if (a[idx] != b[idx]) {
+            return (a[idx] > b[idx]) ? 1 : -1;
+        }
+        idx--;
+    }
+    return 0;
+}
+
+
+static void sub_test(digit_t* out, digit_t* a, digit_t* b, unsigned int nwords)
+{ // Multiprecision subtraction out = a-b over "nwords" digits; the final borrow is discarded, so a>=b is expected
+  // SECURITY NOTE: this function does not have constant-time execution. It is for TESTING ONLY.
+    digit_t borrow_in = 0;
+
+    for (unsigned int k = 0; k < nwords; k++) {
+        digit_t minuend = a[k], subtrahend = b[k];       // Read both inputs first: out may alias a
+        digit_t diff = minuend - subtrahend;
+        digit_t underflow = (minuend < subtrahend);
+
+        out[k] = diff - borrow_in;
+        borrow_in = underflow || (diff < borrow_in);     // Borrow from this digit or from applying borrow_in
+    }
+}
+
+
+void fprandom_test(digit_t* a)
+{ // Fill a with rand() bytes, clear the top bits of the last digit and subtract p until the value lies in [0, p-1]
+  // SECURITY NOTE: distribution is not fully uniform. TO BE USED FOR TESTING ONLY.
+    const unsigned int nwords = NWORDS_FIELD, excess_bits = 256-254;
+    unsigned char* bytes = (unsigned char*)a;
+    unsigned int k = 0;
+
+    while (k < sizeof(digit_t)*nwords) {
+        bytes[k++] = (unsigned char)rand();               // One pseudo-random byte at a time
+    }
+    a[nwords-1] &= (((digit_t)(-1) << excess_bits) >> excess_bits);   // Keep all but the top excess_bits of the last digit
+
+    while (compare_words((digit_t*)p, a, nwords) <= 0) {  // p <= a: bring a back below the modulus
+        sub_test(a, a, (digit_t*)p, nwords);
+    }
+}
+
+
+void fp2random_test(fp2_t* a)
+{ // Draw both coordinates of a independently with fprandom_test (re first, then im)
+  // SECURITY NOTE: distribution is not fully uniform. TO BE USED FOR TESTING ONLY.
+    digit_t *re_digits = a->re, *im_digits = a->im;
+    fprandom_test(re_digits);
+    fprandom_test(im_digits);
+}
--- a/src/gf/broadwell/lvl1/test/test_extras.h
+++ b/src/gf/broadwell/lvl1/test/test_extras.h
@@ -0,0 +1,25 @@
+
+#ifndef TEST_EXTRAS_H
+#define TEST_EXTRAS_H
+
+#include <time.h>
+#include <stdlib.h>
+#include "../include/fp.h"
+#include "../include/fp2.h"
+
+#define PASSED    0
+#define FAILED    1
+    
+// Access system counter for benchmarking
+//int64_t cpucycles(void);
+
+// Comparing "nword" elements, a=b? : (1) a>b, (0) a=b, (-1) a<b
+int compare_words(digit_t* a, digit_t* b, unsigned int nwords);
+
+// Generating a pseudo-random field element in [0, p-1] 
+void fprandom_test(digit_t* a);
+
+// Generating a pseudo-random element in GF(p^2)
+void fp2random_test(fp2_t* a);
+
+#endif
--- a/src/gf/broadwell/lvl1/test/test_fp.c
+++ b/src/gf/broadwell/lvl1/test/test_fp.c
@@ -0,0 +1,295 @@
+#include "test_extras.h"
+#include <stdio.h>
+#include <string.h>
+#include <bench.h>
+
+// Global constants
+extern const digit_t p[NWORDS_FIELD];
+
+// Benchmark and test parameters  
+static int BENCH_LOOPS = 100000;       // Number of iterations per bench
+static int TEST_LOOPS  = 100000;       // Number of iterations per test
+
+
+bool fp_test()
+{ // Randomized self-tests for the GF(p) arithmetic: TEST_LOOPS iterations per operation, returns false at the first failing group
+    bool OK = true;
+    int n, passed;
+    fp_t a, b, c, d, e, f, ma, mb, mc, md, me, mf;
+
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
+    printf("Testing field arithmetic over GF(p): \n\n"); 
+
+    // Field addition
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fprandom_test(a); fprandom_test(b); fprandom_test(c); fprandom_test(d); 
+
+        fp_add(d, a, b); fp_add(e, d, c);                 // e = (a+b)+c
+        fp_add(d, b, c); fp_add(f, d, a);                 // f = a+(b+c)
+        if (compare_words(e, f, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_add(d, a, b);                                  // d = a+b 
+        fp_add(e, b, a);                                  // e = b+a
+        if (compare_words(d, e, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_set(b, 0);
+        fp_add(d, a, b);                                  // d = a+0 
+        if (compare_words(a, d, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_set(b, 0);   
+        fp_neg(d, a);                      
+        fp_add(e, a, d);                                  // e = a+(-a)
+        if (compare_words(e, b, NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p) addition tests ............................................ PASSED");
+    else { printf("  GF(p) addition tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Field subtraction
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fprandom_test(a); fprandom_test(b); fprandom_test(c); fprandom_test(d);
+
+        fp_sub(d, a, b); fp_sub(e, d, c);                 // e = (a-b)-c
+        fp_add(d, b, c); fp_sub(f, a, d);                 // f = a-(b+c)
+        if (compare_words(e, f, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_sub(d, a, b);                                  // d = a-b 
+        fp_sub(e, b, a);
+        fp_neg(e, e);                                     // e = -(b-a)
+        if (compare_words(d, e, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_set(b, 0);
+        fp_sub(d, a, b);                                  // d = a-0 
+        if (compare_words(a, d, NWORDS_FIELD)!=0) { passed=0; break; }
+        
+        fp_set(b, 0);              
+        fp_sub(e, a, a);                                  // e = a-a
+        if (compare_words(e, b, NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p) subtraction tests ......................................... PASSED");
+    else { printf("  GF(p) subtraction tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Field multiplication
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {    
+        fprandom_test(a); fprandom_test(b); fprandom_test(c);
+        
+        fp_tomont(ma, a);
+        fp_frommont(d, ma);                                              // d = round trip of a; c must stay random for the checks below
+        if (compare_words(a, d, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_tomont(ma, a); fp_tomont(mb, b); fp_tomont(mc, c);
+        fp_mul(md, ma, mb); fp_mul(me, md, mc);                          // e = (a*b)*c
+        fp_mul(md, mb, mc); fp_mul(mf, md, ma);                          // f = a*(b*c)
+        fp_frommont(e, me);
+        fp_frommont(f, mf);
+        if (compare_words(e, f, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_tomont(ma, a); fp_tomont(mb, b); fp_tomont(mc, c); 
+        fp_add(md, mb, mc); fp_mul(me, ma, md);                          // e = a*(b+c)
+        fp_mul(md, ma, mb); fp_mul(mf, ma, mc); fp_add(mf, md, mf);      // f = a*b+a*c
+        fp_frommont(e, me);
+        fp_frommont(f, mf);
+        if (compare_words(e, f, NWORDS_FIELD)!=0) { passed=0; break; }
+     
+        fp_tomont(ma, a); fp_tomont(mb, b);
+        fp_mul(md, ma, mb);                                              // d = a*b 
+        fp_mul(me, mb, ma);                                              // e = b*a 
+        fp_frommont(d, md);
+        fp_frommont(e, me);
+        if (compare_words(d, e, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_tomont(ma, a);
+        fp_set(b, 1); fp_tomont(mb, b);
+        fp_mul(md, ma, mb);                                              // d = a*1  
+        fp_frommont(a, ma);
+        fp_frommont(d, md);                
+        if (compare_words(a, d, NWORDS_FIELD)!=0) { passed=0; break; }
+       
+        fp_set(b, 0);
+        fp_tomont(mb, b);
+        fp_mul(md, ma, mb);                                              // d = a*0 
+        fp_frommont(d, md);                
+        if (compare_words(b, d, NWORDS_FIELD)!=0) { passed=0; break; } 
+    }
+    if (passed==1) printf("  GF(p) multiplication tests ...................................... PASSED");
+    else { printf("  GF(p) multiplication tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Field squaring
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fprandom_test(a);
+        
+        fp_tomont(ma, a);
+        fp_sqr(mb, ma);                                   // b = a^2
+        fp_mul(mc, ma, ma);                               // c = a*a 
+        fp_frommont(b, mb);
+        fp_frommont(c, mc);
+        if (compare_words(b, c, NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp_set(a, 0); fp_tomont(ma, a);
+        fp_sqr(md, ma);                                   // d = 0^2 
+        if (compare_words(ma, md, NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p) squaring tests............................................. PASSED");
+    else { printf("  GF(p) squaring tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Field inversion
+    passed = 1;
+    for (n = 0; n < TEST_LOOPS; n++)
+    {
+        fprandom_test(a);
+
+        fp_tomont(ma, a);
+        fp_set(d, 1);
+        memcpy(mb, ma, RADIX/8 * NWORDS_FIELD);           // mb keeps a copy of ma because fp_inv works in place
+        fp_inv(ma);
+        fp_mul(mc, ma, mb);                               // c = a*a^-1 
+        fp_frommont(c, mc);
+        if (compare_words(c, d, NWORDS_FIELD) != 0) { passed = 0; break; }
+
+        fp_set(a, 0);
+        fp_set(d, 0);
+        fp_inv(a);                                        // a = 0^-1, expected to remain 0
+        if (compare_words(a, d, NWORDS_FIELD) != 0) { passed = 0; break; }
+    }
+    if (passed == 1) printf("  GF(p) inversion tests............................................ PASSED");
+    else { printf("  GF(p) inversion tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Square root and square detection
+    passed = 1;
+    for (n = 0; n < TEST_LOOPS; n++)
+    {
+        fprandom_test(a);
+
+        fp_tomont(ma, a);
+        fp_sqr(mc, ma);
+        fp_frommont(c, mc);                               // c = a^2
+        if (fp_is_square(mc) != 1) { passed = 0; break; }
+
+        fp_sqrt(mc);                                      // mc = sqrt(a^2), computed in place
+        fp_neg(md, mc);
+        fp_frommont(c, mc);
+        fp_frommont(d, md);                               // Either root is accepted: c = a or d = -c = a
+        if ((compare_words(a, c, NWORDS_FIELD) != 0) && (compare_words(a, d, NWORDS_FIELD) != 0)) { passed = 0; break; }
+    }
+    if (passed == 1) printf("  Square root, square tests........................................ PASSED");
+    else { printf("  Square root, square tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+ 
+    return OK;
+}
+
+bool fp_run()
+{ // Benchmarks the GF(p) operations: each call is timed with cpucycles() and the average over BENCH_LOOPS calls is printed. Always returns true.
+    bool OK = true;
+    int n;
+    unsigned long long cycles, cycles1, cycles2;
+    fp_t a, b, c;
+        
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
+    printf("Benchmarking field arithmetic: \n\n"); 
+        
+    fprandom_test(a); fprandom_test(b); fprandom_test(c);
+
+    // GF(p) addition
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        fp_add(c, a, b);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    printf("  GF(p) addition runs in .......................................... %7llu cycles", cycles/BENCH_LOOPS);   // %llu: the average is an unsigned long long
+    printf("\n");
+
+    // GF(p) subtraction
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        fp_sub(c, a, b);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    printf("  GF(p) subtraction runs in ....................................... %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // GF(p) multiplication
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        fp_mul(c, a, b);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    printf("  GF(p) multiplication runs in .................................... %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // GF(p) inversion (in place: every iteration inverts the previous result)
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp_inv(a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  GF(p) inversion runs in ......................................... %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // GF(p) square root (in place: every iteration starts from the previous result)
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp_sqrt(a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  GF(p) square root runs in ....................................... %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // Square checking (the result is discarded; only the timing matters)
+    cycles = 0;
+    for (n = 0; n < BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp_is_square(a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  Square checking runs in ......................................... %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    return OK;
+}
+
+int main(int argc, char* argv[])
+{ // Usage: <prog> test|bench <reps>. Exit status is 0 when the selected run succeeds and non-zero otherwise.
+    int reps = (argc < 3) ? 0 : atoi(argv[2]);
+
+    if (argc >= 3 && !strcmp(argv[1], "test")) {
+        TEST_LOOPS = reps;                                // Zero or negative reps simply run no iterations
+        return !fp_test();
+    }
+    // fp_run() divides by BENCH_LOOPS, so a benchmark needs a strictly positive <reps>
+    if (argc >= 3 && !strcmp(argv[1], "bench") && reps > 0) {
+        BENCH_LOOPS = reps;
+        return !fp_run();
+    }
+    printf("Please enter an argument: 'test' or 'bench' and <reps>\n");
+    exit(1);
+}
--- a/src/gf/broadwell/lvl1/test/test_fp2.c
+++ b/src/gf/broadwell/lvl1/test/test_fp2.c
@@ -0,0 +1,307 @@
+#include "test_extras.h"
+#include <stdio.h>
+#include <string.h>
+#include <bench.h>
+
+// Global constants
+extern const digit_t p[NWORDS_FIELD];
+
+// Benchmark and test parameters  
+static int BENCH_LOOPS = 100000;       // Number of iterations per bench
+static int TEST_LOOPS  = 100000;       // Number of iterations per test
+
+
+bool fp2_test()
+{ // Randomized self-tests for the GF(p^2) arithmetic: TEST_LOOPS iterations per operation, returns false at the first failing group
+    bool OK = true;
+    int n, passed;
+    fp2_t a, b, c, d, e, f, ma, mb, mc, md, me, mf;
+
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
+    printf("Testing arithmetic over GF(p^2): \n\n"); 
+
+    // Addition in GF(p^2)
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fp2random_test(&a); fp2random_test(&b); fp2random_test(&c); fp2random_test(&d); 
+
+        fp2_add(&d, &a, &b); fp2_add(&e, &d, &c);                 // e = (a+b)+c
+        fp2_add(&d, &b, &c); fp2_add(&f, &d, &a);                 // f = a+(b+c)
+        if (compare_words((digit_t*)&e, (digit_t*)&f, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_add(&d, &a, &b);                                      // d = a+b 
+        fp2_add(&e, &b, &a);                                      // e = b+a
+        if (compare_words((digit_t*)&d, (digit_t*)&e, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_set(&b, 0);
+        fp2_add(&d, &a, &b);                                      // d = a+0 
+        if (compare_words((digit_t*)&a, (digit_t*)&d, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_set(&b, 0);   
+        fp2_neg(&d, &a);                      
+        fp2_add(&e, &a, &d);                                      // e = a+(-a)
+        if (compare_words((digit_t*)&e, (digit_t*)&b, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p^2) addition tests ............................................ PASSED");
+    else { printf("  GF(p^2) addition tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+    
+    // Subtraction in GF(p^2)
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fp2random_test(&a); fp2random_test(&b); fp2random_test(&c); fp2random_test(&d);
+
+        fp2_sub(&d, &a, &b); fp2_sub(&e, &d, &c);                 // e = (a-b)-c
+        fp2_add(&d, &b, &c); fp2_sub(&f, &a, &d);                 // f = a-(b+c)
+        if (compare_words((digit_t*)&e, (digit_t*)&f, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_sub(&d, &a, &b);                                      // d = a-b 
+        fp2_sub(&e, &b, &a);
+        fp2_neg(&e, &e);                                          // e = -(b-a)
+        if (compare_words((digit_t*)&d, (digit_t*)&e, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_set(&b, 0);
+        fp2_sub(&d, &a, &b);                                      // d = a-0 
+        if (compare_words((digit_t*)&a, (digit_t*)&d, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+        
+        fp2_set(&b, 0);              
+        fp2_sub(&e, &a, &a);                                      // e = a-a
+        if (compare_words((digit_t*)&e, (digit_t*)&b, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p^2) subtraction tests ......................................... PASSED");
+    else { printf("  GF(p^2) subtraction tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Multiplication in GF(p^2)
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {    
+        fp2random_test(&a); fp2random_test(&b); fp2random_test(&c);
+        
+        fp2_tomont(&ma, &a);
+        fp2_frommont(&d, &ma);                                                   // d = round trip of a; c must stay random for the checks below
+        if (compare_words((digit_t*)&a, (digit_t*)&d, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_tomont(&ma, &a); fp2_tomont(&mb, &b); fp2_tomont(&mc, &c);
+        fp2_mul(&md, &ma, &mb); fp2_mul(&me, &md, &mc);                          // e = (a*b)*c
+        fp2_mul(&md, &mb, &mc); fp2_mul(&mf, &md, &ma);                          // f = a*(b*c)
+        fp2_frommont(&e, &me);
+        fp2_frommont(&f, &mf);
+        if (compare_words((digit_t*)&e, (digit_t*)&f, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_tomont(&ma, &a); fp2_tomont(&mb, &b); fp2_tomont(&mc, &c); 
+        fp2_add(&md, &mb, &mc); fp2_mul(&me, &ma, &md);                          // e = a*(b+c)
+        fp2_mul(&md, &ma, &mb); fp2_mul(&mf, &ma, &mc); fp2_add(&mf, &md, &mf);  // f = a*b+a*c
+        fp2_frommont(&e, &me);
+        fp2_frommont(&f, &mf);
+        if (compare_words((digit_t*)&e, (digit_t*)&f, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+     
+        fp2_tomont(&ma, &a); fp2_tomont(&mb, &b);
+        fp2_mul(&md, &ma, &mb);                                                  // d = a*b 
+        fp2_mul(&me, &mb, &ma);                                                  // e = b*a 
+        fp2_frommont(&d, &md);
+        fp2_frommont(&e, &me);
+        if (compare_words((digit_t*)&d, (digit_t*)&e, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_tomont(&ma, &a);
+        fp2_set(&b, 1); fp2_tomont(&mb, &b);
+        fp2_mul(&md, &ma, &mb);                                                  // d = a*1  
+        fp2_frommont(&a, &ma);
+        fp2_frommont(&d, &md);                
+        if (compare_words((digit_t*)&a, (digit_t*)&d, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+       
+        fp2_set(&b, 0);
+        fp2_tomont(&mb, &b);
+        fp2_mul(&md, &ma, &mb);                                                  // d = a*0 
+        fp2_frommont(&d, &md);                
+        if (compare_words((digit_t*)&b, (digit_t*)&d, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p^2) multiplication tests ...................................... PASSED");
+    else { printf("  GF(p^2) multiplication tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    // Squaring in GF(p^2)
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fp2random_test(&a);
+        
+        fp2_tomont(&ma, &a);
+        fp2_sqr(&mb, &ma);                                          // b = a^2
+        fp2_mul(&mc, &ma, &ma);                                     // c = a*a 
+        fp2_frommont(&b, &mb);
+        fp2_frommont(&c, &mc);
+        if (compare_words((digit_t*)&b, (digit_t*)&c, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+
+        fp2_set(&a, 0); fp2_tomont(&ma, &a);
+        fp2_sqr(&md, &ma);                                          // d = 0^2 
+        if (compare_words((digit_t*)&ma, (digit_t*)&md, 2*NWORDS_FIELD)!=0) { passed=0; break; }
+    }
+    if (passed==1) printf("  GF(p^2) squaring tests............................................. PASSED");
+    else { printf("  GF(p^2) squaring tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+    
+    // Inversion in GF(p^2)
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fp2random_test(&a);
+
+        fp2_tomont(&ma, &a);
+        fp2_set(&d, 1);
+        memcpy(&mb, &ma, RADIX/8 * 2*NWORDS_FIELD);                 // mb keeps a copy of ma because fp2_inv works in place
+        fp2_inv(&ma);
+        fp2_mul(&mc, &ma, &mb);                                     // c = a*a^-1 
+        fp2_frommont(&c, &mc);
+        if (compare_words((digit_t*)&c, (digit_t*)&d, 2*NWORDS_FIELD) != 0) { passed = 0; break; }
+
+        fp2_set(&a, 0);
+        fp2_set(&d, 0);
+        fp2_inv(&a);                                                // a = 0^-1, expected to remain 0
+        if (compare_words((digit_t*)&a, (digit_t*)&d, 2*NWORDS_FIELD) != 0) { passed = 0; break; }
+    }
+    if (passed == 1) printf("  GF(p^2) inversion tests............................................ PASSED");
+    else { printf("  GF(p^2) inversion tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+    
+    // Square root and square detection in GF(p^2)
+    passed = 1;
+    for (n=0; n<TEST_LOOPS; n++)
+    {
+        fp2random_test(&a);
+
+        fp2_tomont(&ma, &a);
+        fp2_sqr(&mc, &ma);
+        fp2_frommont(&c, &mc);                                      // c = a^2
+        if (fp2_is_square(&mc) != 1) { passed = 0; break; }        
+
+        fp2_sqrt(&mc);                                              // mc = sqrt(a^2), computed in place
+        fp2_neg(&md, &mc);
+        fp2_frommont(&c, &mc);
+        fp2_frommont(&d, &md);                                      // Either root is accepted: c = a or d = -c = a
+        if ((compare_words((digit_t*)&a, (digit_t*)&c, 2*NWORDS_FIELD) != 0) && (compare_words((digit_t*)&a, (digit_t*)&d, 2*NWORDS_FIELD) != 0)) { passed = 0; break; }
+    }
+    if (passed == 1) printf("  Square root, square tests.......................................... PASSED");
+    else { printf("  Square root, square tests... FAILED"); printf("\n"); return false; }
+    printf("\n");
+
+    return OK;
+}
+
+bool fp2_run()
+{ // Benchmarks the GF(p^2) operations: each call is timed with cpucycles() and the average over BENCH_LOOPS calls is printed. Always returns true.
+    bool OK = true;
+    int n;
+    unsigned long long cycles, cycles1, cycles2;
+    fp2_t a, b, c;
+        
+    printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
+    printf("Benchmarking arithmetic over GF(p^2): \n\n"); 
+        
+    fp2random_test(&a); fp2random_test(&b); fp2random_test(&c);
+
+    // GF(p^2) addition
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        fp2_add(&c, &a, &b);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    printf("  GF(p^2) addition runs in .......................................... %7llu cycles", cycles/BENCH_LOOPS);   // %llu: the average is an unsigned long long
+    printf("\n");
+
+    // GF(p^2) subtraction
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        fp2_sub(&c, &a, &b);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    printf("  GF(p^2) subtraction runs in ....................................... %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // GF(p^2) squaring
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp2_sqr(&c, &a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  GF(p^2) squaring runs in .......................................... %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // GF(p^2) multiplication
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles(); 
+        fp2_mul(&c, &a, &b);
+        cycles2 = cpucycles();
+        cycles = cycles+(cycles2-cycles1);
+    }
+    printf("  GF(p^2) multiplication runs in .................................... %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // GF(p^2) inversion (in place: every iteration inverts the previous result)
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp2_inv(&a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  GF(p^2) inversion runs in ......................................... %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // GF(p^2) square root (in place: every iteration starts from the previous result)
+    cycles = 0;
+    for (n = 0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp2_sqrt(&a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  GF(p^2) square root runs in ....................................... %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    // Square checking (the result is discarded; only the timing matters)
+    cycles = 0;
+    for (n=0; n<BENCH_LOOPS; n++)
+    {
+        cycles1 = cpucycles();
+        fp2_is_square(&a);
+        cycles2 = cpucycles();
+        cycles = cycles + (cycles2 - cycles1);
+    }
+    printf("  Square checking runs in ........................................... %7llu cycles", cycles/BENCH_LOOPS);
+    printf("\n");
+
+    return OK;
+}
+
+int main(int argc, char* argv[])
+{ // Usage: <prog> test|bench <reps>. Exit status is 0 when the selected run succeeds and non-zero otherwise.
+    int reps = (argc < 3) ? 0 : atoi(argv[2]);
+
+    if (argc >= 3 && !strcmp(argv[1], "test")) {
+        TEST_LOOPS = reps;                                // Zero or negative reps simply run no iterations
+        return !fp2_test();
+    }
+    // fp2_run() divides by BENCH_LOOPS, so a benchmark needs a strictly positive <reps>
+    if (argc >= 3 && !strcmp(argv[1], "bench") && reps > 0) {
+        BENCH_LOOPS = reps;
+        return !fp2_run();
+    }
+    printf("Please enter an argument: 'test' or 'bench' and <reps>\n");
+    exit(1);
+}