second-round version of SQIsign

Co-authored-by: Marius A. Aardal <marius.andre.aardal@gmail.com> Co-authored-by: Gora Adj <gora.adj@tii.ae> Co-authored-by: Diego F. Aranha <dfaranha@cs.au.dk> Co-authored-by: Andrea Basso <sqisign@andreabasso.com> Co-authored-by: Isaac Andrés Canales Martínez <icanalesm0500@gmail.com> Co-authored-by: Jorge Chávez-Saab <jorgechavezsaab@gmail.com> Co-authored-by: Maria Corte-Real Santos <mariascrsantos98@gmail.com> Co-authored-by: Luca De Feo <github@defeo.lu> Co-authored-by: Max Duparc <max.duparc@epfl.ch> Co-authored-by: Jonathan Komada Eriksen <jonathan.eriksen97@gmail.com> Co-authored-by: Décio Luiz Gazzoni Filho <decio@decpp.net> Co-authored-by: Basil Hess <bhe@zurich.ibm.com> Co-authored-by: Antonin Leroux <antonin.leroux@polytechnique.org> Co-authored-by: Patrick Longa <plonga@microsoft.com> Co-authored-by: Luciano Maino <mainoluciano.96@gmail.com> Co-authored-by: Michael Meyer <michael@random-oracles.org> Co-authored-by: Hiroshi Onuki <onuki@mist.i.u-tokyo.ac.jp> Co-authored-by: Lorenz Panny <lorenz@yx7.cc> Co-authored-by: Giacomo Pope <giacomopope@gmail.com> Co-authored-by: Krijn Reijnders <reijnderskrijn@gmail.com> Co-authored-by: Damien Robert <damien.robert@inria.fr> Co-authored-by: Francisco Rodríguez-Henriquez <francisco.rodriguez@tii.ae> Co-authored-by: Sina Schaeffler <sschaeffle@student.ethz.ch> Co-authored-by: Benjamin Wesolowski <benjamin.wesolowski@ens-lyon.fr>
2025-02-06 00:00:00 +00:00
parent ff34a8cd18
commit 91e9e464fe
481 changed files with 80785 additions and 55963 deletions
--- a/.astylerc
+++ b/.astylerc
@@ -1,16 +0,0 @@
 # find include src test -name '*.[ch]' | xargs astyle --options=.astylerc
 --style=google 
 --indent=spaces
 #--indent-preproc-define
 #--indent-preproc-cond
 --pad-oper 
 --pad-comma 
 --pad-header
 #--unpad-paren 
 --align-pointer=name 
 --add-braces 
 --convert-tabs
 --mode=c 
 # disable backup files
 --suffix=none
 --lineend=linux
--- a/.clang-format
+++ b/.clang-format
@@ -0,0 +1,246 @@
 ---
 Language:        Cpp
 # BasedOnStyle:  Mozilla
 AccessModifierOffset: -2
 AlignAfterOpenBracket: Align
 AlignArrayOfStructures: Right
 AlignConsecutiveAssignments:
  Enabled:         false
  AcrossEmptyLines: false
  AcrossComments:  false
  AlignCompound:   true
  AlignFunctionPointers: false
  PadOperators:    true
 AlignConsecutiveBitFields:
  Enabled:         false
  AcrossEmptyLines: false
  AcrossComments:  false
  AlignCompound:   false
  AlignFunctionPointers: false
  PadOperators:    false
 AlignConsecutiveDeclarations:
  Enabled:         false
  AcrossEmptyLines: false
  AcrossComments:  false
  AlignCompound:   false
  AlignFunctionPointers: false
  PadOperators:    false
 AlignConsecutiveMacros:
  Enabled:         false
  AcrossEmptyLines: false
  AcrossComments:  false
  AlignCompound:   false
  AlignFunctionPointers: false
  PadOperators:    false
 AlignConsecutiveShortCaseStatements:
  Enabled:         false
  AcrossEmptyLines: false
  AcrossComments:  false
  AlignCaseColons: false
 AlignEscapedNewlines: Right
 AlignOperands:   Align
 AlignTrailingComments:
  Kind:            Always
  OverEmptyLines:  0
 AllowAllArgumentsOnNextLine: true
 AllowAllParametersOfDeclarationOnNextLine: false
 AllowBreakBeforeNoexceptSpecifier: Never
 AllowShortBlocksOnASingleLine: Never
 AllowShortCaseLabelsOnASingleLine: false
 AllowShortCompoundRequirementOnASingleLine: true
 AllowShortEnumsOnASingleLine: true
 AllowShortFunctionsOnASingleLine: Inline
 AllowShortIfStatementsOnASingleLine: Never
 AllowShortLambdasOnASingleLine: All
 AllowShortLoopsOnASingleLine: false
 AlwaysBreakAfterDefinitionReturnType: TopLevel
 AlwaysBreakAfterReturnType: AllDefinitions
 AlwaysBreakBeforeMultilineStrings: false
 AlwaysBreakTemplateDeclarations: Yes
 AttributeMacros:
  - __capability
 BinPackArguments: false
 BinPackParameters: false
 BitFieldColonSpacing: Both
 BraceWrapping:
  AfterCaseLabel:  false
  AfterClass:      true
  AfterControlStatement: Never
  AfterEnum:       true
  AfterExternBlock: true
  AfterFunction:   true
  AfterNamespace:  false
  AfterObjCDeclaration: false
  AfterStruct:     true
  AfterUnion:      true
  BeforeCatch:     false
  BeforeElse:      false
  BeforeLambdaBody: false
  BeforeWhile:     false
  IndentBraces:    false
  SplitEmptyFunction: true
  SplitEmptyRecord: false
  SplitEmptyNamespace: true
 BreakAdjacentStringLiterals: true
 BreakAfterAttributes: Leave
 BreakAfterJavaFieldAnnotations: false
 BreakArrays:     true
 BreakBeforeBinaryOperators: None
 BreakBeforeConceptDeclarations: Always
 BreakBeforeBraces: Mozilla
 BreakBeforeInlineASMColon: OnlyMultiline
 BreakBeforeTernaryOperators: true
 BreakConstructorInitializers: BeforeComma
 BreakInheritanceList: BeforeComma
 BreakStringLiterals: true
 ColumnLimit:     120
 CommentPragmas:  '^ IWYU pragma:'
 CompactNamespaces: false
 ConstructorInitializerIndentWidth: 4
 ContinuationIndentWidth: 4
 Cpp11BracedListStyle: false
 DerivePointerAlignment: false
 DisableFormat:   false
 EmptyLineAfterAccessModifier: Never
 EmptyLineBeforeAccessModifier: LogicalBlock
 ExperimentalAutoDetectBinPacking: false
 FixNamespaceComments: false
 ForEachMacros:
  - foreach
  - Q_FOREACH
  - BOOST_FOREACH
 IfMacros:
  - KJ_IF_MAYBE
 IncludeBlocks:   Preserve
 IncludeCategories:
  - Regex:           '^"(llvm|llvm-c|clang|clang-c)/'
    Priority:        2
    SortPriority:    0
    CaseSensitive:   false
  - Regex:           '^(<|"(gtest|gmock|isl|json)/)'
    Priority:        3
    SortPriority:    0
    CaseSensitive:   false
  - Regex:           '.*'
    Priority:        1
    SortPriority:    0
    CaseSensitive:   false
 IncludeIsMainRegex: '(Test)?$'
 IncludeIsMainSourceRegex: ''
 IndentAccessModifiers: false
 IndentCaseBlocks: false
 IndentCaseLabels: true
 IndentExternBlock: AfterExternBlock
 IndentGotoLabels: true
 IndentPPDirectives: None
 IndentRequiresClause: true
 IndentWidth:     4
 IndentWrappedFunctionNames: false
 InsertBraces:    false
 InsertNewlineAtEOF: false
 InsertTrailingCommas: None
 IntegerLiteralSeparator:
  Binary:          0
  BinaryMinDigits: 0
  Decimal:         0
  DecimalMinDigits: 0
  Hex:             0
  HexMinDigits:    0
 JavaScriptQuotes: Leave
 JavaScriptWrapImports: true
 KeepEmptyLinesAtTheStartOfBlocks: true
 KeepEmptyLinesAtEOF: false
 LambdaBodyIndentation: Signature
 LineEnding:      DeriveLF
 MacroBlockBegin: ''
 MacroBlockEnd:   ''
 MaxEmptyLinesToKeep: 1
 NamespaceIndentation: None
 ObjCBinPackProtocolList: Auto
 ObjCBlockIndentWidth: 4
 ObjCBreakBeforeNestedBlockParam: true
 ObjCSpaceAfterProperty: true
 ObjCSpaceBeforeProtocolList: false
 PackConstructorInitializers: BinPack
 PenaltyBreakAssignment: 4
 PenaltyBreakBeforeFirstCallParameter: 19
 PenaltyBreakComment: 300
 PenaltyBreakFirstLessLess: 120
 PenaltyBreakOpenParenthesis: 0
 PenaltyBreakScopeResolution: 500
 PenaltyBreakString: 1000
 PenaltyBreakTemplateDeclaration: 10
 PenaltyExcessCharacter: 1000000
 PenaltyIndentedWhitespace: 0
 PenaltyReturnTypeOnItsOwnLine: 200
 PointerAlignment: Right
 PPIndentWidth:   -1
 QualifierAlignment: Leave
 ReferenceAlignment: Pointer
 ReflowComments:  true
 RemoveBracesLLVM: false
 RemoveParentheses: Leave
 RemoveSemicolon: false
 RequiresClausePosition: OwnLine
 RequiresExpressionIndentation: OuterScope
 SeparateDefinitionBlocks: Leave
 ShortNamespaceLines: 1
 SkipMacroDefinitionBody: false
 SortIncludes:    false
 SortJavaStaticImport: Before
 SortUsingDeclarations: LexicographicNumeric
 SpaceAfterCStyleCast: false
 SpaceAfterLogicalNot: false
 SpaceAfterTemplateKeyword: false
 SpaceAroundPointerQualifiers: Default
 SpaceBeforeAssignmentOperators: true
 SpaceBeforeCaseColon: false
 SpaceBeforeCpp11BracedList: false
 SpaceBeforeCtorInitializerColon: true
 SpaceBeforeInheritanceColon: true
 SpaceBeforeJsonColon: false
 SpaceBeforeParens: ControlStatements
 SpaceBeforeParensOptions:
  AfterControlStatements: true
  AfterForeachMacros: true
  AfterFunctionDefinitionName: false
  AfterFunctionDeclarationName: false
  AfterIfMacros:   true
  AfterOverloadedOperator: false
  AfterPlacementOperator: true
  AfterRequiresInClause: false
  AfterRequiresInExpression: false
  BeforeNonEmptyParentheses: false
 SpaceBeforeRangeBasedForLoopColon: true
 SpaceBeforeSquareBrackets: false
 SpaceInEmptyBlock: false
 SpacesBeforeTrailingComments: 1
 SpacesInAngles:  Never
 SpacesInContainerLiterals: true
 SpacesInLineCommentPrefix:
  Minimum:         1
  Maximum:         -1
 SpacesInParens:  Never
 SpacesInParensOptions:
  InCStyleCasts:   false
  InConditionalStatements: false
  InEmptyParentheses: false
  Other:           false
 SpacesInSquareBrackets: false
 Standard:        Latest
 StatementAttributeLikeMacros:
  - Q_EMIT
 StatementMacros:
  - Q_UNUSED
  - QT_REQUIRE_VERSION
 TabWidth:        8
 UseTab:          Never
 VerilogBreakBetweenInstancePorts: true
 WhitespaceSensitiveMacros:
  - BOOST_PP_STRINGIZE
  - CF_SWIFT_NAME
  - NS_SWIFT_NAME
  - PP_STRINGIZE
  - STRINGIZE
 ...
--- a/.cmake/32bit.cmake
+++ b/.cmake/32bit.cmake
@@ -0,0 +1,8 @@
 set(CMAKE_SYSTEM_NAME ${CMAKE_HOST_SYSTEM_NAME})
 if(${CMAKE_HOST_SYSTEM_PROCESSOR} MATCHES "x86_64")
    set(CMAKE_SYSTEM_PROCESSOR i686)
 endif()
 set(GMP_LIBRARY "BUILD" CACHE STRING "" FORCE)
 set(GMP_BUILD_CONFIG_ARGS "ABI=32" CACHE STRING "" FORCE)
 set(CMAKE_C_FLAGS "-m32" CACHE STRING "" FORCE)
 set(CMAKE_EXE_LINKER_FLAGS "-m32" CACHE STRING "" FORCE)
--- a/.cmake/bm.cmake
+++ b/.cmake/bm.cmake
@@ -0,0 +1,12 @@
 add_custom_target(bm
    COMMAND ${CMAKE_COMMAND} -E echo "Running all benchmarks..."
 )
 foreach(bm_bin ${BM_BINS})
    add_custom_command(
        TARGET bm
        POST_BUILD
        COMMAND $<TARGET_FILE:${bm_bin}>
        COMMENT "Running ${bm_bin}"
    )
 endforeach()
--- a/.cmake/flags.cmake
+++ b/.cmake/flags.cmake
@@ -24,19 +24,29 @@ if(MSVC)
 	endif()
 else()
 	set(STRICT_OPTIONS_CXX "${STRICT_OPTIONS_CXX} -std=c++14 -O2")
-	set(STRICT_OPTIONS_CPP "${STRICT_OPTIONS_CPP} -Wall -Wuninitialized -Wno-deprecated-declarations -Wno-missing-field-initializers")
+	set(STRICT_OPTIONS_CPP "${STRICT_OPTIONS_CPP} -Wall -Wuninitialized -Wno-deprecated-declarations -Wno-missing-field-initializers -Wno-unused-function -Wno-missing-braces")
-	if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+	if (CMAKE_BUILD_TYPE STREQUAL "Debug")
 		set(STRICT_OPTIONS_C "${STRICT_OPTIONS_C} -Og -g")
 	else()
 		set(STRICT_OPTIONS_C "${STRICT_OPTIONS_C} -O3")
 	endif()
-	set(STRICT_OPTIONS_C "${STRICT_OPTIONS_C} -std=c99 -Wno-error=strict-prototypes -fvisibility=hidden -funroll-loops -Wno-error=implicit-function-declaration -Wno-error=attributes")
+	set(STRICT_OPTIONS_C "${STRICT_OPTIONS_C} -std=c11 -Wno-error=strict-prototypes -fvisibility=hidden -funroll-loops -Wno-error=implicit-function-declaration -Wno-error=attributes")
 	if(CMAKE_C_COMPILER_ID MATCHES "Clang")
 		set(STRICT_OPTIONS_CPP "${STRICT_OPTIONS_CPP} -Wno-error=unknown-warning-option -Qunused-arguments -Wno-tautological-compare")
-		set(STRICT_OPTIONS_CPP "${STRICT_OPTIONS_CPP} -Wno-unused-function -Wno-pass-failed")
+		set(STRICT_OPTIONS_CPP "${STRICT_OPTIONS_CPP} -Wno-pass-failed")
 	endif()
 	if(ENABLE_STRICT)
-		set(STRICT_OPTIONS_C "${STRICT_OPTIONS_C} -Werror -Wextra -Wno-unused-parameter -fno-strict-aliasing")
+		set(STRICT_OPTIONS_C "${STRICT_OPTIONS_C} ${STRICT_OPTIONS_CPP} -Werror -Wextra -Wno-unused-parameter -fno-strict-aliasing")
 	endif()
 endif()
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${STRICT_OPTIONS_C}")
-#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${STRICT_OPTIONS_CXX} ${STRICT_OPTIONS_CPP}")
+
 if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
 	# enable link-time optimization (LTO)
 	include(CheckIPOSupported)
 	check_ipo_supported(RESULT result)
 	if(result)
 		set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
 	endif()
 endif()
--- a/.cmake/gmpconfig.cmake
+++ b/.cmake/gmpconfig.cmake
@@ -1,5 +1,16 @@
 if (GMP_LIBRARY STREQUAL "SYSTEM")
  # use system gmp version
  message(STATUS "Using system GMP")
-if (ENABLE_GMP_BUILD)
+  find_library(GMP gmp)
  find_path(GMP_INCLUDE gmp.h)
  add_library(GMP UNKNOWN IMPORTED)
  set_target_properties(GMP PROPERTIES
    IMPORTED_LOCATION ${GMP}
    INTERFACE_INCLUDE_DIRECTORIES ${GMP_INCLUDE}
  )
 elseif (GMP_LIBRARY STREQUAL "BUILD")
  # Download and build own libgmp version
  if (POLICY CMP0135)
    cmake_policy(SET CMP0135 NEW)
@@ -8,29 +19,70 @@ if (ENABLE_GMP_BUILD)
  option(ENABLE_GMP_STATIC "Option to statically link. Default is dynamic linking" OFF)
  if (ENABLE_GMP_STATIC)
    set(GMP_LIB_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX})
  else()
    set(GMP_LIB_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX})
  else()
    set(GMP_LIB_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX})
  endif()
-  message("${GMP_BUILD_CONFIG_ARGS}")
+  cmake_host_system_information(RESULT N QUERY NUMBER_OF_PHYSICAL_CORES)
  if (N EQUAL 0)
    # Choose a "safe" amount
    set(N 8)
  endif()
  set(GMP_PARALLEL_BUILD_ARGS -j${N}) 
  message(STATUS "Building GMP with additional options: ${GMP_BUILD_CONFIG_ARGS}")
  include(ExternalProject)
  find_program(MAKE_EXE NAMES make gmake nmake)
  set(libgmp_INSTALL_DIR "${CMAKE_BINARY_DIR}/libgmp")
  ExternalProject_Add(libgmp_external
    PREFIX ${libgmp_INSTALL_DIR}
-    URL               https://gmplib.org/download/gmp/gmp-6.2.1.tar.xz
+    URL               https://gmplib.org/download/gmp/gmp-6.3.0.tar.xz
-    URL_HASH          SHA256=fd4829912cddd12f84181c3451cc752be224643e87fac497b69edddadc49b4f2
+    URL_HASH          SHA256=a3c2b80201b89e68616f4ad30bc66aee4927c3ce50e33929ca819d5c43538898
    CONFIGURE_COMMAND ${libgmp_INSTALL_DIR}/src/libgmp_external/configure --prefix=${libgmp_INSTALL_DIR} ${GMP_BUILD_CONFIG_ARGS}
-    BUILD_COMMAND     ${MAKE_EXE} -j8
+    BUILD_COMMAND     ${MAKE_EXE} ${GMP_PARALLEL_BUILD_ARGS}
    INSTALL_COMMAND   ${MAKE_EXE} install
    BUILD_BYPRODUCTS  ${libgmp_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gmp${GMP_LIB_SUFFIX}
  )
-  set(GMP ${libgmp_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gmp${GMP_LIB_SUFFIX})
+  # Needed to avoid errors about missing directory when creating the GMP target
-  include_directories(${libgmp_INSTALL_DIR}/include)
+  file(MAKE_DIRECTORY ${libgmp_INSTALL_DIR}/include)
  if(ENABLE_GMP_STATIC)
    add_library(GMP STATIC IMPORTED)
    set_target_properties(GMP PROPERTIES
      IMPORTED_LOCATION ${libgmp_INSTALL_DIR}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}gmp${GMP_LIB_SUFFIX}
      INTERFACE_INCLUDE_DIRECTORIES ${libgmp_INSTALL_DIR}/include
    )
  else()
    add_library(GMP SHARED IMPORTED)
    set_target_properties(GMP PROPERTIES
      IMPORTED_LOCATION ${libgmp_INSTALL_DIR}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}gmp${GMP_LIB_SUFFIX}
      INTERFACE_INCLUDE_DIRECTORIES ${libgmp_INSTALL_DIR}/include
    )
  endif()
  add_dependencies(GMP libgmp_external)
 elseif (GMP_LIBRARY STREQUAL "MINI")
  # Use mini-gmp
  message(STATUS "Using mini-GMP")
  include(CheckTypeSize)
  add_library(GMP STATIC
    ${PROJECT_SOURCE_DIR}/src/mini-gmp/mini-gmp.c ${PROJECT_SOURCE_DIR}/src/mini-gmp/mini-gmp-extra.c)
  target_include_directories(GMP PRIVATE ${PROJECT_SOURCE_DIR}/src/common/generic/include) # for tutil.h
  target_include_directories(GMP INTERFACE ${PROJECT_SOURCE_DIR}/src/mini-gmp)
  set_source_files_properties(${PROJECT_SOURCE_DIR}/src/mini-gmp/mini-gmp.c PROPERTIES COMPILE_OPTIONS "-w")
  set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}/src/mini-gmp")
  set(CMAKE_EXTRA_INCLUDE_FILES "mini-gmp.h")
  check_type_size("mp_limb_t" MP_LIMB_T_BYTES)
  math(EXPR GMP_LIMB_BITS "${MP_LIMB_T_BYTES} * 8")
  add_compile_definitions(GMP_LIMB_BITS=${GMP_LIMB_BITS})
  add_compile_definitions(MINI_GMP)
 else()
-  # use system gmp version
+  message(FATAL_ERROR "Invalid choice for GMP_LIBRARY: ${GMP_LIBRARY}")
  find_library(GMP gmp)
  find_path(GMP_INCLUDE gmp.h)
  include_directories(${GMP_INCLUDE})
 endif()
--- a/.cmake/impl_type.cmake
+++ b/.cmake/impl_type.cmake
@@ -1,7 +1,15 @@
 get_filename_component(CCSD_NAME ${CMAKE_CURRENT_SOURCE_DIR} NAME)
 string(TOUPPER ${CCSD_NAME} CCSD_NAME_UPPER)
 if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/include)
    set(INC_${CCSD_NAME_UPPER}_GENERIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
 endif()
 if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CCSD_NAME}x)
    set(${CCSD_NAME_UPPER}_GENERIC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/${CCSD_NAME}x)
 endif()
 if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/broadwell AND SQISIGN_BUILD_TYPE MATCHES "broadwell")
    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/broadwell)
 elseif(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/arm64crypto AND SQISIGN_BUILD_TYPE MATCHES "arm64crypto")
    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/arm64crypto)
 elseif(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/opt AND SQISIGN_BUILD_TYPE MATCHES "opt")
    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/opt)
 elseif(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ref)
--- a/.cmake/sqisign_variant.cmake
+++ b/.cmake/sqisign_variant.cmake
@@ -1,6 +1,6 @@
 if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/generic)
    set(LIB_${CCSD_NAME_UPPER} sqisign_${CCSD_NAME}_generic CACHE INTERNAL "LIB")
-    set(INC_${CCSD_NAME_UPPER} ${CMAKE_CURRENT_SOURCE_DIR}/generic/include CACHE INTERNAL "LIB")
+    set(INC_${CCSD_NAME_UPPER} ${CMAKE_CURRENT_SOURCE_DIR}/generic/include CACHE INTERNAL "INC")
    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/generic)
    FOREACH(SVARIANT ${SVARIANT_S})
        string(TOUPPER ${SVARIANT} SVARIANT_UPPER)
--- a/.cmake/target.cmake
+++ b/.cmake/target.cmake
@@ -1,39 +1,100 @@
 # SPDX-License-Identifier: Apache-2.0
-if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm64")
+include(CheckTypeSize)
-    add_definitions(-DTARGET_ARM64)
+
-    add_definitions(-DRADIX_64)
+function(check_target_feature CODE RUN_RESULT)
-elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm")
+    set(TEMP_FILE "${CMAKE_BINARY_DIR}/check_target_feature.c")
-    add_definitions(-DTARGET_ARM)
+    file(WRITE
-    add_definitions(-DRADIX_32)
+        ${TEMP_FILE}
-elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
+        "int main(void) {
-    add_definitions(-DTARGET_AMD64)
+            ${CODE}
-    add_definitions(-DRADIX_64)
+            return 0;
-elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "i386" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "i686")
+        }")
-    add_definitions(-DTARGET_X86)
+
-    add_definitions(-DRADIX_32)
+    try_run(TEMP_RUN_RESULT TEMP_COMPILE_RESULT ${CMAKE_BINARY_DIR} ${TEMP_FILE})
-elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(s390x.*|S390X.*)")
+
-    add_definitions(-DTARGET_S390X)
+    set(${RUN_RESULT} ${TEMP_RUN_RESULT} PARENT_SCOPE)
-    add_definitions(-DTARGET_BIG_ENDIAN)
+    if (ARGC EQUAL 3)
-    add_definitions(-DRADIX_64)
+        set(${ARGV2} ${TEMP_COMPILE_RESULT} PARENT_SCOPE)
    endif()
    file(REMOVE ${TEMP_FILE})
 endfunction()
 if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm64")
    add_compile_definitions(TARGET_ARM64)
    set(RADIX 64)
    if (NOT APPLE)
        check_target_feature("asm volatile(\"mrs x0, PMCCNTR_EL0\" : : : \"x0\");" CYCCNT)
        if (CYCCNT STREQUAL "FAILED_TO_RUN")
            message(STATUS "Cycle counter not supported, reverting to fallback measurement")
            add_compile_definitions(NO_CYCLE_COUNTER)
        endif()
    endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm")
    add_compile_definitions(TARGET_ARM)
    set(RADIX 32)
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
    add_compile_definitions(TARGET_AMD64)
    set(RADIX 64)
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "i386" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "i686")
    add_compile_definitions(TARGET_X86)
    set(RADIX 32)
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(s390x.*|S390X.*)")
    add_compile_definitions(TARGET_S390X TARGET_BIG_ENDIAN)
    set(RADIX 64)
 else()
-    add_definitions(-DTARGET_OTHER)
+    add_compile_definitions(TARGET_OTHER)
-    add_definitions(-DRADIX_64)
+    set(RADIX 64)
    message("Warning: system architecture not detected, defaulting to 64 bit")
 endif()
-if (UNIX)
+if (NOT GF_RADIX STREQUAL "AUTO")
-    add_definitions(-DTARGET_OS_UNIX)
+    if (NOT((GF_RADIX EQUAL 64) OR (GF_RADIX EQUAL 32)))
        message(FATAL_ERROR "Currently supported options for GF_RADIX: 32 or 64. Aborting")
    endif()
    set(RADIX ${GF_RADIX})
 endif()
 if (NOT DEFINED SQISIGN_BUILD_TYPE)
    set(SQISIGN_BUILD_TYPE "ref")
 endif()
 if (RADIX EQUAL 32)
    if (${SQISIGN_BUILD_TYPE} MATCHES "broadwell")
        message(FATAL_ERROR "Broadwell implementation not supported in 32-bit build")
    endif()
 else()
-    add_definitions(-DTARGET_OS_OTHER)
+    # Testing for unsigned 128-bit integer support
    check_type_size("__uint128_t" uint128_t)
    if (${HAVE_uint128_t} AND (uint128_t EQUAL 16))
        add_compile_definitions(HAVE_UINT128)
    elseif(${SQISIGN_BUILD_TYPE} MATCHES "ref")
        message(WARNING "Compiler/platform does not support unsigned 128-bit integers, falling back to 32-bit build")
        set(RADIX 32)
    endif()
 endif()
 message(STATUS "Using ${RADIX}-bit radix for gf module")
 if (RADIX EQUAL 32)
    add_compile_definitions(RADIX_32)
 elseif (RADIX EQUAL 64)
    add_compile_definitions(RADIX_64)
 endif()
 if (UNIX)
    add_compile_definitions(TARGET_OS_UNIX)
 else()
    add_compile_definitions(TARGET_OS_OTHER)
 endif()
 set(C_OPT_FLAGS "")
-if ((NOT DEFINED SQISIGN_BUILD_TYPE))
+if (NOT DEFINED SQISIGN_TEST_REPS)
-  set(SQISIGN_BUILD_TYPE opt)
+    set(SQISIGN_TEST_REPS 10)
 endif()
-if ((NOT DEFINED SQISIGN_TEST_REPS))
+add_compile_definitions(SQISIGN_TEST_REPS=${SQISIGN_TEST_REPS})
  set(SQISIGN_TEST_REPS 1000)
 endif()
--- a/.dir-locals.el
+++ b/.dir-locals.el
@@ -1,16 +0,0 @@
 ;; Emacs style file
 ;;
 ;; Sets spaces-only indentation, 4-spaces tab stops, linux kernel
 ;; coding style
 (
 (nil . ((indent-tabs-mode . nil)
         (tab-width . 4)
         )
      )
 (c-default-style . ((c-mode . "linux")
                     ))
 (c-mode . ((c-file-style . "linux")
            (c-basic-offset . 4)
            )
         )
 )
--- a/.github/workflows/checks-daily.yml
+++ b/.github/workflows/checks-daily.yml
@@ -0,0 +1,19 @@
 name: Daily workflow for various checks
 on:
  schedule:
    - cron: '0 0 * * *'  # This cron expression means "run at midnight UTC every day"
 jobs:
  checks:
    runs-on: ubuntu-latest
    steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    - name: Install dependencies
      run: |
        sudo apt -y update && sudo apt -y install gcc cmake libgmp-dev scala
    - name: Check namespace
      run: scripts/check_namespace.sh
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -2,58 +2,98 @@ name: CMake
 on:
  push:
-    branches: [ '*' ]
+    branches: ["**"]
-  pull_request:
+#  pull_request:
-    branches: [ "main" ]
+#    branches: [ "main" ]
 env:
  # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
-  BUILD_TYPE: Release
+  BUILD_TYPE: Debug
 jobs:
  build:
    # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac.
    # You can convert this to a matrix build if you need cross-platform coverage.
    # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
    runs-on: ubuntu-latest
    strategy:
      matrix:
        platform: [x64, arm64]
        toolchain: [""]
        sqisign_build_type: [ref]
        sqisign_test_reps: [10]
        gf_radix: [AUTO, 32]
        enable_sign: [ON]
        gmp_library: [SYSTEM]
        include:
          - platform: x64
            toolchain: ""
            sqisign_build_type: broadwell
            sqisign_test_reps: 10
            gf_radix: AUTO
            enable_sign: ON
            gmp_library: SYSTEM
          - platform: x64
            toolchain: .cmake/32bit.cmake
            sqisign_build_type: ref
            sqisign_test_reps: 10
            gf_radix: 32
            enable_sign: ON
            gmp_library: BUILD # Redundant, as it's set in .cmake/32bit.cmake
          - platform: x64
            toolchain: ""
            sqisign_build_type: ref
            sqisign_test_reps: 10
            gf_radix: AUTO
            enable_sign: OFF
            gmp_library: SYSTEM
          - platform: x64
            toolchain: ""
            sqisign_build_type: ref
            sqisign_test_reps: 10
            gf_radix: AUTO
            enable_sign: ON
            gmp_library: MINI
    runs-on: [self-hosted, "${{ matrix.platform }}"]
    steps:
-    - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
-    - name: Install dependencies Valgrind, GMP, Doxygen, TeX
+      ##    - name: Install dependencies Valgrind, GMP, Doxygen, TeX, gcc-multilib
-      run: |
+      ##      run: |
-          sudo apt update && sudo apt --fix-missing install valgrind libgmp-dev doxygen texlive-xetex
+      ##          sudo apt update && sudo apt --fix-missing install valgrind libgmp-dev doxygen texlive-xetex gcc-multilib
-          echo "Valgrind installed"
+
      - name: Set up environment for ccache
        run: echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
      - name: Configure CMake
        # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
        # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
-      run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} -DSQISIGN_TEST_REPS=${{ matrix.sqisign_test_reps }}
+        run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} -DSQISIGN_TEST_REPS=${{ matrix.sqisign_test_reps }} -DGF_RADIX=${{ matrix.gf_radix }} -DENABLE_SIGN=${{ matrix.enable_sign }} -DGMP_LIBRARY=${{ matrix.gmp_library }} ${{ matrix.toolchain && format('-DCMAKE_TOOLCHAIN_FILE={0}', matrix.toolchain) || '' }} -G Ninja
      - name: Build
        # Build your program with the given configuration
-      run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
+        run: |
          if [ -n "${{ matrix.toolchain }}" ]; then
              cp ~/gmp-6.3.0.tar.xz ${{github.workspace}}/build/libgmp/src
          fi
          cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
      - name: Build documentation
        # Create html and latex documentation, TODO: do we need different docs for ref and opt?
        if: ${{ ((matrix.gf_radix != 32) && (matrix.enable_sign == 'ON') && (matrix.gmp_library == 'SYSTEM')) }}
        run: doxygen Doxyfile && cd latex && xelatex refman
      - name: Upload latex documentation
-      uses: actions/upload-artifact@v3
+        if: ${{ ((matrix.gf_radix != 32) && (matrix.enable_sign == 'ON') && (matrix.gmp_library == 'SYSTEM')) }}
        uses: actions/upload-artifact@v4
        with:
-        name: docs
+          name: docs-${{ matrix.platform }}-${{ matrix.sqisign_build_type }}
          path: latex/refman.pdf
      - name: Test
        working-directory: ${{github.workspace}}/build
        # Execute tests defined by the CMake configuration.
        # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
-      run: ctest -j4 -C ${{env.BUILD_TYPE}}
+        run: ctest -j16 -C ${{env.BUILD_TYPE}} -E "KAT$"
      - name: Examples
        working-directory: ${{github.workspace}}/build/apps
@@ -61,54 +101,80 @@ jobs:
          ./example_nistapi_lvl1
          ./example_nistapi_lvl3
          ./example_nistapi_lvl5
        if: ${{ matrix.enable_sign == 'ON' }}
-    - name: CT-Tests
+      - name: Release build & test
      # TODO: re-enable for those tests that should be ct
      if: false
        run: |
          rm -rf build
-          cmake -Bbuild -DENABLE_CT_TESTING=ON -DCMAKE_BUILD_TYPE=Debug -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} -DSQISIGN_TEST_REPS=${{ matrix.sqisign_test_reps }}
+          cmake -Bbuild -DCMAKE_BUILD_TYPE=Release -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} -DCMAKE_C_COMPILER=clang -DSQISIGN_TEST_REPS=1 -DGF_RADIX=${{ matrix.gf_radix }} -DENABLE_SIGN=${{ matrix.enable_sign }} -DGMP_LIBRARY=${{ matrix.gmp_library }} ${{ matrix.toolchain && format('-DCMAKE_TOOLCHAIN_FILE={0}', matrix.toolchain) || '' }} -G Ninja
          if [ -n "${{ matrix.toolchain }}" ]; then
              cp ~/gmp-6.3.0.tar.xz ${{github.workspace}}/build/libgmp/src
          fi
          cmake --build build
-          # valgrind --track-origins=yes build/
+          CTEST_OUTPUT_ON_FAILURE=1 ctest -j16 -V --test-dir build -E "KAT$"
          # valgrind --track-origins=yes build/
          # valgrind --track-origins=yes build/
          # valgrind --track-origins=yes build/
      - name: Memcheck
        run: |
          rm -rf build
-          cmake -Bbuild -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} -DSQISIGN_TEST_REPS=${{ matrix.sqisign_test_reps }}
+          cmake -Bbuild -DSQISIGN_TEST_REPS=1 -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} -DSQISIGN_TEST_REPS=1 -DGF_RADIX=${{ matrix.gf_radix }} -DENABLE_SIGN=${{ matrix.enable_sign }} -DGMP_LIBRARY=${{ matrix.gmp_library }} ${{ matrix.toolchain && format('-DCMAKE_TOOLCHAIN_FILE={0}', matrix.toolchain) || '' }} -G Ninja
          if [ -n "${{ matrix.toolchain }}" ]; then
              cp ~/gmp-6.3.0.tar.xz ${{github.workspace}}/build/libgmp/src
          fi
          cmake --build build
-          ctest -T memcheck --test-dir build
+          valgrind --error-exitcode=1 --max-stackframe=4116160 ./build/test/sqisign_test_scheme_lvl3
-      if: false
+          valgrind --error-exitcode=1 --max-stackframe=4116160 ./build/test/sqisign_test_scheme_lvl1
          valgrind --error-exitcode=1 --max-stackframe=4116160 ./build/test/sqisign_test_scheme_lvl5
        if: ${{ matrix.toolchain == '' && matrix.platform == 'arm64' && matrix.gf_radix == 'AUTO' }} 
      - name: Build shared libraries
        run: |
          rm -rf build
          cmake -Bbuild -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} -DGF_RADIX=${{ matrix.gf_radix }} -DENABLE_SIGN=${{ matrix.enable_sign }} -DGMP_LIBRARY=${{ matrix.gmp_library }} -DCMAKE_C_COMPILER=clang ${{ matrix.toolchain && format('-DCMAKE_TOOLCHAIN_FILE={0}', matrix.toolchain) || '' }} -G Ninja
          if [ -n "${{ matrix.toolchain }}" ]; then
              cp ~/gmp-6.3.0.tar.xz ${{github.workspace}}/build/libgmp/src
          fi
          cmake --build build
          find . -name '*.so' | grep so
          CTEST_OUTPUT_ON_FAILURE=1 ctest -j16 -V --test-dir build -E "KAT$"
      - name: Address Sanitizer ASAN
        run: |
          rm -rf build
-          cmake -Bbuild -DCMAKE_BUILD_TYPE=ASAN -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} -DCMAKE_C_COMPILER=clang -DSQISIGN_TEST_REPS=1
+          cmake -Bbuild -DCMAKE_BUILD_TYPE=ASAN -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} -DCMAKE_C_COMPILER=clang -DSQISIGN_TEST_REPS=1 -DGF_RADIX=${{ matrix.gf_radix }} -DENABLE_SIGN=${{ matrix.enable_sign }} -DGMP_LIBRARY=${{ matrix.gmp_library }} ${{ matrix.toolchain && format('-DCMAKE_TOOLCHAIN_FILE={0}', matrix.toolchain) || '' }} -G Ninja
          if [ -n "${{ matrix.toolchain }}" ]; then
              cp ~/gmp-6.3.0.tar.xz ${{github.workspace}}/build/libgmp/src
          fi
          cmake --build build
-          ctest -j4 -v --test-dir build
+          CTEST_OUTPUT_ON_FAILURE=1 ctest -j16 -V --test-dir build -E "KAT$"
      # MSAN needs instrumented gmp
      - name: Memory Sanitizer MSAN
        run: |
          rm -rf build
-          cmake -Bbuild -DCMAKE_BUILD_TYPE=MSAN -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} -DCMAKE_C_COMPILER=clang -DSQISIGN_TEST_REPS=1
+          cmake -Bbuild -DCMAKE_BUILD_TYPE=MSAN -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} -DCMAKE_C_COMPILER=clang -DSQISIGN_TEST_REPS=1 -DGF_RADIX=${{ matrix.gf_radix }} -DENABLE_SIGN=${{ matrix.enable_sign }} -DGMP_LIBRARY=${{ matrix.gmp_library }} ${{ matrix.toolchain && format('-DCMAKE_TOOLCHAIN_FILE={0}', matrix.toolchain) || '' }} -G Ninja
          if [ -n "${{ matrix.toolchain }}" ]; then
              cp ~/gmp-6.3.0.tar.xz ${{github.workspace}}/build/libgmp/src
          fi
          cmake --build build
-          ctest -j4 -v --test-dir build
+          CTEST_OUTPUT_ON_FAILURE=1 ctest -j16 -V --test-dir build -E "KAT$"
-
+        if: ${{ matrix.gmp_library == 'MINI' }}
      if: false
      - name: Leak Sanitizer LSAN
        run: |
          rm -rf build
-          cmake -Bbuild -DCMAKE_BUILD_TYPE=LSAN -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} -DCMAKE_C_COMPILER=clang -DSQISIGN_TEST_REPS=1
+          cmake -Bbuild -DCMAKE_BUILD_TYPE=LSAN -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} -DCMAKE_C_COMPILER=clang -DSQISIGN_TEST_REPS=1 -DGF_RADIX=${{ matrix.gf_radix }} -DENABLE_SIGN=${{ matrix.enable_sign }} -DGMP_LIBRARY=${{ matrix.gmp_library }} ${{ matrix.toolchain && format('-DCMAKE_TOOLCHAIN_FILE={0}', matrix.toolchain) || '' }} -G Ninja
          if [ -n "${{ matrix.toolchain }}" ]; then
              cp ~/gmp-6.3.0.tar.xz ${{github.workspace}}/build/libgmp/src
          fi
          cmake --build build
-          ctest -j4 -v --test-dir build
+          CTEST_OUTPUT_ON_FAILURE=1 ctest -j16 -V --test-dir build -E "KAT$"
      - name: Undefined Behavior Sanitizer UBSAN
        run: |
          rm -rf build
-          cmake -Bbuild -DCMAKE_BUILD_TYPE=UBSAN -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} -DCMAKE_C_COMPILER=clang -DSQISIGN_TEST_REPS=1
+          cmake -Bbuild -DCMAKE_BUILD_TYPE=UBSAN -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} -DCMAKE_C_COMPILER=clang -DSQISIGN_TEST_REPS=1 -DGF_RADIX=${{ matrix.gf_radix }} -DENABLE_SIGN=${{ matrix.enable_sign }} -DGMP_LIBRARY=${{ matrix.gmp_library }} ${{ matrix.toolchain && format('-DCMAKE_TOOLCHAIN_FILE={0}', matrix.toolchain) || '' }} -G Ninja
          if [ -n "${{ matrix.toolchain }}" ]; then
              cp ~/gmp-6.3.0.tar.xz ${{github.workspace}}/build/libgmp/src
          fi
          cmake --build build
-          ctest -j4 -v --test-dir build
+          CTEST_OUTPUT_ON_FAILURE=1 ctest -j16 -V --test-dir build -E "KAT$"
--- a/.github/workflows/daily.yml
+++ b/.github/workflows/daily.yml
@@ -0,0 +1,54 @@
 name: Benchmarks (Daily Workflow)
 on:
  schedule:
    - cron: '0 0 * * *'  # This cron expression means "run at midnight UTC every day"
 env:
  # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
  BUILD_TYPE: Release
 jobs:
  benchmarks:
    # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac.
    strategy:
      matrix:
        platform: [x64, arm64]
        toolchain: [""]
        sqisign_build_type: [ref]
        include:
          - platform: x64
            toolchain: ""
            sqisign_build_type: broadwell
            sqisign_test_reps: 10
          - platform: x64
            toolchain: .cmake/32bit.cmake
            sqisign_build_type: ref
            sqisign_test_reps: 10
    runs-on: [self-hosted, "${{ matrix.platform }}"]
    steps:
    - uses: actions/checkout@v4
    - name: Configure CMake
      # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
      # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
      run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} ${{ matrix.toolchain && format('-DCMAKE_TOOLCHAIN_FILE={0}', matrix.toolchain) || '' }}
    - name: Build
    # Build your program with the given configuration
      run: |
        if [ -n "${{ matrix.toolchain }}" ]; then
            cp ~/gmp-6.3.0.tar.xz ${{github.workspace}}/build/libgmp/src
        fi
        cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
    - name: Run benchmarks in apps folder
      run: |
        build/apps/benchmark_lvl1 50
        build/apps/benchmark_lvl3 20
        build/apps/benchmark_lvl5 10
    - name: Run Benchmarks (make bm)
      run: cd build && make bm
--- a/.github/workflows/daily_trigger.yml
+++ b/.github/workflows/daily_trigger.yml
@@ -0,0 +1,60 @@
 name: Benchmarks (Daily Workflow, manual trigger)
 on:
  workflow_dispatch:
    inputs:
      commit_sha:
        description: 'The commit SHA to run the workflow on'
        required: true
        type: string
 env:
  # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
  BUILD_TYPE: Release
 jobs:
  benchmarks:
    # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac.
    strategy:
      matrix:
        platform: [x64, arm64]
        toolchain: [""]
        sqisign_build_type: [ref]
        include:
          - platform: x64
            toolchain: ""
            sqisign_build_type: broadwell
            sqisign_test_reps: 10
          - platform: x64
            toolchain: .cmake/32bit.cmake
            sqisign_build_type: ref
            sqisign_test_reps: 10
    runs-on: [self-hosted, "${{ matrix.platform }}"]
    steps:
    - uses: actions/checkout@v4
      with:
        ref: ${{ github.event.inputs.commit_sha }}
    - name: Configure CMake
      # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
      # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
      run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} ${{ matrix.toolchain && format('-DCMAKE_TOOLCHAIN_FILE={0}', matrix.toolchain) || '' }}
    - name: Build
    # Build your program with the given configuration
      run: |
        if [ -n "${{ matrix.toolchain }}" ]; then
            cp ~/gmp-6.3.0.tar.xz ${{github.workspace}}/build/libgmp/src
        fi
        cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
    - name: Run benchmarks in apps folder
      run: |
        build/apps/benchmark_lvl1 50
        build/apps/benchmark_lvl3 20
        build/apps/benchmark_lvl5 10
    - name: Run Benchmarks (make bm)
      run: cd build && make bm
--- a/.github/workflows/kat.yml
+++ b/.github/workflows/kat.yml
@@ -0,0 +1,189 @@
 name: Known Answer Tests (KAT)
 on:
  workflow_dispatch:
    inputs:
      commit_sha:
        description: 'The commit SHA to run the workflow on'
        required: true
        type: string
 jobs:
  x86-KAT:
    # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac.
    strategy:
      matrix:
        compiler: [gcc]
        build_type: [Release]
        platform: [x64]
        #fast_math: [-ffast-math, -fno-fast-math]
        #fp_contract: [-ffp-contract=on, -ffp-contract=fast, -ffp-contract=off]
        fast_math: [""]
        fp_contract: [""]
        toolchain: ["", ".cmake/32bit.cmake"]
        sqisign_build_type: [ref]
        gf_radix: [AUTO, 32]
        gmp_library: [SYSTEM, MINI]
    runs-on: [self-hosted, "${{ matrix.platform }}"]
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ github.event.inputs.commit_sha }}
      ##    - name: Install dependencies Valgrind, GMP, Doxygen, TeX, gcc-multilib
      ##      run: |
      ##          sudo apt update && sudo apt --fix-missing install valgrind libgmp-dev doxygen texlive-xetex gcc-multilib
      - name: Configure CMake
        # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
        # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
        run: cmake -B ${{github.workspace}}/build -DCMAKE_C_COMPILER=${{matrix.compiler}} -DCMAKE_BUILD_TYPE=${{matrix.build_type}} -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} -DGF_RADIX=${{ matrix.gf_radix }} -DGMP_LIBRARY=${{ matrix.gmp_library }} ${{ matrix.toolchain && format('-DCMAKE_TOOLCHAIN_FILE={0}', matrix.toolchain) || '' }} -DCMAKE_C_FLAGS="${{ matrix.fast_math }} ${{ matrix.fp_contract }}"
      - name: Build
        # Build your program with the given configuration
        run: |
          if [ -n "${{ matrix.toolchain }}" ]; then
              cp ~/gmp-6.3.0.tar.xz ${{github.workspace}}/build/libgmp/src
          fi
          cmake --build ${{github.workspace}}/build --parallel 8 --config ${{matrix.build_type}}
      - name: Test
        working-directory: ${{github.workspace}}/build
        # Execute tests defined by the CMake configuration.
        # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
        run: ctest -R KAT$ -j3 -C ${{matrix.build_type}}
  broadwell-KAT:
    # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac.
    strategy:
      matrix:
        compiler: [clang, gcc]
        build_type: [Release]
        platform: [x64]
        #fast_math: [ON, OFF]
        #fp_contract: [ON, FAST, OFF]
        fast_math: [""]
        fp_contract: [""]
        toolchain: [""]
        sqisign_build_type: [broadwell]
        gf_radix: [AUTO]
        gmp_library: [SYSTEM, MINI]
    runs-on: [self-hosted, "${{ matrix.platform }}"]
    steps:
      - uses: actions/checkout@v4
        with:
          # Test the commit requested through the workflow_dispatch input,
          # matching what the x86-KAT job in this workflow checks out.
          ref: ${{ github.event.inputs.commit_sha }}
      ##    - name: Install dependencies Valgrind, GMP, Doxygen, TeX, gcc-multilib
      ##      run: |
      ##          sudo apt update && sudo apt --fix-missing install valgrind libgmp-dev doxygen texlive-xetex gcc-multilib
      - name: Configure CMake
        # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
        # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
        run: cmake -B ${{github.workspace}}/build -DCMAKE_C_COMPILER=${{matrix.compiler}} -DCMAKE_BUILD_TYPE=${{matrix.build_type}} -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} -DGF_RADIX=${{ matrix.gf_radix }} -DGMP_LIBRARY=${{ matrix.gmp_library }} ${{ matrix.toolchain && format('-DCMAKE_TOOLCHAIN_FILE={0}', matrix.toolchain) || '' }} -DCMAKE_C_FLAGS="${{ matrix.fast_math }} ${{ matrix.fp_contract }}"
      - name: Build
        # Build your program with the given configuration
        run: |
          if [ -n "${{ matrix.toolchain }}" ]; then
              cp ~/gmp-6.3.0.tar.xz ${{github.workspace}}/build/libgmp/src
          fi
          cmake --build ${{github.workspace}}/build --parallel 8 --config ${{matrix.build_type}}
      - name: Test
        working-directory: ${{github.workspace}}/build
        # Execute tests defined by the CMake configuration.
        # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
        run: ctest -R KAT$ -j3 -C ${{matrix.build_type}}
  arm64-KAT:
    # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac.
    strategy:
      matrix:
        compiler: [clang, gcc]
        build_type: [Release]
        platform: [arm64]
        #fast_math: [ON, OFF]
        #fp_contract: [ON, FAST, OFF]
        fast_math: [""]
        fp_contract: [""]
        toolchain: [""]
        sqisign_build_type: [ref]
        gf_radix: [AUTO]
        gmp_library: [SYSTEM, MINI]
    runs-on: [self-hosted, "${{ matrix.platform }}"]
    steps:
      - uses: actions/checkout@v4
        with:
          # Test the commit requested through the workflow_dispatch input,
          # matching what the x86-KAT job in this workflow checks out.
          ref: ${{ github.event.inputs.commit_sha }}
      ##    - name: Install dependencies Valgrind, GMP, Doxygen, TeX, gcc-multilib
      ##      run: |
      ##          sudo apt update && sudo apt --fix-missing install valgrind libgmp-dev doxygen texlive-xetex gcc-multilib
      - name: Configure CMake
        # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
        # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
        run: cmake -B ${{github.workspace}}/build -DCMAKE_C_COMPILER=${{matrix.compiler}} -DCMAKE_BUILD_TYPE=${{matrix.build_type}} -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} -DGF_RADIX=${{ matrix.gf_radix }} -DGMP_LIBRARY=${{ matrix.gmp_library }} ${{ matrix.toolchain && format('-DCMAKE_TOOLCHAIN_FILE={0}', matrix.toolchain) || '' }} -DCMAKE_C_FLAGS="${{ matrix.fast_math }} ${{ matrix.fp_contract }}"
      - name: Build
        # Build your program with the given configuration
        run: |
          if [ -n "${{ matrix.toolchain }}" ]; then
              cp ~/gmp-6.3.0.tar.xz ${{github.workspace}}/build/libgmp/src
          fi
          cmake --build ${{github.workspace}}/build --parallel 8 --config ${{matrix.build_type}}
      - name: Test
        working-directory: ${{github.workspace}}/build
        # Execute tests defined by the CMake configuration.
        # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
        run: ctest -R KAT$ -j3 -C ${{matrix.build_type}}
  DebugKAT:
    # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac.
    strategy:
      matrix:
        compiler: [clang, gcc]
        build_type: [Debug]
        platform: [x64]
        #fast_math: [OFF]
        #fp_contract: [FAST]
        fast_math: [""]
        fp_contract: [""]
        toolchain: [""]
        sqisign_build_type: [ref, broadwell]
        gf_radix: [AUTO]
        gmp_library: [SYSTEM]
    runs-on: [self-hosted, "${{ matrix.platform }}"]
    steps:
      - uses: actions/checkout@v4
        with:
          # Test the commit requested through the workflow_dispatch input,
          # matching what the x86-KAT job in this workflow checks out.
          ref: ${{ github.event.inputs.commit_sha }}
      ##    - name: Install dependencies Valgrind, GMP, Doxygen, TeX, gcc-multilib
      ##      run: |
      ##          sudo apt update && sudo apt --fix-missing install valgrind libgmp-dev doxygen texlive-xetex gcc-multilib
      - name: Configure CMake
        # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
        # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
        run: cmake -B ${{github.workspace}}/build -DCMAKE_C_COMPILER=${{matrix.compiler}} -DCMAKE_BUILD_TYPE=${{matrix.build_type}} -DSQISIGN_BUILD_TYPE=${{ matrix.sqisign_build_type }} -DGF_RADIX=${{ matrix.gf_radix }} -DGMP_LIBRARY=${{ matrix.gmp_library }} ${{ matrix.toolchain && format('-DCMAKE_TOOLCHAIN_FILE={0}', matrix.toolchain) || '' }} -DCMAKE_C_FLAGS="${{ matrix.fast_math }} ${{ matrix.fp_contract }}"
      - name: Build
        # Build your program with the given configuration
        run: |
          if [ -n "${{ matrix.toolchain }}" ]; then
              cp ~/gmp-6.3.0.tar.xz ${{github.workspace}}/build/libgmp/src
          fi
          cmake --build ${{github.workspace}}/build --parallel 8 --config ${{matrix.build_type}}
      - name: Test
        working-directory: ${{github.workspace}}/build
        # Execute tests defined by the CMake configuration.
        # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
        run: ctest -R KAT$ -j3 -C ${{matrix.build_type}}
--- a/.github/workflows/s390-daily.yml
+++ b/.github/workflows/s390-daily.yml
@@ -0,0 +1,33 @@
 name: Big-endian s390x test (Daily Workflow)
 on:
  schedule:
    - cron: '0 0 * * *'  # This cron expression means "run at midnight UTC every day"
 jobs:
  s390-be:
    strategy:
      matrix:
        BUILD_TYPE: [Debug, Release]
    runs-on: ubuntu-latest
    steps:
    # Use the same checkout action major version as the other workflows in this repository.
    - name: Checkout repository
      uses: actions/checkout@v4
    - name: Setup multiarch/qemu-user-static
      run: |
        docker run --rm --privileged multiarch/qemu-user-static:register --reset
    - name: Run build and tests in s390x container
      run: |
        docker run --rm -v ${{ github.workspace }}:/workspace multiarch/ubuntu-core:s390x-focal bash -c "
          set -x &&
          cd /workspace &&
          ls -l . &&
          uname -a &&
          lscpu | grep Endian &&
          apt -y update &&
          apt -y install cmake gcc libgmp-dev &&
          cmake -B build -DCMAKE_BUILD_TYPE=${{ matrix.BUILD_TYPE }} -DSQISIGN_BUILD_TYPE=ref &&
          cmake --build build --config ${{ matrix.BUILD_TYPE }} &&
          cd /workspace/build &&
          ctest -j4 -C ${{ matrix.BUILD_TYPE }}
        "
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
-build/
+build*/
 html/
 latex/
 .vscode
 *.DS_Store
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,7 @@
 repos:
 - repo: https://github.com/pre-commit/mirrors-clang-format
  rev: v18.1.6
  hooks:
  - id: clang-format
    types_or: [c]
    exclude: ^src/precomp/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,48 +1,59 @@
 # SPDX-License-Identifier: Apache-2.0
-cmake_minimum_required(VERSION 3.5)
+cmake_minimum_required(VERSION 3.13)
-project(SQIsign VERSION 1.0 LANGUAGES C ASM)
+project(SQIsign VERSION 2.0 LANGUAGES C ASM)
 set(SQISIGN_SO_VERSION "0")
-set(CMAKE_C_STANDARD 99)
+set(CMAKE_C_STANDARD 11)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 include(CTest)
-option(ENABLE_STRICT "Build with strict compile options." OFF)
+option(ENABLE_STRICT "Build with strict compile options." ON)
 option(ENABLE_TESTS  "Enable compilation of tests." ON)
 option(ENABLE_CT_TESTING  "Enable compilation for constant time testing." OFF)
-option(ENABLE_GMP_BUILD "Download and build external version of GMP" OFF)
+option(ENABLE_SIGN "Build with sign functionality" ON)
-option(ENABLE_DOC_TARGET "Enable building API documentation using doxygen" OFF)
+set(GMP_LIBRARY "SYSTEM" CACHE STRING "Which version of GMP to use: SYSTEM, BUILD or MINI")
 set(GF_RADIX "AUTO" CACHE STRING "Set the radix for the gf module (currently supported values: 32 or 64), or AUTO.")
 if (NOT DEFINED SQISIGN_BUILD_TYPE)
  SET(SQISIGN_BUILD_TYPE "ref")
 endif()
-if(SQISIGN_BUILD_TYPE STREQUAL "broadwell")
+if (${SQISIGN_BUILD_TYPE} MATCHES "ref")
-	SET(SVARIANT_S "lvl1")
+  add_compile_definitions(SQISIGN_BUILD_TYPE_REF SQISIGN_GF_IMPL_REF)
-else()
+elseif (${SQISIGN_BUILD_TYPE} MATCHES "opt")
-	SET(SVARIANT_S "lvl1;lvl3;lvl5")
+  add_compile_definitions(SQISIGN_BUILD_TYPE_OPT SQISIGN_GF_IMPL_REF)
 elseif (${SQISIGN_BUILD_TYPE} MATCHES "broadwell")
  add_compile_definitions(SQISIGN_BUILD_TYPE_BROADWELL SQISIGN_GF_IMPL_BROADWELL)
 elseif (${SQISIGN_BUILD_TYPE} MATCHES "arm64crypto")
  add_compile_definitions(SQISIGN_BUILD_TYPE_ARM64CRYPTO SQISIGN_GF_IMPL_REF)
 endif()
 SET(SVARIANT_S "lvl1;lvl3;lvl5")
 include(.cmake/flags.cmake)
 include(.cmake/sanitizers.cmake)
 include(.cmake/target.cmake)
-if(ENABLE_DOC_TARGET)
+if(ENABLE_SIGN)
-	include(.cmake/target_docs.cmake)
+	include(.cmake/gmpconfig.cmake)
 	add_compile_definitions(ENABLE_SIGN)
 endif()
 include(.cmake/gmpconfig.cmake)
 set(BM_BINS "" CACHE INTERNAL "List of benchmark binaries")
 set(SELECT_IMPL_TYPE ${PROJECT_SOURCE_DIR}/.cmake/impl_type.cmake)
 set(SELECT_SQISIGN_VARIANT ${PROJECT_SOURCE_DIR}/.cmake/sqisign_variant.cmake)
 set(INC_PUBLIC ${PROJECT_SOURCE_DIR}/include)
 add_subdirectory(src)
 add_subdirectory(apps)
 add_subdirectory(test)
-if(ENABLE_TESTS)
+include(.cmake/bm.cmake)
-	enable_testing()
+
-	add_subdirectory(test)
+#if(ENABLE_TESTS)
-endif()
+#    enable_testing()
 #    add_subdirectory(test)
 #endif()
--- a/COPYING.LGPL
+++ b/COPYING.LGPL
@@ -0,0 +1,165 @@
                   GNU LESSER GENERAL PUBLIC LICENSE
                       Version 3, 29 June 2007
 Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.
  This version of the GNU Lesser General Public License incorporates
 the terms and conditions of version 3 of the GNU General Public
 License, supplemented by the additional permissions listed below.
  0. Additional Definitions.
  As used herein, "this License" refers to version 3 of the GNU Lesser
 General Public License, and the "GNU GPL" refers to version 3 of the GNU
 General Public License.
  "The Library" refers to a covered work governed by this License,
 other than an Application or a Combined Work as defined below.
  An "Application" is any work that makes use of an interface provided
 by the Library, but which is not otherwise based on the Library.
 Defining a subclass of a class defined by the Library is deemed a mode
 of using an interface provided by the Library.
  A "Combined Work" is a work produced by combining or linking an
 Application with the Library.  The particular version of the Library
 with which the Combined Work was made is also called the "Linked
 Version".
  The "Minimal Corresponding Source" for a Combined Work means the
 Corresponding Source for the Combined Work, excluding any source code
 for portions of the Combined Work that, considered in isolation, are
 based on the Application, and not on the Linked Version.
  The "Corresponding Application Code" for a Combined Work means the
 object code and/or source code for the Application, including any data
 and utility programs needed for reproducing the Combined Work from the
 Application, but excluding the System Libraries of the Combined Work.
  1. Exception to Section 3 of the GNU GPL.
  You may convey a covered work under sections 3 and 4 of this License
 without being bound by section 3 of the GNU GPL.
  2. Conveying Modified Versions.
  If you modify a copy of the Library, and, in your modifications, a
 facility refers to a function or data to be supplied by an Application
 that uses the facility (other than as an argument passed when the
 facility is invoked), then you may convey a copy of the modified
 version:
   a) under this License, provided that you make a good faith effort to
   ensure that, in the event an Application does not supply the
   function or data, the facility still operates, and performs
   whatever part of its purpose remains meaningful, or
   b) under the GNU GPL, with none of the additional permissions of
   this License applicable to that copy.
  3. Object Code Incorporating Material from Library Header Files.
  The object code form of an Application may incorporate material from
 a header file that is part of the Library.  You may convey such object
 code under terms of your choice, provided that, if the incorporated
 material is not limited to numerical parameters, data structure
 layouts and accessors, or small macros, inline functions and templates
 (ten or fewer lines in length), you do both of the following:
   a) Give prominent notice with each copy of the object code that the
   Library is used in it and that the Library and its use are
   covered by this License.
   b) Accompany the object code with a copy of the GNU GPL and this license
   document.
  4. Combined Works.
  You may convey a Combined Work under terms of your choice that,
 taken together, effectively do not restrict modification of the
 portions of the Library contained in the Combined Work and reverse
 engineering for debugging such modifications, if you also do each of
 the following:
   a) Give prominent notice with each copy of the Combined Work that
   the Library is used in it and that the Library and its use are
   covered by this License.
   b) Accompany the Combined Work with a copy of the GNU GPL and this license
   document.
   c) For a Combined Work that displays copyright notices during
   execution, include the copyright notice for the Library among
   these notices, as well as a reference directing the user to the
   copies of the GNU GPL and this license document.
   d) Do one of the following:
       0) Convey the Minimal Corresponding Source under the terms of this
       License, and the Corresponding Application Code in a form
       suitable for, and under terms that permit, the user to
       recombine or relink the Application with a modified version of
       the Linked Version to produce a modified Combined Work, in the
       manner specified by section 6 of the GNU GPL for conveying
       Corresponding Source.
       1) Use a suitable shared library mechanism for linking with the
       Library.  A suitable mechanism is one that (a) uses at run time
       a copy of the Library already present on the user's computer
       system, and (b) will operate properly with a modified version
       of the Library that is interface-compatible with the Linked
       Version.
   e) Provide Installation Information, but only if you would otherwise
   be required to provide such information under section 6 of the
   GNU GPL, and only to the extent that such information is
   necessary to install and execute a modified version of the
   Combined Work produced by recombining or relinking the
   Application with a modified version of the Linked Version. (If
   you use option 4d0, the Installation Information must accompany
   the Minimal Corresponding Source and Corresponding Application
   Code. If you use option 4d1, you must provide the Installation
   Information in the manner specified by section 6 of the GNU GPL
   for conveying Corresponding Source.)
  5. Combined Libraries.
  You may place library facilities that are a work based on the
 Library side by side in a single library together with other library
 facilities that are not Applications and are not covered by this
 License, and convey such a combined library under terms of your
 choice, if you do both of the following:
   a) Accompany the combined library with a copy of the same work based
   on the Library, uncombined with any other library facilities,
   conveyed under the terms of this License.
   b) Give prominent notice with the combined library that part of it
   is a work based on the Library, and explaining where to find the
   accompanying uncombined form of the same work.
  6. Revised Versions of the GNU Lesser General Public License.
  The Free Software Foundation may publish revised and/or new versions
 of the GNU Lesser General Public License from time to time. Such new
 versions will be similar in spirit to the present version, but may
 differ in detail to address new problems or concerns.
  Each version is given a distinguishing version number. If the
 Library as you received it specifies that a certain numbered version
 of the GNU Lesser General Public License "or any later version"
 applies to it, you have the option of following the terms and
 conditions either of that published version or of any later version
 published by the Free Software Foundation. If the Library as you
 received it does not specify a version number of the GNU Lesser
 General Public License, you may choose any version of the GNU Lesser
 General Public License ever published by the Free Software Foundation.
  If the Library as you received it specifies that a proxy can decide
 whether future versions of the GNU Lesser General Public License shall
 apply, that proxy's public statement of acceptance of any version is
 permanent authorization for you to choose that version for the
 Library.
--- a/DEVELOPERS.md
+++ b/DEVELOPERS.md
@@ -4,107 +4,197 @@ Please read carefully before contributing to this repo.
 ## Code structure
-The code is split into the modules below:
+The source code is in the [`src/`](src) directory and it is split into the
-
+following modules:
 - `common`: common code for AES, SHAKE, (P)RNG, memory handling. Every
-  module that needs a hash function, seed expansion (e.g., KLPT),
+  module that needs a hash function, seed expansion,
  deterministic randomness for tests, should call this module.
- `uintbig`: multi-precision big integers.
+- `mp`: code for saturated-representation multiprecision arithmetic.
 - `gf`: GF(p^2) and GF(p) arithmetic.
 - `ec`: elliptic curves, isogenies and pairings. Everything that is
  purely finite-fieldy.
- `quaternion`: quaternion orders and ideals. This is, essentially,
+- `precomp`: constants and precomputed values.
-  replacing PARI/GP.
+- `quaternion`: quaternion orders and ideals.
- `klpt`: implementation of KLPT.
+- `hd`: code to compute (2,2)-isogenies in the theta model.
 - `id2iso`: code for Iso <-> Ideal.
- `util`: auxilary code shared among libraries.
+- `verification`: code for the verification protocol.
 - `signature`: code for the key generation and signature protocols.
-The sources for the modules are in [`src/`](src).  Each module is
+### Contents of a module
 structured as follows:
 Each module is comprised of *implementation types* and common code. An
 implementation type refers to the *portable optimized*, *portable reference* or
 any architecture-specific implementation of the module; a module must contain at
 least one implementation type. Each implementation type must be in its own
 directory within the directory of the module. The optimized and reference
 implementation types must be placed in the `opt` and `ref` directories,
 respectively; there is no rule for naming other architecture-specific
 implementations. Common code refers to optional generic code that is shared
 among all implementation types. Common code is placed in special directories
 within the directory of a module: header files in the `include` directory and
 source files in the `<module_name>x` directory, where `<module_name>` is the
 name of the module. An example of a module is given below:
 ```
-SQIsign
+src
-└── src
+└── <module_name>
-    └── <module_name>
+    ├── include
-        ├── <arch>
+    ├── <module_name>x
        │   ├── generic
        │   └── lvl1
    ├── opt
-        │   ├── generic
+    ├── ref
    └── <arch>
 ```
 where:
 - `<module_name>` is the name of the module.
 - `opt` and `ref` are the portable optimized and reference
  implementation types, respectively.
 - `<arch>` is an optional architecture-specific implementation type of the
  module (e.g., `broadwell` for code using assembly instructions
  specific to the Broadwell platform).
 - `include` contains header files common to all implementation types.
 - `<module_name>x` contains source files common to all implementation types
  (i.e., `opt`, `ref` and `<arch>`).
 Header files in the `include` directory above can be included by other modules
 and must contain extensive doxygen-formatted documentation describing the
 functions declared there; see [Documentation](#Documentation). Any
 implementation-type directory above is allowed to be a symlink; e.g., if a
 module has no separate optimized and reference implementation, then
 `opt` can be a symlink to `ref`.
 Similar to a module, each implementation type is comprised of implementation 
 *variants* and common code. A variant refers to either a *generic*
 implementation, an implementation whose parameters are defined by one of the
 NIST levels (i.e., 1, 3 or 5) or a variation of the latter. An implementation
 type must contain at least one variant. Each variant must be in its own
 directory within that of the implementation type. The generic variant must be
 placed in the `generic` directory and variants corresponding to NIST levels 1,
 3 and 5 are placed in the directories `lvl1`, `lvl3` and `lvl5`, respectively;
 there is no rule for naming the directory of a NIST variation, but
 implementors are encouraged to choose informative namings. Common code refers to
 optional variant-independent code that is shared among all variants of the same
 implementation type. Common code is placed in special directories within that of
 the implementation type: header files in the `include` directory and source
 files in the `lvlx` directory. Expanding on the example above, we show the
 details of its implementation types:
 ```
 src
 └── <module_name>
    ├── include
    ├── <module_name>x
    ├── opt
    │   ├── include
    │   ├── lvlx
    │   ├── lvl1
    │   ├── lvl1_var1
    │   ├── lvl3
    │   └── lvl5
-        └── ref
+    ├── ref
-            └── generic
+    │   ├── generic
    │   └── lvl1
    └── <arch>
        ├── include
        ├── lvlx
        ├── lvl3
        └── lvl5
 ```
 where:
 - `lvl1`, `lvl3`, `lvl5` are implementations of NIST levels 1, 3 and 5,
  respectively, for the corresponding implementation type.
 - `lvl1_var1` is a variation of `lvl1` for the `opt` implementation type (e.g.,
  using a different prime characteristic).
 - `opt/include` contains header files common to all variants in the `opt`
  implementation type (i.e., `lvl1`, `lvl1_var1`, `lvl3` and `lvl5`).
  Similarly, `<arch>/include` for all variants in the `<arch>` implementation
  type (i.e., `lvl3` and `lvl5`).
 - `opt/lvlx` contains source files common to all variants in the `opt`
  implementation type. Similarly, `<arch>/lvlx` for all variants in the `<arch>`
  implementation type.
 - `generic` contains a parameter-independent implementation of the `ref`
  implementation type.
- `<module_name>` is the name of the module.
+As the name suggests, the `generic` variant is a generic implementation which
- `<arch>` are optional architecture-specific implementations of the
+does not depend on the parameters defined by the NIST levels or any variation
-  module (e.g., `broadwell` for code using assembly instructions
+of these. If this directory is present, all other parameter-dependent
-  specific to the Broadwell platform).
+implementations are ignored and the `generic` implementation is built instead.
- `opt` and `ref` are the portable *optimized* and *reference*
+As with modules, header files in the `include` directory of an implementation
-  implementations.
+type (e.g., `opt/include` and `<arch>/include` above) can be included by other
- `lvl1`, `lvl3`, `lvl5` are parameter-dependent implementations of
+modules and must contain extensive doxygen-formatted documentation describing
-  the module, corresponding to NIST levels 1, 3 and 5, respectively.
+the functions declared there.
 - `lvl1_var1` is a variant of `lvl1`, e.g., using a different prime
  characteristic. The naming is free, and implementors are encouraged
  to choose more explicit naming, e.g., `lvl1_varp6983` for the
  variant using the `p6983` prime defined in the SQIsign AC20 variant.
 - `generic` is a parameter-independent implementation of the module.
  If no folder is named like the currently selected variant (see
  [Build](README.md#Build)), then this is compiled instead.
-Each of the folders above is allowed to be a symlink. E.g., if a
+Each implementation variant must be organized as follows:
-module has no separate optimized and reference implementation, then
+- Header files that can be included by other modules are placed in the `include`
-`opt` can be a symlink to `ref`. Other example: a module's code only
+  directory. These files must contain extensive doxygen-formatted documentation
-depends on the field size, but not the specific prime, then
+  describing the functions declared there.
-`lvl1_varp6983` could be a symlink to `lvl1`.
+- Source files of the implementation and their private internal header files are
-
+  placed directly in the implementation variant directory.
-### Contents of a module
+- Source files of unit tests and their private internal header files are placed
-
+  in the `test` directory. Refer to [Tests](#Tests) for instructions on how to
-The leaf folders described above should arrange code as described
+  write these.
 below.  We use the `generic` implementation of the `uintbig` module as
 an example.
 Common code (in `lvlx`) for all variants in an implementation type follows the
 same organization as above, with the exception that `lvlx` never contains an
 `include` directory. This role is taken by the `include` directory in the
 implementation type. Below is an example with the detailed organization of the
 common code and the `lvl1` variant for the `ref` implementation type of a
 module:
 ```
-generic
+<module_name>
-├── bench
+├──ref
-│   ├── CmakeLists.txt
+│  ├── include
-│   ├── bench1.c
+│  │   └── header_ref.h
-│   └── bench2.c
+│  ├──lvlx
 │  │  ├── test
 │  │  │   ├── test_internal_header_ref.h
 │  │  │   │   ...
 │  │  │   ├── test1_ref.c
 │  │  │   └── test2_ref.c
 │  │  ├── internal_header_ref.h
 │  │  ├── source1_ref.c
 │  │  └── source2_ref.c
 │  ├──lvl1
 │  │  ├── include
 │  │  │   └── header_ref_lvl1.h
 │  │  ├── test
 │  │  │   ├── test_internal_header_ref_lvl1.h
 │  │  │   │   ...
 │  │  │   ├── test1_ref_lvl1.c
 │  │  │   └── test2_ref_lvl1.c
 │  │  ├── internal_header_ref_lvl1.h
 │  │  ├── source1_ref_lvl1.c
 │  │  └── source2_ref_lvl1.c
 │  ├──lvl3
 │  └──lvl5
 ```
 Finally, common code for a module must be organized as follows:
 - Header files that can be included by other modules are placed in the `include`
  directory. As mentioned before, these files must contain extensive
  doxygen-formatted documentation describing the functions declared there.
 - Source files and their private internal header files are placed in the 
  `<module_name>x` directory.
 - Source files of unit tests and their private internal header files are placed
  in the `<module_name>x/test` directory. Again, refer to [Tests](#Tests) for
  instructions on how to write these.
 The example below shows the detailed organization of the common code of a
 module:
 ```
 <module_name>
 ├── include
-│   └── uintbig.h
+│   └── header.h
-├── test 
+├── <module_name>x
-│   ├── CmakeLists.txt
+│   ├── test
-│   ├── test1.c
+│   │   ├── test_internal_header.h
-│   └── test2.c
+│   │   │   ...
-├── CmakeLists.txt
+│   │   ├── test1.c
-├── internal_header.h
+│   │   └── test2.c
-├── soruce1.c
+│   ├── internal_header.h
-└── soruce2.c
+│   ├── source1.c
 │   └── source2.c
 ├── opt
 └── ref
 ```
 where:
 - `include/` shall contain a **unique header file** named
  `<module_name>.h`, where `<module_name>` is the name of the module.
  This header contains the public API of the module, and is the only
  header that can be included by other modules (e.g., via `#include
  <uintbig.h>`). These files must contain extensive doxygen-formatted
  documentation describing the module, see
  [Documentation](#Documentation).
 - `bench` and `test` contain one executable per file, containing,
  well, benchmarks and unit tests. Refer to [Benchmarks](#Benchmarks)
  and [Tests](#Tests) for instructions on how to write these.
 - Internal headers for the private use of the module, such as
  `internal_header.h` go to the root. Include these using `#include
  "internal_header.h"`.
 - The implementation of the module also goes into the root.
 ## Tests
 It is important to have extensive test coverage of the whole software.
@@ -113,27 +203,28 @@ to ensure consistency across the modules.
 ### Unit tests
-These go into `src/<module_name>/<ref|opt|...>/<generic|lvl1|...>/test/`.
+These go in the `src/<module_name>/<module_name>x/test` and 
-Refer to ... for an example of how to write tests.
+`src/<module_name>/<ref|opt|...>/<generic|lvlx|lvl1|...>/test/` directories.
 Refer to [`src/gf/gfx/test/test_fp.c`](src/gf/gfx/test/test_fp.c) for an example
 of how to write tests.
 ### Integration tests
-These go into `test/`.  Refer to
+These go in the `test/` directory. Refer to
 [`test/test_sqisign.c`](test/test_sqisign.c) for an example.
 ### Known Answer Tests (KAT)
 KATs help validate consistency across implementations. By ensuring
 that, e.g., the optimized and reference implementation produce the
-same signatures.
+same signatures. KATs are generated by executing `PQCgenKAT_sign_<level>` in
-
+the `apps` directory. KAT tests go in the `test/` directory.
 See [Known Answer Tests in README.md](README.md#known-answer-tests-kat).
 ## Benchmarks
-Benchmarks for a module go into
+Benchmarks for a module go in the same directories as for tests.
-`src/<module_name>/<ref|opt|...>/<generic|lvl1|...>/bench/`.  Global
+Global benchmarks go in the `apps` directory; e.g.,
-benchmarks go...
+[`apps/benchmark.c`](apps/benchmark.c).
 ## Documentation
@@ -145,9 +236,9 @@ All code should be extensively documented.  The public module headers
 CI automatically builds a PDF of the doc every time code is pushed.
 To download the PDF, go to
-[Actions](https://github.com/SQIsign/sqisign-nist/actions), click on
+[Actions](https://github.com/SQIsign/sqisign-nist2/actions), click on
 the workflow run you're interested in, then go to Artifacts -> docs
-(see figure).
+(see figure). PDFs are retained for 2 days.
 ![](https://user-images.githubusercontent.com/149199/231756751-0f2780f8-33fe-4db9-8800-b5f145423b65.png)
@@ -155,11 +246,11 @@ the workflow run you're interested in, then go to Artifacts -> docs
 Always work on topic branches, never push work in progress on the
 `main` branch.  Once a task / issue / work unit is completed, create a
-pull-request and ask your team leader for a review.
+pull-request and ask for at least one review.
 ## Coding style
- **C version**: All code must compile cleanly as *C99*, without
+- **C version**: All code must compile cleanly as *C11*, without
  emitting any warnings, using recent versions of GCC and clang.
 - **Names**: Externally visible functions and types should be prefixed
@@ -187,7 +278,23 @@ pull-request and ask your team leader for a review.
  Global *state* (modifiable global variables), on the other hand, is
  strictly forbidden.
- **Whitespace**: Try not to mix tabs and spaces. Line endings
+- **Formatting**: This project uses
-  should be UNIX-style (i.e., `\n` rather than `\r\n`). Whitespace
+  [`clang-format`](https://clang.llvm.org/docs/ClangFormat.html) to
-  characters at the end of a line, or by themselves on an otherwise
+  format the code.  From the root of the project run the following
-  empty line, are to be avoided.
+  command:
  ```
  find ./src -path ./src/precomp -prune -type f -o -iname '*.h' -o -iname '*.c' | xargs clang-format -i 
  ```
  to automatically format all appropriate files with `clang-format`.
  If you want, you can install a [pre-commit
  hook](https://pre-commit.com/) to ensure that your work is correctly
  formatted before pushing
  ```
  pre-commit install
  ```
  This will use the `.pre-commit-config.yaml` file.
--- a/KAT/PQCsignKAT_1138_lvl3.rsp
+++ b/KAT/PQCsignKAT_1138_lvl3.rsp
--- a/KAT/PQCsignKAT_1509_lvl5.rsp
+++ b/KAT/PQCsignKAT_1509_lvl5.rsp
--- a/KAT/PQCsignKAT_353_SQIsign_lvl1.req
+++ b/KAT/PQCsignKAT_353_SQIsign_lvl1.req
--- a/KAT/PQCsignKAT_353_SQIsign_lvl1.rsp
+++ b/KAT/PQCsignKAT_353_SQIsign_lvl1.rsp
--- a/KAT/PQCsignKAT_529_SQIsign_lvl3.req
+++ b/KAT/PQCsignKAT_529_SQIsign_lvl3.req
--- a/KAT/PQCsignKAT_529_SQIsign_lvl3.rsp
+++ b/KAT/PQCsignKAT_529_SQIsign_lvl3.rsp
--- a/KAT/PQCsignKAT_701_SQIsign_lvl5.req
+++ b/KAT/PQCsignKAT_701_SQIsign_lvl5.req
--- a/KAT/PQCsignKAT_701_SQIsign_lvl5.rsp
+++ b/KAT/PQCsignKAT_701_SQIsign_lvl5.rsp
--- a/KAT/PQCsignKAT_782_lvl1.rsp
+++ b/KAT/PQCsignKAT_782_lvl1.rsp
--- a/10
+++ b/10
@@ -1,4 +1,4 @@
-Copyright 2023 the SQIsign team. All rights reserved.
+Copyright 2023-2025 the SQIsign team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,3 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 The DPE Library is (C) 2004-2024 Patrick Pelissier, Paul Zimmermann,
 LORIA/INRIA, and licensed under the GNU Lesser General Public License,
 version 3. You may obtain a copy of the License at
    https://www.gnu.org/licenses/lgpl-3.0.en.html
 or in the file COPYING.LGPL.
--- a/README.md
+++ b/README.md
@@ -1,57 +1,70 @@
 # SQIsign
-This library is a C implementation of SQIsign, short for Short Quaternion and Isogeny Signature (from isogeny graphs of supersingular elliptic curves).
+This library is a C implementation of SQIsign.
 ## Requirements
- CMake (version 3.5 or later)
+- CMake (version 3.13 or later)
- C99-compatible compiler
+- C11-compatible compiler
- Valgrind (for dynamic testing)
+- GMP (version 6.0.0 or later)
- Clang static analyzer (version 10 or later, for static analysis)
+
- GMP (version 6.1.2 or later)
+### Pre-computation
 The constant values in the `src/precomp` directory were generated using the
 pre-computation scripts in the `scripts/precomp` directory. It is not necessary
 to execute these scripts to compile the project. The scripts have the following
 requirements:
 - [two-isogenies](https://github.com/ThetaIsogenies/two-isogenies)
  (`Theta-SageMath` version).
 - [deuring-2D](https://github.com/Jonathke/deuring-2D)
 ## Build
- `mkdir -p build`
+For a generic build
- `cd build`
+```
- `cmake -DSQISIGN_BUILD_TYPE=<ref/broadwell> ..`
+$ mkdir -p build
- `make`
+$ cd build
 $ cmake -DSQISIGN_BUILD_TYPE=ref ..
 $ make
 $ make test
 ```
 An optimized executable with debug code and assertions disabled can be built
 replacing the `cmake` command above by
 ```
 cmake -DSQISIGN_BUILD_TYPE=<ref/broadwell> -DCMAKE_BUILD_TYPE=Release ..
 ```
 ## Build options
 CMake build options can be specified with `-D<BUILD_OPTION>=<VALUE>`.
-### ENABLE_TESTS
+### SQISIGN_BUILD_TYPE
-Builds a test harness for the library, the default value is `ON`.
+Specifies the build type for which SQIsign is built. The currently supported values are:
 - `ref`: builds the plain reference implementation.
 - `opt`: builds the optimized implementation which is the same as the reference
  implementation.
 - `broadwell`: builds an additional optimized implementation targeting the Intel
  Broadwell architecture (and later). The optimizations are applied to the
  finite field arithmetic.
-### ENABLE_CT_TESTING
+### GMP_LIBRARY
-Builds the library with instrumentation for constant-time behavior testing, the default value is `OFF`. Valgrind development files are used for this build option.
+If set to `SYSTEM` (by default), the gmp library on the system is dynamically linked.
-### ENABLE_GMP_BUILD
+If set to `BUILD`, a custom gmp library is linked, which is built as part of the overall build process.
-
+In this case, the following further options are available:
 If set to `OFF` (by default), the gmp library on the system is dynamically linked.
 If set to `ON`, a custom gmp library is linked, which is built as part of the overall build process. 
 In the latter case, the following further options are available:
 - `ENABLE_GMP_STATIC`: Does static linking against gmp. The default is `OFF`.
 - `GMP_BUILD_CONFIG_ARGS`: Provides additional config arguments for the gmp build (for example `--disable-assembly`). By default, no config arguments are provided.
-### ENABLE_DOC_TARGET
+If set to `MINI`, the mini-gmp library is used, whose sources are included in the repository, in the folder `src/mini-gmp`. In this case, no copies of the full gmp library (system or custom-built) are required.
 If set to `ON`, a doc target is available that builds API documentation. Note that a doxygen installation is required if set to `ON`.
-The default is `OFF`.
+### ENABLE_SIGN
-### SQISIGN_BUILD_TYPE
+If set to `ON` (default), SQIsign is built with signature and verification functionality.
-
+If set to `OFF`, SQIsign is built with verification functionality only.
-Specifies the build type for which SQIsign is built. The currently supported flags are:
+In the latter case, GMP is no longer a dependency.
 - `ref`, which builds the plain C reference implementation.
 - `broadwell`, which builds an additional implementation with GF assembly optimized code for the Intel Broadwell architecture.
 ### SQISIGN_TEST_REPS
 Specifies and overrides the number of (self-)test repetitions to be run.
 ### CMAKE_BUILD_TYPE
@@ -66,183 +79,138 @@ Can be used to specify special build types. The options are:
 The default build type uses the flags `-O3 -Wstrict-prototypes -Wno-error=strict-prototypes -fvisibility=hidden -Wno-error=implicit-function-declaration -Wno-error=attributes`. (Notice that assertions remain enabled in this configuration, which harms performance.)
 ## Build artifacts
 The following libraries are built:
 - `libsqisign_common_sys.a`: library with common crypto - AES, Keccak and system random number generator.
 - `libsqisign_common_test.a`: library with common crypto for deterministic tests - AES, Keccak and CTR-DRBG PRNG.
 - `libsqisign_<level>.a`: library for `SQIsign_<level>`.
 - `libsqisign_<level>_test.a`: library for `SQIsign_<level>`, only for test, using the deterministic CTR-DRBG as backend.
 - `libsqisign_<level>_nistapi.a`: library for `SQIsign_<level>` against the NIST API.
 - `libsqisign_<level>_nistapi_test.a`: library for `SQIsign_<level>` against the NIST API. Only for test, using the deterministic CTR-DRBG as backend.
 - `libsqisign_gf_<level>.a`: gf sub-library, generic or for `<level>`
 - `libsqisign_ec_<level>.a`: ec sub-library, generic or for `<level>`
 - `libsqisign_klpt_<level>.a`: klpt sub-library, generic or for `<level>`
 - `libsqisign_intbig_generic.a`: intbig sub-library, generic
 - `libsqisign_quaternion_generic.a`: quaternion sub-library, generic
 - `libsqisign_id2iso_<level>.a`: id2iso sub-library, generic or for `<level>`
 The following test apps are built:
 - `sqisign_bench_<level>`: Benchmarking suites.
 - `sqisign_test_kat_<level>`: KAT test suites.
 - `sqisign_test_scheme_<level>`: Self-test suites.
 - `sqisign_test_prof_<level>`: Profiling suites.
 More apps are built in folder `build/apps`:
 - `PQCgenKAT_sign_<param>`: App for generating NIST KAT.
 - `example_nistapi_<param>`: Example app using the NIST API.
 ## Test
-In the build directory, run: `make test` or `ctest`.
+In the build directory, run `make test` or `ctest`.
 The test harness consists of the following units:
- KAT test: tests against the KAT files in the `KAT` folder - `SQIsign_<level>_KAT`
+- KAT test: `SQIsign_<level>_KAT` - tests against the KAT files in the `KAT`
- Self-tests: runs random self-tests (key-generation, signing and verifying) - `SQIsign_<level>_SELFTEST`
+  directory.
 - Self-tests: `SQIsign_<level>_SELFTEST` - runs random self-tests
  (key generation, signature and verification).
 - Sub-library specific unit-tests.
-Note that, ctest has a default timeout of 1500s, which is applied to all tests except the KAT tests. To override the default timeout, run `ctest --timeout <seconds>`.
+Note that `ctest` has a default timeout of 1500s, which is applied to all tests
 except the KAT tests. To override the default timeout, run
 `ctest --timeout <seconds>`.
 ## Known Answer Tests (KAT)
-KAT are available in folder `KAT`. They can be generated by running the apps built in the `apps` folder:
+KATs are available in the `KAT` directory. They can be generated by running the
-
+apps built in the `apps` directory:
- `apps/PQCgenKAT_sign_<level>`
+```
 apps/PQCgenKAT_sign_<level>
 ```
 A successful execution will generate the `.req` and `.rsp` files.
-A full KAT test is done as part of the test harness (see previous section).
+A full KAT test is done as part of the test harness (see the [Test](#test)
 section).
 ## Benchmarks
-A benchmarking suite is built and runs with the following command:
+A benchmarking suite is built and can be executed with the following command:
 ```
 apps/benchmark_<level> [--iterations=<iterations>]
 ```
 where `<level>` specifies the SQIsign parameter set and `<iterations>` is the
 number of iterations used for benchmarking; if the `--iterations` option is
 omitted, a default of 10 iterations is used.
- `test/sqisign_bench_<param> <runs>`, where params specifies the SQIsign parameter set and runs the number of benchmark runs.
+The benchmarks profile the key generation, signature and verification functions. The results are reported in CPU cycles if available on the host platform, and timing in nanoseconds otherwise.
 The benchmarks profile the `KeyGen`, `Sign` and `Verify` functions. The results are reported in CPU cycles if available on the host platform, and timing in nanoseconds otherwise.
 ## Examples
-Example code that demonstrates how to use SQIsign are available in the `apps` folder:
+Example code that demonstrates how to use SQIsign with the NIST API is available
-
+in `apps/example_nistapi.c`.
 - `apps/example_nistapi.c`: Example with the NIST API.
 ## Project Structure
-The SQIsign library consists of a number of sub-libraries used to implement the final SQIsign library.
+The source code consists of a number of sub-libraries used to implement the
 final SQIsign library:
 - `common`: common code for hash function, seed expansion, PRNG, memory handling.
 - `mp`: code for saturated-representation multiprecision arithmetic.
 - `gf`: GF(p^2) and GF(p) arithmetic.
 - `ec`: elliptic curves, isogenies and pairings. Everything that is purely
   finite-fieldy.
 - `precomp`: constants and precomputed values.
 - `quaternion`: quaternion orders and ideals.
 - `hd`: code to compute (2,2)-isogenies in the theta model.
 - `id2iso`: code for Ideal <-> Iso.
 - `verification`: code for the verification protocol.
 - `signature`: code for the key generation and signature protocols.
 The dependencies are depicted below.
 ```
-    ┌─┬──────┬─┐           ┌─┬────┬─┐            ┌─┬──────┬─┐
+ ┌─┬──────────┬─┐        ┌─┬──────────┬─┐      ┌─┬──────────┬─┐
-    │ ├──────┤ │           │ ├────┤ │            │ ├──────┤ │
+ │ ├──────────┤ │        │ ├──────────┤ │      │ ├──────────┤ │
-    │ │Keygen│ │           │ │Sign│ │            │ │Verify│ │
+ │ │  Keygen  │ │        │ │   Sign   │ │      │ │  Verify  │ │
-    │ ├──────┤ │           │ ├────┤ │            │ ├──────┤ │
+ │ ├──────────┤ │        │ ├──────────┤ │      │ ├──────────┤ │
-    └─┴───┬──┴─┘           └─┴─┬──┴─┘            └─┴───┬──┴─┘
+ └─┴────┬─────┴─┘        └─┴────┬─────┴─┘      └─┴────┬─────┴─┘
        │                       │                     │
        └──────────────────┐    │                     │
                           │    │                     │
-          ├────────────────────┼─────────────────┐     │
+┌─────────────────┐    ┌───▼────▼────────┐            │
 │                 │    │                 │            │
 │   Quaternions   ◄────┤  Ideal <-> Iso  ├────────┐   │
 │                 │    │                 │        │   │
 └────────┬────────┘    └────────┬────────┘        │   │
         │                      │                 │   │
         │                      │     ┌───────────────┘
         │                      │     │           │
-      ┌───▼──┐          ┌──────▼────────┐   ┌────▼─────▼───────────┐
+┌────────▼────────┐    ┌────────▼─────▼──┐    ┌───▼────────────┐
-      │ PRNG ◄────┬─────┤ Iso <-> Ideal ├───►   Elliptic Curves,   │
+│                 │    │                 │    │                │
-      └───▲──┘    │     └──────┬────────┘   │ Pairings & Isogenies │
+│ Multiprecision  │    │       2D        ├────► Precomputation │
-          │       │            │            └───▲──────┬───────────┘
+│ integers (GMP)  │    │    Isogenies    │    │                │
-          │       │            │                │      │
+│                 │    │                 │    │                │
-      ┌───┴──┐    │            │                │      │
+└─────────────────┘    └────────┬────────┘    └───▲────────────┘
-      │ KLPT ◄────┘            │     ┌──────────┘      │
+                                │                 │
-      └───┬──┘                 │     │                 │
+                                │                 │
-          │                    │     │                 │
+                                │                 │
-┌─────────▼─────────┐          │     │                 │
+                       ┌────────▼────────┐        │
-│ Quaternion orders │          │     │            ┌────▼───┐
+                       │                 │        │
-│     and ideals    │          │     │            │ GF(p²) │
+                       │ Elliptic curves ├────────┘
-└─────────┬─────────┘          │     │            └────┬───┘
+                       │   & isogenies   │
-          │           ┌─┬──────▼─────┴──┬─┐            │
+                       │                 │
-    ┌─────▼─────┐     │ ├───────────────┤ │      ┌─────▼─────┐
+                       └──┬───────────┬──┘
-    │ MP BigInt │     │ │Precomputations│ │      │ FP BigInt │
+                          │           │
-    └───────────┘     │ ├───────────────┤ │      └───────────┘
+                          │           │
-                      └─┴───────────────┴─┘                       
+                          │           │
              ┌───────────▼───┐   ┌───▼───────────┐
              │     GF(p)     │   │     Fixed     │
              │       &       │   │   precision   │
              │    GF(p^2)    │   │   integers    │
              └───────────────┘   └───────────────┘
 ```
-There are the following sub-libraries:
+## Cortex-M4 implementation
- `common`: common code for AES, SHAKE, (P)RNG, memory handling
+Verification routines are supported in 32-bit embedded architectures running on bare metal environments such as the ARM Cortex-M4, but they are not directly supported by the build system of the present repository. The [pqm4 project](https://github.com/mupq/pqm4) is supported for evaluating SQIsign verification in the ARM Cortex-M4.
 - `ec`: elliptic curves, isogenies and pairings
 - `gf`: GF(p^2) and GF(p) arithmetic, including FP BigInt
 - `id2iso`: code for Iso <-> Ideal
 - `klpt`: implementation of KLPT
 - `quaternion`: quaternion orders and ideals
 - `intbig`: multi-precision big integers
 - `precomp`: precomputed constants
 - `protocols`: protocol implementation
 pqm4 assumes that the full NIST API (keypair generation, signing and verification) is available. Since only verification is supported, the remaining routines are mocked and must meet certain constraints, such as sharing the public key, signing a prespecified message and being of a specific size used by the testing and benchmarking binaries of pqm4. Therefore, additional KATs must be generated specifically for pqm4, which is done by a dedicated KAT generator for pqm4, found in `apps/PQCgenKAT_sign_pqm4.c`.
-### Folder structure
+A copy of the most recent version of pqm4 as of the round 2 submission deadline, including an implementation of SQIsign verification generated directly from this repository using the procedure explained next, is made available [here](https://github.com/SQISign/the-sqisign-pqm4), in the `sqisign` branch.
-Folder levels after `src`:
+If changes are made to the library, the `scripts/gen_pqm4_sources.sh` shell script can be run, from the root folder of the repository, to generate a pqm4-compatible folder structure in `src/pqm4/sqisign_lvl{1,3,5}`, which can then be copied to the `crypto_sign` folder of pqm4. Note that the pqm4 KAT generator is automatically run by this script.
 ```
 SQIsign
 └── src
    ├── lib_1
    │   ├── broadwell
    │   │   ├── generic
    │   │   └── lvl1
    │   ├── opt
    │   │   ├── generic
    │   │   └── lvl1
    │   └── ref
    │       └── generic
    ├── lib_2
    │   ├── broadwell
    │   │   └── generic
    │   ├── opt
    │   │   └── generic
    │   └── ref
    │       └── generic
    └── lib_n
        ├── broadwell
        │   └── generic
        ├── opt
        │   └── generic
        └── ref
            └── generic
 ```
-Level 1: Library (e.g. quaternion). A `CMakeLists.txt` file with entry `include(${SELECT_IMPL_TYPE})` takes care of including the implementation Level 2.
+## Acknowledgements
-Level 2: Implementation type: reference C (ref), optimized C (opt), ASM-optimized (e.g. broadwell, neon, m4). A `CMakeLists.txt` file entry with `include(${SELECT_SQISIGN_VARIANT})` takes care of including the SQIsign variant.
+The reference implementation for finite field arithmetic (i.e., `src/gf/ref`)
-
+was generated using [modarith](https://github.com/mcarrickscott/modarith) by
-Level 3: SQIsign variant -> generic code or code for a specific parameter set (e.g. lvl1). 
+Michael Scott.
 Other folders:
 - `apps`: Applications: KAT generation application, examples
 - `include`: SQIsign public header files
 - `KAT`: Known Answer Test files
 - `test`: SQIsign test code
 ### Sub-library headers
 Sub-libraries can define their own headers, which may be different between the implementation types. These header files are used sub-library-internally and by other dependent sub-libraries. The convention is to put the headers in an `include` folder of the sub-library src directory. For example, `src/intbig/ref/generic/include/intbig.h`.
 ### Sub-library unit tests
 Sub-libraries can implement their own, self-contained unit tests. The convention is to put the unit tests in a `test` folder of the sub-library `src` directory. For example, `src/intbig/ref/generic/test/test_intbig.c`.
 ### Shared implementation types
 It is possible to share implementations between implementation types. For example, the broadwell optimized implementation might use the same code as the reference implementation except in the GF module.
 ## License
 SQIsign is licensed under Apache-2.0. See [LICENSE](LICENSE) and [NOTICE](NOTICE).
-Third party code is used in some test and common code files:
+Third party code is used in some files:
 - `src/common/aes_c.c`: MIT: "Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>"
- `src/common/fips202.c`: Public Domain
+- `src/common/fips202.c`: CC0: Copyright (c) 2023, the PQClean team
 - `src/common/randombytes_system.c`: MIT: Copyright (c) 2017 Daan Sprenkels <hello@dsprenkels.com>
- `apps/PQCgenKAT_sign.c`, `common/randombytes_ctrdrbg.c`, `test/test_kat.c`: by NIST (Public Domain)
+- `src/common/broadwell/{aes_ni.c, vaes256_key_expansion.S}`: Apache-2.0: Copyright 2019 Amazon.com, Inc.
 - `src/common/broadwell/ctr_drbg.c`: ISC: Copyright (c) 2017, Google Inc.
 - `src/mini-gmp/mini-gmp.c` and `src/mini-gmp/mini-gmp.h`: LGPLv3: Copyright 1991-1997, 1999-2022 Free Software Foundation, Inc.
 - `src/quaternion/ref/generic/dpe.h`: LGPLv3: Copyright (C) 2004-2024 Patrick Pelissier, Paul Zimmermann, LORIA/INRIA
 - `apps/PQCgenKAT_sign.c`, `apps/PQCgenKAT_sign_pqm4.c`, `src/common/ref/randombytes_ctrdrbg.c`, `test/test_kat.c`: by NIST (Public Domain)
--- a/apps/CMakeLists.txt
+++ b/apps/CMakeLists.txt
@@ -1,15 +1,58 @@
-# NIST KAT generation apps
+if (ENABLE_SIGN)
-foreach(SVARIANT ${SVARIANT_S})
+
    # NIST KAT generation apps
    foreach(SVARIANT ${SVARIANT_S})
        string(TOLOWER ${SVARIANT} SVARIANT_LOWER)
        add_executable(PQCgenKAT_sign_${SVARIANT_LOWER} PQCgenKAT_sign.c)
        target_link_libraries(PQCgenKAT_sign_${SVARIANT_LOWER} PRIVATE sqisign_${SVARIANT_LOWER}_test_nistapi)
        target_include_directories(PQCgenKAT_sign_${SVARIANT_LOWER} PRIVATE ../include)
-endforeach()
+        target_compile_definitions(PQCgenKAT_sign_${SVARIANT_LOWER} PUBLIC SQISIGN_VARIANT=${SVARIANT_LOWER})
    endforeach()
-# Examples with NIST API
+    # pqm4 KAT generation apps
-foreach(SVARIANT ${SVARIANT_S})
+    foreach(SVARIANT ${SVARIANT_S})
        string(TOLOWER ${SVARIANT} SVARIANT_LOWER)
        add_executable(PQCgenKAT_sign_pqm4_${SVARIANT_LOWER} PQCgenKAT_sign_pqm4.c)
        target_link_libraries(PQCgenKAT_sign_pqm4_${SVARIANT_LOWER} PRIVATE sqisign_${SVARIANT_LOWER}_test_nistapi)
        target_include_directories(PQCgenKAT_sign_pqm4_${SVARIANT_LOWER} PRIVATE ../include)
        target_compile_definitions(PQCgenKAT_sign_pqm4_${SVARIANT_LOWER} PUBLIC SQISIGN_VARIANT=${SVARIANT_LOWER})
    endforeach()
    # Examples with NIST API
    foreach(SVARIANT ${SVARIANT_S})
        string(TOLOWER ${SVARIANT} SVARIANT_LOWER)
        add_executable(example_nistapi_${SVARIANT_LOWER} example_nistapi.c)
        target_link_libraries(example_nistapi_${SVARIANT_LOWER} PRIVATE sqisign_${SVARIANT_LOWER}_nistapi)
        target_include_directories(example_nistapi_${SVARIANT_LOWER} PRIVATE ../include ../src/${SVARIANT_LOWER})
        target_compile_definitions(example_nistapi_${SVARIANT_LOWER} PUBLIC SQISIGN_VARIANT=${SVARIANT_LOWER})
        add_test(sqisign_test_nistapi_${SVARIANT_LOWER} example_nistapi_${SVARIANT_LOWER})
    endforeach()
    # Benchmarking tool
    foreach(SVARIANT ${SVARIANT_S})
        string(TOLOWER ${SVARIANT} SVARIANT_LOWER)
        add_executable(benchmark_${SVARIANT_LOWER} benchmark.c)
        target_link_libraries(benchmark_${SVARIANT_LOWER} PRIVATE sqisign_${SVARIANT_LOWER}_nistapi)
        target_include_directories(benchmark_${SVARIANT_LOWER} PRIVATE ../include ../src/common/generic/include)
        target_compile_definitions(benchmark_${SVARIANT_LOWER} PUBLIC SQISIGN_VARIANT=${SVARIANT_LOWER})
    endforeach()
    # Fuzzing tool -- signature generation
    foreach(SVARIANT ${SVARIANT_S})
        string(TOLOWER ${SVARIANT} SVARIANT_LOWER)
        add_executable(fuzz_sign_${SVARIANT_LOWER} fuzz_sign.c)
        target_link_libraries(fuzz_sign_${SVARIANT_LOWER} PRIVATE sqisign_${SVARIANT_LOWER}_nistapi)
        target_include_directories(fuzz_sign_${SVARIANT_LOWER} PRIVATE ../include)
        target_compile_definitions(fuzz_sign_${SVARIANT_LOWER} PUBLIC SQISIGN_VARIANT=${SVARIANT_LOWER})
    endforeach()
 endif()
 # Fuzzing tool -- signature verification
 foreach(SVARIANT ${SVARIANT_S})
    string(TOLOWER ${SVARIANT} SVARIANT_LOWER)
    add_executable(fuzz_verify_${SVARIANT_LOWER} fuzz_verify.c)
    target_link_libraries(fuzz_verify_${SVARIANT_LOWER} PRIVATE sqisign_${SVARIANT_LOWER}_nistapi)
    target_include_directories(fuzz_verify_${SVARIANT_LOWER} PRIVATE ../include ../src/precomp/ref/${SVARIANT_LOWER}/include)
    target_compile_definitions(fuzz_verify_${SVARIANT_LOWER} PUBLIC SQISIGN_VARIANT=${SVARIANT_LOWER})
 endforeach()
--- a/apps/PQCgenKAT_sign_pqm4.c
+++ b/apps/PQCgenKAT_sign_pqm4.c
@@ -0,0 +1,297 @@
 // pqm4 KAT generator
 // SPDX-License-Identifier: Apache-2.0 and Unknown
 /*
 NIST-developed software is provided by NIST as a public service. You may use,
 copy, and distribute copies of the software in any medium, provided that you
 keep intact this entire notice. You may improve, modify, and create derivative
 works of the software or any portion of the software, and you may copy and
 distribute such modifications or works. Modified works should carry a notice
 stating that you changed the software and should note the date and nature of any
 such change. Please explicitly acknowledge the National Institute of Standards
 and Technology as the source of the software.
 NIST-developed software is expressly provided "AS IS." NIST MAKES NO WARRANTY OF
 ANY KIND, EXPRESS, IMPLIED, IN FACT, OR ARISING BY OPERATION OF LAW, INCLUDING,
 WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A
 PARTICULAR PURPOSE, NON-INFRINGEMENT, AND DATA ACCURACY. NIST NEITHER REPRESENTS
 NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE UNINTERRUPTED OR
 ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST DOES NOT WARRANT OR MAKE
 ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE OR THE RESULTS THEREOF,
 INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY, RELIABILITY, OR
 USEFULNESS OF THE SOFTWARE.
 You are solely responsible for determining the appropriateness of using and
 distributing the software and you assume all risks associated with its use,
 including but not limited to the risks and costs of program errors, compliance
 with applicable laws, damage to or loss of data, programs or equipment, and the
 unavailability or interruption of operation. This software is not intended to be
 used in any situation where a failure could cause risk of injury or damage to
 property. The software developed by NIST employees is not subject to copyright
 protection within the United States.
 */
 #include "api.h"
 #include "rng.h"
 #include <ctype.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #define STRINGIFY2(x) #x
 #define STRINGIFY(x) STRINGIFY2(x)
 #define MAX_MARKER_LEN 50
 #define KAT_SUCCESS 0
 #define KAT_FILE_OPEN_ERROR -1
 #define KAT_DATA_ERROR -3
 #define KAT_CRYPTO_FAILURE -4
 #define NUM_KATS 2
 #define MAX_MSG_LEN 59
 /*
  * Writes the text of the generated api.h to fp.
  *
  * The key/signature sizes and the algorithm name are baked into the text at
  * compile time by stringifying CRYPTO_SECRETKEYBYTES, CRYPTO_PUBLICKEYBYTES,
  * CRYPTO_BYTES and SQISIGN_VARIANT.
  */
 void output_header(FILE *fp) {
  static const char api_header_text[] =
    "// SPDX-License-Identifier: Apache-2.0\n"
    "\n"
    "#ifndef api_h\n"
    "#define api_h\n"
    "\n"
    "#include <stddef.h>\n"
    "#include <sqisign_namespace.h>\n"
    "\n"
    "#define CRYPTO_SECRETKEYBYTES " STRINGIFY(CRYPTO_SECRETKEYBYTES) "\n"
    "#define CRYPTO_PUBLICKEYBYTES " STRINGIFY(CRYPTO_PUBLICKEYBYTES) "\n"
    "#define CRYPTO_BYTES " STRINGIFY(CRYPTO_BYTES) "\n"
    "\n"
    "#define CRYPTO_ALGNAME \"SQIsign_" STRINGIFY(SQISIGN_VARIANT) "\"\n"
    "\n"
    "SQISIGN_API\n"
    "int\n"
    "crypto_sign_keypair(unsigned char *pk, unsigned char *sk);\n"
    "\n"
    "SQISIGN_API\n"
    "int\n"
    "crypto_sign(unsigned char *sm, size_t *smlen,\n"
    "            const unsigned char *m, size_t mlen,\n"
    "            const unsigned char *sk);\n"
    "\n"
    "SQISIGN_API\n"
    "int\n"
    "crypto_sign_open(unsigned char *m, size_t *mlen,\n"
    "                 const unsigned char *sm, size_t smlen,\n"
    "                 const unsigned char *pk);\n"
    "\n"
    "#endif /* api_h */\n";
  /* sizeof counts the terminating NUL, which must not be written out. */
  fwrite(api_header_text, 1, sizeof(api_header_text) - 1, fp);
 }
 /*
  * Writes the text of the generated rng.h to fp: an include-guarded header
  * whose only content is an include of "randombytes.h".
  */
 void output_rng(FILE *fp) {
  static const char *const rng_lines[] = {
    "// SPDX-License-Identifier: Apache-2.0\n",
    "\n",
    "#ifndef rng_h\n",
    "#define rng_h\n",
    "\n",
    "#include \"randombytes.h\"\n",
    "\n",
    "#endif /* rng_h */\n",
  };
  for (size_t k = 0; k < sizeof(rng_lines) / sizeof(rng_lines[0]); k++) {
    fputs(rng_lines[k], fp);
  }
 }
 /*
  * Writes the opening of the generated pqm4_api.c to fp: its includes and the
  * SQISign_KAT_t record type. The buffer sizes inside the emitted struct are
  * taken from MAX_MSG_LEN, stringified at compile time.
  */
 void output_preamble(FILE *fp) {
  static const char preamble_text[] =
    "// SPDX-License-Identifier: Apache-2.0\n"
    "\n"
    "#include <api.h>\n"
    "#include <sig.h>\n"
    "#include <string.h>\n"
    "\n"
    "typedef struct {\n"
    "  size_t mlen;\n"
    "  char msg[" STRINGIFY(MAX_MSG_LEN) "];\n"
    "  size_t smlen;\n"
    "  char sm[" STRINGIFY(MAX_MSG_LEN) " + CRYPTO_BYTES];\n"
    "} SQISign_KAT_t;\n"
    "\n";
  /* sizeof counts the terminating NUL, which must not be written out. */
  fwrite(preamble_text, 1, sizeof(preamble_text) - 1, fp);
 }
 /*
  * Writes the public key to fp as the definition of a C array named
  * kat_<variant>_pk, one "0xNN, " item per byte.
  *
  * fp: destination stream
  * pk: public key; CRYPTO_PUBLICKEYBYTES bytes are read
  */
 void output_pk(FILE *fp, const unsigned char *pk) {
  const unsigned char *cursor = pk;
  int remaining = CRYPTO_PUBLICKEYBYTES;
  fprintf(fp, "const char kat_" STRINGIFY(SQISIGN_VARIANT) "_pk[CRYPTO_PUBLICKEYBYTES] = {\n  ");
  while (remaining-- > 0) {
    fprintf(fp, "0x%02X, ", *cursor++);
  }
  fputs("\n};\n\n", fp);
 }
 /* Writes `count` bytes to fp as consecutive "0xNN, " items (two upper-case hex digits). */
 static void emit_hex_bytes(FILE *fp, const unsigned char *bytes, unsigned long long count) {
  const unsigned char *const stop = bytes + count;
  while (bytes != stop) {
    fprintf(fp, "0x%02X, ", *bytes++);
  }
 }
 /*
  * Writes one designated-initializer entry ({ .mlen, .msg, .smlen, .sm }) to fp.
  *
  * fp:    destination stream
  * m:     message bytes; mlen bytes are printed
  * mlen:  message length; also printed as the numeric part of ".smlen = <mlen> + CRYPTO_BYTES"
  * sm:    signed-message bytes; smlen bytes are printed
  * smlen: number of bytes of sm to print
  */
 void output_message_signature(FILE *fp, const unsigned char *m, unsigned long long mlen, const unsigned char *sm, unsigned long long smlen) {
  fprintf(fp,
          "  {\n"
          "    .mlen = %llu,\n"
          "    .msg = { ",
          mlen);
  emit_hex_bytes(fp, m, mlen);
  fprintf(fp,
          "},\n"
          "    .smlen = %llu + CRYPTO_BYTES,\n"
          "    .sm = { ",
          mlen);
  emit_hex_bytes(fp, sm, smlen);
  fputs("},\n"
        "  },\n",
        fp);
 }
 /*
  * Writes the bodies of the generated pqm4 API functions to fp:
  *  - crypto_sign_keypair copies the embedded public key and zeroes the secret key;
  *  - crypto_sign looks up the embedded KAT whose message length equals mlen and
  *    copies out its signed message (returns 1 when no KAT matches);
  *  - crypto_sign_open forwards to sqisign_open.
  *
  * The emitted crypto_sign_keypair ends with "return 0;": it is declared to
  * return int, so falling off its end would leave callers with an indeterminate
  * status. The emitted crypto_sign_open only reads *mlen when mlen is non-NULL,
  * matching the NULL check it already performs before writing back.
  */
 void output_implementation(FILE *fp) {
  const char api[] =
    "int crypto_sign_keypair(unsigned char *pk, unsigned char *sk) {\n"
    "  memcpy(pk, kat_" STRINGIFY(SQISIGN_VARIANT) "_pk, CRYPTO_PUBLICKEYBYTES);\n"
    "  // We don't need the secret key\n"
    "  memset(sk, 0, CRYPTO_SECRETKEYBYTES);\n"
    "  return 0;\n"
    "}\n"
    "\n"
    "int crypto_sign(unsigned char *sm, size_t *smlen, const unsigned char *m,\n"
    "                size_t mlen, const unsigned char *sk) {\n"
    "  for (size_t i = 0; i < sizeof(kat_" STRINGIFY(SQISIGN_VARIANT) ") / sizeof(kat_" STRINGIFY(SQISIGN_VARIANT) "[0]); i++) {\n"
    "    if (mlen == kat_" STRINGIFY(SQISIGN_VARIANT) "[i].mlen) {\n"
    "      memcpy(sm, kat_" STRINGIFY(SQISIGN_VARIANT) "[i].sm, kat_" STRINGIFY(SQISIGN_VARIANT) "[i].smlen);\n"
    "      *smlen = kat_" STRINGIFY(SQISIGN_VARIANT) "[i].smlen;\n"
    "      return 0;\n"
    "    }\n"
    "  }\n"
    "\n"
    "  return 1;\n"
    "}\n"
    "\n"
    "int crypto_sign_open(unsigned char *m, size_t *mlen, const unsigned char *sm,\n"
    "                     size_t smlen, const unsigned char *pk) {\n"
    "  unsigned long long mlen_ull = mlen ? *mlen : 0;\n"
    "  int ret = sqisign_open(m, &mlen_ull, sm, smlen, pk);\n"
    "  if (mlen) {\n"
    "    *mlen = mlen_ull;\n"
    "  }\n"
    "  return ret;\n"
    "}\n";
  fputs(api, fp);
 }
 /*
  * Generates the pqm4 KAT shim for the variant selected at build time.
  *
  * Steps: seed the RNG with a fixed entropy input, generate one key pair, sign
  * the NUM_KATS all-zero messages (each under its own RNG seed), check that
  * every signed message opens back to the original, then write rng.h, api.h
  * and pqm4_api.c under src/pqm4/sqisign_<variant>/ref/.
  *
  * Returns KAT_SUCCESS, KAT_CRYPTO_FAILURE when a crypto call or the round-trip
  * check fails, or KAT_FILE_OPEN_ERROR when an output file cannot be opened.
  */
 int main(void) {
  // pqm4 KATs use only all-zeros messages, one of length 32 and another of length 59;
  // arrays whose dimension are the length of the message are declared the size of the
  // largest of the two, but for the former, only 32 out of 59 bytes are actually used
  unsigned char seed[NUM_KATS][48];
  unsigned char entropy_input[48];
  const unsigned char m[NUM_KATS][MAX_MSG_LEN] = { { 0 }, { 0 } };
  unsigned char sm[NUM_KATS][MAX_MSG_LEN + CRYPTO_BYTES] = { { 0 }, { 0 } };
  unsigned char m1[MAX_MSG_LEN];
  const unsigned long long mlen[NUM_KATS] = { 32, 59 };
  unsigned long long smlen[NUM_KATS], mlen1;
  unsigned char pk[CRYPTO_PUBLICKEYBYTES], sk[CRYPTO_SECRETKEYBYTES];
  int ret_val;
  for (int i = 0; i < 48; i++)
    entropy_input[i] = i;
  randombytes_init(entropy_input, NULL, 256);
  // Generate the keypair, shared between both KATs, as required for pqm4
  if ((ret_val = crypto_sign_keypair(pk, sk)) != 0) {
    printf("crypto_sign_keypair returned <%d>\n", ret_val);
    return KAT_CRYPTO_FAILURE;
  }
  // Choose one seed per KAT (for the 32-byte and 59-byte KATs)
  for (int i = 0; i < NUM_KATS; i++)
    randombytes(seed[i], 48);
  // Fill m1 with random bytes. Note that the memcmp check below for a valid signature
  // compares m1 with m[i], but since the KATs use all-zero messages, the comparison
  // may succeed even if m1 was untouched from a previous iteration. This ensures that
  // memcmp will fail in that case.
  randombytes(m1, MAX_MSG_LEN);
  for (int i = 0; i < NUM_KATS; i++) {
    randombytes_init(seed[i], NULL, 256);
    if ((ret_val = crypto_sign(sm[i], &smlen[i], m[i], mlen[i], sk)) != 0) {
      printf("crypto_sign returned <%d>\n", ret_val);
      return KAT_CRYPTO_FAILURE;
    }
    if ((ret_val = crypto_sign_open(m1, &mlen1, sm[i], smlen[i], pk)) != 0) {
      printf("crypto_sign_open returned <%d>\n", ret_val);
      return KAT_CRYPTO_FAILURE;
    }
    if (mlen[i] != mlen1) {
      printf(
          "crypto_sign_open returned bad 'mlen': Got <%llu>, expected <%llu>\n",
          mlen1, mlen[i]);
      return KAT_CRYPTO_FAILURE;
    }
    // Compare against the message of THIS KAT (m[i]), not the start of the whole array
    if (memcmp(m[i], m1, mlen[i])) {
      printf("crypto_sign_open returned bad 'm' value\n");
      return KAT_CRYPTO_FAILURE;
    }
    // Fill m1 with random bytes for the next iteration
    randombytes(m1, MAX_MSG_LEN);
  }
  // Output rng.h
  FILE *fp = fopen("src/pqm4/sqisign_" STRINGIFY(SQISIGN_VARIANT) "/ref/rng.h", "w");
  if (!fp) {
    printf("Couldn't open rng.h file for writing. Are you in the correct folder?\n");
    return KAT_FILE_OPEN_ERROR;
  }
  output_rng(fp);
  fclose(fp);
  // Output the header file
  fp = fopen("src/pqm4/sqisign_" STRINGIFY(SQISIGN_VARIANT) "/ref/api.h", "w");
  if (!fp) {
    printf("Couldn't open api.h file for writing. Are you in the correct folder?\n");
    return KAT_FILE_OPEN_ERROR;
  }
  output_header(fp);
  fclose(fp);
  // Output the implementation
  fp = fopen("src/pqm4/sqisign_" STRINGIFY(SQISIGN_VARIANT) "/ref/pqm4_api.c", "w");
  if (!fp) {
    printf("Couldn't open pqm4_api.c file for writing. Are you in the correct folder?\n");
    return KAT_FILE_OPEN_ERROR;
  }
  output_preamble(fp);
  output_pk(fp, pk);
  fprintf(fp, "const SQISign_KAT_t kat_" STRINGIFY(SQISIGN_VARIANT) "[%d] = {\n", NUM_KATS);
  for (int i = 0; i < NUM_KATS; i++) {
    output_message_signature(fp, m[i], mlen[i], sm[i], smlen[i]);
  }
  fprintf(fp, "};\n\n");
  output_implementation(fp);
  fclose(fp);
  return KAT_SUCCESS;
 }
--- a/apps/benchmark.c
+++ b/apps/benchmark.c
@@ -0,0 +1,124 @@
 // SPDX-License-Identifier: Apache-2.0
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
 #include <inttypes.h>
 #include <api.h>
 #include <rng.h>
 #include <bench.h>
 #include <bench_test_arguments.h>
 #if defined(TARGET_BIG_ENDIAN)
 #include <tutil.h>
 #endif
 /**
  * Benchmarks key generation, signing and verification.
  *
  * Allocates `runs` independent slots (public key, secret key, signed message,
  * 32-byte random message) and runs the three operations over them between the
  * BENCH_CODE_1 / BENCH_CODE_2 markers from bench.h.
  * NOTE(review): the markers presumably expand to a timed loop over an index
  * named `i` -- confirm in bench.h.
  *
  * Aborts if memory cannot be allocated, if randombytes() fails, if a signed
  * message has an unexpected length, or if verification fails.
  *
  * @param[in] runs Number of iterations per operation; nothing is done when 0
  */
 void
 bench(size_t runs)
 {
    const size_t m_len = 32;
    const size_t sm_len = CRYPTO_BYTES + m_len;
    // Zero iterations would need zero-length variable-length arrays below
    if (runs == 0)
        return;
    unsigned char *pkbuf = calloc(runs, CRYPTO_PUBLICKEYBYTES);
    unsigned char *skbuf = calloc(runs, CRYPTO_SECRETKEYBYTES);
    unsigned char *smbuf = calloc(runs, sm_len);
    unsigned char *mbuf = calloc(runs, m_len);
    // The slot pointers below are derived from these buffers, so none may be NULL
    if (!pkbuf || !skbuf || !smbuf || !mbuf) {
        fprintf(stderr, "bench: out of memory\n");
        abort();
    }
    unsigned char *pk[runs], *sk[runs], *sm[runs], *m[runs];
    for (size_t i = 0; i < runs; ++i) {
        pk[i] = pkbuf + i * CRYPTO_PUBLICKEYBYTES;
        sk[i] = skbuf + i * CRYPTO_SECRETKEYBYTES;
        sm[i] = smbuf + i * sm_len;
        m[i] = mbuf + i * m_len;
        if (randombytes(m[i], m_len))
            abort();
    }
    unsigned long long len;
    printf("%s (%zu iterations)\n", CRYPTO_ALGNAME, runs);
    BENCH_CODE_1(runs);
    crypto_sign_keypair(pk[i], sk[i]);
    BENCH_CODE_2("keypair");
    BENCH_CODE_1(runs);
    len = sm_len;
    crypto_sign(sm[i], &len, m[i], m_len, sk[i]);
    if (len != sm_len)
        abort();
    BENCH_CODE_2("sign");
    int ret;
    BENCH_CODE_1(runs);
    len = m_len;
    ret = crypto_sign_open(m[i], &len, sm[i], sm_len, pk[i]);
    if (ret)
        abort();
    BENCH_CODE_2("verify");
    free(pkbuf);
    free(skbuf);
    free(smbuf);
    free(mbuf);
 }
 /**
  * Entry point of the benchmarking tool.
  *
  * Recognised arguments: --help, --iterations=<n>, and a seed argument in the
  * form accepted by parse_seed(). When no seed is given, one is drawn with
  * randombytes_select(). The seed is printed, used to initialise the RNG, and
  * bench() is run with the requested number of iterations.
  *
  * @return int 0 after a benchmark run, 1 when usage is printed (--help or a
  *         non-positive iteration count)
  */
 int
 main(int argc, char *argv[])
 {
    uint32_t seed[12] = { 0 };
    int iterations = SQISIGN_TEST_REPS;
    int help = 0;
    int seed_set = 0;
 #ifndef NDEBUG
    fprintf(stderr,
            "\x1b[31mIt looks like SQIsign was compiled with assertions enabled.\n"
            "This will severely impact performance measurements.\x1b[0m\n");
 #endif
    for (int i = 1; i < argc; i++) {
        if (!help && strcmp(argv[i], "--help") == 0) {
            help = 1;
            continue;
        }
        if (!seed_set && !parse_seed(argv[i], seed)) {
            seed_set = 1;
            continue;
        }
        if (sscanf(argv[i], "--iterations=%d", &iterations) == 1) {
            continue;
        }
    }
    if (help || iterations <= 0) {
        printf("Usage: %s [--iterations=<iterations>] [--seed=<seed>]\n", argv[0]);
        // Report the built-in default, not `iterations`, which the command line
        // may already have overwritten (possibly with the invalid value)
        printf("Where <iterations> is the number of iterations used for benchmarking (if not "
               "present, uses the default: %d)\n",
               (int)SQISIGN_TEST_REPS);
        printf("Where <seed> is the random seed to be used; if not present, a random seed is "
               "generated\n");
        return 1;
    }
    if (!seed_set) {
        randombytes_select((unsigned char *)seed, sizeof(seed));
    }
    print_seed(seed);
 #if defined(TARGET_BIG_ENDIAN)
    // Byte-swap each seed word on big-endian targets before handing the raw bytes to the RNG
    for (int i = 0; i < 12; i++) {
        seed[i] = BSWAP32(seed[i]);
    }
 #endif
    randombytes_init((unsigned char *)seed, NULL, 256);
    cpucycles_init();
    bench(iterations);
    return 0;
 }
--- a/apps/example_nistapi.c
+++ b/apps/example_nistapi.c
@@ -4,11 +4,31 @@
 * An example to demonstrate how to use SQIsign with the NIST API.
 */
-#include <api.h>
+#include <inttypes.h>
 #include <mem.h>
 #include <string.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <stdint.h>
 #include <time.h>
 #include <api.h>
 #include <rng.h>
 #include <bench_test_arguments.h>
 #if defined(TARGET_BIG_ENDIAN)
 #include <tutil.h>
 #endif
 /**
  * Draws four bytes from randombytes() and assembles them into a 32-bit value,
  * first byte least significant. Aborts the program if randombytes() reports
  * an error.
  *
  * @return uint32_t the assembled value
  */
 static uint32_t rand_u32()
 {
    unsigned char raw[4];
    uint32_t value = 0;
    if (randombytes(raw, sizeof(raw)) != 0)
        abort();
    // Walk from the last byte down so that raw[0] ends up in the low 8 bits
    for (size_t k = sizeof(raw); k-- > 0;)
        value = (value << 8) | (uint32_t)raw[k];
    return value;
 }
 /**
 * Example for SQIsign variant:
@@ -18,18 +38,19 @@
 *
 * @return int return code
 */
-static int example_sqisign(void) {
+static int
 example_sqisign(void)
 {
-    unsigned long long msglen = 32;
+    unsigned long long msglen = rand_u32() % 100;
    unsigned long long smlen = CRYPTO_BYTES + msglen;
    unsigned char *pk  = calloc(CRYPTO_PUBLICKEYBYTES, 1);
    unsigned char *sk = calloc(CRYPTO_SECRETKEYBYTES, 1);
    unsigned char *pk = calloc(CRYPTO_PUBLICKEYBYTES, 1);
-    unsigned char *sig = calloc(smlen, 1);
+    unsigned char *sm = calloc(smlen, 1);
-    unsigned char msg[32] = { 0xe };
+    unsigned char msg[msglen], msg2[msglen];
    unsigned char msg2[32] = { 0 };
    printf("Example with %s\n", CRYPTO_ALGNAME);
@@ -37,53 +58,111 @@ static int example_sqisign(void) {
    int res = crypto_sign_keypair(pk, sk);
    if (res) {
        printf("FAIL\n");
        res = -1;
        goto err;
    } else {
        printf("OK\n");
    }
    // choose a random message
    for (size_t i = 0; i < msglen; ++i)
        msg[i] = rand_u32();
    printf("crypto_sign -> ");
-    res = crypto_sign(sig, &smlen, msg, msglen, sk);
+    res = crypto_sign(sm, &smlen, msg, msglen, sk);
    if (res) {
        printf("FAIL\n");
        res = -1;
        goto err;
    } else {
        printf("OK\n");
    }
    printf("crypto_sign_open (with correct signature) -> ");
-    res = crypto_sign_open(msg2, &msglen, sig, smlen, pk);
+    res = crypto_sign_open(msg2, &msglen, sm, smlen, pk);
-    if (res || memcmp(msg, msg2, msglen)) {
+    if (res || msglen != sizeof(msg) || memcmp(msg, msg2, msglen)) {
-        printf("FAIL\n");
+        printf("FAIL\n"); // signature was not accepted!?
        res = -1;
        goto err;
    } else {
        res = 0;
        printf("OK\n");
    }
    // fill with random bytes
    for (size_t i = 0; i < msglen; ++i)
        msg2[i] = rand_u32();
    // let's try a single bit flip
    size_t pos = rand_u32() % smlen;
    sm[pos / 8] ^= 1 << pos % 8;
    res = crypto_sign_open(msg2, &msglen, sm, smlen, pk);
    printf("crypto_sign_open (with altered signature) -> ");
-    sig[0] = ~sig[0];
+    if (!res) {
-    memset(msg2, 0, msglen);
+        printf("FAIL\n"); // signature was accepted anyway!?
    res = crypto_sign_open(msg2, &msglen, sig, smlen, pk);
    if (!res || !memcmp(msg, msg2, msglen)) {
        printf("FAIL\n");
        res = -1;
        goto err;
-    } else {
+    }
-        res = 0;
+    else {
        printf("OK\n");
        res = 0;
        if (msglen)
            printf("WARNING: verification failed but the message length was returned nonzero; misuse-prone API\n");
        unsigned char any = 0;
        for (size_t i = 0; i < msglen; ++i)
            any |= msg2[i];
        if (any)
            printf("WARNING: verification failed but the message buffer was not zeroed out; misuse-prone API\n");
    }
 err:
    free(pk);
    sqisign_secure_free(sk, CRYPTO_SECRETKEYBYTES);
-    free(sig);
+    free(pk);
    free(sm);
    return res;
 }
-int main(void) {
+int
 main(int argc, char *argv[])
 {
    uint32_t seed[12] = { 0 };
    int help = 0;
    int seed_set = 0;
    for (int i = 1; i < argc; i++) {
        if (!help && strcmp(argv[i], "--help") == 0) {
            help = 1;
            continue;
        }
        if (!seed_set && !parse_seed(argv[i], seed)) {
            seed_set = 1;
            continue;
        }
    }
    if (help) {
        printf("Usage: %s [--seed=<seed>]\n", argv[0]);
        printf("Where <seed> is the random seed to be used; if not present, a random seed is "
               "generated\n");
        return 1;
    }
    if (!seed_set) {
        randombytes_select((unsigned char *)seed, sizeof(seed));
    }
    print_seed(seed);
 #if defined(TARGET_BIG_ENDIAN)
    for (int i = 0; i < 12; i++) {
        seed[i] = BSWAP32(seed[i]);
    }
 #endif
    randombytes_init((unsigned char *)seed, NULL, 256);
    return example_sqisign();
 }
--- a/apps/fuzz_sign.c
+++ b/apps/fuzz_sign.c
@@ -0,0 +1,151 @@
 // SPDX-License-Identifier: Apache-2.0
 #include <mem.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <api.h>
 #include <rng.h>
 /**
  * Generates one seed testcase for the verification fuzzer.
  *
  * Creates a fresh key pair, signs a random 64-byte message and writes the
  * public key followed by the signed message to
  * testcases/<CRYPTO_ALGNAME>/signature<iter as 6 digits>.bin.
  *
  * @param[in] iter Testcase index; values above 999999 are rejected
  * @return int 0 on success, 1 on any failure (a diagnostic is printed to stderr)
  */
 static int example_sqisign(int iter) {
  int ret = 1;
  int name_len;
  unsigned long long msglen = 64;
  unsigned long long smlen = CRYPTO_BYTES + msglen;
  unsigned char *sk = calloc(CRYPTO_SECRETKEYBYTES, 1);
  unsigned char *pk = calloc(CRYPTO_PUBLICKEYBYTES, 1);
  unsigned char *sm = calloc(smlen, 1);
  unsigned char msg[msglen];
  // This string is larger than necessary, but gcc is not smart enough
  // to detect that iter < 1000000 in the snprintf call below
  char filename[sizeof("testcases/SQIsign_lvl1/signature4294967296.bin") + 1];
  FILE *f = NULL;
  // The crypto calls below write through these pointers
  if (!sk || !pk || !sm) {
    fprintf(stderr, "Out of memory\n");
    goto end;
  }
  if (iter > 999999) {
    fprintf(stderr, "Too many iterations: %d\n", iter);
    goto end;
  }
  if (crypto_sign_keypair(pk, sk)) {
    fprintf(stderr, "crypto_sign_keypair -> FAIL\n");
    goto end;
  }
  // choose a random message
  if (randombytes(msg, msglen)) {
    fprintf(stderr, "randombytes -> FAIL\n");
    goto end;
  }
  if (crypto_sign(sm, &smlen, msg, msglen, sk)) {
    fprintf(stderr, "crypto_sign -> FAIL\n");
    goto end;
  }
  name_len = snprintf(filename, sizeof(filename), "testcases/%s/signature%06d.bin",
                      CRYPTO_ALGNAME, iter);
  // A truncated name would silently write to the wrong file
  if (name_len < 0 || (size_t)name_len >= sizeof(filename)) {
    fprintf(stderr, "Testcase file name too long for %s\n", CRYPTO_ALGNAME);
    goto end;
  }
  f = fopen(filename, "wb");
  if (!f) {
    fprintf(stderr,
            "Can't open file: %s (have you created the testcases/%s folder?)\n",
            filename, CRYPTO_ALGNAME);
    goto end;
  }
  if (fwrite(pk, CRYPTO_PUBLICKEYBYTES, 1, f) != 1) {
    fprintf(stderr, "Error writing public key to file\n");
    goto end;
  }
  if (fwrite(sm, smlen, 1, f) != 1) {
    fprintf(stderr, "Error writing signature to file\n");
    goto end;
  }
  ret = 0;
 end:
  // fclose flushes buffered data, so a failure here means the testcase is incomplete
  if (f && fclose(f) != 0) {
    fprintf(stderr, "Error closing file: %s\n", filename);
    ret = 1;
  }
  free(sk);
  free(pk);
  free(sm);
  return ret;
 }
 // Brief fuzzing tutorial (assumes level 1, but works for other levels)
 // Assumes an Intel Linux system
 //
 // 0. Some OS configurations required for AFL to work:
 //    echo core | sudo tee /proc/sys/kernel/core_pattern
 //    echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
 //
 // 1. Install AFL++ (https://aflplus.plus) -- on Ubuntu, install packages
 //    afl++, clang, lld, tmux
 //
 // 2. Configure CMake using AFL's compilers:
 //    AFL_HARDEN=1 cmake -DCMAKE_C_COMPILER=afl-clang-lto <other params>
 //
 // 3. Build
 //
 // 4. cd to the build/apps folder
 //
 // 5. Create required folders:
 //    mkdir -p testcases/SQIsign_lvl{1,3,5}
 //
 // 6. Run ./fuzz_sign_lvl1 to create some initial testcases
 //
 // 7. Run:
 //    tmux new-session -s afl1 afl-fuzz -i testcases/SQIsign_lvl1/ -o syncdir/ -D -M fuzz1 -- ./fuzz_verify_lvl1
 //
 // 8. Optionally run using other cores in the machine (e.g. for 24 cores),
 //    for i in $(seq 2 24)
 //    do
 //      tmux new-session -s afl$i -d afl-fuzz -d -i testcases/SQIsign_lvl1/ -o syncdir/ -S fuzz$i -- ./fuzz_verify_lvl1
 //    done
 //
 // 9. Attach to a specific instance by running:
 //    tmux attach -t afl$i
 //
 // 10. To get summary statistics for all runs, run:
 //     afl-whatsup syncdir/
 //
 // 11. "Interesting" signatures, in a binary format understood by
 //     fuzz_verify_lvl1, will be found in syncdir/fuzz$i/crashes; to
 //     reproduce the crash, pipe one of these files to fuzz_verify_lvl1
 /**
  * Entry point of the testcase generator.
  *
  * Seeds the RNG from randombytes_select() and generates `testcases` seed files
  * (default 10; overridden by a single --testcases=<n> argument).
  *
  * @return int 0 when every testcase was written, 1 if seeding or any testcase
  *         failed
  */
 int
 main(int argc, char *argv[]) {
  int testcases = 10;
  unsigned char seed[48];
  if (randombytes_select(seed, sizeof(seed)) != 0) {
    fprintf(stderr, "Could not obtain a random seed\n");
    return 1;
  }
  randombytes_init(seed, NULL, 256);
  if (argc == 2) {
    // An unrecognised argument keeps the default, but no longer silently
    if (sscanf(argv[1], "--testcases=%d", &testcases) != 1) {
      fprintf(stderr, "Ignoring unrecognised argument: %s (expected --testcases=<n>)\n",
              argv[1]);
    }
  }
  for (int i = 0; i < testcases; ++i) {
    // Stop at the first failure so that the exit status reflects it
    if (example_sqisign(i) != 0) {
      return 1;
    }
  }
  return 0;
 }
--- a/apps/fuzz_verify.c
+++ b/apps/fuzz_verify.c
@@ -0,0 +1,117 @@
 // SPDX-License-Identifier: Apache-2.0
 #include <mem.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <api.h>
 #include <rng.h>
 #include "encoded_sizes.h"
 // One corpus entry, laid out in the order load_signature() reads it from a
 // testcase file: the public key, then the signed message.
 typedef struct {
  unsigned char pk[CRYPTO_PUBLICKEYBYTES];
  // CRYPTO_BYTES plus 64 bytes; 64 is the message length used by verify_signature()
  unsigned char sm[CRYPTO_BYTES + 64];
 } signature_t;
 // Terminates the process abnormally so that the fuzzer records the current
 // input as a crash. abort() is used instead of writing through a null pointer:
 // that write is undefined behaviour, and a compiler is allowed to delete it,
 // in which case nothing would be reported.
 static void crash() {
  abort();
 }
 // Reads corpus entry number `iter` from
 // testcases/<CRYPTO_ALGNAME>/signature<iter as 6 digits>.bin into *sig:
 // first the public key, then the signed message.
 // Returns 0 on success, 1 if the file cannot be opened or is too short
 // (a diagnostic is printed to stderr).
 static int load_signature(signature_t *sig, int iter) {
  char path[sizeof("testcases/SQIsign_lvl1/signature000000.bin")];
  int status = 1;
  snprintf(path, sizeof(path), "testcases/%s/signature%06d.bin", CRYPTO_ALGNAME, iter);
  FILE *in = fopen(path, "rb");
  if (in == NULL) {
    fprintf(stderr, "Can't open file: %s\n", path);
    return status;
  }
  if (fread(sig->pk, CRYPTO_PUBLICKEYBYTES, 1, in) != 1) {
    fprintf(stderr, "Can't read public key from file: %s\n", path);
  } else if (fread(sig->sm, CRYPTO_BYTES + 64, 1, in) != 1) {
    fprintf(stderr, "Can't read signature from file: %s\n", path);
  } else {
    status = 0;
  }
  fclose(in);
  return status;
 }
 // Reads one public key and one signed message from stdin, verifies it, and
 // crashes deliberately when the verdict disagrees with the corpus:
 //  - the input is rejected although it is identical to a corpus entry, or
 //  - the input is accepted although it is not identical to any corpus entry.
 // An input counts as a corpus entry only when BOTH its public key and its
 // signed message match that entry; matching on either one alone would flag
 // every mutation that leaves the public key untouched.
 //
 // corpus:    known-good entries loaded at start-up
 // testcases: number of entries in corpus
 static void verify_signature(signature_t corpus[], int testcases) {
  unsigned long long msglen = 64;
  unsigned long long smlen = CRYPTO_BYTES + msglen;
  unsigned char *pk = calloc(CRYPTO_PUBLICKEYBYTES, 1);
  unsigned char *sm = calloc(smlen, 1);
  unsigned char msg[msglen];
  int in_corpus = 0;
  int accepted;
  if (!pk || !sm) {
    fprintf(stderr, "Out of memory\n");
    goto done;
  }
  if (fread(pk, CRYPTO_PUBLICKEYBYTES, 1, stdin) != 1) {
    fprintf(stderr, "Error reading public key from stdin\n");
    goto done;
  }
  if (fread(sm, smlen, 1, stdin) != 1) {
    fprintf(stderr, "Error reading signature from stdin\n");
    goto done;
  }
  // Compare before verifying: crypto_sign_open may update msglen
  for (int i = 0; i < testcases; ++i) {
    if (!memcmp(pk, corpus[i].pk, CRYPTO_PUBLICKEYBYTES) && !memcmp(sm, corpus[i].sm, smlen)) {
      in_corpus = 1;
      break;
    }
  }
  // Accepted means: verification succeeded, the recovered length is the full
  // message length, and the recovered message equals the one embedded in sm
  accepted = crypto_sign_open(msg, &msglen, sm, smlen, pk) == 0 && msglen == sizeof(msg) &&
             memcmp(msg, sm + SIGNATURE_BYTES, msglen) == 0;
  if (accepted != in_corpus)
    crash();
 done:
  free(pk);
  free(sm);
 }
 // Entry point of the verification fuzzer.
 //
 // Loads `testcases` corpus entries (default 10; overridden by a single
 // --testcases=<n> argument) and then verifies input read from stdin, repeatedly
 // when built with AFL persistent mode (__AFL_LOOP).
 //
 // Returns 0 after verification ran, 1 if the testcase count is not positive or
 // a corpus entry could not be loaded.
 int
 main(int argc, char *argv[]) {
  int testcases = 10;
  if (argc == 2) {
    sscanf(argv[1], "--testcases=%d", &testcases);
  }
  // The count sizes the array below, so it must be strictly positive
  if (testcases <= 0) {
    fprintf(stderr, "Invalid number of testcases: %d\n", testcases);
    return 1;
  }
  signature_t corpus[testcases];
  for (int i = 0; i < testcases; ++i)
    // load_signature() returns 0 on success and 1 on failure
    if (load_signature(&corpus[i], i) != 0)
      return 1;
 #ifdef __AFL_LOOP
  while (__AFL_LOOP(1000))
    verify_signature(corpus, testcases);
 #else
  verify_signature(corpus, testcases);
 #endif
  return 0;
 }
--- a/include/mem.h
+++ b/include/mem.h
@@ -3,6 +3,7 @@
 #ifndef MEM_H
 #define MEM_H
 #include <stddef.h>
 #include <sqisign_namespace.h>
 /**
 * Clears and frees allocated memory.
--- a/include/rng.h
+++ b/include/rng.h
@@ -3,6 +3,8 @@
 #ifndef rng_h
 #define rng_h
 #include <sqisign_namespace.h>
 /**
 * Randombytes initialization.
 * Initialization may be needed for some random number generators (e.g. CTR-DRBG).
@@ -11,10 +13,22 @@
 * @param[in] personalization_string Personalization string
 * @param[in] security_strength Security strength
 */
 SQISIGN_API
 void randombytes_init(unsigned char *entropy_input,
                      unsigned char *personalization_string,
                      int security_strength);
 /**
 * Random byte generation using /dev/urandom.
 * The caller is responsible to allocate sufficient memory to hold x.
 *
 * @param[out] x Memory to hold the random bytes.
 * @param[in] xlen Number of random bytes to be generated
 * @return int 0 on success, -1 otherwise
 */
 SQISIGN_API
 int randombytes_select(unsigned char *x, unsigned long long xlen);
 /**
 * Random byte generation.
 * The caller is responsible to allocate sufficient memory to hold x.
@@ -23,6 +37,7 @@ void randombytes_init(unsigned char *entropy_input,
 * @param[in] xlen Number of random bytes to be generated
 * @return int 0 on success, -1 otherwise
 */
 SQISIGN_API
 int randombytes(unsigned char *x, unsigned long long xlen);
 #endif /* rng_h */
--- a/include/sig.h
+++ b/include/sig.h
@@ -4,7 +4,9 @@
 #define SQISIGN_H
 #include <stdint.h>
 #include <sqisign_namespace.h>
 #if defined(ENABLE_SIGN)
 /**
 * SQIsign keypair generation.
 *
@@ -15,6 +17,7 @@
 * @param[out] sk SQIsign secret key
 * @return int status code
 */
 SQISIGN_API 
 int sqisign_keypair(unsigned char *pk, unsigned char *sk);
 /**
@@ -31,16 +34,20 @@ int sqisign_keypair(unsigned char *pk, unsigned char *sk);
 * @param[in] sk Compacted secret key
 * @return int status code
 */
 SQISIGN_API 
 int sqisign_sign(unsigned char *sm,
-              unsigned long long *smlen, const unsigned char *m,
+                 unsigned long long *smlen,
-              unsigned long long mlen, const unsigned char *sk);
+                 const unsigned char *m,
                 unsigned long long mlen,
                 const unsigned char *sk);
 #endif
 /**
 * SQIsign open signature.
 *
- * The implementation performs SQIsign.verify(). If the signature verification succeeded, the original message is stored in m.
+ * The implementation performs SQIsign.verify(). If the signature verification succeeded, the
- * Keys provided is a compact public key.
+ * original message is stored in m. Keys provided is a compact public key. The caller is responsible
- * The caller is responsible to allocate sufficient memory to hold m.
+ * to allocate sufficient memory to hold m.
 *
 * @param[out] m Message stored if verification succeeds
 * @param[out] mlen Pointer to the length of m
@@ -49,10 +56,12 @@ int sqisign_sign(unsigned char *sm,
 * @param[in] pk Compacted public key
 * @return int status code
 */
 SQISIGN_API
 int sqisign_open(unsigned char *m,
-              unsigned long long *mlen, const unsigned char *sm,
+                 unsigned long long *mlen,
-              unsigned long long smlen, const unsigned char *pk);
+                 const unsigned char *sm,
-
+                 unsigned long long smlen,
                 const unsigned char *pk);
 /**
 * SQIsign verify signature.
@@ -66,8 +75,11 @@ int sqisign_open(unsigned char *m,
 * @param[in] pk Compacted public key
 * @return int 0 if verification succeeded, 1 otherwise.
 */
 SQISIGN_API 
 int sqisign_verify(const unsigned char *m,
-                unsigned long long mlen, const unsigned char *sig,
+                   unsigned long long mlen,
-                unsigned long long siglen, const unsigned char *pk);
+                   const unsigned char *sig,
                   unsigned long long siglen,
                   const unsigned char *pk);
 #endif
--- a/include/sqisign_namespace.h
+++ b/include/sqisign_namespace.h
--- a/scripts/Namespace.scala
+++ b/scripts/Namespace.scala
@@ -0,0 +1,148 @@
 import io.StdIn.readLine
 // 1. #define DISABLE_NAMESPACING in sqisign_namespace.h
 // 2. build (cmake and make)
 // 3. find . -name '*.a' -exec nm {} \; | grep '.c.o:\|T ' | scala ../scripts/Namespace.scala > sqisign_namespace.h
 // 4. cp sqisign_namespace.h $SQISIGN_DIR/include
 // Generates the contents of sqisign_namespace.h on stdout from symbol listings
 // read on stdin (see the numbered steps above for how the input is produced).
 object Namespace extends App {
  // Fixed text printed before the generated #undef/#define lines.
  val PREAMBLE = """
 #ifndef SQISIGN_NAMESPACE_H
 #define SQISIGN_NAMESPACE_H
 //#define DISABLE_NAMESPACING
 #if defined(_WIN32)
 #define SQISIGN_API __declspec(dllexport)
 #else
 #define SQISIGN_API __attribute__((visibility("default")))
 #endif
 #define PARAM_JOIN3_(a, b, c) sqisign_##a##_##b##_##c
 #define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c)
 #define PARAM_NAME3(end, s) PARAM_JOIN3(SQISIGN_VARIANT, end, s)
 #define PARAM_JOIN2_(a, b) sqisign_##a##_##b
 #define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b)
 #define PARAM_NAME2(end, s) PARAM_JOIN2(end, s)
 #ifndef DISABLE_NAMESPACING
 #define SQISIGN_NAMESPACE_GENERIC(s) PARAM_NAME2(gen, s)
 #else
 #define SQISIGN_NAMESPACE_GENERIC(s) s
 #endif
 #if defined(SQISIGN_VARIANT) && !defined(DISABLE_NAMESPACING)
 #if defined(SQISIGN_BUILD_TYPE_REF)
 #define SQISIGN_NAMESPACE(s) PARAM_NAME3(ref, s)
 #elif defined(SQISIGN_BUILD_TYPE_OPT)
 #define SQISIGN_NAMESPACE(s) PARAM_NAME3(opt, s)
 #elif defined(SQISIGN_BUILD_TYPE_BROADWELL)
 #define SQISIGN_NAMESPACE(s) PARAM_NAME3(broadwell, s)
 #elif defined(SQISIGN_BUILD_TYPE_ARM64CRYPTO)
 #define SQISIGN_NAMESPACE(s) PARAM_NAME3(arm64crypto, s)
 #else
 #error "Build type not known"
 #endif
 #else
 #define SQISIGN_NAMESPACE(s) s
 #endif
 """
  // Fixed text printed after the generated lines (closes the include guard).
  val EPILOGUE = """
 #endif
 """
  // All lines of stdin; readLine returns null at end of input.
  val x = Iterator
    .continually(readLine)
    .takeWhile(_ != null).toList
  // Most recent line containing ".c.o:", i.e. the object-file marker that the
  // following symbol lines are attributed to.
  var scfile = ""
  // (symbol, object-file marker) pairs. A line containing ".c.o:" only updates
  // scfile; for any other line the last space-separated token is the symbol,
  // with one leading underscore removed if present. Pairs are then grouped by
  // symbol; when a symbol occurs under several markers, the markers are joined
  // with ", " into a single entry.
  val allFuns: List[(String, String)] = x.flatMap {
    case i if i.contains(".c.o:") =>
      scfile = i
      None
    case i =>
      i.split(" ").last match {
        case j if j.startsWith("_") =>
          Some((j.substring(1), scfile))
        case j =>
          Some((j, scfile))
      }
  }. // removing duplicates..
  groupBy(_._1).mapValues(k => k.distinct.toList.sortBy(_._2).reduceLeft((i,j) => ((i._1, s"${i._2}, ${j._2}")))).values.toList
  // Longest symbol name; used to align the generated #define lines.
  val maxFunLen = allFuns.map(i => i._1.length).max
  // Groups whose object-file marker contains any of these names are left out
  // of the generated header entirely.
  val filterFiles = List(
    "fips202.c",
    "tools.c",
    "randombytes_system.c",
    "randombytes_ctrdrbg.c",
    "randombytes_ctrdrbg_aesni.c",
    "foo.c",
    "aes_c.c",
    "aes_ni.c",
    "ctr_drbg.c"    
  )
  // Symbols whose marker equals "<name>.o:" for one of these names are mapped
  // with SQISIGN_NAMESPACE_GENERIC instead of SQISIGN_NAMESPACE.
  val genericFiles = List(
    // quaternion module
    "intbig.c",
    "algebra.c",
    "ideal.c",
    "dim4.c",
    "dim2.c",
    "integers.c",
    "lattice.c",
    "lat_ball.c",
    "finit.c",
    "printer.c",
    "rationals.c",
    "l2.c",
    "lll_verification.c",
    "lll_applications.c",
    "rationals.c",
    "normeq.c",
    "ibz_division.c",
    "hnf_internal.c",
    "hnf.c",
    "random_input_generation.c",
    "mem.c",
    // mp module
    "mp.c"
  ).map(i => s"$i.o:")
  // Symbols grouped by object-file marker, each group de-duplicated and sorted,
  // with the filterFiles groups dropped; groups are ordered by marker.
  val groupedByFile = 
    allFuns.
    groupBy(_._2).
    map(i => (i._1, i._2.distinct.sorted)).
    filter(i => filterFiles.forall(j => !i._1.contains(j))).toList.sortBy(_._1)
  println(PREAMBLE)
  // Per group: a heading comment, one #undef per symbol, a blank line, then one
  // #define per symbol with the name padded to maxFunLen.
  groupedByFile.foreach(i => {
    println(s"// Namespacing symbols exported from ${i._1.replaceAll("\\.o:", "")}:")
    i._2.foreach(j => 
      println(s"#undef ${j._1}")
    )
    println
    i._2.foreach(j => {
      val padded = j._1.padTo(maxFunLen, " ").mkString
      if (genericFiles.contains(j._2)) {
        println(s"#define $padded SQISIGN_NAMESPACE_GENERIC(${j._1})")
      } else {
        println(s"#define $padded SQISIGN_NAMESPACE(${j._1})")
      }
    }
    )
    println
  })
  println(EPILOGUE)
 }
--- a/scripts/cformat.py
+++ b/scripts/cformat.py
@@ -1,92 +0,0 @@
 #!/usr/bin/env python3
 import sys, itertools
 from math import floor, log
 import sage.all
 class Ibz:
    """Integer rendered as a C initializer with ._mp_alloc/._mp_size/._mp_d fields."""
    def __init__(self, v):
        self.v = int(v)
    def _literal(self, sz):
        """Return the initializer text, with |v| split into little-endian limbs of sz bits."""
        number = int(self.v)
        magnitude = abs(number)
        # Zero has no significant limbs, yet one (zero) limb is still written out below.
        limb_count = -(-magnitude.bit_length() // sz)
        mask = (1 << sz) - 1
        limb_texts = []
        for idx in range(max(limb_count, 1)):
            limb_texts.append(hex((magnitude >> (sz * idx)) & mask))
        # The sign of the value is carried by the sign of the size field.
        signed_size = -limb_count if number < 0 else limb_count
        fields = [
                ('._mp_alloc', 0),
                ('._mp_size', signed_size),
                ('._mp_d', '(mp_limb_t[]) {' + ','.join(limb_texts) + '}'),
            ]
        return '{{' + ', '.join(f'{key} = {val}' for key, val in fields) + '}}'
 class Object:
    """A named C constant: element type, identifier and the Python value to serialize."""
    def __init__(self, ty, name, obj):
        # Everything before the first '[' is the element type; each '[]' pair
        # is one array dimension whose length is read off the nested value.
        base, bracket, _ = ty.partition('[')
        suffix = ''
        if bracket:
            def shape(value, remaining):
                assert remaining >= 0
                if remaining == 0:
                    return ()
                assert isinstance(value, (list, tuple))
                # Unpacking a one-element set enforces that all siblings share one shape.
                (inner,) = {shape(item, remaining - 1) for item in value}
                return (len(value), *inner)
            suffix = ''.join(f'[{n}]' for n in shape(obj, ty.count('[]')))
        self.ty = base, suffix
        self.name = name
        self.obj = obj
    def _declaration(self):
        """Return the `extern const` declaration for a header."""
        base, dims = self.ty
        return 'extern const {} {}{};'.format(base, self.name, dims)
    def _literal(self, mp_limb_t_bits):
        """Return the C initializer for the stored value; Ibz entries use the given limb width."""
        def render(value):
            if isinstance(value, int) or isinstance(value, sage.all.Integer):
                return hex(value)
            if isinstance(value, Ibz):
                return value._literal(mp_limb_t_bits)
            if isinstance(value, (list, tuple)):
                return '{' + ', '.join(render(item) for item in value) + '}'
            raise NotImplementedError(f'unknown type {type(value)} in Formatter')
        return render(self.obj)
    def _definition(self, mp_limb_t_bits):
        """Return the full `const ... = ...;` definition line."""
        base, dims = self.ty
        return f'const {base} {self.name}{dims} = {self._literal(mp_limb_t_bits)};'
 class ObjectFormatter:
    """Prints the C declarations and definitions for a collection of Object instances."""
    def __init__(self, objs):
        self.objs = objs
    def header(self, file=None):
        """Print one extern declaration per object to `file`."""
        for entry in self.objs:
            assert isinstance(entry, Object)
            print(entry._declaration(), file=file)
    def implementation(self, file=None):
        """Print every definition once per limb width inside a preprocessor chain."""
        def emit(text):
            print(text, file=file)
        # Opening with '#if 0' lets every width be written as a uniform '#elif' branch.
        emit('#if 0')
        for bits in (16, 32, 64):
            emit(f'#elif 8*DIGIT_LEN == {bits}')
            for entry in self.objs:
                assert isinstance(entry, Object)
                emit(entry._definition(bits))
        emit('#endif')
 def field(v, F=None):
    """Split every coefficient of v into little-endian 64-bit words.

    v: iterable of coefficients supporting `>>` and `&` with integers.
    F: optional field; when given, v is first coerced with F(v).
    Returns a list with one list of words per coefficient; the word count is
    derived from F.characteristic().
    """
    if F is None:
        # The default used to crash on F.characteristic(); fall back to the
        # element's own parent.
        # NOTE(review): assumes v exposes parent() like a Sage element -- confirm against callers.
        F = v.parent()
    else:
        v = F(v)
    p = F.characteristic()
    l = 1 + floor(log(p,2**64))
    vs = [[(c >> 64*i) & (2**64-1) for i in range(l)] for c in v]
    return vs
 def xonly(T, *args):
    """Return field() of the x-coordinate of T; a falsy T is rejected as the point at infinity."""
    if T:
        x_coord, _unused = T.xy()
        return field(x_coord, *args)
    raise NotImplementedError('is point at infinity')
--- a/scripts/check_namespace.sh
+++ b/scripts/check_namespace.sh
@@ -0,0 +1,36 @@
 #!/bin/bash
 # Rebuilds the namespace header from the symbols found in the static libraries
 # of a default build and a broadwell build (via scripts/Namespace.scala) and
 # diffs it against the checked-in include/sqisign_namespace.h.
 # Exit status: 0 if the two headers are identical, 1 otherwise.
 set -e
 if [ ! -f "include/sqisign_namespace.h" ]; then
  echo "Please run script from the sqisign root directory"
  exit 1
 fi
 # Turn on DISABLE_NAMESPACING for the builds below. On darwin, sed -i takes an
 # explicit (here empty) backup suffix.
 if [[ "$OSTYPE" == "darwin"* ]]; then
    sed -i '' 's|//#define DISABLE_NAMESPACING|#define DISABLE_NAMESPACING|' ./include/sqisign_namespace.h
 else
    sed -i 's|//#define DISABLE_NAMESPACING|#define DISABLE_NAMESPACING|' ./include/sqisign_namespace.h
 fi
 # `set -e` does not fire for a failing command inside an `&&` list unless it is
 # the last one, so each step is its own command. The broadwell build runs in a
 # subshell so the working directory is restored even when it fails.
 mkdir -p build_broadwell
 (cd build_broadwell && cmake -DSQISIGN_BUILD_TYPE=broadwell .. && make -j8)
 mkdir -p build
 cd build
 cmake ..
 make -j8
 find . ../build_broadwell -name '*.a' -exec nm {} \; | grep '.c.o:\|T ' | scala -nc ../scripts/Namespace.scala > sqisign_namespace.h
 # Comment the define out again (paths are now relative to build/).
 if [[ "$OSTYPE" == "darwin"* ]]; then
    sed -i '' 's|#define DISABLE_NAMESPACING|//#define DISABLE_NAMESPACING|' ../include/sqisign_namespace.h
 else
    sed -i 's|#define DISABLE_NAMESPACING|//#define DISABLE_NAMESPACING|' ../include/sqisign_namespace.h
 fi
 # diff is the `if` condition itself: run as a plain command under `set -e`, a
 # difference would abort the script before the message below could be printed.
 if diff sqisign_namespace.h ../include/sqisign_namespace.h; then
  echo "No change in namespace."
  exit 0
 else
  echo "Namespace changed, please update."
  exit 1
 fi
--- a/scripts/gen_kat_files.sh
+++ b/scripts/gen_kat_files.sh
@@ -0,0 +1,16 @@
 #!/bin/bash
 # Runs the KAT generators for levels 1, 3 and 5 and moves the resulting
 # .req/.rsp files into ./KAT. All paths are relative, so run it from the
 # directory that contains build/ and KAT/.
 # Stop at the first failing command: without this a failed generator was
 # ignored and the exit status only reflected the final mv.
 set -e
 echo 'Running Script for Level 1...'
 ./build/apps/PQCgenKAT_sign_lvl1
 mv PQCsignKAT_353_SQIsign_lvl1.req ./KAT/PQCsignKAT_353_SQIsign_lvl1.req
 mv PQCsignKAT_353_SQIsign_lvl1.rsp ./KAT/PQCsignKAT_353_SQIsign_lvl1.rsp
 echo 'Running Script for Level 3...'
 ./build/apps/PQCgenKAT_sign_lvl3
 mv PQCsignKAT_529_SQIsign_lvl3.req ./KAT/PQCsignKAT_529_SQIsign_lvl3.req
 mv PQCsignKAT_529_SQIsign_lvl3.rsp ./KAT/PQCsignKAT_529_SQIsign_lvl3.rsp
 echo 'Running Script for Level 5...'
 ./build/apps/PQCgenKAT_sign_lvl5
 mv PQCsignKAT_701_SQIsign_lvl5.req ./KAT/PQCsignKAT_701_SQIsign_lvl5.req
 mv PQCsignKAT_701_SQIsign_lvl5.rsp ./KAT/PQCsignKAT_701_SQIsign_lvl5.rsp
--- a/scripts/gen_pqm4_sources.sh
+++ b/scripts/gen_pqm4_sources.sh
@@ -0,0 +1,60 @@
 #!/bin/bash
 # Run from the root folder of the repository. For each level in {1,3,5} this
 # creates "src/pqm4/sqisign_lvl<level>/ref", writes a config.mk there and copies
 # in the reference sources and headers listed below.
 # Refuses to run when src/pqm4 already exists, so a stale tree is never mixed
 # with fresh files.
 if [ -d "src/pqm4" ]; then
    echo Destination folder src/pqm4 already exists. Delete it before running this script. Aborting.
    exit 1
 fi
 for LEVEL in 1 3 5
 do
    LVL=lvl${LEVEL}
    DST_PATH=src/pqm4/sqisign_${LVL}/ref
    PQCGENKAT_SIGN_PQM4_BINARY=build/apps/PQCgenKAT_sign_pqm4_${LVL}
    # The per-level binary must have been built beforehand (build/ is hard-coded).
    if [ ! -f ${PQCGENKAT_SIGN_PQM4_BINARY} ]; then
        echo ${PQCGENKAT_SIGN_PQM4_BINARY} not found. Build it before running this script, or change the build folder in the script. Aborting.
        exit 1
    fi
    mkdir -p ${DST_PATH}
    # Run the per-level PQCgenKAT pqm4 binary (originally described as the "API
    # generation script"). NOTE(review): its outputs are not visible here -- confirm
    # what it writes and where.
    ${PQCGENKAT_SIGN_PQM4_BINARY}
    # Preprocessor flags written into config.mk for both the elf and the library target.
    CPPFLAGS="-DRADIX_32 -DSQISIGN_BUILD_TYPE_REF -DSQISIGN_GF_IMPL_REF -DSQISIGN_VARIANT=${LVL} -DTARGET_ARM -DTARGET_OS_OTHER -DNDEBUG -DDISABLE_NAMESPACING -DBIG_PUBLIC_KEY_TESTS"
    PQM4_NAME="crypto_sign_sqisign_${LVL}_ref"
    # First line truncates/creates config.mk, second line appends to it.
    echo "elf/${PQM4_NAME}_%.elf: CPPFLAGS+=${CPPFLAGS}" > ${DST_PATH}/config.mk
    echo "obj/lib${PQM4_NAME}.a: CPPFLAGS+=${CPPFLAGS}" >> ${DST_PATH}/config.mk
    # Flatten the sources and headers into the single destination directory.
    cp include/{sig,sqisign_namespace}.h ${DST_PATH}/
    cp src/sqisign.c ${DST_PATH}/
    cp src/common/generic/include/{tools,tutil}.h ${DST_PATH}/
    cp src/ec/ref/lvlx/{basis,ec_jac,ec,isog_chains,xeval,xisog}.c ${DST_PATH}/
    cp src/ec/ref/include/{ec,isog}.h ${DST_PATH}/
    cp src/gf/ref/lvlx/{fp,fp2}.c ${DST_PATH}/
    cp src/gf/ref/include/{fp,fp2}.h ${DST_PATH}/
    cp src/hd/ref/lvlx/{hd.c,theta_isogenies.c,theta_isogenies.h,theta_structure.c,theta_structure.h} ${DST_PATH}/
    cp src/hd/ref/include/hd.h ${DST_PATH}/
    cp src/mp/ref/generic/mp.c ${DST_PATH}/
    cp src/mp/ref/generic/include/mp.h ${DST_PATH}/
    cp src/precomp/ref/${LVL}/include/{e0_basis,ec_params,encoded_sizes,fp_constants,hd_splitting_transforms}.h ${DST_PATH}/
    cp src/precomp/ref/${LVL}/{e0_basis,ec_params,hd_splitting_transforms}.c ${DST_PATH}/
    cp src/verification/ref/lvlx/{common,encode_verification,verify}.c ${DST_PATH}/
    cp src/verification/ref/include/verification.h ${DST_PATH}/
 done
 # The 32-bit field arithmetic file has a different name per level, so it is
 # copied outside the loop.
 cp src/gf/ref/lvl1/fp_p5248_32.c src/pqm4/sqisign_lvl1/ref/
 cp src/gf/ref/lvl3/fp_p65376_32.c src/pqm4/sqisign_lvl3/ref/
 cp src/gf/ref/lvl5/fp_p27500_32.c src/pqm4/sqisign_lvl5/ref/
--- a/scripts/parameters.py
+++ b/scripts/parameters.py
@@ -1,31 +0,0 @@
 #!/usr/bin/env python3
 from sage.all import *
 proof.all(False)  # faster
 import re
 # Read lvl, p and B from lines of the form "<name> = <value>", where the value
 # consists of the characters x, 0-9, a-f; ZZ(..., 0) picks the base from the
 # prefix. The file is opened in a `with` block so the handle is closed
 # deterministically instead of being left to the garbage collector.
 with open('sqisign_parameters.txt') as params_file:
    for l in params_file:
        for k in ('lvl', 'p', 'B'):
            m = re.search(rf'^\s*{k}\s*=\s*([x0-9a-f]+)', l)
            if m:
                v = ZZ(m.groups()[0], 0)
                globals()[k] = v
 # Primes up to B dividing p^2 - 1, with 2 removed.
 L = {l for l,_ in (p**2 - 1).factor(limit=B+5) if l <= B}
 assert 2 in L
 L.remove(2)
 # f: exponent of 2 in p+1.
 f = (p+1).valuation(2)
 if (p-1).valuation(2) > f:
    raise NotImplementedError('2-power torsion is on twist')
 # Each odd prime goes to the side (p+1 or p-1) where its valuation is larger; ties go to p+1.
 Lpls = {l for l in L if (p+1).valuation(l) >= (p-1).valuation(l)}
 Lmin = L - Lpls
 Lpls, Lmin = map(sorted, (Lpls, Lmin))
 Epls = [(p+1).valuation(l) for l in Lpls]
 Emin = [(p-1).valuation(l) for l in Lmin]
 Tpls = prod(l**e for l,e in zip(Lpls,Epls))
 Tmin = prod(l**e for l,e in zip(Lmin,Emin))
 # Dcom: part of Tpls*Tmin coprime to 6; Dchall: the 2- and 3-power parts of p+1.
 Dcom = (Tpls*Tmin).prime_to_m_part(2*3)
 Dchall = prod(l**(p+1).valuation(l) for l in (2,3))
 __all__ = ['lvl', 'p', 'B', 'f', 'Tpls', 'Tmin', 'Dcom', 'Dchall']
--- a/scripts/precomp/cformat.py
+++ b/scripts/precomp/cformat.py
@@ -0,0 +1,128 @@
 #!/usr/bin/env python3
 import sys, itertools
 from math import ceil, floor, log
 import sage.all
 class Ibz:
    """Integer rendered as a C initializer with ._mp_alloc/._mp_size/._mp_d fields."""
    def __init__(self, v):
        self.v = int(v)
    def _literal(self, sz):
        """Return the initializer text, with |v| split into little-endian limbs of sz bits."""
        number = int(self.v)
        magnitude = abs(number)
        # Zero has no significant limbs, yet one (zero) limb is still written out below.
        limb_count = -(-magnitude.bit_length() // sz)
        mask = (1 << sz) - 1
        limb_texts = []
        for idx in range(max(limb_count, 1)):
            limb_texts.append(hex((magnitude >> (sz * idx)) & mask))
        # The sign of the value is carried by the sign of the size field.
        signed_size = -limb_count if number < 0 else limb_count
        fields = [
                ('._mp_alloc', 0),
                ('._mp_size', signed_size),
                ('._mp_d', '(mp_limb_t[]) {' + ','.join(limb_texts) + '}'),
            ]
        return '{{' + ', '.join(f'{key} = {val}' for key, val in fields) + '}}'
 class FpEl:
    """Integer n modulo p, rendered as a C array initializer of fixed-radix limbs.

    The class-level maps give, for each supported prime of the "ref"
    arithmetic, the limb radix (in bits) used for a given machine word size.
    With montgomery=True the value is multiplied by R = 2^(radix * limbs)
    modulo p before being split into limbs.
    """
    ref_p5248_radix_map  = { 16: 13, 32: 29, 64: 51 }
    ref_p65376_radix_map = { 16: 13, 32: 28, 64: 55 }
    ref_p27500_radix_map = { 16: 13, 32: 29, 64: 57 }
    def __init__(self, n, p, montgomery=True):
        self.n = n
        self.p = p
        self.montgomery = montgomery
    def __get_radix(self, word_size, arith=None):
        """Return the limb radix for word_size; raises ValueError for an unknown prime or arith."""
        if arith == "ref" or arith is None:
            # lvl1
            if self.p == 0x4ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff:
                return self.ref_p5248_radix_map[word_size]
            # lvl3
            elif self.p == 0x40ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff:
                return self.ref_p65376_radix_map[word_size]
            # lvl5
            elif self.p == 0x1afffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff:
                return self.ref_p27500_radix_map[word_size]
            raise ValueError(f'Invalid prime \"{self.p}\"')
        elif arith == "broadwell":
            # Full-word limbs: the radix equals the word size.
            return word_size
        raise ValueError(f'Invalid arithmetic implementation type \"{arith}\"')
    def _literal(self, sz, arith=None):
        """Return '{limb0, limb1, ...}' (hex, little-endian) for word size sz and arithmetic arith."""
        radix = self.__get_radix(sz, arith=arith)
        # Number of radix-bit limbs needed to hold p, computed exactly from the
        # bit length. The previous floating-point log() could be off by one
        # for primes within rounding distance of a power of 2^radix.
        l = (int(self.p).bit_length() + radix - 1) // radix
        # If we're using Montgomery representation, we need to multiply
        # by the Montgomery factor R = 2^nw (n = limb number, w = radix)
        if self.montgomery:
            R = 2**(radix * l)
        else:
            R = 1
        el = (self.n * R) % self.p
        vs = [(int(el) >> radix*i) % 2**radix for i in range(l)]
        return '{' + ', '.join(map(hex, vs)) + '}'
 class Object:
    """A named C constant: element type, identifier and the Python value to serialize."""
    def __init__(self, ty, name, obj):
        # Everything before the first '[' is the element type; each '[]' pair
        # is one array dimension whose length is read off the nested value.
        base, bracket, _ = ty.partition('[')
        suffix = ''
        if bracket:
            def shape(value, remaining):
                assert remaining >= 0
                if remaining == 0:
                    return ()
                assert isinstance(value, (list, tuple))
                # Unpacking a one-element set enforces that all siblings share one shape.
                (inner,) = {shape(item, remaining - 1) for item in value}
                return (len(value), *inner)
            suffix = ''.join(f'[{n}]' for n in shape(obj, ty.count('[]')))
        self.ty = base, suffix
        self.name = name
        self.obj = obj
    def _declaration(self):
        """Return the `extern const` declaration for a header."""
        base, dims = self.ty
        return 'extern const {} {}{};'.format(base, self.name, dims)
    def _literal(self):
        """Return the C initializer text for the stored value."""
        def small_or_hex(number):
            # Values below 256 are written in decimal, larger ones in hex.
            return str(number) if number < 256 else hex(number)
        def render(value):
            if isinstance(value, int):
                return small_or_hex(value)
            if isinstance(value, sage.all.Integer):
                return small_or_hex(value)
            if isinstance(value, Ibz):
                # One initializer per limb width, selected by the preprocessor.
                pieces = ["", "#if 0"]
                for bits in (16, 32, 64):
                    pieces.append(f"#elif GMP_LIMB_BITS == {bits}")
                    pieces.append(value._literal(bits))
                pieces.extend(["#endif", ""])
                return "\n".join(pieces)
            if isinstance(value, FpEl):
                # One initializer per RADIX; only RADIX 64 has a separate broadwell variant.
                pieces = ["", "#if 0"]
                for bits in (16, 32, 64):
                    pieces.append(f"#elif RADIX == {bits}")
                    if bits != 64:
                        pieces.append(value._literal(bits, 'ref'))
                        continue
                    pieces.append("#if defined(SQISIGN_GF_IMPL_BROADWELL)")
                    pieces.append(value._literal(bits, 'broadwell'))
                    pieces.append("#else")
                    pieces.append(value._literal(bits, 'ref'))
                    pieces.append("#endif")
                pieces.extend(["#endif", ""])
                return "\n".join(pieces)
            if isinstance(value, (list, tuple)):
                return '{' + ', '.join(render(item) for item in value) + '}'
            if isinstance(value, str):
                # Strings are emitted verbatim.
                return value
            raise NotImplementedError(f'unknown type {type(value)} in Formatter')
        return render(self.obj)
    def _definition(self):
        """Return the full `const ... = ...;` definition."""
        base, dims = self.ty
        return f'const {base} {self.name}{dims} = {self._literal()};'
 class ObjectFormatter:
    """Prints the C declarations and definitions for a collection of Object instances."""
    def __init__(self, objs):
        self.objs = objs
    def _checked(self):
        """Yield the stored objects, asserting each one is an Object right before it is used."""
        for entry in self.objs:
            assert isinstance(entry, Object)
            yield entry
    def header(self, file=None):
        """Print one extern declaration per object to `file`."""
        for entry in self._checked():
            print(entry._declaration(), file=file)
    def implementation(self, file=None):
        """Print one definition per object to `file`."""
        for entry in self._checked():
            print(entry._definition(), file=file)
--- a/scripts/precomp/ec_params.sage
+++ b/scripts/precomp/ec_params.sage
@@ -0,0 +1,41 @@
 #!/usr/bin/env python3
 from sage.all import *
 from parameters import p, f
 if __name__ == '__main__':
    from cformat import Object, ObjectFormatter
    # p+1 with its power-of-two part 2**f divided out.
    cofactor = (p+1)//(2**f)
    formatter = ObjectFormatter([Object('digit_t[]', 'p_cofactor_for_2f', [cofactor])])
    description = '// p+1 divided by the power of 2\n'
    with open("include/ec_params.h", "w") as hfile, open("ec_params.c", "w") as cfile:
        # Header: include guard, TORSION_EVEN_POWER, the extern declaration and its bit length.
        hfile.writelines([
            '#ifndef EC_PARAMS_H\n',
            '#define EC_PARAMS_H\n',
            '\n',
            '#include <fp.h>\n',
            '\n',
            f'#define TORSION_EVEN_POWER {f}\n',
            '\n',
            description,
        ])
        formatter.header(file=hfile)
        hfile.writelines([
            f'#define P_COFACTOR_FOR_2F_BITLENGTH {cofactor.bit_length()}\n',
            '\n',
            '#endif\n',
        ])
        # Source file: include the header, then the definition of the cofactor.
        cfile.write('#include <ec_params.h>\n')
        cfile.write(description)
        formatter.implementation(file=cfile)
        cfile.write('\n')
--- a/scripts/precomp/maxorders.py
+++ b/scripts/precomp/maxorders.py
@@ -0,0 +1,88 @@
 # Builds the module-level list `orders`: one tuple per maximal order in the
 # quaternion algebra (-1,-p), starting with O0 (q = 1) and extended with one
 # entry per accepted prime q until `num` entries exist. The layout of each
 # tuple is described next to the final append at the bottom of the loop.
 from sage.all import *
 from sage.misc.banner import require_version
 # Abort unless require_version(10, 5) succeeds.
 if not require_version(10, 5, print_message=True):
    exit('')
 from parameters import p, num_orders as num
 ################################################################
 # Underlying theory:
 # - Ibukiyama, On maximal orders of division quaternion algebras with certain optimal embeddings
 # - https://ia.cr/2023/106 Lemma 10
 from sage.algebras.quatalg.quaternion_algebra import basis_for_quaternion_lattice
 # NOTE(review): bfql is defined here but not referenced anywhere else in this script.
 bfql = lambda els: basis_for_quaternion_lattice(els, reverse=True)
 Quat1, (i,j,k) = QuaternionAlgebra(-1, -p).objgens()
 assert Quat1.discriminant() == p         # ramifies correctly
 # Rows are the coefficient vectors of the basis 1, i, (i+j)/2, (1+k)/2 of O0.
 O0mat = matrix([list(g) for g in [Quat1(1), i, (i+j)/2, (1+k)/2]])
 O0 = Quat1.quaternion_order(list(O0mat))
 # First entry (q = 1): identity isomorphism, O0 itself, element i, and O0mat
 # reused in the ideal slot.
 orders = [ (1, identity_matrix(QQ,4), O0mat, i, O0mat, vector((1,0,0,0))) ]
 q = ZZ(1)
 while len(orders) < num:
    q = next_prime(q)
    if q % 4 != 1:  # restricting to q ≡ 1 (mod 4)
        continue
    Quatq, (ii,jj,kk) = QuaternionAlgebra(-q, -p).objgens()
    if Quatq.discriminant() != p:       # ramifies incorrectly
        continue
    # Solve x^2 + p*y^2 = q over QQ, so that gamma = x + j*y has reduced norm q
    # (checked by the assert below).
    x, y = QuadraticForm(QQ, 2, [1,0,p]).solve(q)
    gamma = x + j*y
    assert gamma.reduced_norm() == q
    # Images in Quat1 of the generators 1, ii, jj, kk of Quatq; the asserts
    # verify the defining relations.
    ims1 = [Quat1(1), i*gamma, j, k*gamma]
    assert ims1[1]**2 == -q
    assert ims1[2]**2 == -p
    assert ims1[1]*ims1[2] == ims1[3]
    assert ims1[2]*ims1[1] == -ims1[3]
    # (1,ii,jj,kk)->ims1 is an isomorphism Quatq->Quat1
    iso1q = ~matrix(map(list, ims1))
    # Smallest integer representative among the square roots of -p modulo 4q.
    r = min(map(ZZ, Mod(-p, 4*q).sqrt(all=True)))
    basq = [
            Quatq(1),
            ii,
            (1 + jj) / 2,
            (r + jj) * ii / 2 / q,
        ]
    Oq = Quatq.quaternion_order(basq)
    assert Oq.discriminant() == p   # is maximal
    # Transport the basis of Oq into Quat1 coordinates (~iso1q is the matrix of ims1).
    mat1 = matrix(map(list, basq)) * ~iso1q
    O1 = Quat1.quaternion_order(list(mat1))
    assert O1.discriminant() == p   # is maximal
    assert j in O1                  # p-extremal
    # look for an odd connecting ideal
    I = O0 * O1
    # Scale by the denominator of the norm; integrality is asserted right after.
    I *= I.norm().denominator()
    assert I.is_integral()
    # Enumerate short vectors of I until one has norm/norm(I) odd.
    for v in IntegralLattice(I.gram_matrix()).enumerate_short_vectors():
        elt = sum(c*g for c,g in zip(v,I.basis()))
        if ZZ(elt.reduced_norm() / I.norm()) % 2:
            break
    # Replace I by I * conj(elt)/norm(I); the asserts check the result is
    # integral, of odd norm, and still has left order O0.
    I = I * (elt.conjugate() / I.norm())
    assert I.is_integral()
    assert I.norm() % 2
    assert I.left_order() == O0
    O1_ = I.right_order()
    assert O1_.unit_ideal() == elt * O1 * ~elt
    idl1 = matrix(map(list, I.basis()))
    # q
    # isomorphism from (-1,-p) algebra to (-q,-p) algebra
    # basis of maximal order O₁ in (-1,-p) algebra
    # element sqrt(-q) in O₁ in (-1,-p) algebra
    # basis of connecting ideal I from O₀ in (-1,-p) algebra
    # element γ such that I has right order γ O₁ γ^-1
    orders.append((q, iso1q, mat1, ims1[1], idl1, vector(elt)))
--- a/scripts/precomp/parameters.py
+++ b/scripts/precomp/parameters.py
@@ -0,0 +1,16 @@
 #!/usr/bin/env python3
 from sage.all import *
 proof.all(False)  # faster
 import re
 # Read lvl, p and num_orders from lines of the form "<name> = <value>", where
 # the value consists of the characters x, 0-9, a-f; ZZ(..., 0) picks the base
 # from the prefix. The file is opened in a `with` block so the handle is
 # closed deterministically instead of being left to the garbage collector.
 with open('sqisign_parameters.txt') as params_file:
    for l in params_file:
        for k in ('lvl', 'p', 'num_orders'):
            m = re.search(rf'^\s*{k}\s*=\s*([x0-9a-f]+)', l)
            if m:
                v = ZZ(m.groups()[0], 0)
                globals()[k] = v
 # f: exponent of 2 in p+1.
 f = (p+1).valuation(2)
 __all__ = ['lvl', 'p', 'f', 'num_orders']
--- a/scripts/precomp/precompute_E0_basis.sage
+++ b/scripts/precomp/precompute_E0_basis.sage
@@ -0,0 +1,38 @@
 #!/usr/bin/env sage
 # Writes include/e0_basis.h and e0_basis.c with the x-coordinates of the basis
 # (P, Q) returned by even_torsion_basis_E0 for E0 over Fp2.
 proof.all(False)  # faster
 ################################################################
 from parameters import p, f
 if p % 4 != 3:
    raise NotImplementedError('requires p ≡ 3 (mod 4)')
 assert (1 << f).divides(p + 1)
 # Fp2 = Fp[i] with i^2 + 1 = 0.
 Fp2.<i> = GF((p,2), modulus=[1,0,1])
 E0 = EllipticCurve(Fp2, [1, 0])
 from torsion_basis import even_torsion_basis_E0
 P, Q = even_torsion_basis_E0(E0, f)
 ################################################################
 from cformat import FpEl, Object, ObjectFormatter
 def Fp2_to_list(el):
    # One FpEl (Montgomery flag set) per coefficient of el coerced into Fp2.
    coefficients = []
    for coeff in Fp2(el):
        coefficients.append(FpEl(int(coeff), p, True))
    return coefficients
 basis_constants = [
        Object('fp2_t', name, Fp2_to_list(point.x()))
        for name, point in (('BASIS_E0_PX', P), ('BASIS_E0_QX', Q))
    ]
 objs = ObjectFormatter(basis_constants)
 ################################################################
 with open('include/e0_basis.h','w') as hfile, open('e0_basis.c','w') as cfile:
    # Header: include followed by the extern declarations.
    print('#include <fp2.h>', file=hfile)
    objs.header(file=hfile)
    # Source: include of the generated header followed by the definitions.
    print('#include <e0_basis.h>', file=cfile)
    objs.implementation(file=cfile)
--- a/scripts/precomp/precompute_endomorphism_action.sage
+++ b/scripts/precomp/precompute_endomorphism_action.sage
@@ -0,0 +1,303 @@
 #!/usr/bin/env sage
 proof.all(False)  # faster
 from sage.misc.banner import require_version
 if not require_version(10, 0, print_message=True):
    exit('')
 ################################################################
 from parameters import p, f
 from torsion_basis import even_torsion_basis_E0
 ################################################################
 from sage.groups.generic import order_from_multiple
 # Raise the PARI memory limit to 2^34 bytes before the computations start.
 pari.allocatemem(1 << 34)  # 16G
 if p % 4 != 3:
    raise NotImplementedError('requires p ≡ 3 (mod 4)')
 # 2^f must divide p+1.
 assert (1 << f).divides(p + 1)
 # Fp2 = Fp[i] with i^2 + 1 = 0.
 Fp2.<i> = GF((p,2), modulus=[1,0,1])
 # Fixed choice of square root of -1: the minimum of all roots, so every run picks the same one.
 sqrtm1 = min(Fp2(-1).sqrt(all=True))
 def compute(q, mat, idl, iso1q):
    # Precompute, for the order attached to q, a Montgomery curve E1 with a
    # basis (P1, Q1) of order 2^f and the matrices of endomorphisms acting on it.
    #
    # q:     1 for the base curve E0 itself, otherwise the prime of the order.
    # mat:   matrix whose rows (scaled by its denominator) are combined with
    #        (1, i, j, k) below to form the generators gen1..gen4.
    # idl:   basis of the connecting ideal; only used when q != 1.
    # iso1q: not referenced in this function body.
    #
    # Reads the module-level `orders`, `data` (entry 0 must already be filled
    # in when q != 1), `sqrtm1`, `Fp2`, `p` and `f`.
    # Returns ((A, (A+2)/4), (P1, Q1), (mati, matj, matk), (mat2, mat3, mat4)).
    print(f'\x1b[33m{q = }\x1b[0m')
    # E0: y^2 = x^3 + x over Fp2, with its order declared as (p+1)^2.
    E0 = EllipticCurve(Fp2, [1,0])
    E0.set_order((p+1)^2)
    if q == 1:
        E1 = E0
        P1, Q1 = even_torsion_basis_E0(E1, f)
        print(f'E0 = {E1}')
        print(f'P0 = {P1}')
        print(f'Q0 = {Q1}')
    else:
        # Rebuild the ideal and check it against O0 before translating it to an isogeny.
        Quat.<i,j,k> = QuaternionAlgebra(-1, -p)
        I = Quat.ideal(map(Quat, idl))
 #        print(f'{I = }')
        O0 = Quat.quaternion_order(list(map(Quat, orders[0][2])))
 #        print(f'{O0 = }')
        O1 = I.right_order()
 #        print(f'{O1 = }')
        assert I.left_order() == O0
        assert O0.is_maximal() and O1.is_maximal()
        assert I.norm() % 2
        from deuring2d import Deuring2D
        ctx = Deuring2D(p)
        assert ctx.O0.order == O0
        assert ctx.E0 == E0
        ctx.sqrtm1 = sqrtm1
        # Basis computed for q == 1 (second component of data[0]).
        P0, Q0 = data[0][1]
        # Try extension degrees 2^deg for deg = 1..9 until IdealToIsogeny
        # succeeds. The for/else pairs implement "break out of both loops on
        # success, otherwise move on to the next degree".
        for deg in range(1,10):
            print(f'trying {deg = }...')
            ctx.e = E0.cardinality(extension_degree=2^deg).sqrt().valuation(2) - 1
            first = True
            for suitable in ctx.SuitableIdeals(I, attempts=10**6, bound=10**3):
                if first:
                    # Set up the extension field and lift the basis only once
                    # per degree, and only if at least one suitable ideal exists.
                    Fbig.<U> = Fp2.extension(2^deg)
                    ctx.E0 = E0.change_ring(Fbig)
                    ctx.P = P0.change_ring(Fbig)
                    ctx.Q = Q0.change_ring(Fbig)
                    assert ctx.e == ctx.E0.order().sqrt().valuation(2) - 1
                    # Halve the points ctx.e - f times.
                    for _ in range(ctx.e - f):
                        ctx.P = ctx.P.division_points(2)[0]
                        ctx.Q = ctx.Q.division_points(2)[0]
                    ctx.P.set_order(multiple=2^ctx.e)
                    ctx.Q.set_order(multiple=2^ctx.e)
                first = False
                try:
                    E1, P1, Q1 = ctx.IdealToIsogeny(I, suitable=suitable)
                    break
                except Deuring2D.Failure:
                    continue
            else:
                continue
            break
        else:
            raise NotImplementedError('Deuring2D failed')
        E1 = E1.change_ring(Fp2)
        # Note: from here on j is the j-invariant (in GF(p)), shadowing the
        # quaternion generator j bound above.
        j = GF(p)(E1.j_invariant())
        X = polygen(GF(p))
        # Candidates A in GF(p) satisfying 256*(A^2-3)^3 = (A^2-4)*j; take the
        # first (in sorted order) whose curve y^2 = x^3 + A*x^2 + x is
        # isomorphic to E1.
        for A,_ in sorted((256*(X^2-3)^3 - (X^2-4)*j).roots()):
            E1_ = EllipticCurve(Fp2, [0,A,0,1,0])
            try:
                iso = min(E1.isomorphisms(E1_))
                break
            except ValueError:
                pass
        E1 = iso.codomain()
        P1 = iso._eval(P1)
        Q1 = iso._eval(Q1)
        print(f'{E1 = }')
        # Scale the image points by the ratio of the lifted order to the order of P0/Q0.
        P1 *= ctx.P.order() // P0.order()
        Q1 *= ctx.Q.order() // Q0.order()
        P1 = P1.change_ring(Fp2)
        Q1 = Q1.change_ring(Fp2)
        print(f'{P1 = }')
        print(f'{Q1 = }')
        P1.set_order(P0.order())
        Q1.set_order(Q0.order())
        assert P0.order() == Q0.order() == P1.order() == Q1.order() == 2^f
        assert P1.weil_pairing(Q1,2^f) == P0.weil_pairing(Q0,2^f)^I.norm()
    if q == 1:
        # The unique automorphism whose scaling factor is the fixed sqrt(-1).
        endo_i, = (a for a in E1.automorphisms() if a.scaling_factor() == sqrtm1)
    else:
        # Degree-q isogeny from E1 composed with an isomorphism back onto E1.
        iso = E1.isomorphism(min(Fp2(-q).sqrt(all=True)), is_codomain=True)
        try:
            endo_i = iso * E1.isogeny(None, codomain=iso.domain(), degree=q)
        except ValueError:
            assert False
 #    assert endo_i^2 == -q
    endo_1 = E1.scalar_multiplication(1)
    endo_j = E1.frobenius_isogeny()
    endo_k = endo_i * endo_j
    if __debug__:
        # Spot-check the quaternion relations on a random point.
        R = E1.random_point()
        assert (endo_i^2)(R) == -q*R
        assert (endo_j^2)(R) == -p*R
        assert (endo_j*endo_i)(R) == -(endo_i*endo_j)(R)
    # Split the denominator of mat: the part coprime to the point orders is
    # inverted modulo those orders; the remaining part is handled by dividing
    # the points, over an extension field if necessary.
    denom = mat.denominator()
    coprime = denom.prime_to_m_part(lcm(P1.order(), Q1.order()))
    P1d, Q1d = (inverse_mod(coprime, T.order()) * T for T in (P1, Q1))
    denom //= coprime
    # Smallest extension degree over which (denom * 2^f)^2 divides the group order.
    extdeg = next(d for d in range(1,denom+1) if ((denom<<f)^2).divides(E1.order(extension_degree=d)))
    if extdeg == 1:
        Fbig = Fp2
    else:
        Fbig.<U> = Fp2.extension(extdeg)
    P1d, Q1d = (T.change_ring(Fbig) for T in (P1d, Q1d))
    P1d.set_order(multiple=denom<<f)
    for l,m in denom.factor():
        for i in range(m):
            assert l.divides(P1d.order())
            P1d = P1d.division_points(l)[0]
            P1d.set_order(multiple=denom<<f)
            # Pick a division point of Q1d whose Weil pairing with P1d has
            # the same order as P1d.
            for Q1d_ in Q1d.division_points(l):
                o = order_from_multiple(P1d.weil_pairing(Q1d_, P1d.order()), denom<<f, operation='*')
                if o == P1d.order():
                    Q1d = Q1d_
                    break
            else:
                assert False
    assert hasattr(P1d, '_order')
    Q1d.set_order(multiple=denom<<f)
    denom *= coprime
    PQ1d = P1d, Q1d
 #    mat1 = matrix(Zmod(1<<f), [endo_1._eval(T).log(PQ1d) for T in PQ1d])
 #    assert mat1 == 1            # identity; omit
    # Row T of each matrix holds the coordinates of endo(T) with respect to (P1d, Q1d), mod 2^f.
    mati = matrix(Zmod(1<<f), [endo_i._eval(T).log(PQ1d) for T in PQ1d])
    matj = matrix(Zmod(1<<f), [endo_j._eval(T).log(PQ1d) for T in PQ1d])
 #    matk = matrix(Zmod(1<<f), [endo_k._eval(T).log(PQ1d) for T in PQ1d])
 #    assert matk == matj * mati  # redundant; omit
    matk = matj * mati
    # Integral combinations denom * (row of mat) of endo_1, endo_i, endo_j, endo_k.
    gens = []
    for row in denom * mat:
        endo = sum(ZZ(c)*e for c,e in zip(row, (endo_1,endo_i,endo_j,endo_k)))
        gens.append(endo)
    gen1, gen2, gen3, gen4 = gens
    assert mat[0] == vector((1,0,0,0))
 #    mat1 = matrix(ZZ, [gen1._eval(T).log(PQ1d) for T in PQ1d]) / denom
 #    assert mat1 == 1            # identity; omit
    # Evaluate on the divided basis, divide the integer coordinates by denom, then reduce mod 2^f.
    mat2 = matrix(ZZ, [gen2._eval(T).log(PQ1d) for T in PQ1d]) / denom
    mat3 = matrix(ZZ, [gen3._eval(T).log(PQ1d) for T in PQ1d]) / denom
    mat4 = matrix(ZZ, [gen4._eval(T).log(PQ1d) for T in PQ1d]) / denom
    mat2, mat3, mat4 = (M.change_ring(Zmod(1<<f)) for M in (mat2,mat3,mat4))
    A = E1.a2()
    # E1 must be in the form y^2 = x^3 + A*x^2 + x.
    assert E1.a_invariants() == (0,A,0,1,0)
    return (A, (A+2)/4), (P1, Q1), (mati,matj,matk), (mat2,mat3,mat4)
 ################################################################
 # Driver: run compute() for every entry of `orders` and collect the results in
 # `data`, keeping the same ordering as `orders`.
 from maxorders import orders
 print('qs:', [q for q,_,_,_,_,_ in orders])
 # Argument tuples for compute(); the order basis is multiplied by iso1q here.
 todo = [(q, mat*iso1q, idl, iso1q) for q,iso1q,mat,_,idl,_ in orders]
 data = [None] * len(todo)
 assert todo[0][0] == 1
 data[0] = compute(*todo[0])  # compute this first; we need it for the others
 print(f'[\x1b[32m+\x1b[0m] finished precomputation for \x1b[36mq = {todo[0][0]}\x1b[0m.')
 # Disabled sequential variant of the parallel loop below.
 ####XXX
 ##for idx,inp in enumerate(todo[1:],1):
 ##    data[idx] = compute(*inp)
 ##    print(f'[\x1b[32m+\x1b[0m] finished precomputation for \x1b[36mq = {inp[0]}\x1b[0m.')
 ##todo = []
 ####XXX
 # Results may arrive in any order, so each one is matched back to its slot via q.
 for (inp,_),res in parallel(8)(compute)(todo[1:]):
    q,_,_,_ = inp
    idx, = (i for i,(qq,_,_,_) in enumerate(todo) if qq == q)
    assert data[idx] is None
    data[idx] = res
    print(f'[\x1b[32m+\x1b[0m] finished precomputation for \x1b[36m{q = }\x1b[0m.')
 ################################################################
 from cformat import FpEl, Ibz, Object, ObjectFormatter
 def Fp2_to_list(el):
    # One FpEl (Montgomery flag set) per coefficient of el coerced into Fp2.
    coefficients = []
    for coeff in Fp2(el):
        coefficients.append(FpEl(int(coeff), p, True))
    return coefficients
 def basis2field(P, Q):
    # For each of P, Q and P-Q keep coordinates 0 and 2, each converted with Fp2_to_list.
    # NOTE(review): presumably the projective X and Z coordinates -- confirm.
    pairs = []
    for point in (P, Q, P-Q):
        first = Fp2_to_list(point[0])
        third = Fp2_to_list(point[2])
        pairs.append([first, third])
    return pairs
 ################################################################
 # One array entry per element of `data`; the nesting mirrors the fields of
 # curve_with_endomorphism_ring_t declared in the header text below. Every
 # matrix is transposed before being serialized.
 objs = ObjectFormatter([
        Object('curve_with_endomorphism_ring_t[]', 'CURVES_WITH_ENDOMORPHISMS',
            [
                [
                    [Fp2_to_list(A), Fp2_to_list(1),                    # ec_curve_t A, C
                     [Fp2_to_list(A24), Fp2_to_list(1)], "true"],       # ec_curve_t A24, is_A24_computed_and_normalized
                    basis2field(*basis),                                # ec_basis_t
                    [[Ibz(v) for v in vs] for vs in mati.transpose()],  # ibz_mat_2x2_t
                    [[Ibz(v) for v in vs] for vs in matj.transpose()],  # ibz_mat_2x2_t
                    [[Ibz(v) for v in vs] for vs in matk.transpose()],  # ibz_mat_2x2_t
                    [[Ibz(v) for v in vs] for vs in mat2.transpose()],  # ibz_mat_2x2_t
                    [[Ibz(v) for v in vs] for vs in mat3.transpose()],  # ibz_mat_2x2_t
                    [[Ibz(v) for v in vs] for vs in mat4.transpose()],  # ibz_mat_2x2_t
                ]
                for (A,A24),basis,(mati,matj,matk),(mat2,mat3,mat4)
                in data
            ])
    ])
 # Write the header (guard, includes, struct typedef, accessor macros, extern
 # declaration) and the source file (includes, definition).
 with open('include/endomorphism_action.h','w') as hfile:
    with open('endomorphism_action.c','w') as cfile:
        print(f'#ifndef ENDOMORPHISM_ACTION_H', file=hfile)
        print(f'#define ENDOMORPHISM_ACTION_H', file=hfile)
        print(f'#include <sqisign_namespace.h>', file=hfile)
        print(f'#include <ec.h>', file=hfile)
        print(f'#include <quaternion.h>', file=hfile)
        print(f'#include <stddef.h>', file=cfile)
        print(f'#include <stdint.h>', file=cfile)
        print(f'#include <endomorphism_action.h>', file=cfile)
        print('''
 /** Type for precomputed endomorphism rings applied to precomputed torsion bases.
 *
 * Precomputed by the precompute scripts.
 *
 * @typedef curve_with_endomorphism_ring_t
 *
 * @struct curve_with_endomorphism_ring
 **/
 typedef struct curve_with_endomorphism_ring {
    ec_curve_t curve;
    ec_basis_t basis_even;
    ibz_mat_2x2_t action_i, action_j, action_k;
    ibz_mat_2x2_t action_gen2, action_gen3, action_gen4;
 } curve_with_endomorphism_ring_t;
              '''.strip(), file=hfile)
        # The macros without an index refer to entry 0 of the array (the q = 1 curve).
        print(f'#define CURVE_E0 (CURVES_WITH_ENDOMORPHISMS->curve)', file=hfile)
        print(f'#define BASIS_EVEN (CURVES_WITH_ENDOMORPHISMS->basis_even)', file=hfile)
        print(f'#define ACTION_I (CURVES_WITH_ENDOMORPHISMS->action_i)', file=hfile)
        print(f'#define ACTION_J (CURVES_WITH_ENDOMORPHISMS->action_j)', file=hfile)
        print(f'#define ACTION_K (CURVES_WITH_ENDOMORPHISMS->action_k)', file=hfile)
        print(f'#define ACTION_GEN2 (CURVES_WITH_ENDOMORPHISMS->action_gen2)', file=hfile)
        print(f'#define ACTION_GEN3 (CURVES_WITH_ENDOMORPHISMS->action_gen3)', file=hfile)
        print(f'#define ACTION_GEN4 (CURVES_WITH_ENDOMORPHISMS->action_gen4)', file=hfile)
        # All entries after the first are exposed as the alternate starting curves.
        print(f'#define NUM_ALTERNATE_STARTING_CURVES {len(data)-1}', file=hfile)
        print(f'#define ALTERNATE_STARTING_CURVES (CURVES_WITH_ENDOMORPHISMS+1)', file=hfile)
        objs.header(file=hfile)
        objs.implementation(file=cfile)
        print(f'#endif', file=hfile)
--- a/scripts/precomp/precompute_hd_splitting.sage
+++ b/scripts/precomp/precompute_hd_splitting.sage
@@ -0,0 +1,158 @@
 #!/usr/bin/env sage
 proof.all(False)  # faster
 ################################################################
 from parameters import p
 # Field
 # Quadratic extension of GF(p) with generator i; modulus=[1,0,1] lists the
 # coefficients of 1 + 0*x + x^2, so i^2 = -1.
 Fp2.<i> = GF((p,2), modulus=[1,0,1])
 # Two parallel lists: row 0 holds the field elements 0, 1, i, -1, -i and
 # row 1 holds the C macro name emitted for the element at the same position.
 Fp2_constants = [
    [Fp2(0), Fp2(1), Fp2(i), Fp2(-1), Fp2(-i)],
    ["FP2_ZERO", "FP2_ONE", "FP2_I", "FP2_MINUS_ONE", "FP2_MINUS_I"]
 ]
 ################################################################
 from cformat import FpEl
 def Fp2_to_list(el):
    """
    Convert `el` to Fp2 and return one FpEl per coordinate of that element,
    in the order in which iterating the Fp2 element yields them.
    """
    coords = []
    for coord in Fp2(el):
        coords.append(FpEl(int(coord), p, True))
    return coords
 def Fp2_to_name(el):
    """
    Return the macro name listed in Fp2_constants for the element `el`.

    Raises ValueError (from list.index) if `el` is not one of the listed
    constants.
    """
    elements, names = Fp2_constants
    position = elements.index(el)
    return names[position]
 ################################################################
 # Splitting Data
 # 4x4 table of signs: entry [a][b] equals (-1)^popcount(a & b).
 # Emitted below as the C array CHI_EVAL.
 chi_eval = [
    [1,1,1,1],
    [1,-1,1,-1],
    [1,1,-1,-1],
    [1,-1,-1,1]
 ]
 # The ten index pairs [a, b] for which chi_eval[a][b] == 1.
 # Emitted below as EVEN_INDEX; the order here fixes the order of the
 # splitting matrices in SPLITTING_TRANSFORMS.
 even_indices = [
    [0, 0],
    [0, 1],
    [0, 2],
    [0, 3],
    [1, 0],
    [1, 2],
    [2, 0],
    [2, 1],
    [3, 0],
    [3, 3],
 ]
 # One 4x4 matrix per even index pair, with entries in {0, 1, -1, i, -i}.
 # The dictionary order is irrelevant: matrices are looked up by key while
 # iterating over even_indices.
 splitting_map = {
    (0, 2): [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 0, 1], [0, 0, -1, 0]],
    (3, 3): [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]],
    (0, 3): [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, -1]],
    (2, 1): [[1, 1, 1, 1], [1, -1, 1, -1], [1, -1, -1, 1], [1, 1, -1, -1]],
    (0, 1): [[1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [0, -1, 0, 0]],
    (1, 2): [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]],
    (2, 0): [[1, 1, 1, 1], [1, -1, 1, -1], [1, -1, -1, 1], [-1, -1, 1, 1]],
    (3, 0): [[1, 1, 1, 1], [1, -1, 1, -1], [1, 1, -1, -1], [-1, 1, 1, -1]],
    (1, 0): [[1, 1, 1, 1], [1, -1, -1, 1], [1, 1, -1, -1], [-1, 1, -1, 1]],
    (0, 0): [[1, i, 1, i], [1, -i, -1, i], [1, i, -1, -i], [-1, i, -1, i]],
 }
 # 24 2x2 maps used for normalization. Applying a uniformly chosen matrix to a
 # level 2 theta null point ensures that we get a random theta null point in the
 # equivalence class of Γ/Γ(2,4).
 # NOTE(review): this list is not referenced again in the visible part of this
 # script; only the 6-element subset below is used to build the 4x4 maps.
 full_normalization_maps_2d = [
    matrix(2, 2, [1, 0, 0, 1]),
    matrix(2, 2, [0, 1, 1, 0]),
    matrix(2, 2, [1, 0, 0, -1]),
    matrix(2, 2, [0, 1, -1, 0]),
    matrix(2, 2, [1, 1, 1, -1]),
    matrix(2, 2, [1, -1, 1, 1]),
    matrix(2, 2, [1, 1, -1, 1]),
    matrix(2, 2, [-1, 1, 1, 1]),
    matrix(2, 2, [1, 0, 0, i]),
    matrix(2, 2, [0, i, 1, 0]),
    matrix(2, 2, [1, 0, 0, -i]),
    matrix(2, 2, [0, i, -1, 0]),
    matrix(2, 2, [i, 1, 1, i]),
    matrix(2, 2, [1, i, i, 1]),
    matrix(2, 2, [i, 1, -1, -i]),
    matrix(2, 2, [1, i, -i, -1]),
    matrix(2, 2, [1, i, 1, -i]),
    matrix(2, 2, [1, -i, 1, i]),
    matrix(2, 2, [1, i, -1, i]),
    matrix(2, 2, [1, -i, -1, -i]),
    matrix(2, 2, [1, 1, i, -i]),
    matrix(2, 2, [i, -i, 1, 1]),
    matrix(2, 2, [1, 1, -i, i]),
    matrix(2, 2, [i, -i, -1, -1]),
 ]
 # 6 2x2 maps used for normalisation. A subset of the preceding 24 matrices
 # that is sufficient to ensure uniform normalisation (under Γ/Γ^0(4))
 # when using the Montgomery model.
 normalization_maps_2d = [
    matrix(2, 2, [1, 0, 0, 1]),
    matrix(2, 2, [0, 1, 1, 0]),
    matrix(2, 2, [1, 1, 1, -1]),
    matrix(2, 2, [-1, 1, 1, 1]),
    matrix(2, 2, [i, 1, 1, i]),
    matrix(2, 2, [1, i, i, 1]),
 ]
 # Turn the dictionary into a list of ten 4x4 matrices, ordered like
 # even_indices, with every entry replaced by the name of its Fp2 constant.
 splitting_matrices = [
    [[Fp2_to_name(Fp2(entry)) for entry in row] for row in splitting_map[tuple(ind)]]
    for ind in even_indices
 ]
 # Six 4x4 maps: the tensor square of each 2x2 normalization map, again with
 # every entry replaced by the name of its Fp2 constant.
 normalization_maps_4d = []
 for map_2d in normalization_maps_2d:
    flat_entries = [Fp2(entry) for entry in map_2d.tensor_product(map_2d, subdivide=False).list()]
    # Cut the flat list of 16 entries into rows of 4
    rows_4d = [flat_entries[start:start+4] for start in range(0, len(flat_entries), 4)]
    normalization_maps_4d.append([[Fp2_to_name(entry) for entry in row] for row in rows_4d])
 ################################################################
 from cformat import Object, ObjectFormatter
 # Objects to emit: (C type, C identifier, value).
 objs = ObjectFormatter(
    [
        Object('int[][]', 'EVEN_INDEX', even_indices),
        Object('int[][]', 'CHI_EVAL', chi_eval),
        Object('fp2_t[]', 'FP2_CONSTANTS', list(map(Fp2_to_list, Fp2_constants[0]))),
        Object('precomp_basis_change_matrix_t[]', 'SPLITTING_TRANSFORMS', [[x] for x in splitting_matrices]),
        Object('precomp_basis_change_matrix_t[]', 'NORMALIZATION_TRANSFORMS', [[x] for x in normalization_maps_4d]),
    ]
 )
 # Write the header (include guard, matrix type, whatever objs.header emits)
 # and the C source (constant-name macros, whatever objs.implementation emits).
 with open("include/hd_splitting_transforms.h", "w") as hfile:
    with open("hd_splitting_transforms.c", "w") as cfile:
        print("#ifndef HD_SPLITTING_H", file=hfile)
        print("#define HD_SPLITTING_H", file=hfile)
        print(f"\n#include <hd.h>", file=hfile)
        print(f"#include <stdint.h>\n", file=hfile)
        print("typedef struct precomp_basis_change_matrix {", file=hfile)
        print("    uint8_t m[4][4];", file=hfile)
        print("} precomp_basis_change_matrix_t;\n", file=hfile)
        print(f"#include <hd_splitting_transforms.h>\n", file=cfile)
        # Define each constant name as its position in Fp2_constants, which is
        # also its position in the emitted FP2_CONSTANTS array.
        # The loop deliberately avoids the name `i`: at module level it would
        # rebind the generator `i` of Fp2 introduced by `Fp2.<i> = ...`.
        for const_index, const_name in enumerate(Fp2_constants[1]):
            print(f"#define {const_name} {const_index}", file=cfile)
        print("", file=cfile)
        objs.header(file=hfile)
        objs.implementation(file=cfile)
        print("\n#endif\n", file=hfile)
--- a/scripts/precomp/precompute_quaternion_constants.sage
+++ b/scripts/precomp/precompute_quaternion_constants.sage
@@ -0,0 +1,44 @@
 #!/usr/bin/env sage
 proof.all(False)  # faster
 ################################################################
 from parameters import p
 # Probability bound that the asserts below are checked against
 negl = 2**-64
 ################################################################
 tors2val = (p+1).valuation(2)
 logp = ceil(log(p, 2))
 loglogp = ceil(log(logp,2))
 # RepresentInteger data
 small = ceil(log(negl, 2) / -1)
 assert 2**-small <= negl
 add_shift = ceil(log(log(negl, 1-1/(64*logp)), 2))
 assert (1 - 1/(64*logp)) ** (2**(add_shift)) <= negl
 # Find_uv constants
 m = 2 + floor((logp - tors2val) / 4)
 # Macro name -> value; insertion order is the order of the #define lines
 defs = {
    'QUAT_primality_num_iter': ceil(-log(negl, 4)),
    'QUAT_repres_bound_input': add_shift,
    # Equivalent ideal data
    'QUAT_equiv_bound_coeff': 2**(1 + add_shift//4),
    'FINDUV_box_size': m,
    'FINDUV_cube_size': (2 * m + 1)**4 - 1,
 }
 ################################################################
 with open('include/quaternion_constants.h','w') as hfile:
    print(f'#include <quaternion.h>', file=hfile)
    for macro_name, macro_value in defs.items():
        # ZZ() rejects any value that is not an exact integer
        print(f'#define {macro_name} {ZZ(macro_value)}', file=hfile)
--- a/scripts/precomp/precompute_quaternion_data.sage
+++ b/scripts/precomp/precompute_quaternion_data.sage
@@ -0,0 +1,91 @@
 #!/usr/bin/env sage
 proof.all(False)  # faster
 from maxorders import p, orders
 from cformat import Ibz, Object, ObjectFormatter
 # Smallest prime above 2^bitlength(p), used for random ideals of fixed norm
 bitlength_p = int(p).bit_length()
 prime_cofactor = next_prime((2^(bitlength_p)))
 algobj = [Ibz(p)]
 def _lattice_obj(basis_mat):
    """[denominator, integer basis], with the basis given as columns of basis_mat."""
    den = basis_mat.denominator()
    return [
        Ibz(den),
        [[Ibz(entry) for entry in column]
            for column in basis_mat.transpose()*den],
    ]
 def _order_obj(q, mat, ii):
    """Entry of EXTREMAL_ORDERS for one tuple of `orders`."""
    den = mat.denominator()
    return [
        # basis (columns)
        _lattice_obj(mat),
        # sqrt(-q)
        [
            Ibz(den),
            [Ibz(c) for c in ii*den],
        ],
        # sqrt(-p)
        [
            Ibz(1),
            [Ibz(c) for c in (0,0,1,0)]
        ],
        q
    ]
 def _ideal_obj(idl):
    """Entry of CONNECTING_IDEALS for one tuple of `orders`."""
    # norm: absolute value of the generator of (row lattice of idl) ∩ Z*(1,0,0,0)
    scalar_line = (ZZ^4).submodule([[1,0,0,0]])
    norm = abs(idl.row_space(ZZ).intersection(scalar_line).basis()[0][0])
    return [
        # basis (columns)
        _lattice_obj(idl),
        # norm
        Ibz(norm),
        # left order
        '&MAXORD_O0',
    ]
 def _elem_obj(gamma):
    """Entry of CONJUGATING_ELEMENTS: [denominator, integer coordinates]."""
    den = gamma.denominator()
    return [
        Ibz(den),
        [Ibz(c) for c in gamma * den],
    ]
 objs = [_order_obj(q, mat, ii) for q,_,mat,ii,_,_ in orders]
 idlobjs = [_ideal_obj(idl) for _,_,_,_,idl,_ in orders]
 gammaobjs = [_elem_obj(gamma) for _,_,_,_,_,gamma in orders]
 # From here on `objs` is the formatter, no longer the list of order entries.
 objs = ObjectFormatter([
        Object('ibz_t', 'QUAT_prime_cofactor', Ibz(prime_cofactor)),
        Object('quat_alg_t', 'QUATALG_PINFTY', algobj),
        Object('quat_p_extremal_maximal_order_t[]', 'EXTREMAL_ORDERS', objs),
        # ideal corresponding to an isogeny from E0 which acts as identity w.r.t. the basis_even
        Object('quat_left_ideal_t[]', 'CONNECTING_IDEALS', idlobjs),
        # elements γ such that each I has right order γ O₁ γ^-1
        Object('quat_alg_elem_t[]', 'CONJUGATING_ELEMENTS', gammaobjs),
    ])
 # Fixed lines that precede the formatter output in each generated file
 header_prologue = [
    f'#include <quaternion.h>',
    #FIXME this should eventually go away?
    f'#define MAXORD_O0 (EXTREMAL_ORDERS->order)',
    f'#define STANDARD_EXTREMAL_ORDER (EXTREMAL_ORDERS[0])',
    f'#define NUM_ALTERNATE_EXTREMAL_ORDERS {len(orders)-1}',
    f'#define ALTERNATE_EXTREMAL_ORDERS (EXTREMAL_ORDERS+1)',
    f'#define ALTERNATE_CONNECTING_IDEALS (CONNECTING_IDEALS+1)',
    f'#define ALTERNATE_CONJUGATING_ELEMENTS (CONJUGATING_ELEMENTS+1)',
 ]
 source_prologue = [
    f'#include <stddef.h>',
    f'#include <stdint.h>',
    f'#include <quaternion_data.h>',
 ]
 with open('include/quaternion_data.h','w') as hfile:
    with open('quaternion_data.c','w') as cfile:
        for text in header_prologue:
            print(text, file=hfile)
        for text in source_prologue:
            print(text, file=cfile)
        objs.header(file=hfile)
        objs.implementation(file=cfile)
--- a/scripts/precomp/precompute_sizes.sage
+++ b/scripts/precomp/precompute_sizes.sage
@@ -0,0 +1,93 @@
 #!/usr/bin/env sage
 proof.all(False)  # faster
 from sage.misc.banner import require_version
 # Stop early when require_version(9, 8) reports an unsuitable Sage version
 sage_version_ok = require_version(9, 8, print_message=True)
 if not sage_version_ok:
    exit('')
 ################################################################
 from parameters import lvl, f, p
 ################################################################
 logp = ceil(log(p, 2))
 tors2val = (p+1).valuation(2)
 tors2part = (p+1).p_primary_part(2)
 tors3part = (p+1).p_primary_part(3)
 # Quantities shared by several of the sizes below
 TORSION_2POWER_BYTES = (tors2part.bit_length() + 7) // 8
 SECURITY_BITS = round(p.bit_length() / 128) * 64
 RESPONSE_LENGTH = ceil(p.bit_length()/2)
 RESPONSE_BYTES = (RESPONSE_LENGTH + 9) // 8
 fpsz = (logp + 63)//64*8
 fp2sz = 2 * fpsz
 # Key and signature sizes in bytes
 size_pubkey = fp2sz + 1  # extra byte for hint
 size_privkey = size_pubkey + 5*fpsz + 4*TORSION_2POWER_BYTES
 size_signature = fp2sz + 2 + 4*RESPONSE_BYTES + (SECURITY_BITS//8) + 1 + 1
 # Macro name -> value; insertion order is the order of the #define lines
 defs = {
    'SECURITY_BITS': SECURITY_BITS,
    'SQIsign_response_length': ceil(logp/2),
    'HASH_ITERATIONS': 2**(32 * ceil( logp/64 ) - (tors2val - ceil(logp/2))),
    'FP_ENCODED_BYTES': fpsz,
    'FP2_ENCODED_BYTES': fp2sz,
    'EC_CURVE_ENCODED_BYTES': fp2sz,  # just the A
    'EC_POINT_ENCODED_BYTES': fp2sz,  # just the x
    'EC_BASIS_ENCODED_BYTES': 3 * fp2sz,
    'PUBLICKEY_BYTES': size_pubkey,
    'SECRETKEY_BYTES': size_privkey,
    'SIGNATURE_BYTES': size_signature,
 }
 algname = f'SQIsign_lvl{lvl}'
 ################################################################
 # One "#define NAME VALUE" line per entry of defs, in insertion order.
 # ZZ() rejects any value that is not an exact integer.
 with open('include/encoded_sizes.h','w') as hfile:
    hfile.writelines(
        f'#define {macro_name} {ZZ(macro_value)}\n'
        for macro_name, macro_value in defs.items()
    )
 ################################################################
 # Text of the NIST-style api.h for this level; the sizes and the algorithm
 # name computed above are substituted into the template.
 api = f'''
 // SPDX-License-Identifier: Apache-2.0
 #ifndef api_h
 #define api_h
 #include <sqisign_namespace.h>
 #define CRYPTO_SECRETKEYBYTES {size_privkey}
 #define CRYPTO_PUBLICKEYBYTES {size_pubkey}
 #define CRYPTO_BYTES {size_signature}
 #define CRYPTO_ALGNAME "{algname}"
 #if defined(ENABLE_SIGN)
 SQISIGN_API
 int
 crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
 SQISIGN_API
 int
 crypto_sign(unsigned char *sm, unsigned long long *smlen,
            const unsigned char *m, unsigned long long mlen,
            const unsigned char *sk);
 #endif
 SQISIGN_API
 int
 crypto_sign_open(unsigned char *m, unsigned long long *mlen,
                 const unsigned char *sm, unsigned long long smlen,
                 const unsigned char *pk);
 #endif /* api_h */
 '''.strip()
 # The handle is not called `f`: that name is the parameter imported from
 # `parameters`, and rebinding it to a file object would silently break any
 # later use of the parameter.
 with open(f'../../../nistapi/lvl{lvl}/api.h', 'w') as apifile:
    print(api, file=apifile)
--- a/scripts/precomp/precompute_torsion_constants.sage
+++ b/scripts/precomp/precompute_torsion_constants.sage
@@ -0,0 +1,40 @@
 #!/usr/bin/env sage
 proof.all(False)  # faster
 ################################################################
 from parameters import p
 ################################################################
 # Largest power of 2 dividing p+1
 tors2part = (p+1).p_primary_part(2)
 lambda_security = round(p.bit_length() / 128) * 64
 # Both degrees are the same prime: the first one above 2^(4*lambda_security)
 N_sec = next_prime(1 << (4*lambda_security))
 N_com = N_sec
 defs = dict(TORSION_2POWER_BYTES=(tors2part.bit_length() + 7) // 8)
 from cformat import Ibz, Object, ObjectFormatter
 # (C identifier, integer value) pairs, all emitted with C type ibz_t
 ibz_constants = [
    ('TWO_TO_SECURITY_BITS', 1 << lambda_security),  # lambda_security = SECURITY_BITS (128, 192, 256)
    ('TORSION_PLUS_2POWER', tors2part),
    ('SEC_DEGREE', N_sec),
    ('COM_DEGREE', N_com),
 ]
 objs = ObjectFormatter([
        Object('ibz_t', c_name, Ibz(c_value))
        for c_name, c_value in ibz_constants
    ])
 # Header: include line, the #define entries of defs, then objs.header output.
 # Source: three include lines, then objs.implementation output.
 with open('include/torsion_constants.h','w') as hfile:
    with open('torsion_constants.c','w') as cfile:
        print(f'#include <quaternion.h>', file=hfile)
        for include_line in (f'#include <stddef.h>', f'#include <stdint.h>', f'#include <torsion_constants.h>'):
            print(include_line, file=cfile)
        for macro_name, macro_value in defs.items():
            print(f'#define {macro_name} {macro_value}', file=hfile)
        objs.header(file=hfile)
        objs.implementation(file=cfile)
--- a/scripts/precomp/torsion_basis.py
+++ b/scripts/precomp/torsion_basis.py
@@ -0,0 +1,75 @@
 from sage.all import ZZ, GF, EllipticCurve, parallel
 def even_torsion_basis_E0(E0, f):
    """
    Return a deterministic pair (P, Q) of points of order 2^f on E0 whose
    Weil pairing has full order 2^f and such that 2^(f-1) * Q == (0, 0).

    The entangled basis algorithm is not available when A = 0, so candidates
    are simply enumerated in a fixed order to obtain something canonical.
    E0 must have a-invariants (0, 0, 0, 1, 0); this is asserted.
    """
    assert E0.a_invariants() == (0, 0, 0, 1, 0)
    Fp2 = E0.base_ring()
    p = Fp2.characteristic()
    full_order = 1 << f
    half_order = 1 << (f - 1)
    def points_order_two_f():
        """
        Yield points of order 2^f obtained from x = 1 + i*x_im, x_im = 1, 2, ...
        """
        x_im = 0
        while True:
            x_im += 1
            x = Fp2([1, x_im])
            if E0.is_x_coord(x):
                # Of the lifts of x, keep the one with the smaller y, where
                # a+bi <= c+di iff (a,b) <= (c,d) as tuples and integers
                # modulo p are compared via their minimal non-negative
                # representatives
                candidate = min(E0.lift_x(x, all=True), key = lambda pt: list(pt.y()))
                candidate.set_order(multiple=p+1)
                order = candidate.order()
                if order % full_order == 0:
                    # Clear the cofactor so that the point has order exactly 2^f
                    candidate *= order // full_order
                    candidate.set_order(full_order)
                    yield candidate
    pts = points_order_two_f()
    P = next(pts)
    for Q in pts:
        # Q lies in E[2^f]; <P, Q> is a basis exactly when e(P, Q) has
        # full order 2^f, i.e. when its 2^(f-1)-th power is -1.
        if P.weil_pairing(Q, full_order) ** half_order == -1:
            break
    # Finally we want to make sure Q is above (0, 0)
    P2 = half_order * P
    Q2 = half_order * Q
    if Q2 != E0(0, 0):
        if P2 == E0(0, 0):
            P, Q = Q, P
        else:
            Q += P
    assert P.weil_pairing(Q, full_order) ** half_order == -1
    assert half_order * Q == E0(0, 0)
    return P, Q
 if __name__ == "__main__":
    # Self-check on one parameter set; the other two are kept for reference.
    # p, f = 5 * 2**248 - 1, 248
    # p, f = 65 * 2**376 - 1, 376
    f = 500
    p = 27 * 2**f - 1
    print(f"p = {ZZ(p+1).factor()} - 1")
    Fp2 = GF(p**2, modulus=[1, 0, 1], names="i")
    E = EllipticCurve(Fp2, [1, 0])
    E.set_order((p + 1) ** 2)
    P, Q = even_torsion_basis_E0(E, f)
    print(f"{P = }")
    print(f"{Q = }")
    two_to_f = 1 << f
    assert P.order() == two_to_f
    assert Q.order() == two_to_f
    # The pairing has full order iff its 2^(f-1)-th power is -1
    e = P.weil_pairing(Q, two_to_f)
    assert e ** (1 << (f - 1)) == -1
    print("all good")
--- a/scripts/precompute_endomorphism_action.sage
+++ b/scripts/precompute_endomorphism_action.sage
@@ -1,203 +0,0 @@
 #!/usr/bin/env sage
 proof.all(False)  # faster
 from sage.misc.banner import require_version
 # Stop early when require_version(10, 0) reports an unsuitable Sage version
 if not require_version(10, 0, print_message=True):
    exit('')
 ################################################################
 from parameters import p, B, f, Tpls, Tmin, Dcom, Dchall
 # Product of the two odd torsion parameters
 T = Tpls * Tmin
 ################################################################
 if p % 4 != 3:
    raise NotImplementedError('requires p ≡ 3 (mod 4)')
 # Fp2 = GF(p)[i] with i^2 = -1; Fp4 is its quadratic extension
 Fp2.<i> = GF((p,2), modulus=[1,0,1])
 Fp4 = Fp2.extension(2,'u')
 # The curve y^2 = x^3 + x, defined over Fp4
 E = EllipticCurve(Fp4, [1,0])
 assert E.j_invariant() == 1728
 assert E.is_supersingular()
 assert E.change_ring(Fp2).frobenius() == -p
 assert E.order() == (p^2-1)^2
 # Endomorphisms playing the roles of 1, i, j, k
 endo_1 = E.scalar_multiplication(1)
 # NOTE(review): relies on the order in which Sage lists the automorphisms;
 # the checks below only confirm that the chosen one squares to -1.
 endo_i = E.automorphisms()[-1]
 endo_j = E.frobenius_isogeny()
 endo_k = endo_i * endo_j
 if 0:  # skipped for speed, for now
    assert endo_i^2 == E.scalar_multiplication(-1)
    assert endo_j^2 == E.scalar_multiplication(-p)
    assert endo_j * endo_i == - endo_i * endo_j
 else:
    # Cheaper variant: check the same relations on a single random point
    R = E.random_point()
    assert (endo_i^2)(R) == -1*R
    assert (endo_j^2)(R) == -p*R
    assert (endo_j*endo_i)(R) == -(endo_i*endo_j)(R)
 def half_endo(summands):
    """
    Return a function P -> ((sum of `summands`) / 2)(P).

    The returned function picks a point Q with 2*Q == P, evaluates every
    endomorphism in `summands` at Q via its `_eval` method, and returns the
    sum converted back to a point of the curve of P.
    """
    def _eval(P):
        E = P.curve()
        assert P in E
        F = E.base_field()
        if (halves := P.division_points(2)):
            Q = halves[0]
        else:
            # No half of P is defined over F: lift P to the quadratic
            # extension of F and halve it there.  Lifting without halving
            # would evaluate the full sum instead of half of it.
            # NOTE(review): assumes each endo._eval accepts points over the
            # extension field, as the previous version of this branch
            # already did; confirm.
            Q = E.change_ring(F.extension(2,'v'))(P).division_points(2)[0]
        R = sum(endo._eval(Q) for endo in summands)
        return E(R)
    return _eval
 # Evaluation maps for 1, i, (i+j)/2, (1+k)/2
 gen1 = endo_1._eval
 gen2 = endo_i._eval
 gen3 = half_endo([endo_i, endo_j])
 gen4 = half_endo([endo_1, endo_k])
 ################################################################
 from sage.groups.generic import order_from_multiple
 # First basis point: walk x = u+1, u+2, ... (u the generator of Fp4) until
 # a lift has order divisible by T*2^f, then scale it to order exactly T*2^f.
 x = Fp4.gen()
 while True:
    x += 1
    try:
        P = E.lift_x(x)
    except ValueError:
        # x is not the x-coordinate of a point of E
        continue
    o = order_from_multiple(P, p^2-1)
    if (T<<f).divides(o):
        P *= o // (T<<f)
        P.set_order(T<<f)
        break
 # Second basis point: same walk from the start, additionally requiring that
 # the Weil pairing with P has full order T*2^f, i.e. that (P, Q) is a basis.
 x = Fp4.gen()
 while True:
    x += 1
    try:
        Q = E.lift_x(x)
    except ValueError:
        continue
    o = order_from_multiple(Q, p^2-1)
    if not (T<<f).divides(o):
        continue
    Q *= o // (T<<f)
    Q.set_order(T<<f)
    if order_from_multiple(P.weil_pairing(Q, T<<f), T<<f, operation='*') == T<<f:
        break
 def dlp(P, Q, R):
    """
    Return (a, b) with a*P + b*Q == R, computed from Weil pairings.

    P and Q must have the same order n and the order of R must divide n
    (both asserted); the result itself is asserted before returning.
    """
    n = P.order()
    assert n == Q.order()
    assert R.order().divides(n)
    base = Fp2(P.weil_pairing(Q, n))
    # e(R, Q) = e(P, Q)^a and e(P, R) = e(P, Q)^b
    pairing_RQ = Fp2(R.weil_pairing(Q, n))
    pairing_PR = Fp2(P.weil_pairing(R, n))
    a = pairing_RQ.log(base)
    b = pairing_PR.log(base)
    assert a*P + b*Q == R
    return a, b
 def matrix_of_isogeny(phi):
    """
    Return the 2x2 matrix over Z/(T*2^f) of `phi` with respect to the global
    basis (P, Q): column 0 holds the coordinates of phi(P), column 1 those
    of phi(Q).
    """
    imP = phi(P)
    imQ = phi(Q)
    coords = [dlp(P, Q, image) for image in (imP, imQ)]
    mat = matrix(Zmod(T<<f), coords).transpose()
    # Sanity check: rebuild both images from the matrix columns
    assert imP == ZZ(mat[0][0])*P + ZZ(mat[1][0])*Q
    assert imQ == ZZ(mat[0][1])*P + ZZ(mat[1][1])*Q
    return mat
 # Matrices of i, j, k on the basis (P, Q); the matrix of 1 is the identity
 # and is therefore omitted.
 #mat1 = matrix_of_isogeny(endo_1)
 #assert mat1 == 1    # identity; omit
 mati, matj, matk = (matrix_of_isogeny(endo) for endo in (endo_i, endo_j, endo_k))
 # Matrices of the generators gen2, gen3, gen4; gen1 is again omitted.
 #mat1 = matrix_of_isogeny(gen1)
 #assert mat1 == 1    # identity; omit
 mat2, mat3, mat4 = (matrix_of_isogeny(gen) for gen in (gen2, gen3, gen4))
 ################################################################
 # Quaternion algebra with i^2 = -1, j^2 = -p.  From here on the names
 # i, j, k are the quaternion generators (i no longer denotes the generator
 # of Fp2).
 Quat.<i,j,k> = QuaternionAlgebra(-1, -p)
 O0 = Quat.quaternion_order([1, i, (i+j)/2, (1+k)/2])
 assert Dcom % 2 == 1  # odd
 # Keep every second column of [1 | mati | matj | matk] modulo Dcom, i.e. the
 # first column of each 2x2 block: the images of P under 1, i, j, k.
 mat = block_matrix(Zmod(Dcom), [[identity_matrix(2), mati, matj, matk]])[:,::2]
 # Coefficient vectors (over 1, i, j, k) in the right kernel, as quaternions
 ker = list(map(Quat, mat.right_kernel_matrix()))
 # Left O0-ideal generated by Dcom and the kernel elements
 idealP = sum((O0*g for g in ker), O0*Dcom)
 assert idealP.norm() == Dcom
 for b in idealP.basis():
    assert sum(Mod(c,Dcom)*g for c,g in zip(b,(1,mati,matj,matk)))[:,0] == 0  # kills P
 # Enumerate integer combinations of the ideal basis until one has reduced
 # norm with gcd(norm, Dcom^2) == Dcom; it generates idealP together with Dcom
 # (asserted below).
 for v in (ZZ^4):
    idealPgen = sum(c*g for c,g in zip(v, idealP.basis()))
    # Clear a denominator of 2 so that the coordinates are integers
    if vector(list(idealPgen)).denominator() == 2:
        idealPgen *= 2
    if gcd(idealPgen.reduced_norm(), Dcom^2) == Dcom:
        break
 assert idealP == O0*Dcom + O0*idealPgen
 mat = mat   # still
 # Solve for coefficients of an element whose action sends P to Q modulo Dcom
 rhs = vector(Zmod(Dcom), [0,1])
 cs = mat.solve_right(rhs)
 distorter = Quat(cs)
 assert sum(Mod(c,Dcom)*g for c,g in zip(distorter,(1,mati,matj,matk))).columns()[0] == vector((0,1))  # maps P->Q
 ################################################################
 from cformat import Ibz, Object, ObjectFormatter
 def field2limbs(el):
    """
    Convert `el` to Fp2 and return, for each of its coordinates, the list of
    its 64-bit limbs, least significant limb first.
    """
    num_limbs = 1 + floor(log(p, 2**64))
    limbs = []
    for coord in Fp2(el):
        value = int(coord)
        limbs.append([(value >> 64*idx) % 2**64 for idx in range(num_limbs)])
    return limbs
 def fmt_basis(name, P, Q):
    """
    Return an ec_basis_t Object called `name` holding, for each of the
    points P, Q and P-Q, the limbs of its coordinates 0 and 2.
    """
    entries = []
    for pt in (P, Q, P-Q):
        entries.append([field2limbs(pt[0]), field2limbs(pt[2])])
    return Object('ec_basis_t', name, entries)
 # Name suffix -> order of the torsion basis emitted as BASIS_<suffix>
 bases = {
        'EVEN': 1<<f,
        'ODD_PLUS': Tpls,
        'ODD_MINUS': Tmin,
        'COMMITMENT_PLUS': gcd(Tpls, Dcom),
        'COMMITMENT_MINUS': gcd(Tmin, Dcom),
        'CHALLENGE': Dchall,
    }
 assert P.order() == Q.order()
 # Each basis is obtained by scaling (P, Q) down to order v; ZZ() fails if v
 # does not divide the order of P.
 objs = ObjectFormatter([
        fmt_basis(f'BASIS_{k}', ZZ(P.order()/v)*P, ZZ(Q.order()/v)*Q)
        for k,v in bases.items()
    ] + [
        Object('ec_curve_t', 'CURVE_E0', [[[int(0)]], [[int(1)]]]),
        Object('ec_point_t', 'CURVE_E0_A24', [[[int(0)]], [[int(1)]]]),
        # Action matrices, emitted row by row
        Object('ibz_mat_2x2_t', 'ACTION_I', [[Ibz(v) for v in vs] for vs in mati]),
        Object('ibz_mat_2x2_t', 'ACTION_J', [[Ibz(v) for v in vs] for vs in matj]),
        Object('ibz_mat_2x2_t', 'ACTION_K', [[Ibz(v) for v in vs] for vs in matk]),
        Object('ibz_mat_2x2_t', 'ACTION_GEN2', [[Ibz(v) for v in vs] for vs in mat2]),
        Object('ibz_mat_2x2_t', 'ACTION_GEN3', [[Ibz(v) for v in vs] for vs in mat3]),
        Object('ibz_mat_2x2_t', 'ACTION_GEN4', [[Ibz(v) for v in vs] for vs in mat4]),
        # Quaternion elements as [denominator 1, integer coordinates]; ZZ()
        # fails on any non-integer coordinate.
        Object('quat_alg_elem_t', 'COMMITMENT_IDEAL_UNDISTORTED_GEN', [Ibz(1), [Ibz(ZZ(v)) for v in idealPgen]]),
        Object('quat_alg_elem_t', 'COMMITMENT_IDEAL_DISTORTION_ENDO', [Ibz(1), [Ibz(ZZ(v)) for v in distorter]]),
    ])
 # Header: three include lines, then objs.header output.
 # Source: three include lines, then objs.implementation output.
 with open('include/endomorphism_action.h','w') as hfile:
    with open('endomorphism_action.c','w') as cfile:
        for include_line in (f'#include <intbig.h>', f'#include <ec.h>', f'#include <quaternion.h>'):
            print(include_line, file=hfile)
        for include_line in (f'#include <stddef.h>', f'#include <stdint.h>', f'#include <endomorphism_action.h>'):
            print(include_line, file=cfile)
        objs.header(file=hfile)
        objs.implementation(file=cfile)
--- a/scripts/precompute_klpt_constants.sage
+++ b/scripts/precompute_klpt_constants.sage
@@ -1,114 +0,0 @@
 #!/usr/bin/env sage
 proof.all(False)  # faster
 from sage.misc.banner import require_version
 # Stop early when require_version(9, 8) reports an unsuitable Sage version
 if not require_version(9, 8, print_message=True):
    exit('')
 ################################################################
 from parameters import f, p, Tpls, Tmin
 # Probability bound that the asserts below are checked against
 negl = 2**-64   #TODO optimize
 ################################################################
 logp = ceil(log(p, 2))
 logT = ceil(log(Tpls*Tmin, 2))
 tors2val = (p+1).valuation(2)
 # Macro name -> value; insertion order is the order of the emitted #defines
 defs = dict()
 # lideal_equiv
 defs['KLPT_equiv_bound_coeff'] = ceil((log(negl, 1-2/logp) ** (1/4) - 1) / 2) + 2
 assert (1 - 2/logp) ** ((2 * defs['KLPT_equiv_bound_coeff'] + 1) ** 4) <= negl
 defs['KLPT_equiv_num_iter'] = (2 * defs['KLPT_equiv_bound_coeff'] + 1) ** 4
 defs['KLPT_primality_num_iter'] = ceil(-log(negl, 4))
 # signing KLPT
 # The length is rounded up to a multiple of f
 defs['KLPT_signing_klpt_length'] = f * ceil (ceil((log(negl, 2) / -2) + 15/4*logp + 25)/f)
 assert 2**(-2 * (defs['KLPT_signing_klpt_length'] - 15/4*logp - 25)) <= negl
 defs['KLPT_signing_num_gamma_trial'] = ceil(log(negl, 2) / -1)
 assert 2 ** ( - defs['KLPT_signing_num_gamma_trial']) <= negl
 defs['KLPT_gamma_exponent_interval_size'] = 0
 defs['KLPT_gamma_exponent_center_shift'] = ceil(log(log(negl, 1-1/logp) + defs['KLPT_signing_num_gamma_trial'], 2) + defs['KLPT_gamma_exponent_interval_size'])
 assert (1 - 1/logp) ** (2**(defs['KLPT_gamma_exponent_center_shift'] - defs['KLPT_gamma_exponent_interval_size']) - defs['KLPT_signing_num_gamma_trial']) <= negl
 defs['KLPT_repres_num_gamma_trial'] = 2**(defs['KLPT_gamma_exponent_center_shift'] + defs['KLPT_gamma_exponent_interval_size'])
 defs['KLPT_signing_number_strong_approx'] = ceil(log(1/64, 1-4/13/logp))
 assert (1 - 4/13/logp) ** defs['KLPT_signing_number_strong_approx'] <= 1/64
 # keygen KLPT
 defs['KLPT_random_prime_attempts'] = 64
 defs['KLPT_secret_key_prime_size'] = ceil(logp / 4)
 # The length is rounded up to a multiple of f
 defs['KLPT_keygen_length'] =   f* ceil ( ceil(log(negl, 2) / -2 + 5/2*logp -25 ) / f)
 assert 2 ** (-2 * (defs['KLPT_keygen_length'] - 5/2*logp +25)) <= negl
 defs['KLPT_keygen_num_gamma_trial'] = ceil(log(negl, 2) / -1)
 defs['KLPT_eichler_smallnorm_bitsize'] = ceil(1/2*logp - 4/3*( logT - 5/4*logp))
 defs['KLPT_keygen_number_strong_approx'] = ceil(log(1/64, 1-2/5/logp))
 assert (1 - 2/5/logp) ** defs['KLPT_keygen_number_strong_approx'] <= 1/64
 # Eichler
 defs['KLPT_eichler_number_mu_norm'] = ceil((logT - 5/4*logp) / log(3,2))
 defs['KLPT_eichler_strong_approx_log_margin'] = 2
 defs['KLPT_eichler_num_equiv_ideal'] = ceil(logp / 10)
 defs['KLPT_eichler_number_strong_approx'] = ceil(10 * logp)
 # signature response
 defs['SQISIGN_response_attempts'] = 64
 # signature isogeny degrees
 defs['SQISIGN_random_length'] = 0
 defs['SQISIGN_signing_total_length'] = defs['KLPT_signing_klpt_length']
 # ZZ() fails unless the length is an exact multiple of tors2val
 defs['SQISIGN_signing_length'] = ZZ(defs['SQISIGN_signing_total_length'] / tors2val)
 defs['SQISIGN_keygen_length'] = ZZ(defs['KLPT_keygen_length'] / tors2val)
 # prime data for Cornacchia
 # The comprehension variable p is local to each comprehension, so the
 # parameter p is not overwritten here.
 primes_1mod4 = [p for p in primes(100) if p%4==1]
 prod_primes_3mod4 = prod(p for p in primes(100) if p%4==3)
 ################################################################
 from cformat import Ibz, Object, ObjectFormatter
 # Small-prime data emitted as C objects
 objs = ObjectFormatter([
        Object('short[]', 'SMALL_PRIMES_1MOD4', list(map(int, primes_1mod4))),
        Object('ibz_t', 'PROD_SMALL_PRIMES_3MOD4', Ibz(prod_primes_3mod4)),
    ])
 ################################################################
 # Header: include line, one #define per entry of defs, then objs.header output.
 # Source: three include lines, then objs.implementation output.
 with open('include/klpt_constants.h','w') as hfile:
    with open('klpt_constants.c','w') as cfile:
        print(f'#include <intbig.h>', file=hfile)
        for include_line in (f'#include <stddef.h>', f'#include <stdint.h>', f'#include <klpt_constants.h>'):
            print(include_line, file=cfile)
        for macro_name, macro_value in defs.items():
            # ZZ() rejects any value that is not an exact integer
            print(f'#define {macro_name} {ZZ(macro_value)}', file=hfile)
        objs.header(file=hfile)
        objs.implementation(file=cfile)
--- a/scripts/precompute_quaternion_data.sage
+++ b/scripts/precompute_quaternion_data.sage
@@ -1,115 +0,0 @@
 #!/usr/bin/env sage
 proof.all(False)  # faster
 from sage.misc.banner import require_version
 # Stop early when require_version(9, 8) reports an unsuitable Sage version
 if not require_version(9, 8, print_message=True):
    exit('')
 ################################################################
 from parameters import p
 num = 7  #TODO how many extra maximal orders to precompute?
 ################################################################
 # Underlying theory:
 # - Ibukiyama, On maximal orders of division quaternion algebras with certain optimal embeddings
 # - https://ia.cr/2023/106 Lemma 10
 from sage.algebras.quatalg.quaternion_algebra import basis_for_quaternion_lattice
 bfql = lambda els: basis_for_quaternion_lattice(els, reverse=True)
 Quat.<i,j,k> = QuaternionAlgebra(-1, -p)
 assert Quat.discriminant() == p         # ramifies correctly
 # Collected tuples (q, image of sqrt(-q) in Quat, basis matrix of the order)
 orders = []
 # Walk through the odd primes q in increasing order until `num` orders exist
 q = 1
 while len(orders) < num:
    q = next_prime(q)
    if q == 2:
        continue
    Quat2.<ii,jj,kk> = QuaternionAlgebra(-q, -p)
    if Quat2.discriminant() != p:       # ramifies incorrectly
        continue
    # Write q = x^2 + p*y^2 over QQ, so that gamma = x + j*y has reduced norm q
    x, y = QuadraticForm(QQ, 2, [1,0,p]).solve(q)
    gamma = x + j*y
    assert gamma.reduced_norm() == q
    ims = [Quat(1), i*gamma, j, k*gamma]
    assert ims[1]^2 == -q
    assert ims[2]^2 == -p
    assert ims[1]*ims[2] == ims[3]
    assert ims[2]*ims[1] == -ims[3]
    # (1,ii,jj,kk)->ims is an isomorphism Quat2->Quat
    # Smallest non-negative square root of -p modulo 4*q
    r = min(map(ZZ, Mod(-p, 4*q).sqrt(all=True)))
    # Basis of an order in Quat2; its shape depends on q mod 4
    if q % 4 == 3:
        bas2 = [
                Quat2(1),
                (1 + ii) / 2,
                jj * (1 + ii) / 2,
                (r + jj) * ii / q,
            ]
    else:
        bas2 = [
                Quat2(1),
                ii,
                (1 + jj) / 2,
                (r + jj) * ii / 2 / q,
            ]
    O2 = Quat2.quaternion_order(bas2)
    assert O2.discriminant() == p       # is maximal
    # Transport the basis to Quat through the isomorphism above
    bas = [sum(c*im for c,im in zip(el,ims)) for el in bas2]
    bas = bfql(bas)
    O = Quat.quaternion_order(bas)
    assert O.discriminant() == p        # is maximal
    assert j in O                       # p-extremal
    # Rows of mat are the basis elements in coordinates over (1, i, j, k)
    mat = matrix(map(list, bas))
 #    print(f'{q = }\nsqrt(-q) = {ims[1]}\n    {(chr(10)+"    ").join(map(str,bas))}', file=sys.stderr)
    assert mat[0] == vector((1,0,0,0))
    orders.append((q, ims[1], mat))
 ################################################################
 def _half_polar_form(x, y):
    """Return (nrd(x+y) - nrd(x) - nrd(y)) / 2 for quaternions x, y."""
    return ((x+y).reduced_norm() - x.reduced_norm() - y.reduced_norm()) / 2
 def _order_basis_obj(basis_mat):
    """[denominator, integer basis], with the basis given as columns of basis_mat."""
    den = basis_mat.denominator()
    return [Ibz(den), [[Ibz(v*den) for v in column] for column in basis_mat.transpose()]]
 def _unit_elem_obj(coords):
    """Quaternion element with denominator 1 and the given integer coordinates."""
    return [Ibz(1), [Ibz(c) for c in coords]]
 def _extremal_order_obj(q, sqrt_minus_q, basis_mat):
    """[order basis, sqrt(-q), sqrt(-p), q] for one entry of `orders`."""
    den = basis_mat.denominator()
    return [
        _order_basis_obj(basis_mat),
        [Ibz(den), [Ibz(c*den) for c in sqrt_minus_q]],
        _unit_elem_obj((0,0,1,0)),
        q,
    ]
 # Matrix of the form above on the basis of Quat
 gram = matrix(ZZ, [
    [_half_polar_form(gi, gj) for gi in Quat.basis()]
    for gj in Quat.basis()
 ])
 # Rows are 1, i, (i+j)/2, (1+k)/2 in coordinates over the basis of Quat
 O0mat = matrix([list(g) for g in [Quat(1), i, (i+j)/2, (1+k)/2]])
 ################################################################
 from cformat import Ibz, Object, ObjectFormatter
 algobj = [Ibz(p), [[Ibz(v) for v in vs] for vs in gram]]
 O0ord = _order_basis_obj(O0mat)
 O0obj = [O0ord, _unit_elem_obj((0,1,0,0)), _unit_elem_obj((0,0,1,0)), 1]
 objs = [_extremal_order_obj(q, ii, mat) for q,ii,mat in orders]
 # From here on `objs` is the formatter, no longer the list of order entries.
 objs = ObjectFormatter([
        Object('quat_alg_t', 'QUATALG_PINFTY', algobj),
        Object('quat_order_t', 'MAXORD_O0', O0ord),
        Object('quat_p_extremal_maximal_order_t', 'STANDARD_EXTREMAL_ORDER', O0obj),
        Object('quat_p_extremal_maximal_order_t[]', 'ALTERNATE_EXTREMAL_ORDERS', objs),
    ])
 # Header: two include lines, the order count, then objs.header output.
 # Source: three include lines, then objs.implementation output.
 with open('include/quaternion_data.h','w') as hfile:
    with open('quaternion_data.c','w') as cfile:
        for header_line in (
            f'#include <intbig.h>',
            f'#include <quaternion.h>',
            f'#define NUM_ALTERNATE_EXTREMAL_ORDERS {len(orders)}',
        ):
            print(header_line, file=hfile)
        for include_line in (f'#include <stddef.h>', f'#include <stdint.h>', f'#include <quaternion_data.h>'):
            print(include_line, file=cfile)
        objs.header(file=hfile)
        objs.implementation(file=cfile)
--- a/scripts/precompute_sizes.sage
+++ b/scripts/precompute_sizes.sage
@@ -1,92 +0,0 @@
 #!/usr/bin/env sage
 proof.all(False)  # faster
 from sage.misc.banner import require_version
 # Stop immediately unless the running Sage is at least version 9.8.
 if not require_version(9, 8, print_message=True):
    exit('')
 ################################################################
 from parameters import lvl, f, p
 ################################################################
 # Number of bits used below to size field elements: ceil(log2(p)).
 logp = ceil(log(p, 2))
 # 2-primary and 3-primary parts of p+1; their bit lengths enter SIGNATURE_LEN.
 tors2part = (p+1).p_primary_part(2)
 tors3part = (p+1).p_primary_part(3)
 #XXX first load the constants from klpt_constants.h
 # Collect every `#define NAME VALUE` line of the header into klpt_consts,
 # mapping NAME to the integer VALUE (int(v, 0) accepts a 0x prefix).
 # NOTE(review): the value pattern [x0-9]+ does not match hex digits a-f, so
 # such lines would be skipped silently -- confirm the header never has them.
 import re
 klpt_consts = dict()
 # Context manager so the header's file handle is closed as soon as it is read.
 with open('include/klpt_constants.h') as klpt_header:
    for l in klpt_header:
        m = re.search(r'#define *([^ ]+) *([x0-9]+)$', l)
        if m:
            k,v = m.groups()
            klpt_consts[k] = int(v, 0)
 # Table of encoded-size constants; every entry is later emitted as a #define.
 defs = dict()
 # logp rounded up to whole 64-bit words, 8 bytes per word, times 2.
 fp2sz = (logp + 63)//64*8 * 2
 defs['FP2_ENCODED_BYTES'] = fp2sz
 defs['EC_CURVE_ENCODED_BYTES'] = fp2sz  # just the A
 defs['EC_POINT_ENCODED_BYTES'] = fp2sz  # just the x
 defs['EC_BASIS_ENCODED_BYTES'] = 3 * defs['EC_POINT_ENCODED_BYTES']
 defs['CHAIN_LENGTH'] = klpt_consts['SQISIGN_keygen_length']
 # The additive constant 55 is flagged by the original authors as unresolved.
 defs['QUAT_ALG_ELEM_ENCODED_BITS'] = ceil(((logp/4) + klpt_consts['KLPT_keygen_length'])/2  +55)  #TODO FIXME figure this out XXX XXX
 defs['QUAT_ALG_ELEM_ENCODED_BYTES'] = (defs['QUAT_ALG_ELEM_ENCODED_BITS'] + 7)//8
 defs['ID2ISO_LONG_TWO_ISOG_ENCODED_BYTES'] = defs['CHAIN_LENGTH'] * (defs['EC_CURVE_ENCODED_BYTES'] + defs['EC_POINT_ENCODED_BYTES'] + 2)
 defs['ZIP_CHAIN_LEN'] = klpt_consts['SQISIGN_signing_length']
 # f is the value imported from parameters; rounded up to whole bytes.
 defs['ID2ISO_COMPRESSED_LONG_TWO_ISOG_ZIP_CHAIN_BYTES'] = (f + 7) // 8
 defs['ID2ISO_COMPRESSED_LONG_TWO_ISOG_BYTES'] = defs['ZIP_CHAIN_LEN'] * defs['ID2ISO_COMPRESSED_LONG_TWO_ISOG_ZIP_CHAIN_BYTES'] + 1
 # Compressed isogeny + bytes for tors2part*tors3part + 1 + bytes for each part.
 defs['SIGNATURE_LEN'] = defs['ID2ISO_COMPRESSED_LONG_TWO_ISOG_BYTES'] + ((tors2part*tors3part).bit_length()+7)//8 + 1 + (tors2part.bit_length()+7)//8 + (tors3part.bit_length()+7)//8
 defs['PUBLICKEY_BYTES'] = defs['EC_CURVE_ENCODED_BYTES']
 # One curve, five quaternion elements, one point and two bases.
 defs['SECRETKEY_BYTES'] = defs['EC_CURVE_ENCODED_BYTES'] + 5*defs['QUAT_ALG_ELEM_ENCODED_BYTES'] + defs['EC_POINT_ENCODED_BYTES'] + defs['EC_BASIS_ENCODED_BYTES'] + defs['EC_BASIS_ENCODED_BYTES']
 # Values substituted into the generated api.h template.
 size_privkey = defs['SECRETKEY_BYTES']
 size_pubkey = defs['PUBLICKEY_BYTES']
 size_signature = defs['SIGNATURE_LEN']
 algname = f'lvl{lvl}'
 ################################################################
 # Emit one `#define NAME VALUE` line per entry of defs, in insertion order.
 # Each value is passed through ZZ() before it is formatted.
 with open('include/encoded_sizes.h','w') as sizes_header:
    for k, v in ((name, ZZ(size)) for name, size in defs.items()):
        print(f'#define {k} {v}', file=sizes_header)
 # Text of the generated NIST api.h: the three size macros and the algorithm
 # name are filled in from the values computed above.
 api = f'''
 // SPDX-License-Identifier: Apache-2.0
 #ifndef api_h
 #define api_h
 #define CRYPTO_SECRETKEYBYTES {size_privkey:4}
 #define CRYPTO_PUBLICKEYBYTES {size_pubkey:4}
 #define CRYPTO_BYTES          {size_signature:4}
 #define CRYPTO_ALGNAME "{algname}"
 int
 crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
 int
 crypto_sign(unsigned char *sm, unsigned long long *smlen,
            const unsigned char *m, unsigned long long mlen,
            const unsigned char *sk);
 int
 crypto_sign_open(unsigned char *m, unsigned long long *mlen,
                 const unsigned char *sm, unsigned long long smlen,
                 const unsigned char *pk);
 #endif /* api_h */
 '''.strip()
 # The handle is deliberately not called `f`: that name is the parameter
 # imported from `parameters`, and rebinding it would silently change its
 # meaning for any code added after this point.
 with open(f'../../../nistapi/lvl{lvl}/api.h', 'w') as api_file:
    print(api, file=api_file)
--- a/scripts/precompute_torsion_constants.sage
+++ b/scripts/precompute_torsion_constants.sage
@@ -1,67 +0,0 @@
 #!/usr/bin/env sage
 proof.all(False)  # faster
 from sage.misc.banner import require_version
 # Stop immediately unless the running Sage is at least version 9.8.
 if not require_version(9, 8, print_message=True):
    exit('')
 ################################################################
 from parameters import p, B, f, Tpls, Tmin, Dcom, Dchall
 ################################################################
 # Odd prime factors of Tpls / Tmin in increasing order, with their exponents.
 Lpls = sorted(set(Tpls.prime_factors()) - {2})
 Epls = [Tpls.valuation(l) for l in Lpls]
 Lmin = sorted(set(Tmin.prime_factors()) - {2})
 Emin = [Tmin.valuation(l) for l in Lmin]
 # 2-primary and 3-primary parts of p+1, and their product.
 tors2part = (p+1).p_primary_part(2)
 tors3part = (p+1).p_primary_part(3)
 tors23part = tors2part * tors3part
 # Byte lengths (bit length rounded up to whole bytes) emitted as #defines.
 defs = {
        'TORSION_2POWER_BYTES': (int(tors2part).bit_length() + 7) // 8,
        'TORSION_3POWER_BYTES': (int(tors3part).bit_length() + 7) // 8,
        'TORSION_23POWER_BYTES': (int(tors23part).bit_length() + 7) // 8,
    }
 from cformat import Ibz, Object, ObjectFormatter
 # Constants to emit, given as (C type, C name, value). The combined lists put
 # the Tpls entries first and the Tmin entries second.
 objs = ObjectFormatter([
        Object('uint64_t', 'TORSION_PLUS_EVEN_POWER', int(f)),
        Object('uint64_t[]', 'TORSION_ODD_PRIMES', Lpls + Lmin),
        Object('uint64_t[]', 'TORSION_ODD_POWERS', Epls + Emin),
        Object('uint64_t[]', 'TORSION_PLUS_ODD_PRIMES', Lpls),      # TODO deduplicate?
        Object('size_t[]', 'TORSION_PLUS_ODD_POWERS', Epls),        # TODO deduplicate?
        Object('uint64_t[]', 'TORSION_MINUS_ODD_PRIMES', Lmin),     # TODO deduplicate?
        Object('size_t[]', 'TORSION_MINUS_ODD_POWERS', Emin),       # TODO deduplicate?
        Object('size_t[]', 'DEGREE_COMMITMENT_POWERS', [Dcom.valuation(l) for l in Lpls+Lmin]), #FIXME should be ec_degree_odd_t
        Object('ibz_t', 'CHARACTERISTIC', Ibz(p)),
        Object('ibz_t', 'TORSION_ODD', Ibz(Tpls * Tmin)),
        Object('ibz_t[]', 'TORSION_ODD_PRIMEPOWERS', [Ibz(l^e) for Tpm in (Tpls,Tmin) for l,e in Tpm.factor()]),
        Object('ibz_t', 'TORSION_ODD_PLUS', Ibz(Tpls)),
        Object('ibz_t', 'TORSION_ODD_MINUS', Ibz(Tmin)),
        Object('ibz_t', 'TORSION_PLUS_2POWER', Ibz(tors2part)),
        Object('ibz_t', 'TORSION_PLUS_3POWER', Ibz(tors3part)),
        Object('ibz_t', 'TORSION_PLUS_23POWER', Ibz(tors23part)),
        Object('ibz_t', 'DEGREE_COMMITMENT', Ibz(Dcom)),
        Object('ibz_t', 'DEGREE_COMMITMENT_PLUS', Ibz(gcd(Dcom, Tpls))),
        Object('ibz_t', 'DEGREE_COMMITMENT_MINUS', Ibz(gcd(Dcom, Tmin))),
        Object('ibz_t', 'DEGREE_CHALLENGE', Ibz(Dchall)),
    ])
 # Write the generated header and implementation; both paths are relative to
 # the current working directory.
 with open('include/torsion_constants.h','w') as hfile:
    with open('torsion_constants.c','w') as cfile:
        print(f'#include <intbig.h>', file=hfile)
        print(f'#include <stddef.h>', file=cfile)
        print(f'#include <stdint.h>', file=cfile)
        print(f'#include <torsion_constants.h>', file=cfile)
        # The byte-length macros go into the header before the declarations.
        for k,v in defs.items():
            print(f'#define {k} {v}', file=hfile)
        objs.header(file=hfile)
        objs.implementation(file=cfile)
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,92 +1,74 @@
 # There are the following dependencies
 #     ┌─┬──────┬─┐           ┌─┬────┬─┐            ┌─┬──────┬─┐
 #     │ ├──────┤ │           │ ├────┤ │            │ ├──────┤ │
 #     │ │Keygen│ │           │ │Sign│ │            │ │Verify│ │
 #     │ ├──────┤ │           │ ├────┤ │            │ ├──────┤ │
 #     └─┴───┬──┴─┘           └─┴─┬──┴─┘            └─┴───┬──┴─┘
 #           │                    │                       │
 #           │                    │                       │
 #           ├────────────────────┼─────────────────┐     │
 #           │                    │                 │     │
 #           │                    │                 │     │
 #       ┌───▼──┐          ┌──────▼────────┐   ┌────▼─────▼───────────┐
 #       │ PRNG ◄────┬─────┤ Iso <-> Ideal ├───►   Elliptic Curves,   │
 #       └───▲──┘    │     └──────┬────────┘   │ Pairings & Isogenies │
 #           │       │            │            └───▲──────┬───────────┘
 #           │       │            │                │      │
 #       ┌───┴──┐    │            │                │      │
 #       │ KLPT ◄────┘            │     ┌──────────┘      │
 #       └───┬──┘                 │     │                 │
 #           │                    │     │                 │
 # ┌─────────▼─────────┐          │     │                 │
 # │ Quaternion orders │          │     │            ┌────▼───┐
 # │     and ideals    │          │     │            │ GF(p²) │
 # └─────────┬─────────┘          │     │            └────┬───┘
 #           │           ┌─┬──────▼─────┴──┬─┐            │
 #     ┌─────▼─────┐     │ ├───────────────┤ │      ┌─────▼─────┐
 #     │ MP BigInt │     │ │Precomputations│ │      │ FP BigInt │
 #     └───────────┘     │ ├───────────────┤ │      └───────────┘
 #                       └─┴───────────────┴─┘                    
 # Component sub-directories. As rendered, this hunk interleaves the removed (-)
 # and added (+) lines; in the added lines quaternion, id2iso and signature are
 # only entered when ENABLE_SIGN is set.
 add_subdirectory(common)
 add_subdirectory(intbig)
 add_subdirectory(quaternion)
 add_subdirectory(precomp)
 add_subdirectory(klpt)
 add_subdirectory(gf)
 add_subdirectory(ec)
-add_subdirectory(id2iso)
+if(ENABLE_SIGN)
-add_subdirectory(protocols)
+    add_subdirectory(quaternion)
 endif()
 add_subdirectory(mp)
 add_subdirectory(gf)
 add_subdirectory(precomp)
 add_subdirectory(ec)
 add_subdirectory(hd)
 add_subdirectory(verification)
 # Signing-only components
 if(ENABLE_SIGN)
    add_subdirectory(id2iso)
    add_subdirectory(signature)
 endif()
 # For every variant in SVARIANT_S define four libraries:
 #   sqisign_<v>, sqisign_<v>_test, sqisign_<v>_nistapi, sqisign_<v>_test_nistapi.
 # In the added (+) lines, signing-only dependencies are wrapped in
 # $<$<BOOL:${ENABLE_SIGN}>:...> so they are dropped when ENABLE_SIGN is off.
 FOREACH(SVARIANT ${SVARIANT_S})
    string(TOLOWER ${SVARIANT} SVARIANT_LOWER)
    string(TOUPPER ${SVARIANT} SVARIANT_UPPER)
    set(SOURCE_FILES_VARIANT sqisign.c)
    # Library for SQIsign variant
    add_library(sqisign_${SVARIANT_LOWER} ${SOURCE_FILES_VARIANT})
    target_link_libraries(sqisign_${SVARIANT_LOWER} PUBLIC
-        ${LIB_PROTOCOLS_${SVARIANT_UPPER}} 
+        $<$<BOOL:${ENABLE_SIGN}>:${LIB_SIGNATURE_${SVARIANT_UPPER}}>
-        ${LIB_ID2ISO_${SVARIANT_UPPER}} 
+        ${LIB_VERIFICATION_${SVARIANT_UPPER}}
-        ${LIB_KLPT_${SVARIANT_UPPER}} 
+        $<$<BOOL:${ENABLE_SIGN}>:${LIB_ID2ISO_${SVARIANT_UPPER}}>
-        ${LIB_QUATERNION} 
+        $<$<BOOL:${ENABLE_SIGN}>:${LIB_QUATERNION}>
-        ${LIB_PRECOMP_${SVARIANT_UPPER}} 
+        ${LIB_MP}
        ${LIB_INTBIG} 
        ${LIB_GF_${SVARIANT_UPPER}}
        ${LIB_EC_${SVARIANT_UPPER}}
-        ${GMP} 
+        ${LIB_HD_${SVARIANT_UPPER}}
        ${LIB_PRECOMP_${SVARIANT_UPPER}}
        $<$<BOOL:${ENABLE_SIGN}>:GMP>
        sqisign_common_sys
    )
-    target_include_directories(sqisign_${SVARIANT_LOWER} PUBLIC ${INC_PROTOCOLS} ${INC_INTBIG} ${INC_QUATERNION} ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_EC} ${INC_GF_${SVARIANT_UPPER}} ${INC_COMMON} ${INC_KLPT} ${INC_ID2ISO} ../include PRIVATE common/generic internal)
+    target_include_directories(sqisign_${SVARIANT_LOWER} PUBLIC $<$<BOOL:${ENABLE_SIGN}>:${INC_SIGNATURE}> ${INC_VERIFICATION} $<$<BOOL:${ENABLE_SIGN}>:${INC_QUATERNION}> ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_MP} ${INC_EC} ${INC_GF} ${INC_GF_${SVARIANT_UPPER}} ${INC_COMMON} ${INC_HD} $<$<BOOL:${ENABLE_SIGN}>:${INC_ID2ISO}> ../include PRIVATE common/generic internal)
    target_compile_definitions(sqisign_${SVARIANT_LOWER} PUBLIC SQISIGN_VARIANT=${SVARIANT})
    # Library for SQIsign variant (test)
    # Same sources and dependencies as above, linked against sqisign_common_test
    # instead of sqisign_common_sys.
    add_library(sqisign_${SVARIANT_LOWER}_test ${SOURCE_FILES_VARIANT})
    target_link_libraries(sqisign_${SVARIANT_LOWER}_test PUBLIC
-        ${LIB_PROTOCOLS_${SVARIANT_UPPER}} 
+        $<$<BOOL:${ENABLE_SIGN}>:${LIB_SIGNATURE_${SVARIANT_UPPER}}>
-        ${LIB_ID2ISO_${SVARIANT_UPPER}} 
+        ${LIB_VERIFICATION_${SVARIANT_UPPER}}
-        ${LIB_KLPT_${SVARIANT_UPPER}} 
+        $<$<BOOL:${ENABLE_SIGN}>:${LIB_ID2ISO_${SVARIANT_UPPER}}>
-        ${LIB_QUATERNION} 
+        $<$<BOOL:${ENABLE_SIGN}>:${LIB_QUATERNION}>
-        ${LIB_PRECOMP_${SVARIANT_UPPER}} 
+        ${LIB_MP}
        ${LIB_INTBIG} 
        ${LIB_GF_${SVARIANT_UPPER}}
        ${LIB_EC_${SVARIANT_UPPER}}
-        ${GMP} 
+        ${LIB_HD_${SVARIANT_UPPER}}
        ${LIB_PRECOMP_${SVARIANT_UPPER}}
        $<$<BOOL:${ENABLE_SIGN}>:GMP>
        sqisign_common_test
    )
-    target_include_directories(sqisign_${SVARIANT_LOWER}_test PUBLIC ${INC_PROTOCOLS} ${INC_INTBIG} ${INC_QUATERNION} ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_EC} ${INC_GF_${SVARIANT_UPPER}} ${INC_COMMON} ${INC_KLPT} ${INC_ID2ISO} ../include PRIVATE common/generic internal)
+    target_include_directories(sqisign_${SVARIANT_LOWER}_test PUBLIC $<$<BOOL:${ENABLE_SIGN}>:${INC_SIGNATURE}> ${INC_VERIFICATION} $<$<BOOL:${ENABLE_SIGN}>:${INC_QUATERNION}> ${INC_PRECOMP_${SVARIANT_UPPER}} ${INC_MP} ${INC_EC} ${INC_GF} ${INC_GF_${SVARIANT_UPPER}} ${INC_COMMON} ${INC_HD} $<$<BOOL:${ENABLE_SIGN}>:${INC_ID2ISO}> ../include PRIVATE common/generic internal)
    target_compile_definitions(sqisign_${SVARIANT_LOWER}_test PUBLIC SQISIGN_VARIANT=${SVARIANT})
    # Library with NIST API
    # The added (+) line links the variant library PUBLIC rather than PRIVATE, so
    # its usage requirements propagate to consumers of the nistapi library.
    set(SOURCE_FILE_NISTAPI nistapi/${SVARIANT_LOWER}/api.c)
    add_library(sqisign_${SVARIANT_LOWER}_nistapi ${SOURCE_FILE_NISTAPI})
-    target_link_libraries(sqisign_${SVARIANT_LOWER}_nistapi PRIVATE sqisign_${SVARIANT_LOWER})
+    target_link_libraries(sqisign_${SVARIANT_LOWER}_nistapi PUBLIC sqisign_${SVARIANT_LOWER})
    target_include_directories(sqisign_${SVARIANT_LOWER}_nistapi PUBLIC nistapi/${SVARIANT_LOWER} PUBLIC ../include)
    target_compile_definitions(sqisign_${SVARIANT_LOWER}_nistapi PUBLIC SQISIGN_VARIANT=${SVARIANT})
    # Library with NIST API (test)
    add_library(sqisign_${SVARIANT_LOWER}_test_nistapi ${SOURCE_FILE_NISTAPI})
-    target_link_libraries(sqisign_${SVARIANT_LOWER}_test_nistapi PRIVATE sqisign_${SVARIANT_LOWER}_test)
+    target_link_libraries(sqisign_${SVARIANT_LOWER}_test_nistapi PUBLIC sqisign_${SVARIANT_LOWER}_test)
    target_include_directories(sqisign_${SVARIANT_LOWER}_test_nistapi PUBLIC nistapi/${SVARIANT_LOWER})
    target_compile_definitions(sqisign_${SVARIANT_LOWER}_test_nistapi PUBLIC SQISIGN_VARIANT=${SVARIANT})
 ENDFOREACH()
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -1,3 +1,8 @@
 # Opt in to the NEW behaviour of policy CMP0076 when this CMake knows it
 # (the policy concerns how target_sources() treats relative paths).
 if (POLICY CMP0076)
    cmake_policy(SET CMP0076 NEW)
 endif()
 # CCSD_NAME is the last path component of this directory; CCSD_NAME_UPPER is
 # its upper-case form.
 get_filename_component(CCSD_NAME ${CMAKE_CURRENT_SOURCE_DIR} NAME)
 string(TOUPPER ${CCSD_NAME} CCSD_NAME_UPPER)
 # Run the selection scripts whose paths are held in these two variables.
 include(${SELECT_SQISIGN_VARIANT})
 include(${SELECT_IMPL_TYPE})
--- a/src/common/arm64crypto/CMakeLists.txt
+++ b/src/common/arm64crypto/CMakeLists.txt
@@ -0,0 +1,40 @@
 # Clang builds use the inline-assembly DRBG; every other compiler uses the
 # intrinsics version, compiled with -fno-strict-aliasing.
 if(CMAKE_C_COMPILER_ID MATCHES "Clang")
    set(SOURCE_FILES_COMMON_ARM64CRYPTO randombytes_ctrdrbg_inline_asm.c)
 else()
    set(SOURCE_FILES_COMMON_ARM64CRYPTO randombytes_ctrdrbg.c)
    set_source_files_properties(randombytes_ctrdrbg.c PROPERTIES COMPILE_FLAGS -fno-strict-aliasing)
 endif()
 # Add the selected source to both common libraries. RANDOMBYTES_ARM64CRYPTO
 # enables the randombytes()/randombytes_init() wrappers in that source.
 foreach(SQISIGN_COMMON_TARGET sqisign_common_test sqisign_common_sys)
    target_sources(${SQISIGN_COMMON_TARGET} PRIVATE ${SOURCE_FILES_COMMON_ARM64CRYPTO})
    target_include_directories(${SQISIGN_COMMON_TARGET} PRIVATE include)
    target_compile_definitions(${SQISIGN_COMMON_TARGET} PRIVATE RANDOMBYTES_ARM64CRYPTO)
    target_compile_options(${SQISIGN_COMMON_TARGET} PRIVATE -march=armv8-a+crypto)
 endforeach()
 # Sources shared by the DRBG test and benchmark executables: the arm64crypto
 # DRBG plus the reference AES/DRBG and the system randomness source.
 set(SOURCE_FILES_CTRDRBG_TEST_BENCHMARK
    ${SOURCE_FILES_COMMON_ARM64CRYPTO}
    ../ref/aes_c.c
    ../ref/randombytes_ctrdrbg.c
    ../generic/randombytes_system.c
 )
 # Test executable, registered with CTest. The RANDOMBYTES_*_PLATFORM macros
 # name the arm64crypto entry points for the shared test source.
 add_executable(sqisign_test_ctrdrbg_arm64crypto ${SOURCE_FILES_CTRDRBG_TEST_BENCHMARK} ../generic/test/test_ctrdrbg.c)
 target_include_directories(sqisign_test_ctrdrbg_arm64crypto PRIVATE ${INC_PUBLIC} ${INC_COMMON} include ../ref/include)
 target_compile_definitions(sqisign_test_ctrdrbg_arm64crypto PRIVATE
    CTRDRBG_TEST_BENCH
    RANDOMBYTES_INIT_PLATFORM=randombytes_init_arm64crypto
    RANDOMBYTES_PLATFORM=randombytes_arm64crypto)
 target_compile_options(sqisign_test_ctrdrbg_arm64crypto PRIVATE -march=armv8-a+crypto)
 add_test(sqisign_test_ctrdrbg_arm64crypto sqisign_test_ctrdrbg_arm64crypto)
 # Benchmark executable with the same configuration; appended to BM_BINS.
 add_executable(sqisign_bench_ctrdrbg_arm64crypto ${SOURCE_FILES_CTRDRBG_TEST_BENCHMARK} ../generic/test/bench_ctrdrbg.c)
 target_include_directories(sqisign_bench_ctrdrbg_arm64crypto PRIVATE ${INC_PUBLIC} ${INC_COMMON} include ../ref/include)
 target_compile_definitions(sqisign_bench_ctrdrbg_arm64crypto PRIVATE
    CTRDRBG_TEST_BENCH
    RANDOMBYTES_INIT_PLATFORM=randombytes_init_arm64crypto
    RANDOMBYTES_PLATFORM=randombytes_arm64crypto)
 target_compile_options(sqisign_bench_ctrdrbg_arm64crypto PRIVATE -march=armv8-a+crypto)
 set(BM_BINS ${BM_BINS} sqisign_bench_ctrdrbg_arm64crypto CACHE INTERNAL "List of benchmark executables")
--- a/src/common/arm64crypto/include/randombytes_arm64crypto.h
+++ b/src/common/arm64crypto/include/randombytes_arm64crypto.h
@@ -0,0 +1,27 @@
 // SPDX-License-Identifier: Apache-2.0
 #ifndef RANDOMBYTES_ARM64CRYPTO_H
 #define RANDOMBYTES_ARM64CRYPTO_H
 #include <stdio.h>
 // Status codes returned by the random-byte generator functions.
 // Negative values are parenthesized so each macro expands to a single
 // primary expression wherever it is used.
 #define RNG_SUCCESS      0
 #define RNG_BAD_MAXLEN  (-1)
 #define RNG_BAD_OUTBUF  (-2)
 #define RNG_BAD_REQ_LEN (-3)
 // State for an AES-based expandable output function.
 // NOTE(review): this type is not referenced by the arm64crypto DRBG code
 // visible alongside this header; field roles below are inferred from their
 // names and sizes -- confirm against the code that uses it.
 typedef struct {
    // presumably one 16-byte block of buffered output
    unsigned char   buffer[16];
    // presumably the read position inside `buffer`
    int             buffer_pos;
    // presumably the number of output bytes still allowed
    unsigned long   length_remaining;
    // 32 bytes, the size of an AES-256 key
    unsigned char   key[32];
    // 16 bytes, the size of one AES block
    unsigned char   ctr[16];
 } AES_XOF_struct;
 // State of the AES-256 CTR-DRBG.
 typedef struct {
    // 32-byte AES-256 key
    unsigned char   Key[32];
    // 16-byte counter block
    unsigned char   V[16];
    // Generate-call counter; the arm64crypto sources set it to 1 on init and
    // increment it once per generate call
    int             reseed_counter;
 } AES256_CTR_DRBG_struct;
 #endif /* RANDOMBYTES_ARM64CRYPTO_H */
--- a/src/common/arm64crypto/randombytes_ctrdrbg.c
+++ b/src/common/arm64crypto/randombytes_ctrdrbg.c
@@ -0,0 +1,276 @@
 // SPDX-License-Identifier: Apache-2.0
 #include "randombytes_arm64crypto.h"
 #include <arm_neon.h>
 #include <string.h>
 // File-local DRBG state (Key, V, reseed_counter) used by every function in
 // this translation unit.
 // NOTE(review): no synchronization around it is visible in this file.
 static AES256_CTR_DRBG_struct DRBG_ctx;
 // Returns lane 0 of AESE(splat(in), 0): `in` is broadcast to all four 32-bit
 // lanes, one AESE step is applied with an all-zero round key, and the first
 // 32-bit lane of the result is extracted.
 static inline uint32_t AES_sbox_x4(uint32_t in) {
  return vgetq_lane_u32(
      vreinterpretq_u32_u8(
          vaeseq_u8(vreinterpretq_u8_u32(vdupq_n_u32(in)), vdupq_n_u8(0))),
      0);
 }
 // Rotates the 32-bit unsigned value x right by n bits (0 < n < 32).
 // Both arguments are parenthesized so expression arguments expand correctly;
 // note that each argument is still evaluated twice.
 #define ROTR32(x, n) (((x) << (32 - (n))) | ((x) >> (n)))
 // Two views of the same 240 bytes: 15 round keys of 16 bytes each, seen
 // either as bytes or as four 32-bit words per round key.
 typedef union {
  uint8_t u8[15][16];
  uint32_t u32[15][4];
 } subkeys_t;
 // Expands a 32-byte AES-256 key into 15 round keys of 16 bytes each.
 //    subkeys - receives the 15 round keys (240 bytes written)
 //    key     - the 32-byte key
 // The expansion runs on a local uint32_t array that is copied out at the end.
 // The caller's byte buffer is never accessed through a uint32_t pointer,
 // which would assume 4-byte alignment and break the aliasing rules.
 // Words use the host byte order, as in the original word-view code.
 static void AES256_key_schedule(uint8_t subkeys[15][16], const uint8_t *key) {
  uint32_t w[15][4];
  uint8_t rcon = 1;
  uint32_t s;
  int i, j;
  // Round keys 0 and 1 are the key itself.
  memcpy(w, key, 32 * sizeof(uint8_t));
  for (i = 2; i < 14; i += 2) {
    // Even round key: rotated S-box of the previous word, XOR round constant.
    s = AES_sbox_x4(w[i - 1][3]);
    w[i][0] = ROTR32(s, 8) ^ rcon ^ w[i - 2][0];
    for (j = 1; j < 4; j++) {
      w[i][j] = w[i][j - 1] ^ w[i - 2][j];
    }
    // Odd round key: S-box only, no rotation and no round constant.
    s = AES_sbox_x4(w[i][3]);
    w[i + 1][0] = s ^ w[i - 1][0];
    for (j = 1; j < 4; j++) {
      w[i + 1][j] = w[i + 1][j - 1] ^ w[i - 1][j];
    }
    // Next round constant: doubling, reduced by 0x11b on overflow.
    rcon = (rcon << 1) ^ ((rcon >> 7) * 0x11b);
  }
  // Final (15th) round key.
  s = AES_sbox_x4(w[13][3]);
  w[14][0] = ROTR32(s, 8) ^ rcon ^ w[12][0];
  for (j = 1; j < 4; j++) {
    w[14][j] = w[14][j - 1] ^ w[12][j];
  }
  memcpy(subkeys, w, sizeof(w));
 }
 // Encrypts `ways` independent 16-byte blocks with AES-256.
 //    ways     - number of blocks
 //    vsubkeys - array of 15 round keys (vectors)
 //    ctr      - array of `ways` input blocks (vectors)
 //    out      - byte pointer receiving ways * 16 bytes
 // Round keys 0..12 are applied with AESE followed by AESMC, round key 13 with
 // AESE only, and round key 14 is XORed in last.
 // The expansion is a single statement with no trailing semicolon, so the
 // macro can be used as `if (c) AES256_ECB_XWAYS(...); else ...`.
 // It declares locals named state, i and j.
 #define AES256_ECB_XWAYS(ways, vsubkeys, ctr, out)                             \
  do {                                                                         \
    uint8x16_t state[(ways)];                                                  \
                                                                               \
    for (int j = 0; j < (ways); j++) {                                         \
      state[j] = vaeseq_u8((ctr)[j], (vsubkeys)[0]);                           \
      state[j] = vaesmcq_u8(state[j]);                                         \
    }                                                                          \
                                                                               \
    for (int i = 1; i < 13; i++) {                                             \
      for (int j = 0; j < (ways); j++) {                                       \
        state[j] = vaeseq_u8(state[j], (vsubkeys)[i]);                         \
        state[j] = vaesmcq_u8(state[j]);                                       \
      }                                                                        \
    }                                                                          \
                                                                               \
    for (int j = 0; j < (ways); j++) {                                         \
      state[j] = vaeseq_u8(state[j], (vsubkeys)[13]);                          \
      state[j] = veorq_u8(state[j], (vsubkeys)[14]);                           \
      vst1q_u8((out) + j * 16, state[j]);                                      \
    }                                                                          \
  } while (0)
 // Encrypts a single 16-byte block with AES-256.
 //    vsubkeys - the 15 AES-256 round keys
 //    ctr      - the 128-bit plaintext block
 //    buffer   - receives the 128-bit ciphertext (16 bytes written)
 static void AES256_ECB(uint8x16_t vsubkeys[15], uint8x16_t ctr,
                       unsigned char *buffer) {
  uint8x16_t *single_block = &ctr;
  AES256_ECB_XWAYS(1, vsubkeys, single_block, buffer);
 }
 // Encrypts three 16-byte blocks with AES-256 in one interleaved pass.
 //    vsubkeys - the 15 AES-256 round keys
 //    ctr      - three 128-bit plaintext blocks
 //    buffer   - receives the three ciphertext blocks (48 bytes written)
 static void AES256_ECB_x3(uint8x16_t vsubkeys[15], uint8x16_t ctr[3],
                          unsigned char *buffer) {
  enum { BLOCKS = 3 };
  AES256_ECB_XWAYS(BLOCKS, vsubkeys, ctr, buffer);
 }
 // Reverses the order of the 16 bytes of *x in place.
 // Implemented with shifts on the value itself, so the object is never
 // accessed through a pointer of a different type.
 static void bswap128(__uint128_t *x) {
  const uint64_t lo = (uint64_t)(*x);
  const uint64_t hi = (uint64_t)(*x >> 64);
  *x = ((__uint128_t)__builtin_bswap64(lo) << 64) | __builtin_bswap64(hi);
 }
 // Adds incr to the 16-byte counter stored at V. On a little-endian host the
 // byte swaps make V behave as a big-endian 128-bit integer.
 //    V    - 16-byte buffer, no alignment requirement
 //    incr - amount to add, modulo 2^128
 // The bytes are moved with memcpy because V is a plain byte buffer that need
 // not be aligned for __uint128_t.
 static void add_to_V(unsigned char V[], int incr) {
  __uint128_t ctr;
  memcpy(&ctr, V, sizeof(ctr));
  bswap128(&ctr);
  ctr += incr;
  bswap128(&ctr);
  memcpy(V, &ctr, sizeof(ctr));
 }
 // CTR-DRBG update step: derives a new Key and V from the current state.
 //    provided_data - 48 bytes XORed into the derived material, or NULL
 //    vsubkeys      - round keys expanded from the current Key
 //    Key           - 32-byte key, overwritten with the new key
 //    V             - 16-byte counter, read as the current value and
 //                    overwritten with the new one
 // The three counter values following V are encrypted to give 48 bytes: the
 // first 32 become the new Key, the last 16 the new V. V is stored already
 // incremented by one.
 // The counter is read and written only through the V parameter, where the
 // previous code also touched DRBG_ctx.V directly.
 static void AES256_CTR_DRBG_Update(const unsigned char *provided_data,
                                   uint8x16_t vsubkeys[15], unsigned char *Key,
                                   unsigned char *V) {
  unsigned char temp[48];
  __uint128_t V128, t;
  uint64_t t64[2];
  uint64x2_t vV[3];
  memcpy(&V128, V, sizeof(V128));
  bswap128(&V128);
  for (int j = 0; j < 3; j++) {
    V128++;
    t = V128;
    bswap128(&t);
    // Copy through a uint64_t array instead of casting &t to uint64_t *.
    memcpy(t64, &t, sizeof(t64));
    vV[j] = vld1q_u64(t64);
  }
  // NOTE(review): the array is still reinterpreted as uint8x16_t[3] by a
  // pointer cast, as in the original code.
  AES256_ECB_x3(vsubkeys, (uint8x16_t *)vV, temp);
  if (provided_data != NULL) {
    for (int i = 0; i < 48; i++) {
      temp[i] ^= provided_data[i];
    }
  }
  memcpy(Key, temp, 32);
  memcpy(V, temp + 32, 16);
  add_to_V(V, 1);
 }
 // Seeds the file-local DRBG state.
 //    entropy_input          - 48 bytes of seed entropy
 //    personalization_string - 48 bytes XORed into the seed, or NULL
 //    security_strength      - ignored
 // Key and V start at all-zero, one update step mixes in the seed, and
 // reseed_counter is set to 1.
 void randombytes_init_arm64crypto(unsigned char *entropy_input,
                                  unsigned char *personalization_string,
                                  int security_strength) {
  unsigned char seed_material[48];
  uint8_t subkeys[15][16];
  uint8x16_t vsubkeys[15];
  (void)security_strength;

  // seed = entropy, XOR personalization when one is supplied
  for (int i = 0; i < 48; i++) {
    unsigned char seed_byte = entropy_input[i];
    if (personalization_string) {
      seed_byte ^= personalization_string[i];
    }
    seed_material[i] = seed_byte;
  }

  memset(DRBG_ctx.V, 0x00, 16);
  memset(DRBG_ctx.Key, 0x00, 32);

  // Round keys of the all-zero starting key
  AES256_key_schedule(subkeys, DRBG_ctx.Key);
  for (int r = 0; r < 15; r++) {
    vsubkeys[r] = vld1q_u8(subkeys[r]);
  }

  AES256_CTR_DRBG_Update(seed_material, vsubkeys, DRBG_ctx.Key, DRBG_ctx.V);
  DRBG_ctx.reseed_counter = 1;
 }
 // Number of counter blocks encrypted per iteration of the fast path.
 #define WAYS 4
 // Writes xlen pseudo-random bytes to x from the file-local DRBG state, then
 // runs one update step and increments reseed_counter.
 // Always returns RNG_SUCCESS.
 // NOTE(review): with xlen == 0 nothing is written but the state still advances.
 int randombytes_arm64crypto(unsigned char *x, unsigned long long xlen) {
  uint8_t subkeys[15][16];
  unsigned char block[16];
  // Vle[] holds the counters as native integers (for arithmetic); V[] holds
  // the same counters in the byte order fed to AES.
  __uint128_t V[WAYS], Vle[WAYS];
  uint8x16x4_t vV;
  uint8x16_t vsubkeys[15];
  // The round keys are re-derived from the current key on every call.
  AES256_key_schedule(subkeys, DRBG_ctx.Key);
  for (int j = 0; j < 15; j++) {
    vsubkeys[j] = vld1q_u8(subkeys[j]);
  }
  // Counter 0 is the stored V as-is; counters 1..WAYS-1 follow it.
  memcpy(&Vle[0], DRBG_ctx.V, sizeof(Vle[0]));
  V[0] = Vle[0];
  vV.val[0] = vld1q_u8((uint8_t *)&V[0]);
  bswap128(&Vle[0]);
  for (int j = 1; j < WAYS; j++) {
    Vle[j] = Vle[j - 1] + 1;
    V[j] = Vle[j];
    bswap128(&V[j]);
    vV.val[j] = vld1q_u8((uint8_t *)&V[j]);
  }
  int entered_fast_path = (xlen >= WAYS * 16) ? 1 : 0;
  // Fast path: WAYS blocks (WAYS * 16 bytes) per iteration.
  while (xlen >= WAYS * 16) {
    // Advance every counter to the next group (the literal 4 equals WAYS).
    for (int j = 0; j < WAYS; j++) {
      Vle[j] += 4;
    }
    // Round key 0
    for (int j = 0; j < WAYS; j++) {
      vV.val[j] = vaeseq_u8(vV.val[j], vsubkeys[0]);
      vV.val[j] = vaesmcq_u8(vV.val[j]);
    }
    // Round keys 1..12
    for (int i = 1; i < 13; i++) {
      for (int j = 0; j < WAYS; j++) {
        vV.val[j] = vaeseq_u8(vV.val[j], vsubkeys[i]);
        vV.val[j] = vaesmcq_u8(vV.val[j]);
      }
    }
    // Round keys 13 and 14, then store the WAYS output blocks.
    for (int j = 0; j < WAYS; j++) {
      vV.val[j] = vaeseq_u8(vV.val[j], vsubkeys[13]);
      vV.val[j] = veorq_u8(vV.val[j], vsubkeys[14]);
      vst1q_u8(x + j * 16, vV.val[j]);
    }
    // Reload the vectors with the counters of the next group.
    for (int j = 0; j < WAYS; j++) {
      V[j] = Vle[j];
      bswap128(&V[j]);
    }
    vV = vld1q_u8_x4((uint8_t *)V);
    x += WAYS * 16;
    xlen -= WAYS * 16;
  }
  // The request ended exactly on a group boundary: set V[0] to the last
  // counter that was actually encrypted (Vle[3] has already been advanced).
  if (entered_fast_path && xlen == 0) {
    // Empty asm taking Vle[3] as a read-write operand with a memory clobber.
    // NOTE(review): presumably a compiler workaround; the reason is not
    // stated in the source.
    asm volatile("" : "+r,m"(Vle[3]) : : "memory");
    V[0] = Vle[3] - 4;
    bswap128(&V[0]);
  }
  // Tail: one block at a time.
  while (xlen > 0) {
    if (xlen > 16) {
      AES256_ECB(vsubkeys, vld1q_u8((uint8_t *)&V[0]), x);
      x += 16;
      xlen -= 16;
      Vle[0]++;
      V[0] = Vle[0];
      bswap128(&V[0]);
    } else {
      // Last block (1..16 bytes): the counter is not advanced afterwards, so
      // V[0] is the last counter used.
      AES256_ECB(vsubkeys, vld1q_u8((uint8_t *)&V[0]), block);
      memcpy(x, block, xlen);
      xlen = 0;
    }
  }
  // Store the counter back and derive the next Key/V.
  memcpy(DRBG_ctx.V, &V[0], sizeof(V[0]));
  AES256_CTR_DRBG_Update(NULL, vsubkeys, DRBG_ctx.Key, DRBG_ctx.V);
  DRBG_ctx.reseed_counter++;
  return RNG_SUCCESS;
 }
 #ifdef RANDOMBYTES_ARM64CRYPTO
 // Generic randombytes() entry point backed by the arm64crypto DRBG.
 //    random_array - receives nbytes pseudo-random bytes
 //    nbytes       - number of bytes requested
 // Returns the status code of randombytes_arm64crypto().
 int randombytes(unsigned char *random_array, unsigned long long nbytes) {
  int ret = randombytes_arm64crypto(random_array, nbytes);
 #ifdef ENABLE_CT_TESTING
  // Mark the whole output as undefined so constant-time testing tracks it.
  // The length must be nbytes: ret is a status code (0 on success), which
  // would mark nothing.
  // NOTE(review): no Valgrind header is included in the visible part of this
  // file -- confirm the macro is declared when ENABLE_CT_TESTING is set.
  VALGRIND_MAKE_MEM_UNDEFINED(random_array, nbytes);
 #endif
  return ret;
 }
 // Generic randombytes_init() entry point; forwards all arguments unchanged.
 void randombytes_init(unsigned char *entropy_input,
                      unsigned char *personalization_string,
                      int security_strength) {
  randombytes_init_arm64crypto(entropy_input, personalization_string,
                               security_strength);
 }
 #endif
--- a/src/common/arm64crypto/randombytes_ctrdrbg_inline_asm.c
+++ b/src/common/arm64crypto/randombytes_ctrdrbg_inline_asm.c
@@ -0,0 +1,422 @@
 // SPDX-License-Identifier: Apache-2.0
 #include <arm_neon.h>
 #include <string.h>
 #include "randombytes_arm64crypto.h"
 // One 128-bit value viewed as 16 bytes, as two 64-bit words, or as a single
 // 128-bit integer.
 typedef union {
  uint8_t u8[16];
  uint64_t u64[2];
  __uint128_t u128;
 } u128_t;
 // File-local DRBG state (Key, V, reseed_counter) used by every function in
 // this translation unit.
 // NOTE(review): no synchronization around it is visible in this file.
 static AES256_CTR_DRBG_struct DRBG_ctx;
 // Returns lane 0 of AESE(splat(in), 0): `in` is broadcast to all four 32-bit
 // lanes, one AESE step is applied with an all-zero round key, and the first
 // 32-bit lane of the result is extracted.
 static inline uint32_t AES_sbox_x4(uint32_t in) {
  return vgetq_lane_u32(
      vreinterpretq_u32_u8(
          vaeseq_u8(vreinterpretq_u8_u32(vdupq_n_u32(in)), vdupq_n_u8(0))),
      0);
 }
 // Rotates the 32-bit unsigned value x right by n bits (0 < n < 32).
 // Both arguments are parenthesized so expression arguments expand correctly;
 // note that each argument is still evaluated twice.
 #define ROTR32(x, n) (((x) << (32 - (n))) | ((x) >> (n)))
 // 15 round keys seen as four 32-bit words each (240 bytes in total).
 typedef union {
  uint32_t u32[15][4];
 } subkeys_t;
 // Expands a 32-byte AES-256 key into 15 round keys of 16 bytes each.
 //    subkeys - receives the 15 round keys (240 bytes written)
 //    key     - the 32-byte key
 // The expansion runs on a local uint32_t array that is copied out at the end.
 // The caller's byte buffer is never accessed through a uint32_t pointer,
 // which would assume 4-byte alignment and break the aliasing rules.
 // Words use the host byte order, as in the original word-view code.
 static void AES256_key_schedule(uint8_t subkeys[15][16], const uint8_t *key) {
  uint32_t w[15][4];
  uint8_t rcon = 1;
  uint32_t s;
  int i, j;
  // Round keys 0 and 1 are the key itself.
  memcpy(w, key, 32 * sizeof(uint8_t));
  for (i = 2; i < 14; i += 2) {
    // Even round key: rotated S-box of the previous word, XOR round constant.
    s = AES_sbox_x4(w[i - 1][3]);
    w[i][0] = ROTR32(s, 8) ^ rcon ^ w[i - 2][0];
    for (j = 1; j < 4; j++) {
      w[i][j] = w[i][j - 1] ^ w[i - 2][j];
    }
    // Odd round key: S-box only, no rotation and no round constant.
    s = AES_sbox_x4(w[i][3]);
    w[i + 1][0] = s ^ w[i - 1][0];
    for (j = 1; j < 4; j++) {
      w[i + 1][j] = w[i + 1][j - 1] ^ w[i - 1][j];
    }
    // Next round constant: doubling, reduced by 0x11b on overflow.
    rcon = (rcon << 1) ^ ((rcon >> 7) * 0x11b);
  }
  // Final (15th) round key.
  s = AES_sbox_x4(w[13][3]);
  w[14][0] = ROTR32(s, 8) ^ rcon ^ w[12][0];
  for (j = 1; j < 4; j++) {
    w[14][j] = w[14][j - 1] ^ w[12][j];
  }
  memcpy(subkeys, w, sizeof(w));
 }
 // Encrypts `ways` independent 16-byte blocks with AES-256.
 //    ways     - number of blocks
 //    vsubkeys - array of 15 round keys (vectors)
 //    ctr      - array of `ways` input blocks (vectors)
 //    out      - byte pointer receiving ways * 16 bytes
 // Round keys 0..12 are applied with AESE followed by AESMC, round key 13 with
 // AESE only, and round key 14 is XORed in last.
 // The expansion is a single statement with no trailing semicolon, so the
 // macro can be used as `if (c) AES256_ECB_XWAYS(...); else ...`.
 // It declares locals named state, i and j.
 #define AES256_ECB_XWAYS(ways, vsubkeys, ctr, out)                             \
  do {                                                                         \
    uint8x16_t state[(ways)];                                                  \
                                                                               \
    for (int j = 0; j < (ways); j++) {                                         \
      state[j] = vaeseq_u8((ctr)[j], (vsubkeys)[0]);                           \
      state[j] = vaesmcq_u8(state[j]);                                         \
    }                                                                          \
                                                                               \
    for (int i = 1; i < 13; i++) {                                             \
      for (int j = 0; j < (ways); j++) {                                       \
        state[j] = vaeseq_u8(state[j], (vsubkeys)[i]);                         \
        state[j] = vaesmcq_u8(state[j]);                                       \
      }                                                                        \
    }                                                                          \
                                                                               \
    for (int j = 0; j < (ways); j++) {                                         \
      state[j] = vaeseq_u8(state[j], (vsubkeys)[13]);                          \
      state[j] = veorq_u8(state[j], (vsubkeys)[14]);                           \
      vst1q_u8((out) + j * 16, state[j]);                                      \
    }                                                                          \
  } while (0)
 // Encrypts a single 16-byte block with AES-256.
 //    vsubkeys - the 15 AES-256 round keys
 //    ctr      - the 128-bit plaintext block
 //    buffer   - receives the 128-bit ciphertext (16 bytes written)
 static void AES256_ECB(uint8x16_t vsubkeys[15], uint8x16_t ctr,
                       unsigned char *buffer) {
  uint8x16_t *single_block = &ctr;
  AES256_ECB_XWAYS(1, vsubkeys, single_block, buffer);
 }
 // Encrypts three 16-byte blocks with AES-256 in one interleaved pass.
 //    vsubkeys - the 15 AES-256 round keys
 //    ctr      - three 128-bit plaintext blocks
 //    buffer   - receives the three ciphertext blocks (48 bytes written)
 static void AES256_ECB_x3(uint8x16_t vsubkeys[15], uint8x16_t ctr[3],
                          unsigned char *buffer) {
  enum { BLOCKS = 3 };
  AES256_ECB_XWAYS(BLOCKS, vsubkeys, ctr, buffer);
 }
 // Reverses the order of the 16 bytes of *x in place: the two 64-bit halves
 // trade places and each half is byte-swapped.
 static void bswap128(u128_t *x) {
  const uint64_t first = x->u64[0];
  const uint64_t second = x->u64[1];
  x->u64[0] = __builtin_bswap64(second);
  x->u64[1] = __builtin_bswap64(first);
 }
 // Adds one to *V, where *V is kept in byte-reversed order: swap to integer
 // order, increment, swap back.
 static void incr_V(u128_t *V) {
  bswap128(V);
  V->u128 += 1;
  bswap128(V);
 }
 // CTR-DRBG update step: derives a new Key and counter from the current state.
 //    provided_data - 48 bytes XORed into the derived material, or NULL
 //    vsubkeys      - round keys expanded from the current Key
 //    Key           - 32-byte key, overwritten with the new key
 //    V             - unused; the counter is read from and written to
 //                    DRBG_ctx.V directly
 // The three counter values following DRBG_ctx.V are encrypted to give 48
 // bytes: the first 32 become the new Key, the last 16 the new counter, which
 // is stored already incremented by one.
 static void AES256_CTR_DRBG_Update(const unsigned char *provided_data,
                                   uint8x16_t vsubkeys[15], unsigned char *Key,
                                   unsigned char *V) {
  unsigned char temp[48];
  u128_t counter, next_V;
  uint64x2_t blocks[3];
  (void)V;

  memcpy(&counter, DRBG_ctx.V, sizeof(counter));
  bswap128(&counter);
  for (int j = 0; j < 3; j++) {
    u128_t swapped;
    counter.u128 += 1;
    swapped = counter;
    bswap128(&swapped);
    blocks[j] = vld1q_u64(swapped.u64);
  }

  AES256_ECB_x3(vsubkeys, (uint8x16_t *)blocks, temp);
  if (provided_data != NULL) {
    for (int i = 0; i < 48; i++) {
      temp[i] ^= provided_data[i];
    }
  }

  memcpy(Key, temp, 32);
  memcpy(next_V.u8, temp + 32, 16);
  incr_V(&next_V);
  memcpy(DRBG_ctx.V, next_V.u8, 16);
 }
 // Seeds the file-local DRBG state.
 //    entropy_input          - 48 bytes of seed entropy
 //    personalization_string - 48 bytes XORed into the seed, or NULL
 //    security_strength      - ignored
 // Key and V start at all-zero, one update step mixes in the seed, and
 // reseed_counter is set to 1.
 void randombytes_init_arm64crypto(unsigned char *entropy_input,
                                  unsigned char *personalization_string,
                                  int security_strength) {
  unsigned char seed_material[48];
  uint8_t subkeys[15][16];
  uint8x16_t vsubkeys[15];
  (void)security_strength;

  // seed = entropy, XOR personalization when one is supplied
  for (int i = 0; i < 48; i++) {
    unsigned char seed_byte = entropy_input[i];
    if (personalization_string) {
      seed_byte ^= personalization_string[i];
    }
    seed_material[i] = seed_byte;
  }

  memset(DRBG_ctx.V, 0x00, 16);
  memset(DRBG_ctx.Key, 0x00, 32);

  // Round keys of the all-zero starting key
  AES256_key_schedule(subkeys, DRBG_ctx.Key);
  for (int r = 0; r < 15; r++) {
    vsubkeys[r] = vld1q_u8(subkeys[r]);
  }

  AES256_CTR_DRBG_Update(seed_material, vsubkeys, DRBG_ctx.Key, DRBG_ctx.V);
  DRBG_ctx.reseed_counter = 1;
 }
 // Number of 128-bit counter blocks kept side by side by
 // randombytes_arm64crypto (sizes the V[] and Vle[] arrays below).
 #define WAYS 4
 // Writes |xlen| pseudo-random bytes to |x| using the global |DRBG_ctx|.
 //
 // The AES-256 key schedule is expanded from DRBG_ctx.Key on every call. Output
 // is produced in two phases:
 //   1. an inline-assembly loop that emits 64 bytes per iteration by running
 //      four AES-256 encryptions of consecutive counter values interleaved;
 //   2. a C loop that emits whatever is left (fewer than 64 bytes) one block at
 //      a time through AES256_ECB.
 // Afterwards the counter is written back to DRBG_ctx.V, the state is refreshed
 // with AES256_CTR_DRBG_Update (no additional input) and reseed_counter is
 // incremented. Always returns RNG_SUCCESS.
 int randombytes_arm64crypto(unsigned char *x, unsigned long long xlen) {
  uint8_t subkeys[15][16];
  unsigned char block[16];
  // V[i] holds counter block i in the byte order that is fed to AES;
  // Vle[i] holds the same value byte-reversed so it can be incremented with
  // ordinary integer arithmetic.
  u128_t V[WAYS], Vle[WAYS];
  uint8x16x4_t vV;
  uint8x16_t vsubkeys[15];
  AES256_key_schedule(subkeys, DRBG_ctx.Key);
  // Keep all 15 round keys in vector variables for the asm and AES256_ECB.
  for (int j = 0; j < 15; j++) {
    vsubkeys[j] = vld1q_u8(subkeys[j]);
  }
 // The asm template below is one very long string literal; silence the
 // pedantic length warning just for it.
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Woverlength-strings"
      /* Prologue: V[0] = DRBG_ctx.V; V[1..3] = V[0] + 1..3 computed as 128-bit
         adds on the byte-reversed value (rev + adds/adc), then stored to V[]. */
  asm("ldp         %[V0l],     %[V0h],  %[DRBG_ctx_V]     \n\t"
      "stp         %[V0l],     %[V0h],    [%[V]     ]     \n\t"
      "rev       %[Vle0h],     %[V0l]                     \n\t"
      "rev       %[Vle0l],     %[V0h]                     \n\t"
      "adds      %[Vle1l],   %[Vle0l],             #1     \n\t"
      "adc       %[Vle1h],   %[Vle0h],            xzr     \n\t"
      "rev         %[V1h],   %[Vle1l]                     \n\t"
      "rev         %[V1l],   %[Vle1h]                     \n\t"
      "stp         %[V1l],     %[V1h],    [%[V], #16]     \n\t"
      "adds      %[Vle2l],   %[Vle0l],             #2     \n\t"
      "adc       %[Vle2h],   %[Vle0h],            xzr     \n\t"
      "rev         %[V2h],   %[Vle2l]                     \n\t"
      "rev         %[V2l],   %[Vle2h]                     \n\t"
      "stp         %[V2l],     %[V2h],    [%[V], #32]     \n\t"
      "adds      %[Vle3l],   %[Vle0l],             #3     \n\t"
      "adc       %[Vle3h],   %[Vle0h],            xzr     \n\t"
      "rev         %[V3h],   %[Vle3l]                     \n\t"
      "rev         %[V3l],   %[Vle3h]                     \n\t"
      "stp         %[V3l],     %[V3h],    [%[V], #48]     \n\t"
      /* Load the four counter blocks into vector registers; skip the wide
         loop entirely when fewer than 64 bytes were requested. */
      "ld1       { %[vV0].16b, %[vV1].16b, %[vV2].16b, %[vV3].16b }, [%[V]]\n\t"
      "cmp        %[xlen],          #64                   \n\t"
      "b.lo            2f                                 \n\t"
      ".p2align         6                                 \n\t"
      /* Label 1: main loop, 64 output bytes per iteration. Rounds using
         subkeys 0 and 1, applied to all four blocks. */
      "1:                                                 \n\t"
      "aese    %[vV0].16b,  %[vsk0].16b                   \n\t"
      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
      "aese    %[vV1].16b,  %[vsk0].16b                   \n\t"
      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
      "aese    %[vV2].16b,  %[vsk0].16b                   \n\t"
      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
      "aese    %[vV3].16b,  %[vsk0].16b                   \n\t"
      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
      "aese    %[vV0].16b,  %[vsk1].16b                   \n\t"
      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
      "aese    %[vV1].16b,  %[vsk1].16b                   \n\t"
      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
      "aese    %[vV2].16b,  %[vsk1].16b                   \n\t"
      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
      "aese    %[vV3].16b,  %[vsk1].16b                   \n\t"
      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
      /* Scalar work interleaved with the AES rounds: advance each of the
         four byte-reversed counters by 4 for the next iteration. */
      "adds      %[Vle0l],     %[Vle0l],           #4     \n\t"
      "adc       %[Vle0h],     %[Vle0h],          xzr     \n\t"
      "adds      %[Vle1l],     %[Vle1l],           #4     \n\t"
      "adc       %[Vle1h],     %[Vle1h],          xzr     \n\t"
      "adds      %[Vle2l],     %[Vle2l],           #4     \n\t"
      "adc       %[Vle2h],     %[Vle2h],          xzr     \n\t"
      "adds      %[Vle3l],     %[Vle3l],           #4     \n\t"
      "adc       %[Vle3h],     %[Vle3h],          xzr     \n\t"
      /* Rounds using subkeys 2 and 3. */
      "aese    %[vV0].16b,  %[vsk2].16b                   \n\t"
      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
      "aese    %[vV1].16b,  %[vsk2].16b                   \n\t"
      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
      "aese    %[vV2].16b,  %[vsk2].16b                   \n\t"
      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
      "aese    %[vV3].16b,  %[vsk2].16b                   \n\t"
      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
      "aese    %[vV0].16b,  %[vsk3].16b                   \n\t"
      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
      "aese    %[vV1].16b,  %[vsk3].16b                   \n\t"
      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
      "aese    %[vV2].16b,  %[vsk3].16b                   \n\t"
      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
      "aese    %[vV3].16b,  %[vsk3].16b                   \n\t"
      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
      /* Byte-reverse the advanced counters back into the V* registers. */
      "rev         %[V0h],     %[Vle0l]                   \n\t"
      "rev         %[V0l],     %[Vle0h]                   \n\t"
      "rev         %[V1h],     %[Vle1l]                   \n\t"
      "rev         %[V1l],     %[Vle1h]                   \n\t"
      "rev         %[V2h],     %[Vle2l]                   \n\t"
      "rev         %[V2l],     %[Vle2h]                   \n\t"
      "rev         %[V3h],     %[Vle3l]                   \n\t"
      "rev         %[V3l],     %[Vle3h]                   \n\t"
      /* Rounds using subkeys 4 through 9. */
      "aese    %[vV0].16b,  %[vsk4].16b                   \n\t"
      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
      "aese    %[vV1].16b,  %[vsk4].16b                   \n\t"
      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
      "aese    %[vV2].16b,  %[vsk4].16b                   \n\t"
      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
      "aese    %[vV3].16b,  %[vsk4].16b                   \n\t"
      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
      "aese    %[vV0].16b,  %[vsk5].16b                   \n\t"
      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
      "aese    %[vV1].16b,  %[vsk5].16b                   \n\t"
      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
      "aese    %[vV2].16b,  %[vsk5].16b                   \n\t"
      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
      "aese    %[vV3].16b,  %[vsk5].16b                   \n\t"
      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
      "aese    %[vV0].16b,  %[vsk6].16b                   \n\t"
      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
      "aese    %[vV1].16b,  %[vsk6].16b                   \n\t"
      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
      "aese    %[vV2].16b,  %[vsk6].16b                   \n\t"
      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
      "aese    %[vV3].16b,  %[vsk6].16b                   \n\t"
      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
      "aese    %[vV0].16b,  %[vsk7].16b                   \n\t"
      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
      "aese    %[vV1].16b,  %[vsk7].16b                   \n\t"
      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
      "aese    %[vV2].16b,  %[vsk7].16b                   \n\t"
      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
      "aese    %[vV3].16b,  %[vsk7].16b                   \n\t"
      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
      "aese    %[vV0].16b,  %[vsk8].16b                   \n\t"
      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
      "aese    %[vV1].16b,  %[vsk8].16b                   \n\t"
      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
      "aese    %[vV2].16b,  %[vsk8].16b                   \n\t"
      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
      "aese    %[vV3].16b,  %[vsk8].16b                   \n\t"
      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
      "aese    %[vV0].16b,  %[vsk9].16b                   \n\t"
      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
      "aese    %[vV1].16b,  %[vsk9].16b                   \n\t"
      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
      "aese    %[vV2].16b,  %[vsk9].16b                   \n\t"
      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
      "aese    %[vV3].16b,  %[vsk9].16b                   \n\t"
      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
      /* Write the next iteration's four counter blocks to V[] so they can
         be reloaded into the vector registers at the bottom of the loop. */
      "stp         %[V0l],       %[V0h],  [%[V]]          \n\t"
      "stp         %[V1l],       %[V1h],  [%[V], #16]     \n\t"
      "stp         %[V2l],       %[V2h],  [%[V], #32]     \n\t"
      "stp         %[V3l],       %[V3h],  [%[V], #48]     \n\t"
      /* Rounds using subkeys 10 through 12. */
      "aese    %[vV0].16b, %[vsk10].16b                   \n\t"
      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
      "aese    %[vV1].16b, %[vsk10].16b                   \n\t"
      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
      "aese    %[vV2].16b, %[vsk10].16b                   \n\t"
      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
      "aese    %[vV3].16b, %[vsk10].16b                   \n\t"
      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
      "aese    %[vV0].16b, %[vsk11].16b                   \n\t"
      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
      "aese    %[vV1].16b, %[vsk11].16b                   \n\t"
      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
      "aese    %[vV2].16b, %[vsk11].16b                   \n\t"
      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
      "aese    %[vV3].16b, %[vsk11].16b                   \n\t"
      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
      "aese    %[vV0].16b, %[vsk12].16b                   \n\t"
      "aesmc   %[vV0].16b,   %[vV0].16b                   \n\t"
      "aese    %[vV1].16b, %[vsk12].16b                   \n\t"
      "aesmc   %[vV1].16b,   %[vV1].16b                   \n\t"
      "aese    %[vV2].16b, %[vsk12].16b                   \n\t"
      "aesmc   %[vV2].16b,   %[vV2].16b                   \n\t"
      "aese    %[vV3].16b, %[vsk12].16b                   \n\t"
      "aesmc   %[vV3].16b,   %[vV3].16b                   \n\t"
      /* Final round: aese with subkey 13 followed by an XOR with subkey 14
         (no aesmc), then store the 64 output bytes to x with post-increment. */
      "aese    %[vV0].16b, %[vsk13].16b                   \n\t"
      "eor     %[vV0].16b,   %[vV0].16b, %[vsk14].16b     \n\t"
      "aese    %[vV1].16b, %[vsk13].16b                   \n\t"
      "eor     %[vV1].16b,   %[vV1].16b, %[vsk14].16b     \n\t"
      "stp        %q[vV0],      %q[vV1],       [%[x]], #32\n\t"
      "aese    %[vV2].16b, %[vsk13].16b                   \n\t"
      "eor     %[vV2].16b,   %[vV2].16b, %[vsk14].16b     \n\t"
      "aese    %[vV3].16b, %[vsk13].16b                   \n\t"
      "eor     %[vV3].16b,   %[vV3].16b, %[vsk14].16b     \n\t"
      "stp        %q[vV2],      %q[vV3],       [%[x]], #32\n\t"
      /* Account for the 64 bytes, reload the next counters and loop while
         at least 64 bytes remain. */
      "sub        %[xlen],      %[xlen],          #64     \n\t"
      "ld1       { %[vV0].16b, %[vV1].16b, %[vV2].16b, %[vV3].16b }, [%[V]]\n\t"
      "cmp        %[xlen],          #64                   \n\t"
      "b.hs            1b                                 \n\t"
      /* If nothing is left, rewind V[0] to (fourth counter - 4), i.e. the
         counter of the last block that was actually encrypted; otherwise
         V[0] stays at the next unused counter for the C tail loop. */
      "cbnz       %[xlen],           2f                   \n\t"
      "subs        %[V0h],     %[Vle3l],           #4     \n\t"
      "sbc         %[V0l],     %[Vle3h],          xzr     \n\t"
      "rev         %[V0h],       %[V0h]                   \n\t"
      "rev         %[V0l],       %[V0l]                   \n\t"
      "stp         %[V0l],       %[V0h],       [%[V]]     \n\t"
      "2:                                                 \n\t"
      /* Outputs: all early-clobber, since they are written before the inputs
         are fully consumed; x and xlen are read-write. The two "=m" operands
         tell the compiler that memory behind x and V is written. */
      : [vV0] "=&w"(vV.val[0]), [vV1] "=&w"(vV.val[1]), [vV2] "=&w"(vV.val[2]),
        [vV3] "=&w"(vV.val[3]), [Vle0l] "=&r"(Vle[0].u64[0]),
        [Vle0h] "=&r"(Vle[0].u64[1]), [Vle1l] "=&r"(Vle[1].u64[0]),
        [Vle1h] "=&r"(Vle[1].u64[1]), [Vle2l] "=&r"(Vle[2].u64[0]),
        [Vle2h] "=&r"(Vle[2].u64[1]), [Vle3l] "=&r"(Vle[3].u64[0]),
        [Vle3h] "=&r"(Vle[3].u64[1]), [x] "+r"(x), [xlen] "+r"(xlen),
        [V0l] "=&r"(V[0].u64[0]), [V0h] "=&r"(V[0].u64[1]),
        [V1l] "=&r"(V[1].u64[0]), [V1h] "=&r"(V[1].u64[1]),
        [V2l] "=&r"(V[2].u64[0]), [V2h] "=&r"(V[2].u64[1]),
        [V3l] "=&r"(V[3].u64[0]), [V3h] "=&r"(V[3].u64[1]),
        "=m"(*(unsigned char(*)[64])x), "=m"(*(unsigned char(*)[64])V)
      /* Inputs: the 15 round keys, the address of V[] and the DRBG counter. */
      :
      [vsk0] "w"(vsubkeys[0]), [vsk1] "w"(vsubkeys[1]), [vsk2] "w"(vsubkeys[2]),
      [vsk3] "w"(vsubkeys[3]), [vsk4] "w"(vsubkeys[4]), [vsk5] "w"(vsubkeys[5]),
      [vsk6] "w"(vsubkeys[6]), [vsk7] "w"(vsubkeys[7]), [vsk8] "w"(vsubkeys[8]),
      [vsk9] "w"(vsubkeys[9]), [vsk10] "w"(vsubkeys[10]),
      [vsk11] "w"(vsubkeys[11]), [vsk12] "w"(vsubkeys[12]),
      [vsk13] "w"(vsubkeys[13]), [vsk14] "w"(vsubkeys[14]), [V] "r"(V),
      [DRBG_ctx_V] "m"(DRBG_ctx.V)
      /* adds/subs/cmp modify the condition flags. */
      : "cc");
 #pragma GCC diagnostic pop
  // Tail: fewer than 64 bytes remain; encrypt V[0] one block at a time.
  while (xlen > 0) {
    // The comparison is strict on purpose: when exactly 16 bytes remain the
    // else branch is taken, so V[0] is left at the last counter that was
    // encrypted rather than advanced past it.
    if (xlen > 16) {
      AES256_ECB(vsubkeys, vld1q_u8((uint8_t *)&V[0]), x);
      x += 16;
      xlen -= 16;
      // Step the counter on its byte-reversed copy, then convert it back.
      Vle[0].u128++;
      V[0] = Vle[0];
      bswap128(&V[0]);
    } else {
      // Last (possibly partial) block: encrypt into a scratch buffer and copy
      // only the bytes that were requested.
      AES256_ECB(vsubkeys, vld1q_u8((uint8_t *)&V[0]), block);
      memcpy(x, block, xlen);
      xlen = 0;
    }
  }
  // Publish the counter back to the global state and refresh Key/V.
  memcpy(DRBG_ctx.V, &V[0], sizeof(V[0]));
  AES256_CTR_DRBG_Update(NULL, vsubkeys, DRBG_ctx.Key, DRBG_ctx.V);
  DRBG_ctx.reseed_counter++;
  return RNG_SUCCESS;
 }
 #ifdef RANDOMBYTES_ARM64CRYPTO
 // Fills |random_array| with |nbytes| bytes from the ARM64-crypto CTR-DRBG and
 // returns the status code reported by randombytes_arm64crypto.
 //
 // When built with ENABLE_CT_TESTING the freshly generated bytes are marked as
 // "undefined" for Valgrind so that any secret-dependent branch or memory
 // access on them is reported.
 int randombytes(unsigned char *random_array, unsigned long long nbytes) {
  int ret = randombytes_arm64crypto(random_array, nbytes);
 #ifdef ENABLE_CT_TESTING
  // The length must be the number of bytes written, not the status code:
  // passing |ret| (a success/failure code) would poison the wrong amount of
  // memory - none at all if the success code is 0.
  VALGRIND_MAKE_MEM_UNDEFINED(random_array, nbytes);
 #endif
  return ret;
 }
 // Public seeding entry point when RANDOMBYTES_ARM64CRYPTO is defined: forwards
 // all three arguments unchanged to randombytes_init_arm64crypto.
 void randombytes_init(unsigned char *entropy_input,
                       unsigned char *personalization_string,
                       int security_strength) {
  randombytes_init_arm64crypto(entropy_input, personalization_string,
                               security_strength);
 }
 #endif
--- a/src/common/broadwell/CMakeLists.txt
+++ b/src/common/broadwell/CMakeLists.txt
@@ -0,0 +1,43 @@
 # Sources of the AES-NI based CTR-DRBG that back randombytes() on this target.
 set(SOURCE_FILES_COMMON_AESNI
    aes_ni.c
    ctr_drbg.c
    randombytes_ctrdrbg_aesni.c
    vaes256_key_expansion.S
 )
 # Add those sources to both common library targets, expose the local include
 # directory, select the AES-NI randombytes via RANDOMBYTES_AES_NI and enable
 # the AES and AVX2 instruction sets.
 foreach(SQISIGN_COMMON_TARGET sqisign_common_test sqisign_common_sys)
    target_sources(${SQISIGN_COMMON_TARGET} PRIVATE ${SOURCE_FILES_COMMON_AESNI})
    target_include_directories(${SQISIGN_COMMON_TARGET} PRIVATE include)
    target_compile_definitions(${SQISIGN_COMMON_TARGET} PRIVATE RANDOMBYTES_AES_NI)
    target_compile_options(${SQISIGN_COMMON_TARGET} PRIVATE -maes -mavx2)
 endforeach()
 # Source set shared by the CTR-DRBG test and benchmark executables: the AES-NI
 # implementation plus the reference (../ref) and system (../generic) RNG sources.
 set(SOURCE_FILES_CTRDRBG_TEST_BENCHMARK
    ../ref/aes_c.c
    aes_ni.c
    ctr_drbg.c
    randombytes_ctrdrbg_aesni.c
    ../ref/randombytes_ctrdrbg.c
    ../generic/randombytes_system.c
    vaes256_key_expansion.S
 )
 # Test executable; RANDOMBYTES_INIT_PLATFORM / RANDOMBYTES_PLATFORM name the
 # AES-NI entry points for the generic test driver. Registered with CTest.
 add_executable(sqisign_test_ctrdrbg_intel ${SOURCE_FILES_CTRDRBG_TEST_BENCHMARK} ../generic/test/test_ctrdrbg.c)
 target_include_directories(sqisign_test_ctrdrbg_intel PRIVATE ${INC_PUBLIC} ${INC_COMMON} include ../ref/include)
 target_compile_definitions(sqisign_test_ctrdrbg_intel PRIVATE
    CTRDRBG_TEST_BENCH
    RANDOMBYTES_INIT_PLATFORM=randombytes_init_aes_ni
    RANDOMBYTES_PLATFORM=randombytes_aes_ni)
 target_compile_options(sqisign_test_ctrdrbg_intel PRIVATE -maes -mavx2)
 add_test(sqisign_test_ctrdrbg_intel sqisign_test_ctrdrbg_intel)
 # Benchmark executable, configured the same way as the test above.
 add_executable(sqisign_bench_ctrdrbg_intel ${SOURCE_FILES_CTRDRBG_TEST_BENCHMARK} ../generic/test/bench_ctrdrbg.c)
 target_include_directories(sqisign_bench_ctrdrbg_intel PRIVATE ${INC_PUBLIC} ${INC_COMMON} include ../ref/include)
 target_compile_definitions(sqisign_bench_ctrdrbg_intel PRIVATE
    CTRDRBG_TEST_BENCH
    RANDOMBYTES_INIT_PLATFORM=randombytes_init_aes_ni
    RANDOMBYTES_PLATFORM=randombytes_aes_ni)
 target_compile_options(sqisign_bench_ctrdrbg_intel PRIVATE -maes -mavx2)
 # Append the benchmark to the cached list of benchmark executables.
 set(BM_BINS ${BM_BINS} sqisign_bench_ctrdrbg_intel CACHE INTERNAL "List of benchmark executables")
--- a/src/common/broadwell/aes_ni.c
+++ b/src/common/broadwell/aes_ni.c
@@ -0,0 +1,258 @@
 /***************************************************************************
 * This implementation is a modified version of the code,
 * written by Nir Drucker and Shay Gueron
 * AWS Cryptographic Algorithms Group
 * (ndrucker@amazon.com, gueron@amazon.com)
 *
 * Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *  
 * Licensed under the Apache License, Version 2.0 (the "License").
 * You may not use this file except in compliance with the License.
 * A copy of the License is located at
 *  
 *     http://www.apache.org/licenses/LICENSE-2.0
 *  
 * or in the "license" file accompanying this file. This file is distributed 
 * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 
 * express or implied. See the License for the specific language governing 
 * permissions and limitations under the License.
 * The license is detailed in the file LICENSE.txt, and applies to this file.
 * ***************************************************************************/
 #include "aes_ni.h"
 #include <string.h>
 #include <emmintrin.h>
 #include <immintrin.h>
 // Short aliases for the 128-bit intrinsics used throughout this file.
 #define AESENC(m, key)         _mm_aesenc_si128(m, key)
 #define AESENCLAST(m, key)     _mm_aesenclast_si128(m, key)
 #define XOR(a, b)              _mm_xor_si128(a, b)
 #define ADD32(a, b)            _mm_add_epi32(a, b)
 #define SHUF8(a, mask)         _mm_shuffle_epi8(a, mask)
 // Called at the end of each routine to wipe the vector registers.
 #define ZERO256                _mm256_zeroall
 // Four 32-bit words which, passed to a *_set_epi32 intrinsic, form a byte
 // shuffle mask that reverses the 16 bytes of a 128-bit lane.
 #define BSWAP_MASK 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
 // 256-bit (two blocks per register) variants, only with VAES256.
 #ifdef VAES256
 #define VAESENC(a, key)        _mm256_aesenc_epi128(a, key)
 #define VAESENCLAST(a, key)    _mm256_aesenclast_epi128(a, key)
 #define EXTRACT128(a, imm)     _mm256_extracti128_si256(a, imm)
 #define XOR256(a, b)           _mm256_xor_si256(a,b)
 #define ADD32_256(a, b)        _mm256_add_epi32(a,b)
 #define SHUF8_256(a, mask)     _mm256_shuffle_epi8(a, mask)
 #endif
 // 512-bit (four blocks per register) variants, only with VAES512.
 // NOTE(review): VAESENC, VAESENCLAST and EXTRACT128 are defined in both
 // sections, so VAES256 and VAES512 look mutually exclusive - confirm.
 #ifdef VAES512
 #define VAESENC(a, key)        _mm512_aesenc_epi128(a, key)
 #define VAESENCLAST(a, key)    _mm512_aesenclast_epi128(a, key)
 #define EXTRACT128(a, imm)     _mm512_extracti64x2_epi64(a, imm)
 #define XOR512(a, b)           _mm512_xor_si512(a,b)
 #define ADD32_512(a, b)        _mm512_add_epi32(a,b)
 #define SHUF8_512(a, mask)     _mm512_shuffle_epi8(a, mask)
 #endif
 // Builds a 128-bit value from the 16 bytes at |ctr| in reversed order:
 // ctr[15] lands in the least significant byte and ctr[0] in the most
 // significant one.
 _INLINE_ __m128i load_m128i(IN const uint8_t *ctr)
 {
    // _mm_setr_epi8 takes its arguments from the lowest byte upwards, so the
    // input bytes are listed last-to-first.
    return _mm_setr_epi8(ctr[15], ctr[14], ctr[13], ctr[12],
                         ctr[11], ctr[10], ctr[9],  ctr[8],
                         ctr[7],  ctr[6],  ctr[5],  ctr[4],
                         ctr[3],  ctr[2],  ctr[1],  ctr[0]);
 }
 // Builds a 128-bit value from the 16 bytes at |ctr| in memory order:
 // ctr[0] lands in the least significant byte and ctr[15] in the most
 // significant one. No alignment is required of |ctr|.
 _INLINE_ __m128i loadr_m128i(IN const uint8_t *ctr)
 {
    return _mm_loadu_si128((const __m128i *)ctr);
 }
 // Encrypts the single 16-byte block at |pt| with the expanded key |ks| and
 // writes the 16-byte result to |ct| (unaligned store).
 void aes256_enc(OUT uint8_t *ct,
                 IN const uint8_t *pt,
                 IN const aes256_ks_t *ks) {
    const __m128i *round_keys = ks->keys;

    // Initial whitening with round key 0.
    __m128i state = XOR(loadr_m128i(pt), round_keys[0]);

    // Full rounds 1 .. AES256_ROUNDS-1, then the final round.
    uint32_t round = 1;
    while (round < AES256_ROUNDS) {
        state = AESENC(state, round_keys[round]);
        round++;
    }
    state = AESENCLAST(state, round_keys[AES256_ROUNDS]);

    _mm_storeu_si128((void*)ct, state);

    // Wipe the vector registers so no key-dependent data lingers in them.
    ZERO256();
 }
 // Writes |num_blocks| 16-byte keystream blocks to |ct|. Block k is the AES-256
 // encryption of the counter block at |ctr| with k added to its last four
 // bytes, treated as a big-endian 32-bit number (the addition wraps within
 // those 32 bits).
 void aes256_ctr_enc(OUT uint8_t *ct,
                     IN const uint8_t *ctr,
                     IN const uint32_t num_blocks,
                     IN const aes256_ks_t *ks)
 {
    const __m128i bswap_mask = _mm_set_epi32(BSWAP_MASK);
    const __m128i one = _mm_set_epi32(0,0,0,1);

    // The counter is kept byte-reversed so that its last four input bytes sit
    // in the lowest 32-bit lane, where ADD32 can increment them directly.
    __m128i ctr_block = load_m128i(ctr);

    for (uint32_t bidx = 0; bidx < num_blocks; bidx++)
    {
        // Restore the original byte order and whiten with round key 0.
        __m128i state = XOR(SHUF8(ctr_block, bswap_mask), ks->keys[0]);

        for (uint32_t round = 1; round < AES256_ROUNDS; round++) {
            state = AESENC(state, ks->keys[round]);
        }
        state = AESENCLAST(state, ks->keys[AES256_ROUNDS]);

        // Unaligned store, so |ct| needs no particular alignment.
        _mm_storeu_si128((void*)&ct[16*bidx], state);

        ctr_block = ADD32(ctr_block, one);
    }

    // Wipe the vector registers so no key-dependent data lingers in them.
    ZERO256();
 }
 #ifdef VAES256
 // Replicates each of the AES256_ROUNDS + 1 round keys of |ks| into both
 // 128-bit lanes of the matching entry of |ks256|.
 _INLINE_ void load_ks(OUT __m256i ks256[AES256_ROUNDS + 1],
                       IN const aes256_ks_t *ks)
 {
    uint32_t round = 0;
    while (round <= AES256_ROUNDS)
    {
        ks256[round] = _mm256_broadcastsi128_si256(ks->keys[round]);
        round++;
    }
 }
 // NIST 800-90A Table 3, Section 10.2.1 (no derivation function) states that
 // max_number_of_bits_per_request is min((2^ctr_len - 4) x block_len, 2^19) <= 2^19.
 // Therefore the maximal number of blocks (16 bytes) is 2^19/128 = 2^19/2^7 = 2^12 < 2^32.
 // Here num_blocks is assumed to be less than 2^32.
 // It is the caller's responsibility to ensure it.
 //
 // Same contract as aes256_ctr_enc, but two consecutive counter blocks are
 // encrypted per iteration in one 256-bit register; an odd final block is
 // delegated to aes256_ctr_enc.
 void aes256_ctr_enc256(OUT uint8_t *ct,
                        IN const uint8_t *ctr,
                        IN const uint32_t num_blocks,
                        IN const aes256_ks_t *ks)
 {
    const uint64_t pair_count = num_blocks/2;
    const uint64_t leftover = num_blocks - (2*(pair_count));

    // Round keys replicated into both lanes.
    __m256i ks256[AES256_ROUNDS + 1];
    load_ks(ks256, ks);

    // Shuffle mask and counter steps.
    const __m256i bswap_mask = _mm256_set_epi32(BSWAP_MASK, BSWAP_MASK);
    const __m256i two = _mm256_set_epi32(0,0,0,2,0,0,0,2);
    const __m256i init = _mm256_set_epi32(0,0,0,1,0,0,0,0);

    // Both lanes start from the byte-reversed counter; the upper lane is then
    // moved one step ahead so the lanes hold counter and counter + 1.
    __m128i single_block = load_m128i(ctr);
    __m256i ctr_blocks = ADD32_256(_mm256_broadcastsi128_si256(single_block), init);

    for (uint32_t block_idx = 0; block_idx < pair_count; block_idx++)
    {
        // Restore byte order per lane and whiten with round key 0.
        __m256i state = XOR256(SHUF8_256(ctr_blocks, bswap_mask), ks256[0]);

        for (uint32_t round = 1; round < AES256_ROUNDS; round++)
        {
            state = VAESENC(state, ks256[round]);
        }
        state = VAESENCLAST(state, ks256[AES256_ROUNDS]);

        // Unaligned store of both output blocks.
        _mm256_storeu_si256((__m256i *)&ct[PAR_AES_BLOCK_SIZE * block_idx], state);

        // Advance both counters by two.
        ctr_blocks = ADD32_256(ctr_blocks, two);
    }

    if(0 != leftover)
    {
        // The lower lane, back in its original byte order, is the counter
        // block for the remaining single block.
        const __m256i next = SHUF8_256(ctr_blocks, bswap_mask);
        single_block = EXTRACT128(next, 0);
        aes256_ctr_enc(&ct[PAR_AES_BLOCK_SIZE * pair_count],
                       (const uint8_t*)&single_block, leftover, ks);
    }

    // Wipe the vector registers so no key-dependent data lingers in them.
    ZERO256();
 }
 #endif //VAES256
 #ifdef VAES512
 // Replicates each of the AES256_ROUNDS + 1 round keys of |ks| into all four
 // 128-bit lanes of the matching entry of |ks512|.
 _INLINE_ void load_ks(OUT __m512i ks512[AES256_ROUNDS + 1],
                       IN const aes256_ks_t *ks)
 {
    uint32_t round = 0;
    while (round <= AES256_ROUNDS)
    {
        ks512[round] = _mm512_broadcast_i32x4(ks->keys[round]);
        round++;
    }
 }
 // NIST 800-90A Table 3, Section 10.2.1 (no derivation function) states that
 // max_number_of_bits_per_request is min((2^ctr_len - 4) x block_len, 2^19) <= 2^19.
 // Therefore the maximal number of blocks (16 bytes) is 2^19/128 = 2^19/2^7 = 2^12 < 2^32.
 // Here num_blocks is assumed to be less than 2^32.
 // It is the caller's responsibility to ensure it.
 //
 // Same contract as aes256_ctr_enc, but four consecutive counter blocks are
 // encrypted per iteration in one 512-bit register; up to three trailing
 // blocks are delegated to aes256_ctr_enc.
 void aes256_ctr_enc512(OUT uint8_t *ct,
                        IN const uint8_t *ctr,
                        IN const uint32_t num_blocks,
                        IN const aes256_ks_t *ks)
 {
    const uint64_t quad_count = num_blocks/4;
    const uint64_t leftover = num_blocks - (4*(quad_count));

    // Round keys replicated into all four lanes.
    __m512i ks512[AES256_ROUNDS + 1];
    load_ks(ks512, ks);

    // Shuffle mask and counter steps.
    const __m512i bswap_mask = _mm512_set_epi32(BSWAP_MASK, BSWAP_MASK,
                                                BSWAP_MASK, BSWAP_MASK);
    const __m512i four = _mm512_set_epi32(0,0,0,4,0,0,0,4,0,0,0,4,0,0,0,4);
    const __m512i init = _mm512_set_epi32(0,0,0,3,0,0,0,2,0,0,0,1,0,0,0,0);

    // All lanes start from the byte-reversed counter; lanes 1..3 are then
    // moved 1..3 steps ahead of lane 0.
    __m128i single_block = load_m128i(ctr);
    __m512i ctr_blocks = ADD32_512(_mm512_broadcast_i32x4(single_block), init);

    for (uint32_t block_idx = 0; block_idx < quad_count; block_idx++)
    {
        // Restore byte order per lane and whiten with round key 0.
        __m512i state = XOR512(SHUF8_512(ctr_blocks, bswap_mask), ks512[0]);

        for (uint32_t round = 1; round < AES256_ROUNDS; round++)
        {
            state = VAESENC(state, ks512[round]);
        }
        state = VAESENCLAST(state, ks512[AES256_ROUNDS]);

        // Unaligned store of all four output blocks.
        _mm512_storeu_si512(&ct[PAR_AES_BLOCK_SIZE * block_idx], state);

        // Advance all four counters by four.
        ctr_blocks = ADD32_512(ctr_blocks, four);
    }

    if(0 != leftover)
    {
        // Lane 0, back in its original byte order, is the counter block for
        // the first of the remaining blocks.
        const __m512i next = SHUF8_512(ctr_blocks, bswap_mask);
        single_block = EXTRACT128(next, 0);
        aes256_ctr_enc(&ct[PAR_AES_BLOCK_SIZE * quad_count],
                       (const uint8_t*)&single_block, leftover, ks);
    }

    // Wipe the vector registers so no key-dependent data lingers in them.
    ZERO256();
 }
 #endif //VAES512
--- a/src/common/broadwell/ctr_drbg.c
+++ b/src/common/broadwell/ctr_drbg.c
@@ -0,0 +1,201 @@
 /* Copyright (c) 2017, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
 /***************************************************************************
 * Small modification by Nir Drucker and Shay Gueron
 * AWS Cryptographic Algorithms Group
 * (ndrucker@amazon.com, gueron@amazon.com)
 * include:
 * 1) Use memcpy/memset instead of OPENSSL_memcpy/memset
 * 2) Include aes.h as the underlying aes code
 * 3) Modifying the drbg structure
 * ***************************************************************************/
 #include "ctr_drbg.h"
 #include <string.h>
 // Section references in this file refer to SP 800-90Ar1:
 // http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-90Ar1.pdf
 // CTR_DRBG_init instantiates |*drbg| from |CTR_DRBG_ENTROPY_LEN| bytes of
 // |entropy| XORed with an optional |personalization| string of
 // |personalization_len| bytes. It returns one on success and zero if
 // |personalization_len| exceeds |CTR_DRBG_ENTROPY_LEN|.
 int CTR_DRBG_init(CTR_DRBG_STATE *drbg,
                   const uint8_t entropy[CTR_DRBG_ENTROPY_LEN],
                   const uint8_t *personalization, size_t personalization_len) {
  // Section 10.2.1.3.1
  if (personalization_len > CTR_DRBG_ENTROPY_LEN) {
    return 0;
  }
  uint8_t seed_material[CTR_DRBG_ENTROPY_LEN];
  memcpy(seed_material, entropy, CTR_DRBG_ENTROPY_LEN);
  for (size_t i = 0; i < personalization_len; i++) {
    seed_material[i] ^= personalization[i];
  }
  // Section 10.2.1.2
  // kInitMask is the result of encrypting blocks with big-endian value 1, 2
  // and 3 with the all-zero AES-256 key.
  static const uint8_t kInitMask[CTR_DRBG_ENTROPY_LEN] = {
      0x53, 0x0f, 0x8a, 0xfb, 0xc7, 0x45, 0x36, 0xb9, 0xa9, 0x63, 0xb4, 0xf1,
      0xc4, 0xcb, 0x73, 0x8b, 0xce, 0xa7, 0x40, 0x3d, 0x4d, 0x60, 0x6b, 0x6e,
      0x07, 0x4e, 0xc5, 0xd3, 0xba, 0xf3, 0x9d, 0x18, 0x72, 0x60, 0x03, 0xca,
      0x37, 0xa6, 0x2a, 0x74, 0xd1, 0xa2, 0xf5, 0x8e, 0x75, 0x06, 0x35, 0x8e,
  };
  for (size_t i = 0; i < sizeof(kInitMask); i++) {
    seed_material[i] ^= kInitMask[i];
  }
  // The first 32 bytes become the AES-256 key, the last 16 the counter block.
  aes256_key_t key;
  memcpy(key.raw, seed_material, 32);
  memcpy(drbg->counter.bytes, seed_material + 32, 16);
  aes256_key_expansion(&drbg->ks, &key);
  drbg->reseed_counter = 1;
  // The seed material and raw key fully determine the DRBG state; do not
  // leave copies of them behind on the stack.
  secure_clean(seed_material, sizeof(seed_material));
  secure_clean((uint8_t *)&key, sizeof(key));
  return 1;
 }
 // ctr32_add adds |n| to the last four bytes of |drbg->counter|, treated as a
 // big-endian number. The addition wraps within those 32 bits.
 static void ctr32_add(CTR_DRBG_STATE *drbg, uint32_t n) {
  // Byte-swap the stored word, add, and swap back before storing.
  const uint32_t swapped = CRYPTO_bswap4(drbg->counter.words[3]);
  const uint32_t bumped = swapped + n;
  drbg->counter.words[3] = CRYPTO_bswap4(bumped);
 }
 // ctr_drbg_update derives a fresh key and counter block for |*drbg| from
 // three encryptions of the incremented counter, XORed with the first
 // |data_len| bytes of |data|. It returns one on success and zero if
 // |data_len| exceeds |CTR_DRBG_ENTROPY_LEN|. |data| is not read when
 // |data_len| is zero.
 static int ctr_drbg_update(CTR_DRBG_STATE *drbg, const uint8_t *data,
                            size_t data_len) {
  // Per section 10.2.1.2, |data_len| must be |CTR_DRBG_ENTROPY_LEN|. Here, we
  // allow shorter inputs and right-pad them with zeros. This is equivalent to
  // the specified algorithm but saves a copy in |CTR_DRBG_generate|.
  if (data_len > CTR_DRBG_ENTROPY_LEN) {
    return 0;
  }
  uint8_t temp[CTR_DRBG_ENTROPY_LEN];
  for (size_t i = 0; i < CTR_DRBG_ENTROPY_LEN; i += AES_BLOCK_SIZE) {
    ctr32_add(drbg, 1);
    aes256_enc(temp + i, drbg->counter.bytes, &drbg->ks);
  }
  for (size_t i = 0; i < data_len; i++) {
    temp[i] ^= data[i];
  }
  // The first 32 bytes become the new AES-256 key, the last 16 the new
  // counter block.
  aes256_key_t key;
  memcpy(key.raw, temp, 32);
  memcpy(drbg->counter.bytes, temp + 32, 16);
  aes256_key_expansion(&drbg->ks, &key);
  // |temp| and |key| hold the complete new DRBG state; do not leave copies of
  // them behind on the stack.
  secure_clean(temp, sizeof(temp));
  secure_clean((uint8_t *)&key, sizeof(key));
  return 1;
 }
 // CTR_DRBG_reseed mixes |CTR_DRBG_ENTROPY_LEN| bytes of |entropy|, XORed with
 // up to |CTR_DRBG_ENTROPY_LEN| bytes of optional |additional_data|, into
 // |*drbg| and resets the reseed counter. It returns one on success and zero
 // if |additional_data_len| is too large or the state update fails.
 int CTR_DRBG_reseed(CTR_DRBG_STATE *drbg,
                     const uint8_t entropy[CTR_DRBG_ENTROPY_LEN],
                     const uint8_t *additional_data,
                     size_t additional_data_len) {
  // Section 10.2.1.4
  uint8_t entropy_copy[CTR_DRBG_ENTROPY_LEN];
  if (additional_data_len > 0) {
    if (additional_data_len > CTR_DRBG_ENTROPY_LEN) {
      return 0;
    }
    memcpy(entropy_copy, entropy, CTR_DRBG_ENTROPY_LEN);
    for (size_t i = 0; i < additional_data_len; i++) {
      entropy_copy[i] ^= additional_data[i];
    }
    entropy = entropy_copy;
  }
  const int updated = ctr_drbg_update(drbg, entropy, CTR_DRBG_ENTROPY_LEN);
  // |entropy_copy| may hold the fresh entropy; wipe it on both the success and
  // the failure path before leaving.
  secure_clean(entropy_copy, sizeof(entropy_copy));
  if (!updated) {
    return 0;
  }
  drbg->reseed_counter = 1;
  return 1;
 }
 // CTR_DRBG_generate writes |out_len| pseudo-random bytes to |out|, optionally
 // mixing |additional_data_len| bytes of |additional_data| into the state
 // before and after generation. It returns one on success and zero if a state
 // update fails (|additional_data_len| larger than |CTR_DRBG_ENTROPY_LEN|).
 int CTR_DRBG_generate(CTR_DRBG_STATE *drbg, uint8_t *out, size_t out_len,
                       const uint8_t *additional_data,
                       size_t additional_data_len) {
  if (additional_data_len != 0 &&
      !ctr_drbg_update(drbg, additional_data, additional_data_len)) {
    return 0;
  }
  // Whole blocks are produced in chunks of at most kChunkSize bytes. The
  // chunking was introduced so that zeroing the buffer and then "encrypting"
  // it touches memory that is still in the L1 cache.
  // NOTE(review): the aes256_ctr_enc* routines appear to overwrite |out|
  // rather than XOR into it, so the memset below may be redundant - confirm
  // before removing it.
  static const size_t kChunkSize = 8 * 1024;
  while (out_len >= AES_BLOCK_SIZE) {
    size_t todo = kChunkSize;
    if (todo > out_len) {
      todo = out_len;
    }
    // Round down to a whole number of blocks.
    todo &= ~(AES_BLOCK_SIZE - 1);
    // At most kChunkSize / AES_BLOCK_SIZE, so the value always fits 32 bits.
    const uint32_t num_blocks = (uint32_t)(todo / AES_BLOCK_SIZE);
    memset(out, 0, todo);
    // The counter is incremented before use; the CTR routine then consumes
    // |num_blocks| consecutive values starting from it.
    ctr32_add(drbg, 1);
 #ifdef VAES512
    aes256_ctr_enc512(out, drbg->counter.bytes, num_blocks, &drbg->ks);
 #elif defined(VAES256)
    aes256_ctr_enc256(out, drbg->counter.bytes, num_blocks, &drbg->ks);
 #else
    aes256_ctr_enc(out, drbg->counter.bytes, num_blocks, &drbg->ks);
 #endif
    // Leave the counter at the last value that was used.
    ctr32_add(drbg, num_blocks - 1);
    out += todo;
    out_len -= todo;
  }
  // Trailing partial block: encrypt one more counter value and copy only the
  // bytes that were requested.
  if (out_len > 0) {
    uint8_t block[AES_BLOCK_SIZE];
    ctr32_add(drbg, 1);
    aes256_enc(block, drbg->counter.bytes, &drbg->ks);
    memcpy(out, block, out_len);
  }
  // Right-padding |additional_data| in step 2.2 is handled implicitly by
  // |ctr_drbg_update|, to save a copy.
  if (!ctr_drbg_update(drbg, additional_data, additional_data_len)) {
    return 0;
  }
  drbg->reseed_counter++;
  return 1;
 }
 // CTR_DRBG_clear wipes the whole of |*drbg| with secure_clean.
 void CTR_DRBG_clear(CTR_DRBG_STATE *drbg) {
  uint8_t *const state_bytes = (uint8_t *)drbg;
  secure_clean(state_bytes, sizeof(*drbg));
 }
--- a/src/common/broadwell/include/aes_ni.h
+++ b/src/common/broadwell/include/aes_ni.h
@@ -0,0 +1,85 @@
 /***************************************************************************
 * Written by Nir Drucker and Shay Gueron
 * AWS Cryptographic Algorithms Group
 * (ndrucker@amazon.com, gueron@amazon.com)
 *
 * Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *  
 * Licensed under the Apache License, Version 2.0 (the "License").
 * You may not use this file except in compliance with the License.
 * A copy of the License is located at
 *  
 *     http://www.apache.org/licenses/LICENSE-2.0
 *  
 * or in the "license" file accompanying this file. This file is distributed 
 * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 
 * express or implied. See the License for the specific language governing 
 * permissions and limitations under the License.
 * The license is detailed in the file LICENSE.txt, and applies to this file.
 * ***************************************************************************/
 #pragma once
 #include <stdint.h>
 #include <wmmintrin.h>
 #include "defs.h"
 // NOTE(review): presumably an upper bound on the number of AES invocations
 // per key; MASK is defined outside this header - confirm its meaning.
 #define MAX_AES_INVOKATION (MASK(32))
 // AES-256 key length, in bytes and in bits.
 #define AES256_KEY_SIZE (32ULL)
 #define AES256_KEY_BITS (AES256_KEY_SIZE * 8)
 // AES block length in bytes.
 #define AES_BLOCK_SIZE (16ULL)
 // Number of AES-256 rounds; a key schedule holds AES256_ROUNDS + 1 round keys.
 #define AES256_ROUNDS (14ULL)
 // Bytes written per iteration by the wide CTR routines: two blocks with
 // VAES256, four with VAES512. VAES256 takes precedence if both are defined.
 #ifdef VAES256
 #define PAR_AES_BLOCK_SIZE (AES_BLOCK_SIZE*2)
 #elif defined(VAES512)
 #define PAR_AES_BLOCK_SIZE (AES_BLOCK_SIZE*4)
 #endif
 // Raw (unexpanded) AES-256 key bytes.
 typedef ALIGN(16) struct aes256_key_s {
    uint8_t raw[AES256_KEY_SIZE];
 } aes256_key_t;
 // Expanded AES-256 key schedule: one 128-bit round key per entry,
 // AES256_ROUNDS + 1 entries in total.
 typedef ALIGN(16) struct aes256_ks_s {
    __m128i keys[AES256_ROUNDS + 1];
 } aes256_ks_t;
 // Expands the raw key |key| into the round-key schedule |ks|.
 // The ks parameter must be 16-byte aligned!
 EXTERNC void aes256_key_expansion(OUT aes256_ks_t *ks,
                                   IN const aes256_key_t *key);
 // Encrypt one 128-bit block: ct[15:0] = E(pt[15:0], ks).
 // |ct| and |pt| each point to 16 bytes.
 void aes256_enc(OUT uint8_t *ct,
                 IN const uint8_t *pt,
                 IN const aes256_ks_t *ks);
 // Produce num_blocks 128-bit blocks of CTR keystream; |pt| is the 16-byte
 // initial counter block and "+ k" adds k to its last four bytes, treated as
 // a big-endian 32-bit number:
 // ct[15:0] = E(pt[15:0],ks)
 // ct[31:16] = E(pt[15:0] + 1,ks)
 // ...
 // ct[16*num_blocks - 1:16*(num_blocks-1)] = E(pt[15:0] + num_blocks - 1,ks)
 void aes256_ctr_enc(OUT uint8_t *ct,
                     IN const uint8_t *pt,
                     IN const uint32_t num_blocks,
                     IN const aes256_ks_t *ks);
 // Produce num_blocks 128-bit blocks of CTR keystream using VAES (AVX-2), two
 // blocks at a time; |ctr| is the 16-byte initial counter block and "+ k" adds
 // k to its last four bytes, treated as a big-endian 32-bit number:
 // ct[15:0] = E(ctr[15:0],ks)
 // ct[31:16] = E(ctr[15:0] + 1,ks)
 // ...
 // ct[16*num_blocks - 1:16*(num_blocks-1)] = E(ctr[15:0] + num_blocks - 1,ks)
 // Only defined when VAES256 is set.
 void aes256_ctr_enc256(OUT uint8_t *ct,
                        IN const uint8_t *ctr,
                        IN const uint32_t num_blocks,
                        IN const aes256_ks_t *ks);
 // Produce num_blocks 128-bit blocks of CTR keystream using VAES (AVX512), four
 // blocks at a time; |ctr| is the 16-byte initial counter block and "+ k" adds
 // k to its last four bytes, treated as a big-endian 32-bit number:
 // ct[15:0] = E(ctr[15:0],ks)
 // ct[31:16] = E(ctr[15:0] + 1,ks)
 // ...
 // ct[16*num_blocks - 1:16*(num_blocks-1)] = E(ctr[15:0] + num_blocks - 1,ks)
 // Only defined when VAES512 is set.
 void aes256_ctr_enc512(OUT uint8_t *ct,
                        IN const uint8_t *ctr,
                        IN const uint32_t num_blocks,
                        IN const aes256_ks_t *ks);
--- a/src/common/broadwell/include/ctr_drbg.h
+++ b/src/common/broadwell/include/ctr_drbg.h
@@ -0,0 +1,78 @@
 /* Copyright (c) 2017, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
 /***************************************************************************
 * Small modification by Nir Drucker and Shay Gueron
 * AWS Cryptographic Algorithms Group
 * (ndrucker@amazon.com, gueron@amazon.com)
 * include:
 * 1) Use memcpy/memset instead of OPENSSL_memcpy/memset
 * 2) Include aes.h as the underlying aes code
 * 3) Modifying the drbg structure
 * ***************************************************************************/
 #pragma once
 #if defined(__cplusplus)
 extern "C" {
 #endif
 #include "aes_ni.h"
 // CTR_DRBG_STATE contains the state of a CTR_DRBG based on AES-256. See SP
 // 800-90Ar1.
 typedef struct {
  // Expanded AES-256 key schedule used by the generator.
  aes256_ks_t ks;
  // 16-byte counter block, addressable either bytewise or as four 32-bit words.
  union {
    uint8_t bytes[16];
    uint32_t words[4];
  } counter;
  // NOTE(review): presumably counts generate calls since the last (re)seed, as
  // in SP 800-90Ar1 -- confirm against the implementation file.
  uint64_t reseed_counter;
 } CTR_DRBG_STATE;
 // See SP 800-90Ar1, table 3.
 #define CTR_DRBG_ENTROPY_LEN 48
 // CTR_DRBG_init initialises |*drbg| given |CTR_DRBG_ENTROPY_LEN| bytes of
 // entropy in |entropy| and, optionally, a personalization string up to
 // |CTR_DRBG_ENTROPY_LEN| bytes in length. It returns one on success and zero
 // on error.
 int CTR_DRBG_init(CTR_DRBG_STATE *drbg,
                  const uint8_t entropy[CTR_DRBG_ENTROPY_LEN],
                  const uint8_t *personalization,
                  size_t personalization_len);
 // CTR_DRBG_reseed reseeds |drbg| given |CTR_DRBG_ENTROPY_LEN| bytes of entropy
 // in |entropy| and, optionally, up to |CTR_DRBG_ENTROPY_LEN| bytes of
 // additional data. It returns one on success or zero on error.
 int CTR_DRBG_reseed(CTR_DRBG_STATE *drbg,
                    const uint8_t entropy[CTR_DRBG_ENTROPY_LEN],
                    const uint8_t *additional_data,
                    size_t additional_data_len);
 // CTR_DRBG_generate processes up to |CTR_DRBG_ENTROPY_LEN| bytes of additional
 // data (if any) and then writes |out_len| random bytes to |out|. It returns one on success or
 // zero on error.
 int CTR_DRBG_generate(CTR_DRBG_STATE *drbg, uint8_t *out,
                      size_t out_len,
                      const uint8_t *additional_data,
                      size_t additional_data_len);
 // CTR_DRBG_clear zeroises the state of |drbg|.
 void CTR_DRBG_clear(CTR_DRBG_STATE *drbg);
 #if defined(__cplusplus)
 }  // extern C
 #endif
--- a/src/common/broadwell/include/defs.h
+++ b/src/common/broadwell/include/defs.h
@@ -0,0 +1,63 @@
 /***************************************************************************
 * Written by Nir Drucker and Shay Gueron
 * AWS Cryptographic Algorithms Group
 * (ndrucker@amazon.com, gueron@amazon.com)
 *
 * Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *  
 * Licensed under the Apache License, Version 2.0 (the "License").
 * You may not use this file except in compliance with the License.
 * A copy of the License is located at
 *  
 *     http://www.apache.org/licenses/LICENSE-2.0
 *  
 * or in the "license" file accompanying this file. This file is distributed 
 * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 
 * express or implied. See the License for the specific language governing 
 * permissions and limitations under the License.
 * The license is detailed in the file LICENSE.txt, and applies to this file.
 * ***************************************************************************/
 #pragma once
 #include <string.h>
 #ifdef __cplusplus
  #define EXTERNC extern "C"
 #else
  #define EXTERNC
 #endif
 // For code clarity.
 #define IN
 #define OUT
 #define ALIGN(n) __attribute__((aligned(n)))
 #define _INLINE_ static inline
 // Status codes shared by the helpers in this header.
 typedef enum
 {
  SUCCESS=0,
  ERROR=1
 } status_t;
 // Macro forms carrying the same values as the enumerators above.
 #define SUCCESS 0
 #define ERROR 1
 // Evaluates |func| once and returns ERROR from the enclosing function unless
 // it yields SUCCESS. The argument is parenthesised so that an expression whose
 // operators bind looser than != (for example a & b) is compared as a whole.
 #define GUARD(func) {if(SUCCESS != (func)) {return ERROR;}}
 #if defined(__GNUC__) && __GNUC__ >= 2
 // Returns |x| with its four bytes in reversed order, via the compiler builtin.
 static inline uint32_t CRYPTO_bswap4(uint32_t x) {
  const uint32_t swapped = __builtin_bswap32(x);
  return swapped;
 }
 #endif
 // Overwrites the |len| bytes starting at |p| with zeros.
 _INLINE_ void secure_clean(OUT uint8_t *p, IN const uint32_t len)
 {
 #ifdef _WIN32
    SecureZeroMemory(p, len);
 #else
    // memset is reached through a volatile function pointer; this is intended
    // to keep the compiler from treating the wipe as a removable dead store.
    typedef void *(*wipe_fn_t)(void *, int, size_t);
    static volatile wipe_fn_t wipe = memset;
    wipe(p, 0, len);
 #endif
 }
--- a/src/common/broadwell/randombytes_ctrdrbg_aesni.c
+++ b/src/common/broadwell/randombytes_ctrdrbg_aesni.c
@@ -0,0 +1,87 @@
 // SPDX-License-Identifier: Apache-2.0 and Unknown
 //
 /*
 NIST-developed software is provided by NIST as a public service. You may use,
 copy, and distribute copies of the software in any medium, provided that you
 keep intact this entire notice. You may improve, modify, and create derivative
 works of the software or any portion of the software, and you may copy and
 distribute such modifications or works. Modified works should carry a notice
 stating that you changed the software and should note the date and nature of any
 such change. Please explicitly acknowledge the National Institute of Standards
 and Technology as the source of the software.
 NIST-developed software is expressly provided "AS IS." NIST MAKES NO WARRANTY OF
 ANY KIND, EXPRESS, IMPLIED, IN FACT, OR ARISING BY OPERATION OF LAW, INCLUDING,
 WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A
 PARTICULAR PURPOSE, NON-INFRINGEMENT, AND DATA ACCURACY. NIST NEITHER REPRESENTS
 NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE UNINTERRUPTED OR
 ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST DOES NOT WARRANT OR MAKE
 ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE OR THE RESULTS THEREOF,
 INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY, RELIABILITY, OR
 USEFULNESS OF THE SOFTWARE.
 You are solely responsible for determining the appropriateness of using and
 distributing the software and you assume all risks associated with its use,
 including but not limited to the risks and costs of program errors, compliance
 with applicable laws, damage to or loss of data, programs or equipment, and the
 unavailability or interruption of operation. This software is not intended to be
 used in any situation where a failure could cause risk of injury or damage to
 property. The software developed by NIST employees is not subject to copyright
 protection within the United States.
 */
 #include <string.h>
 #include <rng.h>
 #include "ctr_drbg.h"
 #ifdef ENABLE_CT_TESTING
 #include <valgrind/memcheck.h>
 #endif
 #define RNG_SUCCESS 0
 #define RNG_BAD_MAXLEN -1
 #define RNG_BAD_OUTBUF -2
 #define RNG_BAD_REQ_LEN -3
 CTR_DRBG_STATE drbg;
 // Seeds the module-level DRBG from |entropy_input|. When a personalization
 // string is supplied it is passed on with a length of CTR_DRBG_ENTROPY_LEN
 // bytes; otherwise a length of zero is used. The result of CTR_DRBG_init is
 // not reported to the caller.
 #ifndef CTRDRBG_TEST_BENCH
 static
 #endif
 void
 randombytes_init_aes_ni(unsigned char *entropy_input,
                         unsigned char *personalization_string,
                         int security_strength) {
  size_t pers_len = 0;
  (void)security_strength; // fixed to 256
  if (personalization_string != NULL) {
    pers_len = CTR_DRBG_ENTROPY_LEN;
  }
  CTR_DRBG_init(&drbg, entropy_input, personalization_string, pers_len);
 }
 #ifndef CTRDRBG_TEST_BENCH
 static
 #endif
 int
 randombytes_aes_ni(unsigned char *x, size_t xlen) {
  CTR_DRBG_generate(&drbg, x, xlen, NULL, 0);
  return RNG_SUCCESS;
 }
 #ifdef RANDOMBYTES_AES_NI
 // Public entry point: writes |nbytes| random bytes to |random_array| using the
 // AES-NI CTR_DRBG backend and returns that backend's status code.
 SQISIGN_API
 int randombytes(unsigned char *random_array, unsigned long long nbytes) {
  int ret = randombytes_aes_ni(random_array, nbytes);
 #ifdef ENABLE_CT_TESTING
  // Mark the whole output buffer as undefined so that memcheck flags any
  // secret-dependent use of it. The length must be the byte count: the status
  // code is zero on success and would mark nothing.
  VALGRIND_MAKE_MEM_UNDEFINED(random_array, nbytes);
 #endif
  return ret;
 }
 // Public entry point: forwards all three arguments unchanged to
 // randombytes_init_aes_ni, which seeds the module-level DRBG.
 SQISIGN_API
 void randombytes_init(unsigned char *entropy_input,
                       unsigned char *personalization_string,
                       int security_strength) {
  randombytes_init_aes_ni(entropy_input, personalization_string,
                          security_strength);
 }
 #endif
--- a/src/common/broadwell/vaes256_key_expansion.S
+++ b/src/common/broadwell/vaes256_key_expansion.S
@@ -0,0 +1,122 @@
 #***************************************************************************
 # This implementation is a modified version of the code,
 # written by Nir Drucker and Shay Gueron
 # AWS Cryptographic Algorithms Group
 # (ndrucker@amazon.com, gueron@amazon.com)
 #
 # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 #  
 # Licensed under the Apache License, Version 2.0 (the "License").
 # You may not use this file except in compliance with the License.
 # A copy of the License is located at
 #  
 #     http://www.apache.org/licenses/LICENSE-2.0
 #  
 # or in the "license" file accompanying this file. This file is distributed 
 # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 
 # express or implied. See the License for the specific language governing 
 # permissions and limitations under the License.
 # The license is detailed in the file LICENSE.txt, and applies to this file.
 #***************************************************************************
 .intel_syntax noprefix
 .data
 .p2align 4, 0x90
 MASK1:
 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
 CON1:
 .long 1,1,1,1
 .set k256_size, 32
 #if defined(__linux__) && defined(__ELF__)
 .section .note.GNU-stack,"",@progbits
 #endif
 .text
 ################################################################################
 # void aes256_key_expansion(OUT aes256_ks_t* ks, IN const uint8_t* key);
 # The output parameter must be 16 bytes aligned!
 #
 #Linux ABI
 #define out rdi
 #define in  rsi
 #define CON      xmm0
 #define MASK_REG xmm1
 #define IN0      xmm2
 #define IN1      xmm3
 #define TMP1     xmm4
 #define TMP2     xmm5
 #define ZERO     xmm15
 # ROUND1 in0 in1
 # Advances the output pointer by k256_size (32 bytes), updates in0 in place to
 # the next round key and stores it at the new [out]. Also doubles every dword
 # of CON, and clobbers TMP1, TMP2 and the flags (through the add).
 .macro ROUND1 in0 in1
    add         out,   k256_size
    # MASK1 selects bytes 13,14,15,12 of in1 into every dword lane.
    vpshufb     TMP2,  \in1, MASK_REG
    # Last AES round on the broadcast word, with CON as the round-key operand.
    aesenclast  TMP2,  CON
    vpslld      CON,   CON,  1
    # Three byte-shift/xor steps: each dword of in0 becomes the xor of itself
    # and all lower dwords.
    vpslldq     TMP1,  \in0, 4
    vpxor       \in0,  \in0, TMP1
    vpslldq     TMP1,  TMP1, 4
    vpxor       \in0,  \in0, TMP1
    vpslldq     TMP1,  TMP1, 4
    vpxor       \in0,  \in0, TMP1
    vpxor       \in0,  \in0, TMP2
    vmovdqa     [out], \in0
 .endm
 # ROUND2
 # Updates IN1 in place to the next round key, derived from the top dword of
 # IN0, and stores it at [out+16]. Uses only vector instructions and a store;
 # the caller relies on the flags surviving this macro. Clobbers TMP1 and TMP2.
 .macro ROUND2
   # Broadcast dword 3 of IN0 to all lanes.
   vpshufd     TMP2,     IN0,  0xff
   # Last AES round with an all-zero round-key operand.
   aesenclast  TMP2,     ZERO
   # Three byte-shift/xor steps: each dword of IN1 becomes the xor of itself
   # and all lower dwords.
   vpslldq     TMP1,     IN1,  4
   vpxor       IN1,      IN1,  TMP1
   vpslldq     TMP1,     TMP1, 4
   vpxor       IN1,      IN1,  TMP1
   vpslldq     TMP1,     TMP1, 4
   vpxor       IN1,      IN1, TMP1
   vpxor       IN1,      IN1, TMP2
   vmovdqa     [out+16], IN1
 .endm
 #ifdef __APPLE__
 #define AES256_KEY_EXPANSION _aes256_key_expansion
 #else
 #define AES256_KEY_EXPANSION aes256_key_expansion
 #endif
 #ifndef __APPLE__
 .type   AES256_KEY_EXPANSION,@function
 .hidden AES256_KEY_EXPANSION
 #endif
 .globl  AES256_KEY_EXPANSION
 # Expands the 32-byte key at [in] into 15 round keys written to [out].
 # The key is read with unaligned loads; every store to the schedule is an
 # aligned one, hence the 16-byte alignment requirement on the output.
 AES256_KEY_EXPANSION:
   # Round keys 0 and 1 are the two halves of the raw key.
   vmovdqu IN0,      [in]
   vmovdqu IN1,      [in+16]
   vmovdqa [out],    IN0
   vmovdqa [out+16], IN1
   vmovdqa CON,      [rip+CON1]
   vmovdqa MASK_REG, [rip+MASK1]
   vpxor   ZERO, ZERO, ZERO
   # Six iterations, each producing two round keys (keys 2 through 13).
   mov     ax, 6
 .loop256:
   ROUND1  IN0, IN1
   # The dec must follow ROUND1 (its add rewrites the flags) and its zero flag
   # is consumed by the jne after ROUND2, which leaves the flags untouched.
   dec     ax
   ROUND2
   jne     .loop256
   # Final round key (key 14).
   ROUND1  IN0, IN1
   ret
 #ifndef __APPLE__
 .size AES256_KEY_EXPANSION, .-AES256_KEY_EXPANSION
 #endif
--- a/src/common/generic/CMakeLists.txt
+++ b/src/common/generic/CMakeLists.txt
@@ -1,26 +1,15 @@
-set(SOURCE_FILES_COMMON_SYS 
+set(SOURCE_FILES_COMMON_GENERIC
    randombytes_system.c 
    aes_c.c 
    fips202.c 
    mem.c
    tools.c
 )
-add_library(sqisign_common_sys ${SOURCE_FILES_COMMON_SYS})
+foreach (SQISIGN_COMMON_TARGET sqisign_common_test sqisign_common_sys)
-target_include_directories(sqisign_common_sys PRIVATE include ../../include)
+    add_library(${SQISIGN_COMMON_TARGET} STATIC ${SOURCE_FILES_COMMON_GENERIC})
-target_compile_options(sqisign_common_sys PUBLIC ${C_OPT_FLAGS})
+    target_include_directories(${SQISIGN_COMMON_TARGET} PRIVATE include ${INC_PUBLIC})
-
+    target_compile_options(${SQISIGN_COMMON_TARGET} PUBLIC ${C_OPT_FLAGS})
-set(SOURCE_FILES_COMMON_TEST 
+    if (ENABLE_CT_TESTING)
-    randombytes_ctrdrbg.c 
+        target_compile_definitions(${SQISIGN_COMMON_TARGET} PUBLIC ENABLE_CT_TESTING)
-    aes_c.c 
+    endif()
-    fips202.c 
+endforeach()
    mem.c
 )
 add_library(sqisign_common_test ${SOURCE_FILES_COMMON_TEST})
 target_include_directories(sqisign_common_test PRIVATE include ../include)
 target_compile_options(sqisign_common_test PUBLIC ${C_OPT_FLAGS})
 if (ENABLE_CT_TESTING)
    target_compile_definitions(sqisign_common_sys PUBLIC ENABLE_CT_TESTING)
    target_compile_definitions(sqisign_common_test PUBLIC ENABLE_CT_TESTING)
 endif()
--- a/src/common/generic/fips202.c
+++ b/src/common/generic/fips202.c
@@ -13,167 +13,7 @@
 #include <stdlib.h>
 #include <string.h>
-#include <stddef.h>
+#include "fips202.h"
 #include <stdint.h>
 #define SHAKE128_RATE 168
 #define SHAKE256_RATE 136
 #define SHA3_256_RATE 136
 #define SHA3_384_RATE 104
 #define SHA3_512_RATE 72
 #define PQC_SHAKEINCCTX_BYTES (sizeof(uint64_t)*26)
 #define PQC_SHAKECTX_BYTES (sizeof(uint64_t)*25)
 // Context for incremental API
 typedef struct {
    uint64_t *ctx;
 } shake128incctx;
 // Context for non-incremental API
 typedef struct {
    uint64_t *ctx;
 } shake128ctx;
 // Context for incremental API
 typedef struct {
    uint64_t *ctx;
 } shake256incctx;
 // Context for non-incremental API
 typedef struct {
    uint64_t *ctx;
 } shake256ctx;
 // Context for incremental API
 typedef struct {
    uint64_t *ctx;
 } sha3_256incctx;
 // Context for incremental API
 typedef struct {
    uint64_t *ctx;
 } sha3_384incctx;
 // Context for incremental API
 typedef struct {
    uint64_t *ctx;
 } sha3_512incctx;
 /* Initialize the state and absorb the provided input.
 *
 * This function does not support being called multiple times
 * with the same state.
 */
 void shake128_absorb(shake128ctx *state, const uint8_t *input, size_t inlen);
 /* Squeeze output out of the sponge.
 *
 * Supports being called multiple times
 */
 void shake128_squeezeblocks(uint8_t *output, size_t nblocks, shake128ctx *state);
 /* Free the state */
 void shake128_ctx_release(shake128ctx *state);
 /* Copy the state. */
 void shake128_ctx_clone(shake128ctx *dest, const shake128ctx *src);
 /* Initialize incremental hashing API */
 void shake128_inc_init(shake128incctx *state);
 /* Absorb more information into the XOF.
 *
 * Can be called multiple times.
 */
 void shake128_inc_absorb(shake128incctx *state, const uint8_t *input, size_t inlen);
 /* Finalize the XOF for squeezing */
 void shake128_inc_finalize(shake128incctx *state);
 /* Squeeze output out of the sponge.
 *
 * Supports being called multiple times
 */
 void shake128_inc_squeeze(uint8_t *output, size_t outlen, shake128incctx *state);
 /* Copy the context of the SHAKE128 XOF */
 void shake128_inc_ctx_clone(shake128incctx *dest, const shake128incctx *src);
 /* Free the context of the SHAKE128 XOF */
 void shake128_inc_ctx_release(shake128incctx *state);
 /* Initialize the state and absorb the provided input.
 *
 * This function does not support being called multiple times
 * with the same state.
 */
 void shake256_absorb(shake256ctx *state, const uint8_t *input, size_t inlen);
 /* Squeeze output out of the sponge.
 *
 * Supports being called multiple times
 */
 void shake256_squeezeblocks(uint8_t *output, size_t nblocks, shake256ctx *state);
 /* Free the context held by this XOF */
 void shake256_ctx_release(shake256ctx *state);
 /* Copy the context held by this XOF */
 void shake256_ctx_clone(shake256ctx *dest, const shake256ctx *src);
 /* Initialize incremental hashing API */
 void shake256_inc_init(shake256incctx *state);
 void shake256_inc_absorb(shake256incctx *state, const uint8_t *input, size_t inlen);
 /* Prepares for squeeze phase */
 void shake256_inc_finalize(shake256incctx *state);
 /* Squeeze output out of the sponge.
 *
 * Supports being called multiple times
 */
 void shake256_inc_squeeze(uint8_t *output, size_t outlen, shake256incctx *state);
 /* Copy the state */
 void shake256_inc_ctx_clone(shake256incctx *dest, const shake256incctx *src);
 /* Free the state */
 void shake256_inc_ctx_release(shake256incctx *state);
 /* One-stop SHAKE128 call */
 void shake128(uint8_t *output, size_t outlen,
              const uint8_t *input, size_t inlen);
 /* One-stop SHAKE256 call */
 void shake256(uint8_t *output, size_t outlen,
              const uint8_t *input, size_t inlen);
 /* Initialize the incremental hashing state */
 void sha3_256_inc_init(sha3_256incctx *state);
 /* Absorb blocks into SHA3 */
 void sha3_256_inc_absorb(sha3_256incctx *state, const uint8_t *input, size_t inlen);
 /* Obtain the output of the function and free `state` */
 void sha3_256_inc_finalize(uint8_t *output, sha3_256incctx *state);
 /* Copy the context */
 void sha3_256_inc_ctx_clone(sha3_256incctx *dest, const sha3_256incctx *src);
 /* Release the state, don't use if `_finalize` has been used */
 void sha3_256_inc_ctx_release(sha3_256incctx *state);
 void sha3_256(uint8_t *output, const uint8_t *input, size_t inlen);
 /* Initialize the incremental hashing state */
 void sha3_384_inc_init(sha3_384incctx *state);
 /* Absorb blocks into SHA3 */
 void sha3_384_inc_absorb(sha3_384incctx *state, const uint8_t *input, size_t inlen);
 /* Obtain the output of the function and free `state` */
 void sha3_384_inc_finalize(uint8_t *output, sha3_384incctx *state);
 /* Copy the context */
 void sha3_384_inc_ctx_clone(sha3_384incctx *dest, const sha3_384incctx *src);
 /* Release the state, don't use if `_finalize` has been used */
 void sha3_384_inc_ctx_release(sha3_384incctx *state);
 /* One-stop SHA3-384 shop */
 void sha3_384(uint8_t *output, const uint8_t *input, size_t inlen);
 /* Initialize the incremental hashing state */
 void sha3_512_inc_init(sha3_512incctx *state);
 /* Absorb blocks into SHA3 */
 void sha3_512_inc_absorb(sha3_512incctx *state, const uint8_t *input, size_t inlen);
 /* Obtain the output of the function and free `state` */
 void sha3_512_inc_finalize(uint8_t *output, sha3_512incctx *state);
 /* Copy the context */
 void sha3_512_inc_ctx_clone(sha3_512incctx *dest, const sha3_512incctx *src);
 /* Release the state, don't use if `_finalize` has been used */
 void sha3_512_inc_ctx_release(sha3_512incctx *state);
 /* One-stop SHA3-512 shop */
 void sha3_512(uint8_t *output, const uint8_t *input, size_t inlen);
 #define NROUNDS 24
 #define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64 - (offset))))
@@ -686,10 +526,6 @@ static void keccak_inc_squeeze(uint8_t *h, size_t outlen,
 }
 void shake128_inc_init(shake128incctx *state) {
    state->ctx = malloc(PQC_SHAKEINCCTX_BYTES);
    if (state->ctx == NULL) {
        exit(111);
    }
    keccak_inc_init(state->ctx);
 }
@@ -706,22 +542,14 @@ void shake128_inc_squeeze(uint8_t *output, size_t outlen, shake128incctx *state)
 }
 void shake128_inc_ctx_clone(shake128incctx *dest, const shake128incctx *src) {
    dest->ctx = malloc(PQC_SHAKEINCCTX_BYTES);
    if (dest->ctx == NULL) {
        exit(111);
    }
    memcpy(dest->ctx, src->ctx, PQC_SHAKEINCCTX_BYTES);
 }
 void shake128_inc_ctx_release(shake128incctx *state) {
-    free(state->ctx);
+    (void)state;
 }
 void shake256_inc_init(shake256incctx *state) {
    state->ctx = malloc(PQC_SHAKEINCCTX_BYTES);
    if (state->ctx == NULL) {
        exit(111);
    }
    keccak_inc_init(state->ctx);
 }
@@ -738,15 +566,11 @@ void shake256_inc_squeeze(uint8_t *output, size_t outlen, shake256incctx *state)
 }
 void shake256_inc_ctx_clone(shake256incctx *dest, const shake256incctx *src) {
    dest->ctx = malloc(PQC_SHAKEINCCTX_BYTES);
    if (dest->ctx == NULL) {
        exit(111);
    }
    memcpy(dest->ctx, src->ctx, PQC_SHAKEINCCTX_BYTES);
 }
 void shake256_inc_ctx_release(shake256incctx *state) {
-    free(state->ctx);
+    (void)state;
 }
@@ -762,10 +586,6 @@ void shake256_inc_ctx_release(shake256incctx *state) {
 *              - size_t inlen: length of input in bytes
 **************************************************/
 void shake128_absorb(shake128ctx *state, const uint8_t *input, size_t inlen) {
    state->ctx = malloc(PQC_SHAKECTX_BYTES);
    if (state->ctx == NULL) {
        exit(111);
    }
    keccak_absorb(state->ctx, SHAKE128_RATE, input, inlen, 0x1F);
 }
@@ -786,16 +606,12 @@ void shake128_squeezeblocks(uint8_t *output, size_t nblocks, shake128ctx *state)
 }
 void shake128_ctx_clone(shake128ctx *dest, const shake128ctx *src) {
    dest->ctx = malloc(PQC_SHAKECTX_BYTES);
    if (dest->ctx == NULL) {
        exit(111);
    }
    memcpy(dest->ctx, src->ctx, PQC_SHAKECTX_BYTES);
 }
 /** Release the allocated state. Call only once. */
 void shake128_ctx_release(shake128ctx *state) {
-    free(state->ctx);
+    (void)state;
 }
 /*************************************************
@@ -810,10 +626,6 @@ void shake128_ctx_release(shake128ctx *state) {
 *              - size_t inlen: length of input in bytes
 **************************************************/
 void shake256_absorb(shake256ctx *state, const uint8_t *input, size_t inlen) {
    state->ctx = malloc(PQC_SHAKECTX_BYTES);
    if (state->ctx == NULL) {
        exit(111);
    }
    keccak_absorb(state->ctx, SHAKE256_RATE, input, inlen, 0x1F);
 }
@@ -834,16 +646,12 @@ void shake256_squeezeblocks(uint8_t *output, size_t nblocks, shake256ctx *state)
 }
 void shake256_ctx_clone(shake256ctx *dest, const shake256ctx *src) {
    dest->ctx = malloc(PQC_SHAKECTX_BYTES);
    if (dest->ctx == NULL) {
        exit(111);
    }
    memcpy(dest->ctx, src->ctx, PQC_SHAKECTX_BYTES);
 }
 /** Release the allocated state. Call only once. */
 void shake256_ctx_release(shake256ctx *state) {
-    free(state->ctx);
+    (void)state;
 }
 /*************************************************
@@ -909,23 +717,15 @@ void shake256(uint8_t *output, size_t outlen,
 }
 void sha3_256_inc_init(sha3_256incctx *state) {
    state->ctx = malloc(PQC_SHAKEINCCTX_BYTES);
    if (state->ctx == NULL) {
        exit(111);
    }
    keccak_inc_init(state->ctx);
 }
 void sha3_256_inc_ctx_clone(sha3_256incctx *dest, const sha3_256incctx *src) {
    dest->ctx = malloc(PQC_SHAKEINCCTX_BYTES);
    if (dest->ctx == NULL) {
        exit(111);
    }
    memcpy(dest->ctx, src->ctx, PQC_SHAKEINCCTX_BYTES);
 }
 void sha3_256_inc_ctx_release(sha3_256incctx *state) {
-    free(state->ctx);
+    (void)state;
 }
 void sha3_256_inc_absorb(sha3_256incctx *state, const uint8_t *input, size_t inlen) {
@@ -970,18 +770,10 @@ void sha3_256(uint8_t *output, const uint8_t *input, size_t inlen) {
 }
 void sha3_384_inc_init(sha3_384incctx *state) {
    state->ctx = malloc(PQC_SHAKEINCCTX_BYTES);
    if (state->ctx == NULL) {
        exit(111);
    }
    keccak_inc_init(state->ctx);
 }
 void sha3_384_inc_ctx_clone(sha3_384incctx *dest, const sha3_384incctx *src) {
    dest->ctx = malloc(PQC_SHAKEINCCTX_BYTES);
    if (dest->ctx == NULL) {
        exit(111);
    }
    memcpy(dest->ctx, src->ctx, PQC_SHAKEINCCTX_BYTES);
 }
@@ -990,7 +782,7 @@ void sha3_384_inc_absorb(sha3_384incctx *state, const uint8_t *input, size_t inl
 }
 void sha3_384_inc_ctx_release(sha3_384incctx *state) {
-    free(state->ctx);
+    (void)state;
 }
 void sha3_384_inc_finalize(uint8_t *output, sha3_384incctx *state) {
@@ -1031,18 +823,10 @@ void sha3_384(uint8_t *output, const uint8_t *input, size_t inlen) {
 }
 void sha3_512_inc_init(sha3_512incctx *state) {
    state->ctx = malloc(PQC_SHAKEINCCTX_BYTES);
    if (state->ctx == NULL) {
        exit(111);
    }
    keccak_inc_init(state->ctx);
 }
 void sha3_512_inc_ctx_clone(sha3_512incctx *dest, const sha3_512incctx *src) {
    dest->ctx = malloc(PQC_SHAKEINCCTX_BYTES);
    if (dest->ctx == NULL) {
        exit(111);
    }
    memcpy(dest->ctx, src->ctx, PQC_SHAKEINCCTX_BYTES);
 }
@@ -1051,7 +835,7 @@ void sha3_512_inc_absorb(sha3_512incctx *state, const uint8_t *input, size_t inl
 }
 void sha3_512_inc_ctx_release(sha3_512incctx *state) {
-    free(state->ctx);
+    (void)state;
 }
 void sha3_512_inc_finalize(uint8_t *output, sha3_512incctx *state) {
@@ -1090,13 +874,3 @@ void sha3_512(uint8_t *output, const uint8_t *input, size_t inlen) {
        output[i] = t[i];
    }
 }
 int SHAKE128(unsigned char *output, size_t outputByteLen, const unsigned char *input, size_t inputByteLen) {
    shake128(output, outputByteLen, input, inputByteLen);
    return 0;
 }
 int SHAKE256(unsigned char *output, size_t outputByteLen, const unsigned char *input, size_t inputByteLen) {
    shake256(output, outputByteLen, input, inputByteLen);
    return 0;
 }
--- a/src/common/generic/include/aes.h
+++ b/src/common/generic/include/aes.h
@@ -1,23 +0,0 @@
 // SPDX-License-Identifier: Apache-2.0
 #ifndef AES_H
 #define AES_H
 #include <stddef.h>
 #include <stdint.h>
 void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output);
 #define AES_ECB_encrypt AES_256_ECB
 #ifdef ENABLE_AESNI
 int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen,
                   const unsigned char *input, size_t inputByteLen);
 int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen,
                      const unsigned char *input, size_t inputByteLen);
 #define AES_128_CTR AES_128_CTR_NI
 #else
 int AES_128_CTR(unsigned char *output, size_t outputByteLen,
                const unsigned char *input, size_t inputByteLen);
 #endif
 #endif
--- a/src/common/generic/include/bench.h
+++ b/src/common/generic/include/bench.h
@@ -1,63 +1,126 @@
 // SPDX-License-Identifier: Apache-2.0
 #ifndef BENCH_H__
 #define BENCH_H__
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
 #include <inttypes.h>
 #if defined(TARGET_OS_UNIX) && (defined(TARGET_ARM) || defined(TARGET_ARM64) || defined(TARGET_OTHER))
 #include <time.h>
 #if defined(__APPLE__)
 #include "bench_macos.h"
 #endif
-#if (defined(TARGET_ARM) || defined(TARGET_ARM64) || defined(TARGET_S390X) || defined(TARGET_OTHER))
+
-#define print_bench_unit printf("nsec\n");
+#if defined(TARGET_ARM) || defined(TARGET_S390X) || defined(NO_CYCLE_COUNTER)
 #define BENCH_UNIT0 "nanoseconds"
 #define BENCH_UNIT3 "microseconds"
 #define BENCH_UNIT6 "milliseconds"
 #define BENCH_UNIT9 "seconds"
 #else
-#define print_bench_unit printf("cycles\n");
+#define BENCH_UNIT0 "cycles"
 #define BENCH_UNIT3 "kilocycles"
 #define BENCH_UNIT6 "megacycles"
 #define BENCH_UNIT9 "gigacycles"
 #endif
-#if (defined(TARGET_ARM) || defined(TARGET_ARM64) || defined(TARGET_S390X))
+static inline void
-#define BENCH_UNITS "nsec"
+cpucycles_init(void) {
-#else
+#if defined(__APPLE__) && defined(TARGET_ARM64)
-#define BENCH_UNITS "cycles"
+    macos_init_rdtsc();
 #endif
 }
-static inline int64_t cpucycles(void) {
+static inline uint64_t
-#if (defined(TARGET_AMD64) || defined(TARGET_X86))
+cpucycles(void)
-    unsigned int hi, lo;
+{
 #if defined(TARGET_AMD64) || defined(TARGET_X86)
    uint32_t hi, lo;
-    asm volatile ("rdtsc" : "=a" (lo), "=d"(hi));
+    asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
-    return ((int64_t) lo) | (((int64_t) hi) << 32);
+    return ((uint64_t)lo) | ((uint64_t)hi << 32);
-#elif (defined(TARGET_S390X))
+#elif defined(TARGET_S390X)
    uint64_t tod;
-    asm volatile("stckf %0\n" : "=Q" (tod) : : "cc");
+    asm volatile("stckf %0\n" : "=Q"(tod) : : "cc");
    return (tod * 1000 / 4096);
 #elif defined(TARGET_ARM64) && !defined(NO_CYCLE_COUNTER)
 #if defined(__APPLE__)
    return macos_rdtsc();
 #else
    uint64_t cycles;
    asm volatile("mrs %0, PMCCNTR_EL0" : "=r"(cycles));
    return cycles;
 #endif // __APPLE__
 #else
    struct timespec time;
    clock_gettime(CLOCK_REALTIME, &time);
-    return (int64_t)(time.tv_sec * 1e9 + time.tv_nsec);
+    return (uint64_t)time.tv_sec * 1000000000 + time.tv_nsec;
 #endif
 }
-static inline int cmpfunc (const void *a, const void *b) {
+static inline int
-    return ( *(uint64_t *)a - * (uint64_t *)b );
+CMPFUNC(const void *a, const void *b)
 {
    uint64_t aa = *(uint64_t *)a, bb = *(uint64_t *)b;
    if (aa > bb)
        return +1;
    if (aa < bb)
        return -1;
    return 0;
 }
-#define BENCH_CODE_1(r) \
+static inline uint32_t
 ISQRT(uint64_t x)
 {
    uint32_t r = 0;
    for (ssize_t i = 31; i >= 0; --i) {
        uint32_t s = r + (1 << i);
        if ((uint64_t)s * s <= x)
            r = s;
    }
    return r;
 }
 // Scales x down by a factor of one million: the first division by 1000 is an
 // integer division (dropping the three lowest decimal digits), the second is
 // done in floating point so three fractional digits remain.
 static inline double
 _TRUNC(uint64_t x)
 {
    const uint64_t thousands = x / 1000;
    return thousands / 1000.;
 }
 #define _FMT ".3lf"
 #define _UNIT BENCH_UNIT6
 #define BENCH_CODE_1(RUNS)                                                                         \
    {                                                                                              \
        const size_t count = (RUNS);                                                               \
        if (!count)                                                                                \
            abort();                                                                               \
        uint64_t cycles, cycles1, cycles2;                                                         \
        uint64_t cycles_list[count];                                                               \
        cycles = 0;                                                                                \
-    for (i = 0; i < (r); ++i) { \
+        for (size_t i = 0; i < count; ++i) {                                                       \
            cycles1 = cpucycles();
-#define BENCH_CODE_2(name, csv) \
+#define BENCH_CODE_2(name)                                                                         \
    cycles2 = cpucycles();                                                                         \
-        if(i < LIST_SIZE) \
+    cycles_list[i] = cycles2 - cycles1;                                                            \
-          cycles_list[i] = (cycles2 - cycles1);\
+    cycles += cycles2 - cycles1;                                                                   \
        cycles = cycles + (cycles2 - cycles1); \
    }                                                                                              \
-    qsort(cycles_list, (runs < LIST_SIZE)? runs : LIST_SIZE, sizeof(uint64_t), cmpfunc);\
+    qsort(cycles_list, count, sizeof(uint64_t), CMPFUNC);                                          \
-    if (csv) \
+    uint64_t variance = 0;                                                                         \
-      printf("%2" PRId64 ",", cycles_list[(runs < LIST_SIZE)? runs/2 : LIST_SIZE/2]); \
+    for (size_t i = 0; i < count; ++i) {                                                           \
-    else { \
+        int64_t off = cycles_list[i] - cycles / count;                                             \
-      printf("  %-20s-> median: %2" PRId64 ", average: %2" PRId64 " ", name, \
+        variance += off * off;                                                                     \
-      cycles_list[(runs < LIST_SIZE)? runs/2 : LIST_SIZE/2], (cycles / runs)); \
+    }                                                                                              \
-      printf("%s\n", BENCH_UNITS); \
+    variance /= count;                                                                             \
    printf("  %-10s", name);                                                                       \
    printf(" | average %9" _FMT " | stddev %9" _FMT,                                               \
           _TRUNC(cycles / count),                                                                 \
           _TRUNC(ISQRT(variance)));                                                               \
    printf(" | median %9" _FMT " | min %9" _FMT " | max %9" _FMT,                                  \
           _TRUNC(cycles_list[count / 2]),                                                         \
           _TRUNC(cycles_list[0]),                                                                 \
           _TRUNC(cycles_list[count - 1]));                                                        \
    printf("  (%s)\n", _UNIT);                                                                     \
    }
 #endif
--- a/src/common/generic/include/bench_macos.h
+++ b/src/common/generic/include/bench_macos.h
@@ -0,0 +1,143 @@
 // WARNING: must be run as root on an M1 device
 // WARNING: fragile, uses private apple APIs
 // currently no command line interface, see variables at top of main
 /*
 no warranty; use at your own risk - i believe this code needs
 some minor changes to work on some later hardware and/or software revisions,
 which is unsurprising given the use of undocumented, private APIs.
 ------------------------------------------------------------------------------
 This code is available under 2 licenses -- choose whichever you prefer.
 ------------------------------------------------------------------------------
 ALTERNATIVE A - MIT License
 Copyright (c) 2020 Dougall Johnson
 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
 the Software without restriction, including without limitation the rights to
 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 of the Software, and to permit persons to whom the Software is furnished to do
 so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 ------------------------------------------------------------------------------
 ALTERNATIVE B - Public Domain (www.unlicense.org)
 This is free and unencumbered software released into the public domain.
 Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
 software, either in source code form or as a compiled binary, for any purpose,
 commercial or non-commercial, and by any means.
 In jurisdictions that recognize copyright laws, the author or authors of this
 software dedicate any and all copyright interest in the software to the public
 domain. We make this dedication for the benefit of the public at large and to
 the detriment of our heirs and successors. We intend this dedication to be an
 overt act of relinquishment in perpetuity of all present and future rights to
 this software under copyright law.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ------------------------------------------------------------------------------
 */
 /*
  Based on https://github.com/travisdowns/robsize
  Henry Wong <henry@stuffedcow.net>
  http://blog.stuffedcow.net/2013/05/measuring-rob-capacity/
  2014-10-14
 */
 #include <dlfcn.h>
 #include <pthread.h>
 #include <stdio.h>
 #include <stdlib.h>
 /*
  * X-macro table of the kperf entry points this file uses. Each row is
  * F(return type, symbol name, parameter types...). F is (re)defined at each
  * expansion site, so the same table drives both the declarations below and the
  * dlsym() lookups in macos_init_rdtsc().
  */
 #define KPERF_LIST                                                                                 \
    /*  ret, name, params */                                                                       \
    F(int, kpc_force_all_ctrs_set, int)                                                            \
    F(int, kpc_set_counting, uint32_t)                                                             \
    F(int, kpc_set_thread_counting, uint32_t)                                                      \
    F(int, kpc_set_config, uint32_t, void *)                                                       \
    F(int, kpc_get_thread_counters, int, unsigned int, void *)
 /*
  * First expansion: for every row, declare a function type `<name>proc` and a
  * file-static function pointer `<name>` of that type. The pointers have static
  * storage and are therefore null until macos_init_rdtsc() assigns them.
  */
 #define F(ret, name, ...)                                                                          \
    typedef ret name##proc(__VA_ARGS__);                                                           \
    static name##proc *name;
 KPERF_LIST
 #undef F
 #define CFGWORD_EL0A64EN_MASK (0x20000)
 #define CPMU_CORE_CYCLE 0x02
 #define KPC_CLASS_FIXED (0)
 #define KPC_CLASS_CONFIGURABLE (1)
 #define COUNTERS_COUNT 10
 #define KPC_MASK ((1u << KPC_CLASS_CONFIGURABLE) | (1u << KPC_CLASS_FIXED))
 static uint64_t g_config[COUNTERS_COUNT];
 static uint64_t g_counters[COUNTERS_COUNT];
 /*
  * Apply the counter setup through the dynamically resolved kperf entry points,
  * in this order: kpc_force_all_ctrs_set(1), kpc_set_config(KPC_MASK, g_config),
  * kpc_set_counting(KPC_MASK), kpc_set_thread_counting(KPC_MASK).
  *
  * The sequence stops at the first call that returns non-zero; the name of the
  * failing call is printed to stdout and the remaining calls are not made.
  * Nothing is reported to the caller.
  */
 static void
 macos_configure_rdtsc()
 {
    if (kpc_force_all_ctrs_set(1)) {
        printf("kpc_force_all_ctrs_set failed\n");
    } else if (kpc_set_config(KPC_MASK, g_config)) {
        printf("kpc_set_config failed\n");
    } else if (kpc_set_counting(KPC_MASK)) {
        printf("kpc_set_counting failed\n");
    } else if (kpc_set_thread_counting(KPC_MASK)) {
        printf("kpc_set_thread_counting failed\n");
    }
 }
 /*
  * Load the private kperf framework, resolve every entry point listed in
  * KPERF_LIST, then configure the counters.
  *
  * On any failure (framework not loadable, symbol missing) a diagnostic is
  * printed to stdout and the function returns early, leaving the remaining
  * function pointers unset. The dlopen() handle is not closed in this function.
  */
 static void
 macos_init_rdtsc()
 {
    void *kperf =
        dlopen("/System/Library/PrivateFrameworks/kperf.framework/Versions/A/kperf", RTLD_LAZY);
    if (!kperf) {
        printf("kperf = %p\n", kperf);
        return;
    }
    /* Second KPERF_LIST expansion: look up each symbol by its stringified name
     * and store it in the matching file-static function pointer; bail out on
     * the first symbol that cannot be resolved. */
 #define F(ret, name, ...)                                                                          \
    name = (name##proc *)(intptr_t)(dlsym(kperf, #name));                                          \
    if (!name) {                                                                                   \
        printf("%s = %p\n", #name, (void *)(intptr_t)name);                                        \
        return;                                                                                    \
    }
    KPERF_LIST
 #undef F
    /* Only slot 0 of the configuration array is set here; the other entries
     * keep their zero-initialised value. */
    g_config[0] = CPMU_CORE_CYCLE | CFGWORD_EL0A64EN_MASK;
    macos_configure_rdtsc();
 }
 /*
  * Read the per-thread counters into g_counters and return entry 2.
  *
  * If kpc_get_thread_counters() returns non-zero, a message is printed to
  * stdout and 1 is returned instead of a counter value.
  */
 static uint64_t
 macos_rdtsc(void)
 {
    const int status = kpc_get_thread_counters(0, COUNTERS_COUNT, g_counters);

    if (status == 0) {
        return g_counters[2];
    }
    printf("kpc_get_thread_counters failed\n");
    return 1;
 }
--- a/src/common/generic/include/bench_test_arguments.h
+++ b/src/common/generic/include/bench_test_arguments.h
@@ -0,0 +1,32 @@
 // SPDX-License-Identifier: Apache-2.0
 #ifndef BENCH_TEST_ARGUMENTS_H__
 #define BENCH_TEST_ARGUMENTS_H__
 #include <inttypes.h>
 #include <stdio.h>
 #include <stdint.h>
 /*
  * Parse a "--seed=..." command-line argument into `seed`.
  *
  * Two forms are accepted:
  *   --seed=<decimal>                      stores the value in seed[0] only;
  *                                         seed[1..11] are left untouched
  *   --seed={ 0x.., 0x.., ... (12 words) } stores seed[0..11]
  * The second form is exactly what print_seed() emits.
  *
  * Returns 0 when one of the forms matched, 1 otherwise. On a failed match of
  * the 12-word form, a prefix of `seed` may already have been overwritten.
  *
  * The conversions use the <inttypes.h> SCN* macros: the PRI* macros are only
  * specified for the printf family and "%u" expects `unsigned int *`, neither
  * of which is guaranteed to match `uint32_t *`.
  */
 static int parse_seed(const char *arg, uint32_t *seed)
 {
    if (sscanf(arg, "--seed=%" SCNu32, &seed[0]) == 1)
        return 0;
    if (sscanf(arg, "--seed={ "
        "0x%" SCNx32 ", 0x%" SCNx32 ", 0x%" SCNx32 ", 0x%" SCNx32 ", 0x%" SCNx32 ", 0x%" SCNx32 ", "
        "0x%" SCNx32 ", 0x%" SCNx32 ", 0x%" SCNx32 ", 0x%" SCNx32 ", 0x%" SCNx32 ", 0x%" SCNx32 " }",
        &seed[0], &seed[1], &seed[2], &seed[3], &seed[4], &seed[5],
        &seed[6], &seed[7], &seed[8], &seed[9], &seed[10], &seed[11]) == 12)
        return 0;
    return 1;
 }
 /*
  * Print the 12-word seed to stdout as a quoted "--seed={ ... }" argument,
  * i.e. in the form parse_seed() accepts, so a run can be reproduced by
  * pasting the printed text back on the command line.
  *
  * Uses PRIx32 so the conversion matches uint32_t on every platform.
  */
 static void print_seed(const uint32_t *seed)
 {
    printf("Random seed: \"--seed={ ");
    for (int i = 0; i < 12; i++) {
        printf("0x%08" PRIx32 "%s", seed[i], (i < 11) ? ", " : " }\"\n");
    }
 }
 #endif
--- a/src/common/generic/include/fips202.h
+++ b/src/common/generic/include/fips202.h
@@ -4,8 +4,168 @@
 #define FIPS202_H
 #include <stddef.h>
 #include <stdint.h>
-int SHAKE128(unsigned char *output, size_t outputByteLen, const unsigned char *input, size_t inputByteLen);
+#define SHAKE128_RATE 168
-int SHAKE256(unsigned char *output, size_t outputByteLen, const unsigned char *input, size_t inputByteLen);
+#define SHAKE256_RATE 136
 #define SHA3_256_RATE 136
 #define SHA3_384_RATE 104
 #define SHA3_512_RATE 72
 /* Number of 64-bit words in the incremental / non-incremental context arrays
  * declared below, and the same sizes in bytes. The byte sizes are derived
  * from the word counts so the two can never drift apart. */
 #define PQC_SHAKEINCCTX_U64WORDS 26
 #define PQC_SHAKECTX_U64WORDS 25
 #define PQC_SHAKEINCCTX_BYTES (sizeof(uint64_t) * PQC_SHAKEINCCTX_U64WORDS)
 #define PQC_SHAKECTX_BYTES (sizeof(uint64_t) * PQC_SHAKECTX_U64WORDS)
 // Context for incremental API
 typedef struct {
    uint64_t ctx[PQC_SHAKEINCCTX_U64WORDS];
 } shake128incctx;
 // Context for non-incremental API
 typedef struct {
    uint64_t ctx[PQC_SHAKECTX_U64WORDS];
 } shake128ctx;
 // Context for incremental API
 typedef struct {
    uint64_t ctx[PQC_SHAKEINCCTX_U64WORDS];
 } shake256incctx;
 // Context for non-incremental API
 typedef struct {
    uint64_t ctx[PQC_SHAKECTX_U64WORDS];
 } shake256ctx;
 // Context for incremental API
 typedef struct {
    uint64_t ctx[PQC_SHAKEINCCTX_U64WORDS];
 } sha3_256incctx;
 // Context for incremental API
 typedef struct {
    uint64_t ctx[PQC_SHAKEINCCTX_U64WORDS];
 } sha3_384incctx;
 // Context for incremental API
 typedef struct {
    uint64_t ctx[PQC_SHAKEINCCTX_U64WORDS];
 } sha3_512incctx;
 /* Initialize the state and absorb the provided input.
 *
 * This function does not support being called multiple times
 * with the same state.
 */
 void shake128_absorb(shake128ctx *state, const uint8_t *input, size_t inlen);
 /* Squeeze output out of the sponge.
 *
 * Supports being called multiple times
 */
 void shake128_squeezeblocks(uint8_t *output, size_t nblocks, shake128ctx *state);
 /* Free the state */
 void shake128_ctx_release(shake128ctx *state);
 /* Copy the state. */
 void shake128_ctx_clone(shake128ctx *dest, const shake128ctx *src);
 /* Initialize incremental hashing API */
 void shake128_inc_init(shake128incctx *state);
 /* Absorb more information into the XOF.
 *
 * Can be called multiple times.
 */
 void shake128_inc_absorb(shake128incctx *state, const uint8_t *input, size_t inlen);
 /* Finalize the XOF for squeezing */
 void shake128_inc_finalize(shake128incctx *state);
 /* Squeeze output out of the sponge.
 *
 * Supports being called multiple times
 */
 void shake128_inc_squeeze(uint8_t *output, size_t outlen, shake128incctx *state);
 /* Copy the context of the SHAKE128 XOF */
 void shake128_inc_ctx_clone(shake128incctx *dest, const shake128incctx *src);
 /* Free the context of the SHAKE128 XOF */
 void shake128_inc_ctx_release(shake128incctx *state);
 /* Initialize the state and absorb the provided input.
 *
 * This function does not support being called multiple times
 * with the same state.
 */
 void shake256_absorb(shake256ctx *state, const uint8_t *input, size_t inlen);
 /* Squeeze output out of the sponge.
 *
 * Supports being called multiple times
 */
 void shake256_squeezeblocks(uint8_t *output, size_t nblocks, shake256ctx *state);
 /* Free the context held by this XOF */
 void shake256_ctx_release(shake256ctx *state);
 /* Copy the context held by this XOF */
 void shake256_ctx_clone(shake256ctx *dest, const shake256ctx *src);
 /* Initialize incremental hashing API */
 void shake256_inc_init(shake256incctx *state);
 /* Absorb more input into the incremental SHAKE256 state (SHAKE256
  * counterpart of shake128_inc_absorb). */
 void shake256_inc_absorb(shake256incctx *state, const uint8_t *input, size_t inlen);
 /* Prepares for squeeze phase */
 void shake256_inc_finalize(shake256incctx *state);
 /* Squeeze output out of the sponge.
 *
 * Supports being called multiple times
 */
 void shake256_inc_squeeze(uint8_t *output, size_t outlen, shake256incctx *state);
 /* Copy the state */
 void shake256_inc_ctx_clone(shake256incctx *dest, const shake256incctx *src);
 /* Free the state */
 void shake256_inc_ctx_release(shake256incctx *state);
 /* One-stop SHAKE128 call */
 void shake128(uint8_t *output, size_t outlen,
              const uint8_t *input, size_t inlen);
 /* One-stop SHAKE256 call */
 void shake256(uint8_t *output, size_t outlen,
              const uint8_t *input, size_t inlen);
 /* Initialize the incremental hashing state */
 void sha3_256_inc_init(sha3_256incctx *state);
 /* Absorb blocks into SHA3 */
 void sha3_256_inc_absorb(sha3_256incctx *state, const uint8_t *input, size_t inlen);
 /* Obtain the output of the function and free `state` */
 void sha3_256_inc_finalize(uint8_t *output, sha3_256incctx *state);
 /* Copy the context */
 void sha3_256_inc_ctx_clone(sha3_256incctx *dest, const sha3_256incctx *src);
 /* Release the state, don't use if `_finalize` has been used */
 void sha3_256_inc_ctx_release(sha3_256incctx *state);
 /* One-stop SHA3-256 shop */
 void sha3_256(uint8_t *output, const uint8_t *input, size_t inlen);
 /* Initialize the incremental hashing state */
 void sha3_384_inc_init(sha3_384incctx *state);
 /* Absorb blocks into SHA3 */
 void sha3_384_inc_absorb(sha3_384incctx *state, const uint8_t *input, size_t inlen);
 /* Obtain the output of the function and free `state` */
 void sha3_384_inc_finalize(uint8_t *output, sha3_384incctx *state);
 /* Copy the context */
 void sha3_384_inc_ctx_clone(sha3_384incctx *dest, const sha3_384incctx *src);
 /* Release the state, don't use if `_finalize` has been used */
 void sha3_384_inc_ctx_release(sha3_384incctx *state);
 /* One-stop SHA3-384 shop */
 void sha3_384(uint8_t *output, const uint8_t *input, size_t inlen);
 /* Initialize the incremental hashing state */
 void sha3_512_inc_init(sha3_512incctx *state);
 /* Absorb blocks into SHA3 */
 void sha3_512_inc_absorb(sha3_512incctx *state, const uint8_t *input, size_t inlen);
 /* Obtain the output of the function and free `state` */
 void sha3_512_inc_finalize(uint8_t *output, sha3_512incctx *state);
 /* Copy the context */
 void sha3_512_inc_ctx_clone(sha3_512incctx *dest, const sha3_512incctx *src);
 /* Release the state, don't use if `_finalize` has been used */
 void sha3_512_inc_ctx_release(sha3_512incctx *state);
 /* One-stop SHA3-512 shop */
 void sha3_512(uint8_t *output, const uint8_t *input, size_t inlen);
 #endif
--- a/src/common/generic/include/tools.h
+++ b/src/common/generic/include/tools.h
@@ -0,0 +1,49 @@
 #ifndef TOOLS_H
 #define TOOLS_H
 #include <time.h>
 // Debug printing:
 // https://stackoverflow.com/questions/1644868/define-macro-for-debug-printing-in-c
 // DEBUG_PRINT is 1 unless NDEBUG is defined, so debug_print() is always compiled
 // (and type-checked) but only prints in debug builds.
 #ifndef NDEBUG
 #define DEBUG_PRINT 1
 #else
 #define DEBUG_PRINT 0
 #endif
 // __FILE_NAME__ is not provided by every compiler; fall back to a placeholder.
 #ifndef __FILE_NAME__
 #define __FILE_NAME__ "NA"
 #endif
 // No fallbacks for __LINE__ or __func__: __LINE__ is always predefined, and
 // __func__ is a predefined identifier rather than a macro, so an
 // "#ifndef __func__" test is always true and would replace the real function
 // name with a fixed string in every message.
 //
 // debug_print(msg): print `msg` (a C string) as a warning together with the
 // file name, line and enclosing function of the call site. Expands to a single
 // statement. Uses printf, so the including file must have <stdio.h> in scope.
 #define debug_print(fmt)                                                                           \
    do {                                                                                           \
        if (DEBUG_PRINT)                                                                           \
            printf("warning: %s, file %s, line %d, function %s().\n",                              \
                   fmt,                                                                            \
                   __FILE_NAME__,                                                                  \
                   __LINE__,                                                                       \
                   __func__);                                                                      \
    } while (0)
 clock_t tic(void);
 float tac(void);                             /* time in ms since last tic */
 float TAC(const char *str);                  /* same, but prints it with label 'str' */
 float toc(const clock_t t);                  /* time in ms since t */
 float TOC(const clock_t t, const char *str); /* same, but prints it with label 'str' */
 float TOC_clock(const clock_t t, const char *str);
 clock_t dclock(const clock_t t); // return the clock cycle diff between now and t
 float clock_to_time(const clock_t t,
                    const char *str); // convert the number of clock cycles t to time
 float clock_print(const clock_t t, const char *str);
 #endif
--- a/src/common/generic/include/tutil.h
+++ b/src/common/generic/include/tutil.h
@@ -5,24 +5,27 @@
 #include <stdint.h>
 #if defined(__GNUC__) || defined(__clang__)
 #define BSWAP16(i) __builtin_bswap16((i))
 #define BSWAP32(i) __builtin_bswap32((i))
 #define BSWAP64(i) __builtin_bswap64((i))
 #define UNUSED __attribute__((unused))
 #else
-#define BSWAP32(i) ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24))
+#define BSWAP16(i) ((((i) >> 8) & 0xff) | (((i) & 0xff00) << 8))
 /* Portable byte-swap fallbacks for compilers without __builtin_bswap*.
  * BSWAP32 expects a 32-bit unsigned operand. Both macros evaluate their
  * argument more than once, so do not pass expressions with side effects. */
 #define BSWAP32(i)                                                                                 \
    ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24))
 /* Swap each 32-bit half with BSWAP32 and exchange the halves. Each half is
  * truncated to uint32_t before swapping, and the low half is widened to
  * uint64_t before being shifted into the upper position. */
 #define BSWAP64(i)                                                                                 \
    (((uint64_t)(uint32_t)BSWAP32((uint32_t)((uint64_t)(i) >> 32))) |                              \
     ((uint64_t)(uint32_t)BSWAP32((uint32_t)(i)) << 32))
 #define UNUSED
 #endif
 #if defined(RADIX_64)
 #define digit_t uint64_t
 #define sdigit_t int64_t
 #define DIGIT_LEN 8
 #define RADIX 64
 #define LOG2RADIX 6
 #define BSWAP_DIGIT(i) BSWAP64(i)
 #elif defined(RADIX_32)
 #define digit_t uint32_t
 #define sdigit_t int32_t
 #define DIGIT_LEN 4
 #define RADIX 32
 #define LOG2RADIX 5
 #define BSWAP_DIGIT(i) BSWAP32(i)
--- a/src/common/generic/mem.c
+++ b/src/common/generic/mem.c
@@ -1,9 +1,12 @@
 // SPDX-License-Identifier: Apache-2.0
 #include <mem.h>
 #include <string.h>
 #include <stdlib.h>
-void sqisign_secure_free(void *mem, size_t size) {
+void
 sqisign_secure_free(void *mem, size_t size)
 {
    if (mem) {
        typedef void *(*memset_t)(void *, int, size_t);
        static volatile memset_t memset_func = memset;
@@ -11,7 +14,9 @@ void sqisign_secure_free(void *mem, size_t size) {
        free(mem);
    }
 }
-void sqisign_secure_clear(void *mem, size_t size) {
+void
 sqisign_secure_clear(void *mem, size_t size)
 {
    typedef void *(*memset_t)(void *, int, size_t);
    static volatile memset_t memset_func = memset;
    memset_func(mem, 0, size);
--- a/src/common/generic/randombytes_ctrdrbg.c
+++ b/src/common/generic/randombytes_ctrdrbg.c
@@ -1,140 +0,0 @@
 // SPDX-License-Identifier: Apache-2.0 and Unknown
 //
 /*
 NIST-developed software is provided by NIST as a public service. You may use, copy, and distribute copies of the software in any medium, provided that you keep intact this entire notice. You may improve, modify, and create derivative works of the software or any portion of the software, and you may copy and distribute such modifications or works. Modified works should carry a notice stating that you changed the software and should note the date and nature of any such change. Please explicitly acknowledge the National Institute of Standards and Technology as the source of the software.
 NIST-developed software is expressly provided "AS IS." NIST MAKES NO WARRANTY OF ANY KIND, EXPRESS, IMPLIED, IN FACT, OR ARISING BY OPERATION OF LAW, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND DATA ACCURACY. NIST NEITHER REPRESENTS NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST DOES NOT WARRANT OR MAKE ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE OR THE RESULTS THEREOF, INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY, RELIABILITY, OR USEFULNESS OF THE SOFTWARE.
 You are solely responsible for determining the appropriateness of using and distributing the software and you assume all risks associated with its use, including but not limited to the risks and costs of program errors, compliance with applicable laws, damage to or loss of data, programs or equipment, and the unavailability or interruption of operation. This software is not intended to be used in any situation where a failure could cause risk of injury or damage to property. The software developed by NIST employees is not subject to copyright protection within the United States.
 */
 #include <string.h>
 #include <aes.h>
 #ifdef ENABLE_CT_TESTING
 #include <valgrind/memcheck.h>
 #endif
 #define RNG_SUCCESS      0
 #define RNG_BAD_MAXLEN  -1
 #define RNG_BAD_OUTBUF  -2
 #define RNG_BAD_REQ_LEN -3
 /* Argument-order adapter: forwards to AES_ECB_encrypt(ctr, key, buffer) so
  * callers in this file can pass (key, ctr, buffer). */
 static __inline void
 AES256_ECB(unsigned char *key, unsigned char *ctr, unsigned char *buffer)
 {
    AES_ECB_encrypt(ctr, key, buffer);
 }
 /* NOTE(review): AES_XOF_struct is not referenced by any code visible in this
  * file; the field meanings below are read off the names only — confirm before
  * relying on them. */
 typedef struct {
    unsigned char   buffer[16];
    int             buffer_pos;
    unsigned long   length_remaining;
    unsigned char   key[32];
    unsigned char   ctr[16];
 } AES_XOF_struct;
 /* State of the AES-256 CTR DRBG used by randombytes_nist(). */
 typedef struct {
    unsigned char   Key[32];         /* current key; replaced by AES256_CTR_DRBG_Update() */
    unsigned char   V[16];           /* counter block; incremented as a big-endian integer before each AES call */
    int             reseed_counter;  /* set to 1 by randombytes_init_nist(), incremented once per randombytes_nist() call */
 } AES256_CTR_DRBG_struct;
 /* Forward declaration: the definition appears further down in this file but is
  * called by randombytes_init_nist() and randombytes_nist() before that point. */
 void
 AES256_CTR_DRBG_Update(unsigned char *provided_data,
                       unsigned char *Key,
                       unsigned char *V);
 /* Single process-wide DRBG state shared by every randombytes() call. It has
  * external linkage (not `static`) and is accessed without any locking in this
  * file. */
 AES256_CTR_DRBG_struct  DRBG_ctx;
 /*
  * Seed the global DRBG state.
  *
  * entropy_input          48 bytes of seed material (read in full)
  * personalization_string optional; when non-NULL, 48 bytes that are XORed
  *                        into the seed material
  * security_strength      ignored
  *
  * Key and V are reset to all-zero, then updated once with the seed material;
  * reseed_counter is set to 1.
  */
 static void
 randombytes_init_nist(unsigned char *entropy_input,
                       unsigned char *personalization_string,
                       int security_strength) {
    unsigned char   seed_material[48];

    (void)security_strength;  // Unused parameter

    for (size_t idx = 0; idx < sizeof seed_material; idx++) {
        unsigned char mix = 0x00;
        if (personalization_string) {
            mix = personalization_string[idx];
        }
        seed_material[idx] = (unsigned char)(entropy_input[idx] ^ mix);
    }

    memset(DRBG_ctx.V, 0x00, 16);
    memset(DRBG_ctx.Key, 0x00, 32);
    AES256_CTR_DRBG_Update(seed_material, DRBG_ctx.Key, DRBG_ctx.V);
    DRBG_ctx.reseed_counter = 1;
 }
 /*
  * Fill x[0..xlen) with output from the global DRBG.
  *
  * For every 16-byte block (the last one may be partial) V is incremented as a
  * 128-bit big-endian counter and encrypted under the current key. After the
  * output is produced, Key and V are refreshed with AES256_CTR_DRBG_Update()
  * and reseed_counter is incremented. Always returns 0.
  */
 static int
 randombytes_nist(unsigned char *x, size_t xlen) {
    unsigned char   block[16];
    size_t          written = 0;

    while ( xlen > 0 ) {
        // big-endian increment of V: stop at the first byte that does not wrap
        for (int pos = 15; pos >= 0; pos--) {
            if ( ++DRBG_ctx.V[pos] != 0x00 ) {
                break;
            }
        }
        AES256_ECB(DRBG_ctx.Key, DRBG_ctx.V, block);

        const size_t take = ( xlen > 15 ) ? 16 : xlen;
        memcpy(x + written, block, take);
        written += take;
        xlen -= take;
    }
    AES256_CTR_DRBG_Update(NULL, DRBG_ctx.Key, DRBG_ctx.V);
    DRBG_ctx.reseed_counter++;
    return 0;
 }
 /*
  * Derive a new (Key, V) pair in place.
  *
  * Three consecutive counter blocks are encrypted under the current Key to
  * produce 48 bytes; if provided_data is non-NULL its 48 bytes are XORed in.
  * The first 32 bytes become the new Key and the last 16 the new V.
  */
 void
 AES256_CTR_DRBG_Update(unsigned char *provided_data,
                       unsigned char *Key,
                       unsigned char *V) {
    unsigned char   fresh[48];
    unsigned char  *out = fresh;

    for (int blk = 0; blk < 3; blk++, out += 16) {
        // big-endian increment of V: stop at the first byte that does not wrap
        for (int pos = 15; pos >= 0; pos--) {
            if ( ++V[pos] != 0x00 ) {
                break;
            }
        }
        AES256_ECB(Key, V, out);
    }
    if ( provided_data != NULL ) {
        for (int k = 0; k < 48; k++) {
            fresh[k] ^= provided_data[k];
        }
    }
    memcpy(Key, fresh, 32);
    memcpy(V, fresh + 32, 16);
 }
 /*
  * Public entry point: fill random_array[0..nbytes) from the DRBG and return
  * the status of randombytes_nist() (0 on success).
  *
  * Under ENABLE_CT_TESTING the whole output buffer is marked as undefined for
  * Valgrind so that secret-dependent branches and indices are reported. The
  * length passed to Valgrind must be the buffer length (nbytes), not the
  * status code, which is 0 and would mark nothing.
  */
 int randombytes(unsigned char *random_array, unsigned long long nbytes) {
    int ret = randombytes_nist(random_array, (size_t)nbytes);
 #ifdef ENABLE_CT_TESTING
    VALGRIND_MAKE_MEM_UNDEFINED(random_array, nbytes);
 #endif
    return ret;
 }
 /*
  * Public entry point: seed the global DRBG. Thin wrapper around
  * randombytes_init_nist(); see that function for the argument requirements.
  * The call is made as a plain statement because ISO C does not allow
  * `return <expression>;` in a function returning void.
  */
 void
 randombytes_init(unsigned char *entropy_input,
                  unsigned char *personalization_string,
                  int security_strength) {
    randombytes_init_nist(entropy_input, personalization_string, security_strength);
 }
--- a/src/common/generic/randombytes_system.c
+++ b/src/common/generic/randombytes_system.c
@@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 #include <rng.h>
 #ifdef ENABLE_CT_TESTING
 #include <valgrind/memcheck.h>
 #endif
@@ -28,13 +30,13 @@ THE SOFTWARE.
 // *before* randombytes.h is included. Otherwise SYS_getrandom will not be
 // declared.
 #if defined(__linux__) || defined(__GNU__)
-# define _GNU_SOURCE
+#define _GNU_SOURCE
 #endif /* defined(__linux__) || defined(__GNU__) */
 #if defined(_WIN32)
 /* Windows */
-# include <windows.h>
+#include <windows.h>
-# include <wincrypt.h> /* CryptAcquireContext, CryptGenRandom */
+#include <wincrypt.h> /* CryptAcquireContext, CryptGenRandom */
 #endif                /* defined(_WIN32) */
 /* wasi */
@@ -44,7 +46,7 @@ THE SOFTWARE.
 /* kFreeBSD */
 #if defined(__FreeBSD_kernel__) && defined(__GLIBC__)
-# define GNU_KFREEBSD
+#define GNU_KFREEBSD
 #endif
 #if defined(__linux__) || defined(__GNU__) || defined(GNU_KFREEBSD)
@@ -53,96 +55,104 @@ THE SOFTWARE.
 // to the linux headers. We only need RNDGETENTCNT, so we instead inline it.
 // RNDGETENTCNT is originally defined in `include/uapi/linux/random.h` in the
 // linux repo.
-# define RNDGETENTCNT 0x80045200
+#define RNDGETENTCNT 0x80045200
-# include <assert.h>
+#include <assert.h>
-# include <errno.h>
+#include <errno.h>
-# include <fcntl.h>
+#include <fcntl.h>
-# include <poll.h>
+#include <poll.h>
-# include <stdint.h>
+#include <stdint.h>
-# include <stdio.h>
+#include <stdio.h>
-# include <sys/ioctl.h>
+#include <sys/ioctl.h>
-# if (defined(__linux__) || defined(__GNU__)) && defined(__GLIBC__) && ((__GLIBC__ > 2) || (__GLIBC_MINOR__ > 24))
+#if (defined(__linux__) || defined(__GNU__)) && defined(__GLIBC__) &&                              \
-#  define USE_GLIBC
+    ((__GLIBC__ > 2) || (__GLIBC_MINOR__ > 24))
-#  include <sys/random.h>
+#define USE_GLIBC
-# endif /* (defined(__linux__) || defined(__GNU__)) && defined(__GLIBC__) && ((__GLIBC__ > 2) || (__GLIBC_MINOR__ > 24)) */
+#include <sys/random.h>
-# include <sys/stat.h>
+#endif /* (defined(__linux__) || defined(__GNU__)) && defined(__GLIBC__) && ((__GLIBC__ > 2) ||    \
-# include <sys/syscall.h>
+          (__GLIBC_MINOR__ > 24)) */
-# include <sys/types.h>
+#include <sys/stat.h>
-# include <unistd.h>
+#include <sys/syscall.h>
 #include <sys/types.h>
 #include <unistd.h>
 // We need SSIZE_MAX as the maximum read len from /dev/urandom
-# if !defined(SSIZE_MAX)
+#if !defined(SSIZE_MAX)
-#  define SSIZE_MAX (SIZE_MAX / 2 - 1)
+#define SSIZE_MAX (SIZE_MAX / 2 - 1)
-# endif /* defined(SSIZE_MAX) */
+#endif /* !defined(SSIZE_MAX) */
 #endif /* defined(__linux__) || defined(__GNU__) || defined(GNU_KFREEBSD) */
 #if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))
 /* Dragonfly, FreeBSD, NetBSD, OpenBSD (has arc4random) */
-# include <sys/param.h>
+#include <sys/param.h>
-# if defined(BSD)
+#if defined(BSD)
-#  include <stdlib.h>
+#include <stdlib.h>
-# endif
+#endif
 /* GNU/Hurd defines BSD in sys/param.h which causes problems later */
-# if defined(__GNU__)
+#if defined(__GNU__)
-#  undef BSD
+#undef BSD
-# endif
+#endif
 #endif
 #if defined(__EMSCRIPTEN__)
-# include <assert.h>
+#include <assert.h>
-# include <emscripten.h>
+#include <emscripten.h>
-# include <errno.h>
+#include <errno.h>
-# include <stdbool.h>
+#include <stdbool.h>
 #endif /* defined(__EMSCRIPTEN__) */
 #if defined(_WIN32)
-static int randombytes_win32_randombytes(void* buf, size_t n)
+static int
 randombytes_win32_randombytes(void *buf, size_t n)
 {
    HCRYPTPROV ctx;
    BOOL tmp;
    DWORD to_read = 0;
    const size_t MAX_DWORD = 0xFFFFFFFF;
-	tmp = CryptAcquireContext(&ctx, NULL, NULL, PROV_RSA_FULL,
+    tmp = CryptAcquireContext(&ctx, NULL, NULL, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT);
-	                          CRYPT_VERIFYCONTEXT);
+    if (tmp == FALSE)
-	if (tmp == FALSE) return -1;
+        return -1;
    while (n > 0) {
        to_read = (DWORD)(n < MAX_DWORD ? n : MAX_DWORD);
-		tmp = CryptGenRandom(ctx, to_read, (BYTE*) buf);
+        tmp = CryptGenRandom(ctx, to_read, (BYTE *)buf);
-		if (tmp == FALSE) return -1;
+        if (tmp == FALSE)
-		buf = ((char*)buf) + to_read;
+            return -1;
        buf = ((char *)buf) + to_read;
        n -= to_read;
    }
    tmp = CryptReleaseContext(ctx, 0);
-	if (tmp == FALSE) return -1;
+    if (tmp == FALSE)
        return -1;
    return 0;
 }
 #endif /* defined(_WIN32) */
 #if defined(__wasi__)
-static int randombytes_wasi_randombytes(void *buf, size_t n) {
+static int
 randombytes_wasi_randombytes(void *buf, size_t n)
 {
    arc4random_buf(buf, n);
    return 0;
 }
 #endif /* defined(__wasi__) */
 #if (defined(__linux__) || defined(__GNU__)) && (defined(USE_GLIBC) || defined(SYS_getrandom))
-# if defined(USE_GLIBC)
+#if defined(USE_GLIBC)
 // getrandom is declared in glibc.
-# elif defined(SYS_getrandom)
+#elif defined(SYS_getrandom)
-static ssize_t getrandom(void *buf, size_t buflen, unsigned int flags) {
+static ssize_t
 getrandom(void *buf, size_t buflen, unsigned int flags)
 {
    return syscall(SYS_getrandom, buf, buflen, flags);
 }
-# endif
+#endif
-static int randombytes_linux_randombytes_getrandom(void *buf, size_t n)
+static int
 randombytes_linux_randombytes_getrandom(void *buf, size_t n)
 {
    /* I have thought about using a separate PRF, seeded by getrandom, but
     * it turns out that the performance of getrandom is good enough
@@ -156,24 +166,28 @@ static int randombytes_linux_randombytes_getrandom(void *buf, size_t n)
        do {
            ret = getrandom((char *)buf + offset, chunk, 0);
        } while (ret == -1 && errno == EINTR);
-		if (ret < 0) return ret;
+        if (ret < 0)
            return ret;
        offset += ret;
        n -= ret;
    }
    assert(n == 0);
    return 0;
 }
-#endif /* (defined(__linux__) || defined(__GNU__)) && (defined(USE_GLIBC) || defined(SYS_getrandom)) */
+#endif /* (defined(__linux__) || defined(__GNU__)) && (defined(USE_GLIBC) ||                       \
          defined(SYS_getrandom)) */
 #if (defined(__linux__) || defined(GNU_KFREEBSD)) && !defined(SYS_getrandom)
-# if defined(__linux__)
+#if defined(__linux__)
-static int randombytes_linux_read_entropy_ioctl(int device, int *entropy)
+static int
 randombytes_linux_read_entropy_ioctl(int device, int *entropy)
 {
    return ioctl(device, RNDGETENTCNT, entropy);
 }
-static int randombytes_linux_read_entropy_proc(FILE *stream, int *entropy)
+static int
 randombytes_linux_read_entropy_proc(FILE *stream, int *entropy)
 {
    int retcode;
    do {
@@ -186,12 +200,17 @@ static int randombytes_linux_read_entropy_proc(FILE *stream, int *entropy)
    return 0;
 }
-static int randombytes_linux_wait_for_entropy(int device)
+static int
 randombytes_linux_wait_for_entropy(int device)
 {
    /* We will block on /dev/random, because any increase in the OS' entropy
     * level will unblock the request. I use poll here (as does libsodium),
     * because we don't *actually* want to read from the device. */
-	enum { IOCTL, PROC } strategy = IOCTL;
+    enum
    {
        IOCTL,
        PROC
    } strategy = IOCTL;
    const int bits = 128;
    struct pollfd pfd;
    int fd;
@@ -276,10 +295,10 @@ static int randombytes_linux_wait_for_entropy(int device)
    }
    return retcode;
 }
-# endif /* defined(__linux__) */
+#endif /* defined(__linux__) */
-
+static int
-static int randombytes_linux_randombytes_urandom(void *buf, size_t n)
+randombytes_linux_randombytes_urandom(void *buf, size_t n)
 {
    int fd;
    size_t offset = 0, count;
@@ -287,10 +306,12 @@ static int randombytes_linux_randombytes_urandom(void *buf, size_t n)
    do {
        fd = open("/dev/urandom", O_RDONLY);
    } while (fd == -1 && errno == EINTR);
-	if (fd == -1) return -1;
+    if (fd == -1)
-# if defined(__linux__)
+        return -1;
-	if (randombytes_linux_wait_for_entropy(fd) == -1) return -1;
+#if defined(__linux__)
-# endif
+    if (randombytes_linux_wait_for_entropy(fd) == -1)
        return -1;
 #endif
    while (n > 0) {
        count = n <= SSIZE_MAX ? n : SSIZE_MAX;
@@ -298,7 +319,8 @@ static int randombytes_linux_randombytes_urandom(void *buf, size_t n)
        if (tmp == -1 && (errno == EAGAIN || errno == EINTR)) {
            continue;
        }
-		if (tmp == -1) return -1; /* Unrecoverable IO error */
+        if (tmp == -1)
            return -1; /* Unrecoverable IO error */
        offset += tmp;
        n -= tmp;
    }
@@ -308,19 +330,21 @@ static int randombytes_linux_randombytes_urandom(void *buf, size_t n)
 }
 #endif /* (defined(__linux__) || defined(GNU_KFREEBSD)) && !defined(SYS_getrandom) */
 #if defined(BSD)
-static int randombytes_bsd_randombytes(void *buf, size_t n)
+static int
 randombytes_bsd_randombytes(void *buf, size_t n)
 {
    arc4random_buf(buf, n);
    return 0;
 }
 #endif /* defined(BSD) */
 #if defined(__EMSCRIPTEN__)
-static int randombytes_js_randombytes_nodejs(void *buf, size_t n) {
+static int
-	const int ret = EM_ASM_INT({
+randombytes_js_randombytes_nodejs(void *buf, size_t n)
 {
    const int ret = EM_ASM_INT(
        {
            var crypto;
            try {
                crypto = require('crypto');
@@ -333,7 +357,9 @@ static int randombytes_js_randombytes_nodejs(void *buf, size_t n) {
            } catch (error) {
                return -1;
            }
-	}, buf, n);
+        },
        buf,
        n);
    switch (ret) {
        case 0:
            return 0;
@@ -348,22 +374,23 @@ static int randombytes_js_randombytes_nodejs(void *buf, size_t n) {
 }
 #endif /* defined(__EMSCRIPTEN__) */
-
+SQISIGN_API
-static int randombytes_select(void *buf, size_t n)
+int
 randombytes_select(unsigned char *buf, unsigned long long n)
 {
 #if defined(__EMSCRIPTEN__)
    return randombytes_js_randombytes_nodejs(buf, n);
 #elif defined(__linux__) || defined(__GNU__) || defined(GNU_KFREEBSD)
-# if defined(USE_GLIBC)
+#if defined(USE_GLIBC)
    /* Use getrandom system call */
    return randombytes_linux_randombytes_getrandom(buf, n);
-# elif defined(SYS_getrandom)
+#elif defined(SYS_getrandom)
    /* Use getrandom system call */
    return randombytes_linux_randombytes_getrandom(buf, n);
-# else
+#else
    /* When we have enough entropy, we can read from /dev/urandom */
    return randombytes_linux_randombytes_urandom(buf, n);
-# endif
+#endif
 #elif defined(BSD)
    /* Use arc4random system call */
    return randombytes_bsd_randombytes(buf, n);
@@ -374,23 +401,31 @@ static int randombytes_select(void *buf, size_t n)
    /* Use WASI */
    return randombytes_wasi_randombytes(buf, n);
 #else
-# error "randombytes(...) is not supported on this platform"
+#error "randombytes(...) is not supported on this platform"
 #endif
 }
-int randombytes(unsigned char *x, unsigned long long xlen) {
+#ifdef RANDOMBYTES_SYSTEM
 SQISIGN_API
 int
 randombytes(unsigned char *x, unsigned long long xlen)
 {
-    int ret = randombytes_select(x, (size_t) xlen);
+    int ret = randombytes_select(x, (size_t)xlen);
 #ifdef ENABLE_CT_TESTING
    VALGRIND_MAKE_MEM_UNDEFINED(x, xlen);
 #endif
    return ret;
 }
-void randombytes_init(unsigned char *entropy_input,
+SQISIGN_API
 void
 randombytes_init(unsigned char *entropy_input,
                 unsigned char *personalization_string,
-                      int security_strength) {
+                 int security_strength)
-    (void) entropy_input;
+{
-    (void) personalization_string;
+    (void)entropy_input;
-    (void) security_strength;
+    (void)personalization_string;
    (void)security_strength;
 }
 #endif
--- a/src/common/generic/test/bench_ctrdrbg.c
+++ b/src/common/generic/test/bench_ctrdrbg.c
@@ -0,0 +1,57 @@
 #include <stddef.h>
 #include <stdio.h>
 #include <string.h>
 #include "bench.h"
 #define RANDOMBYTES_MAX_LENGTH 131072
 #define STRINGIFY2(x) #x
 #define STRINGIFY(x) STRINGIFY2(x)
 void
 randombytes_init_nist(unsigned char *entropy_input,
                      unsigned char *personalization_string,
                      int security_strength);
 int
 randombytes_nist(unsigned char *x, size_t xlen);
 void
 RANDOMBYTES_INIT_PLATFORM(unsigned char *entropy_input,
                          unsigned char *personalization_string,
                          int security_strength);
 int
 RANDOMBYTES_PLATFORM(unsigned char *x, size_t xlen);
 /* System RNG entry point. The parameter types must match the definition of
  * randombytes_select exactly (unsigned char * / unsigned long long);
  * declaring it with (void *, size_t) gives an incompatible function type,
  * which is undefined behaviour and breaks on targets where size_t is
  * narrower than unsigned long long. */
 int
 randombytes_select(unsigned char *buf, unsigned long long n);
 // Benchmark driver: times three random-byte generators, each filling a
 // RANDOMBYTES_MAX_LENGTH-byte buffer per call:
 //   1. RANDOMBYTES_PLATFORM (name supplied by the build),
 //   2. randombytes_nist,
 //   3. randombytes_select (reported under the label "randombytes_system").
 // argc/argv are unused. Always returns 0.
 int main(int argc, char *argv[]) {
 #ifndef NDEBUG
    // Assertions are compiled in: warn (in red, on stderr) that timings are skewed.
    fprintf(stderr,
            "\x1b[31mIt looks like SQIsign was compiled with assertions enabled.\n"
            "This will severely impact performance measurements.\x1b[0m\n");
 #endif
  printf("Running AES-CTR-DRBG benchmarks\n");
  // Output buffer shared by all three benchmarks (lives on the stack).
  unsigned char x[RANDOMBYTES_MAX_LENGTH];
  cpucycles_init();
  // NOTE(review): BENCH_CODE_1/BENCH_CODE_2 come from bench.h; presumably
  // BENCH_CODE_1(n) opens a timed loop of n repetitions and BENCH_CODE_2(label)
  // closes it and reports the result -- confirm against bench.h.
  BENCH_CODE_1(1000 * SQISIGN_TEST_REPS);
  RANDOMBYTES_PLATFORM(x, RANDOMBYTES_MAX_LENGTH);
  // STRINGIFY expands the macro first, so the label is the real function name.
  BENCH_CODE_2(STRINGIFY(RANDOMBYTES_PLATFORM));
  // This benchmark uses 1000x fewer repetitions than the other two.
  BENCH_CODE_1(SQISIGN_TEST_REPS);
  randombytes_nist(x, RANDOMBYTES_MAX_LENGTH);
  BENCH_CODE_2("randombytes_nist");
  BENCH_CODE_1(1000 * SQISIGN_TEST_REPS);
  randombytes_select(x, RANDOMBYTES_MAX_LENGTH);
  BENCH_CODE_2("randombytes_system");
  return 0;
 }
--- a/src/common/generic/test/test_ctrdrbg.c
+++ b/src/common/generic/test/test_ctrdrbg.c
@@ -0,0 +1,68 @@
 #include <stddef.h>
 #include <stdio.h>
 #include <string.h>
 #define RANDOMBYTES_MAX_LENGTH 131072
 #define STRINGIFY2(x) #x
 #define STRINGIFY(x) STRINGIFY2(x)
 void
 randombytes_init_nist(unsigned char *entropy_input,
                      unsigned char *personalization_string,
                      int security_strength);
 int
 randombytes_nist(unsigned char *x, size_t xlen);
 void
 RANDOMBYTES_INIT_PLATFORM(unsigned char *entropy_input,
                          unsigned char *personalization_string,
                          int security_strength);
 int
 RANDOMBYTES_PLATFORM(unsigned char *x, size_t xlen);
 /* System RNG entry point. The parameter types must match the definition of
  * randombytes_select exactly (unsigned char * / unsigned long long);
  * declaring it with (void *, size_t) gives an incompatible function type,
  * which is undefined behaviour and breaks on targets where size_t is
  * narrower than unsigned long long. */
 int
 randombytes_select(unsigned char *buf, unsigned long long n);
 // Cross-check test: seeds RANDOMBYTES_INIT_PLATFORM/RANDOMBYTES_PLATFORM and
 // randombytes_init_nist/randombytes_nist with the same 48-byte seed and
 // verifies that both produce identical output.
 // argc/argv are unused. Returns 0 if every comparison matched, 1 otherwise.
 int main(int argc, char *argv[]) {
  // res stays 1 while everything matches; it is cleared on any mismatch.
  int res = 1;

  printf("Running AES-CTR-DRBG unit tests\n");

  unsigned char seed[48];
  // Two RANDOMBYTES_MAX_LENGTH-byte output buffers, both on the stack.
  unsigned char x_nist[RANDOMBYTES_MAX_LENGTH], x_platform[RANDOMBYTES_MAX_LENGTH];

  // Eight seeds: every seed byte is set to the single-bit value 1 << i.
  for (int i = 0; i < 8; i++) {
    for (unsigned j = 0; j < sizeof(seed); j++) {
      seed[j] = 1 << i;
    }

    // Seed both generators identically (no personalization string).
    RANDOMBYTES_INIT_PLATFORM(seed, NULL, 256);
    randombytes_init_nist(seed, NULL, 256);

    // The start value equals the upper bound, so this loop body runs exactly
    // once per seed, with j == RANDOMBYTES_MAX_LENGTH.
    for (int j = RANDOMBYTES_MAX_LENGTH; j <= RANDOMBYTES_MAX_LENGTH; j *= 2) {
      RANDOMBYTES_PLATFORM(x_platform, j);
      randombytes_nist(x_nist, j);

      if (memcmp(x_platform, x_nist, j) != 0) {
        // Report only the first differing byte, then mark the run as failed.
        for (int k = 0; k < j; k++) {
          if (x_platform[k] != x_nist[k]) {
            printf("Test failed for seed = %d, length = %d bytes: mismatch at index %d: %d != %d\n", i, j, k, x_platform[k], x_nist[k]);
            break;
          }
        }
        res = 0;
      }
    }
  }

  if (!res) {
    printf("\nSome tests failed!\n");
  } else {
    printf("\nAll tests passed!\n");
  }
  // Process exit status: 0 on success, 1 on failure.
  return (!res);
 }
--- a/src/common/generic/tools.c
+++ b/src/common/generic/tools.c
@@ -0,0 +1,75 @@
 #include <stdio.h>
 #include <time.h>
 static clock_t global_timer;
 /* Start the file-wide stopwatch: store the current clock() reading in
  * global_timer and return that same reading to the caller. */
 clock_t
 tic(void)
 {
    const clock_t now = clock();
    global_timer = now;
    return now;
 }
 /* Processor time, in milliseconds, elapsed since the reading stored in
  * global_timer. Prints nothing. */
 float
 tac(void)
 {
    const clock_t elapsed = clock() - global_timer;
    return (1000. * (float)elapsed / CLOCKS_PER_SEC);
 }
 /* Like tac(), but in builds without NDEBUG it also prints
  * "<str> [<ms> ms]" (milliseconds truncated to an integer).
  * Returns the elapsed milliseconds since the reading in global_timer. */
 float
 TAC(const char *str)
 {
    const clock_t elapsed = clock() - global_timer;
    const float millis = (1000. * (float)elapsed / CLOCKS_PER_SEC);
 #ifdef NDEBUG
    (void)str; /* label is only used by the debug print */
 #else
    printf("%s [%d ms]\n", str, (int)millis);
 #endif
    return millis;
 }
 /* Processor time, in milliseconds, elapsed since the clock() reading t.
  * Prints nothing. */
 float
 toc(const clock_t t)
 {
    const clock_t elapsed = clock() - t;
    return (1000. * (float)elapsed / CLOCKS_PER_SEC);
 }
 /* Processor time, in milliseconds, elapsed since the clock() reading t.
  * Always prints "<str> [<ms> ms]" (milliseconds truncated to an integer)
  * and returns the untruncated value. */
 float
 TOC(const clock_t t, const char *str)
 {
    const clock_t elapsed = clock() - t;
    const float millis = (1000. * (float)elapsed / CLOCKS_PER_SEC);

    printf("%s [%d ms]\n", str, (int)millis);
    return millis;
 }
 /* Raw clock ticks elapsed since the clock() reading t.
  * Prints "<str> [<ticks>]" and returns the same tick count as a float.
  *
  * clock() is sampled exactly once so the printed and returned values agree,
  * and the tick count is cast to long because clock_t is not guaranteed to
  * be the type that "%ld" expects. */
 float
 TOC_clock(const clock_t t, const char *str)
 {
    const clock_t ticks = clock() - t;

    printf("%s [%ld]\n", str, (long)ticks);
    return (float)ticks;
 }
 /* Raw clock ticks elapsed since the clock() reading t. */
 clock_t
 dclock(const clock_t t)
 {
    const clock_t now = clock();
    return now - t;
 }
 /* Convert a tick count t to milliseconds, print "<str> [<ms> ms]"
  * (milliseconds truncated to an integer) and return the untruncated value. */
 float
 clock_to_time(const clock_t t, const char *str)
 {
    const float millis = (1000. * (float)(t) / CLOCKS_PER_SEC);

    printf("%s [%d ms]\n", str, (int)millis);
    return millis;
 }
 /* Print a raw tick count as "<str> [<ticks>]" and return it as a float.
  *
  * The value is cast to long for printing because clock_t is not guaranteed
  * to be the type that "%ld" expects. */
 float
 clock_print(const clock_t t, const char *str)
 {
    printf("%s [%ld]\n", str, (long)t);
    return (float)(t);
 }
--- a/src/common/ref/CMakeLists.txt
+++ b/src/common/ref/CMakeLists.txt
@@ -0,0 +1,10 @@
 set(SOURCE_FILES_COMMON_TEST_REF
    randombytes_ctrdrbg.c 
    aes_c.c 
 )
 target_sources(sqisign_common_test PRIVATE ${SOURCE_FILES_COMMON_TEST_REF})
 target_include_directories(sqisign_common_test PRIVATE include)
 target_compile_definitions(sqisign_common_test PRIVATE RANDOMBYTES_C)
 target_compile_definitions(sqisign_common_sys PRIVATE RANDOMBYTES_SYSTEM)
--- a/src/common/generic/aes_c.c
+++ b/src/common/generic/aes_c.c
@@ -39,23 +39,24 @@
 #define AESCTR_NONCEBYTES 12
 #define AES_BLOCKBYTES 16
 // We've put these states on the heap to make sure ctx_release is used.
 #define PQC_AES128_STATESIZE 88
-typedef struct {
+typedef struct
-    uint64_t *sk_exp;
+{
    uint64_t sk_exp[PQC_AES128_STATESIZE];
 } aes128ctx;
 #define PQC_AES192_STATESIZE 104
-typedef struct {
+typedef struct
-    uint64_t  *sk_exp;
+{
    uint64_t sk_exp[PQC_AES192_STATESIZE];
 } aes192ctx;
 #define PQC_AES256_STATESIZE 120
-typedef struct {
+typedef struct
-    uint64_t *sk_exp;
+{
    uint64_t sk_exp[PQC_AES256_STATESIZE];
 } aes256ctx;
 /** Initializes the context **/
 void aes128_ecb_keyexp(aes128ctx *r, const unsigned char *key);
@@ -68,7 +69,6 @@ void aes128_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, cons
 /** Frees the context **/
 void aes128_ctx_release(aes128ctx *r);
 /** Initializes the context **/
 void aes192_ecb_keyexp(aes192ctx *r, const unsigned char *key);
@@ -80,7 +80,6 @@ void aes192_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, cons
 void aes192_ctx_release(aes192ctx *r);
 /** Initializes the context **/
 void aes256_ecb_keyexp(aes256ctx *r, const unsigned char *key);
@@ -93,46 +92,50 @@ void aes256_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, cons
 /** Frees the context **/
 void aes256_ctx_release(aes256ctx *r);
-static inline uint32_t br_dec32le(const unsigned char *src) {
+static inline uint32_t
-    return (uint32_t)src[0]
+br_dec32le(const unsigned char *src)
-           | ((uint32_t)src[1] << 8)
+{
-           | ((uint32_t)src[2] << 16)
+    return (uint32_t)src[0] | ((uint32_t)src[1] << 8) | ((uint32_t)src[2] << 16) |
-           | ((uint32_t)src[3] << 24);
+           ((uint32_t)src[3] << 24);
 }
-
+static void
-static void br_range_dec32le(uint32_t *v, size_t num, const unsigned char *src) {
+br_range_dec32le(uint32_t *v, size_t num, const unsigned char *src)
 {
    while (num-- > 0) {
-        *v ++ = br_dec32le(src);
+        *v++ = br_dec32le(src);
        src += 4;
    }
 }
-
+static inline uint32_t
-static inline uint32_t br_swap32(uint32_t x) {
+br_swap32(uint32_t x)
-    x = ((x & (uint32_t)0x00FF00FF) << 8)
+{
-        | ((x >> 8) & (uint32_t)0x00FF00FF);
+    x = ((x & (uint32_t)0x00FF00FF) << 8) | ((x >> 8) & (uint32_t)0x00FF00FF);
    return (x << 16) | (x >> 16);
 }
-
+static inline void
-static inline void br_enc32le(unsigned char *dst, uint32_t x) {
+br_enc32le(unsigned char *dst, uint32_t x)
 {
    dst[0] = (unsigned char)x;
    dst[1] = (unsigned char)(x >> 8);
    dst[2] = (unsigned char)(x >> 16);
    dst[3] = (unsigned char)(x >> 24);
 }
-
+static void
-static void br_range_enc32le(unsigned char *dst, const uint32_t *v, size_t num) {
+br_range_enc32le(unsigned char *dst, const uint32_t *v, size_t num)
 {
    while (num-- > 0) {
-        br_enc32le(dst, *v ++);
+        br_enc32le(dst, *v++);
        dst += 4;
    }
 }
-
+static void
-static void br_aes_ct64_bitslice_Sbox(uint64_t *q) {
+br_aes_ct64_bitslice_Sbox(uint64_t *q)
 {
    /*
     * This S-box implementation is a straightforward translation of
     * the circuit described by Boyar and Peralta in "A new
@@ -306,8 +309,11 @@ static void br_aes_ct64_bitslice_Sbox(uint64_t *q) {
    q[0] = s7;
 }
-static void br_aes_ct64_ortho(uint64_t *q) {
+static void
-#define SWAPN(cl, ch, s, x, y)   do { \
+br_aes_ct64_ortho(uint64_t *q)
 {
 #define SWAPN(cl, ch, s, x, y)                                                                     \
    do {                                                                                           \
        uint64_t a, b;                                                                             \
        a = (x);                                                                                   \
        b = (y);                                                                                   \
@@ -335,8 +341,9 @@ static void br_aes_ct64_ortho(uint64_t *q) {
    SWAP8(q[3], q[7]);
 }
-
+static void
-static void br_aes_ct64_interleave_in(uint64_t *q0, uint64_t *q1, const uint32_t *w) {
+br_aes_ct64_interleave_in(uint64_t *q0, uint64_t *q1, const uint32_t *w)
 {
    uint64_t x0, x1, x2, x3;
    x0 = w[0];
@@ -363,8 +370,9 @@ static void br_aes_ct64_interleave_in(uint64_t *q0, uint64_t *q1, const uint32_t
    *q1 = x1 | (x3 << 8);
 }
-
+static void
-static void br_aes_ct64_interleave_out(uint32_t *w, uint64_t q0, uint64_t q1) {
+br_aes_ct64_interleave_out(uint32_t *w, uint64_t q0, uint64_t q1)
 {
    uint64_t x0, x1, x2, x3;
    x0 = q0 & (uint64_t)0x00FF00FF00FF00FF;
@@ -385,11 +393,11 @@ static void br_aes_ct64_interleave_out(uint32_t *w, uint64_t q0, uint64_t q1) {
    w[3] = (uint32_t)x3 | (uint32_t)(x3 >> 16);
 }
-static const unsigned char Rcon[] = {
+static const unsigned char Rcon[] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36 };
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
 };
-static uint32_t sub_word(uint32_t x) {
+static uint32_t
 sub_word(uint32_t x)
 {
    uint64_t q[8];
    memset(q, 0, sizeof q);
@@ -400,7 +408,9 @@ static uint32_t sub_word(uint32_t x) {
    return (uint32_t)q[0];
 }
-static void br_aes_ct64_keysched(uint64_t *comp_skey, const unsigned char *key, unsigned int key_len) {
+static void
 br_aes_ct64_keysched(uint64_t *comp_skey, const unsigned char *key, unsigned int key_len)
 {
    unsigned int i, j, k, nk, nkf;
    uint32_t tmp;
    uint32_t skey[60];
@@ -410,7 +420,7 @@ static void br_aes_ct64_keysched(uint64_t *comp_skey, const unsigned char *key,
    nkf = ((nrounds + 1) << 2);
    br_range_dec32le(skey, (key_len >> 2), key);
    tmp = skey[(key_len >> 2) - 1];
-    for (i = nk, j = 0, k = 0; i < nkf; i ++) {
+    for (i = nk, j = 0, k = 0; i < nkf; i++) {
        if (j == 0) {
            tmp = (tmp << 24) | (tmp >> 8);
            tmp = sub_word(tmp) ^ Rcon[k];
@@ -419,9 +429,9 @@ static void br_aes_ct64_keysched(uint64_t *comp_skey, const unsigned char *key,
        }
        tmp ^= skey[i - nk];
        skey[i] = tmp;
-        if (++ j == nk) {
+        if (++j == nk) {
            j = 0;
-            k ++;
+            k++;
        }
    }
@@ -437,23 +447,21 @@ static void br_aes_ct64_keysched(uint64_t *comp_skey, const unsigned char *key,
        q[7] = q[4];
        br_aes_ct64_ortho(q);
        comp_skey[j + 0] =
-            (q[0] & (uint64_t)0x1111111111111111)
+            (q[0] & (uint64_t)0x1111111111111111) | (q[1] & (uint64_t)0x2222222222222222) |
-            | (q[1] & (uint64_t)0x2222222222222222)
+            (q[2] & (uint64_t)0x4444444444444444) | (q[3] & (uint64_t)0x8888888888888888);
            | (q[2] & (uint64_t)0x4444444444444444)
            | (q[3] & (uint64_t)0x8888888888888888);
        comp_skey[j + 1] =
-            (q[4] & (uint64_t)0x1111111111111111)
+            (q[4] & (uint64_t)0x1111111111111111) | (q[5] & (uint64_t)0x2222222222222222) |
-            | (q[5] & (uint64_t)0x2222222222222222)
+            (q[6] & (uint64_t)0x4444444444444444) | (q[7] & (uint64_t)0x8888888888888888);
            | (q[6] & (uint64_t)0x4444444444444444)
            | (q[7] & (uint64_t)0x8888888888888888);
    }
 }
-static void br_aes_ct64_skey_expand(uint64_t *skey, const uint64_t *comp_skey, unsigned int nrounds) {
+static void
 br_aes_ct64_skey_expand(uint64_t *skey, const uint64_t *comp_skey, unsigned int nrounds)
 {
    unsigned u, v, n;
    n = (nrounds + 1) << 1;
-    for (u = 0, v = 0; u < n; u ++, v += 4) {
+    for (u = 0, v = 0; u < n; u++, v += 4) {
        uint64_t x0, x1, x2, x3;
        x0 = x1 = x2 = x3 = comp_skey[u];
@@ -471,8 +479,9 @@ static void br_aes_ct64_skey_expand(uint64_t *skey, const uint64_t *comp_skey, u
    }
 }
-
+static inline void
-static inline void add_round_key(uint64_t *q, const uint64_t *sk) {
+add_round_key(uint64_t *q, const uint64_t *sk)
 {
    q[0] ^= sk[0];
    q[1] ^= sk[1];
    q[2] ^= sk[2];
@@ -483,28 +492,32 @@ static inline void add_round_key(uint64_t *q, const uint64_t *sk) {
    q[7] ^= sk[7];
 }
-static inline void shift_rows(uint64_t *q) {
+static inline void
 shift_rows(uint64_t *q)
 {
    int i;
-    for (i = 0; i < 8; i ++) {
+    for (i = 0; i < 8; i++) {
        uint64_t x;
        x = q[i];
-        q[i] = (x & (uint64_t)0x000000000000FFFF)
+        q[i] =
-               | ((x & (uint64_t)0x00000000FFF00000) >> 4)
+            (x & (uint64_t)0x000000000000FFFF) | ((x & (uint64_t)0x00000000FFF00000) >> 4) |
-               | ((x & (uint64_t)0x00000000000F0000) << 12)
+            ((x & (uint64_t)0x00000000000F0000) << 12) | ((x & (uint64_t)0x0000FF0000000000) >> 8) |
-               | ((x & (uint64_t)0x0000FF0000000000) >> 8)
+            ((x & (uint64_t)0x000000FF00000000) << 8) | ((x & (uint64_t)0xF000000000000000) >> 12) |
-               | ((x & (uint64_t)0x000000FF00000000) << 8)
+            ((x & (uint64_t)0x0FFF000000000000) << 4);
               | ((x & (uint64_t)0xF000000000000000) >> 12)
               | ((x & (uint64_t)0x0FFF000000000000) << 4);
    }
 }
-static inline uint64_t rotr32(uint64_t x) {
+static inline uint64_t
 rotr32(uint64_t x)
 {
    return (x << 32) | (x >> 32);
 }
-static inline void mix_columns(uint64_t *q) {
+static inline void
 mix_columns(uint64_t *q)
 {
    uint64_t q0, q1, q2, q3, q4, q5, q6, q7;
    uint64_t r0, r1, r2, r3, r4, r5, r6, r7;
@@ -535,14 +548,19 @@ static inline void mix_columns(uint64_t *q) {
    q[7] = q6 ^ r6 ^ r7 ^ rotr32(q7 ^ r7);
 }
-
+static void
-static void inc4_be(uint32_t *x) {
+inc4_be(uint32_t *x)
 {
    uint32_t t = br_swap32(*x) + 4;
    *x = br_swap32(t);
 }
-
+static void
-static void aes_ecb4x(unsigned char out[64], const uint32_t ivw[16], const uint64_t *sk_exp, unsigned int nrounds) {
+aes_ecb4x(unsigned char out[64],
          const uint32_t ivw[16],
          const uint64_t *sk_exp,
          unsigned int nrounds)
 {
    uint32_t w[16];
    uint64_t q[8];
    unsigned int i;
@@ -553,7 +571,6 @@ static void aes_ecb4x(unsigned char out[64], const uint32_t ivw[16], const uint6
    }
    br_aes_ct64_ortho(q);
    add_round_key(q, sk_exp);
    for (i = 1; i < nrounds; i++) {
        br_aes_ct64_bitslice_Sbox(q);
@@ -566,14 +583,15 @@ static void aes_ecb4x(unsigned char out[64], const uint32_t ivw[16], const uint6
    add_round_key(q, sk_exp + 8 * nrounds);
    br_aes_ct64_ortho(q);
-    for (i = 0; i < 4; i ++) {
+    for (i = 0; i < 4; i++) {
        br_aes_ct64_interleave_out(w + (i << 2), q[i], q[i + 4]);
    }
    br_range_enc32le(out, w, 16);
 }
-
+static void
-static void aes_ctr4x(unsigned char out[64], uint32_t ivw[16], const uint64_t *sk_exp, unsigned int nrounds) {
+aes_ctr4x(unsigned char out[64], uint32_t ivw[16], const uint64_t *sk_exp, unsigned int nrounds)
 {
    aes_ecb4x(out, ivw, sk_exp, nrounds);
    /* Increase counter for next 4 blocks */
@@ -583,8 +601,13 @@ static void aes_ctr4x(unsigned char out[64], uint32_t ivw[16], const uint64_t *s
    inc4_be(ivw + 15);
 }
-
+static void
-static void aes_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const uint64_t *rkeys, unsigned int nrounds) {
+aes_ecb(unsigned char *out,
        const unsigned char *in,
        size_t nblocks,
        const uint64_t *rkeys,
        unsigned int nrounds)
 {
    uint32_t blocks[16];
    unsigned char t[64];
@@ -603,8 +626,13 @@ static void aes_ecb(unsigned char *out, const unsigned char *in, size_t nblocks,
    }
 }
-
+static void
-static void aes_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const uint64_t *rkeys, unsigned int nrounds) {
+aes_ctr(unsigned char *out,
        size_t outlen,
        const unsigned char *iv,
        const uint64_t *rkeys,
        unsigned int nrounds)
 {
    uint32_t ivw[16];
    size_t i;
    uint32_t cc = 0;
@@ -613,8 +641,8 @@ static void aes_ctr(unsigned char *out, size_t outlen, const unsigned char *iv,
    memcpy(ivw + 4, ivw, 3 * sizeof(uint32_t));
    memcpy(ivw + 8, ivw, 3 * sizeof(uint32_t));
    memcpy(ivw + 12, ivw, 3 * sizeof(uint32_t));
-    ivw[ 3] = br_swap32(cc);
+    ivw[3] = br_swap32(cc);
-    ivw[ 7] = br_swap32(cc + 1);
+    ivw[7] = br_swap32(cc + 1);
    ivw[11] = br_swap32(cc + 2);
    ivw[15] = br_swap32(cc + 3);
@@ -632,97 +660,110 @@ static void aes_ctr(unsigned char *out, size_t outlen, const unsigned char *iv,
    }
 }
-void aes128_ecb_keyexp(aes128ctx *r, const unsigned char *key) {
+void
 aes128_ecb_keyexp(aes128ctx *r, const unsigned char *key)
 {
    uint64_t skey[22];
    r->sk_exp = malloc(sizeof(uint64_t) * PQC_AES128_STATESIZE);
    if (r->sk_exp == NULL) {
        exit(111);
    }
    br_aes_ct64_keysched(skey, key, 16);
    br_aes_ct64_skey_expand(r->sk_exp, skey, 10);
 }
-void aes128_ctr_keyexp(aes128ctx *r, const unsigned char *key) {
+void
 aes128_ctr_keyexp(aes128ctx *r, const unsigned char *key)
 {
    aes128_ecb_keyexp(r, key);
 }
-
+void
-void aes192_ecb_keyexp(aes192ctx *r, const unsigned char *key) {
+aes192_ecb_keyexp(aes192ctx *r, const unsigned char *key)
 {
    uint64_t skey[26];
    r->sk_exp = malloc(sizeof(uint64_t) * PQC_AES192_STATESIZE);
    if (r->sk_exp == NULL) {
        exit(111);
    }
    br_aes_ct64_keysched(skey, key, 24);
    br_aes_ct64_skey_expand(r->sk_exp, skey, 12);
 }
-
+void
-void aes192_ctr_keyexp(aes192ctx *r, const unsigned char *key) {
+aes192_ctr_keyexp(aes192ctx *r, const unsigned char *key)
 {
    aes192_ecb_keyexp(r, key);
 }
-
+void
-void aes256_ecb_keyexp(aes256ctx *r, const unsigned char *key) {
+aes256_ecb_keyexp(aes256ctx *r, const unsigned char *key)
 {
    uint64_t skey[30];
    r->sk_exp = malloc(sizeof(uint64_t) * PQC_AES256_STATESIZE);
    if (r->sk_exp == NULL) {
        exit(111);
    }
    br_aes_ct64_keysched(skey, key, 32);
    br_aes_ct64_skey_expand(r->sk_exp, skey, 14);
 }
-
+void
-void aes256_ctr_keyexp(aes256ctx *r, const unsigned char *key) {
+aes256_ctr_keyexp(aes256ctx *r, const unsigned char *key)
 {
    aes256_ecb_keyexp(r, key);
 }
-
+void
-void aes128_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes128ctx *ctx) {
+aes128_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes128ctx *ctx)
 {
    aes_ecb(out, in, nblocks, ctx->sk_exp, 10);
 }
-void aes128_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes128ctx *ctx) {
+void
 aes128_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes128ctx *ctx)
 {
    aes_ctr(out, outlen, iv, ctx->sk_exp, 10);
 }
-void aes192_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes192ctx *ctx) {
+void
 aes192_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes192ctx *ctx)
 {
    aes_ecb(out, in, nblocks, ctx->sk_exp, 12);
 }
-void aes192_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes192ctx *ctx) {
+void
 aes192_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes192ctx *ctx)
 {
    aes_ctr(out, outlen, iv, ctx->sk_exp, 12);
 }
-void aes256_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes256ctx *ctx) {
+void
 aes256_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes256ctx *ctx)
 {
    aes_ecb(out, in, nblocks, ctx->sk_exp, 14);
 }
-void aes256_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes256ctx *ctx) {
+void
 aes256_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes256ctx *ctx)
 {
    aes_ctr(out, outlen, iv, ctx->sk_exp, 14);
 }
-void aes128_ctx_release(aes128ctx *r) {
+void
-    free(r->sk_exp);
+aes128_ctx_release(aes128ctx *r)
 {
 }
-void aes192_ctx_release(aes192ctx *r) {
+void
-    free(r->sk_exp);
+aes192_ctx_release(aes192ctx *r)
 {
 }
-void aes256_ctx_release(aes256ctx *r) {
+void
-    free(r->sk_exp);
+aes256_ctx_release(aes256ctx *r)
 {
 }
-int AES_128_CTR(unsigned char *output, size_t outputByteLen,
+int
-                const unsigned char *input, size_t inputByteLen) {
+AES_128_CTR(unsigned char *output,
            size_t outputByteLen,
            const unsigned char *input,
            size_t inputByteLen)
 {
    aes128ctx ctx;
-    unsigned char iv[16] = { 0 };
+    const unsigned char iv[16] = { 0 };
    aes128_ctr_keyexp(&ctx, input);
    aes128_ctr(output, outputByteLen, iv, &ctx);
@@ -731,7 +772,9 @@ int AES_128_CTR(unsigned char *output, size_t outputByteLen,
    return (int)outputByteLen;
 }
-void AES_256_ECB(const uint8_t *input, const unsigned char *key, unsigned char *output) {
+void
 AES_256_ECB(const uint8_t *input, const unsigned char *key, unsigned char *output)
 {
    aes256ctx ctx;
    aes256_ecb_keyexp(&ctx, key);
--- a/src/common/ref/include/aes.h
+++ b/src/common/ref/include/aes.h
@@ -0,0 +1,29 @@
 // SPDX-License-Identifier: Apache-2.0
 #ifndef AES_H
 #define AES_H
 #include <stddef.h>
 #include <stdint.h>
 /* AES-256 in ECB mode: reads from `input`, uses `key`, writes to `output`.
  * NOTE(review): presumably one 16-byte block and a 32-byte key -- confirm
  * against the implementation. */
 void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output);
 /* Alias used by the CTR-DRBG code. */
 #define AES_ECB_encrypt AES_256_ECB

 #ifdef ENABLE_AESNI
 /* AES-NI builds: AES_128_CTR resolves to AES_128_CTR_NI (see the #define
  * below). NOTE(review): AES_128_CTR_4R_NI looks like a reduced-round
  * variant -- confirm. */
 int AES_128_CTR_NI(unsigned char *output,
                    size_t outputByteLen,
                    const unsigned char *input,
                    size_t inputByteLen);
 int AES_128_CTR_4R_NI(unsigned char *output,
                       size_t outputByteLen,
                       const unsigned char *input,
                       size_t inputByteLen);
 #define AES_128_CTR AES_128_CTR_NI
 #else
 /* Portable build: AES-128-CTR keystream generator. In the C implementation
  * `input` is used as the AES key with an all-zero IV, and the return value
  * is outputByteLen cast to int. */
 int AES_128_CTR(unsigned char *output,
                 size_t outputByteLen,
                 const unsigned char *input,
                 size_t inputByteLen);
 #endif
 #endif
--- a/src/common/ref/randombytes_ctrdrbg.c
+++ b/src/common/ref/randombytes_ctrdrbg.c
@@ -0,0 +1,161 @@
 // SPDX-License-Identifier: Apache-2.0 and Unknown
 //
 /*
 NIST-developed software is provided by NIST as a public service. You may use,
 copy, and distribute copies of the software in any medium, provided that you
 keep intact this entire notice. You may improve, modify, and create derivative
 works of the software or any portion of the software, and you may copy and
 distribute such modifications or works. Modified works should carry a notice
 stating that you changed the software and should note the date and nature of any
 such change. Please explicitly acknowledge the National Institute of Standards
 and Technology as the source of the software.
 NIST-developed software is expressly provided "AS IS." NIST MAKES NO WARRANTY OF
 ANY KIND, EXPRESS, IMPLIED, IN FACT, OR ARISING BY OPERATION OF LAW, INCLUDING,
 WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A
 PARTICULAR PURPOSE, NON-INFRINGEMENT, AND DATA ACCURACY. NIST NEITHER REPRESENTS
 NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE UNINTERRUPTED OR
 ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST DOES NOT WARRANT OR MAKE
 ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE OR THE RESULTS THEREOF,
 INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY, RELIABILITY, OR
 USEFULNESS OF THE SOFTWARE.
 You are solely responsible for determining the appropriateness of using and
 distributing the software and you assume all risks associated with its use,
 including but not limited to the risks and costs of program errors, compliance
 with applicable laws, damage to or loss of data, programs or equipment, and the
 unavailability or interruption of operation. This software is not intended to be
 used in any situation where a failure could cause risk of injury or damage to
 property. The software developed by NIST employees is not subject to copyright
 protection within the United States.
 */
 #include <rng.h>
 #include <string.h>
 #include <aes.h>
 #ifdef ENABLE_CT_TESTING
 #include <valgrind/memcheck.h>
 #endif
 #define RNG_SUCCESS 0
 #define RNG_BAD_MAXLEN -1
 #define RNG_BAD_OUTBUF -2
 #define RNG_BAD_REQ_LEN -3
 /* Adapter that forwards to AES_ECB_encrypt. Note the argument order
  * differs: callers pass (key, ctr, buffer), while AES_ECB_encrypt is
  * called as (ctr, key, buffer). */
 static inline void AES256_ECB(const unsigned char *key,
                               const unsigned char *ctr, unsigned char *buffer) {
  AES_ECB_encrypt(ctr, key, buffer);
 }
 /* Working state of the AES-256 CTR-DRBG. */
 typedef struct {
  unsigned char Key[32]; /* current AES-256 key */
  unsigned char V[16];   /* 16-byte counter, incremented as a big-endian integer */
  int reseed_counter;    /* set to 1 on init, incremented after every generate call */
 } AES256_CTR_DRBG_struct;
 void AES256_CTR_DRBG_Update(const unsigned char *provided_data,
                            unsigned char *Key, unsigned char *V);
 AES256_CTR_DRBG_struct DRBG_ctx;
 /* Seed the global DRBG state (DRBG_ctx).
  *
  * entropy_input:          48 bytes are read and used as seed material.
  * personalization_string: optional; when non-NULL, 48 bytes are read and
  *                         XORed into the seed material.
  * security_strength:      ignored.
  *
  * The function has external linkage only when CTRDRBG_TEST_BENCH is defined;
  * otherwise it is static. */
 #ifndef CTRDRBG_TEST_BENCH
 static
 #endif
    void
    randombytes_init_nist(unsigned char *entropy_input,
                          unsigned char *personalization_string,
                          int security_strength) {
  unsigned char seed_material[48];

  (void)security_strength; // Unused parameter
  memcpy(seed_material, entropy_input, 48);
  if (personalization_string)
    for (int i = 0; i < 48; i++) {
      seed_material[i] ^= personalization_string[i];
    }
  // Start from an all-zero Key and V, then mix in the seed material.
  memset(DRBG_ctx.Key, 0x00, 32);
  memset(DRBG_ctx.V, 0x00, 16);
  AES256_CTR_DRBG_Update(seed_material, DRBG_ctx.Key, DRBG_ctx.V);
  DRBG_ctx.reseed_counter = 1;
 }
 /* Write xlen pseudo-random bytes to x from the global DRBG state (DRBG_ctx).
  *
  * Each 16-byte output block is the AES-256 encryption of the incremented
  * counter V under Key. After the requested bytes are produced, Key and V are
  * refreshed via AES256_CTR_DRBG_Update and reseed_counter is incremented.
  * Always returns 0.
  *
  * The function has external linkage only when CTRDRBG_TEST_BENCH is defined;
  * otherwise it is static. */
 #ifndef CTRDRBG_TEST_BENCH
 static
 #endif
    int
    randombytes_nist(unsigned char *x, size_t xlen) {
  unsigned char block[16];
  size_t i = 0; // number of bytes already written to x

  while (xlen > 0) {
    // increment V
    // (big-endian: start at the last byte and carry towards index 0)
    for (int j = 15; j >= 0; j--) {
      if (DRBG_ctx.V[j] == 0xff) {
        DRBG_ctx.V[j] = 0x00;
      } else {
        DRBG_ctx.V[j]++;
        break;
      }
    }
    AES256_ECB(DRBG_ctx.Key, DRBG_ctx.V, block);
    if (xlen > 15) {
      // full block
      memcpy(x + i, block, 16);
      i += 16;
      xlen -= 16;
    } else {
      // final partial block: copy only the bytes still needed
      memcpy(x + i, block, xlen);
      i += xlen;
      xlen = 0;
    }
  }
  // Refresh Key and V (no additional input) so later output does not reuse
  // the state that produced these bytes.
  AES256_CTR_DRBG_Update(NULL, DRBG_ctx.Key, DRBG_ctx.V);
  DRBG_ctx.reseed_counter++;

  return 0;
 }
 /* Derive a new (Key, V) pair in place.
  *
  * provided_data: optional; when non-NULL, 48 bytes are read and XORed into
  *                the freshly generated material.
  * Key:           32-byte key, read and then overwritten.
  * V:             16-byte counter, incremented and then overwritten. */
 void AES256_CTR_DRBG_Update(const unsigned char *provided_data,
                             unsigned char *Key, unsigned char *V) {
  unsigned char temp[48];

  // Produce 48 bytes: three counter increments, each followed by one
  // AES-256 encryption of V under the current Key.
  for (int i = 0; i < 3; i++) {
    // increment V
    // (big-endian: start at the last byte and carry towards index 0)
    for (int j = 15; j >= 0; j--) {
      if (V[j] == 0xff) {
        V[j] = 0x00;
      } else {
        V[j]++;
        break;
      }
    }

    AES256_ECB(Key, V, temp + 16 * i);
  }
  if (provided_data != NULL)
    for (int i = 0; i < 48; i++) {
      temp[i] ^= provided_data[i];
    }
  // First 32 bytes become the new Key, the last 16 bytes the new V.
  memcpy(Key, temp, 32);
  memcpy(V, temp + 32, 16);
 }
 #ifdef RANDOMBYTES_C
 /* Public RNG entry point for builds with RANDOMBYTES_C: fills random_array
  * with nbytes bytes from the CTR-DRBG (randombytes_nist) and returns its
  * status code, which is 0.
  *
  * With ENABLE_CT_TESTING the generated bytes are marked as undefined for
  * Valgrind so that secret-dependent branches and indices are reported. */
 SQISIGN_API
 int randombytes(unsigned char *random_array, unsigned long long nbytes) {
  int ret = randombytes_nist(random_array, nbytes);
 #ifdef ENABLE_CT_TESTING
  // The length must be the buffer size, not the return code: ret is 0, so
  // passing it would taint zero bytes and silently disable the check.
  VALGRIND_MAKE_MEM_UNDEFINED(random_array, nbytes);
 #endif
  return ret;
 }
 /* Public seeding entry point for builds with RANDOMBYTES_C: forwards all
  * three arguments unchanged to randombytes_init_nist. */
 SQISIGN_API
 void randombytes_init(unsigned char *entropy_input,
                       unsigned char *personalization_string,
                       int security_strength) {
  randombytes_init_nist(entropy_input, personalization_string,
                        security_strength);
 }
 #endif
--- a/src/ec/ref/CMakeLists.txt
+++ b/src/ec/ref/CMakeLists.txt
@@ -1,3 +1,3 @@
-set(ECX_DIR ${CMAKE_CURRENT_SOURCE_DIR}/ecx)
+set(LVLX_DIR ${CMAKE_CURRENT_SOURCE_DIR}/lvlx)
 include(${SELECT_SQISIGN_VARIANT})
--- a/src/ec/ref/ecx/basis.c
+++ b/src/ec/ref/ecx/basis.c
@@ -1,508 +0,0 @@
 #include "isog.h"
 /*
  * x-only point tripling in projective (X:Z) coordinates.
  *
  * Q  - output point.
  * P  - input point (X = P->x, Z = P->z).
  * A3 - curve constant stored as a point: (A3->x : A3->z) = (A+2C : A-2C).
  *
  * P->x and P->z are only read by the first instructions (into t0 and t1) and
  * Q is first written after that, which is why the callers in this file can
  * pass the same point as Q and P.
  */
 static void xTPL(ec_point_t* Q, const ec_point_t* P, const ec_point_t* A3)
 {
    /* ----------------------------------------------------------------------------- *
     * Differential point tripling given the Montgomery coefficient A3 = (A+2C:A-2C)
     * ----------------------------------------------------------------------------- */
    fp2_t t0, t1, t2, t3, t4;
    fp2_sub(&t0, &P->x, &P->z);     // t0 = X - Z
    fp2_sqr(&t2, &t0);              // t2 = (X - Z)^2
    fp2_add(&t1, &P->x, &P->z);     // t1 = X + Z
    fp2_sqr(&t3, &t1);              // t3 = (X + Z)^2
    fp2_add(&t4, &t1, &t0);         // t4 = 2X
    fp2_sub(&t0, &t1, &t0);         // t0 = 2Z
    fp2_sqr(&t1, &t4);              // t1 = 4X^2
    fp2_sub(&t1, &t1, &t3);
    fp2_sub(&t1, &t1, &t2);         // t1 = 4X^2 - (X + Z)^2 - (X - Z)^2
    fp2_mul(&Q->x, &t3, &A3->x);    // Q->x and Q->z are used as scratch from here on
    fp2_mul(&t3, &Q->x, &t3);       // t3 = A3->x * (X + Z)^4
    fp2_mul(&Q->z, &t2, &A3->z);
    fp2_mul(&t2, &t2, &Q->z);       // t2 = A3->z * (X - Z)^4
    fp2_sub(&t3, &t2, &t3);
    fp2_sub(&t2, &Q->x, &Q->z);
    fp2_mul(&t1, &t2, &t1);
    fp2_add(&t2, &t3, &t1);
    fp2_sqr(&t2, &t2);
    fp2_mul(&Q->x, &t2, &t4);       // final X = 2X * (t3 + t1)^2
    fp2_sub(&t1, &t3, &t1);
    fp2_sqr(&t1, &t1);
    fp2_mul(&Q->z, &t1, &t0);       // final Z = 2Z * (t3 - t1)^2
 }
 /*
  * Square test on the right-hand side of the curve equation at P.
  *
  * With (x:z) = P and (A:C) the curve coefficients, evaluates
  *     x * z * (C^2 x^2 + A C x z + C^2 z^2)
  * and returns whatever fp2_is_square reports for that value.
  */
 int ec_is_on_curve(const ec_curve_t* curve, const ec_point_t* P){
    fp2_t cx, cz, cross, rhs;

    fp2_mul(&cx, &curve->C, &P->x);         // C*x
    fp2_mul(&cross, &cx, &P->z);            // C*x*z
    fp2_mul(&cross, &cross, &curve->A);     // A*C*x*z
    fp2_mul(&cz, &curve->C, &P->z);         // C*z
    fp2_sqr(&cx, &cx);                      // C^2*x^2
    fp2_sqr(&cz, &cz);                      // C^2*z^2

    // Sum the three terms of the quadratic, then scale by x*z.
    fp2_add(&rhs, &cx, &cross);
    fp2_add(&rhs, &rhs, &cz);
    fp2_mul(&rhs, &rhs, &P->x);
    fp2_mul(&rhs, &rhs, &P->z);

    return fp2_is_square(&rhs);
 }
 /*
  * Computes an x-only representative of P - Q from the x-coordinates of P and Q.
  *
  * PQ    - output. On return, with xP = P->x, xQ = Q->x, A = curve->A and
  *         t = (xP + xQ)*(xP*xQ + 1) + 2*A*xP*xQ:
  *             PQ->z = (xP - xQ)^2
  *             PQ->x = sqrt(t^2 - (xP - xQ)^2 * (xP*xQ - 1)^2) + t
  * P, Q  - inputs; only P->x and Q->x are read, so both points must already
  *         be scaled to z = 1.
  * curve - only curve->A is read, so the curve must already be scaled to C = 1.
  *
  * NOTE(review): which of the two square roots fp2_sqrt returns decides which
  * of the two candidate points comes out; presumably that is the
  * "deterministic choice" mentioned below - confirm against fp2_sqrt.
  */
 static void difference_point(ec_point_t* PQ, const ec_point_t* P, const ec_point_t* Q, const ec_curve_t* curve){
    // Given P,Q in affine x-only, computes a deterministic choice for (P-Q)
    // The points must be normalized to z=1 and the curve to C=1
    fp2_t t0, t1, t2, t3;
    fp2_sub(&PQ->z, &P->x, &Q->x);  // P - Q
    fp2_mul(&t2, &P->x, &Q->x);     // P*Q
    // t1 = 1 (real part set with fp_mont_setone, imaginary part 0)
    fp_mont_setone(t1.re);
    fp_set(t1.im, 0);
    fp2_sub(&t3, &t2, &t1);         // P*Q-1
    fp2_mul(&t0, &PQ->z, &t3);      // (P-Q)*(P*Q-1)
    fp2_sqr(&PQ->z, &PQ->z);        // (P-Q)^2
    fp2_sqr(&t0, &t0);              // (P-Q)^2*(P*Q-1)^2
    fp2_add(&t1, &t2, &t1);         // P*Q+1
    fp2_add(&t3, &P->x, &Q->x);     // P+Q
    fp2_mul(&t1, &t1, &t3);         // (P+Q)*(P*Q+1)
    fp2_mul(&t2, &t2, &curve->A);   // A*P*Q
    fp2_add(&t2, &t2, &t2);         // 2*A*P*Q
    fp2_add(&t1, &t1, &t2);         // (P+Q)*(P*Q+1) + 2*A*P*Q
    fp2_sqr(&t2, &t1);              // ((P+Q)*(P*Q+1) + 2*A*P*Q)^2
    fp2_sub(&t0, &t2, &t0);         // ((P+Q)*(P*Q+1) + 2*A*P*Q)^2 - (P-Q)^2*(P*Q-1)^2
    fp2_sqrt(&t0);                  // in-place square root
    fp2_add(&PQ->x, &t0, &t1);
 }
 /*
  * Builds two points P, Q on `curve` plus their difference and stores them in
  * PQ2->P, PQ2->Q and PQ2->PmQ, with P and Q scaled to z = 1.
  *
  * Search procedure (identical for P and Q, sharing one candidate counter):
  *   - candidates are x = 1 + k*i, k = 1, 2, ... (x.im grows by x.re each round);
  *   - a candidate is kept only if x*(C^2 x^2 + A*C*x + C^2) is a square;
  *   - it is multiplied by p_cofactor_for_2f, then doubled POWER_OF_2 - 1 times;
  *     a zero result rejects the candidate;
  *   - for Q, the doubled point must additionally differ from the one obtained
  *     from P.
  *
  * PQ2   - output basis.
  * curve - input curve in projective form (A : C); not modified.
  */
 void ec_curve_to_basis_2(ec_basis_t *PQ2, const ec_curve_t *curve){
    fp2_t x, t0, t1, t2;
    ec_point_t P, Q, Q2, P2, A24;
    // Curve coefficient in the form A24 = (A+2C:4C)
    fp2_add(&A24.z, &curve->C, &curve->C);
    fp2_add(&A24.x, &curve->A, &A24.z);
    fp2_add(&A24.z, &A24.z, &A24.z);
    // Candidate counter starts at x = 1 + 0*i and is incremented before each test
    fp_mont_setone(x.re);
    fp_set(x.im, 0);
    // Find P
    while(1){
        fp_add(x.im, x.re, x.im);
        // Check if point is rational
        // t1 = x*(C^2*x^2 + A*C*x + C^2), evaluated Horner-style
        fp2_sqr(&t0, &curve->C);
        fp2_mul(&t1, &t0, &x);
        fp2_mul(&t2, &curve->A, &curve->C);
        fp2_add(&t1, &t1, &t2);
        fp2_mul(&t1, &t1, &x);
        fp2_add(&t1, &t1, &t0);
        fp2_mul(&t1, &t1, &x);
        if(fp2_is_square(&t1)){
            fp2_copy(&P.x, &x);
            fp_mont_setone(P.z.re);
            fp_set(P.z.im, 0);
        }
        else
            continue;
        // Clear odd factors from the order
        xMULv2(&P, &P, p_cofactor_for_2f, P_COFACTOR_FOR_2F_BITLENGTH, &A24);
        // Check if point has order 2^f
        copy_point(&P2, &P);
        for(int i = 0; i < POWER_OF_2 - 1; i++)
            xDBLv2(&P2, &P2, &A24);
        if(ec_is_zero(&P2))
            continue;
        else
            break;
    }
    // Find Q
    // The counter x is not reset, so the search resumes after the x that produced P
    while(1){
        fp_add(x.im, x.re, x.im);
        // Check if point is rational
        fp2_sqr(&t0, &curve->C);
        fp2_mul(&t1, &t0, &x);
        fp2_mul(&t2, &curve->A, &curve->C);
        fp2_add(&t1, &t1, &t2);
        fp2_mul(&t1, &t1, &x);
        fp2_add(&t1, &t1, &t0);
        fp2_mul(&t1, &t1, &x);
        if(fp2_is_square(&t1)){
            fp2_copy(&Q.x, &x);
            fp_mont_setone(Q.z.re);
            fp_set(Q.z.im, 0);
        }
        else
            continue;
        // Clear odd factors from the order
        xMULv2(&Q, &Q, p_cofactor_for_2f, P_COFACTOR_FOR_2F_BITLENGTH, &A24);
        // Check if point has order 2^f
        copy_point(&Q2, &Q);
        for(int i = 0; i < POWER_OF_2 - 1; i++)
            xDBLv2(&Q2, &Q2, &A24);
        if(ec_is_zero(&Q2))
            continue;
        // Check if point is orthogonal to P
        if(is_point_equal(&P2, &Q2))
            continue;
        else
            break;
    }
    // Normalize points
    // One shared inversion: t1 = 1/(zP*zQ*C); afterwards
    //   P.x = xP/zP, Q.x = xQ/zQ, E.A = A/C, and all z / C fields are set to 1
    ec_curve_t E;
    fp2_mul(&t0, &P.z, &Q.z);
    fp2_mul(&t1, &t0, &curve->C);
    fp2_inv(&t1);
    fp2_mul(&P.x, &P.x, &t1);
    fp2_mul(&Q.x, &Q.x, &t1);
    fp2_mul(&E.A, &curve->A, &t1);
    fp2_mul(&P.x, &P.x, &Q.z);
    fp2_mul(&P.x, &P.x, &curve->C);
    fp2_mul(&Q.x, &Q.x, &P.z);
    fp2_mul(&Q.x, &Q.x, &curve->C);
    fp2_mul(&E.A, &E.A, &t0);
    fp_mont_setone(P.z.re);
    fp_set(P.z.im, 0);
    fp2_copy(&Q.z, &P.z);
    fp2_copy(&E.C, &P.z);
    // Compute P-Q
    difference_point(&PQ2->PmQ, &P, &Q, &E);
    copy_point(&PQ2->P, &P);
    copy_point(&PQ2->Q, &Q);
 }
 /*
  * Given a point P on `curve`, searches for a second point Q and stores
  * (P, Q, P - Q) in PQ2->P, PQ2->Q and PQ2->PmQ, with P and Q scaled to z = 1.
  *
  * Q is found exactly as in the other basis routines of this file: candidates
  * x = 1 + k*i (k = 1, 2, ...) are kept when x*(C^2 x^2 + A*C*x + C^2) is a
  * square, multiplied by p_cofactor_for_2f, doubled POWER_OF_2 - 1 times, and
  * rejected if the result is zero or equals the same multiple of P.
  *
  * PQ2   - output basis; PQ2->P holds a z = 1 rescaling of the input P.
  * curve - input curve in projective form (A : C); not modified.
  * P     - input point; not modified (a local copy PP is normalized instead).
  * NOTE(review): P itself is never order-checked here - presumably the caller
  * guarantees it has order 2^f; confirm.
  */
 void ec_complete_basis_2(ec_basis_t* PQ2, const ec_curve_t* curve, const ec_point_t* P){
    fp2_t x, t0, t1, t2;
    ec_point_t Q, Q2, P2, A24;
    // Curve coefficient in the form A24 = (A+2C:4C)
    fp2_add(&A24.z, &curve->C, &curve->C);
    fp2_add(&A24.x, &curve->A, &A24.z);
    fp2_add(&A24.z, &A24.z, &A24.z);
    // Point of order 2 generated by P
    copy_point(&P2, P);
    for(int i = 0; i < POWER_OF_2 - 1; i++)
        xDBLv2(&P2, &P2, &A24);
    // Find Q
    // Candidate counter starts at x = 1 + 0*i and is incremented before each test
    fp_mont_setone(x.re);
    fp_set(x.im, 0);
    while(1){
        fp_add(x.im, x.re, x.im);
        // Check if point is rational
        // t1 = x*(C^2*x^2 + A*C*x + C^2), evaluated Horner-style
        fp2_sqr(&t0, &curve->C);
        fp2_mul(&t1, &t0, &x);
        fp2_mul(&t2, &curve->A, &curve->C);
        fp2_add(&t1, &t1, &t2);
        fp2_mul(&t1, &t1, &x);
        fp2_add(&t1, &t1, &t0);
        fp2_mul(&t1, &t1, &x);
        if(fp2_is_square(&t1)){
            fp2_copy(&Q.x, &x);
            fp_mont_setone(Q.z.re);
            fp_set(Q.z.im, 0);
        }
        else
            continue;
        // Clear odd factors from the order
        xMULv2(&Q, &Q, p_cofactor_for_2f, (int)P_COFACTOR_FOR_2F_BITLENGTH, &A24);
        // Check if point has order 2^f
        copy_point(&Q2, &Q);
        for(int i = 0; i < POWER_OF_2 - 1; i++)
            xDBLv2(&Q2, &Q2, &A24);
        if(ec_is_zero(&Q2))
            continue;
        // Check if point is orthogonal to P
        if(is_point_equal(&P2, &Q2))
            continue;
        else
            break;
    }
    // Normalize points
    // One shared inversion: t1 = 1/(zP*zQ*C); afterwards
    //   PP.x = xP/zP, Q.x = xQ/zQ, E.A = A/C, and all z / C fields are set to 1
    ec_curve_t E;
    ec_point_t PP;
    fp2_mul(&t0, &P->z, &Q.z);
    fp2_mul(&t1, &t0, &curve->C);
    fp2_inv(&t1);
    fp2_mul(&PP.x, &P->x, &t1);
    fp2_mul(&Q.x, &Q.x, &t1);
    fp2_mul(&E.A, &curve->A, &t1);
    fp2_mul(&PP.x, &PP.x, &Q.z);
    fp2_mul(&PP.x, &PP.x, &curve->C);
    fp2_mul(&Q.x, &Q.x, &P->z);
    fp2_mul(&Q.x, &Q.x, &curve->C);
    fp2_mul(&E.A, &E.A, &t0);
    fp_mont_setone(PP.z.re);
    fp_set(PP.z.im, 0);
    fp2_copy(&Q.z, &PP.z);
    fp2_copy(&E.C, &PP.z);
    // Compute P-Q
    difference_point(&PQ2->PmQ, &PP, &Q, &E);
    copy_point(&PQ2->P, &PP);
    copy_point(&PQ2->Q, &Q);
 }
 /*
  * Builds two points P, Q on `curve` plus their difference and stores them in
  * PQ3->P, PQ3->Q and PQ3->PmQ, with P and Q scaled to z = 1.
  *
  * Search procedure (identical for P and Q, sharing one candidate counter):
  *   - candidates are x = 1 + k*i, k = 1, 2, ... (x.im grows by x.re each round);
  *   - a candidate is kept only if x*(C^2 x^2 + A*C*x + C^2) is a square;
  *   - it is multiplied by p_cofactor_for_3g, then tripled POWER_OF_3 - 1 times;
  *     a zero result rejects the candidate;
  *   - for Q, the tripled point Q3 must differ from P3 and from the double of P3.
  *
  * PQ3   - output basis.
  * curve - input curve in projective form (A : C); not modified.
  */
 void ec_curve_to_basis_3(ec_basis_t* PQ3, const ec_curve_t* curve){
    fp2_t x, t0, t1, t2;
    ec_point_t P, Q, Q3, P3, A24, A3;
    // Curve coefficient in the form A24 = (A+2C:4C)
    fp2_add(&A24.z, &curve->C, &curve->C);
    fp2_add(&A24.x, &curve->A, &A24.z);
    fp2_add(&A24.z, &A24.z, &A24.z);
    // Curve coefficient in the form A3 = (A+2C:A-2C)
    fp2_sub(&A3.z, &A24.x, &A24.z);
    fp2_copy(&A3.x, &A24.x);
    // Candidate counter starts at x = 1 + 0*i and is incremented before each test
    fp_mont_setone(x.re);
    fp_set(x.im, 0);
    // Find P
    while(1){
        fp_add(x.im, x.re, x.im);
        // Check if point is rational
        // t1 = x*(C^2*x^2 + A*C*x + C^2), evaluated Horner-style
        fp2_sqr(&t0, &curve->C);
        fp2_mul(&t1, &t0, &x);
        fp2_mul(&t2, &curve->A, &curve->C);
        fp2_add(&t1, &t1, &t2);
        fp2_mul(&t1, &t1, &x);
        fp2_add(&t1, &t1, &t0);
        fp2_mul(&t1, &t1, &x);
        if(fp2_is_square(&t1)){
            fp2_copy(&P.x, &x);
            fp_mont_setone(P.z.re);
            fp_set(P.z.im, 0);
        }
        else
            continue;
        // Clear non-3 factors from the order
        xMULv2(&P, &P, p_cofactor_for_3g, (int)P_COFACTOR_FOR_3G_BITLENGTH, &A24);
        // Check if point has order 3^g
        copy_point(&P3, &P);
        for(int i = 0; i < POWER_OF_3 - 1; i++)
            xTPL(&P3, &P3, &A3);
        if(ec_is_zero(&P3))
            continue;
        else
            break;
    }
    // Find Q
    // The counter x is not reset, so the search resumes after the x that produced P
    while(1){
        fp_add(x.im, x.re, x.im);
        // Check if point is rational
        fp2_sqr(&t0, &curve->C);
        fp2_mul(&t1, &t0, &x);
        fp2_mul(&t2, &curve->A, &curve->C);
        fp2_add(&t1, &t1, &t2);
        fp2_mul(&t1, &t1, &x);
        fp2_add(&t1, &t1, &t0);
        fp2_mul(&t1, &t1, &x);
        if(fp2_is_square(&t1)){
            fp2_copy(&Q.x, &x);
            fp_mont_setone(Q.z.re);
            fp_set(Q.z.im, 0);
        }
        else
            continue;
        // Clear non-3 factors from the order
        xMULv2(&Q, &Q, p_cofactor_for_3g, (int)P_COFACTOR_FOR_3G_BITLENGTH, &A24);
        // Check if point has order 3^g
        copy_point(&Q3, &Q);
        for(int i = 0; i < POWER_OF_3 - 1; i++)
            xTPL(&Q3, &Q3, &A3);
        if(ec_is_zero(&Q3))
            continue;
        // Check if point is orthogonal to P
        if(is_point_equal(&P3, &Q3))
            continue;
        // P3 is overwritten with its double and compared once more; the
        // doubled value stays in P3 if this iteration is retried
        xDBLv2(&P3, &P3, &A24);
        if(is_point_equal(&P3, &Q3))
            continue;
        else
            break;
    }
    // Normalize points
    // One shared inversion: t1 = 1/(zP*zQ*C); afterwards
    //   P.x = xP/zP, Q.x = xQ/zQ, E.A = A/C, and all z / C fields are set to 1
    ec_curve_t E;
    fp2_mul(&t0, &P.z, &Q.z);
    fp2_mul(&t1, &t0, &curve->C);
    fp2_inv(&t1);
    fp2_mul(&P.x, &P.x, &t1);
    fp2_mul(&Q.x, &Q.x, &t1);
    fp2_mul(&E.A, &curve->A, &t1);
    fp2_mul(&P.x, &P.x, &Q.z);
    fp2_mul(&P.x, &P.x, &curve->C);
    fp2_mul(&Q.x, &Q.x, &P.z);
    fp2_mul(&Q.x, &Q.x, &curve->C);
    fp2_mul(&E.A, &E.A, &t0);
    fp_mont_setone(P.z.re);
    fp_set(P.z.im, 0);
    fp2_copy(&Q.z, &P.z);
    fp2_copy(&E.C, &P.z);
    // Compute P-Q
    difference_point(&PQ3->PmQ, &P, &Q, &E);
    copy_point(&PQ3->P, &P);
    copy_point(&PQ3->Q, &Q);
 }
 /*
  * Builds two points P, Q on `curve` plus their difference and stores them in
  * PQ6->P, PQ6->Q and PQ6->PmQ, with P and Q scaled to z = 1.
  *
  * Search procedure (identical for P and Q, sharing one candidate counter):
  *   - candidates are x = 1 + k*i, k = 1, 2, ... (x.im grows by x.re each round);
  *   - a candidate is kept only if x*(C^2 x^2 + A*C*x + C^2) is a square;
  *   - it is multiplied by p_cofactor_for_6fg, then doubled POWER_OF_2 - 1 times
  *     and tripled POWER_OF_3 - 1 times, giving P6 (resp. Q6);
  *   - the candidate is rejected if that point, its double, or its triple is zero;
  *   - for Q, the triples of P6 and Q6 must differ, and so must their doubles.
  *
  * PQ6   - output basis.
  * curve - input curve in projective form (A : C); not modified.
  */
 void ec_curve_to_basis_6(ec_basis_t* PQ6, const ec_curve_t* curve){
    fp2_t x, t0, t1, t2;
    ec_point_t P, Q, Q6, P6, R, T, A24, A3;
    // Curve coefficient in the form A24 = (A+2C:4C)
    fp2_add(&A24.z, &curve->C, &curve->C);
    fp2_add(&A24.x, &curve->A, &A24.z);
    fp2_add(&A24.z, &A24.z, &A24.z);
    // Curve coefficient in the form A3 = (A+2C:A-2C)
    fp2_sub(&A3.z, &A24.x, &A24.z);
    fp2_copy(&A3.x, &A24.x);
    // Candidate counter starts at x = 1 + 0*i and is incremented before each test
    fp_mont_setone(x.re);
    fp_set(x.im, 0);
    // Find P
    while(1){
        fp_add(x.im, x.re, x.im);
        // Check if point is rational
        // t1 = x*(C^2*x^2 + A*C*x + C^2), evaluated Horner-style
        fp2_sqr(&t0, &curve->C);
        fp2_mul(&t1, &t0, &x);
        fp2_mul(&t2, &curve->A, &curve->C);
        fp2_add(&t1, &t1, &t2);
        fp2_mul(&t1, &t1, &x);
        fp2_add(&t1, &t1, &t0);
        fp2_mul(&t1, &t1, &x);
        if(fp2_is_square(&t1)){
            fp2_copy(&P.x, &x);
            fp_mont_setone(P.z.re);
            fp_set(P.z.im, 0);
        }
        else
            continue;
        // Clear non-2 factors and non-3 factors from the order
        xMULv2(&P, &P, p_cofactor_for_6fg, (int)P_COFACTOR_FOR_6FG_BITLENGTH, &A24);
        // Check if point has order 2^f*3^g
        copy_point(&P6, &P);
        for(int i = 0; i < POWER_OF_2 - 1; i++)
            xDBLv2(&P6, &P6, &A24);
        for(int i = 0; i < POWER_OF_3 - 1; i++)
            xTPL(&P6, &P6, &A3);
        if(ec_is_zero(&P6))
            continue;
        // Neither the double nor the triple of P6 may vanish
        xDBLv2(&T, &P6, &A24);
        if (ec_is_zero(&T))
            continue;
        xTPL(&T, &P6, &A3);
        if (ec_is_zero(&T))
            continue;
        break;
    }
    // Find Q
    // The counter x is not reset, so the search resumes after the x that produced P
    while(1){
        fp_add(x.im, x.re, x.im);
        // Check if point is rational
        fp2_sqr(&t0, &curve->C);
        fp2_mul(&t1, &t0, &x);
        fp2_mul(&t2, &curve->A, &curve->C);
        fp2_add(&t1, &t1, &t2);
        fp2_mul(&t1, &t1, &x);
        fp2_add(&t1, &t1, &t0);
        fp2_mul(&t1, &t1, &x);
        if(fp2_is_square(&t1)){
            fp2_copy(&Q.x, &x);
            fp_mont_setone(Q.z.re);
            fp_set(Q.z.im, 0);
        }
        else
            continue;
        // Clear non-6 factors from the order
        xMULv2(&Q, &Q, p_cofactor_for_6fg, (int)P_COFACTOR_FOR_6FG_BITLENGTH, &A24);
        // Check first if point has order 2^f*3^g
        copy_point(&Q6, &Q);
        for(int i = 0; i < POWER_OF_2 - 1; i++)
            xDBLv2(&Q6, &Q6, &A24);
        for(int i = 0; i < POWER_OF_3 - 1; i++)
            xTPL(&Q6, &Q6, &A3);
        if(ec_is_zero(&Q6))
            continue;
        xDBLv2(&T, &Q6, &A24);
        if (ec_is_zero(&T))
            continue;
        xTPL(&T, &Q6, &A3);
        if (ec_is_zero(&T))
            continue;
        // Check if point P is independent from point Q
        // First compare the triples of P6 and Q6, then their doubles
        xTPL(&R, &P6, &A3);
        xTPL(&T, &Q6, &A3);
        if(is_point_equal(&R, &T))
            continue;
        xDBLv2(&R, &P6, &A24);
        xDBLv2(&T, &Q6, &A24);
        if(is_point_equal(&R, &T))
            continue;
        break;
    }
    // Normalize points
    // One shared inversion: t1 = 1/(zP*zQ*C); afterwards
    //   P.x = xP/zP, Q.x = xQ/zQ, E.A = A/C, and all z / C fields are set to 1
    ec_curve_t E;
    fp2_mul(&t0, &P.z, &Q.z);
    fp2_mul(&t1, &t0, &curve->C);
    fp2_inv(&t1);
    fp2_mul(&P.x, &P.x, &t1);
    fp2_mul(&Q.x, &Q.x, &t1);
    fp2_mul(&E.A, &curve->A, &t1);
    fp2_mul(&P.x, &P.x, &Q.z);
    fp2_mul(&P.x, &P.x, &curve->C);
    fp2_mul(&Q.x, &Q.x, &P.z);
    fp2_mul(&Q.x, &Q.x, &curve->C);
    fp2_mul(&E.A, &E.A, &t0);
    fp_mont_setone(P.z.re);
    fp_set(P.z.im, 0);
    fp2_copy(&Q.z, &P.z);
    fp2_copy(&E.C, &P.z);
    // Compute P-Q
    difference_point(&PQ6->PmQ, &P, &Q, &E);
    copy_point(&PQ6->P, &P);
    copy_point(&PQ6->Q, &Q);
 }
--- a/Show More
+++ b/Show More
`@@ -1,3 +1,3 @@`
	`set(ECX_DIR ${CMAKE_CURRENT_SOURCE_DIR}/ecx)`	`set(LVLX_DIR ${CMAKE_CURRENT_SOURCE_DIR}/lvlx)`

	`include(${SELECT_SQISIGN_VARIANT})`	`include(${SELECT_SQISIGN_VARIANT})`