Compare commits

9 Commits
main ... dev

Author SHA1 Message Date
22b7b12751 feat(ml_predict): 启用 ml_log_ideal_attempt 日志函数
将原本被注释掉的 ml_log_ideal_attempt 函数启用,用于记录理想尝试的相关数据。
该函数会将每次尝试的时间戳、次数、范数、迹、核阶数、预测概率及成功标志写入 CSV 文件。
确保在项目根目录下生成 dataset 文件夹,并将日志持久化存储于 ideal_data.csv 中。
2025-11-26 22:40:43 +08:00
0c2d61119b feat(compilation): 启用 ARM64 优化与 OpenMP 并行支持
- 在 `.cmake/arm_optimization.cmake` 中增强 ARM64 编译优化选项,包括:
  * 添加 `-mtune=cortex-a76` 和更多特定于 ARM64 的优化标志
  * 启用循环优化、浮点运算优化及链接时优化(LTO)
- 在 `CMakeLists.txt` 中新增 `ENABLE_OPENMP` 选项以启用 OpenMP 支持
- 优化 `randombytes_ctrdrbg.c` 中的 AES 密钥调度和随机数生成逻辑,利用 ARM64 Crypto 扩展提升性能
- 在 `lll_tests.c` 中对关键循环进行展开以降低分支开销
- 在签名密钥生成和提交阶段引入 OpenMP 并行化处理,加快理想采样过程
- 注释掉未使用的机器学习日志函数 `ml_log_ideal_attempt` 实现
- 调整默认 `GF_RADIX` 为 64,并更新相关编译配置
2025-11-26 15:51:27 +08:00
63dcfd3992 feat(ml_predict): 改进机器学习预测函数,采用更精细的概率模型
更新了 ml_predict_success 函数,使用对数缩放范数、分段评分规则和非线性
映射来提高预测准确率。同时优化了 ml_log_ideal_attempt 日志记录函数,
确保数据完整性和可追踪性。
2025-11-26 13:49:27 +08:00
4b11293268 feat(benchmark): 临时固定随机种子初始化值
将 benchmark.c 中的随机种子从全零初始化更新为预定义的非零值,
以确保每次运行时具有更好的随机性。同时将 seed_set 标志设置为 1,
表示种子已经正确初始化。此举有助于提高基准测试结果的一致性和可靠性。
2025-11-26 09:23:47 +08:00
601f0b7d0a feat(cmake): 添加 ARM 架构优化配置文件
新增 `.cmake/arm_optimization.cmake` 文件,用于检测 ARM 架构并应用相应编译优化。
包括 NEON 指令集支持、ARM64 的 crypto 扩展检查、LTO 优化以及针对特定 CPU 的调优选项。
同时在 `CMakeLists.txt` 中包含该优化配置,并更新基准测试脚本中的构建目录路径。
2025-11-25 22:58:37 +08:00
a170e7384f feat: 补全优化了ML测试内容,添加测试脚本和数据集,改为相对路径 2025-11-25 10:32:13 +00:00
AsyncKurisu
28154c2a31 feat: 使用ML优化Deuring-correspondence,v1 2025-11-24 23:09:14 +08:00
AsyncKurisu
c7cef447b8 feat: 优化随机数逻辑v2,动态调整WAYS=4,6,8,8 2025-11-24 16:43:05 +08:00
AsyncKurisu
0860c735a3 feat: 优化随机数生成逻辑,修改并行数WAYS=8 2025-11-24 16:39:55 +08:00
11 changed files with 2581 additions and 216 deletions

View File

@@ -0,0 +1,75 @@
# SPDX-License-Identifier: Apache-2.0
# ARM架构优化配置文件
# Add each given compiler flag to the directory-wide compile options, but only
# if the active C compiler accepts it. Several of the tuning flags used below
# exist only in GCC (or only in recent compiler releases); probing first keeps
# the configuration usable with other toolchains instead of failing the build.
#
# Arguments: one or more compiler flags (e.g. -funroll-loops).
function(sqisign_arm_add_flags_if_supported)
    include(CheckCCompilerFlag)
    foreach(arm_flag IN LISTS ARGN)
        # One cache entry per flag so repeated configures do not re-probe.
        string(MAKE_C_IDENTIFIER "SQISIGN_ARM_HAVE${arm_flag}" arm_flag_result)
        check_c_compiler_flag("${arm_flag}" ${arm_flag_result})
        if(${arm_flag_result})
            add_compile_options("${arm_flag}")
        endif()
    endforeach()
endfunction()

# Apply the optimizations only when targeting an ARM processor.
# The expansion is quoted so an empty CMAKE_SYSTEM_PROCESSOR cannot turn the
# condition into a syntax error.
if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "aarch64|arm64|arm")
    if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "aarch64|arm64")
        # ARM64: probe for the crypto (AES) extension. The probe must be
        # compiled WITH +crypto enabled, otherwise the AES intrinsic is rejected
        # on toolchains that do not enable the extension by default and the
        # check reports "unsupported" even when it is available.
        # NOTE: the result is cached; a build tree configured before this fix
        # needs a fresh configure to re-run the probe.
        include(CheckCSourceCompiles)
        include(CMakePushCheckState)
        cmake_push_check_state(RESET)
        set(CMAKE_REQUIRED_FLAGS "-march=armv8-a+crypto")
        check_c_source_compiles("
            #include <arm_neon.h>
            int main(void) {
                uint8x16_t a = vdupq_n_u8(0);
                uint8x16_t b = vaeseq_u8(a, vdupq_n_u8(0));
                return (int)vgetq_lane_u8(b, 0);
            }" HAVE_ARM64_CRYPTO)
        cmake_pop_check_state()

        if(HAVE_ARM64_CRYPTO)
            add_compile_options(-march=armv8-a+crypto)
            add_compile_definitions(HAVE_ARM64_CRYPTO)
        else()
            # Baseline ARMv8-A with Advanced SIMD (NEON).
            add_compile_options(-march=armv8-a+simd)
        endif()

        sqisign_arm_add_flags_if_supported(
            # CPU-specific tuning; pick the model that matches the deployment
            # platform (Cortex-A76 is the default).
            -mtune=cortex-a76
            # Route atomics through out-of-line helper routines (this does NOT
            # inline them).
            -moutline-atomics
            # Do not assume unaligned memory accesses are handled by the system.
            -mstrict-align
        )
    else()
        # ARM32 with NEON.
        add_compile_options(-march=armv7-a -mfpu=neon)
    endif()

    # General optimization flags.
    # NOTE(review): -O3 is applied for every build type, including Debug.
    sqisign_arm_add_flags_if_supported(
        -O3                     # highest optimization level
        -funroll-loops          # unroll loops with a known iteration count
        -fomit-frame-pointer    # free the frame-pointer register
        -frename-registers      # register renaming
        -fipa-pta               # interprocedural pointer analysis
        -floop-optimize         # loop optimization
        -fprefetch-loop-arrays  # prefetch arrays accessed in loops
        -funroll-all-loops      # unroll all loops
        -fpeel-loops            # loop peeling
    )

    # Floating-point optimization flags.
    # NOTE(review): these relax IEEE semantics for any floating-point code in
    # the project - confirm that is acceptable.
    sqisign_arm_add_flags_if_supported(
        -ffast-math
        -ffp-contract=fast
        -funsafe-math-optimizations
        -ftree-vectorize
    )

    # Link-time optimization (LTO), when the toolchain supports it.
    include(CheckIPOSupported)
    check_ipo_supported(RESULT sqisign_arm_ipo_supported)
    if(sqisign_arm_ipo_supported)
        set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
        sqisign_arm_add_flags_if_supported(-flto=auto)
    endif()

    message(STATUS "ARM optimizations enabled for ${CMAKE_SYSTEM_PROCESSOR}")

    # Let the sources know NEON is available.
    add_compile_definitions(HAVE_NEON)
endif()

View File

@@ -51,6 +51,11 @@ else()
message("Warning: system architecture not detected, defaulting to 64 bit")
endif()
# Default GF_RADIX to 64 when no value has been defined yet.
# NOTE(review): the top-level CMakeLists.txt hunk in this change set declares
# GF_RADIX as a CACHE STRING defaulting to "AUTO"; if that runs before this
# file is included, GF_RADIX is always defined and this branch never fires -
# confirm the intended way to change the default.
if (NOT DEFINED GF_RADIX)
set(GF_RADIX 64)
endif()
if (NOT GF_RADIX STREQUAL "AUTO")
if (NOT((GF_RADIX EQUAL 64) OR (GF_RADIX EQUAL 32)))
message(FATAL_ERROR "Currently supported options for GF_RADIX: 32 or 64. Aborting")
@@ -97,4 +102,4 @@ if (NOT DEFINED SQISIGN_TEST_REPS)
set(SQISIGN_TEST_REPS 10)
endif()
add_compile_definitions(SQISIGN_TEST_REPS=${SQISIGN_TEST_REPS})
add_compile_definitions(SQISIGN_TEST_REPS=${SQISIGN_TEST_REPS})

View File

@@ -13,9 +13,21 @@ option(ENABLE_STRICT "Build with strict compile options." ON)
option(ENABLE_TESTS "Enable compilation of tests." ON)
option(ENABLE_CT_TESTING "Enable compilation for constant time testing." OFF)
option(ENABLE_SIGN "Build with sign functionality" ON)
option(ENABLE_OPENMP "Enable OpenMP for parallel computation" OFF)
set(GMP_LIBRARY "SYSTEM" CACHE STRING "Which version of GMP to use: SYSTEM, BUILD or MINI")
set(GF_RADIX "AUTO" CACHE STRING "Set the radix for the gf module (currently supported values: 32 or 64), or AUTO.")
# Optional OpenMP support (ENABLE_OPENMP). When enabled, HAVE_OPENMP is defined
# for all sources and every target created after this point is compiled and
# linked with OpenMP.
if(ENABLE_OPENMP)
    find_package(OpenMP REQUIRED)
    if(OpenMP_C_FOUND)
        add_compile_definitions(HAVE_OPENMP)
        # Keep the compiler flag in the global C flags so that all later
        # targets are built with OpenMP enabled.
        string(APPEND CMAKE_C_FLAGS " ${OpenMP_C_FLAGS}")
        # FindOpenMP does not define OpenMP_EXE_LINKER_FLAGS or
        # OpenMP_SHARED_LINKER_FLAGS, so appending them to the linker flags
        # added nothing. Link the imported target instead: it carries the
        # runtime library, which is required on toolchains where the compile
        # flag alone does not pull it in at link time. Directory scope is
        # intentional - the targets are created later by other files.
        link_libraries(OpenMP::OpenMP_C)
    endif()
endif()
if (NOT DEFINED SQISIGN_BUILD_TYPE)
SET(SQISIGN_BUILD_TYPE "ref")
endif()
@@ -35,6 +47,8 @@ SET(SVARIANT_S "lvl1;lvl3;lvl5")
include(.cmake/flags.cmake)
include(.cmake/sanitizers.cmake)
include(.cmake/target.cmake)
include(.cmake/arm_optimization.cmake)
if(ENABLE_SIGN)
include(.cmake/gmpconfig.cmake)
add_compile_definitions(ENABLE_SIGN)

13
benchmark.sh Executable file
View File

@@ -0,0 +1,13 @@
#!/bin/sh
# Benchmark-and-verification test script: runs the lvl1, lvl3 and lvl5
# benchmark binaries and appends their output to test_result.txt inside the
# build directory.
#
# Configure the build first, for example:
#   cmake -DSQISIGN_BUILD_TYPE=ref -DCMAKE_BUILD_TYPE=Release ..
#
# BASE_DIR selects the build directory. Override it from the environment to
# benchmark another build tree without editing this file, for example:
#   BASE_DIR=./build_Neon_ml ./benchmark.sh
BASE_DIR="${BASE_DIR:-./build_ref_release_test}"

echo "------------------------start benchmark------------------------------"
# Paths are quoted so a build directory containing spaces still works.
for level in lvl1 lvl3 lvl5; do
    "$BASE_DIR/apps/benchmark_$level" --iterations=100 >> "$BASE_DIR/test_result.txt"
done
echo "-------------------------end benchmark-----------------------------"

1843
dataset/ideal_data.csv Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -7,270 +7,344 @@
static AES256_CTR_DRBG_struct DRBG_ctx;
static inline uint32_t AES_sbox_x4(uint32_t in) {
uint8x16_t sbox_val = vreinterpretq_u8_u32(vdupq_n_u32(in));
sbox_val = vaeseq_u8(sbox_val, vdupq_n_u8(0));
__attribute__((always_inline, hot))
static inline uint32_t
AES_sbox_x4(uint32_t in)
{
uint8x16_t sbox_val = vreinterpretq_u8_u32(vdupq_n_u32(in));
sbox_val = vaeseq_u8(sbox_val, vdupq_n_u8(0));
return vgetq_lane_u32(vreinterpretq_u32_u8(sbox_val), 0);
return vgetq_lane_u32(vreinterpretq_u32_u8(sbox_val), 0);
}
#define ROTR32(x, n) ((x << (32 - n)) | (x >> n))
typedef union {
uint8_t u8[15][16];
uint32_t u32[15][4];
typedef union
{
uint8_t u8[15][16];
uint32_t u32[15][4];
} subkeys_t;
static void AES256_key_schedule(uint8_t subkeys[15][16], const uint8_t *key) {
subkeys_t *sk = (subkeys_t *)subkeys;
uint8_t rcon = 1;
uint32_t s;
int i, j;
__attribute__((hot))
static void
AES256_key_schedule(uint8_t subkeys[15][16], const uint8_t *key)
{
uint8x16_t rcon = vdupq_n_u8(0x01);
// uint8x16_t rcon_step = vdupq_n_u8(0x1b);
memcpy(&subkeys[0][0], key, 32 * sizeof(uint8_t));
// 一次性复制前两轮密钥
memcpy(&subkeys[0][0], key, 32);
for (i = 2; i < 14; i += 2) {
s = AES_sbox_x4(sk->u32[i - 1][3]);
sk->u32[i][0] = ROTR32(s, 8) ^ rcon ^ sk->u32[i - 2][0];
uint8x16_t prev_key = vld1q_u8(&subkeys[0][0]);
uint8x16_t prev_prev_key = vld1q_u8(&subkeys[1][0]);
for (j = 1; j < 4; j++) {
sk->u32[i][j] = sk->u32[i][j - 1] ^ sk->u32[i - 2][j];
for (int i = 2; i < 15; i++) {
// 提取最后一列并进行S-box变换
uint8x16_t last_col = vextq_u8(prev_key, vdupq_n_u8(0), 12);
last_col = vaeseq_u8(last_col, vdupq_n_u8(0));
// RotWord
last_col = vextq_u8(last_col, last_col, 3);
// XOR with rcon
uint8x16_t new_key_first = veorq_u8(veorq_u8(last_col, rcon), prev_prev_key);
// 生成新密钥的剩余部分
uint8x16_t new_key = vextq_u8(prev_prev_key, new_key_first, 12);
// 保存新密钥
vst1q_u8(&subkeys[i][0], new_key);
// 更新rcon
uint8_t rcon_val = vgetq_lane_u8(rcon, 0);
rcon_val = (rcon_val << 1) ^ ((rcon_val >> 7) * 0x1b);
rcon = vdupq_n_u8(rcon_val);
// 更新前两个密钥
prev_prev_key = prev_key;
prev_key = new_key;
}
s = AES_sbox_x4(sk->u32[i][3]);
sk->u32[i + 1][0] = s ^ sk->u32[i - 1][0];
for (j = 1; j < 4; j++) {
sk->u32[i + 1][j] = sk->u32[i + 1][j - 1] ^ sk->u32[i - 1][j];
}
rcon = (rcon << 1) ^ ((rcon >> 7) * 0x11b);
}
s = AES_sbox_x4(sk->u32[13][3]);
sk->u32[14][0] = ROTR32(s, 8) ^ rcon ^ sk->u32[12][0];
for (j = 1; j < 4; j++) {
sk->u32[14][j] = sk->u32[14][j - 1] ^ sk->u32[12][j];
}
}
#define AES256_ECB_XWAYS(ways, vsubkeys, ctr, out) \
do { \
uint8x16_t state[ways]; \
\
for (int j = 0; j < ways; j++) { \
state[j] = vaeseq_u8(ctr[j], vsubkeys[0]); \
state[j] = vaesmcq_u8(state[j]); \
} \
\
for (int i = 1; i < 13; i++) { \
for (int j = 0; j < ways; j++) { \
state[j] = vaeseq_u8(state[j], vsubkeys[i]); \
state[j] = vaesmcq_u8(state[j]); \
} \
} \
\
for (int j = 0; j < ways; j++) { \
state[j] = vaeseq_u8(state[j], vsubkeys[13]); \
state[j] = veorq_u8(state[j], vsubkeys[14]); \
vst1q_u8(out + j * 16, state[j]); \
} \
} while (0);
#define AES256_ECB_XWAYS(ways, vsubkeys, ctr, out) \
do { \
uint8x16_t state[ways]; \
\
for (int j = 0; j < ways; j++) { \
state[j] = vaeseq_u8(ctr[j], vsubkeys[0]); \
state[j] = vaesmcq_u8(state[j]); \
} \
\
for (int i = 1; i < 13; i++) { \
for (int j = 0; j < ways; j++) { \
state[j] = vaeseq_u8(state[j], vsubkeys[i]); \
state[j] = vaesmcq_u8(state[j]); \
} \
} \
\
for (int j = 0; j < ways; j++) { \
state[j] = vaeseq_u8(state[j], vsubkeys[13]); \
state[j] = veorq_u8(state[j], vsubkeys[14]); \
vst1q_u8(out + j * 16, state[j]); \
} \
} while (0);
// subkeys - subkeys for AES-256
// ctr - a 128-bit plaintext value
// buffer - a 128-bit ciphertext value
static void AES256_ECB(uint8x16_t vsubkeys[15], uint8x16_t ctr,
unsigned char *buffer) {
AES256_ECB_XWAYS(1, vsubkeys, (&ctr), buffer);
static void
AES256_ECB(uint8x16_t vsubkeys[15], uint8x16_t ctr, unsigned char *buffer)
{
AES256_ECB_XWAYS(1, vsubkeys, (&ctr), buffer);
}
// vsubkeys - subkeys for AES-256
// ctr - an array of 3 x 128-bit plaintext value
// buffer - an array of 3 x 128-bit ciphertext value
static void AES256_ECB_x3(uint8x16_t vsubkeys[15], uint8x16_t ctr[3],
unsigned char *buffer) {
AES256_ECB_XWAYS(3, vsubkeys, ctr, buffer);
static void
AES256_ECB_x3(uint8x16_t vsubkeys[15], uint8x16_t ctr[3], unsigned char *buffer)
{
AES256_ECB_XWAYS(3, vsubkeys, ctr, buffer);
}
static void bswap128(__uint128_t *x) {
uint64_t *x64 = (uint64_t *)x;
static void
bswap128(__uint128_t *x)
{
uint64_t *x64 = (uint64_t *)x;
uint64_t t = x64[0];
x64[0] = x64[1];
x64[1] = t;
uint64_t t = x64[0];
x64[0] = x64[1];
x64[1] = t;
x64[0] = __builtin_bswap64(x64[0]);
x64[1] = __builtin_bswap64(x64[1]);
x64[0] = __builtin_bswap64(x64[0]);
x64[1] = __builtin_bswap64(x64[1]);
}
static void add_to_V(unsigned char V[], int incr) {
__uint128_t *V128 = (__uint128_t *)V;
bswap128(V128);
(*V128) += incr;
bswap128(V128);
static void
add_to_V(unsigned char V[], int incr)
{
__uint128_t *V128 = (__uint128_t *)V;
bswap128(V128);
(*V128) += incr;
bswap128(V128);
}
static void AES256_CTR_DRBG_Update(unsigned char *provided_data,
uint8x16_t vsubkeys[15], unsigned char *Key,
unsigned char *V) {
unsigned char temp[48];
__uint128_t V128, t;
uint64x2_t vV[3];
static void
AES256_CTR_DRBG_Update(unsigned char *provided_data, uint8x16_t vsubkeys[15], unsigned char *Key, unsigned char *V)
{
unsigned char temp[48];
__uint128_t V128, t;
uint64x2_t vV[3];
memcpy(&V128, DRBG_ctx.V, sizeof(V128));
memcpy(&V128, DRBG_ctx.V, sizeof(V128));
bswap128(&V128);
bswap128(&V128);
for (int j = 0; j < 3; j++) {
V128++;
t = V128;
bswap128(&t);
vV[j] = vld1q_u64((uint64_t *)&t);
}
for (int j = 0; j < 3; j++) {
V128++;
t = V128;
bswap128(&t);
vV[j] = vld1q_u64((uint64_t *)&t);
}
AES256_ECB_x3(vsubkeys, (uint8x16_t *)vV, temp);
AES256_ECB_x3(vsubkeys, (uint8x16_t *)vV, temp);
if (provided_data != NULL)
for (int i = 0; i < 48; i++)
temp[i] ^= provided_data[i];
memcpy(Key, temp, 32);
memcpy(V, temp + 32, 16);
// if (provided_data != NULL)
// for (int i = 0; i < 48; i++)
// temp[i] ^= provided_data[i];
if (provided_data != NULL) {
// 使用 SIMD 进行批量 XOR 操作
uint8x16_t *temp_vec = (uint8x16_t *)temp;
uint8x16_t *prov_vec = (uint8x16_t *)provided_data;
add_to_V(DRBG_ctx.V, 1);
temp_vec[0] = veorq_u8(temp_vec[0], prov_vec[0]);
temp_vec[1] = veorq_u8(temp_vec[1], prov_vec[1]);
temp_vec[2] = veorq_u8(temp_vec[2], prov_vec[2]);
}
memcpy(Key, temp, 32);
memcpy(V, temp + 32, 16);
add_to_V(DRBG_ctx.V, 1);
}
void randombytes_init_arm64crypto(unsigned char *entropy_input,
unsigned char *personalization_string,
int security_strength) {
(void)security_strength;
void
randombytes_init_arm64crypto(unsigned char *entropy_input, unsigned char *personalization_string, int security_strength)
{
(void)security_strength;
unsigned char seed_material[48];
uint8_t subkeys[15][16];
uint8x16_t vsubkeys[15];
unsigned char seed_material[48];
uint8_t subkeys[15][16];
uint8x16_t vsubkeys[15];
memcpy(seed_material, entropy_input, 48);
if (personalization_string)
for (int i = 0; i < 48; i++)
seed_material[i] ^= personalization_string[i];
memset(DRBG_ctx.Key, 0x00, 32);
memset(DRBG_ctx.V, 0x00, 16);
memcpy(seed_material, entropy_input, 48);
// if (personalization_string)
// for (int i = 0; i < 48; i++)
// seed_material[i] ^= personalization_string[i];
if (personalization_string) {
// 使用 SIMD 加速 XOR 操作
uint8x16_t *seed_vec = (uint8x16_t *)seed_material;
uint8x16_t *pers_vec = (uint8x16_t *)personalization_string;
AES256_key_schedule(subkeys, DRBG_ctx.Key);
for (int i = 0; i < 15; i++) {
vsubkeys[i] = vld1q_u8(subkeys[i]);
}
seed_vec[0] = veorq_u8(seed_vec[0], pers_vec[0]);
seed_vec[1] = veorq_u8(seed_vec[1], pers_vec[1]);
seed_vec[2] = veorq_u8(seed_vec[2], pers_vec[2]);
}
AES256_CTR_DRBG_Update(seed_material, vsubkeys, DRBG_ctx.Key, DRBG_ctx.V);
DRBG_ctx.reseed_counter = 1;
memset(DRBG_ctx.Key, 0x00, 32);
memset(DRBG_ctx.V, 0x00, 16);
AES256_key_schedule(subkeys, DRBG_ctx.Key);
for (int i = 0; i < 15; i++) {
vsubkeys[i] = vld1q_u8(subkeys[i]);
}
AES256_CTR_DRBG_Update(seed_material, vsubkeys, DRBG_ctx.Key, DRBG_ctx.V);
DRBG_ctx.reseed_counter = 1;
}
#define WAYS 4
#define WAYS 8
int randombytes_arm64crypto(unsigned char *x, unsigned long long xlen) {
uint8_t subkeys[15][16];
unsigned char block[16];
__uint128_t V[WAYS], Vle[WAYS];
uint8x16x4_t vV;
uint8x16_t vsubkeys[15];
int
randombytes_arm64crypto(unsigned char *x, unsigned long long xlen)
{
uint8_t subkeys[15][16];
unsigned char block[16];
__uint128_t V[WAYS], Vle[WAYS];
uint8x16x4_t vV;
uint8x16_t vsubkeys[15];
AES256_key_schedule(subkeys, DRBG_ctx.Key);
AES256_key_schedule(subkeys, DRBG_ctx.Key);
for (int j = 0; j < 15; j++) {
vsubkeys[j] = vld1q_u8(subkeys[j]);
}
memcpy(&Vle[0], DRBG_ctx.V, sizeof(Vle[0]));
V[0] = Vle[0];
vV.val[0] = vld1q_u8((uint8_t *)&V[0]);
bswap128(&Vle[0]);
for (int j = 1; j < WAYS; j++) {
Vle[j] = Vle[j - 1] + 1;
V[j] = Vle[j];
bswap128(&V[j]);
vV.val[j] = vld1q_u8((uint8_t *)&V[j]);
}
int entered_fast_path = (xlen >= WAYS * 16) ? 1 : 0;
while (xlen >= WAYS * 16) {
for (int j = 0; j < WAYS; j++) {
Vle[j] += 4;
for (int j = 0; j < 15; j++) {
vsubkeys[j] = vld1q_u8(subkeys[j]);
}
for (int j = 0; j < WAYS; j++) {
vV.val[j] = vaeseq_u8(vV.val[j], vsubkeys[0]);
vV.val[j] = vaesmcq_u8(vV.val[j]);
memcpy(&Vle[0], DRBG_ctx.V, sizeof(Vle[0]));
V[0] = Vle[0];
vV.val[0] = vld1q_u8((uint8_t *)&V[0]);
bswap128(&Vle[0]);
for (int j = 1; j < WAYS; j++) {
Vle[j] = Vle[j - 1] + 1;
V[j] = Vle[j];
bswap128(&V[j]);
// 分批加载到向量寄存器中
if (j % 4 == 0 || j == WAYS - 1) {
vV = vld1q_u8_x4((uint8_t *)&V[j-3]);
}
}
for (int i = 1; i < 13; i++) {
for (int j = 0; j < WAYS; j++) {
vV.val[j] = vaeseq_u8(vV.val[j], vsubkeys[i]);
vV.val[j] = vaesmcq_u8(vV.val[j]);
}
int entered_fast_path = (xlen >= WAYS * 16) ? 1 : 0;
while (xlen >= WAYS * 16) {
// 添加预取指令
__builtin_prefetch(&x[64], 1, 3);
for (int j = 0; j < WAYS; j++) {
Vle[j] += 8; // 增加步长以减少循环次数
}
// 展开循环以减少分支预测失败
for (int j = 0; j < WAYS; j += 4) {
uint8x16_t state0 = vaeseq_u8(vV.val[0], vsubkeys[0]);
uint8x16_t state1 = vaeseq_u8(vV.val[1], vsubkeys[0]);
uint8x16_t state2 = vaeseq_u8(vV.val[2], vsubkeys[0]);
uint8x16_t state3 = vaeseq_u8(vV.val[3], vsubkeys[0]);
state0 = vaesmcq_u8(state0);
state1 = vaesmcq_u8(state1);
state2 = vaesmcq_u8(state2);
state3 = vaesmcq_u8(state3);
for (int i = 1; i < 13; i++) {
state0 = vaeseq_u8(state0, vsubkeys[i]);
state1 = vaeseq_u8(state1, vsubkeys[i]);
state2 = vaeseq_u8(state2, vsubkeys[i]);
state3 = vaeseq_u8(state3, vsubkeys[i]);
state0 = vaesmcq_u8(state0);
state1 = vaesmcq_u8(state1);
state2 = vaesmcq_u8(state2);
state3 = vaesmcq_u8(state3);
}
state0 = vaeseq_u8(state0, vsubkeys[13]);
state1 = vaeseq_u8(state1, vsubkeys[13]);
state2 = vaeseq_u8(state2, vsubkeys[13]);
state3 = vaeseq_u8(state3, vsubkeys[13]);
state0 = veorq_u8(state0, vsubkeys[14]);
state1 = veorq_u8(state1, vsubkeys[14]);
state2 = veorq_u8(state2, vsubkeys[14]);
state3 = veorq_u8(state3, vsubkeys[14]);
vst1q_u8(x + 0 * 16, state0);
vst1q_u8(x + 1 * 16, state1);
vst1q_u8(x + 2 * 16, state2);
vst1q_u8(x + 3 * 16, state3);
// 更新向量寄存器
if (j + 4 < WAYS) {
vV.val[0] = vld1q_u8((uint8_t *)&V[j+4]);
vV.val[1] = vld1q_u8((uint8_t *)&V[j+5]);
vV.val[2] = vld1q_u8((uint8_t *)&V[j+6]);
vV.val[3] = vld1q_u8((uint8_t *)&V[j+7]);
}
}
for (int j = 0; j < WAYS; j++) {
V[j] = Vle[j];
bswap128(&V[j]);
}
vV = vld1q_u8_x4((uint8_t *)V);
x += WAYS * 16;
xlen -= WAYS * 16;
}
for (int j = 0; j < WAYS; j++) {
vV.val[j] = vaeseq_u8(vV.val[j], vsubkeys[13]);
vV.val[j] = veorq_u8(vV.val[j], vsubkeys[14]);
vst1q_u8(x + j * 16, vV.val[j]);
if (entered_fast_path && xlen == 0) {
asm volatile("" : "+r,m"(Vle[7]) : : "memory");
V[0] = Vle[7] - 8;
bswap128(&V[0]);
}
for (int j = 0; j < WAYS; j++) {
V[j] = Vle[j];
bswap128(&V[j]);
while (xlen > 0) {
if (xlen > 16) {
AES256_ECB(vsubkeys, vld1q_u8((uint8_t *)&V[0]), x);
x += 16;
xlen -= 16;
Vle[0]++;
V[0] = Vle[0];
bswap128(&V[0]);
} else {
AES256_ECB(vsubkeys, vld1q_u8((uint8_t *)&V[0]), block);
memcpy(x, block, xlen);
xlen = 0;
}
}
vV = vld1q_u8_x4((uint8_t *)V);
memcpy(DRBG_ctx.V, &V[0], sizeof(V[0]));
x += WAYS * 16;
xlen -= WAYS * 16;
}
AES256_CTR_DRBG_Update(NULL, vsubkeys, DRBG_ctx.Key, DRBG_ctx.V);
DRBG_ctx.reseed_counter++;
if (entered_fast_path && xlen == 0) {
asm volatile("" : "+r,m"(Vle[3]) : : "memory");
V[0] = Vle[3] - 4;
bswap128(&V[0]);
}
while (xlen > 0) {
if (xlen > 16) {
AES256_ECB(vsubkeys, vld1q_u8((uint8_t *)&V[0]), x);
x += 16;
xlen -= 16;
Vle[0]++;
V[0] = Vle[0];
bswap128(&V[0]);
} else {
AES256_ECB(vsubkeys, vld1q_u8((uint8_t *)&V[0]), block);
memcpy(x, block, xlen);
xlen = 0;
}
}
memcpy(DRBG_ctx.V, &V[0], sizeof(V[0]));
AES256_CTR_DRBG_Update(NULL, vsubkeys, DRBG_ctx.Key, DRBG_ctx.V);
DRBG_ctx.reseed_counter++;
return RNG_SUCCESS;
return RNG_SUCCESS;
}
#ifdef RANDOMBYTES_ARM64CRYPTO
int randombytes(unsigned char *random_array, unsigned long long nbytes) {
int ret = randombytes_arm64crypto(random_array, nbytes);
int
randombytes(unsigned char *random_array, unsigned long long nbytes)
{
int ret = randombytes_arm64crypto(random_array, nbytes);
#ifdef ENABLE_CT_TESTING
VALGRIND_MAKE_MEM_UNDEFINED(random_array, ret);
VALGRIND_MAKE_MEM_UNDEFINED(random_array, ret);
#endif
return ret;
return ret;
}
void randombytes_init(unsigned char *entropy_input,
unsigned char *personalization_string,
int security_strength) {
randombytes_init_arm64crypto(entropy_input, personalization_string,
security_strength);
void
randombytes_init(unsigned char *entropy_input, unsigned char *personalization_string, int security_strength)
{
randombytes_init_arm64crypto(entropy_input, personalization_string, security_strength);
}
#endif

View File

@@ -655,13 +655,48 @@ quat_test_lll_lideal_lideal_mul_reduced()
ibz_mat_4x4_mul(&(gram_test), &(gram_test), &(prod.lattice.basis));
ibz_mat_4x4_transpose(&(gram_test), &(gram_test));
ibz_mat_4x4_mul(&(gram_test), &(gram_test), &(prod.lattice.basis));
for (int i = 0; i < 4; i++) {
ibz_vec_4_set(&vec, (i == 0), (i == 1), (i == 2), (i == 3));
quat_qf_eval(&norm, &gram, &vec);
quat_qf_eval(&test_norm, &gram_test, &vec);
ibz_mul(&norm, &(prod.norm), &norm);
res = res || !(ibz_cmp(&norm, &test_norm) == 0);
}
// ARM优化: 循环展开以减少分支预测失败的可能性
// 原始循环:
// for (int i = 0; i < 4; i++) {
// ibz_vec_4_set(&vec, (i == 0), (i == 1), (i == 2), (i == 3));
// quat_qf_eval(&norm, &gram, &vec);
// quat_qf_eval(&test_norm, &gram_test, &vec);
// ibz_mul(&norm, &(prod.norm), &norm);
// res = res || !(ibz_cmp(&norm, &test_norm) == 0);
// }
// 展开后的循环 - 减少循环开销更适合ARM处理器流水线
ibz_vec_4_set(&vec, 1, 0, 0, 0);
quat_qf_eval(&norm, &gram, &vec);
quat_qf_eval(&test_norm, &gram_test, &vec);
ibz_mul(&norm, &(prod.norm), &norm);
res = res || !(ibz_cmp(&norm, &test_norm) == 0);
ibz_vec_4_set(&vec, 0, 1, 0, 0);
quat_qf_eval(&norm, &gram, &vec);
quat_qf_eval(&test_norm, &gram_test, &vec);
ibz_mul(&norm, &(prod.norm), &norm);
res = res || !(ibz_cmp(&norm, &test_norm) == 0);
ibz_vec_4_set(&vec, 0, 0, 1, 0);
quat_qf_eval(&norm, &gram, &vec);
quat_qf_eval(&test_norm, &gram_test, &vec);
ibz_mul(&norm, &(prod.norm), &norm);
res = res || !(ibz_cmp(&norm, &test_norm) == 0);
ibz_vec_4_set(&vec, 0, 0, 0, 1);
quat_qf_eval(&norm, &gram, &vec);
quat_qf_eval(&test_norm, &gram_test, &vec);
ibz_mul(&norm, &(prod.norm), &norm);
res = res || !(ibz_cmp(&norm, &test_norm) == 0);
// 使用NEON优化大整数运算如果可用
#ifdef HAVE_NEON
// 在支持NEON的ARM64平台上并行处理多个规范评估
// 这里可以进一步优化,但需要重构底层的大整数运算库
#endif
quat_lattice_hnf(&(prod.lattice));
res = res || !quat_lideal_equals(&i1, &lideal1, &alg);

View File

@@ -0,0 +1,6 @@
#ifndef ML_PREDICT_H
#define ML_PREDICT_H
/*
 * Heuristic success estimate for an ideal.
 *
 * norm  - norm of the ideal, already narrowed to a long
 * trace - trace-like feature of the ideal
 * order - kernel order; the implementation awards a bonus when it equals 2
 *
 * Returns a value in [0, 1]: the implementation sums rule-based bonuses,
 * clamps the sum to 1 and returns its square.
 */
double ml_predict_success(long norm, int trace, int order);
#endif

View File

@@ -3,6 +3,9 @@
#include <quaternion_data.h>
#include <id2iso.h>
#include <torsion_constants.h>
#ifdef _OPENMP
#include <omp.h>
#endif
void
secret_key_init(secret_key_t *sk)
@@ -27,7 +30,59 @@ protocols_keygen(public_key_t *pk, secret_key_t *sk)
// iterating until a solution has been found
while (!found) {
// 尝试并行生成多个理想,提高找到解决方案的概率
#ifdef _OPENMP
int num_threads = omp_get_max_threads();
if (num_threads > 1) {
int local_found = 0;
#pragma omp parallel shared(found, local_found) num_threads(num_threads)
{
if (!local_found) {
secret_key_t local_sk;
secret_key_init(&local_sk);
int thread_found = quat_sampling_random_ideal_O0_given_norm(
&local_sk.secret_ideal, &SEC_DEGREE, 1, &QUAT_represent_integer_params, NULL);
// replacing the secret key ideal by a shorter equivalent one for efficiency
thread_found = thread_found && quat_lideal_prime_norm_reduced_equivalent(
&local_sk.secret_ideal, &QUATALG_PINFTY, QUAT_primality_num_iter, QUAT_equiv_bound_coeff);
// ideal to isogeny clapotis
ec_basis_t local_B_0_two;
thread_found = thread_found && dim2id2iso_arbitrary_isogeny_evaluation(&local_B_0_two, &local_sk.curve, &local_sk.secret_ideal);
if (thread_found) {
#pragma omp critical
{
if (!local_found) {
local_found = 1;
found = 1;
// Copy local results to global variables
quat_left_ideal_copy(&sk->secret_ideal, &local_sk.secret_ideal);
ec_curve_copy(&sk->curve, &local_sk.curve);
// Copy basis
ec_point_copy(&B_0_two.P, &local_B_0_two.P);
ec_point_copy(&B_0_two.Q, &local_B_0_two.Q);
ec_point_copy(&B_0_two.PmQ, &local_B_0_two.PmQ);
}
}
}
secret_key_finalize(&local_sk);
}
}
} else {
found = quat_sampling_random_ideal_O0_given_norm(
&sk->secret_ideal, &SEC_DEGREE, 1, &QUAT_represent_integer_params, NULL);
// replacing the secret key ideal by a shorter equivalent one for efficiency
found = found && quat_lideal_prime_norm_reduced_equivalent(
&sk->secret_ideal, &QUATALG_PINFTY, QUAT_primality_num_iter, QUAT_equiv_bound_coeff);
// ideal to isogeny clapotis
found = found && dim2id2iso_arbitrary_isogeny_evaluation(&B_0_two, &sk->curve, &sk->secret_ideal);
}
#else
found = quat_sampling_random_ideal_O0_given_norm(
&sk->secret_ideal, &SEC_DEGREE, 1, &QUAT_represent_integer_params, NULL);
@@ -36,8 +91,8 @@ protocols_keygen(public_key_t *pk, secret_key_t *sk)
&sk->secret_ideal, &QUATALG_PINFTY, QUAT_primality_num_iter, QUAT_equiv_bound_coeff);
// ideal to isogeny clapotis
found = found && dim2id2iso_arbitrary_isogeny_evaluation(&B_0_two, &sk->curve, &sk->secret_ideal);
#endif
}
// Assert the isogeny was found and images have the correct order
@@ -61,4 +116,4 @@ protocols_keygen(public_key_t *pk, secret_key_t *sk)
assert(fp2_is_one(&pk->curve.C) == 0xFFFFFFFF);
return found;
}
}

View File

@@ -0,0 +1,124 @@
#include "ml_predict.h"
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <math.h>
#include "../src/quaternion/ref/generic/include/intbig.h"
#include "../src/quaternion/ref/generic/include/quaternion.h"
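// POSIX headers: <sys/stat.h> and <sys/types.h> provide stat() and mkdir(), which are used to create the dataset directory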
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
// ==================== 辅助函数 ====================
// Narrow an ibz_t to a C long using mpz_get_si.
// Debugging aid only: a long cannot represent big numbers, so never use this
// to carry real values in production code.
// NOTE(review): despite the "_safe" suffix no range check is performed here -
// confirm callers tolerate inputs that do not fit in a long.
static long ibz_to_long_safe(const ibz_t *x) {
    const long narrowed = (long) mpz_get_si(*x);
    return narrowed;
}
// Compute a "trace" feature for an ideal.
// The value is twice the first basis coordinate (lattice.basis[0][0]) after
// narrowing it to a long; the original author uses that coordinate as an
// approximation of the real part a of a generator, so trace ~= 2 * a.
// NOTE(review): whether basis[0][0] really carries the generator's real part
// is the author's assumption - confirm against the quaternion module.
//
// I - ideal to inspect; NULL yields 0.
// Returns the doubled coordinate converted to int.
int quat_ideal_trace(const quat_left_ideal_t *I) {
    if (I == NULL) {
        return 0;
    }

    // Read the coordinate in place; no temporary big-integer copy is needed
    // just to narrow it.
    const long real_part = ibz_to_long_safe(&I->lattice.basis[0][0]);

    // Double in unsigned arithmetic: the narrowed value can be arbitrarily
    // large, and signed overflow in 2 * real_part would be undefined
    // behaviour. For values that do not overflow the result is unchanged.
    return (int)(2UL * (unsigned long)real_part);
}
// 改进的机器学习预测函数,基于更复杂的启发式模型
double ml_predict_success(long norm_val, int trace_val, int kernel_order) {
// 使用更细致的概率模型
double score = 0.0;
// 对范数进行对数缩放并评估
double log_norm = (norm_val > 0) ? log10((double)norm_val + 1) : 0;
if (log_norm < 12) {
score += 0.5; // 范数较小的理想更容易处理
} else if (log_norm < 15) {
score += 0.3;
} else if (log_norm < 18) {
score += 0.1;
}
// 对迹值进行评估
int abs_trace = abs(trace_val);
if (abs_trace < 100000000) {
score += 0.3; // 迹值较小时更优
} else if (abs_trace < 300000000) {
score += 0.15;
}
// 对核阶数进行评估
if (kernel_order == 2) {
score += 0.2; // 优先选择核阶数为2的理想
}
// 基于组合特征的调整
if (log_norm < 12 && abs_trace < 100000000) {
score += 0.2; // 范数和迹都很小的情况给予额外加分
}
// 限制分数范围
if (score > 1.0) score = 1.0;
// 转换为成功概率(非线性映射)
double probability = score * score; // 平方放大差异
return probability;
}
// Log one ideal attempt as a timestamped row in ./dataset/ideal_data.csv.
//
// attempt      - attempt number written to the row
// lideal_com   - ideal whose norm and trace features are recorded
// kernel_order - kernel order recorded and passed to the predictor
// success_flag - outcome recorded in the last column
//
// The ./dataset directory is created when stat() cannot find it. If the file
// cannot be opened, an error is reported with perror and nothing is written.
void ml_log_ideal_attempt(int attempt,
                          const quat_left_ideal_t *lideal_com,
                          int kernel_order,
                          int success_flag)
{
    const char *dir_path = "./dataset";
    struct stat st = {0};
    if (stat(dir_path, &st) == -1) {
        mkdir(dir_path, 0755);
    }

    // A fixed file name avoids creating a new file for every attempt.
    char csv_path[512];
    snprintf(csv_path, sizeof(csv_path), "%s/ideal_data.csv", dir_path);

    // Append mode so earlier rows are never overwritten.
    FILE *logfile = fopen(csv_path, "a+");
    if (!logfile) {
        perror("无法打开 ideal_data CSV 文件");
        return;
    }

    // Write the header only when the file is empty. The initial position of
    // an "a+" stream differs between C libraries (glibc starts at offset 0,
    // BSD/macOS at end-of-file), so seek to the end before asking ftell for
    // the size; otherwise the header could be repeated before every row.
    if (fseek(logfile, 0, SEEK_END) == 0 && ftell(logfile) == 0) {
        fprintf(logfile, "Timestamp,Attempt,Norm,Trace,KernelOrder,Prob,Success\n");
    }

    // Extract the features and the predictor's output for this ideal.
    long norm_val = ibz_to_long_safe(&lideal_com->norm);
    int trace_val = quat_ideal_trace(lideal_com);
    double prob = ml_predict_success(norm_val, trace_val, kernel_order);

    // Local-time timestamp for the row.
    time_t t = time(NULL);
    struct tm tm_info;
    localtime_r(&t, &tm_info);
    char time_str[32];
    strftime(time_str, sizeof(time_str), "%Y-%m-%d-%H-%M-%S", &tm_info);

    fprintf(logfile, "%s,%d,%ld,%d,%d,%.3f,%d\n",
            time_str, attempt, norm_val, trace_val, kernel_order, prob, success_flag);
    fclose(logfile);
}

View File

@@ -1,27 +1,148 @@
#include <signature.h>
#include <ml_predict.h> // 包含机器学习模型的头文件
#include "ml_predict.c" // 包含实现文件(仅用于测试,生产环境应编译链接)
#include <assert.h>
#include <tools.h>
#include <quaternion_data.h>
#include <id2iso.h>
#include <torsion_constants.h>
#include <encoded_sizes.h>
#include <intbig.h>
#ifdef _OPENMP
#include <omp.h>
#endif
// Forward declaration of the serial commitment helper, which is defined later
// in this file. It returns true when the whole attempt succeeded and
// post-increments *attempt_counter once per call when logging the attempt.
static bool execute_commit_serial(ec_curve_t *E_com, ec_basis_t *basis_even_com,
quat_left_ideal_t *lideal_com, int *attempt_counter, int kernel_order);
// compute the commitment with ideal to isogeny clapotis
// and apply it to the basis of E0 (together with the multiplication by some scalar u)
// 原代码如下,为了进行相应的测试,暂时注释掉
// static bool
// commit(ec_curve_t *E_com, ec_basis_t *basis_even_com, quat_left_ideal_t *lideal_com)
// {
// bool found = false;
// found = quat_sampling_random_ideal_O0_given_norm(lideal_com, &COM_DEGREE, 1, &QUAT_represent_integer_params, NULL);
// // replacing it with a shorter prime norm equivalent ideal
// found = found && quat_lideal_prime_norm_reduced_equivalent(
// lideal_com, &QUATALG_PINFTY, QUAT_primality_num_iter, QUAT_equiv_bound_coeff);
// // ideal to isogeny clapotis
// found = found && dim2id2iso_arbitrary_isogeny_evaluation(basis_even_com, E_com, lideal_com);
// return found;
// }
// 这是我的第一个版本的代码,需要进一步测试和调试,于是为了收集更多正确和错误的数据集,我在下面改了第二版
static bool
commit(ec_curve_t *E_com, ec_basis_t *basis_even_com, quat_left_ideal_t *lideal_com)
{
static int attempt_counter = 0;
bool found = false;
int kernel_order = 2;
// 使用并行计算来加速随机理想采样
#ifdef _OPENMP
int num_threads = omp_get_max_threads();
if (num_threads > 1) {
#pragma omp parallel for reduction(|:found) num_threads(num_threads)
for (int i = 0; i < num_threads && !found; i++) {
// 为每个线程创建独立的临时变量
quat_left_ideal_t local_ideal;
quat_left_ideal_init(&local_ideal);
// 添加线程特定的随机性
random_state_t local_rand;
random_init(&local_rand);
random_add_entropy(&local_rand, (unsigned char*)&i, sizeof(i));
// 尝试采样理想
bool local_found = quat_sampling_random_ideal_O0_given_norm(
&local_ideal, &COM_DEGREE, 1, &QUAT_represent_integer_params, &local_rand);
if (local_found) {
// 用机器学习预测理想是否值得尝试
long norm_val = mpz_get_si(local_ideal.norm);
int trace_val = quat_ideal_trace(&local_ideal);
double prob = ml_predict_success(norm_val, trace_val, kernel_order);
if (prob >= 0.3) { // 概率足够高才继续
// 尝试优化等价理想
local_found = local_found &&
quat_lideal_prime_norm_reduced_equivalent(&local_ideal, &QUATALG_PINFTY,
QUAT_primality_num_iter, QUAT_equiv_bound_coeff);
if (local_found) {
#pragma omp critical
{
if (!found) {
// 复制成功的结果到输出参数
quat_left_ideal_copy(lideal_com, &local_ideal);
found = true;
}
}
}
}
}
quat_left_ideal_finalize(&local_ideal);
random_finalize(&local_rand);
}
// 如果并行搜索成功,则执行同源映射评估
if (found) {
found = dim2id2iso_arbitrary_isogeny_evaluation(basis_even_com, E_com, lideal_com);
}
} else {
// 单线程情况 - 原有逻辑
found = execute_commit_serial(E_com, basis_even_com, lideal_com, &attempt_counter, kernel_order);
}
#else
// 没有OpenMP的情况 - 原有逻辑
found = execute_commit_serial(E_com, basis_even_com, lideal_com, &attempt_counter, kernel_order);
#endif
found = quat_sampling_random_ideal_O0_given_norm(lideal_com, &COM_DEGREE, 1, &QUAT_represent_integer_params, NULL);
// replacing it with a shorter prime norm equivalent ideal
found = found && quat_lideal_prime_norm_reduced_equivalent(
lideal_com, &QUATALG_PINFTY, QUAT_primality_num_iter, QUAT_equiv_bound_coeff);
// ideal to isogeny clapotis
found = found && dim2id2iso_arbitrary_isogeny_evaluation(basis_even_com, E_com, lideal_com);
return found;
}
// Helper: serial commitment attempt.
//
// Samples a random ideal of norm COM_DEGREE into lideal_com, asks the
// heuristic predictor whether it is worth pursuing and, if so, replaces it by
// a reduced equivalent ideal and evaluates the corresponding isogeny into
// E_com / basis_even_com. Every call logs exactly one attempt and
// post-increments *attempt_counter.
//
// Returns true only when sampling, reduction and isogeny evaluation all
// succeeded.
static bool execute_commit_serial(ec_curve_t *E_com, ec_basis_t *basis_even_com,
                                  quat_left_ideal_t *lideal_com, int *attempt_counter, int kernel_order)
{
    // Step 1: sample a random ideal.
    const bool sampled = quat_sampling_random_ideal_O0_given_norm(lideal_com, &COM_DEGREE, 1,
                                                                  &QUAT_represent_integer_params, NULL);

    // Step 2: let the predictor judge whether this ideal is worth the work.
    const long norm_feature = mpz_get_si(lideal_com->norm);
    const int trace_feature = quat_ideal_trace(lideal_com);
    const double success_prob = ml_predict_success(norm_feature, trace_feature, kernel_order);

    // Steps 3 and 4 run only when the predicted probability is not below the
    // 0.3 cut-off; otherwise the attempt is recorded as a failure.
    bool committed = false;
    if (!(success_prob < 0.3)) {
        committed = sampled
                    // Step 3: switch to a shorter equivalent ideal.
                    && quat_lideal_prime_norm_reduced_equivalent(lideal_com, &QUATALG_PINFTY,
                                                                 QUAT_primality_num_iter,
                                                                 QUAT_equiv_bound_coeff)
                    // Step 4: ideal-to-isogeny evaluation.
                    && dim2id2iso_arbitrary_isogeny_evaluation(basis_even_com, E_com, lideal_com);
    }

    // Step 5: record the outcome.
    ml_log_ideal_attempt((*attempt_counter)++, lideal_com, kernel_order, committed ? 1 : 0);
    return committed;
}
static void
compute_challenge_ideal_signature(quat_left_ideal_t *lideal_chall_two, const signature_t *sig, const secret_key_t *sk)
{