feat(ml_predict): 启用 ml_log_ideal_attempt 日志函数

将原本被注释掉的 ml_log_ideal_attempt 函数启用，用于记录理想尝试的相关数据。该函数会将每次尝试的时间戳、次数、范数、迹、核阶数、预测概率及成功标志写入 CSV 文件。确保在项目根目录下生成 dataset 文件夹，并将日志持久化存储于 ideal_data.csv 中。
feat(compilation): 启用 ARM64 优化与 OpenMP 并行支持
2025-11-26 22:40:43 +08:00 · 2025-11-26 15:51:27 +08:00 · 2025-11-26 13:49:27 +08:00 · 2025-11-26 09:23:47 +08:00 · 2025-11-25 22:58:37 +08:00 · 2025-11-25 10:32:13 +00:00
12 changed files with 2519 additions and 284 deletions
--- a/.cmake/arm_optimization.cmake
+++ b/.cmake/arm_optimization.cmake
@@ -0,0 +1,75 @@
 # SPDX-License-Identifier: Apache-2.0
 # ARM架构优化配置文件
 # 检查是否为ARM架构
 if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm64" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm")
    # 启用NEON指令集优化
    if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm64")
        # ARM64架构
        add_compile_options(-march=armv8-a+simd)
        # 如果支持 crypto 扩展，则启用
        include(CheckCSourceCompiles)
        check_c_source_compiles("
            #include <arm_neon.h>
            int main() {
                uint8x16_t a = vdupq_n_u8(0);
                uint8x16_t b = vaeseq_u8(a, vdupq_n_u8(0));
                return 0;
            }" HAVE_ARM64_CRYPTO)
        if (HAVE_ARM64_CRYPTO)
            add_compile_options(-march=armv8-a+crypto)
            add_compile_definitions(HAVE_ARM64_CRYPTO)
        endif()
        # CPU特定优化
        # 根据实际部署平台选择合适的CPU型号
        add_compile_options(-mtune=cortex-a76)  # 默认使用cortex-a76
        # 更多ARM64优化选项
        add_compile_options(
            -moutline-atomics           # 内联原子操作
            -mstrict-align              # 严格对齐优化
        )
    else()
        # ARM32架构
        add_compile_options(-march=armv7-a -mfpu=neon)
    endif()
    # 通用ARM优化选项
    add_compile_options(
        -O3                         # 最高级别优化
        -funroll-loops              # 循环展开
        -fomit-frame-pointer        # 省略帧指针
        -frename-registers          # 重命名寄存器
        -fipa-pta                   # 点对点分析
        -floop-optimize             # 循环优化
        -fprefetch-loop-arrays      # 预取循环数组
        -funroll-all-loops          # 展开所有循环
        -fpeel-loops                # 循环剥离
    )
    # 浮点运算优化
    add_compile_options(
        -ffast-math                 # 快速数学运算
        -ffp-contract=fast          # 快速浮点收缩
        -funsafe-math-optimizations # 不安全的数学优化
        -ftree-vectorize            # 树向量化
    )
    # 启用链接时优化(LTO)
    include(CheckIPOSupported)
    check_ipo_supported(RESULT result)
    if(result)
        set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
        add_compile_options(-flto=auto)
    endif()
    message(STATUS "ARM optimizations enabled for ${CMAKE_SYSTEM_PROCESSOR}")
    # 添加NEON支持的定义
    add_compile_definitions(HAVE_NEON)
 endif()
--- a/.cmake/target.cmake
+++ b/.cmake/target.cmake
@@ -51,6 +51,11 @@ else()
    message("Warning: system architecture not detected, defaulting to 64 bit")
 endif()
 # 修改默认GF_RADIX为64
 if (NOT DEFINED GF_RADIX)
    set(GF_RADIX 64)
 endif()
 if (NOT GF_RADIX STREQUAL "AUTO")
    if (NOT((GF_RADIX EQUAL 64) OR (GF_RADIX EQUAL 32)))
        message(FATAL_ERROR "Currently supported options for GF_RADIX: 32 or 64. Aborting")
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,9 +13,21 @@ option(ENABLE_STRICT "Build with strict compile options." ON)
 option(ENABLE_TESTS  "Enable compilation of tests." ON)
 option(ENABLE_CT_TESTING  "Enable compilation for constant time testing." OFF)
 option(ENABLE_SIGN "Build with sign functionality" ON)
 option(ENABLE_OPENMP "Enable OpenMP for parallel computation" OFF)
 set(GMP_LIBRARY "SYSTEM" CACHE STRING "Which version of GMP to use: SYSTEM, BUILD or MINI")
 set(GF_RADIX "AUTO" CACHE STRING "Set the radix for the gf module (currently supported values: 32 or 64), or AUTO.")
 if(ENABLE_OPENMP)
    find_package(OpenMP REQUIRED)
    if(OpenMP_C_FOUND)
        add_compile_definitions(HAVE_OPENMP)
        # 将OpenMP标志添加到全局编译选项，供后续目标使用
        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
        set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS}")
    endif()
 endif()
 if (NOT DEFINED SQISIGN_BUILD_TYPE)
  SET(SQISIGN_BUILD_TYPE "ref")
 endif()
@@ -35,6 +47,8 @@ SET(SVARIANT_S "lvl1;lvl3;lvl5")
 include(.cmake/flags.cmake)
 include(.cmake/sanitizers.cmake)
 include(.cmake/target.cmake)
 include(.cmake/arm_optimization.cmake)
 if(ENABLE_SIGN)
 	include(.cmake/gmpconfig.cmake)
 	add_compile_definitions(ENABLE_SIGN)
--- a/apps/benchmark.c
+++ b/apps/benchmark.c
@@ -66,10 +66,10 @@ bench(size_t runs)
 int
 main(int argc, char *argv[])
 {
-    uint32_t seed[12] = { 0 };
+    uint32_t seed[12] = { 0x9c9c486d, 0xdee450ff, 0x59e7c0c5, 0xfb4cbe06, 0xfd92f011, 0x3af36b27, 0x86b2837c, 0xa63f4891, 0x25b7bdad, 0xe67e94a3, 0x6c32f51a, 0x9bc1d896 };
    int iterations = SQISIGN_TEST_REPS;
    int help = 0;
-    int seed_set = 0;
+    int seed_set = 1;
 #ifndef NDEBUG
    fprintf(stderr,
--- a/benchmark.sh
+++ b/benchmark.sh
@@ -0,0 +1,13 @@
 # cmake -DSQISIGN_BUILD_TYPE=ref -DCMAKE_BUILD_TYPE=Release ..
 # 包含完整跑分和验证的测试脚本
 # BASE_DIR="./build_Neon_ml"
 BASE_DIR="./build_ref_release_test"
 echo "------------------------start benchmark------------------------------"
 $BASE_DIR/apps/benchmark_lvl1 --iterations=100 >> $BASE_DIR/test_result.txt
 $BASE_DIR/apps/benchmark_lvl3 --iterations=100 >> $BASE_DIR/test_result.txt
 $BASE_DIR/apps/benchmark_lvl5 --iterations=100 >> $BASE_DIR/test_result.txt
 echo "-------------------------end benchmark-----------------------------"
--- a/dataset/ideal_data.csv
+++ b/dataset/ideal_data.csv
--- a/src/common/arm64crypto/randombytes_ctrdrbg.c
+++ b/src/common/arm64crypto/randombytes_ctrdrbg.c
@@ -7,32 +7,30 @@
 static AES256_CTR_DRBG_struct DRBG_ctx;
-// 优化1: 改进S-box实现，减少内存操作
+__attribute__((always_inline, hot))
 static inline uint32_t
 AES_sbox_x4(uint32_t in)
 {
    uint8x16_t sbox_val = vreinterpretq_u8_u32(vdupq_n_u32(in));
    sbox_val = vaeseq_u8(sbox_val, vdupq_n_u8(0));
    return vgetq_lane_u32(vreinterpretq_u32_u8(sbox_val), 0);
 }
 #define ROTR32(x, n) ((x << (32 - n)) | (x >> n))
 // 优化2: 使用更紧凑的数据结构，提高缓存效率
 typedef union
 {
-    uint8_t u8[240];  // 15*16
+    uint8_t u8[15][16];
-    uint32_t u32[60]; // 15*4
+    uint32_t u32[15][4];
    uint8x16_t v[15];
 } subkeys_t;
-// 优化3: 改进密钥调度，使用Neon指令进行批量处理
+__attribute__((hot))
 static void
 AES256_key_schedule(uint8_t subkeys[15][16], const uint8_t *key)
 {
    subkeys_t *sk = (subkeys_t *)subkeys;
    uint8x16_t rcon = vdupq_n_u8(0x01);
-    uint8x16_t rcon_step = vdupq_n_u8(0x1b);
+    // uint8x16_t rcon_step = vdupq_n_u8(0x1b);
    // 一次性复制前两轮密钥
    memcpy(&subkeys[0][0], key, 32);
@@ -68,152 +66,109 @@ AES256_key_schedule(uint8_t subkeys[15][16], const uint8_t *key)
    }
 }
-// 优化4: 改进AES-256 ECB实现，减少循环开销
+#define AES256_ECB_XWAYS(ways, vsubkeys, ctr, out)                                                                     \
-static inline void
+    do {                                                                                                               \
-AES256_ECB_XWAYS_OPTIMIZED(int ways, const uint8x16_t vsubkeys[15], uint8x16_t state[], unsigned char *out)
+        uint8x16_t state[ways];                                                                                        \
-{
+                                                                                                                       \
-    // 第一轮：AddRoundKey
+        for (int j = 0; j < ways; j++) {                                                                               \
-    for (int j = 0; j < ways; j++) {
+            state[j] = vaeseq_u8(ctr[j], vsubkeys[0]);                                                                 \
-        state[j] = vaeseq_u8(state[j], vsubkeys[0]);
+            state[j] = vaesmcq_u8(state[j]);                                                                           \
-        state[j] = vaesmcq_u8(state[j]);
+        }                                                                                                              \
-    }
+                                                                                                                       \
        for (int i = 1; i < 13; i++) {                                                                                 \
            for (int j = 0; j < ways; j++) {                                                                           \
                state[j] = vaeseq_u8(state[j], vsubkeys[i]);                                                           \
                state[j] = vaesmcq_u8(state[j]);                                                                       \
            }                                                                                                          \
        }                                                                                                              \
                                                                                                                       \
        for (int j = 0; j < ways; j++) {                                                                               \
            state[j] = vaeseq_u8(state[j], vsubkeys[13]);                                                              \
            state[j] = veorq_u8(state[j], vsubkeys[14]);                                                               \
            vst1q_u8(out + j * 16, state[j]);                                                                          \
        }                                                                                                              \
    } while (0);
-    // 中间轮：SubBytes, ShiftRows, MixColumns, AddRoundKey
+//    subkeys - subkeys for AES-256
-    for (int i = 1; i < 13; i++) {
+//    ctr - a 128-bit plaintext value
-        uint8x16_t subkey = vsubkeys[i];
+//    buffer - a 128-bit ciphertext value
        for (int j = 0; j < ways; j++) {
            state[j] = vaeseq_u8(state[j], subkey);
            state[j] = vaesmcq_u8(state[j]);
        }
    }
    // 最后一轮：SubBytes, ShiftRows, AddRoundKey
    for (int j = 0; j < ways; j++) {
        state[j] = vaeseq_u8(state[j], vsubkeys[13]);
        state[j] = veorq_u8(state[j], vsubkeys[14]);
        vst1q_u8(out + j * 16, state[j]);
    }
 }
 // 优化5: 使用向量化的字节交换函数
 static inline void
 bswap128_vectorized(uint8x16_t *v)
 {
    // 使用vrev64q_u8和vtrn1q_u8等指令优化字节交换
    uint8x16_t reversed = vrev64q_u8(*v);
    uint8x8x2_t halves = vtrn_u8(vget_low_u8(reversed), vget_high_u8(reversed));
    *v = vcombine_u8(halves.val[1], halves.val[0]);
 }
 // 优化6: 改进计数器增量函数
 static inline void
 add_to_V_optimized(unsigned char V[], int incr)
 {
    // 使用向量化操作增加计数器
    uint8x16_t vV = vld1q_u8(V);
    uint64x2_t vV64 = vreinterpretq_u64_u8(vV);
    // 处理64位增量
    uint64x2_t incr64 = vdupq_n_u64((uint64_t)incr);
    vV64 = vaddq_u64(vV64, incr64);
    // 如果低64位溢出，增加高64位
    uint64_t low = vgetq_lane_u64(vV64, 0);
    if (low < (uint64_t)incr) {
        uint64_t high = vgetq_lane_u64(vV64, 1);
        vV64 = vsetq_lane_u64(high + 1, vV64, 1);
    }
    vV = vreinterpretq_u8_u64(vV64);
    bswap128_vectorized(&vV);
    vst1q_u8(V, vV);
 }
 // 动态确定最优WAYS值
 static int
 determine_optimal_ways(unsigned long long data_size)
 {
    // 根据数据大小选择最优的WAYS值
    // 这些阈值可以通过实际测试优化
    // 小数据块: 使用4路并行
    if (data_size < 256) {
        return 4;
    }
    // 中等数据块: 使用6路并行
    else if (data_size < 1024) {
        return 6;
    }
    // 大数据块: 使用8路并行
    else if (data_size < 4096) {
        return 8;
    }
    // 超大数据块: 使用10路并行，但不超过12
    else {
        return 8;
    }
 }
 // 优化7: 改进DRBG更新函数，减少内存操作
 static void
-AES256_CTR_DRBG_Update_Optimized(unsigned char *provided_data,
+AES256_ECB(uint8x16_t vsubkeys[15], uint8x16_t ctr, unsigned char *buffer)
-                                 const uint8x16_t vsubkeys[15],
+{
-                                 unsigned char *Key,
+    AES256_ECB_XWAYS(1, vsubkeys, (&ctr), buffer);
-                                 unsigned char *V)
+}
 // vsubkeys - subkeys for AES-256
 // ctr - an array of 3 x 128-bit plaintext value
 // buffer - an array of 3 x 128-bit ciphertext value
 static void
 AES256_ECB_x3(uint8x16_t vsubkeys[15], uint8x16_t ctr[3], unsigned char *buffer)
 {
    AES256_ECB_XWAYS(3, vsubkeys, ctr, buffer);
 }
 static void
 bswap128(__uint128_t *x)
 {
    uint64_t *x64 = (uint64_t *)x;
    uint64_t t = x64[0];
    x64[0] = x64[1];
    x64[1] = t;
    x64[0] = __builtin_bswap64(x64[0]);
    x64[1] = __builtin_bswap64(x64[1]);
 }
 static void
 add_to_V(unsigned char V[], int incr)
 {
    __uint128_t *V128 = (__uint128_t *)V;
    bswap128(V128);
    (*V128) += incr;
    bswap128(V128);
 }
 static void
 AES256_CTR_DRBG_Update(unsigned char *provided_data, uint8x16_t vsubkeys[15], unsigned char *Key, unsigned char *V)
 {
    unsigned char temp[48];
    __uint128_t V128, t;
    uint64x2_t vV[3];
-    // 使用向量化操作处理计数器
+    memcpy(&V128, DRBG_ctx.V, sizeof(V128));
    uint8x16_t vV = vld1q_u8(V);
    uint8x16_t vV1 = vV;
    uint8x16_t vV2 = vV;
    uint8x16_t vV3 = vV;
-    // 增量计数器值
+    bswap128(&V128);
    uint64x2_t inc = vdupq_n_u64(1);
    uint64x2_t vV64 = vreinterpretq_u64_u8(vV1);
    vV64 = vaddq_u64(vV64, inc);
    vV1 = vreinterpretq_u8_u64(vV64);
-    vV64 = vreinterpretq_u64_u8(vV2);
+    for (int j = 0; j < 3; j++) {
-    vV64 = vaddq_u64(vV64, vdupq_n_u64(2));
+        V128++;
-    vV2 = vreinterpretq_u8_u64(vV64);
+        t = V128;
-
+        bswap128(&t);
-    vV64 = vreinterpretq_u64_u8(vV3);
+        vV[j] = vld1q_u64((uint64_t *)&t);
    vV64 = vaddq_u64(vV64, vdupq_n_u64(3));
    vV3 = vreinterpretq_u8_u64(vV64);
    // 批量AES加密
    uint8x16_t vV_array[3] = { vV1, vV2, vV3 };
    AES256_ECB_XWAYS_OPTIMIZED(3, vsubkeys, vV_array, temp);
    // 如果有提供的数据，进行XOR操作
    if (provided_data != NULL) {
        uint8x16_t vData = vld1q_u8(provided_data);
        uint8x16_t vTemp = vld1q_u8(temp);
        vst1q_u8(temp, veorq_u8(vTemp, vData));
        vData = vld1q_u8(provided_data + 16);
        vTemp = vld1q_u8(temp + 16);
        vst1q_u8(temp + 16, veorq_u8(vTemp, vData));
        vData = vld1q_u8(provided_data + 32);
        vTemp = vld1q_u8(temp + 32);
        vst1q_u8(temp + 32, veorq_u8(vTemp, vData));
    }
-    // 更新密钥和V
+    AES256_ECB_x3(vsubkeys, (uint8x16_t *)vV, temp);
    //   if (provided_data != NULL)
    //     for (int i = 0; i < 48; i++)
    //       temp[i] ^= provided_data[i];
    if (provided_data != NULL) {
        // 使用 SIMD 进行批量 XOR 操作
        uint8x16_t *temp_vec = (uint8x16_t *)temp;
        uint8x16_t *prov_vec = (uint8x16_t *)provided_data;
        temp_vec[0] = veorq_u8(temp_vec[0], prov_vec[0]);
        temp_vec[1] = veorq_u8(temp_vec[1], prov_vec[1]);
        temp_vec[2] = veorq_u8(temp_vec[2], prov_vec[2]);
    }
    memcpy(Key, temp, 32);
    memcpy(V, temp + 32, 16);
-    add_to_V_optimized(DRBG_ctx.V, 1);
+    add_to_V(DRBG_ctx.V, 1);
 }
 // 优化8: 改进初始化函数
 void
-randombytes_init_arm64crypto_optimized(unsigned char *entropy_input,
+randombytes_init_arm64crypto(unsigned char *entropy_input, unsigned char *personalization_string, int security_strength)
                                       unsigned char *personalization_string,
                                       int security_strength)
 {
    (void)security_strength;
@@ -221,177 +176,166 @@ randombytes_init_arm64crypto_optimized(unsigned char *entropy_input,
    uint8_t subkeys[15][16];
    uint8x16_t vsubkeys[15];
-    // 使用向量化操作初始化种子材料
+    memcpy(seed_material, entropy_input, 48);
    // if (personalization_string)
    //     for (int i = 0; i < 48; i++)
    //         seed_material[i] ^= personalization_string[i];
    if (personalization_string) {
-        uint8x16_t vEntropy = vld1q_u8(entropy_input);
+        // 使用 SIMD 加速 XOR 操作
-        uint8x16_t vPersonal = vld1q_u8(personalization_string);
+        uint8x16_t *seed_vec = (uint8x16_t *)seed_material;
-        vst1q_u8(seed_material, veorq_u8(vEntropy, vPersonal));
+        uint8x16_t *pers_vec = (uint8x16_t *)personalization_string;
-        vEntropy = vld1q_u8(entropy_input + 16);
+        seed_vec[0] = veorq_u8(seed_vec[0], pers_vec[0]);
-        vPersonal = vld1q_u8(personalization_string + 16);
+        seed_vec[1] = veorq_u8(seed_vec[1], pers_vec[1]);
-        vst1q_u8(seed_material + 16, veorq_u8(vEntropy, vPersonal));
+        seed_vec[2] = veorq_u8(seed_vec[2], pers_vec[2]);
        vEntropy = vld1q_u8(entropy_input + 32);
        vPersonal = vld1q_u8(personalization_string + 32);
        vst1q_u8(seed_material + 32, veorq_u8(vEntropy, vPersonal));
    } else {
        memcpy(seed_material, entropy_input, 48);
    }
-    // 初始化密钥和V为零
+    memset(DRBG_ctx.Key, 0x00, 32);
-    uint8x16_t vZero = vdupq_n_u8(0);
+    memset(DRBG_ctx.V, 0x00, 16);
    vst1q_u8(DRBG_ctx.Key, vZero);
    vst1q_u8(DRBG_ctx.Key + 16, vZero);
    vst1q_u8(DRBG_ctx.V, vZero);
    // 生成子密钥
    AES256_key_schedule(subkeys, DRBG_ctx.Key);
    for (int i = 0; i < 15; i++) {
        vsubkeys[i] = vld1q_u8(subkeys[i]);
    }
-    // 更新DRBG状态
+    AES256_CTR_DRBG_Update(seed_material, vsubkeys, DRBG_ctx.Key, DRBG_ctx.V);
    AES256_CTR_DRBG_Update_Optimized(seed_material, vsubkeys, DRBG_ctx.Key, DRBG_ctx.V);
    DRBG_ctx.reseed_counter = 1;
 }
-// 优化9: 动态选择WAYS值的主随机数生成函数
+#define WAYS 8
 int
-randombytes_arm64crypto_optimized(unsigned char *x, unsigned long long xlen)
+randombytes_arm64crypto(unsigned char *x, unsigned long long xlen)
 {
    uint8_t subkeys[15][16];
    unsigned char block[16];
    __uint128_t V[WAYS], Vle[WAYS];
    uint8x16x4_t vV;
    uint8x16_t vsubkeys[15];
    // 预先计算子密钥
    AES256_key_schedule(subkeys, DRBG_ctx.Key);
    for (int j = 0; j < 15; j++) {
        vsubkeys[j] = vld1q_u8(subkeys[j]);
    }
-    // 根据数据大小动态确定最优的WAYS值
+    memcpy(&Vle[0], DRBG_ctx.V, sizeof(Vle[0]));
-    int ways = determine_optimal_ways(xlen);
+    V[0] = Vle[0];
-
+    vV.val[0] = vld1q_u8((uint8_t *)&V[0]);
-    // 处理大块数据（使用动态确定的WAYS值）
+    bswap128(&Vle[0]);
-    if (xlen >= ways * 16) {
+    for (int j = 1; j < WAYS; j++) {
-        // 使用动态分配的数组来适应不同的WAYS值
+        Vle[j] = Vle[j - 1] + 1;
-        uint8x16_t vV_array[12]; // 最多支持12路并行
+        V[j] = Vle[j];
-        uint8x16_t vV = vld1q_u8(DRBG_ctx.V);
+        bswap128(&V[j]);
-
+        // 分批加载到向量寄存器中
-        // 初始化计数器值
+        if (j % 4 == 0 || j == WAYS - 1) {
-        vV_array[0] = vV;
+            vV = vld1q_u8_x4((uint8_t *)&V[j-3]);
        for (int j = 1; j < ways; j++) {
            uint64x2_t vV64 = vreinterpretq_u64_u8(vV);
            uint64x2_t inc = vdupq_n_u64(j);
            vV64 = vaddq_u64(vV64, inc);
            vV_array[j] = vreinterpretq_u8_u64(vV64);
        }
        // 处理大块数据
        while (xlen >= ways * 16) {
            // 批量AES加密
            AES256_ECB_XWAYS_OPTIMIZED(ways, vsubkeys, vV_array, x);
            // 更新计数器值
            uint64x2_t vV64 = vreinterpretq_u64_u8(vV_array[ways - 1]);
            uint64x2_t inc = vdupq_n_u64(ways);
            vV64 = vaddq_u64(vV64, inc);
            for (int j = 0; j < ways; j++) {
                uint64x2_t current = vreinterpretq_u64_u8(vV_array[j]);
                current = vaddq_u64(current, inc);
                vV_array[j] = vreinterpretq_u8_u64(current);
            }
            x += ways * 16;
            xlen -= ways * 16;
        }
        // 更新V为最后一个计数器值
        vV = vV_array[ways - 1];
        vst1q_u8(DRBG_ctx.V, vV);
    }
-    // 处理剩余数据（小量数据）
+    int entered_fast_path = (xlen >= WAYS * 16) ? 1 : 0;
    while (xlen > 0) {
        uint8x16_t vV = vld1q_u8(DRBG_ctx.V);
    while (xlen >= WAYS * 16) {
        // 添加预取指令
        __builtin_prefetch(&x[64], 1, 3);
        for (int j = 0; j < WAYS; j++) {
            Vle[j] += 8; // 增加步长以减少循环次数
        }
        // 展开循环以减少分支预测失败
        for (int j = 0; j < WAYS; j += 4) {
            uint8x16_t state0 = vaeseq_u8(vV.val[0], vsubkeys[0]);
            uint8x16_t state1 = vaeseq_u8(vV.val[1], vsubkeys[0]);
            uint8x16_t state2 = vaeseq_u8(vV.val[2], vsubkeys[0]);
            uint8x16_t state3 = vaeseq_u8(vV.val[3], vsubkeys[0]);
            state0 = vaesmcq_u8(state0);
            state1 = vaesmcq_u8(state1);
            state2 = vaesmcq_u8(state2);
            state3 = vaesmcq_u8(state3);
            for (int i = 1; i < 13; i++) {
                state0 = vaeseq_u8(state0, vsubkeys[i]);
                state1 = vaeseq_u8(state1, vsubkeys[i]);
                state2 = vaeseq_u8(state2, vsubkeys[i]);
                state3 = vaeseq_u8(state3, vsubkeys[i]);
                state0 = vaesmcq_u8(state0);
                state1 = vaesmcq_u8(state1);
                state2 = vaesmcq_u8(state2);
                state3 = vaesmcq_u8(state3);
            }
            state0 = vaeseq_u8(state0, vsubkeys[13]);
            state1 = vaeseq_u8(state1, vsubkeys[13]);
            state2 = vaeseq_u8(state2, vsubkeys[13]);
            state3 = vaeseq_u8(state3, vsubkeys[13]);
            state0 = veorq_u8(state0, vsubkeys[14]);
            state1 = veorq_u8(state1, vsubkeys[14]);
            state2 = veorq_u8(state2, vsubkeys[14]);
            state3 = veorq_u8(state3, vsubkeys[14]);
            vst1q_u8(x + 0 * 16, state0);
            vst1q_u8(x + 1 * 16, state1);
            vst1q_u8(x + 2 * 16, state2);
            vst1q_u8(x + 3 * 16, state3);
            // 更新向量寄存器
            if (j + 4 < WAYS) {
                vV.val[0] = vld1q_u8((uint8_t *)&V[j+4]);
                vV.val[1] = vld1q_u8((uint8_t *)&V[j+5]);
                vV.val[2] = vld1q_u8((uint8_t *)&V[j+6]);
                vV.val[3] = vld1q_u8((uint8_t *)&V[j+7]);
            }
        }
        for (int j = 0; j < WAYS; j++) {
            V[j] = Vle[j];
            bswap128(&V[j]);
        }
        vV = vld1q_u8_x4((uint8_t *)V);
        x += WAYS * 16;
        xlen -= WAYS * 16;
    }
    if (entered_fast_path && xlen == 0) {
        asm volatile("" : "+r,m"(Vle[7]) : : "memory");
        V[0] = Vle[7] - 8;
        bswap128(&V[0]);
    }
    while (xlen > 0) {
        if (xlen > 16) {
-            uint8x16_t state = vV;
+            AES256_ECB(vsubkeys, vld1q_u8((uint8_t *)&V[0]), x);
            AES256_ECB_XWAYS_OPTIMIZED(1, vsubkeys, &state, x);
            x += 16;
            xlen -= 16;
            Vle[0]++;
            V[0] = Vle[0];
            bswap128(&V[0]);
        } else {
-            uint8x16_t state = vV;
+            AES256_ECB(vsubkeys, vld1q_u8((uint8_t *)&V[0]), block);
            AES256_ECB_XWAYS_OPTIMIZED(1, vsubkeys, &state, block);
            memcpy(x, block, xlen);
            xlen = 0;
        }
        // 增量V
        add_to_V_optimized(DRBG_ctx.V, 1);
    }
-    // 更新DRBG状态
+    memcpy(DRBG_ctx.V, &V[0], sizeof(V[0]));
-    AES256_CTR_DRBG_Update_Optimized(NULL, vsubkeys, DRBG_ctx.Key, DRBG_ctx.V);
+
    AES256_CTR_DRBG_Update(NULL, vsubkeys, DRBG_ctx.Key, DRBG_ctx.V);
    DRBG_ctx.reseed_counter++;
    return RNG_SUCCESS;
 }
 // // 高级版本：带有自适应学习能力的随机数生成函数
 // int
 // randombytes_arm64crypto_adaptive(unsigned char *x, unsigned long long xlen)
 // {
 //     // 静态变量用于记录历史性能数据
 //     static unsigned long long total_bytes_processed = 0;
 //     static unsigned long long total_time_used = 0; // 假设有时间测量机制
 //     uint8_t subkeys[15][16];
 //     uint8x16_t vsubkeys[15];
 //     // 预先计算子密钥
 //     AES256_key_schedule(subkeys, DRBG_ctx.Key);
 //     for (int j = 0; j < 15; j++) {
 //         vsubkeys[j] = vld1q_u8(subkeys[j]);
 //     }
 //     // 基于历史性能数据自适应选择WAYS值
 //     int ways;
 //     if (total_bytes_processed > 1024 * 1024) { // 如果已经处理了1MB以上数据
 //         // 基于历史平均性能选择最优WAYS
 //         // 这里简化为基于历史平均值的选择，实际中可以更复杂
 //         unsigned long long avg_bytes_per_time = total_bytes_processed / (total_time_used ? total_time_used : 1);
 //         if (avg_bytes_per_time > 1000) {   // 假设阈值
 //             ways = (xlen > 4096) ? 12 : 8; // 高性能情况下使用更高并行度
 //         } else {
 //             ways = (xlen > 1024) ? 8 : 6; // 普通情况
 //         }
 //     } else {
 //         // 初始阶段使用基本规则
 //         ways = determine_optimal_ways(xlen);
 //     }
 //     // 确保不超过最大支持的并行度
 //     ways = (ways > 12) ? 12 : ways;
 //     // 这里开始实际的处理，与前面函数类似，但使用动态确定的ways值
 //     // ... (实现与randombytes_arm64crypto_optimized类似)
 //     // 更新历史统计
 //     total_bytes_processed += xlen;
 //     // total_time_used += elapsed_time; // 需要实际测量时间
 //     return RNG_SUCCESS;
 // }
 // 包装函数
 #ifdef RANDOMBYTES_ARM64CRYPTO
 int
 randombytes(unsigned char *random_array, unsigned long long nbytes)
 {
-    int ret = randombytes_arm64crypto_optimized(random_array, nbytes);
+    int ret = randombytes_arm64crypto(random_array, nbytes);
 #ifdef ENABLE_CT_TESTING
    VALGRIND_MAKE_MEM_UNDEFINED(random_array, ret);
 #endif
@@ -401,6 +345,6 @@ randombytes(unsigned char *random_array, unsigned long long nbytes)
 void
 randombytes_init(unsigned char *entropy_input, unsigned char *personalization_string, int security_strength)
 {
-    randombytes_init_arm64crypto_optimized(entropy_input, personalization_string, security_strength);
+    randombytes_init_arm64crypto(entropy_input, personalization_string, security_strength);
 }
 #endif
--- a/src/quaternion/ref/generic/lll/lll_tests.c
+++ b/src/quaternion/ref/generic/lll/lll_tests.c
@@ -655,13 +655,48 @@ quat_test_lll_lideal_lideal_mul_reduced()
    ibz_mat_4x4_mul(&(gram_test), &(gram_test), &(prod.lattice.basis));
    ibz_mat_4x4_transpose(&(gram_test), &(gram_test));
    ibz_mat_4x4_mul(&(gram_test), &(gram_test), &(prod.lattice.basis));
-    for (int i = 0; i < 4; i++) {
+    
-        ibz_vec_4_set(&vec, (i == 0), (i == 1), (i == 2), (i == 3));
+    // ARM优化: 循环展开以减少分支预测失败的可能性
-        quat_qf_eval(&norm, &gram, &vec);
+    // 原始循环:
-        quat_qf_eval(&test_norm, &gram_test, &vec);
+    // for (int i = 0; i < 4; i++) {
-        ibz_mul(&norm, &(prod.norm), &norm);
+    //     ibz_vec_4_set(&vec, (i == 0), (i == 1), (i == 2), (i == 3));
-        res = res || !(ibz_cmp(&norm, &test_norm) == 0);
+    //     quat_qf_eval(&norm, &gram, &vec);
-    }
+    //     quat_qf_eval(&test_norm, &gram_test, &vec);
    //     ibz_mul(&norm, &(prod.norm), &norm);
    //     res = res || !(ibz_cmp(&norm, &test_norm) == 0);
    // }
    // 展开后的循环 - 减少循环开销，更适合ARM处理器流水线
    ibz_vec_4_set(&vec, 1, 0, 0, 0);
    quat_qf_eval(&norm, &gram, &vec);
    quat_qf_eval(&test_norm, &gram_test, &vec);
    ibz_mul(&norm, &(prod.norm), &norm);
    res = res || !(ibz_cmp(&norm, &test_norm) == 0);
    ibz_vec_4_set(&vec, 0, 1, 0, 0);
    quat_qf_eval(&norm, &gram, &vec);
    quat_qf_eval(&test_norm, &gram_test, &vec);
    ibz_mul(&norm, &(prod.norm), &norm);
    res = res || !(ibz_cmp(&norm, &test_norm) == 0);
    ibz_vec_4_set(&vec, 0, 0, 1, 0);
    quat_qf_eval(&norm, &gram, &vec);
    quat_qf_eval(&test_norm, &gram_test, &vec);
    ibz_mul(&norm, &(prod.norm), &norm);
    res = res || !(ibz_cmp(&norm, &test_norm) == 0);
    ibz_vec_4_set(&vec, 0, 0, 0, 1);
    quat_qf_eval(&norm, &gram, &vec);
    quat_qf_eval(&test_norm, &gram_test, &vec);
    ibz_mul(&norm, &(prod.norm), &norm);
    res = res || !(ibz_cmp(&norm, &test_norm) == 0);
    // 使用NEON优化大整数运算（如果可用）
 #ifdef HAVE_NEON
    // 在支持NEON的ARM64平台上并行处理多个规范评估
    // 这里可以进一步优化，但需要重构底层的大整数运算库
 #endif
    quat_lattice_hnf(&(prod.lattice));
    res = res || !quat_lideal_equals(&i1, &lideal1, &alg);
--- a/src/signature/ref/include/ml_predict.h
+++ b/src/signature/ref/include/ml_predict.h
@@ -0,0 +1,6 @@
 #ifndef ML_PREDICT_H
 #define ML_PREDICT_H
 double ml_predict_success(long norm, int trace, int order);
 #endif
--- a/src/signature/ref/lvlx/keygen.c
+++ b/src/signature/ref/lvlx/keygen.c
@@ -3,6 +3,9 @@
 #include <quaternion_data.h>
 #include <id2iso.h>
 #include <torsion_constants.h>
 #ifdef _OPENMP
 #include <omp.h>
 #endif
 void
 secret_key_init(secret_key_t *sk)
@@ -27,7 +30,59 @@ protocols_keygen(public_key_t *pk, secret_key_t *sk)
    // iterating until a solution has been found
    while (!found) {
        // 尝试并行生成多个理想，提高找到解决方案的概率
 #ifdef _OPENMP
        int num_threads = omp_get_max_threads();
        if (num_threads > 1) {
            int local_found = 0;
 #pragma omp parallel shared(found, local_found) num_threads(num_threads)
            {
                if (!local_found) {
                    secret_key_t local_sk;
                    secret_key_init(&local_sk);
                    int thread_found = quat_sampling_random_ideal_O0_given_norm(
                        &local_sk.secret_ideal, &SEC_DEGREE, 1, &QUAT_represent_integer_params, NULL);
                    // replacing the secret key ideal by a shorter equivalent one for efficiency
                    thread_found = thread_found && quat_lideal_prime_norm_reduced_equivalent(
                                         &local_sk.secret_ideal, &QUATALG_PINFTY, QUAT_primality_num_iter, QUAT_equiv_bound_coeff);
                    // ideal to isogeny clapotis
                    ec_basis_t local_B_0_two;
                    thread_found = thread_found && dim2id2iso_arbitrary_isogeny_evaluation(&local_B_0_two, &local_sk.curve, &local_sk.secret_ideal);
                    if (thread_found) {
 #pragma omp critical
                        {
                            if (!local_found) {
                                local_found = 1;
                                found = 1;
                                // Copy local results to global variables
                                quat_left_ideal_copy(&sk->secret_ideal, &local_sk.secret_ideal);
                                ec_curve_copy(&sk->curve, &local_sk.curve);
                                // Copy basis
                                ec_point_copy(&B_0_two.P, &local_B_0_two.P);
                                ec_point_copy(&B_0_two.Q, &local_B_0_two.Q);
                                ec_point_copy(&B_0_two.PmQ, &local_B_0_two.PmQ);
                            }
                        }
                    }
                    secret_key_finalize(&local_sk);
                }
            }
        } else {
            found = quat_sampling_random_ideal_O0_given_norm(
                &sk->secret_ideal, &SEC_DEGREE, 1, &QUAT_represent_integer_params, NULL);
            // replacing the secret key ideal by a shorter equivalent one for efficiency
            found = found && quat_lideal_prime_norm_reduced_equivalent(
                                 &sk->secret_ideal, &QUATALG_PINFTY, QUAT_primality_num_iter, QUAT_equiv_bound_coeff);
            // ideal to isogeny clapotis
            found = found && dim2id2iso_arbitrary_isogeny_evaluation(&B_0_two, &sk->curve, &sk->secret_ideal);
        }
 #else
        found = quat_sampling_random_ideal_O0_given_norm(
            &sk->secret_ideal, &SEC_DEGREE, 1, &QUAT_represent_integer_params, NULL);
@@ -36,8 +91,8 @@ protocols_keygen(public_key_t *pk, secret_key_t *sk)
                             &sk->secret_ideal, &QUATALG_PINFTY, QUAT_primality_num_iter, QUAT_equiv_bound_coeff);
        // ideal to isogeny clapotis
        found = found && dim2id2iso_arbitrary_isogeny_evaluation(&B_0_two, &sk->curve, &sk->secret_ideal);
 #endif
    }
    // Assert the isogeny was found and images have the correct order
--- a/src/signature/ref/lvlx/ml_predict.c
+++ b/src/signature/ref/lvlx/ml_predict.c
@@ -0,0 +1,124 @@
 #include "ml_predict.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
 #include <time.h>
 #include <math.h>
 #include "../src/quaternion/ref/generic/include/intbig.h"
 #include "../src/quaternion/ref/generic/include/quaternion.h"
 // 下述是macOS为获取时间戳的操作
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
 // ==================== 辅助函数 ====================
 // 将 ibz_t 转为 long
 static long ibz_to_long_safe(const ibz_t *x) {
    // 注意：仅用于调试，不要在生产环境中用 long 表示大数
    return (long) mpz_get_si(*x);
 }
 // 计算理想的“迹”值（trace）
 // 我们使用生成元近似：trace ≈ 2 * a，其中 a 是 quat 元素实部
 int quat_ideal_trace(const quat_left_ideal_t *I) {
    if (I == NULL) return 0;
    // 理想中存储的 lattice.basis[0] 通常包含 generator 信息
    // 取其第一个坐标近似为实部 a
    ibz_t a;
    ibz_init(&a);
    ibz_copy(&a, &I->lattice.basis[0][0]);
    long val = ibz_to_long_safe(&a);
    ibz_finalize(&a);
    return (int)(2 * val);
 }
 // 改进的机器学习预测函数，基于更复杂的启发式模型
 double ml_predict_success(long norm_val, int trace_val, int kernel_order) {
    // 使用更细致的概率模型
    double score = 0.0;
    // 对范数进行对数缩放并评估
    double log_norm = (norm_val > 0) ? log10((double)norm_val + 1) : 0;
    if (log_norm < 12) {
        score += 0.5;  // 范数较小的理想更容易处理
    } else if (log_norm < 15) {
        score += 0.3;
    } else if (log_norm < 18) {
        score += 0.1;
    }
    // 对迹值进行评估
    int abs_trace = abs(trace_val);
    if (abs_trace < 100000000) {
        score += 0.3;  // 迹值较小时更优
    } else if (abs_trace < 300000000) {
        score += 0.15;
    }
    // 对核阶数进行评估
    if (kernel_order == 2) {
        score += 0.2;  // 优先选择核阶数为2的理想
    }
    // 基于组合特征的调整
    if (log_norm < 12 && abs_trace < 100000000) {
        score += 0.2;  // 范数和迹都很小的情况给予额外加分
    }
    // 限制分数范围
    if (score > 1.0) score = 1.0;
    // 转换为成功概率（非线性映射）
    double probability = score * score; // 平方放大差异
    return probability;
 }
 // 日志函数：记录一次理想尝试
 // 记录到 CSV 文件，包含时间戳
 void ml_log_ideal_attempt(int attempt,
                          const quat_left_ideal_t *lideal_com,
                          int kernel_order,
                          int success_flag)
 {
    const char *dir_path = "./dataset";
    struct stat st = {0};
    if (stat(dir_path, &st) == -1) {
        mkdir(dir_path, 0755);
    }
    // 使用固定文件名，避免频繁创建文件
    char csv_path[512];
    snprintf(csv_path, sizeof(csv_path), "%s/ideal_data.csv", dir_path);
    // 使用追加模式，避免覆盖
    FILE *logfile = fopen(csv_path, "a+");
    if (!logfile) {
        perror("无法打开 ideal_data CSV 文件");
        return;
    }
    // 如果是第一次写入，添加表头
    if (ftell(logfile) == 0) {
        fprintf(logfile, "Timestamp,Attempt,Norm,Trace,KernelOrder,Prob,Success\n");
    }
    // 提取特征和记录
    long norm_val = ibz_to_long_safe(&lideal_com->norm);
    int trace_val = quat_ideal_trace(lideal_com);
    double prob = ml_predict_success(norm_val, trace_val, kernel_order);
    time_t t = time(NULL);
    struct tm tm_info;
    localtime_r(&t, &tm_info);
    char time_str[32];
    strftime(time_str, sizeof(time_str), "%Y-%m-%d-%H-%M-%S", &tm_info);
    fprintf(logfile, "%s,%d,%ld,%d,%d,%.3f,%d\n",
            time_str, attempt, norm_val, trace_val, kernel_order, prob, success_flag);
    fclose(logfile);
 }
--- a/src/signature/ref/lvlx/sign.c
+++ b/src/signature/ref/lvlx/sign.c
@@ -1,27 +1,148 @@
 #include <signature.h>
 #include <ml_predict.h> // 包含机器学习模型的头文件
 #include "ml_predict.c" // 包含实现文件（仅用于测试，生产环境应编译链接）
 #include <assert.h>
 #include <tools.h>
 #include <quaternion_data.h>
 #include <id2iso.h>
 #include <torsion_constants.h>
 #include <encoded_sizes.h>
 #include <intbig.h>
 #ifdef _OPENMP
 #include <omp.h>
 #endif
 // 添加函数声明
 static bool execute_commit_serial(ec_curve_t *E_com, ec_basis_t *basis_even_com,
                                 quat_left_ideal_t *lideal_com, int *attempt_counter, int kernel_order);
 // compute the commitment with ideal to isogeny clapotis
 // and apply it to the basis of E0 (together with the multiplication by some scalar u)
 // 原代码如下，为了进行相应的测试，暂时注释掉
 // static bool
 // commit(ec_curve_t *E_com, ec_basis_t *basis_even_com, quat_left_ideal_t *lideal_com)
 // {
 //     bool found = false;
 //     found = quat_sampling_random_ideal_O0_given_norm(lideal_com, &COM_DEGREE, 1, &QUAT_represent_integer_params, NULL);
 //     // replacing it with a shorter prime norm equivalent ideal
 //     found = found && quat_lideal_prime_norm_reduced_equivalent(
 //                          lideal_com, &QUATALG_PINFTY, QUAT_primality_num_iter, QUAT_equiv_bound_coeff);
 //     // ideal to isogeny clapotis
 //     found = found && dim2id2iso_arbitrary_isogeny_evaluation(basis_even_com, E_com, lideal_com);
 //     return found;
 // }
 // 这是我的第一个版本的代码，需要进一步测试和调试，于是为了收集更多正确和错误的数据集，我在下面改了第二版
 static bool
 commit(ec_curve_t *E_com, ec_basis_t *basis_even_com, quat_left_ideal_t *lideal_com)
 {
-
+    static int attempt_counter = 0;
    bool found = false;
    int kernel_order = 2;
    // 使用并行计算来加速随机理想采样
 #ifdef _OPENMP
    int num_threads = omp_get_max_threads();
    if (num_threads > 1) {
 #pragma omp parallel for reduction(|:found) num_threads(num_threads)
        for (int i = 0; i < num_threads && !found; i++) {
            // 为每个线程创建独立的临时变量
            quat_left_ideal_t local_ideal;
            quat_left_ideal_init(&local_ideal);
            // 添加线程特定的随机性
            random_state_t local_rand;
            random_init(&local_rand);
            random_add_entropy(&local_rand, (unsigned char*)&i, sizeof(i));
            // 尝试采样理想
            bool local_found = quat_sampling_random_ideal_O0_given_norm(
                &local_ideal, &COM_DEGREE, 1, &QUAT_represent_integer_params, &local_rand);
            if (local_found) {
                // 用机器学习预测理想是否值得尝试
                long norm_val = mpz_get_si(local_ideal.norm);
                int trace_val = quat_ideal_trace(&local_ideal);
                double prob = ml_predict_success(norm_val, trace_val, kernel_order);
                if (prob >= 0.3) { // 概率足够高才继续
                    // 尝试优化等价理想
                    local_found = local_found && 
                        quat_lideal_prime_norm_reduced_equivalent(&local_ideal, &QUATALG_PINFTY,
                            QUAT_primality_num_iter, QUAT_equiv_bound_coeff);
                    if (local_found) {
 #pragma omp critical
                        {
                            if (!found) {
                                // 复制成功的结果到输出参数
                                quat_left_ideal_copy(lideal_com, &local_ideal);
                                found = true;
                            }
                        }
                    }
                }
            }
            quat_left_ideal_finalize(&local_ideal);
            random_finalize(&local_rand);
        }
        // 如果并行搜索成功，则执行同源映射评估
        if (found) {
            found = dim2id2iso_arbitrary_isogeny_evaluation(basis_even_com, E_com, lideal_com);
        }
    } else {
        // 单线程情况 - 原有逻辑
        found = execute_commit_serial(E_com, basis_even_com, lideal_com, &attempt_counter, kernel_order);
    }
 #else
    // 没有OpenMP的情况 - 原有逻辑
    found = execute_commit_serial(E_com, basis_even_com, lideal_com, &attempt_counter, kernel_order);
 #endif
    found = quat_sampling_random_ideal_O0_given_norm(lideal_com, &COM_DEGREE, 1, &QUAT_represent_integer_params, NULL);
    // replacing it with a shorter prime norm equivalent ideal
    found = found && quat_lideal_prime_norm_reduced_equivalent(
                         lideal_com, &QUATALG_PINFTY, QUAT_primality_num_iter, QUAT_equiv_bound_coeff);
    // ideal to isogeny clapotis
    found = found && dim2id2iso_arbitrary_isogeny_evaluation(basis_even_com, E_com, lideal_com);
    return found;
 }
 // 辅助函数：串行提交逻辑
 static bool execute_commit_serial(ec_curve_t *E_com, ec_basis_t *basis_even_com, 
                                 quat_left_ideal_t *lideal_com, int *attempt_counter, int kernel_order)
 {
    bool found = false;
    // Step 1. 随机生成理想
    found = quat_sampling_random_ideal_O0_given_norm(lideal_com, &COM_DEGREE, 1,
                                                     &QUAT_represent_integer_params, NULL);
    // Step 2. 用机器学习预测理想是否值得尝试
    long norm_val = mpz_get_si(lideal_com->norm);
    int trace_val = quat_ideal_trace(lideal_com);
    double prob = ml_predict_success(norm_val, trace_val, kernel_order);
    if (prob < 0.3) {
        // 概率太低，跳过以节省计算
        ml_log_ideal_attempt((*attempt_counter)++, lideal_com, kernel_order, 0);
        return false;
    }
    // Step 3. 尝试优化等价理想
    found = found && quat_lideal_prime_norm_reduced_equivalent(lideal_com, &QUATALG_PINFTY,
                                                               QUAT_primality_num_iter,
                                                               QUAT_equiv_bound_coeff);
    // Step 4. 理想到同源映射
    found = found && dim2id2iso_arbitrary_isogeny_evaluation(basis_even_com, E_com, lideal_com);
    // Step 5. 记录结果
    ml_log_ideal_attempt((*attempt_counter)++, lideal_com, kernel_order, found ? 1 : 0);
    return found;
 }
 static void
 compute_challenge_ideal_signature(quat_left_ideal_t *lideal_chall_two, const signature_t *sig, const secret_key_t *sk)
 {
Author	SHA1	Message	Date
StarsAC	22b7b12751	feat(ml_predict): 启用 ml_log_ideal_attempt 日志函数将原本被注释掉的 ml_log_ideal_attempt 函数启用，用于记录理想尝试的相关数据。该函数会将每次尝试的时间戳、次数、范数、迹、核阶数、预测概率及成功标志写入 CSV 文件。确保在项目根目录下生成 dataset 文件夹，并将日志持久化存储于 ideal_data.csv 中。	2025-11-26 22:40:43 +08:00
StarsAC	0c2d61119b	feat(compilation): 启用 ARM64 优化与 OpenMP 并行支持 - 在 `.cmake/arm_optimization.cmake` 中增强 ARM64 编译优化选项，包括： * 添加 `-mtune=cortex-a76` 和更多特定于 ARM64 的优化标志 * 启用循环优化、浮点运算优化及链接时优化（LTO） - 在 `CMakeLists.txt` 中新增 `ENABLE_OPENMP` 选项以启用 OpenMP 支持 - 优化 `randombytes_ctrdrbg.c` 中的 AES 密钥调度和随机数生成逻辑，利用 ARM64 Crypto 扩展提升性能 - 在 `lll_tests.c` 中对关键循环进行展开以降低分支开销 - 在签名密钥生成和提交阶段引入 OpenMP 并行化处理，加快理想采样过程 - 注释掉未使用的机器学习日志函数 `ml_log_ideal_attempt` 实现 - 调整默认 `GF_RADIX` 为 64，并更新相关编译配置	2025-11-26 15:51:27 +08:00
StarsAC	63dcfd3992	feat(ml_predict): 改进机器学习预测函数，采用更精细的概率模型更新了 ml_predict_success 函数，使用对数缩放范数、分段评分规则和非线性映射来提高预测准确率。同时优化了 ml_log_ideal_attempt 日志记录函数，确保数据完整性和可追踪性。	2025-11-26 13:49:27 +08:00
StarsAC	4b11293268	feat(benchmark): 临时固定随机种子初始化值将 benchmark.c 中的随机种子从全零初始化更新为预定义的非零值，以确保每次运行时具有更好的随机性。同时将 seed_set 标志设置为 1，表示种子已经正确初始化。此举有助于提高基准测试结果的一致性和可靠性。	2025-11-26 09:23:47 +08:00
StarsAC	601f0b7d0a	feat(cmake): 添加 ARM 架构优化配置文件新增 `.cmake/arm_optimization.cmake` 文件，用于检测 ARM 架构并应用相应编译优化。包括 NEON 指令集支持、ARM64 的 crypto 扩展检查、LTO 优化以及针对特定 CPU 的调优选项。同时在 `CMakeLists.txt` 中包含该优化配置，并更新基准测试脚本中的构建目录路径。	2025-11-25 22:58:37 +08:00
StarsAC	a170e7384f	feat: 补全优化了ML测试内容，添加测试脚本和数据集，改为相对路径	2025-11-25 10:32:13 +00:00
AsyncKurisu	28154c2a31	feat: 使用ML优化Deuring-correspondence，v1	2025-11-24 23:09:14 +08:00