feat(compilation): 启用 ARM64 优化与 OpenMP 并行支持

- 在 `.cmake/arm_optimization.cmake` 中增强 ARM64 编译优化选项,包括:
  * 添加 `-mtune=cortex-a76` 和更多特定于 ARM64 的优化标志
  * 启用循环优化、浮点运算优化及链接时优化(LTO)
- 在 `CMakeLists.txt` 中新增 `ENABLE_OPENMP` 选项以启用 OpenMP 支持
- 优化 `randombytes_ctrdrbg.c` 中的 AES 密钥调度和随机数生成逻辑,利用 ARM64 Crypto 扩展提升性能
- 在 `lll_tests.c` 中对关键循环进行展开以降低分支开销
- 在签名密钥生成和提交阶段引入 OpenMP 并行化处理,加快理想采样过程
- 注释掉未使用的机器学习日志函数 `ml_log_ideal_attempt` 的实现
- 调整默认 `GF_RADIX` 为 64,并更新相关编译配置
This commit is contained in:
2025-11-26 15:51:27 +08:00
parent 63dcfd3992
commit 0c2d61119b
8 changed files with 340 additions and 106 deletions

View File

@@ -655,13 +655,48 @@ quat_test_lll_lideal_lideal_mul_reduced()
ibz_mat_4x4_mul(&(gram_test), &(gram_test), &(prod.lattice.basis));
ibz_mat_4x4_transpose(&(gram_test), &(gram_test));
ibz_mat_4x4_mul(&(gram_test), &(gram_test), &(prod.lattice.basis));
for (int i = 0; i < 4; i++) {
ibz_vec_4_set(&vec, (i == 0), (i == 1), (i == 2), (i == 3));
quat_qf_eval(&norm, &gram, &vec);
quat_qf_eval(&test_norm, &gram_test, &vec);
ibz_mul(&norm, &(prod.norm), &norm);
res = res || !(ibz_cmp(&norm, &test_norm) == 0);
}
// ARM优化: 循环展开以减少分支预测失败的可能性
// 原始循环:
// for (int i = 0; i < 4; i++) {
// ibz_vec_4_set(&vec, (i == 0), (i == 1), (i == 2), (i == 3));
// quat_qf_eval(&norm, &gram, &vec);
// quat_qf_eval(&test_norm, &gram_test, &vec);
// ibz_mul(&norm, &(prod.norm), &norm);
// res = res || !(ibz_cmp(&norm, &test_norm) == 0);
// }
// 展开后的循环 - 减少循环开销更适合ARM处理器流水线
ibz_vec_4_set(&vec, 1, 0, 0, 0);
quat_qf_eval(&norm, &gram, &vec);
quat_qf_eval(&test_norm, &gram_test, &vec);
ibz_mul(&norm, &(prod.norm), &norm);
res = res || !(ibz_cmp(&norm, &test_norm) == 0);
ibz_vec_4_set(&vec, 0, 1, 0, 0);
quat_qf_eval(&norm, &gram, &vec);
quat_qf_eval(&test_norm, &gram_test, &vec);
ibz_mul(&norm, &(prod.norm), &norm);
res = res || !(ibz_cmp(&norm, &test_norm) == 0);
ibz_vec_4_set(&vec, 0, 0, 1, 0);
quat_qf_eval(&norm, &gram, &vec);
quat_qf_eval(&test_norm, &gram_test, &vec);
ibz_mul(&norm, &(prod.norm), &norm);
res = res || !(ibz_cmp(&norm, &test_norm) == 0);
ibz_vec_4_set(&vec, 0, 0, 0, 1);
quat_qf_eval(&norm, &gram, &vec);
quat_qf_eval(&test_norm, &gram_test, &vec);
ibz_mul(&norm, &(prod.norm), &norm);
res = res || !(ibz_cmp(&norm, &test_norm) == 0);
// 如果 NEON 可用,可用其优化大整数运算(下方的 HAVE_NEON 分支目前仅为占位,尚未实现)
#ifdef HAVE_NEON
// 在支持NEON的ARM64平台上并行处理多个范数求值
// 这里可以进一步优化,但需要重构底层的大整数运算库
#endif
quat_lattice_hnf(&(prod.lattice));
res = res || !quat_lideal_equals(&i1, &lideal1, &alg);