Files
sqisign_new/src/common/arm64crypto/randombytes_ctrdrbg.c

407 lines
12 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// SPDX-License-Identifier: Apache-2.0
#include "randombytes_arm64crypto.h"
#include <arm_neon.h>
#include <string.h>
// File-scope DRBG state. The functions below read and write its Key, V and
// reseed_counter members; nothing in this file synchronizes access to it.
static AES256_CTR_DRBG_struct DRBG_ctx;
// Apply the AES S-box to each of the four bytes packed in `in`.
//
// The word is replicated into all four 32-bit lanes, so every row of the AES
// state holds four equal bytes and the ShiftRows step performed by AESE leaves
// it unchanged. The all-zero round key turns AddRoundKey into a no-op, leaving
// only SubBytes. Lane 0 of the result is returned.
static inline uint32_t
AES_sbox_x4(uint32_t in)
{
    const uint8x16_t zero_key = vdupq_n_u8(0);
    const uint8x16_t replicated = vreinterpretq_u8_u32(vdupq_n_u32(in));
    const uint32x4_t substituted = vreinterpretq_u32_u8(vaeseq_u8(replicated, zero_key));

    return vgetq_lane_u32(substituted, 0);
}
// Rotate the 32-bit unsigned value x right by n bits. Valid for 0 < n < 32;
// x is evaluated twice, so it must not have side effects.
// Both arguments are parenthesized so compound expressions such as
// ROTR32(a | b, k + 1) expand with the intended precedence.
#define ROTR32(x, n) (((x) << (32 - (n))) | ((x) >> (n)))
// Storage for the 15 AES-256 round keys, overlaid so the same 240 bytes can be
// addressed as bytes, as 32-bit words, or as 128-bit NEON vectors.
typedef union
{
uint8_t u8[240]; // 15 round keys * 16 bytes
uint32_t u32[60]; // 15 round keys * 4 words
uint8x16_t v[15]; // one vector per round key
} subkeys_t;
// Expand a 32-byte AES-256 key into the 15 round keys (FIPS 197 key expansion).
//
//   subkeys  receives 15 * 16 bytes of round-key material; round keys 0 and 1
//            are the raw key.
//   key      32-byte cipher key.
//
// The expansion works on 60 32-bit words w[0..59]:
//   w[i] = w[i-8] ^ SubWord(RotWord(w[i-1])) ^ Rcon   when i % 8 == 0
//   w[i] = w[i-8] ^ SubWord(w[i-1])                   when i % 8 == 4
//   w[i] = w[i-8] ^ w[i-1]                            otherwise
//
// SubWord is computed with the AESE instruction rather than a lookup table:
// the word is replicated into all four columns so AESE's ShiftRows is a no-op,
// and the zero round key makes its AddRoundKey a no-op.
//
// Words are loaded from memory in little-endian order, so RotWord is a right
// rotation by 8 bits and Rcon is XORed into the low byte. This assumes a
// little-endian target.
static void
AES256_key_schedule(uint8_t subkeys[15][16], const uint8_t *key)
{
    uint32_t w[60];
    uint32_t rcon = 0x01;

    memcpy(w, key, 32);

    for (int i = 8; i < 60; i++) {
        uint32_t t = w[i - 1];

        if ((i & 3) == 0) {
            // SubWord on all four bytes of t.
            const uint8x16_t replicated = vreinterpretq_u8_u32(vdupq_n_u32(t));
            const uint8x16_t substituted = vaeseq_u8(replicated, vdupq_n_u8(0));
            t = vgetq_lane_u32(vreinterpretq_u32_u8(substituted), 0);

            if ((i & 7) == 0) {
                // RotWord, then the round constant. AES-256 needs only
                // Rcon 0x01..0x40, so a plain doubling never overflows.
                t = ((t >> 8) | (t << 24)) ^ rcon;
                rcon <<= 1;
            }
        }

        w[i] = w[i - 8] ^ t;
    }

    // memcpy avoids any alignment assumption on the caller's byte array.
    memcpy(subkeys, w, sizeof w);
}
// Encrypt `ways` independent 16-byte blocks with AES-256.
//
//   ways      number of blocks in state[] / 16-byte slots in out.
//   vsubkeys  the 15 round keys.
//   state     input blocks; overwritten in place with the ciphertext.
//   out       receives ways * 16 bytes of ciphertext.
//
// AESE performs AddRoundKey + SubBytes + ShiftRows and AESMC performs
// MixColumns, so rounds 0..12 are AESE followed by AESMC. The final round is
// AESE with round key 13 (no MixColumns) followed by an XOR with round key 14.
// Blocks are processed round-major: one round key is applied to every block
// before moving on to the next round.
static inline void
AES256_ECB_XWAYS_OPTIMIZED(int ways, const uint8x16_t vsubkeys[15], uint8x16_t state[], unsigned char *out)
{
    for (int round = 0; round < 13; round++) {
        const uint8x16_t round_key = vsubkeys[round];
        for (int blk = 0; blk < ways; blk++) {
            state[blk] = vaesmcq_u8(vaeseq_u8(state[blk], round_key));
        }
    }

    for (int blk = 0; blk < ways; blk++) {
        const uint8x16_t last = vaeseq_u8(state[blk], vsubkeys[13]);
        state[blk] = veorq_u8(last, vsubkeys[14]);
        vst1q_u8(out + blk * 16, state[blk]);
    }
}
// Reverse the byte order of the 128-bit vector *v in place
// (byte 0 <-> byte 15, byte 1 <-> byte 14, ...).
//
// vrev64q_u8 reverses the bytes inside each 64-bit half; rotating the vector
// by 8 bytes with vextq_u8 then swaps the two halves, giving a full reversal.
static inline void
bswap128_vectorized(uint8x16_t *v)
{
    const uint8x16_t halves_reversed = vrev64q_u8(*v);

    *v = vextq_u8(halves_reversed, halves_reversed, 8);
}
// Add `incr` to the 16-byte counter V, treated as one 128-bit big-endian
// integer (V[15] is the least significant byte). The result wraps modulo 2^128.
//
//   V     16-byte counter, updated in place.
//   incr  amount to add; must be non-negative.
//
// The loop always visits all 16 bytes, so its running time does not depend on
// how far the carry propagates.
static inline void
add_to_V_optimized(unsigned char V[], int incr)
{
    uint32_t carry = (uint32_t)incr;

    for (int i = 15; i >= 0; i--) {
        carry += V[i];
        V[i] = (unsigned char)(carry & 0xffu);
        carry >>= 8;
    }
}
// Choose how many AES blocks to process per batch for a request of
// `data_size` bytes:
//   fewer than 256 bytes   -> 4
//   fewer than 1024 bytes  -> 6
//   1024 bytes or more     -> 8
// The thresholds are heuristics and can be retuned by measurement.
static int
determine_optimal_ways(unsigned long long data_size)
{
    if (data_size >= 1024) {
        return 8;
    }
    return (data_size >= 256) ? 6 : 4;
}
// CTR_DRBG update step: derive a fresh Key and V from the current state.
//
//   provided_data  48 bytes XORed into the generated material, or NULL.
//   vsubkeys       round keys of the current Key.
//   Key            receives the new 32-byte key.
//   V              current 16-byte counter; receives the new counter.
//
// Three counter blocks V+1, V+2 and V+3 are encrypted to produce 48 bytes.
// The counters are derived with add_to_V_optimized on a scratch copy, so this
// function uses the same definition of "V + 1" as the rest of the file.
// The first 32 bytes of the (optionally XORed) output become the key and the
// last 16 bytes become V, which is then advanced by one through the V
// parameter.
//
// NOTE(review): whether the offsets should be +1..+3 or +0..+2 depends on
// whether callers hand in a V that has already been advanced past the last
// block used - confirm against SP 800-90A known-answer tests.
static void
AES256_CTR_DRBG_Update_Optimized(unsigned char *provided_data,
                                 const uint8x16_t vsubkeys[15],
                                 unsigned char *Key,
                                 unsigned char *V)
{
    unsigned char temp[48];
    unsigned char ctr[16];
    uint8x16_t blocks[3];

    // Build V+1, V+2, V+3 without disturbing the caller's V yet.
    memcpy(ctr, V, sizeof ctr);
    for (int i = 0; i < 3; i++) {
        add_to_V_optimized(ctr, 1);
        blocks[i] = vld1q_u8(ctr);
    }

    AES256_ECB_XWAYS_OPTIMIZED(3, vsubkeys, blocks, temp);

    if (provided_data != NULL) {
        for (int off = 0; off < 48; off += 16) {
            const uint8x16_t mixed = veorq_u8(vld1q_u8(temp + off), vld1q_u8(provided_data + off));
            vst1q_u8(temp + off, mixed);
        }
    }

    memcpy(Key, temp, 32);
    memcpy(V, temp + 32, 16);
    // Advance the counter that was passed in rather than reaching for the global.
    add_to_V_optimized(V, 1);
}
// Instantiate the DRBG from 48 bytes of entropy.
//
//   entropy_input           48 bytes of seed entropy.
//   personalization_string  optional 48 bytes XORed into the entropy; may be NULL.
//   security_strength       ignored.
//
// Key and V are reset to all-zero, the round keys of that zero key are
// expanded, and one update step absorbs the seed material. The reseed counter
// is then set to 1.
void
randombytes_init_arm64crypto_optimized(unsigned char *entropy_input,
                                       unsigned char *personalization_string,
                                       int security_strength)
{
    unsigned char seed_material[48];
    uint8_t subkeys[15][16];
    uint8x16_t vsubkeys[15];

    (void)security_strength;

    if (personalization_string == NULL) {
        memcpy(seed_material, entropy_input, 48);
    } else {
        // seed_material = entropy_input XOR personalization_string, 16 bytes at a time.
        for (int off = 0; off < 48; off += 16) {
            const uint8x16_t entropy = vld1q_u8(entropy_input + off);
            const uint8x16_t personal = vld1q_u8(personalization_string + off);
            vst1q_u8(seed_material + off, veorq_u8(entropy, personal));
        }
    }

    // Start from the all-zero Key (32 bytes) and V (16 bytes).
    memset(DRBG_ctx.Key, 0, 32);
    memset(DRBG_ctx.V, 0, 16);

    AES256_key_schedule(subkeys, DRBG_ctx.Key);
    for (int r = 0; r < 15; r++) {
        vsubkeys[r] = vld1q_u8(subkeys[r]);
    }

    AES256_CTR_DRBG_Update_Optimized(seed_material, vsubkeys, DRBG_ctx.Key, DRBG_ctx.V);
    DRBG_ctx.reseed_counter = 1;
}
// Fill x[0..xlen) with DRBG output and then refresh the DRBG state.
//
//   x     output buffer.
//   xlen  number of bytes to produce.
//
// Returns RNG_SUCCESS.
//
// Output is AES-256 in counter mode under DRBG_ctx.Key: each 16-byte output
// block is the encryption of the current DRBG_ctx.V, after which V is advanced
// by one. Full batches of `ways` blocks are encrypted together; the remainder
// is produced one block at a time.
//
// The counter blocks for every batch are rebuilt from DRBG_ctx.V. This matters
// because AES256_ECB_XWAYS_OPTIMIZED overwrites its state array with
// ciphertext, so the array cannot be carried over and incremented for the next
// batch.
int
randombytes_arm64crypto_optimized(unsigned char *x, unsigned long long xlen)
{
    enum { MAX_WAYS = 12 };

    uint8_t subkeys[15][16];
    uint8x16_t vsubkeys[15];
    uint8x16_t ctr_blocks[MAX_WAYS];
    unsigned char block[16];

    // Expand the current key once for the whole request.
    AES256_key_schedule(subkeys, DRBG_ctx.Key);
    for (int r = 0; r < 15; r++) {
        vsubkeys[r] = vld1q_u8(subkeys[r]);
    }

    int ways = determine_optimal_ways(xlen);
    if (ways > MAX_WAYS) {
        ways = MAX_WAYS; // never index past ctr_blocks
    }
    const unsigned long long batch_bytes = (unsigned long long)ways * 16;

    // Full batches: snapshot `ways` consecutive counter values, then encrypt
    // them straight into the output buffer.
    while (xlen >= batch_bytes) {
        for (int j = 0; j < ways; j++) {
            ctr_blocks[j] = vld1q_u8(DRBG_ctx.V);
            add_to_V_optimized(DRBG_ctx.V, 1);
        }
        AES256_ECB_XWAYS_OPTIMIZED(ways, vsubkeys, ctr_blocks, x);
        x += batch_bytes;
        xlen -= batch_bytes;
    }

    // Remainder: one block at a time; a final partial block goes through a
    // scratch buffer so no more than xlen bytes are written to x.
    while (xlen > 0) {
        uint8x16_t state = vld1q_u8(DRBG_ctx.V);

        if (xlen >= 16) {
            AES256_ECB_XWAYS_OPTIMIZED(1, vsubkeys, &state, x);
            x += 16;
            xlen -= 16;
        } else {
            AES256_ECB_XWAYS_OPTIMIZED(1, vsubkeys, &state, block);
            memcpy(x, block, xlen);
            xlen = 0;
        }
        add_to_V_optimized(DRBG_ctx.V, 1);
    }

    // Derive a fresh Key and V so earlier output cannot be reconstructed from
    // the state that remains.
    AES256_CTR_DRBG_Update_Optimized(NULL, vsubkeys, DRBG_ctx.Key, DRBG_ctx.V);
    DRBG_ctx.reseed_counter++;
    return RNG_SUCCESS;
}
// // 高级版本:带有自适应学习能力的随机数生成函数
// int
// randombytes_arm64crypto_adaptive(unsigned char *x, unsigned long long xlen)
// {
// // 静态变量用于记录历史性能数据
// static unsigned long long total_bytes_processed = 0;
// static unsigned long long total_time_used = 0; // 假设有时间测量机制
// uint8_t subkeys[15][16];
// uint8x16_t vsubkeys[15];
// // 预先计算子密钥
// AES256_key_schedule(subkeys, DRBG_ctx.Key);
// for (int j = 0; j < 15; j++) {
// vsubkeys[j] = vld1q_u8(subkeys[j]);
// }
// // 基于历史性能数据自适应选择WAYS值
// int ways;
// if (total_bytes_processed > 1024 * 1024) { // 如果已经处理了1MB以上数据
// // 基于历史平均性能选择最优WAYS
// // 这里简化为基于历史平均值的选择,实际中可以更复杂
// unsigned long long avg_bytes_per_time = total_bytes_processed / (total_time_used ? total_time_used : 1);
// if (avg_bytes_per_time > 1000) { // 假设阈值
// ways = (xlen > 4096) ? 12 : 8; // 高性能情况下使用更高并行度
// } else {
// ways = (xlen > 1024) ? 8 : 6; // 普通情况
// }
// } else {
// // 初始阶段使用基本规则
// ways = determine_optimal_ways(xlen);
// }
// // 确保不超过最大支持的并行度
// ways = (ways > 12) ? 12 : ways;
// // 这里开始实际的处理与前面函数类似但使用动态确定的ways值
// // ... (实现与randombytes_arm64crypto_optimized类似)
// // 更新历史统计
// total_bytes_processed += xlen;
// // total_time_used += elapsed_time; // 需要实际测量时间
// return RNG_SUCCESS;
// }
// Wrapper functions exposing the generic randombytes API
#ifdef RANDOMBYTES_ARM64CRYPTO
// Generic randombytes entry point: fill random_array with nbytes of DRBG
// output and return the status of the underlying generator.
//
// When constant-time testing is enabled, the whole output buffer is marked as
// undefined for Valgrind so that any secret-dependent branch or memory index
// on it is reported. The length passed to Valgrind is the buffer size, not the
// status code.
int
randombytes(unsigned char *random_array, unsigned long long nbytes)
{
    int ret = randombytes_arm64crypto_optimized(random_array, nbytes);
#ifdef ENABLE_CT_TESTING
    VALGRIND_MAKE_MEM_UNDEFINED(random_array, nbytes);
#endif
    return ret;
}
// Generic randombytes_init entry point; forwards all three arguments
// unchanged to randombytes_init_arm64crypto_optimized.
void
randombytes_init(unsigned char *entropy_input,
                 unsigned char *personalization_string,
                 int security_strength)
{
    randombytes_init_arm64crypto_optimized(entropy_input,
                                           personalization_string,
                                           security_strength);
}
#endif