// SPDX-License-Identifier: Apache-2.0 #include "randombytes_arm64crypto.h" #include #include static AES256_CTR_DRBG_struct DRBG_ctx; // 优化1: 改进S-box实现,减少内存操作 static inline uint32_t AES_sbox_x4(uint32_t in) { uint8x16_t sbox_val = vreinterpretq_u8_u32(vdupq_n_u32(in)); sbox_val = vaeseq_u8(sbox_val, vdupq_n_u8(0)); return vgetq_lane_u32(vreinterpretq_u32_u8(sbox_val), 0); } #define ROTR32(x, n) ((x << (32 - n)) | (x >> n)) // 优化2: 使用更紧凑的数据结构,提高缓存效率 typedef union { uint8_t u8[240]; // 15*16 uint32_t u32[60]; // 15*4 uint8x16_t v[15]; } subkeys_t; // 优化3: 改进密钥调度,使用Neon指令进行批量处理 static void AES256_key_schedule(uint8_t subkeys[15][16], const uint8_t *key) { subkeys_t *sk = (subkeys_t *)subkeys; uint8x16_t rcon = vdupq_n_u8(0x01); uint8x16_t rcon_step = vdupq_n_u8(0x1b); // 一次性复制前两轮密钥 memcpy(&subkeys[0][0], key, 32); uint8x16_t prev_key = vld1q_u8(&subkeys[0][0]); uint8x16_t prev_prev_key = vld1q_u8(&subkeys[1][0]); for (int i = 2; i < 15; i++) { // 提取最后一列并进行S-box变换 uint8x16_t last_col = vextq_u8(prev_key, vdupq_n_u8(0), 12); last_col = vaeseq_u8(last_col, vdupq_n_u8(0)); // RotWord last_col = vextq_u8(last_col, last_col, 3); // XOR with rcon uint8x16_t new_key_first = veorq_u8(veorq_u8(last_col, rcon), prev_prev_key); // 生成新密钥的剩余部分 uint8x16_t new_key = vextq_u8(prev_prev_key, new_key_first, 12); // 保存新密钥 vst1q_u8(&subkeys[i][0], new_key); // 更新rcon uint8_t rcon_val = vgetq_lane_u8(rcon, 0); rcon_val = (rcon_val << 1) ^ ((rcon_val >> 7) * 0x1b); rcon = vdupq_n_u8(rcon_val); // 更新前两个密钥 prev_prev_key = prev_key; prev_key = new_key; } } // 优化4: 改进AES-256 ECB实现,减少循环开销 static inline void AES256_ECB_XWAYS_OPTIMIZED(int ways, const uint8x16_t vsubkeys[15], uint8x16_t state[], unsigned char *out) { // 第一轮:AddRoundKey for (int j = 0; j < ways; j++) { state[j] = vaeseq_u8(state[j], vsubkeys[0]); state[j] = vaesmcq_u8(state[j]); } // 中间轮:SubBytes, ShiftRows, MixColumns, AddRoundKey for (int i = 1; i < 13; i++) { uint8x16_t subkey = vsubkeys[i]; for (int j = 0; j < ways; j++) { state[j] = vaeseq_u8(state[j], subkey); state[j] = vaesmcq_u8(state[j]); } } // 最后一轮:SubBytes, ShiftRows, AddRoundKey for (int j = 0; j < ways; j++) { state[j] = vaeseq_u8(state[j], vsubkeys[13]); state[j] = veorq_u8(state[j], vsubkeys[14]); vst1q_u8(out + j * 16, state[j]); } } // 优化5: 使用向量化的字节交换函数 static inline void bswap128_vectorized(uint8x16_t *v) { // 使用vrev64q_u8和vtrn1q_u8等指令优化字节交换 uint8x16_t reversed = vrev64q_u8(*v); uint8x8x2_t halves = vtrn_u8(vget_low_u8(reversed), vget_high_u8(reversed)); *v = vcombine_u8(halves.val[1], halves.val[0]); } // 优化6: 改进计数器增量函数 static inline void add_to_V_optimized(unsigned char V[], int incr) { // 使用向量化操作增加计数器 uint8x16_t vV = vld1q_u8(V); uint64x2_t vV64 = vreinterpretq_u64_u8(vV); // 处理64位增量 uint64x2_t incr64 = vdupq_n_u64((uint64_t)incr); vV64 = vaddq_u64(vV64, incr64); // 如果低64位溢出,增加高64位 uint64_t low = vgetq_lane_u64(vV64, 0); if (low < (uint64_t)incr) { uint64_t high = vgetq_lane_u64(vV64, 1); vV64 = vsetq_lane_u64(high + 1, vV64, 1); } vV = vreinterpretq_u8_u64(vV64); bswap128_vectorized(&vV); vst1q_u8(V, vV); } // 优化7: 改进DRBG更新函数,减少内存操作 static void AES256_CTR_DRBG_Update_Optimized(unsigned char *provided_data, const uint8x16_t vsubkeys[15], unsigned char *Key, unsigned char *V) { unsigned char temp[48]; // 使用向量化操作处理计数器 uint8x16_t vV = vld1q_u8(V); uint8x16_t vV1 = vV; uint8x16_t vV2 = vV; uint8x16_t vV3 = vV; // 增量计数器值 uint64x2_t inc = vdupq_n_u64(1); uint64x2_t vV64 = vreinterpretq_u64_u8(vV1); vV64 = vaddq_u64(vV64, inc); vV1 = vreinterpretq_u8_u64(vV64); vV64 = vreinterpretq_u64_u8(vV2); vV64 = vaddq_u64(vV64, vdupq_n_u64(2)); vV2 = vreinterpretq_u8_u64(vV64); vV64 = vreinterpretq_u64_u8(vV3); vV64 = vaddq_u64(vV64, vdupq_n_u64(3)); vV3 = vreinterpretq_u8_u64(vV64); // 批量AES加密 uint8x16_t vV_array[3] = { vV1, vV2, vV3 }; AES256_ECB_XWAYS_OPTIMIZED(3, vsubkeys, vV_array, temp); // 如果有提供的数据,进行XOR操作 if (provided_data != NULL) { uint8x16_t vData = vld1q_u8(provided_data); uint8x16_t vTemp = vld1q_u8(temp); vst1q_u8(temp, veorq_u8(vTemp, vData)); vData = vld1q_u8(provided_data + 16); vTemp = vld1q_u8(temp + 16); vst1q_u8(temp + 16, veorq_u8(vTemp, vData)); vData = vld1q_u8(provided_data + 32); vTemp = vld1q_u8(temp + 32); vst1q_u8(temp + 32, veorq_u8(vTemp, vData)); } // 更新密钥和V memcpy(Key, temp, 32); memcpy(V, temp + 32, 16); add_to_V_optimized(DRBG_ctx.V, 1); } // 优化8: 改进初始化函数 void randombytes_init_arm64crypto_optimized(unsigned char *entropy_input, unsigned char *personalization_string, int security_strength) { (void)security_strength; unsigned char seed_material[48]; uint8_t subkeys[15][16]; uint8x16_t vsubkeys[15]; // 使用向量化操作初始化种子材料 if (personalization_string) { uint8x16_t vEntropy = vld1q_u8(entropy_input); uint8x16_t vPersonal = vld1q_u8(personalization_string); vst1q_u8(seed_material, veorq_u8(vEntropy, vPersonal)); vEntropy = vld1q_u8(entropy_input + 16); vPersonal = vld1q_u8(personalization_string + 16); vst1q_u8(seed_material + 16, veorq_u8(vEntropy, vPersonal)); vEntropy = vld1q_u8(entropy_input + 32); vPersonal = vld1q_u8(personalization_string + 32); vst1q_u8(seed_material + 32, veorq_u8(vEntropy, vPersonal)); } else { memcpy(seed_material, entropy_input, 48); } // 初始化密钥和V为零 uint8x16_t vZero = vdupq_n_u8(0); vst1q_u8(DRBG_ctx.Key, vZero); vst1q_u8(DRBG_ctx.Key + 16, vZero); vst1q_u8(DRBG_ctx.V, vZero); // 生成子密钥 AES256_key_schedule(subkeys, DRBG_ctx.Key); for (int i = 0; i < 15; i++) { vsubkeys[i] = vld1q_u8(subkeys[i]); } // 更新DRBG状态 AES256_CTR_DRBG_Update_Optimized(seed_material, vsubkeys, DRBG_ctx.Key, DRBG_ctx.V); DRBG_ctx.reseed_counter = 1; } // 优化9: 提高WAYS值以利用更宽的向量寄存器 #define WAYS_OPTIMIZED 8 // 增加到8,利用更宽的向量化 // 优化10: 改进主随机数生成函数,使用更大的WAYS值和更好的向量化 int randombytes_arm64crypto_optimized(unsigned char *x, unsigned long long xlen) { uint8_t subkeys[15][16]; unsigned char block[16]; uint8x16_t vsubkeys[15]; // 预先计算子密钥 AES256_key_schedule(subkeys, DRBG_ctx.Key); for (int j = 0; j < 15; j++) { vsubkeys[j] = vld1q_u8(subkeys[j]); } // 处理大块数据(使用优化后的WAYS值) if (xlen >= WAYS_OPTIMIZED * 16) { uint8x16_t vV_array[WAYS_OPTIMIZED]; uint8x16_t vV = vld1q_u8(DRBG_ctx.V); // 初始化计数器值 vV_array[0] = vV; for (int j = 1; j < WAYS_OPTIMIZED; j++) { uint64x2_t vV64 = vreinterpretq_u64_u8(vV); uint64x2_t inc = vdupq_n_u64(j); vV64 = vaddq_u64(vV64, inc); vV_array[j] = vreinterpretq_u8_u64(vV64); } // 处理大块数据 while (xlen >= WAYS_OPTIMIZED * 16) { // 批量AES加密 AES256_ECB_XWAYS_OPTIMIZED(WAYS_OPTIMIZED, vsubkeys, vV_array, x); // 更新计数器值 uint64x2_t vV64 = vreinterpretq_u64_u8(vV_array[WAYS_OPTIMIZED - 1]); uint64x2_t inc = vdupq_n_u64(WAYS_OPTIMIZED); vV64 = vaddq_u64(vV64, inc); for (int j = 0; j < WAYS_OPTIMIZED; j++) { uint64x2_t current = vreinterpretq_u64_u8(vV_array[j]); current = vaddq_u64(current, inc); vV_array[j] = vreinterpretq_u8_u64(current); } x += WAYS_OPTIMIZED * 16; xlen -= WAYS_OPTIMIZED * 16; } // 更新V为最后一个计数器值 vV = vV_array[WAYS_OPTIMIZED - 1]; vst1q_u8(DRBG_ctx.V, vV); } // 处理剩余数据(小量数据) while (xlen > 0) { uint8x16_t vV = vld1q_u8(DRBG_ctx.V); if (xlen > 16) { uint8x16_t state = vV; AES256_ECB_XWAYS_OPTIMIZED(1, vsubkeys, &state, x); x += 16; xlen -= 16; } else { uint8x16_t state = vV; AES256_ECB_XWAYS_OPTIMIZED(1, vsubkeys, &state, block); memcpy(x, block, xlen); xlen = 0; } // 增量V add_to_V_optimized(DRBG_ctx.V, 1); } // 更新DRBG状态 AES256_CTR_DRBG_Update_Optimized(NULL, vsubkeys, DRBG_ctx.Key, DRBG_ctx.V); DRBG_ctx.reseed_counter++; return RNG_SUCCESS; } // 包装函数 #ifdef RANDOMBYTES_ARM64CRYPTO int randombytes(unsigned char *random_array, unsigned long long nbytes) { int ret = randombytes_arm64crypto_optimized(random_array, nbytes); #ifdef ENABLE_CT_TESTING VALGRIND_MAKE_MEM_UNDEFINED(random_array, ret); #endif return ret; } void randombytes_init(unsigned char *entropy_input, unsigned char *personalization_string, int security_strength) { randombytes_init_arm64crypto_optimized(entropy_input, personalization_string, security_strength); } #endif