From 0860c735a3106dc94e24d1074fdbac1b2af730b2 Mon Sep 17 00:00:00 2001 From: AsyncKurisu <1750981157@qq.com> Date: Mon, 24 Nov 2025 16:39:55 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E4=BC=98=E5=8C=96=E9=9A=8F=E6=9C=BA?= =?UTF-8?q?=E6=95=B0=E7=94=9F=E6=88=90=E9=80=BB=E8=BE=91=EF=BC=8C=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E5=B9=B6=E8=A1=8C=E6=95=B0WAYS=3D8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/common/arm64crypto/randombytes_ctrdrbg.c | 521 ++++++++++--------- 1 file changed, 289 insertions(+), 232 deletions(-) diff --git a/src/common/arm64crypto/randombytes_ctrdrbg.c b/src/common/arm64crypto/randombytes_ctrdrbg.c index 0f7a986..b5ab52f 100644 --- a/src/common/arm64crypto/randombytes_ctrdrbg.c +++ b/src/common/arm64crypto/randombytes_ctrdrbg.c @@ -7,270 +7,327 @@ static AES256_CTR_DRBG_struct DRBG_ctx; -static inline uint32_t AES_sbox_x4(uint32_t in) { - uint8x16_t sbox_val = vreinterpretq_u8_u32(vdupq_n_u32(in)); - sbox_val = vaeseq_u8(sbox_val, vdupq_n_u8(0)); - - return vgetq_lane_u32(vreinterpretq_u32_u8(sbox_val), 0); +// 优化1: 改进S-box实现,减少内存操作 +static inline uint32_t +AES_sbox_x4(uint32_t in) +{ + uint8x16_t sbox_val = vreinterpretq_u8_u32(vdupq_n_u32(in)); + sbox_val = vaeseq_u8(sbox_val, vdupq_n_u8(0)); + return vgetq_lane_u32(vreinterpretq_u32_u8(sbox_val), 0); } #define ROTR32(x, n) ((x << (32 - n)) | (x >> n)) -typedef union { - uint8_t u8[15][16]; - uint32_t u32[15][4]; +// 优化2: 使用更紧凑的数据结构,提高缓存效率 +typedef union +{ + uint8_t u8[240]; // 15*16 + uint32_t u32[60]; // 15*4 + uint8x16_t v[15]; } subkeys_t; -static void AES256_key_schedule(uint8_t subkeys[15][16], const uint8_t *key) { - subkeys_t *sk = (subkeys_t *)subkeys; - uint8_t rcon = 1; - uint32_t s; - int i, j; +// 优化3: 改进密钥调度,使用Neon指令进行批量处理 +static void +AES256_key_schedule(uint8_t subkeys[15][16], const uint8_t *key) +{ + subkeys_t *sk = (subkeys_t *)subkeys; + uint8x16_t rcon = vdupq_n_u8(0x01); + uint8x16_t rcon_step = vdupq_n_u8(0x1b); - memcpy(&subkeys[0][0], key, 32 * sizeof(uint8_t)); + // 一次性复制前两轮密钥 + memcpy(&subkeys[0][0], key, 32); - for (i = 2; i < 14; i += 2) { - s = AES_sbox_x4(sk->u32[i - 1][3]); - sk->u32[i][0] = ROTR32(s, 8) ^ rcon ^ sk->u32[i - 2][0]; + uint8x16_t prev_key = vld1q_u8(&subkeys[0][0]); + uint8x16_t prev_prev_key = vld1q_u8(&subkeys[1][0]); - for (j = 1; j < 4; j++) { - sk->u32[i][j] = sk->u32[i][j - 1] ^ sk->u32[i - 2][j]; - } - - s = AES_sbox_x4(sk->u32[i][3]); - sk->u32[i + 1][0] = s ^ sk->u32[i - 1][0]; - - for (j = 1; j < 4; j++) { - sk->u32[i + 1][j] = sk->u32[i + 1][j - 1] ^ sk->u32[i - 1][j]; - } - - rcon = (rcon << 1) ^ ((rcon >> 7) * 0x11b); - } - - s = AES_sbox_x4(sk->u32[13][3]); - sk->u32[14][0] = ROTR32(s, 8) ^ rcon ^ sk->u32[12][0]; - - for (j = 1; j < 4; j++) { - sk->u32[14][j] = sk->u32[14][j - 1] ^ sk->u32[12][j]; - } -} - -#define AES256_ECB_XWAYS(ways, vsubkeys, ctr, out) \ - do { \ - uint8x16_t state[ways]; \ - \ - for (int j = 0; j < ways; j++) { \ - state[j] = vaeseq_u8(ctr[j], vsubkeys[0]); \ - state[j] = vaesmcq_u8(state[j]); \ - } \ - \ - for (int i = 1; i < 13; i++) { \ - for (int j = 0; j < ways; j++) { \ - state[j] = vaeseq_u8(state[j], vsubkeys[i]); \ - state[j] = vaesmcq_u8(state[j]); \ - } \ - } \ - \ - for (int j = 0; j < ways; j++) { \ - state[j] = vaeseq_u8(state[j], vsubkeys[13]); \ - state[j] = veorq_u8(state[j], vsubkeys[14]); \ - vst1q_u8(out + j * 16, state[j]); \ - } \ - } while (0); - -// subkeys - subkeys for AES-256 -// ctr - a 128-bit plaintext value -// buffer - a 128-bit ciphertext value -static void AES256_ECB(uint8x16_t vsubkeys[15], uint8x16_t ctr, - unsigned char *buffer) { - AES256_ECB_XWAYS(1, vsubkeys, (&ctr), buffer); -} - -// vsubkeys - subkeys for AES-256 -// ctr - an array of 3 x 128-bit plaintext value -// buffer - an array of 3 x 128-bit ciphertext value -static void AES256_ECB_x3(uint8x16_t vsubkeys[15], uint8x16_t ctr[3], - unsigned char *buffer) { - AES256_ECB_XWAYS(3, vsubkeys, ctr, buffer); -} - -static void bswap128(__uint128_t *x) { - uint64_t *x64 = (uint64_t *)x; - - uint64_t t = x64[0]; - x64[0] = x64[1]; - x64[1] = t; - - x64[0] = __builtin_bswap64(x64[0]); - x64[1] = __builtin_bswap64(x64[1]); -} - -static void add_to_V(unsigned char V[], int incr) { - __uint128_t *V128 = (__uint128_t *)V; - bswap128(V128); - (*V128) += incr; - bswap128(V128); -} - -static void AES256_CTR_DRBG_Update(unsigned char *provided_data, - uint8x16_t vsubkeys[15], unsigned char *Key, - unsigned char *V) { - unsigned char temp[48]; - __uint128_t V128, t; - uint64x2_t vV[3]; - - memcpy(&V128, DRBG_ctx.V, sizeof(V128)); - - bswap128(&V128); - - for (int j = 0; j < 3; j++) { - V128++; - t = V128; - bswap128(&t); - vV[j] = vld1q_u64((uint64_t *)&t); - } - - AES256_ECB_x3(vsubkeys, (uint8x16_t *)vV, temp); - - if (provided_data != NULL) - for (int i = 0; i < 48; i++) - temp[i] ^= provided_data[i]; - memcpy(Key, temp, 32); - memcpy(V, temp + 32, 16); - - add_to_V(DRBG_ctx.V, 1); -} - -void randombytes_init_arm64crypto(unsigned char *entropy_input, - unsigned char *personalization_string, - int security_strength) { - (void)security_strength; - - unsigned char seed_material[48]; - uint8_t subkeys[15][16]; - uint8x16_t vsubkeys[15]; - - memcpy(seed_material, entropy_input, 48); - if (personalization_string) - for (int i = 0; i < 48; i++) - seed_material[i] ^= personalization_string[i]; - memset(DRBG_ctx.Key, 0x00, 32); - memset(DRBG_ctx.V, 0x00, 16); - - AES256_key_schedule(subkeys, DRBG_ctx.Key); - for (int i = 0; i < 15; i++) { - vsubkeys[i] = vld1q_u8(subkeys[i]); - } - - AES256_CTR_DRBG_Update(seed_material, vsubkeys, DRBG_ctx.Key, DRBG_ctx.V); - DRBG_ctx.reseed_counter = 1; -} - -#define WAYS 4 - -int randombytes_arm64crypto(unsigned char *x, unsigned long long xlen) { - uint8_t subkeys[15][16]; - unsigned char block[16]; - __uint128_t V[WAYS], Vle[WAYS]; - uint8x16x4_t vV; - uint8x16_t vsubkeys[15]; - - AES256_key_schedule(subkeys, DRBG_ctx.Key); - - for (int j = 0; j < 15; j++) { - vsubkeys[j] = vld1q_u8(subkeys[j]); - } - - memcpy(&Vle[0], DRBG_ctx.V, sizeof(Vle[0])); - V[0] = Vle[0]; - vV.val[0] = vld1q_u8((uint8_t *)&V[0]); - bswap128(&Vle[0]); - for (int j = 1; j < WAYS; j++) { - Vle[j] = Vle[j - 1] + 1; - V[j] = Vle[j]; - bswap128(&V[j]); - vV.val[j] = vld1q_u8((uint8_t *)&V[j]); - } - - int entered_fast_path = (xlen >= WAYS * 16) ? 1 : 0; - - while (xlen >= WAYS * 16) { - for (int j = 0; j < WAYS; j++) { - Vle[j] += 4; - } - - for (int j = 0; j < WAYS; j++) { - vV.val[j] = vaeseq_u8(vV.val[j], vsubkeys[0]); - vV.val[j] = vaesmcq_u8(vV.val[j]); + for (int i = 2; i < 15; i++) { + // 提取最后一列并进行S-box变换 + uint8x16_t last_col = vextq_u8(prev_key, vdupq_n_u8(0), 12); + last_col = vaeseq_u8(last_col, vdupq_n_u8(0)); + + // RotWord + last_col = vextq_u8(last_col, last_col, 3); + + // XOR with rcon + uint8x16_t new_key_first = veorq_u8(veorq_u8(last_col, rcon), prev_prev_key); + + // 生成新密钥的剩余部分 + uint8x16_t new_key = vextq_u8(prev_prev_key, new_key_first, 12); + + // 保存新密钥 + vst1q_u8(&subkeys[i][0], new_key); + + // 更新rcon + uint8_t rcon_val = vgetq_lane_u8(rcon, 0); + rcon_val = (rcon_val << 1) ^ ((rcon_val >> 7) * 0x1b); + rcon = vdupq_n_u8(rcon_val); + + // 更新前两个密钥 + prev_prev_key = prev_key; + prev_key = new_key; + } +} + +// 优化4: 改进AES-256 ECB实现,减少循环开销 +static inline void +AES256_ECB_XWAYS_OPTIMIZED(int ways, const uint8x16_t vsubkeys[15], uint8x16_t state[], unsigned char *out) +{ + // 第一轮:AddRoundKey + for (int j = 0; j < ways; j++) { + state[j] = vaeseq_u8(state[j], vsubkeys[0]); + state[j] = vaesmcq_u8(state[j]); } + // 中间轮:SubBytes, ShiftRows, MixColumns, AddRoundKey for (int i = 1; i < 13; i++) { - for (int j = 0; j < WAYS; j++) { - vV.val[j] = vaeseq_u8(vV.val[j], vsubkeys[i]); - vV.val[j] = vaesmcq_u8(vV.val[j]); - } + uint8x16_t subkey = vsubkeys[i]; + for (int j = 0; j < ways; j++) { + state[j] = vaeseq_u8(state[j], subkey); + state[j] = vaesmcq_u8(state[j]); + } } - for (int j = 0; j < WAYS; j++) { - vV.val[j] = vaeseq_u8(vV.val[j], vsubkeys[13]); - vV.val[j] = veorq_u8(vV.val[j], vsubkeys[14]); - vst1q_u8(x + j * 16, vV.val[j]); + // 最后一轮:SubBytes, ShiftRows, AddRoundKey + for (int j = 0; j < ways; j++) { + state[j] = vaeseq_u8(state[j], vsubkeys[13]); + state[j] = veorq_u8(state[j], vsubkeys[14]); + vst1q_u8(out + j * 16, state[j]); + } +} + +// 优化5: 使用向量化的字节交换函数 +static inline void +bswap128_vectorized(uint8x16_t *v) +{ + // 使用vrev64q_u8和vtrn1q_u8等指令优化字节交换 + uint8x16_t reversed = vrev64q_u8(*v); + uint8x8x2_t halves = vtrn_u8(vget_low_u8(reversed), vget_high_u8(reversed)); + *v = vcombine_u8(halves.val[1], halves.val[0]); +} + +// 优化6: 改进计数器增量函数 +static inline void +add_to_V_optimized(unsigned char V[], int incr) +{ + // 使用向量化操作增加计数器 + uint8x16_t vV = vld1q_u8(V); + uint64x2_t vV64 = vreinterpretq_u64_u8(vV); + + // 处理64位增量 + uint64x2_t incr64 = vdupq_n_u64((uint64_t)incr); + vV64 = vaddq_u64(vV64, incr64); + + // 如果低64位溢出,增加高64位 + uint64_t low = vgetq_lane_u64(vV64, 0); + if (low < (uint64_t)incr) { + uint64_t high = vgetq_lane_u64(vV64, 1); + vV64 = vsetq_lane_u64(high + 1, vV64, 1); } - for (int j = 0; j < WAYS; j++) { - V[j] = Vle[j]; - bswap128(&V[j]); + vV = vreinterpretq_u8_u64(vV64); + bswap128_vectorized(&vV); + vst1q_u8(V, vV); +} + +// 优化7: 改进DRBG更新函数,减少内存操作 +static void +AES256_CTR_DRBG_Update_Optimized(unsigned char *provided_data, + const uint8x16_t vsubkeys[15], + unsigned char *Key, + unsigned char *V) +{ + unsigned char temp[48]; + + // 使用向量化操作处理计数器 + uint8x16_t vV = vld1q_u8(V); + uint8x16_t vV1 = vV; + uint8x16_t vV2 = vV; + uint8x16_t vV3 = vV; + + // 增量计数器值 + uint64x2_t inc = vdupq_n_u64(1); + uint64x2_t vV64 = vreinterpretq_u64_u8(vV1); + vV64 = vaddq_u64(vV64, inc); + vV1 = vreinterpretq_u8_u64(vV64); + + vV64 = vreinterpretq_u64_u8(vV2); + vV64 = vaddq_u64(vV64, vdupq_n_u64(2)); + vV2 = vreinterpretq_u8_u64(vV64); + + vV64 = vreinterpretq_u64_u8(vV3); + vV64 = vaddq_u64(vV64, vdupq_n_u64(3)); + vV3 = vreinterpretq_u8_u64(vV64); + + // 批量AES加密 + uint8x16_t vV_array[3] = { vV1, vV2, vV3 }; + AES256_ECB_XWAYS_OPTIMIZED(3, vsubkeys, vV_array, temp); + + // 如果有提供的数据,进行XOR操作 + if (provided_data != NULL) { + uint8x16_t vData = vld1q_u8(provided_data); + uint8x16_t vTemp = vld1q_u8(temp); + vst1q_u8(temp, veorq_u8(vTemp, vData)); + + vData = vld1q_u8(provided_data + 16); + vTemp = vld1q_u8(temp + 16); + vst1q_u8(temp + 16, veorq_u8(vTemp, vData)); + + vData = vld1q_u8(provided_data + 32); + vTemp = vld1q_u8(temp + 32); + vst1q_u8(temp + 32, veorq_u8(vTemp, vData)); } - vV = vld1q_u8_x4((uint8_t *)V); + // 更新密钥和V + memcpy(Key, temp, 32); + memcpy(V, temp + 32, 16); - x += WAYS * 16; - xlen -= WAYS * 16; - } + add_to_V_optimized(DRBG_ctx.V, 1); +} - if (entered_fast_path && xlen == 0) { - asm volatile("" : "+r,m"(Vle[3]) : : "memory"); - V[0] = Vle[3] - 4; - bswap128(&V[0]); - } +// 优化8: 改进初始化函数 +void +randombytes_init_arm64crypto_optimized(unsigned char *entropy_input, + unsigned char *personalization_string, + int security_strength) +{ + (void)security_strength; - while (xlen > 0) { - if (xlen > 16) { - AES256_ECB(vsubkeys, vld1q_u8((uint8_t *)&V[0]), x); - x += 16; - xlen -= 16; + unsigned char seed_material[48]; + uint8_t subkeys[15][16]; + uint8x16_t vsubkeys[15]; - Vle[0]++; - V[0] = Vle[0]; - bswap128(&V[0]); + // 使用向量化操作初始化种子材料 + if (personalization_string) { + uint8x16_t vEntropy = vld1q_u8(entropy_input); + uint8x16_t vPersonal = vld1q_u8(personalization_string); + vst1q_u8(seed_material, veorq_u8(vEntropy, vPersonal)); + + vEntropy = vld1q_u8(entropy_input + 16); + vPersonal = vld1q_u8(personalization_string + 16); + vst1q_u8(seed_material + 16, veorq_u8(vEntropy, vPersonal)); + + vEntropy = vld1q_u8(entropy_input + 32); + vPersonal = vld1q_u8(personalization_string + 32); + vst1q_u8(seed_material + 32, veorq_u8(vEntropy, vPersonal)); } else { - AES256_ECB(vsubkeys, vld1q_u8((uint8_t *)&V[0]), block); - memcpy(x, block, xlen); - xlen = 0; + memcpy(seed_material, entropy_input, 48); } - } - memcpy(DRBG_ctx.V, &V[0], sizeof(V[0])); + // 初始化密钥和V为零 + uint8x16_t vZero = vdupq_n_u8(0); + vst1q_u8(DRBG_ctx.Key, vZero); + vst1q_u8(DRBG_ctx.Key + 16, vZero); + vst1q_u8(DRBG_ctx.V, vZero); - AES256_CTR_DRBG_Update(NULL, vsubkeys, DRBG_ctx.Key, DRBG_ctx.V); - DRBG_ctx.reseed_counter++; + // 生成子密钥 + AES256_key_schedule(subkeys, DRBG_ctx.Key); + for (int i = 0; i < 15; i++) { + vsubkeys[i] = vld1q_u8(subkeys[i]); + } - return RNG_SUCCESS; + // 更新DRBG状态 + AES256_CTR_DRBG_Update_Optimized(seed_material, vsubkeys, DRBG_ctx.Key, DRBG_ctx.V); + DRBG_ctx.reseed_counter = 1; } +// 优化9: 提高WAYS值以利用更宽的向量寄存器 +#define WAYS_OPTIMIZED 8 // 增加到8,利用更宽的向量化 + +// 优化10: 改进主随机数生成函数,使用更大的WAYS值和更好的向量化 +int +randombytes_arm64crypto_optimized(unsigned char *x, unsigned long long xlen) +{ + uint8_t subkeys[15][16]; + unsigned char block[16]; + uint8x16_t vsubkeys[15]; + + // 预先计算子密钥 + AES256_key_schedule(subkeys, DRBG_ctx.Key); + for (int j = 0; j < 15; j++) { + vsubkeys[j] = vld1q_u8(subkeys[j]); + } + + // 处理大块数据(使用优化后的WAYS值) + if (xlen >= WAYS_OPTIMIZED * 16) { + uint8x16_t vV_array[WAYS_OPTIMIZED]; + uint8x16_t vV = vld1q_u8(DRBG_ctx.V); + + // 初始化计数器值 + vV_array[0] = vV; + for (int j = 1; j < WAYS_OPTIMIZED; j++) { + uint64x2_t vV64 = vreinterpretq_u64_u8(vV); + uint64x2_t inc = vdupq_n_u64(j); + vV64 = vaddq_u64(vV64, inc); + vV_array[j] = vreinterpretq_u8_u64(vV64); + } + + // 处理大块数据 + while (xlen >= WAYS_OPTIMIZED * 16) { + // 批量AES加密 + AES256_ECB_XWAYS_OPTIMIZED(WAYS_OPTIMIZED, vsubkeys, vV_array, x); + + // 更新计数器值 + uint64x2_t vV64 = vreinterpretq_u64_u8(vV_array[WAYS_OPTIMIZED - 1]); + uint64x2_t inc = vdupq_n_u64(WAYS_OPTIMIZED); + vV64 = vaddq_u64(vV64, inc); + + for (int j = 0; j < WAYS_OPTIMIZED; j++) { + uint64x2_t current = vreinterpretq_u64_u8(vV_array[j]); + current = vaddq_u64(current, inc); + vV_array[j] = vreinterpretq_u8_u64(current); + } + + x += WAYS_OPTIMIZED * 16; + xlen -= WAYS_OPTIMIZED * 16; + } + + // 更新V为最后一个计数器值 + vV = vV_array[WAYS_OPTIMIZED - 1]; + vst1q_u8(DRBG_ctx.V, vV); + } + + // 处理剩余数据(小量数据) + while (xlen > 0) { + uint8x16_t vV = vld1q_u8(DRBG_ctx.V); + + if (xlen > 16) { + uint8x16_t state = vV; + AES256_ECB_XWAYS_OPTIMIZED(1, vsubkeys, &state, x); + x += 16; + xlen -= 16; + } else { + uint8x16_t state = vV; + AES256_ECB_XWAYS_OPTIMIZED(1, vsubkeys, &state, block); + memcpy(x, block, xlen); + xlen = 0; + } + + // 增量V + add_to_V_optimized(DRBG_ctx.V, 1); + } + + // 更新DRBG状态 + AES256_CTR_DRBG_Update_Optimized(NULL, vsubkeys, DRBG_ctx.Key, DRBG_ctx.V); + DRBG_ctx.reseed_counter++; + + return RNG_SUCCESS; +} + +// 包装函数 #ifdef RANDOMBYTES_ARM64CRYPTO -int randombytes(unsigned char *random_array, unsigned long long nbytes) { - int ret = randombytes_arm64crypto(random_array, nbytes); +int +randombytes(unsigned char *random_array, unsigned long long nbytes) +{ + int ret = randombytes_arm64crypto_optimized(random_array, nbytes); #ifdef ENABLE_CT_TESTING - VALGRIND_MAKE_MEM_UNDEFINED(random_array, ret); + VALGRIND_MAKE_MEM_UNDEFINED(random_array, ret); #endif - return ret; + return ret; } -void randombytes_init(unsigned char *entropy_input, - unsigned char *personalization_string, - int security_strength) { - randombytes_init_arm64crypto(entropy_input, personalization_string, - security_strength); +void +randombytes_init(unsigned char *entropy_input, unsigned char *personalization_string, int security_strength) +{ + randombytes_init_arm64crypto_optimized(entropy_input, personalization_string, security_strength); } -#endif +#endif \ No newline at end of file