743 lines
16 KiB
ArmAsm
743 lines
16 KiB
ArmAsm
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
|
/*
|
|
* SM4-GCM AEAD Algorithm using ARMv8 Crypto Extensions
|
|
* as specified in rfc8998
|
|
* https://datatracker.ietf.org/doc/html/rfc8998
|
|
*
|
|
* Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
|
|
* Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
|
|
*/
|
|
|
|
#include <linux/linkage.h>
|
|
#include <linux/cfi_types.h>
|
|
#include <asm/assembler.h>
|
|
#include "sm4-ce-asm.h"
|
|
|
|
.arch armv8-a+crypto
|
|
|
|
/*
 * Build a small symbol table so that a register operand written as
 * "vN.4s" can be turned into its register number N: for each listed N the
 * assembler symbol ".LvN.4s" is set to N.  Only v0-v3 and v24-v31 get a
 * symbol, so those are the only registers the sm4e macro below can encode.
 */
.irp	reg, 0, 1, 2, 3, 24, 25, 26, 27, 28, 29, 30, 31
	.set	.Lv\reg\().4s, \reg
.endr

/*
 * sm4e vd.4s, vn.4s
 *
 * Emits the instruction word directly with .inst instead of using a
 * mnemonic: the number of vd is OR-ed into bits [4:0] and the number of
 * vn into bits [9:5] of the base opcode 0xcec08400.
 */
.macro	sm4e, vd, vn
	.inst	0xcec08400 | .L\vd | (.L\vn << 5)
.endm
|
|
|
|
/* Vector register aliases */

/* Common to the GHASH, encrypt and decrypt routines */
#define	RHASH	v21	/* running GHASH value, bit-reversed while in a register */
#define	RRCONST	v22	/* reduction constant replicated from .Lghash_rconst */
#define	RZERO	v23	/* all zeroes; each routine clears it before use */
|
|
|
|
/* Helper macros. */
|
|
|
|
/*
 * Carry-less 128 x 128 -> 256 bit multiplication (not reduced).
 *
 *   input:   m0, m1   the two 128-bit operands
 *   output:  r0:r1    product, low 128 bits in r0, high 128 bits in r1
 *   scratch: T0, T1   overwritten
 *
 * RZERO has to contain zero on entry.
 */
#define PMUL_128x128(r0, r1, m0, m1, T0, T1)				\
		/* T0 = m1 with its two 64-bit halves exchanged */	\
		ext		T0.16b, m1.16b, m1.16b, #8;		\
		/* r0 = m0.lo * m1.lo */				\
		pmull		r0.1q, m0.1d, m1.1d;			\
		/* T1 = m0.lo * m1.hi */				\
		pmull		T1.1q, m0.1d, T0.1d;			\
		/* T0 = m0.hi * m1.lo */				\
		pmull2		T0.1q, m0.2d, T0.2d;			\
		/* r1 = m0.hi * m1.hi */				\
		pmull2		r1.1q, m0.2d, m1.2d;			\
		/* T0 = sum of the two cross products */		\
		eor		T0.16b, T0.16b, T1.16b;			\
		/* T1 = cross sum << 64, T0 = cross sum >> 64 */	\
		ext		T1.16b, RZERO.16b, T0.16b, #8;		\
		ext		T0.16b, T0.16b, RZERO.16b, #8;		\
		/* fold the cross sum into both product halves */	\
		eor		r1.16b, r1.16b, T0.16b;			\
		eor		r0.16b, r0.16b, T1.16b;
|
|
|
|
/*
 * Four independent carry-less 128 x 128 -> 256 bit multiplications with
 * their instructions interleaved stage by stage.
 *
 *   input:   (m0, m1) (m2, m3) (m4, m5) (m6, m7)
 *   output:  r0:r1, r2:r3, r4:r5, r6:r7  (low half in the even register)
 *   scratch: T0-T7, overwritten
 *
 * RZERO has to contain zero on entry.  The four products are NOT summed
 * here.  The pmull2 that writes r1/r3/r5/r7 is the last instruction that
 * reads m0/m2/m4/m6, which is what allows a caller to pass the same
 * register for both; keep that ordering when touching this macro.
 */
#define PMUL_128x128_4x(r0, r1, m0, m1, T0, T1,			\
			r2, r3, m2, m3, T2, T3,			\
			r4, r5, m4, m5, T4, T5,			\
			r6, r7, m6, m7, T6, T7)			\
		/* Teven = second operand with 64-bit halves swapped */	\
		ext		T0.16b, m1.16b, m1.16b, #8;		\
		ext		T2.16b, m3.16b, m3.16b, #8;		\
		ext		T4.16b, m5.16b, m5.16b, #8;		\
		ext		T6.16b, m7.16b, m7.16b, #8;		\
		/* low x low */						\
		pmull		r0.1q, m0.1d, m1.1d;			\
		pmull		r2.1q, m2.1d, m3.1d;			\
		pmull		r4.1q, m4.1d, m5.1d;			\
		pmull		r6.1q, m6.1d, m7.1d;			\
		/* low x high */					\
		pmull		T1.1q, m0.1d, T0.1d;			\
		pmull		T3.1q, m2.1d, T2.1d;			\
		pmull		T5.1q, m4.1d, T4.1d;			\
		pmull		T7.1q, m6.1d, T6.1d;			\
		/* high x low */					\
		pmull2		T0.1q, m0.2d, T0.2d;			\
		pmull2		T2.1q, m2.2d, T2.2d;			\
		pmull2		T4.1q, m4.2d, T4.2d;			\
		pmull2		T6.1q, m6.2d, T6.2d;			\
		/* high x high (last read of m0/m2/m4/m6) */		\
		pmull2		r1.1q, m0.2d, m1.2d;			\
		pmull2		r3.1q, m2.2d, m3.2d;			\
		pmull2		r5.1q, m4.2d, m5.2d;			\
		pmull2		r7.1q, m6.2d, m7.2d;			\
		/* Teven = sum of the two cross products */		\
		eor		T0.16b, T0.16b, T1.16b;			\
		eor		T2.16b, T2.16b, T3.16b;			\
		eor		T4.16b, T4.16b, T5.16b;			\
		eor		T6.16b, T6.16b, T7.16b;			\
		/* Todd = cross sum << 64 */				\
		ext		T1.16b, RZERO.16b, T0.16b, #8;		\
		ext		T3.16b, RZERO.16b, T2.16b, #8;		\
		ext		T5.16b, RZERO.16b, T4.16b, #8;		\
		ext		T7.16b, RZERO.16b, T6.16b, #8;		\
		/* Teven = cross sum >> 64 */				\
		ext		T0.16b, T0.16b, RZERO.16b, #8;		\
		ext		T2.16b, T2.16b, RZERO.16b, #8;		\
		ext		T4.16b, T4.16b, RZERO.16b, #8;		\
		ext		T6.16b, T6.16b, RZERO.16b, #8;		\
		/* fold into the low halves */				\
		eor		r0.16b, r0.16b, T1.16b;			\
		eor		r2.16b, r2.16b, T3.16b;			\
		eor		r4.16b, r4.16b, T5.16b;			\
		eor		r6.16b, r6.16b, T7.16b;			\
		/* fold into the high halves */				\
		eor		r1.16b, r1.16b, T0.16b;			\
		eor		r3.16b, r3.16b, T2.16b;			\
		eor		r5.16b, r5.16b, T4.16b;			\
		eor		r7.16b, r7.16b, T6.16b;
|
|
|
|
/*
 * Fold a 256-bit carry-less product back to 128 bits using the constant
 * held in rconst.
 *
 *   input:   r0:r1    product, low 128 bits in r0, high 128 bits in r1
 *            rconst   reduction constant in both 64-bit lanes
 *   output:  res      reduced 128-bit value
 *   scratch: T0, T1   overwritten; r0 and r1 are modified as well
 *
 * RZERO has to contain zero on entry.
 */
#define REDUCTION(res, r0, r1, rconst, T0, T1)				\
		/* T0 = r1.hi * rconst */				\
		pmull2		T0.1q, r1.2d, rconst.2d;		\
		/* T1 = T0 >> 64, T0 = T0 << 64 */			\
		ext		T1.16b, T0.16b, RZERO.16b, #8;		\
		ext		T0.16b, RZERO.16b, T0.16b, #8;		\
		eor		r0.16b, r0.16b, T0.16b;			\
		eor		r1.16b, r1.16b, T1.16b;			\
		/* T0 = r1.lo * rconst, then fold into the low half */	\
		pmull		T0.1q, r1.1d, rconst.1d;		\
		eor		res.16b, r0.16b, T0.16b;
|
|
|
|
/*
 * One SM4 block operation on b0 interleaved with one unreduced carry-less
 * 128 x 128 multiplication (same arithmetic as PMUL_128x128).
 *
 *   in/out:  b0       block to run through the eight sm4e steps, in place
 *   input:   m0, m1   multiplication operands
 *   output:  r0:r1    product, low 128 bits in r0, high 128 bits in r1
 *   scratch: T0, T1   overwritten
 *
 * The round keys are taken from v24-v31 (presumably loaded by SM4_PREPARE,
 * which is defined outside this file) and RZERO has to contain zero.
 * b0 must be a register the sm4e macro can encode.
 *
 * The SM4 instructions are kept at the shallow indent, the multiplication
 * at the deep indent, so each stream can be read on its own.
 */
#define SM4_CRYPT_PMUL_128x128_BLK(b0, r0, r1, m0, m1, T0, T1)		\
	/* byte-swap every 32-bit word of the block */			\
	rev32			b0.16b, b0.16b;				\
		ext		T0.16b, m1.16b, m1.16b, #8;		\
	sm4e			b0.4s, v24.4s;				\
		pmull		r0.1q, m0.1d, m1.1d;			\
	sm4e			b0.4s, v25.4s;				\
		pmull		T1.1q, m0.1d, T0.1d;			\
	sm4e			b0.4s, v26.4s;				\
		pmull2		T0.1q, m0.2d, T0.2d;			\
	sm4e			b0.4s, v27.4s;				\
		pmull2		r1.1q, m0.2d, m1.2d;			\
	sm4e			b0.4s, v28.4s;				\
		eor		T0.16b, T0.16b, T1.16b;			\
	sm4e			b0.4s, v29.4s;				\
		ext		T1.16b, RZERO.16b, T0.16b, #8;		\
	sm4e			b0.4s, v30.4s;				\
		ext		T0.16b, T0.16b, RZERO.16b, #8;		\
	sm4e			b0.4s, v31.4s;				\
		eor		r0.16b, r0.16b, T1.16b;			\
	/* rev64 + ext #8 reverse the order of the four words ... */	\
	rev64			b0.4s, b0.4s;				\
		eor		r1.16b, r1.16b, T0.16b;			\
	ext			b0.16b, b0.16b, b0.16b, #8;		\
	/* ... and each word is byte-swapped back */			\
	rev32			b0.16b, b0.16b;
|
|
|
|
/*
 * Three SM4 block operations (b0, b1, b2, in place) interleaved with three
 * carry-less 128 x 128 multiplications whose products are summed.
 *
 *   in/out:  b0, b1, b2          blocks run through the eight sm4e steps
 *   input:   (m0, m1) (m2, m3) (m4, m5)   multiplication operands
 *   output:  r0:r1 = m0*m1 ^ m2*m3 ^ m4*m5 (unreduced, low half in r0)
 *   scratch: T0-T5 and r2-r5, overwritten
 *
 * The round keys are taken from v24-v31 (presumably loaded by SM4_PREPARE,
 * which is defined outside this file) and RZERO has to contain zero.
 * b0-b2 must be registers the sm4e macro can encode.
 *
 * The pmull2 that writes r1/r3/r5 is the last instruction reading
 * m0/m2/m4, so a caller may pass the same register for both; keep that
 * ordering when touching this macro.
 */
#define SM4_CRYPT_PMUL_128x128_BLK3(b0, b1, b2,			\
				    r0, r1, m0, m1, T0, T1,		\
				    r2, r3, m2, m3, T2, T3,		\
				    r4, r5, m4, m5, T4, T5)		\
	/* byte-swap every 32-bit word of the blocks */			\
	rev32			b0.16b, b0.16b;				\
	rev32			b1.16b, b1.16b;				\
	rev32			b2.16b, b2.16b;				\
		/* Teven = second operand, halves swapped */		\
		ext		T0.16b, m1.16b, m1.16b, #8;		\
		ext		T2.16b, m3.16b, m3.16b, #8;		\
		ext		T4.16b, m5.16b, m5.16b, #8;		\
	sm4e			b0.4s, v24.4s;				\
	sm4e			b1.4s, v24.4s;				\
	sm4e			b2.4s, v24.4s;				\
		/* low x low */						\
		pmull		r0.1q, m0.1d, m1.1d;			\
		pmull		r2.1q, m2.1d, m3.1d;			\
		pmull		r4.1q, m4.1d, m5.1d;			\
	sm4e			b0.4s, v25.4s;				\
	sm4e			b1.4s, v25.4s;				\
	sm4e			b2.4s, v25.4s;				\
		/* low x high */					\
		pmull		T1.1q, m0.1d, T0.1d;			\
		pmull		T3.1q, m2.1d, T2.1d;			\
		pmull		T5.1q, m4.1d, T4.1d;			\
	sm4e			b0.4s, v26.4s;				\
	sm4e			b1.4s, v26.4s;				\
	sm4e			b2.4s, v26.4s;				\
		/* high x low */					\
		pmull2		T0.1q, m0.2d, T0.2d;			\
		pmull2		T2.1q, m2.2d, T2.2d;			\
		pmull2		T4.1q, m4.2d, T4.2d;			\
	sm4e			b0.4s, v27.4s;				\
	sm4e			b1.4s, v27.4s;				\
	sm4e			b2.4s, v27.4s;				\
		/* high x high (last read of m0/m2/m4) */		\
		pmull2		r1.1q, m0.2d, m1.2d;			\
		pmull2		r3.1q, m2.2d, m3.2d;			\
		pmull2		r5.1q, m4.2d, m5.2d;			\
	sm4e			b0.4s, v28.4s;				\
	sm4e			b1.4s, v28.4s;				\
	sm4e			b2.4s, v28.4s;				\
		/* Teven = sum of the two cross products */		\
		eor		T0.16b, T0.16b, T1.16b;			\
		eor		T2.16b, T2.16b, T3.16b;			\
		eor		T4.16b, T4.16b, T5.16b;			\
	sm4e			b0.4s, v29.4s;				\
	sm4e			b1.4s, v29.4s;				\
	sm4e			b2.4s, v29.4s;				\
		/* Todd = cross sum << 64 */				\
		ext		T1.16b, RZERO.16b, T0.16b, #8;		\
		ext		T3.16b, RZERO.16b, T2.16b, #8;		\
		ext		T5.16b, RZERO.16b, T4.16b, #8;		\
	sm4e			b0.4s, v30.4s;				\
	sm4e			b1.4s, v30.4s;				\
	sm4e			b2.4s, v30.4s;				\
		/* Teven = cross sum >> 64 */				\
		ext		T0.16b, T0.16b, RZERO.16b, #8;		\
		ext		T2.16b, T2.16b, RZERO.16b, #8;		\
		ext		T4.16b, T4.16b, RZERO.16b, #8;		\
	sm4e			b0.4s, v31.4s;				\
	sm4e			b1.4s, v31.4s;				\
	sm4e			b2.4s, v31.4s;				\
		/* fold into the low halves */				\
		eor		r0.16b, r0.16b, T1.16b;			\
		eor		r2.16b, r2.16b, T3.16b;			\
		eor		r4.16b, r4.16b, T5.16b;			\
	/* rev64 + ext #8 reverse the order of the four words */	\
	rev64			b0.4s, b0.4s;				\
	rev64			b1.4s, b1.4s;				\
	rev64			b2.4s, b2.4s;				\
		/* fold into the high halves */				\
		eor		r1.16b, r1.16b, T0.16b;			\
		eor		r3.16b, r3.16b, T2.16b;			\
		eor		r5.16b, r5.16b, T4.16b;			\
	ext			b0.16b, b0.16b, b0.16b, #8;		\
	ext			b1.16b, b1.16b, b1.16b, #8;		\
	ext			b2.16b, b2.16b, b2.16b, #8;		\
		/* add the second product to the first */		\
		eor		r0.16b, r0.16b, r2.16b;			\
		eor		r1.16b, r1.16b, r3.16b;			\
	/* byte-swap every 32-bit word back */				\
	rev32			b0.16b, b0.16b;				\
	rev32			b1.16b, b1.16b;				\
	rev32			b2.16b, b2.16b;				\
		/* add the third product */				\
		eor		r0.16b, r0.16b, r4.16b;			\
		eor		r1.16b, r1.16b, r5.16b;
|
|
|
|
/*
 * Materialise the current counter block in 'ctr' and advance the counter.
 *
 * x8:x9 hold the counter as two byte-reversed 64-bit words (x8 is the first
 * eight bytes of the block).  'ctr' receives the value before the increment,
 * with every 64-bit lane byte-reversed again by rev64.  Only the low 32 bits
 * of x9 are incremented; they wrap without carrying into the upper bits.
 *
 * Clobbers x6.
 */
#define inc32_le128(ctr)					\
		mov		ctr.d[0], x8;			\
		mov		ctr.d[1], x9;			\
		/* w6 = low 32 bits of the counter + 1 */	\
		add		w6, w9, #1;			\
		/* write them back, upper half of x9 untouched */	\
		bfi		x9, x6, #0, #32;		\
		rev64		ctr.16b, ctr.16b;
|
|
|
|
/*
 * Final GCM step: absorb the lengths block into the hash and XOR the result
 * with the encrypted initial counter block.
 *
 *   x7:      address of the 16-byte lengths block
 *   x8:x9:   counter words as kept by inc32_le128; the low 32 bits of x9
 *            are overwritten with 1 here
 *   vctr0:   scratch register that receives CTR0 and then E(K, CTR0); must
 *            be a register the sm4e macro can encode
 *   vlen:    scratch register for the lengths block
 *   RHASH:   in: running hash (bit-reversed form), out: the tag
 *
 * Uses RR0, RR1, RTMP0-RTMP3 and RH1 under whatever definitions are active
 * where the macro is expanded.  Clobbers x6.
 */
#define GTAG_HASH_LENGTHS(vctr0, vlen)					\
		ld1		{vlen.16b}, [x7];				\
		/* construct CTR0 */						\
		/* the low 32 bits of the initial counter are always be32(1) */	\
		mov		x6, #0x1;					\
		bfi		x9, x6, #0, #32;				\
		mov		vctr0.d[0], x8;					\
		mov		vctr0.d[1], x9;					\
		rbit		vlen.16b, vlen.16b;				\
		rev64		vctr0.16b, vctr0.16b;				\
		/* authtag = GCTR(CTR0, GHASH) */				\
		eor		RHASH.16b, RHASH.16b, vlen.16b;			\
		/* encrypt CTR0 while multiplying the hash by H */		\
		SM4_CRYPT_PMUL_128x128_BLK(vctr0, RR0, RR1, RHASH, RH1,		\
					   RTMP0, RTMP1);			\
		REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3);		\
		/* back to memory bit order, then mask with E(K, CTR0) */	\
		rbit		RHASH.16b, RHASH.16b;				\
		eor		RHASH.16b, RHASH.16b, vctr0.16b;
|
|
|
|
|
|
/*
 * Register aliases for the GHASH-only and the encrypt + GHASH routines
 */

/*
 * High 128 bits of the four products.  These are allowed to be the same
 * registers as the data inputs v0-v3.
 */
#define	RR1	v0
#define	RR3	v1
#define	RR5	v2
#define	RR7	v3

/* Low 128 bits of the four products */
#define	RR0	v4
#define	RR2	v5
#define	RR4	v6
#define	RR6	v7

/* Scratch registers for the multiplications and the reduction */
#define	RTMP0	v8
#define	RTMP1	v9
#define	RTMP2	v10
#define	RTMP3	v11
#define	RTMP4	v12
#define	RTMP5	v13
#define	RTMP6	v14
#define	RTMP7	v15

/* The four 16-byte entries of the ghash table: H^1 .. H^4 */
#define	RH1	v16
#define	RH2	v17
#define	RH3	v18
#define	RH4	v19
|
|
|
|
.align 3
SYM_FUNC_START(sm4_ce_pmull_ghash_setup)
	/*
	 * Derive the hash key H from the cipher key and store H^1..H^4.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: ghash table (output, 4 x 16 bytes)
	 *
	 * x2 is used as scratch.
	 */
	SM4_PREPARE(x0)

	/* constants needed by PMUL_128x128() and REDUCTION() */
	eor		RZERO.16b, RZERO.16b, RZERO.16b

	adr_l		x2, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x2]

	/* H = E(K, 0^128) */
	rev32		v0.16b, RZERO.16b
	SM4_CRYPT_BLK_BE(v0)

	/* H ^ 1: the hash key in the bit-reversed form used for multiplying */
	rbit		RH1.16b, v0.16b

	/* H ^ 2 = H * H */
	PMUL_128x128(RR0, RR1, RH1, RH1, RTMP0, RTMP1)
	REDUCTION(RH2, RR0, RR1, RRCONST, RTMP2, RTMP3)

	/* H ^ 3 = H^2 * H */
	PMUL_128x128(RR0, RR1, RH2, RH1, RTMP0, RTMP1)
	REDUCTION(RH3, RR0, RR1, RRCONST, RTMP2, RTMP3)

	/* H ^ 4 = H^2 * H^2 */
	PMUL_128x128(RR0, RR1, RH2, RH2, RTMP0, RTMP1)
	REDUCTION(RH4, RR0, RR1, RRCONST, RTMP2, RTMP3)

	/* table layout: H^1, H^2, H^3, H^4 */
	st1		{RH1.16b-RH4.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_pmull_ghash_setup)
|
|
|
|
.align 3
SYM_FUNC_START(pmull_ghash_update)
	/*
	 * Absorb nblocks 16-byte blocks from src into the running hash.
	 *
	 * input:
	 *   x0: ghash table (H^1..H^4)
	 *   x1: ghash result (read, then updated in place)
	 *   x2: src
	 *   w3: nblocks, treated as unsigned; 0 is a no-op
	 *
	 * x4 is used as scratch.
	 */

	/*
	 * Nothing to hash: return before touching anything.  Without this
	 * check a zero count would fall into the 1x loop, wrap to 0xffffffff
	 * on the first decrement and read far beyond src.
	 */
	cbz		w3, .Lghash_ret

	ld1		{RH1.16b-RH4.16b}, [x0]

	/* the hash is kept bit-reversed while it lives in RHASH */
	ld1		{RHASH.16b}, [x1]
	rbit		RHASH.16b, RHASH.16b

	adr_l		x4, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x4]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

.Lghash_loop_4x:
	/* w3 != 0 here; the count is unsigned, so use an unsigned branch */
	cmp		w3, #4
	b.lo		.Lghash_loop_1x

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rbit		v0.16b, v0.16b
	rbit		v1.16b, v1.16b
	rbit		v2.16b, v2.16b
	rbit		v3.16b, v3.16b

	/*
	 * (in0 ^ HASH) * H^4 => rr0:rr1
	 * (in1)        * H^3 => rr2:rr3
	 * (in2)        * H^2 => rr4:rr5
	 * (in3)        * H^1 => rr6:rr7
	 */
	eor		RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
			RR2, RR3, v1, RH3, RTMP2, RTMP3,
			RR4, RR5, v2, RH2, RTMP4, RTMP5,
			RR6, RR7, v3, RH1, RTMP6, RTMP7)

	/* sum the four unreduced products, then reduce once */
	eor		RR0.16b, RR0.16b, RR2.16b
	eor		RR1.16b, RR1.16b, RR3.16b
	eor		RR0.16b, RR0.16b, RR4.16b
	eor		RR1.16b, RR1.16b, RR5.16b
	eor		RR0.16b, RR0.16b, RR6.16b
	eor		RR1.16b, RR1.16b, RR7.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	cbz		w3, .Lghash_end
	b		.Lghash_loop_4x

.Lghash_loop_1x:
	/* 1..3 blocks left on entry */
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbnz		w3, .Lghash_loop_1x

.Lghash_end:
	/* back to the in-memory bit order */
	rbit		RHASH.16b, RHASH.16b
	st1		{RHASH.2d}, [x1]

.Lghash_ret:
	ret
SYM_FUNC_END(pmull_ghash_update)
|
|
|
|
.align 3
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_enc)
	/*
	 * CTR-encrypt nbytes from src to dst and absorb the produced
	 * ciphertext into the running GHASH.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes, treated as unsigned
	 *   x5: ghash result
	 *   x6: ghash table
	 *   x7: lengths (only for last block)
	 *
	 * If x7 is NULL the advanced counter is written back to [x3] and the
	 * intermediate hash to [x5].  Otherwise the lengths block is hashed
	 * and the tag is written to [x5]; the counter is not written back.
	 *
	 * x0 and x6 are reused as scratch once their arguments are consumed;
	 * x8/x9 hold the counter.
	 */
	SM4_PREPARE(x0)

	/* x8:x9 = counter as two byte-reversed 64-bit words */
	ldp		x8, x9, [x3]
	rev		x8, x8
	rev		x9, x9

	ld1		{RH1.16b-RH4.16b}, [x6]

	/* the hash is kept bit-reversed while it lives in RHASH */
	ld1		{RHASH.16b}, [x5]
	rbit		RHASH.16b, RHASH.16b

	/* x6 (ghash table) is no longer needed from here on */
	adr_l		x6, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x6]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	cbz		w4, .Lgcm_enc_hash_len

.Lgcm_enc_loop_4x:
	/*
	 * nbytes is a byte count: compare unsigned, so that a count with
	 * bit 31 set cannot be mistaken for "less than one chunk".
	 */
	cmp		w4, #(4 * 16)
	b.lo		.Lgcm_enc_loop_1x

	sub		w4, w4, #(4 * 16)

	/* construct CTRs */
	inc32_le128(v0)			/* +0 */
	inc32_le128(v1)			/* +1 */
	inc32_le128(v2)			/* +2 */
	inc32_le128(v3)			/* +3 */

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	/* ciphertext = keystream ^ plaintext */
	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	/* ghash update over the ciphertext just produced */

	rbit		v0.16b, v0.16b
	rbit		v1.16b, v1.16b
	rbit		v2.16b, v2.16b
	rbit		v3.16b, v3.16b

	/*
	 * (in0 ^ HASH) * H^4 => rr0:rr1
	 * (in1)        * H^3 => rr2:rr3
	 * (in2)        * H^2 => rr4:rr5
	 * (in3)        * H^1 => rr6:rr7
	 */
	eor		RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
			RR2, RR3, v1, RH3, RTMP2, RTMP3,
			RR4, RR5, v2, RH2, RTMP4, RTMP5,
			RR6, RR7, v3, RH1, RTMP6, RTMP7)

	/* sum the four unreduced products, then reduce once */
	eor		RR0.16b, RR0.16b, RR2.16b
	eor		RR1.16b, RR1.16b, RR3.16b
	eor		RR0.16b, RR0.16b, RR4.16b
	eor		RR1.16b, RR1.16b, RR5.16b
	eor		RR0.16b, RR0.16b, RR6.16b
	eor		RR1.16b, RR1.16b, RR7.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	cbz		w4, .Lgcm_enc_hash_len
	b		.Lgcm_enc_loop_4x

.Lgcm_enc_loop_1x:
	/* unsigned compare, see .Lgcm_enc_loop_4x */
	cmp		w4, #16
	b.lo		.Lgcm_enc_tail

	sub		w4, w4, #16

	/* construct CTRs */
	inc32_le128(v0)

	ld1		{RTMP0.16b}, [x2], #16

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, RTMP0.16b
	st1		{v0.16b}, [x1], #16

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbz		w4, .Lgcm_enc_hash_len
	b		.Lgcm_enc_loop_1x

.Lgcm_enc_tail:
	/* 1..15 bytes left: one more keystream block */
	inc32_le128(v0)
	SM4_CRYPT_BLK(v0)

	/*
	 * load permute table: the 16 bytes at (table + 32 - w4) select the
	 * top w4 bytes of the source into the low w4 lanes and 0xff (which
	 * tbl turns into zero) into every other lane
	 */
	adr_l		x0, .Lcts_permute_table
	add		x0, x0, #32
	sub		x0, x0, w4, uxtw
	ld1		{v3.16b}, [x0]

.Lgcm_enc_tail_loop:
	/* do encrypt */
	ldrb		w0, [x2], #1	/* get 1 byte from input */
	umov		w6, v0.b[0]	/* next keystream byte */
	eor		w6, w6, w0	/* w6 = keystream ^ input */
	strb		w6, [x1], #1	/* store out byte */

	/* shift right out one byte */
	ext		v0.16b, v0.16b, v0.16b, #1
	/* the last ciphertext is placed in high bytes */
	ins		v0.b[15], w6

	subs		w4, w4, #1
	bne		.Lgcm_enc_tail_loop

	/* move the ciphertext to the low bytes, padding with zeros */
	tbl		v0.16b, {v0.16b}, v3.16b

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_enc_hash_len:
	/* no lengths block: this was not the final call */
	cbz		x7, .Lgcm_enc_end

	GTAG_HASH_LENGTHS(v1, v3)

	b		.Lgcm_enc_ret

.Lgcm_enc_end:
	/* store new CTR */
	rev		x8, x8
	rev		x9, x9
	stp		x8, x9, [x3]

	/* back to the in-memory bit order */
	rbit		RHASH.16b, RHASH.16b

.Lgcm_enc_ret:
	/* store new MAC */
	st1		{RHASH.2d}, [x5]

	ret
SYM_FUNC_END(sm4_ce_pmull_gcm_enc)
|
|
|
|
/* Drop the encrypt-side aliases before they are bound again for decrypt */
#undef	RR0
#undef	RR1
#undef	RR2
#undef	RR3
#undef	RR4
#undef	RR5
#undef	RR6
#undef	RR7
#undef	RTMP0
#undef	RTMP1
#undef	RTMP2
#undef	RTMP3
#undef	RTMP4
#undef	RTMP5
#undef	RTMP6
#undef	RTMP7
#undef	RH1
#undef	RH2
#undef	RH3
#undef	RH4


/*
 * Register aliases for the decrypt + GHASH routine
 *
 * v0-v2 are used for building CTRs, v3-v5 for saving inputs.
 */

/* High 128 bits of the three products */
#define	RR1	v6
#define	RR3	v7
#define	RR5	v8

/* Low 128 bits of the three products */
#define	RR0	v9
#define	RR2	v10
#define	RR4	v11

/* Scratch registers for the multiplications and the reduction */
#define	RTMP0	v12
#define	RTMP1	v13
#define	RTMP2	v14
#define	RTMP3	v15
#define	RTMP4	v16
#define	RTMP5	v17

/* The first three 16-byte entries of the ghash table: H^1 .. H^3 */
#define	RH1	v18
#define	RH2	v19
#define	RH3	v20
|
|
|
|
.align 3
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_dec)
	/*
	 * CTR-decrypt nbytes from src to dst and absorb the consumed
	 * ciphertext into the running GHASH.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes, treated as unsigned
	 *   x5: ghash result
	 *   x6: ghash table
	 *   x7: lengths (only for last block)
	 *
	 * If x7 is NULL the advanced counter is written back to [x3] and the
	 * intermediate hash to [x5].  Otherwise the lengths block is hashed
	 * and the tag is written to [x5]; the counter is not written back.
	 *
	 * x0 and x6 are reused as scratch once their arguments are consumed;
	 * x8/x9 hold the counter.
	 */
	SM4_PREPARE(x0)

	/* x8:x9 = counter as two byte-reversed 64-bit words */
	ldp		x8, x9, [x3]
	rev		x8, x8
	rev		x9, x9

	/* only H^1..H^3 are needed: the wide loop handles three blocks */
	ld1		{RH1.16b-RH3.16b}, [x6]

	/* the hash is kept bit-reversed while it lives in RHASH */
	ld1		{RHASH.16b}, [x5]
	rbit		RHASH.16b, RHASH.16b

	/* x6 (ghash table) is no longer needed from here on */
	adr_l		x6, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x6]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	cbz		w4, .Lgcm_dec_hash_len

.Lgcm_dec_loop_3x:
	/*
	 * nbytes is a byte count: compare unsigned, so that a count with
	 * bit 31 set cannot be mistaken for "less than one chunk".
	 */
	cmp		w4, #(3 * 16)
	b.lo		.Lgcm_dec_loop_1x

	sub		w4, w4, #(3 * 16)

	/* v3-v5 keep the ciphertext for the final XOR */
	ld1		{v3.16b-v5.16b}, [x2], #(3 * 16)

	/*
	 * construct CTRs; v6-v8 (the registers behind RR1/RR3/RR5) receive
	 * the bit-reversed ciphertext used as GHASH input
	 */
	inc32_le128(v0)			/* +0 */
	rbit		v6.16b, v3.16b
	inc32_le128(v1)			/* +1 */
	rbit		v7.16b, v4.16b
	inc32_le128(v2)			/* +2 */
	rbit		v8.16b, v5.16b

	eor		RHASH.16b, RHASH.16b, v6.16b

	/*
	 * decrypt & ghash update:
	 *   (in0 ^ HASH) * H^3, (in1) * H^2, (in2) * H^1, summed in rr0:rr1
	 */
	SM4_CRYPT_PMUL_128x128_BLK3(v0, v1, v2,
				    RR0, RR1, RHASH, RH3, RTMP0, RTMP1,
				    RR2, RR3, v7, RH2, RTMP2, RTMP3,
				    RR4, RR5, v8, RH1, RTMP4, RTMP5)

	/* plaintext = keystream ^ ciphertext */
	eor		v0.16b, v0.16b, v3.16b
	eor		v1.16b, v1.16b, v4.16b
	eor		v2.16b, v2.16b, v5.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	st1		{v0.16b-v2.16b}, [x1], #(3 * 16)

	cbz		w4, .Lgcm_dec_hash_len
	b		.Lgcm_dec_loop_3x

.Lgcm_dec_loop_1x:
	/* unsigned compare, see .Lgcm_dec_loop_3x */
	cmp		w4, #16
	b.lo		.Lgcm_dec_tail

	sub		w4, w4, #16

	ld1		{v3.16b}, [x2], #16

	/* construct CTRs */
	inc32_le128(v0)
	rbit		v6.16b, v3.16b

	eor		RHASH.16b, RHASH.16b, v6.16b

	SM4_CRYPT_PMUL_128x128_BLK(v0, RR0, RR1, RHASH, RH1, RTMP0, RTMP1)

	eor		v0.16b, v0.16b, v3.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	st1		{v0.16b}, [x1], #16

	cbz		w4, .Lgcm_dec_hash_len
	b		.Lgcm_dec_loop_1x

.Lgcm_dec_tail:
	/* 1..15 bytes left: one more keystream block */
	inc32_le128(v0)
	SM4_CRYPT_BLK(v0)

	/*
	 * load permute table: the 16 bytes at (table + 32 - w4) select the
	 * top w4 bytes of the source into the low w4 lanes and 0xff (which
	 * tbl turns into zero) into every other lane
	 */
	adr_l		x0, .Lcts_permute_table
	add		x0, x0, #32
	sub		x0, x0, w4, uxtw
	ld1		{v3.16b}, [x0]

.Lgcm_dec_tail_loop:
	/* do decrypt */
	ldrb		w0, [x2], #1	/* get 1 byte from input */
	umov		w6, v0.b[0]	/* next keystream byte */
	eor		w6, w6, w0	/* w6 = keystream ^ input */
	strb		w6, [x1], #1	/* store out byte */

	/* shift right out one byte */
	ext		v0.16b, v0.16b, v0.16b, #1
	/* the input (ciphertext) byte is collected in the high bytes */
	ins		v0.b[15], w0

	subs		w4, w4, #1
	bne		.Lgcm_dec_tail_loop

	/* move the ciphertext to the low bytes, padding with zeros */
	tbl		v0.16b, {v0.16b}, v3.16b

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_dec_hash_len:
	/* no lengths block: this was not the final call */
	cbz		x7, .Lgcm_dec_end

	GTAG_HASH_LENGTHS(v1, v3)

	b		.Lgcm_dec_ret

.Lgcm_dec_end:
	/* store new CTR */
	rev		x8, x8
	rev		x9, x9
	stp		x8, x9, [x3]

	/* back to the in-memory bit order */
	rbit		RHASH.16b, RHASH.16b

.Lgcm_dec_ret:
	/* store new MAC */
	st1		{RHASH.2d}, [x5]

	ret
SYM_FUNC_END(sm4_ce_pmull_gcm_dec)
|
|
|
|
.section ".rodata", "a"
|
|
.align 4
|
|
.Lcts_permute_table:
|
|
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
|
|
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
|
|
.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
|
|
.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
|
|
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
|
|
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
|
|
|
|
.Lghash_rconst:
|
|
.quad 0x87
|