linux-zen-server/arch/arm/crypto/blake2b-neon-core.S

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * BLAKE2b digest algorithm, NEON accelerated
 *
 * Copyright 2020 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

	.text
	.fpu		neon

	// The arguments to blake2b_compress_neon(), in the order they appear
	// in its prototype: (state, block, nblocks, inc).
	STATE		.req	r0
	BLOCK		.req	r1
	NBLOCKS		.req	r2
	INC		.req	r3

	// Pointers to the rotation tables (.Lror24_table / .Lror16_table).
	// They are set up once at function entry and then used by every
	// invocation of _blake2b_round.
	ROR24_TABLE	.req	r4
	ROR16_TABLE	.req	r5

	// The original stack pointer, saved before sp is realigned for the
	// 32-byte scratch buffer and copied back into sp before returning.
	ORIG_SP		.req	r6

	// NEON registers which contain the message words of the current block.
	// M_0-M_3 are occasionally used for other purposes too.
	//
	// M_n is the 64-bit 'd' register holding message word n.  d16-d31 are
	// the halves of q8-q15, so (M_0, M_1) is q8, (M_2, M_3) is q9, and so
	// on up to (M_14, M_15) being q15.
	M_0		.req	d16
	M_1		.req	d17
	M_2		.req	d18
	M_3		.req	d19
	M_4		.req	d20
	M_5		.req	d21
	M_6		.req	d22
	M_7		.req	d23
	M_8		.req	d24
	M_9		.req	d25
	M_10		.req	d26
	M_11		.req	d27
	M_12		.req	d28
	M_13		.req	d29
	M_14		.req	d30
	M_15		.req	d31

	.p2align	4
	// Byte-index vectors for vtbl.8.  Result byte i of a vtbl.8 lookup is
	// source byte table[i], so feeding a 64-bit lane through one of these
	// tables yields ror64(x, 24) or ror64(x, 16) respectively.  For rotation
	// amounts that are whole bytes this is the cheapest NEON sequence: it
	// ties vshr.u64 + vsli.u64 on Cortex-A53 and beats it on Cortex-A7.
	//
	// Each table is loaded with a 64-bit alignment hint, so both must stay
	// 8-byte aligned: the first via the alignment directive above, the
	// second by immediately following the first 8-byte table.
.Lror24_table:
	.byte		3, 4, 5, 6, 7, 0, 1, 2
.Lror16_table:
	.byte		2, 3, 4, 5, 6, 7, 0, 1

	// BLAKE2b initialization vector IV[0..7], one 64-bit word per line.
	// The eight words must remain contiguous: they are read as two
	// consecutive 32-byte loads starting at this label.
.Lblake2b_IV:
	.quad		0x6a09e667f3bcc908	// IV[0]
	.quad		0xbb67ae8584caa73b	// IV[1]
	.quad		0x3c6ef372fe94f82b	// IV[2]
	.quad		0xa54ff53a5f1d36f1	// IV[3]
	.quad		0x510e527fade682d1	// IV[4]
	.quad		0x9b05688c2b3e6c1f	// IV[5]
	.quad		0x1f83d9abfb41bd6b	// IV[6]
	.quad		0x5be0cd19137e2179	// IV[7]

// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
// NEON registers q0-q7.  The message block is in q8..q15 (M_0-M_15).  The stack
// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
// (M_0-M_3), so that they can be reloaded if they are used as temporary
// registers.  The macro arguments s0-s15 give the order in which the message
// words are used in this round.  'final' is 1 if this is the final round.
//
// State layout: d<i> holds v[i] for i = 0..15, i.e. q0 = v[0..1] (row a),
// q2-q3 = v[4..7] (row b), q4-q5 = v[8..11] (row c), q6-q7 = v[12..15]
// (row d).
//
// Inputs besides the state and message registers: ROR24_TABLE and
// ROR16_TABLE must point at the vtbl index tables, and sp at the spill
// buffer described above.
//
// Clobbers: M_0 is overwritten with a rotation table several times and
// q8-q9 (M_0-M_3) are used as shift temporaries.  They are reloaded from the
// stack copy before the macro ends, except when final=1, in which case q8-q9
// are left holding temporaries.
.macro	_blake2b_round	s0, s1, s2, s3, s4, s5, s6, s7, \
			s8, s9, s10, s11, s12, s13, s14, s15, final=0

	// Mix the columns:
	// (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
	// (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).
	//
	// In this half the four columns line up lane-for-lane across the rows,
	// so the row-wise operations can use full 128-bit 'q' registers.  Only
	// the message-word additions are done per 'd' register.

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s0
	vadd.u64	d1, d1, M_\s2
	vadd.u64	d2, d2, M_\s4
	vadd.u64	d3, d3, M_\s6

	// d = ror64(d ^ a, 32);
	//
	// Swapping the two 32-bit halves of each 64-bit lane (vrev64.32) is a
	// rotation by 32.
	veor		q6, q6, q0
	veor		q7, q7, q1
	vrev64.32	q6, q6
	vrev64.32	q7, q7

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 24);
	//
	// M_0 is borrowed to hold the vtbl index table; the message word it
	// held is recovered from the stack copy when next needed.
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		q2, q2, q4
	veor		q3, q3, q5
	vtbl.8		d4, {d4}, M_0
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
	//
	// M_0 got clobbered above, so we have to reload it if any of the four
	// message words this step needs happens to be M_0.  Otherwise we don't
	// need to reload it here, as it will just get clobbered again below.
.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
	vld1.8		{M_0}, [sp, :64]
.endif
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s1
	vadd.u64	d1, d1, M_\s3
	vadd.u64	d2, d2, M_\s5
	vadd.u64	d3, d3, M_\s7

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		q6, q6, q0
	veor		q7, q7, q1
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 63);
	//
	// This rotation amount isn't a multiple of 8, so it has to be
	// implemented using a pair of shifts, which requires temporary
	// registers.  Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
	//
	// vshr #63 places the top bit of each lane in bit 0, and vsli #1 then
	// inserts the lane shifted left by one above it, giving a rotate left
	// by 1, which equals a rotate right by 63.
	veor		q8, q2, q4
	veor		q9, q3, q5
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	vld1.8		{q8-q9}, [sp, :256]

	// Mix the diagonals:
	// (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
	// (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
	//
	// There are two possible ways to do this: use 'vext' instructions to
	// shift the rows of the matrix so that the diagonals become columns,
	// and undo it afterwards; or just use 64-bit operations on 'd'
	// registers instead of 128-bit operations on 'q' registers.  We use the
	// latter approach, as it performs much better on Cortex-A7.
	//
	// With d<i> = v[i], the four (a, b, c, d) register groups are therefore
	// (d0, d5, d10, d15), (d1, d6, d11, d12), (d2, d7, d8, d13) and
	// (d3, d4, d9, d14); every step below pairs registers in that pattern.

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s8
	vadd.u64	d1, d1, M_\s10
	vadd.u64	d2, d2, M_\s12
	vadd.u64	d3, d3, M_\s14

	// d = ror64(d ^ a, 32);
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vrev64.32	d15, d15
	vrev64.32	d12, d12
	vrev64.32	d13, d13
	vrev64.32	d14, d14

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		d5, d5, d10
	veor		d6, d6, d11
	veor		d7, d7, d8
	veor		d4, d4, d9
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0
	vtbl.8		d4, {d4}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
	//
	// As in the column half, M_0 currently holds the rotation table, so it
	// is reloaded only if this step actually consumes message word 0.
.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
	vld1.8		{M_0}, [sp, :64]
.endif
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s9
	vadd.u64	d1, d1, M_\s11
	vadd.u64	d2, d2, M_\s13
	vadd.u64	d3, d3, M_\s15

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 63);
	//
	// The four b ^ c values are written to d16-d19 (q8-q9) in v[4..7]
	// order, so the shift pair can again operate on whole 'q' registers
	// and leave the results directly in q2-q3.
	veor		d16, d4, d9
	veor		d17, d5, d10
	veor		d18, d6, d11
	veor		d19, d7, d8
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	// Reloading q8-q9 can be skipped on the final round.
.if ! \final
	vld1.8		{q8-q9}, [sp, :256]
.endif
.endm

//
// void blake2b_compress_neon(struct blake2b_state *state,
//			      const u8 *block, size_t nblocks, u32 inc);
//
// Compress 'nblocks' consecutive message blocks, starting at 'block', into
// 'state'.  Before each block the counter t is incremented by 'inc'.  Each
// block is consumed as four 32-byte loads (128 bytes), with BLOCK advancing
// as it goes.
//
// Only the first three fields of struct blake2b_state are used:
//	u64 h[8];	(inout)
//	u64 t[2];	(inout)
//	u64 f[2];	(in)
//
// Core registers r4-r10 are saved on entry and restored on exit; ip is used
// as a scratch pointer into the state.
//
// NOTE(review): the loop decrements and tests nblocks only after a block has
// been processed, so nblocks == 0 on entry is not handled here -- presumably
// callers guarantee nblocks >= 1; confirm against the C glue code.
//
// NOTE(review): q0-q15 are overwritten and no NEON registers are saved or
// restored in this function -- presumably the caller is responsible for
// making NEON usable (e.g. kernel_neon_begin()/kernel_neon_end()); verify.
//
	.align		5
ENTRY(blake2b_compress_neon)
	push		{r4-r10}

	// Allocate a 32-byte stack buffer that is 32-byte aligned.
	// The pre-alignment sp is kept in ORIG_SP so it can be restored exactly
	// on exit, regardless of how much the 'bic' rounded down.
	mov		ORIG_SP, sp
	sub		ip, sp, #32
	bic		ip, ip, #31
	mov		sp, ip

	adr		ROR24_TABLE, .Lror24_table
	adr		ROR16_TABLE, .Lror16_table

	// Load the chaining value into q0-q3, where it doubles as v[0..7] of
	// the first block.  The two post-incremented loads leave ip pointing
	// just past h[], i.e. at t[0].
	mov		ip, STATE
	vld1.64		{q0-q1}, [ip]!		// Load h[0..3]
	vld1.64		{q2-q3}, [ip]!		// Load h[4..7]
.Lnext_block:
	// On entry to each iteration: q0-q3 hold the current h[0..7] and ip
	// points at t[0] (true after the loads above, and after the two
	// post-incremented stores at the bottom of the loop).
	//
	// NOTE(review): the lines indented by two extra spaces appear to mark
	// instructions from a second logical stream interleaved with the main
	// one -- confirm the convention with the author.
	  adr		r10, .Lblake2b_IV
	vld1.64		{q14-q15}, [ip]		// Load t[0..1] and f[0..1]
	vld1.64		{q4-q5}, [r10]!		// Load IV[0..3]
	  vmov		r7, r8, d28		// Copy t[0] to (r7, r8)
	vld1.64		{q6-q7}, [r10]		// Load IV[4..7]
	  adds		r7, r7, INC		// Increment counter
	// Fast path: no carry out of the low 32 bits, so only the low word of
	// t[0] changes.  On carry, the slow path propagates it through the
	// remaining 96 bits; the carry flag set by 'adds' is still live there.
	bcs		.Lslow_inc_ctr
	vmov.i32	d28[0], r7
	vst1.64		{d28}, [ip]		// Update t[0]
.Linc_ctr_done:

	// Load the next message block and finish initializing the state matrix
	// 'v'.  Fortunately, there are exactly enough NEON registers to fit the
	// entire state matrix in q0-q7 and the entire message block in q8-15.
	//
	// However, _blake2b_round also needs some extra registers for rotates,
	// so we have to spill some registers.  It's better to spill the message
	// registers than the state registers, as the message doesn't change.
	// Therefore we store a copy of the first 32 bytes of the message block
	// (q8-q9) in an aligned buffer on the stack so that they can be
	// reloaded when needed.  (We could just reload directly from the
	// message buffer, but it's faster to use aligned loads.)
	//
	// q14-q15 still hold the (updated) t and f words here; both are folded
	// into q6-q7 before the last load below overwrites q14-q15 with the
	// final 32 bytes of the message block.
	vld1.8		{q8-q9}, [BLOCK]!
	  veor		q6, q6, q14	// v[12..13] = IV[4..5] ^ t[0..1]
	vld1.8		{q10-q11}, [BLOCK]!
	  veor		q7, q7, q15	// v[14..15] = IV[6..7] ^ f[0..1]
	vld1.8		{q12-q13}, [BLOCK]!
	vst1.8		{q8-q9}, [sp, :256]
	  mov		ip, STATE	// Rewind ip to h[] for the fold after the rounds
	vld1.8		{q14-q15}, [BLOCK]!

	// Execute the rounds.  Each round is provided the order in which it
	// needs to use the message words.
	//
	// Twelve rounds are executed; the last two reuse the word orders of the
	// first two.  The last one passes final=1 so that the macro skips its
	// closing reload of q8-q9, which are repurposed immediately below.
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
	_blake2b_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
	_blake2b_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
	_blake2b_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
	_blake2b_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
	_blake2b_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
	_blake2b_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
	_blake2b_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
	_blake2b_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
			final=1

	// Fold the final state matrix into the hash chaining value:
	//
	//	for (i = 0; i < 8; i++)
	//		h[i] ^= v[i] ^ v[i + 8];
	//
	// The old h[] is reloaded from memory into q8-q11, which are free now
	// that the message words are no longer needed.  The new h[] is left in
	// q0-q3 as well as stored, and the second post-incremented store leaves
	// ip pointing at t[0] again, which is what the top of the loop expects.
	  vld1.64	{q8-q9}, [ip]!		// Load old h[0..3]
	veor		q0, q0, q4		// v[0..1] ^= v[8..9]
	veor		q1, q1, q5		// v[2..3] ^= v[10..11]
	  vld1.64	{q10-q11}, [ip]		// Load old h[4..7]
	veor		q2, q2, q6		// v[4..5] ^= v[12..13]
	veor		q3, q3, q7		// v[6..7] ^= v[14..15]
	veor		q0, q0, q8		// v[0..1] ^= h[0..1]
	veor		q1, q1, q9		// v[2..3] ^= h[2..3]
	  mov		ip, STATE
	  subs		NBLOCKS, NBLOCKS, #1	// nblocks--
	  vst1.64	{q0-q1}, [ip]!		// Store new h[0..3]
	veor		q2, q2, q10		// v[4..5] ^= h[4..5]
	veor		q3, q3, q11		// v[6..7] ^= h[6..7]
	  vst1.64	{q2-q3}, [ip]!		// Store new h[4..7]

	// Advance to the next block, if there is one.
	// The flags tested here are those set by the 'subs' above; the NEON
	// instructions in between do not modify them.
	bne		.Lnext_block		// nblocks != 0?

	// Drop the aligned scratch buffer, restore the saved registers, and
	// return to the caller.
	mov		sp, ORIG_SP
	pop		{r4-r10}
	mov		pc, lr

.Lslow_inc_ctr:
	// Handle the case where the counter overflowed its low 32 bits, by
	// carrying the overflow bit into the full 128-bit counter.
	//
	// At this point r7 holds the already-incremented low word, r8 the
	// second word of t[0], and the carry flag from the 'adds' is still
	// set.  t[1] is fetched into (r9, r10), the carry is rippled through
	// r8, r9 and r10, and both counter words are rebuilt in q14 and stored.
	vmov		r9, r10, d29
	adcs		r8, r8, #0
	adcs		r9, r9, #0
	adc		r10, r10, #0
	vmov		d28, r7, r8
	vmov		d29, r9, r10
	vst1.64		{q14}, [ip]		// Update t[0] and t[1]
	b		.Linc_ctr_done
ENDPROC(blake2b_compress_neon)
Initial commit 2023-08-30 17:53:23 +02:00			`/* SPDX-License-Identifier: GPL-2.0-or-later */`
			`/*`
			`* BLAKE2b digest algorithm, NEON accelerated`
			`*`
			`* Copyright 2020 Google LLC`
			`*`
			`* Author: Eric Biggers <ebiggers@google.com>`
			`*/`

			`#include <linux/linkage.h>`

			`.text`
			`.fpu neon`

			`// The arguments to blake2b_compress_neon()`
			`STATE .req r0`
			`BLOCK .req r1`
			`NBLOCKS .req r2`
			`INC .req r3`

			`// Pointers to the rotation tables`
			`ROR24_TABLE .req r4`
			`ROR16_TABLE .req r5`

			`// The original stack pointer`
			`ORIG_SP .req r6`

			`// NEON registers which contain the message words of the current block.`
			`// M_0-M_3 are occasionally used for other purposes too.`
			`M_0 .req d16`
			`M_1 .req d17`
			`M_2 .req d18`
			`M_3 .req d19`
			`M_4 .req d20`
			`M_5 .req d21`
			`M_6 .req d22`
			`M_7 .req d23`
			`M_8 .req d24`
			`M_9 .req d25`
			`M_10 .req d26`
			`M_11 .req d27`
			`M_12 .req d28`
			`M_13 .req d29`
			`M_14 .req d30`
			`M_15 .req d31`

			`.align 4`
			`// Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8`
			`// instruction. This is the most efficient way to implement these`
			`// rotation amounts with NEON. (On Cortex-A53 it's the same speed as`
			`// vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)`
			`.Lror24_table:`
			`.byte 3, 4, 5, 6, 7, 0, 1, 2`
			`.Lror16_table:`
			`.byte 2, 3, 4, 5, 6, 7, 0, 1`
			`// The BLAKE2b initialization vector`
			`.Lblake2b_IV:`
			`.quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b`
			`.quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1`
			`.quad 0x510e527fade682d1, 0x9b05688c2b3e6c1f`
			`.quad 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179`

			`// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the`
			`// NEON registers q0-q7. The message block is in q8..q15 (M_0-M_15). The stack`
			`// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9`
			`// (M_0-M_3), so that they can be reloaded if they are used as temporary`
			`// registers. The macro arguments s0-s15 give the order in which the message`
			`// words are used in this round. 'final' is 1 if this is the final round.`
			`.macro _blake2b_round s0, s1, s2, s3, s4, s5, s6, s7, \`
			`s8, s9, s10, s11, s12, s13, s14, s15, final=0`

			`// Mix the columns:`
			`// (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),`
			`// (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).`

			`// a += b + m[blake2b_sigma[r][2*i + 0]];`
			`vadd.u64 q0, q0, q2`
			`vadd.u64 q1, q1, q3`
			`vadd.u64 d0, d0, M_\s0`
			`vadd.u64 d1, d1, M_\s2`
			`vadd.u64 d2, d2, M_\s4`
			`vadd.u64 d3, d3, M_\s6`

			`// d = ror64(d ^ a, 32);`
			`veor q6, q6, q0`
			`veor q7, q7, q1`
			`vrev64.32 q6, q6`
			`vrev64.32 q7, q7`

			`// c += d;`
			`vadd.u64 q4, q4, q6`
			`vadd.u64 q5, q5, q7`

			`// b = ror64(b ^ c, 24);`
			`vld1.8 {M_0}, [ROR24_TABLE, :64]`
			`veor q2, q2, q4`
			`veor q3, q3, q5`
			`vtbl.8 d4, {d4}, M_0`
			`vtbl.8 d5, {d5}, M_0`
			`vtbl.8 d6, {d6}, M_0`
			`vtbl.8 d7, {d7}, M_0`

			`// a += b + m[blake2b_sigma[r][2*i + 1]];`
			`//`
			`// M_0 got clobbered above, so we have to reload it if any of the four`
			`// message words this step needs happens to be M_0. Otherwise we don't`
			`// need to reload it here, as it will just get clobbered again below.`
			`.if \s1 == 0 \|\| \s3 == 0 \|\| \s5 == 0 \|\| \s7 == 0`
			`vld1.8 {M_0}, [sp, :64]`
			`.endif`
			`vadd.u64 q0, q0, q2`
			`vadd.u64 q1, q1, q3`
			`vadd.u64 d0, d0, M_\s1`
			`vadd.u64 d1, d1, M_\s3`
			`vadd.u64 d2, d2, M_\s5`
			`vadd.u64 d3, d3, M_\s7`

			`// d = ror64(d ^ a, 16);`
			`vld1.8 {M_0}, [ROR16_TABLE, :64]`
			`veor q6, q6, q0`
			`veor q7, q7, q1`
			`vtbl.8 d12, {d12}, M_0`
			`vtbl.8 d13, {d13}, M_0`
			`vtbl.8 d14, {d14}, M_0`
			`vtbl.8 d15, {d15}, M_0`

			`// c += d;`
			`vadd.u64 q4, q4, q6`
			`vadd.u64 q5, q5, q7`

			`// b = ror64(b ^ c, 63);`
			`//`
			`// This rotation amount isn't a multiple of 8, so it has to be`
			`// implemented using a pair of shifts, which requires temporary`
			`// registers. Use q8-q9 (M_0-M_3) for this, and reload them afterwards.`
			`veor q8, q2, q4`
			`veor q9, q3, q5`
			`vshr.u64 q2, q8, #63`
			`vshr.u64 q3, q9, #63`
			`vsli.u64 q2, q8, #1`
			`vsli.u64 q3, q9, #1`
			`vld1.8 {q8-q9}, [sp, :256]`

			`// Mix the diagonals:`
			`// (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),`
			`// (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).`
			`//`
			`// There are two possible ways to do this: use 'vext' instructions to`
			`// shift the rows of the matrix so that the diagonals become columns,`
			`// and undo it afterwards; or just use 64-bit operations on 'd'`
			`// registers instead of 128-bit operations on 'q' registers. We use the`
			`// latter approach, as it performs much better on Cortex-A7.`

			`// a += b + m[blake2b_sigma[r][2*i + 0]];`
			`vadd.u64 d0, d0, d5`
			`vadd.u64 d1, d1, d6`
			`vadd.u64 d2, d2, d7`
			`vadd.u64 d3, d3, d4`
			`vadd.u64 d0, d0, M_\s8`
			`vadd.u64 d1, d1, M_\s10`
			`vadd.u64 d2, d2, M_\s12`
			`vadd.u64 d3, d3, M_\s14`

			`// d = ror64(d ^ a, 32);`
			`veor d15, d15, d0`
			`veor d12, d12, d1`
			`veor d13, d13, d2`
			`veor d14, d14, d3`
			`vrev64.32 d15, d15`
			`vrev64.32 d12, d12`
			`vrev64.32 d13, d13`
			`vrev64.32 d14, d14`

			`// c += d;`
			`vadd.u64 d10, d10, d15`
			`vadd.u64 d11, d11, d12`
			`vadd.u64 d8, d8, d13`
			`vadd.u64 d9, d9, d14`

			`// b = ror64(b ^ c, 24);`
			`vld1.8 {M_0}, [ROR24_TABLE, :64]`
			`veor d5, d5, d10`
			`veor d6, d6, d11`
			`veor d7, d7, d8`
			`veor d4, d4, d9`
			`vtbl.8 d5, {d5}, M_0`
			`vtbl.8 d6, {d6}, M_0`
			`vtbl.8 d7, {d7}, M_0`
			`vtbl.8 d4, {d4}, M_0`

			`// a += b + m[blake2b_sigma[r][2*i + 1]];`
			`.if \s9 == 0 \|\| \s11 == 0 \|\| \s13 == 0 \|\| \s15 == 0`
			`vld1.8 {M_0}, [sp, :64]`
			`.endif`
			`vadd.u64 d0, d0, d5`
			`vadd.u64 d1, d1, d6`
			`vadd.u64 d2, d2, d7`
			`vadd.u64 d3, d3, d4`
			`vadd.u64 d0, d0, M_\s9`
			`vadd.u64 d1, d1, M_\s11`
			`vadd.u64 d2, d2, M_\s13`
			`vadd.u64 d3, d3, M_\s15`

			`// d = ror64(d ^ a, 16);`
			`vld1.8 {M_0}, [ROR16_TABLE, :64]`
			`veor d15, d15, d0`
			`veor d12, d12, d1`
			`veor d13, d13, d2`
			`veor d14, d14, d3`
			`vtbl.8 d12, {d12}, M_0`
			`vtbl.8 d13, {d13}, M_0`
			`vtbl.8 d14, {d14}, M_0`
			`vtbl.8 d15, {d15}, M_0`

			`// c += d;`
			`vadd.u64 d10, d10, d15`
			`vadd.u64 d11, d11, d12`
			`vadd.u64 d8, d8, d13`
			`vadd.u64 d9, d9, d14`

			`// b = ror64(b ^ c, 63);`
			`veor d16, d4, d9`
			`veor d17, d5, d10`
			`veor d18, d6, d11`
			`veor d19, d7, d8`
			`vshr.u64 q2, q8, #63`
			`vshr.u64 q3, q9, #63`
			`vsli.u64 q2, q8, #1`
			`vsli.u64 q3, q9, #1`
			`// Reloading q8-q9 can be skipped on the final round.`
			`.if ! \final`
			`vld1.8 {q8-q9}, [sp, :256]`
			`.endif`
			`.endm`

			`//`
			`// void blake2b_compress_neon(struct blake2b_state *state,`
			`// const u8 *block, size_t nblocks, u32 inc);`
			`//`
			`// Only the first three fields of struct blake2b_state are used:`
			`// u64 h[8]; (inout)`
			`// u64 t[2]; (inout)`
			`// u64 f[2]; (in)`
			`//`
			`.align 5`
			`ENTRY(blake2b_compress_neon)`
			`push {r4-r10}`

			`// Allocate a 32-byte stack buffer that is 32-byte aligned.`
			`mov ORIG_SP, sp`
			`sub ip, sp, #32`
			`bic ip, ip, #31`
			`mov sp, ip`

			`adr ROR24_TABLE, .Lror24_table`
			`adr ROR16_TABLE, .Lror16_table`

			`mov ip, STATE`
			`vld1.64 {q0-q1}, [ip]! // Load h[0..3]`
			`vld1.64 {q2-q3}, [ip]! // Load h[4..7]`
			`.Lnext_block:`
			`adr r10, .Lblake2b_IV`
			`vld1.64 {q14-q15}, [ip] // Load t[0..1] and f[0..1]`
			`vld1.64 {q4-q5}, [r10]! // Load IV[0..3]`
			`vmov r7, r8, d28 // Copy t[0] to (r7, r8)`
			`vld1.64 {q6-q7}, [r10] // Load IV[4..7]`
			`adds r7, r7, INC // Increment counter`
			`bcs .Lslow_inc_ctr`
			`vmov.i32 d28[0], r7`
			`vst1.64 {d28}, [ip] // Update t[0]`
			`.Linc_ctr_done:`

			`// Load the next message block and finish initializing the state matrix`
			`// 'v'. Fortunately, there are exactly enough NEON registers to fit the`
			`// entire state matrix in q0-q7 and the entire message block in q8-15.`
			`//`
			`// However, _blake2b_round also needs some extra registers for rotates,`
			`// so we have to spill some registers. It's better to spill the message`
			`// registers than the state registers, as the message doesn't change.`
			`// Therefore we store a copy of the first 32 bytes of the message block`
			`// (q8-q9) in an aligned buffer on the stack so that they can be`
			`// reloaded when needed. (We could just reload directly from the`
			`// message buffer, but it's faster to use aligned loads.)`
			`vld1.8 {q8-q9}, [BLOCK]!`
			`veor q6, q6, q14 // v[12..13] = IV[4..5] ^ t[0..1]`
			`vld1.8 {q10-q11}, [BLOCK]!`
			`veor q7, q7, q15 // v[14..15] = IV[6..7] ^ f[0..1]`
			`vld1.8 {q12-q13}, [BLOCK]!`
			`vst1.8 {q8-q9}, [sp, :256]`
			`mov ip, STATE`
			`vld1.8 {q14-q15}, [BLOCK]!`

			`// Execute the rounds. Each round is provided the order in which it`
			`// needs to use the message words.`
			`_blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15`
			`_blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3`
			`_blake2b_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4`
			`_blake2b_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8`
			`_blake2b_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13`
			`_blake2b_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9`
			`_blake2b_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11`
			`_blake2b_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10`
			`_blake2b_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5`
			`_blake2b_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0`
			`_blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15`
			`_blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \`
			`final=1`

			`// Fold the final state matrix into the hash chaining value:`
			`//`
			`// for (i = 0; i < 8; i++)`
			`// h[i] ^= v[i] ^ v[i + 8];`
			`//`
			`vld1.64 {q8-q9}, [ip]! // Load old h[0..3]`
			`veor q0, q0, q4 // v[0..1] ^= v[8..9]`
			`veor q1, q1, q5 // v[2..3] ^= v[10..11]`
			`vld1.64 {q10-q11}, [ip] // Load old h[4..7]`
			`veor q2, q2, q6 // v[4..5] ^= v[12..13]`
			`veor q3, q3, q7 // v[6..7] ^= v[14..15]`
			`veor q0, q0, q8 // v[0..1] ^= h[0..1]`
			`veor q1, q1, q9 // v[2..3] ^= h[2..3]`
			`mov ip, STATE`
			`subs NBLOCKS, NBLOCKS, #1 // nblocks--`
			`vst1.64 {q0-q1}, [ip]! // Store new h[0..3]`
			`veor q2, q2, q10 // v[4..5] ^= h[4..5]`
			`veor q3, q3, q11 // v[6..7] ^= h[6..7]`
			`vst1.64 {q2-q3}, [ip]! // Store new h[4..7]`

			`// Advance to the next block, if there is one.`
			`bne .Lnext_block // nblocks != 0?`

			`mov sp, ORIG_SP`
			`pop {r4-r10}`
			`mov pc, lr`

			`.Lslow_inc_ctr:`
			`// Handle the case where the counter overflowed its low 32 bits, by`
			`// carrying the overflow bit into the full 128-bit counter.`
			`vmov r9, r10, d29`
			`adcs r8, r8, #0`
			`adcs r9, r9, #0`
			`adc r10, r10, #0`
			`vmov d28, r7, r8`
			`vmov d29, r9, r10`
			`vst1.64 {q14}, [ip] // Update t[0] and t[1]`
			`b .Linc_ctr_done`
			`ENDPROC(blake2b_compress_neon)`