ANDROID: arm64/crypto: add ARMv8-CE optimized poly_hash algorithm
poly_hash is part of the HEH (Hash-Encrypt-Hash) encryption mode, proposed in Internet Draft https://tools.ietf.org/html/draft-cope-heh-01. poly_hash is very similar to GHASH; besides the swapping of the last two coefficients which we opted to handle in the HEH template, poly_hash just uses a different finite field representation. As with GHASH, poly_hash becomes much faster and more secure against timing attacks when implemented using carryless multiplication instructions instead of tables. This patch adds an ARMv8-CE optimized version of poly_hash, based roughly on the existing ARMv8-CE optimized version of GHASH. Benchmark results are shown below, but note that the resistance to timing attacks may be even more important than the performance gain. poly_hash only: poly_hash-generic: 1,000,000 setkey() takes 1185 ms hashing is 328 MB/s poly_hash-ce: 1,000,000 setkey() takes 8 ms hashing is 1756 MB/s heh(aes) with 4096-byte inputs (this is the ideal case, as the improvement is less significant with smaller inputs): encryption with "heh_base(cmac(aes-ce),poly_hash-generic,ecb-aes-ce)": 118 MB/s decryption with "heh_base(cmac(aes-ce),poly_hash-generic,ecb-aes-ce)": 120 MB/s encryption with "heh_base(cmac(aes-ce),poly_hash-ce,ecb-aes-ce)": 291 MB/s decryption with "heh_base(cmac(aes-ce),poly_hash-ce,ecb-aes-ce)": 293 MB/s Bug: 32508661 Signed-off-by: Eric Biggers <ebiggers@google.com> Change-Id: I621ec0e1115df7e6f5cbd7e864a4a9d8d2e94cf2
This commit is contained in:
parent
58b9edb065
commit
0223de3a24
5 changed files with 338 additions and 0 deletions
|
@ -23,6 +23,11 @@ config CRYPTO_GHASH_ARM64_CE
|
|||
depends on ARM64 && KERNEL_MODE_NEON
|
||||
select CRYPTO_HASH
|
||||
|
||||
config CRYPTO_POLY_HASH_ARM64_CE
|
||||
tristate "poly_hash (for HEH encryption mode) using ARMv8 Crypto Extensions"
|
||||
depends on ARM64 && KERNEL_MODE_NEON
|
||||
select CRYPTO_HASH
|
||||
|
||||
config CRYPTO_AES_ARM64_CE
|
||||
tristate "AES core cipher using ARMv8 Crypto Extensions"
|
||||
depends on ARM64 && KERNEL_MODE_NEON
|
||||
|
|
|
@ -17,6 +17,9 @@ sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o
|
|||
obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
|
||||
ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
|
||||
|
||||
obj-$(CONFIG_CRYPTO_POLY_HASH_ARM64_CE) += poly-hash-ce.o
|
||||
poly-hash-ce-y := poly-hash-ce-glue.o poly-hash-ce-core.o
|
||||
|
||||
obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
|
||||
CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
|
||||
|
||||
|
|
163
arch/arm64/crypto/poly-hash-ce-core.S
Normal file
163
arch/arm64/crypto/poly-hash-ce-core.S
Normal file
|
@ -0,0 +1,163 @@
|
|||
/*
|
||||
* Accelerated poly_hash implementation with ARMv8 PMULL instructions.
|
||||
*
|
||||
* Based on ghash-ce-core.S.
|
||||
*
|
||||
* Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
|
||||
* Copyright (C) 2017 Google, Inc. <ebiggers@google.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published
|
||||
* by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
#include <asm/assembler.h>

	/* Register aliases used throughout pmull_poly_hash_update. */
	KEY	.req	v0	/* the 128-bit hash key */
	KEY2	.req	v1	/* Karatsuba-folded copy of the key */
	T1	.req	v2	/* scratch */
	T2	.req	v3	/* scratch */
	GSTAR	.req	v4	/* g*(x) constant, replicated in both halves */
	XL	.req	v5	/* low product half / running digest */
	XM	.req	v6	/* middle product half */
	XH	.req	v7	/* high product half */

	.text
	.arch		armv8-a+crypto

	/* 16-byte aligned (2**4 = 16); not required, but might as well */
	.align		4
.Lgstar:
	.quad		0x87, 0x87

	/*
	 * void pmull_poly_hash_update(le128 *digest, const le128 *key,
	 *			       const u8 *src, unsigned int blocks,
	 *			       unsigned int partial);
	 *
	 * Argument registers: x0 = digest, x1 = key, x2 = src,
	 * w3 = blocks, w4 = partial.
	 */
ENTRY(pmull_poly_hash_update)

	/* Load digest into XL */
	ld1		{XL.16b}, [x0]

	/* Load key into KEY */
	ld1		{KEY.16b}, [x1]

	/* Load g*(x) = g(x) + x^128 = x^7 + x^2 + x + 1 into both halves of
	 * GSTAR */
	adr		x1, .Lgstar
	ld1		{GSTAR.2d}, [x1]

	/* Set KEY2 to (KEY[1]+KEY[0]):(KEY[1]+KEY[0]).  This is needed for
	 * Karatsuba multiplication. */
	ext		KEY2.16b, KEY.16b, KEY.16b, #8
	eor		KEY2.16b, KEY2.16b, KEY.16b

	/* If 'partial' is nonzero, then we're finishing a pending block and
	 * should go right to the multiplication. */
	cbnz		w4, 1f

0:
	/* Add the next block from 'src' to the digest */
	ld1		{T1.16b}, [x2], #16
	eor		XL.16b, XL.16b, T1.16b
	sub		w3, w3, #1

1:
	/*
	 * Multiply the current 128-bit digest (a1:a0, in XL) by the 128-bit key
	 * (b1:b0, in KEY) using Karatsuba multiplication.
	 */

	/* T1 = (a1+a0):(a1+a0) */
	ext		T1.16b, XL.16b, XL.16b, #8
	eor		T1.16b, T1.16b, XL.16b

	/* XH = a1 * b1 */
	pmull2		XH.1q, XL.2d, KEY.2d

	/* XL = a0 * b0 */
	pmull		XL.1q, XL.1d, KEY.1d

	/* XM = (a1+a0) * (b1+b0) */
	pmull		XM.1q, T1.1d, KEY2.1d

	/* XM += (XH[0]:XL[1]) + XL + XH */
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, XL.16b, XH.16b
	eor		XM.16b, XM.16b, T1.16b
	eor		XM.16b, XM.16b, T2.16b

	/*
	 * Now the 256-bit product is in XH[1]:XM:XL[0].  It represents a
	 * polynomial over GF(2) with degree as large as 255.  We need to
	 * compute its remainder modulo g(x) = x^128+x^7+x^2+x+1.  For this it
	 * is sufficient to compute the remainder of the high half 'c(x)x^128'
	 * and add it to the low half.  To reduce the high half we use the
	 * Barrett reduction method.  The basic idea is that we can express the
	 * remainder p(x) as g(x)q(x) mod x^128, where q(x) = (c(x)x^128)/g(x).
	 * As detailed in [1], to avoid having to divide by g(x) at runtime the
	 * following equivalent expression can be derived:
	 *
	 *	p(x) = [ g*(x)((c(x)q+(x))/x^128) ] mod x^128
	 *
	 * where g*(x) = x^128+g(x) = x^7+x^2+x+1, and q+(x) = x^256/g(x) = g(x)
	 * in this case.  This is also equivalent to:
	 *
	 *	p(x) = [ g*(x)((c(x)(x^128 + g*(x)))/x^128) ] mod x^128
	 *	     = [ g*(x)(c(x) + (c(x)g*(x))/x^128) ] mod x^128
	 *
	 * Since deg g*(x) < 64:
	 *
	 *	p(x) = [ g*(x)(c(x) + ((c(x)/x^64)g*(x))/x^64) ] mod x^128
	 *	     = [ g*(x)((c(x)/x^64)x^64 + (c(x) mod x^64) +
	 *			((c(x)/x^64)g*(x))/x^64) ] mod x^128
	 *
	 * Letting t(x) = g*(x)(c(x)/x^64):
	 *
	 *	p(x) = [ t(x)x^64 + g*(x)((c(x) mod x^64) + t(x)/x^64) ] mod x^128
	 *
	 * Therefore, to do the reduction we only need to issue two 64-bit =>
	 * 128-bit carryless multiplications: g*(x) times c(x)/x^64, and g*(x)
	 * times ((c(x) mod x^64) + t(x)/x^64).  (Multiplication by x^64 doesn't
	 * count since it is simply a shift or move.)
	 *
	 * An alternate reduction method, also based on Barrett reduction and
	 * described in [1], uses only shifts and XORs --- no multiplications.
	 * However, the method with multiplications requires fewer instructions
	 * and is faster on processors with fast carryless multiplication.
	 *
	 * [1] "Intel Carry-Less Multiplication Instruction and its Usage for
	 *     Computing the GCM Mode",
	 *     https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
	 */

	/* 256-bit product is XH[1]:XM:XL[0], so c(x) is XH[1]:XM[1] */

	/* T1 = t(x) = g*(x)(c(x)/x^64) */
	pmull2		T1.1q, GSTAR.2d, XH.2d

	/* T2 = g*(x)((c(x) mod x^64) + t(x)/x^64) */
	eor		T2.16b, XM.16b, T1.16b
	pmull2		T2.1q, GSTAR.2d, T2.2d

	/* Make XL[0] be the low half of the 128-bit result by adding the low 64
	 * bits of the T2 term to what was already there.  The 't(x)x^64' term
	 * makes no difference, so skip it. */
	eor		XL.16b, XL.16b, T2.16b

	/* Make XL[1] be the high half of the 128-bit result by adding the high
	 * 64 bits of the 't(x)x^64' and T2 terms to what was already in XM[0],
	 * then moving XM[0] to XL[1]. */
	eor		XM.16b, XM.16b, T1.16b
	ext		T2.16b, T2.16b, T2.16b, #8
	eor		XM.16b, XM.16b, T2.16b
	mov		XL.d[1], XM.d[0]

	/* If more blocks remain, then loop back to process the next block;
	 * else, store the digest and return. */
	cbnz		w3, 0b
	st1		{XL.16b}, [x0]
	ret
ENDPROC(pmull_poly_hash_update)
|
166
arch/arm64/crypto/poly-hash-ce-glue.c
Normal file
166
arch/arm64/crypto/poly-hash-ce-glue.c
Normal file
|
@ -0,0 +1,166 @@
|
|||
/*
|
||||
* Accelerated poly_hash implementation with ARMv8 PMULL instructions.
|
||||
*
|
||||
* Based on ghash-ce-glue.c.
|
||||
*
|
||||
* poly_hash is part of the HEH (Hash-Encrypt-Hash) encryption mode, proposed in
|
||||
* Internet Draft https://tools.ietf.org/html/draft-cope-heh-01.
|
||||
*
|
||||
* poly_hash is very similar to GHASH: both algorithms are keyed hashes which
|
||||
* interpret their input data as coefficients of a polynomial over GF(2^128),
|
||||
* then calculate a hash value by evaluating that polynomial at the point given
|
||||
* by the key, e.g. using Horner's rule. The difference is that poly_hash uses
|
||||
* the more natural "ble" convention to represent GF(2^128) elements, whereas
|
||||
* GHASH uses the less natural "lle" convention (see include/crypto/gf128mul.h).
|
||||
* The ble convention makes it simpler to implement GF(2^128) multiplication.
|
||||
*
|
||||
* Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
|
||||
* Copyright (C) 2017 Google Inc. <ebiggers@google.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published
|
||||
* by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <asm/neon.h>
|
||||
#include <crypto/b128ops.h>
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <linux/cpufeature.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
/*
|
||||
* Note: in this algorithm we currently use 'le128' to represent GF(2^128)
|
||||
* elements, even though poly_hash-generic uses 'be128'. Both types are
|
||||
* actually "wrong" because the elements are actually in 'ble' format, and there
|
||||
* should be a ble type to represent this --- as well as lle, bbe, and lbe types
|
||||
* for the other conventions for representing GF(2^128) elements. But
|
||||
* practically it doesn't matter which type we choose here, so we just use le128
|
||||
* since it's arguably more accurate, while poly_hash-generic still has to use
|
||||
* be128 because the generic GF(2^128) multiplication functions all take be128.
|
||||
*/
|
||||
|
||||
/*
 * Per-request hash state: the running GF(2^128) digest plus the total
 * number of bytes fed in so far.  'count % 16' tells how many bytes of a
 * partial block are currently buffered (XOR'ed) into 'digest'.
 */
struct poly_hash_desc_ctx {
	le128 digest;		/* running digest; also buffers partial block */
	unsigned int count;	/* total bytes processed */
};

/*
 * Assembly routine (poly-hash-ce-core.S).  Adds 'blocks' full 16-byte blocks
 * from 'src' into 'digest', multiplying by 'key' after each block
 * (Horner's rule).  If 'partial' is nonzero, a pending partially-filled
 * block is already XOR'ed into 'digest' and is multiplied by 'key' first.
 */
asmlinkage void pmull_poly_hash_update(le128 *digest, const le128 *key,
				       const u8 *src, unsigned int blocks,
				       unsigned int partial);
|
||||
|
||||
static int poly_hash_setkey(struct crypto_shash *tfm,
|
||||
const u8 *key, unsigned int keylen)
|
||||
{
|
||||
if (keylen != sizeof(le128)) {
|
||||
crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
memcpy(crypto_shash_ctx(tfm), key, sizeof(le128));
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int poly_hash_init(struct shash_desc *desc)
|
||||
{
|
||||
struct poly_hash_desc_ctx *ctx = shash_desc_ctx(desc);
|
||||
|
||||
ctx->digest = (le128) { 0 };
|
||||
ctx->count = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Feed 'len' bytes from 'src' into the hash.  Bytes of an incomplete block
 * are buffered by XOR-ing them into 'digest' in place; once a full 16-byte
 * block is available, the NEON/PMULL assembly routine multiplies the digest
 * by the key and absorbs all remaining full blocks.
 */
static int poly_hash_update(struct shash_desc *desc, const u8 *src,
			    unsigned int len)
{
	struct poly_hash_desc_ctx *ctx = shash_desc_ctx(desc);
	unsigned int partial = ctx->count % sizeof(le128);
	/* Next free byte inside the partial block buffered in 'digest'. */
	u8 *dst = (u8 *)&ctx->digest + partial;

	ctx->count += len;

	/* Finishing at least one block? */
	if (partial + len >= sizeof(le128)) {
		const le128 *key = crypto_shash_ctx(desc->tfm);

		if (partial) {
			/* Finish the pending block. */
			unsigned int n = sizeof(le128) - partial;

			len -= n;
			do {
				*dst++ ^= *src++;
			} while (--n);
		}

		/*
		 * Do the real work.  If 'partial' is nonzero, this starts by
		 * multiplying 'digest' by 'key'.  Then for each additional full
		 * block it adds the block to 'digest' and multiplies by 'key'.
		 */
		kernel_neon_begin_partial(8);
		pmull_poly_hash_update(&ctx->digest, key, src,
				       len / sizeof(le128), partial);
		kernel_neon_end();

		/* Skip past the full blocks just consumed; restart buffering
		 * at the beginning of 'digest'. */
		src += len - (len % sizeof(le128));
		len %= sizeof(le128);
		dst = (u8 *)&ctx->digest;
	}

	/* Continue adding the next block to 'digest'. */
	while (len--)
		*dst++ ^= *src++;
	return 0;
}
|
||||
|
||||
/*
 * Produce the final 16-byte digest.  If a partial block is still buffered
 * in 'digest', it needs one last multiplication by the key before the
 * value is copied out.
 */
static int poly_hash_final(struct shash_desc *desc, u8 *out)
{
	struct poly_hash_desc_ctx *ctx = shash_desc_ctx(desc);
	unsigned int pending = ctx->count % sizeof(le128);

	if (pending != 0) {
		/* Finish the last (partial) block: multiply digest by key. */
		const le128 *key = crypto_shash_ctx(desc->tfm);

		kernel_neon_begin_partial(8);
		pmull_poly_hash_update(&ctx->digest, key, NULL, 0, pending);
		kernel_neon_end();
	}

	memcpy(out, &ctx->digest, sizeof(le128));
	return 0;
}
|
||||
|
||||
/*
 * shash algorithm definition for the PMULL-accelerated poly_hash.
 * cra_priority 300 is intended to win over the generic C implementation --
 * NOTE(review): confirm the generic driver's priority is lower.
 */
static struct shash_alg poly_hash_alg = {
	.digestsize	= sizeof(le128),	/* 16-byte digest */
	.init		= poly_hash_init,
	.update		= poly_hash_update,
	.final		= poly_hash_final,
	.setkey		= poly_hash_setkey,
	.descsize	= sizeof(struct poly_hash_desc_ctx),
	.base		= {
		.cra_name		= "poly_hash",
		.cra_driver_name	= "poly_hash-ce",
		.cra_priority		= 300,
		.cra_ctxsize		= sizeof(le128),	/* tfm ctx = key only */
		.cra_module		= THIS_MODULE,
	},
};
|
||||
|
||||
/* Register the accelerated poly_hash implementation with the crypto API. */
static int __init poly_hash_ce_mod_init(void)
{
	return crypto_register_shash(&poly_hash_alg);
}

/* Unregister on module unload. */
static void __exit poly_hash_ce_mod_exit(void)
{
	crypto_unregister_shash(&poly_hash_alg);
}

MODULE_DESCRIPTION("Polynomial evaluation hash using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
MODULE_LICENSE("GPL v2");

/* Only load on CPUs that advertise the PMULL (carryless multiply) feature. */
module_cpu_feature_match(PMULL, poly_hash_ce_mod_init);
module_exit(poly_hash_ce_mod_exit);
|
|
@ -295,6 +295,7 @@ config CRYPTO_HEH
|
|||
select CRYPTO_ECB
|
||||
select CRYPTO_GF128MUL
|
||||
select CRYPTO_MANAGER
|
||||
select CRYPTO_POLY_HASH_ARM64_CE if ARM64 && KERNEL_MODE_NEON
|
||||
help
|
||||
HEH: Hash-Encrypt-Hash mode
|
||||
HEH is a proposed block cipher mode of operation which extends the
|
||||
|
|
Loading…
Add table
Reference in a new issue