Index: conf/files.amd64
===================================================================
--- conf/files.amd64	(revision 240127)
+++ conf/files.amd64	(working copy)
@@ -93,6 +93,11 @@
 	no-obj no-implicit-rule before-depend \
 	clean		"acpi_wakedata.h"
 #
+aes_encdec.o			optional	aesni \
+	dependency	"$S/crypto/aesni/aes_encdec.c" \
+	compile-with	"clang ${CFLAGS:N-mno-sse} -msse -c ${.IMPSRC}" \
+	no-implicit-rule clean	"aes_encdec.o"
+#
 amd64/amd64/amd64_mem.c		optional	mem
 #amd64/amd64/apic_vector.S	standard
 amd64/amd64/atomic.c		standard
@@ -131,7 +136,7 @@
 amd64/amd64/vm_machdep.c	standard
 amd64/pci/pci_cfgreg.c		optional	pci
 cddl/contrib/opensolaris/common/atomic/amd64/opensolaris_atomic.S	optional zfs compile-with "${ZFS_S}"
-crypto/aesni/aesencdec_amd64.S	optional aesni
+#crypto/aesni/aesencdec_amd64.S	optional aesni
 crypto/aesni/aeskeys_amd64.S	optional aesni
 crypto/aesni/aesni.c		optional aesni
 crypto/aesni/aesni_wrap.c	optional aesni
Index: crypto/aesni/aesni.h
===================================================================
--- crypto/aesni/aesni.h	(revision 240127)
+++ crypto/aesni/aesni.h	(working copy)
@@ -94,6 +94,15 @@
 void aesni_decrypt_ecb(int rounds, const void *key_schedule, size_t len,
     const uint8_t from[AES_BLOCK_LEN], uint8_t to[AES_BLOCK_LEN]);
 
+#ifdef __amd64__
+void aesni_encrypt_xts(int rounds, const void *data_schedule,
+    const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
+    const uint8_t iv[AES_BLOCK_LEN]);
+void aesni_decrypt_xts(int rounds, const void *data_schedule,
+    const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
+    const uint8_t iv[AES_BLOCK_LEN]);
+#endif
+
 int aesni_cipher_setup(struct aesni_session *ses,
     struct cryptoini *encini);
 int aesni_cipher_process(struct aesni_session *ses,
Index: crypto/aesni/aes_encdec.c
===================================================================
--- crypto/aesni/aes_encdec.c	(revision 0)
+++ crypto/aesni/aes_encdec.c	(working copy)
@@ -0,0 +1,311 @@
+/*-
+ * Copyright (c) 2012 Shane Nievera Tablizo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
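+
+/*
+ * Single-block, CBC, and XTS AES-NI routines written with the
+ * compiler's 128-bit vector types and inline asm instead of a
+ * hand-written .S file.  CBC decryption and XTS process six blocks
+ * at a time so the aesenc/aesdec latency of one block's chain
+ * overlaps with the other five.
+ */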
+
+#include <sys/types.h>
+#ifdef _KERNEL
+#include <crypto/aesni/aesni.h>
+#endif
+
+#ifndef _KERNEL
+#include "aes_encdec.h"
+#endif
+
+typedef uint64_t dquad __attribute__((__vector_size__(16)));
+typedef int q_int __attribute__((__vector_size__(16)));
+
+#define V4ZERO (q_int){ 0, 0, 0, 0 }
+
+#define DEFASM(mn, out_t, in_t, in_spec) \
+	static inline out_t __attribute__((always_inline)) \
+	mn ## _asm(out_t __a, in_t __b) \
+	{ \
+		__asm__(#mn " %1, %0" : "+&x" (__a) : in_spec (__b)); \
+		return (__a); \
+	}
+
+#define SW_ROUNDS(rounds, f) \
+	switch (rounds) { \
+	case AES128_ROUNDS: rounds = AES128_ROUNDS; f; break; \
+	case AES192_ROUNDS: rounds = AES192_ROUNDS; f; break; \
+	case AES256_ROUNDS: rounds = AES256_ROUNDS; f; break; \
+	}
+
+DEFASM(psrad, dquad, int, "I")
+DEFASM(aesenc, dquad, dquad, "x")
+DEFASM(aesdec, dquad, dquad, "x")
+DEFASM(aesenclast, dquad, dquad, "x")
+DEFASM(aesdeclast, dquad, dquad, "x")
+
+static inline dquad __attribute__((always_inline))
+loadu_asm(const dquad *restrict p)
+{
+	dquad z;
+
+	__asm__("movdqu %1, %0" : "=x" (z) : "m" (*p));
+
+	return (z);
+}
+
+#define loadu(x) loadu_asm((x))
+#define psrad(i,x) psrad_asm((x), (i))
+#define aesenc aesenc_asm
+#define aesdec aesdec_asm
+#define aesdeclast aesdeclast_asm
+#define aesenclast aesenclast_asm
+
+static inline dquad __attribute__((always_inline))
+movdqu(const uint8_t *restrict p)
+{
+
+	return (loadu((dquad*)p));
+}
+
+static inline dquad __attribute__((always_inline))
+encdec(int dir, int rounds, const dquad *restrict sched, dquad t)
+{
+
+	t ^= *sched;
+
+	for (; --rounds;)
+		t = (dir ? aesenc : aesdec)(t, *++sched);
+
+	return (dir ? aesenclast : aesdeclast)(t, *++sched);
+}
+#define CRYPT(x) encdec(dir, rounds, (dquad*)sched, (x))
+#define DEC(x) encdec(0, rounds, (dquad*)sched, (x))
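+
+/*
+ * For reference, DEFASM(aesenc, dquad, dquad, "x") above expands to
+ *
+ *	static inline dquad __attribute__((always_inline))
+ *	aesenc_asm(dquad __a, dquad __b)
+ *	{
+ *		__asm__("aesenc %1, %0" : "+&x" (__a) : "x" (__b));
+ *		return (__a);
+ *	}
+ *
+ * SW_ROUNDS re-assigns `rounds' to the matching constant inside each
+ * case, so the compiler sees a compile-time round count within f and
+ * can fully unroll encdec()'s round loop for each key size.
+ */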
+
+static inline void __attribute__((always_inline))
+decrypt_cbc_X6(int rounds, const dquad *restrict sched, size_t n,
+    dquad *restrict src, dquad t0)
+/* TODO: Pipeline this without explicit vars. */
+{
+	dquad t1, t2, t3, t4, t5, t6;
+	dquad da, db, dc, dd, de, df;
+	size_t m;
+
+	if (0 < (m = n / 6)) {
+		da = t1 = src[0]; db = t2 = src[1]; dc = t3 = src[2];
+		dd = t4 = src[3]; de = t5 = src[4]; df = t6 = src[5];
+		for (;;) {
+			da = DEC(da); db = DEC(db); dc = DEC(dc);
+			dd = DEC(dd); de = DEC(de); df = DEC(df);
+
+			*src = da ^ t0; *++src = db ^ t1; *++src = dc ^ t2;
+			*++src = dd ^ t3; *++src = de ^ t4; *++src = df ^ t5;
+
+			++src;
+			t0 = t6;
+			if (--m == 0)
+				break;
+			da = t1 = src[0]; db = t2 = src[1]; dc = t3 = src[2];
+			dd = t4 = src[3]; de = t5 = src[4]; df = t6 = src[5];
+		}
+	}
+	if (n % 2 == 1) {
+		da = t1 = src[0];
+		da = DEC(da);
+		*src = da ^ t0;
+		++src; --n;
+		t0 = t1;
+	}
+	switch (n %= 6) {
+	case 2:
+		da = t1 = src[0]; db = t2 = src[1];
+		da = DEC(da); db = DEC(db);
+		*src = da ^ t0; *++src = db ^ t1;
+		++src;
+		t0 = t2;
+		break;
+	case 4:
+		da = t1 = src[0]; db = t2 = src[1];
+		dc = t3 = src[2]; dd = t4 = src[3];
+		da = DEC(da); db = DEC(db); dc = DEC(dc); dd = DEC(dd);
+		*src = da ^ t0; *++src = db ^ t1;
+		*++src = dc ^ t2; *++src = dd ^ t3;
+		break;
+	}
+}
+
+/* #define GF128_GENERATOR 0x87 */
+
+static inline dquad __attribute__((always_inline))
+GF128_shl(dquad z)
+{
+	dquad a, c;
+
+	c = (dquad){ 0x87 /* GF128 generator */, 1 };
+
+	a = psrad(31, __builtin_shufflevector((q_int)z, V4ZERO, 3, 3, 1, 1));
+
+	return ((z + z) ^ (a & c));
+}
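+
+/*
+ * GF128_shl computes the XTS tweak update t' = 2*t in GF(2^128) with
+ * the reduction polynomial x^128 + x^7 + x^2 + x + 1 (0x87), i.e.
+ *
+ *	t' = (t << 1) ^ (0x87 if bit 127 of t was set, else 0)
+ *
+ * z + z doubles each 64-bit lane but drops bit 63 of the low qword
+ * (the carry into the high qword) and bit 127 (which selects the
+ * reduction).  The shuffle copies the top dword of each qword into
+ * both dwords of a lane, psrad 31 broadcasts its sign bit, and the
+ * mask {0x87, 1} re-inserts the carry as bit 0 of the high qword and
+ * folds the reduction constant into the low qword.
+ */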
+
+static inline void
+init_tweaks(dquad *restrict ts, int n)
+{
+	int i;
+
+	for (i = 0; i < n - 1; ++i)
+		ts[i+1] = GF128_shl(ts[i]);
+}
+
+static inline void
+out_xor(dquad *restrict ds, dquad *restrict ts, dquad *restrict *dst, int n)
+{
+	int i;
+
+	for (i = 0; i < n; ++i, ++*dst)
+		**dst = ds[i] ^ ts[i];
+}
+
+static inline void
+llel_crypt(int dir, int rounds, const dquad *restrict sched, dquad *restrict ds,
+    dquad *restrict ts, const dquad **src, int n)
+{
+	int i;
+
+	for (i = 0; i < n; ++i, ++*src)
+		ds[i] = CRYPT(**src ^ ts[i]);
+}
+
+static inline void __attribute__((always_inline))
+crypt_xex_X6(int dir, int rounds, const dquad *restrict sched, size_t n,
+    const dquad *src, dquad *dst, dquad t0)
+{
+	dquad d[6];
+	dquad t[6];
+	size_t m;
+
+	t[0] = t0;
+
+	if (0 < (m = n / 6)) {
+		init_tweaks(t, 6);
+		for (;;) {
+			llel_crypt(dir, rounds, sched, d, t, &src, 6);
+			out_xor(d, t, &dst, 6);
+			t[0] = GF128_shl(t[5]);
+			if (--m == 0)
+				break;
+			init_tweaks(t, 6);
+		}
+	}
+	if (n % 2 == 1) {
+		*dst++ = *t ^ CRYPT(*t ^ *src++);
+		*t = GF128_shl(*t);
+		--n;
+	}
+	switch (n %= 6) {
+	case 2:
+		n = 2;	/* compile-time constant, as in SW_ROUNDS */
+		init_tweaks(t, n);
+		llel_crypt(dir, rounds, sched, d, t, &src, n);
+		out_xor(d, t, &dst, n);
+		break;
+	case 4:
+		n = 4;
+		init_tweaks(t, n);
+		llel_crypt(dir, rounds, sched, d, t, &src, n);
+		out_xor(d, t, &dst, n);
+		break;
+	}
+}
+
+void
+aesni_enc(int rounds, const uint8_t *restrict sched, const uint8_t *src,
+    uint8_t *dst, const uint8_t *restrict iv)
+{
+
+	*(dquad*)dst = encdec(1, rounds, (dquad*)sched, iv == NULL ?
+	    *(dquad*)src : *(dquad*)src ^ movdqu(iv));
+}
+
+void
+aesni_dec(int rounds, const uint8_t *restrict sched, const uint8_t *src,
+    uint8_t *dst, const uint8_t *restrict iv)
+{
+
+	*(dquad*)dst = encdec(0, rounds, (dquad*)sched, *(dquad*)src);
+	*(dquad*)dst = (iv == NULL) ?
+	    *(dquad*)dst : *(dquad*)dst ^ movdqu(iv);
+}
+
+void
+aesni_encrypt_cbc(int rounds, const void *restrict sched, size_t len,
+    const uint8_t *src, uint8_t *dst, const uint8_t *restrict ivp)
+{
+	dquad iv;
+	dquad *p, *q;
+
+	len /= AES_BLOCK_LEN;
+	iv = movdqu(ivp);
+	p = (dquad*)src;
+	q = (dquad*)dst;
+
+#define ENCRYPT_CBC_LOOP \
+	for (; len--; ++p, ++q) { \
+		iv = encdec(1, rounds, (dquad*)sched, *p ^ iv); \
+		*q = iv; \
+	}
+	SW_ROUNDS(rounds, ENCRYPT_CBC_LOOP)
+}
+
+void
+aesni_decrypt_cbc(int rounds, const void *restrict sched, size_t len,
+    uint8_t *restrict dat, const uint8_t *restrict ivp)
+{
+	dquad iv;
+
+	iv = movdqu(ivp);
+	SW_ROUNDS(rounds, decrypt_cbc_X6(rounds, sched, len / AES_BLOCK_LEN,
+	    (dquad*)dat, iv))
+}
+
+#define CRYPT_XEX_X6(dir) (\
+	crypt_xex_X6((dir), rounds, sched, len / AES_BLOCK_LEN, \
+	    (dquad*)src, (dquad*)dst, \
+	    encdec(1, rounds, xsched, movdqu(ivp))) \
+	)
+void
+aesni_encrypt_xts(int rounds, const void *restrict sched,
+    const void *restrict xsched, size_t len, const uint8_t *src, uint8_t *dst,
+    const uint8_t *restrict ivp)
+{
+
+	SW_ROUNDS(rounds, CRYPT_XEX_X6(1))
+}
+
+void
+aesni_decrypt_xts(int rounds, const void *restrict sched,
+    const void *restrict xsched, size_t len, const uint8_t *src, uint8_t *dst,
+    const uint8_t *restrict ivp)
+{
+
+	SW_ROUNDS(rounds, CRYPT_XEX_X6(0))
+}
Index: crypto/aesni/aesni_wrap.c
===================================================================
--- crypto/aesni/aesni_wrap.c	(revision 240127)
+++ crypto/aesni/aesni_wrap.c	(working copy)
@@ -37,6 +37,7 @@
 
 MALLOC_DECLARE(M_AESNI);
 
+#ifndef __amd64__
 void
 aesni_encrypt_cbc(int rounds, const void *key_schedule, size_t len,
     const uint8_t *from, uint8_t *to, const uint8_t iv[AES_BLOCK_LEN])
@@ -53,6 +54,7 @@
 		to += AES_BLOCK_LEN;
 	}
 }
+#endif
 
 void
 aesni_encrypt_ecb(int rounds, const void *key_schedule, size_t len,
@@ -82,6 +84,7 @@
 	}
 }
 
+#ifndef __amd64__
 #define	AES_XTS_BLOCKSIZE	16
 #define	AES_XTS_IVSIZE		8
 #define	AES_XTS_ALPHA	0x87	/* GF(2^128) generator polynomial */
@@ -169,6 +172,7 @@
 	aesni_crypt_xts(rounds, data_schedule, tweak_schedule, len, from, to,
 	    iv, 0);
 }
+#endif
 
 static int
 aesni_cipher_setup_common(struct aesni_session *ses, const uint8_t *key,
Index: modules/aesni/Makefile
===================================================================
--- modules/aesni/Makefile	(revision 240127)
+++ modules/aesni/Makefile	(working copy)
@@ -4,7 +4,19 @@
 
 KMOD=	aesni
 SRCS=	aesni.c aesni_wrap.c
-SRCS+=	aesencdec_${MACHINE_CPUARCH}.S aeskeys_${MACHINE_CPUARCH}.S
+SRCS+=	aeskeys_${MACHINE_CPUARCH}.S
+
+.if ${MACHINE_CPUARCH} == "i386"
+SRCS+=	aesencdec_${MACHINE_CPUARCH}.S
+.endif
+
+.if ${MACHINE_CPUARCH} == "amd64"
+AESNI_CFLAGS=	$(CFLAGS:N-mno-sse) -msse
+OBJS=	aes_encdec.o
+aes_encdec.o: aes_encdec.c
+	clang $(AESNI_CFLAGS) -o $(.TARGET) -c $(.IMPSRC)
+.endif
+
 SRCS+=	device_if.h bus_if.h opt_bus.h cryptodev_if.h
 
 .include <bsd.kmod.mk>
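For reference, a minimal userland timing sketch in the spirit of the
numbers below (not part of the patch).  It assumes aes_encdec.c is
compiled outside the kernel with -msse, so the prototypes and the
AES128_ROUNDS-style constants come from the "aes_encdec.h" branch
above; that header is not included in the patch, so its contents are
an assumption here.  The key schedule is left zeroed just to exercise
the code path; a real caller would fill it with the key expansion
from aeskeys_amd64.S.  Buffers are 16-byte aligned because the vector
loads and stores in aes_encdec.c are aligned operations.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include "aes_encdec.h"	/* hypothetical userland header, see #ifndef _KERNEL */

int
main(void)
{
	const size_t len = 4096;	/* bytes per crypt, as in the 4096-byte rows */
	const int iters = 200000;
	/* Zeroed schedule with room for 15 round keys (AES-256): timing only. */
	static uint8_t sched[15 * 16] __attribute__((aligned(16)));
	uint8_t iv[16] = { 0 };
	uint8_t *src, *dst;
	struct timespec t0, t1;
	double secs;
	int i;

	if (posix_memalign((void **)&src, 16, len) != 0 ||
	    posix_memalign((void **)&dst, 16, len) != 0)
		return (1);
	memset(src, 0xa5, len);

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < iters; i++)
		aesni_encrypt_cbc(10, sched, len, src, dst, iv);	/* 10 = AES-128 rounds */
	clock_gettime(CLOCK_MONOTONIC, &t1);
	secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;

	printf("%.3f sec, %d aes crypts, %zu bytes, %.0f byte/sec\n",
	    secs, iters, len, (double)iters * len / secs);
	free(src);
	free(dst);
	return (0);
}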
#benchmark
3.917 sec, 2000000 aes crypts, 16 bytes, 8169394 byte/sec, 62.3 Mb/sec
3.957 sec, 2000000 xts crypts, 16 bytes, 8087413 byte/sec, 61.7 Mb/sec
3.925 sec, 2000000 aes192 crypts, 16 bytes, 8152586 byte/sec, 62.2 Mb/sec
3.929 sec, 2000000 aes256 crypts, 16 bytes, 8143737 byte/sec, 62.1 Mb/sec
3.988 sec, 2000000 xts256 crypts, 16 bytes, 8023624 byte/sec, 61.2 Mb/sec
5.309 sec, 2000000 aes crypts, 512 bytes, 192891714 byte/sec, 1471.6 Mb/sec
4.804 sec, 2000000 xts crypts, 512 bytes, 213153973 byte/sec, 1626.2 Mb/sec
5.515 sec, 2000000 aes192 crypts, 512 bytes, 185674185 byte/sec, 1416.6 Mb/sec
5.734 sec, 2000000 aes256 crypts, 512 bytes, 178570712 byte/sec, 1362.4 Mb/sec
4.881 sec, 2000000 xts256 crypts, 512 bytes, 209785511 byte/sec, 1600.5 Mb/sec
1.505 sec, 200000 aes crypts, 4096 bytes, 544330149 byte/sec, 4152.9 Mb/sec
0.990 sec, 200000 xts crypts, 4096 bytes, 827397858 byte/sec, 6312.5 Mb/sec
1.662 sec, 200000 aes192 crypts, 4096 bytes, 492907535 byte/sec, 3760.6 Mb/sec
1.816 sec, 200000 aes256 crypts, 4096 bytes, 451074247 byte/sec, 3441.4 Mb/sec
1.081 sec, 200000 xts256 crypts, 4096 bytes, 757781786 byte/sec, 5781.4 Mb/sec
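(Sanity check on the table: byte/sec = crypts * size / sec, and the
Mb/sec column appears to be byte/sec * 8 / 2^20, i.e. mebibits per
second; the first line gives 2000000 * 16 / 3.917 = 8.17e6 byte/sec
and 8169394 * 8 / 2^20 = 62.3.)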