/*-
 * Copyright (c) 2012 Shane Nievera Tablizo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/param.h>
#ifdef _KERNEL
#include <crypto/aesni/aesni.h>
#endif

#ifndef _KERNEL
#include "aes_encdec.h"
#endif

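/*
 * Two views of a 128-bit XMM value (one AES block): dquad as two 64-bit
 * lanes, q_int as four 32-bit lanes for per-lane integer operations.
 */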
typedef uint64_t dquad __attribute__((__vector_size__(16)));
typedef int q_int __attribute__((__vector_size__(16)));

#define V4ZERO (q_int){ 0, 0, 0, 0 }

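/*
 * DEFASM() generates a one-instruction inline-asm wrapper named <mn>_asm.
 * SW_ROUNDS() re-pins "rounds" to a per-key-size constant before running f,
 * so the always_inline round loops below can unroll at compile time.
 */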
#define DEFASM(mn, out_t, in_t, in_spec) \
static inline out_t __attribute__((always_inline)) \
mn ## _asm(out_t __a, in_t __b) \
{ \
        __asm__(#mn " %1, %0" : "+&x" (__a) : in_spec (__b)); \
        return (__a); \
}

#define SW_ROUNDS(rounds, f) \
        switch (rounds) { \
        case AES128_ROUNDS: rounds = AES128_ROUNDS; f; break; \
        case AES192_ROUNDS: rounds = AES192_ROUNDS; f; break; \
        case AES256_ROUNDS: rounds = AES256_ROUNDS; f; break; \
        }

DEFASM(psrad, dquad, int, "I")
DEFASM(aesenc, dquad, dquad, "x")
DEFASM(aesdec, dquad, dquad, "x")
DEFASM(aesenclast, dquad, dquad, "x")
DEFASM(aesdeclast, dquad, dquad, "x")

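/*
 * movdqu is an unaligned 128-bit load, so callers may pass buffers
 * (e.g. the IV) that are not 16-byte aligned.
 */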
static inline dquad __attribute__((always_inline))
loadu_asm(const dquad *restrict p)
{
        dquad z;

        __asm__("movdqu %1, %0" : "=x" (z) : "m" (*p));

        return (z);
}

#define loadu(x) loadu_asm((x))
#define psrad(i, x) psrad_asm((x), (i))
#define aesenc aesenc_asm
#define aesdec aesdec_asm
#define aesdeclast aesdeclast_asm
#define aesenclast aesenclast_asm

static inline dquad __attribute__((always_inline))
movdqu(const uint8_t *restrict p)
{

        return (loadu((dquad*)p));
}

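/*
 * One block through the key schedule: initial whitening xor, rounds - 1
 * aesenc/aesdec rounds, and a final aesenclast/aesdeclast.  dir selects
 * encryption (nonzero) or decryption.
 */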
static inline dquad __attribute__((always_inline))
encdec(int dir, int rounds, const dquad *restrict sched, dquad t)
{

        t ^= *sched;

        for (; --rounds;)
                t = (dir ? aesenc : aesdec)(t, *++sched);

        return (dir ? aesenclast : aesdeclast)(t, *++sched);
}
#define CRYPT(x) encdec(dir, rounds, (dquad*)sched, (x))
#define DEC(x) encdec(0, rounds, (dquad*)sched, (x))

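/*
 * In-place CBC decryption, six blocks per pass.  Unlike encryption, CBC
 * decryption has no serial dependency between blocks, so six independent
 * aesdec chains are interleaved to hide instruction latency.  The n mod 6
 * tail is handled by peeling one block when the remainder is odd, then a
 * two- or four-block pass.
 */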
static inline void __attribute__((always_inline))
decrypt_cbc_X6(int rounds, const dquad *restrict sched, size_t n,
    dquad *restrict src, dquad t0)
{
        /* TODO: Pipeline this without explicit vars. */
        dquad t1, t2, t3, t4, t5, t6;
        dquad da, db, dc, dd, de, df;

        size_t m;

        if (0 < (m = n / 6)) {
                da = t1 = src[0]; db = t2 = src[1]; dc = t3 = src[2];
                dd = t4 = src[3]; de = t5 = src[4]; df = t6 = src[5];
                for (;;) {
                        da = DEC(da); db = DEC(db); dc = DEC(dc);
                        dd = DEC(dd); de = DEC(de); df = DEC(df);

                        *src = da ^ t0; *++src = db ^ t1; *++src = dc ^ t2;
                        *++src = dd ^ t3; *++src = de ^ t4; *++src = df ^ t5;

                        ++src;
                        t0 = t6;
                        if (--m == 0)
                                break;
                        da = t1 = src[0]; db = t2 = src[1]; dc = t3 = src[2];
                        dd = t4 = src[3]; de = t5 = src[4]; df = t6 = src[5];
                }
        }
        if (n % 2 == 1) {
                da = t1 = src[0];
                da = DEC(da);
                *src = da ^ t0;
                ++src; --n;
                t0 = t1;
        }
        switch (n %= 6) {
        case 2:
                da = t1 = src[0]; db = t2 = src[1];
                da = DEC(da); db = DEC(db);
                *src = da ^ t0; *++src = db ^ t1;
                ++src;
                t0 = t2;
                break;
        case 4:
                da = t1 = src[0]; db = t2 = src[1];
                dc = t3 = src[2]; dd = t4 = src[3];
                da = DEC(da); db = DEC(db); dc = DEC(dc); dd = DEC(dd);
                *src = da ^ t0; *++src = db ^ t1;
                *++src = dc ^ t2; *++src = dd ^ t3;
                break;
        }
}

/* #define GF128_GENERATOR 0x87 */

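/*
 * Multiply the 128-bit XTS tweak by x in GF(2^128).  The shuffle/psrad pair
 * broadcasts the sign bits of both lanes; masking with { 0x87, 1 } carries
 * bit 63 into the high lane and reduces a carry out of bit 127 by the
 * generator byte 0x87.
 */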
static inline dquad __attribute__((always_inline))
GF128_shl(dquad z)
{
        dquad a, c;

        c = (dquad){ 0x87 /* GF128 generator */, 1 };

        a = psrad(31, __builtin_shufflevector((q_int)z, V4ZERO, 3, 3, 1, 1));

        return ((z + z) ^ (a & c));
}

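/*
 * XEX helpers: init_tweaks() derives ts[1..n-1] from ts[0] by repeated
 * doubling, llel_crypt() ciphers n independent blocks with their tweaks
 * pre-xored in, and out_xor() applies the tweaks again on output.
 */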
static inline void
init_tweaks(dquad *restrict ts, int n)
{
        int i;

        for (i = 0; i < n - 1; ++i)
                ts[i + 1] = GF128_shl(ts[i]);
}

static inline void
out_xor(dquad *restrict ds, dquad *restrict ts, dquad *restrict *dst, int n)
{
        int i;

        for (i = 0; i < n; ++i, ++*dst)
                **dst = ds[i] ^ ts[i];
}

static inline void
llel_crypt(int dir, int rounds, const dquad *restrict sched, dquad *restrict ds,
    dquad *restrict ts, const dquad **src, int n)
{
        int i;

        for (i = 0; i < n; ++i, ++*src)
                ds[i] = CRYPT(**src ^ ts[i]);
}

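/*
 * XEX encryption/decryption, six blocks per pass, with the same odd/2/4
 * tail peeling as decrypt_cbc_X6().  t0 is the initial tweak; each pass
 * chains the next tweak from the last one it used.
 */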
static inline void __attribute__((always_inline))
crypt_xex_X6(int dir, int rounds, const dquad *restrict sched, size_t n,
    const dquad *src, dquad *dst, dquad t0)
{
        dquad d[6];
        dquad t[6];
        size_t m;

        t[0] = t0;

        if (0 < (m = n / 6)) {
                init_tweaks(t, 6);
                for (;;) {
                        llel_crypt(dir, rounds, sched, d, t, &src, 6);
                        out_xor(d, t, &dst, 6);
                        t[0] = GF128_shl(t[5]);
                        if (--m == 0)
                                break;
                        init_tweaks(t, 6);
                }
        }
        if (n % 2 == 1) {
                *dst++ = *t ^ CRYPT(*t ^ *src++);
                *t = GF128_shl(*t);
                --n;
        }
        switch (n %= 6) {
        case 2:
                n = 2;  /* n is 2 here; the store keeps it an explicit constant. */
                init_tweaks(t, n);
                llel_crypt(dir, rounds, sched, d, t, &src, n);
                out_xor(d, t, &dst, n);
                break;
        case 4:
                n = 4;
                init_tweaks(t, n);
                llel_crypt(dir, rounds, sched, d, t, &src, n);
                out_xor(d, t, &dst, n);
                break;
        }
}

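/*
 * External aesni_* entry points.  aesni_enc()/aesni_dec() handle a single
 * block; a non-NULL iv requests CBC chaining for that block.
 */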
void
aesni_enc(int rounds, const uint8_t *restrict sched, const uint8_t *src,
    uint8_t *dst, const uint8_t *restrict iv)
{

        *(dquad*)dst = encdec(1, rounds, (dquad*)sched, iv == NULL ?
            *(dquad*)src : *(dquad*)src ^ movdqu(iv));
}

void
aesni_dec(int rounds, const uint8_t *restrict sched, const uint8_t *src,
    uint8_t *dst, const uint8_t *restrict iv)
{

        *(dquad*)dst = encdec(0, rounds, (dquad*)sched, *(dquad*)src);
        *(dquad*)dst = (iv == NULL) ? *(dquad*)dst : *(dquad*)dst ^ movdqu(iv);
}

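/*
 * CBC encryption is inherently serial (each block depends on the previous
 * ciphertext), so a simple loop is used; decryption takes the interleaved
 * path above.
 */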
void
aesni_encrypt_cbc(int rounds, const void *restrict sched, size_t len,
    const uint8_t *src, uint8_t *dst, const uint8_t *restrict ivp)
{
        dquad iv;
        dquad *p, *q;

        len /= AES_BLOCK_LEN;
        iv = movdqu(ivp);
        p = (dquad*)src;
        q = (dquad*)dst;

#define ENCRYPT_CBC_LOOP \
        for (; len--; ++p, ++q) { \
                iv = encdec(1, rounds, (dquad*)sched, *p ^ iv); \
                *q = iv; \
        }
        SW_ROUNDS(rounds, ENCRYPT_CBC_LOOP)
}

void
aesni_decrypt_cbc(int rounds, const void *restrict sched, size_t len,
    const uint8_t *restrict dat, const uint8_t *restrict ivp)
{
        dquad iv;

        iv = movdqu(ivp);
        SW_ROUNDS(rounds, decrypt_cbc_X6(rounds, sched, len / AES_BLOCK_LEN,
            (dquad*)dat, iv))
}

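/*
 * XTS: the initial tweak is the IV encrypted under the tweak-key schedule
 * (xsched); the data blocks then go through the XEX path with the data-key
 * schedule.  Illustrative call for AES-128 (len should be a multiple of
 * AES_BLOCK_LEN):
 *
 *      aesni_encrypt_xts(AES128_ROUNDS, sched, xsched, len, src, dst, ivp);
 */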
#define CRYPT_XEX_X6(dir) (\
        crypt_xex_X6((dir), rounds, sched, len / AES_BLOCK_LEN, \
            (dquad*)src, (dquad*)dst, \
            encdec(1, rounds, xsched, movdqu(ivp))) \
)

void
aesni_encrypt_xts(int rounds, const void *restrict sched,
    const void *restrict xsched, size_t len, const uint8_t *src, uint8_t *dst,
    const uint8_t *restrict ivp)
{

        SW_ROUNDS(rounds, CRYPT_XEX_X6(1))
}

void
aesni_decrypt_xts(int rounds, const void *restrict sched,
    const void *restrict xsched, size_t len, const uint8_t *src, uint8_t *dst,
    const uint8_t *restrict ivp)
{

        SW_ROUNDS(rounds, CRYPT_XEX_X6(0))
}