Index: conf/files.amd64
===================================================================
--- conf/files.amd64	(revision 240127)
+++ conf/files.amd64	(working copy)
@@ -93,6 +93,11 @@
 	no-obj no-implicit-rule before-depend \
 	clean		"acpi_wakedata.h"
 #
+aes_encdec.o			optional	aesni \
+	dependency	"$S/crypto/aesni/aes_encdec.c" \
+	compile-with	"clang ${CFLAGS:N-mno-sse} -msse -c ${.IMPSRC}" \
+	no-implicit-rule clean	"aes_encdec.o"
+#
 amd64/amd64/amd64_mem.c		optional	mem
 #amd64/amd64/apic_vector.S	standard
 amd64/amd64/atomic.c		standard
@@ -131,7 +136,7 @@
 amd64/amd64/vm_machdep.c	standard
 amd64/pci/pci_cfgreg.c		optional	pci
 cddl/contrib/opensolaris/common/atomic/amd64/opensolaris_atomic.S	optional zfs compile-with "${ZFS_S}"
-crypto/aesni/aesencdec_amd64.S	optional aesni
+#crypto/aesni/aesencdec_amd64.S	optional aesni
 crypto/aesni/aeskeys_amd64.S	optional aesni
 crypto/aesni/aesni.c		optional aesni
 crypto/aesni/aesni_wrap.c	optional aesni
Index: crypto/aesni/aesni.h
===================================================================
--- crypto/aesni/aesni.h	(revision 240127)
+++ crypto/aesni/aesni.h	(working copy)
@@ -94,6 +94,15 @@
 void aesni_decrypt_ecb(int rounds, const void *key_schedule, size_t len,
     const uint8_t from[AES_BLOCK_LEN], uint8_t to[AES_BLOCK_LEN]);
 
+#ifdef __amd64__
+void aesni_encrypt_xts(int rounds, const void *data_schedule,
+    const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
+    const uint8_t iv[AES_BLOCK_LEN]);
+void aesni_decrypt_xts(int rounds, const void *data_schedule,
+    const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
+    const uint8_t iv[AES_BLOCK_LEN]);
+#endif
+
 int aesni_cipher_setup(struct aesni_session *ses,
     struct cryptoini *encini);
 int aesni_cipher_process(struct aesni_session *ses,
Index: crypto/aesni/aes_encdec.c
===================================================================
--- crypto/aesni/aes_encdec.c	(revision 0)
+++ crypto/aesni/aes_encdec.c	(working copy)
@@ -0,0 +1,311 @@
+/*-
+ * Copyright (c) 2012 Shane Nievera Tablizo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
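+
+/*
+ * Single-block, CBC, and XTS AES-NI routines written with the
+ * compiler's 128-bit vector types and inline asm instead of a
+ * hand-written .S file.  CBC decryption and XTS process six blocks
+ * at a time so the aesenc/aesdec latency of one block's chain
+ * overlaps with the other five.
+ */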
+
+#include <sys/types.h>
+#ifdef _KERNEL
+#include <crypto/aesni/aesni.h>
+#endif
+
+#ifndef _KERNEL
+#include "aes_encdec.h"
+#endif
+
+typedef uint64_t dquad __attribute__((__vector_size__(16)));
+typedef int q_int __attribute__((__vector_size__(16)));
+
+#define V4ZERO (q_int){ 0, 0, 0, 0 }
+
+#define DEFASM(mn, out_t, in_t, in_spec) \
+	static inline out_t __attribute__((always_inline)) \
+	mn ## _asm(out_t __a, in_t __b) \
+	{ \
+		__asm__(#mn " %1, %0" : "+&x" (__a) : in_spec (__b)); \
+		return (__a); \
+	}
+
+#define SW_ROUNDS(rounds, f) \
+	switch (rounds) { \
+	case AES128_ROUNDS: rounds = AES128_ROUNDS; f; break; \
+	case AES192_ROUNDS: rounds = AES192_ROUNDS; f; break; \
+	case AES256_ROUNDS: rounds = AES256_ROUNDS; f; break; \
+	}
+
+DEFASM(psrad, dquad, int, "I")
+DEFASM(aesenc, dquad, dquad, "x")
+DEFASM(aesdec, dquad, dquad, "x")
+DEFASM(aesenclast, dquad, dquad, "x")
+DEFASM(aesdeclast, dquad, dquad, "x")
+
+static inline dquad __attribute__((always_inline))
+loadu_asm(const dquad *restrict p)
+{
+	dquad z;
+
+	__asm__("movdqu %1, %0" : "=x" (z) : "m" (*p));
+
+	return (z);
+}
+
+#define loadu(x) loadu_asm((x))
+#define psrad(i,x) psrad_asm((x), (i))
+#define aesenc aesenc_asm
+#define aesdec aesdec_asm
+#define aesdeclast aesdeclast_asm
+#define aesenclast aesenclast_asm
+
+static inline dquad __attribute__((always_inline))
+movdqu(const uint8_t *restrict p)
+{
+
+	return (loadu((dquad*)p));
+}
+
+static inline dquad __attribute__((always_inline))
+encdec(int dir, int rounds, const dquad *restrict sched, dquad t)
+{
+
+	t ^= *sched;
+
+	for (; --rounds;)
+		t = (dir ? aesenc : aesdec)(t, *++sched);
+
+	return (dir ? aesenclast : aesdeclast)(t, *++sched);
+}
+#define CRYPT(x) encdec(dir, rounds, (dquad*)sched, (x))
+#define DEC(x) encdec(0, rounds, (dquad*)sched, (x))
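+
+/*
+ * For reference, DEFASM(aesenc, dquad, dquad, "x") above expands to
+ *
+ *	static inline dquad __attribute__((always_inline))
+ *	aesenc_asm(dquad __a, dquad __b)
+ *	{
+ *		__asm__("aesenc %1, %0" : "+&x" (__a) : "x" (__b));
+ *		return (__a);
+ *	}
+ *
+ * SW_ROUNDS re-assigns `rounds' to the matching constant inside each
+ * case, so the compiler sees a compile-time round count within f and
+ * can fully unroll encdec()'s round loop for each key size.
+ */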
+
+static inline void __attribute__((always_inline))
+decrypt_cbc_X6(int rounds, const dquad *restrict sched, size_t n,
+    dquad *restrict src, dquad t0)
+/* TODO: Pipeline this without explicit vars. */
+{
+	dquad t1, t2, t3, t4, t5, t6;
+	dquad da, db, dc, dd, de, df;
+	size_t m;
+
+	if (0 < (m = n / 6)) {
+		da = t1 = src[0]; db = t2 = src[1]; dc = t3 = src[2];
+		dd = t4 = src[3]; de = t5 = src[4]; df = t6 = src[5];
+		for (;;) {
+			da = DEC(da); db = DEC(db); dc = DEC(dc);
+			dd = DEC(dd); de = DEC(de); df = DEC(df);
+
+			*src = da ^ t0; *++src = db ^ t1; *++src = dc ^ t2;
+			*++src = dd ^ t3; *++src = de ^ t4; *++src = df ^ t5;
+
+			++src;
+			t0 = t6;
+			if (--m == 0)
+				break;
+			da = t1 = src[0]; db = t2 = src[1]; dc = t3 = src[2];
+			dd = t4 = src[3]; de = t5 = src[4]; df = t6 = src[5];
+		}
+	}
+	if (n % 2 == 1) {
+		da = t1 = src[0];
+		da = DEC(da);
+		*src = da ^ t0;
+		++src; --n;
+		t0 = t1;
+	}
+	switch (n %= 6) {
+	case 2:
+		da = t1 = src[0]; db = t2 = src[1];
+		da = DEC(da); db = DEC(db);
+		*src = da ^ t0; *++src = db ^ t1;
+		++src;
+		t0 = t2;
+		break;
+	case 4:
+		da = t1 = src[0]; db = t2 = src[1];
+		dc = t3 = src[2]; dd = t4 = src[3];
+		da = DEC(da); db = DEC(db); dc = DEC(dc); dd = DEC(dd);
+		*src = da ^ t0; *++src = db ^ t1;
+		*++src = dc ^ t2; *++src = dd ^ t3;
+		break;
+	}
+}
+
+/* #define GF128_GENERATOR 0x87 */
+
+static inline dquad __attribute__((always_inline))
+GF128_shl(dquad z)
+{
+	dquad a, c;
+
+	c = (dquad){ 0x87 /* GF128 generator */, 1 };
+
+	a = psrad(31, __builtin_shufflevector((q_int)z, V4ZERO, 3, 3, 1, 1));
+
+	return ((z + z) ^ (a & c));
+}
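+
+/*
+ * GF128_shl computes the XTS tweak update t' = 2*t in GF(2^128) with
+ * the reduction polynomial x^128 + x^7 + x^2 + x + 1 (0x87), i.e.
+ *
+ *	t' = (t << 1) ^ (0x87 if bit 127 of t was set, else 0)
+ *
+ * z + z doubles each 64-bit lane but drops bit 63 of the low qword
+ * (the carry into the high qword) and bit 127 (which selects the
+ * reduction).  The shuffle copies the top dword of each qword into
+ * both dwords of a lane, psrad 31 broadcasts its sign bit, and the
+ * mask {0x87, 1} re-inserts the carry as bit 0 of the high qword and
+ * folds the reduction constant into the low qword.
+ */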
+
+static inline void
+init_tweaks(dquad *restrict ts, int n)
+{
+	int i;
+
+	for (i = 0; i < n - 1; ++i)
+		ts[i+1] = GF128_shl(ts[i]);
+}
+
+static inline void
+out_xor(dquad *restrict ds, dquad *restrict ts, dquad *restrict *dst, int n)
+{
+	int i;
+
+	for (i = 0; i < n; ++i, ++*dst)
+		**dst = ds[i] ^ ts[i];
+}
+
+static inline void
+llel_crypt(int dir, int rounds, const dquad *restrict sched, dquad *restrict ds,
+    dquad *restrict ts, const dquad **src, int n)
+{
+	int i;
+
+	for (i = 0; i < n; ++i, ++*src)
+		ds[i] = CRYPT(**src ^ ts[i]);
+}
+
+static inline void __attribute__((always_inline))
+crypt_xex_X6(int dir, int rounds, const dquad *restrict sched, size_t n,
+    const dquad *src, dquad *dst, dquad t0)
+{
+	dquad d[6];
+	dquad t[6];
+	size_t m;
+
+	t[0] = t0;
+
+	if (0 < (m = n / 6)) {
+		init_tweaks(t, 6);
+		for (;;) {
+			llel_crypt(dir, rounds, sched, d, t, &src, 6);
+			out_xor(d, t, &dst, 6);
+			t[0] = GF128_shl(t[5]);
+			if (--m == 0)
+				break;
+			init_tweaks(t, 6);
+		}
+	}
+	if (n % 2 == 1) {
+		*dst++ = *t ^ CRYPT(*t ^ *src++);
+		*t = GF128_shl(*t);
+		--n;
+	}
+	switch (n %= 6) {
+	case 2:
+		n = 2;	/* compile-time constant, as in SW_ROUNDS */
+		init_tweaks(t, n);
+		llel_crypt(dir, rounds, sched, d, t, &src, n);
+		out_xor(d, t, &dst, n);
+		break;
+	case 4:
+		n = 4;
+		init_tweaks(t, n);
+		llel_crypt(dir, rounds, sched, d, t, &src, n);
+		out_xor(d, t, &dst, n);
+		break;
+	}
+}
+
+void
+aesni_enc(int rounds, const uint8_t *restrict sched, const uint8_t *src,
+    uint8_t *dst, const uint8_t *restrict iv)
+{
+
+	*(dquad*)dst = encdec(1, rounds, (dquad*)sched, iv == NULL ?
+	    *(dquad*)src : *(dquad*)src ^ movdqu(iv));
+}
+
+void
+aesni_dec(int rounds, const uint8_t *restrict sched, const uint8_t *src,
+    uint8_t *dst, const uint8_t *restrict iv)
+{
+
+	*(dquad*)dst = encdec(0, rounds, (dquad*)sched, *(dquad*)src);
+	*(dquad*)dst = (iv == NULL) ?
+	    *(dquad*)dst : *(dquad*)dst ^ movdqu(iv);
+}
+
+void
+aesni_encrypt_cbc(int rounds, const void *restrict sched, size_t len,
+    const uint8_t *src, uint8_t *dst, const uint8_t *restrict ivp)
+{
+	dquad iv;
+	dquad *p, *q;
+
+	len /= AES_BLOCK_LEN;
+	iv = movdqu(ivp);
+	p = (dquad*)src;
+	q = (dquad*)dst;
+
+#define ENCRYPT_CBC_LOOP \
+	for (; len--; ++p, ++q) { \
+		iv = encdec(1, rounds, (dquad*)sched, *p ^ iv); \
+		*q = iv; \
+	}
+	SW_ROUNDS(rounds, ENCRYPT_CBC_LOOP)
+}
+
+void
+aesni_decrypt_cbc(int rounds, const void *restrict sched, size_t len,
+    uint8_t *restrict dat, const uint8_t *restrict ivp)
+{
+	dquad iv;
+
+	iv = movdqu(ivp);
+	SW_ROUNDS(rounds, decrypt_cbc_X6(rounds, sched, len / AES_BLOCK_LEN,
+	    (dquad*)dat, iv))
+}
+
+#define CRYPT_XEX_X6(dir) (\
+	crypt_xex_X6((dir), rounds, sched, len / AES_BLOCK_LEN, \
+	    (dquad*)src, (dquad*)dst, \
+	    encdec(1, rounds, xsched, movdqu(ivp))) \
+	)
+void
+aesni_encrypt_xts(int rounds, const void *restrict sched,
+    const void *restrict xsched, size_t len, const uint8_t *src, uint8_t *dst,
+    const uint8_t *restrict ivp)
+{
+
+	SW_ROUNDS(rounds, CRYPT_XEX_X6(1))
+}
+
+void
+aesni_decrypt_xts(int rounds, const void *restrict sched,
+    const void *restrict xsched, size_t len, const uint8_t *src, uint8_t *dst,
+    const uint8_t *restrict ivp)
+{
+
+	SW_ROUNDS(rounds, CRYPT_XEX_X6(0))
+}
Index: crypto/aesni/aesni_wrap.c
===================================================================
--- crypto/aesni/aesni_wrap.c	(revision 240127)
+++ crypto/aesni/aesni_wrap.c	(working copy)
@@ -37,6 +37,7 @@
 
 MALLOC_DECLARE(M_AESNI);
 
+#ifndef __amd64__
 void
 aesni_encrypt_cbc(int rounds, const void *key_schedule, size_t len,
     const uint8_t *from, uint8_t *to, const uint8_t iv[AES_BLOCK_LEN])
@@ -53,6 +54,7 @@
 		to += AES_BLOCK_LEN;
 	}
 }
+#endif
 
 void
 aesni_encrypt_ecb(int rounds, const void *key_schedule, size_t len,
@@ -82,6 +84,7 @@
 	}
 }
 
+#ifndef __amd64__
 #define	AES_XTS_BLOCKSIZE	16
 #define	AES_XTS_IVSIZE		8
 #define	AES_XTS_ALPHA	0x87	/* GF(2^128) generator polynomial */
@@ -169,6 +172,7 @@
 	aesni_crypt_xts(rounds, data_schedule, tweak_schedule, len, from, to,
 	    iv, 0);
 }
+#endif
 
 static int
 aesni_cipher_setup_common(struct aesni_session *ses, const uint8_t *key,
Index: modules/aesni/Makefile
===================================================================
--- modules/aesni/Makefile	(revision 240127)
+++ modules/aesni/Makefile	(working copy)
@@ -4,7 +4,19 @@
 
 KMOD=	aesni
 SRCS=	aesni.c aesni_wrap.c
-SRCS+=	aesencdec_${MACHINE_CPUARCH}.S aeskeys_${MACHINE_CPUARCH}.S
+SRCS+=	aeskeys_${MACHINE_CPUARCH}.S
+
+.if ${MACHINE_CPUARCH} == "i386"
+SRCS+=	aesencdec_${MACHINE_CPUARCH}.S
+.endif
+
+.if ${MACHINE_CPUARCH} == "amd64"
+AESNI_CFLAGS=	$(CFLAGS:N-mno-sse) -msse
+OBJS=	aes_encdec.o
+aes_encdec.o: aes_encdec.c
+	clang $(AESNI_CFLAGS) -o $(.TARGET) -c $(.IMPSRC)
+.endif
+
 SRCS+=	device_if.h bus_if.h opt_bus.h cryptodev_if.h
 
 .include <bsd.kmod.mk>
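For reference, a minimal userland timing sketch in the spirit of the
numbers below (not part of the patch).  It assumes aes_encdec.c is
compiled outside the kernel with -msse, so the prototypes and the
AES128_ROUNDS-style constants come from the "aes_encdec.h" branch
above; that header is not included in the patch, so its contents are
an assumption here.  The key schedule is left zeroed just to exercise
the code path; a real caller would fill it with the key expansion
from aeskeys_amd64.S.  Buffers are 16-byte aligned because the vector
loads and stores in aes_encdec.c are aligned operations.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include "aes_encdec.h"	/* hypothetical userland header, see #ifndef _KERNEL */

int
main(void)
{
	const size_t len = 4096;	/* bytes per crypt, as in the 4096-byte rows */
	const int iters = 200000;
	/* Zeroed schedule with room for 15 round keys (AES-256): timing only. */
	static uint8_t sched[15 * 16] __attribute__((aligned(16)));
	uint8_t iv[16] = { 0 };
	uint8_t *src, *dst;
	struct timespec t0, t1;
	double secs;
	int i;

	if (posix_memalign((void **)&src, 16, len) != 0 ||
	    posix_memalign((void **)&dst, 16, len) != 0)
		return (1);
	memset(src, 0xa5, len);

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < iters; i++)
		aesni_encrypt_cbc(10, sched, len, src, dst, iv);	/* 10 = AES-128 rounds */
	clock_gettime(CLOCK_MONOTONIC, &t1);
	secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;

	printf("%.3f sec, %d aes crypts, %zu bytes, %.0f byte/sec\n",
	    secs, iters, len, (double)iters * len / secs);
	free(src);
	free(dst);
	return (0);
}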
#benchmark
3.917 sec, 2000000 aes crypts, 16 bytes, 8169394 byte/sec, 62.3 Mb/sec
3.957 sec, 2000000 xts crypts, 16 bytes, 8087413 byte/sec, 61.7 Mb/sec
3.925 sec, 2000000 aes192 crypts, 16 bytes, 8152586 byte/sec, 62.2 Mb/sec
3.929 sec, 2000000 aes256 crypts, 16 bytes, 8143737 byte/sec, 62.1 Mb/sec
3.988 sec, 2000000 xts256 crypts, 16 bytes, 8023624 byte/sec, 61.2 Mb/sec
5.309 sec, 2000000 aes crypts, 512 bytes, 192891714 byte/sec, 1471.6 Mb/sec
4.804 sec, 2000000 xts crypts, 512 bytes, 213153973 byte/sec, 1626.2 Mb/sec
5.515 sec, 2000000 aes192 crypts, 512 bytes, 185674185 byte/sec, 1416.6 Mb/sec
5.734 sec, 2000000 aes256 crypts, 512 bytes, 178570712 byte/sec, 1362.4 Mb/sec
4.881 sec, 2000000 xts256 crypts, 512 bytes, 209785511 byte/sec, 1600.5 Mb/sec
1.505 sec, 200000 aes crypts, 4096 bytes, 544330149 byte/sec, 4152.9 Mb/sec
0.990 sec, 200000 xts crypts, 4096 bytes, 827397858 byte/sec, 6312.5 Mb/sec
1.662 sec, 200000 aes192 crypts, 4096 bytes, 492907535 byte/sec, 3760.6 Mb/sec
1.816 sec, 200000 aes256 crypts, 4096 bytes, 451074247 byte/sec, 3441.4 Mb/sec
1.081 sec, 200000 xts256 crypts, 4096 bytes, 757781786 byte/sec, 5781.4 Mb/sec
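(Sanity check on the table: byte/sec = crypts * size / sec, and the
Mb/sec column appears to be byte/sec * 8 / 2^20, i.e. mebibits per
second; the first line gives 2000000 * 16 / 3.917 = 8.17e6 byte/sec
and 8169394 * 8 / 2^20 = 62.3.)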