FreeBSD Bugzilla – Attachment 149948 Details for
Bug 195238
[Hyper-v] Enhance VMBUS and improve storage performance
Home
|
New
|
Browse
|
Search
|
[?]
|
Reports
|
Help
|
New Account
|
Log In
Remember
[x]
|
Forgot Password
Login:
[x]
[patch]
VMBUS and storage driver enhancements for Hyper-V
m1-diff.patch (text/plain), 91.30 KB, created by
Wei Hu
on 2014-11-28 04:08:02 UTC
(
hide
)
Description:
VMBUS and storage driver enhancements for Hyper-V
Filename:
MIME Type:
Creator:
Wei Hu
Created:
2014-11-28 04:08:02 UTC
Size:
91.30 KB
patch
obsolete
>Index: sys/dev/hyperv/include/hyperv.h >=================================================================== >--- sys/dev/hyperv/include/hyperv.h (revision 1) >+++ sys/dev/hyperv/include/hyperv.h (revision 3) >@@ -46,6 +46,7 @@ > #include <sys/systm.h> > #include <sys/lock.h> > #include <sys/sema.h> >+#include <sys/smp.h> > #include <sys/mutex.h> > #include <sys/bus.h> > #include <vm/vm.h> >@@ -63,12 +64,23 @@ > #define HV_ERROR_MACHINE_LOCKED 0x800704F7 > > /* >- * A revision number of vmbus that is used for ensuring both ends on a >- * partition are using compatible versions. >+ * VMBUS version is 32 bit, upper 16 bit for major_number and lower >+ * 16 bit for minor_number. >+ * >+ * 0.13 -- Windows Server 2008 >+ * 1.1 -- Windows 7 >+ * 2.4 -- Windows 8 >+ * 3.0 -- Windows 8.1 > */ >+#define HV_VMBUS_VERSION_WS2008 ((0 << 16) | (13)) >+#define HV_VMBUS_VERSION_WIN7 ((1 << 16) | (1)) >+#define HV_VMBUS_VERSION_WIN8 ((2 << 16) | (4)) >+#define HV_VMBUS_VERSION_WIN8_1 ((3 << 16) | (0)) > >-#define HV_VMBUS_REVISION_NUMBER 13 >+#define HV_VMBUS_VERSION_INVALID -1 > >+#define HV_VMBUS_VERSION_CURRENT HV_VMBUS_VERSION_WIN8_1 >+ > /* > * Make maximum size of pipe payload of 16K > */ >@@ -112,6 +124,18 @@ > unsigned char data[16]; > } __packed hv_guid; > >+#define HV_NIC_GUID \ >+ .data = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46, \ >+ 0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E} >+ >+#define HV_IDE_GUID \ >+ .data = {0x32, 0x26, 0x41, 0x32, 0xcb, 0x86, 0xa2, 0x44, \ >+ 0x9b, 0x5c, 0x50, 0xd1, 0x41, 0x73, 0x54, 0xf5} >+ >+#define HV_SCSI_GUID \ >+ .data = {0xd9, 0x63, 0x61, 0xba, 0xa1, 0x04, 0x29, 0x4d, \ >+ 0xb6, 0x05, 0x72, 0xe2, 0xff, 0xb1, 0xdc, 0x7f} >+ > /* > * At the center of the Channel Management library is > * the Channel Offer. This struct contains the >@@ -147,7 +171,11 @@ > } __packed pipe; > } u; > >- uint32_t padding; >+ /* >+ * Sub_channel_index, newly added in Win8. 
>+ */ >+ uint16_t sub_channel_index; >+ uint16_t padding; > > } __packed hv_vmbus_channel_offer; > >@@ -344,7 +372,25 @@ > hv_vmbus_channel_offer offer; > uint32_t child_rel_id; > uint8_t monitor_id; >- hv_bool_uint8_t monitor_allocated; >+ /* >+ * This field has been splited into a bit field on Win7 >+ * and higher. >+ */ >+ uint8_t monitor_allocated:1; >+ uint8_t reserved:7; >+ /* >+ * Following fields were added in win7 and higher. >+ * Make sure to check the version before accessing these fields. >+ * >+ * If "is_dedicated_interrupt" is set, we must not set the >+ * associated bit in the channel bitmap while sending the >+ * interrupt to the host. >+ * >+ * connection_id is used in signaling the host. >+ */ >+ uint16_t is_dedicated_interrupt:1; >+ uint16_t reserved1:15; >+ uint32_t connection_id; > } __packed hv_vmbus_channel_offer_channel; > > /* >@@ -394,9 +440,14 @@ > hv_gpadl_handle ring_buffer_gpadl_handle; > > /* >- * GPADL for the channel's server context save area. >+ * Starting with win8, this field will be used to specify >+ * the target virtual processor on which to deliver the interrupt for >+ * the host to guest. >+ * Before win8, all incoming channel interrupts are only to >+ * be delivered on cpu 0. Setting this value to 0 would >+ * preserve the earlier behavior. 
> */ >- hv_gpadl_handle server_context_area_gpadl_handle; >+ uint32_t target_vcpu; > > /* > * The upstream ring buffer begins at offset zero in the memory described >@@ -646,14 +697,42 @@ > } hv_vmbus_ring_buffer_info; > > typedef void (*hv_vmbus_pfn_channel_callback)(void *context); >+typedef void (*hv_vmbus_sc_creation_callback)(void *context); > > typedef enum { > HV_CHANNEL_OFFER_STATE, > HV_CHANNEL_OPENING_STATE, > HV_CHANNEL_OPEN_STATE, >+ HV_CHANNEL_OPENED_STATE, > HV_CHANNEL_CLOSING_NONDESTRUCTIVE_STATE, > } hv_vmbus_channel_state; > >+/* >+ * Connection identifier type >+ */ >+typedef union { >+ uint32_t as_uint32_t; >+ struct { >+ uint32_t id:24; >+ uint32_t reserved:8; >+ } u; >+ >+} __packed hv_vmbus_connection_id; >+ >+/* >+ * Definition of the hv_vmbus_signal_event hypercall input structure >+ */ >+typedef struct { >+ hv_vmbus_connection_id connection_id; >+ uint16_t flag_number; >+ uint16_t rsvd_z; >+} __packed hv_vmbus_input_signal_event; >+ >+typedef struct { >+ uint64_t align8; >+ hv_vmbus_input_signal_event event; >+} __packed hv_vmbus_input_signal_event_buffer; >+ > typedef struct hv_vmbus_channel { > TAILQ_ENTRY(hv_vmbus_channel) list_entry; > struct hv_device* device; >@@ -688,8 +767,87 @@ > hv_vmbus_pfn_channel_callback on_channel_callback; > void* channel_callback_context; > >+ /* >+ * If batched_reading is set to "true", mask the interrupt >+ * and read until the channel is empty. >+ * If batched_reading is set to "false", the channel is not >+ * going to perform batched reading. >+ * >+ * Batched reading is enabled by default; specific >+ * drivers that don't want this behavior can turn it off. >+ */ >+ boolean_t batched_reading; >+ >+ boolean_t is_dedicated_interrupt; >+ >+ /* >+ * Used as an input param for HV_CALL_SIGNAL_EVENT hypercall. 
>+ */ >+ hv_vmbus_input_signal_event_buffer signal_event_buffer; >+ /* >+ * 8-bytes aligned of the buffer above >+ */ >+ hv_vmbus_input_signal_event *signal_event_param; >+ >+ /* >+ * From Win8, this field specifies the target virtual process >+ * on which to deliver the interupt from the host to guest. >+ * Before Win8, all channel interrupts would only be >+ * delivered on cpu 0. Setting this value to 0 would preserve >+ * the earlier behavior. >+ */ >+ uint32_t target_vcpu; >+ /* The corresponding CPUID in the guest */ >+ uint32_t target_cpu; >+ >+ /* >+ * Support for multi-channels. >+ * The initial offer is considered the primary channel and this >+ * offer message will indicate if the host supports multi-channels. >+ * The guest is free to ask for multi-channels to be offerred and can >+ * open these multi-channels as a normal "primary" channel. However, >+ * all multi-channels will have the same type and instance guids as the >+ * primary channel. Requests sent on a given channel will result in a >+ * response on the same channel. >+ */ >+ >+ /* >+ * Multi-channel creation callback. This callback will be called in >+ * process context when a Multi-channel offer is received from the host. >+ * The guest can open the Multi-channel in the context of this callback. >+ */ >+ hv_vmbus_sc_creation_callback sc_creation_callback; >+ >+ struct mtx sc_lock; >+ >+ /* >+ * Link list of all the multi-channels if this is a primary channel >+ */ >+ TAILQ_HEAD(, hv_vmbus_channel) sc_list_anchor; >+ TAILQ_ENTRY(hv_vmbus_channel) sc_list_entry; >+ >+ /* >+ * The primary channel this sub-channle belongs to. >+ * This will be NULL for the primary channel. >+ */ >+ struct hv_vmbus_channel *primary_channel; >+ /* >+ * Support per channel state for use by vmbus drivers. >+ */ >+ void *per_channel_state; >+ /* >+ * To support per-cpu lookup mapping of relid to channel, link up >+ * channels based on their CPU affinity. 
>+ */ >+ /*XXX TAILQ_HEAD(, uint32_t) percpu_list; */ > } hv_vmbus_channel; > >+static inline void >+hv_set_channel_read_state(hv_vmbus_channel* channel, boolean_t state) >+{ >+ channel->batched_reading = state; >+} >+ > typedef struct hv_device { > hv_guid class_id; > hv_guid device_id; >@@ -760,6 +918,8 @@ > hv_vmbus_channel* channel, > uint32_t gpadl_handle); > >+struct hv_vmbus_channel* vmbus_select_outgoing_channel(struct hv_vmbus_channel *promary); >+ > /* > * Work abstraction defines > */ >@@ -819,6 +979,9 @@ > > extern uint8_t* receive_buffer[]; > extern hv_vmbus_service service_table[]; >+extern uint32_t hv_vmbus_protocal_version; >+extern int mp_ncpus; >+extern volatile int smp_started; > > void hv_kvp_callback(void *context); > int hv_kvp_init(hv_vmbus_service *serv); >Index: sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c >=================================================================== >--- sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c (revision 1) >+++ sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c (revision 3) >@@ -38,6 +38,7 @@ > #include <sys/param.h> > #include <sys/proc.h> > #include <sys/condvar.h> >+#include <sys/time.h> > #include <sys/systm.h> > #include <sys/sockio.h> > #include <sys/mbuf.h> >@@ -53,8 +54,12 @@ > #include <sys/callout.h> > #include <vm/vm.h> > #include <vm/pmap.h> >+#include <vm/uma.h> > #include <sys/lock.h> > #include <sys/sema.h> >+#include <sys/sglist.h> >+#include <machine/bus.h> >+#include <sys/bus_dma.h> > > #include <cam/cam.h> > #include <cam/cam_ccb.h> >@@ -66,7 +71,6 @@ > #include <cam/scsi/scsi_all.h> > #include <cam/scsi/scsi_message.h> > >- > #include <dev/hyperv/include/hyperv.h> > #include "hv_vstorage.h" > >@@ -77,8 +81,29 @@ > #define BLKVSC_MAX_IO_REQUESTS STORVSC_MAX_IO_REQUESTS > #define STORVSC_MAX_TARGETS (2) > >+#define STORVSC_WIN7_MAJOR 4 >+#define STORVSC_WIN7_MINOR 2 >+ >+#define STORVSC_WIN8_MAJOR 5 >+#define STORVSC_WIN8_MINOR 1 >+ >+#define HV_ALIGN(x, a) (((x) + ((a) - 1)) & ~((a) - 1)) 
>+ > struct storvsc_softc; > >+struct hv_sgl_node { >+ LIST_ENTRY(hv_sgl_node) link; >+ struct sglist *sgl_data; >+}; >+ >+struct hv_sgl_page_pool{ >+ LIST_HEAD(, hv_sgl_node) in_use_sgl_list; >+ LIST_HEAD(, hv_sgl_node) free_sgl_list; >+ boolean_t is_init; >+} g_hv_sgl_page_pool; >+ >+#define STORVSC_MAX_SG_PAGE_CNT STORVSC_MAX_IO_REQUESTS * HV_MAX_MULTIPAGE_BUFFER_COUNT >+ > enum storvsc_request_type { > WRITE_TYPE, > READ_TYPE, >@@ -96,20 +121,24 @@ > struct storvsc_softc *softc; > struct callout callout; > struct sema synch_sema; /*Synchronize the request/response if needed */ >+ struct sglist *bounce_sgl; >+ unsigned int bounce_sgl_count; >+ uint64_t not_aligned_seg_bits; > }; > > struct storvsc_softc { > struct hv_device *hs_dev; >- LIST_HEAD(, hv_storvsc_request) hs_free_list; >- struct mtx hs_lock; >- struct storvsc_driver_props *hs_drv_props; >- int hs_unit; >- uint32_t hs_frozen; >- struct cam_sim *hs_sim; >- struct cam_path *hs_path; >+ LIST_HEAD(, hv_storvsc_request) hs_free_list; >+ struct mtx hs_lock; >+ struct storvsc_driver_props *hs_drv_props; >+ int hs_unit; >+ uint32_t hs_frozen; >+ struct cam_sim *hs_sim; >+ struct cam_path *hs_path; > uint32_t hs_num_out_reqs; > boolean_t hs_destroy; > boolean_t hs_drain_notify; >+ boolean_t hs_open_multi_channel; > struct sema hs_drain_sema; > struct hv_storvsc_request hs_init_req; > struct hv_storvsc_request hs_reset_req; >@@ -124,7 +153,7 @@ > * The first can be tested by "sg_senddiag -vv /dev/daX", > * and the second and third can be done by > * "sg_wr_mode -v -p 08 -c 0,1a -m 0,ff /dev/daX". 
>- */ >+ */ > #define HVS_TIMEOUT_TEST 0 > > /* >@@ -138,7 +167,7 @@ > char *drv_name; > char *drv_desc; > uint8_t drv_max_luns_per_target; >- uint8_t drv_max_ios_per_target; >+ uint8_t drv_max_ios_per_target; > uint32_t drv_ringbuffer_size; > }; > >@@ -150,6 +179,8 @@ > > #define HS_MAX_ADAPTERS 10 > >+#define HV_STORAGE_SUPPORTS_MULTI_CHANNEL 0x1 >+ > /* {ba6163d9-04a1-4d29-b605-72e2ffb1dc7f} */ > static const hv_guid gStorVscDeviceType={ > .data = {0xd9, 0x63, 0x61, 0xba, 0xa1, 0x04, 0x29, 0x4d, >@@ -171,6 +202,9 @@ > STORVSC_RINGBUFFER_SIZE} > }; > >+static int storvsc_current_major; >+static int storvsc_current_minor; >+ > /* static functions */ > static int storvsc_probe(device_t dev); > static int storvsc_attach(device_t dev); >@@ -177,7 +211,7 @@ > static int storvsc_detach(device_t dev); > static void storvsc_poll(struct cam_sim * sim); > static void storvsc_action(struct cam_sim * sim, union ccb * ccb); >-static void create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp); >+static int create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp); > static void storvsc_free_request(struct storvsc_softc *sc, struct hv_storvsc_request *reqp); > static enum hv_storage_type storvsc_get_storage_type(device_t dev); > static void hv_storvsc_on_channel_callback(void *context); >@@ -186,6 +220,14 @@ > struct hv_storvsc_request *request); > static int hv_storvsc_connect_vsp(struct hv_device *device); > static void storvsc_io_done(struct hv_storvsc_request *reqp); >+void storvsc_copy_sgl_to_bounce_buf(struct sglist *bounce_sgl, >+ bus_dma_segment_t *orig_sgl, >+ unsigned int orig_sgl_count, >+ uint64_t seg_bits); >+void storvsc_copy_from_bounce_buf_to_sgl(bus_dma_segment_t *dest_sgl, >+ unsigned int dest_sgl_count, >+ struct sglist* src_sgl, >+ uint64_t seg_bits); > > static device_method_t storvsc_methods[] = { > /* Device interface */ >@@ -207,7 +249,7 @@ > > > /** >- * The host is capable of sending messages to us that are >+ * The host 
is capable of sending messages to us that are > * completely unsolicited. So, we need to address the race > * condition where we may be in the process of unloading the > * driver when the host may send us an unsolicited message. >@@ -223,7 +265,7 @@ > * destroyed. > * > * 3. Once the device is marked as being destroyed, we only >- * permit incoming traffic to properly account for >+ * permit incoming traffic to properly account for > * packets already sent out. > */ > static inline struct storvsc_softc * >@@ -260,6 +302,114 @@ > } > > /** >+ * @brief Callback handler, will be invoked when receive mutil-channel offer >+ * >+ * @param context new multi-channel >+ */ >+static void >+storvsc_handle_sc_creation(void *context) >+{ >+ hv_vmbus_channel *new_channel = NULL; >+ struct hv_device *device = NULL; >+ struct storvsc_softc *sc = NULL; >+ struct vmstor_chan_props props; >+ int ret = 0; >+ >+ new_channel = (hv_vmbus_channel *)context; >+ device = new_channel->primary_channel->device; >+ sc = get_stor_device(device, TRUE); >+ if (NULL == sc){ >+ return; >+ } >+ >+ if (FALSE == sc->hs_open_multi_channel){ >+ return; >+ } >+ >+ memset(&props, 0, sizeof(struct vmstor_chan_props)); >+ >+ ret = hv_vmbus_channel_open(new_channel, >+ sc->hs_drv_props->drv_ringbuffer_size, >+ sc->hs_drv_props->drv_ringbuffer_size, >+ (void *)&props, >+ sizeof(struct vmstor_chan_props), >+ hv_storvsc_on_channel_callback, >+ new_channel); >+ >+ return; >+} >+ >+/** >+ * @brief Send multi-channel creation request to host >+ * >+ * @param device a Hyper-V device pointer >+ * @param max_chans the max channels supported by vmbus >+ */ >+static void >+storvsc_send_multichannel_request(struct hv_device *dev, int max_chans) >+{ >+ struct storvsc_softc *sc = NULL; >+ struct hv_storvsc_request *request = NULL; >+ struct vstor_packet *vstor_packet = NULL; >+ int request_channels_cnt = 0; >+ int ret; >+ >+ /* get multichannels count that need to create */ >+ request_channels_cnt = ((max_chans > mp_ncpus) 
? mp_ncpus : max_chans); >+ >+ sc = get_stor_device(dev, TRUE); >+ if (sc == NULL) { >+ printf("Storvsc_error: get sc failed while send mutilchannel " >+ "request\n"); >+ return; >+ } >+ >+ request = &sc->hs_init_req; >+ >+ /* Establish a handler for multi-channel */ >+ dev->channel->sc_creation_callback = storvsc_handle_sc_creation; >+ >+ /* request the host to create multi-channel */ >+ memset(request, 0, sizeof(struct hv_storvsc_request)); >+ >+ sema_init(&request->synch_sema, 0, ("stor_synch_sema")); >+ >+ vstor_packet = &request->vstor_packet; >+ >+ vstor_packet->operation = VSTOR_OPERATION_CREATE_MULTI_CHANNELS; >+ vstor_packet->flags = REQUEST_COMPLETION_FLAG; >+ vstor_packet->u.multi_channels_cnt = request_channels_cnt; >+ >+ ret = hv_vmbus_channel_send_packet( >+ dev->channel, >+ vstor_packet, >+ sizeof(struct vstor_packet), >+ (uint64_t)(uintptr_t)request, >+ HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, >+ HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); >+ >+ /* wait for 500 ticks */ >+ ret = sema_timedwait(&request->synch_sema, 500); >+ if (ret != 0) { >+ printf("Storvsc_error: create multi-channel timeout, %d\n", >+ ret); >+ return; >+ } >+ >+ if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || >+ vstor_packet->status != 0) { >+ printf("Storvsc_error: create multi-channel invalid operation " >+ "(%d) or statue (%u)\n", >+ vstor_packet->operation, vstor_packet->status); >+ return; >+ } >+ >+ sc->hs_open_multi_channel = TRUE; >+ >+ printf("Storvsc create multi-channel success!\n"); >+} >+ >+/** > * @brief initialize channel connection to parent partition > * > * @param dev a Hyper-V device pointer >@@ -272,6 +422,8 @@ > struct hv_storvsc_request *request; > struct vstor_packet *vstor_packet; > struct storvsc_softc *sc; >+ uint16_t max_chans = 0; >+ boolean_t is_support_multichannel = FALSE; > > sc = get_stor_device(dev, TRUE); > if (sc == NULL) { >@@ -304,7 +456,8 @@ > goto cleanup; > } > >- ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 
seconds */ >+ /* wait 500 ticks */ >+ ret = sema_timedwait(&request->synch_sema, 500); > > if (ret != 0) { > goto cleanup; >@@ -321,7 +474,8 @@ > vstor_packet->operation = VSTOR_OPERATION_QUERYPROTOCOLVERSION; > vstor_packet->flags = REQUEST_COMPLETION_FLAG; > >- vstor_packet->u.version.major_minor = VMSTOR_PROTOCOL_VERSION_CURRENT; >+ vstor_packet->u.version.major_minor = >+ VMSTOR_PROTOCOL_VERSION(storvsc_current_major, storvsc_current_minor); > > /* revision is only significant for Windows guests */ > vstor_packet->u.version.revision = 0; >@@ -338,7 +492,8 @@ > goto cleanup; > } > >- ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */ >+ /* wait 500 ticks */ >+ ret = sema_timedwait(&request->synch_sema, 500); > > if (ret) { > goto cleanup; >@@ -369,7 +524,8 @@ > goto cleanup; > } > >- ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */ >+ /* wait 500 ticks */ >+ ret = sema_timedwait(&request->synch_sema, 500); > > if (ret != 0) { > goto cleanup; >@@ -377,10 +533,20 @@ > > /* TODO: Check returned version */ > if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || >- vstor_packet->status != 0) { >+ vstor_packet->status != 0) { > goto cleanup; > } > >+ /* multi-channels feature is supported by WIN8 and above version */ >+ max_chans = vstor_packet->u.chan_props.max_channel_cnt; >+ if ((hv_vmbus_protocal_version != HV_VMBUS_VERSION_WIN7) && >+ (hv_vmbus_protocal_version != HV_VMBUS_VERSION_WS2008)) { >+ if (vstor_packet->u.chan_props.flags & >+ HV_STORAGE_SUPPORTS_MULTI_CHANNEL) { >+ is_support_multichannel = TRUE; >+ } >+ } >+ > memset(vstor_packet, 0, sizeof(struct vstor_packet)); > vstor_packet->operation = VSTOR_OPERATION_ENDINITIALIZATION; > vstor_packet->flags = REQUEST_COMPLETION_FLAG; >@@ -397,7 +563,8 @@ > goto cleanup; > } > >- ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */ >+ /* wait 500 ticks */ >+ ret = sema_timedwait(&request->synch_sema, 500); > > if (ret != 0) { > goto cleanup; >@@ -404,10 
+571,18 @@ > } > > if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || >- vstor_packet->status != 0) { >+ vstor_packet->status != 0) { > goto cleanup; > } > >+ /* >+ * If multi-channel is supported, send multichannel create >+ * request to host. >+ */ >+ if (is_support_multichannel){ >+ storvsc_send_multichannel_request(dev, max_chans); >+ } >+ > cleanup: > sema_destroy(&request->synch_sema); > return (ret); >@@ -443,9 +618,8 @@ > (void *)&props, > sizeof(struct vmstor_chan_props), > hv_storvsc_on_channel_callback, >- dev); >+ dev->channel); > >- > if (ret != 0) { > return ret; > } >@@ -498,7 +672,7 @@ > > > /* >- * At this point, all outstanding requests in the adapter >+ * At this point, all outstanding requests in the adapter > * should have been flushed out and return to us > */ > >@@ -521,6 +695,7 @@ > { > struct storvsc_softc *sc; > struct vstor_packet *vstor_packet = &request->vstor_packet; >+ struct hv_vmbus_channel* outgoing_channel = NULL; > int ret = 0; > > sc = get_stor_device(device, TRUE); >@@ -539,19 +714,20 @@ > > vstor_packet->operation = VSTOR_OPERATION_EXECUTESRB; > >+ outgoing_channel = vmbus_select_outgoing_channel(device->channel); > > mtx_unlock(&request->softc->hs_lock); > if (request->data_buf.length) { > ret = hv_vmbus_channel_send_packet_multipagebuffer( >- device->channel, >+ outgoing_channel, > &request->data_buf, >- vstor_packet, >- sizeof(struct vstor_packet), >+ vstor_packet, >+ sizeof(struct vstor_packet), > (uint64_t)(uintptr_t)request); > > } else { > ret = hv_vmbus_channel_send_packet( >- device->channel, >+ outgoing_channel, > vstor_packet, > sizeof(struct vstor_packet), > (uint64_t)(uintptr_t)request, >@@ -610,7 +786,8 @@ > hv_storvsc_on_channel_callback(void *context) > { > int ret = 0; >- struct hv_device *device = (struct hv_device *)context; >+ hv_vmbus_channel *channel = (hv_vmbus_channel *)context; >+ struct hv_device *device = NULL; > struct storvsc_softc *sc; > uint32_t bytes_recvd; > uint64_t request_id; >@@ 
-618,15 +795,22 @@ > struct hv_storvsc_request *request; > struct vstor_packet *vstor_packet; > >+ if (channel->primary_channel != NULL){ >+ device = channel->primary_channel->device; >+ } else { >+ device = channel->device; >+ } >+ >+ KASSERT(device, ("device")); >+ > sc = get_stor_device(device, FALSE); > if (sc == NULL) { >+ printf("Storvsc_error: get stor device failed.\n"); > return; > } > >- KASSERT(device, ("device")); >- > ret = hv_vmbus_channel_recv_packet( >- device->channel, >+ channel, > packet, > roundup2(sizeof(struct vstor_packet), 8), > &bytes_recvd, >@@ -634,21 +818,29 @@ > > while ((ret == 0) && (bytes_recvd > 0)) { > request = (struct hv_storvsc_request *)(uintptr_t)request_id; >- KASSERT(request, ("request")); > > if ((request == &sc->hs_init_req) || > (request == &sc->hs_reset_req)) { > memcpy(&request->vstor_packet, packet, > sizeof(struct vstor_packet)); >- sema_post(&request->synch_sema); >+ sema_post(&request->synch_sema); > } else { > vstor_packet = (struct vstor_packet *)packet; > switch(vstor_packet->operation) { > case VSTOR_OPERATION_COMPLETEIO: >+ if (request == NULL) { >+ printf("VMBUS: storvsc received a " >+ "packet with NULL request id in " >+ "COMPLETEIO operation. 
Panick!\n"); >+ KASSERT(request, ("request")); >+ } > hv_storvsc_on_iocompletion(sc, > vstor_packet, request); > break; > case VSTOR_OPERATION_REMOVEDEVICE: >+ case VSTOR_OPERATION_ENUMERATE_BUS: >+ printf("VMBUS: storvsc operation %d not " >+ "implemented.\n", vstor_packet->operation); > /* TODO: implement */ > break; > default: >@@ -656,7 +848,7 @@ > } > } > ret = hv_vmbus_channel_recv_packet( >- device->channel, >+ channel, > packet, > roundup2(sizeof(struct vstor_packet), 8), > &bytes_recvd, >@@ -680,7 +872,16 @@ > { > int ata_disk_enable = 0; > int ret = ENXIO; >- >+ >+ if ((HV_VMBUS_VERSION_WIN8 == hv_vmbus_protocal_version) || >+ (HV_VMBUS_VERSION_WIN8_1 == hv_vmbus_protocal_version)){ >+ storvsc_current_major = STORVSC_WIN8_MAJOR; >+ storvsc_current_minor = STORVSC_WIN8_MINOR; >+ } else { >+ storvsc_current_major = STORVSC_WIN7_MAJOR; >+ storvsc_current_minor = STORVSC_WIN7_MINOR; >+ } >+ > switch (storvsc_get_storage_type(dev)) { > case DRIVER_BLKVSC: > if(bootverbose) >@@ -721,9 +922,11 @@ > enum hv_storage_type stor_type; > struct storvsc_softc *sc; > struct cam_devq *devq; >- int ret, i; >+ int ret, i, j; > struct hv_storvsc_request *reqp; > struct root_hold_token *root_mount_token = NULL; >+ struct hv_sgl_node *sgl_node = NULL; >+ void *tmp_buff = NULL; > > /* > * We need to serialize storvsc attach calls. 
>@@ -764,8 +967,46 @@ > LIST_INSERT_HEAD(&sc->hs_free_list, reqp, link); > } > >+ /* create sg-list page pool */ >+ if (FALSE == g_hv_sgl_page_pool.is_init){ >+ g_hv_sgl_page_pool.is_init = TRUE; >+ LIST_INIT(&g_hv_sgl_page_pool.in_use_sgl_list); >+ LIST_INIT(&g_hv_sgl_page_pool.free_sgl_list); >+ >+ /* pre-create SG list, each SG list with HV_MAX_MULTIPAGE_BUFFER_COUNT segments, each segment has one page buffer */ >+ for (i = 0; i < STORVSC_MAX_IO_REQUESTS; i++){ >+ sgl_node = malloc(sizeof(struct hv_sgl_node), >+ M_DEVBUF, M_WAITOK|M_ZERO); >+ if (NULL == sgl_node){ >+ ret = ENOMEM; >+ goto cleanup; >+ } >+ >+ sgl_node->sgl_data = sglist_alloc(HV_MAX_MULTIPAGE_BUFFER_COUNT, >+ M_WAITOK|M_ZERO); >+ if (NULL == sgl_node->sgl_data){ >+ ret = ENOMEM; >+ goto cleanup; >+ } >+ >+ for (j = 0; j < HV_MAX_MULTIPAGE_BUFFER_COUNT; j++){ >+ tmp_buff = malloc(PAGE_SIZE, >+ M_DEVBUF, M_WAITOK|M_ZERO); >+ if (NULL == tmp_buff){ >+ ret = ENOMEM; >+ goto cleanup; >+ } >+ >+ sgl_node->sgl_data->sg_segs[j].ss_paddr = (vm_paddr_t)tmp_buff; >+ } >+ >+ LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list, sgl_node, link); >+ } >+ } >+ > sc->hs_destroy = FALSE; > sc->hs_drain_notify = FALSE; >+ sc->hs_open_multi_channel = FALSE; > sema_init(&sc->hs_drain_sema, 0, "Store Drain Sema"); > > ret = hv_storvsc_connect_vsp(hv_dev); >@@ -834,6 +1075,19 @@ > LIST_REMOVE(reqp, link); > free(reqp, M_DEVBUF); > } >+ >+ while (!LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) { >+ sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); >+ LIST_REMOVE(sgl_node, link); >+ for (j = 0; j < HV_MAX_MULTIPAGE_BUFFER_COUNT; j++){ >+ if (NULL != (void*)sgl_node->sgl_data->sg_segs[j].ss_paddr){ >+ free((void*)sgl_node->sgl_data->sg_segs[j].ss_paddr, M_DEVBUF); >+ } >+ } >+ sglist_free(sgl_node->sgl_data); >+ free(sgl_node, M_DEVBUF); >+ } >+ > return (ret); > } > >@@ -853,6 +1107,8 @@ > struct storvsc_softc *sc = device_get_softc(dev); > struct hv_storvsc_request *reqp = NULL; > struct hv_device *hv_device = 
vmbus_get_devctx(dev); >+ struct hv_sgl_node *sgl_node = NULL; >+ int j = 0; > > mtx_lock(&hv_device->channel->inbound_lock); > sc->hs_destroy = TRUE; >@@ -884,6 +1140,19 @@ > free(reqp, M_DEVBUF); > } > mtx_unlock(&sc->hs_lock); >+ >+ while (!LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) { >+ sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); >+ LIST_REMOVE(sgl_node, link); >+ for (j = 0; j < HV_MAX_MULTIPAGE_BUFFER_COUNT; j++){ >+ if (NULL != (void*)sgl_node->sgl_data->sg_segs[j].ss_paddr){ >+ free((void*)sgl_node->sgl_data->sg_segs[j].ss_paddr, M_DEVBUF); >+ } >+ } >+ sglist_free(sgl_node->sgl_data); >+ free(sgl_node, M_DEVBUF); >+ } >+ > return (0); > } > >@@ -939,7 +1208,7 @@ > ticks, __func__, (ret == 0)? > "IO return detected" : > "IO return not detected"); >- /* >+ /* > * Now both the timer handler and io done are running > * simultaneously. We want to confirm the io done always > * finishes after the timer handler exits. So reqp used by >@@ -1024,7 +1293,7 @@ > > mtx_assert(&sc->hs_lock, MA_OWNED); > mtx_unlock(&sc->hs_lock); >- hv_storvsc_on_channel_callback(sc->hs_dev); >+ hv_storvsc_on_channel_callback(sc->hs_dev->channel); > mtx_lock(&sc->hs_lock); > } > >@@ -1152,10 +1421,14 @@ > > bzero(reqp, sizeof(struct hv_storvsc_request)); > reqp->softc = sc; >+ >+ ccb->ccb_h.status |= CAM_SIM_QUEUED; >+ if ((res = create_storvsc_request(ccb, reqp)) != 0) { >+ ccb->ccb_h.status = CAM_REQ_INVALID; >+ xpt_done(ccb); >+ return; >+ } > >- ccb->ccb_h.status |= CAM_SIM_QUEUED; >- create_storvsc_request(ccb, reqp); >- > if (ccb->ccb_h.timeout != CAM_TIME_INFINITY) { > callout_init(&reqp->callout, CALLOUT_MPSAFE); > callout_reset(&reqp->callout, >@@ -1195,6 +1468,207 @@ > } > > /** >+ * @brief destroy bounce buffer >+ * >+ * This function is responsible for destroy a Scatter/Gather list >+ * that create by storvsc_create_bounce_buffer() >+ * >+ * @param sgl- the Scatter/Gather need be destroy >+ * @param sg_count- page count of the SG list. 
>+ * >+ */ >+static void >+storvsc_destroy_bounce_buffer(struct sglist *sgl) >+{ >+ struct hv_sgl_node *sgl_node = NULL; >+ >+ sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.in_use_sgl_list); >+ LIST_REMOVE(sgl_node, link); >+ if (NULL == sgl_node) { >+ printf("storvsc error: not enough in use sgl\n"); >+ return; >+ } >+ sgl_node->sgl_data = sgl; >+ LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list, sgl_node, link); >+} >+ >+/** >+ * @brief create bounce buffer >+ * >+ * This function is responsible for create a Scatter/Gather list, >+ * which hold several pages that can be aligned with page size. >+ * >+ * @param seg_count- SG-list segments count >+ * @param write - if WRITE_TYPE, set SG list page used size to 0, >+ * otherwise set used size to page size. >+ * >+ * return NULL if create failed >+ */ >+static struct sglist * >+storvsc_create_bounce_buffer(uint16_t seg_count, int write) >+{ >+ int i = 0; >+ struct sglist *bounce_sgl = NULL; >+ unsigned int buf_len = ((write == WRITE_TYPE) ? 0 : PAGE_SIZE); >+ struct hv_sgl_node *sgl_node = NULL; >+ >+ /* get struct sglist from free_sgl_list */ >+ sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); >+ LIST_REMOVE(sgl_node, link); >+ if (NULL == sgl_node) { >+ printf("storvsc error: not enough free sgl\n"); >+ return NULL; >+ } >+ bounce_sgl = sgl_node->sgl_data; >+ LIST_INSERT_HEAD(&g_hv_sgl_page_pool.in_use_sgl_list, sgl_node, link); >+ >+ bounce_sgl->sg_maxseg = seg_count; >+ if (write == WRITE_TYPE) { >+ bounce_sgl->sg_nseg = 0; >+ } else { >+ bounce_sgl->sg_nseg = seg_count; >+ } >+ >+ for (i = 0; i < seg_count; i++) { >+ bounce_sgl->sg_segs[i].ss_len = buf_len; >+ } >+ >+ return bounce_sgl; >+} >+ >+/** >+ * @brief copy data from SG list to bounce buffer >+ * >+ * This function is responsible for copy data from one SG list's segments >+ * to another SG list which used as bounce buffer. >+ * >+ * @param bounce_sgl - the destination SG list >+ * @param orig_sgl - the segment of the source SG list. 
>+ * @param orig_sgl_count - the count of segments. >+ * @param orig_sgl_count - indicate which segment need bounce buffer, set 1 means need. >+ * >+ */ >+void storvsc_copy_sgl_to_bounce_buf(struct sglist *bounce_sgl, >+ bus_dma_segment_t *orig_sgl, >+ unsigned int orig_sgl_count, >+ uint64_t seg_bits) >+{ >+ int src_sgl_idx = 0; >+ >+ for (src_sgl_idx = 0; src_sgl_idx < orig_sgl_count; src_sgl_idx++) { >+ if (seg_bits & (1 << src_sgl_idx)) { >+ memcpy((void*)bounce_sgl->sg_segs[src_sgl_idx].ss_paddr, >+ (void*)orig_sgl[src_sgl_idx].ds_addr, >+ orig_sgl[src_sgl_idx].ds_len); >+ bounce_sgl->sg_segs[src_sgl_idx].ss_len = >+ orig_sgl[src_sgl_idx].ds_len; >+ } >+ } >+} >+ >+/** >+ * @brief copy data from SG list which used as bounce to another SG list >+ * >+ * This function is responsible for copy data from one SG list with bounce >+ * buffer to another SG list's segments. >+ * >+ * @param dest_sgl - the destination SG list's segments >+ * @param dest_sgl_count - the count of destination SG list's segment. >+ * @param src_sgl - the source SG list. >+ * @param seg_bits - indicate which segment used bounce buffer of src SG-list. >+ * >+ */ >+void >+storvsc_copy_from_bounce_buf_to_sgl(bus_dma_segment_t *dest_sgl, >+ unsigned int dest_sgl_count, >+ struct sglist* src_sgl, >+ uint64_t seg_bits) >+{ >+ int sgl_idx = 0; >+ >+ for (sgl_idx = 0; sgl_idx < dest_sgl_count; sgl_idx++) { >+ if (seg_bits & (1 << sgl_idx)) { >+ memcpy((void*)(dest_sgl[sgl_idx].ds_addr), >+ (void*)(src_sgl->sg_segs[sgl_idx].ss_paddr), >+ src_sgl->sg_segs[sgl_idx].ss_len); >+ } >+ } >+} >+ >+/** >+ * @brief check SG list with bounce buffer or not >+ * >+ * This function is responsible for check if need bounce buffer for SG list. >+ * >+ * @param sgl - the SG list's segments >+ * @param sg_count - the count of SG list's segment. 
>+ * @param bits - segmengs number that need bounce buffer >+ * >+ * return -1 if SG list needless bounce buffer >+ */ >+static int >+storvsc_check_bounce_buffer_sgl(bus_dma_segment_t *sgl, unsigned int sg_count, uint64_t *bits) >+{ >+ int i = 0; >+ int offset = 0; >+ uint64_t phys_addr = 0; >+ uint64_t tmp_bits = 0; >+ boolean_t found_hole = FALSE; >+ boolean_t pre_aligned = TRUE; >+ >+ if (sg_count < 2){ >+ return -1; >+ } >+ >+ *bits = 0; >+ >+ phys_addr = vtophys(sgl[0].ds_addr); >+ offset = phys_addr - trunc_page(phys_addr); >+ if (offset){ >+ pre_aligned = FALSE; >+ tmp_bits |= 1; >+ } >+ >+ for (i = 1; i < sg_count; i++) { >+ phys_addr = vtophys(sgl[i].ds_addr); >+ offset = phys_addr - trunc_page(phys_addr); >+ >+ if (0 == offset) { >+ if (FALSE == pre_aligned){ >+ /* >+ * This segment is aligned, if the previous >+ * one is not aligned, find a hole >+ */ >+ found_hole = TRUE; >+ } >+ pre_aligned = TRUE; >+ } else { >+ tmp_bits |= 1 << i; >+ if (FALSE == pre_aligned) { >+ if (phys_addr != vtophys(sgl[i-1].ds_addr + >+ sgl[i-1].ds_len)) { >+ /* >+ * Check whether connect to previous >+ * segment,if not, find the hole >+ */ >+ found_hole = TRUE; >+ } >+ } else { >+ found_hole = TRUE; >+ } >+ pre_aligned = FALSE; >+ } >+ } >+ >+ if (FALSE == found_hole) { >+ return -1; >+ } else { >+ *bits = tmp_bits; >+ return 0; >+ } >+} >+ >+/** > * @brief Fill in a request structure based on a CAM control block > * > * Fills in a request structure based on the contents of a CAM control >@@ -1204,7 +1678,7 @@ > * @param ccb pointer to a CAM contorl block > * @param reqp pointer to a request structure > */ >-static void >+static int > create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp) > { > struct ccb_scsiio *csio = &ccb->csio; >@@ -1212,6 +1686,7 @@ > uint32_t bytes_to_copy = 0; > uint32_t pfn_num = 0; > uint32_t pfn; >+ uint64_t not_aligned_seg_bits = 0; > > /* refer to struct vmscsi_req for meanings of these two fields */ > 
reqp->vstor_packet.u.vm_srb.port = >@@ -1232,18 +1707,18 @@ > } > > switch (ccb->ccb_h.flags & CAM_DIR_MASK) { >- case CAM_DIR_OUT: >- reqp->vstor_packet.u.vm_srb.data_in = WRITE_TYPE; >- break; >- case CAM_DIR_IN: >- reqp->vstor_packet.u.vm_srb.data_in = READ_TYPE; >- break; >- case CAM_DIR_NONE: >- reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; >- break; >- default: >- reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; >- break; >+ case CAM_DIR_OUT: >+ reqp->vstor_packet.u.vm_srb.data_in = WRITE_TYPE; >+ break; >+ case CAM_DIR_IN: >+ reqp->vstor_packet.u.vm_srb.data_in = READ_TYPE; >+ break; >+ case CAM_DIR_NONE: >+ reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; >+ break; >+ default: >+ reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; >+ break; > } > > reqp->sense_data = &csio->sense_data; >@@ -1250,30 +1725,138 @@ > reqp->sense_info_len = csio->sense_len; > > reqp->ccb = ccb; >- /* >- KASSERT((ccb->ccb_h.flags & CAM_SCATTER_VALID) == 0, >- ("ccb is scatter gather valid\n")); >- */ >- if (csio->dxfer_len != 0) { >- reqp->data_buf.length = csio->dxfer_len; >+ >+ if (0 == csio->dxfer_len) { >+ return 0; >+ } >+ >+ reqp->data_buf.length = csio->dxfer_len; >+ >+ switch (ccb->ccb_h.flags & CAM_DATA_MASK) { >+ case CAM_DATA_VADDR:{ > bytes_to_copy = csio->dxfer_len; > phys_addr = vtophys(csio->data_ptr); >- reqp->data_buf.offset = phys_addr - trunc_page(phys_addr); >+ reqp->data_buf.offset = phys_addr & PAGE_MASK; >+ >+ while (bytes_to_copy != 0) { >+ int bytes, page_offset; >+ phys_addr = >+ vtophys(&csio->data_ptr[reqp->data_buf.length - >+ bytes_to_copy]); >+ pfn = phys_addr >> PAGE_SHIFT; >+ reqp->data_buf.pfn_array[pfn_num] = pfn; >+ page_offset = phys_addr & PAGE_MASK; >+ >+ bytes = min(PAGE_SIZE - page_offset, bytes_to_copy); >+ >+ bytes_to_copy -= bytes; >+ pfn_num++; >+ } >+ break; > } >+ case CAM_DATA_SG:{ >+ int i = 0; >+ int offset = 0; >+ bus_dma_segment_t *storvsc_sglist = >+ (bus_dma_segment_t *)ccb->csio.data_ptr; >+ u_int16_t 
storvsc_sg_count = ccb->csio.sglist_cnt; > >- while (bytes_to_copy != 0) { >- int bytes, page_offset; >- phys_addr = vtophys(&csio->data_ptr[reqp->data_buf.length - >- bytes_to_copy]); >- pfn = phys_addr >> PAGE_SHIFT; >- reqp->data_buf.pfn_array[pfn_num] = pfn; >- page_offset = phys_addr - trunc_page(phys_addr); >+ printf("Storvsc: get SG I/O operation, %d\n", >+ reqp->vstor_packet.u.vm_srb.data_in); > >- bytes = min(PAGE_SIZE - page_offset, bytes_to_copy); >+ if (storvsc_sg_count > HV_MAX_MULTIPAGE_BUFFER_COUNT){ >+ printf("Storvsc: %d segments is too much, " >+ "only support %d segments\n", >+ storvsc_sg_count, HV_MAX_MULTIPAGE_BUFFER_COUNT); >+ return EINVAL; >+ } > >- bytes_to_copy -= bytes; >- pfn_num++; >+ /* check if we need to create bounce buffer */ >+ if (storvsc_check_bounce_buffer_sgl( >+ storvsc_sglist, >+ storvsc_sg_count, >+ ¬_aligned_seg_bits) != -1) { >+ reqp->bounce_sgl = >+ storvsc_create_bounce_buffer(storvsc_sg_count, >+ reqp->vstor_packet.u.vm_srb.data_in); >+ if (NULL == reqp->bounce_sgl) { >+ printf("Storvsc_error: create bounce buffer failed.\n"); >+ return ENOMEM; >+ } >+ >+ reqp->bounce_sgl_count = storvsc_sg_count; >+ reqp->not_aligned_seg_bits = not_aligned_seg_bits; >+ >+ /* >+ * if it is write, we need copy the original data >+ *to bounce buffer >+ */ >+ if (WRITE_TYPE == reqp->vstor_packet.u.vm_srb.data_in) { >+ storvsc_copy_sgl_to_bounce_buf( >+ reqp->bounce_sgl, >+ storvsc_sglist, >+ storvsc_sg_count, >+ reqp->not_aligned_seg_bits); >+ } >+ >+ /* transfer virtual address to physical frame number */ >+ if (reqp->not_aligned_seg_bits & 0x1){ >+ phys_addr = >+ vtophys(reqp->bounce_sgl->sg_segs[0].ss_paddr); >+ }else{ >+ phys_addr = >+ vtophys(storvsc_sglist[0].ds_addr); >+ } >+ reqp->data_buf.offset = phys_addr & PAGE_MASK; >+ >+ pfn = phys_addr >> PAGE_SHIFT; >+ reqp->data_buf.pfn_array[0] = pfn; >+ >+ for (i = 1; i < storvsc_sg_count; i++) { >+ if (reqp->not_aligned_seg_bits & (1 << i)){ >+ phys_addr = >+ 
vtophys(reqp->bounce_sgl->sg_segs[i].ss_paddr); >+ } >+ else{ >+ phys_addr = >+ vtophys(storvsc_sglist[i].ds_addr); >+ } >+ >+ pfn = phys_addr >> PAGE_SHIFT; >+ reqp->data_buf.pfn_array[i] = pfn; >+ } >+ } >+ else { >+ phys_addr = vtophys(storvsc_sglist[0].ds_addr); >+ >+ reqp->data_buf.offset = phys_addr & PAGE_MASK; >+ >+ for (i = 0; i < storvsc_sg_count; i++){ >+ phys_addr = vtophys(storvsc_sglist[i].ds_addr); >+ pfn = phys_addr >> PAGE_SHIFT; >+ reqp->data_buf.pfn_array[i] = pfn; >+ } >+ >+ /* check the last segment cross boundary or not */ >+ offset = phys_addr & PAGE_MASK; >+ if (offset){ >+ phys_addr = >+ vtophys(storvsc_sglist[i-1].ds_addr + >+ PAGE_SIZE - offset); >+ pfn = phys_addr >> PAGE_SHIFT; >+ reqp->data_buf.pfn_array[i] = pfn; >+ } >+ >+ reqp->bounce_sgl_count = 0; >+ } >+ break; > } >+ default: >+ printf("Unknow flags: %d\n", ccb->ccb_h.flags); >+ return EINVAL; >+ } >+ >+ return 0; > } > > /** >@@ -1292,7 +1875,29 @@ > struct ccb_scsiio *csio = &ccb->csio; > struct storvsc_softc *sc = reqp->softc; > struct vmscsi_req *vm_srb = &reqp->vstor_packet.u.vm_srb; >- >+ bus_dma_segment_t *ori_sglist = NULL; >+ int ori_sg_count = 0; >+ >+ /* destroy bounce buffer if it is used */ >+ if (reqp->bounce_sgl_count) { >+ ori_sglist = (bus_dma_segment_t *)ccb->csio.data_ptr; >+ ori_sg_count = ccb->csio.sglist_cnt; >+ >+ /* >+ * If it is READ operation, we should copy back the data >+ * to original SG list. >+ */ >+ if (READ_TYPE == reqp->vstor_packet.u.vm_srb.data_in) { >+ storvsc_copy_from_bounce_buf_to_sgl(ori_sglist, >+ ori_sg_count, >+ reqp->bounce_sgl, >+ reqp->not_aligned_seg_bits); >+ } >+ >+ storvsc_destroy_bounce_buffer(reqp->bounce_sgl); >+ reqp->bounce_sgl_count = 0; >+ } >+ > if (reqp->retries > 0) { > mtx_lock(&sc->hs_lock); > #if HVS_TIMEOUT_TEST >@@ -1310,7 +1915,7 @@ > mtx_unlock(&sc->hs_lock); > } > >- /* >+ /* > * callout_drain() will wait for the timer handler to finish > * if it is running. 
So we don't need any lock to synchronize > * between this routine and the timer handler. >Index: sys/dev/hyperv/storvsc/hv_vstorage.h >=================================================================== >--- sys/dev/hyperv/storvsc/hv_vstorage.h (revision 1) >+++ sys/dev/hyperv/storvsc/hv_vstorage.h (revision 3) >@@ -53,7 +53,7 @@ > * V1 RC > 2008/1/31 2.0 > */ > >-#define VMSTOR_PROTOCOL_VERSION_CURRENT VMSTOR_PROTOCOL_VERSION(2, 0) >+#define VMSTOR_PROTOCOL_VERSION_CURRENT VMSTOR_PROTOCOL_VERSION(5, 1) > > /** > * Packet structure ops describing virtual storage requests. >@@ -69,7 +69,10 @@ > VSTOR_OPERATION_ENDINITIALIZATION = 8, > VSTOR_OPERATION_QUERYPROTOCOLVERSION = 9, > VSTOR_OPERATION_QUERYPROPERTIES = 10, >- VSTOR_OPERATION_MAXIMUM = 10 >+ VSTOR_OPERATION_ENUMERATE_BUS = 11, >+ VSTOR_OPERATION_FCHBA_DATA = 12, >+ VSTOR_OPERATION_CREATE_MULTI_CHANNELS = 13, >+ VSTOR_OPERATION_MAXIMUM = 13 > }; > > >@@ -123,10 +126,12 @@ > uint8_t path_id; > uint8_t target_id; > >+ uint16_t max_channel_cnt; >+ > /** > * Note: port number is only really known on the client side > */ >- uint32_t port; >+ uint16_t port; > uint32_t flags; > uint32_t max_transfer_bytes; > >@@ -193,6 +198,11 @@ > * Used during version negotiations. > */ > struct vmstor_proto_ver version; >+ >+ /** >+ * Number of multichannels to create >+ */ >+ uint16_t multi_channels_cnt; > } u; > > } __packed; >Index: sys/dev/hyperv/utilities/hv_util.c >=================================================================== >--- sys/dev/hyperv/utilities/hv_util.c (revision 1) >+++ sys/dev/hyperv/utilities/hv_util.c (revision 3) >@@ -408,6 +408,15 @@ > } > } > >+ /* >+ * These services are not performance critical and do not need >+ * batched reading. Furthermore, some services such as KVP can >+ * only handle one message from the host at a time. >+ * Turn off batched reading for all util drivers before we open the >+ * channel. 
>+ */ >+ hv_set_channel_read_state(hv_dev->channel, FALSE); >+ > ret = hv_vmbus_channel_open(hv_dev->channel, 4 * PAGE_SIZE, > 4 * PAGE_SIZE, NULL, 0, > service->callback, hv_dev->channel); >Index: sys/dev/hyperv/utilities/hv_kvp.c >=================================================================== >--- sys/dev/hyperv/utilities/hv_kvp.c (revision 1) >+++ sys/dev/hyperv/utilities/hv_kvp.c (revision 3) >@@ -55,6 +55,7 @@ > #include <sys/_null.h> > #include <sys/signal.h> > #include <sys/syslog.h> >+#include <sys/systm.h> > #include <sys/mutex.h> > #include <net/if_arp.h> > >@@ -232,7 +233,7 @@ > */ > if ((icframe_vercnt >= 2) && (negop->icversion_data[1].major == 3)) { > icframe_vercnt = 3; >- if (icmsg_vercnt >= 2) >+ if (icmsg_vercnt > 2) > icmsg_vercnt = 4; > else > icmsg_vercnt = 3; >@@ -734,8 +735,8 @@ > recvlen = 0; > ret = hv_vmbus_channel_recv_packet(channel, kvp_buf, 2 * PAGE_SIZE, > &recvlen, &requestid); >- hv_kvp_log_info("%s: read: context %p, pending_cnt %ju ret =%d, recvlen=%d\n", >- __func__, context, pending_cnt, ret, recvlen); >+ hv_kvp_log_info("%s: read: context %p, pending_cnt %llu ret =%d, recvlen=%d\n", >+ __func__, context, (unsigned long long)pending_cnt, ret, recvlen); > } > } > >@@ -813,9 +814,9 @@ > hv_kvp_dev_destroy(void) > { > >- if (daemon_task != NULL) { >+ if (daemon_task != NULL) { > PROC_LOCK(daemon_task); >- kern_psignal(daemon_task, SIGKILL); >+ kern_psignal(daemon_task, SIGKILL); > PROC_UNLOCK(daemon_task); > } > >Index: sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c >=================================================================== >--- sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c (revision 1) >+++ sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c (revision 3) >@@ -53,7 +53,10 @@ > > #include <machine/stdarg.h> > #include <machine/intr_machdep.h> >+#include <machine/md_var.h> >+#include <machine/segments.h> > #include <sys/pcpu.h> >+#include <x86/apicvar.h> > > #include "hv_vmbus_priv.h" > >@@ -60,15 +63,7 @@ > > #define VMBUS_IRQ 0x5 
> >-static struct intr_event *hv_msg_intr_event; >-static struct intr_event *hv_event_intr_event; >-static void *msg_swintr; >-static void *event_swintr; > static device_t vmbus_devp; >-static void *vmbus_cookiep; >-static int vmbus_rid; >-struct resource *intr_res; >-static int vmbus_irq = VMBUS_IRQ; > static int vmbus_inited; > static hv_setup_args setup_args; /* only CPU 0 supported at this time */ > >@@ -77,7 +72,7 @@ > * the hypervisor. > */ > static void >-vmbus_msg_swintr(void *dummy) >+vmbus_msg_swintr(void *arg) > { > int cpu; > void* page_addr; >@@ -84,7 +79,10 @@ > hv_vmbus_message* msg; > hv_vmbus_message* copied; > >- cpu = PCPU_GET(cpuid); >+ cpu = (int)(long)arg; >+ KASSERT(cpu <= mp_maxid, ("VMBUS: vmbus_msg_swintr: " >+ "cpu out of range!")); >+ > page_addr = hv_vmbus_g_context.syn_ic_msg_page[cpu]; > msg = (hv_vmbus_message*) page_addr + HV_VMBUS_MESSAGE_SINT; > >@@ -130,17 +128,8 @@ > * > * The purpose of this routine is to determine the type of VMBUS protocol > * message to process - an event or a channel message. >- * As this is an interrupt filter routine, the function runs in a very >- * restricted envinronment. From the manpage for bus_setup_intr(9) >- * >- * In this restricted environment, care must be taken to account for all >- * races. A careful analysis of races should be done as well. It is gener- >- * ally cheaper to take an extra interrupt, for example, than to protect >- * variables with spinlocks. Read, modify, write cycles of hardware regis- >- * ters need to be carefully analyzed if other threads are accessing the >- * same registers. 
> */ >-static int >+static inline int > hv_vmbus_isr(void *unused) > { > int cpu; >@@ -149,8 +138,6 @@ > void* page_addr; > > cpu = PCPU_GET(cpuid); >- /* (Temporary limit) */ >- KASSERT(cpu == 0, ("hv_vmbus_isr: Interrupt on CPU other than zero")); > > /* > * The Windows team has advised that we check for events >@@ -162,9 +149,21 @@ > event = (hv_vmbus_synic_event_flags*) > page_addr + HV_VMBUS_MESSAGE_SINT; > >- /* Since we are a child, we only need to check bit 0 */ >- if (synch_test_and_clear_bit(0, &event->flags32[0])) { >- swi_sched(event_swintr, 0); >+ if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) || >+ (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7)) { >+ /* Since we are a child, we only need to check bit 0 */ >+ if (synch_test_and_clear_bit(0, &event->flags32[0])) { >+ swi_sched(hv_vmbus_g_context.event_swintr[cpu], 0); >+ } >+ } else { >+ /* >+ * On host with Win8 or above, we can directly look at >+ * the event page. If bit n is set, we have an interrupt >+ * on the channel with id n. >+ * Directly schedule the event software interrupt on >+ * current cpu. >+ */ >+ swi_sched(hv_vmbus_g_context.event_swintr[cpu], 0); > } > > /* Check if there are actual msgs to be process */ >@@ -172,12 +171,47 @@ > msg = (hv_vmbus_message*) page_addr + HV_VMBUS_MESSAGE_SINT; > > if (msg->header.message_type != HV_MESSAGE_TYPE_NONE) { >- swi_sched(msg_swintr, 0); >+ swi_sched(hv_vmbus_g_context.msg_swintr[cpu], 0); > } > > return FILTER_HANDLED; > } > >+#ifdef HV_DEBUG_INTR >+uint32_t hv_intr_count = 0; >+#endif >+uint32_t hv_vmbus_swintr_event_cpu[MAXCPU]; >+uint32_t hv_vmbus_intr_cpu[MAXCPU]; >+ >+void >+hv_vector_handler(struct trapframe *trap_frame) >+{ >+#ifdef HV_DEBUG_INTR >+ int cpu; >+#endif >+ >+ /* >+ * Disable preemption. >+ */ >+ critical_enter(); >+ >+#ifdef HV_DEBUG_INTR >+ /* >+ * Do a little interrupt counting. 
>+ */ >+ cpu = PCPU_GET(cpuid); >+ hv_vmbus_intr_cpu[cpu]++; >+ hv_intr_count++; >+#endif >+ >+ hv_vmbus_isr(NULL); >+ >+ /* >+ * Enable preemption. >+ */ >+ critical_exit(); >+} >+ > static int > vmbus_read_ivar( > device_t dev, >@@ -316,7 +350,66 @@ > return (BUS_PROBE_NOWILDCARD); > } > >+extern inthand_t IDTVEC(rsvd), IDTVEC(hv_vmbus_callback); >+ > /** >+ * @brief Find a free IDT slot and setup the interrupt handler. >+ */ >+static int >+vmbus_vector_alloc(void) >+{ >+ int vector; >+ uintptr_t func; >+ struct gate_descriptor *ip; >+ >+ /* >+ * Search backwards form the highest IDT vector available for use >+ * as vmbus channel callback vector. We install 'hv_vmbus_callback' >+ * handler at that vector and use it to interrupt vcpus. >+ */ >+ vector = APIC_SPURIOUS_INT; >+ while (--vector >= APIC_IPI_INTS) { >+ ip = &idt[vector]; >+ func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); >+ if (func == (uintptr_t)&IDTVEC(rsvd)) { >+#ifdef __i386__ >+ setidt(vector , &IDTVEC(hv_vmbus_callback), SDT_SYS386IGT, >+ SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); >+#else >+ setidt(vector , IDTVEC(hv_vmbus_callback), SDT_SYSIGT, >+ SEL_KPL, 0); >+#endif >+ >+ return (vector); >+ } >+ } >+ return (0); >+} >+ >+/** >+ * @brief Restore the IDT slot to rsvd. >+ */ >+static void >+vmbus_vector_free(int vector) >+{ >+ uintptr_t func; >+ struct gate_descriptor *ip; >+ >+ if (vector == 0) >+ return; >+ >+ KASSERT(vector >= APIC_IPI_INTS && vector < APIC_SPURIOUS_INT, >+ ("invalid vector %d", vector)); >+ >+ ip = &idt[vector]; >+ func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); >+ KASSERT(func == (uintptr_t)&IDTVEC(hv_vmbus_callback), >+ ("invalid vector %d", vector)); >+ >+ setidt(vector, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); >+} >+ >+/** > * @brief Main vmbus driver initialization routine. 
> * > * Here, we >@@ -331,22 +424,7 @@ > static int > vmbus_bus_init(void) > { >- struct ioapic_intsrc { >- struct intsrc io_intsrc; >- u_int io_irq; >- u_int io_intpin:8; >- u_int io_vector:8; >- u_int io_cpu:8; >- u_int io_activehi:1; >- u_int io_edgetrigger:1; >- u_int io_masked:1; >- int io_bus:4; >- uint32_t io_lowreg; >- }; >- int i, ret; >- unsigned int vector = 0; >- struct intsrc *isrc; >- struct ioapic_intsrc *intpin; >+ int i, j, n, ret; > > if (vmbus_inited) > return (0); >@@ -361,80 +439,100 @@ > return (ret); > } > >- ret = swi_add(&hv_msg_intr_event, "hv_msg", vmbus_msg_swintr, >- NULL, SWI_CLOCK, 0, &msg_swintr); >- >- if (ret) >- goto cleanup; >- > /* >- * Message SW interrupt handler checks a per-CPU page and >- * thus the thread needs to be bound to CPU-0 - which is where >- * all interrupts are processed. >+ * Find a free IDT slot for vmbus callback. > */ >- ret = intr_event_bind(hv_msg_intr_event, 0); >+ hv_vmbus_g_context.hv_cb_vector = vmbus_vector_alloc(); > >- if (ret) >- goto cleanup1; >+ if (hv_vmbus_g_context.hv_cb_vector == 0) { >+ if(bootverbose) >+ printf("Error VMBUS: Cannot find free IDT slot for " >+ "vmbus callback!\n"); >+ goto cleanup; >+ } > >- ret = swi_add(&hv_event_intr_event, "hv_event", hv_vmbus_on_events, >- NULL, SWI_CLOCK, 0, &event_swintr); >+ if(bootverbose) >+ printf("VMBUS: vmbus callback vector %d\n", >+ hv_vmbus_g_context.hv_cb_vector); > >- if (ret) >- goto cleanup1; >+ /* >+ * Notify the hypervisor of our vector. 
>+ */ >+ setup_args.vector = hv_vmbus_g_context.hv_cb_vector; > >- intr_res = bus_alloc_resource(vmbus_devp, >- SYS_RES_IRQ, &vmbus_rid, vmbus_irq, vmbus_irq, 1, RF_ACTIVE); >+ CPU_FOREACH(j) { >+ hv_vmbus_intr_cpu[j] = 0; >+ hv_vmbus_swintr_event_cpu[j] = 0; >+ hv_vmbus_g_context.hv_event_intr_event[j] = NULL; >+ hv_vmbus_g_context.hv_msg_intr_event[j] = NULL; >+ hv_vmbus_g_context.event_swintr[j] = NULL; >+ hv_vmbus_g_context.msg_swintr[j] = NULL; > >- if (intr_res == NULL) { >- ret = ENOMEM; /* XXXKYS: Need a better errno */ >- goto cleanup2; >+ for (i = 0; i < 2; i++) >+ setup_args.page_buffers[2 * j + i] = NULL; > } > > /* >- * Setup interrupt filter handler >+ * Per cpu setup. > */ >- ret = bus_setup_intr(vmbus_devp, intr_res, >- INTR_TYPE_NET | INTR_MPSAFE, hv_vmbus_isr, NULL, >- NULL, &vmbus_cookiep); >+ CPU_FOREACH(j) { >+ /* >+ * Setup software interrupt thread and handler for msg handling. >+ */ >+ ret = swi_add(&hv_vmbus_g_context.hv_msg_intr_event[j], >+ "hv_msg", vmbus_msg_swintr, (void *)(long)j, SWI_CLOCK, 0, >+ &hv_vmbus_g_context.msg_swintr[j]); >+ if (ret) { >+ if(bootverbose) >+ printf("VMBUS: failed to setup msg swi for " >+ "cpu %d\n", j); >+ goto cleanup1; >+ } > >- if (ret != 0) >- goto cleanup3; >+ /* >+ * Bind the swi thread to the cpu. >+ */ >+ ret = intr_event_bind(hv_vmbus_g_context.hv_msg_intr_event[j], >+ j); >+ if (ret) { >+ if(bootverbose) >+ printf("VMBUS: failed to bind msg swi thread " >+ "to cpu %d\n", j); >+ goto cleanup1; >+ } > >- ret = bus_bind_intr(vmbus_devp, intr_res, 0); >- if (ret != 0) >- goto cleanup4; >+ /* >+ * Setup software interrupt thread and handler for >+ * event handling. 
>+ */ >+ ret = swi_add(&hv_vmbus_g_context.hv_event_intr_event[j], >+ "hv_event", hv_vmbus_on_events, (void *)(long)j, >+ SWI_CLOCK, 0, &hv_vmbus_g_context.event_swintr[j]); >+ if (ret) { >+ if(bootverbose) >+ printf("VMBUS: failed to setup event swi for " >+ "cpu %d\n", j); >+ goto cleanup1; >+ } > >- isrc = intr_lookup_source(vmbus_irq); >- if ((isrc == NULL) || (isrc->is_event == NULL)) { >- ret = EINVAL; >- goto cleanup4; >- } >- >- /* vector = isrc->is_event->ie_vector; */ >- intpin = (struct ioapic_intsrc *)isrc; >- vector = intpin->io_vector; >- >- if(bootverbose) >- printf("VMBUS: irq 0x%x vector 0x%x\n", vmbus_irq, vector); >- >- /** >- * Notify the hypervisor of our irq. >- */ >- setup_args.vector = vector; >- for(i = 0; i < 2; i++) { >- setup_args.page_buffers[i] = >+ /* >+ * Prepare the per cpu msg and event pages to be called on each cpu. >+ */ >+ for(i = 0; i < 2; i++) { >+ setup_args.page_buffers[2 * j + i] = > malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT | M_ZERO); >- if (setup_args.page_buffers[i] == NULL) { >- KASSERT(setup_args.page_buffers[i] != NULL, >+ if (setup_args.page_buffers[2 * j + i] == NULL) { >+ KASSERT(setup_args.page_buffers[2 * j + i] != NULL, > ("Error VMBUS: malloc failed!")); >- if (i > 0) >- free(setup_args.page_buffers[0], M_DEVBUF); >- goto cleanup4; >+ goto cleanup1; >+ } > } > } > >- /* only CPU #0 supported at this time */ >+ if (bootverbose) >+ printf("VMBUS: Calling smp_rendezvous, smp_started = %d\n", >+ smp_started); >+ > smp_rendezvous(NULL, hv_vmbus_synic_init, NULL, &setup_args); > > /* >@@ -443,27 +541,33 @@ > ret = hv_vmbus_connect(); > > if (ret != 0) >- goto cleanup4; >+ goto cleanup1; > > hv_vmbus_request_channel_offers(); > return (ret); > >- cleanup4: >+ cleanup1: >+ /* >+ * Free pages alloc'ed >+ */ >+ for (n = 0; n < 2 * MAXCPU; n++) >+ if (setup_args.page_buffers[n] != NULL) >+ free(setup_args.page_buffers[n], M_DEVBUF); > > /* >- * remove swi, bus and intr resource >+ * remove swi and vmbus callback vector; > */ 
>- bus_teardown_intr(vmbus_devp, intr_res, vmbus_cookiep); >+ CPU_FOREACH(j) { >+ if (hv_vmbus_g_context.msg_swintr[j] != NULL) >+ swi_remove(hv_vmbus_g_context.msg_swintr[j]); >+ if (hv_vmbus_g_context.event_swintr[j] != NULL) >+ swi_remove(hv_vmbus_g_context.event_swintr[j]); >+ hv_vmbus_g_context.hv_msg_intr_event[j] = NULL; >+ hv_vmbus_g_context.hv_event_intr_event[j] = NULL; >+ } > >- cleanup3: >- bus_release_resource(vmbus_devp, SYS_RES_IRQ, vmbus_rid, intr_res); >+ vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector); > >- cleanup2: >- swi_remove(event_swintr); >- >- cleanup1: >- swi_remove(msg_swintr); >- > cleanup: > hv_vmbus_cleanup(); > >@@ -515,7 +619,7 @@ > > smp_rendezvous(NULL, hv_vmbus_synic_cleanup, NULL, NULL); > >- for(i = 0; i < 2; i++) { >+ for(i = 0; i < 2 * MAXCPU; i++) { > if (setup_args.page_buffers[i] != 0) > free(setup_args.page_buffers[i], M_DEVBUF); > } >@@ -522,14 +626,18 @@ > > hv_vmbus_cleanup(); > >- /* remove swi, bus and intr resource */ >- bus_teardown_intr(vmbus_devp, intr_res, vmbus_cookiep); >+ /* remove swi */ >+ CPU_FOREACH(i) { >+ if (hv_vmbus_g_context.msg_swintr[i] != NULL) >+ swi_remove(hv_vmbus_g_context.msg_swintr[i]); >+ if (hv_vmbus_g_context.event_swintr[i] != NULL) >+ swi_remove(hv_vmbus_g_context.event_swintr[i]); >+ hv_vmbus_g_context.hv_msg_intr_event[i] = NULL; >+ hv_vmbus_g_context.hv_event_intr_event[i] = NULL; >+ } > >- bus_release_resource(vmbus_devp, SYS_RES_IRQ, vmbus_rid, intr_res); >+ vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector); > >- swi_remove(msg_swintr); >- swi_remove(event_swintr); >- > return; > } > >@@ -603,6 +711,6 @@ > DRIVER_MODULE(vmbus, nexus, vmbus_driver, vmbus_devclass, vmbus_modevent, 0); > MODULE_VERSION(vmbus,1); > >-/* TODO: We want to be earlier than SI_SUB_VFS */ >-SYSINIT(vmb_init, SI_SUB_VFS, SI_ORDER_MIDDLE, vmbus_init, NULL); >+/* We want to be started after SMP is initialized */ >+SYSINIT(vmb_init, SI_SUB_SMP + 1, SI_ORDER_FIRST, vmbus_init, NULL); > >Index: 
sys/dev/hyperv/vmbus/hv_vmbus_priv.h >=================================================================== >--- sys/dev/hyperv/vmbus/hv_vmbus_priv.h (revision 1) >+++ sys/dev/hyperv/vmbus/hv_vmbus_priv.h (revision 3) >@@ -181,49 +181,30 @@ > > #define HV_HYPERCALL_PARAM_ALIGN sizeof(uint64_t) > >-/* >- * Connection identifier type >- */ >-typedef union { >- uint32_t as_uint32_t; >- struct { >- uint32_t id:24; >- uint32_t reserved:8; >- } u; >- >-} __packed hv_vmbus_connection_id; >- >-/* >- * Definition of the hv_vmbus_signal_event hypercall input structure >- */ > typedef struct { >- hv_vmbus_connection_id connection_id; >- uint16_t flag_number; >- uint16_t rsvd_z; >-} __packed hv_vmbus_input_signal_event; >- >-typedef struct { >- uint64_t align8; >- hv_vmbus_input_signal_event event; >-} __packed hv_vmbus_input_signal_event_buffer; >- >-typedef struct { > uint64_t guest_id; > void* hypercall_page; > hv_bool_uint8_t syn_ic_initialized; >+ >+ hv_vmbus_handle syn_ic_msg_page[MAXCPU]; >+ hv_vmbus_handle syn_ic_event_page[MAXCPU]; > /* >- * This is used as an input param to HV_CALL_SIGNAL_EVENT hypercall. >- * The input param is immutable in our usage and >- * must be dynamic mem (vs stack or global). >+ * For FreeBSD cpuid to Hyper-V vcpuid mapping. > */ >- hv_vmbus_input_signal_event_buffer *signal_event_buffer; >+ uint32_t hv_vcpu_index[MAXCPU]; > /* >- * 8-bytes aligned of the buffer above >+ * Each cpu has its own software interrupt handler for channel >+ * event and msg handling. > */ >- hv_vmbus_input_signal_event *signal_event_param; >- >- hv_vmbus_handle syn_ic_msg_page[MAXCPU]; >- hv_vmbus_handle syn_ic_event_page[MAXCPU]; >+ struct intr_event *hv_event_intr_event[MAXCPU]; >+ struct intr_event *hv_msg_intr_event[MAXCPU]; >+ void *event_swintr[MAXCPU]; >+ void *msg_swintr[MAXCPU]; >+ /* >+ * Host use this vector to intrrupt guest for vmbus channel >+ * event and msg. 
>+ */ >+ unsigned int hv_cb_vector; > } hv_vmbus_context; > > /* >@@ -368,7 +349,8 @@ > TAILQ_HEAD(, hv_vmbus_channel_msg_info) channel_msg_anchor; > struct mtx channel_msg_lock; > /** >- * List of channels >+ * List of primary channels. Sub channels will be linked >+ * under their primary channel. > */ > TAILQ_HEAD(, hv_vmbus_channel) channel_anchor; > struct mtx channel_lock; >@@ -560,6 +542,8 @@ > uint32_t flags32[HV_EVENT_FLAGS_DWORD_COUNT]; > } hv_vmbus_synic_event_flags; > >+/* MSR used to provide vcpu index */ >+#define HV_X64_MSR_VP_INDEX (0x40000002) > > /* > * Define synthetic interrupt controller model specific registers >@@ -618,7 +602,8 @@ > int hv_ring_buffer_write( > hv_vmbus_ring_buffer_info *ring_info, > hv_vmbus_sg_buffer_list sg_buffers[], >- uint32_t sg_buff_count); >+ uint32_t sg_buff_count, >+ boolean_t *need_sig); > > int hv_ring_buffer_peek( > hv_vmbus_ring_buffer_info *ring_info, >@@ -638,6 +623,12 @@ > hv_vmbus_ring_buffer_info *ring_info, > char *prefix); > >+void hv_ring_buffer_read_begin( >+ hv_vmbus_ring_buffer_info *ring_info); >+ >+uint32_t hv_ring_buffer_read_end( >+ hv_vmbus_ring_buffer_info *ring_info); >+ > hv_vmbus_channel* hv_vmbus_allocate_channel(void); > void hv_vmbus_free_vmbus_channel(hv_vmbus_channel *channel); > void hv_vmbus_on_channel_message(void *context); >@@ -652,7 +643,7 @@ > void *payload, > size_t payload_size); > >-uint16_t hv_vmbus_signal_event(void); >+uint16_t hv_vmbus_signal_event(void *con_id); > void hv_vmbus_synic_init(void *irq_arg); > void hv_vmbus_synic_cleanup(void *arg); > int hv_vmbus_query_hypervisor_presence(void); >@@ -674,7 +665,7 @@ > int hv_vmbus_connect(void); > int hv_vmbus_disconnect(void); > int hv_vmbus_post_message(void *buffer, size_t buf_size); >-int hv_vmbus_set_event(uint32_t child_rel_id); >+int hv_vmbus_set_event(hv_vmbus_channel *channel); > void hv_vmbus_on_events(void *); > > >@@ -718,7 +709,7 @@ > > typedef struct { > unsigned int vector; >- void *page_buffers[2]; >+ void 
*page_buffers[2 * MAXCPU]; > } hv_setup_args; > > #endif /* __HYPERV_PRIV_H__ */ >Index: sys/dev/hyperv/vmbus/hv_channel.c >=================================================================== >--- sys/dev/hyperv/vmbus/hv_channel.c (revision 1) >+++ sys/dev/hyperv/vmbus/hv_channel.c (revision 3) >@@ -75,7 +75,7 @@ > (uint32_t *)&monitor_page-> > trigger_group[channel->monitor_group].u.pending); > } else { >- hv_vmbus_set_event(channel->offer_msg.child_rel_id); >+ hv_vmbus_set_event(channel); > } > > } >@@ -99,6 +99,18 @@ > hv_vmbus_channel_open_channel* open_msg; > hv_vmbus_channel_msg_info* open_info; > >+ mtx_lock_spin(&new_channel->sc_lock); >+ if (new_channel->state == HV_CHANNEL_OPEN_STATE) { >+ new_channel->state = HV_CHANNEL_OPENING_STATE; >+ } else { >+ mtx_unlock_spin(&new_channel->sc_lock); >+ if(bootverbose) >+ printf("VMBUS: Trying to open channel <%p> which in " >+ "%d state.\n", new_channel, new_channel->state); >+ return (EINVAL); >+ } >+ mtx_unlock_spin(&new_channel->sc_lock); >+ > new_channel->on_channel_callback = pfn_on_channel_callback; > new_channel->channel_callback_context = context; > >@@ -162,7 +174,7 @@ > new_channel->ring_buffer_gpadl_handle; > open_msg->downstream_ring_buffer_page_offset = send_ring_buffer_size > >> PAGE_SHIFT; >- open_msg->server_context_area_gpadl_handle = 0; >+ open_msg->target_vcpu = new_channel->target_vcpu; > > if (user_data_len) > memcpy(open_msg->user_data, user_data, user_data_len); >@@ -182,10 +194,14 @@ > > ret = sema_timedwait(&open_info->wait_sema, 500); /* KYS 5 seconds */ > >- if (ret) >+ if (ret) { >+ if(bootverbose) >+ printf("VMBUS: channel <%p> open timeout.\n", new_channel); > goto cleanup; >+ } > > if (open_info->response.open_result.status == 0) { >+ new_channel->state = HV_CHANNEL_OPENED_STATE; > if(bootverbose) > printf("VMBUS: channel <%p> open success.\n", new_channel); > } else { >@@ -497,16 +513,20 @@ > return (ret); > } > >-/** >- * @brief Close the specified channel >- */ >-void 
>-hv_vmbus_channel_close(hv_vmbus_channel *channel) >+static void >+hv_vmbus_channel_close_internal(hv_vmbus_channel *channel) > { > int ret = 0; > hv_vmbus_channel_close_channel* msg; > hv_vmbus_channel_msg_info* info; > >+ channel->state = HV_CHANNEL_OPEN_STATE; >+ channel->sc_creation_callback = NULL; >+ >+ /* >+ * Grab the lock to prevent race condition when a packet received >+ * and unloading driver is in the process. >+ */ > mtx_lock(&channel->inbound_lock); > channel->on_channel_callback = NULL; > mtx_unlock(&channel->inbound_lock); >@@ -545,23 +565,37 @@ > M_DEVBUF); > > free(info, M_DEVBUF); >+} > >+/** >+ * @brief Close the specified channel >+ */ >+void >+hv_vmbus_channel_close(hv_vmbus_channel *channel) >+{ >+ hv_vmbus_channel* sub_channel; >+ >+ if (channel->primary_channel != NULL) { >+ /* >+ * We only close multi-channels when the primary is >+ * closed. >+ */ >+ return; >+ } >+ > /* >- * If we are closing the channel during an error path in >- * opening the channel, don't free the channel >- * since the caller will free the channel >+ * Close all multi-channels first. > */ >- if (channel->state == HV_CHANNEL_OPEN_STATE) { >- mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); >- TAILQ_REMOVE( >- &hv_vmbus_g_connection.channel_anchor, >- channel, >- list_entry); >- mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); >- >- hv_vmbus_free_vmbus_channel(channel); >+ TAILQ_FOREACH(sub_channel, &channel->sc_list_anchor, >+ sc_list_entry) { >+ if (sub_channel->state != HV_CHANNEL_OPENED_STATE) >+ continue; >+ hv_vmbus_channel_close_internal(sub_channel); > } >- >+ /* >+ * Then close the primary channel. 
>+ */ >+ hv_vmbus_channel_close_internal(channel); > } > > /** >@@ -581,6 +615,7 @@ > uint32_t packet_len; > uint64_t aligned_data; > uint32_t packet_len_aligned; >+ boolean_t need_sig; > hv_vmbus_sg_buffer_list buffer_list[3]; > > packet_len = sizeof(hv_vm_packet_descriptor) + buffer_len; >@@ -604,12 +639,11 @@ > buffer_list[2].data = &aligned_data; > buffer_list[2].length = packet_len_aligned - packet_len; > >- ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3); >+ ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3, >+ &need_sig); > > /* TODO: We should determine if this is optional */ >- if (ret == 0 >- && !hv_vmbus_get_ring_buffer_interrupt_mask( >- &channel->outbound)) { >+ if (ret == 0 && need_sig) { > vmbus_channel_set_event(channel); > } > >@@ -632,6 +666,7 @@ > > int ret = 0; > int i = 0; >+ boolean_t need_sig; > uint32_t packet_len; > uint32_t packetLen_aligned; > hv_vmbus_sg_buffer_list buffer_list[3]; >@@ -675,11 +710,11 @@ > buffer_list[2].data = &alignedData; > buffer_list[2].length = packetLen_aligned - packet_len; > >- ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3); >+ ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3, >+ &need_sig); > > /* TODO: We should determine if this is optional */ >- if (ret == 0 && >- !hv_vmbus_get_ring_buffer_interrupt_mask(&channel->outbound)) { >+ if (ret == 0 && need_sig) { > vmbus_channel_set_event(channel); > } > >@@ -700,6 +735,7 @@ > > int ret = 0; > uint32_t desc_size; >+ boolean_t need_sig; > uint32_t packet_len; > uint32_t packet_len_aligned; > uint32_t pfn_count; >@@ -750,11 +786,11 @@ > buffer_list[2].data = &aligned_data; > buffer_list[2].length = packet_len_aligned - packet_len; > >- ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3); >+ ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3, >+ &need_sig); > > /* TODO: We should determine if this is optional */ >- if (ret == 0 && >- 
!hv_vmbus_get_ring_buffer_interrupt_mask(&channel->outbound)) { >+ if (ret == 0 && need_sig) { > vmbus_channel_set_event(channel); > } > >Index: sys/dev/hyperv/vmbus/hv_ring_buffer.c >=================================================================== >--- sys/dev/hyperv/vmbus/hv_ring_buffer.c (revision 1) >+++ sys/dev/hyperv/vmbus/hv_ring_buffer.c (revision 3) >@@ -144,6 +144,69 @@ > return (uint64_t) ring_info->ring_buffer->write_index << 32; > } > >+void >+hv_ring_buffer_read_begin( >+ hv_vmbus_ring_buffer_info* ring_info) >+{ >+ ring_info->ring_buffer->interrupt_mask = 1; >+ mb(); >+} >+ >+uint32_t >+hv_ring_buffer_read_end( >+ hv_vmbus_ring_buffer_info* ring_info) >+{ >+ uint32_t read, write; >+ >+ ring_info->ring_buffer->interrupt_mask = 0; >+ mb(); >+ >+ /* >+ * Now check to see if the ring buffer is still empty. >+ * If it is not, we raced and we need to process new >+ * incoming messages. >+ */ >+ get_ring_buffer_avail_bytes(ring_info, &read, &write); >+ >+ return (read); >+} >+ >+/* >+ * When we write to the ring buffer, check if the host needs to >+ * be signaled. Here is the details of this protocol: >+ * >+ * 1. The host guarantees that while it is draining the >+ * ring buffer, it will set the interrupt_mask to >+ * indicate it does not need to be interrupted when >+ * new data is placed. >+ * >+ * 2. The host guarantees that it will completely drain >+ * the ring buffer before exiting the read loop. Further, >+ * once the ring buffer is empty, it will clear the >+ * interrupt_mask and re-check to see if new data has >+ * arrived. >+ */ >+static boolean_t >+hv_ring_buffer_needsig_on_write( >+ uint32_t old_write_location, >+ hv_vmbus_ring_buffer_info* rbi) >+{ >+ mb(); >+ if (rbi->ring_buffer->interrupt_mask) >+ return (FALSE); >+ >+ /* Read memory barrier */ >+ rmb(); >+ /* >+ * This is the only case we need to signal when the >+ * ring transitions from being empty to non-empty. 
>+ */ >+ if (old_write_location == rbi->ring_buffer->read_index) >+ return (TRUE); >+ >+ return (FALSE); >+} >+ > static uint32_t copy_to_ring_buffer( > hv_vmbus_ring_buffer_info* ring_info, > uint32_t start_write_offset, >@@ -204,11 +267,13 @@ > hv_ring_buffer_write( > hv_vmbus_ring_buffer_info* out_ring_info, > hv_vmbus_sg_buffer_list sg_buffers[], >- uint32_t sg_buffer_count) >+ uint32_t sg_buffer_count, >+ boolean_t *need_sig) > { > int i = 0; > uint32_t byte_avail_to_write; > uint32_t byte_avail_to_read; >+ uint32_t old_write_location; > uint32_t total_bytes_to_write = 0; > > volatile uint32_t next_write_location; >@@ -242,6 +307,8 @@ > */ > next_write_location = get_next_write_location(out_ring_info); > >+ old_write_location = next_write_location; >+ > for (i = 0; i < sg_buffer_count; i++) { > next_write_location = copy_to_ring_buffer(out_ring_info, > next_write_location, (char *) sg_buffers[i].data, >@@ -258,9 +325,9 @@ > (char *) &prev_indices, sizeof(uint64_t)); > > /* >- * Make sure we flush all writes before updating the writeIndex >+ * Full memory barrier before updating the write index. 
> */ >- wmb(); >+ mb(); > > /* > * Now, update the write location > */ >@@ -269,6 +336,9 @@ > > mtx_unlock_spin(&out_ring_info->ring_lock); > >+ *need_sig = hv_ring_buffer_needsig_on_write(old_write_location, >+ out_ring_info); >+ > return (0); > } > >Index: sys/dev/hyperv/vmbus/hv_channel_mgmt.c >=================================================================== >--- sys/dev/hyperv/vmbus/hv_channel_mgmt.c (revision 1) >+++ sys/dev/hyperv/vmbus/hv_channel_mgmt.c (revision 3) >@@ -50,6 +50,7 @@ > static void vmbus_channel_on_offers_delivered(hv_vmbus_channel_msg_header* hdr); > static void vmbus_channel_on_version_response(hv_vmbus_channel_msg_header* hdr); > static void vmbus_channel_process_offer(void *context); >+struct hv_vmbus_channel* vmbus_select_outgoing_channel(struct hv_vmbus_channel *primary); > > /** > * Channel message dispatch table >@@ -233,7 +234,10 @@ > return (NULL); > > mtx_init(&channel->inbound_lock, "channel inbound", NULL, MTX_DEF); >+ mtx_init(&channel->sc_lock, "vmbus multi channel", NULL, MTX_SPIN); > >+ TAILQ_INIT(&channel->sc_list_anchor); >+ > channel->control_work_queue = hv_work_queue_create("control"); > > if (channel->control_work_queue == NULL) { >@@ -262,6 +266,7 @@ > void > hv_vmbus_free_vmbus_channel(hv_vmbus_channel* channel) > { >+ mtx_destroy(&channel->sc_lock); > mtx_destroy(&channel->inbound_lock); > /* > * We have to release the channel's workqueue/thread in >@@ -279,10 +284,10 @@ > static void > vmbus_channel_process_offer(void *context) > { >- int ret; > hv_vmbus_channel* new_channel; > boolean_t f_new; > hv_vmbus_channel* channel; >+ int ret; > > new_channel = (hv_vmbus_channel*) context; > f_new = TRUE; >@@ -296,33 +301,71 @@ > TAILQ_FOREACH(channel, &hv_vmbus_g_connection.channel_anchor, > list_entry) > { >- if (!memcmp( >- &channel->offer_msg.offer.interface_type, >- &new_channel->offer_msg.offer.interface_type, >- sizeof(hv_guid)) >- && !memcmp( >- &channel->offer_msg.offer.interface_instance, >+ if (!memcmp( 
&channel->offer_msg.offer.interface_type, >+ &new_channel->offer_msg.offer.interface_type, >+ sizeof(hv_guid)) && >+ !memcmp(&channel->offer_msg.offer.interface_instance, > &new_channel->offer_msg.offer.interface_instance, > sizeof(hv_guid))) { >- f_new = FALSE; >- break; >- } >+ f_new = FALSE; >+ break; >+ } > } > > if (f_new) { >- /* Insert at tail */ >- TAILQ_INSERT_TAIL( >- &hv_vmbus_g_connection.channel_anchor, >- new_channel, >- list_entry); >+ /* Insert at tail */ >+ TAILQ_INSERT_TAIL( >+ &hv_vmbus_g_connection.channel_anchor, >+ new_channel, >+ list_entry); > } > mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); > >+ /*XXX add new channel to percpu_list */ >+ > if (!f_new) { >+ /* >+ * Check if this is a sub channel. >+ */ >+ if (new_channel->offer_msg.offer.sub_channel_index != 0) { >+ /* >+ * It is a sub channel offer, process it. >+ */ >+ new_channel->primary_channel = channel; >+ mtx_lock_spin(&channel->sc_lock); >+ TAILQ_INSERT_TAIL( >+ &channel->sc_list_anchor, >+ new_channel, >+ sc_list_entry); >+ mtx_unlock_spin(&channel->sc_lock); >+ >+ /* Insert new channel into channel_anchor. 
*/ >+ printf("Storvsc get multi-channel offer, rel=%u.\n", >+ new_channel->offer_msg.child_rel_id); >+ mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); >+ TAILQ_INSERT_TAIL(&hv_vmbus_g_connection.channel_anchor, >+ new_channel, list_entry); >+ mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); >+ >+ if(bootverbose) >+ printf("VMBUS: new multi-channel offer <%p>.\n", >+ new_channel); >+ >+ /*XXX add it to percpu_list */ >+ >+ new_channel->state = HV_CHANNEL_OPEN_STATE; >+ if (channel->sc_creation_callback != NULL) { >+ channel->sc_creation_callback(new_channel); >+ } >+ return; >+ } >+ > hv_vmbus_free_vmbus_channel(new_channel); > return; > } > >+ new_channel->state = HV_CHANNEL_OPEN_STATE; >+ > /* > * Start the process of binding this offer to the driver > * (We need to set the device field before calling >@@ -333,13 +376,6 @@ > new_channel->offer_msg.offer.interface_instance, new_channel); > > /* >- * TODO - the HV_CHANNEL_OPEN_STATE flag should not be set below >- * but in the "open" channel request. The ret != 0 logic below >- * doesn't take into account that a channel >- * may have been opened successfully >- */ >- >- /* > * Add the new device to the bus. This will kick off device-driver > * binding which eventually invokes the device driver's AddDevice() > * method. 
>@@ -346,22 +382,80 @@ > */ > ret = hv_vmbus_child_device_register(new_channel->device); > if (ret != 0) { >- mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); >- TAILQ_REMOVE( >- &hv_vmbus_g_connection.channel_anchor, >- new_channel, >- list_entry); >- mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); >- hv_vmbus_free_vmbus_channel(new_channel); >- } else { >- /* >- * This state is used to indicate a successful open >- * so that when we do close the channel normally, >- * we can clean up properly >- */ >- new_channel->state = HV_CHANNEL_OPEN_STATE; >+ mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); >+ TAILQ_REMOVE( >+ &hv_vmbus_g_connection.channel_anchor, >+ new_channel, >+ list_entry); >+ mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); >+ hv_vmbus_free_vmbus_channel(new_channel); >+ } >+} > >+/** >+ * Array of device guids that are performance critical. We try to distribute >+ * the interrupt load for these devices across all online cpus. >+ */ >+static const hv_guid high_perf_devices[] = { >+ {HV_NIC_GUID, }, >+ {HV_IDE_GUID, }, >+ {HV_SCSI_GUID, }, >+}; >+ >+enum { >+ PERF_CHN_NIC = 0, >+ PERF_CHN_IDE, >+ PERF_CHN_SCSI, >+ MAX_PERF_CHN, >+}; >+ >+/* >+ * We use this static number to distribute the channel interrupt load. >+ */ >+static uint32_t next_vcpu; >+ >+/** >+ * Starting with Win8, we can statically distribute the incoming >+ * channel interrupt load by binding a channel to VCPU. We >+ * implement here a simple round robin scheme for distributing >+ * the interrupt load. >+ * We will bind channels that are not performance critical to cpu 0 and >+ * performance critical channels (IDE, SCSI and Network) will be uniformly >+ * distributed across all available CPUs. 
>+ */ >+static void >+vmbus_channel_select_cpu(hv_vmbus_channel *channel, hv_guid *guid) >+{ >+ uint32_t current_cpu; >+ int i; >+ boolean_t is_perf_channel = FALSE; >+ >+ for (i = PERF_CHN_NIC; i < MAX_PERF_CHN; i++) { >+ if (!memcmp(guid->data, high_perf_devices[i].data, >+ sizeof(hv_guid))) { >+ is_perf_channel = TRUE; >+ break; >+ } > } >+ >+ if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) || >+ (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7) || >+ (!is_perf_channel)) { >+ /* Host's view of guest cpu */ >+ channel->target_vcpu = 0; >+ /* Guest's own view of cpu */ >+ channel->target_cpu = 0; >+ return; >+ } >+ /* mp_ncpus should have the number of cpus currently online */ >+ current_cpu = (++next_vcpu % mp_ncpus); >+ channel->target_cpu = current_cpu; >+ channel->target_vcpu = >+ hv_vmbus_g_context.hv_vcpu_index[current_cpu]; >+ if (bootverbose) >+ printf("VMBUS: Total online cpus %d, assign perf channel %d " >+ "to vcpu %d, cpu %d\n", mp_ncpus, i, channel->target_vcpu, >+ current_cpu); > } > > /** >@@ -391,6 +485,38 @@ > if (new_channel == NULL) > return; > >+ /* >+ * By default we setup state to enable batched >+ * reading. A specific service can choose to >+ * disable this prior to opening the channel. 
>+ */ >+ new_channel->batched_reading = TRUE; >+ >+ new_channel->signal_event_param = >+ (hv_vmbus_input_signal_event *) >+ (HV_ALIGN_UP((unsigned long) >+ &new_channel->signal_event_buffer, >+ HV_HYPERCALL_PARAM_ALIGN)); >+ >+ new_channel->signal_event_param->connection_id.as_uint32_t = 0; >+ new_channel->signal_event_param->connection_id.u.id = >+ HV_VMBUS_EVENT_CONNECTION_ID; >+ new_channel->signal_event_param->flag_number = 0; >+ new_channel->signal_event_param->rsvd_z = 0; >+ >+ if (hv_vmbus_protocal_version != HV_VMBUS_VERSION_WS2008) { >+ new_channel->is_dedicated_interrupt = >+ (offer->is_dedicated_interrupt != 0); >+ new_channel->signal_event_param->connection_id.u.id = >+ offer->connection_id; >+ } >+ >+ /* >+ * Bind the channel to a chosen cpu. >+ */ >+ vmbus_channel_select_cpu(new_channel, >+ &offer->offer.interface_type); >+ > memcpy(&new_channel->offer_msg, offer, > sizeof(hv_vmbus_channel_offer_channel)); > new_channel->monitor_group = (uint8_t) offer->monitor_id / 32; >@@ -678,3 +804,60 @@ > } > mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); > } >+ >+/** >+ * @brief Select the best outgoing channel >+ * >+ * The channel whose vcpu binding is closest to the current vcpu will >+ * be selected. 
>+ * If no multi-channel, always select primary channel >+ * >+ * @param primary - primary channel >+ */ >+struct hv_vmbus_channel * >+vmbus_select_outgoing_channel(struct hv_vmbus_channel *primary) >+{ >+ hv_vmbus_channel *new_channel = NULL; >+ hv_vmbus_channel *outgoing_channel = primary; >+ int old_cpu_distance = 0; >+ int new_cpu_distance = 0; >+ int cur_vcpu = 0; >+ int smp_pro_id = PCPU_GET(cpuid); >+ >+ if (TAILQ_EMPTY(&primary->sc_list_anchor)) { >+ return outgoing_channel; >+ } >+ >+ if (smp_pro_id >= MAXCPU) { >+ return outgoing_channel; >+ } >+ >+ cur_vcpu = hv_vmbus_g_context.hv_vcpu_index[smp_pro_id]; >+ >+ TAILQ_FOREACH(new_channel, &primary->sc_list_anchor, sc_list_entry) { >+ if (new_channel->state != HV_CHANNEL_OPENED_STATE){ >+ continue; >+ } >+ >+ if (new_channel->target_vcpu == cur_vcpu){ >+ return new_channel; >+ } >+ >+ old_cpu_distance = ((outgoing_channel->target_vcpu > cur_vcpu) ? >+ (outgoing_channel->target_vcpu - cur_vcpu) : >+ (cur_vcpu - outgoing_channel->target_vcpu)); >+ >+ new_cpu_distance = ((new_channel->target_vcpu > cur_vcpu) ? 
>+ (new_channel->target_vcpu - cur_vcpu) : >+ (cur_vcpu - new_channel->target_vcpu)); >+ >+ if (old_cpu_distance < new_cpu_distance) { >+ continue; >+ } >+ >+ outgoing_channel = new_channel; >+ } >+ >+ return outgoing_channel; >+} >+ >Index: sys/dev/hyperv/vmbus/hv_hv.c >=================================================================== >--- sys/dev/hyperv/vmbus/hv_hv.c (revision 1) >+++ sys/dev/hyperv/vmbus/hv_hv.c (revision 3) >@@ -67,8 +67,6 @@ > hv_vmbus_context hv_vmbus_g_context = { > .syn_ic_initialized = FALSE, > .hypercall_page = NULL, >- .signal_event_param = NULL, >- .signal_event_buffer = NULL, > }; > > static struct timecounter hv_timecounter = { >@@ -256,28 +254,6 @@ > > hv_vmbus_g_context.hypercall_page = virt_addr; > >- /* >- * Setup the global signal event param for the signal event hypercall >- */ >- hv_vmbus_g_context.signal_event_buffer = >- malloc(sizeof(hv_vmbus_input_signal_event_buffer), M_DEVBUF, >- M_ZERO | M_NOWAIT); >- KASSERT(hv_vmbus_g_context.signal_event_buffer != NULL, >- ("Error VMBUS: Failed to allocate signal_event_buffer\n")); >- if (hv_vmbus_g_context.signal_event_buffer == NULL) >- goto cleanup; >- >- hv_vmbus_g_context.signal_event_param = >- (hv_vmbus_input_signal_event*) >- (HV_ALIGN_UP((unsigned long) >- hv_vmbus_g_context.signal_event_buffer, >- HV_HYPERCALL_PARAM_ALIGN)); >- hv_vmbus_g_context.signal_event_param->connection_id.as_uint32_t = 0; >- hv_vmbus_g_context.signal_event_param->connection_id.u.id = >- HV_VMBUS_EVENT_CONNECTION_ID; >- hv_vmbus_g_context.signal_event_param->flag_number = 0; >- hv_vmbus_g_context.signal_event_param->rsvd_z = 0; >- > tc_init(&hv_timecounter); /* register virtual timecount */ > > return (0); >@@ -303,12 +279,6 @@ > { > hv_vmbus_x64_msr_hypercall_contents hypercall_msr; > >- if (hv_vmbus_g_context.signal_event_buffer != NULL) { >- free(hv_vmbus_g_context.signal_event_buffer, M_DEVBUF); >- hv_vmbus_g_context.signal_event_buffer = NULL; >- hv_vmbus_g_context.signal_event_param = NULL; >- 
} >- > if (hv_vmbus_g_context.guest_id == HV_FREEBSD_GUEST_ID) { > if (hv_vmbus_g_context.hypercall_page != NULL) { > hypercall_msr.as_uint64_t = 0; >@@ -370,13 +340,13 @@ > * event IPC. (This involves a hypercall.) > */ > hv_vmbus_status >-hv_vmbus_signal_event() >+hv_vmbus_signal_event(void *con_id) > { > hv_vmbus_status status; > > status = hv_vmbus_do_hypercall( > HV_CALL_SIGNAL_EVENT, >- hv_vmbus_g_context.signal_event_param, >+ con_id, > 0) & 0xFFFF; > > return (status); >@@ -390,6 +360,7 @@ > > { > int cpu; >+ uint64_t hv_vcpu_index; > hv_vmbus_synic_simp simp; > hv_vmbus_synic_siefp siefp; > hv_vmbus_synic_scontrol sctrl; >@@ -403,23 +374,14 @@ > return; > > /* >- * KYS: Looks like we can only initialize on cpu0; don't we support >- * SMP guests? >- * >- * TODO: Need to add SMP support for FreeBSD V9 >- */ >- >- if (cpu != 0) >- return; >- >- /* > * TODO: Check the version > */ > version = rdmsr(HV_X64_MSR_SVERSION); >- > >- hv_vmbus_g_context.syn_ic_msg_page[cpu] = setup_args->page_buffers[0]; >- hv_vmbus_g_context.syn_ic_event_page[cpu] = setup_args->page_buffers[1]; >+ hv_vmbus_g_context.syn_ic_msg_page[cpu] = >+ setup_args->page_buffers[2 * cpu]; >+ hv_vmbus_g_context.syn_ic_event_page[cpu] = >+ setup_args->page_buffers[2 * cpu + 1]; > > /* > * Setup the Synic's message page >@@ -443,9 +405,10 @@ > wrmsr(HV_X64_MSR_SIEFP, siefp.as_uint64_t); > > /*HV_SHARED_SINT_IDT_VECTOR + 0x20; */ >+ shared_sint.as_uint64_t = 0; > shared_sint.u.vector = setup_args->vector; > shared_sint.u.masked = FALSE; >- shared_sint.u.auto_eoi = FALSE; >+ shared_sint.u.auto_eoi = TRUE; > > wrmsr(HV_X64_MSR_SINT0 + HV_VMBUS_MESSAGE_SINT, > shared_sint.as_uint64_t); >@@ -458,6 +421,13 @@ > > hv_vmbus_g_context.syn_ic_initialized = TRUE; > >+ /* >+ * Set up the cpuid mapping from Hyper-V to FreeBSD. >+ * The array is indexed using FreeBSD cpuid. 
>+ */ >+ hv_vcpu_index = rdmsr(HV_X64_MSR_VP_INDEX); >+ hv_vmbus_g_context.hv_vcpu_index[cpu] = (uint32_t)hv_vcpu_index; >+ > return; > } > >@@ -469,14 +439,10 @@ > hv_vmbus_synic_sint shared_sint; > hv_vmbus_synic_simp simp; > hv_vmbus_synic_siefp siefp; >- int cpu = PCPU_GET(cpuid); > > if (!hv_vmbus_g_context.syn_ic_initialized) > return; > >- if (cpu != 0) >- return; /* TODO: XXXKYS: SMP? */ >- > shared_sint.as_uint64_t = rdmsr( > HV_X64_MSR_SINT0 + HV_VMBUS_MESSAGE_SINT); > >Index: sys/dev/hyperv/vmbus/hv_connection.c >=================================================================== >--- sys/dev/hyperv/vmbus/hv_connection.c (revision 1) >+++ sys/dev/hyperv/vmbus/hv_connection.c (revision 3) >@@ -45,14 +45,113 @@ > { .connect_state = HV_DISCONNECTED, > .next_gpadl_handle = 0xE1E10, }; > >+uint32_t hv_vmbus_protocal_version = HV_VMBUS_VERSION_WS2008; >+ >+static uint32_t >+hv_vmbus_get_next_version(uint32_t current_ver) >+{ >+ switch (current_ver) { >+ case (HV_VMBUS_VERSION_WIN7): >+ return HV_VMBUS_VERSION_WS2008; >+ >+ case (HV_VMBUS_VERSION_WIN8): >+ return HV_VMBUS_VERSION_WIN7; >+ >+ case (HV_VMBUS_VERSION_WIN8_1): >+ return HV_VMBUS_VERSION_WIN8; >+ >+ case (HV_VMBUS_VERSION_WS2008): >+ default: >+ return HV_VMBUS_VERSION_INVALID; >+ } >+} >+ > /** >+ * Negotiate the highest supported hypervisor version. 
>+ */ >+static int >+hv_vmbus_negotiate_version(hv_vmbus_channel_msg_info *msg_info, >+ uint32_t version) >+{ >+ int ret = 0; >+ hv_vmbus_channel_initiate_contact *msg; >+ >+ sema_init(&msg_info->wait_sema, 0, "Msg Info Sema"); >+ msg = (hv_vmbus_channel_initiate_contact*) msg_info->msg; >+ >+ msg->header.message_type = HV_CHANNEL_MESSAGE_INITIATED_CONTACT; >+ msg->vmbus_version_requested = version; >+ >+ msg->interrupt_page = hv_get_phys_addr( >+ hv_vmbus_g_connection.interrupt_page); >+ >+ msg->monitor_page_1 = hv_get_phys_addr( >+ hv_vmbus_g_connection.monitor_pages); >+ >+ msg->monitor_page_2 = >+ hv_get_phys_addr( >+ ((uint8_t *) hv_vmbus_g_connection.monitor_pages >+ + PAGE_SIZE)); >+ >+ /** >+ * Add to list before we send the request since we may receive the >+ * response before returning from this routine >+ */ >+ mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); >+ >+ TAILQ_INSERT_TAIL( >+ &hv_vmbus_g_connection.channel_msg_anchor, >+ msg_info, >+ msg_list_entry); >+ >+ mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); >+ >+ ret = hv_vmbus_post_message( >+ msg, >+ sizeof(hv_vmbus_channel_initiate_contact)); >+ >+ if (ret != 0) { >+ mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); >+ TAILQ_REMOVE( >+ &hv_vmbus_g_connection.channel_msg_anchor, >+ msg_info, >+ msg_list_entry); >+ mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); >+ return (ret); >+ } >+ >+ /** >+ * Wait for the connection response >+ */ >+ ret = sema_timedwait(&msg_info->wait_sema, 500); /* KYS 5 seconds */ >+ >+ mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); >+ TAILQ_REMOVE( >+ &hv_vmbus_g_connection.channel_msg_anchor, >+ msg_info, >+ msg_list_entry); >+ mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); >+ >+ /** >+ * Check if successful >+ */ >+ if (msg_info->response.version_response.version_supported) { >+ hv_vmbus_g_connection.connect_state = HV_CONNECTED; >+ } else { >+ ret = ECONNREFUSED; >+ } >+ >+ return (ret); >+} >+ >+/** > * Send a 
connect request on the partition service connection > */ > int > hv_vmbus_connect(void) { > int ret = 0; >+ uint32_t version; > hv_vmbus_channel_msg_info* msg_info = NULL; >- hv_vmbus_channel_initiate_contact* msg; > > /** > * Make sure we are not connecting or connected >@@ -130,72 +229,31 @@ > goto cleanup; > } > >- sema_init(&msg_info->wait_sema, 0, "Msg Info Sema"); >- msg = (hv_vmbus_channel_initiate_contact*) msg_info->msg; >- >- msg->header.message_type = HV_CHANNEL_MESSAGE_INITIATED_CONTACT; >- msg->vmbus_version_requested = HV_VMBUS_REVISION_NUMBER; >- >- msg->interrupt_page = hv_get_phys_addr( >- hv_vmbus_g_connection.interrupt_page); >- >- msg->monitor_page_1 = hv_get_phys_addr( >- hv_vmbus_g_connection.monitor_pages); >- >- msg->monitor_page_2 = >- hv_get_phys_addr( >- ((uint8_t *) hv_vmbus_g_connection.monitor_pages >- + PAGE_SIZE)); >- >- /** >- * Add to list before we send the request since we may receive the >- * response before returning from this routine >+ /* >+ * Find the highest vmbus version number we can support. > */ >- mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); >+ version = HV_VMBUS_VERSION_CURRENT; > >- TAILQ_INSERT_TAIL( >- &hv_vmbus_g_connection.channel_msg_anchor, >- msg_info, >- msg_list_entry); >+ do { >+ ret = hv_vmbus_negotiate_version(msg_info, version); >+ if (ret == EWOULDBLOCK) { >+ /* >+ * We timed out. 
>+ */ >+ goto cleanup; >+ } > >- mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); >+ if (hv_vmbus_g_connection.connect_state == HV_CONNECTED) >+ break; > >- ret = hv_vmbus_post_message( >- msg, >- sizeof(hv_vmbus_channel_initiate_contact)); >+ version = hv_vmbus_get_next_version(version); >+ } while (version != HV_VMBUS_VERSION_INVALID); > >- if (ret != 0) { >- mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); >- TAILQ_REMOVE( >- &hv_vmbus_g_connection.channel_msg_anchor, >- msg_info, >- msg_list_entry); >- mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); >- goto cleanup; >- } >+ hv_vmbus_protocal_version = version; >+ if (bootverbose) >+ printf("VMBUS: Protocol Version: %d.%d\n", >+ version >> 16, version & 0xFFFF); > >- /** >- * Wait for the connection response >- */ >- ret = sema_timedwait(&msg_info->wait_sema, 500); /* KYS 5 seconds */ >- >- mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); >- TAILQ_REMOVE( >- &hv_vmbus_g_connection.channel_msg_anchor, >- msg_info, >- msg_list_entry); >- mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); >- >- /** >- * Check if successful >- */ >- if (msg_info->response.version_response.version_supported) { >- hv_vmbus_g_connection.connect_state = HV_CONNECTED; >- } else { >- ret = ECONNREFUSED; >- goto cleanup; >- } >- > sema_destroy(&msg_info->wait_sema); > free(msg_info, M_DEVBUF); > >@@ -306,7 +364,10 @@ > static void > VmbusProcessChannelEvent(uint32_t relid) > { >+ void* arg; >+ uint32_t bytes_to_read; > hv_vmbus_channel* channel; >+ boolean_t is_batched_reading; > > /** > * Find the channel based on this relid and invokes >@@ -329,11 +390,40 @@ > > mtx_lock(&channel->inbound_lock); > if (channel->on_channel_callback != NULL) { >- channel->on_channel_callback(channel->channel_callback_context); >+ arg = channel->channel_callback_context; >+ is_batched_reading = channel->batched_reading; >+ /* >+ * Optimize host to guest signaling by ensuring: >+ * 1. 
While reading the channel, we disable interrupts from >+ * host. >+ * 2. Ensure that we process all posted messages from the host >+ * before returning from this callback. >+ * 3. Once we return, enable signaling from the host. Once this >+ * state is set we check to see if additional packets are >+ * available to read. In this case we repeat the process. >+ */ >+ do { >+ if (is_batched_reading) >+ hv_ring_buffer_read_begin(&channel->inbound); >+ >+ channel->on_channel_callback(arg); >+ >+ if (is_batched_reading) >+ bytes_to_read = >+ hv_ring_buffer_read_end(&channel->inbound); >+ else >+ bytes_to_read = 0; >+ } while (is_batched_reading && (bytes_to_read != 0)); > } > mtx_unlock(&channel->inbound_lock); > } > >+#ifdef HV_DEBUG_INTR >+extern uint32_t hv_intr_count; >+extern uint32_t hv_vmbus_swintr_event_cpu[MAXCPU]; >+extern uint32_t hv_vmbus_intr_cpu[MAXCPU]; >+#endif >+ > /** > * Handler for events > */ >@@ -340,19 +430,52 @@ > void > hv_vmbus_on_events(void *arg) > { >+ int bit; >+ int cpu; > int dword; >- int bit; >+ void *page_addr; >+ uint32_t* recv_interrupt_page = NULL; > int rel_id; >- int maxdword = HV_MAX_NUM_CHANNELS_SUPPORTED >> 5; >+ int maxdword; >+ hv_vmbus_synic_event_flags *event; > /* int maxdword = PAGE_SIZE >> 3; */ > >- /* >- * receive size is 1/2 page and divide that by 4 bytes >- */ >+ cpu = (int)(long)arg; >+ KASSERT(cpu <= mp_maxid, ("VMBUS: hv_vmbus_on_events: " >+ "cpu out of range!")); > >- uint32_t* recv_interrupt_page = >- hv_vmbus_g_connection.recv_interrupt_page; >+#ifdef HV_DEBUG_INTR >+ int i; >+ hv_vmbus_swintr_event_cpu[cpu]++; >+ if (hv_intr_count % 10000 == 0) { >+ printf("VMBUS: Total interrupt %d\n", hv_intr_count); >+ for (i = 0; i < mp_ncpus; i++) >+ printf("VMBUS: hw cpu[%d]: %d, event sw intr cpu[%d]: %d\n", >+ i, hv_vmbus_intr_cpu[i], i, hv_vmbus_swintr_event_cpu[i]); >+ } >+#endif > >+ if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) || >+ (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7)) { >+ maxdword = 
HV_MAX_NUM_CHANNELS_SUPPORTED >> 5; >+ /* >+ * receive size is 1/2 page and divide that by 4 bytes >+ */ >+ recv_interrupt_page = >+ hv_vmbus_g_connection.recv_interrupt_page; >+ } else { >+ /* >+ * On Host with Win8 or above, the event page can be >+ * checked directly to get the id of the channel >+ * that has the pending interrupt. >+ */ >+ maxdword = HV_EVENT_FLAGS_DWORD_COUNT; >+ page_addr = hv_vmbus_g_context.syn_ic_event_page[cpu]; >+ event = (hv_vmbus_synic_event_flags *) >+ page_addr + HV_VMBUS_MESSAGE_SINT; >+ recv_interrupt_page = event->flags32; >+ } >+ > /* > * Check events > */ >@@ -416,8 +539,9 @@ > * Send an event notification to the parent > */ > int >-hv_vmbus_set_event(uint32_t child_rel_id) { >+hv_vmbus_set_event(hv_vmbus_channel *channel) { > int ret = 0; >+ uint32_t child_rel_id = channel->offer_msg.child_rel_id; > > /* Each uint32_t represents 32 channels */ > >@@ -424,8 +548,7 @@ > synch_set_bit(child_rel_id & 31, > (((uint32_t *)hv_vmbus_g_connection.send_interrupt_page > + (child_rel_id >> 5)))); >- ret = hv_vmbus_signal_event(); >+ ret = hv_vmbus_signal_event(channel->signal_event_param); > > return (ret); > } >- >Index: sys/x86/include/apicvar.h >=================================================================== >--- sys/x86/include/apicvar.h (revision 1) >+++ sys/x86/include/apicvar.h (revision 3) >@@ -416,6 +416,7 @@ > void lapic_handle_intr(int vector, struct trapframe *frame); > void lapic_handle_timer(struct trapframe *frame); > void xen_intr_handle_upcall(struct trapframe *frame); >+void hv_vector_handler(struct trapframe *frame); > > #endif /* !LOCORE */ > #endif /* _X86_APICVAR_H_ */ >Index: sys/i386/i386/apic_vector.s >=================================================================== >--- sys/i386/i386/apic_vector.s (revision 1) >+++ sys/i386/i386/apic_vector.s (revision 3) >@@ -157,6 +157,23 @@ > jmp doreti > #endif > >+/* >+ * This is the Hyper-V vmbus channel direct callback interrupt. 
>+ * Only used when it is running on Hyper-V. >+ */ >+ .text >+ SUPERALIGN_TEXT >+IDTVEC(hv_vmbus_callback) >+ PUSH_FRAME >+ SET_KERNEL_SREGS >+ cld >+ FAKE_MCOUNT(TF_EIP(%esp)) >+ pushl %esp >+ call hv_vector_handler >+ add $4, %esp >+ MEXITCOUNT >+ jmp doreti >+ > #ifdef SMP > /* > * Global address space TLB shootdown. >Index: sys/amd64/amd64/apic_vector.S >=================================================================== >--- sys/amd64/amd64/apic_vector.S (revision 1) >+++ sys/amd64/amd64/apic_vector.S (revision 3) >@@ -150,6 +150,20 @@ > jmp doreti > #endif > >+/* >+ * This is the Hyper-V vmbus channel direct callback interrupt. >+ * Only used when it is running on Hyper-V. >+ */ >+ .text >+ SUPERALIGN_TEXT >+IDTVEC(hv_vmbus_callback) >+ PUSH_FRAME >+ FAKE_MCOUNT(TF_RIP(%rsp)) >+ movq %rsp, %rdi >+ call hv_vector_handler >+ MEXITCOUNT >+ jmp doreti >+ > #ifdef SMP > /* > * Global address space TLB shootdown.
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
View Attachment As Diff
View Attachment As Raw
Actions:
View
|
Diff
Attachments on
bug 195238
: 149948