FreeBSD Bugzilla – Attachment 149948 Details for
Bug 195238
[Hyper-v] Enhance VMBUS and improve storage performance
Home
|
New
|
Browse
|
Search
|
[?]
|
Reports
|
Help
|
New Account
|
Log In
Remember
[x]
|
Forgot Password
Login:
[x]
[patch]
VMBUS and storage driver enhancements for Hyper-V
m1-diff.patch (text/plain), 91.30 KB, created by
Wei Hu
on 2014-11-28 04:08:02 UTC
(
hide
)
Description:
VMBUS and storage driver enhancements for Hyper-V
Filename:
MIME Type:
Creator:
Wei Hu
Created:
2014-11-28 04:08:02 UTC
Size:
91.30 KB
patch
obsolete
>Index: sys/dev/hyperv/include/hyperv.h >=================================================================== >--- sys/dev/hyperv/include/hyperv.h (revision 1) >+++ sys/dev/hyperv/include/hyperv.h (revision 3) >@@ -46,6 +46,7 @@ > #include <sys/systm.h> > #include <sys/lock.h> > #include <sys/sema.h> >+#include <sys/smp.h> > #include <sys/mutex.h> > #include <sys/bus.h> > #include <vm/vm.h> >@@ -63,12 +64,23 @@ > #define HV_ERROR_MACHINE_LOCKED 0x800704F7 > > /* >- * A revision number of vmbus that is used for ensuring both ends on a >- * partition are using compatible versions. >+ * VMBUS version is 32 bit, upper 16 bit for major_number and lower >+ * 16 bit for minor_number. >+ * >+ * 0.13 -- Windows Server 2008 >+ * 1.1 -- Windows 7 >+ * 2.4 -- Windows 8 >+ * 3.0 -- Windows 8.1 > */ >+#define HV_VMBUS_VERSION_WS2008 ((0 << 16) | (13)) >+#define HV_VMBUS_VERSION_WIN7 ((1 << 16) | (1)) >+#define HV_VMBUS_VERSION_WIN8 ((2 << 16) | (4)) >+#define HV_VMBUS_VERSION_WIN8_1 ((3 << 16) | (0)) > >-#define HV_VMBUS_REVISION_NUMBER 13 >+#define HV_VMBUS_VERSION_INVALID -1 > >+#define HV_VMBUS_VERSION_CURRENT HV_VMBUS_VERSION_WIN8_1 >+ > /* > * Make maximum size of pipe payload of 16K > */ >@@ -112,6 +124,18 @@ > unsigned char data[16]; > } __packed hv_guid; > >+#define HV_NIC_GUID \ >+ .data = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46, \ >+ 0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E} >+ >+#define HV_IDE_GUID \ >+ .data = {0x32, 0x26, 0x41, 0x32, 0xcb, 0x86, 0xa2, 0x44, \ >+ 0x9b, 0x5c, 0x50, 0xd1, 0x41, 0x73, 0x54, 0xf5} >+ >+#define HV_SCSI_GUID \ >+ .data = {0xd9, 0x63, 0x61, 0xba, 0xa1, 0x04, 0x29, 0x4d, \ >+ 0xb6, 0x05, 0x72, 0xe2, 0xff, 0xb1, 0xdc, 0x7f} >+ > /* > * At the center of the Channel Management library is > * the Channel Offer. This struct contains the >@@ -147,7 +171,11 @@ > } __packed pipe; > } u; > >- uint32_t padding; >+ /* >+ * Sub_channel_index, newly added in Win8. 
>+ */ >+ uint16_t sub_channel_index; >+ uint16_t padding; > > } __packed hv_vmbus_channel_offer; > >@@ -344,7 +372,25 @@ > hv_vmbus_channel_offer offer; > uint32_t child_rel_id; > uint8_t monitor_id; >- hv_bool_uint8_t monitor_allocated; >+ /* >+ * This field has been splited into a bit field on Win7 >+ * and higher. >+ */ >+ uint8_t monitor_allocated:1; >+ uint8_t reserved:7; >+ /* >+ * Following fields were added in win7 and higher. >+ * Make sure to check the version before accessing these fields. >+ * >+ * If "is_dedicated_interrupt" is set, we must not set the >+ * associated bit in the channel bitmap while sending the >+ * interrupt to the host. >+ * >+ * connection_id is used in signaling the host. >+ */ >+ uint16_t is_dedicated_interrupt:1; >+ uint16_t reserved1:15; >+ uint32_t connection_id; > } __packed hv_vmbus_channel_offer_channel; > > /* >@@ -394,9 +440,14 @@ > hv_gpadl_handle ring_buffer_gpadl_handle; > > /* >- * GPADL for the channel's server context save area. >+ * Starting with win8, this field will be used to specify >+ * the target virtual processor on which to deliver the interrupt for >+ * the host to guest. >+ * Before win8, all incoming channel interrupts are only to >+ * be delivered on cpu 0. Setting this value to 0 would >+ * preserve the earlier behavior. 
> */ >- hv_gpadl_handle server_context_area_gpadl_handle; >+ uint32_t target_vcpu; > > /* > * The upstream ring buffer begins at offset zero in the memory described >@@ -646,14 +697,42 @@ > } hv_vmbus_ring_buffer_info; > > typedef void (*hv_vmbus_pfn_channel_callback)(void *context); >+typedef void (*hv_vmbus_sc_creation_callback)(void *context); > > typedef enum { > HV_CHANNEL_OFFER_STATE, > HV_CHANNEL_OPENING_STATE, > HV_CHANNEL_OPEN_STATE, >+ HV_CHANNEL_OPENED_STATE, > HV_CHANNEL_CLOSING_NONDESTRUCTIVE_STATE, > } hv_vmbus_channel_state; > >+/* >+ * Connection identifier type >+ */ >+typedef union { >+ uint32_t as_uint32_t; >+ struct { >+ uint32_t id:24; >+ uint32_t reserved:8; >+ } u; >+ >+} __packed hv_vmbus_connection_id; >+ >+/* >+ * Definition of the hv_vmbus_signal_event hypercall input structure >+ */ >+typedef struct { >+ hv_vmbus_connection_id connection_id; >+ uint16_t flag_number; >+ uint16_t rsvd_z; >+} __packed hv_vmbus_input_signal_event; >+ >+typedef struct { >+ uint64_t align8; >+ hv_vmbus_input_signal_event event; >+} __packed hv_vmbus_input_signal_event_buffer; >+ > typedef struct hv_vmbus_channel { > TAILQ_ENTRY(hv_vmbus_channel) list_entry; > struct hv_device* device; >@@ -688,8 +767,87 @@ > hv_vmbus_pfn_channel_callback on_channel_callback; > void* channel_callback_context; > >+ /* >+ * If batched_reading is set to "true", mask the interrupt >+ * and read until the channel is empty. >+ * If batched_reading is set to "false", the channel is not >+ * going to perform batched reading. >+ * >+ * Batched reading is enabled by default; specific >+ * drivers that don't want this behavior can turn it off. >+ */ >+ boolean_t batched_reading; >+ >+ boolean_t is_dedicated_interrupt; >+ >+ /* >+ * Used as an input param for HV_CALL_SIGNAL_EVENT hypercall. 
>+ */ >+ hv_vmbus_input_signal_event_buffer signal_event_buffer; >+ /* >+ * 8-bytes aligned of the buffer above >+ */ >+ hv_vmbus_input_signal_event *signal_event_param; >+ >+ /* >+ * From Win8, this field specifies the target virtual process >+ * on which to deliver the interupt from the host to guest. >+ * Before Win8, all channel interrupts would only be >+ * delivered on cpu 0. Setting this value to 0 would preserve >+ * the earlier behavior. >+ */ >+ uint32_t target_vcpu; >+ /* The corresponding CPUID in the guest */ >+ uint32_t target_cpu; >+ >+ /* >+ * Support for multi-channels. >+ * The initial offer is considered the primary channel and this >+ * offer message will indicate if the host supports multi-channels. >+ * The guest is free to ask for multi-channels to be offerred and can >+ * open these multi-channels as a normal "primary" channel. However, >+ * all multi-channels will have the same type and instance guids as the >+ * primary channel. Requests sent on a given channel will result in a >+ * response on the same channel. >+ */ >+ >+ /* >+ * Multi-channel creation callback. This callback will be called in >+ * process context when a Multi-channel offer is received from the host. >+ * The guest can open the Multi-channel in the context of this callback. >+ */ >+ hv_vmbus_sc_creation_callback sc_creation_callback; >+ >+ struct mtx sc_lock; >+ >+ /* >+ * Link list of all the multi-channels if this is a primary channel >+ */ >+ TAILQ_HEAD(, hv_vmbus_channel) sc_list_anchor; >+ TAILQ_ENTRY(hv_vmbus_channel) sc_list_entry; >+ >+ /* >+ * The primary channel this sub-channle belongs to. >+ * This will be NULL for the primary channel. >+ */ >+ struct hv_vmbus_channel *primary_channel; >+ /* >+ * Support per channel state for use by vmbus drivers. >+ */ >+ void *per_channel_state; >+ /* >+ * To support per-cpu lookup mapping of relid to channel, link up >+ * channels based on their CPU affinity. 
>+ */ >+ /*XXX TAILQ_HEAD(, uint32_t) percpu_list; */ > } hv_vmbus_channel; > >+static inline void >+hv_set_channel_read_state(hv_vmbus_channel* channel, boolean_t state) >+{ >+ channel->batched_reading = state; >+} >+ > typedef struct hv_device { > hv_guid class_id; > hv_guid device_id; >@@ -760,6 +918,8 @@ > hv_vmbus_channel* channel, > uint32_t gpadl_handle); > >+struct hv_vmbus_channel* vmbus_select_outgoing_channel(struct hv_vmbus_channel *promary); >+ > /* > * Work abstraction defines > */ >@@ -819,6 +979,9 @@ > > extern uint8_t* receive_buffer[]; > extern hv_vmbus_service service_table[]; >+extern uint32_t hv_vmbus_protocal_version; >+extern int mp_ncpus; >+extern volatile int smp_started; > > void hv_kvp_callback(void *context); > int hv_kvp_init(hv_vmbus_service *serv); >Index: sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c >=================================================================== >--- sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c (revision 1) >+++ sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c (revision 3) >@@ -38,6 +38,7 @@ > #include <sys/param.h> > #include <sys/proc.h> > #include <sys/condvar.h> >+#include <sys/time.h> > #include <sys/systm.h> > #include <sys/sockio.h> > #include <sys/mbuf.h> >@@ -53,8 +54,12 @@ > #include <sys/callout.h> > #include <vm/vm.h> > #include <vm/pmap.h> >+#include <vm/uma.h> > #include <sys/lock.h> > #include <sys/sema.h> >+#include <sys/sglist.h> >+#include <machine/bus.h> >+#include <sys/bus_dma.h> > > #include <cam/cam.h> > #include <cam/cam_ccb.h> >@@ -66,7 +71,6 @@ > #include <cam/scsi/scsi_all.h> > #include <cam/scsi/scsi_message.h> > >- > #include <dev/hyperv/include/hyperv.h> > #include "hv_vstorage.h" > >@@ -77,8 +81,29 @@ > #define BLKVSC_MAX_IO_REQUESTS STORVSC_MAX_IO_REQUESTS > #define STORVSC_MAX_TARGETS (2) > >+#define STORVSC_WIN7_MAJOR 4 >+#define STORVSC_WIN7_MINOR 2 >+ >+#define STORVSC_WIN8_MAJOR 5 >+#define STORVSC_WIN8_MINOR 1 >+ >+#define HV_ALIGN(x, a) (((x) + ((a) - 1)) & ~((a) - 1)) 
>+ > struct storvsc_softc; > >+struct hv_sgl_node { >+ LIST_ENTRY(hv_sgl_node) link; >+ struct sglist *sgl_data; >+}; >+ >+struct hv_sgl_page_pool{ >+ LIST_HEAD(, hv_sgl_node) in_use_sgl_list; >+ LIST_HEAD(, hv_sgl_node) free_sgl_list; >+ boolean_t is_init; >+} g_hv_sgl_page_pool; >+ >+#define STORVSC_MAX_SG_PAGE_CNT STORVSC_MAX_IO_REQUESTS * HV_MAX_MULTIPAGE_BUFFER_COUNT >+ > enum storvsc_request_type { > WRITE_TYPE, > READ_TYPE, >@@ -96,20 +121,24 @@ > struct storvsc_softc *softc; > struct callout callout; > struct sema synch_sema; /*Synchronize the request/response if needed */ >+ struct sglist *bounce_sgl; >+ unsigned int bounce_sgl_count; >+ uint64_t not_aligned_seg_bits; > }; > > struct storvsc_softc { > struct hv_device *hs_dev; >- LIST_HEAD(, hv_storvsc_request) hs_free_list; >- struct mtx hs_lock; >- struct storvsc_driver_props *hs_drv_props; >- int hs_unit; >- uint32_t hs_frozen; >- struct cam_sim *hs_sim; >- struct cam_path *hs_path; >+ LIST_HEAD(, hv_storvsc_request) hs_free_list; >+ struct mtx hs_lock; >+ struct storvsc_driver_props *hs_drv_props; >+ int hs_unit; >+ uint32_t hs_frozen; >+ struct cam_sim *hs_sim; >+ struct cam_path *hs_path; > uint32_t hs_num_out_reqs; > boolean_t hs_destroy; > boolean_t hs_drain_notify; >+ boolean_t hs_open_multi_channel; > struct sema hs_drain_sema; > struct hv_storvsc_request hs_init_req; > struct hv_storvsc_request hs_reset_req; >@@ -124,7 +153,7 @@ > * The first can be tested by "sg_senddiag -vv /dev/daX", > * and the second and third can be done by > * "sg_wr_mode -v -p 08 -c 0,1a -m 0,ff /dev/daX". 
>- */ >+ */ > #define HVS_TIMEOUT_TEST 0 > > /* >@@ -138,7 +167,7 @@ > char *drv_name; > char *drv_desc; > uint8_t drv_max_luns_per_target; >- uint8_t drv_max_ios_per_target; >+ uint8_t drv_max_ios_per_target; > uint32_t drv_ringbuffer_size; > }; > >@@ -150,6 +179,8 @@ > > #define HS_MAX_ADAPTERS 10 > >+#define HV_STORAGE_SUPPORTS_MULTI_CHANNEL 0x1 >+ > /* {ba6163d9-04a1-4d29-b605-72e2ffb1dc7f} */ > static const hv_guid gStorVscDeviceType={ > .data = {0xd9, 0x63, 0x61, 0xba, 0xa1, 0x04, 0x29, 0x4d, >@@ -171,6 +202,9 @@ > STORVSC_RINGBUFFER_SIZE} > }; > >+static int storvsc_current_major; >+static int storvsc_current_minor; >+ > /* static functions */ > static int storvsc_probe(device_t dev); > static int storvsc_attach(device_t dev); >@@ -177,7 +211,7 @@ > static int storvsc_detach(device_t dev); > static void storvsc_poll(struct cam_sim * sim); > static void storvsc_action(struct cam_sim * sim, union ccb * ccb); >-static void create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp); >+static int create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp); > static void storvsc_free_request(struct storvsc_softc *sc, struct hv_storvsc_request *reqp); > static enum hv_storage_type storvsc_get_storage_type(device_t dev); > static void hv_storvsc_on_channel_callback(void *context); >@@ -186,6 +220,14 @@ > struct hv_storvsc_request *request); > static int hv_storvsc_connect_vsp(struct hv_device *device); > static void storvsc_io_done(struct hv_storvsc_request *reqp); >+void storvsc_copy_sgl_to_bounce_buf(struct sglist *bounce_sgl, >+ bus_dma_segment_t *orig_sgl, >+ unsigned int orig_sgl_count, >+ uint64_t seg_bits); >+void storvsc_copy_from_bounce_buf_to_sgl(bus_dma_segment_t *dest_sgl, >+ unsigned int dest_sgl_count, >+ struct sglist* src_sgl, >+ uint64_t seg_bits); > > static device_method_t storvsc_methods[] = { > /* Device interface */ >@@ -207,7 +249,7 @@ > > > /** >- * The host is capable of sending messages to us that are >+ * The host 
is capable of sending messages to us that are > * completely unsolicited. So, we need to address the race > * condition where we may be in the process of unloading the > * driver when the host may send us an unsolicited message. >@@ -223,7 +265,7 @@ > * destroyed. > * > * 3. Once the device is marked as being destroyed, we only >- * permit incoming traffic to properly account for >+ * permit incoming traffic to properly account for > * packets already sent out. > */ > static inline struct storvsc_softc * >@@ -260,6 +302,114 @@ > } > > /** >+ * @brief Callback handler, will be invoked when receive mutil-channel offer >+ * >+ * @param context new multi-channel >+ */ >+static void >+storvsc_handle_sc_creation(void *context) >+{ >+ hv_vmbus_channel *new_channel = NULL; >+ struct hv_device *device = NULL; >+ struct storvsc_softc *sc = NULL; >+ struct vmstor_chan_props props; >+ int ret = 0; >+ >+ new_channel = (hv_vmbus_channel *)context; >+ device = new_channel->primary_channel->device; >+ sc = get_stor_device(device, TRUE); >+ if (NULL == sc){ >+ return; >+ } >+ >+ if (FALSE == sc->hs_open_multi_channel){ >+ return; >+ } >+ >+ memset(&props, 0, sizeof(struct vmstor_chan_props)); >+ >+ ret = hv_vmbus_channel_open(new_channel, >+ sc->hs_drv_props->drv_ringbuffer_size, >+ sc->hs_drv_props->drv_ringbuffer_size, >+ (void *)&props, >+ sizeof(struct vmstor_chan_props), >+ hv_storvsc_on_channel_callback, >+ new_channel); >+ >+ return; >+} >+ >+/** >+ * @brief Send multi-channel creation request to host >+ * >+ * @param device a Hyper-V device pointer >+ * @param max_chans the max channels supported by vmbus >+ */ >+static void >+storvsc_send_multichannel_request(struct hv_device *dev, int max_chans) >+{ >+ struct storvsc_softc *sc = NULL; >+ struct hv_storvsc_request *request = NULL; >+ struct vstor_packet *vstor_packet = NULL; >+ int request_channels_cnt = 0; >+ int ret; >+ >+ /* get multichannels count that need to create */ >+ request_channels_cnt = ((max_chans > mp_ncpus) 
? mp_ncpus : max_chans); >+ >+ sc = get_stor_device(dev, TRUE); >+ if (sc == NULL) { >+ printf("Storvsc_error: get sc failed while send mutilchannel " >+ "request\n"); >+ return; >+ } >+ >+ request = &sc->hs_init_req; >+ >+ /* Establish a handler for multi-channel */ >+ dev->channel->sc_creation_callback = storvsc_handle_sc_creation; >+ >+ /* request the host to create multi-channel */ >+ memset(request, 0, sizeof(struct hv_storvsc_request)); >+ >+ sema_init(&request->synch_sema, 0, ("stor_synch_sema")); >+ >+ vstor_packet = &request->vstor_packet; >+ >+ vstor_packet->operation = VSTOR_OPERATION_CREATE_MULTI_CHANNELS; >+ vstor_packet->flags = REQUEST_COMPLETION_FLAG; >+ vstor_packet->u.multi_channels_cnt = request_channels_cnt; >+ >+ ret = hv_vmbus_channel_send_packet( >+ dev->channel, >+ vstor_packet, >+ sizeof(struct vstor_packet), >+ (uint64_t)(uintptr_t)request, >+ HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, >+ HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); >+ >+ /* wait for 500 ticks */ >+ ret = sema_timedwait(&request->synch_sema, 500); >+ if (ret != 0) { >+ printf("Storvsc_error: create multi-channel timeout, %d\n", >+ ret); >+ return; >+ } >+ >+ if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || >+ vstor_packet->status != 0) { >+ printf("Storvsc_error: create multi-channel invalid operation " >+ "(%d) or statue (%u)\n", >+ vstor_packet->operation, vstor_packet->status); >+ return; >+ } >+ >+ sc->hs_open_multi_channel = TRUE; >+ >+ printf("Storvsc create multi-channel success!\n"); >+} >+ >+/** > * @brief initialize channel connection to parent partition > * > * @param dev a Hyper-V device pointer >@@ -272,6 +422,8 @@ > struct hv_storvsc_request *request; > struct vstor_packet *vstor_packet; > struct storvsc_softc *sc; >+ uint16_t max_chans = 0; >+ boolean_t is_support_multichannel = FALSE; > > sc = get_stor_device(dev, TRUE); > if (sc == NULL) { >@@ -304,7 +456,8 @@ > goto cleanup; > } > >- ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 
seconds */ >+ /* wait 500 ticks */ >+ ret = sema_timedwait(&request->synch_sema, 500); > > if (ret != 0) { > goto cleanup; >@@ -321,7 +474,8 @@ > vstor_packet->operation = VSTOR_OPERATION_QUERYPROTOCOLVERSION; > vstor_packet->flags = REQUEST_COMPLETION_FLAG; > >- vstor_packet->u.version.major_minor = VMSTOR_PROTOCOL_VERSION_CURRENT; >+ vstor_packet->u.version.major_minor = >+ VMSTOR_PROTOCOL_VERSION(storvsc_current_major, storvsc_current_minor); > > /* revision is only significant for Windows guests */ > vstor_packet->u.version.revision = 0; >@@ -338,7 +492,8 @@ > goto cleanup; > } > >- ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */ >+ /* wait 500 ticks */ >+ ret = sema_timedwait(&request->synch_sema, 500); > > if (ret) { > goto cleanup; >@@ -369,7 +524,8 @@ > goto cleanup; > } > >- ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */ >+ /* wait 500 ticks */ >+ ret = sema_timedwait(&request->synch_sema, 500); > > if (ret != 0) { > goto cleanup; >@@ -377,10 +533,20 @@ > > /* TODO: Check returned version */ > if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || >- vstor_packet->status != 0) { >+ vstor_packet->status != 0) { > goto cleanup; > } > >+ /* multi-channels feature is supported by WIN8 and above version */ >+ max_chans = vstor_packet->u.chan_props.max_channel_cnt; >+ if ((hv_vmbus_protocal_version != HV_VMBUS_VERSION_WIN7) && >+ (hv_vmbus_protocal_version != HV_VMBUS_VERSION_WS2008)) { >+ if (vstor_packet->u.chan_props.flags & >+ HV_STORAGE_SUPPORTS_MULTI_CHANNEL) { >+ is_support_multichannel = TRUE; >+ } >+ } >+ > memset(vstor_packet, 0, sizeof(struct vstor_packet)); > vstor_packet->operation = VSTOR_OPERATION_ENDINITIALIZATION; > vstor_packet->flags = REQUEST_COMPLETION_FLAG; >@@ -397,7 +563,8 @@ > goto cleanup; > } > >- ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */ >+ /* wait 500 ticks */ >+ ret = sema_timedwait(&request->synch_sema, 500); > > if (ret != 0) { > goto cleanup; >@@ -404,10 
+571,18 @@ > } > > if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || >- vstor_packet->status != 0) { >+ vstor_packet->status != 0) { > goto cleanup; > } > >+ /* >+ * If multi-channel is supported, send multichannel create >+ * request to host. >+ */ >+ if (is_support_multichannel){ >+ storvsc_send_multichannel_request(dev, max_chans); >+ } >+ > cleanup: > sema_destroy(&request->synch_sema); > return (ret); >@@ -443,9 +618,8 @@ > (void *)&props, > sizeof(struct vmstor_chan_props), > hv_storvsc_on_channel_callback, >- dev); >+ dev->channel); > >- > if (ret != 0) { > return ret; > } >@@ -498,7 +672,7 @@ > > > /* >- * At this point, all outstanding requests in the adapter >+ * At this point, all outstanding requests in the adapter > * should have been flushed out and return to us > */ > >@@ -521,6 +695,7 @@ > { > struct storvsc_softc *sc; > struct vstor_packet *vstor_packet = &request->vstor_packet; >+ struct hv_vmbus_channel* outgoing_channel = NULL; > int ret = 0; > > sc = get_stor_device(device, TRUE); >@@ -539,19 +714,20 @@ > > vstor_packet->operation = VSTOR_OPERATION_EXECUTESRB; > >+ outgoing_channel = vmbus_select_outgoing_channel(device->channel); > > mtx_unlock(&request->softc->hs_lock); > if (request->data_buf.length) { > ret = hv_vmbus_channel_send_packet_multipagebuffer( >- device->channel, >+ outgoing_channel, > &request->data_buf, >- vstor_packet, >- sizeof(struct vstor_packet), >+ vstor_packet, >+ sizeof(struct vstor_packet), > (uint64_t)(uintptr_t)request); > > } else { > ret = hv_vmbus_channel_send_packet( >- device->channel, >+ outgoing_channel, > vstor_packet, > sizeof(struct vstor_packet), > (uint64_t)(uintptr_t)request, >@@ -610,7 +786,8 @@ > hv_storvsc_on_channel_callback(void *context) > { > int ret = 0; >- struct hv_device *device = (struct hv_device *)context; >+ hv_vmbus_channel *channel = (hv_vmbus_channel *)context; >+ struct hv_device *device = NULL; > struct storvsc_softc *sc; > uint32_t bytes_recvd; > uint64_t request_id; >@@ 
-618,15 +795,22 @@ > struct hv_storvsc_request *request; > struct vstor_packet *vstor_packet; > >+ if (channel->primary_channel != NULL){ >+ device = channel->primary_channel->device; >+ } else { >+ device = channel->device; >+ } >+ >+ KASSERT(device, ("device")); >+ > sc = get_stor_device(device, FALSE); > if (sc == NULL) { >+ printf("Storvsc_error: get stor device failed.\n"); > return; > } > >- KASSERT(device, ("device")); >- > ret = hv_vmbus_channel_recv_packet( >- device->channel, >+ channel, > packet, > roundup2(sizeof(struct vstor_packet), 8), > &bytes_recvd, >@@ -634,21 +818,29 @@ > > while ((ret == 0) && (bytes_recvd > 0)) { > request = (struct hv_storvsc_request *)(uintptr_t)request_id; >- KASSERT(request, ("request")); > > if ((request == &sc->hs_init_req) || > (request == &sc->hs_reset_req)) { > memcpy(&request->vstor_packet, packet, > sizeof(struct vstor_packet)); >- sema_post(&request->synch_sema); >+ sema_post(&request->synch_sema); > } else { > vstor_packet = (struct vstor_packet *)packet; > switch(vstor_packet->operation) { > case VSTOR_OPERATION_COMPLETEIO: >+ if (request == NULL) { >+ printf("VMBUS: storvsc received a " >+ "packet with NULL request id in " >+ "COMPLETEIO operation. 
Panick!\n"); >+ KASSERT(request, ("request")); >+ } > hv_storvsc_on_iocompletion(sc, > vstor_packet, request); > break; > case VSTOR_OPERATION_REMOVEDEVICE: >+ case VSTOR_OPERATION_ENUMERATE_BUS: >+ printf("VMBUS: storvsc operation %d not " >+ "implemented.\n", vstor_packet->operation); > /* TODO: implement */ > break; > default: >@@ -656,7 +848,7 @@ > } > } > ret = hv_vmbus_channel_recv_packet( >- device->channel, >+ channel, > packet, > roundup2(sizeof(struct vstor_packet), 8), > &bytes_recvd, >@@ -680,7 +872,16 @@ > { > int ata_disk_enable = 0; > int ret = ENXIO; >- >+ >+ if ((HV_VMBUS_VERSION_WIN8 == hv_vmbus_protocal_version) || >+ (HV_VMBUS_VERSION_WIN8_1 == hv_vmbus_protocal_version)){ >+ storvsc_current_major = STORVSC_WIN8_MAJOR; >+ storvsc_current_minor = STORVSC_WIN8_MINOR; >+ } else { >+ storvsc_current_major = STORVSC_WIN7_MAJOR; >+ storvsc_current_minor = STORVSC_WIN7_MINOR; >+ } >+ > switch (storvsc_get_storage_type(dev)) { > case DRIVER_BLKVSC: > if(bootverbose) >@@ -721,9 +922,11 @@ > enum hv_storage_type stor_type; > struct storvsc_softc *sc; > struct cam_devq *devq; >- int ret, i; >+ int ret, i, j; > struct hv_storvsc_request *reqp; > struct root_hold_token *root_mount_token = NULL; >+ struct hv_sgl_node *sgl_node = NULL; >+ void *tmp_buff = NULL; > > /* > * We need to serialize storvsc attach calls. 
>@@ -764,8 +967,46 @@ > LIST_INSERT_HEAD(&sc->hs_free_list, reqp, link); > } > >+ /* create sg-list page pool */ >+ if (FALSE == g_hv_sgl_page_pool.is_init){ >+ g_hv_sgl_page_pool.is_init = TRUE; >+ LIST_INIT(&g_hv_sgl_page_pool.in_use_sgl_list); >+ LIST_INIT(&g_hv_sgl_page_pool.free_sgl_list); >+ >+ /* pre-create SG list, each SG list with HV_MAX_MULTIPAGE_BUFFER_COUNT segments, each segment has one page buffer */ >+ for (i = 0; i < STORVSC_MAX_IO_REQUESTS; i++){ >+ sgl_node = malloc(sizeof(struct hv_sgl_node), >+ M_DEVBUF, M_WAITOK|M_ZERO); >+ if (NULL == sgl_node){ >+ ret = ENOMEM; >+ goto cleanup; >+ } >+ >+ sgl_node->sgl_data = sglist_alloc(HV_MAX_MULTIPAGE_BUFFER_COUNT, >+ M_WAITOK|M_ZERO); >+ if (NULL == sgl_node->sgl_data){ >+ ret = ENOMEM; >+ goto cleanup; >+ } >+ >+ for (j = 0; j < HV_MAX_MULTIPAGE_BUFFER_COUNT; j++){ >+ tmp_buff = malloc(PAGE_SIZE, >+ M_DEVBUF, M_WAITOK|M_ZERO); >+ if (NULL == tmp_buff){ >+ ret = ENOMEM; >+ goto cleanup; >+ } >+ >+ sgl_node->sgl_data->sg_segs[j].ss_paddr = (vm_paddr_t)tmp_buff; >+ } >+ >+ LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list, sgl_node, link); >+ } >+ } >+ > sc->hs_destroy = FALSE; > sc->hs_drain_notify = FALSE; >+ sc->hs_open_multi_channel = FALSE; > sema_init(&sc->hs_drain_sema, 0, "Store Drain Sema"); > > ret = hv_storvsc_connect_vsp(hv_dev); >@@ -834,6 +1075,19 @@ > LIST_REMOVE(reqp, link); > free(reqp, M_DEVBUF); > } >+ >+ while (!LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) { >+ sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); >+ LIST_REMOVE(sgl_node, link); >+ for (j = 0; j < HV_MAX_MULTIPAGE_BUFFER_COUNT; j++){ >+ if (NULL != (void*)sgl_node->sgl_data->sg_segs[j].ss_paddr){ >+ free((void*)sgl_node->sgl_data->sg_segs[j].ss_paddr, M_DEVBUF); >+ } >+ } >+ sglist_free(sgl_node->sgl_data); >+ free(sgl_node, M_DEVBUF); >+ } >+ > return (ret); > } > >@@ -853,6 +1107,8 @@ > struct storvsc_softc *sc = device_get_softc(dev); > struct hv_storvsc_request *reqp = NULL; > struct hv_device *hv_device = 
vmbus_get_devctx(dev); >+ struct hv_sgl_node *sgl_node = NULL; >+ int j = 0; > > mtx_lock(&hv_device->channel->inbound_lock); > sc->hs_destroy = TRUE; >@@ -884,6 +1140,19 @@ > free(reqp, M_DEVBUF); > } > mtx_unlock(&sc->hs_lock); >+ >+ while (!LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) { >+ sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); >+ LIST_REMOVE(sgl_node, link); >+ for (j = 0; j < HV_MAX_MULTIPAGE_BUFFER_COUNT; j++){ >+ if (NULL != (void*)sgl_node->sgl_data->sg_segs[j].ss_paddr){ >+ free((void*)sgl_node->sgl_data->sg_segs[j].ss_paddr, M_DEVBUF); >+ } >+ } >+ sglist_free(sgl_node->sgl_data); >+ free(sgl_node, M_DEVBUF); >+ } >+ > return (0); > } > >@@ -939,7 +1208,7 @@ > ticks, __func__, (ret == 0)? > "IO return detected" : > "IO return not detected"); >- /* >+ /* > * Now both the timer handler and io done are running > * simultaneously. We want to confirm the io done always > * finishes after the timer handler exits. So reqp used by >@@ -1024,7 +1293,7 @@ > > mtx_assert(&sc->hs_lock, MA_OWNED); > mtx_unlock(&sc->hs_lock); >- hv_storvsc_on_channel_callback(sc->hs_dev); >+ hv_storvsc_on_channel_callback(sc->hs_dev->channel); > mtx_lock(&sc->hs_lock); > } > >@@ -1152,10 +1421,14 @@ > > bzero(reqp, sizeof(struct hv_storvsc_request)); > reqp->softc = sc; >+ >+ ccb->ccb_h.status |= CAM_SIM_QUEUED; >+ if ((res = create_storvsc_request(ccb, reqp)) != 0) { >+ ccb->ccb_h.status = CAM_REQ_INVALID; >+ xpt_done(ccb); >+ return; >+ } > >- ccb->ccb_h.status |= CAM_SIM_QUEUED; >- create_storvsc_request(ccb, reqp); >- > if (ccb->ccb_h.timeout != CAM_TIME_INFINITY) { > callout_init(&reqp->callout, CALLOUT_MPSAFE); > callout_reset(&reqp->callout, >@@ -1195,6 +1468,207 @@ > } > > /** >+ * @brief destroy bounce buffer >+ * >+ * This function is responsible for destroy a Scatter/Gather list >+ * that create by storvsc_create_bounce_buffer() >+ * >+ * @param sgl- the Scatter/Gather need be destroy >+ * @param sg_count- page count of the SG list. 
>+ * >+ */ >+static void >+storvsc_destroy_bounce_buffer(struct sglist *sgl) >+{ >+ struct hv_sgl_node *sgl_node = NULL; >+ >+ sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.in_use_sgl_list); >+ LIST_REMOVE(sgl_node, link); >+ if (NULL == sgl_node) { >+ printf("storvsc error: not enough in use sgl\n"); >+ return; >+ } >+ sgl_node->sgl_data = sgl; >+ LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list, sgl_node, link); >+} >+ >+/** >+ * @brief create bounce buffer >+ * >+ * This function is responsible for create a Scatter/Gather list, >+ * which hold several pages that can be aligned with page size. >+ * >+ * @param seg_count- SG-list segments count >+ * @param write - if WRITE_TYPE, set SG list page used size to 0, >+ * otherwise set used size to page size. >+ * >+ * return NULL if create failed >+ */ >+static struct sglist * >+storvsc_create_bounce_buffer(uint16_t seg_count, int write) >+{ >+ int i = 0; >+ struct sglist *bounce_sgl = NULL; >+ unsigned int buf_len = ((write == WRITE_TYPE) ? 0 : PAGE_SIZE); >+ struct hv_sgl_node *sgl_node = NULL; >+ >+ /* get struct sglist from free_sgl_list */ >+ sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); >+ LIST_REMOVE(sgl_node, link); >+ if (NULL == sgl_node) { >+ printf("storvsc error: not enough free sgl\n"); >+ return NULL; >+ } >+ bounce_sgl = sgl_node->sgl_data; >+ LIST_INSERT_HEAD(&g_hv_sgl_page_pool.in_use_sgl_list, sgl_node, link); >+ >+ bounce_sgl->sg_maxseg = seg_count; >+ if (write == WRITE_TYPE) { >+ bounce_sgl->sg_nseg = 0; >+ } else { >+ bounce_sgl->sg_nseg = seg_count; >+ } >+ >+ for (i = 0; i < seg_count; i++) { >+ bounce_sgl->sg_segs[i].ss_len = buf_len; >+ } >+ >+ return bounce_sgl; >+} >+ >+/** >+ * @brief copy data from SG list to bounce buffer >+ * >+ * This function is responsible for copy data from one SG list's segments >+ * to another SG list which used as bounce buffer. >+ * >+ * @param bounce_sgl - the destination SG list >+ * @param orig_sgl - the segment of the source SG list. 
>+ * @param orig_sgl_count - the count of segments. >+ * @param orig_sgl_count - indicate which segment need bounce buffer, set 1 means need. >+ * >+ */ >+void storvsc_copy_sgl_to_bounce_buf(struct sglist *bounce_sgl, >+ bus_dma_segment_t *orig_sgl, >+ unsigned int orig_sgl_count, >+ uint64_t seg_bits) >+{ >+ int src_sgl_idx = 0; >+ >+ for (src_sgl_idx = 0; src_sgl_idx < orig_sgl_count; src_sgl_idx++) { >+ if (seg_bits & (1 << src_sgl_idx)) { >+ memcpy((void*)bounce_sgl->sg_segs[src_sgl_idx].ss_paddr, >+ (void*)orig_sgl[src_sgl_idx].ds_addr, >+ orig_sgl[src_sgl_idx].ds_len); >+ bounce_sgl->sg_segs[src_sgl_idx].ss_len = >+ orig_sgl[src_sgl_idx].ds_len; >+ } >+ } >+} >+ >+/** >+ * @brief copy data from SG list which used as bounce to another SG list >+ * >+ * This function is responsible for copy data from one SG list with bounce >+ * buffer to another SG list's segments. >+ * >+ * @param dest_sgl - the destination SG list's segments >+ * @param dest_sgl_count - the count of destination SG list's segment. >+ * @param src_sgl - the source SG list. >+ * @param seg_bits - indicate which segment used bounce buffer of src SG-list. >+ * >+ */ >+void >+storvsc_copy_from_bounce_buf_to_sgl(bus_dma_segment_t *dest_sgl, >+ unsigned int dest_sgl_count, >+ struct sglist* src_sgl, >+ uint64_t seg_bits) >+{ >+ int sgl_idx = 0; >+ >+ for (sgl_idx = 0; sgl_idx < dest_sgl_count; sgl_idx++) { >+ if (seg_bits & (1 << sgl_idx)) { >+ memcpy((void*)(dest_sgl[sgl_idx].ds_addr), >+ (void*)(src_sgl->sg_segs[sgl_idx].ss_paddr), >+ src_sgl->sg_segs[sgl_idx].ss_len); >+ } >+ } >+} >+ >+/** >+ * @brief check SG list with bounce buffer or not >+ * >+ * This function is responsible for check if need bounce buffer for SG list. >+ * >+ * @param sgl - the SG list's segments >+ * @param sg_count - the count of SG list's segment. 
>+ * @param bits - segmengs number that need bounce buffer >+ * >+ * return -1 if SG list needless bounce buffer >+ */ >+static int >+storvsc_check_bounce_buffer_sgl(bus_dma_segment_t *sgl, unsigned int sg_count, uint64_t *bits) >+{ >+ int i = 0; >+ int offset = 0; >+ uint64_t phys_addr = 0; >+ uint64_t tmp_bits = 0; >+ boolean_t found_hole = FALSE; >+ boolean_t pre_aligned = TRUE; >+ >+ if (sg_count < 2){ >+ return -1; >+ } >+ >+ *bits = 0; >+ >+ phys_addr = vtophys(sgl[0].ds_addr); >+ offset = phys_addr - trunc_page(phys_addr); >+ if (offset){ >+ pre_aligned = FALSE; >+ tmp_bits |= 1; >+ } >+ >+ for (i = 1; i < sg_count; i++) { >+ phys_addr = vtophys(sgl[i].ds_addr); >+ offset = phys_addr - trunc_page(phys_addr); >+ >+ if (0 == offset) { >+ if (FALSE == pre_aligned){ >+ /* >+ * This segment is aligned, if the previous >+ * one is not aligned, find a hole >+ */ >+ found_hole = TRUE; >+ } >+ pre_aligned = TRUE; >+ } else { >+ tmp_bits |= 1 << i; >+ if (FALSE == pre_aligned) { >+ if (phys_addr != vtophys(sgl[i-1].ds_addr + >+ sgl[i-1].ds_len)) { >+ /* >+ * Check whether connect to previous >+ * segment,if not, find the hole >+ */ >+ found_hole = TRUE; >+ } >+ } else { >+ found_hole = TRUE; >+ } >+ pre_aligned = FALSE; >+ } >+ } >+ >+ if (FALSE == found_hole) { >+ return -1; >+ } else { >+ *bits = tmp_bits; >+ return 0; >+ } >+} >+ >+/** > * @brief Fill in a request structure based on a CAM control block > * > * Fills in a request structure based on the contents of a CAM control >@@ -1204,7 +1678,7 @@ > * @param ccb pointer to a CAM contorl block > * @param reqp pointer to a request structure > */ >-static void >+static int > create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp) > { > struct ccb_scsiio *csio = &ccb->csio; >@@ -1212,6 +1686,7 @@ > uint32_t bytes_to_copy = 0; > uint32_t pfn_num = 0; > uint32_t pfn; >+ uint64_t not_aligned_seg_bits = 0; > > /* refer to struct vmscsi_req for meanings of these two fields */ > 
reqp->vstor_packet.u.vm_srb.port = >@@ -1232,18 +1707,18 @@ > } > > switch (ccb->ccb_h.flags & CAM_DIR_MASK) { >- case CAM_DIR_OUT: >- reqp->vstor_packet.u.vm_srb.data_in = WRITE_TYPE; >- break; >- case CAM_DIR_IN: >- reqp->vstor_packet.u.vm_srb.data_in = READ_TYPE; >- break; >- case CAM_DIR_NONE: >- reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; >- break; >- default: >- reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; >- break; >+ case CAM_DIR_OUT: >+ reqp->vstor_packet.u.vm_srb.data_in = WRITE_TYPE; >+ break; >+ case CAM_DIR_IN: >+ reqp->vstor_packet.u.vm_srb.data_in = READ_TYPE; >+ break; >+ case CAM_DIR_NONE: >+ reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; >+ break; >+ default: >+ reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; >+ break; > } > > reqp->sense_data = &csio->sense_data; >@@ -1250,30 +1725,138 @@ > reqp->sense_info_len = csio->sense_len; > > reqp->ccb = ccb; >- /* >- KASSERT((ccb->ccb_h.flags & CAM_SCATTER_VALID) == 0, >- ("ccb is scatter gather valid\n")); >- */ >- if (csio->dxfer_len != 0) { >- reqp->data_buf.length = csio->dxfer_len; >+ >+ if (0 == csio->dxfer_len) { >+ return 0; >+ } >+ >+ reqp->data_buf.length = csio->dxfer_len; >+ >+ switch (ccb->ccb_h.flags & CAM_DATA_MASK) { >+ case CAM_DATA_VADDR:{ > bytes_to_copy = csio->dxfer_len; > phys_addr = vtophys(csio->data_ptr); >- reqp->data_buf.offset = phys_addr - trunc_page(phys_addr); >+ reqp->data_buf.offset = phys_addr & PAGE_MASK; >+ >+ while (bytes_to_copy != 0) { >+ int bytes, page_offset; >+ phys_addr = >+ vtophys(&csio->data_ptr[reqp->data_buf.length - >+ bytes_to_copy]); >+ pfn = phys_addr >> PAGE_SHIFT; >+ reqp->data_buf.pfn_array[pfn_num] = pfn; >+ page_offset = phys_addr & PAGE_MASK; >+ >+ bytes = min(PAGE_SIZE - page_offset, bytes_to_copy); >+ >+ bytes_to_copy -= bytes; >+ pfn_num++; >+ } >+ break; > } >+ case CAM_DATA_SG:{ >+ int i = 0; >+ int offset = 0; >+ bus_dma_segment_t *storvsc_sglist = >+ (bus_dma_segment_t *)ccb->csio.data_ptr; >+ u_int16_t 
storvsc_sg_count = ccb->csio.sglist_cnt; > >- while (bytes_to_copy != 0) { >- int bytes, page_offset; >- phys_addr = vtophys(&csio->data_ptr[reqp->data_buf.length - >- bytes_to_copy]); >- pfn = phys_addr >> PAGE_SHIFT; >- reqp->data_buf.pfn_array[pfn_num] = pfn; >- page_offset = phys_addr - trunc_page(phys_addr); >+ printf("Storvsc: get SG I/O operation, %d\n", >+ reqp->vstor_packet.u.vm_srb.data_in); > >- bytes = min(PAGE_SIZE - page_offset, bytes_to_copy); >+ if (storvsc_sg_count > HV_MAX_MULTIPAGE_BUFFER_COUNT){ >+ printf("Storvsc: %d segments is too much, " >+ "only support %d segments\n", >+ storvsc_sg_count, HV_MAX_MULTIPAGE_BUFFER_COUNT); >+ return EINVAL; >+ } > >- bytes_to_copy -= bytes; >- pfn_num++; >+ /* check if we need to create bounce buffer */ >+ if (storvsc_check_bounce_buffer_sgl( >+ storvsc_sglist, >+ storvsc_sg_count, >+ ¬_aligned_seg_bits) != -1) { >+ reqp->bounce_sgl = >+ storvsc_create_bounce_buffer(storvsc_sg_count, >+ reqp->vstor_packet.u.vm_srb.data_in); >+ if (NULL == reqp->bounce_sgl) { >+ printf("Storvsc_error: create bounce buffer failed.\n"); >+ return ENOMEM; >+ } >+ >+ reqp->bounce_sgl_count = storvsc_sg_count; >+ reqp->not_aligned_seg_bits = not_aligned_seg_bits; >+ >+ /* >+ * if it is write, we need copy the original data >+ *to bounce buffer >+ */ >+ if (WRITE_TYPE == reqp->vstor_packet.u.vm_srb.data_in) { >+ storvsc_copy_sgl_to_bounce_buf( >+ reqp->bounce_sgl, >+ storvsc_sglist, >+ storvsc_sg_count, >+ reqp->not_aligned_seg_bits); >+ } >+ >+ /* transfer virtual address to physical frame number */ >+ if (reqp->not_aligned_seg_bits & 0x1){ >+ phys_addr = >+ vtophys(reqp->bounce_sgl->sg_segs[0].ss_paddr); >+ }else{ >+ phys_addr = >+ vtophys(storvsc_sglist[0].ds_addr); >+ } >+ reqp->data_buf.offset = phys_addr & PAGE_MASK; >+ >+ pfn = phys_addr >> PAGE_SHIFT; >+ reqp->data_buf.pfn_array[0] = pfn; >+ >+ for (i = 1; i < storvsc_sg_count; i++) { >+ if (reqp->not_aligned_seg_bits & (1 << i)){ >+ phys_addr = >+ 
vtophys(reqp->bounce_sgl->sg_segs[i].ss_paddr); >+ } >+ else{ >+ phys_addr = >+ vtophys(storvsc_sglist[i].ds_addr); >+ } >+ >+ pfn = phys_addr >> PAGE_SHIFT; >+ reqp->data_buf.pfn_array[i] = pfn; >+ } >+ } >+ else { >+ phys_addr = vtophys(storvsc_sglist[0].ds_addr); >+ >+ reqp->data_buf.offset = phys_addr & PAGE_MASK; >+ >+ for (i = 0; i < storvsc_sg_count; i++){ >+ phys_addr = vtophys(storvsc_sglist[i].ds_addr); >+ pfn = phys_addr >> PAGE_SHIFT; >+ reqp->data_buf.pfn_array[i] = pfn; >+ } >+ >+ /* check the last segment cross boundary or not */ >+ offset = phys_addr & PAGE_MASK; >+ if (offset){ >+ phys_addr = >+ vtophys(storvsc_sglist[i-1].ds_addr + >+ PAGE_SIZE - offset); >+ pfn = phys_addr >> PAGE_SHIFT; >+ reqp->data_buf.pfn_array[i] = pfn; >+ } >+ >+ reqp->bounce_sgl_count = 0; >+ } >+ break; > } >+ default: >+ printf("Unknow flags: %d\n", ccb->ccb_h.flags); >+ return EINVAL; >+ } >+ >+ return 0; > } > > /** >@@ -1292,7 +1875,29 @@ > struct ccb_scsiio *csio = &ccb->csio; > struct storvsc_softc *sc = reqp->softc; > struct vmscsi_req *vm_srb = &reqp->vstor_packet.u.vm_srb; >- >+ bus_dma_segment_t *ori_sglist = NULL; >+ int ori_sg_count = 0; >+ >+ /* destroy bounce buffer if it is used */ >+ if (reqp->bounce_sgl_count) { >+ ori_sglist = (bus_dma_segment_t *)ccb->csio.data_ptr; >+ ori_sg_count = ccb->csio.sglist_cnt; >+ >+ /* >+ * If it is READ operation, we should copy back the data >+ * to original SG list. >+ */ >+ if (READ_TYPE == reqp->vstor_packet.u.vm_srb.data_in) { >+ storvsc_copy_from_bounce_buf_to_sgl(ori_sglist, >+ ori_sg_count, >+ reqp->bounce_sgl, >+ reqp->not_aligned_seg_bits); >+ } >+ >+ storvsc_destroy_bounce_buffer(reqp->bounce_sgl); >+ reqp->bounce_sgl_count = 0; >+ } >+ > if (reqp->retries > 0) { > mtx_lock(&sc->hs_lock); > #if HVS_TIMEOUT_TEST >@@ -1310,7 +1915,7 @@ > mtx_unlock(&sc->hs_lock); > } > >- /* >+ /* > * callout_drain() will wait for the timer handler to finish > * if it is running. 
So we don't need any lock to synchronize > * between this routine and the timer handler. >Index: sys/dev/hyperv/storvsc/hv_vstorage.h >=================================================================== >--- sys/dev/hyperv/storvsc/hv_vstorage.h (revision 1) >+++ sys/dev/hyperv/storvsc/hv_vstorage.h (revision 3) >@@ -53,7 +53,7 @@ > * V1 RC > 2008/1/31 2.0 > */ > >-#define VMSTOR_PROTOCOL_VERSION_CURRENT VMSTOR_PROTOCOL_VERSION(2, 0) >+#define VMSTOR_PROTOCOL_VERSION_CURRENT VMSTOR_PROTOCOL_VERSION(5, 1) > > /** > * Packet structure ops describing virtual storage requests. >@@ -69,7 +69,10 @@ > VSTOR_OPERATION_ENDINITIALIZATION = 8, > VSTOR_OPERATION_QUERYPROTOCOLVERSION = 9, > VSTOR_OPERATION_QUERYPROPERTIES = 10, >- VSTOR_OPERATION_MAXIMUM = 10 >+ VSTOR_OPERATION_ENUMERATE_BUS = 11, >+ VSTOR_OPERATION_FCHBA_DATA = 12, >+ VSTOR_OPERATION_CREATE_MULTI_CHANNELS = 13, >+ VSTOR_OPERATION_MAXIMUM = 13 > }; > > >@@ -123,10 +126,12 @@ > uint8_t path_id; > uint8_t target_id; > >+ uint16_t max_channel_cnt; >+ > /** > * Note: port number is only really known on the client side > */ >- uint32_t port; >+ uint16_t port; > uint32_t flags; > uint32_t max_transfer_bytes; > >@@ -193,6 +198,11 @@ > * Used during version negotiations. > */ > struct vmstor_proto_ver version; >+ >+ /** >+ * Number of multichannels to create >+ */ >+ uint16_t multi_channels_cnt; > } u; > > } __packed; >Index: sys/dev/hyperv/utilities/hv_util.c >=================================================================== >--- sys/dev/hyperv/utilities/hv_util.c (revision 1) >+++ sys/dev/hyperv/utilities/hv_util.c (revision 3) >@@ -408,6 +408,15 @@ > } > } > >+ /* >+ * These services are not performance critical and do not need >+ * batched reading. Furthermore, some services such as KVP can >+ * only handle one message from the host at a time. >+ * Turn off batched reading for all util drivers before we open the >+ * channel. 
>+ */ >+ hv_set_channel_read_state(hv_dev->channel, FALSE); >+ > ret = hv_vmbus_channel_open(hv_dev->channel, 4 * PAGE_SIZE, > 4 * PAGE_SIZE, NULL, 0, > service->callback, hv_dev->channel); >Index: sys/dev/hyperv/utilities/hv_kvp.c >=================================================================== >--- sys/dev/hyperv/utilities/hv_kvp.c (revision 1) >+++ sys/dev/hyperv/utilities/hv_kvp.c (revision 3) >@@ -55,6 +55,7 @@ > #include <sys/_null.h> > #include <sys/signal.h> > #include <sys/syslog.h> >+#include <sys/systm.h> > #include <sys/mutex.h> > #include <net/if_arp.h> > >@@ -232,7 +233,7 @@ > */ > if ((icframe_vercnt >= 2) && (negop->icversion_data[1].major == 3)) { > icframe_vercnt = 3; >- if (icmsg_vercnt >= 2) >+ if (icmsg_vercnt > 2) > icmsg_vercnt = 4; > else > icmsg_vercnt = 3; >@@ -734,8 +735,8 @@ > recvlen = 0; > ret = hv_vmbus_channel_recv_packet(channel, kvp_buf, 2 * PAGE_SIZE, > &recvlen, &requestid); >- hv_kvp_log_info("%s: read: context %p, pending_cnt %ju ret =%d, recvlen=%d\n", >- __func__, context, pending_cnt, ret, recvlen); >+ hv_kvp_log_info("%s: read: context %p, pending_cnt %llu ret =%d, recvlen=%d\n", >+ __func__, context, (unsigned long long)pending_cnt, ret, recvlen); > } > } > >@@ -813,9 +814,9 @@ > hv_kvp_dev_destroy(void) > { > >- if (daemon_task != NULL) { >+ if (daemon_task != NULL) { > PROC_LOCK(daemon_task); >- kern_psignal(daemon_task, SIGKILL); >+ kern_psignal(daemon_task, SIGKILL); > PROC_UNLOCK(daemon_task); > } > >Index: sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c >=================================================================== >--- sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c (revision 1) >+++ sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c (revision 3) >@@ -53,7 +53,10 @@ > > #include <machine/stdarg.h> > #include <machine/intr_machdep.h> >+#include <machine/md_var.h> >+#include <machine/segments.h> > #include <sys/pcpu.h> >+#include <x86/apicvar.h> > > #include "hv_vmbus_priv.h" > >@@ -60,15 +63,7 @@ > > #define VMBUS_IRQ 0x5 
> >-static struct intr_event *hv_msg_intr_event; >-static struct intr_event *hv_event_intr_event; >-static void *msg_swintr; >-static void *event_swintr; > static device_t vmbus_devp; >-static void *vmbus_cookiep; >-static int vmbus_rid; >-struct resource *intr_res; >-static int vmbus_irq = VMBUS_IRQ; > static int vmbus_inited; > static hv_setup_args setup_args; /* only CPU 0 supported at this time */ > >@@ -77,7 +72,7 @@ > * the hypervisor. > */ > static void >-vmbus_msg_swintr(void *dummy) >+vmbus_msg_swintr(void *arg) > { > int cpu; > void* page_addr; >@@ -84,7 +79,10 @@ > hv_vmbus_message* msg; > hv_vmbus_message* copied; > >- cpu = PCPU_GET(cpuid); >+ cpu = (int)(long)arg; >+ KASSERT(cpu <= mp_maxid, ("VMBUS: vmbus_msg_swintr: " >+ "cpu out of range!")); >+ > page_addr = hv_vmbus_g_context.syn_ic_msg_page[cpu]; > msg = (hv_vmbus_message*) page_addr + HV_VMBUS_MESSAGE_SINT; > >@@ -130,17 +128,8 @@ > * > * The purpose of this routine is to determine the type of VMBUS protocol > * message to process - an event or a channel message. >- * As this is an interrupt filter routine, the function runs in a very >- * restricted envinronment. From the manpage for bus_setup_intr(9) >- * >- * In this restricted environment, care must be taken to account for all >- * races. A careful analysis of races should be done as well. It is gener- >- * ally cheaper to take an extra interrupt, for example, than to protect >- * variables with spinlocks. Read, modify, write cycles of hardware regis- >- * ters need to be carefully analyzed if other threads are accessing the >- * same registers. 
> */ >-static int >+static inline int > hv_vmbus_isr(void *unused) > { > int cpu; >@@ -149,8 +138,6 @@ > void* page_addr; > > cpu = PCPU_GET(cpuid); >- /* (Temporary limit) */ >- KASSERT(cpu == 0, ("hv_vmbus_isr: Interrupt on CPU other than zero")); > > /* > * The Windows team has advised that we check for events >@@ -162,9 +149,21 @@ > event = (hv_vmbus_synic_event_flags*) > page_addr + HV_VMBUS_MESSAGE_SINT; > >- /* Since we are a child, we only need to check bit 0 */ >- if (synch_test_and_clear_bit(0, &event->flags32[0])) { >- swi_sched(event_swintr, 0); >+ if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) || >+ (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7)) { >+ /* Since we are a child, we only need to check bit 0 */ >+ if (synch_test_and_clear_bit(0, &event->flags32[0])) { >+ swi_sched(hv_vmbus_g_context.event_swintr[cpu], 0); >+ } >+ } else { >+ /* >+ * On host with Win8 or above, we can directly look at >+ * the event page. If bit n is set, we have an interrupt >+ * on the channel with id n. >+ * Directly schedule the event software interrupt on >+ * current cpu. >+ */ >+ swi_sched(hv_vmbus_g_context.event_swintr[cpu], 0); > } > > /* Check if there are actual msgs to be process */ >@@ -172,12 +171,47 @@ > msg = (hv_vmbus_message*) page_addr + HV_VMBUS_MESSAGE_SINT; > > if (msg->header.message_type != HV_MESSAGE_TYPE_NONE) { >- swi_sched(msg_swintr, 0); >+ swi_sched(hv_vmbus_g_context.msg_swintr[cpu], 0); > } > > return FILTER_HANDLED; > } > >+#ifdef HV_DEBUG_INTR >+uint32_t hv_intr_count = 0; >+#endif >+uint32_t hv_vmbus_swintr_event_cpu[MAXCPU]; >+uint32_t hv_vmbus_intr_cpu[MAXCPU]; >+ >+void >+hv_vector_handler(struct trapframe *trap_frame) >+{ >+#ifdef HV_DEBUG_INTR >+ int cpu; >+#endif >+ >+ /* >+ * Disable preemption. >+ */ >+ critical_enter(); >+ >+#ifdef HV_DEBUG_INTR >+ /* >+ * Do a little interrupt counting. 
>+ */ >+ cpu = PCPU_GET(cpuid); >+ hv_vmbus_intr_cpu[cpu]++; >+ hv_intr_count++; >+#endif >+ >+ hv_vmbus_isr(NULL); >+ >+ /* >+ * Enable preemption. >+ */ >+ critical_exit(); >+} >+ > static int > vmbus_read_ivar( > device_t dev, >@@ -316,7 +350,66 @@ > return (BUS_PROBE_NOWILDCARD); > } > >+extern inthand_t IDTVEC(rsvd), IDTVEC(hv_vmbus_callback); >+ > /** >+ * @brief Find a free IDT slot and setup the interrupt handler. >+ */ >+static int >+vmbus_vector_alloc(void) >+{ >+ int vector; >+ uintptr_t func; >+ struct gate_descriptor *ip; >+ >+ /* >+ * Search backwards form the highest IDT vector available for use >+ * as vmbus channel callback vector. We install 'hv_vmbus_callback' >+ * handler at that vector and use it to interrupt vcpus. >+ */ >+ vector = APIC_SPURIOUS_INT; >+ while (--vector >= APIC_IPI_INTS) { >+ ip = &idt[vector]; >+ func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); >+ if (func == (uintptr_t)&IDTVEC(rsvd)) { >+#ifdef __i386__ >+ setidt(vector , &IDTVEC(hv_vmbus_callback), SDT_SYS386IGT, >+ SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); >+#else >+ setidt(vector , IDTVEC(hv_vmbus_callback), SDT_SYSIGT, >+ SEL_KPL, 0); >+#endif >+ >+ return (vector); >+ } >+ } >+ return (0); >+} >+ >+/** >+ * @brief Restore the IDT slot to rsvd. >+ */ >+static void >+vmbus_vector_free(int vector) >+{ >+ uintptr_t func; >+ struct gate_descriptor *ip; >+ >+ if (vector == 0) >+ return; >+ >+ KASSERT(vector >= APIC_IPI_INTS && vector < APIC_SPURIOUS_INT, >+ ("invalid vector %d", vector)); >+ >+ ip = &idt[vector]; >+ func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); >+ KASSERT(func == (uintptr_t)&IDTVEC(hv_vmbus_callback), >+ ("invalid vector %d", vector)); >+ >+ setidt(vector, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); >+} >+ >+/** > * @brief Main vmbus driver initialization routine. 
> * > * Here, we >@@ -331,22 +424,7 @@ > static int > vmbus_bus_init(void) > { >- struct ioapic_intsrc { >- struct intsrc io_intsrc; >- u_int io_irq; >- u_int io_intpin:8; >- u_int io_vector:8; >- u_int io_cpu:8; >- u_int io_activehi:1; >- u_int io_edgetrigger:1; >- u_int io_masked:1; >- int io_bus:4; >- uint32_t io_lowreg; >- }; >- int i, ret; >- unsigned int vector = 0; >- struct intsrc *isrc; >- struct ioapic_intsrc *intpin; >+ int i, j, n, ret; > > if (vmbus_inited) > return (0); >@@ -361,80 +439,100 @@ > return (ret); > } > >- ret = swi_add(&hv_msg_intr_event, "hv_msg", vmbus_msg_swintr, >- NULL, SWI_CLOCK, 0, &msg_swintr); >- >- if (ret) >- goto cleanup; >- > /* >- * Message SW interrupt handler checks a per-CPU page and >- * thus the thread needs to be bound to CPU-0 - which is where >- * all interrupts are processed. >+ * Find a free IDT slot for vmbus callback. > */ >- ret = intr_event_bind(hv_msg_intr_event, 0); >+ hv_vmbus_g_context.hv_cb_vector = vmbus_vector_alloc(); > >- if (ret) >- goto cleanup1; >+ if (hv_vmbus_g_context.hv_cb_vector == 0) { >+ if(bootverbose) >+ printf("Error VMBUS: Cannot find free IDT slot for " >+ "vmbus callback!\n"); >+ goto cleanup; >+ } > >- ret = swi_add(&hv_event_intr_event, "hv_event", hv_vmbus_on_events, >- NULL, SWI_CLOCK, 0, &event_swintr); >+ if(bootverbose) >+ printf("VMBUS: vmbus callback vector %d\n", >+ hv_vmbus_g_context.hv_cb_vector); > >- if (ret) >- goto cleanup1; >+ /* >+ * Notify the hypervisor of our vector. 
>+ */ >+ setup_args.vector = hv_vmbus_g_context.hv_cb_vector; > >- intr_res = bus_alloc_resource(vmbus_devp, >- SYS_RES_IRQ, &vmbus_rid, vmbus_irq, vmbus_irq, 1, RF_ACTIVE); >+ CPU_FOREACH(j) { >+ hv_vmbus_intr_cpu[j] = 0; >+ hv_vmbus_swintr_event_cpu[j] = 0; >+ hv_vmbus_g_context.hv_event_intr_event[j] = NULL; >+ hv_vmbus_g_context.hv_msg_intr_event[j] = NULL; >+ hv_vmbus_g_context.event_swintr[j] = NULL; >+ hv_vmbus_g_context.msg_swintr[j] = NULL; > >- if (intr_res == NULL) { >- ret = ENOMEM; /* XXXKYS: Need a better errno */ >- goto cleanup2; >+ for (i = 0; i < 2; i++) >+ setup_args.page_buffers[2 * j + i] = NULL; > } > > /* >- * Setup interrupt filter handler >+ * Per cpu setup. > */ >- ret = bus_setup_intr(vmbus_devp, intr_res, >- INTR_TYPE_NET | INTR_MPSAFE, hv_vmbus_isr, NULL, >- NULL, &vmbus_cookiep); >+ CPU_FOREACH(j) { >+ /* >+ * Setup software interrupt thread and handler for msg handling. >+ */ >+ ret = swi_add(&hv_vmbus_g_context.hv_msg_intr_event[j], >+ "hv_msg", vmbus_msg_swintr, (void *)(long)j, SWI_CLOCK, 0, >+ &hv_vmbus_g_context.msg_swintr[j]); >+ if (ret) { >+ if(bootverbose) >+ printf("VMBUS: failed to setup msg swi for " >+ "cpu %d\n", j); >+ goto cleanup1; >+ } > >- if (ret != 0) >- goto cleanup3; >+ /* >+ * Bind the swi thread to the cpu. >+ */ >+ ret = intr_event_bind(hv_vmbus_g_context.hv_msg_intr_event[j], >+ j); >+ if (ret) { >+ if(bootverbose) >+ printf("VMBUS: failed to bind msg swi thread " >+ "to cpu %d\n", j); >+ goto cleanup1; >+ } > >- ret = bus_bind_intr(vmbus_devp, intr_res, 0); >- if (ret != 0) >- goto cleanup4; >+ /* >+ * Setup software interrupt thread and handler for >+ * event handling. 
>+ */ >+ ret = swi_add(&hv_vmbus_g_context.hv_event_intr_event[j], >+ "hv_event", hv_vmbus_on_events, (void *)(long)j, >+ SWI_CLOCK, 0, &hv_vmbus_g_context.event_swintr[j]); >+ if (ret) { >+ if(bootverbose) >+ printf("VMBUS: failed to setup event swi for " >+ "cpu %d\n", j); >+ goto cleanup1; >+ } > >- isrc = intr_lookup_source(vmbus_irq); >- if ((isrc == NULL) || (isrc->is_event == NULL)) { >- ret = EINVAL; >- goto cleanup4; >- } >- >- /* vector = isrc->is_event->ie_vector; */ >- intpin = (struct ioapic_intsrc *)isrc; >- vector = intpin->io_vector; >- >- if(bootverbose) >- printf("VMBUS: irq 0x%x vector 0x%x\n", vmbus_irq, vector); >- >- /** >- * Notify the hypervisor of our irq. >- */ >- setup_args.vector = vector; >- for(i = 0; i < 2; i++) { >- setup_args.page_buffers[i] = >+ /* >+ * Prepare the per cpu msg and event pages to be called on each cpu. >+ */ >+ for(i = 0; i < 2; i++) { >+ setup_args.page_buffers[2 * j + i] = > malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT | M_ZERO); >- if (setup_args.page_buffers[i] == NULL) { >- KASSERT(setup_args.page_buffers[i] != NULL, >+ if (setup_args.page_buffers[2 * j + i] == NULL) { >+ KASSERT(setup_args.page_buffers[2 * j + i] != NULL, > ("Error VMBUS: malloc failed!")); >- if (i > 0) >- free(setup_args.page_buffers[0], M_DEVBUF); >- goto cleanup4; >+ goto cleanup1; >+ } > } > } > >- /* only CPU #0 supported at this time */ >+ if (bootverbose) >+ printf("VMBUS: Calling smp_rendezvous, smp_started = %d\n", >+ smp_started); >+ > smp_rendezvous(NULL, hv_vmbus_synic_init, NULL, &setup_args); > > /* >@@ -443,27 +541,33 @@ > ret = hv_vmbus_connect(); > > if (ret != 0) >- goto cleanup4; >+ goto cleanup1; > > hv_vmbus_request_channel_offers(); > return (ret); > >- cleanup4: >+ cleanup1: >+ /* >+ * Free pages alloc'ed >+ */ >+ for (n = 0; n < 2 * MAXCPU; n++) >+ if (setup_args.page_buffers[n] != NULL) >+ free(setup_args.page_buffers[n], M_DEVBUF); > > /* >- * remove swi, bus and intr resource >+ * remove swi and vmbus callback vector; > */ 
>- bus_teardown_intr(vmbus_devp, intr_res, vmbus_cookiep); >+ CPU_FOREACH(j) { >+ if (hv_vmbus_g_context.msg_swintr[j] != NULL) >+ swi_remove(hv_vmbus_g_context.msg_swintr[j]); >+ if (hv_vmbus_g_context.event_swintr[j] != NULL) >+ swi_remove(hv_vmbus_g_context.event_swintr[j]); >+ hv_vmbus_g_context.hv_msg_intr_event[j] = NULL; >+ hv_vmbus_g_context.hv_event_intr_event[j] = NULL; >+ } > >- cleanup3: >- bus_release_resource(vmbus_devp, SYS_RES_IRQ, vmbus_rid, intr_res); >+ vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector); > >- cleanup2: >- swi_remove(event_swintr); >- >- cleanup1: >- swi_remove(msg_swintr); >- > cleanup: > hv_vmbus_cleanup(); > >@@ -515,7 +619,7 @@ > > smp_rendezvous(NULL, hv_vmbus_synic_cleanup, NULL, NULL); > >- for(i = 0; i < 2; i++) { >+ for(i = 0; i < 2 * MAXCPU; i++) { > if (setup_args.page_buffers[i] != 0) > free(setup_args.page_buffers[i], M_DEVBUF); > } >@@ -522,14 +626,18 @@ > > hv_vmbus_cleanup(); > >- /* remove swi, bus and intr resource */ >- bus_teardown_intr(vmbus_devp, intr_res, vmbus_cookiep); >+ /* remove swi */ >+ CPU_FOREACH(i) { >+ if (hv_vmbus_g_context.msg_swintr[i] != NULL) >+ swi_remove(hv_vmbus_g_context.msg_swintr[i]); >+ if (hv_vmbus_g_context.event_swintr[i] != NULL) >+ swi_remove(hv_vmbus_g_context.event_swintr[i]); >+ hv_vmbus_g_context.hv_msg_intr_event[i] = NULL; >+ hv_vmbus_g_context.hv_event_intr_event[i] = NULL; >+ } > >- bus_release_resource(vmbus_devp, SYS_RES_IRQ, vmbus_rid, intr_res); >+ vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector); > >- swi_remove(msg_swintr); >- swi_remove(event_swintr); >- > return; > } > >@@ -603,6 +711,6 @@ > DRIVER_MODULE(vmbus, nexus, vmbus_driver, vmbus_devclass, vmbus_modevent, 0); > MODULE_VERSION(vmbus,1); > >-/* TODO: We want to be earlier than SI_SUB_VFS */ >-SYSINIT(vmb_init, SI_SUB_VFS, SI_ORDER_MIDDLE, vmbus_init, NULL); >+/* We want to be started after SMP is initialized */ >+SYSINIT(vmb_init, SI_SUB_SMP + 1, SI_ORDER_FIRST, vmbus_init, NULL); > >Index: 
sys/dev/hyperv/vmbus/hv_vmbus_priv.h >=================================================================== >--- sys/dev/hyperv/vmbus/hv_vmbus_priv.h (revision 1) >+++ sys/dev/hyperv/vmbus/hv_vmbus_priv.h (revision 3) >@@ -181,49 +181,30 @@ > > #define HV_HYPERCALL_PARAM_ALIGN sizeof(uint64_t) > >-/* >- * Connection identifier type >- */ >-typedef union { >- uint32_t as_uint32_t; >- struct { >- uint32_t id:24; >- uint32_t reserved:8; >- } u; >- >-} __packed hv_vmbus_connection_id; >- >-/* >- * Definition of the hv_vmbus_signal_event hypercall input structure >- */ > typedef struct { >- hv_vmbus_connection_id connection_id; >- uint16_t flag_number; >- uint16_t rsvd_z; >-} __packed hv_vmbus_input_signal_event; >- >-typedef struct { >- uint64_t align8; >- hv_vmbus_input_signal_event event; >-} __packed hv_vmbus_input_signal_event_buffer; >- >-typedef struct { > uint64_t guest_id; > void* hypercall_page; > hv_bool_uint8_t syn_ic_initialized; >+ >+ hv_vmbus_handle syn_ic_msg_page[MAXCPU]; >+ hv_vmbus_handle syn_ic_event_page[MAXCPU]; > /* >- * This is used as an input param to HV_CALL_SIGNAL_EVENT hypercall. >- * The input param is immutable in our usage and >- * must be dynamic mem (vs stack or global). >+ * For FreeBSD cpuid to Hyper-V vcpuid mapping. > */ >- hv_vmbus_input_signal_event_buffer *signal_event_buffer; >+ uint32_t hv_vcpu_index[MAXCPU]; > /* >- * 8-bytes aligned of the buffer above >+ * Each cpu has its own software interrupt handler for channel >+ * event and msg handling. > */ >- hv_vmbus_input_signal_event *signal_event_param; >- >- hv_vmbus_handle syn_ic_msg_page[MAXCPU]; >- hv_vmbus_handle syn_ic_event_page[MAXCPU]; >+ struct intr_event *hv_event_intr_event[MAXCPU]; >+ struct intr_event *hv_msg_intr_event[MAXCPU]; >+ void *event_swintr[MAXCPU]; >+ void *msg_swintr[MAXCPU]; >+ /* >+ * Host use this vector to intrrupt guest for vmbus channel >+ * event and msg. 
>+ */ >+ unsigned int hv_cb_vector; > } hv_vmbus_context; > > /* >@@ -368,7 +349,8 @@ > TAILQ_HEAD(, hv_vmbus_channel_msg_info) channel_msg_anchor; > struct mtx channel_msg_lock; > /** >- * List of channels >+ * List of primary channels. Sub channels will be linked >+ * under their primary channel. > */ > TAILQ_HEAD(, hv_vmbus_channel) channel_anchor; > struct mtx channel_lock; >@@ -560,6 +542,8 @@ > uint32_t flags32[HV_EVENT_FLAGS_DWORD_COUNT]; > } hv_vmbus_synic_event_flags; > >+/* MSR used to provide vcpu index */ >+#define HV_X64_MSR_VP_INDEX (0x40000002) > > /* > * Define synthetic interrupt controller model specific registers >@@ -618,7 +602,8 @@ > int hv_ring_buffer_write( > hv_vmbus_ring_buffer_info *ring_info, > hv_vmbus_sg_buffer_list sg_buffers[], >- uint32_t sg_buff_count); >+ uint32_t sg_buff_count, >+ boolean_t *need_sig); > > int hv_ring_buffer_peek( > hv_vmbus_ring_buffer_info *ring_info, >@@ -638,6 +623,12 @@ > hv_vmbus_ring_buffer_info *ring_info, > char *prefix); > >+void hv_ring_buffer_read_begin( >+ hv_vmbus_ring_buffer_info *ring_info); >+ >+uint32_t hv_ring_buffer_read_end( >+ hv_vmbus_ring_buffer_info *ring_info); >+ > hv_vmbus_channel* hv_vmbus_allocate_channel(void); > void hv_vmbus_free_vmbus_channel(hv_vmbus_channel *channel); > void hv_vmbus_on_channel_message(void *context); >@@ -652,7 +643,7 @@ > void *payload, > size_t payload_size); > >-uint16_t hv_vmbus_signal_event(void); >+uint16_t hv_vmbus_signal_event(void *con_id); > void hv_vmbus_synic_init(void *irq_arg); > void hv_vmbus_synic_cleanup(void *arg); > int hv_vmbus_query_hypervisor_presence(void); >@@ -674,7 +665,7 @@ > int hv_vmbus_connect(void); > int hv_vmbus_disconnect(void); > int hv_vmbus_post_message(void *buffer, size_t buf_size); >-int hv_vmbus_set_event(uint32_t child_rel_id); >+int hv_vmbus_set_event(hv_vmbus_channel *channel); > void hv_vmbus_on_events(void *); > > >@@ -718,7 +709,7 @@ > > typedef struct { > unsigned int vector; >- void *page_buffers[2]; >+ void 
*page_buffers[2 * MAXCPU]; > } hv_setup_args; > > #endif /* __HYPERV_PRIV_H__ */ >Index: sys/dev/hyperv/vmbus/hv_channel.c >=================================================================== >--- sys/dev/hyperv/vmbus/hv_channel.c (revision 1) >+++ sys/dev/hyperv/vmbus/hv_channel.c (revision 3) >@@ -75,7 +75,7 @@ > (uint32_t *)&monitor_page-> > trigger_group[channel->monitor_group].u.pending); > } else { >- hv_vmbus_set_event(channel->offer_msg.child_rel_id); >+ hv_vmbus_set_event(channel); > } > > } >@@ -99,6 +99,18 @@ > hv_vmbus_channel_open_channel* open_msg; > hv_vmbus_channel_msg_info* open_info; > >+ mtx_lock_spin(&new_channel->sc_lock); >+ if (new_channel->state == HV_CHANNEL_OPEN_STATE) { >+ new_channel->state = HV_CHANNEL_OPENING_STATE; >+ } else { >+ mtx_unlock_spin(&new_channel->sc_lock); >+ if(bootverbose) >+ printf("VMBUS: Trying to open channel <%p> which in " >+ "%d state.\n", new_channel, new_channel->state); >+ return (EINVAL); >+ } >+ mtx_unlock_spin(&new_channel->sc_lock); >+ > new_channel->on_channel_callback = pfn_on_channel_callback; > new_channel->channel_callback_context = context; > >@@ -162,7 +174,7 @@ > new_channel->ring_buffer_gpadl_handle; > open_msg->downstream_ring_buffer_page_offset = send_ring_buffer_size > >> PAGE_SHIFT; >- open_msg->server_context_area_gpadl_handle = 0; >+ open_msg->target_vcpu = new_channel->target_vcpu; > > if (user_data_len) > memcpy(open_msg->user_data, user_data, user_data_len); >@@ -182,10 +194,14 @@ > > ret = sema_timedwait(&open_info->wait_sema, 500); /* KYS 5 seconds */ > >- if (ret) >+ if (ret) { >+ if(bootverbose) >+ printf("VMBUS: channel <%p> open timeout.\n", new_channel); > goto cleanup; >+ } > > if (open_info->response.open_result.status == 0) { >+ new_channel->state = HV_CHANNEL_OPENED_STATE; > if(bootverbose) > printf("VMBUS: channel <%p> open success.\n", new_channel); > } else { >@@ -497,16 +513,20 @@ > return (ret); > } > >-/** >- * @brief Close the specified channel >- */ >-void 
>-hv_vmbus_channel_close(hv_vmbus_channel *channel) >+static void >+hv_vmbus_channel_close_internal(hv_vmbus_channel *channel) > { > int ret = 0; > hv_vmbus_channel_close_channel* msg; > hv_vmbus_channel_msg_info* info; > >+ channel->state = HV_CHANNEL_OPEN_STATE; >+ channel->sc_creation_callback = NULL; >+ >+ /* >+ * Grab the lock to prevent race condition when a packet received >+ * and unloading driver is in the process. >+ */ > mtx_lock(&channel->inbound_lock); > channel->on_channel_callback = NULL; > mtx_unlock(&channel->inbound_lock); >@@ -545,23 +565,37 @@ > M_DEVBUF); > > free(info, M_DEVBUF); >+} > >+/** >+ * @brief Close the specified channel >+ */ >+void >+hv_vmbus_channel_close(hv_vmbus_channel *channel) >+{ >+ hv_vmbus_channel* sub_channel; >+ >+ if (channel->primary_channel != NULL) { >+ /* >+ * We only close multi-channels when the primary is >+ * closed. >+ */ >+ return; >+ } >+ > /* >- * If we are closing the channel during an error path in >- * opening the channel, don't free the channel >- * since the caller will free the channel >+ * Close all multi-channels first. > */ >- if (channel->state == HV_CHANNEL_OPEN_STATE) { >- mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); >- TAILQ_REMOVE( >- &hv_vmbus_g_connection.channel_anchor, >- channel, >- list_entry); >- mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); >- >- hv_vmbus_free_vmbus_channel(channel); >+ TAILQ_FOREACH(sub_channel, &channel->sc_list_anchor, >+ sc_list_entry) { >+ if (sub_channel->state != HV_CHANNEL_OPENED_STATE) >+ continue; >+ hv_vmbus_channel_close_internal(sub_channel); > } >- >+ /* >+ * Then close the primary channel. 
>+ */ >+ hv_vmbus_channel_close_internal(channel); > } > > /** >@@ -581,6 +615,7 @@ > uint32_t packet_len; > uint64_t aligned_data; > uint32_t packet_len_aligned; >+ boolean_t need_sig; > hv_vmbus_sg_buffer_list buffer_list[3]; > > packet_len = sizeof(hv_vm_packet_descriptor) + buffer_len; >@@ -604,12 +639,11 @@ > buffer_list[2].data = &aligned_data; > buffer_list[2].length = packet_len_aligned - packet_len; > >- ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3); >+ ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3, >+ &need_sig); > > /* TODO: We should determine if this is optional */ >- if (ret == 0 >- && !hv_vmbus_get_ring_buffer_interrupt_mask( >- &channel->outbound)) { >+ if (ret == 0 && need_sig) { > vmbus_channel_set_event(channel); > } > >@@ -632,6 +666,7 @@ > > int ret = 0; > int i = 0; >+ boolean_t need_sig; > uint32_t packet_len; > uint32_t packetLen_aligned; > hv_vmbus_sg_buffer_list buffer_list[3]; >@@ -675,11 +710,11 @@ > buffer_list[2].data = &alignedData; > buffer_list[2].length = packetLen_aligned - packet_len; > >- ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3); >+ ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3, >+ &need_sig); > > /* TODO: We should determine if this is optional */ >- if (ret == 0 && >- !hv_vmbus_get_ring_buffer_interrupt_mask(&channel->outbound)) { >+ if (ret == 0 && need_sig) { > vmbus_channel_set_event(channel); > } > >@@ -700,6 +735,7 @@ > > int ret = 0; > uint32_t desc_size; >+ boolean_t need_sig; > uint32_t packet_len; > uint32_t packet_len_aligned; > uint32_t pfn_count; >@@ -750,11 +786,11 @@ > buffer_list[2].data = &aligned_data; > buffer_list[2].length = packet_len_aligned - packet_len; > >- ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3); >+ ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3, >+ &need_sig); > > /* TODO: We should determine if this is optional */ >- if (ret == 0 && >- 
!hv_vmbus_get_ring_buffer_interrupt_mask(&channel->outbound)) { >+ if (ret == 0 && need_sig) { > vmbus_channel_set_event(channel); > } > >Index: sys/dev/hyperv/vmbus/hv_ring_buffer.c >=================================================================== >--- sys/dev/hyperv/vmbus/hv_ring_buffer.c (revision 1) >+++ sys/dev/hyperv/vmbus/hv_ring_buffer.c (revision 3) >@@ -144,6 +144,69 @@ > return (uint64_t) ring_info->ring_buffer->write_index << 32; > } > >+void >+hv_ring_buffer_read_begin( >+ hv_vmbus_ring_buffer_info* ring_info) >+{ >+ ring_info->ring_buffer->interrupt_mask = 1; >+ mb(); >+} >+ >+uint32_t >+hv_ring_buffer_read_end( >+ hv_vmbus_ring_buffer_info* ring_info) >+{ >+ uint32_t read, write; >+ >+ ring_info->ring_buffer->interrupt_mask = 0; >+ mb(); >+ >+ /* >+ * Now check to see if the ring buffer is still empty. >+ * If it is not, we raced and we need to process new >+ * incoming messages. >+ */ >+ get_ring_buffer_avail_bytes(ring_info, &read, &write); >+ >+ return (read); >+} >+ >+/* >+ * When we write to the ring buffer, check if the host needs to >+ * be signaled. Here is the details of this protocol: >+ * >+ * 1. The host guarantees that while it is draining the >+ * ring buffer, it will set the interrupt_mask to >+ * indicate it does not need to be interrupted when >+ * new data is placed. >+ * >+ * 2. The host guarantees that it will completely drain >+ * the ring buffer before exiting the read loop. Further, >+ * once the ring buffer is empty, it will clear the >+ * interrupt_mask and re-check to see if new data has >+ * arrived. >+ */ >+static boolean_t >+hv_ring_buffer_needsig_on_write( >+ uint32_t old_write_location, >+ hv_vmbus_ring_buffer_info* rbi) >+{ >+ mb(); >+ if (rbi->ring_buffer->interrupt_mask) >+ return (FALSE); >+ >+ /* Read memory barrier */ >+ rmb(); >+ /* >+ * This is the only case we need to signal when the >+ * ring transitions from being empty to non-empty. 
>+ */ >+ if (old_write_location == rbi->ring_buffer->read_index) >+ return (TRUE); >+ >+ return (FALSE); >+} >+ > static uint32_t copy_to_ring_buffer( > hv_vmbus_ring_buffer_info* ring_info, > uint32_t start_write_offset, >@@ -204,11 +267,13 @@ > hv_ring_buffer_write( > hv_vmbus_ring_buffer_info* out_ring_info, > hv_vmbus_sg_buffer_list sg_buffers[], >- uint32_t sg_buffer_count) >+ uint32_t sg_buffer_count, >+ boolean_t *need_sig) > { > int i = 0; > uint32_t byte_avail_to_write; > uint32_t byte_avail_to_read; >+ uint32_t old_write_location; > uint32_t total_bytes_to_write = 0; > > volatile uint32_t next_write_location; >@@ -242,6 +307,8 @@ > */ > next_write_location = get_next_write_location(out_ring_info); > >+ old_write_location = next_write_location; >+ > for (i = 0; i < sg_buffer_count; i++) { > next_write_location = copy_to_ring_buffer(out_ring_info, > next_write_location, (char *) sg_buffers[i].data, >@@ -258,9 +325,9 @@ > (char *) &prev_indices, sizeof(uint64_t)); > > /* >- * Make sure we flush all writes before updating the writeIndex >+ * Full memory barrier before updating the write index. 
> */ >- wmb(); >+ mb(); > > /* > * Now, update the write location > */ >@@ -269,6 +336,9 @@ > > mtx_unlock_spin(&out_ring_info->ring_lock); > >+ *need_sig = hv_ring_buffer_needsig_on_write(old_write_location, >+ out_ring_info); >+ > return (0); > } > >Index: sys/dev/hyperv/vmbus/hv_channel_mgmt.c >=================================================================== >--- sys/dev/hyperv/vmbus/hv_channel_mgmt.c (revision 1) >+++ sys/dev/hyperv/vmbus/hv_channel_mgmt.c (revision 3) >@@ -50,6 +50,7 @@ > static void vmbus_channel_on_offers_delivered(hv_vmbus_channel_msg_header* hdr); > static void vmbus_channel_on_version_response(hv_vmbus_channel_msg_header* hdr); > static void vmbus_channel_process_offer(void *context); >+struct hv_vmbus_channel* vmbus_select_outgoing_channel(struct hv_vmbus_channel *primary); > > /** > * Channel message dispatch table >@@ -233,7 +234,10 @@ > return (NULL); > > mtx_init(&channel->inbound_lock, "channel inbound", NULL, MTX_DEF); >+ mtx_init(&channel->sc_lock, "vmbus multi channel", NULL, MTX_SPIN); > >+ TAILQ_INIT(&channel->sc_list_anchor); >+ > channel->control_work_queue = hv_work_queue_create("control"); > > if (channel->control_work_queue == NULL) { >@@ -262,6 +266,7 @@ > void > hv_vmbus_free_vmbus_channel(hv_vmbus_channel* channel) > { >+ mtx_destroy(&channel->sc_lock); > mtx_destroy(&channel->inbound_lock); > /* > * We have to release the channel's workqueue/thread in >@@ -279,10 +284,10 @@ > static void > vmbus_channel_process_offer(void *context) > { >- int ret; > hv_vmbus_channel* new_channel; > boolean_t f_new; > hv_vmbus_channel* channel; >+ int ret; > > new_channel = (hv_vmbus_channel*) context; > f_new = TRUE; >@@ -296,33 +301,71 @@ > TAILQ_FOREACH(channel, &hv_vmbus_g_connection.channel_anchor, > list_entry) > { >- if (!memcmp( >- &channel->offer_msg.offer.interface_type, >- &new_channel->offer_msg.offer.interface_type, >- sizeof(hv_guid)) >- && !memcmp( >- &channel->offer_msg.offer.interface_instance, >+ if (!memcmp( 
&channel->offer_msg.offer.interface_type, >+ &new_channel->offer_msg.offer.interface_type, >+ sizeof(hv_guid)) && >+ !memcmp(&channel->offer_msg.offer.interface_instance, > &new_channel->offer_msg.offer.interface_instance, > sizeof(hv_guid))) { >- f_new = FALSE; >- break; >- } >+ f_new = FALSE; >+ break; >+ } > } > > if (f_new) { >- /* Insert at tail */ >- TAILQ_INSERT_TAIL( >- &hv_vmbus_g_connection.channel_anchor, >- new_channel, >- list_entry); >+ /* Insert at tail */ >+ TAILQ_INSERT_TAIL( >+ &hv_vmbus_g_connection.channel_anchor, >+ new_channel, >+ list_entry); > } > mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); > >+ /*XXX add new channel to percpu_list */ >+ > if (!f_new) { >+ /* >+ * Check if this is a sub channel. >+ */ >+ if (new_channel->offer_msg.offer.sub_channel_index != 0) { >+ /* >+ * It is a sub channel offer, process it. >+ */ >+ new_channel->primary_channel = channel; >+ mtx_lock_spin(&channel->sc_lock); >+ TAILQ_INSERT_TAIL( >+ &channel->sc_list_anchor, >+ new_channel, >+ sc_list_entry); >+ mtx_unlock_spin(&channel->sc_lock); >+ >+ /* Insert new channel into channel_anchor. 
*/ >+ printf("Storvsc get multi-channel offer, rel=%u.\n", >+ new_channel->offer_msg.child_rel_id); >+ mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); >+ TAILQ_INSERT_TAIL(&hv_vmbus_g_connection.channel_anchor, >+ new_channel, list_entry); >+ mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); >+ >+ if(bootverbose) >+ printf("VMBUS: new multi-channel offer <%p>.\n", >+ new_channel); >+ >+ /*XXX add it to percpu_list */ >+ >+ new_channel->state = HV_CHANNEL_OPEN_STATE; >+ if (channel->sc_creation_callback != NULL) { >+ channel->sc_creation_callback(new_channel); >+ } >+ return; >+ } >+ > hv_vmbus_free_vmbus_channel(new_channel); > return; > } > >+ new_channel->state = HV_CHANNEL_OPEN_STATE; >+ > /* > * Start the process of binding this offer to the driver > * (We need to set the device field before calling >@@ -333,13 +376,6 @@ > new_channel->offer_msg.offer.interface_instance, new_channel); > > /* >- * TODO - the HV_CHANNEL_OPEN_STATE flag should not be set below >- * but in the "open" channel request. The ret != 0 logic below >- * doesn't take into account that a channel >- * may have been opened successfully >- */ >- >- /* > * Add the new device to the bus. This will kick off device-driver > * binding which eventually invokes the device driver's AddDevice() > * method. 
>@@ -346,22 +382,80 @@ > */ > ret = hv_vmbus_child_device_register(new_channel->device); > if (ret != 0) { >- mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); >- TAILQ_REMOVE( >- &hv_vmbus_g_connection.channel_anchor, >- new_channel, >- list_entry); >- mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); >- hv_vmbus_free_vmbus_channel(new_channel); >- } else { >- /* >- * This state is used to indicate a successful open >- * so that when we do close the channel normally, >- * we can clean up properly >- */ >- new_channel->state = HV_CHANNEL_OPEN_STATE; >+ mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); >+ TAILQ_REMOVE( >+ &hv_vmbus_g_connection.channel_anchor, >+ new_channel, >+ list_entry); >+ mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); >+ hv_vmbus_free_vmbus_channel(new_channel); >+ } >+} > >+/** >+ * Array of device guids that are performance critical. We try to distribute >+ * the interrupt load for these devices across all online cpus. >+ */ >+static const hv_guid high_perf_devices[] = { >+ {HV_NIC_GUID, }, >+ {HV_IDE_GUID, }, >+ {HV_SCSI_GUID, }, >+}; >+ >+enum { >+ PERF_CHN_NIC = 0, >+ PERF_CHN_IDE, >+ PERF_CHN_SCSI, >+ MAX_PERF_CHN, >+}; >+ >+/* >+ * We use this static number to distribute the channel interrupt load. >+ */ >+static uint32_t next_vcpu; >+ >+/** >+ * Starting with Win8, we can statically distribute the incoming >+ * channel interrupt load by binding a channel to VCPU. We >+ * implement here a simple round robin scheme for distributing >+ * the interrupt load. >+ * We will bind channels that are not performance critical to cpu 0 and >+ * performance critical channels (IDE, SCSI and Network) will be uniformly >+ * distributed across all available CPUs. 
>+ */ >+static void >+vmbus_channel_select_cpu(hv_vmbus_channel *channel, hv_guid *guid) >+{ >+ uint32_t current_cpu; >+ int i; >+ boolean_t is_perf_channel = FALSE; >+ >+ for (i = PERF_CHN_NIC; i < MAX_PERF_CHN; i++) { >+ if (!memcmp(guid->data, high_perf_devices[i].data, >+ sizeof(hv_guid))) { >+ is_perf_channel = TRUE; >+ break; >+ } > } >+ >+ if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) || >+ (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7) || >+ (!is_perf_channel)) { >+ /* Host's view of guest cpu */ >+ channel->target_vcpu = 0; >+ /* Guest's own view of cpu */ >+ channel->target_cpu = 0; >+ return; >+ } >+ /* mp_ncpus should have the number of cpus currently online */ >+ current_cpu = (++next_vcpu % mp_ncpus); >+ channel->target_cpu = current_cpu; >+ channel->target_vcpu = >+ hv_vmbus_g_context.hv_vcpu_index[current_cpu]; >+ if (bootverbose) >+ printf("VMBUS: Total online cpus %d, assign perf channel %d " >+ "to vcpu %d, cpu %d\n", mp_ncpus, i, channel->target_vcpu, >+ current_cpu); > } > > /** >@@ -391,6 +485,38 @@ > if (new_channel == NULL) > return; > >+ /* >+ * By default we setup state to enable batched >+ * reading. A specific service can choose to >+ * disable this prior to opening the channel. 
>+ */ >+ new_channel->batched_reading = TRUE; >+ >+ new_channel->signal_event_param = >+ (hv_vmbus_input_signal_event *) >+ (HV_ALIGN_UP((unsigned long) >+ &new_channel->signal_event_buffer, >+ HV_HYPERCALL_PARAM_ALIGN)); >+ >+ new_channel->signal_event_param->connection_id.as_uint32_t = 0; >+ new_channel->signal_event_param->connection_id.u.id = >+ HV_VMBUS_EVENT_CONNECTION_ID; >+ new_channel->signal_event_param->flag_number = 0; >+ new_channel->signal_event_param->rsvd_z = 0; >+ >+ if (hv_vmbus_protocal_version != HV_VMBUS_VERSION_WS2008) { >+ new_channel->is_dedicated_interrupt = >+ (offer->is_dedicated_interrupt != 0); >+ new_channel->signal_event_param->connection_id.u.id = >+ offer->connection_id; >+ } >+ >+ /* >+ * Bind the channel to a chosen cpu. >+ */ >+ vmbus_channel_select_cpu(new_channel, >+ &offer->offer.interface_type); >+ > memcpy(&new_channel->offer_msg, offer, > sizeof(hv_vmbus_channel_offer_channel)); > new_channel->monitor_group = (uint8_t) offer->monitor_id / 32; >@@ -678,3 +804,60 @@ > } > mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); > } >+ >+/** >+ * @brief Select the best outgoing channel >+ * >+ * The channel whose vcpu binding is closest to the current vcpu will >+ * be selected. 
>+ * If no multi-channel, always select primary channel >+ * >+ * @param primary - primary channel >+ */ >+struct hv_vmbus_channel * >+vmbus_select_outgoing_channel(struct hv_vmbus_channel *primary) >+{ >+ hv_vmbus_channel *new_channel = NULL; >+ hv_vmbus_channel *outgoing_channel = primary; >+ int old_cpu_distance = 0; >+ int new_cpu_distance = 0; >+ int cur_vcpu = 0; >+ int smp_pro_id = PCPU_GET(cpuid); >+ >+ if (TAILQ_EMPTY(&primary->sc_list_anchor)) { >+ return outgoing_channel; >+ } >+ >+ if (smp_pro_id >= MAXCPU) { >+ return outgoing_channel; >+ } >+ >+ cur_vcpu = hv_vmbus_g_context.hv_vcpu_index[smp_pro_id]; >+ >+ TAILQ_FOREACH(new_channel, &primary->sc_list_anchor, sc_list_entry) { >+ if (new_channel->state != HV_CHANNEL_OPENED_STATE){ >+ continue; >+ } >+ >+ if (new_channel->target_vcpu == cur_vcpu){ >+ return new_channel; >+ } >+ >+ old_cpu_distance = ((outgoing_channel->target_vcpu > cur_vcpu) ? >+ (outgoing_channel->target_vcpu - cur_vcpu) : >+ (cur_vcpu - outgoing_channel->target_vcpu)); >+ >+ new_cpu_distance = ((new_channel->target_vcpu > cur_vcpu) ? 
>+ (new_channel->target_vcpu - cur_vcpu) : >+ (cur_vcpu - new_channel->target_vcpu)); >+ >+ if (old_cpu_distance < new_cpu_distance) { >+ continue; >+ } >+ >+ outgoing_channel = new_channel; >+ } >+ >+ return outgoing_channel; >+} >+ >Index: sys/dev/hyperv/vmbus/hv_hv.c >=================================================================== >--- sys/dev/hyperv/vmbus/hv_hv.c (revision 1) >+++ sys/dev/hyperv/vmbus/hv_hv.c (revision 3) >@@ -67,8 +67,6 @@ > hv_vmbus_context hv_vmbus_g_context = { > .syn_ic_initialized = FALSE, > .hypercall_page = NULL, >- .signal_event_param = NULL, >- .signal_event_buffer = NULL, > }; > > static struct timecounter hv_timecounter = { >@@ -256,28 +254,6 @@ > > hv_vmbus_g_context.hypercall_page = virt_addr; > >- /* >- * Setup the global signal event param for the signal event hypercall >- */ >- hv_vmbus_g_context.signal_event_buffer = >- malloc(sizeof(hv_vmbus_input_signal_event_buffer), M_DEVBUF, >- M_ZERO | M_NOWAIT); >- KASSERT(hv_vmbus_g_context.signal_event_buffer != NULL, >- ("Error VMBUS: Failed to allocate signal_event_buffer\n")); >- if (hv_vmbus_g_context.signal_event_buffer == NULL) >- goto cleanup; >- >- hv_vmbus_g_context.signal_event_param = >- (hv_vmbus_input_signal_event*) >- (HV_ALIGN_UP((unsigned long) >- hv_vmbus_g_context.signal_event_buffer, >- HV_HYPERCALL_PARAM_ALIGN)); >- hv_vmbus_g_context.signal_event_param->connection_id.as_uint32_t = 0; >- hv_vmbus_g_context.signal_event_param->connection_id.u.id = >- HV_VMBUS_EVENT_CONNECTION_ID; >- hv_vmbus_g_context.signal_event_param->flag_number = 0; >- hv_vmbus_g_context.signal_event_param->rsvd_z = 0; >- > tc_init(&hv_timecounter); /* register virtual timecount */ > > return (0); >@@ -303,12 +279,6 @@ > { > hv_vmbus_x64_msr_hypercall_contents hypercall_msr; > >- if (hv_vmbus_g_context.signal_event_buffer != NULL) { >- free(hv_vmbus_g_context.signal_event_buffer, M_DEVBUF); >- hv_vmbus_g_context.signal_event_buffer = NULL; >- hv_vmbus_g_context.signal_event_param = NULL; >- 
} >- > if (hv_vmbus_g_context.guest_id == HV_FREEBSD_GUEST_ID) { > if (hv_vmbus_g_context.hypercall_page != NULL) { > hypercall_msr.as_uint64_t = 0; >@@ -370,13 +340,13 @@ > * event IPC. (This involves a hypercall.) > */ > hv_vmbus_status >-hv_vmbus_signal_event() >+hv_vmbus_signal_event(void *con_id) > { > hv_vmbus_status status; > > status = hv_vmbus_do_hypercall( > HV_CALL_SIGNAL_EVENT, >- hv_vmbus_g_context.signal_event_param, >+ con_id, > 0) & 0xFFFF; > > return (status); >@@ -390,6 +360,7 @@ > > { > int cpu; >+ uint64_t hv_vcpu_index; > hv_vmbus_synic_simp simp; > hv_vmbus_synic_siefp siefp; > hv_vmbus_synic_scontrol sctrl; >@@ -403,23 +374,14 @@ > return; > > /* >- * KYS: Looks like we can only initialize on cpu0; don't we support >- * SMP guests? >- * >- * TODO: Need to add SMP support for FreeBSD V9 >- */ >- >- if (cpu != 0) >- return; >- >- /* > * TODO: Check the version > */ > version = rdmsr(HV_X64_MSR_SVERSION); >- > >- hv_vmbus_g_context.syn_ic_msg_page[cpu] = setup_args->page_buffers[0]; >- hv_vmbus_g_context.syn_ic_event_page[cpu] = setup_args->page_buffers[1]; >+ hv_vmbus_g_context.syn_ic_msg_page[cpu] = >+ setup_args->page_buffers[2 * cpu]; >+ hv_vmbus_g_context.syn_ic_event_page[cpu] = >+ setup_args->page_buffers[2 * cpu + 1]; > > /* > * Setup the Synic's message page >@@ -443,9 +405,10 @@ > wrmsr(HV_X64_MSR_SIEFP, siefp.as_uint64_t); > > /*HV_SHARED_SINT_IDT_VECTOR + 0x20; */ >+ shared_sint.as_uint64_t = 0; > shared_sint.u.vector = setup_args->vector; > shared_sint.u.masked = FALSE; >- shared_sint.u.auto_eoi = FALSE; >+ shared_sint.u.auto_eoi = TRUE; > > wrmsr(HV_X64_MSR_SINT0 + HV_VMBUS_MESSAGE_SINT, > shared_sint.as_uint64_t); >@@ -458,6 +421,13 @@ > > hv_vmbus_g_context.syn_ic_initialized = TRUE; > >+ /* >+ * Set up the cpuid mapping from Hyper-V to FreeBSD. >+ * The array is indexed using FreeBSD cpuid. 
>+ */ >+ hv_vcpu_index = rdmsr(HV_X64_MSR_VP_INDEX); >+ hv_vmbus_g_context.hv_vcpu_index[cpu] = (uint32_t)hv_vcpu_index; >+ > return; > } > >@@ -469,14 +439,10 @@ > hv_vmbus_synic_sint shared_sint; > hv_vmbus_synic_simp simp; > hv_vmbus_synic_siefp siefp; >- int cpu = PCPU_GET(cpuid); > > if (!hv_vmbus_g_context.syn_ic_initialized) > return; > >- if (cpu != 0) >- return; /* TODO: XXXKYS: SMP? */ >- > shared_sint.as_uint64_t = rdmsr( > HV_X64_MSR_SINT0 + HV_VMBUS_MESSAGE_SINT); > >Index: sys/dev/hyperv/vmbus/hv_connection.c >=================================================================== >--- sys/dev/hyperv/vmbus/hv_connection.c (revision 1) >+++ sys/dev/hyperv/vmbus/hv_connection.c (revision 3) >@@ -45,14 +45,113 @@ > { .connect_state = HV_DISCONNECTED, > .next_gpadl_handle = 0xE1E10, }; > >+uint32_t hv_vmbus_protocal_version = HV_VMBUS_VERSION_WS2008; >+ >+static uint32_t >+hv_vmbus_get_next_version(uint32_t current_ver) >+{ >+ switch (current_ver) { >+ case (HV_VMBUS_VERSION_WIN7): >+ return HV_VMBUS_VERSION_WS2008; >+ >+ case (HV_VMBUS_VERSION_WIN8): >+ return HV_VMBUS_VERSION_WIN7; >+ >+ case (HV_VMBUS_VERSION_WIN8_1): >+ return HV_VMBUS_VERSION_WIN8; >+ >+ case (HV_VMBUS_VERSION_WS2008): >+ default: >+ return HV_VMBUS_VERSION_INVALID; >+ } >+} >+ > /** >+ * Negotiate the highest supported hypervisor version. 
>+ */ >+static int >+hv_vmbus_negotiate_version(hv_vmbus_channel_msg_info *msg_info, >+ uint32_t version) >+{ >+ int ret = 0; >+ hv_vmbus_channel_initiate_contact *msg; >+ >+ sema_init(&msg_info->wait_sema, 0, "Msg Info Sema"); >+ msg = (hv_vmbus_channel_initiate_contact*) msg_info->msg; >+ >+ msg->header.message_type = HV_CHANNEL_MESSAGE_INITIATED_CONTACT; >+ msg->vmbus_version_requested = version; >+ >+ msg->interrupt_page = hv_get_phys_addr( >+ hv_vmbus_g_connection.interrupt_page); >+ >+ msg->monitor_page_1 = hv_get_phys_addr( >+ hv_vmbus_g_connection.monitor_pages); >+ >+ msg->monitor_page_2 = >+ hv_get_phys_addr( >+ ((uint8_t *) hv_vmbus_g_connection.monitor_pages >+ + PAGE_SIZE)); >+ >+ /** >+ * Add to list before we send the request since we may receive the >+ * response before returning from this routine >+ */ >+ mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); >+ >+ TAILQ_INSERT_TAIL( >+ &hv_vmbus_g_connection.channel_msg_anchor, >+ msg_info, >+ msg_list_entry); >+ >+ mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); >+ >+ ret = hv_vmbus_post_message( >+ msg, >+ sizeof(hv_vmbus_channel_initiate_contact)); >+ >+ if (ret != 0) { >+ mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); >+ TAILQ_REMOVE( >+ &hv_vmbus_g_connection.channel_msg_anchor, >+ msg_info, >+ msg_list_entry); >+ mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); >+ return (ret); >+ } >+ >+ /** >+ * Wait for the connection response >+ */ >+ ret = sema_timedwait(&msg_info->wait_sema, 500); /* KYS 5 seconds */ >+ >+ mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); >+ TAILQ_REMOVE( >+ &hv_vmbus_g_connection.channel_msg_anchor, >+ msg_info, >+ msg_list_entry); >+ mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); >+ >+ /** >+ * Check if successful >+ */ >+ if (msg_info->response.version_response.version_supported) { >+ hv_vmbus_g_connection.connect_state = HV_CONNECTED; >+ } else { >+ ret = ECONNREFUSED; >+ } >+ >+ return (ret); >+} >+ >+/** > * Send a 
connect request on the partition service connection > */ > int > hv_vmbus_connect(void) { > int ret = 0; >+ uint32_t version; > hv_vmbus_channel_msg_info* msg_info = NULL; >- hv_vmbus_channel_initiate_contact* msg; > > /** > * Make sure we are not connecting or connected >@@ -130,72 +229,31 @@ > goto cleanup; > } > >- sema_init(&msg_info->wait_sema, 0, "Msg Info Sema"); >- msg = (hv_vmbus_channel_initiate_contact*) msg_info->msg; >- >- msg->header.message_type = HV_CHANNEL_MESSAGE_INITIATED_CONTACT; >- msg->vmbus_version_requested = HV_VMBUS_REVISION_NUMBER; >- >- msg->interrupt_page = hv_get_phys_addr( >- hv_vmbus_g_connection.interrupt_page); >- >- msg->monitor_page_1 = hv_get_phys_addr( >- hv_vmbus_g_connection.monitor_pages); >- >- msg->monitor_page_2 = >- hv_get_phys_addr( >- ((uint8_t *) hv_vmbus_g_connection.monitor_pages >- + PAGE_SIZE)); >- >- /** >- * Add to list before we send the request since we may receive the >- * response before returning from this routine >+ /* >+ * Find the highest vmbus version number we can support. > */ >- mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); >+ version = HV_VMBUS_VERSION_CURRENT; > >- TAILQ_INSERT_TAIL( >- &hv_vmbus_g_connection.channel_msg_anchor, >- msg_info, >- msg_list_entry); >+ do { >+ ret = hv_vmbus_negotiate_version(msg_info, version); >+ if (ret == EWOULDBLOCK) { >+ /* >+ * We timed out. 
>+ */ >+ goto cleanup; >+ } > >- mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); >+ if (hv_vmbus_g_connection.connect_state == HV_CONNECTED) >+ break; > >- ret = hv_vmbus_post_message( >- msg, >- sizeof(hv_vmbus_channel_initiate_contact)); >+ version = hv_vmbus_get_next_version(version); >+ } while (version != HV_VMBUS_VERSION_INVALID); > >- if (ret != 0) { >- mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); >- TAILQ_REMOVE( >- &hv_vmbus_g_connection.channel_msg_anchor, >- msg_info, >- msg_list_entry); >- mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); >- goto cleanup; >- } >+ hv_vmbus_protocal_version = version; >+ if (bootverbose) >+ printf("VMBUS: Protocol Version: %d.%d\n", >+ version >> 16, version & 0xFFFF); > >- /** >- * Wait for the connection response >- */ >- ret = sema_timedwait(&msg_info->wait_sema, 500); /* KYS 5 seconds */ >- >- mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); >- TAILQ_REMOVE( >- &hv_vmbus_g_connection.channel_msg_anchor, >- msg_info, >- msg_list_entry); >- mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); >- >- /** >- * Check if successful >- */ >- if (msg_info->response.version_response.version_supported) { >- hv_vmbus_g_connection.connect_state = HV_CONNECTED; >- } else { >- ret = ECONNREFUSED; >- goto cleanup; >- } >- > sema_destroy(&msg_info->wait_sema); > free(msg_info, M_DEVBUF); > >@@ -306,7 +364,10 @@ > static void > VmbusProcessChannelEvent(uint32_t relid) > { >+ void* arg; >+ uint32_t bytes_to_read; > hv_vmbus_channel* channel; >+ boolean_t is_batched_reading; > > /** > * Find the channel based on this relid and invokes >@@ -329,11 +390,40 @@ > > mtx_lock(&channel->inbound_lock); > if (channel->on_channel_callback != NULL) { >- channel->on_channel_callback(channel->channel_callback_context); >+ arg = channel->channel_callback_context; >+ is_batched_reading = channel->batched_reading; >+ /* >+ * Optimize host to guest signaling by ensuring: >+ * 1. 
While reading the channel, we disable interrupts from >+ * host. >+ * 2. Ensure that we process all posted messages from the host >+ * before returning from this callback. >+ * 3. Once we return, enable signaling from the host. Once this >+ * state is set we check to see if additional packets are >+ * available to read. In this case we repeat the process. >+ */ >+ do { >+ if (is_batched_reading) >+ hv_ring_buffer_read_begin(&channel->inbound); >+ >+ channel->on_channel_callback(arg); >+ >+ if (is_batched_reading) >+ bytes_to_read = >+ hv_ring_buffer_read_end(&channel->inbound); >+ else >+ bytes_to_read = 0; >+ } while (is_batched_reading && (bytes_to_read != 0)); > } > mtx_unlock(&channel->inbound_lock); > } > >+#ifdef HV_DEBUG_INTR >+extern uint32_t hv_intr_count; >+extern uint32_t hv_vmbus_swintr_event_cpu[MAXCPU]; >+extern uint32_t hv_vmbus_intr_cpu[MAXCPU]; >+#endif >+ > /** > * Handler for events > */ >@@ -340,19 +430,52 @@ > void > hv_vmbus_on_events(void *arg) > { >+ int bit; >+ int cpu; > int dword; >- int bit; >+ void *page_addr; >+ uint32_t* recv_interrupt_page = NULL; > int rel_id; >- int maxdword = HV_MAX_NUM_CHANNELS_SUPPORTED >> 5; >+ int maxdword; >+ hv_vmbus_synic_event_flags *event; > /* int maxdword = PAGE_SIZE >> 3; */ > >- /* >- * receive size is 1/2 page and divide that by 4 bytes >- */ >+ cpu = (int)(long)arg; >+ KASSERT(cpu <= mp_maxid, ("VMBUS: hv_vmbus_on_events: " >+ "cpu out of range!")); > >- uint32_t* recv_interrupt_page = >- hv_vmbus_g_connection.recv_interrupt_page; >+#ifdef HV_DEBUG_INTR >+ int i; >+ hv_vmbus_swintr_event_cpu[cpu]++; >+ if (hv_intr_count % 10000 == 0) { >+ printf("VMBUS: Total interrupt %d\n", hv_intr_count); >+ for (i = 0; i < mp_ncpus; i++) >+ printf("VMBUS: hw cpu[%d]: %d, event sw intr cpu[%d]: %d\n", >+ i, hv_vmbus_intr_cpu[i], i, hv_vmbus_swintr_event_cpu[i]); >+ } >+#endif > >+ if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) || >+ (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7)) { >+ maxdword = 
HV_MAX_NUM_CHANNELS_SUPPORTED >> 5; >+ /* >+ * receive size is 1/2 page and divide that by 4 bytes >+ */ >+ recv_interrupt_page = >+ hv_vmbus_g_connection.recv_interrupt_page; >+ } else { >+ /* >+ * On Host with Win8 or above, the event page can be >+ * checked directly to get the id of the channel >+ * that has the pending interrupt. >+ */ >+ maxdword = HV_EVENT_FLAGS_DWORD_COUNT; >+ page_addr = hv_vmbus_g_context.syn_ic_event_page[cpu]; >+ event = (hv_vmbus_synic_event_flags *) >+ page_addr + HV_VMBUS_MESSAGE_SINT; >+ recv_interrupt_page = event->flags32; >+ } >+ > /* > * Check events > */ >@@ -416,8 +539,9 @@ > * Send an event notification to the parent > */ > int >-hv_vmbus_set_event(uint32_t child_rel_id) { >+hv_vmbus_set_event(hv_vmbus_channel *channel) { > int ret = 0; >+ uint32_t child_rel_id = channel->offer_msg.child_rel_id; > > /* Each uint32_t represents 32 channels */ > >@@ -424,8 +548,7 @@ > synch_set_bit(child_rel_id & 31, > (((uint32_t *)hv_vmbus_g_connection.send_interrupt_page > + (child_rel_id >> 5)))); >- ret = hv_vmbus_signal_event(); >+ ret = hv_vmbus_signal_event(channel->signal_event_param); > > return (ret); > } >- >Index: sys/x86/include/apicvar.h >=================================================================== >--- sys/x86/include/apicvar.h (revision 1) >+++ sys/x86/include/apicvar.h (revision 3) >@@ -416,6 +416,7 @@ > void lapic_handle_intr(int vector, struct trapframe *frame); > void lapic_handle_timer(struct trapframe *frame); > void xen_intr_handle_upcall(struct trapframe *frame); >+void hv_vector_handler(struct trapframe *frame); > > #endif /* !LOCORE */ > #endif /* _X86_APICVAR_H_ */ >Index: sys/i386/i386/apic_vector.s >=================================================================== >--- sys/i386/i386/apic_vector.s (revision 1) >+++ sys/i386/i386/apic_vector.s (revision 3) >@@ -157,6 +157,23 @@ > jmp doreti > #endif > >+/* >+ * This is the Hyper-V vmbus channel direct callback interrupt. 
>+ * Only used when it is running on Hyper-V. >+ */ >+ .text >+ SUPERALIGN_TEXT >+IDTVEC(hv_vmbus_callback) >+ PUSH_FRAME >+ SET_KERNEL_SREGS >+ cld >+ FAKE_MCOUNT(TF_EIP(%esp)) >+ pushl %esp >+ call hv_vector_handler >+ add $4, %esp >+ MEXITCOUNT >+ jmp doreti >+ > #ifdef SMP > /* > * Global address space TLB shootdown. >Index: sys/amd64/amd64/apic_vector.S >=================================================================== >--- sys/amd64/amd64/apic_vector.S (revision 1) >+++ sys/amd64/amd64/apic_vector.S (revision 3) >@@ -150,6 +150,20 @@ > jmp doreti > #endif > >+/* >+ * This is the Hyper-V vmbus channel direct callback interrupt. >+ * Only used when it is running on Hyper-V. >+ */ >+ .text >+ SUPERALIGN_TEXT >+IDTVEC(hv_vmbus_callback) >+ PUSH_FRAME >+ FAKE_MCOUNT(TF_RIP(%rsp)) >+ movq %rsp, %rdi >+ call hv_vector_handler >+ MEXITCOUNT >+ jmp doreti >+ > #ifdef SMP > /* > * Global address space TLB shootdown.
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
View Attachment As Diff
View Attachment As Raw
Actions:
View
|
Diff
Attachments on
bug 195238
: 149948