From 240ad77cb50d9f0a961fcb0f21e67939cf7a9c04 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 6 Apr 2020 02:15:10 +0200 Subject: PCI: hv: Prepare hv_compose_msi_msg() for the VMBus-channel-interrupt-to-vCPU reassignment functionality The current implementation of hv_compose_msi_msg() is incompatible with the new functionality that allows changing the vCPU a VMBus channel will interrupt: if this function always calls hv_pci_onchannelcallback() in the polling loop, the interrupt going to a different CPU could cause hv_pci_onchannelcallback() to be running simultaneously in a tasklet, which will break. The current code also has a problem in that it is not synchronized with vmbus_reset_channel_cb(): hv_compose_msi_msg() could be accessing the ring buffer via the call of hv_pci_onchannelcallback() well after the time that vmbus_reset_channel_cb() has finished. Fix these issues as follows. Disable the channel tasklet before entering the polling loop in hv_compose_msi_msg() and re-enable it when done. This will prevent hv_pci_onchannelcallback() from running in a tasklet on a different CPU. Moreover, poll by always calling hv_pci_onchannelcallback(), but check the channel callback function for NULL and invoke the callback within a sched_lock critical section. This will prevent hv_compose_msi_msg() from accessing the ring buffer after vmbus_reset_channel_cb() has acquired the sched_lock spinlock. Suggested-by: Michael Kelley Signed-off-by: Andrea Parri (Microsoft) Cc: Lorenzo Pieralisi Cc: Andrew Murray Cc: Bjorn Helgaas Cc: Link: https://lore.kernel.org/r/20200406001514.19876-8-parri.andrea@gmail.com Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 44 +++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 16 deletions(-) (limited to 'drivers/pci/controller/pci-hyperv.c') diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index e15022ff63e3..222ff5639ebe 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -1356,11 +1356,11 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) { struct irq_cfg *cfg = irqd_cfg(data); struct hv_pcibus_device *hbus; + struct vmbus_channel *channel; struct hv_pci_dev *hpdev; struct pci_bus *pbus; struct pci_dev *pdev; struct cpumask *dest; - unsigned long flags; struct compose_comp_ctxt comp; struct tran_int_desc *int_desc; struct { @@ -1378,6 +1378,7 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) dest = irq_data_get_effective_affinity_mask(data); pbus = pdev->bus; hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); + channel = hbus->hdev->channel; hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); if (!hpdev) goto return_null_message; @@ -1435,43 +1436,52 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) goto free_int_desc; } + /* + * Prevents hv_pci_onchannelcallback() from running concurrently + * in the tasklet. + */ + tasklet_disable(&channel->callback_event); + /* * Since this function is called with IRQ locks held, can't * do normal wait for completion; instead poll. */ while (!try_wait_for_completion(&comp.comp_pkt.host_event)) { + unsigned long flags; + /* 0xFFFF means an invalid PCI VENDOR ID. */ if (hv_pcifront_get_vendor_id(hpdev) == 0xFFFF) { dev_err_once(&hbus->hdev->device, "the device has gone\n"); - goto free_int_desc; + goto enable_tasklet; } /* - * When the higher level interrupt code calls us with - * interrupt disabled, we must poll the channel by calling - * the channel callback directly when channel->target_cpu is - * the current CPU. When the higher level interrupt code - * calls us with interrupt enabled, let's add the - * local_irq_save()/restore() to avoid race: - * hv_pci_onchannelcallback() can also run in tasklet. + * Make sure that the ring buffer data structure doesn't get + * freed while we dereference the ring buffer pointer. Test + * for the channel's onchannel_callback being NULL within a + * sched_lock critical section. See also the inline comments + * in vmbus_reset_channel_cb(). */ - local_irq_save(flags); - - if (hbus->hdev->channel->target_cpu == smp_processor_id()) - hv_pci_onchannelcallback(hbus); - - local_irq_restore(flags); + spin_lock_irqsave(&channel->sched_lock, flags); + if (unlikely(channel->onchannel_callback == NULL)) { + spin_unlock_irqrestore(&channel->sched_lock, flags); + goto enable_tasklet; + } + hv_pci_onchannelcallback(hbus); + spin_unlock_irqrestore(&channel->sched_lock, flags); if (hpdev->state == hv_pcichild_ejecting) { dev_err_once(&hbus->hdev->device, "the device is being ejected\n"); - goto free_int_desc; + goto enable_tasklet; } udelay(100); } + tasklet_enable(&channel->callback_event); + if (comp.comp_pkt.completion_status < 0) { dev_err(&hbus->hdev->device, "Request for interrupt failed: 0x%x", @@ -1495,6 +1505,8 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) put_pcichild(hpdev); return; +enable_tasklet: + tasklet_enable(&channel->callback_event); free_int_desc: kfree(int_desc); drop_reference: -- cgit From 83cc3508ffaa6e2cd364d29418d35fab6f069b51 Mon Sep 17 00:00:00 2001 From: Wei Hu Date: Thu, 7 May 2020 13:02:11 +0800 Subject: PCI: hv: Fix the PCI HyperV probe failure path to release resource properly In some error cases in hv_pci_probe(), allocated resources are not freed. Fix this by adding a field to keep track of the high water mark for slots that have resources allocated to them. In case of an error, this high water mark is used to know which slots have resources that must be released. Since slots are numbered starting with zero, a value of -1 indicates no slots have been allocated resources. There may be unused slots in the range between slot 0 and the high water mark slot, but these slots are already ignored by the existing code in the allocate and release loops with the call to get_pcichild_wslot(). Link: https://lore.kernel.org/r/20200507050211.10923-1-weh@microsoft.com Signed-off-by: Wei Hu Signed-off-by: Lorenzo Pieralisi Reviewed-by: Michael Kelley --- drivers/pci/controller/pci-hyperv.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'drivers/pci/controller/pci-hyperv.c') diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index e15022ff63e3..e6fac0187722 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -480,6 +480,9 @@ struct hv_pcibus_device { struct workqueue_struct *wq; + /* Highest slot of child device with resources allocated */ + int wslot_res_allocated; + /* hypercall arg, must not cross page boundary */ struct hv_retarget_device_interrupt retarget_msi_interrupt_params; @@ -2847,7 +2850,7 @@ static int hv_send_resources_allocated(struct hv_device *hdev) struct hv_pci_dev *hpdev; struct pci_packet *pkt; size_t size_res; - u32 wslot; + int wslot; int ret; size_res = (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) @@ -2900,6 +2903,8 @@ static int hv_send_resources_allocated(struct hv_device *hdev) comp_pkt.completion_status); break; } + + hbus->wslot_res_allocated = wslot; } kfree(pkt); @@ -2918,10 +2923,10 @@ static int hv_send_resources_released(struct hv_device *hdev) struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); struct pci_child_message pkt; struct hv_pci_dev *hpdev; - u32 wslot; + int wslot; int ret; - for (wslot = 0; wslot < 256; wslot++) { + for (wslot = hbus->wslot_res_allocated; wslot >= 0; wslot--) { hpdev = get_pcichild_wslot(hbus, wslot); if (!hpdev) continue; @@ -2936,8 +2941,12 @@ static int hv_send_resources_released(struct hv_device *hdev) VM_PKT_DATA_INBAND, 0); if (ret) return ret; + + hbus->wslot_res_allocated = wslot - 1; } + hbus->wslot_res_allocated = -1; + return 0; } @@ -3037,6 +3046,7 @@ static int hv_pci_probe(struct hv_device *hdev, if (!hbus) return -ENOMEM; hbus->state = hv_pcibus_init; + hbus->wslot_res_allocated = -1; /* * The PCI bus "domain" is what is called "segment" in ACPI and other @@ -3136,7 +3146,7 @@ static int hv_pci_probe(struct hv_device *hdev, ret = hv_pci_allocate_bridge_windows(hbus); if (ret) - goto free_irq_domain; + goto exit_d0; ret = hv_send_resources_allocated(hdev); if (ret) @@ -3154,6 +3164,8 @@ static int hv_pci_probe(struct hv_device *hdev, free_windows: hv_pci_free_bridge_windows(hbus); +exit_d0: + (void) hv_pci_bus_exit(hdev, true); free_irq_domain: irq_domain_remove(hbus->irq_domain); free_fwnode: -- cgit From c81992e7f4aa19a055dbff5bd6c6d5ff9408f2fb Mon Sep 17 00:00:00 2001 From: Wei Hu Date: Thu, 7 May 2020 13:03:00 +0800 Subject: PCI: hv: Retry PCI bus D0 entry on invalid device state When kdump is triggered, some PCI devices may have not been shut down cleanly before the kdump kernel starts. This causes the initial attempt to enter D0 state in the kdump kernel to fail with invalid device state returned from Hyper-V host. When this happens, explicitly call hv_pci_bus_exit() and retry to enter the D0 state. Link: https://lore.kernel.org/r/20200507050300.10974-1-weh@microsoft.com Signed-off-by: Wei Hu [lorenzo.pieralisi@arm.com: commit log] Signed-off-by: Lorenzo Pieralisi Reviewed-by: Michael Kelley --- drivers/pci/controller/pci-hyperv.c | 40 +++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) (limited to 'drivers/pci/controller/pci-hyperv.c') diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index e6fac0187722..92092a47d3af 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -2739,6 +2739,8 @@ static void hv_free_config_window(struct hv_pcibus_device *hbus) vmbus_free_mmio(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH); } +static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs); + /** * hv_pci_enter_d0() - Bring the "bus" into the D0 power state * @hdev: VMBus's tracking struct for this root PCI bus @@ -2751,8 +2753,10 @@ static int hv_pci_enter_d0(struct hv_device *hdev) struct pci_bus_d0_entry *d0_entry; struct hv_pci_compl comp_pkt; struct pci_packet *pkt; + bool retry = true; int ret; +enter_d0_retry: /* * Tell the host that the bus is ready to use, and moved into the * powered-on state. This includes telling the host which region @@ -2779,6 +2783,38 @@ static int hv_pci_enter_d0(struct hv_device *hdev) if (ret) goto exit; + /* + * In certain case (Kdump) the pci device of interest was + * not cleanly shut down and resource is still held on host + * side, the host could return invalid device status. + * We need to explicitly request host to release the resource + * and try to enter D0 again. + */ + if (comp_pkt.completion_status < 0 && retry) { + retry = false; + + dev_err(&hdev->device, "Retrying D0 Entry\n"); + + /* + * Hv_pci_bus_exit() calls hv_send_resource_released() + * to free up resources of its child devices. + * In the kdump kernel we need to set the + * wslot_res_allocated to 255 so it scans all child + * devices to release resources allocated in the + * normal kernel before panic happened. + */ + hbus->wslot_res_allocated = 255; + + ret = hv_pci_bus_exit(hdev, true); + + if (ret == 0) { + kfree(pkt); + goto enter_d0_retry; + } + dev_err(&hdev->device, + "Retrying D0 failed with ret %d\n", ret); + } + if (comp_pkt.completion_status < 0) { dev_err(&hdev->device, "PCI Pass-through VSP failed D0 Entry with status %x\n", @@ -3185,7 +3221,7 @@ free_bus: return ret; } -static int hv_pci_bus_exit(struct hv_device *hdev, bool hibernating) +static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs) { struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); struct { @@ -3203,7 +3239,7 @@ static int hv_pci_bus_exit(struct hv_device *hdev, bool hibernating) if (hdev->channel->rescind) return 0; - if (!hibernating) { + if (!keep_devs) { /* Delete any children which might still exist. */ dr = kzalloc(sizeof(*dr), GFP_KERNEL); if (dr && hv_pci_start_relations_work(hbus, dr)) -- cgit From d0684fd0bd79395e074dd668feee5d53b134b1a3 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 25 May 2020 11:43:19 -0500 Subject: PCI: hv: Use struct_size() helper One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct hv_dr_state { ... struct hv_pcidev_description func[]; }; struct pci_bus_relations { ... struct pci_function_description func[]; } __packed; Make use of the struct_size() helper instead of an open-coded version in order to avoid any potential type mistakes. So, replace the following forms: offsetof(struct hv_dr_state, func) + (sizeof(struct hv_pcidev_description) * (relations->device_count)) offsetof(struct pci_bus_relations, func) + (sizeof(struct pci_function_description) * (bus_rel->device_count)) with: struct_size(dr, func, relations->device_count) and struct_size(bus_rel, func, bus_rel->device_count) respectively. Link: https://lore.kernel.org/r/20200525164319.GA13596@embeddedor Signed-off-by: Gustavo A. R. Silva Signed-off-by: Lorenzo Pieralisi Reviewed-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'drivers/pci/controller/pci-hyperv.c') diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 92092a47d3af..c95e520e62e4 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -2201,10 +2201,8 @@ static void hv_pci_devices_present(struct hv_pcibus_device *hbus, struct hv_dr_state *dr; int i; - dr = kzalloc(offsetof(struct hv_dr_state, func) + - (sizeof(struct hv_pcidev_description) * - (relations->device_count)), GFP_NOWAIT); - + dr = kzalloc(struct_size(dr, func, relations->device_count), + GFP_NOWAIT); if (!dr) return; @@ -2238,10 +2236,8 @@ static void hv_pci_devices_present2(struct hv_pcibus_device *hbus, struct hv_dr_state *dr; int i; - dr = kzalloc(offsetof(struct hv_dr_state, func) + - (sizeof(struct hv_pcidev_description) * - (relations->device_count)), GFP_NOWAIT); - + dr = kzalloc(struct_size(dr, func, relations->device_count), + GFP_NOWAIT); if (!dr) return; @@ -2435,9 +2431,8 @@ static void hv_pci_onchannelcallback(void *context) bus_rel = (struct pci_bus_relations *)buffer; if (bytes_recvd < - offsetof(struct pci_bus_relations, func) + - (sizeof(struct pci_function_description) * - (bus_rel->device_count))) { + struct_size(bus_rel, func, + bus_rel->device_count)) { dev_err(&hbus->hdev->device, "bus relations too small\n"); break; @@ -2450,9 +2445,8 @@ static void hv_pci_onchannelcallback(void *context) bus_rel2 = (struct pci_bus_relations2 *)buffer; if (bytes_recvd < - offsetof(struct pci_bus_relations2, func) + - (sizeof(struct pci_function_description2) * - (bus_rel2->device_count))) { + struct_size(bus_rel2, func, + bus_rel2->device_count)) { dev_err(&hbus->hdev->device, "bus relations v2 too small\n"); break; -- cgit