From 0e7d29a39a546161ea3a49e8e282a43212d7ff68 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 2 Feb 2024 14:16:34 +0100 Subject: PCI/AER: Fix rootport attribute paths in ABI docs The 'aer_stats' directory never made it into the sixth and final revision of the series adding the sysfs AER attributes. Link: https://lore.kernel.org/r/20240202131635.11405-2-johan+linaro@kernel.org Link: https://lore.kernel.org/lkml/20180621184822.GB14136@bhelgaas-glaptop.roam.corp.google.com/ Fixes: 12833017e581 ("PCI/AER: Add sysfs attributes for rootport cumulative stats") Signed-off-by: Johan Hovold Signed-off-by: Bjorn Helgaas --- Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats b/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats index 860db53037a5..24087d5fd417 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats +++ b/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats @@ -100,19 +100,19 @@ collectors) that are AER capable. These indicate the number of error messages as device, so these counters include them and are thus cumulative of all the error messages on the PCI hierarchy originating at that root port. -What: /sys/bus/pci/devices/<dev>/aer_stats/aer_rootport_total_err_cor +What: /sys/bus/pci/devices/<dev>/aer_rootport_total_err_cor Date: July 2018 KernelVersion: 4.19.0 Contact: linux-pci@vger.kernel.org, rajatja@google.com Description: Total number of ERR_COR messages reported to rootport. -What: /sys/bus/pci/devices/<dev>/aer_stats/aer_rootport_total_err_fatal +What: /sys/bus/pci/devices/<dev>/aer_rootport_total_err_fatal Date: July 2018 KernelVersion: 4.19.0 Contact: linux-pci@vger.kernel.org, rajatja@google.com Description: Total number of ERR_FATAL messages reported to rootport. 
-What: /sys/bus/pci/devices/<dev>/aer_stats/aer_rootport_total_err_nonfatal +What: /sys/bus/pci/devices/<dev>/aer_rootport_total_err_nonfatal Date: July 2018 KernelVersion: 4.19.0 Contact: linux-pci@vger.kernel.org, rajatja@google.com -- cgit From 96ed79791b1b213c892301595459e0ea404540b3 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 2 Feb 2024 14:16:35 +0100 Subject: PCI/AER: Clean up version indentation in ABI docs The 'KernelVersion' lines use a single space as separator instead of a tab so the values are not aligned with the other AER attribute fields. Link: https://lore.kernel.org/r/20240202131635.11405-3-johan+linaro@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Bjorn Helgaas --- Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats b/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats index 24087d5fd417..d1f67bb81d5d 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats +++ b/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats @@ -11,7 +11,7 @@ saw any problems). What: /sys/bus/pci/devices/<dev>/aer_dev_correctable Date: July 2018 -KernelVersion: 4.19.0 +KernelVersion: 4.19.0 Contact: linux-pci@vger.kernel.org, rajatja@google.com Description: List of correctable errors seen and reported by this PCI device using ERR_COR. Note that since multiple errors may @@ -32,7 +32,7 @@ Description: List of correctable errors seen and reported by this What: /sys/bus/pci/devices/<dev>/aer_dev_fatal Date: July 2018 -KernelVersion: 4.19.0 +KernelVersion: 4.19.0 Contact: linux-pci@vger.kernel.org, rajatja@google.com Description: List of uncorrectable fatal errors seen and reported by this PCI device using ERR_FATAL. 
Note that since multiple errors may @@ -62,7 +62,7 @@ Description: List of uncorrectable fatal errors seen and reported by this What: /sys/bus/pci/devices/<dev>/aer_dev_nonfatal Date: July 2018 -KernelVersion: 4.19.0 +KernelVersion: 4.19.0 Contact: linux-pci@vger.kernel.org, rajatja@google.com Description: List of uncorrectable nonfatal errors seen and reported by this PCI device using ERR_NONFATAL. Note that since multiple errors @@ -102,18 +102,18 @@ messages on the PCI hierarchy originating at that root port. What: /sys/bus/pci/devices/<dev>/aer_rootport_total_err_cor Date: July 2018 -KernelVersion: 4.19.0 +KernelVersion: 4.19.0 Contact: linux-pci@vger.kernel.org, rajatja@google.com Description: Total number of ERR_COR messages reported to rootport. What: /sys/bus/pci/devices/<dev>/aer_rootport_total_err_fatal Date: July 2018 -KernelVersion: 4.19.0 +KernelVersion: 4.19.0 Contact: linux-pci@vger.kernel.org, rajatja@google.com Description: Total number of ERR_FATAL messages reported to rootport. What: /sys/bus/pci/devices/<dev>/aer_rootport_total_err_nonfatal Date: July 2018 -KernelVersion: 4.19.0 +KernelVersion: 4.19.0 Contact: linux-pci@vger.kernel.org, rajatja@google.com Description: Total number of ERR_NONFATAL messages reported to rootport. -- cgit From 002bf2fbc00e5c4b95fb167287e2ae7d1973281e Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Mon, 12 Feb 2024 13:01:35 +0100 Subject: PCI/AER: Block runtime suspend when handling errors PM runtime can be done simultaneously with AER error handling. Avoid that by using pm_runtime_get_sync() before and pm_runtime_put() after reset in pcie_do_recovery() for all recovering devices. pm_runtime_get_sync() will increase dev->power.usage_count counter to prevent any possible future request to runtime suspend a device. It will also resume a device, if it was previously in D3hot state. 
I tested with igc device by doing simultaneous aer_inject and rpm suspend/resume via /sys/bus/pci/devices/PCI_ID/power/control and can reproduce: igc 0000:02:00.0: not ready 65535ms after bus reset; giving up pcieport 0000:00:1c.2: AER: Root Port link has been reset (-25) pcieport 0000:00:1c.2: AER: subordinate device reset failed pcieport 0000:00:1c.2: AER: device recovery failed igc 0000:02:00.0: Unable to change power state from D3hot to D0, device inaccessible The problem disappears when this patch is applied. Link: https://lore.kernel.org/r/20240212120135.146068-1-stanislaw.gruszka@linux.intel.com Signed-off-by: Stanislaw Gruszka Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan Acked-by: Rafael J. Wysocki Cc: <stable@vger.kernel.org> --- drivers/pci/pcie/err.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index 59c90d04a609..705893b5f7b0 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -13,6 +13,7 @@ #define dev_fmt(fmt) "AER: " fmt #include <linux/pci.h> +#include <linux/pm_runtime.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/errno.h> @@ -85,6 +86,18 @@ static int report_error_detected(struct pci_dev *dev, return 0; } +static int pci_pm_runtime_get_sync(struct pci_dev *pdev, void *data) +{ + pm_runtime_get_sync(&pdev->dev); + return 0; +} + +static int pci_pm_runtime_put(struct pci_dev *pdev, void *data) +{ + pm_runtime_put(&pdev->dev); + return 0; +} + static int report_frozen_detected(struct pci_dev *dev, void *data) { return report_error_detected(dev, pci_channel_io_frozen, data); @@ -207,6 +220,8 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, else bridge = pci_upstream_bridge(dev); + pci_walk_bridge(bridge, pci_pm_runtime_get_sync, NULL); + pci_dbg(bridge, "broadcast error_detected message\n"); if (state == pci_channel_io_frozen) { pci_walk_bridge(bridge, report_frozen_detected, &status); @@ -251,10 +266,15 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, pcie_clear_device_status(dev); 
pci_aer_clear_nonfatal_status(dev); } + + pci_walk_bridge(bridge, pci_pm_runtime_put, NULL); + pci_info(bridge, "device recovery successful\n"); return status; failed: + pci_walk_bridge(bridge, pci_pm_runtime_put, NULL); + pci_uevent_ers(bridge, PCI_ERS_RESULT_DISCONNECT); /* TODO: Should kernel panic here? */ -- cgit From a37e12bcab22efa05802f87baa0692365ae0ab4d Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 6 Feb 2024 15:57:14 +0200 Subject: PCI/AER: Use explicit register size for PCI_ERR_CAP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use u32 for PCIe AER Capability register variable and name it "aercc" (Advanced Error Capabilities and Control register, PCIe r6.1 sec 7.8.4.7) instead of "temp". Link: https://lore.kernel.org/r/20240206135717.8565-2-ilpo.jarvinen@linux.intel.com Signed-off-by: Ilpo Järvinen [bhelgaas: make subject more specific and match similar previous patches] Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/aer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 05fc30bb5134..e31e6a9a7773 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1210,7 +1210,7 @@ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) { int type = pci_pcie_type(dev); int aer = dev->aer_cap; - int temp; + u32 aercc; /* Must reset in this function */ info->status = 0; @@ -1241,8 +1241,8 @@ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) return 0; /* Get First Error Pointer */ - pci_read_config_dword(dev, aer + PCI_ERR_CAP, &temp); - info->first_error = PCI_ERR_CAP_FEP(temp); + pci_read_config_dword(dev, aer + PCI_ERR_CAP, &aercc); + info->first_error = PCI_ERR_CAP_FEP(aercc); if (info->status & AER_LOG_TLP_MASKS) { info->tlp_header_valid = 1; -- cgit From 0a5a46a6a61be7b63c12c18495d427f91f3662a9 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 6 Feb 2024 15:57:15 
+0200 Subject: PCI/AER: Generalize TLP Header Log reading MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both AER and DPC RP PIO provide TLP Header Log registers (PCIe r6.1 secs 7.8.4 & 7.9.14) to convey error diagnostics but the struct is named after AER as the struct aer_header_log_regs. Also, not all places that handle TLP Header Log use the struct and the struct members are named individually. Generalize the struct name and members, and use it consistently where TLP Header Log is being handled so that a pcie_read_tlp_log() helper can be easily added. Link: https://lore.kernel.org/r/20240206135717.8565-3-ilpo.jarvinen@linux.intel.com Signed-off-by: Ilpo Järvinen [bhelgaas: drop ixgbe changes for now, tidy whitespace] Signed-off-by: Bjorn Helgaas --- drivers/firmware/efi/cper.c | 4 ++-- drivers/pci/pci.c | 28 ++++++++++++++++++++++++++++ drivers/pci/pci.h | 2 +- drivers/pci/pcie/aer.c | 14 +++----------- drivers/pci/pcie/dpc.c | 14 ++++---------- include/linux/aer.h | 11 +++++------ include/ras/ras_event.h | 10 +++++----- 7 files changed, 48 insertions(+), 35 deletions(-) diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c index 35c37f667781..d3f98161171e 100644 --- a/drivers/firmware/efi/cper.c +++ b/drivers/firmware/efi/cper.c @@ -445,8 +445,8 @@ static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie, printk("%saer_uncor_severity: 0x%08x\n", pfx, aer->uncor_severity); printk("%sTLP Header: %08x %08x %08x %08x\n", pfx, - aer->header_log.dw0, aer->header_log.dw1, - aer->header_log.dw2, aer->header_log.dw3); + aer->header_log.dw[0], aer->header_log.dw[1], + aer->header_log.dw[2], aer->header_log.dw[3]); } } diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index d8f11a078924..a0af94cfcf7d 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1067,6 +1067,34 @@ disable_acs_redir: pci_disable_acs_redir(dev); } +/** + * pcie_read_tlp_log - read TLP Header Log + * @dev: PCIe 
device + * @where: PCI Config offset of TLP Header Log + * @tlp_log: TLP Log structure to fill + * + * Fill @tlp_log from TLP Header Log registers, e.g., AER or DPC. + * + * Return: 0 on success and filled TLP Log structure, <0 on error. + */ +int pcie_read_tlp_log(struct pci_dev *dev, int where, + struct pcie_tlp_log *tlp_log) +{ + int i, ret; + + memset(tlp_log, 0, sizeof(*tlp_log)); + + for (i = 0; i < 4; i++) { + ret = pci_read_config_dword(dev, where + i * 4, + &tlp_log->dw[i]); + if (ret) + return pcibios_err_to_errno(ret); + } + + return 0; +} +EXPORT_SYMBOL_GPL(pcie_read_tlp_log); + /** * pci_restore_bars - restore a device's BAR values (e.g. after wake-up) * @dev: PCI device to have its BARs restored diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 2336a8d1edab..a59ba6fde2a0 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -409,7 +409,7 @@ struct aer_err_info { unsigned int status; /* COR/UNCOR Error Status */ unsigned int mask; /* COR/UNCOR Error Mask */ - struct aer_header_log_regs tlp; /* TLP Header */ + struct pcie_tlp_log tlp; /* TLP Header */ }; int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info); diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index e31e6a9a7773..ac6293c24976 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -664,11 +664,10 @@ static void pci_rootport_aer_stats_incr(struct pci_dev *pdev, } } -static void __print_tlp_header(struct pci_dev *dev, - struct aer_header_log_regs *t) +static void __print_tlp_header(struct pci_dev *dev, struct pcie_tlp_log *t) { pci_err(dev, " TLP Header: %08x %08x %08x %08x\n", - t->dw0, t->dw1, t->dw2, t->dw3); + t->dw[0], t->dw[1], t->dw[2], t->dw[3]); } static void __aer_print_error(struct pci_dev *dev, @@ -1246,14 +1245,7 @@ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) if (info->status & AER_LOG_TLP_MASKS) { info->tlp_header_valid = 1; - pci_read_config_dword(dev, - aer + PCI_ERR_HEADER_LOG, 
&info->tlp.dw0); - pci_read_config_dword(dev, - aer + PCI_ERR_HEADER_LOG + 4, &info->tlp.dw1); - pci_read_config_dword(dev, - aer + PCI_ERR_HEADER_LOG + 8, &info->tlp.dw2); - pci_read_config_dword(dev, - aer + PCI_ERR_HEADER_LOG + 12, &info->tlp.dw3); + pcie_read_tlp_log(dev, aer + PCI_ERR_HEADER_LOG, &info->tlp); } } diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index 94111e438241..c197bc7f7f2c 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -190,7 +190,8 @@ out: static void dpc_process_rp_pio_error(struct pci_dev *pdev) { u16 cap = pdev->dpc_cap, dpc_status, first_error; - u32 status, mask, sev, syserr, exc, dw0, dw1, dw2, dw3, log, prefix; + u32 status, mask, sev, syserr, exc, log, prefix; + struct pcie_tlp_log tlp_log; int i; pci_read_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_STATUS, &status); @@ -216,16 +217,9 @@ static void dpc_process_rp_pio_error(struct pci_dev *pdev) if (pdev->dpc_rp_log_size < 4) goto clear_status; - pci_read_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_HEADER_LOG, - &dw0); - pci_read_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_HEADER_LOG + 4, - &dw1); - pci_read_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_HEADER_LOG + 8, - &dw2); - pci_read_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_HEADER_LOG + 12, - &dw3); + pcie_read_tlp_log(pdev, cap + PCI_EXP_DPC_RP_PIO_HEADER_LOG, &tlp_log); pci_err(pdev, "TLP Header: %#010x %#010x %#010x %#010x\n", - dw0, dw1, dw2, dw3); + tlp_log.dw[0], tlp_log.dw[1], tlp_log.dw[2], tlp_log.dw[3]); if (pdev->dpc_rp_log_size < 5) goto clear_status; diff --git a/include/linux/aer.h b/include/linux/aer.h index ae0fae70d4bd..4b97f38f3fcf 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -18,11 +18,8 @@ struct pci_dev; -struct aer_header_log_regs { - u32 dw0; - u32 dw1; - u32 dw2; - u32 dw3; +struct pcie_tlp_log { + u32 dw[4]; }; struct aer_capability_regs { @@ -33,13 +30,15 @@ struct aer_capability_regs { u32 cor_status; u32 cor_mask; u32 cap_control; - struct 
aer_header_log_regs header_log; + struct pcie_tlp_log header_log; u32 root_command; u32 root_status; u16 cor_err_source; u16 uncor_err_source; }; +int pcie_read_tlp_log(struct pci_dev *dev, int where, struct pcie_tlp_log *log); + #if defined(CONFIG_PCIEAER) int pci_aer_clear_nonfatal_status(struct pci_dev *dev); int pcie_aer_is_native(struct pci_dev *dev); diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index cbd3ddd7c33d..c011ea236e9b 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -300,7 +300,7 @@ TRACE_EVENT(aer_event, const u32 status, const u8 severity, const u8 tlp_header_valid, - struct aer_header_log_regs *tlp), + struct pcie_tlp_log *tlp), TP_ARGS(dev_name, status, severity, tlp_header_valid, tlp), @@ -318,10 +318,10 @@ TRACE_EVENT(aer_event, __entry->severity = severity; __entry->tlp_header_valid = tlp_header_valid; if (tlp_header_valid) { - __entry->tlp_header[0] = tlp->dw0; - __entry->tlp_header[1] = tlp->dw1; - __entry->tlp_header[2] = tlp->dw2; - __entry->tlp_header[3] = tlp->dw3; + __entry->tlp_header[0] = tlp->dw[0]; + __entry->tlp_header[1] = tlp->dw[1]; + __entry->tlp_header[2] = tlp->dw[2]; + __entry->tlp_header[3] = tlp->dw[3]; } ), -- cgit