From 4d1827485acecac0016eaaec199e97697afdaa84 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 31 Jul 2023 14:02:11 +0800 Subject: tools/power/turbostat: Fix failure with new uncore sysfs On some platforms, turbostat fails during launch time like below, turbostat version 2023.03.17 - Len Brown ... cpu40: MSR_IA32_PACKAGE_THERM_STATUS: 0x884c0000 (24 C) cpu40: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x00000003 (100 C, 100 C) turbostat: snapshot_sysfs_counter(/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz): No data available This is because new uncore sysfs is used on these platforms as introduced by commit 9b8dea80e3cb ("platform/x86/intel-uncore-freq: Support for cluster level controls"). With the new uncore sysfs interface, /sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz is still available, but reading it fails. How to support the fabric cluster level uncore sysfs is not settled yet, as a short term fix, clear the BIC_UNCORE_MHZ bit when new sysfs I/F is detected. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9a10512e3407..9de1ff6f82ce 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4163,6 +4163,10 @@ static void intel_uncore_frequency_probe(void) if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00", R_OK)) return; + /* Cluster level sysfs not supported yet. */ + if (!access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00", R_OK)) + return; + if (!access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK)) BIC_PRESENT(BIC_UNCORE_MHZ); -- cgit From 137f01b3529d292a68d22e9681e2f903c768f790 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 25 Mar 2023 21:57:07 +0800 Subject: tools/power/turbostat: Fix a knl bug MSR_KNL_CORE_C6_RESIDENCY should be evaluated only if 1. this is KNL platform AND 2. need to get C6 residency or need to calculate C1 residency Fix the broken logic introduced by commit 1e9042b9c8d4 ("tools/power turbostat: Fix CPU%C1 display value"). Fixes: 1e9042b9c8d4 ("tools/power turbostat: Fix CPU%C1 display value") Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9de1ff6f82ce..fb6c6ddbdb60 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2180,7 +2180,7 @@ retry: if ((DO_BIC(BIC_CPU_c6) || soft_c1_residency_display(BIC_CPU_c6)) && !do_knl_cstates) { if (get_msr(cpu, MSR_CORE_C6_RESIDENCY, &c->c6)) return -7; - } else if (do_knl_cstates || soft_c1_residency_display(BIC_CPU_c6)) { + } else if (do_knl_cstates && soft_c1_residency_display(BIC_CPU_c6)) { if (get_msr(cpu, MSR_KNL_CORE_C6_RESIDENCY, &c->c6)) return -7; } -- cgit From b61b7d8c4c22c4298a50ae5d0ee88facb85ce665 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Mon, 27 Mar 2023 11:17:44 +0800 Subject: tools/power/turbostat: Enable the C-state Pre-wake printing Currently the C-state Pre-wake will not be printed due to the probe has not been invoked. Invoke the probe function accordingly. 
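For illustration, the one-line fix below simply adds the missing prewake_cstate_probe() call; a minimal sketch of the probe-then-print pattern it depends on (symbol names here are hypothetical stand-ins, not turbostat's actual code): a setting is only reported if its probe ran during CPU enumeration, so a missing probe call silently hides the output.

	#include <stdbool.h>
	#include <stdio.h>

	static bool has_prewake_bit;	/* set by the probe, consumed by the dump */

	/* hypothetical probe: record whether this model supports the feature */
	static void prewake_probe_example(unsigned int family, unsigned int model)
	{
		if (family == 6 && model == 0x6A)	/* example model only */
			has_prewake_bit = true;
	}

	static void dump_prewake_example(void)
	{
		if (!has_prewake_bit)
			return;			/* never probed => never printed */
		printf("C-state Pre-wake: enabled\n");
	}
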
Fixes: aeb01e6d71ff ("tools/power turbostat: Print the C-state Pre-wake settings") Signed-off-by: Chen Yu Reviewed-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index fb6c6ddbdb60..03d4f09b103a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5794,6 +5794,7 @@ void process_cpuid() rapl_probe(family, model); perf_limit_reasons_probe(family, model); automatic_cstate_conversion_probe(family, model); + prewake_cstate_probe(family, model); check_tcc_offset(model_orig); -- cgit From b98a6d78768ec459d394db8bc086d071eb6556c8 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 24 Mar 2023 16:30:55 +0800 Subject: tools/power/turbostat: Enable TCC Offset on more models All Models that duplicate INTEL_FAM6_CANNONLAKE_L support TCC Offset. Enable this feature on all these models. Delete obsolete model_orig. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 03d4f09b103a..d7880870ef68 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -211,9 +211,6 @@ int *fd_instr_count_percpu; struct timeval interval_tv = { 5, 0 }; struct timespec interval_ts = { 5, 0 }; -/* Save original CPU model */ -unsigned int model_orig; - unsigned int num_iterations; unsigned int header_iterations; unsigned int debug; @@ -4046,14 +4043,7 @@ void check_tcc_offset(int model) switch (model) { case INTEL_FAM6_SKYLAKE_L: - case INTEL_FAM6_SKYLAKE: - case INTEL_FAM6_KABYLAKE_L: - case INTEL_FAM6_KABYLAKE: - case INTEL_FAM6_ICELAKE_L: - case INTEL_FAM6_ICELAKE: - case INTEL_FAM6_TIGERLAKE_L: - case INTEL_FAM6_TIGERLAKE: - case INTEL_FAM6_COMETLAKE: + case INTEL_FAM6_CANNONLAKE_L: if (!get_msr(base_cpu, MSR_PLATFORM_INFO, &msr)) { msr = (msr >> 30) & 1; if (msr) @@ -5573,10 +5563,9 @@ void process_cpuid() edx_flags & (1 << 22) ? "ACPI-TM" : "-", edx_flags & (1 << 28) ? "HT" : "-", edx_flags & (1 << 29) ? "TM" : "-"); } - if (genuine_intel) { - model_orig = model; + + if (genuine_intel) model = intel_model_duplicates(model); - } if (!(edx_flags & (1 << 5))) errx(1, "CPUID: no MSR"); @@ -5796,7 +5785,7 @@ void process_cpuid() automatic_cstate_conversion_probe(family, model); prewake_cstate_probe(family, model); - check_tcc_offset(model_orig); + check_tcc_offset(model); if (!quiet) dump_cstate_pstate_config_info(family, model); -- cgit From 2c019d657968bdd93e11615e0919d8181a54742d Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 21 Aug 2022 22:39:16 +0800 Subject: tools/power/turbostat: Support alternative graphics sysfs knobs /sys/class/graphics/fb0/device/drm/card0/ and /sys/class/drm/card0/ point to the same device node. But in some cases, one exists and the other one does not. Prefer to use /sys/class/drm/card0/, and fall back to /sys/class/graphics/fb0/device/drm/card0/. This recovers the "GFXMHz" and "GFXAMHz" columns on some platforms like a SPR server. 
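The diff below implements a straightforward open-with-fallback idiom; as a self-contained sketch of the same pattern (sysfs paths taken from the commit, the helper name is made up):

	#include <stdio.h>

	/* Prefer the new drm sysfs node, fall back to the legacy graphics one. */
	static FILE *open_gfx_cur_freq(void)
	{
		FILE *fp = fopen("/sys/class/drm/card0/gt_cur_freq_mhz", "r");

		if (!fp)
			fp = fopen("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", "r");
		return fp;	/* caller treats NULL as "neither node exists" */
	}
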
Reviewed-by: Rodrigo Vivi Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index d7880870ef68..f9bc2230db73 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3188,8 +3188,8 @@ int snapshot_gfx_rc6_ms(void) /* * snapshot_gfx_mhz() * - * record snapshot of - * /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz + * fall back to /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz + * when /sys/class/drm/card0/gt_cur_freq_mhz is not available. * * return 1 if config change requires a restart, else return 0 */ @@ -3198,9 +3198,11 @@ int snapshot_gfx_mhz(void) static FILE *fp; int retval; - if (fp == NULL) - fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", "r"); - else { + if (fp == NULL) { + fp = fopen("/sys/class/drm/card0/gt_cur_freq_mhz", "r"); + if (!fp) + fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", "r"); + } else { rewind(fp); fflush(fp); } @@ -3215,8 +3217,8 @@ int snapshot_gfx_mhz(void) /* * snapshot_gfx_cur_mhz() * - * record snapshot of - * /sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz + * fall back to /sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz + * when /sys/class/drm/card0/gt_act_freq_mhz is not available. * * return 1 if config change requires a restart, else return 0 */ @@ -3225,9 +3227,11 @@ int snapshot_gfx_act_mhz(void) static FILE *fp; int retval; - if (fp == NULL) - fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", "r"); - else { + if (fp == NULL) { + fp = fopen("/sys/class/drm/card0/gt_act_freq_mhz", "r"); + if (!fp) + fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", "r"); + } else { rewind(fp); fflush(fp); } @@ -5804,10 +5808,12 @@ void process_cpuid() if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) BIC_PRESENT(BIC_GFX_rc6); - if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) + if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK) || + !access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) BIC_PRESENT(BIC_GFXMHz); - if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) + if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK) || + !access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) BIC_PRESENT(BIC_GFXACTMHz); if (!access("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", R_OK)) -- cgit From 6d306d6ec7e0f9a5e90f5afd70e3b6c32ae50ce6 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 24 Mar 2023 16:26:25 +0800 Subject: tools/power/turbostat: Replace raw value cpu model with Macro Kernel already has #define INTEL_FAM6_NEHALEM_G 0x1F /* Auburndale / Havendale */ Use standard Macro for CPU Model instead of raw value. 
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f9bc2230db73..8d3a8af3692a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5413,7 +5413,7 @@ unsigned int intel_model_duplicates(unsigned int model) switch (model) { case INTEL_FAM6_NEHALEM_EP: /* Core i7, Xeon 5500 series - Bloomfield, Gainstown NHM-EP */ case INTEL_FAM6_NEHALEM: /* Core i7 and i5 Processor - Clarksfield, Lynnfield, Jasper Forest */ - case 0x1F: /* Core i7 and i5 Processor - Nehalem */ + case INTEL_FAM6_NEHALEM_G: /* Core i7 and i5 Processor - Nehalem */ case INTEL_FAM6_WESTMERE: /* Westmere Client - Clarkdale, Arrandale */ case INTEL_FAM6_WESTMERE_EP: /* Westmere EP - Gulftown */ return INTEL_FAM6_NEHALEM; -- cgit From bbfc33b1e49f443296e56d8b76c77373f700aedc Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 24 Mar 2023 16:27:30 +0800 Subject: tools/power/turbostat: Remove redundant duplicates Remove redundant duplicates in intel_model_duplicates(). Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 8d3a8af3692a..2420300939da 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5412,24 +5412,20 @@ unsigned int intel_model_duplicates(unsigned int model) switch (model) { case INTEL_FAM6_NEHALEM_EP: /* Core i7, Xeon 5500 series - Bloomfield, Gainstown NHM-EP */ - case INTEL_FAM6_NEHALEM: /* Core i7 and i5 Processor - Clarksfield, Lynnfield, Jasper Forest */ case INTEL_FAM6_NEHALEM_G: /* Core i7 and i5 Processor - Nehalem */ case INTEL_FAM6_WESTMERE: /* Westmere Client - Clarkdale, Arrandale */ case INTEL_FAM6_WESTMERE_EP: /* Westmere EP - Gulftown */ return INTEL_FAM6_NEHALEM; - case INTEL_FAM6_NEHALEM_EX: /* Nehalem-EX Xeon - Beckton */ case INTEL_FAM6_WESTMERE_EX: /* Westmere-EX Xeon - Eagleton */ return INTEL_FAM6_NEHALEM_EX; case INTEL_FAM6_XEON_PHI_KNM: return INTEL_FAM6_XEON_PHI_KNL; - case INTEL_FAM6_BROADWELL_X: case INTEL_FAM6_BROADWELL_D: /* BDX-DE */ return INTEL_FAM6_BROADWELL_X; - case INTEL_FAM6_SKYLAKE_L: case INTEL_FAM6_SKYLAKE: case INTEL_FAM6_KABYLAKE_L: case INTEL_FAM6_KABYLAKE: -- cgit From 48674c1bb6124fe392e8fed80a39fcb3f62e6551 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 27 Mar 2023 12:36:55 +0800 Subject: tools/power/turbostat: Remove pseudo check for two models INTEL_FAM6_ATOM_SILVERMONT_MID/INTEL_FAM6_ATOM_AIRMONT_MID are not listed in probe_nhm_msrs(). This means that most of the turbostat features are not available on these two platforms. Further more, checking for these two models in has_slv_msrs() is dead code. Because has_slv_msrs() is called by the code guarded by probe_nhm_msrs(). For these two reasons, remove pseudo check for INTEL_FAM6_ATOM_SILVERMONT_MID and INTEL_FAM6_ATOM_AIRMONT_MID. Will add back the support when we can access these two platforms. 
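A condensed illustration of the dead-code argument above (model lists abbreviated, bodies simplified, not the real functions): has_slv_msrs() only runs for models that probe_nhm_msrs() accepted, so a model missing from the outer guard can never match in the inner check.

	/* outer guard: only these models proceed (abbreviated) */
	static int outer_guard(unsigned int model)
	{
		switch (model) {
		case 0x37:	/* INTEL_FAM6_ATOM_SILVERMONT */
		case 0x4D:	/* INTEL_FAM6_ATOM_SILVERMONT_D */
			return 1;
		}
		return 0;	/* SILVERMONT_MID / AIRMONT_MID stop here */
	}

	/* inner check: reached only when outer_guard() returned 1, so listing
	 * SILVERMONT_MID / AIRMONT_MID here could never match -> dead code
	 */
	static int inner_check(unsigned int model)
	{
		return model == 0x37;
	}
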
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 2420300939da..b8874d3dc83e 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3792,8 +3792,6 @@ int has_slv_msrs(unsigned int family, unsigned int model) switch (model) { case INTEL_FAM6_ATOM_SILVERMONT: - case INTEL_FAM6_ATOM_SILVERMONT_MID: - case INTEL_FAM6_ATOM_AIRMONT_MID: return 1; } return 0; -- cgit From 45232ab168a3c5abad86eafaef2beed8d7037666 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 21:52:53 +0800 Subject: tools/power/turbostat: Add skeleton support for table driven feature enumeration Turbostat supports a series of features that may diverge among different CPU models. Current code uses various of CPU model checks in different places to handle this, which makes the code hard to maintain. Add skeleton support for table driven feature enumeration to replace the current error-prone CPU model checks and global variables. Note: by comparing the CPU models with intel-family.h, it is found that turbostat support for below four Models are missing, including INTEL_FAM6_ICELAKE, INTEL_FAM6_ATOM_SILVERMONT_MID, INTEL_FAM6_ATOM_AIRMONT_MID and INTEL_FAM6_ATOM_AIRMONT_NP. Adding support for these models is a different work, thus it is not covered in this patch set. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 192 ++++++++++++++++++++++++++++++++++ 1 file changed, 192 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b8874d3dc83e..4deea374188a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -281,6 +281,197 @@ unsigned int has_misc_feature_control; unsigned int first_counter_read = 1; int ignore_stdin; +/* Model specific support Start */ + +/* List of features that may diverge among different platforms */ +struct platform_features { +}; + +struct platform_data { + unsigned int model; + const struct platform_features *features; +}; + +static const struct platform_features nhm_features = { +}; + +static const struct platform_features nhx_features = { +}; + +static const struct platform_features snb_features = { +}; + +static const struct platform_features snx_features = { +}; + +static const struct platform_features ivb_features = { +}; + +static const struct platform_features ivx_features = { +}; + +static const struct platform_features hsw_features = { +}; + +static const struct platform_features hsx_features = { +}; + +static const struct platform_features hswl_features = { +}; + +static const struct platform_features hswg_features = { +}; + +static const struct platform_features bdw_features = { +}; + +static const struct platform_features bdwg_features = { +}; + +static const struct platform_features bdx_features = { +}; + +static const struct platform_features skl_features = { +}; + +static const struct platform_features cnl_features = { +}; + +static const struct platform_features skx_features = { +}; + +static const struct platform_features icx_features = { +}; + +static const struct platform_features spr_features = { +}; + +static const struct platform_features slv_features = { +}; + +static const struct platform_features slvd_features = { +}; + +static const struct platform_features amt_features = { +}; + +static const struct 
platform_features gmt_features = { +}; + +static const struct platform_features gmtd_features = { +}; + +static const struct platform_features gmtp_features = { +}; + +static const struct platform_features tmt_features = { +}; + +static const struct platform_features tmtd_features = { +}; + +static const struct platform_features knl_features = { +}; + +static const struct platform_features default_features = { +}; + +static const struct platform_features amd_features = { +}; + +static const struct platform_data turbostat_pdata[] = { + { INTEL_FAM6_NEHALEM, &nhm_features }, + { INTEL_FAM6_NEHALEM_G, &nhm_features }, + { INTEL_FAM6_NEHALEM_EP, &nhm_features }, + { INTEL_FAM6_NEHALEM_EX, &nhx_features }, + { INTEL_FAM6_WESTMERE, &nhm_features }, + { INTEL_FAM6_WESTMERE_EP, &nhm_features }, + { INTEL_FAM6_WESTMERE_EX, &nhx_features }, + { INTEL_FAM6_SANDYBRIDGE, &snb_features }, + { INTEL_FAM6_SANDYBRIDGE_X, &snx_features }, + { INTEL_FAM6_IVYBRIDGE, &ivb_features }, + { INTEL_FAM6_IVYBRIDGE_X, &ivx_features }, + { INTEL_FAM6_HASWELL, &hsw_features }, + { INTEL_FAM6_HASWELL_X, &hsx_features }, + { INTEL_FAM6_HASWELL_L, &hswl_features }, + { INTEL_FAM6_HASWELL_G, &hswg_features }, + { INTEL_FAM6_BROADWELL, &bdw_features }, + { INTEL_FAM6_BROADWELL_G, &bdwg_features }, + { INTEL_FAM6_BROADWELL_X, &bdx_features }, + { INTEL_FAM6_BROADWELL_D, &bdx_features }, + { INTEL_FAM6_SKYLAKE_L, &skl_features }, + { INTEL_FAM6_SKYLAKE, &skl_features }, + { INTEL_FAM6_SKYLAKE_X, &skx_features }, + { INTEL_FAM6_KABYLAKE_L, &skl_features }, + { INTEL_FAM6_KABYLAKE, &skl_features }, + { INTEL_FAM6_COMETLAKE, &skl_features }, + { INTEL_FAM6_COMETLAKE_L, &skl_features }, + { INTEL_FAM6_CANNONLAKE_L, &cnl_features }, + { INTEL_FAM6_ICELAKE_X, &icx_features }, + { INTEL_FAM6_ICELAKE_D, &icx_features }, + { INTEL_FAM6_ICELAKE_L, &cnl_features }, + { INTEL_FAM6_ICELAKE_NNPI, &cnl_features }, + { INTEL_FAM6_ROCKETLAKE, &cnl_features }, + { INTEL_FAM6_TIGERLAKE_L, &cnl_features }, + { INTEL_FAM6_TIGERLAKE, &cnl_features }, + { INTEL_FAM6_SAPPHIRERAPIDS_X, &spr_features }, + { INTEL_FAM6_EMERALDRAPIDS_X, &spr_features }, + { INTEL_FAM6_LAKEFIELD, &cnl_features }, + { INTEL_FAM6_ALDERLAKE, &cnl_features }, + { INTEL_FAM6_ALDERLAKE_L, &cnl_features }, + { INTEL_FAM6_RAPTORLAKE, &cnl_features }, + { INTEL_FAM6_RAPTORLAKE_P, &cnl_features }, + { INTEL_FAM6_RAPTORLAKE_S, &cnl_features }, + { INTEL_FAM6_METEORLAKE, &cnl_features }, + { INTEL_FAM6_METEORLAKE_L, &cnl_features }, + { INTEL_FAM6_ATOM_SILVERMONT, &slv_features }, + { INTEL_FAM6_ATOM_SILVERMONT_D, &slvd_features }, + { INTEL_FAM6_ATOM_AIRMONT, &amt_features }, + { INTEL_FAM6_ATOM_GOLDMONT, &gmt_features }, + { INTEL_FAM6_ATOM_GOLDMONT_D, &gmtd_features }, + { INTEL_FAM6_ATOM_GOLDMONT_PLUS, &gmtp_features }, + { INTEL_FAM6_ATOM_TREMONT_D, &tmtd_features }, + { INTEL_FAM6_ATOM_TREMONT, &tmt_features }, + { INTEL_FAM6_ATOM_TREMONT_L, &tmt_features }, + { INTEL_FAM6_ATOM_GRACEMONT, &cnl_features }, + { INTEL_FAM6_XEON_PHI_KNL, &knl_features }, + { INTEL_FAM6_XEON_PHI_KNM, &knl_features }, + /* + * Missing support for + * INTEL_FAM6_ICELAKE + * INTEL_FAM6_ATOM_SILVERMONT_MID + * INTEL_FAM6_ATOM_AIRMONT_MID + * INTEL_FAM6_ATOM_AIRMONT_NP + */ + { 0, NULL }, +}; + +static const struct platform_features *platform; + +void probe_platform_features(unsigned int family, unsigned int model) +{ + int i; + + if (authentic_amd || hygon_genuine) { + platform = &amd_features; + return; + } + + platform = &default_features; + + if (!genuine_intel || family != 6) + return; + + for (i = 
0; turbostat_pdata[i].features; i++) { + if (turbostat_pdata[i].model == model) { + platform = turbostat_pdata[i].features; + return; + } + } +} + +/* Model specific support End */ + #define RAPL_PKG (1 << 0) /* 0x610 MSR_PKG_POWER_LIMIT */ /* 0x611 MSR_PKG_ENERGY_STATUS */ @@ -5562,6 +5753,7 @@ void process_cpuid() edx_flags & (1 << 28) ? "HT" : "-", edx_flags & (1 << 29) ? "TM" : "-"); } + probe_platform_features(family, model); if (genuine_intel) model = intel_model_duplicates(model); -- cgit From 778fc34a7a3db2811e28ce570318dd047f278cb2 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 21 Aug 2023 15:26:32 +0800 Subject: tools/power/turbostat: Abstract MSR_MISC_FEATURE_CONTROL support Abstract MSR_MISC_FEATURE_CONTROL support. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 4deea374188a..7eaa0adf72e0 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -277,7 +277,6 @@ unsigned int has_hwp_notify; /* IA32_HWP_INTERRUPT */ unsigned int has_hwp_activity_window; /* IA32_HWP_REQUEST[bits 41:32] */ unsigned int has_hwp_epp; /* IA32_HWP_REQUEST[bits 31:24] */ unsigned int has_hwp_pkg; /* IA32_HWP_REQUEST_PKG */ -unsigned int has_misc_feature_control; unsigned int first_counter_read = 1; int ignore_stdin; @@ -285,6 +284,7 @@ int ignore_stdin; /* List of features that may diverge among different platforms */ struct platform_features { + bool has_msr_misc_feature_control; /* MSR_MISC_FEATURE_CONTROL */ }; struct platform_data { @@ -299,51 +299,67 @@ static const struct platform_features nhx_features = { }; static const struct platform_features snb_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features snx_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features ivb_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features ivx_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features hsw_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features hsx_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features hswl_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features hswg_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features bdw_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features bdwg_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features bdx_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features skl_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features cnl_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features skx_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features icx_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features spr_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features slv_features = { @@ -3883,7 +3899,6 @@ void check_permissions(void) * * Side effect: * sets global pkg_cstate_limit to decode MSR_PKG_CST_CONFIG_CONTROL - * sets has_misc_feature_control */ int 
probe_nhm_msrs(unsigned int family, unsigned int model) { @@ -3909,7 +3924,6 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) case INTEL_FAM6_IVYBRIDGE: /* IVB */ case INTEL_FAM6_IVYBRIDGE_X: /* IVB Xeon */ pkg_cstate_limits = snb_pkg_cstate_limits; - has_misc_feature_control = 1; break; case INTEL_FAM6_HASWELL: /* HSW */ case INTEL_FAM6_HASWELL_G: /* HSW */ @@ -3921,16 +3935,13 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) case INTEL_FAM6_SKYLAKE_L: /* SKL */ case INTEL_FAM6_CANNONLAKE_L: /* CNL */ pkg_cstate_limits = hsw_pkg_cstate_limits; - has_misc_feature_control = 1; break; case INTEL_FAM6_SKYLAKE_X: /* SKX */ case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ pkg_cstate_limits = skx_pkg_cstate_limits; - has_misc_feature_control = 1; break; case INTEL_FAM6_ICELAKE_X: /* ICX */ pkg_cstate_limits = icx_pkg_cstate_limits; - has_misc_feature_control = 1; break; case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ no_MSR_MISC_PWR_MGMT = 1; @@ -5541,7 +5552,7 @@ void decode_misc_feature_control(void) { unsigned long long msr; - if (!has_misc_feature_control) + if (!platform->has_msr_misc_feature_control) return; if (!get_msr(base_cpu, MSR_MISC_FEATURE_CONTROL, &msr)) -- cgit From 3dd0e7547d11e770bafb40ad41f2631cc4b16649 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 21 Aug 2023 22:12:57 +0800 Subject: tools/power/turbostat: Abstract MSR_MISC_PWR_MGMT support Abstract MSR_MISC_PWR_MGMT support. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 7eaa0adf72e0..9507f310e212 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -238,7 +238,6 @@ unsigned int hygon_genuine; unsigned int max_level, max_extended_level; unsigned int has_invariant_tsc; unsigned int do_nhm_platform_info; -unsigned int no_MSR_MISC_PWR_MGMT; unsigned int aperf_mperf_multiplier = 1; double bclk; double base_hz; @@ -285,6 +284,7 @@ int ignore_stdin; /* List of features that may diverge among different platforms */ struct platform_features { bool has_msr_misc_feature_control; /* MSR_MISC_FEATURE_CONTROL */ + bool has_msr_misc_pwr_mgmt; /* MSR_MISC_PWR_MGMT */ }; struct platform_data { @@ -293,100 +293,125 @@ struct platform_data { }; static const struct platform_features nhm_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features nhx_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features snb_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features snx_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features ivb_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features ivx_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features hsw_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features hsx_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features hswl_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features hswg_features = { .has_msr_misc_feature_control = 1, + 
.has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features bdw_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features bdwg_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features bdx_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features skl_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features cnl_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features skx_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features icx_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features spr_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features slv_features = { }; static const struct platform_features slvd_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features amt_features = { }; static const struct platform_features gmt_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features gmtd_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features gmtp_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features tmt_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features tmtd_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features knl_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features default_features = { @@ -3944,14 +3969,12 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) pkg_cstate_limits = icx_pkg_cstate_limits; break; case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ - no_MSR_MISC_PWR_MGMT = 1; /* FALLTHRU */ case INTEL_FAM6_ATOM_SILVERMONT_D: /* AVN */ pkg_cstate_limits = slv_pkg_cstate_limits; break; case INTEL_FAM6_ATOM_AIRMONT: /* AMT */ pkg_cstate_limits = amt_pkg_cstate_limits; - no_MSR_MISC_PWR_MGMT = 1; break; case INTEL_FAM6_XEON_PHI_KNL: /* PHI */ pkg_cstate_limits = phi_pkg_cstate_limits; @@ -5576,7 +5599,7 @@ void decode_misc_pwr_mgmt_msr(void) if (!do_nhm_platform_info) return; - if (no_MSR_MISC_PWR_MGMT) + if (!platform->has_msr_misc_pwr_mgmt) return; if (!get_msr(base_cpu, MSR_MISC_PWR_MGMT, &msr)) -- cgit From 71e841293c715797d8c6ae8cdc3f74b4396c5570 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 21 Aug 2023 22:22:48 +0800 Subject: tools/power/turbostat: Abstract BCLK frequency support Abstract CPU base clock frequency support. Note that bclk is used by 1. calculate base_hz using MSR_PLATFORM_INFO, which is guarded by probe_nhm_msrs(). 2. dump MSR_PLATFORM_INFO and Turbo Ratio Limit MSRs, which are also guarded by probe_nhm_msrs(). Thus probe_bclk() works for probe_nhm_msrs() models only. 
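For reference, the arithmetic that the new probe_bclk() in the diff below performs, with a worked example (the MSR value here is made up):

	#include <stdio.h>

	int main(void)
	{
		/* MSR_PLATFORM_INFO[15:8] holds the base (non-turbo) ratio */
		unsigned long long msr = 0x2400;		/* hypothetical register value */
		unsigned int base_ratio = (msr >> 8) & 0xFF;	/* = 0x24 = 36 */
		double bclk = 100.00;				/* MHz, on a BCLK_100MHZ platform */
		double base_hz = base_ratio * bclk * 1000000;	/* 36 * 100 MHz = 3.6 GHz */

		printf("base frequency: %.0f Hz\n", base_hz);
		return 0;
	}
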
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 131 ++++++++++++++++++++++------------ 1 file changed, 87 insertions(+), 44 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9507f310e212..66ba70017d53 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -269,7 +269,6 @@ unsigned int do_ring_perf_limit_reasons; unsigned int crystal_hz; unsigned long long tsc_hz; int base_cpu; -double discover_bclk(unsigned int family, unsigned int model); unsigned int has_hwp; /* IA32_PM_ENABLE, IA32_HWP_CAPABILITIES */ /* IA32_HWP_REQUEST, IA32_HWP_STATUS */ unsigned int has_hwp_notify; /* IA32_HWP_INTERRUPT */ @@ -279,12 +278,15 @@ unsigned int has_hwp_pkg; /* IA32_HWP_REQUEST_PKG */ unsigned int first_counter_read = 1; int ignore_stdin; +int get_msr(int cpu, off_t offset, unsigned long long *msr); + /* Model specific support Start */ /* List of features that may diverge among different platforms */ struct platform_features { bool has_msr_misc_feature_control; /* MSR_MISC_FEATURE_CONTROL */ bool has_msr_misc_pwr_mgmt; /* MSR_MISC_PWR_MGMT */ + int bclk_freq; /* CPU base clock */ }; struct platform_data { @@ -292,126 +294,185 @@ struct platform_data { const struct platform_features *features; }; +/* For BCLK */ +enum bclk_freq { + BCLK_100MHZ = 1, + BCLK_133MHZ, + BCLK_SLV, +}; + +#define SLM_BCLK_FREQS 5 +double slm_freq_table[SLM_BCLK_FREQS] = { 83.3, 100.0, 133.3, 116.7, 80.0 }; + +double slm_bclk(void) +{ + unsigned long long msr = 3; + unsigned int i; + double freq; + + if (get_msr(base_cpu, MSR_FSB_FREQ, &msr)) + fprintf(outf, "SLM BCLK: unknown\n"); + + i = msr & 0xf; + if (i >= SLM_BCLK_FREQS) { + fprintf(outf, "SLM BCLK[%d] invalid\n", i); + i = 3; + } + freq = slm_freq_table[i]; + + if (!quiet) + fprintf(outf, "SLM BCLK: %.1f Mhz\n", freq); + + return freq; +} + static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_133MHZ, }; static const struct platform_features nhx_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_133MHZ, }; static const struct platform_features snb_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features snx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features ivb_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features ivx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features hsw_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features hsx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features hswl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features hswg_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features bdw_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features 
bdwg_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features bdx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features skl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features cnl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features skx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features icx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features spr_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features slv_features = { + .bclk_freq = BCLK_SLV, }; static const struct platform_features slvd_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_SLV, }; static const struct platform_features amt_features = { + .bclk_freq = BCLK_133MHZ, }; static const struct platform_features gmt_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features gmtd_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features gmtp_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features tmtd_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features knl_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features default_features = { @@ -3907,6 +3968,30 @@ void check_permissions(void) exit(-6); } +void probe_bclk(void) +{ + unsigned long long msr; + unsigned int base_ratio; + + if (!do_nhm_platform_info) + return; + + if (platform->bclk_freq == BCLK_100MHZ) + bclk = 100.00; + else if (platform->bclk_freq == BCLK_133MHZ) + bclk = 133.33; + else if (platform->bclk_freq == BCLK_SLV) + bclk = slm_bclk(); + else + return; + + get_msr(base_cpu, MSR_PLATFORM_INFO, &msr); + base_ratio = (msr >> 8) & 0xFF; + + base_hz = base_ratio * bclk * 1000000; + has_base_hz = 1; +} + /* * NHM adds support for additional MSRs: * @@ -3928,7 +4013,6 @@ void check_permissions(void) int probe_nhm_msrs(unsigned int family, unsigned int model) { unsigned long long msr; - unsigned int base_ratio; int *pkg_cstate_limits; if (!genuine_intel) @@ -3937,8 +4021,6 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) if (family != 6) return 0; - bclk = discover_bclk(family, model); - switch (model) { case INTEL_FAM6_NEHALEM: /* Core i7 and i5 Processor - Clarksfield, Lynnfield, Jasper Forest */ case INTEL_FAM6_NEHALEM_EX: /* Nehalem-EX Xeon - Beckton */ @@ -3992,11 +4074,6 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr); pkg_cstate_limit = pkg_cstate_limits[msr & 0xF]; - get_msr(base_cpu, MSR_PLATFORM_INFO, &msr); - base_ratio = (msr >> 8) & 0xFF; - - base_hz = base_ratio * bclk * 1000000; - has_base_hz = 1; return 1; } @@ -5403,41 +5480,6 @@ unsigned int 
get_aperf_mperf_multiplier(unsigned int family, unsigned int model) return 1; } -#define SLM_BCLK_FREQS 5 -double slm_freq_table[SLM_BCLK_FREQS] = { 83.3, 100.0, 133.3, 116.7, 80.0 }; - -double slm_bclk(void) -{ - unsigned long long msr = 3; - unsigned int i; - double freq; - - if (get_msr(base_cpu, MSR_FSB_FREQ, &msr)) - fprintf(outf, "SLM BCLK: unknown\n"); - - i = msr & 0xf; - if (i >= SLM_BCLK_FREQS) { - fprintf(outf, "SLM BCLK[%d] invalid\n", i); - i = 3; - } - freq = slm_freq_table[i]; - - if (!quiet) - fprintf(outf, "SLM BCLK: %.1f Mhz\n", freq); - - return freq; -} - -double discover_bclk(unsigned int family, unsigned int model) -{ - if (has_snb_msrs(family, model) || is_knl(family, model)) - return 100.00; - else if (is_slm(family, model)) - return slm_bclk(); - else - return 133.33; -} - int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p) { unsigned int eax, ebx, ecx, edx; @@ -5929,6 +5971,7 @@ void process_cpuid() BIC_PRESENT(BIC_CPU_c6); BIC_PRESENT(BIC_SMI); } + probe_bclk(); do_snb_cstates = has_snb_msrs(family, model); if (do_snb_cstates) -- cgit From 3989fc890782c8002477895e9f24ffb98a132293 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 22:37:06 +0800 Subject: tools/power/turbostat: Abstract Package cstate limit decoding support Abstract the support for decoding package cstate limit from MSR_PKG_CST_CONFIG_CONTROL. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 112 ++++++++++++++++++++++++++-------- 1 file changed, 86 insertions(+), 26 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 66ba70017d53..bcad9332a3b2 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -287,6 +287,7 @@ struct platform_features { bool has_msr_misc_feature_control; /* MSR_MISC_FEATURE_CONTROL */ bool has_msr_misc_pwr_mgmt; /* MSR_MISC_PWR_MGMT */ int bclk_freq; /* CPU base clock */ + int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ }; struct platform_data { @@ -326,153 +327,193 @@ double slm_bclk(void) return freq; } +/* For Package cstate limit */ +enum package_cstate_limit { + CST_LIMIT_NHM = 1, + CST_LIMIT_SNB, + CST_LIMIT_HSW, + CST_LIMIT_SKX, + CST_LIMIT_ICX, + CST_LIMIT_SLV, + CST_LIMIT_AMT, + CST_LIMIT_KNL, + CST_LIMIT_GMT, +}; + static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_133MHZ, + .cst_limit = CST_LIMIT_NHM, }; static const struct platform_features nhx_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_133MHZ, + .cst_limit = CST_LIMIT_NHM, }; static const struct platform_features snb_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_SNB, }; static const struct platform_features snx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_SNB, }; static const struct platform_features ivb_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_SNB, }; static const struct platform_features ivx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_SNB, }; static const struct platform_features hsw_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct 
platform_features hsx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct platform_features hswl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct platform_features hswg_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct platform_features bdw_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct platform_features bdwg_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct platform_features bdx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct platform_features skl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct platform_features cnl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct platform_features skx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_SKX, }; static const struct platform_features icx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_ICX, }; static const struct platform_features spr_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_SKX, }; static const struct platform_features slv_features = { .bclk_freq = BCLK_SLV, + .cst_limit = CST_LIMIT_SLV, }; static const struct platform_features slvd_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_SLV, + .cst_limit = CST_LIMIT_SLV, }; static const struct platform_features amt_features = { .bclk_freq = BCLK_133MHZ, + .cst_limit = CST_LIMIT_AMT, }; static const struct platform_features gmt_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features gmtd_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features gmtp_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features tmtd_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features knl_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_KNL, }; static const struct platform_features default_features = { @@ -2704,6 +2745,50 @@ int icx_pkg_cstate_limits[16] = PCLRSV, PCLRSV }; +void probe_cst_limit(void) +{ + unsigned long long msr; + int *pkg_cstate_limits; + + if (!do_nhm_platform_info) + return; + + switch (platform->cst_limit) { + case CST_LIMIT_NHM: + pkg_cstate_limits = nhm_pkg_cstate_limits; + break; + case CST_LIMIT_SNB: + 
pkg_cstate_limits = snb_pkg_cstate_limits; + break; + case CST_LIMIT_HSW: + pkg_cstate_limits = hsw_pkg_cstate_limits; + break; + case CST_LIMIT_SKX: + pkg_cstate_limits = skx_pkg_cstate_limits; + break; + case CST_LIMIT_ICX: + pkg_cstate_limits = icx_pkg_cstate_limits; + break; + case CST_LIMIT_SLV: + pkg_cstate_limits = slv_pkg_cstate_limits; + break; + case CST_LIMIT_AMT: + pkg_cstate_limits = amt_pkg_cstate_limits; + break; + case CST_LIMIT_KNL: + pkg_cstate_limits = phi_pkg_cstate_limits; + break; + case CST_LIMIT_GMT: + pkg_cstate_limits = glm_pkg_cstate_limits; + break; + default: + return; + } + + get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr); + pkg_cstate_limit = pkg_cstate_limits[msr & 0xF]; +} + static void calculate_tsc_tweak() { tsc_tweak = base_hz / tsc_hz; @@ -4006,15 +4091,9 @@ void probe_bclk(void) * MSR_PKG_C6_RESIDENCY 0x000003f9 * MSR_CORE_C3_RESIDENCY 0x000003fc * MSR_CORE_C6_RESIDENCY 0x000003fd - * - * Side effect: - * sets global pkg_cstate_limit to decode MSR_PKG_CST_CONFIG_CONTROL */ int probe_nhm_msrs(unsigned int family, unsigned int model) { - unsigned long long msr; - int *pkg_cstate_limits; - if (!genuine_intel) return 0; @@ -4024,14 +4103,10 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) switch (model) { case INTEL_FAM6_NEHALEM: /* Core i7 and i5 Processor - Clarksfield, Lynnfield, Jasper Forest */ case INTEL_FAM6_NEHALEM_EX: /* Nehalem-EX Xeon - Beckton */ - pkg_cstate_limits = nhm_pkg_cstate_limits; - break; case INTEL_FAM6_SANDYBRIDGE: /* SNB */ case INTEL_FAM6_SANDYBRIDGE_X: /* SNB Xeon */ case INTEL_FAM6_IVYBRIDGE: /* IVB */ case INTEL_FAM6_IVYBRIDGE_X: /* IVB Xeon */ - pkg_cstate_limits = snb_pkg_cstate_limits; - break; case INTEL_FAM6_HASWELL: /* HSW */ case INTEL_FAM6_HASWELL_G: /* HSW */ case INTEL_FAM6_HASWELL_X: /* HSX */ @@ -4041,38 +4116,22 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) case INTEL_FAM6_BROADWELL_X: /* BDX */ case INTEL_FAM6_SKYLAKE_L: /* SKL */ case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - pkg_cstate_limits = hsw_pkg_cstate_limits; - break; case INTEL_FAM6_SKYLAKE_X: /* SKX */ case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ - pkg_cstate_limits = skx_pkg_cstate_limits; - break; case INTEL_FAM6_ICELAKE_X: /* ICX */ - pkg_cstate_limits = icx_pkg_cstate_limits; - break; case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ - /* FALLTHRU */ case INTEL_FAM6_ATOM_SILVERMONT_D: /* AVN */ - pkg_cstate_limits = slv_pkg_cstate_limits; - break; case INTEL_FAM6_ATOM_AIRMONT: /* AMT */ - pkg_cstate_limits = amt_pkg_cstate_limits; - break; case INTEL_FAM6_XEON_PHI_KNL: /* PHI */ - pkg_cstate_limits = phi_pkg_cstate_limits; - break; case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ case INTEL_FAM6_ATOM_TREMONT: /* EHL */ case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ - pkg_cstate_limits = glm_pkg_cstate_limits; break; default: return 0; } - get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr); - pkg_cstate_limit = pkg_cstate_limits[msr & 0xF]; return 1; } @@ -5964,6 +6023,7 @@ void process_cpuid() BIC_PRESENT(BIC_IRQ); BIC_PRESENT(BIC_TSC_MHz); + probe_cst_limit(); if (probe_nhm_msrs(family, model)) { do_nhm_platform_info = 1; BIC_PRESENT(BIC_CPU_c1); -- cgit From fcfa1ce074ab76272639961d4d7900b91657a8d5 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 25 Aug 2023 16:52:23 +0800 Subject: tools/power/turbostat: Abstract Nehalem MSRs support MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT, MSR_PKG_CST_CONFIG_CONTROL, and the TRL MSRs are always 
available for platforms since Nehalem. Support for these msrs can be described altogether. Abstract the support for these MSRs. Delete probe_nhm_msrs() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 101 ++++++++++++---------------------- 1 file changed, 34 insertions(+), 67 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bcad9332a3b2..bc221e800aec 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -237,7 +237,6 @@ unsigned int authentic_amd; unsigned int hygon_genuine; unsigned int max_level, max_extended_level; unsigned int has_invariant_tsc; -unsigned int do_nhm_platform_info; unsigned int aperf_mperf_multiplier = 1; double bclk; double base_hz; @@ -286,6 +285,7 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr); struct platform_features { bool has_msr_misc_feature_control; /* MSR_MISC_FEATURE_CONTROL */ bool has_msr_misc_pwr_mgmt; /* MSR_MISC_PWR_MGMT */ + bool has_nhm_msrs; /* MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT, MSR_PKG_CST_CONFIG_CONTROL, TRL MSRs */ int bclk_freq; /* CPU base clock */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ }; @@ -342,12 +342,14 @@ enum package_cstate_limit { static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, .cst_limit = CST_LIMIT_NHM, }; static const struct platform_features nhx_features = { .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, .cst_limit = CST_LIMIT_NHM, }; @@ -355,6 +357,7 @@ static const struct platform_features nhx_features = { static const struct platform_features snb_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, }; @@ -362,6 +365,7 @@ static const struct platform_features snb_features = { static const struct platform_features snx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, }; @@ -369,6 +373,7 @@ static const struct platform_features snx_features = { static const struct platform_features ivb_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, }; @@ -376,6 +381,7 @@ static const struct platform_features ivb_features = { static const struct platform_features ivx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, }; @@ -383,6 +389,7 @@ static const struct platform_features ivx_features = { static const struct platform_features hsw_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -390,6 +397,7 @@ static const struct platform_features hsw_features = { static const struct platform_features hsx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -397,6 +405,7 @@ static const struct platform_features hsx_features = { static const struct platform_features hswl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -404,6 +413,7 
@@ static const struct platform_features hswl_features = { static const struct platform_features hswg_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -411,6 +421,7 @@ static const struct platform_features hswg_features = { static const struct platform_features bdw_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -418,6 +429,7 @@ static const struct platform_features bdw_features = { static const struct platform_features bdwg_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -425,6 +437,7 @@ static const struct platform_features bdwg_features = { static const struct platform_features bdx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -432,6 +445,7 @@ static const struct platform_features bdx_features = { static const struct platform_features skl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -439,6 +453,7 @@ static const struct platform_features skl_features = { static const struct platform_features cnl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -446,6 +461,7 @@ static const struct platform_features cnl_features = { static const struct platform_features skx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SKX, }; @@ -453,6 +469,7 @@ static const struct platform_features skx_features = { static const struct platform_features icx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_ICX, }; @@ -460,58 +477,68 @@ static const struct platform_features icx_features = { static const struct platform_features spr_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SKX, }; static const struct platform_features slv_features = { + .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, .cst_limit = CST_LIMIT_SLV, }; static const struct platform_features slvd_features = { .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, .cst_limit = CST_LIMIT_SLV, }; static const struct platform_features amt_features = { + .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, .cst_limit = CST_LIMIT_AMT, }; static const struct platform_features gmt_features = { .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features gmtd_features = { .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features gmtp_features = { .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features tmtd_features = { 
.has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features knl_features = { .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_KNL, }; @@ -2750,7 +2777,7 @@ void probe_cst_limit(void) unsigned long long msr; int *pkg_cstate_limits; - if (!do_nhm_platform_info) + if (!platform->has_nhm_msrs) return; switch (platform->cst_limit) { @@ -4058,7 +4085,7 @@ void probe_bclk(void) unsigned long long msr; unsigned int base_ratio; - if (!do_nhm_platform_info) + if (!platform->has_nhm_msrs) return; if (platform->bclk_freq == BCLK_100MHZ) @@ -4077,65 +4104,6 @@ void probe_bclk(void) has_base_hz = 1; } -/* - * NHM adds support for additional MSRs: - * - * MSR_SMI_COUNT 0x00000034 - * - * MSR_PLATFORM_INFO 0x000000ce - * MSR_PKG_CST_CONFIG_CONTROL 0x000000e2 - * - * MSR_MISC_PWR_MGMT 0x000001aa - * - * MSR_PKG_C3_RESIDENCY 0x000003f8 - * MSR_PKG_C6_RESIDENCY 0x000003f9 - * MSR_CORE_C3_RESIDENCY 0x000003fc - * MSR_CORE_C6_RESIDENCY 0x000003fd - */ -int probe_nhm_msrs(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_NEHALEM: /* Core i7 and i5 Processor - Clarksfield, Lynnfield, Jasper Forest */ - case INTEL_FAM6_NEHALEM_EX: /* Nehalem-EX Xeon - Beckton */ - case INTEL_FAM6_SANDYBRIDGE: /* SNB */ - case INTEL_FAM6_SANDYBRIDGE_X: /* SNB Xeon */ - case INTEL_FAM6_IVYBRIDGE: /* IVB */ - case INTEL_FAM6_IVYBRIDGE_X: /* IVB Xeon */ - case INTEL_FAM6_HASWELL: /* HSW */ - case INTEL_FAM6_HASWELL_G: /* HSW */ - case INTEL_FAM6_HASWELL_X: /* HSX */ - case INTEL_FAM6_HASWELL_L: /* HSW */ - case INTEL_FAM6_BROADWELL: /* BDW */ - case INTEL_FAM6_BROADWELL_G: /* BDW */ - case INTEL_FAM6_BROADWELL_X: /* BDX */ - case INTEL_FAM6_SKYLAKE_L: /* SKL */ - case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - case INTEL_FAM6_SKYLAKE_X: /* SKX */ - case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ - case INTEL_FAM6_ICELAKE_X: /* ICX */ - case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ - case INTEL_FAM6_ATOM_SILVERMONT_D: /* AVN */ - case INTEL_FAM6_ATOM_AIRMONT: /* AMT */ - case INTEL_FAM6_XEON_PHI_KNL: /* PHI */ - case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ - case INTEL_FAM6_ATOM_GOLDMONT_PLUS: - case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ - case INTEL_FAM6_ATOM_TREMONT: /* EHL */ - case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ - break; - default: - return 0; - } - - return 1; -} - /* * SLV client has support for unique MSRs: * @@ -4461,7 +4429,7 @@ static void dump_turbo_ratio_info(unsigned int family, unsigned int model) static void dump_cstate_pstate_config_info(unsigned int family, unsigned int model) { - if (!do_nhm_platform_info) + if (!platform->has_nhm_msrs) return; dump_nhm_platform_info(); @@ -5606,7 +5574,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk } /* Temperature Target MSR is Nehalem and newer only */ - if (!do_nhm_platform_info) + if (!platform->has_nhm_msrs) goto guess; if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr)) @@ -5697,7 +5665,7 @@ void decode_misc_pwr_mgmt_msr(void) { unsigned long long msr; - if (!do_nhm_platform_info) + if (!platform->has_nhm_msrs) return; if (!platform->has_msr_misc_pwr_mgmt) @@ -6024,8 +5992,7 @@ void process_cpuid() BIC_PRESENT(BIC_TSC_MHz); probe_cst_limit(); - if (probe_nhm_msrs(family, model)) { - do_nhm_platform_info = 1; + if (platform->has_nhm_msrs) { BIC_PRESENT(BIC_CPU_c1); BIC_PRESENT(BIC_CPU_c3); BIC_PRESENT(BIC_CPU_c6); 
-- cgit From c2c25e85df316a624e4a8ee85d65ea29265f486d Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 27 Mar 2023 14:57:06 +0800 Subject: tools/power/turbostat: Remove a redundant check Platforms with has_msr_misc_pwr_mgmt set are a subset of platforms with has_nhm_msrs set. Thus remove the redundant check for platform->has_nhm_msrs. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bc221e800aec..4eb10491f714 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5665,9 +5665,6 @@ void decode_misc_pwr_mgmt_msr(void) { unsigned long long msr; - if (!platform->has_nhm_msrs) - return; - if (!platform->has_msr_misc_pwr_mgmt) return; -- cgit From 8b7199c0855e3b22532a21aaaed78e699431e4e5 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 27 Mar 2023 14:58:35 +0800 Subject: tools/power/turbostat: Rename some functions Rename dump_nhm_platform_info() and dump_nhm_cst_cfg() to dump_platform_info() and dump_cst_cfg() because these MSRs' behavior is consistent when they're available. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 4eb10491f714..654ae1ce130c 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2823,7 +2823,7 @@ static void calculate_tsc_tweak() void prewake_cstate_probe(unsigned int family, unsigned int model); -static void dump_nhm_platform_info(void) +static void dump_platform_info(void) { unsigned long long msr; unsigned int ratio; @@ -3059,7 +3059,7 @@ static void dump_knl_turbo_ratio_limits(void) ratio[i], bclk, ratio[i] * bclk, cores[i]); } -static void dump_nhm_cst_cfg(void) +static void dump_cst_cfg(void) { unsigned long long msr; @@ -4432,9 +4432,9 @@ static void dump_cstate_pstate_config_info(unsigned int family, unsigned int mod if (!platform->has_nhm_msrs) return; - dump_nhm_platform_info(); + dump_platform_info(); dump_turbo_ratio_info(family, model); - dump_nhm_cst_cfg(); + dump_cst_cfg(); } static int read_sysfs_int(char *path) -- cgit From 10d85d85ab4f3ae7faca4450ce7b0a166fd396f0 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 8 Sep 2023 23:16:00 +0800 Subject: tools/power/turbostat: Abstract Turbo Ratio Limit MSRs support Abstract the support for MSR_TURBO_RATIO_LIMIT, MSR_TURBO_RATIO_LIMIT1, MSR_TURBO_RATIO_LIMIT2, MSR_SECONDARY_TURBO_RATIO_LIMIT, MSR_ATOM_CORE_RATIOS and MSR_ATOM_CORE_TURBO_RATIOS. Delete has_turbo_ratio_group_limits(), has_turbo_ratio_limit(), has_atom_turbo_ratio_limit(), has_ivt_turbo_ratio_limit(), has_hsw_turbo_ratio_limit(), has_knl_turbo_ratio_limit() and has_glm_turbo_ratio_limit() CPU model checks.
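The replacement is a bitmask in the platform table: each turbo-ratio-limit MSR family becomes one flag, and the dump code tests the mask where it used to call a per-model helper. A standalone sketch of that flag style follows, with BIT() defined locally and flag values taken from the enum introduced in the diff below; the code itself is illustrative, not the turbostat source.

#include <stdio.h>

#define BIT(n)  (1U << (n))

/* One flag per turbo-ratio-limit MSR family, as in turbo_ratio_limit_msrs. */
enum turbo_ratio_limit_msrs {
    TRL_BASE      = BIT(0),     /* MSR_TURBO_RATIO_LIMIT */
    TRL_LIMIT1    = BIT(1),     /* MSR_TURBO_RATIO_LIMIT1 */
    TRL_LIMIT2    = BIT(2),     /* MSR_TURBO_RATIO_LIMIT2 */
    TRL_CORECOUNT = BIT(5),     /* ratio limits are grouped by core count */
};

struct platform_features {
    int trl_msrs;
};

/* One call site, gated purely by the mask -- no family/model arguments needed. */
static void dump_turbo_ratio_info(const struct platform_features *p)
{
    if (p->trl_msrs & TRL_LIMIT2)
        printf("dump MSR_TURBO_RATIO_LIMIT2\n");
    if (p->trl_msrs & TRL_LIMIT1)
        printf("dump MSR_TURBO_RATIO_LIMIT1\n");
    if (p->trl_msrs & TRL_BASE)
        printf("dump MSR_TURBO_RATIO_LIMIT%s\n",
               (p->trl_msrs & TRL_CORECOUNT) ? " plus per-group core counts" : "");
}

int main(void)
{
    const struct platform_features hsx_like = { .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2 };

    dump_turbo_ratio_info(&hsx_like);
    return 0;
}

With the mask carried in the table, adding a new platform means filling in one .trl_msrs initializer instead of touching every has_*_turbo_ratio_limit() switch.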
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 171 +++++++++------------------------- 1 file changed, 46 insertions(+), 125 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 654ae1ce130c..b6c53ec740d3 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -288,6 +288,7 @@ struct platform_features { bool has_nhm_msrs; /* MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT, MSR_PKG_CST_CONFIG_CONTROL, TRL MSRs */ int bclk_freq; /* CPU base clock */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ + int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ }; struct platform_data { @@ -340,11 +341,22 @@ enum package_cstate_limit { CST_LIMIT_GMT, }; +/* For Turbo Ratio Limit MSRs */ +enum turbo_ratio_limit_msrs { + TRL_BASE = BIT(0), + TRL_LIMIT1 = BIT(1), + TRL_LIMIT2 = BIT(2), + TRL_ATOM = BIT(3), + TRL_KNL = BIT(4), + TRL_CORECOUNT = BIT(5), +}; + static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, .cst_limit = CST_LIMIT_NHM, + .trl_msrs = TRL_BASE, }; static const struct platform_features nhx_features = { @@ -360,6 +372,7 @@ static const struct platform_features snb_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, + .trl_msrs = TRL_BASE, }; static const struct platform_features snx_features = { @@ -368,6 +381,7 @@ static const struct platform_features snx_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, + .trl_msrs = TRL_BASE, }; static const struct platform_features ivb_features = { @@ -376,6 +390,7 @@ static const struct platform_features ivb_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, + .trl_msrs = TRL_BASE, }; static const struct platform_features ivx_features = { @@ -384,6 +399,7 @@ static const struct platform_features ivx_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, + .trl_msrs = TRL_BASE | TRL_LIMIT1, }; static const struct platform_features hsw_features = { @@ -392,6 +408,7 @@ static const struct platform_features hsw_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = TRL_BASE, }; static const struct platform_features hsx_features = { @@ -400,6 +417,7 @@ static const struct platform_features hsx_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, }; static const struct platform_features hswl_features = { @@ -408,6 +426,7 @@ static const struct platform_features hswl_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = TRL_BASE, }; static const struct platform_features hswg_features = { @@ -416,6 +435,7 @@ static const struct platform_features hswg_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = TRL_BASE, }; static const struct platform_features bdw_features = { @@ -424,6 +444,7 @@ static const struct platform_features bdw_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = TRL_BASE, }; static const struct platform_features bdwg_features = { @@ -432,6 +453,7 @@ static const struct platform_features bdwg_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = 
TRL_BASE, }; static const struct platform_features bdx_features = { @@ -440,6 +462,7 @@ static const struct platform_features bdx_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = TRL_BASE, }; static const struct platform_features skl_features = { @@ -448,6 +471,7 @@ static const struct platform_features skl_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = TRL_BASE, }; static const struct platform_features cnl_features = { @@ -456,6 +480,7 @@ static const struct platform_features cnl_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = TRL_BASE, }; static const struct platform_features skx_features = { @@ -464,6 +489,7 @@ static const struct platform_features skx_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SKX, + .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; static const struct platform_features icx_features = { @@ -472,6 +498,7 @@ static const struct platform_features icx_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_ICX, + .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; static const struct platform_features spr_features = { @@ -480,12 +507,14 @@ static const struct platform_features spr_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SKX, + .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; static const struct platform_features slv_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, .cst_limit = CST_LIMIT_SLV, + .trl_msrs = TRL_ATOM, }; static const struct platform_features slvd_features = { @@ -493,12 +522,14 @@ static const struct platform_features slvd_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, .cst_limit = CST_LIMIT_SLV, + .trl_msrs = TRL_BASE, }; static const struct platform_features amt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, .cst_limit = CST_LIMIT_AMT, + .trl_msrs = TRL_BASE, }; static const struct platform_features gmt_features = { @@ -506,6 +537,7 @@ static const struct platform_features gmt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, + .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; static const struct platform_features gmtd_features = { @@ -513,6 +545,7 @@ static const struct platform_features gmtd_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, + .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; static const struct platform_features gmtp_features = { @@ -520,6 +553,7 @@ static const struct platform_features gmtp_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, + .trl_msrs = TRL_BASE, }; static const struct platform_features tmt_features = { @@ -527,6 +561,7 @@ static const struct platform_features tmt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, + .trl_msrs = TRL_BASE, }; static const struct platform_features tmtd_features = { @@ -534,6 +569,7 @@ static const struct platform_features tmtd_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, + .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; static const struct platform_features knl_features = { @@ -541,6 +577,7 @@ static const struct platform_features knl_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_KNL, + .trl_msrs = TRL_KNL, }; static const struct platform_features default_features = { @@ -2911,29 +2948,7 @@ static void dump_ivt_turbo_ratio_limits(void) return; } 
-int has_turbo_ratio_group_limits(int family, int model) -{ - - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_SKYLAKE_X: - case INTEL_FAM6_ICELAKE_X: - case INTEL_FAM6_SAPPHIRERAPIDS_X: - case INTEL_FAM6_ATOM_GOLDMONT_D: - case INTEL_FAM6_ATOM_TREMONT_D: - return 1; - default: - return 0; - } -} - -static void dump_turbo_ratio_limits(int trl_msr_offset, int family, int model) +static void dump_turbo_ratio_limits(int trl_msr_offset) { unsigned long long msr, core_counts; int shift; @@ -2942,7 +2957,7 @@ static void dump_turbo_ratio_limits(int trl_msr_offset, int family, int model) fprintf(outf, "cpu%d: MSR_%sTURBO_RATIO_LIMIT: 0x%08llx\n", base_cpu, trl_msr_offset == MSR_SECONDARY_TURBO_RATIO_LIMIT ? "SECONDARY_" : "", msr); - if (has_turbo_ratio_group_limits(family, model)) { + if (platform->trl_msrs & TRL_CORECOUNT) { get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT1, &core_counts); fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT1: 0x%08llx\n", base_cpu, core_counts); } else { @@ -4236,100 +4251,6 @@ int is_jvl(unsigned int family, unsigned int model) return 0; } -int has_turbo_ratio_limit(unsigned int family, unsigned int model) -{ - if (has_slv_msrs(family, model)) - return 0; - - if (family != 6) - return 0; - - switch (model) { - /* Nehalem compatible, but do not include turbo-ratio limit support */ - case INTEL_FAM6_NEHALEM_EX: /* Nehalem-EX Xeon - Beckton */ - case INTEL_FAM6_XEON_PHI_KNL: /* PHI - Knights Landing (different MSR definition) */ - return 0; - default: - return 1; - } -} - -int has_atom_turbo_ratio_limit(unsigned int family, unsigned int model) -{ - if (has_slv_msrs(family, model)) - return 1; - - return 0; -} - -int has_ivt_turbo_ratio_limit(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_IVYBRIDGE_X: /* IVB Xeon */ - case INTEL_FAM6_HASWELL_X: /* HSW Xeon */ - return 1; - default: - return 0; - } -} - -int has_hsw_turbo_ratio_limit(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_HASWELL_X: /* HSW Xeon */ - return 1; - default: - return 0; - } -} - -int has_knl_turbo_ratio_limit(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_XEON_PHI_KNL: /* Knights Landing */ - return 1; - default: - return 0; - } -} - -int has_glm_turbo_ratio_limit(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_SKYLAKE_X: - case INTEL_FAM6_ICELAKE_X: - case INTEL_FAM6_SAPPHIRERAPIDS_X: - return 1; - default: - return 0; - } -} - int has_config_tdp(unsigned int family, unsigned int model) { if (!genuine_intel) @@ -4404,23 +4325,23 @@ static void dump_turbo_ratio_info(unsigned int family, unsigned int model) if (!has_turbo) return; - if (has_hsw_turbo_ratio_limit(family, model)) + if (platform->trl_msrs & TRL_LIMIT2) dump_hsw_turbo_ratio_limits(); - if (has_ivt_turbo_ratio_limit(family, model)) + if (platform->trl_msrs & TRL_LIMIT1) dump_ivt_turbo_ratio_limits(); - if (has_turbo_ratio_limit(family, model)) { - dump_turbo_ratio_limits(MSR_TURBO_RATIO_LIMIT, family, model); + if (platform->trl_msrs & TRL_BASE) { + dump_turbo_ratio_limits(MSR_TURBO_RATIO_LIMIT); if (is_hybrid) - 
dump_turbo_ratio_limits(MSR_SECONDARY_TURBO_RATIO_LIMIT, family, model); + dump_turbo_ratio_limits(MSR_SECONDARY_TURBO_RATIO_LIMIT); } - if (has_atom_turbo_ratio_limit(family, model)) + if (platform->trl_msrs & TRL_ATOM) dump_atom_turbo_ratio_limits(); - if (has_knl_turbo_ratio_limit(family, model)) + if (platform->trl_msrs & TRL_KNL) dump_knl_turbo_ratio_limits(); if (has_config_tdp(family, model)) -- cgit From a3943deaf98f713c819dd7e67af734d4ed4da030 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 30 Jul 2023 13:54:25 +0800 Subject: tools/power/turbostat: Rename some TRL functions Rename dump_hsw_turbo_ratio_limits() and dump_ivt_turbo_ratio_limits() to dump_turbo_ratio_limit2() and dump_turbo_ratio_limit1() because they dump MSR_TURBO_RATIO_LIMIT1/LIMIT2, and the MSRs' behavior is consistent when they are available. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b6c53ec740d3..abcc055ea0e1 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2886,7 +2886,7 @@ static void dump_platform_info(void) return; } -static void dump_hsw_turbo_ratio_limits(void) +static void dump_turbo_ratio_limit2(void) { unsigned long long msr; unsigned int ratio; @@ -2905,7 +2905,7 @@ static void dump_hsw_turbo_ratio_limits(void) return; } -static void dump_ivt_turbo_ratio_limits(void) +static void dump_turbo_ratio_limit1(void) { unsigned long long msr; unsigned int ratio; @@ -4326,10 +4326,10 @@ static void dump_turbo_ratio_info(unsigned int family, unsigned int model) return; if (platform->trl_msrs & TRL_LIMIT2) - dump_hsw_turbo_ratio_limits(); + dump_turbo_ratio_limit2(); if (platform->trl_msrs & TRL_LIMIT1) - dump_ivt_turbo_ratio_limits(); + dump_turbo_ratio_limit1(); if (platform->trl_msrs & TRL_BASE) { dump_turbo_ratio_limits(MSR_TURBO_RATIO_LIMIT); -- cgit From a61c9cb478c0bf31a50d1829834e822e92876c95 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 14:19:46 +0800 Subject: tools/power/turbostat: Abstract Config TDP MSRs support Abstract the support for MSR_CONFIG_TDP_NOMINAL/LEVEL_1/LEVEL_2/CONTROL and MSR_TURBO_ACTIVATION_RATIO. Delete has_config_tdp() CPU model check. 
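Here the capability is a single boolean, has_config_tdp, consulted before the MSR_CONFIG_TDP_* dump. For readers without the rest of the tree at hand, a minimal sketch of what such a gated dump boils down to, reading the MSR through the /dev/cpu/N/msr interface; the helper name and the trimmed error handling are this sketch's own, not the turbostat implementation.

#include <stdio.h>
#include <stdbool.h>
#include <fcntl.h>
#include <unistd.h>

#define MSR_CONFIG_TDP_NOMINAL  0x648   /* architectural MSR address */

/* Minimal stand-alone MSR read via the msr driver. */
static int read_msr(int cpu, unsigned int offset, unsigned long long *val)
{
    char path[64];
    int fd, ret = 0;

    snprintf(path, sizeof(path), "/dev/cpu/%d/msr", cpu);
    fd = open(path, O_RDONLY);
    if (fd < 0)
        return -1;
    if (pread(fd, val, sizeof(*val), offset) != (ssize_t)sizeof(*val))
        ret = -1;
    close(fd);
    return ret;
}

/* The feature-table boolean replaces the old has_config_tdp() model switch. */
static void dump_config_tdp_nominal(bool has_config_tdp)
{
    unsigned long long msr;

    if (!has_config_tdp)
        return;
    if (read_msr(0, MSR_CONFIG_TDP_NOMINAL, &msr))
        return;
    printf("MSR_CONFIG_TDP_NOMINAL: 0x%08llx (base ratio %llu)\n", msr, msr & 0xff);
}

int main(void)
{
    dump_config_tdp_nominal(true);      /* needs root and the msr module loaded */
    return 0;
}

Turbostat's own get_msr() reads the same /dev/cpu/<cpu>/msr node; the point of the patch is only that the decision to dump now comes from the feature table.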
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 54 +++++++++++++---------------------- 1 file changed, 20 insertions(+), 34 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index abcc055ea0e1..bb9d8c2605c8 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -286,6 +286,7 @@ struct platform_features { bool has_msr_misc_feature_control; /* MSR_MISC_FEATURE_CONTROL */ bool has_msr_misc_pwr_mgmt; /* MSR_MISC_PWR_MGMT */ bool has_nhm_msrs; /* MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT, MSR_PKG_CST_CONFIG_CONTROL, TRL MSRs */ + bool has_config_tdp; /* MSR_CONFIG_TDP_NOMINAL/LEVEL_1/LEVEL_2/CONTROL, MSR_TURBO_ACTIVATION_RATIO */ int bclk_freq; /* CPU base clock */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ @@ -388,6 +389,7 @@ static const struct platform_features ivb_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, @@ -406,6 +408,7 @@ static const struct platform_features hsw_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, @@ -415,6 +418,7 @@ static const struct platform_features hsx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, @@ -424,6 +428,7 @@ static const struct platform_features hswl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, @@ -433,6 +438,7 @@ static const struct platform_features hswg_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, @@ -442,6 +448,7 @@ static const struct platform_features bdw_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, @@ -451,6 +458,7 @@ static const struct platform_features bdwg_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, @@ -460,6 +468,7 @@ static const struct platform_features bdx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, @@ -469,6 +478,7 @@ static const struct platform_features skl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, @@ -478,6 +488,7 @@ static const struct platform_features cnl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, @@ 
-487,6 +498,7 @@ static const struct platform_features skx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SKX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -496,6 +508,7 @@ static const struct platform_features icx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_ICX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -505,6 +518,7 @@ static const struct platform_features spr_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SKX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -575,6 +589,7 @@ static const struct platform_features tmtd_features = { static const struct platform_features knl_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_KNL, .trl_msrs = TRL_KNL, @@ -4251,35 +4266,6 @@ int is_jvl(unsigned int family, unsigned int model) return 0; } -int has_config_tdp(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_IVYBRIDGE: /* IVB */ - case INTEL_FAM6_HASWELL: /* HSW */ - case INTEL_FAM6_HASWELL_X: /* HSX */ - case INTEL_FAM6_HASWELL_L: /* HSW */ - case INTEL_FAM6_HASWELL_G: /* HSW */ - case INTEL_FAM6_BROADWELL: /* BDW */ - case INTEL_FAM6_BROADWELL_G: /* BDW */ - case INTEL_FAM6_BROADWELL_X: /* BDX */ - case INTEL_FAM6_SKYLAKE_L: /* SKL */ - case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - case INTEL_FAM6_SKYLAKE_X: /* SKX */ - case INTEL_FAM6_ICELAKE_X: /* ICX */ - case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ - case INTEL_FAM6_XEON_PHI_KNL: /* Knights Landing */ - return 1; - default: - return 0; - } -} - /* * tcc_offset_bits: * 0: Tcc Offset not supported (Default) @@ -4320,7 +4306,7 @@ static void remove_underbar(char *s) *to = 0; } -static void dump_turbo_ratio_info(unsigned int family, unsigned int model) +static void dump_turbo_ratio_info(void) { if (!has_turbo) return; @@ -4344,17 +4330,17 @@ static void dump_turbo_ratio_info(unsigned int family, unsigned int model) if (platform->trl_msrs & TRL_KNL) dump_knl_turbo_ratio_limits(); - if (has_config_tdp(family, model)) + if (platform->has_config_tdp) dump_config_tdp(); } -static void dump_cstate_pstate_config_info(unsigned int family, unsigned int model) +static void dump_cstate_pstate_config_info(void) { if (!platform->has_nhm_msrs) return; dump_platform_info(); - dump_turbo_ratio_info(family, model); + dump_turbo_ratio_info(); dump_cst_cfg(); } @@ -6000,7 +5986,7 @@ void process_cpuid() check_tcc_offset(model); if (!quiet) - dump_cstate_pstate_config_info(family, model); + dump_cstate_pstate_config_info(); intel_uncore_frequency_probe(); if (!quiet) -- cgit From d8e1623baa0b49aa90cf5801cfaac9e3d3aa1e19 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 25 Aug 2023 23:04:44 +0800 Subject: tools/power/turbostat: Abstract TCC Offset bits support Abstract the support for different TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET. Delete check_tcc_offset() CPU model check. 
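What varies per platform is only the width of the offset field in MSR_IA32_TEMPERATURE_TARGET, so an integer tcc_offset_bits in the feature table is enough. A small, self-contained example of the resulting extraction follows, with GENMASK() defined locally and a made-up register value; the enable check against MSR_PLATFORM_INFO that the real code performs is omitted here.

#include <stdio.h>

#define GENMASK(h, l)   (((~0ULL) << (l)) & (~0ULL >> (63 - (h))))

/*
 * Extract the TCC activation offset from an MSR_IA32_TEMPERATURE_TARGET value.
 * The field starts at bit 24; its width is the per-platform tcc_offset_bits
 * (0 means the offset is not supported).
 */
static unsigned int tcc_offset(unsigned long long msr, int tcc_offset_bits)
{
    if (!tcc_offset_bits)
        return 0;
    return (msr >> 24) & GENMASK(tcc_offset_bits - 1, 0);
}

int main(void)
{
    /* Made-up register value: Tj_max 100 C in bits 23:16, offset 5 in bits 29:24. */
    unsigned long long msr = (5ULL << 24) | (100ULL << 16);
    unsigned int tj_max = (msr >> 16) & 0xff;
    unsigned int off = tcc_offset(msr, 6);

    printf("target %u C (Tj_max %u - offset %u)\n", tj_max - off, tj_max, off);
    return 0;
}

A width of 0 doubles as "not supported", which is why table entries that never set .tcc_offset_bits need no special handling.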
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 53 ++++++++--------------------------- 1 file changed, 12 insertions(+), 41 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bb9d8c2605c8..c39beb4078da 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -256,7 +256,6 @@ unsigned int gfx_cur_mhz; unsigned int gfx_act_mhz; unsigned int tj_max; unsigned int tj_max_override; -int tcc_offset_bits; double rapl_power_units, rapl_time_units; double rapl_dram_energy_units, rapl_energy_units; double rapl_joule_counter_range; @@ -290,6 +289,7 @@ struct platform_features { int bclk_freq; /* CPU base clock */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ + int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ }; struct platform_data { @@ -482,6 +482,7 @@ static const struct platform_features skl_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, + .tcc_offset_bits = 6, }; static const struct platform_features cnl_features = { @@ -492,6 +493,7 @@ static const struct platform_features cnl_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, + .tcc_offset_bits = 6, }; static const struct platform_features skx_features = { @@ -4266,33 +4268,6 @@ int is_jvl(unsigned int family, unsigned int model) return 0; } -/* - * tcc_offset_bits: - * 0: Tcc Offset not supported (Default) - * 6: Bit 29:24 of MSR_PLATFORM_INFO - * 4: Bit 27:24 of MSR_PLATFORM_INFO - */ -void check_tcc_offset(int model) -{ - unsigned long long msr; - - if (!genuine_intel) - return; - - switch (model) { - case INTEL_FAM6_SKYLAKE_L: - case INTEL_FAM6_CANNONLAKE_L: - if (!get_msr(base_cpu, MSR_PLATFORM_INFO, &msr)) { - msr = (msr >> 30) & 1; - if (msr) - tcc_offset_bits = 6; - } - return; - default: - return; - } -} - static void remove_underbar(char *s) { char *to = s; @@ -5490,20 +5465,18 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk tcc_default = (msr >> 16) & 0xFF; if (!quiet) { - switch (tcc_offset_bits) { - case 4: - tcc_offset = (msr >> 24) & 0xF; - fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n", - cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset); - break; - case 6: - tcc_offset = (msr >> 24) & 0x3F; + int bits = platform->tcc_offset_bits; + unsigned long long enabled = 0; + + if (bits && !get_msr(base_cpu, MSR_PLATFORM_INFO, &enabled)) + enabled = (enabled >> 30) & 1; + + if (bits && enabled) { + tcc_offset = (msr >> 24) & GENMASK(bits - 1, 0); fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n", cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset); - break; - default: + } else { fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", cpu, msr, tcc_default); - break; } } @@ -5983,8 +5956,6 @@ void process_cpuid() automatic_cstate_conversion_probe(family, model); prewake_cstate_probe(family, model); - check_tcc_offset(model); - if (!quiet) dump_cstate_pstate_config_info(); intel_uncore_frequency_probe(); -- cgit From 0c057cf7a0e163bf9631b83c002b3a55691674c7 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 30 Jul 2023 14:25:17 +0800 Subject: tools/power/turbostat: Abstract Perf Limit Reasons MSRs support Abstract the support for 
MSR_CORE/GFX/RING_PERF_LIMIT_REASONS MSRs. Delete perf_limit_reasons_probe() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 44 ++++++++++++----------------------- 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c39beb4078da..1207845340ad 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -259,11 +259,8 @@ unsigned int tj_max_override; double rapl_power_units, rapl_time_units; double rapl_dram_energy_units, rapl_energy_units; double rapl_joule_counter_range; -unsigned int do_core_perf_limit_reasons; unsigned int has_automatic_cstate_conversion; unsigned int dis_cstate_prewake; -unsigned int do_gfx_perf_limit_reasons; -unsigned int do_ring_perf_limit_reasons; unsigned int crystal_hz; unsigned long long tsc_hz; int base_cpu; @@ -289,6 +286,7 @@ struct platform_features { int bclk_freq; /* CPU base clock */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ + int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ }; @@ -352,6 +350,13 @@ enum turbo_ratio_limit_msrs { TRL_CORECOUNT = BIT(5), }; +/* For Perf Limit Reason MSRs */ +enum perf_limit_reason_msrs { + PLR_CORE = BIT(0), + PLR_GFX = BIT(1), + PLR_RING = BIT(2), +}; + static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, @@ -412,6 +417,7 @@ static const struct platform_features hsw_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, + .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, }; static const struct platform_features hsx_features = { @@ -422,6 +428,7 @@ static const struct platform_features hsx_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, + .plr_msrs = PLR_CORE | PLR_RING, }; static const struct platform_features hswl_features = { @@ -432,6 +439,7 @@ static const struct platform_features hswl_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, + .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, }; static const struct platform_features hswg_features = { @@ -442,6 +450,7 @@ static const struct platform_features hswg_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, + .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, }; static const struct platform_features bdw_features = { @@ -4657,7 +4666,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data return -1; } - if (do_core_perf_limit_reasons) { + if (platform->plr_msrs & PLR_CORE) { get_msr(cpu, MSR_CORE_PERF_LIMIT_REASONS, &msr); fprintf(outf, "cpu%d: MSR_CORE_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr); fprintf(outf, " (Active: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)", @@ -4690,7 +4699,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data (msr & 1 << 17) ? "ThermStatus, " : "", (msr & 1 << 16) ? "PROCHOT, " : ""); } - if (do_gfx_perf_limit_reasons) { + if (platform->plr_msrs & PLR_GFX) { get_msr(cpu, MSR_GFX_PERF_LIMIT_REASONS, &msr); fprintf(outf, "cpu%d: MSR_GFX_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr); fprintf(outf, " (Active: %s%s%s%s%s%s%s%s)", @@ -4710,7 +4719,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data (msr & 1 << 25) ? 
"GFXPwr, " : "", (msr & 1 << 26) ? "PkgPwrL1, " : "", (msr & 1 << 27) ? "PkgPwrL2, " : ""); } - if (do_ring_perf_limit_reasons) { + if (platform->plr_msrs & PLR_RING) { get_msr(cpu, MSR_RING_PERF_LIMIT_REASONS, &msr); fprintf(outf, "cpu%d: MSR_RING_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr); fprintf(outf, " (Active: %s%s%s%s%s%s)", @@ -5002,28 +5011,6 @@ void rapl_probe(unsigned int family, unsigned int model) rapl_probe_amd(family, model); } -void perf_limit_reasons_probe(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return; - - if (family != 6) - return; - - switch (model) { - case INTEL_FAM6_HASWELL: /* HSW */ - case INTEL_FAM6_HASWELL_L: /* HSW */ - case INTEL_FAM6_HASWELL_G: /* HSW */ - do_gfx_perf_limit_reasons = 1; - /* FALLTHRU */ - case INTEL_FAM6_HASWELL_X: /* HSX */ - do_core_perf_limit_reasons = 1; - do_ring_perf_limit_reasons = 1; - default: - return; - } -} - void automatic_cstate_conversion_probe(unsigned int family, unsigned int model) { if (family != 6) @@ -5952,7 +5939,6 @@ void process_cpuid() decode_c6_demotion_policy_msr(); rapl_probe(family, model); - perf_limit_reasons_probe(family, model); automatic_cstate_conversion_probe(family, model); prewake_cstate_probe(family, model); -- cgit From d90120bf9f111bec8ba0b8ef86c46ccbcd9df188 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 22 Apr 2023 11:29:18 +0800 Subject: tools/power/turbostat: Abstract Automatic Cstate Conversion support Abstract the support for AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL. Delete automatic_cstate_conversion_probe() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 1207845340ad..a235cbf7b581 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -259,7 +259,6 @@ unsigned int tj_max_override; double rapl_power_units, rapl_time_units; double rapl_dram_energy_units, rapl_energy_units; double rapl_joule_counter_range; -unsigned int has_automatic_cstate_conversion; unsigned int dis_cstate_prewake; unsigned int crystal_hz; unsigned long long tsc_hz; @@ -285,6 +284,7 @@ struct platform_features { bool has_config_tdp; /* MSR_CONFIG_TDP_NOMINAL/LEVEL_1/LEVEL_2/CONTROL, MSR_TURBO_ACTIVATION_RATIO */ int bclk_freq; /* CPU base clock */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ + bool has_cst_auto_convension; /* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ @@ -480,6 +480,7 @@ static const struct platform_features bdx_features = { .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, }; @@ -512,6 +513,7 @@ static const struct platform_features skx_features = { .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SKX, + .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; @@ -3116,7 +3118,7 @@ static void dump_cst_cfg(void) (msr & (1 << 15)) ? 
"" : "UN", (unsigned int)msr & 0xF, pkg_cstate_limit_strings[pkg_cstate_limit]); #define AUTOMATIC_CSTATE_CONVERSION (1UL << 16) - if (has_automatic_cstate_conversion) { + if (platform->has_cst_auto_convension) { fprintf(outf, ", automatic c-state conversion=%s", (msr & AUTOMATIC_CSTATE_CONVERSION) ? "on" : "off"); } @@ -5011,18 +5013,6 @@ void rapl_probe(unsigned int family, unsigned int model) rapl_probe_amd(family, model); } -void automatic_cstate_conversion_probe(unsigned int family, unsigned int model) -{ - if (family != 6) - return; - - switch (model) { - case INTEL_FAM6_BROADWELL_X: - case INTEL_FAM6_SKYLAKE_X: - has_automatic_cstate_conversion = 1; - } -} - void prewake_cstate_probe(unsigned int family, unsigned int model) { if (is_icx(family, model) || is_spr(family, model)) @@ -5939,7 +5929,6 @@ void process_cpuid() decode_c6_demotion_policy_msr(); rapl_probe(family, model); - automatic_cstate_conversion_probe(family, model); prewake_cstate_probe(family, model); if (!quiet) -- cgit From a5d1ab93a0993616efbf61378e491d6673f4684d Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 22 Apr 2023 11:39:49 +0800 Subject: tools/power/turbostat: Abstract hardcoded Crystal Clock frequency Abstract the support for hardcoded Crystal Clock frequency, which is used when crystal clock is not available from CPUID.15. Delete CPU model checks in process_cpuid(). Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a235cbf7b581..c76baa10f4eb 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -283,6 +283,7 @@ struct platform_features { bool has_nhm_msrs; /* MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT, MSR_PKG_CST_CONFIG_CONTROL, TRL MSRs */ bool has_config_tdp; /* MSR_CONFIG_TDP_NOMINAL/LEVEL_1/LEVEL_2/CONTROL, MSR_TURBO_ACTIVATION_RATIO */ int bclk_freq; /* CPU base clock */ + int crystal_freq; /* Crystal clock to use when not available from CPUID.15 */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ bool has_cst_auto_convension; /* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ @@ -490,6 +491,7 @@ static const struct platform_features skl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .crystal_freq = 24000000, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -563,6 +565,7 @@ static const struct platform_features gmt_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, + .crystal_freq = 19200000, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; @@ -571,6 +574,7 @@ static const struct platform_features gmtd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, + .crystal_freq = 25000000, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; @@ -579,6 +583,7 @@ static const struct platform_features gmtp_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, + .crystal_freq = 19200000, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, }; @@ -5796,26 +5801,12 @@ void process_cpuid() __cpuid(0x15, eax_crystal, ebx_tsc, crystal_hz, edx); if (ebx_tsc != 0) { - if (!quiet && (ebx != 0)) fprintf(outf, "CPUID(0x15): eax_crystal: %d ebx_tsc: %d 
ecx_crystal_hz: %d\n", eax_crystal, ebx_tsc, crystal_hz); if (crystal_hz == 0) - switch (model) { - case INTEL_FAM6_SKYLAKE_L: /* SKL */ - crystal_hz = 24000000; /* 24.0 MHz */ - break; - case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ - crystal_hz = 25000000; /* 25.0 MHz */ - break; - case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ - case INTEL_FAM6_ATOM_GOLDMONT_PLUS: - crystal_hz = 19200000; /* 19.2 MHz */ - break; - default: - crystal_hz = 0; - } + crystal_hz = platform->crystal_freq; if (crystal_hz) { tsc_hz = (unsigned long long)crystal_hz *ebx_tsc / eax_crystal; -- cgit From b9cd66833d3a651cea10666674e9abcf2182e8ad Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 26 Aug 2023 14:38:38 +0800 Subject: tools/power/turbostat: Redefine RAPL macros Redefine RAPL macros to make the code more readable. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 102 +++++++++++++++------------------- 1 file changed, 45 insertions(+), 57 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c76baa10f4eb..4829e8289feb 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -358,6 +358,40 @@ enum perf_limit_reason_msrs { PLR_RING = BIT(2), }; +/* For RAPL MSRs */ +enum rapl_msrs { + RAPL_PKG_POWER_LIMIT = BIT(0), /* 0x610 MSR_PKG_POWER_LIMIT */ + RAPL_PKG_ENERGY_STATUS = BIT(1), /* 0x611 MSR_PKG_ENERGY_STATUS */ + RAPL_PKG_PERF_STATUS = BIT(2), /* 0x613 MSR_PKG_PERF_STATUS */ + RAPL_PKG_POWER_INFO = BIT(3), /* 0x614 MSR_PKG_POWER_INFO */ + RAPL_DRAM_POWER_LIMIT = BIT(4), /* 0x618 MSR_DRAM_POWER_LIMIT */ + RAPL_DRAM_ENERGY_STATUS = BIT(5), /* 0x619 MSR_DRAM_ENERGY_STATUS */ + RAPL_DRAM_PERF_STATUS = BIT(6), /* 0x61b MSR_DRAM_PERF_STATUS */ + RAPL_DRAM_POWER_INFO = BIT(7), /* 0x61c MSR_DRAM_POWER_INFO */ + RAPL_CORE_POWER_LIMIT = BIT(8), /* 0x638 MSR_PP0_POWER_LIMIT */ + RAPL_CORE_ENERGY_STATUS = BIT(9), /* 0x639 MSR_PP0_ENERGY_STATUS */ + RAPL_CORE_POLICY = BIT(10), /* 0x63a MSR_PP0_POLICY */ + RAPL_GFX_POWER_LIMIT = BIT(11), /* 0x640 MSR_PP1_POWER_LIMIT */ + RAPL_GFX_ENERGY_STATUS = BIT(12), /* 0x641 MSR_PP1_ENERGY_STATUS */ + RAPL_GFX_POLICY = BIT(13), /* 0x642 MSR_PP1_POLICY */ + RAPL_AMD_PWR_UNIT = BIT(14), /* 0xc0010299 MSR_AMD_RAPL_POWER_UNIT */ + RAPL_AMD_CORE_ENERGY_STAT = BIT(15), /* 0xc001029a MSR_AMD_CORE_ENERGY_STATUS */ + RAPL_AMD_PKG_ENERGY_STAT = BIT(16), /* 0xc001029b MSR_AMD_PKG_ENERGY_STATUS */ + RAPL_PER_CORE_ENERGY = BIT(17), /* Indicates cores energy collection is per-core, not per-package. 
*/ +}; + +#define RAPL_PKG (RAPL_PKG_ENERGY_STATUS | RAPL_PKG_POWER_LIMIT) +#define RAPL_DRAM (RAPL_DRAM_ENERGY_STATUS | RAPL_DRAM_POWER_LIMIT) +#define RAPL_CORE (RAPL_CORE_ENERGY_STATUS | RAPL_CORE_POWER_LIMIT) +#define RAPL_GFX (RAPL_GFX_POWER_LIMIT | RAPL_GFX_ENERGY_STATUS) + +#define RAPL_PKG_ALL (RAPL_PKG | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO) +#define RAPL_DRAM_ALL (RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_DRAM_POWER_INFO) +#define RAPL_CORE_ALL (RAPL_CORE | RAPL_CORE_POLICY) +#define RAPL_GFX_ALL (RAPL_GFX | RAPL_GFX_POLIGY) + +#define RAPL_AMD_F17H (RAPL_AMD_PWR_UNIT | RAPL_AMD_CORE_ENERGY_STAT | RAPL_AMD_PKG_ENERGY_STAT) + static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, @@ -712,42 +746,6 @@ void probe_platform_features(unsigned int family, unsigned int model) /* Model specific support End */ -#define RAPL_PKG (1 << 0) - /* 0x610 MSR_PKG_POWER_LIMIT */ - /* 0x611 MSR_PKG_ENERGY_STATUS */ -#define RAPL_PKG_PERF_STATUS (1 << 1) - /* 0x613 MSR_PKG_PERF_STATUS */ -#define RAPL_PKG_POWER_INFO (1 << 2) - /* 0x614 MSR_PKG_POWER_INFO */ - -#define RAPL_DRAM (1 << 3) - /* 0x618 MSR_DRAM_POWER_LIMIT */ - /* 0x619 MSR_DRAM_ENERGY_STATUS */ -#define RAPL_DRAM_PERF_STATUS (1 << 4) - /* 0x61b MSR_DRAM_PERF_STATUS */ -#define RAPL_DRAM_POWER_INFO (1 << 5) - /* 0x61c MSR_DRAM_POWER_INFO */ - -#define RAPL_CORES_POWER_LIMIT (1 << 6) - /* 0x638 MSR_PP0_POWER_LIMIT */ -#define RAPL_CORE_POLICY (1 << 7) - /* 0x63a MSR_PP0_POLICY */ - -#define RAPL_GFX (1 << 8) - /* 0x640 MSR_PP1_POWER_LIMIT */ - /* 0x641 MSR_PP1_ENERGY_STATUS */ - /* 0x642 MSR_PP1_POLICY */ - -#define RAPL_CORES_ENERGY_STATUS (1 << 9) - /* 0x639 MSR_PP0_ENERGY_STATUS */ -#define RAPL_PER_CORE_ENERGY (1 << 10) - /* Indicates cores energy collection is per-core, - * not per-package. */ -#define RAPL_AMD_F17H (1 << 11) - /* 0xc0010299 MSR_RAPL_PWR_UNIT */ - /* 0xc001029a MSR_CORE_ENERGY_STAT */ - /* 0xc001029b MSR_PKG_ENERGY_STAT */ -#define RAPL_CORES (RAPL_CORES_ENERGY_STATUS | RAPL_CORES_POWER_LIMIT) #define TJMAX_DEFAULT 100 /* MSRs that are not yet in the kernel-provided header. 
*/ @@ -948,7 +946,7 @@ int idx_valid(int idx) case IDX_DRAM_ENERGY: return do_rapl & RAPL_DRAM; case IDX_PP0_ENERGY: - return do_rapl & RAPL_CORES_ENERGY_STATUS; + return do_rapl & RAPL_CORE_ENERGY_STATUS; case IDX_PP1_ENERGY: return do_rapl & RAPL_GFX; case IDX_PKG_PERF: @@ -2710,7 +2708,7 @@ retry: return -13; p->energy_pkg = msr; } - if (do_rapl & RAPL_CORES_ENERGY_STATUS) { + if (do_rapl & RAPL_CORE_ENERGY_STATUS) { if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr)) return -14; p->energy_cores = msr; @@ -4810,7 +4808,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) case INTEL_FAM6_HASWELL_G: /* HSW */ case INTEL_FAM6_BROADWELL: /* BDW */ case INTEL_FAM6_BROADWELL_G: /* BDW */ - do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_GFX | RAPL_PKG_POWER_INFO; + do_rapl = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO; if (rapl_joules) { BIC_PRESENT(BIC_Pkg_J); BIC_PRESENT(BIC_Cor_J); @@ -4830,9 +4828,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) BIC_PRESENT(BIC_PkgWatt); break; case INTEL_FAM6_ATOM_TREMONT: /* EHL */ - do_rapl = - RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS - | RAPL_GFX | RAPL_PKG_POWER_INFO; + do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX; if (rapl_joules) { BIC_PRESENT(BIC_Pkg_J); BIC_PRESENT(BIC_Cor_J); @@ -4846,7 +4842,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) } break; case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ - do_rapl = RAPL_PKG | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO; + do_rapl = RAPL_PKG_ALL; BIC_PRESENT(BIC_PKG__); if (rapl_joules) BIC_PRESENT(BIC_Pkg_J); @@ -4855,9 +4851,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) break; case INTEL_FAM6_SKYLAKE_L: /* SKL */ case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - do_rapl = - RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS - | RAPL_GFX | RAPL_PKG_POWER_INFO; + do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX; BIC_PRESENT(BIC_PKG__); BIC_PRESENT(BIC_RAM__); if (rapl_joules) { @@ -4878,9 +4872,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) case INTEL_FAM6_ICELAKE_X: /* ICX */ case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ case INTEL_FAM6_XEON_PHI_KNL: /* KNL */ - do_rapl = - RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | - RAPL_PKG_POWER_INFO; + do_rapl = RAPL_PKG_ALL | RAPL_DRAM_ALL; BIC_PRESENT(BIC_PKG__); BIC_PRESENT(BIC_RAM__); if (rapl_joules) { @@ -4893,9 +4885,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) break; case INTEL_FAM6_SANDYBRIDGE_X: case INTEL_FAM6_IVYBRIDGE_X: - do_rapl = - RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_PKG_PERF_STATUS | - RAPL_DRAM_PERF_STATUS | RAPL_PKG_POWER_INFO; + do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL; BIC_PRESENT(BIC_PKG__); BIC_PRESENT(BIC_RAM__); if (rapl_joules) { @@ -4910,7 +4900,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) break; case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ case INTEL_FAM6_ATOM_SILVERMONT_D: /* AVN */ - do_rapl = RAPL_PKG | RAPL_CORES; + do_rapl = RAPL_PKG | RAPL_CORE; if (rapl_joules) { BIC_PRESENT(BIC_Pkg_J); BIC_PRESENT(BIC_Cor_J); @@ -4920,9 +4910,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) } break; case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ - do_rapl = - RAPL_PKG | RAPL_DRAM | 
RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | - RAPL_PKG_POWER_INFO | RAPL_CORES_ENERGY_STATUS; + do_rapl = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS; BIC_PRESENT(BIC_PKG__); BIC_PRESENT(BIC_RAM__); if (rapl_joules) { @@ -5195,7 +5183,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) fprintf(outf, "cpu%d: MSR_PP0_POLICY: %lld\n", cpu, msr & 0xF); } - if (do_rapl & RAPL_CORES_POWER_LIMIT) { + if (do_rapl & RAPL_CORE_POWER_LIMIT) { if (get_msr(cpu, MSR_PP0_POWER_LIMIT, &msr)) return -9; fprintf(outf, "cpu%d: MSR_PP0_POWER_LIMIT: 0x%08llx (%slocked)\n", -- cgit From a98f886035d5f7e0ec66036dd6bf98b40e75b692 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 26 Aug 2023 14:57:12 +0800 Subject: tools/power/turbostat: Simplify the logic for RAPL enumeration The support for each RAPL domains, as well as the support for the perf status of each RAPL domains, can be detected by checking the availabilities of the corresponding RAPL MSRs. Change the code accordingly and remove the hardcoded logic for each model. Note that this also fixes the INTEL_FAM6_ATOM_TREMONT model, which has RAPL_PKG_PERF_STATUS and MSR_DRAM_PERF_STATUS but doesn't have BIC_PKG__ and BIC_RAM__ set. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 99 ++++++++--------------------------- 1 file changed, 22 insertions(+), 77 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 4829e8289feb..b2da36437b12 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4809,62 +4809,20 @@ void rapl_probe_intel(unsigned int family, unsigned int model) case INTEL_FAM6_BROADWELL: /* BDW */ case INTEL_FAM6_BROADWELL_G: /* BDW */ do_rapl = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO; - if (rapl_joules) { - BIC_PRESENT(BIC_Pkg_J); - BIC_PRESENT(BIC_Cor_J); - BIC_PRESENT(BIC_GFX_J); - } else { - BIC_PRESENT(BIC_PkgWatt); - BIC_PRESENT(BIC_CorWatt); - BIC_PRESENT(BIC_GFXWatt); - } break; case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: do_rapl = RAPL_PKG | RAPL_PKG_POWER_INFO; - if (rapl_joules) - BIC_PRESENT(BIC_Pkg_J); - else - BIC_PRESENT(BIC_PkgWatt); break; case INTEL_FAM6_ATOM_TREMONT: /* EHL */ do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX; - if (rapl_joules) { - BIC_PRESENT(BIC_Pkg_J); - BIC_PRESENT(BIC_Cor_J); - BIC_PRESENT(BIC_RAM_J); - BIC_PRESENT(BIC_GFX_J); - } else { - BIC_PRESENT(BIC_PkgWatt); - BIC_PRESENT(BIC_CorWatt); - BIC_PRESENT(BIC_RAMWatt); - BIC_PRESENT(BIC_GFXWatt); - } break; case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ do_rapl = RAPL_PKG_ALL; - BIC_PRESENT(BIC_PKG__); - if (rapl_joules) - BIC_PRESENT(BIC_Pkg_J); - else - BIC_PRESENT(BIC_PkgWatt); break; case INTEL_FAM6_SKYLAKE_L: /* SKL */ case INTEL_FAM6_CANNONLAKE_L: /* CNL */ do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX; - BIC_PRESENT(BIC_PKG__); - BIC_PRESENT(BIC_RAM__); - if (rapl_joules) { - BIC_PRESENT(BIC_Pkg_J); - BIC_PRESENT(BIC_Cor_J); - BIC_PRESENT(BIC_RAM_J); - BIC_PRESENT(BIC_GFX_J); - } else { - BIC_PRESENT(BIC_PkgWatt); - BIC_PRESENT(BIC_CorWatt); - BIC_PRESENT(BIC_RAMWatt); - BIC_PRESENT(BIC_GFXWatt); - } break; case INTEL_FAM6_HASWELL_X: /* HSX */ case INTEL_FAM6_BROADWELL_X: /* BDX */ @@ -4873,60 +4831,47 @@ void rapl_probe_intel(unsigned int family, unsigned int model) case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ case 
INTEL_FAM6_XEON_PHI_KNL: /* KNL */ do_rapl = RAPL_PKG_ALL | RAPL_DRAM_ALL; - BIC_PRESENT(BIC_PKG__); - BIC_PRESENT(BIC_RAM__); - if (rapl_joules) { - BIC_PRESENT(BIC_Pkg_J); - BIC_PRESENT(BIC_RAM_J); - } else { - BIC_PRESENT(BIC_PkgWatt); - BIC_PRESENT(BIC_RAMWatt); - } break; case INTEL_FAM6_SANDYBRIDGE_X: case INTEL_FAM6_IVYBRIDGE_X: do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL; - BIC_PRESENT(BIC_PKG__); - BIC_PRESENT(BIC_RAM__); - if (rapl_joules) { - BIC_PRESENT(BIC_Pkg_J); - BIC_PRESENT(BIC_Cor_J); - BIC_PRESENT(BIC_RAM_J); - } else { - BIC_PRESENT(BIC_PkgWatt); - BIC_PRESENT(BIC_CorWatt); - BIC_PRESENT(BIC_RAMWatt); - } break; case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ case INTEL_FAM6_ATOM_SILVERMONT_D: /* AVN */ do_rapl = RAPL_PKG | RAPL_CORE; - if (rapl_joules) { - BIC_PRESENT(BIC_Pkg_J); - BIC_PRESENT(BIC_Cor_J); - } else { - BIC_PRESENT(BIC_PkgWatt); - BIC_PRESENT(BIC_CorWatt); - } break; case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ do_rapl = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS; - BIC_PRESENT(BIC_PKG__); - BIC_PRESENT(BIC_RAM__); - if (rapl_joules) { + break; + default: + return; + } + + if (rapl_joules) { + if (do_rapl & RAPL_PKG_ENERGY_STATUS) BIC_PRESENT(BIC_Pkg_J); + if (do_rapl & RAPL_CORE_ENERGY_STATUS) BIC_PRESENT(BIC_Cor_J); + if (do_rapl & RAPL_DRAM_ENERGY_STATUS) BIC_PRESENT(BIC_RAM_J); - } else { + if (do_rapl & RAPL_GFX_ENERGY_STATUS) + BIC_PRESENT(BIC_GFX_J); + } else { + if (do_rapl & RAPL_PKG_ENERGY_STATUS) BIC_PRESENT(BIC_PkgWatt); + if (do_rapl & RAPL_CORE_ENERGY_STATUS) BIC_PRESENT(BIC_CorWatt); + if (do_rapl & RAPL_DRAM_ENERGY_STATUS) BIC_PRESENT(BIC_RAMWatt); - } - break; - default: - return; + if (do_rapl & RAPL_GFX_ENERGY_STATUS) + BIC_PRESENT(BIC_GFXWatt); } + if (do_rapl & RAPL_PKG_PERF_STATUS) + BIC_PRESENT(BIC_PKG__); + if (do_rapl & RAPL_DRAM_PERF_STATUS) + BIC_PRESENT(BIC_RAM__); + /* units on package 0, verify later other packages match */ if (get_msr(base_cpu, MSR_RAPL_POWER_UNIT, &msr)) return; -- cgit From 86ba263d9b72b7766a99059e9b3bd104d089b7fa Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 28 Aug 2023 15:09:40 +0800 Subject: tools/power/turbostat: Abstract RAPL MSRs support Abstract the support for RAPL MSRs. Delete CPU model checks in rapl_probe_intel(). 
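Combined with the previous patch, RAPL enumeration reduces to two steps: take the platform's RAPL mask from the feature table, then map mask bits to output columns according to the joules/watts preference. A standalone sketch of that mapping is below, printing column names instead of calling BIC_PRESENT(); the flag values follow the enum added earlier in the series, the rest is illustrative only.

#include <stdio.h>
#include <stdbool.h>

#define BIT(n)  (1U << (n))

/* Subset of the RAPL capability flags, named after the MSRs they stand for. */
enum rapl_msrs {
    RAPL_PKG_ENERGY_STATUS  = BIT(1),   /* 0x611 MSR_PKG_ENERGY_STATUS */
    RAPL_DRAM_ENERGY_STATUS = BIT(5),   /* 0x619 MSR_DRAM_ENERGY_STATUS */
    RAPL_CORE_ENERGY_STATUS = BIT(9),   /* 0x639 MSR_PP0_ENERGY_STATUS */
    RAPL_GFX_ENERGY_STATUS  = BIT(12),  /* 0x641 MSR_PP1_ENERGY_STATUS */
};

/*
 * Derive the columns to show from the capability mask alone: one generic
 * mapping instead of a per-model list of BIC_PRESENT() calls.
 */
static void enable_rapl_columns(int rapl_msrs, bool rapl_joules)
{
    if (rapl_msrs & RAPL_PKG_ENERGY_STATUS)
        printf("%s\n", rapl_joules ? "Pkg_J" : "PkgWatt");
    if (rapl_msrs & RAPL_CORE_ENERGY_STATUS)
        printf("%s\n", rapl_joules ? "Cor_J" : "CorWatt");
    if (rapl_msrs & RAPL_DRAM_ENERGY_STATUS)
        printf("%s\n", rapl_joules ? "RAM_J" : "RAMWatt");
    if (rapl_msrs & RAPL_GFX_ENERGY_STATUS)
        printf("%s\n", rapl_joules ? "GFX_J" : "GFXWatt");
}

int main(void)
{
    /* e.g. a server-style mask: package and DRAM energy only */
    enable_rapl_columns(RAPL_PKG_ENERGY_STATUS | RAPL_DRAM_ENERGY_STATUS, false);
    return 0;
}

The per-core AMD case (RAPL_PER_CORE_ENERGY) and the PKG_%/RAM_% perf-status columns follow the same pattern with additional flags.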
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 189 +++++++++++++++------------------- 1 file changed, 85 insertions(+), 104 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b2da36437b12..90e1abe96dd5 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -288,6 +288,7 @@ struct platform_features { bool has_cst_auto_convension; /* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ + int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ }; @@ -414,6 +415,7 @@ static const struct platform_features snb_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features snx_features = { @@ -423,6 +425,7 @@ static const struct platform_features snx_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, }; static const struct platform_features ivb_features = { @@ -433,6 +436,7 @@ static const struct platform_features ivb_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features ivx_features = { @@ -442,6 +446,7 @@ static const struct platform_features ivx_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE | TRL_LIMIT1, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, }; static const struct platform_features hsw_features = { @@ -453,6 +458,7 @@ static const struct platform_features hsw_features = { .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, + .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features hsx_features = { @@ -464,6 +470,7 @@ static const struct platform_features hsx_features = { .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, .plr_msrs = PLR_CORE | PLR_RING, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; static const struct platform_features hswl_features = { @@ -475,6 +482,7 @@ static const struct platform_features hswl_features = { .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, + .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features hswg_features = { @@ -486,6 +494,7 @@ static const struct platform_features hswg_features = { .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, + .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features bdw_features = { @@ -496,6 +505,7 @@ static const struct platform_features bdw_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features bdwg_features = { @@ -506,6 +516,7 @@ static const struct platform_features bdwg_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, 
.trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features bdx_features = { @@ -517,6 +528,7 @@ static const struct platform_features bdx_features = { .cst_limit = CST_LIMIT_HSW, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; static const struct platform_features skl_features = { @@ -529,6 +541,7 @@ static const struct platform_features skl_features = { .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, }; static const struct platform_features cnl_features = { @@ -540,6 +553,7 @@ static const struct platform_features cnl_features = { .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, }; static const struct platform_features skx_features = { @@ -551,6 +565,7 @@ static const struct platform_features skx_features = { .cst_limit = CST_LIMIT_SKX, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; static const struct platform_features icx_features = { @@ -561,6 +576,7 @@ static const struct platform_features icx_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_ICX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; static const struct platform_features spr_features = { @@ -571,6 +587,7 @@ static const struct platform_features spr_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SKX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; static const struct platform_features slv_features = { @@ -578,6 +595,7 @@ static const struct platform_features slv_features = { .bclk_freq = BCLK_SLV, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_ATOM, + .rapl_msrs = RAPL_PKG | RAPL_CORE, }; static const struct platform_features slvd_features = { @@ -586,6 +604,7 @@ static const struct platform_features slvd_features = { .bclk_freq = BCLK_SLV, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG | RAPL_CORE, }; static const struct platform_features amt_features = { @@ -602,6 +621,7 @@ static const struct platform_features gmt_features = { .crystal_freq = 19200000, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, + .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, }; static const struct platform_features gmtd_features = { @@ -611,6 +631,7 @@ static const struct platform_features gmtd_features = { .crystal_freq = 25000000, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, }; static const struct platform_features gmtp_features = { @@ -620,6 +641,7 @@ static const struct platform_features gmtp_features = { .crystal_freq = 19200000, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, }; static const struct platform_features tmt_features = { @@ -628,6 +650,7 @@ static const struct platform_features tmt_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, }; static const struct platform_features tmtd_features = { @@ -636,6 +659,7 @@ static const struct platform_features tmtd_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = 
CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, + .rapl_msrs = RAPL_PKG_ALL, }; static const struct platform_features knl_features = { @@ -645,6 +669,7 @@ static const struct platform_features knl_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_KNL, .trl_msrs = TRL_KNL, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; static const struct platform_features default_features = { @@ -653,6 +678,10 @@ static const struct platform_features default_features = { static const struct platform_features amd_features = { }; +static const struct platform_features amd_features_with_rapl = { + .rapl_msrs = RAPL_AMD_F17H, +}; + static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_NEHALEM, &nhm_features }, { INTEL_FAM6_NEHALEM_G, &nhm_features }, @@ -728,6 +757,17 @@ void probe_platform_features(unsigned int family, unsigned int model) if (authentic_amd || hygon_genuine) { platform = &amd_features; + + if (max_extended_level >= 0x80000007) { + unsigned int eax, ebx, ecx, edx; + + __cpuid(0x80000007, eax, ebx, ecx, edx); + /* RAPL (Fam 17h+) */ + if ((edx & (1 << 14)) && family >= 0x17) { + platform = &amd_features_with_rapl; + do_rapl = RAPL_PER_CORE_ENERGY; + } + } return; } @@ -882,7 +922,7 @@ off_t idx_to_offset(int idx) switch (idx) { case IDX_PKG_ENERGY: - if (do_rapl & RAPL_AMD_F17H) + if (platform->rapl_msrs & RAPL_AMD_F17H) offset = MSR_PKG_ENERGY_STAT; else offset = MSR_PKG_ENERGY_STATUS; @@ -942,17 +982,17 @@ int idx_valid(int idx) { switch (idx) { case IDX_PKG_ENERGY: - return do_rapl & (RAPL_PKG | RAPL_AMD_F17H); + return platform->rapl_msrs & (RAPL_PKG | RAPL_AMD_F17H); case IDX_DRAM_ENERGY: - return do_rapl & RAPL_DRAM; + return platform->rapl_msrs & RAPL_DRAM; case IDX_PP0_ENERGY: - return do_rapl & RAPL_CORE_ENERGY_STATUS; + return platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS; case IDX_PP1_ENERGY: - return do_rapl & RAPL_GFX; + return platform->rapl_msrs & RAPL_GFX; case IDX_PKG_PERF: - return do_rapl & RAPL_PKG_PERF_STATUS; + return platform->rapl_msrs & RAPL_PKG_PERF_STATUS; case IDX_DRAM_PERF: - return do_rapl & RAPL_DRAM_PERF_STATUS; + return platform->rapl_msrs & RAPL_DRAM_PERF_STATUS; default: return 0; } @@ -1330,10 +1370,10 @@ void print_header(char *delim) if (DO_BIC(BIC_CORE_THROT_CNT)) outp += sprintf(outp, "%sCoreThr", (printed++ ? delim : "")); - if (do_rapl && !rapl_joules) { + if (platform->rapl_msrs && !rapl_joules) { if (DO_BIC(BIC_CorWatt) && (do_rapl & RAPL_PER_CORE_ENERGY)) outp += sprintf(outp, "%sCorWatt", (printed++ ? delim : "")); - } else if (do_rapl && rapl_joules) { + } else if (platform->rapl_msrs && rapl_joules) { if (DO_BIC(BIC_Cor_J) && (do_rapl & RAPL_PER_CORE_ENERGY)) outp += sprintf(outp, "%sCor_J", (printed++ ? delim : "")); } @@ -1392,7 +1432,7 @@ void print_header(char *delim) if (DO_BIC(BIC_SYS_LPI)) outp += sprintf(outp, "%sSYS%%LPI", (printed++ ? delim : "")); - if (do_rapl && !rapl_joules) { + if (platform->rapl_msrs && !rapl_joules) { if (DO_BIC(BIC_PkgWatt)) outp += sprintf(outp, "%sPkgWatt", (printed++ ? delim : "")); if (DO_BIC(BIC_CorWatt) && !(do_rapl & RAPL_PER_CORE_ENERGY)) @@ -1405,7 +1445,7 @@ void print_header(char *delim) outp += sprintf(outp, "%sPKG_%%", (printed++ ? delim : "")); if (DO_BIC(BIC_RAM__)) outp += sprintf(outp, "%sRAM_%%", (printed++ ? delim : "")); - } else if (do_rapl && rapl_joules) { + } else if (platform->rapl_msrs && rapl_joules) { if (DO_BIC(BIC_Pkg_J)) outp += sprintf(outp, "%sPkg_J", (printed++ ? 
delim : "")); if (DO_BIC(BIC_Cor_J) && !(do_rapl & RAPL_PER_CORE_ENERGY)) @@ -2638,7 +2678,7 @@ retry: if (DO_BIC(BIC_CORE_THROT_CNT)) get_core_throt_cnt(cpu, &c->core_throt_cnt); - if (do_rapl & RAPL_AMD_F17H) { + if (platform->rapl_msrs & RAPL_AMD_F17H) { if (get_msr(cpu, MSR_CORE_ENERGY_STAT, &msr)) return -14; c->core_energy = msr & 0xFFFFFFFF; @@ -2703,37 +2743,37 @@ retry: if (DO_BIC(BIC_SYS_LPI)) p->sys_lpi = cpuidle_cur_sys_lpi_us; - if (do_rapl & RAPL_PKG) { + if (platform->rapl_msrs & RAPL_PKG) { if (get_msr_sum(cpu, MSR_PKG_ENERGY_STATUS, &msr)) return -13; p->energy_pkg = msr; } - if (do_rapl & RAPL_CORE_ENERGY_STATUS) { + if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) { if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr)) return -14; p->energy_cores = msr; } - if (do_rapl & RAPL_DRAM) { + if (platform->rapl_msrs & RAPL_DRAM) { if (get_msr_sum(cpu, MSR_DRAM_ENERGY_STATUS, &msr)) return -15; p->energy_dram = msr; } - if (do_rapl & RAPL_GFX) { + if (platform->rapl_msrs & RAPL_GFX) { if (get_msr_sum(cpu, MSR_PP1_ENERGY_STATUS, &msr)) return -16; p->energy_gfx = msr; } - if (do_rapl & RAPL_PKG_PERF_STATUS) { + if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) { if (get_msr_sum(cpu, MSR_PKG_PERF_STATUS, &msr)) return -16; p->rapl_pkg_perf_status = msr; } - if (do_rapl & RAPL_DRAM_PERF_STATUS) { + if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) { if (get_msr_sum(cpu, MSR_DRAM_PERF_STATUS, &msr)) return -16; p->rapl_dram_perf_status = msr; } - if (do_rapl & RAPL_AMD_F17H) { + if (platform->rapl_msrs & RAPL_AMD_F17H) { if (get_msr_sum(cpu, MSR_PKG_ENERGY_STAT, &msr)) return -13; p->energy_pkg = msr; @@ -4750,7 +4790,7 @@ double get_tdp_intel(unsigned int model) { unsigned long long msr; - if (do_rapl & RAPL_PKG_POWER_INFO) + if (platform->rapl_msrs & RAPL_PKG_POWER_INFO) if (!get_msr(base_cpu, MSR_PKG_POWER_INFO, &msr)) return ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units; @@ -4791,85 +4831,35 @@ static double rapl_dram_energy_units_probe(int model, double rapl_energy_units) } } -void rapl_probe_intel(unsigned int family, unsigned int model) +void rapl_probe_intel(unsigned int model) { unsigned long long msr; unsigned int time_unit; double tdp; - if (family != 6) - return; - - switch (model) { - case INTEL_FAM6_SANDYBRIDGE: - case INTEL_FAM6_IVYBRIDGE: - case INTEL_FAM6_HASWELL: /* HSW */ - case INTEL_FAM6_HASWELL_L: /* HSW */ - case INTEL_FAM6_HASWELL_G: /* HSW */ - case INTEL_FAM6_BROADWELL: /* BDW */ - case INTEL_FAM6_BROADWELL_G: /* BDW */ - do_rapl = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO; - break; - case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ - case INTEL_FAM6_ATOM_GOLDMONT_PLUS: - do_rapl = RAPL_PKG | RAPL_PKG_POWER_INFO; - break; - case INTEL_FAM6_ATOM_TREMONT: /* EHL */ - do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX; - break; - case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ - do_rapl = RAPL_PKG_ALL; - break; - case INTEL_FAM6_SKYLAKE_L: /* SKL */ - case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX; - break; - case INTEL_FAM6_HASWELL_X: /* HSX */ - case INTEL_FAM6_BROADWELL_X: /* BDX */ - case INTEL_FAM6_SKYLAKE_X: /* SKX */ - case INTEL_FAM6_ICELAKE_X: /* ICX */ - case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ - case INTEL_FAM6_XEON_PHI_KNL: /* KNL */ - do_rapl = RAPL_PKG_ALL | RAPL_DRAM_ALL; - break; - case INTEL_FAM6_SANDYBRIDGE_X: - case INTEL_FAM6_IVYBRIDGE_X: - do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL; - break; - case 
INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ - case INTEL_FAM6_ATOM_SILVERMONT_D: /* AVN */ - do_rapl = RAPL_PKG | RAPL_CORE; - break; - case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ - do_rapl = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS; - break; - default: - return; - } - if (rapl_joules) { - if (do_rapl & RAPL_PKG_ENERGY_STATUS) + if (platform->rapl_msrs & RAPL_PKG_ENERGY_STATUS) BIC_PRESENT(BIC_Pkg_J); - if (do_rapl & RAPL_CORE_ENERGY_STATUS) + if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) BIC_PRESENT(BIC_Cor_J); - if (do_rapl & RAPL_DRAM_ENERGY_STATUS) + if (platform->rapl_msrs & RAPL_DRAM_ENERGY_STATUS) BIC_PRESENT(BIC_RAM_J); - if (do_rapl & RAPL_GFX_ENERGY_STATUS) + if (platform->rapl_msrs & RAPL_GFX_ENERGY_STATUS) BIC_PRESENT(BIC_GFX_J); } else { - if (do_rapl & RAPL_PKG_ENERGY_STATUS) + if (platform->rapl_msrs & RAPL_PKG_ENERGY_STATUS) BIC_PRESENT(BIC_PkgWatt); - if (do_rapl & RAPL_CORE_ENERGY_STATUS) + if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) BIC_PRESENT(BIC_CorWatt); - if (do_rapl & RAPL_DRAM_ENERGY_STATUS) + if (platform->rapl_msrs & RAPL_DRAM_ENERGY_STATUS) BIC_PRESENT(BIC_RAMWatt); - if (do_rapl & RAPL_GFX_ENERGY_STATUS) + if (platform->rapl_msrs & RAPL_GFX_ENERGY_STATUS) BIC_PRESENT(BIC_GFXWatt); } - if (do_rapl & RAPL_PKG_PERF_STATUS) + if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) BIC_PRESENT(BIC_PKG__); - if (do_rapl & RAPL_DRAM_PERF_STATUS) + if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) BIC_PRESENT(BIC_RAM__); /* units on package 0, verify later other packages match */ @@ -4900,22 +4890,10 @@ void rapl_probe_intel(unsigned int family, unsigned int model) void rapl_probe_amd(unsigned int family, unsigned int model) { unsigned long long msr; - unsigned int eax, ebx, ecx, edx; - unsigned int has_rapl = 0; double tdp; UNUSED(model); - if (max_extended_level >= 0x80000007) { - __cpuid(0x80000007, eax, ebx, ecx, edx); - /* RAPL (Fam 17h+) */ - has_rapl = edx & (1 << 14); - } - - if (!has_rapl || family < 0x17) - return; - - do_rapl = RAPL_AMD_F17H | RAPL_PER_CORE_ENERGY; if (rapl_joules) { BIC_PRESENT(BIC_Pkg_J); BIC_PRESENT(BIC_Cor_J); @@ -4941,12 +4919,15 @@ void rapl_probe_amd(unsigned int family, unsigned int model) /* * rapl_probe() * - * sets do_rapl, rapl_power_units, rapl_energy_units, rapl_time_units + * sets rapl_power_units, rapl_energy_units, rapl_time_units */ void rapl_probe(unsigned int family, unsigned int model) { + if (!platform->rapl_msrs) + return; + if (genuine_intel) - rapl_probe_intel(family, model); + rapl_probe_intel(model); if (authentic_amd || hygon_genuine) rapl_probe_amd(family, model); } @@ -5040,7 +5021,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) UNUSED(c); UNUSED(p); - if (!do_rapl) + if (!platform->rapl_msrs) return 0; /* RAPL counters are per package, so print only for 1st thread/package */ @@ -5053,7 +5034,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -1; } - if (do_rapl & RAPL_AMD_F17H) { + if (platform->rapl_msrs & RAPL_AMD_F17H) { msr_name = "MSR_RAPL_PWR_UNIT"; if (get_msr(cpu, MSR_RAPL_PWR_UNIT, &msr)) return -1; @@ -5066,7 +5047,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) fprintf(outf, "cpu%d: %s: 0x%08llx (%f Watts, %f Joules, %f sec.)\n", cpu, msr_name, msr, rapl_power_units, rapl_energy_units, rapl_time_units); - if (do_rapl & RAPL_PKG_POWER_INFO) { + if (platform->rapl_msrs & RAPL_PKG_POWER_INFO) { if (get_msr(cpu, MSR_PKG_POWER_INFO, &msr)) return -5; @@ -5079,7 +5060,7 @@ int 
print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units); } - if (do_rapl & RAPL_PKG) { + if (platform->rapl_msrs & RAPL_PKG) { if (get_msr(cpu, MSR_PKG_POWER_LIMIT, &msr)) return -9; @@ -5103,7 +5084,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) cpu, ((msr >> 0) & 0x1FFF) * rapl_power_units, (msr >> 31) & 1 ? "" : "UN"); } - if (do_rapl & RAPL_DRAM_POWER_INFO) { + if (platform->rapl_msrs & RAPL_DRAM_POWER_INFO) { if (get_msr(cpu, MSR_DRAM_POWER_INFO, &msr)) return -6; @@ -5114,7 +5095,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) ((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units, ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units); } - if (do_rapl & RAPL_DRAM) { + if (platform->rapl_msrs & RAPL_DRAM) { if (get_msr(cpu, MSR_DRAM_POWER_LIMIT, &msr)) return -9; fprintf(outf, "cpu%d: MSR_DRAM_POWER_LIMIT: 0x%08llx (%slocked)\n", @@ -5122,20 +5103,20 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) print_power_limit_msr(cpu, msr, "DRAM Limit"); } - if (do_rapl & RAPL_CORE_POLICY) { + if (platform->rapl_msrs & RAPL_CORE_POLICY) { if (get_msr(cpu, MSR_PP0_POLICY, &msr)) return -7; fprintf(outf, "cpu%d: MSR_PP0_POLICY: %lld\n", cpu, msr & 0xF); } - if (do_rapl & RAPL_CORE_POWER_LIMIT) { + if (platform->rapl_msrs & RAPL_CORE_POWER_LIMIT) { if (get_msr(cpu, MSR_PP0_POWER_LIMIT, &msr)) return -9; fprintf(outf, "cpu%d: MSR_PP0_POWER_LIMIT: 0x%08llx (%slocked)\n", cpu, msr, (msr >> 31) & 1 ? "" : "UN"); print_power_limit_msr(cpu, msr, "Cores Limit"); } - if (do_rapl & RAPL_GFX) { + if (platform->rapl_msrs & RAPL_GFX) { if (get_msr(cpu, MSR_PP1_POLICY, &msr)) return -8; -- cgit From e338831b14d2da921348c8c4055b9f2c94effe73 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 21:37:46 +0800 Subject: tools/power/turbostat: Abstract Per Core RAPL support Abstract the support for Per Core RAPL. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 90e1abe96dd5..6314c40dc15b 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -245,7 +245,6 @@ double tsc_tweak = 1.0; unsigned int show_pkg_only; unsigned int show_core_only; char *output_buffer, *outp; -unsigned int do_rapl; unsigned int do_dts; unsigned int do_ptm; unsigned int do_ipc; @@ -289,6 +288,7 @@ struct platform_features { int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ + bool has_per_core_rapl; /* Indicates cores energy collection is per-core, not per-package. AMD specific for now */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ }; @@ -378,7 +378,6 @@ enum rapl_msrs { RAPL_AMD_PWR_UNIT = BIT(14), /* 0xc0010299 MSR_AMD_RAPL_POWER_UNIT */ RAPL_AMD_CORE_ENERGY_STAT = BIT(15), /* 0xc001029a MSR_AMD_CORE_ENERGY_STATUS */ RAPL_AMD_PKG_ENERGY_STAT = BIT(16), /* 0xc001029b MSR_AMD_PKG_ENERGY_STATUS */ - RAPL_PER_CORE_ENERGY = BIT(17), /* Indicates cores energy collection is per-core, not per-package. 
*/ }; #define RAPL_PKG (RAPL_PKG_ENERGY_STATUS | RAPL_PKG_POWER_LIMIT) @@ -680,6 +679,7 @@ static const struct platform_features amd_features = { static const struct platform_features amd_features_with_rapl = { .rapl_msrs = RAPL_AMD_F17H, + .has_per_core_rapl = 1, }; static const struct platform_data turbostat_pdata[] = { @@ -763,10 +763,8 @@ void probe_platform_features(unsigned int family, unsigned int model) __cpuid(0x80000007, eax, ebx, ecx, edx); /* RAPL (Fam 17h+) */ - if ((edx & (1 << 14)) && family >= 0x17) { + if ((edx & (1 << 14)) && family >= 0x17) platform = &amd_features_with_rapl; - do_rapl = RAPL_PER_CORE_ENERGY; - } } return; } @@ -1371,10 +1369,10 @@ void print_header(char *delim) outp += sprintf(outp, "%sCoreThr", (printed++ ? delim : "")); if (platform->rapl_msrs && !rapl_joules) { - if (DO_BIC(BIC_CorWatt) && (do_rapl & RAPL_PER_CORE_ENERGY)) + if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl) outp += sprintf(outp, "%sCorWatt", (printed++ ? delim : "")); } else if (platform->rapl_msrs && rapl_joules) { - if (DO_BIC(BIC_Cor_J) && (do_rapl & RAPL_PER_CORE_ENERGY)) + if (DO_BIC(BIC_Cor_J) && platform->has_per_core_rapl) outp += sprintf(outp, "%sCor_J", (printed++ ? delim : "")); } @@ -1435,7 +1433,7 @@ void print_header(char *delim) if (platform->rapl_msrs && !rapl_joules) { if (DO_BIC(BIC_PkgWatt)) outp += sprintf(outp, "%sPkgWatt", (printed++ ? delim : "")); - if (DO_BIC(BIC_CorWatt) && !(do_rapl & RAPL_PER_CORE_ENERGY)) + if (DO_BIC(BIC_CorWatt) && !platform->has_per_core_rapl) outp += sprintf(outp, "%sCorWatt", (printed++ ? delim : "")); if (DO_BIC(BIC_GFXWatt)) outp += sprintf(outp, "%sGFXWatt", (printed++ ? delim : "")); @@ -1448,7 +1446,7 @@ void print_header(char *delim) } else if (platform->rapl_msrs && rapl_joules) { if (DO_BIC(BIC_Pkg_J)) outp += sprintf(outp, "%sPkg_J", (printed++ ? delim : "")); - if (DO_BIC(BIC_Cor_J) && !(do_rapl & RAPL_PER_CORE_ENERGY)) + if (DO_BIC(BIC_Cor_J) && !platform->has_per_core_rapl) outp += sprintf(outp, "%sCor_J", (printed++ ? delim : "")); if (DO_BIC(BIC_GFX_J)) outp += sprintf(outp, "%sGFX_J", (printed++ ? delim : "")); @@ -1750,10 +1748,10 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data fmt8 = "%s%.2f"; - if (DO_BIC(BIC_CorWatt) && (do_rapl & RAPL_PER_CORE_ENERGY)) + if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units / interval_float); - if (DO_BIC(BIC_Cor_J) && (do_rapl & RAPL_PER_CORE_ENERGY)) + if (DO_BIC(BIC_Cor_J) && platform->has_per_core_rapl) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units); /* print per-package data only for 1st core in package */ @@ -1818,7 +1816,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units / interval_float); - if (DO_BIC(BIC_CorWatt) && !(do_rapl & RAPL_PER_CORE_ENERGY)) + if (DO_BIC(BIC_CorWatt) && !platform->has_per_core_rapl) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units / interval_float); if (DO_BIC(BIC_GFXWatt)) @@ -1830,7 +1828,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data p->energy_dram * rapl_dram_energy_units / interval_float); if (DO_BIC(BIC_Pkg_J)) outp += sprintf(outp, fmt8, (printed++ ? 
delim : ""), p->energy_pkg * rapl_energy_units); - if (DO_BIC(BIC_Cor_J) && !(do_rapl & RAPL_PER_CORE_ENERGY)) + if (DO_BIC(BIC_Cor_J) && !platform->has_per_core_rapl) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units); if (DO_BIC(BIC_GFX_J)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units); -- cgit From 6d35b8c4a661c849361239fe316035ea952606a3 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 22 Apr 2023 11:59:04 +0800 Subject: tools/power/turbostat: Abstract RAPL divisor support INTEL_FAM6_ATOM_SILVERMONT model needs a divisor to convert the raw Energy Units value from MSR_RAPL_POWER_UNIT. Abstract the support for RAPL divisor. Delete CPU model check in rapl_probe_intel(). Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 6314c40dc15b..a31724335671 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -289,6 +289,7 @@ struct platform_features { int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ bool has_per_core_rapl; /* Indicates cores energy collection is per-core, not per-package. AMD specific for now */ + bool has_rapl_divisor; /* Divisor for Energy unit raw value from MSR_RAPL_POWER_UNIT */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ }; @@ -595,6 +596,7 @@ static const struct platform_features slv_features = { .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, + .has_rapl_divisor = 1, }; static const struct platform_features slvd_features = { @@ -4865,7 +4867,7 @@ void rapl_probe_intel(unsigned int model) return; rapl_power_units = 1.0 / (1 << (msr & 0xF)); - if (model == INTEL_FAM6_ATOM_SILVERMONT) + if (platform->has_rapl_divisor) rapl_energy_units = 1.0 * (1 << (msr >> 8 & 0x1F)) / 1000000; else rapl_energy_units = 1.0 / (1 << (msr >> 8 & 0x1F)); -- cgit From 9e6f35159cdef148c711d2fb7d5fd2b2b6fb772d Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 22 Apr 2023 12:21:58 +0800 Subject: tools/power/turbostat: Abstract fixed DRAM Energy unit support Abstract the support for fixed Dram domain energy unit. Delete rapl_dram_energy_units_probe() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a31724335671..a26ae5a2e2bd 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -290,6 +290,7 @@ struct platform_features { int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ bool has_per_core_rapl; /* Indicates cores energy collection is per-core, not per-package. 
AMD specific for now */ bool has_rapl_divisor; /* Divisor for Energy unit raw value from MSR_RAPL_POWER_UNIT */ + bool has_fixed_rapl_unit; /* Fixed Energy Unit used for DRAM RAPL Domain */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ }; @@ -471,6 +472,7 @@ static const struct platform_features hsx_features = { .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, .plr_msrs = PLR_CORE | PLR_RING, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .has_fixed_rapl_unit = 1, }; static const struct platform_features hswl_features = { @@ -529,6 +531,7 @@ static const struct platform_features bdx_features = { .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .has_fixed_rapl_unit = 1, }; static const struct platform_features skl_features = { @@ -566,6 +569,7 @@ static const struct platform_features skx_features = { .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .has_fixed_rapl_unit = 1, }; static const struct platform_features icx_features = { @@ -577,6 +581,7 @@ static const struct platform_features icx_features = { .cst_limit = CST_LIMIT_ICX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .has_fixed_rapl_unit = 1, }; static const struct platform_features spr_features = { @@ -671,6 +676,7 @@ static const struct platform_features knl_features = { .cst_limit = CST_LIMIT_KNL, .trl_msrs = TRL_KNL, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .has_fixed_rapl_unit = 1, }; static const struct platform_features default_features = { @@ -4811,26 +4817,6 @@ double get_tdp_amd(unsigned int family) return 280.0; } -/* - * rapl_dram_energy_units_probe() - * Energy units are either hard-coded, or come from RAPL Energy Unit MSR. - */ -static double rapl_dram_energy_units_probe(int model, double rapl_energy_units) -{ - /* only called for genuine_intel, family 6 */ - - switch (model) { - case INTEL_FAM6_HASWELL_X: /* HSX */ - case INTEL_FAM6_BROADWELL_X: /* BDX */ - case INTEL_FAM6_SKYLAKE_X: /* SKX */ - case INTEL_FAM6_XEON_PHI_KNL: /* KNL */ - case INTEL_FAM6_ICELAKE_X: /* ICX */ - return (rapl_dram_energy_units = 15.3 / 1000000); - default: - return (rapl_energy_units); - } -} - void rapl_probe_intel(unsigned int model) { unsigned long long msr; @@ -4872,7 +4858,10 @@ void rapl_probe_intel(unsigned int model) else rapl_energy_units = 1.0 / (1 << (msr >> 8 & 0x1F)); - rapl_dram_energy_units = rapl_dram_energy_units_probe(model, rapl_energy_units); + if (platform->has_fixed_rapl_unit) + rapl_dram_energy_units = (15.3 / 1000000); + else + rapl_dram_energy_units = rapl_energy_units; time_unit = msr >> 16 & 0xF; if (time_unit == 0) -- cgit From 7c60409382a4be05d601e0b45db7b0166845b0cf Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 10:21:10 +0800 Subject: tools/power/turbostat: Abstract hardcoded TDP value Different hardcoded TDP values are used when TDP can not be retrieved from the hardware. Abstract hardcoded TDP value. Delete CPU model checks in get_tdp_intel(). 
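For illustration, a minimal standalone sketch of the fallback order this change introduces; the *_sketch names and the main() harness are hypothetical, only the 30 W / 135 W values and the quirk-before-default logic mirror the patch:

    #include <stdio.h>

    struct features_sketch {
        int rapl_quirk_tdp;    /* hardcoded TDP in Watts, 0 when no quirk is needed */
    };

    static const struct features_sketch slv_sketch = { .rapl_quirk_tdp = 30 };
    static const struct features_sketch generic_sketch = { 0 };

    /* per-platform quirk first, generic 135 W default last */
    static double get_quirk_tdp_sketch(const struct features_sketch *p)
    {
        if (p->rapl_quirk_tdp)
            return p->rapl_quirk_tdp;
        return 135.0;
    }

    int main(void)
    {
        printf("Silvermont-like platform: %.0f W\n", get_quirk_tdp_sketch(&slv_sketch));
        printf("platform without a quirk: %.0f W\n", get_quirk_tdp_sketch(&generic_sketch));
        return 0;
    }

In the real tool this path is only reached when MSR_PKG_POWER_INFO cannot supply the TDP, as get_tdp_intel() in the diff below shows.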
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a26ae5a2e2bd..45698c3a9e72 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -291,6 +291,7 @@ struct platform_features { bool has_per_core_rapl; /* Indicates cores energy collection is per-core, not per-package. AMD specific for now */ bool has_rapl_divisor; /* Divisor for Energy unit raw value from MSR_RAPL_POWER_UNIT */ bool has_fixed_rapl_unit; /* Fixed Energy Unit used for DRAM RAPL Domain */ + int rapl_quirk_tdp; /* Hardcoded TDP value when cannot be retrieved from hardware */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ }; @@ -602,6 +603,7 @@ static const struct platform_features slv_features = { .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, .has_rapl_divisor = 1, + .rapl_quirk_tdp = 30, }; static const struct platform_features slvd_features = { @@ -611,6 +613,7 @@ static const struct platform_features slvd_features = { .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE, + .rapl_quirk_tdp = 30, }; static const struct platform_features amt_features = { @@ -688,6 +691,7 @@ static const struct platform_features amd_features = { static const struct platform_features amd_features_with_rapl = { .rapl_msrs = RAPL_AMD_F17H, .has_per_core_rapl = 1, + .rapl_quirk_tdp = 280, /* This is the max stock TDP of HEDT/Server Fam17h+ chips */ }; static const struct platform_data turbostat_pdata[] = { @@ -4792,29 +4796,31 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data #define RAPL_POWER_GRANULARITY 0x7FFF /* 15 bit power granularity */ #define RAPL_TIME_GRANULARITY 0x3F /* 6 bit time granularity */ +double get_quirk_tdp(void) +{ + if (platform->rapl_quirk_tdp) + return platform->rapl_quirk_tdp; + + return 135.0; +} + double get_tdp_intel(unsigned int model) { unsigned long long msr; + UNUSED(model); + if (platform->rapl_msrs & RAPL_PKG_POWER_INFO) if (!get_msr(base_cpu, MSR_PKG_POWER_INFO, &msr)) return ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units; - - switch (model) { - case INTEL_FAM6_ATOM_SILVERMONT: - case INTEL_FAM6_ATOM_SILVERMONT_D: - return 30.0; - default: - return 135.0; - } + return get_quirk_tdp(); } double get_tdp_amd(unsigned int family) { UNUSED(family); - /* This is the max stock TDP of HEDT/Server Fam17h+ chips */ - return 280.0; + return get_quirk_tdp(); } void rapl_probe_intel(unsigned int model) -- cgit From bf1ad57c3f92c551dbfdcecd49797253f55cb7c1 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 10:28:04 +0800 Subject: tools/power/turbostat: Remove unused family/model parameters for RAPL functions RAPL probing can be done without family/model checking. Remove these parameters in rapl probe functions. 
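The reason no family/model input is needed is that rapl_probe_intel() derives every unit from MSR_RAPL_POWER_UNIT plus the platform table flags. A hedged, self-contained sketch of that decoding, using a made-up raw value purely for the arithmetic (the real probe also special-cases a zero time-unit field and the has_rapl_divisor quirk):

    #include <stdio.h>

    int main(void)
    {
        unsigned long long msr = 0x000A0E03ULL;    /* hypothetical MSR_RAPL_POWER_UNIT reading */

        double power_units  = 1.0 / (1 << (msr & 0xF));        /* 1/2^3  = 0.125 W        */
        double energy_units = 1.0 / (1 << (msr >> 8 & 0x1F));  /* 1/2^14 ~ 61 microjoules */
        double time_units   = 1.0 / (1 << (msr >> 16 & 0xF));  /* 1/2^10 ~ 977 microsec   */

        printf("power %g W, energy %g J, time %g s\n", power_units, energy_units, time_units);
        return 0;
    }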
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 45698c3a9e72..44d7321b004e 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4804,26 +4804,22 @@ double get_quirk_tdp(void) return 135.0; } -double get_tdp_intel(unsigned int model) +double get_tdp_intel(void) { unsigned long long msr; - UNUSED(model); - if (platform->rapl_msrs & RAPL_PKG_POWER_INFO) if (!get_msr(base_cpu, MSR_PKG_POWER_INFO, &msr)) return ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units; return get_quirk_tdp(); } -double get_tdp_amd(unsigned int family) +double get_tdp_amd(void) { - UNUSED(family); - return get_quirk_tdp(); } -void rapl_probe_intel(unsigned int model) +void rapl_probe_intel(void) { unsigned long long msr; unsigned int time_unit; @@ -4875,20 +4871,18 @@ void rapl_probe_intel(unsigned int model) rapl_time_units = 1.0 / (1 << (time_unit)); - tdp = get_tdp_intel(model); + tdp = get_tdp_intel(); rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp; if (!quiet) fprintf(outf, "RAPL: %.0f sec. Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp); } -void rapl_probe_amd(unsigned int family, unsigned int model) +void rapl_probe_amd(void) { unsigned long long msr; double tdp; - UNUSED(model); - if (rapl_joules) { BIC_PRESENT(BIC_Pkg_J); BIC_PRESENT(BIC_Cor_J); @@ -4904,7 +4898,7 @@ void rapl_probe_amd(unsigned int family, unsigned int model) rapl_energy_units = ldexp(1.0, -(msr >> 8 & 0x1f)); rapl_power_units = ldexp(1.0, -(msr & 0xf)); - tdp = get_tdp_amd(family); + tdp = get_tdp_amd(); rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp; if (!quiet) @@ -4916,15 +4910,15 @@ void rapl_probe_amd(unsigned int family, unsigned int model) * * sets rapl_power_units, rapl_energy_units, rapl_time_units */ -void rapl_probe(unsigned int family, unsigned int model) +void rapl_probe(void) { if (!platform->rapl_msrs) return; if (genuine_intel) - rapl_probe_intel(model); + rapl_probe_intel(); if (authentic_amd || hygon_genuine) - rapl_probe_amd(family, model); + rapl_probe_amd(); } void prewake_cstate_probe(unsigned int family, unsigned int model) @@ -5828,7 +5822,7 @@ void process_cpuid() if (!quiet && has_slv_msrs(family, model)) decode_c6_demotion_policy_msr(); - rapl_probe(family, model); + rapl_probe(); prewake_cstate_probe(family, model); if (!quiet) -- cgit From 485a017c45200ed82518f6cdeea554f77e9a0562 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 10:37:23 +0800 Subject: tools/power/turbostat: Abstract TSC tweak support On some models, the CPU base frequency is different from the TSC frequency, and the aperf/mperf counters are running at CPU base frequency instead of TSC frequency. Abstract support for TSC tweak. Given that tsc_tweak depends on base_hz, move the code to probe_bclk() after base_hz is available. 
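A minimal sketch of the ratio that probe_bclk() now computes; the two frequencies below are hypothetical, only the base_hz / tsc_hz formula comes from the patch:

    #include <stdio.h>

    int main(void)
    {
        double base_hz = 2100000000.0;    /* hypothetical 2.1 GHz CPU base frequency */
        double tsc_hz  = 2112000000.0;    /* hypothetical TSC frequency */

        /* on enable_tsc_tweak platforms aperf/mperf tick at base_hz, so
         * TSC-relative utilization is rescaled by this factor */
        double tsc_tweak = base_hz / tsc_hz;

        printf("tsc_tweak = %.6f\n", tsc_tweak);
        return 0;
    }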
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 44d7321b004e..05385fabc83a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -293,6 +293,7 @@ struct platform_features { bool has_fixed_rapl_unit; /* Fixed Energy Unit used for DRAM RAPL Domain */ int rapl_quirk_tdp; /* Hardcoded TDP value when cannot be retrieved from hardware */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ + bool enable_tsc_tweak; /* Use CPU Base freq instead of TSC freq for aperf/mperf counter */ }; struct platform_data { @@ -546,6 +547,7 @@ static const struct platform_features skl_features = { .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, + .enable_tsc_tweak = 1, }; static const struct platform_features cnl_features = { @@ -558,6 +560,7 @@ static const struct platform_features cnl_features = { .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, + .enable_tsc_tweak = 1, }; static const struct platform_features skx_features = { @@ -660,6 +663,7 @@ static const struct platform_features tmt_features = { .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, + .enable_tsc_tweak = 1, }; static const struct platform_features tmtd_features = { @@ -2934,11 +2938,6 @@ void probe_cst_limit(void) pkg_cstate_limit = pkg_cstate_limits[msr & 0xF]; } -static void calculate_tsc_tweak() -{ - tsc_tweak = base_hz / tsc_hz; -} - void prewake_cstate_probe(unsigned int family, unsigned int model); static void dump_platform_info(void) @@ -4198,6 +4197,9 @@ void probe_bclk(void) base_hz = base_ratio * bclk * 1000000; has_base_hz = 1; + + if (platform->enable_tsc_tweak) + tsc_tweak = base_hz / tsc_hz; } /* @@ -5836,9 +5838,6 @@ void process_cpuid() if (!quiet) dump_sysfs_pstate_config(); - if (has_skl_msrs(family, model) || is_ehl(family, model)) - calculate_tsc_tweak(); - if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) BIC_PRESENT(BIC_GFX_rc6); -- cgit From 3c6a17b8ae44b0116e303402803c173fe2a3da92 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 22:37:37 +0800 Subject: tools/power/turbostat: Add skeleton support for cstate enumeration Add skeleton support for cstate enumeration. Note that the previous logic may override the cstate setting for multiple times for different reasons. The conversion to new cstate enumeration must be done step by step following the previous code order strictly. 
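The new enumeration is a plain bitmask; a self-contained sketch of how a .supported_cstates entry is meant to be tested (the flag subset and the 'supported' value below are hypothetical, the BIT() encoding mirrors the enum added in the diff):

    #include <stdio.h>

    #define BIT(x) (1U << (x))

    enum cstates_sketch {
        CC1 = BIT(0),
        CC3 = BIT(1),
        CC6 = BIT(2),
        CC7 = BIT(3),
        PC2 = BIT(4),
    };

    int main(void)
    {
        unsigned int supported = CC1 | CC6 | CC7 | PC2;    /* hypothetical platform entry */

        printf("CC6: %s\n", (supported & CC6) ? "supported" : "not supported");
        printf("CC3: %s\n", (supported & CC3) ? "supported" : "not supported");
        return 0;
    }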
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 57 ++++++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 05385fabc83a..6a49eb941fe0 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -283,6 +283,7 @@ struct platform_features { bool has_config_tdp; /* MSR_CONFIG_TDP_NOMINAL/LEVEL_1/LEVEL_2/CONTROL, MSR_TURBO_ACTIVATION_RATIO */ int bclk_freq; /* CPU base clock */ int crystal_freq; /* Crystal clock to use when not available from CPUID.15 */ + int supported_cstates; /* Core cstates and Package cstates supported */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ bool has_cst_auto_convension; /* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ @@ -396,6 +397,21 @@ enum rapl_msrs { #define RAPL_AMD_F17H (RAPL_AMD_PWR_UNIT | RAPL_AMD_CORE_ENERGY_STAT | RAPL_AMD_PKG_ENERGY_STAT) +/* For Cstates */ +enum cstates { + CC1 = BIT(0), + CC3 = BIT(1), + CC6 = BIT(2), + CC7 = BIT(3), + PC2 = BIT(4), + PC3 = BIT(5), + PC6 = BIT(6), + PC7 = BIT(7), + PC8 = BIT(8), + PC9 = BIT(9), + PC10 = BIT(10), +}; + static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, @@ -5560,6 +5576,44 @@ void linux_perf_init(void) BIC_PRESENT(BIC_IPC); } +void probe_cstates(void) +{ + probe_cst_limit(); + + if (platform->supported_cstates & CC1) + BIC_PRESENT(BIC_CPU_c1); + + if (platform->supported_cstates & CC3) + BIC_PRESENT(BIC_CPU_c3); + + if (platform->supported_cstates & CC6) + BIC_PRESENT(BIC_CPU_c6); + + if (platform->supported_cstates & CC7) + BIC_PRESENT(BIC_CPU_c7); + + if (platform->supported_cstates & PC2 && (pkg_cstate_limit >= PCL__2)) + BIC_PRESENT(BIC_Pkgpc2); + + if (platform->supported_cstates & PC3 && (pkg_cstate_limit >= PCL__3)) + BIC_PRESENT(BIC_Pkgpc3); + + if (platform->supported_cstates & PC6 && (pkg_cstate_limit >= PCL__6)) + BIC_PRESENT(BIC_Pkgpc6); + + if (platform->supported_cstates & PC7 && (pkg_cstate_limit >= PCL__7)) + BIC_PRESENT(BIC_Pkgpc7); + + if (platform->supported_cstates & PC8 && (pkg_cstate_limit >= PCL__8)) + BIC_PRESENT(BIC_Pkgpc8); + + if (platform->supported_cstates & PC9 && (pkg_cstate_limit >= PCL__9)) + BIC_PRESENT(BIC_Pkgpc9); + + if (platform->supported_cstates & PC10 && (pkg_cstate_limit >= PCL_10)) + BIC_PRESENT(BIC_Pkgpc10); +} + void process_cpuid() { unsigned int eax, ebx, ecx, edx; @@ -5741,7 +5795,8 @@ void process_cpuid() BIC_PRESENT(BIC_IRQ); BIC_PRESENT(BIC_TSC_MHz); - probe_cst_limit(); + probe_cstates(); + if (platform->has_nhm_msrs) { BIC_PRESENT(BIC_CPU_c1); BIC_PRESENT(BIC_CPU_c3); -- cgit From ce7ddf8af2f96a8a7af4cc2273843518c5810166 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 8 Sep 2023 23:16:27 +0800 Subject: tools/power/turbostat: Adjust cstate for models with .has_nhm_msrs set Enable CC1/CC3/CC6 for platforms with .has_nhm_msrs set. 
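To make the step easier to review in isolation, a hypothetical sketch of the invariant it relies on: every toy entry with .has_nhm_msrs set also lists CC1 | CC3 | CC6, so the table-driven probe_cstates() selects the same core-cstate columns the removed has_nhm_msrs branch did:

    #include <assert.h>
    #include <stdio.h>

    #define BIT(x) (1U << (x))
    enum { CC1 = BIT(0), CC3 = BIT(1), CC6 = BIT(2) };

    struct feat_sketch {
        int has_nhm_msrs;
        unsigned int supported_cstates;
    };

    static const struct feat_sketch table_sketch[] = {
        { 1, CC1 | CC3 | CC6 },    /* nhm-like entry */
        { 1, CC1 | CC3 | CC6 },    /* slv-like entry */
        { 0, 0 },                  /* entry without Nehalem-class MSRs */
    };

    int main(void)
    {
        unsigned int i;

        for (i = 0; i < sizeof(table_sketch) / sizeof(table_sketch[0]); i++)
            if (table_sketch[i].has_nhm_msrs)
                assert((table_sketch[i].supported_cstates & (CC1 | CC3 | CC6)) == (CC1 | CC3 | CC6));

        printf("every has_nhm_msrs entry still advertises CC1/CC3/CC6\n");
        return 0;
    }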
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 6a49eb941fe0..c7345e0c5185 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -416,6 +416,7 @@ static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_NHM, .trl_msrs = TRL_BASE, }; @@ -424,6 +425,7 @@ static const struct platform_features nhx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_NHM, }; @@ -432,6 +434,7 @@ static const struct platform_features snb_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -442,6 +445,7 @@ static const struct platform_features snx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, @@ -453,6 +457,7 @@ static const struct platform_features ivb_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -463,6 +468,7 @@ static const struct platform_features ivx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE | TRL_LIMIT1, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, @@ -474,6 +480,7 @@ static const struct platform_features hsw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -486,6 +493,7 @@ static const struct platform_features hsx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, .plr_msrs = PLR_CORE | PLR_RING, @@ -499,6 +507,7 @@ static const struct platform_features hswl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -511,6 +520,7 @@ static const struct platform_features hswg_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -523,6 +533,7 @@ static const struct platform_features bdw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -534,6 +545,7 @@ static const struct platform_features 
bdwg_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -545,6 +557,7 @@ static const struct platform_features bdx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, @@ -559,6 +572,7 @@ static const struct platform_features skl_features = { .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 24000000, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -572,6 +586,7 @@ static const struct platform_features cnl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -585,6 +600,7 @@ static const struct platform_features skx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_SKX, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -598,6 +614,7 @@ static const struct platform_features icx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_ICX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -610,6 +627,7 @@ static const struct platform_features spr_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_SKX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -618,6 +636,7 @@ static const struct platform_features spr_features = { static const struct platform_features slv_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, @@ -629,6 +648,7 @@ static const struct platform_features slvd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE, @@ -638,6 +658,7 @@ static const struct platform_features slvd_features = { static const struct platform_features amt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_AMT, .trl_msrs = TRL_BASE, }; @@ -647,6 +668,7 @@ static const struct platform_features gmt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -657,6 +679,7 @@ static const struct platform_features gmtd_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 25000000, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, @@ -667,6 +690,7 @@ static const struct platform_features gmtp_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_GMT, 
.trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -676,6 +700,7 @@ static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -686,6 +711,7 @@ static const struct platform_features tmtd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL, @@ -696,6 +722,7 @@ static const struct platform_features knl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_KNL, .trl_msrs = TRL_KNL, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -5797,12 +5824,8 @@ void process_cpuid() probe_cstates(); - if (platform->has_nhm_msrs) { - BIC_PRESENT(BIC_CPU_c1); - BIC_PRESENT(BIC_CPU_c3); - BIC_PRESENT(BIC_CPU_c6); + if (platform->has_nhm_msrs) BIC_PRESENT(BIC_SMI); - } probe_bclk(); do_snb_cstates = has_snb_msrs(family, model); -- cgit From 942c854d8d0f6c6fc0864a9da5f5e374a8e146e5 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 8 Sep 2023 23:16:56 +0800 Subject: tools/power/turbostat: Adjust cstate for has_snb_msrs() models Enable CC7 and PC2 for has_snb_msrs() models. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 47 ++++++++++++++++------------------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c7345e0c5185..174a8d0750da 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -434,7 +434,7 @@ static const struct platform_features snb_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -445,7 +445,7 @@ static const struct platform_features snx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, @@ -457,7 +457,7 @@ static const struct platform_features ivb_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -468,7 +468,7 @@ static const struct platform_features ivx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE | TRL_LIMIT1, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, @@ -480,7 +480,7 @@ static const struct platform_features hsw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, 
.cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -493,7 +493,7 @@ static const struct platform_features hsx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, .plr_msrs = PLR_CORE | PLR_RING, @@ -507,7 +507,7 @@ static const struct platform_features hswl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -520,7 +520,7 @@ static const struct platform_features hswg_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -533,7 +533,7 @@ static const struct platform_features bdw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -545,7 +545,7 @@ static const struct platform_features bdwg_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -557,7 +557,7 @@ static const struct platform_features bdx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_HSW, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, @@ -572,7 +572,7 @@ static const struct platform_features skl_features = { .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 24000000, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -586,7 +586,7 @@ static const struct platform_features cnl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -600,7 +600,7 @@ static const struct platform_features skx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_SKX, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -614,7 +614,7 @@ static const struct platform_features icx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_ICX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -627,7 +627,7 @@ static const struct platform_features spr_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - 
.supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_SKX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -668,7 +668,7 @@ static const struct platform_features gmt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -679,7 +679,7 @@ static const struct platform_features gmtd_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 25000000, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, @@ -690,7 +690,7 @@ static const struct platform_features gmtp_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -700,7 +700,7 @@ static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -711,7 +711,7 @@ static const struct platform_features tmtd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL, @@ -5829,12 +5829,7 @@ void process_cpuid() probe_bclk(); do_snb_cstates = has_snb_msrs(family, model); - if (do_snb_cstates) - BIC_PRESENT(BIC_CPU_c7); - do_irtl_snb = has_snb_msrs(family, model); - if (do_snb_cstates && (pkg_cstate_limit >= PCL__2)) - BIC_PRESENT(BIC_Pkgpc2); if (pkg_cstate_limit >= PCL__3) BIC_PRESENT(BIC_Pkgpc3); if (pkg_cstate_limit >= PCL__6) -- cgit From 6f1935c036f79b56b6a1dc6e51c8c6fe483983ec Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 8 Sep 2023 23:17:23 +0800 Subject: tools/power/turbostat: Adjust cstate for models with .cst_limit set Enable PC3/PC6 for platforms with .cst_limit set because package cstates are guarded by pkg_cstate_limit. 
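A hedged sketch of the guard the sentence above describes: a package cstate column is shown only when the platform lists the state in .supported_cstates and the package cstate limit is deep enough. The limit values below are simple stand-ins for turbostat's PCL_* indices:

    #include <stdio.h>

    #define BIT(x) (1U << (x))
    enum { PC3 = BIT(5), PC6 = BIT(6) };

    /* stand-ins for the PCL__3/PCL__6 ordering used by turbostat */
    enum { LIMIT_PC3 = 3, LIMIT_PC6 = 6 };

    static int show_pkg_cstate(unsigned int supported, unsigned int flag,
                               int pkg_cstate_limit, int needed_limit)
    {
        return (supported & flag) && (pkg_cstate_limit >= needed_limit);
    }

    int main(void)
    {
        unsigned int supported = PC3 | PC6;    /* hypothetical platform entry */
        int pkg_cstate_limit = LIMIT_PC3;      /* firmware caps the package at PC3 */

        printf("Pkg%%pc3 column: %s\n",
               show_pkg_cstate(supported, PC3, pkg_cstate_limit, LIMIT_PC3) ? "shown" : "hidden");
        printf("Pkg%%pc6 column: %s\n",
               show_pkg_cstate(supported, PC6, pkg_cstate_limit, LIMIT_PC6) ? "shown" : "hidden");
        return 0;
    }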
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 58 ++++++++++++++++------------------- 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 174a8d0750da..2bfbf4ccf5ac 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -416,7 +416,7 @@ static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_NHM, .trl_msrs = TRL_BASE, }; @@ -425,7 +425,7 @@ static const struct platform_features nhx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_NHM, }; @@ -434,7 +434,7 @@ static const struct platform_features snb_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -445,7 +445,7 @@ static const struct platform_features snx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, @@ -457,7 +457,7 @@ static const struct platform_features ivb_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -468,7 +468,7 @@ static const struct platform_features ivx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE | TRL_LIMIT1, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, @@ -480,7 +480,7 @@ static const struct platform_features hsw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -493,7 +493,7 @@ static const struct platform_features hsx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, .plr_msrs = PLR_CORE | PLR_RING, @@ -507,7 +507,7 @@ static const struct platform_features hswl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, 
@@ -520,7 +520,7 @@ static const struct platform_features hswg_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -533,7 +533,7 @@ static const struct platform_features bdw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -545,7 +545,7 @@ static const struct platform_features bdwg_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -557,7 +557,7 @@ static const struct platform_features bdx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, @@ -572,7 +572,7 @@ static const struct platform_features skl_features = { .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 24000000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -586,7 +586,7 @@ static const struct platform_features cnl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -600,7 +600,7 @@ static const struct platform_features skx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_SKX, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -614,7 +614,7 @@ static const struct platform_features icx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_ICX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -627,7 +627,7 @@ static const struct platform_features spr_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_SKX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -636,7 +636,7 @@ static const struct platform_features spr_features = { static const struct platform_features slv_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, @@ 
-648,7 +648,7 @@ static const struct platform_features slvd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE, @@ -658,7 +658,7 @@ static const struct platform_features slvd_features = { static const struct platform_features amt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_AMT, .trl_msrs = TRL_BASE, }; @@ -668,7 +668,7 @@ static const struct platform_features gmt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -679,7 +679,7 @@ static const struct platform_features gmtd_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 25000000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, @@ -690,7 +690,7 @@ static const struct platform_features gmtp_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -700,7 +700,7 @@ static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -711,7 +711,7 @@ static const struct platform_features tmtd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL, @@ -722,7 +722,7 @@ static const struct platform_features knl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_KNL, .trl_msrs = TRL_KNL, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -5830,10 +5830,6 @@ void process_cpuid() do_snb_cstates = has_snb_msrs(family, model); do_irtl_snb = has_snb_msrs(family, model); - if (pkg_cstate_limit >= PCL__3) - BIC_PRESENT(BIC_Pkgpc3); - if (pkg_cstate_limit >= PCL__6) - BIC_PRESENT(BIC_Pkgpc6); if (do_snb_cstates && (pkg_cstate_limit >= PCL__7)) BIC_PRESENT(BIC_Pkgpc7); if (has_slv_msrs(family, model)) { -- cgit From 192cbf0468ae31062526287e257f5b56214d2da5 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 8 Sep 2023 23:17:58 +0800 Subject: tools/power/turbostat: Adjust cstate for has_snb_msrs() models Enable PC7 for has_snb_msrs() models. 
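The hunks above fold the old pkg_cstate_limit-based BIC_PRESENT() calls into per-platform supported_cstates bits, and the patch below does the same for PC7. A minimal sketch of the idea, reusing turbostat.c's existing BIC_PRESENT() macros and global platform pointer; neither the bit values nor the probe function below are taken from these patches, they only illustrate how such a bitmask is meant to be consumed:

	/* Assumed bit layout: one flag per core/package C-state, OR-ed into supported_cstates. */
	#define CC1	(1 << 0)
	#define CC3	(1 << 1)
	#define CC6	(1 << 2)
	#define CC7	(1 << 3)
	#define PC2	(1 << 4)
	#define PC3	(1 << 5)
	#define PC6	(1 << 6)
	#define PC7	(1 << 7)

	/* Hypothetical consumer: declare a column only if the platform table says the state exists. */
	static void probe_pkg_cstate_columns(void)
	{
		if (platform->supported_cstates & PC3)
			BIC_PRESENT(BIC_Pkgpc3);
		if (platform->supported_cstates & PC6)
			BIC_PRESENT(BIC_Pkgpc6);
		if (platform->supported_cstates & PC7)
			BIC_PRESENT(BIC_Pkgpc7);
	}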
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 46 ++++++++++++++++------------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 2bfbf4ccf5ac..f3d44e81d7d5 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -221,7 +221,6 @@ unsigned int rapl_joules; unsigned int summary_only; unsigned int list_header_only; unsigned int dump_only; -unsigned int do_snb_cstates; unsigned int do_knl_cstates; unsigned int do_slm_cstates; unsigned int use_c1_residency_msr; @@ -434,7 +433,7 @@ static const struct platform_features snb_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -445,7 +444,7 @@ static const struct platform_features snx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, @@ -457,7 +456,7 @@ static const struct platform_features ivb_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -468,7 +467,7 @@ static const struct platform_features ivx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE | TRL_LIMIT1, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, @@ -480,7 +479,7 @@ static const struct platform_features hsw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -493,7 +492,7 @@ static const struct platform_features hsx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, .plr_msrs = PLR_CORE | PLR_RING, @@ -507,7 +506,7 @@ static const struct platform_features hswl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -520,7 +519,7 @@ static const struct platform_features hswg_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 
| PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -533,7 +532,7 @@ static const struct platform_features bdw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -545,7 +544,7 @@ static const struct platform_features bdwg_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -557,7 +556,7 @@ static const struct platform_features bdx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, @@ -572,7 +571,7 @@ static const struct platform_features skl_features = { .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 24000000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -586,7 +585,7 @@ static const struct platform_features cnl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -600,7 +599,7 @@ static const struct platform_features skx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SKX, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -614,7 +613,7 @@ static const struct platform_features icx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_ICX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -627,7 +626,7 @@ static const struct platform_features spr_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SKX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -668,7 +667,7 @@ static const struct platform_features gmt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG | 
RAPL_PKG_POWER_INFO, @@ -679,7 +678,7 @@ static const struct platform_features gmtd_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 25000000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, @@ -690,7 +689,7 @@ static const struct platform_features gmtp_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -700,7 +699,7 @@ static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -711,7 +710,7 @@ static const struct platform_features tmtd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL, @@ -5827,11 +5826,8 @@ void process_cpuid() if (platform->has_nhm_msrs) BIC_PRESENT(BIC_SMI); probe_bclk(); - do_snb_cstates = has_snb_msrs(family, model); do_irtl_snb = has_snb_msrs(family, model); - if (do_snb_cstates && (pkg_cstate_limit >= PCL__7)) - BIC_PRESENT(BIC_Pkgpc7); if (has_slv_msrs(family, model)) { BIC_NOT_PRESENT(BIC_Pkgpc2); BIC_NOT_PRESENT(BIC_Pkgpc3); -- cgit From ff206149551f09117f44883650a45ae692745703 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:07:08 +0800 Subject: tools/power/turbostat: Adjust cstate for has_slv_msrs() models Disable PC2/PC3/PC7 and enable PC6 for has_slv_msrs() models. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f3d44e81d7d5..972f5a9b14e6 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -635,7 +635,7 @@ static const struct platform_features spr_features = { static const struct platform_features slv_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, - .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | PC6, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, @@ -5829,10 +5829,6 @@ void process_cpuid() do_irtl_snb = has_snb_msrs(family, model); if (has_slv_msrs(family, model)) { - BIC_NOT_PRESENT(BIC_Pkgpc2); - BIC_NOT_PRESENT(BIC_Pkgpc3); - BIC_PRESENT(BIC_Pkgpc6); - BIC_NOT_PRESENT(BIC_Pkgpc7); BIC_PRESENT(BIC_Mod_c6); use_c1_residency_msr = 1; } -- cgit From 3d982ac0dafeab860b9d42f5cc41a78753275fdd Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:09:20 +0800 Subject: tools/power/turbostat: Adjust cstate for is_jvl() models Disable CC3/CC7/PC2/PC3/PC6/PC7 for is_jvl() models. Delete is_jvl() CPU model check. 
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 972f5a9b14e6..e95972edde3c 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -710,7 +710,7 @@ static const struct platform_features tmtd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL, @@ -4361,21 +4361,6 @@ int is_ehl(unsigned int family, unsigned int model) return 0; } -int is_jvl(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_ATOM_TREMONT_D: - return 1; - } - return 0; -} - static void remove_underbar(char *s) { char *to = s; @@ -5832,14 +5817,6 @@ void process_cpuid() BIC_PRESENT(BIC_Mod_c6); use_c1_residency_msr = 1; } - if (is_jvl(family, model)) { - BIC_NOT_PRESENT(BIC_CPU_c3); - BIC_NOT_PRESENT(BIC_CPU_c7); - BIC_NOT_PRESENT(BIC_Pkgpc2); - BIC_NOT_PRESENT(BIC_Pkgpc3); - BIC_NOT_PRESENT(BIC_Pkgpc6); - BIC_NOT_PRESENT(BIC_Pkgpc7); - } if (is_dnv(family, model)) { BIC_PRESENT(BIC_CPU_c1); BIC_NOT_PRESENT(BIC_CPU_c3); -- cgit From 8e20ced057423f0edaf40b650facc221e8030b33 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:12:29 +0800 Subject: tools/power/turbostat: Adjust cstate for is_dnv() models Enable CC1 and disable CC3/CC7/PC3/PC7 for is_dnv() models. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e95972edde3c..069704ef3c80 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -678,7 +678,7 @@ static const struct platform_features gmtd_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 25000000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, @@ -5818,11 +5818,6 @@ void process_cpuid() use_c1_residency_msr = 1; } if (is_dnv(family, model)) { - BIC_PRESENT(BIC_CPU_c1); - BIC_NOT_PRESENT(BIC_CPU_c3); - BIC_NOT_PRESENT(BIC_Pkgpc3); - BIC_NOT_PRESENT(BIC_CPU_c7); - BIC_NOT_PRESENT(BIC_Pkgpc7); use_c1_residency_msr = 1; } if (is_skx(family, model) || is_icx(family, model) || is_spr(family, model)) { -- cgit From 24d16bec379db6eea2d72e18c97c6c80e486a5e1 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 8 Sep 2023 23:18:27 +0800 Subject: tools/power/turbostat: Adjust cstate for is_skx()/is_icx()/is_spr() models Disable CC3/CC7/PC3/PC7 for is_skx()/is_icx()/is_spr() models. Delete is_skx() CPU model check. 
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 069704ef3c80..262af40fe35d 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -599,7 +599,7 @@ static const struct platform_features skx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_SKX, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -613,7 +613,7 @@ static const struct platform_features icx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_ICX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -626,7 +626,7 @@ static const struct platform_features spr_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_SKX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -4298,22 +4298,6 @@ int is_bdx(unsigned int family, unsigned int model) return 0; } -int is_skx(unsigned int family, unsigned int model) -{ - - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_SKYLAKE_X: - return 1; - } - return 0; -} - int is_icx(unsigned int family, unsigned int model) { @@ -5820,12 +5804,6 @@ void process_cpuid() if (is_dnv(family, model)) { use_c1_residency_msr = 1; } - if (is_skx(family, model) || is_icx(family, model) || is_spr(family, model)) { - BIC_NOT_PRESENT(BIC_CPU_c3); - BIC_NOT_PRESENT(BIC_Pkgpc3); - BIC_NOT_PRESENT(BIC_CPU_c7); - BIC_NOT_PRESENT(BIC_Pkgpc7); - } if (is_bdx(family, model)) { BIC_NOT_PRESENT(BIC_CPU_c7); BIC_NOT_PRESENT(BIC_Pkgpc7); -- cgit From 1109694817fb4fcdfccb1a86fd58e69fb60f4eab Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:16:52 +0800 Subject: tools/power/turbostat: Adjust cstate for is_bdx() models Disable CC7/PC7 for is_bdx() models. Delete is_bdx() CPU model check. 
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 262af40fe35d..27ca29f0545a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -556,7 +556,7 @@ static const struct platform_features bdx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC3 | CC6 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, @@ -4282,22 +4282,6 @@ int is_dnv(unsigned int family, unsigned int model) return 0; } -int is_bdx(unsigned int family, unsigned int model) -{ - - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_BROADWELL_X: - return 1; - } - return 0; -} - int is_icx(unsigned int family, unsigned int model) { @@ -5804,10 +5788,6 @@ void process_cpuid() if (is_dnv(family, model)) { use_c1_residency_msr = 1; } - if (is_bdx(family, model)) { - BIC_NOT_PRESENT(BIC_CPU_c7); - BIC_NOT_PRESENT(BIC_Pkgpc7); - } if (has_c8910_msrs(family, model)) { if (pkg_cstate_limit >= PCL__8) BIC_PRESENT(BIC_Pkgpc8); -- cgit From 4d2c95d40a90877ffd8f961055419f1f550a7ed9 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:20:02 +0800 Subject: tools/power/turbostat: Adjust cstate for has_c8910_msrs() models Enable PC8/PC9/PC10 for has_c8910_msrs() models. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 27ca29f0545a..b0bc973c077d 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -506,7 +506,7 @@ static const struct platform_features hswl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -532,7 +532,7 @@ static const struct platform_features bdw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -571,7 +571,7 @@ static const struct platform_features skl_features = { .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 24000000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -585,7 +585,7 @@ static const struct platform_features cnl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, .trl_msrs = 
TRL_BASE, .tcc_offset_bits = 6, @@ -667,7 +667,7 @@ static const struct platform_features gmt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -689,7 +689,7 @@ static const struct platform_features gmtp_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -699,7 +699,7 @@ static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -5788,14 +5788,6 @@ void process_cpuid() if (is_dnv(family, model)) { use_c1_residency_msr = 1; } - if (has_c8910_msrs(family, model)) { - if (pkg_cstate_limit >= PCL__8) - BIC_PRESENT(BIC_Pkgpc8); - if (pkg_cstate_limit >= PCL__9) - BIC_PRESENT(BIC_Pkgpc9); - if (pkg_cstate_limit >= PCL_10) - BIC_PRESENT(BIC_Pkgpc10); - } do_irtl_hsw = has_c8910_msrs(family, model); if (has_skl_msrs(family, model)) { BIC_PRESENT(BIC_Totl_c0); -- cgit From cd7a2b6a61100a0fe8a40916d1afbd72d8833d57 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:24:19 +0800 Subject: tools/power/turbostat: Adjust cstate for is_slm()/is_knl()/is_cnl()/is_ehl() models Disable CC3 for is_slm()/is_knl()/is_cnl()/is_ehl() models. Delete is_cnl()/is_ehl() CPU model checks. 
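The shape of this whole group of patches is visible in the hunk that follows: a check keyed off a list of CPU models becomes a check keyed off the platform description. Roughly, where the "after" form is only a sketch (the centralized consumer is not part of these diffs):

	/* before: per-model special case, removed in the hunk below */
	if (do_slm_cstates || do_knl_cstates || is_cnl(family, model) || is_ehl(family, model))
		BIC_NOT_PRESENT(BIC_CPU_c3);

	/* after (sketch): one declarative test against the platform table */
	if (!(platform->supported_cstates & CC3))
		BIC_NOT_PRESENT(BIC_CPU_c3);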
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 44 ++++------------------------------- 1 file changed, 5 insertions(+), 39 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b0bc973c077d..24f470883ca2 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -585,7 +585,7 @@ static const struct platform_features cnl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, + .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -635,7 +635,7 @@ static const struct platform_features spr_features = { static const struct platform_features slv_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, - .supported_cstates = CC1 | CC3 | CC6 | PC6, + .supported_cstates = CC1 | CC6 | PC6, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, @@ -647,7 +647,7 @@ static const struct platform_features slvd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, - .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, + .supported_cstates = CC1 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE, @@ -699,7 +699,7 @@ static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, + .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -721,7 +721,7 @@ static const struct platform_features knl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, + .supported_cstates = CC1 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_KNL, .trl_msrs = TRL_KNL, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -4314,21 +4314,6 @@ int is_spr(unsigned int family, unsigned int model) return 0; } -int is_ehl(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_ATOM_TREMONT: - return 1; - } - return 0; -} - static void remove_underbar(char *s) { char *to = s; @@ -5248,22 +5233,6 @@ int is_knl(unsigned int family, unsigned int model) return 0; } -int is_cnl(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - return 1; - } - - return 0; -} - unsigned int get_aperf_mperf_multiplier(unsigned int family, unsigned int model) { if (is_knl(family, model)) @@ -5798,9 +5767,6 @@ void process_cpuid() do_slm_cstates = is_slm(family, model); do_knl_cstates = is_knl(family, model); - if (do_slm_cstates || do_knl_cstates || is_cnl(family, model) || is_ehl(family, model)) - BIC_NOT_PRESENT(BIC_CPU_c3); - if (!quiet) decode_misc_pwr_mgmt_msr(); -- cgit From 8c382f9e74663072805565036026d4e79de96425 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:27:26 +0800 Subject: tools/power/turbostat: Use fine grained IRTL output It is pointless to dump the IRTL register for a package 
cstate that is not supported by the platform. Print IRTL only for states that are available in platform->supported_cstates. Delete has_c8910_msrs() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 99 +++++++++++++---------------------- 1 file changed, 36 insertions(+), 63 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 24f470883ca2..680373010b01 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -229,7 +229,6 @@ unsigned int has_epb; unsigned int has_turbo; unsigned int is_hybrid; unsigned int do_irtl_snb; -unsigned int do_irtl_hsw; unsigned int units = 1000000; /* MHz etc */ unsigned int genuine_intel; unsigned int authentic_amd; @@ -3269,39 +3268,47 @@ void print_irtl(void) { unsigned long long msr; - get_msr(base_cpu, MSR_PKGC3_IRTL, &msr); - fprintf(outf, "cpu%d: MSR_PKGC3_IRTL: 0x%08llx (", base_cpu, msr); - fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", - (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); - - get_msr(base_cpu, MSR_PKGC6_IRTL, &msr); - fprintf(outf, "cpu%d: MSR_PKGC6_IRTL: 0x%08llx (", base_cpu, msr); - fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", - (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); - - get_msr(base_cpu, MSR_PKGC7_IRTL, &msr); - fprintf(outf, "cpu%d: MSR_PKGC7_IRTL: 0x%08llx (", base_cpu, msr); - fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", - (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + if (platform->supported_cstates & PC3) { + get_msr(base_cpu, MSR_PKGC3_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC3_IRTL: 0x%08llx (", base_cpu, msr); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", + (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + } - if (!do_irtl_hsw) - return; + if (platform->supported_cstates & PC6) { + get_msr(base_cpu, MSR_PKGC6_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC6_IRTL: 0x%08llx (", base_cpu, msr); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", + (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + } - get_msr(base_cpu, MSR_PKGC8_IRTL, &msr); - fprintf(outf, "cpu%d: MSR_PKGC8_IRTL: 0x%08llx (", base_cpu, msr); - fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", - (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + if (platform->supported_cstates & PC7) { + get_msr(base_cpu, MSR_PKGC7_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC7_IRTL: 0x%08llx (", base_cpu, msr); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", + (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + } - get_msr(base_cpu, MSR_PKGC9_IRTL, &msr); - fprintf(outf, "cpu%d: MSR_PKGC9_IRTL: 0x%08llx (", base_cpu, msr); - fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", - (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + if (platform->supported_cstates & PC8) { + get_msr(base_cpu, MSR_PKGC8_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC8_IRTL: 0x%08llx (", base_cpu, msr); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", + (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + } - get_msr(base_cpu, MSR_PKGC10_IRTL, &msr); - fprintf(outf, "cpu%d: MSR_PKGC10_IRTL: 0x%08llx (", base_cpu, msr); - fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? 
"" : "NOT", - (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + if (platform->supported_cstates & PC9) { + get_msr(base_cpu, MSR_PKGC9_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC9_IRTL: 0x%08llx (", base_cpu, msr); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", + (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + } + if (platform->supported_cstates & PC10) { + get_msr(base_cpu, MSR_PKGC10_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC10_IRTL: 0x%08llx (", base_cpu, msr); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", + (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + } } void free_fd_percpu(void) @@ -5145,39 +5152,6 @@ int has_snb_msrs(unsigned int family, unsigned int model) return 0; } -/* - * HSW ULT added support for C8/C9/C10 MSRs: - * - * MSR_PKG_C8_RESIDENCY 0x00000630 - * MSR_PKG_C9_RESIDENCY 0x00000631 - * MSR_PKG_C10_RESIDENCY 0x00000632 - * - * MSR_PKGC8_IRTL 0x00000633 - * MSR_PKGC9_IRTL 0x00000634 - * MSR_PKGC10_IRTL 0x00000635 - * - */ -int has_c8910_msrs(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_HASWELL_L: /* HSW */ - case INTEL_FAM6_BROADWELL: /* BDW */ - case INTEL_FAM6_SKYLAKE_L: /* SKL */ - case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ - case INTEL_FAM6_ATOM_GOLDMONT_PLUS: - case INTEL_FAM6_ATOM_TREMONT: /* EHL */ - return 1; - } - return 0; -} - /* * SKL adds support for additional MSRS: * @@ -5757,7 +5731,6 @@ void process_cpuid() if (is_dnv(family, model)) { use_c1_residency_msr = 1; } - do_irtl_hsw = has_c8910_msrs(family, model); if (has_skl_msrs(family, model)) { BIC_PRESENT(BIC_Totl_c0); BIC_PRESENT(BIC_Any_c0); -- cgit From 148df4fd04a98fb24198ecb4419c87e07d38af30 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 8 Sep 2023 23:18:51 +0800 Subject: tools/power/turbostat: Abstract IRTL support Abstract the support for MSR_PKGC3/PKGC6/PKGC7/PKGC8/PKGC9/PKGC10_IRTL. Delete has_snb_msrs() CPU model check. 
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 72 +++++++++++++---------------------- 1 file changed, 26 insertions(+), 46 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 680373010b01..44be06b763b2 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -228,7 +228,6 @@ unsigned int has_aperf; unsigned int has_epb; unsigned int has_turbo; unsigned int is_hybrid; -unsigned int do_irtl_snb; unsigned int units = 1000000; /* MHz etc */ unsigned int genuine_intel; unsigned int authentic_amd; @@ -284,6 +283,7 @@ struct platform_features { int supported_cstates; /* Core cstates and Package cstates supported */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ bool has_cst_auto_convension; /* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */ + bool has_irtl_msrs; /* MSR_PKGC3/PKGC6/PKGC7/PKGC8/PKGC9/PKGC10_IRTL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ @@ -434,6 +434,7 @@ static const struct platform_features snb_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SNB, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; @@ -445,6 +446,7 @@ static const struct platform_features snx_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SNB, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, }; @@ -457,6 +459,7 @@ static const struct platform_features ivb_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SNB, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; @@ -468,6 +471,7 @@ static const struct platform_features ivx_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SNB, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE | TRL_LIMIT1, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, }; @@ -480,6 +484,7 @@ static const struct platform_features hsw_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -493,6 +498,7 @@ static const struct platform_features hsx_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, .plr_msrs = PLR_CORE | PLR_RING, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -507,6 +513,7 @@ static const struct platform_features hswl_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -520,6 +527,7 @@ static const struct 
platform_features hswg_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -533,6 +541,7 @@ static const struct platform_features bdw_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; @@ -545,6 +554,7 @@ static const struct platform_features bdwg_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; @@ -557,6 +567,7 @@ static const struct platform_features bdx_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -572,6 +583,7 @@ static const struct platform_features skl_features = { .crystal_freq = 24000000, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -586,6 +598,7 @@ static const struct platform_features cnl_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -600,6 +613,7 @@ static const struct platform_features skx_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_SKX, + .has_irtl_msrs = 1, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -614,6 +628,7 @@ static const struct platform_features icx_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_ICX, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, .has_fixed_rapl_unit = 1, @@ -627,6 +642,7 @@ static const struct platform_features spr_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_SKX, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; @@ -668,6 +684,7 @@ static const struct platform_features gmt_features = { .crystal_freq = 19200000, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_GMT, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, }; @@ -679,6 +696,7 @@ static const struct platform_features gmtd_features = { .crystal_freq = 25000000, .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_GMT, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, }; @@ -690,6 +708,7 @@ static 
const struct platform_features gmtp_features = { .crystal_freq = 19200000, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_GMT, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, }; @@ -700,6 +719,7 @@ static const struct platform_features tmt_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_GMT, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, .enable_tsc_tweak = 1, @@ -711,6 +731,7 @@ static const struct platform_features tmtd_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6, .cst_limit = CST_LIMIT_GMT, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL, }; @@ -3268,6 +3289,9 @@ void print_irtl(void) { unsigned long long msr; + if (!platform->has_irtl_msrs) + return; + if (platform->supported_cstates & PC3) { get_msr(base_cpu, MSR_PKGC3_IRTL, &msr); fprintf(outf, "cpu%d: MSR_PKGC3_IRTL: 0x%08llx (", base_cpu, msr); @@ -5109,49 +5133,6 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; } -/* - * SNB adds support for additional MSRs: - * - * MSR_PKG_C7_RESIDENCY 0x000003fa - * MSR_CORE_C7_RESIDENCY 0x000003fe - * MSR_PKG_C2_RESIDENCY 0x0000060d - */ - -int has_snb_msrs(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_SANDYBRIDGE: - case INTEL_FAM6_SANDYBRIDGE_X: - case INTEL_FAM6_IVYBRIDGE: /* IVB */ - case INTEL_FAM6_IVYBRIDGE_X: /* IVB Xeon */ - case INTEL_FAM6_HASWELL: /* HSW */ - case INTEL_FAM6_HASWELL_X: /* HSW */ - case INTEL_FAM6_HASWELL_L: /* HSW */ - case INTEL_FAM6_HASWELL_G: /* HSW */ - case INTEL_FAM6_BROADWELL: /* BDW */ - case INTEL_FAM6_BROADWELL_G: /* BDW */ - case INTEL_FAM6_BROADWELL_X: /* BDX */ - case INTEL_FAM6_SKYLAKE_L: /* SKL */ - case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - case INTEL_FAM6_SKYLAKE_X: /* SKX */ - case INTEL_FAM6_ICELAKE_X: /* ICX */ - case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ - case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ - case INTEL_FAM6_ATOM_GOLDMONT_PLUS: - case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ - case INTEL_FAM6_ATOM_TREMONT: /* EHL */ - case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ - return 1; - } - return 0; -} - /* * SKL adds support for additional MSRS: * @@ -5723,7 +5704,6 @@ void process_cpuid() BIC_PRESENT(BIC_SMI); probe_bclk(); - do_irtl_snb = has_snb_msrs(family, model); if (has_slv_msrs(family, model)) { BIC_PRESENT(BIC_Mod_c6); use_c1_residency_msr = 1; @@ -6098,7 +6078,7 @@ void turbostat_init() if (!quiet) for_all_cpus(print_thermal, ODD_COUNTERS); - if (!quiet && do_irtl_snb) + if (!quiet) print_irtl(); if (DO_BIC(BIC_IPC)) -- cgit From 76d83d2ae8e3099d9a6bd67fba918108824d7d4d Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:41:30 +0800 Subject: tools/power/turbostat: Abstract MSR_CORE_C1_RES support Abstract the support for MSR_CORE_C1_RES. Delete is_dnv() CPU model check. 
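When no dedicated MSR_CORE_C1_RES exists, turbostat derives C1 from other counters: whatever part of the TSC interval is not covered by non-halted cycles (MPERF) or by the deeper core C-states is attributed to C1. A simplified, hedged sketch of that fallback; the real delta_thread() additionally applies the tsc_tweak scaling, and the function below is illustrative only:

	/* Sketch: estimate C1 residency for one measurement interval from counter deltas. */
	static unsigned long long derive_c1(unsigned long long tsc_delta, unsigned long long mperf_delta,
					    unsigned long long c3_delta, unsigned long long c6_delta,
					    unsigned long long c7_delta)
	{
		unsigned long long accounted = mperf_delta + c3_delta + c6_delta + c7_delta;

		/* Counters are not snapshotted atomically, so clamp instead of underflowing. */
		if (accounted > tsc_delta)
			return 0;

		return tsc_delta - accounted;
	}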
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 30 ++++++------------------------ 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 44be06b763b2..de9260c96678 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -223,7 +223,6 @@ unsigned int list_header_only; unsigned int dump_only; unsigned int do_knl_cstates; unsigned int do_slm_cstates; -unsigned int use_c1_residency_msr; unsigned int has_aperf; unsigned int has_epb; unsigned int has_turbo; @@ -284,6 +283,7 @@ struct platform_features { int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ bool has_cst_auto_convension; /* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */ bool has_irtl_msrs; /* MSR_PKGC3/PKGC6/PKGC7/PKGC8/PKGC9/PKGC10_IRTL */ + bool has_msr_core_c1_res; /* MSR_CORE_C1_RES */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ @@ -652,6 +652,7 @@ static const struct platform_features slv_features = { .bclk_freq = BCLK_SLV, .supported_cstates = CC1 | CC6 | PC6, .cst_limit = CST_LIMIT_SLV, + .has_msr_core_c1_res = 1, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, .has_rapl_divisor = 1, @@ -697,6 +698,7 @@ static const struct platform_features gmtd_features = { .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_GMT, .has_irtl_msrs = 1, + .has_msr_core_c1_res = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, }; @@ -2069,7 +2071,7 @@ void delta_core(struct core_data *new, struct core_data *old) int soft_c1_residency_display(int bic) { - if (!DO_BIC(BIC_CPU_c1) || use_c1_residency_msr) + if (!DO_BIC(BIC_CPU_c1) || platform->has_msr_core_c1_res) return 0; return DO_BIC_READ(bic); @@ -2118,7 +2120,7 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d } } - if (use_c1_residency_msr) { + if (platform->has_msr_core_c1_res) { /* * Some models have a dedicated C1 residency MSR, * which should be more accurate than the derivation below. @@ -2700,7 +2702,7 @@ retry: return -5; t->smi_count = msr & 0xFFFFFFFF; } - if (DO_BIC(BIC_CPU_c1) && use_c1_residency_msr) { + if (DO_BIC(BIC_CPU_c1) && platform->has_msr_core_c1_res) { if (get_msr(cpu, MSR_CORE_C1_RES, &t->c1)) return -6; } @@ -4297,22 +4299,6 @@ int has_slv_msrs(unsigned int family, unsigned int model) return 0; } -int is_dnv(unsigned int family, unsigned int model) -{ - - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_ATOM_GOLDMONT_D: - return 1; - } - return 0; -} - int is_icx(unsigned int family, unsigned int model) { @@ -5706,10 +5692,6 @@ void process_cpuid() if (has_slv_msrs(family, model)) { BIC_PRESENT(BIC_Mod_c6); - use_c1_residency_msr = 1; - } - if (is_dnv(family, model)) { - use_c1_residency_msr = 1; } if (has_skl_msrs(family, model)) { BIC_PRESENT(BIC_Totl_c0); -- cgit From 9cc1c1038526a5b6c9a57397def80ba79c260ff2 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:47:30 +0800 Subject: tools/power/turbostat: Abstract MSR_MODULE_C6_RES_MS support Abstract MSR_MODULE_C6_RES_MS support. 
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index de9260c96678..5f90d96bd9e3 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -284,6 +284,7 @@ struct platform_features { bool has_cst_auto_convension; /* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */ bool has_irtl_msrs; /* MSR_PKGC3/PKGC6/PKGC7/PKGC8/PKGC9/PKGC10_IRTL */ bool has_msr_core_c1_res; /* MSR_CORE_C1_RES */ + bool has_msr_module_c6_res_ms; /* MSR_MODULE_C6_RES_MS */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ @@ -653,6 +654,7 @@ static const struct platform_features slv_features = { .supported_cstates = CC1 | CC6 | PC6, .cst_limit = CST_LIMIT_SLV, .has_msr_core_c1_res = 1, + .has_msr_module_c6_res_ms = 1, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, .has_rapl_divisor = 1, @@ -5690,9 +5692,9 @@ void process_cpuid() BIC_PRESENT(BIC_SMI); probe_bclk(); - if (has_slv_msrs(family, model)) { + if (platform->has_msr_module_c6_res_ms) BIC_PRESENT(BIC_Mod_c6); - } + if (has_skl_msrs(family, model)) { BIC_PRESENT(BIC_Totl_c0); BIC_PRESENT(BIC_Any_c0); -- cgit From 6c36882e09dbc9a44d64180ba7972838d3f45488 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:52:24 +0800 Subject: tools/power/turbostat: Abstract MSR_CC6/MC6_DEMOTION_POLICY_CONFIG support Abstract the support for MSR_CC6/MC6_DEMOTION_POLICY_CONFIG. Delete has_slv_msrs() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 5f90d96bd9e3..f8d7ba87f968 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -285,6 +285,7 @@ struct platform_features { bool has_irtl_msrs; /* MSR_PKGC3/PKGC6/PKGC7/PKGC8/PKGC9/PKGC10_IRTL */ bool has_msr_core_c1_res; /* MSR_CORE_C1_RES */ bool has_msr_module_c6_res_ms; /* MSR_MODULE_C6_RES_MS */ + bool has_msr_c6_demotion_policy_config; /* MSR_CC6_DEMOTION_POLICY_CONFIG/MSR_MC6_DEMOTION_POLICY_CONFIG */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ @@ -655,6 +656,7 @@ static const struct platform_features slv_features = { .cst_limit = CST_LIMIT_SLV, .has_msr_core_c1_res = 1, .has_msr_module_c6_res_ms = 1, + .has_msr_c6_demotion_policy_config = 1, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, .has_rapl_divisor = 1, @@ -4279,28 +4281,6 @@ void probe_bclk(void) tsc_tweak = base_hz / tsc_hz; } -/* - * SLV client has support for unique MSRs: - * - * MSR_CC6_DEMOTION_POLICY_CONFIG - * MSR_MC6_DEMOTION_POLICY_CONFIG - */ - -int has_slv_msrs(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_ATOM_SILVERMONT: - return 1; - } - return 0; -} - int is_icx(unsigned int family, unsigned int model) { @@ -5358,6 +5338,9 @@ void decode_c6_demotion_policy_msr(void) { unsigned long long msr; + if 
(!platform->has_msr_c6_demotion_policy_config) + return; + if (!get_msr(base_cpu, MSR_CC6_DEMOTION_POLICY_CONFIG, &msr)) fprintf(outf, "cpu%d: MSR_CC6_DEMOTION_POLICY_CONFIG: 0x%08llx (%sable-CC6-Demotion)\n", base_cpu, msr, msr & (1 << 0) ? "EN" : "DIS"); @@ -5707,7 +5690,7 @@ void process_cpuid() if (!quiet) decode_misc_pwr_mgmt_msr(); - if (!quiet && has_slv_msrs(family, model)) + if (!quiet) decode_c6_demotion_policy_msr(); rapl_probe(); -- cgit From c8202a6c3acf7bbde42b5e389eec40fd8e1b8358 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 15:26:14 +0800 Subject: tools/power/turbostat: Abstract MSR_ATOM_PKG_C6_RESIDENCY support Abstract the support for MSR_ATOM_PKG_C6_RESIDENCY. Delete is_slm() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f8d7ba87f968..a04861846d33 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -222,7 +222,6 @@ unsigned int summary_only; unsigned int list_header_only; unsigned int dump_only; unsigned int do_knl_cstates; -unsigned int do_slm_cstates; unsigned int has_aperf; unsigned int has_epb; unsigned int has_turbo; @@ -286,6 +285,7 @@ struct platform_features { bool has_msr_core_c1_res; /* MSR_CORE_C1_RES */ bool has_msr_module_c6_res_ms; /* MSR_MODULE_C6_RES_MS */ bool has_msr_c6_demotion_policy_config; /* MSR_CC6_DEMOTION_POLICY_CONFIG/MSR_MC6_DEMOTION_POLICY_CONFIG */ + bool has_msr_atom_pkg_c6_residency; /* MSR_ATOM_PKG_C6_RESIDENCY */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ @@ -657,6 +657,7 @@ static const struct platform_features slv_features = { .has_msr_core_c1_res = 1, .has_msr_module_c6_res_ms = 1, .has_msr_c6_demotion_policy_config = 1, + .has_msr_atom_pkg_c6_residency = 1, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, .has_rapl_divisor = 1, @@ -669,6 +670,7 @@ static const struct platform_features slvd_features = { .bclk_freq = BCLK_SLV, .supported_cstates = CC1 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_SLV, + .has_msr_atom_pkg_c6_residency = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE, .rapl_quirk_tdp = 30, @@ -2795,7 +2797,7 @@ retry: if (get_msr(cpu, MSR_PKG_C3_RESIDENCY, &p->pc3)) return -9; if (DO_BIC(BIC_Pkgpc6)) { - if (do_slm_cstates) { + if (platform->has_msr_atom_pkg_c6_residency) { if (get_msr(cpu, MSR_ATOM_PKG_C6_RESIDENCY, &p->pc6)) return -10; } else { @@ -5125,22 +5127,6 @@ int has_skl_msrs(unsigned int family, unsigned int model) return 0; } -int is_slm(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ - case INTEL_FAM6_ATOM_SILVERMONT_D: /* AVN */ - return 1; - } - return 0; -} - int is_knl(unsigned int family, unsigned int model) { if (!genuine_intel) @@ -5684,7 +5670,6 @@ void process_cpuid() BIC_PRESENT(BIC_GFX_c0); BIC_PRESENT(BIC_CPUGFX); } - do_slm_cstates = is_slm(family, model); do_knl_cstates = is_knl(family, model); if (!quiet) -- cgit From 80d132cb45f2cc171395bfaacd74567a183ab160 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 15:00:58 +0800 Subject: tools/power/turbostat: Abstract 
MSR_KNL_CORE_C6_RESIDENCY support Abstract the support for MSR_KNL_CORE_C6_RESIDENCY. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a04861846d33..f0a99e092fa7 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -221,7 +221,6 @@ unsigned int rapl_joules; unsigned int summary_only; unsigned int list_header_only; unsigned int dump_only; -unsigned int do_knl_cstates; unsigned int has_aperf; unsigned int has_epb; unsigned int has_turbo; @@ -286,6 +285,7 @@ struct platform_features { bool has_msr_module_c6_res_ms; /* MSR_MODULE_C6_RES_MS */ bool has_msr_c6_demotion_policy_config; /* MSR_CC6_DEMOTION_POLICY_CONFIG/MSR_MC6_DEMOTION_POLICY_CONFIG */ bool has_msr_atom_pkg_c6_residency; /* MSR_ATOM_PKG_C6_RESIDENCY */ + bool has_msr_knl_core_c6_residency; /* MSR_KNL_CORE_C6_RESIDENCY */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ @@ -751,6 +751,7 @@ static const struct platform_features knl_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_KNL, + .has_msr_knl_core_c6_residency = 1, .trl_msrs = TRL_KNL, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, .has_fixed_rapl_unit = 1, @@ -2727,10 +2728,10 @@ retry: return -6; } - if ((DO_BIC(BIC_CPU_c6) || soft_c1_residency_display(BIC_CPU_c6)) && !do_knl_cstates) { + if ((DO_BIC(BIC_CPU_c6) || soft_c1_residency_display(BIC_CPU_c6)) && !platform->has_msr_knl_core_c6_residency) { if (get_msr(cpu, MSR_CORE_C6_RESIDENCY, &c->c6)) return -7; - } else if (do_knl_cstates && soft_c1_residency_display(BIC_CPU_c6)) { + } else if (platform->has_msr_knl_core_c6_residency && soft_c1_residency_display(BIC_CPU_c6)) { if (get_msr(cpu, MSR_KNL_CORE_C6_RESIDENCY, &c->c6)) return -7; } @@ -5670,7 +5671,6 @@ void process_cpuid() BIC_PRESENT(BIC_GFX_c0); BIC_PRESENT(BIC_CPUGFX); } - do_knl_cstates = is_knl(family, model); if (!quiet) decode_misc_pwr_mgmt_msr(); -- cgit From 58ddb691d8d8a281535406766da4e23e0f011126 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 15:07:43 +0800 Subject: tools/power/turbostat: Abstract extended cstate MSRs support Abstract the support for MSR_PKG_WEIGHTED_CORE_C0_RES, MSR_PKG_ANY_CORE_C0_RES, MSR_PKG_ANY_GFXE_C0_RES and MSR_PKG_BOTH_CORE_GFXE_C0_RES. Delete has_skl_msrs() CPU model check. 
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f0a99e092fa7..613b284f2f09 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -286,6 +286,7 @@ struct platform_features { bool has_msr_c6_demotion_policy_config; /* MSR_CC6_DEMOTION_POLICY_CONFIG/MSR_MC6_DEMOTION_POLICY_CONFIG */ bool has_msr_atom_pkg_c6_residency; /* MSR_ATOM_PKG_C6_RESIDENCY */ bool has_msr_knl_core_c6_residency; /* MSR_KNL_CORE_C6_RESIDENCY */ + bool has_ext_cst_msrs; /* MSR_PKG_WEIGHTED_CORE_C0_RES/MSR_PKG_ANY_CORE_C0_RES/MSR_PKG_ANY_GFXE_C0_RES/MSR_PKG_BOTH_CORE_GFXE_C0_RES */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ @@ -586,6 +587,7 @@ static const struct platform_features skl_features = { .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, .has_irtl_msrs = 1, + .has_ext_cst_msrs = 1, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -601,6 +603,7 @@ static const struct platform_features cnl_features = { .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, .has_irtl_msrs = 1, + .has_ext_cst_msrs = 1, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -5104,30 +5107,6 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; } -/* - * SKL adds support for additional MSRS: - * - * MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658 - * MSR_PKG_ANY_CORE_C0_RES 0x00000659 - * MSR_PKG_ANY_GFXE_C0_RES 0x0000065A - * MSR_PKG_BOTH_CORE_GFXE_C0_RES 0x0000065B - */ -int has_skl_msrs(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_SKYLAKE_L: /* SKL */ - case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - return 1; - } - return 0; -} - int is_knl(unsigned int family, unsigned int model) { if (!genuine_intel) @@ -5665,7 +5644,7 @@ void process_cpuid() if (platform->has_msr_module_c6_res_ms) BIC_PRESENT(BIC_Mod_c6); - if (has_skl_msrs(family, model)) { + if (platform->has_ext_cst_msrs) { BIC_PRESENT(BIC_Totl_c0); BIC_PRESENT(BIC_Any_c0); BIC_PRESENT(BIC_GFX_c0); -- cgit From ed43247b15a4fb4df01c3408fcab6e206f93ab87 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 15:49:10 +0800 Subject: tools/power/turbostat: Abstract aperf/mperf multiplier support Abstract aperf/mperf multiplier support. Delete is_knl() CPU model check. 
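As a self-contained illustration of what this buys: the 1024 multiplier that the deleted is_knl() helper used to select becomes a single data-driven ternary. The values below are the ones from the patch; the surrounding program is just a harness:

    #include <stdbool.h>
    #include <stdio.h>

    struct platform_features {
        bool need_perf_multiplier;  /* mperf/aperf multiplier */
    };

    static const struct platform_features knl_features = { .need_perf_multiplier = 1 };
    static const struct platform_features default_features = { 0 };

    int main(void)
    {
        const struct platform_features *platform = &knl_features;
        unsigned int aperf_mperf_multiplier;

        /* was: aperf_mperf_multiplier = get_aperf_mperf_multiplier(family, model) */
        aperf_mperf_multiplier = platform->need_perf_multiplier ? 1024 : 1;
        printf("KNL-style platform: multiplier = %u\n", aperf_mperf_multiplier);

        platform = &default_features;
        aperf_mperf_multiplier = platform->need_perf_multiplier ? 1024 : 1;
        printf("other platforms:    multiplier = %u\n", aperf_mperf_multiplier);
        return 0;
    }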
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 613b284f2f09..65a507d82fc4 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -296,6 +296,7 @@ struct platform_features { int rapl_quirk_tdp; /* Hardcoded TDP value when cannot be retrieved from hardware */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ bool enable_tsc_tweak; /* Use CPU Base freq instead of TSC freq for aperf/mperf counter */ + bool need_perf_multiplier; /* mperf/aperf multiplier */ }; struct platform_data { @@ -758,6 +759,7 @@ static const struct platform_features knl_features = { .trl_msrs = TRL_KNL, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, .has_fixed_rapl_unit = 1, + .need_perf_multiplier = 1, }; static const struct platform_features default_features = { @@ -5107,28 +5109,6 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; } -int is_knl(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_XEON_PHI_KNL: /* KNL */ - return 1; - } - return 0; -} - -unsigned int get_aperf_mperf_multiplier(unsigned int family, unsigned int model) -{ - if (is_knl(family, model)) - return 1024; - return 1; -} - int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p) { unsigned int eax, ebx, ecx, edx; @@ -5630,7 +5610,7 @@ void process_cpuid() } if (has_aperf) - aperf_mperf_multiplier = get_aperf_mperf_multiplier(family, model); + aperf_mperf_multiplier = platform->need_perf_multiplier ? 1024 : 1; BIC_PRESENT(BIC_IRQ); BIC_PRESENT(BIC_TSC_MHz); -- cgit From 7d0ebe6f7eaf8c0ac069ddab1fe3793401139fb3 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 15:57:44 +0800 Subject: tools/power/turbostat: Abstract cstate prewake bit support Abstract cstate prewake bit support. Delete is_icx()/is_spr() CPU model checks. 
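The bit being abstracted is bit 30 of MSR_IA32_POWER_CTL. A small sketch of decoding it from a raw value, mirroring the dump code in the hunk below; the sample MSR values are invented for illustration:

    #include <stdio.h>

    /* bit 30 of MSR_IA32_POWER_CTL is the C-state pre-wake disable bit */
    static void show_prewake(unsigned long long msr)
    {
        printf("MSR_IA32_POWER_CTL: 0x%08llx (C-state Pre-wake: %sabled)\n",
               msr, msr & 0x40000000 ? "DIS" : "EN");
    }

    int main(void)
    {
        show_prewake(0x00000000ULL);    /* invented value: pre-wake left enabled */
        show_prewake(0x40000000ULL);    /* invented value: pre-wake disabled */
        return 0;
    }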
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 47 +++-------------------------------- 1 file changed, 4 insertions(+), 43 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 65a507d82fc4..580cc2a3b947 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -252,7 +252,6 @@ unsigned int tj_max_override; double rapl_power_units, rapl_time_units; double rapl_dram_energy_units, rapl_energy_units; double rapl_joule_counter_range; -unsigned int dis_cstate_prewake; unsigned int crystal_hz; unsigned long long tsc_hz; int base_cpu; @@ -287,6 +286,7 @@ struct platform_features { bool has_msr_atom_pkg_c6_residency; /* MSR_ATOM_PKG_C6_RESIDENCY */ bool has_msr_knl_core_c6_residency; /* MSR_KNL_CORE_C6_RESIDENCY */ bool has_ext_cst_msrs; /* MSR_PKG_WEIGHTED_CORE_C0_RES/MSR_PKG_ANY_CORE_C0_RES/MSR_PKG_ANY_GFXE_C0_RES/MSR_PKG_BOTH_CORE_GFXE_C0_RES */ + bool has_cst_prewake_bit; /* Cstate prewake bit in MSR_IA32_POWER_CTL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ @@ -635,6 +635,7 @@ static const struct platform_features icx_features = { .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_ICX, .has_irtl_msrs = 1, + .has_cst_prewake_bit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, .has_fixed_rapl_unit = 1, @@ -649,6 +650,7 @@ static const struct platform_features spr_features = { .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_SKX, .has_irtl_msrs = 1, + .has_cst_prewake_bit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; @@ -3014,8 +3016,6 @@ void probe_cst_limit(void) pkg_cstate_limit = pkg_cstate_limits[msr & 0xF]; } -void prewake_cstate_probe(unsigned int family, unsigned int model); - static void dump_platform_info(void) { unsigned long long msr; @@ -3036,7 +3036,7 @@ static void dump_platform_info(void) base_cpu, msr, msr & 0x2 ? "EN" : "DIS"); /* C-state Pre-wake Disable (CSTATE_PREWAKE_DISABLE) */ - if (dis_cstate_prewake) + if (platform->has_cst_prewake_bit) fprintf(outf, "C-state Pre-wake: %sabled\n", msr & 0x40000000 ? 
"DIS" : "EN"); return; @@ -4289,38 +4289,6 @@ void probe_bclk(void) tsc_tweak = base_hz / tsc_hz; } -int is_icx(unsigned int family, unsigned int model) -{ - - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_ICELAKE_X: - return 1; - } - return 0; -} - -int is_spr(unsigned int family, unsigned int model) -{ - - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_SAPPHIRERAPIDS_X: - return 1; - } - return 0; -} - static void remove_underbar(char *s) { char *to = s; @@ -4910,12 +4878,6 @@ void rapl_probe(void) rapl_probe_amd(); } -void prewake_cstate_probe(unsigned int family, unsigned int model) -{ - if (is_icx(family, model) || is_spr(family, model)) - dis_cstate_prewake = 1; -} - int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p) { unsigned long long msr; @@ -5638,7 +5600,6 @@ void process_cpuid() decode_c6_demotion_policy_msr(); rapl_probe(); - prewake_cstate_probe(family, model); if (!quiet) dump_cstate_pstate_config_info(); -- cgit From d085b3b0f11af7d23601b832e0d9e446d18df968 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 21:53:21 +0800 Subject: tools/power/turbostat: Delete intel_model_duplicates() Now CPU model checks have been cleaned up, no code depends on the duplicated CPU model value. Delete intel_model_duplicates(). Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 59 ----------------------------------- 1 file changed, 59 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 580cc2a3b947..89f53e1ac63a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5258,63 +5258,6 @@ void decode_c6_demotion_policy_msr(void) base_cpu, msr, msr & (1 << 0) ? 
"EN" : "DIS"); } -/* - * When models are the same, for the purpose of turbostat, reuse - */ -unsigned int intel_model_duplicates(unsigned int model) -{ - - switch (model) { - case INTEL_FAM6_NEHALEM_EP: /* Core i7, Xeon 5500 series - Bloomfield, Gainstown NHM-EP */ - case INTEL_FAM6_NEHALEM_G: /* Core i7 and i5 Processor - Nehalem */ - case INTEL_FAM6_WESTMERE: /* Westmere Client - Clarkdale, Arrandale */ - case INTEL_FAM6_WESTMERE_EP: /* Westmere EP - Gulftown */ - return INTEL_FAM6_NEHALEM; - - case INTEL_FAM6_WESTMERE_EX: /* Westmere-EX Xeon - Eagleton */ - return INTEL_FAM6_NEHALEM_EX; - - case INTEL_FAM6_XEON_PHI_KNM: - return INTEL_FAM6_XEON_PHI_KNL; - - case INTEL_FAM6_BROADWELL_D: /* BDX-DE */ - return INTEL_FAM6_BROADWELL_X; - - case INTEL_FAM6_SKYLAKE: - case INTEL_FAM6_KABYLAKE_L: - case INTEL_FAM6_KABYLAKE: - case INTEL_FAM6_COMETLAKE_L: - case INTEL_FAM6_COMETLAKE: - return INTEL_FAM6_SKYLAKE_L; - - case INTEL_FAM6_ICELAKE_L: - case INTEL_FAM6_ICELAKE_NNPI: - case INTEL_FAM6_TIGERLAKE_L: - case INTEL_FAM6_TIGERLAKE: - case INTEL_FAM6_ROCKETLAKE: - case INTEL_FAM6_LAKEFIELD: - case INTEL_FAM6_ALDERLAKE: - case INTEL_FAM6_ALDERLAKE_L: - case INTEL_FAM6_ATOM_GRACEMONT: - case INTEL_FAM6_RAPTORLAKE: - case INTEL_FAM6_RAPTORLAKE_P: - case INTEL_FAM6_RAPTORLAKE_S: - case INTEL_FAM6_METEORLAKE: - case INTEL_FAM6_METEORLAKE_L: - return INTEL_FAM6_CANNONLAKE_L; - - case INTEL_FAM6_ATOM_TREMONT_L: - return INTEL_FAM6_ATOM_TREMONT; - - case INTEL_FAM6_ICELAKE_D: - return INTEL_FAM6_ICELAKE_X; - - case INTEL_FAM6_EMERALDRAPIDS_X: - return INTEL_FAM6_SAPPHIRERAPIDS_X; - } - return model; -} - void print_dev_latency(void) { char *path = "/dev/cpu_dma_latency"; @@ -5457,8 +5400,6 @@ void process_cpuid() } probe_platform_features(family, model); - if (genuine_intel) - model = intel_model_duplicates(model); if (!(edx_flags & (1 << 5))) errx(1, "CPUID: no MSR"); -- cgit From 32e8c6169af7ef7c938b1bd996d27ab171e27d80 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 14:44:06 +0800 Subject: tools/power/turbostat: Improve probe_platform_features() logic AMD/Hygon platforms that don't have RAPL use 'amd_features' to describe the platform features. Unknown Intel platforms use 'default_features' to describe the platform features. As none of the platform feature is set for 'amd_features' or 'default_features', there is no need to maintain both of them. Remove 'amd_features' structure and improve the logic in probe_platform_features(). 
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 89f53e1ac63a..102ba515cf4b 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -767,9 +767,6 @@ static const struct platform_features knl_features = { static const struct platform_features default_features = { }; -static const struct platform_features amd_features = { -}; - static const struct platform_features amd_features_with_rapl = { .rapl_msrs = RAPL_AMD_F17H, .has_per_core_rapl = 1, @@ -849,9 +846,9 @@ void probe_platform_features(unsigned int family, unsigned int model) { int i; - if (authentic_amd || hygon_genuine) { - platform = &amd_features; + platform = &default_features; + if (authentic_amd || hygon_genuine) { if (max_extended_level >= 0x80000007) { unsigned int eax, ebx, ecx, edx; @@ -863,8 +860,6 @@ void probe_platform_features(unsigned int family, unsigned int model) return; } - platform = &default_features; - if (!genuine_intel || family != 6) return; -- cgit From 045acf6064c5567011163e97c28906e0a7791414 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 29 Aug 2023 13:47:58 +0800 Subject: tools/power/turbostat: Relocate cstate probing code Move all cstate probing related code into probe_cstates(). Note that dump_platform_info() actually dumps both MSR_PLATFORM_INFO and MSR_IA32_POWER_CTL. MSR_PLATFORM_INFO is for pstate and MSR_IA32_POWER_CTL is for cstate. So split dump_platform_info() and dump MSR_IA32_POWER_CTL in probe_cstates(). Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 50 ++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 102ba515cf4b..e5ca1586961e 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -272,7 +272,7 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr); struct platform_features { bool has_msr_misc_feature_control; /* MSR_MISC_FEATURE_CONTROL */ bool has_msr_misc_pwr_mgmt; /* MSR_MISC_PWR_MGMT */ - bool has_nhm_msrs; /* MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT, MSR_PKG_CST_CONFIG_CONTROL, TRL MSRs */ + bool has_nhm_msrs; /* MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT, MSR_PKG_CST_CONFIG_CONTROL, MSR_IA32_POWER_CTL, TRL MSRs */ bool has_config_tdp; /* MSR_CONFIG_TDP_NOMINAL/LEVEL_1/LEVEL_2/CONTROL, MSR_TURBO_ACTIVATION_RATIO */ int bclk_freq; /* CPU base clock */ int crystal_freq; /* Crystal clock to use when not available from CPUID.15 */ @@ -3025,6 +3025,14 @@ static void dump_platform_info(void) ratio = (msr >> 8) & 0xFF; fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", ratio, bclk, ratio * bclk); +} + +static void dump_power_ctl(void) +{ + unsigned long long msr; + + if (!platform->has_nhm_msrs) + return; get_msr(base_cpu, MSR_IA32_POWER_CTL, &msr); fprintf(outf, "cpu%d: MSR_IA32_POWER_CTL: 0x%08llx (C1E auto-promotion: %sabled)\n", @@ -3229,6 +3237,9 @@ static void dump_cst_cfg(void) { unsigned long long msr; + if (!platform->has_nhm_msrs) + return; + get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr); fprintf(outf, "cpu%d: MSR_PKG_CST_CONFIG_CONTROL: 0x%08llx", base_cpu, msr); @@ -4332,7 +4343,6 @@ static void dump_cstate_pstate_config_info(void) dump_platform_info(); 
dump_turbo_ratio_info(); - dump_cst_cfg(); } static int read_sysfs_int(char *path) @@ -5332,6 +5342,25 @@ void probe_cstates(void) if (platform->supported_cstates & PC10 && (pkg_cstate_limit >= PCL_10)) BIC_PRESENT(BIC_Pkgpc10); + + if (platform->has_msr_module_c6_res_ms) + BIC_PRESENT(BIC_Mod_c6); + + if (platform->has_ext_cst_msrs) { + BIC_PRESENT(BIC_Totl_c0); + BIC_PRESENT(BIC_Any_c0); + BIC_PRESENT(BIC_GFX_c0); + BIC_PRESENT(BIC_CPUGFX); + } + + if (quiet) + return; + + dump_power_ctl(); + dump_cst_cfg(); + decode_c6_demotion_policy_msr(); + print_dev_latency(); + dump_sysfs_cstate_config(); } void process_cpuid() @@ -5519,32 +5548,15 @@ void process_cpuid() BIC_PRESENT(BIC_SMI); probe_bclk(); - if (platform->has_msr_module_c6_res_ms) - BIC_PRESENT(BIC_Mod_c6); - - if (platform->has_ext_cst_msrs) { - BIC_PRESENT(BIC_Totl_c0); - BIC_PRESENT(BIC_Any_c0); - BIC_PRESENT(BIC_GFX_c0); - BIC_PRESENT(BIC_CPUGFX); - } - if (!quiet) decode_misc_pwr_mgmt_msr(); - if (!quiet) - decode_c6_demotion_policy_msr(); - rapl_probe(); if (!quiet) dump_cstate_pstate_config_info(); intel_uncore_frequency_probe(); - if (!quiet) - print_dev_latency(); - if (!quiet) - dump_sysfs_cstate_config(); if (!quiet) dump_sysfs_pstate_config(); -- cgit From 11cd9a09f3e827605cf8fc7c343ddb8c9f4ee95d Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 15:58:10 +0800 Subject: tools/power/turbostat: Relocate pstate probing code Introduce probe_pstates() and move all pstate probing related code into it. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 39 +++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e5ca1586961e..1bcaee6d0ead 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3016,6 +3016,9 @@ static void dump_platform_info(void) unsigned long long msr; unsigned int ratio; + if (!platform->has_nhm_msrs) + return; + get_msr(base_cpu, MSR_PLATFORM_INFO, &msr); fprintf(outf, "cpu%d: MSR_PLATFORM_INFO: 0x%08llx\n", base_cpu, msr); @@ -4313,6 +4316,9 @@ static void dump_turbo_ratio_info(void) if (!has_turbo) return; + if (!platform->has_nhm_msrs) + return; + if (platform->trl_msrs & TRL_LIMIT2) dump_turbo_ratio_limit2(); @@ -4336,15 +4342,6 @@ static void dump_turbo_ratio_info(void) dump_config_tdp(); } -static void dump_cstate_pstate_config_info(void) -{ - if (!platform->has_nhm_msrs) - return; - - dump_platform_info(); - dump_turbo_ratio_info(); -} - static int read_sysfs_int(char *path) { FILE *input; @@ -5363,6 +5360,19 @@ void probe_cstates(void) dump_sysfs_cstate_config(); } +void probe_pstates(void) +{ + probe_bclk(); + + if (quiet) + return; + + dump_platform_info(); + dump_turbo_ratio_info(); + dump_sysfs_pstate_config(); + decode_misc_pwr_mgmt_msr(); +} + void process_cpuid() { unsigned int eax, ebx, ecx, edx; @@ -5542,24 +5552,17 @@ void process_cpuid() BIC_PRESENT(BIC_IRQ); BIC_PRESENT(BIC_TSC_MHz); + probe_pstates(); + probe_cstates(); if (platform->has_nhm_msrs) BIC_PRESENT(BIC_SMI); - probe_bclk(); - - if (!quiet) - decode_misc_pwr_mgmt_msr(); rapl_probe(); - if (!quiet) - dump_cstate_pstate_config_info(); intel_uncore_frequency_probe(); - if (!quiet) - dump_sysfs_pstate_config(); - if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) BIC_PRESENT(BIC_GFX_rc6); -- cgit From 622c8f23556266a5afdd657aa0518b7d70d5dfc7 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 
Aug 2023 16:00:15 +0800 Subject: tools/power/turbostat: Rename uncore probing function Rename intel_uncore_frequency_probe() to probe_intel_uncore_frequency() to be consistent with other probing function names. Probe uncore frequency right after probing cstates. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 1bcaee6d0ead..a956a30d81de 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4378,7 +4378,7 @@ static void dump_sysfs_file(char *path) fprintf(outf, "%s: %s", strrchr(path, '/') + 1, cpuidle_buf); } -static void intel_uncore_frequency_probe(void) +static void probe_intel_uncore_frequency(void) { int i, j; char path[128]; @@ -5556,13 +5556,13 @@ void process_cpuid() probe_cstates(); + probe_intel_uncore_frequency(); + if (platform->has_nhm_msrs) BIC_PRESENT(BIC_SMI); rapl_probe(); - intel_uncore_frequency_probe(); - if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) BIC_PRESENT(BIC_GFX_rc6); -- cgit From 6cb13609a07ba466b4613fdce7da8c98508069b7 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 16:00:33 +0800 Subject: tools/power/turbostat: Rename rapl probing function Rename rapl_probe() to probe_rapl() to be consistent with other probing function names. Probe rapl after probing uncore frequency. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a956a30d81de..bf2b1d1b2627 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4865,11 +4865,11 @@ void rapl_probe_amd(void) } /* - * rapl_probe() + * probe_rapl() * * sets rapl_power_units, rapl_energy_units, rapl_time_units */ -void rapl_probe(void) +void probe_rapl(void) { if (!platform->rapl_msrs) return; @@ -5558,11 +5558,11 @@ void process_cpuid() probe_intel_uncore_frequency(); + probe_rapl(); + if (platform->has_nhm_msrs) BIC_PRESENT(BIC_SMI); - rapl_probe(); - if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) BIC_PRESENT(BIC_GFX_rc6); -- cgit From 2538d1673d02f66f6bdf01eebf36e271228778e9 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 16:01:12 +0800 Subject: tools/power/turbostat: Relocate graphics probing code Introduce probe_graphics(), and move all graphics probing related code into it. 
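The probe itself is nothing more than access() checks on sysfs nodes. A tiny runnable sketch of the same presence-test pattern, using the paths and fallbacks from the hunk below:

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK))
            printf("BIC_GFX_rc6 would be enabled\n");

        if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK) ||
            !access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK))
            printf("BIC_GFXMHz would be enabled\n");

        if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK) ||
            !access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK))
            printf("BIC_GFXACTMHz would be enabled\n");

        return 0;
    }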
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bf2b1d1b2627..feff9ecff368 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4424,6 +4424,20 @@ static void probe_intel_uncore_frequency(void) } } +static void probe_graphics(void) +{ + if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) + BIC_PRESENT(BIC_GFX_rc6); + + if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK) || + !access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) + BIC_PRESENT(BIC_GFXMHz); + + if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK) || + !access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) + BIC_PRESENT(BIC_GFXACTMHz); +} + static void dump_sysfs_cstate_config(void) { char path[64]; @@ -5558,22 +5572,13 @@ void process_cpuid() probe_intel_uncore_frequency(); + probe_graphics(); + probe_rapl(); if (platform->has_nhm_msrs) BIC_PRESENT(BIC_SMI); - if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) - BIC_PRESENT(BIC_GFX_rc6); - - if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK) || - !access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) - BIC_PRESENT(BIC_GFXMHz); - - if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK) || - !access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) - BIC_PRESENT(BIC_GFXACTMHz); - if (!access("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", R_OK)) BIC_PRESENT(BIC_CPU_LPI); else -- cgit From e7d7b82de192464b733fb2bcc9e305ea6f6ea47e Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 16:01:36 +0800 Subject: tools/power/turbostat: Relocate lpi probing code Introduce probe_lpi(), and move all lpi probing related code into it. 
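For readers skimming the relocation, a standalone sketch of the sysfs-then-debugfs fallback that moves into probe_lpi(); the path strings below are placeholders, since the real sys_lpi_file_* strings are defined elsewhere in turbostat:

    #include <stdio.h>
    #include <unistd.h>

    /* placeholders: the real sys_lpi_file_* paths are defined elsewhere in turbostat */
    static char *sys_lpi_file_sysfs = "/sys/placeholder/system_lpi_residency_us";
    static char *sys_lpi_file_debugfs = "/sys/kernel/debug/placeholder/system_lpi_residency_us";
    static char *sys_lpi_file;

    int main(void)
    {
        if (!access(sys_lpi_file_sysfs, R_OK)) {
            sys_lpi_file = sys_lpi_file_sysfs;
            printf("BIC_SYS_LPI via sysfs: %s\n", sys_lpi_file);
        } else if (!access(sys_lpi_file_debugfs, R_OK)) {
            sys_lpi_file = sys_lpi_file_debugfs;
            printf("BIC_SYS_LPI via debugfs: %s\n", sys_lpi_file);
        } else {
            printf("BIC_SYS_LPI not present\n");
        }
        return 0;
    }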
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 38 ++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index feff9ecff368..ad9147757d5a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5374,6 +5374,26 @@ void probe_cstates(void) dump_sysfs_cstate_config(); } +void probe_lpi(void) +{ + if (!access("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", R_OK)) + BIC_PRESENT(BIC_CPU_LPI); + else + BIC_NOT_PRESENT(BIC_CPU_LPI); + + if (!access(sys_lpi_file_sysfs, R_OK)) { + sys_lpi_file = sys_lpi_file_sysfs; + BIC_PRESENT(BIC_SYS_LPI); + } else if (!access(sys_lpi_file_debugfs, R_OK)) { + sys_lpi_file = sys_lpi_file_debugfs; + BIC_PRESENT(BIC_SYS_LPI); + } else { + sys_lpi_file_sysfs = NULL; + BIC_NOT_PRESENT(BIC_SYS_LPI); + } + +} + void probe_pstates(void) { probe_bclk(); @@ -5570,6 +5590,8 @@ void process_cpuid() probe_cstates(); + probe_lpi(); + probe_intel_uncore_frequency(); probe_graphics(); @@ -5579,27 +5601,11 @@ void process_cpuid() if (platform->has_nhm_msrs) BIC_PRESENT(BIC_SMI); - if (!access("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", R_OK)) - BIC_PRESENT(BIC_CPU_LPI); - else - BIC_NOT_PRESENT(BIC_CPU_LPI); - if (!access("/sys/devices/system/cpu/cpu0/thermal_throttle/core_throttle_count", R_OK)) BIC_PRESENT(BIC_CORE_THROT_CNT); else BIC_NOT_PRESENT(BIC_CORE_THROT_CNT); - if (!access(sys_lpi_file_sysfs, R_OK)) { - sys_lpi_file = sys_lpi_file_sysfs; - BIC_PRESENT(BIC_SYS_LPI); - } else if (!access(sys_lpi_file_debugfs, R_OK)) { - sys_lpi_file = sys_lpi_file_debugfs; - BIC_PRESENT(BIC_SYS_LPI); - } else { - sys_lpi_file_sysfs = NULL; - BIC_NOT_PRESENT(BIC_SYS_LPI); - } - if (!quiet) decode_misc_feature_control(); -- cgit From db735f8ba78bf7692579b78f0ca1e40563ef79fd Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 16:02:16 +0800 Subject: tools/power/turbostat: Relocate thermal probing code Introduce probe_thermal(), and move all thermal probing related code into it. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ad9147757d5a..8dae576234a4 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4894,6 +4894,14 @@ void probe_rapl(void) rapl_probe_amd(); } +void probe_thermal(void) +{ + if (!access("/sys/devices/system/cpu/cpu0/thermal_throttle/core_throttle_count", R_OK)) + BIC_PRESENT(BIC_CORE_THROT_CNT); + else + BIC_NOT_PRESENT(BIC_CORE_THROT_CNT); +} + int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p) { unsigned long long msr; @@ -5598,14 +5606,11 @@ void process_cpuid() probe_rapl(); + probe_thermal(); + if (platform->has_nhm_msrs) BIC_PRESENT(BIC_SMI); - if (!access("/sys/devices/system/cpu/cpu0/thermal_throttle/core_throttle_count", R_OK)) - BIC_PRESENT(BIC_CORE_THROT_CNT); - else - BIC_NOT_PRESENT(BIC_CORE_THROT_CNT); - if (!quiet) decode_misc_feature_control(); -- cgit From ce7a32c2a4cdba06ed5ead4ed8f980c893e20cd2 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 30 Aug 2023 16:36:06 +0800 Subject: tools/power/turbostat: Reorder some functions Reorder some functions to solve code depdency introduced by next patch. 
No functional change. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 214 +++++++++++++++++----------------- 1 file changed, 107 insertions(+), 107 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 8dae576234a4..eb333612bdec 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4878,92 +4878,6 @@ void rapl_probe_amd(void) fprintf(outf, "RAPL: %.0f sec. Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp); } -/* - * probe_rapl() - * - * sets rapl_power_units, rapl_energy_units, rapl_time_units - */ -void probe_rapl(void) -{ - if (!platform->rapl_msrs) - return; - - if (genuine_intel) - rapl_probe_intel(); - if (authentic_amd || hygon_genuine) - rapl_probe_amd(); -} - -void probe_thermal(void) -{ - if (!access("/sys/devices/system/cpu/cpu0/thermal_throttle/core_throttle_count", R_OK)) - BIC_PRESENT(BIC_CORE_THROT_CNT); - else - BIC_NOT_PRESENT(BIC_CORE_THROT_CNT); -} - -int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p) -{ - unsigned long long msr; - unsigned int dts, dts2; - int cpu; - - UNUSED(c); - UNUSED(p); - - if (!(do_dts || do_ptm)) - return 0; - - cpu = t->cpu_id; - - /* DTS is per-core, no need to print for each thread */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) - return 0; - - if (cpu_migrate(cpu)) { - fprintf(outf, "print_thermal: Could not migrate to CPU %d\n", cpu); - return -1; - } - - if (do_ptm && (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) { - if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr)) - return 0; - - dts = (msr >> 16) & 0x7F; - fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n", cpu, msr, tj_max - dts); - - if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &msr)) - return 0; - - dts = (msr >> 16) & 0x7F; - dts2 = (msr >> 8) & 0x7F; - fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", - cpu, msr, tj_max - dts, tj_max - dts2); - } - - if (do_dts && debug) { - unsigned int resolution; - - if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr)) - return 0; - - dts = (msr >> 16) & 0x7F; - resolution = (msr >> 27) & 0xF; - fprintf(outf, "cpu%d: MSR_IA32_THERM_STATUS: 0x%08llx (%d C +/- %d)\n", - cpu, msr, tj_max - dts, resolution); - - if (get_msr(cpu, MSR_IA32_THERM_INTERRUPT, &msr)) - return 0; - - dts = (msr >> 16) & 0x7F; - dts2 = (msr >> 8) & 0x7F; - fprintf(outf, "cpu%d: MSR_IA32_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", - cpu, msr, tj_max - dts, tj_max - dts2); - } - - return 0; -} - void print_power_limit_msr(int cpu, unsigned long long msr, char *label) { fprintf(outf, "cpu%d: %s: %sabled (%0.3f Watts, %f sec, clamp %sabled)\n", @@ -5095,29 +5009,20 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; } -int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p) +/* + * probe_rapl() + * + * sets rapl_power_units, rapl_energy_units, rapl_time_units + */ +void probe_rapl(void) { - unsigned int eax, ebx, ecx, edx; - - UNUSED(c); - UNUSED(p); - - if (!genuine_intel) - return 0; - - if (cpu_migrate(t->cpu_id)) { - fprintf(outf, "Could not migrate to CPU %d\n", t->cpu_id); - return -1; - } - - if (max_level < 0x1a) - return 0; + if (!platform->rapl_msrs) + return; - __cpuid(0x1a, eax, ebx, ecx, edx); - eax = (eax >> 24) & 0xFF; - if (eax == 0x20) - t->is_atom = true; - return 0; + if (genuine_intel) + rapl_probe_intel(); + if (authentic_amd || hygon_genuine) + 
rapl_probe_amd(); } /* @@ -5200,6 +5105,101 @@ guess: return 0; } +int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + unsigned long long msr; + unsigned int dts, dts2; + int cpu; + + UNUSED(c); + UNUSED(p); + + if (!(do_dts || do_ptm)) + return 0; + + cpu = t->cpu_id; + + /* DTS is per-core, no need to print for each thread */ + if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) + return 0; + + if (cpu_migrate(cpu)) { + fprintf(outf, "print_thermal: Could not migrate to CPU %d\n", cpu); + return -1; + } + + if (do_ptm && (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) { + if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr)) + return 0; + + dts = (msr >> 16) & 0x7F; + fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n", cpu, msr, tj_max - dts); + + if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &msr)) + return 0; + + dts = (msr >> 16) & 0x7F; + dts2 = (msr >> 8) & 0x7F; + fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", + cpu, msr, tj_max - dts, tj_max - dts2); + } + + if (do_dts && debug) { + unsigned int resolution; + + if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr)) + return 0; + + dts = (msr >> 16) & 0x7F; + resolution = (msr >> 27) & 0xF; + fprintf(outf, "cpu%d: MSR_IA32_THERM_STATUS: 0x%08llx (%d C +/- %d)\n", + cpu, msr, tj_max - dts, resolution); + + if (get_msr(cpu, MSR_IA32_THERM_INTERRUPT, &msr)) + return 0; + + dts = (msr >> 16) & 0x7F; + dts2 = (msr >> 8) & 0x7F; + fprintf(outf, "cpu%d: MSR_IA32_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", + cpu, msr, tj_max - dts, tj_max - dts2); + } + + return 0; +} + +void probe_thermal(void) +{ + if (!access("/sys/devices/system/cpu/cpu0/thermal_throttle/core_throttle_count", R_OK)) + BIC_PRESENT(BIC_CORE_THROT_CNT); + else + BIC_NOT_PRESENT(BIC_CORE_THROT_CNT); +} + +int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + unsigned int eax, ebx, ecx, edx; + + UNUSED(c); + UNUSED(p); + + if (!genuine_intel) + return 0; + + if (cpu_migrate(t->cpu_id)) { + fprintf(outf, "Could not migrate to CPU %d\n", t->cpu_id); + return -1; + } + + if (max_level < 0x1a) + return 0; + + __cpuid(0x1a, eax, ebx, ecx, edx); + eax = (eax >> 24) & 0xFF; + if (eax == 0x20) + t->is_atom = true; + return 0; +} + void decode_feature_control_msr(void) { unsigned long long msr; -- cgit From 5612b2c89bd0a3a2f7c9e8756e6ead971b03f8a3 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 30 Aug 2023 16:54:51 +0800 Subject: tools/power/turbostat: Relocate more probing related code Relocate more feature probing code outside of process_cpuids() into the corresponding probing functions. This improves the readability of code and the turbostat output. 
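Concretely, each probe function now owns both detection and its verbose output, so turbostat_init() no longer has to sprinkle print calls around. A runnable sketch of that shape, with stubs standing in for the real platform checks and for_all_cpus() loop:

    #include <stdbool.h>
    #include <stdio.h>

    static bool quiet;  /* set by command-line parsing in the real tool */

    static int print_rapl_stub(int cpu)
    {
        printf("cpu%d: (verbose RAPL dump would go here)\n", cpu);
        return 0;
    }

    /* shape of a relocated probe: detect first, then dump unless quiet is set */
    static void probe_rapl_shape(void)
    {
        bool rapl_msrs_present = true;  /* stand-in for platform->rapl_msrs */

        if (!rapl_msrs_present)
            return;

        /* ... unit/energy probing would happen here ... */

        if (quiet)
            return;

        for (int cpu = 0; cpu < 4; cpu++)   /* stand-in for for_all_cpus(print_rapl, ...) */
            print_rapl_stub(cpu);
    }

    int main(void)
    {
        probe_rapl_shape();
        return 0;
    }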
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 37 ++++++++++++++++------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index eb333612bdec..ffeee48e8d85 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5023,6 +5023,11 @@ void probe_rapl(void) rapl_probe_intel(); if (authentic_amd || hygon_genuine) rapl_probe_amd(); + + if (quiet) + return; + + for_all_cpus(print_rapl, ODD_COUNTERS); } /* @@ -5173,6 +5178,13 @@ void probe_thermal(void) BIC_PRESENT(BIC_CORE_THROT_CNT); else BIC_NOT_PRESENT(BIC_CORE_THROT_CNT); + + for_all_cpus(set_temperature_target, ODD_COUNTERS); + + if (quiet) + return; + + for_all_cpus(print_thermal, ODD_COUNTERS); } int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p) @@ -5380,6 +5392,7 @@ void probe_cstates(void) decode_c6_demotion_policy_msr(); print_dev_latency(); dump_sysfs_cstate_config(); + print_irtl(); } void probe_lpi(void) @@ -5413,6 +5426,10 @@ void probe_pstates(void) dump_turbo_ratio_info(); dump_sysfs_pstate_config(); decode_misc_pwr_mgmt_msr(); + + for_all_cpus(print_hwp, ODD_COUNTERS); + for_all_cpus(print_epb, ODD_COUNTERS); + for_all_cpus(print_perf_limit, ODD_COUNTERS); } void process_cpuid() @@ -5897,29 +5914,9 @@ void turbostat_init() process_cpuid(); linux_perf_init(); - if (!quiet) - for_all_cpus(print_hwp, ODD_COUNTERS); - - if (!quiet) - for_all_cpus(print_epb, ODD_COUNTERS); - - if (!quiet) - for_all_cpus(print_perf_limit, ODD_COUNTERS); - - if (!quiet) - for_all_cpus(print_rapl, ODD_COUNTERS); - - for_all_cpus(set_temperature_target, ODD_COUNTERS); - for_all_cpus(get_cpu_type, ODD_COUNTERS); for_all_cpus(get_cpu_type, EVEN_COUNTERS); - if (!quiet) - for_all_cpus(print_thermal, ODD_COUNTERS); - - if (!quiet) - print_irtl(); - if (DO_BIC(BIC_IPC)) (void)get_instr_count_fd(base_cpu); } -- cgit From 7ee39d8d593e3d28eced0fa1a8c8c6bdcbd4156e Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 13 Sep 2023 23:32:39 +0800 Subject: tools/power/turbostat: Introduce probe_pm_features() Feature probe has nothing to do with CPUID, thus it should not be in process_cpuids(). Introduce probe_pm_features() and move all feature probing functions into it. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ffeee48e8d85..607152b36c1a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5610,7 +5610,10 @@ void process_cpuid() BIC_PRESENT(BIC_IRQ); BIC_PRESENT(BIC_TSC_MHz); +} +void probe_pm_features(void) +{ probe_pstates(); probe_cstates(); @@ -5630,8 +5633,6 @@ void process_cpuid() if (!quiet) decode_misc_feature_control(); - - return; } /* @@ -5912,6 +5913,7 @@ void turbostat_init() check_dev_msr(); check_permissions(); process_cpuid(); + probe_pm_features(); linux_perf_init(); for_all_cpus(get_cpu_type, ODD_COUNTERS); -- cgit From 05ad96ff0fb9d1b16abb5022b9c62636c6780fc2 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 15:33:27 +0800 Subject: tools/power/turbostat: Enable MSR_CORE_C1_RES on recent Intel client platforms All recent Intel client platforms have MSR_CORE_C1_RES. Enable the support on these platforms, including CNL/ICL/LKF/RKL/TGL/ADL/RPL/MTL. 
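For context, the residency counters gated by these flags are read through the msr driver, the same interface turbostat's get_msr() sits on. A minimal sketch of such a read; 0x660 is assumed here to be the MSR_CORE_C1_RES address and should be verified against the kernel's MSR headers:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        unsigned long long c1_res;
        int fd = open("/dev/cpu/0/msr", O_RDONLY);

        if (fd < 0) {
            perror("open /dev/cpu/0/msr (needs the msr module and root)");
            return 1;
        }
        /* pread offset selects the MSR address; 0x660 assumed = MSR_CORE_C1_RES */
        if (pread(fd, &c1_res, sizeof(c1_res), 0x660) != sizeof(c1_res)) {
            perror("pread MSR_CORE_C1_RES");
            close(fd);
            return 1;
        }
        printf("cpu0: MSR_CORE_C1_RES: %llu\n", c1_res);
        close(fd);
        return 0;
    }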
Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 607152b36c1a..9895f348b637 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -604,6 +604,7 @@ static const struct platform_features cnl_features = { .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, .has_irtl_msrs = 1, + .has_msr_core_c1_res = 1, .has_ext_cst_msrs = 1, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, -- cgit From 6b74a30b767e362eda7deeac52edcd546c5f6d8f Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 9 Sep 2023 13:26:51 +0800 Subject: tools/power/turbostat: Remove PC7/PC9 support on ADL/RPL Compared with other platforms that share cnl_features, ADL/RPL don't have PC7/PC9. Clone a new platform feature set from cnl_features for ADL/RPL, with PC7/PC9 removed. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9895f348b637..a769daa59b12 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -612,6 +612,23 @@ static const struct platform_features cnl_features = { .enable_tsc_tweak = 1, }; +static const struct platform_features adl_features = { + .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, + .has_config_tdp = 1, + .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC8 | PC10, + .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, + .has_msr_core_c1_res = 1, + .has_ext_cst_msrs = 1, + .trl_msrs = TRL_BASE, + .tcc_offset_bits = 6, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, + .enable_tsc_tweak = 1, +}; + static const struct platform_features skx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, @@ -812,11 +829,11 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_SAPPHIRERAPIDS_X, &spr_features }, { INTEL_FAM6_EMERALDRAPIDS_X, &spr_features }, { INTEL_FAM6_LAKEFIELD, &cnl_features }, - { INTEL_FAM6_ALDERLAKE, &cnl_features }, - { INTEL_FAM6_ALDERLAKE_L, &cnl_features }, - { INTEL_FAM6_RAPTORLAKE, &cnl_features }, - { INTEL_FAM6_RAPTORLAKE_P, &cnl_features }, - { INTEL_FAM6_RAPTORLAKE_S, &cnl_features }, + { INTEL_FAM6_ALDERLAKE, &adl_features }, + { INTEL_FAM6_ALDERLAKE_L, &adl_features }, + { INTEL_FAM6_RAPTORLAKE, &adl_features }, + { INTEL_FAM6_RAPTORLAKE_P, &adl_features }, + { INTEL_FAM6_RAPTORLAKE_S, &adl_features }, { INTEL_FAM6_METEORLAKE, &cnl_features }, { INTEL_FAM6_METEORLAKE_L, &cnl_features }, { INTEL_FAM6_ATOM_SILVERMONT, &slv_features }, @@ -828,7 +845,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_ATOM_TREMONT_D, &tmtd_features }, { INTEL_FAM6_ATOM_TREMONT, &tmt_features }, { INTEL_FAM6_ATOM_TREMONT_L, &tmt_features }, - { INTEL_FAM6_ATOM_GRACEMONT, &cnl_features }, + { INTEL_FAM6_ATOM_GRACEMONT, &adl_features }, { INTEL_FAM6_XEON_PHI_KNL, &knl_features }, { INTEL_FAM6_XEON_PHI_KNM, &knl_features }, /* -- cgit From 71cfd1da9f0635ccd7124ad8b67b9aed596be491 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 6 Oct 2023 11:01:17 +0800 Subject: tools/power/turbostat: Introduce cpu_allowed_set 
Turbostat supports "-c" parameter which limits output to system summary plus the specified cpu-set. But some code still uses cpu_present_set to read and dump the counters. Introduce cpu_allowed_set for code that should obey the specified cpu-set. No functional change. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a769daa59b12..d8f44ea5b4bf 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -904,8 +904,8 @@ int backwards_count; char *progname; #define CPU_SUBSET_MAXCPUS 1024 /* need to use before probe... */ -cpu_set_t *cpu_present_set, *cpu_affinity_set, *cpu_subset; -size_t cpu_present_setsize, cpu_affinity_setsize, cpu_subset_size; +cpu_set_t *cpu_present_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset; +size_t cpu_present_setsize, cpu_allowed_setsize, cpu_affinity_setsize, cpu_subset_size; #define MAX_ADDED_COUNTERS 8 #define MAX_ADDED_THREAD_COUNTERS 24 #define BITMASK_SIZE 32 @@ -1157,6 +1157,11 @@ int cpu_is_not_present(int cpu) return !CPU_ISSET_S(cpu, cpu_present_setsize, cpu_present_set); } +int cpu_is_not_allowed(int cpu) +{ + return !CPU_ISSET_S(cpu, cpu_allowed_setsize, cpu_allowed_set); +} + /* * run func(thread, core, package) in topology order * skip non-present cpus @@ -3396,6 +3401,10 @@ void free_all_buffers(void) cpu_present_set = NULL; cpu_present_setsize = 0; + CPU_FREE(cpu_allowed_set); + cpu_allowed_set = NULL; + cpu_allowed_setsize = 0; + CPU_FREE(cpu_affinity_set); cpu_affinity_set = NULL; cpu_affinity_setsize = 0; @@ -5697,13 +5706,29 @@ void topology_probe() CPU_ZERO_S(cpu_present_setsize, cpu_present_set); for_all_proc_cpus(mark_cpu_present); + /* + * Allocate and initialize cpu_allowed_set + */ + cpu_allowed_set = CPU_ALLOC((topo.max_cpu_num + 1)); + if (cpu_allowed_set == NULL) + err(3, "CPU_ALLOC"); + cpu_allowed_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1)); + CPU_ZERO_S(cpu_allowed_setsize, cpu_allowed_set); + /* * Validate that all cpus in cpu_subset are also in cpu_present_set */ for (i = 0; i < CPU_SUBSET_MAXCPUS; ++i) { - if (CPU_ISSET_S(i, cpu_subset_size, cpu_subset)) + if (!cpu_subset) { + if (CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) + CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set); + continue; + } + if (CPU_ISSET_S(i, cpu_subset_size, cpu_subset)) { if (!CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) err(1, "cpu%d not present", i); + CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set); + } } /* -- cgit From 4ede6d1ce7acba9cafe7df4e935b174623cd2181 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 4 Oct 2023 13:52:02 +0800 Subject: tools/power/turbostat: Obey allowed CPUs when accessing CPU counters for_all_cpus/for_all_cpus_2 are used for accessing the per CPU counters, and they should follow the cpu_allowed_set instead of cpu_present_set. 
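The allowed-set plumbing is plain glibc dynamic CPU sets. A runnable sketch of the same CPU_*_S macros used by cpu_is_not_allowed() and the for_all_cpus loops; the max CPU number and the "-c 2,5" selection are made up for illustration:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
        int max_cpu_num = 7;    /* hypothetical; turbostat derives this from topology */
        cpu_set_t *cpu_allowed_set = CPU_ALLOC(max_cpu_num + 1);
        size_t cpu_allowed_setsize = CPU_ALLOC_SIZE(max_cpu_num + 1);
        int cpu;

        CPU_ZERO_S(cpu_allowed_setsize, cpu_allowed_set);
        CPU_SET_S(2, cpu_allowed_setsize, cpu_allowed_set);    /* e.g. "-c 2,5" */
        CPU_SET_S(5, cpu_allowed_setsize, cpu_allowed_set);

        for (cpu = 0; cpu <= max_cpu_num; cpu++)
            if (CPU_ISSET_S(cpu, cpu_allowed_setsize, cpu_allowed_set))
                printf("cpu%d: counters would be read\n", cpu);
            else
                printf("cpu%d: skipped (not in cpu_allowed_set)\n", cpu);

        CPU_FREE(cpu_allowed_set);
        return 0;
    }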
Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index d8f44ea5b4bf..202cf5231d7a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1182,7 +1182,7 @@ int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pk t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no); - if (cpu_is_not_present(t->cpu_id)) + if (cpu_is_not_allowed(t->cpu_id)) continue; c = GET_CORE(core_base, core_no, node_no, pkg_no); @@ -3618,7 +3618,7 @@ int for_all_cpus_2(int (func) (struct thread_data *, struct core_data *, t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no); - if (cpu_is_not_present(t->cpu_id)) + if (cpu_is_not_allowed(t->cpu_id)) continue; t2 = GET_THREAD(thread_base2, thread_no, core_no, node_no, pkg_no); -- cgit From 7bb3fe27ad4f62039b2ac80a2147452a608b474f Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 4 Oct 2023 14:25:25 +0800 Subject: tools/power/turbostat: Obey allowed CPUs during startup Set turbostat CPU affinity to make sure turbostat is running on one of the allowed CPUs. Set base_cpu to the first allowed CPU so that some platform information is dumped using one of the allowed CPUs. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 202cf5231d7a..f81ce832d17a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5731,6 +5731,10 @@ void topology_probe() } } + if (!CPU_COUNT_S(cpu_allowed_setsize, cpu_allowed_set)) + err(-ENODEV, "No valid cpus found"); + sched_setaffinity(0, cpu_allowed_setsize, cpu_allowed_set); + /* * Allocate and initialize cpu_affinity_set */ @@ -5941,12 +5945,17 @@ void setup_all_buffers(void) void set_base_cpu(void) { - base_cpu = sched_getcpu(); - if (base_cpu < 0) - err(-ENODEV, "No valid cpus found"); + int i; - if (debug > 1) - fprintf(outf, "base_cpu = %d\n", base_cpu); + for (i = 0; i < topo.max_cpu_num + 1; ++i) { + if (cpu_is_not_allowed(i)) + continue; + base_cpu = i; + if (debug > 1) + fprintf(outf, "base_cpu = %d\n", base_cpu); + return; + } + err(-ENODEV, "No valid cpus found"); } void turbostat_init() @@ -5976,8 +5985,6 @@ int fork_it(char **argv) first_counter_read = 0; if (status) exit(status); - /* clear affinity side-effect of get_counters() */ - sched_setaffinity(0, cpu_present_setsize, cpu_present_set); gettimeofday(&tv_even, (struct timezone *)NULL); child_pid = fork(); -- cgit From 74318add132365db3026e281ac06836a26cda857 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 4 Oct 2023 15:13:23 +0800 Subject: tools/power/turbostat: Abstract several functions When detecting the primary thread/core in a core/package, current code doesn't handle the allowed CPUs. Abstract several functions for further fix of this issue. No functional change. 
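A compact standalone sketch of the predicate-style helpers this patch introduces: callers ask is_cpu_first_thread_in_core() instead of testing t->flags directly, so the definition of "first" can later change in exactly one place. The types here are pared down from the real ones:

    #include <stdio.h>

    #define CPU_IS_FIRST_THREAD_IN_CORE     0x2
    #define CPU_IS_FIRST_CORE_IN_PACKAGE    0x4

    struct thread_data { unsigned int cpu_id; unsigned int flags; };
    struct core_data { int core_id; };
    struct pkg_data { int package_id; };

    static int is_cpu_first_thread_in_core(struct thread_data *t, struct core_data *c,
                                           struct pkg_data *p)
    {
        (void)c; (void)p;   /* stands in for turbostat's UNUSED() */
        return (t->flags & CPU_IS_FIRST_THREAD_IN_CORE);
    }

    int main(void)
    {
        struct thread_data t = { .cpu_id = 4, .flags = CPU_IS_FIRST_THREAD_IN_CORE };
        struct core_data c = { .core_id = 2 };
        struct pkg_data p = { .package_id = 0 };

        if (is_cpu_first_thread_in_core(&t, &c, &p))
            printf("cpu%u carries the per-core counters\n", t.cpu_id);
        return 0;
    }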
Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 58 +++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f81ce832d17a..5f6bc076e1dd 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1198,6 +1198,30 @@ int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pk return 0; } +int is_cpu_first_thread_in_core(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + UNUSED(c); + UNUSED(p); + + return (t->flags & CPU_IS_FIRST_THREAD_IN_CORE); +} + +int is_cpu_first_core_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + UNUSED(c); + UNUSED(p); + + return (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE); +} + +int is_cpu_first_thread_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + UNUSED(c); + UNUSED(p); + + return (t->flags & CPU_IS_FIRST_THREAD_IN_CORE) && (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE); +} + int cpu_migrate(int cpu) { CPU_ZERO_S(cpu_affinity_setsize, cpu_affinity_set); @@ -1682,11 +1706,11 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data int printed = 0; /* if showing only 1st thread in core and this isn't one, bail out */ - if (show_core_only && !(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) + if (show_core_only && !is_cpu_first_thread_in_core(t, c, p)) return 0; /* if showing only 1st thread in pkg and this isn't one, bail out */ - if (show_pkg_only && !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (show_pkg_only && !is_cpu_first_core_in_package(t, c, p)) return 0; /*if not summary line and --cpu is used */ @@ -1820,7 +1844,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->c1 / tsc); /* print per-core data only for 1st thread in core */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) + if (!is_cpu_first_thread_in_core(t, c, p)) goto done; if (DO_BIC(BIC_CPU_c3)) @@ -1867,7 +1891,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, fmt8, (printed++ ? 
delim : ""), c->core_energy * rapl_energy_units); /* print per-package data only for 1st core in package */ - if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (!is_cpu_first_core_in_package(t, c, p)) goto done; /* PkgTmp */ @@ -2202,7 +2226,7 @@ int delta_cpu(struct thread_data *t, struct core_data *c, int retval = 0; /* calculate core delta only for 1st thread in core */ - if (t->flags & CPU_IS_FIRST_THREAD_IN_CORE) + if (is_cpu_first_thread_in_core(t, c, p)) delta_core(c, c2); /* always calculate thread delta */ @@ -2211,7 +2235,7 @@ int delta_cpu(struct thread_data *t, struct core_data *c, return retval; /* calculate package delta only for 1st core in package */ - if (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE) + if (is_cpu_first_core_in_package(t, c, p)) retval = delta_package(p, p2); return retval; @@ -2325,7 +2349,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) } /* sum per-core values only for 1st thread in core */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) + if (!is_cpu_first_thread_in_core(t, c, p)) return 0; average.cores.c3 += c->c3; @@ -2345,7 +2369,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) } /* sum per-pkg values only for 1st core in pkg */ - if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (!is_cpu_first_core_in_package(t, c, p)) return 0; if (DO_BIC(BIC_Totl_c0)) @@ -2745,7 +2769,7 @@ retry: } /* collect core counters only for 1st thread in core */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) + if (!is_cpu_first_thread_in_core(t, c, p)) goto done; if (DO_BIC(BIC_CPU_c3) || soft_c1_residency_display(BIC_CPU_c3)) { @@ -2800,7 +2824,7 @@ retry: } /* collect package counters only for 1st core in package */ - if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (!is_cpu_first_core_in_package(t, c, p)) goto done; if (DO_BIC(BIC_Totl_c0)) { @@ -4581,7 +4605,7 @@ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p) cpu = t->cpu_id; /* EPB is per-package */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (!is_cpu_first_thread_in_package(t, c, p)) return 0; if (cpu_migrate(cpu)) { @@ -4630,7 +4654,7 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) cpu = t->cpu_id; /* MSR_HWP_CAPABILITIES is per-package */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (!is_cpu_first_thread_in_package(t, c, p)) return 0; if (cpu_migrate(cpu)) { @@ -4713,7 +4737,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data cpu = t->cpu_id; /* per-package */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (!is_cpu_first_thread_in_package(t, c, p)) return 0; if (cpu_migrate(cpu)) { @@ -4930,7 +4954,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; /* RAPL counters are per package, so print only for 1st thread/package */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (!is_cpu_first_thread_in_package(t, c, p)) return 0; cpu = t->cpu_id; @@ -5083,7 +5107,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk return 0; /* this is a per-package concept */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (!is_cpu_first_thread_in_package(t, c, p)) return 0; cpu = t->cpu_id; @@ -5152,7 +5176,7 @@ int 
print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p cpu = t->cpu_id; /* DTS is per-core, no need to print for each thread */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) + if (!is_cpu_first_thread_in_core(t, c, p)) return 0; if (cpu_migrate(cpu)) { @@ -5160,7 +5184,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p return -1; } - if (do_ptm && (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) { + if (do_ptm && is_cpu_first_core_in_package(t, c, p)) { if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr)) return 0; -- cgit From ccf8a0528061b4ca5f5c0e73c8d888e7e6d8b054 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 6 Oct 2023 09:59:14 +0800 Subject: tools/power/turbostat: Obey allowed CPUs for primary thread/core detection Thread_id doesn't tell if a CPU is allowed or not. Detect allowed CPUs only and use the first detected thread/core as the primary thread/core of a core/package. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 5f6bc076e1dd..e586164906fa 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -926,12 +926,11 @@ struct thread_data { unsigned int x2apic_id; unsigned int flags; bool is_atom; -#define CPU_IS_FIRST_THREAD_IN_CORE 0x2 -#define CPU_IS_FIRST_CORE_IN_PACKAGE 0x4 unsigned long long counter[MAX_ADDED_THREAD_COUNTERS]; } *thread_even, *thread_odd; struct core_data { + int base_cpu; unsigned long long c3; unsigned long long c6; unsigned long long c7; @@ -944,6 +943,7 @@ struct core_data { } *core_even, *core_odd; struct pkg_data { + int base_cpu; unsigned long long pc2; unsigned long long pc3; unsigned long long pc6; @@ -1200,26 +1200,21 @@ int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pk int is_cpu_first_thread_in_core(struct thread_data *t, struct core_data *c, struct pkg_data *p) { - UNUSED(c); UNUSED(p); - return (t->flags & CPU_IS_FIRST_THREAD_IN_CORE); + return ((int)t->cpu_id == c->base_cpu || c->base_cpu < 0); } int is_cpu_first_core_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p) { UNUSED(c); - UNUSED(p); - return (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE); + return ((int)t->cpu_id == p->base_cpu || p->base_cpu < 0); } int is_cpu_first_thread_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p) { - UNUSED(c); - UNUSED(p); - - return (t->flags & CPU_IS_FIRST_THREAD_IN_CORE) && (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE); + return is_cpu_first_thread_in_core(t, c, p) && is_cpu_first_core_in_package(t, c, p); } int cpu_migrate(int cpu) @@ -2263,9 +2258,6 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data t->irq_count = 0; t->smi_count = 0; - /* tells format_counters to dump all fields from this set */ - t->flags = CPU_IS_FIRST_THREAD_IN_CORE | CPU_IS_FIRST_CORE_IN_PACKAGE; - c->c3 = 0; c->c6 = 0; c->c7 = 0; @@ -5872,15 +5864,19 @@ void allocate_counters(struct thread_data **t, struct core_data **c, struct pkg_ if (*c == NULL) goto error; - for (i = 0; i < num_cores; i++) + for (i = 0; i < num_cores; i++) { (*c)[i].core_id = -1; + (*c)[i].base_cpu = -1; + } *p = calloc(topo.num_packages, sizeof(struct pkg_data)); if (*p == NULL) goto error; - for (i = 0; i < topo.num_packages; i++) + for (i = 0; i < topo.num_packages; i++) { (*p)[i].package_id = i; + 
(*p)[i].base_cpu = -1; + } return; error: @@ -5913,10 +5909,11 @@ void init_counter(struct thread_data *thread_base, struct core_data *core_base, p = GET_PKG(pkg_base, pkg_id); t->cpu_id = cpu_id; - if (thread_id == 0) { - t->flags |= CPU_IS_FIRST_THREAD_IN_CORE; - if (cpu_is_first_core_in_package(cpu_id)) - t->flags |= CPU_IS_FIRST_CORE_IN_PACKAGE; + if (!cpu_is_not_allowed(cpu_id)) { + if (c->base_cpu < 0) + c->base_cpu = t->cpu_id; + if (p->base_cpu < 0) + p->base_cpu = t->cpu_id; } c->core_id = core_id; -- cgit From 0fe3752901370b83b63d4ddbce708b23c8e41ea9 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 6 Oct 2023 18:35:26 +0800 Subject: tools/power/turbostat: Obey allowed CPUs for system summary System summary should summarize the information for allowed CPUs instead of all the present CPUs. Introduce topology information for allowed CPUs, and use them to get system summary. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 71 +++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 25 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e586164906fa..91aa03989317 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1134,6 +1134,9 @@ struct topo_params { int num_die; int num_cpus; int num_cores; + int allowed_packages; + int allowed_cpus; + int allowed_cores; int max_cpu_num; int max_node_num; int nodes_per_pkg; @@ -1179,7 +1182,6 @@ int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pk struct thread_data *t; struct core_data *c; struct pkg_data *p; - t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no); if (cpu_is_not_allowed(t->cpu_id)) @@ -2426,40 +2428,40 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data /* Use the global time delta for the average. 
*/ average.threads.tv_delta = tv_delta; - average.threads.tsc /= topo.num_cpus; - average.threads.aperf /= topo.num_cpus; - average.threads.mperf /= topo.num_cpus; - average.threads.instr_count /= topo.num_cpus; - average.threads.c1 /= topo.num_cpus; + average.threads.tsc /= topo.allowed_cpus; + average.threads.aperf /= topo.allowed_cpus; + average.threads.mperf /= topo.allowed_cpus; + average.threads.instr_count /= topo.allowed_cpus; + average.threads.c1 /= topo.allowed_cpus; if (average.threads.irq_count > 9999999) sums_need_wide_columns = 1; - average.cores.c3 /= topo.num_cores; - average.cores.c6 /= topo.num_cores; - average.cores.c7 /= topo.num_cores; - average.cores.mc6_us /= topo.num_cores; + average.cores.c3 /= topo.allowed_cores; + average.cores.c6 /= topo.allowed_cores; + average.cores.c7 /= topo.allowed_cores; + average.cores.mc6_us /= topo.allowed_cores; if (DO_BIC(BIC_Totl_c0)) - average.packages.pkg_wtd_core_c0 /= topo.num_packages; + average.packages.pkg_wtd_core_c0 /= topo.allowed_packages; if (DO_BIC(BIC_Any_c0)) - average.packages.pkg_any_core_c0 /= topo.num_packages; + average.packages.pkg_any_core_c0 /= topo.allowed_packages; if (DO_BIC(BIC_GFX_c0)) - average.packages.pkg_any_gfxe_c0 /= topo.num_packages; + average.packages.pkg_any_gfxe_c0 /= topo.allowed_packages; if (DO_BIC(BIC_CPUGFX)) - average.packages.pkg_both_core_gfxe_c0 /= topo.num_packages; + average.packages.pkg_both_core_gfxe_c0 /= topo.allowed_packages; - average.packages.pc2 /= topo.num_packages; + average.packages.pc2 /= topo.allowed_packages; if (DO_BIC(BIC_Pkgpc3)) - average.packages.pc3 /= topo.num_packages; + average.packages.pc3 /= topo.allowed_packages; if (DO_BIC(BIC_Pkgpc6)) - average.packages.pc6 /= topo.num_packages; + average.packages.pc6 /= topo.allowed_packages; if (DO_BIC(BIC_Pkgpc7)) - average.packages.pc7 /= topo.num_packages; + average.packages.pc7 /= topo.allowed_packages; - average.packages.pc8 /= topo.num_packages; - average.packages.pc9 /= topo.num_packages; - average.packages.pc10 /= topo.num_packages; + average.packages.pc8 /= topo.allowed_packages; + average.packages.pc9 /= topo.allowed_packages; + average.packages.pc10 /= topo.allowed_packages; for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -2469,7 +2471,7 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data sums_need_wide_columns = 1; continue; } - average.threads.counter[i] /= topo.num_cpus; + average.threads.counter[i] /= topo.allowed_cpus; } for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -2478,7 +2480,7 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data if (average.cores.counter[i] > 9999999) sums_need_wide_columns = 1; } - average.cores.counter[i] /= topo.num_cores; + average.cores.counter[i] /= topo.allowed_cores; } for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -2487,7 +2489,7 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data if (average.packages.counter[i] > 9999999) sums_need_wide_columns = 1; } - average.packages.counter[i] /= topo.num_packages; + average.packages.counter[i] /= topo.allowed_packages; } } @@ -3690,7 +3692,7 @@ void re_initialize(void) { free_all_buffers(); setup_all_buffers(); - fprintf(outf, "turbostat: re-initialized with num_cpus %d\n", topo.num_cpus); + fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); } void set_max_cpu_num(void) @@ 
-5953,6 +5955,24 @@ void allocate_irq_buffers(void) err(-1, "calloc %d", topo.max_cpu_num + 1); } +int update_topo(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + topo.allowed_cpus++; + if ((int)t->cpu_id == c->base_cpu) + topo.allowed_cores++; + if ((int)t->cpu_id == p->base_cpu) + topo.allowed_packages++; + + return 0; +} + +void topology_update(void) +{ + topo.allowed_cpus = 0; + topo.allowed_cores = 0; + topo.allowed_packages = 0; + for_all_cpus(update_topo, ODD_COUNTERS); +} void setup_all_buffers(void) { topology_probe(); @@ -5962,6 +5982,7 @@ void setup_all_buffers(void) allocate_counters(&thread_odd, &core_odd, &package_odd); allocate_output_buffer(); for_all_proc_cpus(initialize_counters); + topology_update(); } void set_base_cpu(void) -- cgit From c25ef0e5d9d7d5fb9e1679286fc7a11e70f16c70 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 20 Oct 2023 14:20:47 +0800 Subject: tools/power/turbostat: Handle offlined CPUs in cpu_subset It is possible that the cpu_subset contains offlined CPUs. If this happens during start, exit immediately because this is likely an operator error that is best fixed by re-invoking. If this happens at runtime, give a warning only because turbostat should do its best effort to continue running. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 91aa03989317..f9ea07961a22 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1149,7 +1149,7 @@ struct timeval tv_even, tv_odd, tv_delta; int *irq_column_2_cpu; /* /proc/interrupts column numbers */ int *irqs_per_cpu; /* indexed by cpu_num */ -void setup_all_buffers(void); +void setup_all_buffers(bool startup); char *sys_lpi_file; char *sys_lpi_file_sysfs = "/sys/devices/system/cpu/cpuidle/low_power_idle_system_residency_us"; @@ -3691,7 +3691,7 @@ int for_all_proc_cpus(int (func) (int)) void re_initialize(void) { free_all_buffers(); - setup_all_buffers(); + setup_all_buffers(false); fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); } @@ -5692,7 +5692,7 @@ int dir_filter(const struct dirent *dirp) return 0; } -void topology_probe() +void topology_probe(bool startup) { int i; int max_core_id = 0; @@ -5734,7 +5734,12 @@ void topology_probe() CPU_ZERO_S(cpu_allowed_setsize, cpu_allowed_set); /* - * Validate that all cpus in cpu_subset are also in cpu_present_set + * Validate cpu_subset and update cpu_allowed_set. + * + * Make sure all cpus in cpu_subset are also in cpu_present_set during startup, + * and give a warning when cpus in cpu_subset become unavailable at runtime. + * + * cpu_allowed_set is the intersection of cpu_present_set and cpu_subset. 
*/ for (i = 0; i < CPU_SUBSET_MAXCPUS; ++i) { if (!cpu_subset) { @@ -5743,9 +5748,15 @@ void topology_probe() continue; } if (CPU_ISSET_S(i, cpu_subset_size, cpu_subset)) { - if (!CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) - err(1, "cpu%d not present", i); - CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set); + if (!CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) { + /* all cpus in cpu_subset must be in cpu_present_set during startup */ + if (startup) + err(1, "cpu%d not present", i); + else + fprintf(stderr, "cpu%d not present\n", i); + } else { + CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set); + } } } @@ -5973,9 +5984,9 @@ void topology_update(void) topo.allowed_packages = 0; for_all_cpus(update_topo, ODD_COUNTERS); } -void setup_all_buffers(void) +void setup_all_buffers(bool startup) { - topology_probe(); + topology_probe(startup); allocate_irq_buffers(); allocate_fd_percpu(); allocate_counters(&thread_even, &core_even, &package_even); @@ -6002,7 +6013,7 @@ void set_base_cpu(void) void turbostat_init() { - setup_all_buffers(); + setup_all_buffers(true); set_base_cpu(); check_dev_msr(); check_permissions(); -- cgit From 8c3dd2c9e54273922ea71b2a4c0e77fc624c396b Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 20 Oct 2023 09:45:21 +0800 Subject: tools/power/turbostat: Abstrct function for parsing cpu string Abstract parse_cpu_str() which can update any specified cpu_set by a given cpu string. This can be used to handle further CPU limitations from other sources like cgroup. The cpu string parsing code is also enhanced to handle the strings that have an extra '\n' before string terminator. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 104 ++++++++++++++++++---------------- 1 file changed, 55 insertions(+), 49 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f9ea07961a22..3a759b49f25e 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3565,6 +3565,59 @@ int get_physical_node_id(struct cpu_topology *thiscpu) return -1; } +static int parse_cpu_str(char *cpu_str, cpu_set_t *cpu_set, int cpu_set_size) +{ + unsigned int start, end; + char *next = cpu_str; + + while (next && *next) { + + if (*next == '-') /* no negative cpu numbers */ + return 1; + + start = strtoul(next, &next, 10); + + if (start >= CPU_SUBSET_MAXCPUS) + return 1; + CPU_SET_S(start, cpu_set_size, cpu_set); + + if (*next == '\0' || *next == '\n') + break; + + if (*next == ',') { + next += 1; + continue; + } + + if (*next == '-') { + next += 1; /* start range */ + } else if (*next == '.') { + next += 1; + if (*next == '.') + next += 1; /* start range */ + else + return 1; + } + + end = strtoul(next, &next, 10); + if (end <= start) + return 1; + + while (++start <= end) { + if (start >= CPU_SUBSET_MAXCPUS) + return 1; + CPU_SET_S(start, cpu_set_size, cpu_set); + } + + if (*next == ',') + next += 1; + else if (*next != '\0' && *next != '\n') + return 1; + } + + return 0; +} + int get_thread_siblings(struct cpu_topology *thiscpu) { char path[80], character; @@ -6384,9 +6437,6 @@ void probe_sysfs(void) */ void parse_cpu_command(char *optarg) { - unsigned int start, end; - char *next; - if (!strcmp(optarg, "core")) { if (cpu_subset) goto error; @@ -6409,52 +6459,8 @@ void parse_cpu_command(char *optarg) CPU_ZERO_S(cpu_subset_size, cpu_subset); - next = optarg; - - while (next && *next) { - - if (*next == '-') /* no negative cpu numbers */ - goto error; - - start = strtoul(next, 
&next, 10); - - if (start >= CPU_SUBSET_MAXCPUS) - goto error; - CPU_SET_S(start, cpu_subset_size, cpu_subset); - - if (*next == '\0') - break; - - if (*next == ',') { - next += 1; - continue; - } - - if (*next == '-') { - next += 1; /* start range */ - } else if (*next == '.') { - next += 1; - if (*next == '.') - next += 1; /* start range */ - else - goto error; - } - - end = strtoul(next, &next, 10); - if (end <= start) - goto error; - - while (++start <= end) { - if (start >= CPU_SUBSET_MAXCPUS) - goto error; - CPU_SET_S(start, cpu_subset_size, cpu_subset); - } - - if (*next == ',') - next += 1; - else if (*next != '\0') - goto error; - } + if (parse_cpu_str(optarg, cpu_subset, cpu_subset_size)) + goto error; return; -- cgit From f638858da0925b29122f05135663013dc240eaf9 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 20 Oct 2023 09:39:22 +0800 Subject: tools/power/turbostat: Handle cgroup v2 cpu limitation CPUs can be isolated via cgroup settings and turbostat should avoid migrating to these CPUs, just like it does for the '-c' cpus. Introduce cpu_effective_set to save the cgroup cpu limitation info from /sys/fs/cgroup/cpuset.cpus.effective. And use cpu_allowed_set as the intersection of cpu_present_set, cpu_effective_set and cpu_subset. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 95 +++++++++++++++++++++++++++++------ 1 file changed, 80 insertions(+), 15 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 3a759b49f25e..0ef6fba118b1 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -904,8 +904,8 @@ int backwards_count; char *progname; #define CPU_SUBSET_MAXCPUS 1024 /* need to use before probe... */ -cpu_set_t *cpu_present_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset; -size_t cpu_present_setsize, cpu_allowed_setsize, cpu_affinity_setsize, cpu_subset_size; +cpu_set_t *cpu_present_set, *cpu_effective_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset; +size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affinity_setsize, cpu_subset_size; #define MAX_ADDED_COUNTERS 8 #define MAX_ADDED_THREAD_COUNTERS 24 #define BITMASK_SIZE 32 @@ -3419,6 +3419,10 @@ void free_all_buffers(void) cpu_present_set = NULL; cpu_present_setsize = 0; + CPU_FREE(cpu_effective_set); + cpu_effective_set = NULL; + cpu_effective_setsize = 0; + CPU_FREE(cpu_allowed_set); cpu_allowed_set = NULL; cpu_allowed_setsize = 0; @@ -3741,6 +3745,46 @@ int for_all_proc_cpus(int (func) (int)) return 0; } +#define PATH_EFFECTIVE_CPUS "/sys/fs/cgroup/cpuset.cpus.effective" + +static char cpu_effective_str[1024]; + +static int update_effective_str(bool startup) +{ + FILE *fp; + char *pos; + char buf[1024]; + int ret; + + if (cpu_effective_str[0] == '\0' && !startup) + return 0; + + fp = fopen(PATH_EFFECTIVE_CPUS, "r"); + if (!fp) + return 0; + + pos = fgets(buf, 1024, fp); + if (!pos) + err(1, "%s: file read failed\n", PATH_EFFECTIVE_CPUS); + + fclose(fp); + + ret = strncmp(cpu_effective_str, buf, 1024); + if (!ret) + return 0; + + strncpy(cpu_effective_str, buf, 1024); + return 1; +} + +static void update_effective_set(bool startup) +{ + update_effective_str(startup); + + if (parse_cpu_str(cpu_effective_str, cpu_effective_set, cpu_effective_setsize)) + err(1, "%s: cpu str malformat %s\n", PATH_EFFECTIVE_CPUS, cpu_effective_str); +} + void re_initialize(void) { free_all_buffers(); @@ -4257,6 +4301,10 @@ restart: re_initialize(); goto restart; } + if 
(update_effective_str(false)) { + re_initialize(); + goto restart; + } do_sleep(); if (snapshot_proc_sysfs_files()) goto restart; @@ -5777,6 +5825,16 @@ void topology_probe(bool startup) CPU_ZERO_S(cpu_present_setsize, cpu_present_set); for_all_proc_cpus(mark_cpu_present); + /* + * Allocate and initialize cpu_effective_set + */ + cpu_effective_set = CPU_ALLOC((topo.max_cpu_num + 1)); + if (cpu_effective_set == NULL) + err(3, "CPU_ALLOC"); + cpu_effective_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1)); + CPU_ZERO_S(cpu_effective_setsize, cpu_effective_set); + update_effective_set(startup); + /* * Allocate and initialize cpu_allowed_set */ @@ -5787,30 +5845,37 @@ void topology_probe(bool startup) CPU_ZERO_S(cpu_allowed_setsize, cpu_allowed_set); /* - * Validate cpu_subset and update cpu_allowed_set. + * Validate and update cpu_allowed_set. * - * Make sure all cpus in cpu_subset are also in cpu_present_set during startup, - * and give a warning when cpus in cpu_subset become unavailable at runtime. + * Make sure all cpus in cpu_subset are also in cpu_present_set during startup. + * Give a warning when cpus in cpu_subset become unavailable at runtime. + * Give a warning when cpus are not effective because of cgroup setting. * - * cpu_allowed_set is the intersection of cpu_present_set and cpu_subset. + * cpu_allowed_set is the intersection of cpu_present_set/cpu_effective_set/cpu_subset. */ for (i = 0; i < CPU_SUBSET_MAXCPUS; ++i) { - if (!cpu_subset) { - if (CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) - CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set); + if (cpu_subset && !CPU_ISSET_S(i, cpu_subset_size, cpu_subset)) continue; - } - if (CPU_ISSET_S(i, cpu_subset_size, cpu_subset)) { - if (!CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) { - /* all cpus in cpu_subset must be in cpu_present_set during startup */ + + if (!CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) { + if (cpu_subset) { + /* cpus in cpu_subset must be in cpu_present_set during startup */ if (startup) err(1, "cpu%d not present", i); else fprintf(stderr, "cpu%d not present\n", i); - } else { - CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set); } + continue; } + + if (CPU_COUNT_S(cpu_effective_setsize, cpu_effective_set)) { + if (!CPU_ISSET_S(i, cpu_effective_setsize, cpu_effective_set)) { + fprintf(stderr, "cpu%d not effective\n", i); + continue; + } + } + + CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set); } if (!CPU_COUNT_S(cpu_allowed_setsize, cpu_allowed_set)) -- cgit From 37f68a2940558b4f6f8e51b7b1d00f084b4bdde2 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 24 Jan 2023 10:39:53 -0800 Subject: tools/power/turbostat: Move process to root cgroup When available CPUs are reduced via cgroup cpuset controller, turbostat will exit with errors (For example): get_counters: Could not migrate to CPU 0 turbostat: re-initialized with num_cpus 20 get_counters: Could not migrate to CPU 0 turbostat: re-initialized with num_cpus 20 Move the turbostat to root cgroup, which has every CPU. Writing the value 0 to a cgroup.procs file causes the writing process to be moved to the corresponding cgroup. 
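As a rough stand-alone illustration (a sketch with a hypothetical function name, assuming a cgroup v2 mount at /sys/fs/cgroup; it simply mirrors what the hunk below adds to main()), the move is a best-effort write of "0" to the root cgroup.procs file:

	/* sketch: move the calling process to the cgroup v2 root, ignoring failures */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	static void move_self_to_root_cgroup(void)
	{
		int fd = open("/sys/fs/cgroup/cgroup.procs", O_WRONLY);

		if (fd < 0)
			return;		/* no cgroup v2 root; nothing to do */

		if (write(fd, "0\n", 2) == -1)
			perror("Can't update cgroup");

		close(fd);
	}

	int main(void)
	{
		move_self_to_root_cgroup();
		return 0;
	}
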
Signed-off-by: Srinivas Pandruvada Tested-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 0ef6fba118b1..fea63d9d8e02 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -6666,6 +6666,19 @@ void cmdline(int argc, char **argv) int main(int argc, char **argv) { + int fd, ret; + + fd = open("/sys/fs/cgroup/cgroup.procs", O_WRONLY); + if (fd < 0) + goto skip_cgroup_setting; + + ret = write(fd, "0\n", 2); + if (ret == -1) + perror("Can't update cgroup\n"); + + close(fd); + +skip_cgroup_setting: outf = stderr; cmdline(argc, argv); -- cgit From 0e3f10e6aa97b0134b526ec9cdc3ccdac2239b43 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 19 Oct 2023 11:04:33 +0800 Subject: tools/power/turbostat: Add MSR_CORE_C1_RES support for spr_features Add MSR_CORE_C1_RES support for spr_features because both Sapphirerapids and Emeraldrapids support this MSR. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index fea63d9d8e02..bbeeec02bf5b 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -667,6 +667,7 @@ static const struct platform_features spr_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_SKX, + .has_msr_core_c1_res = 1, .has_irtl_msrs = 1, .has_cst_prewake_bit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, -- cgit From 5feab4a6b8a730438a0fe8758dfa0700f951edde Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 9 Sep 2023 13:28:07 +0800 Subject: tools/power/turbostat: Add initial support for GraniteRapids Add initial support for GraniteRapids. It shares the same features with SapphireRapids. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bbeeec02bf5b..981d39454b49 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -829,6 +829,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_TIGERLAKE, &cnl_features }, { INTEL_FAM6_SAPPHIRERAPIDS_X, &spr_features }, { INTEL_FAM6_EMERALDRAPIDS_X, &spr_features }, + { INTEL_FAM6_GRANITERAPIDS_X, &spr_features }, { INTEL_FAM6_LAKEFIELD, &cnl_features }, { INTEL_FAM6_ALDERLAKE, &adl_features }, { INTEL_FAM6_ALDERLAKE_L, &adl_features }, -- cgit From d33605f367414c7e0009978d1fbe9af01a36e221 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 28 Sep 2023 13:09:02 +0800 Subject: tools/power/turbostat: Add initial support for SierraForest Add initial support for SierraForest. It shares the same features with SapphireRapids, except that it has MSR_MODULE_C6_RES_MS support. 
Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 981d39454b49..a50b6d071f6a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -674,6 +674,22 @@ static const struct platform_features spr_features = { .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; +static const struct platform_features srf_features = { + .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, + .has_config_tdp = 1, + .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC6 | PC2 | PC6, + .cst_limit = CST_LIMIT_SKX, + .has_msr_core_c1_res = 1, + .has_msr_module_c6_res_ms = 1, + .has_irtl_msrs = 1, + .has_cst_prewake_bit = 1, + .trl_msrs = TRL_BASE | TRL_CORECOUNT, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, +}; + static const struct platform_features slv_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, @@ -848,6 +864,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_ATOM_TREMONT, &tmt_features }, { INTEL_FAM6_ATOM_TREMONT_L, &tmt_features }, { INTEL_FAM6_ATOM_GRACEMONT, &adl_features }, + { INTEL_FAM6_ATOM_CRESTMONT_X, &srf_features }, { INTEL_FAM6_XEON_PHI_KNL, &knl_features }, { INTEL_FAM6_XEON_PHI_KNM, &knl_features }, /* -- cgit From 5a6efcb9102af4210d5a59182dbfbc594ae50fd4 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 19 Oct 2023 11:00:07 +0800 Subject: tools/power/turbostat: Add initial support for GrandRidge Add initial support for GrandRidge. It shares the same features as SierraForest, except that it does not support PC2/PC6. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a50b6d071f6a..3407f08593f8 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -690,6 +690,22 @@ static const struct platform_features srf_features = { .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; +static const struct platform_features grr_features = { + .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, + .has_config_tdp = 1, + .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC6, + .cst_limit = CST_LIMIT_SKX, + .has_msr_core_c1_res = 1, + .has_msr_module_c6_res_ms = 1, + .has_irtl_msrs = 1, + .has_cst_prewake_bit = 1, + .trl_msrs = TRL_BASE | TRL_CORECOUNT, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, +}; + static const struct platform_features slv_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, @@ -865,6 +881,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_ATOM_TREMONT_L, &tmt_features }, { INTEL_FAM6_ATOM_GRACEMONT, &adl_features }, { INTEL_FAM6_ATOM_CRESTMONT_X, &srf_features }, + { INTEL_FAM6_ATOM_CRESTMONT, &grr_features }, { INTEL_FAM6_XEON_PHI_KNL, &knl_features }, { INTEL_FAM6_XEON_PHI_KNM, &knl_features }, /* -- cgit From 7b57e7b683e3872b02117f46bd7dc7ad765888a8 Mon Sep 17 00:00:00 2001 From: Sumeet Pawnikar Date: Tue, 23 May 2023 17:46:45 +0530 Subject: tools/power/turbostat: Add initial support for ArrowLake Add initial support for ArrowLake platform. It shares the same features with CannonLake. 
Signed-off-by: Sumeet Pawnikar --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 3407f08593f8..6c52cceae1d7 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -870,6 +870,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_RAPTORLAKE_S, &adl_features }, { INTEL_FAM6_METEORLAKE, &cnl_features }, { INTEL_FAM6_METEORLAKE_L, &cnl_features }, + { INTEL_FAM6_ARROWLAKE, &cnl_features }, { INTEL_FAM6_ATOM_SILVERMONT, &slv_features }, { INTEL_FAM6_ATOM_SILVERMONT_D, &slvd_features }, { INTEL_FAM6_ATOM_AIRMONT, &amt_features }, -- cgit From 956dbd3de400a5665faf08a8588556db9c1bb56e Mon Sep 17 00:00:00 2001 From: Sumeet Pawnikar Date: Tue, 23 May 2023 17:46:45 +0530 Subject: tools/power/turbostat: Add initial support for LunarLake Add initial support for LunarLake platform. It shares the same features with CannonLake. Signed-off-by: Sumeet Pawnikar --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 6c52cceae1d7..8a311d7272e7 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -871,6 +871,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_METEORLAKE, &cnl_features }, { INTEL_FAM6_METEORLAKE_L, &cnl_features }, { INTEL_FAM6_ARROWLAKE, &cnl_features }, + { INTEL_FAM6_LUNARLAKE_M, &cnl_features }, { INTEL_FAM6_ATOM_SILVERMONT, &slv_features }, { INTEL_FAM6_ATOM_SILVERMONT_D, &slvd_features }, { INTEL_FAM6_ATOM_AIRMONT, &amt_features }, -- cgit From 3503895788d402d6a3814085ed582c364ec3e903 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 31 Oct 2023 12:02:06 -0400 Subject: virtio_pci: move structure to a header These are guest/host interfaces, so they belong in the header where e.g. qemu will know to find them. Note: we added a new structure as opposed to extending existing one because someone might be relying on the size of the existing structure staying unchanged. Add a warning to avoid using sizeof. Signed-off-by: Michael S. 
Tsirkin Reviewed-by: Xuan Zhuo --- drivers/virtio/virtio_pci_modern_dev.c | 7 ++++--- include/linux/virtio_pci_modern.h | 7 ------- include/uapi/linux/virtio_pci.h | 11 +++++++++++ 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c index e2a1fe7bb66c..7de8b1ebabac 100644 --- a/drivers/virtio/virtio_pci_modern_dev.c +++ b/drivers/virtio/virtio_pci_modern_dev.c @@ -294,9 +294,10 @@ int vp_modern_probe(struct virtio_pci_modern_device *mdev) err = -EINVAL; mdev->common = vp_modern_map_capability(mdev, common, - sizeof(struct virtio_pci_common_cfg), 4, - 0, sizeof(struct virtio_pci_modern_common_cfg), - &mdev->common_len, NULL); + sizeof(struct virtio_pci_common_cfg), 4, 0, + offsetofend(struct virtio_pci_modern_common_cfg, + queue_reset), + &mdev->common_len, NULL); if (!mdev->common) goto err_map_common; mdev->isr = vp_modern_map_capability(mdev, isr, sizeof(u8), 1, diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h index d0f2797420f7..a09e13a577a9 100644 --- a/include/linux/virtio_pci_modern.h +++ b/include/linux/virtio_pci_modern.h @@ -5,13 +5,6 @@ #include #include -struct virtio_pci_modern_common_cfg { - struct virtio_pci_common_cfg cfg; - - __le16 queue_notify_data; /* read-write */ - __le16 queue_reset; /* read-write */ -}; - /** * struct virtio_pci_modern_device - info for modern PCI virtio * @pci_dev: Ptr to the PCI device struct diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h index f703afc7ad31..44f4dd2add18 100644 --- a/include/uapi/linux/virtio_pci.h +++ b/include/uapi/linux/virtio_pci.h @@ -166,6 +166,17 @@ struct virtio_pci_common_cfg { __le32 queue_used_hi; /* read-write */ }; +/* + * Warning: do not use sizeof on this: use offsetofend for + * specific fields you need. + */ +struct virtio_pci_modern_common_cfg { + struct virtio_pci_common_cfg cfg; + + __le16 queue_notify_data; /* read-write */ + __le16 queue_reset; /* read-write */ +}; + /* Fields in VIRTIO_PCI_CAP_PCI_CFG: */ struct virtio_pci_cfg_cap { struct virtio_pci_cap cap; -- cgit From 0d82410252ea324f0064e75b9865bb74cccc1dda Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 31 Oct 2023 15:43:39 +0100 Subject: vdpa_sim_blk: allocate the buffer zeroed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Deleting and recreating a device can lead to having the same content as the old device, so let's always allocate buffers completely zeroed out. Fixes: abebb16254b3 ("vdpa_sim_blk: support shared backend") Suggested-by: Qing Wang Signed-off-by: Stefano Garzarella Message-Id: <20231031144339.121453-1-sgarzare@redhat.com> Signed-off-by: Michael S. 
Tsirkin Acked-by: Eugenio Pérez Acked-by: Jason Wang --- drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c index b3a3cb165795..b137f3679343 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c @@ -437,7 +437,7 @@ static int vdpasim_blk_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, if (blk->shared_backend) { blk->buffer = shared_buffer; } else { - blk->buffer = kvmalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT, + blk->buffer = kvzalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT, GFP_KERNEL); if (!blk->buffer) { ret = -ENOMEM; @@ -495,7 +495,7 @@ static int __init vdpasim_blk_init(void) goto parent_err; if (shared_backend) { - shared_buffer = kvmalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT, + shared_buffer = kvzalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT, GFP_KERNEL); if (!shared_buffer) { ret = -ENOMEM; -- cgit From f2de37a572853d340f945a7748f74e3ed8c6b743 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Thu, 12 Oct 2023 12:28:52 +0200 Subject: riscv, qemu_fw_cfg: Add support for RISC-V architecture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Qemu fw_cfg support was missing for RISC-V, which made it hard to do proper vmcore dumps from qemu. Add the missing RISC-V arch-defines. You can now do vmcore dumps from qemu. Add "-device vmcoreinfo" to the qemu command-line. From the qemu monitor: (qemu) dump-guest-memory vmcore The vmcore can now be used, e.g., with the "crash" utility. Acked-by: "Michael S. Tsirkin" Acked-by: Alistair Francis Tested-by: Clément Léger Signed-off-by: Björn Töpel Message-Id: <20231012102852.234442-1-bjorn@kernel.org> Signed-off-by: Michael S. Tsirkin --- drivers/firmware/Kconfig | 2 +- drivers/firmware/qemu_fw_cfg.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig index b59e3041fd62..f05ff56629b3 100644 --- a/drivers/firmware/Kconfig +++ b/drivers/firmware/Kconfig @@ -155,7 +155,7 @@ config RASPBERRYPI_FIRMWARE config FW_CFG_SYSFS tristate "QEMU fw_cfg device support in sysfs" - depends on SYSFS && (ARM || ARM64 || PARISC || PPC_PMAC || SPARC || X86) + depends on SYSFS && (ARM || ARM64 || PARISC || PPC_PMAC || RISCV || SPARC || X86) depends on HAS_IOPORT_MAP default n help diff --git a/drivers/firmware/qemu_fw_cfg.c b/drivers/firmware/qemu_fw_cfg.c index a69399a6b7c0..1448f61173b3 100644 --- a/drivers/firmware/qemu_fw_cfg.c +++ b/drivers/firmware/qemu_fw_cfg.c @@ -211,7 +211,7 @@ static void fw_cfg_io_cleanup(void) /* arch-specific ctrl & data register offsets are not available in ACPI, DT */ #if !(defined(FW_CFG_CTRL_OFF) && defined(FW_CFG_DATA_OFF)) -# if (defined(CONFIG_ARM) || defined(CONFIG_ARM64)) +# if (defined(CONFIG_ARM) || defined(CONFIG_ARM64) || defined(CONFIG_RISCV)) # define FW_CFG_CTRL_OFF 0x08 # define FW_CFG_DATA_OFF 0x00 # define FW_CFG_DMA_OFF 0x10 -- cgit From b2c8b644fac1087dfe69a1762c04df090178a5ae Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 25 Oct 2023 16:53:19 +0200 Subject: virtio_pci: Switch away from deprecated irq_set_affinity_hint Since commit 65c7cdedeb30 ("genirq: Provide new interfaces for affinity hints") irq_set_affinity_hint is being phased out. Switch to new interfaces for setting and applying irq affinity hints. Signed-off-by: Jakub Sitnicki Message-Id: <20231025145319.380775-1-jakub@cloudflare.com> Signed-off-by: Michael S. 
Tsirkin Reviewed-by: Xuan Zhuo --- drivers/virtio/virtio_pci_common.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index c2524a7207cf..7a5593997e0e 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -242,7 +242,7 @@ void vp_del_vqs(struct virtio_device *vdev) if (v != VIRTIO_MSI_NO_VECTOR) { int irq = pci_irq_vector(vp_dev->pci_dev, v); - irq_set_affinity_hint(irq, NULL); + irq_update_affinity_hint(irq, NULL); free_irq(irq, vq); } } @@ -443,10 +443,10 @@ int vp_set_vq_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask) mask = vp_dev->msix_affinity_masks[info->msix_vector]; irq = pci_irq_vector(vp_dev->pci_dev, info->msix_vector); if (!cpu_mask) - irq_set_affinity_hint(irq, NULL); + irq_update_affinity_hint(irq, NULL); else { cpumask_copy(mask, cpu_mask); - irq_set_affinity_hint(irq, mask); + irq_set_affinity_and_hint(irq, mask); } } return 0; -- cgit From e07754e0a1ea2d63fb29574253d1fd7405607343 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 27 Oct 2023 15:12:54 +0300 Subject: vhost-vdpa: fix use after free in vhost_vdpa_probe() The put_device() calls vhost_vdpa_release_dev() which calls ida_simple_remove() and frees "v". So this call to ida_simple_remove() is a use after free and a double free. Fixes: ebe6a354fa7e ("vhost-vdpa: Call ida_simple_remove() when failed") Signed-off-by: Dan Carpenter Message-Id: Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vhost/vdpa.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 30df5c58db73..da7ec77cdaff 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -1582,7 +1582,6 @@ static int vhost_vdpa_probe(struct vdpa_device *vdpa) err: put_device(&v->dev); - ida_simple_remove(&vhost_vdpa_ida, v->minor); return r; } -- cgit From dec96fc2dcb59723e041416b8dc53e011b4bfc2e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 13 Oct 2023 10:05:48 +0100 Subject: btrfs: use u64 for buffer sizes in the tree search ioctls In the tree search v2 ioctl we use the type size_t, which is an unsigned long, to track the buffer size in the local variable 'buf_size'. An unsigned long is 32 bits wide on a 32 bits architecture. The buffer size defined in struct btrfs_ioctl_search_args_v2 is a u64, so when we later try to copy the local variable 'buf_size' to the argument struct, when the search returns -EOVERFLOW, we copy only 32 bits which will be a problem on big endian systems. Fix this by using a u64 type for the buffer sizes, not only at btrfs_ioctl_tree_search_v2(), but also everywhere down the call chain so that we can use the u64 at btrfs_ioctl_tree_search_v2(). 
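A simplified stand-alone demo of the endian hazard described above (userspace illustration, not the kernel code): copying only sizeof(size_t) == 4 bytes into an 8-byte field fills the low half on little-endian machines but the high half on big-endian ones, so the size reported back to user space ends up wrong by a factor of 2^32.

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		uint64_t reported = 0;			/* stands in for the __u64 buf_size copied back to user space */
		uint32_t partial = 16 * 1024 * 1024;	/* what a 32-bit size_t local would hold */

		/* only 4 of the 8 bytes are written, as in the buggy copy */
		memcpy(&reported, &partial, sizeof(partial));

		/* prints 16777216 on little-endian, 72057594037927936 on big-endian */
		printf("user-visible buf_size: %llu\n", (unsigned long long)reported);
		return 0;
	}
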
Fixes: cc68a8a5a433 ("btrfs: new ioctl TREE_SEARCH_V2") Reported-by: Dan Carpenter Link: https://lore.kernel.org/linux-btrfs/ce6f4bd6-9453-4ffe-ba00-cee35495e10f@moroto.mountain/ Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 89cd212735ea..f7e94aff41ae 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1528,7 +1528,7 @@ static noinline int key_in_sk(struct btrfs_key *key, static noinline int copy_to_sk(struct btrfs_path *path, struct btrfs_key *key, struct btrfs_ioctl_search_key *sk, - size_t *buf_size, + u64 *buf_size, char __user *ubuf, unsigned long *sk_offset, int *num_found) @@ -1660,7 +1660,7 @@ out: static noinline int search_ioctl(struct inode *inode, struct btrfs_ioctl_search_key *sk, - size_t *buf_size, + u64 *buf_size, char __user *ubuf) { struct btrfs_fs_info *info = btrfs_sb(inode->i_sb); @@ -1733,7 +1733,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode, struct btrfs_ioctl_search_args __user *uargs = argp; struct btrfs_ioctl_search_key sk; int ret; - size_t buf_size; + u64 buf_size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1763,8 +1763,8 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, struct btrfs_ioctl_search_args_v2 __user *uarg = argp; struct btrfs_ioctl_search_args_v2 args; int ret; - size_t buf_size; - const size_t buf_limit = SZ_16M; + u64 buf_size; + const u64 buf_limit = SZ_16M; if (!capable(CAP_SYS_ADMIN)) return -EPERM; -- cgit From b8212814d1e8428a082234223105e4071b844fab Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 12 Oct 2023 12:42:55 +0300 Subject: btrfs: directly return 0 on no error code in btrfs_insert_raid_extent() It's more obvious to return a literal zero instead of "return ret;". Plus Smatch complains that ret could be uninitialized if the ordered_extent->bioc_list list is empty and this silences that warning. Signed-off-by: Dan Carpenter Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid-stripe-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 944e8f1862aa..9589362acfbf 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -145,7 +145,7 @@ int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans, btrfs_put_bioc(bioc); } - return ret; + return 0; } int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, -- cgit From dfcb03ae8a341600d72fbf3c79429f306764d653 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 17 Oct 2023 16:23:22 +0900 Subject: btrfs: zoned: drop no longer valid write pointer check There is a check of the write pointer vs the zone size to reject an invalid write pointer. However, as of now, we have RAID0/RAID10 on the zoned mode, we can have a block group whose size is larger than the zone size. As an equivalent check against the block group's zone_capacity is already there, we can just drop this invalid check. 
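(Illustrative numbers, not taken from the patch: with a 2 GiB zone size and a RAID0 block group striped across two zones, the block group's zone_capacity is roughly 4 GiB, so a perfectly valid alloc_offset of, say, 3 GiB would have tripped the dropped zone_size check while still passing the remaining zone_capacity check.)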
Fixes: 568220fa9657 ("btrfs: zoned: support RAID0/1/10 on top of raid stripe tree") Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 3504ade30cb0..188378ca19c7 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1661,13 +1661,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } out: - if (cache->alloc_offset > fs_info->zone_size) { - btrfs_err(fs_info, - "zoned: invalid write pointer %llu in block group %llu", - cache->alloc_offset, cache->start); - ret = -EIO; - } - if (cache->alloc_offset > cache->zone_capacity) { btrfs_err(fs_info, "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu", -- cgit From 776a838f1fa95670c1c1cf7109a898090b473fa3 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 17 Oct 2023 17:00:31 +0900 Subject: btrfs: zoned: wait for data BG to be finished on direct IO allocation Running the fio command below on a ZNS device results in "Resource temporarily unavailable" error. $ sudo fio --name=w --directory=/mnt --filesize=1GB --bs=16MB --numjobs=16 \ --rw=write --ioengine=libaio --iodepth=128 --direct=1 fio: io_u error on file /mnt/w.2.0: Resource temporarily unavailable: write offset=117440512, buflen=16777216 fio: io_u error on file /mnt/w.2.0: Resource temporarily unavailable: write offset=134217728, buflen=16777216 ... This happens because -EAGAIN error returned from btrfs_reserve_extent() called from btrfs_new_extent_direct() is spilling over to the userland. btrfs_reserve_extent() returns -EAGAIN when there is no active zone available. Then, the caller should wait for some other on-going IO to finish a zone and retry the allocation. This logic is already implemented for buffered write in cow_file_range(), but it is missing for the direct IO counterpart. Implement the same logic for it. Reported-by: Shinichiro Kawasaki Fixes: 2ce543f47843 ("btrfs: zoned: wait until zone is finished when allocation didn't progress") CC: stable@vger.kernel.org # 6.1+ Tested-by: Shinichiro Kawasaki Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/inode.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b388505c91cc..137c4824ff91 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6979,8 +6979,15 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, int ret; alloc_hint = get_extent_allocation_hint(inode, start, len); +again: ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, 0, alloc_hint, &ins, 1, 1); + if (ret == -EAGAIN) { + ASSERT(btrfs_is_zoned(fs_info)); + wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH, + TASK_UNINTERRUPTIBLE); + goto again; + } if (ret) return ERR_PTR(ret); -- cgit From d8ba2a91fc3cd0347823435971f58f473cbba7aa Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 Oct 2023 15:18:17 -0400 Subject: btrfs: get correct owning_root when dropping snapshot Dave reported a bug where we were aborting the transaction while trying to cleanup the squota reservation for an extent. This turned out to be because we're doing btrfs_header_owner(next) in do_walk_down when we decide to free the block. However in this code block we haven't explicitly read next, so it could be stale. We would then get whatever garbage happened to be in the pages at this point. 
The commit that introduced that is "btrfs: track owning root in btrfs_ref". Fix this by saving the owner_root when we do the btrfs_lookup_extent_info(). We always do this in do_walk_down, it is how we make the decision of whether or not to delete the block. This is cheap because we've already done the extent item lookup at this point, so it's straightforward to just grab the owner root as well. Then we can use this when deleting the metadata block without needing to force a read of the extent buffer to find the owner. This fixes the problem that Dave reported. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 +- fs/btrfs/extent-tree.c | 25 +++++++++++++++++-------- fs/btrfs/extent-tree.h | 3 ++- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index c0c5f2239820..14cefeaf9622 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -421,7 +421,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (btrfs_block_can_be_shared(root, buf)) { ret = btrfs_lookup_extent_info(trans, fs_info, buf->start, btrfs_header_level(buf), 1, - &refs, &flags); + &refs, &flags, NULL); if (ret) return ret; if (unlikely(refs == 0)) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c8e5b4715b49..0455935ff558 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -102,7 +102,8 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) */ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 offset, int metadata, u64 *refs, u64 *flags) + u64 offset, int metadata, u64 *refs, u64 *flags, + u64 *owning_root) { struct btrfs_root *extent_root; struct btrfs_delayed_ref_head *head; @@ -114,6 +115,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, u32 item_size; u64 num_refs; u64 extent_flags; + u64 owner = 0; int ret; /* @@ -167,6 +169,8 @@ search_again: struct btrfs_extent_item); num_refs = btrfs_extent_refs(leaf, ei); extent_flags = btrfs_extent_flags(leaf, ei); + owner = btrfs_get_extent_owner_root(fs_info, leaf, + path->slots[0]); } else { ret = -EUCLEAN; btrfs_err(fs_info, @@ -226,6 +230,8 @@ out: *refs = num_refs; if (flags) *flags = extent_flags; + if (owning_root) + *owning_root = owner; out_free: btrfs_free_path(path); return ret; @@ -5234,7 +5240,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, /* We don't lock the tree block, it's OK to be racy here */ ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, wc->level - 1, 1, &refs, - &flags); + &flags, NULL); /* We don't care about errors in readahead. 
*/ if (ret < 0) continue; @@ -5301,7 +5307,8 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, &wc->refs[level], - &wc->flags[level]); + &wc->flags[level], + NULL); BUG_ON(ret == -ENOMEM); if (ret) return ret; @@ -5391,6 +5398,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, u64 bytenr; u64 generation; u64 parent; + u64 owner_root = 0; struct btrfs_tree_parent_check check = { 0 }; struct btrfs_key key; struct btrfs_ref ref = { 0 }; @@ -5434,7 +5442,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, &wc->refs[level - 1], - &wc->flags[level - 1]); + &wc->flags[level - 1], + &owner_root); if (ret < 0) goto out_unlock; @@ -5567,8 +5576,7 @@ skip: find_next_key(path, level, &wc->drop_progress); btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - fs_info->nodesize, parent, - btrfs_header_owner(next)); + fs_info->nodesize, parent, owner_root); btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid, 0, false); ret = btrfs_free_extent(trans, &ref); @@ -5635,7 +5643,8 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, &wc->refs[level], - &wc->flags[level]); + &wc->flags[level], + NULL); if (ret < 0) { btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; @@ -5880,7 +5889,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) ret = btrfs_lookup_extent_info(trans, fs_info, path->nodes[level]->start, level, 1, &wc->refs[level], - &wc->flags[level]); + &wc->flags[level], NULL); if (ret < 0) { err = ret; goto out_end_trans; diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 0716f65d9753..2e066035ccee 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -99,7 +99,8 @@ u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 offset, int metadata, u64 *refs, u64 *flags); + u64 offset, int metadata, u64 *refs, u64 *flags, + u64 *owner_root); int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, int reserved); int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, -- cgit From 47e2b06b7b5cb356a987ba3429550c3a89ea89d6 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 28 Oct 2023 13:28:45 +1030 Subject: btrfs: make found_logical_ret parameter mandatory for function queue_scrub_stripe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [BUG] There is a compilation warning reported on commit ae76d8e3e135 ("btrfs: scrub: fix grouping of read IO"), where gcc (14.0.0 20231022 experimental) is reporting the following uninitialized variable: fs/btrfs/scrub.c: In function ‘scrub_simple_mirror.isra’: fs/btrfs/scrub.c:2075:29: error: ‘found_logical’ may be used uninitialized [-Werror=maybe-uninitialized[https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html#index-Wmaybe-uninitialized]] 2075 | cur_logical = found_logical + BTRFS_STRIPE_LEN; fs/btrfs/scrub.c:2040:21: note: ‘found_logical’ was declared here 2040 | u64 found_logical; | ^~~~~~~~~~~~~ [CAUSE] This is a false alert, as @found_logical is passed as parameter @found_logical_ret of function 
queue_scrub_stripe(). As long as queue_scrub_stripe() returned 0, we would update @found_logical_ret. And if queue_scrub_stripe() returned >0 or <0, the caller would not utilized @found_logical, thus there should be nothing wrong. Although the triggering gcc is still experimental, it looks like the extra check on "if (found_logical_ret)" can sometimes confuse the compiler. Meanwhile the only caller of queue_scrub_stripe() is always passing a valid pointer, there is no need for such check at all. [FIX] Although the report itself is a false alert, we can still make it more explicit by: - Replace the check for @found_logical_ret with ASSERT() - Initialize @found_logical to U64_MAX - Add one extra ASSERT() to make sure @found_logical got updated Link: https://lore.kernel.org/linux-btrfs/87fs1x1p93.fsf@gentoo.org/ Fixes: ae76d8e3e135 ("btrfs: scrub: fix grouping of read IO") Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 9ce5be21b036..f62a408671cb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1868,6 +1868,9 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group * */ ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES); + /* @found_logical_ret must be specified. */ + ASSERT(found_logical_ret); + stripe = &sctx->stripes[sctx->cur_stripe]; scrub_reset_stripe(stripe); ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path, @@ -1876,8 +1879,7 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group * /* Either >0 as no more extents or <0 for error. */ if (ret) return ret; - if (found_logical_ret) - *found_logical_ret = stripe->logical; + *found_logical_ret = stripe->logical; sctx->cur_stripe++; /* We filled one group, submit it. */ @@ -2080,7 +2082,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, /* Go through each extent items inside the logical range */ while (cur_logical < logical_end) { - u64 found_logical; + u64 found_logical = U64_MAX; u64 cur_physical = physical + cur_logical - logical_start; /* Canceled? */ @@ -2115,6 +2117,8 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, if (ret < 0) break; + /* queue_scrub_stripe() returned 0, @found_logical must be updated. */ + ASSERT(found_logical != U64_MAX); cur_logical = found_logical + BTRFS_STRIPE_LEN; /* Don't hold CPU for too long time */ -- cgit From cd63ffbd23edc176f09cac5c9287db732d7cbb73 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 30 Oct 2023 11:54:23 +0000 Subject: btrfs: fix error pointer dereference after failure to allocate fs devices At device_list_add() we allocate a btrfs_fs_devices structure and then before checking if the allocation failed (pointer is ERR_PTR(-ENOMEM)), we dereference the error pointer in a memcpy() argument if the feature BTRFS_FEATURE_INCOMPAT_METADATA_UUID is enabled. Fix this by checking for an allocation error before trying the memcpy(). 
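The general rule being applied here, shown as a hedged sketch with hypothetical types and allocator names (the real fix is in the hunk below): a pointer that may be ERR_PTR(-ENOMEM) must be tested with IS_ERR() before any of its fields are read or written.

	#include <linux/err.h>
	#include <linux/string.h>
	#include <linux/types.h>

	/* hypothetical example type and allocator, for illustration only */
	struct demo_devs { u8 metadata_uuid[16]; };
	struct demo_devs *demo_alloc(const u8 *fsid);	/* may return ERR_PTR(-ENOMEM) */

	static struct demo_devs *demo_add(const u8 *fsid, const u8 *meta_uuid)
	{
		struct demo_devs *d = demo_alloc(fsid);

		if (IS_ERR(d))			/* check first ... */
			return ERR_CAST(d);

		if (meta_uuid)			/* ... dereference only afterwards */
			memcpy(d->metadata_uuid, meta_uuid, sizeof(d->metadata_uuid));

		return d;
	}
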
Fixes: f7361d8c3fc3 ("btrfs: sipmlify uuid parameters of alloc_fs_devices()") Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1fdfa9153e30..dd279241f78c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -746,13 +746,13 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (!fs_devices) { fs_devices = alloc_fs_devices(disk_super->fsid); + if (IS_ERR(fs_devices)) + return ERR_CAST(fs_devices); + if (has_metadata_uuid) memcpy(fs_devices->metadata_uuid, disk_super->metadata_uuid, BTRFS_FSID_SIZE); - if (IS_ERR(fs_devices)) - return ERR_CAST(fs_devices); - if (same_fsid_diff_dev) { generate_random_uuid(fs_devices->fsid); fs_devices->temp_fsid = true; -- cgit From c692800cb2ef7a4f4940c68d765cd4649aff3e46 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 2 Nov 2023 02:33:14 +0300 Subject: MAINTAINERS: Add Intel TDX entry Add myself as Intel TDX maintainer. I drove upstreaming most of TDX code so far and I will continue working on TDX for foreseeable future. [ dhansen: * Add myself as a reviewer too * Swap Maintained=>Supported. I double checked Kirill is still being paid * Add drivers/virt/coco/tdx-guest ] Suggested-by: Dave Hansen Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Acked-by: Dave Hansen Acked-by: Kuppuswamy Sathyanarayanan Acked-by: Kai Huang Link: https://lore.kernel.org/all/20231101233314.2567-1-kirill.shutemov%40linux.intel.com --- MAINTAINERS | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index dd5de540ec0b..b697020eca1c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23460,6 +23460,20 @@ F: arch/x86/kernel/dumpstack.c F: arch/x86/kernel/stacktrace.c F: arch/x86/kernel/unwind_*.c +X86 TRUST DOMAIN EXTENSIONS (TDX) +M: Kirill A. Shutemov +R: Dave Hansen +L: x86@kernel.org +L: linux-coco@lists.linux.dev +S: Supported +T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/tdx +F: arch/x86/boot/compressed/tdx* +F: arch/x86/coco/tdx/ +F: arch/x86/include/asm/shared/tdx.h +F: arch/x86/include/asm/tdx.h +F: arch/x86/virt/vmx/tdx/ +F: drivers/virt/coco/tdx-guest + X86 VDSO M: Andy Lutomirski L: linux-kernel@vger.kernel.org -- cgit From d3badb15613c14dd35d3495b1dde5c90fcd616dd Mon Sep 17 00:00:00 2001 From: Fang Xiang Date: Mon, 30 Oct 2023 16:32:56 +0800 Subject: irqchip/gic-v3-its: Flush ITS tables correctly in non-coherent GIC designs In non-coherent GIC designs, the ITS tables must be flushed before writing to the GITS_BASER registers, otherwise the ITS could read dirty tables, which results in unpredictable behavior. Flush the tables right at the begin of its_setup_baser() to prevent that. 
[ tglx: Massage changelog ] Fixes: a8707f553884 ("irqchip/gic-v3: Add Rockchip 3588001 erratum workaround") Suggested-by: Marc Zyngier Signed-off-by: Fang Xiang Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20231030083256.4345-1-fangxiang3@xiaomi.com --- drivers/irqchip/irq-gic-v3-its.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index a8c89df1a997..9a7a74239eab 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2379,12 +2379,12 @@ retry_baser: break; } + if (!shr) + gic_flush_dcache_to_poc(base, PAGE_ORDER_TO_SIZE(order)); + its_write_baser(its, baser, val); tmp = baser->val; - if (its->flags & ITS_FLAGS_FORCE_NON_SHAREABLE) - tmp &= ~GITS_BASER_SHAREABILITY_MASK; - if ((val ^ tmp) & GITS_BASER_SHAREABILITY_MASK) { /* * Shareability didn't stick. Just use @@ -2394,10 +2394,9 @@ retry_baser: * non-cacheable as well. */ shr = tmp & GITS_BASER_SHAREABILITY_MASK; - if (!shr) { + if (!shr) cache = GITS_BASER_nC; - gic_flush_dcache_to_poc(base, PAGE_ORDER_TO_SIZE(order)); - } + goto retry_baser; } @@ -2609,6 +2608,11 @@ static int its_alloc_tables(struct its_node *its) /* erratum 24313: ignore memory access type */ cache = GITS_BASER_nCnB; + if (its->flags & ITS_FLAGS_FORCE_NON_SHAREABLE) { + cache = GITS_BASER_nC; + shr = 0; + } + for (i = 0; i < GITS_BASER_NR_REGS; i++) { struct its_baser *baser = its->tables + i; u64 val = its_read_baser(its, baser); -- cgit From 18216762bcf618c52b85719d3563243f80e4a2d4 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Mon, 6 Nov 2023 17:12:04 +0700 Subject: x86/Documentation: Indent 'note::' directive for protocol version number note The protocol version number note is between the protocol version table and the memory layout section. As such, Sphinx renders the note directive not only on the actual note, but until the end of doc. Indent the directive so that only the actual protocol version number note is rendered as such. Fixes: 2c33c27fd603 ("x86/boot: Introduce kernel_info") Signed-off-by: Bagas Sanjaya Signed-off-by: Ingo Molnar Cc: Jonathan Corbet Link: https://lore.kernel.org/r/20231106101206.76487-2-bagasdotme@gmail.com Signed-off-by: Ingo Molnar --- Documentation/arch/x86/boot.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/arch/x86/boot.rst b/Documentation/arch/x86/boot.rst index f5d2f2414de8..22cc7a040dae 100644 --- a/Documentation/arch/x86/boot.rst +++ b/Documentation/arch/x86/boot.rst @@ -77,7 +77,7 @@ Protocol 2.14 BURNT BY INCORRECT COMMIT Protocol 2.15 (Kernel 5.5) Added the kernel_info and kernel_info.setup_type_max. ============= ============================================================ -.. note:: + .. note:: The protocol version number should be changed only if the setup header is changed. There is no need to update the version number if boot_params or kernel_info are changed. Additionally, it is recommended to use -- cgit From f2c1dba31133233697fc96e808c6005fc304a8e9 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 28 Jun 2023 09:40:53 -0400 Subject: tools/power/turbostat: bugfix "--show IPC" turbostat --show IPC displays "inf" for the IPC column turbostat was missing the explicit dependency of IPC on APERF, and thus neglected to collect APERF when only IPC was requested. 
typcial use: turbostat --quiet --show CPU,IPC Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 8a311d7272e7..5aa6598a40a9 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2202,7 +2202,8 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d old->c1 = new->c1 - old->c1; - if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || soft_c1_residency_display(BIC_Avg_MHz)) { + if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC) + || soft_c1_residency_display(BIC_Avg_MHz)) { if ((new->aperf > old->aperf) && (new->mperf > old->mperf)) { old->aperf = new->aperf - old->aperf; old->mperf = new->mperf - old->mperf; @@ -2724,7 +2725,8 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) retry: t->tsc = rdtsc(); /* we are running on local CPU of interest */ - if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || soft_c1_residency_display(BIC_Avg_MHz)) { + if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC) + || soft_c1_residency_display(BIC_Avg_MHz)) { unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time; /* -- cgit From b8337e6a780dad9505f9d44da07c0a5c52fa0a04 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 7 Nov 2023 23:28:30 -0500 Subject: tools/power turbostat: version 2023.11.07 Turbostat features are now table-driven (Rui Zhang) Add support for some new platforms (Sumeet Pawnikar, Rui Zhang) Gracefully run in configs when CPUs are limited (Rui Zhang, Srinivas Pandruvada) misc minor fixes. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 5aa6598a40a9..7a334377f92b 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -6259,7 +6259,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2023.03.17 - Len Brown \n"); + fprintf(outf, "turbostat version 2023.11.07 - Len Brown \n"); } #define COMMAND_LINE_SIZE 2048 -- cgit From 31255e072b2e91f97645d792d25b2db744186dd1 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Tue, 7 Nov 2023 10:22:51 -0800 Subject: x86/shstk: Delay signal entry SSP write until after user accesses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a signal is being delivered, the kernel needs to make accesses to userspace. These accesses could encounter an access error, in which case the signal delivery itself will trigger a segfault. Usually this would result in the kernel killing the process. But in the case of a SEGV signal handler being configured, the failure of the first signal delivery will result in *another* signal getting delivered. The second signal may succeed if another thread has resolved the issue that triggered the segfault (i.e. a well timed mprotect()/mmap()), or the second signal is being delivered to another stack (i.e. an alt stack). On x86, in the non-shadow stack case, all the accesses to userspace are done before changes to the registers (in pt_regs). 
The operation is aborted when an access error occurs, so although there may be writes done for the first signal, control flow changes for the signal (regs->ip, regs->sp, etc) are not committed until all the accesses have already completed successfully. This means that the second signal will be delivered as if it happened at the time of the first signal. It will effectively replace the first aborted signal, overwriting the half-written frame of the aborted signal. So on sigreturn from the second signal, control flow will resume happily from the point of control flow where the original signal was delivered. The problem is, when shadow stack is active, the shadow stack SSP register/MSR is updated *before* some of the userspace accesses. This means if the earlier accesses succeed and the later ones fail, the second signal will not be delivered at the same spot on the shadow stack as the first one. So on sigreturn from the second signal, the SSP will be pointing to the wrong location on the shadow stack (off by a frame). Pengfei privately reported that while using a shadow stack enabled glibc, the “signal06” test in the LTP test-suite hung. It turns out it is testing the above described double signal scenario. When this test was compiled with shadow stack, the first signal pushed a shadow stack sigframe, then the second pushed another. When the second signal was handled, the SSP was at the first shadow stack signal frame instead of the original location. The test then got stuck as the #CP from the twice incremented SSP was incorrect and generated segfaults in a loop. Fix this by adjusting the SSP register only after any userspace accesses, such that there can be no failures after the SSP is adjusted. Do this by moving the shadow stack sigframe push logic to happen after all other userspace accesses. Note, sigreturn (as opposed to the signal delivery dealt with in this patch) has ordering behavior that could lead to similar failures. The ordering issues there extend beyond shadow stack to include the alt stack restoration. Fixing that would require cross-arch changes, and the ordering today does not cause any known test or apps breakages. So leave it as is, for now. 
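In rough pseudo-order (a sketch of the intent, not the literal code):

  before: push shadow stack sigframe (adjust SSP) -> write sigframe to user memory (may fault) -> update regs->ip/sp
  after:  write sigframe to user memory (may fault) -> push shadow stack sigframe (adjust SSP) -> update regs->ip/sp

so nothing is committed, on either stack, until every access that can fail has already succeeded.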
[ dhansen: minor changelog/subject tweak ] Fixes: 05e36022c054 ("x86/shstk: Handle signals for shadow stack") Reported-by: Pengfei Xu Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Tested-by: Pengfei Xu Cc:stable@vger.kernel.org Link: https://lore.kernel.org/all/20231107182251.91276-1-rick.p.edgecombe%40intel.com Link: https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/syscalls/signal/signal06.c --- arch/x86/kernel/signal_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index cacf2ede6217..23d8aaf8d9fd 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -175,9 +175,6 @@ int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp); uc_flags = frame_uc_flags(regs); - if (setup_signal_shadow_stack(ksig)) - return -EFAULT; - if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; @@ -198,6 +195,9 @@ int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) return -EFAULT; } + if (setup_signal_shadow_stack(ksig)) + return -EFAULT; + /* Set up registers for signal handler */ regs->di = ksig->sig; /* In case the signal handler was declared without prototypes */ -- cgit From 65120498aaf8d7320647a8b6d6de7db42e74ea52 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Nov 2023 13:58:27 +0100 Subject: stackleak: add declarations for global functions With -Wmissing-prototypes enabled, the stackleak code produces a couple of warnings that have no declarations because they are only called from assembler: stackleak.c:127:25: error: no previous prototype for 'stackleak_erase' [-Werror=missing-prototypes] stackleak.c:139:25: error: no previous prototype for 'stackleak_erase_on_task_stack' [-Werror=missing-prototypes] stackleak.c:151:25: error: no previous prototype for 'stackleak_erase_off_task_stack' [-Werror=missing-prototypes] stackleak.c:159:49: error: no previous prototype for 'stackleak_track_stack' [-Werror=missing-prototypes] Add declarations to the stackleak header to shut up the warnings. Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20231108125843.3806765-7-arnd@kernel.org Signed-off-by: Kees Cook --- include/linux/stackleak.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/stackleak.h b/include/linux/stackleak.h index c36e7a3b45e7..3be2cb564710 100644 --- a/include/linux/stackleak.h +++ b/include/linux/stackleak.h @@ -14,6 +14,7 @@ #ifdef CONFIG_GCC_PLUGIN_STACKLEAK #include +#include /* * The lowest address on tsk's stack which we can plausibly erase. @@ -76,6 +77,11 @@ static inline void stackleak_task_init(struct task_struct *t) # endif } +asmlinkage void noinstr stackleak_erase(void); +asmlinkage void noinstr stackleak_erase_on_task_stack(void); +asmlinkage void noinstr stackleak_erase_off_task_stack(void); +void __no_caller_saved_registers noinstr stackleak_track_stack(void); + #else /* !CONFIG_GCC_PLUGIN_STACKLEAK */ static inline void stackleak_task_init(struct task_struct *t) { } #endif -- cgit From 1ee60356c2dca938362528404af95b8ef3e49b6a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 4 Nov 2023 13:43:37 -0700 Subject: gcc-plugins: randstruct: Only warn about true flexible arrays The randstruct GCC plugin tried to discover "fake" flexible arrays to issue warnings about them in randomized structs. 
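For illustration, with hypothetical struct names, the distinction the plugin now draws is roughly:

struct fake0 { int n; int data[0]; };	/* 0-sized array: no longer treated as a flexible array */
struct fake1 { int n; int data[1]; };	/* 1-element array: no longer treated as a flexible array */
struct flex  { int n; int data[]; };	/* true flexible array member: still recognized and checked */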
In the future LSM overhead reduction series, it would be legal to have a randomized struct with a 1-element array, and this should _not_ be treated as a flexible array, especially since commit df8fc4e934c1 ("kbuild: Enable -fstrict-flex-arrays=3"). Disable the 0-sized and 1-element array discovery logic in the plugin, but keep the "true" flexible array check. Cc: KP Singh Cc: linux-hardening@vger.kernel.org Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311021532.iBwuZUZ0-lkp@intel.com/ Fixes: df8fc4e934c1 ("kbuild: Enable -fstrict-flex-arrays=3") Reviewed-by: Bill Wendling Acked-by: "Gustavo A. R. Silva" Link: https://lore.kernel.org/r/20231104204334.work.160-kees@kernel.org Signed-off-by: Kees Cook --- scripts/gcc-plugins/randomize_layout_plugin.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/scripts/gcc-plugins/randomize_layout_plugin.c b/scripts/gcc-plugins/randomize_layout_plugin.c index 366395cab490..910bd21d08f4 100644 --- a/scripts/gcc-plugins/randomize_layout_plugin.c +++ b/scripts/gcc-plugins/randomize_layout_plugin.c @@ -278,8 +278,6 @@ static bool is_flexible_array(const_tree field) { const_tree fieldtype; const_tree typesize; - const_tree elemtype; - const_tree elemsize; fieldtype = TREE_TYPE(field); typesize = TYPE_SIZE(fieldtype); @@ -287,20 +285,12 @@ static bool is_flexible_array(const_tree field) if (TREE_CODE(fieldtype) != ARRAY_TYPE) return false; - elemtype = TREE_TYPE(fieldtype); - elemsize = TYPE_SIZE(elemtype); - /* size of type is represented in bits */ if (typesize == NULL_TREE && TYPE_DOMAIN(fieldtype) != NULL_TREE && TYPE_MAX_VALUE(TYPE_DOMAIN(fieldtype)) == NULL_TREE) return true; - if (typesize != NULL_TREE && - (TREE_CONSTANT(typesize) && (!tree_to_uhwi(typesize) || - tree_to_uhwi(typesize) == tree_to_uhwi(elemsize)))) - return true; - return false; } -- cgit From 19597cad64d608aa8ac2f8aef50a50187a565223 Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Mon, 30 Oct 2023 12:19:12 +0530 Subject: scsi: qla2xxx: Fix system crash due to bad pointer access User experiences system crash when running AER error injection. The perturbation causes the abort-all-I/O path to trigger. The driver assumes all I/O on this path is FCP only. If there is both NVMe & FCP traffic, a system crash happens. Add additional check to see if I/O is FCP or not before access. 
PID: 999019 TASK: ff35d769f24722c0 CPU: 53 COMMAND: "kworker/53:1" 0 [ff3f78b964847b58] machine_kexec at ffffffffae86973d 1 [ff3f78b964847ba8] __crash_kexec at ffffffffae9be29d 2 [ff3f78b964847c70] crash_kexec at ffffffffae9bf528 3 [ff3f78b964847c78] oops_end at ffffffffae8282ab 4 [ff3f78b964847c98] exc_page_fault at ffffffffaf2da502 5 [ff3f78b964847cc0] asm_exc_page_fault at ffffffffaf400b62 [exception RIP: qla2x00_abort_srb+444] RIP: ffffffffc07b5f8c RSP: ff3f78b964847d78 RFLAGS: 00010046 RAX: 0000000000000282 RBX: ff35d74a0195a200 RCX: ff35d76886fd03a0 RDX: 0000000000000001 RSI: ffffffffc07c5ec8 RDI: ff35d74a0195a200 RBP: ff35d76913d22080 R8: ff35d7694d103200 R9: ff35d7694d103200 R10: 0000000100000000 R11: ffffffffb05d6630 R12: 0000000000010000 R13: ff3f78b964847df8 R14: ff35d768d8754000 R15: ff35d768877248e0 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 6 [ff3f78b964847d70] qla2x00_abort_srb at ffffffffc07b5f84 [qla2xxx] 7 [ff3f78b964847de0] __qla2x00_abort_all_cmds at ffffffffc07b6238 [qla2xxx] 8 [ff3f78b964847e38] qla2x00_abort_all_cmds at ffffffffc07ba635 [qla2xxx] 9 [ff3f78b964847e58] qla2x00_terminate_rport_io at ffffffffc08145eb [qla2xxx] 10 [ff3f78b964847e70] fc_terminate_rport_io at ffffffffc045987e [scsi_transport_fc] 11 [ff3f78b964847e88] process_one_work at ffffffffae914f15 12 [ff3f78b964847ed0] worker_thread at ffffffffae9154c0 13 [ff3f78b964847f10] kthread at ffffffffae91c456 14 [ff3f78b964847f50] ret_from_fork at ffffffffae8036ef Cc: stable@vger.kernel.org Fixes: f45bca8c5052 ("scsi: qla2xxx: Fix double scsi_done for abort path") Signed-off-by: Quinn Tran Signed-off-by: Nilesh Javali Link: https://lore.kernel.org/r/20231030064912.37912-1-njavali@marvell.com Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_os.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 7e103d711825..d24410944f7d 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -1837,8 +1837,16 @@ static void qla2x00_abort_srb(struct qla_qpair *qp, srb_t *sp, const int res, } spin_lock_irqsave(qp->qp_lock_ptr, *flags); - if (ret_cmd && blk_mq_request_started(scsi_cmd_to_rq(cmd))) - sp->done(sp, res); + switch (sp->type) { + case SRB_SCSI_CMD: + if (ret_cmd && blk_mq_request_started(scsi_cmd_to_rq(cmd))) + sp->done(sp, res); + break; + default: + if (ret_cmd) + sp->done(sp, res); + break; + } } else { sp->done(sp, res); } -- cgit From defde5a50d91c74e1ce71a7f0bce7fb1ae311d84 Mon Sep 17 00:00:00 2001 From: Naomi Chu Date: Thu, 2 Nov 2023 13:24:24 +0800 Subject: scsi: ufs: core: Expand MCQ queue slot to DeviceQueueDepth + 1 The UFSHCI 4.0 specification mandates that there should always be at least one empty slot in each queue for distinguishing between full and empty states. Enlarge 'hwq->max_entries' to 'DeviceQueueDepth + 1' to allow UFSHCI 4.0 controllers to fully utilize MCQ queue slots. Fixes: 4682abfae2eb ("scsi: ufs: core: mcq: Allocate memory for MCQ mode") Signed-off-by: Naomi Chu Link: https://lore.kernel.org/r/20231102052426.12006-2-naomi.chu@mediatek.com Reviewed-by: Stanley Chu Reviewed-by: Bart Van Assche Reviewed-by: Peter Wang Reviewed-by: Chun-Hung Signed-off-by: Martin K. 
Petersen --- drivers/ufs/core/ufs-mcq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ufs/core/ufs-mcq.c b/drivers/ufs/core/ufs-mcq.c index 2ba8ec254dce..5c75ab9d6bb5 100644 --- a/drivers/ufs/core/ufs-mcq.c +++ b/drivers/ufs/core/ufs-mcq.c @@ -436,7 +436,7 @@ int ufshcd_mcq_init(struct ufs_hba *hba) for (i = 0; i < hba->nr_hw_queues; i++) { hwq = &hba->uhq[i]; - hwq->max_entries = hba->nutrs; + hwq->max_entries = hba->nutrs + 1; spin_lock_init(&hwq->sq_lock); spin_lock_init(&hwq->cq_lock); mutex_init(&hwq->sq_mutex); -- cgit From 27900d7119c464b43cd9eac69c85884d17bae240 Mon Sep 17 00:00:00 2001 From: Peter Wang Date: Mon, 6 Nov 2023 15:51:17 +0800 Subject: scsi: ufs: core: Fix racing issue between ufshcd_mcq_abort() and ISR If command timeout happens and cq complete IRQ is raised at the same time, ufshcd_mcq_abort clears lprb->cmd and a NULL pointer deref happens in the ISR. Error log: ufshcd_abort: Device abort task at tag 18 Unable to handle kernel NULL pointer dereference at virtual address 0000000000000108 pc : [0xffffffe27ef867ac] scsi_dma_unmap+0xc/0x44 lr : [0xffffffe27f1b898c] ufshcd_release_scsi_cmd+0x24/0x114 Fixes: f1304d442077 ("scsi: ufs: mcq: Added ufshcd_mcq_abort()") Cc: stable@vger.kernel.org Signed-off-by: Peter Wang Link: https://lore.kernel.org/r/20231106075117.8995-1-peter.wang@mediatek.com Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufs-mcq.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/ufs/core/ufs-mcq.c b/drivers/ufs/core/ufs-mcq.c index 5c75ab9d6bb5..0787456c2b89 100644 --- a/drivers/ufs/core/ufs-mcq.c +++ b/drivers/ufs/core/ufs-mcq.c @@ -630,6 +630,7 @@ int ufshcd_mcq_abort(struct scsi_cmnd *cmd) int tag = scsi_cmd_to_rq(cmd)->tag; struct ufshcd_lrb *lrbp = &hba->lrb[tag]; struct ufs_hw_queue *hwq; + unsigned long flags; int err = FAILED; if (!ufshcd_cmd_inflight(lrbp->cmd)) { @@ -670,8 +671,10 @@ int ufshcd_mcq_abort(struct scsi_cmnd *cmd) } err = SUCCESS; + spin_lock_irqsave(&hwq->cq_lock, flags); if (ufshcd_cmd_inflight(lrbp->cmd)) ufshcd_release_scsi_cmd(hba, lrbp); + spin_unlock_irqrestore(&hwq->cq_lock, flags); out: return err; -- cgit From 860c3d03bbc3f17aef8600662c488f27fd093142 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 6 Nov 2023 17:04:33 +0300 Subject: scsi: scsi_debug: Fix some bugs in sdebug_error_write() There are two bug in this code: 1) If count is zero, then it will lead to a NULL dereference. The kmalloc() will successfully allocate zero bytes and the test for "if (buf[0] == '-')" will read beyond the end of the zero size buffer and Oops. 2) The code does not ensure that the user's string is properly NUL terminated which could lead to a read overflow. Fixes: a9996d722b11 ("scsi: scsi_debug: Add interface to manage error injection for a single device") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/7733643d-e102-4581-8d29-769472011c97@moroto.mountain Reviewed-by: Wenchao Hao Signed-off-by: Martin K. 
Petersen --- drivers/scsi/scsi_debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 67922e2c4c19..0dd21598f7b6 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -1019,7 +1019,7 @@ static ssize_t sdebug_error_write(struct file *file, const char __user *ubuf, struct sdebug_err_inject *inject; struct scsi_device *sdev = (struct scsi_device *)file->f_inode->i_private; - buf = kmalloc(count, GFP_KERNEL); + buf = kzalloc(count + 1, GFP_KERNEL); if (!buf) return -ENOMEM; -- cgit From 037fbd3fcfbd99145f9310d93f6637012807cfd0 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 6 Nov 2023 17:05:04 +0300 Subject: scsi: scsi_debug: Delete some bogus error checking Smatch complains that "dentry" is never initialized. These days everyone initializes all their stack variables to zero so this means that it will trigger a warning every time this function is run. Really, debugfs functions are not supposed to be checked for errors in normal code. For example, if we updated this code to check the correct variable then it would print a warning if CONFIG_DEBUGFS was disabled. We don't want that. Just delete the check. Fixes: f084fe52c640 ("scsi: scsi_debug: Add debugfs interface to fail target reset") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/c602c9ad-5e35-4e18-a47f-87ed956a9ec2@moroto.mountain Reviewed-by: Wenchao Hao Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_debug.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 0dd21598f7b6..6d8218a44122 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -1132,7 +1132,6 @@ static const struct file_operations sdebug_target_reset_fail_fops = { static int sdebug_target_alloc(struct scsi_target *starget) { struct sdebug_target_info *targetip; - struct dentry *dentry; targetip = kzalloc(sizeof(struct sdebug_target_info), GFP_KERNEL); if (!targetip) @@ -1140,15 +1139,9 @@ static int sdebug_target_alloc(struct scsi_target *starget) targetip->debugfs_entry = debugfs_create_dir(dev_name(&starget->dev), sdebug_debugfs_root); - if (IS_ERR_OR_NULL(targetip->debugfs_entry)) - pr_info("%s: failed to create debugfs directory for target %s\n", - __func__, dev_name(&starget->dev)); debugfs_create_file("fail_reset", 0600, targetip->debugfs_entry, starget, &sdebug_target_reset_fail_fops); - if (IS_ERR_OR_NULL(dentry)) - pr_info("%s: failed to create fail_reset file for target %s\n", - __func__, dev_name(&starget->dev)); starget->hostdata = targetip; -- cgit From 3b83486399a6a9feb9c681b74c21a227d48d7020 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Mon, 6 Nov 2023 17:13:04 -0600 Subject: scsi: sd: Fix sshdr use in sd_suspend_common() If scsi_execute_cmd() returns < 0, it doesn't initialize the sshdr, so we shouldn't access the sshdr. If it returns 0, then the cmd executed successfully, so there is no need to check the sshdr. sd_sync_cache() will only access the sshdr if it's been setup because it calls scsi_status_is_check_condition() before accessing it. However, the sd_sync_cache() caller, sd_suspend_common(), does not check. sd_suspend_common() is only checking for ILLEGAL_REQUEST which it's using to determine if the command is supported. If it's not it just ignores the error. So to fix its sshdr use this patch just moves that check to sd_sync_cache() where it converts ILLEGAL_REQUEST to success/0. 
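The resulting rule inside sd_sync_cache() is roughly the following (a simplified sketch with arguments elided, not the full function):

	res = scsi_execute_cmd(...);		/* arguments omitted in this sketch */
	if (res < 0)
		return res;			/* sshdr was never filled in, so it must not be read */
	if (res == 0)
		return 0;			/* command succeeded, nothing in sshdr to evaluate */
	if (scsi_status_is_check_condition(res) && scsi_sense_valid(&sshdr) &&
	    sshdr.sense_key == ILLEGAL_REQUEST)
		return 0;			/* sync not supported, treat as success for shutdown/suspend */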
sd_suspend_common() was ignoring that error and sd_shutdown() doesn't check for errors so there will be no behavior changes. Signed-off-by: Mike Christie Link: https://lore.kernel.org/r/20231106231304.5694-2-michael.christie@oracle.com Reviewed-by: Christoph Hellwig Reviewed-by: Martin Wilck Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 53 +++++++++++++++++++++++------------------------------ 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 864db30b9c8e..3ef3b00397fe 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1565,24 +1565,21 @@ out: return disk_changed ? DISK_EVENT_MEDIA_CHANGE : 0; } -static int sd_sync_cache(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr) +static int sd_sync_cache(struct scsi_disk *sdkp) { int retries, res; struct scsi_device *sdp = sdkp->device; const int timeout = sdp->request_queue->rq_timeout * SD_FLUSH_TIMEOUT_MULTIPLIER; - struct scsi_sense_hdr my_sshdr; + struct scsi_sense_hdr sshdr; const struct scsi_exec_args exec_args = { .req_flags = BLK_MQ_REQ_PM, - /* caller might not be interested in sense, but we need it */ - .sshdr = sshdr ? : &my_sshdr, + .sshdr = &sshdr, }; if (!scsi_device_online(sdp)) return -ENODEV; - sshdr = exec_args.sshdr; - for (retries = 3; retries > 0; --retries) { unsigned char cmd[16] = { 0 }; @@ -1607,15 +1604,23 @@ static int sd_sync_cache(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr) return res; if (scsi_status_is_check_condition(res) && - scsi_sense_valid(sshdr)) { - sd_print_sense_hdr(sdkp, sshdr); + scsi_sense_valid(&sshdr)) { + sd_print_sense_hdr(sdkp, &sshdr); /* we need to evaluate the error return */ - if (sshdr->asc == 0x3a || /* medium not present */ - sshdr->asc == 0x20 || /* invalid command */ - (sshdr->asc == 0x74 && sshdr->ascq == 0x71)) /* drive is password locked */ + if (sshdr.asc == 0x3a || /* medium not present */ + sshdr.asc == 0x20 || /* invalid command */ + (sshdr.asc == 0x74 && sshdr.ascq == 0x71)) /* drive is password locked */ /* this is no error here */ return 0; + /* + * This drive doesn't support sync and there's not much + * we can do because this is called during shutdown + * or suspend so just return success so those operations + * can proceed. 
+ */ + if (sshdr.sense_key == ILLEGAL_REQUEST) + return 0; } switch (host_byte(res)) { @@ -3774,7 +3779,7 @@ static void sd_shutdown(struct device *dev) if (sdkp->WCE && sdkp->media_present) { sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n"); - sd_sync_cache(sdkp, NULL); + sd_sync_cache(sdkp); } if (system_state != SYSTEM_RESTART && sdkp->device->manage_start_stop) { @@ -3786,7 +3791,6 @@ static void sd_shutdown(struct device *dev) static int sd_suspend_common(struct device *dev, bool ignore_stop_errors) { struct scsi_disk *sdkp = dev_get_drvdata(dev); - struct scsi_sense_hdr sshdr; int ret = 0; if (!sdkp) /* E.g.: runtime suspend following sd_remove() */ @@ -3795,24 +3799,13 @@ static int sd_suspend_common(struct device *dev, bool ignore_stop_errors) if (sdkp->WCE && sdkp->media_present) { if (!sdkp->device->silence_suspend) sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n"); - ret = sd_sync_cache(sdkp, &sshdr); - - if (ret) { - /* ignore OFFLINE device */ - if (ret == -ENODEV) - return 0; - - if (!scsi_sense_valid(&sshdr) || - sshdr.sense_key != ILLEGAL_REQUEST) - return ret; + ret = sd_sync_cache(sdkp); + /* ignore OFFLINE device */ + if (ret == -ENODEV) + return 0; - /* - * sshdr.sense_key == ILLEGAL_REQUEST means this drive - * doesn't support sync. There's not much to do and - * suspend shouldn't fail. - */ - ret = 0; - } + if (ret) + return ret; } if (sdkp->device->manage_start_stop) { -- cgit From e439e4a62a8ea3c39d65c546de3af7d1c594077c Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Mon, 30 Oct 2023 10:43:11 +0100 Subject: scsi: ufs: qcom-ufs: dt-bindings: Document the SM8650 UFS Controller Document the UFS Controller on the SM8650 Platform. Reviewed-by: Alim Akhtar Reviewed-by: Manivannan Sadhasivam Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20231030-topic-sm8650-upstream-bindings-ufs-v3-1-a96364463fd5@linaro.org Reviewed-by: Krzysztof Kozlowski Signed-off-by: Martin K. Petersen --- Documentation/devicetree/bindings/ufs/qcom,ufs.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml b/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml index 462ead5a1cec..2cf3d016db42 100644 --- a/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml +++ b/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml @@ -36,6 +36,7 @@ properties: - qcom,sm8350-ufshc - qcom,sm8450-ufshc - qcom,sm8550-ufshc + - qcom,sm8650-ufshc - const: qcom,ufshc - const: jedec,ufs-2.0 @@ -122,6 +123,7 @@ allOf: - qcom,sm8350-ufshc - qcom,sm8450-ufshc - qcom,sm8550-ufshc + - qcom,sm8650-ufshc then: properties: clocks: -- cgit From 6c8e69e4a702b072206f166111c003d704de15d9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 3 Nov 2023 12:26:25 +0000 Subject: btrfs: fix race between accounting qgroup extents and removing a qgroup When doing qgroup accounting for an extent, we take the spinlock fs_info->qgroup_lock and then add qgroups to the local list (iterator) named "qgroups". These qgroups are found in the fs_info->qgroup_tree rbtree. After we're done, we unlock fs_info->qgroup_lock and then call qgroup_iterator_nested_clean(), which will iterate over all the qgroups added to the local list "qgroups" and then delete them from the list. Deleting a qgroup from the list can however result in a use-after-free if a qgroup remove operation happens after we unlock fs_info->qgroup_lock and before or while we are at qgroup_iterator_nested_clean(). 
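Schematically, the window looks like this (a sketch of the race, not actual code):

  accounting task                               btrfs_remove_qgroup()
  spin_lock(&fs_info->qgroup_lock)
  collect qgroups into the local "qgroups" list
  spin_unlock(&fs_info->qgroup_lock)
                                                spin_lock(&fs_info->qgroup_lock)
                                                remove the qgroup from the rbtree and free it
                                                spin_unlock(&fs_info->qgroup_lock)
  qgroup_iterator_nested_clean(&qgroups)  <--   list_del on the freed qgroup (use-after-free)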
Fix this by calling qgroup_iterator_nested_clean() while still holding the lock fs_info->qgroup_lock - we don't need it under the 'out' label since before taking the lock the "qgroups" list is always empty. This guarantees safety because btrfs_remove_qgroup() takes that lock before removing a qgroup from the rbtree fs_info->qgroup_tree. This was reported by syzbot with the following stack traces: BUG: KASAN: slab-use-after-free in __list_del_entry_valid_or_report+0x2f/0x130 lib/list_debug.c:49 Read of size 8 at addr ffff888027e420b0 by task kworker/u4:3/48 CPU: 1 PID: 48 Comm: kworker/u4:3 Not tainted 6.6.0-syzkaller-10396-g4652b8e4f3ff #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023 Workqueue: btrfs-qgroup-rescan btrfs_work_helper Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1e7/0x2d0 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:364 [inline] print_report+0x163/0x540 mm/kasan/report.c:475 kasan_report+0x175/0x1b0 mm/kasan/report.c:588 __list_del_entry_valid_or_report+0x2f/0x130 lib/list_debug.c:49 __list_del_entry_valid include/linux/list.h:124 [inline] __list_del_entry include/linux/list.h:215 [inline] list_del_init include/linux/list.h:287 [inline] qgroup_iterator_nested_clean fs/btrfs/qgroup.c:2623 [inline] btrfs_qgroup_account_extent+0x18b/0x1150 fs/btrfs/qgroup.c:2883 qgroup_rescan_leaf fs/btrfs/qgroup.c:3543 [inline] btrfs_qgroup_rescan_worker+0x1078/0x1c60 fs/btrfs/qgroup.c:3604 btrfs_work_helper+0x37c/0xbd0 fs/btrfs/async-thread.c:315 process_one_work kernel/workqueue.c:2630 [inline] process_scheduled_works+0x90f/0x1400 kernel/workqueue.c:2703 worker_thread+0xa5f/0xff0 kernel/workqueue.c:2784 kthread+0x2d3/0x370 kernel/kthread.c:388 ret_from_fork+0x48/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:242 Allocated by task 6355: kasan_save_stack mm/kasan/common.c:45 [inline] kasan_set_track+0x4f/0x70 mm/kasan/common.c:52 ____kasan_kmalloc mm/kasan/common.c:374 [inline] __kasan_kmalloc+0x98/0xb0 mm/kasan/common.c:383 kmalloc include/linux/slab.h:600 [inline] kzalloc include/linux/slab.h:721 [inline] btrfs_quota_enable+0xee9/0x2060 fs/btrfs/qgroup.c:1209 btrfs_ioctl_quota_ctl+0x143/0x190 fs/btrfs/ioctl.c:3705 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:871 [inline] __se_sys_ioctl+0xf8/0x170 fs/ioctl.c:857 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Freed by task 6355: kasan_save_stack mm/kasan/common.c:45 [inline] kasan_set_track+0x4f/0x70 mm/kasan/common.c:52 kasan_save_free_info+0x28/0x40 mm/kasan/generic.c:522 ____kasan_slab_free+0xd6/0x120 mm/kasan/common.c:236 kasan_slab_free include/linux/kasan.h:164 [inline] slab_free_hook mm/slub.c:1800 [inline] slab_free_freelist_hook mm/slub.c:1826 [inline] slab_free mm/slub.c:3809 [inline] __kmem_cache_free+0x263/0x3a0 mm/slub.c:3822 btrfs_remove_qgroup+0x764/0x8c0 fs/btrfs/qgroup.c:1787 btrfs_ioctl_qgroup_create+0x185/0x1e0 fs/btrfs/ioctl.c:3811 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:871 [inline] __se_sys_ioctl+0xf8/0x170 fs/ioctl.c:857 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Last potentially related work creation: kasan_save_stack+0x3f/0x60 mm/kasan/common.c:45 __kasan_record_aux_stack+0xad/0xc0 mm/kasan/generic.c:492 __call_rcu_common kernel/rcu/tree.c:2667 [inline] 
call_rcu+0x167/0xa70 kernel/rcu/tree.c:2781 kthread_worker_fn+0x4ba/0xa90 kernel/kthread.c:823 kthread+0x2d3/0x370 kernel/kthread.c:388 ret_from_fork+0x48/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:242 Second to last potentially related work creation: kasan_save_stack+0x3f/0x60 mm/kasan/common.c:45 __kasan_record_aux_stack+0xad/0xc0 mm/kasan/generic.c:492 __call_rcu_common kernel/rcu/tree.c:2667 [inline] call_rcu+0x167/0xa70 kernel/rcu/tree.c:2781 kthread_worker_fn+0x4ba/0xa90 kernel/kthread.c:823 kthread+0x2d3/0x370 kernel/kthread.c:388 ret_from_fork+0x48/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:242 The buggy address belongs to the object at ffff888027e42000 which belongs to the cache kmalloc-512 of size 512 The buggy address is located 176 bytes inside of freed 512-byte region [ffff888027e42000, ffff888027e42200) The buggy address belongs to the physical page: page:ffffea00009f9000 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x27e40 head:ffffea00009f9000 order:2 entire_mapcount:0 nr_pages_mapped:0 pincount:0 flags: 0xfff00000000840(slab|head|node=0|zone=1|lastcpupid=0x7ff) page_type: 0xffffffff() raw: 00fff00000000840 ffff888012c41c80 ffffea0000a5ba00 dead000000000002 raw: 0000000000000000 0000000080100010 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 2, migratetype Unmovable, gfp_mask 0xd20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC), pid 4514, tgid 4514 (udevadm), ts 24598439480, free_ts 23755696267 set_page_owner include/linux/page_owner.h:31 [inline] post_alloc_hook+0x1e6/0x210 mm/page_alloc.c:1536 prep_new_page mm/page_alloc.c:1543 [inline] get_page_from_freelist+0x31db/0x3360 mm/page_alloc.c:3170 __alloc_pages+0x255/0x670 mm/page_alloc.c:4426 alloc_slab_page+0x6a/0x160 mm/slub.c:1870 allocate_slab mm/slub.c:2017 [inline] new_slab+0x84/0x2f0 mm/slub.c:2070 ___slab_alloc+0xc85/0x1310 mm/slub.c:3223 __slab_alloc mm/slub.c:3322 [inline] __slab_alloc_node mm/slub.c:3375 [inline] slab_alloc_node mm/slub.c:3468 [inline] __kmem_cache_alloc_node+0x19d/0x270 mm/slub.c:3517 kmalloc_trace+0x2a/0xe0 mm/slab_common.c:1098 kmalloc include/linux/slab.h:600 [inline] kzalloc include/linux/slab.h:721 [inline] kernfs_fop_open+0x3e7/0xcc0 fs/kernfs/file.c:670 do_dentry_open+0x8fd/0x1590 fs/open.c:948 do_open fs/namei.c:3622 [inline] path_openat+0x2845/0x3280 fs/namei.c:3779 do_filp_open+0x234/0x490 fs/namei.c:3809 do_sys_openat2+0x13e/0x1d0 fs/open.c:1440 do_sys_open fs/open.c:1455 [inline] __do_sys_openat fs/open.c:1471 [inline] __se_sys_openat fs/open.c:1466 [inline] __x64_sys_openat+0x247/0x290 fs/open.c:1466 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b page last free stack trace: reset_page_owner include/linux/page_owner.h:24 [inline] free_pages_prepare mm/page_alloc.c:1136 [inline] free_unref_page_prepare+0x8c3/0x9f0 mm/page_alloc.c:2312 free_unref_page+0x37/0x3f0 mm/page_alloc.c:2405 discard_slab mm/slub.c:2116 [inline] __unfreeze_partials+0x1dc/0x220 mm/slub.c:2655 put_cpu_partial+0x17b/0x250 mm/slub.c:2731 __slab_free+0x2b6/0x390 mm/slub.c:3679 qlink_free mm/kasan/quarantine.c:166 [inline] qlist_free_all+0x75/0xe0 mm/kasan/quarantine.c:185 kasan_quarantine_reduce+0x14b/0x160 mm/kasan/quarantine.c:292 __kasan_slab_alloc+0x23/0x70 mm/kasan/common.c:305 
kasan_slab_alloc include/linux/kasan.h:188 [inline] slab_post_alloc_hook+0x67/0x3d0 mm/slab.h:762 slab_alloc_node mm/slub.c:3478 [inline] slab_alloc mm/slub.c:3486 [inline] __kmem_cache_alloc_lru mm/slub.c:3493 [inline] kmem_cache_alloc+0x104/0x2c0 mm/slub.c:3502 getname_flags+0xbc/0x4f0 fs/namei.c:140 do_sys_openat2+0xd2/0x1d0 fs/open.c:1434 do_sys_open fs/open.c:1455 [inline] __do_sys_openat fs/open.c:1471 [inline] __se_sys_openat fs/open.c:1466 [inline] __x64_sys_openat+0x247/0x290 fs/open.c:1466 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Memory state around the buggy address: ffff888027e41f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff888027e42000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff888027e42080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888027e42100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888027e42180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Reported-by: syzbot+e0b615318f8fcfc01ceb@syzkaller.appspotmail.com Fixes: dce28769a33a ("btrfs: qgroup: use qgroup_iterator_nested to in qgroup_update_refcnt()") CC: stable@vger.kernel.org # 6.6 Link: https://lore.kernel.org/linux-btrfs/00000000000091a5b2060936bf6d@google.com/ Reviewed-by: Josef Bacik Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index edb84cc03237..e48eba7e9379 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2874,13 +2874,19 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, qgroup_update_counters(fs_info, &qgroups, nr_old_roots, nr_new_roots, num_bytes, seq); + /* + * We're done using the iterator, release all its qgroups while holding + * fs_info->qgroup_lock so that we don't race with btrfs_remove_qgroup() + * and trigger use-after-free accesses to qgroups. + */ + qgroup_iterator_nested_clean(&qgroups); + /* * Bump qgroup_seq to avoid seq overlap */ fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; spin_unlock(&fs_info->qgroup_lock); out_free: - qgroup_iterator_nested_clean(&qgroups); ulist_free(old_roots); ulist_free(new_roots); return ret; -- cgit From 609d99379736aa6c5b0658654084198aa808035a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 6 Nov 2023 20:17:37 +0000 Subject: btrfs: fix qgroup record leaks when using simple quotas When using simple quotas we are not supposed to allocate qgroup records when adding delayed references. However we allocate them if either mode of quotas is enabled (the new simple one or the old one), but then we never free them because running the accounting, which frees the records, is only run when using the old quotas (at btrfs_qgroup_account_extents()), resulting in a memory leak of the records allocated when adding delayed references. Fix this by allocating the records only if the old quotas mode is enabled. Also fix btrfs_qgroup_trace_extent_nolock() to return 1 if the old quotas mode is not enabled - meaning the caller has to free the record. 
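The expected caller pattern then becomes roughly this (a simplified sketch of the btrfs_qgroup_trace_extent() side):

	record = kzalloc(sizeof(*record), GFP_NOFS);
	...
	ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
	if (ret > 0)
		kfree(record);	/* record was not inserted, so the caller must free it */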
Fixes: 182940f4f4db ("btrfs: qgroup: add new quota mode for simple quotas") Reported-by: syzbot+d3ddc6dcc6386dea398b@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/00000000000004769106097f9a34@google.com/ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 4 ++-- fs/btrfs/qgroup.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 9223934d95f4..891ea2fa263c 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -1041,7 +1041,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, return -ENOMEM; } - if (btrfs_qgroup_enabled(fs_info) && !generic_ref->skip_qgroup) { + if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); @@ -1144,7 +1144,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, return -ENOMEM; } - if (btrfs_qgroup_enabled(fs_info) && !generic_ref->skip_qgroup) { + if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index e48eba7e9379..ce446d9d7f23 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1888,7 +1888,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, u64 bytenr = record->bytenr; if (!btrfs_qgroup_full_accounting(fs_info)) - return 0; + return 1; lockdep_assert_held(&delayed_refs->lock); trace_btrfs_qgroup_trace_extent(fs_info, record); -- cgit From d3933152442b7f94419e9ea71835d71b620baf0e Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Fri, 3 Nov 2023 11:38:04 -0700 Subject: btrfs: make OWNER_REF_KEY type value smallest among inline refs BTRFS_EXTENT_OWNER_REF_KEY is the type of simple quotas extent owner refs. This special inline ref goes in front of all other inline refs. In general, inline refs have a required sorted order s.t. type never decreases (among other requirements). This was recently reified into a tree-checker and fsck rule, which broke simple quotas. To be fair, though, in a sense, the new owner ref item had also violated that not yet fully enforced requirement. This fix brings the owner ref item into compliance with the requirement that inline ref type never decrease. btrfs/301 exercises this behavior and should pass again with this fix. Fixes: d9a620f77e33 ("btrfs: new inline ref storing owning subvol of data extents") Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/uapi/linux/btrfs_tree.h | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index c25fc9614594..d24e8e121507 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -219,6 +219,22 @@ */ #define BTRFS_METADATA_ITEM_KEY 169 +/* + * Special inline ref key which stores the id of the subvolume which originally + * created the extent. This subvolume owns the extent permanently from the + * perspective of simple quotas. Needed to know which subvolume to free quota + * usage from when the extent is deleted. + * + * Stored as an inline ref rather to avoid wasting space on a separate item on + * top of the existing extent item. 
However, unlike the other inline refs, + * there is one one owner ref per extent rather than one per extent. + * + * Because of this, it goes at the front of the list of inline refs, and thus + * must have a lower type value than any other inline ref type (to satisfy the + * disk format rule that inline refs have non-decreasing type). + */ +#define BTRFS_EXTENT_OWNER_REF_KEY 172 + #define BTRFS_TREE_BLOCK_REF_KEY 176 #define BTRFS_EXTENT_DATA_REF_KEY 178 @@ -233,14 +249,6 @@ #define BTRFS_SHARED_DATA_REF_KEY 184 -/* - * Special inline ref key which stores the id of the subvolume which originally - * created the extent. This subvolume owns the extent permanently from the - * perspective of simple quotas. Needed to know which subvolume to free quota - * usage from when the extent is deleted. - */ -#define BTRFS_EXTENT_OWNER_REF_KEY 188 - /* * block groups give us hints into the extent allocation trees. Which * blocks are free etc etc -- cgit From ec9aedb2aa1ab7ac420c00b31f5edc5be15ec167 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 3 Jul 2023 00:28:02 +0800 Subject: x86/acpi: Ignore invalid x2APIC entries Currently, the kernel enumerates the possible CPUs by parsing both ACPI MADT Local APIC entries and x2APIC entries. So CPUs with "valid" APIC IDs, even if they have duplicated APIC IDs in Local APIC and x2APIC, are always enumerated. Below is what ACPI MADT Local APIC and x2APIC describes on an Ivebridge-EP system, [02Ch 0044 1] Subtable Type : 00 [Processor Local APIC] [02Fh 0047 1] Local Apic ID : 00 ... [164h 0356 1] Subtable Type : 00 [Processor Local APIC] [167h 0359 1] Local Apic ID : 39 [16Ch 0364 1] Subtable Type : 00 [Processor Local APIC] [16Fh 0367 1] Local Apic ID : FF ... [3ECh 1004 1] Subtable Type : 09 [Processor Local x2APIC] [3F0h 1008 4] Processor x2Apic ID : 00000000 ... [B5Ch 2908 1] Subtable Type : 09 [Processor Local x2APIC] [B60h 2912 4] Processor x2Apic ID : 00000077 As a result, kernel shows "smpboot: Allowing 168 CPUs, 120 hotplug CPUs". And this wastes significant amount of memory for the per-cpu data. Plus this also breaks https://lore.kernel.org/all/87edm36qqb.ffs@tglx/, because __max_logical_packages is over-estimated by the APIC IDs in the x2APIC entries. According to https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html#processor-local-x2apic-structure: "[Compatibility note] On some legacy OSes, Logical processors with APIC ID values less than 255 (whether in XAPIC or X2APIC mode) must use the Processor Local APIC structure to convey their APIC information to OSPM, and those processors must be declared in the DSDT using the Processor() keyword. Logical processors with APIC ID values 255 and greater must use the Processor Local x2APIC structure and be declared using the Device() keyword." Therefore prevent the registration of x2APIC entries with an APIC ID less than 255 if the local APIC table enumerates valid APIC IDs. 
[ tglx: Simplify the logic ] Signed-off-by: Zhang Rui Signed-off-by: Thomas Gleixner Tested-by: Peter Zijlstra Link: https://lore.kernel.org/r/20230702162802.344176-1-rui.zhang@intel.com --- arch/x86/kernel/acpi/boot.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index c55c0ef47a18..fc5bce1b5047 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -63,6 +63,7 @@ int acpi_fix_pin2_polarity __initdata; #ifdef CONFIG_X86_LOCAL_APIC static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; +static bool has_lapic_cpus __initdata; static bool acpi_support_online_capable; #endif @@ -232,6 +233,14 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) if (!acpi_is_processor_usable(processor->lapic_flags)) return 0; + /* + * According to https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html#processor-local-x2apic-structure + * when MADT provides both valid LAPIC and x2APIC entries, the APIC ID + * in x2APIC must be equal or greater than 0xff. + */ + if (has_lapic_cpus && apic_id < 0xff) + return 0; + /* * We need to register disabled CPU as well to permit * counting disabled CPUs. This allows us to size @@ -1114,10 +1123,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void) static int __init acpi_parse_madt_lapic_entries(void) { - int count; - int x2count = 0; - int ret; - struct acpi_subtable_proc madt_proc[2]; + int count, x2count = 0; if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; @@ -1126,21 +1132,11 @@ static int __init acpi_parse_madt_lapic_entries(void) acpi_parse_sapic, MAX_LOCAL_APIC); if (!count) { - memset(madt_proc, 0, sizeof(madt_proc)); - madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC; - madt_proc[0].handler = acpi_parse_lapic; - madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC; - madt_proc[1].handler = acpi_parse_x2apic; - ret = acpi_table_parse_entries_array(ACPI_SIG_MADT, - sizeof(struct acpi_table_madt), - madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC); - if (ret < 0) { - pr_err("Error parsing LAPIC/X2APIC entries\n"); - return ret; - } - - count = madt_proc[0].count; - x2count = madt_proc[1].count; + count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, + acpi_parse_lapic, MAX_LOCAL_APIC); + has_lapic_cpus = count > 0; + x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, + acpi_parse_x2apic, MAX_LOCAL_APIC); } if (!count && !x2count) { pr_err("No LAPIC entries present\n"); -- cgit From fe69a1b1b6ed9ffc2c578c63f526026a8ab74f0c Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Thu, 9 Nov 2023 18:43:28 +0100 Subject: selftests: bpf: xskxceiver: ksft_print_msg: fix format type error Crossbuilding selftests/bpf for architecture arm64, format specifies type error show up like. xskxceiver.c:912:34: error: format specifies type 'int' but the argument has type '__u64' (aka 'unsigned long long') [-Werror,-Wformat] ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%d]\n", ~~ %llu __func__, pkt->pkt_nb, meta->count); ^~~~~~~~~~~ xskxceiver.c:929:55: error: format specifies type 'unsigned long long' but the argument has type 'u64' (aka 'unsigned long') [-Werror,-Wformat] ksft_print_msg("Frag invalid addr: %llx len: %u\n", addr, len); ~~~~ ^~~~ Fixing the issues by casting to (unsigned long long) and changing the specifiers to be %llu from %d and %u, since with u64s it might be %llx or %lx, depending on architecture. 
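The portable pattern is simply to cast the value to the widest matching standard type before printing, for example:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t addr = 0xdeadbeefULL;	/* __u64/u64 may be 'long' or 'long long' depending on the arch */

	printf("addr: %llx len: %u\n", (unsigned long long)addr, 64u);
	return 0;
}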
Signed-off-by: Anders Roxell Link: https://lore.kernel.org/r/20231109174328.1774571-1-anders.roxell@linaro.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/xskxceiver.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index 591ca9637b23..b604c570309a 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -908,8 +908,9 @@ static bool is_metadata_correct(struct pkt *pkt, void *buffer, u64 addr) struct xdp_info *meta = data - sizeof(struct xdp_info); if (meta->count != pkt->pkt_nb) { - ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%d]\n", - __func__, pkt->pkt_nb, meta->count); + ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%llu]\n", + __func__, pkt->pkt_nb, + (unsigned long long)meta->count); return false; } @@ -926,11 +927,13 @@ static bool is_frag_valid(struct xsk_umem_info *umem, u64 addr, u32 len, u32 exp if (addr >= umem->num_frames * umem->frame_size || addr + len > umem->num_frames * umem->frame_size) { - ksft_print_msg("Frag invalid addr: %llx len: %u\n", addr, len); + ksft_print_msg("Frag invalid addr: %llx len: %u\n", + (unsigned long long)addr, len); return false; } if (!umem->unaligned_mode && addr % umem->frame_size + len > umem->frame_size) { - ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n", addr, len); + ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n", + (unsigned long long)addr, len); return false; } @@ -1029,7 +1032,8 @@ static int complete_pkts(struct xsk_socket_info *xsk, int batch_size) u64 addr = *xsk_ring_cons__comp_addr(&xsk->umem->cq, idx + rcvd - 1); ksft_print_msg("[%s] Too many packets completed\n", __func__); - ksft_print_msg("Last completion address: %llx\n", addr); + ksft_print_msg("Last completion address: %llx\n", + (unsigned long long)addr); return TEST_FAILURE; } @@ -1513,8 +1517,9 @@ static int validate_tx_invalid_descs(struct ifobject *ifobject) } if (stats.tx_invalid_descs != ifobject->xsk->pkt_stream->nb_pkts / 2) { - ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%u] expected [%u]\n", - __func__, stats.tx_invalid_descs, + ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%llu] expected [%u]\n", + __func__, + (unsigned long long)stats.tx_invalid_descs, ifobject->xsk->pkt_stream->nb_pkts); return TEST_FAILURE; } -- cgit From 3feb263bb516ee7e1da0acd22b15afbb9a7daa19 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Nov 2023 16:26:36 -0800 Subject: bpf: handle ldimm64 properly in check_cfg() ldimm64 instructions are 16-byte long, and so have to be handled appropriately in check_cfg(), just like the rest of BPF verifier does. This has implications in three places: - when determining next instruction for non-jump instructions; - when determining next instruction for callback address ldimm64 instructions (in visit_func_call_insn()); - when checking for unreachable instructions, where second half of ldimm64 is expected to be unreachable; We take this also as an opportunity to report jump into the middle of ldimm64. And adjust few test_verifier tests accordingly. 
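For reference, the double-word load occupies two consecutive instruction slots, roughly:

/* BPF_LD_IMM64(dst, imm64) expands to two struct bpf_insn entries:
 *   insn[t]	code = BPF_LD | BPF_DW | BPF_IMM	imm = lower 32 bits of imm64
 *   insn[t+1]	code = 0 (pseudo second half)		imm = upper 32 bits of imm64
 * so the fall-through successor of insn t is t + 2, and any jump that lands on
 * t + 1 targets the middle of the instruction.
 */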
Acked-by: Eduard Zingerman Reported-by: Hao Sun Fixes: 475fb78fbf48 ("bpf: verifier (add branch/goto checks)") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231110002638.4168352-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 ++++++-- kernel/bpf/verifier.c | 27 ++++++++++++++++++------- tools/testing/selftests/bpf/verifier/ld_imm64.c | 8 ++++---- 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b4825d3cdb29..35bff17396c0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -909,10 +909,14 @@ bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size) aux->ctx_field_size = size; } +static bool bpf_is_ldimm64(const struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW); +} + static inline bool bpf_pseudo_func(const struct bpf_insn *insn) { - return insn->code == (BPF_LD | BPF_IMM | BPF_DW) && - insn->src_reg == BPF_PSEUDO_FUNC; + return bpf_is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC; } struct bpf_prog_ops { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bd1c42eb540f..b87715b364fd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15439,15 +15439,16 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, struct bpf_verifier_env *env, bool visit_callee) { - int ret; + int ret, insn_sz; - ret = push_insn(t, t + 1, FALLTHROUGH, env, false); + insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1; + ret = push_insn(t, t + insn_sz, FALLTHROUGH, env, false); if (ret) return ret; - mark_prune_point(env, t + 1); + mark_prune_point(env, t + insn_sz); /* when we exit from subprog, we need to record non-linear history */ - mark_jmp_point(env, t + 1); + mark_jmp_point(env, t + insn_sz); if (visit_callee) { mark_prune_point(env, t); @@ -15469,15 +15470,17 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, static int visit_insn(int t, struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t]; - int ret, off; + int ret, off, insn_sz; if (bpf_pseudo_func(insn)) return visit_func_call_insn(t, insns, env, true); /* All non-branch instructions have a single fall-through edge. */ if (BPF_CLASS(insn->code) != BPF_JMP && - BPF_CLASS(insn->code) != BPF_JMP32) - return push_insn(t, t + 1, FALLTHROUGH, env, false); + BPF_CLASS(insn->code) != BPF_JMP32) { + insn_sz = bpf_is_ldimm64(insn) ? 
2 : 1; + return push_insn(t, t + insn_sz, FALLTHROUGH, env, false); + } switch (BPF_OP(insn->code)) { case BPF_EXIT: @@ -15607,11 +15610,21 @@ walk_cfg: } for (i = 0; i < insn_cnt; i++) { + struct bpf_insn *insn = &env->prog->insnsi[i]; + if (insn_state[i] != EXPLORED) { verbose(env, "unreachable insn %d\n", i); ret = -EINVAL; goto err_free; } + if (bpf_is_ldimm64(insn)) { + if (insn_state[i + 1] != 0) { + verbose(env, "jump into the middle of ldimm64 insn %d\n", i); + ret = -EINVAL; + goto err_free; + } + i++; /* skip second half of ldimm64 */ + } } ret = 0; /* cfg looks good */ diff --git a/tools/testing/selftests/bpf/verifier/ld_imm64.c b/tools/testing/selftests/bpf/verifier/ld_imm64.c index f9297900cea6..78f19c255f20 100644 --- a/tools/testing/selftests/bpf/verifier/ld_imm64.c +++ b/tools/testing/selftests/bpf/verifier/ld_imm64.c @@ -9,8 +9,8 @@ BPF_MOV64_IMM(BPF_REG_0, 2), BPF_EXIT_INSN(), }, - .errstr = "invalid BPF_LD_IMM insn", - .errstr_unpriv = "R1 pointer comparison", + .errstr = "jump into the middle of ldimm64 insn 1", + .errstr_unpriv = "jump into the middle of ldimm64 insn 1", .result = REJECT, }, { @@ -23,8 +23,8 @@ BPF_LD_IMM64(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - .errstr = "invalid BPF_LD_IMM insn", - .errstr_unpriv = "R1 pointer comparison", + .errstr = "jump into the middle of ldimm64 insn 1", + .errstr_unpriv = "jump into the middle of ldimm64 insn 1", .result = REJECT, }, { -- cgit From 4bb7ea946a370707315ab774432963ce47291946 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Nov 2023 16:26:37 -0800 Subject: bpf: fix precision backtracking instruction iteration Fix an edge case in __mark_chain_precision() which prematurely stops backtracking instructions in a state if it happens that state's first and last instruction indexes are the same. This situations doesn't necessarily mean that there were no instructions simulated in a state, but rather that we starting from the instruction, jumped around a bit, and then ended up at the same instruction before checkpointing or marking precision. To distinguish between these two possible situations, we need to consult jump history. If it's empty or contain a single record "bridging" parent state and first instruction of processed state, then we indeed backtracked all instructions in this state. But if history is not empty, we are definitely not done yet. Move this logic inside get_prev_insn_idx() to contain it more nicely. Use -ENOENT return code to denote "we are out of instructions" situation. This bug was exposed by verifier_loop1.c's bounded_recursion subtest, once the next fix in this patch set is applied. Acked-by: Eduard Zingerman Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231110002638.4168352-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b87715b364fd..484c742f733e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3516,12 +3516,29 @@ static int push_jmp_history(struct bpf_verifier_env *env, /* Backtrack one insn at a time. If idx is not at the top of recorded * history then previous instruction came from straight line execution. + * Return -ENOENT if we exhausted all instructions within given state. 
+ * + * It's legal to have a bit of a looping with the same starting and ending + * insn index within the same state, e.g.: 3->4->5->3, so just because current + * instruction index is the same as state's first_idx doesn't mean we are + * done. If there is still some jump history left, we should keep going. We + * need to take into account that we might have a jump history between given + * state's parent and itself, due to checkpointing. In this case, we'll have + * history entry recording a jump from last instruction of parent state and + * first instruction of given state. */ static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, u32 *history) { u32 cnt = *history; + if (i == st->first_insn_idx) { + if (cnt == 0) + return -ENOENT; + if (cnt == 1 && st->jmp_history[0].idx == i) + return -ENOENT; + } + if (cnt && st->jmp_history[cnt - 1].idx == i) { i = st->jmp_history[cnt - 1].prev_idx; (*history)--; @@ -4401,10 +4418,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) * Nothing to be tracked further in the parent state. */ return 0; - if (i == first_idx) - break; subseq_idx = i; i = get_prev_insn_idx(st, i, &history); + if (i == -ENOENT) + break; if (i >= env->prog->len) { /* This can happen if backtracking reached insn 0 * and there are still reg_mask or stack_mask -- cgit From 62ccdb11d3c63dc697dea1fd92b3496fe43dcc1e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Nov 2023 16:26:38 -0800 Subject: selftests/bpf: add edge case backtracking logic test Add a dedicated selftests to try to set up conditions to have a state with same first and last instruction index, but it actually is a loop 3->4->1->2->3. This confuses mark_chain_precision() if verifier doesn't take into account jump history. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231110002638.4168352-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_precision.c | 40 ++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_precision.c b/tools/testing/selftests/bpf/progs/verifier_precision.c index 193c0f8272d0..6b564d4c0986 100644 --- a/tools/testing/selftests/bpf/progs/verifier_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_precision.c @@ -91,3 +91,43 @@ __naked int bpf_end_bswap(void) } #endif /* v4 instruction */ + +SEC("?raw_tp") +__success __log_level(2) +/* + * Without the bug fix there will be no history between "last_idx 3 first_idx 3" + * and "parent state regs=" lines. "R0_w=6" parts are here to help anchor + * expected log messages to the one specific mark_chain_precision operation. + * + * This is quite fragile: if verifier checkpointing heuristic changes, this + * might need adjusting. 
+ */ +__msg("2: (07) r0 += 1 ; R0_w=6") +__msg("3: (35) if r0 >= 0xa goto pc+1") +__msg("mark_precise: frame0: last_idx 3 first_idx 3 subseq_idx -1") +__msg("mark_precise: frame0: regs=r0 stack= before 2: (07) r0 += 1") +__msg("mark_precise: frame0: regs=r0 stack= before 1: (07) r0 += 1") +__msg("mark_precise: frame0: regs=r0 stack= before 4: (05) goto pc-4") +__msg("mark_precise: frame0: regs=r0 stack= before 3: (35) if r0 >= 0xa goto pc+1") +__msg("mark_precise: frame0: parent state regs= stack=: R0_rw=P4") +__msg("3: R0_w=6") +__naked int state_loop_first_last_equal(void) +{ + asm volatile ( + "r0 = 0;" + "l0_%=:" + "r0 += 1;" + "r0 += 1;" + /* every few iterations we'll have a checkpoint here with + * first_idx == last_idx, potentially confusing precision + * backtracking logic + */ + "if r0 >= 10 goto l1_%=;" /* checkpoint + mark_precise */ + "goto l0_%=;" + "l1_%=:" + "exit;" + ::: __clobber_common + ); +} + +char _license[] SEC("license") = "GPL"; -- cgit From 10e14e9652bf9e8104151bfd9200433083deae3d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Nov 2023 22:14:10 -0800 Subject: bpf: fix control-flow graph checking in privileged mode When BPF program is verified in privileged mode, BPF verifier allows bounded loops. This means that from CFG point of view there are definitely some back-edges. Original commit adjusted check_cfg() logic to not detect back-edges in control flow graph if they are resulting from conditional jumps, which the idea that subsequent full BPF verification process will determine whether such loops are bounded or not, and either accept or reject the BPF program. At least that's my reading of the intent. Unfortunately, the implementation of this idea doesn't work correctly in all possible situations. Conditional jump might not result in immediate back-edge, but just a few unconditional instructions later we can arrive at back-edge. In such situations check_cfg() would reject BPF program even in privileged mode, despite it might be bounded loop. Next patch adds one simple program demonstrating such scenario. To keep things simple, instead of trying to detect back edges in privileged mode, just assume every back edge is valid and let subsequent BPF verification prove or reject bounded loops. Note a few test changes. For unknown reason, we have a few tests that are specified to detect a back-edge in a privileged mode, but looking at their code it seems like the right outcome is passing check_cfg() and letting subsequent verification to make a decision about bounded or not bounded looping. Bounded recursion case is also interesting. The example should pass, as recursion is limited to just a few levels and so we never reach maximum number of nested frames and never exhaust maximum stack depth. But the way that max stack depth logic works today it falsely detects this as exceeding max nested frame count. This patch series doesn't attempt to fix this orthogonal problem, so we just adjust expected verifier failure. 
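One shape that trips the old logic is a bounded loop whose back-edge is a plain fall-through rather than the conditional jump itself; in the selftests' inline-asm style it looks roughly like this (a sketch, not the actual conditional_loop2 test):

SEC("socket")
__naked void fallthrough_backedge_sketch(void)
{
	asm volatile (
	"r0 = 0;"
	"goto l1_%=;"
"l0_%=:"
	"r0 += 1;"			/* l0 falls through into l1, which is still on the DFS stack */
"l1_%=:"
	"r0 += 1;"
	"if r0 < 10 goto l0_%=;"	/* bounded conditional loop back to l0 */
	"exit;"
	::: __clobber_common
	);
}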
Suggested-by: Alexei Starovoitov Fixes: 2589726d12a1 ("bpf: introduce bounded loops") Reported-by: Hao Sun Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231110061412.2995786-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 23 ++++++++-------------- .../testing/selftests/bpf/progs/verifier_loops1.c | 9 ++++++--- tools/testing/selftests/bpf/verifier/calls.c | 6 +++--- 3 files changed, 17 insertions(+), 21 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 484c742f733e..a2267d5ed14e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15403,8 +15403,7 @@ enum { * w - next instruction * e - edge */ -static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, - bool loop_ok) +static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) { int *insn_stack = env->cfg.insn_stack; int *insn_state = env->cfg.insn_state; @@ -15436,7 +15435,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, insn_stack[env->cfg.cur_stack++] = w; return KEEP_EXPLORING; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - if (loop_ok && env->bpf_capable) + if (env->bpf_capable) return DONE_EXPLORING; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); @@ -15459,7 +15458,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, int ret, insn_sz; insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1; - ret = push_insn(t, t + insn_sz, FALLTHROUGH, env, false); + ret = push_insn(t, t + insn_sz, FALLTHROUGH, env); if (ret) return ret; @@ -15469,12 +15468,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, if (visit_callee) { mark_prune_point(env, t); - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env, - /* It's ok to allow recursion from CFG point of - * view. __check_func_call() will do the actual - * check. - */ - bpf_pseudo_func(insns + t)); + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); } return ret; } @@ -15496,7 +15490,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env) if (BPF_CLASS(insn->code) != BPF_JMP && BPF_CLASS(insn->code) != BPF_JMP32) { insn_sz = bpf_is_ldimm64(insn) ? 
2 : 1; - return push_insn(t, t + insn_sz, FALLTHROUGH, env, false); + return push_insn(t, t + insn_sz, FALLTHROUGH, env); } switch (BPF_OP(insn->code)) { @@ -15543,8 +15537,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env) off = insn->imm; /* unconditional jump with single edge */ - ret = push_insn(t, t + off + 1, FALLTHROUGH, env, - true); + ret = push_insn(t, t + off + 1, FALLTHROUGH, env); if (ret) return ret; @@ -15557,11 +15550,11 @@ static int visit_insn(int t, struct bpf_verifier_env *env) /* conditional jump with two edges */ mark_prune_point(env, t); - ret = push_insn(t, t + 1, FALLTHROUGH, env, true); + ret = push_insn(t, t + 1, FALLTHROUGH, env); if (ret) return ret; - return push_insn(t, t + insn->off + 1, BRANCH, env, true); + return push_insn(t, t + insn->off + 1, BRANCH, env); } } diff --git a/tools/testing/selftests/bpf/progs/verifier_loops1.c b/tools/testing/selftests/bpf/progs/verifier_loops1.c index 5bc86af80a9a..71735dbf33d4 100644 --- a/tools/testing/selftests/bpf/progs/verifier_loops1.c +++ b/tools/testing/selftests/bpf/progs/verifier_loops1.c @@ -75,9 +75,10 @@ l0_%=: r0 += 1; \ " ::: __clobber_all); } -SEC("tracepoint") +SEC("socket") __description("bounded loop, start in the middle") -__failure __msg("back-edge") +__success +__failure_unpriv __msg_unpriv("back-edge") __naked void loop_start_in_the_middle(void) { asm volatile (" \ @@ -136,7 +137,9 @@ l0_%=: exit; \ SEC("tracepoint") __description("bounded recursion") -__failure __msg("back-edge") +__failure +/* verifier limitation in detecting max stack depth */ +__msg("the call stack of 8 frames is too deep !") __naked void bounded_recursion(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 1bdf2b43e49e..3d5cd51071f0 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -442,7 +442,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_TRACEPOINT, - .errstr = "back-edge from insn 0 to 0", + .errstr = "the call stack of 9 frames is too deep", .result = REJECT, }, { @@ -799,7 +799,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_TRACEPOINT, - .errstr = "back-edge", + .errstr = "the call stack of 9 frames is too deep", .result = REJECT, }, { @@ -811,7 +811,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_TRACEPOINT, - .errstr = "back-edge", + .errstr = "the call stack of 9 frames is too deep", .result = REJECT, }, { -- cgit From e2e57d637aa5da0a2f49d83ad44e9febf95df7b4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Nov 2023 22:14:11 -0800 Subject: selftests/bpf: add more test cases for check_cfg() Add a few more simple cases to validate proper privileged vs unprivileged loop detection behavior. conditional_loop2 is the one reported by Hao Sun that triggered this set of fixes. 
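The expectations in these tests are encoded with the selftest loader annotations visible in
the diff below; roughly (comments are illustrative):

    SEC("socket")                               /* socket filter programs can, with unprivileged
                                                 * BPF enabled, be loaded without privileges */
    __success                                   /* privileged load: accepted, check_cfg() no
                                                 * longer rejects the back-edge */
    __failure_unpriv __msg_unpriv("back-edge")  /* unprivileged load: check_cfg() still reports
                                                 * a back-edge */

This is also why the previous patch switched the "bounded loop, start in the middle" test
from SEC("tracepoint") to SEC("socket"): it lets a single test exercise both the privileged
and the unprivileged outcome.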
Acked-by: Eduard Zingerman Suggested-by: Hao Sun Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231110061412.2995786-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_cfg.c | 62 ++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_cfg.c b/tools/testing/selftests/bpf/progs/verifier_cfg.c index df7697b94007..c1f55e1d80a4 100644 --- a/tools/testing/selftests/bpf/progs/verifier_cfg.c +++ b/tools/testing/selftests/bpf/progs/verifier_cfg.c @@ -97,4 +97,66 @@ l0_%=: r2 = r0; \ " ::: __clobber_all); } +SEC("socket") +__description("conditional loop (2)") +__success +__failure_unpriv __msg_unpriv("back-edge from insn 10 to 11") +__naked void conditional_loop2(void) +{ + asm volatile (" \ + r9 = 2 ll; \ + r3 = 0x20 ll; \ + r4 = 0x35 ll; \ + r8 = r4; \ + goto l1_%=; \ +l0_%=: r9 -= r3; \ + r9 -= r4; \ + r9 -= r8; \ +l1_%=: r8 += r4; \ + if r8 < 0x64 goto l0_%=; \ + r0 = r9; \ + exit; \ +" ::: __clobber_all); +} + +SEC("socket") +__description("unconditional loop after conditional jump") +__failure __msg("infinite loop detected") +__failure_unpriv __msg_unpriv("back-edge from insn 3 to 2") +__naked void uncond_loop_after_cond_jmp(void) +{ + asm volatile (" \ + r0 = 0; \ + if r0 > 0 goto l1_%=; \ +l0_%=: r0 = 1; \ + goto l0_%=; \ +l1_%=: exit; \ +" ::: __clobber_all); +} + + +__naked __noinline __used +static unsigned long never_ending_subprog() +{ + asm volatile (" \ + r0 = r1; \ + goto -1; \ +" ::: __clobber_all); +} + +SEC("socket") +__description("unconditional loop after conditional jump") +/* infinite loop is detected *after* check_cfg() */ +__failure __msg("infinite loop detected") +__naked void uncond_loop_in_subprog_after_cond_jmp(void) +{ + asm volatile (" \ + r0 = 0; \ + if r0 > 0 goto l1_%=; \ +l0_%=: r0 += 1; \ + call never_ending_subprog; \ +l1_%=: exit; \ +" ::: __clobber_all); +} + char _license[] SEC("license") = "GPL"; -- cgit From 8a4f030dbced6fc255cbe67b2d0a129947e18493 Mon Sep 17 00:00:00 2001 From: Yuran Pereira Date: Wed, 8 Nov 2023 02:18:36 +0530 Subject: ptp: Fixes a null pointer dereference in ptp_ioctl Syzkaller found a null pointer dereference in ptp_ioctl originating from the lack of a null check for tsevq. ``` general protection fault, probably for non-canonical address 0xdffffc000000020b: 0000 [#1] PREEMPT SMP KASAN KASAN: probably user-memory-access in range [0x0000000000001058-0x000000000000105f] CPU: 0 PID: 5053 Comm: syz-executor353 Not tainted 6.6.0-syzkaller-10396-g4652b8e4f3ff #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023 RIP: 0010:ptp_ioctl+0xcb7/0x1d10 drivers/ptp/ptp_chardev.c:476 ... Call Trace: posix_clock_ioctl+0xf8/0x160 kernel/time/posix-clock.c:86 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:871 [inline] __se_sys_ioctl fs/ioctl.c:857 [inline] __x64_sys_ioctl+0x18f/0x210 fs/ioctl.c:857 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x3f/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b ``` This patch fixes the issue by adding a check for tsevq and ensuring ptp_ioctl returns with an error if tsevq is null. Reported-by: syzbot+8a78ecea7ac1a2ea26e5@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=8a78ecea7ac1a2ea26e5 Fixes: c5a445b1e934 ("ptp: support event queue reader channel masks") Signed-off-by: Yuran Pereira Reviewed-by: Przemek Kitszel Signed-off-by: David S. 
Miller --- drivers/ptp/ptp_chardev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 3f7a74788802..e95a6ed130ef 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -176,6 +176,8 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, int enable, err = 0; tsevq = pccontext->private_clkdata; + if (!tsevq) + return -EINVAL; switch (cmd) { -- cgit From 871019b22d1bcc9fab2d1feba1b9a564acbb6e99 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 8 Nov 2023 13:13:25 -0800 Subject: net: set SOCK_RCU_FREE before inserting socket into hashtable We've started to see the following kernel traces: WARNING: CPU: 83 PID: 0 at net/core/filter.c:6641 sk_lookup+0x1bd/0x1d0 Call Trace: __bpf_skc_lookup+0x10d/0x120 bpf_sk_lookup+0x48/0xd0 bpf_sk_lookup_tcp+0x19/0x20 bpf_prog_+0x37c/0x16a3 cls_bpf_classify+0x205/0x2e0 tcf_classify+0x92/0x160 __netif_receive_skb_core+0xe52/0xf10 __netif_receive_skb_list_core+0x96/0x2b0 napi_complete_done+0x7b5/0xb70 _poll+0x94/0xb0 net_rx_action+0x163/0x1d70 __do_softirq+0xdc/0x32e asm_call_irq_on_stack+0x12/0x20 do_softirq_own_stack+0x36/0x50 do_softirq+0x44/0x70 __inet_hash can race with lockless (rcu) readers on the other cpus: __inet_hash __sk_nulls_add_node_rcu <- (bpf triggers here) sock_set_flag(SOCK_RCU_FREE) Let's move the SOCK_RCU_FREE part up a bit, before we are inserting the socket into hashtables. Note, that the race is really harmless; the bpf callers are handling this situation (where listener socket doesn't have SOCK_RCU_FREE set) correctly, so the only annoyance is a WARN_ONCE. More details from Eric regarding SOCK_RCU_FREE timeline: Commit 3b24d854cb35 ("tcp/dccp: do not touch listener sk_refcnt under synflood") added SOCK_RCU_FREE. At that time, the precise location of sock_set_flag(sk, SOCK_RCU_FREE) did not matter, because the thread calling __inet_hash() owns a reference on sk. SOCK_RCU_FREE was only tested at dismantle time. Commit 6acc9b432e67 ("bpf: Add helper to retrieve socket in BPF") started checking SOCK_RCU_FREE _after_ the lookup to infer whether the refcount has been taken care of. Fixes: 6acc9b432e67 ("bpf: Add helper to retrieve socket in BPF") Reviewed-by: Eric Dumazet Signed-off-by: Stanislav Fomichev Reviewed-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/inet_hashtables.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 598c1b114d2c..a532f749e477 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -751,12 +751,12 @@ int __inet_hash(struct sock *sk, struct sock *osk) if (err) goto unlock; } + sock_set_flag(sk, SOCK_RCU_FREE); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && sk->sk_family == AF_INET6) __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head); else __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head); - sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: spin_unlock(&ilb2->lock); -- cgit From cbe9e68e1e0f965fd3b1caf975eb29083c65e6b0 Mon Sep 17 00:00:00 2001 From: Ravi Gunasekaran Date: Fri, 10 Nov 2023 14:57:49 +0530 Subject: MAINTAINERS: net: Update reviewers for TI's Ethernet drivers Grygorii is no longer associated with TI and messages addressed to him bounce. Add Siddharth, Roger and myself as reviewers. Signed-off-by: Ravi Gunasekaran Signed-off-by: David S. 
Miller --- MAINTAINERS | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 350d00657f6b..e3acb36989f0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21769,7 +21769,9 @@ F: Documentation/devicetree/bindings/counter/ti-eqep.yaml F: drivers/counter/ti-eqep.c TI ETHERNET SWITCH DRIVER (CPSW) -R: Grygorii Strashko +R: Siddharth Vadapalli +R: Ravi Gunasekaran +R: Roger Quadros L: linux-omap@vger.kernel.org L: netdev@vger.kernel.org S: Maintained -- cgit From 18f039428c7df183b09c69ebf10ffd4e521035d2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Nov 2023 15:22:41 +0000 Subject: ipvlan: add ipvlan_route_v6_outbound() helper Inspired by syzbot reports using a stack of multiple ipvlan devices. Reduce stack size needed in ipvlan_process_v6_outbound() by moving the flowi6 struct used for the route lookup in an non inlined helper. ipvlan_route_v6_outbound() needs 120 bytes on the stack, immediately reclaimed. Also make sure ipvlan_process_v4_outbound() is not inlined. We might also have to lower MAX_NEST_DEV, because only syzbot uses setups with more than four stacked devices. BUG: TASK stack guard page was hit at ffffc9000e803ff8 (stack is ffffc9000e804000..ffffc9000e808000) stack guard page: 0000 [#1] SMP KASAN CPU: 0 PID: 13442 Comm: syz-executor.4 Not tainted 6.1.52-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023 RIP: 0010:kasan_check_range+0x4/0x2a0 mm/kasan/generic.c:188 Code: 48 01 c6 48 89 c7 e8 db 4e c1 03 31 c0 5d c3 cc 0f 0b eb 02 0f 0b b8 ea ff ff ff 5d c3 cc 00 00 cc cc 00 00 cc cc 55 48 89 e5 <41> 57 41 56 41 55 41 54 53 b0 01 48 85 f6 0f 84 a4 01 00 00 48 89 RSP: 0018:ffffc9000e804000 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff817e5bf2 RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffffffff887c6568 RBP: ffffc9000e804000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: dffffc0000000001 R12: 1ffff92001d0080c R13: dffffc0000000000 R14: ffffffff87e6b100 R15: 0000000000000000 FS: 00007fd0c55826c0(0000) GS:ffff8881f6800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffc9000e803ff8 CR3: 0000000170ef7000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: <#DF> [] __kasan_check_read+0x11/0x20 mm/kasan/shadow.c:31 [] instrument_atomic_read include/linux/instrumented.h:72 [inline] [] _test_bit include/asm-generic/bitops/instrumented-non-atomic.h:141 [inline] [] cpumask_test_cpu include/linux/cpumask.h:506 [inline] [] cpu_online include/linux/cpumask.h:1092 [inline] [] trace_lock_acquire include/trace/events/lock.h:24 [inline] [] lock_acquire+0xe2/0x590 kernel/locking/lockdep.c:5632 [] rcu_lock_acquire+0x2e/0x40 include/linux/rcupdate.h:306 [] rcu_read_lock include/linux/rcupdate.h:747 [inline] [] ip6_pol_route+0x15d/0x1440 net/ipv6/route.c:2221 [] ip6_pol_route_output+0x50/0x80 net/ipv6/route.c:2606 [] pol_lookup_func include/net/ip6_fib.h:584 [inline] [] fib6_rule_lookup+0x265/0x620 net/ipv6/fib6_rules.c:116 [] ip6_route_output_flags_noref+0x2d9/0x3a0 net/ipv6/route.c:2638 [] ip6_route_output_flags+0xca/0x340 net/ipv6/route.c:2651 [] ip6_route_output include/net/ip6_route.h:100 [inline] [] ipvlan_process_v6_outbound drivers/net/ipvlan/ipvlan_core.c:473 [inline] [] ipvlan_process_outbound drivers/net/ipvlan/ipvlan_core.c:529 [inline] [] ipvlan_xmit_mode_l3 
drivers/net/ipvlan/ipvlan_core.c:602 [inline] [] ipvlan_queue_xmit+0xc33/0x1be0 drivers/net/ipvlan/ipvlan_core.c:677 [] ipvlan_start_xmit+0x49/0x100 drivers/net/ipvlan/ipvlan_main.c:229 [] netdev_start_xmit include/linux/netdevice.h:4966 [inline] [] xmit_one net/core/dev.c:3644 [inline] [] dev_hard_start_xmit+0x320/0x980 net/core/dev.c:3660 [] __dev_queue_xmit+0x16b2/0x3370 net/core/dev.c:4324 [] dev_queue_xmit include/linux/netdevice.h:3067 [inline] [] neigh_hh_output include/net/neighbour.h:529 [inline] [] neigh_output include/net/neighbour.h:543 [inline] [] ip6_finish_output2+0x160d/0x1ae0 net/ipv6/ip6_output.c:139 [] __ip6_finish_output net/ipv6/ip6_output.c:200 [inline] [] ip6_finish_output+0x6c6/0xb10 net/ipv6/ip6_output.c:211 [] NF_HOOK_COND include/linux/netfilter.h:298 [inline] [] ip6_output+0x2bc/0x3d0 net/ipv6/ip6_output.c:232 [] dst_output include/net/dst.h:444 [inline] [] ip6_local_out+0x10f/0x140 net/ipv6/output_core.c:161 [] ipvlan_process_v6_outbound drivers/net/ipvlan/ipvlan_core.c:483 [inline] [] ipvlan_process_outbound drivers/net/ipvlan/ipvlan_core.c:529 [inline] [] ipvlan_xmit_mode_l3 drivers/net/ipvlan/ipvlan_core.c:602 [inline] [] ipvlan_queue_xmit+0x1174/0x1be0 drivers/net/ipvlan/ipvlan_core.c:677 [] ipvlan_start_xmit+0x49/0x100 drivers/net/ipvlan/ipvlan_main.c:229 [] netdev_start_xmit include/linux/netdevice.h:4966 [inline] [] xmit_one net/core/dev.c:3644 [inline] [] dev_hard_start_xmit+0x320/0x980 net/core/dev.c:3660 [] __dev_queue_xmit+0x16b2/0x3370 net/core/dev.c:4324 [] dev_queue_xmit include/linux/netdevice.h:3067 [inline] [] neigh_hh_output include/net/neighbour.h:529 [inline] [] neigh_output include/net/neighbour.h:543 [inline] [] ip6_finish_output2+0x160d/0x1ae0 net/ipv6/ip6_output.c:139 [] __ip6_finish_output net/ipv6/ip6_output.c:200 [inline] [] ip6_finish_output+0x6c6/0xb10 net/ipv6/ip6_output.c:211 [] NF_HOOK_COND include/linux/netfilter.h:298 [inline] [] ip6_output+0x2bc/0x3d0 net/ipv6/ip6_output.c:232 [] dst_output include/net/dst.h:444 [inline] [] ip6_local_out+0x10f/0x140 net/ipv6/output_core.c:161 [] ipvlan_process_v6_outbound drivers/net/ipvlan/ipvlan_core.c:483 [inline] [] ipvlan_process_outbound drivers/net/ipvlan/ipvlan_core.c:529 [inline] [] ipvlan_xmit_mode_l3 drivers/net/ipvlan/ipvlan_core.c:602 [inline] [] ipvlan_queue_xmit+0x1174/0x1be0 drivers/net/ipvlan/ipvlan_core.c:677 [] ipvlan_start_xmit+0x49/0x100 drivers/net/ipvlan/ipvlan_main.c:229 [] netdev_start_xmit include/linux/netdevice.h:4966 [inline] [] xmit_one net/core/dev.c:3644 [inline] [] dev_hard_start_xmit+0x320/0x980 net/core/dev.c:3660 [] __dev_queue_xmit+0x16b2/0x3370 net/core/dev.c:4324 [] dev_queue_xmit include/linux/netdevice.h:3067 [inline] [] neigh_hh_output include/net/neighbour.h:529 [inline] [] neigh_output include/net/neighbour.h:543 [inline] [] ip6_finish_output2+0x160d/0x1ae0 net/ipv6/ip6_output.c:139 [] __ip6_finish_output net/ipv6/ip6_output.c:200 [inline] [] ip6_finish_output+0x6c6/0xb10 net/ipv6/ip6_output.c:211 [] NF_HOOK_COND include/linux/netfilter.h:298 [inline] [] ip6_output+0x2bc/0x3d0 net/ipv6/ip6_output.c:232 [] dst_output include/net/dst.h:444 [inline] [] ip6_local_out+0x10f/0x140 net/ipv6/output_core.c:161 [] ipvlan_process_v6_outbound drivers/net/ipvlan/ipvlan_core.c:483 [inline] [] ipvlan_process_outbound drivers/net/ipvlan/ipvlan_core.c:529 [inline] [] ipvlan_xmit_mode_l3 drivers/net/ipvlan/ipvlan_core.c:602 [inline] [] ipvlan_queue_xmit+0x1174/0x1be0 drivers/net/ipvlan/ipvlan_core.c:677 [] ipvlan_start_xmit+0x49/0x100 
drivers/net/ipvlan/ipvlan_main.c:229 [] netdev_start_xmit include/linux/netdevice.h:4966 [inline] [] xmit_one net/core/dev.c:3644 [inline] [] dev_hard_start_xmit+0x320/0x980 net/core/dev.c:3660 [] __dev_queue_xmit+0x16b2/0x3370 net/core/dev.c:4324 [] dev_queue_xmit include/linux/netdevice.h:3067 [inline] [] neigh_hh_output include/net/neighbour.h:529 [inline] [] neigh_output include/net/neighbour.h:543 [inline] [] ip6_finish_output2+0x160d/0x1ae0 net/ipv6/ip6_output.c:139 [] __ip6_finish_output net/ipv6/ip6_output.c:200 [inline] [] ip6_finish_output+0x6c6/0xb10 net/ipv6/ip6_output.c:211 [] NF_HOOK_COND include/linux/netfilter.h:298 [inline] [] ip6_output+0x2bc/0x3d0 net/ipv6/ip6_output.c:232 [] dst_output include/net/dst.h:444 [inline] [] ip6_local_out+0x10f/0x140 net/ipv6/output_core.c:161 [] ipvlan_process_v6_outbound drivers/net/ipvlan/ipvlan_core.c:483 [inline] [] ipvlan_process_outbound drivers/net/ipvlan/ipvlan_core.c:529 [inline] [] ipvlan_xmit_mode_l3 drivers/net/ipvlan/ipvlan_core.c:602 [inline] [] ipvlan_queue_xmit+0x1174/0x1be0 drivers/net/ipvlan/ipvlan_core.c:677 [] ipvlan_start_xmit+0x49/0x100 drivers/net/ipvlan/ipvlan_main.c:229 [] netdev_start_xmit include/linux/netdevice.h:4966 [inline] [] xmit_one net/core/dev.c:3644 [inline] [] dev_hard_start_xmit+0x320/0x980 net/core/dev.c:3660 [] __dev_queue_xmit+0x16b2/0x3370 net/core/dev.c:4324 [] dev_queue_xmit include/linux/netdevice.h:3067 [inline] [] neigh_resolve_output+0x64e/0x750 net/core/neighbour.c:1560 [] neigh_output include/net/neighbour.h:545 [inline] [] ip6_finish_output2+0x1643/0x1ae0 net/ipv6/ip6_output.c:139 [] __ip6_finish_output net/ipv6/ip6_output.c:200 [inline] [] ip6_finish_output+0x6c6/0xb10 net/ipv6/ip6_output.c:211 [] NF_HOOK_COND include/linux/netfilter.h:298 [inline] [] ip6_output+0x2bc/0x3d0 net/ipv6/ip6_output.c:232 [] dst_output include/net/dst.h:444 [inline] [] NF_HOOK include/linux/netfilter.h:309 [inline] [] ip6_xmit+0x11a4/0x1b20 net/ipv6/ip6_output.c:352 [] sctp_v6_xmit+0x9ae/0x1230 net/sctp/ipv6.c:250 [] sctp_packet_transmit+0x25de/0x2bc0 net/sctp/output.c:653 [] sctp_packet_singleton+0x202/0x310 net/sctp/outqueue.c:783 [] sctp_outq_flush_ctrl net/sctp/outqueue.c:914 [inline] [] sctp_outq_flush+0x661/0x3d40 net/sctp/outqueue.c:1212 [] sctp_outq_uncork+0x79/0xb0 net/sctp/outqueue.c:764 [] sctp_side_effects net/sctp/sm_sideeffect.c:1199 [inline] [] sctp_do_sm+0x55c0/0x5c30 net/sctp/sm_sideeffect.c:1170 [] sctp_primitive_ASSOCIATE+0x97/0xc0 net/sctp/primitive.c:73 [] sctp_sendmsg_to_asoc+0xf62/0x17b0 net/sctp/socket.c:1839 [] sctp_sendmsg+0x212e/0x33b0 net/sctp/socket.c:2029 [] inet_sendmsg+0x149/0x310 net/ipv4/af_inet.c:849 [] sock_sendmsg_nosec net/socket.c:716 [inline] [] sock_sendmsg net/socket.c:736 [inline] [] ____sys_sendmsg+0x572/0x8c0 net/socket.c:2504 [] ___sys_sendmsg net/socket.c:2558 [inline] [] __sys_sendmsg+0x271/0x360 net/socket.c:2587 [] __do_sys_sendmsg net/socket.c:2596 [inline] [] __se_sys_sendmsg net/socket.c:2594 [inline] [] __x64_sys_sendmsg+0x7f/0x90 net/socket.c:2594 [] do_syscall_x64 arch/x86/entry/common.c:51 [inline] [] do_syscall_64+0x53/0x80 arch/x86/entry/common.c:84 [] entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: 2ad7bf363841 ("ipvlan: Initial check-in of the IPVLAN driver.") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Mahesh Bandewar Cc: Willem de Bruijn Reviewed-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- drivers/net/ipvlan/ipvlan_core.c | 41 ++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index 21e9cac73121..2d5b021b4ea6 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -411,7 +411,7 @@ struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port, void *lyr3h, return addr; } -static int ipvlan_process_v4_outbound(struct sk_buff *skb) +static noinline_for_stack int ipvlan_process_v4_outbound(struct sk_buff *skb) { const struct iphdr *ip4h = ip_hdr(skb); struct net_device *dev = skb->dev; @@ -453,13 +453,11 @@ out: } #if IS_ENABLED(CONFIG_IPV6) -static int ipvlan_process_v6_outbound(struct sk_buff *skb) + +static noinline_for_stack int +ipvlan_route_v6_outbound(struct net_device *dev, struct sk_buff *skb) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); - struct net_device *dev = skb->dev; - struct net *net = dev_net(dev); - struct dst_entry *dst; - int err, ret = NET_XMIT_DROP; struct flowi6 fl6 = { .flowi6_oif = dev->ifindex, .daddr = ip6h->daddr, @@ -469,27 +467,38 @@ static int ipvlan_process_v6_outbound(struct sk_buff *skb) .flowi6_mark = skb->mark, .flowi6_proto = ip6h->nexthdr, }; + struct dst_entry *dst; + int err; - dst = ip6_route_output(net, NULL, &fl6); - if (dst->error) { - ret = dst->error; + dst = ip6_route_output(dev_net(dev), NULL, &fl6); + err = dst->error; + if (err) { dst_release(dst); - goto err; + return err; } skb_dst_set(skb, dst); + return 0; +} + +static int ipvlan_process_v6_outbound(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + int err, ret = NET_XMIT_DROP; + + err = ipvlan_route_v6_outbound(dev, skb); + if (unlikely(err)) { + DEV_STATS_INC(dev, tx_errors); + kfree_skb(skb); + return err; + } memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); - err = ip6_local_out(net, skb->sk, skb); + err = ip6_local_out(dev_net(dev), skb->sk, skb); if (unlikely(net_xmit_eval(err))) DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; - goto out; -err: - DEV_STATS_INC(dev, tx_errors); - kfree_skb(skb); -out: return ret; } #else -- cgit From 719639853d88071dfdfd8d9971eca9c283ff314c Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Thu, 9 Nov 2023 00:44:20 +0900 Subject: tty: Fix uninit-value access in ppp_sync_receive() KMSAN reported the following uninit-value access issue: ===================================================== BUG: KMSAN: uninit-value in ppp_sync_input drivers/net/ppp/ppp_synctty.c:690 [inline] BUG: KMSAN: uninit-value in ppp_sync_receive+0xdc9/0xe70 drivers/net/ppp/ppp_synctty.c:334 ppp_sync_input drivers/net/ppp/ppp_synctty.c:690 [inline] ppp_sync_receive+0xdc9/0xe70 drivers/net/ppp/ppp_synctty.c:334 tiocsti+0x328/0x450 drivers/tty/tty_io.c:2295 tty_ioctl+0x808/0x1920 drivers/tty/tty_io.c:2694 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:871 [inline] __se_sys_ioctl+0x211/0x400 fs/ioctl.c:857 __x64_sys_ioctl+0x97/0xe0 fs/ioctl.c:857 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Uninit was created at: __alloc_pages+0x75d/0xe80 mm/page_alloc.c:4591 __alloc_pages_node include/linux/gfp.h:238 [inline] alloc_pages_node include/linux/gfp.h:261 [inline] __page_frag_cache_refill+0x9a/0x2c0 mm/page_alloc.c:4691 page_frag_alloc_align+0x91/0x5d0 mm/page_alloc.c:4722 page_frag_alloc include/linux/gfp.h:322 [inline] __netdev_alloc_skb+0x215/0x6d0 net/core/skbuff.c:728 
netdev_alloc_skb include/linux/skbuff.h:3225 [inline] dev_alloc_skb include/linux/skbuff.h:3238 [inline] ppp_sync_input drivers/net/ppp/ppp_synctty.c:669 [inline] ppp_sync_receive+0x237/0xe70 drivers/net/ppp/ppp_synctty.c:334 tiocsti+0x328/0x450 drivers/tty/tty_io.c:2295 tty_ioctl+0x808/0x1920 drivers/tty/tty_io.c:2694 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:871 [inline] __se_sys_ioctl+0x211/0x400 fs/ioctl.c:857 __x64_sys_ioctl+0x97/0xe0 fs/ioctl.c:857 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b CPU: 0 PID: 12950 Comm: syz-executor.1 Not tainted 6.6.0-14500-g1c41041124bd #10 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-1.fc38 04/01/2014 ===================================================== ppp_sync_input() checks the first 2 bytes of the data are PPP_ALLSTATIONS and PPP_UI. However, if the data length is 1 and the first byte is PPP_ALLSTATIONS, an access to an uninitialized value occurs when checking PPP_UI. This patch resolves this issue by checking the data length. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Shigeru Yoshida Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ppp/ppp_synctty.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ppp/ppp_synctty.c b/drivers/net/ppp/ppp_synctty.c index ebcdffdf4f0e..ea261a628786 100644 --- a/drivers/net/ppp/ppp_synctty.c +++ b/drivers/net/ppp/ppp_synctty.c @@ -687,7 +687,7 @@ ppp_sync_input(struct syncppp *ap, const u8 *buf, const u8 *flags, int count) /* strip address/control field if present */ p = skb->data; - if (p[0] == PPP_ALLSTATIONS && p[1] == PPP_UI) { + if (skb->len >= 2 && p[0] == PPP_ALLSTATIONS && p[1] == PPP_UI) { /* chop off address/control */ if (skb->len < 3) goto err; -- cgit From 4b21a669ca21ed8f24ef4530b2918be5730114de Mon Sep 17 00:00:00 2001 From: Kailang Yang Date: Fri, 10 Nov 2023 15:16:06 +0800 Subject: ALSA: hda/realtek - Add Dell ALC295 to pin fall back table Add ALC295 to pin fall back table. Remove 5 pin quirks for Dell ALC295. ALC295 was only support MIC2 for external MIC function. ALC295 assigned model "ALC269_FIXUP_DELL1_MIC_NO_PRESENCE" for pin fall back table. It was assigned wrong model. So, let's remove it. 
Fixes: fbc571290d9f ("ALSA: hda/realtek - Fixed Headphone Mic can't record on Dell platform") Signed-off-by: Kailang Yang Cc: Link: https://lore.kernel.org/r/7c1998e873834df98d59bd7e0d08c72e@realtek.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 669ae3d6e447..d689f0050aae 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10821,22 +10821,6 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = { {0x12, 0x90a60130}, {0x17, 0x90170110}, {0x21, 0x03211020}), - SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE, - {0x14, 0x90170110}, - {0x21, 0x04211020}), - SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE, - {0x14, 0x90170110}, - {0x21, 0x04211030}), - SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL1_MIC_NO_PRESENCE, - ALC295_STANDARD_PINS, - {0x17, 0x21014020}, - {0x18, 0x21a19030}), - SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL1_MIC_NO_PRESENCE, - ALC295_STANDARD_PINS, - {0x17, 0x21014040}, - {0x18, 0x21a19050}), - SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL1_MIC_NO_PRESENCE, - ALC295_STANDARD_PINS), SND_HDA_PIN_QUIRK(0x10ec0298, 0x1028, "Dell", ALC298_FIXUP_DELL1_MIC_NO_PRESENCE, ALC298_STANDARD_PINS, {0x17, 0x90170110}), @@ -10880,6 +10864,9 @@ static const struct snd_hda_pin_quirk alc269_fallback_pin_fixup_tbl[] = { SND_HDA_PIN_QUIRK(0x10ec0289, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE, {0x19, 0x40000000}, {0x1b, 0x40000000}), + SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE, + {0x19, 0x40000000}, + {0x1b, 0x40000000}), SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE, {0x19, 0x40000000}, {0x1a, 0x40000000}), -- cgit From 8384c0baf223e1c3bc7b1c711d80a4c6106d210e Mon Sep 17 00:00:00 2001 From: Eymen Yigit Date: Fri, 10 Nov 2023 18:07:15 +0300 Subject: ALSA: hda/realtek: Enable Mute LED on HP 255 G8 This HP Notebook uses ALC236 codec with COEF 0x07 idx 1 controlling the mute LED. Enable already existing quirk for this device. 
Signed-off-by: Eymen Yigit Cc: Luka Guzenko Cc: Link: https://lore.kernel.org/r/20231110150715.5141-1-eymenyg01@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index d689f0050aae..c7de74396185 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9832,6 +9832,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8898, "HP EliteBook 845 G8 Notebook PC", ALC285_FIXUP_HP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x103c, 0x88d0, "HP Pavilion 15-eh1xxx (mainboard 88D0)", ALC287_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8902, "HP OMEN 16", ALC285_FIXUP_HP_MUTE_LED), + SND_PCI_QUIRK(0x103c, 0x890e, "HP 255 G8 Notebook PC", ALC236_FIXUP_HP_MUTE_LED_COEFBIT2), SND_PCI_QUIRK(0x103c, 0x8919, "HP Pavilion Aero Laptop 13-be0xxx", ALC287_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x896d, "HP ZBook Firefly 16 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x896e, "HP EliteBook x360 830 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), -- cgit From 5c0930ccaad5a74d74e8b18b648c5eb21ed2fe94 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Nov 2023 15:57:13 +0100 Subject: hrtimers: Push pending hrtimers away from outgoing CPU earlier 2b8272ff4a70 ("cpu/hotplug: Prevent self deadlock on CPU hot-unplug") solved the straight forward CPU hotplug deadlock vs. the scheduler bandwidth timer. Yu discovered a more involved variant where a task which has a bandwidth timer started on the outgoing CPU holds a lock and then gets throttled. If the lock required by one of the CPU hotplug callbacks the hotplug operation deadlocks because the unthrottling timer event is not handled on the dying CPU and can only be recovered once the control CPU reaches the hotplug state which pulls the pending hrtimers from the dead CPU. Solve this by pushing the hrtimers away from the dying CPU in the dying callbacks. Nothing can queue a hrtimer on the dying CPU at that point because all other CPUs spin in stop_machine() with interrupts disabled and once the operation is finished the CPU is marked offline. 
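The scenario above can be sketched as the following sequence (illustrative only; "lock L"
stands for whichever lock the affected hotplug callback needs):

    outgoing CPU                               control CPU (drives the hotplug machinery)
    ------------                               ------------------------------------------
    task T arms its bandwidth hrtimer
      on this CPU
    task T takes lock L
    task T is throttled; unthrottling
      waits for that hrtimer to fire
                                               a hotplug callback blocks on lock L
    the CPU goes dead with the hrtimer
      still queued; before this patch it
      would only be migrated once the
      control CPU reached the hrtimers
      teardown state ...
                                               ... which is never reached, because the
                                               machinery is still blocked on lock L
    => deadlock

Doing the migration in a dying-state callback on the outgoing CPU itself, as this patch
does, removes the dependency on the control CPU making further progress.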
Reported-by: Yu Liao Signed-off-by: Thomas Gleixner Tested-by: Liu Tie Link: https://lore.kernel.org/r/87a5rphara.ffs@tglx --- include/linux/cpuhotplug.h | 1 + include/linux/hrtimer.h | 4 ++-- kernel/cpu.c | 8 +++++++- kernel/time/hrtimer.c | 33 ++++++++++++--------------------- 4 files changed, 22 insertions(+), 24 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 068f7738be22..448f5f995adc 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -193,6 +193,7 @@ enum cpuhp_state { CPUHP_AP_ARM_CORESIGHT_CTI_STARTING, CPUHP_AP_ARM64_ISNDEP_STARTING, CPUHP_AP_SMPCFD_DYING, + CPUHP_AP_HRTIMERS_DYING, CPUHP_AP_X86_TBOOT_DYING, CPUHP_AP_ARM_CACHE_B15_RAC_DYING, CPUHP_AP_ONLINE, diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 0ee140176f10..f2044d5a652b 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -531,9 +531,9 @@ extern void sysrq_timer_list_show(void); int hrtimers_prepare_cpu(unsigned int cpu); #ifdef CONFIG_HOTPLUG_CPU -int hrtimers_dead_cpu(unsigned int cpu); +int hrtimers_cpu_dying(unsigned int cpu); #else -#define hrtimers_dead_cpu NULL +#define hrtimers_cpu_dying NULL #endif #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index 6de7c6bb74ee..2e69a1deaa31 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2098,7 +2098,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { [CPUHP_HRTIMERS_PREPARE] = { .name = "hrtimers:prepare", .startup.single = hrtimers_prepare_cpu, - .teardown.single = hrtimers_dead_cpu, + .teardown.single = NULL, }, [CPUHP_SMPCFD_PREPARE] = { .name = "smpcfd:prepare", @@ -2190,6 +2190,12 @@ static struct cpuhp_step cpuhp_hp_states[] = { .startup.single = NULL, .teardown.single = smpcfd_dying_cpu, }, + [CPUHP_AP_HRTIMERS_DYING] = { + .name = "hrtimers:dying", + .startup.single = NULL, + .teardown.single = hrtimers_cpu_dying, + }, + /* Entry state on starting. Interrupts enabled from here on. Transient * state for synchronsization */ [CPUHP_AP_ONLINE] = { diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 238262e4aba7..760793998cdd 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2219,29 +2219,22 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, } } -int hrtimers_dead_cpu(unsigned int scpu) +int hrtimers_cpu_dying(unsigned int dying_cpu) { struct hrtimer_cpu_base *old_base, *new_base; - int i; + int i, ncpu = cpumask_first(cpu_active_mask); - BUG_ON(cpu_online(scpu)); - tick_cancel_sched_timer(scpu); + tick_cancel_sched_timer(dying_cpu); + + old_base = this_cpu_ptr(&hrtimer_bases); + new_base = &per_cpu(hrtimer_bases, ncpu); - /* - * this BH disable ensures that raise_softirq_irqoff() does - * not wakeup ksoftirqd (and acquire the pi-lock) while - * holding the cpu_base lock - */ - local_bh_disable(); - local_irq_disable(); - old_base = &per_cpu(hrtimer_bases, scpu); - new_base = this_cpu_ptr(&hrtimer_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ - raw_spin_lock(&new_base->lock); - raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock(&old_base->lock); + raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], @@ -2252,15 +2245,13 @@ int hrtimers_dead_cpu(unsigned int scpu) * The migration might have changed the first expiring softirq * timer on this CPU. Update it. 
*/ - hrtimer_update_softirq_timer(new_base, false); + __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); + /* Tell the other CPU to retrigger the next event */ + smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); - raw_spin_unlock(&old_base->lock); raw_spin_unlock(&new_base->lock); + raw_spin_unlock(&old_base->lock); - /* Check, if we got expired work to do */ - __hrtimer_peek_ahead_timers(); - local_irq_enable(); - local_bh_enable(); return 0; } -- cgit From e409d7346648c9acff84c3cc8d291767ee2d5326 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 10 Nov 2023 17:13:02 +0100 Subject: net: ti: icssg-prueth: Add missing icss_iep_put to error path Analogously to prueth_remove, just also taking care for NULL'ing the iep pointers. Fixes: 186734c15886 ("net: ti: icssg-prueth: add packet timestamping and ptp support") Fixes: 443a2367ba3c ("net: ti: icssg-prueth: am65x SR2.0 add 10M full duplex support") Signed-off-by: Jan Kiszka Reviewed-by: Wojciech Drewek Reviewed-by: MD Danish Anwar Reviewed-by: Roger Quadros Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/icssg/icssg_prueth.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index 6c4b64227ac8..3abbeba26f1b 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -2105,10 +2105,7 @@ static int prueth_probe(struct platform_device *pdev) prueth->iep1 = icss_iep_get_idx(np, 1); if (IS_ERR(prueth->iep1)) { ret = dev_err_probe(dev, PTR_ERR(prueth->iep1), "iep1 get failed\n"); - icss_iep_put(prueth->iep0); - prueth->iep0 = NULL; - prueth->iep1 = NULL; - goto free_pool; + goto put_iep0; } if (prueth->pdata.quirk_10m_link_issue) { @@ -2205,6 +2202,12 @@ netdev_exit: exit_iep: if (prueth->pdata.quirk_10m_link_issue) icss_iep_exit_fw(prueth->iep1); + icss_iep_put(prueth->iep1); + +put_iep0: + icss_iep_put(prueth->iep0); + prueth->iep0 = NULL; + prueth->iep1 = NULL; free_pool: gen_pool_free(prueth->sram_pool, -- cgit From 2bd5b559a1f391f05927bbb0b31381fa71c61e26 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 10 Nov 2023 17:13:08 +0100 Subject: net: ti: icssg-prueth: Fix error cleanup on failing pruss_request_mem_region We were just continuing in this case, surely not desired. Fixes: 128d5874c082 ("net: ti: icssg-prueth: Add ICSSG ethernet driver") Signed-off-by: Jan Kiszka Reviewed-by: Wojciech Drewek Reviewed-by: Roger Quadros Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ti/icssg/icssg_prueth.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index 3abbeba26f1b..411898a4f38c 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -2063,7 +2063,7 @@ static int prueth_probe(struct platform_device *pdev) &prueth->shram); if (ret) { dev_err(dev, "unable to get PRUSS SHRD RAM2: %d\n", ret); - pruss_put(prueth->pruss); + goto put_pruss; } prueth->sram_pool = of_gen_pool_get(np, "sram", 0); @@ -2215,6 +2215,8 @@ free_pool: put_mem: pruss_release_mem_region(prueth->pruss, &prueth->shram); + +put_pruss: pruss_put(prueth->pruss); put_cores: -- cgit From 713f040cd22285fcc506f40a0d259566e6758c3c Mon Sep 17 00:00:00 2001 From: Chandradeep Dey Date: Sat, 11 Nov 2023 19:25:49 +0100 Subject: ALSA: hda/realtek - Enable internal speaker of ASUS K6500ZC Apply the already existing quirk chain ALC294_FIXUP_ASUS_SPK to enable the internal speaker of ASUS K6500ZC. Signed-off-by: Chandradeep Dey Cc: Link: https://lore.kernel.org/r/NizcVHQ--3-9@chandradeepdey.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index c7de74396185..cdd808e02b44 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9907,6 +9907,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x10a1, "ASUS UX391UA", ALC294_FIXUP_ASUS_SPK), SND_PCI_QUIRK(0x1043, 0x10c0, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC), SND_PCI_QUIRK(0x1043, 0x10d0, "ASUS X540LA/X540LJ", ALC255_FIXUP_ASUS_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1043, 0x10d3, "ASUS K6500ZC", ALC294_FIXUP_ASUS_SPK), SND_PCI_QUIRK(0x1043, 0x115d, "Asus 1015E", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x1043, 0x11c0, "ASUS X556UR", ALC255_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x125e, "ASUS Q524UQK", ALC255_FIXUP_ASUS_MIC_NO_PRESENCE), -- cgit From 7b211c7671212cad0b83603c674838c7e824d845 Mon Sep 17 00:00:00 2001 From: Robert Marko Date: Fri, 10 Nov 2023 10:30:11 +0100 Subject: Revert "i2c: pxa: move to generic GPIO recovery" This reverts commit 0b01392c18b9993a584f36ace1d61118772ad0ca. Conversion of PXA to generic I2C recovery, makes the I2C bus completely lock up if recovery pinctrl is present in the DT and I2C recovery is enabled. So, until the generic I2C recovery can also work with PXA lets revert to have working I2C and I2C recovery again. 
Signed-off-by: Robert Marko Cc: stable@vger.kernel.org # 5.11+ Acked-by: Andi Shyti Acked-by: Russell King (Oracle) Acked-by: Linus Walleij Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-pxa.c | 76 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 8 deletions(-) diff --git a/drivers/i2c/busses/i2c-pxa.c b/drivers/i2c/busses/i2c-pxa.c index 1d7648242749..76f79b68cef8 100644 --- a/drivers/i2c/busses/i2c-pxa.c +++ b/drivers/i2c/busses/i2c-pxa.c @@ -265,6 +265,9 @@ struct pxa_i2c { u32 hs_mask; struct i2c_bus_recovery_info recovery; + struct pinctrl *pinctrl; + struct pinctrl_state *pinctrl_default; + struct pinctrl_state *pinctrl_recovery; }; #define _IBMR(i2c) ((i2c)->reg_ibmr) @@ -1299,12 +1302,13 @@ static void i2c_pxa_prepare_recovery(struct i2c_adapter *adap) */ gpiod_set_value(i2c->recovery.scl_gpiod, ibmr & IBMR_SCLS); gpiod_set_value(i2c->recovery.sda_gpiod, ibmr & IBMR_SDAS); + + WARN_ON(pinctrl_select_state(i2c->pinctrl, i2c->pinctrl_recovery)); } static void i2c_pxa_unprepare_recovery(struct i2c_adapter *adap) { struct pxa_i2c *i2c = adap->algo_data; - struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; u32 isr; /* @@ -1318,7 +1322,7 @@ static void i2c_pxa_unprepare_recovery(struct i2c_adapter *adap) i2c_pxa_do_reset(i2c); } - WARN_ON(pinctrl_select_state(bri->pinctrl, bri->pins_default)); + WARN_ON(pinctrl_select_state(i2c->pinctrl, i2c->pinctrl_default)); dev_dbg(&i2c->adap.dev, "recovery: IBMR 0x%08x ISR 0x%08x\n", readl(_IBMR(i2c)), readl(_ISR(i2c))); @@ -1340,20 +1344,76 @@ static int i2c_pxa_init_recovery(struct pxa_i2c *i2c) if (IS_ENABLED(CONFIG_I2C_PXA_SLAVE)) return 0; - bri->pinctrl = devm_pinctrl_get(dev); - if (PTR_ERR(bri->pinctrl) == -ENODEV) { - bri->pinctrl = NULL; + i2c->pinctrl = devm_pinctrl_get(dev); + if (PTR_ERR(i2c->pinctrl) == -ENODEV) + i2c->pinctrl = NULL; + if (IS_ERR(i2c->pinctrl)) + return PTR_ERR(i2c->pinctrl); + + if (!i2c->pinctrl) + return 0; + + i2c->pinctrl_default = pinctrl_lookup_state(i2c->pinctrl, + PINCTRL_STATE_DEFAULT); + i2c->pinctrl_recovery = pinctrl_lookup_state(i2c->pinctrl, "recovery"); + + if (IS_ERR(i2c->pinctrl_default) || IS_ERR(i2c->pinctrl_recovery)) { + dev_info(dev, "missing pinmux recovery information: %ld %ld\n", + PTR_ERR(i2c->pinctrl_default), + PTR_ERR(i2c->pinctrl_recovery)); + return 0; + } + + /* + * Claiming GPIOs can influence the pinmux state, and may glitch the + * I2C bus. Do this carefully. + */ + bri->scl_gpiod = devm_gpiod_get(dev, "scl", GPIOD_OUT_HIGH_OPEN_DRAIN); + if (bri->scl_gpiod == ERR_PTR(-EPROBE_DEFER)) + return -EPROBE_DEFER; + if (IS_ERR(bri->scl_gpiod)) { + dev_info(dev, "missing scl gpio recovery information: %pe\n", + bri->scl_gpiod); + return 0; + } + + /* + * We have SCL. Pull SCL low and wait a bit so that SDA glitches + * have no effect. + */ + gpiod_direction_output(bri->scl_gpiod, 0); + udelay(10); + bri->sda_gpiod = devm_gpiod_get(dev, "sda", GPIOD_OUT_HIGH_OPEN_DRAIN); + + /* Wait a bit in case of a SDA glitch, and then release SCL. 
*/ + udelay(10); + gpiod_direction_output(bri->scl_gpiod, 1); + + if (bri->sda_gpiod == ERR_PTR(-EPROBE_DEFER)) + return -EPROBE_DEFER; + + if (IS_ERR(bri->sda_gpiod)) { + dev_info(dev, "missing sda gpio recovery information: %pe\n", + bri->sda_gpiod); return 0; } - if (IS_ERR(bri->pinctrl)) - return PTR_ERR(bri->pinctrl); bri->prepare_recovery = i2c_pxa_prepare_recovery; bri->unprepare_recovery = i2c_pxa_unprepare_recovery; + bri->recover_bus = i2c_generic_scl_recovery; i2c->adap.bus_recovery_info = bri; - return 0; + /* + * Claiming GPIOs can change the pinmux state, which confuses the + * pinctrl since pinctrl's idea of the current setting is unaffected + * by the pinmux change caused by claiming the GPIO. Work around that + * by switching pinctrl to the GPIO state here. We do it this way to + * avoid glitching the I2C bus. + */ + pinctrl_select_state(i2c->pinctrl, i2c->pinctrl_recovery); + + return pinctrl_select_state(i2c->pinctrl, i2c->pinctrl_default); } static int i2c_pxa_probe(struct platform_device *dev) -- cgit From 2a5db859c6825b5d50377dda9c3cc729c20cad43 Mon Sep 17 00:00:00 2001 From: Long Li Date: Mon, 31 Jul 2023 20:46:17 +0800 Subject: xfs: factor out xfs_defer_pending_abort Factor out xfs_defer_pending_abort() from xfs_defer_trans_abort(), which not use transaction parameter, so it can be used after the transaction life cycle. Signed-off-by: Long Li Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_defer.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index bcfb6a4203cd..88388e12f8e7 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -245,21 +245,18 @@ xfs_defer_create_intents( return ret; } -/* Abort all the intents that were committed. */ STATIC void -xfs_defer_trans_abort( - struct xfs_trans *tp, - struct list_head *dop_pending) +xfs_defer_pending_abort( + struct xfs_mount *mp, + struct list_head *dop_list) { struct xfs_defer_pending *dfp; const struct xfs_defer_op_type *ops; - trace_xfs_defer_trans_abort(tp, _RET_IP_); - /* Abort intent items that don't have a done item. */ - list_for_each_entry(dfp, dop_pending, dfp_list) { + list_for_each_entry(dfp, dop_list, dfp_list) { ops = defer_op_types[dfp->dfp_type]; - trace_xfs_defer_pending_abort(tp->t_mountp, dfp); + trace_xfs_defer_pending_abort(mp, dfp); if (dfp->dfp_intent && !dfp->dfp_done) { ops->abort_intent(dfp->dfp_intent); dfp->dfp_intent = NULL; @@ -267,6 +264,16 @@ xfs_defer_trans_abort( } } +/* Abort all the intents that were committed. */ +STATIC void +xfs_defer_trans_abort( + struct xfs_trans *tp, + struct list_head *dop_pending) +{ + trace_xfs_defer_trans_abort(tp, _RET_IP_); + xfs_defer_pending_abort(tp->t_mountp, dop_pending); +} + /* * Capture resources that the caller said not to release ("held") when the * transaction commits. Caller is responsible for zero-initializing @dres. -- cgit From f8f9d952e42dd49ae534f61f2fa7ca0876cb9848 Mon Sep 17 00:00:00 2001 From: Long Li Date: Mon, 31 Jul 2023 20:46:18 +0800 Subject: xfs: abort intent items when recovery intents fail When recovering intents, we capture newly created intent items as part of committing recovered intent items. 
If intent recovery fails at a later point, we forget to remove those newly created intent items from the AIL and hang: [root@localhost ~]# cat /proc/539/stack [<0>] xfs_ail_push_all_sync+0x174/0x230 [<0>] xfs_unmount_flush_inodes+0x8d/0xd0 [<0>] xfs_mountfs+0x15f7/0x1e70 [<0>] xfs_fs_fill_super+0x10ec/0x1b20 [<0>] get_tree_bdev+0x3c8/0x730 [<0>] vfs_get_tree+0x89/0x2c0 [<0>] path_mount+0xecf/0x1800 [<0>] do_mount+0xf3/0x110 [<0>] __x64_sys_mount+0x154/0x1f0 [<0>] do_syscall_64+0x39/0x80 [<0>] entry_SYSCALL_64_after_hwframe+0x63/0xcd When newly created intent items fail to commit via transaction, intent recovery hasn't created done items for these newly created intent items, so the capture structure is the sole owner of the captured intent items. We must release them explicitly or else they leak: unreferenced object 0xffff888016719108 (size 432): comm "mount", pid 529, jiffies 4294706839 (age 144.463s) hex dump (first 32 bytes): 08 91 71 16 80 88 ff ff 08 91 71 16 80 88 ff ff ..q.......q..... 18 91 71 16 80 88 ff ff 18 91 71 16 80 88 ff ff ..q.......q..... backtrace: [] xfs_efi_init+0x18f/0x1d0 [] xfs_extent_free_create_intent+0x50/0x150 [] xfs_defer_create_intents+0x16a/0x340 [] xfs_defer_ops_capture_and_commit+0x8e/0xad0 [] xfs_cui_item_recover+0x819/0x980 [] xlog_recover_process_intents+0x246/0xb70 [] xlog_recover_finish+0x8a/0x9a0 [] xfs_log_mount_finish+0x2bb/0x4a0 [] xfs_mountfs+0x14bf/0x1e70 [] xfs_fs_fill_super+0x10d0/0x1b20 [] get_tree_bdev+0x3d2/0x6d0 [] vfs_get_tree+0x89/0x2c0 [] path_mount+0xecf/0x1800 [] do_mount+0xf3/0x110 [] __x64_sys_mount+0x154/0x1f0 [] do_syscall_64+0x39/0x80 Fix the problem above by abort intent items that don't have a done item when recovery intents fail. Fixes: e6fff81e4870 ("xfs: proper replay of deferred ops queued during log recovery") Signed-off-by: Long Li Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_defer.c | 5 +++-- fs/xfs/libxfs/xfs_defer.h | 2 +- fs/xfs/xfs_log_recover.c | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 88388e12f8e7..f71679ce23b9 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -763,12 +763,13 @@ xfs_defer_ops_capture( /* Release all resources that we used to capture deferred ops. */ void -xfs_defer_ops_capture_free( +xfs_defer_ops_capture_abort( struct xfs_mount *mp, struct xfs_defer_capture *dfc) { unsigned short i; + xfs_defer_pending_abort(mp, &dfc->dfc_dfops); xfs_defer_cancel_list(mp, &dfc->dfc_dfops); for (i = 0; i < dfc->dfc_held.dr_bufs; i++) @@ -809,7 +810,7 @@ xfs_defer_ops_capture_and_commit( /* Commit the transaction and add the capture structure to the list. 
*/ error = xfs_trans_commit(tp); if (error) { - xfs_defer_ops_capture_free(mp, dfc); + xfs_defer_ops_capture_abort(mp, dfc); return error; } diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 114a3a4930a3..8788ad5f6a73 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -121,7 +121,7 @@ int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp, struct list_head *capture_list); void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp, struct xfs_defer_resources *dres); -void xfs_defer_ops_capture_free(struct xfs_mount *mp, +void xfs_defer_ops_capture_abort(struct xfs_mount *mp, struct xfs_defer_capture *d); void xfs_defer_resources_rele(struct xfs_defer_resources *dres); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 13b94d2e605b..a1e18b24971a 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2511,7 +2511,7 @@ xlog_abort_defer_ops( list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { list_del_init(&dfc->dfc_list); - xfs_defer_ops_capture_free(mp, dfc); + xfs_defer_ops_capture_abort(mp, dfc); } } -- cgit From 00080503612f61d1ad67be641ed9cb4f9f6ba40e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 19 Sep 2023 17:12:45 -0400 Subject: XFS: Update MAINTAINERS to catch all XFS documentation Assumes that all XFS documentation will be prefixed with xfs-, which seems like a good policy anyway. Reviewed-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Chandan Babu R --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 97f51d5ec1cf..a78795dc9aee 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23872,8 +23872,7 @@ T: git git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git P: Documentation/filesystems/xfs-maintainer-entry-profile.rst F: Documentation/ABI/testing/sysfs-fs-xfs F: Documentation/admin-guide/xfs.rst -F: Documentation/filesystems/xfs-delayed-logging-design.rst -F: Documentation/filesystems/xfs-self-describing-metadata.rst +F: Documentation/filesystems/xfs-* F: fs/xfs/ F: include/uapi/linux/dqblk_xfs.h F: include/uapi/linux/fsmap.h -- cgit From 55f669f34184ecb25b8353f29c7f6f1ae5b313d1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Oct 2023 17:28:52 +0200 Subject: xfs: only remap the written blocks in xfs_reflink_end_cow_extent xfs_reflink_end_cow_extent looks up the COW extent and the data fork extent at offset_fsb, and then proceeds to remap the common subset between the two. It does however not limit the remapped extent to the passed in [*offset_fsbm end_fsb] range and thus potentially remaps more blocks than the one handled by the current I/O completion. This means that with sufficiently large data and COW extents we could be remapping COW fork mappings that have not been written to, leading to a stale data exposure on a powerfail event. We use to have a xfs_trim_range to make the remap fit the I/O completion range, but that got (apparently accidentally) removed in commit df2fd88f8ac7 ("xfs: rewrite xfs_reflink_end_cow to use intents"). Note that I've only found this by code inspection, and a test case would probably require very specific delay and error injection. Fixes: df2fd88f8ac7 ("xfs: rewrite xfs_reflink_end_cow to use intents") Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_reflink.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 658edee8381d..e5b62dc28466 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -784,6 +784,7 @@ xfs_reflink_end_cow_extent( } } del = got; + xfs_trim_extent(&del, *offset_fsb, end_fsb - *offset_fsb); /* Grab the corresponding mapping in the data fork. */ nmaps = 1; -- cgit From 471de20303dda0b67981e06d59cc6c4a83fd2a3c Mon Sep 17 00:00:00 2001 From: Leah Rumancik Date: Mon, 30 Oct 2023 13:33:49 -0700 Subject: xfs: up(ic_sema) if flushing data device fails We flush the data device cache before we issue external log IO. If the flush fails, we shut down the log immediately and return. However, the iclog->ic_sema is left in a decremented state so let's add an up(). Prior to this patch, xfs/438 would fail consistently when running with an external log device: sync -> xfs_log_force -> xlog_write_iclog -> down(&iclog->ic_sema) -> blkdev_issue_flush (fail causes us to intiate shutdown) -> xlog_force_shutdown -> return unmount -> xfs_log_umount -> xlog_wait_iclog_completion -> down(&iclog->ic_sema) --------> HANG There is a second early return / shutdown. Make sure the up() happens for it as well. Also make sure we cleanup the iclog state, xlog_state_done_syncing, before dropping the iclog lock. Fixes: b5d721eaae47 ("xfs: external logs need to flush data device") Fixes: 842a42d126b4 ("xfs: shutdown on failure to add page to log bio") Fixes: 7d839e325af2 ("xfs: check return codes when flushing block devices") Signed-off-by: Leah Rumancik Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_log.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 51c100c86177..ee206facf0dc 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1893,9 +1893,7 @@ xlog_write_iclog( * the buffer manually, the code needs to be kept in sync * with the I/O completion path. */ - xlog_state_done_syncing(iclog); - up(&iclog->ic_sema); - return; + goto sync; } /* @@ -1925,20 +1923,17 @@ xlog_write_iclog( * avoid shutdown re-entering this path and erroring out again. */ if (log->l_targ != log->l_mp->m_ddev_targp && - blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) { - xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); - return; - } + blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) + goto shutdown; } if (iclog->ic_flags & XLOG_ICL_NEED_FUA) iclog->ic_bio.bi_opf |= REQ_FUA; iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); - if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) { - xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); - return; - } + if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) + goto shutdown; + if (is_vmalloc_addr(iclog->ic_data)) flush_kernel_vmap_range(iclog->ic_data, count); @@ -1959,6 +1954,12 @@ xlog_write_iclog( } submit_bio(&iclog->ic_bio); + return; +shutdown: + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); +sync: + xlog_state_done_syncing(iclog); + up(&iclog->ic_sema); } /* -- cgit From f63a5b3769ad7659da4c0420751d78958ab97675 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 1 Nov 2023 09:41:45 -0700 Subject: xfs: fix internal error from AGFL exhaustion We've been seeing XFS errors like the following: XFS: Internal error i != 1 at line 3526 of file fs/xfs/libxfs/xfs_btree.c. Caller xfs_btree_insert+0x1ec/0x280 ... 
Call Trace: xfs_corruption_error+0x94/0xa0 xfs_btree_insert+0x221/0x280 xfs_alloc_fixup_trees+0x104/0x3e0 xfs_alloc_ag_vextent_size+0x667/0x820 xfs_alloc_fix_freelist+0x5d9/0x750 xfs_free_extent_fix_freelist+0x65/0xa0 __xfs_free_extent+0x57/0x180 ... This is the XFS_IS_CORRUPT() check in xfs_btree_insert() when xfs_btree_insrec() fails. After converting this into a panic and dissecting the core dump, I found that xfs_btree_insrec() is failing because it's trying to split a leaf node in the cntbt when the AG free list is empty. In particular, it's failing to get a block from the AGFL _while trying to refill the AGFL_. If a single operation splits every level of the bnobt and the cntbt (and the rmapbt if it is enabled) at once, the free list will be empty. Then, when the next operation tries to refill the free list, it allocates space. If the allocation does not use a full extent, it will need to insert records for the remaining space in the bnobt and cntbt. And if those new records go in full leaves, the leaves (and potentially more nodes up to the old root) need to be split. Fix it by accounting for the additional splits that may be required to refill the free list in the calculation for the minimum free list size. P.S. As far as I can tell, this bug has existed for a long time -- maybe back to xfs-history commit afdf80ae7405 ("Add XFS_AG_MAXLEVELS macros ...") in April 1994! It requires a very unlucky sequence of events, and in fact we didn't hit it until a particular sparse mmap workload updated from 5.12 to 5.19. But this bug existed in 5.12, so it must've been exposed by some other change in allocation or writeback patterns. It's also much less likely to be hit with the rmapbt enabled, since that increases the minimum free list size and is unlikely to split at the same time as the bnobt and cntbt. Reviewed-by: "Darrick J. Wong" Reviewed-by: Dave Chinner Signed-off-by: Omar Sandoval Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_alloc.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 3069194527dd..100ab5931b31 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2275,16 +2275,37 @@ xfs_alloc_min_freelist( ASSERT(mp->m_alloc_maxlevels > 0); + /* + * For a btree shorter than the maximum height, the worst case is that + * every level gets split and a new level is added, then while inserting + * another entry to refill the AGFL, every level under the old root gets + * split again. This is: + * + * (full height split reservation) + (AGFL refill split height) + * = (current height + 1) + (current height - 1) + * = (new height) + (new height - 2) + * = 2 * new height - 2 + * + * For a btree of maximum height, the worst case is that every level + * under the root gets split, then while inserting another entry to + * refill the AGFL, every level under the root gets split again. 
This is + * also: + * + * 2 * (current height - 1) + * = 2 * (new height - 1) + * = 2 * new height - 2 + */ + /* space needed by-bno freespace btree */ min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1, - mp->m_alloc_maxlevels); + mp->m_alloc_maxlevels) * 2 - 2; /* space needed by-size freespace btree */ min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1, - mp->m_alloc_maxlevels); + mp->m_alloc_maxlevels) * 2 - 2; /* space needed reverse mapping used space btree */ if (xfs_has_rmapbt(mp)) min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1, - mp->m_rmap_maxlevels); + mp->m_rmap_maxlevels) * 2 - 2; return min_free; } -- cgit From a2e4388adfa44684c7c428a5a5980efe0d75e13e Mon Sep 17 00:00:00 2001 From: Anthony Iliopoulos Date: Sun, 5 Nov 2023 20:23:18 +0100 Subject: xfs: fix again select in kconfig XFS_ONLINE_SCRUB_STATS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 57c0f4a8ea3a attempted to fix the select in the kconfig entry XFS_ONLINE_SCRUB_STATS by selecting XFS_DEBUG, but the original intention was to select DEBUG_FS, since the feature relies on debugfs to export the related scrub statistics. Fixes: 57c0f4a8ea3a ("xfs: fix select in config XFS_ONLINE_SCRUB_STATS") Reported-by: Holger Hoffstätte Signed-off-by: Anthony Iliopoulos Reviewed-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index ed0bc8cbc703..567fb37274d3 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -147,7 +147,7 @@ config XFS_ONLINE_SCRUB_STATS bool "XFS online metadata check usage data collection" default y depends on XFS_ONLINE_SCRUB - select XFS_DEBUG + select DEBUG_FS help If you say Y here, the kernel will gather usage data about the online metadata check subsystem. This includes the number -- cgit From 038ca189c0d2c1570b4d922f25b524007c85cf94 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 10 Nov 2023 15:33:13 +1100 Subject: xfs: inode recovery does not validate the recovered inode Discovered when trying to track down a weird recovery corruption issue that wasn't detected at recovery time. The specific corruption was a zero extent count field when big extent counts are in use, and it turns out the dinode verifier doesn't detect that specific corruption case, either. So fix it too. Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_inode_buf.c | 3 +++ fs/xfs/xfs_inode_item_recover.c | 14 +++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 543f3748c2a3..137a65bda95d 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -510,6 +510,9 @@ xfs_dinode_verify( if (mode && nextents + naextents > nblocks) return __this_address; + if (nextents + naextents == 0 && nblocks != 0) + return __this_address; + if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents) return __this_address; diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 0e5dba2343ea..1c5ba2a732b0 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -286,6 +286,7 @@ xlog_recover_inode_commit_pass2( struct xfs_log_dinode *ldip; uint isize; int need_free = 0; + xfs_failaddr_t fa; if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { in_f = item->ri_buf[0].i_addr; @@ -528,8 +529,19 @@ out_owner_change: (dip->di_mode != 0)) error = xfs_recover_inode_owner_change(mp, dip, in_f, buffer_list); - /* re-generate the checksum. */ + /* re-generate the checksum and validate the recovered inode. */ xfs_dinode_calc_crc(log->l_mp, dip); + fa = xfs_dinode_verify(log->l_mp, in_f->ilf_ino, dip); + if (fa) { + XFS_CORRUPTION_ERROR( + "Bad dinode after recovery", + XFS_ERRLEVEL_LOW, mp, dip, sizeof(*dip)); + xfs_alert(mp, + "Metadata corruption detected at %pS, inode 0x%llx", + fa, in_f->ilf_ino); + error = -EFSCORRUPTED; + goto out_release; + } ASSERT(bp->b_mount == mp); bp->b_flags |= _XBF_LOGRECOVERY; -- cgit From 7930d9e103700cde15833638855b750715c12091 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 10 Nov 2023 15:33:14 +1100 Subject: xfs: recovery should not clear di_flushiter unconditionally Because on v3 inodes, di_flushiter doesn't exist. It overlaps with zero padding in the inode, except when NREXT64=1 configurations are in use and the zero padding is no longer padding but holds the 64 bit extent counter. This manifests obviously on big endian platforms (e.g. s390) because the log dinode is in host order and the overlap is the LSBs of the extent count field. It is not noticed on little endian machines because the overlap is at the MSB end of the extent count field and we need to get more than 2^^48 extents in the inode before it manifests. i.e. the heat death of the universe will occur before we see the problem in little endian machines. This is a zero-day issue for NREXT64=1 configuraitons on big endian machines. Fix it by only clearing di_flushiter on v2 inodes during recovery. Fixes: 9b7d16e34bbe ("xfs: Introduce XFS_DIFLAG2_NREXT64 and associated helpers") cc: stable@kernel.org # 5.19+ Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_inode_item_recover.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 1c5ba2a732b0..144198a6b270 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -370,24 +370,26 @@ xlog_recover_inode_commit_pass2( * superblock flag to determine whether we need to look at di_flushiter * to skip replay when the on disk inode is newer than the log one */ - if (!xfs_has_v3inodes(mp) && - ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { - /* - * Deal with the wrap case, DI_MAX_FLUSH is less - * than smaller numbers - */ - if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && - ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) { - /* do nothing */ - } else { - trace_xfs_log_recover_inode_skip(log, in_f); - error = 0; - goto out_release; + if (!xfs_has_v3inodes(mp)) { + if (ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { + /* + * Deal with the wrap case, DI_MAX_FLUSH is less + * than smaller numbers + */ + if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && + ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) { + /* do nothing */ + } else { + trace_xfs_log_recover_inode_skip(log, in_f); + error = 0; + goto out_release; + } } + + /* Take the opportunity to reset the flush iteration count */ + ldip->di_flushiter = 0; } - /* Take the opportunity to reset the flush iteration count */ - ldip->di_flushiter = 0; if (unlikely(S_ISREG(ldip->di_mode))) { if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && -- cgit From e64e7c74b99ec9e439abca75f522f4b98f220bd1 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 24 Oct 2023 13:51:36 +0200 Subject: xen/events: avoid using info_for_irq() in xen_send_IPI_one() xen_send_IPI_one() is being used by cpuhp_report_idle_dead() after it calls rcu_report_dead(), meaning that any RCU usage by xen_send_IPI_one() is a bad idea. Unfortunately xen_send_IPI_one() is using notify_remote_via_irq() today, which is using irq_get_chip_data() via info_for_irq(). And irq_get_chip_data() in turn is using a maple-tree lookup requiring RCU. Avoid this problem by caching the ipi event channels in another percpu variable, allowing the use notify_remote_via_evtchn() in xen_send_IPI_one(). Fixes: 721255b9826b ("genirq: Use a maple tree for interrupt descriptor management") Reported-by: David Woodhouse Signed-off-by: Juergen Gross Tested-by: David Woodhouse Acked-by: Stefano Stabellini Signed-off-by: Juergen Gross --- drivers/xen/events/events_base.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 6de6b084ea60..237e6b884f72 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -164,6 +164,8 @@ static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; /* IRQ <-> IPI mapping */ static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; +/* Cache for IPI event channels - needed for hot cpu unplug (avoid RCU usage). */ +static DEFINE_PER_CPU(evtchn_port_t [XEN_NR_IPIS], ipi_to_evtchn) = {[0 ... 
XEN_NR_IPIS-1] = 0}; /* Event channel distribution data */ static atomic_t channels_on_cpu[NR_CPUS]; @@ -366,6 +368,7 @@ static int xen_irq_info_ipi_setup(unsigned cpu, info->u.ipi = ipi; per_cpu(ipi_to_irq, cpu)[ipi] = irq; + per_cpu(ipi_to_evtchn, cpu)[ipi] = evtchn; return xen_irq_info_common_setup(info, irq, IRQT_IPI, evtchn, 0); } @@ -981,6 +984,7 @@ static void __unbind_from_irq(unsigned int irq) break; case IRQT_IPI: per_cpu(ipi_to_irq, cpu)[ipi_from_irq(irq)] = -1; + per_cpu(ipi_to_evtchn, cpu)[ipi_from_irq(irq)] = 0; break; case IRQT_EVTCHN: dev = info->u.interdomain; @@ -1632,7 +1636,7 @@ EXPORT_SYMBOL_GPL(evtchn_put); void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) { - int irq; + evtchn_port_t evtchn; #ifdef CONFIG_X86 if (unlikely(vector == XEN_NMI_VECTOR)) { @@ -1643,9 +1647,9 @@ void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) return; } #endif - irq = per_cpu(ipi_to_irq, cpu)[vector]; - BUG_ON(irq < 0); - notify_remote_via_irq(irq); + evtchn = per_cpu(ipi_to_evtchn, cpu)[vector]; + BUG_ON(evtchn == 0); + notify_remote_via_evtchn(evtchn); } struct evtchn_loop_ctrl { -- cgit From bfa993b355d33a438a746523e7129391c8664e8a Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Wed, 8 Nov 2023 16:25:15 -0500 Subject: acpi/processor: sanitize _OSC/_PDC capabilities for Xen dom0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Processor capability bits notify ACPI of the OS capabilities, and so ACPI can adjust the return of other Processor methods taking the OS capabilities into account. When Linux is running as a Xen dom0, the hypervisor is the entity in charge of processor power management, and hence Xen needs to make sure the capabilities reported by _OSC/_PDC match the capabilities of the driver in Xen. Introduce a small helper to sanitize the buffer when running as Xen dom0. When Xen supports HWP, this serves as the equivalent of commit a21211672c9a ("ACPI / processor: Request native thermal interrupt handling via _OSC") to avoid SMM crashes. Xen will set bit ACPI_PROC_CAP_COLLAB_PROC_PERF (bit 12) in the capability bits and the _OSC/_PDC call will apply it. [ jandryuk: Mention Xen HWP's need. Support _OSC & _PDC ] Signed-off-by: Roger Pau Monné Cc: stable@vger.kernel.org Signed-off-by: Jason Andryuk Reviewed-by: Michal Wilczynski Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20231108212517.72279-1-jandryuk@gmail.com Signed-off-by: Juergen Gross --- arch/x86/include/asm/acpi.h | 14 ++++++++++++++ arch/x86/include/asm/xen/hypervisor.h | 9 +++++++++ drivers/xen/pcpu.c | 22 ++++++++++++++++++++++ 3 files changed, 45 insertions(+) diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index c8a7fc23f63c..f896eed4516c 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -16,6 +16,9 @@ #include #include #include +#include + +#include #ifdef CONFIG_ACPI_APEI # include @@ -127,6 +130,17 @@ static inline void arch_acpi_set_proc_cap_bits(u32 *cap) if (!cpu_has(c, X86_FEATURE_MWAIT) || boot_option_idle_override == IDLE_NOMWAIT) *cap &= ~(ACPI_PROC_CAP_C_C1_FFH | ACPI_PROC_CAP_C_C2C3_FFH); + + if (xen_initial_domain()) { + /* + * When Linux is running as Xen dom0, the hypervisor is the + * entity in charge of the processor power management, and so + * Xen needs to check the OS capabilities reported in the + * processor capabilities buffer matches what the hypervisor + * driver supports. 
+ */ + xen_sanitize_proc_cap_bits(cap); + } } static inline bool acpi_has_cpu_in_madt(void) diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 7048dfacc04b..a9088250770f 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -100,4 +100,13 @@ static inline void leave_lazy(enum xen_lazy_mode mode) enum xen_lazy_mode xen_get_lazy_mode(void); +#if defined(CONFIG_XEN_DOM0) && defined(CONFIG_ACPI) +void xen_sanitize_proc_cap_bits(uint32_t *buf); +#else +static inline void xen_sanitize_proc_cap_bits(uint32_t *buf) +{ + BUG(); +} +#endif + #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c index b3e3d1bb37f3..508655273145 100644 --- a/drivers/xen/pcpu.c +++ b/drivers/xen/pcpu.c @@ -47,6 +47,9 @@ #include #include +#ifdef CONFIG_ACPI +#include +#endif /* * @cpu_id: Xen physical cpu logic number @@ -400,4 +403,23 @@ bool __init xen_processor_present(uint32_t acpi_id) return online; } + +void xen_sanitize_proc_cap_bits(uint32_t *cap) +{ + struct xen_platform_op op = { + .cmd = XENPF_set_processor_pminfo, + .u.set_pminfo.id = -1, + .u.set_pminfo.type = XEN_PM_PDC, + }; + u32 buf[3] = { ACPI_PDC_REVISION_ID, 1, *cap }; + int ret; + + set_xen_guest_handle(op.u.set_pminfo.pdc, buf); + ret = HYPERVISOR_platform_op(&op); + if (ret) + pr_err("sanitize of _PDC buffer bits from Xen failed: %d\n", + ret); + else + *cap = buf[2]; +} #endif -- cgit From 50e865a56876bd7a74a79c1778025631150f104a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 5 Nov 2023 21:56:31 -0800 Subject: xen/shbuf: eliminate 17 kernel-doc warnings Don't use kernel-doc markers ("/**") for comments that are not in kernel-doc format. This prevents multiple kernel-doc warnings: xen-front-pgdir-shbuf.c:25: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * This structure represents the structure of a shared page xen-front-pgdir-shbuf.c:37: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Shared buffer ops which are differently implemented xen-front-pgdir-shbuf.c:65: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Get granted reference to the very first page of the xen-front-pgdir-shbuf.c:85: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Map granted references of the shared buffer. xen-front-pgdir-shbuf.c:106: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Unmap granted references of the shared buffer. xen-front-pgdir-shbuf.c:127: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Free all the resources of the shared buffer. xen-front-pgdir-shbuf.c:154: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Get the number of pages the page directory consumes itself. xen-front-pgdir-shbuf.c:164: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Calculate the number of grant references needed to share the buffer xen-front-pgdir-shbuf.c:176: warning: This comment starts with '/**', but isn't a kernel-doc comment. 
Refer Documentation/doc-guide/kernel-doc.rst * Calculate the number of grant references needed to share the buffer xen-front-pgdir-shbuf.c:194: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Unmap the buffer previously mapped with grant references xen-front-pgdir-shbuf.c:242: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Map the buffer with grant references provided by the backend. xen-front-pgdir-shbuf.c:324: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Fill page directory with grant references to the pages of the xen-front-pgdir-shbuf.c:354: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Fill page directory with grant references to the pages of the xen-front-pgdir-shbuf.c:393: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Grant references to the frontend's buffer pages. xen-front-pgdir-shbuf.c:422: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Grant all the references needed to share the buffer. xen-front-pgdir-shbuf.c:470: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Allocate all required structures to mange shared buffer. xen-front-pgdir-shbuf.c:510: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Allocate a new instance of a shared buffer. Signed-off-by: Randy Dunlap Reported-by: kernel test robot Closes: lore.kernel.org/r/202311060203.yQrpPZhm-lkp@intel.com Acked-by: Juergen Gross Cc: Juergen Gross Cc: Stefano Stabellini Cc: Oleksandr Tyshchenko Cc: xen-devel@lists.xenproject.org Link: https://lore.kernel.org/r/20231106055631.21520-1-rdunlap@infradead.org Signed-off-by: Juergen Gross --- drivers/xen/xen-front-pgdir-shbuf.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/drivers/xen/xen-front-pgdir-shbuf.c b/drivers/xen/xen-front-pgdir-shbuf.c index b52e0fa595a9..223870a0111b 100644 --- a/drivers/xen/xen-front-pgdir-shbuf.c +++ b/drivers/xen/xen-front-pgdir-shbuf.c @@ -21,7 +21,7 @@ #include -/** +/* * This structure represents the structure of a shared page * that contains grant references to the pages of the shared * buffer. This structure is common to many Xen para-virtualized @@ -33,7 +33,7 @@ struct xen_page_directory { grant_ref_t gref[]; /* Variable length */ }; -/** +/* * Shared buffer ops which are differently implemented * depending on the allocation mode, e.g. if the buffer * is allocated by the corresponding backend or frontend. @@ -61,7 +61,7 @@ struct xen_front_pgdir_shbuf_ops { int (*unmap)(struct xen_front_pgdir_shbuf *buf); }; -/** +/* * Get granted reference to the very first page of the * page directory. Usually this is passed to the backend, * so it can find/fill the grant references to the buffer's @@ -81,7 +81,7 @@ xen_front_pgdir_shbuf_get_dir_start(struct xen_front_pgdir_shbuf *buf) } EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_get_dir_start); -/** +/* * Map granted references of the shared buffer. 
* * Depending on the shared buffer mode of allocation @@ -102,7 +102,7 @@ int xen_front_pgdir_shbuf_map(struct xen_front_pgdir_shbuf *buf) } EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_map); -/** +/* * Unmap granted references of the shared buffer. * * Depending on the shared buffer mode of allocation @@ -123,7 +123,7 @@ int xen_front_pgdir_shbuf_unmap(struct xen_front_pgdir_shbuf *buf) } EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_unmap); -/** +/* * Free all the resources of the shared buffer. * * \param buf shared buffer which resources to be freed. @@ -150,7 +150,7 @@ EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_free); offsetof(struct xen_page_directory, \ gref)) / sizeof(grant_ref_t)) -/** +/* * Get the number of pages the page directory consumes itself. * * \param buf shared buffer. @@ -160,7 +160,7 @@ static int get_num_pages_dir(struct xen_front_pgdir_shbuf *buf) return DIV_ROUND_UP(buf->num_pages, XEN_NUM_GREFS_PER_PAGE); } -/** +/* * Calculate the number of grant references needed to share the buffer * and its pages when backend allocates the buffer. * @@ -172,7 +172,7 @@ static void backend_calc_num_grefs(struct xen_front_pgdir_shbuf *buf) buf->num_grefs = get_num_pages_dir(buf); } -/** +/* * Calculate the number of grant references needed to share the buffer * and its pages when frontend allocates the buffer. * @@ -190,7 +190,7 @@ static void guest_calc_num_grefs(struct xen_front_pgdir_shbuf *buf) #define xen_page_to_vaddr(page) \ ((uintptr_t)pfn_to_kaddr(page_to_xen_pfn(page))) -/** +/* * Unmap the buffer previously mapped with grant references * provided by the backend. * @@ -238,7 +238,7 @@ static int backend_unmap(struct xen_front_pgdir_shbuf *buf) return ret; } -/** +/* * Map the buffer with grant references provided by the backend. * * \param buf shared buffer. @@ -320,7 +320,7 @@ static int backend_map(struct xen_front_pgdir_shbuf *buf) return ret; } -/** +/* * Fill page directory with grant references to the pages of the * page directory itself. * @@ -350,7 +350,7 @@ static void backend_fill_page_dir(struct xen_front_pgdir_shbuf *buf) page_dir->gref_dir_next_page = XEN_GREF_LIST_END; } -/** +/* * Fill page directory with grant references to the pages of the * page directory and the buffer we share with the backend. * @@ -389,7 +389,7 @@ static void guest_fill_page_dir(struct xen_front_pgdir_shbuf *buf) } } -/** +/* * Grant references to the frontend's buffer pages. * * These will be shared with the backend, so it can @@ -418,7 +418,7 @@ static int guest_grant_refs_for_buffer(struct xen_front_pgdir_shbuf *buf, return 0; } -/** +/* * Grant all the references needed to share the buffer. * * Grant references to the page directory pages and, if @@ -466,7 +466,7 @@ static int grant_references(struct xen_front_pgdir_shbuf *buf) return 0; } -/** +/* * Allocate all required structures to mange shared buffer. * * \param buf shared buffer. @@ -506,7 +506,7 @@ static const struct xen_front_pgdir_shbuf_ops local_ops = { .grant_refs_for_buffer = guest_grant_refs_for_buffer, }; -/** +/* * Allocate a new instance of a shared buffer. * * \param cfg configuration to be used while allocating a new shared buffer. -- cgit From 472a2ff63efb30234cbf6b2cdaf8117f21b4f8bc Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Fri, 10 Nov 2023 17:37:07 +0800 Subject: net: hns3: fix add VLAN fail issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hclge_sync_vlan_filter is called in periodic task, trying to remove VLAN from vlan_del_fail_bmap. 
It can be concurrence with VLAN adding operation from user. So once user failed to delete a VLAN id, and add it again soon, it may be removed by the periodic task, which may cause the software configuration being inconsistent with hardware. So add mutex handling to avoid this. user hns3 driver periodic task │ add vlan 10 ───── hns3_vlan_rx_add_vid │ │ (suppose success) │ │ │ del vlan 10 ───── hns3_vlan_rx_kill_vid │ │ (suppose fail,add to │ │ vlan_del_fail_bmap) │ │ │ add vlan 10 ───── hns3_vlan_rx_add_vid │ (suppose success) │ foreach vlan_del_fail_bmp del vlan 10 Fixes: fe4144d47eef ("net: hns3: sync VLAN filter entries when kill VLAN ID failed") Signed-off-by: Jian Shen Signed-off-by: Jijie Shao Signed-off-by: David S. Miller --- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 28 +++++++++++++++------- .../ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 11 +++++++-- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 66e5807903a0..e22279e5d43f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -10025,8 +10025,6 @@ static void hclge_rm_vport_vlan_table(struct hclge_vport *vport, u16 vlan_id, struct hclge_vport_vlan_cfg *vlan, *tmp; struct hclge_dev *hdev = vport->back; - mutex_lock(&hdev->vport_lock); - list_for_each_entry_safe(vlan, tmp, &vport->vlan_list, node) { if (vlan->vlan_id == vlan_id) { if (is_write_tbl && vlan->hd_tbl_status) @@ -10041,8 +10039,6 @@ static void hclge_rm_vport_vlan_table(struct hclge_vport *vport, u16 vlan_id, break; } } - - mutex_unlock(&hdev->vport_lock); } void hclge_rm_vport_all_vlan_table(struct hclge_vport *vport, bool is_del_list) @@ -10451,11 +10447,16 @@ int hclge_set_vlan_filter(struct hnae3_handle *handle, __be16 proto, * handle mailbox. Just record the vlan id, and remove it after * reset finished. */ + mutex_lock(&hdev->vport_lock); if ((test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state) || test_bit(HCLGE_STATE_RST_FAIL, &hdev->state)) && is_kill) { set_bit(vlan_id, vport->vlan_del_fail_bmap); + mutex_unlock(&hdev->vport_lock); return -EBUSY; + } else if (!is_kill && test_bit(vlan_id, vport->vlan_del_fail_bmap)) { + clear_bit(vlan_id, vport->vlan_del_fail_bmap); } + mutex_unlock(&hdev->vport_lock); /* when port base vlan enabled, we use port base vlan as the vlan * filter entry. 
In this case, we don't update vlan filter table @@ -10470,17 +10471,22 @@ int hclge_set_vlan_filter(struct hnae3_handle *handle, __be16 proto, } if (!ret) { - if (!is_kill) + if (!is_kill) { hclge_add_vport_vlan_table(vport, vlan_id, writen_to_tbl); - else if (is_kill && vlan_id != 0) + } else if (is_kill && vlan_id != 0) { + mutex_lock(&hdev->vport_lock); hclge_rm_vport_vlan_table(vport, vlan_id, false); + mutex_unlock(&hdev->vport_lock); + } } else if (is_kill) { /* when remove hw vlan filter failed, record the vlan id, * and try to remove it from hw later, to be consistence * with stack */ + mutex_lock(&hdev->vport_lock); set_bit(vlan_id, vport->vlan_del_fail_bmap); + mutex_unlock(&hdev->vport_lock); } hclge_set_vport_vlan_fltr_change(vport); @@ -10520,6 +10526,7 @@ static void hclge_sync_vlan_filter(struct hclge_dev *hdev) int i, ret, sync_cnt = 0; u16 vlan_id; + mutex_lock(&hdev->vport_lock); /* start from vport 1 for PF is always alive */ for (i = 0; i < hdev->num_alloc_vport; i++) { struct hclge_vport *vport = &hdev->vport[i]; @@ -10530,21 +10537,26 @@ static void hclge_sync_vlan_filter(struct hclge_dev *hdev) ret = hclge_set_vlan_filter_hw(hdev, htons(ETH_P_8021Q), vport->vport_id, vlan_id, true); - if (ret && ret != -EINVAL) + if (ret && ret != -EINVAL) { + mutex_unlock(&hdev->vport_lock); return; + } clear_bit(vlan_id, vport->vlan_del_fail_bmap); hclge_rm_vport_vlan_table(vport, vlan_id, false); hclge_set_vport_vlan_fltr_change(vport); sync_cnt++; - if (sync_cnt >= HCLGE_MAX_SYNC_COUNT) + if (sync_cnt >= HCLGE_MAX_SYNC_COUNT) { + mutex_unlock(&hdev->vport_lock); return; + } vlan_id = find_first_bit(vport->vlan_del_fail_bmap, VLAN_N_VID); } } + mutex_unlock(&hdev->vport_lock); hclge_sync_vlan_fltr_state(hdev); } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index a4d68fb216fb..1c62e58ff6d8 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -1206,6 +1206,8 @@ static int hclgevf_set_vlan_filter(struct hnae3_handle *handle, test_bit(HCLGEVF_STATE_RST_FAIL, &hdev->state)) && is_kill) { set_bit(vlan_id, hdev->vlan_del_fail_bmap); return -EBUSY; + } else if (!is_kill && test_bit(vlan_id, hdev->vlan_del_fail_bmap)) { + clear_bit(vlan_id, hdev->vlan_del_fail_bmap); } hclgevf_build_send_msg(&send_msg, HCLGE_MBX_SET_VLAN, @@ -1233,20 +1235,25 @@ static void hclgevf_sync_vlan_filter(struct hclgevf_dev *hdev) int ret, sync_cnt = 0; u16 vlan_id; + if (bitmap_empty(hdev->vlan_del_fail_bmap, VLAN_N_VID)) + return; + + rtnl_lock(); vlan_id = find_first_bit(hdev->vlan_del_fail_bmap, VLAN_N_VID); while (vlan_id != VLAN_N_VID) { ret = hclgevf_set_vlan_filter(handle, htons(ETH_P_8021Q), vlan_id, true); if (ret) - return; + break; clear_bit(vlan_id, hdev->vlan_del_fail_bmap); sync_cnt++; if (sync_cnt >= HCLGEVF_MAX_SYNC_COUNT) - return; + break; vlan_id = find_first_bit(hdev->vlan_del_fail_bmap, VLAN_N_VID); } + rtnl_unlock(); } static int hclgevf_en_hw_strip_rxvtag(struct hnae3_handle *handle, bool enable) -- cgit From ac92c0a9a0603fb448e60f38e63302e4eebb8035 Mon Sep 17 00:00:00 2001 From: Yonglong Liu Date: Fri, 10 Nov 2023 17:37:08 +0800 Subject: net: hns3: add barrier in vf mailbox reply process In hclgevf_mbx_handler() and hclgevf_get_mbx_resp() functions, there is a typical store-store and load-load scenario between received_resp and additional_info. This patch adds barrier to fix the problem. 
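The ordering the patch enforces is the standard publish/consume pairing: the writer must make the payload (additional_info) visible before the flag (received_resp), and the reader must check the flag before touching the payload. The sketch below is illustrative only; the structure and function names are invented rather than taken from the hclgevf driver, and it assumes a kernel context where smp_wmb()/smp_rmb() and READ_ONCE()/WRITE_ONCE() are available.

/* Illustrative pairing only; names are hypothetical, not from hclgevf. */
struct demo_resp {
        u8 data[32];            /* stands in for additional_info */
        bool ready;             /* stands in for received_resp */
};

/* producer side, e.g. the mailbox handler */
static void demo_publish(struct demo_resp *r, const u8 *src, size_t len)
{
        memcpy(r->data, src, len);
        smp_wmb();                      /* payload visible before the flag */
        WRITE_ONCE(r->ready, true);
}

/* consumer side, e.g. the polling reader */
static bool demo_consume(struct demo_resp *r, u8 *dst, size_t len)
{
        if (!READ_ONCE(r->ready))
                return false;
        smp_rmb();                      /* pairs with smp_wmb() above */
        memcpy(dst, r->data, len);
        return true;
}

Without the paired barriers, a weakly ordered CPU can observe ready == true while still reading stale bytes from data, which is the symptom the commit message describes.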
Fixes: 4671042f1ef0 ("net: hns3: add match_id to check mailbox response from PF to VF") Signed-off-by: Yonglong Liu Signed-off-by: Jijie Shao Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c index bbf7b14079de..85c2a634c8f9 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c @@ -63,6 +63,9 @@ static int hclgevf_get_mbx_resp(struct hclgevf_dev *hdev, u16 code0, u16 code1, i++; } + /* ensure additional_info will be seen after received_resp */ + smp_rmb(); + if (i >= HCLGEVF_MAX_TRY_TIMES) { dev_err(&hdev->pdev->dev, "VF could not get mbx(%u,%u) resp(=%d) from PF in %d tries\n", @@ -178,6 +181,10 @@ static void hclgevf_handle_mbx_response(struct hclgevf_dev *hdev, resp->resp_status = hclgevf_resp_to_errno(resp_status); memcpy(resp->additional_info, req->msg.resp_data, HCLGE_MBX_MAX_RESP_DATA_SIZE * sizeof(u8)); + + /* ensure additional_info will be seen before setting received_resp */ + smp_wmb(); + if (match_id) { /* If match_id is not zero, it means PF support match_id. * if the match_id is right, VF get the right response, or -- cgit From 75b247b57d8b71bcb679e4cb37d0db104848806c Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Fri, 10 Nov 2023 17:37:09 +0800 Subject: net: hns3: fix incorrect capability bit display for copper port Currently, the FEC capability bit is set by default for device version V2. It's incorrect for the copper port. Even though it doesn't make the NIC work abnormally, the capability information displayed in debugfs may confuse users. So clear it when the driver gets the port type information. Fixes: 433ccce83504 ("net: hns3: use FEC capability queried from firmware") Signed-off-by: Jian Shen Signed-off-by: Jijie Shao Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index e22279e5d43f..c393b4ee4a32 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -11663,6 +11663,7 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) goto err_msi_irq_uninit; if (hdev->hw.mac.media_type == HNAE3_MEDIA_TYPE_COPPER) { + clear_bit(HNAE3_DEV_SUPPORT_FEC_B, ae_dev->caps); if (hnae3_dev_phy_imp_supported(hdev)) ret = hclge_update_tp_port_info(hdev); else -- cgit From 53aba458f23846112c0d44239580ff59bc5c36c3 Mon Sep 17 00:00:00 2001 From: Yonglong Liu Date: Fri, 10 Nov 2023 17:37:10 +0800 Subject: net: hns3: fix out-of-bounds access may occur when coalesce info is read via debugfs The hns3 driver defines arrays of strings to show the coalesce info, but if the kernel adds a new mode or a new state, out-of-bounds access may occur when the coalesce info is read via debugfs; this patch fixes the problem. Fixes: c99fead7cb07 ("net: hns3: add debugfs support for interrupt coalesce") Signed-off-by: Yonglong Liu Signed-off-by: Jijie Shao Signed-off-by: David S.
Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index 0b138635bafa..c083d1d10767 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -503,11 +503,14 @@ static void hns3_get_coal_info(struct hns3_enet_tqp_vector *tqp_vector, } sprintf(result[j++], "%d", i); - sprintf(result[j++], "%s", dim_state_str[dim->state]); + sprintf(result[j++], "%s", dim->state < ARRAY_SIZE(dim_state_str) ? + dim_state_str[dim->state] : "unknown"); sprintf(result[j++], "%u", dim->profile_ix); - sprintf(result[j++], "%s", dim_cqe_mode_str[dim->mode]); + sprintf(result[j++], "%s", dim->mode < ARRAY_SIZE(dim_cqe_mode_str) ? + dim_cqe_mode_str[dim->mode] : "unknown"); sprintf(result[j++], "%s", - dim_tune_stat_str[dim->tune_state]); + dim->tune_state < ARRAY_SIZE(dim_tune_stat_str) ? + dim_tune_stat_str[dim->tune_state] : "unknown"); sprintf(result[j++], "%u", dim->steps_left); sprintf(result[j++], "%u", dim->steps_right); sprintf(result[j++], "%u", dim->tired); -- cgit From dbd2f3b20c6ae425665b6975d766e3653d453e73 Mon Sep 17 00:00:00 2001 From: Yonglong Liu Date: Fri, 10 Nov 2023 17:37:11 +0800 Subject: net: hns3: fix variable may not initialized problem in hns3_init_mac_addr() When a VF is calling hns3_init_mac_addr(), get_mac_addr() may return fail, then the value of mac_addr_temp is not initialized. Fixes: 76ad4f0ee747 ("net: hns3: Add support of HNS3 Ethernet Driver for hip08 SoC") Signed-off-by: Yonglong Liu Signed-off-by: Jijie Shao Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 06117502001f..b618797a7e8d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -5139,7 +5139,7 @@ static int hns3_init_mac_addr(struct net_device *netdev) struct hns3_nic_priv *priv = netdev_priv(netdev); char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN]; struct hnae3_handle *h = priv->ae_handle; - u8 mac_addr_temp[ETH_ALEN]; + u8 mac_addr_temp[ETH_ALEN] = {0}; int ret = 0; if (h->ae_algo->ops->get_mac_addr) -- cgit From 65e98bb56fa3ce2edb400930c05238c9b380500e Mon Sep 17 00:00:00 2001 From: Jijie Shao Date: Fri, 10 Nov 2023 17:37:12 +0800 Subject: net: hns3: fix VF reset fail issue Currently the reset process in hns3 and firmware watchdog init process is asynchronous. We think firmware watchdog initialization is completed before VF clear the interrupt source. However, firmware initialization may not complete early. So VF will receive multiple reset interrupts and fail to reset. So we add delay before VF interrupt source and 5 ms delay is enough to avoid second reset interrupt. Fixes: 427900d27d86 ("net: hns3: fix the timing issue of VF clearing interrupt sources") Signed-off-by: Jijie Shao Signed-off-by: David S. 
Miller --- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 14 +++++++++++++- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h | 1 + 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 1c62e58ff6d8..0aa9beefd1c7 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -1981,8 +1981,18 @@ static enum hclgevf_evt_cause hclgevf_check_evt_cause(struct hclgevf_dev *hdev, return HCLGEVF_VECTOR0_EVENT_OTHER; } +static void hclgevf_reset_timer(struct timer_list *t) +{ + struct hclgevf_dev *hdev = from_timer(hdev, t, reset_timer); + + hclgevf_clear_event_cause(hdev, HCLGEVF_VECTOR0_EVENT_RST); + hclgevf_reset_task_schedule(hdev); +} + static irqreturn_t hclgevf_misc_irq_handle(int irq, void *data) { +#define HCLGEVF_RESET_DELAY 5 + enum hclgevf_evt_cause event_cause; struct hclgevf_dev *hdev = data; u32 clearval; @@ -1994,7 +2004,8 @@ static irqreturn_t hclgevf_misc_irq_handle(int irq, void *data) switch (event_cause) { case HCLGEVF_VECTOR0_EVENT_RST: - hclgevf_reset_task_schedule(hdev); + mod_timer(&hdev->reset_timer, + jiffies + msecs_to_jiffies(HCLGEVF_RESET_DELAY)); break; case HCLGEVF_VECTOR0_EVENT_MBX: hclgevf_mbx_handler(hdev); @@ -2937,6 +2948,7 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev) HCLGEVF_DRIVER_NAME); hclgevf_task_schedule(hdev, round_jiffies_relative(HZ)); + timer_setup(&hdev->reset_timer, hclgevf_reset_timer, 0); return 0; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h index 81c16b8c8da2..a73f2bf3a56a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h @@ -219,6 +219,7 @@ struct hclgevf_dev { enum hnae3_reset_type reset_level; unsigned long reset_pending; enum hnae3_reset_type reset_type; + struct timer_list reset_timer; #define HCLGEVF_RESET_REQUESTED 0 #define HCLGEVF_RESET_PENDING 1 -- cgit From dff655e82faffc287d4a72a59f66fa120bf904e4 Mon Sep 17 00:00:00 2001 From: Jijie Shao Date: Fri, 10 Nov 2023 17:37:13 +0800 Subject: net: hns3: fix VF wrong speed and duplex issue If PF is down, firmware will returns 10 Mbit/s rate and half-duplex mode when PF queries the port information from firmware. After imp reset command is executed, PF status changes to down, and PF will query link status and updates port information from firmware in a periodic scheduled task. However, there is a low probability that port information is updated when PF is down, and then PF link status changes to up. In this case, PF synchronizes incorrect rate and duplex mode to VF. This patch fixes it by updating port information before PF synchronizes the rate and duplex to the VF when PF changes to up. Fixes: 18b6e31f8bf4 ("net: hns3: PF add support for pushing link status to VFs") Signed-off-by: Jijie Shao Signed-off-by: David S. 
Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index c393b4ee4a32..5ea9e59569ef 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -61,6 +61,7 @@ static void hclge_sync_fd_table(struct hclge_dev *hdev); static void hclge_update_fec_stats(struct hclge_dev *hdev); static int hclge_mac_link_status_wait(struct hclge_dev *hdev, int link_ret, int wait_cnt); +static int hclge_update_port_info(struct hclge_dev *hdev); static struct hnae3_ae_algo ae_algo; @@ -3041,6 +3042,9 @@ static void hclge_update_link_status(struct hclge_dev *hdev) if (state != hdev->hw.mac.link) { hdev->hw.mac.link = state; + if (state == HCLGE_LINK_STATUS_UP) + hclge_update_port_info(hdev); + client->ops->link_status_change(handle, state); hclge_config_mac_tnl_int(hdev, state); if (rclient && rclient->ops->link_status_change) -- cgit From f726eaa787e9f9bc858c902d18a09af6bcbfcdaf Mon Sep 17 00:00:00 2001 From: Jan Bottorff Date: Thu, 9 Nov 2023 03:19:27 +0000 Subject: i2c: designware: Fix corrupted memory seen in the ISR When running on a many core ARM64 server, errors were happening in the ISR that looked like corrupted memory. These corruptions would fix themselves if small delays were inserted in the ISR. Errors reported by the driver included "i2c_designware APMC0D0F:00: i2c_dw_xfer_msg: invalid target address" and "i2c_designware APMC0D0F:00:controller timed out" during in-band IPMI SSIF stress tests. The problem was determined to be memory writes in the driver were not becoming visible to all cores when execution rapidly shifted between cores, like when a register write immediately triggers an ISR. Processors with weak memory ordering, like ARM64, make no guarantees about the order normal memory writes become globally visible, unless barrier instructions are used to control ordering. To solve this, regmap accessor functions configured by this driver were changed to use non-relaxed forms of the low-level register access functions, which include a barrier on platforms that require it. This assures memory writes before a controller register access are visible to all cores. The community concluded defaulting to correct operation outweighed defaulting to the small performance gains from using relaxed access functions. Being a low speed device added weight to this choice of default register access behavior. 
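For context, the distinction the commit relies on is that readl()/writel() include ordering barriers that the _relaxed variants omit, so ordinary memory stores issued before a register write are guaranteed to be visible to other cores by the time the device reacts, for example by raising the interrupt that runs the ISR. A hedged sketch of the two behaviours follows; the device structure and register offset are made up for illustration and are not the designware driver's.

/* Hypothetical device; only the ordering semantics matter here. */
struct demo_dev {
        void __iomem *base;
        u32 msg;                /* normal memory consumed by the ISR */
};

static void demo_kick_relaxed(struct demo_dev *dev)
{
        dev->msg = 0x1234;
        /* No barrier: on arm64 the store to dev->msg may not yet be
         * visible to the CPU that services the resulting interrupt. */
        writel_relaxed(0x1, dev->base + 0x10);
}

static void demo_kick_ordered(struct demo_dev *dev)
{
        dev->msg = 0x1234;
        /* writel() orders prior memory writes before the MMIO store,
         * which is what switching the regmap accessors achieves. */
        writel(0x1, dev->base + 0x10);
}

Using the non-relaxed accessors in the regmap callbacks trades a little performance for this guarantee, which is acceptable for a low speed bus such as I2C.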
Signed-off-by: Jan Bottorff Acked-by: Jarkko Nikula Tested-by: Serge Semin Reviewed-by: Serge Semin Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-common.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-common.c b/drivers/i2c/busses/i2c-designware-common.c index affcfb243f0f..35f762872b8a 100644 --- a/drivers/i2c/busses/i2c-designware-common.c +++ b/drivers/i2c/busses/i2c-designware-common.c @@ -63,7 +63,7 @@ static int dw_reg_read(void *context, unsigned int reg, unsigned int *val) { struct dw_i2c_dev *dev = context; - *val = readl_relaxed(dev->base + reg); + *val = readl(dev->base + reg); return 0; } @@ -72,7 +72,7 @@ static int dw_reg_write(void *context, unsigned int reg, unsigned int val) { struct dw_i2c_dev *dev = context; - writel_relaxed(val, dev->base + reg); + writel(val, dev->base + reg); return 0; } @@ -81,7 +81,7 @@ static int dw_reg_read_swab(void *context, unsigned int reg, unsigned int *val) { struct dw_i2c_dev *dev = context; - *val = swab32(readl_relaxed(dev->base + reg)); + *val = swab32(readl(dev->base + reg)); return 0; } @@ -90,7 +90,7 @@ static int dw_reg_write_swab(void *context, unsigned int reg, unsigned int val) { struct dw_i2c_dev *dev = context; - writel_relaxed(swab32(val), dev->base + reg); + writel(swab32(val), dev->base + reg); return 0; } @@ -99,8 +99,8 @@ static int dw_reg_read_word(void *context, unsigned int reg, unsigned int *val) { struct dw_i2c_dev *dev = context; - *val = readw_relaxed(dev->base + reg) | - (readw_relaxed(dev->base + reg + 2) << 16); + *val = readw(dev->base + reg) | + (readw(dev->base + reg + 2) << 16); return 0; } @@ -109,8 +109,8 @@ static int dw_reg_write_word(void *context, unsigned int reg, unsigned int val) { struct dw_i2c_dev *dev = context; - writew_relaxed(val, dev->base + reg); - writew_relaxed(val >> 16, dev->base + reg + 2); + writew(val, dev->base + reg); + writew(val >> 16, dev->base + reg + 2); return 0; } -- cgit From 6979a51ecaec4dfb7c768eb2a77b77df73a74c8e Mon Sep 17 00:00:00 2001 From: MD Danish Anwar Date: Mon, 13 Nov 2023 15:16:56 +0530 Subject: MAINTAINERS: add entry for TI ICSSG Ethernet driver Add record for TI Industrial Communication Subsystem - Gigabit (ICSSG) Ethernet driver. Also add Roger and myself as maintainer. Signed-off-by: MD Danish Anwar Signed-off-by: David S. Miller --- MAINTAINERS | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index e3acb36989f0..0443f4d9f736 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21795,6 +21795,15 @@ F: Documentation/devicetree/bindings/media/i2c/ti,ds90* F: drivers/media/i2c/ds90* F: include/media/i2c/ds90* +TI ICSSG ETHERNET DRIVER (ICSSG) +R: MD Danish Anwar +R: Roger Quadros +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) +L: netdev@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/net/ti,icss*.yaml +F: drivers/net/ethernet/ti/icssg/* + TI J721E CSI2RX DRIVER M: Jai Luthra L: linux-media@vger.kernel.org -- cgit From 438cbcdf105d84449fceb39a2d0e16d0ec20708f Mon Sep 17 00:00:00 2001 From: Marek Behún Date: Fri, 10 Nov 2023 13:05:46 +0100 Subject: net: mdio: fix typo in header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The quotes symbol in "EEE "link partner ability 1 should be at the end of the register name "EEE link partner ability 1" Signed-off-by: Marek Behún Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- include/linux/mdio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 8fa23bdcedbf..007fd9c3e4b6 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -420,7 +420,7 @@ static inline u32 linkmode_adv_to_mii_t1_adv_m_t(unsigned long *advertising) * A function that translates value of following registers to the linkmode: * IEEE 802.3-2018 45.2.3.10 "EEE control and capability 1" register (3.20) * IEEE 802.3-2018 45.2.7.13 "EEE advertisement 1" register (7.60) - * IEEE 802.3-2018 45.2.7.14 "EEE "link partner ability 1 register (7.61) + * IEEE 802.3-2018 45.2.7.14 "EEE link partner ability 1" register (7.61) */ static inline void mii_eee_cap1_mod_linkmode_t(unsigned long *adv, u32 val) { -- cgit From e6daf129ccb79d3781129f623f82bc676f2cb02c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 10 Nov 2023 10:36:00 -0500 Subject: net: gso_test: support CONFIG_MAX_SKB_FRAGS up to 45 The test allocs a single page to hold all the frag_list skbs. This is insufficient on kernels with CONFIG_MAX_SKB_FRAGS=45, due to the increased skb_shared_info frags[] array length. gso_test_func: ASSERTION FAILED at net/core/gso_test.c:210 Expected alloc_size <= ((1UL) << 12), but alloc_size == 5075 (0x13d3) ((1UL) << 12) == 4096 (0x1000) Simplify the logic. Just allocate a page for each frag_list skb. Fixes: 4688ecb1385f ("net: expand skb_segment unit test with frag_list coverage") Signed-off-by: Willem de Bruijn Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/core/gso_test.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/net/core/gso_test.c b/net/core/gso_test.c index ceb684be4cbf..4c2e77bd12f4 100644 --- a/net/core/gso_test.c +++ b/net/core/gso_test.c @@ -180,18 +180,17 @@ static void gso_test_func(struct kunit *test) } if (tcase->frag_skbs) { - unsigned int total_size = 0, total_true_size = 0, alloc_size = 0; + unsigned int total_size = 0, total_true_size = 0; struct sk_buff *frag_skb, *prev = NULL; - page = alloc_page(GFP_KERNEL); - KUNIT_ASSERT_NOT_NULL(test, page); - page_ref_add(page, tcase->nr_frag_skbs - 1); - for (i = 0; i < tcase->nr_frag_skbs; i++) { unsigned int frag_size; + page = alloc_page(GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, page); + frag_size = tcase->frag_skbs[i]; - frag_skb = build_skb(page_address(page) + alloc_size, + frag_skb = build_skb(page_address(page), frag_size + shinfo_size); KUNIT_ASSERT_NOT_NULL(test, frag_skb); __skb_put(frag_skb, frag_size); @@ -204,11 +203,8 @@ static void gso_test_func(struct kunit *test) total_size += frag_size; total_true_size += frag_skb->truesize; - alloc_size += frag_size + shinfo_size; } - KUNIT_ASSERT_LE(test, alloc_size, PAGE_SIZE); - skb->len += total_size; skb->data_len += total_size; skb->truesize += total_true_size; -- cgit From fb317eb23b5ee4c37b0656a9a52a3db58d9dd072 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Sat, 11 Nov 2023 01:39:47 +0900 Subject: tipc: Fix kernel-infoleak due to uninitialized TLV value KMSAN reported the following kernel-infoleak issue: ===================================================== BUG: KMSAN: kernel-infoleak in instrument_copy_to_user include/linux/instrumented.h:114 [inline] BUG: KMSAN: kernel-infoleak in copy_to_user_iter lib/iov_iter.c:24 [inline] BUG: KMSAN: kernel-infoleak in iterate_ubuf include/linux/iov_iter.h:29 [inline] BUG: KMSAN: kernel-infoleak in iterate_and_advance2 include/linux/iov_iter.h:245 [inline] BUG: KMSAN: kernel-infoleak in 
iterate_and_advance include/linux/iov_iter.h:271 [inline] BUG: KMSAN: kernel-infoleak in _copy_to_iter+0x4ec/0x2bc0 lib/iov_iter.c:186 instrument_copy_to_user include/linux/instrumented.h:114 [inline] copy_to_user_iter lib/iov_iter.c:24 [inline] iterate_ubuf include/linux/iov_iter.h:29 [inline] iterate_and_advance2 include/linux/iov_iter.h:245 [inline] iterate_and_advance include/linux/iov_iter.h:271 [inline] _copy_to_iter+0x4ec/0x2bc0 lib/iov_iter.c:186 copy_to_iter include/linux/uio.h:197 [inline] simple_copy_to_iter net/core/datagram.c:532 [inline] __skb_datagram_iter.5+0x148/0xe30 net/core/datagram.c:420 skb_copy_datagram_iter+0x52/0x210 net/core/datagram.c:546 skb_copy_datagram_msg include/linux/skbuff.h:3960 [inline] netlink_recvmsg+0x43d/0x1630 net/netlink/af_netlink.c:1967 sock_recvmsg_nosec net/socket.c:1044 [inline] sock_recvmsg net/socket.c:1066 [inline] __sys_recvfrom+0x476/0x860 net/socket.c:2246 __do_sys_recvfrom net/socket.c:2264 [inline] __se_sys_recvfrom net/socket.c:2260 [inline] __x64_sys_recvfrom+0x130/0x200 net/socket.c:2260 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Uninit was created at: slab_post_alloc_hook+0x103/0x9e0 mm/slab.h:768 slab_alloc_node mm/slub.c:3478 [inline] kmem_cache_alloc_node+0x5f7/0xb50 mm/slub.c:3523 kmalloc_reserve+0x13c/0x4a0 net/core/skbuff.c:560 __alloc_skb+0x2fd/0x770 net/core/skbuff.c:651 alloc_skb include/linux/skbuff.h:1286 [inline] tipc_tlv_alloc net/tipc/netlink_compat.c:156 [inline] tipc_get_err_tlv+0x90/0x5d0 net/tipc/netlink_compat.c:170 tipc_nl_compat_recv+0x1042/0x15d0 net/tipc/netlink_compat.c:1324 genl_family_rcv_msg_doit net/netlink/genetlink.c:972 [inline] genl_family_rcv_msg net/netlink/genetlink.c:1052 [inline] genl_rcv_msg+0x1220/0x12c0 net/netlink/genetlink.c:1067 netlink_rcv_skb+0x4a4/0x6a0 net/netlink/af_netlink.c:2545 genl_rcv+0x41/0x60 net/netlink/genetlink.c:1076 netlink_unicast_kernel net/netlink/af_netlink.c:1342 [inline] netlink_unicast+0xf4b/0x1230 net/netlink/af_netlink.c:1368 netlink_sendmsg+0x1242/0x1420 net/netlink/af_netlink.c:1910 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] ____sys_sendmsg+0x997/0xd60 net/socket.c:2588 ___sys_sendmsg+0x271/0x3b0 net/socket.c:2642 __sys_sendmsg net/socket.c:2671 [inline] __do_sys_sendmsg net/socket.c:2680 [inline] __se_sys_sendmsg net/socket.c:2678 [inline] __x64_sys_sendmsg+0x2fa/0x4a0 net/socket.c:2678 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Bytes 34-35 of 36 are uninitialized Memory access of size 36 starts at ffff88802d464a00 Data copied to user address 00007ff55033c0a0 CPU: 0 PID: 30322 Comm: syz-executor.0 Not tainted 6.6.0-14500-g1c41041124bd #10 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-1.fc38 04/01/2014 ===================================================== tipc_add_tlv() puts TLV descriptor and value onto `skb`. This size is calculated with TLV_SPACE() macro. It adds the size of struct tlv_desc and the length of TLV value passed as an argument, and aligns the result to a multiple of TLV_ALIGNTO, i.e., a multiple of 4 bytes. If the size of struct tlv_desc plus the length of TLV value is not aligned, the current implementation leaves the remaining bytes uninitialized. This is the cause of the above kernel-infoleak issue. This patch resolves this issue by clearing data up to an aligned size. 
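To spell out the mechanism: TLV_SPACE() rounds the descriptor-plus-value size up to a multiple of TLV_ALIGNTO (4 bytes), skb_put() reserves that rounded size, and nothing ever writes to the rounding slack, so those bytes reach user space uninitialized. The sketch below re-derives the macros and a simplified descriptor from the description above; it is not the exact TIPC layout.

/* Simplified TLV layout for illustration; not the exact TIPC definitions. */
struct demo_tlv_desc {
        __be16 tlv_type;
        __be16 tlv_len;
};

#define DEMO_TLV_ALIGNTO        4U
#define DEMO_TLV_ALIGN(x)       ALIGN(x, DEMO_TLV_ALIGNTO)
#define DEMO_TLV_LENGTH(len)    (sizeof(struct demo_tlv_desc) + (len))
#define DEMO_TLV_SPACE(len)     DEMO_TLV_ALIGN(DEMO_TLV_LENGTH(len))

static void demo_add_tlv(u8 *tail, u16 type, const void *data, u16 len)
{
        struct demo_tlv_desc *tlv = (struct demo_tlv_desc *)tail;

        /* The fix: clear the whole aligned span, including the padding
         * bytes that DEMO_TLV_SPACE() adds beyond DEMO_TLV_LENGTH(). */
        memset(tail, 0, DEMO_TLV_SPACE(len));

        tlv->tlv_type = htons(type);
        tlv->tlv_len = htons(DEMO_TLV_LENGTH(len));
        if (len && data)
                memcpy(tail + sizeof(*tlv), data, len);
}

Without the memset, bytes DEMO_TLV_LENGTH(len) through DEMO_TLV_SPACE(len) - 1 keep whatever the allocator left behind; for a 30-byte value that is bytes 34 and 35 of a 36-byte TLV, matching the "Bytes 34-35 of 36 are uninitialized" line in the KMSAN report.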
Fixes: d0796d1ef63d ("tipc: convert legacy nl bearer dump to nl compat") Signed-off-by: Shigeru Yoshida Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/tipc/netlink_compat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 5bc076f2fa74..c763008a8adb 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -102,6 +102,7 @@ static int tipc_add_tlv(struct sk_buff *skb, u16 type, void *data, u16 len) return -EMSGSIZE; skb_put(skb, TLV_SPACE(len)); + memset(tlv, 0, TLV_SPACE(len)); tlv->tlv_type = htons(type); tlv->tlv_len = htons(TLV_LENGTH(len)); if (len && data) -- cgit From ca8add922f9c7f6e2e3c71039da8e0dcc64b87ed Mon Sep 17 00:00:00 2001 From: Sven Auhagen Date: Sat, 11 Nov 2023 05:41:12 +0100 Subject: net: mvneta: fix calls to page_pool_get_stats Calling page_pool_get_stats in the mvneta driver without checks leads to kernel crashes. First the page pool is only available if the bm is not used. The page pool is also not allocated when the port is stopped. It can also be not allocated in case of errors. The current implementation leads to the following crash calling ethstats on a port that is down or when calling it at the wrong moment: ble to handle kernel NULL pointer dereference at virtual address 00000070 [00000070] *pgd=00000000 Internal error: Oops: 5 [#1] SMP ARM Hardware name: Marvell Armada 380/385 (Device Tree) PC is at page_pool_get_stats+0x18/0x1cc LR is at mvneta_ethtool_get_stats+0xa0/0xe0 [mvneta] pc : [] lr : [] psr: a0000013 sp : f1439d48 ip : f1439dc0 fp : 0000001d r10: 00000100 r9 : c4816b80 r8 : f0d75150 r7 : bf0b400c r6 : c238f000 r5 : 00000000 r4 : f1439d68 r3 : c2091040 r2 : ffffffd8 r1 : f1439d68 r0 : 00000000 Flags: NzCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none Control: 10c5387d Table: 066b004a DAC: 00000051 Register r0 information: NULL pointer Register r1 information: 2-page vmalloc region starting at 0xf1438000 allocated at kernel_clone+0x9c/0x390 Register r2 information: non-paged memory Register r3 information: slab kmalloc-2k start c2091000 pointer offset 64 size 2048 Register r4 information: 2-page vmalloc region starting at 0xf1438000 allocated at kernel_clone+0x9c/0x390 Register r5 information: NULL pointer Register r6 information: slab kmalloc-cg-4k start c238f000 pointer offset 0 size 4096 Register r7 information: 15-page vmalloc region starting at 0xbf0a8000 allocated at load_module+0xa30/0x219c Register r8 information: 1-page vmalloc region starting at 0xf0d75000 allocated at ethtool_get_stats+0x138/0x208 Register r9 information: slab task_struct start c4816b80 pointer offset 0 Register r10 information: non-paged memory Register r11 information: non-paged memory Register r12 information: 2-page vmalloc region starting at 0xf1438000 allocated at kernel_clone+0x9c/0x390 Process snmpd (pid: 733, stack limit = 0x38de3a88) Stack: (0xf1439d48 to 0xf143a000) 9d40: 000000c0 00000001 c238f000 bf0b400c f0d75150 c4816b80 9d60: 00000100 bf0a98d8 00000000 00000000 00000000 00000000 00000000 00000000 9d80: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 9da0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 9dc0: 00000dc0 5335509c 00000035 c238f000 bf0b2214 01067f50 f0d75000 c0b9b9c8 9de0: 0000001d 00000035 c2212094 5335509c c4816b80 c238f000 c5ad6e00 01067f50 9e00: c1b0be80 c4816b80 00014813 c0b9d7f0 00000000 00000000 0000001d 0000001d 9e20: 00000000 00001200 00000000 00000000 c216ed90 c73943b8 00000000 00000000 9e40: 
00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 9e60: 00000000 c0ad9034 00000000 00000000 00000000 00000000 00000000 00000000 9e80: 00000000 00000000 00000000 5335509c c1b0be80 f1439ee4 00008946 c1b0be80 9ea0: 01067f50 f1439ee3 00000000 00000046 b6d77ae0 c0b383f0 00008946 becc83e8 9ec0: c1b0be80 00000051 0000000b c68ca480 c7172d00 c0ad8ff0 f1439ee3 cf600e40 9ee0: 01600e40 32687465 00000000 00000000 00000000 01067f50 00000000 00000000 9f00: 00000000 5335509c 00008946 00008946 00000000 c68ca480 becc83e8 c05e2de0 9f20: f1439fb0 c03002f0 00000006 5ac3c35a c4816b80 00000006 b6d77ae0 c030caf0 9f40: c4817350 00000014 f1439e1c 0000000c 00000000 00000051 01000000 00000014 9f60: 00003fec f1439edc 00000001 c0372abc b6d77ae0 c0372abc cf600e40 5335509c 9f80: c21e6800 01015c9c 0000000b 00008946 00000036 c03002f0 c4816b80 00000036 9fa0: b6d77ae0 c03000c0 01015c9c 0000000b 0000000b 00008946 becc83e8 00000000 9fc0: 01015c9c 0000000b 00008946 00000036 00000035 010678a0 b6d797ec b6d77ae0 9fe0: b6dbf738 becc838c b6d186d7 b6baa858 40000030 0000000b 00000000 00000000 page_pool_get_stats from mvneta_ethtool_get_stats+0xa0/0xe0 [mvneta] mvneta_ethtool_get_stats [mvneta] from ethtool_get_stats+0x154/0x208 ethtool_get_stats from dev_ethtool+0xf48/0x2480 dev_ethtool from dev_ioctl+0x538/0x63c dev_ioctl from sock_ioctl+0x49c/0x53c sock_ioctl from sys_ioctl+0x134/0xbd8 sys_ioctl from ret_fast_syscall+0x0/0x1c Exception stack(0xf1439fa8 to 0xf1439ff0) 9fa0: 01015c9c 0000000b 0000000b 00008946 becc83e8 00000000 9fc0: 01015c9c 0000000b 00008946 00000036 00000035 010678a0 b6d797ec b6d77ae0 9fe0: b6dbf738 becc838c b6d186d7 b6baa858 Code: e28dd004 e1a05000 e2514000 0a00006a (e5902070) This commit adds the proper checks before calling page_pool_get_stats. Fixes: b3fc79225f05 ("net: mvneta: add support for page_pool_get_stats") Signed-off-by: Sven Auhagen Reported-by: Paulo Da Silva Acked-by: Lorenzo Bianconi Signed-off-by: David S. 
Miller --- drivers/net/ethernet/marvell/mvneta.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 90817136808d..29aac327574d 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -4790,14 +4790,17 @@ static void mvneta_ethtool_get_strings(struct net_device *netdev, u32 sset, u8 *data) { if (sset == ETH_SS_STATS) { + struct mvneta_port *pp = netdev_priv(netdev); int i; for (i = 0; i < ARRAY_SIZE(mvneta_statistics); i++) memcpy(data + i * ETH_GSTRING_LEN, mvneta_statistics[i].name, ETH_GSTRING_LEN); - data += ETH_GSTRING_LEN * ARRAY_SIZE(mvneta_statistics); - page_pool_ethtool_stats_get_strings(data); + if (!pp->bm_priv) { + data += ETH_GSTRING_LEN * ARRAY_SIZE(mvneta_statistics); + page_pool_ethtool_stats_get_strings(data); + } } } @@ -4915,8 +4918,10 @@ static void mvneta_ethtool_pp_stats(struct mvneta_port *pp, u64 *data) struct page_pool_stats stats = {}; int i; - for (i = 0; i < rxq_number; i++) - page_pool_get_stats(pp->rxqs[i].page_pool, &stats); + for (i = 0; i < rxq_number; i++) { + if (pp->rxqs[i].page_pool) + page_pool_get_stats(pp->rxqs[i].page_pool, &stats); + } page_pool_ethtool_stats_get(data, &stats); } @@ -4932,14 +4937,21 @@ static void mvneta_ethtool_get_stats(struct net_device *dev, for (i = 0; i < ARRAY_SIZE(mvneta_statistics); i++) *data++ = pp->ethtool_stats[i]; - mvneta_ethtool_pp_stats(pp, data); + if (!pp->bm_priv) + mvneta_ethtool_pp_stats(pp, data); } static int mvneta_ethtool_get_sset_count(struct net_device *dev, int sset) { - if (sset == ETH_SS_STATS) - return ARRAY_SIZE(mvneta_statistics) + - page_pool_ethtool_stats_get_count(); + if (sset == ETH_SS_STATS) { + int count = ARRAY_SIZE(mvneta_statistics); + struct mvneta_port *pp = netdev_priv(dev); + + if (!pp->bm_priv) + count += page_pool_ethtool_stats_get_count(); + + return count; + } return -EOPNOTSUPP; } -- cgit From c0a2a1b0d631fc460d830f52d06211838874d655 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sun, 12 Nov 2023 22:16:32 -0500 Subject: ppp: limit MRU to 64K ppp_sync_ioctl allows setting device MRU, but does not sanity check this input. Limit to a sane upper bound of 64KB. No implementation I could find generates larger than 64KB frames. RFC 2823 mentions an upper bound of PPP over SDL of 64KB based on the 16-bit length field. Other protocols will be smaller, such as PPPoE (9KB jumbo frame) and PPPoA (18190 maximum CPCS-SDU size, RFC 2364). PPTP and L2TP encapsulate in IP. 
Syzbot managed to trigger alloc warning in __alloc_pages: if (WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp)) WARNING: CPU: 1 PID: 37 at mm/page_alloc.c:4544 __alloc_pages+0x3ab/0x4a0 mm/page_alloc.c:4544 __alloc_skb+0x12b/0x330 net/core/skbuff.c:651 __netdev_alloc_skb+0x72/0x3f0 net/core/skbuff.c:715 netdev_alloc_skb include/linux/skbuff.h:3225 [inline] dev_alloc_skb include/linux/skbuff.h:3238 [inline] ppp_sync_input drivers/net/ppp/ppp_synctty.c:669 [inline] ppp_sync_receive+0xff/0x680 drivers/net/ppp/ppp_synctty.c:334 tty_ldisc_receive_buf+0x14c/0x180 drivers/tty/tty_buffer.c:390 tty_port_default_receive_buf+0x70/0xb0 drivers/tty/tty_port.c:37 receive_buf drivers/tty/tty_buffer.c:444 [inline] flush_to_ldisc+0x261/0x780 drivers/tty/tty_buffer.c:494 process_one_work+0x884/0x15c0 kernel/workqueue.c:2630 With call ioctl$PPPIOCSMRU1(r1, 0x40047452, &(0x7f0000000100)=0x5e6417a8) Similar code exists in other drivers that implement ppp_channel_ops ioctl PPPIOCSMRU. Those might also be in scope. Notably excluded from this are pppol2tp_ioctl and pppoe_ioctl. This code goes back to the start of git history. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+6177e1f90d92583bcc58@syzkaller.appspotmail.com Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ppp/ppp_synctty.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ppp/ppp_synctty.c b/drivers/net/ppp/ppp_synctty.c index ea261a628786..52d05ce4a281 100644 --- a/drivers/net/ppp/ppp_synctty.c +++ b/drivers/net/ppp/ppp_synctty.c @@ -453,6 +453,10 @@ ppp_sync_ioctl(struct ppp_channel *chan, unsigned int cmd, unsigned long arg) case PPPIOCSMRU: if (get_user(val, (int __user *) argp)) break; + if (val > U16_MAX) { + err = -EINVAL; + break; + } if (val < PPP_MRU) val = PPP_MRU; ap->mru = val; -- cgit From 47d970204054f859f35a2237baa75c2d84fcf436 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 25 Sep 2023 17:54:13 +0200 Subject: xen/events: fix delayed eoi list handling When delaying eoi handling of events, the related elements are queued into the percpu lateeoi list. In case the list isn't empty, the elements should be sorted by the time when eoi handling is to happen. Unfortunately a new element will never be queued at the start of the list, even if it has a handling time lower than all other list elements. Fix that by handling that case the same way as for an empty list. 
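As a stand-alone illustration of the ordering rule described above, the sketch below shows a sorted-by-deadline insert that treats "new earliest deadline" exactly like the empty-list case. It is plain userspace C with invented types and names (not the Xen event-channel code or its list primitives):

#include <stdio.h>

struct item {
    unsigned long deadline;            /* stands in for the per-irq eoi time */
    struct item *next;
};

/* Keep the list sorted by ascending deadline.  The bug class addressed by
 * the patch is forgetting that a new item may belong at the very head of a
 * non-empty list. */
static void insert_sorted(struct item **head, struct item *new)
{
    struct item **pos = head;

    while (*pos && (*pos)->deadline <= new->deadline)
        pos = &(*pos)->next;

    new->next = *pos;                  /* empty-list and new-head cases land here too */
    *pos = new;
}

int main(void)
{
    struct item a = { .deadline = 30 }, b = { .deadline = 10 };
    struct item *head = NULL;

    insert_sorted(&head, &a);
    insert_sorted(&head, &b);          /* earlier deadline: must become the new head */

    for (struct item *i = head; i; i = i->next)
        printf("%lu\n", i->deadline);  /* prints 10 then 30 */
    return 0;
}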
Fixes: e99502f76271 ("xen/events: defer eoi in case of excessive number of events") Reported-by: Jan Beulich Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Signed-off-by: Juergen Gross --- drivers/xen/events/events_base.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 237e6b884f72..cd33a418344a 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -604,7 +604,9 @@ static void lateeoi_list_add(struct irq_info *info) spin_lock_irqsave(&eoi->eoi_list_lock, flags); - if (list_empty(&eoi->eoi_list)) { + elem = list_first_entry_or_null(&eoi->eoi_list, struct irq_info, + eoi_list); + if (!elem || info->eoi_time < elem->eoi_time) { list_add(&info->eoi_list, &eoi->eoi_list); mod_delayed_work_on(info->eoi_cpu, system_wq, &eoi->delayed, delay); -- cgit From f96c6c588ca81255566a5168e51c9cbbe7b86def Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 26 Sep 2023 14:29:54 +0200 Subject: xen/events: remove unused functions There are no users of xen_irq_from_pirq() and xen_set_irq_pending(). Remove those functions. Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Signed-off-by: Juergen Gross --- drivers/xen/events/events_base.c | 30 ------------------------------ include/xen/events.h | 4 ---- 2 files changed, 34 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index cd33a418344a..c5d86128eb73 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -1169,29 +1169,6 @@ out: return rc; } -int xen_irq_from_pirq(unsigned pirq) -{ - int irq; - - struct irq_info *info; - - mutex_lock(&irq_mapping_update_lock); - - list_for_each_entry(info, &xen_irq_list_head, list) { - if (info->type != IRQT_PIRQ) - continue; - irq = info->irq; - if (info->u.pirq.pirq == pirq) - goto out; - } - irq = -1; -out: - mutex_unlock(&irq_mapping_update_lock); - - return irq; -} - - int xen_pirq_from_irq(unsigned irq) { return pirq_from_irq(irq); @@ -2031,13 +2008,6 @@ void xen_clear_irq_pending(int irq) event_handler_exit(info); } EXPORT_SYMBOL(xen_clear_irq_pending); -void xen_set_irq_pending(int irq) -{ - evtchn_port_t evtchn = evtchn_from_irq(irq); - - if (VALID_EVTCHN(evtchn)) - set_evtchn(evtchn); -} bool xen_test_irq_pending(int irq) { diff --git a/include/xen/events.h b/include/xen/events.h index 23932b0673dc..a129cafa80ed 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -88,7 +88,6 @@ void xen_irq_resume(void); /* Clear an irq's pending state, in preparation for polling on it */ void xen_clear_irq_pending(int irq); -void xen_set_irq_pending(int irq); bool xen_test_irq_pending(int irq); /* Poll waiting for an irq to become pending. In the usual case, the @@ -122,9 +121,6 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, /* De-allocates the above mentioned physical interrupt. */ int xen_destroy_irq(int irq); -/* Return irq from pirq */ -int xen_irq_from_pirq(unsigned pirq); - /* Return the pirq allocated to the irq. 
*/ int xen_pirq_from_irq(unsigned irq); -- cgit From b0077e269f6c152e807fdac90b58caf012cdbaab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Nov 2023 11:52:31 +0800 Subject: blk-mq: make sure active queue usage is held for bio_integrity_prep() blk_integrity_unregister() can come if queue usage counter isn't held for one bio with integrity prepared, so this request may be completed with calling profile->complete_fn, then kernel panic. Another constraint is that bio_integrity_prep() needs to be called before bio merge. Fix the issue by: - call bio_integrity_prep() with one queue usage counter grabbed reliably - call bio_integrity_prep() before bio merge Fixes: 900e080752025f00 ("block: move queue enter logic into blk_mq_submit_bio()") Reported-by: Yi Zhang Cc: Christoph Hellwig Signed-off-by: Ming Lei Tested-by: Yi Zhang Link: https://lore.kernel.org/r/20231113035231.2708053-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 75 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index e2d11183f62e..900c1be1fee1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2858,11 +2858,8 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, }; struct request *rq; - if (unlikely(bio_queue_enter(bio))) - return NULL; - if (blk_mq_attempt_bio_merge(q, bio, nsegs)) - goto queue_exit; + return NULL; rq_qos_throttle(q, bio); @@ -2878,35 +2875,23 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); -queue_exit: - blk_queue_exit(q); return NULL; } -static inline struct request *blk_mq_get_cached_request(struct request_queue *q, - struct blk_plug *plug, struct bio **bio, unsigned int nsegs) +/* return true if this @rq can be used for @bio */ +static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug, + struct bio *bio) { - struct request *rq; - enum hctx_type type, hctx_type; + enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf); + enum hctx_type hctx_type = rq->mq_hctx->type; - if (!plug) - return NULL; - rq = rq_list_peek(&plug->cached_rq); - if (!rq || rq->q != q) - return NULL; + WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq); - if (blk_mq_attempt_bio_merge(q, *bio, nsegs)) { - *bio = NULL; - return NULL; - } - - type = blk_mq_get_hctx_type((*bio)->bi_opf); - hctx_type = rq->mq_hctx->type; if (type != hctx_type && !(type == HCTX_TYPE_READ && hctx_type == HCTX_TYPE_DEFAULT)) - return NULL; - if (op_is_flush(rq->cmd_flags) != op_is_flush((*bio)->bi_opf)) - return NULL; + return false; + if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf)) + return false; /* * If any qos ->throttle() end up blocking, we will have flushed the @@ -2914,12 +2899,12 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q, * before we throttle. 
*/ plug->cached_rq = rq_list_next(rq); - rq_qos_throttle(q, *bio); + rq_qos_throttle(rq->q, bio); blk_mq_rq_time_init(rq, 0); - rq->cmd_flags = (*bio)->bi_opf; + rq->cmd_flags = bio->bi_opf; INIT_LIST_HEAD(&rq->queuelist); - return rq; + return true; } static void bio_set_ioprio(struct bio *bio) @@ -2949,7 +2934,7 @@ void blk_mq_submit_bio(struct bio *bio) struct blk_plug *plug = blk_mq_plug(bio); const int is_sync = op_is_sync(bio->bi_opf); struct blk_mq_hw_ctx *hctx; - struct request *rq; + struct request *rq = NULL; unsigned int nr_segs = 1; blk_status_t ret; @@ -2960,20 +2945,36 @@ void blk_mq_submit_bio(struct bio *bio) return; } - if (!bio_integrity_prep(bio)) - return; - bio_set_ioprio(bio); - rq = blk_mq_get_cached_request(q, plug, &bio, nr_segs); - if (!rq) { - if (!bio) + if (plug) { + rq = rq_list_peek(&plug->cached_rq); + if (rq && rq->q != q) + rq = NULL; + } + if (rq) { + if (!bio_integrity_prep(bio)) return; - rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); - if (unlikely(!rq)) + if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) return; + if (blk_mq_can_use_cached_rq(rq, plug, bio)) + goto done; + percpu_ref_get(&q->q_usage_counter); + } else { + if (unlikely(bio_queue_enter(bio))) + return; + if (!bio_integrity_prep(bio)) + goto fail; + } + + rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); + if (unlikely(!rq)) { +fail: + blk_queue_exit(q); + return; } +done: trace_block_getrq(bio); rq_qos_track(q, rq, bio); -- cgit From d02ef87db9d6137fc2a98231b92f24ead4f7966d Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Mon, 13 Nov 2023 16:40:29 +0000 Subject: ALSA: hda: cs35l56: Enable low-power hibernation mode on i2c This can now be re-enabled as the sequence to reliably wake the device has been implemented in the shared ASoC code. This has a functional dependency on commit 3df761bdbc8b ("ASoC: cs35l56: Wake transactions need to be issued twice") To protect against this, enabling hibernation is conditional on CS35L56_WAKE_HOLD_TIME_US being defined, which indicates that the new hibernation sequences are available. Signed-off-by: Simon Trimmer Signed-off-by: Richard Fitzgerald Link: https://lore.kernel.org/r/20231113164029.1156669-1-rf@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l56_hda_i2c.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/pci/hda/cs35l56_hda_i2c.c b/sound/pci/hda/cs35l56_hda_i2c.c index 757a4d193e0f..a9ef6d86de83 100644 --- a/sound/pci/hda/cs35l56_hda_i2c.c +++ b/sound/pci/hda/cs35l56_hda_i2c.c @@ -21,6 +21,10 @@ static int cs35l56_hda_i2c_probe(struct i2c_client *clt) return -ENOMEM; cs35l56->base.dev = &clt->dev; + +#ifdef CS35L56_WAKE_HOLD_TIME_US + cs35l56->base.can_hibernate = true; +#endif cs35l56->base.regmap = devm_regmap_init_i2c(clt, &cs35l56_regmap_i2c); if (IS_ERR(cs35l56->base.regmap)) { ret = PTR_ERR(cs35l56->base.regmap); -- cgit From 58c09cad1754c56cb000ef07477e8781e3fad4d3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 13 Nov 2023 08:52:24 -0800 Subject: drm/ci: make github dependabot happy again The drm CI scripts for gitlab have a requirements file that makes the github 'dependabot' worry about a few of the required tooling versions. It wants to update the pip requirements from 23.2.1 to 23.3: "When installing a package from a Mercurial VCS URL, e.g. pip install hg+..., with pip prior to v23.3, the specified Mercurial revision could be used to inject arbitrary configuration options to the hg clone call (e.g. --config). 
Controlling the Mercurial configuration can modify how and which repository is installed. This vulnerability does not affect users who aren't installing from Mercurial" and upgrade the urllib3 requirements from 2.0.4 to 2.0.7 due to two issues: "urllib3's request body not stripped after redirect from 303 status changes request method to GET" "`Cookie` HTTP header isn't stripped on cross-origin redirects" The file also ends up not having a newline at the end, that my editor ends up wanting to fix automatically. Link: https://github.com/dependabot Tested-by: Helen Koike Signed-off-by: Linus Torvalds --- drivers/gpu/drm/ci/xfails/requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/ci/xfails/requirements.txt b/drivers/gpu/drm/ci/xfails/requirements.txt index d8856d1581fd..e9994c9db799 100644 --- a/drivers/gpu/drm/ci/xfails/requirements.txt +++ b/drivers/gpu/drm/ci/xfails/requirements.txt @@ -5,7 +5,7 @@ termcolor==2.3.0 certifi==2023.7.22 charset-normalizer==3.2.0 idna==3.4 -pip==23.2.1 +pip==23.3 python-gitlab==3.15.0 requests==2.31.0 requests-toolbelt==1.0.0 @@ -13,5 +13,5 @@ ruamel.yaml==0.17.32 ruamel.yaml.clib==0.2.7 setuptools==68.0.0 tenacity==8.2.3 -urllib3==2.0.4 -wheel==0.41.1 \ No newline at end of file +urllib3==2.0.7 +wheel==0.41.1 -- cgit From 382561d16854a747e6df71034da08d20d6013dfe Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Sun, 12 Nov 2023 18:32:45 -0800 Subject: i2c: ocores: Move system PM hooks to the NOIRQ phase When an I2C device contains a wake IRQ subordinate to a regmap-irq chip, the regmap-irq code must be able to perform I2C transactions during suspend_device_irqs() and resume_device_irqs(). Therefore, the bus must be suspended/resumed during the NOIRQ phase. Signed-off-by: Samuel Holland Acked-by: Peter Korsgaard Reviewed-by: Andi Shyti Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-ocores.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c index 041a76f71a49..e106af83cef4 100644 --- a/drivers/i2c/busses/i2c-ocores.c +++ b/drivers/i2c/busses/i2c-ocores.c @@ -771,8 +771,8 @@ static int ocores_i2c_resume(struct device *dev) return ocores_init(dev, i2c); } -static DEFINE_SIMPLE_DEV_PM_OPS(ocores_i2c_pm, - ocores_i2c_suspend, ocores_i2c_resume); +static DEFINE_NOIRQ_DEV_PM_OPS(ocores_i2c_pm, + ocores_i2c_suspend, ocores_i2c_resume); static struct platform_driver ocores_i2c_driver = { .probe = ocores_i2c_probe, -- cgit From 7a1aba89ac54ccf6cad23a91a34c0ab24b1d7997 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kubalewski Date: Fri, 13 Oct 2023 12:25:10 +0200 Subject: ice: dpll: fix initial lock status of dpll When dpll device is registered and dpll subsystem performs notify of a new device, the lock state value provided to dpll subsystem equals 0 which is invalid value for the `enum dpll_lock_status`. Provide correct value by obtaining it from firmware before registering the dpll device. 
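The underlying pitfall is ordinary C initialization: when an enum's first valid member is 1, a zero-initialized field holds a value outside the enum until something fills it in, which is what registering the device before the first state update amounted to. A minimal stand-alone sketch (the enum, struct and names below are invented for illustration and are not the real uapi dpll definitions):

#include <stdio.h>

enum lock_status {                     /* hypothetical: 0 is deliberately not a member */
    LOCK_STATUS_UNLOCKED = 1,
    LOCK_STATUS_LOCKED = 2,
};

struct dev_state {
    enum lock_status lock;             /* stays 0 until explicitly updated */
};

int main(void)
{
    struct dev_state s = { 0 };        /* "registered" before the first state update */

    printf("lock=%d valid=%d\n", s.lock, s.lock >= LOCK_STATUS_UNLOCKED);
    /* prints lock=0 valid=0: the reported state is not a legal enum value */
    return 0;
}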
Fixes: d7999f5ea64b ("ice: implement dpll interface to control cgu") Signed-off-by: Aleksandr Loktionov Signed-off-by: Arkadiusz Kubalewski Tested-by: Sunitha Mekala (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_dpll.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c index 835c419ccc74..607f534055b6 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.c +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c @@ -1756,6 +1756,7 @@ ice_dpll_init_dpll(struct ice_pf *pf, struct ice_dpll *d, bool cgu, } d->pf = pf; if (cgu) { + ice_dpll_update_state(pf, d, true); ret = dpll_device_register(d->dpll, type, &ice_dpll_ops, d); if (ret) { dpll_device_put(d->dpll); @@ -1796,8 +1797,6 @@ static int ice_dpll_init_worker(struct ice_pf *pf) struct ice_dplls *d = &pf->dplls; struct kthread_worker *kworker; - ice_dpll_update_state(pf, &d->eec, true); - ice_dpll_update_state(pf, &d->pps, true); kthread_init_delayed_work(&d->work, ice_dpll_periodic_work); kworker = kthread_create_worker(0, "ice-dplls-%s", dev_name(ice_pf_to_dev(pf))); -- cgit From 4a4027f25dc3f39c2aafb3bf8926125c5378c9dc Mon Sep 17 00:00:00 2001 From: Arkadiusz Kubalewski Date: Tue, 31 Oct 2023 18:06:54 +0100 Subject: ice: dpll: fix check for dpll input priority range Supported priority value for input pins may differ with regard of NIC firmware version. E810T NICs with 3.20/4.00 FW versions would accept priority range 0-31, where firmware 4.10+ would support the range 0-9 and extra value of 255. Remove the in-range check as the driver has no information on supported values from the running firmware, let firmware decide if given value is correct and return extack error if the value is not supported. 
Fixes: d7999f5ea64b ("ice: implement dpll interface to control cgu") Reviewed-by: Przemek Kitszel Reviewed-by: Jacob Keller Signed-off-by: Arkadiusz Kubalewski Tested-by: Sunitha Mekala (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_dpll.c | 6 ------ drivers/net/ethernet/intel/ice/ice_dpll.h | 1 - 2 files changed, 7 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c index 607f534055b6..831ba6683962 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.c +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c @@ -815,12 +815,6 @@ ice_dpll_input_prio_set(const struct dpll_pin *pin, void *pin_priv, struct ice_pf *pf = d->pf; int ret; - if (prio > ICE_DPLL_PRIO_MAX) { - NL_SET_ERR_MSG_FMT(extack, "prio out of supported range 0-%d", - ICE_DPLL_PRIO_MAX); - return -EINVAL; - } - mutex_lock(&pf->dplls.lock); ret = ice_dpll_hw_input_prio_set(pf, d, p, prio, extack); mutex_unlock(&pf->dplls.lock); diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.h b/drivers/net/ethernet/intel/ice/ice_dpll.h index bb32b6d88373..93172e93995b 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.h +++ b/drivers/net/ethernet/intel/ice/ice_dpll.h @@ -6,7 +6,6 @@ #include "ice.h" -#define ICE_DPLL_PRIO_MAX 0xF #define ICE_DPLL_RCLK_NUM_MAX 4 /** ice_dpll_pin - store info about pins -- cgit From 6db5f2cd9ebb12e930a82c01714a6589576cd50f Mon Sep 17 00:00:00 2001 From: Arkadiusz Kubalewski Date: Tue, 31 Oct 2023 18:08:00 +0100 Subject: ice: dpll: fix output pin capabilities The dpll output pins which are used to feed clock signal of PHY and MAC circuits cannot be disconnected, those integrated circuits require clock signal for operation. By stopping assignment of DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE pin capability, prevent the user from invoking the state set callback on those pins, setting the state on those pins already returns error, as firmware doesn't allow the change of their state. 
Fixes: d7999f5ea64b ("ice: implement dpll interface to control cgu") Fixes: 8a3a565ff210 ("ice: add admin commands to access cgu configuration") Reviewed-by: Andrii Staikov Signed-off-by: Arkadiusz Kubalewski Tested-by: Sunitha Mekala (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_dpll.c | 12 ++++--- drivers/net/ethernet/intel/ice/ice_ptp_hw.c | 54 +++++++++++++++++++++++++++++ drivers/net/ethernet/intel/ice/ice_ptp_hw.h | 2 ++ 3 files changed, 64 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c index 831ba6683962..86b180cb32a0 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.c +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c @@ -1823,6 +1823,7 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf, int num_pins, i, ret = -EINVAL; struct ice_hw *hw = &pf->hw; struct ice_dpll_pin *pins; + unsigned long caps; u8 freq_supp_num; bool input; @@ -1842,6 +1843,7 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf, } for (i = 0; i < num_pins; i++) { + caps = 0; pins[i].idx = i; pins[i].prop.board_label = ice_cgu_get_pin_name(hw, i, input); pins[i].prop.type = ice_cgu_get_pin_type(hw, i, input); @@ -1854,8 +1856,8 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf, &dp->input_prio[i]); if (ret) return ret; - pins[i].prop.capabilities |= - DPLL_PIN_CAPABILITIES_PRIORITY_CAN_CHANGE; + caps |= (DPLL_PIN_CAPABILITIES_PRIORITY_CAN_CHANGE | + DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE); pins[i].prop.phase_range.min = pf->dplls.input_phase_adj_max; pins[i].prop.phase_range.max = @@ -1865,9 +1867,11 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf, pf->dplls.output_phase_adj_max; pins[i].prop.phase_range.max = -pf->dplls.output_phase_adj_max; + ret = ice_cgu_get_output_pin_state_caps(hw, i, &caps); + if (ret) + return ret; } - pins[i].prop.capabilities |= - DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE; + pins[i].prop.capabilities = caps; ret = ice_dpll_pin_state_update(pf, &pins[i], pin_type, NULL); if (ret) return ret; diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c index 6d573908de7a..a00b55e14aac 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c @@ -3961,3 +3961,57 @@ int ice_get_cgu_rclk_pin_info(struct ice_hw *hw, u8 *base_idx, u8 *pin_num) return ret; } + +/** + * ice_cgu_get_output_pin_state_caps - get output pin state capabilities + * @hw: pointer to the hw struct + * @pin_id: id of a pin + * @caps: capabilities to modify + * + * Return: + * * 0 - success, state capabilities were modified + * * negative - failure, capabilities were not modified + */ +int ice_cgu_get_output_pin_state_caps(struct ice_hw *hw, u8 pin_id, + unsigned long *caps) +{ + bool can_change = true; + + switch (hw->device_id) { + case ICE_DEV_ID_E810C_SFP: + if (pin_id == ZL_OUT2 || pin_id == ZL_OUT3) + can_change = false; + break; + case ICE_DEV_ID_E810C_QSFP: + if (pin_id == ZL_OUT2 || pin_id == ZL_OUT3 || pin_id == ZL_OUT4) + can_change = false; + break; + case ICE_DEV_ID_E823L_10G_BASE_T: + case ICE_DEV_ID_E823L_1GBE: + case ICE_DEV_ID_E823L_BACKPLANE: + case ICE_DEV_ID_E823L_QSFP: + case ICE_DEV_ID_E823L_SFP: + case ICE_DEV_ID_E823C_10G_BASE_T: + case ICE_DEV_ID_E823C_BACKPLANE: + case ICE_DEV_ID_E823C_QSFP: + case ICE_DEV_ID_E823C_SFP: + case ICE_DEV_ID_E823C_SGMII: + if (hw->cgu_part_number == + ICE_AQC_GET_LINK_TOPO_NODE_NR_ZL30632_80032 && + pin_id == ZL_OUT2) + can_change = false; + else 
if (hw->cgu_part_number == + ICE_AQC_GET_LINK_TOPO_NODE_NR_SI5383_5384 && + pin_id == SI_OUT1) + can_change = false; + break; + default: + return -EINVAL; + } + if (can_change) + *caps |= DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE; + else + *caps &= ~DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE; + + return 0; +} diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_hw.h b/drivers/net/ethernet/intel/ice/ice_ptp_hw.h index 36aeeef99ec0..cf76701566c7 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp_hw.h +++ b/drivers/net/ethernet/intel/ice/ice_ptp_hw.h @@ -282,6 +282,8 @@ int ice_get_cgu_state(struct ice_hw *hw, u8 dpll_idx, int ice_get_cgu_rclk_pin_info(struct ice_hw *hw, u8 *base_idx, u8 *pin_num); void ice_ptp_init_phy_model(struct ice_hw *hw); +int ice_cgu_get_output_pin_state_caps(struct ice_hw *hw, u8 pin_id, + unsigned long *caps); #define PFTSYN_SEM_BYTES 4 -- cgit From a778616e4cc2d5e3a253c7d8959aafa5218fc5e4 Mon Sep 17 00:00:00 2001 From: Dan Nowlin Date: Tue, 7 Nov 2023 12:32:27 -0500 Subject: ice: fix DDP package download for packages without signature segment Commit 3cbdb0343022 ("ice: Add support for E830 DDP package segment") incorrectly removed support for package download for packages without a signature segment. These packages include the signature buffer inline in the configurations buffers, and not in a signature segment. Fix package download by providing download support for both packages with (ice_download_pkg_with_sig_seg()) and without signature segment (ice_download_pkg_without_sig_seg()). Fixes: 3cbdb0343022 ("ice: Add support for E830 DDP package segment") Reported-by: Maciej Fijalkowski Closes: https://lore.kernel.org/netdev/ZUT50a94kk2pMGKb@boxer/ Tested-by: Maciej Fijalkowski Reviewed-by: Wojciech Drewek Reviewed-by: Jacob Keller Signed-off-by: Dan Nowlin Signed-off-by: Paul Greenwalt Reviewed-by: Simon Horman Tested-by: Arpana Arland (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_ddp.c | 103 ++++++++++++++++++++++++++++++- 1 file changed, 100 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ddp.c b/drivers/net/ethernet/intel/ice/ice_ddp.c index cfb1580f5850..8b7504a9df31 100644 --- a/drivers/net/ethernet/intel/ice/ice_ddp.c +++ b/drivers/net/ethernet/intel/ice/ice_ddp.c @@ -1479,14 +1479,14 @@ ice_post_dwnld_pkg_actions(struct ice_hw *hw) } /** - * ice_download_pkg + * ice_download_pkg_with_sig_seg * @hw: pointer to the hardware structure * @pkg_hdr: pointer to package header * * Handles the download of a complete package. */ static enum ice_ddp_state -ice_download_pkg(struct ice_hw *hw, struct ice_pkg_hdr *pkg_hdr) +ice_download_pkg_with_sig_seg(struct ice_hw *hw, struct ice_pkg_hdr *pkg_hdr) { enum ice_aq_err aq_err = hw->adminq.sq_last_status; enum ice_ddp_state state = ICE_DDP_PKG_ERR; @@ -1519,6 +1519,103 @@ ice_download_pkg(struct ice_hw *hw, struct ice_pkg_hdr *pkg_hdr) state = ice_post_dwnld_pkg_actions(hw); ice_release_global_cfg_lock(hw); + + return state; +} + +/** + * ice_dwnld_cfg_bufs + * @hw: pointer to the hardware structure + * @bufs: pointer to an array of buffers + * @count: the number of buffers in the array + * + * Obtains global config lock and downloads the package configuration buffers + * to the firmware. 
+ */ +static enum ice_ddp_state +ice_dwnld_cfg_bufs(struct ice_hw *hw, struct ice_buf *bufs, u32 count) +{ + enum ice_ddp_state state; + struct ice_buf_hdr *bh; + int status; + + if (!bufs || !count) + return ICE_DDP_PKG_ERR; + + /* If the first buffer's first section has its metadata bit set + * then there are no buffers to be downloaded, and the operation is + * considered a success. + */ + bh = (struct ice_buf_hdr *)bufs; + if (le32_to_cpu(bh->section_entry[0].type) & ICE_METADATA_BUF) + return ICE_DDP_PKG_SUCCESS; + + status = ice_acquire_global_cfg_lock(hw, ICE_RES_WRITE); + if (status) { + if (status == -EALREADY) + return ICE_DDP_PKG_ALREADY_LOADED; + return ice_map_aq_err_to_ddp_state(hw->adminq.sq_last_status); + } + + state = ice_dwnld_cfg_bufs_no_lock(hw, bufs, 0, count, true); + if (!state) + state = ice_post_dwnld_pkg_actions(hw); + + ice_release_global_cfg_lock(hw); + + return state; +} + +/** + * ice_download_pkg_without_sig_seg + * @hw: pointer to the hardware structure + * @ice_seg: pointer to the segment of the package to be downloaded + * + * Handles the download of a complete package without signature segment. + */ +static enum ice_ddp_state +ice_download_pkg_without_sig_seg(struct ice_hw *hw, struct ice_seg *ice_seg) +{ + struct ice_buf_table *ice_buf_tbl; + + ice_debug(hw, ICE_DBG_PKG, "Segment format version: %d.%d.%d.%d\n", + ice_seg->hdr.seg_format_ver.major, + ice_seg->hdr.seg_format_ver.minor, + ice_seg->hdr.seg_format_ver.update, + ice_seg->hdr.seg_format_ver.draft); + + ice_debug(hw, ICE_DBG_PKG, "Seg: type 0x%X, size %d, name %s\n", + le32_to_cpu(ice_seg->hdr.seg_type), + le32_to_cpu(ice_seg->hdr.seg_size), ice_seg->hdr.seg_id); + + ice_buf_tbl = ice_find_buf_table(ice_seg); + + ice_debug(hw, ICE_DBG_PKG, "Seg buf count: %d\n", + le32_to_cpu(ice_buf_tbl->buf_count)); + + return ice_dwnld_cfg_bufs(hw, ice_buf_tbl->buf_array, + le32_to_cpu(ice_buf_tbl->buf_count)); +} + +/** + * ice_download_pkg + * @hw: pointer to the hardware structure + * @pkg_hdr: pointer to package header + * @ice_seg: pointer to the segment of the package to be downloaded + * + * Handles the download of a complete package. + */ +static enum ice_ddp_state +ice_download_pkg(struct ice_hw *hw, struct ice_pkg_hdr *pkg_hdr, + struct ice_seg *ice_seg) +{ + enum ice_ddp_state state; + + if (hw->pkg_has_signing_seg) + state = ice_download_pkg_with_sig_seg(hw, pkg_hdr); + else + state = ice_download_pkg_without_sig_seg(hw, ice_seg); + ice_post_pkg_dwnld_vlan_mode_cfg(hw); return state; @@ -2083,7 +2180,7 @@ enum ice_ddp_state ice_init_pkg(struct ice_hw *hw, u8 *buf, u32 len) /* initialize package hints and then download package */ ice_init_pkg_hints(hw, seg); - state = ice_download_pkg(hw, pkg); + state = ice_download_pkg(hw, pkg, seg); if (state == ICE_DDP_PKG_ALREADY_LOADED) { ice_debug(hw, ICE_DBG_INIT, "package previously loaded - no work.\n"); -- cgit From ff31ba19d732efb9aca3633935d71085e68d5076 Mon Sep 17 00:00:00 2001 From: Anastasia Belova Date: Mon, 13 Nov 2023 17:52:32 +0300 Subject: cifs: spnego: add ';' in HOST_KEY_LEN "host=" should start with ';' (as in cifs_get_spnego_key) So its length should be 6. Found by Linux Verification Center (linuxtesting.org) with SVACE. 
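A quick stand-alone check of the constant itself: the key written into the description is ";host=", so the length constant must cover the whole prefix, i.e. strlen(";host=") == 6; the old value of 5 only covered "host=" and missed the leading ';'. The snippet below is illustrative userspace C (the description string is simplified and is not the actual cifs upcall format):

#include <stdio.h>
#include <string.h>

int main(void)
{
    const char *desc = "ver=2;host=fileserver;sec=krb5";   /* simplified example string */
    const char *key = ";host=";
    const char *hit = strstr(desc, key);

    /* skipping the full 6-byte prefix lands exactly on the value */
    printf("len=%zu value=%s\n", strlen(key), hit + strlen(key));
    /* prints: len=6 value=fileserver;sec=krb5 */
    return 0;
}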
Reviewed-by: Paulo Alcantara (SUSE) Fixes: 7c9c3760b3a5 ("[CIFS] add constants for string lengths of keynames in SPNEGO upcall string") Signed-off-by: Anastasia Belova Co-developed-by: Ekaterina Esina Signed-off-by: Ekaterina Esina Signed-off-by: Steve French --- fs/smb/client/cifs_spnego.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c index 6f3285f1dfee..af7849e5974f 100644 --- a/fs/smb/client/cifs_spnego.c +++ b/fs/smb/client/cifs_spnego.c @@ -64,8 +64,8 @@ struct key_type cifs_spnego_key_type = { * strlen(";sec=ntlmsspi") */ #define MAX_MECH_STR_LEN 13 -/* strlen of "host=" */ -#define HOST_KEY_LEN 5 +/* strlen of ";host=" */ +#define HOST_KEY_LEN 6 /* strlen of ";ip4=" or ";ip6=" */ #define IP_KEY_LEN 5 -- cgit From 181724fc72486dec2bec8803459be05b5162aaa8 Mon Sep 17 00:00:00 2001 From: Ekaterina Esina Date: Mon, 13 Nov 2023 19:42:41 +0300 Subject: cifs: fix check of rc in function generate_smb3signingkey Remove the extra check after the condition and add a check after generating the key for encryption. The check is needed to return a non-zero rc before it is overwritten while generating the key for decryption. Found by Linux Verification Center (linuxtesting.org) with SVACE. Reviewed-by: Paulo Alcantara (SUSE) Fixes: d70e9fa55884 ("cifs: try opening channels after mounting") Signed-off-by: Ekaterina Esina Co-developed-by: Anastasia Belova Signed-off-by: Anastasia Belova Signed-off-by: Steve French --- fs/smb/client/smb2transport.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c index 84ea67301303..5a3ca62d2f07 100644 --- a/fs/smb/client/smb2transport.c +++ b/fs/smb/client/smb2transport.c @@ -458,6 +458,8 @@ generate_smb3signingkey(struct cifs_ses *ses, ptriplet->encryption.context, ses->smb3encryptionkey, SMB3_ENC_DEC_KEY_SIZE); + if (rc) + return rc; rc = generate_key(ses, ptriplet->decryption.label, ptriplet->decryption.context, ses->smb3decryptionkey, @@ -466,9 +468,6 @@ generate_smb3signingkey(struct cifs_ses *ses, return rc; } - if (rc) - return rc; - #ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS cifs_dbg(VFS, "%s: dumping generated AES session keys\n", __func__); /* -- cgit From 48d584b7f90f48d2623fb01743b099efbf0d36c2 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Mon, 6 Nov 2023 10:34:01 +0800 Subject: bcachefs: make bch2_target_to_text_sb static bch2_target_to_text_sb() is not used outside disk_groups.c, so make it static. fs/bcachefs/disk_groups.c:583:6: warning: no previous prototype for ‘bch2_target_to_text_sb’.
Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=7144 Signed-off-by: Jiapeng Chong Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index d613695abf9f..1f334124055b 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -580,7 +580,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) } } -void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) +static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) { struct target t = target_decode(v); -- cgit From c4f1f80a0e8d829ce4e29ca52cb7f74b22f67454 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Nov 2023 12:30:19 -0500 Subject: bcachefs: Use correct fgf_t type as function argument This quiets a sparse complaint. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-pagecache.c | 2 +- fs/bcachefs/fs-io-pagecache.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index 8bd9bcdd27f7..ff664fd0d8ef 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -13,7 +13,7 @@ int bch2_filemap_get_contig_folios_d(struct address_space *mapping, loff_t start, u64 end, - int fgp_flags, gfp_t gfp, + fgf_t fgp_flags, gfp_t gfp, folios *fs) { struct folio *f; diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h index a2222ad586e9..27f712ae37a6 100644 --- a/fs/bcachefs/fs-io-pagecache.h +++ b/fs/bcachefs/fs-io-pagecache.h @@ -7,7 +7,7 @@ typedef DARRAY(struct folio *) folios; int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t, - u64, int, gfp_t, folios *); + u64, fgf_t, gfp_t, folios *); int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t); /* -- cgit From 1b8bc556280d3f4970407480e6a5ff49efe5601b Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 6 Nov 2023 16:27:02 -0600 Subject: bcachefs: Use DECLARE_FLEX_ARRAY() helper and fix multiple -Warray-bounds warnings Transform zero-length array `s` into a proper flexible-array member in `struct snapshot_table` via the DECLARE_FLEX_ARRAY() helper; and fix tons of the following -Warray-bounds warnings: fs/bcachefs/snapshot.h:36:21: warning: array subscript is outside array bounds of 'struct snapshot_t[0]' [-Warray-bounds=] fs/bcachefs/snapshot.h:36:21: warning: array subscript is outside array bounds of 'struct snapshot_t[0]' [-Warray-bounds=] fs/bcachefs/snapshot.c:135:70: warning: array subscript is outside array bounds of 'struct snapshot_t[0]' [-Warray-bounds=] fs/bcachefs/snapshot.h:36:21: warning: array subscript is outside array bounds of 'struct snapshot_t[0]' [-Warray-bounds=] fs/bcachefs/snapshot.h:36:21: warning: array subscript is outside array bounds of 'struct snapshot_t[0]' [-Warray-bounds=] fs/bcachefs/snapshot.h:36:21: warning: array subscript is outside array bounds of 'struct snapshot_t[0]' [-Warray-bounds=] This helps with the ongoing efforts to globally enable -Warray-bounds. Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Kent Overstreet --- fs/bcachefs/subvolume_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index 86833445af20..2d2e66a4e468 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -20,7 +20,7 @@ struct snapshot_t { }; struct snapshot_table { - struct snapshot_t s[0]; + DECLARE_FLEX_ARRAY(struct snapshot_t, s); }; typedef struct { -- cgit From 274c2f8fd27158d15524abe63c3df6fb96707dd3 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 6 Nov 2023 15:40:22 -0600 Subject: bcachefs: Fix multiple -Warray-bounds warnings Transform zero-length array `entries` into a proper flexible-array member in `struct journal_seq_blacklist_table`; and fix the following -Warray-bounds warnings: fs/bcachefs/journal_seq_blacklist.c:148:26: warning: array subscript idx is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:150:30: warning: array subscript idx is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:154:27: warning: array subscript idx is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:176:27: warning: array subscript i is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:177:27: warning: array subscript i is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:297:34: warning: array subscript i is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:298:34: warning: array subscript i is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:300:31: warning: array subscript i is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] This results in no differences in binary output. This helps with the ongoing efforts to globally enable -Warray-bounds. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 9cb8684959ee..403aa3389fcc 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -617,7 +617,7 @@ struct journal_seq_blacklist_table { u64 start; u64 end; bool dirty; - } entries[0]; + } entries[]; }; struct journal_keys { -- cgit From 03cc1e67a243cbb2c85d5fd84f369449f94d4269 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 Nov 2023 10:30:22 -0500 Subject: bcachefs: Fix null ptr deref in bch2_backpointer_get_node() bch2_btree_iter_peek_node() can return a NULL ptr (when the tree is shorter than the search depth); handle this with an early return. 
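The bug class is worth spelling out: IS_ERR() does not treat NULL as an error, so a pointer that can legitimately be NULL must be tested with IS_ERR_OR_NULL() before any dereference. Below is a stand-alone sketch using simplified userspace stand-ins for the kernel's ERR_PTR() helpers (not the real include/linux/err.h definitions):

#include <stdio.h>

/* simplified stand-ins, close in spirit to the kernel helpers */
#define MAX_ERRNO 4095
#define ERR_PTR(err) ((void *)(long)(err))
#define IS_ERR(p) ((unsigned long)(p) >= (unsigned long)-MAX_ERRNO)
#define IS_ERR_OR_NULL(p) (!(p) || IS_ERR(p))

int main(void)
{
    void *not_found = NULL;            /* e.g. the tree is shorter than the search depth */
    void *failed = ERR_PTR(-12);       /* an encoded error such as -ENOMEM */

    /* IS_ERR() alone lets the NULL case fall through to a later dereference */
    printf("IS_ERR(NULL)=%d IS_ERR_OR_NULL(NULL)=%d IS_ERR(err)=%d\n",
           IS_ERR(not_found), IS_ERR_OR_NULL(not_found), IS_ERR(failed));
    /* prints: IS_ERR(NULL)=0 IS_ERR_OR_NULL(NULL)=1 IS_ERR(err)=1 */
    return 0;
}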
Signed-off-by: Kent Overstreet Reported-by: Dan Carpenter Fixes: https://lore.kernel.org/linux-bcachefs/5fc3c28b-c232-4ec7-b0ac-4ef220ddf976@moroto.mountain/T/ Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index ef02c9bb0354..23c0834a97a4 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -313,17 +313,17 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, bp.level - 1, 0); b = bch2_btree_iter_peek_node(iter); - if (IS_ERR(b)) + if (IS_ERR_OR_NULL(b)) goto err; BUG_ON(b->c.level != bp.level - 1); - if (b && extent_matches_bp(c, bp.btree_id, bp.level, - bkey_i_to_s_c(&b->key), - bucket, bp)) + if (extent_matches_bp(c, bp.btree_id, bp.level, + bkey_i_to_s_c(&b->key), + bucket, bp)) return b; - if (b && btree_node_will_make_reachable(b)) { + if (btree_node_will_make_reachable(b)) { b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); } else { backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key)); -- cgit From 4d6128dca6d940015fe2aa383ec1a0eeb9632f08 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Nov 2023 11:59:05 -0500 Subject: bcachefs: Guard against insufficient devices to create stripes We can't create stripes if we don't have enough devices - this manifested as an integer underflow bug later. Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 875f7c5a6fca..2a77de18c004 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1373,6 +1373,15 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, h->nr_active_devs++; rcu_read_unlock(); + + /* + * If we only have redundancy + 1 devices, we're better off with just + * replication: + */ + if (h->nr_active_devs < h->redundancy + 2) + bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?", + h->nr_active_devs, h->redundancy + 2); + list_add(&h->list, &c->ec_stripe_head_list); return h; } @@ -1424,6 +1433,11 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark); found: + if (!IS_ERR_OR_NULL(h) && + h->nr_active_devs < h->redundancy + 2) { + mutex_unlock(&h->lock); + h = NULL; + } mutex_unlock(&c->ec_stripe_head_lock); return h; } @@ -1681,8 +1695,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, int ret; h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark); - if (!h) - bch_err(c, "no stripe head"); if (IS_ERR_OR_NULL(h)) return h; -- cgit From 1bd5bcc9f5eef968ed021d72b14a157be7abdb49 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 Nov 2023 21:44:14 -0500 Subject: bcachefs: Split out btree_key_cache_types.h More consistent organization. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache_types.h | 30 ++++++++++++++++++++++++++++++ fs/bcachefs/btree_types.h | 27 +-------------------------- 2 files changed, 31 insertions(+), 26 deletions(-) create mode 100644 fs/bcachefs/btree_key_cache_types.h diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h new file mode 100644 index 000000000000..0f967808d766 --- /dev/null +++ b/fs/bcachefs/btree_key_cache_types.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H +#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H + +struct btree_key_cache_freelist { + struct bkey_cached *objs[16]; + unsigned nr; +}; + +struct btree_key_cache { + struct mutex lock; + struct rhashtable table; + bool table_init_done; + struct list_head freed_pcpu; + struct list_head freed_nonpcpu; + struct shrinker *shrink; + unsigned shrink_iter; + struct btree_key_cache_freelist __percpu *pcpu_freed; + + atomic_long_t nr_freed; + atomic_long_t nr_keys; + atomic_long_t nr_dirty; +}; + +struct bkey_cached_key { + u32 btree_id; + struct bpos pos; +} __packed __aligned(4); + +#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 941841a0c5bf..be5d6027e796 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -5,7 +5,7 @@ #include #include -//#include "bkey_methods.h" +#include "btree_key_cache_types.h" #include "buckets_types.h" #include "darray.h" #include "errcode.h" @@ -312,31 +312,6 @@ struct btree_iter { #endif }; -struct btree_key_cache_freelist { - struct bkey_cached *objs[16]; - unsigned nr; -}; - -struct btree_key_cache { - struct mutex lock; - struct rhashtable table; - bool table_init_done; - struct list_head freed_pcpu; - struct list_head freed_nonpcpu; - struct shrinker *shrink; - unsigned shrink_iter; - struct btree_key_cache_freelist __percpu *pcpu_freed; - - atomic_long_t nr_freed; - atomic_long_t nr_keys; - atomic_long_t nr_dirty; -}; - -struct bkey_cached_key { - u32 btree_id; - struct bpos pos; -} __packed __aligned(4); - #define BKEY_CACHED_ACCESSED 0 #define BKEY_CACHED_DIRTY 1 -- cgit From c65c13f0eac61218c9ee4635c05661c0b9760e58 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Nov 2023 09:53:14 -0500 Subject: bcachefs: Run btree key cache shrinker less aggressively The btree key cache maintains lists of items that have been freed, but can't yet be reclaimed because a bch2_trans_relock() call might find them - we're waiting for SRCU readers to release. Previously, we wouldn't count these items against the number we're attempting to scan for, which would mean we'd evict more live key cache entries - doing quite a bit of potentially unecessary work. With recent work to make sure we don't hold SRCU locks for too long, it should be safe to count all the items on the freelists against number to scan - even if we can't reclaim them yet, we will be able to soon. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 23 +++++++++++++++++++---- fs/bcachefs/btree_key_cache_types.h | 4 ++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 9b78f78a75b5..b3305a04d808 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -89,10 +89,13 @@ static void bkey_cached_free(struct btree_key_cache *bc, ck->btree_trans_barrier_seq = start_poll_synchronize_srcu(&c->btree_trans_barrier); - if (ck->c.lock.readers) + if (ck->c.lock.readers) { list_move_tail(&ck->list, &bc->freed_pcpu); - else + bc->nr_freed_pcpu++; + } else { list_move_tail(&ck->list, &bc->freed_nonpcpu); + bc->nr_freed_nonpcpu++; + } atomic_long_inc(&bc->nr_freed); kfree(ck->k); @@ -109,6 +112,8 @@ static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc, { struct bkey_cached *pos; + bc->nr_freed_nonpcpu++; + list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) { if (ULONG_CMP_GE(ck->btree_trans_barrier_seq, pos->btree_trans_barrier_seq)) { @@ -158,6 +163,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, #else mutex_lock(&bc->lock); list_move_tail(&ck->list, &bc->freed_nonpcpu); + bc->nr_freed_nonpcpu++; mutex_unlock(&bc->lock); #endif } else { @@ -217,6 +223,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, f->nr < ARRAY_SIZE(f->objs) / 2) { ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); list_del_init(&ck->list); + bc->nr_freed_nonpcpu--; f->objs[f->nr++] = ck; } @@ -229,6 +236,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, if (!list_empty(&bc->freed_nonpcpu)) { ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); list_del_init(&ck->list); + bc->nr_freed_nonpcpu--; } mutex_unlock(&bc->lock); #endif @@ -850,6 +858,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, * Newest freed entries are at the end of the list - once we hit one * that's too new to be freed, we can bail out: */ + scanned += bc->nr_freed_nonpcpu; + list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) { if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, ck->btree_trans_barrier_seq)) @@ -859,13 +869,15 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); - scanned++; freed++; + bc->nr_freed_nonpcpu--; } if (scanned >= nr) goto out; + scanned += bc->nr_freed_pcpu; + list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) { if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, ck->btree_trans_barrier_seq)) @@ -875,8 +887,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); - scanned++; freed++; + bc->nr_freed_pcpu--; } if (scanned >= nr) @@ -982,6 +994,9 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) } #endif + BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu); + BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu); + list_splice(&bc->freed_pcpu, &items); list_splice(&bc->freed_nonpcpu, &items); diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h index 0f967808d766..290e4e57df5b 100644 --- a/fs/bcachefs/btree_key_cache_types.h +++ b/fs/bcachefs/btree_key_cache_types.h @@ -11,8 +11,12 @@ struct btree_key_cache { struct mutex 
lock; struct rhashtable table; bool table_init_done; + struct list_head freed_pcpu; + size_t nr_freed_pcpu; struct list_head freed_nonpcpu; + size_t nr_freed_nonpcpu; + struct shrinker *shrink; unsigned shrink_iter; struct btree_key_cache_freelist __percpu *pcpu_freed; -- cgit From 3b8c4507779691984e31e64e0b80abb03cc02d0d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Nov 2023 19:49:47 -0500 Subject: bcachefs: btree_trans->write_locked As prep work for the next patch to fix a key cache reclaim issue, we need to start tracking whether we're currently holding write locks - so that we can release and retake the before calling into memory reclaim. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 85 +++++++++++++++++++++++----------------- fs/bcachefs/btree_types.h | 1 + 2 files changed, 50 insertions(+), 36 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index decad7b66c59..02491f7bb831 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -78,6 +78,53 @@ inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, bch2_btree_init_next(trans, b); } +static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) +{ + while (--i >= trans->updates) { + if (same_leaf_as_prev(trans, i)) + continue; + + bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); + } + + trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); +} + +static inline int bch2_trans_lock_write(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + EBUG_ON(trans->write_locked); + + trans_for_each_update(trans, i) { + if (same_leaf_as_prev(trans, i)) + continue; + + if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) + return trans_lock_write_fail(trans, i); + + if (!i->cached) + bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); + } + + trans->write_locked = true; + return 0; +} + +static inline void bch2_trans_unlock_write(struct btree_trans *trans) +{ + if (likely(trans->write_locked)) { + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) + bch2_btree_node_unlock_write_inlined(trans, i->path, + insert_l(i)->b); + trans->write_locked = false; + } +} + /* Inserting into a given leaf node (last stage of insert): */ /* Handle overwrites and do insert, for non extents: */ @@ -732,37 +779,6 @@ revert_fs_usage: return ret; } -static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) -{ - while (--i >= trans->updates) { - if (same_leaf_as_prev(trans, i)) - continue; - - bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); - } - - trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); -} - -static inline int trans_lock_write(struct btree_trans *trans) -{ - struct btree_insert_entry *i; - - trans_for_each_update(trans, i) { - if (same_leaf_as_prev(trans, i)) - continue; - - if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) - return trans_lock_write_fail(trans, i); - - if (!i->cached) - bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); - } - - return 0; -} - static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) { struct btree_insert_entry *i; @@ -838,7 +854,7 @@ static inline 
int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags if (unlikely(ret)) return ret; - ret = trans_lock_write(trans); + ret = bch2_trans_lock_write(trans); if (unlikely(ret)) return ret; @@ -847,10 +863,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags if (!ret && unlikely(trans->journal_replay_not_finished)) bch2_drop_overwrites_from_journal(trans); - trans_for_each_update(trans, i) - if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_unlock_write_inlined(trans, i->path, - insert_l(i)->b); + bch2_trans_unlock_write(trans); if (!ret && trans->journal_pin) bch2_journal_pin_add(&c->journal, trans->journal_res.seq, diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index be5d6027e796..f3669fa68591 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -409,6 +409,7 @@ struct btree_trans { bool journal_transaction_names:1; bool journal_replay_not_finished:1; bool notrace_relock_fail:1; + bool write_locked:1; enum bch_errcode restarted:16; u32 restart_count; unsigned long last_begin_ip; -- cgit From 09b0283ee23a02094a43a9b93146d1060c58fc3a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 5 Nov 2023 15:28:44 -0500 Subject: bcachefs: Make sure to drop/retake btree locks before reclaim We really don't want to be invoking memory reclaim with btree locks held: even aside from (solvable, but tricky) recursion issues, it can cause painful to diagnose performance edge cases. This fixes a recently reported issue in btree_key_can_insert_cached(). Signed-off-by: Kent Overstreet Reported-by: Mateusz Guzik Fixes: https://lore.kernel.org/linux-bcachefs/CAGudoHEsb_hGRMeWeXh+UF6po0qQuuq_NKSEo+s1sEb6bDLjpA@mail.gmail.com/T/ --- fs/bcachefs/btree_trans_commit.c | 48 +++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 02491f7bb831..55a120eb8692 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -368,6 +368,45 @@ static inline int btree_key_can_insert(struct btree_trans *trans, return 0; } +noinline static int +btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, + struct btree_path *path, unsigned new_u64s) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + struct bkey_cached *ck = (void *) path->l[0].b; + struct bkey_i *new_k; + int ret; + + bch2_trans_unlock_write(trans); + bch2_trans_unlock(trans); + + new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); + if (!new_k) { + bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_id_str(path->btree_id), new_u64s); + return -BCH_ERR_ENOMEM_btree_key_cache_insert; + } + + ret = bch2_trans_relock(trans) ?: + bch2_trans_lock_write(trans); + if (unlikely(ret)) { + kfree(new_k); + return ret; + } + + memcpy(new_k, ck->k, ck->u64s * sizeof(u64)); + + trans_for_each_update(trans, i) + if (i->old_v == &ck->k->v) + i->old_v = &new_k->v; + + kfree(ck->k); + ck->u64s = new_u64s; + ck->k = new_k; + return 0; +} + static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags, struct btree_path *path, unsigned u64s) { @@ -394,12 +433,9 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags return 0; new_u64s = roundup_pow_of_two(u64s); - new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); - if (!new_k) { - bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", - 
bch2_btree_id_str(path->btree_id), new_u64s); - return -BCH_ERR_ENOMEM_btree_key_cache_insert; - } + new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT); + if (unlikely(!new_k)) + return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s); trans_for_each_update(trans, i) if (i->old_v == &ck->k->v) -- cgit From 701ff57eb3d7c86c9a53de959e0c48fa8ca446d4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 3 Nov 2023 18:38:35 -0400 Subject: bcachefs: Check for nonce offset inconsistency in data_update path We've rarely been seeing a nonce offset inconsistency that doesn't show up in tests: this adds some extra verification code to the data update path that prints out more relevant info when it occurs. Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 0771a6d880bf..5ed66202c226 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -239,6 +239,34 @@ restart_drop_extra_replicas: next_pos = insert->k.p; + /* + * Check for nonce offset inconsistency: + * This is debug code - we've been seeing this bug rarely, and + * it's been hard to reproduce, so this should give us some more + * information when it does occur: + */ + struct printbuf err = PRINTBUF; + int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err); + printbuf_exit(&err); + + if (invalid) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "about to insert invalid key in data update path"); + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); + prt_str(&buf, "\nk: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); + + bch2_fatal_error(c); + goto out; + } + ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, bkey_start_pos(&insert->k)) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, -- cgit From 4b3812d90b2c93723adf4b6ce99240d301f7d5f9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 13 Nov 2023 20:49:39 -0800 Subject: Revert "ptp: Fixes a null pointer dereference in ptp_ioctl" This reverts commit 8a4f030dbced6fc255cbe67b2d0a129947e18493. Richard says: The test itself is harmless, but keeping it will make people think, "oh this pointer can be invalid." In fact the core stack ensures that ioctl() can't be invoked after release(), otherwise Bad Stuff happens. Fixes: 8a4f030dbced ("ptp: Fixes a null pointer dereference in ptp_ioctl") Link: https://lore.kernel.org/all/ZVAf_qdRfDAQYUt-@hoboy.vegasvil.org/ Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_chardev.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index e95a6ed130ef..3f7a74788802 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -176,8 +176,6 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, int enable, err = 0; tsevq = pccontext->private_clkdata; - if (!tsevq) - return -EINVAL; switch (cmd) { -- cgit From 73bde5a3294853947252cd9092a3517c7cb0cd2d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Nov 2023 17:48:59 +0000 Subject: ptp: annotate data-race around q->head and q->tail As I was working on a syzbot report, I found that KCSAN would probably complain that reading q->head or q->tail without barriers could lead to invalid results. 
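For context, queue_cnt() computes the number of queued timestamps from the producer and consumer indices, and the read side does so locklessly while writers update the indices under the queue spinlock. A minimal userspace analogue of that pattern, using C11 relaxed atomics where the kernel uses READ_ONCE()/WRITE_ONCE() (the queue size and all names below are assumptions for illustration, not the driver's definitions):

    #include <stdatomic.h>
    #include <stdio.h>

    #define MAX_TS 128   /* stand-in for PTP_MAX_TIMESTAMPS; value assumed */

    struct ts_queue {
        _Atomic int head;   /* next slot the consumer will read */
        _Atomic int tail;   /* next slot the producer will fill */
    };

    /* Lockless count: whole-value loads, so no load tearing. */
    static int queue_cnt(struct ts_queue *q)
    {
        int cnt = atomic_load_explicit(&q->tail, memory_order_relaxed) -
                  atomic_load_explicit(&q->head, memory_order_relaxed);

        return cnt < 0 ? MAX_TS + cnt : cnt;
    }

    /* Producer side; in the driver this runs under the queue spinlock,
     * so the atomics only guard against store tearing, not ordering. */
    static void advance_tail(struct ts_queue *q)
    {
        int tail = atomic_load_explicit(&q->tail, memory_order_relaxed);

        atomic_store_explicit(&q->tail, (tail + 1) % MAX_TS, memory_order_relaxed);
    }

    int main(void)
    {
        struct ts_queue q = { 0, 0 };

        advance_tail(&q);
        advance_tail(&q);
        printf("queued events: %d\n", queue_cnt(&q));   /* 2 */
        return 0;
    }

The point of the annotations is only tearing avoidance; ordering on the writer side still comes from the spinlock.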
Add corresponding READ_ONCE() and WRITE_ONCE() to avoid load-store tearing. Fixes: d94ba80ebbea ("ptp: Added a brand new class driver for ptp clocks.") Signed-off-by: Eric Dumazet Acked-by: Richard Cochran Link: https://lore.kernel.org/r/20231109174859.3995880-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_chardev.c | 3 ++- drivers/ptp/ptp_clock.c | 5 +++-- drivers/ptp/ptp_private.h | 8 ++++++-- drivers/ptp/ptp_sysfs.c | 3 ++- 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 3f7a74788802..7513018c9f9a 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -572,7 +572,8 @@ ssize_t ptp_read(struct posix_clock_context *pccontext, uint rdflags, for (i = 0; i < cnt; i++) { event[i] = queue->buf[queue->head]; - queue->head = (queue->head + 1) % PTP_MAX_TIMESTAMPS; + /* Paired with READ_ONCE() in queue_cnt() */ + WRITE_ONCE(queue->head, (queue->head + 1) % PTP_MAX_TIMESTAMPS); } spin_unlock_irqrestore(&queue->lock, flags); diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index 3134568af622..15b804ba4868 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -57,10 +57,11 @@ static void enqueue_external_timestamp(struct timestamp_event_queue *queue, dst->t.sec = seconds; dst->t.nsec = remainder; + /* Both WRITE_ONCE() are paired with READ_ONCE() in queue_cnt() */ if (!queue_free(queue)) - queue->head = (queue->head + 1) % PTP_MAX_TIMESTAMPS; + WRITE_ONCE(queue->head, (queue->head + 1) % PTP_MAX_TIMESTAMPS); - queue->tail = (queue->tail + 1) % PTP_MAX_TIMESTAMPS; + WRITE_ONCE(queue->tail, (queue->tail + 1) % PTP_MAX_TIMESTAMPS); spin_unlock_irqrestore(&queue->lock, flags); } diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h index 35fde0a05746..45f9002a5dca 100644 --- a/drivers/ptp/ptp_private.h +++ b/drivers/ptp/ptp_private.h @@ -85,9 +85,13 @@ struct ptp_vclock { * that a writer might concurrently increment the tail does not * matter, since the queue remains nonempty nonetheless. */ -static inline int queue_cnt(struct timestamp_event_queue *q) +static inline int queue_cnt(const struct timestamp_event_queue *q) { - int cnt = q->tail - q->head; + /* + * Paired with WRITE_ONCE() in enqueue_external_timestamp(), + * ptp_read(), extts_fifo_show(). + */ + int cnt = READ_ONCE(q->tail) - READ_ONCE(q->head); return cnt < 0 ? PTP_MAX_TIMESTAMPS + cnt : cnt; } diff --git a/drivers/ptp/ptp_sysfs.c b/drivers/ptp/ptp_sysfs.c index 7d023d9d0acb..f7a499a1bd39 100644 --- a/drivers/ptp/ptp_sysfs.c +++ b/drivers/ptp/ptp_sysfs.c @@ -94,7 +94,8 @@ static ssize_t extts_fifo_show(struct device *dev, qcnt = queue_cnt(queue); if (qcnt) { event = queue->buf[queue->head]; - queue->head = (queue->head + 1) % PTP_MAX_TIMESTAMPS; + /* Paired with READ_ONCE() in queue_cnt() */ + WRITE_ONCE(queue->head, (queue->head + 1) % PTP_MAX_TIMESTAMPS); } spin_unlock_irqrestore(&queue->lock, flags); -- cgit From 3cffa2ddc4d3fcf70cde361236f5a614f81a09b2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Nov 2023 18:01:02 +0000 Subject: bonding: stop the device in bond_setup_by_slave() Commit 9eed321cde22 ("net: lapbether: only support ethernet devices") has been able to keep syzbot away from net/lapb, until today. In the following splat [1], the issue is that a lapbether device has been created on a bonding device without members. Then adding a non ARPHRD_ETHER member forced the bonding master to change its type. 
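Conceptually, the hazard is that the device's link-layer identity changes while stacked devices still hold state derived from the old type; quiescing the device first lets those dependents detach before the type flips. A rough userspace model of the "remember up state, close, reconfigure, reopen" sequence used below (all types and helpers here are illustrative stand-ins, not the kernel's net_device API):

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative stand-ins only -- not the kernel's net_device API. */
    struct mock_dev {
        bool up;
        int  type;
    };

    static void mock_close(struct mock_dev *d)
    {
        if (d->up)
            printf("close: stacked devices get notified and detach\n");
        d->up = false;
    }

    static void mock_open(struct mock_dev *d)
    {
        d->up = true;
        printf("reopened with new type %d\n", d->type);
    }

    static void setup_by_slave(struct mock_dev *bond, const struct mock_dev *slave)
    {
        bool was_up = bond->up;

        mock_close(bond);            /* quiesce before changing identity */
        bond->type = slave->type;    /* nothing is stacked on it any more */
        if (was_up)
            mock_open(bond);
    }

    int main(void)
    {
        struct mock_dev bond  = { .up = true, .type = 1 };   /* Ethernet-like */
        struct mock_dev slave = { .up = true, .type = 512 }; /* non-Ethernet type */

        setup_by_slave(&bond, &slave);
        return 0;
    }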
The fix is to make sure we call dev_close() in bond_setup_by_slave() so that the potential linked lapbether devices (or any other devices having assumptions on the physical device) are removed. A similar bug has been addressed in commit 40baec225765 ("bonding: fix panic on non-ARPHRD_ETHER enslave failure") [1] skbuff: skb_under_panic: text:ffff800089508810 len:44 put:40 head:ffff0000c78e7c00 data:ffff0000c78e7bea tail:0x16 end:0x140 dev:bond0 kernel BUG at net/core/skbuff.c:192 ! Internal error: Oops - BUG: 00000000f2000800 [#1] PREEMPT SMP Modules linked in: CPU: 0 PID: 6007 Comm: syz-executor383 Not tainted 6.6.0-rc3-syzkaller-gbf6547d8715b #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/04/2023 pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : skb_panic net/core/skbuff.c:188 [inline] pc : skb_under_panic+0x13c/0x140 net/core/skbuff.c:202 lr : skb_panic net/core/skbuff.c:188 [inline] lr : skb_under_panic+0x13c/0x140 net/core/skbuff.c:202 sp : ffff800096a06aa0 x29: ffff800096a06ab0 x28: ffff800096a06ba0 x27: dfff800000000000 x26: ffff0000ce9b9b50 x25: 0000000000000016 x24: ffff0000c78e7bea x23: ffff0000c78e7c00 x22: 000000000000002c x21: 0000000000000140 x20: 0000000000000028 x19: ffff800089508810 x18: ffff800096a06100 x17: 0000000000000000 x16: ffff80008a629a3c x15: 0000000000000001 x14: 1fffe00036837a32 x13: 0000000000000000 x12: 0000000000000000 x11: 0000000000000201 x10: 0000000000000000 x9 : cb50b496c519aa00 x8 : cb50b496c519aa00 x7 : 0000000000000001 x6 : 0000000000000001 x5 : ffff800096a063b8 x4 : ffff80008e280f80 x3 : ffff8000805ad11c x2 : 0000000000000001 x1 : 0000000100000201 x0 : 0000000000000086 Call trace: skb_panic net/core/skbuff.c:188 [inline] skb_under_panic+0x13c/0x140 net/core/skbuff.c:202 skb_push+0xf0/0x108 net/core/skbuff.c:2446 ip6gre_header+0xbc/0x738 net/ipv6/ip6_gre.c:1384 dev_hard_header include/linux/netdevice.h:3136 [inline] lapbeth_data_transmit+0x1c4/0x298 drivers/net/wan/lapbether.c:257 lapb_data_transmit+0x8c/0xb0 net/lapb/lapb_iface.c:447 lapb_transmit_buffer+0x178/0x204 net/lapb/lapb_out.c:149 lapb_send_control+0x220/0x320 net/lapb/lapb_subr.c:251 __lapb_disconnect_request+0x9c/0x17c net/lapb/lapb_iface.c:326 lapb_device_event+0x288/0x4e0 net/lapb/lapb_iface.c:492 notifier_call_chain+0x1a4/0x510 kernel/notifier.c:93 raw_notifier_call_chain+0x3c/0x50 kernel/notifier.c:461 call_netdevice_notifiers_info net/core/dev.c:1970 [inline] call_netdevice_notifiers_extack net/core/dev.c:2008 [inline] call_netdevice_notifiers net/core/dev.c:2022 [inline] __dev_close_many+0x1b8/0x3c4 net/core/dev.c:1508 dev_close_many+0x1e0/0x470 net/core/dev.c:1559 dev_close+0x174/0x250 net/core/dev.c:1585 lapbeth_device_event+0x2e4/0x958 drivers/net/wan/lapbether.c:466 notifier_call_chain+0x1a4/0x510 kernel/notifier.c:93 raw_notifier_call_chain+0x3c/0x50 kernel/notifier.c:461 call_netdevice_notifiers_info net/core/dev.c:1970 [inline] call_netdevice_notifiers_extack net/core/dev.c:2008 [inline] call_netdevice_notifiers net/core/dev.c:2022 [inline] __dev_close_many+0x1b8/0x3c4 net/core/dev.c:1508 dev_close_many+0x1e0/0x470 net/core/dev.c:1559 dev_close+0x174/0x250 net/core/dev.c:1585 bond_enslave+0x2298/0x30cc drivers/net/bonding/bond_main.c:2332 bond_do_ioctl+0x268/0xc64 drivers/net/bonding/bond_main.c:4539 dev_ifsioc+0x754/0x9ac dev_ioctl+0x4d8/0xd34 net/core/dev_ioctl.c:786 sock_do_ioctl+0x1d4/0x2d0 net/socket.c:1217 sock_ioctl+0x4e8/0x834 net/socket.c:1322 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:871 
[inline] __se_sys_ioctl fs/ioctl.c:857 [inline] __arm64_sys_ioctl+0x14c/0x1c8 fs/ioctl.c:857 __invoke_syscall arch/arm64/kernel/syscall.c:37 [inline] invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:51 el0_svc_common+0x130/0x23c arch/arm64/kernel/syscall.c:136 do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:155 el0_svc+0x58/0x16c arch/arm64/kernel/entry-common.c:678 el0t_64_sync_handler+0x84/0xfc arch/arm64/kernel/entry-common.c:696 el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:591 Code: aa1803e6 aa1903e7 a90023f5 94785b8b (d4210000) Fixes: 872254dd6b1f ("net/bonding: Enable bonding to enslave non ARPHRD_ETHER") Reported-by: syzbot Signed-off-by: Eric Dumazet Acked-by: Jay Vosburgh Reviewed-by: Hangbin Liu Link: https://lore.kernel.org/r/20231109180102.4085183-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 51d47eda1c87..8e6cc0e133b7 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1500,6 +1500,10 @@ done: static void bond_setup_by_slave(struct net_device *bond_dev, struct net_device *slave_dev) { + bool was_up = !!(bond_dev->flags & IFF_UP); + + dev_close(bond_dev); + bond_dev->header_ops = slave_dev->header_ops; bond_dev->type = slave_dev->type; @@ -1514,6 +1518,8 @@ static void bond_setup_by_slave(struct net_device *bond_dev, bond_dev->flags &= ~(IFF_BROADCAST | IFF_MULTICAST); bond_dev->flags |= (IFF_POINTOPOINT | IFF_NOARP); } + if (was_up) + dev_open(bond_dev, NULL); } /* On bonding slaves other than the currently active slave, suppress -- cgit From 510e35fb931ffc3b100e5d5ae4595cd3beca9f1a Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 9 Nov 2023 10:03:12 +0100 Subject: net: ethernet: cortina: Fix max RX frame define Enumerator 3 is 1548 bytes according to the datasheet. Not 1542. 
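For illustration, the driver selects a hardware MAXLEN enumerator by walking a table of frame-length limits; with the corrected entry, a frame of up to 1548 bytes maps to enumerator 3 instead of spilling into the jumbo settings. A simplified standalone model of that selection (the enumerator values reflect the defines touched by this patch, but the table and helper below are illustrative, not the driver's gmac_maxlens code):

    #include <stddef.h>
    #include <stdio.h>

    struct maxlen_entry {
        int max_l3_len;   /* largest frame this setting covers */
        int val;          /* CONFIG0_MAXLEN_* enumerator */
    };

    static const struct maxlen_entry maxlens[] = {
        { 1518, 1 },
        { 1522, 2 },
        { 1536, 0 },
        { 1548, 3 },      /* previously mislisted as 1542 */
        { 9212, 4 },
        { 10236, 5 },
    };

    /* Pick the smallest enumerator whose limit covers the frame. */
    static int pick_maxlen(int frame_len)
    {
        for (size_t i = 0; i < sizeof(maxlens) / sizeof(maxlens[0]); i++)
            if (frame_len <= maxlens[i].max_l3_len)
                return maxlens[i].val;
        return -1;
    }

    int main(void)
    {
        printf("1544-byte frame -> enumerator %d\n", pick_maxlen(1544)); /* 3 */
        return 0;
    }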
Fixes: 4d5ae32f5e1e ("net: ethernet: Add a driver for Gemini gigabit ethernet") Reviewed-by: Andrew Lunn Signed-off-by: Linus Walleij Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20231109-gemini-largeframe-fix-v4-1-6e611528db08@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cortina/gemini.c | 4 ++-- drivers/net/ethernet/cortina/gemini.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index 5423fe26b4ef..562b0917316a 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -432,8 +432,8 @@ static const struct gmac_max_framelen gmac_maxlens[] = { .val = CONFIG0_MAXLEN_1536, }, { - .max_l3_len = 1542, - .val = CONFIG0_MAXLEN_1542, + .max_l3_len = 1548, + .val = CONFIG0_MAXLEN_1548, }, { .max_l3_len = 9212, diff --git a/drivers/net/ethernet/cortina/gemini.h b/drivers/net/ethernet/cortina/gemini.h index 9fdf77d5eb37..99efb1155743 100644 --- a/drivers/net/ethernet/cortina/gemini.h +++ b/drivers/net/ethernet/cortina/gemini.h @@ -787,7 +787,7 @@ union gmac_config0 { #define CONFIG0_MAXLEN_1536 0 #define CONFIG0_MAXLEN_1518 1 #define CONFIG0_MAXLEN_1522 2 -#define CONFIG0_MAXLEN_1542 3 +#define CONFIG0_MAXLEN_1548 3 #define CONFIG0_MAXLEN_9k 4 /* 9212 */ #define CONFIG0_MAXLEN_10k 5 /* 10236 */ #define CONFIG0_MAXLEN_1518__6 6 -- cgit From d4d0c5b4d279bfe3585fbd806efefd3e51c82afa Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 9 Nov 2023 10:03:13 +0100 Subject: net: ethernet: cortina: Handle large frames The Gemini ethernet controller provides hardware checksumming for frames up to 1514 bytes including ethernet headers but not FCS. If we start sending bigger frames (after first bumping up the MTU on both interfaces sending and receiving the frames), truncated packets start to appear on the target such as in this tcpdump resulting from ping -s 1474: 23:34:17.241983 14:d6:4d:a8:3c:4f (oui Unknown) > bc:ae:c5:6b:a8:3d (oui Unknown), ethertype IPv4 (0x0800), length 1514: truncated-ip - 2 bytes missing! (tos 0x0, ttl 64, id 32653, offset 0, flags [DF], proto ICMP (1), length 1502) OpenWrt.lan > Fecusia: ICMP echo request, id 1672, seq 50, length 1482 If we bypass the hardware checksumming and provide a software fallback, everything starts working fine up to the max TX MTU of 2047 bytes, for example ping -s2000 192.168.1.2: 00:44:29.587598 bc:ae:c5:6b:a8:3d (oui Unknown) > 14:d6:4d:a8:3c:4f (oui Unknown), ethertype IPv4 (0x0800), length 2042: (tos 0x0, ttl 64, id 51828, offset 0, flags [none], proto ICMP (1), length 2028) Fecusia > OpenWrt.lan: ICMP echo reply, id 1683, seq 4, length 2008 The bit enabling to bypass hardware checksum (or any of the "TSS" bits) are undocumented in the hardware reference manual. The entire hardware checksum unit appears undocumented. The conclusion that we need to use the "bypass" bit was found by trial-and-error. Since no hardware checksum will happen, we slot in a software checksum fallback. Check for the condition where we need to compute checksum on the skb with either hardware or software using == CHECKSUM_PARTIAL instead of != CHECKSUM_NONE which is an incomplete check according to . 
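As background, the software fallback boils down to the standard Internet checksum, i.e. a 16-bit one's-complement sum with carry folding. A minimal userspace version of that calculation, shown only to illustrate what skb_checksum_help() ends up arranging; it is not the kernel implementation:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* One's-complement Internet checksum over a byte buffer. */
    static uint16_t inet_checksum(const uint8_t *data, size_t len)
    {
        uint32_t sum = 0;

        while (len > 1) {
            sum += (uint32_t)data[0] << 8 | data[1];
            data += 2;
            len -= 2;
        }
        if (len)                      /* odd trailing byte */
            sum += (uint32_t)data[0] << 8;

        while (sum >> 16)             /* fold carries back in */
            sum = (sum & 0xffff) + (sum >> 16);

        return (uint16_t)~sum;
    }

    int main(void)
    {
        /* Words 0x0001 0xf203 0xf4f5 0xf6f7 sum to 0xddf2 -> checksum 0x220d. */
        const uint8_t buf[] = { 0x00, 0x01, 0xf2, 0x03, 0xf4, 0xf5, 0xf6, 0xf7 };

        printf("checksum: 0x%04x\n", inet_checksum(buf, sizeof(buf)));
        return 0;
    }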
On the D-Link DIR-685 router this fixes a bug on the conduit interface to the RTL8366RB DSA switch: as the switch needs to add space for its tag it increases the MTU on the conduit interface to 1504 and that means that when the router sends packages of 1500 bytes these get an extra 4 bytes of DSA tag and the transfer fails because of the erroneous hardware checksumming, affecting such basic functionality as the LuCI web interface. Fixes: 4d5ae32f5e1e ("net: ethernet: Add a driver for Gemini gigabit ethernet") Signed-off-by: Linus Walleij Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20231109-gemini-largeframe-fix-v4-2-6e611528db08@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cortina/gemini.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index 562b0917316a..e5e73dee13ac 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -1145,6 +1145,7 @@ static int gmac_map_tx_bufs(struct net_device *netdev, struct sk_buff *skb, dma_addr_t mapping; unsigned short mtu; void *buffer; + int ret; mtu = ETH_HLEN; mtu += netdev->mtu; @@ -1159,9 +1160,30 @@ static int gmac_map_tx_bufs(struct net_device *netdev, struct sk_buff *skb, word3 |= mtu; } - if (skb->ip_summed != CHECKSUM_NONE) { + if (skb->len >= ETH_FRAME_LEN) { + /* Hardware offloaded checksumming isn't working on frames + * bigger than 1514 bytes. A hypothesis about this is that the + * checksum buffer is only 1518 bytes, so when the frames get + * bigger they get truncated, or the last few bytes get + * overwritten by the FCS. + * + * Just use software checksumming and bypass on bigger frames. + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + ret = skb_checksum_help(skb); + if (ret) + return ret; + } + word1 |= TSS_BYPASS_BIT; + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { int tcp = 0; + /* We do not switch off the checksumming on non TCP/UDP + * frames: as is shown from tests, the checksumming engine + * is smart enough to see that a frame is not actually TCP + * or UDP and then just pass it through without any changes + * to the frame. + */ if (skb->protocol == htons(ETH_P_IP)) { word1 |= TSS_IP_CHKSUM_BIT; tcp = ip_hdr(skb)->protocol == IPPROTO_TCP; -- cgit From dc6c0bfbaa947dd7976e30e8c29b10c868b6fa42 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 9 Nov 2023 10:03:14 +0100 Subject: net: ethernet: cortina: Fix MTU max setting The RX max frame size is over 10000 for the Gemini ethernet, but the TX max frame size is actually just 2047 (0x7ff after checking the datasheet). Reflect this in what we offer to Linux, cap the MTU at the TX max frame minus ethernet headers. We delete the code disabling the hardware checksum for large MTUs as netdev->mtu can no longer be larger than netdev->max_mtu meaning the if()-clause in gmac_fix_features() is never true. 
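For reference, with the corrected TX limit the advertised MTU ceiling is simply the 0x7ff hardware frame-size mask minus the VLAN-tagged Ethernet header. A quick sketch of the arithmetic (constants restated here for illustration):

    #include <stdio.h>

    #define MTU_SIZE_BIT_MASK 0x7ff          /* max TX frame: 2047 bytes */
    #define ETH_HLEN          14
    #define VLAN_HLEN         4
    #define VLAN_ETH_HLEN     (ETH_HLEN + VLAN_HLEN)
    #define ETH_MIN_MTU       68

    int main(void)
    {
        printf("min_mtu = %d\n", ETH_MIN_MTU);
        printf("max_mtu = %d\n", MTU_SIZE_BIT_MASK - VLAN_ETH_HLEN);  /* 2029 */
        return 0;
    }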
Fixes: 4d5ae32f5e1e ("net: ethernet: Add a driver for Gemini gigabit ethernet") Reviewed-by: Andrew Lunn Signed-off-by: Linus Walleij Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20231109-gemini-largeframe-fix-v4-3-6e611528db08@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cortina/gemini.c | 17 ++++------------- drivers/net/ethernet/cortina/gemini.h | 2 +- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index e5e73dee13ac..78287cfcbf63 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -2000,15 +2000,6 @@ static int gmac_change_mtu(struct net_device *netdev, int new_mtu) return 0; } -static netdev_features_t gmac_fix_features(struct net_device *netdev, - netdev_features_t features) -{ - if (netdev->mtu + ETH_HLEN + VLAN_HLEN > MTU_SIZE_BIT_MASK) - features &= ~GMAC_OFFLOAD_FEATURES; - - return features; -} - static int gmac_set_features(struct net_device *netdev, netdev_features_t features) { @@ -2234,7 +2225,6 @@ static const struct net_device_ops gmac_351x_ops = { .ndo_set_mac_address = gmac_set_mac_address, .ndo_get_stats64 = gmac_get_stats64, .ndo_change_mtu = gmac_change_mtu, - .ndo_fix_features = gmac_fix_features, .ndo_set_features = gmac_set_features, }; @@ -2486,11 +2476,12 @@ static int gemini_ethernet_port_probe(struct platform_device *pdev) netdev->hw_features = GMAC_OFFLOAD_FEATURES; netdev->features |= GMAC_OFFLOAD_FEATURES | NETIF_F_GRO; - /* We can handle jumbo frames up to 10236 bytes so, let's accept - * payloads of 10236 bytes minus VLAN and ethernet header + /* We can receive jumbo frames up to 10236 bytes but only + * transmit 2047 bytes so, let's accept payloads of 2047 + * bytes minus VLAN and ethernet header */ netdev->min_mtu = ETH_MIN_MTU; - netdev->max_mtu = 10236 - VLAN_ETH_HLEN; + netdev->max_mtu = MTU_SIZE_BIT_MASK - VLAN_ETH_HLEN; port->freeq_refill = 0; netif_napi_add(netdev, &port->napi, gmac_napi_poll); diff --git a/drivers/net/ethernet/cortina/gemini.h b/drivers/net/ethernet/cortina/gemini.h index 99efb1155743..24bb989981f2 100644 --- a/drivers/net/ethernet/cortina/gemini.h +++ b/drivers/net/ethernet/cortina/gemini.h @@ -502,7 +502,7 @@ union gmac_txdesc_3 { #define SOF_BIT 0x80000000 #define EOF_BIT 0x40000000 #define EOFIE_BIT BIT(29) -#define MTU_SIZE_BIT_MASK 0x1fff +#define MTU_SIZE_BIT_MASK 0x7ff /* Max MTU 2047 bytes */ /* GMAC Tx Descriptor */ struct gmac_txdesc { -- cgit From 0ab0c45d8aaea5192328bfa6989673aceafc767c Mon Sep 17 00:00:00 2001 From: ChunHao Lin Date: Fri, 10 Nov 2023 01:33:59 +0800 Subject: r8169: add handling DASH when DASH is disabled For devices that support DASH, even DASH is disabled, there may still exist a default firmware that will influence device behavior. So driver needs to handle DASH for devices that support DASH, no matter the DASH status is. This patch also prepares for "fix network lost after resume on DASH systems". 
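The distinction this change introduces is between "the chip has DASH hardware" (a static property of the MAC version) and "DASH is currently enabled" (runtime firmware state): firmware start/stop must key off the former, while wake and power-down decisions key off the latter. A condensed model of that split (types and helpers below are illustrative, not the driver's code):

    #include <stdbool.h>
    #include <stdio.h>

    enum dash_type { DASH_NONE, DASH_DP, DASH_EP };

    struct nic {
        enum dash_type dash_type;   /* does this chip version have DASH? */
        bool dash_enabled;          /* has the firmware actually enabled it? */
    };

    /* Firmware handshake is needed whenever DASH hardware exists at all. */
    static bool needs_dash_firmware_handling(const struct nic *n)
    {
        return n->dash_type != DASH_NONE;
    }

    /* Power-down / wake behaviour only changes when DASH is really enabled. */
    static bool skip_power_down(const struct nic *n)
    {
        return n->dash_enabled;
    }

    int main(void)
    {
        struct nic n = { .dash_type = DASH_EP, .dash_enabled = false };

        printf("handle firmware: %d, skip power down: %d\n",
               needs_dash_firmware_handling(&n), skip_power_down(&n));
        return 0;
    }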
Fixes: ee7a1beb9759 ("r8169:call "rtl8168_driver_start" "rtl8168_driver_stop" only when hardware dash function is enabled") Cc: stable@vger.kernel.org Signed-off-by: ChunHao Lin Reviewed-by: Heiner Kallweit Link: https://lore.kernel.org/r/20231109173400.4573-2-hau@realtek.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 36 +++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 0c76c162b8a9..cfcb40d90920 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -624,6 +624,7 @@ struct rtl8169_private { unsigned supports_gmii:1; unsigned aspm_manageable:1; + unsigned dash_enabled:1; dma_addr_t counters_phys_addr; struct rtl8169_counters *counters; struct rtl8169_tc_offsets tc_offset; @@ -1253,14 +1254,26 @@ static bool r8168ep_check_dash(struct rtl8169_private *tp) return r8168ep_ocp_read(tp, 0x128) & BIT(0); } -static enum rtl_dash_type rtl_check_dash(struct rtl8169_private *tp) +static bool rtl_dash_is_enabled(struct rtl8169_private *tp) +{ + switch (tp->dash_type) { + case RTL_DASH_DP: + return r8168dp_check_dash(tp); + case RTL_DASH_EP: + return r8168ep_check_dash(tp); + default: + return false; + } +} + +static enum rtl_dash_type rtl_get_dash_type(struct rtl8169_private *tp) { switch (tp->mac_version) { case RTL_GIGA_MAC_VER_28: case RTL_GIGA_MAC_VER_31: - return r8168dp_check_dash(tp) ? RTL_DASH_DP : RTL_DASH_NONE; + return RTL_DASH_DP; case RTL_GIGA_MAC_VER_51 ... RTL_GIGA_MAC_VER_53: - return r8168ep_check_dash(tp) ? RTL_DASH_EP : RTL_DASH_NONE; + return RTL_DASH_EP; default: return RTL_DASH_NONE; } @@ -1453,7 +1466,7 @@ static void __rtl8169_set_wol(struct rtl8169_private *tp, u32 wolopts) device_set_wakeup_enable(tp_to_dev(tp), wolopts); - if (tp->dash_type == RTL_DASH_NONE) { + if (!tp->dash_enabled) { rtl_set_d3_pll_down(tp, !wolopts); tp->dev->wol_enabled = wolopts ? 
1 : 0; } @@ -2512,7 +2525,7 @@ static void rtl_wol_enable_rx(struct rtl8169_private *tp) static void rtl_prepare_power_down(struct rtl8169_private *tp) { - if (tp->dash_type != RTL_DASH_NONE) + if (tp->dash_enabled) return; if (tp->mac_version == RTL_GIGA_MAC_VER_32 || @@ -4869,7 +4882,7 @@ static int rtl8169_runtime_idle(struct device *device) { struct rtl8169_private *tp = dev_get_drvdata(device); - if (tp->dash_type != RTL_DASH_NONE) + if (tp->dash_enabled) return -EBUSY; if (!netif_running(tp->dev) || !netif_carrier_ok(tp->dev)) @@ -4895,8 +4908,7 @@ static void rtl_shutdown(struct pci_dev *pdev) /* Restore original MAC address */ rtl_rar_set(tp, tp->dev->perm_addr); - if (system_state == SYSTEM_POWER_OFF && - tp->dash_type == RTL_DASH_NONE) { + if (system_state == SYSTEM_POWER_OFF && !tp->dash_enabled) { pci_wake_from_d3(pdev, tp->saved_wolopts); pci_set_power_state(pdev, PCI_D3hot); } @@ -5254,7 +5266,8 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) rc = pci_disable_link_state(pdev, PCIE_LINK_STATE_L1); tp->aspm_manageable = !rc; - tp->dash_type = rtl_check_dash(tp); + tp->dash_type = rtl_get_dash_type(tp); + tp->dash_enabled = rtl_dash_is_enabled(tp); tp->cp_cmd = RTL_R16(tp, CPlusCmd) & CPCMD_MASK; @@ -5325,7 +5338,7 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) /* configure chip for default features */ rtl8169_set_features(dev, dev->features); - if (tp->dash_type == RTL_DASH_NONE) { + if (!tp->dash_enabled) { rtl_set_d3_pll_down(tp, true); } else { rtl_set_d3_pll_down(tp, false); @@ -5365,7 +5378,8 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) "ok" : "ko"); if (tp->dash_type != RTL_DASH_NONE) { - netdev_info(dev, "DASH enabled\n"); + netdev_info(dev, "DASH %s\n", + tp->dash_enabled ? "enabled" : "disabled"); rtl8168_driver_start(tp); } -- cgit From 868c3b95afef4883bfb66c9397482da6840b5baf Mon Sep 17 00:00:00 2001 From: ChunHao Lin Date: Fri, 10 Nov 2023 01:34:00 +0800 Subject: r8169: fix network lost after resume on DASH systems Device that support DASH may be reseted or powered off during suspend. So driver needs to handle DASH during system suspend and resume. Or DASH firmware will influence device behavior and causes network lost. Fixes: b646d90053f8 ("r8169: magic.") Cc: stable@vger.kernel.org Reviewed-by: Heiner Kallweit Signed-off-by: ChunHao Lin Link: https://lore.kernel.org/r/20231109173400.4573-3-hau@realtek.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index cfcb40d90920..b9bb1d2f0237 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -4661,10 +4661,16 @@ static void rtl8169_down(struct rtl8169_private *tp) rtl8169_cleanup(tp); rtl_disable_exit_l1(tp); rtl_prepare_power_down(tp); + + if (tp->dash_type != RTL_DASH_NONE) + rtl8168_driver_stop(tp); } static void rtl8169_up(struct rtl8169_private *tp) { + if (tp->dash_type != RTL_DASH_NONE) + rtl8168_driver_start(tp); + pci_set_master(tp->pci_dev); phy_init_hw(tp->phydev); phy_resume(tp->phydev); -- cgit From b28060db7172e6d8912d88b369123eb89e0d36b4 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 12 Nov 2023 11:12:49 +0200 Subject: ovl: fix misformatted comment Remove misleading /** prefix from a regular comment. 
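For anyone unfamiliar with the convention: a comment opening with /** is treated as kernel-doc and is expected to document the symbol that follows in kernel-doc form, while an ordinary note opens with /*. A tiny illustration (the declarations are made-up examples, not overlayfs code):

    /**
     * example_put - kernel-doc style, parsed by scripts/kernel-doc
     * @x: value to release
     *
     * Only use this opener when the block really follows kernel-doc format.
     */
    void example_put(int x);

    /*
     * Plain comment: free-form notes, never parsed as kernel-doc.
     */
    void example_note(int x);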
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311121628.byHp8tkv-lkp@intel.com/ Signed-off-by: Amir Goldstein --- fs/overlayfs/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 50a201e9cd39..c3f020ca13a8 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -978,7 +978,7 @@ int ovl_set_protattr(struct inode *inode, struct dentry *upper, return 0; } -/** +/* * Caller must hold a reference to inode to prevent it from being freed while * it is marked inuse. */ -- cgit From 37f32f52643869131ec01bb69bdf9f404f6109fb Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 12 Nov 2023 10:11:25 +0200 Subject: ovl: fix memory leak in ovl_parse_param() On failure to parse parameters in ovl_parse_param_lowerdir(), it is necessary to update ctx->nr with the correct nr before using ovl_reset_lowerdirs() to release l->name. Reported-and-tested-by: syzbot+26eedf3631650972f17c@syzkaller.appspotmail.com Fixes: c835110b588a ("ovl: remove unused code in lowerdir param parsing") Co-authored-by: Edward Adam Davis Signed-off-by: Amir Goldstein --- fs/overlayfs/params.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index ddab9ea267d1..3fe2dde1598f 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -430,7 +430,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) struct ovl_fs_context *ctx = fc->fs_private; struct ovl_fs_context_layer *l; char *dup = NULL, *iter; - ssize_t nr_lower = 0, nr = 0, nr_data = 0; + ssize_t nr_lower, nr; bool data_layer = false; /* @@ -482,6 +482,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) iter = dup; l = ctx->lower; for (nr = 0; nr < nr_lower; nr++, l++) { + ctx->nr++; memset(l, 0, sizeof(*l)); err = ovl_mount_dir(iter, &l->path); @@ -498,10 +499,10 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) goto out_put; if (data_layer) - nr_data++; + ctx->nr_data++; /* Calling strchr() again would overrun. */ - if ((nr + 1) == nr_lower) + if (ctx->nr == nr_lower) break; err = -EINVAL; @@ -511,7 +512,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) * This is a regular layer so we require that * there are no data layers. */ - if ((ctx->nr_data + nr_data) > 0) { + if (ctx->nr_data > 0) { pr_err("regular lower layers cannot follow data lower layers"); goto out_put; } @@ -524,8 +525,6 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) data_layer = true; iter++; } - ctx->nr = nr_lower; - ctx->nr_data += nr_data; kfree(dup); return 0; -- cgit From 686464514fbebb6c8de4415238319e414c3500a4 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 27 Sep 2023 08:58:05 +0200 Subject: xen/events: reduce externally visible helper functions get_evtchn_to_irq() has only one external user while irq_from_evtchn() provides the same functionality and is exported for a wider user base. Modify the only external user of get_evtchn_to_irq() to use irq_from_evtchn() instead and make get_evtchn_to_irq() static. evtchn_from_irq() and irq_from_virq() have a single external user and can easily be combined to a new helper irq_evtchn_from_virq() allowing to drop irq_from_virq() and to make evtchn_from_irq() static. 
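The new helper follows a common C idiom: return the primary value and hand the secondary one back through an output parameter, so a single lookup replaces two exported accessors. A generic standalone sketch of the pattern (the tables and names are illustrative only, not the Xen event-channel code):

    #include <stdio.h>

    #define NR_SLOTS 4

    static int slot_to_irq[NR_SLOTS] = { 12, 13, -1, 15 };
    static int irq_to_event[32]      = { [12] = 100, [13] = 101, [15] = 103 };

    /* Return the irq, and report the bound event through *event. */
    static int irq_event_from_slot(unsigned int slot, int *event)
    {
        int irq = slot_to_irq[slot];

        *event = (irq >= 0) ? irq_to_event[irq] : -1;
        return irq;
    }

    int main(void)
    {
        int event;
        int irq = irq_event_from_slot(1, &event);

        printf("irq=%d event=%d\n", irq, event);  /* irq=13 event=101 */
        return 0;
    }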
Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Signed-off-by: Juergen Gross --- drivers/xen/events/events_2l.c | 8 ++++---- drivers/xen/events/events_base.c | 13 +++++++++---- drivers/xen/events/events_internal.h | 1 - include/xen/events.h | 4 ++-- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c index b8f2f971c2f0..e3585330cf98 100644 --- a/drivers/xen/events/events_2l.c +++ b/drivers/xen/events/events_2l.c @@ -171,11 +171,11 @@ static void evtchn_2l_handle_events(unsigned cpu, struct evtchn_loop_ctrl *ctrl) int i; struct shared_info *s = HYPERVISOR_shared_info; struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + evtchn_port_t evtchn; /* Timer interrupt has highest priority. */ - irq = irq_from_virq(cpu, VIRQ_TIMER); + irq = irq_evtchn_from_virq(cpu, VIRQ_TIMER, &evtchn); if (irq != -1) { - evtchn_port_t evtchn = evtchn_from_irq(irq); word_idx = evtchn / BITS_PER_LONG; bit_idx = evtchn % BITS_PER_LONG; if (active_evtchns(cpu, s, word_idx) & (1ULL << bit_idx)) @@ -328,9 +328,9 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) for (i = 0; i < EVTCHN_2L_NR_CHANNELS; i++) { if (sync_test_bit(i, BM(sh->evtchn_pending))) { int word_idx = i / BITS_PER_EVTCHN_WORD; - printk(" %d: event %d -> irq %d%s%s%s\n", + printk(" %d: event %d -> irq %u%s%s%s\n", cpu_from_evtchn(i), i, - get_evtchn_to_irq(i), + irq_from_evtchn(i), sync_test_bit(word_idx, BM(&v->evtchn_pending_sel)) ? "" : " l2-clear", !sync_test_bit(i, BM(sh->evtchn_mask)) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index c5d86128eb73..a74ea28dcc3d 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -248,7 +248,7 @@ static int set_evtchn_to_irq(evtchn_port_t evtchn, unsigned int irq) return 0; } -int get_evtchn_to_irq(evtchn_port_t evtchn) +static int get_evtchn_to_irq(evtchn_port_t evtchn) { if (evtchn >= xen_evtchn_max_channels()) return -1; @@ -415,7 +415,7 @@ static void xen_irq_info_cleanup(struct irq_info *info) /* * Accessors for packed IRQ information. 
*/ -evtchn_port_t evtchn_from_irq(unsigned irq) +static evtchn_port_t evtchn_from_irq(unsigned int irq) { const struct irq_info *info = NULL; @@ -433,9 +433,14 @@ unsigned int irq_from_evtchn(evtchn_port_t evtchn) } EXPORT_SYMBOL_GPL(irq_from_evtchn); -int irq_from_virq(unsigned int cpu, unsigned int virq) +int irq_evtchn_from_virq(unsigned int cpu, unsigned int virq, + evtchn_port_t *evtchn) { - return per_cpu(virq_to_irq, cpu)[virq]; + int irq = per_cpu(virq_to_irq, cpu)[virq]; + + *evtchn = evtchn_from_irq(irq); + + return irq; } static enum ipi_vector ipi_from_irq(unsigned irq) diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h index 4d3398eff9cd..19ae31695edc 100644 --- a/drivers/xen/events/events_internal.h +++ b/drivers/xen/events/events_internal.h @@ -33,7 +33,6 @@ struct evtchn_ops { extern const struct evtchn_ops *evtchn_ops; -int get_evtchn_to_irq(evtchn_port_t evtchn); void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl); unsigned int cpu_from_evtchn(evtchn_port_t evtchn); diff --git a/include/xen/events.h b/include/xen/events.h index a129cafa80ed..3b07409f8032 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -100,8 +100,8 @@ void xen_poll_irq_timeout(int irq, u64 timeout); /* Determine the IRQ which is bound to an event channel */ unsigned int irq_from_evtchn(evtchn_port_t evtchn); -int irq_from_virq(unsigned int cpu, unsigned int virq); -evtchn_port_t evtchn_from_irq(unsigned irq); +int irq_evtchn_from_virq(unsigned int cpu, unsigned int virq, + evtchn_port_t *evtchn); int xen_set_callback_via(uint64_t via); int xen_evtchn_do_upcall(void); -- cgit From 3bdb0ac350fe5e6301562143e4573971dd01ae0b Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 27 Sep 2023 08:24:46 +0200 Subject: xen/events: remove some simple helpers from events_base.c The helper functions type_from_irq() and cpu_from_irq() are just one line functions used only internally. Open code them where needed. At the same time modify and rename get_evtchn_to_irq() to return a struct irq_info instead of the IRQ number. Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Signed-off-by: Juergen Gross --- drivers/xen/events/events_base.c | 97 ++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 59 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index a74ea28dcc3d..a810e8904fbf 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -248,15 +248,6 @@ static int set_evtchn_to_irq(evtchn_port_t evtchn, unsigned int irq) return 0; } -static int get_evtchn_to_irq(evtchn_port_t evtchn) -{ - if (evtchn >= xen_evtchn_max_channels()) - return -1; - if (evtchn_to_irq[EVTCHN_ROW(evtchn)] == NULL) - return -1; - return READ_ONCE(evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)]); -} - /* Get info for IRQ */ static struct irq_info *info_for_irq(unsigned irq) { @@ -274,6 +265,19 @@ static void set_info_for_irq(unsigned int irq, struct irq_info *info) irq_set_chip_data(irq, info); } +static struct irq_info *evtchn_to_info(evtchn_port_t evtchn) +{ + int irq; + + if (evtchn >= xen_evtchn_max_channels()) + return NULL; + if (evtchn_to_irq[EVTCHN_ROW(evtchn)] == NULL) + return NULL; + irq = READ_ONCE(evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)]); + + return (irq < 0) ? 
NULL : info_for_irq(irq); +} + /* Per CPU channel accounting */ static void channels_on_cpu_dec(struct irq_info *info) { @@ -429,7 +433,9 @@ static evtchn_port_t evtchn_from_irq(unsigned int irq) unsigned int irq_from_evtchn(evtchn_port_t evtchn) { - return get_evtchn_to_irq(evtchn); + struct irq_info *info = evtchn_to_info(evtchn); + + return info ? info->irq : -1; } EXPORT_SYMBOL_GPL(irq_from_evtchn); @@ -473,25 +479,11 @@ static unsigned pirq_from_irq(unsigned irq) return info->u.pirq.pirq; } -static enum xen_irq_type type_from_irq(unsigned irq) -{ - return info_for_irq(irq)->type; -} - -static unsigned cpu_from_irq(unsigned irq) -{ - return info_for_irq(irq)->cpu; -} - unsigned int cpu_from_evtchn(evtchn_port_t evtchn) { - int irq = get_evtchn_to_irq(evtchn); - unsigned ret = 0; - - if (irq != -1) - ret = cpu_from_irq(irq); + struct irq_info *info = evtchn_to_info(evtchn); - return ret; + return info ? info->cpu : 0; } static void do_mask(struct irq_info *info, u8 reason) @@ -540,13 +532,12 @@ static bool pirq_needs_eoi_flag(unsigned irq) static void bind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int cpu, bool force_affinity) { - int irq = get_evtchn_to_irq(evtchn); - struct irq_info *info = info_for_irq(irq); + struct irq_info *info = evtchn_to_info(evtchn); - BUG_ON(irq == -1); + BUG_ON(info == NULL); if (IS_ENABLED(CONFIG_SMP) && force_affinity) { - struct irq_data *data = irq_get_irq_data(irq); + struct irq_data *data = irq_get_irq_data(info->irq); irq_data_update_affinity(data, cpumask_of(cpu)); irq_data_update_effective_affinity(data, cpumask_of(cpu)); @@ -979,13 +970,13 @@ static void __unbind_from_irq(unsigned int irq) } if (VALID_EVTCHN(evtchn)) { - unsigned int cpu = cpu_from_irq(irq); + unsigned int cpu = info->cpu; struct xenbus_device *dev; if (!info->is_static) xen_evtchn_close(evtchn); - switch (type_from_irq(irq)) { + switch (info->type) { case IRQT_VIRQ: per_cpu(virq_to_irq, cpu)[virq_from_irq(irq)] = -1; break; @@ -1185,15 +1176,16 @@ static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip, { int irq; int ret; + struct irq_info *info; if (evtchn >= xen_evtchn_max_channels()) return -ENOMEM; mutex_lock(&irq_mapping_update_lock); - irq = get_evtchn_to_irq(evtchn); + info = evtchn_to_info(evtchn); - if (irq == -1) { + if (!info) { irq = xen_allocate_irq_dynamic(); if (irq < 0) goto out; @@ -1216,9 +1208,9 @@ static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip, */ bind_evtchn_to_cpu(evtchn, 0, false); } else { - struct irq_info *info = info_for_irq(irq); - if (!WARN_ON(!info || info->type != IRQT_EVTCHN)) + if (!WARN_ON(info->type != IRQT_EVTCHN)) info->refcnt++; + irq = info->irq; } out: @@ -1556,13 +1548,7 @@ EXPORT_SYMBOL_GPL(xen_set_irq_priority); int evtchn_make_refcounted(evtchn_port_t evtchn, bool is_static) { - int irq = get_evtchn_to_irq(evtchn); - struct irq_info *info; - - if (irq == -1) - return -ENOENT; - - info = info_for_irq(irq); + struct irq_info *info = evtchn_to_info(evtchn); if (!info) return -ENOENT; @@ -1578,7 +1564,6 @@ EXPORT_SYMBOL_GPL(evtchn_make_refcounted); int evtchn_get(evtchn_port_t evtchn) { - int irq; struct irq_info *info; int err = -ENOENT; @@ -1587,11 +1572,7 @@ int evtchn_get(evtchn_port_t evtchn) mutex_lock(&irq_mapping_update_lock); - irq = get_evtchn_to_irq(evtchn); - if (irq == -1) - goto done; - - info = info_for_irq(irq); + info = evtchn_to_info(evtchn); if (!info) goto done; @@ -1611,10 +1592,11 @@ EXPORT_SYMBOL_GPL(evtchn_get); void evtchn_put(evtchn_port_t evtchn) { - int 
irq = get_evtchn_to_irq(evtchn); - if (WARN_ON(irq == -1)) + struct irq_info *info = evtchn_to_info(evtchn); + + if (WARN_ON(!info)) return; - unbind_from_irq(irq); + unbind_from_irq(info->irq); } EXPORT_SYMBOL_GPL(evtchn_put); @@ -1644,12 +1626,10 @@ struct evtchn_loop_ctrl { void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl) { - int irq; - struct irq_info *info; + struct irq_info *info = evtchn_to_info(port); struct xenbus_device *dev; - irq = get_evtchn_to_irq(port); - if (irq == -1) + if (!info) return; /* @@ -1674,7 +1654,6 @@ void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl) } } - info = info_for_irq(irq); if (xchg_acquire(&info->is_active, 1)) return; @@ -1688,7 +1667,7 @@ void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl) info->eoi_time = get_jiffies_64() + event_eoi_delay; } - generic_handle_irq(irq); + generic_handle_irq(info->irq); } int xen_evtchn_do_upcall(void) @@ -1746,7 +1725,7 @@ void rebind_evtchn_irq(evtchn_port_t evtchn, int irq) mutex_lock(&irq_mapping_update_lock); /* After resume the irq<->evtchn mappings are all cleared out */ - BUG_ON(get_evtchn_to_irq(evtchn) != -1); + BUG_ON(evtchn_to_info(evtchn)); /* Expect irq to have been bound before, so there should be a proper type */ BUG_ON(info->type == IRQT_UNBOUND); -- cgit From 4b7b492615cf3017190f55444f7016812b66611d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Nov 2023 13:49:38 +0000 Subject: af_unix: fix use-after-free in unix_stream_read_actor() syzbot reported the following crash [1] After releasing unix socket lock, u->oob_skb can be changed by another thread. We must temporarily increase skb refcount to make sure this other thread will not free the skb under us. [1] BUG: KASAN: slab-use-after-free in unix_stream_read_actor+0xa7/0xc0 net/unix/af_unix.c:2866 Read of size 4 at addr ffff88801f3b9cc4 by task syz-executor107/5297 CPU: 1 PID: 5297 Comm: syz-executor107 Not tainted 6.6.0-syzkaller-15910-gb8e3a87a627b #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xd9/0x1b0 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:364 [inline] print_report+0xc4/0x620 mm/kasan/report.c:475 kasan_report+0xda/0x110 mm/kasan/report.c:588 unix_stream_read_actor+0xa7/0xc0 net/unix/af_unix.c:2866 unix_stream_recv_urg net/unix/af_unix.c:2587 [inline] unix_stream_read_generic+0x19a5/0x2480 net/unix/af_unix.c:2666 unix_stream_recvmsg+0x189/0x1b0 net/unix/af_unix.c:2903 sock_recvmsg_nosec net/socket.c:1044 [inline] sock_recvmsg+0xe2/0x170 net/socket.c:1066 ____sys_recvmsg+0x21f/0x5c0 net/socket.c:2803 ___sys_recvmsg+0x115/0x1a0 net/socket.c:2845 __sys_recvmsg+0x114/0x1e0 net/socket.c:2875 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x3f/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b RIP: 0033:0x7fc67492c559 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 51 18 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fc6748ab228 EFLAGS: 00000246 ORIG_RAX: 000000000000002f RAX: ffffffffffffffda RBX: 000000000000001c RCX: 00007fc67492c559 RDX: 0000000040010083 RSI: 0000000020000140 RDI: 0000000000000004 RBP: 00007fc6749b6348 R08: 00007fc6748ab6c0 R09: 00007fc6748ab6c0 R10: 0000000000000000 R11: 0000000000000246 R12: 00007fc6749b6340 R13: 00007fc6749b634c R14: 00007ffe9fac52a0 
R15: 00007ffe9fac5388 Allocated by task 5295: kasan_save_stack+0x33/0x50 mm/kasan/common.c:45 kasan_set_track+0x25/0x30 mm/kasan/common.c:52 __kasan_slab_alloc+0x81/0x90 mm/kasan/common.c:328 kasan_slab_alloc include/linux/kasan.h:188 [inline] slab_post_alloc_hook mm/slab.h:763 [inline] slab_alloc_node mm/slub.c:3478 [inline] kmem_cache_alloc_node+0x180/0x3c0 mm/slub.c:3523 __alloc_skb+0x287/0x330 net/core/skbuff.c:641 alloc_skb include/linux/skbuff.h:1286 [inline] alloc_skb_with_frags+0xe4/0x710 net/core/skbuff.c:6331 sock_alloc_send_pskb+0x7e4/0x970 net/core/sock.c:2780 sock_alloc_send_skb include/net/sock.h:1884 [inline] queue_oob net/unix/af_unix.c:2147 [inline] unix_stream_sendmsg+0xb5f/0x10a0 net/unix/af_unix.c:2301 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0xd5/0x180 net/socket.c:745 ____sys_sendmsg+0x6ac/0x940 net/socket.c:2584 ___sys_sendmsg+0x135/0x1d0 net/socket.c:2638 __sys_sendmsg+0x117/0x1e0 net/socket.c:2667 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x3f/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Freed by task 5295: kasan_save_stack+0x33/0x50 mm/kasan/common.c:45 kasan_set_track+0x25/0x30 mm/kasan/common.c:52 kasan_save_free_info+0x2b/0x40 mm/kasan/generic.c:522 ____kasan_slab_free mm/kasan/common.c:236 [inline] ____kasan_slab_free+0x15b/0x1b0 mm/kasan/common.c:200 kasan_slab_free include/linux/kasan.h:164 [inline] slab_free_hook mm/slub.c:1800 [inline] slab_free_freelist_hook+0x114/0x1e0 mm/slub.c:1826 slab_free mm/slub.c:3809 [inline] kmem_cache_free+0xf8/0x340 mm/slub.c:3831 kfree_skbmem+0xef/0x1b0 net/core/skbuff.c:1015 __kfree_skb net/core/skbuff.c:1073 [inline] consume_skb net/core/skbuff.c:1288 [inline] consume_skb+0xdf/0x170 net/core/skbuff.c:1282 queue_oob net/unix/af_unix.c:2178 [inline] unix_stream_sendmsg+0xd49/0x10a0 net/unix/af_unix.c:2301 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0xd5/0x180 net/socket.c:745 ____sys_sendmsg+0x6ac/0x940 net/socket.c:2584 ___sys_sendmsg+0x135/0x1d0 net/socket.c:2638 __sys_sendmsg+0x117/0x1e0 net/socket.c:2667 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x3f/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b The buggy address belongs to the object at ffff88801f3b9c80 which belongs to the cache skbuff_head_cache of size 240 The buggy address is located 68 bytes inside of freed 240-byte region [ffff88801f3b9c80, ffff88801f3b9d70) The buggy address belongs to the physical page: page:ffffea00007cee40 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1f3b9 flags: 0xfff00000000800(slab|node=0|zone=1|lastcpupid=0x7ff) page_type: 0xffffffff() raw: 00fff00000000800 ffff888142a60640 dead000000000122 0000000000000000 raw: 0000000000000000 00000000000c000c 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 0, migratetype Unmovable, gfp_mask 0x12cc0(GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY), pid 5299, tgid 5283 (syz-executor107), ts 103803840339, free_ts 103600093431 set_page_owner include/linux/page_owner.h:31 [inline] post_alloc_hook+0x2cf/0x340 mm/page_alloc.c:1537 prep_new_page mm/page_alloc.c:1544 [inline] get_page_from_freelist+0xa25/0x36c0 mm/page_alloc.c:3312 __alloc_pages+0x1d0/0x4a0 mm/page_alloc.c:4568 alloc_pages_mpol+0x258/0x5f0 mm/mempolicy.c:2133 alloc_slab_page mm/slub.c:1870 [inline] allocate_slab+0x251/0x380 mm/slub.c:2017 new_slab mm/slub.c:2070 [inline] ___slab_alloc+0x8c7/0x1580 
mm/slub.c:3223 __slab_alloc.constprop.0+0x56/0xa0 mm/slub.c:3322 __slab_alloc_node mm/slub.c:3375 [inline] slab_alloc_node mm/slub.c:3468 [inline] kmem_cache_alloc_node+0x132/0x3c0 mm/slub.c:3523 __alloc_skb+0x287/0x330 net/core/skbuff.c:641 alloc_skb include/linux/skbuff.h:1286 [inline] alloc_skb_with_frags+0xe4/0x710 net/core/skbuff.c:6331 sock_alloc_send_pskb+0x7e4/0x970 net/core/sock.c:2780 sock_alloc_send_skb include/net/sock.h:1884 [inline] queue_oob net/unix/af_unix.c:2147 [inline] unix_stream_sendmsg+0xb5f/0x10a0 net/unix/af_unix.c:2301 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0xd5/0x180 net/socket.c:745 ____sys_sendmsg+0x6ac/0x940 net/socket.c:2584 ___sys_sendmsg+0x135/0x1d0 net/socket.c:2638 __sys_sendmsg+0x117/0x1e0 net/socket.c:2667 page last free stack trace: reset_page_owner include/linux/page_owner.h:24 [inline] free_pages_prepare mm/page_alloc.c:1137 [inline] free_unref_page_prepare+0x4f8/0xa90 mm/page_alloc.c:2347 free_unref_page+0x33/0x3b0 mm/page_alloc.c:2487 __unfreeze_partials+0x21d/0x240 mm/slub.c:2655 qlink_free mm/kasan/quarantine.c:168 [inline] qlist_free_all+0x6a/0x170 mm/kasan/quarantine.c:187 kasan_quarantine_reduce+0x18e/0x1d0 mm/kasan/quarantine.c:294 __kasan_slab_alloc+0x65/0x90 mm/kasan/common.c:305 kasan_slab_alloc include/linux/kasan.h:188 [inline] slab_post_alloc_hook mm/slab.h:763 [inline] slab_alloc_node mm/slub.c:3478 [inline] slab_alloc mm/slub.c:3486 [inline] __kmem_cache_alloc_lru mm/slub.c:3493 [inline] kmem_cache_alloc+0x15d/0x380 mm/slub.c:3502 vm_area_dup+0x21/0x2f0 kernel/fork.c:500 __split_vma+0x17d/0x1070 mm/mmap.c:2365 split_vma mm/mmap.c:2437 [inline] vma_modify+0x25d/0x450 mm/mmap.c:2472 vma_modify_flags include/linux/mm.h:3271 [inline] mprotect_fixup+0x228/0xc80 mm/mprotect.c:635 do_mprotect_pkey+0x852/0xd60 mm/mprotect.c:809 __do_sys_mprotect mm/mprotect.c:830 [inline] __se_sys_mprotect mm/mprotect.c:827 [inline] __x64_sys_mprotect+0x78/0xb0 mm/mprotect.c:827 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x3f/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Memory state around the buggy address: ffff88801f3b9b80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88801f3b9c00: fb fb fb fb fb fb fc fc fc fc fc fc fc fc fc fc >ffff88801f3b9c80: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88801f3b9d00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fc fc ffff88801f3b9d80: fc fc fc fc fc fc fc fc fa fb fb fb fb fb fb fb Fixes: 876c14ad014d ("af_unix: fix holding spinlock in oob handling") Reported-and-tested-by: syzbot+7a2d546fa43e49315ed3@syzkaller.appspotmail.com Signed-off-by: Eric Dumazet Cc: Rao Shoaib Reviewed-by: Rao shoaib Link: https://lore.kernel.org/r/20231113134938.168151-1-edumazet@google.com Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 45506a95b25f..a357dc5f2404 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2581,15 +2581,16 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state) if (!(state->flags & MSG_PEEK)) WRITE_ONCE(u->oob_skb, NULL); - + else + skb_get(oob_skb); unix_state_unlock(sk); chunk = state->recv_actor(oob_skb, 0, chunk, state); - if (!(state->flags & MSG_PEEK)) { + if (!(state->flags & MSG_PEEK)) UNIXCB(oob_skb).consumed += 1; - kfree_skb(oob_skb); - } + + consume_skb(oob_skb); mutex_unlock(&u->iolock); -- cgit From 67059b61597c004e23b074547b40604222bee3a0 Mon Sep 17 00:00:00 2001 From: 
Yang Li Date: Wed, 1 Nov 2023 09:33:51 +0800 Subject: netfilter: nft_set_rbtree: Remove unused variable nft_net MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The code that uses nft_net has been removed, and the nft_pernet function is merely obtaining a reference to shared data through the net pointer. The content of the net pointer is not modified or changed, so both of them should be removed. silence the warning: net/netfilter/nft_set_rbtree.c:627:26: warning: variable ‘nft_net’ set but not used Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=7103 Signed-off-by: Yang Li Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_rbtree.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 6f1186abd47b..baa3fea4fe65 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -624,14 +624,12 @@ static void nft_rbtree_gc(struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe, *rbe_end = NULL; - struct nftables_pernet *nft_net; struct rb_node *node, *next; struct nft_trans_gc *gc; struct net *net; set = nft_set_container_of(priv); net = read_pnet(&set->net); - nft_net = nft_pernet(net); gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); if (!gc) -- cgit From a44af08e3d4d7566eeea98d7a29fe06e7b9de944 Mon Sep 17 00:00:00 2001 From: Linkui Xiao Date: Wed, 1 Nov 2023 11:20:18 +0800 Subject: netfilter: nf_conntrack_bridge: initialize err to 0 K2CI reported a problem: consume_skb(skb); return err; [nf_br_ip_fragment() error] uninitialized symbol 'err'. err is not initialized, because returning 0 is expected, initialize err to 0. Fixes: 3c171f496ef5 ("netfilter: bridge: add connection tracking system") Reported-by: k2ci Signed-off-by: Linkui Xiao Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/nf_conntrack_bridge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index b5c406a6e765..abb090f94ed2 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -37,7 +37,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, ktime_t tstamp = skb->tstamp; struct ip_frag_state state; struct iphdr *iph; - int err; + int err = 0; /* for offloaded checksums cleanup checksum before fragmentation */ if (skb->ip_summed == CHECKSUM_PARTIAL && -- cgit From c301f0981fdd3fd1ffac6836b423c4d7a8e0eb63 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 3 Nov 2023 09:42:51 +0300 Subject: netfilter: nf_tables: fix pointer math issue in nft_byteorder_eval() The problem is in nft_byteorder_eval() where we are iterating through a loop and writing to dst[0], dst[1], dst[2] and so on... On each iteration we are writing 8 bytes. But dst[] is an array of u32 so each element only has space for 4 bytes. That means that every iteration overwrites part of the previous element. I spotted this bug while reviewing commit caf3ef7468f7 ("netfilter: nf_tables: prevent OOB access in nft_byteorder_eval") which is a related issue. I think that the reason we have not detected this bug in testing is that most of time we only write one element. 
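The effect is easiest to see with plain pointer arithmetic: indexing a u32 array advances 4 bytes per element, so writing 64-bit results at dst[0], dst[1], ... through that type makes each store overlap half of the previous one, while going through a u64-typed pointer gives the intended 8-byte stride. A standalone demonstration (illustrative only):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t storage[4];                    /* properly sized, aligned backing */
        uint32_t *dst   = (uint32_t *)storage;  /* how the register area is typed */
        uint64_t *dst64 = storage;              /* the cast the fix introduces */

        /* dst[i] advances 4 bytes per step: 64-bit stores would overlap. */
        printf("u32 stride: %td bytes\n", (char *)&dst[1] - (char *)&dst[0]);

        /* dst64[i] advances 8 bytes per step: each store gets its own slot. */
        printf("u64 stride: %td bytes\n", (char *)&dst64[1] - (char *)&dst64[0]);

        return 0;
    }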
Fixes: ce1e7989d989 ("netfilter: nft_byteorder: provide 64bit le/be conversion") Signed-off-by: Dan Carpenter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++-- net/netfilter/nft_byteorder.c | 5 +++-- net/netfilter/nft_meta.c | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3bbd13ab1ecf..b157c5cafd14 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -178,9 +178,9 @@ static inline __be32 nft_reg_load_be32(const u32 *sreg) return *(__force __be32 *)sreg; } -static inline void nft_reg_store64(u32 *dreg, u64 val) +static inline void nft_reg_store64(u64 *dreg, u64 val) { - put_unaligned(val, (u64 *)dreg); + put_unaligned(val, dreg); } static inline u64 nft_reg_load64(const u32 *sreg) diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index e596d1a842f7..f6e791a68101 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -38,13 +38,14 @@ void nft_byteorder_eval(const struct nft_expr *expr, switch (priv->size) { case 8: { + u64 *dst64 = (void *)dst; u64 src64; switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 8; i++) { src64 = nft_reg_load64(&src[i]); - nft_reg_store64(&dst[i], + nft_reg_store64(&dst64[i], be64_to_cpu((__force __be64)src64)); } break; @@ -52,7 +53,7 @@ void nft_byteorder_eval(const struct nft_expr *expr, for (i = 0; i < priv->len / 8; i++) { src64 = (__force __u64) cpu_to_be64(nft_reg_load64(&src[i])); - nft_reg_store64(&dst[i], src64); + nft_reg_store64(&dst64[i], src64); } break; } diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index f7da7c43333b..ba0d3683a45d 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -63,7 +63,7 @@ nft_meta_get_eval_time(enum nft_meta_keys key, { switch (key) { case NFT_META_TIME_NS: - nft_reg_store64(dest, ktime_get_real_ns()); + nft_reg_store64((u64 *)dest, ktime_get_real_ns()); break; case NFT_META_TIME_DAY: nft_reg_store8(dest, nft_meta_weekday()); -- cgit From a7d5a955bfa854ac6b0c53aaf933394b4e6139e4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 13 Nov 2023 20:34:56 +0100 Subject: netfilter: nf_tables: bogus ENOENT when destroying element which does not exist destroy element command bogusly reports ENOENT in case a set element does not exist. ENOENT errors are skipped, however, err is still set and propagated to userspace. 
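The transcript below shows the spurious failure. The control-flow issue reduces to a loop that deliberately tolerates ENOENT yet still returns the stale error once the loop ends; returning 0 after a completed walk, and bailing out only on real errors, is the corrected shape. A minimal standalone sketch of that logic (illustrative, not the nf_tables code):

    #include <errno.h>
    #include <stdio.h>

    static int delete_one(int key)
    {
        /* Pretend odd keys don't exist and even keys delete fine. */
        return (key & 1) ? -ENOENT : 0;
    }

    static int delete_all(const int *keys, int n, int skip_enoent)
    {
        for (int i = 0; i < n; i++) {
            int err = delete_one(keys[i]);

            if (err == -ENOENT && skip_enoent)
                continue;          /* tolerated: keep going */
            if (err < 0)
                return err;        /* real failure: report it */
        }
        return 0;                  /* never leak a tolerated error */
    }

    int main(void)
    {
        int keys[] = { 2, 3, 4 };

        printf("destroy result: %d\n", delete_all(keys, 3, 1));  /* 0 */
        return 0;
    }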
# nft destroy element ip raw BLACKLIST { 1.2.3.4 } Error: Could not process rule: No such file or directory destroy element ip raw BLACKLIST { 1.2.3.4 } ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Fixes: f80a612dd77c ("netfilter: nf_tables: add support to destroy operation") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a761ee6796f6..debea1c67701 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7263,10 +7263,11 @@ static int nf_tables_delsetelem(struct sk_buff *skb, if (err < 0) { NL_SET_BAD_ATTR(extack, attr); - break; + return err; } } - return err; + + return 0; } /* -- cgit From 28628fa952fefc7f2072ce6e8016968cc452b1ba Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 13 Nov 2023 21:13:23 +0100 Subject: netfilter: ipset: fix race condition between swap/destroy and kernel side add/del/test Linkui Xiao reported that there's a race condition when ipset swap and destroy is called, which can lead to crash in add/del/test element operations. Swap then destroy are usual operations to replace a set with another one in a production system. The issue can in some cases be reproduced with the script: ipset create hash_ip1 hash:net family inet hashsize 1024 maxelem 1048576 ipset add hash_ip1 172.20.0.0/16 ipset add hash_ip1 192.168.0.0/16 iptables -A INPUT -m set --match-set hash_ip1 src -j ACCEPT while [ 1 ] do # ... Ongoing traffic... ipset create hash_ip2 hash:net family inet hashsize 1024 maxelem 1048576 ipset add hash_ip2 172.20.0.0/16 ipset swap hash_ip1 hash_ip2 ipset destroy hash_ip2 sleep 0.05 done In the race case the possible order of the operations are CPU0 CPU1 ip_set_test ipset swap hash_ip1 hash_ip2 ipset destroy hash_ip2 hash_net_kadt Swap replaces hash_ip1 with hash_ip2 and then destroy removes hash_ip2 which is the original hash_ip1. ip_set_test was called on hash_ip1 and because destroy removed it, hash_net_kadt crashes. The fix is to force ip_set_swap() to wait for all readers to finish accessing the old set pointers by calling synchronize_rcu(). The first version of the patch was written by Linkui Xiao . v2: synchronize_rcu() is moved into ip_set_swap() in order not to burden ip_set_destroy() unnecessarily when all sets are destroyed. v3: Florian Westphal pointed out that all netfilter hooks run with rcu_read_lock() held and em_ipset.c wraps the entire ip_set_test() in rcu read lock/unlock pair. So there's no need to extend the rcu read locked area in ipset itself. Closes: https://lore.kernel.org/all/69e7963b-e7f8-3ad0-210-7b86eebf7f78@netfilter.org/ Reported by: Linkui Xiao Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 35d2f9c9ada0..4c133e06be1d 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -61,6 +61,8 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); ip_set_dereference((inst)->ip_set_list)[id] #define ip_set_ref_netlink(inst,id) \ rcu_dereference_raw((inst)->ip_set_list)[id] +#define ip_set_dereference_nfnl(p) \ + rcu_dereference_check(p, lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET)) /* The set types are implemented in modules and registered set types * can be found in ip_set_type_list. 
Adding/deleting types is @@ -708,15 +710,10 @@ __ip_set_put_netlink(struct ip_set *set) static struct ip_set * ip_set_rcu_get(struct net *net, ip_set_id_t index) { - struct ip_set *set; struct ip_set_net *inst = ip_set_pernet(net); - rcu_read_lock(); - /* ip_set_list itself needs to be protected */ - set = rcu_dereference(inst->ip_set_list)[index]; - rcu_read_unlock(); - - return set; + /* ip_set_list and the set pointer need to be protected */ + return ip_set_dereference_nfnl(inst->ip_set_list)[index]; } static inline void @@ -1397,6 +1394,9 @@ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info, ip_set(inst, to_id) = from; write_unlock_bh(&ip_set_ref_lock); + /* Make sure all readers of the old set pointers are completed. */ + synchronize_rcu(); + return 0; } -- cgit From 8837ba3e58ea1e3d09ae36db80b1e80853aada95 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 13 Nov 2023 21:13:31 +0100 Subject: netfilter: nf_tables: split async and sync catchall in two functions list_for_each_entry_safe() does not work for the async case which runs under RCU, therefore, split GC logic for catchall in two functions instead, one for each of the sync and async GC variants. The catchall sync GC variant never sees a _DEAD bit set on ever, thus, this handling is removed in such case, moreover, allocate GC sync batch via GFP_KERNEL. Fixes: 93995bf4af2c ("netfilter: nf_tables: remove catchall element in GC sync path") Reported-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 55 +++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index debea1c67701..c0a42989b982 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9680,16 +9680,14 @@ void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) call_rcu(&trans->rcu, nft_trans_gc_trans_free); } -static struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, - unsigned int gc_seq, - bool sync) +struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, + unsigned int gc_seq) { - struct nft_set_elem_catchall *catchall, *next; + struct nft_set_elem_catchall *catchall; const struct nft_set *set = gc->set; - struct nft_elem_priv *elem_priv; struct nft_set_ext *ext; - list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_expired(ext)) @@ -9699,35 +9697,42 @@ static struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, nft_set_elem_dead(ext); dead_elem: - if (sync) - gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); - else - gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); - + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); if (!gc) return NULL; - elem_priv = catchall->elem; - if (sync) { - nft_setelem_data_deactivate(gc->net, gc->set, elem_priv); - nft_setelem_catchall_destroy(catchall); - } - - nft_trans_gc_elem_add(gc, elem_priv); + nft_trans_gc_elem_add(gc, catchall->elem); } return gc; } -struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, - unsigned int gc_seq) -{ - return nft_trans_gc_catchall(gc, gc_seq, false); -} - struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc) { - return nft_trans_gc_catchall(gc, 0, true); + struct nft_set_elem_catchall *catchall, *next; + const struct nft_set *set = gc->set; + struct 
nft_elem_priv *elem_priv; + struct nft_set_ext *ext; + + WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net)); + + list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + + if (!nft_set_elem_expired(ext)) + continue; + + gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); + if (!gc) + return NULL; + + elem_priv = catchall->elem; + nft_setelem_data_deactivate(gc->net, gc->set, elem_priv); + nft_setelem_catchall_destroy(catchall); + nft_trans_gc_elem_add(gc, elem_priv); + } + + return gc; } static void nf_tables_module_autoload_cleanup(struct net *net) -- cgit From b944aa9d86d5f782bfe5e51336434c960304839c Mon Sep 17 00:00:00 2001 From: Matus Malych Date: Tue, 14 Nov 2023 14:35:25 +0100 Subject: ALSA: hda/realtek: Enable Mute LED on HP 255 G10 HP 255 G10 has a mute LED that can be made to work using quirk ALC236_FIXUP_HP_MUTE_LED_COEFBIT2. Enable already existing quirk - at correct line to keep order Signed-off-by: Matus Malych Cc: Link: https://lore.kernel.org/r/20231114133524.11340-1-matus@malych.org Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index cdd808e02b44..3c85b8247c11 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9868,6 +9868,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8abb, "HP ZBook Firefly 14 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8ad1, "HP EliteBook 840 14 inch G9 Notebook PC", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8ad2, "HP EliteBook 860 16 inch G9 Notebook PC", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b2f, "HP 255 15.6 inch G10 Notebook PC", ALC236_FIXUP_HP_MUTE_LED_COEFBIT2), SND_PCI_QUIRK(0x103c, 0x8b42, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8b43, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8b44, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), -- cgit From 2e6ef8aaba6b709ce91164401fa1c12668510360 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 14 Nov 2023 07:57:53 -0600 Subject: Remove myself as maintainer of GFS2 I am retiring from Red Hat and will no longer be a maintainer of the gfs2 file system. Signed-off-by: Bob Peterson Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 97f51d5ec1cf..5c9f868e13b6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8950,7 +8950,6 @@ S: Maintained F: scripts/get_maintainer.pl GFS2 FILE SYSTEM -M: Bob Peterson M: Andreas Gruenbacher L: gfs2@lists.linux.dev S: Supported -- cgit From 782ce431613cf08c3a00dca42ad925c3b1108d09 Mon Sep 17 00:00:00 2001 From: Konstantin Runov Date: Mon, 30 Oct 2023 12:45:08 +0300 Subject: gcc-plugins: latent_entropy: Fix typo (args -> argc) in plugin description Fix the typo in the plugin description comment. Clearly, "argc" should be used. 
Signed-off-by: Konstantin Runov Link: https://lore.kernel.org/r/20231030094508.245432-1-runebone1@gmail.com Signed-off-by: Kees Cook --- scripts/gcc-plugins/latent_entropy_plugin.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/gcc-plugins/latent_entropy_plugin.c b/scripts/gcc-plugins/latent_entropy_plugin.c index 39e86be60dd2..ff0b192be91f 100644 --- a/scripts/gcc-plugins/latent_entropy_plugin.c +++ b/scripts/gcc-plugins/latent_entropy_plugin.c @@ -17,7 +17,7 @@ * if (argc <= 1) * printf("%s: no command arguments :(\n", *argv); * else - * printf("%s: %d command arguments!\n", *argv, args - 1); + * printf("%s: %d command arguments!\n", *argv, argc - 1); * } * * after: @@ -47,7 +47,7 @@ * // perturb_local_entropy() * } else { * local_entropy ^= 3896280633962944730; - * printf("%s: %d command arguments!\n", *argv, args - 1); + * printf("%s: %d command arguments!\n", *argv, argc - 1); * } * * // latent_entropy_execute() 4. -- cgit From 29954d5b1e0d67a4cd61c30c2201030c97e94b1e Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Tue, 14 Nov 2023 04:54:12 +0000 Subject: cifs: fix leak of iface for primary channel My last change in this area introduced a change which accounted for primary channel in the interface ref count. However, it did not reduce this ref count on deallocation of the primary channel. i.e. during umount. Fixing this leak here, by dropping this ref count for primary channel while freeing up the session. Fixes: fa1d0508bdd4 ("cifs: account for primary channel in the interface list") Cc: stable@vger.kernel.org Reported-by: Paulo Alcantara Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/connect.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 57c2a7df3457..f896f60c924b 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -2065,6 +2065,12 @@ void __cifs_put_smb_ses(struct cifs_ses *ses) ses->chans[i].server = NULL; } + /* we now account for primary channel in iface->refcount */ + if (ses->chans[0].iface) { + kref_put(&ses->chans[0].iface->refcount, release_iface); + ses->chans[0].server = NULL; + } + sesInfoFree(ses); cifs_put_tcp_session(server, 0); } -- cgit From 5eef12c4e3230f2025dc46ad8c4a3bc19978e5d7 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Tue, 14 Nov 2023 04:58:23 +0000 Subject: cifs: fix lock ordering while disabling multichannel The code to handle the case of server disabling multichannel was picking iface_lock with chan_lock held. This goes against the lock ordering rules, as iface_lock is a higher order lock (even if it isn't so obvious). This change fixes the lock ordering by doing the following in that order for each secondary channel: 1. store iface and server pointers in local variable 2. remove references to iface and server in channels 3. unlock chan_lock 4. lock iface_lock 5. dec ref count for iface 6. unlock iface_lock 7. dec ref count for server 8. lock chan_lock again Since this function can only be called in smb2_reconnect, and that cannot be called by two parallel processes, we should not have races due to dropping chan_lock between steps 3 and 8. 
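For illustration, the same take-refs/drop-lower-lock/take-higher-lock shape in a minimal userspace sketch (hypothetical names, not the cifs code; pthread mutexes stand in for the spinlocks):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t chan_lock = PTHREAD_MUTEX_INITIALIZER;  /* lower-order lock */
static pthread_mutex_t iface_lock = PTHREAD_MUTEX_INITIALIZER; /* higher-order lock */

struct chan { const char *iface; const char *server; };

/* called with chan_lock held; returns with chan_lock held again */
static void disable_channel(struct chan *c)
{
	const char *iface = c->iface;          /* 1. stash the pointers */
	const char *server = c->server;

	c->iface = NULL;                       /* 2. clear the channel refs */
	c->server = NULL;
	pthread_mutex_unlock(&chan_lock);      /* 3. drop the lower-order lock */

	pthread_mutex_lock(&iface_lock);       /* 4. higher-order lock is now safe */
	if (iface)
		printf("put iface ref %s\n", iface);   /* 5. */
	pthread_mutex_unlock(&iface_lock);     /* 6. */

	if (server)
		printf("put server ref %s\n", server); /* 7. outside both locks */

	pthread_mutex_lock(&chan_lock);        /* 8. re-take chan_lock for the caller */
}

int main(void)
{
	struct chan c = { "iface0", "server0" };

	pthread_mutex_lock(&chan_lock);
	disable_channel(&c);
	pthread_mutex_unlock(&chan_lock);
	return 0;
}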
Fixes: ee1d21794e55 ("cifs: handle when server stops supporting multichannel") Reported-by: Paulo Alcantara Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/sess.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 0bb2ac929061..8b2d7c1ca428 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -322,28 +322,32 @@ cifs_disable_secondary_channels(struct cifs_ses *ses) iface = ses->chans[i].iface; server = ses->chans[i].server; + /* + * remove these references first, since we need to unlock + * the chan_lock here, since iface_lock is a higher lock + */ + ses->chans[i].iface = NULL; + ses->chans[i].server = NULL; + spin_unlock(&ses->chan_lock); + if (iface) { spin_lock(&ses->iface_lock); kref_put(&iface->refcount, release_iface); - ses->chans[i].iface = NULL; iface->num_channels--; if (iface->weight_fulfilled) iface->weight_fulfilled--; spin_unlock(&ses->iface_lock); } - spin_unlock(&ses->chan_lock); - if (server && !server->terminate) { - server->terminate = true; - cifs_signal_cifsd_for_reconnect(server, false); - } - spin_lock(&ses->chan_lock); - if (server) { - ses->chans[i].server = NULL; + if (!server->terminate) { + server->terminate = true; + cifs_signal_cifsd_for_reconnect(server, false); + } cifs_put_tcp_session(server, false); } + spin_lock(&ses->chan_lock); } done: -- cgit From eab03c23c2a162085b13200d7942fc5a00b5ccc8 Mon Sep 17 00:00:00 2001 From: Abel Wu Date: Tue, 7 Nov 2023 17:05:07 +0800 Subject: sched/eevdf: Fix vruntime adjustment on reweight vruntime of the (on_rq && !0-lag) entity needs to be adjusted when it gets re-weighted, and the calculations can be simplified based on the fact that re-weight won't change the w-average of all the entities. Please check the proofs in comments. But adjusting vruntime can also cause position change in RB-tree hence require re-queue to fix up which might be costly. This might be avoided by deferring adjustment to the time the entity actually leaves tree (dequeue/pick), but that will negatively affect task selection and probably not good enough either. Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy") Signed-off-by: Abel Wu Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231107090510.71322-2-wuyun.abel@bytedance.com --- kernel/sched/fair.c | 151 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 128 insertions(+), 23 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2048138ce54b..025d90925bf6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3666,41 +3666,140 @@ static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } #endif +static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) +{ + unsigned long old_weight = se->load.weight; + u64 avruntime = avg_vruntime(cfs_rq); + s64 vlag, vslice; + + /* + * VRUNTIME + * ======== + * + * COROLLARY #1: The virtual runtime of the entity needs to be + * adjusted if re-weight at !0-lag point. + * + * Proof: For contradiction assume this is not true, so we can + * re-weight without changing vruntime at !0-lag point. 
+ * + * Weight VRuntime Avg-VRuntime + * before w v V + * after w' v' V' + * + * Since lag needs to be preserved through re-weight: + * + * lag = (V - v)*w = (V'- v')*w', where v = v' + * ==> V' = (V - v)*w/w' + v (1) + * + * Let W be the total weight of the entities before reweight, + * since V' is the new weighted average of entities: + * + * V' = (WV + w'v - wv) / (W + w' - w) (2) + * + * by using (1) & (2) we obtain: + * + * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v + * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v + * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v + * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3) + * + * Since we are doing at !0-lag point which means V != v, we + * can simplify (3): + * + * ==> W / (W + w' - w) = w / w' + * ==> Ww' = Ww + ww' - ww + * ==> W * (w' - w) = w * (w' - w) + * ==> W = w (re-weight indicates w' != w) + * + * So the cfs_rq contains only one entity, hence vruntime of + * the entity @v should always equal to the cfs_rq's weighted + * average vruntime @V, which means we will always re-weight + * at 0-lag point, thus breach assumption. Proof completed. + * + * + * COROLLARY #2: Re-weight does NOT affect weighted average + * vruntime of all the entities. + * + * Proof: According to corollary #1, Eq. (1) should be: + * + * (V - v)*w = (V' - v')*w' + * ==> v' = V' - (V - v)*w/w' (4) + * + * According to the weighted average formula, we have: + * + * V' = (WV - wv + w'v') / (W - w + w') + * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w') + * = (WV - wv + w'V' - Vw + wv) / (W - w + w') + * = (WV + w'V' - Vw) / (W - w + w') + * + * ==> V'*(W - w + w') = WV + w'V' - Vw + * ==> V' * (W - w) = (W - w) * V (5) + * + * If the entity is the only one in the cfs_rq, then reweight + * always occurs at 0-lag point, so V won't change. Or else + * there are other entities, hence W != w, then Eq. (5) turns + * into V' = V. So V won't change in either case, proof done. + * + * + * So according to corollary #1 & #2, the effect of re-weight + * on vruntime should be: + * + * v' = V' - (V - v) * w / w' (4) + * = V - (V - v) * w / w' + * = V - vl * w / w' + * = V - vl' + */ + if (avruntime != se->vruntime) { + vlag = (s64)(avruntime - se->vruntime); + vlag = div_s64(vlag * old_weight, weight); + se->vruntime = avruntime - vlag; + } + + /* + * DEADLINE + * ======== + * + * When the weight changes, the virtual time slope changes and + * we should adjust the relative virtual deadline accordingly. + * + * d' = v' + (d - v)*w/w' + * = V' - (V - v)*w/w' + (d - v)*w/w' + * = V - (V - v)*w/w' + (d - v)*w/w' + * = V + (d - V)*w/w' + */ + vslice = (s64)(se->deadline - avruntime); + vslice = div_s64(vslice * old_weight, weight); + se->deadline = avruntime + vslice; +} + static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { - unsigned long old_weight = se->load.weight; + bool curr = cfs_rq->curr == se; if (se->on_rq) { /* commit outstanding execution time */ - if (cfs_rq->curr == se) + if (curr) update_curr(cfs_rq); else - avg_vruntime_sub(cfs_rq, se); + __dequeue_entity(cfs_rq, se); update_load_sub(&cfs_rq->load, se->load.weight); } dequeue_load_avg(cfs_rq, se); - update_load_set(&se->load, weight); - if (!se->on_rq) { /* * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), * we need to scale se->vlag when w_i changes. 
*/ - se->vlag = div_s64(se->vlag * old_weight, weight); + se->vlag = div_s64(se->vlag * se->load.weight, weight); } else { - s64 deadline = se->deadline - se->vruntime; - /* - * When the weight changes, the virtual time slope changes and - * we should adjust the relative virtual deadline accordingly. - */ - deadline = div_s64(deadline * old_weight, weight); - se->deadline = se->vruntime + deadline; - if (se != cfs_rq->curr) - min_deadline_cb_propagate(&se->run_node, NULL); + reweight_eevdf(cfs_rq, se, weight); } + update_load_set(&se->load, weight); + #ifdef CONFIG_SMP do { u32 divider = get_pelt_divider(&se->avg); @@ -3712,8 +3811,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, enqueue_load_avg(cfs_rq, se); if (se->on_rq) { update_load_add(&cfs_rq->load, se->load.weight); - if (cfs_rq->curr != se) - avg_vruntime_add(cfs_rq, se); + if (!curr) { + /* + * The entity's vruntime has been adjusted, so let's check + * whether the rq-wide min_vruntime needs updated too. Since + * the calculations above require stable min_vruntime rather + * than up-to-date one, we do the update at the end of the + * reweight process. + */ + __enqueue_entity(cfs_rq, se); + update_min_vruntime(cfs_rq); + } } } @@ -3857,14 +3965,11 @@ static void update_cfs_group(struct sched_entity *se) #ifndef CONFIG_SMP shares = READ_ONCE(gcfs_rq->tg->shares); - - if (likely(se->load.weight == shares)) - return; #else - shares = calc_group_shares(gcfs_rq); + shares = calc_group_shares(gcfs_rq); #endif - - reweight_entity(cfs_rq_of(se), se, shares); + if (unlikely(se->load.weight != shares)) + reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ -- cgit From 8b39d20eceeda6c4eb23df1497f9ed2fffdc8f69 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 26 Oct 2023 12:41:14 -0400 Subject: sched: psi: fix unprivileged polling against cgroups 519fabc7aaba ("psi: remove 500ms min window size limitation for triggers") breaks unprivileged psi polling on cgroups. Historically, we had a privilege check for polling in the open() of a pressure file in /proc, but were erroneously missing it for the open() of cgroup pressure files. When unprivileged polling was introduced in d82caa273565 ("sched/psi: Allow unprivileged polling of N*2s period"), it needed to filter privileges depending on the exact polling parameters, and as such moved the CAP_SYS_RESOURCE check from the proc open() callback to psi_trigger_create(). Both the proc files as well as cgroup files go through this during write(). This implicitly added the missing check for privileges required for HT polling for cgroups. When 519fabc7aaba ("psi: remove 500ms min window size limitation for triggers") followed right after to remove further restrictions on the RT polling window, it incorrectly assumed the cgroup privilege check was still missing and added it to the cgroup open(), mirroring what we used to do for proc files in the past. As a result, unprivileged poll requests that would be supported now get rejected when opening the cgroup pressure file for writing. Remove the cgroup open() check. psi_trigger_create() handles it. 
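The point is where the privilege decision belongs: at trigger creation, once the requested window is known, not at open(). A loose userspace toy of that placement (illustration only -- the real checks live in psi_trigger_create(), and only the N*2s rule referenced above is modelled):

#include <stdio.h>
#include <stdbool.h>
#include <errno.h>

static int trigger_create(unsigned long win_us, bool cap_sys_resource)
{
	if (cap_sys_resource)
		return 0;                     /* privileged: any window */
	if (win_us && win_us % 2000000 == 0)
		return 0;                     /* unprivileged: N * 2s windows */
	return -EPERM;
}

int main(void)
{
	printf("unprivileged 2s window:    %d\n", trigger_create(2000000, false));
	printf("unprivileged 150ms window: %d\n", trigger_create(150000, false));
	printf("privileged 150ms window:   %d\n", trigger_create(150000, true));
	return 0;
}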
Fixes: 519fabc7aaba ("psi: remove 500ms min window size limitation for triggers") Reported-by: Luca Boccassi Signed-off-by: Johannes Weiner Signed-off-by: Peter Zijlstra (Intel) Acked-by: Luca Boccassi Acked-by: Suren Baghdasaryan Cc: stable@vger.kernel.org # 6.5+ Link: https://lore.kernel.org/r/20231026164114.2488682-1-hannes@cmpxchg.org --- kernel/cgroup/cgroup.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1d5b9de3b1b9..4b9ff41ca603 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3885,14 +3885,6 @@ static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); } -static int cgroup_pressure_open(struct kernfs_open_file *of) -{ - if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) - return -EPERM; - - return 0; -} - static void cgroup_pressure_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; @@ -5299,7 +5291,6 @@ static struct cftype cgroup_psi_files[] = { { .name = "io.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]), - .open = cgroup_pressure_open, .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, @@ -5308,7 +5299,6 @@ static struct cftype cgroup_psi_files[] = { { .name = "memory.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]), - .open = cgroup_pressure_open, .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, @@ -5317,7 +5307,6 @@ static struct cftype cgroup_psi_files[] = { { .name = "cpu.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]), - .open = cgroup_pressure_open, .seq_show = cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, @@ -5327,7 +5316,6 @@ static struct cftype cgroup_psi_files[] = { { .name = "irq.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]), - .open = cgroup_pressure_open, .seq_show = cgroup_irq_pressure_show, .write = cgroup_irq_pressure_write, .poll = cgroup_pressure_poll, -- cgit From 6d7e4782bcf549221b4ccfffec2cf4d1a473f1a3 Mon Sep 17 00:00:00 2001 From: Keisuke Nishimura Date: Tue, 31 Oct 2023 14:38:22 +0100 Subject: sched/fair: Fix the decision for load balance should_we_balance is called for the decision to do load-balancing. When sched ticks invoke this function, only one CPU should return true. However, in the current code, two CPUs can return true. The following situation, where b means busy and i means idle, is an example, because CPU 0 and CPU 2 return true. [0, 1] [2, 3] b b i b This fix checks if there exists an idle CPU with busy sibling(s) after looking for a CPU on an idle core. If some idle CPUs with busy siblings are found, just the first one should do load-balancing. 
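As a toy model of the intended selection (ignoring sched domains and flags; illustration only, not kernel code), exactly one CPU ends up balancing:

#include <stdio.h>
#include <stdbool.h>

/* CPUs are paired as SMT siblings: (0,1), (2,3), ... */
static int pick_balancer(const bool *busy, int nr_cpus)
{
	int idle_smt = -1;

	for (int cpu = 0; cpu < nr_cpus; cpu += 2) {
		bool cpu_idle = !busy[cpu], sib_idle = !busy[cpu + 1];

		if (cpu_idle && sib_idle)
			return cpu;              /* first fully idle core wins */
		if (idle_smt == -1) {
			if (cpu_idle)
				idle_smt = cpu;      /* idle CPU with a busy sibling */
			else if (sib_idle)
				idle_smt = cpu + 1;
		}
	}
	return idle_smt != -1 ? idle_smt : 0;    /* else first CPU of the group */
}

int main(void)
{
	/* the changelog example: [0,1] = b b, [2,3] = i b */
	bool busy[4] = { true, true, false, true };

	printf("CPU %d balances\n", pick_balancer(busy, 4));  /* CPU 2, and only CPU 2 */
	return 0;
}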
Fixes: b1bfeab9b002 ("sched/fair: Consider the idle state of the whole core for load balance") Signed-off-by: Keisuke Nishimura Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chen Yu Reviewed-by: Shrikanth Hegde Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20231031133821.1570861-1-keisuke.nishimura@inria.fr --- kernel/sched/fair.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 025d90925bf6..d7a3c63a2171 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11184,12 +11184,16 @@ static int should_we_balance(struct lb_env *env) continue; } - /* Are we the first idle CPU? */ + /* + * Are we the first idle core in a non-SMT domain or higher, + * or the first idle CPU in a SMT domain? + */ return cpu == env->dst_cpu; } - if (idle_smt == env->dst_cpu) - return true; + /* Are we the first idle CPU with busy siblings? */ + if (idle_smt != -1) + return idle_smt == env->dst_cpu; /* Are we the first CPU of this group ? */ return group_balance_cpu(sg) == env->dst_cpu; -- cgit From 09f12bf9f790052710bd6e48a1fc1bc4d9e17389 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 7 Nov 2023 18:18:31 +0300 Subject: nouveau/gsp/r535: uninitialized variable in r535_gsp_acpi_mux_id() The if we hit the "continue" statement on the first iteration through the loop then "handle_mux" needs to be set to NULL so we continue looping. Fixes: 176fdcbddfd2 ("drm/nouveau/gsp/r535: add support for booting GSP-RM") Signed-off-by: Dan Carpenter Reviewed-by: Danilo Krummrich Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/1d864f6e-43e9-43d8-9d90-30e76c9c843b@moroto.mountain --- drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c index e31f9641114b..afa8e7377a76 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c @@ -1159,7 +1159,7 @@ static void r535_gsp_acpi_mux_id(acpi_handle handle, u32 id, MUX_METHOD_DATA_ELEMENT *mode, MUX_METHOD_DATA_ELEMENT *part) { - acpi_handle iter = NULL, handle_mux; + acpi_handle iter = NULL, handle_mux = NULL; acpi_status status; unsigned long long value; -- cgit From 42bd415bd8bd43721d423930b4695c565661e687 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 8 Nov 2023 10:40:21 +0300 Subject: nouveau/gsp/r535: Fix a NULL vs error pointer bug The r535_gsp_cmdq_get() function returns error pointers but this code checks for NULL. Also we need to propagate the error pointer back to the callers in r535_gsp_rpc_get(). Returning NULL will lead to a NULL pointer dereference. 
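For reference, the NULL-vs-IS_ERR() idiom in a self-contained userspace rendering (simplified macro definitions, not the kernel's <linux/err.h>):

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO	4095
#define ERR_PTR(err)	((void *)(long)(err))
#define PTR_ERR(ptr)	((long)(ptr))
#define ERR_CAST(ptr)	((void *)(ptr))
#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

static void *cmdq_get(int fail)
{
	static int slot;

	return fail ? ERR_PTR(-EAGAIN) : (void *)&slot;  /* never returns NULL */
}

static void *rpc_get(int fail)
{
	void *rpc = cmdq_get(fail);

	if (IS_ERR(rpc))              /* a NULL check here would miss the error */
		return ERR_CAST(rpc); /* propagate the encoded errno, not NULL */

	return rpc;                   /* ... caller fills in the message ... */
}

int main(void)
{
	void *bad = rpc_get(1);

	if (IS_ERR(bad))
		printf("propagated error: %ld\n", PTR_ERR(bad));  /* -11 (EAGAIN) */
	return 0;
}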
Fixes: 176fdcbddfd2 ("drm/nouveau/gsp/r535: add support for booting GSP-RM") Signed-off-by: Dan Carpenter Reviewed-by: Danilo Krummrich Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/f71996d9-d1cb-45ea-a4b2-2dfc21312d8c@kili.mountain --- drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c index afa8e7377a76..dc44f5c7833f 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c @@ -689,8 +689,8 @@ r535_gsp_rpc_get(struct nvkm_gsp *gsp, u32 fn, u32 argc) struct nvfw_gsp_rpc *rpc; rpc = r535_gsp_cmdq_get(gsp, ALIGN(sizeof(*rpc) + argc, sizeof(u64))); - if (!rpc) - return NULL; + if (IS_ERR(rpc)) + return ERR_CAST(rpc); rpc->header_version = 0x03000000; rpc->signature = ('C' << 24) | ('P' << 16) | ('R' << 8) | 'V'; -- cgit From a2e36cd56041e277d7d81d35638fd8d9731e21f5 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Tue, 7 Nov 2023 15:32:55 +1000 Subject: nouveau: use an rwlock for the event lock. This allows it to break the following circular locking dependency. Aug 10 07:01:29 dg1test kernel: ====================================================== Aug 10 07:01:29 dg1test kernel: WARNING: possible circular locking dependency detected Aug 10 07:01:29 dg1test kernel: 6.4.0-rc7+ #10 Not tainted Aug 10 07:01:29 dg1test kernel: ------------------------------------------------------ Aug 10 07:01:29 dg1test kernel: wireplumber/2236 is trying to acquire lock: Aug 10 07:01:29 dg1test kernel: ffff8fca5320da18 (&fctx->lock){-...}-{2:2}, at: nouveau_fence_wait_uevent_handler+0x2b/0x100 [nouveau] Aug 10 07:01:29 dg1test kernel: but task is already holding lock: Aug 10 07:01:29 dg1test kernel: ffff8fca41208610 (&event->list_lock#2){-...}-{2:2}, at: nvkm_event_ntfy+0x50/0xf0 [nouveau] Aug 10 07:01:29 dg1test kernel: which lock already depends on the new lock. 
Aug 10 07:01:29 dg1test kernel: the existing dependency chain (in reverse order) is: Aug 10 07:01:29 dg1test kernel: -> #3 (&event->list_lock#2){-...}-{2:2}: Aug 10 07:01:29 dg1test kernel: _raw_spin_lock_irqsave+0x4b/0x70 Aug 10 07:01:29 dg1test kernel: nvkm_event_ntfy+0x50/0xf0 [nouveau] Aug 10 07:01:29 dg1test kernel: ga100_fifo_nonstall_intr+0x24/0x30 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_intr+0x12c/0x240 [nouveau] Aug 10 07:01:29 dg1test kernel: __handle_irq_event_percpu+0x88/0x240 Aug 10 07:01:29 dg1test kernel: handle_irq_event+0x38/0x80 Aug 10 07:01:29 dg1test kernel: handle_edge_irq+0xa3/0x240 Aug 10 07:01:29 dg1test kernel: __common_interrupt+0x72/0x160 Aug 10 07:01:29 dg1test kernel: common_interrupt+0x60/0xe0 Aug 10 07:01:29 dg1test kernel: asm_common_interrupt+0x26/0x40 Aug 10 07:01:29 dg1test kernel: -> #2 (&device->intr.lock){-...}-{2:2}: Aug 10 07:01:29 dg1test kernel: _raw_spin_lock_irqsave+0x4b/0x70 Aug 10 07:01:29 dg1test kernel: nvkm_inth_allow+0x2c/0x80 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_event_ntfy_state+0x181/0x250 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_event_ntfy_allow+0x63/0xd0 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_uevent_mthd+0x4d/0x70 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_ioctl+0x10b/0x250 [nouveau] Aug 10 07:01:29 dg1test kernel: nvif_object_mthd+0xa8/0x1f0 [nouveau] Aug 10 07:01:29 dg1test kernel: nvif_event_allow+0x2a/0xa0 [nouveau] Aug 10 07:01:29 dg1test kernel: nouveau_fence_enable_signaling+0x78/0x80 [nouveau] Aug 10 07:01:29 dg1test kernel: __dma_fence_enable_signaling+0x5e/0x100 Aug 10 07:01:29 dg1test kernel: dma_fence_add_callback+0x4b/0xd0 Aug 10 07:01:29 dg1test kernel: nouveau_cli_work_queue+0xae/0x110 [nouveau] Aug 10 07:01:29 dg1test kernel: nouveau_gem_object_close+0x1d1/0x2a0 [nouveau] Aug 10 07:01:29 dg1test kernel: drm_gem_handle_delete+0x70/0xe0 [drm] Aug 10 07:01:29 dg1test kernel: drm_ioctl_kernel+0xa5/0x150 [drm] Aug 10 07:01:29 dg1test kernel: drm_ioctl+0x256/0x490 [drm] Aug 10 07:01:29 dg1test kernel: nouveau_drm_ioctl+0x5a/0xb0 [nouveau] Aug 10 07:01:29 dg1test kernel: __x64_sys_ioctl+0x91/0xd0 Aug 10 07:01:29 dg1test kernel: do_syscall_64+0x3c/0x90 Aug 10 07:01:29 dg1test kernel: entry_SYSCALL_64_after_hwframe+0x72/0xdc Aug 10 07:01:29 dg1test kernel: -> #1 (&event->refs_lock#4){....}-{2:2}: Aug 10 07:01:29 dg1test kernel: _raw_spin_lock_irqsave+0x4b/0x70 Aug 10 07:01:29 dg1test kernel: nvkm_event_ntfy_state+0x37/0x250 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_event_ntfy_allow+0x63/0xd0 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_uevent_mthd+0x4d/0x70 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_ioctl+0x10b/0x250 [nouveau] Aug 10 07:01:29 dg1test kernel: nvif_object_mthd+0xa8/0x1f0 [nouveau] Aug 10 07:01:29 dg1test kernel: nvif_event_allow+0x2a/0xa0 [nouveau] Aug 10 07:01:29 dg1test kernel: nouveau_fence_enable_signaling+0x78/0x80 [nouveau] Aug 10 07:01:29 dg1test kernel: __dma_fence_enable_signaling+0x5e/0x100 Aug 10 07:01:29 dg1test kernel: dma_fence_add_callback+0x4b/0xd0 Aug 10 07:01:29 dg1test kernel: nouveau_cli_work_queue+0xae/0x110 [nouveau] Aug 10 07:01:29 dg1test kernel: nouveau_gem_object_close+0x1d1/0x2a0 [nouveau] Aug 10 07:01:29 dg1test kernel: drm_gem_handle_delete+0x70/0xe0 [drm] Aug 10 07:01:29 dg1test kernel: drm_ioctl_kernel+0xa5/0x150 [drm] Aug 10 07:01:29 dg1test kernel: drm_ioctl+0x256/0x490 [drm] Aug 10 07:01:29 dg1test kernel: nouveau_drm_ioctl+0x5a/0xb0 [nouveau] Aug 10 07:01:29 dg1test kernel: __x64_sys_ioctl+0x91/0xd0 Aug 10 07:01:29 dg1test kernel: 
do_syscall_64+0x3c/0x90 Aug 10 07:01:29 dg1test kernel: entry_SYSCALL_64_after_hwframe+0x72/0xdc Aug 10 07:01:29 dg1test kernel: -> #0 (&fctx->lock){-...}-{2:2}: Aug 10 07:01:29 dg1test kernel: __lock_acquire+0x14e3/0x2240 Aug 10 07:01:29 dg1test kernel: lock_acquire+0xc8/0x2a0 Aug 10 07:01:29 dg1test kernel: _raw_spin_lock_irqsave+0x4b/0x70 Aug 10 07:01:29 dg1test kernel: nouveau_fence_wait_uevent_handler+0x2b/0x100 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_client_event+0xf/0x20 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_event_ntfy+0x9b/0xf0 [nouveau] Aug 10 07:01:29 dg1test kernel: ga100_fifo_nonstall_intr+0x24/0x30 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_intr+0x12c/0x240 [nouveau] Aug 10 07:01:29 dg1test kernel: __handle_irq_event_percpu+0x88/0x240 Aug 10 07:01:29 dg1test kernel: handle_irq_event+0x38/0x80 Aug 10 07:01:29 dg1test kernel: handle_edge_irq+0xa3/0x240 Aug 10 07:01:29 dg1test kernel: __common_interrupt+0x72/0x160 Aug 10 07:01:29 dg1test kernel: common_interrupt+0x60/0xe0 Aug 10 07:01:29 dg1test kernel: asm_common_interrupt+0x26/0x40 Aug 10 07:01:29 dg1test kernel: other info that might help us debug this: Aug 10 07:01:29 dg1test kernel: Chain exists of: &fctx->lock --> &device->intr.lock --> &event->list_lock#2 Aug 10 07:01:29 dg1test kernel: Possible unsafe locking scenario: Aug 10 07:01:29 dg1test kernel: CPU0 CPU1 Aug 10 07:01:29 dg1test kernel: ---- ---- Aug 10 07:01:29 dg1test kernel: lock(&event->list_lock#2); Aug 10 07:01:29 dg1test kernel: lock(&device->intr.lock); Aug 10 07:01:29 dg1test kernel: lock(&event->list_lock#2); Aug 10 07:01:29 dg1test kernel: lock(&fctx->lock); Aug 10 07:01:29 dg1test kernel: *** DEADLOCK *** Aug 10 07:01:29 dg1test kernel: 2 locks held by wireplumber/2236: Aug 10 07:01:29 dg1test kernel: #0: ffff8fca53177bf8 (&device->intr.lock){-...}-{2:2}, at: nvkm_intr+0x29/0x240 [nouveau] Aug 10 07:01:29 dg1test kernel: #1: ffff8fca41208610 (&event->list_lock#2){-...}-{2:2}, at: nvkm_event_ntfy+0x50/0xf0 [nouveau] Aug 10 07:01:29 dg1test kernel: stack backtrace: Aug 10 07:01:29 dg1test kernel: CPU: 6 PID: 2236 Comm: wireplumber Not tainted 6.4.0-rc7+ #10 Aug 10 07:01:29 dg1test kernel: Hardware name: Gigabyte Technology Co., Ltd. Z390 I AORUS PRO WIFI/Z390 I AORUS PRO WIFI-CF, BIOS F8 11/05/2021 Aug 10 07:01:29 dg1test kernel: Call Trace: Aug 10 07:01:29 dg1test kernel: Aug 10 07:01:29 dg1test kernel: dump_stack_lvl+0x5b/0x90 Aug 10 07:01:29 dg1test kernel: check_noncircular+0xe2/0x110 Aug 10 07:01:29 dg1test kernel: __lock_acquire+0x14e3/0x2240 Aug 10 07:01:29 dg1test kernel: lock_acquire+0xc8/0x2a0 Aug 10 07:01:29 dg1test kernel: ? nouveau_fence_wait_uevent_handler+0x2b/0x100 [nouveau] Aug 10 07:01:29 dg1test kernel: ? lock_acquire+0xc8/0x2a0 Aug 10 07:01:29 dg1test kernel: _raw_spin_lock_irqsave+0x4b/0x70 Aug 10 07:01:29 dg1test kernel: ? 
nouveau_fence_wait_uevent_handler+0x2b/0x100 [nouveau] Aug 10 07:01:29 dg1test kernel: nouveau_fence_wait_uevent_handler+0x2b/0x100 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_client_event+0xf/0x20 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_event_ntfy+0x9b/0xf0 [nouveau] Aug 10 07:01:29 dg1test kernel: ga100_fifo_nonstall_intr+0x24/0x30 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_intr+0x12c/0x240 [nouveau] Aug 10 07:01:29 dg1test kernel: __handle_irq_event_percpu+0x88/0x240 Aug 10 07:01:29 dg1test kernel: handle_irq_event+0x38/0x80 Aug 10 07:01:29 dg1test kernel: handle_edge_irq+0xa3/0x240 Aug 10 07:01:29 dg1test kernel: __common_interrupt+0x72/0x160 Aug 10 07:01:29 dg1test kernel: common_interrupt+0x60/0xe0 Aug 10 07:01:29 dg1test kernel: asm_common_interrupt+0x26/0x40 Aug 10 07:01:29 dg1test kernel: RIP: 0033:0x7fb66174d700 Aug 10 07:01:29 dg1test kernel: Code: c1 e2 05 29 ca 8d 0c 10 0f be 07 84 c0 75 eb 89 c8 c3 0f 1f 84 00 00 00 00 00 f3 0f 1e fa e9 d7 0f fc ff 0f 1f 80 00 00 00 00 0f 1e fa e9 c7 0f fc> Aug 10 07:01:29 dg1test kernel: RSP: 002b:00007ffdd3c48438 EFLAGS: 00000206 Aug 10 07:01:29 dg1test kernel: RAX: 000055bb758763c0 RBX: 000055bb758752c0 RCX: 00000000000028b0 Aug 10 07:01:29 dg1test kernel: RDX: 000055bb758752c0 RSI: 000055bb75887490 RDI: 000055bb75862950 Aug 10 07:01:29 dg1test kernel: RBP: 00007ffdd3c48490 R08: 000055bb75873b10 R09: 0000000000000001 Aug 10 07:01:29 dg1test kernel: R10: 0000000000000004 R11: 000055bb7587f000 R12: 000055bb75887490 Aug 10 07:01:29 dg1test kernel: R13: 000055bb757f6280 R14: 000055bb758875c0 R15: 000055bb757f6280 Aug 10 07:01:29 dg1test kernel: Signed-off-by: Dave Airlie Tested-by: Danilo Krummrich Reviewed-by: Danilo Krummrich Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20231107053255.2257079-1-airlied@gmail.com --- drivers/gpu/drm/nouveau/include/nvkm/core/event.h | 4 ++-- drivers/gpu/drm/nouveau/nvkm/core/event.c | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/nouveau/include/nvkm/core/event.h b/drivers/gpu/drm/nouveau/include/nvkm/core/event.h index 82b267c11147..460459af272d 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/core/event.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/core/event.h @@ -14,7 +14,7 @@ struct nvkm_event { int index_nr; spinlock_t refs_lock; - spinlock_t list_lock; + rwlock_t list_lock; int *refs; struct list_head ntfy; @@ -38,7 +38,7 @@ nvkm_event_init(const struct nvkm_event_func *func, struct nvkm_subdev *subdev, int types_nr, int index_nr, struct nvkm_event *event) { spin_lock_init(&event->refs_lock); - spin_lock_init(&event->list_lock); + rwlock_init(&event->list_lock); return __nvkm_event_init(func, subdev, types_nr, index_nr, event); } diff --git a/drivers/gpu/drm/nouveau/nvkm/core/event.c b/drivers/gpu/drm/nouveau/nvkm/core/event.c index a6c877135598..61fed7792e41 100644 --- a/drivers/gpu/drm/nouveau/nvkm/core/event.c +++ b/drivers/gpu/drm/nouveau/nvkm/core/event.c @@ -81,17 +81,17 @@ nvkm_event_ntfy_state(struct nvkm_event_ntfy *ntfy) static void nvkm_event_ntfy_remove(struct nvkm_event_ntfy *ntfy) { - spin_lock_irq(&ntfy->event->list_lock); + write_lock_irq(&ntfy->event->list_lock); list_del_init(&ntfy->head); - spin_unlock_irq(&ntfy->event->list_lock); + write_unlock_irq(&ntfy->event->list_lock); } static void nvkm_event_ntfy_insert(struct nvkm_event_ntfy *ntfy) { - spin_lock_irq(&ntfy->event->list_lock); + write_lock_irq(&ntfy->event->list_lock); list_add_tail(&ntfy->head, &ntfy->event->ntfy); - 
spin_unlock_irq(&ntfy->event->list_lock); + write_unlock_irq(&ntfy->event->list_lock); } static void @@ -176,7 +176,7 @@ nvkm_event_ntfy(struct nvkm_event *event, int id, u32 bits) return; nvkm_trace(event->subdev, "event: ntfy %08x on %d\n", bits, id); - spin_lock_irqsave(&event->list_lock, flags); + read_lock_irqsave(&event->list_lock, flags); list_for_each_entry_safe(ntfy, ntmp, &event->ntfy, head) { if (ntfy->id == id && ntfy->bits & bits) { @@ -185,7 +185,7 @@ nvkm_event_ntfy(struct nvkm_event *event, int id, u32 bits) } } - spin_unlock_irqrestore(&event->list_lock, flags); + read_unlock_irqrestore(&event->list_lock, flags); } void -- cgit From 969d90ec212bae4b45bf9d21d7daa30aa6cf055e Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 14 Nov 2023 17:25:48 -0500 Subject: audit: don't WARN_ON_ONCE(!current->mm) in audit_exe_compare() eBPF can end up calling into the audit code from some odd places, and some of these places don't have @current set properly so we end up tripping the `WARN_ON_ONCE(!current->mm)` near the top of `audit_exe_compare()`. While the basic `!current->mm` check is good, the `WARN_ON_ONCE()` results in some scary console messages so let's drop that and just do the regular `!current->mm` check to avoid problems. Cc: Fixes: 47846d51348d ("audit: don't take task_lock() in audit_exe_compare() code path") Reported-by: Artem Savkov Signed-off-by: Paul Moore --- kernel/audit_watch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 91e82e34b51e..7a98cd176a12 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -531,7 +531,7 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) if (tsk != current) return 0; - if (WARN_ON_ONCE(!current->mm)) + if (!current->mm) return 0; exe_file = get_mm_exe_file(current->mm); if (!exe_file) -- cgit From 77618db346455129424fadbbaec596a09feaf3bb Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Thu, 12 Oct 2023 12:55:34 -0700 Subject: zstd: Fix array-index-out-of-bounds UBSAN warning Zstd used an array of length 1 to mean a flexible array for C89 compatibility. Switch to a C99 flexible array to fix the UBSAN warning. Tested locally by booting the kernel and writing to and reading from a BtrFS filesystem with zstd compression enabled. I was unable to reproduce the issue before the fix, however it is a trivial change. Link: https://lkml.kernel.org/r/20231012213428.1390905-1-nickrterrell@gmail.com Reported-by: syzbot+1f2eb3e8cd123ffce499@syzkaller.appspotmail.com Reported-by: Eric Biggers Reported-by: Kees Cook Signed-off-by: Nick Terrell Reviewed-by: Kees Cook --- lib/zstd/common/fse_decompress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c index a0d06095be83..8dcb8ca39767 100644 --- a/lib/zstd/common/fse_decompress.c +++ b/lib/zstd/common/fse_decompress.c @@ -312,7 +312,7 @@ size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size typedef struct { short ncount[FSE_MAX_SYMBOL_VALUE + 1]; - FSE_DTable dtable[1]; /* Dynamically sized */ + FSE_DTable dtable[]; /* Dynamically sized */ } FSE_DecompressWksp; -- cgit From c9bd1568d5462f4108417518ce1af7b924acfb6f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 14 Nov 2023 21:36:13 +0100 Subject: futex: Fix hardcoded flags Xi reported that commit 5694289ce183 ("futex: Flag conversion") broke glibc's robust futex tests. 
This was narrowed down to the change of FLAGS_SHARED from 0x01 to 0x10, at which point Florian noted that handle_futex_death() has a hardcoded flags argument of 1. Change this to: FLAGS_SIZE_32 | FLAGS_SHARED, matching how futex_to_flags() unconditionally sets FLAGS_SIZE_32 for all legacy futex ops. Reported-by: Xi Ruoyao Reported-by: Florian Weimer Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20231114201402.GA25315@noisy.programming.kicks-ass.net Fixes: 5694289ce183 ("futex: Flag conversion") Cc: --- kernel/futex/core.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 52695c59d041..dad981a865b8 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -700,7 +700,8 @@ retry: owner = uval & FUTEX_TID_MASK; if (pending_op && !pi && !owner) { - futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, + FUTEX_BITSET_MATCH_ANY); return 0; } @@ -752,8 +753,10 @@ retry: * Wake robust non-PI futexes here. The wakeup of * PI futexes happens in exit_pi_state(): */ - if (!pi && (uval & FUTEX_WAITERS)) - futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + if (!pi && (uval & FUTEX_WAITERS)) { + futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, + FUTEX_BITSET_MATCH_ANY); + } return 0; } -- cgit From 889c58b3155ff4c8e8671c95daef63d6fabbb6b1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Jun 2023 12:34:46 +0200 Subject: perf/core: Fix cpuctx refcounting Audit of the refcounting turned up that perf_pmu_migrate_context() fails to migrate the ctx refcount. Fixes: bd2756811766 ("perf: Rewrite core context handling") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20230612093539.085862001@infradead.org Cc: --- include/linux/perf_event.h | 13 ++++++++----- kernel/events/core.c | 17 +++++++++++++++++ 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index afb028c54f33..5547ba68e6e4 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -843,11 +843,11 @@ struct perf_event { }; /* - * ,-----------------------[1:n]----------------------. - * V V - * perf_event_context <-[1:n]-> perf_event_pmu_context <--- perf_event - * ^ ^ | | - * `--------[1:n]---------' `-[n:1]-> pmu <-[1:n]-' + * ,-----------------------[1:n]------------------------. + * V V + * perf_event_context <-[1:n]-> perf_event_pmu_context <-[1:n]- perf_event + * | | + * `--[n:1]-> pmu <-[1:n]--' * * * struct perf_event_pmu_context lifetime is refcount based and RCU freed @@ -865,6 +865,9 @@ struct perf_event { * ctx->mutex pinning the configuration. Since we hold a reference on * group_leader (through the filedesc) it can't go away, therefore it's * associated pmu_ctx must exist and cannot change due to ctx->mutex. + * + * perf_event holds a refcount on perf_event_context + * perf_event holds a refcount on perf_event_pmu_context */ struct perf_event_pmu_context { struct pmu *pmu; diff --git a/kernel/events/core.c b/kernel/events/core.c index 683dc086ef10..b704d83a28b2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4828,6 +4828,11 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, void *task_ctx_data = NULL; if (!ctx->task) { + /* + * perf_pmu_migrate_context() / __perf_pmu_install_event() + * relies on the fact that find_get_pmu_context() cannot fail + * for CPU contexts. 
+ */ struct perf_cpu_pmu_context *cpc; cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); @@ -12889,6 +12894,9 @@ static void __perf_pmu_install_event(struct pmu *pmu, int cpu, struct perf_event *event) { struct perf_event_pmu_context *epc; + struct perf_event_context *old_ctx = event->ctx; + + get_ctx(ctx); /* normally find_get_context() */ event->cpu = cpu; epc = find_get_pmu_context(pmu, ctx, event); @@ -12897,6 +12905,11 @@ static void __perf_pmu_install_event(struct pmu *pmu, if (event->state >= PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_INACTIVE; perf_install_in_context(ctx, event, cpu); + + /* + * Now that event->ctx is updated and visible, put the old ctx. + */ + put_ctx(old_ctx); } static void __perf_pmu_install(struct perf_event_context *ctx, @@ -12935,6 +12948,10 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) struct perf_event_context *src_ctx, *dst_ctx; LIST_HEAD(events); + /* + * Since per-cpu context is persistent, no need to grab an extra + * reference. + */ src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx; dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx; -- cgit From fa02de9e75889915b554eda1964a631fd019973b Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Mon, 13 Nov 2023 19:42:49 +0200 Subject: net: stmmac: fix rx budget limit check The while loop condition verifies 'count < limit'. Neither value change before the 'count >= limit' check. As is this check is dead code. But code inspection reveals a code path that modifies 'count' and then goto 'drain_data' and back to 'read_again'. So there is a need to verify count value sanity after 'read_again'. Move 'read_again' up to fix the count limit check. Fixes: ec222003bd94 ("net: stmmac: Prepare to add Split Header support") Signed-off-by: Baruch Siach Reviewed-by: Serge Semin Link: https://lore.kernel.org/r/d9486296c3b6b12ab3a0515fcd47d56447a07bfc.1699897370.git.baruch@tkos.co.il Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 3e50fd53a617..f28838c8cdb3 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -5328,10 +5328,10 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue) len = 0; } +read_again: if (count >= limit) break; -read_again: buf1_len = 0; buf2_len = 0; entry = next_entry; -- cgit From b6cb4541853c7ee512111b0e7ddf3cb66c99c137 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Mon, 13 Nov 2023 19:42:50 +0200 Subject: net: stmmac: avoid rx queue overrun dma_rx_size can be set as low as 64. Rx budget might be higher than that. Make sure to not overrun allocated rx buffers when budget is larger. Leave one descriptor unused to avoid wrap around of 'dirty_rx' vs 'cur_rx'. 
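The clamp to dma_rx_size - 1 is the usual one-slot-free ring convention: with equal producer/consumer indices meaning "empty", a completely full ring would look identical, so one descriptor stays unused. Standalone illustration (not the driver's actual cur_rx/dirty_rx bookkeeping):

#include <stdio.h>

#define RING_SIZE 64	/* dma_rx_size may be configured this low */

/* usable slots on a ring where head == tail must always mean "empty" */
static unsigned int ring_space(unsigned int head, unsigned int tail)
{
	return (tail + RING_SIZE - head - 1) % RING_SIZE;  /* one slot reserved */
}

int main(void)
{
	printf("empty ring, usable slots: %u\n", ring_space(0, 0));   /* 63 */
	printf("after 63 fills:           %u\n", ring_space(63, 0));  /* 0: full */
	return 0;
}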
Signed-off-by: Baruch Siach Reviewed-by: Serge Semin Fixes: 47dd7a540b8a ("net: add support for STMicroelectronics Ethernet controllers.") Link: https://lore.kernel.org/r/d95413e44c97d4692e72cec13a75f894abeb6998.1699897370.git.baruch@tkos.co.il Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index f28838c8cdb3..2afb2bd25977 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -5293,6 +5293,7 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue) dma_dir = page_pool_get_dma_dir(rx_q->page_pool); buf_sz = DIV_ROUND_UP(priv->dma_conf.dma_buf_sz, PAGE_SIZE) * PAGE_SIZE; + limit = min(priv->dma_conf.dma_rx_size - 1, (unsigned int)limit); if (netif_msg_rx_status(priv)) { void *rx_head; -- cgit From 907d1bdb8b2cc0357d03a1c34d2a08d9943760b1 Mon Sep 17 00:00:00 2001 From: Alex Pakhunov Date: Mon, 13 Nov 2023 10:23:49 -0800 Subject: tg3: Move the [rt]x_dropped counters to tg3_napi This change moves [rt]x_dropped counters to tg3_napi so that they can be updated by a single writer, race-free. Signed-off-by: Alex Pakhunov Signed-off-by: Vincent Wong Reviewed-by: Michael Chan Link: https://lore.kernel.org/r/20231113182350.37472-1-alexey.pakhunov@spacex.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/tg3.c | 38 ++++++++++++++++++++++++++++++++----- drivers/net/ethernet/broadcom/tg3.h | 4 ++-- 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 1dee27349367..884e9eecc471 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -6889,7 +6889,7 @@ static int tg3_rx(struct tg3_napi *tnapi, int budget) desc_idx, *post_ptr); drop_it_no_recycle: /* Other statistics kept track of by card. */ - tp->rx_dropped++; + tnapi->rx_dropped++; goto next_pkt; } @@ -8190,7 +8190,7 @@ dma_error: drop: dev_kfree_skb_any(skb); drop_nofree: - tp->tx_dropped++; + tnapi->tx_dropped++; return NETDEV_TX_OK; } @@ -9405,7 +9405,7 @@ static void __tg3_set_rx_mode(struct net_device *); /* tp->lock is held. */ static int tg3_halt(struct tg3 *tp, int kind, bool silent) { - int err; + int err, i; tg3_stop_fw(tp); @@ -9426,6 +9426,13 @@ static int tg3_halt(struct tg3 *tp, int kind, bool silent) /* And make sure the next sample is new data */ memset(tp->hw_stats, 0, sizeof(struct tg3_hw_stats)); + + for (i = 0; i < TG3_IRQ_MAX_VECS; ++i) { + struct tg3_napi *tnapi = &tp->napi[i]; + + tnapi->rx_dropped = 0; + tnapi->tx_dropped = 0; + } } return err; @@ -11975,6 +11982,9 @@ static void tg3_get_nstats(struct tg3 *tp, struct rtnl_link_stats64 *stats) { struct rtnl_link_stats64 *old_stats = &tp->net_stats_prev; struct tg3_hw_stats *hw_stats = tp->hw_stats; + unsigned long rx_dropped; + unsigned long tx_dropped; + int i; stats->rx_packets = old_stats->rx_packets + get_stat64(&hw_stats->rx_ucast_packets) + @@ -12021,8 +12031,26 @@ static void tg3_get_nstats(struct tg3 *tp, struct rtnl_link_stats64 *stats) stats->rx_missed_errors = old_stats->rx_missed_errors + get_stat64(&hw_stats->rx_discards); - stats->rx_dropped = tp->rx_dropped; - stats->tx_dropped = tp->tx_dropped; + /* Aggregate per-queue counters. The per-queue counters are updated + * by a single writer, race-free. 
The result computed by this loop + * might not be 100% accurate (counters can be updated in the middle of + * the loop) but the next tg3_get_nstats() will recompute the current + * value so it is acceptable. + * + * Note that these counters wrap around at 4G on 32bit machines. + */ + rx_dropped = (unsigned long)(old_stats->rx_dropped); + tx_dropped = (unsigned long)(old_stats->tx_dropped); + + for (i = 0; i < tp->irq_cnt; i++) { + struct tg3_napi *tnapi = &tp->napi[i]; + + rx_dropped += tnapi->rx_dropped; + tx_dropped += tnapi->tx_dropped; + } + + stats->rx_dropped = rx_dropped; + stats->tx_dropped = tx_dropped; } static int tg3_get_regs_len(struct net_device *dev) diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h index ae5c01bd1110..5016475e5005 100644 --- a/drivers/net/ethernet/broadcom/tg3.h +++ b/drivers/net/ethernet/broadcom/tg3.h @@ -3018,6 +3018,7 @@ struct tg3_napi { u16 *rx_rcb_prod_idx; struct tg3_rx_prodring_set prodring; struct tg3_rx_buffer_desc *rx_rcb; + unsigned long rx_dropped; u32 tx_prod ____cacheline_aligned; u32 tx_cons; @@ -3026,6 +3027,7 @@ struct tg3_napi { u32 prodmbox; struct tg3_tx_buffer_desc *tx_ring; struct tg3_tx_ring_info *tx_buffers; + unsigned long tx_dropped; dma_addr_t status_mapping; dma_addr_t rx_rcb_mapping; @@ -3220,8 +3222,6 @@ struct tg3 { /* begin "everything else" cacheline(s) section */ - unsigned long rx_dropped; - unsigned long tx_dropped; struct rtnl_link_stats64 net_stats_prev; struct tg3_ethtool_stats estats_prev; -- cgit From 17dd5efe5f36a96bd78012594fabe21efb01186b Mon Sep 17 00:00:00 2001 From: Alex Pakhunov Date: Mon, 13 Nov 2023 10:23:50 -0800 Subject: tg3: Increment tx_dropped in tg3_tso_bug() tg3_tso_bug() drops a packet if it cannot be segmented for any reason. The number of discarded frames should be incremented accordingly. Signed-off-by: Alex Pakhunov Signed-off-by: Vincent Wong Reviewed-by: Pavan Chebbi Link: https://lore.kernel.org/r/20231113182350.37472-2-alexey.pakhunov@spacex.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/tg3.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 884e9eecc471..48b6191efa56 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -7918,8 +7918,10 @@ static int tg3_tso_bug(struct tg3 *tp, struct tg3_napi *tnapi, segs = skb_gso_segment(skb, tp->dev->features & ~(NETIF_F_TSO | NETIF_F_TSO6)); - if (IS_ERR(segs) || !segs) + if (IS_ERR(segs) || !segs) { + tnapi->tx_dropped++; goto tg3_tso_bug_end; + } skb_list_walk_safe(segs, seg, next) { skb_mark_not_on_list(seg); -- cgit From 09d4c14c6c5e6e781a3879fed7f8e116a18b8c65 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Mon, 13 Nov 2023 10:32:56 -0800 Subject: pds_core: use correct index to mask irq Use the qcq's interrupt index, not the irq number, to mask the interrupt. Since the irq number can be out of range from the number of possible interrupts, we can end up accessing and potentially scribbling on out-of-range and/or unmapped memory, making the kernel angry. 
[ 3116.039364] BUG: unable to handle page fault for address: ffffbeea1c3edf84 [ 3116.047059] #PF: supervisor write access in kernel mode [ 3116.052895] #PF: error_code(0x0002) - not-present page [ 3116.058636] PGD 100000067 P4D 100000067 PUD 1001f2067 PMD 10f82e067 PTE 0 [ 3116.066221] Oops: 0002 [#1] SMP NOPTI [ 3116.092948] RIP: 0010:iowrite32+0x9/0x76 [ 3116.190452] Call Trace: [ 3116.193185] [ 3116.195430] ? show_trace_log_lvl+0x1d6/0x2f9 [ 3116.200298] ? show_trace_log_lvl+0x1d6/0x2f9 [ 3116.205166] ? pdsc_adminq_isr+0x43/0x55 [pds_core] [ 3116.210618] ? __die_body.cold+0x8/0xa [ 3116.214806] ? page_fault_oops+0x16d/0x1ac [ 3116.219382] ? exc_page_fault+0xbe/0x13b [ 3116.223764] ? asm_exc_page_fault+0x22/0x27 [ 3116.228440] ? iowrite32+0x9/0x76 [ 3116.232143] pdsc_adminq_isr+0x43/0x55 [pds_core] [ 3116.237627] __handle_irq_event_percpu+0x3a/0x184 [ 3116.243088] handle_irq_event+0x57/0xb0 [ 3116.247575] handle_edge_irq+0x87/0x225 [ 3116.252062] __common_interrupt+0x3e/0xbc [ 3116.256740] common_interrupt+0x7b/0x98 [ 3116.261216] [ 3116.263745] [ 3116.266268] asm_common_interrupt+0x22/0x27 Reported-by: Joao Martins Fixes: 01ba61b55b20 ("pds_core: Add adminq processing and commands") Signed-off-by: Shannon Nelson Link: https://lore.kernel.org/r/20231113183257.71110-2-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/pds_core/adminq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amd/pds_core/adminq.c b/drivers/net/ethernet/amd/pds_core/adminq.c index 045fe133f6ee..5beadabc2136 100644 --- a/drivers/net/ethernet/amd/pds_core/adminq.c +++ b/drivers/net/ethernet/amd/pds_core/adminq.c @@ -146,7 +146,7 @@ irqreturn_t pdsc_adminq_isr(int irq, void *data) } queue_work(pdsc->wq, &qcq->work); - pds_core_intr_mask(&pdsc->intr_ctrl[irq], PDS_CORE_INTR_MASK_CLEAR); + pds_core_intr_mask(&pdsc->intr_ctrl[qcq->intx], PDS_CORE_INTR_MASK_CLEAR); return IRQ_HANDLED; } -- cgit From 7c02f6ae676a954216a192612040f9a0cde3adf7 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Mon, 13 Nov 2023 10:32:57 -0800 Subject: pds_core: fix up some format-truncation complaints Our friendly kernel test robot pointed out a couple of potential string truncation issues. None of which were we worried about, but can be relatively easily fixed to quiet the complaints. 
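A standalone userspace example of why catching the return value helps: snprintf() reports the length it would have needed, so using that result both detects truncation and quiets -Wformat-truncation (buffer name and strings below are made up for the illustration):

#include <stdio.h>

int main(void)
{
	char ver[16];
	int n;

	n = snprintf(ver, sizeof(ver), "%s %s", "pds_core", "6.7.0-rc1-long-release");
	if (n >= (int)sizeof(ver))
		fprintf(stderr, "release name truncated, needed %d bytes\n", n + 1);

	printf("stored: \"%s\"\n", ver);
	return 0;
}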
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202310211736.66syyDpp-lkp@intel.com/ Fixes: 45d76f492938 ("pds_core: set up device and adminq") Signed-off-by: Shannon Nelson Link: https://lore.kernel.org/r/20231113183257.71110-3-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/pds_core/core.h | 2 +- drivers/net/ethernet/amd/pds_core/dev.c | 8 ++++++-- drivers/net/ethernet/amd/pds_core/devlink.c | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/amd/pds_core/core.h b/drivers/net/ethernet/amd/pds_core/core.h index f3a7deda9972..e35d3e7006bf 100644 --- a/drivers/net/ethernet/amd/pds_core/core.h +++ b/drivers/net/ethernet/amd/pds_core/core.h @@ -15,7 +15,7 @@ #define PDSC_DRV_DESCRIPTION "AMD/Pensando Core Driver" #define PDSC_WATCHDOG_SECS 5 -#define PDSC_QUEUE_NAME_MAX_SZ 32 +#define PDSC_QUEUE_NAME_MAX_SZ 16 #define PDSC_ADMINQ_MIN_LENGTH 16 /* must be a power of two */ #define PDSC_NOTIFYQ_LENGTH 64 /* must be a power of two */ #define PDSC_TEARDOWN_RECOVERY false diff --git a/drivers/net/ethernet/amd/pds_core/dev.c b/drivers/net/ethernet/amd/pds_core/dev.c index 7c1b965d61a9..31940b857e0e 100644 --- a/drivers/net/ethernet/amd/pds_core/dev.c +++ b/drivers/net/ethernet/amd/pds_core/dev.c @@ -261,10 +261,14 @@ static int pdsc_identify(struct pdsc *pdsc) struct pds_core_drv_identity drv = {}; size_t sz; int err; + int n; drv.drv_type = cpu_to_le32(PDS_DRIVER_LINUX); - snprintf(drv.driver_ver_str, sizeof(drv.driver_ver_str), - "%s %s", PDS_CORE_DRV_NAME, utsname()->release); + /* Catching the return quiets a Wformat-truncation complaint */ + n = snprintf(drv.driver_ver_str, sizeof(drv.driver_ver_str), + "%s %s", PDS_CORE_DRV_NAME, utsname()->release); + if (n > sizeof(drv.driver_ver_str)) + dev_dbg(pdsc->dev, "release name truncated, don't care\n"); /* Next let's get some info about the device * We use the devcmd_lock at this level in order to diff --git a/drivers/net/ethernet/amd/pds_core/devlink.c b/drivers/net/ethernet/amd/pds_core/devlink.c index 57f88c8b37de..e9948ea5bbcd 100644 --- a/drivers/net/ethernet/amd/pds_core/devlink.c +++ b/drivers/net/ethernet/amd/pds_core/devlink.c @@ -104,7 +104,7 @@ int pdsc_dl_info_get(struct devlink *dl, struct devlink_info_req *req, struct pds_core_fw_list_info fw_list; struct pdsc *pdsc = devlink_priv(dl); union pds_core_dev_comp comp; - char buf[16]; + char buf[32]; int listlen; int err; int i; -- cgit From 278a370c1766060d2144d6cf0b06c101e1043b6d Mon Sep 17 00:00:00 2001 From: Ziwei Xiao Date: Mon, 13 Nov 2023 16:41:44 -0800 Subject: gve: Fixes for napi_poll when budget is 0 Netpoll will explicilty pass the polling call with a budget of 0 to indicate it's clearing the Tx path only. For the gve_rx_poll and gve_xdp_poll, they were mistakenly taking the 0 budget as the indication to do all the work. Add check to avoid the rx path and xdp path being called when budget is 0. And also avoid napi_complete_done being called when budget is 0 for netpoll. 
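The budget-0 contract, as a hedged skeleton of a NAPI poll handler (the example_* helpers are hypothetical stand-ins for a driver's own cleaners, and this shape only builds in a kernel context):

#include <linux/netdevice.h>

/* hypothetical per-queue cleaners, standing in for the driver's own */
static int example_clean_tx(struct napi_struct *napi) { return 0; }
static int example_clean_rx(struct napi_struct *napi, int budget) { return 0; }

static int example_napi_poll(struct napi_struct *napi, int budget)
{
	int work_done;

	example_clean_tx(napi);       /* Tx cleanup is always allowed */

	if (!budget)                  /* netpoll: Tx-only, no Rx, no completion */
		return 0;

	work_done = example_clean_rx(napi, budget);
	if (work_done < budget)
		napi_complete_done(napi, work_done);

	return work_done;
}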
Fixes: f5cedc84a30d ("gve: Add transmit and receive support") Signed-off-by: Ziwei Xiao Link: https://lore.kernel.org/r/20231114004144.2022268-1-ziweixiao@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_main.c | 8 +++++++- drivers/net/ethernet/google/gve/gve_rx.c | 4 ---- drivers/net/ethernet/google/gve/gve_tx.c | 4 ---- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 276f996f95dc..2d42e733837b 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -254,10 +254,13 @@ static int gve_napi_poll(struct napi_struct *napi, int budget) if (block->tx) { if (block->tx->q_num < priv->tx_cfg.num_queues) reschedule |= gve_tx_poll(block, budget); - else + else if (budget) reschedule |= gve_xdp_poll(block, budget); } + if (!budget) + return 0; + if (block->rx) { work_done = gve_rx_poll(block, budget); reschedule |= work_done == budget; @@ -298,6 +301,9 @@ static int gve_napi_poll_dqo(struct napi_struct *napi, int budget) if (block->tx) reschedule |= gve_tx_poll_dqo(block, /*do_clean=*/true); + if (!budget) + return 0; + if (block->rx) { work_done = gve_rx_poll_dqo(block, budget); reschedule |= work_done == budget; diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c index e84a066aa1a4..73655347902d 100644 --- a/drivers/net/ethernet/google/gve/gve_rx.c +++ b/drivers/net/ethernet/google/gve/gve_rx.c @@ -1007,10 +1007,6 @@ int gve_rx_poll(struct gve_notify_block *block, int budget) feat = block->napi.dev->features; - /* If budget is 0, do all the work */ - if (budget == 0) - budget = INT_MAX; - if (budget > 0) work_done = gve_clean_rx_done(rx, budget, feat); diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c index 6957a865cff3..9f6ffc4a54f0 100644 --- a/drivers/net/ethernet/google/gve/gve_tx.c +++ b/drivers/net/ethernet/google/gve/gve_tx.c @@ -925,10 +925,6 @@ bool gve_xdp_poll(struct gve_notify_block *block, int budget) bool repoll; u32 to_do; - /* If budget is 0, do all the work */ - if (budget == 0) - budget = INT_MAX; - /* Find out how much work there is to be done */ nic_done = gve_tx_load_event_counter(priv, tx); to_do = min_t(u32, (nic_done - tx->done), budget); -- cgit From 9fce92f050f448a0d1ddd9083ef967d9930f1e52 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 14 Nov 2023 00:16:13 +0100 Subject: mptcp: deal with large GSO size After the blamed commit below, the TCP sockets (and the MPTCP subflows) can build egress packets larger than 64K. That exceeds the maximum DSS data size, the length being misrepresent on the wire and the stream being corrupted, as later observed on the receiver: WARNING: CPU: 0 PID: 9696 at net/mptcp/protocol.c:705 __mptcp_move_skbs_from_subflow+0x2604/0x26e0 CPU: 0 PID: 9696 Comm: syz-executor.7 Not tainted 6.6.0-rc5-gcd8bdf563d46 #45 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 netlink: 8 bytes leftover after parsing attributes in process `syz-executor.4'. RIP: 0010:__mptcp_move_skbs_from_subflow+0x2604/0x26e0 net/mptcp/protocol.c:705 RSP: 0018:ffffc90000006e80 EFLAGS: 00010246 RAX: ffffffff83e9f674 RBX: ffff88802f45d870 RCX: ffff888102ad0000 netlink: 8 bytes leftover after parsing attributes in process `syz-executor.4'. 
RDX: 0000000080000303 RSI: 0000000000013908 RDI: 0000000000003908 RBP: ffffc90000007110 R08: ffffffff83e9e078 R09: 1ffff1100e548c8a R10: dffffc0000000000 R11: ffffed100e548c8b R12: 0000000000013908 R13: dffffc0000000000 R14: 0000000000003908 R15: 000000000031cf29 FS: 00007f239c47e700(0000) GS:ffff88811b200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f239c45cd78 CR3: 000000006a66c006 CR4: 0000000000770ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600 PKRU: 55555554 Call Trace: mptcp_data_ready+0x263/0xac0 net/mptcp/protocol.c:819 subflow_data_ready+0x268/0x6d0 net/mptcp/subflow.c:1409 tcp_data_queue+0x21a1/0x7a60 net/ipv4/tcp_input.c:5151 tcp_rcv_established+0x950/0x1d90 net/ipv4/tcp_input.c:6098 tcp_v6_do_rcv+0x554/0x12f0 net/ipv6/tcp_ipv6.c:1483 tcp_v6_rcv+0x2e26/0x3810 net/ipv6/tcp_ipv6.c:1749 ip6_protocol_deliver_rcu+0xd6b/0x1ae0 net/ipv6/ip6_input.c:438 ip6_input+0x1c5/0x470 net/ipv6/ip6_input.c:483 ipv6_rcv+0xef/0x2c0 include/linux/netfilter.h:304 __netif_receive_skb+0x1ea/0x6a0 net/core/dev.c:5532 process_backlog+0x353/0x660 net/core/dev.c:5974 __napi_poll+0xc6/0x5a0 net/core/dev.c:6536 net_rx_action+0x6a0/0xfd0 net/core/dev.c:6603 __do_softirq+0x184/0x524 kernel/softirq.c:553 do_softirq+0xdd/0x130 kernel/softirq.c:454 Address the issue explicitly bounding the maximum GSO size to what MPTCP actually allows. Reported-by: Christoph Paasch Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/450 Fixes: 7c4e983c4f3c ("net: allow gso_max_size to exceed 65536") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20231114-upstream-net-20231113-mptcp-misc-fixes-6-7-rc2-v1-1-7b9cd6a7b7f4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index a0b8356cd8c5..66e947054945 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1230,6 +1230,8 @@ static void mptcp_update_infinite_map(struct mptcp_sock *msk, mptcp_do_fallback(ssk); } +#define MPTCP_MAX_GSO_SIZE (GSO_LEGACY_MAX_SIZE - (MAX_TCP_HEADER + 1)) + static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct mptcp_data_frag *dfrag, struct mptcp_sendmsg_info *info) @@ -1256,6 +1258,8 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, return -EAGAIN; /* compute send limit */ + if (unlikely(ssk->sk_gso_max_size > MPTCP_MAX_GSO_SIZE)) + ssk->sk_gso_max_size = MPTCP_MAX_GSO_SIZE; info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); copy = info->size_goal; -- cgit From d109a7767273d1706b541c22b83a0323823dfde4 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 14 Nov 2023 00:16:14 +0100 Subject: mptcp: fix possible NULL pointer dereference on close After the blamed commit below, the MPTCP release callback can dereference the first subflow pointer via __mptcp_set_connected() and send buffer auto-tuning. Such pointer is always expected to be valid, except at socket destruction time, when the first subflow is deleted and the pointer zeroed. 
If the connect event is handled by the release callback while the msk socket is finally released, MPTCP hits the following splat: general protection fault, probably for non-canonical address 0xdffffc00000000f2: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000790-0x0000000000000797] CPU: 1 PID: 26719 Comm: syz-executor.2 Not tainted 6.6.0-syzkaller-10102-gff269e2cd5ad #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023 RIP: 0010:mptcp_subflow_ctx net/mptcp/protocol.h:542 [inline] RIP: 0010:__mptcp_propagate_sndbuf net/mptcp/protocol.h:813 [inline] RIP: 0010:__mptcp_set_connected+0x57/0x3e0 net/mptcp/subflow.c:424 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff8a62323c RDX: 00000000000000f2 RSI: ffffffff8a630116 RDI: 0000000000000790 RBP: ffff88803334b100 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000034 R12: ffff88803334b198 R13: ffff888054f0b018 R14: 0000000000000000 R15: ffff88803334b100 FS: 0000000000000000(0000) GS:ffff8880b9900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fbcb4f75198 CR3: 000000006afb5000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: mptcp_release_cb+0xa2c/0xc40 net/mptcp/protocol.c:3405 release_sock+0xba/0x1f0 net/core/sock.c:3537 mptcp_close+0x32/0xf0 net/mptcp/protocol.c:3084 inet_release+0x132/0x270 net/ipv4/af_inet.c:433 inet6_release+0x4f/0x70 net/ipv6/af_inet6.c:485 __sock_release+0xae/0x260 net/socket.c:659 sock_close+0x1c/0x20 net/socket.c:1419 __fput+0x270/0xbb0 fs/file_table.c:394 task_work_run+0x14d/0x240 kernel/task_work.c:180 exit_task_work include/linux/task_work.h:38 [inline] do_exit+0xa92/0x2a20 kernel/exit.c:876 do_group_exit+0xd4/0x2a0 kernel/exit.c:1026 get_signal+0x23ba/0x2790 kernel/signal.c:2900 arch_do_signal_or_restart+0x90/0x7f0 arch/x86/kernel/signal.c:309 exit_to_user_mode_loop kernel/entry/common.c:168 [inline] exit_to_user_mode_prepare+0x11f/0x240 kernel/entry/common.c:204 __syscall_exit_to_user_mode_work kernel/entry/common.c:285 [inline] syscall_exit_to_user_mode+0x1d/0x60 kernel/entry/common.c:296 do_syscall_64+0x4b/0x110 arch/x86/entry/common.c:88 entry_SYSCALL_64_after_hwframe+0x63/0x6b RIP: 0033:0x7fb515e7cae9 Code: Unable to access opcode bytes at 0x7fb515e7cabf. RSP: 002b:00007fb516c560c8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: 000000000000003c RBX: 00007fb515f9c120 RCX: 00007fb515e7cae9 RDX: 0000000000000000 RSI: 0000000020000140 RDI: 0000000000000006 RBP: 00007fb515ec847a R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 000000000000006e R14: 00007fb515f9c120 R15: 00007ffc631eb968 To avoid sparkling unneeded conditionals, address the issue explicitly checking msk->first only in the critical place. 
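Reduced to its essentials, the shape of the fix looks like the sketch below (hypothetical names, not the mptcp code): the one deferred event that can race with final teardown re-checks the pointer it is about to dereference, instead of adding NULL checks to every helper reachable from the callback.

  static void example_release_cb(struct example_sock *es)
  {
          /* es->first may be cleared only while the socket is being destroyed */
          if (test_and_clear_bit(EXAMPLE_CONNECTED, &es->cb_flags) && es->first)
                  example_set_connected(es->first);

          if (test_and_clear_bit(EXAMPLE_ERROR_REPORT, &es->cb_flags))
                  example_error_report(es);       /* does not touch es->first */
  }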
Fixes: 8005184fd1ca ("mptcp: refactor sndbuf auto-tuning") Cc: stable@vger.kernel.org Reported-by: Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/454 Reported-by: Eric Dumazet Closes: https://lore.kernel.org/netdev/CANn89iLZUA6S2a=K8GObnS62KK6Jt4B7PsAs7meMFooM8xaTgw@mail.gmail.com/ Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20231114-upstream-net-20231113-mptcp-misc-fixes-6-7-rc2-v1-2-7b9cd6a7b7f4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 66e947054945..bc81ea53a049 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3402,10 +3402,11 @@ static void mptcp_release_cb(struct sock *sk) if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags)) __mptcp_clean_una_wakeup(sk); if (unlikely(msk->cb_flags)) { - /* be sure to set the current sk state before tacking actions - * depending on sk_state, that is processing MPTCP_ERROR_REPORT + /* be sure to set the current sk state before taking actions + * depending on sk_state (MPTCP_ERROR_REPORT) + * On sk release avoid actions depending on the first subflow */ - if (__test_and_clear_bit(MPTCP_CONNECTED, &msk->cb_flags)) + if (__test_and_clear_bit(MPTCP_CONNECTED, &msk->cb_flags) && msk->first) __mptcp_set_connected(sk); if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags)) __mptcp_error_report(sk); -- cgit From 8df220b29282e8b450ea57be62e1eccd4996837c Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 14 Nov 2023 00:16:15 +0100 Subject: mptcp: add validity check for sending RM_ADDR This patch adds the validity check for sending RM_ADDRs for userspace PM in mptcp_pm_remove_addrs(), only send a RM_ADDR when the address is in the anno_list or conn_list. Fixes: 8b1c94da1e48 ("mptcp: only send RM_ADDR in nl_cmd_remove") Cc: stable@vger.kernel.org Signed-off-by: Geliang Tang Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20231114-upstream-net-20231113-mptcp-misc-fixes-6-7-rc2-v1-3-7b9cd6a7b7f4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 1529ec358815..bf4d96f6f99a 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1515,8 +1515,9 @@ void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list) struct mptcp_pm_addr_entry *entry; list_for_each_entry(entry, rm_list, list) { - remove_anno_list_by_saddr(msk, &entry->addr); - if (alist.nr < MPTCP_RM_IDS_MAX) + if ((remove_anno_list_by_saddr(msk, &entry->addr) || + lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) && + alist.nr < MPTCP_RM_IDS_MAX) alist.ids[alist.nr++] = entry->addr.id; } -- cgit From 7679d34f97b7a09fd565f5729f79fd61b7c55329 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 14 Nov 2023 00:16:16 +0100 Subject: mptcp: fix setsockopt(IP_TOS) subflow locking The MPTCP implementation of the IP_TOS socket option uses the lockless variant of the TOS manipulation helper and does not hold such lock at the helper invocation time. Add the required locking. 
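The helper contract behind the fix: __ip_sock_set_tos() expects the owning socket's lock, and for a short per-subflow update the fast-lock variant is the cheapest way to provide it. A simplified sketch follows; the subflow list walk and struct are illustrative, while lock_sock_fast()/unlock_sock_fast() and __ip_sock_set_tos() are real kernel APIs.

  struct example_subflow {
          struct list_head node;
          struct sock *sk;
  };

  static void example_propagate_tos(struct sock *parent, struct list_head *subflows, int val)
  {
          struct example_subflow *sf;

          lock_sock(parent);
          list_for_each_entry(sf, subflows, node) {
                  struct sock *ssk = sf->sk;
                  /* returns true if the slow path (full lock_sock) was needed */
                  bool slow = lock_sock_fast(ssk);

                  __ip_sock_set_tos(ssk, val);
                  unlock_sock_fast(ssk, slow);
          }
          release_sock(parent);
  }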
Fixes: ffcacff87cd6 ("mptcp: Support for IP_TOS for MPTCP setsockopt()") Cc: stable@vger.kernel.org Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/457 Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20231114-upstream-net-20231113-mptcp-misc-fixes-6-7-rc2-v1-4-7b9cd6a7b7f4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/sockopt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 77f5e8932abf..353680733700 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -738,8 +738,11 @@ static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname, val = READ_ONCE(inet_sk(sk)->tos); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow; + slow = lock_sock_fast(ssk); __ip_sock_set_tos(ssk, val); + unlock_sock_fast(ssk, slow); } release_sock(sk); -- cgit From 7cefbe5e1dacc7236caa77e9d072423f21422fe2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 14 Nov 2023 00:16:17 +0100 Subject: selftests: mptcp: fix fastclose with csum failure Running the mp_join selftest manually with the following command line: ./mptcp_join.sh -z -C leads to some failures: 002 fastclose server test # ... rtx [fail] got 1 MP_RST[s] TX expected 0 # ... rstrx [fail] got 1 MP_RST[s] RX expected 0 The problem is really in the wrong expectations for the RST checks implied by the csum validation. Note that the same check is repeated explicitly in the same test-case, with the correct expectation and pass successfully. Address the issue explicitly setting the correct expectation for the failing checks. Reported-by: Xiumei Mu Fixes: 6bf41020b72b ("selftests: mptcp: update and extend fastclose test-cases") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20231114-upstream-net-20231113-mptcp-misc-fixes-6-7-rc2-v1-5-7b9cd6a7b7f4@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 75a2438efdf3..3c94f2f194d6 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3240,7 +3240,7 @@ fastclose_tests() if reset_check_counter "fastclose server test" "MPTcpExtMPFastcloseRx"; then test_linkfail=1024 fastclose=server \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 + chk_join_nr 0 0 0 0 0 0 1 chk_fclose_nr 1 1 invert chk_rst_nr 1 1 fi -- cgit From 006ccc3090e2f30f5f97857f3946312692a5279e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Nov 2023 22:54:26 -0400 Subject: bcachefs: Kill journal pre-reservations This deletes the complicated and somewhat expensive journal pre-reservation machinery in favor of just using journal watermarks: when the journal is more than half full, we run journal reclaim more aggressively, and when the journal is more than 3/4s full we only allow journal reclaim to get new journal reservations. 
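The replacement policy can be stated as a pure function of journal fullness. A simplified sketch with illustrative thresholds matching the description above (the real code derives the watermark from fifo_free() on the pin FIFO and the clean/total bucket ratio, see journal_set_watermark() in the diff):

  enum example_watermark {
          WATERMARK_NORMAL,       /* plenty of space: anyone may reserve */
          WATERMARK_COPYGC,       /* > 1/2 full: run reclaim more aggressively */
          WATERMARK_RECLAIM,      /* > 3/4 full: only reclaim may take new reservations */
  };

  static enum example_watermark journal_watermark(unsigned int free, unsigned int size)
  {
          if (free < size / 4)
                  return WATERMARK_RECLAIM;
          if (free < size / 2)
                  return WATERMARK_COPYGC;
          return WATERMARK_NORMAL;
  }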
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 - fs/bcachefs/btree_key_cache.c | 14 ------ fs/bcachefs/btree_trans_commit.c | 36 +------------- fs/bcachefs/btree_types.h | 3 -- fs/bcachefs/btree_update_interior.c | 30 ------------ fs/bcachefs/btree_update_interior.h | 1 - fs/bcachefs/journal.c | 31 ------------ fs/bcachefs/journal.h | 98 ------------------------------------- fs/bcachefs/journal_reclaim.c | 42 ++++++---------- fs/bcachefs/journal_types.h | 26 ---------- fs/bcachefs/trace.h | 11 +---- 11 files changed, 19 insertions(+), 275 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index c2adf3fbb0b3..6fa90bcd7016 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3087,8 +3087,6 @@ void bch2_trans_put(struct btree_trans *trans) srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); } - bch2_journal_preres_put(&c->journal, &trans->journal_preres); - kfree(trans->extra_journal_entries.data); if (trans->fs_usage_deltas) { diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index b3305a04d808..37fbf22de8fc 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -672,7 +672,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, goto out; bch2_journal_pin_drop(j, &ck->journal); - bch2_journal_preres_put(j, &ck->res); BUG_ON(!btree_node_locked(c_iter.path, 0)); @@ -770,18 +769,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, BUG_ON(insert->k.u64s > ck->u64s); - if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { - int difference; - - BUG_ON(jset_u64s(insert->k.u64s) > trans->journal_preres.u64s); - - difference = jset_u64s(insert->k.u64s) - ck->res.u64s; - if (difference > 0) { - trans->journal_preres.u64s -= difference; - ck->res.u64s += difference; - } - } - bkey_copy(ck->k, insert); ck->valid = true; @@ -1006,7 +993,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) cond_resched(); bch2_journal_pin_drop(&c->journal, &ck->journal); - bch2_journal_preres_put(&c->journal, &ck->res); list_del(&ck->list); kfree(ck->k); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 55a120eb8692..12907beda98c 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -323,17 +323,6 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); } -static noinline int -bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags, - unsigned long trace_ip) -{ - return drop_locks_do(trans, - bch2_journal_preres_get(&trans->c->journal, - &trans->journal_preres, - trans->journal_preres_u64s, - (flags & BCH_WATERMARK_MASK))); -} - static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, unsigned flags) { @@ -882,14 +871,6 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags } } - ret = bch2_journal_preres_get(&c->journal, - &trans->journal_preres, trans->journal_preres_u64s, - (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK); - if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked)) - ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip); - if (unlikely(ret)) - return ret; - ret = bch2_trans_lock_write(trans); if (unlikely(ret)) return ret; @@ -1052,7 +1033,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) struct bch_fs *c = trans->c; struct btree_insert_entry *i = NULL; struct 
btree_write_buffered_key *wb; - unsigned u64s; int ret = 0; if (!trans->nr_updates && @@ -1112,13 +1092,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); - memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); - trans->journal_u64s = trans->extra_journal_entries.nr; - trans->journal_preres_u64s = 0; - trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); - if (trans->journal_transaction_names) trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); @@ -1134,16 +1109,11 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (i->key_cache_already_flushed) continue; - /* we're going to journal the key being updated: */ - u64s = jset_u64s(i->k->k.u64s); - if (i->cached && - likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) - trans->journal_preres_u64s += u64s; - if (i->flags & BTREE_UPDATE_NOJOURNAL) continue; - trans->journal_u64s += u64s; + /* we're going to journal the key being updated: */ + trans->journal_u64s += jset_u64s(i->k->k.u64s); /* and we're also going to log the overwrite: */ if (trans->journal_transaction_names) @@ -1175,8 +1145,6 @@ retry: trace_and_count(c, transaction_commit, trans, _RET_IP_); out: - bch2_journal_preres_put(&c->journal, &trans->journal_preres); - if (likely(!(flags & BTREE_INSERT_NOCHECK_RW))) bch2_write_ref_put(c, BCH_WRITE_REF_trans); out_reset: diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index f3669fa68591..6fbd4ef3df6b 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -327,7 +327,6 @@ struct bkey_cached { struct rhash_head hash; struct list_head list; - struct journal_preres res; struct journal_entry_pin journal; u64 seq; @@ -441,11 +440,9 @@ struct btree_trans { struct journal_entry_pin *journal_pin; struct journal_res journal_res; - struct journal_preres journal_preres; u64 *journal_seq; struct disk_reservation *disk_res; unsigned journal_u64s; - unsigned journal_preres_u64s; struct replicas_delta_list *fs_usage_deltas; }; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 39c2db68123b..76f27bc9fa24 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -513,8 +513,6 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans * up_read(&c->gc_lock); as->took_gc_lock = false; - bch2_journal_preres_put(&c->journal, &as->journal_preres); - bch2_journal_pin_drop(&c->journal, &as->journal); bch2_journal_pin_flush(&c->journal, &as->journal); bch2_disk_reservation_put(c, &as->disk_res); @@ -734,8 +732,6 @@ err: bch2_journal_pin_drop(&c->journal, &as->journal); - bch2_journal_preres_put(&c->journal, &as->journal_preres); - mutex_lock(&c->btree_interior_update_lock); for (i = 0; i < as->nr_new_nodes; i++) { b = as->new_nodes[i]; @@ -1047,7 +1043,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, unsigned nr_nodes[2] = { 0, 0 }; unsigned update_level = level; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - unsigned journal_flags = 0; int ret = 0; u32 restart_count = trans->restart_count; @@ -1061,10 +1056,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, flags &= ~BCH_WATERMARK_MASK; flags |= watermark; - if (flags & BTREE_INSERT_JOURNAL_RECLAIM) - journal_flags |= JOURNAL_RES_GET_NONBLOCK; - journal_flags |= watermark; - while (1) { nr_nodes[!!update_level] += 1 + split; update_level++; @@ -1129,27 +1120,6 @@ 
bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (ret) goto err; - ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, - BTREE_UPDATE_JOURNAL_RES, - journal_flags|JOURNAL_RES_GET_NONBLOCK); - if (ret) { - if (flags & BTREE_INSERT_JOURNAL_RECLAIM) { - ret = -BCH_ERR_journal_reclaim_would_deadlock; - goto err; - } - - ret = drop_locks_do(trans, - bch2_journal_preres_get(&c->journal, &as->journal_preres, - BTREE_UPDATE_JOURNAL_RES, - journal_flags)); - if (ret == -BCH_ERR_journal_preres_get_blocked) { - trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get); - } - if (ret) - goto err; - } - ret = bch2_disk_reservation_get(c, &as->disk_res, (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), c->opts.metadata_replicas, diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 4df21512d640..031076e75fa1 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -55,7 +55,6 @@ struct btree_update { unsigned update_level; struct disk_reservation disk_res; - struct journal_preres journal_preres; /* * BTREE_INTERIOR_UPDATING_NODE: diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 5b5d69f2316b..23a9b7845d11 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -526,36 +526,6 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, return ret; } -/* journal_preres: */ - -static bool journal_preres_available(struct journal *j, - struct journal_preres *res, - unsigned new_u64s, - unsigned flags) -{ - bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true); - - if (!ret && mutex_trylock(&j->reclaim_lock)) { - bch2_journal_reclaim(j); - mutex_unlock(&j->reclaim_lock); - } - - return ret; -} - -int __bch2_journal_preres_get(struct journal *j, - struct journal_preres *res, - unsigned new_u64s, - unsigned flags) -{ - int ret; - - closure_wait_event(&j->preres_wait, - (ret = bch2_journal_error(j)) || - journal_preres_available(j, res, new_u64s, flags)); - return ret; -} - /* journal_entry_res: */ void bch2_journal_entry_res_resize(struct journal *j, @@ -1306,7 +1276,6 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); - prt_printf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]); prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 011711e99c8d..c85d01cf4948 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -395,104 +395,6 @@ out: return 0; } -/* journal_preres: */ - -static inline void journal_set_watermark(struct journal *j) -{ - union journal_preres_state s = READ_ONCE(j->prereserved); - unsigned watermark = BCH_WATERMARK_stripe; - - if (fifo_free(&j->pin) < j->pin.size / 4) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc); - if (fifo_free(&j->pin) < j->pin.size / 8) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); - - if (s.reserved > s.remaining) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc); 
- if (!s.remaining) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); - - if (watermark == j->watermark) - return; - - swap(watermark, j->watermark); - if (watermark > j->watermark) - journal_wake(j); -} - -static inline void bch2_journal_preres_put(struct journal *j, - struct journal_preres *res) -{ - union journal_preres_state s = { .reserved = res->u64s }; - - if (!res->u64s) - return; - - s.v = atomic64_sub_return(s.v, &j->prereserved.counter); - res->u64s = 0; - - if (unlikely(s.waiting)) { - clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)), - (unsigned long *) &j->prereserved.v); - closure_wake_up(&j->preres_wait); - } - - if (s.reserved <= s.remaining && j->watermark) - journal_set_watermark(j); -} - -int __bch2_journal_preres_get(struct journal *, - struct journal_preres *, unsigned, unsigned); - -static inline int bch2_journal_preres_get_fast(struct journal *j, - struct journal_preres *res, - unsigned new_u64s, - unsigned flags, - bool set_waiting) -{ - int d = new_u64s - res->u64s; - union journal_preres_state old, new; - u64 v = atomic64_read(&j->prereserved.counter); - enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - int ret; - - do { - old.v = new.v = v; - ret = 0; - - if (watermark == BCH_WATERMARK_reclaim || - new.reserved + d < new.remaining) { - new.reserved += d; - ret = 1; - } else if (set_waiting && !new.waiting) - new.waiting = true; - else - return 0; - } while ((v = atomic64_cmpxchg(&j->prereserved.counter, - old.v, new.v)) != old.v); - - if (ret) - res->u64s += d; - return ret; -} - -static inline int bch2_journal_preres_get(struct journal *j, - struct journal_preres *res, - unsigned new_u64s, - unsigned flags) -{ - if (new_u64s <= res->u64s) - return 0; - - if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false)) - return 0; - - if (flags & JOURNAL_RES_GET_NONBLOCK) - return -BCH_ERR_journal_preres_get_blocked; - - return __bch2_journal_preres_get(j, res, new_u64s, flags); -} - /* journal_entry_res: */ void bch2_journal_entry_res_resize(struct journal *, diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 9a584aaaa2eb..e63c6eda86af 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -50,16 +50,21 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, return available; } -static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) +static inline void journal_set_watermark(struct journal *j, bool low_on_space) { - union journal_preres_state old, new; - u64 v = atomic64_read(&j->prereserved.counter); + unsigned watermark = BCH_WATERMARK_stripe; - do { - old.v = new.v = v; - new.remaining = u64s_remaining; - } while ((v = atomic64_cmpxchg(&j->prereserved.counter, - old.v, new.v)) != old.v); + if (low_on_space) + watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); + if (fifo_free(&j->pin) < j->pin.size / 4) + watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); + + if (watermark == j->watermark) + return; + + swap(watermark, j->watermark); + if (watermark > j->watermark) + journal_wake(j); } static struct journal_space @@ -162,7 +167,6 @@ void bch2_journal_space_available(struct journal *j) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; unsigned clean, clean_ondisk, total; - s64 u64s_remaining = 0; unsigned max_entry_size = min(j->buf[0].buf_size >> 9, j->buf[1].buf_size >> 9); unsigned i, nr_online = 0, nr_devs_want; @@ -222,16 +226,10 @@ void 
bch2_journal_space_available(struct journal *j) else clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); - u64s_remaining = (u64) clean << 6; - u64s_remaining -= (u64) total << 3; - u64s_remaining = max(0LL, u64s_remaining); - u64s_remaining /= 4; - u64s_remaining = min_t(u64, u64s_remaining, U32_MAX); + journal_set_watermark(j, clean * 4 <= total); out: j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; - journal_set_remaining(j, u64s_remaining); - journal_set_watermark(j); if (!ret) journal_wake(j); @@ -555,11 +553,6 @@ static u64 journal_seq_to_flush(struct journal *j) /* Try to keep the journal at most half full: */ nr_buckets = ja->nr / 2; - /* And include pre-reservations: */ - nr_buckets += DIV_ROUND_UP(j->prereserved.reserved, - (ca->mi.bucket_size << 6) - - journal_entry_overhead(j)); - nr_buckets = min(nr_buckets, ja->nr); bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; @@ -638,10 +631,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) msecs_to_jiffies(c->opts.journal_reclaim_delay))) min_nr = 1; - if (j->prereserved.reserved * 4 > j->prereserved.remaining) - min_nr = 1; - - if (fifo_free(&j->pin) <= 32) + if (j->watermark != BCH_WATERMARK_stripe) min_nr = 1; if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used) @@ -652,8 +642,6 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) trace_and_count(c, journal_reclaim_start, c, direct, kicked, min_nr, min_key_cache, - j->prereserved.reserved, - j->prereserved.remaining, atomic_read(&c->btree_cache.dirty), c->btree_cache.used, atomic_long_read(&c->btree_key_cache.nr_dirty), diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 42504e16acb6..a756b69582e3 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -76,14 +76,6 @@ struct journal_res { u64 seq; }; -/* - * For reserving space in the journal prior to getting a reservation on a - * particular journal entry: - */ -struct journal_preres { - unsigned u64s; -}; - union journal_res_state { struct { atomic64_t counter; @@ -104,22 +96,6 @@ union journal_res_state { }; }; -union journal_preres_state { - struct { - atomic64_t counter; - }; - - struct { - u64 v; - }; - - struct { - u64 waiting:1, - reserved:31, - remaining:32; - }; -}; - /* bytes: */ #define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ #define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ @@ -180,8 +156,6 @@ struct journal { union journal_res_state reservations; enum bch_watermark watermark; - union journal_preres_state prereserved; - } __aligned(SMP_CACHE_BYTES); unsigned long flags; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 893304a1f06e..7857671159b4 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -196,10 +196,9 @@ DEFINE_EVENT(bio, journal_write, TRACE_EVENT(journal_reclaim_start, TP_PROTO(struct bch_fs *c, bool direct, bool kicked, u64 min_nr, u64 min_key_cache, - u64 prereserved, u64 prereserved_total, u64 btree_cache_dirty, u64 btree_cache_total, u64 btree_key_cache_dirty, u64 btree_key_cache_total), - TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total, + TP_ARGS(c, direct, kicked, min_nr, min_key_cache, btree_cache_dirty, btree_cache_total, btree_key_cache_dirty, btree_key_cache_total), @@ -209,8 +208,6 @@ TRACE_EVENT(journal_reclaim_start, __field(bool, kicked ) __field(u64, min_nr ) __field(u64, min_key_cache ) - __field(u64, prereserved ) - __field(u64, prereserved_total ) 
__field(u64, btree_cache_dirty ) __field(u64, btree_cache_total ) __field(u64, btree_key_cache_dirty ) @@ -223,22 +220,18 @@ TRACE_EVENT(journal_reclaim_start, __entry->kicked = kicked; __entry->min_nr = min_nr; __entry->min_key_cache = min_key_cache; - __entry->prereserved = prereserved; - __entry->prereserved_total = prereserved_total; __entry->btree_cache_dirty = btree_cache_dirty; __entry->btree_cache_total = btree_cache_total; __entry->btree_key_cache_dirty = btree_key_cache_dirty; __entry->btree_key_cache_total = btree_key_cache_total; ), - TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", + TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->direct, __entry->kicked, __entry->min_nr, __entry->min_key_cache, - __entry->prereserved, - __entry->prereserved_total, __entry->btree_cache_dirty, __entry->btree_cache_total, __entry->btree_key_cache_dirty, -- cgit From 069749688ea4bbaeff0ca3b229b443ea96b03757 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Nov 2023 22:15:59 -0500 Subject: bcachefs: Fix iterator leak in may_delete_deleted_inode() may_delete_deleted_inode() was returning without exiting a btree iterator, eventually causing propagate_key_to_snaphot_leaves() to go into an infinite loop hitting btree_trans_too_many_iters(). Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index def77f2d8802..dab12c14d1ad 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1134,7 +1134,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans, * unlinked inodes in the snapshot leaves: */ *need_another_pass = true; - return 0; + goto out; } ret = 1; -- cgit From b783fc4d1366658200bf759e1010655a9e2e145c Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Sun, 12 Nov 2023 00:38:41 +0000 Subject: bcachefs: Fix potential sleeping during mount During mount, bcachefs mount option processing may sleep while allocating a string buffer. Fix this by reference counting in order to take the atomic path. Signed-off-by: Daniel J Blueman Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 1f334124055b..4d0cb0ccff32 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -555,6 +555,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) case TARGET_DEV: { struct bch_dev *ca; + out->atomic++; rcu_read_lock(); ca = t.dev < c->sb.nr_devices ? rcu_dereference(c->devs[t.dev]) @@ -570,6 +571,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) } rcu_read_unlock(); + out->atomic--; break; } case TARGET_GROUP: -- cgit From 178c4873fd06c0361d260547ce70fcdc29b74809 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 12 Nov 2023 14:15:35 -0500 Subject: bcachefs: Fix error path in bch2_mount() This fixes a bug discovered by generic/388 where sb->s_fs_info was NULL while the superblock was still active - the error path was entirely fubar, and was trying to do something unclear and unecessary. 
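The underlying VFS rule, sketched for a hypothetical filesystem (not bcachefs code): once the superblock has been handed its private info, teardown is owned by .kill_sb, so a failing mount path should only drop its reference via deactivate_locked_super() and let kill_sb free everything exactly once.

  static void examplefs_kill_sb(struct super_block *sb)
  {
          struct examplefs_info *info = sb->s_fs_info;

          generic_shutdown_super(sb);     /* VFS-level teardown */
          examplefs_free(info);           /* the single place that frees s_fs_info */
  }

  /* err is a negative errno from a mount step that failed after sb setup */
  static struct dentry *examplefs_mount_fail(struct super_block *sb, int err)
  {
          /* do NOT clear or free sb->s_fs_info here: kill_sb owns that */
          deactivate_locked_super(sb);
          return ERR_PTR(err);
  }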
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 166d8d8abe68..8ef817304e4a 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1922,10 +1922,7 @@ out: return dget(sb->s_root); err_put_super: - sb->s_fs_info = NULL; - c->vfs_sb = NULL; deactivate_locked_super(sb); - bch2_fs_stop(c); return ERR_PTR(bch2_err_class(ret)); } @@ -1933,11 +1930,8 @@ static void bch2_kill_sb(struct super_block *sb) { struct bch_fs *c = sb->s_fs_info; - if (c) - c->vfs_sb = NULL; generic_shutdown_super(sb); - if (c) - bch2_fs_free(c); + bch2_fs_free(c); } static struct file_system_type bcache_fs_type = { -- cgit From f42fa17883e73d8509fff5925781d4157db82f00 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 12 Nov 2023 15:47:02 -0500 Subject: bcachefs: Fix missing transaction commit In may_delete_deleted_inode(), there's a corner case when a snapshot was taken while we had an unlinked inode: we don't want to delete the inode in the internal (shared) snapshot node, since it might have been reattached in a descendent snapshot. Instead we propagate the key to any snapshot leaves it doesn't exist in, so that it can be deleted there if necessary, and then clear the unlinked flag in the internal node. But we forgot to commit after clearing the unlinked flag, causing us to go into an infinite loop. Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index dab12c14d1ad..c7849b0753e7 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1169,8 +1169,10 @@ again: */ for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - ret = lockrestart_do(trans, may_delete_deleted_inode(trans, &iter, k.k->p, - &need_another_pass)); + ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass)); if (ret < 0) break; -- cgit From 497c57a303590ea69ace23506e182c489e85694d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 12 Nov 2023 17:02:08 -0500 Subject: bcachefs: Disable debug log statements The journal read path had some informational log statements preperatory for ZNS support - they're not of interest to users, so we can turn them off. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index f4bc2cdbfdd7..786a09285509 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1079,6 +1079,12 @@ found: if (ja->bucket_seq[ja->cur_idx] && ja->sectors_free == ca->mi.bucket_size) { +#if 0 + /* + * Debug code for ZNS support, where we (probably) want to be + * correlated where we stopped in the journal to the zone write + * points: + */ bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); for (i = 0; i < 3; i++) { @@ -1086,6 +1092,7 @@ found: bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); } +#endif ja->sectors_free = 0; } -- cgit From 7125063fc6dfb77138b3a100527f3d8f9203ff2a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 2 Mar 2023 23:52:57 -0500 Subject: bcachefs: Don't decrease BTREE_ITER_MAX when LOCKDEP=y Running with fewer max btree paths doesn't work anymore when replication is enabled - as we've added e.g. 
the freespace and bucket gens btrees, we naturally end up needing more btree paths. This is an issue with lockdep, we end up taking more locks than lockdep will track (the MAX_LOCKD_DEPTH constant). But bcachefs as merged does not yet support lockdep anyways, so we can leave that for later. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 6fbd4ef3df6b..60453ba86c4b 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -363,11 +363,7 @@ struct btree_insert_entry { unsigned long ip_allocated; }; -#ifndef CONFIG_LOCKDEP #define BTREE_ITER_MAX 64 -#else -#define BTREE_ITER_MAX 32 -#endif struct btree_trans_commit_hook; typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *); -- cgit From db18ef1a02bc2cd924f86b2582302f2c2711b67c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 Nov 2023 21:17:19 -0500 Subject: bcachefs: Fix bch2_check_nlinks() for snapshots When searching the link table for the matching inode, we were searching for a specific - incorrect - snapshot ID as well, causing us to fail to find the inode. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 9f3e9bd3d767..e0c5cd119acc 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2220,7 +2220,7 @@ static int nlink_cmp(const void *_l, const void *_r) const struct nlink *l = _l; const struct nlink *r = _r; - return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot); + return cmp_int(l->inum, r->inum); } static void inc_link(struct bch_fs *c, struct snapshots_seen *s, -- cgit From 62d73dfc44d54c97e0df6b947f0bccf6c4b8030e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 12 Nov 2023 21:46:52 -0500 Subject: bcachefs: Fix no_data_io mode checksum check In no_data_io mode, we expect data checksums to be wrong - don't want to spew the log with them. Signed-off-by: Kent Overstreet --- fs/bcachefs/io_write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index f02b3f7d26a0..d704a8f829c8 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -795,7 +795,7 @@ static int bch2_write_decrypt(struct bch_write_op *op) * checksum: */ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); - if (bch2_crc_cmp(op->crc.csum, csum)) + if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) return -EIO; ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); -- cgit From 61b85cb0d773115d9a4b20c3e67286844cf73f34 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 Nov 2023 18:52:22 -0500 Subject: bcachefs: six locks: Fix lost wakeup In percpu reader mode, trylock() for read had a lost wakeup: on failure to get the lock, we may have caused a writer to fail to get the lock, because we temporarily elevated the reader count. We need to check for waiters after decrementing the read count - not before. 
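The ordering bug generalizes to any optimistic reader count: the back-out decrement must be globally visible before the reader looks for blocked writers, otherwise reader and writer can each miss the other's update. A heavily simplified sketch (invented example_* names; only the barrier placement is the point):

  #define EXAMPLE_HELD_WRITE      1
  #define EXAMPLE_WAITING_WRITE   2

  struct example_rwlock {
          atomic_t state;                         /* EXAMPLE_* bits */
          unsigned int __percpu *readers;
  };

  static bool example_try_read_lock(struct example_rwlock *l)
  {
          bool ret;

          preempt_disable();
          this_cpu_inc(*l->readers);              /* optimistic: assume no writer */
          smp_mb();                               /* order the inc against reading state */
          ret = !(atomic_read(&l->state) & EXAMPLE_HELD_WRITE);
          this_cpu_sub(*l->readers, !ret);        /* back out on failure */
          preempt_enable();

          if (!ret) {
                  /*
                   * Re-read state only after the decrement is visible: a
                   * writer that blocked on our transient reader count must
                   * not be missed.  Checking a stale pre-decrement value is
                   * exactly the lost wakeup fixed above.
                   */
                  smp_mb();
                  if (atomic_read(&l->state) & EXAMPLE_WAITING_WRITE)
                          example_wake_writer(l);
          }
          return ret;
  }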
Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index b775cf0fb7cb..97790445e67a 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -163,8 +163,11 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, this_cpu_sub(*lock->readers, !ret); preempt_enable(); - if (!ret && (old & SIX_LOCK_WAITING_write)) - ret = -1 - SIX_LOCK_write; + if (!ret) { + smp_mb(); + if (atomic_read(&lock->state) & SIX_LOCK_WAITING_write) + ret = -1 - SIX_LOCK_write; + } } else if (type == SIX_LOCK_write && lock->readers) { if (try) { atomic_add(SIX_LOCK_HELD_write, &lock->state); -- cgit From 5dd9ad32d7758b1a76742f394acf0eb3ac8a636a Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 27 Sep 2023 10:29:02 +0200 Subject: xen/events: drop xen_allocate_irqs_dynamic() Instead of having a common function for allocating a single IRQ or a consecutive number of IRQs, split up the functionality into the callers of xen_allocate_irqs_dynamic(). This allows to handle any allocation error in xen_irq_init() gracefully instead of panicing the system. Let xen_irq_init() return the irq_info pointer or NULL in case of an allocation error. Additionally set the IRQ into irq_info already at allocation time, as otherwise the IRQ would be '0' (which is a valid IRQ number) until being set. Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Signed-off-by: Juergen Gross --- drivers/xen/events/events_base.c | 74 ++++++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 30 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index a810e8904fbf..75f8d1dcbbb7 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -304,6 +304,13 @@ static void channels_on_cpu_inc(struct irq_info *info) info->is_accounted = 1; } +static void xen_irq_free_desc(unsigned int irq) +{ + /* Legacy IRQ descriptors are managed by the arch. */ + if (irq >= nr_legacy_irqs()) + irq_free_desc(irq); +} + static void delayed_free_irq(struct work_struct *work) { struct irq_info *info = container_of(to_rcu_work(work), struct irq_info, @@ -315,9 +322,7 @@ static void delayed_free_irq(struct work_struct *work) kfree(info); - /* Legacy IRQ descriptors are managed by the arch. */ - if (irq >= nr_legacy_irqs()) - irq_free_desc(irq); + xen_irq_free_desc(irq); } /* Constructors for packed IRQ information. */ @@ -332,7 +337,6 @@ static int xen_irq_info_common_setup(struct irq_info *info, BUG_ON(info->type != IRQT_UNBOUND && info->type != type); info->type = type; - info->irq = irq; info->evtchn = evtchn; info->cpu = cpu; info->mask_reason = EVT_MASK_REASON_EXPLICIT; @@ -733,47 +737,45 @@ void xen_irq_lateeoi(unsigned int irq, unsigned int eoi_flags) } EXPORT_SYMBOL_GPL(xen_irq_lateeoi); -static void xen_irq_init(unsigned irq) +static struct irq_info *xen_irq_init(unsigned int irq) { struct irq_info *info; info = kzalloc(sizeof(*info), GFP_KERNEL); - if (info == NULL) - panic("Unable to allocate metadata for IRQ%d\n", irq); + if (info) { + info->irq = irq; + info->type = IRQT_UNBOUND; + info->refcnt = -1; + INIT_RCU_WORK(&info->rwork, delayed_free_irq); - info->type = IRQT_UNBOUND; - info->refcnt = -1; - INIT_RCU_WORK(&info->rwork, delayed_free_irq); + set_info_for_irq(irq, info); + /* + * Interrupt affinity setting can be immediate. No point + * in delaying it until an interrupt is handled. 
+ */ + irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - set_info_for_irq(irq, info); - /* - * Interrupt affinity setting can be immediate. No point - * in delaying it until an interrupt is handled. - */ - irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); + INIT_LIST_HEAD(&info->eoi_list); + list_add_tail(&info->list, &xen_irq_list_head); + } - INIT_LIST_HEAD(&info->eoi_list); - list_add_tail(&info->list, &xen_irq_list_head); + return info; } -static int __must_check xen_allocate_irqs_dynamic(int nvec) +static inline int __must_check xen_allocate_irq_dynamic(void) { - int i, irq = irq_alloc_descs(-1, 0, nvec, -1); + int irq = irq_alloc_desc_from(0, -1); if (irq >= 0) { - for (i = 0; i < nvec; i++) - xen_irq_init(irq + i); + if (!xen_irq_init(irq)) { + xen_irq_free_desc(irq); + irq = -1; + } } return irq; } -static inline int __must_check xen_allocate_irq_dynamic(void) -{ - - return xen_allocate_irqs_dynamic(1); -} - static int __must_check xen_allocate_irq_gsi(unsigned gsi) { int irq; @@ -793,7 +795,10 @@ static int __must_check xen_allocate_irq_gsi(unsigned gsi) else irq = irq_alloc_desc_at(gsi, -1); - xen_irq_init(irq); + if (!xen_irq_init(irq)) { + xen_irq_free_desc(irq); + irq = -1; + } return irq; } @@ -963,6 +968,11 @@ static void __unbind_from_irq(unsigned int irq) evtchn_port_t evtchn = evtchn_from_irq(irq); struct irq_info *info = info_for_irq(irq); + if (!info) { + xen_irq_free_desc(irq); + return; + } + if (info->refcnt > 0) { info->refcnt--; if (info->refcnt != 0) @@ -1101,11 +1111,14 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, mutex_lock(&irq_mapping_update_lock); - irq = xen_allocate_irqs_dynamic(nvec); + irq = irq_alloc_descs(-1, 0, nvec, -1); if (irq < 0) goto out; for (i = 0; i < nvec; i++) { + if (!xen_irq_init(irq + i)) + goto error_irq; + irq_set_chip_and_handler_name(irq + i, &xen_pirq_chip, handle_edge_irq, name); ret = xen_irq_info_pirq_setup(irq + i, 0, pirq + i, 0, domid, @@ -1730,6 +1743,7 @@ void rebind_evtchn_irq(evtchn_port_t evtchn, int irq) so there should be a proper type */ BUG_ON(info->type == IRQT_UNBOUND); + info->irq = irq; (void)xen_irq_info_evtchn_setup(irq, evtchn, NULL); mutex_unlock(&irq_mapping_update_lock); -- cgit From 3fcdaf3d7634338c3f5cbfa7451eb0b6b0024844 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 28 Sep 2023 09:09:52 +0200 Subject: xen/events: modify internal [un]bind interfaces Modify the internal bind- and unbind-interfaces to take a struct irq_info parameter. When allocating a new IRQ pass the pointer from the allocating function further up. This will reduce the number of info_for_irq() calls and make the code more efficient. Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Signed-off-by: Juergen Gross --- drivers/xen/events/events_base.c | 259 +++++++++++++++++++-------------------- 1 file changed, 124 insertions(+), 135 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 75f8d1dcbbb7..88f0c80d0f87 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -327,7 +327,6 @@ static void delayed_free_irq(struct work_struct *work) /* Constructors for packed IRQ information. 
*/ static int xen_irq_info_common_setup(struct irq_info *info, - unsigned irq, enum xen_irq_type type, evtchn_port_t evtchn, unsigned short cpu) @@ -342,23 +341,22 @@ static int xen_irq_info_common_setup(struct irq_info *info, info->mask_reason = EVT_MASK_REASON_EXPLICIT; raw_spin_lock_init(&info->lock); - ret = set_evtchn_to_irq(evtchn, irq); + ret = set_evtchn_to_irq(evtchn, info->irq); if (ret < 0) return ret; - irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN); + irq_clear_status_flags(info->irq, IRQ_NOREQUEST | IRQ_NOAUTOEN); return xen_evtchn_port_setup(evtchn); } -static int xen_irq_info_evtchn_setup(unsigned irq, +static int xen_irq_info_evtchn_setup(struct irq_info *info, evtchn_port_t evtchn, struct xenbus_device *dev) { - struct irq_info *info = info_for_irq(irq); int ret; - ret = xen_irq_info_common_setup(info, irq, IRQT_EVTCHN, evtchn, 0); + ret = xen_irq_info_common_setup(info, IRQT_EVTCHN, evtchn, 0); info->u.interdomain = dev; if (dev) atomic_inc(&dev->event_channels); @@ -366,50 +364,37 @@ static int xen_irq_info_evtchn_setup(unsigned irq, return ret; } -static int xen_irq_info_ipi_setup(unsigned cpu, - unsigned irq, - evtchn_port_t evtchn, - enum ipi_vector ipi) +static int xen_irq_info_ipi_setup(struct irq_info *info, unsigned int cpu, + evtchn_port_t evtchn, enum ipi_vector ipi) { - struct irq_info *info = info_for_irq(irq); - info->u.ipi = ipi; - per_cpu(ipi_to_irq, cpu)[ipi] = irq; + per_cpu(ipi_to_irq, cpu)[ipi] = info->irq; per_cpu(ipi_to_evtchn, cpu)[ipi] = evtchn; - return xen_irq_info_common_setup(info, irq, IRQT_IPI, evtchn, 0); + return xen_irq_info_common_setup(info, IRQT_IPI, evtchn, 0); } -static int xen_irq_info_virq_setup(unsigned cpu, - unsigned irq, - evtchn_port_t evtchn, - unsigned virq) +static int xen_irq_info_virq_setup(struct irq_info *info, unsigned int cpu, + evtchn_port_t evtchn, unsigned int virq) { - struct irq_info *info = info_for_irq(irq); - info->u.virq = virq; - per_cpu(virq_to_irq, cpu)[virq] = irq; + per_cpu(virq_to_irq, cpu)[virq] = info->irq; - return xen_irq_info_common_setup(info, irq, IRQT_VIRQ, evtchn, 0); + return xen_irq_info_common_setup(info, IRQT_VIRQ, evtchn, 0); } -static int xen_irq_info_pirq_setup(unsigned irq, - evtchn_port_t evtchn, - unsigned pirq, - unsigned gsi, - uint16_t domid, - unsigned char flags) +static int xen_irq_info_pirq_setup(struct irq_info *info, evtchn_port_t evtchn, + unsigned int pirq, unsigned int gsi, + uint16_t domid, unsigned char flags) { - struct irq_info *info = info_for_irq(irq); - info->u.pirq.pirq = pirq; info->u.pirq.gsi = gsi; info->u.pirq.domid = domid; info->u.pirq.flags = flags; - return xen_irq_info_common_setup(info, irq, IRQT_PIRQ, evtchn, 0); + return xen_irq_info_common_setup(info, IRQT_PIRQ, evtchn, 0); } static void xen_irq_info_cleanup(struct irq_info *info) @@ -453,20 +438,16 @@ int irq_evtchn_from_virq(unsigned int cpu, unsigned int virq, return irq; } -static enum ipi_vector ipi_from_irq(unsigned irq) +static enum ipi_vector ipi_from_irq(struct irq_info *info) { - struct irq_info *info = info_for_irq(irq); - BUG_ON(info == NULL); BUG_ON(info->type != IRQT_IPI); return info->u.ipi; } -static unsigned virq_from_irq(unsigned irq) +static unsigned int virq_from_irq(struct irq_info *info) { - struct irq_info *info = info_for_irq(irq); - BUG_ON(info == NULL); BUG_ON(info->type != IRQT_VIRQ); @@ -533,13 +514,9 @@ static bool pirq_needs_eoi_flag(unsigned irq) return info->u.pirq.flags & PIRQ_NEEDS_EOI; } -static void bind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int cpu, 
+static void bind_evtchn_to_cpu(struct irq_info *info, unsigned int cpu, bool force_affinity) { - struct irq_info *info = evtchn_to_info(evtchn); - - BUG_ON(info == NULL); - if (IS_ENABLED(CONFIG_SMP) && force_affinity) { struct irq_data *data = irq_get_irq_data(info->irq); @@ -547,7 +524,7 @@ static void bind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int cpu, irq_data_update_effective_affinity(data, cpumask_of(cpu)); } - xen_evtchn_port_bind_to_cpu(evtchn, cpu, info->cpu); + xen_evtchn_port_bind_to_cpu(info->evtchn, cpu, info->cpu); channels_on_cpu_dec(info); info->cpu = cpu; @@ -762,23 +739,24 @@ static struct irq_info *xen_irq_init(unsigned int irq) return info; } -static inline int __must_check xen_allocate_irq_dynamic(void) +static struct irq_info *xen_allocate_irq_dynamic(void) { int irq = irq_alloc_desc_from(0, -1); + struct irq_info *info = NULL; if (irq >= 0) { - if (!xen_irq_init(irq)) { + info = xen_irq_init(irq); + if (!info) xen_irq_free_desc(irq); - irq = -1; - } } - return irq; + return info; } -static int __must_check xen_allocate_irq_gsi(unsigned gsi) +static struct irq_info *xen_allocate_irq_gsi(unsigned int gsi) { int irq; + struct irq_info *info; /* * A PV guest has no concept of a GSI (since it has no ACPI @@ -795,18 +773,15 @@ static int __must_check xen_allocate_irq_gsi(unsigned gsi) else irq = irq_alloc_desc_at(gsi, -1); - if (!xen_irq_init(irq)) { + info = xen_irq_init(irq); + if (!info) xen_irq_free_desc(irq); - irq = -1; - } - return irq; + return info; } -static void xen_free_irq(unsigned irq) +static void xen_free_irq(struct irq_info *info) { - struct irq_info *info = info_for_irq(irq); - if (WARN_ON(!info)) return; @@ -897,7 +872,7 @@ static unsigned int __startup_pirq(unsigned int irq) goto err; info->evtchn = evtchn; - bind_evtchn_to_cpu(evtchn, 0, false); + bind_evtchn_to_cpu(info, 0, false); rc = xen_evtchn_port_setup(evtchn); if (rc) @@ -963,10 +938,9 @@ int xen_irq_from_gsi(unsigned gsi) } EXPORT_SYMBOL_GPL(xen_irq_from_gsi); -static void __unbind_from_irq(unsigned int irq) +static void __unbind_from_irq(struct irq_info *info, unsigned int irq) { - evtchn_port_t evtchn = evtchn_from_irq(irq); - struct irq_info *info = info_for_irq(irq); + evtchn_port_t evtchn; if (!info) { xen_irq_free_desc(irq); @@ -979,6 +953,8 @@ static void __unbind_from_irq(unsigned int irq) return; } + evtchn = info->evtchn; + if (VALID_EVTCHN(evtchn)) { unsigned int cpu = info->cpu; struct xenbus_device *dev; @@ -988,11 +964,11 @@ static void __unbind_from_irq(unsigned int irq) switch (info->type) { case IRQT_VIRQ: - per_cpu(virq_to_irq, cpu)[virq_from_irq(irq)] = -1; + per_cpu(virq_to_irq, cpu)[virq_from_irq(info)] = -1; break; case IRQT_IPI: - per_cpu(ipi_to_irq, cpu)[ipi_from_irq(irq)] = -1; - per_cpu(ipi_to_evtchn, cpu)[ipi_from_irq(irq)] = 0; + per_cpu(ipi_to_irq, cpu)[ipi_from_irq(info)] = -1; + per_cpu(ipi_to_evtchn, cpu)[ipi_from_irq(info)] = 0; break; case IRQT_EVTCHN: dev = info->u.interdomain; @@ -1006,7 +982,7 @@ static void __unbind_from_irq(unsigned int irq) xen_irq_info_cleanup(info); } - xen_free_irq(irq); + xen_free_irq(info); } /* @@ -1022,24 +998,24 @@ static void __unbind_from_irq(unsigned int irq) int xen_bind_pirq_gsi_to_irq(unsigned gsi, unsigned pirq, int shareable, char *name) { - int irq; + struct irq_info *info; struct physdev_irq irq_op; int ret; mutex_lock(&irq_mapping_update_lock); - irq = xen_irq_from_gsi(gsi); - if (irq != -1) { + ret = xen_irq_from_gsi(gsi); + if (ret != -1) { pr_info("%s: returning irq %d for gsi %u\n", - __func__, irq, gsi); + 
__func__, ret, gsi); goto out; } - irq = xen_allocate_irq_gsi(gsi); - if (irq < 0) + info = xen_allocate_irq_gsi(gsi); + if (!info) goto out; - irq_op.irq = irq; + irq_op.irq = info->irq; irq_op.vector = 0; /* Only the privileged domain can do this. For non-priv, the pcifront @@ -1047,20 +1023,19 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, * this in the priv domain. */ if (xen_initial_domain() && HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) { - xen_free_irq(irq); - irq = -ENOSPC; + xen_free_irq(info); + ret = -ENOSPC; goto out; } - ret = xen_irq_info_pirq_setup(irq, 0, pirq, gsi, DOMID_SELF, + ret = xen_irq_info_pirq_setup(info, 0, pirq, gsi, DOMID_SELF, shareable ? PIRQ_SHAREABLE : 0); if (ret < 0) { - __unbind_from_irq(irq); - irq = ret; + __unbind_from_irq(info, info->irq); goto out; } - pirq_query_unmask(irq); + pirq_query_unmask(info->irq); /* We try to use the handler with the appropriate semantic for the * type of interrupt: if the interrupt is an edge triggered * interrupt we use handle_edge_irq. @@ -1077,16 +1052,18 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, * is the right choice either way. */ if (shareable) - irq_set_chip_and_handler_name(irq, &xen_pirq_chip, + irq_set_chip_and_handler_name(info->irq, &xen_pirq_chip, handle_fasteoi_irq, name); else - irq_set_chip_and_handler_name(irq, &xen_pirq_chip, + irq_set_chip_and_handler_name(info->irq, &xen_pirq_chip, handle_edge_irq, name); + ret = info->irq; + out: mutex_unlock(&irq_mapping_update_lock); - return irq; + return ret; } #ifdef CONFIG_PCI_MSI @@ -1108,6 +1085,7 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, int pirq, int nvec, const char *name, domid_t domid) { int i, irq, ret; + struct irq_info *info; mutex_lock(&irq_mapping_update_lock); @@ -1116,12 +1094,13 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, goto out; for (i = 0; i < nvec; i++) { - if (!xen_irq_init(irq + i)) + info = xen_irq_init(irq + i); + if (!info) goto error_irq; irq_set_chip_and_handler_name(irq + i, &xen_pirq_chip, handle_edge_irq, name); - ret = xen_irq_info_pirq_setup(irq + i, 0, pirq + i, 0, domid, + ret = xen_irq_info_pirq_setup(info, 0, pirq + i, 0, domid, i == 0 ? 
0 : PIRQ_MSI_GROUP); if (ret < 0) goto error_irq; @@ -1133,9 +1112,12 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, out: mutex_unlock(&irq_mapping_update_lock); return irq; + error_irq: - while (nvec--) - __unbind_from_irq(irq + nvec); + while (nvec--) { + info = info_for_irq(irq + nvec); + __unbind_from_irq(info, irq + nvec); + } mutex_unlock(&irq_mapping_update_lock); return ret; } @@ -1171,7 +1153,7 @@ int xen_destroy_irq(int irq) } } - xen_free_irq(irq); + xen_free_irq(info); out: mutex_unlock(&irq_mapping_update_lock); @@ -1187,8 +1169,7 @@ EXPORT_SYMBOL_GPL(xen_pirq_from_irq); static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip, struct xenbus_device *dev) { - int irq; - int ret; + int ret = -ENOMEM; struct irq_info *info; if (evtchn >= xen_evtchn_max_channels()) @@ -1199,17 +1180,16 @@ static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip, info = evtchn_to_info(evtchn); if (!info) { - irq = xen_allocate_irq_dynamic(); - if (irq < 0) + info = xen_allocate_irq_dynamic(); + if (!info) goto out; - irq_set_chip_and_handler_name(irq, chip, + irq_set_chip_and_handler_name(info->irq, chip, handle_edge_irq, "event"); - ret = xen_irq_info_evtchn_setup(irq, evtchn, dev); + ret = xen_irq_info_evtchn_setup(info, evtchn, dev); if (ret < 0) { - __unbind_from_irq(irq); - irq = ret; + __unbind_from_irq(info, info->irq); goto out; } /* @@ -1219,17 +1199,17 @@ static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip, * affinity setting is not invoked on them so nothing would * bind the channel. */ - bind_evtchn_to_cpu(evtchn, 0, false); - } else { - if (!WARN_ON(info->type != IRQT_EVTCHN)) - info->refcnt++; - irq = info->irq; + bind_evtchn_to_cpu(info, 0, false); + } else if (!WARN_ON(info->type != IRQT_EVTCHN)) { + info->refcnt++; } + ret = info->irq; + out: mutex_unlock(&irq_mapping_update_lock); - return irq; + return ret; } int bind_evtchn_to_irq(evtchn_port_t evtchn) @@ -1248,18 +1228,19 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) { struct evtchn_bind_ipi bind_ipi; evtchn_port_t evtchn; - int ret, irq; + struct irq_info *info; + int ret; mutex_lock(&irq_mapping_update_lock); - irq = per_cpu(ipi_to_irq, cpu)[ipi]; + ret = per_cpu(ipi_to_irq, cpu)[ipi]; - if (irq == -1) { - irq = xen_allocate_irq_dynamic(); - if (irq < 0) + if (ret == -1) { + info = xen_allocate_irq_dynamic(); + if (!info) goto out; - irq_set_chip_and_handler_name(irq, &xen_percpu_chip, + irq_set_chip_and_handler_name(info->irq, &xen_percpu_chip, handle_percpu_irq, "ipi"); bind_ipi.vcpu = xen_vcpu_nr(cpu); @@ -1268,25 +1249,25 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) BUG(); evtchn = bind_ipi.port; - ret = xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); + ret = xen_irq_info_ipi_setup(info, cpu, evtchn, ipi); if (ret < 0) { - __unbind_from_irq(irq); - irq = ret; + __unbind_from_irq(info, info->irq); goto out; } /* * Force the affinity mask to the target CPU so proc shows * the correct target. 
*/ - bind_evtchn_to_cpu(evtchn, cpu, true); + bind_evtchn_to_cpu(info, cpu, true); + ret = info->irq; } else { - struct irq_info *info = info_for_irq(irq); + info = info_for_irq(ret); WARN_ON(info == NULL || info->type != IRQT_IPI); } out: mutex_unlock(&irq_mapping_update_lock); - return irq; + return ret; } static int bind_interdomain_evtchn_to_irq_chip(struct xenbus_device *dev, @@ -1354,22 +1335,23 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu) { struct evtchn_bind_virq bind_virq; evtchn_port_t evtchn = 0; - int irq, ret; + struct irq_info *info; + int ret; mutex_lock(&irq_mapping_update_lock); - irq = per_cpu(virq_to_irq, cpu)[virq]; + ret = per_cpu(virq_to_irq, cpu)[virq]; - if (irq == -1) { - irq = xen_allocate_irq_dynamic(); - if (irq < 0) + if (ret == -1) { + info = xen_allocate_irq_dynamic(); + if (!info) goto out; if (percpu) - irq_set_chip_and_handler_name(irq, &xen_percpu_chip, + irq_set_chip_and_handler_name(info->irq, &xen_percpu_chip, handle_percpu_irq, "virq"); else - irq_set_chip_and_handler_name(irq, &xen_dynamic_chip, + irq_set_chip_and_handler_name(info->irq, &xen_dynamic_chip, handle_edge_irq, "virq"); bind_virq.virq = virq; @@ -1384,10 +1366,9 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu) BUG_ON(ret < 0); } - ret = xen_irq_info_virq_setup(cpu, irq, evtchn, virq); + ret = xen_irq_info_virq_setup(info, cpu, evtchn, virq); if (ret < 0) { - __unbind_from_irq(irq); - irq = ret; + __unbind_from_irq(info, info->irq); goto out; } @@ -1395,22 +1376,26 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu) * Force the affinity mask for percpu interrupts so proc * shows the correct target. */ - bind_evtchn_to_cpu(evtchn, cpu, percpu); + bind_evtchn_to_cpu(info, cpu, percpu); + ret = info->irq; } else { - struct irq_info *info = info_for_irq(irq); + info = info_for_irq(ret); WARN_ON(info == NULL || info->type != IRQT_VIRQ); } out: mutex_unlock(&irq_mapping_update_lock); - return irq; + return ret; } static void unbind_from_irq(unsigned int irq) { + struct irq_info *info; + mutex_lock(&irq_mapping_update_lock); - __unbind_from_irq(irq); + info = info_for_irq(irq); + __unbind_from_irq(info, irq); mutex_unlock(&irq_mapping_update_lock); } @@ -1744,11 +1729,11 @@ void rebind_evtchn_irq(evtchn_port_t evtchn, int irq) BUG_ON(info->type == IRQT_UNBOUND); info->irq = irq; - (void)xen_irq_info_evtchn_setup(irq, evtchn, NULL); + (void)xen_irq_info_evtchn_setup(info, evtchn, NULL); mutex_unlock(&irq_mapping_update_lock); - bind_evtchn_to_cpu(evtchn, info->cpu, false); + bind_evtchn_to_cpu(info, info->cpu, false); /* Unmask the event channel. */ enable_irq(irq); @@ -1782,7 +1767,7 @@ static int xen_rebind_evtchn_to_cpu(struct irq_info *info, unsigned int tcpu) * it, but don't do the xenlinux-level rebind in that case. 
*/ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) - bind_evtchn_to_cpu(evtchn, tcpu, false); + bind_evtchn_to_cpu(info, tcpu, false); do_unmask(info, EVT_MASK_REASON_TEMPORARY); @@ -1933,7 +1918,7 @@ static void restore_pirqs(void) if (rc) { pr_warn("xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n", gsi, irq, pirq, rc); - xen_free_irq(irq); + xen_free_irq(info); continue; } @@ -1947,13 +1932,15 @@ static void restore_cpu_virqs(unsigned int cpu) { struct evtchn_bind_virq bind_virq; evtchn_port_t evtchn; + struct irq_info *info; int virq, irq; for (virq = 0; virq < NR_VIRQS; virq++) { if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) continue; + info = info_for_irq(irq); - BUG_ON(virq_from_irq(irq) != virq); + BUG_ON(virq_from_irq(info) != virq); /* Get a new binding from Xen. */ bind_virq.virq = virq; @@ -1964,9 +1951,9 @@ static void restore_cpu_virqs(unsigned int cpu) evtchn = bind_virq.port; /* Record the new mapping. */ - (void)xen_irq_info_virq_setup(cpu, irq, evtchn, virq); + xen_irq_info_virq_setup(info, cpu, evtchn, virq); /* The affinity mask is still valid */ - bind_evtchn_to_cpu(evtchn, cpu, false); + bind_evtchn_to_cpu(info, cpu, false); } } @@ -1974,13 +1961,15 @@ static void restore_cpu_ipis(unsigned int cpu) { struct evtchn_bind_ipi bind_ipi; evtchn_port_t evtchn; + struct irq_info *info; int ipi, irq; for (ipi = 0; ipi < XEN_NR_IPIS; ipi++) { if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) continue; + info = info_for_irq(irq); - BUG_ON(ipi_from_irq(irq) != ipi); + BUG_ON(ipi_from_irq(info) != ipi); /* Get a new binding from Xen. */ bind_ipi.vcpu = xen_vcpu_nr(cpu); @@ -1990,9 +1979,9 @@ static void restore_cpu_ipis(unsigned int cpu) evtchn = bind_ipi.port; /* Record the new mapping. */ - (void)xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); + xen_irq_info_ipi_setup(info, cpu, evtchn, ipi); /* The affinity mask is still valid */ - bind_evtchn_to_cpu(evtchn, cpu, false); + bind_evtchn_to_cpu(info, cpu, false); } } -- cgit From cee96422e863f0b0e9d3d0c2d617271ef2255858 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 28 Sep 2023 11:25:32 +0200 Subject: xen/events: remove some info_for_irq() calls in pirq handling Instead of the IRQ number user the struct irq_info pointer as parameter in the internal pirq related functions. This allows to drop some calls of info_for_irq(). 
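To illustrate the shape of this refactoring outside of the kernel: the idea is to resolve the irq-number-to-descriptor mapping once at the entry point and hand the descriptor pointer to the internal helpers, instead of letting each helper repeat the lookup. The following is only a minimal user-space sketch with invented names (irq_desc, desc_for_irq), not the events_base.c code:

  #include <stdio.h>

  struct irq_desc {
          int pirq;
  };

  static struct irq_desc table[16];

  /* Stand-in for info_for_irq(): the lookup we want to stop repeating. */
  static struct irq_desc *desc_for_irq(int irq)
  {
          return (irq >= 0 && irq < 16) ? &table[irq] : NULL;
  }

  /* Old style: every internal helper performs its own lookup. */
  static int pirq_from_irq_number(int irq)
  {
          struct irq_desc *d = desc_for_irq(irq);

          return d ? d->pirq : -1;
  }

  /* New style: the caller resolves the descriptor once and passes it down. */
  static int pirq_from_desc(const struct irq_desc *d)
  {
          return d->pirq;
  }

  int main(void)
  {
          table[3].pirq = 42;

          struct irq_desc *d = desc_for_irq(3);   /* single lookup at the boundary */

          printf("%d %d\n", pirq_from_irq_number(3), d ? pirq_from_desc(d) : -1);
          return 0;
  }
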
Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Signed-off-by: Juergen Gross --- drivers/xen/events/events_base.c | 117 +++++++++++++++++++++++---------------- 1 file changed, 68 insertions(+), 49 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 88f0c80d0f87..f5edb9e27e3c 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -174,7 +174,7 @@ static int **evtchn_to_irq; #ifdef CONFIG_X86 static unsigned long *pirq_eoi_map; #endif -static bool (*pirq_needs_eoi)(unsigned irq); +static bool (*pirq_needs_eoi)(struct irq_info *info); #define EVTCHN_ROW(e) (e / (PAGE_SIZE/sizeof(**evtchn_to_irq))) #define EVTCHN_COL(e) (e % (PAGE_SIZE/sizeof(**evtchn_to_irq))) @@ -190,7 +190,6 @@ static struct irq_chip xen_lateeoi_chip; static struct irq_chip xen_percpu_chip; static struct irq_chip xen_pirq_chip; static void enable_dynirq(struct irq_data *data); -static void disable_dynirq(struct irq_data *data); static DEFINE_PER_CPU(unsigned int, irq_epoch); @@ -454,10 +453,8 @@ static unsigned int virq_from_irq(struct irq_info *info) return info->u.virq; } -static unsigned pirq_from_irq(unsigned irq) +static unsigned int pirq_from_irq(struct irq_info *info) { - struct irq_info *info = info_for_irq(irq); - BUG_ON(info == NULL); BUG_ON(info->type != IRQT_PIRQ); @@ -500,15 +497,14 @@ static void do_unmask(struct irq_info *info, u8 reason) } #ifdef CONFIG_X86 -static bool pirq_check_eoi_map(unsigned irq) +static bool pirq_check_eoi_map(struct irq_info *info) { - return test_bit(pirq_from_irq(irq), pirq_eoi_map); + return test_bit(pirq_from_irq(info), pirq_eoi_map); } #endif -static bool pirq_needs_eoi_flag(unsigned irq) +static bool pirq_needs_eoi_flag(struct irq_info *info) { - struct irq_info *info = info_for_irq(irq); BUG_ON(info->type != IRQT_PIRQ); return info->u.pirq.flags & PIRQ_NEEDS_EOI; @@ -802,14 +798,11 @@ static void event_handler_exit(struct irq_info *info) clear_evtchn(info->evtchn); } -static void pirq_query_unmask(int irq) +static void pirq_query_unmask(struct irq_info *info) { struct physdev_irq_status_query irq_status; - struct irq_info *info = info_for_irq(irq); - BUG_ON(info->type != IRQT_PIRQ); - - irq_status.irq = pirq_from_irq(irq); + irq_status.irq = pirq_from_irq(info); if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) irq_status.flags = 0; @@ -818,56 +811,76 @@ static void pirq_query_unmask(int irq) info->u.pirq.flags |= PIRQ_NEEDS_EOI; } -static void eoi_pirq(struct irq_data *data) +static void do_eoi_pirq(struct irq_info *info) { - struct irq_info *info = info_for_irq(data->irq); - evtchn_port_t evtchn = info ? 
info->evtchn : 0; - struct physdev_eoi eoi = { .irq = pirq_from_irq(data->irq) }; + struct physdev_eoi eoi = { .irq = pirq_from_irq(info) }; int rc = 0; - if (!VALID_EVTCHN(evtchn)) + if (!VALID_EVTCHN(info->evtchn)) return; event_handler_exit(info); - if (pirq_needs_eoi(data->irq)) { + if (pirq_needs_eoi(info)) { rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); WARN_ON(rc); } } +static void eoi_pirq(struct irq_data *data) +{ + struct irq_info *info = info_for_irq(data->irq); + + do_eoi_pirq(info); +} + +static void do_disable_dynirq(struct irq_info *info) +{ + if (VALID_EVTCHN(info->evtchn)) + do_mask(info, EVT_MASK_REASON_EXPLICIT); +} + +static void disable_dynirq(struct irq_data *data) +{ + struct irq_info *info = info_for_irq(data->irq); + + if (info) + do_disable_dynirq(info); +} + static void mask_ack_pirq(struct irq_data *data) { - disable_dynirq(data); - eoi_pirq(data); + struct irq_info *info = info_for_irq(data->irq); + + if (info) { + do_disable_dynirq(info); + do_eoi_pirq(info); + } } -static unsigned int __startup_pirq(unsigned int irq) +static unsigned int __startup_pirq(struct irq_info *info) { struct evtchn_bind_pirq bind_pirq; - struct irq_info *info = info_for_irq(irq); - evtchn_port_t evtchn = evtchn_from_irq(irq); + evtchn_port_t evtchn = info->evtchn; int rc; - BUG_ON(info->type != IRQT_PIRQ); - if (VALID_EVTCHN(evtchn)) goto out; - bind_pirq.pirq = pirq_from_irq(irq); + bind_pirq.pirq = pirq_from_irq(info); /* NB. We are happy to share unless we are probing. */ bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ? BIND_PIRQ__WILL_SHARE : 0; rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq); if (rc != 0) { - pr_warn("Failed to obtain physical IRQ %d\n", irq); + pr_warn("Failed to obtain physical IRQ %d\n", info->irq); return 0; } evtchn = bind_pirq.port; - pirq_query_unmask(irq); + pirq_query_unmask(info); - rc = set_evtchn_to_irq(evtchn, irq); + rc = set_evtchn_to_irq(evtchn, info->irq); if (rc) goto err; @@ -881,26 +894,28 @@ static unsigned int __startup_pirq(unsigned int irq) out: do_unmask(info, EVT_MASK_REASON_EXPLICIT); - eoi_pirq(irq_get_irq_data(irq)); + do_eoi_pirq(info); return 0; err: - pr_err("irq%d: Failed to set port to irq mapping (%d)\n", irq, rc); + pr_err("irq%d: Failed to set port to irq mapping (%d)\n", info->irq, + rc); xen_evtchn_close(evtchn); return 0; } static unsigned int startup_pirq(struct irq_data *data) { - return __startup_pirq(data->irq); + struct irq_info *info = info_for_irq(data->irq); + + return __startup_pirq(info); } static void shutdown_pirq(struct irq_data *data) { - unsigned int irq = data->irq; - struct irq_info *info = info_for_irq(irq); - evtchn_port_t evtchn = evtchn_from_irq(irq); + struct irq_info *info = info_for_irq(data->irq); + evtchn_port_t evtchn = info->evtchn; BUG_ON(info->type != IRQT_PIRQ); @@ -1035,7 +1050,7 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, goto out; } - pirq_query_unmask(info->irq); + pirq_query_unmask(info); /* We try to use the handler with the appropriate semantic for the * type of interrupt: if the interrupt is an edge triggered * interrupt we use handle_edge_irq. 
@@ -1162,7 +1177,9 @@ out: int xen_pirq_from_irq(unsigned irq) { - return pirq_from_irq(irq); + struct irq_info *info = info_for_irq(irq); + + return pirq_from_irq(info); } EXPORT_SYMBOL_GPL(xen_pirq_from_irq); @@ -1824,28 +1841,30 @@ static void enable_dynirq(struct irq_data *data) do_unmask(info, EVT_MASK_REASON_EXPLICIT); } -static void disable_dynirq(struct irq_data *data) +static void do_ack_dynirq(struct irq_info *info) { - struct irq_info *info = info_for_irq(data->irq); - evtchn_port_t evtchn = info ? info->evtchn : 0; + evtchn_port_t evtchn = info->evtchn; if (VALID_EVTCHN(evtchn)) - do_mask(info, EVT_MASK_REASON_EXPLICIT); + event_handler_exit(info); } static void ack_dynirq(struct irq_data *data) { struct irq_info *info = info_for_irq(data->irq); - evtchn_port_t evtchn = info ? info->evtchn : 0; - if (VALID_EVTCHN(evtchn)) - event_handler_exit(info); + if (info) + do_ack_dynirq(info); } static void mask_ack_dynirq(struct irq_data *data) { - disable_dynirq(data); - ack_dynirq(data); + struct irq_info *info = info_for_irq(data->irq); + + if (info) { + do_disable_dynirq(info); + do_ack_dynirq(info); + } } static void lateeoi_ack_dynirq(struct irq_data *data) @@ -1924,7 +1943,7 @@ static void restore_pirqs(void) printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq); - __startup_pirq(irq); + __startup_pirq(info); } } -- cgit From 9e2e7efbbbff69d8340abb56d375dd79d1f5770f Mon Sep 17 00:00:00 2001 From: Johnathan Mantey Date: Mon, 13 Nov 2023 08:30:29 -0800 Subject: Revert ncsi: Propagate carrier gain/loss events to the NCSI controller This reverts commit 3780bb29311eccb7a1c9641032a112eed237f7e3. The cited commit introduced unwanted behavior. The intent for the commit was to be able to detect carrier loss/gain for just the NIC connected to the BMC. The unwanted effect is a carrier loss for auxiliary paths also causes the BMC to lose carrier. The BMC never regains carrier despite the secondary NIC regaining a link. This change, when merged, needs to be backported to stable kernels. 5.4-stable, 5.10-stable, 5.15-stable, 6.1-stable, 6.5-stable Fixes: 3780bb29311e ("ncsi: Propagate carrier gain/loss events to the NCSI controller") CC: stable@vger.kernel.org Signed-off-by: Johnathan Mantey Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/ncsi/ncsi-aen.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c index f8854bff286c..62fb1031763d 100644 --- a/net/ncsi/ncsi-aen.c +++ b/net/ncsi/ncsi-aen.c @@ -89,11 +89,6 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp, if ((had_link == has_link) || chained) return 0; - if (had_link) - netif_carrier_off(ndp->ndev.dev); - else - netif_carrier_on(ndp->ndev.dev); - if (!ndp->multi_package && !nc->package->multi_channel) { if (had_link) { ndp->flags |= NCSI_DEV_RESHUFFLE; -- cgit From efc0c8363bc6dab2cd540acc886e6097deee8bb9 Mon Sep 17 00:00:00 2001 From: Niklas Söderlund Date: Mon, 13 Nov 2023 17:44:12 +0100 Subject: dt-bindings: net: ethernet-controller: Fix formatting error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When moving the *-internal-delay-ps properties to only apply for RGMII interface modes there where a typo in the text formatting. Signed-off-by: Niklas Söderlund Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- Documentation/devicetree/bindings/net/ethernet-controller.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/net/ethernet-controller.yaml b/Documentation/devicetree/bindings/net/ethernet-controller.yaml index 9f6a5ccbcefe..d14d123ad7a0 100644 --- a/Documentation/devicetree/bindings/net/ethernet-controller.yaml +++ b/Documentation/devicetree/bindings/net/ethernet-controller.yaml @@ -275,12 +275,12 @@ allOf: properties: rx-internal-delay-ps: description: - RGMII Receive Clock Delay defined in pico seconds.This is used for + RGMII Receive Clock Delay defined in pico seconds. This is used for controllers that have configurable RX internal delays. If this property is present then the MAC applies the RX delay. tx-internal-delay-ps: description: - RGMII Transmit Clock Delay defined in pico seconds.This is used for + RGMII Transmit Clock Delay defined in pico seconds. This is used for controllers that have configurable TX internal delays. If this property is present then the MAC applies the TX delay. -- cgit From 00a614fc3527ba8846e6d823013bcf724e9a68f6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 27 Oct 2023 17:26:23 +0200 Subject: accel/ivpu: avoid build failure with CONFIG_PM=n The usage count of struct dev_pm_info is an implementation detail that is only available if CONFIG_PM is enabled, so printing it in a debug message causes a build failure in configurations without PM: In file included from include/linux/device.h:15, from include/linux/pci.h:37, from drivers/accel/ivpu/ivpu_pm.c:8: drivers/accel/ivpu/ivpu_pm.c: In function 'ivpu_rpm_get_if_active': drivers/accel/ivpu/ivpu_pm.c:254:51: error: 'struct dev_pm_info' has no member named 'usage_count' 254 | atomic_read(&vdev->drm.dev->power.usage_count)); | ^ include/linux/dev_printk.h:129:48: note: in definition of macro 'dev_printk' 129 | _dev_printk(level, dev, fmt, ##__VA_ARGS__); \ | ^~~~~~~~~~~ drivers/accel/ivpu/ivpu_drv.h:75:17: note: in expansion of macro 'dev_dbg' 75 | dev_dbg((vdev)->drm.dev, "[%s] " fmt, #type, ##args); \ | ^~~~~~~ drivers/accel/ivpu/ivpu_pm.c:253:9: note: in expansion of macro 'ivpu_dbg' 253 | ivpu_dbg(vdev, RPM, "rpm_get_if_active count %d\n", | ^~~~~~~~ The print message does not seem essential, so the easiest workaround is to just remove it. Fixes: c39dc15191c4 ("accel/ivpu: Read clock rate only if device is up") Signed-off-by: Arnd Bergmann Reviewed-by: Stanislaw Gruszka Signed-off-by: Stanislaw Gruszka Link: https://patchwork.freedesktop.org/patch/msgid/20231027152633.528490-1-arnd@kernel.org (cherry picked from commit 1470acbef122c7e2e588f6346ce459c26d0568a2) Signed-off-by: Jacek Lawrynowicz --- drivers/accel/ivpu/ivpu_pm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index 0ace218783c8..e9b16cbc26f4 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -250,9 +250,6 @@ int ivpu_rpm_get_if_active(struct ivpu_device *vdev) { int ret; - ivpu_dbg(vdev, RPM, "rpm_get_if_active count %d\n", - atomic_read(&vdev->drm.dev->power.usage_count)); - ret = pm_runtime_get_if_active(vdev->drm.dev, false); drm_WARN_ON(&vdev->drm, ret < 0); -- cgit From 674e318089468ece99aef4796eaef7add57f36b2 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Tue, 14 Nov 2023 09:56:18 +0200 Subject: net: Fix undefined behavior in netdev name allocation Cited commit removed the strscpy() call and kept the snprintf() only. 
It is common to use 'dev->name' as the format string before a netdev is registered, this results in 'res' and 'name' pointers being equal. According to POSIX, if copying takes place between objects that overlap as a result of a call to sprintf() or snprintf(), the results are undefined. Add back the strscpy() and use 'buf' as an intermediate buffer. Fixes: 7ad17b04dc7b ("net: trust the bitmap in __dev_alloc_name()") Cc: Jakub Kicinski Reviewed-by: Vlad Buslov Signed-off-by: Gal Pressman Reviewed-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 0d548431f3fa..af53f6d838ce 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1119,7 +1119,9 @@ static int __dev_alloc_name(struct net *net, const char *name, char *res) if (i == max_netdevices) return -ENFILE; - snprintf(res, IFNAMSIZ, name, i); + /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */ + strscpy(buf, name, IFNAMSIZ); + snprintf(res, IFNAMSIZ, buf, i); return i; } -- cgit From 9e88b493157a9901fa498f23cc3c9ab82b43ce83 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Wed, 15 Nov 2023 13:36:25 +0100 Subject: ALSA: hda: i915: Alays handle -EPROBE_DEFER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It turns out that even if the comment says that the driver can load fine, it's not really the case and no codecs are detected. Specifically for -EPROBE_DEFER, always fail the probe. This fixes a regression when HDA-intel is loaded before i915. Reported-by: Ville Syrjälä Closes: https://lore.kernel.org/r/ZVNUxZzCGcxQzqJX@intel.com Signed-off-by: Maarten Lankhorst Tested-by: Kai Vehmanen Fixes: e6d0c13e9f46 ("ALSA: hda: i915: Remove extra argument from snd_hdac_i915_init") Link: https://gitlab.freedesktop.org/drm/intel/-/issues/9671 Link: https://lore.kernel.org/r/20231115123625.74286-1-maarten.lankhorst@linux.intel.com Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_intel.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index e871afeeb383..e79508002bb1 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -2141,6 +2141,9 @@ static int azx_probe(struct pci_dev *pci, if (chip->driver_caps & AZX_DCAPS_I915_COMPONENT) { err = snd_hdac_i915_init(azx_bus(chip)); if (err < 0) { + if (err == -EPROBE_DEFER) + goto out_free; + /* if the controller is bound only with HDMI/DP * (for HSW and BDW), we need to abort the probe; * for other chips, still continue probing as other -- cgit From a0d45c3f596be53c1bd8822a1984532d14fdcea9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 14 Nov 2023 09:55:50 -0700 Subject: io_uring/fdinfo: remove need for sqpoll lock for thread/pid retrieval A previous commit added a trylock for getting the SQPOLL thread info via fdinfo, but this introduced a regression where we often fail to get it if the thread is busy. For that case, we end up not printing the current CPU and PID info. Rather than rely on this lock, just print the pid we already stored in the io_sq_data struct, and ensure we update the current CPU every time we've slept or potentially rescheduled. The latter won't potentially be 100% accurate, but that wasn't the case before either as the task can get migrated at any time unless it has been pinned at creation time. 
We retain keeping the io_sq_data dereference inside the ctx->uring_lock, as it has always been, as destruction of the thread and data happen below that. We could make this RCU safe, but there's little point in doing that. With this, we always print the last valid information we had, rather than have spurious outputs with missing information. Fixes: 7644b1a1c9a7 ("io_uring/fdinfo: lock SQ thread while retrieving thread cpu/pid") Signed-off-by: Jens Axboe --- io_uring/fdinfo.c | 9 ++------- io_uring/sqpoll.c | 12 ++++++++++-- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index f04a43044d91..976e9500f651 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -145,13 +145,8 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { struct io_sq_data *sq = ctx->sq_data; - if (mutex_trylock(&sq->lock)) { - if (sq->thread) { - sq_pid = task_pid_nr(sq->thread); - sq_cpu = task_cpu(sq->thread); - } - mutex_unlock(&sq->lock); - } + sq_pid = sq->task_pid; + sq_cpu = sq->sq_cpu; } seq_printf(m, "SqThread:\t%d\n", sq_pid); diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index bd6c2c7959a5..65b5dbe3c850 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -214,6 +214,7 @@ static bool io_sqd_handle_event(struct io_sq_data *sqd) did_sig = get_signal(&ksig); cond_resched(); mutex_lock(&sqd->lock); + sqd->sq_cpu = raw_smp_processor_id(); } return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); } @@ -229,10 +230,15 @@ static int io_sq_thread(void *data) snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); set_task_comm(current, buf); - if (sqd->sq_cpu != -1) + /* reset to our pid after we've set task_comm, for fdinfo */ + sqd->task_pid = current->pid; + + if (sqd->sq_cpu != -1) { set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu)); - else + } else { set_cpus_allowed_ptr(current, cpu_online_mask); + sqd->sq_cpu = raw_smp_processor_id(); + } mutex_lock(&sqd->lock); while (1) { @@ -261,6 +267,7 @@ static int io_sq_thread(void *data) mutex_unlock(&sqd->lock); cond_resched(); mutex_lock(&sqd->lock); + sqd->sq_cpu = raw_smp_processor_id(); } continue; } @@ -294,6 +301,7 @@ static int io_sq_thread(void *data) mutex_unlock(&sqd->lock); schedule(); mutex_lock(&sqd->lock); + sqd->sq_cpu = raw_smp_processor_id(); } list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) atomic_andnot(IORING_SQ_NEED_WAKEUP, -- cgit From 1fda5bb66ad8fb24ecb3858e61a13a6548428898 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 10 Nov 2023 17:39:28 -0800 Subject: bpf: Do not allocate percpu memory at init stage Kirill Shutemov reported significant percpu memory consumption increase after booting in 288-cpu VM ([1]) due to commit 41a5db8d8161 ("bpf: Add support for non-fix-size percpu mem allocation"). The percpu memory consumption is increased from 111MB to 969MB. The number is from /proc/meminfo. I tried to reproduce the issue with my local VM which at most supports upto 255 cpus. With 252 cpus, without the above commit, the percpu memory consumption immediately after boot is 57MB while with the above commit the percpu memory consumption is 231MB. This is not good since so far percpu memory from bpf memory allocator is not widely used yet. Let us change pre-allocation in init stage to on-demand allocation when verifier detects there is a need of percpu memory for bpf program. With this change, percpu memory consumption after boot can be reduced signicantly. 
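The shape of the fix is a lazily-initialised global guarded by a mutex, with the flag re-checked after the lock is taken so only one caller performs the allocation. Below is only a user-space sketch of that pattern with invented names (a pthread mutex stands in for the kernel mutex; a real lock-free fast path would also want an atomic flag):

  #include <pthread.h>
  #include <stdbool.h>
  #include <stdlib.h>

  static void *global_pool;               /* stands in for the percpu allocator state */
  static bool global_pool_set;
  static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

  /* Stand-in for the expensive init that used to run unconditionally at boot. */
  static int pool_init(void **pool)
  {
          *pool = malloc(1 << 20);
          return *pool ? 0 : -1;
  }

  /* Called on first demand instead of from an initcall. */
  static int get_global_pool(void)
  {
          int err = 0;

          if (!global_pool_set) {
                  pthread_mutex_lock(&pool_lock);
                  if (!global_pool_set) {         /* re-check under the lock */
                          err = pool_init(&global_pool);
                          if (!err)
                                  global_pool_set = true;
                  }
                  pthread_mutex_unlock(&pool_lock);
          }
          return err;
  }

  int main(void)
  {
          return get_global_pool() ? EXIT_FAILURE : EXIT_SUCCESS;
  }
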
[1] https://lore.kernel.org/lkml/20231109154934.4saimljtqx625l3v@box.shutemov.name/ Fixes: 41a5db8d8161 ("bpf: Add support for non-fix-size percpu mem allocation") Reported-and-tested-by: Kirill A. Shutemov Signed-off-by: Yonghong Song Acked-by: Hou Tao Link: https://lore.kernel.org/r/20231111013928.948838-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- kernel/bpf/core.c | 8 +++----- kernel/bpf/verifier.c | 20 ++++++++++++++++++-- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 35bff17396c0..6762dac3ef76 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -56,7 +56,7 @@ extern struct idr btf_idr; extern spinlock_t btf_idr_lock; extern struct kobject *btf_kobj; extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma; -extern bool bpf_global_ma_set, bpf_global_percpu_ma_set; +extern bool bpf_global_ma_set; typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64); typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 08626b519ce2..cd3afe57ece3 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -64,8 +64,8 @@ #define OFF insn->off #define IMM insn->imm -struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma; -bool bpf_global_ma_set, bpf_global_percpu_ma_set; +struct bpf_mem_alloc bpf_global_ma; +bool bpf_global_ma_set; /* No hurry in this branch * @@ -2934,9 +2934,7 @@ static int __init bpf_global_ma_init(void) ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false); bpf_global_ma_set = !ret; - ret = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true); - bpf_global_percpu_ma_set = !ret; - return !bpf_global_ma_set || !bpf_global_percpu_ma_set; + return ret; } late_initcall(bpf_global_ma_init); #endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a2267d5ed14e..6da370a047fe 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include "disasm.h" @@ -41,6 +42,9 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { #undef BPF_LINK_TYPE }; +struct bpf_mem_alloc bpf_global_percpu_ma; +static bool bpf_global_percpu_ma_set; + /* bpf_check() is a static code analyzer that walks eBPF program * instruction by instruction and updates register/stack state. * All paths of conditional branches are analyzed until 'bpf_exit' insn. 
@@ -336,6 +340,7 @@ struct bpf_kfunc_call_arg_meta { struct btf *btf_vmlinux; static DEFINE_MUTEX(bpf_verifier_lock); +static DEFINE_MUTEX(bpf_percpu_ma_lock); static const struct bpf_line_info * find_linfo(const struct bpf_verifier_env *env, u32 insn_off) @@ -12091,8 +12096,19 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set) return -ENOMEM; - if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && !bpf_global_percpu_ma_set) - return -ENOMEM; + if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + if (!bpf_global_percpu_ma_set) { + mutex_lock(&bpf_percpu_ma_lock); + if (!bpf_global_percpu_ma_set) { + err = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true); + if (!err) + bpf_global_percpu_ma_set = true; + } + mutex_unlock(&bpf_percpu_ma_lock); + if (err) + return err; + } + } if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) { verbose(env, "local type ID argument must be in range [0, U32_MAX]\n"); -- cgit From 430143b0d3611f4a9c8434319e5e504244749e79 Mon Sep 17 00:00:00 2001 From: Brenton Simpson Date: Tue, 14 Nov 2023 23:38:59 +0000 Subject: drm: panel-orientation-quirks: Add quirk for Lenovo Legion Go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Legion Go has a 2560x1600 portrait screen, with the native "up" facing the right controller (90° CW from the rest of the device). Signed-off-by: Brenton Simpson Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20231114233859.274189-1-appsforartists@google.com --- drivers/gpu/drm/drm_panel_orientation_quirks.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/drm_panel_orientation_quirks.c b/drivers/gpu/drm/drm_panel_orientation_quirks.c index d5c15292ae93..3d92f66e550c 100644 --- a/drivers/gpu/drm/drm_panel_orientation_quirks.c +++ b/drivers/gpu/drm/drm_panel_orientation_quirks.c @@ -336,6 +336,12 @@ static const struct dmi_system_id orientation_data[] = { DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "IdeaPad Duet 3 10IGL5"), }, .driver_data = (void *)&lcd1200x1920_rightside_up, + }, { /* Lenovo Legion Go 8APU1 */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "Legion Go 8APU1"), + }, + .driver_data = (void *)&lcd1600x2560_leftside_up, }, { /* Lenovo Yoga Book X90F / X90L */ .matches = { DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Intel Corporation"), -- cgit From ae1aadb1eb8d3cbc52e42bee71d67bd4a71f9f07 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Thu, 16 Nov 2023 00:39:33 +1000 Subject: nouveau: don't fail driver load if no display hw present. If we get back ENODEV don't fail load. There are nvidia devices that don't have display blocks and the driver should work on those. 
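The pattern is simply to treat "hardware block not present" as a non-fatal probe result for an optional subsystem. A minimal stand-alone sketch of the idea (invented names, not the nouveau display code):

  #include <errno.h>
  #include <stdio.h>

  /* Pretend probe of an optional display block; -ENODEV means "no such hw". */
  static int display_probe(int has_display_hw)
  {
          return has_display_hw ? 0 : -ENODEV;
  }

  static int driver_load(int has_display_hw)
  {
          int ret = display_probe(has_display_hw);

          if (ret == -ENODEV) {
                  printf("no display hw, continuing without modesetting\n");
                  return 0;       /* optional block missing: not a load failure */
          }
          return ret;
  }

  int main(void)
  {
          return driver_load(0);
  }
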
Fixes: 15740541e8f0 ("drm/nouveau/devinit/tu102-: prepare for GSP-RM") Link: https://gitlab.freedesktop.org/drm/nouveau/-/issues/270 Signed-off-by: Dave Airlie Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20231115143933.261287-1-airlied@gmail.com --- drivers/gpu/drm/nouveau/nouveau_display.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nouveau_display.c b/drivers/gpu/drm/nouveau/nouveau_display.c index d8c92521226d..f28f9a857458 100644 --- a/drivers/gpu/drm/nouveau/nouveau_display.c +++ b/drivers/gpu/drm/nouveau/nouveau_display.c @@ -726,6 +726,11 @@ nouveau_display_create(struct drm_device *dev) if (nouveau_modeset != 2) { ret = nvif_disp_ctor(&drm->client.device, "kmsDisp", 0, &disp->disp); + /* no display hw */ + if (ret == -ENODEV) { + ret = 0; + goto disp_create_err; + } if (!ret && (disp->disp.outp_mask || drm->vbios.dcb.entries)) { nouveau_display_create_properties(dev); -- cgit From 61cbc08fdb04fd445458b0f4cba7e6929afdfaef Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Wed, 15 Nov 2023 16:21:15 +0000 Subject: ALSA: hda/realtek: Add quirks for ASUS 2024 Zenbooks These ASUS Zenbook laptops use Realtek HDA codec combined with 2xCS35L41 Amplifiers using SPI or I2C with External Boost or Internal Boost. Signed-off-by: Stefan Binding Cc: Link: https://lore.kernel.org/r/20231115162116.494968-2-sbinding@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 3c85b8247c11..a1e124370283 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9947,13 +9947,17 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x19e1, "ASUS UX581LV", ALC295_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x1a13, "Asus G73Jw", ALC269_FIXUP_ASUS_G73JW), SND_PCI_QUIRK(0x1043, 0x1a30, "ASUS X705UD", ALC256_FIXUP_ASUS_MIC), + SND_PCI_QUIRK(0x1043, 0x1a63, "ASUS UX3405MA", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1a83, "ASUS UM5302LA", ALC294_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x1043, 0x1a8f, "ASUS UX582ZS", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1b11, "ASUS UX431DA", ALC294_FIXUP_ASUS_COEF_1B), SND_PCI_QUIRK(0x1043, 0x1b13, "Asus U41SV", ALC269_FIXUP_INV_DMIC), SND_PCI_QUIRK(0x1043, 0x1b93, "ASUS G614JVR/JIR", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1bbd, "ASUS Z550MA", ALC255_FIXUP_ASUS_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1043, 0x1c03, "ASUS UM3406HA", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x1043, 0x1c23, "Asus X55U", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), + SND_PCI_QUIRK(0x1043, 0x1c33, "ASUS UX5304MA", ALC245_FIXUP_CS35L41_SPI_2), + SND_PCI_QUIRK(0x1043, 0x1c43, "ASUS UX8406MA", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1c62, "ASUS GU603", ALC289_FIXUP_ASUS_GA401), SND_PCI_QUIRK(0x1043, 0x1c92, "ASUS ROG Strix G15", ALC285_FIXUP_ASUS_G533Z_PINS), SND_PCI_QUIRK(0x1043, 0x1c9f, "ASUS G614JI", ALC285_FIXUP_ASUS_HEADSET_MIC), -- cgit From 5d639b60971f003d3a9b2b31f8ec73b0718b5d57 Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Wed, 15 Nov 2023 16:21:16 +0000 Subject: ALSA: hda/realtek: Add quirks for HP Laptops These HP laptops use Realtek HDA codec combined with 2 or 4 CS35L41 Amplifiers using SPI with Internal Boost. 
Signed-off-by: Stefan Binding Cc: Link: https://lore.kernel.org/r/20231115162116.494968-3-sbinding@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index a1e124370283..5618b1d9bfd1 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9902,6 +9902,9 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8c70, "HP EliteBook 835 G11", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8c71, "HP EliteBook 845 G11", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8c72, "HP EliteBook 865 G11", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8ca4, "HP ZBook Fury", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8ca7, "HP ZBook Fury", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8cf5, "HP ZBook Studio 16", ALC245_FIXUP_CS35L41_SPI_4_HP_GPIO_LED), SND_PCI_QUIRK(0x1043, 0x103e, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC), SND_PCI_QUIRK(0x1043, 0x103f, "ASUS TX300", ALC282_FIXUP_ASUS_TX300), SND_PCI_QUIRK(0x1043, 0x106d, "Asus K53BE", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), -- cgit From 85c2ceaafbd306814a3a4740bf4d95ac26a8b36a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 6 Nov 2023 17:07:40 +0300 Subject: mm/damon/sysfs: eliminate potential uninitialized variable warning The "err" variable is not initialized if damon_target_has_pid(ctx) is false and sys_target->regions->nr is zero. Link: https://lkml.kernel.org/r/739e6aaf-a634-4e33-98a8-16546379ec9f@moroto.mountain Fixes: 0bcd216c4741 ("mm/damon/sysfs: update monitoring target regions for online input commit") Signed-off-by: Dan Carpenter Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index e27846708b5a..1dfa96d4de99 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1172,7 +1172,7 @@ static int damon_sysfs_update_target(struct damon_target *target, struct damon_ctx *ctx, struct damon_sysfs_target *sys_target) { - int err; + int err = 0; if (damon_target_has_pid(ctx)) { err = damon_sysfs_update_target_pid(target, sys_target->pid); -- cgit From 019b277b680f5b95135c042c78dd79318d8f9e3c Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 3 Nov 2023 23:23:41 +0500 Subject: selftests: mm: skip whole test instead of failure Some architectures don't support userfaultfd. Skip running the whole test on them instead of registering the failure. 
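In the patch below, init_uffd() now reports failure to its caller instead of calling ksft_exit_fail_msg(), and main() turns that into an early, clean exit before the test plan is declared. As a rough stand-alone sketch of the same reporting idea (plain exit codes instead of the kselftest helpers; the KSFT_SKIP value of 4 is an assumption of this sketch):

  #include <stdio.h>
  #include <stdlib.h>

  #define EXIT_SKIP 4     /* kselftest-style "skip" exit code, assumed here */

  /* Probe the optional kernel feature; return an error instead of exiting. */
  static int init_feature(void)
  {
          /* e.g. a userfaultfd() probe would go here; pretend it is missing */
          return -1;
  }

  int main(void)
  {
          if (init_feature()) {
                  printf("feature not supported, skipping whole test\n");
                  return EXIT_SKIP;       /* skipped, not failed */
          }

          /* ... declare the plan and run the actual tests here ... */
          return EXIT_SUCCESS;
  }
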
Link: https://lkml.kernel.org/r/20231103182343.2874015-1-usama.anjum@collabora.com Fixes: 46fd75d4a3c9 ("selftests: mm: add pagemap ioctl tests") Reported-by: Ryan Roberts Closes: https://lore.kernel.org/all/f8463381-2697-49e9-9460-9dc73452830d@arm.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/pagemap_ioctl.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index 0161fb49fc6e..f8685a2ea07e 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -94,19 +94,19 @@ int init_uffd(void) uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY); if (uffd == -1) - ksft_exit_fail_msg("uffd syscall failed\n"); + return uffd; uffdio_api.api = UFFD_API; uffdio_api.features = UFFD_FEATURE_WP_UNPOPULATED | UFFD_FEATURE_WP_ASYNC | UFFD_FEATURE_WP_HUGETLBFS_SHMEM; if (ioctl(uffd, UFFDIO_API, &uffdio_api)) - ksft_exit_fail_msg("UFFDIO_API\n"); + return -1; if (!(uffdio_api.api & UFFDIO_REGISTER_MODE_WP) || !(uffdio_api.features & UFFD_FEATURE_WP_UNPOPULATED) || !(uffdio_api.features & UFFD_FEATURE_WP_ASYNC) || !(uffdio_api.features & UFFD_FEATURE_WP_HUGETLBFS_SHMEM)) - ksft_exit_fail_msg("UFFDIO_API error %llu\n", uffdio_api.api); + return -1; return 0; } @@ -1479,6 +1479,10 @@ int main(void) struct stat sbuf; ksft_print_header(); + + if (init_uffd()) + return ksft_exit_pass(); + ksft_set_plan(115); page_size = getpagesize(); @@ -1488,9 +1492,6 @@ int main(void) if (pagemap_fd < 0) return -EINVAL; - if (init_uffd()) - ksft_exit_fail_msg("uffd init failed\n"); - /* 1. Sanity testing */ sanity_tests_sd(); -- cgit From 9297e5360c3bd777f95d5146dbeda7fb9ba4273a Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 3 Nov 2023 23:23:42 +0500 Subject: selftests: mm: fix some build warnings Fix build warnings: pagemap_ioctl.c:1154:38: warning: format `%s' expects a matching `char *' argument [-Wformat=] pagemap_ioctl.c:1162:51: warning: format `%ld' expects argument of type `long int', but argument 2 has type `int' [-Wformat=] pagemap_ioctl.c:1192:51: warning: format `%ld' expects argument of type `long int', but argument 2 has type `int' [-Wformat=] pagemap_ioctl.c:1600:51: warning: format `%ld' expects argument of type `long int', but argument 2 has type `int' [-Wformat=] pagemap_ioctl.c:1628:51: warning: format `%ld' expects argument of type `long int', but argument 2 has type `int' [-Wformat=] Link: https://lkml.kernel.org/r/20231103182343.2874015-2-usama.anjum@collabora.com Fixes: 46fd75d4a3c9 ("selftests: mm: add pagemap ioctl tests") Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Cc: Ryan Roberts Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/pagemap_ioctl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index f8685a2ea07e..befab43719ba 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -1151,7 +1151,7 @@ int sanity_tests(void) /* 9. 
Memory mapped file */ fd = open(__FILE__, O_RDONLY); if (fd < 0) - ksft_exit_fail_msg("%s Memory mapped file\n"); + ksft_exit_fail_msg("%s Memory mapped file\n", __func__); ret = stat(__FILE__, &sbuf); if (ret < 0) @@ -1159,7 +1159,7 @@ int sanity_tests(void) fmem = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); if (fmem == MAP_FAILED) - ksft_exit_fail_msg("error nomem %ld %s\n", errno, strerror(errno)); + ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno)); tmp_buf = malloc(sbuf.st_size); memcpy(tmp_buf, fmem, sbuf.st_size); @@ -1189,7 +1189,7 @@ int sanity_tests(void) fmem = mmap(NULL, buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); if (fmem == MAP_FAILED) - ksft_exit_fail_msg("error nomem %ld %s\n", errno, strerror(errno)); + ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno)); wp_init(fmem, buf_size); wp_addr_range(fmem, buf_size); @@ -1596,7 +1596,7 @@ int main(void) fmem = mmap(NULL, sbuf.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); if (fmem == MAP_FAILED) - ksft_exit_fail_msg("error nomem %ld %s\n", errno, strerror(errno)); + ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno)); wp_init(fmem, sbuf.st_size); wp_addr_range(fmem, sbuf.st_size); @@ -1624,7 +1624,7 @@ int main(void) fmem = mmap(NULL, buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); if (fmem == MAP_FAILED) - ksft_exit_fail_msg("error nomem %ld %s\n", errno, strerror(errno)); + ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno)); wp_init(fmem, buf_size); wp_addr_range(fmem, buf_size); -- cgit From dd9b35efd719be242e227f9eebad1e50ea5c914f Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 3 Nov 2023 10:33:59 -0700 Subject: selftests/mm: restore number of hugepages The test mm `hugetlb_fault_after_madv` selftest needs one and only one huge page to run, thus it sets `/proc/sys/vm/nr_hugepages` to 1. The problem is that further tests require the previous number of hugepages allocated in order to succeed. Save the number of huge pages before changing it, and restore it once the test finishes, so, further tests could run successfully. Link: https://lkml.kernel.org/r/20231103173400.1608403-1-leitao@debian.org Fixes: 116d57303a05 ("selftests/mm: add a new test for madv and hugetlb") Signed-off-by: Breno Leitao Reported-by: Ryan Roberts Closes: https://lore.kernel.org/all/662df57e-47f1-4c15-9b84-f2f2d587fc5c@arm.com/ Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index cc16f6ca8533..00757445278e 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -223,9 +223,12 @@ CATEGORY="hugetlb" run_test ./hugepage-mremap CATEGORY="hugetlb" run_test ./hugepage-vmemmap CATEGORY="hugetlb" run_test ./hugetlb-madvise +nr_hugepages_tmp=$(cat /proc/sys/vm/nr_hugepages) # For this test, we need one and just one huge page echo 1 > /proc/sys/vm/nr_hugepages CATEGORY="hugetlb" run_test ./hugetlb_fault_after_madv +# Restore the previous number of huge pages, since further tests rely on it +echo "$nr_hugepages_tmp" > /proc/sys/vm/nr_hugepages if test_selected "hugetlb"; then echo "NOTE: These hugetlb tests provide minimal coverage. 
Use" -- cgit From edf14544324dd036183fafe372fe5709708bdddd Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 3 Nov 2023 10:34:00 -0700 Subject: selftests/mm: add hugetlb_fault_after_madv to .gitignore commit 116d57303a05 ("selftests/mm: add a new test for madv and hugetlb") added a new test case, but, it didn't add the binary name in tools/testing/selftests/mm/.gitignore. Add hugetlb_fault_after_madv to tools/testing/selftests/mm/.gitignore. Link: https://lkml.kernel.org/r/20231103173400.1608403-2-leitao@debian.org Fixes: 116d57303a05 ("selftests/mm: add a new test for madv and hugetlb") Signed-off-by: Breno Leitao Reported-by: Ryan Roberts Closes: https://lore.kernel.org/all/662df57e-47f1-4c15-9b84-f2f2d587fc5c@arm.com/ Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index cc920c79ff1c..4ff10ea61461 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -45,3 +45,4 @@ mdwe_test gup_longterm mkdirty va_high_addr_switch +hugetlb_fault_after_madv -- cgit From a48d5bdc877b85201e42cef9c2fdf5378164c23a Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Mon, 6 Nov 2023 10:19:18 -0800 Subject: mm: fix for negative counter: nr_file_hugepages While qualifiying the 6.4 release, the following warning was detected in messages: vmstat_refresh: nr_file_hugepages -15664 The warning is caused by the incorrect updating of the NR_FILE_THPS counter in the function split_huge_page_to_list. The if case is checking for folio_test_swapbacked, but the else case is missing the check for folio_test_pmd_mappable. The other functions that manipulate the counter like __filemap_add_folio and filemap_unaccount_folio have the corresponding check. I have a test case, which reproduces the problem. It can be found here: https://github.com/sroeschus/testcase/blob/main/vmstat_refresh/madv.c The test case reproduces on an XFS filesystem. Running the same test case on a BTRFS filesystem does not reproduce the problem. AFAIK version 6.1 until 6.6 are affected by this problem. 
[akpm@linux-foundation.org: whitespace fix] [shr@devkernel.io: test for folio_test_pmd_mappable()] Link: https://lkml.kernel.org/r/20231108171517.2436103-1-shr@devkernel.io Link: https://lkml.kernel.org/r/20231106181918.1091043-1-shr@devkernel.io Signed-off-by: Stefan Roesch Co-debugged-by: Johannes Weiner Acked-by: Johannes Weiner Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Reviewed-by: Yang Shi Cc: Rik van Riel Cc: Signed-off-by: Andrew Morton --- mm/huge_memory.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f31f02472396..4f542444a91f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2769,13 +2769,15 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) int nr = folio_nr_pages(folio); xas_split(&xas, folio, folio_order(folio)); - if (folio_test_swapbacked(folio)) { - __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, - -nr); - } else { - __lruvec_stat_mod_folio(folio, NR_FILE_THPS, - -nr); - filemap_nr_thps_dec(mapping); + if (folio_test_pmd_mappable(folio)) { + if (folio_test_swapbacked(folio)) { + __lruvec_stat_mod_folio(folio, + NR_SHMEM_THPS, -nr); + } else { + __lruvec_stat_mod_folio(folio, + NR_FILE_THPS, -nr); + filemap_nr_thps_dec(mapping); + } } } -- cgit From b4936b544b08ed44949055b92bd25f77759ebafc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Nov 2023 23:34:06 +0000 Subject: mm/damon/sysfs: check error from damon_sysfs_update_target() Patch series "mm/damon/sysfs: fix unhandled return values". Some of DAMON sysfs interface code is not handling return values from some functions. As a result, confusing user input handling or NULL-dereference is possible. Check those properly. This patch (of 3): damon_sysfs_update_target() returns error code for failures, but its caller, damon_sysfs_set_targets() is ignoring that. The update function seems making no critical change in case of such failures, but the behavior will look like DAMON sysfs is silently ignoring or only partially accepting the user input. Fix it. Link: https://lkml.kernel.org/r/20231106233408.51159-1-sj@kernel.org Link: https://lkml.kernel.org/r/20231106233408.51159-2-sj@kernel.org Fixes: 19467a950b49 ("mm/damon/sysfs: remove requested targets when online-commit inputs") Signed-off-by: SeongJae Park Cc: [5.19+] Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 1dfa96d4de99..7472404456aa 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1203,8 +1203,10 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, damon_for_each_target_safe(t, next, ctx) { if (i < sysfs_targets->nr) { - damon_sysfs_update_target(t, ctx, + err = damon_sysfs_update_target(t, ctx, sysfs_targets->targets_arr[i]); + if (err) + return err; } else { if (damon_target_has_pid(ctx)) put_pid(t->pid); -- cgit From 84055688b6bc075c92a88e2d6c3ad26ab93919f9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Nov 2023 23:34:07 +0000 Subject: mm/damon/sysfs-schemes: handle tried regions sysfs directory allocation failure DAMOS tried regions sysfs directory allocation function (damon_sysfs_scheme_regions_alloc()) is not handling the memory allocation failure. In the case, the code will dereference NULL pointer. Handle the failure to avoid such invalid access. 
Link: https://lkml.kernel.org/r/20231106233408.51159-3-sj@kernel.org Fixes: 9277d0367ba1 ("mm/damon/sysfs-schemes: implement scheme region directory") Signed-off-by: SeongJae Park Cc: [6.2+] Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 45bd0fd4a8b1..7413cb35c5a9 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -162,6 +162,9 @@ damon_sysfs_scheme_regions_alloc(void) struct damon_sysfs_scheme_regions *regions = kmalloc(sizeof(*regions), GFP_KERNEL); + if (!regions) + return NULL; + regions->kobj = (struct kobject){}; INIT_LIST_HEAD(®ions->regions_list); regions->nr_regions = 0; -- cgit From ae636ae2bbfd9279f5681dbf320d1da817e52b68 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Nov 2023 23:34:08 +0000 Subject: mm/damon/sysfs-schemes: handle tried region directory allocation failure DAMON sysfs interface's before_damos_apply callback (damon_sysfs_before_damos_apply()), which creates the DAMOS tried regions for each DAMOS action applied region, is not handling the allocation failure for the sysfs directory data. As a result, NULL pointer derefeence is possible. Fix it by handling the case. Link: https://lkml.kernel.org/r/20231106233408.51159-4-sj@kernel.org Fixes: f1d13cacabe1 ("mm/damon/sysfs: implement DAMOS tried regions update command") Signed-off-by: SeongJae Park Cc: [6.2+] Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 7413cb35c5a9..be667236b8e6 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1826,6 +1826,8 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, return 0; region = damon_sysfs_scheme_region_alloc(r); + if (!region) + return 0; list_add_tail(®ion->list, &sysfs_regions->regions_list); sysfs_regions->nr_regions++; if (kobject_init_and_add(®ion->kobj, -- cgit From 24948e3b7b12e0031a6edb4f49bbb9fb2ad1e4e9 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 7 Nov 2023 09:18:02 -0800 Subject: mm: kmem: drop __GFP_NOFAIL when allocating objcg vectors Objcg vectors attached to slab pages to store slab object ownership information are allocated using gfp flags for the original slab allocation. Depending on slab page order and the size of slab objects, objcg vector can take several pages. If the original allocation was done with the __GFP_NOFAIL flag, it triggered a warning in the page allocation code. Indeed, order > 1 pages should not been allocated with the __GFP_NOFAIL flag. Fix this by simply dropping the __GFP_NOFAIL flag when allocating the objcg vector. It effectively allows to skip the accounting of a single slab object under a heavy memory pressure. An alternative would be to implement the mechanism to fallback to order-0 allocations for accounting metadata, which is also not perfect because it will increase performance penalty and memory footprint of the kernel memory accounting under memory pressure. 
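The fix itself is a flag-masking pattern: the metadata allocation inherits the gfp flags of the original slab allocation, with the bits that are not valid for it cleared first. A sketch of the idea using invented flag values (not the real gfp bit encoding):

  #include <stdio.h>

  /* Invented flag values, only to show the masking pattern. */
  #define F_DMA           0x01u
  #define F_RECLAIMABLE   0x02u
  #define F_ACCOUNT       0x04u
  #define F_NOFAIL        0x08u

  /* Bits that must not leak from the slab allocation into its metadata. */
  #define META_CLEAR_MASK (F_DMA | F_RECLAIMABLE | F_ACCOUNT | F_NOFAIL)

  int main(void)
  {
          unsigned int slab_flags = F_ACCOUNT | F_NOFAIL;
          unsigned int meta_flags = slab_flags & ~META_CLEAR_MASK;

          printf("slab flags %#x -> metadata flags %#x\n", slab_flags, meta_flags);
          return 0;
  }
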
Link: https://lkml.kernel.org/r/ZUp8ZFGxwmCx4ZFr@P9FQF9L96D.corp.robot.car Signed-off-by: Roman Gushchin Reported-by: Christoph Lameter Closes: https://lkml.kernel.org/r/6b42243e-f197-600a-5d22-56bd728a5ad8@gentwo.org Acked-by: Shakeel Butt Cc: Matthew Wilcox Cc: Signed-off-by: Andrew Morton --- mm/memcontrol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 774bd6e21e27..1c1061df9cd1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2936,7 +2936,8 @@ void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg) * Moreover, it should not come from DMA buffer and is not readily * reclaimable. So those GFP bits should be masked off. */ -#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT) +#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ + __GFP_ACCOUNT | __GFP_NOFAIL) /* * mod_objcg_mlstate() may be called with irq enabled, so -- cgit From 13b2a4b22e98ff80b888a160a2acd92d81b05925 Mon Sep 17 00:00:00 2001 From: Hyeongtak Ji Date: Fri, 10 Nov 2023 14:37:09 +0900 Subject: mm/damon/core.c: avoid unintentional filtering out of schemes The function '__damos_filter_out()' causes DAMON to always filter out schemes whose filter type is anon or memcg if its matching value is set to false. This commit addresses the issue by ensuring that '__damos_filter_out()' no longer applies to filters whose type is 'anon' or 'memcg'. Link: https://lkml.kernel.org/r/1699594629-3816-1-git-send-email-hyeongtak.ji@gmail.com Fixes: ab9bda001b681 ("mm/damon/core: introduce address range type damos filter") Signed-off-by: Hyeongtak Ji Reviewed-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- mm/damon/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 630077d95dc6..6262d55904e7 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -924,7 +924,7 @@ static bool __damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, matched = true; break; default: - break; + return false; } return matched == filter->matching; -- cgit From 5f74f820f6fc844b95f9e5e406e0a07d97510420 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 13 Nov 2023 11:12:57 +0100 Subject: parisc: fix mmap_base calculation when stack grows upwards Matoro reported various userspace crashes on the parisc platform with kernel 6.6 and bisected it to commit 3033cd430768 ("parisc: Use generic mmap top-down layout and brk randomization"). That commit switched parisc to use the common infrastructure to calculate mmap_base, but missed that the mmap_base() function takes care for architectures where the stack grows downwards only. Fix the mmap_base() calculation to include the stack-grows-upwards case and thus fix the userspace crashes on parisc. 
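A stand-alone sketch of the two layouts (user-space C with invented constants, not the exact kernel formula): with a downward-growing stack the mmap area has to start a stack-sized gap below STACK_TOP, while with an upward-growing stack it can start directly below the reserved stack ceiling.

  #include <stdio.h>

  #define PAGE_SIZE               4096UL
  #define PAGE_ALIGN_DOWN(x)      ((x) & ~(PAGE_SIZE - 1))
  #define PAGE_ALIGN(x)           PAGE_ALIGN_DOWN((x) + PAGE_SIZE - 1)

  /* Invented example addresses. */
  #define STACK_TOP               0xf8000000UL
  #define STACK_CEILING           0xf0000000UL   /* plays the role of mmap_upper_limit() */

  /* Stack grows down: reserve the stack rlimit (plus guard gap) below STACK_TOP. */
  static unsigned long mmap_base_growsdown(unsigned long gap, unsigned long rnd)
  {
          return PAGE_ALIGN(STACK_TOP - gap - rnd);
  }

  /* Stack grows up: mmap_base sits directly below the reserved stack area. */
  static unsigned long mmap_base_growsup(unsigned long rnd)
  {
          return PAGE_ALIGN_DOWN(STACK_CEILING - rnd);
  }

  int main(void)
  {
          printf("grows-down base: %#lx\n", mmap_base_growsdown(8UL << 20, 0x3000));
          printf("grows-up   base: %#lx\n", mmap_base_growsup(0x3000));
          return 0;
  }
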
Link: https://lkml.kernel.org/r/ZVH2qeS1bG7/1J/l@p100 Fixes: 3033cd430768 ("parisc: Use generic mmap top-down layout and brk randomization") Signed-off-by: Helge Deller Reported-by: matoro Tested-by: matoro Cc: [6.6+] Signed-off-by: Andrew Morton --- arch/parisc/Kconfig | 6 +++--- arch/parisc/include/asm/elf.h | 10 +--------- arch/parisc/include/asm/processor.h | 2 ++ arch/parisc/kernel/sys_parisc.c | 2 +- mm/util.c | 10 ++++++++++ 5 files changed, 17 insertions(+), 13 deletions(-) diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index fd69dfa0cdab..a7c9c0e69e5a 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -140,11 +140,11 @@ config ARCH_MMAP_RND_COMPAT_BITS_MIN default 8 config ARCH_MMAP_RND_BITS_MAX - default 24 if 64BIT - default 17 + default 18 if 64BIT + default 13 config ARCH_MMAP_RND_COMPAT_BITS_MAX - default 17 + default 13 # unless you want to implement ACPI on PA-RISC ... ;-) config PM diff --git a/arch/parisc/include/asm/elf.h b/arch/parisc/include/asm/elf.h index 140eaa97bf21..2d73d3c3cd37 100644 --- a/arch/parisc/include/asm/elf.h +++ b/arch/parisc/include/asm/elf.h @@ -349,15 +349,7 @@ struct pt_regs; /* forward declaration... */ #define ELF_HWCAP 0 -/* Masks for stack and mmap randomization */ -#define BRK_RND_MASK (is_32bit_task() ? 0x07ffUL : 0x3ffffUL) -#define MMAP_RND_MASK (is_32bit_task() ? 0x1fffUL : 0x3ffffUL) -#define STACK_RND_MASK MMAP_RND_MASK - -struct mm_struct; -extern unsigned long arch_randomize_brk(struct mm_struct *); -#define arch_randomize_brk arch_randomize_brk - +#define STACK_RND_MASK 0x7ff /* 8MB of VA */ #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 struct linux_binprm; diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index c05d121cf5d0..982aca20f56f 100644 --- a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -47,6 +47,8 @@ #ifndef __ASSEMBLY__ +struct rlimit; +unsigned long mmap_upper_limit(struct rlimit *rlim_stack); unsigned long calc_max_stack_size(unsigned long stack_max); /* diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c index ab896eff7a1d..98af719d5f85 100644 --- a/arch/parisc/kernel/sys_parisc.c +++ b/arch/parisc/kernel/sys_parisc.c @@ -77,7 +77,7 @@ unsigned long calc_max_stack_size(unsigned long stack_max) * indicating that "current" should be used instead of a passed-in * value from the exec bprm as done with arch_pick_mmap_layout(). */ -static unsigned long mmap_upper_limit(struct rlimit *rlim_stack) +unsigned long mmap_upper_limit(struct rlimit *rlim_stack) { unsigned long stack_base; diff --git a/mm/util.c b/mm/util.c index aa01f6ea5a75..744b4d7e3fae 100644 --- a/mm/util.c +++ b/mm/util.c @@ -414,6 +414,15 @@ static int mmap_is_legacy(struct rlimit *rlim_stack) static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) { +#ifdef CONFIG_STACK_GROWSUP + /* + * For an upwards growing stack the calculation is much simpler. + * Memory for the maximum stack size is reserved at the top of the + * task. mmap_base starts directly below the stack and grows + * downwards. 
+ */ + return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd); +#else unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_guard_gap; @@ -431,6 +440,7 @@ static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) gap = MAX_GAP; return PAGE_ALIGN(STACK_TOP - gap - rnd); +#endif } void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) -- cgit From afccb0804fc74ac2f6737af6a139632606cb461d Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Tue, 14 Nov 2023 15:49:45 +0000 Subject: mm: more ptep_get() conversion Commit c33c794828f2 ("mm: ptep_get() conversion") converted all (non-arch) call sites to use ptep_get() instead of doing a direct dereference of the pte. Full rationale can be found in that commit's log. Since then, three new call sites have snuck in, which directly dereference the pte, so let's fix those up. Unfortunately there is no reliable automated mechanism to catch these; I'm relying on a combination of Coccinelle (which throws up a lot of false positives) and some compiler magic to force a compiler error on dereference (While this approach finds dereferences, it also yields a non-booting kernel so can't be committed). Link: https://lkml.kernel.org/r/20231114154945.490401-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 2 +- mm/ksm.c | 2 +- mm/userfaultfd.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 9710f43a89ac..32eedf3afd45 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3443,7 +3443,7 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, * handled in the specific fault path, and it'll prohibit the * fault-around logic. */ - if (!pte_none(vmf->pte[count])) + if (!pte_none(ptep_get(&vmf->pte[count]))) goto skip; count++; diff --git a/mm/ksm.c b/mm/ksm.c index 7efcc68ccc6e..6a831009b4cb 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -468,7 +468,7 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex page = pfn_swap_entry_to_page(entry); } /* return 1 if the page is an normal ksm page or KSM-placed zero page */ - ret = (page && PageKsm(page)) || is_ksm_zero_pte(*pte); + ret = (page && PageKsm(page)) || is_ksm_zero_pte(ptent); pte_unmap_unlock(pte, ptl); return ret; } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 96d9eae5c7cc..0b6ca553bebe 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -312,7 +312,7 @@ static int mfill_atomic_pte_poison(pmd_t *dst_pmd, ret = -EEXIST; /* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */ - if (!pte_none(*dst_pte)) + if (!pte_none(ptep_get(dst_pte))) goto out_unlock; set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); -- cgit From df3aafe501853c92bc9e25b05dcb030fee072962 Mon Sep 17 00:00:00 2001 From: Itamar Gozlan Date: Tue, 14 Nov 2023 13:58:32 -0800 Subject: Revert "net/mlx5: DR, Supporting inline WQE when possible" This reverts commit 95c337cce0e11d06a715da73e6796ade9216637f. The revert is required due to the suspicion it cause some tests fail and will be moved to further investigation. 
Fixes: 95c337cce0e1 ("net/mlx5: DR, Supporting inline WQE when possible") Signed-off-by: Itamar Gozlan Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-2-saeed@kernel.org Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlx5/core/steering/dr_send.c | 115 +++------------------ 1 file changed, 13 insertions(+), 102 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c index 4e8527a724f5..6fa06ba2d346 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c @@ -52,7 +52,6 @@ struct dr_qp_init_attr { u32 cqn; u32 pdn; u32 max_send_wr; - u32 max_send_sge; struct mlx5_uars_page *uar; u8 isolate_vl_tc:1; }; @@ -247,37 +246,6 @@ static int dr_poll_cq(struct mlx5dr_cq *dr_cq, int ne) return err == CQ_POLL_ERR ? err : npolled; } -static int dr_qp_get_args_update_send_wqe_size(struct dr_qp_init_attr *attr) -{ - return roundup_pow_of_two(sizeof(struct mlx5_wqe_ctrl_seg) + - sizeof(struct mlx5_wqe_flow_update_ctrl_seg) + - sizeof(struct mlx5_wqe_header_modify_argument_update_seg)); -} - -/* We calculate for specific RC QP with the required functionality */ -static int dr_qp_calc_rc_send_wqe(struct dr_qp_init_attr *attr) -{ - int update_arg_size; - int inl_size = 0; - int tot_size; - int size; - - update_arg_size = dr_qp_get_args_update_send_wqe_size(attr); - - size = sizeof(struct mlx5_wqe_ctrl_seg) + - sizeof(struct mlx5_wqe_raddr_seg); - inl_size = size + ALIGN(sizeof(struct mlx5_wqe_inline_seg) + - DR_STE_SIZE, 16); - - size += attr->max_send_sge * sizeof(struct mlx5_wqe_data_seg); - - size = max(size, update_arg_size); - - tot_size = max(size, inl_size); - - return ALIGN(tot_size, MLX5_SEND_WQE_BB); -} - static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev, struct dr_qp_init_attr *attr) { @@ -285,7 +253,6 @@ static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev, u32 temp_qpc[MLX5_ST_SZ_DW(qpc)] = {}; struct mlx5_wq_param wqp; struct mlx5dr_qp *dr_qp; - int wqe_size; int inlen; void *qpc; void *in; @@ -365,15 +332,6 @@ static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev, if (err) goto err_in; dr_qp->uar = attr->uar; - wqe_size = dr_qp_calc_rc_send_wqe(attr); - dr_qp->max_inline_data = min(wqe_size - - (sizeof(struct mlx5_wqe_ctrl_seg) + - sizeof(struct mlx5_wqe_raddr_seg) + - sizeof(struct mlx5_wqe_inline_seg)), - (2 * MLX5_SEND_WQE_BB - - (sizeof(struct mlx5_wqe_ctrl_seg) + - sizeof(struct mlx5_wqe_raddr_seg) + - sizeof(struct mlx5_wqe_inline_seg)))); return dr_qp; @@ -437,48 +395,8 @@ dr_rdma_handle_flow_access_arg_segments(struct mlx5_wqe_ctrl_seg *wq_ctrl, MLX5_SEND_WQE_DS; } -static int dr_set_data_inl_seg(struct mlx5dr_qp *dr_qp, - struct dr_data_seg *data_seg, void *wqe) -{ - int inline_header_size = sizeof(struct mlx5_wqe_ctrl_seg) + - sizeof(struct mlx5_wqe_raddr_seg) + - sizeof(struct mlx5_wqe_inline_seg); - struct mlx5_wqe_inline_seg *seg; - int left_space; - int inl = 0; - void *addr; - int len; - int idx; - - seg = wqe; - wqe += sizeof(*seg); - addr = (void *)(unsigned long)(data_seg->addr); - len = data_seg->length; - inl += len; - left_space = MLX5_SEND_WQE_BB - inline_header_size; - - if (likely(len > left_space)) { - memcpy(wqe, addr, left_space); - len -= left_space; - addr += left_space; - idx = (dr_qp->sq.pc + 1) & (dr_qp->sq.wqe_cnt - 1); - wqe = mlx5_wq_cyc_get_wqe(&dr_qp->wq.sq, idx); - } - - memcpy(wqe, addr, len); - - if 
(likely(inl)) { - seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG); - return DIV_ROUND_UP(inl + sizeof(seg->byte_count), - MLX5_SEND_WQE_DS); - } else { - return 0; - } -} - static void -dr_rdma_handle_icm_write_segments(struct mlx5dr_qp *dr_qp, - struct mlx5_wqe_ctrl_seg *wq_ctrl, +dr_rdma_handle_icm_write_segments(struct mlx5_wqe_ctrl_seg *wq_ctrl, u64 remote_addr, u32 rkey, struct dr_data_seg *data_seg, @@ -494,17 +412,15 @@ dr_rdma_handle_icm_write_segments(struct mlx5dr_qp *dr_qp, wq_raddr->reserved = 0; wq_dseg = (void *)(wq_raddr + 1); - /* WQE ctrl segment + WQE remote addr segment */ - *size = (sizeof(*wq_ctrl) + sizeof(*wq_raddr)) / MLX5_SEND_WQE_DS; - if (data_seg->send_flags & IB_SEND_INLINE) { - *size += dr_set_data_inl_seg(dr_qp, data_seg, wq_dseg); - } else { - wq_dseg->byte_count = cpu_to_be32(data_seg->length); - wq_dseg->lkey = cpu_to_be32(data_seg->lkey); - wq_dseg->addr = cpu_to_be64(data_seg->addr); - *size += sizeof(*wq_dseg) / MLX5_SEND_WQE_DS; /* WQE data segment */ - } + wq_dseg->byte_count = cpu_to_be32(data_seg->length); + wq_dseg->lkey = cpu_to_be32(data_seg->lkey); + wq_dseg->addr = cpu_to_be64(data_seg->addr); + + *size = (sizeof(*wq_ctrl) + /* WQE ctrl segment */ + sizeof(*wq_dseg) + /* WQE data segment */ + sizeof(*wq_raddr)) / /* WQE remote addr segment */ + MLX5_SEND_WQE_DS; } static void dr_set_ctrl_seg(struct mlx5_wqe_ctrl_seg *wq_ctrl, @@ -535,7 +451,7 @@ static void dr_rdma_segments(struct mlx5dr_qp *dr_qp, u64 remote_addr, switch (opcode) { case MLX5_OPCODE_RDMA_READ: case MLX5_OPCODE_RDMA_WRITE: - dr_rdma_handle_icm_write_segments(dr_qp, wq_ctrl, remote_addr, + dr_rdma_handle_icm_write_segments(wq_ctrl, remote_addr, rkey, data_seg, &size); break; case MLX5_OPCODE_FLOW_TBL_ACCESS: @@ -656,7 +572,7 @@ static void dr_fill_write_args_segs(struct mlx5dr_send_ring *send_ring, if (send_ring->pending_wqe % send_ring->signal_th == 0) send_info->write.send_flags |= IB_SEND_SIGNALED; else - send_info->write.send_flags &= ~IB_SEND_SIGNALED; + send_info->write.send_flags = 0; } static void dr_fill_write_icm_segs(struct mlx5dr_domain *dmn, @@ -680,13 +596,9 @@ static void dr_fill_write_icm_segs(struct mlx5dr_domain *dmn, } send_ring->pending_wqe++; - if (!send_info->write.lkey) - send_info->write.send_flags |= IB_SEND_INLINE; if (send_ring->pending_wqe % send_ring->signal_th == 0) send_info->write.send_flags |= IB_SEND_SIGNALED; - else - send_info->write.send_flags &= ~IB_SEND_SIGNALED; send_ring->pending_wqe++; send_info->read.length = send_info->write.length; @@ -696,9 +608,9 @@ static void dr_fill_write_icm_segs(struct mlx5dr_domain *dmn, send_info->read.lkey = send_ring->sync_mr->mkey; if (send_ring->pending_wqe % send_ring->signal_th == 0) - send_info->read.send_flags |= IB_SEND_SIGNALED; + send_info->read.send_flags = IB_SEND_SIGNALED; else - send_info->read.send_flags &= ~IB_SEND_SIGNALED; + send_info->read.send_flags = 0; } static void dr_fill_data_segs(struct mlx5dr_domain *dmn, @@ -1345,7 +1257,6 @@ int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn) dmn->send_ring->cq->qp = dmn->send_ring->qp; dmn->info.max_send_wr = QUEUE_SIZE; - init_attr.max_send_sge = 1; dmn->info.max_inline_size = min(dmn->send_ring->qp->max_inline_data, DR_STE_SIZE); -- cgit From 7d2f74d1d4385a5bcf90618537f16a45121c30ae Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Tue, 14 Nov 2023 13:58:33 -0800 Subject: net/mlx5: Free used cpus mask when an IRQ is released Each EQ table maintains a cpumask of the already used CPUs that are mapped to IRQs to ensure that each IRQ gets 
mapped to a unique CPU. However, on IRQ release, the said cpumask is not updated by clearing the CPU from the mask to allow future IRQ request, causing the following error when a SF is reloaded after it has utilized all CPUs for its IRQs: mlx5_irq_affinity_request:135:(pid 306010): Didn't find a matching IRQ. err = -28 Thus, when releasing an IRQ, clear its mapped CPU from the used CPUs mask, to prevent the case described above. While at it, move the used cpumask update to the EQ layer as it is more fitting and preserves symmetricity of the IRQ request/release API. Fixes: a1772de78d73 ("net/mlx5: Refactor completion IRQ request/release API") Signed-off-by: Maher Sanalla Reviewed-by: Tariq Toukan Reviewed-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-3-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 25 +++++++++---- .../net/ethernet/mellanox/mlx5/core/irq_affinity.c | 42 ---------------------- 2 files changed, 19 insertions(+), 48 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index ea0405e0a43f..40a6cb052a2d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -885,11 +885,14 @@ static void comp_irq_release_sf(struct mlx5_core_dev *dev, u16 vecidx) { struct mlx5_eq_table *table = dev->priv.eq_table; struct mlx5_irq *irq; + int cpu; irq = xa_load(&table->comp_irqs, vecidx); if (!irq) return; + cpu = cpumask_first(mlx5_irq_get_affinity_mask(irq)); + cpumask_clear_cpu(cpu, &table->used_cpus); xa_erase(&table->comp_irqs, vecidx); mlx5_irq_affinity_irq_release(dev, irq); } @@ -897,16 +900,26 @@ static void comp_irq_release_sf(struct mlx5_core_dev *dev, u16 vecidx) static int comp_irq_request_sf(struct mlx5_core_dev *dev, u16 vecidx) { struct mlx5_eq_table *table = dev->priv.eq_table; + struct mlx5_irq_pool *pool = mlx5_irq_pool_get(dev); + struct irq_affinity_desc af_desc = {}; struct mlx5_irq *irq; - irq = mlx5_irq_affinity_irq_request_auto(dev, &table->used_cpus, vecidx); - if (IS_ERR(irq)) { - /* In case SF irq pool does not exist, fallback to the PF irqs*/ - if (PTR_ERR(irq) == -ENOENT) - return comp_irq_request_pci(dev, vecidx); + /* In case SF irq pool does not exist, fallback to the PF irqs*/ + if (!mlx5_irq_pool_is_sf_pool(pool)) + return comp_irq_request_pci(dev, vecidx); + af_desc.is_managed = 1; + cpumask_copy(&af_desc.mask, cpu_online_mask); + cpumask_andnot(&af_desc.mask, &af_desc.mask, &table->used_cpus); + irq = mlx5_irq_affinity_request(pool, &af_desc); + if (IS_ERR(irq)) return PTR_ERR(irq); - } + + cpumask_or(&table->used_cpus, &table->used_cpus, mlx5_irq_get_affinity_mask(irq)); + mlx5_core_dbg(pool->dev, "IRQ %u mapped to cpu %*pbl, %u EQs on this irq\n", + pci_irq_vector(dev->pdev, mlx5_irq_get_index(irq)), + cpumask_pr_args(mlx5_irq_get_affinity_mask(irq)), + mlx5_irq_read_locked(irq) / MLX5_EQ_REFS_PER_IRQ); return xa_err(xa_store(&table->comp_irqs, vecidx, irq, GFP_KERNEL)); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c b/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c index 047d5fed5f89..612e666ec263 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c @@ -168,45 +168,3 @@ void mlx5_irq_affinity_irq_release(struct mlx5_core_dev *dev, struct mlx5_irq *i if (pool->irqs_per_cpu) cpu_put(pool, cpu); } - -/** - * 
mlx5_irq_affinity_irq_request_auto - request one IRQ for mlx5 device. - * @dev: mlx5 device that is requesting the IRQ. - * @used_cpus: cpumask of bounded cpus by the device - * @vecidx: vector index to request an IRQ for. - * - * Each IRQ is bounded to at most 1 CPU. - * This function is requesting an IRQ according to the default assignment. - * The default assignment policy is: - * - request the least loaded IRQ which is not bound to any - * CPU of the previous IRQs requested. - * - * On success, this function updates used_cpus mask and returns an irq pointer. - * In case of an error, an appropriate error pointer is returned. - */ -struct mlx5_irq *mlx5_irq_affinity_irq_request_auto(struct mlx5_core_dev *dev, - struct cpumask *used_cpus, u16 vecidx) -{ - struct mlx5_irq_pool *pool = mlx5_irq_pool_get(dev); - struct irq_affinity_desc af_desc = {}; - struct mlx5_irq *irq; - - if (!mlx5_irq_pool_is_sf_pool(pool)) - return ERR_PTR(-ENOENT); - - af_desc.is_managed = 1; - cpumask_copy(&af_desc.mask, cpu_online_mask); - cpumask_andnot(&af_desc.mask, &af_desc.mask, used_cpus); - irq = mlx5_irq_affinity_request(pool, &af_desc); - - if (IS_ERR(irq)) - return irq; - - cpumask_or(used_cpus, used_cpus, mlx5_irq_get_affinity_mask(irq)); - mlx5_core_dbg(pool->dev, "IRQ %u mapped to cpu %*pbl, %u EQs on this irq\n", - pci_irq_vector(dev->pdev, mlx5_irq_get_index(irq)), - cpumask_pr_args(mlx5_irq_get_affinity_mask(irq)), - mlx5_irq_read_locked(irq) / MLX5_EQ_REFS_PER_IRQ); - - return irq; -} -- cgit From ad4d82c3eacdd500a246af736e6e01d96484e35e Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Tue, 14 Nov 2023 13:58:34 -0800 Subject: net/mlx5: DR, Allow old devices to use multi destination FTE The current check isn't aware of old devices that don't have the relevant FW capability. This patch allows multi destination FTE in old cards, as it was before this check. Fixes: f6f46e7173cb ("net/mlx5: DR, Add check for multi destination FTE") Signed-off-by: Erez Shitrit Reviewed-by: Yevgeny Kliteynik Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-4-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c index 6ea88a581804..e3ec559369fa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c @@ -57,7 +57,8 @@ static const char *dr_action_id_to_str(enum mlx5dr_action_type action_id) static bool mlx5dr_action_supp_fwd_fdb_multi_ft(struct mlx5_core_dev *dev) { - return (MLX5_CAP_ESW_FLOWTABLE(dev, fdb_multi_path_any_table_limit_regc) || + return (MLX5_CAP_GEN(dev, steering_format_version) < MLX5_STEERING_FORMAT_CONNECTX_6DX || + MLX5_CAP_ESW_FLOWTABLE(dev, fdb_multi_path_any_table_limit_regc) || MLX5_CAP_ESW_FLOWTABLE(dev, fdb_multi_path_any_table)); } -- cgit From fd64fd13c49a53012ce9170449dcd9eb71c11284 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 14 Nov 2023 13:58:35 -0800 Subject: net/mlx5: Decouple PHC .adjtime and .adjphase implementations When running a phase adjustment operation, the free running clock should not be modified at all. The phase control keyword is intended to trigger an internal servo on the device that will converge to the provided delta. A free running counter cannot implement phase adjustment. 
Fixes: 8e11a68e2e8a ("net/mlx5: Add adjphase function to support hardware-only offset control") Signed-off-by: Rahul Rameshbabu Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-5-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index aa29f09e8356..0c83ef174275 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -384,7 +384,12 @@ static int mlx5_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) static int mlx5_ptp_adjphase(struct ptp_clock_info *ptp, s32 delta) { - return mlx5_ptp_adjtime(ptp, delta); + struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_core_dev *mdev; + + mdev = container_of(clock, struct mlx5_core_dev, clock); + + return mlx5_ptp_adjtime_real_time(mdev, delta); } static int mlx5_ptp_freq_adj_real_time(struct mlx5_core_dev *mdev, long scaled_ppm) -- cgit From 6f9b1a0731662648949a1c0587f6acb3b7f8acf1 Mon Sep 17 00:00:00 2001 From: Dust Li Date: Tue, 14 Nov 2023 13:58:36 -0800 Subject: net/mlx5e: fix double free of encap_header When mlx5_packet_reformat_alloc() fails, the encap_header allocated in mlx5e_tc_tun_create_header_ipv4{6} will be released within it. However, e->encap_header is already set to the previously freed encap_header before mlx5_packet_reformat_alloc(). As a result, the later mlx5e_encap_put() will free e->encap_header again, causing a double free issue. mlx5e_encap_put() --> mlx5e_encap_dealloc() --> kfree(e->encap_header) This happens when cmd: MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT fail. This patch fix it by not setting e->encap_header until mlx5_packet_reformat_alloc() success. 
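The underlying rule generalizes beyond mlx5: when a callee frees its input buffer on failure, the caller must not have published that buffer into longer-lived state yet. A minimal userspace analogue of the corrected ordering, with invented names (consume_buf() stands in for mlx5_packet_reformat_alloc(); this is not driver code):

    #include <stdlib.h>

    struct entry {
        void *header;   /* freed again later by the entry teardown path */
    };

    /* stand-in for the callee: on failure it frees buf itself */
    static int consume_buf(void *buf, int fail)
    {
        if (fail) {
            free(buf);
            return -1;
        }
        return 0;
    }

    static int attach_header(struct entry *e, size_t len, int fail)
    {
        void *buf = malloc(len);

        if (!buf)
            return -1;
        if (consume_buf(buf, fail))
            return -1;      /* buf is gone, but e->header was never set */
        e->header = buf;    /* publish only after the callee succeeded */
        return 0;
    }

    int main(void)
    {
        struct entry e = { 0 };

        attach_header(&e, 64, 1);   /* failure path */
        free(e.header);             /* header is still NULL: no double free */
        return 0;
    }
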
Fixes: d589e785baf5e ("net/mlx5e: Allow concurrent creation of encap entries") Reported-by: Cruz Zhao Reported-by: Tianchen Ding Signed-off-by: Dust Li Reviewed-by: Wojciech Drewek Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c index 00a04fdd756f..8bca696b6658 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c @@ -300,9 +300,6 @@ int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv, if (err) goto destroy_neigh_entry; - e->encap_size = ipv4_encap_size; - e->encap_header = encap_header; - if (!(nud_state & NUD_VALID)) { neigh_event_send(attr.n, NULL); /* the encap entry will be made valid on neigh update event @@ -322,6 +319,8 @@ int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv, goto destroy_neigh_entry; } + e->encap_size = ipv4_encap_size; + e->encap_header = encap_header; e->flags |= MLX5_ENCAP_ENTRY_VALID; mlx5e_rep_queue_neigh_stats_work(netdev_priv(attr.out_dev)); mlx5e_route_lookup_ipv4_put(&attr); @@ -568,9 +567,6 @@ int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv, if (err) goto destroy_neigh_entry; - e->encap_size = ipv6_encap_size; - e->encap_header = encap_header; - if (!(nud_state & NUD_VALID)) { neigh_event_send(attr.n, NULL); /* the encap entry will be made valid on neigh update event @@ -590,6 +586,8 @@ int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv, goto destroy_neigh_entry; } + e->encap_size = ipv6_encap_size; + e->encap_header = encap_header; e->flags |= MLX5_ENCAP_ENTRY_VALID; mlx5e_rep_queue_neigh_stats_work(netdev_priv(attr.out_dev)); mlx5e_route_lookup_ipv6_put(&attr); -- cgit From 3a4aa3cb83563df942be49d145ee3b7ddf17d6bb Mon Sep 17 00:00:00 2001 From: Gavin Li Date: Tue, 14 Nov 2023 13:58:37 -0800 Subject: net/mlx5e: fix double free of encap_header in update funcs Follow up to the previous patch to fix the same issue for mlx5e_tc_tun_update_header_ipv4{6} when mlx5_packet_reformat_alloc() fails. When mlx5_packet_reformat_alloc() fails, the encap_header allocated in mlx5e_tc_tun_update_header_ipv4{6} will be released within it. However, e->encap_header is already set to the previously freed encap_header before mlx5_packet_reformat_alloc(). As a result, the later mlx5e_encap_put() will free e->encap_header again, causing a double free issue. mlx5e_encap_put() --> mlx5e_encap_dealloc() --> kfree(e->encap_header) This patch fix it by not setting e->encap_header until mlx5_packet_reformat_alloc() success. 
Fixes: a54e20b4fcae ("net/mlx5e: Add basic TC tunnel set action for SRIOV offloads") Signed-off-by: Gavin Li Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-7-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c index 8bca696b6658..668da5c70e63 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c @@ -403,16 +403,12 @@ int mlx5e_tc_tun_update_header_ipv4(struct mlx5e_priv *priv, if (err) goto free_encap; - e->encap_size = ipv4_encap_size; - kfree(e->encap_header); - e->encap_header = encap_header; - if (!(nud_state & NUD_VALID)) { neigh_event_send(attr.n, NULL); /* the encap entry will be made valid on neigh update event * and not used before that. */ - goto release_neigh; + goto free_encap; } memset(&reformat_params, 0, sizeof(reformat_params)); @@ -426,6 +422,10 @@ int mlx5e_tc_tun_update_header_ipv4(struct mlx5e_priv *priv, goto free_encap; } + e->encap_size = ipv4_encap_size; + kfree(e->encap_header); + e->encap_header = encap_header; + e->flags |= MLX5_ENCAP_ENTRY_VALID; mlx5e_rep_queue_neigh_stats_work(netdev_priv(attr.out_dev)); mlx5e_route_lookup_ipv4_put(&attr); @@ -669,16 +669,12 @@ int mlx5e_tc_tun_update_header_ipv6(struct mlx5e_priv *priv, if (err) goto free_encap; - e->encap_size = ipv6_encap_size; - kfree(e->encap_header); - e->encap_header = encap_header; - if (!(nud_state & NUD_VALID)) { neigh_event_send(attr.n, NULL); /* the encap entry will be made valid on neigh update event * and not used before that. */ - goto release_neigh; + goto free_encap; } memset(&reformat_params, 0, sizeof(reformat_params)); @@ -692,6 +688,10 @@ int mlx5e_tc_tun_update_header_ipv6(struct mlx5e_priv *priv, goto free_encap; } + e->encap_size = ipv6_encap_size; + kfree(e->encap_header); + e->encap_header = encap_header; + e->flags |= MLX5_ENCAP_ENTRY_VALID; mlx5e_rep_queue_neigh_stats_work(netdev_priv(attr.out_dev)); mlx5e_route_lookup_ipv6_put(&attr); -- cgit From 0c101a23ca7eaf00eef1328eefb04b3a93401cc8 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 14 Nov 2023 13:58:38 -0800 Subject: net/mlx5e: Fix pedit endianness Referenced commit addressed endianness issue in mlx5 pedit implementation in ad hoc manner instead of systematically treating integer values according to their types which left pedit fields of sizes not equal to 4 and where the bytes being modified are not least significant ones broken on big endian machines since wrong bits will be consumed during parsing which leads to following example error when applying pedit to source and destination MAC addresses: [Wed Oct 18 12:52:42 2023] mlx5_core 0001:00:00.1 p1v3_r: attempt to offload an unsupported field (cmd 0) [Wed Oct 18 12:52:42 2023] mask: 00000000330c5b68: 00 00 00 00 ff ff 00 00 00 00 ff ff 00 00 00 00 ................ [Wed Oct 18 12:52:42 2023] mask: 0000000017d22fd9: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ [Wed Oct 18 12:52:42 2023] mask: 000000008186d717: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ [Wed Oct 18 12:52:42 2023] mask: 0000000029eb6149: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ [Wed Oct 18 12:52:42 2023] mask: 000000007ed103e4: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
[Wed Oct 18 12:52:42 2023] mask: 00000000db8101a6: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ [Wed Oct 18 12:52:42 2023] mask: 00000000ec3c08a9: 00 00 00 00 00 00 00 00 00 00 00 00 ............ Treat masks and values of pedit and filter match as network byte order, refactor pointers to them to void pointers instead of confusing u32 pointers and only cast to pointer-to-integer when reading a value from them. Treat pedit mlx5_fields->field_mask as host byte order according to its type u32, change the constants in fields array accordingly. Fixes: 82198d8bcdef ("net/mlx5e: Fix endianness when calculating pedit mask first bit") Signed-off-by: Vlad Buslov Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-8-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 60 +++++++++++++------------ 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 9a5a5c2c7da9..7ca9e5b86778 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -3147,7 +3147,7 @@ static struct mlx5_fields fields[] = { OFFLOAD(DIPV6_31_0, 32, U32_MAX, ip6.daddr.s6_addr32[3], 0, dst_ipv4_dst_ipv6.ipv6_layout.ipv6[12]), OFFLOAD(IPV6_HOPLIMIT, 8, U8_MAX, ip6.hop_limit, 0, ttl_hoplimit), - OFFLOAD(IP_DSCP, 16, 0xc00f, ip6, 0, ip_dscp), + OFFLOAD(IP_DSCP, 16, 0x0fc0, ip6, 0, ip_dscp), OFFLOAD(TCP_SPORT, 16, U16_MAX, tcp.source, 0, tcp_sport), OFFLOAD(TCP_DPORT, 16, U16_MAX, tcp.dest, 0, tcp_dport), @@ -3158,21 +3158,31 @@ static struct mlx5_fields fields[] = { OFFLOAD(UDP_DPORT, 16, U16_MAX, udp.dest, 0, udp_dport), }; -static unsigned long mask_to_le(unsigned long mask, int size) +static u32 mask_field_get(void *mask, struct mlx5_fields *f) { - __be32 mask_be32; - __be16 mask_be16; - - if (size == 32) { - mask_be32 = (__force __be32)(mask); - mask = (__force unsigned long)cpu_to_le32(be32_to_cpu(mask_be32)); - } else if (size == 16) { - mask_be32 = (__force __be32)(mask); - mask_be16 = *(__be16 *)&mask_be32; - mask = (__force unsigned long)cpu_to_le16(be16_to_cpu(mask_be16)); + switch (f->field_bsize) { + case 32: + return be32_to_cpu(*(__be32 *)mask) & f->field_mask; + case 16: + return be16_to_cpu(*(__be16 *)mask) & (u16)f->field_mask; + default: + return *(u8 *)mask & (u8)f->field_mask; } +} - return mask; +static void mask_field_clear(void *mask, struct mlx5_fields *f) +{ + switch (f->field_bsize) { + case 32: + *(__be32 *)mask &= ~cpu_to_be32(f->field_mask); + break; + case 16: + *(__be16 *)mask &= ~cpu_to_be16((u16)f->field_mask); + break; + default: + *(u8 *)mask &= ~(u8)f->field_mask; + break; + } } static int offload_pedit_fields(struct mlx5e_priv *priv, @@ -3184,11 +3194,12 @@ static int offload_pedit_fields(struct mlx5e_priv *priv, struct pedit_headers *set_masks, *add_masks, *set_vals, *add_vals; struct pedit_headers_action *hdrs = parse_attr->hdrs; void *headers_c, *headers_v, *action, *vals_p; - u32 *s_masks_p, *a_masks_p, s_mask, a_mask; struct mlx5e_tc_mod_hdr_acts *mod_acts; - unsigned long mask, field_mask; + void *s_masks_p, *a_masks_p; int i, first, last, next_z; struct mlx5_fields *f; + unsigned long mask; + u32 s_mask, a_mask; u8 cmd; mod_acts = &parse_attr->mod_hdr_acts; @@ -3204,15 +3215,11 @@ static int offload_pedit_fields(struct mlx5e_priv *priv, bool skip; f = &fields[i]; - /* avoid seeing bits set from previous 
iterations */ - s_mask = 0; - a_mask = 0; - s_masks_p = (void *)set_masks + f->offset; a_masks_p = (void *)add_masks + f->offset; - s_mask = *s_masks_p & f->field_mask; - a_mask = *a_masks_p & f->field_mask; + s_mask = mask_field_get(s_masks_p, f); + a_mask = mask_field_get(a_masks_p, f); if (!s_mask && !a_mask) /* nothing to offload here */ continue; @@ -3239,22 +3246,20 @@ static int offload_pedit_fields(struct mlx5e_priv *priv, match_mask, f->field_bsize)) skip = true; /* clear to denote we consumed this field */ - *s_masks_p &= ~f->field_mask; + mask_field_clear(s_masks_p, f); } else { cmd = MLX5_ACTION_TYPE_ADD; mask = a_mask; vals_p = (void *)add_vals + f->offset; /* add 0 is no change */ - if ((*(u32 *)vals_p & f->field_mask) == 0) + if (!mask_field_get(vals_p, f)) skip = true; /* clear to denote we consumed this field */ - *a_masks_p &= ~f->field_mask; + mask_field_clear(a_masks_p, f); } if (skip) continue; - mask = mask_to_le(mask, f->field_bsize); - first = find_first_bit(&mask, f->field_bsize); next_z = find_next_zero_bit(&mask, f->field_bsize, first); last = find_last_bit(&mask, f->field_bsize); @@ -3281,10 +3286,9 @@ static int offload_pedit_fields(struct mlx5e_priv *priv, MLX5_SET(set_action_in, action, field, f->field); if (cmd == MLX5_ACTION_TYPE_SET) { + unsigned long field_mask = f->field_mask; int start; - field_mask = mask_to_le(f->field_mask, f->field_bsize); - /* if field is bit sized it can start not from first bit */ start = find_first_bit(&field_mask, f->field_bsize); -- cgit From bdf788cf224f61c20a01c58c00685d394d57887f Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Tue, 14 Nov 2023 13:58:39 -0800 Subject: net/mlx5e: Don't modify the peer sent-to-vport rules for IPSec offload As IPSec packet offload in switchdev mode is not supported with LAG, it's unnecessary to modify those sent-to-vport rules to the peer eswitch. Fixes: c6c2bf5db4ea ("net/mlx5e: Support IPsec packet offload for TX in switchdev mode") Signed-off-by: Jianbo Liu Reviewed-by: Leon Romanovsky Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-9-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index b296ac52a439..88236e75fd90 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -984,7 +984,8 @@ mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *on_esw, dest.vport.flags |= MLX5_FLOW_DEST_VPORT_VHCA_ID; flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; - if (rep->vport == MLX5_VPORT_UPLINK && on_esw->offloads.ft_ipsec_tx_pol) { + if (rep->vport == MLX5_VPORT_UPLINK && + on_esw == from_esw && on_esw->offloads.ft_ipsec_tx_pol) { dest.ft = on_esw->offloads.ft_ipsec_tx_pol; flow_act.flags = FLOW_ACT_IGNORE_FLOW_LEVEL; dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; -- cgit From 64f14d16eef1f939000f2617b50c7c996b5117d4 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 14 Nov 2023 13:58:40 -0800 Subject: net/mlx5e: Avoid referencing skb after free-ing in drop path of mlx5e_sq_xmit_wqe When SQ is a port timestamping SQ for PTP, do not access tx flags of skb after free-ing the skb. Free the skb only after all references that depend on it have been handled in the dropped WQE path. 
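The ordering rule here is the generic one: finish every access that still dereferences an object before handing it back to the allocator. A small userspace sketch of the corrected ordering, with invented names (drop_pkt() mirrors the shape of the drop path, not the actual mlx5e code):

    #include <stdbool.h>
    #include <stdlib.h>

    struct pkt {
        bool wants_tstamp;
        unsigned int metadata_id;
    };

    static void recycle_metadata(unsigned int id)
    {
        (void)id;   /* the real driver returns the id to a free list */
    }

    static void drop_pkt(struct pkt *p)
    {
        /* handle everything that still reads *p ... */
        if (p->wants_tstamp)
            recycle_metadata(p->metadata_id);

        /* ... and only then release it */
        free(p);
    }

    int main(void)
    {
        struct pkt *p = calloc(1, sizeof(*p));

        if (!p)
            return 1;
        p->wants_tstamp = true;
        p->metadata_id = 7;
        drop_pkt(p);
        return 0;
    }
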
Fixes: 3178308ad4ca ("net/mlx5e: Make tx_port_ts logic resilient to out-of-order CQEs") Signed-off-by: Rahul Rameshbabu Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-10-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index d41435c22ce5..19f2c25b05a0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -494,10 +494,10 @@ mlx5e_sq_xmit_wqe(struct mlx5e_txqsq *sq, struct sk_buff *skb, err_drop: stats->dropped++; - dev_kfree_skb_any(skb); if (unlikely(sq->ptpsq && (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))) mlx5e_ptp_metadata_fifo_push(&sq->ptpsq->metadata_freelist, be32_to_cpu(eseg->flow_table_metadata)); + dev_kfree_skb_any(skb); mlx5e_tx_flush(sq); } -- cgit From 7e3f3ba97e6cc6fce5bf62df2ca06c8e59040167 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 14 Nov 2023 13:58:41 -0800 Subject: net/mlx5e: Track xmit submission to PTP WQ after populating metadata map Ensure the skb is available in metadata mapping to skbs before tracking the metadata index for detecting undelivered CQEs. If the metadata index is put in the tracking list before putting the skb in the map, the metadata index might be used for detecting undelivered CQEs before the relevant skb is available in the map, which can lead to a null-ptr-deref. Log: general protection fault, probably for non-canonical address 0xdffffc0000000005: 0000 [#1] SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000028-0x000000000000002f] CPU: 0 PID: 1243 Comm: kworker/0:2 Not tainted 6.6.0-rc4+ #108 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Workqueue: events mlx5e_rx_dim_work [mlx5_core] RIP: 0010:mlx5e_ptp_napi_poll+0x9a4/0x2290 [mlx5_core] Code: 8c 24 38 cc ff ff 4c 8d 3c c1 4c 89 f9 48 c1 e9 03 42 80 3c 31 00 0f 85 97 0f 00 00 4d 8b 3f 49 8d 7f 28 48 89 f9 48 c1 e9 03 <42> 80 3c 31 00 0f 85 8b 0f 00 00 49 8b 47 28 48 85 c0 0f 84 05 07 RSP: 0018:ffff8884d3c09c88 EFLAGS: 00010206 RAX: 0000000000000069 RBX: ffff8881160349d8 RCX: 0000000000000005 RDX: ffffed10218f48cf RSI: 0000000000000004 RDI: 0000000000000028 RBP: ffff888122707700 R08: 0000000000000001 R09: ffffed109a781383 R10: 0000000000000003 R11: 0000000000000003 R12: ffff88810c7a7a40 R13: ffff888122707700 R14: dffffc0000000000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff8884d3c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f4f878dd6e0 CR3: 000000014d108002 CR4: 0000000000370eb0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? die_addr+0x3c/0xa0 ? exc_general_protection+0x144/0x210 ? asm_exc_general_protection+0x22/0x30 ? mlx5e_ptp_napi_poll+0x9a4/0x2290 [mlx5_core] ? mlx5e_ptp_napi_poll+0x8f6/0x2290 [mlx5_core] __napi_poll.constprop.0+0xa4/0x580 net_rx_action+0x460/0xb80 ? _raw_spin_unlock_irqrestore+0x32/0x60 ? __napi_poll.constprop.0+0x580/0x580 ? 
tasklet_action_common.isra.0+0x2ef/0x760 __do_softirq+0x26c/0x827 irq_exit_rcu+0xc2/0x100 common_interrupt+0x7f/0xa0 asm_common_interrupt+0x22/0x40 RIP: 0010:__kmem_cache_alloc_node+0xb/0x330 Code: 41 5d 41 5e 41 5f c3 8b 44 24 14 8b 4c 24 10 09 c8 eb d5 e8 b7 43 ca 01 0f 1f 80 00 00 00 00 0f 1f 44 00 00 55 48 89 e5 41 57 <41> 56 41 89 d6 41 55 41 89 f5 41 54 49 89 fc 53 48 83 e4 f0 48 83 RSP: 0018:ffff88812c4079c0 EFLAGS: 00000246 RAX: 1ffffffff083c7fe RBX: ffff888100042dc0 RCX: 0000000000000218 RDX: 00000000ffffffff RSI: 0000000000000dc0 RDI: ffff888100042dc0 RBP: ffff88812c4079c8 R08: ffffffffa0289f96 R09: ffffed1025880ea9 R10: ffff888138839f80 R11: 0000000000000002 R12: 0000000000000dc0 R13: 0000000000000100 R14: 000000000000008c R15: ffff8881271fc450 ? cmd_exec+0x796/0x2200 [mlx5_core] kmalloc_trace+0x26/0xc0 cmd_exec+0x796/0x2200 [mlx5_core] mlx5_cmd_do+0x22/0xc0 [mlx5_core] mlx5_cmd_exec+0x17/0x30 [mlx5_core] mlx5_core_modify_cq_moderation+0x139/0x1b0 [mlx5_core] ? mlx5_add_cq_to_tasklet+0x280/0x280 [mlx5_core] ? lockdep_set_lock_cmp_fn+0x190/0x190 ? process_one_work+0x659/0x1220 mlx5e_rx_dim_work+0x9d/0x100 [mlx5_core] process_one_work+0x730/0x1220 ? lockdep_hardirqs_on_prepare+0x400/0x400 ? max_active_store+0xf0/0xf0 ? assign_work+0x168/0x240 worker_thread+0x70f/0x12d0 ? __kthread_parkme+0xd1/0x1d0 ? process_one_work+0x1220/0x1220 kthread+0x2d9/0x3b0 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x2d/0x70 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork_asm+0x11/0x20 Modules linked in: xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay mlx5_ib ib_uverbs ib_core zram zsmalloc mlx5_core fuse ---[ end trace 0000000000000000 ]--- Fixes: 3178308ad4ca ("net/mlx5e: Make tx_port_ts logic resilient to out-of-order CQEs") Signed-off-by: Rahul Rameshbabu Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-11-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 19f2c25b05a0..f0b506e562df 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -399,9 +399,9 @@ mlx5e_txwqe_complete(struct mlx5e_txqsq *sq, struct sk_buff *skb, u8 metadata_index = be32_to_cpu(eseg->flow_table_metadata); mlx5e_skb_cb_hwtstamp_init(skb); - mlx5e_ptpsq_track_metadata(sq->ptpsq, metadata_index); mlx5e_ptp_metadata_map_put(&sq->ptpsq->metadata_map, skb, metadata_index); + mlx5e_ptpsq_track_metadata(sq->ptpsq, metadata_index); if (!netif_tx_queue_stopped(sq->txq) && mlx5e_ptpsq_metadata_freelist_empty(sq->ptpsq)) { netif_tx_stop_queue(sq->txq); -- cgit From 92214be5979c0961a471b7eaaaeacab41bdf456c Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 14 Nov 2023 13:58:42 -0800 Subject: net/mlx5e: Update doorbell for port timestamping CQ before the software counter Previously, mlx5e_ptp_poll_ts_cq would update the device doorbell with the incremented consumer index after the relevant software counters in the kernel were updated. In the mlx5e_sq_xmit_wqe context, this would lead to either overrunning the device CQ or exceeding the expected software buffer size in the device CQ if the device CQ size was greater than the software buffer size. 
Update the relevant software counter only after updating the device CQ consumer index in the port timestamping napi_poll context. Log: mlx5_core 0000:08:00.0: cq_err_event_notifier:517:(pid 0): CQ error on CQN 0x487, syndrome 0x1 mlx5_core 0000:08:00.0 eth2: mlx5e_cq_error_event: cqn=0x000487 event=0x04 Fixes: 1880bc4e4a96 ("net/mlx5e: Add TX port timestamp support") Signed-off-by: Rahul Rameshbabu Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-12-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c index bb11e644d24f..af3928eddafd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c @@ -177,6 +177,8 @@ static void mlx5e_ptpsq_mark_ts_cqes_undelivered(struct mlx5e_ptpsq *ptpsq, static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq, struct mlx5_cqe64 *cqe, + u8 *md_buff, + u8 *md_buff_sz, int budget) { struct mlx5e_ptp_port_ts_cqe_list *pending_cqe_list = ptpsq->ts_cqe_pending_list; @@ -211,19 +213,24 @@ static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq, mlx5e_ptpsq_mark_ts_cqes_undelivered(ptpsq, hwtstamp); out: napi_consume_skb(skb, budget); - mlx5e_ptp_metadata_fifo_push(&ptpsq->metadata_freelist, metadata_id); + md_buff[*md_buff_sz++] = metadata_id; if (unlikely(mlx5e_ptp_metadata_map_unhealthy(&ptpsq->metadata_map)) && !test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) queue_work(ptpsq->txqsq.priv->wq, &ptpsq->report_unhealthy_work); } -static bool mlx5e_ptp_poll_ts_cq(struct mlx5e_cq *cq, int budget) +static bool mlx5e_ptp_poll_ts_cq(struct mlx5e_cq *cq, int napi_budget) { struct mlx5e_ptpsq *ptpsq = container_of(cq, struct mlx5e_ptpsq, ts_cq); - struct mlx5_cqwq *cqwq = &cq->wq; + int budget = min(napi_budget, MLX5E_TX_CQ_POLL_BUDGET); + u8 metadata_buff[MLX5E_TX_CQ_POLL_BUDGET]; + u8 metadata_buff_sz = 0; + struct mlx5_cqwq *cqwq; struct mlx5_cqe64 *cqe; int work_done = 0; + cqwq = &cq->wq; + if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &ptpsq->txqsq.state))) return false; @@ -234,7 +241,8 @@ static bool mlx5e_ptp_poll_ts_cq(struct mlx5e_cq *cq, int budget) do { mlx5_cqwq_pop(cqwq); - mlx5e_ptp_handle_ts_cqe(ptpsq, cqe, budget); + mlx5e_ptp_handle_ts_cqe(ptpsq, cqe, + metadata_buff, &metadata_buff_sz, napi_budget); } while ((++work_done < budget) && (cqe = mlx5_cqwq_get_cqe(cqwq))); mlx5_cqwq_update_db_record(cqwq); @@ -242,6 +250,10 @@ static bool mlx5e_ptp_poll_ts_cq(struct mlx5e_cq *cq, int budget) /* ensure cq space is freed before enabling more cqes */ wmb(); + while (metadata_buff_sz > 0) + mlx5e_ptp_metadata_fifo_push(&ptpsq->metadata_freelist, + metadata_buff[--metadata_buff_sz]); + mlx5e_txqsq_wake(&ptpsq->txqsq); return work_done == budget; -- cgit From 3338bebfc26a1e2cebbba82a1cf12c0159608e73 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 14 Nov 2023 13:58:43 -0800 Subject: net/mlx5: Increase size of irq name buffer Without increased buffer size, will trigger -Wformat-truncation with W=1 for the snprintf operation writing to the buffer. 
drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c: In function 'mlx5_irq_alloc': drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c:296:7: error: '@pci:' directive output may be truncated writing 5 bytes into a region of size between 1 and 32 [-Werror=format-truncation=] 296 | "%s@pci:%s", name, pci_name(dev->pdev)); | ^~~~~ drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c:295:2: note: 'snprintf' output 6 or more bytes (assuming 37) into a destination of size 32 295 | snprintf(irq->name, MLX5_MAX_IRQ_NAME, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 296 | "%s@pci:%s", name, pci_name(dev->pdev)); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fixes: ada9f5d00797 ("IB/mlx5: Fix eq names to display nicely in /proc/interrupts") Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6d4ab2e97dcfbcd748ae71761a9d8e5e41cc732c Signed-off-by: Rahul Rameshbabu Reviewed-by: Dragos Tatulea Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-13-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 6 +++--- drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index 653648216730..4dcf995cb1a2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -28,7 +28,7 @@ struct mlx5_irq { struct atomic_notifier_head nh; cpumask_var_t mask; - char name[MLX5_MAX_IRQ_NAME]; + char name[MLX5_MAX_IRQ_FORMATTED_NAME]; struct mlx5_irq_pool *pool; int refcount; struct msi_map map; @@ -292,8 +292,8 @@ struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i, else irq_sf_set_name(pool, name, i); ATOMIC_INIT_NOTIFIER_HEAD(&irq->nh); - snprintf(irq->name, MLX5_MAX_IRQ_NAME, - "%s@pci:%s", name, pci_name(dev->pdev)); + snprintf(irq->name, MLX5_MAX_IRQ_FORMATTED_NAME, + MLX5_IRQ_NAME_FORMAT_STR, name, pci_name(dev->pdev)); err = request_irq(irq->map.virq, irq_int_handler, 0, irq->name, &irq->nh); if (err) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h index d3a77a0ab848..c4d377f8df30 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h @@ -7,6 +7,9 @@ #include #define MLX5_MAX_IRQ_NAME (32) +#define MLX5_IRQ_NAME_FORMAT_STR ("%s@pci:%s") +#define MLX5_MAX_IRQ_FORMATTED_NAME \ + (MLX5_MAX_IRQ_NAME + sizeof(MLX5_IRQ_NAME_FORMAT_STR)) /* max irq_index is 2047, so four chars */ #define MLX5_MAX_IRQ_IDX_CHARS (4) #define MLX5_EQ_REFS_PER_IRQ (2) -- cgit From dce94142842e119b982c27c1b62bd20890c7fd21 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 14 Nov 2023 13:58:44 -0800 Subject: net/mlx5e: Reduce the size of icosq_str icosq_str size is unnecessarily too long, and it causes a build warning -Wformat-truncation with W=1. Looking closely, It doesn't need to be 255B, hence this patch reduces the size to 32B which should be more than enough to host the string: "ICOSQ: 0x%x, ". While here, add a missing space in the formatted string. 
This fixes the following build warning: $ KCFLAGS='-Wall -Werror' $ make O=/tmp/kbuild/linux W=1 -s -j12 drivers/net/ethernet/mellanox/mlx5/core/ drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c: In function 'mlx5e_reporter_rx_timeout': drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c:718:56: error: ', CQ: 0x' directive output may be truncated writing 8 bytes into a region of size between 0 and 255 [-Werror=format-truncation=] 718 | "RX timeout on channel: %d, %sRQ: 0x%x, CQ: 0x%x", | ^~~~~~~~ drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c:717:9: note: 'snprintf' output between 43 and 322 bytes into a destination of size 288 717 | snprintf(err_str, sizeof(err_str), | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 718 | "RX timeout on channel: %d, %sRQ: 0x%x, CQ: 0x%x", | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 719 | rq->ix, icosq_str, rq->rqn, rq->cq.mcq.cqn); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fixes: 521f31af004a ("net/mlx5e: Allow RQ outside of channel context") Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6d4ab2e97dcfbcd748ae71761a9d8e5e41cc732c Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-14-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c index fea8c0a5fe89..4358798d6ce1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c @@ -492,11 +492,11 @@ static int mlx5e_rx_reporter_dump(struct devlink_health_reporter *reporter, void mlx5e_reporter_rx_timeout(struct mlx5e_rq *rq) { - char icosq_str[MLX5E_REPORTER_PER_Q_MAX_LEN] = {}; char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; struct mlx5e_icosq *icosq = rq->icosq; struct mlx5e_priv *priv = rq->priv; struct mlx5e_err_ctx err_ctx = {}; + char icosq_str[32] = {}; err_ctx.ctx = rq; err_ctx.recover = mlx5e_rx_reporter_timeout_recover; @@ -505,7 +505,7 @@ void mlx5e_reporter_rx_timeout(struct mlx5e_rq *rq) if (icosq) snprintf(icosq_str, sizeof(icosq_str), "ICOSQ: 0x%x, ", icosq->sqn); snprintf(err_str, sizeof(err_str), - "RX timeout on channel: %d, %sRQ: 0x%x, CQ: 0x%x", + "RX timeout on channel: %d, %s RQ: 0x%x, CQ: 0x%x", rq->ix, icosq_str, rq->rqn, rq->cq.mcq.cqn); mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx); -- cgit From 41e63c2baa11dc2aa71df5dd27a5bd87d11b6bbb Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 14 Nov 2023 13:58:45 -0800 Subject: net/mlx5e: Check return value of snprintf writing to fw_version buffer Treat the operation as an error case when the return value is equivalent to the size of the name buffer. Failed to write null terminator to the name buffer, making the string malformed and should not be used. Provide a string with only the firmware version when forming the string with the board id fails. Without check, will trigger -Wformat-truncation with W=1. 
drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c: In function 'mlx5e_ethtool_get_drvinfo': drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c:49:31: warning: '%.16s' directive output may be truncated writing up to 16 bytes into a region of size between 13 and 22 [-Wformat-truncation=] 49 | "%d.%d.%04d (%.16s)", | ^~~~~ drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c:48:9: note: 'snprintf' output between 12 and 37 bytes into a destination of size 32 48 | snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 49 | "%d.%d.%04d (%.16s)", | ~~~~~~~~~~~~~~~~~~~~~ 50 | fw_rev_maj(mdev), fw_rev_min(mdev), fw_rev_sub(mdev), | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 51 | mdev->board_id); | ~~~~~~~~~~~~~~~ Fixes: 84e11edb71de ("net/mlx5e: Show board id in ethtool driver information") Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6d4ab2e97dcfbcd748ae71761a9d8e5e41cc732c Signed-off-by: Rahul Rameshbabu Reviewed-by: Dragos Tatulea Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 215261a69255..792a0ea544cd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -43,12 +43,17 @@ void mlx5e_ethtool_get_drvinfo(struct mlx5e_priv *priv, struct ethtool_drvinfo *drvinfo) { struct mlx5_core_dev *mdev = priv->mdev; + int count; strscpy(drvinfo->driver, KBUILD_MODNAME, sizeof(drvinfo->driver)); - snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), - "%d.%d.%04d (%.16s)", - fw_rev_maj(mdev), fw_rev_min(mdev), fw_rev_sub(mdev), - mdev->board_id); + count = snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%04d (%.16s)", fw_rev_maj(mdev), + fw_rev_min(mdev), fw_rev_sub(mdev), mdev->board_id); + if (count == sizeof(drvinfo->fw_version)) + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%04d", fw_rev_maj(mdev), + fw_rev_min(mdev), fw_rev_sub(mdev)); + strscpy(drvinfo->bus_info, dev_name(mdev->device), sizeof(drvinfo->bus_info)); } -- cgit From 1b2bd0c0264febcd8d47209079a6671c38e6558b Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 14 Nov 2023 13:58:46 -0800 Subject: net/mlx5e: Check return value of snprintf writing to fw_version buffer for representors Treat the operation as an error case when the return value is equivalent to the size of the name buffer. Failed to write null terminator to the name buffer, making the string malformed and should not be used. Provide a string with only the firmware version when forming the string with the board id fails. This logic for representors is identical to normal flow with ethtool. Without check, will trigger -Wformat-truncation with W=1. 
drivers/net/ethernet/mellanox/mlx5/core/en_rep.c: In function 'mlx5e_rep_get_drvinfo': drivers/net/ethernet/mellanox/mlx5/core/en_rep.c:78:31: warning: '%.16s' directive output may be truncated writing up to 16 bytes into a region of size between 13 and 22 [-Wformat-truncation=] 78 | "%d.%d.%04d (%.16s)", | ^~~~~ drivers/net/ethernet/mellanox/mlx5/core/en_rep.c:77:9: note: 'snprintf' output between 12 and 37 bytes into a destination of size 32 77 | snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 78 | "%d.%d.%04d (%.16s)", | ~~~~~~~~~~~~~~~~~~~~~ 79 | fw_rev_maj(mdev), fw_rev_min(mdev), | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 80 | fw_rev_sub(mdev), mdev->board_id); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fixes: cf83c8fdcd47 ("net/mlx5e: Add missing ethtool driver info for representors") Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6d4ab2e97dcfbcd748ae71761a9d8e5e41cc732c Signed-off-by: Rahul Rameshbabu Reviewed-by: Dragos Tatulea Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-16-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 693e55b010d9..3ab682bbcf86 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -71,13 +71,17 @@ static void mlx5e_rep_get_drvinfo(struct net_device *dev, { struct mlx5e_priv *priv = netdev_priv(dev); struct mlx5_core_dev *mdev = priv->mdev; + int count; strscpy(drvinfo->driver, mlx5e_rep_driver_name, sizeof(drvinfo->driver)); - snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), - "%d.%d.%04d (%.16s)", - fw_rev_maj(mdev), fw_rev_min(mdev), - fw_rev_sub(mdev), mdev->board_id); + count = snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%04d (%.16s)", fw_rev_maj(mdev), + fw_rev_min(mdev), fw_rev_sub(mdev), mdev->board_id); + if (count == sizeof(drvinfo->fw_version)) + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%04d", fw_rev_maj(mdev), + fw_rev_min(mdev), fw_rev_sub(mdev)); } static const struct counter_desc sw_rep_stats_desc[] = { -- cgit From 7cd5af0e937a197295f3aa3721031f0fbae49cff Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 13 Nov 2023 12:53:28 -0500 Subject: net: sched: do not offload flows with a helper in act_ct There is no hardware supporting ct helper offload. However, prior to this patch, a flower filter with a helper in the ct action can be successfully set into the HW, for example (eth1 is a bnxt NIC): # tc qdisc add dev eth1 ingress_block 22 ingress # tc filter add block 22 proto ip flower skip_sw ip_proto tcp \ dst_port 21 ct_state -trk action ct helper ipv4-tcp-ftp # tc filter show dev eth1 ingress filter block 22 protocol ip pref 49152 flower chain 0 handle 0x1 eth_type ipv4 ip_proto tcp dst_port 21 ct_state -trk skip_sw in_hw in_hw_count 1 <---- action order 1: ct zone 0 helper ipv4-tcp-ftp pipe index 2 ref 1 bind 1 used_hw_stats delayed This might cause the flower filter not to work as expected in the HW. This patch avoids this problem by simply returning -EOPNOTSUPP in tcf_ct_offload_act_setup() to not allow to offload flows with a helper in act_ct. 
Fixes: a21b06e73191 ("net: sched: add helper support in act_ct") Signed-off-by: Xin Long Reviewed-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/f8685ec7702c4a448a1371a8b34b43217b583b9d.1699898008.git.lucien.xin@gmail.com Signed-off-by: Paolo Abeni --- include/net/tc_act/tc_ct.h | 9 +++++++++ net/sched/act_ct.c | 3 +++ 2 files changed, 12 insertions(+) diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h index 8a6dbfb23336..77f87c622a2e 100644 --- a/include/net/tc_act/tc_ct.h +++ b/include/net/tc_act/tc_ct.h @@ -58,6 +58,11 @@ static inline struct nf_flowtable *tcf_ct_ft(const struct tc_action *a) return to_ct_params(a)->nf_ft; } +static inline struct nf_conntrack_helper *tcf_ct_helper(const struct tc_action *a) +{ + return to_ct_params(a)->helper; +} + #else static inline uint16_t tcf_ct_zone(const struct tc_action *a) { return 0; } static inline int tcf_ct_action(const struct tc_action *a) { return 0; } @@ -65,6 +70,10 @@ static inline struct nf_flowtable *tcf_ct_ft(const struct tc_action *a) { return NULL; } +static inline struct nf_conntrack_helper *tcf_ct_helper(const struct tc_action *a) +{ + return NULL; +} #endif /* CONFIG_NF_CONNTRACK */ #if IS_ENABLED(CONFIG_NET_ACT_CT) diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 0db0ecf1d110..b3f4a503ee2b 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1549,6 +1549,9 @@ static int tcf_ct_offload_act_setup(struct tc_action *act, void *entry_data, if (bind) { struct flow_action_entry *entry = entry_data; + if (tcf_ct_helper(act)) + return -EOPNOTSUPP; + entry->id = FLOW_ACTION_CT; entry->ct.action = tcf_ct_action(act); entry->ct.zone = tcf_ct_zone(act); -- cgit From 7e1caeace0418381f36b3aa8403dfd82fc57fc53 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 14 Nov 2023 18:59:15 +0100 Subject: macvlan: Don't propagate promisc change to lower dev in passthru Macvlan device in passthru mode sets its lower device promiscuous mode according to its MACVLAN_FLAG_NOPROMISC flag instead of synchronizing it to its own promiscuity setting. However, macvlan_change_rx_flags() function doesn't check the mode before propagating such changes to the lower device which can cause net_device->promiscuity counter overflow as illustrated by reproduction example [0] and resulting dmesg log [1]. Fix the issue by first verifying the mode in macvlan_change_rx_flags() function before propagating promiscuous mode change to the lower device. [0]: ip link add macvlan1 link enp8s0f0 type macvlan mode passthru ip link set macvlan1 promisc on ip l set dev macvlan1 up ip link set macvlan1 promisc off ip l set dev macvlan1 down ip l set dev macvlan1 up [1]: [ 5156.281724] macvlan1: entered promiscuous mode [ 5156.285467] mlx5_core 0000:08:00.0 enp8s0f0: entered promiscuous mode [ 5156.287639] macvlan1: left promiscuous mode [ 5156.288339] mlx5_core 0000:08:00.0 enp8s0f0: left promiscuous mode [ 5156.290907] mlx5_core 0000:08:00.0 enp8s0f0: entered promiscuous mode [ 5156.317197] mlx5_core 0000:08:00.0 enp8s0f0: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken. 
Fixes: efdbd2b30caa ("macvlan: Propagate promiscuity setting to lower devices.") Reviewed-by: Gal Pressman Signed-off-by: Vlad Buslov Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20231114175915.1649154-1-vladbu@nvidia.com Signed-off-by: Paolo Abeni --- drivers/net/macvlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 02bd201bc7e5..c8da94af4161 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -780,7 +780,7 @@ static void macvlan_change_rx_flags(struct net_device *dev, int change) if (dev->flags & IFF_UP) { if (change & IFF_ALLMULTI) dev_set_allmulti(lowerdev, dev->flags & IFF_ALLMULTI ? 1 : -1); - if (change & IFF_PROMISC) + if (!macvlan_passthru(vlan->port) && change & IFF_PROMISC) dev_set_promiscuity(lowerdev, dev->flags & IFF_PROMISC ? 1 : -1); -- cgit From 1c4a7587d1bbee0fd53b63af60e4244a62775f57 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 1 Nov 2023 02:46:27 +0900 Subject: modpost: fix section mismatch message for RELA The section mismatch check prints a bogus symbol name on some architectures. [test code] #include int __initdata foo; int get_foo(void) { return foo; } If you compile it with GCC for riscv or loongarch, modpost will show an incorrect symbol name: WARNING: modpost: vmlinux: section mismatch in reference: get_foo+0x8 (section: .text) -> done (section: .init.data) To get the correct symbol address, the st_value must be added. This issue has never been noticed since commit 93684d3b8062 ("kbuild: include symbol names in section mismatch warnings") presumably because st_value becomes zero on most architectures when the referenced symbol is looked up. It is not true for riscv or loongarch, at least. With this fix, modpost will show the correct symbol name: WARNING: modpost: vmlinux: section mismatch in reference: get_foo+0x8 (section: .text) -> foo (section: .init.data) Signed-off-by: Masahiro Yamada Reviewed-by: Nick Desaulniers --- scripts/mod/modpost.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 973b5e5ae2dd..cb6406f485a9 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -1383,13 +1383,15 @@ static void section_rela(struct module *mod, struct elf_info *elf, const Elf_Rela *rela; for (rela = start; rela < stop; rela++) { + Elf_Sym *tsym; Elf_Addr taddr, r_offset; unsigned int r_type, r_sym; r_offset = TO_NATIVE(rela->r_offset); get_rel_type_and_sym(elf, rela->r_info, &r_type, &r_sym); - taddr = TO_NATIVE(rela->r_addend); + tsym = elf->symtab_start + r_sym; + taddr = tsym->st_value + TO_NATIVE(rela->r_addend); switch (elf->hdr->e_machine) { case EM_RISCV: @@ -1404,7 +1406,7 @@ static void section_rela(struct module *mod, struct elf_info *elf, break; } - check_section_mismatch(mod, elf, elf->symtab_start + r_sym, + check_section_mismatch(mod, elf, tsym, fsecndx, fromsec, r_offset, taddr); } } -- cgit From ba276ce5865b5a22ee96c4c5664bfefd9c1bb593 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 Nov 2023 19:11:04 -0500 Subject: bcachefs: Fix missing locking for dentry->d_parent access Reported-by: Al Viro Signed-off-by: Kent Overstreet --- fs/bcachefs/xattr.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index a39ff0c296ec..79d982674c18 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -552,6 +552,14 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, s.v = v + 1; 
s.defined = true; } else { + /* + * Check if this option was set on the parent - if so, switched + * back to inheriting from the parent: + * + * rename() also has to deal with keeping inherited options up + * to date - see bch2_reinherit_attrs() + */ + spin_lock(&dentry->d_lock); if (!IS_ROOT(dentry)) { struct bch_inode_info *dir = to_bch_ei(d_inode(dentry->d_parent)); @@ -560,6 +568,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, } else { s.v = 0; } + spin_unlock(&dentry->d_lock); s.defined = false; } -- cgit From 9e0be3f50c0e8517d0238b62409c20bcb8cd8785 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 10 Nov 2023 13:07:22 +0100 Subject: linux/export: clean up the IA-64 KSYM_FUNC macro With commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture"), there is no need to keep the IA-64 definition of the KSYM_FUNC macro. Clean up the IA-64 definition of the KSYM_FUNC macro. Signed-off-by: Lukas Bulwahn Reviewed-by: Nathan Chancellor Signed-off-by: Masahiro Yamada --- include/linux/export-internal.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/linux/export-internal.h b/include/linux/export-internal.h index 45fca09b2319..69501e0ec239 100644 --- a/include/linux/export-internal.h +++ b/include/linux/export-internal.h @@ -50,9 +50,7 @@ " .previous" "\n" \ ) -#ifdef CONFIG_IA64 -#define KSYM_FUNC(name) @fptr(name) -#elif defined(CONFIG_PARISC) && defined(CONFIG_64BIT) +#if defined(CONFIG_PARISC) && defined(CONFIG_64BIT) #define KSYM_FUNC(name) P%name #else #define KSYM_FUNC(name) name -- cgit From 76020731d4ee897411ce4a73916ed805ea15d946 Mon Sep 17 00:00:00 2001 From: Simon Glass Date: Fri, 10 Nov 2023 17:28:01 -0700 Subject: kbuild: Move the single quotes for image name Add quotes where UIMAGE_NAME is used, rather than where it is defined. This allows the UIMAGE_NAME variable to be set by the user. Signed-off-by: Simon Glass Signed-off-by: Masahiro Yamada --- scripts/Makefile.lib | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 68d0134bdbf9..1a965fe68e01 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -487,14 +487,14 @@ UIMAGE_OPTS-y ?= UIMAGE_TYPE ?= kernel UIMAGE_LOADADDR ?= arch_must_set_this UIMAGE_ENTRYADDR ?= $(UIMAGE_LOADADDR) -UIMAGE_NAME ?= 'Linux-$(KERNELRELEASE)' +UIMAGE_NAME ?= Linux-$(KERNELRELEASE) quiet_cmd_uimage = UIMAGE $@ cmd_uimage = $(BASH) $(MKIMAGE) -A $(UIMAGE_ARCH) -O linux \ -C $(UIMAGE_COMPRESSION) $(UIMAGE_OPTS-y) \ -T $(UIMAGE_TYPE) \ -a $(UIMAGE_LOADADDR) -e $(UIMAGE_ENTRYADDR) \ - -n $(UIMAGE_NAME) -d $< $@ + -n '$(UIMAGE_NAME)' -d $< $@ # XZ # --------------------------------------------------------------------------- -- cgit From ae1eff0349f2e908fc083630e8441ea6dc434dc0 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 15 Nov 2023 13:16:53 +0900 Subject: kconfig: fix memory leak from range properties Currently, sym_validate_range() duplicates the range string using xstrdup(), which is overwritten by a subsequent sym_calc_value() call. It results in a memory leak. Instead, only the pointer should be copied. Below is a test case, with a summary from Valgrind. 
[Test Kconfig] config FOO int "foo" range 10 20 [Test .config] CONFIG_FOO=0 [Before] LEAK SUMMARY: definitely lost: 3 bytes in 1 blocks indirectly lost: 0 bytes in 0 blocks possibly lost: 0 bytes in 0 blocks still reachable: 17,465 bytes in 21 blocks suppressed: 0 bytes in 0 blocks [After] LEAK SUMMARY: definitely lost: 0 bytes in 0 blocks indirectly lost: 0 bytes in 0 blocks possibly lost: 0 bytes in 0 blocks still reachable: 17,462 bytes in 20 blocks suppressed: 0 bytes in 0 blocks Signed-off-by: Masahiro Yamada --- scripts/kconfig/symbol.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/scripts/kconfig/symbol.c b/scripts/kconfig/symbol.c index 0572330bf8a7..a76925b46ce6 100644 --- a/scripts/kconfig/symbol.c +++ b/scripts/kconfig/symbol.c @@ -122,9 +122,9 @@ static long long sym_get_range_val(struct symbol *sym, int base) static void sym_validate_range(struct symbol *sym) { struct property *prop; + struct symbol *range_sym; int base; long long val, val2; - char str[64]; switch (sym->type) { case S_INT: @@ -140,17 +140,15 @@ static void sym_validate_range(struct symbol *sym) if (!prop) return; val = strtoll(sym->curr.val, NULL, base); - val2 = sym_get_range_val(prop->expr->left.sym, base); + range_sym = prop->expr->left.sym; + val2 = sym_get_range_val(range_sym, base); if (val >= val2) { - val2 = sym_get_range_val(prop->expr->right.sym, base); + range_sym = prop->expr->right.sym; + val2 = sym_get_range_val(range_sym, base); if (val <= val2) return; } - if (sym->type == S_INT) - sprintf(str, "%lld", val2); - else - sprintf(str, "0x%llx", val2); - sym->curr.val = xstrdup(str); + sym->curr.val = range_sym->curr.val; } static void sym_set_changed(struct symbol *sym) -- cgit From 1ffa8602e39b89469dc703ebab7a7e44c33da0f7 Mon Sep 17 00:00:00 2001 From: Nicholas Kazlauskas Date: Wed, 13 Sep 2023 16:18:44 -0400 Subject: drm/amd/display: Guard against invalid RPTR/WPTR being set [WHY] HW can return invalid values on register read, guard against these being set and causing us to access memory out of range and page fault. [HOW] Guard at sync_inbox1 and guard at pushing commands. 
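As a rough illustration of the guard being added (a simplified sketch, not the DMUB code; the struct and field names are stand-ins): hardware-returned ring pointers are range-checked against the ring capacity before they are cached or used.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ring {
        uint32_t capacity;   /* ring size in bytes */
        uint32_t rptr;
        uint32_t wrpt;
};

/* Reject out-of-range pointers instead of trusting the register read;
 * a hung device can return garbage such as 0xFFFFFFFF.
 */
static bool ring_sync(struct ring *rb, uint32_t hw_rptr, uint32_t hw_wptr)
{
        if (hw_rptr > rb->capacity || hw_wptr > rb->capacity)
                return false;            /* report a HW failure to the caller */

        rb->rptr = hw_rptr;
        rb->wrpt = hw_wptr;
        return true;
}

int main(void)
{
        struct ring rb = { .capacity = 4096 };

        printf("%d\n", ring_sync(&rb, 0x100, 0x200));      /* 1: accepted */
        printf("%d\n", ring_sync(&rb, 0xFFFFFFFF, 0x200)); /* 0: rejected */
        return 0;
}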
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Hansen Dsouza Acked-by: Alex Hung Signed-off-by: Nicholas Kazlauskas Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c b/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c index e43e8d4bfe37..5d36f3e5dc2b 100644 --- a/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c +++ b/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c @@ -707,9 +707,16 @@ enum dmub_status dmub_srv_sync_inbox1(struct dmub_srv *dmub) return DMUB_STATUS_INVALID; if (dmub->hw_funcs.get_inbox1_rptr && dmub->hw_funcs.get_inbox1_wptr) { - dmub->inbox1_rb.rptr = dmub->hw_funcs.get_inbox1_rptr(dmub); - dmub->inbox1_rb.wrpt = dmub->hw_funcs.get_inbox1_wptr(dmub); - dmub->inbox1_last_wptr = dmub->inbox1_rb.wrpt; + uint32_t rptr = dmub->hw_funcs.get_inbox1_rptr(dmub); + uint32_t wptr = dmub->hw_funcs.get_inbox1_wptr(dmub); + + if (rptr > dmub->inbox1_rb.capacity || wptr > dmub->inbox1_rb.capacity) { + return DMUB_STATUS_HW_FAILURE; + } else { + dmub->inbox1_rb.rptr = rptr; + dmub->inbox1_rb.wrpt = wptr; + dmub->inbox1_last_wptr = dmub->inbox1_rb.wrpt; + } } return DMUB_STATUS_OK; @@ -743,6 +750,11 @@ enum dmub_status dmub_srv_cmd_queue(struct dmub_srv *dmub, if (!dmub->hw_init) return DMUB_STATUS_INVALID; + if (dmub->inbox1_rb.rptr > dmub->inbox1_rb.capacity || + dmub->inbox1_rb.wrpt > dmub->inbox1_rb.capacity) { + return DMUB_STATUS_HW_FAILURE; + } + if (dmub_rb_push_front(&dmub->inbox1_rb, cmd)) return DMUB_STATUS_OK; -- cgit From 0288603040c38ccfeb5342f34a52673366d90038 Mon Sep 17 00:00:00 2001 From: Victor Lu Date: Wed, 4 Oct 2023 14:24:15 -0400 Subject: drm/amdgpu: Do not program VF copy regs in mmhub v1.8 under SRIOV (v2) MC_VM_AGP_* registers should not be programmed by guest driver. v2: move early return outside of loop Signed-off-by: Victor Lu Reviewed-by: Samir Dhume Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c index ea142611be1c..9b0146732e13 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c @@ -130,6 +130,9 @@ static void mmhub_v1_8_init_system_aperture_regs(struct amdgpu_device *adev) uint64_t value; int i; + if (amdgpu_sriov_vf(adev)) + return; + inst_mask = adev->aid_mask; for_each_inst(i, inst_mask) { /* Program the AGP BAR */ @@ -139,9 +142,6 @@ static void mmhub_v1_8_init_system_aperture_regs(struct amdgpu_device *adev) WREG32_SOC15(MMHUB, i, regMC_VM_AGP_TOP, adev->gmc.agp_end >> 24); - if (amdgpu_sriov_vf(adev)) - return; - /* Program the system aperture low logical page number. */ WREG32_SOC15(MMHUB, i, regMC_VM_SYSTEM_APERTURE_LOW_ADDR, min(adev->gmc.fb_start, adev->gmc.agp_start) >> 18); -- cgit From bdb72185d310fc8049c7ea95221d640e9e7165e5 Mon Sep 17 00:00:00 2001 From: Le Ma Date: Mon, 13 Nov 2023 18:05:34 +0800 Subject: drm/amdgpu: finalizing mem_partitions at the end of GMC v9 sw_fini The valid num_mem_partitions is required during ttm pool fini, thus move the cleanup at the end of the function. 
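A simplified sketch of the ordering rule at play (illustrative only; the struct and helper names are stand-ins, not amdgpu code): state that a later teardown step still reads must not be freed until after that step has run.

#include <stdio.h>
#include <stdlib.h>

struct dev {
        int num_mem_partitions;
        int *mem_partitions;
};

/* stand-in for the TTM pool teardown that still consults the table */
static void pool_fini(struct dev *d)
{
        printf("tearing down %d partitions\n", d->num_mem_partitions);
}

static void sw_fini(struct dev *d)
{
        pool_fini(d);                    /* last user of mem_partitions */

        /* only now is it safe to drop the partition table */
        d->num_mem_partitions = 0;
        free(d->mem_partitions);
        d->mem_partitions = NULL;
}

int main(void)
{
        struct dev d = { .num_mem_partitions = 2 };

        d.mem_partitions = calloc(2, sizeof(*d.mem_partitions));
        sw_fini(&d);
        return 0;
}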
Signed-off-by: Le Ma Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index bde25eb4ed8e..c1f2f166f064 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -2170,8 +2170,6 @@ static int gmc_v9_0_sw_fini(void *handle) if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) amdgpu_gmc_sysfs_fini(adev); - adev->gmc.num_mem_partitions = 0; - kfree(adev->gmc.mem_partitions); amdgpu_gmc_ras_fini(adev); amdgpu_gem_force_release(adev); @@ -2185,6 +2183,9 @@ static int gmc_v9_0_sw_fini(void *handle) amdgpu_bo_free_kernel(&adev->gmc.pdb0_bo, NULL, &adev->gmc.ptr_pdb0); amdgpu_bo_fini(adev); + adev->gmc.num_mem_partitions = 0; + kfree(adev->gmc.mem_partitions); + return 0; } -- cgit From 8a0173cd90984835645022bf1997abd1bcd81aae Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Sun, 12 Nov 2023 09:51:19 +0530 Subject: drm/amdgpu: Address member 'ring' not described in 'amdgpu_ vce, uvd_entity_init()' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the following: drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c:237: warning: Function parameter or member 'ring' not described in 'amdgpu_vce_entity_init' drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c:405: warning: Function parameter or member 'ring' not described in 'amdgpu_uvd_entity_init' Cc: Christian König Cc: Alex Deucher Cc: "Pan, Xinhui" Signed-off-by: Srinivasan Shanmugam Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c index 65949cc7abb9..07d930339b07 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c @@ -398,6 +398,7 @@ int amdgpu_uvd_sw_fini(struct amdgpu_device *adev) * amdgpu_uvd_entity_init - init entity * * @adev: amdgpu_device pointer + * @ring: amdgpu_ring pointer to check * * Initialize the entity used for handle management in the kernel driver. */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c index 0954447f689d..59acf424a078 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c @@ -230,6 +230,7 @@ int amdgpu_vce_sw_fini(struct amdgpu_device *adev) * amdgpu_vce_entity_init - init entity * * @adev: amdgpu_device pointer + * @ring: amdgpu_ring pointer to check * * Initialize the entity used for handle management in the kernel driver. */ -- cgit From a58555359a9f870543aaddef277c3396159895ce Mon Sep 17 00:00:00 2001 From: Fangzhi Zuo Date: Mon, 23 Oct 2023 13:57:32 -0400 Subject: drm/amd/display: Fix DSC not Enabled on Direct MST Sink [WHY & HOW] For the scenario when a dsc capable MST sink device is directly connected, it needs to use max dsc compression as the link bw constraint. 
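A rough sketch of the bandwidth check this implements (illustrative only, with made-up helper names; not the DM code): the end-to-end budget is the bottleneck of the upstream and downstream links, and the requirement compared against it is either the maximum-DSC-compressed rate or the native rate.

#include <stdbool.h>
#include <stdio.h>

static unsigned int min_u(unsigned int a, unsigned int b)
{
        return a < b ? a : b;
}

/* All rates in kbps. dsc_possible means common DSC parameters exist and
 * either the branch can pass DSC through or the sink itself decodes DSC.
 */
static bool mode_fits(unsigned int upstream_kbps, unsigned int downstream_kbps,
                      unsigned int native_kbps, unsigned int max_compressed_kbps,
                      bool dsc_possible)
{
        unsigned int end_to_end = min_u(upstream_kbps, downstream_kbps);
        unsigned int required = dsc_possible ? max_compressed_kbps : native_kbps;

        return required <= end_to_end;
}

int main(void)
{
        /* fast upstream link vs. a slower downstream branch link (example numbers) */
        printf("%d\n", mode_fits(25920000, 8100000, 12000000, 4000000, true));  /* 1 */
        printf("%d\n", mode_fits(25920000, 8100000, 12000000, 4000000, false)); /* 0 */
        return 0;
}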
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Roman Li Acked-by: Alex Hung Signed-off-by: Fangzhi Zuo Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../amd/display/amdgpu_dm/amdgpu_dm_mst_types.c | 29 +++++++++++----------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index d3b13d362eda..11da0eebee6c 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -1604,31 +1604,31 @@ enum dc_status dm_dp_mst_is_port_support_mode( unsigned int upper_link_bw_in_kbps = 0, down_link_bw_in_kbps = 0; unsigned int max_compressed_bw_in_kbps = 0; struct dc_dsc_bw_range bw_range = {0}; - struct drm_dp_mst_topology_mgr *mst_mgr; + uint16_t full_pbn = aconnector->mst_output_port->full_pbn; /* - * check if the mode could be supported if DSC pass-through is supported - * AND check if there enough bandwidth available to support the mode - * with DSC enabled. + * Consider the case with the depth of the mst topology tree is equal or less than 2 + * A. When dsc bitstream can be transmitted along the entire path + * 1. dsc is possible between source and branch/leaf device (common dsc params is possible), AND + * 2. dsc passthrough supported at MST branch, or + * 3. dsc decoding supported at leaf MST device + * Use maximum dsc compression as bw constraint + * B. When dsc bitstream cannot be transmitted along the entire path + * Use native bw as bw constraint */ if (is_dsc_common_config_possible(stream, &bw_range) && - aconnector->mst_output_port->passthrough_aux) { - mst_mgr = aconnector->mst_output_port->mgr; - mutex_lock(&mst_mgr->lock); - + (aconnector->mst_output_port->passthrough_aux || + aconnector->dsc_aux == &aconnector->mst_output_port->aux)) { cur_link_settings = stream->link->verified_link_cap; upper_link_bw_in_kbps = dc_link_bandwidth_kbps(aconnector->dc_link, - &cur_link_settings - ); - down_link_bw_in_kbps = kbps_from_pbn(aconnector->mst_output_port->full_pbn); + &cur_link_settings); + down_link_bw_in_kbps = kbps_from_pbn(full_pbn); /* pick the bottleneck */ end_to_end_bw_in_kbps = min(upper_link_bw_in_kbps, down_link_bw_in_kbps); - mutex_unlock(&mst_mgr->lock); - /* * use the maximum dsc compression bandwidth as the required * bandwidth for the mode @@ -1643,8 +1643,7 @@ enum dc_status dm_dp_mst_is_port_support_mode( /* check if mode could be supported within full_pbn */ bpp = convert_dc_color_depth_into_bpc(stream->timing.display_color_depth) * 3; pbn = drm_dp_calc_pbn_mode(stream->timing.pix_clk_100hz / 10, bpp, false); - - if (pbn > aconnector->mst_output_port->full_pbn) + if (pbn > full_pbn) return DC_FAIL_BANDWIDTH_VALIDATE; } -- cgit From 50d51374b498457c4dea26779d32ccfed12ddaff Mon Sep 17 00:00:00 2001 From: YuanShang Date: Tue, 31 Oct 2023 10:32:37 +0800 Subject: drm/amdgpu: correct chunk_ptr to a pointer to chunk. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The variable "chunk_ptr" should be a pointer pointing to a struct drm_amdgpu_cs_chunk instead of to a pointer of that. 
Signed-off-by: YuanShang Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index df3ecfa9e13f..e50be6500030 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -207,7 +207,7 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p, } for (i = 0; i < p->nchunks; i++) { - struct drm_amdgpu_cs_chunk __user **chunk_ptr = NULL; + struct drm_amdgpu_cs_chunk __user *chunk_ptr = NULL; struct drm_amdgpu_cs_chunk user_chunk; uint32_t __user *cdata; -- cgit From 786c355797b3942725829d02ce9e2e6a9eba11fe Mon Sep 17 00:00:00 2001 From: Asad Kamal Date: Tue, 31 Oct 2023 03:14:02 +0800 Subject: drm/amd/pm: Update metric table for smu v13_0_6 Update pmfw metric table to include pcie instantaneous bandwidth & pcie error counters Signed-off-by: Asad Kamal Reviewed-by: Le Ma Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h index dab35d878a90..fef2d290f3f2 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h @@ -123,7 +123,7 @@ typedef enum { VOLTAGE_GUARDBAND_COUNT } GFX_GUARDBAND_e; -#define SMU_METRICS_TABLE_VERSION 0x8 +#define SMU_METRICS_TABLE_VERSION 0x9 typedef struct __attribute__((packed, aligned(4))) { uint32_t AccumulationCounter; @@ -211,6 +211,14 @@ typedef struct __attribute__((packed, aligned(4))) { //XGMI Data tranfser size uint64_t XgmiReadDataSizeAcc[8];//in KByte uint64_t XgmiWriteDataSizeAcc[8];//in KByte + + //PCIE BW Data and error count + uint32_t PcieBandwidth[4]; + uint32_t PCIeL0ToRecoveryCountAcc; // The Pcie counter itself is accumulated + uint32_t PCIenReplayAAcc; // The Pcie counter itself is accumulated + uint32_t PCIenReplayARolloverCountAcc; // The Pcie counter itself is accumulated + uint32_t PCIeNAKSentCountAcc; // The Pcie counter itself is accumulated + uint32_t PCIeNAKReceivedCountAcc; // The Pcie counter itself is accumulated } MetricsTable_t; #define SMU_VF_METRICS_TABLE_VERSION 0x3 -- cgit From e4d0be18243ca006258b5c7c148796c0b43505c4 Mon Sep 17 00:00:00 2001 From: Asad Kamal Date: Tue, 14 Nov 2023 16:17:17 +0800 Subject: drm/amd/pm: Fill pcie error counters for gpu v1_4 Fill PCIE error counters & instantaneous bandwidth in gpu metrics v1_4 for smu v_13_0_6 Signed-off-by: Asad Kamal Reviewed-by: Le Ma Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 891605d4975f..84fac499cf0c 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -2095,6 +2095,14 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct smu_context *smu, void **table smu_v13_0_6_get_current_pcie_link_speed(smu); gpu_metrics->pcie_bandwidth_acc = SMUQ10_ROUND(metrics->PcieBandwidthAcc[0]); + gpu_metrics->pcie_bandwidth_inst = + SMUQ10_ROUND(metrics->PcieBandwidth[0]); + gpu_metrics->pcie_l0_to_recov_count_acc = + 
metrics->PCIeL0ToRecoveryCountAcc; + gpu_metrics->pcie_replay_count_acc = + metrics->PCIenReplayAAcc; + gpu_metrics->pcie_replay_rover_count_acc = + metrics->PCIenReplayARolloverCountAcc; } gpu_metrics->system_clock_counter = ktime_get_boottime_ns(); -- cgit From 9725a4f9eb495bfa6c7f5ccdb49440ff06dba0a1 Mon Sep 17 00:00:00 2001 From: Muhammad Ahmed Date: Tue, 31 Oct 2023 16:03:21 -0400 Subject: drm/amd/display: Add null checks for 8K60 lightup [WHY & HOW] Add some null checks to fix an issue where 8k60 tiled display fails to light up. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Charlene Liu Acked-by: Alex Hung Signed-off-by: Muhammad Ahmed Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc.c | 2 +- drivers/gpu/drm/amd/display/dc/core/dc_resource.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 7b9bf5cb4529..d8f434738212 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -3178,7 +3178,7 @@ static bool update_planes_and_stream_state(struct dc *dc, struct pipe_ctx *otg_master = resource_get_otg_master_for_stream(&context->res_ctx, context->streams[i]); - if (otg_master->stream->test_pattern.type != DP_TEST_PATTERN_VIDEO_MODE) + if (otg_master && otg_master->stream->test_pattern.type != DP_TEST_PATTERN_VIDEO_MODE) resource_build_test_pattern_params(&context->res_ctx, otg_master); } } diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c index 1d48278cba96..a1f1d1003992 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c @@ -5190,6 +5190,9 @@ bool dc_resource_acquire_secondary_pipe_for_mpc_odm_legacy( sec_next = sec_pipe->next_odm_pipe; sec_prev = sec_pipe->prev_odm_pipe; + if (pri_pipe == NULL) + return false; + *sec_pipe = *pri_pipe; sec_pipe->top_pipe = sec_top; -- cgit From b71f4ade1b8900d30c661d6c27f87c35214c398c Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 8 Nov 2023 13:31:57 -0600 Subject: drm/amd/display: fix a NULL pointer dereference in amdgpu_dm_i2c_xfer() When ddc_service_construct() is called, it explicitly checks both the link type and whether there is something on the link which will dictate whether the pin is marked as hw_supported. If the pin isn't set or the link is not set (such as from unloading/reloading amdgpu in an IGT test) then fail the amdgpu_dm_i2c_xfer() call. 
Cc: stable@vger.kernel.org Fixes: 22676bc500c2 ("drm/amd/display: Fix dmub soft hang for PSR 1") Link: https://github.com/fwupd/fwupd/issues/6327 Signed-off-by: Mario Limonciello Reviewed-by: Harry Wentland Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 6f99f6754c11..5ec7acf65ee1 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -7481,6 +7481,9 @@ static int amdgpu_dm_i2c_xfer(struct i2c_adapter *i2c_adap, int i; int result = -EIO; + if (!ddc_service->ddc_pin || !ddc_service->ddc_pin->hw_info.hw_supported) + return result; + cmd.payloads = kcalloc(num, sizeof(struct i2c_payload), GFP_KERNEL); if (!cmd.payloads) -- cgit From 270b301beca58e427a0fda7523a71a9562e644bb Mon Sep 17 00:00:00 2001 From: José Pekkarinen Date: Tue, 14 Nov 2023 17:27:51 +0200 Subject: drm/amd/display: fix NULL dereference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following patch will fix a minor issue where a debug message is referencing an struct that has just being checked whether is null or not. This has been noticed by using coccinelle, in the following output: drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c:540:25-29: ERROR: aconnector is NULL but dereferenced. Fixes: 5d72e247e58c ("drm/amd/display: switch DC over to the new DRM logging macros") Signed-off-by: José Pekkarinen Signed-off-by: Hamza Mahfooz Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c index ed784cf27d39..c7a29bb737e2 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c @@ -536,11 +536,8 @@ bool dm_helpers_dp_read_dpcd( struct amdgpu_dm_connector *aconnector = link->priv; - if (!aconnector) { - drm_dbg_dp(aconnector->base.dev, - "Failed to find connector for link!\n"); + if (!aconnector) return false; - } return drm_dp_dpcd_read(&aconnector->dm_dp_aux.aux, address, data, size) == size; -- cgit From 435f5b369657cffee4b04db1f5805b48599f4dbe Mon Sep 17 00:00:00 2001 From: Tianci Yin Date: Wed, 1 Nov 2023 09:47:13 +0800 Subject: drm/amd/display: Enable fast plane updates on DCN3.2 and above [WHY] When cursor moves across screen boarder, lag cursor observed, since subvp settings need to sync up with vblank that causes cursor updates being delayed. [HOW] Enable fast plane updates on DCN3.2 to fix it. 
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Aurabindo Pillai Acked-by: Alex Hung Signed-off-by: Tianci Yin Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 5ec7acf65ee1..9581510d3740 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -9606,14 +9606,14 @@ static bool should_reset_plane(struct drm_atomic_state *state, struct drm_plane *other; struct drm_plane_state *old_other_state, *new_other_state; struct drm_crtc_state *new_crtc_state; + struct amdgpu_device *adev = drm_to_adev(plane->dev); int i; /* - * TODO: Remove this hack once the checks below are sufficient - * enough to determine when we need to reset all the planes on - * the stream. + * TODO: Remove this hack for all asics once it proves that the + * fast updates works fine on DCN3.2+. */ - if (state->allow_modeset) + if (adev->ip_versions[DCE_HWIP][0] < IP_VERSION(3, 2, 0) && state->allow_modeset) return true; /* Exit early if we know that we're adding or removing the plane. */ -- cgit From 923bbfe6c888812db1088d684bd30c24036226d2 Mon Sep 17 00:00:00 2001 From: Paul Hsieh Date: Wed, 25 Oct 2023 10:53:35 +0800 Subject: drm/amd/display: Clear dpcd_sink_ext_caps if not set [WHY] Some eDP panels' ext caps don't set initial values and the value of dpcd_addr (0x317) is random. It means that sometimes the eDP can be OLED, miniLED and etc, and cause incorrect backlight control interface. [HOW] Add remove_sink_ext_caps to remove sink ext caps (HDR, OLED and etc) Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Anthony Koo Acked-by: Alex Hung Signed-off-by: Paul Hsieh Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dc_types.h | 1 + drivers/gpu/drm/amd/display/dc/link/link_detection.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/dc_types.h b/drivers/gpu/drm/amd/display/dc/dc_types.h index cea666ea66c6..fcb825e4f1bb 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_types.h +++ b/drivers/gpu/drm/amd/display/dc/dc_types.h @@ -177,6 +177,7 @@ struct dc_panel_patch { unsigned int disable_fams; unsigned int skip_avmute; unsigned int mst_start_top_delay; + unsigned int remove_sink_ext_caps; }; struct dc_edid_caps { diff --git a/drivers/gpu/drm/amd/display/dc/link/link_detection.c b/drivers/gpu/drm/amd/display/dc/link/link_detection.c index d6f0f857c05a..f2fe523f914f 100644 --- a/drivers/gpu/drm/amd/display/dc/link/link_detection.c +++ b/drivers/gpu/drm/amd/display/dc/link/link_detection.c @@ -1088,6 +1088,9 @@ static bool detect_link_and_local_sink(struct dc_link *link, if (sink->edid_caps.panel_patch.skip_scdc_overwrite) link->ctx->dc->debug.hdmi20_disable = true; + if (sink->edid_caps.panel_patch.remove_sink_ext_caps) + link->dpcd_sink_ext_caps.raw = 0; + if (dc_is_hdmi_signal(link->connector_signal)) read_scdc_caps(link->ddc, link->local_sink); -- cgit From 07ee43faeb7eb088e49a7549fcabcae94c443d3b Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 13 Nov 2023 14:34:55 +0800 Subject: drm/amdgpu: fix ras err_data null pointer issue in amdgpu_ras.c fix ras err_data null pointer issue in amdgpu_ras.c Fixes: 8cc0f5669eb6 ("drm/amdgpu: Support multiple error query modes") Signed-off-by: Yang Wang 
Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 84e5987b14e0..a3dc68e98910 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1188,7 +1188,7 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, } if (block_obj->hw_ops->query_ras_error_count) - block_obj->hw_ops->query_ras_error_count(adev, &err_data); + block_obj->hw_ops->query_ras_error_count(adev, err_data); if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) || (info->head.block == AMDGPU_RAS_BLOCK__GFX) || -- cgit From 0f216364625cb453b4f933deacfa92df7f2a2fc9 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Fri, 10 Nov 2023 13:15:39 +0530 Subject: drm/amd/pm: Don't send unload message for reset No need to notify about unload during reset. Also remove the FW version check. Signed-off-by: Lijo Lazar Reviewed-by: Yang Wang Reviewed-by: Asad Kamal Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 84fac499cf0c..0e5a77c3c2e2 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -1454,7 +1454,7 @@ static int smu_v13_0_6_register_irq_handler(struct smu_context *smu) static int smu_v13_0_6_notify_unload(struct smu_context *smu) { - if (smu->smc_fw_version <= 0x553500) + if (amdgpu_in_reset(smu->adev)) return 0; dev_dbg(smu->adev->dev, "Notify PMFW about driver unload"); -- cgit From 5e8a0d3598b47ee5a57708072bdef08816264538 Mon Sep 17 00:00:00 2001 From: Duncan Ma Date: Wed, 25 Oct 2023 19:07:21 -0400 Subject: drm/amd/display: Negate IPS allow and commit bits [WHY] On s0i3, IPS mask isn't saved and restored. It is reset to zero on exit. If it is cleared unexpectedly, driver will proceed operations while DCN is in IPS2 and cause a hang. [HOW] Negate the bit logic. Default value of zero indicates it is still in IPS2. Driver must poll for the bit to assert. 
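To illustrate why the inverted polarity helps (a sketch with an assumed mask value, not the DMUB interface): after s0i3 the scratch register reads back as zero, and with the new polarity zero still means "not yet committed", so the driver simply keeps polling until hardware asserts the bit.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define IPS2_COMMIT_MASK (1u << 1)   /* assumed bit position for this sketch */

static uint32_t idle_state;          /* stands in for the scratch register */

static uint32_t read_idle_state(void)
{
        return idle_state;
}

static bool wait_for_ips2_exit(unsigned int max_polls)
{
        for (unsigned int i = 0; i < max_polls; i++) {
                /* 0 after an s0i3 reset still reads as "not committed",
                 * so a cleared register can never be mistaken for "done".
                 */
                if (read_idle_state() & IPS2_COMMIT_MASK)
                        return true;

                if (i == 3)          /* simulate firmware finishing on a later poll */
                        idle_state |= IPS2_COMMIT_MASK;
        }
        return false;
}

int main(void)
{
        printf("exited IPS2: %d\n", wait_for_ips2_exit(10));
        return 0;
}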
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Charlene Liu Acked-by: Alex Hung Signed-off-by: Duncan Ma Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c | 18 +++++++++--------- drivers/gpu/drm/amd/display/dc/core/dc.c | 4 ++-- drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c | 10 +++++----- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c index 0fa4fcd00de2..507a7cf56711 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c @@ -820,22 +820,22 @@ static void dcn35_set_idle_state(struct clk_mgr *clk_mgr_base, bool allow_idle) if (dc->config.disable_ips == DMUB_IPS_ENABLE || dc->config.disable_ips == DMUB_IPS_DISABLE_DYNAMIC) { - val |= DMUB_IPS1_ALLOW_MASK; - val |= DMUB_IPS2_ALLOW_MASK; - } else if (dc->config.disable_ips == DMUB_IPS_DISABLE_IPS1) { val = val & ~DMUB_IPS1_ALLOW_MASK; val = val & ~DMUB_IPS2_ALLOW_MASK; - } else if (dc->config.disable_ips == DMUB_IPS_DISABLE_IPS2) { - val |= DMUB_IPS1_ALLOW_MASK; - val = val & ~DMUB_IPS2_ALLOW_MASK; - } else if (dc->config.disable_ips == DMUB_IPS_DISABLE_IPS2_Z10) { + } else if (dc->config.disable_ips == DMUB_IPS_DISABLE_IPS1) { val |= DMUB_IPS1_ALLOW_MASK; val |= DMUB_IPS2_ALLOW_MASK; + } else if (dc->config.disable_ips == DMUB_IPS_DISABLE_IPS2) { + val = val & ~DMUB_IPS1_ALLOW_MASK; + val |= DMUB_IPS2_ALLOW_MASK; + } else if (dc->config.disable_ips == DMUB_IPS_DISABLE_IPS2_Z10) { + val = val & ~DMUB_IPS1_ALLOW_MASK; + val = val & ~DMUB_IPS2_ALLOW_MASK; } if (!allow_idle) { - val = val & ~DMUB_IPS1_ALLOW_MASK; - val = val & ~DMUB_IPS2_ALLOW_MASK; + val |= DMUB_IPS1_ALLOW_MASK; + val |= DMUB_IPS2_ALLOW_MASK; } dcn35_smu_write_ips_scratch(clk_mgr, val); diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index d8f434738212..76b47f178127 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -4934,8 +4934,8 @@ bool dc_dmub_is_ips_idle_state(struct dc *dc) if (dc->hwss.get_idle_state) idle_state = dc->hwss.get_idle_state(dc); - if ((idle_state & DMUB_IPS1_ALLOW_MASK) || - (idle_state & DMUB_IPS2_ALLOW_MASK)) + if (!(idle_state & DMUB_IPS1_ALLOW_MASK) || + !(idle_state & DMUB_IPS2_ALLOW_MASK)) return true; return false; diff --git a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c index e4c007203318..0e07699c1e83 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c +++ b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c @@ -1202,11 +1202,11 @@ void dc_dmub_srv_exit_low_power_state(const struct dc *dc) allow_state = dc->hwss.get_idle_state(dc); dc->hwss.set_idle_state(dc, false); - if (allow_state & DMUB_IPS2_ALLOW_MASK) { + if (!(allow_state & DMUB_IPS2_ALLOW_MASK)) { // Wait for evaluation time udelay(dc->debug.ips2_eval_delay_us); commit_state = dc->hwss.get_idle_state(dc); - if (commit_state & DMUB_IPS2_COMMIT_MASK) { + if (!(commit_state & DMUB_IPS2_COMMIT_MASK)) { // Tell PMFW to exit low power state dc->clk_mgr->funcs->exit_low_power_state(dc->clk_mgr); @@ -1216,7 +1216,7 @@ void dc_dmub_srv_exit_low_power_state(const struct dc *dc) for (i = 0; i < max_num_polls; ++i) { commit_state = dc->hwss.get_idle_state(dc); - if (!(commit_state & DMUB_IPS2_COMMIT_MASK)) + if (commit_state & 
DMUB_IPS2_COMMIT_MASK) break; udelay(1); @@ -1235,10 +1235,10 @@ void dc_dmub_srv_exit_low_power_state(const struct dc *dc) } dc_dmub_srv_notify_idle(dc, false); - if (allow_state & DMUB_IPS1_ALLOW_MASK) { + if (!(allow_state & DMUB_IPS1_ALLOW_MASK)) { for (i = 0; i < max_num_polls; ++i) { commit_state = dc->hwss.get_idle_state(dc); - if (!(commit_state & DMUB_IPS1_COMMIT_MASK)) + if (commit_state & DMUB_IPS1_COMMIT_MASK) break; udelay(1); -- cgit From 9ddea8c9775d9379d71e6ac1519c552461b90b07 Mon Sep 17 00:00:00 2001 From: Shiwu Zhang Date: Tue, 31 Oct 2023 11:02:49 +0800 Subject: drm/amdgpu: add and populate the port num into xgmi topology info The port num info is firstly introduced with 20.00.01.13 xgmi ta and make them as part of topology info. Signed-off-by: Shiwu Zhang Reviewed-by: Le Ma Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 5 +++++ drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 1 + 2 files changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 32b701cc0376..a21045d018f2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1473,6 +1473,11 @@ int psp_xgmi_get_topology_info(struct psp_context *psp, topology->nodes[i].num_links = (requires_reflection && topology->nodes[i].num_links) ? topology->nodes[i].num_links : node_num_links; } + /* popluate the connected port num info if supported and available */ + if (ta_port_num_support && topology->nodes[i].num_links) { + memcpy(topology->nodes[i].port_num, link_extend_info_output->nodes[i].port_num, + sizeof(struct xgmi_connected_port_num) * TA_XGMI__MAX_PORT_NUM); + } /* reflect the topology information for bi-directionality */ if (requires_reflection && topology->nodes[i].num_hops) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index 5d36ad3f48c7..c4d9cbde55b9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -150,6 +150,7 @@ struct psp_xgmi_node_info { uint8_t is_sharing_enabled; enum ta_xgmi_assigned_sdma_engine sdma_engine; uint8_t num_links; + struct xgmi_connected_port_num port_num[TA_XGMI__MAX_PORT_NUM]; }; struct psp_xgmi_topology_info { -- cgit From 5911d02cac70d7fb52009fbd37423e63f8f6f9bc Mon Sep 17 00:00:00 2001 From: Lewis Huang Date: Thu, 19 Oct 2023 17:22:21 +0800 Subject: drm/amd/display: Change the DMCUB mailbox memory location from FB to inbox [WHY] Flush command sent to DMCUB spends more time for execution on a dGPU than on an APU. This causes cursor lag when using high refresh rate mouses. [HOW] 1. Change the DMCUB mailbox memory location from FB to inbox. 2. Only change windows memory to inbox. 
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Nicholas Kazlauskas Acked-by: Alex Hung Signed-off-by: Lewis Huang Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 13 ++++----- drivers/gpu/drm/amd/display/dmub/dmub_srv.h | 22 ++++++++++------ drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c | 32 +++++++++++++++++------ 3 files changed, 45 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 9581510d3740..ee97814ebd99 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -2079,7 +2079,7 @@ static int dm_dmub_sw_init(struct amdgpu_device *adev) struct dmub_srv_create_params create_params; struct dmub_srv_region_params region_params; struct dmub_srv_region_info region_info; - struct dmub_srv_fb_params fb_params; + struct dmub_srv_memory_params memory_params; struct dmub_srv_fb_info *fb_info; struct dmub_srv *dmub_srv; const struct dmcub_firmware_header_v1_0 *hdr; @@ -2182,6 +2182,7 @@ static int dm_dmub_sw_init(struct amdgpu_device *adev) adev->dm.dmub_fw->data + le32_to_cpu(hdr->header.ucode_array_offset_bytes) + PSP_HEADER_BYTES; + region_params.is_mailbox_in_inbox = false; status = dmub_srv_calc_region_info(dmub_srv, ®ion_params, ®ion_info); @@ -2205,10 +2206,10 @@ static int dm_dmub_sw_init(struct amdgpu_device *adev) return r; /* Rebase the regions on the framebuffer address. */ - memset(&fb_params, 0, sizeof(fb_params)); - fb_params.cpu_addr = adev->dm.dmub_bo_cpu_addr; - fb_params.gpu_addr = adev->dm.dmub_bo_gpu_addr; - fb_params.region_info = ®ion_info; + memset(&memory_params, 0, sizeof(memory_params)); + memory_params.cpu_fb_addr = adev->dm.dmub_bo_cpu_addr; + memory_params.gpu_fb_addr = adev->dm.dmub_bo_gpu_addr; + memory_params.region_info = ®ion_info; adev->dm.dmub_fb_info = kzalloc(sizeof(*adev->dm.dmub_fb_info), GFP_KERNEL); @@ -2220,7 +2221,7 @@ static int dm_dmub_sw_init(struct amdgpu_device *adev) return -ENOMEM; } - status = dmub_srv_calc_fb_info(dmub_srv, &fb_params, fb_info); + status = dmub_srv_calc_mem_info(dmub_srv, &memory_params, fb_info); if (status != DMUB_STATUS_OK) { DRM_ERROR("Error calculating DMUB FB info: %d\n", status); return -EINVAL; diff --git a/drivers/gpu/drm/amd/display/dmub/dmub_srv.h b/drivers/gpu/drm/amd/display/dmub/dmub_srv.h index 9665ada0f894..df63aa8f01e9 100644 --- a/drivers/gpu/drm/amd/display/dmub/dmub_srv.h +++ b/drivers/gpu/drm/amd/display/dmub/dmub_srv.h @@ -195,6 +195,7 @@ struct dmub_srv_region_params { uint32_t vbios_size; const uint8_t *fw_inst_const; const uint8_t *fw_bss_data; + bool is_mailbox_in_inbox; }; /** @@ -214,20 +215,25 @@ struct dmub_srv_region_params { */ struct dmub_srv_region_info { uint32_t fb_size; + uint32_t inbox_size; uint8_t num_regions; struct dmub_region regions[DMUB_WINDOW_TOTAL]; }; /** - * struct dmub_srv_fb_params - parameters used for driver fb setup + * struct dmub_srv_memory_params - parameters used for driver fb setup * @region_info: region info calculated by dmub service - * @cpu_addr: base cpu address for the framebuffer - * @gpu_addr: base gpu virtual address for the framebuffer + * @cpu_fb_addr: base cpu address for the framebuffer + * @cpu_inbox_addr: base cpu address for the gart + * @gpu_fb_addr: base gpu virtual address for the framebuffer + * @gpu_inbox_addr: base gpu virtual address for the gart */ -struct dmub_srv_fb_params { +struct 
dmub_srv_memory_params { const struct dmub_srv_region_info *region_info; - void *cpu_addr; - uint64_t gpu_addr; + void *cpu_fb_addr; + void *cpu_inbox_addr; + uint64_t gpu_fb_addr; + uint64_t gpu_inbox_addr; }; /** @@ -563,8 +569,8 @@ dmub_srv_calc_region_info(struct dmub_srv *dmub, * DMUB_STATUS_OK - success * DMUB_STATUS_INVALID - unspecified error */ -enum dmub_status dmub_srv_calc_fb_info(struct dmub_srv *dmub, - const struct dmub_srv_fb_params *params, +enum dmub_status dmub_srv_calc_mem_info(struct dmub_srv *dmub, + const struct dmub_srv_memory_params *params, struct dmub_srv_fb_info *out); /** diff --git a/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c b/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c index 5d36f3e5dc2b..22fc4ba96def 100644 --- a/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c +++ b/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c @@ -434,7 +434,7 @@ dmub_srv_calc_region_info(struct dmub_srv *dmub, uint32_t fw_state_size = DMUB_FW_STATE_SIZE; uint32_t trace_buffer_size = DMUB_TRACE_BUFFER_SIZE; uint32_t scratch_mem_size = DMUB_SCRATCH_MEM_SIZE; - + uint32_t previous_top = 0; if (!dmub->sw_init) return DMUB_STATUS_INVALID; @@ -459,8 +459,15 @@ dmub_srv_calc_region_info(struct dmub_srv *dmub, bios->base = dmub_align(stack->top, 256); bios->top = bios->base + params->vbios_size; - mail->base = dmub_align(bios->top, 256); - mail->top = mail->base + DMUB_MAILBOX_SIZE; + if (params->is_mailbox_in_inbox) { + mail->base = 0; + mail->top = mail->base + DMUB_MAILBOX_SIZE; + previous_top = bios->top; + } else { + mail->base = dmub_align(bios->top, 256); + mail->top = mail->base + DMUB_MAILBOX_SIZE; + previous_top = mail->top; + } fw_info = dmub_get_fw_meta_info(params); @@ -479,7 +486,7 @@ dmub_srv_calc_region_info(struct dmub_srv *dmub, dmub->fw_version = fw_info->fw_version; } - trace_buff->base = dmub_align(mail->top, 256); + trace_buff->base = dmub_align(previous_top, 256); trace_buff->top = trace_buff->base + dmub_align(trace_buffer_size, 64); fw_state->base = dmub_align(trace_buff->top, 256); @@ -490,11 +497,14 @@ dmub_srv_calc_region_info(struct dmub_srv *dmub, out->fb_size = dmub_align(scratch_mem->top, 4096); + if (params->is_mailbox_in_inbox) + out->inbox_size = dmub_align(mail->top, 4096); + return DMUB_STATUS_OK; } -enum dmub_status dmub_srv_calc_fb_info(struct dmub_srv *dmub, - const struct dmub_srv_fb_params *params, +enum dmub_status dmub_srv_calc_mem_info(struct dmub_srv *dmub, + const struct dmub_srv_memory_params *params, struct dmub_srv_fb_info *out) { uint8_t *cpu_base; @@ -509,8 +519,8 @@ enum dmub_status dmub_srv_calc_fb_info(struct dmub_srv *dmub, if (params->region_info->num_regions != DMUB_NUM_WINDOWS) return DMUB_STATUS_INVALID; - cpu_base = (uint8_t *)params->cpu_addr; - gpu_base = params->gpu_addr; + cpu_base = (uint8_t *)params->cpu_fb_addr; + gpu_base = params->gpu_fb_addr; for (i = 0; i < DMUB_NUM_WINDOWS; ++i) { const struct dmub_region *reg = @@ -518,6 +528,12 @@ enum dmub_status dmub_srv_calc_fb_info(struct dmub_srv *dmub, out->fb[i].cpu_addr = cpu_base + reg->base; out->fb[i].gpu_addr = gpu_base + reg->base; + + if (i == DMUB_WINDOW_4_MAILBOX && params->cpu_inbox_addr != 0) { + out->fb[i].cpu_addr = (uint8_t *)params->cpu_inbox_addr + reg->base; + out->fb[i].gpu_addr = params->gpu_inbox_addr + reg->base; + } + out->fb[i].size = reg->top - reg->base; } -- cgit From 0ee057e66c4b782809a0a9265cdac5542e646706 Mon Sep 17 00:00:00 2001 From: Nicholas Susanto Date: Wed, 1 Nov 2023 15:30:10 -0400 Subject: drm/amd/display: Fix encoder disable logic 
[WHY] DENTIST hangs when OTG is off and encoder is on. We were not disabling the encoder properly when switching from extended mode to external monitor only. [HOW] Disable the encoder using an existing enable/disable fifo helper instead of enc35_stream_encoder_enable. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Nicholas Kazlauskas Acked-by: Alex Hung Signed-off-by: Nicholas Susanto Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/dc/dcn35/dcn35_dio_stream_encoder.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_dio_stream_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_dio_stream_encoder.c index 001f9eb66920..62a8f0b56006 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_dio_stream_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_dio_stream_encoder.c @@ -261,12 +261,6 @@ static void enc35_stream_encoder_enable( /* invalid mode ! */ ASSERT_CRITICAL(false); } - - REG_UPDATE(DIG_FE_CLK_CNTL, DIG_FE_CLK_EN, 1); - REG_UPDATE(DIG_FE_EN_CNTL, DIG_FE_ENABLE, 1); - } else { - REG_UPDATE(DIG_FE_EN_CNTL, DIG_FE_ENABLE, 0); - REG_UPDATE(DIG_FE_CLK_CNTL, DIG_FE_CLK_EN, 0); } } @@ -436,6 +430,8 @@ static void enc35_disable_fifo(struct stream_encoder *enc) struct dcn10_stream_encoder *enc1 = DCN10STRENC_FROM_STRENC(enc); REG_UPDATE(DIG_FIFO_CTRL0, DIG_FIFO_ENABLE, 0); + REG_UPDATE(DIG_FE_EN_CNTL, DIG_FE_ENABLE, 0); + REG_UPDATE(DIG_FE_CLK_CNTL, DIG_FE_CLK_EN, 0); } static void enc35_enable_fifo(struct stream_encoder *enc) @@ -443,6 +439,8 @@ static void enc35_enable_fifo(struct stream_encoder *enc) struct dcn10_stream_encoder *enc1 = DCN10STRENC_FROM_STRENC(enc); REG_UPDATE(DIG_FIFO_CTRL0, DIG_FIFO_READ_START_LEVEL, 0x7); + REG_UPDATE(DIG_FE_CLK_CNTL, DIG_FE_CLK_EN, 1); + REG_UPDATE(DIG_FE_EN_CNTL, DIG_FE_ENABLE, 1); enc35_reset_fifo(enc, true); enc35_reset_fifo(enc, false); -- cgit From 564ca1b53ece166b5915c2ac90f3e9313100f4ea Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 14 Nov 2023 11:36:56 -0500 Subject: drm/amdgpu/gmc11: fix logic typo in AGP check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Should be && rather than ||. Fixes: b2e1cbe6281f ("drm/amdgpu/gmc11: disable AGP on GC 11.5") Acked-by: Christian König Reviewed-by: Mario Limonciello Tested-by: Mario Limonciello # PHX & Navi33 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index 6dce9b29f675..ba4c82f5e617 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -640,7 +640,7 @@ static void gmc_v11_0_vram_gtt_location(struct amdgpu_device *adev, amdgpu_gmc_set_agp_default(adev, mc); amdgpu_gmc_vram_location(adev, &adev->gmc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_HIGH); - if (!amdgpu_sriov_vf(adev) || + if (!amdgpu_sriov_vf(adev) && (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0))) amdgpu_gmc_agp_location(adev, mc); -- cgit From 6ba5b613837c5d997ad8297b22fc46cd0be58d76 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 9 Nov 2023 15:31:00 -0500 Subject: drm/amdgpu: add a module parameter to control the AGP aperture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a module parameter to control the AGP aperture. 
The AGP aperture is an aperture in the GPU's internal address space which provides direct non-paged access to the platform address space. This access is non-snooped so only uncached memory can be accessed. Add a knob so that we can toggle this for debugging. Fixes: 67318cb84341 ("drm/amdgpu/gmc11: set gart placement GC11") Acked-by: Christian König Reviewed-by: Mario Limonciello Tested-by: Mario Limonciello # PHX & Navi33 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 ++++++++++ drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 3 ++- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 2 +- 5 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index afec09930efa..9d92ca157677 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -248,6 +248,7 @@ extern int amdgpu_umsch_mm; extern int amdgpu_seamless; extern int amdgpu_user_partt_mode; +extern int amdgpu_agp; #define AMDGPU_VM_MAX_NUM_CTX 4096 #define AMDGPU_SG_THRESHOLD (256*1024*1024) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 3095a3a864af..8f24cabe2155 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -207,6 +207,7 @@ int amdgpu_user_partt_mode = AMDGPU_AUTO_COMPUTE_PARTITION_MODE; int amdgpu_umsch_mm; int amdgpu_seamless = -1; /* auto */ uint amdgpu_debug_mask; +int amdgpu_agp = -1; /* auto */ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work); @@ -961,6 +962,15 @@ module_param_named(seamless, amdgpu_seamless, int, 0444); MODULE_PARM_DESC(debug_mask, "debug options for amdgpu, disabled by default"); module_param_named(debug_mask, amdgpu_debug_mask, uint, 0444); +/** + * DOC: agp (int) + * Enable the AGP aperture. This provides an aperture in the GPU's internal + * address space for direct access to system memory. Note that these accesses + * are non-snooped, so they are only used for access to uncached memory. + */ +MODULE_PARM_DESC(agp, "AGP (-1 = auto (default), 0 = disable, 1 = enable)"); +module_param_named(agp, amdgpu_agp, int, 0444); + /* These devices are not supported by amdgpu. 
* They are supported by the mach64, r128, radeon drivers */ diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 0ec7b061d7c2..23483bffa1c7 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -675,7 +675,7 @@ static void gmc_v10_0_vram_gtt_location(struct amdgpu_device *adev, amdgpu_gmc_set_agp_default(adev, mc); amdgpu_gmc_vram_location(adev, &adev->gmc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT); - if (!amdgpu_sriov_vf(adev)) + if (!amdgpu_sriov_vf(adev) && (amdgpu_agp != 0)) amdgpu_gmc_agp_location(adev, mc); /* base offset of vram pages */ diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index ba4c82f5e617..e1078b53e942 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -641,7 +641,8 @@ static void gmc_v11_0_vram_gtt_location(struct amdgpu_device *adev, amdgpu_gmc_vram_location(adev, &adev->gmc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_HIGH); if (!amdgpu_sriov_vf(adev) && - (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0))) + (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0)) && + (amdgpu_agp != 0)) amdgpu_gmc_agp_location(adev, mc); /* base offset of vram pages */ diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index c1f2f166f064..1638c3177799 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1630,7 +1630,7 @@ static void gmc_v9_0_vram_gtt_location(struct amdgpu_device *adev, } else { amdgpu_gmc_vram_location(adev, mc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT); - if (!amdgpu_sriov_vf(adev)) + if (!amdgpu_sriov_vf(adev) && (amdgpu_agp != 0)) amdgpu_gmc_agp_location(adev, mc); } /* base offset of vram pages */ -- cgit From 0db062eac3e0846c6f120867a79df83b4c3db46f Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 9 Nov 2023 15:34:19 -0500 Subject: drm/amdgpu/gmc11: disable AGP aperture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We've had misc reports of random IOMMU page faults when this is used. It's just a rarely used optimization anyway, so let's just disable it. It can still be toggled via the module parameter for testing. 
v2: leave it configurable via module parameter Fixes: 67318cb84341 ("drm/amdgpu/gmc11: set gart placement GC11") Reviewed-by: Yang Wang (v1) Acked-by: Christian König Reviewed-by: Mario Limonciello Tested-by: Mario Limonciello # PHX & Navi33 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index e1078b53e942..23d7b548d13f 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -642,7 +642,7 @@ static void gmc_v11_0_vram_gtt_location(struct amdgpu_device *adev, amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_HIGH); if (!amdgpu_sriov_vf(adev) && (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0)) && - (amdgpu_agp != 0)) + (amdgpu_agp == 1)) amdgpu_gmc_agp_location(adev, mc); /* base offset of vram pages */ -- cgit From 61fc93695bbfde218d5f9f0b8051ce36eb649669 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 9 Nov 2023 15:38:54 -0500 Subject: drm/amdgpu/gmc10: disable AGP aperture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We've had misc reports of random IOMMU page faults when this is used. It's just a rarely used optimization anyway, so let's just disable it. It can still be toggled via the module parameter for testing. v2: leave it configurable via module parameter Reviewed-by: Yang Wang (v1) Acked-by: Christian König Reviewed-by: Mario Limonciello Tested-by: Mario Limonciello # PHX & Navi33 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 23483bffa1c7..a5a05c16c10d 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -675,7 +675,7 @@ static void gmc_v10_0_vram_gtt_location(struct amdgpu_device *adev, amdgpu_gmc_set_agp_default(adev, mc); amdgpu_gmc_vram_location(adev, &adev->gmc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT); - if (!amdgpu_sriov_vf(adev) && (amdgpu_agp != 0)) + if (!amdgpu_sriov_vf(adev) && (amdgpu_agp == 1)) amdgpu_gmc_agp_location(adev, mc); /* base offset of vram pages */ -- cgit From e8c2d3e25b844ad8f7c8b269a7cfd65285329264 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 9 Nov 2023 15:40:00 -0500 Subject: drm/amdgpu/gmc9: disable AGP aperture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We've had misc reports of random IOMMU page faults when this is used. It's just a rarely used optimization anyway, so let's just disable it. It can still be toggled via the module parameter for testing. 
v2: leave it configurable via module parameter Reviewed-by: Yang Wang (v1) Acked-by: Christian König Reviewed-by: Mario Limonciello Tested-by: Mario Limonciello # PHX & Navi33 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 1638c3177799..2ac5820e9c92 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1630,7 +1630,7 @@ static void gmc_v9_0_vram_gtt_location(struct amdgpu_device *adev, } else { amdgpu_gmc_vram_location(adev, mc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT); - if (!amdgpu_sriov_vf(adev) && (amdgpu_agp != 0)) + if (!amdgpu_sriov_vf(adev) && (amdgpu_agp == 1)) amdgpu_gmc_agp_location(adev, mc); } /* base offset of vram pages */ -- cgit From 6fc45b6ed921dc00dfb264dc08c7d67ee63d2656 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 17 Nov 2023 18:21:14 +0100 Subject: dm-delay: fix a race between delay_presuspend and delay_bio In delay_presuspend, we set the atomic variable may_delay and then stop the timer and flush pending bios. The intention here is to prevent the delay target from re-arming the timer again. However, this test is racy. Suppose that one thread goes to delay_bio, sees that dc->may_delay is one and proceeds; now, another thread executes delay_presuspend, it sets dc->may_delay to zero, deletes the timer and flushes pending bios. Then, the first thread continues and adds the bio to delayed->list despite the fact that dc->may_delay is false. Fix this bug by changing may_delay's type from atomic_t to bool and only access it while holding the delayed_bios_lock mutex. Note that we don't have to grab the mutex in delay_resume because there are no bios in flight at this point. 
Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm-delay.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index efd510984e25..2d6b900e4353 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -33,7 +33,7 @@ struct delay_c { struct work_struct flush_expired_bios; struct list_head delayed_bios; struct task_struct *worker; - atomic_t may_delay; + bool may_delay; struct delay_class read; struct delay_class write; @@ -236,7 +236,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->private = dc; INIT_LIST_HEAD(&dc->delayed_bios); - atomic_set(&dc->may_delay, 1); + dc->may_delay = true; dc->argc = argc; ret = delay_class_ctr(ti, &dc->read, argv); @@ -312,7 +312,7 @@ static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio) struct dm_delay_info *delayed; unsigned long expires = 0; - if (!c->delay || !atomic_read(&dc->may_delay)) + if (!c->delay) return DM_MAPIO_REMAPPED; delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info)); @@ -321,6 +321,10 @@ static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio) delayed->expires = expires = jiffies + msecs_to_jiffies(c->delay); mutex_lock(&delayed_bios_lock); + if (unlikely(!dc->may_delay)) { + mutex_unlock(&delayed_bios_lock); + return DM_MAPIO_REMAPPED; + } c->ops++; list_add_tail(&delayed->list, &dc->delayed_bios); mutex_unlock(&delayed_bios_lock); @@ -337,7 +341,9 @@ static void delay_presuspend(struct dm_target *ti) { struct delay_c *dc = ti->private; - atomic_set(&dc->may_delay, 0); + mutex_lock(&delayed_bios_lock); + dc->may_delay = false; + mutex_unlock(&delayed_bios_lock); if (delay_is_fast(dc)) flush_delayed_bios_fast(dc, true); @@ -351,7 +357,7 @@ static void delay_resume(struct dm_target *ti) { struct delay_c *dc = ti->private; - atomic_set(&dc->may_delay, 1); + dc->may_delay = true; } static int delay_map(struct dm_target *ti, struct bio *bio) -- cgit From 38cfff568169ff9f99784948f79f62ca1af5a187 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 17 Nov 2023 18:22:47 +0100 Subject: dm-delay: fix bugs introduced by kthread mode This commit fixes the following bugs introduced by commit 70bbeb29fab0 ("dm delay: for short delays, use kthread instead of timers and wq"): * the function flush_worker_fn has no exit path - on unload, this function will just loop and consume 100% CPU without any progress * the wake-up mechanism in flush_worker_fn is racy - a wake up will be missed if the process adds entries to the delayed_bios list just before set_current_state(TASK_INTERRUPTIBLE) * flush_delayed_bios_fast submits a bio while holding a global mutex; this may deadlock if we have multiple stacked dm-delay devices and the underlying device attempts to acquire the mutex too * if the target constructor fails, it will call delay_dtr. delay_dtr would attempt to free dc->timer_lock without it being initialized by the constructor. * if the target constructor's kthread allocation fails, delay_dtr would crash trying to dereference dc->worker because it is non-NULL due to ERR_PTR. 
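The racy wake-up listed above is the classic lost-wakeup problem: if the task state is published only after the emptiness check, a producer can queue a bio and call wake_up_process() in the window before schedule(). The conventional kthread loop closes that window by ordering the state change before the re-check; roughly, as a generic sketch with hypothetical do_pending_work()/more_work_queued() helpers (not the dm-delay code itself, which achieves the same ordering by doing the list check under delayed_bios_lock):

        while (!kthread_should_stop()) {
                do_pending_work();

                set_current_state(TASK_INTERRUPTIBLE);
                if (more_work_queued() || kthread_should_stop())
                        __set_current_state(TASK_RUNNING);      /* new work arrived, keep going */
                else
                        schedule();     /* a wake_up_process() in the gap makes this return promptly */
        }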
Fixes: 70bbeb29fab0 ("dm delay: for short delays, use kthread instead of timers and wq") Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-delay.c | 61 +++++++++++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 2d6b900e4353..00c09d9fffeb 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -73,9 +73,23 @@ static inline bool delay_is_fast(struct delay_c *dc) return !!dc->worker; } +static void flush_bios(struct bio *bio) +{ + struct bio *n; + + while (bio) { + n = bio->bi_next; + bio->bi_next = NULL; + dm_submit_bio_remap(bio, NULL); + bio = n; + } +} + static void flush_delayed_bios_fast(struct delay_c *dc, bool flush_all) { struct dm_delay_info *delayed, *next; + struct bio_list flush_bio_list; + bio_list_init(&flush_bio_list); mutex_lock(&delayed_bios_lock); list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { @@ -83,47 +97,42 @@ static void flush_delayed_bios_fast(struct delay_c *dc, bool flush_all) struct bio *bio = dm_bio_from_per_bio_data(delayed, sizeof(struct dm_delay_info)); list_del(&delayed->list); - dm_submit_bio_remap(bio, NULL); + bio_list_add(&flush_bio_list, bio); delayed->class->ops--; } } mutex_unlock(&delayed_bios_lock); + + flush_bios(bio_list_get(&flush_bio_list)); } static int flush_worker_fn(void *data) { struct delay_c *dc = data; - while (1) { + while (!kthread_should_stop()) { flush_delayed_bios_fast(dc, false); + mutex_lock(&delayed_bios_lock); if (unlikely(list_empty(&dc->delayed_bios))) { set_current_state(TASK_INTERRUPTIBLE); + mutex_unlock(&delayed_bios_lock); schedule(); - } else + } else { + mutex_unlock(&delayed_bios_lock); cond_resched(); + } } return 0; } -static void flush_bios(struct bio *bio) -{ - struct bio *n; - - while (bio) { - n = bio->bi_next; - bio->bi_next = NULL; - dm_submit_bio_remap(bio, NULL); - bio = n; - } -} - -static struct bio *flush_delayed_bios(struct delay_c *dc, bool flush_all) +static void flush_delayed_bios(struct delay_c *dc, bool flush_all) { struct dm_delay_info *delayed, *next; unsigned long next_expires = 0; unsigned long start_timer = 0; - struct bio_list flush_bios = { }; + struct bio_list flush_bio_list; + bio_list_init(&flush_bio_list); mutex_lock(&delayed_bios_lock); list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { @@ -131,7 +140,7 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, bool flush_all) struct bio *bio = dm_bio_from_per_bio_data(delayed, sizeof(struct dm_delay_info)); list_del(&delayed->list); - bio_list_add(&flush_bios, bio); + bio_list_add(&flush_bio_list, bio); delayed->class->ops--; continue; } @@ -147,7 +156,7 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, bool flush_all) if (start_timer) queue_timeout(dc, next_expires); - return bio_list_get(&flush_bios); + flush_bios(bio_list_get(&flush_bio_list)); } static void flush_expired_bios(struct work_struct *work) @@ -158,7 +167,7 @@ static void flush_expired_bios(struct work_struct *work) if (delay_is_fast(dc)) flush_delayed_bios_fast(dc, false); else - flush_bios(flush_delayed_bios(dc, false)); + flush_delayed_bios(dc, false); } static void delay_dtr(struct dm_target *ti) @@ -177,8 +186,7 @@ static void delay_dtr(struct dm_target *ti) if (dc->worker) kthread_stop(dc->worker); - if (!delay_is_fast(dc)) - mutex_destroy(&dc->timer_lock); + mutex_destroy(&dc->timer_lock); kfree(dc); } @@ -236,6 +244,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int 
argc, char **argv) ti->private = dc; INIT_LIST_HEAD(&dc->delayed_bios); + mutex_init(&dc->timer_lock); dc->may_delay = true; dc->argc = argc; @@ -282,12 +291,12 @@ out: "dm-delay-flush-worker"); if (IS_ERR(dc->worker)) { ret = PTR_ERR(dc->worker); + dc->worker = NULL; goto bad; } } else { timer_setup(&dc->delay_timer, handle_delayed_timer, 0); INIT_WORK(&dc->flush_expired_bios, flush_expired_bios); - mutex_init(&dc->timer_lock); dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); if (!dc->kdelayd_wq) { ret = -EINVAL; @@ -345,11 +354,11 @@ static void delay_presuspend(struct dm_target *ti) dc->may_delay = false; mutex_unlock(&delayed_bios_lock); - if (delay_is_fast(dc)) + if (delay_is_fast(dc)) { flush_delayed_bios_fast(dc, true); - else { + } else { del_timer_sync(&dc->delay_timer); - flush_bios(flush_delayed_bios(dc, true)); + flush_delayed_bios(dc, true); } } -- cgit From ccadc8a21ef13eb80006ecff6a7466810e4f0dd6 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 17 Nov 2023 18:24:04 +0100 Subject: dm-delay: avoid duplicate logic This is small refactoring of dm-delay - we avoid duplicate logic in flush_delayed_bios and flush_delayed_bios_fast and join these two functions into one. We also add cond_resched() to flush_delayed_bios because the list may have unbounded number of entries. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-delay.c | 65 +++++++++++++++++---------------------------------- 1 file changed, 21 insertions(+), 44 deletions(-) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 00c09d9fffeb..5eabdb06c649 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -85,24 +85,40 @@ static void flush_bios(struct bio *bio) } } -static void flush_delayed_bios_fast(struct delay_c *dc, bool flush_all) +static void flush_delayed_bios(struct delay_c *dc, bool flush_all) { struct dm_delay_info *delayed, *next; struct bio_list flush_bio_list; + unsigned long next_expires = 0; + bool start_timer = false; bio_list_init(&flush_bio_list); mutex_lock(&delayed_bios_lock); list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { + cond_resched(); if (flush_all || time_after_eq(jiffies, delayed->expires)) { struct bio *bio = dm_bio_from_per_bio_data(delayed, sizeof(struct dm_delay_info)); list_del(&delayed->list); bio_list_add(&flush_bio_list, bio); delayed->class->ops--; + continue; + } + + if (!delay_is_fast(dc)) { + if (!start_timer) { + start_timer = true; + next_expires = delayed->expires; + } else { + next_expires = min(next_expires, delayed->expires); + } } } mutex_unlock(&delayed_bios_lock); + if (start_timer) + queue_timeout(dc, next_expires); + flush_bios(bio_list_get(&flush_bio_list)); } @@ -111,7 +127,7 @@ static int flush_worker_fn(void *data) struct delay_c *dc = data; while (!kthread_should_stop()) { - flush_delayed_bios_fast(dc, false); + flush_delayed_bios(dc, false); mutex_lock(&delayed_bios_lock); if (unlikely(list_empty(&dc->delayed_bios))) { set_current_state(TASK_INTERRUPTIBLE); @@ -126,48 +142,12 @@ static int flush_worker_fn(void *data) return 0; } -static void flush_delayed_bios(struct delay_c *dc, bool flush_all) -{ - struct dm_delay_info *delayed, *next; - unsigned long next_expires = 0; - unsigned long start_timer = 0; - struct bio_list flush_bio_list; - bio_list_init(&flush_bio_list); - - mutex_lock(&delayed_bios_lock); - list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { - if (flush_all || time_after_eq(jiffies, delayed->expires)) { - struct bio *bio = 
dm_bio_from_per_bio_data(delayed, - sizeof(struct dm_delay_info)); - list_del(&delayed->list); - bio_list_add(&flush_bio_list, bio); - delayed->class->ops--; - continue; - } - - if (!start_timer) { - start_timer = 1; - next_expires = delayed->expires; - } else - next_expires = min(next_expires, delayed->expires); - } - mutex_unlock(&delayed_bios_lock); - - if (start_timer) - queue_timeout(dc, next_expires); - - flush_bios(bio_list_get(&flush_bio_list)); -} - static void flush_expired_bios(struct work_struct *work) { struct delay_c *dc; dc = container_of(work, struct delay_c, flush_expired_bios); - if (delay_is_fast(dc)) - flush_delayed_bios_fast(dc, false); - else - flush_delayed_bios(dc, false); + flush_delayed_bios(dc, false); } static void delay_dtr(struct dm_target *ti) @@ -354,12 +334,9 @@ static void delay_presuspend(struct dm_target *ti) dc->may_delay = false; mutex_unlock(&delayed_bios_lock); - if (delay_is_fast(dc)) { - flush_delayed_bios_fast(dc, true); - } else { + if (!delay_is_fast(dc)) del_timer_sync(&dc->delay_timer); - flush_delayed_bios(dc, true); - } + flush_delayed_bios(dc, true); } static void delay_resume(struct dm_target *ti) -- cgit From 2a695062a5a42aead8c539a344168d4806b3fda2 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 17 Nov 2023 18:36:34 +0100 Subject: dm-bufio: fix no-sleep mode dm-bufio has a no-sleep mode. When activated (with the DM_BUFIO_CLIENT_NO_SLEEP flag), the bufio client is read-only and we could call dm_bufio_get from tasklets. This is used by dm-verity. Unfortunately, commit 450e8dee51aa ("dm bufio: improve concurrent IO performance") broke this and the kernel would warn that cache_get() was calling down_read() from no-sleeping context. The bug can be reproduced by using "veritysetup open" with the "--use-tasklets" flag. This commit fixes dm-bufio, so that the tasklet mode works again, by expanding use of the 'no_sleep_enabled' static_key to conditionally use either a rw_semaphore or rwlock_t (which are colocated in the buffer_tree structure using a union). Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org # v6.4 Fixes: 450e8dee51aa ("dm bufio: improve concurrent IO performance") Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 87 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 62 insertions(+), 25 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 62eb27639c9b..f03d7dba270c 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -254,7 +254,7 @@ enum evict_result { typedef enum evict_result (*le_predicate)(struct lru_entry *le, void *context); -static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *context) +static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *context, bool no_sleep) { unsigned long tested = 0; struct list_head *h = lru->cursor; @@ -295,7 +295,8 @@ static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *con h = h->next; - cond_resched(); + if (!no_sleep) + cond_resched(); } return NULL; @@ -382,7 +383,10 @@ struct dm_buffer { */ struct buffer_tree { - struct rw_semaphore lock; + union { + struct rw_semaphore lock; + rwlock_t spinlock; + } u; struct rb_root root; } ____cacheline_aligned_in_smp; @@ -393,9 +397,12 @@ struct dm_buffer_cache { * on the locks. 
*/ unsigned int num_locks; + bool no_sleep; struct buffer_tree trees[]; }; +static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled); + static inline unsigned int cache_index(sector_t block, unsigned int num_locks) { return dm_hash_locks_index(block, num_locks); @@ -403,22 +410,34 @@ static inline unsigned int cache_index(sector_t block, unsigned int num_locks) static inline void cache_read_lock(struct dm_buffer_cache *bc, sector_t block) { - down_read(&bc->trees[cache_index(block, bc->num_locks)].lock); + if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep) + read_lock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock); + else + down_read(&bc->trees[cache_index(block, bc->num_locks)].u.lock); } static inline void cache_read_unlock(struct dm_buffer_cache *bc, sector_t block) { - up_read(&bc->trees[cache_index(block, bc->num_locks)].lock); + if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep) + read_unlock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock); + else + up_read(&bc->trees[cache_index(block, bc->num_locks)].u.lock); } static inline void cache_write_lock(struct dm_buffer_cache *bc, sector_t block) { - down_write(&bc->trees[cache_index(block, bc->num_locks)].lock); + if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep) + write_lock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock); + else + down_write(&bc->trees[cache_index(block, bc->num_locks)].u.lock); } static inline void cache_write_unlock(struct dm_buffer_cache *bc, sector_t block) { - up_write(&bc->trees[cache_index(block, bc->num_locks)].lock); + if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep) + write_unlock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock); + else + up_write(&bc->trees[cache_index(block, bc->num_locks)].u.lock); } /* @@ -442,18 +461,32 @@ static void lh_init(struct lock_history *lh, struct dm_buffer_cache *cache, bool static void __lh_lock(struct lock_history *lh, unsigned int index) { - if (lh->write) - down_write(&lh->cache->trees[index].lock); - else - down_read(&lh->cache->trees[index].lock); + if (lh->write) { + if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep) + write_lock_bh(&lh->cache->trees[index].u.spinlock); + else + down_write(&lh->cache->trees[index].u.lock); + } else { + if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep) + read_lock_bh(&lh->cache->trees[index].u.spinlock); + else + down_read(&lh->cache->trees[index].u.lock); + } } static void __lh_unlock(struct lock_history *lh, unsigned int index) { - if (lh->write) - up_write(&lh->cache->trees[index].lock); - else - up_read(&lh->cache->trees[index].lock); + if (lh->write) { + if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep) + write_unlock_bh(&lh->cache->trees[index].u.spinlock); + else + up_write(&lh->cache->trees[index].u.lock); + } else { + if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep) + read_unlock_bh(&lh->cache->trees[index].u.spinlock); + else + up_read(&lh->cache->trees[index].u.lock); + } } /* @@ -502,14 +535,18 @@ static struct dm_buffer *list_to_buffer(struct list_head *l) return le_to_buffer(le); } -static void cache_init(struct dm_buffer_cache *bc, unsigned int num_locks) +static void cache_init(struct dm_buffer_cache *bc, unsigned int num_locks, bool no_sleep) { unsigned int i; bc->num_locks = num_locks; + bc->no_sleep = no_sleep; for (i = 0; i < bc->num_locks; i++) { - init_rwsem(&bc->trees[i].lock); + if (no_sleep) + rwlock_init(&bc->trees[i].u.spinlock); + else + 
init_rwsem(&bc->trees[i].u.lock); bc->trees[i].root = RB_ROOT; } @@ -648,7 +685,7 @@ static struct dm_buffer *__cache_evict(struct dm_buffer_cache *bc, int list_mode struct lru_entry *le; struct dm_buffer *b; - le = lru_evict(&bc->lru[list_mode], __evict_pred, &w); + le = lru_evict(&bc->lru[list_mode], __evict_pred, &w, bc->no_sleep); if (!le) return NULL; @@ -702,7 +739,7 @@ static void __cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_ struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context}; while (true) { - le = lru_evict(&bc->lru[old_mode], __evict_pred, &w); + le = lru_evict(&bc->lru[old_mode], __evict_pred, &w, bc->no_sleep); if (!le) break; @@ -915,10 +952,11 @@ static void cache_remove_range(struct dm_buffer_cache *bc, { unsigned int i; + BUG_ON(bc->no_sleep); for (i = 0; i < bc->num_locks; i++) { - down_write(&bc->trees[i].lock); + down_write(&bc->trees[i].u.lock); __remove_range(bc, &bc->trees[i].root, begin, end, pred, release); - up_write(&bc->trees[i].lock); + up_write(&bc->trees[i].u.lock); } } @@ -979,8 +1017,6 @@ struct dm_bufio_client { struct dm_buffer_cache cache; /* must be last member */ }; -static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled); - /*----------------------------------------------------------------*/ #define dm_bufio_in_request() (!!current->bio_list) @@ -1871,7 +1907,8 @@ static void *new_read(struct dm_bufio_client *c, sector_t block, if (need_submit) submit_io(b, REQ_OP_READ, read_endio); - wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); + if (nf != NF_GET) /* we already tested this condition above */ + wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); if (b->read_error) { int error = blk_status_to_errno(b->read_error); @@ -2421,7 +2458,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign r = -ENOMEM; goto bad_client; } - cache_init(&c->cache, num_locks); + cache_init(&c->cache, num_locks, (flags & DM_BUFIO_CLIENT_NO_SLEEP) != 0); c->bdev = bdev; c->block_size = block_size; -- cgit From 28f07f2ab4b3a2714f1fefcc58ada4bcc195f806 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 17 Nov 2023 18:37:25 +0100 Subject: dm-verity: don't use blocking calls from tasklets The commit 5721d4e5a9cd enhanced dm-verity, so that it can verify blocks from tasklets rather than from workqueues. This reportedly improves performance significantly. However, dm-verity was using the flag CRYPTO_TFM_REQ_MAY_SLEEP from tasklets which resulted in warnings about sleeping function being called from non-sleeping context. BUG: sleeping function called from invalid context at crypto/internal.h:206 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 14, name: ksoftirqd/0 preempt_count: 100, expected: 0 RCU nest depth: 0, expected: 0 CPU: 0 PID: 14 Comm: ksoftirqd/0 Tainted: G W 6.7.0-rc1 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 Call Trace: dump_stack_lvl+0x32/0x50 __might_resched+0x110/0x160 crypto_hash_walk_done+0x54/0xb0 shash_ahash_update+0x51/0x60 verity_hash_update.isra.0+0x4a/0x130 [dm_verity] verity_verify_io+0x165/0x550 [dm_verity] ? free_unref_page+0xdf/0x170 ? psi_group_change+0x113/0x390 verity_tasklet+0xd/0x70 [dm_verity] tasklet_action_common.isra.0+0xb3/0xc0 __do_softirq+0xaf/0x1ec ? smpboot_thread_fn+0x1d/0x200 ? sort_range+0x20/0x20 run_ksoftirqd+0x15/0x30 smpboot_thread_fn+0xed/0x200 kthread+0xdc/0x110 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x28/0x40 ? 
kthread_complete_and_exit+0x20/0x20 ret_from_fork_asm+0x11/0x20 This commit fixes dm-verity so that it doesn't use the flags CRYPTO_TFM_REQ_MAY_SLEEP and CRYPTO_TFM_REQ_MAY_BACKLOG from tasklets. The crypto API would do GFP_ATOMIC allocation instead, it could return -ENOMEM and we catch -ENOMEM in verity_tasklet and requeue the request to the workqueue. Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org # v6.0+ Fixes: 5721d4e5a9cd ("dm verity: Add optional "try_verify_in_tasklet" feature") Signed-off-by: Mike Snitzer --- drivers/md/dm-verity-fec.c | 4 ++-- drivers/md/dm-verity-target.c | 23 ++++++++++++----------- drivers/md/dm-verity.h | 2 +- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 3ef9f018da60..2099c755119e 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -185,7 +185,7 @@ static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, { if (unlikely(verity_hash(v, verity_io_hash_req(v, io), data, 1 << v->data_dev_block_bits, - verity_io_real_digest(v, io)))) + verity_io_real_digest(v, io), true))) return 0; return memcmp(verity_io_real_digest(v, io), want_digest, @@ -386,7 +386,7 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, /* Always re-validate the corrected block against the expected hash */ r = verity_hash(v, verity_io_hash_req(v, io), fio->output, 1 << v->data_dev_block_bits, - verity_io_real_digest(v, io)); + verity_io_real_digest(v, io), true); if (unlikely(r < 0)) return r; diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 26adcfea0302..e115fcfe723c 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -135,20 +135,21 @@ static int verity_hash_update(struct dm_verity *v, struct ahash_request *req, * Wrapper for crypto_ahash_init, which handles verity salting. */ static int verity_hash_init(struct dm_verity *v, struct ahash_request *req, - struct crypto_wait *wait) + struct crypto_wait *wait, bool may_sleep) { int r; ahash_request_set_tfm(req, v->tfm); - ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | - CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, (void *)wait); + ahash_request_set_callback(req, + may_sleep ? 
CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG : 0, + crypto_req_done, (void *)wait); crypto_init_wait(wait); r = crypto_wait_req(crypto_ahash_init(req), wait); if (unlikely(r < 0)) { - DMERR("crypto_ahash_init failed: %d", r); + if (r != -ENOMEM) + DMERR("crypto_ahash_init failed: %d", r); return r; } @@ -179,12 +180,12 @@ out: } int verity_hash(struct dm_verity *v, struct ahash_request *req, - const u8 *data, size_t len, u8 *digest) + const u8 *data, size_t len, u8 *digest, bool may_sleep) { int r; struct crypto_wait wait; - r = verity_hash_init(v, req, &wait); + r = verity_hash_init(v, req, &wait, may_sleep); if (unlikely(r < 0)) goto out; @@ -322,7 +323,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, r = verity_hash(v, verity_io_hash_req(v, io), data, 1 << v->hash_dev_block_bits, - verity_io_real_digest(v, io)); + verity_io_real_digest(v, io), !io->in_tasklet); if (unlikely(r < 0)) goto release_ret_r; @@ -556,7 +557,7 @@ static int verity_verify_io(struct dm_verity_io *io) continue; } - r = verity_hash_init(v, req, &wait); + r = verity_hash_init(v, req, &wait, !io->in_tasklet); if (unlikely(r < 0)) return r; @@ -652,7 +653,7 @@ static void verity_tasklet(unsigned long data) io->in_tasklet = true; err = verity_verify_io(io); - if (err == -EAGAIN) { + if (err == -EAGAIN || err == -ENOMEM) { /* fallback to retrying with work-queue */ INIT_WORK(&io->work, verity_work); queue_work(io->v->verify_wq, &io->work); @@ -1033,7 +1034,7 @@ static int verity_alloc_zero_digest(struct dm_verity *v) goto out; r = verity_hash(v, req, zero_data, 1 << v->data_dev_block_bits, - v->zero_digest); + v->zero_digest, true); out: kfree(req); diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 2f555b420367..f96f4e281ee4 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -128,7 +128,7 @@ extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, u8 *data, size_t len)); extern int verity_hash(struct dm_verity *v, struct ahash_request *req, - const u8 *data, size_t len, u8 *digest); + const u8 *data, size_t len, u8 *digest, bool may_sleep); extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, sector_t block, u8 *digest, bool *is_zero); -- cgit From 13648e04a9b831b3dfa5cf3887dfa6cf8fe5fe69 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 17 Nov 2023 18:38:33 +0100 Subject: dm-crypt: start allocating with MAX_ORDER Commit 23baf831a32c ("mm, treewide: redefine MAX_ORDER sanely") changed the meaning of MAX_ORDER from exclusive to inclusive. So, we can allocate compound pages with up to 1 << MAX_ORDER pages. Reflect this change in dm-crypt and start trying to allocate compound pages with MAX_ORDER. 
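In other words, with the inclusive definition the largest single request is order == MAX_ORDER (1 << MAX_ORDER pages), so an allocation loop can start there and step down on failure. A simplified sketch of that shape (not the actual crypt_alloc_buffer() logic, which additionally adjusts gfp flags and falls back to a mempool):

        unsigned int order = MAX_ORDER;
        struct page *pages;

        for (;;) {
                pages = alloc_pages(GFP_NOWAIT | __GFP_HIGHMEM | __GFP_NORETRY |
                                    __GFP_NOWARN | __GFP_COMP, order);
                if (pages || !order)
                        break;          /* success, or nothing left to shrink */
                order--;                /* retry with a smaller contiguous allocation */
        }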
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 6de107aff331..2ae8560b6a14 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1673,7 +1673,7 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned int size) unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM; unsigned int remaining_size; - unsigned int order = MAX_ORDER - 1; + unsigned int order = MAX_ORDER; retry: if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) -- cgit From bc1b5acb40201a0746d68a7d7cfc141899937f4f Mon Sep 17 00:00:00 2001 From: Mahmoud Adam Date: Fri, 10 Nov 2023 19:21:04 +0100 Subject: nfsd: fix file memleak on client_opens_release seq_release should be called to free the allocated seq_file Cc: stable@vger.kernel.org # v5.3+ Signed-off-by: Mahmoud Adam Reviewed-by: Jeff Layton Fixes: 78599c42ae3c ("nfsd4: add file to display list of client's opens") Reviewed-by: NeilBrown Tested-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 65fd5510323a..3709e58f0a4a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2804,7 +2804,7 @@ static int client_opens_release(struct inode *inode, struct file *file) /* XXX: alternatively, we could get/drop in seq start/stop */ drop_client(clp); - return 0; + return seq_release(inode, file); } static const struct file_operations client_states_fops = { -- cgit From 49cecd8628a9855cd993792a0377559ea32d5e7c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 10 Nov 2023 11:28:39 -0500 Subject: NFSD: Update nfsd_cache_append() to use xdr_stream When inserting a DRC-cached response into the reply buffer, ensure that the reply buffer's xdr_stream is updated properly. Otherwise the server will send a garbage response. Cc: stable@vger.kernel.org # v6.3+ Reviewed-by: Jeff Layton Tested-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfscache.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 80621a709510..abb453be71ca 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -640,24 +640,17 @@ void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp, return; } -/* - * Copy cached reply to current reply buffer. Should always fit. - * FIXME as reply is in a page, we should just attach the page, and - * keep a refcount.... 
- */ static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data) { - struct kvec *vec = &rqstp->rq_res.head[0]; - - if (vec->iov_len + data->iov_len > PAGE_SIZE) { - printk(KERN_WARNING "nfsd: cached reply too large (%zd).\n", - data->iov_len); - return 0; - } - memcpy((char*)vec->iov_base + vec->iov_len, data->iov_base, data->iov_len); - vec->iov_len += data->iov_len; - return 1; + __be32 *p; + + p = xdr_reserve_space(&rqstp->rq_res_stream, data->iov_len); + if (unlikely(!p)) + return false; + memcpy(p, data->iov_base, data->iov_len); + xdr_commit_encode(&rqstp->rq_res_stream); + return true; } /* -- cgit From 1caf5f61dd8430ae5a0b4538afe4953ce7517cbb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 10 Nov 2023 11:28:33 -0500 Subject: NFSD: Fix "start of NFS reply" pointer passed to nfsd_cache_update() The "statp + 1" pointer that is passed to nfsd_cache_update() is supposed to point to the start of the egress NFS Reply header. In fact, it does point there for AUTH_SYS and RPCSEC_GSS_KRB5 requests. But both krb5i and krb5p add fields between the RPC header's accept_stat field and the start of the NFS Reply header. In those cases, "statp + 1" points at the extra fields instead of the Reply. The result is that nfsd_cache_update() caches what looks to the client like garbage. A connection break can occur for a number of reasons, but the most common reason when using krb5i/p is a GSS sequence number window underrun. When an underrun is detected, the server is obliged to drop the RPC and the connection to force a retransmit with a fresh GSS sequence number. The client presents the same XID, it hits in the server's DRC, and the server returns the garbage cache entry. The "statp + 1" argument has been used since the oldest changeset in the kernel history repo, so it has been in nfsd_dispatch() literally since before history began. The problem arose only when the server-side GSS implementation was added twenty years ago. Reviewed-by: Jeff Layton Tested-by: Jeff Layton --- fs/nfsd/nfssvc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index d6122bb2d167..b4e4e04f9931 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -981,6 +981,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) const struct svc_procedure *proc = rqstp->rq_procinfo; __be32 *statp = rqstp->rq_accept_statp; struct nfsd_cacherep *rp; + __be32 *nfs_reply; /* * Give the xdr decoder a chance to change this if it wants @@ -1010,6 +1011,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) goto out_dropit; } + nfs_reply = xdr_inline_decode(&rqstp->rq_res_stream, 0); *statp = proc->pc_func(rqstp); if (test_bit(RQ_DROPME, &rqstp->rq_flags)) goto out_update_drop; @@ -1023,7 +1025,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) */ smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1); - nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1); + nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, nfs_reply); out_cached_reply: return 1; -- cgit From bf51c52a1f3c238d72c64e14d5e7702d3a245b82 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 10 Nov 2023 11:28:45 -0500 Subject: NFSD: Fix checksum mismatches in the duplicate reply cache nfsd_cache_csum() currently assumes that the server's RPC layer has been advancing rq_arg.head[0].iov_base as it decodes an incoming request, because that's the way it used to work. On entry, it expects that buf->head[0].iov_base points to the start of the NFS header, and excludes the already-decoded RPC header. 
These days however, head[0].iov_base now points to the start of the RPC header during all processing. It no longer points at the NFS Call header when execution arrives at nfsd_cache_csum(). In a retransmitted RPC the XID and the NFS header are supposed to be the same as the original message, but the contents of the retransmitted RPC header can be different. For example, for krb5, the GSS sequence number will be different between the two. Thus if the RPC header is always included in the DRC checksum computation, the checksum of the retransmitted message might not match the checksum of the original message, even though the NFS part of these messages is identical. The result is that, even if a matching XID is found in the DRC, the checksum mismatch causes the server to execute the retransmitted RPC transaction again. Reviewed-by: Jeff Layton Tested-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/cache.h | 4 ++-- fs/nfsd/nfscache.c | 64 ++++++++++++++++++++++++++++++++++++------------------ fs/nfsd/nfssvc.c | 10 ++++++++- 3 files changed, 54 insertions(+), 24 deletions(-) diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index 929248c6ca84..4cbe0434cbb8 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -84,8 +84,8 @@ int nfsd_net_reply_cache_init(struct nfsd_net *nn); void nfsd_net_reply_cache_destroy(struct nfsd_net *nn); int nfsd_reply_cache_init(struct nfsd_net *); void nfsd_reply_cache_shutdown(struct nfsd_net *); -int nfsd_cache_lookup(struct svc_rqst *rqstp, - struct nfsd_cacherep **cacherep); +int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, + unsigned int len, struct nfsd_cacherep **cacherep); void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp, int cachetype, __be32 *statp); int nfsd_reply_cache_stats_show(struct seq_file *m, void *v); diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index abb453be71ca..6cd36af2f97e 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -368,33 +368,52 @@ nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc) return freed; } -/* - * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes +/** + * nfsd_cache_csum - Checksum incoming NFS Call arguments + * @buf: buffer containing a whole RPC Call message + * @start: starting byte of the NFS Call header + * @remaining: size of the NFS Call header, in bytes + * + * Compute a weak checksum of the leading bytes of an NFS procedure + * call header to help verify that a retransmitted Call matches an + * entry in the duplicate reply cache. + * + * To avoid assumptions about how the RPC message is laid out in + * @buf and what else it might contain (eg, a GSS MIC suffix), the + * caller passes us the exact location and length of the NFS Call + * header. + * + * Returns a 32-bit checksum value, as defined in RFC 793. 
*/ -static __wsum -nfsd_cache_csum(struct svc_rqst *rqstp) +static __wsum nfsd_cache_csum(struct xdr_buf *buf, unsigned int start, + unsigned int remaining) { + unsigned int base, len; + struct xdr_buf subbuf; + __wsum csum = 0; + void *p; int idx; - unsigned int base; - __wsum csum; - struct xdr_buf *buf = &rqstp->rq_arg; - const unsigned char *p = buf->head[0].iov_base; - size_t csum_len = min_t(size_t, buf->head[0].iov_len + buf->page_len, - RC_CSUMLEN); - size_t len = min(buf->head[0].iov_len, csum_len); + + if (remaining > RC_CSUMLEN) + remaining = RC_CSUMLEN; + if (xdr_buf_subsegment(buf, &subbuf, start, remaining)) + return csum; /* rq_arg.head first */ - csum = csum_partial(p, len, 0); - csum_len -= len; + if (subbuf.head[0].iov_len) { + len = min_t(unsigned int, subbuf.head[0].iov_len, remaining); + csum = csum_partial(subbuf.head[0].iov_base, len, csum); + remaining -= len; + } /* Continue into page array */ - idx = buf->page_base / PAGE_SIZE; - base = buf->page_base & ~PAGE_MASK; - while (csum_len) { - p = page_address(buf->pages[idx]) + base; - len = min_t(size_t, PAGE_SIZE - base, csum_len); + idx = subbuf.page_base / PAGE_SIZE; + base = subbuf.page_base & ~PAGE_MASK; + while (remaining) { + p = page_address(subbuf.pages[idx]) + base; + len = min_t(unsigned int, PAGE_SIZE - base, remaining); csum = csum_partial(p, len, csum); - csum_len -= len; + remaining -= len; base = 0; ++idx; } @@ -465,6 +484,8 @@ out: /** * nfsd_cache_lookup - Find an entry in the duplicate reply cache * @rqstp: Incoming Call to find + * @start: starting byte in @rqstp->rq_arg of the NFS Call header + * @len: size of the NFS Call header, in bytes * @cacherep: OUT: DRC entry for this request * * Try to find an entry matching the current call in the cache. When none @@ -478,7 +499,8 @@ out: * %RC_REPLY: Reply from cache * %RC_DROPIT: Do not process the request further */ -int nfsd_cache_lookup(struct svc_rqst *rqstp, struct nfsd_cacherep **cacherep) +int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, + unsigned int len, struct nfsd_cacherep **cacherep) { struct nfsd_net *nn; struct nfsd_cacherep *rp, *found; @@ -494,7 +516,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, struct nfsd_cacherep **cacherep) goto out; } - csum = nfsd_cache_csum(rqstp); + csum = nfsd_cache_csum(&rqstp->rq_arg, start, len); /* * Since the common case is a cache miss followed by an insert, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index b4e4e04f9931..fe61d9bbcc1f 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -981,6 +981,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) const struct svc_procedure *proc = rqstp->rq_procinfo; __be32 *statp = rqstp->rq_accept_statp; struct nfsd_cacherep *rp; + unsigned int start, len; __be32 *nfs_reply; /* @@ -989,6 +990,13 @@ int nfsd_dispatch(struct svc_rqst *rqstp) */ rqstp->rq_cachetype = proc->pc_cachetype; + /* + * ->pc_decode advances the argument stream past the NFS + * Call header, so grab the header's starting location and + * size now for the call to nfsd_cache_lookup(). 
+ */ + start = xdr_stream_pos(&rqstp->rq_arg_stream); + len = xdr_stream_remaining(&rqstp->rq_arg_stream); if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream)) goto out_decode_err; @@ -1002,7 +1010,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1); rp = NULL; - switch (nfsd_cache_lookup(rqstp, &rp)) { + switch (nfsd_cache_lookup(rqstp, start, len, &rp)) { case RC_DOIT: break; case RC_REPLY: -- cgit From 721d28f3dfb3e40c45ce45fbeeff47b72c230bc9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 16 Nov 2023 11:13:40 -0800 Subject: parisc: Replace strlcpy() with strscpy() strlcpy() reads the entire source buffer first. This read may exceed the destination size limit. This is both inefficient and can lead to linear read overflows if a source string is not NUL-terminated[1]. Additionally, it returns the size of the source string, not the resulting size of the destination string. In an effort to remove strlcpy() completely[2], replace strlcpy() here with strscpy(). Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy [1] Link: https://github.com/KSPP/linux/issues/89 [2] Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Azeem Shaikh Cc: linux-parisc@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: Helge Deller --- arch/parisc/kernel/processor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c index 29e2750f86a4..e95a977ba5f3 100644 --- a/arch/parisc/kernel/processor.c +++ b/arch/parisc/kernel/processor.c @@ -383,7 +383,7 @@ show_cpuinfo (struct seq_file *m, void *v) char cpu_name[60], *p; /* strip PA path from CPU name to not confuse lscpu */ - strlcpy(cpu_name, per_cpu(cpu_data, 0).dev->name, sizeof(cpu_name)); + strscpy(cpu_name, per_cpu(cpu_data, 0).dev->name, sizeof(cpu_name)); p = strrchr(cpu_name, '['); if (p) *(--p) = 0; -- cgit From 6ad6e15a9c46b8f0932cd99724f26f3db4db1cdf Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 17 Nov 2023 16:43:52 +0100 Subject: parisc/power: Fix power soft-off when running on qemu Firmware returns the physical address of the power switch, so need to use gsc_writel() instead of direct memory access. Fixes: d0c219472980 ("parisc/power: Add power soft-off when running on qemu") Signed-off-by: Helge Deller Cc: stable@vger.kernel.org # v6.0+ --- drivers/parisc/power.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/parisc/power.c b/drivers/parisc/power.c index 539d8920c202..bb0d92461b08 100644 --- a/drivers/parisc/power.c +++ b/drivers/parisc/power.c @@ -176,7 +176,7 @@ static struct notifier_block parisc_panic_block = { static int qemu_power_off(struct sys_off_data *data) { /* this turns the system off via SeaBIOS */ - *(int *)data->cb_data = 0; + gsc_writel(0, (unsigned long) data->cb_data); pdc_soft_power_button(1); return NOTIFY_DONE; } -- cgit From 793838138c157d4c49f4fb744b170747e3dabf58 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sat, 18 Nov 2023 19:33:35 +0100 Subject: prctl: Disable prctl(PR_SET_MDWE) on parisc systemd-254 tries to use prctl(PR_SET_MDWE) for it's MemoryDenyWriteExecute functionality, but fails on parisc which still needs executable stacks in certain combinations of gcc/glibc/kernel. Disable prctl(PR_SET_MDWE) by returning -EINVAL for now on parisc, until userspace has catched up. 
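For context, the systemd feature amounts to a call of the following shape; after this change that call fails with EINVAL on parisc rather than being honoured. A minimal userspace sketch (assuming UAPI headers recent enough to provide PR_SET_MDWE and PR_MDWE_REFUSE_EXEC_GAIN; not systemd's actual code):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/prctl.h>

int main(void)
{
        /* refuse future attempts to make any mapping both writable and executable */
        if (prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0L, 0L, 0L) < 0)
                fprintf(stderr, "PR_SET_MDWE: %s\n", strerror(errno));
        return 0;
}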
Signed-off-by: Helge Deller Co-developed-by: Linus Torvalds Reported-by: Sam James Closes: https://github.com/systemd/systemd/issues/29775 Tested-by: Sam James Link: https://lore.kernel.org/all/875y2jro9a.fsf@gentoo.org/ Cc: # v6.3+ --- kernel/sys.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sys.c b/kernel/sys.c index 420d9cb9cc8e..e219fcfa112d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2394,6 +2394,10 @@ static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3, if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN)) return -EINVAL; + /* PARISC cannot allow mdwe as it needs writable stacks */ + if (IS_ENABLED(CONFIG_PARISC)) + return -EINVAL; + current_bits = get_current_mdwe(); if (current_bits && current_bits != bits) return -EPERM; /* Cannot unset the flags */ -- cgit From 98b1cc82c4affc16f5598d4fa14b1858671b2263 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 19 Nov 2023 15:02:14 -0800 Subject: Linux 6.7-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ede0bd241056..724c79bebe72 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 7 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* -- cgit From 8771127e25d6c20d458ad27cf32f7fcfc1755e05 Mon Sep 17 00:00:00 2001 From: Lech Perczak Date: Sat, 18 Nov 2023 00:19:17 +0100 Subject: USB: serial: option: don't claim interface 4 for ZTE MF290 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Interface 4 is used by for QMI interface in stock firmware of MF28D, the router which uses MF290 modem. Free the interface up, to rebind it to qmi_wwan driver. The proper configuration is: Interface mapping is: 0: QCDM, 1: (unknown), 2: AT (PCUI), 2: AT (Modem), 4: QMI T: Bus=01 Lev=02 Prnt=02 Port=00 Cnt=01 Dev#= 4 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=19d2 ProdID=0189 Rev= 0.00 S: Manufacturer=ZTE, Incorporated S: Product=ZTE LTE Technologies MSM C:* #Ifs= 5 Cfg#= 1 Atr=e0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms I:* If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=84(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=86(I) Atr=03(Int.) 
MxPS= 64 Ivl=2ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms Cc: Bjørn Mork Signed-off-by: Lech Perczak Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 45dcfaadaf98..ff9049db6e65 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1546,7 +1546,8 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0165, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0167, 0xff, 0xff, 0xff), .driver_info = RSVD(4) }, - { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0189, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0189, 0xff, 0xff, 0xff), + .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0191, 0xff, 0xff, 0xff), /* ZTE EuFi890 */ .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0196, 0xff, 0xff, 0xff) }, -- cgit From a1092619dd28ac0fcf23016160a2fdccd98ef935 Mon Sep 17 00:00:00 2001 From: Puliang Lu Date: Thu, 26 Oct 2023 20:35:06 +0800 Subject: USB: serial: option: fix FM101R-GL defines Modify the definition of the two Fibocom FM101R-GL PID macros, which had their PIDs switched. The correct PIDs are: - VID:PID 413C:8213, FM101R-GL ESIM are laptop M.2 cards (with MBIM interfaces for Linux) - VID:PID 413C:8215, FM101R-GL are laptop M.2 cards (with MBIM interface for Linux) 0x8213: mbim, tty 0x8215: mbim, tty Signed-off-by: Puliang Lu Fixes: 52480e1f1a25 ("USB: serial: option: add Fibocom to DELL custom modem FM101R-GL") Link: https://lore.kernel.org/lkml/TYZPR02MB508845BAD7936A62A105CE5D89DFA@TYZPR02MB5088.apcprd02.prod.outlook.com/ Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index ff9049db6e65..9c76095ebfe1 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -203,8 +203,8 @@ static void option_instat_callback(struct urb *urb); #define DELL_PRODUCT_5829E_ESIM 0x81e4 #define DELL_PRODUCT_5829E 0x81e6 -#define DELL_PRODUCT_FM101R 0x8213 -#define DELL_PRODUCT_FM101R_ESIM 0x8215 +#define DELL_PRODUCT_FM101R_ESIM 0x8213 +#define DELL_PRODUCT_FM101R 0x8215 #define KYOCERA_VENDOR_ID 0x0c88 #define KYOCERA_PRODUCT_KPC650 0x17da -- cgit From e389fe8b68137344562fb6e4d53d8a89ef6212dd Mon Sep 17 00:00:00 2001 From: Victor Fragoso Date: Tue, 21 Nov 2023 21:05:56 +0000 Subject: USB: serial: option: add Fibocom L7xx modules Add support for Fibocom L716-EU module series. L716-EU is a Fibocom module based on ZTE's V3E/V3T chipset. Device creates multiple interfaces when connected to PC as follows: - Network Interface: ECM or RNDIS (set by FW or AT Command) - ttyUSB0: AT port - ttyUSB1: Modem port - ttyUSB2: AT2 port - ttyUSB3: Trace port for log information - ADB: ADB port for debugging. 
("Driver=usbfs" when ADB server enabled) Here are the outputs of lsusb and usb-devices: $ ls /dev/ttyUSB* /dev/ttyUSB0 /dev/ttyUSB1 /dev/ttyUSB2 /dev/ttyUSB3 usb-devices: L716-EU (ECM mode): T: Bus=03 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 51 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=2cb7 ProdID=0001 Rev= 1.00 S: Manufacturer=Fibocom,Incorporated S: Product=Fibocom Mobile Boardband S: SerialNumber=1234567890ABCDEF C:* #Ifs= 7 Cfg#= 1 Atr=e0 MxPwr=500mA A: FirstIf#= 0 IfCount= 2 Cls=02(comm.) Sub=06 Prot=00 I:* If#= 0 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=06 Prot=00 Driver=cdc_ether E: Ad=87(I) Atr=03(Int.) MxPS= 16 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 0 Cls=0a(data ) Sub=00 Prot=00 Driver=cdc_ether I:* If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=cdc_ether E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 6 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=usbfs E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms L716-EU (RNDIS mode): T: Bus=03 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 49 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=2cb7 ProdID=0001 Rev= 1.00 S: Manufacturer=Fibocom,Incorporated S: Product=Fibocom Mobile Boardband S: SerialNumber=1234567890ABCDEF C:* #Ifs= 7 Cfg#= 1 Atr=e0 MxPwr=500mA A: FirstIf#= 0 IfCount= 2 Cls=e0(wlcon) Sub=01 Prot=03 I:* If#= 0 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=02 Prot=ff Driver=rndis_host E: Ad=87(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=rndis_host E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 6 Alt= 0 #EPs= 2 Cls=ff(vend.) 
Sub=42 Prot=01 Driver=usbfs E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Victor Fragoso Reviewed-by: Lars Melin Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 9c76095ebfe1..06b9b04c022a 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2250,6 +2250,7 @@ static const struct usb_device_id option_ids[] = { .driver_info = RSVD(4) | RSVD(5) | RSVD(6) }, { USB_DEVICE(0x1782, 0x4d10) }, /* Fibocom L610 (AT mode) */ { USB_DEVICE_INTERFACE_CLASS(0x1782, 0x4d11, 0xff) }, /* Fibocom L610 (ECM/RNDIS mode) */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2cb7, 0x0001, 0xff, 0xff, 0xff) }, /* Fibocom L716-EU (ECM/RNDIS mode) */ { USB_DEVICE(0x2cb7, 0x0104), /* Fibocom NL678 series */ .driver_info = RSVD(4) | RSVD(5) }, { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0105, 0xff), /* Fibocom NL678 series */ -- cgit From da90e45d5afc4da2de7cd3ea7943d0f1baa47cc2 Mon Sep 17 00:00:00 2001 From: Asuna Yang Date: Wed, 22 Nov 2023 22:18:03 +0800 Subject: USB: serial: option: add Luat Air72*U series products Update the USB serial option driver support for Luat Air72*U series products. ID 1782:4e00 Spreadtrum Communications Inc. UNISOC-8910 T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 13 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1782 ProdID=4e00 Rev=00.00 S: Manufacturer=UNISOC S: Product=UNISOC-8910 C: #Ifs= 5 Cfg#= 1 Atr=e0 MxPwr=400mA I: If#= 0 Alt= 0 #EPs= 1 Cls=e0(wlcon) Sub=01 Prot=03 Driver=rndis_host E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=4096ms I: If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=rndis_host E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) 
Sub=00 Prot=00 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms If#= 2: AT If#= 3: PPP + AT If#= 4: Debug Co-developed-by: Yangyu Chen Signed-off-by: Yangyu Chen Signed-off-by: Asuna Yang Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 06b9b04c022a..4dffcfefd62d 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -609,6 +609,8 @@ static void option_instat_callback(struct urb *urb); #define UNISOC_VENDOR_ID 0x1782 /* TOZED LT70-C based on UNISOC SL8563 uses UNISOC's vendor ID */ #define TOZED_PRODUCT_LT70C 0x4055 +/* Luat Air72*U series based on UNISOC UIS8910 uses UNISOC's vendor ID */ +#define LUAT_PRODUCT_AIR720U 0x4e00 /* Device flags */ @@ -2273,6 +2275,7 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, TOZED_PRODUCT_LT70C, 0xff, 0, 0) }, + { USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, LUAT_PRODUCT_AIR720U, 0xff, 0, 0) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, option_ids); -- cgit
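All of the option table changes above share the same shape; for clarity, here is a hypothetical entry showing how the pieces fit together (the VID/PID below are placeholders, and RSVD(n) marks interface n as reserved so the option driver leaves it for another driver such as qmi_wwan, as in the MF290 change):

        { USB_DEVICE_AND_INTERFACE_INFO(0x1234, 0x5678, 0xff, 0xff, 0xff),  /* hypothetical modem */
          .driver_info = RSVD(4) },                                         /* leave interface 4 alone */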